Exemplos de Fastq em Python, exemplos de pyfastx.Fastq em Python

Exemplo n.º 1

0

Exibir arquivo

	def setUp(self):
		"""Open the FASTQ fixtures and parse the flat file into reference data.

		After this runs:
		  * ``self.fastq`` / ``self.flatq`` are pyfastx.Fastq objects for the
		    gzip and flat fixture files;
		  * ``self.reads`` maps the zero-based record number to a list
		    ``[name, sequence, quality]``, where ``name`` is the first
		    whitespace-delimited token of the header without its first character;
		  * ``self.bases`` holds the A/T/G/C/N totals over all sequence lines.

		The parser treats every four consecutive lines as one record.
		"""
		self.fastq = pyfastx.Fastq(gzip_fastq)

		# reload index (second construction over the same file)
		self.fastq = pyfastx.Fastq(gzip_fastq)

		# flat fastq
		self.flatq = pyfastx.Fastq(flat_fastq)

		self.reads = {}
		self.bases = dict.fromkeys('ATGCN', 0)

		with open(flat_fastq) as handle:
			for line_no, text in enumerate(handle):
				# record number and position (0..3) of this line inside the record
				record, field = divmod(line_no, 4)

				if field == 0:
					# header line: drop the leading marker, keep the first token
					self.reads[record] = [text[1:].strip().split()[0], 0, 0]

				elif field == 1:
					# sequence line: store it and update the base totals
					self.reads[record][1] = text.strip()
					for base in self.bases:
						self.bases[base] += text.count(base)

				elif field == 3:
					# quality line (field 2 is the separator and is ignored)
					self.reads[record][2] = text.strip()

Exemplo n.º 2

0

Exibir arquivo

Arquivo: test_fastq.py Projeto: wx904/pyfastx

    def test_build(self):
        """Remove any existing index file and build it again explicitly.

        The object is first opened with ``build_index=False`` and the index is
        then created through ``build_index()``; finally ``self.fastq`` is
        reopened on the same file.
        """
        del self.fastq

        index_path = '{}.fxi'.format(gzip_fastq)
        if os.path.exists(index_path):
            os.remove(index_path)

        unindexed = pyfastx.Fastq(gzip_fastq, build_index=False)
        unindexed.build_index()

        self.fastq = pyfastx.Fastq(gzip_fastq)

Exemplo n.º 3

0

Exibir arquivo

def load_seqfile(infile):
    """Open a FASTA or FASTQ file with pyfastx, chosen by file suffix.

    ``build_index`` is passed as ``True`` only when no ``<infile>.fxi`` file
    exists yet, and as ``False`` when one is already present.

    Args:
        infile: Path of the sequence file, as a string.

    Returns:
        A ``pyfastx.Fasta`` object when ``infile`` ends with ``FASTA_SUFFIX``,
        otherwise a ``pyfastx.Fastq`` object when it ends with
        ``FASTQ_SUFFIX``.

    Raises:
        ValueError: If ``infile`` matches neither suffix set. Previously this
            case fell through every branch and failed with an
            UnboundLocalError on the return statement.
    """
    build_index = not os.path.exists(infile + ".fxi")

    if infile.endswith(FASTA_SUFFIX):
        return pyfastx.Fasta(infile, build_index=build_index)
    if infile.endswith(FASTQ_SUFFIX):
        return pyfastx.Fastq(infile, build_index=build_index)

    raise ValueError(
        "Unrecognised sequence file suffix: {}".format(infile)
    )

Exemplo n.º 4

0

Exibir arquivo

def fastx_info(args):
	"""Print summary statistics for the FASTA/FASTQ file in ``args.fastx``.

	The file kind is decided by ``fastx_format_check``. For 'fasta' the report
	covers counts, size, GC content, per-symbol composition, length statistics,
	N50/L50 and the number of sequences with length >= 1000. For 'fastq' it
	covers counts, size, GC content, composition and the candidate quality
	encodings. Any other kind prints nothing.
	"""
	def show_composition(composition):
		# one output line per symbol present in the composition mapping
		for symbol in composition:
			print("{} counts: {}".format(symbol, composition[symbol]))

	kind = fastx_format_check(args.fastx)

	if kind == 'fasta':
		fasta = pyfastx.Fasta(args.fastx)
		base_counts = fasta.composition
		print("Sequence counts: {}".format(len(fasta)))
		print("Total bases: {}".format(fasta.size))
		print("GC content: {:.2f}%".format(fasta.gc_content))
		show_composition(base_counts)
		print("Mean length: {:.2f}".format(fasta.mean))
		print("Median length: {:.2f}".format(fasta.median))
		print("Max length: {}".format(len(fasta.longest)))
		print("Min length: {}".format(len(fasta.shortest)))
		print("N50, L50: {}, {}".format(*fasta.nl()))
		print("length >= 1000: {}".format(fasta.count(1000)))
		return

	if kind == 'fastq':
		fastq = pyfastx.Fastq(args.fastx)
		base_counts = fastq.composition
		print("Read counts: {}".format(len(fastq)))
		print("Total bases: {}".format(fastq.size))
		print("GC content: {:.2f}%".format(fastq.gc_content))
		show_composition(base_counts)
		encodings = ", ".join(fastq.encoding_type)
		print("Quality encoding system maybe: {}".format(encodings))

Exemplo n.º 5

0

Exibir arquivo

Arquivo: utils.py Projeto: np-core/nanopath

def get_output_handle(fpath: str, fastx: bool = False, out: bool = True):
    """Return a handle for ``fpath``.

    Args:
        fpath: A file path, or ``"-"`` to select a standard stream.
        fastx: When true (and ``fpath`` is a real path), open the path with
            pyfastx instead of as a text file: ``pyfastx.Fasta`` if ``fpath``
            ends with ``"a"``, ``pyfastx.Fastq`` otherwise.
        out: Only used for ``"-"``: ``sys.stdout`` when true, ``sys.stdin``
            when false.

    Returns:
        A standard stream, a pyfastx object, or a file opened in ``"w"`` mode.

    Raises:
        NotADirectoryError: If the parent directory of ``fpath`` does not exist.
    """
    if fpath == "-":
        return sys.stdout if out else sys.stdin

    target = Path(fpath)
    parent_dir = target.parent
    if not parent_dir.is_dir():
        raise NotADirectoryError(
            "Directory specified for output file does not exist: {}".format(
                parent_dir
            )
        )

    if not fastx:
        return target.open("w")

    # crude format sniffing: a trailing "a" (e.g. ".fa", ".fasta") means FASTA
    if fpath.endswith("a"):
        return pyfastx.Fasta(target)
    return pyfastx.Fastq(target)

Exemplo n.º 6

0

Exibir arquivo

def run(args, name):
    """Derive run id and barcode from the first read and store them as JSON.

    Only the first record of ``args.inputFile`` is inspected. Its description
    is split on ':'; the run id is assembled from the first three fields and
    the barcode is the last field. The results are added to a ``storage.JSON``
    object under ``args.name``, pretty-printed, written to ``args.outputDir``
    and logged.

    Note: the ``name`` parameter is not used by the body.
    """
    # the loop exists only to fetch the first record; it stops after one pass
    for first_read in pyfastx.Fastq(args.inputFile):
        parts = first_read.description.split(':')
        runid = f"{parts[0].replace('@', '')}_0{parts[1]}_A{parts[2]}"
        barcode = parts[-1]
        break

    # list of tuples of input for JSON
    results = [
        ("runID", runid),
        ("barcode", barcode),
    ]

    # without an existing JSON a new name is set from the sample before opening
    report = storage.JSON()
    if not args.JSON:
        report.name(args.sample)
    report.open(args.JSON)
    report.add_results(args.name, results)
    report.pretty_print()
    report.write(args.outputDir)

    logging.info(results)

Exemplo n.º 7

0

Exibir arquivo

def fastq_sample(args):
	"""Write a random subset of the reads in ``args.fastx`` in FASTQ format.

	The subset size comes from ``args.num`` (an absolute count, capped at the
	number of reads) or, failing that, from ``args.prop`` (a proportion in
	(0, 1]). Output goes to ``args.outfile``, or to stdout when it is None.

	Raises:
		RuntimeError: If neither a positive ``num`` nor a valid ``prop`` is
			given, or if the proportion rounds down to zero reads.
	"""
	fq = pyfastx.Fastq(args.fastx)
	total = len(fq)
	prop = getattr(args, 'prop', None)

	if args.num is not None and args.num > 0:
		seq_num = min(args.num, total)

	# The proportion branch must test ``prop`` itself. It used to test ``num``
	# again, which made proportion-only sampling unreachable and compared
	# ``0 < None`` when ``num`` was non-positive and ``prop`` was missing.
	elif prop is not None and 0 < prop <= 1:
		seq_num = round(total*prop)
		if seq_num == 0:
			raise RuntimeError("the proportion is too small")

	else:
		raise RuntimeError("specify a right number for seq number or proportion")

	selected = random.sample(range(total), k=seq_num)

	if args.outfile is None:
		fw = sys.stdout
	else:
		fw = open(args.outfile, 'w')

	try:
		for idx in selected:
			r = fq[idx]
			fw.write("@{}\n{}\n+\n{}\n".format(r.name, r.seq, r.qual))
	finally:
		# stdout is only flushed (it is not ours to close); a real file is
		# closed even when a lookup or write fails
		if args.outfile is None:
			fw.flush()
		else:
			fw.close()

Exemplo n.º 8

0

Exibir arquivo

	def test_iter_tuple(self):
		"""Iterate the flat file without an index and compare each
		(name, seq, qual) triple with the reference entry in ``self.reads``
		at the same zero-based position."""
		records = pyfastx.Fastq(flat_fastq, build_index=False)
		for position, (name, seq, qual) in enumerate(records):
			expected = self.reads[position]
			self.assertEqual(name, expected[0])
			self.assertEqual(seq, expected[1])
			self.assertEqual(qual, expected[2])

Exemplo n.º 9

0

Exibir arquivo

Arquivo: utils.py Projeto: Wytamma/sketchy

def create_fastx_index(fastx):
    """Open ``fastx`` with pyfastx (index building enabled) and pick the
    matching read builder.

    Returns:
        A pair ``(pyfastx object, builder function)``: ``pyfastx.Fasta`` with
        ``build_read_fasta`` when ``is_fasta(fastx)`` holds, ``pyfastx.Fastq``
        with ``build_read_fastq`` when ``is_fastq(fastx)`` holds.

    Raises:
        ValueError: If neither check recognises the file.
    """
    if is_fasta(fastx):
        opener, read_builder = pyfastx.Fasta, build_read_fasta
    elif is_fastq(fastx):
        opener, read_builder = pyfastx.Fastq, build_read_fastq
    else:
        raise ValueError(f'Could not determine input file format: {fastx}')

    return opener(str(fastx), build_index=True), read_builder

Exemplo n.º 10

0

Exibir arquivo

Arquivo: index.py Projeto: sjin09/fastx

def build_index(infile):
    """Make sure ``<infile>.fxi`` exists, reporting progress on stdout.

    When the index file is already present only a notice is printed.
    Otherwise the file is opened with ``pyfastx.Fasta`` if its name ends with
    a FASTA suffix (.fa, .fa.gz, .fasta, .fasta.gz) and with ``pyfastx.Fastq``
    in every other case.
    """
    if os.path.exists(infile + ".fxi"):
        print("fxi index is present")
        return

    print("buliding fxi index for {}".format(infile))
    fasta_suffixes = (".fa", ".fa.gz", ".fasta", ".fasta.gz")
    opener = pyfastx.Fasta if infile.endswith(fasta_suffixes) else pyfastx.Fastq
    # the object is discarded: the call is made for its side effect only
    opener(infile)
    print("fxi index has been created for {}".format(infile))

Exemplo n.º 11

0

Exibir arquivo

Arquivo: test_fastq.py Projeto: wx904/pyfastx

    def test_exception(self):
        """Check the exception raised for each kind of invalid access."""
        # opening a path that does not exist
        with self.assertRaises(FileExistsError):
            pyfastx.Fastq('a_fastq_file_not_exists')

        # integer index one past the last read
        with self.assertRaises(IndexError):
            one_past_last = len(self.fastq)
            self.fastq[one_past_last]

        # a key that is neither an integer nor a string (the ``int`` type itself)
        with self.assertRaises(KeyError):
            self.fastq[int]

        # a read name that is not in the file
        with self.assertRaises(KeyError):
            self.fastq['abc']

Exemplo n.º 12

0

Exibir arquivo

Arquivo: utils.py Projeto: np-core/nanopath

def create_fastx_index(fastx: Path) -> (pyfastx.Fasta, Path):
    """Open ``fastx`` with pyfastx (index building enabled) and return it
    together with the path of its ``.fxi`` index file.

    Returns:
        ``(pyfastx.Fasta or pyfastx.Fastq, Path(str(fastx) + '.fxi'))``,
        depending on whether ``is_fasta`` or ``is_fastq`` recognises the file.

    Raises:
        ValueError: If neither check recognises the file.
    """
    if is_fasta(fastx):
        opener = pyfastx.Fasta
    elif is_fastq(fastx):
        opener = pyfastx.Fastq
    else:
        raise ValueError(
            f'Could not determine input file format: {fastx}'
        )

    fastx_path = str(fastx)
    handle = opener(fastx_path, build_index=True)
    return handle, Path(fastx_path + '.fxi')

Exemplo n.º 13

0

Exibir arquivo

def fastx_fq2fa(args):
	"""Convert the FASTQ file ``args.fastx`` to FASTA.

	Each read is written as ``>name`` followed by its sequence. Output goes to
	``args.outfile`` when it is set, otherwise to stdout.
	"""
	fq = pyfastx.Fastq(args.fastx)

	if args.outfile:
		fh = open(args.outfile, 'w')
	else:
		fh = sys.stdout

	try:
		for read in fq:
			fh.write(">{}\n{}\n".format(read.name, read.seq))
	finally:
		# close our own file even if reading or writing fails part-way;
		# stdout is only flushed because it does not belong to this function
		if args.outfile:
			fh.close()
		else:
			fh.flush()

Exemplo n.º 14

0

Exibir arquivo

Arquivo: utils.py Projeto: np-core/nanopath

    def prepare_fastq(self) -> dict:
        """Validate the read files in ``self.composition`` and open them.

        Every value of ``self.composition`` is expected to carry a ``'file'``
        entry. Each file must exist and is then opened with ``pyfastx.Fastq``.

        Returns:
            A dict mapping each organism key to its ``pyfastx.Fastq`` object.

        Raises:
            ValueError: As soon as one of the files does not exist.
        """
        handles = {}
        for organism, entry in self.composition.items():
            read_file = entry['file']
            read_path = Path(read_file)
            if not read_path.exists():
                raise ValueError(f'File {read_path} does not exist.')
            handles[organism] = pyfastx.Fastq(read_file)

        self.logger.info('Prepared read files - proceeding')

        return handles

Exemplo n.º 15

0

Exibir arquivo

def fastq_split(args):
	"""Split the FASTQ file ``args.fastx`` into several numbered files.

	Either ``args.file_num`` (number of output files) or ``args.seq_count``
	(reads per output file) determines the chunk size. Output files are named
	``<name>.<zero-padded part number><suffix>`` and are placed in
	``args.outdir`` when it is given. A gzip input produces gzip outputs and
	keeps both of its suffixes (for example ``.fq.gz``).

	An input without reads produces no output files. Previously that case
	crashed on the final ``close()`` because no file had ever been opened.
	"""
	fq = pyfastx.Fastq(args.fastx)

	if args.file_num:
		seqs_num = math.ceil(len(fq)/args.file_num)
		parts_num = args.file_num
	else:
		seqs_num = args.seq_count
		parts_num = math.ceil(len(fq)/seqs_num)

	name, suffix1 = os.path.splitext(os.path.basename(args.fastx))

	# for gzip input suffix1 is the compression suffix and suffix2 the format
	# suffix underneath it; for flat input suffix2 stays empty
	suffix2 = ''
	if fq.is_gzip:
		name, suffix2 = os.path.splitext(name)

	# width of the zero-padded part number
	digit = len(str(parts_num))

	seq_write = 0
	fh = None
	file_num = 0

	try:
		for read in fq:
			# start a new part whenever the previous one has been completed
			if seq_write == 0:
				file_num += 1

				subfile = "{}.{}{}{}".format(name, str(file_num).zfill(digit), suffix2, suffix1)

				if args.outdir is not None:
					subfile = os.path.join(args.outdir, subfile)

				if fq.is_gzip:
					fh = gzip.open(subfile, 'wt')
				else:
					fh = open(subfile, 'w')

			fh.write("@{}\n{}\n+\n{}\n".format(read.name, read.seq, read.qual))
			seq_write += 1

			if seq_write == seqs_num:
				fh.close()
				fh = None
				seq_write = 0
	finally:
		# closes a partially filled last part, and any open part on error;
		# fh is None when there were no reads or the last part was complete
		if fh is not None:
			fh.close()

Exemplo n.º 16

0

Exibir arquivo

    def setUp(self):
        """Open the gzip fixture and collect the read names of the flat file.

        ``self.keys`` holds, for every header line, the first
        whitespace-delimited token without its leading '@'; ``self.count`` is
        the number of such names.
        """
        self.fastq = pyfastx.Fastq(gzip_fastq)

        with open(flat_fastq) as fh:
            # A quality line may itself begin with '@' (Phred+33 score 31), so
            # testing the first character alone can count quality lines as
            # headers. Only the first line of each record is considered.
            # Assumes four-line records (no wrapped sequences) -- TODO confirm
            # for the fixture in use.
            self.keys = [
                line.split()[0][1:]
                for line_no, line in enumerate(fh)
                if line_no % 4 == 0 and line.startswith('@')
            ]
        self.count = len(self.keys)

Exemplo n.º 17

0

Exibir arquivo

import sys
import pyfastx

# Print the name of every read in the FASTQ file given as the first argument.
fastq_path = sys.argv[1]
for record in pyfastx.Fastq(fastq_path):
    print(record.name)

Exemplo n.º 18

0

Exibir arquivo

Arquivo: pyfastx_fastq_build_index.py Projeto: wx904/pyfastx

import sys
import pyfastx

# Open the FASTQ file named by the first command-line argument. The resulting
# object is discarded, so the call is made only for its side effects --
# presumably building the index, going by the script name; confirm against
# the pyfastx documentation.
pyfastx.Fastq(sys.argv[1])

Exemplo n.º 19

0

Exibir arquivo

import sys
import random
import pyfastx

# Seed the generator from the first argument so a run can be repeated.
random.seed(sys.argv[1])

# Every remaining argument is a FASTQ file; for each one, write the names of
# 10000 randomly chosen reads to "<file>.list", in file order.
for fastq_path in sys.argv[2:]:
    reads = pyfastx.Fastq(fastq_path)
    chosen = set(random.sample(range(len(reads)), 10000))
    with open("{}.list".format(fastq_path), 'w') as listing:
        # positions are zero-based while read.id is shifted down by one before
        # the lookup -- presumably ids start at 1; verify against pyfastx
        listing.writelines(
            "{}\n".format(read.name)
            for read in reads
            if (read.id - 1) in chosen
        )

    print(fastq_path)

Exemplo n.º 20

0

Exibir arquivo

Arquivo: cutsite.py Projeto: koszullab/hicstuff

def cut_ligation_sites(
    fq_for, fq_rev, digest_for, digest_rev, enzyme, mode, seed_size, n_cpu
):
    """Create new reads to manage pairs with a digestion and create multiple
    pairs to take into account all the contacts present.

    The function writes two files, one for the forward and one for the reverse
    fastq, with the new reads. The new reads have ":[0-9]" added at the end of
    the ID to differentiate the different pairs created from one read.

    The function will look for all the sites present and create new pairs of
    reads according to the mode given to retrieve as much as possible of the
    HiC signal.

    Reads are accumulated into string "stacks" of up to 1000 input pairs and
    handed through a queue to a separate writer process, which is sent a stop
    token once the input is exhausted.

    Parameters
    ----------
    fq_for : str
        Path to the forward fastq file to digest.
    fq_rev : str
        Path to the reverse fastq file to digest.
    digest_for : str
        Path to the output digested forward fastq file to write.
    digest_rev : str
        Path to the output digested reverse fastq file to write.
    enzyme : str
        The list of restriction enzyme used to digest the genome separated by a
        comma. Example: HpaII,MluCI.
    mode : str
        Mode to use to make the digestion. Three values possible: "all",
        "for_vs_rev", "pile".
    seed_size : int
        Minimum size of a fragment (i.e. seed size used in mapping as reads
        smaller won't be mapped.)
    n_cpu : int
        Number of CPUs. The queue is bounded to ``max(1, n_cpu - 1)`` stacks.

    Notes
    -----
    The process exits with status 1 (after logging an error) as soon as a
    forward and a reverse read do not carry the same name.
    """
    # Process the ligation sites given
    ligation_sites = hcd.gen_enzyme_religation_regex(enzyme)

    # Defined stop_token which is used to mark the end of input file
    stop_token = "STOP"
    # A stack is a string containing multiple read pairs
    max_stack_size = 1000

    # Create count to have an idea of the digested pairs repartition.
    original_number_of_pairs = 0
    final_number_of_pairs = 0
    new_reads_for = ""
    new_reads_rev = ""
    current_stack = 0

    # Start the writer in a separate process; it consumes stacks from the
    # bounded queue until it receives the stop token.
    queue = multiprocessing.Queue(max(1, n_cpu - 1))
    writer_process = multiprocessing.Process(
        target=_writer, args=(digest_for, digest_rev, queue, stop_token)
    )
    writer_process.start()

    # Iterate on all pairs. Both files are read without an index, so every
    # item is unpacked below as a (name, sequence, quality) triple.
    for read_for, read_rev in zip(
        pyfastx.Fastq(fq_for, build_index=False),
        pyfastx.Fastq(fq_rev, build_index=False),
    ):

        # Count the numbers of original reads processed.
        original_number_of_pairs += 1

        # Count for stack size.
        current_stack += 1

        # Extract components of the reads.
        for_name, for_seq, for_qual = read_for
        rev_name, rev_seq, rev_qual = read_rev

        # Sanity check to be sure all reads are with their mate.
        if for_name != rev_name:
            # Report the unpacked names: the items are unpacked as triples
            # above, and a plain tuple has no ``id`` attribute, so the former
            # ``read_for.id`` would have raised instead of logging.
            logger.error(
                "The fastq files contains reads not sorted :\n{0}\n{1}".format(
                    for_name, rev_name
                )
            )
            sys.exit(1)

        # Cut the forward and reverse reads at the ligation sites.
        for_seq_list, for_qual_list = cutsite_read(
            ligation_sites, for_seq, for_qual, seed_size,
        )
        rev_seq_list, rev_qual_list = cutsite_read(
            ligation_sites, rev_seq, rev_qual, seed_size,
        )

        # Write the new combinations of fragments.
        new_reads_for, new_reads_rev, final_number_of_pairs = write_pair(
            new_reads_for,
            new_reads_rev,
            for_name,
            for_seq_list,
            for_qual_list,
            rev_seq_list,
            rev_qual_list,
            mode,
            final_number_of_pairs,
        )

        # If stack full, add it in the queue.
        if current_stack == max_stack_size:

            # Add the pair in the queue.
            pairs = (new_reads_for.encode(), new_reads_rev.encode())
            queue.put(pairs)

            # Empty the stack
            current_stack = 0
            new_reads_for = ""
            new_reads_rev = ""

    # End the parallel processing: flush the last (possibly partial) stack,
    # then tell the writer to stop and wait for it.
    pairs = (new_reads_for.encode(), new_reads_rev.encode())
    queue.put(pairs)
    queue.put(stop_token)
    writer_process.join()

    # Return information on the different pairs created
    logger.info(f"Library used: {fq_for} - {fq_rev}")
    logger.info(
        f"Number of pairs before digestion: {original_number_of_pairs}"
    )
    logger.info(
        f"Number of pairs after digestion: {final_number_of_pairs}"
    )

Exemplo n.º 21

0

Exibir arquivo

Arquivo: fqcnt_py6x_pyfx.py Projeto: thomasvangurp/biofast

#!/usr/bin/env python

if __name__ == "__main__":
	# Count the reads of a FASTQ file and sum sequence and quality lengths.
	import sys, re, gzip, pyfastx

	# no input path given: show the usage line and leave with status 0
	if len(sys.argv) == 1:
		print("Usage: fqcnt.py <in.fq.gz>")
		sys.exit(0)

	n_reads = 0
	total_seq = 0
	total_qual = 0
	for _name, bases, quals in pyfastx.Fastq(sys.argv[1], build_index=False):
		n_reads += 1
		total_seq += len(bases)
		# an empty or missing quality string contributes nothing
		if quals:
			total_qual += len(quals)

	print('{}\t{}\t{}'.format(n_reads, total_seq, total_qual))

Exemplo n.º 22

0

Exibir arquivo

Arquivo: test_fastq.py Projeto: wx904/pyfastx

    def test_full_name(self):
        """Iterate the flat file with ``full_name=True`` and look up each read
        in the indexed object by the first token of its name."""
        reader = pyfastx.Fastq(flat_fastq, build_index=False, full_name=True)

        for full_header, _seq, _qual in reader:
            read_id = full_header.split()[0]
            indexed_description = self.fastq[read_id].description
            # NOTE(review): assertTrue takes (expr, msg), so this only checks
            # that the name is truthy and uses the description as the failure
            # message. An equality check may have been intended -- confirm
            # before changing, since the two strings may differ in format.
            self.assertTrue(full_header, indexed_description)

Exemplo n.º 23

0

Exibir arquivo

Arquivo: pyfastx_fastq_random_access.py Projeto: wx904/pyfastx

import sys
import pyfastx
# Second argument: the FASTQ file. First argument: a text file with one read
# name per line. The sequence of each named read is printed in listing order.
reads = pyfastx.Fastq(sys.argv[2])

with open(sys.argv[1]) as name_list:
    for raw_line in name_list:
        read_name = raw_line.strip()
        record = reads[read_name]
        print(record.seq)