def setUp(self):
    """Open the test FASTQ files and parse the flat file as reference data.

    After this runs, ``self.reads`` maps a zero-based read index to
    ``[name, sequence, quality]`` and ``self.bases`` holds how often each of
    A/T/G/C/N occurs on the sequence lines of the flat file.
    """
    self.fastq = pyfastx.Fastq(gzip_fastq)

    # reload index
    self.fastq = pyfastx.Fastq(gzip_fastq)

    # flat fastq
    self.flatq = pyfastx.Fastq(flat_fastq)

    self.reads = {}
    self.bases = {'A': 0, 'T': 0, 'G': 0, 'C': 0, 'N': 0}

    with open(flat_fastq) as handle:
        for line_no, line in enumerate(handle):
            # Every record is read as four consecutive lines:
            # 0 = header, 1 = sequence, 2 = separator, 3 = quality.
            read_idx, field = divmod(line_no, 4)
            if field == 0:
                # Keep only the first word of the header, without its
                # leading marker character.
                self.reads[read_idx] = [line[1:].strip().split()[0], 0, 0]
            elif field == 1:
                self.reads[read_idx][1] = line.strip()
                for base in self.bases:
                    self.bases[base] += line.count(base)
            elif field == 3:
                self.reads[read_idx][2] = line.strip()
def test_build(self):
    """Remove any existing index file and rebuild it with ``build_index()``."""
    del self.fastq

    index_path = '{}.fxi'.format(gzip_fastq)
    if os.path.exists(index_path):
        os.remove(index_path)

    # Open without an index first, then build it explicitly.
    unindexed = pyfastx.Fastq(gzip_fastq, build_index=False)
    unindexed.build_index()

    self.fastq = pyfastx.Fastq(gzip_fastq)
def load_seqfile(infile):
    """Open ``infile`` with pyfastx according to its file-name suffix.

    Files ending with ``FASTA_SUFFIX`` are opened with ``pyfastx.Fasta`` and
    files ending with ``FASTQ_SUFFIX`` with ``pyfastx.Fastq``. ``build_index``
    is True only when ``<infile>.fxi`` does not exist yet.

    Args:
        infile: path of the sequence file, as a string.

    Returns:
        The opened ``pyfastx.Fasta`` or ``pyfastx.Fastq`` object.

    Raises:
        ValueError: if ``infile`` has neither a FASTA nor a FASTQ suffix
            (previously this surfaced as an UnboundLocalError).
    """
    if infile.endswith(FASTA_SUFFIX):
        opener = pyfastx.Fasta
    elif infile.endswith(FASTQ_SUFFIX):
        opener = pyfastx.Fastq
    else:
        raise ValueError(
            "Unsupported sequence file suffix: {}".format(infile)
        )

    # NOTE(review): build_index=False is passed when the index file already
    # exists -- confirm that pyfastx loads the existing index in that case.
    index_missing = not os.path.exists(infile + ".fxi")
    return opener(infile, build_index=index_missing)
def _print_composition(composition):
    """Print one ``"<base> counts: <n>"`` line per entry of ``composition``."""
    for base in composition:
        print("{} counts: {}".format(base, composition[base]))


def fastx_info(args):
    """Print summary statistics for the sequence file ``args.fastx``.

    The file type is determined with ``fastx_format_check``; FASTA and FASTQ
    files get different reports. Nothing is printed for any other type.
    """
    kind = fastx_format_check(args.fastx)

    if kind == 'fasta':
        fasta = pyfastx.Fasta(args.fastx)
        composition = fasta.composition
        print("Sequence counts: {}".format(len(fasta)))
        print("Total bases: {}".format(fasta.size))
        print("GC content: {:.2f}%".format(fasta.gc_content))
        _print_composition(composition)
        print("Mean length: {:.2f}".format(fasta.mean))
        print("Median length: {:.2f}".format(fasta.median))
        print("Max length: {}".format(len(fasta.longest)))
        print("Min length: {}".format(len(fasta.shortest)))
        print("N50, L50: {}, {}".format(*fasta.nl()))
        print("length >= 1000: {}".format(fasta.count(1000)))

    elif kind == 'fastq':
        fastq = pyfastx.Fastq(args.fastx)
        composition = fastq.composition
        print("Read counts: {}".format(len(fastq)))
        print("Total bases: {}".format(fastq.size))
        print("GC content: {:.2f}%".format(fastq.gc_content))
        _print_composition(composition)
        print("Quality encoding system maybe: {}".format(", ".join(fastq.encoding_type)))
def get_output_handle(fpath: str, fastx: bool = False, out: bool = True):
    """Return a handle for ``fpath``.

    Args:
        fpath: file path, or ``"-"`` for a standard stream.
        fastx: when True, open the path with pyfastx instead of as a text
            file; paths ending in ``"a"`` are opened as FASTA, everything
            else as FASTQ.
        out: only used when ``fpath`` is ``"-"``: True selects
            ``sys.stdout``, False selects ``sys.stdin``.

    Returns:
        ``sys.stdout``/``sys.stdin``, a ``pyfastx.Fasta``/``pyfastx.Fastq``
        object, or a text file opened for writing.

    Raises:
        NotADirectoryError: if the parent directory of ``fpath`` does not
            exist.
    """
    if fpath == "-":
        return sys.stdout if out else sys.stdin

    p = Path(fpath)
    if not p.parent.is_dir():
        raise NotADirectoryError(
            "Directory specified for output file does not exist: {}".format(
                p.parent
            )
        )

    if not fastx:
        return p.open("w")

    # Hand pyfastx a plain string rather than a Path object, as the other
    # pyfastx call sites in this file do.
    reader = pyfastx.Fasta if fpath.endswith("a") else pyfastx.Fastq
    return reader(str(p))
def run(args, name):
    """Extract the run id and barcode from the first read and store them.

    The description of the first read in ``args.inputFile`` is split on
    ``':'``; the first three fields form the run id and the last field is
    taken as the barcode. Both values are added to a ``storage.JSON`` object,
    which is pretty-printed, written to ``args.outputDir`` and logged.

    Args:
        args: namespace providing ``inputFile``, ``JSON``, ``sample``,
            ``name`` and ``outputDir``.
        name: not used by this function (results are stored under
            ``args.name``).

    Raises:
        ValueError: if the FASTQ file contains no reads (previously this
            surfaced as an UnboundLocalError).
    """
    # Only the first read is needed, so stop after one iteration.
    for read in pyfastx.Fastq(args.inputFile):
        fields = read.description.split(':')
        runid = f"{fields[0].replace('@', '')}_0{fields[1]}_A{fields[2]}"
        barcode = fields[-1]
        break
    else:
        raise ValueError(
            "No reads found in FASTQ file: {}".format(args.inputFile)
        )

    # list of tuples of input for JSON
    results = [("runID", runid), ("barcode", barcode)]

    # if JSON is present use existing, else create new unique name
    # NOTE(review): JSON.open() is called even when args.JSON is falsy --
    # confirm this is what storage.JSON expects.
    JSON = storage.JSON()
    if not args.JSON:
        JSON.name(args.sample)
    JSON.open(args.JSON)
    JSON.add_results(args.name, results)
    JSON.pretty_print()
    JSON.write(args.outputDir)
    logging.info(results)
def fastq_sample(args):
    """Write a random subset of the reads in ``args.fastx`` in FASTQ format.

    The subset size comes from ``args.num`` (a positive read count, capped
    at the number of reads in the file) or, failing that, from ``args.prop``
    (a proportion in ``(0, 1]``). Output goes to ``args.outfile`` or, when
    that is None, to standard output.

    Raises:
        RuntimeError: if neither a valid number nor a valid proportion is
            given, or if the proportion rounds to zero reads.
    """
    fq = pyfastx.Fastq(args.fastx)
    total = len(fq)

    if args.num is not None and args.num > 0:
        seq_num = min(args.num, total)
    # The original tested ``args.num is not None`` here, which made the
    # proportion branch unreachable whenever only a proportion was given.
    elif args.prop is not None and 0 < args.prop <= 1:
        seq_num = round(total * args.prop)
        if seq_num == 0:
            raise RuntimeError("the proportion is too small")
    else:
        raise RuntimeError("specify a right number for seq number or proportion")

    selected = random.sample(range(total), k=seq_num)

    to_stdout = args.outfile is None
    fw = sys.stdout if to_stdout else open(args.outfile, 'w')
    try:
        for idx in selected:
            r = fq[idx]
            fw.write("@{}\n{}\n+\n{}\n".format(r.name, r.seq, r.qual))
    finally:
        # stdout is only flushed; a real output file is closed even when
        # writing fails part-way.
        if to_stdout:
            fw.flush()
        else:
            fw.close()
def test_iter_tuple(self):
    """Index-free iteration yields (name, seq, qual) matching ``self.reads``."""
    reader = pyfastx.Fastq(flat_fastq, build_index=False)
    for position, (name, seq, qual) in enumerate(reader):
        expected = self.reads[position]
        self.assertEqual(name, expected[0])
        self.assertEqual(seq, expected[1])
        self.assertEqual(qual, expected[2])
def create_fastx_index(fastx):
    """Open ``fastx`` with pyfastx and pair it with its read builder.

    Returns:
        ``(pyfastx.Fasta, build_read_fasta)`` when ``is_fasta(fastx)`` holds,
        ``(pyfastx.Fastq, build_read_fastq)`` when ``is_fastq(fastx)`` holds.

    Raises:
        ValueError: if the file is recognised as neither format.
    """
    if is_fasta(fastx):
        fasta = pyfastx.Fasta(str(fastx), build_index=True)
        return fasta, build_read_fasta

    if is_fastq(fastx):
        fastq = pyfastx.Fastq(str(fastx), build_index=True)
        return fastq, build_read_fastq

    raise ValueError(f'Could not determine input file format: {fastx}')
def build_index(infile):
    """Create the ``.fxi`` index for ``infile`` unless it already exists.

    Files with a FASTA suffix (``.fa``, ``.fasta``, optionally ``.gz``) are
    opened with ``pyfastx.Fasta``; every other file with ``pyfastx.Fastq``.
    Progress messages are printed to standard output.
    """
    if os.path.exists(infile + ".fxi"):
        print("fxi index is present")
        return

    print("buliding fxi index for {}".format(infile))

    fasta_suffixes = (".fa", ".fa.gz", ".fasta", ".fasta.gz")
    opener = pyfastx.Fasta if infile.endswith(fasta_suffixes) else pyfastx.Fastq
    opener(infile)

    print("fxi index has been created for {}".format(infile))
def test_exception(self):
    """Invalid paths, indexes and keys raise the expected exception types."""
    # Opening a path that does not exist.
    self.assertRaises(
        FileExistsError, pyfastx.Fastq, 'a_fastq_file_not_exists'
    )

    # Integer index one past the last read.
    self.assertRaises(
        IndexError, lambda: self.fastq[len(self.fastq)]
    )

    # Subscript that is neither an integer nor a string.
    self.assertRaises(KeyError, lambda: self.fastq[int])

    # Read name that is not in the file.
    self.assertRaises(KeyError, lambda: self.fastq['abc'])
def create_fastx_index(fastx: Path) -> (pyfastx.Fasta, Path):
    """Open ``fastx`` with pyfastx and return it with its index path.

    Returns:
        A ``(reader, index_path)`` tuple where ``reader`` is a
        ``pyfastx.Fasta`` or ``pyfastx.Fastq`` object and ``index_path`` is
        the input path with ``.fxi`` appended.

    Raises:
        ValueError: if the file is recognised as neither FASTA nor FASTQ.
    """
    if is_fasta(fastx):
        reader_cls = pyfastx.Fasta
    elif is_fastq(fastx):
        reader_cls = pyfastx.Fastq
    else:
        raise ValueError(
            f'Could not determine input file format: {fastx}'
        )

    source = str(fastx)
    return reader_cls(source, build_index=True), Path(source + '.fxi')
def fastx_fq2fa(args):
    """Convert the FASTQ file ``args.fastx`` to FASTA.

    Each read is written as ``>name`` followed by its sequence. Output goes
    to ``args.outfile`` when it is set, otherwise to standard output.
    """
    fq = pyfastx.Fastq(args.fastx)

    fh = open(args.outfile, 'w') if args.outfile else sys.stdout
    try:
        for read in fq:
            fh.write(">{}\n{}\n".format(read.name, read.seq))
    finally:
        # Close a real output file even when iteration or writing fails;
        # stdout is only flushed.
        if args.outfile:
            fh.close()
        else:
            fh.flush()
def prepare_fastq(self) -> dict:
    """ Checks file paths of input files and creates indices """
    indexed = {}

    for organism, data in self.composition.items():
        source = data['file']
        file_path = Path(source)

        # Fail on the first missing input file.
        if not file_path.exists():
            raise ValueError(f'File {file_path} does not exist.')

        indexed[organism] = pyfastx.Fastq(source)

    self.logger.info('Prepared read files - proceeding')
    return indexed
def fastq_split(args):
    """Split the FASTQ file ``args.fastx`` into several smaller files.

    With ``args.file_num`` set, the reads are spread over that many files;
    otherwise each file receives ``args.seq_count`` reads. Output files are
    named ``<name>.<part><suffix>``, with the part number zero-padded, and
    are written into ``args.outdir`` when it is given. Gzip input produces
    gzip output.

    An input without reads produces no output files (previously this ended
    in an AttributeError from closing a handle that was never opened).
    """
    fq = pyfastx.Fastq(args.fastx)

    if args.file_num:
        seqs_num = math.ceil(len(fq) / args.file_num)
        parts_num = args.file_num
    else:
        seqs_num = args.seq_count
        parts_num = math.ceil(len(fq) / seqs_num)

    # For "reads.fq.gz": name="reads", suffix2=".fq", suffix1=".gz".
    name, suffix1 = os.path.splitext(os.path.basename(args.fastx))
    suffix2 = ""
    if fq.is_gzip:
        name, suffix2 = os.path.splitext(name)

    digit = len(str(parts_num))
    seq_write = 0
    file_num = 0
    fh = None  # currently open part file, if any

    try:
        for read in fq:
            if fh is None:
                file_num += 1
                subfile = "{}.{}{}{}".format(
                    name, str(file_num).zfill(digit), suffix2, suffix1
                )
                if args.outdir is not None:
                    subfile = os.path.join(args.outdir, subfile)

                if fq.is_gzip:
                    fh = gzip.open(subfile, 'wt')
                else:
                    fh = open(subfile, 'w')

            fh.write("@{}\n{}\n+\n{}\n".format(read.name, read.seq, read.qual))
            seq_write += 1

            if seq_write == seqs_num:
                fh.close()
                fh = None
                seq_write = 0
    finally:
        # Close a partially filled last part, also when an error occurs.
        if fh is not None:
            fh.close()
def setUp(self):
    """Open the gzip FASTQ and collect the read names of the flat FASTQ.

    ``self.keys`` lists the first word of every header line (without the
    leading '@') and ``self.count`` is the number of reads found.
    """
    self.fastq = pyfastx.Fastq(gzip_fastq)

    with open(flat_fastq) as fh:
        # Only every fourth line can be a header; checking the position as
        # well as the '@' prefix keeps quality lines that start with '@'
        # from being counted as reads.
        # Assumes four-line FASTQ records -- TODO confirm for the test data.
        self.keys = [
            line.split()[0][1:]
            for line_no, line in enumerate(fh)
            if line_no % 4 == 0 and line.startswith('@')
        ]

    self.count = len(self.keys)
import sys

import pyfastx

# Print the name of every read in the FASTQ file given as first argument.
fastq_path = sys.argv[1]

for record in pyfastx.Fastq(fastq_path):
    print(record.name)
import sys

import pyfastx

# Open the FASTQ file given as first argument; nothing else is done with it.
fastq_path = sys.argv[1]
pyfastx.Fastq(fastq_path)
import sys
import random

import pyfastx

# Usage: <script> SEED FASTQ [FASTQ ...]
# For every FASTQ file, write the names of up to 10000 randomly chosen reads
# to "<fastq>.list" and print the FASTQ path when done.
random.seed(sys.argv[1])

for fqfile in sys.argv[2:]:
    fq = pyfastx.Fastq(fqfile)

    # random.sample() raises ValueError when asked for more items than the
    # population holds, so cap the sample size for small files.
    sample_size = min(10000, len(fq))
    samples = set(random.sample(range(len(fq)), sample_size))

    with open("{}.list".format(fqfile), 'w') as fw:
        for r in fq:
            # samples holds zero-based positions; r.id is shifted by one to
            # match (presumably ids are 1-based -- confirm against pyfastx).
            if (r.id - 1) in samples:
                fw.write("{}\n".format(r.name))

    print(fqfile)
def cut_ligation_sites(
    fq_for, fq_rev, digest_for, digest_rev, enzyme, mode, seed_size, n_cpu
):
    """Create new reads to manage pairs with a digestion and create multiple
    pairs to take into account all the contacts present.

    The function writes two files, one for the forward and one for the
    reverse fastq, with the new reads. The new reads have ":[0-9]" added at
    the end of the ID to differentiate the different pairs created from one
    read.

    The function will look for all the sites present and create new pairs of
    reads according to the mode given, to retrieve as much as possible of the
    HiC signal.

    Parameters
    ----------
    fq_for : str
        Path to the forward fastq file to digest.
    fq_rev : str
        Path to the reverse fastq file to digest.
    digest_for : str
        Path to the output digested forward fastq file to write.
    digest_rev : str
        Path to the output digested reverse fastq file to write.
    enzyme : str
        The list of restriction enzymes used to digest the genome, separated
        by a comma. Example: HpaII,MluCI.
    mode : str
        Mode to use to make the digestion. Three values possible: "all",
        "for_vs_rev", "pile".
    seed_size : int
        Minimum size of a fragment (i.e. seed size used in mapping, as
        smaller reads won't be mapped.)
    n_cpu : int
        Number of CPUs.
    """
    # Process the ligation sites given
    ligation_sites = hcd.gen_enzyme_religation_regex(enzyme)

    # Define stop_token, which is used to mark the end of the input file
    stop_token = "STOP"
    # A stack is a string containing multiple read pairs
    max_stack_size = 1000

    # Create counts to have an idea of the digested pairs repartition.
    original_number_of_pairs = 0
    final_number_of_pairs = 0
    new_reads_for = ""
    new_reads_rev = ""
    current_stack = 0

    # Start a separate writer process that consumes stacks from the queue.
    queue = multiprocessing.Queue(max(1, n_cpu - 1))
    writer_process = multiprocessing.Process(
        target=_writer, args=(digest_for, digest_rev, queue, stop_token)
    )
    writer_process.start()

    # Iterate on all pairs.
    # NOTE(review): zip() stops at the shorter input, so files with a
    # different number of reads are not reported -- confirm this is intended.
    for read_for, read_rev in zip(
        pyfastx.Fastq(fq_for, build_index=False),
        pyfastx.Fastq(fq_rev, build_index=False),
    ):

        # Count the numbers of original reads processed.
        original_number_of_pairs += 1

        # Count for stack size.
        current_stack += 1

        # Extract components of the reads.
        for_name, for_seq, for_qual = read_for
        rev_name, rev_seq, rev_qual = read_rev

        # Sanity check to be sure all reads are with their mate.
        if for_name != rev_name:
            # The reads are plain tuples here, so report the unpacked names
            # instead of accessing a non-existent ``.id`` attribute.
            logger.error(
                "The fastq files contains reads not sorted :\n{0}\n{1}".format(
                    for_name, rev_name
                )
            )
            # Tell the writer to stop before exiting; otherwise it keeps
            # waiting on the queue while the interpreter waits for it.
            queue.put(stop_token)
            writer_process.join()
            sys.exit(1)

        # Cut the forward and reverse reads at the ligation sites.
        for_seq_list, for_qual_list = cutsite_read(
            ligation_sites,
            for_seq,
            for_qual,
            seed_size,
        )
        rev_seq_list, rev_qual_list = cutsite_read(
            ligation_sites,
            rev_seq,
            rev_qual,
            seed_size,
        )

        # Write the new combinations of fragments.
        new_reads_for, new_reads_rev, final_number_of_pairs = write_pair(
            new_reads_for,
            new_reads_rev,
            for_name,
            for_seq_list,
            for_qual_list,
            rev_seq_list,
            rev_qual_list,
            mode,
            final_number_of_pairs,
        )

        # If the stack is full, add it to the queue.
        if current_stack == max_stack_size:

            # Add the pairs to the queue.
            pairs = (new_reads_for.encode(), new_reads_rev.encode())
            queue.put(pairs)

            # Empty the stack
            current_stack = 0
            new_reads_for = ""
            new_reads_rev = ""

    # End the parallel processing: flush the last (partial) stack, then
    # signal the writer to stop and wait for it.
    pairs = (new_reads_for.encode(), new_reads_rev.encode())
    queue.put(pairs)
    queue.put(stop_token)
    writer_process.join()

    # Return information on the different pairs created
    logger.info(f"Library used: {fq_for} - {fq_rev}")
    logger.info(
        f"Number of pairs before digestion: {original_number_of_pairs}"
    )
    logger.info(
        f"Number of pairs after digestion: {final_number_of_pairs}"
    )
#!/usr/bin/env python
# Count the reads in a FASTQ file and sum their sequence and quality lengths.
if __name__ == "__main__":
    import sys
    import re
    import gzip
    import pyfastx

    if len(sys.argv) == 1:
        print("Usage: fqcnt.py <in.fq.gz>")
        sys.exit(0)

    n = slen = qlen = 0
    for name, seq, qual in pyfastx.Fastq(sys.argv[1], build_index=False):
        n += 1
        slen += len(seq)
        # A missing or empty quality string contributes nothing.
        qlen += len(qual) if qual else 0

    print('{}\t{}\t{}'.format(n, slen, qlen))
def test_full_name(self):
    """Iterate with ``full_name=True`` and look each read up by its first word."""
    reader = pyfastx.Fastq(flat_fastq, build_index=False, full_name=True)

    for header, _seq, _qual in reader:
        read_id = header.split()[0]
        description = self.fastq[read_id].description
        # NOTE(review): assertTrue treats its second argument as the failure
        # message, so this only checks that the header is truthy -- confirm
        # whether a comparison with the description was intended.
        self.assertTrue(header, description)
import sys

import pyfastx

# Usage: <script> NAME_LIST FASTQ
# Print the sequence of every read whose name is listed, one name per line,
# in NAME_LIST.
fq = pyfastx.Fastq(sys.argv[2])

with open(sys.argv[1]) as names:
    for raw_name in names:
        read_name = raw_name.strip()
        print(fq[read_name].seq)