def _write_missed_reads(miss, fastq, outfile, suffix):
    """
    Copy the reads of one fastq file whose base ID is in miss to outfile.

    The base ID is the short sequence ID cut at its last occurrence of
    suffix, with the first '@' removed. A dot is written to stderr every
    100,000 reads as a progress indicator.

    :param miss: the read IDs to write out
    :param fastq: the fastq file to read
    :param outfile: the fastq file to create
    :param suffix: the mate suffix to cut from each ID ('.1' or '.2')
    :raises ValueError: if a sequence ID does not contain suffix
    :return: None
    """
    with open(outfile, 'w') as out:
        for c, (sid, allid, seq, qual) in enumerate(stream_fastq(fastq), start=1):
            if not c % 100000:
                sys.stderr.write(".")
                sys.stderr.flush()
            test = sid[:sid.rindex(suffix)].replace('@', '', 1)
            if test in miss:
                out.write("@{}\n{}\n+\n{}\n".format(allid, seq, qual))


def print_reads(miss, fq1, fq2):
    """
    Print the missing reads from the two fastq files

    The base name is parsed from fq1, which must contain
    <base>_pass_1.fastq (optionally preceded by a directory). The reads are
    written to <base>_missed_1.fastq and <base>_missed_2.fastq in the
    current working directory. The process exits with status -1 if the base
    name can't be parsed or if either output file already exists.

    :param miss: the read IDs (no leading '@', no .1/.2 suffix) to write
    :param fq1: the first fastq file (IDs end .1)
    :param fq2: the second fastq file (IDs end .2)
    :return: None
    """
    # raw string with an escaped dot; (?:^|/) also accepts a bare file name
    bn = re.search(r'(?:^|/)(\w+)_pass_1\.fastq', fq1)
    if not bn:
        sys.stderr.write(f"Can't parse the base filename from {fq1}\n")
        sys.exit(-1)

    fqo1 = bn.group(1) + "_missed_1.fastq"
    fqo2 = bn.group(1) + "_missed_2.fastq"
    # check both outputs before writing anything so we never clobber a file
    for fqo in (fqo1, fqo2):
        if os.path.exists(fqo):
            sys.stderr.write(f"Not overwrting {fqo}\n")
            sys.exit(-1)

    sys.stderr.write("Finding reads from {}\n".format(fq1))
    _write_missed_reads(miss, fq1, fqo1, ".1")
    sys.stderr.write("\nFinding reads from {}\n".format(fq2))
    _write_missed_reads(miss, fq2, fqo2, ".2")
    sys.stderr.write("\n")
def filter_fastq(fqf, br, matchout=None, nomatchout=None, verbose=False):
    """
    Filter the fastq file and print out matches or no matches

    Each read whose short ID is in br counts as a match, every other read
    as a non-match. Either output file may be omitted, in which case those
    reads are counted but not written. A summary is written to stderr.

    :param fqf: The fastq file to filter
    :param br: the set of query blast results
    :param matchout: The file to write matches to (optional)
    :param nomatchout: the file to write no matches to (optional)
    :param verbose: more output (currently unused)
    :return: nothing
    """
    from contextlib import ExitStack

    matches = 0
    nonmatches = 0
    # ExitStack closes whichever of the two outputs were opened, even on error
    with ExitStack() as stack:
        mo = stack.enter_context(open(matchout, 'w')) if matchout else None
        nmo = stack.enter_context(open(nomatchout, 'w')) if nomatchout else None
        for sid, allid, seq, qual in stream_fastq(fqf):
            if sid in br:
                matches += 1
                if mo is not None:
                    mo.write(f"@{allid}\n{seq}\n+\n{qual}\n")
            else:
                nonmatches += 1
                if nmo is not None:
                    nmo.write(f"@{allid}\n{seq}\n+\n{qual}\n")
    sys.stderr.write(f"{bcolors.GREEN}FINISHED:{bcolors.ENDC} Sequences Matched: {matches} Sequences without match {nonmatches}\n")
def fq_ids(fqdir, verbose=False):
    """
    Map every read id in the fastq files of a directory to its file.

    Only directory entries whose name ends in 'fastq' are read. The value
    stored for an id is the file name with '.fastq' removed. Ids containing
    spaces are stored a second time with the spaces replaced by '_'. A
    warning is written to stderr when an id appears in two different files;
    the later file wins.

    :param fqdir: directory of fastq files
    :param verbose: report each file on stderr as it is read
    :return: a dict of ids
    """
    if verbose:
        sys.stderr.write("Reading fastq files\n")

    fqids = {}
    for fname in os.listdir(fqdir):
        if not fname.endswith('fastq'):
            continue
        if verbose:
            sys.stderr.write("\t{}\n".format(fname))
        source = fname.replace('.fastq', '')
        for _, fullid, _, _ in stream_fastq(os.path.join(fqdir, fname)):
            if fullid in fqids:
                if fqids[fullid] == source:
                    # repeated id within the same file: nothing new to record
                    continue
                sys.stderr.write(
                    "WARNING: {} is not a unique id. It is in {} and {}\n".format(
                        fullid, fqids[fullid], fname))
            fqids[fullid] = source
            if ' ' in fullid:
                fqids[fullid.replace(' ', '_')] = source
    return fqids
def fq_ids(fqdir, verbose=False):
    """
    Build a lookup from fastq read id to the fastq file that contains it.

    Every entry of fqdir whose name ends in 'fastq' is streamed. Each full
    id maps to the file name without '.fastq'; ids with spaces are also
    stored with the spaces turned into underscores. If an id was already
    recorded for a different file a warning goes to stderr and the newer
    file replaces the older one.

    :param fqdir: directory of fastq files
    :param verbose: write progress to stderr
    :return: a dict of ids
    """
    if verbose:
        sys.stderr.write("Reading fastq files\n")

    fqids = {}
    fastq_names = (name for name in os.listdir(fqdir) if name.endswith('fastq'))
    for fqf in fastq_names:
        if verbose:
            sys.stderr.write("\t{}\n".format(fqf))
        stem = fqf.replace('.fastq', '')
        for seqid, fullid, seq, qual in stream_fastq(os.path.join(fqdir, fqf)):
            known = fqids.get(fullid)
            if known == stem:
                # already recorded for this very file
                continue
            if known is not None:
                sys.stderr.write("WARNING: {} is not a unique id. It is in {} and {}\n".format(fullid, known, fqf))
            fqids[fullid] = stem
            if ' ' in fullid:
                underscored = fullid.replace(' ', '_')
                fqids[underscored] = stem
    return fqids
def count_kmers(fqf, kmer, verbose=False):
    """
    Count the frequency of bases at each of the last kmer positions of the reads

    Each read is reversed and its first kmer bases (i.e. the last kmer bases
    of the original read) are tallied; the table is reversed before it is
    returned, so the final row corresponds to the last base of the reads.
    Reads containing N or n are skipped. Reads shorter than kmer only
    contribute to the positions they actually cover.

    :param fqf: fastq file
    :param kmer: length to count
    :param verbose: more output
    :return: a list of kmer rows, each a list of four counts. The column for
        a base is looked up in the module-level mapping k, which is defined
        outside this function (presumably base -> 0..3; confirm there).
    """
    if verbose:
        sys.stderr.write(f"{bcolors.GREEN}Reading {fqf}{bcolors.ENDC}\n")

    counts = [[0, 0, 0, 0] for _ in range(kmer)]
    for sid, seqid, seq, qual in stream_fastq(fqf):
        if 'N' in seq or 'n' in seq:
            continue
        # reverse the sequence, and slice so short reads can't index past the end
        tail = seq[::-1][:kmer]
        for x, nucleotide in enumerate(tail):
            try:
                counts[x][k[nucleotide]] += 1
            except KeyError as e:
                base = e.args[0]
                if base.upper() != "N":
                    sys.stderr.write(
                        f'{bcolors.PINK}Unknown base {base}{bcolors.ENDC}\n')
    counts.reverse()
    return counts
def read_fastqs(fastqdir, fname, seqids, verbose=True):
    """
    Read the fastq files and store the sequences we want to save

    Every file in fastqdir whose name contains fname is streamed. A read is
    kept when its short ID is '@<seqid>.1' (left read) or '@<seqid>.2'
    (right read) for one of the requested seqids.

    :param fastqdir: the directory with fastq files
    :param fname: the likely filename
    :param seqids: the seqids we want to save
    :param verbose: more output
    :return : a dict of seqids: left, left qual, right, right qual. Entries
        stay None for reads that were not found.
    """
    seqs = {}
    # fastq ID -> (seqid, index of the sequence slot); the quality score goes
    # in the following slot. A direct lookup replaces the old regex.
    wanted = {}
    for s in seqids:
        seqs[s] = [None, None, None, None]
        wanted[f"@{s}.1"] = (s, 0)
        wanted[f"@{s}.2"] = (s, 2)

    for f in os.listdir(fastqdir):
        if fname not in f:
            continue
        path = os.path.join(fastqdir, f)
        if verbose:
            sys.stderr.write("Reading {}\n".format(path))
        for seqid, header, seq, qualscores in stream_fastq(path):
            hit = wanted.get(seqid)
            if hit is None:
                continue
            s, slot = hit
            seqs[s][slot] = seq
            seqs[s][slot + 1] = qualscores
    return seqs
def filter_fastq(fqf, br, matchout=None, nomatchout=None, verbose=False):
    """
    Filter the fastq file and print out matches or no matches

    Reads whose short ID is in br are matches; all others are non-matches.
    An output file that is not supplied is simply not written, but its reads
    are still counted. The totals are reported on stderr at the end.

    :param fqf: The fastq file to filter
    :param br: the set of query blast results
    :param matchout: The file to write matches to (optional)
    :param nomatchout: the file to write no matches to (optional)
    :param verbose: more output (currently unused)
    :return: nothing
    """
    matches = 0
    nonmatches = 0
    mo = None
    nmo = None
    try:
        # only open the outputs that were requested; open(None) is an error
        if matchout:
            mo = open(matchout, 'w')
        if nomatchout:
            nmo = open(nomatchout, 'w')
        for sid, allid, seq, qual in stream_fastq(fqf):
            if sid in br:
                matches += 1
                if mo is not None:
                    mo.write(f"@{allid}\n{seq}\n+\n{qual}\n")
            else:
                nonmatches += 1
                if nmo is not None:
                    nmo.write(f"@{allid}\n{seq}\n+\n{qual}\n")
    finally:
        # close whatever was opened, even if reading or writing failed
        for handle in (mo, nmo):
            if handle is not None:
                handle.close()
    sys.stderr.write(
        f"{bcolors.GREEN}FINISHED:{bcolors.ENDC} Sequences Matched: {matches} Sequences without match {nonmatches}\n"
    )
def countcpgs(fqfile):
    """
    Build a histogram of the number of CG dinucleotides per read.

    :param fqfile: the fastq file
    :return: a dict mapping a 'CG' count to the number of reads whose
        sequence has that many (non-overlapping, case-sensitive) matches
    """
    histogram = {}
    for _seqid, _header, seq, _qual in stream_fastq(fqfile):
        n_cg = seq.count('CG')
        if n_cg in histogram:
            histogram[n_cg] += 1
        else:
            histogram[n_cg] = 1
    return histogram
def write_sequences(reads, outdir, leftfq, rightfq, singlefq=None, verbose=False):
    """
    Write the sequences out to a file

    For every read pair whose ID is in reads, the pair is appended to
    <bin>.R1.fastq / <bin>.R2.fastq in outdir for each bin listed for that
    ID. Single reads, if given, go to <bin>.single.fastq. Output files are
    created on first use and are always closed, even if an error occurs.

    :param reads: the dict of reads and bins (read ID -> iterable of bin names)
    :param outdir: the output dir to write to (created if needed)
    :param leftfq: the left reads
    :param rightfq: the right reads
    :param singlefq: the single reads (optional)
    :param verbose: more output
    :return: None
    """
    from contextlib import ExitStack

    if verbose:
        sys.stderr.write(f"{bcolors.GREEN}Writing sequences\n{bcolors.ENDC}")

    # makedirs also copes with nested output paths and existing directories
    os.makedirs(outdir, exist_ok=True)

    files = {}
    singlefiles = {}
    psc = 0
    sc = 0
    # every handle is registered with the stack as soon as it is opened, so
    # nothing leaks if a later open() or write() fails
    with ExitStack() as stack:
        for seqid, header1, seq1, qualscores1, header2, seq2, qualscores2 in stream_paired_fastq(leftfq, rightfq):
            if seqid not in reads:
                continue
            for clst in reads[seqid]:
                if clst not in files:
                    files[clst] = [
                        stack.enter_context(open(os.path.join(outdir, clst + ".R1.fastq"), 'w')),
                        stack.enter_context(open(os.path.join(outdir, clst + ".R2.fastq"), 'w'))
                    ]
                files[clst][0].write(f"@{header1}\n{seq1}\n+\n{qualscores1}\n")
                files[clst][1].write(f"@{header2}\n{seq2}\n+\n{qualscores2}\n")
                psc += 1

        if singlefq:
            for seqid, header, seq, qualscores in stream_fastq(singlefq):
                if seqid not in reads:
                    continue
                for clst in reads[seqid]:
                    if clst not in singlefiles:
                        singlefiles[clst] = stack.enter_context(
                            open(os.path.join(outdir, clst + ".single.fastq"), 'w'))
                    singlefiles[clst].write(f"@{header}\n{seq}\n+\n{qualscores}\n")
                    sc += 1

    if verbose:
        sys.stderr.write(f"{bcolors.GREEN}Wrote {psc} paired end sequences and {sc} single reads\n{bcolors.ENDC}")
def split_fastq(fqf, outdir, frac, verbose=False): """ Split a fastq file :param fqf: fastq file :param outdir: output directory to write all the files to :param frac: fraction of the sequence for each end :param verbose: more output :return: nothing """ if not os.path.exists(outdir): os.path.mkdir(outdir) for seqid, header, seq, qual in stream_fastq(fqf): with open (os.path.join(outdir, seq + ".left.fna"), 'w') as out:
def read_fastq(fqfile, blast, verbose=False):
    """
    Print, in fastq format, every read that is not listed in blast.

    A read is dropped when either its short ID (with any leading '@'
    removed) or its full ID is in blast.

    :param fqfile: The fastq file
    :param blast: the blast reads that matched (ie. reads to delete)
    :param verbose: more output (currently unused)
    :return: None
    """
    for seqid, fullid, seq, qual in stream_fastq(fqfile):
        bare_id = seqid[1:] if seqid.startswith('@') else seqid
        if bare_id not in blast and fullid not in blast:
            print("@{}\n{}\n+\n{}".format(fullid, seq, qual))
def count_kmers(faf, type, k, jsonout=None, verbose=False):
    """
    Count the kmers

    Every k-mer of every sequence and of its reverse complement is counted,
    including the last k-mer that ends on the final base. Sequences shorter
    than k contribute nothing. Any type other than "fasta" or "fastq" reads
    no sequences and yields an empty dict.

    :param faf: fasta file
    :param type: str either fasta or fastq
    :param k: kmer size
    :param jsonout: optional file to write the counts to, as {faf: kmers}
    :param verbose: more output
    :return: a dict of kmers
    """
    if verbose:
        sys.stderr.write(f"{bcolors.GREEN}Counting kmers (k={k}) in {faf}\n")

    # both formats are counted the same way; only the record shape differs
    if type == "fasta":
        sequences = (seq for _, seq in stream_fasta(faf))
    elif type == "fastq":
        sequences = (seq for _, _, seq, _ in stream_fastq(faf))
    else:
        sequences = ()

    kmers = {}
    for seq in sequences:
        rcseq = rc(seq)
        # a sequence of length L has L - k + 1 k-mers, at offsets 0 .. L - k
        for posn in range(len(seq) - k + 1):
            fwd = seq[posn:posn + k]
            rev = rcseq[posn:posn + k]
            kmers[fwd] = kmers.get(fwd, 0) + 1
            kmers[rev] = kmers.get(rev, 0) + 1

    if jsonout:
        if verbose:
            sys.stderr.write(f"{bcolors.BLUE}\tWriting to {jsonout}\n")
        with open(jsonout, 'w') as out:
            json.dump({faf: kmers}, out)

    if verbose:
        sys.stderr.write(
            f"{bcolors.BLUE}\tDone counting kmers (k={k}) in {faf}\n")

    return kmers
def fq_ids(fnames):
    """
    Map every fastq read id in the given files to the file it came from.

    :param fnames: a list of files
    :return: a dict of ids; each full id is stored both as-is and with its
        spaces replaced by underscores
    """
    res = {}
    for fname in fnames:
        for _, fullid, _, _ in stream_fastq(fname):
            # keep several versions of the id as phylosift does some munging on them
            for key in (fullid, fullid.replace(' ', '_')):
                res[key] = fname
    return res
def extract_fastq(fqf, reads, verbose):
    """
    Print, in fastq format, the reads whose ID is not in reads.

    A leading '@' is removed from the short ID before the lookup.

    :param fqf: fastq file
    :param reads: set of reads to ignore
    :param verbose: report on stderr whether each read is kept or skipped
    :return: None
    """
    for sid, label, seq, qual in stream_fastq(fqf):
        if sid.startswith('@'):
            sid = sid[1:]

        if sid in reads:
            if verbose:
                sys.stderr.write("Skipping: {} --> {}\n".format(sid, label))
            continue

        if verbose:
            sys.stderr.write("Keeping: {} --> {}\n".format(sid, label))
        print("@{}\n{}\n+\n{}".format(label, seq, qual))
def print_locations(fastqf, s, pl):
    """
    Print every position at which s occurs in each read of fastqf.

    Overlapping occurrences are reported. Each output line is tab separated:
    the sequence ID, the 0-based offset of the match and, when pl is true,
    the length of the read.

    :param fastqf: the fastq file to search
    :param s: the string to look for
    :param pl: print the sequence length
    :return: None
    """
    for seqid, _header, seq, _qual in stream_fastq(fastqf):
        hit = seq.find(s)
        while hit > -1:
            print(f"{seqid}\t{hit}\t{len(seq)}" if pl else f"{seqid}\t{hit}")
            # resume one base past this hit so overlapping matches are found
            hit = seq.find(s, hit + 1)
def fq_ids(fnames, verbose=False):
    """
    Map every fastq read id in the given files to the name of its file.

    The stored value is the last path component of the file name. Each full
    id is recorded twice: unchanged, and after clean_newick_id().

    :param fnames: a list of files
    :param verbose: announce on stderr that reading has started
    :return: a dict of ids
    """
    if verbose:
        sys.stderr.write("Reading fastq files\n")

    fqids = {}
    for path in fnames:
        for _, fullid, _, _ in stream_fastq(path):
            origin = path.split(os.path.sep)[-1]
            # keep several versions of the id as phylosift does some munging on them
            fqids[fullid] = origin
            fqids[clean_newick_id(fullid)] = origin
    return fqids
def fq_ids(fnames, verbose=False):
    """
    Map every fastq read id in the given files to the file it was read from.

    Each full id is stored unchanged and again with spaces replaced by '_'.

    :param fnames: a list of files
    :param verbose: announce on stderr that reading has started
    :return: a dict of ids
    """
    if verbose:
        sys.stderr.write("Reading fastq files\n")

    fqids = {}
    for fastq in fnames:
        for _seqid, fullid, _seq, _qual in stream_fastq(fastq):
            # keep several versions of the id as phylosift does some munging on them
            fqids.update({fullid: fastq, fullid.replace(' ', '_'): fastq})
    return fqids
def parse_dir(dir, verbose=False):
    """
    Parse the directory of files

    File names must look like <sample>_Primer<X>_...; the total sequence
    length of each file is added to the entry for its sample and primer.
    Files whose name can't be parsed are reported on stderr and skipped.

    :param dir: the directory of fastq files
    :param verbose: more output (currently unused)
    :return: a dict of sample -> {primer: total bases}. PrimerA, PrimerB and
        PrimerC are always present for a sample; any other primer found in a
        file name is added as it is seen.
    """
    # raw string: \w is a regex class, not a (invalid) string escape
    name_pattern = re.compile(r'^(\w+)_(Primer\w)_')

    lengths = {}
    for f in os.listdir(dir):
        m = name_pattern.search(f)
        if not m:
            sys.stderr.write("Error: can't parse {}\n".format(f))
            continue

        srr, primer = m.groups()
        per_primer = lengths.setdefault(
            srr, {'PrimerA': 0, 'PrimerB': 0, 'PrimerC': 0})
        for seqid, header, seq, qualscores in stream_fastq(os.path.join(dir, f)):
            # .get() so a primer other than A/B/C is counted rather than a KeyError
            per_primer[primer] = per_primer.get(primer, 0) + len(seq)
    return lengths
def read_fastqs(fastqdir, fname, seqids, verbose=True):
    """
    Read the fastq files and store the sequences we want to save

    Every file in fastqdir whose name contains fname is streamed. Reads with
    the ID '@<seqid>.1' fill the left slots and '@<seqid>.2' the right slots
    of the entry for that seqid.

    :param fastqdir: the directory with fastq files
    :param fname: the likely filename
    :param seqids: the seqids we want to save
    :param verbose: more output
    :return : a dict of seqids: left, left qual, right, right qual. Entries
        stay None for reads that were not found.
    """
    seqs = {x: [None, None, None, None] for x in seqids}
    wanted = set()
    for s in seqids:
        wanted.add(f"@{s}.1")
        wanted.add(f"@{s}.2")

    for f in os.listdir(fastqdir):
        if fname not in f:
            continue
        path = os.path.join(fastqdir, f)
        if verbose:
            sys.stderr.write("Reading {}\n".format(path))
        for seqid, header, seq, qualscores in stream_fastq(path):
            if seqid not in wanted:
                continue
            # raw string with an escaped dot: strip only a literal ".<digit>"
            s = re.sub(r'\.\d$', '', seqid)
            s = s.replace('@', '', 1)
            if seqid.endswith('.1'):
                seqs[s][0] = seq
                seqs[s][1] = qualscores
            elif seqid.endswith('.2'):
                seqs[s][2] = seq
                seqs[s][3] = qualscores
            else:
                # defensive only: every ID in wanted ends in .1 or .2
                sys.stderr.write(
                    "ERR: Not sure about sequence ID {}\n".format(seqid))
    return seqs
__maintainer__ = 'Rob Edwards'
__email__ = '*****@*****.**'

if __name__ == '__main__':
    # Split a fastq file into one file per leading tag. A read is assigned
    # to the first tag in `tags` that starts within its first 25 bases; the
    # output file is named after everything up to and including that tag.
    # Reads with no such tag go to none.fastq.
    parser = argparse.ArgumentParser(description=" ")
    parser.add_argument('-f', help='fastq file', required=True)
    parser.add_argument('-o', help='output directory', required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    tags = ['GGTTCACTTGAGACAC', 'CTTGAGACAC']

    os.makedirs(args.o, exist_ok=True)
    outfiles = {'none': open(os.path.join(args.o, "none.fastq"), 'w')}
    try:
        for seqid, header, seq, qual in stream_fastq(args.f):
            for t in tags:
                # find() once instead of `in` plus two index() scans
                posn = seq.find(t)
                if 0 <= posn < 25:
                    tag = seq[0:posn + len(t)]
                    if tag not in outfiles:
                        outfiles[tag] = open(os.path.join(args.o, f"{tag}.fastq"), 'w')
                    outfiles[tag].write(f"@{header}\n{seq}\n+\n{qual}\n")
                    break
            else:
                # no tag matched near the start of the read
                outfiles['none'].write(f"@{header}\n{seq}\n+\n{qual}\n")
    finally:
        # close every output, even if reading or writing failed part way
        for handle in outfiles.values():
            handle.close()
""" if __name__ == "__main__": parser = argparse.ArgumentParser( description='Check paired end files and make sure the pairs match up') parser.add_argument( '-l', help='The file where the reads end /1 (the option is a lowercase L)', required=True) parser.add_argument('-r', help='The file where the reads end /2', required=True) args = parser.parse_args() lseq = {} for (seqid, header, seq, qual) in stream_fastq(args.l): if not seqid.endswith('/1'): sys.stderr.write( "Sequence {} in {} does not appear to be a read /1\n".format( seqid, args.l)) continue seqid = seqid.replace('/1', '') lseq[seqid] = [header, seq, qual] rseq = {} for (seqid, header, seq, qual) in stream_fastq(args.r): if not seqid.endswith('/2'): sys.stderr.write( "Sequence {} in {} does not appear to be a read /2\n".format( seqid, args.l)) continue
""" import os import sys import argparse from roblib import stream_fastq __author__ = 'Rob Edwards' if __name__ == "__main__": parser = argparse.ArgumentParser(description=' ') parser.add_argument('-f', help='fasta file', required=True) args = parser.parse_args() lens = [] for (sid, label, seq, qual) in stream_fastq(args.f): lens.append(len(seq)) lens.sort() length=sum(lens) len_so_far = 0 n50 = None n75 = None for i in lens: len_so_far += i if not n50 and len_so_far >= length * 0.5: n50 = i if not n75 and len_so_far >= length * 0.75: n75 = i
files = [] if args.d: for subdir in args.d: for f in os.listdir(subdir): files.append(os.path.join(subdir, f)) overall = {'number': 0, 'total': 0, 'shortest': 1e6, 'longest': 0} for faf in files: if not os.path.exists(faf): sys.stderr.write( f"{bcolors.RED}FATAL: {faf} not found{bcolors.ENDC}\n") sys.exit(1) lens = [] for (sid, label, seq, qual) in stream_fastq(faf): lens.append(len(seq)) lens.sort() length = sum(lens) len_so_far = 0 n50 = None n75 = None auN = 0 for i in lens: len_so_far += i if not n50 and len_so_far >= length * 0.5: n50 = i if not n75 and len_so_far >= length * 0.75: n75 = i auN += i**2
sys.stderr.write(f"{bcolors.GREEN}Filtering on length{bcolors.ENDC}\n") fqnew = [] for s in fq: if len(s[2]) > length: fqnew.append(s) return fqnew if __name__ == "__main__": parser = argparse.ArgumentParser(description=' ') parser.add_argument('-f', help='fastq file', required=True) parser.add_argument( '-m', help='filter based on sequence length. Supply minimum length', type=int) parser.add_argument('-v', help='verbose output', action='store_true') args = parser.parse_args() # just read the whole file into an array, and then we # can run serial filters fq = [] for seqid, header, seq, scores in stream_fastq(args.f): fq.append([seqid, header, seq, scores]) if args.m: fq = filter_len(fq, args.m, args.v) for s in fq: print(f"@{s[1]}\n{s[2]}\n+\n{s[3]}")
help='base file name. Everything upto the _R1', required=True) parser.add_argument('-q', help='QC dir (default: %(default)s', default='QC') parser.add_argument('-o', help='output directory', required=True) parser.add_argument('-v', help='verbose output', action='store_true') args = parser.parse_args() os.makedirs(args.o, exist_ok=True) dna = {} qual = {} header = {} # initially didn't plan to keep all these :) for seqid, hd, seq, qualscores in stream_fastq(args.f): dna[seqid] = seq.upper() qual[seqid] = qualscores header[seqid] = hd changed = set() deleted = set() for step in range(1, 10): if args.v: message(f"Working on step {step}", "GREEN") fqf = os.path.join(args.q, f"step_{step}", f"{args.n}.s{step}.out.fastq") if not os.path.exists(fqf): message(f"FQ File {fqf} not found", "RED") continue seqs = []
if not args.forward and not args.reverse: message( "Either --forward or --reverse primer must be specified otherwise nothing will be removed" ) sys.exit(-1) fwd = None rev = None if args.forward: fwd = args.forward.upper() if args.reverse: rev = args.reverse.upper() with open(args.o, 'w') as out: for sid, seqid, seq, qual in stream_fastq(args.f): original = [seq, qual] trimmed = False if fwd and fwd in seq.upper(): idx = seq.upper().index(fwd) if idx < args.maxfwd: if idx > 10: message( f"WARNING: Trimming forward primer {fwd} from {sid} starting at position {idx}", "PINK") seq = seq[idx + len(args.forward):] qual = qual[idx + len(args.forward):] trimmed = True else: if args.v: message(