def split_by_rank(faf, ranks, outdir, verbose=False):
    """
    Split the fasta file into one file per rank.

    Every sequence ID is searched for a ``TaxID=<digits>`` tag. The taxid that
    is found is looked up in ``ranks`` and the record is written to
    ``<outdir>/<rank>.fasta``. Records whose taxid is not in ``ranks`` go to
    ``root.fasta``; so do records with no TaxID tag at all, which are also
    reported on stderr.

    :param faf: fasta file
    :param ranks: dict of taxid and rank (the taxid key is the string captured from the ID)
    :param outdir: output directory (it is not created here)
    :param verbose: more output
    :return: None
    """
    taxid_re = re.compile(r'TaxID=(\d+)')
    # honour the verbose parameter rather than depending on a global `args`
    if verbose:
        sys.stderr.write(f"{colours.GREEN}Splitting {faf}{colours.ENDC}\n")

    fhs = {}
    try:
        for seqid, seq in stream_fasta(faf):
            rnk = "root"
            m = taxid_re.search(seqid)
            if m:
                # unknown taxids fall back to the "root" file
                rnk = ranks.get(m.group(1), "root")
            else:
                sys.stderr.write(
                    f"{colours.RED}ERROR: No taxonomy in {seqid}{colours.ENDC}\n")
            if rnk not in fhs:
                fhs[rnk] = open(os.path.join(outdir, rnk + ".fasta"), 'w')
            fhs[rnk].write(f">{seqid}\n{seq}\n")
    finally:
        # close whatever was opened, even if reading or writing failed
        for fh in fhs.values():
            fh.close()
def write_fasta_files(faf, odir, bins, maxb, verbose=False):
    """
    Read the sequences from faf and write them into a set of files in odir.

    One file, ``bin_<i>.fna``, is opened for every bin number from 0 to maxb.
    Each sequence whose ID (the text before the first space) is in ``bins``
    is written to the file for its bin. Bin files that received no sequence
    are deleted at the end.

    :param faf: The source fasta file
    :param odir: the output directory (created if it does not exist)
    :param bins: the hash of contigs -> bin (the bin is used as an index, 0..maxb)
    :param maxb: the maximum bin number
    :param verbose: more output
    :return: nada
    """
    os.makedirs(odir, exist_ok=True)

    paths = [os.path.join(odir, f"bin_{i}.fna") for i in range(maxb + 1)]
    outputfiles = []
    written_to = set()
    try:
        for path in paths:
            outputfiles.append(open(path, 'w'))
        for fa, seq in stream_fasta(faf, True):
            faid = fa.split(" ")[0]
            if faid not in bins:
                if verbose:
                    sys.stderr.write(f"Sequence {faid} not found in a bin\n")
                continue
            binno = bins[faid]
            outputfiles[binno].write(">{}\n{}\n".format(fa, seq))
            written_to.add(binno)
    finally:
        # always release the handles, even when reading or writing failed
        for o in outputfiles:
            o.close()

    # remove the bins that nothing was written to
    for i, path in enumerate(paths):
        if i not in written_to:
            os.remove(path)
def parse_contigs(locations, gdir, odir):
    """
    Parse the contigs file and print non-prophage regions

    Contigs that have no entry in ``locations`` are copied unchanged to
    ``<odir>/contigs_no_pp.fasta``. For the other contigs the prophage
    locations are sorted by start and every stretch of sequence that lies
    outside them is written as its own record, named ``<tag>.<n>`` where tag
    is the contig ID up to the first whitespace and n counts from 1.

    :param locations: the locations hash from the phispy directory
        (contig -> iterable of (start, end) pairs)
    :param gdir: the genome directory that contains the contigs file
    :param odir: the output directory
    :return: None
    """
    p = re.compile(r'>?(\S+)')
    with open(os.path.join(odir, "contigs_no_pp.fasta"), 'w') as out:
        for contig, seq in stream_fasta(os.path.join(gdir, "contigs")):
            if contig not in locations:
                out.write(">{}\n{}\n".format(contig, seq))
                continue
            m = p.match(contig)
            tag = m.groups()[0]
            c = 0
            ses = sorted(locations[contig], key=itemgetter(0))
            posn = 0
            for start, end in ses:
                c += 1
                out.write(">{}.{}\n{}\n".format(tag, c, seq[posn:start]))
                posn = end + 1
            # the sequence after the last prophage is a non-prophage region
            # too, so it must not be dropped
            if posn < len(seq):
                c += 1
                out.write(">{}.{}\n{}\n".format(tag, c, seq[posn:]))
def fasta2tax(faf, outdir, trank, tdb, verbose=False):
    """
    Split the fasta file by taxonomy

    Every sequence ID is searched for a ``TaxID=<digits>`` tag. The taxid is
    passed to ``find_rank`` and the record is written to
    ``<outdir>/<result>.fasta``. Sequences without a TaxID tag are reported
    on stderr and are not written anywhere.

    :param faf: fasta file to split
    :param outdir: output directory to write to
    :param trank: taxonomic rank to choose
    :param tdb: The taxonomy database
    :param verbose: more output
    :return: None
    """
    taxid_re = re.compile(r'TaxID=(\d+)')

    fhs = {}
    try:
        for seqid, seq in stream_fasta(faf):
            m = taxid_re.search(seqid)
            if not m:
                sys.stderr.write(
                    f"{colours.RED}ERROR: No taxonomy in {seqid}{colours.ENDC}\n")
                continue
            tid = m.group(1)
            rnk = find_rank(tid, trank, tdb, verbose)
            if rnk not in fhs:
                fhs[rnk] = open(os.path.join(outdir, rnk + ".fasta"), 'w')
            fhs[rnk].write(f">{seqid}\n{seq}\n")
    finally:
        # close whatever was opened, even if reading or writing failed
        for fh in fhs.values():
            fh.close()
def print_trim(faf, start=0, end=None, seqid=None, verbose=False):
    """
    We don't actually trim the sequence, we print the first and last 20bp of
    the trimmed sequence (Sequence 1) and of whatever follows the trim
    (Sequence 2).

    If start is not provided we use the first base. If end is not provided we
    trim to the end, so nothing follows the trim. If a sequence ID is provided
    only the sequences with that ID are reported, otherwise every sequence is
    reported.

    :param faf: fasta file
    :param start: optional start position
    :param end: optional end position
    :param seqid: optional sequence ID to trim (compared to the ID up to the first space)
    :param verbose: more output
    :return: Nothing
    """
    for sid, seq in stream_fasta(faf):
        seqname = sid.split(" ")[0]
        if seqid and seqname != seqid:
            continue
        trimmed_seq = seq[start:end]
        # seq[None:] would be the whole sequence; with no end there is
        # nothing left after the trim
        aftertr_seq = "" if end is None else seq[end:]
        print("Sequence 1: {} ... {}\nSequence 2: {} ... {}".format(
            trimmed_seq[0:20], trimmed_seq[-20:],
            aftertr_seq[0:20], aftertr_seq[-20:]
        ))
def parse_contigs(locations, gdir, odir):
    """
    Parse the contigs file and print non-prophage regions

    Contigs that have no entry in ``locations`` are copied unchanged to
    ``<odir>/contigs_no_pp.fasta``. For the other contigs the prophage
    locations are sorted by start and every stretch of sequence that lies
    outside them is written as its own record, named ``<tag>.<n>`` where tag
    is the contig ID up to the first whitespace and n counts from 1.

    :param locations: the locations hash from the phispy directory
        (contig -> iterable of (start, end) pairs)
    :param gdir: the genome directory that contains the contigs file
    :param odir: the output directory
    :return: None
    """
    p = re.compile(r'>?(\S+)')
    with open(os.path.join(odir, "contigs_no_pp.fasta"), 'w') as out:
        for contig, seq in stream_fasta(os.path.join(gdir, "contigs")):
            if contig not in locations:
                out.write(">{}\n{}\n".format(contig, seq))
                continue
            m = p.match(contig)
            tag = m.groups()[0]
            c = 0
            ses = sorted(locations[contig], key=itemgetter(0))
            posn = 0
            for start, end in ses:
                c += 1
                out.write(">{}.{}\n{}\n".format(tag, c, seq[posn:start]))
                posn = end + 1
            # the sequence after the last prophage is a non-prophage region
            # too, so it must not be dropped
            if posn < len(seq):
                c += 1
                out.write(">{}.{}\n{}\n".format(tag, c, seq[posn:]))
def write_fasta_files(faf, odir, bins, maxb, verbose=False):
    """
    Read the sequences from faf and write them into a set of files in odir.

    One file, ``bin_<i>.fna``, is opened for every bin number from 0 to maxb.
    Each sequence whose ID (the text before the first space) is in ``bins``
    is written to the file for its bin. Bin files that received no sequence
    are deleted at the end.

    :param faf: The source fasta file
    :param odir: the output directory (created if it does not exist)
    :param bins: the hash of contigs -> bin (the bin is used as an index, 0..maxb)
    :param maxb: the maximum bin number
    :param verbose: more output
    :return: nada
    """
    os.makedirs(odir, exist_ok=True)

    paths = [os.path.join(odir, f"bin_{i}.fna") for i in range(maxb + 1)]
    outputfiles = []
    written_to = set()
    try:
        for path in paths:
            outputfiles.append(open(path, 'w'))
        for fa, seq in stream_fasta(faf, True):
            faid = fa.split(" ")[0]
            if faid not in bins:
                if verbose:
                    sys.stderr.write(f"Sequence {faid} not found in a bin\n")
                continue
            binno = bins[faid]
            outputfiles[binno].write(">{}\n{}\n".format(fa, seq))
            written_to.add(binno)
    finally:
        # always release the handles, even when reading or writing failed
        for o in outputfiles:
            o.close()

    # remove the bins that nothing was written to
    for i, path in enumerate(paths):
        if i not in written_to:
            os.remove(path)
def trim_seq(faf, start=0, end=None, seqid=None, verbose=False):
    """
    Trim a sequence from start to end and print it in fasta format.

    If start is not provided we use the first base. If end is not provided we
    trim to the end. If a sequence ID is provided only the sequences with that
    ID are trimmed and printed, otherwise every sequence is trimmed.

    :param faf: fasta file
    :param start: optional start position
    :param end: optional end position
    :param seqid: optional sequence ID to trim (compared to the ID up to the first space)
    :param verbose: more output (reports the 20 bases that follow the trim)
    :return: Nothing
    """
    for sid, seq in stream_fasta(faf):
        seqname = sid.split(" ")[0]
        if seqid and seqname != seqid:
            continue
        print(">{}\n{}".format(sid, seq[start:end]))
        if verbose:
            # end may be None (nothing follows the trim) or negative, so
            # `end + 20` cannot be used as the upper bound of the slice
            next_stretch = "" if end is None else seq[end:][:20]
            sys.stderr.write("Trimmed {}. Next stretch is {}\n".format(
                sid, next_stretch))
def print_trim(faf, start=0, end=None, seqid=None, verbose=False):
    """
    We don't actually trim the sequence, we print the first and last 20bp of
    the trimmed sequence (Sequence 1) and of whatever follows the trim
    (Sequence 2).

    If start is not provided we use the first base. If end is not provided we
    trim to the end, so nothing follows the trim. If a sequence ID is provided
    only the sequences with that ID are reported, otherwise every sequence is
    reported.

    :param faf: fasta file
    :param start: optional start position
    :param end: optional end position
    :param seqid: optional sequence ID to trim (compared to the ID up to the first space)
    :param verbose: more output
    :return: Nothing
    """
    for sid, seq in stream_fasta(faf):
        seqname = sid.split(" ")[0]
        if seqid and seqname != seqid:
            continue
        trimmed_seq = seq[start:end]
        # seq[None:] would be the whole sequence; with no end there is
        # nothing left after the trim
        aftertr_seq = "" if end is None else seq[end:]
        print("Sequence 1: {} ... {}\nSequence 2: {} ... {}".format(
            trimmed_seq[0:20], trimmed_seq[-20:],
            aftertr_seq[0:20], aftertr_seq[-20:]))
def read_fasta(fafile, idmapfile, minlen=100, verbose=False):
    """
    Read a fasta file and return a dict of md5 sum -> protein sequence.

    The md5 is calculated on the upper-cased sequence. For every sequence
    that is kept, a ``md5<tab>sequence id`` line is appended to idmapfile.
    Sequences that contain a ``*`` or are shorter than minlen are skipped.

    :param fafile: fasta file
    :param idmapfile: file that the md5/sequence id pairs are appended to
    :param minlen: minimum protein sequence length (in amino acids) to be included
    :param verbose: more output
    :return: dict of md5 sum and sequence
    """
    if verbose:
        sys.stderr.write(f"{bcolors.GREEN}Reading {fafile}{bcolors.ENDC}\n")

    by_md5 = {}
    with open(idmapfile, 'a') as idout:
        for seqid, protein in stream_fasta(fafile):
            # a stop codon inside the sequence, or a short sequence: ignore it
            if '*' in protein or len(protein) < minlen:
                continue
            digest = hashlib.md5(protein.upper().encode('utf-8')).hexdigest()
            by_md5[digest] = protein
            idout.write(f"{digest}\t{seqid}\n")
    return by_md5
def read_sequence(conf, verbose=False):
    """
    Load every record of the contigs file for this genome into memory.

    :param conf: the contigs file
    :param verbose: more output (not used)
    :return: a dict of contig/sequence
    """
    return {contig: dna for contig, dna in stream_fasta(conf, whole_id=False)}
def coding_versus_noncoding(sample, contigs, orfs, outputfile):
    """
    Count the number of coding vs. noncoding bases

    The summed sequence length of ``orfs`` is reported as the coding length
    and the summed sequence length of ``contigs`` as the total length. A
    single tab-separated line is written to outputfile.

    NOTE(review): if orfs holds protein sequences the "coding bp" value is a
    count of amino acids, not of bases - confirm against the caller.

    :param sample: the sample name, written in the first column
    :param contigs: fasta file of the DNA sequences
    :param orfs: fasta file of the ORF sequences
    :param outputfile: the file to write
    :return: None
    """
    sys.stderr.write(
        f"{bcolors.GREEN}Counting coding vs. non coding bases{bcolors.ENDC}\n")

    # read the DNA sequences
    dnalen = sum(len(seq) for _, seq in stream_fasta(contigs))

    # read the protein sequences
    protlen = sum(len(seq) for _, seq in stream_fasta(orfs))

    # an empty contigs file must not crash with a division by zero after
    # part of the line has already been written
    fraction = protlen / dnalen if dnalen else 0.0

    with open(outputfile, 'w') as out:
        out.write(f"{sample}\tCoding vs non coding\t")
        out.write("[coding bp, total bp, fraction coding]\t")
        out.write(f"{protlen}\t{dnalen}\t{fraction}\n")
def count_len(fastaf, verbose=False):
    """
    Tally how many sequences there are of each length.

    :param fastaf: fasta file
    :param verbose: more output (not used)
    :return: a dict of len:count
    """
    tally = {}
    for _, sequence in stream_fasta(fastaf):
        n = len(sequence)
        if n in tally:
            tally[n] += 1
        else:
            tally[n] = 1
    return tally
def seq_lengths(fafile, verbose=False):
    """
    Work out how long every sequence in a fasta file is.

    :param fafile: the fasta file to read
    :param verbose: more output (not used)
    :return: a dict of sequence id and length
    """
    return {seqid: len(sequence) for seqid, sequence in stream_fasta(fafile)}
def length_filter(f, l, verbose=False):
    """
    Print, in fasta format, the sequences that are at least l long.

    :param f: fasta file
    :param l: minimum sequence length
    :param verbose: more output (not used)
    :return: None
    """
    for header, sequence in stream_fasta(f, True):
        if len(sequence) >= l:
            print(">{}\n{}".format(header, sequence))
def count_len(fastaf, verbose=False):
    """
    Tally how many sequences there are of each length.

    :param fastaf: fasta file
    :param verbose: more output (not used)
    :return: a dict of len:count
    """
    tally = {}
    for _, sequence in stream_fasta(fastaf):
        n = len(sequence)
        if n in tally:
            tally[n] += 1
        else:
            tally[n] = 1
    return tally
def introduce_break(fastaf, lsgenes, ssgenes, numbp, verbose=False):
    """
    Introduce a break into the contigs upstream of the large subunit, but also
    ignore the small subunit

    Every contig is printed in fasta format. When the contig has a large
    subunit gene, the sequence is rotated so that it starts numbp before the
    gene on the + strand, or numbp after the gene on the - strand (in which
    case it is also reverse complemented). If a small subunit gene on the same
    strand lies further out, the break is moved past it. Contigs without a
    large subunit gene are reported on stderr and printed unchanged.

    :param fastaf: fasta file to break
    :param lsgenes: dict of ls genes: contig id -> {'+ve' or '-ve': positions},
        where positions[0] is used as the minimum and positions[1] as the maximum
    :param ssgenes: dict of ss genes, same layout as lsgenes
    :param numbp: how far upstream to break the DNA
    :param verbose: more output
    :return: None
    """
    for seqdef, seq in stream_fasta(fastaf):
        seqid = seqdef.split(" ")[0]
        if verbose:
            sys.stderr.write(f"Looking for {seqid}\n")

        # reset for every contig, so that a contig without a terminase is
        # never rotated with the values left over from the previous contig
        breakat = 0
        dorc = False

        if seqid in lsgenes:
            if '+ve' in lsgenes[seqid]:
                # the terminase is on the + strand so we go n bp less than the minimum
                breakat = lsgenes[seqid]['+ve'][0] - numbp
                # check to see if the small subunit is here too
                if seqid in ssgenes and '+ve' in ssgenes[seqid]:
                    if ssgenes[seqid]['+ve'][0] - numbp < breakat:
                        breakat = ssgenes[seqid]['+ve'][0] - numbp
                        if verbose:
                            sys.stderr.write(
                                "Because we have a small subunit at {} we moved the start\n"
                                .format(ssgenes[seqid]['+ve'][0]))
            elif '-ve' in lsgenes[seqid]:
                dorc = True
                # the terminase is on the - strand so we go n bp more than the maximum
                breakat = lsgenes[seqid]['-ve'][1] + numbp
                # check to see if the small subunit is here too
                if seqid in ssgenes and '-ve' in ssgenes[seqid]:
                    if ssgenes[seqid]['-ve'][1] + numbp > breakat:
                        breakat = ssgenes[seqid]['-ve'][1] + numbp
                        if verbose:
                            sys.stderr.write(
                                "Because we have a small subunit at {} we moved the start\n"
                                .format(ssgenes[seqid]['-ve'][0]))
        else:
            sys.stderr.write("No terminase found in {}\n".format(seqid))

        newseq = seq
        if breakat > 0:
            sys.stderr.write("Reformating the sequence {}\n".format(seqid))
            newseq = seq[breakat:] + seq[0:breakat]
        if dorc:
            newseq = rc(newseq)
        print(">{}\n{}".format(seqdef, newseq))
def get_reps(faf, outf, verbose=False):
    """
    Find the repeats in every sequence and write their coordinates.

    One tab-separated line (first start, first end, second start, second end)
    is written per repeat reported by RobRepeatFinder.

    :param faf: fasta file
    :param outf: output file
    :param verbose: more output (not used)
    :return: None
    """
    with open(outf, 'w') as out:
        for _, sequence in stream_fasta(faf):
            repeats = RobRepeatFinder.repeatFinder(sequence, 3)
            for r in repeats:
                line = f"{r['first_start']}\t{r['first_end']}\t{r['second_start']}\t{r['second_end']}\n"
                out.write(line)
def count_kmers(faf, type, k, jsonout=None, verbose=False):
    """
    Count the kmers

    Every kmer of every sequence is counted on both strands: the sequence
    itself and its reverse complement.

    :param faf: fasta file
    :param type: str either fasta or fastq (anything else gives an empty result)
    :param k: kmer size
    :param jsonout: optional file to write the counts to, as json ``{faf: kmers}``
    :param verbose: more output
    :return: a dict of kmers
    """
    if verbose:
        sys.stderr.write(f"{bcolors.GREEN}Counting kmers (k={k}) in {faf}\n")

    if type == "fasta":
        sequences = (seq for _, seq in stream_fasta(faf))
    elif type == "fastq":
        sequences = (seq for _, _, seq, _ in stream_fastq(faf))
    else:
        sequences = ()

    kmers = {}
    for seq in sequences:
        rcseq = rc(seq)
        # a sequence of length n has n - k + 1 kmers; the previous bound of
        # n - k - 1 silently dropped the last two
        for posn in range(len(seq) - k + 1):
            fwd = seq[posn:posn + k]
            rev = rcseq[posn:posn + k]
            kmers[fwd] = kmers.get(fwd, 0) + 1
            kmers[rev] = kmers.get(rev, 0) + 1

    if jsonout:
        if verbose:
            sys.stderr.write(f"{bcolors.BLUE}\tWriting to {jsonout}\n")
        with open(jsonout, 'w') as out:
            json.dump({faf: kmers}, out)

    if verbose:
        sys.stderr.write(
            f"{bcolors.BLUE}\tDone counting kmers (k={k}) in {faf}\n")

    return kmers
def introduce_break(fastaf, lsgenes, ssgenes, numbp, verbose=False):
    """
    Introduce a break into the contigs upstream of the large subunit, but also
    ignore the small subunit

    Every contig is printed in fasta format. When the contig has a large
    subunit gene, the sequence is rotated so that it starts numbp before the
    gene on the + strand, or numbp after the gene on the - strand (in which
    case it is also reverse complemented). If a small subunit gene on the same
    strand lies further out, the break is moved past it. Contigs without a
    large subunit gene are reported on stderr and printed unchanged.

    :param fastaf: fasta file to break
    :param lsgenes: dict of ls genes: contig id -> {'+ve' or '-ve': positions},
        where positions[0] is used as the minimum and positions[1] as the maximum
    :param ssgenes: dict of ss genes, same layout as lsgenes
    :param numbp: how far upstream to break the DNA
    :param verbose: more output
    :return: None
    """
    for seqdef, seq in stream_fasta(fastaf):
        seqid = seqdef.split(" ")[0]
        if verbose:
            sys.stderr.write(f"Looking for {seqid}\n")

        # reset for every contig, so that a contig without a terminase is
        # never rotated with the values left over from the previous contig
        breakat = 0
        dorc = False

        if seqid in lsgenes:
            if '+ve' in lsgenes[seqid]:
                # the terminase is on the + strand so we go n bp less than the minimum
                breakat = lsgenes[seqid]['+ve'][0] - numbp
                # check to see if the small subunit is here too
                if seqid in ssgenes and '+ve' in ssgenes[seqid]:
                    if ssgenes[seqid]['+ve'][0] - numbp < breakat:
                        breakat = ssgenes[seqid]['+ve'][0] - numbp
                        if verbose:
                            sys.stderr.write("Because we have a small subunit at {} we moved the start\n".format(ssgenes[seqid]['+ve'][0]))
            elif '-ve' in lsgenes[seqid]:
                dorc = True
                # the terminase is on the - strand so we go n bp more than the maximum
                breakat = lsgenes[seqid]['-ve'][1] + numbp
                # check to see if the small subunit is here too
                if seqid in ssgenes and '-ve' in ssgenes[seqid]:
                    if ssgenes[seqid]['-ve'][1] + numbp > breakat:
                        breakat = ssgenes[seqid]['-ve'][1] + numbp
                        if verbose:
                            sys.stderr.write("Because we have a small subunit at {} we moved the start\n".format(ssgenes[seqid]['-ve'][0]))
        else:
            sys.stderr.write("No terminase found in {}\n".format(seqid))

        newseq = seq
        if breakat > 0:
            sys.stderr.write("Reformating the sequence {}\n".format(seqid))
            newseq = seq[breakat:] + seq[0:breakat]
        if dorc:
            newseq = rc(newseq)
        print(">{}\n{}".format(seqdef, newseq))
def parse_contigs(locations, gdir, odir):
    """
    Parse the contigs file and print the prophage regions

    For every contig that has an entry in ``locations``, each prophage
    location (sorted by start) is written to ``<odir>/pp.fasta`` as a record
    with the header ``>ppid contig_start_end`` and the sequence
    ``seq[start:end]``. Contigs without prophages are skipped.

    :param locations: the locations hash from the phispy directory
        (contig -> iterable of (start, end, ppid) tuples)
    :param gdir: the genome directory that contains the contigs file
    :param odir: the output directory
    :return: None
    """
    # the with block closes the file even if reading or writing fails
    with open(os.path.join(odir, "pp.fasta"), 'w') as out:
        for contig, seq in stream_fasta(os.path.join(gdir, "contigs")):
            if contig not in locations:
                continue
            ses = sorted(locations[contig], key=itemgetter(0))
            for start, end, ppid in ses:
                out.write(">{} {}_{}_{}\n{}\n".format(ppid, contig, start, end, seq[start:end]))
def parse_contigs(locations, gdir, odir):
    """
    Parse the contigs file and print the prophage regions

    For every contig that has an entry in ``locations``, each prophage
    location (sorted by start) is written to ``<odir>/pp.fasta`` as a record
    with the header ``>ppid contig_start_end`` and the sequence
    ``seq[start:end]``. Contigs without prophages are skipped.

    :param locations: the locations hash from the phispy directory
        (contig -> iterable of (start, end, ppid) tuples)
    :param gdir: the genome directory that contains the contigs file
    :param odir: the output directory
    :return: None
    """
    # the with block closes the file even if reading or writing fails
    with open(os.path.join(odir, "pp.fasta"), 'w') as out:
        for contig, seq in stream_fasta(os.path.join(gdir, "contigs")):
            if contig not in locations:
                continue
            ses = sorted(locations[contig], key=itemgetter(0))
            for start, end, ppid in ses:
                out.write(">{} {}_{}_{}\n{}\n".format(ppid, contig, start, end, seq[start:end]))
def count_pairwise_no_fn(faf, kmer, verbose=True):
    """
    Count the kmers of every sequence, as a fraction of the sequence length.

    :param faf: fasta file
    :param kmer: kmer size
    :param verbose: more output
    :return: a dict of sequence id -> dict of kmer -> count / sequence length
    """
    if verbose:
        sys.stderr.write(f"{bcolors.GREEN}Reading {faf}{bcolors.ENDC}\n")

    count = {}
    for sid, seq in stream_fasta(faf, whole_id=True):
        tally = {}
        count[sid] = tally
        for offset in range(len(seq) - (kmer - 1)):
            word = seq[offset:offset + kmer]
            tally[word] = tally.get(word, 0) + 1
        # normalize by protein length
        seqlen = len(seq)
        for word in tally:
            tally[word] /= seqlen
    return count
def split_contigs(inf, outf, length, minlen, verbose=False):
    """
    Split the contigs

    Every sequence is cut into consecutive pieces of ``length`` bases. The
    pieces are named ``<id>_<n>`` (n counts from 1 within the sequence) and
    keep the description of the original sequence, if it has one. The last
    piece is whatever remains, and is only written if it is at least minlen
    long.

    :param inf: input fasta file
    :param outf: output fasta file
    :param length: length to split into
    :param minlen: minimum length that a contig must be to be printed
    :param verbose: more output
    :return: None
    :raises ValueError: if length is not positive (the split would never end)
    """
    if length <= 0:
        raise ValueError("length must be a positive number")

    with open(outf, 'w') as out:
        for seqid, seq in stream_fasta(inf, True):
            if verbose:
                sys.stderr.write("{}\n".format(seqid))
            # the description is everything after the first space; only add
            # it (and the space before it) when there is one
            name, _, description = seqid.partition(" ")
            posn = 0
            seqcounter = 0
            while posn < len(seq) - length:
                seqcounter += 1
                if description:
                    out.write(">{}_{} {}\n".format(name, seqcounter, description))
                else:
                    out.write(">{}_{}\n".format(name, seqcounter))
                out.write("{}\n".format(seq[posn:posn + length]))
                posn += length
            seqcounter += 1
            if len(seq) - posn < minlen:
                continue
            if description:
                out.write(">{}_{} {}\n".format(name, seqcounter, description))
            else:
                out.write(">{}_{}\n".format(name, seqcounter))
            out.write("{}\n".format(seq[posn:]))
def count_pairwise(faf, kmer, verbose=True):
    """
    Count all pairwise amino acids

    The full sequence header is split at the first space into the sequence id
    and the function. If the function contains a ``[`` (the organism name),
    it is cut there and the character before the ``[`` is dropped as well.
    A header with no space has an empty function.

    :param faf: fasta file
    :param kmer: kmer size
    :param verbose: more output
    :return: a tuple of (dict of sequence id -> dict of kmer -> count /
        sequence length, dict of sequence id -> function)
    """
    if verbose:
        sys.stderr.write(f"{bcolors.GREEN}Reading {faf}{bcolors.ENDC}\n")

    count = {}
    fns = {}
    for sidf, seq in stream_fasta(faf, whole_id=True):
        # split the sequence id into sequence and function
        # may need to provide an alternate way to do this
        # note we also strip out [organism name]
        sid, _, description = sidf.partition(" ")
        bracket = description.find("[")
        if bracket == -1:
            fns[sid] = description
        else:
            # drop the "[" and the single character (the space) before it
            fns[sid] = description[:max(bracket - 1, 0)]

        count[sid] = {}
        for posn in range(len(seq) - (kmer - 1)):
            word = seq[posn:posn + kmer]
            count[sid][word] = count[sid].get(word, 0) + 1
        # normalize by protein length
        for aa in count[sid]:
            count[sid][aa] /= len(seq)

    if verbose:
        sys.stderr.write(f"{bcolors.GREEN}Read {faf}{bcolors.ENDC}\n")
    return count, fns
def trim_seq(faf, start=0, end=None, seqid=None, verbose=False):
    """
    Trim a sequence from start to end and print it in fasta format.

    If start is not provided we use the first base. If end is not provided we
    trim to the end. If a sequence ID is provided only the sequences with that
    ID are trimmed and printed, otherwise every sequence is trimmed.

    :param faf: fasta file
    :param start: optional start position
    :param end: optional end position
    :param seqid: optional sequence ID to trim (compared to the ID up to the first space)
    :param verbose: more output (reports the 20 bases that follow the trim)
    :return: Nothing
    """
    for sid, seq in stream_fasta(faf):
        seqname = sid.split(" ")[0]
        if seqid and seqname != seqid:
            continue
        print(">{}\n{}".format(sid, seq[start:end]))
        if verbose:
            # end may be None (nothing follows the trim) or negative, so
            # `end + 20` cannot be used as the upper bound of the slice
            next_stretch = "" if end is None else seq[end:][:20]
            sys.stderr.write("Trimmed {}. Next stretch is {}\n".format(sid, next_stretch))
# read the directories for phispydir in os.listdir(args.p): if not os.path.exists(os.path.join(args.s, phispydir)): if args.v: sys.stderr.write( 'A seed directory matching the phispy directory {} was not found. Skipped\n' .format(phispydir)) continue # read the phages phageseqs = [] for phagefile in [ x for x in os.listdir(os.path.join(args.p, phispydir)) if x.endswith('.fasta') ]: for pid, phagecontig in roblib.stream_fasta( os.path.join(args.p, phispydir, phagefile)): phageseqs.append(phagecontig) # sort the phages longest to smallest phageseqs = sorted(phageseqs, key=len, reverse=True) # now read the sequences and split out on phageseqs if not os.path.exists(os.path.join(args.s, phispydir, "contigs")): sys.stderr.write( "Error: no contigs file was found at {}. Skipped\n".format( os.path.join(args.s, phispydir, "contigs"))) os.mkdir(os.path.join(args.o, phispydir)) out = open(os.path.join(args.o, phispydir, "contigs"), 'w') for gid, genomecontig in roblib.stream_fasta(
def count_kmers(dir, kmersize, outdir, verbose=False):
    """
    Count the kmers in all the fasta files and write the sequences to outdir,
    reverse complementing those that appear to be on the opposite strand.

    For every sequence in the ``.fasta``, ``.fna`` and ``.fa`` files in dir
    we count each possible kmer and each kmer's reverse complement. Every
    pair of sequences is then scored -1 (same orientation), 1 (needs reverse
    complementing) or 0 (no relationship), and sequences are flipped one at a
    time, for a fixed number of rounds, to lower the total score. Sequences
    that were flipped an odd number of times are written reverse complemented
    with ``_rc`` appended to their ID.

    :param dir: the directory
    :param kmersize: the kmer size
    :param outdir: the output directory (created if it does not exist)
    :param verbose: more output
    :return: None
    """
    bases = {'A', 'C', 'T', 'G'}
    kmers = ["".join(s) for s in product(bases, repeat=kmersize)]
    # the reverse complements are the same for every sequence
    rckmers = [rc(k) for k in kmers]

    # the dots are escaped so that only real file extensions match
    fare = re.compile(r'\.(fasta|fna|fa)$')
    fastafiles = [f for f in os.listdir(dir) if fare.search(f)]

    fcount = {}
    rcount = {}
    for f in fastafiles:
        # honour the verbose parameter rather than depending on a global `args`
        if verbose:
            sys.stderr.write(f"{bcolors.GREEN}Reading {f}{bcolors.ENDC}\n")
        for seqid, seq in stream_fasta(os.path.join(dir, f)):
            fcount[seqid] = [seq.count(k) for k in kmers]
            rcount[seqid] = [seq.count(k) for k in rckmers]

    # now we have a hash with all the fwd counts and fwd->rev counts. Which is closer
    #
    # deltad is the ratio between the distance to the forward counts and the
    # distance to the reverse counts.
    #   When deltad == 0, the sequences are the same.
    #   When deltad is 1 the sequences are random
    #   When deltad is very high (>2 but sometimes much bigger) the sequences
    #   are the reverse complement
    # We set:
    #   -1 -- they are the same
    #    0 -- no relationship
    #    1 -- need reverse complementing
    dists = {}
    for s in fcount:
        dists[s] = {}
        for t in fcount:
            distf = distance.euclidean(fcount[s], fcount[t])
            distr = distance.euclidean(fcount[s], rcount[t])
            deltad = distf / (distr + 0.000001)  # add epsilon in case distr == 0
            if deltad < 0.5:
                dists[s][t] = -1
            elif deltad > 2:
                dists[s][t] = 1
            else:
                dists[s][t] = 0

    # now we just need to figure out which ones minimize the score by reverse
    # complementing them. We flip the sequence with the highest row sum, for
    # a fixed number of rounds, and never flip the same sequence twice in a
    # row if there is another candidate.
    df = pd.DataFrame.from_dict(dists)
    seen = {}
    maxiters = 100
    lastflip = None
    if not df.empty:
        for _ in range(maxiters):
            # what is the top one to change
            idx = df.sum(axis=1).sort_values(ascending=False).head(2).index
            nm = idx[0]
            # with a single sequence there is no second candidate
            if nm == lastflip and len(idx) > 1:
                nm = idx[1]
            lastflip = nm
            seen[nm] = seen.get(nm, 0) + 1
            df.loc[nm] *= -1
            df[nm] *= -1

    # finally, we need to know which sequences to reverse complement.
    # Only those where seen is an odd number!
    torc = {s for s in seen if seen[s] % 2}

    if verbose:
        sys.stderr.write(
            f"{bcolors.BLUE}Will reverse complement: {torc}{bcolors.ENDC}\n")

    # and now we read all the fasta files again and put them in output directory
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    for f in fastafiles:
        if verbose:
            sys.stderr.write(
                f"{bcolors.GREEN}Re-reading to check for rc {f}{bcolors.ENDC}\n"
            )
        with open(os.path.join(outdir, f), 'w') as out:
            for seqid, seq in stream_fasta(os.path.join(dir, f)):
                if seqid in torc:
                    out.write(f">{seqid}_rc\n{rc(seq)}\n")
                else:
                    out.write(f">{seqid}\n{seq}\n")
"""
Print the IDs and lengths of sequences in a fasta file
"""

import os
import sys
import argparse
from roblib import stream_fasta

# command line: -f is the fasta file, -w keeps the whole header as the ID
parser = argparse.ArgumentParser(description="Print the lengths of sequences in a fasta file")
parser.add_argument('-f', help='fasta file', required=True)
parser.add_argument('-w', help='whole sequence ID. Default is to use ID upto whitespace', action="store_true", default=False)
parser.add_argument('-v', help='verbose output', action="store_true")
args = parser.parse_args()

# one tab-separated "id, length" line per sequence
for seqid, seq in stream_fasta(args.f, args.w):
    seqlen = len(seq)
    print("{}\t{}".format(seqid, seqlen))
""" import os import sys import argparse from roblib import stream_fasta, colours __author__ = 'Rob Edwards' __copyright__ = 'Copyright 2020, Rob Edwards' __credits__ = ['Rob Edwards'] __license__ = 'MIT' __maintainer__ = 'Rob Edwards' __email__ = '*****@*****.**' if __name__ == '__main__': parser = argparse.ArgumentParser(description=" ") parser.add_argument('-f', help='fasta file', required=True) parser.add_argument('-d', help='output directory', required=True) parser.add_argument('-v', help='verbose output', action='store_true') args = parser.parse_args() if not os.path.exists(args.d): os.makedirs(args.d, exist_ok=True) for seqid, seq in stream_fasta(args.f, True): sname = seqid.split(" ")[0] if args.v: sys.stderr.write(f"{colours.GREEN}Writing {sname}{colours.ENDC}\n") with open(os.path.join(args.d, f"{sname}.fasta"), 'w') as out: out.write(f">{seqid}\n{seq}\n")
parser.add_argument('-f', help='fasta DNA sequence file', required=True) parser.add_argument('-o', help='ORFs file', required=True) parser.add_argument( '-s', help= 'sources file that has the source of the ORF calls (default = sources.txt)', default="sources.txt") parser.add_argument('-v', help='verbose output', action='store_true') args = parser.parse_args() sources = read_sources(args.s) # generate the six frame translations seqs = {} lengths = {} for seqid, seq in stream_fasta(args.f, False): seq = seq.upper() lengths[seqid] = len(seq) seqs[seqid] = {} seqs[seqid]["f1"] = translate_dna(seq, args.v) seqs[seqid]["f2"] = translate_dna(seq[1:], args.v) seqs[seqid]["f3"] = translate_dna(seq[2:], args.v) rcseq = rc(seq) seqs[seqid]["r1"] = translate_dna(rcseq, args.v) seqs[seqid]["r2"] = translate_dna(rcseq[1:], args.v) seqs[seqid]["r3"] = translate_dna(rcseq[2:], args.v) for orfid, orf in stream_fasta(args.o): for s in seqs: for fr in ["f1", "f2", "f3", "r1", "r2", "r3"]: if orf in seqs[s][fr]:
NOTE: This is faster (not really fast), but only allows you to run one fasta """ import os import sys import argparse import hashlib from roblib import stream_fasta, bcolors if __name__ == '__main__': parser = argparse.ArgumentParser(description=" ") parser.add_argument('-f', help='fasta file') parser.add_argument('-i', help='id map file to write', required=True) parser.add_argument('-o', help='output file', required=True) args = parser.parse_args() md5seen = set() idseen = set() with open(args.o, 'w') as out, open(args.i, 'w') as idout: for seqid, seq in stream_fasta(args.f): if seqid in idseen: continue idseen.add(seqid) md5 = hashlib.md5(seq.upper().encode('utf-8')).hexdigest() idout.write(f"{md5}\t{seqid}\n") if md5 not in md5seen: out.write(f">{md5}\n{seq}\n") md5seen.add(md5)
if not os.path.exists(args.o): os.mkdir(args.o) # read the directories for phispydir in os.listdir(args.p): if not os.path.exists(os.path.join(args.s, phispydir)): if args.v: sys.stderr.write( 'A seed directory matching the phispy directory {} was not found. Skipped\n'.format(phispydir)) continue # read the phages phageseqs = [] for phagefile in [x for x in os.listdir(os.path.join(args.p, phispydir)) if x.endswith('.fasta')]: for pid, phagecontig in roblib.stream_fasta(os.path.join(args.p, phispydir, phagefile)): phageseqs.append(phagecontig) # sort the phages longest to smallest phageseqs = sorted(phageseqs, key=len, reverse=True) # now read the sequences and split out on phageseqs if not os.path.exists(os.path.join(args.s, phispydir, "contigs")): sys.stderr.write( "Error: no contigs file was found at {}. Skipped\n".format(os.path.join(args.s, phispydir, "contigs"))) os.mkdir(os.path.join(args.o, phispydir)) out = open(os.path.join(args.o, phispydir, "contigs"), 'w') for gid, genomecontig in roblib.stream_fasta(os.path.join(args.s, phispydir, "contigs")): contigcount = 0