def extract_sequences(fastafile, hits, addhitname=False):
    """
    Extract the sequences from a fasta file

    :param fastafile: The fasta file to get the sequences from
    :type fastafile: str
    :param hits: The dict of hits, keyed by contig. Each value is an iterable
        of tuples indexed as (start, end, reverse-complement flag[, hit name]).
        start:end is used directly as a slice of the contig sequence.
    :type hits: dict
    :param addhitname: append " [hit=<name>]" (name taken from tple[3]) to the ID
    :type addhitname: bool
    :return: A dict of the sequences with contig_start_end as ID (start is
        written as start + 1) and sequence as value
    :rtype: dict
    """
    if not os.path.exists(fastafile):
        sys.exit("{} not found\n".format(fastafile))

    fa = roblib.read_fasta(fastafile)
    sequences = {}
    for contig in hits:
        if contig not in fa:
            sys.stderr.write("WARNING: {} was not found in {}\n".format(
                contig, fastafile))
            # There is nothing to slice for this contig: skip it instead of
            # falling through to fa[contig] and dying with a KeyError.
            continue
        for tple in hits[contig]:
            seq = fa[contig][tple[0]:tple[1]]
            if tple[2]:
                # a true third element means the hit is reverse complemented
                seq = roblib.rc(seq)
            loc = "_".join(map(str, [contig, tple[0] + 1, tple[1]]))
            if addhitname:
                loc += " [hit={}]".format(tple[3])
            sequences[loc] = seq
    return sequences
def read_sequence(conf, verbose=False):
    """
    Load the contigs file for this genome.

    :param conf: the contigs file, passed straight to read_fasta
    :param verbose: when true, announce the file on stderr before reading it
    :return: the result of read_fasta(conf, whole_id=False): a dict of contig/sequence
    """
    if verbose:
        notice = f"{bcolors.GREEN}READING {conf}{bcolors.ENDC}\n"
        sys.stderr.write(notice)

    contigs = read_fasta(conf, whole_id=False)
    return contigs
def write_permutations(faf, outputf, verbose=False):
    """
    Write every pairing of the sequences in a fasta file.

    The IDs are paired up with combinations(ids, 2), and each pair is written
    as two consecutive fasta records (first member, then second member).

    :param faf: fasta file to read (via read_fasta with whole_id=False)
    :param outputf: output file; opened in 'w' mode, so it is overwritten
    :param verbose: accepted but not used by this function
    """
    seqs = read_fasta(faf, whole_id=False)
    seq_ids = list(seqs.keys())

    with open(outputf, 'w') as out:
        for pair in combinations(seq_ids, 2):
            records = "".join(f">{seqid}\n{seqs[seqid]}\n" for seqid in pair)
            out.write(records)
def fasta2ids(faf, verbose=False):
    """
    Extract IDs from a fasta file

    :param faf: fasta file
    :param verbose: more output (reports the file being read on stderr)
    :return: a set of IDs, i.e. the keys of read_fasta(faf, whole_id=False)
    """
    if verbose:
        # Terminate the message with a newline so it does not run into
        # whatever is written to stderr next.
        sys.stderr.write(
            f"{bcolors.GREEN} Reading IDs from fasta file: {faf}{bcolors.ENDC}\n"
        )

    f = read_fasta(faf, whole_id=False)
    return set(f.keys())
from roblib import read_fasta
from random import randint

__author__ = 'Rob Edwards'

# Sanger-style fastq stores a quality score Q as the single character
# chr(Q + 33), so scores must be offset before they are written.
PHRED_OFFSET = 33
# chr(93 + 33) == '~' is the last printable ASCII character
MAX_PHRED = 93

parser = argparse.ArgumentParser(
    description='Convert a fasta file to fastq, faking the qual scores')
parser.add_argument('-f', help='fasta file', required=True)
parser.add_argument('-q', help='fastq output file', required=True)
parser.add_argument('-s', help='quality score. Default = 40', default=40, type=int)
parser.add_argument('-r', help='random quality scores between 5 and 40',
                    action='store_true')
args = parser.parse_args()

# Anything outside this range cannot be written as one printable character
if not 0 <= args.s <= MAX_PHRED:
    parser.error(f"-s must be between 0 and {MAX_PHRED}")

# The character repeated for every base when -r is not given.
# Previously chr(args.s): the default 40 came out as '(' (Q7), not Q40.
c = chr(args.s + PHRED_OFFSET)

fa = read_fasta(args.f)
with open(args.q, 'w') as out:
    for i in fa:
        if args.r:
            # One independent score per base, in the 5..40 range the help
            # text promises; join avoids rebuilding the string per base.
            q = "".join(chr(randint(5, 40) + PHRED_OFFSET)
                        for _ in range(len(fa[i])))
        else:
            q = len(fa[i]) * c
        # four-line fastq record: @id, sequence, +, qualities
        out.write("@{}\n{}\n+\n{}\n".format(i, fa[i], q))
# NOTE(review): this exit is the tail of a statement that begins above this
# excerpt; its real indentation cannot be determined from here.
sys.exit(1)

# Build the list of files to summarise: the -f files plus every entry of
# every -d directory.
# When -f was given, `files` is the same list object as args.f, so the
# appends below extend args.f as well.
if args.f:
    files = args.f
else:
    files = []
if args.d:
    for subdir in args.d:
        for f in os.listdir(subdir):
            files.append(os.path.join(subdir, f))

# Accumulator across all files. Nothing in the visible code updates it;
# presumably it is filled in further down - confirm.
overall = {'number': 0, 'total': 0, 'shortest': 1e6, 'longest': 0}

for faf in files:
    fa = read_fasta(faf)
    # A result holding a single entry whose ID is the empty string is treated
    # as "no sequences"; the script then stops at that file with status 0.
    if len(fa.keys()) == 1 and list(fa.keys())[0] == '':
        sys.stderr.write(f"No sequences found in {faf}\n")
        sys.exit(0)
    # -l: print "<id>\t<length>" for every sequence, followed by a blank line
    if args.l:
        for i in fa:
            print("{}\t{}".format(i, len(fa[i])))
        print()
    # Only sequences strictly longer than -m contribute to the statistics
    lensall = [len(fa[i]) for i in fa]
    lens = list(filter(lambda x: x > args.m, lensall))
    lens.sort()
    length = sum(lens)
import os
import sys
import argparse
from roblib import read_fasta

__author__ = "Rob Edwards"

# Sanger-style fastq stores a quality score Q as the single character
# chr(Q + 33), so the score must be offset before it is written.
PHRED_OFFSET = 33
# chr(93 + 33) == '~' is the last printable ASCII character
MAX_PHRED = 93

parser = argparse.ArgumentParser(description="Convert a fasta file to fastq, faking the qual scores")
parser.add_argument("-f", help="fasta file", required=True)
parser.add_argument("-q", help="fastq output file", required=True)
parser.add_argument("-s", help="quality score. Default = 40", default=40, type=int)
args = parser.parse_args()

# Anything outside this range cannot be written as one printable character
if not 0 <= args.s <= MAX_PHRED:
    parser.error(f"-s must be between 0 and {MAX_PHRED}")

# The character repeated for every base.
# Previously chr(args.s): the default 40 came out as '(' (Q7), not Q40.
c = chr(args.s + PHRED_OFFSET)

fa = read_fasta(args.f)
with open(args.q, "w") as out:
    for i in fa:
        # four-line fastq record: @id, sequence, +, one quality char per base
        out.write("@{}\n{}\n+\n{}\n".format(i, fa[i], len(fa[i]) * c))
import numpy
from roblib import read_fasta

__author__ = 'Rob Edwards'

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=' ')
    parser.add_argument('-d', help='directory of fasta files', required=True, action='append')
    parser.add_argument('-p', help='figure file name for the graph', required=True)
    parser.add_argument('-m', help='minimum length to be included (default = all reads)',
                        default=0, type=int)
    args = parser.parse_args()

    # Sequence lengths per directory, and the longest sequence seen anywhere
    lengths = {}
    maxd = 0
    for d in args.d:
        lengths[d] = []
        for f in os.listdir(d):
            fa = read_fasta(os.path.join(d, f))
            seqlens = [len(fa[x]) for x in fa]
            lengths[d].extend(seqlens)
            # Only the new lengths can raise the maximum, so there is no need
            # to rescan the whole list; default=0 keeps a file without
            # sequences from raising ValueError on an empty max().
            maxd = max(maxd, max(seqlens, default=0))

    # 100 bin edges spread evenly from the minimum length to the longest sequence
    bins = numpy.linspace(args.m, maxd, 100)
    # Make each histogram more transparent the more directories are overlaid
    alpha = 1.0 / len(args.d)

    for d in args.d:
        # only lengths strictly greater than -m are plotted
        data = [x for x in lengths[d] if x > args.m]
        pyplot.hist(data, bins, alpha=alpha, label=d)
    pyplot.legend(loc='upper right')
    pyplot.savefig(args.p)
# counter is incremented before each write, so the first sequence written is
# numbered args.n.
counter = args.n - 1

# At least one source of fasta files (-f or -d) is required
if not args.f and not args.d:
    sys.stderr.write(
        f"{bcolors.RED}FATAL: Please supply either -d or -f options{bcolors.ENDC}\n"
    )
    sys.exit(-1)

# NOTE(review): neither handle is closed within this excerpt - confirm that
# they are closed further down.
idmap = open(args.i, 'w')
out = open(args.o, 'w')

if args.f:
    for f in args.f:
        if args.v:
            sys.stderr.write(f"{bcolors.GREEN}Reading {f}{bcolors.ENDC}\n")
        fa = read_fasta(f)
        for id in fa:
            counter += 1
            # the sequence, written under its new numeric ID
            out.write(">{}\n{}\n".format(counter, fa[id]))
            # id map line: source file, original ID, new ID
            idmap.write("{}\t{}\t{}\n".format(f, id, counter))
            # counter - (args.n - 2) is one more than the number of sequences
            # written so far, so this triggers once args.x have been written.
            # NOTE(review): read here as belonging to the `for id` loop, in
            # which case it leaves only that loop and the next file in args.f
            # is still processed - confirm that is intended.
            if args.x and (counter - (args.n - 2)) > args.x:
                break

if args.d:
    for d in args.d:
        if args.v:
            sys.stderr.write(f"{bcolors.GREEN}Reading {d}{bcolors.ENDC}\n")
        # every directory entry is visited; the handling of each file
        # continues beyond this excerpt
        for f in os.listdir(d):
            if args.v:
                sys.stderr.write(
                    f"{bcolors.BLUE}\tReading {f}{bcolors.ENDC}\n")
parser.add_argument('-v', help='verbose output', action='store_true')
args = parser.parse_args()

# File name suffixes that are accepted as fasta files
endings = {'.fna', '.fasta', '.fa'}

for f in os.listdir(args.d):
    # Skip anything whose name does not end in a known fasta suffix
    if not any(f.endswith(suffix) for suffix in endings):
        if args.v:
            sys.stderr.write(
                f"{bcolors.PINK}Don't think {f} is a fasta file. Skipped\n{bcolors.ENDC}"
            )
        continue

    if args.v:
        sys.stderr.write(f"{bcolors.GREEN}{f}{bcolors.ENDC}\n")

    fa = read_fasta(os.path.join(args.d, f))

    # [length, id, sequence] of the longest sequence; the first one wins a tie
    longest = [0, None, None]
    for seqid, seq in fa.items():
        if len(seq) > longest[0]:
            longest = [len(seq), seqid, seq]

    # Nothing with a non-zero length in this file: report nothing for it
    if longest[0] == 0:
        continue

    print("{}\t{}".format(f, longest[0]))
    if args.f:
        # -f: also append the longest sequence itself to that file
        with open(args.f, 'a') as out:
            out.write(f">{longest[1]} [from {f}]\n{longest[2]}\n")
""" Print the length of the longest contig for each file in a directory of fasta files. """ import os import sys import argparse from roblib import read_fasta __author__ = 'Rob Edwards' if __name__ == "__main__": parser = argparse.ArgumentParser(description='Print the length of the longest contig for each file in a directory of fasta files') parser.add_argument('-d', help='Directory of fasta files', required=True) args = parser.parse_args() for f in os.listdir(args.d): fa = read_fasta(os.path.join(args.d, f)) lengths = [len(fa[x]) for x in fa] lengths.sort() print("{}\t{}".format(f, lengths[-1]))
import os
import sys
import argparse
from roblib import read_fasta, write_fastq, message

__author__ = 'Rob Edwards'
__copyright__ = 'Copyright 2020, Rob Edwards'
__credits__ = ['Rob Edwards']
__license__ = 'MIT'
__maintainer__ = 'Rob Edwards'
__email__ = '*****@*****.**'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=" ")
    parser.add_argument('-f', help='fasta file', required=True)
    parser.add_argument('-q', help='quality file', required=True)
    parser.add_argument('-o', help='output fastq file', required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    # Both inputs are needed, so stop if EITHER is missing. The original
    # `and` only stopped when both were missing. The message is now an
    # f-string so the file names are actually filled in.
    if not os.path.exists(args.f) or not os.path.exists(args.q):
        message(f"FATAL: either {args.f} or {args.q} not found", "RED")
        sys.exit(-1)

    # Both files go through read_fasta; only the third positional argument
    # differs between the sequence file (False) and the quality file (True).
    fa = read_fasta(args.f, True, False)
    qu = read_fasta(args.q, True, True)
    write_fastq(fa, qu, args.o, args.v)