def _is_fasta(path):
    """Return True when the file name carries a FASTA suffix.

    Only whole dot-separated suffixes of the base name are compared
    (case-insensitively), so ``reads.fa.gz`` and ``reads.fasta`` are FASTA
    while ``reads.fastq`` and ``reads.fastq.gz`` are not.  A plain substring
    test for ".fa" cannot tell these apart because ".fa" is a prefix of
    ".fastq".
    """
    suffixes = op.basename(path).lower().split(".")[1:]
    return "fasta" in suffixes or "fa" in suffixes


def main(args):
    """Load the tag sequences and pass every read to ``print_record``.

    Args:
        args: parsed command-line options providing
            ``tags``    - FASTA file of tag sequences,
            ``reads``   - iterable of FASTA/FASTQ read files,
            ``verbose`` - when true, progress messages go to stderr.

    Every file in ``args.reads`` whose name has a ``fa``/``fasta`` suffix is
    parsed with ``read_fasta``; anything else is parsed with ``read_fastq``.
    For each record, ``print_record(tags, read_id, read_seq)`` is called.
    """
    tags = {}
    if args.verbose:
        sys.stderr.write(">> reading in tag sequences...\n")
    with nopen(args.tags) as fasta:
        for name, seq in read_fasta(fasta):
            tags[name] = seq

    # running count of reads across all input files
    processed = 0
    for fx in args.reads:
        if args.verbose:
            sys.stderr.write(">> processing %s...\n" % op.basename(fx))
        with nopen(fx) as handle:
            # process either fasta or fastq; quality strings are not used
            if _is_fasta(fx):
                records = ((f_id, f_seq)
                           for f_id, f_seq in read_fasta(handle))
            else:
                records = ((f_id, f_seq)
                           for f_id, f_seq, _qual in read_fastq(handle))
            for f_id, f_seq in records:
                processed += 1
                if processed % 1000000 == 0 and args.verbose:
                    sys.stderr.write(">> processed %d reads...\n" % processed)
                print_record(tags, f_id, f_seq)
def main(args):
    """Rewrite iSSAKE contigs as a renamed FASTA plus a tab-separated table.

    Args:
        args: parsed command-line options providing
            ``fasta_in``  - iSSAKE FASTA whose headers are "|"-separated,
            ``fasta_out`` - path of the FASTA file to write,
            ``meta``      - path of the metadata table to write.

    The input is read twice: once to total the read counts over all
    contigs, and once to emit each contig with its share of that total.
    """
    # fields from issake, in header order
    header_fields = "contig_id length reads avg_coverage seed v_region j_region".split()
    # the only fields i believe make any sense to keep
    columns = "id v_region j_region length reads avg_coverage percent_of_total sequence".split()

    def parse_header(header):
        # remove some text from iSSAKE output, then split into named fields
        for label in ("size", "cov", "read", "seed:"):
            header = header.replace(label, "")
        return dict(zip(header_fields, header.split("|")))

    # first pass: total reads used in assembly (float so the later
    # division is never truncated)
    total_reads = 0.
    with nopen(args.fasta_in) as fasta:
        for header, _seq in read_fasta(fasta):
            total_reads += int(parse_header(header)['reads'])

    # second pass: write the renamed contigs and their metadata rows
    with nopen(args.fasta_in) as fasta:
        with open(args.fasta_out, 'wb') as fasta_out:
            with open(args.meta, 'wb') as meta:
                # print header
                meta.write("\t".join(columns) + "\n")
                for index, (header, seq) in enumerate(read_fasta(fasta)):
                    record = parse_header(header)
                    # want to shorten the read names
                    record['id'] = "contig_%d" % index
                    share = 100 * (int(record['reads']) / total_reads)
                    record['percent_of_total'] = "%.6g" % share
                    record['sequence'] = seq
                    row = [str(record[column]) for column in columns]
                    meta.write("\t".join(row) + "\n")
                    write_fasta(fasta_out, record['id'], seq.upper())
def _unique_tag(name, seq, sequences, length):
    """Return the 3'-most ``length``-mer of ``seq`` absent from all others.

    Args:
        name: key of ``seq`` in ``sequences``; that entry is not searched.
        seq: sequence to draw candidate tags from.
        sequences: mapping of name -> sequence for every region.
        length: tag length.

    Returns:
        The first candidate, scanning start positions from the 3' end down
        to and including position 0, that occurs in no other sequence, or
        ``None`` when there is none (including when ``seq`` is shorter than
        ``length``).
    """
    others = [other for other_name, other in sequences.items()
              if other_name != name]
    # stop of -1 so that the window starting at position 0 is tried as well
    for start in range(len(seq) - length, -1, -1):
        candidate = seq[start:start + length]
        if all(candidate not in other for other in others):
            return candidate
    return None


def main(args):
    """Print one unique tag per sequence of ``args.fasta`` in FASTA format.

    Args:
        args: parsed command-line options providing
            ``fasta``   - FASTA file of full-length sequences,
            ``length``  - tag length,
            ``verbose`` - when true, summary statistics go to stderr.

    Sequences are upper-cased before comparison.  For each one, the tag
    closest to the 3' end that appears in no other sequence is written to
    stdout as ``>name`` followed by the tag.  Sequences without such a tag
    produce no output.
    """
    full_seqs = {}
    with nopen(args.fasta) as fasta:
        for name, seq in read_fasta(fasta):
            full_seqs[name] = seq.upper()

    # each tcr in original fasta
    tags = {}
    for tcr, seq in full_seqs.items():
        tag = _unique_tag(tcr, seq, full_seqs, args.length)
        if tag is not None:
            tags[tcr] = tag

    # ensure this actually worked: no tag may be assigned to two regions
    assert len(set(tags.values())) == len(tags)

    # print results (single-argument call form prints the same text
    # whether print is a statement or a function)
    for tcr, tag in tags.items():
        print(">%s\n%s" % (tcr, tag))

    # tags found stats
    # NOTE(review): the missing-tag report is kept under --verbose; the
    # flattened original does not show its nesting -- confirm.
    if args.verbose:
        sys.stderr.write("Of %d regions, %d tags were found.\n"
                         % (len(full_seqs), len(tags)))
        missing = set(full_seqs) - set(tags)
        if missing:
            sys.stderr.write("Unable to find a unique tag for:\n")
            sys.stderr.write("\n".join(sorted(missing)) + "\n")
#!/usr/bin/env python
# encoding: utf-8
"""
Parses the read name down to only include the necessary gene label.
"""
import re
import sys
import itertools
from toolshed import nopen
from parsers import read_fasta


def main(args):
    """Print each FASTA record renamed to its gene label.

    Args:
        args: parsed command-line options providing
            ``fasta`` - FASTA file to read,
            ``gene``  - gene name prefix (e.g. TRAJ or TRBV); it is
                        upper-cased before matching.

    For every record, the first run of characters that starts with the
    upper-cased gene name and extends up to the next "|" becomes the new
    name, and the record is printed to stdout with an upper-cased sequence.
    Records whose name does not contain the gene are reported on stderr and
    skipped.
    """
    # rename from imgt; compiled once because it is the same for every record
    label = re.compile(r'(%s[^\|]+)' % args.gene.upper())
    with nopen(args.fasta) as fasta:
        for name, seq in read_fasta(fasta):
            match = label.search(name)
            if match is None:
                sys.stderr.write(">> unable to parse: %s\n>> for gene: %s\n"
                                 % (name, args.gene))
                continue
            print(">%s\n%s" % (match.group(1), seq.upper()))


if __name__ == "__main__":
    import argparse
    p = argparse.ArgumentParser(description=__doc__,
            formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument('fasta')
    req = p.add_argument_group('required arguments')
    req.add_argument('-g', '--gene', required=True,
            help="gene name, eg. TRAJ or TRBV")