예제 #1
0
def _is_fasta(path):
    """Return True when the file name of *path* has a ``fa``/``fasta`` extension.

    Whole dot-separated extension tokens of the basename are compared, so
    ``reads.fa.gz`` counts as FASTA while ``reads.fastq`` (which merely
    contains the substring ``.fa``) does not. Directory names are ignored.
    """
    extensions = op.basename(path).split(".")[1:]
    return "fa" in extensions or "fasta" in extensions


def _iter_reads(path):
    """Yield ``(read_id, sequence)`` pairs from the FASTA or FASTQ file at *path*."""
    with nopen(path) as handle:
        if _is_fasta(path):
            for read_id, seq in read_fasta(handle):
                yield read_id, seq
        else:
            # everything that is not FASTA is parsed as FASTQ; the quality
            # string is not needed downstream
            for read_id, seq, _qual in read_fastq(handle):
                yield read_id, seq


def main(args):
    """Load the tag sequences and hand every read to ``print_record``.

    Args:
        args: parsed options providing ``tags`` (FASTA file of tag
            sequences), ``reads`` (iterable of FASTA/FASTQ paths) and
            ``verbose`` (emit progress messages on stderr).
    """
    if args.verbose:
        sys.stderr.write(">> reading in tag sequences...\n")
    tags = {}
    with nopen(args.tags) as fasta:
        for name, seq in read_fasta(fasta):
            tags[name] = seq

    # running count across all input files, used only for progress output
    processed = 0
    for fx in args.reads:
        if args.verbose:
            sys.stderr.write(">> processing %s...\n" % op.basename(fx))
        for read_id, seq in _iter_reads(fx):
            processed += 1
            if args.verbose and processed % 1000000 == 0:
                sys.stderr.write(">> processed %d reads...\n" % processed)
            print_record(tags, read_id, seq)
예제 #2
0
def main(args):
    """Rewrite iSSAKE contigs as a renamed FASTA plus a tab-delimited summary.

    The input is read twice: once to total the reads over all contigs, and
    once to emit, per contig, a metadata row (including its percentage of
    that total) and an upper-cased FASTA record named ``contig_<index>``.

    Args:
        args: options providing ``fasta_in`` (iSSAKE FASTA to read),
            ``fasta_out`` (renamed FASTA to write) and ``meta`` (summary
            table to write).
    """
    # values carried, pipe-delimited, in each iSSAKE record name
    issake_fields = "contig_id length reads avg_coverage seed v_region j_region".split()
    # columns written to the summary table, in output order
    columns = "id v_region j_region length reads avg_coverage percent_of_total sequence".split()

    def describe(header):
        # strip the iSSAKE labels, then pair what is left with the field names
        for label in ("size", "cov", "read", "seed:"):
            header = header.replace(label, "")
        return dict(zip(issake_fields, header.split("|")))

    # first pass: total reads used in the assembly (kept as a float so the
    # percentage below is a true division)
    read_total = 0.
    with nopen(args.fasta_in) as fasta:
        for header, _ in read_fasta(fasta):
            read_total += int(describe(header)['reads'])

    # second pass: write the table header, then one row and one record per contig
    with nopen(args.fasta_in) as fasta:
        with open(args.fasta_out, 'wb') as fasta_out:
            with open(args.meta, 'wb') as meta:
                meta.write("\t".join(columns) + "\n")
                for index, (header, seq) in enumerate(read_fasta(fasta)):
                    record = describe(header)
                    # shortened, sequential read name
                    record['id'] = "contig_%d" % index
                    share = 100 * (int(record['reads']) / read_total)
                    record['percent_of_total'] = "%.6g" % share
                    record['sequence'] = seq
                    row = [str(record[column]) for column in columns]
                    meta.write("\t".join(row) + "\n")
                    write_fasta(fasta_out, record['id'], seq.upper())
예제 #3
0
def _unique_tag(tcr, seq, full_seqs, length):
    """Return the 3'-most window of *seq* that occurs in no other sequence.

    Args:
        tcr: name of the sequence being tagged (excluded from the comparison).
        seq: the sequence to take windows from.
        full_seqs: mapping of name -> sequence for every region.
        length: window (tag) length.

    Returns:
        The first qualifying window when scanning from the 3' end down to
        offset 0, or None when there is none (which includes sequences
        shorter than *length*).
    """
    others = [other for name, other in full_seqs.items() if name != tcr]
    # stop value -1 so the window starting at offset 0 is tried as well
    for start in range(len(seq) - length, -1, -1):
        tag = seq[start:start + length]
        # all() stops at the first sequence that contains the tag
        if all(tag not in other for other in others):
            return tag
    return None


def main(args):
    """Print, in FASTA format, one unique tag per sequence of the input FASTA.

    Sequences are upper-cased before comparison. With ``args.verbose`` a
    summary and the names of sequences left without a tag go to stderr.

    Args:
        args: options providing ``fasta`` (input file), ``length`` (tag
            length) and ``verbose``.
    """
    with nopen(args.fasta) as fasta:
        full_seqs = {name: seq.upper() for name, seq in read_fasta(fasta)}

    tags = {}
    for tcr, seq in full_seqs.items():
        tag = _unique_tag(tcr, seq, full_seqs, args.length)
        if tag is not None:
            tags[tcr] = tag

    # ensure this actually worked: no two regions may share a tag
    tagset = set(tags.values())
    assert len(tags) == len(tagset)

    # print results
    for tcr, tag in tags.items():
        print(">%s\n%s" % (tcr, tag))

    # tags found stats
    if args.verbose:
        sys.stderr.write("Of %d regions, %d tags were found.\n"
                         % (len(full_seqs), len(tagset)))
        untagged = set(full_seqs) - set(tags)
        if untagged:
            sys.stderr.write("Unable to find a unique tag for:\n")
            sys.stderr.write("\n".join(untagged) + "\n")
예제 #4
0
#!/usr/bin/env python
# encoding: utf-8
"""
Parses the read name down to only include the necessary gene label.
"""
import re
import sys
import itertools
from toolshed import nopen
from parsers import read_fasta

def main(args):
    """Print each FASTA record renamed to its bare gene label.

    The label is the first run in the record name that starts with the
    upper-cased ``args.gene`` and extends up to the next ``|``. Records
    whose name contains no such run are reported on stderr and skipped.
    Sequences are printed upper-cased.

    Args:
        args: options providing ``fasta`` (input file) and ``gene``
            (gene name prefix, e.g. TRAJ or TRBV).
    """
    # the pattern depends only on the requested gene, so build it once
    label = re.compile(r'(%s[^\|]+)' % args.gene.upper())
    with nopen(args.fasta) as fasta:
        for name, seq in read_fasta(fasta):
            # rename from imgt
            match = label.search(name)
            if match is None:
                sys.stderr.write(">> unable to parse: %s\n>> for gene: %s\n"
                                 % (name, args.gene))
                continue
            print(">%s\n%s" % (match.group(1), seq.upper()))

if __name__ == "__main__":
    import argparse
    p = argparse.ArgumentParser(description=__doc__,
            formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument('fasta')
    req = p.add_argument_group('required arguments')
    req.add_argument('-g', '--gene', required=True,
            help="gene name, eg. TRAJ or TRBV")