示例#1
0
文件: relabel.py 项目: trmznt/seqpy
def main(args):
    """Concatenate the input files and relabel every sequence numerically.

    Each file in ``args.files`` is read with ``bioio.load`` (using
    ``args.io_opts`` or an empty option list) and merged in place with ``+=``.
    Every sequence then receives a zero-padded, 1-based running number as its
    new label.  When ``args.outfile`` is set the relabelled container is
    saved; when ``args.tabfile`` is set a two-column, tab-separated mapping
    ``new label -> old label`` is written to it.
    """
    merged = None

    for path in args.files:
        loaded = bioio.load(path, options=args.io_opts or [])
        cout('reading %d sequences from %s' % (len(loaded), path))
        if merged is None:
            merged = loaded
        else:
            merged += loaded

    # (new label, previous label) pairs, in container order
    label_map = []
    for number, seq in enumerate(merged, 1):
        renamed = '%04d' % number
        label_map.append((renamed, seq.label))
        seq.label = renamed

    if args.outfile:
        bioio.save(merged, args.outfile, options=args.io_opts or [])

    if args.tabfile:
        with open(args.tabfile, 'w') as out:
            out.writelines('%s\t%s\n' % pair for pair in label_map)
示例#2
0
文件: gb2table.py 项目: trmznt/seqpy
def main(args):
    """Write a metadata table and a renumbered sequence file for the inputs.

    Every file in ``args.files`` is loaded with ``bioio.load`` and sorted by
    label.  Each sequence gets a running number ``n`` (starting at 1, shared
    across all input files); one table row is collected per sequence and a
    new ``bioio.biosequence`` labelled ``'%04d' % n`` holding the upper-cased
    sequence is appended to the output container.

    The table goes to ``args.tabfile`` (tab-separated, with a header line)
    and the renumbered sequences go to ``args.outfile``.
    """
    tables = []
    container = bioio.multisequence()

    n = 1
    for infile in args.files:

        mseqs = bioio.load(infile)
        mseqs.sort(lambda x: x.label)

        for s in mseqs:
            tables.append((
                n,
                s.label,
                s.attr.get('collection_date', ''),
                s.attr.get('country', ''),
                s.attr.get('isolate', ''),
                s.definition,
            ))
            container.append(bioio.biosequence('%04d' % n, s.seq.upper()))
            n += 1

    # Rows are appended with a strictly increasing ``n`` as first element, so
    # ``tables`` is already in output order and needs no extra sort.
    # The context manager guarantees the file is closed even if a write fails.
    with open(args.tabfile, 'w') as tabfile:
        tabfile.write('LABEL\tACCNO\tDATE\tCOUNTRY\tISOLATE\tDEFINITION\n')
        for r in tables:
            tabfile.write('%04d\t%s\t%s\t%s\t%s\t%s\n' % r)

    bioio.save(container, args.outfile)
示例#3
0
def main(args):
    """Translate the sequences of all input files and save the result.

    If ``args.start_sequence`` is given it is upper-cased, ASCII-encoded and
    stored back on ``args``.  Each sequence is then degapped and upper-cased,
    the pattern is located with ``funcs.search_restriction_site``, and
    translation starts one past the first coordinate of the single hit.
    Sequences with zero or several hits are dropped.  Without a start
    sequence, translation starts at ``args.start_codon``.
    """
    aaseqs = bioio.multisequence()

    if args.start_sequence:
        args.start_sequence = args.start_sequence.upper().encode('ASCII')
    start_pattern = args.start_sequence

    for infile in args.files:

        mseq = bioio.load(infile, options=args.io_opts)
        cout('reading %d sequences from %s' % (len(mseq), infile))

        for seq in mseq:
            aaseq = seq.clone()

            if not start_pattern:
                # plain translation from the configured start codon
                aaseq.set_sequence(
                    funcs.translated(seq, start_pos=args.start_codon))
                aaseqs.append(aaseq)
                continue

            # the restriction-site search is reused to locate the pattern
            target = funcs.uppercased(funcs.degapped(seq))
            hits = funcs.search_restriction_site(target, start_pattern)
            if len(hits) != 1:
                continue

            offset = hits[0][0]
            print(target[offset:offset + 30])
            aaseq.set_sequence(funcs.translated(target, start_pos=offset + 1))
            aaseqs.append(aaseq)

    bioio.save(aaseqs, args.outfile)
示例#4
0
文件: condense.py 项目: trmznt/seqpy
def main(args):
    """Condense the sequences of ``args.infile`` and save them.

    The input is loaded with ``bioio.load`` (options from ``args.io_opts``),
    passed through ``funcs.condensed`` and written to ``args.outfile``.  When
    ``args.report`` is set, the condensed container is also handed to
    ``write_report`` together with that value.
    """
    source = bioio.load(args.infile, options=args.io_opts)
    cout('reading %d sequences from %s' % (len(source), args.infile))

    condensed = funcs.condensed(source)
    bioio.save(condensed, args.outfile)

    if not args.report:
        return
    write_report(condensed, args.report)
示例#5
0
def vcf2seq(args):
    """Convert a VCF file to a multisequence and save it.

    A ``VCF2SeqHelper`` is built from ``args.vcffile``, ``args.chr`` and an
    option string made of fixed defaults followed by ``args.opts``.  After
    parsing, the helper's ``chr_used`` items are reported through ``cout``
    and the resulting multisequence is written to ``args.outfile``.
    """
    # fixed option prefix; user-supplied options are appended after the comma
    base_opts = 'NoIndel,LowQual,MissingThreshold=0.05,HetThreshold=0.25,'

    helper = VCF2SeqHelper(args.vcffile, args.chr, base_opts + args.opts)
    helper.parse()
    sequences = helper.get_multisequence()

    cout('Report:')
    for name, count in helper.chr_used.items():
        cout(' %s\t%d' % (name, count))

    cout('Writing to %s' % args.outfile)
    bioio.save(sequences, args.outfile)
示例#6
0
文件: readseq.py 项目: trmznt/seqpy
def main(args):
    """Read sequence files, optionally filter, sort and summarise, then save.

    Steps, in order:

    1. Load every file in ``args.files`` and concatenate the results.
    2. Pass the container with ``args.src``, ``args.src_isolate`` and
       ``args.definition`` to ``append_attributes``.
    3. If ``args.accno`` is set, call ``set_label_to_accno``.
    4. If ``args.degap`` is set, call ``container.degap()``.
    5. If any of ``args.minlen``, ``args.maxlen`` or ``args.maxN`` is
       positive, keep only sequences that pass the active filters.
       ``maxN`` is the maximum allowed fraction of ``b'N'`` in a sequence.
    6. If ``args.sort`` starts with ``'len'`` sort by length, longest first;
       if it starts with ``'lab'`` sort by label.
    7. If ``args.summary`` is set, print per-sequence A/C/G/T/gap counts.
    8. If ``args.outfile`` is set, save the container.
    """
    container = None

    for infile in args.files:

        obj = bioio.load(infile, options=args.io_opts or [])
        cout('reading %d sequences from %s' % (len(obj), infile))
        if container is None:
            container = obj
        else:
            container += obj

    append_attributes(container, args.src, args.src_isolate, args.definition)

    if args.accno:
        set_label_to_accno(container)

    if args.degap:
        container.degap()

    if args.minlen > 0 or args.maxlen > 0 or args.maxN > 0:
        new_container = bioio.multisequence()
        for s in container:
            if args.minlen > 0 and len(s) < args.minlen:
                continue
            if args.maxlen > 0 and len(s) > args.maxlen:
                continue
            # Guard against zero-length sequences: an empty sequence has no
            # N at all, so it passes this filter instead of raising
            # ZeroDivisionError.
            if (args.maxN > 0 and len(s) > 0
                    and s.seq.count(b'N') / len(s) > args.maxN):
                continue
            new_container.append(s)

        container = new_container

    if args.sort:
        if args.sort.startswith('len'):
            container.sort(lambda x: len(x), reverse=True)
        elif args.sort.startswith('lab'):
            container.sort(lambda x: x.label)

    if args.summary:
        for s in container:
            seq = s.seq.upper()
            print(">%s\nA:%d\tC:%d\tG:%d\tT:%d\t-:%d" %
                  (s.label.decode('ASCII'), seq.count(b'A'), seq.count(b'C'),
                   seq.count(b'G'), seq.count(b'T'), seq.count(b'-')))

    if args.outfile:
        bioio.save(container, args.outfile, options=args.io_opts or [])
示例#7
0
文件: translate.py 项目: afifai/seqpy
def main(args):
    """Translate every sequence of the input files and save the result.

    Each file in ``args.files`` is loaded with ``bioio.load``.  Every
    sequence is cloned, the clone's sequence is replaced by
    ``funcs.translated(seq, start_pos=args.start_codon)``, and all clones
    are written together to ``args.outfile``.
    """
    translated = bioio.multisequence()

    for path in args.files:
        records = bioio.load(path, options=args.io_opts)
        cout('reading %d sequences from %s' % (len(records), path))

        for record in records:
            protein = record.clone()
            residues = funcs.translated(record, start_pos=args.start_codon)
            protein.set_sequence(residues)
            translated.append(protein)

    bioio.save(translated, args.outfile)
def gather_consensus(args):
    """Collect per-sample consensus sequences and statistics into one place.

    For every entry of ``args.indir`` (in sorted order) the file
    ``<indir>/<entry>/<args.consfile>`` is loaded and its first sequence is
    collected; entries without that file are skipped with a warning.  From
    ``<indir>/<entry>/<args.statfile>`` the first line is taken as the
    header (first non-empty one wins) and the second line as that entry's
    statistics row.

    Results are written to ``args.outdir`` as ``consensus.fas`` and
    ``stats.tsv``.  If ``args.outdir`` is empty it is set to
    ``args.indir + '-results'`` (``args`` is modified in place).  Sequences
    from ``args.add``, when given, are placed before the collected ones.
    """
    # set output directory
    if not args.outdir:
        args.outdir = args.indir + '-results'

    # open input file
    cons = multisequence()
    header = None
    stat_lines = []

    if args.add:
        seqs = load(args.add)
        cons.extend(seqs)

    for indir in sorted(os.listdir(args.indir)):

        seqpath = os.path.join(args.indir, indir, args.consfile)
        print(args.indir, indir, args.consfile, seqpath)
        try:
            seqs = load(seqpath)
        except FileNotFoundError:
            cerr('[WARN: no such file: %s]' % (seqpath))
            continue

        cons.append(seqs[0])

        statpath = os.path.join(args.indir, indir, args.statfile)
        with open(statpath) as fin:
            lines = fin.read().split('\n')
            if not header:
                header = lines[0].strip()
            stat_lines.append(lines[1].strip())

    # An existing directory is fine; any other failure (e.g. permissions)
    # is reported here instead of being swallowed by a bare ``except``.
    os.makedirs(args.outdir, exist_ok=True)

    save(cons, os.path.join(args.outdir, 'consensus.fas'))
    with open(os.path.join(args.outdir, 'stats.tsv'), 'w') as fout:
        # header stays None when no statistics file was read at all; in that
        # case stat_lines is empty too and the file is left empty.
        if header is not None:
            fout.write(header)
            fout.write('\n')
        fout.write('\n'.join(stat_lines))

    cerr(f'[Writing results to directory {args.outdir}]')
示例#9
0
def main(args):
    """Trim trace files and save the resulting sequences.

    Each file in ``args.files`` is loaded and passed to ``traceutils.trim``
    with ``args.winsize`` and ``args.qual_threshold``.  Files for which the
    trim returns a falsy result are skipped.  Otherwise a ``biosequence``
    labelled with the file name is built from the returned bases, annotated
    with the upstream and downstream trim values (as strings) and collected.
    All sequences are written to ``args.outfile``.
    """
    trimmed = bioio.multisequence()

    for path in args.files:
        outcome = traceutils.trim(
            bioio.load(path), args.winsize, args.qual_threshold)
        if outcome:
            # the quality values are returned as well but not used here
            bases, _quals, up_trim, down_trim = outcome
            record = bioio.biosequence(path, bases)
            record.add_attr('upstream_trim', str(up_trim))
            record.add_attr('downstream_trim', str(down_trim))
            trimmed.append(record)

    bioio.save(trimmed, args.outfile)
示例#10
0
def main(args):
    """Recircularize sequences against a reference and save them.

    Sequences come from ``args.infile``; the first sequence of
    ``args.reffile`` is the reference.  Every input sequence is cloned.  If
    ``args.minlen`` is positive and the sequence is longer than it, the
    clone's sequence is replaced by the result of ``recircularize_sequence``
    (called with ``max_mismatch=args.max_mismatch``); otherwise the original
    sequence is kept.  All clones are written to ``args.outfile``.
    """
    output = bioio.multisequence()

    queries = bioio.load(args.infile, options=args.io_opts)
    references = bioio.load(args.reffile)

    for query in queries:
        rotated = query.clone()
        long_enough = args.minlen > 0 and len(query) > args.minlen

        if not long_enough:
            rotated.set_sequence(query.seq)
        else:
            print('seq:', rotated.label)
            new_seq = recircularize_sequence(
                query.seq, references[0].seq,
                max_mismatch=args.max_mismatch)
            rotated.set_sequence(new_seq)

        output.append(rotated)

    bioio.save(output, args.outfile)
示例#11
0
文件: patdist.py 项目: trmznt/seqpy
def main(args):
    """Collect the nearest tree neighbours of reference taxa and save them.

    The Newick tree in ``args.treefile`` is read with dendropy and its
    phylogenetic distance matrix is computed.  Nothing else happens unless
    ``args.collect`` is positive.  In that case:

    * taxa whose label is found in ``args.reffile`` become reference taxa;
    * for each reference taxon, the ``args.collect`` closest taxa (by
      distance matrix value) plus the reference taxon itself are collected;
    * the sequences of all collected taxa are looked up by label in
      ``args.dbfile`` and written to ``args.outfile``.
    """
    import dendropy

    tree = dendropy.Tree.get(path=args.treefile, schema="newick")

    pdc = tree.phylogenetic_distance_matrix()

    cerr('Reading: %d taxa' % len(tree.taxon_namespace))

    if args.collect <= 0:
        return

    ref_seqs = bioio.load(args.reffile)

    ref_taxa = []
    for taxon in tree.taxon_namespace:
        # identity test: get_by_label signals "not found" with None
        if ref_seqs.get_by_label(taxon.label) is not None:
            print('appended')
            ref_taxa.append(taxon)

    cerr('Referenced: %d taxa' % len(ref_taxa))

    collected_taxa = set()
    for t1 in ref_taxa:
        # NOTE(review): the last taxon of the namespace is never considered
        # as a neighbour ([:-1]) -- confirm this is intended.
        d = [(pdc(t1, t2), t2) for t2 in tree.taxon_namespace[:-1]]
        d.sort()

        # slicing (instead of indexing 0..collect-1) avoids an IndexError
        # when fewer than args.collect candidates exist
        for _, neighbour in d[:args.collect]:
            collected_taxa.add(neighbour)
        collected_taxa.add(t1)

    cerr('Collected: %d taxa' % len(collected_taxa))

    db_seqs = bioio.load(args.dbfile)
    mseq = bioio.multisequence()
    for taxon in collected_taxa:
        seq = db_seqs.get_by_label(taxon.label)
        if seq is None:
            # do not put None into the output container
            cerr('[WARN: no sequence labelled %s in %s]'
                 % (taxon.label, args.dbfile))
            continue
        mseq.append(seq)

    bioio.save(mseq, args.outfile)
示例#12
0
def main(args):
    """Recircularize one specific contig against a reference and save it.

    Sequences come from ``args.infile``; the first sequence of
    ``args.reffile`` is the reference.  Only the sequence with the
    hard-coded label below is processed.  A warning is emitted when it is
    shorter than the reference.  If ``args.minlen`` is positive and the
    sequence is longer than it, the clone's sequence is replaced by the
    result of ``recircularize_sequence``; otherwise it is kept as is.  The
    result is written to ``args.outfile``.
    """
    output = bioio.multisequence()

    contigs = bioio.load(args.infile, options=args.io_opts)
    reference = bioio.load(args.reffile)

    for contig in contigs:
        # NOTE(review): every other label is skipped -- this looks like a
        # leftover debugging filter; confirm before relying on it.
        if contig.label != 'NODE_2_length_4501_cov_41.785':
            continue

        if len(contig) < len(reference[0]):
            cerr('WARNING: %s is shorter than reference' % contig.label)

        rotated = contig.clone()
        if not (args.minlen > 0 and len(contig) > args.minlen):
            rotated.set_sequence(contig.seq)
        else:
            print('seq:', rotated.label)
            rotated.set_sequence(
                recircularize_sequence(
                    contig.seq,
                    reference[0].seq,
                    max_mismatch=args.max_mismatch,
                )
            )
        output.append(rotated)

    bioio.save(output, args.outfile)
示例#13
0
def main(args):
    """Append a year to every sequence label using a metadata table.

    ``args.tabfile`` is a tab-separated file whose first line is a header;
    rows are indexed by their first column.  For each sequence in
    ``args.infile`` the row matching its label is looked up (``KeyError``
    if absent) and ``re_date`` is searched in the row's third column.  The
    label becomes ``<label>/<match>``, or ``<label>/-`` when nothing
    matches.  The relabelled sequences are written to ``args.outfile``.
    """
    # read tables
    tables = {}
    # the context manager closes the table file, which was previously leaked
    with open(args.tabfile) as tabfile:
        # skip the header line; tolerate a completely empty file
        next(tabfile, None)
        for line in tabfile:
            items = line.strip().split('\t')
            tables[items[0]] = items

    mseq = bioio.load(args.infile)
    for s in mseq:
        rec = tables[s.label]
        mo = re_date.search(rec[2])
        year = mo.group() if mo else '-'
        s.label = '%s/%s' % (s.label, year)

    bioio.save(mseq, args.outfile)
示例#14
0
文件: readseq.py 项目: afifai/seqpy
def main(args):
    """Read sequence files, annotate them, optionally summarise and save.

    All files in ``args.files`` are loaded and merged in place.  The merged
    container is passed to ``append_attributes`` with ``args.src``,
    ``args.src_isolate`` and ``args.definition``.  With ``args.summary`` the
    A/C/G/T/gap counts of every (upper-cased) sequence are printed; with
    ``args.outfile`` the container is saved.
    """
    merged = None

    for path in args.files:
        loaded = bioio.load(path, options=args.io_opts or [])
        cout('reading %d sequences from %s' % (len(loaded), path))
        if merged is None:
            merged = loaded
        else:
            merged += loaded

    append_attributes(merged, args.src, args.src_isolate, args.definition)

    if args.summary:
        for record in merged:
            bases = record.seq.upper()
            label = record.label.decode('ASCII')
            # counted in the order they appear in the output line
            counts = tuple(bases.count(b) for b in (b'A', b'C', b'G', b'T', b'-'))
            print(">%s\nA:%d\tC:%d\tG:%d\tT:%d\t-:%d" % ((label,) + counts))

    if args.outfile:
        bioio.save(merged, args.outfile, options=args.io_opts or [])
示例#15
0
文件: model.py 项目: trmznt/insane
 def save(self, filename):
     """Write the object stored in ``self._msa`` to ``filename`` via ``bioio.save``."""
     msa = self._msa
     bioio.save(msa, filename)
示例#16
0
文件: condense.py 项目: afifai/seqpy
def main(args):
    """Condense the sequences of ``args.infile`` and write them out.

    The input is loaded with ``bioio.load`` (options from ``args.io_opts``),
    passed through ``funcs.condensed`` and saved to ``args.outfile``.
    """
    source = bioio.load(args.infile, options=args.io_opts)
    cout('reading %d sequences from %s' % (len(source), args.infile))

    condensed = funcs.condensed(source)
    bioio.save(condensed, args.outfile)
示例#17
0
def prepare_submission(args):
    """Prepare a metadata CSV and a FASTA file for submission.

    Inputs (all read from ``args``):

    * ``metafile`` -- metadata table, ``.csv`` or ``.tsv`` (case-insensitive
      extension); indexed by its ``fn`` column.
    * ``infile`` -- tab-separated table with one row per sample; the sample
      code is read from column ``SAMPLE`` if present, else ``fn``, and the
      coverage from ``AVGDEPTH``.
    * ``seqfile`` -- sequences whose labels are the sample codes.

    For every row of ``infile`` whose sample code has a sequence, the
    metadata row is updated (``covv_coverage``, ``fn``,
    ``covv_seq_technology``, ``covv_assembly_method``), the sequence is
    relabelled with the metadata's ``covv_virus_name`` and leading/trailing
    ``-`` are stripped from it.  Metadata rows of samples that were not
    processed are dropped.

    Outputs: ``<outprefix>.csv`` and ``<outprefix>.fas``.

    Raises:
        ValueError: if ``metafile`` ends with neither ``.csv`` nor ``.tsv``.
    """
    out_metadata = args.outprefix + '.csv'
    out_fasta = args.outprefix + '.fas'

    # open metadata file; the separator follows the file extension
    metafile_lc = args.metafile.lower()
    if metafile_lc.endswith('.csv'):
        separator = ','
    elif metafile_lc.endswith('.tsv'):
        separator = '\t'
    else:
        raise ValueError(
            f'unsupported metadata file type (expected .csv or .tsv): '
            f'{args.metafile}')
    cerr(f'[Reading metadata file {args.metafile}]')
    metadata_df = pd.read_table(args.metafile, sep=separator)

    # make sure sequence name is a string (in case the column is
    # automatically converted to number)
    metadata_df['fn'] = metadata_df['fn'].astype('str')
    metadata_df['covv_assembly_method'] = metadata_df['covv_assembly_method'].astype('str')
    metadata_df.set_index('fn', drop=False, inplace=True)

    # open infile tsv
    cerr(f'[Reading infile {args.infile}]')
    submission_df = pd.read_table(args.infile, sep='\t')

    # check for available field in submission_df
    code_field = 'SAMPLE' if 'SAMPLE' in submission_df.columns else 'fn'
    submission_df[code_field] = submission_df[code_field].astype('str')

    # open sequence file and map each label to its position
    cerr(f'[Reading sequence file {args.seqfile}]')
    mseq = bioio.load(args.seqfile)
    mseq_keys = {seq.label: idx for idx, seq in enumerate(mseq)}

    # iterate over submission_df
    used = []
    for _, s in submission_df.iterrows():

        sample_id = s[code_field]

        # samples without a sequence are skipped before touching the
        # metadata, so a missing metadata row for them is not an error
        if sample_id not in mseq_keys:
            continue

        r = metadata_df.loc[sample_id]

        cerr(f'[Preparing sample {sample_id}]')
        # set coverage and submission-wide fields
        metadata_df.at[sample_id, 'covv_coverage'] = s['AVGDEPTH']
        metadata_df.at[sample_id, 'fn'] = out_fasta
        metadata_df.at[sample_id, 'covv_seq_technology'] = args.covv_seq_technology
        metadata_df.at[sample_id, 'covv_assembly_method'] = args.covv_assembly_method

        # set sequence name
        idx = mseq_keys[sample_id]
        mseq[idx].label = r['covv_virus_name']
        mseq[idx].seq = mseq[idx].seq.strip(b'-')
        used.append(sample_id)
        cerr(f'[Finish preparing  sample {sample_id}]')

    # remove unused metadata
    metadata_df = metadata_df.loc[used]

    # write to new fasta & metadata file
    metadata_df.to_csv(out_metadata, sep=',', index=False)
    bioio.save(mseq, out_fasta)