Example #1
        cols = line.split('\t')
        OTU = cols[9]
        Hit = cols[8]
        if OTU not in iSeqMapped:
            iSeqMapped[OTU] = [Hit]
        else:
            iSeqMapped[OTU].append(Hit)
with open(ClusterComp, 'w') as clusters:
    clusters.write('OTU\tiSeqs\n')
    for k, v in natsorted(iSeqMapped.items()):
        clusters.write('%s\t%s\n' % (k, ', '.join(v)))

#strip N's
ufitslib.log.info("Cleaning up padding from OTUs")
otu_clean = os.path.join(tmp, args.out + '.EE' + args.maxee + '.clean.fa')
ufitslib.fasta_strip_padding(uclust_out, otu_clean)

#run optional uchime_ref
if not args.uchime_ref:
    uchime_out = otu_clean
else:
    uchime_out = os.path.join(
        tmp, args.out + '.EE' + args.maxee + '.uchime.otus.fa')
    #R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy
    if args.uchime_ref in [
            'ITS', '16S', 'LSU', 'COI'
    ]:  # test if it is one that is set up, otherwise default to a full path
        uchime_db = os.path.join(parentdir, 'DB',
                                 args.uchime_ref + '.extracted.fa')
        if not os.path.isfile(uchime_db):
            ufitslib.log.error(
                "Database not properly configured, run `ufits install` to setup DB, skipping chimera filtering"
            )
            uchime_out = otu_clean
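
The snippet above accumulates a mapping of each OTU (field 10 of the tab-delimited .uc hits) to the iSeq labels mapped to it (field 9). A minimal sketch of the same accumulation using collections.defaultdict, with a hypothetical input path and an assumed filter to 'H' (hit) records, could look like this:

from collections import defaultdict

iSeqMapped = defaultdict(list)
with open('mapping.uc') as uc:  # hypothetical path to a vsearch/usearch .uc hits file
    for line in uc:
        cols = line.rstrip('\n').split('\t')
        if cols[0] != 'H':  # assumption: keep only hit records, which carry a query/target pair
            continue
        iSeqMapped[cols[9]].append(cols[8])  # target label (OTU) -> query labels (iSeqs)
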
Example #2
                tax = col[2]
                if any(x in tax for x in filt_tax_values):
                    record = seqDict[ID]
                    record.id = 'OTU' + str(otu_counter) + ';UTAX;tax=' + tax
                    record.name = ''
                    record.description = ''
                    SeqIO.write(record, output, 'fasta')
                    otu_counter += 1
    total = ufitslib.countfasta(ref_clustered) - num_refcluster
    ufitslib.log.info('{0:,}'.format(total) + ' classified to %s' %
                      taxonomyLookup.get(args.utax_level))

#clean up padded N's
ufitslib.log.info("Cleaning up padding from OTUs")
otu_clean = os.path.join(tmp, args.out + '.clean.otus.fa')
ufitslib.fasta_strip_padding(ref_clustered, otu_clean)
total = ufitslib.countfasta(otu_clean)
ufitslib.log.info('{0:,}'.format(total) + ' total OTUs')

#now map reads back to OTUs
uc_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.mapping.uc')
otu_table = os.path.join(tmp, args.out + '.EE' + args.maxee + '.otu_table.txt')
#setup reads to map
if args.map_filtered:
    reads = filter_fasta
else:
    reads = orig_fasta
ufitslib.log.info("Mapping Reads to OTUs and Building OTU table")
cmd = [
    'vsearch', '--usearch_global', reads, '--strand', 'plus', '--id', '0.97',
    '--db', otu_clean, '--uc', uc_out, '--otutabout', otu_table
]
ufitslib.runSubprocess(cmd, ufitslib.log)
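
ufitslib.runSubprocess is presumably a thin wrapper that logs the assembled argument list and then executes it; that is only an assumption about the helper, but a minimal sketch of the idea on the standard library could be:

import logging
import subprocess

def run_and_log(cmd, log):
    # log the command, run it, and report stderr on a non-zero exit (sketch, not ufitslib's implementation)
    log.debug(' '.join(cmd))
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if proc.returncode != 0:
        log.error(proc.stderr.decode())

# e.g. run_and_log(['vsearch', '--version'], logging.getLogger('example'))
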
Example #3
                ID = col[0]
                tax = col[2]
                if any(x in tax for x in filt_tax_values):
                    record = seqDict[ID]
                    record.id = 'OTU'+str(otu_counter)+';UTAX;tax='+tax
                    record.name = ''
                    record.description = ''
                    SeqIO.write(record, output, 'fasta')
                    otu_counter += 1
    total = ufitslib.countfasta(ref_clustered) - num_refcluster
    ufitslib.log.info('{0:,}'.format(total) + ' classified to %s' % taxonomyLookup.get(args.utax_level))

#clean up padded N's
ufitslib.log.info("Cleaning up padding from OTUs")
otu_clean = os.path.join(tmp, args.out + '.clean.otus.fa')
ufitslib.fasta_strip_padding(ref_clustered, otu_clean)           
total = ufitslib.countfasta(otu_clean)
ufitslib.log.info('{0:,}'.format(total) + ' total OTUs')
       
#now map reads back to OTUs
uc_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.mapping.uc')
otu_table = os.path.join(tmp, args.out + '.EE' + args.maxee + '.otu_table.txt')
#setup reads to map
if args.map_filtered:
    reads = filter_fasta
else:
    reads = orig_fasta
ufitslib.log.info("Mapping Reads to OTUs and Building OTU table")
cmd = ['vsearch', '--usearch_global', reads, '--strand', 'plus', '--id', '0.97', '--db', otu_clean, '--uc', uc_out, '--otutabout', otu_table]
ufitslib.runSubprocess(cmd, ufitslib.log)
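
ufitslib.countfasta is presumably a simple record counter; an equivalent sketch that just counts FASTA header lines would be:

def count_fasta(path):
    # count sequences by counting header lines starting with '>' (sketch, not ufitslib's implementation)
    with open(path) as fh:
        return sum(1 for line in fh if line.startswith('>'))
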
Example #4
        cols = line.split('\t')
        OTU = cols[9]
        Hit = cols[8]
        if OTU not in iSeqMapped:
            iSeqMapped[OTU] = [Hit]
        else:
            iSeqMapped[OTU].append(Hit)
with open(ClusterComp, 'w') as clusters:
    clusters.write('OTU\tiSeqs\n')
    for k, v in natsorted(iSeqMapped.items()):
        clusters.write('%s\t%s\n' % (k, ', '.join(v)))

#strip N's
ufitslib.log.info("Cleaning up padding from OTUs")
otu_clean = os.path.join(tmp, args.out + '.EE' + args.maxee + '.clean.fa')
ufitslib.fasta_strip_padding(uclust_out, otu_clean)


#run optional uchime_ref
if not args.uchime_ref:
    uchime_out = otu_clean
else:
    uchime_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.uchime.otus.fa')
    #R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy
    if args.uchime_ref in ['ITS', '16S', 'LSU', 'COI']:  # test if it is one that is set up, otherwise default to a full path
        uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref+'.extracted.fa')
        if not os.path.isfile(uchime_db):
            ufitslib.log.error("Database not properly configured, run `ufits install` to setup DB, skipping chimera filtering")
            uchime_out = otu_clean
    else:
        uchime_db = os.path.abspath(args.uchime_ref)
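
ufitslib.fasta_strip_padding presumably trims the N's used to pad reads to a uniform length before clustering; under that assumption, a hedged Biopython sketch could look like:

from Bio import SeqIO
from Bio.Seq import Seq

def strip_padding(fasta_in, fasta_out):
    # remove leading/trailing N padding from every record (assumption about what fasta_strip_padding does)
    with open(fasta_out, 'w') as out:
        for rec in SeqIO.parse(fasta_in, 'fasta'):
            rec.seq = Seq(str(rec.seq).strip('Nn'))
            SeqIO.write(rec, out, 'fasta')
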
Example #5
cmd = ["vsearch", "--sortbysize", unoise_out, "--minsize", args.minsize, "--output", sort_out]
ufitslib.runSubprocess(cmd, ufitslib.log)

# now run clustering algorithm
radius = str(100 - int(args.pct_otu))  # UPARSE radius = 100 - percent identity (e.g. 97% identity -> radius 3)
otu_out = os.path.join(tmp, args.out + ".EE" + args.maxee + ".otus.fa")
ufitslib.log.info("Clustering OTUs (UPARSE)")
cmd = [usearch, "-cluster_otus", sort_out, "-relabel", "OTU", "-otu_radius_pct", radius, "-otus", otu_out]
ufitslib.runSubprocess(cmd, ufitslib.log)
numOTUs = ufitslib.countfasta(otu_out)
ufitslib.log.info("{0:,}".format(numOTUs) + " OTUs")

# clean up padded N's
ufitslib.log.info("Cleaning up padding from OTUs")
otu_clean = os.path.join(tmp, args.out + ".EE" + args.maxee + ".clean.otus.fa")
ufitslib.fasta_strip_padding(otu_out, otu_clean)

# optional UCHIME Ref
if not args.uchime_ref:
    uchime_out = otu_clean
else:
    uchime_out = os.path.join(tmp, args.out + ".EE" + args.maxee + ".uchime.otus.fa")
    # check if file is present, remove from previous run if it is.
    if os.path.isfile(uchime_out):
        os.remove(uchime_out)
    # R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy
    if args.uchime_ref in [
        "ITS",
        "16S",
        "LSU",
        "COI",