def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-filter.py',
        description='''Script inspects output of amptk-OTU_cluster.py and determines useful threshold for OTU output based on a spike-in mock community.''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--otu_table', required=True, help='Input OTU table')
    parser.add_argument('-f', '--fasta', required=True, help='Input OTUs (multi-fasta)')
    parser.add_argument('-b', '--mock_barcode', help='Barcode of mock community')
    parser.add_argument('-p', '--index_bleed', help='Index bleed filter. Default: auto')
    parser.add_argument('-t', '--threshold', default='max', choices=['sum', 'max', 'top25', 'top10', 'top5'], help='Threshold to use when calculating index-bleed')
    parser.add_argument('-c', '--calculate', default='all', choices=['all', 'in'], help='Calculate index-bleed; use "all" for a synthetic mock, otherwise use "in"')
    parser.add_argument('-s', '--subtract', default=0, help='Threshold to subtract')
    parser.add_argument('-n', '--normalize', default='y', choices=['y', 'n'], help='Normalize OTU table prior to filtering')
    parser.add_argument('-m', '--mc', help='Multi-FASTA mock community')
    parser.add_argument('-d', '--drop', nargs='+', help='Samples to drop from table after index-bleed filtering')
    parser.add_argument('--ignore', nargs='+', help='Ignore OTUs during index-bleed')
    parser.add_argument('--delimiter', default='tsv', choices=['csv', 'tsv'], help='Delimiter')
    parser.add_argument('--col_order', nargs='+', dest="col_order", help='Provide space-separated list of sample (column) names')
    parser.add_argument('--keep_mock', action='store_true', help='Keep mock sample in OTU table (Default: False)')
    parser.add_argument('--show_stats', action='store_true', help='Show stats datatable on STDOUT')
    parser.add_argument('--negatives', nargs='+', help='Negative control sample names')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('--min_reads_otu', default=2, type=int, help='Minimum number of reads per OTU for experiment')
    parser.add_argument('--min_samples_otu', default=1, type=int, help='Minimum number of samples per OTU for experiment')
    parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9', help='USEARCH9 EXE')
    parser.add_argument('--debug', action='store_true', help='Keep intermediate files')
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))

    if not args.out:
        #get base name of files
        base = args.otu_table.split(".otu_table")[0]
    else:
        base = args.out

    #remove logfile if exists
    log_name = base + '.amptk-filter.log'
    amptklib.removefile(log_name)

    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #check if otu_table is empty
    amptklib.log.info("Loading OTU table: %s" % args.otu_table)
    check = os.stat(args.otu_table).st_size
    if check == 0:
        amptklib.log.error("Input OTU table is empty")
        sys.exit(1)

    #get the OTU header info (depending on how the OTU table was constructed this might differ, so find it for indexing)
    with open(args.otu_table, 'r') as f:
        first_line = f.readline()
        OTUhead = first_line.split('\t')[0]

    if args.delimiter == 'csv':
        delim = str(',')
        ending = '.csv'
    elif args.delimiter == 'tsv':
        delim = str('\t')
        ending = '.txt'

    #setup outputs
    sorted_table = base + '.sorted' + ending
    normal_table_pct = base + '.normalized.pct' + ending
    normal_table_nums = base + '.normalized.num' + ending
    subtract_table = base + '.normalized.subtract' + ending
    filtered_table = base + '.normalized' + ending
    final_table = base + '.final' + ending
    final_binary_table = base + '.final.binary' + ending
    stats_table = base + '.stats' + ending

    #load OTU table into pandas DataFrame
    df = pd.read_csv(args.otu_table, sep='\t')
    df.set_index(OTUhead, inplace=True)
    headers = df.columns.values.tolist()
    if headers[-1] == 'taxonomy' or headers[-1] == 'Taxonomy':
        otuDict = df[headers[-1]].to_dict()
        del df[headers[-1]]
    else:
        otuDict = False

    #parse OTU table to get count data for each OTU
    AddCounts = {}
    OTUcounts = df.sum(1)
    for x in OTUcounts.index:
        AddCounts[x] = int(OTUcounts[x])

    #now add counts to fasta header
    FastaCounts = base + '.otus.counts.fa'
    OTU_tax = {}
    with open(FastaCounts, 'w') as outfile:
        with open(args.fasta, 'r') as infile:
            for rec in SeqIO.parse(infile, 'fasta'):
                if ';' in rec.id:  #this should mean there is taxonomy, so split it
                    ID = rec.id.split(';', 1)[0]
                    tax = rec.id.split(';', 1)[-1]
                    OTU_tax[ID] = tax
                    if ID in AddCounts:
                        count = AddCounts.get(ID)
                    else:
                        count = 0
                    outfile.write('>%s;size=%i\n%s\n' % (ID, count, rec.seq))
                else:  #no tax, just process
                    if rec.id in AddCounts:
                        count = AddCounts.get(rec.id)
                    else:
                        count = 0
                    outfile.write('>%s;size=%i\n%s\n' % (rec.id, count, rec.seq))

    amptklib.log.info('OTU table contains {:,} samples, {:,} OTUs, and {:,} read counts'.format(
        len(df.columns.values.tolist()), len(df.index), int(df.values.sum())))

    #setup output files/variables
    mock_out = base + '.mockmap.txt'

    if args.mock_barcode:  #if user passes a column name for mock
        #check if mock barcode is valid
        validBCs = df.columns.values.tolist()
        if not args.mock_barcode in validBCs:
            amptklib.log.error("%s not a valid barcode." % args.mock_barcode)
            amptklib.log.error("Valid barcodes: %s" % (' '.join(validBCs)))
            sys.exit(1)
        if args.col_order and not args.mock_barcode in args.col_order:
            amptklib.log.error("Error: %s not listed in --col_order." % args.mock_barcode)
            sys.exit(1)
        #make sure there is a --mc passed here, otherwise throw an error
        if not args.mc:
            amptklib.log.error("If using the -b,--mock_barcode option you must specify a FASTA file of the mock community via the --mc option")
            sys.exit(1)
        #get default mock community value
        if args.mc == "mock3":
            mock = os.path.join(parentdir, 'DB', 'amptk_mock3.fa')
        elif args.mc == "mock2":
            mock = os.path.join(parentdir, 'DB', 'amptk_mock2.fa')
        elif args.mc == "mock1":
            mock = os.path.join(parentdir, 'DB', 'amptk_mock1.fa')
        elif args.mc == "synmock":
            mock = os.path.join(parentdir, 'DB', 'amptk_synmock.fa')
        else:
            mock = os.path.abspath(args.mc)

        #open mock community fasta and count records
        mock_ref_count = amptklib.countfasta(mock)

        #load OTU lengths into dictionary
        SeqLength = amptklib.fastalen2dict(args.fasta)

        #map OTUs to mock community
        amptklib.log.info("Mapping OTUs to Mock Community (USEARCH)")
        cmd = [usearch, '-usearch_global', mock, '-strand', 'plus', '-id', '0.65',
               '-db', FastaCounts, '-userout', mock_out,
               '-userfields', 'query+target+id+ql+tl+alnlen+caln+mism+diffs',
               '-maxaccepts', '0', '-maxrejects', '0']
        amptklib.runSubprocess(cmd, amptklib.log)
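        # The -userfields above define the columns of mock_out that the parser
        # below indexes positionally:
        #   0=query (mock member), 1=target (OTU;size=N), 2=id (%), 3=ql, 4=tl,
        #   5=alnlen, 6=caln, 7=mism, 8=diffs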
        #generate dictionary for name change
        '''
        If args.calculate is set to all, that means the script is trying to measure a synthetic mock of some kind.
        If that is the case, then chimeras are < 95% identical to mock members, and variants are hits in between,
        i.e. > 95% identical but not the best hit.
        '''
        Results = {}
        errorrate = {}
        with open(mock_out, 'r') as map:
            for line in map:
                line = line.replace('\n', '')
                cols = line.split('\t')
                MockID = cols[0]
                hit = cols[1].split(';size=')
                otuID = hit[0]
                abundance = int(hit[1])
                pident = float(cols[2])
                length = int(cols[4])
                mism = int(cols[7])
                diffs = int(cols[8])
                score = abundance * pident * length
                if not otuID in errorrate:
                    errorrate[otuID] = [MockID, diffs]
                else:
                    olderror = errorrate.get(otuID)
                    if diffs < olderror[1]:
                        errorrate[otuID] = [MockID, diffs]
                if not MockID in Results:
                    Results[MockID] = [(otuID, abundance, pident, length, mism, diffs, score)]
                else:
                    Results[MockID].append((otuID, abundance, pident, length, mism, diffs, score))

        found_dict = {}
        chimeras = []
        variants = []
        missing = []
        for k, v in natsorted(list(Results.items())):
            besthit = []
            #v is a list of tuples of results, parse through to get best hit
            for y in v:
                if y[2] >= 97.0:
                    besthit.append(y)
                elif y[2] >= 95.0 and y[2] < 97.0:
                    if not y[0] in variants:
                        variants.append(y[0])
                else:
                    if not y[0] in chimeras:
                        chimeras.append(y[0])
            if len(besthit) > 0:
                besthit.sort(key=lambda x: x[1], reverse=True)
                best = sorted(besthit[:3], key=lambda x: x[6], reverse=True)
                found_dict[k] = best[0]
            else:
                missing.append(k)

        #make name change dict
        annotate_dict = {}
        seen = []
        for k, v in natsorted(list(found_dict.items())):
            ID = v[0].replace('_chimera', '')
            newID = k + '_pident=' + str(v[2]) + '_' + v[0]
            annotate_dict[ID] = newID
            if not v[0] in seen:
                seen.append(v[0])
        if args.calculate == 'all':
            chimeras = [x for x in chimeras if x not in seen]
            variants = [x for x in variants if x not in seen]
            for i in chimeras:
                annotate_dict[i] = i + '_suspect_mock_chimera'
            for x in variants:
                annotate_dict[x] = x + '_suspect_mock_variant'
        if len(missing) > 0:
            amptklib.log.info("%i mock missing: %s" % (len(missing), ', '.join(missing)))
    else:
        otu_new = args.fasta

    #rename OTUs
    if args.mock_barcode:
        df.rename(index=annotate_dict, inplace=True)

    #sort the table
    df2 = df.reindex(index=natsorted(df.index))
    if not args.col_order:
        amptklib.log.info("Sorting OTU table naturally")
        df = df2.reindex(columns=natsorted(df2.columns))
    else:
        amptklib.log.info("Sorting OTU table by user defined order (--col_order)")
        col_headers = args.col_order
        #drop any requested columns that are not in the table (iterate over a copy so removal is safe)
        for i in list(col_headers):
            if not i in df2.columns.values:
                col_headers.remove(i)
        df = df2.reindex(columns=col_headers)
    SortedTable = df
    if otuDict:
        df['Taxonomy'] = pd.Series(otuDict)
        df.to_csv(sorted_table, sep=delim)
        del df['Taxonomy']
    else:
        df.to_csv(sorted_table, sep=delim)

    #get sums of columns
    fs = df.sum(axis=0)
    #fs.to_csv('reads.per.sample.csv')
    otus_per_sample_original = df[df > 0].count(axis=0, numeric_only=True)
    filtered = pd.DataFrame(df, columns=fs.index)
    filt2 = filtered.loc[(filtered != 0).any(1)]
    tos = filt2.sum(axis=1)
    fotus = tos[tos >= args.min_reads_otu]  #a valid OTU must be seen at least --min_reads_otu times (default 2), i.e. no singletons
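    # With the default --min_reads_otu of 2, an OTU whose counts sum to 1 across
    # all samples (a singleton) is dropped by the filter below.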
    if len(fotus.index) < len(tos.index):
        diff = len(tos.index) - len(fotus.index)
        amptklib.log.info("Removing {:,} OTUs according to --min_reads_otu {:,}".format(diff, args.min_reads_otu))
    filt3 = pd.DataFrame(filt2, index=fotus.index)

    if args.normalize == 'y':
        #normalize the OTU table
        normal = filt3.truediv(fs)
        if otuDict:
            normal['Taxonomy'] = pd.Series(otuDict)
            normal.to_csv(normal_table_pct, sep=delim)
            del normal['Taxonomy']
        else:
            normal.to_csv(normal_table_pct, sep=delim)
        #normalize back to read counts, pretend 100,000 reads in each
        norm_round = np.round(normal.multiply(100000), decimals=0)
        if otuDict:
            norm_round['Taxonomy'] = pd.Series(otuDict)
            norm_round.to_csv(normal_table_nums, sep=delim)
            del norm_round['Taxonomy']
        else:
            norm_round.to_csv(normal_table_nums, sep=delim)
        amptklib.log.info("Normalizing OTU table to number of reads per sample")
    else:
        norm_round = filt3

    if args.mock_barcode:
        #now calculate the index-bleed in both directions (into the mock and mock into the other samples)
        mock = []
        sample = []
        #get names from mapping
        for k, v in list(annotate_dict.items()):
            if not '_suspect_mock_' in v:
                mock.append(v)
        for i in norm_round.index:
            if not i in mock:
                sample.append(i)
        if args.ignore:
            mock = [x for x in mock if x not in args.ignore]
            sample = [x for x in sample if x not in args.ignore]
        #first calculate bleed out of mock community
        #slice normalized dataframe to get only mock OTUs from table
        mock_df = pd.DataFrame(norm_round, index=mock)
        #if there are samples to drop, make sure they aren't being used in this calculation
        if args.drop:
            mock_df.drop(args.drop, axis=1, inplace=True)
        #get total number of reads from mock OTUs from entire table
        total = np.sum(np.sum(mock_df, axis=None))
        #now drop the mock barcode sample
        mock_df.drop(args.mock_barcode, axis=1, inplace=True)
        #get number of reads that are result of bleed over
        bleed1 = np.sum(np.sum(mock_df, axis=None))
        #calculate rate of bleed by dividing num reads bled by the total
        bleed1max = bleed1 / float(total)
        #second, calculate bleed into mock community
        #get list of mock OTUs not found in any other sample -> these are likely chimeras
        mock_only = pd.DataFrame(norm_round, index=list(norm_round.index), columns=[args.mock_barcode])
        mock_OTUs_zeros = mock_only.loc[(mock_only == 0).all(axis=1)]
        theRest = [x for x in list(norm_round.columns.values) if x not in [args.mock_barcode]]
        non_mocks = pd.DataFrame(norm_round, index=sample, columns=theRest)
        non_mock_zeros = non_mocks.loc[(non_mocks == 0).all(axis=1)]
        zeros = [x for x in list(non_mock_zeros.index) if x not in list(mock_OTUs_zeros.index)]
        if len(zeros) > 0:
            amptklib.log.info("Found {:,} mock chimeras (only in mock sample and not mapped to mock sequences), excluding from index-bleed calculation".format(len(zeros)))
            amptklib.log.debug('{:}'.format(', '.join(zeros)))
        #now get updated list of samples, dropping chimeras
        samples_trimmed = [x for x in sample if x not in zeros]
        #slice the OTU table to get all OTUs that are not in the mock community from the mock sample
        sample_df = pd.DataFrame(norm_round, index=samples_trimmed, columns=[args.mock_barcode])
        #get total number of reads that don't belong in mock
        bleed2 = np.sum(np.sum(sample_df, axis=None))
        #now pull the entire mock sample
        mock_sample = pd.DataFrame(norm_round, columns=[args.mock_barcode])
        #calculate bleed into mock by dividing num reads that don't belong by the total, i.e. the percentage of bad reads in the mock
        bleed2max = bleed2 / float(np.sum(mock_sample.sum(axis=1)))
        #autocalculate the subtraction filter by taking the maximum value that doesn't belong
        subtract_num = max(sample_df.max())
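        # Worked example of the bleed math above (hypothetical numbers, for
        # illustration only): if the mock OTUs total 100,000 normalized reads
        # and 250 of those reads show up outside the mock sample, then
        # bleed1max = 250 / 100000 = 0.0025; the math.ceil(x * 1000) / 1000
        # below then rounds the filter up to 0.003 (0.3%).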
        #get max values for bleed
        #can only use the "into samples" measurement if not using synmock
        if args.calculate == 'all':
            if bleed1max > bleed2max:
                bleedfilter = math.ceil(bleed1max * 1000) / 1000
            else:
                bleedfilter = math.ceil(bleed2max * 1000) / 1000
            amptklib.log.info("Index bleed, mock into samples: %f%%. Index bleed, samples into mock: %f%%." % (bleed1max * 100, bleed2max * 100))
        else:
            bleedfilter = math.ceil(bleed2max * 1000) / 1000
            amptklib.log.info("Index bleed, samples into mock: %f%%." % (bleed2max * 100))
    else:
        bleedfilter = args.index_bleed  #this is the value needed to filter MiSeq; Ion is likely less, but it shouldn't affect the data much either way

    if args.index_bleed:
        args.index_bleed = float(args.index_bleed)
        amptklib.log.info("Overwriting auto-detected index-bleed, setting to %f%%" % (args.index_bleed * 100))
        bleedfilter = args.index_bleed
    else:
        if bleedfilter:
            amptklib.log.info("Will use value of %f%% for index-bleed OTU filtering." % (bleedfilter * 100))
        else:
            bleedfilter = 0  #no filtering if you don't pass -p or -b
            amptklib.log.info("No spike-in mock (-b) or index-bleed (-p) specified, thus not running index-bleed filtering")

    if bleedfilter > 0.05:
        amptklib.log.info("Index bleed into samples is abnormally high (%f%%); if you have a biological mock you should use `--calculate in`" % (bleedfilter * 100))

    #to combat barcode switching, loop through each OTU, filtering out counts below the bleedfilter threshold
    cleaned = []
    for row in norm_round.itertuples():
        result = [row[0]]
        if args.threshold == 'max':
            total = max(row[1:])  #get max OTU count from table to calculate index bleed from
        elif args.threshold == 'sum':
            total = sum(row[1:])
        elif args.threshold == 'top25':
            top = sorted(row[1:], key=int, reverse=True)
            topn = int(round(len(row[1:]) * 0.25))
            total = sum(top[:topn])
        elif args.threshold == 'top10':
            top = sorted(row[1:], key=int, reverse=True)
            topn = int(round(len(row[1:]) * 0.10))
            total = sum(top[:topn])
        elif args.threshold == 'top5':
            top = sorted(row[1:], key=int, reverse=True)
            topn = int(round(len(row[1:]) * 0.05))
            total = sum(top[:topn])
        sub = total * bleedfilter
        for i in row[1:]:
            if i < sub:
                i = 0
            result.append(i)
        cleaned.append(result)

    header = [OTUhead]
    for i in norm_round.columns:
        header.append(i)

    #create data frame of index bleed filtered results
    final = pd.DataFrame(cleaned, columns=header)
    final.set_index(OTUhead, inplace=True)

    if args.drop:  #if user has passed samples to drop, do it here, subtracting the drop list from the header
        amptklib.log.info("Dropping %i samples from table: %s" % (len(args.drop), ', '.join(args.drop)))
        colsdrop = []
        for x in args.drop:
            if x in header:
                colsdrop.append(x)
        #now drop those columns
        final.drop(colsdrop, axis=1, inplace=True)

    if args.subtract != 'auto':
        subtract_num = int(args.subtract)
    else:
        try:
            subtract_num = int(subtract_num)
            amptklib.log.info("Auto subtract filter set to %i" % subtract_num)
        except NameError:
            subtract_num = 0
            amptklib.log.info("Error: to use the 'auto' subtract feature, provide a sample name to -b,--mock_barcode.")
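    # Example of the subtract filter (hypothetical value): with subtract_num
    # of 25, final.subtract(25) lowers every cell by 25 and the clamp below
    # zeroes anything negative, so all counts of 25 or fewer are discarded.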
    if subtract_num != 0:
        amptklib.log.info("Subtracting %i from OTU table" % subtract_num)
        sub = final.subtract(subtract_num)
        sub[sub < 0] = 0  #if negative, change to zero
        sub = sub.loc[~(sub == 0).all(axis=1)]
        sub = sub.astype(int)
        if otuDict:
            sub['Taxonomy'] = pd.Series(otuDict)
            sub.to_csv(subtract_table, sep=delim)
            del sub['Taxonomy']
        else:
            sub.to_csv(subtract_table, sep=delim)
        otus_if_sub = sub[sub > 0].count(axis=0, numeric_only=True)
        final = sub.astype(int)
    otus_per_sample = final[final > 0].count(axis=0, numeric_only=True)
    stats = pd.concat([fs, otus_per_sample_original, otus_per_sample], axis=1)
    stats.columns = ['reads per sample', 'original OTUs', 'final OTUs']
    stats.fillna(0, inplace=True)
    stats = stats.astype(int)
    if args.show_stats:
        print(stats.to_string())
    stats.to_csv(stats_table, sep=delim)

    #after all filtering, get list of OTUs in mock barcode
    if args.mock_barcode:
        mocks = final[args.mock_barcode]
        mocks = mocks.loc[~(mocks == 0)].astype(int)
        totalmismatches = 0
        totallength = 0
        chimera_count = 0
        variant_count = 0
        for otu in mocks.index:
            count = mocks[otu]
            if 'suspect_mock' in otu:
                if 'chimera' in otu:
                    chimera_count += 1
                if 'variant' in otu:
                    variant_count += 1
                otu = otu.split('_', 1)[0]
            else:
                otu = otu.split('_', -1)[-1]
            otu_length = SeqLength.get(otu)
            countlen = otu_length * count
            totallength += countlen
            if otu in errorrate:
                otu_diffs = errorrate.get(otu)[1]
                totaldiffs = otu_diffs * count
                totalmismatches += totaldiffs
            else:
                totalmismatches += countlen
        e_rate = totalmismatches / float(totallength) * 100
        amptklib.log.info(args.mock_barcode + ' sample has ' + '{0:,}'.format(len(mocks)) + ' OTUs out of ' + '{0:,}'.format(mock_ref_count) + ' expected; ' + '{0:,}'.format(variant_count) + ' mock variants; ' + '{0:,}'.format(chimera_count) + ' mock chimeras; Error rate: ' + '{0:.3f}%'.format(e_rate))

    if not args.keep_mock:
        try:
            final.drop(args.mock_barcode, axis=1, inplace=True)
        except:
            pass

    #drop OTUs that are now zeros through whole table
    final = final.loc[~(final == 0).all(axis=1)]
    final = final.astype(int)

    #output filtered normalized table
    if otuDict:
        final['Taxonomy'] = pd.Series(otuDict)
        final.to_csv(filtered_table, sep=delim)
        del final['Taxonomy']
    else:
        final.to_csv(filtered_table, sep=delim)

    #convert to binary
    final[final > 0] = 1

    #apply min_samples_otu here (most stringent filter; only use it if you know what you are doing)
    los = final.sum(axis=1)
    fotus = los[los >= args.min_samples_otu]
    keep = fotus.index
    final2 = pd.DataFrame(final, index=keep)
    diff = len(final.index) - len(keep)
    if diff > 0:
        amptklib.log.info('Dropped {:,} OTUs found in fewer than {:,} samples'.format(diff, args.min_samples_otu))

    #drop samples that don't have any OTUs after filtering
    final3 = final2.loc[:, (final2 != 0).any(axis=0)]
    final3 = final3.astype(int)

    #get the actual read counts back from the binary table
    merge = {}
    for index, row in final3.items():
        merge[index] = []
        for i in range(0, len(row)):
            if row[i] == 0:
                merge[index].append(row[i])
            else:
                merge[index].append(SortedTable[index][row.index[i]])
    FiltTable = pd.DataFrame(merge, index=list(final3.index))
    FiltTable.index.name = '#OTU ID'

    #order the filtered table
    #sort the table
    FiltTable2 = FiltTable.reindex(index=natsorted(FiltTable.index))
    if not args.col_order:
        FiltTable = FiltTable2.reindex(columns=natsorted(FiltTable2.columns))
    else:
        col_headers = args.col_order
        #drop any requested columns that are not in the table (iterate over a copy so removal is safe)
        for i in list(col_headers):
            if not i in FiltTable2.columns.values:
                col_headers.remove(i)
        FiltTable = FiltTable2.reindex(columns=col_headers)
    #check for negative samples and how many OTUs are in these samples
    #if found, filter the OTUs and alert user to rebuild the OTU table; this could be done automatically, but it would
    #require reads to be passed to this script. Just deleting the OTUs is probably not okay...
    if args.negatives:
        if len(args.negatives) > 1:  #if greater than 1 then assuming a list of sample names
            Neg = args.negatives
        else:
            if os.path.isfile(args.negatives[0]):  #check if it is a file or not
                Neg = []
                with open(args.negatives[0], 'r') as negfile:
                    for line in negfile:
                        line = line.replace('\n', '')
                        Neg.append(line)
            else:
                Neg = args.negatives
        #now slice the final OTU table, check if values are valid (iterate over a copy so removal is safe)
        NotFound = []
        for i in list(Neg):
            if not i in FiltTable.columns.values:
                Neg.remove(i)
                NotFound.append(i)
        if len(NotFound) > 0:
            amptklib.log.info('Samples not found: %s' % ' '.join(NotFound))
        #slice table
        NegTable = FiltTable.reindex(columns=Neg)
        #drop those that are zeros through all samples, i.e. pull out only OTUs found in the negative samples
        NegTable = NegTable.loc[~(NegTable == 0).all(axis=1)]
        NegOTUs = list(NegTable.index)
        #now make sure you aren't dropping mock OTUs, as you want to keep those for filtering the new OTU table
        NegOTUs = [item for item in NegOTUs if item not in mock]
    else:
        NegOTUs = []

    #check if negative OTUs exist; if so, output updated OTUs and instructions on creating a new OTU table
    if len(NegOTUs) > 0:
        amptklib.log.info("%i OTUs are potentially contamination" % len(NegOTUs))
        otu_clean = base + '.cleaned.otus.fa'
        with open(otu_clean, 'w') as otu_update:
            with open(args.fasta, "rU") as myfasta:
                for rec in SeqIO.parse(myfasta, 'fasta'):
                    if not rec.id in NegOTUs:
                        SeqIO.write(rec, otu_update, 'fasta')
        amptklib.log.info("Cleaned OTUs saved to: %s" % otu_clean)
        amptklib.log.info("Generate a new OTU table like so:\namptk remove -i %s --format fasta -l %s -o %s\nvsearch --usearch_global %s --db %s --strand plus --id 0.97 --otutabout newOTU.table.txt\n" % (base + '.demux.fq', ' '.join(Neg), base + '.cleaned.fa', base + '.cleaned.fa', otu_clean))
    else:  #proceed with rest of script
        #output final table
        if otuDict:
            FiltTable['Taxonomy'] = pd.Series(otuDict)
            FiltTable.to_csv(final_table, sep=delim)
            del FiltTable['Taxonomy']
        else:
            FiltTable.to_csv(final_table, sep=delim)
        finalSamples = FiltTable.columns.values.tolist()
        if 'Taxonomy' in finalSamples:
            numFinalSamples = len(finalSamples) - 1
        else:
            numFinalSamples = len(finalSamples)
        amptklib.log.info('Filtered OTU table contains {:,} samples, {:,} OTUs, and {:,} read counts'.format(
            numFinalSamples, len(FiltTable.index), FiltTable.values.sum()))
        if numFinalSamples < len(df.columns.values.tolist()):
            diffSamples = [item for item in headers if item not in FiltTable.columns.values.tolist()]
            amptklib.log.info('Samples dropped: %s' % (','.join(diffSamples)))
        #output binary table
        if otuDict:
            final3['Taxonomy'] = pd.Series(otuDict)
            final3.to_csv(final_binary_table, sep=delim)
        else:
            final3.to_csv(final_binary_table, sep=delim)
        #generate final OTU list for taxonomy
        amptklib.log.info("Finding valid OTUs")
        otu_new = base + '.filtered.otus.fa'
        with open(otu_new, 'w') as otu_update:
            with open(args.fasta, "rU") as myfasta:
                for rec in SeqIO.parse(myfasta, 'fasta'):
                    if ';' in rec.id:
                        rec.id = rec.id.split(';', 1)[0]
                    if args.mock_barcode:
                        #map new names of mock
                        if rec.id in annotate_dict:
                            newname = annotate_dict.get(rec.id)
                            rec.id = newname
                            rec.description = ''
                    if rec.id in final3.index:
                        if rec.id in OTU_tax:
                            otu_update.write('>%s;%s\n%s\n' % (rec.id, OTU_tax.get(rec.id), rec.seq))
                        else:
                            otu_update.write('>%s\n%s\n' % (rec.id, rec.seq))

        #tell user what the output files are
        print("-------------------------------------------------------")
        print("OTU Table filtering finished")
        print("-------------------------------------------------------")
        print("OTU Table Stats:    %s" % stats_table)
        print("Sorted OTU table:   %s" % sorted_table)
        if not args.debug:
            for i in [normal_table_pct, normal_table_nums, subtract_table, mock_out, FastaCounts]:
                amptklib.removefile(i)
        else:
            print("Normalized (pct):   %s" % normal_table_pct)
            print("Normalized (100k):  %s" % normal_table_nums)
            if args.subtract != 0:
                print("Subtracted table:   %s" % subtract_table)
        print("Normalized/filter:  %s" % filtered_table)
        print("Final Binary table: %s" % final_binary_table)
        print("Final OTU table:    %s" % final_table)
        print("Filtered OTUs:      %s" % otu_new)
        print("-------------------------------------------------------")

        if 'darwin' in sys.platform:
            print(colr.WARN + "\nExample of next cmd:" + colr.END + " amptk taxonomy -f %s -i %s -m mapping_file.txt -d ITS2\n" % (otu_new, final_table))
        else:
            print("\nExample of next cmd: amptk taxonomy -f %s -i %s -m mapping_file.txt -d ITS2\n" % (otu_new, final_table))
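
# Example invocation of the filter entry point (hypothetical file names):
#   main(['-i', 'run1.otu_table.txt', '-f', 'run1.cluster.otus.fa',
#         '-b', 'mock', '--mc', 'synmock'])
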
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-assign_taxonomy.py',
        usage="%(prog)s [options] -f <FASTA File>",
        description='''assign taxonomy to OTUs''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--otu_table', dest="otu_table", help='Append taxonomy to OTU table')
    parser.add_argument('-f', '--fasta', required=True, help='FASTA input')
    parser.add_argument('-o', '--out', help='Output file (FASTA)')
    parser.add_argument('-m', '--mapping_file', help='Mapping file: QIIME format, can have extra metadata columns')
    parser.add_argument('--method', default='hybrid', choices=['utax', 'usearch', 'sintax', 'hybrid', 'rdp', 'blast'], help='Taxonomy method')
    parser.add_argument('-d', '--db', help='Pre-installed databases: [ITS,ITS1,ITS2,16S,LSU,COI]')
    parser.add_argument('-t', '--taxonomy', help='Incorporate taxonomy calculated elsewhere, 2-column file')
    parser.add_argument('--fasta_db', help='Alternative database of FASTA sequences')
    parser.add_argument('--add2db', help='Custom FASTA database to add to DB on the fly')
    parser.add_argument('--utax_db', help='UTAX Reference Database')
    parser.add_argument('--utax_cutoff', default=0.8, type=restricted_float, help='UTAX confidence value threshold.')
    parser.add_argument('--usearch_db', help='USEARCH Reference Database')
    parser.add_argument('--usearch_cutoff', default=0.7, type=restricted_float, help='USEARCH percent ID threshold.')
    parser.add_argument('-r', '--rdp', dest='rdp', default='/Users/jon/scripts/rdp_classifier_2.10.1/dist/classifier.jar', help='Path to RDP Classifier')
    parser.add_argument('--rdp_db', dest='rdp_tax', default='fungalits_unite', choices=['16srrna', 'fungallsu', 'fungalits_warcup', 'fungalits_unite'], help='Training set for RDP Classifier')
    parser.add_argument('--rdp_cutoff', default=0.8, type=restricted_float, help='RDP confidence value threshold')
    parser.add_argument('--local_blast', help='Path to local BLAST DB')
    parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9', help='USEARCH9 EXE')
    parser.add_argument('--tax_filter', help='Retain only OTUs with match in OTU table')
    parser.add_argument('--sintax_cutoff', default=0.8, type=restricted_float, help='SINTAX threshold.')
    parser.add_argument('--debug', action='store_true', help='Keep intermediate files')
    parser.add_argument('--cpus', type=int, help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))

    if not args.out:
        #get base name of files
        if 'filtered' in args.fasta:
            base = args.fasta.split(".filtered")[0]
        elif 'otu' in args.fasta:
            base = args.fasta.split('.otu')[0]
        else:
            base = args.fasta.split('.fa')[0]
    else:
        base = args.out

    #remove logfile if exists
    log_name = base + '.amptk-taxonomy.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cpus
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()

    #Setup DB locations and names, etc
    DBdir = os.path.join(parentdir, 'DB')
    DataBase = {
        'ITS1': (os.path.join(DBdir, 'ITS.udb'), os.path.join(DBdir, 'ITS1_UTAX.udb'), os.path.join(DBdir, 'ITS_SINTAX.udb')),
        'ITS2': (os.path.join(DBdir, 'ITS.udb'), os.path.join(DBdir, 'ITS2_UTAX.udb'), os.path.join(DBdir, 'ITS_SINTAX.udb')),
        'ITS': (os.path.join(DBdir, 'ITS.udb'), os.path.join(DBdir, 'ITS_UTAX.udb'), os.path.join(DBdir, 'ITS_SINTAX.udb')),
        '16S': (os.path.join(DBdir, '16S.udb'), os.path.join(DBdir, '16S.udb'), os.path.join(DBdir, '16S_SINTAX.udb')),
        'LSU': (os.path.join(DBdir, 'LSU.udb'), os.path.join(DBdir, 'LSU_UTAX.udb'), os.path.join(DBdir, 'LSU_SINTAX.udb')),
        'COI': (os.path.join(DBdir, 'COI.udb'), os.path.join(DBdir, 'COI_UTAX.udb'), os.path.join(DBdir, 'COI_SINTAX.udb'))
    }

    #get DB names up front
    if args.db in DataBase:
        utax_db = DataBase.get(args.db)[1]
        usearch_db = DataBase.get(args.db)[0]
        sintax_db = DataBase.get(args.db)[2]
        if not utax_db:
            utax_db = args.utax_db
        if not usearch_db:
            usearch_db = args.usearch_db
    else:
        utax_db = args.utax_db
        usearch_db = args.usearch_db
        if args.fasta_db:
            sintax_db = args.fasta_db
        else:
            sintax_db = args.usearch_db

    if args.method in ['hybrid', 'usearch', 'utax']:
        if not utax_db and not usearch_db and not args.fasta_db:
            amptklib.log.error("You have not selected a database, need either --db, --utax_db, --usearch_db, or --fasta_db")
            sys.exit(1)
        else:  #check that the DB exists
            if args.method == 'usearch' and usearch_db:
                if not amptklib.checkfile(usearch_db):
                    amptklib.log.error('USEARCH DB not found: {:}'.format(usearch_db))
                    amptklib.log.error('Use `amptk install` to install pre-formatted databases or `amptk database` to create a custom DB')
                    sys.exit(1)
            if args.method == 'sintax' and sintax_db:
                if not amptklib.checkfile(sintax_db):
                    amptklib.log.error('SINTAX DB not found: {:}'.format(sintax_db))
                    amptklib.log.error('Use `amptk install` to install pre-formatted databases or `amptk database` to create a custom DB')
                    sys.exit(1)
            if args.method == 'utax' and utax_db:
                if not amptklib.checkfile(utax_db):
                    amptklib.log.error('UTAX DB not found: {:}'.format(utax_db))
                    amptklib.log.error('Use `amptk install` to install pre-formatted databases or `amptk database` to create a custom DB')
                    sys.exit(1)
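    # Note: each value in the DataBase dict above is a tuple of paths indexed
    # as (usearch_db, utax_db, sintax_db), which is why the lookups use
    # [0], [1], and [2] respectively.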
    custom_db = None
    if args.add2db:  #user wants to add sequences to the usearch database on the fly, so the database must be rebuilt
        custom_db = base + '.custom_database.fa'
        if amptklib.checkfile(custom_db):
            amptklib.SafeRemove(custom_db)
        if args.db:  #this means the FASTA sequences need to be extracted from the UDB first
            amptklib.log.info("Adding {:} to the {:} database".format(os.path.basename(args.add2db), os.path.basename(usearch_db)))
            cmd = ['vsearch', '--udb2fasta', usearch_db, '--output', custom_db]
            amptklib.runSubprocess(cmd, amptklib.log)
            with open(custom_db, 'a') as outfile:
                with open(args.add2db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)
        elif args.fasta_db:
            amptklib.log.info("Adding {:} to the {:} database".format(os.path.basename(args.add2db), os.path.basename(args.fasta_db)))
            with open(custom_db, 'w') as outfile:
                with open(args.fasta_db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)
                with open(args.add2db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)

    #Count records
    amptklib.log.info("Loading FASTA Records")
    total = amptklib.countfasta(args.fasta)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs')

    #declare output files/variables here
    blast_out = base + '.blast.txt'
    rdp_out = base + '.rdp.txt'
    utax_out = base + '.usearch.txt'
    usearch_out = base + '.usearch.txt'
    sintax_out = base + '.sintax.txt'
    otuDict = {}

    if not args.taxonomy:
        #start with less common uses, i.e. BLAST, RDP
        if args.method == 'blast':
            #check if command line blast installed
            if not amptklib.which('blastn'):
                amptklib.log.error("BLASTN not found in your PATH, exiting.")
                sys.exit(1)
            #now run blast remotely using NCBI nt database
            outformat = "6 qseqid sseqid pident stitle"
            if args.local_blast:
                amptklib.log.info("Running local BLAST using db: %s" % args.local_blast)
                cmd = ['blastn', '-num_threads', str(cpus), '-query', args.fasta, '-db', os.path.abspath(args.local_blast), '-max_target_seqs', '1', '-outfmt', outformat, '-out', blast_out]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:
                amptklib.log.info("Running BLASTN using NCBI remote nt database, this may take a while")
                cmd = ['blastn', '-query', args.fasta, '-db', 'nt', '-remote', '-max_target_seqs', '1', '-outfmt', outformat, '-out', blast_out]
                amptklib.runSubprocess(cmd, amptklib.log)
            #load results and reformat
            new = []
            f = csv.reader(open(blast_out), delimiter=str('\t'))
            for col in f:
                query = col[0]
                gbID = col[1].split("|")[3]
                pident = col[2]
                name = col[3]
                tax = gbID + ";" + name + " (" + pident + ")"
                line = [query, tax]
                new.append(line)
            otuDict = dict(new)
        elif args.method == 'rdp':
            #check that classifier is installed
            try:
                rdp_test = subprocess.Popen(['java', '-Xmx2000m', '-jar', args.rdp, 'classify'], stdout=subprocess.PIPE).communicate()[0].rstrip()
            except OSError:
                amptklib.log.error("%s not found in your PATH, exiting." % args.rdp)
                sys.exit(1)
            #RDP database
            amptklib.log.info("Using RDP classifier %s training set" % args.rdp_tax)
            #run RDP
            cmd = ['java', '-Xmx2000m', '-jar', args.rdp, 'classify', '-g', args.rdp_tax, '-o', rdp_out, '-f', 'fixrank', args.fasta]
            amptklib.runSubprocess(cmd, amptklib.log)
            #load in results and put into dictionary
            new = []
            removal = ["unidentified", "Incertae", "uncultured", "incertae"]
            remove_exp = [re.compile(x) for x in removal]
            f = csv.reader(open(rdp_out), delimiter=str('\t'))
            for col in f:
                if float(col[19]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[8] + ",o:" + col[11] + ",f:" + col[14] + ",g:" + col[17]
                elif float(col[16]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[8] + ",o:" + col[11] + ",f:" + col[14]
                elif float(col[13]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[8] + ",o:" + col[11]
                elif float(col[10]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[8]
                elif float(col[7]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5]
                elif float(col[4]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2]
                else:
                    tax = "RDP;k:unclassified"
                tax_split = tax.split(",")
                #drop uninformative ranks; use `regex` as the loop name so the `re` module is not shadowed
                tax = [s for s in tax_split if not any(regex.search(s) for regex in remove_exp)]
                tax = ",".join(tax)
                line = [col[0], tax]
                new.append(line)
            otuDict = dict(new)
        else:
            #check status of USEARCH DB and run
            if args.method in ['hybrid', 'usearch']:
                if args.fasta_db:
                    #now run through usearch_global
                    amptklib.log.info("Global alignment OTUs with usearch_global (VSEARCH) against {:}".format(os.path.basename(args.fasta_db)))
                    cmd = ['vsearch', '--usearch_global', args.fasta, '--db', os.path.abspath(args.fasta_db), '--userout', usearch_out,
                           '--id', str(args.usearch_cutoff), '--strand', 'both', '--output_no_hits', '--maxaccepts', '0', '--top_hits_only',
                           '--userfields', 'query+target+id', '--notrunclabels', '--threads', str(cpus)]
                    amptklib.runSubprocess(cmd, amptklib.log)
                elif custom_db:
                    #now run through usearch_global
                    amptklib.log.info("Global alignment OTUs with usearch_global (VSEARCH) against custom DB")
                    cmd = ['vsearch', '--usearch_global', args.fasta, '--db', os.path.abspath(custom_db), '--userout', usearch_out,
                           '--id', str(args.usearch_cutoff), '--strand', 'both', '--output_no_hits', '--maxaccepts', '0', '--top_hits_only',
                           '--userfields', 'query+target+id', '--notrunclabels', '--threads', str(cpus)]
                    amptklib.runSubprocess(cmd, amptklib.log)
                else:
                    if usearch_db:
                        amptklib.log.info("Global alignment OTUs with usearch_global (VSEARCH) against {:}".format(os.path.basename(usearch_db)))
                        cmd = ['vsearch', '--usearch_global', args.fasta, '--db', os.path.abspath(usearch_db), '--userout', usearch_out,
                               '--id', str(args.usearch_cutoff), '--strand', 'both', '--output_no_hits', '--maxaccepts', '0', '--top_hits_only',
                               '--userfields', 'query+target+id', '--notrunclabels', '--threads', str(cpus)]
                        amptklib.runSubprocess(cmd, amptklib.log)
            if args.method in ['hybrid', 'utax']:
                if utax_db:
                    #now run through UTAX
                    utax_out = base + '.utax.txt'
                    amptklib.log.info("Classifying OTUs with UTAX (USEARCH)")
                    cutoff = str(args.utax_cutoff)
                    cmd = [usearch, '-utax', args.fasta, '-db', utax_db, '-utaxout', utax_out, '-utax_cutoff', cutoff, '-strand', 'plus', '-notrunclabels', '-threads', str(cpus)]
                    amptklib.runSubprocess(cmd, amptklib.log)
                else:
                    amptklib.log.error("UTAX DB %s not found, skipping" % utax_db)
            if args.method in ['hybrid', 'sintax']:
                if args.fasta_db:  #if a FASTA file is passed here, override any auto-detection
                    sintax_db = args.fasta_db
                #now run sintax
                amptklib.log.info("Classifying OTUs with SINTAX (USEARCH)")
                cmd = [usearch, '-sintax', args.fasta, '-db', os.path.abspath(sintax_db), '-tabbedout', sintax_out,
                       '-sintax_cutoff', str(args.sintax_cutoff), '-strand', 'both', '-threads', str(cpus)]
                amptklib.runSubprocess(cmd, amptklib.log)
            #now process results, load into dictionary -- slightly different depending on which classification was run
            if args.method == 'hybrid':
                #run hybrid method: first load dictionaries with results
                if amptklib.checkfile(utax_out):
                    utaxDict = amptklib.classifier2dict(utax_out, args.utax_cutoff)
                    amptklib.log.debug('UTAX results parsed, resulting in {:,} taxonomy predictions'.format(len(utaxDict)))
                else:
                    amptklib.log.info('UTAX results empty')
                    utaxDict = {}
                if amptklib.checkfile(sintax_out):
                    sintaxDict = amptklib.classifier2dict(sintax_out, args.sintax_cutoff)
                    amptklib.log.debug('SINTAX results parsed, resulting in {:,} taxonomy predictions'.format(len(sintaxDict)))
                else:
                    amptklib.log.info('SINTAX results empty')
                    sintaxDict = {}
                usearchDict = amptklib.usearchglobal2dict(usearch_out)
                amptklib.log.debug('Global alignment results parsed, resulting in {:,} taxonomy predictions'.format(len(usearchDict)))
                otuList = natsorted(list(usearchDict.keys()))
                #first compare classifier results, getting the better of the two
                bestClassify = amptklib.bestclassifier(utaxDict, sintaxDict, otuList)
                #now get best taxonomy by comparing to global alignment results
                otuDict = amptklib.bestTaxonomy(usearchDict, bestClassify)
                amptklib.log.debug('Combined OTU taxonomy dictionary contains {:,} taxonomy predictions'.format(len(otuDict)))
                if len(otuDict) < 1:
                    amptklib.log.info('Parsing taxonomy failed -- see logfile')
                    sys.exit(1)
            elif args.method == 'utax' and amptklib.checkfile(utax_out):
                #load results into dictionary for appending to OTU table
                amptklib.log.debug("Loading UTAX results into dictionary")
                with open(utax_out, 'r') as infile:
                    reader = csv.reader(infile, delimiter=str("\t"))
                    otuDict = {rows[0]: 'UTAX;' + rows[2] for rows in reader}
            elif args.method == 'usearch' and amptklib.checkfile(usearch_out):
                #load results into dictionary for appending to OTU table
                amptklib.log.debug("Loading Global Alignment results into dictionary")
                otuDict = {}
                usearchDict = amptklib.usearchglobal2dict(usearch_out)
                for k, v in natsorted(list(usearchDict.items())):
                    pident = float(v[0]) * 100
                    pident = "{0:.1f}".format(pident)
                    ID = v[1]
                    tax = ','.join(v[-1])
                    LCA = v[2]
                    if LCA == '':
                        fulltax = 'GS|' + pident + '|' + ID + ';' + tax
                    else:
                        fulltax = 'GSL|' + pident + '|' + ID + ';' + tax
                    otuDict[k] = fulltax
            elif args.method == 'sintax' and amptklib.checkfile(sintax_out):
                #load results into dictionary for appending to OTU table
                amptklib.log.debug("Loading SINTAX results into dictionary")
                with open(sintax_out, 'r') as infile:
                    reader = csv.reader(infile, delimiter=(str("\t")))
                    otuDict = {rows[0]: 'SINTAX;' + rows[3] for rows in reader}
    else:
        #you have supplied a two-column taxonomy file, parse and build otuDict
        amptklib.log.debug("Loading custom taxonomy into dictionary")
        with open(args.taxonomy, 'r') as infile:
            reader = csv.reader(infile, delimiter=str("\t"))
            otuDict = {rows[0]: rows[1] for rows in reader}
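    # At this point otuDict maps each OTU ID to a taxonomy string prefixed by
    # its source (e.g. 'UTAX;...', 'SINTAX;...', or the 'GS|'/'GSL|' records
    # built from global alignment above); that string is appended verbatim to
    # the OTU table and FASTA headers below.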
    #now format results
    if args.otu_table:
        #if the otu_table variable is not empty, load in the OTU table
        amptklib.log.info("Appending taxonomy to OTU table and OTUs")
        taxTable = base + '.otu_table.taxonomy.txt'
        tmpTable = base + '.otu_table.tmp'
        #append to OTU table
        counts = 0
        with open(taxTable, 'w') as outTable:
            with open(args.otu_table, 'r') as inTable:
                #guess the delimiter format
                firstline = inTable.readline()
                dialect = amptklib.guess_csv_dialect(firstline)
                inTable.seek(0)
                #parse OTU table
                reader = csv.reader(inTable, dialect)
                for line in reader:
                    if line[0].startswith(("#OTU", "OTUId")):
                        line.append('Taxonomy')
                    else:
                        tax = otuDict.get(line[0]) or "No Hit"
                        line.append(tax)
                    if args.tax_filter and not args.method == 'blast':
                        if line[0].startswith(("#OTU", "OTUId")):
                            join_line = ('\t'.join(str(x) for x in line))
                        else:
                            if args.tax_filter in line[-1]:
                                join_line = ('\t'.join(str(x) for x in line))
                                counts += 1
                            else:
                                continue
                    else:
                        join_line = ('\t'.join(str(x) for x in line))
                        counts += 1
                    outTable.write("%s\n" % join_line)
        if args.tax_filter:
            if args.method == 'blast':
                amptklib.log.info("BLAST is incompatible with --tax_filter, use a different method")
                tmpTable = args.otu_table
            else:
                nonfungal = total - counts
                amptklib.log.info("Found %i OTUs not matching %s, writing %i %s hits to taxonomy OTU table" % (nonfungal, args.tax_filter, counts, args.tax_filter))
                #need to create a filtered table without taxonomy for BIOM output
                with open(tmpTable, 'w') as output:
                    with open(taxTable, 'r') as input:
                        firstline = input.readline()
                        dialect = amptklib.guess_csv_dialect(firstline)
                        input.seek(0)
                        #parse OTU table
                        reader = csv.reader(input, dialect)
                        for line in reader:
                            del line[-1]
                            join_line = '\t'.join(str(x) for x in line)
                            output.write("%s\n" % join_line)
        else:
            tmpTable = args.otu_table

    #append to OTUs
    otuTax = base + '.otus.taxonomy.fa'
    with open(otuTax, 'w') as output:
        with open(args.fasta, 'r') as input:
            SeqRecords = SeqIO.parse(input, 'fasta')
            for rec in SeqRecords:
                tax = otuDict.get(rec.id) or "No hit"
                rec.description = tax
                SeqIO.write(rec, output, 'fasta')

    if not args.taxonomy:
        #output final taxonomy in two-column format, followed by the hits for usearch/sintax/utax if hybrid is used
        taxFinal = base + '.taxonomy.txt'
        with open(taxFinal, 'w') as finaltax:
            if args.method == 'hybrid':
                finaltax.write('#OTUID\ttaxonomy\tUSEARCH\tSINTAX\tUTAX\n')
                for k, v in natsorted(list(otuDict.items())):
                    if k in usearchDict:
                        usearchResult = usearchDict.get(k)
                        usearchResult = ','.join(usearchResult[-1])
                    else:
                        usearchResult = 'No hit'
                    if k in sintaxDict:
                        sintaxResult = sintaxDict.get(k)
                        sintaxResult = ','.join(sintaxResult[-1])
                    else:
                        sintaxResult = 'No hit'
                    if k in utaxDict:
                        utaxResult = utaxDict.get(k)
                        utaxResult = ','.join(utaxResult[-1])
                    else:
                        utaxResult = 'No hit'
                    finaltax.write('{:}\t{:}\t{:}\t{:}\t{:}\n'.format(k, v, usearchResult, sintaxResult, utaxResult))
            else:
                finaltax.write('#OTUID\ttaxonomy\n')
                for k, v in natsorted(list(otuDict.items())):
                    finaltax.write('%s\t%s\n' % (k, v))
    else:
        taxFinal = args.taxonomy

    #convert taxonomy to QIIME format for BIOM
    qiimeTax = None
    if not args.method == 'blast':
        qiimeTax = base + '.qiime.taxonomy.txt'
        amptklib.utax2qiime(taxFinal, qiimeTax)
    else:
        amptklib.log.error("BLAST taxonomy is not compatible with BIOM output, use a different method")

    #create OTU phylogeny for downstream processes
    amptklib.log.info("Generating phylogenetic tree")
    tree_out = base + '.tree.phy'
    cmd = [usearch, '-cluster_agg', args.fasta, '-treeout', tree_out]
    amptklib.runSubprocess(cmd, amptklib.log)

    #print some summary file locations
    amptklib.log.info("Taxonomy finished: %s" % taxFinal)
    if args.otu_table and not args.method == 'blast':
        amptklib.log.info("Classic OTU table with taxonomy: %s" % taxTable)
        #output final OTU table in BIOM v1.0 (i.e. JSON format, if biom is installed)
        outBiom = base + '.biom'
        if amptklib.which('biom'):
            amptklib.removefile(outBiom)
            cmd = ['biom', 'convert', '-i', tmpTable, '-o', outBiom + '.tmp', '--table-type', "OTU table", '--to-json']
            amptklib.runSubprocess(cmd, amptklib.log)
            if args.mapping_file:
                mapSamples = []
                repeatSamples = []
                with open(args.mapping_file, 'r') as mapin:
                    for line in mapin:
                        line = line.rstrip()
                        if line.startswith('#'):
                            continue
                        sampleID = line.split('\t')[0]
                        if not sampleID in mapSamples:
                            mapSamples.append(sampleID)
                        else:
                            repeatSamples.append(sampleID)
                otuSamples = []
                with open(tmpTable, 'r') as otuin:
                    for line in otuin:
                        line = line.rstrip()
                        if line.startswith('#'):
                            otuSamples = line.split('\t')[1:]
                missingMap = []
                for otu in otuSamples:
                    if not otu in mapSamples:
                        missingMap.append(otu)
                if len(missingMap) > 0:
                    amptklib.log.error("%s are missing from mapping file (metadata), skipping biom file creation" % ', '.join(missingMap))
                elif len(repeatSamples) > 0:
                    amptklib.log.error('%s duplicate sample IDs in mapping file, skipping biom file creation' % ', '.join(repeatSamples))
                else:
                    if qiimeTax:
                        cmd = ['biom', 'add-metadata', '-i', outBiom + '.tmp', '-o', outBiom, '--observation-metadata-fp', qiimeTax, '-m', args.mapping_file, '--sc-separated', 'taxonomy', '--output-as-json']
                    else:
                        cmd = ['biom', 'add-metadata', '-i', outBiom + '.tmp', '-o', outBiom, '-m', args.mapping_file, '--output-as-json']
                    amptklib.runSubprocess(cmd, amptklib.log)
            else:
                cmd = ['biom', 'add-metadata', '-i', outBiom + '.tmp', '-o', outBiom, '--observation-metadata-fp', qiimeTax, '--sc-separated', 'taxonomy', '--output-as-json']
                amptklib.runSubprocess(cmd, amptklib.log)
            amptklib.removefile(outBiom + '.tmp')
            amptklib.log.info("BIOM OTU table created: %s" % outBiom)
        else:
            amptklib.log.info("biom program not installed, install via `pip install biom-format` or `conda install biom-format`")

    amptklib.log.info("OTUs with taxonomy: %s" % otuTax)
    amptklib.log.info("OTU phylogeny: %s" % tree_out)

    #clean up intermediate files
    if not args.debug:
        for i in [utax_out, usearch_out, sintax_out, qiimeTax, base + '.otu_table.tmp']:
            if i:
                amptklib.removefile(i)
    print("-------------------------------------------------------")
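
# Example invocation of the taxonomy entry point (hypothetical file names):
#   main(['-f', 'run1.filtered.otus.fa', '-i', 'run1.final.txt', '-d', 'ITS2'])
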
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-OTU_cluster_ref.py',
        usage="%(prog)s [options] -i file.demux.fq\n%(prog)s -h for help menu",
        description='''Script runs UPARSE OTU clustering. Requires USEARCH by Robert C. Edgar: http://drive5.com/usearch''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--fastq', dest="FASTQ", required=True, help='FASTQ file (Required)')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('-e', '--maxee', default='1.0', help='Quality trim EE value')
    parser.add_argument('-p', '--pct_otu', default='97', help="OTU clustering percent")
    parser.add_argument('--id', default='97', help="Threshold for alignment")
    parser.add_argument('-m', '--minsize', default='2', help='Min identical seqs to process')
    parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9', help='USEARCH9 EXE')
    parser.add_argument('--map_filtered', action='store_true', help='Map quality filtered reads back to OTUs')
    parser.add_argument('-d', '--db', required=True, help='Reference Database [ITS,ITS1,ITS2,16S,LSU,COI,custom]')
    parser.add_argument('--utax_db', help='UTAX Reference Database')
    parser.add_argument('--utax_cutoff', default=0.8, type=restricted_float, help='UTAX confidence value threshold.')
    parser.add_argument('--utax_level', default='k', choices=['k', 'p', 'c', 'o', 'f', 'g', 's'], help='UTAX classification level to retain')
    parser.add_argument('--mock', default='synmock', help='Spike-in mock community (fasta)')
    parser.add_argument('--debug', action='store_true', help='Keep intermediate files')
    parser.add_argument('--closed_ref_only', action='store_true', help='Only run closed reference clustering')
    parser.add_argument('--cpus', type=int, help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))

    #get basename if not args.out passed
    if args.out:
        base = args.out
    else:
        if 'demux' in args.FASTQ:
            base = os.path.basename(args.FASTQ).split('.demux')[0]
        else:
            base = os.path.basename(args.FASTQ).split('.f')[0]

    taxonomyLookup = {
        'k': 'Kingdom',
        'p': 'Phylum',
        'c': 'Class',
        'o': 'Order',
        'f': 'Family',
        'g': 'Genus',
        's': 'Species'
    }

    #remove logfile if exists
    log_name = base + '.amptk-cluster_ref.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cpus
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()

    #make tmp folder
    tmp = base + '_tmp'
    if not os.path.exists(tmp):
        os.makedirs(tmp)

    #Setup DB locations and names, etc
    DBdir = os.path.join(parentdir, 'DB')
    DataBase = {
        'ITS1': (os.path.join(DBdir, 'ITS.udb'), os.path.join(DBdir, 'ITS1_UTAX.udb')),
        'ITS2': (os.path.join(DBdir, 'ITS.udb'), os.path.join(DBdir, 'ITS2_UTAX.udb')),
        'ITS': (os.path.join(DBdir, 'ITS.udb'), os.path.join(DBdir, 'ITS_UTAX.udb')),
        '16S': (os.path.join(DBdir, '16S.udb'), os.path.join(DBdir, '16S.udb')),
        'LSU': (os.path.join(DBdir, 'LSU.udb'), os.path.join(DBdir, 'LSU_UTAX.udb')),
        'COI': (os.path.join(DBdir, 'COI.udb'), os.path.join(DBdir, 'COI_UTAX.udb'))
    }

    #setup refDB
    amptklib.log.info("Checking Reference Database")
    if args.db in DataBase:
        #need to write to fasta from vsearch UDB
        DB = os.path.join(tmp, args.db + '.extracted.fa')
        cmd = ['vsearch', '--udb2fasta', DataBase.get(args.db)[0], '--output', DB]
        amptklib.runSubprocess(cmd, amptklib.log)
    else:
        DB = os.path.abspath(args.db)

    refDB = os.path.join(tmp, 'reference_DB.fa')
    if args.mock:
        if args.mock == 'synmock':
            mock = os.path.join(parentdir, 'DB', 'amptk_synmock.fa')
        else:
            mock = os.path.abspath(args.mock)
    seen = []
    with open(refDB, 'w') as output:
        if args.mock:
            with open(mock) as input1:
                for rec in SeqIO.parse(input1, 'fasta'):
                    if not rec.id in seen:
                        SeqIO.write(rec, output, 'fasta')
                        seen.append(rec.id)  #record the ID so duplicates are actually caught
                    else:
                        amptklib.log.error("Duplicate IDs in Ref DB: %s, exiting" % rec.id)
                        sys.exit(1)
        with open(DB) as input2:
            for rec in SeqIO.parse(input2, 'fasta'):
                if not rec.id in seen:
                    SeqIO.write(rec, output, 'fasta')
                    seen.append(rec.id)  #record the ID so duplicates are actually caught
                else:
                    amptklib.log.error("Duplicate IDs in Ref DB: %s, exiting" % rec.id)
                    sys.exit(1)

    #get utax_database
    if args.db in DataBase:
        utaxDB = DataBase.get(args.db)[1]
    else:
        if not args.closed_ref_only:
            if args.utax_db:
                utaxDB = os.path.abspath(args.utax_db)
            else:
                amptklib.log.error("%s is not a pre-installed DB, so you must also specify a valid UTAX database via --utax_db" % args.db)
                sys.exit(1)

    #Count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    #convert to FASTA for mapping
    orig_fasta = os.path.join(tmp, base + '.orig.fa')
    cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta, '--fastq_qmax', '55', '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(orig_fasta)
    size = amptklib.checkfastqsize(args.FASTQ)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')
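    # Background for the next step: --fastq_maxee keeps a read only if its
    # expected errors (EE), the sum of per-base error probabilities, stays
    # under the cutoff; a Q20 base, for example, contributes 10**(-20/10) = 0.01.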
    #Expected Errors filtering step
    filter_out = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fq')
    filter_fasta = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fa')
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee', str(args.maxee), '--fastqout', filter_out, '--fastaout', filter_fasta, '--fastq_qmax', '55', '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    qtrimtotal = amptklib.countfastq(filter_out)
    amptklib.log.info('{0:,}'.format(qtrimtotal) + ' reads passed')

    #now run full length dereplication
    derep_out = os.path.join(tmp, base + '.EE' + args.maxee + '.derep.fa')
    amptklib.log.info("De-replication (remove duplicate reads)")
    cmd = ['vsearch', '--derep_fulllength', filter_fasta, '--sizeout', '--output', derep_out, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(derep_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run sort by size
    sort_out = os.path.join(tmp, base + '.EE' + args.maxee + '.sort.fa')
    amptklib.log.info("Sorting reads by size: removing reads seen less than %s times" % args.minsize)
    cmd = ['vsearch', '--sortbysize', derep_out, '--minsize', args.minsize, '--output', sort_out, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(sort_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #chimera detection
    #first run through de novo chimera detection
    amptklib.log.info("De novo chimera detection (VSEARCH)")
    chimera_out = os.path.join(tmp, base + '.EE' + args.maxee + '.chimera_check.fa')
    cmd = ['vsearch', '--uchime_denovo', sort_out, '--relabel', 'Seq', '--sizeout', '--nonchimeras', chimera_out, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(chimera_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run uchime_ref
    uchime_out = os.path.join(tmp, base + '.EE' + args.maxee + '.uchime.otus.fa')
    #now run chimera filtering if all checks out
    amptklib.log.info("Chimera Filtering (VSEARCH)")
    cmd = ['vsearch', '--mindiv', '1.0', '--uchime_ref', chimera_out, '--db', refDB, '--sizeout', '--nonchimeras', uchime_out, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(uchime_out)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs passed')

    #now run usearch_global versus reference database
    align_out = os.path.join(tmp, base + '.align.uc')
    pident = int(args.id) * 0.01
    amptklib.log.info("Reference Clustering using Global Alignment, %s%% identity" % args.id)
    cmd = ['vsearch', '--usearch_global', uchime_out, '--db', refDB, '--id', str(pident), '--output_no_hits', '--top_hits_only', '--notrunclabels', '--uc', align_out, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)

    #parse results
    ref_results = {}
    nohits = []
    with open(align_out, 'r') as alignment:
        for line in alignment:
            line = line.replace('\n', '')
            col = line.split('\t')
            counts = col[8].split(';')
            counts = int(counts[1].replace('size=', ''))
            if col[3] == '*':
                nohits.append(col[8])
                continue
            if float(col[3]) >= float(args.id):
                if not col[8] in ref_results:
                    ref_results[col[8]] = (col[9], col[3], counts)
                else:
                    print("Error: %s duplicated ID" % col[8])
            else:
                nohits.append(col[8])

    #summarize results from first ref clustering
    num_refcluster = len(ref_results)
    seqs_refcluster = 0
    for k, v in list(ref_results.items()):
        seqs_refcluster += v[2]
    amptklib.log.info("%i OTUs classified " % num_refcluster + "({0:.0f}%".format(seqs_refcluster / float(qtrimtotal) * 100) + " of reads)")
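    # The .uc columns parsed above are positional: col[3] is percent identity
    # ('*' when there is no hit), col[8] the query label (carrying ';size=N'),
    # and col[9] the matched reference label.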
reads)") #get ref clustered hits to file with taxonomy ref_clustered = os.path.join(tmp, base + '.ref_clustered.fa') with open(ref_clustered, 'w') as refoutput: with open(uchime_out, 'r') as input: otu_counter = 1 for rec in SeqIO.parse(input, 'fasta'): if rec.id in ref_results: res = ref_results.get(rec.id) pident = res[1] tax = res[0] newID = 'OTU' + str( otu_counter) + ';pident=' + pident + ';' + tax rec.id = newID rec.name = '' rec.description = '' SeqIO.write(rec, refoutput, 'fasta') otu_counter += 1 if not args.closed_ref_only: #get nohits file to run clustering utax_ref = os.path.join(tmp, base + '.EE' + args.maxee + '.utax_ref.fa') with open(utax_ref, 'w') as output: with open(uchime_out, 'r') as input: for rec in SeqIO.parse(input, 'fasta'): if rec.id in nohits: SeqIO.write(rec, output, 'fasta') #input needs to be sorted, so ref_sort = os.path.join(tmp, base + '.utax_ref.sorted.fa') cmd = [ 'vsearch', '--sortbysize', utax_ref, '--minsize', args.minsize, '--output', ref_sort, '--threads', str(cpus) ] amptklib.runSubprocess(cmd, amptklib.log) #now run clustering algorithm on those not found in reference database radius = str(100 - int(args.pct_otu)) otu_out = os.path.join(tmp, base + '.EE' + args.maxee + '.otus.fa') amptklib.log.info("De novo Clustering remaining sequences (UPARSE)") cmd = [ usearch, '-cluster_otus', ref_sort, '-relabel', 'OTU', '-otu_radius_pct', radius, '-otus', otu_out ] amptklib.runSubprocess(cmd, amptklib.log) total = amptklib.countfasta(otu_out) amptklib.log.info('{0:,}'.format(total) + ' de novo OTUs') #try utax reference clustering amptklib.log.info("Reference Clustering de novo OTUs using UTAX") cmd = [ usearch, '-cluster_otus_utax', otu_out, '-db', utaxDB, '-utax_cutoff', str(args.utax_cutoff), '-utax_level', 's', '-strand', 'plus', '-utaxout', os.path.join(tmp, base + '.utax.out') ] amptklib.runSubprocess(cmd, amptklib.log) #setup tax filtering tax_values = ['k', 'p', 'c', 'o', 'f', 'g', 's'] filter_index = tax_values.index(args.utax_level) filt_tax_values = [s + ':' for s in tax_values[filter_index:]] #get results from utax with open(ref_clustered, 'a') as output: seqDict = SeqIO.index(otu_out, 'fasta') utaxresults = [] with open(os.path.join(tmp, base + '.utax.out'), 'r') as utax: for line in utax: line = line.replace('\n', '') col = line.split('\t') ID = col[0] tax = col[2] if any(x in tax for x in filt_tax_values): record = seqDict[ID] record.id = 'OTU' + str( otu_counter) + ';UTAX;tax=' + tax record.name = '' record.description = '' SeqIO.write(record, output, 'fasta') otu_counter += 1 total = amptklib.countfasta(ref_clustered) - num_refcluster amptklib.log.info('{0:,}'.format(total) + ' classified to %s' % taxonomyLookup.get(args.utax_level)) #clean up padded N's amptklib.log.info("Cleaning up padding from OTUs") otu_clean = os.path.join(tmp, base + '.clean.otus.fa') amptklib.fasta_strip_padding(ref_clustered, otu_clean) total = amptklib.countfasta(otu_clean) amptklib.log.info('{0:,}'.format(total) + ' total OTUs') #now map reads back to OTUs uc_out = os.path.join(tmp, base + '.EE' + args.maxee + '.mapping.uc') otu_table = os.path.join(tmp, base + '.EE' + args.maxee + '.otu_table.txt') #setup reads to map if args.map_filtered: reads = filter_fasta else: reads = orig_fasta amptklib.log.info("Mapping Reads to OTUs and Building OTU table") cmd = [ 'vsearch', '--usearch_global', reads, '--strand', 'plus', '--id', '0.97', '--db', otu_clean, '--uc', uc_out, '--otutabout', otu_table, '--threads', str(cpus) ] amptklib.runSubprocess(cmd, amptklib.log) #count 
reads mapped total = amptklib.line_count2(uc_out) amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' + '({0:.0f}%)'.format(total / float(orig_total) * 100)) #Move files around, delete tmp if argument passed. currentdir = os.getcwd() final_otu = os.path.join(currentdir, base + '.cluster.otus.fa') shutil.copyfile(otu_clean, final_otu) final_otu_table = os.path.join(currentdir, base + '.otu_table.txt') shutil.copyfile(otu_table, final_otu_table) if not args.debug: shutil.rmtree(tmp) #Print location of files to STDOUT print("-------------------------------------------------------") print("OTU Clustering Script has Finished Successfully") print("-------------------------------------------------------") if args.debug: print("Tmp Folder of files: %s" % tmp) print("Clustered OTUs: %s" % os.path.basename(final_otu)) print("OTU Table: %s" % os.path.basename(final_otu_table)) print("-------------------------------------------------------") otu_print = final_otu.split('/')[-1] tab_print = final_otu_table.split('/')[-1] if 'darwin' in sys.platform: print(colr.WARN + "\nExample of next cmd:" + colr.END + " amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print)) else: print( "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print))
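#A hedged illustration (not part of amptk): the mapping percentage logged above is derived from the vsearch .uc file, a tab-separated format with the record type in column 1 ('H' = hit, 'N' = no hit), the query label in column 9, and the target label in column 10. Whether amptklib.line_count2 filters on record type is an assumption here; counting mapped reads by hand would look roughly like:
def _count_uc_hits_sketch(uc_file):
    #count 'H' (hit) records in a usearch/vsearch .uc mapping file
    hits = 0
    with open(uc_file) as uc:
        for line in uc:
            if line.startswith('H'):
                hits += 1
    return hits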
def main(args): parser = argparse.ArgumentParser( prog='amptk-OTU_cluster.py', usage="%(prog)s [options] -i file.demux.fq\n%(prog)s -h for help menu", description='''Script runs UPARSE OTU clustering. Requires USEARCH by Robert C. Edgar: http://drive5.com/usearch''', epilog="""Written by Jon Palmer (2015) [email protected]""", formatter_class=MyFormatter) parser.add_argument('-i', '--fastq', dest="FASTQ", required=True, help='FASTQ file (Required)') parser.add_argument('-o', '--out', help='Base output name') parser.add_argument('-e', '--maxee', default='1.0', help='Quality trim EE value') parser.add_argument('-p', '--pct_otu', default='97', help="OTU Clustering Percent") parser.add_argument('-m', '--minsize', default='2', help='Min size to keep for clustering') parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9', help='USEARCH9 EXE') parser.add_argument('--uchime_ref', help='Run UCHIME REF [ITS,16S,LSU,COI,custom]') parser.add_argument('--map_filtered', action='store_true', help='map quality filtered reads back to OTUs') parser.add_argument('--unoise', action='store_true', help='Run De-noising (UNOISE)') parser.add_argument('--debug', action='store_true', help='Keep Intermediate Files') parser.add_argument('--cpus', type=int, help="Number of CPUs. Default: auto") args = parser.parse_args(args) parentdir = os.path.join(os.path.dirname(amptklib.__file__)) #get basename if not args.out passed if args.out: base = args.out else: if 'demux' in args.FASTQ: base = os.path.basename(args.FASTQ).split('.demux')[0] else: base = os.path.basename(args.FASTQ).split('.f')[0] #remove logfile if exists log_name = base + '.amptk-cluster.log' if os.path.isfile(log_name): os.remove(log_name) amptklib.setupLogging(log_name) FNULL = open(os.devnull, 'w') cmd_args = " ".join(sys.argv) + '\n' amptklib.log.debug(cmd_args) print("-------------------------------------------------------") #initialize script, log system info and usearch version amptklib.SystemInfo() #Do a version check usearch = args.usearch amptklib.versionDependencyChecks(usearch) #get number of cpus if args.cpus: cpus = args.cpus else: cpus = amptklib.getCPUS() #make tmp folder tmp = base + '_tmp' if not os.path.exists(tmp): os.makedirs(tmp) #Count FASTQ records amptklib.log.info("Loading FASTQ Records") #convert to FASTA for mapping orig_fasta = os.path.join(tmp, base + '.orig.fa') cmd = [ 'vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta, '--fastq_qmax', '55', '--threads', str(cpus) ] amptklib.runSubprocess(cmd, amptklib.log) orig_total = amptklib.countfasta(orig_fasta) size = amptklib.checkfastqsize(args.FASTQ) readablesize = amptklib.convertSize(size) amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')') #Expected Errors filtering step filter_out = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fq') filter_fasta = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fa') amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee) cmd = [ 'vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee', str(args.maxee), '--fastqout', filter_out, '--fastaout', filter_fasta, '--fastq_qmax', '55', '--threads', str(cpus) ] amptklib.runSubprocess(cmd, amptklib.log) total = amptklib.countfastq(filter_out) amptklib.log.info('{0:,}'.format(total) + ' reads passed') #now run full length dereplication derep_out = os.path.join(tmp, base + '.EE' + args.maxee + '.derep.fa') amptklib.log.info("De-replication (remove duplicate reads)") cmd = [ 'vsearch', '--derep_fulllength',
filter_fasta, '--sizeout', '--output', derep_out, '--threads', str(cpus) ] amptklib.runSubprocess(cmd, amptklib.log) total = amptklib.countfasta(derep_out) amptklib.log.info('{0:,}'.format(total) + ' reads passed') #optional run UNOISE if args.unoise: unoise_out = os.path.join( tmp, base + '.EE' + args.maxee + '.denoised.fa') amptklib.log.info("Denoising Data with UNOISE") cmd = [ usearch, '-cluster_fast', derep_out, '-centroids', unoise_out, '-id', '0.9', '--maxdiffs', '5', '-abskew', '10', '-sizein', '-sizeout', '-sort', 'size', '-threads', str(cpus) ] amptklib.runSubprocess(cmd, amptklib.log) total = amptklib.countfasta(unoise_out) amptklib.log.info('{0:,}'.format(total) + ' reads passed') else: unoise_out = derep_out #now sort by size and remove singletons sort_out = os.path.join(tmp, base + '.EE' + args.maxee + '.sort.fa') cmd = [ 'vsearch', '--sortbysize', unoise_out, '--minsize', args.minsize, '--output', sort_out, '--threads', str(cpus) ] amptklib.runSubprocess(cmd, amptklib.log) #now run clustering algorithm radius = str(100 - int(args.pct_otu)) otu_out = os.path.join(tmp, base + '.EE' + args.maxee + '.otus.fa') amptklib.log.info("Clustering OTUs (UPARSE)") cmd = [ usearch, '-cluster_otus', sort_out, '-relabel', 'OTU', '-otu_radius_pct', radius, '-otus', otu_out, '-threads', str(cpus) ] amptklib.runSubprocess(cmd, amptklib.log) numOTUs = amptklib.countfasta(otu_out) amptklib.log.info('{0:,}'.format(numOTUs) + ' OTUs') #clean up padded N's amptklib.log.info("Cleaning up padding from OTUs") otu_clean = os.path.join(tmp, base + '.EE' + args.maxee + '.clean.otus.fa') amptklib.fasta_strip_padding(otu_out, otu_clean) #optional UCHIME Ref if not args.uchime_ref: uchime_out = otu_clean else: uchime_out = os.path.join( tmp, base + '.EE' + args.maxee + '.uchime.otus.fa') #check if file is present, remove from previous run if it is. if os.path.isfile(uchime_out): os.remove(uchime_out) #R.
Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy if args.uchime_ref in [ 'ITS', '16S', 'LSU', 'COI' ]: #test if it is one that is setup, otherwise default to full path uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb') if not os.path.isfile(uchime_db): amptklib.log.error( "Database not properly configured, run `amptk install` to setup DB, skipping chimera filtering" ) uchime_out = otu_clean #since uchime cannot work with udb database, need to extract fasta sequences, do this if if not amptklib.checkfile( os.path.join(parentdir, 'DB', args.uchime_ref + '.extracted.fa')): uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.extracted.fa') cmd = [ 'vsearch', '--udb2fasta', os.path.join(parentdir, 'DB', args.uchime_ref + '.udb'), '--output', uchime_db ] amptklib.runSubprocess(cmd, amptklib.log) else: uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.extracted.fa') else: if os.path.isfile(args.uchime_ref): uchime_db = os.path.abspath(args.uchime_ref) else: amptklib.log.error( "%s is not a valid file, skipping reference chimera filtering" % args.uchime_ref) uchime_out = otu_clean #now run chimera filtering if all checks out if not os.path.isfile(uchime_out): amptklib.log.info("Chimera Filtering (VSEARCH) using %s DB" % args.uchime_ref) cmd = [ 'vsearch', '--mindiv', '1.0', '--uchime_ref', otu_clean, '--db', uchime_db, '--nonchimeras', uchime_out, '--threads', str(cpus) ] amptklib.runSubprocess(cmd, amptklib.log) total = amptklib.countfasta(uchime_out) uchime_chimeras = numOTUs - total amptklib.log.info('{0:,}'.format(total) + ' OTUs passed, ' + '{0:,}'.format(uchime_chimeras) + ' ref chimeras') #Filter out OTUs in wrong orientation amptklib.log.info('Validating OTU orientation') passingOTUs = os.path.join(tmp, base + '.passed.otus.fa') numKept, numDropped = amptklib.validateorientation(tmp, sort_out, uchime_out, passingOTUs) amptklib.log.info('{:,} OTUs validated ({:,} dropped)'.format( numKept, numDropped)) #now map reads back to OTUs and build OTU table uc_out = os.path.join(tmp, base + '.EE' + args.maxee + '.mapping.uc') otu_table = os.path.join(tmp, base + '.EE' + args.maxee + '.otu_table.txt') #setup reads to map if args.map_filtered: reads = filter_fasta else: reads = orig_fasta amptklib.log.info("Mapping Reads to OTUs and Building OTU table") cmd = [ 'vsearch', '--usearch_global', reads, '--strand', 'plus', '--id', '0.97', '--db', passingOTUs, '--uc', uc_out, '--otutabout', otu_table, '--threads', str(cpus) ] amptklib.runSubprocess(cmd, amptklib.log) #count reads mapped total = amptklib.line_count2(uc_out) amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' + '({0:.0f}%)'.format(total / float(orig_total) * 100)) #Move files around, delete tmp if argument passed. 
currentdir = os.getcwd() final_otu = os.path.join(currentdir, base + '.cluster.otus.fa') shutil.copyfile(passingOTUs, final_otu) final_otu_table = os.path.join(currentdir, base + '.otu_table.txt') shutil.copyfile(otu_table, final_otu_table) if not args.debug: shutil.rmtree(tmp) #Print location of files to STDOUT print("-------------------------------------------------------") print("OTU Clustering Script has Finished Successfully") print("-------------------------------------------------------") if args.debug: print("Tmp Folder of files: %s" % tmp) print("Clustered OTUs: %s" % os.path.basename(final_otu)) print("OTU Table: %s" % os.path.basename(final_otu_table)) print("-------------------------------------------------------") otu_print = final_otu.split('/')[-1] tab_print = final_otu_table.split('/')[-1] if 'darwin' in sys.platform: print(colr.WARN + "\nExample of next cmd:" + colr.END + " amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print)) else: print( "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print))
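#Example of driving this UPARSE entry point directly (file names are hypothetical; normally the amptk CLI parses the command line and dispatches to main()):
#  main(['-i', 'run1.demux.fq', '-o', 'run1', '-e', '1.0', '-p', '97', '--uchime_ref', 'ITS', '--cpus', '4'])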
def main(args): parser = argparse.ArgumentParser( prog='amptk-dada2.py', description= '''Script takes output from amptk pre-processing and runs DADA2''', epilog="""Written by Jon Palmer (2016) [email protected]""", formatter_class=MyFormatter) parser.add_argument('-i', '--fastq', required=True, help='Input demultiplexed FASTQ') parser.add_argument('-o', '--out', help='Output Basename') parser.add_argument( '-m', '--min_reads', default=10, type=int, help="Minimum number of reads after Q filtering to run DADA2 on") parser.add_argument('-l', '--length', type=int, help='Length to truncate reads') parser.add_argument('-e', '--maxee', default='1.0', help='MaxEE quality filtering') parser.add_argument('-p', '--pct_otu', default='97', help="Biological OTU Clustering Percent") parser.add_argument('--platform', default='ion', choices=['ion', 'illumina', '454'], help='Sequencing platform') parser.add_argument('--chimera_method', default='consensus', choices=['consensus', 'pooled', 'per-sample'], help='bimera removal method') parser.add_argument('--uchime_ref', help='Run UCHIME REF [ITS,16S,LSU,COI,custom]') parser.add_argument('--pool', action='store_true', help='Pool all sequences together for DADA2') parser.add_argument('--debug', action='store_true', help='Keep all intermediate files') parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9', help='USEARCH9 EXE') parser.add_argument('--cpus', type=int, help="Number of CPUs. Default: auto") args = parser.parse_args(args) parentdir = os.path.join(os.path.dirname(amptklib.__file__)) dada2script = os.path.join(parentdir, 'dada2_pipeline_nofilt.R') #get basename if not args.out passed if args.out: base = args.out else: if 'demux' in args.fastq: base = os.path.basename(args.fastq).split('.demux')[0] else: base = os.path.basename(args.fastq).split('.f')[0] #remove logfile if exists log_name = base + '.amptk-dada2.log' if os.path.isfile(log_name): amptklib.removefile(log_name) amptklib.setupLogging(log_name) FNULL = open(os.devnull, 'w') cmd_args = " ".join(sys.argv) + '\n' amptklib.log.debug(cmd_args) print("-------------------------------------------------------") #initialize script, log system info and usearch version amptklib.SystemInfo() #Do a version check usearch = args.usearch amptklib.versionDependencyChecks(usearch) #get number of cores if args.cpus: CORES = str(args.cpus) else: CORES = str(amptklib.getCPUS()) #check dependencies programs = ['Rscript'] amptklib.CheckDependencies(programs) Rversions = amptklib.checkRversion() R_pass = '******' dada2_pass = '******' #check dada2 first, if good move on, otherwise issue warning if not amptklib.gvc(Rversions[1], dada2_pass): amptklib.log.error("R v%s; DADA2 v%s detected, need at least v%s" % (Rversions[0], Rversions[1], dada2_pass)) amptklib.log.error( "See: http://benjjneb.github.io/dada2/dada-installation.html") sys.exit(1) amptklib.log.info("R v%s; DADA2 v%s" % (Rversions[0], Rversions[1])) #Count FASTQ records and remove 3' N's as dada2 can't handle them amptklib.log.info("Loading FASTQ Records") no_ns = base + '.cleaned_input.fq' if args.fastq.endswith('.gz'): fastqInput = args.fastq.replace('.gz', '') amptklib.Funzip(os.path.abspath(args.fastq), os.path.basename(fastqInput), CORES) else: fastqInput = os.path.abspath(args.fastq) amptklib.fastq_strip_padding(os.path.basename(fastqInput), no_ns) demuxtmp = base + '.original.fa' cmd = [ 'vsearch', '--fastq_filter', os.path.abspath(no_ns), '--fastq_qmax', '55', '--fastaout', demuxtmp, '--threads', CORES ]
amptklib.runSubprocess(cmd, amptklib.log) orig_total = amptklib.countfasta(demuxtmp) size = amptklib.checkfastqsize(no_ns) readablesize = amptklib.convertSize(size) amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')') #quality filter amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee) derep = base + '.qual-filtered.fq' filtercmd = [ 'vsearch', '--fastq_filter', no_ns, '--fastq_maxee', str(args.maxee), '--fastqout', derep, '--fastq_qmax', '55', '--fastq_maxns', '0', '--threads', CORES ] amptklib.runSubprocess(filtercmd, amptklib.log) total = amptklib.countfastq(derep) amptklib.log.info('{0:,}'.format(total) + ' reads passed') #split into individual files amptklib.log.info("Splitting FASTQ file by Sample into individual files") filtfolder = base + '_filtered' if os.path.isdir(filtfolder): shutil.rmtree(filtfolder) os.makedirs(filtfolder) splitDemux2(derep, filtfolder, args=args) #check for minimum number of reads in each sample remove = [] files = [i for i in os.listdir(filtfolder) if i.endswith('.fastq')] for x in files: if amptklib.countfastq(os.path.join(filtfolder, x)) < args.min_reads: remove.append(x) if len(remove) > 0: amptklib.log.info("Dropping %s as fewer than %i reads" % (', '.join(remove), args.min_reads)) for y in remove: os.remove(os.path.join(filtfolder, y)) #now run DADA2 on filtered folder amptklib.log.info("Running DADA2 pipeline") dada2log = base + '.dada2.Rscript.log' dada2out = base + '.dada2.csv' #check pooling vs notpooled, default is not pooled. if args.pool: POOL = 'TRUE' else: POOL = 'FALSE' with open(dada2log, 'w') as logfile: subprocess.call([ 'Rscript', '--vanilla', dada2script, filtfolder, dada2out, args.platform, POOL, CORES, args.chimera_method ], stdout=logfile, stderr=logfile) #check for results if not os.path.isfile(dada2out): amptklib.log.error("DADA2 run failed, please check %s logfile" % dada2log) sys.exit(1) #now process the output, pull out fasta, rename, etc fastaout = base + '.otus.tmp' OTUCounts = {} counter = 1 with open(fastaout, 'w') as writefasta: with open(dada2out, 'r') as input: next(input) for line in input: line = line.replace('\n', '') line = line.replace('"', '') cols = line.split(',') Seq = cols[0] countList = [int(x) for x in cols[1:]] counts = sum(countList) ID = 'ASV' + str(counter) if not ID in OTUCounts: OTUCounts[ID] = counts writefasta.write(">%s\n%s\n" % (ID, Seq)) counter += 1 #get number of bimeras from logfile with open(dada2log, 'r') as bimeracheck: for line in bimeracheck: if line.startswith('Identified '): bimeraline = line.split(' ') bimeras = int(bimeraline[1]) totalSeqs = int(bimeraline[5]) validSeqs = totalSeqs - bimeras amptklib.log.info('{0:,}'.format(totalSeqs) + ' total amplicon sequence variants (ASVs)') amptklib.log.info('{0:,}'.format(bimeras) + ' denovo chimeras removed') amptklib.log.info('{0:,}'.format(validSeqs) + ' valid ASVs') #optional UCHIME Ref uchime_out = base + '.nonchimeras.fa' chimeraFreeTable = base + '.otu_table.txt' iSeqs = base + '.ASVs.fa' if not args.uchime_ref: os.rename(fastaout, iSeqs) else: #check if file is present, remove from previous run if it is. if os.path.isfile(iSeqs): amptklib.removefile(iSeqs) #R. 
Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy if args.uchime_ref in [ 'ITS', '16S', 'LSU', 'COI' ]: #test if it is one that is setup, otherwise default to full path uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb') if not os.path.isfile(uchime_db): amptklib.log.error( "Database not properly configured, run `amptk install` to setup DB, skipping chimera filtering" ) uchime_out = fastaout #since uchime cannot work with udb database, need to extract fasta sequences, do this if if not amptklib.checkfile( os.path.join(parentdir, 'DB', args.uchime_ref + '.extracted.fa')): uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.extracted.fa') cmd = [ 'vsearch', '--udb2fasta', os.path.join(parentdir, 'DB', args.uchime_ref + '.udb'), '--output', uchime_db ] amptklib.runSubprocess(cmd, amptklib.log) else: uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.extracted.fa') else: if os.path.isfile(args.uchime_ref): uchime_db = os.path.abspath(args.uchime_ref) else: amptklib.log.error( "%s is not a valid file, skipping reference chimera filtering" % args.uchime_ref) iSeqs = fastaout #now run chimera filtering if all checks out if not os.path.isfile(iSeqs): amptklib.log.info("Chimera Filtering (VSEARCH) using %s DB" % args.uchime_ref) cmd = [ 'vsearch', '--mindiv', '1.0', '--uchime_ref', fastaout, '--db', uchime_db, '--nonchimeras', iSeqs, '--threads', CORES ] amptklib.runSubprocess(cmd, amptklib.log) total = amptklib.countfasta(iSeqs) uchime_chimeras = validSeqs - total amptklib.log.info('{0:,}'.format(total) + ' ASVs passed, ' + '{0:,}'.format(uchime_chimeras) + ' ref chimeras removed') if os.path.isfile(fastaout): amptklib.removefile(fastaout) #setup output files dadademux = base + '.dada2.map.uc' bioSeqs = base + '.cluster.otus.fa' bioTable = base + '.cluster.otu_table.txt' uctmp = base + '.map.uc' ClusterComp = base + '.ASVs2clusters.txt' #Filter out ASVs in wrong orientation amptklib.log.info('Validating ASV orientation') os.rename(iSeqs, iSeqs + '.bak') numKept, numDropped = amptklib.validateorientationDADA2( OTUCounts, iSeqs + '.bak', iSeqs) amptklib.log.info('{:,} ASVs validated ({:,} dropped)'.format( numKept, numDropped)) amptklib.SafeRemove(iSeqs + '.bak') #map reads to DADA2 OTUs amptklib.log.info("Mapping reads to DADA2 ASVs") cmd = [ 'vsearch', '--usearch_global', demuxtmp, '--db', iSeqs, '--id', '0.97', '--uc', dadademux, '--strand', 'plus', '--otutabout', chimeraFreeTable, '--threads', CORES ] amptklib.runSubprocess(cmd, amptklib.log) total = amptklib.line_count2(dadademux) amptklib.log.info('{0:,}'.format(total) + ' reads mapped to ASVs ' + '({0:.0f}%)'.format(total / float(orig_total) * 100)) #cluster amptklib.log.info("Clustering ASVs at %s%% to generate biological OTUs" % args.pct_otu) radius = float(args.pct_otu) / 100. 
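#Note on 'radius' here: vsearch --cluster_smallmem takes --id as a fraction, so pct_otu '97' becomes 0.97; contrast the UPARSE path above, where radius = str(100 - int(args.pct_otu)) yields a divergence percent ('3') for -otu_radius_pct. Same 97% clustering target, two different parameterizations.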
cmd = [ 'vsearch', '--cluster_smallmem', iSeqs, '--centroids', bioSeqs, '--id', str(radius), '--strand', 'plus', '--relabel', 'OTU', '--qmask', 'none', '--usersort', '--threads', CORES ] amptklib.runSubprocess(cmd, amptklib.log) total = amptklib.countfasta(bioSeqs) amptklib.log.info('{0:,}'.format(total) + ' OTUs generated') #determine where iSeqs clustered iSeqmap = base + '.ASV_map.uc' cmd = [ 'vsearch', '--usearch_global', iSeqs, '--db', bioSeqs, '--id', str(radius), '--uc', iSeqmap, '--strand', 'plus', '--threads', CORES ] amptklib.runSubprocess(cmd, amptklib.log) iSeqMapped = {} with open(iSeqmap, 'r') as mapping: for line in mapping: line = line.replace('\n', '') cols = line.split('\t') OTU = cols[9] Hit = cols[8] if not OTU in iSeqMapped: iSeqMapped[OTU] = [Hit] else: iSeqMapped[OTU].append(Hit) with open(ClusterComp, 'w') as clusters: clusters.write('OTU\tASVs\n') for k, v in natsorted(list(iSeqMapped.items())): clusters.write('%s\t%s\n' % (k, ', '.join(v))) #create OTU table amptklib.log.info("Mapping reads to OTUs") cmd = [ 'vsearch', '--usearch_global', demuxtmp, '--db', bioSeqs, '--id', '0.97', '--uc', uctmp, '--strand', 'plus', '--otutabout', bioTable, '--threads', CORES ] amptklib.runSubprocess(cmd, amptklib.log) total = amptklib.line_count2(uctmp) amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' + '({0:.0f}%)'.format(total / float(orig_total) * 100)) if not args.debug: amptklib.removefile(no_ns) shutil.rmtree(filtfolder) amptklib.removefile(dada2out) amptklib.removefile(derep) amptklib.removefile(demuxtmp) amptklib.removefile(uctmp) amptklib.removefile(iSeqmap) amptklib.removefile(dadademux) #Print location of files to STDOUT print("-------------------------------------------------------") print("DADA2 Script has Finished Successfully") print("-------------------------------------------------------") if args.debug: print("Tmp Folder of files: %s" % filtfolder) print("Amplicon sequence variants: %s" % iSeqs) print("ASV OTU Table: %s" % chimeraFreeTable) print("Clustered OTUs: %s" % bioSeqs) print("OTU Table: %s" % bioTable) print("ASVs 2 OTUs: %s" % ClusterComp) print("-------------------------------------------------------") otu_print = bioSeqs.split('/')[-1] tab_print = bioTable.split('/')[-1] if 'darwin' in sys.platform: print(colr.WARN + "\nExample of next cmd:" + colr.END + " amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print)) else: print( "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print))
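#A hedged sketch (not part of amptk) of the DADA2 CSV layout the parser above assumes: one header row, then one row per sequence variant with the (quoted) sequence in the first column and per-sample counts in the remaining columns. A standalone reader under that assumption:
def _read_dada2_csv_sketch(dada2_csv):
    #return a list of (sequence, total_count) tuples from a DADA2-style CSV
    results = []
    with open(dada2_csv) as fh:
        next(fh)  #skip the header row
        for line in fh:
            cols = line.rstrip('\n').replace('"', '').split(',')
            results.append((cols[0], sum(int(x) for x in cols[1:])))
    return results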
def main(args): parser = argparse.ArgumentParser( prog='amptk-unoise2.py', usage="%(prog)s [options] -i file.demux.fq\n%(prog)s -h for help menu", description='''Script runs UNOISE2 algorithm. Requires USEARCH9 by Robert C. Edgar: http://drive5.com/usearch''', epilog="""Written by Jon Palmer (2016) [email protected]""", formatter_class=MyFormatter) parser.add_argument('-i', '--fastq', dest="FASTQ", required=True, help='FASTQ file (Required)') parser.add_argument('-o', '--out', help='Base output name') parser.add_argument('-e', '--maxee', default='1.0', help='Quality trim EE value') parser.add_argument('-m', '--minsize', default='8', help='Min size to keep for denoising') parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9', help='USEARCH9 EXE') parser.add_argument('-p', '--pct_otu', default='97', help="Biological OTU Clustering Percent") parser.add_argument('--uchime_ref', help='Run UCHIME2 REF [ITS,16S,LSU,COI,custom]') parser.add_argument('--map_filtered', action='store_true', help='map quality filtered reads back to OTUs') parser.add_argument('--debug', action='store_true', help='Keep Intermediate Files') parser.add_argument('--cpus', type=int, help="Number of CPUs. Default: auto") args = parser.parse_args(args) parentdir = os.path.join(os.path.dirname(amptklib.__file__)) #get basename if not args.out passed if args.out: base = args.out else: if 'demux' in args.FASTQ: base = os.path.basename(args.FASTQ).split('.demux')[0] else: base = os.path.basename(args.FASTQ).split('.f')[0] #remove logfile if exists log_name = base + '.amptk-unoise2.log' if os.path.isfile(log_name): os.remove(log_name) amptklib.setupLogging(log_name) FNULL = open(os.devnull, 'w') cmd_args = " ".join(sys.argv) + '\n' amptklib.log.debug(cmd_args) print("-------------------------------------------------------") #initialize script, log system info and usearch version amptklib.SystemInfo() #Do a version check usearch = args.usearch amptklib.versionDependencyChecks(usearch) #get number of cpus if args.cpus: cpus = args.cpus else: cpus = amptklib.getCPUS() #make tmp folder tmp = base + '_tmp' if not os.path.exists(tmp): os.makedirs(tmp) #Count FASTQ records amptklib.log.info("Loading FASTQ Records") #convert to FASTA for mapping orig_fasta = os.path.join(tmp, base + '.orig.fa') cmd = [ 'vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta, '--fastq_qmax', '55', '--threads', str(cpus) ] amptklib.runSubprocess(cmd, amptklib.log) orig_total = amptklib.countfasta(orig_fasta) size = amptklib.checkfastqsize(args.FASTQ) readablesize = amptklib.convertSize(size) amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')') #Expected Errors filtering step filter_out = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fq') filter_fasta = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fa') amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee) cmd = [ 'vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee', str(args.maxee), '--fastqout', filter_out, '--fastaout', filter_fasta, '--fastq_qmax', '55', '--threads', str(cpus) ] amptklib.runSubprocess(cmd, amptklib.log) total = amptklib.countfastq(filter_out) amptklib.log.info('{0:,}'.format(total) + ' reads passed') #now run full length dereplication derep_out = os.path.join(tmp, base + '.EE' + args.maxee + '.derep.fa') amptklib.log.info("De-replication (remove duplicate reads)") cmd = [ 'vsearch', '--derep_fulllength', filter_out, '--relabel', 'Read_', '--sizeout', '--output', derep_out, '--threads',
str(cpus) ] amptklib.runSubprocess(cmd, amptklib.log) total = amptklib.countfasta(derep_out) amptklib.log.info('{0:,}'.format(total) + ' reads passed') #now run de-noiser UNOISE2 amptklib.log.info("Denoising reads with UNOISE2") unoise_out = os.path.join(tmp, base + '.EE' + args.maxee + '.unoise.fa') cmd = [ usearch, '-unoise2', derep_out, '-fastaout', unoise_out, '-minampsize', args.minsize, '-threads', str(cpus) ] amptklib.runSubprocess(cmd, amptklib.log) total = amptklib.countfasta(unoise_out) amptklib.log.info('{0:,}'.format(total) + ' denoised sequences') #strip N's amptklib.log.info("Cleaning up padding from OTUs") otu_clean = os.path.join(tmp, base + '.EE' + args.maxee + '.clean.fa') amptklib.fasta_strip_padding(unoise_out, otu_clean) #run optional uchime_ref if not args.uchime_ref: uchime_out = otu_clean else: uchime_out = os.path.join( tmp, base + '.EE' + args.maxee + '.uchime.otus.fa') #R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy if args.uchime_ref in [ 'ITS', '16S', 'LSU', 'COI' ]: #test if it is one that is setup, otherwise default to full path uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb') if not os.path.isfile(uchime_db): amptklib.log.error( "Database not properly configured, run `amptk install` to setup DB, skipping chimera filtering" ) uchime_out = otu_clean #since uchime cannot work with udb database, need to extract fasta sequences, do this if if not amptklib.checkfile( os.path.join(parentdir, 'DB', args.uchime_ref + '.extracted.fa')): uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.extracted.fa') cmd = [ 'vsearch', '--udb2fasta', os.path.join(parentdir, 'DB', args.uchime_ref + '.udb'), '--output', uchime_db ] amptklib.runSubprocess(cmd, amptklib.log) else: uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.extracted.fa') else: uchime_db = os.path.abspath(args.uchime_ref) #now run chimera filtering if all checks out if not os.path.isfile(uchime_out): amptklib.log.info("Chimera Filtering (VSEARCH)") cmd = [ 'vsearch', '--mindiv', '1.0', '--uchime_ref', otu_clean, '--db', uchime_db, '--nonchimeras', uchime_out, '--threads', str(cpus) ] amptklib.runSubprocess(cmd, amptklib.log) total = amptklib.countfasta(uchime_out) amptklib.log.info('{0:,}'.format(total) + ' OTUs passed') #inferred sequences iSeqs = base + '.ASVs.fa' amptklib.fastarename(uchime_out, 'ASV', iSeqs) #Filter out ASVs in wrong orientation amptklib.log.info('Validating ASV orientation') passingOTUs = os.path.join(tmp, base + '.passed.asvs.fa') numKept, numDropped = amptklib.validateorientation(tmp, derep_out, uchime_out, passingOTUs) amptklib.log.info('{:,} ASVs validated ({:,} dropped)'.format( numKept, numDropped)) #build OTU table with iSeqs uc_iSeq_out = os.path.join(tmp, base + '.EE' + args.maxee + '.mapping.uc') iSeq_otu_table = base + '.otu_table.txt' #setup reads to map if args.map_filtered: reads = filter_fasta else: reads = orig_fasta amptklib.log.info("Mapping Reads to ASVs and Building OTU table") cmd = [ 'vsearch', '--usearch_global', reads, '--strand', 'plus', '--id', '0.97', '--db', passingOTUs, '--uc', uc_iSeq_out, '--otutabout', iSeq_otu_table, '--threads', str(cpus) ] amptklib.runSubprocess(cmd, amptklib.log) #count reads mapped total = amptklib.line_count2(uc_iSeq_out) amptklib.log.info('{0:,}'.format(total) + ' reads mapped to ASVs ' + '({0:.0f}%)'.format(total / float(orig_total) * 100)) #now cluster to biological OTUs with UCLUST radius = float(args.pct_otu) / 100. 
amptklib.log.info( "Clustering denoised sequences into biological OTUs at %s%%" % args.pct_otu) uclust_out = os.path.join(tmp, base + '.EE' + args.maxee + '.uclust.fa') cmd = [ 'vsearch', '--cluster_smallmem', passingOTUs, '--centroids', uclust_out, '--id', str(radius), '--strand', 'plus', '--relabel', 'OTU', '--qmask', 'none', '--usersort', '--threads', str(cpus) ] amptklib.runSubprocess(cmd, amptklib.log) total = amptklib.countfasta(uclust_out) amptklib.log.info('{0:,}'.format(total) + ' OTUs generated') #determine where denoised sequences clustered ClusterComp = base + '.ASVs2clusters.txt' iSeqmap = base + '.unoise_map.uc' cmd = [ usearch, '-usearch_global', passingOTUs, '-db', uclust_out, '-id', str(radius), '-uc', iSeqmap, '-strand', 'plus', '-threads', str(cpus) ] amptklib.runSubprocess(cmd, amptklib.log) iSeqMapped = {} with open(iSeqmap, 'r') as mapping: for line in mapping: line = line.replace('\n', '') cols = line.split('\t') OTU = cols[9] Hit = cols[8] if not OTU in iSeqMapped: iSeqMapped[OTU] = [Hit] else: iSeqMapped[OTU].append(Hit) with open(ClusterComp, 'w') as clusters: clusters.write('OTU\tASVs\n') for k, v in natsorted(list(iSeqMapped.items())): clusters.write('%s\t%s\n' % (k, ', '.join(v))) #now map reads back to OTUs and build OTU table uc_out = os.path.join(tmp, base + '.EE' + args.maxee + '.cluster.mapping.uc') otu_table = os.path.join( tmp, base + '.EE' + args.maxee + '.cluster.otu_table.txt') #setup reads to map if args.map_filtered: reads = filter_fasta else: reads = orig_fasta amptklib.log.info("Mapping Reads to OTUs and Building OTU table") cmd = [ 'vsearch', '--usearch_global', reads, '--strand', 'plus', '--id', '0.97', '--db', uclust_out, '--uc', uc_out, '--otutabout', otu_table, '--threads', str(cpus) ] amptklib.runSubprocess(cmd, amptklib.log) #count reads mapped total = amptklib.line_count2(uc_out) amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' + '({0:.0f}%)'.format(total / float(orig_total) * 100)) #Move files around, delete tmp if argument passed. currentdir = os.getcwd() final_otu = os.path.join(currentdir, base + '.cluster.otus.fa') shutil.copyfile(uclust_out, final_otu) final_otu_table = os.path.join(currentdir, base + '.cluster.otu_table.txt') shutil.copyfile(otu_table, final_otu_table) if not args.debug: shutil.rmtree(tmp) #Print location of files to STDOUT print("-------------------------------------------------------") print("UNOISE2 Script has Finished Successfully") print("-------------------------------------------------------") if args.debug: print("Tmp Folder of files: %s" % tmp) print("Amplicon sequence variants: %s" % passingOTUs) print("ASV OTU Table: %s" % iSeq_otu_table) print("Clustered OTUs: %s" % os.path.basename(final_otu)) print("OTU Table: %s" % os.path.basename(final_otu_table)) print("ASVs 2 OTUs: %s" % ClusterComp) print("-------------------------------------------------------") otu_print = final_otu.split('/')[-1] tab_print = final_otu_table.split('/')[-1] if 'darwin' in sys.platform: print(colr.WARN + "\nExample of next cmd:" + colr.END + " amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print)) else: print( "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print))
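#The .ASVs2clusters.txt written above is a two-column TSV mapping each clustered OTU to the ASVs it absorbed. A hedged sketch (not part of amptk) of reading it back in:
def _read_asv_clusters_sketch(cluster_file):
    #return a dict of OTU -> list of ASV names from an ASVs2clusters TSV
    mapping = {}
    with open(cluster_file) as fh:
        next(fh)  #skip the 'OTU<tab>ASVs' header row
        for line in fh:
            otu, asvs = line.rstrip('\n').split('\t')
            mapping[otu] = [x.strip() for x in asvs.split(',')]
    return mapping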
def main(args): parser = argparse.ArgumentParser( prog='amptk-drop.py', description='''Script that drops OTUs and then creates OTU table''', epilog="""Written by Jon Palmer (2016) [email protected]""", formatter_class=MyFormatter) parser.add_argument('-i', '--input', required=True, help='OTUs in FASTA format') parser.add_argument('-r', '--reads', required=True, help='Demuxed reads FASTQ format') parser.add_argument('-o', '--out', help='Base output name') parser.add_argument('-l', '--list', nargs='+', help='Input list of (BC) names to remove') parser.add_argument('-f', '--file', help='File containing list of names to remove') args = parser.parse_args(args) #get basename if not args.out passed if args.out: base = args.out else: if 'otus' in args.input: base = os.path.basename(args.input).split('.otus')[0] else: base = os.path.basename(args.input).split('.f')[0] #remove logfile if exists log_name = base + '.amptk-drop.log' if os.path.isfile(log_name): os.remove(log_name) amptklib.setupLogging(log_name) cmd_args = " ".join(sys.argv) + '\n' amptklib.log.debug(cmd_args) print("-------------------------------------------------------") #initialize script, log system info and usearch version amptklib.SystemInfo() #check the list or file parameters, exactly one of them must be provided if not args.list and not args.file: amptklib.log.error( "Error, you must specify a list of OTU names or a file containing names" ) sys.exit(1) if args.list and args.file: amptklib.log.error( "Error, you must specify either a list of OTU names or a file containing OTU names, not both" ) sys.exit(1) if args.file: count = amptklib.line_count(args.file) #load in list of names to remove with open(args.file, 'r') as input: lines = [line.rstrip('\n') for line in input] if args.list: count = len(args.list) lines = args.list #make sure it is a set, faster lookup dropList = set(lines) #load data total = amptklib.countfasta(args.input) amptklib.log.info("Loading %i OTUs" % total) #load in the fasta file, dropping any records found in dropList amptklib.log.info("Dropping %i OTUs" % count) newOTUs = base + '.cleaned.otus.fa' with open(newOTUs, 'w') as otus: with open(args.input, 'r') as fasta: for rec in SeqIO.parse(fasta, 'fasta'): if not rec.id in dropList: SeqIO.write(rec, otus, 'fasta') #now make new OTU table amptklib.log.info("Mapping Reads to OTUs and Building OTU table") newTable = base + '.cleaned.otu_table.txt' tmpReads = base + '.reads.tmp' uc_out = base + '.mapping.uc' cmd = [ 'vsearch', '--fastq_filter', args.reads, '--fastaout', tmpReads, '--fastq_qmax', '55' ] amptklib.runSubprocess(cmd, amptklib.log) cmd = [ 'vsearch', '--usearch_global', tmpReads, '--strand', 'plus', '--id', '0.97', '--db', newOTUs, '--uc', uc_out, '--otutabout', newTable ] amptklib.runSubprocess(cmd, amptklib.log) #count OTUs otu_count = amptklib.countfasta(newOTUs) amptklib.log.info('{0:,}'.format(otu_count) + ' OTUs remaining') #count reads mapped total = amptklib.line_count(uc_out) orig_total = amptklib.countfasta(tmpReads) amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' + '({0:.0f}%)'.format(total / float(orig_total) * 100)) #Print location of files to STDOUT print("-------------------------------------------------------") print("Clustered OTUs: %s" % newOTUs) print("OTU Table: %s" % newTable) print("-------------------------------------------------------") #cleanup
amptklib.removefile(tmpReads) amptklib.removefile(uc_out) otu_print = newOTUs.split('/')[-1] tab_print = newTable.split('/')[-1] if 'darwin' in sys.platform: print(colr.WARN + "\nExample of next cmd:" + colr.END + " amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print)) else: print( "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print))
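#Example of driving this drop entry point directly (names are hypothetical; normally the amptk CLI dispatches to main()). Exactly one of -l or -f is given:
#  main(['-i', 'run1.cluster.otus.fa', '-r', 'run1.demux.fq', '-l', 'OTU2', 'OTU15'])
#  main(['-i', 'run1.cluster.otus.fa', '-r', 'run1.demux.fq', '-f', 'drop_these.txt'])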
def main(args): parser = argparse.ArgumentParser( prog='amptk-lulu.py', description= '''Script runs OTU table post processing LULU to identify low abundance error OTUs''', epilog="""Written by Jon Palmer (2018) [email protected]""", formatter_class=MyFormatter) parser.add_argument('-i', '--otu_table', required=True, help='Input OTU table') parser.add_argument('-f', '--fasta', required=True, help='Input OTUs (multi-fasta)') parser.add_argument('-o', '--out', help='Output folder basename') parser.add_argument('--min_ratio_type', default='min', choices=['min', 'avg'], help="LULU minimum ratio threshold") parser.add_argument('--min_ratio', default=1, type=int, help="LULU minimum ratio") parser.add_argument('--min_match', default=84, type=int, help="LULU minimum match percent identity") parser.add_argument('--min_relative_cooccurence', default=95, type=int, help="LULU minimum relative cooccurrence") parser.add_argument('--debug', action='store_true', help='Keep Intermediate Files') args = parser.parse_args(args) #get location of R script parentdir = os.path.join(os.path.dirname(amptklib.__file__)) luluScript = os.path.join(parentdir, 'runLULU.R') if not args.out: #get base name of files if 'otu_table' in args.otu_table: base = os.path.basename(args.otu_table).split(".otu_table")[0] elif 'final.txt' in args.otu_table: base = os.path.basename(args.otu_table).split(".final")[0] else: base = os.path.basename(args.otu_table).split(".txt")[0] else: base = args.out #remove logfile if exists log_name = base + '.amptk-lulu.log' if os.path.isfile(log_name): amptklib.removefile(log_name) amptklib.setupLogging(log_name) FNULL = open(os.devnull, 'w') cmd_args = " ".join(sys.argv) + '\n' amptklib.log.debug(cmd_args) print("-------------------------------------------------------") #initialize script, log system info and usearch version amptklib.SystemInfo() amptklib.versionDependencyChecks('usearch9') #check dependencies programs = ['Rscript', 'vsearch'] amptklib.CheckDependencies(programs) Rversions = amptklib.checkRversion() if Rversions[3] == '0.0.0': amptklib.log.error("R v%s installed, LULU not installed" % Rversions[0]) sys.exit(1) else: amptklib.log.info("R v%s; LULU v%s" % (Rversions[0], Rversions[3])) #this is a simple wrapper for an R script so easier to run from amptk menu tmpdir = 'lulu_' + str(os.getpid()) if not os.path.isdir(tmpdir): os.makedirs(tmpdir) #generate the match list using the minimum match pident match_file = os.path.join(tmpdir, 'match_file.txt') amptklib.log.info("Loading {:,} OTUs".format( amptklib.countfasta(args.fasta))) amptklib.log.info( "Generating pairwise percent identity between OTUs using VSEARCH at {:}% identity" .format(args.min_match)) cmd = [ 'vsearch', '--usearch_global', os.path.abspath(args.fasta), '--db', os.path.abspath(args.fasta), '--self', '--id', str(args.min_match / 100), '--iddef', '1', '--userout', match_file, '--userfields', 'query+target+id', '--maxaccepts', '0', '--query_cov', '.9', '--maxhits', '10' ] amptklib.runSubprocess(cmd, amptklib.log) #now run LULU in R LULU_log = os.path.join(tmpdir, 'LULU-R.log') lulu_otu_table = base + '.lulu.otu_table.txt' dropList = os.path.join(tmpdir, 'droplist.txt') MapData = base + '.lulu.otu-map.txt' amptklib.log.info("Running LULU algorithm") cmd = [ 'Rscript', '--vanilla', luluScript, os.path.abspath(args.otu_table), os.path.abspath(match_file), args.min_ratio_type, str(args.min_ratio), str(args.min_match), str(args.min_relative_cooccurence / 100), lulu_otu_table, dropList, MapData ] amptklib.runSubprocess4(cmd, amptklib.log,
LULU_log) #get updated OTUs remove = [] with open(dropList, 'r') as dropped: for line in dropped: remove.append(line.rstrip()) lulu_otus = base + '.lulu.otus.fa' with open(lulu_otus, 'w') as output: with open(args.fasta, 'r') as infasta: for record in SeqIO.parse(infasta, 'fasta'): if not record.id in remove: output.write('>%s\n%s\n' % (record.id, record.seq)) amptklib.log.info( "LULU has merged {:,} OTUs, output data contains {:,} OTUs".format( len(remove), amptklib.countfasta(lulu_otus))) amptklib.log.info("LULU OTU table post processing finished\n\ ----------------------------------\n\ OTU table: {:}\n\ OTU FASTA: {:}\n\ LULU map: {:}\n\ ----------------------------------".format(lulu_otu_table, lulu_otus, MapData)) if 'win32' in sys.platform: print( "\nExample of next cmd: amptk taxonomy -f %s -i %s -m mapping_file.txt -d ITS2\n" % (lulu_otus, lulu_otu_table)) else: print(colr.WARN + "\nExample of next cmd:" + colr.END + " amptk taxonomy -f %s -i %s -m mapping_file.txt -d ITS2\n" % (lulu_otus, lulu_otu_table)) if not args.debug: if os.path.isdir(tmpdir): shutil.rmtree(tmpdir) print("-------------------------------------------------------")
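#Example of driving the LULU wrapper directly (file names are hypothetical; normally the amptk CLI dispatches to main()):
#  main(['-i', 'run1.otu_table.txt', '-f', 'run1.otus.fa', '--min_ratio_type', 'min', '--min_match', '84'])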