def main(args):
    """Run reference-based OTU clustering on a demultiplexed FASTQ file.

    Steps, in order:
      1. Build a combined reference FASTA (optional mock community plus the
         selected reference database) inside ``<base>_tmp``.
      2. Quality filter (expected errors), dereplicate, size-sort and
         chimera-filter the reads with VSEARCH.
      3. Globally align the surviving sequences to the reference; hits at or
         above ``--id`` become reference OTUs.
      4. Unless ``--closed_ref_only`` is given, cluster the non-hits de novo
         with USEARCH and keep those UTAX classifies at ``--utax_level``.
      5. Map reads back to the OTUs, build the OTU table and copy the final
         OTU FASTA and OTU table to the current working directory.

    Args:
        args: list of command-line argument strings passed to argparse.

    Exits (``sys.exit(1)``) when the reference contains duplicate sequence
    IDs or when a custom ``--db`` is used without ``--utax_db`` (and
    ``--closed_ref_only`` is not set).
    """
    parser = argparse.ArgumentParser(
        prog='amptk-OTU_cluster_ref.py',
        usage="%(prog)s [options] -i file.demux.fq\n%(prog)s -h for help menu",
        description=('Script runs UPARSE OTU clustering. Requires USEARCH by '
                     'Robert C. Edgar: http://drive5.com/usearch'),
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--fastq', dest="FASTQ", required=True,
                        help='FASTQ file (Required)')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('-e', '--maxee', default='1.0',
                        help='Quality trim EE value')
    parser.add_argument('-p', '--pct_otu', default='97',
                        help="OTU Clustering Percent")
    parser.add_argument('--id', default='97', help="Threshold for alignment")
    parser.add_argument('-m', '--minsize', default='2',
                        help='Min identical seqs to process')
    parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--map_filtered', action='store_true',
                        help='map quality filtered reads back to OTUs')
    parser.add_argument(
        '-d', '--db', required=True,
        help='Reference Database [ITS,ITS1,ITS2,16S,LSU,COI,custom]')
    parser.add_argument('--utax_db', help='UTAX Reference Database')
    parser.add_argument('--utax_cutoff', default=0.8, type=restricted_float,
                        help='UTAX confidence value threshold.')
    parser.add_argument('--utax_level', default='k',
                        choices=['k', 'p', 'c', 'o', 'f', 'g', 's'],
                        help='UTAX classification level to retain')
    parser.add_argument('--mock', default='synmock',
                        help='Spike-in mock community (fasta)')
    # The flag keeps the tmp folder (see the cleanup at the end), so the help
    # text says so.
    parser.add_argument('--debug', action='store_true',
                        help='Keep intermediate files')
    parser.add_argument('--closed_ref_only', action='store_true',
                        help='Only run closed reference clustering')
    parser.add_argument('--cpus', type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.dirname(amptklib.__file__)

    # get basename if not args.out passed
    if args.out:
        base = args.out
    elif 'demux' in args.FASTQ:
        base = os.path.basename(args.FASTQ).split('.demux')[0]
    else:
        base = os.path.basename(args.FASTQ).split('.f')[0]

    taxonomyLookup = {
        'k': 'Kingdom',
        'p': 'Phylum',
        'c': 'Class',
        'o': 'Order',
        'f': 'Family',
        'g': 'Genus',
        's': 'Species'
    }

    # remove logfile if exists
    log_name = base + '.amptk-cluster_ref.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    # initialize script, log system info and usearch version
    amptklib.SystemInfo()
    # Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    # get number of cpus
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()

    # make tmp folder
    tmp = base + '_tmp'
    if not os.path.exists(tmp):
        os.makedirs(tmp)

    # Setup DB locations and names, etc
    # Each entry is (global alignment UDB, UTAX UDB).
    DBdir = os.path.join(parentdir, 'DB')
    DataBase = {
        'ITS1': (os.path.join(DBdir, 'ITS.udb'),
                 os.path.join(DBdir, 'ITS1_UTAX.udb')),
        'ITS2': (os.path.join(DBdir, 'ITS.udb'),
                 os.path.join(DBdir, 'ITS2_UTAX.udb')),
        'ITS': (os.path.join(DBdir, 'ITS.udb'),
                os.path.join(DBdir, 'ITS_UTAX.udb')),
        '16S': (os.path.join(DBdir, '16S.udb'),
                os.path.join(DBdir, '16S.udb')),
        'LSU': (os.path.join(DBdir, 'LSU.udb'),
                os.path.join(DBdir, 'LSU_UTAX.udb')),
        'COI': (os.path.join(DBdir, 'COI.udb'),
                os.path.join(DBdir, 'COI_UTAX.udb'))
    }

    # setup refDB
    amptklib.log.info("Checking Reference Database")
    if args.db in DataBase:
        # need to write to fasta from vsearch UDB
        DB = os.path.join(tmp, args.db + '.extracted.fa')
        cmd = ['vsearch', '--udb2fasta', DataBase.get(args.db)[0],
               '--output', DB]
        amptklib.runSubprocess(cmd, amptklib.log)
    else:
        DB = os.path.abspath(args.db)
    refDB = os.path.join(tmp, 'reference_DB.fa')

    # Mock community (if any) is written first, then the reference database.
    ref_sources = []
    if args.mock:
        if args.mock == 'synmock':
            ref_sources.append(
                os.path.join(parentdir, 'DB', 'amptk_synmock.fa'))
        else:
            ref_sources.append(os.path.abspath(args.mock))
    ref_sources.append(DB)

    # IDs must be unique across mock + reference; every written ID is
    # recorded so that the duplicate check can actually trigger.
    seen = set()
    with open(refDB, 'w') as output:
        for source in ref_sources:
            with open(source) as handle:
                for rec in SeqIO.parse(handle, 'fasta'):
                    if rec.id in seen:
                        amptklib.log.error(
                            "Duplicate ID's in Ref DB: %s, exiting" % rec.id)
                        sys.exit(1)
                    seen.add(rec.id)
                    SeqIO.write(rec, output, 'fasta')

    # get utax_database
    # utaxDB is only consumed by the open-reference branch further down, so it
    # is allowed to stay undefined for a custom DB with --closed_ref_only.
    if args.db in DataBase:
        utaxDB = DataBase.get(args.db)[1]
    elif not args.closed_ref_only:
        if args.utax_db:
            utaxDB = os.path.abspath(args.utax_db)
        else:
            amptklib.log.error(
                "%s not pre-installed DB, must then also specify valid UTAX database via --utax_db"
                % args.db)
            sys.exit(1)

    # Count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    # convert to FASTA for mapping
    orig_fasta = os.path.join(tmp, base + '.orig.fa')
    cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta,
           '--fastq_qmax', '55', '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(orig_fasta)
    size = amptklib.checkfastqsize(args.FASTQ)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')

    # Expected Errors filtering step
    filter_out = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fq')
    filter_fasta = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fa')
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    cmd = ['vsearch', '--fastq_filter', args.FASTQ,
           '--fastq_maxee', str(args.maxee), '--fastqout', filter_out,
           '--fastaout', filter_fasta, '--fastq_qmax', '55',
           '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    qtrimtotal = amptklib.countfastq(filter_out)
    amptklib.log.info('{0:,}'.format(qtrimtotal) + ' reads passed')

    # now run full length dereplication
    derep_out = os.path.join(tmp, base + '.EE' + args.maxee + '.derep.fa')
    amptklib.log.info("De-replication (remove duplicate reads)")
    cmd = ['vsearch', '--derep_fulllength', filter_fasta, '--sizeout',
           '--output', derep_out, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(derep_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    # now run sort by size
    sort_out = os.path.join(tmp, base + '.EE' + args.maxee + '.sort.fa')
    amptklib.log.info(
        "Sorting reads by size: removing reads seen less than %s times"
        % args.minsize)
    cmd = ['vsearch', '--sortbysize', derep_out, '--minsize', args.minsize,
           '--output', sort_out, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(sort_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    # chimera detection
    # first run through de novo chimera detection
    amptklib.log.info("De novo chimera detection (VSEARCH)")
    chimera_out = os.path.join(
        tmp, base + '.EE' + args.maxee + '.chimera_check.fa')
    cmd = ['vsearch', '--uchime_denovo', sort_out, '--relabel', 'Seq',
           '--sizeout', '--nonchimeras', chimera_out, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(chimera_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    # now run uchime_ref
    uchime_out = os.path.join(
        tmp, base + '.EE' + args.maxee + '.uchime.otus.fa')
    # now run chimera filtering if all checks out
    amptklib.log.info("Chimera Filtering (VSEARCH)")
    cmd = ['vsearch', '--mindiv', '1.0', '--uchime_ref', chimera_out,
           '--db', refDB, '--sizeout', '--nonchimeras', uchime_out,
           '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(uchime_out)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs passed')

    # now run usearch_global versus reference database
    align_out = os.path.join(tmp, base + '.align.uc')
    pident = int(args.id) * 0.01
    amptklib.log.info(
        "Reference Clustering using Global Alignment, %s%% identity" % args.id)
    cmd = ['vsearch', '--usearch_global', uchime_out, '--db', refDB,
           '--id', str(pident), '--output_no_hits', '--top_hits_only',
           '--notrunclabels', '--uc', align_out, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)

    # parse results
    # Columns used from the .uc file: 3 = percent identity ('*' for no hit),
    # 8 = query label (carries ';size=N'), 9 = target label.
    ref_results = {}
    nohits = set()
    with open(align_out, 'r') as alignment:
        for line in alignment:
            col = line.replace('\n', '').split('\t')
            query = col[8]
            counts = int(query.split(';')[1].replace('size=', ''))
            if col[3] == '*':
                nohits.add(query)
                continue
            if float(col[3]) >= float(args.id):
                if query not in ref_results:
                    ref_results[query] = (col[9], col[3], counts)
                else:
                    print("Error: %s duplicated ID" % query)
            else:
                nohits.add(query)

    # summarize results from first ref clustering
    num_refcluster = len(ref_results)
    seqs_refcluster = sum(v[2] for v in ref_results.values())
    amptklib.log.info(
        "%i OTUs classified " % num_refcluster +
        "({0:.0f}%".format(seqs_refcluster / float(qtrimtotal) * 100) +
        " of reads)")

    # get ref clustered hits to file with taxonomy
    # otu_counter keeps running into the de novo section below so that OTU
    # numbers stay unique across both sets.
    ref_clustered = os.path.join(tmp, base + '.ref_clustered.fa')
    otu_counter = 1
    with open(ref_clustered, 'w') as refoutput:
        with open(uchime_out, 'r') as infile:
            for rec in SeqIO.parse(infile, 'fasta'):
                if rec.id in ref_results:
                    res = ref_results.get(rec.id)
                    pident = res[1]
                    tax = res[0]
                    rec.id = ('OTU' + str(otu_counter) + ';pident=' +
                              pident + ';' + tax)
                    rec.name = ''
                    rec.description = ''
                    SeqIO.write(rec, refoutput, 'fasta')
                    otu_counter += 1

    if not args.closed_ref_only:
        # get nohits file to run clustering
        utax_ref = os.path.join(
            tmp, base + '.EE' + args.maxee + '.utax_ref.fa')
        with open(utax_ref, 'w') as output:
            with open(uchime_out, 'r') as infile:
                for rec in SeqIO.parse(infile, 'fasta'):
                    if rec.id in nohits:
                        SeqIO.write(rec, output, 'fasta')

        # input needs to be sorted, so
        ref_sort = os.path.join(tmp, base + '.utax_ref.sorted.fa')
        cmd = ['vsearch', '--sortbysize', utax_ref, '--minsize', args.minsize,
               '--output', ref_sort, '--threads', str(cpus)]
        amptklib.runSubprocess(cmd, amptklib.log)

        # now run clustering algorithm on those not found in reference database
        radius = str(100 - int(args.pct_otu))
        otu_out = os.path.join(tmp, base + '.EE' + args.maxee + '.otus.fa')
        amptklib.log.info("De novo Clustering remaining sequences (UPARSE)")
        cmd = [usearch, '-cluster_otus', ref_sort, '-relabel', 'OTU',
               '-otu_radius_pct', radius, '-otus', otu_out]
        amptklib.runSubprocess(cmd, amptklib.log)
        total = amptklib.countfasta(otu_out)
        amptklib.log.info('{0:,}'.format(total) + ' de novo OTUs')

        # try utax reference clustering
        amptklib.log.info("Reference Clustering de novo OTUs using UTAX")
        utax_results = os.path.join(tmp, base + '.utax.out')
        cmd = [usearch, '-cluster_otus_utax', otu_out, '-db', utaxDB,
               '-utax_cutoff', str(args.utax_cutoff), '-utax_level', 's',
               '-strand', 'plus', '-utaxout', utax_results]
        amptklib.runSubprocess(cmd, amptklib.log)

        # setup tax filtering
        # A de novo OTU is kept when its taxonomy string contains the
        # requested rank prefix or any rank below it.
        tax_values = ['k', 'p', 'c', 'o', 'f', 'g', 's']
        filter_index = tax_values.index(args.utax_level)
        filt_tax_values = [s + ':' for s in tax_values[filter_index:]]

        # get results from utax
        with open(ref_clustered, 'a') as output:
            seqDict = SeqIO.index(otu_out, 'fasta')
            with open(utax_results, 'r') as utax:
                for line in utax:
                    col = line.replace('\n', '').split('\t')
                    ID = col[0]
                    tax = col[2]
                    if any(x in tax for x in filt_tax_values):
                        record = seqDict[ID]
                        record.id = ('OTU' + str(otu_counter) +
                                     ';UTAX;tax=' + tax)
                        record.name = ''
                        record.description = ''
                        SeqIO.write(record, output, 'fasta')
                        otu_counter += 1
        total = amptklib.countfasta(ref_clustered) - num_refcluster
        amptklib.log.info('{0:,}'.format(total) + ' classified to %s'
                          % taxonomyLookup.get(args.utax_level))

    # clean up padded N's
    amptklib.log.info("Cleaning up padding from OTUs")
    otu_clean = os.path.join(tmp, base + '.clean.otus.fa')
    amptklib.fasta_strip_padding(ref_clustered, otu_clean)
    total = amptklib.countfasta(otu_clean)
    amptklib.log.info('{0:,}'.format(total) + ' total OTUs')

    # now map reads back to OTUs
    uc_out = os.path.join(tmp, base + '.EE' + args.maxee + '.mapping.uc')
    otu_table = os.path.join(
        tmp, base + '.EE' + args.maxee + '.otu_table.txt')
    # setup reads to map
    if args.map_filtered:
        reads = filter_fasta
    else:
        reads = orig_fasta
    amptklib.log.info("Mapping Reads to OTUs and Building OTU table")
    cmd = ['vsearch', '--usearch_global', reads, '--strand', 'plus',
           '--id', '0.97', '--db', otu_clean, '--uc', uc_out,
           '--otutabout', otu_table, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)

    # count reads mapped
    total = amptklib.line_count2(uc_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    # Move files around, delete tmp if argument passed.
    currentdir = os.getcwd()
    final_otu = os.path.join(currentdir, base + '.cluster.otus.fa')
    shutil.copyfile(otu_clean, final_otu)
    final_otu_table = os.path.join(currentdir, base + '.otu_table.txt')
    shutil.copyfile(otu_table, final_otu_table)
    if not args.debug:
        shutil.rmtree(tmp)

    # Print location of files to STDOUT
    print("-------------------------------------------------------")
    print("OTU Clustering Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % tmp)
    print("Clustered OTUs: %s" % os.path.basename(final_otu))
    print("OTU Table: %s" % os.path.basename(final_otu_table))
    print("-------------------------------------------------------")

    otu_print = os.path.basename(final_otu)
    tab_print = os.path.basename(final_otu_table)
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n"
              % (tab_print, otu_print))
    else:
        print(
            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
            % (tab_print, otu_print))
def main(args):
    """Assign taxonomy to OTU sequences and write annotated outputs.

    Depending on ``--method`` the OTUs in ``--fasta`` are classified with
    BLAST, the RDP classifier, VSEARCH global alignment, UTAX, SINTAX, or the
    'hybrid' combination of the last three; alternatively a pre-computed
    two-column taxonomy file can be supplied with ``--taxonomy``.

    Outputs (all prefixed with the base name):
      * ``.otus.taxonomy.fa``  - OTU FASTA with taxonomy in the description
      * ``.taxonomy.txt``      - tab-delimited taxonomy (unless ``--taxonomy``)
      * ``.otu_table.taxonomy.txt`` and ``.biom`` when ``--otu_table`` is given
        (BIOM only if the ``biom`` program is on PATH and method is not blast)
      * ``.tree.phy``          - tree written by ``usearch -cluster_agg``

    Args:
        args: list of command-line argument strings passed to argparse.

    Exits (``sys.exit(1)``) when no database is selected, a selected database
    file is missing, a required external program is missing, or hybrid
    taxonomy parsing yields no predictions.
    """
    parser = argparse.ArgumentParser(
        prog='amptk-assign_taxonomy.py',
        usage="%(prog)s [options] -f <FASTA File>",
        description='''assign taxonomy to OTUs''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--otu_table', dest="otu_table",
                        help='Append Taxonomy to OTU table')
    parser.add_argument('-f', '--fasta', required=True, help='FASTA input')
    parser.add_argument('-o', '--out', help='Output file (FASTA)')
    parser.add_argument(
        '-m', '--mapping_file',
        help='Mapping file: QIIME format can have extra meta data columns')
    parser.add_argument(
        '--method', default='hybrid',
        choices=['utax', 'usearch', 'sintax', 'hybrid', 'rdp', 'blast'],
        help='Taxonomy method')
    parser.add_argument(
        '-d', '--db',
        help='Pre-installed Databases: [ITS,ITS1,ITS2,16S,LSU,COI]')
    parser.add_argument(
        '-t', '--taxonomy',
        help='Incorporate taxonomy calculated elsewhere, 2 column file')
    parser.add_argument('--fasta_db',
                        help='Alternative database of fasta sequences')
    parser.add_argument('--add2db',
                        help='Custom FASTA database to add to DB on the fly')
    parser.add_argument('--utax_db', help='UTAX Reference Database')
    parser.add_argument('--utax_cutoff', default=0.8, type=restricted_float,
                        help='UTAX confidence value threshold.')
    parser.add_argument('--usearch_db', help='USEARCH Reference Database')
    parser.add_argument('--usearch_cutoff', default=0.7,
                        type=restricted_float,
                        help='USEARCH percent ID threshold.')
    parser.add_argument(
        '-r', '--rdp', dest='rdp',
        default='/Users/jon/scripts/rdp_classifier_2.10.1/dist/classifier.jar',
        help='Path to RDP Classifier')
    parser.add_argument('--rdp_db', dest='rdp_tax', default='fungalits_unite',
                        choices=['16srrna', 'fungallsu', 'fungalits_warcup',
                                 'fungalits_unite'],
                        help='Training set for RDP Classifier')
    parser.add_argument('--rdp_cutoff', default=0.8, type=restricted_float,
                        help='RDP confidence value threshold')
    parser.add_argument('--local_blast', help='Path to local Blast DB')
    parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--tax_filter',
                        help='Retain only OTUs with match in OTU table')
    parser.add_argument('--sintax_cutoff', default=0.8,
                        type=restricted_float, help='SINTAX threshold.')
    # The flag keeps intermediate files (see the cleanup at the end).
    parser.add_argument('--debug', action='store_true',
                        help='Keep intermediate files')
    parser.add_argument('--cpus', type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.dirname(amptklib.__file__)

    if not args.out:
        # get base name of files
        if 'filtered' in args.fasta:
            base = args.fasta.split(".filtered")[0]
        elif 'otu' in args.fasta:
            base = args.fasta.split('.otu')[0]
        else:
            base = args.fasta.split('.fa')[0]
    else:
        base = args.out

    # remove logfile if exists
    log_name = base + '.amptk-taxonomy.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    # initialize script, log system info and usearch version
    amptklib.SystemInfo()
    # Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    # get number of cpus
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()

    # Setup DB locations and names, etc
    # Each entry is (global alignment UDB, UTAX UDB, SINTAX UDB).
    DBdir = os.path.join(parentdir, 'DB')
    DataBase = {
        'ITS1': (os.path.join(DBdir, 'ITS.udb'),
                 os.path.join(DBdir, 'ITS1_UTAX.udb'),
                 os.path.join(DBdir, 'ITS_SINTAX.udb')),
        'ITS2': (os.path.join(DBdir, 'ITS.udb'),
                 os.path.join(DBdir, 'ITS2_UTAX.udb'),
                 os.path.join(DBdir, 'ITS_SINTAX.udb')),
        'ITS': (os.path.join(DBdir, 'ITS.udb'),
                os.path.join(DBdir, 'ITS_UTAX.udb'),
                os.path.join(DBdir, 'ITS_SINTAX.udb')),
        '16S': (os.path.join(DBdir, '16S.udb'),
                os.path.join(DBdir, '16S.udb'),
                os.path.join(DBdir, '16S_SINTAX.udb')),
        'LSU': (os.path.join(DBdir, 'LSU.udb'),
                os.path.join(DBdir, 'LSU_UTAX.udb'),
                os.path.join(DBdir, 'LSU_SINTAX.udb')),
        'COI': (os.path.join(DBdir, 'COI.udb'),
                os.path.join(DBdir, 'COI_UTAX.udb'),
                os.path.join(DBdir, 'COI_SINTAX.udb'))
    }

    # get DB names up front
    if args.db in DataBase:
        usearch_db, utax_db, sintax_db = DataBase.get(args.db)
        if not utax_db:
            utax_db = args.utax_db
        if not usearch_db:
            usearch_db = args.usearch_db
    else:
        utax_db = args.utax_db
        usearch_db = args.usearch_db
        if args.fasta_db:
            sintax_db = args.fasta_db
        else:
            sintax_db = args.usearch_db

    if args.method in ['hybrid', 'usearch', 'utax']:
        if not utax_db and not usearch_db and not args.fasta_db:
            amptklib.log.error(
                "You have not selected a database, need either --db, --utax_db, --usearch_db, or --fasta_db"
            )
            sys.exit(1)

    # check that the DB exists for single-classifier runs. For SINTAX the
    # database actually used is --fasta_db when given (it overrides any
    # auto-detected one further down), so that is the one verified.
    db_to_check = {
        'usearch': ('USEARCH', usearch_db),
        'sintax': ('SINTAX', args.fasta_db or sintax_db),
        'utax': ('UTAX', utax_db),
    }
    if args.method in db_to_check:
        db_label, db_path = db_to_check[args.method]
        if db_path and not amptklib.checkfile(db_path):
            amptklib.log.error(
                '{:} DB not found: {:}'.format(db_label, db_path))
            amptklib.log.error(
                'Use `amptk install` to install pre-formatted databases or `amptk database` to create custom DB'
            )
            sys.exit(1)

    custom_db = None
    if args.add2db:
        # means user wants to add sequences to the usearch database on the
        # fly, so will need to rebuild database
        custom_db = base + '.custom_database.fa'
        if amptklib.checkfile(custom_db):
            amptklib.SafeRemove(custom_db)
        if args.db:
            # this means that the fasta files need to be extracted
            amptklib.log.info("Adding {:} to the {:} database".format(
                os.path.basename(args.add2db), os.path.basename(usearch_db)))
            cmd = ['vsearch', '--udb2fasta', usearch_db,
                   '--output', custom_db]
            amptklib.runSubprocess(cmd, amptklib.log)
            with open(custom_db, 'a') as outfile:
                with open(args.add2db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)
        elif args.fasta_db:
            amptklib.log.info("Adding {:} to the {:} database".format(
                os.path.basename(args.add2db),
                os.path.basename(args.fasta_db)))
            with open(custom_db, 'w') as outfile:
                with open(args.fasta_db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)
                with open(args.add2db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)

    # Count records
    amptklib.log.info("Loading FASTA Records")
    total = amptklib.countfasta(args.fasta)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs')

    # declare output files/variables here
    blast_out = base + '.blast.txt'
    rdp_out = base + '.rdp.txt'
    # Must differ from usearch_out: the hybrid branch decides whether UTAX
    # ran by checking that this file exists.
    utax_out = base + '.utax.txt'
    usearch_out = base + '.usearch.txt'
    sintax_out = base + '.sintax.txt'
    otuDict = {}

    if not args.taxonomy:
        # start with less common uses, i.e. Blast, rdp
        if args.method == 'blast':
            # check if command line blast installed
            if not amptklib.which('blastn'):
                amptklib.log.error("BLASTN not found in your PATH, exiting.")
                sys.exit(1)

            # now run blast remotely using NCBI nt database
            outformat = "6 qseqid sseqid pident stitle"
            if args.local_blast:
                # get number of cpus
                amptklib.log.info("Running local BLAST using db: %s"
                                  % args.local_blast)
                cmd = ['blastn', '-num_threads', str(cpus),
                       '-query', args.fasta,
                       '-db', os.path.abspath(args.local_blast),
                       '-max_target_seqs', '1', '-outfmt', outformat,
                       '-out', blast_out]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:
                amptklib.log.info(
                    "Running BLASTN using NCBI remote nt database, this may take awhile"
                )
                cmd = ['blastn', '-query', args.fasta, '-db', 'nt', '-remote',
                       '-max_target_seqs', '1', '-outfmt', outformat,
                       '-out', blast_out]
                amptklib.runSubprocess(cmd, amptklib.log)

            # load results and reformat
            new = []
            with open(blast_out) as blast_handle:
                for col in csv.reader(blast_handle, delimiter=str('\t')):
                    query = col[0]
                    gbID = col[1].split("|")[3]
                    pident = col[2]
                    name = col[3]
                    tax = gbID + ";" + name + " (" + pident + ")"
                    new.append([query, tax])
            otuDict = dict(new)

        elif args.method == 'rdp':
            # check that classifier is installed
            # Popen raises OSError when the executable cannot be launched.
            try:
                subprocess.Popen(
                    ['java', '-Xmx2000m', '-jar', args.rdp, 'classify'],
                    stdout=subprocess.PIPE).communicate()
            except OSError:
                amptklib.log.error("%s not found in your PATH, exiting."
                                   % args.rdp)
                sys.exit(1)

            # RDP database
            amptklib.log.info("Using RDP classifier %s training set"
                              % args.rdp_tax)

            # run RDP
            cmd = ['java', '-Xmx2000m', '-jar', args.rdp, 'classify',
                   '-g', args.rdp_tax, '-o', rdp_out, '-f', 'fixrank',
                   args.fasta]
            amptklib.runSubprocess(cmd, amptklib.log)

            # load in results and put into dictionary
            new = []
            removal = ["unidentified", "Incertae", "uncultured", "incertae"]
            remove_exp = [re.compile(x) for x in removal]
            # Fixrank columns read here: name at 2,5,8,11,14,17 and the
            # matching confidence at 4,7,10,13,16,19 (kingdom -> genus).
            with open(rdp_out) as rdp_handle:
                for col in csv.reader(rdp_handle, delimiter=str('\t')):
                    if float(col[19]) > args.rdp_cutoff:
                        tax = ("RDP;k:" + col[2] + ",p:" + col[5] +
                               ",c:" + col[8] + ",o:" + col[11] +
                               ",f:" + col[14] + ",g:" + col[17])
                    elif float(col[16]) > args.rdp_cutoff:
                        tax = ("RDP;k:" + col[2] + ",p:" + col[5] +
                               ",c:" + col[8] + ",o:" + col[11] +
                               ",f:" + col[14])
                    elif float(col[13]) > args.rdp_cutoff:
                        tax = ("RDP;k:" + col[2] + ",p:" + col[5] +
                               ",c:" + col[8] + ",o:" + col[11])
                    elif float(col[10]) > args.rdp_cutoff:
                        tax = ("RDP;k:" + col[2] + ",p:" + col[5] +
                               ",c:" + col[8])
                    elif float(col[7]) > args.rdp_cutoff:
                        tax = "RDP;k:" + col[2] + ",p:" + col[5]
                    elif float(col[4]) > args.rdp_cutoff:
                        tax = "RDP;k:" + col[2]
                    else:
                        tax = "RDP;k:unclassified"
                    # drop ranks whose name matches an uninformative term
                    kept = [
                        s for s in tax.split(",")
                        if not any(pat.search(s) for pat in remove_exp)
                    ]
                    new.append([col[0], ",".join(kept)])
            otuDict = dict(new)

        else:
            # check status of USEARCH DB and run
            if args.method in ['hybrid', 'usearch']:
                # Database precedence: --fasta_db, then the on-the-fly custom
                # database, then the resolved USEARCH database.
                search_db = None
                search_msg = None
                if args.fasta_db:
                    search_db = args.fasta_db
                    search_msg = (
                        "Global alignment OTUs with usearch_global (VSEARCH) against {:}"
                        .format(os.path.basename(args.fasta_db)))
                elif custom_db:
                    search_db = custom_db
                    search_msg = (
                        "Global alignment OTUs with usearch_global (VSEARCH) against custom DB"
                    )
                elif usearch_db:
                    search_db = usearch_db
                    search_msg = (
                        "Global alignment OTUs with usearch_global (VSEARCH) against {:}"
                        .format(os.path.basename(usearch_db)))
                if search_db:
                    # now run through usearch global
                    amptklib.log.info(search_msg)
                    cmd = ['vsearch', '--usearch_global', args.fasta,
                           '--db', os.path.abspath(search_db),
                           '--userout', usearch_out,
                           '--id', str(args.usearch_cutoff),
                           '--strand', 'both', '--output_no_hits',
                           '--maxaccepts', '0', '--top_hits_only',
                           '--userfields', 'query+target+id',
                           '--notrunclabels', '--threads', str(cpus)]
                    amptklib.runSubprocess(cmd, amptklib.log)

            if args.method in ['hybrid', 'utax']:
                if utax_db:
                    # now run through UTAX
                    amptklib.log.info("Classifying OTUs with UTAX (USEARCH)")
                    cutoff = str(args.utax_cutoff)
                    cmd = [usearch, '-utax', args.fasta, '-db', utax_db,
                           '-utaxout', utax_out, '-utax_cutoff', cutoff,
                           '-strand', 'plus', '-notrunclabels',
                           '-threads', str(cpus)]
                    amptklib.runSubprocess(cmd, amptklib.log)
                else:
                    amptklib.log.error("UTAX DB %s not found, skipping"
                                       % utax_db)

            if args.method in ['hybrid', 'sintax']:
                if args.fasta_db:
                    # if you pass fasta file here, over ride any auto detection
                    sintax_db = args.fasta_db
                if sintax_db:
                    # now run sintax
                    amptklib.log.info(
                        "Classifying OTUs with SINTAX (USEARCH)")
                    cmd = [usearch, '-sintax', args.fasta,
                           '-db', os.path.abspath(sintax_db),
                           '-tabbedout', sintax_out,
                           '-sintax_cutoff', str(args.sintax_cutoff),
                           '-strand', 'both', '-threads', str(cpus)]
                    amptklib.runSubprocess(cmd, amptklib.log)
                else:
                    # mirrors the UTAX branch instead of crashing in abspath
                    amptklib.log.error("SINTAX DB %s not found, skipping"
                                       % sintax_db)

            # now process results, load into dictionary - slightly different
            # depending on which classification was run.
            if args.method == 'hybrid':
                # run upgraded method, first load dictionaries with results
                if amptklib.checkfile(utax_out):
                    utaxDict = amptklib.classifier2dict(
                        utax_out, args.utax_cutoff)
                    amptklib.log.debug(
                        'UTAX results parsed, resulting in {:,} taxonomy predictions'
                        .format(len(utaxDict)))
                else:
                    amptklib.log.info('UTAX results empty')
                    utaxDict = {}
                if amptklib.checkfile(sintax_out):
                    sintaxDict = amptklib.classifier2dict(
                        sintax_out, args.sintax_cutoff)
                    amptklib.log.debug(
                        'SINTAX results parsed, resulting in {:,} taxonomy predictions'
                        .format(len(sintaxDict)))
                else:
                    amptklib.log.info('SINTAX results empty')
                    sintaxDict = {}
                usearchDict = amptklib.usearchglobal2dict(usearch_out)
                amptklib.log.debug(
                    'Global alignment results parsed, resulting in {:,} taxonomy predictions'
                    .format(len(usearchDict)))
                otuList = natsorted(list(usearchDict.keys()))
                # first compare classifier results, getting better of the two
                bestClassify = amptklib.bestclassifier(
                    utaxDict, sintaxDict, otuList)
                # now get best taxonomy by comparing to global alignment results
                otuDict = amptklib.bestTaxonomy(usearchDict, bestClassify)
                amptklib.log.debug(
                    'Combined OTU taxonomy dictionary contains {:,} taxonomy predictions'
                    .format(len(otuDict)))
                if len(otuDict) < 1:
                    amptklib.log.info('Parsing taxonomy failed -- see logfile')
                    sys.exit(1)

            elif args.method == 'utax' and amptklib.checkfile(utax_out):
                # load results into dictionary for appending to OTU table
                amptklib.log.debug("Loading UTAX results into dictionary")
                with open(utax_out, 'r') as infile:
                    reader = csv.reader(infile, delimiter=str("\t"))
                    otuDict = {rows[0]: 'UTAX;' + rows[2] for rows in reader}

            elif args.method == 'usearch' and amptklib.checkfile(usearch_out):
                # load results into dictionary for appending to OTU table
                amptklib.log.debug(
                    "Loading Global Alignment results into dictionary")
                otuDict = {}
                usearchDict = amptklib.usearchglobal2dict(usearch_out)
                for k, v in natsorted(list(usearchDict.items())):
                    pident = "{0:.1f}".format(float(v[0]) * 100)
                    ID = v[1]
                    tax = ','.join(v[-1])
                    LCA = v[2]
                    # 'GS' prefix when v[2] is empty, 'GSL' otherwise
                    prefix = 'GS|' if LCA == '' else 'GSL|'
                    otuDict[k] = prefix + pident + '|' + ID + ';' + tax

            elif args.method == 'sintax' and amptklib.checkfile(sintax_out):
                # load results into dictionary for appending to OTU table
                amptklib.log.debug("Loading SINTAX results into dictionary")
                with open(sintax_out, 'r') as infile:
                    reader = csv.reader(infile, delimiter=str("\t"))
                    otuDict = {rows[0]: 'SINTAX;' + rows[3]
                               for rows in reader}
    else:
        # you have supplied a two column taxonomy file, parse and build otuDict
        amptklib.log.debug("Loading custom Taxonomy into dictionary")
        with open(args.taxonomy, 'r') as infile:
            reader = csv.reader(infile, delimiter=str("\t"))
            otuDict = {rows[0]: rows[1] for rows in reader}

    # now format results
    if args.otu_table:
        # check if otu_table variable is empty, then load in otu table
        amptklib.log.info("Appending taxonomy to OTU table and OTUs")
        taxTable = base + '.otu_table.taxonomy.txt'
        tmpTable = base + '.otu_table.tmp'

        # append to OTU table
        header_prefixes = ("#OTU", "OTUId")
        apply_filter = args.tax_filter and not args.method == 'blast'
        counts = 0
        with open(taxTable, 'w') as outTable:
            with open(args.otu_table, 'r') as inTable:
                # guess the delimiter format
                firstline = inTable.readline()
                dialect = amptklib.guess_csv_dialect(firstline)
                inTable.seek(0)
                # parse OTU table
                reader = csv.reader(inTable, dialect)
                for line in reader:
                    is_header = line[0].startswith(header_prefixes)
                    if is_header:
                        line.append('Taxonomy')
                    else:
                        line.append(otuDict.get(line[0]) or "No Hit")
                    if apply_filter:
                        if not is_header:
                            # drop OTUs whose taxonomy lacks the filter term
                            if args.tax_filter not in line[-1]:
                                continue
                            counts += 1
                    else:
                        counts += 1
                    join_line = '\t'.join(str(x) for x in line)
                    outTable.write("%s\n" % join_line)

        if args.tax_filter:
            if args.method == 'blast':
                amptklib.log.info(
                    "Blast is incompatible with --tax_filter, use a different method"
                )
                tmpTable = args.otu_table
            else:
                nonfungal = total - counts
                amptklib.log.info(
                    "Found %i OTUs not matching %s, writing %i %s hits to taxonomy OTU table"
                    % (nonfungal, args.tax_filter, counts, args.tax_filter))
                # need to create a filtered table without taxonomy for BIOM output
                with open(tmpTable, 'w') as output:
                    with open(taxTable, 'r') as infile:
                        firstline = infile.readline()
                        dialect = amptklib.guess_csv_dialect(firstline)
                        infile.seek(0)
                        # parse OTU table
                        reader = csv.reader(infile, dialect)
                        for line in reader:
                            del line[-1]
                            join_line = '\t'.join(str(x) for x in line)
                            output.write("%s\n" % join_line)
        else:
            tmpTable = args.otu_table

    # append to OTUs
    otuTax = base + '.otus.taxonomy.fa'
    with open(otuTax, 'w') as output:
        with open(args.fasta, 'r') as infile:
            for rec in SeqIO.parse(infile, 'fasta'):
                rec.description = otuDict.get(rec.id) or "No hit"
                SeqIO.write(rec, output, 'fasta')

    if not args.taxonomy:
        # output final taxonomy in two-column format, followed by the hits for
        # usearch/sintax/utax if hybrid is used.
        taxFinal = base + '.taxonomy.txt'
        with open(taxFinal, 'w') as finaltax:
            if args.method == 'hybrid':
                finaltax.write('#OTUID\ttaxonomy\tUSEARCH\tSINTAX\tUTAX\n')
                for k, v in natsorted(list(otuDict.items())):
                    if k in usearchDict:
                        usearchResult = ','.join(usearchDict.get(k)[-1])
                    else:
                        usearchResult = 'No hit'
                    if k in sintaxDict:
                        sintaxResult = ','.join(sintaxDict.get(k)[-1])
                    else:
                        sintaxResult = 'No hit'
                    if k in utaxDict:
                        utaxResult = ','.join(utaxDict.get(k)[-1])
                    else:
                        utaxResult = 'No hit'
                    finaltax.write('{:}\t{:}\t{:}\t{:}\t{:}\n'.format(
                        k, v, usearchResult, sintaxResult, utaxResult))
            else:
                finaltax.write('#OTUID\ttaxonomy\n')
                for k, v in natsorted(list(otuDict.items())):
                    finaltax.write('%s\t%s\n' % (k, v))
    else:
        taxFinal = args.taxonomy

    # convert taxonomy to qiime format for biom
    qiimeTax = None
    if not args.method == 'blast':
        qiimeTax = base + '.qiime.taxonomy.txt'
        amptklib.utax2qiime(taxFinal, qiimeTax)
    else:
        amptklib.log.error(
            "Blast taxonomy is not compatible with BIOM output, use a different method"
        )

    # create OTU phylogeny for downstream processes
    amptklib.log.info("Generating phylogenetic tree")
    tree_out = base + '.tree.phy'
    cmd = [usearch, '-cluster_agg', args.fasta, '-treeout', tree_out]
    amptklib.runSubprocess(cmd, amptklib.log)

    # print some summary file locations
    amptklib.log.info("Taxonomy finished: %s" % taxFinal)
    if args.otu_table and not args.method == 'blast':
        amptklib.log.info("Classic OTU table with taxonomy: %s" % taxTable)
        # output final OTU table in Biom v1.0 (i.e. json format if biom
        # installed)
        outBiom = base + '.biom'
        if amptklib.which('biom'):
            amptklib.removefile(outBiom)
            cmd = ['biom', 'convert', '-i', tmpTable,
                   '-o', outBiom + '.tmp', '--table-type', "OTU table",
                   '--to-json']
            amptklib.runSubprocess(cmd, amptklib.log)
            if args.mapping_file:
                # Every sample column of the OTU table must appear exactly
                # once in the mapping file before metadata is attached.
                mapSamples = []
                repeatSamples = []
                with open(args.mapping_file, 'r') as mapin:
                    for line in mapin:
                        line = line.rstrip()
                        if line.startswith('#'):
                            continue
                        sampleID = line.split('\t')[0]
                        if sampleID not in mapSamples:
                            mapSamples.append(sampleID)
                        else:
                            repeatSamples.append(sampleID)
                otuSamples = []
                with open(tmpTable, 'r') as otuin:
                    for line in otuin:
                        line = line.rstrip()
                        if line.startswith('#'):
                            otuSamples = line.split('\t')[1:]
                missingMap = [s for s in otuSamples if s not in mapSamples]
                if len(missingMap) > 0:
                    amptklib.log.error(
                        "%s are missing from mapping file (metadata), skipping biom file creation"
                        % ', '.join(missingMap))
                elif len(repeatSamples) > 0:
                    amptklib.log.error(
                        '%s duplicate sample IDs in mapping file, skipping biom file creation'
                        % ', '.join(repeatSamples))
                else:
                    if qiimeTax:
                        cmd = ['biom', 'add-metadata',
                               '-i', outBiom + '.tmp', '-o', outBiom,
                               '--observation-metadata-fp', qiimeTax,
                               '-m', args.mapping_file,
                               '--sc-separated', 'taxonomy',
                               '--output-as-json']
                    else:
                        cmd = ['biom', 'add-metadata',
                               '-i', outBiom + '.tmp', '-o', outBiom,
                               '-m', args.mapping_file, '--output-as-json']
                    amptklib.runSubprocess(cmd, amptklib.log)
            else:
                cmd = ['biom', 'add-metadata', '-i', outBiom + '.tmp',
                       '-o', outBiom, '--observation-metadata-fp', qiimeTax,
                       '--sc-separated', 'taxonomy', '--output-as-json']
                amptklib.runSubprocess(cmd, amptklib.log)
            amptklib.removefile(outBiom + '.tmp')
            amptklib.log.info("BIOM OTU table created: %s" % outBiom)
        else:
            amptklib.log.info(
                "biom program not installed, install via `pip install biom-format` or `conda install biom-format`"
            )
    amptklib.log.info("OTUs with taxonomy: %s" % otuTax)
    amptklib.log.info("OTU phylogeny: %s" % tree_out)

    # clean up intermediate files
    if not args.debug:
        for i in [utax_out, usearch_out, sintax_out, qiimeTax,
                  base + '.otu_table.tmp']:
            if i:
                amptklib.removefile(i)
    print("-------------------------------------------------------")
def main(args):
    """Run the DADA2 pipeline on demultiplexed amptk FASTQ reads.

    Steps, in order: parse options, check the Rscript/DADA2 installation,
    strip padding from the input reads, quality filter with vsearch, split
    the reads into one file per sample, call the DADA2 R script that sits
    next to amptklib, write the inferred ASVs to FASTA, optionally run
    reference based chimera filtering, validate ASV orientation, map reads
    to ASVs, cluster ASVs into OTUs and map reads to those OTUs.

    Args:
        args: list of command line arguments (without the program name)
            handed to ``argparse``.

    Exits with status 1 when the DADA2 version check fails or when the R
    script does not produce its output table.
    """
    parser = argparse.ArgumentParser(
        prog='amptk-dada2.py',
        description='''Script takes output from amptk pre-processing and runs DADA2''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--fastq', required=True,
                        help='Input Demuxed containing FASTQ')
    parser.add_argument('-o', '--out', help='Output Basename')
    parser.add_argument('-m', '--min_reads', default=10, type=int,
                        help="Minimum number of reads after Q filtering to run DADA2 on")
    parser.add_argument('-l', '--length', type=int,
                        help='Length to truncate reads')
    parser.add_argument('-e', '--maxee', default='1.0',
                        help='MaxEE quality filtering')
    parser.add_argument('-p', '--pct_otu', default='97',
                        help="Biological OTU Clustering Percent")
    parser.add_argument('--platform', default='ion',
                        choices=['ion', 'illumina', '454'],
                        help='Sequencing platform')
    parser.add_argument('--chimera_method', default='consensus',
                        choices=['consensus', 'pooled', 'per-sample'],
                        help='bimera removal method')
    parser.add_argument('--uchime_ref',
                        help='Run UCHIME REF [ITS,16S,LSU,COI,custom]')
    parser.add_argument('--pool', action='store_true',
                        help='Pool all sequences together for DADA2')
    parser.add_argument('--debug', action='store_true',
                        help='Keep all intermediate files')
    parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--cpus', type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))
    dada2script = os.path.join(parentdir, 'dada2_pipeline_nofilt.R')

    #get basename if not args.out passed
    if args.out:
        base = args.out
    elif 'demux' in args.fastq:
        base = os.path.basename(args.fastq).split('.demux')[0]
    else:
        base = os.path.basename(args.fastq).split('.f')[0]

    #remove logfile if exists
    log_name = base + '.amptk-dada2.log'
    if os.path.isfile(log_name):
        amptklib.removefile(log_name)

    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cores (kept as a string, it is only used in command lines)
    if args.cpus:
        CORES = str(args.cpus)
    else:
        CORES = str(amptklib.getCPUS())

    #check dependencies
    programs = ['Rscript']
    amptklib.CheckDependencies(programs)
    Rversions = amptklib.checkRversion()
    #NOTE(review): R_pass is not compared against anything below, only the
    #DADA2 version is enforced
    R_pass = '******'
    dada2_pass = '******'

    #check dada2 first, if good move on, otherwise issue warning
    if not amptklib.gvc(Rversions[1], dada2_pass):
        amptklib.log.error("R v%s; DADA2 v%s detected, need atleast v%s" %
                           (Rversions[0], Rversions[1], dada2_pass))
        amptklib.log.error(
            "See: http://benjjneb.github.io/dada2/dada-installation.html")
        sys.exit(1)
    amptklib.log.info("R v%s; DADA2 v%s" % (Rversions[0], Rversions[1]))

    #Count FASTQ records and remove 3' N's as dada2 can't handle them
    amptklib.log.info("Loading FASTQ Records")
    no_ns = base + '.cleaned_input.fq'
    if args.fastq.endswith('.gz'):
        #decompressed copy is written to the current directory
        fastqInput = os.path.basename(args.fastq.replace('.gz', ''))
        amptklib.Funzip(os.path.abspath(args.fastq), fastqInput, CORES)
    else:
        #use the full path so input files outside the current directory work
        fastqInput = os.path.abspath(args.fastq)
    amptklib.fastq_strip_padding(fastqInput, no_ns)
    demuxtmp = base + '.original.fa'
    cmd = ['vsearch', '--fastq_filter', os.path.abspath(no_ns),
           '--fastq_qmax', '55', '--fastaout', demuxtmp, '--threads', CORES]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(demuxtmp)
    size = amptklib.checkfastqsize(no_ns)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')

    def percent_of_input(count):
        #an empty input file must not crash the summary lines
        if not orig_total:
            return 0.0
        return count / float(orig_total) * 100

    #quality filter
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    derep = base + '.qual-filtered.fq'
    filtercmd = ['vsearch', '--fastq_filter', no_ns,
                 '--fastq_maxee', str(args.maxee), '--fastqout', derep,
                 '--fastq_qmax', '55', '--fastq_maxns', '0',
                 '--threads', CORES]
    amptklib.runSubprocess(filtercmd, amptklib.log)
    total = amptklib.countfastq(derep)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #split into individual files
    amptklib.log.info("Splitting FASTQ file by Sample into individual files")
    filtfolder = base + '_filtered'
    if os.path.isdir(filtfolder):
        shutil.rmtree(filtfolder)
    os.makedirs(filtfolder)
    splitDemux2(derep, filtfolder, args=args)

    #check for minimum number of reads in each sample
    remove = [
        x for x in os.listdir(filtfolder)
        if x.endswith('.fastq')
        and amptklib.countfastq(os.path.join(filtfolder, x)) < args.min_reads
    ]
    if remove:
        amptklib.log.info("Dropping %s as fewer than %i reads" %
                          (', '.join(remove), args.min_reads))
        for y in remove:
            os.remove(os.path.join(filtfolder, y))

    #now run DADA2 on filtered folder
    amptklib.log.info("Running DADA2 pipeline")
    dada2log = base + '.dada2.Rscript.log'
    dada2out = base + '.dada2.csv'
    #a table left over from an earlier run would hide a failed R run
    if os.path.isfile(dada2out):
        os.remove(dada2out)
    #check pooling vs notpooled, default is not pooled.
    if args.pool:
        POOL = 'TRUE'
    else:
        POOL = 'FALSE'
    with open(dada2log, 'w') as logfile:
        subprocess.call(['Rscript', '--vanilla', dada2script, filtfolder,
                         dada2out, args.platform, POOL, CORES,
                         args.chimera_method],
                        stdout=logfile, stderr=logfile)

    #check for results
    if not os.path.isfile(dada2out):
        amptklib.log.error("DADA2 run failed, please check %s logfile" % dada2log)
        sys.exit(1)

    #now process the output, pull out fasta, rename, etc
    fastaout = base + '.otus.tmp'
    OTUCounts = {}
    counter = 1
    with open(fastaout, 'w') as writefasta:
        with open(dada2out, 'r') as dada2csv:
            #first line is the header, tolerate a completely empty file
            next(dada2csv, None)
            for line in dada2csv:
                line = line.replace('\n', '').replace('"', '')
                if not line:
                    continue
                cols = line.split(',')
                Seq = cols[0]
                counts = sum(int(x) for x in cols[1:])
                ID = 'ASV' + str(counter)
                if ID not in OTUCounts:
                    OTUCounts[ID] = counts
                writefasta.write(">%s\n%s\n" % (ID, Seq))
                counter += 1

    #get number of bimeras from logfile, fall back to the number of ASVs
    #written above when the R log has no summary line
    bimeras = 0
    totalSeqs = len(OTUCounts)
    validSeqs = totalSeqs
    foundSummary = False
    with open(dada2log, 'r') as bimeracheck:
        for line in bimeracheck:
            if line.startswith('Identified '):
                bimeraline = line.split(' ')
                bimeras = int(bimeraline[1])
                totalSeqs = int(bimeraline[5])
                validSeqs = totalSeqs - bimeras
                foundSummary = True
    if not foundSummary:
        amptklib.log.debug("No bimera summary found in %s" % dada2log)
    amptklib.log.info('{0:,}'.format(totalSeqs) +
                      ' total amplicon sequence variants (ASVs)')
    amptklib.log.info('{0:,}'.format(bimeras) + ' denovo chimeras removed')
    amptklib.log.info('{0:,}'.format(validSeqs) + ' valid ASVs')

    #optional UCHIME Ref
    chimeraFreeTable = base + '.otu_table.txt'
    iSeqs = base + '.ASVs.fa'
    #uchime_db stays None whenever reference chimera filtering is skipped
    uchime_db = None
    if args.uchime_ref:
        #R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy
        if args.uchime_ref in ['ITS', '16S', 'LSU', 'COI']:
            #test if it is one that is setup, otherwise default to full path
            udb = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb')
            extracted = os.path.join(parentdir, 'DB',
                                     args.uchime_ref + '.extracted.fa')
            if not os.path.isfile(udb):
                amptklib.log.error(
                    "Database not properly configured, run `amptk install` to setup DB, skipping chimera filtering"
                )
            else:
                #since uchime cannot work with udb database, need to extract fasta sequences
                if not amptklib.checkfile(extracted):
                    cmd = ['vsearch', '--udb2fasta', udb,
                           '--output', extracted]
                    amptklib.runSubprocess(cmd, amptklib.log)
                uchime_db = extracted
        elif os.path.isfile(args.uchime_ref):
            uchime_db = os.path.abspath(args.uchime_ref)
        else:
            amptklib.log.error(
                "%s is not a valid file, skipping reference chimera filtering"
                % args.uchime_ref)

    if uchime_db is None:
        #no filtering requested or possible, the DADA2 output is the ASV set
        os.rename(fastaout, iSeqs)
    else:
        #check if file is present, remove from previous run if it is.
        if os.path.isfile(iSeqs):
            amptklib.removefile(iSeqs)
        #now run chimera filtering if all checks out
        amptklib.log.info("Chimera Filtering (VSEARCH) using %s DB" % args.uchime_ref)
        cmd = ['vsearch', '--mindiv', '1.0', '--uchime_ref', fastaout,
               '--db', uchime_db, '--nonchimeras', iSeqs, '--threads', CORES]
        amptklib.runSubprocess(cmd, amptklib.log)
        total = amptklib.countfasta(iSeqs)
        uchime_chimeras = validSeqs - total
        amptklib.log.info('{0:,}'.format(total) + ' ASVs passed, ' +
                          '{0:,}'.format(uchime_chimeras) + ' ref chimeras removed')
        if os.path.isfile(fastaout):
            amptklib.removefile(fastaout)

    #setup output files
    dadademux = base + '.dada2.map.uc'
    bioSeqs = base + '.cluster.otus.fa'
    bioTable = base + '.cluster.otu_table.txt'
    uctmp = base + '.map.uc'
    ClusterComp = base + '.ASVs2clusters.txt'

    #Filter out ASVs in wrong orientation
    amptklib.log.info('Validating ASV orientation')
    os.rename(iSeqs, iSeqs + '.bak')
    numKept, numDropped = amptklib.validateorientationDADA2(
        OTUCounts, iSeqs + '.bak', iSeqs)
    amptklib.log.info('{:,} ASVs validated ({:,} dropped)'.format(
        numKept, numDropped))
    amptklib.SafeRemove(iSeqs + '.bak')

    #map reads to DADA2 OTUs
    amptklib.log.info("Mapping reads to DADA2 ASVs")
    cmd = ['vsearch', '--usearch_global', demuxtmp, '--db', iSeqs,
           '--id', '0.97', '--uc', dadademux, '--strand', 'plus',
           '--otutabout', chimeraFreeTable, '--threads', CORES]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.line_count2(dadademux)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to ASVs ' +
                      '({0:.0f}%)'.format(percent_of_input(total)))

    #cluster
    amptklib.log.info("Clustering ASVs at %s%% to generate biological OTUs" %
                      args.pct_otu)
    radius = float(args.pct_otu) / 100.
    cmd = ['vsearch', '--cluster_smallmem', iSeqs, '--centroids', bioSeqs,
           '--id', str(radius), '--strand', 'plus', '--relabel', 'OTU',
           '--qmask', 'none', '--usersort', '--threads', CORES]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(bioSeqs)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs generated')

    #determine where iSeqs clustered
    iSeqmap = base + '.ASV_map.uc'
    cmd = ['vsearch', '--usearch_global', iSeqs, '--db', bioSeqs,
           '--id', str(radius), '--uc', iSeqmap, '--strand', 'plus',
           '--threads', CORES]
    amptklib.runSubprocess(cmd, amptklib.log)
    iSeqMapped = {}
    with open(iSeqmap, 'r') as mapping:
        for line in mapping:
            cols = line.replace('\n', '').split('\t')
            #column 9 is the query label, column 10 the target label
            if len(cols) < 10:
                continue
            iSeqMapped.setdefault(cols[9], []).append(cols[8])
    with open(ClusterComp, 'w') as clusters:
        clusters.write('OTU\tASVs\n')
        for k, v in natsorted(list(iSeqMapped.items())):
            clusters.write('%s\t%s\n' % (k, ', '.join(v)))

    #create OTU table
    amptklib.log.info("Mapping reads to OTUs")
    cmd = ['vsearch', '--usearch_global', demuxtmp, '--db', bioSeqs,
           '--id', '0.97', '--uc', uctmp, '--strand', 'plus',
           '--otutabout', bioTable, '--threads', CORES]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.line_count2(uctmp)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(percent_of_input(total)))

    if not args.debug:
        amptklib.removefile(no_ns)
        shutil.rmtree(filtfolder)
        amptklib.removefile(dada2out)
        amptklib.removefile(derep)
        amptklib.removefile(demuxtmp)
        amptklib.removefile(uctmp)
        amptklib.removefile(iSeqmap)
        amptklib.removefile(dadademux)

    #Print location of files to STDOUT
    print("-------------------------------------------------------")
    print("DADA2 Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % filtfolder)
    print("Amplicon sequence variants: %s" % iSeqs)
    print("ASV OTU Table: %s" % chimeraFreeTable)
    print("Clustered OTUs: %s" % bioSeqs)
    print("OTU Table: %s" % bioTable)
    print("ASVs 2 OTUs: %s" % ClusterComp)
    print("-------------------------------------------------------")

    otu_print = bioSeqs.split('/')[-1]
    tab_print = bioTable.split('/')[-1]
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n" %
              (tab_print, otu_print))
    else:
        print(
            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
            % (tab_print, otu_print))
def main(args):
    """Run UPARSE OTU clustering on a demultiplexed FASTQ file.

    Steps, in order: parse options, convert the reads to FASTA, quality
    filter on expected errors, dereplicate, optionally denoise, sort by
    size, cluster with the USEARCH ``-cluster_otus`` command, strip padding,
    optionally run reference based chimera filtering, validate OTU
    orientation, map reads back to the OTUs and copy the OTU FASTA and OTU
    table into the current directory.

    Args:
        args: list of command line arguments (without the program name)
            handed to ``argparse``.
    """
    parser = argparse.ArgumentParser(
        prog='amptk-OTU_cluster.py',
        usage="%(prog)s [options] -i file.demux.fq\n%(prog)s -h for help menu",
        description='''Script runs UPARSE OTU clustering. Requires USEARCH by Robert C. Edgar: http://drive5.com/usearch''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--fastq', dest="FASTQ", required=True,
                        help='FASTQ file (Required)')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('-e', '--maxee', default='1.0',
                        help='Quality trim EE value')
    parser.add_argument('-p', '--pct_otu', default='97',
                        help="OTU Clustering Percent")
    parser.add_argument('-m', '--minsize', default='2',
                        help='Min size to keep for clustering')
    parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--uchime_ref',
                        help='Run UCHIME REF [ITS,16S,LSU,COI,custom]')
    parser.add_argument('--map_filtered', action='store_true',
                        help='map quality filtered reads back to OTUs')
    parser.add_argument('--unoise', action='store_true',
                        help='Run De-noising (UNOISE)')
    #the tmp folder is only deleted when --debug is NOT given
    parser.add_argument('--debug', action='store_true',
                        help='Keep intermediate files')
    parser.add_argument('--cpus', type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))

    #get basename if not args.out passed
    if args.out:
        base = args.out
    elif 'demux' in args.FASTQ:
        base = os.path.basename(args.FASTQ).split('.demux')[0]
    else:
        base = os.path.basename(args.FASTQ).split('.f')[0]

    #remove logfile if exists
    log_name = base + '.amptk-cluster.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cpus
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()

    #make tmp folder
    tmp = base + '_tmp'
    if not os.path.exists(tmp):
        os.makedirs(tmp)

    #all intermediate files share this prefix inside the tmp folder
    prefix = os.path.join(tmp, base + '.EE' + args.maxee)

    #Count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    #convert to FASTA for mapping
    orig_fasta = os.path.join(tmp, base + '.orig.fa')
    cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta,
           '--fastq_qmax', '55', '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(orig_fasta)
    size = amptklib.checkfastqsize(args.FASTQ)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')

    #Expected Errors filtering step
    filter_out = prefix + '.filter.fq'
    filter_fasta = prefix + '.filter.fa'
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    cmd = ['vsearch', '--fastq_filter', args.FASTQ,
           '--fastq_maxee', str(args.maxee), '--fastqout', filter_out,
           '--fastaout', filter_fasta, '--fastq_qmax', '55',
           '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfastq(filter_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run full length dereplication
    derep_out = prefix + '.derep.fa'
    amptklib.log.info("De-replication (remove duplicate reads)")
    cmd = ['vsearch', '--derep_fulllength', filter_fasta, '--sizeout',
           '--output', derep_out, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(derep_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #optional run UNOISE
    if args.unoise:
        unoise_out = prefix + '.denoised.fa'
        amptklib.log.info("Denoising Data with UNOISE")
        cmd = [usearch, '-cluster_fast', derep_out, '-centroids', unoise_out,
               '-id', '0.9', '--maxdiffs', '5', '-abskew', '10', '-sizein',
               '-sizeout', '-sort', 'size', '-threads', str(cpus)]
        amptklib.runSubprocess(cmd, amptklib.log)
        total = amptklib.countfasta(unoise_out)
        amptklib.log.info('{0:,}'.format(total) + ' reads passed')
    else:
        unoise_out = derep_out

    #now sort by size remove singletons
    sort_out = prefix + '.sort.fa'
    cmd = ['vsearch', '--sortbysize', unoise_out, '--minsize', args.minsize,
           '--output', sort_out, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)

    #now run clustering algorithm
    #float() accepts fractional percentages such as 98.5, '{:g}' keeps whole
    #numbers formatted exactly as before ('3' for the default 97)
    radius = '{:g}'.format(100 - float(args.pct_otu))
    otu_out = prefix + '.otus.fa'
    amptklib.log.info("Clustering OTUs (UPARSE)")
    cmd = [usearch, '-cluster_otus', sort_out, '-relabel', 'OTU',
           '-otu_radius_pct', radius, '-otus', otu_out,
           '-threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    numOTUs = amptklib.countfasta(otu_out)
    amptklib.log.info('{0:,}'.format(numOTUs) + ' OTUs')

    #clean up padded N's
    amptklib.log.info("Cleaning up padding from OTUs")
    otu_clean = prefix + '.clean.otus.fa'
    amptklib.fasta_strip_padding(otu_out, otu_clean)

    #optional UCHIME Ref
    #uchime_db stays None whenever reference chimera filtering is skipped
    uchime_db = None
    if args.uchime_ref:
        #R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy
        if args.uchime_ref in ['ITS', '16S', 'LSU', 'COI']:
            #test if it is one that is setup, otherwise default to full path
            udb = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb')
            extracted = os.path.join(parentdir, 'DB',
                                     args.uchime_ref + '.extracted.fa')
            if not os.path.isfile(udb):
                amptklib.log.error(
                    "Database not properly configured, run `amptk install` to setup DB, skipping chimera filtering"
                )
            else:
                #since uchime cannot work with udb database, need to extract fasta sequences
                if not amptklib.checkfile(extracted):
                    cmd = ['vsearch', '--udb2fasta', udb,
                           '--output', extracted]
                    amptklib.runSubprocess(cmd, amptklib.log)
                uchime_db = extracted
        elif os.path.isfile(args.uchime_ref):
            uchime_db = os.path.abspath(args.uchime_ref)
        else:
            amptklib.log.error(
                "%s is not a valid file, skipping reference chimera filtering"
                % args.uchime_ref)

    if uchime_db is None:
        uchime_out = otu_clean
    else:
        uchime_out = prefix + '.uchime.otus.fa'
        #check if file is present, remove from previous run if it is.
        if os.path.isfile(uchime_out):
            os.remove(uchime_out)
        #now run chimera filtering if all checks out
        amptklib.log.info("Chimera Filtering (VSEARCH) using %s DB" % args.uchime_ref)
        cmd = ['vsearch', '--mindiv', '1.0', '--uchime_ref', otu_clean,
               '--db', uchime_db, '--nonchimeras', uchime_out,
               '--threads', str(cpus)]
        amptklib.runSubprocess(cmd, amptklib.log)
        total = amptklib.countfasta(uchime_out)
        uchime_chimeras = numOTUs - total
        amptklib.log.info('{0:,}'.format(total) + ' OTUs passed, ' +
                          '{0:,}'.format(uchime_chimeras) + ' ref chimeras')

    #Filter out OTUs in wrong orientation
    amptklib.log.info('Validating OTU orientation')
    passingOTUs = os.path.join(tmp, base + '.passed.otus.fa')
    numKept, numDropped = amptklib.validateorientation(
        tmp, sort_out, uchime_out, passingOTUs)
    amptklib.log.info('{:,} OTUs validated ({:,} dropped)'.format(
        numKept, numDropped))

    #now map reads back to OTUs and build OTU table
    uc_out = prefix + '.mapping.uc'
    otu_table = prefix + '.otu_table.txt'
    #setup reads to map
    if args.map_filtered:
        reads = filter_fasta
    else:
        reads = orig_fasta
    amptklib.log.info("Mapping Reads to OTUs and Building OTU table")
    cmd = ['vsearch', '--usearch_global', reads, '--strand', 'plus',
           '--id', '0.97', '--db', passingOTUs, '--uc', uc_out,
           '--otutabout', otu_table, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)

    #count reads mapped
    total = amptklib.line_count2(uc_out)
    #an empty input file must not crash the summary line
    if orig_total:
        pct_mapped = total / float(orig_total) * 100
    else:
        pct_mapped = 0.0
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(pct_mapped))

    #Move files around, delete tmp if argument passed.
    currentdir = os.getcwd()
    final_otu = os.path.join(currentdir, base + '.cluster.otus.fa')
    shutil.copyfile(passingOTUs, final_otu)
    final_otu_table = os.path.join(currentdir, base + '.otu_table.txt')
    shutil.copyfile(otu_table, final_otu_table)
    if not args.debug:
        shutil.rmtree(tmp)

    #Print location of files to STDOUT
    otu_print = os.path.basename(final_otu)
    tab_print = os.path.basename(final_otu_table)
    print("-------------------------------------------------------")
    print("OTU Clustering Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % tmp)
    print("Clustered OTUs: %s" % otu_print)
    print("OTU Table: %s" % tab_print)
    print("-------------------------------------------------------")

    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n" %
              (tab_print, otu_print))
    else:
        print(
            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
            % (tab_print, otu_print))
def main(args):
    """Run the UNOISE2 denoising pipeline on a demultiplexed FASTQ file.

    Steps, in order: parse options, convert the reads to FASTA, quality
    filter on expected errors, dereplicate, denoise with the USEARCH
    ``-unoise2`` command, strip padding, optionally run reference based
    chimera filtering, validate ASV orientation, map reads to the ASVs,
    cluster the ASVs into OTUs, record which ASVs fall into which OTU, map
    reads to the OTUs and copy the clustered results into the current
    directory.

    Args:
        args: list of command line arguments (without the program name)
            handed to ``argparse``.
    """
    parser = argparse.ArgumentParser(
        prog='amptk-unoise2.py',
        usage="%(prog)s [options] -i file.demux.fq\n%(prog)s -h for help menu",
        description='''Script runs UNOISE2 algorithm. Requires USEARCH9 by Robert C. Edgar: http://drive5.com/usearch''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--fastq', dest="FASTQ", required=True,
                        help='FASTQ file (Required)')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('-e', '--maxee', default='1.0',
                        help='Quality trim EE value')
    parser.add_argument('-m', '--minsize', default='8',
                        help='Min size to keep for denoising')
    parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('-p', '--pct_otu', default='97',
                        help="Biological OTU Clustering Percent")
    parser.add_argument('--uchime_ref',
                        help='Run UCHIME2 REF [ITS,16S,LSU,COI,custom]')
    parser.add_argument('--map_filtered', action='store_true',
                        help='map quality filtered reads back to OTUs')
    #the tmp folder is only deleted when --debug is NOT given
    parser.add_argument('--debug', action='store_true',
                        help='Keep intermediate files')
    parser.add_argument('--cpus', type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))

    #get basename if not args.out passed
    if args.out:
        base = args.out
    elif 'demux' in args.FASTQ:
        base = os.path.basename(args.FASTQ).split('.demux')[0]
    else:
        base = os.path.basename(args.FASTQ).split('.f')[0]

    #remove logfile if exists
    log_name = base + '.amptk-unoise2.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cpus
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()

    #make tmp folder
    tmp = base + '_tmp'
    if not os.path.exists(tmp):
        os.makedirs(tmp)

    #all intermediate files share this prefix inside the tmp folder
    prefix = os.path.join(tmp, base + '.EE' + args.maxee)

    #Count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    #convert to FASTA for mapping
    orig_fasta = os.path.join(tmp, base + '.orig.fa')
    cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta,
           '--fastq_qmax', '55', '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(orig_fasta)
    size = amptklib.checkfastqsize(args.FASTQ)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')

    def percent_of_input(count):
        #an empty input file must not crash the summary lines
        if not orig_total:
            return 0.0
        return count / float(orig_total) * 100

    #Expected Errors filtering step
    filter_out = prefix + '.filter.fq'
    filter_fasta = prefix + '.filter.fa'
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    cmd = ['vsearch', '--fastq_filter', args.FASTQ,
           '--fastq_maxee', str(args.maxee), '--fastqout', filter_out,
           '--fastaout', filter_fasta, '--fastq_qmax', '55',
           '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfastq(filter_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run full length dereplication
    derep_out = prefix + '.derep.fa'
    amptklib.log.info("De-replication (remove duplicate reads)")
    cmd = ['vsearch', '--derep_fulllength', filter_out, '--relabel', 'Read_',
           '--sizeout', '--output', derep_out, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(derep_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run de-noiser UNOISE2
    amptklib.log.info("Denoising reads with UNOISE2")
    unoise_out = prefix + '.unoise.fa'
    cmd = [usearch, '-unoise2', derep_out, '-fastaout', unoise_out,
           '-minampsize', args.minsize, '-threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(unoise_out)
    amptklib.log.info('{0:,}'.format(total) + ' denoised sequences')

    #strip N's
    amptklib.log.info("Cleaning up padding from OTUs")
    otu_clean = prefix + '.clean.fa'
    amptklib.fasta_strip_padding(unoise_out, otu_clean)

    #run optional uchime_ref
    #uchime_db stays None whenever reference chimera filtering is skipped
    uchime_db = None
    if args.uchime_ref:
        #R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy
        if args.uchime_ref in ['ITS', '16S', 'LSU', 'COI']:
            #test if it is one that is setup, otherwise default to full path
            udb = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb')
            extracted = os.path.join(parentdir, 'DB',
                                     args.uchime_ref + '.extracted.fa')
            if not os.path.isfile(udb):
                amptklib.log.error(
                    "Database not properly configured, run `amptk install` to setup DB, skipping chimera filtering"
                )
            else:
                #since uchime cannot work with udb database, need to extract fasta sequences
                if not amptklib.checkfile(extracted):
                    cmd = ['vsearch', '--udb2fasta', udb,
                           '--output', extracted]
                    amptklib.runSubprocess(cmd, amptklib.log)
                uchime_db = extracted
        elif os.path.isfile(args.uchime_ref):
            uchime_db = os.path.abspath(args.uchime_ref)
        else:
            amptklib.log.error(
                "%s is not a valid file, skipping reference chimera filtering"
                % args.uchime_ref)

    if uchime_db is None:
        uchime_out = otu_clean
    else:
        uchime_out = prefix + '.uchime.otus.fa'
        #a result left in the tmp folder by an earlier run must not be reused
        if os.path.isfile(uchime_out):
            os.remove(uchime_out)
        #now run chimera filtering if all checks out
        amptklib.log.info("Chimera Filtering (VSEARCH)")
        cmd = ['vsearch', '--mindiv', '1.0', '--uchime_ref', otu_clean,
               '--db', uchime_db, '--nonchimeras', uchime_out,
               '--threads', str(cpus)]
        amptklib.runSubprocess(cmd, amptklib.log)
        total = amptklib.countfasta(uchime_out)
        amptklib.log.info('{0:,}'.format(total) + ' OTUs passed')

    #inferred sequences
    iSeqs = base + '.ASVs.fa'
    amptklib.fastarename(uchime_out, 'ASV', iSeqs)

    #Filter out ASVs in wrong orientation
    #NOTE(review): validation reads uchime_out rather than the renamed iSeqs
    #file and writes into the tmp folder - confirm this is intended
    amptklib.log.info('Validating ASV orientation')
    passingOTUs = os.path.join(tmp, base + '.passed.asvs.fa')
    numKept, numDropped = amptklib.validateorientation(
        tmp, derep_out, uchime_out, passingOTUs)
    amptklib.log.info('{:,} ASVs validated ({:,} dropped)'.format(
        numKept, numDropped))

    #build OTU table with iSeqs
    uc_iSeq_out = prefix + '.mapping.uc'
    iSeq_otu_table = base + '.otu_table.txt'
    #setup reads to map
    if args.map_filtered:
        reads = filter_fasta
    else:
        reads = orig_fasta
    amptklib.log.info("Mapping Reads to ASVs and Building OTU table")
    cmd = ['vsearch', '--usearch_global', reads, '--strand', 'plus',
           '--id', '0.97', '--db', passingOTUs, '--uc', uc_iSeq_out,
           '--otutabout', iSeq_otu_table, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)

    #count reads mapped
    total = amptklib.line_count2(uc_iSeq_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to ASVs ' +
                      '({0:.0f}%)'.format(percent_of_input(total)))

    #now cluster to biological OTUs with UCLUST
    radius = float(args.pct_otu) / 100.
    amptklib.log.info(
        "Clustering denoised sequences into biological OTUs at %s%%" %
        args.pct_otu)
    uclust_out = prefix + '.uclust.fa'
    cmd = ['vsearch', '--cluster_smallmem', passingOTUs,
           '--centroids', uclust_out, '--id', str(radius),
           '--strand', 'plus', '--relabel', 'OTU', '--qmask', 'none',
           '--usersort', '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(uclust_out)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs generated')

    #determine where denoised sequences clustered
    ClusterComp = base + '.ASVs2clusters.txt'
    iSeqmap = base + '.unoise_map.uc'
    cmd = [usearch, '-usearch_global', passingOTUs, '-db', uclust_out,
           '-id', str(radius), '-uc', iSeqmap, '-strand', 'plus',
           '-threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    iSeqMapped = {}
    with open(iSeqmap, 'r') as mapping:
        for line in mapping:
            cols = line.replace('\n', '').split('\t')
            #column 9 is the query label, column 10 the target label
            if len(cols) < 10:
                continue
            iSeqMapped.setdefault(cols[9], []).append(cols[8])
    with open(ClusterComp, 'w') as clusters:
        clusters.write('OTU\tASVs\n')
        for k, v in natsorted(list(iSeqMapped.items())):
            clusters.write('%s\t%s\n' % (k, ', '.join(v)))

    #now map reads back to OTUs and build OTU table
    uc_out = prefix + '.cluster.mapping.uc'
    otu_table = prefix + '.cluster.otu_table.txt'
    amptklib.log.info("Mapping Reads to OTUs and Building OTU table")
    cmd = ['vsearch', '--usearch_global', reads, '--strand', 'plus',
           '--id', '0.97', '--db', uclust_out, '--uc', uc_out,
           '--otutabout', otu_table, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)

    #count reads mapped
    total = amptklib.line_count2(uc_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(percent_of_input(total)))

    #Move files around, delete tmp if argument passed.
    currentdir = os.getcwd()
    final_otu = os.path.join(currentdir, base + '.cluster.otus.fa')
    shutil.copyfile(uclust_out, final_otu)
    final_otu_table = os.path.join(currentdir, base + '.cluster.otu_table.txt')
    shutil.copyfile(otu_table, final_otu_table)
    if not args.debug:
        shutil.rmtree(tmp)

    #Print location of files to STDOUT
    otu_print = os.path.basename(final_otu)
    tab_print = os.path.basename(final_otu_table)
    print("-------------------------------------------------------")
    print("UNOISE2 Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % tmp)
    print("Amplicon sequence variants: %s" % passingOTUs)
    print("ASV OTU Table: %s" % iSeq_otu_table)
    print("Clustered OTUs: %s" % otu_print)
    print("OTU Table: %s" % tab_print)
    print("ASVs 2 OTUs: %s" % ClusterComp)
    print("-------------------------------------------------------")

    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n" %
              (tab_print, otu_print))
    else:
        print(
            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
            % (tab_print, otu_print))