def processReadsPE(input, args=False):
    """Demultiplex a single pre-split chunk of paired-end Illumina reads.

    Expects ``<input>_R1.fq`` / ``<input>_R2.fq`` inside the module-global
    ``tmpdir`` and leaves ``<input>.demux.fq`` plus an ``<input>.stats``
    summary beside them.  Relies on module globals set up by ``main()``:
    tmpdir, FwdPrimer, RevPrimer, SampleData, Barcodes, RevBarcodes, usearch.

    NOTE: the parameter name ``input`` shadows the builtin; it is kept for
    interface compatibility with existing callers.
    """
    chunk = os.path.basename(input)

    def _tmp(suffix):
        # Every intermediate file lives next to the chunk inside tmpdir.
        return os.path.join(tmpdir, chunk + suffix)

    raw_r1 = _tmp('_R1.fq')
    raw_r2 = _tmp('_R2.fq')
    oriented_r1 = _tmp('_R1.oriented.fq')
    oriented_r2 = _tmp('_R2.oriented.fq')
    trimmed_r1 = _tmp('_R1.trimmed.fq')
    trimmed_r2 = _tmp('_R2.trimmed.fq')
    merged = _tmp('.merged.fq')
    demux_out = _tmp('.demux.fq')
    stats_out = _tmp('.stats')

    # Re-orient pairs so the forward primer sits on R1 before demuxing.
    read_len = amptklib.GuessRL(raw_r1)
    total, correct, flipped, dropped = amptklib.illuminaReorient(
        raw_r1, raw_r2, FwdPrimer, RevPrimer, args.primer_mismatch,
        read_len, oriented_r1, oriented_r2)
    amptklib.log.debug(
        'Re-oriented PE reads for {:}: {:,} total, {:,} correct, {:,} flipped, {:,} dropped.'.format(
            chunk, total, correct, flipped, dropped))

    # Assign reads to samples, stripping barcodes and primers; the variant
    # used depends on whether barcodes are anchored at the read start.
    if args.barcode_not_anchored:
        demux_func = amptklib.demuxIlluminaPE2
    else:
        demux_func = amptklib.demuxIlluminaPE
    demux_func(oriented_r1, oriented_r2, FwdPrimer, RevPrimer, SampleData,
               Barcodes, RevBarcodes, args.barcode_mismatch,
               args.primer_mismatch, trimmed_r1, trimmed_r2, stats_out)

    if args.full_length:
        # Full-length mode: merge straight into the demux output,
        # no trimming or padding afterwards.
        amptklib.MergeReadsSimple(trimmed_r1, trimmed_r2, '.', demux_out,
                                  args.min_len, usearch, 'off',
                                  args.merge_method)
    else:
        # Default mode: merge, then lossless trim/pad to the target length.
        amptklib.MergeReadsSimple(trimmed_r1, trimmed_r2, '.', merged,
                                  args.min_len, usearch, 'on',
                                  args.merge_method)
        amptklib.losslessTrim(merged, FwdPrimer, RevPrimer,
                              args.primer_mismatch, args.trim_len, args.pad,
                              args.min_len, demux_out)

    # Remove intermediates, keeping only the demuxed reads and the stats file.
    for scratch in (oriented_r1, oriented_r2, raw_r1, raw_r2, merged):
        amptklib.SafeRemove(scratch)
def main(args):
    """CLI entry point: demultiplex Illumina amplicon reads.

    Finds barcodes, strips forward/reverse primers, relabels reads by
    sample, optionally trims/pads to a fixed length, and writes a gzipped
    combined demux FASTQ plus a QIIME-style mapping file.  Populates the
    module globals consumed by the per-chunk worker functions.
    """
    global FwdPrimer, RevPrimer, SampleData, Barcodes, RevBarcodes, tmpdir, usearch
    parser = argparse.ArgumentParser(prog='amptk-process_ion.py',
                                     usage="%(prog)s [options] -i file.fastq\n%(prog)s -h for help menu",
                                     description='''Script finds barcodes, strips forward and reverse primers, relabels, and then trim/pads reads to a set length''',
                                     epilog="""Written by Jon Palmer (2015) [email protected]""",
                                     formatter_class=MyFormatter)
    parser.add_argument('-i', '--fastq', dest='fastq', required=True, help='FASTQ R1 file')
    parser.add_argument('--reverse', help='Illumina R2 reverse reads')
    parser.add_argument('-o', '--out', dest="out", default='illumina2', help='Base name for output')
    parser.add_argument('-f', '--fwd_primer', dest="F_primer", default='fITS7', help='Forward Primer')
    parser.add_argument('-r', '--rev_primer', dest="R_primer", default='ITS4', help='Reverse Primer')
    parser.add_argument('-m', '--mapping_file', help='Mapping file: QIIME format can have extra meta data columns')
    parser.add_argument('-p', '--pad', default='off', choices=['on', 'off'], help='Pad with Ns to a set length')
    parser.add_argument('--primer_mismatch', default=2, type=int, help='Number of mis-matches in primer')
    parser.add_argument('--barcode_mismatch', default=0, type=int, help='Number of mis-matches in barcode')
    parser.add_argument('--barcode_fasta', help='FASTA file containing Barcodes (Names & Sequences)')
    parser.add_argument('--barcode_not_anchored', action='store_true', help='Barcodes (indexes) are not at start of reads')
    parser.add_argument('--reverse_barcode', help='FASTA file containing 3 prime Barocdes')
    parser.add_argument('--min_len', default=100, type=int, help='Minimum read length to keep')
    parser.add_argument('-l', '--trim_len', default=300, type=int, help='Trim length for reads')
    parser.add_argument('--full_length', action='store_true', help='Keep only full length reads (no trimming/padding)')
    parser.add_argument('--merge_method', default='usearch', choices=['usearch', 'vsearch'], help='Software to use for PE read merging')
    parser.add_argument('--cpus', type=int, help="Number of CPUs. Default: auto")
    parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9', help='USEARCH EXE')
    args = parser.parse_args(args)

    # Sanitize output basename (strip non-word characters) and start logging.
    args.out = re.sub(r'\W+', '', args.out)
    log_name = args.out + '.amptk-demux.log'
    if os.path.isfile(log_name):
        os.remove(log_name)
    # NOTE(review): FNULL is opened but never closed or used in this block.
    FNULL = open(os.devnull, 'w')
    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)
    #get number of CPUs to use
    if not args.cpus:
        cpus = multiprocessing.cpu_count()
    else:
        cpus = args.cpus

    #parse a mapping file or a barcode fasta file, primers, etc get setup
    #dealing with Barcodes, get ion barcodes or parse the barcode_fasta argument
    barcode_file = args.out + ".barcodes_used.fa"
    rev_barcode_file = args.out + '.revbarcodes_used.fa'
    amptklib.SafeRemove(barcode_file)
    amptklib.SafeRemove(rev_barcode_file)
    #check if mapping file passed, use this if present, otherwise use command line arguments
    SampleData = {}
    Barcodes = {}
    RevBarcodes = {}
    FwdPrimer = ''
    RevPrimer = ''
    if args.mapping_file:
        if not os.path.isfile(args.mapping_file):
            amptklib.log.error("Mapping file not found: %s" % args.mapping_file)
            sys.exit(1)
        # Mapping file supplies samples, barcodes, and both primers at once.
        SampleData, Barcodes, RevBarcodes, FwdPrimer, RevPrimer = amptklib.parseMappingFileNEW(args.mapping_file)
    else:
        #no mapping file, so create dictionaries from barcode fasta files
        if not args.barcode_fasta:
            amptklib.log.error("You did not specify a --barcode_fasta or --mapping_file, one is required")
            sys.exit(1)
        else:
            shutil.copyfile(args.barcode_fasta, barcode_file)
            Barcodes = amptklib.fasta2barcodes(barcode_file, False)
            if args.reverse_barcode:
                shutil.copyfile(args.reverse_barcode, rev_barcode_file)
                RevBarcodes = amptklib.fasta2barcodes(rev_barcode_file, False)

        #parse primers here so doesn't conflict with mapping primers
        #look up primer db otherwise default to entry
        if args.F_primer in amptklib.primer_db:
            FwdPrimer = amptklib.primer_db.get(args.F_primer)
            amptklib.log.info("{:} fwd primer found in AMPtk primer db, setting to: {:}".format(args.F_primer, FwdPrimer))
        else:
            FwdPrimer = args.F_primer
            amptklib.log.info("{:} fwd primer not found in AMPtk primer db, assuming it is actual primer sequence.".format(args.F_primer))
        if args.R_primer in amptklib.primer_db:
            RevPrimer = amptklib.primer_db.get(args.R_primer)
            amptklib.log.info("{:} rev primer found in AMPtk primer db, setting to: {:}".format(args.R_primer, RevPrimer))
        else:
            RevPrimer = args.R_primer
            amptklib.log.info("{:} rev primer not found in AMPtk primer db, assuming it is actual primer sequence.".format(args.R_primer))

    #check if input is compressed
    gzip_list = []
    if args.fastq.endswith('.gz'):
        gzip_list.append(os.path.abspath(args.fastq))
    if args.reverse:
        if args.reverse.endswith('.gz'):
            gzip_list.append(os.path.abspath(args.reverse))
    if gzip_list:
        amptklib.log.info("Gzipped input files detected, uncompressing")
        for file in gzip_list:
            file_out = file.replace('.gz', '')
            amptklib.Funzip(file, file_out, cpus)
        # Point args at the uncompressed copies from here on.
        args.fastq = args.fastq.replace('.gz', '')
        if args.reverse:
            args.reverse = args.reverse.replace('.gz', '')

    #Count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    orig_total = amptklib.countfastq(args.fastq)
    size = amptklib.checkfastqsize(args.fastq)
    # Doubled because R1+R2 are roughly the same size on disk.
    readablesize = amptklib.convertSize(size*2)
    amptklib.log.info('{:,} reads ({:})'.format(orig_total, readablesize))

    #output barcodes/samples
    amptklib.log.info('Searching for {:} forward barcodes and {:} reverse barcodes'.format(len(Barcodes), len(RevBarcodes)))

    #create tmpdir and split input into n cpus
    # PID suffix keeps concurrent runs from colliding on the same tmpdir.
    tmpdir = args.out.split('.')[0]+'_'+str(os.getpid())
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)

    #tell user about number of cores using
    amptklib.log.info("Splitting FASTQ files over {:} cpus".format(cpus))
    if args.reverse:
        amptklib.log.info("Demuxing PE Illumina reads; FwdPrimer: {:} RevPrimer: {:}".format(FwdPrimer, RevPrimer))
    else:
        amptklib.log.info("Demuxing SE Illumina reads; FwdPrimer: {:} RevPrimer: {:}".format(FwdPrimer, amptklib.RevComp(RevPrimer)))
    amptklib.log.info('Dropping reads less than {:} bp and setting lossless trimming to {:} bp.'.format(args.min_len, args.trim_len))

    if cpus > 1:
        if args.reverse:
            # Paired-end: split into chunk_R1/chunk_R2 pairs and process each
            # chunk prefix (path without the _R1/_R2 suffix) in parallel.
            amptklib.split_fastqPE(args.fastq, args.reverse, orig_total, tmpdir, cpus*4)
            file_list = []
            for file in os.listdir(tmpdir):
                if file.endswith('.fq'):
                    filepart = os.path.join(tmpdir, file.split('_R')[0])
                    if not filepart in file_list:
                        file_list.append(filepart)
            amptklib.runMultiProgress(processReadsPE, file_list, cpus, args=args)
        else:
            #split fastq file
            amptklib.split_fastq(args.fastq, orig_total, tmpdir, cpus*4)
            #now get file list from tmp folder
            file_list = []
            for file in os.listdir(tmpdir):
                if file.endswith(".fq"):
                    file = os.path.join(tmpdir, file)
                    file_list.append(file)
            #finally process reads over number of cpus
            amptklib.runMultiProgress(processRead, file_list, cpus, args=args)
    else:
        # Single CPU: process the whole input as one "chunk".
        if args.reverse:
            shutil.copyfile(args.fastq, os.path.join(tmpdir, 'chunk_R1.fq'))
            shutil.copyfile(args.reverse, os.path.join(tmpdir, 'chunk_R2.fq'))
            processReadsPE(os.path.join(tmpdir, 'chunk'), args=args)
        else:
            shutil.copyfile(args.fastq, os.path.join(tmpdir, 'chunk.fq'))
            processRead(os.path.join(tmpdir, 'chunk.fq'), args=args)

    print("-------------------------------------------------------")
    #Now concatenate all of the demuxed files together
    amptklib.log.info("Concatenating Demuxed Files")
    tmpDemux = args.out + '.tmp.demux.fq'
    with open(tmpDemux, 'w') as outfile:
        for filename in glob.glob(os.path.join(tmpdir, '*.demux.fq')):
            if filename == tmpDemux:
                continue
            with open(filename, 'r') as readfile:
                shutil.copyfileobj(readfile, outfile)

    if args.reverse:
        #parse the stats
        # PE stats files carry 6 comma-separated counters per chunk;
        # sum them element-wise across all chunks.
        finalstats = [0, 0, 0, 0, 0, 0]
        for file in os.listdir(tmpdir):
            if file.endswith('.stats'):
                with open(os.path.join(tmpdir, file), 'r') as statsfile:
                    line = statsfile.readline()
                    line = line.rstrip()
                    newstats = line.split(',')
                    newstats = [int(i) for i in newstats]
                    for x, num in enumerate(newstats):
                        finalstats[x] += num
        amptklib.log.info('{0:,}'.format(finalstats[0])+' total reads')
        amptklib.log.info('{0:,}'.format(finalstats[0]-finalstats[1]-finalstats[3])+' valid Barcodes')
        amptklib.log.info('{0:,}'.format(finalstats[5])+' valid output reads (Barcodes and Primers)')
    else:
        #parse the stats
        # SE stats files carry 7 counters; layout differs from the PE case.
        finalstats = [0, 0, 0, 0, 0, 0, 0]
        for file in os.listdir(tmpdir):
            if file.endswith('.stats'):
                with open(os.path.join(tmpdir, file), 'r') as statsfile:
                    line = statsfile.readline()
                    line = line.rstrip()
                    newstats = line.split(',')
                    newstats = [int(i) for i in newstats]
                    for x, num in enumerate(newstats):
                        finalstats[x] += num
        amptklib.log.info('{0:,}'.format(finalstats[0])+' total reads')
        if args.reverse_barcode:
            amptklib.log.info('{0:,}'.format(finalstats[0]-finalstats[1]-finalstats[2]-finalstats[4])+' valid Fwd and Rev Barcodes')
        else:
            amptklib.log.info('{0:,}'.format(finalstats[0]-finalstats[1])+' valid Barcode')
        amptklib.log.info('{0:,}'.format(finalstats[0]-finalstats[1]-finalstats[2])+' Fwd Primer found, {0:,}'.format(finalstats[3])+' Rev Primer found')
        amptklib.log.info('{0:,}'.format(finalstats[5])+' discarded too short (< %i bp)' % args.min_len)
        amptklib.log.info('{0:,}'.format(finalstats[6])+' valid output reads')

    #clean up tmp folder
    amptklib.SafeRemove(tmpdir)

    #last thing is to re-number of reads as it is possible they could have same name from multitprocessor split
    catDemux = args.out+'.demux.fq'
    amptklib.fastqreindex(tmpDemux, catDemux)
    amptklib.SafeRemove(tmpDemux)

    #now loop through data and find barcoded samples, counting each.....
    BarcodeCount = {}
    # NOTE(review): ``input`` here shadows the builtin.
    with open(catDemux, 'r') as input:
        # islice step 4 visits only FASTQ header lines.
        header = itertools.islice(input, 0, None, 4)
        for line in header:
            # Sample label is taken from the 'label=NAME;' part of the header.
            ID = line.split("=", 1)[-1].split(";")[0]
            if ID not in BarcodeCount:
                BarcodeCount[ID] = 1
            else:
                BarcodeCount[ID] += 1

    #now let's count the barcodes found and count the number of times they are found.
    barcode_counts = "%22s: %s" % ('Sample', 'Count')
    # Sorted by count, descending.  NOTE(review): v is unused; the loop
    # re-reads the count via BarcodeCount[k].
    for k, v in natsorted(list(BarcodeCount.items()), key=lambda k_v: k_v[1], reverse=True):
        barcode_counts += "\n%22s: %s" % (k, str(BarcodeCount[k]))
    amptklib.log.info("Found %i barcoded samples\n%s" % (len(BarcodeCount), barcode_counts))

    genericmapfile = args.out + '.mapping_file.txt'
    if not args.mapping_file:
        #create a generic mappingfile for downstream processes
        amptklib.CreateGenericMappingFile(Barcodes, RevBarcodes, FwdPrimer, RevPrimer, genericmapfile, BarcodeCount)
    else:
        amptklib.updateMappingFile(args.mapping_file, BarcodeCount, genericmapfile)

    #compress the output to save space
    FinalDemux = catDemux+'.gz'
    amptklib.Fzip(catDemux, FinalDemux, cpus)
    amptklib.removefile(catDemux)
    if gzip_list:
        # Remove the uncompressed copies created from gzipped inputs.
        for file in gzip_list:
            file = file.replace('.gz', '')
            amptklib.removefile(file)

    #get file size
    filesize = os.path.getsize(FinalDemux)
    readablesize = amptklib.convertSize(filesize)
    amptklib.log.info("Output file: %s (%s)" % (FinalDemux, readablesize))
    amptklib.log.info("Mapping file: %s" % genericmapfile)
    print("-------------------------------------------------------")
    if 'darwin' in sys.platform:
        print(col.WARN + "\nExample of next cmd: " + col.END + "amptk cluster -i %s -o out\n" % (FinalDemux))
    else:
        print("\nExample of next cmd: amptk cluster -i %s -o out\n" % (FinalDemux))
def main(args):
    """CLI entry point: assign taxonomy to OTU sequences.

    Dispatches to one of several classification methods (blast, rdp,
    usearch global alignment, utax, sintax, or the hybrid combination),
    builds an OTU->taxonomy dictionary, appends it to the OTU table and
    FASTA, and optionally emits QIIME/BIOM outputs and a phylogeny.
    """
    parser = argparse.ArgumentParser(
        prog='amptk-assign_taxonomy.py',
        usage="%(prog)s [options] -f <FASTA File>",
        description='''assign taxonomy to OTUs''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--otu_table', dest="otu_table", help='Append Taxonomy to OTU table')
    parser.add_argument('-f', '--fasta', required=True, help='FASTA input')
    parser.add_argument('-o', '--out', help='Output file (FASTA)')
    parser.add_argument(
        '-m', '--mapping_file',
        help='Mapping file: QIIME format can have extra meta data columns')
    parser.add_argument(
        '--method', default='hybrid',
        choices=['utax', 'usearch', 'sintax', 'hybrid', 'rdp', 'blast'],
        help='Taxonomy method')
    parser.add_argument(
        '-d', '--db', help='Pre-installed Databases: [ITS,ITS1,ITS2,16S,LSU,COI]')
    parser.add_argument(
        '-t', '--taxonomy',
        help='Incorporate taxonomy calculated elsewhere, 2 column file')
    parser.add_argument('--fasta_db', help='Alternative database of fasta sequences')
    parser.add_argument('--add2db', help='Custom FASTA database to add to DB on the fly')
    parser.add_argument('--utax_db', help='UTAX Reference Database')
    parser.add_argument('--utax_cutoff', default=0.8, type=restricted_float,
                        help='UTAX confidence value threshold.')
    parser.add_argument('--usearch_db', help='USEARCH Reference Database')
    parser.add_argument('--usearch_cutoff', default=0.7, type=restricted_float,
                        help='USEARCH percent ID threshold.')
    parser.add_argument(
        '-r', '--rdp', dest='rdp',
        default='/Users/jon/scripts/rdp_classifier_2.10.1/dist/classifier.jar',
        help='Path to RDP Classifier')
    parser.add_argument('--rdp_db', dest='rdp_tax', default='fungalits_unite',
                        choices=[
                            '16srrna', 'fungallsu', 'fungalits_warcup',
                            'fungalits_unite'
                        ],
                        help='Training set for RDP Classifier')
    parser.add_argument('--rdp_cutoff', default=0.8, type=restricted_float,
                        help='RDP confidence value threshold')
    parser.add_argument('--local_blast', help='Path to local Blast DB')
    parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9',
                        help='USEARCH8 EXE')
    parser.add_argument('--tax_filter', help='Retain only OTUs with match in OTU table')
    parser.add_argument('--sintax_cutoff', default=0.8, type=restricted_float,
                        help='SINTAX threshold.')
    # NOTE(review): the help text looks inverted — the cleanup loop at the
    # bottom of this function *skips* deletion when --debug is set.
    parser.add_argument('--debug', action='store_true',
                        help='Remove Intermediate Files')
    parser.add_argument('--cpus', type=int, help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    # Installed package directory; the bundled reference DBs live under it.
    parentdir = os.path.join(os.path.dirname(amptklib.__file__))

    if not args.out:
        #get base name of files
        if 'filtered' in args.fasta:
            base = args.fasta.split(".filtered")[0]
        elif 'otu' in args.fasta:
            base = args.fasta.split('.otu')[0]
        else:
            base = args.fasta.split('.fa')[0]
    else:
        base = args.out

    #remove logfile if exists
    log_name = base + '.amptk-taxonomy.log'
    if os.path.isfile(log_name):
        os.remove(log_name)
    amptklib.setupLogging(log_name)
    # NOTE(review): FNULL is opened but never closed or used in this block.
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)
    #get number of cpus
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()

    #Setup DB locations and names, etc
    # Each entry maps a DB name to (usearch_db, utax_db, sintax_db) paths.
    DBdir = os.path.join(parentdir, 'DB')
    DataBase = {
        'ITS1': (os.path.join(DBdir, 'ITS.udb'),
                 os.path.join(DBdir, 'ITS1_UTAX.udb'),
                 os.path.join(DBdir, 'ITS_SINTAX.udb')),
        'ITS2': (os.path.join(DBdir, 'ITS.udb'),
                 os.path.join(DBdir, 'ITS2_UTAX.udb'),
                 os.path.join(DBdir, 'ITS_SINTAX.udb')),
        'ITS': (os.path.join(DBdir, 'ITS.udb'),
                os.path.join(DBdir, 'ITS_UTAX.udb'),
                os.path.join(DBdir, 'ITS_SINTAX.udb')),
        '16S': (os.path.join(DBdir, '16S.udb'),
                os.path.join(DBdir, '16S.udb'),
                os.path.join(DBdir, '16S_SINTAX.udb')),
        'LSU': (os.path.join(DBdir, 'LSU.udb'),
                os.path.join(DBdir, 'LSU_UTAX.udb'),
                os.path.join(DBdir, 'LSU_SINTAX.udb')),
        'COI': (os.path.join(DBdir, 'COI.udb'),
                os.path.join(DBdir, 'COI_UTAX.udb'),
                os.path.join(DBdir, 'COI_SINTAX.udb'))
    }

    #get DB names up front
    if args.db in DataBase:
        utax_db = DataBase.get(args.db)[1]
        usearch_db = DataBase.get(args.db)[0]
        sintax_db = DataBase.get(args.db)[2]
        if not utax_db:
            utax_db = args.utax_db
        if not usearch_db:
            usearch_db = args.usearch_db
    else:
        utax_db = args.utax_db
        usearch_db = args.usearch_db
        if args.fasta_db:
            sintax_db = args.fasta_db
        else:
            sintax_db = args.usearch_db

    if args.method in ['hybrid', 'usearch', 'utax']:
        if not utax_db and not usearch_db and not args.fasta_db:
            amptklib.log.error(
                "You have not selected a database, need either --db, --utax_db, --usearch_db, or --fasta_db"
            )
            sys.exit(1)
        else:
            #check that the DB exists
            if args.method == 'usearch' and usearch_db:
                if not amptklib.checkfile(usearch_db):
                    amptklib.log.error(
                        'USEARCH DB not found: {:}'.format(usearch_db))
                    amptklib.log.derror(
                        'Use `amptk install` to install pre-formatted databases or `amptk database` to create custom DB'
                    )
                    sys.exit(1)
            if args.method == 'sintax' and sintax_db:
                if not amptklib.checkfile(sintax_db):
                    amptklib.log.error(
                        'SINTAX DB not found: {:}'.format(sintax_db))
                    amptklib.log.derror(
                        'Use `amptk install` to install pre-formatted databases or `amptk database` to create custom DB'
                    )
                    sys.exit(1)
            if args.method == 'utax' and utax_db:
                if not amptklib.checkfile(utax_db):
                    amptklib.log.error(
                        'UTAX DB not found: {:}'.format(utax_db))
                    # NOTE(review): the two branches above use log.derror here.
                    amptklib.log.error(
                        'Use `amptk install` to install pre-formatted databases or `amptk database` to create custom DB'
                    )
                    sys.exit(1)

    custom_db = None
    if args.add2db:  #means user wants to add sequences to the usearch database on the so will need to rebuild database
        custom_db = base + '.custom_database.fa'
        if amptklib.checkfile(custom_db):
            amptklib.SafeRemove(custom_db)
        if args.db:  #this means that the fasta files need to be extracted
            amptklib.log.info("Adding {:} to the {:} database".format(
                os.path.basename(args.add2db),
                os.path.basename(usearch_db)))
            # Dump the pre-built UDB back to FASTA, then append the extras.
            cmd = ['vsearch', '--udb2fasta', usearch_db, '--output', custom_db]
            amptklib.runSubprocess(cmd, amptklib.log)
            with open(custom_db, 'a') as outfile:
                with open(args.add2db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)
        elif args.fasta_db:
            amptklib.log.info("Adding {:} to the {:} database".format(
                os.path.basename(args.add2db),
                os.path.basename(args.fasta_db)))
            with open(custom_db, 'w') as outfile:
                with open(args.fasta_db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)
                with open(args.add2db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)

    #Count records
    amptklib.log.info("Loading FASTA Records")
    total = amptklib.countfasta(args.fasta)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs')

    #declare output files/variables here
    blast_out = base + '.blast.txt'
    rdp_out = base + '.rdp.txt'
    # NOTE(review): utax_out is initialized to the '.usearch.txt' name —
    # likely a typo for '.utax.txt' (it is reassigned to '.utax.txt' when
    # UTAX actually runs); verify against upstream before changing.
    utax_out = base + '.usearch.txt'
    usearch_out = base + '.usearch.txt'
    sintax_out = base + '.sintax.txt'
    otuDict = {}

    if not args.taxonomy:
        #start with less common uses, i.e. Blast, rdp
        if args.method == 'blast':
            #check if command line blast installed
            if not amptklib.which('blastn'):
                amptklib.log.error("BLASTN not found in your PATH, exiting.")
                sys.exit(1)
            #now run blast remotely using NCBI nt database
            outformat = "6 qseqid sseqid pident stitle"
            if args.local_blast:
                #get number of cpus
                amptklib.log.info("Running local BLAST using db: %s" % args.local_blast)
                cmd = [
                    'blastn', '-num_threads', str(cpus), '-query', args.fasta,
                    '-db', os.path.abspath(args.local_blast),
                    '-max_target_seqs', '1', '-outfmt', outformat, '-out',
                    blast_out
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:
                amptklib.log.info(
                    "Running BLASTN using NCBI remote nt database, this may take awhile"
                )
                cmd = [
                    'blastn', '-query', args.fasta, '-db', 'nt', '-remote',
                    '-max_target_seqs', '1', '-outfmt', outformat, '-out',
                    blast_out
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            #load results and reformat
            new = []
            f = csv.reader(open(blast_out), delimiter=str('\t'))
            for col in f:
                query = col[0]
                # GenBank accession is the 4th field of the pipe-split sseqid.
                gbID = col[1].split("|")[3]
                pident = col[2]
                name = col[3]
                tax = gbID + ";" + name + " (" + pident + ")"
                line = [query, tax]
                new.append(line)
            otuDict = dict(new)
        elif args.method == 'rdp':
            #check that classifier is installed
            try:
                rdp_test = subprocess.Popen(
                    ['java', '-Xmx2000m', '-jar', args.rdp, 'classify'],
                    stdout=subprocess.PIPE).communicate()[0].rstrip()
            except OSError:
                amptklib.log.error("%s not found in your PATH, exiting." % args.rdp)
                sys.exit(1)
            #RDP database
            amptklib.log.info("Using RDP classifier %s training set" % args.rdp_tax)
            #run RDP
            cmd = [
                'java', '-Xmx2000m', '-jar', args.rdp, 'classify', '-g',
                args.rdp_tax, '-o', rdp_out, '-f', 'fixrank', args.fasta
            ]
            amptklib.runSubprocess(cmd, amptklib.log)
            #load in results and put into dictionary
            new = []
            removal = ["unidentified", "Incertae", "uncultured", "incertae"]
            remove_exp = [re.compile(x) for x in removal]
            f = csv.reader(open(rdp_out), delimiter=str('\t'))
            for col in f:
                # fixrank output: confidence columns at 4,7,10,13,16,19 for
                # kingdom..genus; keep the deepest rank above the cutoff.
                if float(col[19]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[
                        8] + ",o:" + col[11] + ",f:" + col[14] + ",g:" + col[17]
                elif float(col[16]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[
                        8] + ",o:" + col[11] + ",f:" + col[14]
                elif float(col[13]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[
                        8] + ",o:" + col[11]
                elif float(col[10]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[8]
                elif float(col[7]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5]
                elif float(col[4]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2]
                else:
                    tax = "RDP;k:unclassified"
                tax_split = tax.split(",")
                # Drop ranks matching the removal patterns.
                # NOTE(review): the loop variable ``re`` shadows the re module
                # inside this comprehension (works, but fragile).
                tax = [
                    s for s in tax_split
                    if not any(re.search(s) for re in remove_exp)
                ]
                tax = ",".join(tax)
                line = [col[0], tax]
                new.append(line)
            otuDict = dict(new)
        else:
            #check status of USEARCH DB and run
            if args.method in ['hybrid', 'usearch']:
                if args.fasta_db:
                    #now run through usearch global
                    amptklib.log.info(
                        "Global alignment OTUs with usearch_global (VSEARCH) against {:}"
                        .format(os.path.basename(args.fasta_db)))
                    cmd = [
                        'vsearch', '--usearch_global', args.fasta, '--db',
                        os.path.abspath(args.fasta_db), '--userout',
                        usearch_out, '--id', str(args.usearch_cutoff),
                        '--strand', 'both', '--output_no_hits', '--maxaccepts',
                        '0', '--top_hits_only', '--userfields',
                        'query+target+id', '--notrunclabels', '--threads',
                        str(cpus)
                    ]
                    amptklib.runSubprocess(cmd, amptklib.log)
                elif custom_db:
                    #now run through usearch global
                    amptklib.log.info(
                        "Global alignment OTUs with usearch_global (VSEARCH) against custom DB"
                    )
                    cmd = [
                        'vsearch', '--usearch_global', args.fasta, '--db',
                        os.path.abspath(custom_db), '--userout', usearch_out,
                        '--id', str(args.usearch_cutoff), '--strand', 'both',
                        '--output_no_hits', '--maxaccepts', '0',
                        '--top_hits_only', '--userfields', 'query+target+id',
                        '--notrunclabels', '--threads', str(cpus)
                    ]
                    amptklib.runSubprocess(cmd, amptklib.log)
                else:
                    if usearch_db:
                        amptklib.log.info(
                            "Global alignment OTUs with usearch_global (VSEARCH) against {:}"
                            .format(os.path.basename(usearch_db)))
                        cmd = [
                            'vsearch', '--usearch_global', args.fasta, '--db',
                            os.path.abspath(usearch_db), '--userout',
                            usearch_out, '--id', str(args.usearch_cutoff),
                            '--strand', 'both', '--output_no_hits',
                            '--maxaccepts', '0', '--top_hits_only',
                            '--userfields', 'query+target+id',
                            '--notrunclabels', '--threads', str(cpus)
                        ]
                        amptklib.runSubprocess(cmd, amptklib.log)
            if args.method in ['hybrid', 'utax']:
                if utax_db:
                    #now run through UTAX
                    utax_out = base + '.utax.txt'
                    amptklib.log.info("Classifying OTUs with UTAX (USEARCH)")
                    cutoff = str(args.utax_cutoff)
                    cmd = [
                        usearch, '-utax', args.fasta, '-db', utax_db,
                        '-utaxout', utax_out, '-utax_cutoff', cutoff,
                        '-strand', 'plus', '-notrunclabels', '-threads',
                        str(cpus)
                    ]
                    amptklib.runSubprocess(cmd, amptklib.log)
                else:
                    amptklib.log.error("UTAX DB %s not found, skipping" % utax_db)
            if args.method in ['hybrid', 'sintax']:
                if args.fasta_db:  #if you pass fasta file here, over ride any auto detection
                    sintax_db = args.fasta_db
                #now run sintax
                amptklib.log.info("Classifying OTUs with SINTAX (USEARCH)")
                cmd = [
                    usearch, '-sintax', args.fasta, '-db',
                    os.path.abspath(sintax_db), '-tabbedout', sintax_out,
                    '-sintax_cutoff', str(args.sintax_cutoff), '-strand',
                    'both', '-threads', str(cpus)
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            #now process results, load into dictionary - slightly different depending on which classification was run.
            if args.method == 'hybrid':  #run upgraded method, first load dictionaries with resuls
                if amptklib.checkfile(utax_out):
                    utaxDict = amptklib.classifier2dict(
                        utax_out, args.utax_cutoff)
                    amptklib.log.debug(
                        'UTAX results parsed, resulting in {:,} taxonomy predictions'
                        .format(len(utaxDict)))
                else:
                    amptklib.log.info('UTAX results empty')
                    utaxDict = {}
                if amptklib.checkfile(sintax_out):
                    sintaxDict = amptklib.classifier2dict(
                        sintax_out, args.sintax_cutoff)
                    amptklib.log.debug(
                        'SINTAX results parsed, resulting in {:,} taxonomy predictions'
                        .format(len(sintaxDict)))
                else:
                    amptklib.log.info('SINTAX results empty')
                    sintaxDict = {}
                usearchDict = amptklib.usearchglobal2dict(usearch_out)
                amptklib.log.debug(
                    'Global alignment results parsed, resulting in {:,} taxonomy predictions'
                    .format(len(usearchDict)))
                otuList = natsorted(list(usearchDict.keys()))
                #first compare classifier results, getting better of the two
                bestClassify = amptklib.bestclassifier(utaxDict, sintaxDict, otuList)
                #now get best taxonomy by comparing to global alignment results
                otuDict = amptklib.bestTaxonomy(usearchDict, bestClassify)
                amptklib.log.debug(
                    'Combined OTU taxonomy dictionary contains {:,} taxonomy predictions'
                    .format(len(otuDict)))
                if len(otuDict) < 1:
                    amptklib.log.info('Parsing taxonomy failed -- see logfile')
                    sys.exit(1)
            elif args.method == 'utax' and amptklib.checkfile(utax_out):
                #load results into dictionary for appending to OTU table
                amptklib.log.debug("Loading UTAX results into dictionary")
                with open(utax_out, 'r') as infile:
                    reader = csv.reader(infile, delimiter=str("\t"))
                    otuDict = {rows[0]: 'UTAX;' + rows[2] for rows in reader}
            elif args.method == 'usearch' and amptklib.checkfile(usearch_out):
                #load results into dictionary for appending to OTU table
                amptklib.log.debug(
                    "Loading Global Alignment results into dictionary")
                otuDict = {}
                usearchDict = amptklib.usearchglobal2dict(usearch_out)
                for k, v in natsorted(list(usearchDict.items())):
                    pident = float(v[0]) * 100
                    pident = "{0:.1f}".format(pident)
                    ID = v[1]
                    tax = ','.join(v[-1])
                    LCA = v[2]
                    # Prefix encodes whether an LCA truncation was applied.
                    if LCA == '':
                        fulltax = 'GS|' + pident + '|' + ID + ';' + tax
                    else:
                        fulltax = 'GSL|' + pident + '|' + ID + ';' + tax
                    otuDict[k] = fulltax
            elif args.method == 'sintax' and amptklib.checkfile(sintax_out):
                #load results into dictionary for appending to OTU table
                amptklib.log.debug("Loading SINTAX results into dictionary")
                with open(sintax_out, 'r') as infile:
                    reader = csv.reader(infile, delimiter=(str("\t")))
                    otuDict = {rows[0]: 'SINTAX;' + rows[3] for rows in reader}
    else:
        #you have supplied a two column taxonomy file, parse and build otuDict
        amptklib.log.debug("Loading custom Taxonomy into dictionary")
        with open(args.taxonomy, 'r') as infile:
            reader = csv.reader(infile, delimiter=str("\t"))
            otuDict = {rows[0]: rows[1] for rows in reader}

    #now format results
    if args.otu_table:
        #check if otu_table variable is empty, then load in otu table
        amptklib.log.info("Appending taxonomy to OTU table and OTUs")
        taxTable = base + '.otu_table.taxonomy.txt'
        tmpTable = base + '.otu_table.tmp'
        #append to OTU table
        counts = 0
        with open(taxTable, 'w') as outTable:
            with open(args.otu_table, 'r') as inTable:
                #guess the delimiter format
                firstline = inTable.readline()
                dialect = amptklib.guess_csv_dialect(firstline)
                inTable.seek(0)
                #parse OTU table
                reader = csv.reader(inTable, dialect)
                for line in reader:
                    if line[0].startswith(("#OTU", "OTUId")):
                        line.append('Taxonomy')
                    else:
                        tax = otuDict.get(line[0]) or "No Hit"
                        line.append(tax)
                    if args.tax_filter and not args.method == 'blast':
                        if line[0].startswith(("#OTU", "OTUId")):
                            join_line = ('\t'.join(str(x) for x in line))
                        else:
                            # Keep only rows whose taxonomy contains the filter.
                            if args.tax_filter in line[-1]:
                                join_line = ('\t'.join(str(x) for x in line))
                                counts += 1
                            else:
                                continue
                    else:
                        join_line = ('\t'.join(str(x) for x in line))
                        counts += 1
                    outTable.write("%s\n" % join_line)
        if args.tax_filter:
            if args.method == 'blast':
                amptklib.log.info(
                    "Blast is incompatible with --tax_filter, use a different method"
                )
                tmpTable = args.otu_table
            else:
                nonfungal = total - counts
                amptklib.log.info(
                    "Found %i OTUs not matching %s, writing %i %s hits to taxonomy OTU table"
                    % (nonfungal, args.tax_filter, counts, args.tax_filter))
                #need to create a filtered table without taxonomy for BIOM output
                # NOTE(review): ``input`` below shadows the builtin.
                with open(tmpTable, 'w') as output:
                    with open(taxTable, 'r') as input:
                        firstline = input.readline()
                        dialect = amptklib.guess_csv_dialect(firstline)
                        input.seek(0)
                        #parse OTU table
                        reader = csv.reader(input, dialect)
                        for line in reader:
                            # Strip the Taxonomy column appended above.
                            del line[-1]
                            join_line = '\t'.join(str(x) for x in line)
                            output.write("%s\n" % join_line)
        else:
            tmpTable = args.otu_table

    #append to OTUs
    otuTax = base + '.otus.taxonomy.fa'
    with open(otuTax, 'w') as output:
        with open(args.fasta, 'r') as input:
            SeqRecords = SeqIO.parse(input, 'fasta')
            for rec in SeqRecords:
                tax = otuDict.get(rec.id) or "No hit"
                rec.description = tax
                SeqIO.write(rec, output, 'fasta')

    if not args.taxonomy:
        #output final taxonomy in two-column format, followed by the hits for usearch/sintax/utax if hybrid is used.
        taxFinal = base + '.taxonomy.txt'
        with open(taxFinal, 'w') as finaltax:
            if args.method == 'hybrid':
                finaltax.write('#OTUID\ttaxonomy\tUSEARCH\tSINTAX\tUTAX\n')
                for k, v in natsorted(list(otuDict.items())):
                    if k in usearchDict:
                        usearchResult = usearchDict.get(k)
                        usearchResult = ','.join(usearchResult[-1])
                    else:
                        usearchResult = 'No hit'
                    if k in sintaxDict:
                        sintaxResult = sintaxDict.get(k)
                        sintaxResult = ','.join(sintaxResult[-1])
                    else:
                        sintaxResult = 'No hit'
                    if k in utaxDict:
                        utaxResult = utaxDict.get(k)
                        utaxResult = ','.join(utaxResult[-1])
                    else:
                        utaxResult = 'No hit'
                    finaltax.write('{:}\t{:}\t{:}\t{:}\t{:}\n'.format(
                        k, v, usearchResult, sintaxResult, utaxResult))
            else:
                finaltax.write('#OTUID\ttaxonomy\n')
                for k, v in natsorted(list(otuDict.items())):
                    finaltax.write('%s\t%s\n' % (k, v))
    else:
        taxFinal = args.taxonomy

    #convert taxonomy to qiime format for biom
    qiimeTax = None
    if not args.method == 'blast':
        qiimeTax = base + '.qiime.taxonomy.txt'
        amptklib.utax2qiime(taxFinal, qiimeTax)
    else:
        amptklib.log.error(
            "Blast taxonomy is not compatible with BIOM output, use a different method"
        )

    #create OTU phylogeny for downstream processes
    amptklib.log.info("Generating phylogenetic tree")
    tree_out = base + '.tree.phy'
    cmd = [usearch, '-cluster_agg', args.fasta, '-treeout', tree_out]
    amptklib.runSubprocess(cmd, amptklib.log)

    #print some summary file locations
    amptklib.log.info("Taxonomy finished: %s" % taxFinal)
    if args.otu_table and not args.method == 'blast':
        amptklib.log.info("Classic OTU table with taxonomy: %s" % taxTable)
        #output final OTU table in Biom v1.0 (i.e. json format if biom installed)
        outBiom = base + '.biom'
        if amptklib.which('biom'):
            amptklib.removefile(outBiom)
            cmd = [
                'biom', 'convert', '-i', tmpTable, '-o', outBiom + '.tmp',
                '--table-type', "OTU table", '--to-json'
            ]
            amptklib.runSubprocess(cmd, amptklib.log)
            if args.mapping_file:
                # Validate that mapping-file samples cover the OTU table
                # columns and contain no duplicate IDs before add-metadata.
                mapSamples = []
                repeatSamples = []
                with open(args.mapping_file, 'r') as mapin:
                    for line in mapin:
                        line = line.rstrip()
                        if line.startswith('#'):
                            continue
                        sampleID = line.split('\t')[0]
                        if not sampleID in mapSamples:
                            mapSamples.append(sampleID)
                        else:
                            repeatSamples.append(sampleID)
                otuSamples = []
                with open(tmpTable, 'r') as otuin:
                    for line in otuin:
                        line = line.rstrip()
                        if line.startswith('#'):
                            otuSamples = line.split('\t')[1:]
                missingMap = []
                for otu in otuSamples:
                    if not otu in mapSamples:
                        missingMap.append(otu)
                if len(missingMap) > 0:
                    amptklib.log.error(
                        "%s are missing from mapping file (metadata), skipping biom file creation"
                        % ', '.join(missingMap))
                elif len(repeatSamples) > 0:
                    amptklib.log.error(
                        '%s duplicate sample IDs in mapping file, skipping biom file creation'
                        % ', '.join(repeatSamples))
                else:
                    if qiimeTax:
                        cmd = [
                            'biom', 'add-metadata', '-i', outBiom + '.tmp',
                            '-o', outBiom, '--observation-metadata-fp',
                            qiimeTax, '-m', args.mapping_file,
                            '--sc-separated', 'taxonomy', '--output-as-json'
                        ]
                    else:
                        cmd = [
                            'biom', 'add-metadata', '-i', outBiom + '.tmp',
                            '-o', outBiom, '-m', args.mapping_file,
                            '--output-as-json'
                        ]
                    amptklib.runSubprocess(cmd, amptklib.log)
            else:
                cmd = [
                    'biom', 'add-metadata', '-i', outBiom + '.tmp', '-o',
                    outBiom, '--observation-metadata-fp', qiimeTax,
                    '--sc-separated', 'taxonomy', '--output-as-json'
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            amptklib.removefile(outBiom + '.tmp')
            amptklib.log.info("BIOM OTU table created: %s" % outBiom)
        else:
            amptklib.log.info(
                "biom program not installed, install via `pip install biom-format` or `conda install biom-format`"
            )
    amptklib.log.info("OTUs with taxonomy: %s" % otuTax)
    amptklib.log.info("OTU phylogeny: %s" % tree_out)

    #clean up intermediate files
    if not args.debug:
        for i in [
                utax_out, usearch_out, sintax_out, qiimeTax,
                base + '.otu_table.tmp'
        ]:
            if i:
                amptklib.removefile(i)
    print("-------------------------------------------------------")
def main(args):
    """CLI entry point for the AMPtk DADA2 wrapper.

    Takes AMPtk pre-processed (demultiplexed) FASTQ, quality filters it,
    splits it per sample, runs the bundled DADA2 R pipeline, then maps
    reads back to the resulting ASVs and clusters ASVs into biological
    OTUs with vsearch.  Writes ASV/OTU FASTA files, OTU tables, and a
    ASV->OTU membership table next to the input.

    :param args: raw CLI argument list (as passed to argparse), e.g. sys.argv[1:]
    """
    parser = argparse.ArgumentParser(
        prog='amptk-dada2.py',
        description='''Script takes output from amptk pre-processing and runs DADA2''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--fastq', required=True,
                        help='Input Demuxed containing FASTQ')
    parser.add_argument('-o', '--out', help='Output Basename')
    parser.add_argument('-m', '--min_reads', default=10, type=int,
                        help="Minimum number of reads after Q filtering to run DADA2 on")
    parser.add_argument('-l', '--length', type=int,
                        help='Length to truncate reads')
    parser.add_argument('-e', '--maxee', default='1.0',
                        help='MaxEE quality filtering')
    parser.add_argument('-p', '--pct_otu', default='97',
                        help="Biological OTU Clustering Percent")
    parser.add_argument('--platform', default='ion',
                        choices=['ion', 'illumina', '454'],
                        help='Sequencing platform')
    parser.add_argument('--chimera_method', default='consensus',
                        choices=['consensus', 'pooled', 'per-sample'],
                        help='bimera removal method')
    parser.add_argument('--uchime_ref',
                        help='Run UCHIME REF [ITS,16S,LSU,COI,custom]')
    parser.add_argument('--pool', action='store_true',
                        help='Pool all sequences together for DADA2')
    parser.add_argument('--debug', action='store_true',
                        help='Keep all intermediate files')
    parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--cpus', type=int, help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    # The DADA2 R driver script ships alongside the amptklib package.
    parentdir = os.path.join(os.path.dirname(amptklib.__file__))
    dada2script = os.path.join(parentdir, 'dada2_pipeline_nofilt.R')
    # get basename if not args.out passed
    if args.out:
        base = args.out
    else:
        if 'demux' in args.fastq:
            base = os.path.basename(args.fastq).split('.demux')[0]
        else:
            base = os.path.basename(args.fastq).split('.f')[0]
    # remove logfile if exists
    log_name = base + '.amptk-dada2.log'
    if os.path.isfile(log_name):
        amptklib.removefile(log_name)
    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    # initialize script, log system info and usearch version
    amptklib.SystemInfo()
    # Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)
    # get number of cores (CORES is kept as str: it is passed to CLI tools)
    if args.cpus:
        CORES = str(args.cpus)
    else:
        CORES = str(amptklib.getCPUS())
    # check dependencies
    programs = ['Rscript']
    amptklib.CheckDependencies(programs)
    Rversions = amptklib.checkRversion()
    R_pass = '******'
    dada2_pass = '******'
    # check dada2 first, if good move on, otherwise issue warning
    if not amptklib.gvc(Rversions[1], dada2_pass):
        amptklib.log.error("R v%s; DADA2 v%s detected, need atleast v%s" %
                           (Rversions[0], Rversions[1], dada2_pass))
        amptklib.log.error(
            "See: http://benjjneb.github.io/dada2/dada-installation.html")
        sys.exit(1)
    amptklib.log.info("R v%s; DADA2 v%s" % (Rversions[0], Rversions[1]))
    # Count FASTQ records and remove 3' N's as dada2 can't handle them
    amptklib.log.info("Loading FASTQ Records")
    no_ns = base + '.cleaned_input.fq'
    if args.fastq.endswith('.gz'):
        # decompress into the working directory first
        fastqInput = args.fastq.replace('.gz', '')
        amptklib.Funzip(os.path.abspath(args.fastq),
                        os.path.basename(fastqInput), CORES)
    else:
        fastqInput = os.path.abspath(args.fastq)
    amptklib.fastq_strip_padding(os.path.basename(fastqInput), no_ns)
    # FASTA copy of the original reads, used later for read->ASV/OTU mapping
    demuxtmp = base + '.original.fa'
    cmd = ['vsearch', '--fastq_filter', os.path.abspath(no_ns),
           '--fastq_qmax', '55', '--fastaout', demuxtmp, '--threads', CORES]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(demuxtmp)
    size = amptklib.checkfastqsize(no_ns)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')
    # quality filter (expected-error filtering, drop any read containing Ns)
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    derep = base + '.qual-filtered.fq'
    filtercmd = ['vsearch', '--fastq_filter', no_ns,
                 '--fastq_maxee', str(args.maxee), '--fastqout', derep,
                 '--fastq_qmax', '55', '--fastq_maxns', '0', '--threads', CORES]
    amptklib.runSubprocess(filtercmd, amptklib.log)
    total = amptklib.countfastq(derep)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')
    # split into individual files (DADA2 expects one FASTQ per sample)
    amptklib.log.info("Splitting FASTQ file by Sample into individual files")
    filtfolder = base + '_filtered'
    if os.path.isdir(filtfolder):
        shutil.rmtree(filtfolder)
    os.makedirs(filtfolder)
    splitDemux2(derep, filtfolder, args=args)
    # check for minimum number of reads in each sample
    remove = []
    files = [i for i in os.listdir(filtfolder) if i.endswith('.fastq')]
    for x in files:
        if amptklib.countfastq(os.path.join(filtfolder, x)) < args.min_reads:
            remove.append(x)
    if len(remove) > 0:
        amptklib.log.info("Dropping %s as fewer than %i reads" %
                          (', '.join(remove), args.min_reads))
        for y in remove:
            os.remove(os.path.join(filtfolder, y))
    # now run DADA2 on filtered folder
    amptklib.log.info("Running DADA2 pipeline")
    dada2log = base + '.dada2.Rscript.log'
    dada2out = base + '.dada2.csv'
    # check pooling vs notpooled, default is not pooled.
    # R booleans are passed as literal 'TRUE'/'FALSE' strings.
    if args.pool:
        POOL = 'TRUE'
    else:
        POOL = 'FALSE'
    with open(dada2log, 'w') as logfile:
        subprocess.call(['Rscript', '--vanilla', dada2script, filtfolder,
                         dada2out, args.platform, POOL, CORES,
                         args.chimera_method],
                        stdout=logfile, stderr=logfile)
    # check for results
    if not os.path.isfile(dada2out):
        amptklib.log.error("DADA2 run failed, please check %s logfile" % dada2log)
        sys.exit(1)
    # now process the output, pull out fasta, rename, etc
    # DADA2 CSV: first column is the sequence, remaining columns are
    # per-sample counts; rows are renamed ASV1, ASV2, ...
    fastaout = base + '.otus.tmp'
    OTUCounts = {}
    counter = 1
    with open(fastaout, 'w') as writefasta:
        with open(dada2out, 'r') as input:
            next(input)  # skip CSV header row
            for line in input:
                line = line.replace('\n', '')
                line = line.replace('"', '')
                cols = line.split(',')
                Seq = cols[0]
                countList = [int(x) for x in cols[1:]]
                counts = sum(countList)
                ID = 'ASV' + str(counter)
                if not ID in OTUCounts:
                    OTUCounts[ID] = counts
                writefasta.write(">%s\n%s\n" % (ID, Seq))
                counter += 1
    # get number of bimeras from logfile
    # NOTE(review): relies on the R log containing a line starting with
    # 'Identified '; if absent, totalSeqs/bimeras are unbound below.
    with open(dada2log, 'r') as bimeracheck:
        for line in bimeracheck:
            if line.startswith('Identified '):
                bimeraline = line.split(' ')
                bimeras = int(bimeraline[1])
                totalSeqs = int(bimeraline[5])
    validSeqs = totalSeqs - bimeras
    amptklib.log.info('{0:,}'.format(totalSeqs) +
                      ' total amplicon sequence variants (ASVs)')
    amptklib.log.info('{0:,}'.format(bimeras) + ' denovo chimeras removed')
    amptklib.log.info('{0:,}'.format(validSeqs) + ' valid ASVs')
    # optional UCHIME Ref
    # NOTE(review): uchime_out is assigned but not used after this section.
    uchime_out = base + '.nonchimeras.fa'
    chimeraFreeTable = base + '.otu_table.txt'
    iSeqs = base + '.ASVs.fa'
    if not args.uchime_ref:
        os.rename(fastaout, iSeqs)
    else:
        # check if file is present, remove from previous run if it is.
        if os.path.isfile(iSeqs):
            amptklib.removefile(iSeqs)
        # R. Edgar now says using largest DB is better for UCHIME, so use
        # the one distributed with taxonomy
        if args.uchime_ref in ['ITS', '16S', 'LSU', 'COI']:
            # test if it is one that is setup, otherwise default to full path
            uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb')
            if not os.path.isfile(uchime_db):
                amptklib.log.error(
                    "Database not properly configured, run `amptk install` to setup DB, skipping chimera filtering"
                )
                uchime_out = fastaout
            # since uchime cannot work with udb database, need to extract
            # fasta sequences, do this if
            if not amptklib.checkfile(
                    os.path.join(parentdir, 'DB',
                                 args.uchime_ref + '.extracted.fa')):
                uchime_db = os.path.join(parentdir, 'DB',
                                         args.uchime_ref + '.extracted.fa')
                cmd = ['vsearch', '--udb2fasta',
                       os.path.join(parentdir, 'DB', args.uchime_ref + '.udb'),
                       '--output', uchime_db]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:
                uchime_db = os.path.join(parentdir, 'DB',
                                         args.uchime_ref + '.extracted.fa')
        else:
            # custom reference: accept any existing file path
            if os.path.isfile(args.uchime_ref):
                uchime_db = os.path.abspath(args.uchime_ref)
            else:
                amptklib.log.error(
                    "%s is not a valid file, skipping reference chimera filtering"
                    % args.uchime_ref)
                iSeqs = fastaout
        # now run chimera filtering if all checks out
        if not os.path.isfile(iSeqs):
            amptklib.log.info("Chimera Filtering (VSEARCH) using %s DB" %
                              args.uchime_ref)
            cmd = ['vsearch', '--mindiv', '1.0', '--uchime_ref', fastaout,
                   '--db', uchime_db, '--nonchimeras', iSeqs,
                   '--threads', CORES]
            amptklib.runSubprocess(cmd, amptklib.log)
            total = amptklib.countfasta(iSeqs)
            uchime_chimeras = validSeqs - total
            amptklib.log.info('{0:,}'.format(total) + ' ASVs passed, ' +
                              '{0:,}'.format(uchime_chimeras) +
                              ' ref chimeras removed')
            if os.path.isfile(fastaout):
                amptklib.removefile(fastaout)
    # setup output files
    dadademux = base + '.dada2.map.uc'
    bioSeqs = base + '.cluster.otus.fa'
    bioTable = base + '.cluster.otu_table.txt'
    uctmp = base + '.map.uc'
    ClusterComp = base + '.ASVs2clusters.txt'
    # Filter out ASVs in wrong orientation
    amptklib.log.info('Validating ASV orientation')
    os.rename(iSeqs, iSeqs + '.bak')
    numKept, numDropped = amptklib.validateorientationDADA2(
        OTUCounts, iSeqs + '.bak', iSeqs)
    amptklib.log.info('{:,} ASVs validated ({:,} dropped)'.format(
        numKept, numDropped))
    amptklib.SafeRemove(iSeqs + '.bak')
    # map reads to DADA2 OTUs
    amptklib.log.info("Mapping reads to DADA2 ASVs")
    cmd = ['vsearch', '--usearch_global', demuxtmp, '--db', iSeqs,
           '--id', '0.97', '--uc', dadademux, '--strand', 'plus',
           '--otutabout', chimeraFreeTable, '--threads', CORES]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.line_count2(dadademux)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to ASVs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))
    # cluster ASVs into biological OTUs at the requested identity
    amptklib.log.info("Clustering ASVs at %s%% to generate biological OTUs" %
                      args.pct_otu)
    radius = float(args.pct_otu) / 100.
    cmd = ['vsearch', '--cluster_smallmem', iSeqs, '--centroids', bioSeqs,
           '--id', str(radius), '--strand', 'plus', '--relabel', 'OTU',
           '--qmask', 'none', '--usersort', '--threads', CORES]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(bioSeqs)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs generated')
    # determine where iSeqs clustered
    iSeqmap = base + '.ASV_map.uc'
    cmd = ['vsearch', '--usearch_global', iSeqs, '--db', bioSeqs,
           '--id', str(radius), '--uc', iSeqmap, '--strand', 'plus',
           '--threads', CORES]
    amptklib.runSubprocess(cmd, amptklib.log)
    # parse the .uc file: column 9 = query (ASV), column 10 = target (OTU)
    iSeqMapped = {}
    with open(iSeqmap, 'r') as mapping:
        for line in mapping:
            line = line.replace('\n', '')
            cols = line.split('\t')
            OTU = cols[9]
            Hit = cols[8]
            if not OTU in iSeqMapped:
                iSeqMapped[OTU] = [Hit]
            else:
                iSeqMapped[OTU].append(Hit)
    with open(ClusterComp, 'w') as clusters:
        clusters.write('OTU\tASVs\n')
        for k, v in natsorted(list(iSeqMapped.items())):
            clusters.write('%s\t%s\n' % (k, ', '.join(v)))
    # create OTU table
    amptklib.log.info("Mapping reads to OTUs")
    cmd = ['vsearch', '--usearch_global', demuxtmp, '--db', bioSeqs,
           '--id', '0.97', '--uc', uctmp, '--strand', 'plus',
           '--otutabout', bioTable, '--threads', CORES]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.line_count2(uctmp)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))
    # clean up intermediates unless --debug
    if not args.debug:
        amptklib.removefile(no_ns)
        shutil.rmtree(filtfolder)
        amptklib.removefile(dada2out)
        amptklib.removefile(derep)
        amptklib.removefile(demuxtmp)
        amptklib.removefile(uctmp)
        amptklib.removefile(iSeqmap)
        amptklib.removefile(dadademux)
    # Print location of files to STDOUT
    print("-------------------------------------------------------")
    print("DADA2 Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % filtfolder)
    print("Amplicon sequence variants: %s" % iSeqs)
    print("ASV OTU Table: %s" % chimeraFreeTable)
    print("Clustered OTUs: %s" % bioSeqs)
    print("OTU Table: %s" % bioTable)
    print("ASVs 2 OTUs: %s" % ClusterComp)
    print("-------------------------------------------------------")
    otu_print = bioSeqs.split('/')[-1]
    tab_print = bioTable.split('/')[-1]
    # colorized hint only on macOS terminals
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n" %
              (tab_print, otu_print))
    else:
        print("\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
              % (tab_print, otu_print))
def main(args):
    """CLI entry point for demultiplexing raw 3-file Illumina runs.

    Takes R1/R2 read FASTQs plus a separate index-read FASTQ, maps indexes
    to samples (from a mapping file or barcode FASTA), strips primers,
    merges pairs, and writes a single gzipped, relabeled demux FASTQ plus
    a generic mapping file.

    Fix over the original: the temporary demux concatenation opened the
    output in binary mode ('wb') while reading each chunk in text mode
    ('r'); shutil.copyfileobj then writes str into a bytes stream, which
    raises TypeError on Python 3.  Both files are now opened in text mode,
    matching the identical concatenation loop used by the ion-processing
    main in this file.

    :param args: raw CLI argument list (as passed to argparse), e.g. sys.argv[1:]
    """
    global FwdPrimer, RevPrimer, Barcodes, tmpdir, usearch
    parser = argparse.ArgumentParser(
        prog='amptk-process_illumina_raw.py',
        usage="%(prog)s [options] -i file.fastq\n%(prog)s -h for help menu",
        description='''Script finds barcodes, strips forward and reverse primers, relabels, and then trim/pads reads to a set length''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-f', '--forward', dest='fastq', required=True,
                        help='Illumina FASTQ R1 reads')
    parser.add_argument('-r', '--reverse', required=True,
                        help='Illumina FASTQ R2 reads')
    parser.add_argument('-i', '--index', nargs='+', required=True,
                        help='Illumina FASTQ index reads')
    parser.add_argument('-m', '--mapping_file', help='QIIME-like mapping file')
    parser.add_argument('--read_length', type=int,
                        help='Read length, i.e. 2 x 300 bp = 300')
    parser.add_argument('-o', '--out', dest="out", default='illumina_out',
                        help='Base name for output')
    parser.add_argument('--fwd_primer', dest="F_primer", default='515FB',
                        help='Forward Primer')
    parser.add_argument('--rev_primer', dest="R_primer", default='806RB',
                        help='Reverse Primer')
    parser.add_argument('--primer_mismatch', default=2, type=int,
                        help='Number of mis-matches in primer')
    parser.add_argument('--barcode_mismatch', default=0, type=int,
                        help='Number of mis-matches in barcode')
    parser.add_argument('--barcode_fasta',
                        help='FASTA file containing Barcodes (Names & Sequences)')
    parser.add_argument('--rescue_forward', default='on',
                        choices=['on', 'off'],
                        help='Rescue Not-merged forward reads')
    parser.add_argument('--barcode_rev_comp', action='store_true',
                        help='Reverse complement barcode sequences')
    parser.add_argument('--min_len', default=100, type=int,
                        help='Minimum read length to keep')
    parser.add_argument('-l', '--trim_len', default=300, type=int,
                        help='Trim length for reads')
    parser.add_argument('-p', '--pad', default='off', choices=['on', 'off'],
                        help='Pad with Ns to a set length')
    parser.add_argument('--cpus', type=int,
                        help="Number of CPUs. Default: auto")
    parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--cleanup', action='store_true',
                        help='remove intermediate files')
    parser.add_argument('--merge_method', default='usearch',
                        choices=['usearch', 'vsearch'],
                        help='Software to use for PE read merging')
    args = parser.parse_args(args)

    # sanitize output base name (used in file names)
    args.out = re.sub(r'\W+', '', args.out)
    log_name = args.out + '.amptk-demux.log'
    if os.path.isfile(log_name):
        os.remove(log_name)
    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    # initialize script, log system info and usearch version
    amptklib.SystemInfo()
    # get version of amptk
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)
    # get number of CPUs to use
    if not args.cpus:
        cpus = multiprocessing.cpu_count()
    else:
        cpus = args.cpus
    # create tmpdir (PID-suffixed so concurrent runs don't collide)
    tmpdir = args.out.split('.')[0] + '_' + str(os.getpid())
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)
    # parse a mapping file or a barcode fasta file, primers, etc get setup
    # dealing with Barcodes, get ion barcodes or parse the barcode_fasta argument
    barcode_file = args.out + ".barcodes_used.fa"
    if os.path.isfile(barcode_file):
        os.remove(barcode_file)
    # check if mapping file passed, use this if present, otherwise use
    # command line arguments
    SampleData = {}
    Barcodes = {}
    RevBarcodes = {}
    FwdPrimer = ''
    RevPrimer = ''
    if args.mapping_file:
        if not os.path.isfile(args.mapping_file):
            amptklib.log.error("Mapping file not found: %s" % args.mapping_file)
            sys.exit(1)
        SampleData, Barcodes, RevBarcodes, FwdPrimer, RevPrimer = \
            amptklib.parseMappingFileNEW(args.mapping_file)
    else:
        # no mapping file, so create dictionaries from barcode fasta files
        if not args.barcode_fasta:
            amptklib.log.error(
                "You did not specify a --barcode_fasta or --mapping_file, one is required"
            )
            sys.exit(1)
        else:
            shutil.copyfile(args.barcode_fasta, barcode_file)
            Barcodes = amptklib.fasta2barcodes(barcode_file, False)
    if FwdPrimer == '' or RevPrimer == '':
        # parse primers here so doesn't conflict with mapping primers
        # look up primer db otherwise default to entry
        if args.F_primer in amptklib.primer_db:
            FwdPrimer = amptklib.primer_db.get(args.F_primer)
            amptklib.log.info(
                "{:} fwd primer found in AMPtk primer db, setting to: {:}".
                format(args.F_primer, FwdPrimer))
        else:
            FwdPrimer = args.F_primer
            amptklib.log.info(
                "{:} fwd primer not found in AMPtk primer db, assuming it is actual primer sequence."
                .format(args.F_primer))
        if args.R_primer in amptklib.primer_db:
            RevPrimer = amptklib.primer_db.get(args.R_primer)
            amptklib.log.info(
                "{:} rev primer found in AMPtk primer db, setting to: {:}".
                format(args.R_primer, RevPrimer))
        else:
            RevPrimer = args.R_primer
            amptklib.log.info(
                "{:} rev primer not found in AMPtk primer db, assuming it is actual primer sequence."
                .format(args.R_primer))
    # if still no primers set, then exit
    if FwdPrimer == '' or RevPrimer == '':
        amptklib.log.error(
            "Please provide primer sequences via --fwd_primer and --rev_primer"
        )
        sys.exit(1)
    # if barcodes_rev_comp passed then reverse complement the keys in mapdict
    if args.barcode_rev_comp:
        amptklib.log.info("Reverse complementing barcode sequences")
        backupDict = Barcodes
        Barcodes = {}
        for k, v in list(backupDict.items()):
            RCkey = amptklib.RevComp(v)
            Barcodes[k] = RCkey
    amptklib.log.info("Loading %i samples from mapping file" % len(Barcodes))
    amptklib.log.info('FwdPrimer: {:} RevPrimer: {:}'.format(
        FwdPrimer, RevPrimer))
    amptklib.log.info(
        'Dropping reads less than {:} bp and setting lossless trimming to {:} bp.'
        .format(args.min_len, args.trim_len))
    # rename reads according to indexes; check all 3 files are same length
    if not amptklib.PEandIndexCheck(args.fastq, args.reverse, args.index[0]):
        amptklib.log.error("FASTQ input malformed, read numbers do not match")
        sys.exit(1)
    amptklib.log.info("Loading FASTQ Records")
    NumSeqs = amptklib.countfastq(args.fastq)
    if cpus > 1:
        amptklib.log.info("Splitting FASTQ files over {:} cpus".format(cpus))
        amptklib.split_fastqPEandI(args.fastq, args.reverse, args.index[0],
                                   NumSeqs, tmpdir, cpus * 2)
        # collect unique chunk prefixes (strip the _R1/_R2/_R3 suffix)
        file_list = []
        for file in os.listdir(tmpdir):
            if file.endswith('.fq'):
                filepart = os.path.join(tmpdir, file.split('_R')[0])
                if not filepart in file_list:
                    file_list.append(filepart)
        amptklib.log.info("Mapping indexes to reads and renaming PE reads")
        amptklib.runMultiProgress(safe_run, file_list, cpus, args=args)
    else:
        amptklib.log.info("Mapping indexes to reads and renaming PE reads")
        shutil.copyfile(args.fastq, os.path.join(tmpdir, 'chunk_R1.fq'))
        shutil.copyfile(args.reverse, os.path.join(tmpdir, 'chunk_R2.fq'))
        shutil.copyfile(args.index[0], os.path.join(tmpdir, 'chunk_R3.fq'))
        processReadsPE(os.path.join(tmpdir, 'chunk'), args=args)
    print("-------------------------------------------------------")
    # Now concatenate all of the demuxed files together
    amptklib.log.info("Concatenating Demuxed Files")
    tmpDemux = os.path.join(tmpdir, args.out + '.demux.fq')
    # FIX: open both files in text mode; the original used 'wb' with a
    # text-mode reader, which makes shutil.copyfileobj raise TypeError
    # (str written to a bytes stream) on Python 3.
    with open(tmpDemux, 'w') as outfile:
        for filename in glob.glob(os.path.join(tmpdir, '*.demux.fq')):
            if filename == tmpDemux:  # don't append the output to itself
                continue
            with open(filename, 'r') as readfile:
                shutil.copyfileobj(readfile, outfile)
    # parse the per-chunk stats files (one CSV line of 6 counters each)
    finalstats = [0, 0, 0, 0, 0, 0]
    for file in os.listdir(tmpdir):
        if file.endswith('.stats'):
            with open(os.path.join(tmpdir, file), 'r') as statsfile:
                line = statsfile.readline()
                line = line.replace('\n', '')
                newstats = line.split(',')
                newstats = [int(i) for i in newstats]
                for x, num in enumerate(newstats):
                    finalstats[x] += num
    # finally reindex output
    # last thing is to re-number of reads as it is possible they could
    # have same name from multiprocessor split
    Demux = args.out + '.demux.fq'
    amptklib.fastqreindex(tmpDemux, Demux)
    amptklib.SafeRemove(tmpDemux)
    # output stats of the run
    amptklib.log.info('{0:,}'.format(finalstats[0]) + ' total reads')
    amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1]) +
                      ' discarded no index match')
    amptklib.log.info('{0:,}'.format(finalstats[2]) +
                      ' Fwd Primer found, {0:,}'.format(finalstats[3]) +
                      ' Rev Primer found')
    amptklib.log.info('{0:,}'.format(finalstats[4]) +
                      ' discarded too short (< %i bp)' % args.min_len)
    amptklib.log.info('{0:,}'.format(finalstats[5]) + ' valid output reads')
    # now loop through data and find barcoded samples, counting each.....
    # every 4th line is a FASTQ header; sample label follows 'barcodelabel='
    BarcodeCount = {}
    with open(Demux, 'r') as input:
        header = itertools.islice(input, 0, None, 4)
        for line in header:
            ID = line.split("=", 1)[-1].split(";")[0]
            if ID not in BarcodeCount:
                BarcodeCount[ID] = 1
            else:
                BarcodeCount[ID] += 1
    # now let's count the barcodes found and count the number of times
    # they are found.
    barcode_counts = "%30s: %s" % ('Sample', 'Count')
    for k, v in natsorted(list(BarcodeCount.items()),
                          key=lambda k_v: k_v[1],
                          reverse=True):
        barcode_counts += "\n%30s: %s" % (k, str(BarcodeCount[k]))
    amptklib.log.info("Found %i barcoded samples\n%s" %
                      (len(BarcodeCount), barcode_counts))
    # create mapping file if one doesn't exist
    genericmapfile = args.out + '.mapping_file.txt'
    amptklib.CreateGenericMappingFile(Barcodes, {}, FwdPrimer, RevPrimer,
                                      genericmapfile, BarcodeCount)
    # compress the output to save space
    FinalDemux = Demux + '.gz'
    amptklib.Fzip(Demux, FinalDemux, cpus)
    amptklib.removefile(Demux)
    if args.cleanup:
        amptklib.SafeRemove(tmpdir)
    # get file size
    filesize = os.path.getsize(FinalDemux)
    readablesize = amptklib.convertSize(filesize)
    amptklib.log.info("Output file: %s (%s)" % (FinalDemux, readablesize))
    amptklib.log.info("Mapping file: %s" % genericmapfile)
    print("-------------------------------------------------------")
    if 'darwin' in sys.platform:
        print(col.WARN + "\nExample of next cmd: " + col.END +
              "amptk cluster -i %s -o out\n" % (FinalDemux))
    else:
        print("\nExample of next cmd: amptk cluster -i %s -o out\n" %
              (FinalDemux))
def main(args): global FwdPrimer, RevPrimer, Barcodes, tmpdir parser = argparse.ArgumentParser( prog='amptk-process_ion.py', usage="%(prog)s [options] -i file.fastq\n%(prog)s -h for help menu", description= '''Script finds barcodes, strips forward and reverse primers, relabels, and then trim/pads reads to a set length''', epilog="""Written by Jon Palmer (2015) [email protected]""", formatter_class=MyFormatter) parser.add_argument('-i', '--fastq', '--sff', '--fasta', '--bam', dest='fastq', required=True, help='BAM/FASTQ/SFF/FASTA file') parser.add_argument('-q', '--qual', help='QUAL file (if -i is FASTA)') parser.add_argument('-o', '--out', dest="out", default='ion', help='Base name for output') parser.add_argument('-f', '--fwd_primer', dest="F_primer", default='fITS7-ion', help='Forward Primer') parser.add_argument('-r', '--rev_primer', dest="R_primer", default='ITS4', help='Reverse Primer') parser.add_argument( '-m', '--mapping_file', help='Mapping file: QIIME format can have extra meta data columns') parser.add_argument('-p', '--pad', default='off', choices=['on', 'off'], help='Pad with Ns to a set length') parser.add_argument('--primer_mismatch', default=2, type=int, help='Number of mis-matches in primer') parser.add_argument('--barcode_mismatch', default=0, type=int, help='Number of mis-matches in barcode') parser.add_argument( '--barcode_fasta', default='ionxpress', help='FASTA file containing Barcodes (Names & Sequences)') parser.add_argument('--reverse_barcode', help='FASTA file containing 3 prime Barocdes') parser.add_argument('-b', '--list_barcodes', dest="barcodes", default='all', help='Enter Barcodes used separated by commas') parser.add_argument('--min_len', default=100, type=int, help='Minimum read length to keep') parser.add_argument('-l', '--trim_len', default=300, type=int, help='Trim length for reads') parser.add_argument( '--full_length', action='store_true', help='Keep only full length reads (no trimming/padding)') 
parser.add_argument('--mult_samples', dest="multi", default='False', help='Combine multiple samples (i.e. FACE1)') parser.add_argument('--ion', action='store_true', help='Input data is Ion Torrent') parser.add_argument('--454', action='store_true', help='Input data is 454') parser.add_argument('--cpus', type=int, help="Number of CPUs. Default: auto") parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9', help='USEARCH EXE') args = parser.parse_args(args) args.out = re.sub(r'\W+', '', args.out) log_name = args.out + '.amptk-demux.log' if os.path.isfile(log_name): os.remove(log_name) FNULL = open(os.devnull, 'w') amptklib.setupLogging(log_name) cmd_args = " ".join(sys.argv) + '\n' amptklib.log.debug(cmd_args) print("-------------------------------------------------------") #initialize script, log system info and usearch version amptklib.SystemInfo() #Do a version check usearch = args.usearch amptklib.versionDependencyChecks(usearch) #get number of CPUs to use if not args.cpus: cpus = multiprocessing.cpu_count() else: cpus = args.cpus #parse a mapping file or a barcode fasta file, primers, etc get setup #dealing with Barcodes, get ion barcodes or parse the barcode_fasta argument barcode_file = args.out + ".barcodes_used.fa" rev_barcode_file = args.out + '.revbarcodes_used.fa' amptklib.SafeRemove(barcode_file) amptklib.SafeRemove(rev_barcode_file) #check if mapping file passed, use this if present, otherwise use command line arguments SampleData = {} Barcodes = {} RevBarcodes = {} if args.mapping_file: if not os.path.isfile(args.mapping_file): amptklib.log.error("Mapping file not found: %s" % args.mapping_file) sys.exit(1) SampleData, Barcodes, RevBarcodes, FwdPrimer, RevPrimer = amptklib.parseMappingFileNEW( args.mapping_file) genericmapfile = args.mapping_file else: #no mapping file, so create dictionaries from barcode fasta files if args.barcode_fasta == 'ionxpress': #get script path and barcode file name pgm_barcodes = 
os.path.join(os.path.dirname(amptklib.__file__), 'DB', 'ionxpress_barcodes.fa') elif args.barcode_fasta == 'ioncode': pgm_barcodes = os.path.join(os.path.dirname(amptklib.__file__), 'DB', 'ioncode_barcodes.fa') if args.barcode_fasta == 'ionxpress' or args.barcode_fasta == 'ioncode': if args.barcodes == "all": if args.multi == 'False': shutil.copyfile(pgm_barcodes, barcode_file) else: with open(barcode_file, 'w') as barcodeout: with open(pgm_barcodes, 'r') as input: for rec in SeqIO.parse(input, 'fasta'): outname = args.multi + '.' + rec.id barcodeout.write(">%s\n%s\n" % (outname, rec.seq)) else: bc_list = args.barcodes.split(",") inputSeqFile = open(pgm_barcodes, "rU") SeqRecords = SeqIO.to_dict(SeqIO.parse(inputSeqFile, "fasta")) for rec in bc_list: name = "BC." + rec seq = SeqRecords[name].seq if args.multi != 'False': outname = args.multi + '.' + name else: outname = name outputSeqFile = open(barcode_file, "a") outputSeqFile.write(">%s\n%s\n" % (outname, seq)) outputSeqFile.close() inputSeqFile.close() else: #check for multi_samples and add if necessary if args.multi == 'False': shutil.copyfile(args.barcode_fasta, barcode_file) if args.reverse_barcode: shutil.copyfile(args.reverse_barcode, rev_barcode_file) else: with open(barcode_file, 'w') as barcodeout: with open(args.barcode_fasta, 'r') as input: for rec in SeqIO.parse(input, 'fasta'): outname = args.multi + '.' + rec.id barcodeout.write(">%s\n%s\n" % (outname, rec.seq)) if args.reverse_barcode: with open(rev_barcode_file, 'w') as barcodeout: with open(args.reverse_barcode, 'r') as input: for rec in SeqIO.parse(input, 'fasta'): outname = args.multi + '.' + rec.id barcodeout.write(">%s\n%s\n" % (outname, rec.seq)) #parse primers here so doesn't conflict with mapping primers #look up primer db otherwise default to entry if args.F_primer in amptklib.primer_db: FwdPrimer = amptklib.primer_db.get(args.F_primer) amptklib.log.info( "{:} fwd primer found in AMPtk primer db, setting to: {:}". 
format(args.F_primer, FwdPrimer)) else: FwdPrimer = args.F_primer amptklib.log.info( "{:} fwd primer not found in AMPtk primer db, assuming it is actual primer sequence." .format(args.F_primer)) if args.R_primer in amptklib.primer_db: RevPrimer = amptklib.primer_db.get(args.R_primer) amptklib.log.info( "{:} rev primer found in AMPtk primer db, setting to: {:}". format(args.R_primer, RevPrimer)) else: RevPrimer = args.R_primer amptklib.log.info( "{:} rev primer not found in AMPtk primer db, assuming it is actual primer sequence." .format(args.R_primer)) #check if input is compressed gzip_list = [] if args.fastq.endswith('.gz'): gzip_list.append(os.path.abspath(args.fastq)) if gzip_list: amptklib.log.info("Gzipped input files detected, uncompressing") for file in gzip_list: file_out = file.replace('.gz', '') amptklib.Funzip(file, file_out, cpus) args.fastq = args.fastq.replace('.gz', '') #if SFF file passed, convert to FASTQ with biopython if args.fastq.endswith(".sff"): if args.barcode_fasta == 'ionxpress': if not args.mapping_file: amptklib.log.error( "You did not specify a --barcode_fasta or --mapping_file, one is required for 454 data" ) sys.exit(1) amptklib.log.info("SFF input detected, converting to FASTQ") SeqIn = args.out + '.sff.extract.fastq' SeqIO.convert(args.fastq, "sff-trim", SeqIn, "fastq") elif args.fastq.endswith(".fas") or args.fastq.endswith( ".fasta") or args.fastq.endswith(".fa"): if not args.qual: amptklib.log.error( "FASTA input detected, however no QUAL file was given. 
You must have FASTA + QUAL files" ) sys.exit(1) else: if args.barcode_fasta == 'ionxpress': if not args.mapping_file: amptklib.log.error( "You did not specify a --barcode_fasta or --mapping_file, one is required for 454 data" ) sys.exit(1) SeqIn = args.out + '.fastq' amptklib.log.info("FASTA + QUAL detected, converting to FASTQ") amptklib.faqual2fastq(args.fastq, args.qual, SeqIn) elif args.fastq.endswith('.bam'): #so we can convert natively with pybam, however it is 10X slower than bedtools/samtools #since samtools is fastest, lets use that if exists, if not then bedtools, else default to pybam amptklib.log.info("Converting Ion Torrent BAM file to FASTQ") SeqIn = args.out + '.fastq' if amptklib.which('samtools'): cmd = ['samtools', 'fastq', '-@', str(cpus), args.fastq] amptklib.runSubprocess2(cmd, amptklib.log, SeqIn) else: if amptklib.which('bedtools'): cmd = [ 'bedtools', 'bamtofastq', '-i', args.fastq, '-fq', SeqIn ] amptklib.runSubprocess(cmd, amptklib.log) else: #default to pybam amptklib.bam2fastq(args.fastq, SeqIn) else: SeqIn = args.fastq #start here to process the reads, first reverse complement the reverse primer catDemux = args.out + '.demux.fq' origRevPrimer = RevPrimer RevPrimer = amptklib.RevComp(RevPrimer) amptklib.log.info("Foward primer: %s, Rev comp'd rev primer: %s" % (FwdPrimer, RevPrimer)) #then setup barcode dictionary if len(Barcodes) < 1: Barcodes = amptklib.fasta2barcodes(barcode_file, False) #setup for looking for reverse barcode if len(RevBarcodes) < 1 and args.reverse_barcode: if not os.path.isfile(args.reverse_barcode): amptklib.log.info("Reverse barcode is not a valid file, exiting") sys.exit(1) shutil.copyfile(args.reverse_barcode, rev_barcode_file) RevBarcodes = amptklib.fasta2barcodes(rev_barcode_file, True) #Count FASTQ records amptklib.log.info("Loading FASTQ Records") orig_total = amptklib.countfastq(SeqIn) size = amptklib.checkfastqsize(SeqIn) readablesize = amptklib.convertSize(size) 
amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')') #create tmpdir and split input into n cpus tmpdir = args.out.split('.')[0] + '_' + str(os.getpid()) if not os.path.exists(tmpdir): os.makedirs(tmpdir) amptklib.log.info( 'Dropping reads less than {:} bp and setting lossless trimming to {:} bp.' .format(args.min_len, args.trim_len)) if cpus > 1: #split fastq file amptklib.log.info("Splitting FASTQ files over {:} cpus".format(cpus)) amptklib.split_fastq(SeqIn, orig_total, tmpdir, cpus * 2) #now get file list from tmp folder file_list = [] for file in os.listdir(tmpdir): if file.endswith(".fq"): file = os.path.join(tmpdir, file) file_list.append(file) #finally process reads over number of cpus amptklib.runMultiProgress(processRead, file_list, cpus, args=args) else: shutil.copyfile(SeqIn, os.path.join(tmpdir, 'chunk.fq')) processRead(os.path.join(tmpdir, 'chunk.fq'), args=args) print("-------------------------------------------------------") #Now concatenate all of the demuxed files together amptklib.log.info("Concatenating Demuxed Files") tmpDemux = args.out + '.tmp.demux.fq' with open(tmpDemux, 'w') as outfile: for filename in glob.glob(os.path.join(tmpdir, '*.demux.fq')): if filename == tmpDemux: continue with open(filename, 'r') as readfile: shutil.copyfileobj(readfile, outfile) #parse the stats finalstats = [0, 0, 0, 0, 0, 0, 0] for file in os.listdir(tmpdir): if file.endswith('.stats'): with open(os.path.join(tmpdir, file), 'r') as statsfile: line = statsfile.readline() line = line.rstrip() newstats = line.split(',') newstats = [int(i) for i in newstats] for x, num in enumerate(newstats): finalstats[x] += num #clean up tmp folder shutil.rmtree(tmpdir) #last thing is to re-number of reads as it is possible they could have same name from multitprocessor split amptklib.fastqreindex(tmpDemux, catDemux) os.remove(tmpDemux) amptklib.log.info('{0:,}'.format(finalstats[0]) + ' total reads') if args.reverse_barcode: 
amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1] - finalstats[2] - finalstats[4]) + ' valid Fwd and Rev Barcodes') else: amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1]) + ' valid Barcode') amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1] - finalstats[2]) + ' Fwd Primer found, {0:,}'.format(finalstats[3]) + ' Rev Primer found') amptklib.log.info('{0:,}'.format(finalstats[5]) + ' discarded too short (< %i bp)' % args.min_len) amptklib.log.info('{0:,}'.format(finalstats[6]) + ' valid output reads') #now loop through data and find barcoded samples, counting each..... BarcodeCount = {} with open(catDemux, 'r') as input: header = itertools.islice(input, 0, None, 4) for line in header: ID = line.split("=", 1)[-1].split(";")[0] if ID not in BarcodeCount: BarcodeCount[ID] = 1 else: BarcodeCount[ID] += 1 #now let's count the barcodes found and count the number of times they are found. barcode_counts = "%22s: %s" % ('Sample', 'Count') for k, v in natsorted(list(BarcodeCount.items()), key=lambda k_v: k_v[1], reverse=True): barcode_counts += "\n%22s: %s" % (k, str(BarcodeCount[k])) amptklib.log.info("Found %i barcoded samples\n%s" % (len(BarcodeCount), barcode_counts)) #create a generic mappingfile for downstream processes genericmapfile = args.out + '.mapping_file.txt' if not args.mapping_file: amptklib.CreateGenericMappingFile(Barcodes, RevBarcodes, FwdPrimer, origRevPrimer, genericmapfile, BarcodeCount) else: amptklib.updateMappingFile(args.mapping_file, BarcodeCount, genericmapfile) #compress the output to save space FinalDemux = catDemux + '.gz' amptklib.Fzip(catDemux, FinalDemux, cpus) amptklib.removefile(catDemux) if gzip_list: for file in gzip_list: file = file.replace('.gz', '') amptklib.removefile(file) #get file size filesize = os.path.getsize(FinalDemux) readablesize = amptklib.convertSize(filesize) amptklib.log.info("Output file: %s (%s)" % (FinalDemux, readablesize)) amptklib.log.info("Mapping file: %s" % genericmapfile) 
print("-------------------------------------------------------") if 'darwin' in sys.platform: print(col.WARN + "\nExample of next cmd: " + col.END + "amptk cluster -i %s -o out\n" % (FinalDemux)) else: print("\nExample of next cmd: amptk cluster -i %s -o out\n" % (FinalDemux))