Example #1
def processReadsPE(input, args=False):
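    #worker run once per split chunk; expects <base>_R1.fq and <base>_R2.fq in
    #tmpdir, which (like the primer and barcode dicts) is a module-level global set in main()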
    base = os.path.basename(input)
    forward_reads = os.path.join(tmpdir, base+'_R1.fq')
    reverse_reads = os.path.join(tmpdir, base+'_R2.fq')
    orientR1 = os.path.join(tmpdir, base+'_R1.oriented.fq')
    orientR2 = os.path.join(tmpdir, base+'_R2.oriented.fq')
    trim_forward = os.path.join(tmpdir, base+'_R1.trimmed.fq')
    trim_reverse = os.path.join(tmpdir, base+'_R2.trimmed.fq')
    merged_reads = os.path.join(tmpdir, base+'.merged.fq')
    DemuxOut = os.path.join(tmpdir, base+'.demux.fq')
    StatsOut = os.path.join(tmpdir, base+'.stats')
    RL = amptklib.GuessRL(forward_reads)
    Total, Correct, Flip, Drop = amptklib.illuminaReorient(forward_reads, reverse_reads, FwdPrimer, RevPrimer, args.primer_mismatch, RL, orientR1, orientR2)
    amptklib.log.debug('Re-oriented PE reads for {:}: {:,} total, {:,} correct, {:,} flipped, {:,} dropped.'.format(base, Total, Correct, Flip, Drop))
    if args.barcode_not_anchored:
        amptklib.demuxIlluminaPE2(orientR1, orientR2, FwdPrimer, RevPrimer, SampleData, Barcodes, RevBarcodes, args.barcode_mismatch, args.primer_mismatch, trim_forward, trim_reverse, StatsOut)
    else:
        amptklib.demuxIlluminaPE(orientR1, orientR2, FwdPrimer, RevPrimer, SampleData, Barcodes, RevBarcodes, args.barcode_mismatch, args.primer_mismatch, trim_forward, trim_reverse, StatsOut)
    if args.full_length:
        amptklib.MergeReadsSimple(trim_forward, trim_reverse, '.', DemuxOut, args.min_len, usearch, 'off', args.merge_method)
    else:
        amptklib.MergeReadsSimple(trim_forward, trim_reverse, '.', merged_reads, args.min_len, usearch, 'on', args.merge_method)
        amptklib.losslessTrim(merged_reads, FwdPrimer, RevPrimer, args.primer_mismatch, args.trim_len, args.pad, args.min_len, DemuxOut) 
    amptklib.SafeRemove(orientR1)
    amptklib.SafeRemove(orientR2)
    amptklib.SafeRemove(forward_reads)
    amptklib.SafeRemove(reverse_reads)
    amptklib.SafeRemove(merged_reads)
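
The worker above relies on a naming contract with its driver (see Example #2): the input FASTQ pair is split into <tmpdir>/<base>_R1.fq and <base>_R2.fq chunks, and only the shared path prefix is passed in. A minimal, self-contained sketch of that contract; the folder and chunk names are hypothetical stand-ins for the values the driver generates:

import os

tmpdir = 'illumina2_12345'                  # hypothetical '<out>_<pid>' work folder
prefix = os.path.join(tmpdir, 'chunk_001')  # what the driver passes as 'input'
pair = [prefix + suffix for suffix in ('_R1.fq', '_R2.fq')]
print(pair)                                 # the two files processReadsPE will open
# processReadsPE(prefix, args=args)         # also requires the module globals to be set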
Example #2
def main(args):
	global FwdPrimer, RevPrimer, SampleData, Barcodes, RevBarcodes, tmpdir, usearch
	parser=argparse.ArgumentParser(prog='amptk-process_ion.py', usage="%(prog)s [options] -i file.fastq\n%(prog)s -h for help menu",
		description='''Script finds barcodes, strips forward and reverse primers, relabels, and then trims/pads reads to a set length''',
		epilog="""Written by Jon Palmer (2015) [email protected]""",
		formatter_class=MyFormatter)

	parser.add_argument('-i','--fastq', dest='fastq', required=True, help='FASTQ R1 file')
	parser.add_argument('--reverse', help='Illumina R2 reverse reads')
	parser.add_argument('-o','--out', dest="out", default='illumina2', help='Base name for output')
	parser.add_argument('-f','--fwd_primer', dest="F_primer", default='fITS7', help='Forward Primer')
	parser.add_argument('-r','--rev_primer', dest="R_primer", default='ITS4', help='Reverse Primer')
	parser.add_argument('-m','--mapping_file', help='Mapping file: QIIME format can have extra meta data columns')
	parser.add_argument('-p','--pad', default='off', choices=['on', 'off'], help='Pad with Ns to a set length')
	parser.add_argument('--primer_mismatch', default=2, type=int, help='Number of mismatches allowed in primer')
	parser.add_argument('--barcode_mismatch', default=0, type=int, help='Number of mismatches allowed in barcode')
	parser.add_argument('--barcode_fasta', help='FASTA file containing Barcodes (Names & Sequences)')
	parser.add_argument('--barcode_not_anchored', action='store_true', help='Barcodes (indexes) are not at start of reads')
	parser.add_argument('--reverse_barcode', help='FASTA file containing 3-prime barcodes')
	parser.add_argument('--min_len', default=100, type=int, help='Minimum read length to keep')
	parser.add_argument('-l','--trim_len', default=300, type=int, help='Trim length for reads')
	parser.add_argument('--full_length', action='store_true', help='Keep only full length reads (no trimming/padding)')
	parser.add_argument('--merge_method', default='usearch', choices=['usearch', 'vsearch'], help='Software to use for PE read merging')
	parser.add_argument('--cpus', type=int, help="Number of CPUs. Default: auto")
	parser.add_argument('-u','--usearch', dest="usearch", default='usearch9', help='USEARCH EXE')
	args=parser.parse_args(args)    

	args.out = re.sub(r'\W+', '', args.out)

	log_name = args.out + '.amptk-demux.log'
	if os.path.isfile(log_name):
		os.remove(log_name)
	FNULL = open(os.devnull, 'w')
	amptklib.setupLogging(log_name)
	cmd_args = " ".join(sys.argv)+'\n'
	amptklib.log.debug(cmd_args)
	print("-------------------------------------------------------")

	#initialize script, log system info and usearch version
	amptklib.SystemInfo()
	#Do a version check
	usearch = args.usearch
	amptklib.versionDependencyChecks(usearch)

	#get number of CPUs to use
	if not args.cpus:
		cpus = multiprocessing.cpu_count()
	else:
		cpus = args.cpus

	#parse a mapping file or a barcode fasta file to set up barcodes, primers, etc.
	barcode_file = args.out + ".barcodes_used.fa"
	rev_barcode_file = args.out + '.revbarcodes_used.fa'
	amptklib.SafeRemove(barcode_file)
	amptklib.SafeRemove(rev_barcode_file)

	#check if mapping file passed, use this if present, otherwise use command line arguments
	SampleData = {}
	Barcodes = {}
	RevBarcodes = {}
	FwdPrimer = ''
	RevPrimer = ''
	if args.mapping_file:
		if not os.path.isfile(args.mapping_file):
			amptklib.log.error("Mapping file not found: %s" % args.mapping_file)
			sys.exit(1)
		SampleData, Barcodes, RevBarcodes, FwdPrimer, RevPrimer = amptklib.parseMappingFileNEW(args.mapping_file)  
	else: #no mapping file, so create dictionaries from barcode fasta files
		if not args.barcode_fasta:
			amptklib.log.error("You did not specify a --barcode_fasta or --mapping_file, one is required")
			sys.exit(1)
		else:
			shutil.copyfile(args.barcode_fasta, barcode_file)
			Barcodes = amptklib.fasta2barcodes(barcode_file, False)
			if args.reverse_barcode:
				shutil.copyfile(args.reverse_barcode, rev_barcode_file)
				RevBarcodes = amptklib.fasta2barcodes(rev_barcode_file, False)                   
	
		#parse primers here so doesn't conflict with mapping primers
		#look up primer db otherwise default to entry
		if args.F_primer in amptklib.primer_db:
			FwdPrimer = amptklib.primer_db.get(args.F_primer)
			amptklib.log.info("{:} fwd primer found in AMPtk primer db, setting to: {:}".format(args.F_primer, FwdPrimer))
		else:
			FwdPrimer = args.F_primer
			amptklib.log.info("{:} fwd primer not found in AMPtk primer db, assuming it is actual primer sequence.".format(args.F_primer))
		if args.R_primer in amptklib.primer_db:
			RevPrimer = amptklib.primer_db.get(args.R_primer)
			amptklib.log.info("{:} rev primer found in AMPtk primer db, setting to: {:}".format(args.R_primer, RevPrimer))
		else:
			RevPrimer = args.R_primer
			amptklib.log.info("{:} rev primer not found in AMPtk primer db, assuming it is actual primer sequence.".format(args.R_primer))

	#check if input is compressed
	gzip_list = []
	if args.fastq.endswith('.gz'):
		gzip_list.append(os.path.abspath(args.fastq))
	if args.reverse:
		if args.reverse.endswith('.gz'):
			gzip_list.append(os.path.abspath(args.reverse))
	if gzip_list:
		amptklib.log.info("Gzipped input files detected, uncompressing")
		for file in gzip_list:
			file_out = file.replace('.gz', '')
			amptklib.Funzip(file, file_out, cpus)
		args.fastq = args.fastq.replace('.gz', '')
		if args.reverse:
			args.reverse = args.reverse.replace('.gz', '')

	#Count FASTQ records
	amptklib.log.info("Loading FASTQ Records")
	orig_total = amptklib.countfastq(args.fastq)
	size = amptklib.checkfastqsize(args.fastq)
	readablesize = amptklib.convertSize(size*2)
	amptklib.log.info('{:,} reads ({:})'.format(orig_total, readablesize))

	#output barcodes/samples
	amptklib.log.info('Searching for {:} forward barcodes and {:} reverse barcodes'.format(len(Barcodes), len(RevBarcodes)))

	#create tmpdir and split input into n cpus
	tmpdir = args.out.split('.')[0]+'_'+str(os.getpid())
	if not os.path.exists(tmpdir):
		os.makedirs(tmpdir)
	
	#tell the user how many cpus are being used
	amptklib.log.info("Splitting FASTQ files over {:} cpus".format(cpus))

	if args.reverse:
		amptklib.log.info("Demuxing PE Illumina reads; FwdPrimer: {:} RevPrimer: {:}".format(FwdPrimer, RevPrimer))
	else:
		amptklib.log.info("Demuxing SE Illumina reads; FwdPrimer: {:} RevPrimer: {:}".format(FwdPrimer, amptklib.RevComp(RevPrimer)))

	amptklib.log.info('Dropping reads less than {:} bp and setting lossless trimming to {:} bp.'.format(args.min_len, args.trim_len))

	if cpus > 1:
		if args.reverse:
			amptklib.split_fastqPE(args.fastq, args.reverse, orig_total, tmpdir, cpus*4)
			file_list = []
			for file in os.listdir(tmpdir):
				if file.endswith('.fq'):
					filepart = os.path.join(tmpdir, file.split('_R')[0])
					if not filepart in file_list:
						file_list.append(filepart)
			amptklib.runMultiProgress(processReadsPE, file_list, cpus, args=args)               
		else:
			#split fastq file
			amptklib.split_fastq(args.fastq, orig_total, tmpdir, cpus*4)    
			#now get file list from tmp folder
			file_list = []
			for file in os.listdir(tmpdir):
				if file.endswith(".fq"):
					file = os.path.join(tmpdir, file)
					file_list.append(file)
			#finally process reads over number of cpus
			amptklib.runMultiProgress(processRead, file_list, cpus, args=args)
	else:
		if args.reverse:
			shutil.copyfile(args.fastq, os.path.join(tmpdir, 'chunk_R1.fq'))
			shutil.copyfile(args.reverse, os.path.join(tmpdir, 'chunk_R2.fq'))
			processReadsPE(os.path.join(tmpdir, 'chunk'), args=args)
		else:
			shutil.copyfile(args.fastq, os.path.join(tmpdir, 'chunk.fq'))
			processRead(os.path.join(tmpdir, 'chunk.fq'), args=args)

	print("-------------------------------------------------------")
	#Now concatenate all of the demuxed files together
	amptklib.log.info("Concatenating Demuxed Files")

	tmpDemux = args.out + '.tmp.demux.fq'
	with open(tmpDemux, 'w') as outfile:
		for filename in glob.glob(os.path.join(tmpdir,'*.demux.fq')):
			if filename == tmpDemux:
				continue
			with open(filename, 'r') as readfile:
				shutil.copyfileobj(readfile, outfile)
	if args.reverse:
		#parse the stats
		finalstats = [0,0,0,0,0,0]
		for file in os.listdir(tmpdir):
			if file.endswith('.stats'):
				with open(os.path.join(tmpdir, file), 'r') as statsfile:
					line = statsfile.readline()
					line = line.rstrip()
					newstats = line.split(',')
					newstats = [int(i) for i in newstats]
					for x, num in enumerate(newstats):
						finalstats[x] += num
	
		amptklib.log.info('{0:,}'.format(finalstats[0])+' total reads')
		amptklib.log.info('{0:,}'.format(finalstats[0]-finalstats[1]-finalstats[3])+' valid Barcodes')
		amptklib.log.info('{0:,}'.format(finalstats[5])+' valid output reads (Barcodes and Primers)')
	else:
		#parse the stats
		finalstats = [0,0,0,0,0,0,0]
		for file in os.listdir(tmpdir):
			if file.endswith('.stats'):
				with open(os.path.join(tmpdir, file), 'r') as statsfile:
					line = statsfile.readline()
					line = line.rstrip()
					newstats = line.split(',')
					newstats = [int(i) for i in newstats]
					for x, num in enumerate(newstats):
						finalstats[x] += num
			
		amptklib.log.info('{0:,}'.format(finalstats[0])+' total reads')
		if args.reverse_barcode:
			amptklib.log.info('{0:,}'.format(finalstats[0]-finalstats[1]-finalstats[2]-finalstats[4])+' valid Fwd and Rev Barcodes')
		else:
			amptklib.log.info('{0:,}'.format(finalstats[0]-finalstats[1])+' valid Barcodes')
			amptklib.log.info('{0:,}'.format(finalstats[0]-finalstats[1]-finalstats[2])+' Fwd Primer found, {0:,}'.format(finalstats[3])+ ' Rev Primer found')
		amptklib.log.info('{0:,}'.format(finalstats[5])+' discarded too short (< %i bp)' % args.min_len)
		amptklib.log.info('{0:,}'.format(finalstats[6])+' valid output reads')


	#clean up tmp folder
	amptklib.SafeRemove(tmpdir)

	#last step is to re-number reads, since chunks from the multiprocessor split can have duplicate names
	catDemux = args.out+'.demux.fq'
	amptklib.fastqreindex(tmpDemux, catDemux)
	amptklib.SafeRemove(tmpDemux)
	#now loop through the demuxed data and count reads per barcoded sample
	BarcodeCount = {}
	with open(catDemux, 'r') as input:
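		#a FASTQ record is 4 lines, so stepping by 4 from line 0 visits every header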
		header = itertools.islice(input, 0, None, 4)
		for line in header:
			ID = line.split("=",1)[-1].split(";")[0]
			if ID not in BarcodeCount:
				BarcodeCount[ID] = 1
			else:
				BarcodeCount[ID] += 1

	#format the per-sample counts for the log
	barcode_counts = "%22s:  %s" % ('Sample', 'Count')
	for k,v in natsorted(list(BarcodeCount.items()), key=lambda k_v: k_v[1], reverse=True):
		barcode_counts += "\n%22s:  %s" % (k, str(v))
	amptklib.log.info("Found %i barcoded samples\n%s" % (len(BarcodeCount), barcode_counts))

	genericmapfile = args.out + '.mapping_file.txt'
	if not args.mapping_file:
		#create a generic mapping file for downstream processes
		amptklib.CreateGenericMappingFile(Barcodes, RevBarcodes, FwdPrimer, RevPrimer, genericmapfile, BarcodeCount)
	else:
		amptklib.updateMappingFile(args.mapping_file, BarcodeCount, genericmapfile)
	#compress the output to save space
	FinalDemux = catDemux+'.gz'
	amptklib.Fzip(catDemux, FinalDemux, cpus)
	amptklib.removefile(catDemux)
	if gzip_list:
		for file in gzip_list:
			file = file.replace('.gz', '')
			amptklib.removefile(file)

	#get file size
	filesize = os.path.getsize(FinalDemux)
	readablesize = amptklib.convertSize(filesize)
	amptklib.log.info("Output file:  %s (%s)" % (FinalDemux, readablesize))
	amptklib.log.info("Mapping file: %s" % genericmapfile)

	print("-------------------------------------------------------")
	if 'darwin' in sys.platform:
		print(col.WARN + "\nExample of next cmd: " + col.END + "amptk cluster -i %s -o out\n" % (FinalDemux))
	else:
		print("\nExample of next cmd: amptk cluster -i %s -o out\n" % (FinalDemux))
Example #3
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-assign_taxonomy.py',
        usage="%(prog)s [options] -f <FASTA File>",
        description='''assign taxonomy to OTUs''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--otu_table',
                        dest="otu_table",
                        help='Append Taxonomy to OTU table')
    parser.add_argument('-f', '--fasta', required=True, help='FASTA input')
    parser.add_argument('-o', '--out', help='Output file (FASTA)')
    parser.add_argument(
        '-m',
        '--mapping_file',
        help='Mapping file: QIIME format can have extra meta data columns')
    parser.add_argument(
        '--method',
        default='hybrid',
        choices=['utax', 'usearch', 'sintax', 'hybrid', 'rdp', 'blast'],
        help='Taxonomy method')
    parser.add_argument(
        '-d',
        '--db',
        help='Pre-installed Databases: [ITS,ITS1,ITS2,16S,LSU,COI]')
    parser.add_argument(
        '-t',
        '--taxonomy',
        help='Incorporate taxonomy calculated elsewhere, 2 column file')
    parser.add_argument('--fasta_db',
                        help='Alternative database of fasta sequences')
    parser.add_argument('--add2db',
                        help='Custom FASTA database to add to DB on the fly')
    parser.add_argument('--utax_db', help='UTAX Reference Database')
    parser.add_argument('--utax_cutoff',
                        default=0.8,
                        type=restricted_float,
                        help='UTAX confidence value threshold.')
    parser.add_argument('--usearch_db', help='USEARCH Reference Database')
    parser.add_argument('--usearch_cutoff',
                        default=0.7,
                        type=restricted_float,
                        help='USEARCH percent ID threshold.')
    parser.add_argument(
        '-r',
        '--rdp',
        dest='rdp',
        default='/Users/jon/scripts/rdp_classifier_2.10.1/dist/classifier.jar',
        help='Path to RDP Classifier')
    parser.add_argument('--rdp_db',
                        dest='rdp_tax',
                        default='fungalits_unite',
                        choices=[
                            '16srrna', 'fungallsu', 'fungalits_warcup',
                            'fungalits_unite'
                        ],
                        help='Training set for RDP Classifier')
    parser.add_argument('--rdp_cutoff',
                        default=0.8,
                        type=restricted_float,
                        help='RDP confidence value threshold')
    parser.add_argument('--local_blast', help='Path to local Blast DB')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH EXE')
    parser.add_argument('--tax_filter',
                        help='Retain only OTUs with match in OTU table')
    parser.add_argument('--sintax_cutoff',
                        default=0.8,
                        type=restricted_float,
                        help='SINTAX threshold.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep intermediate files')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))

    if not args.out:
        #get base name of files
        if 'filtered' in args.fasta:
            base = args.fasta.split(".filtered")[0]
        elif 'otu' in args.fasta:
            base = args.fasta.split('.otu')[0]
        else:
            base = args.fasta.split('.fa')[0]
    else:
        base = args.out

    #remove logfile if exists
    log_name = base + '.amptk-taxonomy.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cpus
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()

    #Setup DB locations and names, etc
    DBdir = os.path.join(parentdir, 'DB')
    DataBase = {
        'ITS1': (os.path.join(DBdir,
                              'ITS.udb'), os.path.join(DBdir, 'ITS1_UTAX.udb'),
                 os.path.join(DBdir, 'ITS_SINTAX.udb')),
        'ITS2': (os.path.join(DBdir,
                              'ITS.udb'), os.path.join(DBdir, 'ITS2_UTAX.udb'),
                 os.path.join(DBdir, 'ITS_SINTAX.udb')),
        'ITS': (os.path.join(DBdir,
                             'ITS.udb'), os.path.join(DBdir, 'ITS_UTAX.udb'),
                os.path.join(DBdir, 'ITS_SINTAX.udb')),
        '16S': (os.path.join(DBdir, '16S.udb'), os.path.join(DBdir, '16S.udb'),
                os.path.join(DBdir, '16S_SINTAX.udb')),
        'LSU': (os.path.join(DBdir,
                             'LSU.udb'), os.path.join(DBdir, 'LSU_UTAX.udb'),
                os.path.join(DBdir, 'LSU_SINTAX.udb')),
        'COI': (os.path.join(DBdir,
                             'COI.udb'), os.path.join(DBdir, 'COI_UTAX.udb'),
                os.path.join(DBdir, 'COI_SINTAX.udb'))
    }

    #get DB names up front
    if args.db in DataBase:
        utax_db = DataBase.get(args.db)[1]
        usearch_db = DataBase.get(args.db)[0]
        sintax_db = DataBase.get(args.db)[2]
        if not utax_db:
            utax_db = args.utax_db
        if not usearch_db:
            usearch_db = args.usearch_db
    else:
        utax_db = args.utax_db
        usearch_db = args.usearch_db
        if args.fasta_db:
            sintax_db = args.fasta_db
        else:
            sintax_db = args.usearch_db

    if args.method in ['hybrid', 'usearch', 'utax']:
        if not utax_db and not usearch_db and not args.fasta_db:
            amptklib.log.error(
                "You have not selected a database, need either --db, --utax_db, --usearch_db, or --fasta_db"
            )
            sys.exit(1)
        else:  #check that the DB exists
            if args.method == 'usearch' and usearch_db:
                if not amptklib.checkfile(usearch_db):
                    amptklib.log.error(
                        'USEARCH DB not found: {:}'.format(usearch_db))
                    amptklib.log.derror(
                        'Use `amptk install` to install pre-formatted databases or `amptk database` to create custom DB'
                    )
                    sys.exit(1)
            if args.method == 'sintax' and sintax_db:
                if not amptklib.checkfile(sintax_db):
                    amptklib.log.error(
                        'SINTAX DB not found: {:}'.format(sintax_db))
                    amptklib.log.derror(
                        'Use `amptk install` to install pre-formatted databases or `amptk database` to create custom DB'
                    )
                    sys.exit(1)
            if args.method == 'utax' and utax_db:
                if not amptklib.checkfile(utax_db):
                    amptklib.log.error(
                        'UTAX DB not found: {:}'.format(utax_db))
                    amptklib.log.error(
                        'Use `amptk install` to install pre-formatted databases or `amptk database` to create custom DB'
                    )
                    sys.exit(1)

    custom_db = None
    if args.add2db:  #user wants to add sequences to the usearch database on the fly, so the database must be rebuilt
        custom_db = base + '.custom_database.fa'
        if amptklib.checkfile(custom_db):
            amptklib.SafeRemove(custom_db)
        if args.db:  #this means that the fasta files need to be extracted
            amptklib.log.info("Adding {:} to the {:} database".format(
                os.path.basename(args.add2db), os.path.basename(usearch_db)))
            cmd = ['vsearch', '--udb2fasta', usearch_db, '--output', custom_db]
            amptklib.runSubprocess(cmd, amptklib.log)
            with open(custom_db, 'a') as outfile:
                with open(args.add2db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)
        elif args.fasta_db:
            amptklib.log.info("Adding {:} to the {:} database".format(
                os.path.basename(args.add2db),
                os.path.basename(args.fasta_db)))
            with open(custom_db, 'w') as outfile:
                with open(args.fasta_db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)
                with open(args.add2db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)

    #Count records
    amptklib.log.info("Loading FASTA Records")
    total = amptklib.countfasta(args.fasta)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs')

    #declare output files/variables here
    blast_out = base + '.blast.txt'
    rdp_out = base + '.rdp.txt'
    utax_out = base + '.utax.txt'
    usearch_out = base + '.usearch.txt'
    sintax_out = base + '.sintax.txt'
    otuDict = {}

    if not args.taxonomy:
        #start with the less common methods, i.e. BLAST and RDP
        if args.method == 'blast':
            #check if command line blast installed
            if not amptklib.which('blastn'):
                amptklib.log.error("BLASTN not found in your PATH, exiting.")
                sys.exit(1)

            #run BLAST locally if a database was given, otherwise remotely against the NCBI nt database
            outformat = "6 qseqid sseqid pident stitle"
            if args.local_blast:
                #get number of cpus
                amptklib.log.info("Running local BLAST using db: %s" %
                                  args.local_blast)
                cmd = [
                    'blastn', '-num_threads',
                    str(cpus), '-query', args.fasta, '-db',
                    os.path.abspath(args.local_blast), '-max_target_seqs', '1',
                    '-outfmt', outformat, '-out', blast_out
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:
                amptklib.log.info(
                    "Running BLASTN using NCBI remote nt database, this may take a while"
                )
                cmd = [
                    'blastn', '-query', args.fasta, '-db', 'nt', '-remote',
                    '-max_target_seqs', '1', '-outfmt', outformat, '-out',
                    blast_out
                ]
                amptklib.runSubprocess(cmd, amptklib.log)

            #load results and reformat
            new = []
            f = csv.reader(open(blast_out), delimiter=str('\t'))
            for col in f:
                query = col[0]
                gbID = col[1].split("|")[3]
                pident = col[2]
                name = col[3]
                tax = gbID + ";" + name + " (" + pident + ")"
                line = [query, tax]
                new.append(line)
            otuDict = dict(new)
        elif args.method == 'rdp':
            #check that classifier is installed
            try:
                rdp_test = subprocess.Popen(
                    ['java', '-Xmx2000m', '-jar', args.rdp, 'classify'],
                    stdout=subprocess.PIPE).communicate()[0].rstrip()
            except OSError:
                amptklib.log.error("%s not found in your PATH, exiting." %
                                   args.rdp)
                sys.exit(1)

            #RDP database
            amptklib.log.info("Using RDP classifier %s training set" %
                              args.rdp_tax)

            #run RDP
            cmd = [
                'java', '-Xmx2000m', '-jar', args.rdp, 'classify', '-g',
                args.rdp_tax, '-o', rdp_out, '-f', 'fixrank', args.fasta
            ]
            amptklib.runSubprocess(cmd, amptklib.log)

            #load in results and put into dictionary
            new = []
            removal = ["unidentified", "Incertae", "uncultured", "incertae"]
            remove_exp = [re.compile(x) for x in removal]
            f = csv.reader(open(rdp_out), delimiter=str('\t'))
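            #fixrank rows interleave name/rank/confidence per level: names at
            #cols 2,5,8,11,14,17 (k,p,c,o,f,g) and confidences at 4,7,10,13,16,19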
            for col in f:
                if float(col[19]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[
                        8] + ",o:" + col[11] + ",f:" + col[14] + ",g:" + col[17]
                elif float(col[16]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[
                        8] + ",o:" + col[11] + ",f:" + col[14]
                elif float(col[13]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[
                        8] + ",o:" + col[11]
                elif float(col[10]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[8]
                elif float(col[7]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5]
                elif float(col[4]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2]
                else:
                    tax = "RDP;k:unclassified"
                tax_split = tax.split(",")
                tax = [
                    s for s in tax_split
                    if not any(rx.search(s) for rx in remove_exp)
                ]
                tax = ",".join(tax)
                line = [col[0], tax]
                new.append(line)
            otuDict = dict(new)
        else:
            #check status of USEARCH DB and run
            if args.method in ['hybrid', 'usearch']:
                if args.fasta_db:
                    #now run through usearch global
                    amptklib.log.info(
                        "Global alignment OTUs with usearch_global (VSEARCH) against {:}"
                        .format(os.path.basename(args.fasta_db)))
                    cmd = [
                        'vsearch', '--usearch_global', args.fasta, '--db',
                        os.path.abspath(args.fasta_db), '--userout',
                        usearch_out, '--id',
                        str(args.usearch_cutoff), '--strand', 'both',
                        '--output_no_hits', '--maxaccepts', '0',
                        '--top_hits_only', '--userfields', 'query+target+id',
                        '--notrunclabels', '--threads',
                        str(cpus)
                    ]
                    amptklib.runSubprocess(cmd, amptklib.log)
                elif custom_db:
                    #now run through usearch global
                    amptklib.log.info(
                        "Global alignment OTUs with usearch_global (VSEARCH) against custom DB"
                    )
                    cmd = [
                        'vsearch', '--usearch_global', args.fasta, '--db',
                        os.path.abspath(custom_db), '--userout', usearch_out,
                        '--id',
                        str(args.usearch_cutoff), '--strand', 'both',
                        '--output_no_hits', '--maxaccepts', '0',
                        '--top_hits_only', '--userfields', 'query+target+id',
                        '--notrunclabels', '--threads',
                        str(cpus)
                    ]
                    amptklib.runSubprocess(cmd, amptklib.log)
                else:
                    if usearch_db:
                        amptklib.log.info(
                            "Global alignment OTUs with usearch_global (VSEARCH) against {:}"
                            .format(os.path.basename(usearch_db)))
                        cmd = [
                            'vsearch', '--usearch_global', args.fasta, '--db',
                            os.path.abspath(usearch_db), '--userout',
                            usearch_out, '--id',
                            str(args.usearch_cutoff), '--strand', 'both',
                            '--output_no_hits', '--maxaccepts', '0',
                            '--top_hits_only', '--userfields',
                            'query+target+id', '--notrunclabels', '--threads',
                            str(cpus)
                        ]
                        amptklib.runSubprocess(cmd, amptklib.log)

            if args.method in ['hybrid', 'utax']:
                if utax_db:
                    #now run through UTAX
                    utax_out = base + '.utax.txt'
                    amptklib.log.info("Classifying OTUs with UTAX (USEARCH)")
                    cutoff = str(args.utax_cutoff)
                    cmd = [
                        usearch, '-utax', args.fasta, '-db', utax_db,
                        '-utaxout', utax_out, '-utax_cutoff', cutoff,
                        '-strand', 'plus', '-notrunclabels', '-threads',
                        str(cpus)
                    ]
                    amptklib.runSubprocess(cmd, amptklib.log)
                else:
                    amptklib.log.error("UTAX DB %s not found, skipping" %
                                       utax_db)

            if args.method in ['hybrid', 'sintax']:
                if args.fasta_db:  #if you pass fasta file here, over ride any auto detection
                    sintax_db = args.fasta_db
                #now run sintax
                amptklib.log.info("Classifying OTUs with SINTAX (USEARCH)")
                cmd = [
                    usearch, '-sintax', args.fasta, '-db',
                    os.path.abspath(sintax_db), '-tabbedout', sintax_out,
                    '-sintax_cutoff',
                    str(args.sintax_cutoff), '-strand', 'both', '-threads',
                    str(cpus)
                ]
                amptklib.runSubprocess(cmd, amptklib.log)

            #now process results, load into dictionary - slightly different depending on which classification was run.
            if args.method == 'hybrid':
                #run upgraded method, first load dictionaries with results
                if amptklib.checkfile(utax_out):
                    utaxDict = amptklib.classifier2dict(
                        utax_out, args.utax_cutoff)
                    amptklib.log.debug(
                        'UTAX results parsed, resulting in {:,} taxonomy predictions'
                        .format(len(utaxDict)))
                else:
                    amptklib.log.info('UTAX results empty')
                    utaxDict = {}
                if amptklib.checkfile(sintax_out):
                    sintaxDict = amptklib.classifier2dict(
                        sintax_out, args.sintax_cutoff)
                    amptklib.log.debug(
                        'SINTAX results parsed, resulting in {:,} taxonomy predictions'
                        .format(len(sintaxDict)))
                else:
                    amptklib.log.info('SINTAX results empty')
                    sintaxDict = {}
                usearchDict = amptklib.usearchglobal2dict(usearch_out)
                amptklib.log.debug(
                    'Global alignment results parsed, resulting in {:,} taxonomy predictions'
                    .format(len(usearchDict)))
                otuList = natsorted(list(usearchDict.keys()))
                #first compare classifier results, getting better of the two
                bestClassify = amptklib.bestclassifier(utaxDict, sintaxDict,
                                                       otuList)
                #now get best taxonomy by comparing to global alignment results
                otuDict = amptklib.bestTaxonomy(usearchDict, bestClassify)
                amptklib.log.debug(
                    'Combined OTU taxonomy dictionary contains {:,} taxonomy predictions'
                    .format(len(otuDict)))
                if len(otuDict) < 1:
                    amptklib.log.error('Parsing taxonomy failed -- see logfile')
                    sys.exit(1)

            elif args.method == 'utax' and amptklib.checkfile(utax_out):
                #load results into dictionary for appending to OTU table
                amptklib.log.debug("Loading UTAX results into dictionary")
                with open(utax_out, 'r') as infile:
                    reader = csv.reader(infile, delimiter=str("\t"))
                    otuDict = {rows[0]: 'UTAX;' + rows[2] for rows in reader}

            elif args.method == 'usearch' and amptklib.checkfile(usearch_out):
                #load results into dictionary for appending to OTU table
                amptklib.log.debug(
                    "Loading Global Alignment results into dictionary")
                otuDict = {}
                usearchDict = amptklib.usearchglobal2dict(usearch_out)
                for k, v in natsorted(list(usearchDict.items())):
                    pident = float(v[0]) * 100
                    pident = "{0:.1f}".format(pident)
                    ID = v[1]
                    tax = ','.join(v[-1])
                    LCA = v[2]
                    if LCA == '':
                        fulltax = 'GS|' + pident + '|' + ID + ';' + tax
                    else:
                        fulltax = 'GSL|' + pident + '|' + ID + ';' + tax
                    otuDict[k] = fulltax

            elif args.method == 'sintax' and amptklib.checkfile(sintax_out):
                #load results into dictionary for appending to OTU table
                amptklib.log.debug("Loading SINTAX results into dictionary")
                with open(sintax_out, 'r') as infile:
                    reader = csv.reader(infile, delimiter=(str("\t")))
                    otuDict = {rows[0]: 'SINTAX;' + rows[3] for rows in reader}
    else:
        #you have supplied a two column taxonomy file, parse and build otuDict
        amptklib.log.debug("Loading custom Taxonomy into dictionary")
        with open(args.taxonomy, 'r') as infile:
            reader = csv.reader(infile, delimiter=str("\t"))
            otuDict = {rows[0]: rows[1] for rows in reader}

    #now format results
    if args.otu_table:
        #an OTU table was passed; load it and append taxonomy
        amptklib.log.info("Appending taxonomy to OTU table and OTUs")
        taxTable = base + '.otu_table.taxonomy.txt'
        tmpTable = base + '.otu_table.tmp'

        #append to OTU table
        counts = 0
        with open(taxTable, 'w') as outTable:
            with open(args.otu_table, 'r') as inTable:
                #guess the delimiter format
                firstline = inTable.readline()
                dialect = amptklib.guess_csv_dialect(firstline)
                inTable.seek(0)
                #parse OTU table
                reader = csv.reader(inTable, dialect)
                for line in reader:
                    if line[0].startswith(("#OTU", "OTUId")):
                        line.append('Taxonomy')
                    else:
                        tax = otuDict.get(line[0]) or "No Hit"
                        line.append(tax)
                    if args.tax_filter and not args.method == 'blast':
                        if line[0].startswith(("#OTU", "OTUId")):
                            join_line = ('\t'.join(str(x) for x in line))
                        else:
                            if args.tax_filter in line[-1]:
                                join_line = ('\t'.join(str(x) for x in line))
                                counts += 1
                            else:
                                continue
                    else:
                        join_line = ('\t'.join(str(x) for x in line))
                        counts += 1
                    outTable.write("%s\n" % join_line)

        if args.tax_filter:
            if args.method == 'blast':
                amptklib.log.info(
                    "Blast is incompatible with --tax_filter, use a different method"
                )
                tmpTable = args.otu_table
            else:
                nonfungal = total - counts
                amptklib.log.info(
                    "Found %i OTUs not matching %s, writing %i %s hits to taxonomy OTU table"
                    % (nonfungal, args.tax_filter, counts, args.tax_filter))
                #need to create a filtered table without taxonomy for BIOM output
                with open(tmpTable, 'w') as output:
                    with open(taxTable, 'r') as input:
                        firstline = input.readline()
                        dialect = amptklib.guess_csv_dialect(firstline)
                        input.seek(0)
                        #parse OTU table
                        reader = csv.reader(input, dialect)
                        for line in reader:
                            del line[-1]
                            join_line = '\t'.join(str(x) for x in line)
                            output.write("%s\n" % join_line)
        else:
            tmpTable = args.otu_table

    #append to OTUs
    otuTax = base + '.otus.taxonomy.fa'
    with open(otuTax, 'w') as output:
        with open(args.fasta, 'r') as input:
            SeqRecords = SeqIO.parse(input, 'fasta')
            for rec in SeqRecords:
                tax = otuDict.get(rec.id) or "No hit"
                rec.description = tax
                SeqIO.write(rec, output, 'fasta')

    if not args.taxonomy:
        #output final taxonomy in two-column format, followed by the hits for usearch/sintax/utax if hybrid is used.
        taxFinal = base + '.taxonomy.txt'
        with open(taxFinal, 'w') as finaltax:
            if args.method == 'hybrid':
                finaltax.write('#OTUID\ttaxonomy\tUSEARCH\tSINTAX\tUTAX\n')
                for k, v in natsorted(list(otuDict.items())):
                    if k in usearchDict:
                        usearchResult = usearchDict.get(k)
                        usearchResult = ','.join(usearchResult[-1])
                    else:
                        usearchResult = 'No hit'
                    if k in sintaxDict:
                        sintaxResult = sintaxDict.get(k)
                        sintaxResult = ','.join(sintaxResult[-1])
                    else:
                        sintaxResult = 'No hit'
                    if k in utaxDict:
                        utaxResult = utaxDict.get(k)
                        utaxResult = ','.join(utaxResult[-1])
                    else:
                        utaxResult = 'No hit'
                    finaltax.write('{:}\t{:}\t{:}\t{:}\t{:}\n'.format(
                        k, v, usearchResult, sintaxResult, utaxResult))
            else:
                finaltax.write('#OTUID\ttaxonomy\n')
                for k, v in natsorted(list(otuDict.items())):
                    finaltax.write('%s\t%s\n' % (k, v))
    else:
        taxFinal = args.taxonomy
    #convert taxonomy to qiime format for biom
    qiimeTax = None
    if not args.method == 'blast':
        qiimeTax = base + '.qiime.taxonomy.txt'
        amptklib.utax2qiime(taxFinal, qiimeTax)
    else:
        amptklib.log.error(
            "Blast taxonomy is not compatible with BIOM output, use a different method"
        )

    #create OTU phylogeny for downstream processes
    amptklib.log.info("Generating phylogenetic tree")
    tree_out = base + '.tree.phy'
    cmd = [usearch, '-cluster_agg', args.fasta, '-treeout', tree_out]
    amptklib.runSubprocess(cmd, amptklib.log)

    #print some summary file locations
    amptklib.log.info("Taxonomy finished: %s" % taxFinal)
    if args.otu_table and not args.method == 'blast':
        amptklib.log.info("Classic OTU table with taxonomy: %s" % taxTable)
        #output final OTU table in BIOM v1.0 (JSON) format if biom is installed
        outBiom = base + '.biom'
        if amptklib.which('biom'):
            amptklib.removefile(outBiom)
            cmd = [
                'biom', 'convert', '-i', tmpTable, '-o', outBiom + '.tmp',
                '--table-type', "OTU table", '--to-json'
            ]
            amptklib.runSubprocess(cmd, amptklib.log)
            if args.mapping_file:
                mapSamples = []
                repeatSamples = []
                with open(args.mapping_file, 'r') as mapin:
                    for line in mapin:
                        line = line.rstrip()
                        if line.startswith('#'):
                            continue
                        sampleID = line.split('\t')[0]
                        if not sampleID in mapSamples:
                            mapSamples.append(sampleID)
                        else:
                            repeatSamples.append(sampleID)
                otuSamples = []
                with open(tmpTable, 'r') as otuin:
                    for line in otuin:
                        line = line.rstrip()
                        if line.startswith('#'):
                            otuSamples = line.split('\t')[1:]
                missingMap = []
                for otu in otuSamples:
                    if not otu in mapSamples:
                        missingMap.append(otu)
                if len(missingMap) > 0:
                    amptklib.log.error(
                        "%s are missing from mapping file (metadata), skipping biom file creation"
                        % ', '.join(missingMap))
                elif len(repeatSamples) > 0:
                    amptklib.log.error(
                        '%s duplicate sample IDs in mapping file, skipping biom file creation'
                        % ', '.join(repeatSamples))
                else:
                    if qiimeTax:
                        cmd = [
                            'biom', 'add-metadata', '-i', outBiom + '.tmp',
                            '-o', outBiom, '--observation-metadata-fp',
                            qiimeTax, '-m', args.mapping_file,
                            '--sc-separated', 'taxonomy', '--output-as-json'
                        ]
                    else:
                        cmd = [
                            'biom', 'add-metadata', '-i', outBiom + '.tmp',
                            '-o', outBiom, '-m', args.mapping_file,
                            '--output-as-json'
                        ]
                    amptklib.runSubprocess(cmd, amptklib.log)
            else:
                cmd = [
                    'biom', 'add-metadata', '-i', outBiom + '.tmp', '-o',
                    outBiom, '--observation-metadata-fp', qiimeTax,
                    '--sc-separated', 'taxonomy', '--output-as-json'
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            amptklib.removefile(outBiom + '.tmp')
            amptklib.log.info("BIOM OTU table created: %s" % outBiom)
        else:
            amptklib.log.info(
                "biom program not installed, install via `pip install biom-format` or `conda install biom-format`"
            )
    amptklib.log.info("OTUs with taxonomy: %s" % otuTax)
    amptklib.log.info("OTU phylogeny: %s" % tree_out)

    #clean up intermediate files
    if not args.debug:
        for i in [
                utax_out, usearch_out, sintax_out, qiimeTax,
                base + '.otu_table.tmp'
        ]:
            if i:
                amptklib.removefile(i)
    print("-------------------------------------------------------")
Example #4
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-dada2.py',
        description=
        '''Script takes output from amptk pre-processing and runs DADA2''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--fastq',
                        required=True,
                        help='Input demuxed FASTQ')
    parser.add_argument('-o', '--out', help='Output Basename')
    parser.add_argument(
        '-m',
        '--min_reads',
        default=10,
        type=int,
        help="Minimum number of reads after Q filtering to run DADA2 on")
    parser.add_argument('-l',
                        '--length',
                        type=int,
                        help='Length to truncate reads')
    parser.add_argument('-e',
                        '--maxee',
                        default='1.0',
                        help='MaxEE quality filtering')
    parser.add_argument('-p',
                        '--pct_otu',
                        default='97',
                        help="Biological OTU Clustering Percent")
    parser.add_argument('--platform',
                        default='ion',
                        choices=['ion', 'illumina', '454'],
                        help='Sequencing platform')
    parser.add_argument('--chimera_method',
                        default='consensus',
                        choices=['consensus', 'pooled', 'per-sample'],
                        help='bimera removal method')
    parser.add_argument('--uchime_ref',
                        help='Run UCHIME REF [ITS,16S,LSU,COI,custom]')
    parser.add_argument('--pool',
                        action='store_true',
                        help='Pool all sequences together for DADA2')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep all intermediate files')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))
    dada2script = os.path.join(parentdir, 'dada2_pipeline_nofilt.R')

    #get basename if args.out was not passed
    if args.out:
        base = args.out
    else:
        if 'demux' in args.fastq:
            base = os.path.basename(args.fastq).split('.demux')[0]
        else:
            base = os.path.basename(args.fastq).split('.f')[0]

    #remove logfile if exists
    log_name = base + '.amptk-dada2.log'
    if os.path.isfile(log_name):
        amptklib.removefile(log_name)

    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cores
    if args.cpus:
        CORES = str(args.cpus)
    else:
        CORES = str(amptklib.getCPUS())

    #check dependencies
    programs = ['Rscript']
    amptklib.CheckDependencies(programs)
    Rversions = amptklib.checkRversion()
    R_pass = '******'
    dada2_pass = '******'

    #check dada2 first, if good move on, otherwise issue warning
    if not amptklib.gvc(Rversions[1], dada2_pass):
        amptklib.log.error("R v%s; DADA2 v%s detected, need atleast v%s" %
                           (Rversions[0], Rversions[1], dada2_pass))
        amptklib.log.error(
            "See: http://benjjneb.github.io/dada2/dada-installation.html")
        sys.exit(1)
    amptklib.log.info("R v%s; DADA2 v%s" % (Rversions[0], Rversions[1]))

    #Count FASTQ records and remove 3' N's as dada2 can't handle them
    amptklib.log.info("Loading FASTQ Records")
    no_ns = base + '.cleaned_input.fq'
    if args.fastq.endswith('.gz'):
        fastqInput = args.fastq.replace('.gz', '')
        amptklib.Funzip(os.path.abspath(args.fastq),
                        os.path.basename(fastqInput), CORES)
    else:
        fastqInput = os.path.abspath(args.fastq)
    amptklib.fastq_strip_padding(os.path.basename(fastqInput), no_ns)
    demuxtmp = base + '.original.fa'
    cmd = [
        'vsearch', '--fastq_filter',
        os.path.abspath(no_ns), '--fastq_qmax', '55', '--fastaout', demuxtmp,
        '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(demuxtmp)
    size = amptklib.checkfastqsize(no_ns)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize +
                      ')')

    #quality filter
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    derep = base + '.qual-filtered.fq'
    filtercmd = [
        'vsearch', '--fastq_filter', no_ns, '--fastq_maxee',
        str(args.maxee), '--fastqout', derep, '--fastq_qmax', '55',
        '--fastq_maxns', '0', '--threads', CORES
    ]
    amptklib.runSubprocess(filtercmd, amptklib.log)
    total = amptklib.countfastq(derep)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #split into individual files
    amptklib.log.info("Splitting FASTQ file by Sample into individual files")
    filtfolder = base + '_filtered'
    if os.path.isdir(filtfolder):
        shutil.rmtree(filtfolder)
    os.makedirs(filtfolder)
    splitDemux2(derep, filtfolder, args=args)

    #check for minimum number of reads in each sample
    remove = []
    files = [i for i in os.listdir(filtfolder) if i.endswith('.fastq')]
    for x in files:
        if amptklib.countfastq(os.path.join(filtfolder, x)) < args.min_reads:
            remove.append(x)
    if len(remove) > 0:
        amptklib.log.info("Dropping %s as fewer than %i reads" %
                          (', '.join(remove), args.min_reads))
        for y in remove:
            os.remove(os.path.join(filtfolder, y))

    #now run DADA2 on filtered folder
    amptklib.log.info("Running DADA2 pipeline")
    dada2log = base + '.dada2.Rscript.log'
    dada2out = base + '.dada2.csv'
    #check pooled vs not pooled; default is not pooled
    if args.pool:
        POOL = 'TRUE'
    else:
        POOL = 'FALSE'
    with open(dada2log, 'w') as logfile:
        subprocess.call([
            'Rscript', '--vanilla', dada2script, filtfolder, dada2out,
            args.platform, POOL, CORES, args.chimera_method
        ],
                        stdout=logfile,
                        stderr=logfile)

    #check for results
    if not os.path.isfile(dada2out):
        amptklib.log.error("DADA2 run failed, please check %s logfile" %
                           dada2log)
        sys.exit(1)

    #now process the output, pull out fasta, rename, etc
    fastaout = base + '.otus.tmp'
    OTUCounts = {}
    counter = 1
    with open(fastaout, 'w') as writefasta:
        with open(dada2out, 'r') as input:
            next(input)
            for line in input:
                line = line.replace('\n', '')
                line = line.replace('"', '')
                cols = line.split(',')
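                #DADA2 table rows: first field is the ASV sequence itself,
                #remaining fields are per-sample read counts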
                Seq = cols[0]
                countList = [int(x) for x in cols[1:]]
                counts = sum(countList)
                ID = 'ASV' + str(counter)
                if not ID in OTUCounts:
                    OTUCounts[ID] = counts
                writefasta.write(">%s\n%s\n" % (ID, Seq))
                counter += 1

    #get number of bimeras from logfile
    with open(dada2log, 'r') as bimeracheck:
        for line in bimeracheck:
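            #expects a removeBimeraDenovo message like
            #'Identified 12 bimeras out of 340 input sequences.'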
            if line.startswith('Identified '):
                bimeraline = line.split(' ')
                bimeras = int(bimeraline[1])
                totalSeqs = int(bimeraline[5])
    validSeqs = totalSeqs - bimeras
    amptklib.log.info('{0:,}'.format(totalSeqs) +
                      ' total amplicon sequence variants (ASVs)')
    amptklib.log.info('{0:,}'.format(bimeras) + ' denovo chimeras removed')
    amptklib.log.info('{0:,}'.format(validSeqs) + ' valid ASVs')

    #optional UCHIME Ref
    uchime_out = base + '.nonchimeras.fa'
    chimeraFreeTable = base + '.otu_table.txt'
    iSeqs = base + '.ASVs.fa'
    if not args.uchime_ref:
        os.rename(fastaout, iSeqs)
    else:
        #check if file is present, remove from previous run if it is.
        if os.path.isfile(iSeqs):
            amptklib.removefile(iSeqs)
        #R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy
        if args.uchime_ref in [
                'ITS', '16S', 'LSU', 'COI'
        ]:  #test if it is one that is setup, otherwise default to full path
            uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb')
            if not os.path.isfile(uchime_db):
                amptklib.log.error(
                    "Database not properly configured, run `amptk install` to setup DB, skipping chimera filtering"
                )
                uchime_out = fastaout
            #uchime cannot work with a udb database, so extract fasta sequences if not already done
            if not amptklib.checkfile(
                    os.path.join(parentdir, 'DB',
                                 args.uchime_ref + '.extracted.fa')):
                uchime_db = os.path.join(parentdir, 'DB',
                                         args.uchime_ref + '.extracted.fa')
                cmd = [
                    'vsearch', '--udb2fasta',
                    os.path.join(parentdir, 'DB', args.uchime_ref + '.udb'),
                    '--output', uchime_db
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:
                uchime_db = os.path.join(parentdir, 'DB',
                                         args.uchime_ref + '.extracted.fa')
        else:
            if os.path.isfile(args.uchime_ref):
                uchime_db = os.path.abspath(args.uchime_ref)
            else:
                amptklib.log.error(
                    "%s is not a valid file, skipping reference chimera filtering"
                    % args.uchime_ref)
                iSeqs = fastaout
        #now run chimera filtering if all checks out
        if not os.path.isfile(iSeqs):
            amptklib.log.info("Chimera Filtering (VSEARCH) using %s DB" %
                              args.uchime_ref)
            cmd = [
                'vsearch', '--mindiv', '1.0', '--uchime_ref', fastaout, '--db',
                uchime_db, '--nonchimeras', iSeqs, '--threads', CORES
            ]
            amptklib.runSubprocess(cmd, amptklib.log)
            total = amptklib.countfasta(iSeqs)
            uchime_chimeras = validSeqs - total
            amptklib.log.info('{0:,}'.format(total) + ' ASVs passed, ' +
                              '{0:,}'.format(uchime_chimeras) +
                              ' ref chimeras removed')
            if os.path.isfile(fastaout):
                amptklib.removefile(fastaout)

    #setup output files
    dadademux = base + '.dada2.map.uc'
    bioSeqs = base + '.cluster.otus.fa'
    bioTable = base + '.cluster.otu_table.txt'
    uctmp = base + '.map.uc'
    ClusterComp = base + '.ASVs2clusters.txt'

    #Filter out ASVs in wrong orientation
    amptklib.log.info('Validating ASV orientation')
    os.rename(iSeqs, iSeqs + '.bak')
    numKept, numDropped = amptklib.validateorientationDADA2(
        OTUCounts, iSeqs + '.bak', iSeqs)
    amptklib.log.info('{:,} ASVs validated ({:,} dropped)'.format(
        numKept, numDropped))
    amptklib.SafeRemove(iSeqs + '.bak')

    #map reads to DADA2 OTUs
    amptklib.log.info("Mapping reads to DADA2 ASVs")
    cmd = [
        'vsearch', '--usearch_global', demuxtmp, '--db', iSeqs, '--id', '0.97',
        '--uc', dadademux, '--strand', 'plus', '--otutabout', chimeraFreeTable,
        '--threads', CORES
    ]
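    #note: --id 0.97 assigns a read to an ASV at >=97% identity, and
    #--otutabout writes the sample-by-ASV count table from the same search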
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.line_count2(dadademux)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to ASVs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #cluster
    amptklib.log.info("Clustering ASVs at %s%% to generate biological OTUs" %
                      args.pct_otu)
    radius = float(args.pct_otu) / 100.
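    #e.g. --pct_otu 97 gives radius 0.97, which vsearch uses as its --id threshold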
    cmd = [
        'vsearch', '--cluster_smallmem', iSeqs, '--centroids', bioSeqs, '--id',
        str(radius), '--strand', 'plus', '--relabel', 'OTU', '--qmask', 'none',
        '--usersort', '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(bioSeqs)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs generated')

    #determine which cluster each ASV (iSeq) was assigned to
    iSeqmap = base + '.ASV_map.uc'
    cmd = [
        'vsearch', '--usearch_global', iSeqs, '--db', bioSeqs, '--id',
        str(radius), '--uc', iSeqmap, '--strand', 'plus', '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    iSeqMapped = {}
    with open(iSeqmap, 'r') as mapping:
        for line in mapping:
            line = line.rstrip('\n')
            cols = line.split('\t')
            #uc format: column 9 (index 8) is the query label and column 10
            #(index 9) is the target/centroid label
            OTU = cols[9]
            Hit = cols[8]
            if OTU not in iSeqMapped:
                iSeqMapped[OTU] = [Hit]
            else:
                iSeqMapped[OTU].append(Hit)
    with open(ClusterComp, 'w') as clusters:
        clusters.write('OTU\tASVs\n')
        for k, v in natsorted(list(iSeqMapped.items())):
            clusters.write('%s\t%s\n' % (k, ', '.join(v)))
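    #ClusterComp then looks like (hypothetical names):
    #OTU    ASVs
    #OTU1   ASV1, ASV7
    #OTU2   ASV2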
    #create OTU table
    amptklib.log.info("Mapping reads to OTUs")
    cmd = [
        'vsearch', '--usearch_global', demuxtmp, '--db', bioSeqs, '--id',
        '0.97', '--uc', uctmp, '--strand', 'plus', '--otutabout', bioTable,
        '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.line_count2(uctmp)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    if not args.debug:
        amptklib.removefile(no_ns)
        shutil.rmtree(filtfolder)
        amptklib.removefile(dada2out)
        amptklib.removefile(derep)
        amptklib.removefile(demuxtmp)
        amptklib.removefile(uctmp)
        amptklib.removefile(iSeqmap)
        amptklib.removefile(dadademux)

    #Print location of files to STDOUT
    print("-------------------------------------------------------")
    print("DADA2 Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % filtfolder)
    print("Amplicon sequence variants: %s" % iSeqs)
    print("ASV OTU Table: %s" % chimeraFreeTable)
    print("Clustered OTUs: %s" % bioSeqs)
    print("OTU Table: %s" % bioTable)
    print("ASVs 2 OTUs: %s" % ClusterComp)
    print("-------------------------------------------------------")

    otu_print = bioSeqs.split('/')[-1]
    tab_print = bioTable.split('/')[-1]
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n" %
              (tab_print, otu_print))
    else:
        print(
            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
            % (tab_print, otu_print))
Example #5
0
def main(args):
    global FwdPrimer, RevPrimer, Barcodes, tmpdir, usearch
    parser = argparse.ArgumentParser(
        prog='amptk-process_illumina_raw.py',
        usage="%(prog)s [options] -i file.fastq\n%(prog)s -h for help menu",
        description=
        '''Script finds barcodes, strips forward and reverse primers, relabels, and then trim/pads reads to a set length''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-f',
                        '--forward',
                        dest='fastq',
                        required=True,
                        help='Illumina FASTQ R1 reads')
    parser.add_argument('-r',
                        '--reverse',
                        required=True,
                        help='Illumina FASTQ R2 reads')
    parser.add_argument('-i',
                        '--index',
                        nargs='+',
                        required=True,
                        help='Illumina FASTQ index reads')
    parser.add_argument('-m', '--mapping_file', help='QIIME-like mapping file')
    parser.add_argument('--read_length',
                        type=int,
                        help='Read length, e.g. 2 x 300 bp = 300')
    parser.add_argument('-o',
                        '--out',
                        dest="out",
                        default='illumina_out',
                        help='Base name for output')
    parser.add_argument('--fwd_primer',
                        dest="F_primer",
                        default='515FB',
                        help='Forward Primer')
    parser.add_argument('--rev_primer',
                        dest="R_primer",
                        default='806RB',
                        help='Reverse Primer')
    parser.add_argument('--primer_mismatch',
                        default=2,
                        type=int,
                        help='Number of mis-matches in primer')
    parser.add_argument('--barcode_mismatch',
                        default=0,
                        type=int,
                        help='Number of mis-matches in barcode')
    parser.add_argument(
        '--barcode_fasta',
        help='FASTA file containing Barcodes (Names & Sequences)')
    parser.add_argument('--rescue_forward',
                        default='on',
                        choices=['on', 'off'],
                        help='Rescue Not-merged forward reads')
    parser.add_argument('--barcode_rev_comp',
                        action='store_true',
                        help='Reverse complement barcode sequences')
    parser.add_argument('--min_len',
                        default=100,
                        type=int,
                        help='Minimum read length to keep')
    parser.add_argument('-l',
                        '--trim_len',
                        default=300,
                        type=int,
                        help='Trim length for reads')
    parser.add_argument('-p',
                        '--pad',
                        default='off',
                        choices=['on', 'off'],
                        help='Pad with Ns to a set length')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--cleanup',
                        action='store_true',
                        help='remove intermediate files')
    parser.add_argument('--merge_method',
                        default='usearch',
                        choices=['usearch', 'vsearch'],
                        help='Software to use for PE read merging')
    args = parser.parse_args(args)

    args.out = re.sub(r'\W+', '', args.out)
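    #strip any non-alphanumeric/underscore characters, e.g. 'my-run.1' -> 'myrun1'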

    log_name = args.out + '.amptk-demux.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #do a version/dependency check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of CPUs to use
    if not args.cpus:
        cpus = multiprocessing.cpu_count()
    else:
        cpus = args.cpus

    #create tmpdir
    tmpdir = args.out.split('.')[0] + '_' + str(os.getpid())
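    #e.g. args.out 'illumina_out' with pid 12345 gives tmpdir 'illumina_out_12345'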
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)

    #parse a mapping file or a barcode fasta file; primers etc. get set up here
    #dealing with barcodes: parse the barcode_fasta argument
    barcode_file = args.out + ".barcodes_used.fa"
    if os.path.isfile(barcode_file):
        os.remove(barcode_file)

    #check if mapping file passed, use this if present, otherwise use command line arguments
    SampleData = {}
    Barcodes = {}
    RevBarcodes = {}
    FwdPrimer = ''
    RevPrimer = ''
    if args.mapping_file:
        if not os.path.isfile(args.mapping_file):
            amptklib.log.error("Mapping file not found: %s" %
                               args.mapping_file)
            sys.exit(1)
        SampleData, Barcodes, RevBarcodes, FwdPrimer, RevPrimer = amptklib.parseMappingFileNEW(
            args.mapping_file)
    else:  #no mapping file, so create dictionaries from barcode fasta files
        if not args.barcode_fasta:
            amptklib.log.error(
                "You did not specify a --barcode_fasta or --mapping_file, one is required"
            )
            sys.exit(1)
        else:
            shutil.copyfile(args.barcode_fasta, barcode_file)
            Barcodes = amptklib.fasta2barcodes(barcode_file, False)

    if FwdPrimer == '' or RevPrimer == '':
        #parse primers here so they don't conflict with mapping-file primers
        #look up the primer db, otherwise default to the entry as a literal sequence
        if args.F_primer in amptklib.primer_db:
            FwdPrimer = amptklib.primer_db.get(args.F_primer)
            amptklib.log.info(
                "{:} fwd primer found in AMPtk primer db, setting to: {:}".
                format(args.F_primer, FwdPrimer))
        else:
            FwdPrimer = args.F_primer
            amptklib.log.info(
                "{:} fwd primer not found in AMPtk primer db, assuming it is actual primer sequence."
                .format(args.F_primer))
        if args.R_primer in amptklib.primer_db:
            RevPrimer = amptklib.primer_db.get(args.R_primer)
            amptklib.log.info(
                "{:} rev primer found in AMPtk primer db, setting to: {:}".
                format(args.R_primer, RevPrimer))
        else:
            RevPrimer = args.R_primer
            amptklib.log.info(
                "{:} rev primer not found in AMPtk primer db, assuming it is actual primer sequence."
                .format(args.R_primer))

    #if still no primers set, then exit
    if FwdPrimer == '' or RevPrimer == '':
        amptklib.log.error(
            "Please provide primer sequences via --fwd_primer and --rev_primer"
        )
        sys.exit(1)

    #if barcode_rev_comp passed then reverse complement the barcode sequences (the dict values)
    if args.barcode_rev_comp:
        amptklib.log.info("Reverse complementing barcode sequences")
        backupDict = Barcodes
        Barcodes = {}
        for k, v in list(backupDict.items()):
            Barcodes[k] = amptklib.RevComp(v)
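        #e.g. a barcode entry {'Sample1': 'AACG'} becomes {'Sample1': 'CGTT'}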

    amptklib.log.info("Loading %i samples from mapping file" % len(Barcodes))
    amptklib.log.info('FwdPrimer: {:}  RevPrimer: {:}'.format(
        FwdPrimer, RevPrimer))
    amptklib.log.info(
        'Dropping reads less than {:} bp and setting lossless trimming to {:} bp.'
        .format(args.min_len, args.trim_len))

    #rename reads according to indexes
    if not amptklib.PEandIndexCheck(
            args.fastq, args.reverse,
            args.index[0]):  #check they are all same length
        amptklib.log.error("FASTQ input malformed, read numbers do not match")
        sys.exit(1)
    amptklib.log.info("Loading FASTQ Records")
    NumSeqs = amptklib.countfastq(args.fastq)
    if cpus > 1:
        amptklib.log.info("Splitting FASTQ files over {:} cpus".format(cpus))
        amptklib.split_fastqPEandI(args.fastq, args.reverse, args.index[0],
                                   NumSeqs, tmpdir, cpus * 2)
        file_list = []
        for file in os.listdir(tmpdir):
            if file.endswith('.fq'):
                filepart = os.path.join(tmpdir, file.split('_R')[0])
                if filepart not in file_list:
                    file_list.append(filepart)

        amptklib.log.info("Mapping indexes to reads and renaming PE reads")
        amptklib.runMultiProgress(safe_run, file_list, cpus, args=args)
    else:
        amptklib.log.info("Mapping indexes to reads and renaming PE reads")
        shutil.copyfile(args.fastq, os.path.join(tmpdir, 'chunk_R1.fq'))
        shutil.copyfile(args.reverse, os.path.join(tmpdir, 'chunk_R2.fq'))
        shutil.copyfile(args.index[0], os.path.join(tmpdir, 'chunk_R3.fq'))
        processReadsPE(os.path.join(tmpdir, 'chunk'), args=args)

    print("-------------------------------------------------------")
    #Now concatenate all of the demuxed files together
    amptklib.log.info("Concatenating Demuxed Files")

    tmpDemux = os.path.join(tmpdir, args.out + '.demux.fq')
    with open(tmpDemux, 'w') as outfile:
        for filename in glob.glob(os.path.join(tmpdir, '*.demux.fq')):
            if filename == tmpDemux:
                continue
            with open(filename, 'r') as readfile:
                shutil.copyfileobj(readfile, outfile)
    #parse the stats
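    #each chunk's .stats file is assumed to hold one line of six
    #comma-separated integer counters, e.g. '1000,50,900,890,10,850'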
    finalstats = [0, 0, 0, 0, 0, 0]
    for file in os.listdir(tmpdir):
        if file.endswith('.stats'):
            with open(os.path.join(tmpdir, file), 'r') as statsfile:
                line = statsfile.readline()
                line = line.replace('\n', '')
                newstats = line.split(',')
                newstats = [int(i) for i in newstats]
                for x, num in enumerate(newstats):
                    finalstats[x] += num

    #finally reindex output
    #re-number the reads, since chunks from the multiprocessing split can contain duplicate names
    Demux = args.out + '.demux.fq'
    amptklib.fastqreindex(tmpDemux, Demux)
    amptklib.SafeRemove(tmpDemux)

    #output stats of the run
    amptklib.log.info('{0:,}'.format(finalstats[0]) + ' total reads')
    amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1]) +
                      ' discarded no index match')
    amptklib.log.info('{0:,}'.format(finalstats[2]) +
                      ' Fwd Primer found, {0:,}'.format(finalstats[3]) +
                      ' Rev Primer found')
    amptklib.log.info('{0:,}'.format(finalstats[4]) +
                      ' discarded too short (< %i bp)' % args.min_len)
    amptklib.log.info('{0:,}'.format(finalstats[5]) + ' valid output reads')

    #now loop through the data and count reads per barcoded sample
    BarcodeCount = {}
    with open(Demux, 'r') as input:
        header = itertools.islice(input, 0, None, 4)
        for line in header:
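            #assumes AMPtk-style headers like '@read1;barcodelabel=Sample1;'
            #(text after the first '=' up to the next ';' is taken as the sample ID)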
            ID = line.split("=", 1)[-1].split(";")[0]
            if ID not in BarcodeCount:
                BarcodeCount[ID] = 1
            else:
                BarcodeCount[ID] += 1

    #report each sample barcode and the number of reads assigned to it
    barcode_counts = "%30s:  %s" % ('Sample', 'Count')
    for k, v in natsorted(list(BarcodeCount.items()),
                          key=lambda k_v: k_v[1],
                          reverse=True):
        barcode_counts += "\n%30s:  %s" % (k, str(BarcodeCount[k]))
    amptklib.log.info("Found %i barcoded samples\n%s" %
                      (len(BarcodeCount), barcode_counts))

    #create mapping file if one doesn't exist
    genericmapfile = args.out + '.mapping_file.txt'
    amptklib.CreateGenericMappingFile(Barcodes, {}, FwdPrimer, RevPrimer,
                                      genericmapfile, BarcodeCount)

    #compress the output to save space
    FinalDemux = Demux + '.gz'
    amptklib.Fzip(Demux, FinalDemux, cpus)
    amptklib.removefile(Demux)

    if args.cleanup:
        amptklib.SafeRemove(tmpdir)

    #get file size
    filesize = os.path.getsize(FinalDemux)
    readablesize = amptklib.convertSize(filesize)
    amptklib.log.info("Output file:  %s (%s)" % (FinalDemux, readablesize))
    amptklib.log.info("Mapping file: %s" % genericmapfile)
    print("-------------------------------------------------------")
    if 'darwin' in sys.platform:
        print(col.WARN + "\nExample of next cmd: " + col.END +
              "amptk cluster -i %s -o out\n" % (FinalDemux))
    else:
        print("\nExample of next cmd: amptk cluster -i %s -o out\n" %
              (FinalDemux))
Example #6
0
def main(args):
    global FwdPrimer, RevPrimer, Barcodes, tmpdir
    parser = argparse.ArgumentParser(
        prog='amptk-process_ion.py',
        usage="%(prog)s [options] -i file.fastq\n%(prog)s -h for help menu",
        description=
        '''Script finds barcodes, strips forward and reverse primers, relabels, and then trim/pads reads to a set length''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--fastq',
                        '--sff',
                        '--fasta',
                        '--bam',
                        dest='fastq',
                        required=True,
                        help='BAM/FASTQ/SFF/FASTA file')
    parser.add_argument('-q', '--qual', help='QUAL file (if -i is FASTA)')
    parser.add_argument('-o',
                        '--out',
                        dest="out",
                        default='ion',
                        help='Base name for output')
    parser.add_argument('-f',
                        '--fwd_primer',
                        dest="F_primer",
                        default='fITS7-ion',
                        help='Forward Primer')
    parser.add_argument('-r',
                        '--rev_primer',
                        dest="R_primer",
                        default='ITS4',
                        help='Reverse Primer')
    parser.add_argument(
        '-m',
        '--mapping_file',
        help='Mapping file: QIIME format can have extra meta data columns')
    parser.add_argument('-p',
                        '--pad',
                        default='off',
                        choices=['on', 'off'],
                        help='Pad with Ns to a set length')
    parser.add_argument('--primer_mismatch',
                        default=2,
                        type=int,
                        help='Number of mis-matches in primer')
    parser.add_argument('--barcode_mismatch',
                        default=0,
                        type=int,
                        help='Number of mis-matches in barcode')
    parser.add_argument(
        '--barcode_fasta',
        default='ionxpress',
        help='FASTA file containing Barcodes (Names & Sequences)')
    parser.add_argument('--reverse_barcode',
                        help="FASTA file containing 3' barcodes")
    parser.add_argument('-b',
                        '--list_barcodes',
                        dest="barcodes",
                        default='all',
                        help='Enter Barcodes used separated by commas')
    parser.add_argument('--min_len',
                        default=100,
                        type=int,
                        help='Minimum read length to keep')
    parser.add_argument('-l',
                        '--trim_len',
                        default=300,
                        type=int,
                        help='Trim length for reads')
    parser.add_argument(
        '--full_length',
        action='store_true',
        help='Keep only full length reads (no trimming/padding)')
    parser.add_argument('--mult_samples',
                        dest="multi",
                        default='False',
                        help='Combine multiple samples (e.g. FACE1)')
    parser.add_argument('--ion',
                        action='store_true',
                        help='Input data is Ion Torrent')
    parser.add_argument('--454', action='store_true', help='Input data is 454')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH EXE')
    args = parser.parse_args(args)

    args.out = re.sub(r'\W+', '', args.out)

    log_name = args.out + '.amptk-demux.log'
    if os.path.isfile(log_name):
        os.remove(log_name)
    FNULL = open(os.devnull, 'w')
    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of CPUs to use
    if not args.cpus:
        cpus = multiprocessing.cpu_count()
    else:
        cpus = args.cpus

    #parse a mapping file or a barcode fasta file; primers etc. get set up here
    #dealing with barcodes: get ion barcodes or parse the barcode_fasta argument
    barcode_file = args.out + ".barcodes_used.fa"
    rev_barcode_file = args.out + '.revbarcodes_used.fa'
    amptklib.SafeRemove(barcode_file)
    amptklib.SafeRemove(rev_barcode_file)

    #check if mapping file passed, use this if present, otherwise use command line arguments
    SampleData = {}
    Barcodes = {}
    RevBarcodes = {}
    if args.mapping_file:
        if not os.path.isfile(args.mapping_file):
            amptklib.log.error("Mapping file not found: %s" %
                               args.mapping_file)
            sys.exit(1)
        SampleData, Barcodes, RevBarcodes, FwdPrimer, RevPrimer = amptklib.parseMappingFileNEW(
            args.mapping_file)
        genericmapfile = args.mapping_file
    else:  #no mapping file, so create dictionaries from barcode fasta files
        if args.barcode_fasta == 'ionxpress':
            #get script path and barcode file name
            pgm_barcodes = os.path.join(os.path.dirname(amptklib.__file__),
                                        'DB', 'ionxpress_barcodes.fa')
        elif args.barcode_fasta == 'ioncode':
            pgm_barcodes = os.path.join(os.path.dirname(amptklib.__file__),
                                        'DB', 'ioncode_barcodes.fa')
        if args.barcode_fasta in ('ionxpress', 'ioncode'):
            if args.barcodes == "all":
                if args.multi == 'False':
                    shutil.copyfile(pgm_barcodes, barcode_file)
                else:
                    with open(barcode_file, 'w') as barcodeout:
                        with open(pgm_barcodes, 'r') as input:
                            for rec in SeqIO.parse(input, 'fasta'):
                                outname = args.multi + '.' + rec.id
                                barcodeout.write(">%s\n%s\n" %
                                                 (outname, rec.seq))
            else:
                bc_list = args.barcodes.split(",")
                with open(pgm_barcodes, "r") as inputSeqFile:
                    SeqRecords = SeqIO.to_dict(
                        SeqIO.parse(inputSeqFile, "fasta"))
                #open the output file once and write each requested barcode
                with open(barcode_file, "w") as outputSeqFile:
                    for rec in bc_list:
                        name = "BC." + rec
                        seq = SeqRecords[name].seq
                        if args.multi != 'False':
                            outname = args.multi + '.' + name
                        else:
                            outname = name
                        outputSeqFile.write(">%s\n%s\n" % (outname, seq))
        else:
            #check for --mult_samples and prefix barcode names if necessary
            if args.multi == 'False':
                shutil.copyfile(args.barcode_fasta, barcode_file)
                if args.reverse_barcode:
                    shutil.copyfile(args.reverse_barcode, rev_barcode_file)
            else:
                with open(barcode_file, 'w') as barcodeout:
                    with open(args.barcode_fasta, 'r') as input:
                        for rec in SeqIO.parse(input, 'fasta'):
                            outname = args.multi + '.' + rec.id
                            barcodeout.write(">%s\n%s\n" % (outname, rec.seq))
                if args.reverse_barcode:
                    with open(rev_barcode_file, 'w') as barcodeout:
                        with open(args.reverse_barcode, 'r') as input:
                            for rec in SeqIO.parse(input, 'fasta'):
                                outname = args.multi + '.' + rec.id
                                barcodeout.write(">%s\n%s\n" %
                                                 (outname, rec.seq))
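            #e.g. with --mult_samples FACE1, a record with id 'BC1'
            #(hypothetical) is written to the barcode file as 'FACE1.BC1'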

        #parse primers here so they don't conflict with mapping-file primers
        #look up the primer db, otherwise default to the entry as a literal sequence
        if args.F_primer in amptklib.primer_db:
            FwdPrimer = amptklib.primer_db.get(args.F_primer)
            amptklib.log.info(
                "{:} fwd primer found in AMPtk primer db, setting to: {:}".
                format(args.F_primer, FwdPrimer))
        else:
            FwdPrimer = args.F_primer
            amptklib.log.info(
                "{:} fwd primer not found in AMPtk primer db, assuming it is actual primer sequence."
                .format(args.F_primer))
        if args.R_primer in amptklib.primer_db:
            RevPrimer = amptklib.primer_db.get(args.R_primer)
            amptklib.log.info(
                "{:} rev primer found in AMPtk primer db, setting to: {:}".
                format(args.R_primer, RevPrimer))
        else:
            RevPrimer = args.R_primer
            amptklib.log.info(
                "{:} rev primer not found in AMPtk primer db, assuming it is actual primer sequence."
                .format(args.R_primer))

    #check if input is compressed
    gzip_list = []
    if args.fastq.endswith('.gz'):
        gzip_list.append(os.path.abspath(args.fastq))
    if gzip_list:
        amptklib.log.info("Gzipped input files detected, uncompressing")
        for file in gzip_list:
            file_out = file.replace('.gz', '')
            amptklib.Funzip(file, file_out, cpus)
        args.fastq = args.fastq.replace('.gz', '')

    #if SFF file passed, convert to FASTQ with biopython
    if args.fastq.endswith(".sff"):
        if args.barcode_fasta == 'ionxpress':
            if not args.mapping_file:
                amptklib.log.error(
                    "You did not specify a --barcode_fasta or --mapping_file, one is required for 454 data"
                )
                sys.exit(1)
        amptklib.log.info("SFF input detected, converting to FASTQ")
        SeqIn = args.out + '.sff.extract.fastq'
        SeqIO.convert(args.fastq, "sff-trim", SeqIn, "fastq")
    elif args.fastq.endswith(('.fas', '.fasta', '.fa')):
        if not args.qual:
            amptklib.log.error(
                "FASTA input detected, however no QUAL file was given.  You must have FASTA + QUAL files"
            )
            sys.exit(1)
        else:
            if args.barcode_fasta == 'ionxpress':
                if not args.mapping_file:
                    amptklib.log.error(
                        "You did not specify a --barcode_fasta or --mapping_file, one is required for 454 data"
                    )
                    sys.exit(1)
            SeqIn = args.out + '.fastq'
            amptklib.log.info("FASTA + QUAL detected, converting to FASTQ")
            amptklib.faqual2fastq(args.fastq, args.qual, SeqIn)
    elif args.fastq.endswith('.bam'):
        #we could convert natively with pybam, but it is ~10X slower than bedtools/samtools
        #samtools is fastest, so use it if present, then bedtools, else fall back to pybam
        amptklib.log.info("Converting Ion Torrent BAM file to FASTQ")
        SeqIn = args.out + '.fastq'
        if amptklib.which('samtools'):
            cmd = ['samtools', 'fastq', '-@', str(cpus), args.fastq]
            amptklib.runSubprocess2(cmd, amptklib.log, SeqIn)
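            #assumption: runSubprocess2 redirects stdout into SeqIn, since
            #'samtools fastq' writes FASTQ records to stdout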
        else:
            if amptklib.which('bedtools'):
                cmd = [
                    'bedtools', 'bamtofastq', '-i', args.fastq, '-fq', SeqIn
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:  #default to pybam
                amptklib.bam2fastq(args.fastq, SeqIn)
    else:
        SeqIn = args.fastq

    #start processing the reads; first reverse complement the reverse primer
    catDemux = args.out + '.demux.fq'
    origRevPrimer = RevPrimer
    RevPrimer = amptklib.RevComp(RevPrimer)
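    #e.g. assuming the standard ITS4 sequence TCCTCCGCTTATTGATATGC,
    #RevPrimer becomes GCATATCAATAAGCGGAGGA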
    amptklib.log.info("Foward primer: %s,  Rev comp'd rev primer: %s" %
                      (FwdPrimer, RevPrimer))

    #then setup barcode dictionary
    if len(Barcodes) < 1:
        Barcodes = amptklib.fasta2barcodes(barcode_file, False)

    #setup for looking for reverse barcode
    if len(RevBarcodes) < 1 and args.reverse_barcode:
        if not os.path.isfile(args.reverse_barcode):
            amptklib.log.info("Reverse barcode is not a valid file, exiting")
            sys.exit(1)
        shutil.copyfile(args.reverse_barcode, rev_barcode_file)
        RevBarcodes = amptklib.fasta2barcodes(rev_barcode_file, True)

    #Count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    orig_total = amptklib.countfastq(SeqIn)
    size = amptklib.checkfastqsize(SeqIn)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize +
                      ')')

    #create tmpdir and split input into n cpus
    tmpdir = args.out.split('.')[0] + '_' + str(os.getpid())
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)

    amptklib.log.info(
        'Dropping reads less than {:} bp and setting lossless trimming to {:} bp.'
        .format(args.min_len, args.trim_len))

    if cpus > 1:
        #split fastq file
        amptklib.log.info("Splitting FASTQ files over {:} cpus".format(cpus))
        amptklib.split_fastq(SeqIn, orig_total, tmpdir, cpus * 2)
        #now get file list from tmp folder
        file_list = []
        for file in os.listdir(tmpdir):
            if file.endswith(".fq"):
                file = os.path.join(tmpdir, file)
                file_list.append(file)
        #finally process reads over number of cpus
        amptklib.runMultiProgress(processRead, file_list, cpus, args=args)
    else:
        shutil.copyfile(SeqIn, os.path.join(tmpdir, 'chunk.fq'))
        processRead(os.path.join(tmpdir, 'chunk.fq'), args=args)

    print("-------------------------------------------------------")
    #Now concatenate all of the demuxed files together
    amptklib.log.info("Concatenating Demuxed Files")

    tmpDemux = args.out + '.tmp.demux.fq'
    with open(tmpDemux, 'w') as outfile:
        for filename in glob.glob(os.path.join(tmpdir, '*.demux.fq')):
            if filename == tmpDemux:
                continue
            with open(filename, 'r') as readfile:
                shutil.copyfileobj(readfile, outfile)
    #parse the stats
    finalstats = [0, 0, 0, 0, 0, 0, 0]
    for file in os.listdir(tmpdir):
        if file.endswith('.stats'):
            with open(os.path.join(tmpdir, file), 'r') as statsfile:
                line = statsfile.readline()
                line = line.rstrip()
                newstats = line.split(',')
                newstats = [int(i) for i in newstats]
                for x, num in enumerate(newstats):
                    finalstats[x] += num

    #clean up tmp folder
    shutil.rmtree(tmpdir)

    #last thing is to re-number the reads, since chunks from the multiprocessing split can contain duplicate names
    amptklib.fastqreindex(tmpDemux, catDemux)
    os.remove(tmpDemux)

    amptklib.log.info('{0:,}'.format(finalstats[0]) + ' total reads')
    if args.reverse_barcode:
        amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1] -
                                         finalstats[2] - finalstats[4]) +
                          ' valid Fwd and Rev Barcodes')
    else:
        amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1]) +
                          ' valid barcodes')
        amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1] -
                                         finalstats[2]) +
                          ' Fwd Primer found, {0:,}'.format(finalstats[3]) +
                          ' Rev Primer found')
    amptklib.log.info('{0:,}'.format(finalstats[5]) +
                      ' discarded too short (< %i bp)' % args.min_len)
    amptklib.log.info('{0:,}'.format(finalstats[6]) + ' valid output reads')

    #now loop through the data and count reads per barcoded sample
    BarcodeCount = {}
    with open(catDemux, 'r') as input:
        header = itertools.islice(input, 0, None, 4)
        for line in header:
            ID = line.split("=", 1)[-1].split(";")[0]
            if ID not in BarcodeCount:
                BarcodeCount[ID] = 1
            else:
                BarcodeCount[ID] += 1

    #report each sample barcode and the number of reads assigned to it
    barcode_counts = "%22s:  %s" % ('Sample', 'Count')
    for k, v in natsorted(list(BarcodeCount.items()),
                          key=lambda k_v: k_v[1],
                          reverse=True):
        barcode_counts += "\n%22s:  %s" % (k, str(BarcodeCount[k]))
    amptklib.log.info("Found %i barcoded samples\n%s" %
                      (len(BarcodeCount), barcode_counts))

    #create a generic mapping file for downstream processes
    genericmapfile = args.out + '.mapping_file.txt'
    if not args.mapping_file:
        amptklib.CreateGenericMappingFile(Barcodes, RevBarcodes, FwdPrimer,
                                          origRevPrimer, genericmapfile,
                                          BarcodeCount)
    else:
        amptklib.updateMappingFile(args.mapping_file, BarcodeCount,
                                   genericmapfile)

    #compress the output to save space
    FinalDemux = catDemux + '.gz'
    amptklib.Fzip(catDemux, FinalDemux, cpus)
    amptklib.removefile(catDemux)
    if gzip_list:
        for file in gzip_list:
            file = file.replace('.gz', '')
            amptklib.removefile(file)

    #get file size
    filesize = os.path.getsize(FinalDemux)
    readablesize = amptklib.convertSize(filesize)
    amptklib.log.info("Output file:  %s (%s)" % (FinalDemux, readablesize))
    amptklib.log.info("Mapping file: %s" % genericmapfile)

    print("-------------------------------------------------------")
    if 'darwin' in sys.platform:
        print(col.WARN + "\nExample of next cmd: " + col.END +
              "amptk cluster -i %s -o out\n" % (FinalDemux))
    else:
        print("\nExample of next cmd: amptk cluster -i %s -o out\n" %
              (FinalDemux))