Exemplo n.º 1
0
def MergeReads(R1, R2, outname, read_length):
    """Truncate, merge and concatenate one pair of paired-end FASTQ files.

    Both read files are passed through ``vsearch --fastq_filter`` with
    ``--fastq_trunclen read_length``, the truncated files are merged with
    USEARCH ``-fastq_mergepairs``, and the merged reads are written to
    ``os.path.join(args.out, outname + '.fq')``.  When
    ``args.rescue_forward == 'on'`` the forward reads that failed to merge
    are appended to that file as well.  Every intermediate file is removed
    before returning.

    Args:
        R1: Path of the forward-read FASTQ file.
        R2: Path of the reverse-read FASTQ file.
        outname: Base name used for the intermediate and final files.
        read_length: Value handed to vsearch ``--fastq_trunclen``.

    Returns:
        The result of ``ufitslib.log.info`` for the summary message.
    """
    usearch = args.usearch
    pretrim_R1 = outname + '.pretrim_R1.fq'
    pretrim_R2 = outname + '.pretrim_R2.fq'
    ufitslib.log.debug("Removing index 3prime bp 'A' from reads")
    for raw_reads, trimmed_reads in ((R1, pretrim_R1), (R2, pretrim_R2)):
        cmd = [
            'vsearch', '--fastq_filter', raw_reads, '--fastq_trunclen',
            str(read_length), '--fastqout', trimmed_reads
        ]
        ufitslib.runSubprocess(cmd, ufitslib.log)

    #next run USEARCH mergepe
    merge_out = outname + '.merged.fq'
    skip_for = outname + '.notmerged.R1.fq'
    ufitslib.log.debug("Now merging PE reads")
    # Merge the truncated files produced above.  The previous code passed
    # `for_reads`/`rev_reads`, which are not defined in this function, so the
    # truncation step had no effect on the merged output.
    cmd = [
        usearch, '-fastq_mergepairs', pretrim_R1, '-reverse', pretrim_R2,
        '-fastqout', merge_out, '-fastqout_notmerged_fwd', skip_for, '-minhsp',
        '12', '-fastq_maxdiffs', '8'
    ]
    ufitslib.runSubprocess(cmd, ufitslib.log)

    #now concatenate files for downstream pre-process_illumina.py script
    final_out = os.path.join(args.out, outname + '.fq')
    pieces = [merge_out]
    if args.rescue_forward == 'on':
        pieces.append(skip_for)
    with open(final_out, 'w') as cat_file:
        for piece in pieces:
            # Plain 'r' instead of 'rU' (removed in Python 3.11); the context
            # manager closes each input before it is deleted below.
            with open(piece, 'r') as handle:
                shutil.copyfileobj(handle, cat_file)

    #count output
    origcount = ufitslib.countfastq(R1)
    finalcount = ufitslib.countfastq(final_out)
    # An empty input file would otherwise raise ZeroDivisionError.
    pct_out = finalcount / float(origcount) if origcount else 0.0

    #clean and close up intermediate files
    for leftover in (merge_out, pretrim_R1, pretrim_R2, skip_for):
        os.remove(leftover)
    return ufitslib.log.info('{0:,}'.format(finalcount) + ' reads passed (' +
                             '{0:.1%}'.format(pct_out) + ')')
def MergeReads(R1, R2, outname, read_length):
    """Merge a forward/reverse FASTQ pair into ``<args.out>/<outname>.fq``.

    Steps, in order:
      1. Run ``vsearch --fastq_filter ... --fastq_trunclen read_length`` on
         R1 and on R2, writing ``<outname>.pretrim_R1.fq`` / ``_R2.fq``.
      2. Merge the two truncated files with USEARCH ``-fastq_mergepairs``.
      3. Write the merged reads to the final file, followed by the
         un-merged forward reads if ``args.rescue_forward == 'on'``.
      4. Log how many reads ended up in the final file relative to R1 and
         delete the intermediate files.

    Args:
        R1: Path of the forward-read FASTQ file.
        R2: Path of the reverse-read FASTQ file.
        outname: Base name used for the intermediate and final files.
        read_length: Value handed to vsearch ``--fastq_trunclen``.

    Returns:
        The result of ``ufitslib.log.info`` for the summary message.
    """
    usearch = args.usearch
    pretrim_R1 = outname + '.pretrim_R1.fq'
    pretrim_R2 = outname + '.pretrim_R2.fq'
    ufitslib.log.debug("Removing index 3prime bp 'A' from reads")
    cmd = ['vsearch', '--fastq_filter', R1, '--fastq_trunclen', str(read_length), '--fastqout', pretrim_R1]
    ufitslib.runSubprocess(cmd, ufitslib.log)
    cmd = ['vsearch', '--fastq_filter', R2, '--fastq_trunclen', str(read_length), '--fastqout', pretrim_R2]
    ufitslib.runSubprocess(cmd, ufitslib.log)

    #next run USEARCH mergepe
    merge_out = outname + '.merged.fq'
    skip_for = outname + '.notmerged.R1.fq'
    ufitslib.log.debug("Now merging PE reads")
    # Feed the truncated reads to the merger.  `for_reads` and `rev_reads`
    # (used here before) are not defined inside this function, which left the
    # pretrim files unused.
    cmd = [usearch, '-fastq_mergepairs', pretrim_R1, '-reverse', pretrim_R2, '-fastqout', merge_out, '-fastqout_notmerged_fwd', skip_for, '-minhsp', '12', '-fastq_maxdiffs', '8']
    ufitslib.runSubprocess(cmd, ufitslib.log)

    #now concatenate files for downstream pre-process_illumina.py script
    final_out = os.path.join(args.out, outname + '.fq')
    with open(final_out, 'w') as cat_file:
        # Open inputs with context managers so the handles are closed before
        # the files are removed; 'rU' no longer exists in Python 3.11+.
        with open(merge_out, 'r') as merged:
            shutil.copyfileobj(merged, cat_file)
        if args.rescue_forward == 'on':
            with open(skip_for, 'r') as rescued:
                shutil.copyfileobj(rescued, cat_file)

    #count output
    origcount = ufitslib.countfastq(R1)
    finalcount = ufitslib.countfastq(final_out)
    if origcount:
        pct_out = finalcount / float(origcount)
    else:
        # No reads in R1: report 0.0% rather than dividing by zero.
        pct_out = 0.0

    #clean and close up intermediate files
    os.remove(merge_out)
    os.remove(pretrim_R1)
    os.remove(pretrim_R2)
    os.remove(skip_for)
    return ufitslib.log.info('{0:,}'.format(finalcount) + ' reads passed (' + '{0:.1%}'.format(pct_out) + ')')
Exemplo n.º 3
0
# Report the input: read count plus the size returned by checkfastqsize,
# rendered through ufitslib.convertSize.
orig_total = ufitslib.countfastq(args.FASTQ)
size = checkfastqsize(args.FASTQ)
readablesize = ufitslib.convertSize(size)
ufitslib.log.info(
    ''.join(['{0:,}'.format(orig_total), ' reads (', readablesize, ')']))

# Expected-errors filtering and FASTA conversion.  All three outputs are
# placed in `tmp` and named after args.out.
filter_out = os.path.join(
    tmp, ''.join([args.out, '.EE', args.maxee, '.filter.fq']))
filter_fasta = os.path.join(
    tmp, ''.join([args.out, '.EE', args.maxee, '.filter.fa']))
orig_fasta = os.path.join(tmp, ''.join([args.out, '.orig.fa']))
ufitslib.log.info("Quality Filtering, expected errors < %s" % args.maxee)

# First pass: keep reads under the expected-error threshold, as FASTQ and FASTA.
cmd = ['vsearch', '--fastq_filter', args.FASTQ]
cmd += ['--fastq_maxee', str(args.maxee)]
cmd += ['--fastqout', filter_out, '--fastaout', filter_fasta]
cmd += ['--fastq_qmax', '55']
ufitslib.runSubprocess(cmd, ufitslib.log)

# Second pass: no --fastq_maxee, so this only converts the input to FASTA.
cmd = ['vsearch', '--fastq_filter', args.FASTQ]
cmd += ['--fastaout', orig_fasta]
cmd += ['--fastq_qmax', '55']
ufitslib.runSubprocess(cmd, ufitslib.log)

total = ufitslib.countfastq(filter_out)
ufitslib.log.info(''.join(['{0:,}'.format(total), ' reads passed']))

# Full-length dereplication command; it is only assembled here, not run.
derep_out = os.path.join(
    tmp, ''.join([args.out, '.EE', args.maxee, '.derep.fa']))
ufitslib.log.info("De-replication (remove duplicate reads)")
cmd = ['vsearch', '--derep_fulllength', filter_out]
cmd += ['--relabel', 'Read_']
cmd += ['--sizeout', '--output', derep_out]
Exemplo n.º 4
0
# Runs only when args.taxonomy is falsy.  For method 'blast' the query
# sequences in args.fasta are searched with BLASTN and each hit is reformatted
# into a [query, taxonomy-string] pair.  `blast_out` (the BLAST result path)
# is defined outside this section.
if not args.taxonomy:
    #start with less common uses, i.e. Blast, rdp
    if args.method == 'blast':
        #check if command line blast is installed; abort with exit status 1 if not
        if not ufitslib.which('blastn'):
            ufitslib.log.error("BLASTN not found in your PATH, exiting.")
            sys.exit(1)
    
        #run BLASTN against a local database when args.local_blast is set,
        #otherwise remotely against the NCBI nt database
        # Output format string handed to -outfmt; the four named fields are
        # read back below as tab-separated columns 0..3.
        outformat = "6 qseqid sseqid pident stitle"
        if args.local_blast:
            #use all but two of the reported cpus
            # NOTE(review): this is 0 or negative on machines with <= 2 cores;
            # confirm blastn accepts such a -num_threads value.
            cpus = multiprocessing.cpu_count() - 2
            ufitslib.log.info("Running local BLAST using db: %s" % args.local_blast)
            # -max_target_seqs 1 limits the output to one target per query.
            cmd = ['blastn', '-num_threads', str(cpus), '-query', args.fasta, '-db', os.path.abspath(args.local_blast), '-max_target_seqs', '1', '-outfmt', outformat, '-out', blast_out]
            ufitslib.runSubprocess(cmd, ufitslib.log)
        else:
            ufitslib.log.info("Running BLASTN using NCBI remote nt database, this may take awhile")
            cmd = ['blastn', '-query', args.fasta, '-db', 'nt', '-remote', '-max_target_seqs', '1', '-outfmt', outformat, '-out', blast_out]
            ufitslib.runSubprocess(cmd, ufitslib.log)
    
        #load results and reformat
        # NOTE(review): `new` presumably collects the reformatted rows; the
        # code that fills it is not visible in this section.
        new = []
        # NOTE(review): the file object opened here is never explicitly closed.
        f = csv.reader(open(blast_out), delimiter='\t')
        for col in f:
            # `col` is one result row: qseqid, sseqid, pident, stitle.
            query = col[0]
            # Fourth '|'-separated field of the subject id -- presumably the
            # accession of a "gi|N|gb|ACC|"-style id (TODO confirm).  Raises
            # IndexError if the id has fewer than four fields.
            gbID = col[1].split("|")[3]
            pident = col[2]
            name = col[3]
            # Taxonomy string has the form "<gbID>;<title> (<percent identity>)".
            tax = gbID + ";" + name + " (" + pident + ")"
            line = [query,tax]