Example #1
File: shorah.py Project: JudoWill/shorah
        # 4. run mm.py to find minimal matching
        sholog.debug('running mm.py')
        mm.main('%s_cor.rest' % in_stem, maxhaplo=200)

        # 5. run EM freqEst output: sample.$nr.popl
        sholog.debug('running freqEst')
        my_prog = os.path.join(dn, 'freqEst')
        my_arg = " -f %s_cor" % in_stem
        assert os.path.isfile('%s_cor.rest' % in_stem), \
            'File %s_cor.rest not found' % in_stem
        retcode_em = run_child(my_prog, my_arg)
        if retcode_em:
            sholog.error('freqEst did not return 0')
            sys.exit('Something went wrong in the EM step')
        else:
            sholog.debug('freqEst exited successfully')

    # 6. run snv.py to parse single nucleotide variants
    sholog.debug('running snv.py')
    snv.main(reference=options.f, bam_file=options.b, sigma=options.i,
             increment=options.w / options.s)

    # tidy snvs
    try:
        os.mkdir('snv')
    except OSError:
        pass
    for snv_file in glob.glob('./SNV*'):
        shutil.move(snv_file, 'snv/')
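
A note on run_child, which this snippet calls but does not define: in shorah it lives in a shared module. A minimal sketch of such a helper, assuming it only shells out and reports the exit code (the project's real implementation may log and handle errors differently):

import subprocess

def run_child(exe_name, arg_string):
    # concatenate the program and its argument string into one shell
    # command and return the child's exit code (0 on success)
    return subprocess.call(exe_name + arg_string, shell=True)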
Example #2
File: dec.py Project: JhuangLab/qap
def main(in_bam='',
         in_fasta='',
         win_length=201,
         win_shifts=3,
         region='',
         max_coverage=10000,
         alpha=0.1,
         keep_files=True):
    '''
    Performs the error correction analysis, running diri_sampler
    and analyzing the result
    '''
    from multiprocessing import Pool, cpu_count
    import shutil
    import glob
    import time
    import snv

    # set logging level
    declog.setLevel(logging.DEBUG)
    # This handler writes everything to a file.
    LOG_FILENAME = './dec.log'
    hl = logging.handlers.RotatingFileHandler(LOG_FILENAME,
                                              'w',
                                              maxBytes=100000,
                                              backupCount=5)
    f = logging.Formatter("%(levelname)s %(asctime)s "
                          "%(funcName)s %(lineno)d %(message)s")
    hl.setFormatter(f)
    declog.addHandler(hl)
    declog.info(' '.join(sys.argv))

    # check options
    if win_length % win_shifts != 0:
        sys.exit('Window size must be divisible by win_shifts')
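    # NOTE: win_min_ext here (and min_quality, min_x_thresh, fasta_length
    # further down) are module-level settings in dec.py, not parameters of
    # this function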
    if win_min_ext < 1 / float(win_shifts):
        declog.warning('Some bases might not be covered by any window')
    if max_coverage / win_length < 1:
        sys.exit('Please increase max_coverage')
    if not os.path.isfile(in_bam):
        sys.exit("File '%s' not found" % in_bam)
    if not os.path.isfile(in_fasta):
        sys.exit("File '%s' not found" % in_fasta)

    incr = win_length / win_shifts
    max_c = max_coverage / win_length
    keep_all_files = keep_files

    # run b2w
    retcode = windows((in_bam, in_fasta, win_length, incr,
                       win_min_ext * win_length, max_c, region))
    if retcode != 0:
        sys.exit('b2w run not successful')

    aligned_reads = parse_aligned_reads('reads.fas')
    r = aligned_reads.keys()[0]
    gen_length = aligned_reads[r][1] - aligned_reads[r][0]

    if win_length > gen_length:
        sys.exit('The window size must be smaller than the genome region')

    declog.info('%s reads are being considered' % len(aligned_reads))

    # to_correct is module-level in the original dec.py; initialised here
    # so the snippet stands alone
    to_correct = {}
    for k in aligned_reads:
        # slots 0-3 copy the region/read coordinates from aligned_reads;
        # slot 4 collects the corrected bases
        to_correct[k] = aligned_reads[k][:4] + [[]]

    ############################################
    # Now the windows and the error correction #
    ############################################

    runlist = win_to_run(alpha)
    declog.info('will run on %d windows' % len(runlist))
    # run diri_sampler on all available processors but one
    max_proc = max(cpu_count() - 1, 1)
    pool = Pool(processes=max_proc)
    pool.map(run_dpm, runlist)

    # prepare directories
    if keep_all_files:
        for sd_name in [
                'debug', 'sampling', 'freq', 'support', 'corrected',
                'raw_reads'
        ]:
            try:
                os.mkdir(sd_name)
            except OSError:
                pass

    # parse corrected reads
    proposed = {}
    for i in runlist:
        winFile, j, a = i
        del a  # in future, alpha might differ per window
        parts = winFile.split('.')[0].split('-')
        chrom = '-'.join(parts[1:-2])
        beg = parts[-2]
        end = parts[-1]
        declog.info('reading windows for start position %s' % beg)
        # correct reads populates correction and quality, globally defined
        correct_reads(chrom, beg, end)
        stem = 'w-%s-%s-%s' % (chrom, beg, end)
        declog.info('this is window %s' % stem)
        dbg_file = stem + '.dbg'
        # if os.path.exists(dbg_file):
        proposed[beg] = (get_prop(dbg_file), j)
        declog.info('there were %s proposed' % str(proposed[beg][0]))

    # (re)move intermediate files
    if not keep_all_files:
        declog.info('removing intermediate files')
        tr_files = glob.glob('./w*reads.fas')
        tr_files.extend(glob.glob('./*.smp'))
        tr_files.extend(glob.glob('./w*.dbg'))
        for trf in tr_files:
            os.remove(trf)

        tr_files = glob.glob('./w*reads-cor.fas')
        tr_files.extend(glob.glob('./w*reads-freq.csv'))
        tr_files.extend(glob.glob('./w*reads-support.fas'))
        for trf in tr_files:
            if os.stat(trf).st_size == 0:
                os.remove(trf)
    else:
        # gzip each non-empty intermediate file and archive it in its
        # subdirectory; delete the empty ones
        for pattern, sub_dir in [('./w*dbg', 'debug'),
                                 ('./w*smp', 'sampling'),
                                 ('./w*reads-cor.fas', 'corrected'),
                                 ('./w*reads-support.fas', 'support'),
                                 ('./w*reads-freq.csv', 'freq'),
                                 ('./w*reads.fas', 'raw_reads')]:
            for int_file in glob.glob(pattern):
                if os.stat(int_file).st_size > 0:
                    gzf = gzip_file(int_file)
                    try:
                        os.remove('%s/%s' % (sub_dir, gzf))
                    except OSError:
                        pass
                    shutil.move(gzf, '%s/' % sub_dir)
                else:
                    os.remove(int_file)

    ############################################
    ##      Print the corrected reads         ##
    ##                                        ##
    ## correction[read_id][wstart] = sequence ##
    ## quality[read_id][wstart] = posterior   ##
    ############################################
    reason = [0, 0, 0]  # counts: before window, past window, corrected
    declog.info('now correcting, %d reads will be analysed' % len(to_correct))
    creads = 0
    for r in to_correct:
        if r not in correction.keys():
            continue
        creads += 1
        if creads % 500 == 0:
            declog.info('considered %d corrected reads' % creads)
            print aligned_reads[r][4]
        rlen = len(aligned_reads[r][4])  # length of original read
        rst = aligned_reads[r][2]  # read start in the reference

        corrstore = []
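        # walk the read base by base: rpos + rst is the position on the
        # reference, and tp is that position in the local coordinates of
        # the corrected window starting at cst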
        for rpos in range(rlen):
            this = []
            for cst in correction[r]:
                tp = rpos + rst - int(cst)
                if tp < 0:
                    reason[0] += 1
                if tp >= len(correction[r][cst]):
                    reason[1] += 1
                if (tp >= 0 and tp < len(correction[r][cst])
                        and quality[r][cst] > min_quality):
                    reason[2] += 1
                    tc = correction[r][cst][tp]
                    this.append(tc)
                    corrstore.append(rpos)

            if len(this) > 0:
                cb = base_break(this)
            else:
                cb = 'X'
            to_correct[r][4].append(cb)
            del this
    declog.info('considered all corrected reads')

    ccx = {}
    cin_stem = '.'.join(os.path.split(in_bam)[1].split('.')[:-1])
    fch = open('%s.cor.fas' % cin_stem, 'w')
    declog.debug('writing to file %s.cor.fas' % cin_stem)
    for r in to_correct:
        cor_read = ''.join(to_correct[r][4])
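        # 'X' marks positions no window corrected; count only internal Xs,
        # discounting the X (and gap) runs at the two ends of the read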
        init_x = len(cor_read.lstrip('-')) - len(cor_read.lstrip('-X'))
        fin_x = len(cor_read.rstrip('-')) - len(cor_read.rstrip('-X'))
        cx = to_correct[r][4].count('X') - init_x - fin_x
        ccx[cx] = ccx.get(cx, 0) + 1
        if cx <= min_x_thresh and cor_read.lstrip('-X') != '':
            fch.write('>%s %d\n' %
                      (r, to_correct[r][2] + init_x - to_correct[r][0]))
            cc = 0
            for c in cor_read.lstrip('-X'):
                if c != 'X':
                    fch.write(str(c))
                    fch.flush()
                    cc = cc + 1
                    if cc % fasta_length == 0:
                        fch.write('\n')

            if cc % fasta_length != 0:
                fch.write('\n')
    print ccx
    fch.close()

    # write proposed_per_step to file
    ph = open('proposed.dat', 'w')
    ph.write('#base\tproposed_per_step\n')
    for kp in sorted(proposed.iterkeys()):
        # proposed[kp] is (proposals, steps); get_prop presumably returns
        # 'not found' when the dbg file was missing
        if proposed[kp][0] != 'not found':
            ph.write('%s\t%f\n' %
                     (kp, float(proposed[kp][0]) / proposed[kp][1]))
    ph.close()

    declog.info('running snv.py')
    snv.main(reference=in_fasta,
             bam_file=in_bam,
             increment=win_length / win_shifts)

    # tidy snvs
    try:
        os.mkdir('snv')
    except OSError:
        os.rename('snv', 'snv_before_%d' % int(time.time()))
        os.mkdir('snv')
    for snv_file in glob.glob('./SNV*'):
        shutil.move(snv_file, 'snv/')

    declog.info('dec.py ends')
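
For reference, dec.main is normally driven by shorah's command-line wrapper. A hypothetical direct invocation, assuming dec.py is importable and the helper executables (b2w, diri_sampler) are in place; the file names are placeholders:

import dec

dec.main(in_bam='sample.bam',        # placeholder BAM of aligned reads
         in_fasta='reference.fas',   # placeholder reference FASTA
         win_length=201, win_shifts=3,
         max_coverage=10000, alpha=0.1,
         keep_files=True)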
Example #3
File: amplian.py Project: georgek/shorah
def main(in_bam='', in_fasta='', min_overlap=0.95, max_coverage=50000,
         alpha=0.5, s=0.01, region='', diversity=False):
    '''
    Performs the amplicon analysis, running diri_sampler
    and analyzing the result
    '''

    import snv

    # set logging level
    amplog.setLevel(logging.DEBUG)
    # This handler writes everything to a file.
    LOG_FILENAME = './amplian.log'
    hl = logging.handlers.RotatingFileHandler(LOG_FILENAME, 'w',
                                              maxBytes=100000, backupCount=5)
    f = logging.Formatter("%(levelname)s %(asctime)s %(funcName)s "
                          "%(lineno)d %(message)s")
    hl.setFormatter(f)
    amplog.addHandler(hl)
    amplog.info(' '.join(sys.argv))
    # info on reference and region if given, or discover high entropy one
    ref_seq = list(SeqIO.parse(in_fasta, 'fasta'))[0]
    ref_name = ref_seq.id
    if region:
        reg_bound = region.split(':')[1].split('-')
        reg_start, reg_stop = int(reg_bound[0]), int(reg_bound[1])
        ref_length = reg_stop - reg_start + 1
    elif region == '' and diversity:
        reg_start, reg_stop = highest_entropy(in_bam, in_fasta)
        ref_length = reg_stop - reg_start + 1
        region = '%s:%d-%d' % (ref_seq.id, reg_start, reg_stop)
    elif region == '' and not diversity:
        reg_start = 1
        ref_length = len(ref_seq)
        reg_stop = ref_length

    amplog.info('analysing region from %d to %d' % (reg_start, reg_stop))

    # output the reads, aligned to the amplicon
    b2w_exe = os.path.join(dn, 'b2w')
    b2w_args = ' -i 0 -w %d -m %d -x %d %s %s %s' % \
        (ref_length, int(min_overlap * ref_length),
         max_coverage, in_bam, in_fasta, region)
    ret_b2w = run_child(b2w_exe, b2w_args)
    amplog.debug('b2w returned %d' % ret_b2w)

    # run diri_sampler on the aligned reads
    win_file = 'w-%s-%d-%d.reads.fas' % (ref_name, reg_start, reg_stop)
    h = list(open('coverage.txt'))[0]
    n_reads = int(h.split()[-1])
    assert os.path.exists(win_file), 'window file %s not found' % win_file
    diri_exe = os.path.join(dn, 'diri_sampler')
    iterations = min(200000, n_reads * 20)
    diri_args = '-i %s -j %d -a %f -t 2000' % (win_file, iterations, alpha)
    ret_diri = run_child(diri_exe, diri_args)
    amplog.debug('diri_sampler returned %d' % ret_diri)

    # diagnostics on the convergence
    run_diagnostics(win_file, n_reads)

    # run snv.py to parse single nucleotide variants
    snv.main(reference=in_fasta, bam_file=in_bam,
             sigma=s, increment=1)
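
When region is passed, main expects the samtools-style 'name:start-stop' format that the code above splits on ':' and '-'. A hypothetical call, with placeholder file names and illustrative coordinates:

import amplian

amplian.main(in_bam='amplicon.bam',        # placeholder input files
             in_fasta='reference.fas',
             region='ref_name:2253-2914',  # illustrative region only
             alpha=0.5)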
Example #4
def runPipeline(args, sampleName, sampleDir):
    """
		Runs the main ProDuSe analysis stages on the provided sample

		Args:
			args: A namespace object listing command line parameters to be passed to subscripts
			sampleName: Name of the sample currently being processed
			sampleDir: Output directory
	"""

    printPrefix = "PRODUSE-MAIN\t"

    # Run Trim
    args.config = getConfig(sampleDir, "trim")
    trim.main(args)
    sys.stderr.write("\t".join([
        printPrefix,
        time.strftime('%X'), sampleName + ": Trimming Complete\n"
    ]))

    # Run bwa on the trimmed fastqs
    args.config = getConfig(sampleDir, "trim_bwa")
    bwa.main(args)
    sys.stderr.write("\t".join([
        printPrefix,
        time.strftime('%X'), sampleName + ": Alignment Complete\n"
    ]))

    # Run collapse on the trimmed BAM file
    args.config = getConfig(sampleDir, "collapse")
    collapse.main(args)
    sys.stderr.write("\t".join([
        printPrefix,
        time.strftime('%X'), sampleName + ": Collapse Complete\n"
    ]))

    # Run bwa on the collapsed BAM file
    args.config = getConfig(sampleDir, "collapse_bwa")
    bwa.main(args)
    sys.stderr.write("\t".join([
        printPrefix,
        time.strftime('%X'), sampleName + ": Alignment Complete\n"
    ]))

    # Run stitcher
    collapsedBamFile = os.path.abspath(
        os.path.join(sampleDir, "tmp", sampleName + ".collapse.bam"))
    stitchedBam = runStitcher(collapsedBamFile, args.stitcherpath)
    sys.stderr.write("\t".join([
        printPrefix,
        time.strftime('%X'), sampleName + ": Stitching Complete\n"
    ]))

    # Sort files prior to splitmerge
    runSort(stitchedBam, byName=True)
    runSort(collapsedBamFile, byName=True)

    args.config = getConfig(sampleDir, "splitmerge")
    splitMergeBam = os.path.join(sampleDir, "results",
                                 sampleName + ".SplitMerge.bam")
    SplitMerge.main(args)
    runSort(splitMergeBam)
    sys.stderr.write("\t".join([
        printPrefix,
        time.strftime('%X'), sampleName + ": SplitMerge Complete\n"
    ]))

    # Time for SNV calling, what everyone has been waiting for
    args.config = getConfig(sampleDir, "snv")
    snv.main(args)
    sys.stderr.write("\t".join([
        printPrefix,
        time.strftime('%X'), sampleName + ": SNV Calling Complete\n"
    ]))

    # Filter variants
    args.config = getConfig(sampleDir, "filter")
    filter_produse.main(args)
    # runFilter(args.vaf, vcfFile, scriptDir + os.sep + "filter_produse.pl")
    sys.stderr.write("\t".join([
        printPrefix,
        time.strftime('%X'), sampleName + ": Variant Filtering Complete\n"
    ]))

    sys.stderr.write("\t".join([
        printPrefix,
        time.strftime('%X'), sampleName + ": ProDuSe analysis Complete\n"
    ]))
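
A hypothetical driver for runPipeline, assuming args has been parsed upstream and samples maps each sample name to its output directory (both names are placeholders):

for sampleName, sampleDir in samples.items():
    # each sample runs the full trim -> align -> collapse -> stitch ->
    # splitmerge -> SNV -> filter chain defined above
    runPipeline(args, sampleName, sampleDir)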