Example #1
def main():
    opts          = get_options()

    inbam          = opts.inbam
    resolution     = opts.reso
    filter_exclude = opts.filter
    min_count      = opts.min_count
    ncpus          = opts.cpus
    factor         = 1
    outdir         = opts.outdir
    sigma          = 2

    mkdir(outdir)

    sys.stdout.write('\nNormalization of full genome\n')

    biases, decay, badcol = read_bam(inbam, filter_exclude, resolution,
                                     min_count=min_count, ncpus=ncpus, sigma=sigma,
                                     factor=factor, outdir=outdir, check_sum=opts.check_sum)

    printime('  - Saving biases and badcol columns')
    # biases
    out = open(os.path.join(outdir, 'biases_%s.pickle' % (
        nicer(resolution).replace(' ', ''))), 'wb')  # binary mode for pickle

    dump({'biases'    : biases,
          'decay'     : decay,
          'badcol'    : badcol,
          'resolution': resolution}, out)
    out.close()

    # hic_data.write_matrix('chr_names%s_%d-%d.mat' % (region, start, end), focus=())
    printime('\nDone.')
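
A quick round-trip sketch of the pickle written above, with toy values; the file name and contents are illustrative, but the keys mirror the dictionary dumped above:

from pickle import dump, load

out = open('biases_100kb.pickle', 'wb')
dump({'biases': {0: 1.02}, 'decay': {}, 'badcol': {},
      'resolution': 100000}, out)  # toy values, same keys as above
out.close()
data = load(open('biases_100kb.pickle', 'rb'))
assert data['resolution'] == 100000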
Example #2
def main():
    opts = get_options()
    inbam = opts.inbam
    resolution = opts.resolution
    outfile = opts.outfile
    biases_file = opts.biases_file
    window = opts.window

    if window not in ['inter', 'intra', 'all']:
        window = [int(x) // resolution for x in window.split('-')]
        if window[0] >= window[1]:
            raise Exception('ERROR: the beginning of the window should be '
                            'smaller than its end')

    nheader = write_matrix(inbam,
                           resolution,
                           biases_file,
                           outfile,
                           nchunks=opts.nchunks,
                           ncpus=opts.ncpus,
                           clean=opts.clean,
                           window=window)

    rand_hash = "%016x" % getrandbits(64)
    tmpdir = os.path.join('.', '_tmp_%s' % (rand_hash))
    mkdir(tmpdir)

    # sort the file so that each pair of peaks is read only once at extraction
    sort_BAMtsv(nheader, outfile, tmpdir)

    os.system('rm -rf {}'.format(tmpdir))

    printime('Done.')
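
A minimal sketch of the window parsing above, assuming a 10 kb resolution: a genomic window such as '100000-500000' is converted into a window of bin distances.

resolution = 10000
window = '100000-500000'
if window not in ['inter', 'intra', 'all']:
    # genomic coordinates become bin distances at this resolution
    window = [int(x) // resolution for x in window.split('-')]
assert window == [10, 50]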
Example #3
def write_matrix(inbam, resolution, biases, outdir,
                 filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                 region1=None, start1=None, end1=None, clean=True,
                 region2=None, start2=None, end2=None,
                 tmpdir='.', ncpus=8, verbose=True):

    if not isinstance(filter_exclude, int):
        filter_exclude = filters_to_bin(filter_exclude)

    regions, rand_hash, bin_coords, chunks = read_bam(
        inbam, filter_exclude, resolution, ncpus=ncpus,
        region1=region1, start1=start1, end1=end1,
        region2=region2, start2=start2, end2=end2,
        tmpdir=tmpdir, verbose=verbose)

    bamfile = pysam.AlignmentFile(inbam, 'rb')
    sections = OrderedDict(zip(bamfile.references,
                               [x / resolution + 1 for x in bamfile.lengths]))

    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]

    if biases:
        bias1, bias2, decay, bads1, bads2 = get_biases_region(biases, bin_coords)

    else:  # without biases the normalized value falls back to the raw count
        bias1 = bias2 = decay = None
        bads1 = bads2 = {}

    start_bin1, start_bin2 = bin_coords[::2]
    if verbose:
        printime('  - Writing matrices')

    fnam = '{}_mat_{}kb.tsv'.format(region1, resolution / 1000)
    mkdir(outdir)
    out = open(os.path.join(outdir, fnam), 'w')

    # pull all sub-matrices and write full matrix
    for c, j, k, v in _iter_matrix_frags(chunks, tmpdir, rand_hash,
                                         verbose=verbose, clean=clean):
        if k < j:  # we are only going to keep half of the matrix
            continue
        if j in bads1 or k in bads2:  # skip masked columns
            continue
        if decay is None:
            n = v
        elif abs(j - k) in decay[c]:
            n = v / bias1[j] / bias2[k] / decay[c][abs(j - k)]
        else:
            continue
        pos1 = j + section_pos[region1][0]
        pos2 = k + section_pos[region1][0]
        out.write('{}\t{}\t{}\t{}\n'.format(pos1, pos2, v, n))

    out.close()

    # this is the last thing we do in case something goes wrong
    os.system('rm -rf %s' % (os.path.join(tmpdir, '_tmp_%s' % (rand_hash))))

    if verbose:
        printime('\nDone.')
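
The section_pos bookkeeping above converts per-chromosome bin indices into genome-wide positions; a small illustration with made-up chromosome sizes:

from collections import OrderedDict

sections = OrderedDict([('chr1', 2493), ('chr2', 2422)])  # bins per chromosome
total = 0
section_pos = dict()
for crm in sections:
    # each chromosome spans [offset, offset + its size) in genome-wide bins
    section_pos[crm] = (total, total + sections[crm])
    total += sections[crm]
assert section_pos == {'chr1': (0, 2493), 'chr2': (2493, 4915)}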
Example #4
def sort_BAMtsv(nheader, outfile, tmp):
    tsv = outfile
    printime('Sorting BAM matrix: {}'.format(tsv))
    # keep the header, sort the body numerically on the first two columns,
    # and write to a temporary file before replacing the original
    cmd = ("(head -n {0} {2} && tail -n +{1} {2} | "
           "sort -k1n -k2n -S 10% -T {3}) > {2}_").format(
               nheader, nheader + 1, tsv, tmp)
    print(cmd)
    _ = Popen(cmd, shell=True).communicate()
    os.system("mv {0}_ {0}".format(tsv))
Example #5
def main():
    opts         = get_options()
    inbam        = opts.inbam
    resolution   = opts.resolution
    outdir       = opts.outdir
    tmppath      = opts.tmppath
    biases_file  = opts.biases_file
    dry_run      = opts.dry_run
    
    # a bit of hard-coding never hurts
    metric = 'loop'

    printime('Generating huge matrix')
    nheader, outfile = write_big_matrix(inbam, resolution, biases_file, outdir,
                                        nchunks=opts.nchunks, wanted_chrom=opts.chrom,
                                        wanted_pos1=opts.pos1, wanted_pos2=opts.pos2,
                                        dry_run=dry_run, ncpus=opts.ncpus,
                                        tmpdir=tmppath,
                                        clean=not opts.dirty, verbose=opts.verbose,
                                        waffle_radii=opts.waffle_radii,
                                        metric=metric)
Example #6
def run(opts):
    check_options(opts)
    samtools = which(opts.samtools)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bam1:
        mreads1 = path.realpath(opts.bam1)
        biases1 = opts.biases1
    else:
        biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)
        try:
            biases1 = path.join(opts.workdir1, biases1)
        except AttributeError:
            biases1 = None
        except TypeError:  # Py3
            biases1 = None

    if opts.bam2:
        mreads2 = path.realpath(opts.bam2)
        biases2 = opts.biases2
    else:
        biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)
        try:
            biases2 = path.join(opts.workdir2, biases2)
        except AttributeError:
            biases2 = None
        except TypeError:  # Py3
            biases2 = None

    filter_exclude = opts.filter

    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        printime('  - loading first sample %s' % (mreads1))
        hic_data1 = load_hic_data_from_bam(mreads1, opts.reso, biases=biases1,
                                           tmpdir=path.join(opts.workdir, '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        printime('  - loading second sample %s' % (mreads2))
        hic_data2 = load_hic_data_from_bam(mreads2, opts.reso, biases=biases2,
                                           tmpdir=path.join(opts.workdir, '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        if opts.workdir1 and opts.workdir2:
            masked1 = {'valid-pairs': {'count': 0}}
            masked2 = {'valid-pairs': {'count': 0}}
        else:
            masked1 = {'valid-pairs': {'count': sum(hic_data1.values())}}
            masked2 = {'valid-pairs': {'count': sum(hic_data2.values())}}

        decay_corr_dat = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        decay_corr_fig = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        eigen_corr_dat = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        eigen_corr_fig = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash))

        printime('  - comparing experiments')
        printime('    => correlation between equidistant loci')
        corr, _, scc, std, bads = correlate_matrices(
            hic_data1, hic_data2, normalized=opts.norm,
            remove_bad_columns=True, savefig=decay_corr_fig,
            savedata=decay_corr_dat, get_bads=True)
        print('         - correlation score (SCC): %.4f (+- %.7f)' % (scc, std))
        printime('    => correlation between eigenvectors')
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2, normalized=opts.norm,
                                          remove_bad_columns=True, nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)

        printime('    => reproducibility score')
        reprod = get_reproducibility(hic_data1, hic_data2, num_evec=20, normalized=opts.norm,
                                     verbose=False, remove_bad_columns=True)
        print('         - reproducibility score: %.4f' % (reprod))
        ncols = len(hic_data1)
    else:
        ncols = 0
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'
        masked1 = {}
        masked2 = {}

        corr = eig_corr = scc = std = reprod = 0
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s.bam' % (param_hash))

    if not opts.skip_merge:
        printime('  - Merging experiments')
        system(samtools + ' merge -@ %d %s %s %s' % (opts.cpus, outbam, mreads1, mreads2))
        printime('  - Indexing new BAM file')
        # check samtools version number and modify command line
        version = LooseVersion([l.split()[1]
                                for l in Popen(samtools, stderr=PIPE,
                                               universal_newlines=True).communicate()[1].split('\n')
                                if 'Version' in l][0])
        if version >= LooseVersion('1.3.1'):
            system(samtools + ' index -@ %d %s' % (opts.cpus, outbam))
        else:
            system(samtools + ' index %s' % (outbam))
    else:
        outbam = ''

    finish_time = time.localtime()
    save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
                len(list(bads.keys())), ncols, scc, std, reprod,
                eigen_corr_dat, eigen_corr_fig, outbam, corr, eig_corr,
                biases1, biases2, masked1, masked2, launch_time, finish_time)
    printime('\nDone.')
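
A sketch of the version detection above, run on an illustrative samtools banner rather than a live process:

from distutils.version import LooseVersion

stderr_text = 'Program: samtools\nVersion: 1.9 (using htslib 1.9)\n'  # illustrative banner
version = LooseVersion([l.split()[1] for l in stderr_text.split('\n')
                        if 'Version' in l][0])
assert version >= LooseVersion('1.3.1')  # multi-threaded indexing available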
Example #7
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    if opts.zrange:
        vmin = float(opts.zrange.split(',')[0])
        vmax = float(opts.zrange.split(',')[1])
    else:
        vmin = vmax = None

    if opts.figsize:
        opts.figsize = map(float, opts.figsize.split(','))
    else:
        opts.figsize = None

    clean = True  # change for debug

    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v != 'raw' for v in opts.normalizations):
            raise Exception('ERROR: with an external BAM as input, a path to '
                            'the biases file must be provided.')
        biases = opts.biases
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases

    coord1         = opts.coord1
    coord2         = opts.coord2

    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = None
        start1  = None
        end1    = None
        region2 = None
        start2  = None
        end2    = None
    else:
        try:
            crm1, pos1   = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1  = int(start1)
            end1    = int(end1)
        except ValueError:
            region1 = coord1
            start1  = None
            end1    = None
        if coord2:
            try:
                crm2, pos2   = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2  = int(start2)
                end2    = int(end2)
            except ValueError:
                region2 = coord2
                start2  = None
                end2    = None
        else:
            region2 = None
            start2  = None
            end2    = None

    if opts.plot and not opts.force_plot:
        if opts.interactive:
            max_size = 1500**2
        else:
            max_size = 5000**2
    else:
        max_size = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    if not opts.quiet:
        if region1:
            stdout.write('\nExtraction of %s' % (region1))
            if start1:
                stdout.write(':%s-%s' % (start1, end1))
            else:
                stdout.write(' (full chromosome)')
            if region2:
                stdout.write(' intersection with %s' % (region2))
                if start2:
                    stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    stdout.write(' (full chromosome)\n')
            else:
                stdout.write('\n')
        else:
            stdout.write('\nExtraction of full genome\n')

    out_files = {}
    out_plots = {}

    if opts.matrix or opts.plot:
        bamfile = AlignmentFile(mreads, 'rb')
        sections = OrderedDict(zip(bamfile.references,
                                   [x for x in bamfile.lengths]))
        total = 0
        section_pos = OrderedDict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else 'NRM'
                           if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads, opts.reso,
                    load(open(biases)) if biases and norm != 'raw' else None,
                    normalization=norm,
                    region1=region1, start1=start1, end1=end1,
                    region2=region2, start2=start2, end2=end2,
                    tmpdir=tmpdir, ncpus=opts.cpus,
                    return_headers=True,
                    nchunks=opts.nchunks, verbose=not opts.quiet,
                    clean=clean, max_size=max_size)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemented '
                         'for matrices\n... skipping\n')
                    continue
                raise
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = ((reg, p + 1, p + opts.reso) for r, reg in enumerate(regions)
                             for p in range(starts[r] if r < len(starts) and starts[r] else 0,
                                            ends[r] if r < len(ends) and ends[r] else sections[reg],
                                            opts.reso))
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name,
                                           nicer(opts.reso, sep=''),
                                           ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                out = open(path.join(outdir, fnam), 'w')
                for reg in regions:
                    out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                if region2:
                    out.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1])))
                    out.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2])))
                else:
                    out.write('# MASKED %s\n' % (','.join([str(b) for b in bads1])))
                if opts.row_names:
                    out.write('\n'.join('%s\t%d\t%d\t' % (row_names.next()) +
                                        '\t'.join(str(matrix.get((i, j), 0))
                                                  for i in xrange(b1, e1))
                                        for j in xrange(b2, e2)) + '\n')
                else:
                    out.write('\n'.join('\t'.join(str(matrix.get((i, j), 0))
                                                  for i in xrange(b1, e1))
                                        for j in xrange(b2, e2)) + '\n')
                out.close()
            if opts.plot:
                # transform matrix
                matrix = array([array([matrix.get((i, j), 0)
                                       for i in xrange(b1, e1)])
                                for j in xrange(b2, e2)])
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:, bad1] = 1
                for bad2 in bads2:
                    m[bad2, :] = 1
                matrix = ma.masked_array(matrix, m)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s%s.%s' % (
                    norm, name, nicer(opts.reso, sep=''),
                    ('_' + param_hash), '_tri' if opts.triangular else '',
                    opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                pltbeg1 = 0 if start1 is None else start1
                pltend1 = sections[regions[0]] if end1 is None else end1
                pltbeg2 = 0 if start2 is None else start2
                pltend2 = sections[regions[-1]] if end2 is None else end2
                xlabel = '{}:{:,}-{:,}'.format(
                    regions[0], pltbeg1 if pltbeg1 else 1, pltend1)
                ylabel = '{}:{:,}-{:,}'.format(
                    regions[-1], pltbeg2 if pltbeg2 else 1, pltend2)
                section_pos = OrderedDict((k, section_pos[k])
                                          for k in section_pos
                                          if k in regions)
                ax1, _ = plot_HiC_matrix(
                    matrix, triangular=opts.triangular,
                    vmin=vmin, vmax=vmax, cmap=opts.cmap,
                    figsize=opts.figsize,
                    bad_color=opts.bad_color if norm != 'raw' else None)
                ax1.set_title('Region: %s, normalization: %s, resolution: %s' % (
                    name, norm, nicer(opts.reso)), y=1.05)
                _format_axes(ax1, start1, end1, start2, end2, opts.reso,
                             regions, section_pos, sections,
                             opts.xtick_rotation, triangular=False)
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))
    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        out_files.update(write_matrix(
            mreads, opts.reso,
            load(open(biases)) if biases else None,
            outdir, filter_exclude=opts.filter,
            normalizations=opts.normalizations,
            region1=region1, start1=start1, end1=end1,
            region2=region2, start2=start2, end2=end2,
            tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus,
            nchunks=opts.nchunks, verbose=not opts.quiet,
            extra=param_hash, clean=clean))

    if clean:
        printime('Cleaning')
        system('rm -rf %s' % tmpdir)

    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)
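
A minimal sketch of the coordinate parsing used above: a 'chrN:beg-end' string splits into a region and integer bounds, while a bare chromosome name falls through the ValueError branch (the coordinate here is invented):

coord1 = 'chr3:1000000-2000000'
try:
    crm1, pos1 = coord1.split(':')
    start1, end1 = pos1.split('-')
    region1, start1, end1 = crm1, int(start1), int(end1)
except ValueError:  # no ':' or '-': a bare chromosome name
    region1, start1, end1 = coord1, None, None
assert (region1, start1, end1) == ('chr3', 1000000, 2000000)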
Example #8
def write_matrix(inbam,
                 resolution,
                 biases,
                 outfile,
                 filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                 region1=None,
                 start1=None,
                 end1=None,
                 clean=True,
                 region2=None,
                 start2=None,
                 end2=None,
                 nchunks=100,
                 tmpdir='.',
                 ncpus=8,
                 verbose=True,
                 window=None):

    if not isinstance(filter_exclude, int):
        filter_exclude = filters_to_bin(filter_exclude)

    _, rand_hash, bin_coords, chunks = read_bam(inbam,
                                                filter_exclude,
                                                resolution,
                                                ncpus=ncpus,
                                                region1=region1,
                                                start1=start1,
                                                end1=end1,
                                                region2=region2,
                                                start2=start2,
                                                end2=end2,
                                                tmpdir=tmpdir,
                                                nchunks=nchunks,
                                                verbose=verbose)

    bamfile = AlignmentFile(inbam, 'rb')
    sections = OrderedDict(
        zip(bamfile.references, [x / resolution + 1 for x in bamfile.lengths]))

    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]

    if biases:
        bias1, bias2, decay, bads1, bads2 = get_biases_region(
            biases, bin_coords)
        transform = lambda x, c, j, k: x / bias1[j] / bias2[k] / decay[c][abs(k - j)]
        transform2 = lambda x, j, k: x / bias1[j] / bias2[k]
    else:
        bads1 = bads2 = {}
        transform = lambda x, c, j, k: x   # no biases: keep the raw count
        transform2 = lambda x, j, k: x

    if bads1 is bads2:
        badcols = bads1
    else:  # should never happen
        badcols = bads1
        badcols.update(bads2)

    if verbose:
        printime('  - Writing matrices')

    mkdir(os.path.split(os.path.abspath(outfile))[0])
    # write the rest of the file to be sorted
    out = open(outfile, 'w')
    nheader = 0
    for i, c in enumerate(bamfile.references):
        out.write('# CHROM\t{}\t{}\n'.format(c, bamfile.lengths[i]))
        nheader += 1
    out.write('# RESOLUTION\t{}\n'.format(resolution))
    nheader += 1
    out.write('# BADCOLS\t{}\n'.format(','.join(map(str, badcols.keys()))))
    nheader += 1

    if window == 'all':
        outside = lambda c_, j_, k_: False
    elif window == 'intra':
        outside = lambda c_, j_, k_: c_ == ''
    elif window == 'inter':
        outside = lambda c_, j_, k_: c_ != ''
    else:
        min_, max_ = window
        outside = lambda c_, j_, k_: (k_ - j_) < min_ or (k_ - j_) > max_

    # pull all sub-matrices and write full matrix
    for c, j, k, v in _iter_matrix_frags(chunks,
                                         tmpdir,
                                         rand_hash,
                                         verbose=verbose,
                                         clean=clean):
        if k < j or j in badcols or k in badcols:  # keep only half of the matrix
            continue
        if outside(c, j, k):
            continue
        try:
            n = transform(v, c, j, k)  # normalize
        except KeyError:
            n = transform2(v, j, k)  # normalize no decay
        out.write('{}\t{}\t{}\t{}\n'.format(j, k, v, n))
    out.close()

    # this is the last thing we do in case something goes wrong
    if clean:
        os.system('rm -rf %s' % (os.path.join(tmpdir, '_tmp_%s' %
                                              (rand_hash))))
    return nheader
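
The window predicates above decide which bin pairs are written; for a numeric window a pair is kept only when its bin distance lies within [min_, max_] (the values below are invented):

min_, max_ = 10, 50
outside = lambda c_, j_, k_: (k_ - j_) < min_ or (k_ - j_) > max_
assert not outside('chr1', 100, 120)  # distance 20: kept
assert outside('chr1', 100, 500)      # distance 400: skipped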
Example #9
def read_bam(inbam, filter_exclude, resolution, min_count=2500,
             sigma=2, ncpus=8, factor=1, outdir='.', check_sum=False):
    bamfile = AlignmentFile(inbam, 'rb')
    sections = OrderedDict(zip(bamfile.references,
                               [x / resolution + 1 for x in bamfile.lengths]))
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm] + 1
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm + 1)])

    start_bin = 0
    end_bin   = len(bins) + 1
    total = end_bin - start_bin + 1
    regs = []
    begs = []
    ends = []
    njobs = min(total, 100) + 1
    nbins = total / njobs + 1
    for i in range(start_bin, end_bin, nbins):
        if i + nbins > end_bin:  # make sure that we stop at the right place
            nbins = end_bin - i
        try:
            (crm1, beg1), (crm2, end2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            (crm1, beg1), (crm2, end2) = bins[i], bins[-1]
        if crm1 != crm2:
            end1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(end1 * resolution + resolution)  # last nt included
            ends.append(end2 * resolution + resolution - 1)  # last nt not included (overlap with next window)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(end2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included

    # print '\n'.join(['%s %d %d' % (a, b, c) for a, b, c in zip(regs, begs, ends)])
    printime('\n  - Parsing BAM (%d chunks)' % (len(regs)))
    bins_dict = dict([(j, i) for i, j in enumerate(bins)])
    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        procs.append(pool.apply_async(
            read_bam_frag, args=(inbam, filter_exclude, bins, bins_dict,
                                 resolution, outdir, region, start, end,)))
    pool.close()
    print_progress(procs)
    pool.join()

    ## COLLECT RESULTS
    verbose = True
    cisprc = {}
    for countbin, (region, start, end) in enumerate(zip(regs, begs, ends)):
        if verbose:
            if not countbin % 10 and countbin:
                sys.stdout.write(' ')
            if not countbin % 50 and countbin:
                sys.stdout.write(' %9s\n     ' % ('%s/%s' % (countbin , len(regs))))
            sys.stdout.write('.')
            sys.stdout.flush()

        fname = os.path.join(outdir,
                             'tmp_bins_%s:%d-%d.pickle' % (region, start, end))
        tmp_cisprc = load(open(fname))
        cisprc.update(tmp_cisprc)
    if verbose:
        print '%s %9s\n' % (' ' * (54 - (countbin % 50) - (countbin % 50) / 10),
                            '%s/%s' % (len(regs),len(regs)))

    # out = open(os.path.join(outdir, 'dicos_%s.pickle' % (
    #     nicer(resolution).replace(' ', ''))), 'w')
    # dump(cisprc, out)
    # out.close()
    # bad columns
    def func_gen(x, *args):
        # evaluates a fitting function built from the module-level
        # 'func_restring' template (not used in the code path shown here)
        cmd = "zzz = " + func_restring % (args)
        exec(cmd) in globals(), locals()
        try:
            return np.lib.asarray_chkfinite(zzz)
        except ValueError:
            # avoid the creation of NaNs when invalid values for power or log
            return x
    print '  - Removing columns with too few or too many interactions'
    if not min_count:
        badcol = filter_by_cis_percentage(
            cisprc, sigma=sigma, verbose=True,
            savefig=os.path.join(outdir, 'filtered_bins_%s.png' % (
                nicer(resolution).replace(' ', ''))))
    else:
        print '      -> too few interactions defined as less than %9d interactions' % (
            min_count)
        for k in cisprc:
            cisprc[k] = cisprc[k][1]
        badcol = {}
        countL = 0
        countZ = 0
        for c in xrange(total):
            if cisprc.get(c, 0) < min_count:
                badcol[c] = cisprc.get(c, 0)
                countL += 1
                if c not in cisprc:
                    countZ += 1
        print '      -> removed %d columns (%d null, %d low counts) of %d (%.1f%%)' % (
            len(badcol), countZ, countL, total, float(len(badcol)) / total * 100)

    printime('  - Rescaling biases')
    size = len(bins)
    biases = [cisprc.get(k, 1.) for k in range(size)]
    mean_col = float(sum(biases)) / len(biases)
    biases = dict([(k, b / mean_col * mean_col**0.5)
                   for k, b in enumerate(biases)])

    # collect subset-matrices and write genomic one
    # out = open(os.path.join(outdir,
    #                         'hicdata_%s.abc' % (nicer(resolution).replace(' ', ''))), 'w')
    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        fname = os.path.join(outdir, 'tmp_%s:%d-%d.pickle' % (region, start, end))
        procs.append(pool.apply_async(sum_nrm_matrix, args=(fname, biases, )))
    pool.close()
    print_progress(procs)
    pool.join()

    # to correct biases
    sumnrm = sum(p.get() for p in procs)

    target = (sumnrm / float(size * size * factor))**0.5
    biases = dict([(b, biases[b] * target) for b in biases])

    # check the sum
    if check_sum:
        pool = mu.Pool(ncpus)
        procs = []
        for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
            fname = os.path.join(outdir, 'tmp_%s:%d-%d.pickle' % (region, start, end))
            procs.append(pool.apply_async(sum_nrm_matrix, args=(fname, biases, )))
        pool.close()
        print_progress(procs)
        pool.join()

        # to correct biases
        sumnrm = sum(p.get() for p in procs)
        print 'SUM:', sumnrm

    printime('  - Rescaling decay')
    # normalize decay by size of the diagonal, and by Vanilla correction
    # (all cells must still be equals to 1 in average)

    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        fname = os.path.join(outdir,
                             'tmp_%s:%d-%d.pickle' % (region, start, end))
        procs.append(pool.apply_async(sum_dec_matrix,
                                      args=(fname, biases, badcol, bins)))
    pool.close()
    print_progress(procs)
    pool.join()

    # collect results
    sumdec = {}
    for proc in procs:
        for k, v in proc.get().iteritems():
            try:
                sumdec[k] += v
            except KeyError:
                sumdec[k]  = v

    # count the number of cells per diagonal
    # TODO: parallelize
    # find the largest chromosome
    len_big = max(section_pos[c][1] - section_pos[c][0] for c in section_pos)
    # initialize dictionary
    ndiags = dict((k, 0) for k in xrange(len_big))
    for crm in section_pos:
        beg_chr, end_chr = section_pos[crm][0], section_pos[crm][1]
        chr_size = end_chr - beg_chr
        thesebads = [b for b in badcol if beg_chr <= b <= end_chr]
        for dist in xrange(1, chr_size):
            ndiags[dist] += chr_size - dist
            # from this we remove bad columns
            # bad columns will only affect if they are at least as distant from
            # a border as the distance between the longest diagonal and the
            # current diagonal.
            bad_diag = set()  # 2 bad rows can point to the same bad cell in diagonal
            maxp = end_chr - dist
            minp = beg_chr + dist
            for b in thesebads:
                if b <= maxp:
                    bad_diag.add(b)
                if b >= minp:
                    bad_diag.add(b - dist)
            ndiags[dist] -= len(bad_diag)
        # different behavior for the longest diagonal:
        ndiags[0] += chr_size - len(thesebads)

    # normalize sum per diagonal by total number of cells in diagonal
    for k in sumdec:
        try:
            sumdec[k] /= ndiags[k]
        except ZeroDivisionError:  # all columns at this distance are "bad"
            pass

    return biases, sumdec, badcol
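
A worked sketch of the two rescalings at the end of read_bam, with invented numbers: counts are first divided by their mean and multiplied by sqrt(mean), then scaled by a target factor so the normalized matrix sums to roughly size * size * factor:

counts = [100., 200., 300., 400.]   # hypothetical per-column counts
size, factor = len(counts), 1
mean_col = float(sum(counts)) / len(counts)               # 250.0
biases = dict((k, b / mean_col * mean_col ** 0.5)
              for k, b in enumerate(counts))
sumnrm = 18.2   # hypothetical sum of the bias-normalized matrix
target = (sumnrm / float(size * size * factor)) ** 0.5
biases = dict((b, biases[b] * target) for b in biases)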
Example #10
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    coord1 = opts.coord1

    if not coord1:
        region1 = None
        start1 = None
        end1 = None
    else:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None

    printime('Importing hic in %s format' % opts.format)
    if opts.format == 'matrix' or opts.format == 'text':
        with gzopen(opts.input) as f_thing:
            masked, chroms_gen, crm, beg, _, _ = read_file_header(f_thing)
        if not chroms_gen or (region1 and region1 not in chroms_gen):
            raise Exception('ERROR: chromosome sizes not included in the '
                            'import file. Please include the sizes of the '
                            'chromosomes to import in the file header, e.g.:\n'
                            '# CRM chr1    249250621')
    elif opts.format == 'cooler':
        if is_cooler(opts.input, opts.reso if opts.reso > 1 else None):
            chroms_gen = parse_header(opts.input,
                                      opts.reso if opts.reso > 1 else None)
            if not chroms_gen or (region1 and region1 not in chroms_gen):
                raise Exception('ERROR: chromosome sizes not included in the '
                                'import file.')
        else:
            raise Exception('ERROR: the input file is not a cooler')

    chroms = OrderedDict(
        (crm, int(chroms_gen[crm] // opts.reso) + 1) for crm in chroms_gen)
    sections = []
    if not region1:
        size = 0
        for crm in chroms:
            size += chroms[crm]
            sections.extend([(crm, i) for i in range(chroms[crm])])
    elif not start1:
        size = chroms[region1]
        sections.extend([(region1, i) for i in range(size)])
    else:
        #size = (end1 - start1)//opts.reso
        size = chroms[region1]
        sections.extend([
            (region1, i)
            for i in range(start1 // opts.reso, (end1 // opts.reso))
        ])
    dict_sec = dict([(j, i) for i, j in enumerate(sections)])
    bias_file = None
    badcol = {}
    if opts.format == 'text':
        with gzopen(opts.input) as f_thing:
            matrix = abc_reader(f_thing, size,
                                start1 // opts.reso if start1 else None)
        size_mat = size
    elif opts.format == 'matrix':
        with gzopen(opts.input) as in_f:
            matrix, size_mat, _, masked, _ = autoreader(in_f)
        if size != size_mat:
            raise Exception('ERROR: the size of the specified region differs '
                            'from the data in the matrix')
    elif opts.format == 'cooler':
        matrix, weights, size, header = parse_cooler(
            opts.input,
            opts.reso if opts.reso > 1 else None,
            normalized=True,
            raw_values=True)
        masked = {}
        size_mat = size
        if len(set(weights)) > 1:
            printime('Transforming cooler weights to biases')
            outdir_norm = path.join(opts.workdir, '04_normalization')
            mkdir(outdir_norm)

            bias_file = path.join(
                outdir_norm, 'biases_%s_%s.pickle' %
                (nicer(opts.reso).replace(' ', ''), param_hash))
            out = open(bias_file, 'wb')
            badcol.update((i, True) for i, m in enumerate(weights) if m == 0)
            dump({'biases': dict((k, b if b > 0 else float('nan'))
                                 for k, b in enumerate(weights)),
                  'decay': {},
                  'badcol': badcol,
                  'resolution': opts.reso}, out, HIGHEST_PROTOCOL)
            out.close()

    hic = HiC_data(matrix,
                   size_mat,
                   dict_sec=dict_sec,
                   chromosomes=chroms,
                   masked=masked,
                   resolution=opts.reso)

    #from pytadbit.mapping.analyze import hic_map
    #hic_map(hic, normalized=False, focus='chr1', show=True, cmap='viridis')

    printime('Creating BAM file')
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)

    total_counts = create_BAMhic(hic,
                                 opts.cpus,
                                 outbam,
                                 chroms_gen,
                                 opts.reso,
                                 samtools=opts.samtools)

    finish_time = time.localtime()
    # save all job information to sqlite DB
    save_to_db(opts, total_counts, size_mat, bias_file, len(badcol),
               outbam + '.bam', launch_time, finish_time)
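
A short sketch of the cooler weight conversion above, with invented weights: zero weights flag bad columns and become NaN biases in the dumped dictionary:

weights = [1.1, 0.0, 0.9]   # hypothetical cooler weights
badcol = dict((i, True) for i, m in enumerate(weights) if m == 0)
biases = dict((k, b if b > 0 else float('nan'))
              for k, b in enumerate(weights))
assert badcol == {1: True}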
Example #11
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    if opts.zrange:
        vmin = float(opts.zrange.split(',')[0])
        vmax = float(opts.zrange.split(',')[1])
    else:
        vmin = vmax = None

    clean = True  # change for debug

    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v != 'raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input, should provide path to'
                            ' biases file.')
        biases = opts.biases
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases

    coord1 = opts.coord1
    coord2 = opts.coord2

    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = None
        start1 = None
        end1 = None
        region2 = None
        start2 = None
        end2 = None
    else:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None
        if coord2:
            try:
                crm2, pos2 = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2 = int(start2)
                end2 = int(end2)
            except ValueError:
                region2 = coord2
                start2 = None
                end2 = None
        else:
            region2 = None
            start2 = None
            end2 = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    if not opts.quiet:
        if region1:
            stdout.write('\nExtraction of %s' % (region1))
            if start1:
                stdout.write(':%s-%s' % (start1, end1))
            else:
                stdout.write(' (full chromosome)')
            if region2:
                stdout.write(' intersection with %s' % (region2))
                if start2:
                    stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    stdout.write(' (full chromosome)\n')
            else:
                stdout.write('\n')
        else:
            stdout.write('\nExtraction of full genome\n')

    out_files = {}
    out_plots = {}

    if opts.matrix or opts.plot:
        bamfile = AlignmentFile(mreads, 'rb')
        sections = OrderedDict(
            zip(bamfile.references, [x for x in bamfile.lengths]))
        total = 0
        section_pos = dict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else
                           'NRM' if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads,
                    opts.reso,
                    load(open(biases)) if biases and norm != 'raw' else None,
                    normalization=norm,
                    region1=region1,
                    start1=start1,
                    end1=end1,
                    region2=region2,
                    start2=start2,
                    end2=end2,
                    tmpdir=tmpdir,
                    ncpus=opts.cpus,
                    return_headers=True,
                    nchunks=opts.nchunks,
                    verbose=not opts.quiet,
                    clean=clean)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemeted for '
                         'matrices\n... skipping\n')
                    continue
                raise
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = ((reg, p + 1, p + opts.reso)
                             for r, reg in enumerate(regions) for p in range(
                                 starts[r] if r < len(starts) and starts[r]
                                 else 0, ends[r] if r < len(ends) and ends[r]
                                 else sections[reg], opts.reso))
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name, nicer(
                    opts.reso).replace(' ', ''), ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                out = open(path.join(outdir, fnam), 'w')
                for reg in regions:
                    out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                if region2:
                    out.write('# BADROWS %s\n' %
                              (','.join([str(b) for b in bads1])))
                    out.write('# BADCOLS %s\n' %
                              (','.join([str(b) for b in bads2])))
                else:
                    out.write('# MASKED %s\n' %
                              (','.join([str(b) for b in bads1])))
                if opts.row_names:
                    out.write('\n'.join('%s\t%d\t%d\t' %
                                        (row_names.next()) + '\t'.join(
                                            str(matrix.get((i, j), 0))
                                            for i in xrange(b1, e1))
                                        for j in xrange(b2, e2)) + '\n')
                else:
                    out.write('\n'.join('\t'.join(
                        str(matrix.get((i, j), 0)) for i in xrange(b1, e1))
                                        for j in xrange(b2, e2)) + '\n')
                out.close()
            if opts.plot:
                cmap = plt.get_cmap(opts.cmap)
                if norm != 'raw':
                    cmap.set_bad('grey', 1.)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s.%s' % (norm, name, nicer(opts.reso).replace(
                    ' ', ''), ('_' + param_hash), opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                if opts.interactive:
                    _ = plt.figure(figsize=(8, 7))
                else:
                    _ = plt.figure(figsize=(16, 14))
                # ax1 = plt.subplot(111)
                ax1 = plt.axes([0.1, 0.1, 0.7, 0.8])
                ax2 = plt.axes([0.82, 0.1, 0.07, 0.8])
                matrix = array([
                    array([matrix.get((i, j), 0) for i in xrange(b1, e1)])
                    for j in xrange(b2, e2)
                ])
                mini = np_min(matrix[nonzero(matrix)]) / 2.
                matrix[matrix == 0] = mini
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:, bad1] = 1
                for bad2 in bads2:
                    m[bad2, :] = 1
                matrix = log2(ma.masked_array(matrix, m))
                ax1.imshow(matrix,
                           interpolation='None',
                           origin='lower',
                           cmap=cmap,
                           vmin=vmin,
                           vmax=vmax)

                if len(regions) <= 2:
                    pltbeg1 = 0 if start1 is None else start1
                    pltend1 = sections[regions[0]] if end1 is None else end1
                    pltbeg2 = pltbeg1 if len(
                        regions) == 1 else 0 if start2 is None else start2
                    pltend2 = pltend1 if len(regions) == 1 else sections[
                        regions[-1]] if end2 is None else end2

                    ax1.set_xlabel('{}:{:,}-{:,}'.format(
                        regions[0], pltbeg1 if pltbeg1 else 1, pltend1))
                    ax1.set_ylabel('{}:{:,}-{:,}'.format(
                        regions[-1], pltbeg2 if pltbeg2 else 1, pltend2))

                    def format_xticks(tickstring, _=None):
                        tickstring = int(tickstring * opts.reso + pltbeg1)
                        return nicer(tickstring if tickstring else 1,
                                     coma=True)

                    def format_yticks(tickstring, _=None):
                        tickstring = int(tickstring * opts.reso + pltbeg2)
                        return nicer(tickstring if tickstring else 1,
                                     coma=True)

                    ax1.xaxis.set_major_formatter(FuncFormatter(format_xticks))
                    ax1.yaxis.set_major_formatter(FuncFormatter(format_yticks))

                    labels = ax1.get_xticklabels()
                    plt.setp(labels, rotation=-25, ha='left')

                    ax1.set_xlim(-0.5, len(matrix[0]) - 0.5)
                    ax1.set_ylim(-0.5, len(matrix) - 0.5)
                else:
                    vals = [0]
                    keys = ['']
                    for crm in regions:
                        vals.append(section_pos[crm][0] / opts.reso)
                        keys.append(crm)
                    vals.append(section_pos[crm][1] / opts.reso)
                    ax1.set_yticks(vals)
                    ax1.set_yticklabels('')
                    ax1.set_yticks([
                        float(vals[i] + vals[i + 1]) / 2
                        for i in xrange(len(vals) - 1)
                    ],
                                   minor=True)
                    ax1.set_yticklabels(keys, minor=True)
                    for t in ax1.yaxis.get_minor_ticks():
                        t.tick1On = False
                        t.tick2On = False

                    ax1.set_xticks(vals)
                    ax1.set_xticklabels('')
                    ax1.set_xticks([
                        float(vals[i] + vals[i + 1]) / 2
                        for i in xrange(len(vals) - 1)
                    ],
                                   minor=True)
                    ax1.set_xticklabels(keys, minor=True)
                    for t in ax1.xaxis.get_minor_ticks():
                        t.tick1On = False
                        t.tick2On = False
                    ax1.set_xlabel('Chromosomes')
                    ax1.set_ylabel('Chromosomes')
                    ax1.set_xlim(-0.5, len(matrix[0]) - 0.5)
                    ax1.set_ylim(-0.5, len(matrix) - 0.5)
                data = [i for d in matrix for i in d if isfinite(i)]
                mindata = nanmin(data)
                maxdata = nanmax(data)
                gradient = linspace(maxdata, mindata,
                                    max((len(matrix), len(matrix[0]))))
                gradient = dstack((gradient, gradient))[0]
                h = ax2.hist(data,
                             color='darkgrey',
                             linewidth=2,
                             orientation='horizontal',
                             bins=50,
                             histtype='step',
                             normed=True)
                _ = ax2.imshow(gradient,
                               aspect='auto',
                               cmap=cmap,
                               extent=(0, max(h[0]), mindata, maxdata))
                ax2.yaxis.tick_right()
                ax2.yaxis.set_label_position("right")
                ax2.set_xticks([])
                ax1.set_title('Region: %s, normalization: %s, resolution: %s' %
                              (name, norm, nicer(opts.reso)))
                ax2.set_ylabel('Hi-C Log2 interactions', rotation=-90)
                ax2.set_xlabel('Count')
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))
    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        out_files.update(
            write_matrix(mreads,
                         opts.reso,
                         load(open(biases)) if biases else None,
                         outdir,
                         filter_exclude=opts.filter,
                         normalizations=opts.normalizations,
                         region1=region1,
                         start1=start1,
                         end1=end1,
                         region2=region2,
                         start2=start2,
                         end2=end2,
                         tmpdir=tmpdir,
                         append_to_tar=None,
                         ncpus=opts.cpus,
                         nchunks=opts.nchunks,
                         verbose=not opts.quiet,
                         extra=param_hash,
                         clean=clean))

    if clean:
        printime('Cleaning')
        system('rm -rf %s' % tmpdir)

    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)
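
A sketch of the plotting transform used above: zeros are replaced by half the smallest non-zero value before taking log2, so empty cells do not map to -inf (the matrix here is made up):

from numpy import array, log2, nonzero
from numpy import min as np_min

matrix = array([[0., 4.], [2., 8.]])
mini = np_min(matrix[nonzero(matrix)]) / 2.   # half of 2.0, the smallest non-zero
matrix[matrix == 0] = mini
matrix = log2(matrix)                         # no -inf entries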
Пример #13
0
def read_bam(inbam,
             filter_exclude,
             resolution,
             min_count=2500,
             sigma=2,
             ncpus=8,
             factor=1,
             outdir='.',
             check_sum=False):
    bamfile = AlignmentFile(inbam, 'rb')
    sections = OrderedDict(
        zip(bamfile.references, [x / resolution + 1 for x in bamfile.lengths]))
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm] + 1
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm + 1)])

    start_bin = 0
    end_bin = len(bins) + 1
    total = len(bins)

    total = end_bin - start_bin + 1
    regs = []
    begs = []
    ends = []
    njobs = min(total, 100) + 1
    nbins = total / njobs + 1
    for i in range(start_bin, end_bin, nbins):
        if i + nbins > end_bin:  # make sure that we stop at the right place
            nbins = end_bin - i
        try:
            (crm1, beg1), (crm2, end2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            (crm1, beg1), (crm2, end2) = bins[i], bins[-1]
        if crm1 != crm2:
            end1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(end1 * resolution + resolution)  # last nt included
            ends.append(end2 * resolution + resolution -
                        1)  # last nt not included (overlap with next window)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(end2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included

    # print '\n'.join(['%s %d %d' % (a, b, c) for a, b, c in zip(regs, begs, ends)])
    printime('\n  - Parsing BAM (%d chunks)' % (len(regs)))
    bins_dict = dict([(j, i) for i, j in enumerate(bins)])
    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        procs.append(
            pool.apply_async(read_bam_frag,
                             args=(
                                 inbam,
                                 filter_exclude,
                                 bins,
                                 bins_dict,
                                 resolution,
                                 outdir,
                                 region,
                                 start,
                                 end,
                             )))
    pool.close()
    print_progress(procs)
    pool.join()

    ## COLLECT RESULTS
    verbose = True
    cisprc = {}
    for countbin, (region, start, end) in enumerate(zip(regs, begs, ends)):
        if verbose:
            if not countbin % 10 and countbin:
                sys.stdout.write(' ')
            if not countbin % 50 and countbin:
                sys.stdout.write(' %9s\n     ' % ('%s/%s' %
                                                  (countbin, len(regs))))
            sys.stdout.write('.')
            sys.stdout.flush()

        fname = os.path.join(outdir,
                             'tmp_bins_%s:%d-%d.pickle' % (region, start, end))
        tmp_cisprc = load(open(fname))
        cisprc.update(tmp_cisprc)
    if verbose:
        print '%s %9s\n' % (' ' * (54 - (countbin % 50) -
                                   (countbin % 50) / 10), '%s/%s' %
                            (len(regs), len(regs)))

    # out = open(os.path.join(outdir, 'dicos_%s.pickle' % (
    #     nicer(resolution).replace(' ', ''))), 'w')
    # dump(cisprc, out)
    # out.close()
    # bad columns
    def func_gen(x, *args):
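        # NOTE (editor): 'func_restring' is a module-level format template
        # defined elsewhere in the original module (not shown in this listing)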
        cmd = "zzz = " + func_restring % (args)
        exec(cmd) in globals(), locals()
        #print cmd
        try:
            return np.lib.asarray_chkfinite(zzz)
        except:
            # avoid the creation of NaNs when invalid values for power or log
            return x

    print '  - Removing columns with too few or too many interactions'
    if not min_count:

        badcol = filter_by_cis_percentage(
            cisprc,
            sigma=sigma,
            verbose=True,
            savefig=os.path.join(outdir, 'filtered_bins_%s.png' %
                                 (nicer(resolution).replace(' ', ''))))
    else:
        print '      -> too few interactions defined as less than %9d interactions' % (
            min_count)
        for k in cisprc:
            cisprc[k] = cisprc[k][1]
        badcol = {}
        countL = 0
        countZ = 0
        for c in xrange(total):
            if cisprc.get(c, 0) < min_count:
                badcol[c] = cisprc.get(c, 0)
                countL += 1
                if not c in cisprc:
                    countZ += 1
        print '      -> removed %d columns (%d/%d null/high counts) of %d (%.1f%%)' % (
            len(badcol), countZ, countL, total,
            float(len(badcol)) / total * 100)

    printime('  - Rescaling biases')
    size = len(bins)
    biases = [cisprc.get(k, 1.) for k in range(size)]
    mean_col = float(sum(biases)) / len(biases)
    biases = dict([(k, b / mean_col * mean_col**0.5)
                   for k, b in enumerate(biases)])
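    # NOTE (editor): b / mean * sqrt(mean) == b / sqrt(mean), so the product
    # bias_i * bias_j of two columns scales as (s_i * s_j) / mean -- the
    # single-pass Vanilla correction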

    # collect subset-matrices and write genomic one
    # out = open(os.path.join(outdir,
    #                         'hicdata_%s.abc' % (nicer(resolution).replace(' ', ''))), 'w')
    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        fname = os.path.join(outdir,
                             'tmp_%s:%d-%d.pickle' % (region, start, end))
        procs.append(pool.apply_async(sum_nrm_matrix, args=(
            fname,
            biases,
        )))
    pool.close()
    print_progress(procs)
    pool.join()

    # to correct biases
    sumnrm = sum(p.get() for p in procs)

    target = (sumnrm / float(size * size * factor))**0.5
    biases = dict([(b, biases[b] * target) for b in biases])

    # check the sum
    if check_sum:
        pool = mu.Pool(ncpus)
        procs = []
        for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
            fname = os.path.join(outdir,
                                 'tmp_%s:%d-%d.pickle' % (region, start, end))
            procs.append(
                pool.apply_async(sum_nrm_matrix, args=(
                    fname,
                    biases,
                )))
        pool.close()
        print_progress(procs)
        pool.join()

        # to correct biases
        sumnrm = sum(p.get() for p in procs)
        print 'SUM:', sumnrm

    printime('  - Rescaling decay')
    # normalize decay by size of the diagonal, and by Vanilla correction
    # (all cells should still average ~1)

    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        fname = os.path.join(outdir,
                             'tmp_%s:%d-%d.pickle' % (region, start, end))
        procs.append(
            pool.apply_async(sum_dec_matrix,
                             args=(fname, biases, badcol, bins)))
    pool.close()
    print_progress(procs)
    pool.join()

    # collect results
    sumdec = {}
    for proc in procs:
        for k, v in proc.get().iteritems():
            try:
                sumdec[k] += v
            except KeyError:
                sumdec[k] = v

    # count the number of cells per diagonal
    # TODO: parallelize
    # find largest chromosome
    len_big = max(section_pos[c][1] - section_pos[c][0] for c in section_pos)
    # initialize dictionary
    ndiags = dict((k, 0) for k in xrange(len_big))
    for crm in section_pos:
        beg_chr, end_chr = section_pos[crm][0], section_pos[crm][1]
        chr_size = end_chr - beg_chr
        thesebads = [b for b in badcol if beg_chr <= b <= end_chr]
        for dist in xrange(1, chr_size):
            ndiags[dist] += chr_size - dist
            # from this we remove bad columns
            # bad columns will only affect if they are at least as distant from
            # a border as the distance between the longest diagonal and the
            # current diagonal.
            bad_diag = set()  # 2 bad rows can point to the same bad cell in diagonal
            maxp = end_chr - dist
            minp = beg_chr + dist
            for b in thesebads:
                if b <= maxp:
                    bad_diag.add(b)
                if b >= minp:
                    bad_diag.add(b - dist)
            ndiags[dist] -= len(bad_diag)
        # different behavior for longest diagonal:
        ndiags[0] += chr_size - len(thesebads)

    # normalize sum per diagonal by total number of cells in diagonal
    for k in sumdec:
        try:
            sumdec[k] /= ndiags[k]
        except ZeroDivisionError:  # all columns at this distance are "bad"
            pass

    return biases, sumdec, badcol
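
A minimal usage sketch for the function above, added by the editor (not part of the original listing): the BAM path, filter mask, and bin indices are placeholders. It shows how the returned biases and bad columns would typically normalize a single raw contact count.

def normalize_cell(i, j, raw, biases, badcol):
    # hypothetical helper: Vanilla-normalized value of one raw count
    if i in badcol or j in badcol:
        return float('nan')  # column was filtered out
    return raw / (biases[i] * biases[j])

biases, decay, badcol = read_bam('sample.bam', filter_exclude=(1, 2, 3),
                                 resolution=100000)
print normalize_cell(10, 42, 57.0, biases, badcol)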
Example #14
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception(
                'ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception(
                'ERROR: missing restriction enzyme name for oneD normalization'
            )
        if not opts.mappability:
            raise Exception(
                'ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print 'WARNING: %d extra chromosomes in FASTA (removing them)' % (
                len(fas - bam))
            if len(fas - bam) <= 50:
                print '\n'.join([('  - ' + c) for c in (fas - bam)])
        if bam - fas:
            txt = ('\n'.join([('  - ' + c)
                              for c in (bam -
                                        fas)]) if len(bam - fas) <= 50 else '')
            raise Exception(
                'ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' %
                (len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception(
                "ERROR: chromosomes in FASTA differ from the ones in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        fh = open(opts.mappability)
        mappability = dict((c, []) for c in refs)
        line = fh.next()
        crmM, begM, endM, val = line.split()
        crm = crmM
        if crmM not in mappability:
            print('     skipping %s' % crmM)
            while crmM not in mappability:
                line = fh.next()
                crmM, begM, endM, val = line.split()
                crm = crmM
        while any(not mappability[c] for c in mappability):
            for begB in xrange(0, len(genome[crmM]), opts.reso):
                endB = begB + opts.reso
                tmp = 0
                try:
                    while True:
                        crmM, begM, endM, val = line.split()
                        if crm != crmM:
                            try:
                                while crmM not in refs:
                                    line = fh.next()
                                    crmM, _ = line.split('\t', 1)
                            except StopIteration:
                                pass
                            break
                        begM = int(begM)
                        endM = int(endM)
                        if endM > endB:
                            weight = endB - begM
                            if weight >= 0:
                                tmp += weight * float(val)
                            break
                        weight = endM - (begM if begM > begB else begB)
                        if weight < 0:
                            break
                        tmp += weight * float(val)
                        line = fh.next()
                except StopIteration:
                    pass
                mappability[crm].append(tmp / opts.reso)
                crm = crmM
        mappability = reduce(lambda x, y: x + y,
                             (mappability[c] for c in refs))

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome,
                                    opts.reso,
                                    chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

        ## CHECK TO BE REMOVED
        # out = open('tmp_mappability.txt', 'w')
        # i = 0
        # for crm in refs:
        #     for pos in xrange(len(genome[crm]) / opts.reso + 1):
        #         out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso, pos * opts.reso + opts.reso, mappability[i]))
        #         i += 1
        # out.close()
        # compute GC content ~30 sec
        # TODO: read from DB
    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads,
        filter_exclude,
        opts.reso,
        min_count=opts.min_count,
        sigma=2,
        factor=1,
        outdir=outdir,
        extra_out=param_hash,
        ncpus=opts.cpus,
        normalization=opts.normalization,
        mappability=mappability,
        cg_content=gc_content,
        n_rsites=n_rsites,
        min_perc=opts.min_perc,
        max_perc=opts.max_perc,
        normalize_only=opts.normalize_only,
        max_njobs=opts.max_njobs,
        extra_bads=opts.badcols)

    bad_col_image = path.join(
        outdir, 'filtered_bins_%s_%s.png' %
        (nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords_%s_%s.png' %
        (opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay,
            max_diff=10000,
            resolution=opts.reso,
            normalized=not opts.filter_only,
            savefig=inter_vs_gcoord)

        print('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases
    bias_file = path.join(
        outdir, 'biases_%s_%s.pickle' %
        (nicer(opts.reso).replace(' ', ''), param_hash))
    out = open(bias_file, 'w')

    dump(
        {
            'biases': biases,
            'decay': decay,
            'badcol': badcol,
            'resolution': opts.reso
        }, out)
    out.close()

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image, len(badcol),
                   len(biases), raw_cisprc, norm_cisprc, inter_vs_gcoord, a2,
                   opts.filter, launch_time, finish_time)
    except:
        # release lock anyway
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
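
A minimal read-back sketch, added by the editor (not in the original source): it assumes the pickle written just above and reuses the key names from the dump() call.

# protocol 0 (the default used above) writes ASCII, so text mode is fine
from cPickle import load  # 'pickle' on Python 3

with open(bias_file) as in_fh:
    data = load(in_fh)
print data['resolution'], len(data['biases']), len(data['badcol'])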
Example #15
def read_bam(inbam, filter_exclude, resolution, min_count=2500, biases_path='',
             normalization='Vanilla', mappability=None, n_rsites=None,
             cg_content=None, sigma=2, ncpus=8, factor=1, outdir='.', seed=1,
             extra_out='', only_valid=False, normalize_only=False, p_fit=None,
             max_njobs=100, min_perc=None, max_perc=None, extra_bads=None):
    bamfile = AlignmentFile(inbam, 'rb')
    sections = OrderedDict(zip(bamfile.references,
                               [x / resolution + 1 for x in bamfile.lengths]))
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm)])

    start_bin = 0
    end_bin   = len(bins)
    total     = len(bins)

    regs = []
    begs = []
    ends = []
    njobs = min(total, max_njobs) + 1
    nbins = total / njobs + 1
    for i in range(start_bin, end_bin, nbins):
        if i + nbins > end_bin:  # make sure that we stop
            nbins = end_bin - i
        try:
            (crm1, beg1), (crm2, end2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            try:
                (crm1, beg1), (crm2, end2) = bins[i], bins[-1]
            except IndexError:
                break
        if crm1 != crm2:
            end1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(end1 * resolution + resolution)  # last nt included
            ends.append(end2 * resolution + resolution - 1)  # last nt not included (overlap with next window)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(end2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included

    # print '\n'.join(['%s %d %d' % (a, b, c) for a, b, c in zip(regs, begs, ends)])
    printime('  - Parsing BAM (%d chunks)' % (len(regs)))
    bins_dict = dict([(j, i) for i, j in enumerate(bins)])
    pool = mu.Pool(ncpus)
    procs = []
    read_bam_frag = read_bam_frag_valid if only_valid else read_bam_frag_filter
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        procs.append(pool.apply_async(
            read_bam_frag, args=(inbam, filter_exclude, bins, bins_dict,
                                 resolution, outdir, extra_out,
                                 region, start, end,)))
    pool.close()
    print_progress(procs)
    pool.join()
    ## COLLECT RESULTS
    cisprc = {}
    printime('  - Collecting cis and total interactions per bin (%d chunks)' % (len(regs)))
    stdout.write('     ')
    for countbin, (region, start, end) in enumerate(zip(regs, begs, ends)):
        if not countbin % 10 and countbin:
            stdout.write(' ')
        if not countbin % 50 and countbin:
            stdout.write(' %9s\n     ' % ('%s/%s' % (countbin , len(regs))))
        stdout.write('.')
        stdout.flush()

        fname = path.join(outdir,
                          'tmp_bins_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
        tmp_cisprc = load(open(fname))
        system('rm -f %s' % fname)
        cisprc.update(tmp_cisprc)
    stdout.write('\n')

    printime('  - Removing columns with too few or too many interactions')
    if len(bamfile.references) == 1 and min_count is None:
        raise Exception("ERROR: only one chromosome can't filter by "
                        "cis-percentage, set min_count instead")
    elif min_count is None and len(bamfile.references) > 1:
        badcol = filter_by_cis_percentage(
            cisprc, sigma=sigma, verbose=True, min_perc=min_perc, max_perc=max_perc,
            size=total, savefig=None)
    else:
        print ('      -> too few interactions defined as less than %9d '
               'interactions' % min_count)
        badcol = {}
        countL = 0
        countZ = 0
        for c in xrange(total):
            if cisprc.get(c, [0, 0])[1] < min_count:
                badcol[c] = cisprc.get(c, [0, 0])[1]
                countL += 1
                if not c in cisprc:
                    countZ += 1
        print '      -> removed %d columns (%d/%d null/high counts) of %d (%.1f%%)' % (
            len(badcol), countZ, countL, total, float(len(badcol)) / total * 100)

    # no mappability will result in NaNs, better to filter out these columns
    if mappability:
        badcol.update((i, True) for i, m in enumerate(mappability) if not m)

    # add manually columns to bad columns
    if extra_bads:
        removed_manually = 0
        for ebc in extra_bads:
            c, ebc = ebc.split(':')
            b, e = map(int, ebc.split('-'))
            b = b / resolution + section_pos[c][0]
            e = e / resolution + section_pos[c][0]
            removed_manually += (e - b)
            badcol.update(dict((p, 'manual') for p in xrange(b, e)))
        printime('  - Removed %d columns manually.' % removed_manually)
    raw_cisprc = sum(float(cisprc[k][0]) / cisprc[k][1]
                     for k in cisprc if not k in badcol) / (len(cisprc) - len(badcol))

    printime('  - Rescaling sum of interactions per bins')
    size = len(bins)
    biases = [float('nan') if k in badcol else cisprc.get(k, [0, 1.])[1]
              for k in xrange(size)]

    if normalization == 'ICE':
        printime('  - ICE normalization')
        hic_data = load_hic_data_from_bam(
            inbam, resolution, filter_exclude=filter_exclude,
            tmpdir=outdir, ncpus=ncpus)
        hic_data.bads = badcol
        hic_data.normalize_hic(iterations=100, max_dev=0.000001)
        biases = hic_data.bias.copy()
        del(hic_data)
    elif normalization == 'Vanilla':
        printime('  - Vanilla normalization')
        mean_col = nanmean(biases)
        biases   = dict((k, b / mean_col * mean_col**0.5)
                        for k, b in enumerate(biases))
    elif normalization == 'SQRT':
        printime('  - Vanilla-SQRT normalization')
        biases = [b**0.5 for b in biases]
        mean_col = nanmean(biases)
        biases   = dict((k, b / mean_col * mean_col**0.5)
                        for k, b in enumerate(biases))
    elif normalization == 'oneD':
        printime('  - oneD normalization')
        if len(set([len(biases), len(mappability), len(n_rsites), len(cg_content)])) > 1:
            print "biases", "mappability", "n_rsites", "cg_content"
            print len(biases), len(mappability), len(n_rsites), len(cg_content)
            raise Exception('Error: not all arrays have the same size')
        tmp_oneD = path.join(outdir,'tmp_oneD_%s' % (extra_out))
        mkdir(tmp_oneD)
        biases = oneD(tmp_dir=tmp_oneD, p_fit=p_fit, tot=biases, map=mappability,
                      res=n_rsites, cg=cg_content, seed=seed)
        biases = dict((k, b) for k, b in enumerate(biases))
        rmtree(tmp_oneD)
    elif normalization == 'custom':
        n_pos = 0
        biases = {}
        print 'Using provided biases...'
        with open(biases_path, 'r') as r:
            r.next()
            for line in r:
                if line[0] == 'N':
                    #b = float('nan')
                    badcol[n_pos] = 0
                    biases[n_pos] = float('nan')
                else:
                    b = float(line)
                    if b == 0:
                        badcol[n_pos] = 0
                        biases[n_pos] = float('nan')
                    else:
                        biases[n_pos] = b
                n_pos += 1
        for add in range(max(biases.keys()), total + 1):
            biases[add] = float('nan')
    else:
        raise NotImplementedError('ERROR: method %s not implemented' %
                                  normalization)

    # collect subset-matrices and write genomic one
    # out = open(os.path.join(outdir,
    #                         'hicdata_%s.abc' % (nicer(resolution).replace(' ', ''))), 'w')
    printime('  - Getting sum of normalized bins')
    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        fname = path.join(outdir,
                          'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
        procs.append(pool.apply_async(sum_nrm_matrix,
                                      args=(fname, biases,)))
    pool.close()
    print_progress(procs)
    pool.join()

    # to correct biases
    sumnrm = sum(p.get() for p in procs)

    target = (sumnrm / float(size * size * factor))**0.5
    biases = dict([(b, biases[b] * target) for b in biases])
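    # NOTE (editor): each normalized cell is divided by target**2 =
    # sumnrm / (size * size * factor), so the rescaled matrix sums to
    # size * size * factor and the average cell is ~1 when factor is 1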

    if not normalize_only:
        printime('  - Computing Cis percentage')
        # Calculate Cis percentage

        pool = mu.Pool(ncpus)
        procs = []
        for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
            fname = path.join(outdir,
                              'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
            procs.append(pool.apply_async(get_cis_perc,
                                          args=(fname, biases, badcol, bins)))
        pool.close()
        print_progress(procs)
        pool.join()

        # collect results
        cis = total = 0
        for proc in procs:
            c, t = proc.get()
            cis += c
            total += t
        norm_cisprc = float(cis) / total
        print '    * Cis-percentage: %.1f%%' % (norm_cisprc * 100)
    else:
        norm_cisprc = 0.

    printime('  - Rescaling decay')
    # normalize decay by size of the diagonal, and by Vanilla correction
    # (all cells should still average ~1)

    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        fname = path.join(outdir,
                          'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
        procs.append(pool.apply_async(sum_dec_matrix,
                                      args=(fname, biases, badcol, bins)))
    pool.close()
    print_progress(procs)
    pool.join()

    # collect results
    nrmdec = {}
    rawdec = {}
    for proc in procs:
        tmpnrm, tmpraw = proc.get()
        for c, d in tmpnrm.iteritems():
            for k, v in d.iteritems():
                try:
                    nrmdec[c][k] += v
                    rawdec[c][k] += tmpraw[c][k]
                except KeyError:
                    try:
                        nrmdec[c][k]  = v
                        rawdec[c][k] = tmpraw[c][k]
                    except KeyError:
                        nrmdec[c] = {k: v}
                        rawdec[c] = {k: tmpraw[c][k]}
    # count the number of cells per diagonal
    # TODO: parallelize
    # find largest chromosome
    len_crms = dict((c, section_pos[c][1] - section_pos[c][0]) for c in section_pos)
    # initialize dictionary
    ndiags = dict((c, dict((k, 0) for k in xrange(len_crms[c]))) for c in sections)
    for crm in section_pos:
        beg_chr, end_chr = section_pos[crm][0], section_pos[crm][1]
        chr_size = end_chr - beg_chr
        thesebads = [b for b in badcol if beg_chr <= b <= end_chr]
        for dist in xrange(1, chr_size):
            ndiags[crm][dist] += chr_size - dist
            # from this we remove bad columns
            # bad columns will only affect if they are at least as distant from
            # a border as the distance between the longest diagonal and the
            # current diagonal.
            bad_diag = set()  # 2 bad rows can point to the same bad cell in diagonal
            maxp = end_chr - dist
            minp = beg_chr + dist
            for b in thesebads:
                if b < maxp:  # not inclusive!!
                    bad_diag.add(b)
                if b >= minp:
                    bad_diag.add(b - dist)
            ndiags[crm][dist] -= len(bad_diag)
        # different behavior for longest diagonal:
        ndiags[crm][0] += chr_size - sum(beg_chr <= b < end_chr for b in thesebads)

    # normalize sum per diagonal by total number of cells in diagonal
    signal_to_noise = 0.05
    min_n = signal_to_noise ** -2. # equals 400 when default
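    # NOTE (editor): Poisson counting noise has relative error ~1/sqrt(N) for
    # N raw counts; keeping it below signal_to_noise = 0.05 requires
    # N >= 0.05**-2 = 400, hence diagonals are pooled below until the raw sum
    # exceeds min_n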
    for crm in sections:
        if not crm in nrmdec:
            nrmdec[crm] = {}
            rawdec[crm] = {}
        tmpdec = 0  # normalized counts pooled over consecutive diagonals
        tmpsum = 0  # raw counts pooled over consecutive diagonals
        ndiag  = 0
        val    = 0
        previous = [] # store diagonals to be summed in case not reaching the minimum
        for k in ndiags[crm]:
            tmpdec += nrmdec[crm].get(k, 0.)
            tmpsum += rawdec[crm].get(k, 0.)
            previous.append(k)
            if tmpsum > min_n:
                ndiag = sum(ndiags[crm][k] for k in previous)
                val = tmpdec  # backup of tmpdec kept for last ones outside the loop
                try:
                    ratio = val / ndiag
                    for l in previous:
                        nrmdec[crm][l] = ratio
                except ZeroDivisionError:  # all columns at this distance are "bad"
                    pass
                previous = []
                tmpdec = 0
                tmpsum = 0
        # last ones we average with previous result
        if len(previous) == len(ndiags[crm]):
            nrmdec[crm] = {}
        elif tmpsum < min_n:
            ndiag += sum(ndiags[crm][k] for k in previous)
            val += tmpdec
            try:
                ratio = val / ndiag
                for k in previous:
                    nrmdec[crm][k] = ratio
            except ZeroDivisionError:  # all columns at this distance are "bad"
                pass
    return biases, nrmdec, badcol, raw_cisprc, norm_cisprc
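
A hedged sketch, added by the editor (file name, bins, and counts are placeholders): it shows how the decay returned above could turn a bias-corrected count into an observed-over-expected ratio, assuming bins i and j both fall on chromosome crm.

biases, nrmdec, badcol, raw_cis, norm_cis = read_bam(
    'sample.bam', filter_exclude=(1, 2, 3), resolution=100000)
crm, i, j, raw = 'chr1', 120, 160, 8.0
assert i not in badcol and j not in badcol  # filtered bins have NaN biases
nrm = raw / (biases[i] * biases[j])                   # bias-corrected count
expected = nrmdec[crm].get(abs(i - j), float('nan'))  # mean at this distance
print 'observed/expected:', nrm / expected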
Example #16
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception('ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception('ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception('ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print 'WARNING: %d extra chromosomes in FASTA (removing them)' % (len(fas - bam))
            if len(fas - bam) <= 50:
                print '\n'.join([('  - ' + c) for c in (fas - bam)])
        if bam - fas:
            txt = ('\n'.join([('  - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs)==1 else None)
        # resize chromosomes (bins per chromosome from its sequence length)
        for c in refs:
            if c not in mappability:
                mappability[c] = [float('nan')] * (len(genome[c]) / opts.reso + 1)
            if len(mappability[c]) < len(genome[c]) / opts.reso + 1:
                mappability[c] += [float('nan')] * (
                    (len(genome[c]) / opts.reso + 1) - len(mappability[c]))
        # concatenates
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites  = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos-200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

        ## CHECK TO BE REMOVED
        # out = open('tmp_mappability.txt', 'w')
        # i = 0
        # for crm in refs:
        #     for pos in xrange(len(genome[crm]) / opts.reso + 1):
        #         out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso, pos * opts.reso + opts.reso, mappability[i]))
        #         i += 1
        # out.close()
        # compute GC content ~30 sec
        # TODO: read from DB
    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        min_perc=opts.min_perc, max_perc=opts.max_perc, seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path)

    bad_col_image = path.join(outdir, 'filtered_bins_%s_%s.png' % (
        nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords_%s_%s.png' % (
                                    opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only,
            savefig=inter_vs_gcoord)

        print ('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    out = open(bias_file, 'wb')  # binary mode: dump() below uses HIGHEST_PROTOCOL

    dump({'biases'    : biases,
          'decay'     : decay,
          'badcol'    : badcol,
          'resolution': opts.reso}, out, HIGHEST_PROTOCOL)
    out.close()

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image,
                   len(badcol), len(biases), raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:
        # release lock anyway
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
Example #17
def run(opts):
    check_options(opts)
    samtools = which(opts.samtools)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bam1:
        mreads1 = path.realpath(opts.bam1)
        biases1 = opts.biases1
    else:
        biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)
        try:
            biases1 = path.join(opts.workdir1, biases1)
        except AttributeError:
            biases1 = None

    if opts.bam2:
        mreads2 = path.realpath(opts.bam2)
        biases2 = opts.biases2
    else:
        biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)
        try:
            biases2 = path.join(opts.workdir2, biases2)
        except AttributeError:
            biases2 = None

    filter_exclude = opts.filter

    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        printime('  - loading first sample %s' % (mreads1))
        hic_data1 = load_hic_data_from_bam(mreads1, opts.reso, biases=biases1,
                                           tmpdir=path.join(opts.workdir, '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        printime('  - loading second sample %s' % (mreads2))
        hic_data2 = load_hic_data_from_bam(mreads2, opts.reso, biases=biases2,
                                           tmpdir=path.join(opts.workdir, '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)
        decay_corr_dat = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        decay_corr_fig = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        eigen_corr_dat = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        eigen_corr_fig = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash))

        printime('  - comparing experiments')
        printime('    => correlation between equidistant loci')
        corr, _, scc, std, bads = correlate_matrices(
            hic_data1, hic_data2, normalized=opts.norm,
            remove_bad_columns=True, savefig=decay_corr_fig,
            savedata=decay_corr_dat, get_bads=True)
        print '         - correlation score (SCC): %.4f (+- %.7f)' % (scc, std)
        printime('    => correlation between eigenvectors')
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2, normalized=opts.norm,
                                          remove_bad_columns=True, nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)

        printime('    => reproducibility score')
        reprod = get_reproducibility(hic_data1, hic_data2, num_evec=20, normalized=opts.norm,
                                     verbose=False, remove_bad_columns=True)
        print '         - reproducibility score: %.4f' % (reprod)
        ncols = len(hic_data1)
    else:
        ncols = 0
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'

        corr = eig_corr = scc = std = reprod = 0
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s.bam' % (param_hash))

    printime('  - Merging experiments')
    system(samtools  + ' merge -@ %d %s %s %s' % (opts.cpus, outbam, mreads1, mreads2))
    printime('  - Indexing new BAM file')
    # check samtools version number and modify command line
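    # NOTE (editor): 'samtools index' only accepts the -@ (threads) option
    # from release 1.3.1 onwards, hence the version probe below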
    version = LooseVersion([l.split()[1]
                            for l in Popen(samtools, stderr=PIPE).communicate()[1].split('\n')
                            if 'Version' in l][0])
    if version >= LooseVersion('1.3.1'):
        system(samtools  + ' index -@ %d %s' % (opts.cpus, outbam))
    else:
        system(samtools  + ' index %s' % (outbam))

    finish_time = time.localtime()
    save_to_db (opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
                len(bads.keys()), ncols, scc, std, reprod,
                eigen_corr_dat, eigen_corr_fig, outbam, corr, eig_corr,
                biases1, biases2, launch_time, finish_time)
    printime('\nDone.')
Example #18
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception(
                'ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception(
                'ERROR: missing restriction enzyme name for oneD normalization'
            )
        if not opts.mappability:
            raise Exception(
                'ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print 'WARNING: %d extra chromosomes in FASTA (removing them)' % (
                len(fas - bam))
            if len(fas - bam) <= 50:
                print '\n'.join([('  - ' + c) for c in (fas - bam)])
        if bam - fas:
            txt = ('\n'.join([('  - ' + c)
                              for c in (bam -
                                        fas)]) if len(bam - fas) <= 50 else '')
            raise Exception(
                'ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' %
                (len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability,
            opts.reso,
            wanted_chrom=refs[0] if len(refs) == 1 else None)
        # resize chromosomes (bins per chromosome from its sequence length)
        for c in refs:
            if c not in mappability:
                mappability[c] = [float('nan')] * (len(genome[c]) / opts.reso + 1)
            if len(mappability[c]) < len(genome[c]) / opts.reso + 1:
                mappability[c] += [float('nan')] * (
                    (len(genome[c]) / opts.reso + 1) - len(mappability[c]))
        # concatenates
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome,
                                    opts.reso,
                                    chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

        ## CHECK TO BE REMOVED
        # out = open('tmp_mappability.txt', 'w')
        # i = 0
        # for crm in refs:
        #     for pos in xrange(len(genome[crm]) / opts.reso + 1):
        #         out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso, pos * opts.reso + opts.reso, mappability[i]))
        #         i += 1
        # out.close()
        # compute GC content ~30 sec
        # TODO: read from DB
    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads,
        filter_exclude,
        opts.reso,
        min_count=opts.min_count,
        sigma=2,
        factor=1,
        outdir=outdir,
        extra_out=param_hash,
        ncpus=opts.cpus,
        normalization=opts.normalization,
        mappability=mappability,
        p_fit=opts.p_fit,
        cg_content=gc_content,
        n_rsites=n_rsites,
        min_perc=opts.min_perc,
        max_perc=opts.max_perc,
        seed=opts.seed,
        normalize_only=opts.normalize_only,
        max_njobs=opts.max_njobs,
        extra_bads=opts.badcols,
        biases_path=opts.biases_path)

    bad_col_image = path.join(
        outdir, 'filtered_bins_%s_%s.png' %
        (nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords_%s_%s.png' %
        (opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay,
            max_diff=10000,
            resolution=opts.reso,
            normalized=not opts.filter_only,
            savefig=inter_vs_gcoord)

        print('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases
    bias_file = path.join(
        outdir, 'biases_%s_%s.pickle' %
        (nicer(opts.reso).replace(' ', ''), param_hash))
    out = open(bias_file, 'wb')  # binary mode: dump() below uses HIGHEST_PROTOCOL

    dump(
        {
            'biases': biases,
            'decay': decay,
            'badcol': badcol,
            'resolution': opts.reso
        }, out, HIGHEST_PROTOCOL)
    out.close()

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image, len(badcol),
                   len(biases), raw_cisprc, norm_cisprc, inter_vs_gcoord, a2,
                   opts.filter, launch_time, finish_time)
    except:
        # release lock anyway
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
Example #19
def read_bam(inbam,
             filter_exclude,
             resolution,
             min_count=2500,
             normalization='Vanilla',
             mappability=None,
             n_rsites=None,
             cg_content=None,
             sigma=2,
             ncpus=8,
             factor=1,
             outdir='.',
             extra_out='',
             only_valid=False,
             normalize_only=False,
             max_njobs=100,
             min_perc=None,
             max_perc=None,
             extra_bads=None):
    bamfile = AlignmentFile(inbam, 'rb')
    sections = OrderedDict(
        zip(bamfile.references, [x / resolution + 1 for x in bamfile.lengths]))
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm)])

    start_bin = 0
    end_bin = len(bins)
    total = len(bins)

    regs = []
    begs = []
    ends = []
    njobs = min(total, max_njobs) + 1
    nbins = total / njobs + 1
    for i in range(start_bin, end_bin, nbins):
        if i + nbins > end_bin:  # make sure that we stop
            nbins = end_bin - i
        try:
            (crm1, beg1), (crm2, end2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            try:
                (crm1, beg1), (crm2, end2) = bins[i], bins[-1]
            except IndexError:
                break
        if crm1 != crm2:
            end1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(end1 * resolution + resolution)  # last nt included
            ends.append(end2 * resolution + resolution -
                        1)  # last nt not included (overlap with next window)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(end2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included

    # print '\n'.join(['%s %d %d' % (a, b, c) for a, b, c in zip(regs, begs, ends)])
    printime('  - Parsing BAM (%d chunks)' % (len(regs)))
    bins_dict = dict([(j, i) for i, j in enumerate(bins)])
    pool = mu.Pool(ncpus)
    procs = []
    read_bam_frag = read_bam_frag_valid if only_valid else read_bam_frag_filter
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        procs.append(
            pool.apply_async(read_bam_frag,
                             args=(
                                 inbam,
                                 filter_exclude,
                                 bins,
                                 bins_dict,
                                 resolution,
                                 outdir,
                                 extra_out,
                                 region,
                                 start,
                                 end,
                             )))
    pool.close()
    print_progress(procs)
    pool.join()
    ## COLLECT RESULTS
    cisprc = {}
    printime('  - Collecting cis and total interactions per bin (%d chunks)' %
             (len(regs)))
    stdout.write('     ')
    for countbin, (region, start, end) in enumerate(zip(regs, begs, ends)):
        if not countbin % 10 and countbin:
            stdout.write(' ')
        if not countbin % 50 and countbin:
            stdout.write(' %9s\n     ' % ('%s/%s' % (countbin, len(regs))))
        stdout.write('.')
        stdout.flush()

        fname = path.join(
            outdir,
            'tmp_bins_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
        tmp_cisprc = load(open(fname))
        system('rm -f %s' % fname)
        cisprc.update(tmp_cisprc)
    stdout.write('\n')

    printime('  - Removing columns with too few or too many interactions')
    if len(bamfile.references) == 1 and min_count is None:
        raise Exception("ERROR: only one chromosome can't filter by "
                        "cis-percentage, set min_count instead")
    elif min_count is None and len(bamfile.references) > 1:
        badcol = filter_by_cis_percentage(
            cisprc,
            sigma=sigma,
            verbose=True,
            min_perc=min_perc,
            max_perc=max_perc,
            size=total,
            savefig=path.join(
                outdir, 'filtered_bins_%s_%s.png' %
                (nicer(resolution).replace(' ', ''), extra_out)))
    else:
        print('      -> too few interactions defined as less than %9d '
              'interactions' % min_count)
        badcol = {}
        countL = 0
        countZ = 0
        for c in xrange(total):
            if cisprc.get(c, [0, 0])[1] < min_count:
                badcol[c] = cisprc.get(c, [0, 0])[1]
                countL += 1
                if not c in cisprc:
                    countZ += 1
        print '      -> removed %d columns (%d/%d null/high counts) of %d (%.1f%%)' % (
            len(badcol), countZ, countL, total,
            float(len(badcol)) / total * 100)

    # no mappability will result in NaNs, better to filter out these columns
    if mappability:
        badcol.update((i, True) for i, m in enumerate(mappability) if not m)

    # add manually columns to bad columns
    if extra_bads:
        removed_manually = 0
        for ebc in extra_bads:
            c, ebc = ebc.split(':')
            b, e = map(int, ebc.split('-'))
            b = b / resolution + section_pos[c][0]
            e = e / resolution + section_pos[c][0]
            removed_manually += (e - b)
            badcol.update(dict((p, 'manual') for p in xrange(b, e)))
        printime('  - Removed %d columns manually.' % removed_manually)
    raw_cisprc = sum(
        float(cisprc[k][0]) / cisprc[k][1]
        for k in cisprc if not k in badcol) / (len(cisprc) - len(badcol))

    printime('  - Rescaling sum of interactions per bins')
    size = len(bins)
    biases = [
        float('nan') if k in badcol else cisprc.get(k, [0, 1.])[1]
        for k in xrange(size)
    ]

    if normalization == 'Vanilla':
        printime('  - Vanilla normalization')
        mean_col = nanmean(biases)
        biases = dict(
            (k, b / mean_col * mean_col**0.5) for k, b in enumerate(biases))
    elif normalization == 'oneD':
        printime('  - oneD normalization')
        if len(set([len(biases), len(mappability),
                    len(n_rsites), len(cg_content)])) > 1:
            print "biases", "mappability", "n_rsites", "cg_content"
            print len(biases), len(mappability), len(n_rsites), len(cg_content)
            raise Exception('Error: not all arrays have the same size')
        tmp_oneD = path.join(outdir, 'tmp_oneD_%s' % (extra_out))
        mkdir(tmp_oneD)
        biases = oneD(tmp_dir=tmp_oneD,
                      tot=biases,
                      map=mappability,
                      res=n_rsites,
                      cg=cg_content)
        biases = dict((k, b) for k, b in enumerate(biases))
        rmtree(tmp_oneD)
    else:
        raise NotImplementedError('ERROR: method %s not implemented' %
                                  normalization)

    # collect subset-matrices and write genomic one
    # out = open(os.path.join(outdir,
    #                         'hicdata_%s.abc' % (nicer(resolution).replace(' ', ''))), 'w')
    printime('  - Getting sum of normalized bins')
    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        fname = path.join(
            outdir, 'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
        procs.append(pool.apply_async(sum_nrm_matrix, args=(
            fname,
            biases,
        )))
    pool.close()
    print_progress(procs)
    pool.join()

    # to correct biases
    sumnrm = sum(p.get() for p in procs)

    target = (sumnrm / float(size * size * factor))**0.5
    biases = dict([(b, biases[b] * target) for b in biases])

    if not normalize_only:
        printime('  - Computing Cis percentage')
        # Calculate Cis percentage

        pool = mu.Pool(ncpus)
        procs = []
        for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
            fname = path.join(
                outdir,
                'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
            procs.append(
                pool.apply_async(get_cis_perc,
                                 args=(fname, biases, badcol, bins)))
        pool.close()
        print_progress(procs)
        pool.join()

        # collect results
        cis = total = 0
        for proc in procs:
            c, t = proc.get()
            cis += c
            total += t
        norm_cisprc = float(cis) / total
        print '    * Cis-percentage: %.1f%%' % (norm_cisprc * 100)
    else:
        norm_cisprc = 0.

    printime('  - Rescaling decay')
    # normalize decay by size of the diagonal, and by Vanilla correction
    # (all cells should still average ~1)

    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        fname = path.join(
            outdir, 'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
        procs.append(
            pool.apply_async(sum_dec_matrix,
                             args=(fname, biases, badcol, bins)))
    pool.close()
    print_progress(procs)
    pool.join()

    # collect results
    nrmdec = {}
    rawdec = {}
    for proc in procs:
        tmpnrm, tmpraw = proc.get()
        for c, d in tmpnrm.iteritems():
            for k, v in d.iteritems():
                try:
                    nrmdec[c][k] += v
                    rawdec[c][k] += tmpraw[c][k]
                except KeyError:
                    try:
                        nrmdec[c][k] = v
                        rawdec[c][k] = tmpraw[c][k]
                    except KeyError:
                        nrmdec[c] = {k: v}
                        rawdec[c] = {k: tmpraw[c][k]}
    # count the number of cells per diagonal
    # TODO: parallelize
    # find largest chromosome
    len_crms = dict(
        (c, section_pos[c][1] - section_pos[c][0]) for c in section_pos)
    # initialize dictionary
    ndiags = dict(
        (c, dict((k, 0) for k in xrange(len_crms[c]))) for c in sections)
    for crm in section_pos:
        beg_chr, end_chr = section_pos[crm][0], section_pos[crm][1]
        chr_size = end_chr - beg_chr
        thesebads = [b for b in badcol if beg_chr <= b <= end_chr]
        for dist in xrange(1, chr_size):
            ndiags[crm][dist] += chr_size - dist
            # from this we remove bad columns
            # bad columns will only affect if they are at least as distant from
            # a border as the distance between the longest diagonal and the
            # current diagonal.
            bad_diag = set()  # 2 bad rows can point to the same bad cell in diagonal
            maxp = end_chr - dist
            minp = beg_chr + dist
            for b in thesebads:
                if b < maxp:  # not inclusive!!
                    bad_diag.add(b)
                if b >= minp:
                    bad_diag.add(b - dist)
            ndiags[crm][dist] -= len(bad_diag)
        # different behavior for longest diagonal:
        ndiags[crm][0] += chr_size - sum(beg_chr <= b < end_chr
                                         for b in thesebads)

    # normalize sum per diagonal by total number of cells in diagonal
    signal_to_noise = 0.05
    min_n = signal_to_noise**-2.  # equals 400 when default
    for crm in sections:
        if not crm in nrmdec:
            nrmdec[crm] = {}
            rawdec[crm] = {}
        tmpdec = 0  # normalized counts pooled over consecutive diagonals
        tmpsum = 0  # raw counts pooled over consecutive diagonals
        ndiag = 0
        val = 0
        previous = []  # diagonals to be summed in case the minimum is not reached
        for k in ndiags[crm]:
            tmpdec += nrmdec[crm].get(k, 0.)
            tmpsum += rawdec[crm].get(k, 0.)
            previous.append(k)
            if tmpsum > min_n:
                ndiag = sum(ndiags[crm][k] for k in previous)
                val = tmpdec  # backup of tmpdec kept for last ones outside the loop
                try:
                    ratio = val / ndiag
                    for k in previous:
                        nrmdec[crm][k] = ratio
                except ZeroDivisionError:  # all columns at this distance are "bad"
                    pass
                previous = []
                tmpdec = 0
                tmpsum = 0
        # last ones we average with previous result
        if len(previous) == len(ndiags[crm]):
            nrmdec[crm] = {}
        elif tmpsum < min_n:
            ndiag += sum(ndiags[crm][k] for k in previous)
            val += tmpdec
            try:
                ratio = val / ndiag
                for k in previous:
                    nrmdec[crm][k] = ratio
            except ZeroDivisionError:  # all columns at this distance are "bad"
                pass
    return biases, nrmdec, badcol, raw_cisprc, norm_cisprc
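
A hedged call sketch, added by the editor (paths and regions are placeholders): it illustrates the 'chrom:begin-end' strings expected by extra_bads above; each region is converted to bin indices and flagged 'manual' in the returned badcol.

biases, nrmdec, badcol, _, _ = read_bam(
    'sample.bam', (1, 2, 3), 100000, normalization='Vanilla',
    extra_bads=['chr1:0-500000', 'chr2:1000000-1500000'])
print sum(1 for v in badcol.values() if v == 'manual'), 'bins flagged manually'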
Example #20
0
def run(opts):
    check_options(opts)
    param_hash = digest_parameters(opts, extra=['quiet'])
    opts.normalizations = ['norm' if opts.norm else 'raw']
    biases = None

    clean = True  # change for debug
    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and opts.norm:
            raise Exception('ERROR: with an external BAM input and '
                            'normalization enabled, a path to a biases '
                            'file must be provided.')
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases

    coord1 = opts.coord1
    coord2 = opts.coord2
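    # coordinates may be given as 'chrN' (whole chromosome) or
    # 'chrN:start-end' (positions in bases), parsed below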

    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = None
        start1 = None
        end1 = None
        region2 = None
        start2 = None
        end2 = None
    else:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None
        if coord2:
            try:
                crm2, pos2 = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2 = int(start2)
                end2 = int(end2)
            except ValueError:
                region2 = coord2
                start2 = None
                end2 = None
        else:
            region2 = None
            start2 = None
            end2 = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    if not opts.quiet:
        if region1:
            stdout.write('\nExtraction of %s' % (region1))
            if start1:
                stdout.write(':%s-%s' % (start1, end1))
            else:
                stdout.write(' (full chromosome)')
            if region2:
                stdout.write(' intersection with %s' % (region2))
                if start2:
                    stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    stdout.write(' (full chromosome)\n')
            else:
                stdout.write('\n')
        else:
            stdout.write('\nExtraction of %s genome\n' %
                         ('partial' if opts.chr_name else 'full'))

    norm = 'norm' if opts.norm else 'raw'

    if opts.format in ('matrix', 'hic'):
        bamfile = AlignmentFile(mreads, 'rb')
        bam_refs = bamfile.references
        bam_lengths = bamfile.lengths
        if opts.chr_name:
            bam_refs_idx = [
                bam_refs.index(chr_ord) for chr_ord in opts.chr_name
                if chr_ord in bam_refs
            ]
            if not bam_refs_idx:
                raise Exception(
                    'ERROR: none of the chromosomes given in chr_order is '
                    'present in the BAM file. Found: %s\n' % (' '.join(bam_refs)))
            bam_refs = [bam_refs[bam_ref_idx] for bam_ref_idx in bam_refs_idx]
            bam_lengths = [bam_lengths[bam_ref_idx]
                           for bam_ref_idx in bam_refs_idx]
        sections = OrderedDict(zip(bam_refs, bam_lengths))
        printime('Getting %s matrices' % norm)
        matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
            mreads,
            opts.reso,
            load(open(biases, 'rb')) if biases and norm != 'raw' else None,
            normalization=norm,
            filter_exclude=opts.filter,
            region1=region1,
            start1=start1,
            end1=end1,
            region2=region2,
            start2=start2,
            end2=end2,
            tmpdir=tmpdir,
            ncpus=opts.cpus,
            return_headers=True,
            nchunks=opts.nchunks,
            verbose=not opts.quiet,
            clean=clean,
            chr_order=opts.chr_name)

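        # rebase bin coordinates so the extracted sub-matrix starts at (0, 0)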
        b1, e1, b2, e2 = bin_coords
        b1, e1 = 0, e1 - b1
        b2, e2 = 0, e2 - b2

        if opts.format == 'matrix':
            if opts.row_names:
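                # one (chromosome, start, end) label per matrix row,
                # honouring any region/start/end restriction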
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = ((reg, p + 1, p + opts.reso)
                             for r, reg in enumerate(regions) for p in range(
                                 starts[r] if r < len(starts) and starts[r]
                                 else 0, ends[r] if r < len(ends) and ends[r]
                                 else sections[reg], opts.reso))

            printime(' - Writing: %s' % norm)
            out = open(opts.out, 'w')
            for reg in regions:
                out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
            if region2:
                out.write('# BADROWS %s\n' % (','.join([str(b)
                                                        for b in bads1])))
                out.write('# BADCOLS %s\n' % (','.join([str(b)
                                                        for b in bads2])))
            else:
                out.write('# MASKED %s\n' % (','.join([str(b)
                                                       for b in bads1])))
            if opts.row_names:
                out.write('\n'.join('%s\t%d\t%d\t' %
                                    (next(row_names)) + '\t'.join(
                                        str(matrix.get((i, j), 0))
                                        for i in range(b1, e1))
                                    for j in range(b2, e2)) + '\n')
            else:
                out.write('\n'.join('\t'.join(
                    str(matrix.get((i, j), 0)) for i in range(b1, e1))
                                    for j in range(b2, e2)) + '\n')
            out.close()
        else:
            printime(' - Writing: %s' % norm)
            tmp_chromsize = path.join(tmpdir,
                                      'hic_%s.chrom.sizes' % param_hash)
            out = open(tmp_chromsize, 'w')
            for reg in regions:
                out.write('%s\t%d\n' % (reg, sections[reg]))
            out.close()
            tmpfl = path.join(tmpdir, 'hic_export_%s.tsv' % param_hash)
            out = open(tmpfl, 'w')
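            # each exported row is expected to follow juicer_tools "pre"
            # short-with-score format:
            #   <str1> <chr1> <pos1> <frag1> <str2> <chr2> <pos2> <frag2> <score>
            # strand and fragment fields are dummy values here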
            out_ln = ('0\t%s\t%d\t0\t1\t%s\t%d\t1\t%f' if opts.norm
                      else '0\t%s\t%d\t0\t1\t%s\t%d\t1\t%d')
            if region1:
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = [
                    (reg, pos + 1) for r, reg in enumerate(regions)
                    for pos in range(
                        starts[r] if r < len(starts) and starts[r] else 0,
                        ends[r] if r < len(ends) and ends[r] else
                        sections[reg], opts.reso)
                ]
                out.write('\n'.join(
                    out_ln %
                    (row_names[i][0], row_names[i][1], row_names[j][0],
                     row_names[j][1], matrix.get((i, j), 0))
                    for i in range(b1, e1) for j in range(i, e2)))
            else:
                totals = OrderedDict()
                total_num = 0
                for c in sections:
                    totals[c] = (total_num,
                                 total_num + sections[c] // opts.reso + 1)
                    total_num += sections[c] // opts.reso + 1

                for crm1_id, crm1 in enumerate(sections):
                    b1, e1 = totals[crm1]
                    row_names1 = dict((b1 + ipos, pos + 1)
                                      for ipos, pos in enumerate(
                                          range(0, sections[crm1], opts.reso)))
                    for crm2 in list(sections.keys())[crm1_id:]:
                        b2, e2 = totals[crm2]
                        row_names2 = dict(
                            (b2 + ipos, pos + 1) for ipos, pos in enumerate(
                                range(0, sections[crm2], opts.reso)))
                        out.write('\n'.join(
                            out_ln % (crm1, row_names1[i], crm2, row_names2[j],
                                      matrix.get((i, j), 0))
                            for i in range(b1, e1)
                            for j in range(max(b2, i), e2)))

            out.close()
            do_norm = '-n' if opts.norm else ''
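            # delegate .hic creation to juicer_tools "pre"; '-n' tells juicer
            # to skip its own normalization, since values were normalized upstream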
            _ = Popen('java -Xmx32g -jar %s pre -j %d %s %s %s %s' %
                      (opts.juicerjar, opts.cpus, do_norm, tmpfl, opts.out,
                       tmp_chromsize),
                      shell=True,
                      universal_newlines=True).communicate()
    elif opts.format == 'text':
        printime('Getting and writing matrix to text format')
        fnames = write_matrix(mreads,
                              opts.reso,
                              load(open(biases, 'rb')) if biases else None,
                              outdir,
                              filter_exclude=opts.filter,
                              normalizations=[norm],
                              region1=region1,
                              start1=start1,
                              end1=end1,
                              region2=region2,
                              start2=start2,
                              end2=end2,
                              tmpdir=tmpdir,
                              append_to_tar=None,
                              ncpus=opts.cpus,
                              nchunks=opts.nchunks,
                              verbose=not opts.quiet,
                              extra=param_hash,
                              cooler=False,
                              clean=clean,
                              chr_order=opts.chr_name)
        rename(list(fnames.values())[0], opts.out)
    elif opts.format == 'cooler':
        printime('Getting and writing matrix to cooler format')
        fnames = write_matrix(mreads,
                              opts.reso,
                              load(open(biases, 'rb')) if biases else None,
                              outdir,
                              filter_exclude=opts.filter,
                              normalizations=[norm],
                              region1=region1,
                              start1=start1,
                              end1=end1,
                              region2=region2,
                              start2=start2,
                              end2=end2,
                              tmpdir=tmpdir,
                              append_to_tar=None,
                              ncpus=opts.cpus,
                              nchunks=opts.nchunks,
                              verbose=not opts.quiet,
                              extra=param_hash,
                              cooler=True,
                              clean=clean,
                              chr_order=opts.chr_name)
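        # append coarser raw zoom levels to the same cooler file, skipping
        # bin sizes finer than the base resolution or larger than the
        # requested region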
        for zoom_c in ZOOMS_COOLER:
            if opts.reso >= zoom_c:
                continue
            if start1 is not None and end1:
                if end1 - start1 < zoom_c:
                    continue
            if start2 is not None and end2:
                if end2 - start2 < zoom_c:
                    continue
            printime('Building cooler zoom %d' % zoom_c)
            _ = write_matrix(mreads,
                             zoom_c,
                             None,
                             outdir,
                             filter_exclude=opts.filter,
                             normalizations=['raw'],
                             region1=region1,
                             start1=start1,
                             end1=end1,
                             region2=region2,
                             start2=start2,
                             end2=end2,
                             tmpdir=tmpdir,
                             append_to_tar=None,
                             ncpus=opts.cpus,
                             nchunks=opts.nchunks,
                             verbose=not opts.quiet,
                             extra=param_hash,
                             cooler=True,
                             cooler_name=fnames['NRM' if opts.norm else 'RAW'],
                             clean=clean,
                             chr_order=opts.chr_name)
        rename(fnames['NRM' if opts.norm else 'RAW'], opts.out)
        if 'NRM' in fnames and not opts.norm:
            remove(fnames['NRM'])
        if 'RAW' in fnames and opts.norm:
            remove(fnames['RAW'])

    if clean:
        printime('Cleaning')
        system('rm -rf %s' % tmpdir)
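For reference, a hypothetical way to drive run() directly is with an argparse-style namespace whose attribute names mirror the opts accesses in the snippet. All values below are purely illustrative, and check_options / load_parameters_fromdb may impose further constraints not visible here:

from argparse import Namespace

opts = Namespace(
    bam='/data/sample.bam',               # external BAM: biases required if norm
    biases='/data/biases_100kb.pickle',
    norm=True, reso=100000,
    coord1='chr3:1000000-2000000', coord2=None,
    workdir='/data/workdir',
    format='cooler', out='/data/sample_100kb.cool',
    filter=(1, 2, 3, 4, 6, 7, 8, 9, 10),
    cpus=8, nchunks=100, quiet=False,
    chr_name=None, row_names=False, juicerjar=None)
run(opts)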