示例#1
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    if opts.zrange:
        vmin = float(opts.zrange.split(',')[0])
        vmax = float(opts.zrange.split(',')[1])
    else:
        vmin = vmax = None

    if opts.figsize:
        opts.figsize = map(float, opts.figsize.split(','))
    else:
        vmin = vmax = None

    clean = True  # change for debug

    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v !='raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input, should provide path to'
                            ' biases file.')
        biases = opts.biases
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases

    coord1         = opts.coord1
    coord2         = opts.coord2

    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = None
        start1  = None
        end1    = None
        region2 = None
        start2  = None
        end2    = None
    else:
        try:
            crm1, pos1   = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1  = int(start1)
            end1    = int(end1)
        except ValueError:
            region1 = coord1
            start1  = None
            end1    = None
        if coord2:
            try:
                crm2, pos2   = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2  = int(start2)
                end2    = int(end2)
            except ValueError:
                region2 = coord2
                start2  = None
                end2    = None
        else:
            region2 = None
            start2  = None
            end2    = None

    if opts.plot and not opts.force_plot:
        if opts.interactive:
            max_size = 1500**2
        else:
            max_size = 5000**2
    else:
        max_size = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    if region1:
        if region1:
            if not opts.quiet:
                stdout.write('\nExtraction of %s' % (region1))
            if start1:
                if not opts.quiet:
                    stdout.write(':%s-%s' % (start1, end1))
            else:
                if not opts.quiet:
                    stdout.write(' (full chromosome)')
            if region2:
                if not opts.quiet:
                    stdout.write(' intersection with %s' % (region2))
                if start2:
                    if not opts.quiet:
                        stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    if not opts.quiet:
                        stdout.write(' (full chromosome)\n')
            else:
                if not opts.quiet:
                    stdout.write('\n')
    else:
        if not opts.quiet:
            stdout.write('\nExtraction of full genome\n')

    out_files = {}
    out_plots = {}

    if opts.matrix or opts.plot:
        bamfile = AlignmentFile(mreads, 'rb')
        sections = OrderedDict(zip(bamfile.references,
                                   [x for x in bamfile.lengths]))
        total = 0
        section_pos = OrderedDict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else 'NRM'
                           if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads, opts.reso,
                    load(open(biases)) if biases and norm != 'raw' else None,
                    normalization=norm,
                    region1=region1, start1=start1, end1=end1,
                    region2=region2, start2=start2, end2=end2,
                    tmpdir=tmpdir, ncpus=opts.cpus,
                    return_headers=True,
                    nchunks=opts.nchunks, verbose=not opts.quiet,
                    clean=clean, max_size=max_size)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemented '
                         'for matrices\n... skipping\n')
                    continue
                raise
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = ((reg, p + 1 , p + opts.reso) for r, reg in enumerate(regions)
                             for p in range(starts[r] if r < len(starts) and starts[r] else 0,
                                            ends[r] if r < len(ends) and ends[r] else sections[reg],
                                            opts.reso))
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name,
                                           nicer(opts.reso, sep=''),
                                           ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                out = open(path.join(outdir, fnam), 'w')
                for reg in regions:
                    out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                if region2:
                    out.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1])))
                    out.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2])))
                else:
                    out.write('# MASKED %s\n' % (','.join([str(b) for b in bads1])))
                if opts.row_names:
                    out.write('\n'.join('%s\t%d\t%d\t' % (row_names.next()) +
                                        '\t'.join(str(matrix.get((i, j), 0))
                                                  for i in xrange(b1, e1))
                                        for j in xrange(b2, e2)) + '\n')
                else:
                    out.write('\n'.join('\t'.join(str(matrix.get((i, j), 0))
                                                  for i in xrange(b1, e1))
                                        for j in xrange(b2, e2)) + '\n')
                out.close()
            if opts.plot:
                # transform matrix
                matrix = array([array([matrix.get((i, j), 0)
                                       for i in xrange(b1, e1)])
                                for j in xrange(b2, e2)])
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:,bad1] = 1
                    for bad2 in bads2:
                        m[bad2,:] = 1
                matrix = ma.masked_array(matrix, m)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s%s.%s' % (
                    norm, name, nicer(opts.reso, sep=''),
                    ('_' + param_hash), '_tri' if opts.triangular else '',
                    opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                pltbeg1 = 0 if start1 is None else start1
                pltend1 = sections[regions[0]] if end1 is None else end1
                pltbeg2 = 0 if start2 is None else start2
                pltend2 = sections[regions[-1]] if end2 is None else end2
                xlabel = '{}:{:,}-{:,}'.format(
                    regions[0], pltbeg1 if pltbeg1 else 1, pltend1)
                ylabel = '{}:{:,}-{:,}'.format(
                    regions[-1], pltbeg2 if pltbeg2 else 1, pltend2)
                section_pos = OrderedDict((k, section_pos[k]) for k in section_pos
                                   if k in regions)
                ax1, _ = plot_HiC_matrix(
                    matrix, triangular=opts.triangular,
                    vmin=vmin, vmax=vmax, cmap=opts.cmap,
                    figsize=opts.figsize,
                    bad_color=opts.bad_color if norm != 'raw' else None)
                ax1.set_title('Region: %s, normalization: %s, resolution: %s' % (
                    name, norm, nicer(opts.reso)), y=1.05)
                _format_axes(ax1, start1, end1, start2, end2, opts.reso,
                             regions, section_pos, sections,
                             opts.xtick_rotation, triangular=False)
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))
    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        out_files.update(write_matrix(
            mreads, opts.reso,
            load(open(biases)) if biases else None,
            outdir, filter_exclude=opts.filter,
            normalizations=opts.normalizations,
            region1=region1, start1=start1, end1=end1,
            region2=region2, start2=start2, end2=end2,
            tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus,
            nchunks=opts.nchunks, verbose=not opts.quiet,
            extra=param_hash, clean=clean))

    if clean:
        printime('Cleaning')
        system('rm -rf %s '% tmpdir)

    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)
示例#2
0
def main():
    opts = get_options()
    inbam = opts.inbam
    resolution = opts.reso
    filter_exclude = opts.filter
    ncpus = opts.cpus
    if opts.biases:
        biases = load(open(opts.biases))
    else:
        biases = {}
    outdir = opts.outdir
    tmpdir = opts.tmpdir
    coord1 = opts.coord1
    coord2 = opts.coord2

    if biases and biases['resolution'] != resolution:
        raise Exception(
            'ERROR: different resolution in bias file (you want %d,'
            ' there is %d).\n' % (resolution, biases['resolution']))
    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = None
        start1 = None
        end1 = None
        region2 = None
        start2 = None
        end2 = None
    else:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None
        if coord2:
            try:
                crm2, pos2 = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2 = int(start2)
                end2 = int(end2)
            except ValueError:
                region2 = coord2
                start2 = None
                end2 = None
        else:
            region2 = None
            start2 = None
            end2 = None

    mkdir(outdir)
    mkdir(tmpdir)
    if region1:
        if region1:
            if not opts.quiet:
                stdout.write('\nExtraction of %s' % (region1))
            if start1:
                if not opts.quiet:
                    stdout.write(':%s-%s' % (start1, end1))
            else:
                if not opts.quiet:
                    stdout.write(' (full chromosome)')
            if region2:
                if not opts.quiet:
                    stdout.write(' intersection with %s' % (region2))
                if start2:
                    if not opts.quiet:
                        stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    if not opts.quiet:
                        stdout.write(' (full chromosome)\n')
            else:
                if not opts.quiet:
                    stdout.write('\n')
    else:
        if not opts.quiet:
            stdout.write('\nExtraction of full genome\n')

    write_matrix(inbam,
                 resolution,
                 biases,
                 outdir,
                 filter_exclude=filter_exclude,
                 normalizations=opts.matrices,
                 region1=region1,
                 start1=start1,
                 end1=end1,
                 region2=region2,
                 start2=start2,
                 end2=end2,
                 nchunks=opts.nchunks,
                 append_to_tar=opts.tarfile,
                 ncpus=ncpus,
                 tmpdir=tmpdir,
                 verbose=not opts.quiet)
示例#3
0
def main():
    opts          = get_options()
    inbam          = opts.inbam
    resolution     = opts.reso
    filter_exclude = opts.filter
    ncpus          = opts.cpus
    if opts.biases:
        biases     = load(open(opts.biases))
    else:
        biases     = {}
    outdir         = opts.outdir
    tmpdir         = opts.tmpdir
    coord1         = opts.coord1
    coord2         = opts.coord2

    if biases and biases['resolution'] != resolution:
        raise Exception('ERROR: different resolution in bias file (you want %d,'
                        ' there is %d).\n' % (resolution, biases['resolution']))
    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = None
        start1  = None
        end1    = None
        region2 = None
        start2  = None
        end2    = None
    else:
        try:
            crm1, pos1   = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1  = int(start1)
            end1    = int(end1)
        except ValueError:
            region1 = coord1
            start1  = None
            end1    = None
        if coord2:
            try:
                crm2, pos2   = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2  = int(start2)
                end2    = int(end2)
            except ValueError:
                region2 = coord2
                start2  = None
                end2    = None
        else:
            region2 = None
            start2  = None
            end2    = None

    mkdir(outdir)
    mkdir(tmpdir)
    if region1:
        if region1:
            if not opts.quiet:
                stdout.write('\nExtraction of %s' % (region1))
            if start1:
                if not opts.quiet:
                    stdout.write(':%s-%s' % (start1, end1))
            else:
                if not opts.quiet:
                    stdout.write(' (full chromosome)')
            if region2:
                if not opts.quiet:
                    stdout.write(' intersection with %s' % (region2))
                if start2:
                    if not opts.quiet:
                        stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    if not opts.quiet:
                        stdout.write(' (full chromosome)\n')
            else:
                if not opts.quiet:
                    stdout.write('\n')
    else:
        if not opts.quiet:
            stdout.write('\nExtraction of full genome\n')

    write_matrix(inbam, resolution, biases, outdir,
                 filter_exclude=filter_exclude,
                 normalizations=opts.matrices,
                 region1=region1, start1=start1, end1=end1,
                 region2=region2, start2=start2, end2=end2,
                 nchunks=opts.nchunks, append_to_tar=opts.tarfile,
                 ncpus=ncpus, tmpdir=tmpdir, verbose=not opts.quiet)
示例#4
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    if opts.zrange:
        vmin = float(opts.zrange.split(',')[0])
        vmax = float(opts.zrange.split(',')[1])
    else:
        vmin = vmax = None

    clean = True  # change for debug

    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v != 'raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input, should provide path to'
                            ' biases file.')
        biases = opts.biases
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases

    coord1 = opts.coord1
    coord2 = opts.coord2

    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = None
        start1 = None
        end1 = None
        region2 = None
        start2 = None
        end2 = None
    else:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None
        if coord2:
            try:
                crm2, pos2 = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2 = int(start2)
                end2 = int(end2)
            except ValueError:
                region2 = coord2
                start2 = None
                end2 = None
        else:
            region2 = None
            start2 = None
            end2 = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    if region1:
        if region1:
            if not opts.quiet:
                stdout.write('\nExtraction of %s' % (region1))
            if start1:
                if not opts.quiet:
                    stdout.write(':%s-%s' % (start1, end1))
            else:
                if not opts.quiet:
                    stdout.write(' (full chromosome)')
            if region2:
                if not opts.quiet:
                    stdout.write(' intersection with %s' % (region2))
                if start2:
                    if not opts.quiet:
                        stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    if not opts.quiet:
                        stdout.write(' (full chromosome)\n')
            else:
                if not opts.quiet:
                    stdout.write('\n')
    else:
        if not opts.quiet:
            stdout.write('\nExtraction of full genome\n')

    out_files = {}
    out_plots = {}

    if opts.matrix or opts.plot:
        bamfile = AlignmentFile(mreads, 'rb')
        sections = OrderedDict(
            zip(bamfile.references, [x for x in bamfile.lengths]))
        total = 0
        section_pos = dict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else
                           'NRM' if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads,
                    opts.reso,
                    load(open(biases)) if biases and norm != 'raw' else None,
                    normalization=norm,
                    region1=region1,
                    start1=start1,
                    end1=end1,
                    region2=region2,
                    start2=start2,
                    end2=end2,
                    tmpdir=tmpdir,
                    ncpus=opts.cpus,
                    return_headers=True,
                    nchunks=opts.nchunks,
                    verbose=not opts.quiet,
                    clean=clean)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemeted for '
                         'matrices\n... skipping\n')
                    continue
                raise
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = ((reg, p + 1, p + opts.reso)
                             for r, reg in enumerate(regions) for p in range(
                                 starts[r] if r < len(starts) and starts[r]
                                 else 0, ends[r] if r < len(ends) and ends[r]
                                 else sections[reg], opts.reso))
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name, nicer(
                    opts.reso).replace(' ', ''), ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                out = open(path.join(outdir, fnam), 'w')
                for reg in regions:
                    out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                if region2:
                    out.write('# BADROWS %s\n' %
                              (','.join([str(b) for b in bads1])))
                    out.write('# BADCOLS %s\n' %
                              (','.join([str(b) for b in bads2])))
                else:
                    out.write('# MASKED %s\n' %
                              (','.join([str(b) for b in bads1])))
                if opts.row_names:
                    out.write('\n'.join('%s\t%d\t%d\t' %
                                        (row_names.next()) + '\t'.join(
                                            str(matrix.get((i, j), 0))
                                            for i in xrange(b1, e1))
                                        for j in xrange(b2, e2)) + '\n')
                else:
                    out.write('\n'.join('\t'.join(
                        str(matrix.get((i, j), 0)) for i in xrange(b1, e1))
                                        for j in xrange(b2, e2)) + '\n')
                out.close()
            if opts.plot:
                cmap = plt.get_cmap(opts.cmap)
                if norm != 'raw':
                    cmap.set_bad('grey', 1.)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s.%s' % (norm, name, nicer(opts.reso).replace(
                    ' ', ''), ('_' + param_hash), opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                if opts.interactive:
                    _ = plt.figure(figsize=(8, 7))
                else:
                    _ = plt.figure(figsize=(16, 14))
                # ax1 = plt.subplot(111)
                ax1 = plt.axes([0.1, 0.1, 0.7, 0.8])
                ax2 = plt.axes([0.82, 0.1, 0.07, 0.8])
                matrix = array([
                    array([matrix.get((i, j), 0) for i in xrange(b1, e1)])
                    for j in xrange(b2, e2)
                ])
                mini = np_min(matrix[nonzero(matrix)]) / 2.
                matrix[matrix == 0] = mini
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:, bad1] = 1
                    for bad2 in bads2:
                        m[bad2, :] = 1
                matrix = log2(ma.masked_array(matrix, m))
                ax1.imshow(matrix,
                           interpolation='None',
                           origin='lower',
                           cmap=cmap,
                           vmin=vmin,
                           vmax=vmax)

                if len(regions) <= 2:
                    pltbeg1 = 0 if start1 is None else start1
                    pltend1 = sections[regions[0]] if end1 is None else end1
                    pltbeg2 = pltbeg1 if len(
                        regions) == 1 else 0 if start2 is None else start2
                    pltend2 = pltend1 if len(regions) == 1 else sections[
                        regions[-1]] if end2 is None else end2

                    ax1.set_xlabel('{}:{:,}-{:,}'.format(
                        regions[0], pltbeg1 if pltbeg1 else 1, pltend1))
                    ax1.set_ylabel('{}:{:,}-{:,}'.format(
                        regions[-1], pltbeg2 if pltbeg2 else 1, pltend2))

                    def format_xticks(tickstring, _=None):
                        tickstring = int(tickstring * opts.reso + pltbeg1)
                        return nicer(tickstring if tickstring else 1,
                                     coma=True)

                    def format_yticks(tickstring, _=None):
                        tickstring = int(tickstring * opts.reso + pltbeg2)
                        return nicer(tickstring if tickstring else 1,
                                     coma=True)

                    ax1.xaxis.set_major_formatter(FuncFormatter(format_xticks))
                    ax1.yaxis.set_major_formatter(FuncFormatter(format_yticks))

                    labels = ax1.get_xticklabels()
                    plt.setp(labels, rotation=-25, ha='left')

                    ax1.set_xlim(-0.5, len(matrix[0]) - 0.5)
                    ax1.set_ylim(-0.5, len(matrix) - 0.5)
                else:
                    vals = [0]
                    keys = ['']
                    for crm in regions:
                        vals.append(section_pos[crm][0] / opts.reso)
                        keys.append(crm)
                    vals.append(section_pos[crm][1] / opts.reso)
                    ax1.set_yticks(vals)
                    ax1.set_yticklabels('')
                    ax1.set_yticks([
                        float(vals[i] + vals[i + 1]) / 2
                        for i in xrange(len(vals) - 1)
                    ],
                                   minor=True)
                    ax1.set_yticklabels(keys, minor=True)
                    for t in ax1.yaxis.get_minor_ticks():
                        t.tick1On = False
                        t.tick2On = False

                    ax1.set_xticks(vals)
                    ax1.set_xticklabels('')
                    ax1.set_xticks([
                        float(vals[i] + vals[i + 1]) / 2
                        for i in xrange(len(vals) - 1)
                    ],
                                   minor=True)
                    ax1.set_xticklabels(keys, minor=True)
                    for t in ax1.xaxis.get_minor_ticks():
                        t.tick1On = False
                        t.tick2On = False
                    ax1.set_xlabel('Chromosomes')
                    ax1.set_ylabel('Chromosomes')
                    ax1.set_xlim(-0.5, len(matrix[0]) - 0.5)
                    ax1.set_ylim(-0.5, len(matrix) - 0.5)
                data = [i for d in matrix for i in d if isfinite(i)]
                mindata = nanmin(data)
                maxdata = nanmax(data)
                gradient = linspace(maxdata, mindata,
                                    max((len(matrix), len(matrix[0]))))
                gradient = dstack((gradient, gradient))[0]
                h = ax2.hist(data,
                             color='darkgrey',
                             linewidth=2,
                             orientation='horizontal',
                             bins=50,
                             histtype='step',
                             normed=True)
                _ = ax2.imshow(gradient,
                               aspect='auto',
                               cmap=cmap,
                               extent=(0, max(h[0]), mindata, maxdata))
                ax2.yaxis.tick_right()
                ax2.yaxis.set_label_position("right")
                ax2.set_xticks([])
                ax1.set_title('Region: %s, normalization: %s, resolution: %s' %
                              (name, norm, nicer(opts.reso)))
                ax2.set_ylabel('Hi-C Log2 interactions', rotation=-90)
                ax2.set_xlabel('Count')
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))
    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        out_files.update(
            write_matrix(mreads,
                         opts.reso,
                         load(open(biases)) if biases else None,
                         outdir,
                         filter_exclude=opts.filter,
                         normalizations=opts.normalizations,
                         region1=region1,
                         start1=start1,
                         end1=end1,
                         region2=region2,
                         start2=start2,
                         end2=end2,
                         tmpdir=tmpdir,
                         append_to_tar=None,
                         ncpus=opts.cpus,
                         nchunks=opts.nchunks,
                         verbose=not opts.quiet,
                         extra=param_hash,
                         clean=clean))

    if clean:
        printime('Cleaning')
        system('rm -rf %s ' % tmpdir)

    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)
示例#5
0
def run(opts):
    check_options(opts)
    param_hash = digest_parameters(opts, extra=['quiet'])
    opts.normalizations = ['norm' if opts.norm else 'raw']
    biases = None

    clean = True  # change for debug
    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and opts.norm:
            raise Exception('ERROR: external BAM input, should provide path to'
                            ' biases file.')
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases

    coord1 = opts.coord1
    coord2 = opts.coord2

    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = None
        start1 = None
        end1 = None
        region2 = None
        start2 = None
        end2 = None
    else:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None
        if coord2:
            try:
                crm2, pos2 = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2 = int(start2)
                end2 = int(end2)
            except ValueError:
                region2 = coord2
                start2 = None
                end2 = None
        else:
            region2 = None
            start2 = None
            end2 = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    if region1:
        if region1:
            if not opts.quiet:
                stdout.write('\nExtraction of %s' % (region1))
            if start1:
                if not opts.quiet:
                    stdout.write(':%s-%s' % (start1, end1))
            else:
                if not opts.quiet:
                    stdout.write(' (full chromosome)')
            if region2:
                if not opts.quiet:
                    stdout.write(' intersection with %s' % (region2))
                if start2:
                    if not opts.quiet:
                        stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    if not opts.quiet:
                        stdout.write(' (full chromosome)\n')
            else:
                if not opts.quiet:
                    stdout.write('\n')
    else:
        if not opts.quiet:
            stdout.write('\nExtraction of %s genome\n' %
                         ('partial' if opts.chr_name else 'full'))

    norm = 'norm' if opts.norm else 'raw'

    if opts.format == 'matrix' or opts.format == 'hic':
        bamfile = AlignmentFile(mreads, 'rb')
        bam_refs = bamfile.references
        bam_lengths = bamfile.lengths
        if opts.chr_name:
            bam_refs_idx = [
                bam_refs.index(chr_ord) for chr_ord in opts.chr_name
                if chr_ord in bam_refs
            ]
            if not bam_refs_idx:
                raise Exception(
                    '''ERROR: Wrong number of chromosomes in chr_order.
                    Found %s in bam file \n''' % (' '.join(bam_refs)))
            bam_refs = [
                bam_ref for bam_ref in
                [bam_refs[bam_ref_idx] for bam_ref_idx in bam_refs_idx]
            ]
            bam_lengths = [
                bam_len for bam_len in
                [bam_lengths[bam_ref_idx] for bam_ref_idx in bam_refs_idx]
            ]
        sections = OrderedDict(list(zip(bam_refs, [x for x in bam_lengths])))
        printime('Getting %s matrices' % norm)
        matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
            mreads,
            opts.reso,
            load(open(biases, 'rb')) if biases and norm != 'raw' else None,
            normalization=norm,
            filter_exclude=opts.filter,
            region1=region1,
            start1=start1,
            end1=end1,
            region2=region2,
            start2=start2,
            end2=end2,
            tmpdir=tmpdir,
            ncpus=opts.cpus,
            return_headers=True,
            nchunks=opts.nchunks,
            verbose=not opts.quiet,
            clean=clean,
            chr_order=opts.chr_name)

        b1, e1, b2, e2 = bin_coords
        b1, e1 = 0, e1 - b1
        b2, e2 = 0, e2 - b2

        if opts.format == 'matrix':
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = ((reg, p + 1, p + opts.reso)
                             for r, reg in enumerate(regions) for p in range(
                                 starts[r] if r < len(starts) and starts[r]
                                 else 0, ends[r] if r < len(ends) and ends[r]
                                 else sections[reg], opts.reso))

            printime(' - Writing: %s' % norm)
            out = open(opts.out, 'w')
            for reg in regions:
                out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
            if region2:
                out.write('# BADROWS %s\n' % (','.join([str(b)
                                                        for b in bads1])))
                out.write('# BADCOLS %s\n' % (','.join([str(b)
                                                        for b in bads2])))
            else:
                out.write('# MASKED %s\n' % (','.join([str(b)
                                                       for b in bads1])))
            if opts.row_names:
                out.write('\n'.join('%s\t%d\t%d\t' %
                                    (next(row_names)) + '\t'.join(
                                        str(matrix.get((i, j), 0))
                                        for i in range(b1, e1))
                                    for j in range(b2, e2)) + '\n')
            else:
                out.write('\n'.join('\t'.join(
                    str(matrix.get((i, j), 0)) for i in range(b1, e1))
                                    for j in range(b2, e2)) + '\n')
            out.close()
        else:
            printime(' - Writing: %s' % norm)
            tmp_chromsize = path.join(tmpdir,
                                      'hic_%s.chrom.sizes' % param_hash)
            out = open(tmp_chromsize, 'w')
            for reg in regions:
                out.write('%s\t%d\n' % (reg, sections[reg]))
            out.close()
            tmpfl = path.join(tmpdir, 'hic_export_%s.tsv' % param_hash)
            out = open(tmpfl, 'w')
            out_ln = '0\t%s\t%d\t0\t1\t%s\t%d\t1\t1%f' if opts.norm else '0\t%s\t%d\t0\t1\t%s\t%d\t1\t1%d'
            if region1:
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = [
                    (reg, pos + 1) for r, reg in enumerate(regions)
                    for pos in range(
                        starts[r] if r < len(starts) and starts[r] else 0,
                        ends[r] if r < len(ends) and ends[r] else
                        sections[reg], opts.reso)
                ]
                out.write('\n'.join(
                    out_ln %
                    (row_names[i][0], row_names[i][1], row_names[j][0],
                     row_names[j][1], matrix.get((i, j), 0))
                    for i in range(b1, e1) for j in range(i, e2)))
            else:
                totals = OrderedDict()
                total_num = 0
                for c in sections:
                    totals[c] = (total_num,
                                 total_num + sections[c] // opts.reso + 1)
                    total_num += sections[c] // opts.reso + 1

                for crm1_id, crm1 in enumerate(sections):
                    b1, e1 = totals[crm1]
                    row_names1 = dict((b1 + ipos, pos + 1)
                                      for ipos, pos in enumerate(
                                          range(0, sections[crm1], opts.reso)))
                    for crm2 in list(sections.keys())[crm1_id:]:
                        b2, e2 = totals[crm2]
                        row_names2 = dict(
                            (b2 + ipos, pos + 1) for ipos, pos in enumerate(
                                range(0, sections[crm2], opts.reso)))
                        out.write('\n'.join(
                            out_ln % (crm1, row_names1[i], crm2, row_names2[j],
                                      matrix.get((i, j), 0))
                            for i in range(b1, e1)
                            for j in range(max(b2, i), e2)))

            out.close()
            do_norm = '-n' if opts.norm else ''
            _ = Popen('java -Xmx32g -jar %s pre -j %d %s %s %s %s' %
                      (opts.juicerjar, opts.cpus, do_norm, tmpfl, opts.out,
                       tmp_chromsize),
                      shell=True,
                      universal_newlines=True).communicate()
    elif opts.format == 'text':
        printime('Getting and writing matrix to text format')
        fnames = write_matrix(mreads,
                              opts.reso,
                              load(open(biases, 'rb')) if biases else None,
                              outdir,
                              filter_exclude=opts.filter,
                              normalizations=[norm],
                              region1=region1,
                              start1=start1,
                              end1=end1,
                              region2=region2,
                              start2=start2,
                              end2=end2,
                              tmpdir=tmpdir,
                              append_to_tar=None,
                              ncpus=opts.cpus,
                              nchunks=opts.nchunks,
                              verbose=not opts.quiet,
                              extra=param_hash,
                              cooler=False,
                              clean=clean,
                              chr_order=opts.chr_name)
        rename(list(fnames.values())[0], opts.out)
    elif opts.format == 'cooler':
        printime('Getting and writing matrix to cooler format')
        fnames = write_matrix(mreads,
                              opts.reso,
                              load(open(biases, 'rb')) if biases else None,
                              outdir,
                              filter_exclude=opts.filter,
                              normalizations=[norm],
                              region1=region1,
                              start1=start1,
                              end1=end1,
                              region2=region2,
                              start2=start2,
                              end2=end2,
                              tmpdir=tmpdir,
                              append_to_tar=None,
                              ncpus=opts.cpus,
                              nchunks=opts.nchunks,
                              verbose=not opts.quiet,
                              extra=param_hash,
                              cooler=True,
                              clean=clean,
                              chr_order=opts.chr_name)
        for zoom_c in ZOOMS_COOLER:
            if opts.reso >= zoom_c:
                continue
            if start1 is not None and end1:
                if end1 - start1 < zoom_c:
                    continue
            if start2 is not None and end2:
                if end2 - start2 < zoom_c:
                    continue
            printime('Building cooler zoom %d' % zoom_c)
            _ = write_matrix(mreads,
                             zoom_c,
                             None,
                             outdir,
                             filter_exclude=opts.filter,
                             normalizations=['raw'],
                             region1=region1,
                             start1=start1,
                             end1=end1,
                             region2=region2,
                             start2=start2,
                             end2=end2,
                             tmpdir=tmpdir,
                             append_to_tar=None,
                             ncpus=opts.cpus,
                             nchunks=opts.nchunks,
                             verbose=not opts.quiet,
                             extra=param_hash,
                             cooler=True,
                             cooler_name=fnames['NRM' if opts.norm else 'RAW'],
                             clean=clean,
                             chr_order=opts.chr_name)
        rename(fnames['NRM' if opts.norm else 'RAW'], opts.out)
        if 'NRM' in fnames and not opts.norm:
            remove(fnames['NRM'])
        if 'RAW' in fnames and opts.norm:
            remove(fnames['RAW'])

    if clean:
        printime('Cleaning')
        system('rm -rf %s ' % tmpdir)