Example #1
def test_aggregate_records():
    bins = cooler.binnify(
        cooler.util.read_chromsizes(op.join(datadir, "toy.chrom.sizes")), 1
    )
    records = pd.read_csv(
        op.join(datadir, "toy.pairs"),
        sep='\t',
        names=[
            "read_id",
            "chrom1", "pos1",
            "chrom2", "pos2",
            "strand1", "strand2",
            "value"
        ]
    )
    sanitizer = sanitize_records(
        bins,
        schema="pairs",
        validate=False,
        tril_action="reflect",
        is_one_based=False,
        sort=False,
    )
    chunk = sanitizer(records)

    aggregator = aggregate_records()
    aggregator(chunk)
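For orientation, cooler.binnify tiles each chromosome into fixed-width bins and returns a chrom/start/end table. A minimal sketch with made-up chromosome sizes (not the test's toy.chrom.sizes fixture):

import pandas as pd
import cooler

chromsizes = pd.Series({"chr1": 5, "chr2": 3}, name="length")
bins = cooler.binnify(chromsizes, 1)  # one 1-bp bin per position
print(bins)  # columns: chrom, start, end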
Example #2
def hdf2cool(infile, outfile, chrms_sizes, assembly='dm3', correct=True):
    """
    Convert a hiclib .hdf5 heatmap to a .cool file.
    The datasets "heatmap" (whole-genome heatmap), "resolution" and "genomeIdxToLabel" must be present in the .hdf5 file.

    :param infile: input .hdf5 file
    :param outfile: output .cool file
    :param chrms_sizes: tab-separated file with chromosome lengths
    :param assembly: genome assembly (dm3 by default)
    :param correct: iteratively correct the heatmap? (True by default)

    :return: the cooler.Cooler object that was written to the file.
    """
    
    a = h5py.File(infile, 'r')
    heatmap = a['heatmap'][()]  # h5py >= 3 removed the deprecated .value accessor

    labels = pickle.loads(a['genomeIdxToLabel'][()]).values()
    chrms = ['chr' + x if 'chr' not in x else x for x in labels]

    chromsizes = (
        pd.read_csv(chrms_sizes, sep='\t', names=['name', 'length'])
        .set_index('name')
        .loc[chrms, 'length']
    )
    binsize = pickle.loads(a['resolution'][()])
    a.close()
    bins    = cooler.binnify(chromsizes, binsize)

    iterator = cooler.io.ArrayLoader(bins, heatmap, binsize)
    cooler.io.create(outfile, bins, iterator, assembly=assembly)

    c = cooler.Cooler(outfile)
    if correct:
        bias, stats = cooler.ice.iterative_correction(c, store=c)
    
    return c
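A hypothetical call (file names are placeholders, not from the source):

c = hdf2cool('fly.hdf5', 'fly.cool', 'dm3.chrom.sizes', assembly='dm3', correct=True)
print(c.info)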
Example #3
    def _write_mtx(self, output_cooler, binsize, assembly_name, ch, chromsize):

        self.compute()

        chromsizes = pd.Series({ch: chromsize}, name='length')
        bins = cooler.binnify(chromsizes, binsize)

        pixels = cooler.io.ArrayLoader(bins, self._mtx, chunksize=10000000)
        cooler.io.create(output_cooler, bins, pixels, assembly=assembly_name)
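The same ArrayLoader pattern works standalone. A minimal sketch with an invented chromosome name and a 4x4 symmetric count matrix (cooler.io is the pre-0.8 alias of cooler.create):

import numpy as np
import pandas as pd
import cooler

chromsizes = pd.Series({'chrT': 400}, name='length')
bins = cooler.binnify(chromsizes, 100)      # four 100-bp bins
rng = np.random.RandomState(0)
mtx = rng.poisson(5, size=(4, 4))
mtx = np.triu(mtx) + np.triu(mtx, 1).T      # symmetrize the counts
pixels = cooler.create.ArrayLoader(bins, mtx, 1000000)
cooler.create.create('toy.cool', bins, pixels)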
Example #4
def test_from_readhdf5():
    # uniform bins
    binsize = 100
    bintable = cooler.binnify(chromsizes, binsize)
    yield should_not_depend_on_chunksize, bintable
    yield should_raise_if_input_not_sorted, bintable
    yield should_work_with_int32_cols, bintable

    # non-uniform bins
    steps = [10, 100]
    bintable = _alternating_bins(chromsizes, steps)
    yield should_not_depend_on_chunksize, bintable
    yield should_raise_if_input_not_sorted, bintable
    yield should_work_with_int32_cols, bintable
Example #5
def test_sanitize_pixels():
    bins = cooler.binnify(
        cooler.util.read_chromsizes(op.join(datadir, "toy.chrom.sizes")), 1)
    chunk = pd.read_csv(op.join(datadir, "toy.symm.upper.1.zb.coo"),
                        sep='\t',
                        names=['bin1_id', 'bin2_id', 'count'])
    chunk['foo1'] = 4
    chunk['foo2'] = 2
    sanitize_pixels(bins)(chunk.copy())

    # one-based bin IDs
    out = sanitize_pixels(
        bins,
        is_one_based=True,
    )(chunk.copy())
    assert (out['bin1_id'] == chunk['bin1_id'] - 1).all()

    # tril action: reflect (after swapping bin1, bin2)
    tril_chunk = chunk.copy()
    tril_chunk['bin2_id'] = chunk['bin1_id']
    tril_chunk['bin1_id'] = chunk['bin2_id']
    out = sanitize_pixels(
        bins,
        tril_action="reflect",
        sided_fields=['foo'],
    )(tril_chunk.copy())
    assert len(out) == len(chunk)
    assert (out['foo2'] == chunk['foo1']).all()
    assert (out['foo1'] == chunk['foo2']).all()
    assert (out['bin1_id'] == chunk['bin1_id']).all()
    assert (out['bin2_id'] == chunk['bin2_id']).all()

    # tril action: drop
    out = sanitize_pixels(
        bins,
        tril_action="drop",
    )(tril_chunk.copy())
    assert len(out) == 0

    # tril action: raise
    with pytest.raises(BadInputError):
        sanitize_pixels(
            bins,
            tril_action="raise",
        )(tril_chunk.copy())
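In miniature, tril_action="reflect" mirrors a lower-triangle pixel back above the diagonal. A sketch reusing the 1-bp toy bins built in the test above:

tril = pd.DataFrame({'bin1_id': [2], 'bin2_id': [0], 'count': [5]})
out = sanitize_pixels(bins, tril_action="reflect")(tril)
# out holds a single pixel with bin1_id == 0, bin2_id == 2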
Example #6
def test_roundtrip(f_hm, f_cool):
    chromsizes = cooler.read_chromsizes(
        "http://genome.ucsc.edu/goldenpath/help/hg19.chrom.sizes",
        name_patterns=(r"^chr[0-9]+$", r"chrX$"),
    )
    binsize = 2000000
    bintable = cooler.binnify(chromsizes, binsize)

    heatmap = np.load(f_hm)
    reader = cooler.create.ArrayLoader(bintable, heatmap, 100000)
    cooler.create.create(f_cool, bintable, reader, assembly="hg19")

    h5 = h5py.File(f_cool, "r")
    new_chromtable = cooler.api.chroms(h5)
    assert np.all(chromsizes.index == new_chromtable["name"])

    new_bintable = cooler.api.bins(h5)
    assert np.all(bintable == new_bintable)

    info = cooler.api.info(h5)
    assert info["genome-assembly"] == "hg19"
    assert info["bin-type"] == "fixed"
    assert info["bin-size"] == binsize

    mat = cooler.api.matrix(h5, 0, 100, 0, 100, "count", balance=False)
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[:100, :100], mat)

    mat = cooler.Cooler(h5).matrix("count", balance=False)[:100, :100]
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[:100, :100], mat)

    mat = cooler.api.matrix(h5, 100, 200, 100, 200, "count", balance=False)
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[100:200, 100:200], mat)

    mat = cooler.Cooler(h5).matrix("count", balance=False)[100:200, 100:200]
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[100:200, 100:200], mat)

    try:
        os.remove(f_cool)
    except OSError:
        pass
Example #7
def test_roundtrip(f_hm, f_cool):
    chromsizes = cooler.read_chromsizes(
        'https://genome.ucsc.edu/goldenpath/help/hg19.chrom.sizes',
        name_patterns=(r'^chr[0-9]+$', r'chrX$'))
    binsize = 2000000
    bintable = cooler.binnify(chromsizes, binsize)

    heatmap = np.load(f_hm)
    reader = cooler.create.ArrayLoader(bintable, heatmap, 100000)
    cooler.create.create(f_cool, bintable, reader, assembly='hg19')

    h5 = h5py.File(f_cool, 'r')
    new_chromtable = cooler.api.chroms(h5)
    assert np.all(chromsizes.index == new_chromtable['name'])

    new_bintable = cooler.api.bins(h5)
    assert np.all(bintable == new_bintable)

    info = cooler.api.info(h5)
    assert info['genome-assembly'] == 'hg19'
    assert info['bin-type'] == 'fixed'
    assert info['bin-size'] == binsize

    mat = cooler.api.matrix(h5, 0, 100, 0, 100, 'count', balance=False)
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[:100, :100], mat)

    mat = cooler.Cooler(h5).matrix('count', balance=False)[:100, :100]
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[:100, :100], mat)

    mat = cooler.api.matrix(h5, 100, 200, 100, 200, 'count', balance=False)
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[100:200, 100:200], mat)

    mat = cooler.Cooler(h5).matrix('count', balance=False)[100:200, 100:200]
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[100:200, 100:200], mat)

    try:
        os.remove(f_cool)
    except OSError:
        pass
Example #8
def test_roundtrip():
    chromsizes = cooler.read_chromsizes(
        'https://genome.ucsc.edu/goldenpath/help/hg19.chrom.sizes',
        name_patterns=(r'^chr[0-9]+$', r'chrX$'))
    binsize = 2000000
    bintable = cooler.binnify(chromsizes, binsize)

    heatmap = np.load(
        os.path.join(testdir, 'data', 'IMR90-MboI-matrix.2000kb.npy'))
    reader = cooler.io.DenseLoader(heatmap)
    cooler.io.create(testfile_path,
                     chromsizes,
                     bintable,
                     reader,
                     assembly='hg19')

    h5 = h5py.File(testfile_path, 'r')
    new_chromtable = cooler.chroms(h5)
    assert np.all(chromsizes.index == new_chromtable['name'])

    new_bintable = cooler.bins(h5)
    assert np.all(bintable == new_bintable)

    info = cooler.info(h5)
    assert info['genome-assembly'] == 'hg19'
    assert info['bin-type'] == 'fixed'
    assert info['bin-size'] == binsize

    mat = cooler.matrix(h5, 0, 100, 0, 100, 'count', balance=False)
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[:100, :100], mat)

    mat = cooler.Cooler(h5).matrix('count', balance=False)[:100, :100]
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[:100, :100], mat)

    mat = cooler.matrix(h5, 100, 200, 100, 200, 'count', balance=False)
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[100:200, 100:200], mat)

    mat = cooler.Cooler(h5).matrix('count', balance=False)[100:200, 100:200]
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[100:200, 100:200], mat)
Example #9
def test_validate_pixels():
    bins = cooler.binnify(
        cooler.util.read_chromsizes(op.join(datadir, "toy.chrom.sizes")), 1
    )
    chunk = pd.read_csv(
        op.join(datadir, "toy.symm.upper.1.zb.coo"),
        sep='\t',
        names=['bin1_id', 'bin2_id', 'count']
    )
    validator = validate_pixels(
        len(bins),
        boundscheck=True,
        triucheck=True,
        dupcheck=True,
        ensure_sorted=True
    )
    validator(chunk.copy())
    validator(chunk.to_dict(orient='series'))

    # wrongly treat zero-based data as one-based, producing -1 bin IDs
    chunk_ = sanitize_pixels(
        bins,
        is_one_based=True,
    )(chunk.copy())
    with pytest.raises(BadInputError):
        validator(chunk_)

    # out-of-bounds bin ID
    chunk_ = chunk.copy()
    chunk_.at[-1, 'bin1_id'] = len(bins) + 1
    with pytest.raises(BadInputError):
        validator(chunk_)

    # pass in non-triu data
    tril_chunk = chunk.copy()
    tril_chunk['bin2_id'] = chunk['bin1_id']
    tril_chunk['bin1_id'] = chunk['bin2_id']
    with pytest.raises(BadInputError):
        validator(tril_chunk)

    # pass in duplicates
    with pytest.raises(BadInputError):
        validator(pd.concat([chunk, chunk], ignore_index=True))
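A self-contained sketch of the validator on a 3-bin table, assuming validate_pixels and BadInputError are imported as in the test above:

good = pd.DataFrame({'bin1_id': [0, 0], 'bin2_id': [1, 2], 'count': [3, 1]})
check = validate_pixels(3, boundscheck=True, triucheck=True,
                        dupcheck=True, ensure_sorted=True)
check(good.copy())  # passes; swapping the ID columns would raise BadInputError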
Example #10
def test_roundtrip():
    chromsizes = cooler.read_chromsizes(
        'https://genome.ucsc.edu/goldenpath/help/hg19.chrom.sizes',
        name_patterns=(r'^chr[0-9]+$', r'chrX$'))
    chroms, lengths = zip(*chromsizes.items())

    binsize = 2000000
    bintable = cooler.binnify(chromsizes, binsize)

    heatmap = np.load(os.path.join(testdir, 'data', 'IMR90-MboI-matrix.2000kb.npy'))
    with h5py.File(testfile_path, 'w') as h5:
        reader = cooler.io.DenseLoader(heatmap)
        cooler.io.create(h5, chroms, lengths, bintable, reader, assembly='hg19')

    h5 = h5py.File(testfile_path, 'r')
    new_chromtable = cooler.chroms(h5)
    assert np.all(chromsizes.index == new_chromtable['name'])

    new_bintable = cooler.bins(h5)
    assert np.all(bintable == new_bintable)

    info = cooler.info(h5)
    assert info['genome-assembly'] == 'hg19'
    assert info['bin-type'] == 'fixed'
    assert info['bin-size'] == binsize

    mat = cooler.matrix(h5, 0, 100, 0, 100, 'count')
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[:100,:100], mat.toarray())

    mat = cooler.Cooler(h5).matrix('count')[:100, :100]
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[:100,:100], mat.toarray())

    mat = cooler.matrix(h5, 100, 200, 100, 200, 'count')
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[100:200,100:200], mat.toarray())

    mat = cooler.Cooler(h5).matrix('count')[100:200, 100:200]
    assert mat.shape == (100, 100)
    assert np.allclose(heatmap[100:200,100:200], mat.toarray())
Example #11
def test_from_hdf5_pairs():
    def should_not_depend_on_chunksize(chromsizes, bintable, mock_pairs):
        # try different chunk sizes
        binner = cooler.create.HDF5Aggregator(mock_pairs,
                                              chromsizes,
                                              bintable,
                                              chunksize=66)
        cooler.create.create(testcool_path, bintable, binner)
        with h5py.File(testcool_path, 'r') as h5:
            oc1 = h5['indexes']['chrom_offset'][:]
            ob1 = h5['indexes']['bin1_offset'][:]
            p1 = cooler.api.pixels(h5, join=False)

        binner = cooler.create.HDF5Aggregator(mock_pairs,
                                              chromsizes,
                                              bintable,
                                              chunksize=666)
        cooler.create.create(testcool_path, bintable, binner)
        with h5py.File(testcool_path, 'r') as h5:
            oc2 = h5['indexes']['chrom_offset'][:]
            ob2 = h5['indexes']['bin1_offset'][:]
            p2 = cooler.api.pixels(h5, join=False)

        assert np.all(oc1 == oc2)
        assert np.all(ob1 == ob2)
        assert np.all(p1.values == p2.values)

    def should_raise_if_input_not_sorted(chromsizes, bintable, mock_pairs):
        # not sorted by chrm1
        bad_reads = {
            'chrms1': mock_pairs['chrms2'],
            'cuts1': mock_pairs['cuts2'],
            'chrms2': mock_pairs['chrms1'],
            'cuts2': mock_pairs['cuts1'],
        }
        with pytest.raises(ValueError):
            cooler.create.HDF5Aggregator(bad_reads,
                                         chromsizes,
                                         bintable,
                                         chunksize=66)

        # not triu
        bad_reads = {
            'chrms1': mock_pairs['chrms1'].copy(),
            'cuts1': mock_pairs['cuts1'].copy(),
            'chrms2': mock_pairs['chrms2'].copy(),
            'cuts2': mock_pairs['cuts2'].copy(),
        }
        bad_reads['chrms1'][0] = 0
        bad_reads['chrms2'][0] = 0
        bad_reads['cuts1'][0] = 10
        bad_reads['cuts2'][0] = 9
        binner = cooler.create.HDF5Aggregator(bad_reads,
                                              chromsizes,
                                              bintable,
                                              chunksize=66)
        with pytest.raises(ValueError):
            cooler.create.create(testcool_path, bintable, binner)

    def should_work_with_int32_cols(chromsizes, bintable, mock_pairs):
        # int64
        binner = cooler.create.HDF5Aggregator(mock_pairs,
                                              chromsizes,
                                              bintable,
                                              chunksize=66)
        cooler.create.create(testcool_path, bintable, binner)
        with h5py.File(testcool_path, 'r') as h5:
            oc1 = h5['indexes']['chrom_offset'][:]
            ob1 = h5['indexes']['bin1_offset'][:]
            p1 = cooler.api.pixels(h5, join=False)

        # int32
        mock_pairs32 = {
            'chrms1': mock_pairs['chrms1'].astype(np.int32),
            'cuts1': mock_pairs['cuts1'].astype(np.int32),
            'chrms2': mock_pairs['chrms2'].astype(np.int32),
            'cuts2': mock_pairs['cuts2'].astype(np.int32),
        }
        binner = cooler.create.HDF5Aggregator(mock_pairs32,
                                              chromsizes,
                                              bintable,
                                              chunksize=66)
        cooler.create.create(testcool_path, bintable, binner)
        with h5py.File(testcool_path, 'r') as h5:
            oc2 = h5['indexes']['chrom_offset'][:]
            ob2 = h5['indexes']['bin1_offset'][:]
            p2 = cooler.api.pixels(h5, join=False)

        assert np.all(oc1 == oc2)
        assert np.all(ob1 == ob2)
        assert np.all(p1.values == p2.values)

    def _mock_hdf5_pairs():
        np.random.seed(1)
        chrms = np.random.randint(0, n_chroms, n_records * 2)
        cuts = np.random.randint(0, clen, n_records * 2)
        abs_cuts = np.array(
            [clen * chrm + cut for chrm, cut in zip(chrms, cuts)])
        abs_cuts1, abs_cuts2 = abs_cuts[:n_records], abs_cuts[n_records:]
        mock_pairs = {
            'chrms1': chrms[:n_records],
            'cuts1': cuts[:n_records],
            'chrms2': chrms[n_records:],
            'cuts2': cuts[n_records:],
        }
        # Triu-sort
        mask = abs_cuts1 > abs_cuts2
        mock_pairs['chrms1'][mask], mock_pairs['chrms2'][mask] = mock_pairs[
            'chrms2'][mask], mock_pairs['chrms1'][mask]
        mock_pairs['cuts1'][mask], mock_pairs['cuts2'][mask] = mock_pairs[
            'cuts2'][mask], mock_pairs['cuts1'][mask]
        abs_cuts1[mask], abs_cuts2[mask] = abs_cuts2[mask], abs_cuts1[mask]
        idx = np.lexsort([abs_cuts2, abs_cuts1])
        for key in mock_pairs:
            mock_pairs[key] = mock_pairs[key][idx]
        return mock_pairs

    n_chroms = 2
    clen = 2000
    n_records = 3000
    chromsizes = pd.Series(index=['chr1', 'chr2'], data=[clen, clen])
    mock_pairs = _mock_hdf5_pairs()

    # uniform bins
    bintable = cooler.binnify(chromsizes, 100)
    should_not_depend_on_chunksize(chromsizes, bintable, mock_pairs)
    should_raise_if_input_not_sorted(chromsizes, bintable, mock_pairs)
    should_work_with_int32_cols(chromsizes, bintable, mock_pairs)

    # non-uniform bins
    bintable = _alternating_bins(chromsizes, [10, 100])
    should_not_depend_on_chunksize(chromsizes, bintable, mock_pairs)
    should_raise_if_input_not_sorted(chromsizes, bintable, mock_pairs)
    should_work_with_int32_cols(chromsizes, bintable, mock_pairs)
Example #12
    def convert(self,
                input_filename,
                output_filename,
                input_format=None,
                output_format=None,
                **kwargs):
        """
        Convert files/matrix without reading them into InteractionMatrix._mtx
        :param input_format:
        :param output_format:
        :param input:
        :param output:
        :return:
        """

        if isinstance(input_filename, np.ndarray):
            input_format = 'mtx'
            mtx = input_filename

        # When not given explicitly, infer formats from the dotted filename
        # parts (e.g. 'sample.cool' -> ['sample', 'cool']); explicit callers
        # pass plain strings, which the equality checks below rely on.
        if input_format is None:
            input_format = input_filename.split('.')
        if output_format is None:
            output_format = output_filename.split('.')

        TADselect_logger.info("Converting %s -> %s: from %s to %s",
                              input_format, output_format, input_filename,
                              output_filename)

        chromosome = kwargs.get('chr', 'chr2L')
        resolution = kwargs.get('res', 20000)
        remove_intermediary_files = kwargs.get('remove_intermediary_files',
                                               True)
        balance = kwargs.get('balance', False)

        if 'cool' in input_format and 'txt' in output_format:
            output_prefix = '.'.join(input_filename.split('.')[:-1])
            output_filename = output_prefix + '.{}.txt'.format(chromosome)

            c = cooler.Cooler(input_filename)
            mtx = c.matrix(balance=balance,
                           as_pixels=False).fetch(chromosome, chromosome)

            np.savetxt(output_filename, mtx, delimiter='\t')

            if 'gz' in output_format:
                command = 'gzip {}'.format(output_filename)
                run_command(command)
                output_filename += '.gz'

            return output_filename

        elif 'cool' in input_format and output_format == 'sparse':

            output_prefix = '.'.join(input_filename.split('.')[:-1])
            output_filename = output_prefix + '.{}.sparse.txt'.format(
                chromosome)

            c = cooler.Cooler(input_filename)
            mtx_df = c.matrix(balance=balance,
                              as_pixels=True,
                              join=True,
                              ignore_index=False).fetch(
                                  chromosome, chromosome)
            if balance:
                mtx_df.loc[:, 'count'] = mtx_df.loc[:, 'balanced']
                mtx_df = mtx_df.drop('balanced', axis=1)
                mtx_df = mtx_df.dropna()
            mtx_df.to_csv(output_filename, index=False, sep='\t', header=False)

            return output_filename

        elif 'cool' in input_format and 'mr_sparse' in output_format:
            output_prefix = '.'.join(input_filename.split('.')[:-1])
            output_filename = output_prefix + '.{}.mr_sparse.txt'.format(
                chromosome)

            c = cooler.Cooler(input_filename)
            mtx = c.matrix(balance=True,
                           as_pixels=True).fetch(chromosome, chromosome)
            mtx.loc[:, "bin1_id":"bin2_id"] += 1

            mtx.loc[:, 'bin1_id':'count'].to_csv(output_filename,
                                                 header=False,
                                                 index=False,
                                                 sep='\t')

            max_bin = mtx.loc[:, 'bin1_id':'bin2_id'].max().max()

            with open(output_prefix + ".{}.genome_bin.txt".format(chromosome),
                      'w') as outfile:
                outfile.write("1\tchr1\t0\t{}".format(max_bin - 1))

            with open(output_prefix + ".{}.all_bins.txt".format(chromosome),
                      'w') as outfile:
                for i in range(max_bin):
                    outfile.write("0\t{}\t{}\n".format(i * resolution + 1,
                                                       (i + 1) * resolution))

            return output_filename

        elif 'txt' in input_format and 'cool' in output_format:
            mtx = np.loadtxt(input_filename)
            chromsizes = pd.Series({chromosome: resolution * mtx.shape[0]},
                                   name='length')
            bins = cooler.binnify(chromsizes, resolution)

            pixels = cooler.io.ArrayLoader(bins, mtx, chunksize=10000000)
            cooler.io.create(output_filename, bins, pixels)

        elif 'cool' in input_format and output_format == 'h5':
            output_prefix = '.'.join(input_filename.split('.')[:-1])
            output_filename = output_prefix + '.h5'
            command = "hicExport --inFile {} --outFileName {} --inputFormat cool --outputFormat h5" \
                .format(input_filename, output_filename)
            run_command(command)
            return output_filename

        elif 'cool' in input_format and output_format == 'hic':

            binary_path = kwargs.get('binary_path', 'java')
            juicer_path = kwargs.get('juicer_path',
                                     './juicer_tools.1.8.9_jcuda.0.8.jar')
            genome = kwargs.get('genome', 'dm3')

            output_prefix = '.'.join(input_filename.split('.')[:-1])
            outfile_hic = "{}.{}.hic".format(output_prefix, chromosome)

            outfile_txt = outfile_hic + '.txt'
            outfile_tmp = outfile_hic + '.tmp'

            with open(outfile_tmp, 'w'):
                pass

            outfile_tmp = self.convert(input_filename=input_filename,
                                       output_filename=outfile_tmp,
                                       input_format='cool',
                                       output_format='sparse')

            command1 = "awk '{{print 0, $1, $2, 0, 0, $4, $5, 1, $7}}' {} > {}".format(
                outfile_tmp, outfile_txt)
            command2 = "gzip -f {}".format(outfile_txt)
            command3 = "{} -Xmx2g -jar {} pre -r {} -c {} {}.gz {} {}".format(
                binary_path, juicer_path, resolution, chromosome, outfile_txt,
                outfile_hic, genome)
            run_command(command1)
            run_command(command2)
            run_command(command3)

            if remove_intermediary_files:
                os.remove(outfile_txt + '.gz')

        elif input_format == 'mtx':

            if 'txt' in output_format:
                np.savetxt(output_filename, mtx, delimiter='\t')

                if 'gz' in output_format:
                    command = 'gzip {}'.format(output_filename)
                    run_command(command)
                    output_filename += '.gz'

            elif output_format == 'cool':
                chromsizes = pd.Series({chromosome: resolution * mtx.shape[0]},
                                       name='length')
                bins = cooler.binnify(chromsizes, resolution)

                pixels = cooler.io.ArrayLoader(bins, mtx, chunksize=10000000)
                cooler.io.create(output_filename, bins, pixels)

        return output_filename
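The 'txt' -> 'cool' branch above can be exercised standalone. A sketch with invented file names, assuming a dense, square, single-chromosome matrix on disk:

import numpy as np
import pandas as pd
import cooler

resolution = 20000
chromosome = 'chr2L'
mtx = np.loadtxt('matrix.txt')  # placeholder input path
chromsizes = pd.Series({chromosome: resolution * mtx.shape[0]}, name='length')
bins = cooler.binnify(chromsizes, resolution)
pixels = cooler.io.ArrayLoader(bins, mtx, chunksize=10000000)
cooler.io.create('matrix.cool', bins, pixels)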
Example #13
        description="Output a genome segmentation of restriction fragments as a BED file.")
    parser.add_argument(
        "chromsizes",
        help="UCSC-like chromsizes file, with chromosomes in desired order",
        metavar="CHROMSIZES_PATH")
    parser.add_argument(
        "binsize",
        help="Resolution (bin size) in base pairs <int>",
        metavar="BINSIZE")
    parser.add_argument(
        "--out", "-o",
        help="Output file (defaults to stdout)")
    args = vars(parser.parse_args())

    binsize = int(args['binsize'])
    chromsizes = cooler.read_chromsizes(args['chromsizes'])
    bins = cooler.binnify(chromsizes, binsize)

    # Write output
    out = args['out']
    f = sys.stdout if out is None else open(out, 'wt')
    try:
        bins.to_csv(f, sep='\t', index=False, header=False)
    finally:
        if f is not sys.stdout:
            f.close()
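Invoked from a shell, the script above would run as, e.g., "python make_bins.py hg19.chrom.sizes 10000 -o bins.bed" (the script name here is hypothetical).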
Example #14
def main():
    parser = argparse.ArgumentParser(description="""python matrix_storage_benchmark.py matrix_tsv""")
    parser.add_argument('matrix_tsv')
    parser.add_argument(
        '-s', '--square',
        default=256,
        type=int,
        help="The size of the square within which to return values")
    parser.add_argument(
        '-i', '--iterations',
        default=100,
        type=int,
        help="The number of times to run the range query")

    args = vars(parser.parse_args())
    binsize = 5000

    infilepath = args['matrix_tsv']
    outfilepath = op.join(op.dirname(infilepath), 'chrX.{}kb.cool'.format(binsize//1000))


    # Build "index"
    t1 = time.time()
    chromsizes = cooler.read_chromsizes('test/data/hg19.chrom.sizes')
    chroms = ['chrX']
    lengths = [chromsizes['chrX']]
    bins = cooler.binnify(chromsizes.loc['chrX':'chrX'], binsize)
    chunksize = int(100e6)
    reader = cooler.io.SparseLoader(infilepath, chunksize)
    h5opts = dict(compression='gzip', compression_opts=6)
    with h5py.File(outfilepath, 'w') as h5:
        cooler.io.create(h5, chroms, lengths, bins, reader, binsize, h5opts=h5opts)

    c = cooler.Cooler(outfilepath)
    print("Time creating index: {:.3f} seconds".format(time.time() - t1))


    # Normalization
    t15 = time.time()
    N_CPUS = 8
    chunksize = int(100e6)
    with h5py.File(outfilepath, 'r+') as h5, Pool(N_CPUS) as pool:
        bias = ice.iterative_correction(
            h5, chunksize=chunksize, tol=1e-05, mad_max=3,
            cis_only=False, ignore_diags=3, map=pool.map)

        h5opts = dict(compression='gzip', compression_opts=6)
        h5['bins'].create_dataset('weight', data=bias, **h5opts)
    print("Time for normalization (cis and trans): {:.3f} seconds".format(time.time() - t15))


    # The bounds of the contact coordinates
    c = cooler.Cooler(outfilepath)
    matrix = c.matrix()
    min_x = 0
    min_y = 0
    max_x = c.shape[0]
    max_y = c.shape[1]
    print("max_x:", max_x)
    print("max_y:", max_y)


    # Range queries
    square_size = args['square']
    t2 = time.time()

    for i in range(args['iterations']):
        point1 = random.randint(min_x, max_x - square_size)
        point2 = random.randint(min_y, max_y - square_size)
        mat = matrix[point1 : point1+square_size, point2 : point2+square_size]
        selected_points = list(zip(mat.row, mat.col, mat.data))

    t25 = time.time()
    print("Time performing range queries (256x256): {:.3f} seconds (per query): {:.3f} seconds".format(t25 - t2, (t25 - t2) / args['iterations']))

    weights = c.bins()['weight'][:].values
    for i in range(args['iterations']):
        point1 = random.randint(min_x, max_x - square_size)
        point2 = random.randint(min_y, max_y - square_size)
        mat = matrix[point1 : point1+square_size, point2 : point2+square_size]
        bias1 = weights[point1:point1+square_size]
        bias2 = weights[point2:point2+square_size]
        mat.data = bias1[mat.row] * bias2[mat.col] * mat.data
        selected_points = list(zip(mat.row, mat.col, mat.data))

    t26 = time.time()
    print("Time performing range queries (256x256) with balancing: {:.3f} seconds (per query): {:.3f} seconds".format(t26 - t25, (t26 - t25) / args['iterations']))

    for i in range(args['iterations']):
        point1 = random.randint(min_x, max_x - square_size * 8)
        point2 = random.randint(min_y, max_y - square_size * 8)
        mat = matrix[point1 : point1+square_size*8, point2 : point2+square_size*8]
        selected_points = list(zip(mat.row, mat.col, mat.data))

    t3 = time.time()
    print("Time performing range queries (2048 x 2048): {:.3f} seconds (per query): {:.3f} seconds".format(t3 - t26, (t3 - t26) / args['iterations']))

    weights = c.bins()['weight'][:].values
    for i in range(args['iterations']):
        point1 = random.randint(min_x, max_x - square_size * 8)
        point2 = random.randint(min_y, max_y - square_size * 8)
        mat = matrix[point1 : point1+square_size*8, point2 : point2+square_size*8]
        selected_points = list(zip(mat.row, mat.col, mat.data))

    t35 = time.time()
    print("Time performing range queries (2048 x 2048) with balancing: {:.3f} seconds (per query): {:.3f} seconds".format(t35 - t3, (t35 - t3) / args['iterations']))

    for i in range(args['iterations']):
        point1 = random.randint(min_x, max_x - square_size)
        mat = matrix[point1, :]
        selected_points = list(zip(mat.row, mat.col, mat.data))

    t4 = time.time()
    print("Time slicing across first dimension: {:.3f} seconds (per query): {:.3f} seconds".format(t4 - t35, (t4 - t35) / args['iterations']))

    for i in range(args['iterations']):
        point2 = random.randint(min_y, max_y - square_size)
        mat = matrix[:, point2]
        selected_points = list(zip(mat.row, mat.col, mat.data))

    t5 = time.time()
    print("Time slicing across second dimension: {:.3f} seconds (per query): {:.3f} seconds".format(t5 - t4, (t5 - t4) / args['iterations']))

    selected_points = []
    for i in range(args['iterations']):
        for pix in c.pixels().iterchunks(size=1000000):
            diag = pix[pix.bin1_id == pix.bin2_id]
            selected_points.extend( list(zip(diag['bin1_id'], diag['bin2_id'], diag['count'])) )

    t6 = time.time()
    print("Time slicing across the diagonal: {:.3f} seconds (per query): {:.3f} seconds".format(t6 - t5, (t6 - t5) / args['iterations']))


    # Dump
    print("Size of index: {} bytes".format(op.getsize(outfilepath)))
    with open('/tmp/tmp.tsv', 'wt') as f:
        for pix in c.pixels().iterchunks(size=100000):
            pix.to_csv(f, sep='\t', index=False, header=False)
    print("Time outputting the index: {:.3f}".format(time.time() - t6))
    print("Size of output: {} bytes".format(op.getsize('/tmp/tmp.tsv')))
Example #15
import matplotlib as mpl

mpl.style.use('seaborn-white')

import multiprocess as mp
import numpy as np
import pandas as pd
import bioframe
import cooltools
import cooler
from cooltools.eigdecomp import cooler_cis_eig

chromsizes = bioframe.fetch_chromsizes('mm10')
chromosomes = list(chromsizes.index)

binsize = 10000
bins = cooler.binnify(chromsizes, binsize)
fasta_records = bioframe.load_fasta('/data05/genomes/mm10_20chr.fa')
bins['GC'] = bioframe.tools.frac_gc(bins, fasta_records)
bins.head()

import fnmatch
import os

for file in os.listdir('.'):
    if fnmatch.fnmatch(file, '*_10kb.cool'):
        clr = cooler.Cooler(file)
        cond = file.split('.')[0]
        lam, eigs = cooler_cis_eig(clr,
                                   bins,
                                   n_eigs=3,
                                   phasing_track_col='GC')
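Assuming the loop matched at least one file, lam holds the eigenvalues and eigs a bins-like eigenvector table for the last condition; persisting them might look like this (a sketch, not part of the original snippet):

eigs.to_csv(cond + '.eigs.tsv', sep='\t', index=False)
print(lam)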
Example #16
def hic2cool_extractnorms(infile,
                          outfile,
                          exclude_mt=False,
                          show_warnings=False,
                          silent=False):
    """
    Find all normalization vectors in the given hic file at all resolutions and
    attempts to add them to the given cooler file. Does not add any metadata
    to the cooler file. TODO: should we add `extract-norms-date` attr?

    Params:
    <infile> str .hic filename
    <outfile> str .cool output filename
    <exclude_mt> bool. If True, ignore MT contacts. Defaults to False.
    <show_warnings> bool. If True, print out WARNING messages
    <silent> bool. If true, hide standard output
    """
    unit = 'BP'  # only using base pair unit for now
    # Global hic normalization types used
    global NORMS
    NORMS = []
    global WARN
    WARN = False
    req = open(infile, 'rb')
    buf = mmap.mmap(req.fileno(), 0, access=mmap.ACCESS_READ)
    used_chrs, resolutions, masteridx, genome, metadata = read_header(req)
    pair_footer_info, expected, factors, norm_info = read_footer(
        req, buf, masteridx)
    # expected/factors unused for now
    del expected
    del factors

    chr_names = [used_chrs[key][1] for key in used_chrs.keys()]
    if not silent:  # print hic header info for command line usage
        print('################################')
        print('### hic2cool / extract-norms ###')
        print('################################')
        print('Header info from hic:')
        print('... Chromosomes: ', chr_names)
        print('... Resolutions: ', resolutions)
        print('... Normalizations: ', NORMS)
        print('... Genome: ', genome)

    if exclude_mt:  # remove mitochondrial chr by name if this flag is set
        # try to find index of chrM (a.k.a chrMT) if it is present
        mt_names = ['m', 'mt', 'chrm', 'chrmt']
        found_idxs = [
            idx for idx, fv in used_chrs.items() if fv[1].lower() in mt_names
        ]
        if len(found_idxs) == 1:
            excl = used_chrs.pop(found_idxs[0], None)
            if not silent:
                print('... Excluding chromosome %s with index %s' %
                      (excl[1], excl[0]))
        elif len(found_idxs) > 1:
            error_str = (
                'ERROR. More than one chromosome was found when attempting to'
                ' exclude MT. Found chromosomes: %s' % chr_names)
            force_exit(error_str, req)
        else:
            if not silent:
                print('... No chromosome found when attempting to exclude MT.')

    # exclude 'all' from chromosomes
    chromosomes = [
        uc[1] for uc in used_chrs.values() if uc[1].lower() != 'all'
    ]
    lengths = [uc[2] for uc in used_chrs.values() if uc[1].lower() != 'all']
    chromsizes = pd.Series(index=chromosomes, data=lengths)

    cooler_groups = {}
    for path in cooler.fileops.list_coolers(outfile):
        binsize = cooler.Cooler(outfile + '::' + path).info['bin-size']
        cooler_groups[binsize] = path
    if not silent:
        print('### Found cooler contents:')
        print('... %s' % cooler_groups)

    for norm in NORMS:
        for binsize in resolutions:
            if binsize not in cooler_groups:
                if not silent:
                    print('... Skip resolution %s; it is not in cooler file' %
                          binsize)
                continue
            if not silent:
                print('... Extracting %s normalization vector at %s BP' %
                      (norm, binsize))
            chrom_map = {}
            bins = cooler.binnify(chromsizes, binsize)
            lengths_in_bins = bins.groupby('chrom').size()
            for chr_val in [
                    uc for uc in used_chrs.values() if uc[1].lower() != 'all'
            ]:
                chr_num_bins = lengths_in_bins.loc[chr_val[1]]
                try:
                    norm_key = norm_info[norm, unit, binsize, chr_val[0]]
                except KeyError:
                    WARN = True
                    if show_warnings and not silent:
                        print_stderr(
                            '!!! WARNING. Normalization vector %s does not exist for %s.'
                            % (norm, chr_val[1]))
                    # add a vector of 0's with length equal to by_chr_bins[chr_idx]
                    norm_vector = [np.nan] * chr_num_bins
                else:
                    norm_vector = read_normalization_vector(
                        req, buf, norm_key)[:chr_num_bins]
                chrom_map[chr_val[1]] = norm_vector

            # hic normalization vectors have inconsistent lengths;
            # truncate appropriately
            bins[norm] = np.concatenate(
                [chrom_map[chrom] for chrom in chromosomes])
            if not silent:
                print('... Writing to cool file ...')
                print('%s\n... Truncated ...' % bins.head())
            group_path = cooler_groups[binsize]
            cooler.create.append(outfile + '::' + group_path,
                                 'bins', {norm: bins[norm].values},
                                 force=True)
    req.close()
    if not silent:
        if WARN and not show_warnings:
            print(
                '... Warnings were found in this run. Run with -v to display them.'
            )
        print('### Finished! Output written to: %s' % outfile)
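A hypothetical invocation (paths are placeholders); note that the output file must already be a cooler file containing resolutions that match the .hic:

hic2cool_extractnorms('sample.hic', 'sample.mcool', exclude_mt=True)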