예제 #1
0
def test_convert_to_zscore_matrix_2():
    """Compare convert_to_zscore_matrix with a brute-force z-score.

    For every upper-triangle entry whose bin distance does not exceed
    ``max_depth`` the z-score is recomputed from the mean and standard
    deviation of all entries at the same distance. Only the main diagonal
    is compared with the result of the class method.
    """
    # load test matrix
    hic = hm.hiCMatrix(ROOT + '/Li_et_al_2015.h5')
    hic.maskBins(hic.nan_bins)

    mat = hic.matrix.todense()
    max_depth = 10000
    bin_size = hic.getBinSize()
    max_depth_in_bins = int(float(max_depth) / bin_size)

    m_size = mat.shape[0]
    # only the bin start positions are needed to derive distances
    _chrom, start, _end, _extra = zip(
        *hm.hiCMatrix.fit_cut_intervals(hic.cut_intervals))

    # collect matrix values per distance
    dist_values = {}
    sys.stderr.write("Computing values per distance for each matrix entry\n")

    for _i in range(m_size):
        # start at _i: only the upper triangle (j >= i) is considered
        for _j in range(_i, m_size):
            # dist is translated to bins
            dist = int(float(start[_j] - start[_i]) / bin_size)
            if dist <= max_depth_in_bins:
                dist_values.setdefault(dist, []).append(mat[_i, _j])

    # .items() works on Python 2 and 3 (iteritems() is Python 2 only)
    mu = {}
    std = {}
    for dist, values in dist_values.items():
        mu[dist] = np.mean(values)
        std[dist] = np.std(values)

    # compute z-score for test matrix
    sys.stderr.write("Computing zscore for each matrix entry\n")
    zscore_mat = np.full((m_size, m_size), np.nan)
    for _i in range(m_size):
        for _j in range(_i, m_size):
            dist = int(float(start[_j] - start[_i]) / bin_size)
            if dist <= max_depth_in_bins:
                zscore_mat[_i, _j] = (mat[_i, _j] - mu[dist]) / std[dist]

    # compare with zscore from class
    hic.convert_to_zscore_matrix(maxdepth=max_depth)

    from numpy.testing import assert_almost_equal
    # only the main diagonal is checked; other diagonals show minimal
    # differences
    assert_almost_equal(hic.matrix.todense().diagonal(0).A1,
                        zscore_mat.diagonal(0))
예제 #2
0
def main(args=None):
    """Combine two Hi-C matrices by difference, ratio or log2 ratio.

    Both matrices are divided by the sum of their stored values before the
    selected operation is applied. The union of the nan bins of both inputs
    is masked in the result, which is saved to ``args.outFileName``.

    :param args: optional list of command line arguments; passed on to the
                 argument parser.
    """
    args = parse_arguments().parse_args(args)
    if args.operation not in ['diff', 'ratio', 'log2ratio']:
        exit("Operation not found. Please use 'diff', 'ratio' or 'log2ratio'.")

    hic1 = hm.hiCMatrix(args.matrices[0])
    hic2 = hm.hiCMatrix(args.matrices[1])

    if hic1.matrix.shape != hic2.matrix.shape:
        # the trailing space keeps the two literals from fusing into
        # "usingthe" when they are concatenated
        exit("The two matrices have different size. Use matrices having the same resolution and created using "
             "the same parameters. Check the matrix values using the tool `hicInfo`.")

    if hic1.chrBinBoundaries != hic2.chrBinBoundaries:
        exit("The two matrices have different chromosome order. Use the tool `hicExport` to change the order.\n"
             "{}: {}\n"
             "{}: {}".format(args.matrices[0], hic1.chrBinBoundaries.keys(),
                             args.matrices[1], hic2.chrBinBoundaries.keys()))

    # normalize by total matrix sum
    hic1.matrix.data = hic1.matrix.data.astype(float) / hic1.matrix.data.sum()
    hic2.matrix.data = hic2.matrix.data.astype(float) / hic2.matrix.data.sum()

    nan_bins = set(hic1.nan_bins)
    nan_bins = nan_bins.union(hic2.nan_bins)

    if args.operation == 'diff':
        new_matrix = hic1.matrix - hic2.matrix
    elif args.operation in ('ratio', 'log2ratio'):
        # invert only the stored values of the second matrix so that the
        # element-wise product below yields the ratio
        hic2.matrix.data = float(1) / hic2.matrix.data
        new_matrix = hic1.matrix.multiply(hic2.matrix)
        # just in case
        new_matrix.eliminate_zeros()
        if args.operation == 'log2ratio':
            new_matrix.data = np.log2(new_matrix.data)
            new_matrix.eliminate_zeros()

    hic1.setMatrixValues(new_matrix)
    hic1.maskBins(sorted(nan_bins))
    hic1.save(args.outFileName)
예제 #3
0
def test_hic_transfer_all():
    """Run hicTransform with ``--method all`` and check every output file.

    Each of the three per-method outputs is compared with its reference
    matrix and removed afterwards.
    """
    outfile = NamedTemporaryFile(suffix='all.h5', delete=False)
    outfile.close()

    arguments = "--matrix {} --outFileName {} --method all".format(
        original_matrix, outfile.name).split()
    hicTransform.main(arguments)

    out_dir = dirname(outfile.name)
    out_name = basename(outfile.name)

    # the per-method outputs are looked up under the output file name
    # prefixed with the method name
    for method in ("obs_exp", "pearson", "covariance"):
        produced_path = out_dir + "/" + method + "_" + out_name
        expected = hm.hiCMatrix(ROOT + "hicTransform/" + method + "_small_50kb.h5")
        produced = hm.hiCMatrix(produced_path)
        nt.assert_array_almost_equal(expected.matrix.data,
                                     produced.matrix.data)
        os.unlink(produced_path)

    os.unlink(outfile.name)
예제 #4
0
def test_intervalListToIntervalTree(capsys):
    """Check intervalListToIntervalTree for an empty and a valid list.

    An empty interval list has to raise an AssertionError; a valid list
    has to produce one interval tree per chromosome plus the chromosome
    bin boundaries.
    """
    # get matrix
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    matrix = np.array([[1, 8, 5, 3, 0], [0, 4, 15, 5, 1], [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1], [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)

    nt.assert_equal(hic.getMatrix(), matrix)

    # empty list should raise AssertionError.
    # Only the raising call lives inside the context manager: statements
    # placed after it would be skipped once the call raises, and an
    # AssertionError produced by such a statement would wrongly satisfy
    # pytest.raises if the call itself did not raise.
    with pytest.raises(AssertionError):
        hic.intervalListToIntervalTree([])

    # test with correct interval_list
    interval_list = [('a', 0, 10, 1), ('a', 10, 20, 1), ('b', 20, 30, 1),
                     ('b', 30, 50, 1), ('b', 50, 100, 1), ('c', 100, 200, 1),
                     ('c', 200, 210, 1), ('d', 210, 220, 1), ('e', 220, 250)]

    tree, boundaries = hic.intervalListToIntervalTree(interval_list)

    # test tree: the interval data is the running bin index
    nt.assert_equal(tree['a'],
                    IntervalTree([Interval(0, 10, 0),
                                  Interval(10, 20, 1)]))
    nt.assert_equal(
        tree['b'],
        IntervalTree(
            [Interval(20, 30, 2),
             Interval(30, 50, 3),
             Interval(50, 100, 4)]))
    nt.assert_equal(
        tree['c'], IntervalTree([Interval(100, 200, 5),
                                 Interval(200, 210, 6)]))
    nt.assert_equal(tree['d'], IntervalTree([Interval(210, 220, 7)]))
    nt.assert_equal(tree['e'], IntervalTree([Interval(220, 250, 8)]))

    # test boundaries
    nt.assert_equal(
        boundaries,
        OrderedDict([('a', (0, 2)), ('b', (2, 5)), ('c', (5, 7)),
                     ('d', (7, 8)), ('e', (8, 9))]))
예제 #5
0
def test_build_matrix():
    """Build a matrix with hicBuildMatrix and compare it with references.

    Checks the matrix values, the bin intervals, the QC log, the set of
    files in the QC folder and the approximate size of the written BAM
    file. All temporary output is removed even when a check fails.
    """
    outfile = NamedTemporaryFile(suffix='.h5', delete=False)
    outfile.close()
    qc_folder = mkdtemp(prefix="testQC_")
    bam_file = "/tmp/test.bam"
    args = "-s {} {} --outFileName {} -bs 5000 -b {} --QCfolder {} --threads 4".format(
        sam_R1, sam_R2, outfile.name, bam_file, qc_folder).split()

    try:
        hicBuildMatrix.main(args)

        test = hm.hiCMatrix(ROOT + "small_test_matrix_parallel.h5")
        new = hm.hiCMatrix(outfile.name)
        nt.assert_equal(test.matrix.data, new.matrix.data)
        nt.assert_equal(test.cut_intervals, new.cut_intervals)
        print(set(os.listdir(ROOT + "QC/")))
        assert are_files_equal(ROOT + "QC/QC.log", qc_folder + "/QC.log")
        assert set(os.listdir(ROOT + "QC/")) == set(os.listdir(qc_folder))
        # the BAM file sizes are only required to agree within 1000 bytes
        assert abs(
            os.path.getsize(ROOT + "small_test_matrix_result.bam") -
            os.path.getsize(bam_file)) < 1000
    finally:
        # clean up regardless of the test outcome; a file may be missing
        # when the build step itself failed
        for path in (outfile.name, bam_file):
            if os.path.exists(path):
                os.unlink(path)
        shutil.rmtree(qc_folder, ignore_errors=True)
예제 #6
0
def main():
    """Sum all matrices given on the command line and save the result.

    The first matrix provides the bin layout. The nan bins of all inputs
    are merged and masked in the summed matrix before it is written to
    ``args.outFileName``. The program exits with status 1 when a matrix
    cannot be added to the running sum.
    """
    args = parse_arguments().parse_args()
    hic = hm.hiCMatrix(args.matrices[0])
    summed_matrix = hic.matrix
    nan_bins = set(hic.nan_bins)
    for matrix in args.matrices[1:]:
        hic_to_append = hm.hiCMatrix(matrix)

        try:
            summed_matrix = summed_matrix + hic_to_append.matrix
            if len(hic_to_append.nan_bins):
                nan_bins = nan_bins.union(hic_to_append.nan_bins)
        except Exception:
            # `except Exception` lets KeyboardInterrupt/SystemExit through.
            # A single-argument print(...) call is valid on Python 2 and 3.
            print("\nMatrix {} seems to be corrupted or of different "
                  "shape".format(matrix))
            exit(1)

    # save only the upper triangle of the
    # symmetric matrix
    hic.setMatrixValues(summed_matrix)
    hic.maskBins(sorted(nan_bins))
    hic.save(args.outFileName)
예제 #7
0
def getViewpointValues(pMatrix,
                       pReferencePoint,
                       pChromViewpoint,
                       pRegion_start,
                       pRegion_end,
                       pInteractionList=None,
                       pChromosome=None):
    """Sum the contacts between a reference point and a region.

    :param pMatrix: matrix file handed to ``hm.hiCMatrix``.
    :param pReferencePoint: ``(chrom, position)`` or ``(chrom, start, end)``.
    :param pChromViewpoint: chromosome of the region to inspect.
    :param pRegion_start: start of the region.
    :param pRegion_end: end of the region.
    :param pInteractionList: if not None, every single interaction is
                             recorded in addition to the summed values.
    :param pChromosome: if not None, the matrix is restricted to these
                        chromosomes first.
    :return: list of ``[view_point_start, view_point_end, view_point_range,
             data_list, interactions_list]``; ``interactions_list`` is None
             when ``pInteractionList`` is None.
    """
    hic = hm.hiCMatrix(pMatrix)
    if pChromosome is not None:
        hic.keepOnlyTheseChr(pChromosome)

    if len(pReferencePoint) not in (2, 3):
        log.error("No valid reference point given. {}".format(pReferencePoint))
        exit(1)

    # for a two-element reference point the last item is the position
    # itself, for a three-element one it is the end coordinate
    view_point_start, view_point_end = hic.getRegionBinRange(
        pReferencePoint[0], int(pReferencePoint[1]),
        int(pReferencePoint[-1]))

    view_point_range = hic.getRegionBinRange(pChromViewpoint, pRegion_start,
                                             pRegion_end)
    first_bin, stop_bin = view_point_range[0], view_point_range[1]
    data_list = np.zeros(stop_bin - first_bin)

    interactions_list = [] if pInteractionList is not None else None

    # walk over every bin of the reference point (end bin inclusive)
    for reference_bin in range(view_point_start, view_point_end + 1):
        chrom, start, end, _ = hic.getBinPos(reference_bin)
        for offset, idx in enumerate(range(first_bin, stop_bin)):
            count = hic.matrix[reference_bin, idx]
            data_list[offset] += count
            if interactions_list is None:
                continue
            chrom_second, start_second, end_second, _ = hic.getBinPos(idx)
            interactions_list.append(
                (chrom, start, end, chrom_second, start_second, end_second,
                 count))

    return [
        view_point_start, view_point_end, view_point_range, data_list,
        interactions_list
    ]
예제 #8
0
def test_setCorrectionFactors_fail():
    """Four correction factors for a five-bin matrix raise AssertionError."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)

    # no factors are set on a freshly filled matrix
    assert hic.correction_factors is None

    too_few_factors = [5, 5, 5, 5]
    with pytest.raises(AssertionError):
        hic.setCorrectionFactors(too_few_factors)
예제 #9
0
def test_reorderChromosomes_fail():
    """An order naming an unknown chromosome makes reorderChromosomes exit."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)

    # 'c' is not among the chromosome names of the matrix
    order_with_unknown_chr = ['a', 'b', 'c']
    with pytest.raises(SystemExit):
        hic.reorderChromosomes(order_with_unknown_chr)
예제 #10
0
def get_test_matrix(cut_intervals=None, matrix=None):
    """Return a small symmetric hiCMatrix for tests.

    :param cut_intervals: bin definitions; a five-bin default is used when
                          this is None or empty.
    :param matrix: square dense array; a 6x6 upper-triangular default is
                   used when this is None. ``matrix + matrix.T`` is stored.
    :return: hiCMatrix trimmed to ``len(cut_intervals)`` rows and columns.
    """
    if matrix is None:
        matrix = np.array([[1, 8, 5, 3, 0, 8],
                           [0, 4, 15, 5, 1, 7],
                           [0, 0, 0, 7, 2, 8],
                           [0, 0, 0, 0, 1, 5],
                           [0, 0, 0, 0, 0, 6],
                           [0, 0, 0, 0, 0, 0]])

    if not cut_intervals:
        # NOTE(review): 'c-4' appears twice and 'c-3' never; possibly a
        # typo, kept as-is - confirm against the callers
        cut_intervals = [('c-0', 0, 1, 1), ('c-1', 0, 1, 1), ('c-2', 0, 1, 1),
                         ('c-4', 0, 1, 1), ('c-4', 0, 1, 1)]

    # make matrix symmetric
    symmetric = csr_matrix(matrix + matrix.T)
    num_bins = len(cut_intervals)

    hic = HiCMatrix.hiCMatrix()
    hic.nan_bins = []
    # keep only as many rows/columns as there are bins
    hic.matrix = csr_matrix(symmetric[0:num_bins, 0:num_bins])
    hic.setMatrix(hic.matrix, cut_intervals)
    return hic
예제 #11
0
def test_setCorrectionFactors_success():
    """One correction factor per bin is accepted and stored."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)

    # no factors are set on a freshly filled matrix
    assert hic.correction_factors is None

    hic.setCorrectionFactors([5, 5, 5, 5, 5])

    nt.assert_equal(hic.correction_factors, [5, 5, 5, 5, 5])
예제 #12
0
def main(args=None):
    """Convert a Hi-C matrix to obs/exp or z-score values and save it.

    :param args: optional list of command line arguments; passed on to the
                 argument parser.
    """
    args = parse_arguments().parse_args(args)
    hic_ma = HiCMatrix.hiCMatrix(args.matrix)

    # masking is best effort: an AttributeError raised here is ignored
    try:
        hic_ma.maskBins(hic_ma.nan_bins)
    except AttributeError:
        pass

    if args.skipDiagonal:
        hic_ma.diagflat()

    # every method other than 'obs/exp' falls back to the z-score
    if args.method == 'obs/exp':
        transform = hic_ma.convert_to_obs_exp_matrix
    else:
        transform = hic_ma.convert_to_zscore_matrix
    transform(maxdepth=args.depth, perchr=args.perchr)

    hic_ma.save(args.outFileName)
예제 #13
0
def test_setMatrixValues_fail():
    """A 4x4 replacement for a 5x5 matrix raises AssertionError."""
    hic = hm.hiCMatrix()
    cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('b', 30, 40, 1), ('b', 40, 50, 1)]

    hic.nan_bins = []

    matrix = np.array([[1, 8, 5, 3, 0], [0, 4, 15, 5, 1], [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1], [0, 0, 0, 0, 0]])

    hic.matrix = csr_matrix(matrix)
    hic.setMatrix(hic.matrix, cut_intervals)

    # the replacement has one row and one column fewer than the matrix
    # (a second, never-used four-bin cut_intervals list was dropped here)
    new_matrix = np.array([[10, 80, 50, 30], [0, 40, 150, 50], [0, 0, 0, 0],
                           [0, 0, 0, 0]])
    with pytest.raises(AssertionError):
        hic.setMatrixValues(new_matrix)
예제 #14
0
def test_restoreMaskedBins():
    """Check restoreMaskedBins without and with previously masked bins."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)

    nt.assert_equal(hic.getMatrix(), counts)
    nt.assert_equal(hic.orig_bin_ids, [])

    # without masked bins the call must leave everything untouched
    hic.restoreMaskedBins()

    nt.assert_equal(hic.getMatrix(), counts)
    nt.assert_equal(hic.orig_bin_ids, [])

    # general use: mask the first two bins ...
    hic.maskBins([0, 1])

    masked_expected = np.matrix([[0, 0, 2],
                                 [0, 0, 1],
                                 [0, 0, 0]])

    nt.assert_equal(hic.getMatrix(), masked_expected)
    nt.assert_equal(sorted(hic.orig_bin_ids), sorted([0, 1, 2, 3, 4]))

    # ... and restore them: the rows and columns of the formerly masked
    # bins are expected to hold nan
    hic.restoreMaskedBins()

    nan = np.nan
    restored_expected = np.matrix([[nan, nan, nan, nan, nan],
                                   [nan, nan, nan, nan, nan],
                                   [nan, nan, 0, 0, 2],
                                   [nan, nan, 0, 0, 1],
                                   [nan, nan, 0, 0, 0]])

    nt.assert_equal(hic.getMatrix(), restored_expected)
    nt.assert_equal(hic.orig_bin_ids, [])
예제 #15
0
def test_truncTrans_bk(capsys):
    """Call truncTrans_bk on a small matrix, tolerating a TypeError."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[-1, 8, 5, 3, 0],
                       [np.nan, 4, 15, 5, 100],
                       [0, 0, 0, 0, 2000],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    # get matrix
    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)

    nt.assert_equal(hic.getMatrix(), counts)

    # NOTE(review): a TypeError is swallowed and nothing is asserted
    # afterwards, so this only checks that no other exception escapes
    try:
        hic.truncTrans_bk()
    except TypeError:
        pass
예제 #16
0
def test_setMatrixValues_success():
    """A replacement of matching shape is stored by setMatrixValues."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)

    # same shape as the original, every value multiplied by ten
    replacement = np.array([[10, 80, 50, 30, 0],
                            [0, 40, 150, 50, 10],
                            [0, 0, 0, 0, 20],
                            [0, 0, 0, 0, 10],
                            [0, 0, 0, 0, 0]])

    hic.setMatrixValues(replacement)

    nt.assert_equal(hic.getMatrix(), replacement)
예제 #17
0
def test_filterOutInterChrCounts():
    """Counts between chromosomes 'a' and 'b' are zeroed by the filter."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)

    # the filter is applied to the matrix with its lower triangle filled
    hic.matrix = hic.fillLowerTriangle(hic.matrix)
    hic.filterOutInterChrCounts()

    # only the 3x3 block of 'a' and the 2x2 block of 'b' keep their values
    expected = np.matrix([[1, 8, 5, 0, 0],
                          [8, 4, 15, 0, 0],
                          [5, 15, 0, 0, 0],
                          [0, 0, 0, 0, 1],
                          [0, 0, 0, 1, 0]])

    nt.assert_equal(hic.getMatrix(), expected)
예제 #18
0
def test_diagflat():
    """diagflat sets the main diagonal to the given value, nan by default."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('a', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)
    hic.matrix = hic.fillLowerTriangle(hic.matrix)

    # explicit value
    hic.diagflat(value=1000)
    nt.assert_equal(np.array([1000] * counts.shape[0]),
                    hic.matrix.diagonal())

    # without a value the diagonal is expected to be nan
    hic.diagflat()
    nt.assert_equal(np.array([np.nan] * 5),
                    hic.matrix.diagonal())
예제 #19
0
def test_removePoorRegions(capsys):
    """Call removePoorRegions on a small matrix, tolerating an IndexError."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[-1, 8, 5, 3, 0],
                       [np.nan, 4, 15, 5, 100],
                       [0, 0, 0, 0, 2000],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    # get matrix
    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)

    nt.assert_equal(hic.getMatrix(), counts)

    # NOTE(review): an IndexError is swallowed and nothing is asserted
    # afterwards, so this only checks that no other exception escapes
    try:
        hic.removePoorRegions()
    except IndexError:
        pass
예제 #20
0
def test_reorderBins():
    """Check reorderBins for a permutation and for a subset of the bins."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)

    nt.assert_equal(hic.getMatrix(), counts)

    # swap bins 2 and 3
    swap_order = [0, 1, 3, 2, 4]
    swapped = np.matrix([[1, 8, 3, 5, 0],
                         [0, 4, 5, 15, 1],
                         [0, 0, 0, 0, 1],
                         [0, 0, 0, 0, 2],
                         [0, 0, 0, 0, 0]])

    hic.reorderBins(swap_order)
    nt.assert_equal(hic.getMatrix(), swapped)

    # applying the same swap again gives back the original matrix
    hic.reorderBins(swap_order)
    nt.assert_equal(hic.getMatrix(), counts)

    # order smaller than original matrix should delete unused ids
    hic.reorderBins([2, 3])
    reduced = np.matrix([[0, 0],
                         [0, 0]])

    nt.assert_equal(hic.getMatrix(), reduced)
    nt.assert_equal(hic.matrix.shape, reduced.shape)
    nt.assert_equal(hic.chrBinBoundaries,
                    OrderedDict([('a', (0, 1)), ('b', (1, 2))]))
    nt.assert_equal(hic.cut_intervals, [('a', 20, 30, 1), ('b', 30, 40, 1)])
    nt.assert_equal(hic.nan_bins, [])
예제 #21
0
def test_create_empty_cool_file():
    """Call create_empty_cool_file for a small symmetric matrix.

    NOTE(review): the original author reported this test as failing and
    suspected that the method is never called from anywhere, or that the
    test itself is not written correctly - not verified here.
    """
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('a', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    cool_path = '/tmp/matrix3.cool'

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)
    # make matrix symmetric
    hic.matrix = hm.hiCMatrix.fillLowerTriangle(hic.matrix)

    hic.create_empty_cool_file(cool_path)
예제 #22
0
def test_dist_list_to_dict():
    """dist_list_to_dict groups the data values by their distance."""
    hic = hm.hiCMatrix()

    # first case: distances include -1 next to 0, 10, 20 and 30
    values = np.array([1, 8, 5, 3, 0, 4, 15, 5, 1, 0, 0, 2, 0, 1, 0])
    distances = np.array(
        [0, 10, 20, 30, -1, 0, 10, 20, -1, 0, 10, -1, 0, -1, 0])

    grouped = hic.dist_list_to_dict(values, distances)

    nt.assert_equal(grouped[-1], [0, 1, 2, 1])
    nt.assert_equal(grouped[0], [1, 4, 0, 0, 0])
    nt.assert_equal(grouped[10], [8, 15, 0])
    nt.assert_equal(grouped[20], [5, 5])
    nt.assert_equal(grouped[30], [3])

    # second case: every value equals its own distance
    values = np.array([0, 100, 200, 0, 100, 200, 0, 100, 0])
    distances = np.array([0, 100, 200, 0, 100, 200, 0, 100, 0])

    grouped = hic.dist_list_to_dict(values, distances)

    nt.assert_equal(grouped[0], [0, 0, 0, 0])
    nt.assert_equal(grouped[100], [100, 100, 100])
    nt.assert_equal(grouped[200], [200, 200])
예제 #23
0
def test_truncTrans():
    """Compare the matrix after truncTrans with the expected values."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[-1, 8, 5, 3, 0],
                       [np.nan, 4, 15, 5, 100],
                       [0, 0, 0, 0, 2000],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    # get matrix
    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)

    nt.assert_equal(hic.getMatrix(), counts)

    # expected outcome: the same values as the input, as floats
    expected = np.matrix([[-1., 8., 5., 3., 0.],
                          [np.nan, 4., 15., 5., 1.e+2],
                          [0., 0., 0., 0., 2.e+3],
                          [0., 0., 0., 0., 1.],
                          [0., 0., 0., 0., 0.]])

    hic.truncTrans()
    nt.assert_equal(hic.getMatrix(), expected)

    # reset matrix, this time with small values in the last column
    counts = np.array([[-1, 8, 5, 3, 0],
                       [np.nan, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)

    # method should directly return if nothing to do, matrix stays the same
    hic.truncTrans()
    nt.assert_equal(hic.getMatrix(), counts)
예제 #24
0
def test_printchrtoremove(capsys):
    """Exercise printchrtoremove before and after bins have been masked."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    # get matrix
    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)

    nt.assert_equal(hic.getMatrix(), counts)

    # first call: self.prev_to_remove has not been defined yet
    to_remove = [0, 1]

    # NOTE(review): the statements after the call only run when the call
    # does not raise, and an AssertionError from them would also satisfy
    # pytest.raises(Exception) - confirm which of the two is intended
    with pytest.raises(Exception):
        hic.printchrtoremove(to_remove)

        output = capsys.readouterr()
        assert output.out == "No self.prev_to_remove defined, defining it now."

        nt.assert_equal(hic.prev_to_remove, np.array(to_remove))

    nt.assert_equal(hic.orig_bin_ids, [])

    # also test with masked_bins
    hic.maskBins(to_remove)

    assert len(hic.orig_bin_ids) > 0

    hic.printchrtoremove(to_remove)

    nt.assert_equal(hic.prev_to_remove, np.array(to_remove))
예제 #25
0
def test_reorderChromosomes():
    """Reordering the chromosomes updates chrBinBoundaries accordingly."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)

    # 'b' (two bins) first, then 'a' (three bins)
    hic.reorderChromosomes(['b', 'a'])
    nt.assert_equal(hic.chrBinBoundaries,
                    OrderedDict([('b', (0, 2)), ('a', (2, 5))]))

    # back to the initial order
    hic.reorderChromosomes(['a', 'b'])
    nt.assert_equal(hic.chrBinBoundaries,
                    OrderedDict([('a', (0, 3)), ('b', (3, 5))]))
예제 #26
0
def main(args=None):
    """Plot the mean Hi-C counts per genomic distance for several matrices.

    For every matrix the mean counts per distance are computed (optionally
    per chromosome), scaled so that the matrices are comparable and drawn
    on log-log axes. With ``--perchr`` and more than one matrix, one
    sub-plot per chromosome is made that combines all matrices. The plotted
    values are also written to ``args.outFileData`` when it is given.

    :param args: optional list of command line arguments; passed on to the
                 argument parser.
    """
    args = parse_arguments().parse_args(args)
    mean_dict = OrderedDict()
    matrix_sum = {}
    if args.labels is None:
        labels = OrderedDict([(x, os.path.basename(x)) for x in args.matrices])
    else:
        labels = OrderedDict(zip(args.matrices, args.labels))

    if args.chromosomeExclude is None:
        args.chromosomeExclude = []

    chroms = set()
    for matrix_file in args.matrices:
        hic_ma = HiCMatrix.hiCMatrix(matrix_file)
        matrix_sum[matrix_file] = hic_ma.matrix.sum()

        chrtokeep = [x for x in list(hic_ma.interval_trees) if x not in args.chromosomeExclude]
        hic_ma.keepOnlyTheseChr(chrtokeep)

        mean_dict[matrix_file] = compute_distance_mean(hic_ma, maxdepth=args.maxdepth, perchr=args.perchr)
        chroms = chroms.union([k for k in list(mean_dict[matrix_file]) if len(mean_dict[matrix_file][k]) > 1])

    # compute scale factors such that values are comparable
    min_sum = min(matrix_sum.values())
    scale_factor = {matrix_file: float(min_sum) / mat_sum
                    for matrix_file, mat_sum in iteritems(matrix_sum)}
    log.info("The scale factors used are: {}".format(scale_factor))
    multi_matrix_perchr = bool(args.perchr) and len(args.matrices) > 1
    if multi_matrix_perchr:
        # in this case, for each chromosome a plot is made that combines the data from the
        # hic matrices
        max_cols = 4
        num_rows = int(np.ceil(float(len(chroms)) / max_cols))
        num_cols = min(len(chroms), max_cols)
    else:
        num_cols = num_rows = 1

    if args.plotsize is None:
        width = 6
        height = 4
    else:
        width, height = args.plotsize
    fig = plt.figure(figsize=(width * num_cols, height * num_rows))

    axs = np.empty((num_rows, num_cols), dtype='object')
    for matrix_file in args.matrices:
        idx = 0
        for chrom, mean_values in iteritems(mean_dict[matrix_file]):
            if len(mean_values) <= 1:
                log.debug("No values found for: {}, chromosome: {}\n".format(matrix_file, chrom))
                continue
            # only positive values can be drawn on a log scale; check the
            # number of points before unzipping because unpacking
            # zip(*[]) raises a ValueError
            points = [(k, v) for k, v in iteritems(mean_values) if v > 0]
            if len(points) <= 1:
                log.debug("No values found for: {}, chromosome: {}\n".format(matrix_file, chrom))
                continue
            x, y = zip(*points)
            if args.perchr and len(args.matrices) == 1:
                col = 0
                row = 0
            else:
                col = idx % num_cols
                row = idx // num_cols
            if axs[row, col] is None:
                ax = plt.subplot2grid((num_rows, num_cols), (row, col))
                ax.set_xlabel('genomic distance')
                ax.set_ylabel('corrected Hi-C counts')
                try:
                    ax.set_yscale('log')
                    ax.set_xscale('log')
                except ValueError:
                    continue
            else:
                ax = axs[row, col]
            y = np.array(y) * scale_factor[matrix_file]
            if multi_matrix_perchr:
                label = labels[matrix_file]
                ax.set_title(chrom)
            elif args.perchr:
                label = chrom
            else:
                label = labels[matrix_file]

            ax.plot(x, y, label=label)
            axs[row, col] = ax
            idx += 1
            if args.outFileData is not None:
                if multi_matrix_perchr:
                    # the chromosome gets its own header line in front of
                    # the matrix label
                    args.outFileData.write("#{}\n".format(chrom))
                args.outFileData.write("#{}\n".format(label))
                args.outFileData.write("\t".join(map(str, x)) + "\n")
                args.outFileData.write("\t".join(map(str, y)) + "\n")

    # stays None when nothing was plotted, so savefig still works
    lgd = None
    for ax in axs.reshape(-1):
        if ax is None:
            continue
        ax.legend(prop={'size': 'small'})
        ax.set_xlim(0, args.maxdepth)
        # local name chosen so the `labels` mapping above is not shadowed
        handles, legend_labels = ax.get_legend_handles_labels()
        lgd = ax.legend(handles, legend_labels, loc='center left', bbox_to_anchor=(1, 0.5))

    extra_artists = () if lgd is None else (lgd,)
    plt.tight_layout()
    plt.savefig(args.plotFile.name, bbox_inches='tight', bbox_extra_artists=extra_artists)
    plt.close(fig)
예제 #27
0
def combine_matrices(matrix_list, bplimit=None):
    """
    Combines individual matrices, stored per chromosome into
    one matrix

    The matrices are placed one after another along the diagonal of the
    combined matrix; row, column and nan-bin indices of each matrix are
    shifted by the number of bins of all preceding matrices.

    :param matrix_list: name of the matrices that will be combined into one.
    :param bplimit: To reduce the final file size, counts over the given distance can be removed
    :return: sparse matrix, bin intervals, nan bins, correction factors, distance counts.
             Correction factors and distance counts are None when no
             values were collected for them.
    """

    # Create empty row, col, value for the matrix
    from scipy.sparse import coo_matrix, triu
    new_cut_intervals = []
    row = np.array([]).astype("int")
    col = np.array([]).astype("int")
    values = np.array([])
    new_nan_bins = np.array([]).astype('int')
    new_correction_factors = np.array([])
    new_distance_counts = np.array([])

    # for each chr, append the row, col, value to the first one. Extend the dim
    size = 0
    for matrix_file in matrix_list:
        hic = hm.hiCMatrix(matrix_file)

        # trim matrix if bplimit given
        if bplimit is not None:
            limit = bplimit // hic.getBinSize()
            # difference of two triangles: keeps the diagonals from
            # -limit up to limit - 1
            matrix = (triu(hic.matrix, k=-limit) - triu(hic.matrix, k=limit)).tocoo()
        else:
            matrix = hic.matrix.tocoo()
        num_bins = matrix.shape[0]

        # add data, shifted by the bins of the matrices added so far
        row = np.concatenate([row, matrix.row + size])
        col = np.concatenate([col, matrix.col + size])
        values = np.concatenate([values, matrix.data])
        # asarray makes the shift work for list-typed (also empty) nan_bins
        shifted_nan_bins = np.asarray(hic.nan_bins, dtype=int) + size
        new_nan_bins = np.concatenate([new_nan_bins, shifted_nan_bins])
        new_cut_intervals.extend(hic.cut_intervals)
        size += num_bins

        # also add correction_factors
        if hic.correction_factors is not None:
            new_correction_factors = np.append(new_correction_factors, hic.correction_factors)
        else:
            # add an array with NaNs; np.nan is the canonical spelling,
            # the upper-case alias is not available in newer NumPy releases
            new_correction_factors = np.concatenate(
                [new_correction_factors, np.full(num_bins, np.nan)])
        if hic.distance_counts is not None:
            new_distance_counts = np.concatenate([new_distance_counts, hic.distance_counts])

    final_mat = coo_matrix((values, (row, col)), shape=(size, size)).tocsr()

    assert len(new_cut_intervals) == final_mat.shape[0], \
        "Corrupted matrix file. Matrix size and " \
        "matrix bin definitions do not correspond"

    if len(new_distance_counts) == 0:
        new_distance_counts = None
    if len(new_correction_factors) == 0:
        new_correction_factors = None

    return final_mat, new_cut_intervals, new_nan_bins, new_correction_factors, new_distance_counts
예제 #28
0
def main(args=None):
    """
    Load a Hi-C matrix in the requested input format and save it in the
    requested output format.

    :param args: list of command line arguments handed to the parser returned
                 by ``parse_arguments()``; ``None`` lets argparse read
                 ``sys.argv``.
    :raises SystemExit: with status 1 when ``--chrNameList`` is missing for the
                        lieberman input format or when the output format is
                        not one of the supported ones.
    """
    # local import so that this function does not rely on a module level import
    import sys

    log.debug(args)
    args = parse_arguments().parse_args(args)

    # create hiC matrix with given input format
    if args.inputFormat == 'lieberman':
        # additional file needed for lieberman format
        if args.chrNameList is None:
            log.error("Error: --chrNameList is required when the input format is lieberman.")
            # a bare exit() reports status 0 (success); signal the failure instead
            sys.exit(1)
        hic_ma = hm.hiCMatrix(matrixFile=args.inFile, file_format='lieberman', chrnameList=args.chrNameList)

    elif args.inputFormat == 'npz' and len(args.inFile) > 1:  # assume hicexplorer_multi format
        if args.bplimit:
            log.info("\nCutting maximum matrix depth to {} for saving\n".format(args.bplimit))

        # combine_matrices applies bplimit itself while merging the files
        matrix, cut_intervals, nan_bins, corrections_factors, distance_counts = \
            combine_matrices(args.inFile, bplimit=args.bplimit)
        hic_ma = hm.hiCMatrix()
        hic_ma.setMatrix(matrix, cut_intervals=cut_intervals)

        if len(nan_bins):
            hic_ma.nan_bins = nan_bins
        if corrections_factors is not None:
            hic_ma.correction_factors = corrections_factors
        if distance_counts is not None:
            hic_ma.distance_counts = distance_counts

    else:
        # for cool files a single chromosome is passed to the matrix
        # constructor instead of being filtered after loading
        if args.inputFormat == 'cool' and args.chromosomeOrder is not None and len(args.chromosomeOrder) == 1:
            hic_ma = hm.hiCMatrix(matrixFile=args.inFile[0], file_format=args.inputFormat, chrnameList=args.chromosomeOrder)
        else:
            hic_ma = hm.hiCMatrix(matrixFile=args.inFile[0], file_format=args.inputFormat)

        if args.bplimit:
            from scipy.sparse import triu
            log.info("\nCutting maximum matrix depth to {} for saving\n".format(args.bplimit))

            # keep only the diagonals k with -limit <= k < limit by
            # subtracting the part of the matrix starting at diagonal limit
            limit = args.bplimit // hic_ma.getBinSize()
            hic_ma.matrix = (triu(hic_ma.matrix, k=-limit) - triu(hic_ma.matrix, k=limit)).tocsr()
            hic_ma.matrix.eliminate_zeros()

    # NOTE(review): masked bins are only cleared inside this branch, i.e. for
    # non-cool input with exactly one chromosome in --chromosomeOrder. This
    # looks unintended for --clearMaskedBins; behaviour kept, please confirm.
    if not args.inputFormat == 'cool' and args.chromosomeOrder is not None and len(args.chromosomeOrder) == 1:
        hic_ma.keepOnlyTheseChr(args.chromosomeOrder)

        if args.clearMaskedBins:
            hic_ma.maskBins(hic_ma.nan_bins)

    # make sure that the output file name carries the format as suffix
    if not args.outFileName.endswith(args.outputFormat):
        args.outFileName += "." + args.outputFormat

    # output format -> name of the hiCMatrix method that writes it. The method
    # is resolved lazily so that only the requested writer has to exist.
    save_methods = {
        'dekker': 'save_dekker',
        'ren': 'save_bing_ren',
        'lieberman': 'save_lieberman',
        'npz': 'save_npz',
        'GInteractions': 'save_GInteractions',
        'cool': 'save_cooler',
        'h5': 'save',
    }
    method_name = save_methods.get(args.outputFormat)
    if method_name is None:
        log.error("An error occurred. hicExport aborted!")
        sys.exit(1)

    log.info('saving as {}...'.format(args.outputFormat))
    getattr(hic_ma, method_name)(args.outFileName)
예제 #29
0
def main():
    """
    Parse the command line, load the matrix (and the optional original
    matrix), apply the requested filtering and transformation and save
    the result.
    """
    args = parse_arguments().parse_args()

    matrix_obj = HiCMatrix.hiCMatrix(args.matrix)
    reference = HiCMatrix.hiCMatrix(args.originalMat.name) if args.originalMat else None

    # masking is best effort: an AttributeError raised here is ignored
    try:
        matrix_obj.maskBins(matrix_obj.nan_bins)
    except AttributeError:
        pass

    # drop unwanted chromosomes and, if requested, restrict the
    # matrix to the chromosomes given by the user
    matrix_obj.filterUnwantedChr()
    if args.originalMat:
        reference.filterUnwantedChr()
    if args.chromosomes:
        matrix_obj.keepOnlyTheseChr(args.chromosomes)
        if args.originalMat:
            # bring the original matrix into the same chromosome order
            reference.reorderChromosomes(matrix_obj.chrBinBoundaries.keys())

    if args.skipDiagonal:
        matrix_obj.diagflat()
        if args.originalMat:
            reference.diagflat()

    depth_bins = None
    if args.depth:
        bin_length = matrix_obj.getBinSize()
        if args.depth < bin_length:
            exit("Please specify a depth larger than bin size ({})".format(bin_length))
        depth_bins = int(args.depth / bin_length)
        import scipy.sparse
        # keep only the upper triangle up to (excluding) diagonal depth_bins:
        # the upper triangle starting at depth_bins is subtracted from the
        # full upper triangle
        upper_part = scipy.sparse.triu(matrix_obj.matrix, k=0, format='csr')
        beyond_depth = scipy.sparse.triu(matrix_obj.matrix, k=depth_bins, format='csr')
        matrix_obj.matrix = upper_part - beyond_depth
        matrix_obj.matrix.eliminate_zeros()

    method = args.method
    if method == 'none':
        # no transformation requested
        transformed = matrix_obj.matrix
    elif method == 'obs/exp':
        matrix_obj.convert_to_obs_exp_matrix()
        transformed = matrix_obj.matrix
    elif method == 'pearson':
        sys.stderr.write("\nComputing observed / expected\n")
        matrix_obj.convert_to_obs_exp_matrix()
        sys.stderr.write("\nComputing pearson\n")
        transformed = getPearson(matrix_obj.matrix)
    else:
        # the normalized and the original matrix must have the same size
        if reference:
            same_shape = np.all(matrix_obj.matrix.shape == reference.matrix.shape)
            assert same_shape, "original and derived matrices do not have same shape"
        transformed = transformMatrix(matrix_obj,
                                      method,
                                      per_chr=args.perChromosome,
                                      original_matrix=reference,
                                      depth_in_bins=depth_bins)

    if args.applyFdr:
        transformed = applyFdr(transformed)

    matrix_obj.setMatrixValues(transformed)
    matrix_obj.restoreMaskedBins()

    # dekker has its own writer, every other format uses the default save
    writer = matrix_obj.save_dekker if args.outFormat == 'dekker' else matrix_obj.save
    writer(args.outFileName)
예제 #30
0
    def __init__(self, *args, **kwargs):
        """
        Set up the track: load the Hi-C matrix named in
        ``self.properties['file']``, validate it against the requested
        transformation, trim it to the plotted depth and prepare the colormap.

        All positional and keyword arguments are forwarded unchanged to the
        parent class constructor. The process exits with status 1 if the
        matrix is empty or contains values that the requested ``transform``
        does not accept.
        """
        super(HiCMatrixTrack, self).__init__(*args, **kwargs)

        if self.properties['file'].endswith('.cool'):
            # just init the cooler matrix.
            self.hic_ma = HiCMatrix.hiCMatrix(self.properties['file'],
                                              color_only_init=True)
        else:
            self.hic_ma = HiCMatrix.hiCMatrix(self.properties['file'])

        # a matrix without any stored value can not be plotted
        if len(self.hic_ma.matrix.data) == 0:
            self.log.error("Matrix {} is empty".format(
                self.properties['file']))
            exit(1)
        # masked (nan) bins are removed unless show_masked_bins is 'yes'
        if 'show_masked_bins' in self.properties and self.properties[
                'show_masked_bins'] == 'yes':
            pass
        else:
            self.hic_ma.maskBins(self.hic_ma.nan_bins)

        # check that the matrix can be log transformed
        if 'transform' in self.properties:
            if self.properties['transform'] == 'log1p':
                # NOTE(review): this only triggers for values below -1,
                # although the message speaks of negative values
                if self.hic_ma.matrix.data.min() + 1 < 0:
                    self.log.error(
                        "\n*ERROR*\nMatrix contains negative values.\n"
                        "log1p transformation can not be applied to \n"
                        "values in matrix: {}".format(self.properties['file']))
                    exit(1)

            elif self.properties['transform'] == '-log':
                # any stored value below zero aborts the program
                if self.hic_ma.matrix.data.min() < 0:
                    self.log.error(
                        "\n*ERROR*\nMatrix contains negative values.\n"
                        "log(-1 * <values>) transformation can not be applied to \n"
                        "values in matrix: {}".format(self.properties['file']))
                    exit(1)

            elif self.properties['transform'] == 'log':
                # any stored value below zero aborts the program
                if self.hic_ma.matrix.data.min() < 0:
                    self.log.error(
                        "\n*ERROR*\nMatrix contains negative values.\n"
                        "log transformation can not be applied to \n"
                        "values in matrix: {}".format(self.properties['file']))
                    exit(1)

        # replace the bin intervals by the enlarged ones and rebuild the
        # interval trees and chromosome bin boundaries from them
        new_intervals = hicexplorer.utilities.enlarge_bins(
            self.hic_ma.cut_intervals)
        self.hic_ma.interval_trees, self.hic_ma.chrBinBoundaries = \
            self.hic_ma.intervalListToIntervalTree(new_intervals)

        self.hic_ma.cut_intervals = new_intervals
        binsize = self.hic_ma.getBinSize()
        # the 'depth' property is read without a presence check
        max_depth_in_bins = int(self.properties['depth'] / binsize)

        # work only with the upper triangle of the matrix
        # and remove all pixels that are beyond
        # 2 * max_depth_in_bins, which are not required
        # (this is done by subtracting a second sparse matrix
        # that contains only the part of the upper triangle
        # that is to be removed).
        limit = 2 * max_depth_in_bins
        self.hic_ma.matrix = scipy.sparse.triu(self.hic_ma.matrix, k=0, format='csr') - \
            scipy.sparse.triu(self.hic_ma.matrix, k=limit, format='csr')
        self.hic_ma.matrix.eliminate_zeros()

        # fill the main diagonal when its sum is zero,
        # otherwise the plot does not look good.
        # The main diagonal is filled with the max
        # value found in the matrix
        if sum(self.hic_ma.matrix.diagonal()) == 0:
            self.log.info(
                "Filling main diagonal with max value because it empty and looks bad...\n"
            )
            max_value = self.hic_ma.matrix.data.max()
            main_diagonal = scipy.sparse.dia_matrix(
                ([max_value] * self.hic_ma.matrix.shape[0], [0]),
                shape=self.hic_ma.matrix.shape)
            self.hic_ma.matrix = self.hic_ma.matrix + main_diagonal

        # the plot is only inverted when orientation is 'inverted'
        self.plot_inverted = False
        if 'orientation' in self.properties and self.properties[
                'orientation'] == 'inverted':
            self.plot_inverted = True

        self.norm = None

        # fall back to the default colormap if none was configured
        if 'colormap' not in self.properties:
            self.properties['colormap'] = DEFAULT_MATRIX_COLORMAP
        self.cmap = cm.get_cmap(self.properties['colormap'])
        self.cmap.set_bad('white')

        # NOTE(review): this second call overrides the 'white' set just above,
        # so 'black' is the effective value; confirm which one is intended
        self.cmap.set_bad('black')