def test_convert_to_zscore_matrix_2():
    """Compare ``convert_to_zscore_matrix`` with a brute-force z-score.

    The reference z-score is computed entry by entry on the dense matrix:
    values are grouped by bin distance (up to ``max_depth``), the mean and
    standard deviation of each group are taken, and every entry is
    normalised with the statistics of its own distance. Only the main
    diagonal is compared with the result of the class method.
    """
    from numpy.testing import assert_almost_equal

    # load test matrix
    hic = hm.hiCMatrix(ROOT + '/Li_et_al_2015.h5')
    hic.maskBins(hic.nan_bins)

    mat = hic.matrix.todense()
    max_depth = 10000
    bin_size = hic.getBinSize()
    max_depth_in_bins = int(float(max_depth) / bin_size)
    m_size = mat.shape[0]

    # only the bin start positions are needed to derive distances
    _chrom, start, _end, _extra = zip(
        *hm.hiCMatrix.fit_cut_intervals(hic.cut_intervals))

    def bin_distance(i, j):
        # dist is translated to bins
        return int(float(start[j] - start[i]) / bin_size)

    # compute matrix values per distance (upper triangle only)
    dist_values = {}
    sys.stderr.write("Computing values per distance for each matrix entry\n")
    for _i in range(m_size):
        for _j in range(_i, m_size):
            dist = bin_distance(_i, _j)
            if dist <= max_depth_in_bins:
                dist_values.setdefault(dist, []).append(mat[_i, _j])

    # ``items()`` works on Python 2 and 3; ``iteritems()`` is Python 2 only
    mu = {}
    std = {}
    for dist, values in dist_values.items():
        mu[dist] = np.mean(values)
        std[dist] = np.std(values)

    # compute z-score for test matrix
    sys.stderr.write("Computing zscore for each matrix entry\n")
    zscore_mat = np.full((m_size, m_size), np.nan)
    for _i in range(m_size):
        for _j in range(_i, m_size):
            dist = bin_distance(_i, _j)
            if dist <= max_depth_in_bins:
                zscore_mat[_i, _j] = (mat[_i, _j] - mu[dist]) / std[dist]

    # compare with zscore from class
    hic.convert_to_zscore_matrix(maxdepth=max_depth)

    # only the main diagonal is checked. Other diagonals show minimal
    # differences
    assert_almost_equal(hic.matrix.todense().diagonal(0).A1,
                        zscore_mat.diagonal(0))
def main(args=None):
    """Combine two Hi-C matrices with 'diff', 'ratio' or 'log2ratio'.

    Both matrices are normalised by their total sum before the operation.
    The union of the NaN bins of both inputs is masked in the result, which
    is saved to ``args.outFileName``.

    :param args: command line arguments; ``None`` lets argparse read
                 ``sys.argv``.
    """
    args = parse_arguments().parse_args(args)

    if args.operation not in ['diff', 'ratio', 'log2ratio']:
        exit("Operation not found. Please use 'diff', 'ratio' or 'log2ratio'.")

    hic1 = hm.hiCMatrix(args.matrices[0])
    hic2 = hm.hiCMatrix(args.matrices[1])

    if hic1.matrix.shape != hic2.matrix.shape:
        # the trailing space after "using" keeps the two literals from
        # being glued together as "usingthe"
        exit("The two matrices have different size. Use matrices having the same resolution and created using "
             "the same parameters. Check the matrix values using the tool `hicInfo`.")

    if hic1.chrBinBoundaries != hic2.chrBinBoundaries:
        exit("The two matrices have different chromosome order. Use the tool `hicExport` to change the order.\n"
             "{}: {}\n"
             "{}: {}".format(args.matrices[0], hic1.chrBinBoundaries.keys(),
                             args.matrices[1], hic2.chrBinBoundaries.keys()))

    # normalize by total matrix sum
    hic1.matrix.data = hic1.matrix.data.astype(float) / hic1.matrix.data.sum()
    hic2.matrix.data = hic2.matrix.data.astype(float) / hic2.matrix.data.sum()

    nan_bins = set(hic1.nan_bins).union(hic2.nan_bins)

    if args.operation == 'diff':
        new_matrix = hic1.matrix - hic2.matrix
    else:
        # 'ratio' and 'log2ratio': multiply by the element-wise reciprocal
        # of the stored values of the second matrix
        hic2.matrix.data = float(1) / hic2.matrix.data
        new_matrix = hic1.matrix.multiply(hic2.matrix)
        # just in case
        new_matrix.eliminate_zeros()
        if args.operation == 'log2ratio':
            new_matrix.data = np.log2(new_matrix.data)
            new_matrix.eliminate_zeros()

    hic1.setMatrixValues(new_matrix)
    hic1.maskBins(sorted(nan_bins))
    hic1.save(args.outFileName)
def test_hic_transfer_all():
    """Run hicTransform with ``--method all`` and check every output file.

    One output per method is expected next to the requested output file,
    named ``<method>_<basename>``; each is compared with its reference
    matrix and removed afterwards.
    """
    tmp = NamedTemporaryFile(suffix='all.h5', delete=False)
    tmp.close()

    cli = "--matrix {} --outFileName {} --method all".format(
        original_matrix, tmp.name).split()
    hicTransform.main(cli)

    out_dir = dirname(tmp.name)
    out_base = basename(tmp.name)

    for method in ("obs_exp", "pearson", "covariance"):
        expected = hm.hiCMatrix(ROOT + "hicTransform/" + method + "_small_50kb.h5")
        produced_path = out_dir + "/" + method + "_" + out_base
        produced = hm.hiCMatrix(produced_path)
        nt.assert_array_almost_equal(expected.matrix.data, produced.matrix.data)
        os.unlink(produced_path)

    os.unlink(tmp.name)
def test_intervalListToIntervalTree(capsys):
    """Exercise ``intervalListToIntervalTree`` with an empty and a valid list."""
    # get matrix
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    dense = np.array([[1, 8, 5, 3, 0],
                      [0, 4, 15, 5, 1],
                      [0, 0, 0, 0, 2],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]])
    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(dense)
    hic.setMatrix(hic.matrix, bins)
    nt.assert_equal(hic.getMatrix(), dense)

    # empty list should raise AssertionError
    with pytest.raises(AssertionError):
        hic.intervalListToIntervalTree([])
        # NOTE(review): the original layout was lost; these two lines are
        # assumed to sit inside the ``with`` block, where they only run if
        # the call above does not raise - confirm against the real file.
        captured = capsys.readouterr()
        assert captured.out == "Interval list is empty"

    # test with correct interval_list
    valid_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('b', 20, 30, 1),
                       ('b', 30, 50, 1), ('b', 50, 100, 1), ('c', 100, 200, 1),
                       ('c', 200, 210, 1), ('d', 210, 220, 1), ('e', 220, 250)]
    tree, boundaries = hic.intervalListToIntervalTree(valid_intervals)

    # test tree: one IntervalTree per chromosome, data is the running bin id
    expected_trees = [
        ('a', [Interval(0, 10, 0), Interval(10, 20, 1)]),
        ('b', [Interval(20, 30, 2), Interval(30, 50, 3), Interval(50, 100, 4)]),
        ('c', [Interval(100, 200, 5), Interval(200, 210, 6)]),
        ('d', [Interval(210, 220, 7)]),
        ('e', [Interval(220, 250, 8)]),
    ]
    for chrom_name, intervals in expected_trees:
        nt.assert_equal(tree[chrom_name], IntervalTree(intervals))

    # test boundaries
    expected_boundaries = OrderedDict([('a', (0, 2)), ('b', (2, 5)),
                                       ('c', (5, 7)), ('d', (7, 8)),
                                       ('e', (8, 9))])
    nt.assert_equal(boundaries, expected_boundaries)
def test_build_matrix():
    """Build a matrix from the two test SAM files and compare all outputs.

    Checks the matrix values and bins against the reference matrix, the QC
    log and QC folder listing against the reference QC folder, and the size
    of the written BAM file (within 1000 bytes of the reference).
    """
    h5_out = NamedTemporaryFile(suffix='.h5', delete=False)
    h5_out.close()
    qc_dir = mkdtemp(prefix="testQC_")

    cli = "-s {} {} --outFileName {} -bs 5000 -b /tmp/test.bam --QCfolder {} --threads 4".format(
        sam_R1, sam_R2, h5_out.name, qc_dir).split()
    hicBuildMatrix.main(cli)

    expected = hm.hiCMatrix(ROOT + "small_test_matrix_parallel.h5")
    produced = hm.hiCMatrix(h5_out.name)
    nt.assert_equal(expected.matrix.data, produced.matrix.data)
    nt.assert_equal(expected.cut_intervals, produced.cut_intervals)

    print(set(os.listdir(ROOT + "QC/")))
    assert are_files_equal(ROOT + "QC/QC.log", qc_dir + "/QC.log")
    assert set(os.listdir(ROOT + "QC/")) == set(os.listdir(qc_dir))

    # the BAM files are only compared by size
    reference_bam_size = os.path.getsize(ROOT + "small_test_matrix_result.bam")
    produced_bam_size = os.path.getsize("/tmp/test.bam")
    assert abs(reference_bam_size - produced_bam_size) < 1000

    os.unlink(h5_out.name)
    shutil.rmtree(qc_dir)
    os.unlink("/tmp/test.bam")
def main():
    """Sum all matrices given on the command line and save the result.

    The first matrix provides the bin definitions. The union of the NaN
    bins of all inputs is masked in the summed matrix before it is written
    to ``args.outFileName``. The program exits with status 1 if a matrix
    cannot be added.
    """
    args = parse_arguments().parse_args()

    hic = hm.hiCMatrix(args.matrices[0])
    summed_matrix = hic.matrix
    nan_bins = set(hic.nan_bins)

    for matrix in args.matrices[1:]:
        hic_to_append = hm.hiCMatrix(matrix)
        try:
            summed_matrix = summed_matrix + hic_to_append.matrix
            if len(hic_to_append.nan_bins):
                nan_bins = nan_bins.union(hic_to_append.nan_bins)
        except Exception:
            # ``except Exception`` instead of a bare ``except`` so that
            # KeyboardInterrupt and SystemExit are not swallowed; the
            # parenthesised print works on Python 2 and 3
            print("\nMatrix {} seems to be corrupted or of different "
                  "shape".format(matrix))
            exit(1)

    # save only the upper triangle of the
    # symmetric matrix
    hic.setMatrixValues(summed_matrix)
    hic.maskBins(sorted(nan_bins))
    hic.save(args.outFileName)
def getViewpointValues(pMatrix, pReferencePoint, pChromViewpoint, pRegion_start, pRegion_end, pInteractionList=None, pChromosome=None):
    """Accumulate the matrix values between a reference point and a region.

    :param pMatrix: matrix file passed to ``hm.hiCMatrix``.
    :param pReferencePoint: ``(chrom, position)`` or ``(chrom, start, end)``;
                            any other length logs an error and exits.
    :param pChromViewpoint: chromosome of the region to scan.
    :param pRegion_start: start of the region to scan.
    :param pRegion_end: end of the region to scan.
    :param pInteractionList: if not ``None``, the individual interactions
                             are collected as well.
    :param pChromosome: if not ``None``, the matrix is restricted to these
                        chromosomes first.
    :return: list ``[view_point_start, view_point_end, view_point_range,
             data_list, interactions_list]``; ``interactions_list`` is
             ``None`` unless ``pInteractionList`` was given.
    """
    hic = hm.hiCMatrix(pMatrix)
    if pChromosome is not None:
        hic.keepOnlyTheseChr(pChromosome)

    n_fields = len(pReferencePoint)
    if n_fields not in (2, 3):
        log.error("No valid reference point given. {}".format(pReferencePoint))
        exit(1)
    # with two fields the single position is used as both start and end
    view_point_start, view_point_end = hic.getRegionBinRange(
        pReferencePoint[0], int(pReferencePoint[1]),
        int(pReferencePoint[n_fields - 1]))

    view_point_range = hic.getRegionBinRange(pChromViewpoint, pRegion_start,
                                             pRegion_end)
    first_bin, last_bin = view_point_range[0], view_point_range[1]
    data_list = np.zeros(last_bin - first_bin)

    interactions_list = [] if pInteractionList is not None else None

    for ref_bin in range(view_point_start, view_point_end + 1):
        chrom, start, end, _ = hic.getBinPos(ref_bin)
        for offset, target_bin in enumerate(range(first_bin, last_bin)):
            count = hic.matrix[ref_bin, target_bin]
            data_list[offset] += count
            if interactions_list is not None:
                chrom_second, start_second, end_second, _ = hic.getBinPos(target_bin)
                interactions_list.append((chrom, start, end, chrom_second,
                                          start_second, end_second, count))

    return [view_point_start, view_point_end, view_point_range, data_list,
            interactions_list]
def test_setCorrectionFactors_fail():
    """Four correction factors for a 5x5 matrix must raise AssertionError."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    dense = np.array([[1, 8, 5, 3, 0],
                      [0, 4, 15, 5, 1],
                      [0, 0, 0, 0, 2],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(dense)
    hic.setMatrix(hic.matrix, bins)

    assert hic.correction_factors is None

    too_few_factors = [5, 5, 5, 5]
    with pytest.raises(AssertionError):
        hic.setCorrectionFactors(too_few_factors)
def test_reorderChromosomes_fail():
    """Reordering with an unknown chromosome name must exit the program."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    dense = np.array([[1, 8, 5, 3, 0],
                      [0, 4, 15, 5, 1],
                      [0, 0, 0, 0, 2],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(dense)
    hic.setMatrix(hic.matrix, bins)

    # name 'c' not in chromosome names, thus fail
    unknown_order = ['a', 'b', 'c']
    with pytest.raises(SystemExit):
        hic.reorderChromosomes(unknown_order)
def get_test_matrix(cut_intervals=None, matrix=None):
    """Return a small ``hiCMatrix`` for tests.

    :param cut_intervals: bin definitions; a default list of five
                          single-bin intervals is used when empty or None.
    :param matrix: dense matrix; a default 6x6 upper triangular matrix is
                   used when ``None``.
    :return: hiCMatrix whose matrix is cut to ``len(cut_intervals)`` rows
             and columns.
    """
    if matrix is None:
        matrix = np.array([[1, 8, 5, 3, 0, 8],
                           [0, 4, 15, 5, 1, 7],
                           [0, 0, 0, 7, 2, 8],
                           [0, 0, 0, 0, 1, 5],
                           [0, 0, 0, 0, 0, 6],
                           [0, 0, 0, 0, 0, 0]])

    # make matrix symmetric
    # NOTE(review): the original layout was lost; this step is assumed to
    # apply to caller-supplied matrices as well - confirm.
    symmetric = csr_matrix(matrix + matrix.T)

    if not cut_intervals:
        cut_intervals = [('c-0', 0, 1, 1), ('c-1', 0, 1, 1),
                         ('c-2', 0, 1, 1), ('c-4', 0, 1, 1),
                         ('c-4', 0, 1, 1)]
    n_bins = len(cut_intervals)

    hic = HiCMatrix.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(symmetric[0:n_bins, 0:n_bins])
    hic.setMatrix(hic.matrix, cut_intervals)
    return hic
def test_setCorrectionFactors_success():
    """Five correction factors for a 5x5 matrix are stored unchanged."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    dense = np.array([[1, 8, 5, 3, 0],
                      [0, 4, 15, 5, 1],
                      [0, 0, 0, 0, 2],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(dense)
    hic.setMatrix(hic.matrix, bins)

    assert hic.correction_factors is None

    factors = [5, 5, 5, 5, 5]
    hic.setCorrectionFactors(factors)
    nt.assert_equal(hic.correction_factors, [5, 5, 5, 5, 5])
def main(args=None):
    """Convert a matrix to obs/exp or z-score values and save it.

    ``args.method == 'obs/exp'`` selects the observed/expected conversion;
    every other method value selects the z-score conversion.

    :param args: command line arguments; ``None`` lets argparse read
                 ``sys.argv``.
    """
    args = parse_arguments().parse_args(args)
    hic_ma = HiCMatrix.hiCMatrix(args.matrix)

    try:
        hic_ma.maskBins(hic_ma.nan_bins)
    except AttributeError:
        # best effort: masking is skipped when it raises AttributeError
        pass

    if args.skipDiagonal:
        hic_ma.diagflat()

    convert = (hic_ma.convert_to_obs_exp_matrix
               if args.method == 'obs/exp'
               else hic_ma.convert_to_zscore_matrix)
    convert(maxdepth=args.depth, perchr=args.perchr)

    hic_ma.save(args.outFileName)
def test_setMatrixValues_fail():
    """Setting a 4x4 matrix on a 5x5 hiCMatrix must raise AssertionError."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    dense = np.array([[1, 8, 5, 3, 0],
                      [0, 4, 15, 5, 1],
                      [0, 0, 0, 0, 2],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(dense)
    hic.setMatrix(hic.matrix, bins)

    # one row and one column fewer than the matrix set above
    smaller = np.array([[10, 80, 50, 30],
                        [0, 40, 150, 50],
                        [0, 0, 0, 0],
                        [0, 0, 0, 0]])
    with pytest.raises(AssertionError):
        hic.setMatrixValues(smaller)
def test_restoreMaskedBins():
    """Mask two bins, restore them and check the NaN-filled result."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    dense = np.array([[1, 8, 5, 3, 0],
                      [0, 4, 15, 5, 1],
                      [0, 0, 0, 0, 2],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(dense)
    hic.setMatrix(hic.matrix, bins)
    nt.assert_equal(hic.getMatrix(), dense)
    nt.assert_equal(hic.orig_bin_ids, [])

    # function should directly return if there are no masked_bins
    hic.restoreMaskedBins()
    nt.assert_equal(hic.getMatrix(), dense)
    nt.assert_equal(hic.orig_bin_ids, [])

    # test general use: first get some masked bins
    hic.maskBins([0, 1])
    after_masking = np.matrix([[0, 0, 2],
                               [0, 0, 1],
                               [0, 0, 0]])
    nt.assert_equal(hic.getMatrix(), after_masking)
    nt.assert_equal(sorted(hic.orig_bin_ids), sorted([0, 1, 2, 3, 4]))

    # and now restore masked bins
    hic.restoreMaskedBins()
    nan = np.nan
    after_restore = np.matrix([[nan, nan, nan, nan, nan],
                               [nan, nan, nan, nan, nan],
                               [nan, nan, 0, 0, 2],
                               [nan, nan, 0, 0, 1],
                               [nan, nan, 0, 0, 0]])
    nt.assert_equal(hic.getMatrix(), after_restore)
    nt.assert_equal(hic.orig_bin_ids, [])
def test_truncTrans_bk(capsys):
    """Call ``truncTrans_bk`` on a small matrix; a TypeError is tolerated."""
    # get matrix
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    dense = np.array([[-1, 8, 5, 3, 0],
                      [np.nan, 4, 15, 5, 100],
                      [0, 0, 0, 0, 2000],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(dense)
    hic.setMatrix(hic.matrix, bins)
    nt.assert_equal(hic.getMatrix(), dense)

    try:
        hic.truncTrans_bk()
    except TypeError:
        # the test accepts a TypeError from the method
        pass
def test_setMatrixValues_success():
    """A replacement matrix of the same shape is returned by ``getMatrix``."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    dense = np.array([[1, 8, 5, 3, 0],
                      [0, 4, 15, 5, 1],
                      [0, 0, 0, 0, 2],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(dense)
    hic.setMatrix(hic.matrix, bins)

    replacement = np.array([[10, 80, 50, 30, 0],
                            [0, 40, 150, 50, 10],
                            [0, 0, 0, 0, 20],
                            [0, 0, 0, 0, 10],
                            [0, 0, 0, 0, 0]])
    hic.setMatrixValues(replacement)
    nt.assert_equal(hic.getMatrix(), replacement)
def test_filterOutInterChrCounts():
    """Counts between chromosomes 'a' and 'b' are zeroed after filtering."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    dense = np.array([[1, 8, 5, 3, 0],
                      [0, 4, 15, 5, 1],
                      [0, 0, 0, 0, 2],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(dense)
    hic.setMatrix(hic.matrix, bins)

    hic.matrix = hic.fillLowerTriangle(hic.matrix)
    hic.filterOutInterChrCounts()

    # bins 0-2 belong to 'a', bins 3-4 to 'b'
    expected = np.matrix([[1, 8, 5, 0, 0],
                          [8, 4, 15, 0, 0],
                          [5, 15, 0, 0, 0],
                          [0, 0, 0, 0, 1],
                          [0, 0, 0, 1, 0]])
    nt.assert_equal(hic.getMatrix(), expected)
def test_diagflat():
    """``diagflat`` fills the diagonal with the given value, NaN by default."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('a', 30, 40, 1), ('b', 40, 50, 1)]
    dense = np.array([[1, 8, 5, 3, 0],
                      [0, 4, 15, 5, 1],
                      [0, 0, 0, 0, 2],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(dense)
    hic.setMatrix(hic.matrix, bins)
    hic.matrix = hic.fillLowerTriangle(hic.matrix)

    hic.diagflat(value=1000)
    nt.assert_equal(np.array([1000] * dense.shape[0]), hic.matrix.diagonal())

    hic.diagflat()
    nt.assert_equal(np.array([np.nan] * 5), hic.matrix.diagonal())
def test_removePoorRegions(capsys):
    """Call ``removePoorRegions`` on a small matrix; an IndexError is tolerated."""
    # get matrix
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    dense = np.array([[-1, 8, 5, 3, 0],
                      [np.nan, 4, 15, 5, 100],
                      [0, 0, 0, 0, 2000],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(dense)
    hic.setMatrix(hic.matrix, bins)
    nt.assert_equal(hic.getMatrix(), dense)

    # removePoorRegions
    try:
        hic.removePoorRegions()
    except IndexError:
        # the test accepts an IndexError from the method
        pass
def test_reorderBins():
    """Reorder bins twice (round trip), then reduce the matrix to two bins."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    dense = np.array([[1, 8, 5, 3, 0],
                      [0, 4, 15, 5, 1],
                      [0, 0, 0, 0, 2],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(dense)
    hic.setMatrix(hic.matrix, bins)
    nt.assert_equal(hic.getMatrix(), dense)

    # swapping bins 2 and 3 twice must give back the original matrix
    swap_2_and_3 = [0, 1, 3, 2, 4]
    swapped = np.matrix([[1, 8, 3, 5, 0],
                         [0, 4, 5, 15, 1],
                         [0, 0, 0, 0, 1],
                         [0, 0, 0, 0, 2],
                         [0, 0, 0, 0, 0]])
    hic.reorderBins(swap_2_and_3)
    nt.assert_equal(hic.getMatrix(), swapped)

    hic.reorderBins(swap_2_and_3)
    nt.assert_equal(hic.getMatrix(), dense)

    # order smaller than original matrix should delete unused ids
    hic.reorderBins([2, 3])
    reduced = np.matrix([[0, 0],
                         [0, 0]])
    nt.assert_equal(hic.getMatrix(), reduced)
    nt.assert_equal(hic.matrix.shape, reduced.shape)
    nt.assert_equal(hic.chrBinBoundaries,
                    OrderedDict([('a', (0, 1)), ('b', (1, 2))]))
    nt.assert_equal(hic.cut_intervals, [('a', 20, 30, 1), ('b', 30, 40, 1)])
    nt.assert_equal(hic.nan_bins, [])
def test_create_empty_cool_file():
    """
    Call ``create_empty_cool_file`` on a small symmetric matrix.

    Note kept from the original author: the test fails, the function does
    not seem to be called from anywhere, and the test itself may not be
    written correctly.
    """
    cool_path = '/tmp/matrix3.cool'
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('a', 30, 40, 1), ('b', 40, 50, 1)]
    dense = np.array([[1, 8, 5, 3, 0],
                      [0, 4, 15, 5, 1],
                      [0, 0, 0, 0, 2],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(dense)
    hic.setMatrix(hic.matrix, bins)

    # make matrix symmetric
    hic.matrix = hm.hiCMatrix.fillLowerTriangle(hic.matrix)

    hic.create_empty_cool_file(cool_path)
def test_dist_list_to_dict():
    """``dist_list_to_dict`` groups data values by their distance key."""
    hic = hm.hiCMatrix()

    # first case: distance -1 is used as a key as well
    data = np.array([1, 8, 5, 3, 0, 4, 15, 5, 1, 0, 0, 2, 0, 1, 0])
    dist_list = np.array(
        [0, 10, 20, 30, -1, 0, 10, 20, -1, 0, 10, -1, 0, -1, 0])
    grouped = hic.dist_list_to_dict(data, dist_list)

    first_expected = [(-1, [0, 1, 2, 1]),
                      (0, [1, 4, 0, 0, 0]),
                      (10, [8, 15, 0]),
                      (20, [5, 5]),
                      (30, [3])]
    for key, values in first_expected:
        nt.assert_equal(grouped[key], values)

    # second case: data and distances are the same array
    data = np.array([0, 100, 200, 0, 100, 200, 0, 100, 0])
    dist_list = np.array([0, 100, 200, 0, 100, 200, 0, 100, 0])
    grouped = hic.dist_list_to_dict(data, dist_list)

    second_expected = [(0, [0, 0, 0, 0]),
                       (100, [100, 100, 100]),
                       (200, [200, 200])]
    for key, values in second_expected:
        nt.assert_equal(grouped[key], values)
def test_truncTrans():
    """``truncTrans`` leaves both test matrices unchanged."""
    # get matrix
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    with_large_values = np.array([[-1, 8, 5, 3, 0],
                                  [np.nan, 4, 15, 5, 100],
                                  [0, 0, 0, 0, 2000],
                                  [0, 0, 0, 0, 1],
                                  [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(with_large_values)
    hic.setMatrix(hic.matrix, bins)
    nt.assert_equal(hic.getMatrix(), with_large_values)

    # define expected outcome
    expected = np.matrix([[-1., 8., 5., 3., 0.],
                          [np.nan, 4., 15., 5., 1.e+2],
                          [0., 0., 0., 0., 2.e+3],
                          [0., 0., 0., 0., 1.],
                          [0., 0., 0., 0., 0.]])

    # truncTrans of matrix, then test against expected outcome
    hic.truncTrans()
    nt.assert_equal(hic.getMatrix(), expected)

    # reset matrix
    small_values = np.array([[-1, 8, 5, 3, 0],
                             [np.nan, 4, 15, 5, 1],
                             [0, 0, 0, 0, 2],
                             [0, 0, 0, 0, 1],
                             [0, 0, 0, 0, 0]])
    hic.matrix = csr_matrix(small_values)
    hic.setMatrix(hic.matrix, bins)

    # method should directly return if nothing to do, matrix stays the same
    hic.truncTrans()
    nt.assert_equal(hic.getMatrix(), small_values)
def test_printchrtoremove(capsys):
    """Call ``printchrtoremove`` without and with masked bins present."""
    # get matrix
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    dense = np.array([[1, 8, 5, 3, 0],
                      [0, 4, 15, 5, 1],
                      [0, 0, 0, 0, 2],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(dense)
    hic.setMatrix(hic.matrix, bins)
    nt.assert_equal(hic.getMatrix(), dense)

    # first test exception message for no self.prev_to_remove
    to_remove = [0, 1]
    with pytest.raises(Exception):
        hic.printchrtoremove(to_remove)
        # NOTE(review): the original layout was lost; the next three
        # statements are assumed to sit inside the ``with`` block, so an
        # assertion failing here also satisfies ``pytest.raises`` -
        # confirm against the real file.
        captured = capsys.readouterr()
        assert captured.out == "No self.prev_to_remove defined, defining it now."
        nt.assert_equal(hic.prev_to_remove, np.array(to_remove))

    nt.assert_equal(hic.orig_bin_ids, [])

    # also test with masked_bins
    hic.maskBins(to_remove)
    assert len(hic.orig_bin_ids) > 0

    hic.printchrtoremove(to_remove)
    nt.assert_equal(hic.prev_to_remove, np.array(to_remove))
def test_reorderChromosomes():
    """Swap the chromosome order and swap it back, checking the boundaries."""
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    dense = np.array([[1, 8, 5, 3, 0],
                      [0, 4, 15, 5, 1],
                      [0, 0, 0, 0, 2],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(dense)
    hic.setMatrix(hic.matrix, bins)

    # (requested order, expected bin boundaries afterwards)
    scenarios = [
        (['b', 'a'], OrderedDict([('b', (0, 2)), ('a', (2, 5))])),
        (['a', 'b'], OrderedDict([('a', (0, 3)), ('b', (3, 5))])),
    ]
    for chrom_order, expected_boundaries in scenarios:
        hic.reorderChromosomes(chrom_order)
        nt.assert_equal(hic.chrBinBoundaries, expected_boundaries)
def main(args=None):
    """
    Plot, for each matrix (and optionally each chromosome), the mean
    Hi-C counts per genomic distance.

    The per-distance means come from ``compute_distance_mean``. They are
    multiplied by a per-matrix scale factor (smallest matrix sum divided by
    the matrix sum) so that several matrices are comparable, and drawn on
    log-log axes. If ``args.outFileData`` is given, the plotted x and y
    values are also written there as tab separated lines.

    :param args: command line arguments; ``None`` lets argparse read
                 ``sys.argv``.
    """
    args = parse_arguments().parse_args(args)

    if args.labels is None:
        labels = OrderedDict([(x, os.path.basename(x)) for x in args.matrices])
    else:
        labels = OrderedDict(zip(args.matrices, args.labels))

    if args.chromosomeExclude is None:
        args.chromosomeExclude = []

    mean_dict = OrderedDict()
    matrix_sum = {}
    chroms = set()
    for matrix_file in args.matrices:
        hic_ma = HiCMatrix.hiCMatrix(matrix_file)
        matrix_sum[matrix_file] = hic_ma.matrix.sum()
        chrtokeep = [x for x in list(hic_ma.interval_trees)
                     if x not in args.chromosomeExclude]
        hic_ma.keepOnlyTheseChr(chrtokeep)
        mean_dict[matrix_file] = compute_distance_mean(hic_ma, maxdepth=args.maxdepth, perchr=args.perchr)
        chroms = chroms.union([k for k in list(mean_dict[matrix_file])
                               if len(mean_dict[matrix_file][k]) > 1])

    # compute scale factors such that values are comparable
    min_sum = min(matrix_sum.values())
    scale_factor = {matrix_file: float(min_sum) / mat_sum
                    for matrix_file, mat_sum in iteritems(matrix_sum)}
    log.info("The scale factors used are: {}".format(scale_factor))

    several_matrices = len(args.matrices) > 1
    if several_matrices and args.perchr:
        # in this case, for each chromosome a plot is made that combines
        # the data from the hic matrices
        max_cols = 4
        num_rows = int(np.ceil(float(len(chroms)) / max_cols))
        num_cols = min(len(chroms), max_cols)
    else:
        num_cols = num_rows = 1

    if args.plotsize is None:
        width = 6
        height = 4
    else:
        width, height = args.plotsize

    fig = plt.figure(figsize=(width * num_cols, height * num_rows))
    axs = np.empty((num_rows, num_cols), dtype='object')

    for matrix_file in args.matrices:
        idx = 0
        for chrom, mean_values in iteritems(mean_dict[matrix_file]):
            if len(mean_values) <= 1:
                log.debug("No values found for: {}, chromosome: {}\n".format(matrix_file, chrom))
                continue

            # Check the number of positive values before unzipping:
            # ``zip(*[])`` yields nothing, so unpacking it into ``x, y``
            # would raise ValueError.
            positive = [(k, v) for k, v in iteritems(mean_values) if v > 0]
            if len(positive) <= 1:
                log.debug("No values found for: {}, chromosome: {}\n".format(matrix_file, chrom))
                continue
            x, y = zip(*positive)

            if args.perchr and len(args.matrices) == 1:
                # a single matrix drawn per chromosome shares one axis
                col = 0
                row = 0
            else:
                col = idx % num_cols
                row = idx // num_cols

            if axs[row, col] is None:
                ax = plt.subplot2grid((num_rows, num_cols), (row, col))
                ax.set_xlabel('genomic distance')
                ax.set_ylabel('corrected Hi-C counts')
                try:
                    ax.set_yscale('log')
                    ax.set_xscale('log')
                except ValueError:
                    continue
            else:
                ax = axs[row, col]

            y = np.array(y) * scale_factor[matrix_file]
            if args.perchr and several_matrices:
                label = labels[matrix_file]
                ax.set_title(chrom)
            elif args.perchr:
                label = chrom
            else:
                label = labels[matrix_file]

            ax.plot(x, y, label=label)
            axs[row, col] = ax
            idx += 1

            if args.outFileData is not None:
                if args.perchr and several_matrices:
                    # one extra header line with the chromosome name
                    args.outFileData.write("#{}\n".format(chrom))
                args.outFileData.write("#{}\n".format(label))
                args.outFileData.write("\t".join(map(str, x)) + "\n")
                args.outFileData.write("\t".join(map(str, y)) + "\n")

    # ``lgd`` stays None when nothing was plotted, so savefig below does
    # not fail on an unbound name.
    lgd = None
    for ax in axs.reshape(-1):
        if ax is None:
            continue
        ax.legend(prop={'size': 'small'})
        ax.set_xlim(0, args.maxdepth)
        # named ``legend_labels`` so the ``labels`` mapping is not shadowed
        handles, legend_labels = ax.get_legend_handles_labels()
        lgd = ax.legend(handles, legend_labels, loc='center left', bbox_to_anchor=(1, 0.5))

    plt.tight_layout()
    extra_artists = (lgd,) if lgd is not None else None
    plt.savefig(args.plotFile.name, bbox_inches='tight', bbox_extra_artists=extra_artists)
    plt.close(fig)
def combine_matrices(matrix_list, bplimit=None):
    """
    Combines individual matrices, stored per chromosome, into one matrix.

    The matrices are placed one after the other along the diagonal of the
    result; row, column and NaN bin indices of each matrix are shifted by
    the number of bins that precede it.

    :param matrix_list: names of the matrices that will be combined into one.
    :param bplimit: To reduce the final file size, counts over the given
                    distance can be removed.
    :return: sparse matrix, bin intervals, nan bins, correction factors,
             distance counts. Correction factors and distance counts are
             ``None`` when no values were collected.
    """
    from scipy.sparse import coo_matrix, triu

    # Each list starts with an empty array of the wanted dtype so that the
    # single concatenation at the end promotes to the same dtype as
    # appending piece by piece would, and works for an empty matrix_list.
    rows = [np.array([]).astype("int")]
    cols = [np.array([]).astype("int")]
    values = [np.array([])]
    nan_bins = [np.array([]).astype('int')]
    correction_factors = [np.array([])]
    distance_counts = [np.array([])]
    new_cut_intervals = []

    # for each chr, collect row, col, value shifted by the bins seen so far
    size = 0
    for matrix_file in matrix_list:
        hic = hm.hiCMatrix(matrix_file)

        # trim matrix if bplimit given
        if bplimit is not None:
            limit = bplimit // hic.getBinSize()
            matrix = (triu(hic.matrix, k=-limit) - triu(hic.matrix, k=limit)).tocoo()
        else:
            matrix = hic.matrix.tocoo()

        # add data
        rows.append(matrix.row + size)
        cols.append(matrix.col + size)
        values.append(matrix.data)
        # asarray lets nan_bins be a plain list as well as an array
        nan_bins.append(np.asarray(hic.nan_bins, dtype='int') + size)
        new_cut_intervals.extend(hic.cut_intervals)
        size += matrix.shape[0]

        # also add correction_factors
        if hic.correction_factors is not None:
            correction_factors.append(np.ravel(hic.correction_factors))
        else:
            # add an array with NaNs (``np.nan``: the ``np.NAN`` alias was
            # removed in NumPy 2.0)
            correction_factors.append(np.full(matrix.shape[0], np.nan))

        if hic.distance_counts is not None:
            distance_counts.append(hic.distance_counts)

    # concatenate once instead of re-copying the arrays on every iteration
    row = np.concatenate(rows)
    col = np.concatenate(cols)
    new_nan_bins = np.concatenate(nan_bins)
    new_correction_factors = np.concatenate(correction_factors)
    new_distance_counts = np.concatenate(distance_counts)

    final_mat = coo_matrix((np.concatenate(values), (row, col)),
                           shape=(size, size)).tocsr()

    assert len(new_cut_intervals) == final_mat.shape[0], \
        "Corrupted matrix file. Matrix size and " \
        "matrix bin definitions do not correspond"

    if len(new_distance_counts) == 0:
        new_distance_counts = None
    if len(new_correction_factors) == 0:
        new_correction_factors = None

    return final_mat, new_cut_intervals, new_nan_bins, new_correction_factors, new_distance_counts
def main(args=None):
    """Read a Hi-C matrix in one format and save it in another.

    Input handling:
      * 'lieberman' requires ``--chrNameList``.
      * 'npz' with several input files combines them via
        ``combine_matrices``.
      * anything else loads ``args.inFile[0]`` directly.

    ``args.bplimit`` removes counts beyond that distance. The output format
    is appended to ``args.outFileName`` as extension when missing. Error
    conditions exit with status 1.

    :param args: command line arguments; ``None`` lets argparse read
                 ``sys.argv``.
    """
    log.debug(args)
    args = parse_arguments().parse_args(args)

    single_chromosome = args.chromosomeOrder is not None and len(args.chromosomeOrder) == 1

    # create hiC matrix with given input format
    if args.inputFormat == 'lieberman':
        # additional file needed for lieberman format
        if args.chrNameList is None:
            log.error("Error: --chrNameList is required when the input format is lieberman.")
            # non-zero status so callers can detect the failure
            exit(1)
        hic_ma = hm.hiCMatrix(matrixFile=args.inFile, file_format='lieberman', chrnameList=args.chrNameList)

    elif args.inputFormat == 'npz' and len(args.inFile) > 1:
        # assume hicexplorer_multi format
        if args.bplimit:
            log.info("\nCutting maximum matrix depth to {} for saving\n".format(args.bplimit))

        matrix, cut_intervals, nan_bins, corrections_factors, distance_counts = \
            combine_matrices(args.inFile, bplimit=args.bplimit)
        hic_ma = hm.hiCMatrix()
        hic_ma.setMatrix(matrix, cut_intervals=cut_intervals)

        if len(nan_bins):
            hic_ma.nan_bins = nan_bins
        if corrections_factors is not None:
            hic_ma.correction_factors = corrections_factors
        if distance_counts is not None:
            hic_ma.distance_counts = distance_counts

    else:
        if args.inputFormat == 'cool' and single_chromosome:
            # for cool input the single chromosome is selected while loading
            hic_ma = hm.hiCMatrix(matrixFile=args.inFile[0], file_format=args.inputFormat, chrnameList=args.chromosomeOrder)
        else:
            hic_ma = hm.hiCMatrix(matrixFile=args.inFile[0], file_format=args.inputFormat)

        # NOTE(review): the original layout was lost; the depth cut is
        # assumed to apply to this single-file branch only - confirm.
        if args.bplimit:
            from scipy.sparse import triu
            log.info("\nCutting maximum matrix depth to {} for saving\n".format(args.bplimit))

            limit = args.bplimit // hic_ma.getBinSize()
            hic_ma.matrix = (triu(hic_ma.matrix, k=-limit) - triu(hic_ma.matrix, k=limit)).tocsr()
            hic_ma.matrix.eliminate_zeros()

    # a list of length one is always truthy, so no second emptiness check
    if args.inputFormat != 'cool' and single_chromosome:
        hic_ma.keepOnlyTheseChr(args.chromosomeOrder)

    if args.clearMaskedBins:
        hic_ma.maskBins(hic_ma.nan_bins)

    if not args.outFileName.endswith(args.outputFormat):
        args.outFileName += "." + args.outputFormat

    # output format -> name of the hiCMatrix method that writes it
    save_methods = {
        'dekker': 'save_dekker',
        'ren': 'save_bing_ren',
        'lieberman': 'save_lieberman',
        'npz': 'save_npz',
        'GInteractions': 'save_GInteractions',
        'cool': 'save_cooler',
        'h5': 'save',
    }
    if args.outputFormat not in save_methods:
        log.error("An error occurred. hicExport aborted!")
        exit(1)

    log.info('saving as {}...'.format(args.outputFormat))
    getattr(hic_ma, save_methods[args.outputFormat])(args.outFileName)
def main():
    """
    Collects all arguments and runs the requested matrix transformation.

    The matrix is masked and filtered, optionally cut to ``args.depth``,
    transformed according to ``args.method``, optionally passed through
    ``applyFdr``, and saved in 'dekker' or the default format.
    """
    args = parse_arguments().parse_args()

    hic_ma = HiCMatrix.hiCMatrix(args.matrix)
    orig_ma = HiCMatrix.hiCMatrix(args.originalMat.name) if args.originalMat else None
    has_original = bool(args.originalMat)

    try:
        hic_ma.maskBins(hic_ma.nan_bins)
    except AttributeError:
        # best effort: masking is skipped when it raises AttributeError
        pass

    # remove unwanted Chrs or select a given chromosome
    # in case is given
    hic_ma.filterUnwantedChr()
    if has_original:
        orig_ma.filterUnwantedChr()

    # NOTE(review): the original layout was lost; the handling of the
    # original matrix is assumed to be nested under the two options
    # below - confirm.
    if args.chromosomes:
        hic_ma.keepOnlyTheseChr(args.chromosomes)
        if has_original:
            orig_ma.reorderChromosomes(hic_ma.chrBinBoundaries.keys())

    if args.skipDiagonal:
        hic_ma.diagflat()
        if has_original:
            orig_ma.diagflat()

    max_depth_in_bins = None
    if args.depth:
        import scipy.sparse

        binsize = hic_ma.getBinSize()
        if args.depth < binsize:
            exit("Please specify a depth larger than bin size ({})".format(
                binsize))
        max_depth_in_bins = int(args.depth / binsize)

        # work only with the upper matrix and remove all pixels that are
        # beyond max_depth_in_bins (this is done by subtracting a second
        # sparse matrix that contains only the part of the upper matrix
        # that is to be removed)
        upper = scipy.sparse.triu(hic_ma.matrix, k=0, format='csr')
        beyond_depth = scipy.sparse.triu(hic_ma.matrix, k=max_depth_in_bins,
                                         format='csr')
        hic_ma.matrix = upper - beyond_depth
        hic_ma.matrix.eliminate_zeros()

    method = args.method
    if method == 'none':
        new_ma = hic_ma.matrix
    elif method == 'obs/exp':
        hic_ma.convert_to_obs_exp_matrix()
        new_ma = hic_ma.matrix
    elif method == 'pearson':
        sys.stderr.write("\nComputing observed / expected\n")
        hic_ma.convert_to_obs_exp_matrix()
        sys.stderr.write("\nComputing pearson\n")
        new_ma = getPearson(hic_ma.matrix)
    else:
        # check that the normalized and original matrices
        # have the same size
        if orig_ma:
            assert np.all(hic_ma.matrix.shape == orig_ma.matrix.shape), \
                "original and derived matrices do not have same shape"

        new_ma = transformMatrix(hic_ma, method,
                                 per_chr=args.perChromosome,
                                 original_matrix=orig_ma,
                                 depth_in_bins=max_depth_in_bins)

    if args.applyFdr:
        new_ma = applyFdr(new_ma)

    hic_ma.setMatrixValues(new_ma)
    hic_ma.restoreMaskedBins()

    save = hic_ma.save_dekker if args.outFormat == 'dekker' else hic_ma.save
    save(args.outFileName)
def __init__(self, *args, **kwargs):
    """Load the Hi-C matrix of this track and prepare it for plotting.

    Steps: load the file named in ``self.properties['file']``, mask the NaN
    bins unless ``show_masked_bins`` is 'yes', check that a requested log
    transform can be applied, enlarge the bins, drop all pixels further
    than ``2 * depth`` from the diagonal, fill an empty main diagonal and
    set up orientation and colormap.

    The process exits with status 1 if the matrix is empty or contains
    values that are not valid for the requested transform.
    """
    super(HiCMatrixTrack, self).__init__(*args, **kwargs)

    matrix_file = self.properties['file']
    if matrix_file.endswith('.cool'):
        # just init the cooler matrix.
        self.hic_ma = HiCMatrix.hiCMatrix(matrix_file, color_only_init=True)
    else:
        self.hic_ma = HiCMatrix.hiCMatrix(matrix_file)

    if len(self.hic_ma.matrix.data) == 0:
        self.log.error("Matrix {} is empty".format(matrix_file))
        exit(1)

    show_masked = ('show_masked_bins' in self.properties and
                   self.properties['show_masked_bins'] == 'yes')
    if not show_masked:
        self.hic_ma.maskBins(self.hic_ma.nan_bins)

    # check that the matrix can be log transformed
    # transform -> (offset added to the minimum before comparing with 0,
    #               name used in the error message)
    transform_checks = {
        'log1p': (1, 'log1p'),
        '-log': (0, 'log(-1 * <values>)'),
        'log': (0, 'log'),
    }
    if 'transform' in self.properties and \
            self.properties['transform'] in transform_checks:
        offset, shown_name = transform_checks[self.properties['transform']]
        if self.hic_ma.matrix.data.min() + offset < 0:
            self.log.error(
                "\n*ERROR*\nMatrix contains negative values.\n"
                "{} transformation can not be applied to \n"
                "values in matrix: {}".format(shown_name, matrix_file))
            exit(1)

    new_intervals = hicexplorer.utilities.enlarge_bins(
        self.hic_ma.cut_intervals)
    self.hic_ma.interval_trees, self.hic_ma.chrBinBoundaries = \
        self.hic_ma.intervalListToIntervalTree(new_intervals)
    self.hic_ma.cut_intervals = new_intervals

    binsize = self.hic_ma.getBinSize()
    max_depth_in_bins = int(self.properties['depth'] / binsize)

    # work only with the upper triangle (``triu``) and remove all pixels
    # that are beyond 2 * max_depth_in_bins, which are not required (this
    # is done by subtracting a second sparse matrix that contains only the
    # part of the upper triangle that is to be removed)
    limit = 2 * max_depth_in_bins
    self.hic_ma.matrix = scipy.sparse.triu(self.hic_ma.matrix, k=0, format='csr') - \
        scipy.sparse.triu(self.hic_ma.matrix, k=limit, format='csr')
    self.hic_ma.matrix.eliminate_zeros()

    # fill the main diagonal, otherwise it looks
    # not so good. The main diagonal is filled
    # with an array containing the max value found
    # in the matrix
    if sum(self.hic_ma.matrix.diagonal()) == 0:
        self.log.info(
            "Filling main diagonal with max value because it empty and looks bad...\n"
        )
        max_value = self.hic_ma.matrix.data.max()
        main_diagonal = scipy.sparse.dia_matrix(
            ([max_value] * self.hic_ma.matrix.shape[0], [0]),
            shape=self.hic_ma.matrix.shape)
        self.hic_ma.matrix = self.hic_ma.matrix + main_diagonal

    self.plot_inverted = ('orientation' in self.properties and
                          self.properties['orientation'] == 'inverted')

    self.norm = None

    if 'colormap' not in self.properties:
        self.properties['colormap'] = DEFAULT_MATRIX_COLORMAP

    self.cmap = cm.get_cmap(self.properties['colormap'])
    # Only the last ``set_bad`` call takes effect; the earlier
    # ``set_bad('white')`` was immediately overridden and is dropped.
    self.cmap.set_bad('black')