def fit_cut_intervals(cut_intervals):
    """
    Snap bin boundaries to a regular grid if the bin size is not homogeneous.

    The expected bin size is the median distance between consecutive bin
    starts. If more than 1% of the bins have a length (end - start) that
    differs from this median, every start and every end position is moved
    to the closest multiple of the median.

    :param cut_intervals: list of (chrom, start, end, extra) tuples
    :return: list of (chrom, start, end, extra) tuples. The input object is
             returned untouched if it holds at most one interval or if the
             binning is already homogeneous.
    """
    # do nothing if there is only one interval
    if len(cut_intervals) <= 1:
        return cut_intervals

    chrom, start, end, extra = zip(*cut_intervals)

    median = int(np.median(np.diff(start)))
    diff = np.array(end) - np.array(start)

    # the binning counts as homogeneous while at most 1% of the bins deviate
    if len(np.flatnonzero(diff != median)) <= (len(diff) * 0.01):
        return cut_intervals

    def snap_nearest_multiple(start_x, m):
        # signed distances to the multiple of m below and above start_x
        resi = [-1 * (start_x % m), -start_x % m]
        return start_x + resi[np.argmin(np.abs(resi))]

    start = [snap_nearest_multiple(x, median) for x in start]
    end = [snap_nearest_multiple(x, median) for x in end]

    log.info('[getCountsByDistance] Bin size is not '
             'homogeneous, setting \n'
             'the bin distance to the median: {}\n'.format(median))

    # zip() is lazy on Python 3: materialize it so that callers get a list
    # on every code path (len(), indexing and repeated iteration work)
    return list(zip(chrom, start, end, extra))
def getRegion(args, ma):
    """
    Translate ``args.region`` (and, if given, ``args.region2``) into matrix bins.

    ``args.region`` is overwritten with ``[chrom, region_start, region_end]``
    using the chromosome name under which it was found in
    ``ma.interval_trees``.

    Bin selection depends on ``check_cooler(args.matrix)``:

    * cooler: bins lying inside the region plus bins whose start or end
      lies strictly inside the region
    * otherwise: bins with ``start >= region_start`` and ``end < region_end``

    :param args: parsed arguments; ``region``, ``region2`` and ``matrix``
                 are read, ``region`` is rewritten
    :param ma: matrix object; ``interval_trees`` and ``cut_intervals`` are read
    :return: tuple (chrom, region_start, region_end, idx1, start_pos1,
             chrom2, region_start2, region_end2, idx2, start_pos2).
             Without ``args.region2`` the bin ids, start positions and
             chromosome of the second region mirror the first region while
             region_start2 and region_end2 stay None.
    :raises SystemExit: if a chromosome name is not part of the matrix
    :raises ValueError: if no bin of the matrix is selected for a region
                        (previously an opaque tuple-unpacking ValueError)
    """
    import sys

    region_start2 = region_end2 = None
    is_cooler = check_cooler(args.matrix)

    def _resolve_chromosome(chrom_name, option):
        # Return the chromosome name as stored in the matrix, trying the
        # alternative naming once; exit if neither variant is known.
        chrom_name = check_chrom_str_bytes(ma.interval_trees, chrom_name)
        if chrom_name not in list(ma.interval_trees):
            chrom_name = change_chrom_names(chrom_name)
            chrom_name = check_chrom_str_bytes(ma.interval_trees, chrom_name)
        if chrom_name not in list(ma.interval_trees):
            sys.exit("Chromosome name {} in {} not in matrix".format(
                change_chrom_names(chrom_name), option))
        return chrom_name

    def _bins_in_region(chrom_name, start, end, option):
        # Return (bin ids, bin start positions) of the bins selected for
        # the region.
        if is_cooler:
            def selected(x):
                return ((x[1] >= start and x[2] < end)
                        or (x[1] < end and x[2] < end and x[2] > start)
                        or (x[1] > start and x[1] < end))
        else:
            def selected(x):
                return x[1] >= start and x[2] < end

        hits = [(idx, x[1]) for idx, x in enumerate(ma.cut_intervals)
                if x[0] == chrom_name and selected(x)]
        if not hits:
            # zip(*[]) yields nothing and the unpacking below would fail
            # with a message that does not mention the region at all
            raise ValueError(
                "No bins of the matrix found for {}:{}-{} given in {}".format(
                    chrom_name, start, end, option))
        bin_ids, start_positions = zip(*hits)
        return bin_ids, start_positions

    chrom, region_start, region_end = translate_region(args.region)
    chrom = _resolve_chromosome(chrom, '--region')
    args.region = [chrom, region_start, region_end]
    idx1, start_pos1 = _bins_in_region(chrom, region_start, region_end,
                                       '--region')

    if args.region2:
        chrom2, region_start2, region_end2 = translate_region(args.region2)
        chrom2 = _resolve_chromosome(chrom2, '--region2')
        idx2, start_pos2 = _bins_in_region(chrom2, region_start2, region_end2,
                                           '--region2')
    else:
        idx2 = idx1
        chrom2 = chrom
        start_pos2 = start_pos1

    return chrom, region_start, region_end, idx1, start_pos1, chrom2, region_start2, region_end2, idx2, start_pos2
def load(self):
    """
    Loads a matrix stored in h5 format from ``self.matrixFileName``.

    :return: matrix, cut_intervals, nan_bins, distance_counts, correction_factors

             * matrix: csr_matrix built from the ``data``, ``indices``,
               ``indptr`` and ``shape`` nodes below ``/matrix``
             * cut_intervals: list of (chrom, start, end, extra) tuples
               built from the nodes below ``/intervals``
             * nan_bins: content of ``/nan_bins``, an empty array if the
               node is missing or unreadable
             * distance_counts: content of ``/distance_counts``, None if
               the node is missing or unreadable
             * correction_factors: content of ``/correction_factors``,
               None if the node is missing, unreadable or does not have
               one entry per matrix row
    :raises AssertionError: if the number of intervals differs from the
                            number of matrix rows
    """
    log.debug('Load in h5 format')

    with tables.open_file(self.matrixFileName) as f:

        def _read_optional(node_name):
            # Best effort read of an optional top level node: returns None
            # when the node is missing or cannot be read.
            try:
                if hasattr(f.root, node_name):
                    return getattr(f.root, node_name).read()
            except Exception as error:
                log.debug('Could not read {}: {}'.format(node_name, error))
            return None

        parts = {}
        for matrix_part in ('data', 'indices', 'indptr', 'shape'):
            parts[matrix_part] = getattr(f.root.matrix, matrix_part).read()

        matrix = csr_matrix(tuple([parts['data'], parts['indices'], parts['indptr']]),
                            shape=parts['shape'])
        # matrix = hiCMatrix.fillLowerTriangle(matrix)

        # get intervals
        intvals = {}
        for interval_part in ('chr_list', 'start_list', 'end_list', 'extra_list'):
            values = getattr(f.root.intervals, interval_part).read()
            if interval_part == 'chr_list':
                values = toString(values)
            intvals[interval_part] = values

        # zip() is lazy on Python 3 and has no len(): materialize the list
        cut_intervals = list(zip(intvals['chr_list'], intvals['start_list'],
                                 intvals['end_list'], intvals['extra_list']))

        assert len(cut_intervals) == matrix.shape[0], \
            "Error loading matrix. Length of bin intervals ({}) is different than the " \
            "size of the matrix ({})".format(len(cut_intervals), matrix.shape[0])

        # get nan_bins
        nan_bins = _read_optional('nan_bins')
        if nan_bins is None:
            nan_bins = np.array([])

        # get correction factors. Factors that do not match the matrix are
        # dropped; this used to happen implicitly through an assert that
        # was swallowed by the surrounding 'except Exception'.
        correction_factors = _read_optional('correction_factors')
        if correction_factors is not None and len(correction_factors) != matrix.shape[0]:
            log.warning("Error loading matrix. Length of correction factors does not "
                        "match size of matrix")
            correction_factors = None

        # get distance counts (previously the correction_factors node was
        # read here by mistake)
        distance_counts = _read_optional('distance_counts')

        return matrix, cut_intervals, nan_bins, distance_counts, correction_factors
def getRegion(args, ma):
    """
    Translate ``args.region`` into the bins of the matrix that fall into it.

    ``args.region`` is overwritten with ``[chrom, region_start, region_end]``.
    Selected are the bins of the chromosome with ``start >= region_start``
    and ``end < region_end``.

    :param args: parsed arguments; ``region`` is read and rewritten
    :param ma: matrix object; ``interval_trees`` and ``cut_intervals`` are read
    :return: tuple (chrom, region_start, region_end, idx1, start_pos1,
             chrom2, region_start2, region_end2, idx2, start_pos2).
             The values of the second region mirror the first region,
             region_start2 and region_end2 are always None.
    :raises SystemExit: if the chromosome is not part of the matrix
    :raises ValueError: if no bin of the matrix falls into the region
                        (previously an opaque tuple-unpacking ValueError)
    """
    import sys

    region_start2 = region_end2 = None

    chrom, region_start, region_end = translate_region(args.region)

    # The matrix may store its chromosome names as bytes: match that type.
    # A default avoids StopIteration on a matrix without chromosomes, which
    # then ends in the regular "not part of the Hi-C matrix" error.
    # NOTE(review): the original converted a second time before the final
    # membership test; dropped assuming toBytes is idempotent - confirm.
    first_name = next(iter(ma.interval_trees), None)
    if isinstance(first_name, (np.bytes_, bytes)):
        chrom = toBytes(chrom)

    if chrom not in list(ma.interval_trees):
        sys.exit(
            "The contig/scaffold name '{}' given in --region is not part of the Hi-C matrix. "
            "Check spelling".format(chrom))

    args.region = [chrom, region_start, region_end]

    hits = [(idx, x[1]) for idx, x in enumerate(ma.cut_intervals)
            if x[0] == chrom and x[1] >= region_start and x[2] < region_end]
    if not hits:
        # zip(*[]) yields nothing and the unpacking below would fail with
        # a message that does not mention the region at all
        raise ValueError(
            "No bins of the matrix found for {}:{}-{} given in --region".format(
                chrom, region_start, region_end))
    idx1, start_pos1 = zip(*hits)

    idx2 = idx1
    chrom2 = chrom
    start_pos2 = start_pos1

    return chrom, region_start, region_end, idx1, start_pos1, chrom2, region_start2, region_end2, idx2, start_pos2
def test_load_cool_save_and_load_h5():
    """
    Round trip: load a cool matrix, save it as h5, load the h5 file again
    and compare matrix data and bin intervals with the original.
    """
    hic = hm.hiCMatrix(ROOT + 'Li_et_al_2015.cool')

    # the context manager removes the temporary file when the test is done
    with NamedTemporaryFile(suffix='.h5', prefix='hicexplorer_test') as outfile:
        hic.matrixFileHandler = None
        hic.save(pMatrixName=outfile.name)

        hic_h5 = hm.hiCMatrix(outfile.name)

        nt.assert_equal(hic_h5.matrix.data, hic.matrix.data)
        chrom_h5, start_h5, end_h5, _ = zip(*hic_h5.cut_intervals)
        # compare against the intervals of the ORIGINAL matrix; the test
        # used to unpack the reloaded matrix twice and so compared the
        # reloaded intervals with themselves
        chrom, start, end, _ = zip(*hic.cut_intervals)

        nt.assert_equal(chrom_h5, chrom)
        nt.assert_equal(start_h5, start)
        nt.assert_equal(end_h5, end)
def test_removeBySequencedCount():
    """
    removeBySequencedCount() is expected to return None when the last
    field of the cut intervals is not a np.float64.
    """
    bins = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
            ('b', 30, 40, 1), ('b', 40, 50, 1)]
    counts = np.array([[1, 8, 5, 3, 0],
                       [0, 4, 15, 5, 1],
                       [0, 0, 0, 0, 2],
                       [0, 0, 0, 0, 1],
                       [0, 0, 0, 0, 0]])

    # set up the matrix object by hand
    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(counts)
    hic.setMatrix(hic.matrix, bins)
    nt.assert_equal(hic.getMatrix(), counts)

    # precondition of this test: the coverage field is not a np.float64
    _chroms, _starts, _ends, coverage = zip(*hic.cut_intervals)
    assert type(coverage[0]) is not np.float64

    # expected outcome: nothing is returned
    nt.assert_equal(hic.removeBySequencedCount(), None)
def _build_commands(self): commands = [] for opts, label in zip(self.optionstrings, self.labels): if self.command == 'stat': commands.append(self._build_perf_stat_command(opts, self.events, label)) else: commands.append(self._build_perf_record_command(opts, label)) return commands
def test_convert_to_zscore_matrix_2():
    """
    Compute the z-score per genomic distance by hand for the test matrix
    and compare its main diagonal with the result of
    hiCMatrix.convert_to_zscore_matrix().
    """
    # load test matrix
    hic = hm.hiCMatrix(ROOT + 'Li_et_al_2015.h5')
    hic.maskBins(hic.nan_bins)

    mat = hic.matrix.todense()
    max_depth = 10000
    bin_size = hic.getBinSize()
    max_depth_in_bins = int(float(max_depth) / bin_size)
    m_size = mat.shape[0]

    chrom, start, end, extra = zip(
        *hm.hiCMatrix.fit_cut_intervals(hic.cut_intervals))

    def cells_within_depth():
        # upper triangle cells (row, col, distance in bins) up to max depth
        for row in range(m_size):
            for col in range(row, m_size):
                # dist is translated to bins
                bin_dist = int(float(start[col] - start[row]) / bin_size)
                if bin_dist <= max_depth_in_bins:
                    yield row, col, bin_dist

    # compute matrix values per distance
    sys.stderr.write("Computing values per distance for each matrix entry\n")
    dist_values = {}
    for row, col, bin_dist in cells_within_depth():
        dist_values.setdefault(bin_dist, []).append(mat[row, col])

    mu = {}
    std = {}
    for dist, values in iteritems(dist_values):
        mu[dist] = np.mean(values)
        std[dist] = np.std(values)

    # compute z-score for test matrix
    sys.stderr.write("Computing zscore for each matrix entry\n")
    zscore_mat = np.full((m_size, m_size), np.nan)
    for row, col, bin_dist in cells_within_depth():
        zscore_mat[row, col] = (mat[row, col] - mu[bin_dist]) / std[bin_dist]

    # compare with zscore from class
    hic.convert_to_zscore_matrix(maxdepth=max_depth)

    # only the main diagonal is checked. Other diagonals show minimal differences
    nt.assert_almost_equal(hic.matrix.todense().diagonal(0).A1,
                           zscore_mat.diagonal(0))
def merge_tad_bins(hic, boundary_id_list, filename):
    """
    Reduces the HiCMatrix by merging the counts of tad bins and saves
    the reduced matrix.

    A new merged bin is started at every bin id listed in
    `boundary_id_list` and at every change of chromosome. The coverage of
    a merged bin is the mean coverage of its bins.

    :param hic: HiCMatrix object. Its masked bins are restored, its
                correction factors are removed and its matrix is replaced
                by the reduced matrix.
    :param boundary_id_list: list of tad boundary bin ids
    :param filename: Name to save the resulting matrix
    :return: None; the matrix is saved to `filename`. Nothing is saved if
             there was nothing to merge.
    """
    from hicexplorer.reduceMatrix import reduce_matrix

    hic.restoreMaskedBins()
    ref_name_list, start_list, end_list, coverage_list = zip(
        *hic.cut_intervals)

    # a set makes the membership test in the loop below O(1) instead of
    # a scan over the whole boundary list for every bin
    boundary_ids = set(boundary_id_list)

    new_bins = []
    bins_to_merge = []
    prev_ref = ref_name_list[0]

    # prepare new intervals
    idx_start = 0
    new_start = start_list[0]
    count = 0
    for idx, ref in enumerate(ref_name_list):
        if (count > 0 and idx in boundary_ids) or ref != prev_ref:
            coverage = np.mean(coverage_list[idx_start:idx])
            new_bins.append((ref_name_list[idx_start], new_start,
                             end_list[idx - 1], coverage))
            bins_to_merge.append(list(range(idx_start, idx)))
            idx_start = idx
            new_start = start_list[idx]
            count = 0

        prev_ref = ref
        count += 1

    # without a single closed group there is nothing to merge
    if not bins_to_merge:
        log.info("Nothing to merge.")
        return

    # close the last group, it runs up to the last bin
    coverage = np.mean(coverage_list[idx_start:])
    new_bins.append((ref, new_start, end_list[idx], coverage))
    bins_to_merge.append(list(range(idx_start, idx + 1)))

    # remove correction factors otherwise they are
    # saved but they no longer correspond to the
    # size of the matrix.
    hic.correction_factors = None
    hic.update_matrix(
        reduce_matrix(hic.matrix, bins_to_merge, diagonal=True), new_bins)
    hic.save(filename)
def test_noniterators_produce_lists(self):
    """
    range, zip, map and filter - as bound in this module - must return
    real lists instead of lazy iterators.
    """
    numbers = range(10)
    self.assertTrue(isinstance(numbers, list))

    pairs = zip(numbers, list('ABCDE') * 2)
    self.assertTrue(isinstance(pairs, list))

    def double(x):
        return x * 2

    doubled = map(double, numbers)
    self.assertTrue(isinstance(doubled, list))

    def is_odd(x):
        return x % 2 == 1

    odd_numbers = filter(is_odd, range(10))
    self.assertEqual(odd_numbers, [1, 3, 5, 7, 9])
    self.assertTrue(isinstance(odd_numbers, list))
def getDistList(rows, cols, cut_intervals):
    """
    Compute the genomic distance for pairs of matrix bins.

    For every position i the distance between the start of bin cols[i]
    and the start of bin rows[i] is returned. Pairs of bins on different
    chromosomes get the distance -1.

    :param rows: bin indices of the rows
    :param cols: bin indices of the columns, same length as rows
    :param cut_intervals: list of (chrom, start, end, extra) tuples
    :return: tuple (dist_list, chrom_list). chrom_list holds the
             chromosome name of the row bin of each pair and '' for
             inter-chromosomal pairs.
    """
    chrnamelist, startlist, _endlist, _extralist = zip(*cut_intervals)

    # distance between the start positions of each (row, col) pair
    dist_list = np.take(startlist, cols) - np.take(startlist, rows)

    # np.unique with return_inverse turns the chromosome names into
    # integer ids; a pair is inter-chromosomal if its two ids differ
    chr_id_list = np.unique(chrnamelist, return_inverse=True)[1]
    inter_chromosomal = (np.take(chr_id_list, rows)
                         - np.take(chr_id_list, cols)) != 0

    # set in dist_list array '-1' for all interchromosomal values
    dist_list[inter_chromosomal] = -1

    # matching chromosome names, useful for filtering per chromosome
    chrom_list = np.take(chrnamelist, rows)
    chrom_list[inter_chromosomal] = ''

    return dist_list, chrom_list
def consistency_merge(list_of_tuples,
                      message='values differ:',
                      error='raise',
                      merge=mostfrequent):
    """
    Merge a sequence of tuples into one tuple of values.

    The first element of every tuple is not part of the result. If
    ``consistency_check`` accepts the tuples, the remaining elements of
    the first tuple are returned. Otherwise the behavior depends on
    `error`; unless the error is re-raised, every remaining position is
    merged separately by calling `merge` on the values found at that
    position.

    :param list_of_tuples: non-empty sequence of tuples
    :param message: passed on to ``consistency_check``
    :param error: 'raise' re-raises the Inconsistency, 'warn' logs it as
                  a warning and merges, 'ignore' merges silently
    :param merge: callable reducing the values of one position to a
                  single value
    :return: tuple of merged values (without the first element)
    :raises Exception: if `list_of_tuples` is empty
    :raises Inconsistency: if the tuples differ and `error` is 'raise'
    """
    assert error in ('raise', 'warn', 'ignore')

    if len(list_of_tuples) == 0:
        raise Exception('cannot merge empty sequence')

    try:
        consistency_check(list_of_tuples, message)
        return list_of_tuples[0][1:]
    except Inconsistency as e:
        if error == 'raise':
            raise

        elif error == 'warn':
            logger.warning(str(e))

        # zip() returns an iterator on Python 3 and cannot be sliced
        # directly: materialize the columns first, then drop column 0
        columns = list(zip(*list_of_tuples))[1:]
        return tuple([merge(x) for x in columns])
def getBinSize(self):
    """
    estimates the bin size. In case the bin size
    is not equal for all bins (maybe except for the
    bin at the end of the chromosomes) a warning is issued.
    In case of uneven bins, the median is returned.

    The estimate is the median distance between consecutive bin starts;
    for a matrix with a single bin the length of that bin is used. The
    result is cached in ``self.bin_size``. If more than 1% of the bins
    have a length different from the estimate,
    ``self.bin_size_homogeneous`` is set to False and a warning is
    logged once per object.

    :return: the bin size as int
    """
    if self.bin_size is None:
        chrom, start, end, extra = zip(*self.cut_intervals)
        diff = np.array(end) - np.array(start)

        if len(start) > 1:
            median = int(np.median(np.diff(start)))
        else:
            # a single bin has no start-to-start distance: np.diff would
            # be empty, its median nan and int(nan) raises ValueError
            median = int(np.median(diff))

        # check if the bin size is homogeneous
        if len(np.flatnonzero(diff != median)) > (len(diff) * 0.01):
            self.bin_size_homogeneous = False
            if self.non_homogeneous_warning_already_printed is False:
                # the message used to contain a line continuation inside
                # the string literal, which leaked source code layout
                # into the log output
                log.warning('Bin size is not homogeneous. '
                            'Median {}\n'.format(median))
                self.non_homogeneous_warning_already_printed = True
        self.bin_size = median
    return self.bin_size
def main(args=None):
    """
    Plot the tracks for one region or for every region of a BED file.

    With ``args.BED`` one plot per valid BED line is written to
    ``<outFileName>_<chrom>:<start>-<end>``; regions shorter than 200 kb
    are extended by 100 kb on both sides before plotting. Lines with less
    than three tab separated fields or with non-integer coordinates are
    skipped. Without ``args.BED`` the region given by ``args.region`` is
    plotted to ``args.outFileName``.

    NOTE(review): the BED branch used to build the region with
    ``zip(chrom, start, end)``, which raises TypeError for integer
    coordinates, and never used the per-region file name it announced on
    stderr. Plotting every region to its own file is the behavior
    inferred from that message - confirm.

    :param args: list of command line arguments, None to use sys.argv
    """
    args = parse_arguments().parse_args(args)

    def _plot_region(region, out_file_name):
        # Set up the tracks for one region and write the plot.
        trp = pygenometracks.tracksClass.PlotTracks(
            args.tracks.name, args.width, fig_height=args.height,
            fontsize=args.fontSize, dpi=args.dpi,
            track_label_width=args.trackLabelFraction, pRegion=region)
        trp.plot(out_file_name, *region, title=args.title)

    if args.BED:
        for line in args.BED:
            try:
                chrom, start, end = line.strip().split('\t')[0:3]
            except ValueError:
                # less than three fields (e.g. header or empty line)
                continue
            try:
                start, end = int(start), int(end)
            except ValueError as detail:
                sys.stderr.write(
                    "Invalid value found at line\t{}\t. {}\n".format(
                        line, detail))
                # the coordinates are unusable: skip this line instead of
                # failing below on string arithmetic
                continue

            file_name = "{}_{}:{}-{}".format(args.outFileName, chrom,
                                             start, end)
            if end - start < 200000:
                # extend short regions; never go below position 0
                start = max(0, start - 100000)
                end += 100000
            sys.stderr.write("saving {}'\n".format(file_name))
            _plot_region((chrom, start, end), file_name)
    else:
        _plot_region(get_region(args.region), args.outFileName)
def merge_bins(hic, num_bins):
    """
    Merge the bins using the specified number of bins. This
    function takes care to make new intervals.

    Groups of consecutive bins never span two chromosomes. A group that
    is closed with less than num_bins / 2 bins is dropped; the group
    that is still open at the end of the matrix is always kept.

    Parameters
    ----------

    hic : HiCMatrix object

    num_bins : number of consecutive bins to merge.

    Returns
    -------

    The HiCMatrix object with the merged matrix, the new cut intervals
    and updated nan_bins.

    Set up a Hi-C test matrix
    >>> from scipy.sparse import csr_matrix
    >>> row, col = np.triu_indices(5)
    >>> cut_intervals = [('a', 0, 10, 0.5), ('a', 10, 20, 1),
    ... ('a', 20, 30, 1), ('a', 30, 40, 0.1), ('b', 40, 50, 1)]
    >>> hic = hm.hiCMatrix()
    >>> hic.nan_bins = []
    >>> matrix = np.array([
    ... [ 50, 10,  5,  3,   0],
    ... [  0, 60, 15,  5,   1],
    ... [  0,  0, 80,  7,   3],
    ... [  0,  0,  0, 90,   1],
    ... [  0,  0,  0,  0, 100]], dtype=np.int32)

    make the matrix symmetric:
    >>> from scipy.sparse import dia_matrix

    >>> dia = dia_matrix(([matrix.diagonal()], [0]), shape=matrix.shape)
    >>> hic.matrix = csr_matrix(matrix + matrix.T - dia)
    >>> hic.setMatrix(hic.matrix, cut_intervals)

    run merge_matrix
    >>> merge_matrix = merge_bins(hic, 2)
    >>> merge_matrix.cut_intervals
    [('a', 0, 20, 0.75), ('a', 20, 40, 0.55000000000000004), ('b', 40, 50, 1.0)]
    >>> merge_matrix.matrix.todense()
    matrix([[120,  28,   1],
            [ 28, 177,   4],
            [  1,   4, 100]], dtype=int32)
    """
    hic = remove_nans_if_needed(hic)

    # get the bins to merge
    chroms, starts, ends, coverages = zip(*hic.cut_intervals)

    merged_intervals = []
    merge_groups = []

    # state of the group that is currently being collected
    group_first = 0
    group_start = starts[0]
    group_size = 0
    last_chrom = chroms[0]

    for pos, chrom_name in enumerate(chroms):
        group_is_full = group_size > 0 and group_size % num_bins == 0
        if group_is_full or chrom_name != last_chrom:
            if group_size >= num_bins / 2:
                merged_intervals.append((chroms[group_first], group_start,
                                         ends[pos - 1],
                                         np.mean(coverages[group_first:pos])))
                merge_groups.append(list(range(group_first, pos)))
            else:
                log.debug("{} has few bins ({}). Skipping it\n".format(
                    last_chrom, group_size))
            # open the next group at the current bin
            group_first = pos
            group_start = starts[pos]
            group_size = 0

        last_chrom = chrom_name
        group_size += 1

    # the group that is still open is closed with the last bin
    merged_intervals.append((chrom_name, group_start, ends[pos],
                             np.mean(coverages[group_first:])))
    merge_groups.append(list(range(group_first, pos + 1)))

    hic.matrix = reduce_matrix(hic.matrix, merge_groups, diagonal=True)
    hic.matrix.eliminate_zeros()
    hic.setCutIntervals(merged_intervals)
    hic.nan_bins = np.flatnonzero(hic.matrix.sum(0).A == 0)

    return hic
def main(args=None):
    """
    Entry point: load a Hi-C matrix, optionally write a diagnostic plot,
    otherwise correct the matrix with 'ICE' or 'KR' and save it.

    Steps, as far as visible in this function:

    * load the matrix (restricted to / reordered by ``args.chromosomes``)
    * mask bins whose row sum is zero (ICE and diagnostic plot only)
    * replace nan and inf values by zero and convert the matrix to float64
    * if 'plotName' is part of args: plot the total contact distribution
      and return
    * ICE: mask the bins returned by ``filter_by_zscore``, optionally mask
      low coverage bins and truncate trans contacts
    * compute the correction, per chromosome if ``args.perchr``
    * ICE: optionally mask bins whose row sum was inflated by at least
      ``args.inflationCutoff``
    * save the matrix to ``args.outFileName``

    NOTE(review): the nesting of several blocks was reconstructed from a
    flattened source, see the notes in the body - confirm against the
    repository.

    :param args: list of command line arguments, None to use sys.argv
    """
    args = parse_arguments().parse_args(args)

    if args.verbose:
        log.setLevel(logging.INFO)

    # args.chromosomes
    # a single chromosome of a cooler file is loaded directly; in every
    # other case the whole matrix is loaded and reordered afterwards
    if check_cooler(args.matrix) and args.chromosomes is not None and len(
            args.chromosomes) == 1:
        ma = hm.hiCMatrix(args.matrix, pChrnameList=toString(args.chromosomes))
    else:
        ma = hm.hiCMatrix(args.matrix)

        if args.chromosomes:
            ma.reorderChromosomes(toString(args.chromosomes))

    # mask all zero value bins
    if 'correctionMethod' in args:
        if args.correctionMethod == 'ICE':
            row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
            log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
            ma.maskBins(np.flatnonzero(row_sum == 0))
            # remembered to verify below that the outlier report does not
            # change the shape of the matrix
            matrix_shape = ma.matrix.shape
    if 'plotName' in args:
        row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
        log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
        ma.maskBins(np.flatnonzero(row_sum == 0))
        matrix_shape = ma.matrix.shape

    ma.matrix = convertNansToZeros(ma.matrix)
    ma.matrix = convertInfsToZeros(ma.matrix)
    ma.matrix = ma.matrix.astype(np.float64, copy=True)

    log.debug('ma.matrix.indices {}'.format(ma.matrix.indices.dtype))
    log.debug('ma.matrix.data {}'.format(ma.matrix.data.dtype))
    log.debug('ma.matrix.indptr {}'.format(ma.matrix.indptr.dtype))

    # log.debug('ma.matrix.indices {}'.format(np.max(ma.matrix.indices)))
    # log.debug('ma.matrix.data {}'.format(np.max(ma.matrix.data)))
    # log.debug('ma.matrix.indptr {}'.format(np.max(ma.matrix.indptr)))

    # ma.matrix.indptr = ma.matrix.indptr.astype(np.int32, copy=False)
    # ma.matrix.indices = ma.matrix.indices.astype(np.int32, copy=False)

    # diagnostic plot mode: nothing is corrected or saved
    if 'plotName' in args:
        plot_total_contact_dist(ma, args)
        log.info("Saving diagnostic plot {}\n".format(args.plotName))
        return

    log.info("matrix contains {} data points. Sparsity {:.3f}.".format(
        len(ma.matrix.data),
        float(len(ma.matrix.data)) / (ma.matrix.shape[0]**2)))

    if args.skipDiagonal:
        ma.diagflat(value=0)

    total_filtered_out = set()
    # NOTE(review): the coverage and trans cutoff blocks are assumed to be
    # part of the ICE branch; the flattened source does not show whether
    # they were nested here or on function level - confirm.
    if args.correctionMethod == 'ICE':
        if not args.filterThreshold:
            log.error('min and max filtering thresholds should be set')
            sys.exit(1)
        outlier_regions = filter_by_zscore(ma, args.filterThreshold[0], args.filterThreshold[1], perchr=args.perchr)
        # compute and print some statistics
        pct_outlier = 100 * float(len(outlier_regions)) / ma.matrix.shape[0]
        # NOTE(review): the label has a single placeholder, the second
        # format argument (matrix size) is not used
        ma.printchrtoremove(outlier_regions, label="Bins that are MAD outliers ({:.2f}%) "
                            "out of".format(pct_outlier, ma.matrix.shape[0]),
                            restore_masked_bins=False)

        assert matrix_shape == ma.matrix.shape
        # mask filtered regions
        ma.maskBins(outlier_regions)
        total_filtered_out = set(outlier_regions)

        if args.sequencedCountCutoff and 0 < args.sequencedCountCutoff < 1:
            chrom, _, _, coverage = zip(*ma.cut_intervals)

            assert type(coverage[0]) == np.float64

            failed_bins = np.flatnonzero(
                np.array(coverage) < args.sequencedCountCutoff)

            ma.printchrtoremove(failed_bins, label="Bins with low coverage",
                                restore_masked_bins=False)
            ma.maskBins(failed_bins)
            # NOTE(review): this replaces the outlier bins collected above
            # instead of adding to them, while the inflation step below
            # uses union() - looks unintended, confirm.
            total_filtered_out = set(failed_bins)
            """
            ma.matrix, to_remove = fill_gaps(ma, failed_bins)
            log.warning("From {} failed bins, {} could "
                         "not be filled\n".format(len(failed_bins),
                                                  len(to_remove)))
            ma.maskBins(to_remove)
            """

        if args.transCutoff and 0 < args.transCutoff < 100:
            cutoff = float(args.transCutoff) / 100
            # a usual cutoff is 0.05
            ma.truncTrans(high=cutoff)

    # row sums before the correction, compared with the row sums after
    # the correction for the inflation cutoff below
    pre_row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
    correction_factors = []
    corrected_matrix = lil_matrix(ma.matrix.shape)
    if args.perchr:
        # normalize each chromosome independently
        for chrname in list(ma.interval_trees):
            chr_range = ma.getChrBinRange(chrname)
            chr_submatrix = ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]
            if args.correctionMethod == 'ICE':
                _matrix, _corr_factors = iterative_correction(
                    chr_submatrix, args)
                corrected_matrix[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = _matrix
                correction_factors.append(_corr_factors)
            else:
                # Set the kr matrix along with its correction factors vector
                assert (args.correctionMethod == 'KR')
                log.debug("Loading a float sparse matrix for KR balancing")
                kr = kr_balancing(
                    chr_submatrix.shape[0], chr_submatrix.shape[1],
                    chr_submatrix.count_nonzero(),
                    chr_submatrix.indptr.astype(np.int64, copy=False),
                    chr_submatrix.indices.astype(np.int64, copy=False),
                    chr_submatrix.data.astype(np.float64, copy=False))
                kr.computeKR()
                # the corrected values are only stored for h5 output
                if args.outFileName.endswith('.h5'):
                    corrected_matrix[
                        chr_range[0]:chr_range[1],
                        chr_range[0]:chr_range[1]] = kr.get_normalised_matrix(
                            True)
                # correction_factors.append(np.true_divide(1,
                #                           kr.get_normalisation_vector(False).todense()))
                correction_factors.append(
                    kr.get_normalisation_vector(False).todense())
        # one vector of factors for the whole matrix
        correction_factors = np.concatenate(correction_factors)

    else:
        if args.correctionMethod == 'ICE':
            corrected_matrix, correction_factors = iterative_correction(
                ma.matrix, args)
            ma.setMatrixValues(corrected_matrix)
        else:
            assert (args.correctionMethod == 'KR')
            log.debug("Loading a float sparse matrix for KR balancing")
            kr = kr_balancing(ma.matrix.shape[0], ma.matrix.shape[1],
                              ma.matrix.count_nonzero(),
                              ma.matrix.indptr.astype(np.int64, copy=False),
                              ma.matrix.indices.astype(np.int64, copy=False),
                              ma.matrix.data.astype(np.float64, copy=False))
            log.debug('passed pointers')
            kr.computeKR()
            log.debug('computation done')

            # set it to False since the vector is already normalised
            # with the previous True
            # correction_factors = np.true_divide(1, kr.get_normalisation_vector(False).todense())
            correction_factors = kr.get_normalisation_vector(False).todense()

            if args.outFileName.endswith('.h5'):
                corrected_matrix = kr.get_normalised_matrix(True)
    # NOTE(review): assumed to be on function level so that per chromosome
    # results are also written into the matrix for h5 output; the
    # flattened source does not show the nesting - confirm.
    if args.outFileName.endswith('.h5'):
        ma.setMatrixValues(corrected_matrix)
    # if
    ma.setCorrectionFactors(correction_factors)

    log.debug("Correction factors {}".format(correction_factors[:10]))
    if args.inflationCutoff and args.inflationCutoff > 0 and args.correctionMethod == 'ICE':
        after_row_sum = np.asarray(corrected_matrix.sum(axis=1)).flatten()
        # identify rows that were expanded more than args.inflationCutoff times
        to_remove = np.flatnonzero(
            after_row_sum / pre_row_sum >= args.inflationCutoff)
        ma.printchrtoremove(to_remove,
                            label="inflated >={} "
                            "regions".format(args.inflationCutoff),
                            restore_masked_bins=False)
        total_filtered_out = total_filtered_out.union(to_remove)
        ma.maskBins(to_remove)

    ma.printchrtoremove(sorted(list(total_filtered_out)),
                        label="Total regions to be removed",
                        restore_masked_bins=False)

    # the factors were handed over with setCorrectionFactors() above;
    # save() is told not to apply a correction itself
    ma.save(args.outFileName, pApplyCorrection=False)
def getDistList(rows, cols, cut_intervals):
    """
    Return the genomic distance for every (row, col) pair of bins.

    The distance is measured between the start positions of the two
    bins. Pairs of bins that lie on different chromosomes get the
    distance -1. A matching array with the chromosome name of each pair
    ('' for inter-chromosomal pairs) is returned as well.

    >>> from scipy.sparse import coo_matrix
    >>> import numpy as np
    >>> row, col = np.triu_indices(5)
    >>> cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
    ... ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)]
    >>> dist_list, chrom_list = hiCMatrix.getDistList(row, col,
    ... cut_intervals)
    >>> coo_matrix((dist_list, (row, col)), shape=(5,5), dtype=np.int32).todense()
    matrix([[ 0, 10, 20, 30, -1],
            [ 0,  0, 10, 20, -1],
            [ 0,  0,  0, 10, -1],
            [ 0,  0,  0,  0, -1],
            [ 0,  0,  0,  0,  0]], dtype=int32)
    >>> chrom_list.tolist()
    ['a', 'a', 'a', 'a', '', 'a', 'a', 'a', '', 'a', 'a', '', 'a', '', 'b']
    """
    chrnamelist, startlist, _ends, _extras = zip(*cut_intervals)

    # one distance per element of the data array: start of the column
    # bin minus start of the row bin
    row_starts = np.take(startlist, rows)
    col_starts = np.take(startlist, cols)
    dist_list = col_starts - row_starts

    # trick: np.unique with return_inverse maps the list of chromosome
    # names to a list of integers, one id per distinct name
    _names, chr_id_list = np.unique(chrnamelist, return_inverse=True)
    row_chr_ids = np.take(chr_id_list, rows)
    col_chr_ids = np.take(chr_id_list, cols)

    # different ids mean an inter-chromosomal row, col combination
    inter_chrom = row_chr_ids != col_chr_ids

    # set in dist_list array '-1' for all interchromosomal values
    dist_list[inter_chrom] = -1

    # make a corresponding chromosome name list
    # if filtering per chromosome is required
    chrom_list = np.take(chrnamelist, rows)
    chrom_list[inter_chrom] = ''

    return dist_list, chrom_list
def plotPerChr(hic_matrix, cmap, args, pBigwig):
    """
    plots each chromosome individually, one after the other,
    five chromosomes per row. An extra narrow column is reserved
    at the end of each row.

    For every chromosome ``args.region`` is set to the chromosome name
    before the heatmap is drawn.

    NOTE(review): the nesting of this function was reconstructed from a
    flattened source; the title is assumed to be set for both the bigwig
    and the plain layout - confirm.

    :param hic_matrix: HiCMatrix object
    :param cmap: colormap, passed on to plotHeatmap
    :param args: parsed arguments; ``dpi``, ``log`` and ``log1p`` are read,
                 ``region`` is overwritten
    :param pBigwig: if true, every chromosome gets two additional axes
                    that are handed to plotHeatmap via ``pBigwig``
    :return: the matplotlib figure
    """
    from math import ceil
    chromosomes = hic_matrix.getChrNames()
    chrom_per_row = 5
    num_rows = int(ceil(float(len(chromosomes)) / chrom_per_row))
    num_cols = min(chrom_per_row, len(chromosomes))
    width_ratios = [1.0] * num_cols + [0.05]
    grids = gridspec.GridSpec(num_rows, num_cols + 1,
                              width_ratios=width_ratios,
                              height_ratios=[1] * num_rows)

    fig_height = 6 * num_rows
    fig_width = sum((np.array(width_ratios) + 0.05) * 6)

    fig = plt.figure(figsize=(fig_width, fig_height), dpi=args.dpi)

    for idx, chrname in enumerate(chromosomes):
        log.debug('chrom: {}'.format(chrname))

        row, col = divmod(idx, chrom_per_row)
        if pBigwig:
            inner_grid = gridspec.GridSpecFromSubplotSpec(2, 2, height_ratios=[0.85, 0.15], width_ratios=[0.93, 0.07],
                                                          subplot_spec=grids[row, col], wspace=0.1, hspace=0.1)
            axis = plt.subplot(inner_grid[0, 0])
            axis_eigenvector = plt.subplot(inner_grid[1, 0])
            axis_scale = plt.subplot(inner_grid[0, 1])
        else:
            axis = plt.subplot(grids[row, col])
        axis.set_title(toString(chrname))

        chrom_range = hic_matrix.getChrBinRange(chrname)
        matrix = np.asarray(hic_matrix.matrix[chrom_range[0]:chrom_range[1],
                                              chrom_range[0]:chrom_range[1]].todense().astype(float))

        norm = None
        if args.log or args.log1p:
            # zeros, nans and infs cannot be shown on a log scale: replace
            # them by the smallest of the remaining values
            mask = matrix == 0
            mask_nan = np.isnan(matrix)
            mask_inf = np.isinf(matrix)
            log.debug("any nan {}".format(np.isnan(matrix).any()))
            log.debug("any inf {}".format(np.isinf(matrix).any()))

            try:
                matrix[mask] = np.nanmin(matrix[~mask])
                matrix[mask_nan] = np.nanmin(matrix[~mask_nan])
                matrix[mask_inf] = np.nanmin(matrix[~mask_inf])
            except Exception:
                # best effort, e.g. nothing is left to take a minimum from
                log.debug("Clearing of matrix failed.")
            log.debug("any nanafter remove of nan: {}".format(np.isnan(matrix).any()))
            log.debug("any inf after remove of inf: {}".format(np.isinf(matrix).any()))
        if args.log1p:
            matrix += 1
            norm = LogNorm()
        elif args.log:
            norm = LogNorm()

        bigwig_info = None
        if pBigwig:
            bigwig_info = {'args': args,
                           'axis': axis_eigenvector,
                           'axis_colorbar': axis_scale,
                           'nan_bins': hic_matrix.nan_bins}

        chr_bin_boundary = OrderedDict()
        chr_bin_boundary[chrname] = hic_matrix.get_chromosome_sizes()[chrname]

        args.region = toString(chrname)
        # only the start positions of the bins are needed for the plot
        (_chrom, _region_start, _region_end, _idx1, start_pos1,
         _chrom2, _region_start2, _region_end2, _idx2, start_pos2) = getRegion(args, hic_matrix)
        plotHeatmap(matrix, chr_bin_boundary, fig, None,
                    args, cmap, xlabel=chrname, ylabel=chrname,
                    start_pos=start_pos1, start_pos2=start_pos2, pNorm=norm, pAxis=axis, pBigwig=bigwig_info)
    return fig
def main(args=None):
    """
    Plot the mean Hi-C counts per genomic distance for one or more matrices.

    For every matrix the result of ``compute_distance_mean`` is plotted
    on log-log axes. The values are scaled by ``smallest matrix sum /
    matrix sum`` so that the matrices are comparable. With several
    matrices and ``args.perchr`` one panel per chromosome is drawn,
    otherwise everything goes into a single panel. If ``args.outFileData``
    is given the plotted values are also written to it.

    :param args: list of command line arguments, None to use sys.argv
    """
    args = parse_arguments().parse_args(args)

    mean_dict = OrderedDict()
    matrix_sum = {}
    if args.labels is None:
        labels = OrderedDict([(x, os.path.basename(x)) for x in args.matrices])
    else:
        labels = OrderedDict(zip(args.matrices, args.labels))

    if args.chromosomeExclude is None:
        args.chromosomeExclude = []

    chroms = set()
    for matrix_file in args.matrices:
        hic_ma = HiCMatrix.hiCMatrix(matrix_file)
        matrix_sum[matrix_file] = hic_ma.matrix.sum()
        chrtokeep = [x for x in list(hic_ma.interval_trees) if x not in args.chromosomeExclude]
        hic_ma.keepOnlyTheseChr(chrtokeep)

        mean_dict[matrix_file] = compute_distance_mean(hic_ma, maxdepth=args.maxdepth, perchr=args.perchr)
        chroms = chroms.union([k for k in list(mean_dict[matrix_file]) if len(mean_dict[matrix_file][k]) > 1])

    # compute scale factors such that values are comparable
    min_sum = min(matrix_sum.values())
    scale_factor = {matrix_file: float(min_sum) / mat_sum
                    for matrix_file, mat_sum in iteritems(matrix_sum)}
    log.info("The scale factors used are: {}".format(scale_factor))

    if len(args.matrices) > 1 and args.perchr:
        # in this case, for each chromosome a plot is made that combines the data from the
        # hic matrices
        max_cols = 4
        num_rows = int(np.ceil(float(len(chroms)) / max_cols))
        num_cols = min(len(chroms), max_cols)
    else:
        num_cols = num_rows = 1

    if args.plotsize is None:
        width = 6
        height = 4
    else:
        width, height = args.plotsize

    fig = plt.figure(figsize=(width * num_cols, height * num_rows))

    axs = np.empty((num_rows, num_cols), dtype='object')
    for matrix_file in args.matrices:
        idx = 0
        for chrom, mean_values in iteritems(mean_dict[matrix_file]):
            if len(mean_values) <= 1:
                log.debug("No values found for: {}, chromosome: {}\n".format(matrix_file, chrom))
                continue
            # only positive values can be shown on a log scale. Filter
            # first and unzip afterwards: unpacking zip(*[]) directly
            # raises ValueError when no value is left
            positive = [(k, v) for k, v in iteritems(mean_values) if v > 0]
            if len(positive) <= 1:
                log.debug("No values found for: {}, chromosome: {}\n".format(matrix_file, chrom))
                continue
            x, y = zip(*positive)

            if args.perchr and len(args.matrices) == 1:
                col = 0
                row = 0
            else:
                col = idx % num_cols
                row = idx // num_cols

            if axs[row, col] is None:
                ax = plt.subplot2grid((num_rows, num_cols), (row, col))
                ax.set_xlabel('genomic distance')
                ax.set_ylabel('corrected Hi-C counts')
                try:
                    ax.set_yscale('log')
                    ax.set_xscale('log')
                except ValueError:
                    continue
            else:
                ax = axs[row, col]

            y = np.array(y) * scale_factor[matrix_file]
            if args.perchr and len(args.matrices) > 1:
                label = labels[matrix_file]
                ax.set_title(chrom)
            elif args.perchr:
                label = chrom
            else:
                label = labels[matrix_file]

            ax.plot(x, y, label=label)
            axs[row, col] = ax
            idx += 1

            if args.outFileData is not None:
                if args.perchr and len(args.matrices) > 1:
                    label = labels[matrix_file]
                    args.outFileData.write("#{}\n".format(chrom))
                elif args.perchr:
                    label = chrom
                else:
                    label = labels[matrix_file]

                args.outFileData.write("#{}\n".format(label))
                args.outFileData.write("\t".join(map(str, x)) + "\n")
                args.outFileData.write("\t".join(map(str, y)) + "\n")

    # collect the legend of every panel: all of them lie outside of
    # their axes and have to be considered for the tight bounding box.
    # The list stays empty, instead of an undefined name, if nothing
    # was plotted.
    legends = []
    for ax in axs.reshape(-1):
        if ax is None:
            continue
        ax.legend(prop={'size': 'small'})
        ax.set_xlim(0, args.maxdepth)
        handles, legend_labels = ax.get_legend_handles_labels()
        legends.append(ax.legend(handles, legend_labels, loc='center left', bbox_to_anchor=(1, 0.5)))

    plt.tight_layout()
    plt.savefig(args.plotFile.name, bbox_inches='tight', bbox_extra_artists=tuple(legends))
    plt.close(fig)
def plotPerChr(hic_matrix, cmap, args, pBigwig, pResolution):
    """
    Plot the intra-chromosomal matrix of every chromosome in its own panel.

    The panels are arranged in a grid with five chromosomes per row. If
    ``pBigwig`` is set, every panel is subdivided so that one extra axis per
    entry of ``args.bigwig`` is available below the heatmap (and, with
    ``args.bigwigAdditionalVerticalAxis``, also to the left of it).

    :param hic_matrix: matrix object providing ``getChrNames``,
                       ``getChrBinRange``, ``get_chromosome_sizes``,
                       ``matrix`` and ``nan_bins``
    :param cmap: colormap, passed on to ``plotHeatmap``
    :param args: parsed command line arguments; ``args.region`` is
                 overwritten with the name of each chromosome in turn
    :param pBigwig: if true, axes for the bigwig tracks are created
    :param pResolution: passed on to ``plotHeatmap``
    :return: the matplotlib figure
    """
    from math import ceil

    chromosomes = hic_matrix.getChrNames()
    chrom_per_row = 5
    num_rows = int(ceil(float(len(chromosomes)) / chrom_per_row))
    num_cols = min(chrom_per_row, len(chromosomes))
    width_ratios = [1.0] * num_cols + [0.05]
    grids = gridspec.GridSpec(num_rows, num_cols + 1,
                              width_ratios=width_ratios,
                              height_ratios=[1] * num_rows)

    fig_height = 6 * num_rows
    fig_width = sum((np.array(width_ratios) + 0.05) * 6)
    if pBigwig:
        # the figure grows with every additional bigwig track
        for _ in range(len(args.bigwig)):
            fig_height += args.increaseFigureHeight
            fig_width += args.increaseFigureWidth

    fig = plt.figure(figsize=(fig_width, fig_height), dpi=args.dpi)

    for idx, chrname in enumerate(chromosomes):
        log.debug('chrom: {}'.format(chrname))
        bigwig_info = None
        row = idx // chrom_per_row
        col = idx % chrom_per_row

        if pBigwig:
            bigwig_info = {'args': args, 'axis': None, 'axis_colorbar': None,
                           'nan_bins': hic_matrix.nan_bins}

            number_of_tracks = len(args.bigwig)
            bigwig_heights = [0.07] * number_of_tracks
            heatmap_height = 0.95 - (0.07 * number_of_tracks)
            heatmap_width = 0.97 - (0.07 * number_of_tracks)
            if heatmap_height < 0.4:
                # With many tracks the heatmap would shrink too much (or its
                # ratio would even turn negative): keep 40% for the heatmap
                # and share the remaining 60% between the tracks.
                heatmap_height = 0.4
                heatmap_width = 0.42
                bigwig_heights = [0.6 / number_of_tracks] * number_of_tracks

            if args.bigwigAdditionalVerticalAxis:
                # columns: one vertical track axis per bigwig, heatmap, colorbar
                gs = gridspec.GridSpecFromSubplotSpec(
                    1 + number_of_tracks, 2 + number_of_tracks,
                    height_ratios=[heatmap_height] + bigwig_heights,
                    width_ratios=bigwig_heights + [heatmap_width, 0.03],
                    subplot_spec=grids[row, col], wspace=0.1, hspace=0.1)
                axis = plt.subplot(gs[0, number_of_tracks])
                ax2_list = [plt.subplot(gs[1 + i, number_of_tracks])
                            for i in range(number_of_tracks)]
                bigwig_vertical_axis_list = [plt.subplot(gs[0, i])
                                             for i in range(number_of_tracks)]
                ax3 = plt.subplot(gs[0, number_of_tracks + 1])
                bigwig_info['axis'] = ax2_list
                bigwig_info['axis_colorbar'] = ax3
                bigwig_info['axis_vertical'] = bigwig_vertical_axis_list
            else:
                # columns: heatmap, colorbar
                gs = gridspec.GridSpecFromSubplotSpec(
                    1 + number_of_tracks, 2,
                    height_ratios=[heatmap_height] + bigwig_heights,
                    width_ratios=[0.97, 0.03],
                    subplot_spec=grids[row, col], wspace=0.1, hspace=0.1)
                axis = plt.subplot(gs[0, 0])
                ax2_list = [plt.subplot(gs[1 + i, 0])
                            for i in range(number_of_tracks)]
                ax3 = plt.subplot(gs[0, 1])
                bigwig_info['axis'] = ax2_list
                bigwig_info['axis_colorbar'] = ax3
        else:
            axis = plt.subplot(grids[row, col])
            axis.set_title(toString(chrname))

        chrom_range = hic_matrix.getChrBinRange(chrname)
        matrix = np.asarray(hic_matrix.matrix[chrom_range[0]:chrom_range[1],
                                              chrom_range[0]:chrom_range[1]].todense().astype(float))

        norm = None
        if args.log or args.log1p:
            mask = matrix == 0
            mask_nan = np.isnan(matrix)
            mask_inf = np.isinf(matrix)
            log.debug("any nan {}".format(np.isnan(matrix).any()))
            log.debug("any inf {}".format(np.isinf(matrix).any()))

            # zero, nan and inf entries are replaced by the smallest of the
            # remaining values; this is best effort and only logged on failure
            try:
                matrix[mask] = np.nanmin(matrix[~mask])
                matrix[mask_nan] = np.nanmin(matrix[~mask_nan])
                matrix[mask_inf] = np.nanmin(matrix[~mask_inf])
            except Exception:
                log.debug("Clearing of matrix failed.")
            log.debug("any nanafter remove of nan: {}".format(
                np.isnan(matrix).any()))
            log.debug("any inf after remove of inf: {}".format(
                np.isinf(matrix).any()))
        if args.log1p:
            matrix += 1
            norm = LogNorm()
        elif args.log:
            norm = LogNorm()

        chr_bin_boundary = OrderedDict()
        chr_bin_boundary[chrname] = hic_matrix.get_chromosome_sizes()[chrname]

        # getRegion reads the region from args, so it is set to the whole
        # chromosome before the call
        args.region = toString(chrname)
        (_, _, _, _, start_pos1,
         _, _, _, _, start_pos2) = getRegion(args, hic_matrix)
        plotHeatmap(matrix, chr_bin_boundary, fig, None,
                    args, cmap, xlabel=chrname, ylabel=chrname,
                    start_pos=start_pos1, start_pos2=start_pos2, pNorm=norm, pAxis=axis,
                    pBigwig=bigwig_info,
                    pChromsomeStartEndDict=chromosome_start_end(hic_matrix),
                    pResolution=pResolution)
    return fig
def main(args=None):
    """
    Filter and iteratively correct a Hi-C matrix and save the result.

    Steps, in order: bins without any count are masked; nan and inf values
    are set to zero; if ``plotName`` is among the arguments only the
    diagnostic plot is produced and the function returns; otherwise bins
    selected by ``filter_by_zscore``, bins with low coverage
    (``args.sequencedCountCutoff``) and, after the correction, bins inflated
    by at least ``args.inflationCutoff`` are masked. The corrected matrix is
    written to ``args.outFileName``.

    :param args: command line arguments handed to the parser returned by
                 ``parse_arguments()``
    :return: None
    """
    args = parse_arguments().parse_args(args)

    if args.verbose:
        log.setLevel(logging.INFO)

    # when a single chromosome of a cooler file is requested, only that
    # chromosome is loaded
    if check_cooler(args.matrix) and args.chromosomes is not None and len(args.chromosomes) == 1:
        ma = hm.hiCMatrix(args.matrix, pChrnameList=toString(args.chromosomes))
    else:
        ma = hm.hiCMatrix(args.matrix)

        if args.chromosomes:
            ma.reorderChromosomes(toString(args.chromosomes))

    # mask all zero value bins
    row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
    zero_rows = row_sum == 0
    log.info("Removing {} zero value bins".format(np.count_nonzero(zero_rows)))
    ma.maskBins(np.flatnonzero(zero_rows))
    matrix_shape = ma.matrix.shape
    ma.matrix = convertNansToZeros(ma.matrix)
    ma.matrix = convertInfsToZeros(ma.matrix)

    if 'plotName' in args:
        plot_total_contact_dist(ma, args)
        log.info("Saving diagnostic plot {}\n".format(args.plotName))
        return

    log.info("matrix contains {} data points. Sparsity {:.3f}.".format(
        len(ma.matrix.data),
        float(len(ma.matrix.data)) / (ma.matrix.shape[0] ** 2)))

    if args.skipDiagonal:
        ma.diagflat(value=0)

    outlier_regions = filter_by_zscore(ma, args.filterThreshold[0], args.filterThreshold[1],
                                       perchr=args.perchr)
    # compute and print some statistics
    pct_outlier = 100 * float(len(outlier_regions)) / ma.matrix.shape[0]
    ma.printchrtoremove(outlier_regions, label="Bins that are MAD outliers ({:.2f}%) "
                        "out of".format(pct_outlier, ma.matrix.shape[0]),
                        restore_masked_bins=False)

    assert matrix_shape == ma.matrix.shape
    # mask filtered regions
    ma.maskBins(outlier_regions)
    total_filtered_out = set(outlier_regions)

    if args.sequencedCountCutoff and 0 < args.sequencedCountCutoff < 1:
        _, _, _, coverage = zip(*ma.cut_intervals)

        assert isinstance(coverage[0], np.float64)

        failed_bins = np.flatnonzero(
            np.array(coverage) < args.sequencedCountCutoff)

        ma.printchrtoremove(failed_bins, label="Bins with low coverage",
                            restore_masked_bins=False)
        ma.maskBins(failed_bins)
        # add to the bins collected so far instead of replacing them, as is
        # done for the inflated bins below
        total_filtered_out = total_filtered_out.union(failed_bins)

    if args.transCutoff and 0 < args.transCutoff < 100:
        cutoff = float(args.transCutoff) / 100
        # a usual cutoff is 0.05
        ma.truncTrans(high=cutoff)

    pre_row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
    correction_factors = []
    if args.perchr:
        corrected_matrix = lil_matrix(ma.matrix.shape)
        # normalize each chromosome independently
        for chrname in list(ma.interval_trees):
            chr_start, chr_end = ma.getChrBinRange(chrname)
            chr_submatrix = ma.matrix[chr_start:chr_end, chr_start:chr_end]
            _matrix, _corr_factors = iterative_correction(chr_submatrix, args)
            corrected_matrix[chr_start:chr_end, chr_start:chr_end] = _matrix
            correction_factors.append(_corr_factors)
        correction_factors = np.concatenate(correction_factors)
    else:
        corrected_matrix, correction_factors = iterative_correction(ma.matrix, args)

    ma.setMatrixValues(corrected_matrix)
    ma.setCorrectionFactors(correction_factors)
    log.info("Correction factors {}".format(correction_factors[:10]))

    if args.inflationCutoff and args.inflationCutoff > 0:
        after_row_sum = np.asarray(corrected_matrix.sum(axis=1)).flatten()
        # identify rows that were expanded more than args.inflationCutoff times
        to_remove = np.flatnonzero(after_row_sum / pre_row_sum >= args.inflationCutoff)
        ma.printchrtoremove(to_remove,
                            label="inflated >={} "
                            "regions".format(args.inflationCutoff), restore_masked_bins=False)
        total_filtered_out = total_filtered_out.union(to_remove)
        ma.maskBins(to_remove)

    ma.printchrtoremove(sorted(list(total_filtered_out)),
                        label="Total regions to be removed", restore_masked_bins=False)

    ma.save(args.outFileName, pApplyCorrection=False)
def _build_commands(self):
    """
    Build one perf command per configured option string.

    ``self.optionstrings`` and ``self.labels`` are walked in lockstep and
    each pair is handed, together with ``self.events``, to
    ``self._build_perf_command``.

    :return: list with the results of ``_build_perf_command``, in order
    """
    return [
        self._build_perf_command(option_string, self.events, tag)
        for option_string, tag in zip(self.optionstrings, self.labels)
    ]
def plot(self, ax, chrom_region, region_start, region_end):
    """
    Draw the Hi-C matrix of the given region as a 45 degree rotated heatmap.

    The region is extended by ``self.properties['depth']`` on both sides
    (limited to the chromosome) before the sub-matrix is selected. The
    optional properties 'scale factor', 'transform' ('log1p', '-log' or
    'log'), 'max_value' and 'min_value' are applied. The image returned by
    ``self.pcolormesh_45deg`` is stored in ``self.img``.

    :param ax: matplotlib axis to draw on
    :param chrom_region: chromosome name
    :param region_start: start of the region to plot
    :param region_end: end of the region to plot
    :return: None
    """
    chrom_sizes = self.hic_ma.get_chromosome_sizes()
    if chrom_region not in chrom_sizes:
        chrom_region = self.change_chrom_names(chrom_region)
    chrom_region = self.check_chrom_str_bytes(chrom_sizes, chrom_region)
    if region_end > chrom_sizes[chrom_region]:
        self.log.error("*Error*\nThe region to plot extends beyond the chromosome size. Please check.\n")
        self.log.error("{} size: {}. Region to plot {}-{}\n".format(chrom_region, chrom_sizes[chrom_region],
                                                                    region_start, region_end))

    if self.properties['file'].endswith('.cool'):
        # load now the region to be plotted
        pass

    # expand region to plus depth on both sides
    # to avoid a 45 degree 'cut' on the edges

    # get bin id of start and end of region in given chromosome
    chr_start_id, chr_end_id = self.hic_ma.getChrBinRange(chrom_region)
    chr_start = self.hic_ma.cut_intervals[chr_start_id][1]
    chr_end = self.hic_ma.cut_intervals[chr_end_id - 1][1]
    start_bp = max(chr_start, region_start - self.properties['depth'])
    end_bp = min(chr_end, region_end + self.properties['depth'])

    idx, start_pos = zip(*[(idx, x[1]) for idx, x in
                           enumerate(self.hic_ma.cut_intervals)
                           if x[0] == chrom_region and x[1] >= start_bp and x[2] <= end_bp])

    # start_pos keeps one more entry than there are bins in the matrix
    idx = idx[0:-1]
    # select only relevant matrix part
    matrix = self.hic_ma.matrix[idx, :][:, idx]

    # limit the 'depth' based on the length of the region being viewed
    region_len = region_end - region_start
    depth = min(self.properties['depth'], int(region_len * 1.25))
    depth_in_bins = int(1.5 * region_len / self.hic_ma.getBinSize())

    if depth < self.properties['depth']:
        # remove from matrix all data points that are not visible.
        matrix = matrix - scipy.sparse.triu(matrix, k=depth_in_bins, format='csr')
    matrix = np.asarray(matrix.todense().astype(float))
    if 'scale factor' in self.properties:
        matrix = matrix * self.properties['scale factor']

    if 'transform' in self.properties:
        transform = self.properties['transform']
        if transform == 'log1p':
            matrix += 1
            self.norm = colors.LogNorm()
        elif transform in ('-log', 'log'):
            # Zeros are replaced by the smallest non-zero value before the
            # logarithm is taken. The mask has to be inverted element-wise
            # (~mask); `mask is False` is a plain identity test that always
            # yields False and selects nothing.
            mask = matrix == 0
            matrix[mask] = matrix[~mask].min()
            matrix = np.log(matrix)
            if transform == '-log':
                matrix = -1 * matrix

    if 'max_value' in self.properties and self.properties['max_value'] != 'auto':
        vmax = self.properties['max_value']
    else:
        # try to use a 'aesthetically pleasant' max value
        vmax = np.percentile(matrix.diagonal(1), 80)

    if 'min_value' in self.properties and self.properties['min_value'] != 'auto':
        vmin = self.properties['min_value']
    else:
        # if the region length is large with respect to the chromosome length, the diagonal may have
        # very few values or none. Thus, the following lines reduce the number of bins until the
        # diagonal is at least length 5
        num_bins_from_diagonal = int(region_len / self.hic_ma.getBinSize())
        # fall back to the main diagonal when the region is shorter than one
        # bin and the loop below does not run at all
        distant_diagonal_values = matrix.diagonal(0)
        for num_bins in range(0, num_bins_from_diagonal)[::-1]:
            distant_diagonal_values = matrix.diagonal(num_bins)
            if len(distant_diagonal_values) > 5:
                break

        vmin = np.median(distant_diagonal_values)

    self.log.info("setting min, max values for track {} to: {}, {}\n".
                  format(self.properties['section_name'], vmin, vmax))
    self.img = self.pcolormesh_45deg(ax, matrix, start_pos, vmax=vmax, vmin=vmin)
    self.img.set_rasterized(True)
    if self.plot_inverted:
        ax.set_ylim(depth, 0)
    else:
        ax.set_ylim(0, depth)
def save(self, filename, pSymmetric=True, pApplyCorrection=None):
    """
    Saves a matrix using hdf5 format

    The csr parts of the matrix are stored in the group ``/matrix`` and the
    bin intervals in the group ``/intervals``. ``nan_bins``,
    ``correction_factors`` and ``distance_counts`` are stored at the root
    of the file when they are not empty. An existing file of the same name
    is deleted first.

    :param filename: output file name; '.h5' is appended if it is missing
    :param pSymmetric: if true, only the upper triangle (including the
                       diagonal) of the matrix is stored
    :param pApplyCorrection: not used by this method
    :return: None
    """
    log.debug('Save in h5 format')

    if not filename.endswith(".h5"):
        filename += ".h5"

    # an already existing file is replaced
    if os.path.isfile(filename):
        log.warning("*WARNING* File already exists {}\n "
                    "Overwriting ...\n".format(filename))
        unlink(filename)

    if self.nan_bins is None:
        self.nan_bins = np.array([])
    elif not isinstance(self.nan_bins, np.ndarray):
        self.nan_bins = np.array(self.nan_bins)

    if pSymmetric:
        # save only the upper triangle of the symmetric matrix
        matrix = triu(self.matrix, k=0, format='csr')
    else:
        matrix = self.matrix
    matrix.eliminate_zeros()

    filters = tables.Filters(complevel=5, complib='blosc')
    with tables.open_file(filename, mode="w", title="HiCExplorer matrix") as h5file:

        def _store(parent, name, arr):
            # create a compressed array matching `arr` and fill it
            atom = tables.Atom.from_dtype(arr.dtype)
            ds = h5file.create_carray(parent, name, atom,
                                      shape=arr.shape,
                                      filters=filters)
            ds[:] = arr

        # save the parts of the csr matrix
        matrix_group = h5file.create_group("/", "matrix", )
        for matrix_part in ('data', 'indices', 'indptr', 'shape'):
            _store(matrix_group, matrix_part, np.array(getattr(matrix, matrix_part)))

        # save the matrix intervals
        intervals_group = h5file.create_group("/", "intervals", )
        chr_list, start_list, end_list, extra_list = zip(*self.cut_intervals)
        interval_parts = (('chr_list', chr_list),
                          ('start_list', start_list),
                          ('end_list', end_list),
                          ('extra_list', extra_list))
        for interval_part, values in interval_parts:
            _store(intervals_group, interval_part, np.array(values))

        # save nan bins
        if len(self.nan_bins):
            _store(h5file.root, 'nan_bins', self.nan_bins)

        # save corrections factors
        if self.correction_factors is not None and len(self.correction_factors):
            self.correction_factors = np.array(self.correction_factors)
            # nan factors are stored as 0
            mask = np.isnan(self.correction_factors)
            self.correction_factors[mask] = 0
            _store(h5file.root, 'correction_factors', self.correction_factors)

        # save distance counts
        if self.distance_counts is not None and len(self.distance_counts):
            _store(h5file.root, 'distance_counts', self.distance_counts)