def _accept_p_value(pSignificanceLevel, pPValue):
    """Classify one rank-sum p-value against the acceptance threshold.

    Returns a tuple ``(accepted, p_value)``. ``accepted`` is 1 when the
    p-value is less than or equal to ``pPValue`` and 0 otherwise. A missing
    (``None``) or NaN p-value is reported as ``(0, np.nan)``.
    """
    if pSignificanceLevel is None or np.isnan(pSignificanceLevel):
        return 0, np.nan
    if pSignificanceLevel <= pPValue:
        return 1, pSignificanceLevel
    return 0, pSignificanceLevel


def computeDifferentialTADs(pMatrixTarget, pMatrixControl, pDomainList,
                            pCoolOrH5, pPValue, pThreadId, pQueue):
    """Compare target and control contacts for every domain with rank-sum tests.

    For each processed row of ``pDomainList`` three Wilcoxon rank-sum tests
    (``ranksums``) are run between target and control: the block between the
    domain and its left neighbour, the block between the domain and its right
    neighbour, and the domain's own (intra) block.

    Parameters
    ----------
    pMatrixTarget, pMatrixControl
        If ``pCoolOrH5`` is truthy these are passed as ``pMatrixFile`` to
        ``hm.hiCMatrix`` and only the needed regions are loaded. Otherwise
        they must already be matrix objects exposing ``.matrix`` and
        ``getRegionBinRange(chrom, start, end)``.
    pDomainList
        Sequence of rows; ``row[0]`` is the chromosome, ``row[1]`` the start
        and ``row[2]`` the end of a domain.
    pCoolOrH5
        Truthy: load regions from file (cooler); falsy: use the in-memory
        matrix objects (h5).
    pPValue
        A test is accepted if its p-value is ``<= pPValue``.
    pThreadId
        ``None`` skips the last row, ``True`` skips the first and the last
        row, ``False`` skips the first row.
        NOTE(review): presumably the skipped rows are neighbour context that
        another worker evaluates - confirm against the caller.
    pQueue
        Receives, via ``put``, the list ``[p_values_list, accepted_inter_left,
        accepted_inter_right, accepted_intra, rows]``. Each entry of
        ``p_values_list`` is ``[p_left, p_right, p_intra]``.
    """
    accepted_inter_left = []
    accepted_inter_right = []
    accepted_intra = []
    p_values_list = []
    rows = []
    domain_count = len(pDomainList)

    for i, row in enumerate(pDomainList):
        if pThreadId is None:
            log.debug('first thread')
            if i == domain_count - 1:
                continue
        elif pThreadId == True:  # noqa: E712 - keep the original equality semantics
            log.debug('middle thread')
            if i == 0 or i == domain_count - 1:
                continue
        elif pThreadId == False:  # noqa: E712
            log.debug('last thread')
            if i == 0:
                continue

        has_left = i - 1 >= 0
        has_right = i + 1 < domain_count

        # The neighbourhood spans from the start of the left neighbour to the
        # end of the right neighbour (or the domain itself at the borders).
        if has_left:
            chromosom = pDomainList[i - 1][0]
            start = pDomainList[i - 1][1]
        else:
            chromosom = pDomainList[i][0]
            start = pDomainList[i][1]
        end = pDomainList[i + 1][2] if has_right else pDomainList[i][2]

        if pCoolOrH5:
            tad_region = str(row[0]) + ':' + str(row[1]) + '-' + str(row[2])
            neighbourhood_region = str(chromosom) + ':' + str(start) + '-' + str(end)

            # intra-TAD data: load only the domain itself
            hic_matrix_target = hm.hiCMatrix(pMatrixFile=pMatrixTarget,
                                             pChrnameList=[tad_region])
            hic_matrix_control = hm.hiCMatrix(pMatrixFile=pMatrixControl,
                                              pChrnameList=[tad_region])
            matrix_target = hic_matrix_target.matrix.toarray()
            matrix_control = hic_matrix_control.matrix.toarray()

            # inter-TAD data: load the domain together with its neighbours
            hic_matrix_target_inter_tad = hm.hiCMatrix(
                pMatrixFile=pMatrixTarget, pChrnameList=[neighbourhood_region])
            hic_matrix_control_inter_tad = hm.hiCMatrix(
                pMatrixFile=pMatrixControl, pChrnameList=[neighbourhood_region])
            matrix_target_inter_tad = hic_matrix_target_inter_tad.matrix
            matrix_control_inter_tad = hic_matrix_control_inter_tad.matrix
        else:
            # in case of h5 pMatrixTarget is already a HiCMatrix object
            hic_matrix_target = pMatrixTarget
            hic_matrix_control = pMatrixControl
            hic_matrix_target_inter_tad = pMatrixTarget
            hic_matrix_control_inter_tad = pMatrixControl

            indices_target = hic_matrix_target.getRegionBinRange(
                str(row[0]), row[1], row[2])
            indices_control = hic_matrix_control.getRegionBinRange(
                str(row[0]), row[1], row[2])
            matrix_target = hic_matrix_target.matrix[
                indices_target[0]:indices_target[1],
                indices_target[0]:indices_target[1]].toarray()
            matrix_control = hic_matrix_control.matrix[
                indices_control[0]:indices_control[1],
                indices_control[0]:indices_control[1]].toarray()
            matrix_target_inter_tad = pMatrixTarget.matrix
            matrix_control_inter_tad = pMatrixControl.matrix

        matrix_target = matrix_target.flatten()
        matrix_control = matrix_control.flatten()

        # bin index of the boundary between the left neighbour and this domain
        left_boundary_index_target = hic_matrix_target_inter_tad.getRegionBinRange(
            str(chromosom), row[1], row[1])[0]
        left_boundary_index_control = hic_matrix_control_inter_tad.getRegionBinRange(
            str(chromosom), row[1], row[1])[0]

        if pCoolOrH5:
            # the loaded sub-matrix covers exactly the neighbourhood
            outer_left_boundary_index_target = 0
            outer_left_boundary_index_control = 0
            outer_right_boundary_index_control = -1
            outer_right_boundary_index_target = -1
        else:
            target_range = hic_matrix_target_inter_tad.getRegionBinRange(
                str(chromosom), start, end)
            control_range = hic_matrix_control_inter_tad.getRegionBinRange(
                str(chromosom), start, end)
            outer_left_boundary_index_target = target_range[0]
            outer_left_boundary_index_control = control_range[0]
            outer_right_boundary_index_control = control_range[1]
            outer_right_boundary_index_target = target_range[1]

        if has_right:
            # bin index of the boundary between this domain and its right
            # neighbour. The original cooler branch tested
            # ``i + 1 < len(pDomainList) - 1`` and therefore reused the
            # previous iteration's index for the second-to-last domain.
            right_boundary_index_target = hic_matrix_target_inter_tad.getRegionBinRange(
                str(chromosom), row[2], row[2])[0]
            right_boundary_index_control = hic_matrix_control_inter_tad.getRegionBinRange(
                str(chromosom), row[2], row[2])[0]

        significance_level_left = None
        significance_level_right = None
        statistic_left = None
        statistic_right = None

        if has_left and has_right:
            # both neighbours are available
            intertad_left_target = matrix_target_inter_tad[
                outer_left_boundary_index_target:left_boundary_index_target,
                left_boundary_index_target:right_boundary_index_target].toarray().flatten()
            intertad_right_target = matrix_target_inter_tad[
                left_boundary_index_target:right_boundary_index_target,
                right_boundary_index_target:outer_right_boundary_index_target].toarray().flatten()
            intertad_left_control = matrix_control_inter_tad[
                outer_left_boundary_index_control:left_boundary_index_control,
                left_boundary_index_control:right_boundary_index_control].toarray().flatten()
            intertad_right_control = matrix_control_inter_tad[
                left_boundary_index_control:right_boundary_index_control,
                right_boundary_index_control:outer_right_boundary_index_control].toarray().flatten()

            statistic_left, significance_level_left = ranksums(
                intertad_left_target, intertad_left_control)
            statistic_right, significance_level_right = ranksums(
                intertad_right_target, intertad_right_control)
        elif not has_left and has_right:
            # first domain: only a right neighbour exists
            intertad_right_target = matrix_target_inter_tad[
                left_boundary_index_target:right_boundary_index_target,
                right_boundary_index_target:outer_right_boundary_index_target].toarray().flatten()
            intertad_right_control = matrix_control_inter_tad[
                left_boundary_index_control:right_boundary_index_control,
                right_boundary_index_control:outer_right_boundary_index_control].toarray().flatten()

            statistic_right, significance_level_right = ranksums(
                intertad_right_target, intertad_right_control)
        elif i - 1 > 0 and not has_right:
            # last domain: only a left neighbour exists.
            # NOTE(review): the right boundary index used here is the value
            # left over from the previous iteration (nothing sets it for the
            # last domain), and ``i - 1 > 0`` excludes i == 1 - both kept
            # as in the original; confirm the intended behaviour.
            intertad_left_target = matrix_target_inter_tad[
                outer_left_boundary_index_target:left_boundary_index_target,
                left_boundary_index_target:right_boundary_index_target].toarray().flatten()
            intertad_left_control = matrix_control_inter_tad[
                outer_left_boundary_index_control:left_boundary_index_control,
                left_boundary_index_control:right_boundary_index_control].toarray().flatten()
            log.debug('intertad_left_target {}'.format(intertad_left_target))
            log.debug('intertad_left_control {}'.format(intertad_left_control))

            statistic_left, significance_level_left = ranksums(
                intertad_left_target, intertad_left_control)

        statistic, significance_level = ranksums(matrix_target, matrix_control)
        log.debug('statistic {}, significance_level {}'.format(
            statistic, significance_level))
        log.debug('right statistic {}, significance_level {}'.format(
            statistic_right, significance_level_right))
        log.debug('left statistic {}, significance_level {}'.format(
            statistic_left, significance_level_left))

        accepted_left, p_left = _accept_p_value(significance_level_left, pPValue)
        accepted_right, p_right = _accept_p_value(significance_level_right, pPValue)
        accepted_intra_tad, p_intra = _accept_p_value(significance_level, pPValue)

        accepted_inter_left.append(accepted_left)
        accepted_inter_right.append(accepted_right)
        accepted_intra.append(accepted_intra_tad)
        p_values_list.append([p_left, p_right, p_intra])
        rows.append(row)

    pQueue.put([
        p_values_list, accepted_inter_left, accepted_inter_right,
        accepted_intra, rows
    ])
def open_and_store_matrix(pMatrixName, pMatricesList, pIndex, pXDimension,
                          pChromosomes, pNorm, pExtraTrack, pHistonMarkType,
                          pBinarization, pQueue):
    """Compute the first eigenvector of every matrix and stack them row-wise.

    Each entry of ``pMatricesList`` is opened as ``pMatrixName + '::' + entry``.
    Per chromosome, an observed/expected matrix is built, turned into a Pearson
    correlation matrix, its covariance is decomposed with ``linalg.eig`` and the
    first eigenvector column (``k = 1``) is kept. The eigenvectors of all
    chromosomes are concatenated into one row per matrix.

    Parameters
    ----------
    pMatrixName : str
        Container file name; the matrix name is appended after ``'::'``.
    pMatricesList : sequence of str
        Names of the matrices to process.
    pIndex : int
        Row offset; matrix ``i`` is written to row ``pIndex + i``.
    pXDimension : int
        Number of rows of the result array.
    pChromosomes
        If truthy, passed to ``keepOnlyTheseChr`` to restrict the matrix.
    pNorm
        Truthy selects ``obs_exp_matrix_norm``; otherwise
        ``obs_exp_matrix_lieberman`` is used.
    pExtraTrack : str or None
        Optional ``.bw`` / ``.bigwig`` file; when given, every eigenvector is
        passed to ``correlateEigenvectorWithHistonMarkTrack``.
    pHistonMarkType
        Forwarded to ``correlateEigenvectorWithHistonMarkTrack``.
    pBinarization
        If truthy, values ``<= 0`` become -1 and values ``> 0`` become 1.
    pQueue
        Receives the resulting array via ``put`` (``None`` if
        ``pMatricesList`` is empty).
    """
    compartments_matrix = None
    use_bigwig_track = bool(pExtraTrack) and pExtraTrack.endswith(('.bw', '.bigwig'))
    k = 1  # number of eigenvectors kept per chromosome

    for i, matrix in enumerate(pMatricesList):
        ma = hm.hiCMatrix(pMatrixName + '::' + matrix)

        # WARNING
        # DO NOT APPLY BIN MASKING, WILL LEAD TO DIFFERENT SIZES OF THE CHROMOSOMES
        # THIS IS CAUSING A FAIL OF THE COMPUTATION
        # ma.maskBins(ma.nan_bins)
        if pChromosomes:
            ma.keepOnlyTheseChr(pChromosomes)

        vecs_list = []

        # PCA is computed per chromosome
        chromosome_names = ma.getChrNames()
        chromosome_count = len(chromosome_names)
        length_chromosome = 0
        for chrname in chromosome_names:
            chr_range = ma.getChrBinRange(chrname)
            length_chromosome += chr_range[1] - chr_range[0]

        if use_bigwig_track:
            bwTrack = pyBigWig.open(pExtraTrack, 'r')

        for chrname in ma.getChrNames():
            chr_range = ma.getChrBinRange(chrname)
            submatrix = ma.matrix[chr_range[0]:chr_range[1],
                                  chr_range[0]:chr_range[1]]

            if pNorm:
                obs_exp_matrix_ = obs_exp_matrix_norm(submatrix)
            else:
                obs_exp_matrix_ = obs_exp_matrix_lieberman(
                    submatrix, length_chromosome, chromosome_count)
            obs_exp_matrix_ = convertNansToZeros(
                csr_matrix(obs_exp_matrix_)).todense()
            obs_exp_matrix_ = convertInfsToZeros(
                csr_matrix(obs_exp_matrix_)).todense()

            pearson_correlation_matrix = np.corrcoef(obs_exp_matrix_)
            pearson_correlation_matrix = convertNansToZeros(
                csr_matrix(pearson_correlation_matrix)).todense()
            pearson_correlation_matrix = convertInfsToZeros(
                csr_matrix(pearson_correlation_matrix)).todense()

            corrmatrix = np.cov(pearson_correlation_matrix)
            corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
            corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
            _, eigs = linalg.eig(corrmatrix)

            if use_bigwig_track:
                _, start, end, _ = zip(
                    *ma.cut_intervals[chr_range[0]:chr_range[1]])
                assert (len(end) == len(start))
                correlateEigenvectorWithHistonMarkTrack(
                    eigs[:, :k].transpose(), bwTrack, chrname, start, end,
                    pExtraTrack, pHistonMarkType)

            vecs_list += eigs[:, :k].tolist()

        if use_bigwig_track:
            # the handle is re-opened for every matrix, so release it here
            bwTrack.close()

        eigenvector = np.real(np.array(vecs_list).flatten())

        if compartments_matrix is None:
            # ``np.float`` was removed in NumPy 1.24; the builtin ``float``
            # is the same dtype (float64).
            compartments_matrix = np.zeros([pXDimension, len(eigenvector)],
                                           dtype=float)

        # replace NaN and +/-inf by zero
        eigenvector[~np.isfinite(eigenvector)] = 0

        if pBinarization:
            mask = eigenvector <= 0
            eigenvector[mask] = -1
            mask = eigenvector > 0
            eigenvector[mask] = 1

        compartments_matrix[pIndex + i, :] = eigenvector

    pQueue.put(compartments_matrix)

    return
def main(args=None):
    """Average the contact sub-matrices around a list of reference points.

    Reads ``args.regions`` (tab-separated; two columns ``chrom, start`` or at
    least three columns ``chrom, start, end``), determines the bin window
    around every reference point - from ``args.range`` (genomic distance) or
    ``args.rangeInBins`` - sums the corresponding square sub-matrices of
    ``args.matrix``, divides by the number of regions and writes the result
    as a sparse ``.npz`` file to ``args.outFileName``.

    Blank lines and lines with fewer than two columns are skipped.
    """
    args = parse_arguments().parse_args(args)
    hic_ma = hm.hiCMatrix(pMatrixFile=args.matrix)
    indices_values = []

    with open(args.regions, 'r') as region_file:
        for line in region_file:
            fields = line.strip().split('\t')
            if len(fields) < 2:
                # Blank or malformed line. The original check looked at the
                # raw line (which still contains the newline), so such lines
                # fell through and reused the previous viewpoint.
                continue
            if len(fields) == 2:
                chrom, start = fields[0], fields[1]
                viewpoint = (chrom, start, start)
            else:
                chrom, start, end = fields[0], fields[1], fields[2]
                viewpoint = (chrom, start, end)

            # Windows reaching beyond the matrix borders are not dropped
            # here; they are clipped in the summation loop below.
            if args.range:
                start_range_genomic, end_range_genomic, _ = calculateViewpointRange(
                    hic_ma, viewpoint, args.range)
                start_bin, end_bin = getBinIndices(
                    hic_ma, (chrom, start_range_genomic, end_range_genomic))
            else:
                start_bin, end_bin = calculateViewpointRangeBins(
                    hic_ma, viewpoint, args.rangeInBins)
            indices_values.append([start_bin, end_bin])

    if args.range:
        dimensions_new_matrix = (args.range[0] // hic_ma.getBinSize()) + (
            args.range[1] // hic_ma.getBinSize())
    elif args.rangeInBins:
        dimensions_new_matrix = args.rangeInBins[0] + args.rangeInBins[1]

    summed_matrix = lil_matrix((dimensions_new_matrix, dimensions_new_matrix),
                               dtype=np.float32)
    max_length = hic_ma.matrix.shape[1]

    for start, end in indices_values:
        # [_start, _end) is the destination slice inside the summed window,
        # [start, end) the source slice inside the full matrix.
        _start = 0
        _end = summed_matrix.shape[1]
        if start < 0:
            # the first |start| bins of the window lie before the matrix
            _start = np.absolute(start)
            start = 0
        if end > max_length:
            # The last (end - max_length) bins of the window lie behind the
            # matrix; shrink the destination by the same amount so source and
            # destination keep the same shape (the original set
            # ``_end = end``, which left the destination at full size).
            # NOTE(review): assumes end - start equals the window size.
            _end = summed_matrix.shape[1] - (end - max_length)
            end = max_length
        summed_matrix[_start:_end, _start:_end] += hic_ma.matrix[start:end,
                                                                 start:end]

    summed_matrix /= len(indices_values)
    summed_matrix = summed_matrix.tocsr()
    save_npz(args.outFileName, summed_matrix)
def main(args=None):
    """Correct a Hi-C matrix with ICE or KR, or only write the diagnostic plot.

    Workflow as implemented below:

    1. Load ``args.matrix`` (restricted to / reordered by ``args.chromosomes``).
    2. For ICE correction and for the diagnostic plot, mask bins whose row
       sum is zero; replace NaN/inf values by zero and cast to float64.
    3. If ``plotName`` is present in ``args``, write the diagnostic plot and
       return.
    4. For ICE: mask z-score outliers (``args.filterThreshold``), optionally
       mask low-coverage bins (``args.sequencedCountCutoff``) and truncate
       trans contacts (``args.transCutoff``).
    5. Run the correction (per chromosome if ``args.perchr``), store the
       correction factors, optionally mask bins inflated by at least
       ``args.inflationCutoff`` (ICE only) and save to ``args.outFileName``.

    Exits with status 1 if ICE is requested without ``args.filterThreshold``.
    """
    args = parse_arguments().parse_args(args)

    if args.verbose:
        log.setLevel(logging.INFO)

    # load only the requested chromosome if the cooler format allows it
    if check_cooler(args.matrix) and args.chromosomes is not None and len(
            args.chromosomes) == 1:
        ma = hm.hiCMatrix(args.matrix, pChrnameList=toString(args.chromosomes))
    else:
        ma = hm.hiCMatrix(args.matrix)

        if args.chromosomes:
            ma.reorderChromosomes(toString(args.chromosomes))

    # mask all zero value bins
    if 'correctionMethod' in args:
        if args.correctionMethod == 'ICE':
            row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
            log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
            ma.maskBins(np.flatnonzero(row_sum == 0))
            matrix_shape = ma.matrix.shape
    if 'plotName' in args:
        row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
        log.info("Removing {} zero value bins".format(sum(row_sum == 0)))
        ma.maskBins(np.flatnonzero(row_sum == 0))
        matrix_shape = ma.matrix.shape

    ma.matrix = convertNansToZeros(ma.matrix)
    ma.matrix = convertInfsToZeros(ma.matrix)
    ma.matrix = ma.matrix.astype(np.float64, copy=True)

    log.debug('ma.matrix.indices {}'.format(ma.matrix.indices.dtype))
    log.debug('ma.matrix.data {}'.format(ma.matrix.data.dtype))
    log.debug('ma.matrix.indptr {}'.format(ma.matrix.indptr.dtype))

    if 'plotName' in args:
        plot_total_contact_dist(ma, args)
        log.info("Saving diagnostic plot {}\n".format(args.plotName))
        return

    log.info("matrix contains {} data points. Sparsity {:.3f}.".format(
        len(ma.matrix.data),
        float(len(ma.matrix.data)) / (ma.matrix.shape[0]**2)))

    if args.skipDiagonal:
        ma.diagflat(value=0)

    total_filtered_out = set()
    if args.correctionMethod == 'ICE':
        if not args.filterThreshold:
            log.error('min and max filtering thresholds should be set')
            sys.exit(1)
        outlier_regions = filter_by_zscore(ma,
                                           args.filterThreshold[0],
                                           args.filterThreshold[1],
                                           perchr=args.perchr)
        # compute and print some statistics
        pct_outlier = 100 * float(len(outlier_regions)) / ma.matrix.shape[0]
        ma.printchrtoremove(outlier_regions,
                            label="Bins that are MAD outliers ({:.2f}%) "
                            "out of {}".format(pct_outlier,
                                               ma.matrix.shape[0]),
                            restore_masked_bins=False)

        assert matrix_shape == ma.matrix.shape
        # mask filtered regions
        ma.maskBins(outlier_regions)
        total_filtered_out = set(outlier_regions)

        if args.sequencedCountCutoff and 0 < args.sequencedCountCutoff < 1:
            _, _, _, coverage = zip(*ma.cut_intervals)

            assert type(coverage[0]) == np.float64

            failed_bins = np.flatnonzero(
                np.array(coverage) < args.sequencedCountCutoff)

            ma.printchrtoremove(failed_bins,
                                label="Bins with low coverage",
                                restore_masked_bins=False)
            ma.maskBins(failed_bins)
            # Accumulate instead of overwriting, so the outlier bins are not
            # lost from the final "Total regions to be removed" report (the
            # inflation step below accumulates the same way).
            total_filtered_out = total_filtered_out.union(failed_bins)

        if args.transCutoff and 0 < args.transCutoff < 100:
            cutoff = float(args.transCutoff) / 100
            # a usual cutoff is 0.05
            ma.truncTrans(high=cutoff)

    pre_row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
    correction_factors = []
    corrected_matrix = lil_matrix(ma.matrix.shape)
    if args.perchr:
        # normalize each chromosome independently
        for chrname in list(ma.interval_trees):
            chr_range = ma.getChrBinRange(chrname)
            chr_submatrix = ma.matrix[chr_range[0]:chr_range[1],
                                      chr_range[0]:chr_range[1]]
            if args.correctionMethod == 'ICE':
                _matrix, _corr_factors = iterative_correction(
                    chr_submatrix, args)
                corrected_matrix[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = _matrix
                correction_factors.append(_corr_factors)
            else:
                # Set the kr matrix along with its correction factors vector
                assert (args.correctionMethod == 'KR')
                log.debug("Loading a float sparse matrix for KR balancing")
                kr = kr_balancing(
                    chr_submatrix.shape[0], chr_submatrix.shape[1],
                    chr_submatrix.count_nonzero(),
                    chr_submatrix.indptr.astype(np.int64, copy=False),
                    chr_submatrix.indices.astype(np.int64, copy=False),
                    chr_submatrix.data.astype(np.float64, copy=False))
                kr.computeKR()
                # the balanced matrix itself is only fetched for h5 output
                if args.outFileName.endswith('.h5'):
                    corrected_matrix[
                        chr_range[0]:chr_range[1],
                        chr_range[0]:chr_range[1]] = kr.get_normalised_matrix(
                            True)
                correction_factors.append(
                    kr.get_normalisation_vector(False).todense())

        correction_factors = np.concatenate(correction_factors)

    else:
        if args.correctionMethod == 'ICE':
            corrected_matrix, correction_factors = iterative_correction(
                ma.matrix, args)
            ma.setMatrixValues(corrected_matrix)
        else:
            assert (args.correctionMethod == 'KR')
            log.debug("Loading a float sparse matrix for KR balancing")
            kr = kr_balancing(ma.matrix.shape[0], ma.matrix.shape[1],
                              ma.matrix.count_nonzero(),
                              ma.matrix.indptr.astype(np.int64, copy=False),
                              ma.matrix.indices.astype(np.int64, copy=False),
                              ma.matrix.data.astype(np.float64, copy=False))
            log.debug('passed pointers')
            kr.computeKR()
            log.debug('computation done')

            # set it to False since the vector is already normalised
            # with the previous True
            correction_factors = kr.get_normalisation_vector(False).todense()

            if args.outFileName.endswith('.h5'):
                corrected_matrix = kr.get_normalised_matrix(True)

    if args.outFileName.endswith('.h5'):
        ma.setMatrixValues(corrected_matrix)

    ma.setCorrectionFactors(correction_factors)
    log.debug("Correction factors {}".format(correction_factors[:10]))

    if args.inflationCutoff and args.inflationCutoff > 0 and args.correctionMethod == 'ICE':
        after_row_sum = np.asarray(corrected_matrix.sum(axis=1)).flatten()
        # identify rows that were expanded more than args.inflationCutoff times
        to_remove = np.flatnonzero(
            after_row_sum / pre_row_sum >= args.inflationCutoff)
        ma.printchrtoremove(to_remove,
                            label="inflated >={} "
                            "regions".format(args.inflationCutoff),
                            restore_masked_bins=False)
        total_filtered_out = total_filtered_out.union(to_remove)
        ma.maskBins(to_remove)

    ma.printchrtoremove(sorted(list(total_filtered_out)),
                        label="Total regions to be removed",
                        restore_masked_bins=False)

    ma.save(args.outFileName, pApplyCorrection=False)
def test_restoreMaskedBins():
    """restoreMaskedBins must undo maskBins for matrix, bin ids and intervals."""
    # --- scenario 1: asymmetric matrix, no masked bins at first -----------
    intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                 ('b', 30, 40, 1), ('b', 40, 50, 1)]
    dense_input = np.array([[1, 8, 5, 3, 0],
                            [0, 4, 15, 5, 1],
                            [0, 0, 0, 0, 2],
                            [0, 0, 0, 0, 1],
                            [0, 0, 0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(dense_input)
    hic.setMatrix(hic.matrix, intervals)

    nt.assert_equal(hic.getMatrix(), dense_input)
    nt.assert_equal(hic.orig_bin_ids, [])

    # without masked bins the call must be a no-op
    hic.restoreMaskedBins()
    nt.assert_equal(hic.getMatrix(), dense_input)
    nt.assert_equal(hic.orig_bin_ids, [])

    # mask the first two bins: only the lower-right 3x3 block remains
    hic.maskBins([0, 1])
    remaining_block = np.array([[0, 0, 2],
                                [0, 0, 1],
                                [0, 0, 0]])
    nt.assert_equal(hic.getMatrix(), remaining_block)
    nt.assert_equal(sorted(hic.orig_bin_ids), sorted([0, 1, 2, 3, 4]))

    # restoring brings the masked rows/columns back as NaN
    hic.restoreMaskedBins()
    expected_restored = np.full((5, 5), np.nan)
    expected_restored[2:, 2:] = remaining_block
    nt.assert_equal(hic.getMatrix(), expected_restored)
    nt.assert_equal(hic.orig_bin_ids, [])

    # --- scenario 2: symmetric matrix, mask one inner bin ------------------
    _upper_rows, _upper_cols = np.triu_indices(5)
    intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                 ('a', 30, 40, 1), ('b', 40, 50, 1)]
    hic = hm.hiCMatrix()
    hic.nan_bins = []

    upper = np.array([[0, 10, 5, 3, 0],
                      [0, 0, 15, 5, 1],
                      [0, 0, 0, 7, 3],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]], dtype=np.int32)
    # make the matrix symmetric
    hic.matrix = csr_matrix(upper + upper.T)
    hic.setMatrix(csr_matrix(upper + upper.T), intervals)

    # masking bin 3 removes its row, column and interval
    hic.maskBins([3])
    expected_masked = np.array([[0, 10, 5, 0],
                                [10, 0, 15, 1],
                                [5, 15, 0, 3],
                                [0, 1, 3, 0]], dtype=np.int32)
    nt.assert_equal(hic.matrix.todense(), expected_masked)
    expected_masked_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                                 ('a', 20, 30, 1), ('b', 40, 50, 1)]
    nt.assert_equal(hic.cut_intervals, expected_masked_intervals)

    # restoring re-inserts bin 3 (as zeros in the sparse matrix) and its interval
    hic.restoreMaskedBins()
    expected_dense = np.array([[0., 10., 5., 0., 0.],
                               [10., 0., 15., 0., 1.],
                               [5., 15., 0., 0., 3.],
                               [0., 0., 0., 0., 0.],
                               [0., 1., 3., 0., 0.]])
    nt.assert_equal(hic.matrix.todense(), expected_dense)
    expected_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
                          ('a', 20, 30, 1), ('a', 30, 40, 1),
                          ('b', 40, 50, 1)]
    nt.assert_equal(hic.cut_intervals, expected_intervals)
def set_properties_defaults(self):
    """Load the Hi-C matrix of this track and prepare it for plotting.

    Steps, in order:

    * open ``self.properties['file']``, restricted to
      ``self.properties['region']`` (extended by ``depth`` on both sides
      when a concrete 3-element region is given);
    * mask the NaN bins unless ``show_masked_bins`` is set;
    * check that the requested ``transform`` can be applied to the values;
    * enlarge the bins so the intervals are contiguous, keep only the upper
      triangle up to ``2 * depth`` (in bins) and fill an empty main diagonal
      with the matrix maximum;
    * set up ``self.norm`` and ``self.cmap``.

    Raises
    ------
    Exception
        If the loaded matrix is empty or contains values that cannot be
        transformed with the requested ``log1p`` / ``log`` / ``-log``.
        Errors from opening the matrix are propagated when no reduced
        fallback region is available.
    """
    super(HiCMatrixTrack, self).set_properties_defaults()

    region = None
    # region without the ``+ depth`` extension at its end; only available
    # when a concrete (chrom, start, end) region was given
    fallback_region = None
    view = self.properties['region']
    if view is not None:
        if view[2] == 1e15:
            # end value 1e15 is treated as "whole chromosome"
            region = [str(view[0])]
        elif len(view) == 3:
            start = int(view[1]) - self.properties['depth']
            if start < 0:
                start = 0
            end = int(view[2]) + self.properties['depth']
            region = [str(view[0]) + ':' + str(start) + '-' + str(end)]
            fallback_region = [
                str(view[0]) + ':' + str(start) + '-' + str(view[2])
            ]

    # try to open with end region + depth to avoid triangle effect in the plot
    # if it fails open it with given end region.
    try:
        self.hic_ma = HiCMatrix.hiCMatrix(self.properties['file'],
                                          pChrnameList=region)
    except Exception:
        if fallback_region is None:
            # Nothing to retry with. Re-raise the real error instead of
            # failing on the undefined ``start`` / missing region as the
            # original handler did.
            raise
        self.hic_ma = HiCMatrix.hiCMatrix(self.properties['file'],
                                          pChrnameList=fallback_region)

    if len(self.hic_ma.matrix.data) == 0:
        raise Exception("Matrix {} is empty".format(
            self.properties['file']))

    if not self.properties['show_masked_bins']:
        self.hic_ma.maskBins(self.hic_ma.nan_bins)

    # check that the matrix can be log transformed
    if self.properties['transform'] != 'no':
        if self.properties['transform'] == 'log1p':
            if self.hic_ma.matrix.data.min() + 1 <= 0:
                raise Exception(
                    "\n*ERROR*\nMatrix contains values below - 1.\n"
                    "log1p transformation can not be applied to \n"
                    "values in matrix: {}".format(self.properties['file']))
        elif self.properties['transform'] in ['-log', 'log']:
            if self.hic_ma.matrix.data.min() < 0:
                # For values not filled or equal to zero there will be a
                # mask, they will be replaced by the minimum value after 0.
                raise Exception(
                    "\n*ERROR*\nMatrix contains negative values.\n"
                    "log transformation can not be applied to \n"
                    "values in matrix: {}".format(self.properties['file']))

    new_intervals = hicmatrix.utilities.enlarge_bins(
        self.hic_ma.cut_intervals)
    self.hic_ma.interval_trees, self.hic_ma.chrBinBoundaries = \
        self.hic_ma.intervalListToIntervalTree(new_intervals)
    self.hic_ma.cut_intervals = new_intervals

    binsize = self.hic_ma.getBinSize()
    max_depth_in_bins = int(self.properties['depth'] / binsize)

    # Work only with the upper triangle and drop all pixels that are
    # 2 * max_depth_in_bins or more diagonals away from the main diagonal;
    # they are not required for the plot. This is done by subtracting a
    # second upper-triangular matrix that starts at diagonal ``limit``.
    limit = 2 * max_depth_in_bins
    self.hic_ma.matrix = scipy.sparse.triu(self.hic_ma.matrix, k=0, format='csr') - \
        scipy.sparse.triu(self.hic_ma.matrix, k=limit, format='csr')
    self.hic_ma.matrix.eliminate_zeros()

    # fill the main diagonal, otherwise it looks
    # not so good. The main diagonal is filled
    # with an array containing the max value found
    # in the matrix
    if sum(self.hic_ma.matrix.diagonal()) == 0:
        self.log.info(
            "Filling main diagonal with max value because it empty and looks bad...\n"
        )
        max_value = self.hic_ma.matrix.data.max()
        main_diagonal = scipy.sparse.dia_matrix(
            ([max_value] * self.hic_ma.matrix.shape[0], [0]),
            shape=self.hic_ma.matrix.shape)
        self.hic_ma.matrix = self.hic_ma.matrix + main_diagonal

    self.norm = None
    self.process_color('colormap',
                       colormap_possible=True,
                       colormap_only=True,
                       default_value_is_colormap=True)
    self.cmap = cm.get_cmap(self.properties['colormap'])
    self.cmap.set_bad('black')
def main(args=None):
    """Convert Hi-C matrices between file formats.

    * ``hic`` -> ``cool`` is delegated to ``hic2cool_convert``; with
      ``args.resolutions`` one output file per resolution is written, named
      ``<root>_<resolution><extension>``.
    * ``hicpro`` / ``homer`` / ``h5`` / ``cool`` inputs are loaded through
      ``MatrixFileHandler`` and written as ``cool``, ``h5``, ``homer``,
      ``ginteractions`` or ``mcool``. For ``mcool`` all resolutions are
      stored in ``args.outFileName[0]`` under ``::/resolutions/<bin size>``.

    Exits with status 1 if the number of input and output files differs
    (except for ``hic`` input or ``mcool`` output) or if HiC-Pro matrices and
    bed files do not pair up.
    """
    import os  # local import: only needed to build per-resolution file names

    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!'
            )
            sys.exit(1)

    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:
                # insert the resolution in front of the file extension;
                # splitext also copes with names that have no extension
                root, extension = os.path.splitext(args.outFileName[i])
                for resolution in args.resolutions:
                    out_name = root + '_' + str(resolution) + extension
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool']:
        format_was_h5 = args.inputFormat == 'h5'
        applyCorrection = not args.store_applied_correction

        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
            else:
                correction_operator = '/' if args.correction_division else None
                chromosomes_to_load = [args.chromosome] if args.chromosome else None
                applyCorrectionCoolerLoad = not args.load_raw_values
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer,
                    pApplyCorrectionCoolerLoad=applyCorrectionCoolerLoad)

            _matrix, cut_intervals, nan_bins, \
                distance_counts, correction_factors = matrixFileHandlerInput.load()

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                if args.outputFormat in ['homer', 'ginteractions']:
                    # make it a upper triangular matrix in case it is not already
                    _matrix = triu(_matrix)
                    # make it a full symmetrical matrix
                    _matrix = _matrix.maximum(_matrix.T)
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat,
                    pEnforceInteger=args.enforce_integer,
                    pFileWasH5=format_was_h5)

                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                matrixFileHandlerOutput.save(args.outFileName[i],
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)
            elif args.outputFormat == 'mcool':
                log.debug('outformat is mcool')
                if args.resolutions and len(args.matrices) > 1:
                    # NOTE(review): this is only logged, processing continues
                    # as in the original - confirm whether it should abort.
                    log.error(
                        'Please define one matrix and many resolutions which should be created or multiple matrices.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution.'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)

                    bin_size = hic_matrix.getBinSize()

                    for j, resolution in enumerate(args.resolutions):
                        # work on a copy so every resolution starts from the
                        # matrix at its original bin size
                        hic_matrix_res = deepcopy(hic_matrix)

                        _mergeFactor = int(resolution) // bin_size

                        log.debug('bin size {}'.format(bin_size))
                        log.debug('_mergeFactor {}'.format(_mergeFactor))
                        if int(resolution) != bin_size:
                            merged_matrix = hicMergeMatrixBins.merge_bins(
                                hic_matrix_res, _mergeFactor)
                        else:
                            merged_matrix = hic_matrix_res

                        # the first resolution creates the file, the others append
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer,
                            pAppend=j > 0,
                            pFileWasH5=format_was_h5)

                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '::/resolutions/' +
                            str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)
                else:
                    # one input matrix per resolution: the first matrix
                    # creates the file, the others append
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool',
                        pAppend=i > 0,
                        pFileWasH5=format_was_h5)
                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
def computeInterIntraTADs(pMatrix, pDomainList, pCoolOrH5, pThreadId, pQueue):
    """Compute intra-TAD and inter-TAD contact statistics for a list of domains.

    For every processed domain the contact sum, the number of matrix cells,
    the number of stored non-zero cells (``nnz``) and the density
    (``nnz / cells``) are computed for the domain itself (intra) and for the
    sub-matrices between the domain and its left / right neighbour (inter).

    Parameters
    ----------
    pMatrix
        If ``pCoolOrH5`` is true: a matrix file name that is passed to
        ``hm.hiCMatrix`` together with a ``chr:start-end`` region.
        Otherwise: an already loaded matrix object offering ``matrix`` and
        ``getRegionBinRange``.
    pDomainList
        Sequence of domain rows; index 0 is used as chromosome, index 1 as
        start and index 2 as end.
    pCoolOrH5
        Selects region-wise loading (true) or slicing of the loaded matrix
        (false).
    pThreadId
        Controls which elements of ``pDomainList`` are skipped:
        ``None`` skips the last element, ``True`` skips the first and the
        last element, ``False`` skips the first element. Any other value
        skips nothing.
    pQueue
        Object with a ``put`` method that receives the result.

    Result (via ``pQueue.put``)
    ---------------------------
    On success a list of 16 lists (15 statistics followed by the processed
    rows); on any exception a single string starting with ``'Fail: '`` that
    contains the exception text and the traceback.
    """
    try:
        inter_left_sum_list = []
        inter_right_sum_list = []
        inter_left_densit_list = []
        inter_right_density_list = []
        inter_left_number_of_contacts_list = []
        inter_right_number_of_contacts_list = []
        inter_left_number_of_contacts_nnz_list = []
        inter_right_number_of_contacts_nzz_list = []
        intra_sum_list = []
        intra_number_of_contacts_list = []
        intra_number_of_contacts_nnz_list = []
        intra_density_list = []
        inter_left_intra_ratio_list = []
        inter_right_intra_ratio_list = []
        inter_left_inter_right_intra_ratio_list = []
        rows = []
        chromosome_list = pDomainList
        for i, row in enumerate(chromosome_list):
            # Skip the elements that only serve as neighbour context for
            # this chunk (see pThreadId in the docstring).
            if pThreadId is None:
                log.debug('first thread')
                if i == len(chromosome_list) - 1:
                    continue
            elif pThreadId == True:
                log.debug('middle thread')
                if i == 0 or i == len(chromosome_list) - 1:
                    log.debug('i: {}'.format(i))
                    log.debug('len(chromosome_list): {}'.format(
                        len(chromosome_list)))
                    continue
            elif pThreadId == False:
                log.debug('last thread')
                if i == 0:
                    continue

            # Region spanning left neighbour start .. right neighbour end;
            # falls back to the domain's own coordinates at the list edges.
            if i - 1 >= 0:
                chromosom = chromosome_list[i - 1][0]
                start = chromosome_list[i - 1][1]
            else:
                chromosom = chromosome_list[i][0]
                start = chromosome_list[i][1]
            if i + 1 < len(chromosome_list):
                end = chromosome_list[i + 1][2]
            else:
                end = chromosome_list[i][2]
            # midpos = row[1] + ((row[2] - row[1]) / 2)

            if pCoolOrH5:
                # get intra-TAD data: load only the domain region
                hic_matrix = hm.hiCMatrix(pMatrixFile=pMatrix, pChrnameList=[
                    str(row[0]) + ':' + str(row[1]) + '-' + str(row[2])
                ])
                matrix = hic_matrix.matrix
                # load the region including both neighbours
                hic_matrix_inter_tad = hm.hiCMatrix(
                    pMatrixFile=pMatrix,
                    pChrnameList=[
                        str(chromosom) + ':' + str(start) + '-' + str(end)
                    ])
                matrix_inter_tad = hic_matrix_inter_tad.matrix
            else:
                # pMatrix is already a loaded matrix object: slice the
                # domain out of it and use the full matrix for inter-TAD data
                hic_matrix = pMatrix
                hic_matrix_inter_tad = pMatrix
                indices = hic_matrix.getRegionBinRange(str(row[0]), row[1], row[2])
                matrix = hic_matrix.matrix[indices[0]:indices[1], indices[0]:indices[1]]
                matrix_inter_tad = pMatrix.matrix
            # matrix = matrix.flatten()

            # bin index of the domain start (border to the left neighbour)
            left_boundary_index = hic_matrix_inter_tad.getRegionBinRange(
                str(chromosom), row[1], row[1])[0]
            if pCoolOrH5:
                # region-wise loaded matrix: use its first bin and -1 as
                # outer limits
                outer_left_boundary_index = 0
                outer_right_boundary_index = -1
            else:
                outer_left_boundary_index = hic_matrix_inter_tad.getRegionBinRange(
                    str(chromosom), start, end)[0]
                outer_right_boundary_index = hic_matrix_inter_tad.getRegionBinRange(
                    str(chromosom), start, end)[1]

            # bin index of the domain end (border to the right neighbour);
            # both branches compute the same expression.
            # NOTE(review): when there is no right neighbour,
            # right_boundary_index is not assigned in this iteration, so the
            # "inter left only" slice below would use the value left over
            # from an earlier iteration (or fail if none exists) - confirm.
            if i + 1 < len(chromosome_list) and not pCoolOrH5:
                # get index position right tad with tad
                right_boundary_index = hic_matrix_inter_tad.getRegionBinRange(
                    str(chromosom), row[2], row[2])[0]
            elif i + 1 < len(chromosome_list):
                right_boundary_index = hic_matrix_inter_tad.getRegionBinRange(
                    str(chromosom), row[2], row[2])[0]

            if i - 1 >= 0 and i + 1 < len(chromosome_list):
                intertad_left = matrix_inter_tad[
                    outer_left_boundary_index:left_boundary_index,
                    left_boundary_index:right_boundary_index]
                intertad_right = matrix_inter_tad[
                    left_boundary_index:right_boundary_index,
                    right_boundary_index:outer_right_boundary_index]
            elif i - 1 < 0 and i + 1 < len(chromosome_list):
                intertad_right = matrix_inter_tad[
                    left_boundary_index:right_boundary_index,
                    right_boundary_index:outer_right_boundary_index]
            # NOTE(review): this tests `i - 1 > 0` while the first branch
            # tests `i - 1 >= 0`; for i == 1 as the last element no branch
            # matches - confirm whether that is intended.
            elif i - 1 > 0 and i + 1 >= len(chromosome_list):
                intertad_left = matrix_inter_tad[
                    outer_left_boundary_index:left_boundary_index,
                    left_boundary_index:right_boundary_index]

            # defaults used when a neighbour is not available
            inter_left_sum = 0
            inter_right_sum = 0
            inter_left_density = 0
            inter_right_density = 0
            inter_left_number_of_contacts = 0
            inter_right_number_of_contacts = 0
            inter_left_number_of_contacts_nnz = 0
            inter_right_number_of_contacts_nzz = 0

            intra_sum = matrix.sum()
            intra_number_of_contacts = matrix.shape[0] * matrix.shape[1]
            intra_number_of_contacts_nnz = matrix.nnz
            # NOTE(review): plain divisions here and below; a sub-matrix with
            # zero cells makes the denominator 0 - confirm inputs exclude it.
            intra_density = intra_number_of_contacts_nnz / intra_number_of_contacts

            # both inter, left and right is available
            if i - 1 >= 0 and i + 1 < len(chromosome_list):
                # intertad_left = intertad_left.flatten()
                # intertad_right = intertad_right.flatten()
                inter_left_sum = intertad_left.sum()
                inter_right_sum = intertad_right.sum()
                inter_left_number_of_contacts = intertad_left.shape[
                    0] * intertad_left.shape[1]
                inter_right_number_of_contacts = intertad_right.shape[
                    0] * intertad_right.shape[1]
                inter_left_number_of_contacts_nnz = intertad_left.nnz
                inter_right_number_of_contacts_nzz = intertad_right.nnz
                inter_left_density = inter_left_number_of_contacts_nnz / inter_left_number_of_contacts
                inter_right_density = inter_right_number_of_contacts_nzz / inter_right_number_of_contacts
                # statistic_left, significance_level_left = ranksums(intertad_left, intertad_left_control)
                # statistic_right, significance_level_right = ranksums(intertad_right, intertad_right_control)
            elif i - 1 < 0 and i + 1 < len(chromosome_list):
                # inter right is available
                # intertad_right = intertad_right.flatten()
                inter_right_sum = intertad_right.sum()
                inter_right_number_of_contacts = intertad_right.shape[
                    0] * intertad_right.shape[1]
                inter_right_number_of_contacts_nzz = intertad_right.nnz
                inter_right_density = inter_right_number_of_contacts_nzz / inter_right_number_of_contacts
                # statistic_right, significance_level_right = ranksums(intertad_right, intertad_right_control)
            elif i - 1 > 0 and i + 1 >= len(chromosome_list):
                # inter left is available
                # intertad_left = intertad_left.flatten()
                inter_left_sum = intertad_left.sum()
                inter_left_number_of_contacts = intertad_left.shape[
                    0] * intertad_left.shape[1]
                inter_left_number_of_contacts_nnz = intertad_left.nnz
                inter_left_density = inter_left_number_of_contacts_nnz / inter_left_number_of_contacts
                # statistic_left, significance_level_left = ranksums(intertad_left, intertad_left_control)

            # ratios of inter-TAD to intra-TAD contact sums
            inter_left_intra_ratio = inter_left_sum / intra_sum
            inter_right_intra_ratio = inter_right_sum / intra_sum
            inter_left_inter_right_intra_ratio = (inter_left_sum + inter_right_sum) / intra_sum

            inter_left_sum_list.append(inter_left_sum)
            inter_right_sum_list.append(inter_right_sum)
            inter_left_densit_list.append(inter_left_density)
            inter_right_density_list.append(inter_right_density)
            inter_left_number_of_contacts_list.append(
                inter_left_number_of_contacts)
            inter_right_number_of_contacts_list.append(
                inter_right_number_of_contacts)
            inter_left_number_of_contacts_nnz_list.append(
                inter_left_number_of_contacts_nnz)
            inter_right_number_of_contacts_nzz_list.append(
                inter_right_number_of_contacts_nzz)
            intra_sum_list.append(intra_sum)
            intra_number_of_contacts_list.append(intra_number_of_contacts)
            intra_number_of_contacts_nnz_list.append(
                intra_number_of_contacts_nnz)
            intra_density_list.append(intra_density)
            inter_left_intra_ratio_list.append(inter_left_intra_ratio)
            inter_right_intra_ratio_list.append(inter_right_intra_ratio)
            inter_left_inter_right_intra_ratio_list.append(
                inter_left_inter_right_intra_ratio)
            rows.append(row)
    except Exception as exp:
        # report the failure to the consumer of the queue instead of raising
        pQueue.put('Fail: ' + str(exp) + traceback.format_exc())
        return
    pQueue.put([
        inter_left_sum_list, inter_right_sum_list, inter_left_densit_list,
        inter_right_density_list, inter_left_number_of_contacts_list,
        inter_right_number_of_contacts_list,
        inter_left_number_of_contacts_nnz_list,
        inter_right_number_of_contacts_nzz_list, intra_sum_list,
        intra_number_of_contacts_list, intra_number_of_contacts_nnz_list,
        intra_density_list, inter_left_intra_ratio_list,
        inter_right_intra_ratio_list, inter_left_inter_right_intra_ratio_list,
        rows
    ])
def test_build_matrix_cooler_multiple():
    """Build a multi-resolution cool file and compare it with the reference.

    The matrix is built at 5000, 10000 and 20000 bp. For every resolution the
    stored matrix values, the number of bins and the first three columns of
    each bin interval must equal the reference file; the QC output must match
    the reference QC folder as well.
    """
    outfile = NamedTemporaryFile(suffix='.cool', delete=False)
    outfile.close()
    qc_folder = mkdtemp(prefix="testQC_")
    args = "-s {} {} --outFileName {} -bs 5000 10000 20000 -b /tmp/test.bam --QCfolder {} --threads 4".format(
        sam_R1, sam_R2, outfile.name, qc_folder).split()
    hicBuildMatrix.main(args)

    reference_uris = [
        ROOT + "hicBuildMatrix/multi_small_test_matrix.cool::/resolutions/5000",
        ROOT + "hicBuildMatrix/multi_small_test_matrix.cool::/resolutions/10000",
        ROOT + "hicBuildMatrix/multi_small_test_matrix.cool::/resolutions/20000",
    ]
    produced_uris = [
        outfile.name + '::/resolutions/5000',
        outfile.name + '::/resolutions/10000',
        outfile.name + '::/resolutions/20000',
    ]
    # load all references first, then all freshly built matrices
    references = [hm.hiCMatrix(uri) for uri in reference_uris]
    produced = [hm.hiCMatrix(uri) for uri in produced_uris]

    # stored matrix values
    for reference, candidate in zip(references, produced):
        nt.assert_equal(reference.matrix.data, candidate.matrix.data)

    # number of bins
    for reference, candidate in zip(references, produced):
        nt.assert_equal(len(candidate.cut_intervals),
                        len(reference.cut_intervals))

    # only the first three columns of each interval are compared
    for reference, candidate in zip(references, produced):
        candidate_coordinates = [
            interval[:3] for interval in candidate.cut_intervals
        ]
        reference_coordinates = [
            interval[:3] for interval in reference.cut_intervals
        ]
        nt.assert_equal(candidate_coordinates, reference_coordinates)

    # print(set(os.listdir(ROOT + "QC/")))
    assert are_files_equal(ROOT + "QC/QC.log", qc_folder + "/QC.log")
    assert set(os.listdir(ROOT + "QC/")) == set(os.listdir(qc_folder))

    os.unlink(outfile.name)
    shutil.rmtree(qc_folder)
def adjustMatrix(pArgs):
    """Load a Hi-C matrix and keep, remove or mask chromosomes or regions.

    Parameters
    ----------
    pArgs
        Argument namespace. The attributes read are ``matrix`` (file name),
        ``chromosomes`` (list of names or None), ``regions`` (path of a
        tab-separated file with chromosome, start, end or None), ``action``
        (``'keep'``, ``'remove'`` or ``'mask'``) and ``maskBadRegions``.

    Returns
    -------
    The adjusted matrix object, or ``None`` if neither chromosomes, regions
    nor ``maskBadRegions`` were given.

    The process exits with status 1 if both ``chromosomes`` and ``regions``
    are given, or if none of the requested chromosomes / regions is part of
    the matrix.
    """
    if pArgs.chromosomes is not None and pArgs.regions is not None:
        log.error('Please specify either --chromosomes or --regions.')
        exit(1)
    hic_matrix = None
    if pArgs.chromosomes:
        if check_cooler(pArgs.matrix) and len(pArgs.chromosomes) == 1 and pArgs.action == 'keep':
            # a single chromosome of a cooler file can be loaded directly
            chromosomes_list = cooler.Cooler(pArgs.matrix).chromnames
            if pArgs.chromosomes[0] in chromosomes_list:
                hic_matrix = hm.hiCMatrix(pArgs.matrix, pChrnameList=pArgs.chromosomes)
            else:
                log.error('Chromosome not available in matrix: {} {}'.format(pArgs.matrix, pArgs.chromosomes[0]))
                exit(1)
        else:
            hic_matrix = hm.hiCMatrix(pArgs.matrix)
            chromosomes_list = list(hic_matrix.chrBinBoundaries)
            chromosomes_list_to_operate_on = []
            for chromosome in pArgs.chromosomes:
                if chromosome in chromosomes_list:
                    chromosomes_list_to_operate_on.append(chromosome)
                else:
                    log.warning('Chromosome not available in matrix: {} {}'.format(pArgs.matrix, chromosome))
            if len(chromosomes_list_to_operate_on) == 0:
                log.error('No valid chromosome given: {}. Available: {}'.format(pArgs.chromosomes, chromosomes_list))
                exit(1)
            if pArgs.action == 'keep':
                hic_matrix.reorderChromosomes(chromosomes_list_to_operate_on)
            elif pArgs.action == 'remove':
                # Build a new list instead of removing from the list that is
                # being iterated: in-place removal skips the element that
                # follows each removed one.
                remaining_chromosomes = [
                    chromosome for chromosome in chromosomes_list
                    if chromosome not in chromosomes_list_to_operate_on
                ]
                hic_matrix.reorderChromosomes(remaining_chromosomes)
            elif pArgs.action == 'mask':
                hic_matrix.maskChromosomes(chromosomes_list_to_operate_on)
    elif pArgs.regions:
        hic_matrix = hm.hiCMatrix(pArgs.matrix)
        chromosomes_list = list(hic_matrix.chrBinBoundaries)
        genomic_regions = []
        with open(pArgs.regions, 'r') as file:
            for line in file:
                _line = line.strip().split('\t')
                # count columns, not characters of the raw line
                if len(_line) < 3:
                    log.warning("An entry shorter than 3 columns has been found!")
                    continue
                chrom, start, end = _line[0], int(_line[1]), int(_line[2])
                if chrom in chromosomes_list:
                    genomic_regions.append((chrom, start, end))
                else:
                    log.warning('Chromosome not available in matrix, ignoring regions: {} {}'.format(pArgs.matrix, chrom))
        if len(genomic_regions) == 0:
            log.error('No valid chromosome given. Available: {}'.format(chromosomes_list))
            exit(1)

        # translate genomic regions to matrix bin indices
        matrix_indices_regions = []
        for region in genomic_regions:
            _regionBinRange = hic_matrix.getRegionBinRange(region[0], region[1], region[2])
            if _regionBinRange is not None:
                start, end = _regionBinRange
                matrix_indices_regions.extend(range(start, end))

        if pArgs.action == 'keep':
            # zero every stored value whose row is outside the regions
            instances, _ = hic_matrix.matrix.nonzero()
            mask = np.logical_not(np.isin(instances, matrix_indices_regions))
            hic_matrix.matrix.data[mask] = 0
            hic_matrix.matrix.eliminate_zeros()
        elif pArgs.action == 'mask':
            hic_matrix.maskBins(matrix_indices_regions)
        elif pArgs.action == 'remove':
            number_of_bins = max(hic_matrix.matrix.shape[0], hic_matrix.matrix.shape[1])
            keep_bin = np.ones(number_of_bins, dtype=bool)
            # dtype=int keeps an empty index list usable as an index array
            keep_bin[np.array(matrix_indices_regions, dtype=int)] = False
            hic_matrix.reorderBins(np.flatnonzero(keep_bin))
    elif pArgs.maskBadRegions:
        # This branch is only reached when no chromosomes were given, so the
        # single-chromosome shortcut cannot apply (and calling len() on the
        # missing list would fail); the full matrix is loaded.
        hic_matrix = hm.hiCMatrix(pArgs.matrix)
    else:
        log.info('No data to adjust given. Please specify either --chromosomes or --region parameter.')
    return hic_matrix
def main(args=None):
    """Compute inter/intra-TAD contact statistics and write table and plot.

    The domains file is read and split into runs of consecutive rows that
    share a chromosome. Every run is distributed over worker processes
    running ``computeInterIntraTADs``; neighbouring chunks overlap by one
    domain so that each worker knows the neighbours of its border domains.
    The collected statistics are written tab-separated to
    ``args.outFileName`` and the left/right inter-intra ratios are plotted to
    ``args.outFileNameRatioPlot``.

    The process exits with status 1 if a worker reports a failure.

    Parameters
    ----------
    args
        Command line arguments (list of strings) or None to use ``sys.argv``.
    """
    args = parse_arguments().parse_args(args)
    mpl.rcParams['pdf.fonttype'] = 42

    # read domains file
    domains_df = readDomainBoundaries(args.tadDomains)
    domains = domains_df.values.tolist()

    # group consecutive domains that share a chromosome
    tads_per_chromosome = []
    for domain in domains:
        if tads_per_chromosome and tads_per_chromosome[-1][0][0] == domain[0]:
            tads_per_chromosome[-1].append(domain)
        else:
            tads_per_chromosome.append([domain])

    # read full h5 or only region if cooler
    is_cooler = check_cooler(args.matrix)
    if not is_cooler:
        hic_matrix = hm.hiCMatrix(args.matrix)
    else:
        hic_matrix = args.matrix

    # A worker returns 16 lists: 15 statistics followed by the domain rows.
    number_of_result_lists = 16
    collected = [[] for _ in range(number_of_result_lists)]

    requested_threads = args.threads
    for chromosome in tads_per_chromosome:
        # Always derive the chunk size from the requested thread count and
        # keep all per-chromosome state local, so a chromosome handled by a
        # single worker cannot influence the chromosomes after it.
        domainsPerThread = len(chromosome) // requested_threads
        if domainsPerThread == 0:
            # fewer domains than threads: one worker handles all of them
            domainsPerThread = 1
            threads = 1
        else:
            threads = requested_threads

        queue = [None] * threads
        process = [None] * threads
        thread_done = [False] * threads
        thread_results = [None] * threads

        # None --> first thread, process first element in list, ignore last one
        # True --> middle thread: ignore first and last element in tad processing
        # False --> last thread: ignore first element, process last one
        # '' --> single thread: process every element
        for i in range(threads):
            if threads == 1:
                domainListThread = chromosome
                thread_id = ''
            elif i == 0:
                domainListThread = chromosome[:domainsPerThread + 1]
                thread_id = None
            elif i < threads - 1:
                domainListThread = chromosome[(i * domainsPerThread) - 1:((i + 1) * domainsPerThread) + 1]
                thread_id = True
            else:
                domainListThread = chromosome[(i * domainsPerThread) - 1:]
                thread_id = False

            queue[i] = Queue()
            process[i] = Process(
                target=computeInterIntraTADs,
                kwargs=dict(
                    pMatrix=hic_matrix,
                    pDomainList=domainListThread,
                    pCoolOrH5=is_cooler,
                    pThreadId=thread_id,
                    pQueue=queue[i]))
            process[i].start()

        fail_flag = False
        fail_message = ''
        all_data_collected = False
        while not all_data_collected:
            for i in range(threads):
                if queue[i] is not None and not queue[i].empty():
                    queue_data = queue[i].get()
                    if 'Fail:' in queue_data:
                        fail_flag = True
                        fail_message = queue_data
                    else:
                        thread_results[i] = queue_data
                    queue[i] = None
                    process[i].join()
                    process[i].terminate()
                    process[i] = None
                    thread_done[i] = True
            all_data_collected = all(thread_done)
            if not all_data_collected:
                time.sleep(1)

        if fail_flag:
            # strip the leading 'Fail: ' marker
            log.error(fail_message[6:])
            exit(1)

        # append the results of this chromosome in thread order
        for thread_result in thread_results:
            for target, values in zip(collected, thread_result):
                target.extend(values)

    statistics = collected[:-1]
    rows = collected[-1]

    with open(args.outFileName, 'w') as file:
        header = '# Created with HiCExplorer\'s hicInterIntraTAD version ' + __version__ + '\n'
        header += '# Chromosome\tstart\tend\tname\tscore\tstrand\tinter_left_sum\tinter_right_sum\tinter_left_density\tinter_right_density\tinter_left_number_of_contacts\tinter_right_number_of_contacts\t' \
            'inter_left_number_of_contacts_nnz\tinter_right_number_of_contacts_nnz\tintra_sum\tintra_number_of_contacts\tintra_number_of_contacts_nnz\tintra_density\tinter_left_intra_ratio\tinter_right_intra_ratio\tinter_left_inter_right_intra_ratio\n'
        file.write(header)
        for i, row in enumerate(rows):
            file.write('\t'.join(map(str, row)))
            for column in statistics:
                file.write('\t{}'.format(column[i]))
            file.write('\n')

    # statistics[12] / [13]: inter-left/intra and inter-right/intra ratios
    inter_left_intra_ratio_list = statistics[12]
    inter_right_intra_ratio_list = statistics[13]
    plt.scatter(inter_left_intra_ratio_list,
                inter_right_intra_ratio_list,
                s=20,
                alpha=0.7)
    plt.xlabel('Inter-left/intra TAD contact ratio', fontsize=args.fontsize)
    plt.ylabel('Inter-right/intra TAD contact ratio', fontsize=args.fontsize)
    plt.tight_layout()
    plt.savefig(args.outFileNameRatioPlot, dpi=args.dpi)
    plt.close()
def main(args=None):
    """Average the Hi-C sub-matrices around a list of reference points.

    Every line of ``args.regions`` (tab separated: chromosome, start and
    optionally end plus further BED columns) defines a viewpoint. The bin
    range around it is determined either from ``args.range`` (base pairs) or
    ``args.rangeInBins``; the sub-matrices of all viewpoints whose shape
    matches the target shape are summed, divided by the per-cell count and
    saved as a sparse matrix to ``args.outFileName``.

    The process exits with status 1 if strand direction should be considered
    but a line has fewer than six columns.

    Parameters
    ----------
    args
        Command line arguments (list of strings) or None to use ``sys.argv``.
    """
    args = parse_arguments().parse_args(args)
    hic_ma = hm.hiCMatrix(pMatrixFile=args.matrix)
    indices_values = []

    with open(args.regions, 'r') as file:
        for line in file:
            # The raw line still contains its newline, so test the stripped
            # text to detect blank lines.
            if not line.strip():
                continue
            _line = line.strip().split('\t')
            if len(_line) < 2:
                # No position available; skipping avoids re-using the
                # viewpoint of the previous line.
                log.warning('An entry shorter than 2 columns has been found. It is ignored.')
                continue
            if args.considerStrandDirection and len(_line) < 6:
                log.error(
                    'Strand orientation should be considered but file does not contain the 6th column of the bed file containing this information. Exiting!'
                )
                exit(1)
            if len(_line) == 2:
                chrom, start = _line[0], _line[1]
                viewpoint = (chrom, start, start)
            else:
                chrom, start, end = _line[0], _line[1], _line[2]
                viewpoint = (chrom, start, end)

            if args.range:
                start_range_genomic, end_range_genomic, start_out, end_out = calculateViewpointRange(
                    hic_ma, viewpoint, args.range, args.coordinatesToBinMapping)
                start_bin, end_bin = getBinIndices(
                    hic_ma, (chrom, start_range_genomic, end_range_genomic))
            else:
                start_bin, end_bin, start_out, end_out = calculateViewpointRangeBins(
                    hic_ma, viewpoint, args.rangeInBins,
                    args.coordinatesToBinMapping)

            strand = _line[5] if args.considerStrandDirection else None
            indices_values.append(
                [start_bin, end_bin, start_out, end_out, strand])

    if args.range:
        dimensions_new_matrix = (args.range[0] // hic_ma.getBinSize()) + (
            args.range[1] // hic_ma.getBinSize())
    elif args.rangeInBins:
        dimensions_new_matrix = args.rangeInBins[0] + args.rangeInBins[1]

    summed_matrix = lil_matrix((dimensions_new_matrix, dimensions_new_matrix),
                               dtype=np.float32)
    count_matrix = np.zeros(shape=(dimensions_new_matrix,
                                   dimensions_new_matrix))

    for start, end, start_out, end_out, orientation in indices_values:
        _start = 0
        _end = summed_matrix.shape[1]
        orig_matrix_length = end - start
        if start_out:
            _start = _end - orig_matrix_length
        if end_out:
            # NOTE(review): `start` is a bin index of the input matrix, not
            # of the averaged matrix - confirm this offset is intended.
            _end = start + orig_matrix_length

        submatrix = hic_ma.matrix[start:end, start:end]
        if summed_matrix.shape != submatrix.shape:
            log.warning('Shape of a submatrix does not match. It is ignored.')
            log.warning('Region: {}'.format(hic_ma.getBinPos(start)))
            continue

        count_matrix[_start:_end, _start:_end] += 1
        if orientation is None or orientation == '+':
            summed_matrix[_start:_end, _start:_end] += submatrix
        elif orientation == '-':
            # regions on the reverse strand are added transposed
            summed_matrix[_start:_end, _start:_end] += submatrix.T

    summed_matrix /= count_matrix
    summed_matrix = np.array(summed_matrix)

    # rebuild a sparse matrix from the non-zero cells of the average
    row, col = np.nonzero(summed_matrix)[:2]
    data = summed_matrix[row, col]
    summed_matrix = csr_matrix(
        (data, (row, col)),
        shape=(dimensions_new_matrix, dimensions_new_matrix))
    save_npz(args.outFileName, summed_matrix)
def mergeLoops(pDataFrame, pLowestResolution, pTupleX, pTupleY):
    """Merge overlapping loop anchors and return the ids of surviving loops.

    Interval trees are built from ``pTupleX`` and ``pTupleY``. For every loop
    in ``pDataFrame`` the anchors overlapping its (extended) x and y range
    are looked up; of the loops present in both lookups only the one with the
    widest y interval is kept, the others are removed from both trees.

    Parameters
    ----------
    pDataFrame
        Data frame whose rows hold the x chromosome, start, end at indices
        0-2 and the y chromosome, start, end at indices 3-5.
    pLowestResolution
        Bin size used to extend the search range of smaller bins.
    pTupleX, pTupleY
        Interval lists passed to ``intervalListToIntervalTree``.

    Returns
    -------
    list
        Ids (third element of the tree intervals) that are still present in
        both the x and the y tree.
    """
    hic = hm.hiCMatrix()
    target_regions_intervaltree_x = hic.intervalListToIntervalTree(pTupleX)[0]
    target_regions_intervaltree_y = hic.intervalListToIntervalTree(pTupleY)[0]

    for loop in pDataFrame.values:
        chromosome_x, chromosome_y = loop[0], loop[3]
        # Without a tree for one of the anchors there is nothing to merge;
        # skipping also avoids working on the lookup of a previous loop.
        if chromosome_x not in target_regions_intervaltree_x:
            continue
        if chromosome_y not in target_regions_intervaltree_y:
            continue

        # neighborhood factor to extent the search range. This allows to
        # consider the smaller bin sizes like they would be bins of the
        # lowest resolution
        neighborhood_factor_x = int(pLowestResolution) - abs(
            int(loop[2]) - int(loop[1]))
        neighborhood_factor_y = int(pLowestResolution) - abs(
            int(loop[5]) - int(loop[4]))

        x_interval = target_regions_intervaltree_x[chromosome_x].overlap(
            loop[1] - neighborhood_factor_x - 1,
            loop[2] + neighborhood_factor_x + 1)
        # the y anchor is looked up on the y chromosome that was checked
        y_interval = target_regions_intervaltree_y[chromosome_y].overlap(
            loop[4] - neighborhood_factor_y - 1,
            loop[5] + neighborhood_factor_y + 1)

        if len(x_interval) <= 1 or len(y_interval) <= 1:
            continue

        ids_in_x = {data[2] for data in x_interval}
        list_of_interest = [data for data in y_interval if data[2] in ids_in_x]

        # keep the candidate with the widest interval
        max_index = 0
        max_distance = 0
        candidate_ids = set()
        for data in list_of_interest:
            if abs(data[0] - data[1]) > max_distance:
                max_distance = abs(data[0] - data[1])
                max_index = data[2]
            candidate_ids.add(data[2])

        for data in x_interval:
            if data[2] != max_index and data[2] in candidate_ids:
                target_regions_intervaltree_x[chromosome_x].remove(data)
        for data in y_interval:
            if data[2] != max_index and data[2] in candidate_ids:
                target_regions_intervaltree_y[chromosome_y].remove(data)

    result_list_index = []
    for chromosome_x, chromosome_y in zip(target_regions_intervaltree_x,
                                          target_regions_intervaltree_y):
        ids_y = {y[2] for y in target_regions_intervaltree_y[chromosome_y]}
        # dict.fromkeys keeps the sorted order and drops repeated ids
        ids_x = dict.fromkeys(
            x[2] for x in sorted(target_regions_intervaltree_x[chromosome_x]))
        result_list_index.extend(x for x in ids_x if x in ids_y)
    return result_list_index
def main(args=None):
    """Transform a Hi-C matrix chromosome by chromosome and save the result.

    Depending on ``args.method`` each intra-chromosomal sub-matrix is
    replaced by its normalised observed/expected Pearson matrix (``norm``),
    observed/expected matrix (``obs_exp``), Pearson matrix (``pearson``) or
    covariance matrix (``covariance``). ``all`` writes three files prefixed
    with ``obs_exp_``, ``pearson_`` and ``covariance_`` next to
    ``args.outFileName``.

    The process exits with status 1 if ``args.outFileName`` ends neither with
    ``.h5`` nor with ``.cool``.

    Parameters
    ----------
    args
        Command line arguments (list of strings) or None to use ``sys.argv``.
    """
    args = parse_arguments().parse_args(args)

    # Accept both supported suffixes; a tuple avoids the precedence trap of
    # `not a or b`, which rejected every '.cool' file.
    if not args.outFileName.endswith(('.h5', '.cool')):
        log.error('Output filetype not known.')
        log.error('It is: {}'.format(args.outFileName))
        log.error('Accepted is .h5 or .cool')
        exit(1)

    hic_ma = hm.hiCMatrix(pMatrixFile=args.matrix)
    log.info("hic_ma.matrix: {}".format(hic_ma.matrix))
    if args.chromosomes:
        hic_ma.keepOnlyTheseChr(args.chromosomes)

    # total number of bins over all chromosomes
    length_chromosome = 0
    chromosome_count = len(hic_ma.getChrNames())
    for chrname in hic_ma.getChrNames():
        chr_range = hic_ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]

    def _chromosome_submatrices():
        # Yield (bin slice, float sub-matrix) per chromosome. The result of
        # astype() is used; calling it without assignment has no effect.
        for chrname in hic_ma.getChrNames():
            chr_range = hic_ma.getChrBinRange(chrname)
            block = slice(chr_range[0], chr_range[1])
            yield block, hic_ma.matrix[block, block].astype(float)

    trasf_matrix = lil_matrix(hic_ma.matrix.shape)

    if args.method == 'norm':
        for block, submatrix in _chromosome_submatrices():
            submatrix = _obs_exp_norm(submatrix, length_chromosome,
                                      chromosome_count)
            submatrix = __pearson(submatrix)
            trasf_matrix[block, block] = lil_matrix(submatrix)
    elif args.method == 'obs_exp':
        for block, submatrix in _chromosome_submatrices():
            trasf_matrix[block, block] = lil_matrix(
                __obs_exp(submatrix, length_chromosome, chromosome_count))
    elif args.method == 'pearson':
        for block, submatrix in _chromosome_submatrices():
            log.debug("shape: {}".format(submatrix.shape))
            trasf_matrix[block, block] = lil_matrix(
                __pearson(submatrix.todense()))
    elif args.method == 'covariance':
        for block, submatrix in _chromosome_submatrices():
            log.debug("shape: {}".format(submatrix.shape))
            corrmatrix = np.cov(submatrix.todense())
            trasf_matrix[block, block] = lil_matrix(corrmatrix)
    elif args.method == 'all':
        trasf_matrix_obs_exp = lil_matrix(hic_ma.matrix.shape)
        trasf_matrix_pearson = lil_matrix(hic_ma.matrix.shape)
        trasf_matrix_corr = lil_matrix(hic_ma.matrix.shape)
        for block, submatrix in _chromosome_submatrices():
            # each step works on the result of the previous one
            submatrix = __obs_exp(submatrix, length_chromosome,
                                  chromosome_count)
            trasf_matrix_obs_exp[block, block] = lil_matrix(submatrix)
            submatrix = __pearson(submatrix)
            trasf_matrix_pearson[block, block] = lil_matrix(submatrix)
            corrmatrix = np.cov(submatrix)
            trasf_matrix_corr[block, block] = lil_matrix(corrmatrix)

        basename_outFileName = basename(args.outFileName)
        basename_obs_exp = "obs_exp_" + basename_outFileName
        basename_pearson = "pearson_" + basename_outFileName
        basename_covariance = "covariance_" + basename_outFileName
        path = dirname(args.outFileName)
        if path != '':
            path += '/'

        hic_ma.setMatrix(trasf_matrix_obs_exp.tocsr(),
                         cut_intervals=hic_ma.cut_intervals)
        hic_ma.save(path + basename_obs_exp,
                    pSymmetric=False,
                    pApplyCorrection=False)
        hic_ma.setMatrix(trasf_matrix_pearson.tocsr(),
                         cut_intervals=hic_ma.cut_intervals)
        hic_ma.save(path + basename_pearson,
                    pSymmetric=False,
                    pApplyCorrection=False)
        hic_ma.setMatrix(trasf_matrix_corr.tocsr(),
                         cut_intervals=hic_ma.cut_intervals)
        hic_ma.save(path + basename_covariance,
                    pSymmetric=False,
                    pApplyCorrection=False)

    if not args.method == 'all':
        hic_ma.setMatrix(trasf_matrix.tocsr(),
                         cut_intervals=hic_ma.cut_intervals)
        hic_ma.save(args.outFileName, pSymmetric=False, pApplyCorrection=False)
def main(args=None):
    """Filter and iteratively correct a Hi-C matrix, then save it.

    Processing order: load the matrix (restricted to ``args.chromosomes``
    when given), mask bins without any contact, optionally only write the
    diagnostic plot and return, mask the bins reported by
    ``filter_by_zscore`` and the bins below ``args.sequencedCountCutoff``,
    optionally truncate trans contacts, run ``iterative_correction``
    (per chromosome when ``args.perchr`` is set), mask bins whose row sum
    was inflated by at least ``args.inflationCutoff`` and finally save the
    matrix to ``args.outFileName``.

    :param args: argument list handed to the parser built by
                 ``parse_arguments()``; ``None`` is passed through as is.
    """
    args = parse_arguments().parse_args(args)

    if args.verbose:
        log.setLevel(logging.INFO)

    # A single chromosome of a cooler file is loaded directly; in every
    # other case the full matrix is read and reordered afterwards.
    if check_cooler(args.matrix) and args.chromosomes is not None \
            and len(args.chromosomes) == 1:
        ma = hm.hiCMatrix(args.matrix, pChrnameList=toString(args.chromosomes))
    else:
        ma = hm.hiCMatrix(args.matrix)
        if args.chromosomes:
            ma.reorderChromosomes(toString(args.chromosomes))

    # mask all zero value bins
    row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
    zero_rows = row_sum == 0
    # np.count_nonzero counts in C instead of iterating the array in Python
    log.info("Removing {} zero value bins".format(np.count_nonzero(zero_rows)))
    ma.maskBins(np.flatnonzero(zero_rows))
    matrix_shape = ma.matrix.shape
    ma.matrix = convertNansToZeros(ma.matrix)
    ma.matrix = convertInfsToZeros(ma.matrix)

    # When the parsed namespace carries 'plotName' only the diagnostic plot
    # is produced; no correction is run.
    if 'plotName' in args:
        plot_total_contact_dist(ma, args)
        log.info("Saving diagnostic plot {}\n".format(args.plotName))
        return

    log.info("matrix contains {} data points. Sparsity {:.3f}.".format(
        len(ma.matrix.data),
        float(len(ma.matrix.data)) / (ma.matrix.shape[0] ** 2)))

    if args.skipDiagonal:
        ma.diagflat(value=0)

    outlier_regions = filter_by_zscore(ma,
                                       args.filterThreshold[0],
                                       args.filterThreshold[1],
                                       perchr=args.perchr)
    # compute and print some statistics
    pct_outlier = 100 * float(len(outlier_regions)) / ma.matrix.shape[0]
    ma.printchrtoremove(outlier_regions,
                        label="Bins that are MAD outliers ({:.2f}%) "
                              "out of".format(pct_outlier),
                        restore_masked_bins=False)

    # printing must not have changed the matrix
    assert matrix_shape == ma.matrix.shape
    # mask filtered regions
    ma.maskBins(outlier_regions)
    total_filtered_out = set(outlier_regions)

    if args.sequencedCountCutoff and 0 < args.sequencedCountCutoff < 1:
        _, _, _, coverage = zip(*ma.cut_intervals)

        assert isinstance(coverage[0], np.float64)

        failed_bins = np.flatnonzero(
            np.array(coverage) < args.sequencedCountCutoff)

        ma.printchrtoremove(failed_bins,
                            label="Bins with low coverage",
                            restore_masked_bins=False)
        ma.maskBins(failed_bins)
        # NOTE(review): this replaces the outlier set collected above instead
        # of extending it; kept unchanged because it is unclear whether the
        # two index sets refer to the same bin numbering -- confirm.
        total_filtered_out = set(failed_bins)

    if args.transCutoff and 0 < args.transCutoff < 100:
        # the cutoff is given in percent; a usual cutoff is 0.05
        cutoff = float(args.transCutoff) / 100
        ma.truncTrans(high=cutoff)

    # row sums before correction, needed for the inflation check below
    pre_row_sum = np.asarray(ma.matrix.sum(axis=1)).flatten()
    correction_factors = []
    if args.perchr:
        corrected_matrix = lil_matrix(ma.matrix.shape)
        # normalize each chromosome independently
        for chrname in list(ma.interval_trees):
            first_bin, last_bin = ma.getChrBinRange(chrname)[:2]
            chr_submatrix = ma.matrix[first_bin:last_bin, first_bin:last_bin]
            _matrix, _corr_factors = iterative_correction(chr_submatrix, args)
            corrected_matrix[first_bin:last_bin, first_bin:last_bin] = _matrix
            correction_factors.append(_corr_factors)
        correction_factors = np.concatenate(correction_factors)
    else:
        corrected_matrix, correction_factors = iterative_correction(
            ma.matrix, args)

    ma.setMatrixValues(corrected_matrix)
    ma.setCorrectionFactors(correction_factors)
    log.info("Correction factors {}".format(correction_factors[:10]))

    if args.inflationCutoff and args.inflationCutoff > 0:
        after_row_sum = np.asarray(corrected_matrix.sum(axis=1)).flatten()
        # identify rows that were expanded more than args.inflationCutoff times
        to_remove = np.flatnonzero(
            after_row_sum / pre_row_sum >= args.inflationCutoff)
        ma.printchrtoremove(to_remove,
                            label="inflated >={} "
                                  "regions".format(args.inflationCutoff),
                            restore_masked_bins=False)
        total_filtered_out = total_filtered_out.union(to_remove)
        ma.maskBins(to_remove)

    ma.printchrtoremove(sorted(total_filtered_out),
                        label="Total regions to be removed",
                        restore_masked_bins=False)

    ma.save(args.outFileName, pApplyCorrection=False)
def main(args=None):
    """Compute eigenvectors per chromosome and write them as tracks.

    For every chromosome the observed/expected matrix is computed, then its
    Pearson correlation matrix, then the covariance of that matrix, whose
    eigenvectors (the first ``args.numberOfEigenvectors`` columns returned by
    ``linalg.eig``) are collected. Optionally the Pearson and
    observed/expected matrices are saved, and the eigenvectors are passed to
    the ``correlateEigenvectorWith...`` helpers when ``args.extraTrack`` is
    given. One output file is written per eigenvector, as bedgraph or bigwig.

    :param args: argument list handed to the parser built by
                 ``parse_arguments()``.
    """
    args = parse_arguments().parse_args(args)
    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        log.error("Number of output file names and number of eigenvectors"
                  " does not match. Please "
                  "provide the name of each file.\nFiles: {}\nNumber of "
                  "eigenvectors: {}".format(args.outputFileName,
                                            args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []
    # np.complex was removed from NumPy; the builtin complex covers
    # complex128 and np.complexfloating additionally covers complex64.
    complex_types = (complex, np.complexfloating)

    # PCA is computed per chromosome
    length_chromosome = 0
    chromosome_count = len(ma.getChrNames())
    if args.pearsonMatrix:
        trasf_matrix_pearson = lil_matrix(ma.matrix.shape)

    if args.obsexpMatrix:
        trasf_matrix_obsexp = lil_matrix(ma.matrix.shape)

    # total number of bins over all chromosomes
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]

    has_bigwig_track = bool(args.extraTrack) and \
        args.extraTrack.endswith(('.bw', '.bigwig'))
    bwTrack = pyBigWig.open(args.extraTrack, 'r') if has_bigwig_track else None

    try:
        for chrname in ma.getChrNames():
            chr_range = ma.getChrBinRange(chrname)
            first_bin, last_bin = chr_range[0], chr_range[1]

            submatrix = ma.matrix[first_bin:last_bin, first_bin:last_bin]
            if args.norm:
                obs_exp_matrix_ = obs_exp_matrix_norm(submatrix)
            else:
                obs_exp_matrix_ = obs_exp_matrix_lieberman(submatrix,
                                                           length_chromosome,
                                                           chromosome_count)
            obs_exp_matrix_ = convertNansToZeros(
                csr_matrix(obs_exp_matrix_)).todense()
            obs_exp_matrix_ = convertInfsToZeros(
                csr_matrix(obs_exp_matrix_)).todense()

            if args.obsexpMatrix:
                trasf_matrix_obsexp[first_bin:last_bin, first_bin:last_bin] = \
                    lil_matrix(obs_exp_matrix_)

            pearson_correlation_matrix = np.corrcoef(obs_exp_matrix_)
            pearson_correlation_matrix = convertNansToZeros(
                csr_matrix(pearson_correlation_matrix)).todense()
            pearson_correlation_matrix = convertInfsToZeros(
                csr_matrix(pearson_correlation_matrix)).todense()

            if args.pearsonMatrix:
                trasf_matrix_pearson[first_bin:last_bin, first_bin:last_bin] = \
                    lil_matrix(pearson_correlation_matrix)

            corrmatrix = np.cov(pearson_correlation_matrix)
            corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
            corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
            # only the eigenvectors are used, the eigenvalues are discarded
            _, eigs = linalg.eig(corrmatrix)
            k = args.numberOfEigenvectors

            chrom, start, end, _ = zip(*ma.cut_intervals[first_bin:last_bin])

            chrom_list += chrom
            start_list += start
            end_list += end
            if bwTrack is not None:
                assert (len(end) == len(start))
                correlateEigenvectorWithHistonMarkTrack(
                    eigs[:, :k].transpose(), bwTrack, chrname, start, end,
                    args.extraTrack, args.histonMarkType)

            vecs_list += eigs[:, :k].tolist()
    finally:
        # the track is only needed inside the loop above
        if bwTrack is not None:
            bwTrack.close()

    if args.pearsonMatrix:
        file_type = 'h5' if args.pearsonMatrix.endswith('.h5') else 'cool'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            trasf_matrix_pearson.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.pearsonMatrix,
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    if args.obsexpMatrix:
        file_type = 'h5' if args.obsexpMatrix.endswith('.h5') else 'cool'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            trasf_matrix_obsexp.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.obsexpMatrix,
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    # any extra track that is not a bigwig is handed to the gene track helper
    if args.extraTrack and not has_bigwig_track:
        vecs_list = correlateEigenvectorWithGeneTrack(ma, vecs_list,
                                                      args.extraTrack)

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert (len(vecs_list) == len(chrom_list))

            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    # rows with a different number of components are skipped
                    if len(value) == args.numberOfEigenvectors:
                        if isinstance(value[idx], complex_types):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(
                            toString(chrom_list[i]), start_list[i],
                            end_list[i], value[idx]))

    elif args.format == 'bigwig':
        if not pyBigWig.numpy == 1:
            log.error("ERROR: Your version of pyBigWig is not supporting "
                      "numpy: {}".format(pyBigWig.__file__))
            exit(1)

        # the header lists each chromosome with the end of its last bin
        old_chrom = chrom_list[0]
        header = []
        for i, _chrom in enumerate(chrom_list):
            if old_chrom != _chrom:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = _chrom

        header.append((toString(chrom_list[-1]), end_list[-1]))
        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))

            assert (len(vecs_list) == len(chrom_list))
            _chrom_list = []
            _start_list = []
            _end_list = []
            values = []

            bw = pyBigWig.open(outfile, 'w')
            try:
                # set big wig header
                bw.addHeader(header)
                # create entry lists
                for i, value in enumerate(vecs_list):
                    # it can happen that some 'value' is having less
                    # dimensions than it should
                    if len(value) == args.numberOfEigenvectors:
                        if isinstance(value[idx], complex_types):
                            value[idx] = value[idx].real
                        values.append(value[idx])
                        _chrom_list.append(toString(chrom_list[i]))
                        _start_list.append(start_list[i])
                        _end_list.append(end_list[i])

                # write entries
                bw.addEntries(_chrom_list, _start_list,
                              ends=_end_list, values=values)
            finally:
                bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)
def main(args=None):
    """Correlate several Hi-C matrices and plot scatter plots and a heatmap.

    Each matrix is reduced to its upper triangle (optionally limited to the
    distance window given by ``args.range`` and log1p transformed). The
    values of all samples are aligned on the union of non-zero positions and
    every pair of samples is correlated with the method selected by
    ``args.method``. A grid of 2D histograms is saved to
    ``args.outFileNameScatter`` and the correlation matrix is handed to
    ``plot_correlation``.

    :param args: argument list handed to the parser built by
                 ``parse_arguments()``.
    """
    args = parse_arguments().parse_args(args)

    if args.labels and len(args.matrices) != len(args.labels):
        log.error("The number of labels does not match the number of matrices.")
        exit(1)
    if not args.labels:
        # a list is required: the labels are indexed below and a map object
        # is not subscriptable on Python 3
        args.labels = [os.path.basename(x) for x in args.matrices]

    num_files = len(args.matrices)
    # initialize results matrix
    results = np.zeros((num_files, num_files), dtype='float')

    rows, cols = np.triu_indices(num_files)
    correlation_opts = {'spearman': spearmanr,
                        'pearson': pearsonr}
    hic_mat_list = []
    all_mat = None
    all_nan = []

    for i, matrix in enumerate(args.matrices):
        log.info("loading hic matrix {}\n".format(matrix))

        if check_cooler(args.matrices[i]) and args.chromosomes is not None \
                and len(args.chromosomes) == 1:
            _mat = hm.hiCMatrix(matrix, pChrnameList=args.chromosomes)
        else:
            _mat = hm.hiCMatrix(matrix)
            if args.chromosomes:
                _mat.keepOnlyTheseChr(args.chromosomes)
            _mat.filterOutInterChrCounts()

        _mat.diagflat(0)
        log.info("restore masked bins {}\n".format(matrix))
        bin_size = _mat.getBinSize()
        all_nan = np.unique(np.concatenate([all_nan, _mat.nan_bins]))

        _mat = triu(_mat.matrix, k=0, format='csr')
        if args.range:
            min_dist, max_dist = args.range.split(":")
            min_dist = int(min_dist)
            max_dist = int(max_dist)
            if max_dist < bin_size:
                log.error("Please specify a max range that is larger than bin size ({})".format(bin_size))
                exit(1)
            max_depth_in_bins = int(max_dist / bin_size)
            max_dist = int(max_dist) // bin_size
            min_dist = int(min_dist) // bin_size
            # work only with the upper matrix
            # and remove all pixels that are beyond
            # max_depth_in_bins
            # (this is done by subtracting a second sparse matrix
            # that contains only the upper matrix that wants to be removed.
            _mat = triu(_mat, k=0, format='csr') - \
                triu(_mat, k=max_depth_in_bins, format='csr')
            _mat.eliminate_zeros()

            # keep only the pixels inside the requested distance window
            _mat_coo = _mat.tocoo()
            dist = _mat_coo.col - _mat_coo.row
            keep = np.flatnonzero((dist <= max_dist) & (dist >= min_dist))
            _mat_coo.data = _mat_coo.data[keep]
            _mat_coo.row = _mat_coo.row[keep]
            _mat_coo.col = _mat_coo.col[keep]
            _mat = _mat_coo.tocsr()
        else:
            _mat = triu(_mat, k=0, format='csr')

        if args.log1p:
            _mat.data = np.log1p(_mat.data)
        if all_mat is None:
            all_mat = _mat
        else:
            all_mat = all_mat + _mat

        hic_mat_list.append(_mat)

    # remove nan bins; the collected ids are floats because the accumulator
    # starts as an empty list, np.delete needs integer indices
    rows_keep = cols_keep = np.delete(np.arange(all_mat.shape[1]),
                                      np.asarray(all_nan, dtype=int))
    all_mat = all_mat[rows_keep, :][:, cols_keep]

    # make large matrix to correlate by
    # using sparse matrix tricks
    big_mat = None
    for mat in hic_mat_list:
        mat = mat[rows_keep, :][:, cols_keep]
        # adding all_mat forces the sparsity pattern of the union, the
        # subtraction restores the values of this sample
        sample_vector = (mat + all_mat).data - all_mat.data
        if big_mat is None:
            big_mat = sample_vector
        else:
            big_mat = np.vstack([big_mat, sample_vector])

    # take the transpose such that columns represent each of the samples
    big_mat = np.ma.masked_invalid(big_mat).T

    grids = gridspec.GridSpec(num_files, num_files)
    grids.update(wspace=0, hspace=0)
    fig = plt.figure(figsize=(2 * num_files, 2 * num_files))
    plt.rcParams['font.size'] = 8.0

    min_value = int(big_mat.min())
    max_value = int(big_mat.max())
    # NOTE(review): 'min_value % 1 == 0' is always true, so the second clause
    # reduces to 'max_value is odd'; kept as is because changing it alters
    # the minor tick positions -- confirm the intended parity rule.
    if (min_value % 2 == 0 and max_value % 2 == 0) or \
            (min_value % 1 == 0 and max_value % 2 == 1):
        # make one value odd and the other even
        max_value += 1

    if args.log1p:
        major_locator = FixedLocator(list(range(min_value, max_value, 2)))
        minor_locator = FixedLocator(list(range(min_value, max_value, 1)))

    for row, col in zip(rows, cols):
        if row == col:
            results[row, col] = 1

            # add titles as
            # empty plot in the diagonal
            ax = fig.add_subplot(grids[row, col])
            ax.text(0.6, 0.6, args.labels[row],
                    verticalalignment='center',
                    horizontalalignment='center',
                    fontsize=10, fontweight='bold',
                    transform=ax.transAxes)
            ax.set_axis_off()
            continue

        log.info("comparing {} and {}\n".format(args.matrices[row],
                                                args.matrices[col]))

        # remove cases in which both are zero or one is zero and
        # the other is one
        _mat = big_mat[:, [row, col]]
        _mat = _mat[_mat.sum(axis=1) > 1, :]
        vector1 = _mat[:, 0]
        vector2 = _mat[:, 1]

        results[row, col] = correlation_opts[args.method](vector1, vector2)[0]

        # scatter plots
        ax = fig.add_subplot(grids[row, col])
        if args.log1p:
            ax.xaxis.set_major_locator(major_locator)
            ax.xaxis.set_minor_locator(minor_locator)
            ax.yaxis.set_major_locator(major_locator)
            ax.yaxis.set_minor_locator(minor_locator)

        ax.text(0.2, 0.8, "{}={:.2f}".format(args.method, results[row, col]),
                horizontalalignment='left',
                transform=ax.transAxes)
        # booleans are used for the tick switches: the strings 'on'/'off'
        # are both truthy and would enable the ticks
        ax.get_yaxis().set_tick_params(
            which='both', left=False, right=False, direction='out')
        ax.get_xaxis().set_tick_params(
            which='both', top=False, bottom=False, direction='out')

        # y tick labels only on the last column, drawn on the right side
        if col != num_files - 1:
            ax.set_yticklabels([])
        else:
            ax.yaxis.tick_right()
            ax.get_yaxis().set_tick_params(
                which='both', left=False, right=True, direction='out')

        # x tick labels only on the plots next to the diagonal
        if col - row == 1:
            ax.xaxis.tick_bottom()
            ax.get_xaxis().set_tick_params(
                which='both', top=False, bottom=True, direction='out')
        else:
            ax.set_xticklabels([])

        ax.hist2d(vector1, vector2, bins=150, cmin=0.1)

    fig.tight_layout()
    log.info("saving {}".format(args.outFileNameScatter))
    fig.savefig(args.outFileNameScatter, bbox_inches='tight')

    # mirror the upper triangle to get the full symmetric matrix
    results = results + np.triu(results, 1).T
    plot_correlation(results, args.labels, args.outFileNameHeatmap,
                     args.zMax, args.zMin, args.colorMap,
                     image_format=args.plotFileFormat)
def __init__(self, *args, **kwargs):
    """Build the matrix backing this track and prepare it for plotting.

    The parent initializer is run first. The region stored in
    ``self.properties`` (extended by ``depth`` on both sides) selects what
    the ``hiCMatrix`` object covers; matrix values and intervals come from
    ``self.definematrix``. Afterwards masked bins are removed unless
    ``show_masked_bins`` is 'yes', the requested transform is validated,
    the matrix is cut to the band that is plotted, an empty main diagonal
    is filled, and orientation and colormap are set.
    """
    super(EngineHiCTrack, self).__init__(*args, **kwargs)

    log.debug('FILE {}'.format(self.properties))

    # translate the configured region into the chromosome selector
    region = None
    if self.properties['region'] is not None:
        if self.properties['region'][2] == 1e15:
            # an end of 1e15 selects the chromosome only, without coordinates
            region = [str(self.properties['region'][0])]
        elif len(self.properties['region']) == 3:
            depth = int(self.properties['depth'])
            start = max(int(self.properties['region'][1]) - depth, 0)
            end = int(self.properties['region'][2]) + depth
            region = [
                str(self.properties['region'][0]) + ':' + str(start) + '-' +
                str(end)
            ]

    # initialize matrix as HiCMatrix object with no data
    self.hic_ma = HiCMatrix.hiCMatrix(pMatrixFile=None, pChrnameList=region)

    # defaults for the matrix that is created below
    for key, default in (('matrix shape', 1000),
                         ('binsize', 3000),
                         ('intervals start', 0)):
        if key not in self.properties:
            self.properties[key] = default

    matrix_and_intervals = self.definematrix(
        self.properties['matrix shape'],
        self.properties['binsize'],
        self.properties['intervals start'],
        self.properties['chrom'])
    self.hic_ma.matrix, self.hic_ma.cut_intervals = matrix_and_intervals

    trees_and_boundaries = self.hic_ma.intervalListToIntervalTree(
        self.hic_ma.cut_intervals)
    self.hic_ma.interval_trees, self.hic_ma.chrBinBoundaries = \
        trees_and_boundaries

    if len(self.hic_ma.matrix.data) == 0:
        self.log.error("Matrix {} is empty".format(self.properties['file']))
        exit(1)

    # masked bins stay only when explicitly requested
    show_masked = 'show_masked_bins' in self.properties and \
        self.properties['show_masked_bins'] == 'yes'
    if not show_masked:
        self.hic_ma.maskBins(self.hic_ma.nan_bins)

    # check that the matrix can be log transformed; each entry holds the
    # transform name, the offset added to the minimum before the sign test
    # and the wording used in the error message
    if 'transform' in self.properties:
        transform = self.properties['transform']
        for name, offset, wording in (('log1p', 1, "log1p"),
                                      ('-log', 0, "log(-1 * <values>)"),
                                      ('log', 0, "log")):
            if transform != name:
                continue
            if self.hic_ma.matrix.data.min() + offset < 0:
                self.log.error(
                    "\n*ERROR*\nMatrix contains negative values.\n"
                    "{} transformation can not be applied to \n"
                    "values in matrix: {}".format(wording,
                                                  self.properties['file']))
                exit(1)
            break

    bin_length = self.hic_ma.getBinSize()
    max_depth_in_bins = int(self.properties['depth'] / bin_length)

    # work only with the upper triangle and drop every pixel further than
    # 2 * max_depth_in_bins from the diagonal: the upper triangle starting
    # at that offset is subtracted from the full upper triangle
    limit = 2 * max_depth_in_bins
    upper = scipy.sparse.triu(self.hic_ma.matrix, k=0, format='csr')
    beyond_limit = scipy.sparse.triu(self.hic_ma.matrix, k=limit, format='csr')
    self.hic_ma.matrix = upper - beyond_limit
    self.hic_ma.matrix.eliminate_zeros()

    # an empty main diagonal is filled with the largest value found in the
    # matrix
    if sum(self.hic_ma.matrix.diagonal()) == 0:
        self.log.info(
            "Filling main diagonal with max value because it empty and looks bad...\n"
        )
        fill_value = self.hic_ma.matrix.data.max()
        size = self.hic_ma.matrix.shape[0]
        main_diagonal = scipy.sparse.dia_matrix(
            ([fill_value] * size, [0]), shape=self.hic_ma.matrix.shape)
        self.hic_ma.matrix = self.hic_ma.matrix + main_diagonal

    self.plot_inverted = ('orientation' in self.properties and
                          self.properties['orientation'] == 'inverted')

    self.norm = None

    if 'colormap' not in self.properties:
        self.properties['colormap'] = DEFAULT_MATRIX_COLORMAP

    self.cmap = cm.get_cmap(self.properties['colormap'])
    self.cmap.set_bad('white')

    self.background = True
def main(args=None):
    """Compute the sparsity of every reference point and filter on it.

    For each matrix the reference points are split over ``args.threads``
    worker processes running ``compute_sparsity``. The per-matrix results
    are combined into one row per reference point. Reference points are
    written to the accepted, rejected or failed file, a raw table and a
    report are produced, and two plots of the sparsity distribution are
    saved.

    :param args: argument list handed to the parser built by
                 ``parse_arguments()``.
    """
    args = parse_arguments().parse_args(args)
    viewpointObj = Viewpoint()

    referencePoints, _ = viewpointObj.readReferencePointFile(
        args.referencePoints)

    # compute for each viewpoint the sparsity and consider these as bad with a
    # sparsity less than given.
    referencePointsPerThread = len(referencePoints) // args.threads
    queue = [None] * args.threads
    process = [None] * args.threads
    sparsity = []
    fail_flag = False
    fail_message = ''

    for matrix in args.matrices:
        sparsity_local = [None] * args.threads
        hic_ma = hm.hiCMatrix(matrix)
        viewpointObj.hicMatrix = hic_ma
        all_data_collected = False
        thread_done = [False] * args.threads

        for i in range(args.threads):
            # the last worker also takes the remainder of the division
            if i < args.threads - 1:
                referencePointsThread = referencePoints[
                    i * referencePointsPerThread:
                    (i + 1) * referencePointsPerThread]
            else:
                referencePointsThread = referencePoints[
                    i * referencePointsPerThread:]

            if len(referencePointsThread) == 0:
                # nothing to compute: mark the slot as finished, otherwise
                # the polling loop below would wait for it forever
                process[i] = None
                queue[i] = None
                sparsity_local[i] = []
                thread_done[i] = True
                continue

            queue[i] = Queue()
            process[i] = Process(
                target=compute_sparsity,
                kwargs=dict(pReferencePoints=referencePointsThread,
                            pViewpointObj=viewpointObj,
                            pArgs=args,
                            pQueue=queue[i]))
            process[i].start()
            log.debug('process started {}'.format(i))

        while not all_data_collected:
            for i in range(args.threads):
                if queue[i] is not None and not queue[i].empty():
                    sparsity_ = queue[i].get()
                    # a worker reports a failure as a 'Fail: <message>' entry
                    if 'Fail:' in sparsity_:
                        fail_flag = True
                        fail_message = sparsity_[6:]
                    log.debug('process computed: {}'.format(i))
                    sparsity_local[i] = sparsity_
                    queue[i] = None
                    process[i].join()
                    process[i].terminate()
                    process[i] = None
                    thread_done[i] = True
            all_data_collected = all(thread_done)
            if not all_data_collected:
                time.sleep(1)

        del hic_ma
        del viewpointObj.hicMatrix

        if fail_flag:
            log.error(fail_message)
            exit(1)
        # merge sparsity data per matrix from each thread to one list
        sparsity_local = [
            item for sublist in sparsity_local for item in sublist
        ]
        sparsity.append(sparsity_local)

    # change sparsity to sparsity values per viewpoint per matrix:
    # viewpoint = [matrix1, ..., matrix_n]
    sparsity = np.array(sparsity).T

    count_accepted = 0
    count_rejected = 0
    count_failure = 0

    with open(args.referencePoints, 'r') as reference_file_input, \
            open(args.outFileName + '_raw_filter', 'w') as output_file_raw, \
            open(args.outFileName + '_failed_reference_points', 'w') as output_file_failed, \
            open(args.outFileName + '_rejected_filter', 'w') as output_file_rejected, \
            open(args.outFileName, 'w') as output_file:

        output_file_raw.write(
            '# Created with chicQualityControl version {}\n'.format(
                __version__))
        output_file_raw.write(
            '# A sparsity of -1.0 indicates a faulty reference point e.g. no data for this reference point was in the matrix.\n'
        )
        output_file_raw.write('# Used Matrices ')
        for matrix in args.matrices:
            output_file_raw.write('{}\t'.format(matrix))
        output_file_raw.write('\n# Chromosome\tStart\tEnd')
        for matrix in args.matrices:
            output_file_raw.write('\tSparsity {}'.format(
                os.path.basename(matrix)))
        output_file_raw.write('\n')

        for i, line in enumerate(reference_file_input):
            values = sparsity[i]
            sparsity_str = '\t'.join(str(x) for x in values)
            output_file_raw.write(line.strip() + '\t' + sparsity_str + '\n')

            if np.any(values == -1.0):
                # at least one matrix had no data for this reference point
                output_file_failed.write(line)
                count_failure += 1
            elif np.any(values > args.sparsity):
                # accepted as soon as one matrix exceeds the threshold
                output_file.write(line)
                count_accepted += 1
            else:
                output_file_rejected.write(line)
                count_rejected += 1

    with open(args.outFileName + '_report', 'w') as output_file_report:
        output_file_report.write(
            '# Created with chicQualityControl version {}\n'.format(
                __version__))
        output_file_report.write('# QC report for matrices: ')
        for matrix in args.matrices:
            output_file_report.write(matrix + ' ')
        output_file_report.write('\n')
        output_file_report.write(
            '#Sparsity threshold for rejection: sparsity <= {} are rejected.\n'
            .format(args.sparsity))
        output_file_report.write('\nNumber of reference points: {}\n'.format(
            str(count_accepted + count_rejected + count_failure)))
        output_file_report.write(
            'Number of accepted reference points: {}\n'.format(
                str(count_accepted)))
        output_file_report.write(
            'Number of rejected reference points: {}\n'.format(
                str(count_rejected)))
        output_file_report.write(
            'Number of faulty reference points: {}\n'.format(
                str(count_failure)))
        output_file_report.write(
            '\n\nA faulty reference point is caused by the non-presence of the chromosome in one of the given matrices.\n'
        )
        output_file_report.write(
            'It can also be caused by the non-presence of valid Hi-C reads in a region, especially at the chromosome ends.\n'
        )
        output_file_report.write(
            'Please check the results of hicInfo to validate this for your data.\n'
        )

    # output plot of sparsity distribution per sample
    # remove faulty reference points (any -1.0 entry) from the statistics
    valid_rows = np.all(sparsity != -1.0, axis=1)
    sparsity = sparsity[valid_rows]

    sample_count = len(args.matrices)
    # every sample is drawn on its own horizontal line
    y = [[i] * len(sparsity) for i in range(sample_count)]
    sparsity = sparsity.T
    x = [sparsity[i].flatten() for i in range(sample_count)]

    for i in range(sample_count):
        plt.plot(x[i], y[i], 'o', mfc='none', markersize=0.3,
                 label=args.matrices[i].split('/')[-1])
    plt.yticks([])
    plt.xlabel("Sparsity level")
    plt.axvline(x=args.sparsity, c='r', label='sparsity threshold',
                linewidth=0.3)
    plt.xscale('log')

    # shrink the axes to make room for the legend on the right
    ax = plt.gca()
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.6, box.height])

    plt.legend(loc='center', bbox_to_anchor=(1.4, 0.5))
    plt.savefig(args.outFileNameSparsity, dpi=args.dpi)
    plt.close()

    for i in range(sample_count):
        plt.hist(x[i], bins=100, alpha=0.5,
                 label=args.matrices[i].split('/')[-1])
    plt.xlabel("Sparsity level")
    plt.ylabel("Number of counts")
    ax = plt.gca()
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.6, box.height])

    plt.legend(loc='center', bbox_to_anchor=(1.4, 0.5))
    plt.savefig(args.outFileNameHistogram, dpi=args.dpi)
def main(args=None):
    """Compute the sparsity of every reference point and filter on it.

    For each matrix the reference points are split over ``args.threads``
    worker processes running ``compute_sparsity``. The per-matrix results
    are combined into one row per reference point. Reference points with at
    least one sparsity above ``args.sparsity`` are written to
    ``args.outFileName``, the others to the rejected file; a raw table and
    two plots of the sparsity distribution are saved as well.

    :param args: argument list handed to the parser built by
                 ``parse_arguments()``.
    """
    args = parse_arguments().parse_args(args)
    viewpointObj = Viewpoint()

    referencePoints, _ = viewpointObj.readReferencePointFile(
        args.referencePoints)

    # compute for each viewpoint the sparsity and consider these as bad with a
    # sparsity less than given.
    referencePointsPerThread = len(referencePoints) // args.threads
    queue = [None] * args.threads
    process = [None] * args.threads
    sparsity = []

    for matrix in args.matrices:
        sparsity_local = [None] * args.threads
        hic_ma = hm.hiCMatrix(matrix)
        viewpointObj.hicMatrix = hic_ma
        all_data_collected = False
        thread_done = [False] * args.threads

        for i in range(args.threads):
            # the last worker also takes the remainder of the division
            if i < args.threads - 1:
                referencePointsThread = referencePoints[
                    i * referencePointsPerThread:
                    (i + 1) * referencePointsPerThread]
            else:
                referencePointsThread = referencePoints[
                    i * referencePointsPerThread:]

            if len(referencePointsThread) == 0:
                # nothing to compute: mark the slot as finished, otherwise
                # the polling loop below would wait for it forever
                process[i] = None
                queue[i] = None
                sparsity_local[i] = []
                thread_done[i] = True
                continue

            queue[i] = Queue()
            process[i] = Process(
                target=compute_sparsity,
                kwargs=dict(pReferencePoints=referencePointsThread,
                            pViewpointObj=viewpointObj,
                            pArgs=args,
                            pQueue=queue[i]))
            process[i].start()

        while not all_data_collected:
            for i in range(args.threads):
                if queue[i] is not None and not queue[i].empty():
                    sparsity_local[i] = queue[i].get()
                    queue[i] = None
                    process[i].join()
                    process[i].terminate()
                    process[i] = None
                    thread_done[i] = True
            all_data_collected = all(thread_done)
            if not all_data_collected:
                time.sleep(1)

        del hic_ma
        del viewpointObj.hicMatrix

        # merge sparsity data per matrix from each thread to one list
        sparsity_local = [
            item for sublist in sparsity_local for item in sublist
        ]
        sparsity.append(sparsity_local)

    # change sparsity to sparsity values per viewpoint per matrix:
    # viewpoint = [matrix1, ..., matrix_n]
    sparsity = np.array(sparsity).T

    with open(args.referencePoints, 'r') as reference_file_input, \
            open(args.outFileName + '_raw_filter', 'w') as output_file_raw, \
            open(args.outFileName + '_rejected_filter', 'w') as output_file_rejected, \
            open(args.outFileName, 'w') as output_file:

        output_file_raw.write(
            '# Created with chicQualityControl version {}\n'.format(
                __version__))
        output_file_raw.write('# Chromosome\tStart\tEnd\t')
        for matrix in args.matrices:
            output_file_raw.write('Sparsity {}\t'.format(matrix))
        output_file_raw.write('\n')

        for i, line in enumerate(reference_file_input):
            values = sparsity[i]
            sparsity_str = '\t'.join(str(x) for x in values)
            output_file_raw.write(line.strip() + '\t' + sparsity_str + '\n')

            # accepted as soon as one matrix exceeds the threshold
            if np.any(values > args.sparsity):
                output_file.write(line)
            else:
                output_file_rejected.write(line)

    # output plot of sparsity distribution per sample
    sample_count = len(args.matrices)
    # every sample is drawn on its own horizontal line
    y = [[i] * len(sparsity) for i in range(sample_count)]
    sparsity = sparsity.T
    x = [sparsity[i].flatten() for i in range(sample_count)]

    for i in range(sample_count):
        plt.plot(x[i], y[i], 'o', mfc='none', markersize=0.3,
                 label=args.matrices[i].split('/')[-1])
    plt.yticks([])
    plt.xlabel("Sparsity level")
    plt.axvline(x=args.sparsity, c='r', label='sparsity threshold',
                linewidth=0.3)
    plt.xscale('log')

    # shrink the axes to make room for the legend on the right
    ax = plt.gca()
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.6, box.height])

    plt.legend(loc='center', bbox_to_anchor=(1.4, 0.5))
    plt.savefig(args.outFileNameSparsity, dpi=args.dpi)
    plt.close()

    for i in range(sample_count):
        plt.hist(x[i], bins=100, alpha=0.5,
                 label=args.matrices[i].split('/')[-1])
    plt.xlabel("Sparsity level")
    plt.ylabel("Number of counts")
    ax = plt.gca()
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.6, box.height])

    plt.legend(loc='center', bbox_to_anchor=(1.4, 0.5))
    plt.savefig(args.outFileNameHistogram, dpi=args.dpi)
def main(args=None):
    """Compute the viewpoints of all reference points for every matrix.

    The reference points are distributed over ``args.threads`` worker
    processes running ``compute_viewpoint``; each worker returns the names
    of the files it wrote through its queue. Optionally the collected file
    names are written to ``args.writeFileNamesToFile`` and, with
    ``args.allViewpointsList``, to a second file with the suffix 'all'.

    :param args: argument list handed to the parser built by
                 ``parse_arguments()``.
    """
    args = parse_arguments().parse_args(args)

    viewpointObj = Viewpoint()
    referencePoints, gene_list = viewpointObj.readReferencePointFile(
        args.referencePoints)

    thread_count = args.threads
    referencePointsPerThread = len(referencePoints) // thread_count
    queue = [None] * thread_count
    process = [None] * thread_count
    file_list = []

    background_model = viewpointObj.readBackgroundDataFile(
        args.backgroundModelFile, args.range, args.fixateRange)
    background_model_mean_values = viewpointObj.readBackgroundDataFile(
        args.backgroundModelFile, args.range, args.fixateRange, pMean=True)

    if not os.path.exists(args.outputFolder):
        try:
            os.makedirs(args.outputFolder)
        except OSError as exc:
            # another process may have created the folder in the meantime;
            # only that case is tolerated
            if exc.errno != errno.EEXIST:
                raise

    fail_flag = False
    fail_message = ''

    for matrix in args.matrices:
        hic_ma = hm.hiCMatrix(matrix)
        viewpointObj.hicMatrix = hic_ma
        file_list_sample = [None] * thread_count
        all_data_collected = False

        for i in range(thread_count):
            # the last worker gets everything that is left
            first = i * referencePointsPerThread
            last = (i + 1) * referencePointsPerThread \
                if i < thread_count - 1 else None
            referencePointsThread = referencePoints[first:last]
            geneListThread = gene_list[first:last]

            if len(referencePointsThread) == 0:
                process[i] = None
                queue[i] = None
                file_list_sample[i] = []
                continue

            queue[i] = Queue()
            worker_arguments = dict(
                pViewpointObj=viewpointObj,
                pArgs=args,
                pQueue=queue[i],
                pReferencePoints=referencePointsThread,
                pGeneList=geneListThread,
                pMatrix=matrix,
                pBackgroundModel=background_model,
                pBackgroundModelRelativeInteractions=background_model_mean_values,
                pOutputFolder=args.outputFolder)
            process[i] = Process(target=compute_viewpoint,
                                 kwargs=worker_arguments)
            process[i].start()

        # poll the queues once per second until no worker is left
        while not all_data_collected:
            for i in range(thread_count):
                if queue[i] is None or queue[i].empty():
                    continue
                file_list_ = queue[i].get()
                if 'Fail:' in file_list_:
                    fail_flag = True
                    fail_message = file_list_[6:]
                file_list_sample[i] = file_list_
                process[i].join()
                process[i].terminate()
                process[i] = None

            all_data_collected = all(worker is None for worker in process)
            time.sleep(1)

        if fail_flag:
            log.error(fail_message)
            exit(1)

        flattened = []
        for sublist in file_list_sample:
            for item in sublist:
                flattened.append(item)
        file_list.append(flattened)

    log.debug('file_list {}'.format(file_list))

    if args.writeFileNamesToFile:
        with open(args.writeFileNamesToFile, 'w') as names_file:
            log.debug('len(file_list) {}'.format(len(file_list)))
            if len(file_list) > 1:
                # every pair of samples, viewpoint by viewpoint
                for first_index in range(len(file_list)):
                    first_sample = file_list[first_index]
                    for second_sample in file_list[first_index + 1:]:
                        for viewpoint, viewpoint2 in zip(first_sample,
                                                         second_sample):
                            names_file.write(viewpoint + '\n')
                            names_file.write(viewpoint2 + '\n')
            else:
                for viewpoint in file_list[0]:
                    names_file.write(viewpoint + '\n')

    if args.allViewpointsList:
        with open(args.writeFileNamesToFile + 'all', 'w') as names_file:
            if len(file_list) > 1:
                # the same viewpoint of all samples on consecutive lines
                for position, viewpoint in enumerate(file_list[0]):
                    names_file.write(viewpoint + '\n')
                    for sample_index in range(1, len(file_list)):
                        names_file.write(
                            file_list[sample_index][position] + '\n')
            else:
                for viewpoint in file_list[0]:
                    names_file.write(viewpoint + '\n')
def main(args=None):
    """Keep, remove or mask chromosomes or regions of a Hi-C matrix.

    Exactly one selection is used, in this order of precedence:

    * ``args.chromosomes``: chromosomes are kept, removed or masked according
      to ``args.action``.
    * ``args.regions``: a tab separated file with three columns
      (chromosome, start, end); the bins returned by ``getRegionBinRange``
      for each region are kept, masked or removed.
    * ``args.maskBadRegions``: the matrix is loaded unchanged.

    If a matrix was loaded it is saved to ``args.outFileName``. Passing both
    chromosomes and regions is an error and exits with status 1.

    :param args: argument list forwarded to the parser built by
                 ``parse_arguments()``.
    """
    args = parse_arguments().parse_args(args)

    if args.chromosomes is not None and args.regions is not None:
        log.error('Please specify either --chromosomes or --regions.')
        exit(1)

    hic_ma = None
    if args.chromosomes:
        if check_cooler(args.matrix) and len(
                args.chromosomes) == 1 and args.action == 'keep':
            # a single chromosome of a cooler file can be loaded directly
            hic_ma = hm.hiCMatrix(args.matrix, pChrnameList=args.chromosomes)
        else:
            hic_ma = hm.hiCMatrix(args.matrix)

            if args.action == 'keep':
                hic_ma.reorderChromosomes(args.chromosomes)
            elif args.action == 'remove':
                chromosomes = [chromosome
                               for chromosome in hic_ma.chrBinBoundaries
                               if chromosome not in args.chromosomes]
                hic_ma.reorderChromosomes(chromosomes)
            elif args.action == 'mask':
                hic_ma.maskChromosomes(args.chromosomes)
    elif args.regions:
        hic_ma = hm.hiCMatrix(args.matrix)

        genomic_regions = []
        with open(args.regions, 'r') as file:
            for line in file:
                fields = line.strip().split('\t')
                # only well-formed three column lines describe a region;
                # this also skips blank lines
                if len(fields) != 3:
                    continue
                # start and end are both converted to int; the end is made
                # inclusive by subtracting one
                genomic_regions.append(
                    (fields[0], int(fields[1]), int(fields[2]) - 1))

        matrix_indices_regions = []
        for chrom, start, end in genomic_regions:
            _regionBinRange = hic_ma.getRegionBinRange(chrom, start, end)
            if _regionBinRange is not None:
                first_bin, last_bin = _regionBinRange
                matrix_indices_regions.extend(range(first_bin, last_bin))

        if args.action == 'keep':
            hic_ma.reorderBins(matrix_indices_regions)
        elif args.action == 'mask':
            hic_ma.maskBins(matrix_indices_regions)
        elif args.action == 'remove':
            # keep every bin that is not part of a listed region; the
            # explicit integer dtype keeps an empty selection valid as index
            number_of_bins = max(hic_ma.matrix.shape[0], hic_ma.matrix.shape[1])
            keep_bin = np.ones(number_of_bins, dtype=bool)
            keep_bin[np.array(matrix_indices_regions, dtype=int)] = False
            hic_ma.reorderBins(np.arange(number_of_bins)[keep_bin])
    elif args.maskBadRegions:
        # This branch is only reached when no chromosomes were given, so the
        # single-chromosome shortcut used above cannot apply here (and
        # len(None) would raise); load the complete matrix.
        hic_ma = hm.hiCMatrix(args.matrix)
    else:
        log.info(
            'No data to adjust given. Please specify either --chromosomes or --region parameter.'
        )

    if hic_ma is not None:
        hic_ma.save(args.outFileName)
def main(args=None):
    """Print or save a short information report for each input matrix.

    For cooler files (when ``args.no_metadata`` is true) the values are taken
    from the ``info`` mapping of the cooler file; otherwise the matrix is
    loaded and the values are computed from it. The report is written to
    ``args.outFileName`` if given, else printed. With several matrices the
    reports are appended one after the other to the same output file.

    :param args: argument list forwarded to the parser built by
                 ``parse_arguments()``.
    """
    args = parse_arguments().parse_args(args)

    for matrix_index, matrix in enumerate(args.matrices):
        generated_by = None
        genome_assembly = None
        statistics = None
        generated_by_cooler_lib = None
        tool_url = None
        matrix_generated_by = None
        matrix_generated_by_url = None
        creation_date = None
        chromosomes = None
        bin_length = None
        size = None
        nchroms = None
        num_non_zero = None
        min_non_zero = None
        max_non_zero = None
        sum_elements = None
        num_nan_bins = None

        if check_cooler(matrix) and args.no_metadata:
            cooler_file = cooler.Cooler(matrix)
            # fetch the attribute mapping once instead of once per key
            info = cooler_file.info
            if info is not None:
                # absent keys leave the corresponding value at None
                bin_length = info.get('bin-size')
                size = info.get('nbins')
                nchroms = info.get('nchroms')
                chromosomes = info.get('chromosomes')
                num_non_zero = info.get('nnz')
                min_non_zero = info.get('min-value')
                max_non_zero = info.get('max-value')
                creation_date = info.get('creation-date')
                sum_elements = info.get('sum-elements')

                if 'generated-by' in info:
                    generated_by = toString(info['generated-by'])
                if 'genome-assembly' in info:
                    genome_assembly = toString(info['genome-assembly'])
                if 'generated-by-cooler-lib' in info:
                    generated_by_cooler_lib = toString(
                        info['generated-by-cooler-lib'])
                if 'tool-url' in info:
                    tool_url = toString(info['tool-url'])
                if 'matrix-generated-by' in info:
                    matrix_generated_by = toString(
                        info['matrix-generated-by'])
                if 'matrix-generated-by-url' in info:
                    matrix_generated_by_url = toString(
                        info['matrix-generated-by-url'])

                metadata = info.get('metadata')
                if metadata is not None and 'statistics' in metadata:
                    statistics = metadata['statistics']
        else:
            hic_ma = hm.hiCMatrix(matrix)
            size = hic_ma.matrix.shape[0]
            num_non_zero = hic_ma.matrix.nnz
            sum_elements = hic_ma.matrix.sum() / 2
            bin_length = hic_ma.getBinSize()
            num_nan_bins = len(hic_ma.nan_bins)
            # min()/max() raise on an array without elements
            if hic_ma.matrix.data.size > 0:
                min_non_zero = hic_ma.matrix.data.min()
                max_non_zero = hic_ma.matrix.data.max()
            chromosomes = list(hic_ma.chrBinBoundaries)

        information = StringIO()
        information.write(
            "# Matrix information file. Created with HiCExplorer's hicInfo version {}\n"
            .format(__version__))

        if matrix is not None:
            information.write("File:\t{}\n".format(matrix))
        if creation_date is not None:
            information.write("Date:\t{}\n".format(creation_date))
        if genome_assembly is not None:
            information.write("Genome assembly:\t{}\n".format(genome_assembly))
        if size is not None:
            information.write("Size:\t{:,}\n".format(size))
        if bin_length is not None:
            information.write("Bin_length:\t{}\n".format(bin_length))
        if sum_elements is not None:
            information.write("Sum of matrix:\t{}\n".format(sum_elements))
        if chromosomes is not None:
            information.write("Chromosomes:\t{}\n".format(", ".join(
                toString(chromosomes))))
        if nchroms is not None:
            information.write("Number of chromosomes:\t{}\n".format(nchroms))
        if num_non_zero is not None:
            information.write(
                "Non-zero elements:\t{:,}\n".format(num_non_zero))
        if min_non_zero is not None:
            information.write("Minimum (non zero):\t{}\n".format(min_non_zero))
        if max_non_zero is not None:
            information.write("Maximum:\t{}\n".format(max_non_zero))
        if num_nan_bins is not None:
            information.write("NaN bins:\t{}\n".format(num_nan_bins))
        if check_cooler(matrix):
            information.write(
                'The following columns are available: {}\n'.format(
                    cooler.Cooler(matrix).bins().columns.values))
        if generated_by is not None:
            information.write("\n\nGenerated by:\t{}\n".format(generated_by))
        if generated_by_cooler_lib is not None:
            information.write("Cooler library version:\t{}\n".format(
                generated_by_cooler_lib))
        if tool_url is not None:
            information.write("HiCMatrix url:\t{}\n".format(tool_url))
        if matrix_generated_by is not None:
            information.write("Interaction matrix created with:\t{}\n".format(
                matrix_generated_by))
        if matrix_generated_by_url is not None:
            information.write("URL:\t{}\n".format(matrix_generated_by_url))
        if statistics is not None:
            information.write("\n\nBuild statistics:\n{}\n".format(statistics))

        if args.outFileName:
            # Truncate the file for the first matrix only; later reports are
            # appended so that they do not overwrite the earlier ones.
            mode = 'w' if matrix_index == 0 else 'a'
            with open(args.outFileName, mode) as file:
                file.write(information.getvalue())
        else:
            print(information.getvalue())
        information.close()
def main(args=None):
    """Plot the mean Hi-C counts against the genomic distance.

    For every matrix the mean value per distance is obtained from
    ``compute_distance_mean`` (optionally per chromosome). Curves are scaled
    by ``smallest matrix sum / own matrix sum`` so that matrices are
    comparable, and drawn on log-log axes. With several matrices and
    ``args.perchr`` one panel per chromosome is created; otherwise a single
    panel is used. The figure is saved to ``args.plotFile.name`` and, if
    ``args.outFileData`` is given, all plotted values are exported as one
    tab separated table.

    :param args: argument list forwarded to the parser built by
                 ``parse_arguments()``.
    """
    args = parse_arguments().parse_args(args)
    mean_dict = OrderedDict()
    matrix_sum = {}
    if args.labels is None:
        labels = OrderedDict([(x, os.path.basename(x)) for x in args.matrices])
    else:
        labels = OrderedDict(zip(args.matrices, args.labels))

    if args.chromosomeExclude is None:
        args.chromosomeExclude = []

    chroms = set()
    for matrix_file in args.matrices:
        hic_ma = HiCMatrix.hiCMatrix(matrix_file)
        # the sum is taken before chromosomes are excluded
        matrix_sum[matrix_file] = hic_ma.matrix.sum()

        chrtokeep = [x for x in hic_ma.interval_trees
                     if x not in args.chromosomeExclude]
        hic_ma.keepOnlyTheseChr(chrtokeep)

        mean_dict[matrix_file] = compute_distance_mean(
            hic_ma, maxdepth=args.maxdepth, perchr=args.perchr)
        chroms.update(k for k, values in mean_dict[matrix_file].items()
                      if len(values) > 1)

    # compute scale factors such that values are comparable
    min_sum = min(matrix_sum.values())
    scale_factor = {matrix_file: float(min_sum) / mat_sum
                    for matrix_file, mat_sum in matrix_sum.items()}
    log.info("The scale factors used are: {}".format(scale_factor))

    if len(args.matrices) > 1 and args.perchr:
        # in this case, for each chromosome a plot is made that combines the
        # data from the hic matrices
        max_cols = 4
        num_rows = int(np.ceil(float(len(chroms)) / max_cols))
        num_cols = min(len(chroms), max_cols)
    else:
        num_cols = num_rows = 1

    if args.plotsize is None:
        width = 6
        height = 4
    else:
        width, height = args.plotsize

    fig = plt.figure(figsize=(width * num_cols, height * num_rows))

    axs = np.empty((num_rows, num_cols), dtype='object')
    # one table per plotted curve; written in one go after plotting so that
    # later curves do not replace earlier ones in the export
    exported_tables = []
    for matrix_file in args.matrices:
        idx = 0
        for chrom, mean_values in mean_dict[matrix_file].items():
            if len(mean_values) <= 1:
                log.debug("No values found for: {}, chromosome: {}\n".format(matrix_file, chrom))
                continue
            # Filter first and check the length before unzipping: unpacking
            # zip(*[]) fails when no value is positive.
            points = [(k, v) for k, v in mean_values.items() if v > 0]
            if len(points) <= 1:
                log.debug("No values found for: {}, chromosome: {}\n".format(matrix_file, chrom))
                continue
            x, y = zip(*points)

            if args.perchr and len(args.matrices) == 1:
                col = 0
                row = 0
            else:
                col = idx % num_cols
                row = idx // num_cols

            if axs[row, col] is None:
                ax = plt.subplot2grid((num_rows, num_cols), (row, col))
                ax.set_xlabel('genomic distance')
                ax.set_ylabel('corrected Hi-C counts')
                try:
                    ax.set_yscale('log')
                    ax.set_xscale('log')
                except ValueError:
                    continue
            else:
                ax = axs[row, col]

            y = np.array(y) * scale_factor[matrix_file]
            if args.perchr and len(args.matrices) > 1:
                label = labels[matrix_file]
                ax.set_title(chrom)
            elif args.perchr:
                label = chrom
            else:
                label = labels[matrix_file]

            ax.plot(x, y, label=label)
            axs[row, col] = ax
            idx += 1

            if args.outFileData is not None:
                exported_tables.append(
                    pd.DataFrame({'Matrix': labels[matrix_file],
                                  'Chromosome': chrom,
                                  'Distance': np.asarray(x),
                                  'Contacts': y}))

    if exported_tables:
        pd.concat(exported_tables).to_csv(args.outFileData, sep='\t')

    legends = []
    for ax in axs.reshape(-1):
        if ax is None:
            continue
        ax.set_xlim(0, args.maxdepth)
        # distinct name: `labels` is the matrix-to-label mapping above
        handles, legend_labels = ax.get_legend_handles_labels()
        legends.append(ax.legend(handles, legend_labels, loc='center left',
                                 bbox_to_anchor=(1, 0.5)))

    plt.tight_layout()
    # the legends sit outside the axes; pass them so the tight bounding box
    # includes them (None when nothing was plotted)
    plt.savefig(args.plotFile.name, bbox_inches='tight',
                bbox_extra_artists=tuple(legends) or None)
    plt.close(fig)
def plotMatrix(matrixinputfile, imageoutputfile, regionindex1, regionindex2, comparematrix, title, bigwig):
    """Plot a region of a cooler matrix as a heatmap and save it as png.

    The region runs from the start position of bin ``regionindex1`` to the
    start position of bin ``regionindex2`` on the chromosome of the first bin
    of the input matrix. If ``comparematrix`` is given, its lower triangle
    (including the diagonal) is combined with the strict upper triangle of
    the input matrix in one image.

    :param matrixinputfile: path of the input matrix, must be ``.cool``
    :param imageoutputfile: path of the image; if empty it is derived from
                            ``matrixinputfile``; the extension is forced to
                            ``.png``
    :param regionindex1: index of the first bin of the region
    :param regionindex2: index of the last bin, clamped to the last bin of
                         the matrix
    :param comparematrix: optional path of a second ``.cool`` matrix
    :param title: title passed on to the plotting arguments
    :param bigwig: bigwig setting passed on to the plotting arguments; if set
                   an additional track axis is created
    :raises SystemExit: on wrong file formats, invalid indices, a missing
                        chromosome in the compare matrix or differing shapes
    """
    if not checkExtension(matrixinputfile, '.cool'):
        msg = "input matrix must be in cooler format (.cool)"
        raise SystemExit(msg)
    if comparematrix and not checkExtension(comparematrix, ".cool"):
        msg = "if specified, compare matrix must be in cooler format (.cool)"
        raise SystemExit(msg)
    if not imageoutputfile:
        # splitext removes the extension as a unit; str.rstrip('cool') would
        # strip a set of characters instead of the suffix
        imageoutputfile = os.path.splitext(matrixinputfile)[0] + '.png'
    elif not checkExtension(imageoutputfile, ".png"):
        imageoutputfile = os.path.splitext(imageoutputfile)[0] + ".png"

    # get the full matrix first to extract the desired region
    ma = hm.hiCMatrix(matrixinputfile)
    cuts = ma.cut_intervals
    chromosome = cuts[0][0]
    maxIndex = len(cuts) - 1

    # check indices and get the region if ok
    # (a negative index would silently wrap around to the end of `cuts`)
    if regionindex1 < 0 or regionindex1 > maxIndex:
        msg = "invalid start region. Allowed is 0 to {0:d} (0 to {1:d})".format(maxIndex, cuts[maxIndex][1])
        raise SystemExit(msg)
    if regionindex2 < regionindex1:
        msg = "region index 2 must not be smaller than region index 1"
        raise SystemExit(msg)
    if regionindex2 > maxIndex:
        regionindex2 = maxIndex
        print("region index 2 clamped to max. value {0:d}".format(maxIndex))
    region = str(chromosome) + ":" + str(cuts[regionindex1][1]) + "-" + str(cuts[regionindex2][1])

    # now get the data for the input matrix, restricted to the desired region
    upperHiCMatrix = hm.hiCMatrix(matrixinputfile, pChrnameList=[region])
    upperMatrix = triu(upperHiCMatrix.matrix, k=1, format="csr")

    # if set, get data from the same region also for the compare matrix
    # there's no compatibility check so far
    lowerHiCMatrix = None
    lowerMatrix = None
    if comparematrix:
        lowerHiCMatrix = hm.hiCMatrix(comparematrix)
        if not any(row[0] == chromosome for row in lowerHiCMatrix.cut_intervals):
            msg = "compare matrix must contain the same chromosome as the input matrix"
            raise SystemExit(msg)
        lowerHiCMatrix = hm.hiCMatrix(comparematrix, pChrnameList=[region])
        lowerMatrix = tril(lowerHiCMatrix.matrix, k=0, format="csr")
        if lowerMatrix.shape != upperMatrix.shape:
            msg = "shapes of input matrix and compare matrix do not match. Check resolutions"
            raise SystemExit(msg)

    # arguments for plotting
    plotArgs = Namespace(bigwig=bigwig,
                         chromosomeOrder=None,
                         clearMaskedBins=False,
                         colorMap='RdYlBu_r',
                         disable_tight_layout=False,
                         dpi=300,
                         flipBigwigSign=False,
                         log=False, log1p=True,
                         perChromosome=False,
                         region=region,
                         region2=None,
                         scaleFactorBigwig=1.0,
                         scoreName=None,
                         title=title,
                         vMax=None, vMaxBigwig=None,
                         vMin=1.0, vMinBigwig=None,
                         matrix=matrixinputfile)

    # following code is largely duplicated from hicPlotMatrix
    # not exactly beautiful, but works for now
    chrom, region_start, region_end, idx1, start_pos1, chrom2, region_start2, region_end2, idx2, start_pos2 = hicPlot.getRegion(plotArgs, upperHiCMatrix)

    if comparematrix:
        mixedMatrix = np.asarray((lowerMatrix + upperMatrix).todense().astype(float))
    else:
        mixedMatrix = np.asarray(upperHiCMatrix.matrix.todense().astype(float))

    # colormap for plotting
    cmap = cm.get_cmap(plotArgs.colorMap)  # pylint: disable=no-member
    cmap.set_bad('black')

    bigwig_info = None
    if plotArgs.bigwig:  # pylint: disable=no-member
        bigwig_info = {'args': plotArgs, 'axis': None, 'axis_colorbar': None, 'nan_bins': upperHiCMatrix.nan_bins}

    norm = None
    if plotArgs.log or plotArgs.log1p:  # pylint: disable=no-member
        # zeros are replaced by the smallest non-zero value
        mask = mixedMatrix == 0
        try:
            mixedMatrix[mask] = np.nanmin(mixedMatrix[~mask])
        except ValueError:
            log.info('Matrix contains only 0. Set all values to {}'.format(np.finfo(float).tiny))
            mixedMatrix[mask] = np.finfo(float).tiny
        if np.isnan(mixedMatrix).any() or np.isinf(mixedMatrix).any():
            log.debug("any nan {}".format(np.isnan(mixedMatrix).any()))
            log.debug("any inf {}".format(np.isinf(mixedMatrix).any()))
            mask_nan = np.isnan(mixedMatrix)
            mask_inf = np.isinf(mixedMatrix)
            mixedMatrix[mask_nan] = np.nanmin(mixedMatrix[~mask_nan])
            mixedMatrix[mask_inf] = np.nanmin(mixedMatrix[~mask_inf])

    log.debug("any nan after remove of nan: {}".format(np.isnan(mixedMatrix).any()))
    log.debug("any inf after remove of inf: {}".format(np.isinf(mixedMatrix).any()))
    if plotArgs.log1p:  # pylint: disable=no-member
        mixedMatrix += 1
        norm = LogNorm()
    elif plotArgs.log:  # pylint: disable=no-member
        norm = LogNorm()

    if plotArgs.bigwig:  # pylint: disable=no-member
        # increase figure height to accommodate bigwig track
        fig_height = 8.5
    else:
        fig_height = 7
    height = 4.8 / fig_height

    fig_width = 8
    width = 5.0 / fig_width
    left_margin = (1.0 - width) * 0.5

    fig = plt.figure(figsize=(fig_width, fig_height), dpi=plotArgs.dpi)  # pylint: disable=no-member

    if plotArgs.bigwig:  # pylint: disable=no-member
        gs = gridspec.GridSpec(2, 2, height_ratios=[0.90, 0.1], width_ratios=[0.97, 0.03])
        gs.update(hspace=0.05, wspace=0.05)
        ax1 = plt.subplot(gs[0, 0])
        ax2 = plt.subplot(gs[1, 0])
        ax3 = plt.subplot(gs[0, 1])
        bigwig_info['axis'] = ax2
        bigwig_info['axis_colorbar'] = ax3
    else:
        ax1 = None
    bottom = 1.3 / fig_height

    position = [left_margin, bottom, width, height]
    hicPlot.plotHeatmap(mixedMatrix, ma.get_chromosome_sizes(), fig, position,
                        plotArgs, cmap, xlabel=chrom, ylabel=chrom2,
                        start_pos=start_pos1, start_pos2=start_pos2, pNorm=norm, pAxis=ax1, pBigwig=bigwig_info)
    plt.savefig(imageoutputfile, dpi=plotArgs.dpi)  # pylint: disable=no-member
    plt.close(fig)
def main(args=None):
    """Compare short range and long range contacts of Hi-C matrices.

    For each matrix the chromosomes are split over ``args.threads`` worker
    processes running ``compute_relation_short_long_range``; every worker
    returns the ratios and the two sums for its chromosomes. The ratios are
    shown as one box per matrix in ``args.plotFileName``. With more than one
    matrix the p-values of ``ranksums`` for every pair of matrices are
    written to ``args.outFileName``. The raw values are written to
    ``args.outFileNameData``.

    :param args: argument list forwarded to the parser built by
                 ``parse_arguments()``.
    """
    args = parse_arguments().parse_args(args)

    short_v_long_range = []
    sum_smaller = []
    sum_greater = []
    for matrix in args.matrices:

        is_cooler = check_cooler(matrix)
        if not is_cooler:
            hic_matrix = hm.hiCMatrix(matrix)
        else:
            # for cooler files only the file name is handed to the workers
            hic_matrix = matrix

        if args.chromosomes is None:
            # get all chromosomes from the matrix / cooler file
            if not is_cooler:
                chromosomes_list = list(hic_matrix.chrBinBoundaries)
            else:
                chromosomes_list = cooler.Cooler(matrix).chromnames
        else:
            chromosomes_list = args.chromosomes

        short_v_long_range_matrix_threads = [None] * args.threads
        sum_smaller_threads = [None] * args.threads
        sum_greater_threads = [None] * args.threads

        chromosomesListPerThread = len(chromosomes_list) // args.threads
        queue = [None] * args.threads
        process = [None] * args.threads
        thread_done = [False] * args.threads

        for i in range(args.threads):
            first = i * chromosomesListPerThread
            if i < args.threads - 1:
                chromosomeListThread = chromosomes_list[
                    first:first + chromosomesListPerThread]
            else:
                # the last worker also takes the remainder of the division
                chromosomeListThread = chromosomes_list[first:]

            queue[i] = Queue()
            process[i] = Process(target=compute_relation_short_long_range,
                                 kwargs=dict(pHiCMatrix=hic_matrix,
                                             pChromosomes=chromosomeListThread,
                                             pDistance=args.distance,
                                             pIsCooler=is_cooler,
                                             pQueue=queue[i]))
            process[i].start()

        all_data_collected = False
        while not all_data_collected:
            for i in range(args.threads):
                if queue[i] is not None and not queue[i].empty():
                    short_v_long_range_matrix_threads[i], sum_smaller_threads[
                        i], sum_greater_threads[i] = queue[i].get()
                    queue[i] = None
                    process[i].join()
                    process[i].terminate()
                    process[i] = None
                    thread_done[i] = True
            all_data_collected = all(thread_done)
            if not all_data_collected:
                # only wait when a worker is still running
                time.sleep(1)

        short_v_long_range.append([
            item for sublist in short_v_long_range_matrix_threads
            for item in sublist
        ])
        sum_smaller.append(
            [item for sublist in sum_smaller_threads for item in sublist])
        sum_greater.append(
            [item for sublist in sum_greater_threads for item in sublist])

    log.debug(short_v_long_range)
    plt.ylabel('Sum short range / long range')
    plt.tick_params(axis='x',
                    which='both',
                    bottom=False,
                    top=False,
                    labelbottom=False)

    box_plot = plt.boxplot(short_v_long_range, patch_artist=True)
    legend_handels_color = []
    for i, patch in enumerate(box_plot['boxes']):
        # colours are reused cyclically if there are more boxes than colours
        color = args.colorList[i % len(args.colorList)]
        patch.set_facecolor(color)
        legend_handels_color.append(
            mpatches.Patch(color=color,
                           label=args.matrices[i].split('/')[-1]))
    plt.legend(handles=legend_handels_color)
    plt.savefig(args.plotFileName, dpi=args.dpi)

    if len(args.matrices) > 1:
        p_values = []
        for i, sample in enumerate(short_v_long_range):
            for sample2 in short_v_long_range[i + 1:]:
                statistic, significance_level = ranksums(sample, sample2)
                p_values.append(significance_level)
        log.debug('p_values {}'.format(p_values))
        with open(args.outFileName, 'w') as file:
            header = '# Created with HiCExplorer\'s hicPlotSVL ' + __version__ + '\n'
            header += "# Short range vs long range contacts per chromosome, p-values of each distribution against each other distribution with Wilcoxon rank-sum\n"
            header += '# Short range contacts: <= ' + str(args.distance) + '\n'
            file.write(header)
            # p_values was filled in exactly this pair order
            counter = 0
            for i, matrix_0 in enumerate(args.matrices):
                for matrix_1 in args.matrices[i + 1:]:
                    file.write(matrix_0 + '\t' + matrix_1 + '\t' +
                               str(p_values[counter]) + '\n')
                    counter += 1

    with open(args.outFileNameData, 'w') as file:
        header = '# Created with HiCExplorer\'s hicPlotSVL ' + __version__ + '\n'
        header += "# Short range vs long range contacts per chromosome: raw data\n"
        header += '# Short range contacts: <= ' + str(args.distance) + '\n'
        matrices_names = '\t\t\t'.join(args.matrices)
        header += '#\t{}\n'.format(matrices_names)

        header += '# Chromosome\t'
        header += '\t'.join([
            'Ratio', 'Sum <= {}'.format(args.distance), 'Sum > {}'.format(
                args.distance)
        ] * len(args.matrices))
        header += '\n'
        file.write(header)
        # NOTE: the chromosome names are those of the last processed matrix
        for i, chromosome in enumerate(chromosomes_list):
            file.write('{}\t'.format(chromosome))
            for j in range(len(args.matrices)):
                if i < len(short_v_long_range[j]):
                    file.write('{}\t{}\t{}\t'.format(short_v_long_range[j][i],
                                                     sum_smaller[j][i],
                                                     sum_greater[j][i]))
                else:
                    # a matrix occupies three columns; leave all three empty
                    # so that the following matrices stay aligned
                    file.write('\t\t\t')
            file.write('\n')
def main(args=None):
    """Build the background model for the given reference points.

    For each matrix the reference points are split over ``args.threads``
    worker processes running ``compute_background``. Every worker returns
    either a string containing ``'Fail:'`` or a pair of a mapping
    (relative position -> list of values) and the relative positions it
    covers; the mappings of all workers and matrices are merged.

    For each relative position a negative binomial is fitted with
    ``fit_nbinom.fit`` and the maximum and the mean of the values are
    computed; the means are divided by the sum of all means. The result is
    written as a tab separated table to ``args.outFileName``.

    If any worker failed, the errors are logged and the program exits with
    status 1.

    :param args: argument list forwarded to the parser built by
                 ``parse_arguments()``.
    """
    args = parse_arguments().parse_args(args)

    viewpointObj = Viewpoint()
    referencePoints, _ = viewpointObj.readReferencePointFile(
        args.referencePoints)

    relative_positions = set()
    bin_size = 0

    referencePointsPerThread = len(referencePoints) // args.threads
    queue = [None] * args.threads
    process = [None] * args.threads
    background_model_data = None
    fail_flag = False
    fail_message = ''

    for matrix in args.matrices:
        hic_ma = hm.hiCMatrix(matrix)
        viewpointObj.hicMatrix = hic_ma
        bin_size = hic_ma.getBinSize()
        thread_done = [False] * args.threads

        for i in range(args.threads):
            first = i * referencePointsPerThread
            if i < args.threads - 1:
                referencePointsThread = referencePoints[
                    first:first + referencePointsPerThread]
            else:
                # the last worker also takes the remainder of the division
                referencePointsThread = referencePoints[first:]

            queue[i] = Queue()
            process[i] = Process(target=compute_background, kwargs=dict(
                pReferencePoints=referencePointsThread,
                pViewpointObj=viewpointObj,
                pArgs=args,
                pQueue=queue[i]
            ))
            process[i].start()

        all_data_collected = False
        while not all_data_collected:
            for i in range(args.threads):
                if queue[i] is not None and not queue[i].empty():
                    background_data_thread = queue[i].get()
                    if 'Fail:' in background_data_thread:
                        fail_flag = True
                        # drop the leading 'Fail: ' marker
                        fail_message = background_data_thread[6:]
                    else:
                        background_model_data_thread, relative_positions_thread = background_data_thread
                        if background_model_data is None:
                            background_model_data = background_model_data_thread
                        else:
                            # merge the value lists per relative position
                            for relativePosition in background_model_data_thread:
                                if relativePosition in background_model_data:
                                    background_model_data[relativePosition].extend(
                                        background_model_data_thread[relativePosition])
                                else:
                                    background_model_data[relativePosition] = background_model_data_thread[relativePosition]
                        relative_positions = relative_positions.union(
                            relative_positions_thread)

                    # the worker is released in the same way on success
                    # and on failure
                    queue[i] = None
                    process[i].join()
                    process[i].terminate()
                    process[i] = None
                    thread_done[i] = True

            all_data_collected = all(thread_done)
            if not all_data_collected:
                # only wait when a worker is still running
                time.sleep(1)

        del hic_ma
        del viewpointObj.hicMatrix

    if fail_flag:
        log.error('An error occurred caused by one or many faulty reference points.')
        log.error('Please run chicQualityControl to remove these from your reference point file: {}'.format(args.referencePoints))
        log.error(fail_message)
        exit(1)

    # for models of all conditions:
    # - fit negative binomial for each relative distance
    relative_positions = sorted(relative_positions)
    nbinom_parameters = {}
    max_value = {}
    mean_value = {}
    sum_all_values = 0

    for relative_position in relative_positions:
        data_of_distribution = np.array(background_model_data[relative_position])
        if args.truncateZeros:
            data_of_distribution = data_of_distribution[data_of_distribution > 0.0]

        nbinom_parameters[relative_position] = fit_nbinom.fit(data_of_distribution)
        if len(data_of_distribution) > 0:
            max_value[relative_position] = np.max(data_of_distribution)
            average_value = np.average(data_of_distribution)
        else:
            max_value[relative_position] = 0.0
            average_value = 0.0
        mean_value[relative_position] = average_value
        sum_all_values += average_value

    # Normalise the means to fractions of the total. If the total is zero all
    # means are zero as well and are left unchanged instead of dividing by 0.
    if sum_all_values != 0:
        for relative_position in relative_positions:
            mean_value[relative_position] /= sum_all_values

    # write result to file
    with open(args.outFileName, 'w') as file:
        file.write(
            'Relative position\tsize nbinom\tprob nbinom\tmax value\tmean value\n')

        for relative_position in relative_positions:
            # positions are counted in bins; convert to base pairs
            relative_position_in_genomic_scale = relative_position * bin_size
            file.write("{}\t{:.12f}\t{:.12f}\t{:.12f}\t{:.12f}\n".format(relative_position_in_genomic_scale, nbinom_parameters[relative_position]['size'],
                                                                         nbinom_parameters[relative_position]['prob'],
                                                                         max_value[relative_position],
                                                                         mean_value[relative_position]))
def _zero_non_finite(values):
    """Set every NaN and infinite entry of the array *values* to 0, in place."""
    values[np.isnan(values) | np.isinf(values)] = 0


def main(args=None):
    """Normalize the given Hi-C matrices and save each one.

    Supported values of ``args.normalize``:

    * ``'norm_range'``: shift and scale the stored values of each matrix by
      their minimum and their (maximum - minimum).
    * ``'smallest'``: divide the stored values of each matrix by
      ``own sum / smallest sum``; the matrix with the smallest sum is not
      rescaled.
    * ``'multiplicative'``: multiply the stored values by
      ``args.multiplicativeValue``.

    In all modes the values are converted to float32, NaN and infinite
    values are set to 0, explicit zeros are removed and the matrix is saved
    to ``args.outFileName[i]`` without applying a correction. Any other
    value of ``args.normalize`` saves nothing.

    :param args: argument list forwarded to the parser built by
                 ``parse_arguments()``.
    """
    args = parse_arguments().parse_args(args)

    hic_matrix_list = []
    sum_list = []
    for matrix in args.matrices:
        hic_ma = hm.hiCMatrix(matrix)
        if args.normalize == 'smallest':
            sum_list.append(hic_ma.matrix.sum())
        hic_matrix_list.append(hic_ma)

    if args.normalize not in ('norm_range', 'smallest', 'multiplicative'):
        return

    if args.normalize == 'smallest':
        argmin = np.argmin(sum_list)

    for i, hic_matrix in enumerate(hic_matrix_list):
        hic_matrix.matrix.data = hic_matrix.matrix.data.astype(np.float32)

        if args.normalize == 'norm_range':
            _zero_non_finite(hic_matrix.matrix.data)
            min_value = np.min(hic_matrix.matrix.data)
            max_value = np.max(hic_matrix.matrix.data)
            min_max_difference = np.float64(max_value - min_value)
            hic_matrix.matrix.data -= min_value
            # a zero range yields NaN/inf here; these are zeroed below
            hic_matrix.matrix.data /= min_max_difference
        elif args.normalize == 'smallest':
            if i != argmin:
                _zero_non_finite(hic_matrix.matrix.data)
                adjust_factor = sum_list[i] / sum_list[argmin]
                hic_matrix.matrix.data /= adjust_factor
        else:
            _zero_non_finite(hic_matrix.matrix.data)
            hic_matrix.matrix.data *= args.multiplicativeValue

        # final clean-up shared by all modes
        _zero_non_finite(hic_matrix.matrix.data)
        hic_matrix.matrix.eliminate_zeros()

        hic_matrix.save(args.outFileName[i], pApplyCorrection=False)
def test_maskBins():
    """maskBins removes the given bins and leaves the matrix untouched for
    ``None`` or an empty list of bins."""
    all_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1),
                     ('b', 30, 40, 1), ('b', 40, 50, 1)]
    remaining_intervals = [('a', 20, 30, 1), ('b', 30, 40, 1),
                           ('b', 40, 50, 1)]
    dense = np.array([[1, 8, 5, 3, 0],
                      [0, 4, 15, 5, 1],
                      [0, 0, 0, 0, 2],
                      [0, 0, 0, 0, 1],
                      [0, 0, 0, 0, 0]])
    # expected matrix once the first two bins are masked
    masked = np.array([[0, 0, 2],
                       [0, 0, 1],
                       [0, 0, 0]])

    hic = hm.hiCMatrix()
    hic.nan_bins = []
    hic.matrix = csr_matrix(dense)
    # hand over a copy so the expected intervals stay independent of the object
    hic.setMatrix(hic.matrix, list(all_intervals))

    nt.assert_equal(hic.getMatrix(), dense)
    nt.assert_equal(hic.orig_bin_ids, [])

    def assert_masked_state():
        # state expected after bins 0 and 1 have been masked
        nt.assert_equal(hic.getMatrix(), masked)
        nt.assert_equal(sorted(hic.orig_cut_intervals), sorted(all_intervals))
        nt.assert_equal(sorted(hic.cut_intervals), sorted(remaining_intervals))
        nt.assert_equal(hic.chrBinBoundaries,
                        OrderedDict([('a', (0, 1)), ('b', (1, 3))]))

    hic.maskBins([0, 1])
    assert_masked_state()
    nt.assert_equal(sorted(hic.orig_bin_ids), sorted([0, 1, 2, 3, 4]))

    # direct return if masking_ids is None or has len() == 0, thus no changes to matrix
    hic.maskBins(None)
    assert_masked_state()

    hic.maskBins([])
    assert_masked_state()
    nt.assert_equal(sorted(hic.orig_bin_ids), sorted([0, 1, 2, 3, 4]))
def _write_diff_tad_file(pFileName, pHeader, pRows, pPValues):
    """Write one '.diff_tad' result file.

    pFileName -- path of the file to create (overwritten if it exists)
    pHeader   -- complete header text, written verbatim before the data
    pRows     -- domain rows; every field is converted with str()
    pPValues  -- per-row p-value sequences, same length and order as pRows

    Each output line is the tab-joined row followed by its tab-joined
    p-values.
    """
    with open(pFileName, 'w') as file:
        file.write(pHeader)
        for row, p_values in zip(pRows, pPValues):
            file.write('\t'.join(map(str, row)))
            file.write('\t')
            file.write('\t'.join(map(str, p_values)))
            file.write('\n')


def main(args=None):
    """Command line entry point of the differential TAD computation.

    Parses the arguments, reads the TAD domains, distributes them over
    worker processes running computeDifferentialTADs, collects the results
    and writes '<outFileNamePrefix>_accepted.diff_tad' and
    '<outFileNamePrefix>_rejected.diff_tad'.

    args -- optional argument list handed to the argument parser.

    Raises SystemExit(1) if target and control matrix are not given in the
    same format.
    """
    args = parse_arguments().parse_args(args)

    # read domains file
    domains_df = readDomainBoundaries(args.tadDomains)

    # read full h5 or only region if cooler
    is_cooler_target = check_cooler(args.targetMatrix)
    is_cooler_control = check_cooler(args.controlMatrix)
    if is_cooler_target != is_cooler_control:
        log.error('Matrices are not given in the same format!')
        # same effect as exit(1) without relying on the site-injected helper
        raise SystemExit(1)

    if not is_cooler_control:
        hic_matrix_target = hm.hiCMatrix(args.targetMatrix)
        hic_matrix_control = hm.hiCMatrix(args.controlMatrix)
    else:
        # for cooler files the workers load the needed regions themselves
        hic_matrix_target = args.targetMatrix
        hic_matrix_control = args.controlMatrix

    domains = domains_df.values.tolist()

    # Never start more workers than there are domains: with more workers
    # domainsPerThread becomes 0 and, because computeDifferentialTADs skips
    # the first and/or last element of each chunk, no domain would be
    # processed at all. A worker count below 1 is raised to 1.
    threads = max(1, min(args.threads, len(domains)))

    p_values_threads = [None] * threads
    accepted_left_inter_threads = [None] * threads
    accepted_right_inter_threads = [None] * threads
    accepted_intra_threads = [None] * threads
    rows_threads = [None] * threads
    domainsPerThread = len(domains) // threads

    queue = [None] * threads
    process = [None] * threads
    thread_done = [False] * threads

    # Chunks overlap by one domain on each inner border so that every worker
    # can see the neighbours of the domains it is responsible for.
    # None --> first thread, process first element in list, ignore last one
    # True --> middle thread: ignore first and last element in tad processing
    # False --> last thread: ignore first element, process last one
    # ''   --> single worker: none of the above applies, process everything
    for i in range(threads):
        chunk_start = i * domainsPerThread
        chunk_end = (i + 1) * domainsPerThread
        if i == 0:
            domainListThread = domains[chunk_start:chunk_end + 1]
            thread_id = None
        elif i < threads - 1:
            domainListThread = domains[chunk_start - 1:chunk_end + 1]
            thread_id = True
        else:
            domainListThread = domains[chunk_start - 1:]
            thread_id = False
        if threads == 1:
            thread_id = ''

        queue[i] = Queue()
        process[i] = Process(target=computeDifferentialTADs,
                             kwargs=dict(pMatrixTarget=hic_matrix_target,
                                         pMatrixControl=hic_matrix_control,
                                         pDomainList=domainListThread,
                                         pCoolOrH5=is_cooler_control,
                                         pPValue=args.pValue,
                                         pThreadId=thread_id,
                                         pQueue=queue[i]))
        process[i].start()

    # poll the queues until every worker has delivered its result
    all_data_collected = False
    while not all_data_collected:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                (p_values_threads[i],
                 accepted_left_inter_threads[i],
                 accepted_right_inter_threads[i],
                 accepted_intra_threads[i],
                 rows_threads[i]) = queue[i].get()
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        all_data_collected = all(thread_done)
        if not all_data_collected:
            # only wait while results are still outstanding
            time.sleep(1)

    # merge the per-worker results, keeping the worker (= domain) order
    p_values_list = np.array(
        [item for sublist in p_values_threads for item in sublist])
    accepted_inter_left = np.array(
        [item for sublist in accepted_left_inter_threads for item in sublist])
    accepted_inter_right = np.array(
        [item for sublist in accepted_right_inter_threads for item in sublist])
    accepted_intra = np.array(
        [item for sublist in accepted_intra_threads for item in sublist])
    rows = np.array([item for sublist in rows_threads for item in sublist])

    # modeReject 'all' combines the selected region masks with AND,
    # every other value combines them with OR
    combine = np.logical_and if args.modeReject == 'all' else np.logical_or
    if args.mode == 'intra-TAD':
        mask = accepted_intra
    elif args.mode == 'left-inter-TAD':
        mask = combine(accepted_inter_left, accepted_intra)
    elif args.mode == 'right-inter-TAD':
        mask = combine(accepted_intra, accepted_inter_right)
    else:
        mask = combine(accepted_inter_left, accepted_inter_right)
        mask = combine(mask, accepted_intra)

    # entries selected by the mask go to the 'rejected' file,
    # all others to the 'accepted' file
    accepted_H0 = p_values_list[~mask]
    rejected_H0 = p_values_list[mask]
    accepted_rows = rows[~mask]
    rejected_rows = rows[mask]

    version_line = '# Created with HiCExplorer\'s hicDifferentialTAD version ' + __version__ + '\n'
    column_line = '# Chromosome\tstart\tend\tname\tscore\tstrand\tp-value left-inter-TAD\tp-value right-inter-TAD\tp-value intra-TAD\n'

    header = version_line
    header += '# H0 \'regions are equal\' H0 is accepted for all p-value greater the user given p-value threshold; i.e. regions in this file are not considered as differential.\n'
    header += '# Accepted regions with Wilcoxon rank-sum test to p-value: {} with used mode: {} and modeReject: {} \n'.format(
        args.pValue, args.mode, args.modeReject)
    header += column_line
    _write_diff_tad_file(args.outFileNamePrefix + '_accepted.diff_tad',
                         header, accepted_rows, accepted_H0)

    header = version_line
    header += '# H0 \'regions are equal\' H0 is rejected for all p-value smaller or equal the user given p-value threshold; i.e. regions in this file are considered as differential.\n'
    header += '# Rejected regions with Wilcoxon rank-sum test to p-value: {} with used mode: {} and modeReject: {} \n'.format(
        args.pValue, args.mode, args.modeReject)
    header += column_line
    _write_diff_tad_file(args.outFileNamePrefix + '_rejected.diff_tad',
                         header, rejected_rows, rejected_H0)