def compute_consensus_matrix(pMatrixName, pClusterMatricesList, pAppend, pQueue):
    """Sum the matrices of each cluster and put the resulting handlers on pQueue.

    Args:
        pMatrixName: Path of the container file; each matrix is addressed as
            ``pMatrixName + '::' + <matrix name>``.
        pClusterMatricesList: Iterable of clusters, each an iterable of matrix
            names inside ``pMatrixName``.
        pAppend: If truthy, the handler of the first cluster is created with
            ``pAppend=False``; every other handler gets ``pAppend=True``.
        pQueue: Queue-like object; receives one list with a cool
            ``MatrixFileHandler`` per cluster.
    """
    consensus_handlers = []

    for cluster_index, cluster in enumerate(pClusterMatricesList):
        # Only the very first cluster of a call with pAppend set is written
        # without appending.
        append = not (cluster_index == 0 and pAppend)

        summed_matrix = None
        for matrix_name in cluster:
            matrixFileHandlerInput = MatrixFileHandler(
                pFileType='cool',
                pMatrixFile=pMatrixName + '::' + matrix_name)
            loaded = matrixFileHandlerInput.load()
            _matrix, cut_intervals, nan_bins, distance_counts, correction_factors = loaded

            if summed_matrix is None:
                summed_matrix = _matrix
            else:
                summed_matrix += _matrix

        # Metadata (bins, correction factors, hic2cool version) is taken from
        # the last matrix loaded for this cluster.
        hic2cool_version = matrixFileHandlerInput.matrixFile.hic2cool_version
        output_handler = MatrixFileHandler(pFileType='cool',
                                           pAppend=append,
                                           pEnforceInteger=False,
                                           pFileWasH5=False,
                                           pHic2CoolVersion=hic2cool_version)
        output_handler.set_matrix_variables(summed_matrix, cut_intervals,
                                            nan_bins, correction_factors,
                                            distance_counts)
        consensus_handlers.append(output_handler)

    pQueue.put(consensus_handlers)
def compute_merge(pMatrixName, pMatrixList, pRunningWindow, pNumBins, pQueue):
    """Merge bins of every matrix in pMatrixList and put the handlers on pQueue.

    Args:
        pMatrixName: Path of the container file; matrices are addressed as
            ``pMatrixName + '::' + <matrix name>``.
        pMatrixList: Iterable of matrix names.
        pRunningWindow: If truthy ``running_window_merge`` is used, otherwise
            ``merge_bins``.
        pNumBins: Passed through to the merge function.
        pQueue: Queue-like object. Receives the list of output handlers, or a
            one-element list ``["Fail: <message>"]`` if anything raised.
    """
    merged_handlers = []
    try:
        for matrix_name in pMatrixList:
            hic = hm.hiCMatrix(pMatrixName + '::' + matrix_name)
            merge_function = running_window_merge if pRunningWindow else merge_bins
            merged = merge_function(hic, pNumBins)

            handler = MatrixFileHandler(pFileType='cool',
                                        pMatrixFile=matrix_name,
                                        pEnforceInteger=False,
                                        pFileWasH5=False)
            handler.set_matrix_variables(merged.matrix,
                                         merged.cut_intervals,
                                         merged.nan_bins,
                                         merged.correction_factors,
                                         merged.distance_counts)
            merged_handlers.append(handler)
        pQueue.put(merged_handlers)
    except Exception as exp:
        # Report the failure to the consumer instead of raising in the worker.
        pQueue.put(["Fail: {}".format(str(exp))])
    return
def test_save_cool():
    """Round-trip a cool file through load/save and compare all loaded parts."""
    cool_outfile = outfile + '.cool'

    # Load the reference cool matrix.
    source_handler = MatrixFileHandler(pFileType='cool',
                                       pMatrixFile=ROOT + 'Li_et_al_2015.cool')
    assert source_handler is not None
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = \
        source_handler.load()

    # Feed the loaded data back into the handler and write it out.
    source_handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                        correction_factors, distance_counts)
    source_handler.save(pName=cool_outfile, pSymmetric=True,
                        pApplyCorrection=True)

    # Reload the written file.
    reloaded_handler = MatrixFileHandler(pFileType='cool',
                                         pMatrixFile=cool_outfile)
    assert reloaded_handler is not None
    (matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test,
     correction_factors_test) = reloaded_handler.load()

    expected_and_observed = [
        (matrix.data, matrix_test.data),
        (cut_intervals, cut_intervals_test),
        (nan_bins, nan_bins_test),
        (distance_counts, distance_counts_test),
        (correction_factors, correction_factors_test),
    ]
    for expected, observed in expected_and_observed:
        nt.assert_equal(expected, observed)

    os.unlink(cool_outfile)
def load_cool_files(pMatrixName, pMatricesList, pCutIntervals, pQueue):
    """Load matrices from a container file and put re-wrapped handlers on pQueue.

    Args:
        pMatrixName: Path of the container file; matrices are addressed as
            ``pMatrixName + "::" + <matrix name>``.
        pMatricesList: Iterable of matrix names.
        pCutIntervals: Cut intervals stored in every output handler (the
            matrices are loaded with ``pNoCutIntervals=True``).
        pQueue: Queue-like object. Receives the list of output handlers, or a
            string starting with ``'Fail: '`` (message plus traceback) if any
            matrix could not be processed.
    """
    handlers = []
    try:
        for matrix_name in pMatricesList:
            input_handler = MatrixFileHandler(
                pFileType='cool',
                pMatrixFile=pMatrixName + "::" + matrix_name,
                pNoCutIntervals=True)
            _matrix, cut_intervals, nan_bins, distance_counts, correction_factors = \
                input_handler.load()

            # The output handler is named after the last path component only.
            output_handler = MatrixFileHandler(
                pFileType='cool', pMatrixFile=matrix_name.split('/')[-1])
            output_handler.set_matrix_variables(_matrix, pCutIntervals,
                                                nan_bins, correction_factors,
                                                distance_counts)
            handlers.append(output_handler)
    except Exception as exp:
        pQueue.put('Fail: ' + str(exp) + traceback.format_exc())
    else:
        pQueue.put(handlers)
def load_cool_files(pMatricesList, pCutIntervals, pQueue):
    """Load every cool file in pMatricesList, skipping the ones that fail.

    Args:
        pMatricesList: Iterable of cool file paths.
        pCutIntervals: Cut intervals stored in every output handler (the
            files are loaded with ``pNoCutIntervals=True``).
        pQueue: Queue-like object; receives the list of handlers for all
            files that could be loaded.
    """
    handlers = []
    for matrix_path in pMatricesList:
        try:
            input_handler = MatrixFileHandler(pFileType='cool',
                                              pMatrixFile=matrix_path,
                                              pNoCutIntervals=True)
            _matrix, cut_intervals, nan_bins, distance_counts, correction_factors = \
                input_handler.load()

            output_handler = MatrixFileHandler(pFileType='cool',
                                               pMatrixFile=matrix_path)
            output_handler.set_matrix_variables(_matrix, pCutIntervals,
                                                nan_bins, correction_factors,
                                                distance_counts)
            handlers.append(output_handler)
        except Exception as exp:
            # A broken file is only reported; the remaining files are still
            # processed.
            message = 'File could not be opend and is excluded: {}. Error message: {} '
            log.warning(message.format(matrix_path, str(exp)))
    pQueue.put(handlers)
def test_save_cool_enforce_integer():
    """Save an h5 matrix as cool with pEnforceInteger=True and compare on reload."""
    cool_outfile = outfile + '.cool'

    # Load the h5 reference matrix.
    h5_handler = MatrixFileHandler(pFileType='h5',
                                   pMatrixFile=ROOT + 'Li_et_al_2015.h5')
    assert h5_handler is not None
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = \
        h5_handler.load()

    # Write it as cool with integer enforcement, flagged as coming from h5.
    integer_handler = MatrixFileHandler(pFileType='cool', pEnforceInteger=True)
    integer_handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                         correction_factors, distance_counts)
    integer_handler.matrixFile.fileWasH5 = True
    integer_handler.save(pName=cool_outfile, pSymmetric=False,
                         pApplyCorrection=True)

    # Reload the written file without applying the correction on load.
    reloaded_handler = MatrixFileHandler(pFileType='cool',
                                         pMatrixFile=cool_outfile,
                                         pApplyCorrectionCoolerLoad=False)
    assert reloaded_handler is not None
    (matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test,
     correction_factors_test) = reloaded_handler.load()

    # The reference is rounded and stripped of explicit zeros before the
    # comparison with the reloaded data.
    matrix.data = np.rint(matrix.data)
    matrix.eliminate_zeros()

    nt.assert_almost_equal(matrix.data, matrix_test.data, decimal=0)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)

    os.unlink(cool_outfile)
def test_load_distance_cool():
    """Check that pDistance limits loaded contacts and that save/load round-trips."""
    cool_outfile = outfile + '.cool'
    pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'

    # Load chromosome 1 restricted to a distance of 2.5 Mb.
    limited_handler = MatrixFileHandler(pFileType='cool',
                                        pMatrixFile=pMatrixFile,
                                        pChrnameList=['1'],
                                        pDistance=2500000)
    assert limited_handler is not None
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = \
        limited_handler.load()

    # Save the restricted matrix and load it again.
    limited_handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                         correction_factors, distance_counts)
    limited_handler.save(pName=cool_outfile, pSymmetric=True,
                         pApplyCorrection=True)

    reloaded_handler = MatrixFileHandler(pFileType='cool',
                                         pMatrixFile=cool_outfile)
    assert reloaded_handler is not None
    (matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test,
     correction_factors_test) = reloaded_handler.load()

    # With the distance limit no non-zero entry may be more than one bin off
    # the diagonal (2.5 Mb resolution).
    rows, cols = matrix.nonzero()
    far_contacts = np.absolute(rows - cols) > 1
    assert np.sum(far_contacts) == 0

    # Without the limit such entries have to exist.
    full_handler = MatrixFileHandler(pFileType='cool', pChrnameList=['1'],
                                     pMatrixFile=pMatrixFile)
    assert full_handler is not None
    matrix2, _, _, _, _ = full_handler.load()
    rows, cols = matrix2.nonzero()
    far_contacts = np.absolute(rows - cols) > 1
    assert np.sum(far_contacts) > 0

    # Loaded and saved/reloaded data have to be equal.
    nt.assert_equal(matrix.data, matrix_test.data)
    nt.assert_equal(cut_intervals, cut_intervals_test)
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)
    nt.assert_equal(correction_factors, correction_factors_test)

    os.unlink(cool_outfile)
def compute_consensus_matrix(pMatrixName, pClusterMatricesList, pClusterName, pQueue):
    """Sum all matrices of one cluster and put the resulting handler on pQueue.

    The first matrix is loaded completely (matrix plus bin metadata); all
    further matrices are loaded with ``pLoadMatrixOnly=True`` and rebuilt as
    csr matrices before being added.

    Args:
        pMatrixName: Path of the container file; matrices are addressed as
            ``pMatrixName + '::' + <matrix name>``.
        pClusterMatricesList: Sequence of matrix names belonging to the cluster.
        pClusterName: Identifier of the cluster; used in the output name
            ``consensus_matrix_cluster_<name>:<number of matrices>``.
        pQueue: Queue-like object. Receives the output ``MatrixFileHandler``,
            or ``None`` if the computation raised.
    """
    # Bound up front so that the final put() cannot fail with a NameError
    # when an exception interrupts the computation.
    matrixFileHandlerOutput = None
    try:
        matrixFileHandlerInput = MatrixFileHandler(
            pFileType='cool',
            pMatrixFile=pMatrixName + '::' + pClusterMatricesList[0])
        _matrix, cut_intervals, nan_bins, \
            distance_counts, correction_factors = matrixFileHandlerInput.load()
        consensus_matrix = _matrix

        for matrix in pClusterMatricesList[1:]:
            matrixFileHandlerInput = MatrixFileHandler(
                pFileType='cool',
                pMatrixFile=pMatrixName + '::' + matrix,
                pLoadMatrixOnly=True)
            _matrix, _, _, _, _ = matrixFileHandlerInput.load()
            # With pLoadMatrixOnly the loader result is indexed as
            # (rows, cols, data, dimension) here.
            # np.float64 replaces the removed alias np.float (same dtype).
            _matrix = csr_matrix((_matrix[2], (_matrix[0], _matrix[1])),
                                 (_matrix[3], _matrix[3]),
                                 dtype=np.float64)
            if consensus_matrix is None:
                consensus_matrix = _matrix
            else:
                consensus_matrix += _matrix

        hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
        matrixFileHandlerOutput = MatrixFileHandler(
            pFileType='cool',
            pMatrixFile='consensus_matrix_cluster_' + str(pClusterName) + ':' + str(len(pClusterMatricesList)),
            pEnforceInteger=False,
            pFileWasH5=False,
            pHic2CoolVersion=hic2CoolVersion)
        matrixFileHandlerOutput.set_matrix_variables(consensus_matrix,
                                                     cut_intervals, nan_bins,
                                                     correction_factors,
                                                     distance_counts)
    except Exception as exp:
        log.debug('exception! {}'.format(str(exp)))

    log.debug('computaiton of {} done'.format(str(pClusterName)))
    pQueue.put(matrixFileHandlerOutput)
def test_save_homer():
    """Load the homer test matrix, save it again and remove the output file."""
    homer_outfile = outfile + '.homer'

    homer_handler = MatrixFileHandler(pFileType='homer',
                                      pMatrixFile=ROOT + 'test_matrix.homer')
    assert homer_handler is not None

    loaded = homer_handler.load()
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = loaded

    homer_handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                       correction_factors, distance_counts)
    homer_handler.save(pName=homer_outfile, pSymmetric=False,
                       pApplyCorrection=False)

    os.unlink(homer_outfile)
def test_load_h5_save_cool():
    """Convert the h5 reference matrix to cool and compare it after reloading."""
    cool_outfile = outfile + '.cool'

    # Load the h5 reference matrix.
    h5_handler = MatrixFileHandler(pFileType='h5',
                                   pMatrixFile=ROOT + 'Li_et_al_2015.h5')
    assert h5_handler is not None
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = \
        h5_handler.load()

    # Write it as cool, flagged as coming from an h5 file.
    cool_handler = MatrixFileHandler(pFileType='cool')
    cool_handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                      correction_factors, distance_counts)
    cool_handler.matrixFile.fileWasH5 = True
    cool_handler.save(pName=cool_outfile, pSymmetric=False,
                      pApplyCorrection=True)

    reloaded_handler = MatrixFileHandler(pFileType='cool',
                                         pMatrixFile=cool_outfile)
    assert reloaded_handler is not None
    (matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test,
     correction_factors_test) = reloaded_handler.load()

    # Divide every non-zero value by the product of the correction factors of
    # its row and column; this is what the reloaded data is compared against.
    rows, cols = matrix.nonzero()
    pair_factors = correction_factors[rows]
    pair_factors *= correction_factors[cols]
    matrix_applied_correction = matrix.data / pair_factors

    nt.assert_almost_equal(matrix_applied_correction, matrix_test.data,
                           decimal=1)
    nt.assert_equal(len(cut_intervals), len(cut_intervals_test))
    nt.assert_equal(nan_bins, nan_bins_test)
    nt.assert_equal(distance_counts, distance_counts_test)

    # The reloaded factors are compared with the reciprocal of the h5
    # factors, with NaN and inf entries set to zero.
    correction_factors = 1 / correction_factors
    correction_factors[np.isnan(correction_factors)] = 0
    correction_factors[np.isinf(correction_factors)] = 0
    nt.assert_equal(correction_factors, correction_factors_test)

    os.unlink(cool_outfile)
def test_save_h5():
    """Load the h5 reference matrix, save it again and remove the output file."""
    h5_outfile = outfile + '.h5'

    h5_handler = MatrixFileHandler(pFileType='h5',
                                   pMatrixFile=ROOT + 'Li_et_al_2015.h5')
    assert h5_handler is not None

    loaded = h5_handler.load()
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = loaded

    h5_handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                    correction_factors, distance_counts)
    h5_handler.save(h5_outfile, True, None)

    os.unlink(h5_outfile)
def txt_to_matrixFileHandler(pMatricesList, pMatrixDimensions, pCutIntervals, pQueue):
    """Read tab-separated triplet files into cool handlers and put them on pQueue.

    Every non-empty line of a file is split at tabs; the first three fields
    are read as row index (int), column index (int) and count (float).

    Args:
        pMatricesList: Iterable of text file paths. The third ``'_'``-separated
            part of each path is stored as ``cell_type`` metadata, so a path
            with fewer than three parts raises IndexError.
        pMatrixDimensions: Number of rows/columns of the square matrix.
        pCutIntervals: Cut intervals stored in every output handler.
        pQueue: Queue-like object; receives the list of handlers.
    """
    matrixFileHandlerList = []
    for matrix in pMatricesList:
        # Collect the coordinates and values for the csr matrix.
        instances = []
        features = []
        data = []
        with open(matrix, 'r') as file:
            # Iterate lazily instead of materialising the file with readlines().
            for line in file:
                line = line.strip()
                if not line:
                    continue
                x, y, count = line.split('\t')[:3]
                instances.append(int(x))
                features.append(int(y))
                data.append(float(count))

        cell_type = matrix.split('_')[2]

        log.debug('matrix name {}'.format(matrix))
        # default=None keeps the debug output from raising on an empty file.
        log.debug(
            'max(instances) {} max(features) {} pMatrixDimensions {}'.format(
                max(instances, default=None), max(features, default=None),
                pMatrixDimensions))

        # np.float64 replaces the removed alias np.float (same dtype).
        hic_matrix = csr_matrix((data, (instances, features)),
                                (pMatrixDimensions, pMatrixDimensions),
                                dtype=np.float64)

        matrixFileHandlerOutput = MatrixFileHandler(pFileType='cool',
                                                    pMatrixFile=matrix)
        matrixFileHandlerOutput.set_matrix_variables(hic_matrix, pCutIntervals,
                                                     None, None, None)
        if matrixFileHandlerOutput.matrixFile.hic_metadata is None:
            matrixFileHandlerOutput.matrixFile.hic_metadata = {}
        matrixFileHandlerOutput.matrixFile.hic_metadata['cell_type'] = cell_type

        matrixFileHandlerList.append(matrixFileHandlerOutput)

    pQueue.put(matrixFileHandlerList)
def compute_normalize(pMatrixName, pMatricesList, pArgminSum, pSumOfAll, pAppend, pQueue):
    """Scale every matrix by ``pSumOfAll[i] / pArgminSum`` and put handlers on pQueue.

    Args:
        pMatrixName: Path of the container file; matrices are addressed as
            ``pMatrixName + '::' + <matrix name>``.
        pMatricesList: Iterable of matrix names.
        pArgminSum: Divisor of the adjust factor.
        pSumOfAll: Sequence indexed like ``pMatricesList``; numerator of the
            adjust factor of each matrix.
        pAppend: If truthy, the first handler is created with
            ``pAppend=False``; every other handler gets ``pAppend=True``.
        pQueue: Queue-like object; receives the list of output handlers.
    """
    normalized_handlers = []

    for i, matrix_name in enumerate(pMatricesList):
        append = not (i == 0 and pAppend)

        input_handler = MatrixFileHandler(
            pFileType='cool', pMatrixFile=pMatrixName + '::' + matrix_name)
        _matrix, cut_intervals, nan_bins, distance_counts, correction_factors = \
            input_handler.load()

        # Work on float32 data with NaN/inf replaced by zero.
        _matrix.data = _matrix.data.astype(np.float32)
        _matrix.data[~np.isfinite(_matrix.data)] = 0

        adjust_factor = pSumOfAll[i] / pArgminSum
        _matrix.data /= adjust_factor

        # The division can create new NaN/inf values; zero them as well and
        # drop all explicit zeros.
        _matrix.data[~np.isfinite(_matrix.data)] = 0
        _matrix.eliminate_zeros()

        output_handler = MatrixFileHandler(pFileType='cool',
                                           pAppend=append,
                                           pEnforceInteger=False,
                                           pFileWasH5=False,
                                           pHic2CoolVersion=None)
        output_handler.set_matrix_variables(_matrix, cut_intervals, nan_bins,
                                            correction_factors,
                                            distance_counts)
        normalized_handlers.append(output_handler)

    pQueue.put(normalized_handlers)
def compute_correction(pMatrixName, pMatrixList, pCutIntervals, pQueue):
    """Run KR balancing on every matrix and put the resulting handlers on pQueue.

    Args:
        pMatrixName: Path of the container file; matrices are addressed as
            ``pMatrixName + '::' + <matrix name>``.
        pMatrixList: Sequence of matrix names.
        pCutIntervals: Cut intervals stored in every output handler.
        pQueue: Queue-like object. Receives the list of output handlers, or
            the exception message as a string if anything raised.
    """
    out_queue_list = []
    required_columns = ('bin1_id', 'bin2_id', 'count')
    print('len(pMatrixList): ' + str(len(pMatrixList)))
    try:
        for i, matrix in enumerate(pMatrixList):
            pixels, shape, _ = load_matrix(pMatrixName + '::' + matrix, None,
                                           False, None)

            # Matrices without a complete pixel table are skipped.
            if not all(column in pixels.columns for column in required_columns):
                continue

            instances = pixels['bin1_id'].values
            features = pixels['bin2_id'].values
            data = pixels['count'].values
            # Kept in its own variable so that the matrix *name* stays
            # available for the output handler below.
            # np.float64 replaces the removed alias np.float (same dtype).
            sparse_matrix = csr_matrix((data, (instances, features)),
                                       (shape[0], shape[1]),
                                       dtype=np.float64)

            kr = kr_balancing(shape[0], shape[1],
                              sparse_matrix.count_nonzero(),
                              sparse_matrix.indptr.astype(np.int64, copy=False),
                              sparse_matrix.indices.astype(np.int64, copy=False),
                              sparse_matrix.data.astype(np.float64, copy=False))
            kr.computeKR()
            correction_factors = kr.get_normalisation_vector(False).todense()

            matrixFileHandlerOutput = MatrixFileHandler(pFileType='cool',
                                                        pMatrixFile=matrix)
            matrixFileHandlerOutput.set_matrix_variables(sparse_matrix,
                                                         pCutIntervals, None,
                                                         correction_factors,
                                                         None)
            out_queue_list.append(matrixFileHandlerOutput)
            print('DOne i: ' + str(i))
    except Exception as exp:
        print('Exception: ' + str(exp))
        log.debug('Exception! {}'.format(str(exp)))
        # Report the failure to the consumer instead of raising in the worker.
        pQueue.put(str(exp))
        return
    pQueue.put(out_queue_list)
    return
def test_save_scool_matrixHandlersCool():
    """Save three cool handlers into one scool file and check the listed cells."""
    outfile = NamedTemporaryFile(suffix='.scool', prefix='hicmatrix_scool_test')

    pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool'
    matrixFileHandlerInput = MatrixFileHandler(pFileType='cool',
                                               pMatrixFile=pMatrixFile)
    matrix, cut_intervals, nan_bins, distance_counts, correction_factors = \
        matrixFileHandlerInput.load()

    # The same matrix is stored three times under different cell names.
    cell_handlers = []
    for cell_name in ('cell1', 'cell2', 'cell3'):
        cell_handler = MatrixFileHandler(pFileType='cool',
                                         pMatrixFile=cell_name,
                                         pEnforceInteger=False,
                                         pFileWasH5=False,
                                         pHic2CoolVersion=None)
        cell_handler.set_matrix_variables(matrix, cut_intervals, nan_bins,
                                          correction_factors, distance_counts)
        cell_handlers.append(cell_handler)

    matrixFileHandler = MatrixFileHandler(pFileType='scool')
    matrixFileHandler.matrixFile.coolObjectsList = cell_handlers
    matrixFileHandler.save(outfile.name, pSymmetric=True,
                           pApplyCorrection=False)

    content_of_scool = cooler.fileops.list_scool_cells(outfile.name)
    for expected_cell in ['/cells/cell1', '/cells/cell2', '/cells/cell3']:
        assert expected_cell in content_of_scool
def main(args=None):
    """Aggregate Hi-C contacts around BED regions and plot / write the results.

    Steps, in order: parse the arguments, load and mask the matrix, optionally
    restrict it to the given chromosomes, optionally convert it to a z-score
    or obs/exp matrix, aggregate the contacts (row-wise or per chromosome),
    optionally cluster the sub-matrices, plot them and optionally write the
    contact pairs and the diagnostic heatmaps.

    Args:
        args: Argument list handed to the parser returned by
            ``parse_arguments()``; ``None`` is passed through unchanged.
    """
    args = parse_arguments().parse_args(args)

    ma = hm.hiCMatrix(args.matrix)
    # Mask the NaN bins, zero remaining NaN values, then mask again.
    ma.maskBins(ma.nan_bins)
    ma.matrix.data[np.isnan(ma.matrix.data)] = 0
    ma.maskBins(ma.nan_bins)
    ma.matrix.data = ma.matrix.data
    new_intervals = hicexplorer.utilities.enlarge_bins(ma.cut_intervals)
    ma.setCutIntervals(new_intervals)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    # Fall back to the default range; the warning is only shown for intra-chr.
    default_range = '1000000:20000000'
    if args.range is None:
        if args.mode == "intra-chr":
            log.warning("You have not set any range. This is by default set to {} for intra-chr.".format(default_range))
        args.range = default_range
    min_dist, max_dist = args.range.split(":")
    log.info("checking range {}-{}".format(min_dist, max_dist))
    assert int(min_dist) < int(max_dist), "Error lower range is larger than upper range!"

    if args.transform == "z-score":
        # use zscore matrix
        log.info("Computing z-score matrix. This may take a while.\n")
        if args.mode == 'intra-chr':
            ma.convert_to_zscore_matrix(maxdepth=int(max_dist) * 2.5, perchr=True)
        else:
            ma.convert_to_zscore_matrix(maxdepth=None, perchr=True)
    elif args.transform == "obs/exp":
        # use obs/exp matrix
        log.info("Computing observed vs. expected matrix. This may take a while.\n")
        if args.mode == 'intra-chr':
            ma.convert_to_obs_exp_matrix(maxdepth=int(max_dist) * 2.5, perchr=True)
        else:
            ma.convert_to_obs_exp_matrix(maxdepth=None, perchr=True)

    # Write the (possibly transformed) matrix; '.h5' selects the h5 format,
    # everything else is written as cool.
    if args.outFileObsExp:
        file_type = 'cool'
        if args.outFileObsExp.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(ma.matrix, ma.cut_intervals, ma.nan_bins, ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.outFileObsExp, pSymmetric=True, pApplyCorrection=False)

    # M is numberOfBins rounded up to the next odd number; M_half is the
    # number of bins on each side of the center bin.
    M = args.numberOfBins if args.numberOfBins % 2 == 1 else args.numberOfBins + 1
    M_half = int((M - 1) // 2)

    # For every chromosome store the second field of its first bin position
    # and the third field of its last bin position.
    chrom_coord = dict()
    chrom_list = ma.getChrNames()
    for chrom in chrom_list:
        first, last = ma.getChrBinRange(chrom)
        first = ma.getBinPos(first)
        last = ma.getBinPos(last - 1)
        chrom_coord[chrom] = (first[1], last[2])

    # Shared state that is handed to the aggregation functions.
    agg_info = dict()
    agg_info["chrom_coord"] = chrom_coord
    agg_info["seen"] = []
    agg_info["agg_matrix"] = OrderedDict()
    agg_info["agg_total"] = {}
    agg_info["agg_diagonals"] = OrderedDict()
    agg_info["agg_contact_position"] = {}
    agg_info["agg_center_values"] = {}
    agg_info["counter"] = 0
    agg_info["used_counter"] = 0
    agg_info["empty_mat"] = 0

    if (args.mode == 'inter-chr') and (len(agg_info["chrom_coord"]) == 1):
        exit("Error: 'inter-chr' mode can not be applied on matrices of only one chromosme.")

    if args.row_wise:
        # read bed files
        bed_intervals = args.BED.readlines()
        if args.BED2:
            bed_intervals2 = args.BED2.readlines()
        else:
            log.error("Error computing row-wise contacts requires two bed files!")
            exit("Error computing row-wise contacts requires two bed files!")
        # agg_matrix could be either per chromosome or genome wide
        aggregate_contacts_per_row(bed_intervals, bed_intervals2, agg_info, ma, chrom_list, M_half, args.largeRegionsOperation, args.range, args.transform, mode=args.mode, perChr=args.perChr)
    else:  # not row-wise
        # read and sort bed files.
        bed_intervals = read_bed_per_chrom(args.BED, chrom_list)
        if args.BED2:
            bed_intervals2 = read_bed_per_chrom(args.BED2, chrom_list)
        else:
            bed_intervals2 = bed_intervals
        # agg_matrix could be either per chromosome or genome wide
        aggregate_contacts(bed_intervals, bed_intervals2, agg_info, ma, M_half, args.largeRegionsOperation, args.range, args.transform, mode=args.mode, perChr=args.perChr)

    if args.kmeans is not None:
        cluster_ids = cluster_matrices(agg_info["agg_matrix"], args.kmeans, method='kmeans', how=args.howToCluster)
        num_clusters = args.kmeans
    elif args.hclust is not None:
        log.info("Performing hierarchical clustering."
                 "Please note that it might be very slow for large datasets.\n")
        cluster_ids = cluster_matrices(agg_info["agg_matrix"], args.hclust, method='hierarchical', how=args.howToCluster)
        num_clusters = args.hclust
    else:
        # make a 'fake' clustering to generalize the plotting of the submatrices
        cluster_ids = {}
        num_clusters = 1
        for k in agg_info["agg_matrix"].keys():
            cluster_ids[k] = [range(len(agg_info["agg_matrix"][k]))]

    if len(agg_info["agg_matrix"]) == 0:
        exit("No susbmatrix found to be aggregated.")

    plot_aggregated_contacts(agg_info["agg_matrix"], agg_info["agg_contact_position"], cluster_ids, num_clusters, M_half, args)

    if args.outFileContactPairs:
        for idx, chrom in enumerate(agg_info["agg_matrix"]):
            if chrom not in bed_intervals or chrom not in bed_intervals2:
                continue
            for cluster_number, cluster_indices in enumerate(cluster_ids[chrom]):
                # Within each cluster the contacts are written in descending
                # order of their center value.
                center_values_to_order = np.array(agg_info["agg_center_values"][chrom])[cluster_indices]
                center_values_order = np.argsort(center_values_to_order)[::-1]
                output_name = "{file}_{chrom}_cluster_{id}.tab".format(file=args.outFileContactPairs, chrom=chrom, id=cluster_number + 1)
                with open(output_name, 'w') as fh:
                    for cl_idx in center_values_order:
                        value = center_values_to_order[cl_idx]
                        start, end, start2, end2 = agg_info["agg_contact_position"][chrom][cl_idx]
                        fh.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(chrom, start, end, chrom, start2, end2, value))

    # plot the diagonals
    # the diagonals plot is useful to see individual cases and if they had a contact in the center
    if args.diagnosticHeatmapFile:
        plot_diagnostic_heatmaps(agg_info["agg_diagonals"], cluster_ids, M_half, args)
def main(args=None):
    """Compute eigenvectors per chromosome and write them as bedgraph or bigwig.

    For every chromosome the obs/exp matrix, its Pearson correlation matrix
    and the covariance of the latter are computed; the first
    ``args.numberOfEigenvectors`` eigenvector columns of the covariance
    matrix are collected per bin. Optionally the Pearson and obs/exp matrices
    are saved, and the eigenvectors are passed through
    ``correlateEigenvectorWithGeneTrack`` when a gene track is given.

    Args:
        args: Argument list handed to the parser returned by
            ``parse_arguments()``; ``None`` is passed through unchanged.
    """
    args = parse_arguments().parse_args(args)
    if int(args.numberOfEigenvectors) != len(args.outputFileName):
        log.error(
            "Number of output file names and number of eigenvectors does not match. Please "
            "provide the name of each file.\nFiles: {}\nNumber of eigenvectors: {}"
            .format(args.outputFileName, args.numberOfEigenvectors))
        exit(1)

    ma = hm.hiCMatrix(args.matrix)
    ma.maskBins(ma.nan_bins)

    if args.chromosomes:
        ma.keepOnlyTheseChr(args.chromosomes)

    vecs_list = []
    chrom_list = []
    start_list = []
    end_list = []
    # PCA is computed per chromosome
    length_chromosome = 0
    chromosome_count = len(ma.getChrNames())
    if args.pearsonMatrix:
        trasf_matrix_pearson = lil_matrix(ma.matrix.shape)

    if args.obsexpMatrix:
        trasf_matrix_obsexp = lil_matrix(ma.matrix.shape)

    # Total number of bins over all chromosomes.
    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        length_chromosome += chr_range[1] - chr_range[0]

    for chrname in ma.getChrNames():
        chr_range = ma.getChrBinRange(chrname)
        submatrix = ma.matrix[chr_range[0]:chr_range[1],
                              chr_range[0]:chr_range[1]]

        if args.norm:
            exp_obs_matrix_ = exp_obs_matrix_norm(submatrix, length_chromosome,
                                                  chromosome_count)
        else:
            exp_obs_matrix_ = exp_obs_matrix_lieberman(submatrix,
                                                       length_chromosome,
                                                       chromosome_count)
        # Both variants get the same NaN/inf clean-up.
        exp_obs_matrix_ = convertNansToZeros(
            csr_matrix(exp_obs_matrix_)).todense()
        exp_obs_matrix_ = convertInfsToZeros(
            csr_matrix(exp_obs_matrix_)).todense()

        if args.obsexpMatrix:
            trasf_matrix_obsexp[chr_range[0]:chr_range[1],
                                chr_range[0]:chr_range[1]] = lil_matrix(
                                    exp_obs_matrix_)

        pearson_correlation_matrix = np.corrcoef(exp_obs_matrix_)
        pearson_correlation_matrix = convertNansToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()
        pearson_correlation_matrix = convertInfsToZeros(
            csr_matrix(pearson_correlation_matrix)).todense()

        if args.pearsonMatrix:
            trasf_matrix_pearson[chr_range[0]:chr_range[1],
                                 chr_range[0]:chr_range[1]] = lil_matrix(
                                     pearson_correlation_matrix)

        corrmatrix = np.cov(pearson_correlation_matrix)
        corrmatrix = convertNansToZeros(csr_matrix(corrmatrix)).todense()
        corrmatrix = convertInfsToZeros(csr_matrix(corrmatrix)).todense()
        evals, eigs = linalg.eig(corrmatrix)
        k = args.numberOfEigenvectors

        chrom, start, end, _ = zip(
            *ma.cut_intervals[chr_range[0]:chr_range[1]])
        vecs_list += eigs[:, :k].tolist()

        chrom_list += chrom
        start_list += start
        end_list += end

    # '.h5' selects the h5 format, everything else is written as cool.
    if args.pearsonMatrix:
        file_type = 'cool'
        if args.pearsonMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            trasf_matrix_pearson.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.pearsonMatrix,
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    if args.obsexpMatrix:
        file_type = 'cool'
        if args.obsexpMatrix.endswith('.h5'):
            file_type = 'h5'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(
            trasf_matrix_obsexp.tocsr(), ma.cut_intervals, ma.nan_bins,
            ma.correction_factors, ma.distance_counts)
        matrixFileHandlerOutput.save(args.obsexpMatrix,
                                     pSymmetric=True,
                                     pApplyCorrection=False)

    if args.geneTrack:
        vecs_list = correlateEigenvectorWithGeneTrack(ma, vecs_list,
                                                      args.geneTrack)

    if args.format == 'bedgraph':
        for idx, outfile in enumerate(args.outputFileName):
            assert (len(vecs_list) == len(chrom_list))

            with open(outfile, 'w') as fh:
                for i, value in enumerate(vecs_list):
                    if len(value) == args.numberOfEigenvectors:
                        # Builtin complex replaces the removed alias
                        # np.complex; only the real part is written.
                        if isinstance(value[idx], complex):
                            value[idx] = value[idx].real
                        fh.write("{}\t{}\t{}\t{:.12f}\n".format(
                            toString(chrom_list[i]), start_list[i],
                            end_list[i], value[idx]))

    elif args.format == 'bigwig':
        if not pyBigWig.numpy == 1:
            log.error(
                "ERROR: Your version of pyBigWig is not supporting numpy: {}".
                format(pyBigWig.__file__))
            exit(1)

        # The header holds one (chromosome, end of its last bin) entry per
        # chromosome, in the order of appearance.
        old_chrom = chrom_list[0]
        header = []
        for i, _chrom in enumerate(chrom_list):
            if old_chrom != _chrom:
                header.append((toString(old_chrom), end_list[i - 1]))
            old_chrom = _chrom

        header.append((toString(chrom_list[-1]), end_list[-1]))
        for idx, outfile in enumerate(args.outputFileName):
            log.debug("bigwig: len(vecs_list) {}".format(len(vecs_list)))
            log.debug("bigwig: len(chrom_list) {}".format(len(chrom_list)))

            assert (len(vecs_list) == len(chrom_list))
            _chrom_list = []
            _start_list = []
            _end_list = []
            values = []

            bw = pyBigWig.open(outfile, 'w')
            # set big wig header
            bw.addHeader(header)
            # create entry lists
            for i, value in enumerate(vecs_list):
                # it can happen that some 'value' is having less dimensions than it should
                if len(value) == args.numberOfEigenvectors:
                    # Builtin complex replaces the removed alias np.complex.
                    if isinstance(value[idx], complex):
                        value[idx] = value[idx].real
                    values.append(value[idx])
                    _chrom_list.append(toString(chrom_list[i]))
                    _start_list.append(start_list[i])
                    _end_list.append(end_list[i])

            # write entries
            bw.addEntries(_chrom_list,
                          _start_list,
                          ends=_end_list,
                          values=values)
            bw.close()
    else:
        log.error("Output format not known: {}".format(args.format))
        exit(1)
def main(args=None):
    """Convert Hi-C matrices between file formats.

    ``hic`` input is converted to cool with ``hic2cool_convert``. Input of
    type hicpro, homer, h5, cool or 2D-text is loaded and written as cool,
    h5, homer, ginteractions, hicpro or mcool.

    Args:
        args: Argument list handed to the parser returned by
            ``parse_arguments()``; ``None`` is passed through unchanged.
    """
    args = parse_arguments().parse_args(args)
    log.debug(args)
    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!: Input matrices {}; output matrices {}'
                .format(len(args.matrices), len(args.outFileName)))
            exit(1)
    if args.inputFormat == 'hic' and args.outputFormat != 'cool':
        log.error('The export of a hic file is only possible to a cool file.')
        exit(1)

    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
            else:
                for resolution in args.resolutions:
                    # Insert '_<resolution>' before the file extension.
                    out_name = args.outFileName[i].split('.')
                    out_name[-2] = out_name[-2] + '_' + str(resolution)
                    out_name = '.'.join(out_name)
                    hic2cool_convert(matrix, out_name, resolution)
        return
    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool', '2D-text']:
        format_was_h5 = False
        if args.inputFormat == 'h5':
            format_was_h5 = True
        applyCorrection = True
        if args.store_applied_correction:
            applyCorrection = False
        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)
        if args.inputFormat == '2D-text':
            if args.resolutions is None:
                log.error('The resolution must be defined via --resolutions')
                sys.exit(1)
            if args.chromosomeSizes is None:
                log.error(
                    'The sizes of the chromosomes must be defined via --chromosomeSizes.'
                )
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])

                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = matrixFileHandlerInput.load()

            elif args.inputFormat == '2D-text':
                # Read "<chromosome>\t<size>" lines; blank lines are skipped
                # instead of ending the read.
                chrom_sizes = OrderedDict()
                with open(args.chromosomeSizes.name, 'r') as file:
                    for line in file:
                        line = line.strip()
                        if line != '':
                            line_split = line.split('\t')
                            chrom_sizes[line_split[0]] = int(line_split[1])
                chrom_sizes = list(chrom_sizes.items())

                args.resolutions = [int(x) for x in args.resolutions]

                # Build fixed-size bins of the first resolution; the last bin
                # of a chromosome is clipped to the chromosome size.
                cut_intervals = []
                for chromosome in chrom_sizes:
                    for interval in range(0, chromosome[1],
                                          args.resolutions[0]):
                        cut_intervals.append(
                            tuple([
                                chromosome[0], interval,
                                min(chromosome[1],
                                    interval + args.resolutions[0]), 1.0
                            ]))

                hic_matrix_csr = lil_matrix(
                    (len(cut_intervals), len(cut_intervals)))
                log.debug('cut_intervals {}'.format(cut_intervals[:20]))

                hic_matrix = HiCMatrix.hiCMatrix()
                hic_matrix.setMatrix(hic_matrix_csr, cut_intervals)

                # Each line: chrom1, start1, end1, chrom2, start2, end2, value.
                with open(matrix, 'r') as file:
                    for j, line in enumerate(file):
                        line_split = line.split('\t')
                        chromosome_1 = str(line_split[0])
                        start_1 = int(line_split[1])
                        end_1 = int(line_split[2])

                        chromosome_2 = str(line_split[3])
                        start_2 = int(line_split[4])
                        end_2 = int(line_split[5])

                        value = float(line_split[6])
                        bin_id_1 = hic_matrix.getRegionBinRange(
                            chromosome_1, start_1, end_1)
                        bin_id_2 = hic_matrix.getRegionBinRange(
                            chromosome_2, start_2, end_2)
                        # Entries that cannot be assigned are only logged.
                        try:
                            hic_matrix.matrix[bin_id_1, bin_id_2] = value
                        except Exception as exp:
                            log.debug(str(exp))
                        if j % 1000 == 0:
                            log.debug('{} lines computed'.format(j))
                log.debug('csr with values filled!')

                hic_matrix.matrix = hic_matrix.matrix.tocsr()
                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = hic_matrix.matrix, hic_matrix.cut_intervals, hic_matrix.nan_bins, \
                    hic_matrix.distance_counts, hic_matrix.correction_factors
            else:
                correction_operator = None
                if args.correction_division:
                    correction_operator = '/'

                chromosomes_to_load = None
                if args.chromosome:
                    chromosomes_to_load = [args.chromosome]
                applyCorrectionCoolerLoad = True
                if args.load_raw_values:
                    applyCorrectionCoolerLoad = False
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer,
                    pApplyCorrectionCoolerLoad=applyCorrectionCoolerLoad)

                _matrix, cut_intervals, nan_bins, \
                    distance_counts, correction_factors = matrixFileHandlerInput.load()

                log.debug('cut_intervals {}'.format(cut_intervals[:20]))

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                log.debug('cool h5 homer ginteractions hicpro branch')
                if args.outputFormat in ['homer', 'ginteractions']:
                    log.debug('homer ginteractions branch')
                    # make it a upper triangular matrix in case it is not already
                    _matrix = triu(_matrix)
                    # make it a full symmetrical matrix
                    _matrix = _matrix.maximum(_matrix.T)
                hic2CoolVersion = None
                cool_metadata = None
                if args.inputFormat == 'cool':
                    hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                    cool_metadata = matrixFileHandlerInput.matrixFile.hic_metadata

                    log.debug('cool_metadata {}'.format(cool_metadata))
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat,
                    pEnforceInteger=args.enforce_integer,
                    pFileWasH5=format_was_h5,
                    pHic2CoolVersion=hic2CoolVersion,
                    pHiCInfo=cool_metadata)

                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                log.debug('len(args.outFileName) {}, i {}'.format(
                    len(args.outFileName), i))
                matrixFileHandlerOutput.save(args.outFileName[i],
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)

            if args.outputFormat == 'hicpro':
                log.debug('hicpro branch')
                if len(args.matrices) == len(args.outFileName) and len(
                        args.outFileName) == len(args.bedFileHicpro):
                    log.debug('args.bedFileHicpro[i] {}'.format(
                        args.bedFileHicpro[i]))
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType=args.outputFormat,
                        pBedFileHicPro=args.bedFileHicpro[i])

                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[i],
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
                else:
                    # args.matrices is the attribute used everywhere else in
                    # this function for the input list.
                    log.error(
                        'The number of input matrices, output files and bed files does not match: Input: {}; Output: {}; Bed: {}'
                        .format(len(args.matrices), len(args.outFileName),
                                len(args.bedFileHicpro)))
                    exit(1)
            elif args.outputFormat in ['mcool']:
                log.debug('outformat is mcool')
                if args.resolutions and len(args.matrices) > 1:
                    log.error(
                        'Please define one matrix and many resolutions which should be created or multiple matrices.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution.'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)

                    bin_size = hic_matrix.getBinSize()
                    hic2CoolVersion = None
                    cool_metadata = None
                    if args.inputFormat == 'cool':
                        hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version
                        cool_metadata = matrixFileHandlerInput.matrixFile.hic_metadata

                    for j, resolution in enumerate(args.resolutions):
                        hic_matrix_res = deepcopy(hic_matrix)

                        _mergeFactor = int(resolution) // bin_size

                        log.debug('bin size {}'.format(bin_size))
                        log.debug('_mergeFactor {}'.format(_mergeFactor))
                        if int(resolution) != bin_size:
                            merged_matrix = hicMergeMatrixBins.merge_bins(
                                hic_matrix_res, _mergeFactor)
                        else:
                            merged_matrix = hic_matrix_res
                        # The first resolution creates the file, all further
                        # ones are appended.
                        append = False
                        if j > 0:
                            append = True
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer,
                            pAppend=append,
                            pFileWasH5=format_was_h5,
                            pHic2CoolVersion=hic2CoolVersion,
                            pHiCInfo=cool_metadata)

                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '::/resolutions/' +
                            str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)

                else:
                    # One input matrix per resolution: the first matrix
                    # creates the file, all further ones are appended.
                    append = False
                    if i > 0:
                        append = True
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool',
                        pAppend=append,
                        pFileWasH5=format_was_h5)

                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
def main(args=None):
    """Merge the bins of every matrix in a multi-cooler file and save them.

    The matrices listed in ``args.matrix`` are split into contiguous chunks,
    one per worker process. Each worker runs ``compute_merge`` on its chunk
    and reports the result through its own queue. The collected results are
    then written to ``args.outFileName`` under the original matrix names.

    Args:
        args: Optional list of command line arguments, forwarded to the
            argument parser.
    """
    args = parse_arguments().parse_args(args)

    matrices_list = cooler.fileops.list_coolers(args.matrix)
    if not matrices_list:
        # Without this guard the chunk-size computation divides by zero.
        log.error('No matrices found in {}'.format(args.matrix))
        exit(1)

    # Clamp the worker count first and size every per-worker list with the
    # clamped value. Sizing ``merged_matrices`` with the requested thread
    # count left trailing ``None`` entries that broke the flattening below.
    threads = max(1, min(args.threads, len(matrices_list)))
    merged_matrices = [None] * threads
    thread_done = [False] * threads
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads

    for i in range(threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
        else:
            # The last worker also takes the remainder of the division.
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=compute_merge,
                             kwargs=dict(pMatrixName=args.matrix,
                                         pMatrixList=matrices_name_list,
                                         pRunningWindow=args.runningWindow,
                                         pNumBins=args.numBins,
                                         pQueue=queue[i]))
        process[i].start()

    failures = []
    while True:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                log.debug('i {}'.format(i))
                log.debug('len(queue) {}'.format(len(queue)))
                log.debug('len(merged_matrices) {}'.format(
                    len(merged_matrices)))
                # Fetch the result before joining the worker.
                result = queue[i].get()
                # compute_merge reports an exception as a one-element list
                # holding a "Fail: ..." message instead of matrices.
                if result and isinstance(result[0], str) and result[0].startswith('Fail'):
                    failures.append(result[0])
                    result = []
                merged_matrices[i] = result

                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        if all(thread_done):
            break
        time.sleep(1)

    if failures:
        for message in failures:
            log.error(message)
        exit(1)

    merged_matrices = [item for sublist in merged_matrices for item in sublist]

    for i, hic_matrix in enumerate(merged_matrices):
        # The first matrix creates the output file, all later ones append.
        matrixFileHandlerOutput = MatrixFileHandler(pFileType='cool',
                                                    pAppend=i > 0,
                                                    pFileWasH5=False)
        matrixFileHandlerOutput.set_matrix_variables(
            hic_matrix.matrix, hic_matrix.cut_intervals, hic_matrix.nan_bins,
            hic_matrix.correction_factors, hic_matrix.distance_counts)
        matrixFileHandlerOutput.save(args.outFileName + '::' + matrices_list[i],
                                     pSymmetric=True,
                                     pApplyCorrection=False)
def main(args=None):
    """Convert Hi-C matrices between file formats.

    Three paths are handled:

    * ``hic`` input with ``cool`` output is delegated to ``hic2cool_convert``,
      once per requested resolution (or once with resolution ``0`` when none
      is given).
    * ``hicpro``, ``homer``, ``h5`` or ``cool`` input is loaded through
      ``MatrixFileHandler`` and written as ``cool``, ``h5``, ``homer`` or
      ``ginteractions`` (one output per input) or as ``mcool``.
    * For ``mcool`` output, either one matrix is merged to every resolution
      in ``args.resolutions`` or every input matrix is stored under its own
      bin size; all of them go into ``args.outFileName[0]``.

    Args:
        args: Optional list of command line arguments, forwarded to the
            argument parser.
    """
    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!'
            )
            exit(1)

    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
                continue
            # Insert "_<resolution>" in front of the last extension. A name
            # without any '.' used to raise an IndexError; the suffix is now
            # simply appended in that case.
            stem, dot, extension = args.outFileName[i].rpartition('.')
            for resolution in args.resolutions:
                if dot:
                    out_name = '{}_{}.{}'.format(stem, resolution, extension)
                else:
                    out_name = '{}_{}'.format(args.outFileName[i], resolution)
                hic2cool_convert(matrix, out_name, resolution)
        return

    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool']:
        format_was_h5 = args.inputFormat == 'h5'
        applyCorrection = not args.store_applied_correction

        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
            else:
                correction_operator = '/' if args.correction_division else None
                chromosomes_to_load = [args.chromosome] if args.chromosome else None
                applyCorrectionCoolerLoad = not args.load_raw_values
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer,
                    pApplyCorrectionCoolerLoad=applyCorrectionCoolerLoad)

            _matrix, cut_intervals, nan_bins, \
                distance_counts, correction_factors = matrixFileHandlerInput.load()

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                if args.outputFormat in ['homer', 'ginteractions']:
                    # make it a upper triangular matrix in case it is not already
                    _matrix = triu(_matrix)
                    # make it a full symmetrical matrix
                    _matrix = _matrix.maximum(_matrix.T)

                hic2CoolVersion = None
                if args.inputFormat == 'cool':
                    hic2CoolVersion = matrixFileHandlerInput.matrixFile.hic2cool_version

                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat,
                    pEnforceInteger=args.enforce_integer,
                    pFileWasH5=format_was_h5,
                    pHic2CoolVersion=hic2CoolVersion)
                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                matrixFileHandlerOutput.save(args.outFileName[i],
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)

            elif args.outputFormat in ['mcool']:
                log.debug('outformat is mcool')
                if args.resolutions and len(args.matrices) > 1:
                    # NOTE(review): this only logs; processing continues and
                    # every matrix is written to the same output file.
                    log.error(
                        'Please define one matrix and many resolutions which should be created or multiple matrices.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution.'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()

                    for j, resolution in enumerate(args.resolutions):
                        # Merge on a copy so every resolution starts from the
                        # unmerged matrix.
                        hic_matrix_res = deepcopy(hic_matrix)
                        _mergeFactor = int(resolution) // bin_size
                        log.debug('bin size {}'.format(bin_size))
                        log.debug('_mergeFactor {}'.format(_mergeFactor))
                        if int(resolution) != bin_size:
                            merged_matrix = hicMergeMatrixBins.merge_bins(
                                hic_matrix_res, _mergeFactor)
                        else:
                            merged_matrix = hic_matrix_res

                        # First resolution creates the file, the rest append.
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer,
                            pAppend=j > 0,
                            pFileWasH5=format_was_h5)
                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '::/resolutions/' +
                            str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)
                else:
                    # One resolution group per input matrix, named after the
                    # bin size of that matrix.
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool',
                        pAppend=i > 0,
                        pFileWasH5=format_was_h5)
                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '::/resolutions/' + str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)
    else:
        # Previously such a combination returned silently without output.
        log.error('Conversion from {} to {} is not supported.'.format(
            args.inputFormat, args.outputFormat))
def main(args=None):
    """Aggregate Hi-C contacts around BED regions, cluster and plot them.

    The matrix is loaded and masked, optionally transformed to z-score or
    obs/exp values, and submatrices are aggregated either row-wise (pairing
    the lines of two BED files) or over all region pairs. The aggregated
    submatrices are then clustered (k-means, hierarchical, or a single
    pseudo cluster) and handed to the plotting routines.

    Args:
        args: Optional list of command line arguments, forwarded to the
            argument parser.
    """
    args = parse_arguments().parse_args(args)
    matplotlib.rcParams['pdf.fonttype'] = 42

    hic_ma = hm.hiCMatrix(args.matrix)
    hic_ma.maskBins(hic_ma.nan_bins)
    hic_ma.matrix.data[np.isnan(hic_ma.matrix.data)] = 0
    hic_ma.maskBins(hic_ma.nan_bins)
    # Self-assignment retained from the original implementation.
    hic_ma.matrix.data = hic_ma.matrix.data
    hic_ma.setCutIntervals(
        hicexplorer.utilities.enlarge_bins(hic_ma.cut_intervals))
    if args.chromosomes:
        hic_ma.keepOnlyTheseChr(args.chromosomes)

    default_range = '1000000:20000000'
    if args.range is None:
        if args.mode == "intra-chr":
            log.warning("You have not set any range. This is by default set to {} for intra-chr.".format(default_range))
        args.range = default_range
    elif args.mode in ("inter-chr", "all"):
        log.info("--range is ineffective for inter-chr and all mode.")

    min_dist, max_dist = args.range.split(":")
    intra_chr = args.mode == "intra-chr"
    if intra_chr:
        log.info("checking range {}-{}".format(min_dist, max_dist))
        assert int(min_dist) < int(max_dist), "Error lower range is larger than upper range!"

    if args.transform in ("z-score", "obs/exp"):
        if args.transform == "z-score":
            # use zscore matrix
            log.info("Computing z-score matrix. This may take a while.\n")
            convert = hic_ma.convert_to_zscore_matrix
        else:
            # use obs/exp matrix
            log.info("Computing observed vs. expected matrix. This may take a while.\n")
            convert = hic_ma.convert_to_obs_exp_matrix
        # The depth limit only applies to intra-chromosomal aggregation.
        depth = int(max_dist) * 2.5 if intra_chr else None
        convert(maxdepth=depth, perchr=True)

    # NOTE(review): placed at function level, i.e. independent of the chosen
    # transform; confirm against the original layout.
    if args.outFileObsExp:
        file_type = 'h5' if args.outFileObsExp.endswith('.h5') else 'cool'
        matrixFileHandlerOutput = MatrixFileHandler(pFileType=file_type)
        matrixFileHandlerOutput.set_matrix_variables(hic_ma.matrix,
                                                     hic_ma.cut_intervals,
                                                     hic_ma.nan_bins,
                                                     hic_ma.correction_factors,
                                                     hic_ma.distance_counts)
        matrixFileHandlerOutput.save(args.outFileObsExp, pSymmetric=True,
                                     pApplyCorrection=False)

    # Force an odd window size so that there is a central bin.
    if args.numberOfBins % 2 == 1:
        M = args.numberOfBins
    else:
        M = args.numberOfBins + 1
    M_half = int((M - 1) // 2)

    # Genomic start of the first bin and end of the last bin per chromosome.
    chrom_list = hic_ma.getChrNames()
    chrom_coord = dict()
    for chrom in chrom_list:
        first_bin, end_bin = hic_ma.getChrBinRange(chrom)
        first_pos = hic_ma.getBinPos(first_bin)
        last_pos = hic_ma.getBinPos(end_bin - 1)
        chrom_coord[chrom] = (first_pos[1], last_pos[2])

    agg_info = {
        "chrom_coord": chrom_coord,  # coordinates of each chrom
        "seen": [],  # seen bins
        "agg_matrix": {chrom: {} for chrom in chrom_list},  # important
        "agg_total": {chrom: {} for chrom in chrom_list},
        "agg_diagonals": {chrom: {} for chrom in chrom_list},
        "agg_contact_position": {chrom: {} for chrom in chrom_list},  # important
        "agg_center_values": {chrom: {} for chrom in chrom_list},  # important
        "counter": 0,
        "used_counter": 0,
        "empty_mat": 0,
    }
    log.debug('agg_info["agg_matrix"] {}'.format(agg_info["agg_matrix"]))

    # Reject option combinations that cannot work.
    if (args.mode == 'inter-chr') and (len(agg_info["chrom_coord"]) == 1):
        exit("Error: 'inter-chr' mode can not be applied on matrices of only one chromosme.")
    if (args.mode == 'inter-chr') and (args.perChr):
        exit("Error: 'inter-chr' mode can not be used along with --perChr.")
    if (args.mode == 'all') and (args.perChr):
        exit("Error: 'all' mode can not be used along with --perChr.")

    if not args.row_wise:
        # read and sort bed files.
        bed_intervals = read_bed_per_chrom(args.BED, chrom_list,
                                           args.considerStrandDirection)
        if args.BED2:
            bed_intervals2 = read_bed_per_chrom(args.BED2, chrom_list,
                                                args.considerStrandDirection)
        else:
            bed_intervals2 = bed_intervals
        # agg_matrix could be either per chromosome or genome wide
        aggregate_contacts(bed_intervals, bed_intervals2, agg_info, hic_ma,
                           M_half, args.largeRegionsOperation, args.range,
                           args.transform, mode=args.mode,
                           pConsiderStrandDirection=args.considerStrandDirection)
    else:
        # read bed files
        bed_intervals = args.BED.readlines()
        if not args.BED2:
            log.error("Error computing row-wise contacts requires two bed files!")
            exit("Error computing row-wise contacts requires two bed files!")
        bed_intervals2 = args.BED2.readlines()
        if len(bed_intervals) != len(bed_intervals2):
            log.error("row_wise only works if both bed files have the same length.")
            exit("Error row_wise only works if both bed files have the same length.")
        # agg_matrix could be either per chromosome or genome wide
        aggregate_contacts_per_row(bed_intervals, bed_intervals2, agg_info,
                                   hic_ma, chrom_list, M_half,
                                   args.largeRegionsOperation, args.range,
                                   args.transform, mode=args.mode,
                                   perChr=args.perChr,
                                   pConsiderStrandDirection=args.considerStrandDirection)

    if len(agg_info["agg_matrix"]) == 0:
        exit("No susbmatrix found to be aggregated.")

    # Select the clustering set-up; the call itself is shared by all cases.
    if args.kmeans is not None:
        assert(args.kmeans > 1)
        num_clusters = args.kmeans
        cluster_method = 'kmeans'
        cluster_how = args.howToCluster
    elif args.hclust is not None:
        assert(args.hclust > 1)
        log.info("Performing hierarchical clustering."
                 "Please note that it might be very slow for large datasets.\n")
        num_clusters = args.hclust
        cluster_method = 'hierarchical'
        cluster_how = args.howToCluster
    else:
        # make a 'fake' clustering to generalize the plotting of the submatrices
        num_clusters = 1
        cluster_method = 'no_clust'
        cluster_how = 'full'

    # Same rule as the original branches: the option value is forwarded only
    # when it compares equal to True, otherwise a plain False is passed.
    per_chr = args.perChr if args.perChr == True else False  # noqa: E712
    clustered_info = cluster_matrices(agg_info, k=num_clusters,
                                      method=cluster_method, how=cluster_how,
                                      perChr=per_chr,
                                      max_deviation=args.max_deviation,
                                      keep_outlier=args.keep_outlier)

    plot_aggregated_contacts(clustered_info, num_clusters, M_half, args)

    # plot the diagonals
    # the diagonals plot is useful to see individual cases and if they had a contact in the center
    if args.diagnosticHeatmapFile:
        plot_diagnostic_heatmaps(clustered_info, M_half, args)
def main(args=None):
    """Adjust the matrices of a multi-cooler file and write the kept ones.

    With ``--createSubmatrix`` and neither regions nor chromosomes given, the
    first ``args.createSubmatrix`` matrices are copied unchanged and the
    program exits. Otherwise the matrices are split into contiguous chunks,
    one per worker process running ``compute_adjust_matrix``. Each worker
    reports the adjusted matrices together with a keep flag per matrix;
    matrices whose flag is ``0`` are not written.

    Args:
        args: Optional list of command line arguments, forwarded to the
            argument parser.
    """
    args = parse_arguments().parse_args(args)

    matrices_name = args.matrix
    matrices_list = cooler.fileops.list_coolers(matrices_name)

    if args.createSubmatrix is not None and args.regions is None and args.chromosomes is None:
        for matrix in matrices_list[:args.createSubmatrix]:
            cooler.fileops.cp(args.matrix + '::' + matrix,
                              args.outFileName + '::' + matrix)
        exit(0)

    input_count_matrices = len(matrices_list)
    if not matrices_list:
        # Without this guard the chunk-size computation divides by zero.
        log.error('No matrices found in {}'.format(matrices_name))
        exit(1)

    # Never start more workers than there are matrices, and at least one.
    threads = max(1, min(args.threads, len(matrices_list)))

    thread_done = [False] * threads
    hicmatrix_adjusted_objects_threads = [None] * threads
    keep_matrices_list_threads = [None] * threads
    matricesPerThread = len(matrices_list) // threads
    queue = [None] * threads
    process = [None] * threads

    for i in range(threads):
        if i < threads - 1:
            matrices_name_list = matrices_list[i * matricesPerThread:(i + 1) * matricesPerThread]
        else:
            # The last worker also takes the remainder of the division.
            matrices_name_list = matrices_list[i * matricesPerThread:]

        queue[i] = Queue()
        process[i] = Process(target=compute_adjust_matrix,
                             kwargs=dict(pMatrixName=matrices_name,
                                         pMatricesList=matrices_name_list,
                                         pArgs=args,
                                         pQueue=queue[i]))
        process[i].start()

    while True:
        for i in range(threads):
            if queue[i] is not None and not queue[i].empty():
                # Fetch the result before joining the worker.
                hicmatrix_adjusted_objects_threads[i], \
                    keep_matrices_list_threads[i] = queue[i].get()
                queue[i] = None
                process[i].join()
                process[i].terminate()
                process[i] = None
                thread_done[i] = True
        if all(thread_done):
            break
        time.sleep(1)

    hicmatrix_adjusted_objects = [
        item for sublist in hicmatrix_adjusted_objects_threads
        for item in sublist
    ]
    keep_matrices_list = [
        item for sublist in keep_matrices_list_threads for item in sublist
    ]
    log.debug('length out {}'.format(len(hicmatrix_adjusted_objects)))

    # Appending must depend on whether the output file has been created yet,
    # not on the loop index: if the first matrix is skipped, the next one
    # still has to create the file.
    written_any = False
    for i, hic_matrix in enumerate(hicmatrix_adjusted_objects):
        # ``>=`` limits the output to the first ``createSubmatrix`` input
        # matrices, matching the copy shortcut above (``>`` wrote one more).
        if args.createSubmatrix and i >= args.createSubmatrix:
            break
        if keep_matrices_list[i] == 0:
            continue

        matrixFileHandlerOutput = MatrixFileHandler(pFileType='cool',
                                                    pAppend=written_any,
                                                    pEnforceInteger=False,
                                                    pFileWasH5=False,
                                                    pHic2CoolVersion=None)
        matrixFileHandlerOutput.set_matrix_variables(
            hic_matrix.matrix, hic_matrix.cut_intervals, hic_matrix.nan_bins,
            hic_matrix.correction_factors, hic_matrix.distance_counts)
        matrixFileHandlerOutput.save(args.outFileName + '::' + matrices_list[i],
                                     pSymmetric=True,
                                     pApplyCorrection=False)
        written_any = True

    broken_count = input_count_matrices - np.sum(np.array(keep_matrices_list))
    print(
        'Out of {} matrices, {} were removed because they were broken.'.format(
            input_count_matrices, broken_count))
def main(args=None):
    """Convert Hi-C matrices between file formats.

    ``hic`` input with ``cool`` output is delegated to ``hic2cool_convert``.
    ``hicpro``, ``homer``, ``h5`` and ``cool`` inputs are loaded through
    ``MatrixFileHandler`` and written either one-to-one as ``cool``, ``h5``,
    ``homer`` or ``ginteractions`` (the output format is appended to the
    output name as extension), or collected into a single ``.mcool`` file
    named after ``args.outFileName[0]``.

    Args:
        args: Optional list of command line arguments, forwarded to the
            argument parser.
    """
    args = parse_arguments().parse_args(args)
    log.debug(args)

    # parse from hicpro, homer, h5 and hic to cool
    if args.inputFormat != 'hic' and args.outputFormat != 'mcool':
        if len(args.matrices) != len(args.outFileName):
            log.error(
                'Number of input matrices does not match number output matrices!'
            )
            exit(1)

    if args.inputFormat == 'hic' and args.outputFormat == 'cool':
        log.info('Converting with hic2cool.')
        for i, matrix in enumerate(args.matrices):
            if args.resolutions is None:
                hic2cool_convert(matrix, args.outFileName[i], 0)
                continue
            # The output name has to be built per resolution inside the
            # loop. The previous code built it once before the loop, using
            # an undefined name and the not yet bound loop variable, and
            # therefore always raised a NameError.
            stem, dot, extension = args.outFileName[i].rpartition('.')
            for resolution in args.resolutions:
                if dot:
                    out_name = '{}_{}.{}'.format(stem, resolution, extension)
                else:
                    out_name = '{}_{}'.format(args.outFileName[i], resolution)
                hic2cool_convert(matrix, out_name, resolution)
        return

    elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool']:
        applyCorrection = not args.store_applied_correction

        if args.inputFormat == 'hicpro':
            if len(args.matrices) != len(args.bedFileHicpro):
                log.error(
                    'Number of matrices and associated bed files need to be the same.'
                )
                log.error('Matrices: {}; Bed files: {}'.format(
                    len(args.matrices), len(args.bedFileHicpro)))
                sys.exit(1)

        for i, matrix in enumerate(args.matrices):
            if args.inputFormat == 'hicpro':
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pBedFileHicPro=args.bedFileHicpro[i])
            else:
                correction_operator = '/' if args.correction_division else None
                chromosomes_to_load = [args.chromosome] if args.chromosome else None
                matrixFileHandlerInput = MatrixFileHandler(
                    pFileType=args.inputFormat,
                    pMatrixFile=matrix,
                    pCorrectionFactorTable=args.correction_name,
                    pCorrectionOperator=correction_operator,
                    pChrnameList=chromosomes_to_load,
                    pEnforceInteger=args.enforce_integer)

            # load() yields distance_counts before correction_factors; the
            # two were unpacked in swapped order and then handed to
            # set_matrix_variables() under each other's name.
            _matrix, cut_intervals, nan_bins, \
                distance_counts, correction_factors = matrixFileHandlerInput.load()

            log.debug('Setting done')

            if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']:
                matrixFileHandlerOutput = MatrixFileHandler(
                    pFileType=args.outputFormat)
                matrixFileHandlerOutput.set_matrix_variables(
                    _matrix, cut_intervals, nan_bins, correction_factors,
                    distance_counts)
                matrixFileHandlerOutput.save(args.outFileName[i] + '.' +
                                             args.outputFormat,
                                             pSymmetric=True,
                                             pApplyCorrection=applyCorrection)

            elif args.outputFormat in ['mcool']:
                log.debug('outformat is mcool')
                if args.resolutions and len(args.matrices) > 1:
                    # NOTE(review): this only logs; processing continues.
                    log.error(
                        'Please define either one matrix and many resolutions which should be created.'
                    )
                if args.resolutions:
                    log.info(
                        'Correction factors are removed. They are not valid for any new created resolution'
                    )
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()

                    for j, resolution in enumerate(args.resolutions):
                        _mergeFactor = int(resolution) // bin_size
                        merged_matrix = hicMergeMatrixBins.merge_bins(
                            hic_matrix, _mergeFactor)
                        # All resolutions share one output file: the first
                        # write creates it, every later one has to append.
                        matrixFileHandlerOutput = MatrixFileHandler(
                            pFileType='cool',
                            pEnforceInteger=args.enforce_integer,
                            pAppend=j > 0)
                        matrixFileHandlerOutput.set_matrix_variables(
                            merged_matrix.matrix, merged_matrix.cut_intervals,
                            merged_matrix.nan_bins,
                            merged_matrix.correction_factors,
                            merged_matrix.distance_counts)
                        matrixFileHandlerOutput.save(
                            args.outFileName[0] + '.mcool' +
                            '::/resolutions/' + str(resolution),
                            pSymmetric=True,
                            pApplyCorrection=applyCorrection)
                else:
                    # One resolution group per input matrix, named after the
                    # bin size of that matrix; later matrices append.
                    hic_matrix = HiCMatrix.hiCMatrix()
                    hic_matrix.setMatrix(_matrix, cut_intervals)
                    bin_size = hic_matrix.getBinSize()
                    matrixFileHandlerOutput = MatrixFileHandler(
                        pFileType='cool', pAppend=i > 0)
                    matrixFileHandlerOutput.set_matrix_variables(
                        _matrix, cut_intervals, nan_bins, correction_factors,
                        distance_counts)
                    matrixFileHandlerOutput.save(
                        args.outFileName[0] + '.mcool' + '::/resolutions/' +
                        str(bin_size),
                        pSymmetric=True,
                        pApplyCorrection=applyCorrection)