def load(self):
    """
    Load a Hi-C matrix stored in h5 format from ``self.matrixFileName``.

    The sparse matrix is rebuilt from the ``data``, ``indices``, ``indptr``
    and ``shape`` arrays of the ``matrix`` group; the bin intervals come from
    the ``chr_list``, ``start_list``, ``end_list`` and ``extra_list`` arrays
    of the ``intervals`` group. ``nan_bins``, ``correction_factors`` and
    ``distance_counts`` are optional nodes and are read on a best-effort
    basis: a missing or unreadable node never aborts the load.

    :return: matrix, cut_intervals, nan_bins, distance_counts, correction_factors
             ``cut_intervals`` is a list of (chrom, start, end, extra) tuples,
             ``nan_bins`` is an empty array if not stored, ``distance_counts``
             and ``correction_factors`` are None if not stored.
    :raises AssertionError: if the number of intervals differs from the
                            number of matrix rows.
    """
    log.debug('Load in h5 format')

    with tables.open_file(self.matrixFileName) as f:
        parts = {}
        for matrix_part in ('data', 'indices', 'indptr', 'shape'):
            parts[matrix_part] = getattr(f.root.matrix, matrix_part).read()

        matrix = csr_matrix(tuple([parts['data'], parts['indices'], parts['indptr']]),
                            shape=parts['shape'])
        # matrix = hiCMatrix.fillLowerTriangle(matrix)

        # get intervals
        intvals = {}
        for interval_part in ('chr_list', 'start_list', 'end_list', 'extra_list'):
            values = getattr(f.root.intervals, interval_part).read()
            if interval_part == 'chr_list':
                # chromosome names are passed through toString, the other
                # interval columns are used as read
                values = toString(values)
            intvals[interval_part] = values

        # zip() is lazy on Python 3; materialize it so that len() works and
        # callers receive a reusable sequence.
        cut_intervals = list(zip(intvals['chr_list'], intvals['start_list'],
                                 intvals['end_list'], intvals['extra_list']))
        assert len(cut_intervals) == matrix.shape[0], \
            "Error loading matrix. Length of bin intervals ({}) is different than the " \
            "size of the matrix ({})".format(len(cut_intervals), matrix.shape[0])

        # get nan_bins
        try:
            if hasattr(f.root, 'nan_bins'):
                nan_bins = f.root.nan_bins.read()
            else:
                nan_bins = np.array([])
        except Exception:
            nan_bins = np.array([])

        # get correction factors
        try:
            if hasattr(f.root, 'correction_factors'):
                correction_factors = f.root.correction_factors.read()
                if len(correction_factors) != matrix.shape[0]:
                    # Factors that do not match the matrix are discarded, but
                    # the mismatch is reported instead of silently swallowed.
                    log.warning("Error loading matrix. Length of correction factors does not "
                                "match size of matrix")
                    correction_factors = None
            else:
                correction_factors = None
        except Exception:
            correction_factors = None

        # get distance counts
        try:
            if hasattr(f.root, 'distance_counts'):
                distance_counts = f.root.distance_counts.read()
            else:
                distance_counts = None
        except Exception:
            distance_counts = None

    return matrix, cut_intervals, nan_bins, distance_counts, correction_factors
def load(self):
    """
    Load a Hi-C matrix stored in cool format from ``self.matrixFileName``.

    If ``self.chrnameList`` is None the whole matrix is read in chunks of
    pixels; if it holds exactly one region only that region is fetched,
    optionally restricted to contacts closer than ``self.distance``.
    If ``self.applyCorrectionLoad`` is set and the bin table contains the
    column ``self.correctionFactorTable``, the correction factors are applied
    to the matrix with ``self.correctionOperator`` ('*' or '/').

    Side effects: resets ``self.minValue``/``self.maxValue`` to None, stores
    ``self.hic_metadata`` and ``self.cool_info`` and may set
    ``self.correctionOperator``, ``self.hic2cool_version`` and
    ``self.hicmatrix_version``.

    :return: matrix, cut_intervals, nan_bins, distance_counts, correction_factors
             (``distance_counts`` is always None). If the cooler file cannot
             be opened the 2-tuple ``(None, exception)`` is returned instead.
    :raises ValueError: if the requested region cannot be fetched.
    :raises Exception: if more than one region is requested.
    """
    log.debug('Load in cool format')
    self.minValue = None
    self.maxValue = None
    if self.matrixFileName is None:
        log.warning('No matrix is initialized')
    try:
        cooler_file = cooler.Cooler(self.matrixFileName)
        if 'metadata' in cooler_file.info:
            self.hic_metadata = cooler_file.info['metadata']
        else:
            self.hic_metadata = None
        self.cool_info = deepcopy(cooler_file.info)
    except Exception as e:
        log.warning("Could not open cooler file. Maybe the path is wrong or the given node is not available.")
        log.warning('The following file was tried to open: {}'.format(self.matrixFileName))
        log.warning("The following nodes are available: {}".format(cooler.fileops.list_coolers(self.matrixFileName.split("::")[0])))
        return None, e

    if self.chrnameList is None:
        matrixDataFrame = cooler_file.matrix(balance=False, sparse=True, as_pixels=True)
        nbins = cooler_file.info['nbins']
        nnz = cooler_file.info['nnz']

        # bin ids only need 64 bit if there are more bins than int32 can hold
        used_dtype = np.int32
        if np.iinfo(np.int32).max < nbins:
            used_dtype = np.int64
        count_dtype = matrixDataFrame[0]['count'].dtype
        data = np.empty(nnz, dtype=count_dtype)
        instances = np.empty(nnz, dtype=used_dtype)
        features = np.empty(nnz, dtype=used_dtype)

        # read the pixel table in roughly 32 row chunks to bound peak memory
        i = 0
        size = nbins // 32
        if size == 0:
            size = 1
        start_pos = 0
        while i < nbins:
            matrixDataFrameChunk = matrixDataFrame[i:i + size]
            _data = matrixDataFrameChunk['count'].values.astype(count_dtype)
            _instances = matrixDataFrameChunk['bin1_id'].values.astype(used_dtype)
            _features = matrixDataFrameChunk['bin2_id'].values.astype(used_dtype)

            data[start_pos:start_pos + len(_data)] = _data
            instances[start_pos:start_pos + len(_instances)] = _instances
            features[start_pos:start_pos + len(_features)] = _features
            start_pos += len(_features)
            i += size
            del _data
            del _instances
            del _features

        # builtin int replaces np.int, which was removed in NumPy 1.24
        matrix = csr_matrix((data, (instances, features)),
                            shape=(int(nbins), int(nbins)), dtype=count_dtype)
        del data
        del instances
        del features
        gc.collect()
    else:
        if len(self.chrnameList) == 1:
            try:
                if self.distance is None or cooler_file.binsize is None:
                    # load the full chromosome
                    matrix = cooler_file.matrix(balance=False, sparse=True, as_pixels=False).fetch(self.chrnameList[0]).tocsr()
                else:
                    # load only the values up to a specific distance
                    lo, hi = cooler_file.extent(self.chrnameList[0])
                    dist = self.distance // cooler_file.binsize
                    step = (hi - lo) // 32
                    if step < 1:
                        step = 1
                    mat = lil_matrix((hi - lo, hi - lo), dtype=np.float32)
                    for i0, i1 in cooler.util.partition(lo, hi, step):
                        # fetch stripe
                        pixels = cooler_file.matrix(balance=False, as_pixels=True)[i0:i1, lo:hi]
                        # filter
                        pixels = pixels[(pixels['bin2_id'] - pixels['bin1_id']) < dist]
                        # insert into sparse matrix
                        mat[pixels['bin1_id'] - lo, pixels['bin2_id'] - lo] = pixels['count'].astype(np.float32)
                        del pixels

                    matrix = mat.tocsr()
                    del mat
                    gc.collect()
            except ValueError as ve:
                log.exception("Wrong chromosome format. Please check UCSC / ensembl notation.")
                log.exception('Error: {}'.format(str(ve)))
                # Without a matrix nothing below can succeed; re-raise instead
                # of failing later with an unrelated UnboundLocalError.
                raise
        else:
            raise Exception("Operation to load more as one region is not supported.")

    cut_intervals_data_frame = None
    correction_factors_data_frame = None

    if self.chrnameList is not None:
        if len(self.chrnameList) == 1:
            cut_intervals_data_frame = cooler_file.bins().fetch(self.chrnameList[0])
            log.debug('cut_intervals_data_frame {}'.format(list(cut_intervals_data_frame.columns)))
            if self.correctionFactorTable in cut_intervals_data_frame:
                correction_factors_data_frame = cut_intervals_data_frame[self.correctionFactorTable]
        else:
            raise Exception("Operation to load more than one chr from bins is not supported.")
    else:
        if self.applyCorrectionLoad and self.correctionFactorTable in cooler_file.bins():
            correction_factors_data_frame = cooler_file.bins()[[self.correctionFactorTable]][:]

        cut_intervals_data_frame = cooler_file.bins()[['chrom', 'start', 'end']][:]

    correction_factors = None
    if correction_factors_data_frame is not None and self.applyCorrectionLoad:
        # apply correction factors to matrix
        # a_i,j = a_i,j * c_i *c_j
        matrix.eliminate_zeros()
        # matrices with at most one stored value are left uncorrected
        if len(matrix.data) > 1:
            matrix.data = matrix.data.astype(float)

            correction_factors = np.array(correction_factors_data_frame.values).flatten()
            # Don't apply correction if weight were just 'nans'
            if np.sum(np.isnan(correction_factors)) != len(correction_factors):
                matrix.sort_indices()

                instances, features = matrix.nonzero()
                instances_factors = correction_factors[instances]
                features_factors = correction_factors[features]

                if self.correctionOperator is None:
                    # KR / VC / SQRT_VC weights are divided, all others multiplied
                    if self.correctionFactorTable in ['KR', 'VC', 'SQRT_VC']:
                        self.correctionOperator = '/'
                    else:
                        self.correctionOperator = '*'
                    if 'generated-by' in cooler_file.info:
                        log.debug('cooler_file.info[\'generated-by\'] {} {}'.format(cooler_file.info['generated-by'], type(cooler_file.info['generated-by'])))
                        generated_by = toString(cooler_file.info['generated-by'])
                        if 'hic2cool' in generated_by:
                            self.hic2cool_version = generated_by.split('-')[1]
                        elif 'hicmatrix' in generated_by:
                            self.hicmatrix_version = generated_by.split('-')[1]

                # fancy indexing returned a copy, so the in-place product
                # does not touch correction_factors itself
                instances_factors *= features_factors
                log.debug('hic2cool: {}'.format(self.hic2cool_version))
                log.debug('self.correctionOperator: {}'.format(self.correctionOperator))

                if self.correctionOperator == '*':
                    matrix.data *= instances_factors
                elif self.correctionOperator == '/':
                    matrix.data /= instances_factors

    cut_intervals = []
    for values in cut_intervals_data_frame.values:
        cut_intervals.append(tuple([toString(values[0]), values[1], values[2], 1.0]))
    del cut_intervals_data_frame
    del correction_factors_data_frame

    # try to restore nan_bins.
    try:
        # remove possible nan bins introduced by the correction factors
        # to have them part of the nan_bins vector
        mask = np.isnan(matrix.data)
        matrix.data[mask] = 0
        matrix.eliminate_zeros()
        shape = matrix.shape[0] if matrix.shape[0] < matrix.shape[1] else matrix.shape[1]
        # candidates: bins that never occur as a column index ...
        nan_bins_indices = np.setdiff1d(np.arange(shape), matrix.indices)
        # ... and that also have no stored value in their own row. The number
        # of stored values per row is the difference of consecutive indptr
        # entries, which avoids slicing the matrix once per candidate.
        stored_per_row = np.diff(matrix.indptr)
        nan_bins = nan_bins_indices[stored_per_row[nan_bins_indices] == 0]
    except Exception:
        nan_bins = None

    distance_counts = None

    return matrix, cut_intervals, nan_bins, distance_counts, correction_factors
def load(self):
    """
    Load a Hi-C matrix stored in cool format from ``self.matrixFileName``.

    If ``self.chrnameList`` is None the whole matrix is read in chunks of
    pixels; if it holds exactly one region only that region is fetched.
    ``self.minValue``/``self.maxValue`` are set to the range of the raw
    counts. If ``self.applyCorrectionLoad`` is set and the bin table contains
    the column ``self.correctionFactorTable``, the correction factors are
    applied with ``self.correctionOperator``; afterwards the values are
    rescaled to the raw range if the corrected maximum is below 1 or differs
    from the raw maximum by more than one order of magnitude.

    The process is terminated via ``exit()`` if the file cannot be opened,
    the region has the wrong format or more than one region is requested.

    :return: matrix, cut_intervals, nan_bins, distance_counts, correction_factors
             (``distance_counts`` is always None, ``nan_bins`` is None if it
             could not be computed).
    """
    import re  # local import: only needed to parse the hic2cool version

    log.debug('Load in cool format')
    self.minValue = None
    self.maxValue = None
    if self.matrixFileName is None:
        log.info('No matrix is initialized')
    try:
        cooler_file = cooler.Cooler(self.matrixFileName)
        if 'metadata' in cooler_file.info:
            self.hic_metadata = cooler_file.info['metadata']
        else:
            self.hic_metadata = None
        self.cool_info = deepcopy(cooler_file.info)
    except Exception:
        log.info("Could not open cooler file. Maybe the path is wrong or the given node is not available.")
        log.info('The following file was tried to open: {}'.format(self.matrixFileName))
        log.info("The following nodes are available: {}".format(cooler.fileops.list_coolers(self.matrixFileName.split("::")[0])))
        exit()

    log.debug('self.chrnameList {}'.format(self.chrnameList))
    if self.chrnameList is None:
        matrixDataFrame = cooler_file.matrix(balance=False, sparse=True, as_pixels=True)
        nbins = cooler_file.info['nbins']
        nnz = cooler_file.info['nnz']

        # bin ids only need 64 bit if there are more bins than int32 can hold
        used_dtype = np.int32
        if np.iinfo(np.int32).max < nbins:
            used_dtype = np.int64
        count_dtype = matrixDataFrame[0]['count'].dtype
        data = np.empty(nnz, dtype=count_dtype)
        instances = np.empty(nnz, dtype=used_dtype)
        features = np.empty(nnz, dtype=used_dtype)

        # read the pixel table in roughly 32 row chunks to bound peak memory
        i = 0
        size = nbins // 32
        if size == 0:
            size = 1
        start_pos = 0
        while i < nbins:
            matrixDataFrameChunk = matrixDataFrame[i:i + size]
            _data = matrixDataFrameChunk['count'].values.astype(count_dtype)
            _instances = matrixDataFrameChunk['bin1_id'].values.astype(used_dtype)
            _features = matrixDataFrameChunk['bin2_id'].values.astype(used_dtype)

            data[start_pos:start_pos + len(_data)] = _data
            instances[start_pos:start_pos + len(_instances)] = _instances
            features[start_pos:start_pos + len(_features)] = _features
            start_pos += len(_features)
            i += size
            del _data
            del _instances
            del _features

        # builtin int replaces np.int, which was removed in NumPy 1.24
        matrix = csr_matrix((data, (instances, features)),
                            shape=(int(nbins), int(nbins)), dtype=count_dtype)
        self.minValue = data.min()
        self.maxValue = data.max()

        del data
        del instances
        del features
    else:
        if len(self.chrnameList) == 1:
            try:
                log.debug('Load data')
                matrix = cooler_file.matrix(balance=False, sparse=True).fetch(self.chrnameList[0]).tocsr()
                # handle the case of an empty csr matrix
                if len(matrix.data) == 0:
                    self.minValue = 0
                    self.maxValue = 0
                else:
                    self.minValue = matrix.data.min()
                    self.maxValue = matrix.data.max()
            except ValueError:
                exit("Wrong chromosome format. Please check UCSC / ensembl notation.")
        else:
            exit("Operation to load more as one region is not supported.")

    cut_intervals_data_frame = None
    correction_factors_data_frame = None

    if self.chrnameList is not None:
        if len(self.chrnameList) == 1:
            cut_intervals_data_frame = cooler_file.bins().fetch(self.chrnameList[0])

            if self.correctionFactorTable in cut_intervals_data_frame:
                correction_factors_data_frame = cut_intervals_data_frame[self.correctionFactorTable]
        else:
            exit("Operation to load more than one chr from bins is not supported.")
    else:
        if self.applyCorrectionLoad and self.correctionFactorTable in cooler_file.bins():
            correction_factors_data_frame = cooler_file.bins()[[self.correctionFactorTable]][:]

        cut_intervals_data_frame = cooler_file.bins()[['chrom', 'start', 'end']][:]

    correction_factors = None
    if correction_factors_data_frame is not None and self.applyCorrectionLoad:
        # apply correction factors to matrix
        # a_i,j = a_i,j * c_i *c_j
        matrix.eliminate_zeros()
        # matrices with at most one stored value are left uncorrected
        if len(matrix.data) > 1:
            matrix.data = matrix.data.astype(float)

            correction_factors = convertNansToOnes(np.array(correction_factors_data_frame.values).flatten())
            # apply only if there are not only 1's
            if np.sum(correction_factors) != len(correction_factors):
                matrix.sort_indices()

                instances, features = matrix.nonzero()
                instances_factors = correction_factors[instances]
                features_factors = correction_factors[features]

                if self.correctionOperator is None:
                    if 'generated-by' in cooler_file.info:
                        log.debug('cooler_file.info[\'generated-by\'] {} {}'.format(cooler_file.info['generated-by'], type(cooler_file.info['generated-by'])))
                        generated_by = toString(cooler_file.info['generated-by'])
                        if 'hic2cool' in generated_by:
                            self.hic2cool_version = generated_by.split('-')[1]
                            # Compare major/minor numerically: as strings
                            # '0.10' would sort before '0.5'.
                            version_numbers = tuple(
                                int(number) for number in re.findall(r'\d+', self.hic2cool_version)[:2])
                            if version_numbers >= (0, 5):
                                log.debug('0.5')
                                self.correctionOperator = '/'
                            else:
                                log.debug('0.4')
                                self.correctionOperator = '*'
                        else:
                            self.correctionOperator = '*'

                        log.debug('hic2cool: {}'.format(self.hic2cool_version))
                        log.debug('self.correctionOperator : {}'.format(self.correctionOperator))
                    else:
                        self.correctionOperator = '*'

                # fancy indexing returned a copy, so the in-place product
                # does not touch correction_factors itself
                instances_factors *= features_factors
                log.debug('hic2cool: {}'.format(self.hic2cool_version))
                log.debug('self.correctionOperator: {}'.format(self.correctionOperator))
                if self.correctionOperator == '*':
                    matrix.data *= instances_factors
                elif self.correctionOperator == '/':
                    matrix.data /= instances_factors

                min_value = matrix.data.min()
                max_value = matrix.data.max()
                # check if max smaller one or if not same magnitude
                if max_value < 1 or (np.absolute(int(math.log10(max_value)) - int(math.log10(self.maxValue))) > 1):
                    # min-max rescale the corrected values into the range
                    # of the raw counts
                    desired_range_difference = self.maxValue - self.minValue

                    matrix.data = (matrix.data - min_value)
                    matrix.data /= (max_value - min_value)
                    matrix.data *= desired_range_difference
                    matrix.data += self.minValue
                    self.scaleToOriginalRange = True

    cut_intervals = []
    time_start = time.time()
    log.debug('Creating cut_intervals {}'.format(time_start))
    for values in cut_intervals_data_frame.values:
        cut_intervals.append(tuple([toString(values[0]), values[1], values[2], 1.0]))
    log.debug('Creating cut_intervals {} DONE'.format(time.time() - time_start))
    del cut_intervals_data_frame
    del correction_factors_data_frame

    # try to restore nan_bins.
    try:
        shape = matrix.shape[0] if matrix.shape[0] < matrix.shape[1] else matrix.shape[1]
        nan_bins = np.arange(shape)
        # bins that never occur as a column index; all stored column indices
        # must be considered, dropping the last one can misreport a bin
        nan_bins = np.setdiff1d(nan_bins, matrix.indices)
    except Exception:
        nan_bins = None

    distance_counts = None

    return matrix, cut_intervals, nan_bins, distance_counts, correction_factors
def load(self, pApplyCorrection=None, pMatrixOnly=None):
    """
    Load a Hi-C matrix stored in cool format from ``self.matrixFileName``.

    If ``self.chrnameList`` is None the whole matrix is read in chunks of
    pixels; if it holds exactly one region only that region is fetched.
    If correction is requested and the bin table contains the column
    ``self.correctionFactorTable``, the factors (NaNs replaced by 1) are
    applied with ``self.correctionOperator``.

    The process is terminated via ``exit()`` if the file cannot be opened,
    the region has the wrong format or more than one region is requested.

    :param pApplyCorrection: apply the stored correction factors; None is
                             treated as True.
    :param pMatrixOnly: accepted for interface compatibility, not used here.
    :return: matrix, cut_intervals, nan_bins, distance_counts, correction_factors
             (``distance_counts`` is always None, ``nan_bins`` is None if it
             could not be computed).
    """
    log.debug('Load in cool format')
    log.debug('self.chrnameList {}'.format(self.chrnameList))
    if self.matrixFileName is None:
        log.info('No matrix is initalized')

    if pApplyCorrection is None:
        pApplyCorrection = True
    try:
        cooler_file = cooler.Cooler(self.matrixFileName)
    except Exception:
        log.info(
            "Could not open cooler file. Maybe the path is wrong or the given node is not available."
        )
        log.info('The following file was tried to open: {}'.format(
            self.matrixFileName))
        log.info("The following nodes are available: {}".format(
            cooler.io.ls(self.matrixFileName.split("::")[0])))
        exit()

    if self.chrnameList is None:
        matrixDataFrame = cooler_file.matrix(balance=False,
                                             sparse=True,
                                             as_pixels=True)
        nbins = cooler_file.info['nbins']
        nnz = cooler_file.info['nnz']

        # bin ids only need 64 bit if there are more bins than int32 can hold
        used_dtype = np.int32
        if np.iinfo(np.int32).max < nbins:
            used_dtype = np.int64

        # Counts keep the dtype stored in the file; forcing them into the
        # bin-index dtype would truncate non-integer counts.
        count_dtype = matrixDataFrame[0]['count'].dtype
        data = np.empty(nnz, dtype=count_dtype)
        instances = np.empty(nnz, dtype=used_dtype)
        features = np.empty(nnz, dtype=used_dtype)

        # read the pixel table in roughly 32 row chunks to bound peak memory
        i = 0
        size = nbins // 32
        if size == 0:
            size = 1
        start_pos = 0
        while i < nbins:
            pixel_chunk = matrixDataFrame[i:i + size]
            chunk_length = len(pixel_chunk)
            end_pos = start_pos + chunk_length
            data[start_pos:end_pos] = pixel_chunk['count'].values
            instances[start_pos:end_pos] = pixel_chunk['bin1_id'].values
            features[start_pos:end_pos] = pixel_chunk['bin2_id'].values
            start_pos = end_pos
            del pixel_chunk
            i += size

        matrix = csr_matrix((data, (instances, features)),
                            shape=(nbins, nbins),
                            dtype=count_dtype)
        del data
        del instances
        del features
    else:
        if len(self.chrnameList) == 1:
            try:
                matrix = cooler_file.matrix(
                    balance=False,
                    sparse=True).fetch(self.chrnameList[0]).tocsr()
            except ValueError:
                exit(
                    "Wrong chromosome format. Please check UCSC / ensembl notation."
                )
        else:
            exit("Operation to load more as one region is not supported.")

    cut_intervals_data_frame = None
    correction_factors_data_frame = None

    if self.chrnameList is not None:
        if len(self.chrnameList) == 1:
            cut_intervals_data_frame = cooler_file.bins().fetch(
                self.chrnameList[0])

            if self.correctionFactorTable in cut_intervals_data_frame:
                correction_factors_data_frame = cut_intervals_data_frame[
                    self.correctionFactorTable]
        else:
            exit(
                "Operation to load more than one chr from bins is not supported."
            )
    else:
        if pApplyCorrection and self.correctionFactorTable in cooler_file.bins():
            correction_factors_data_frame = cooler_file.bins()[[
                self.correctionFactorTable
            ]][:]

        cut_intervals_data_frame = cooler_file.bins()[[
            'chrom', 'start', 'end'
        ]][:]

    correction_factors = None
    if correction_factors_data_frame is not None and pApplyCorrection:
        log.debug("Apply correction factors")
        # apply correction factors to matrix
        # a_i,j = a_i,j * c_i *c_j
        matrix.eliminate_zeros()
        matrix.data = matrix.data.astype(float)

        correction_factors = convertNansToOnes(
            np.array(correction_factors_data_frame.values).flatten())
        # apply only if there are not only 1's
        if np.sum(correction_factors) != len(correction_factors):
            instances, features = matrix.nonzero()
            instances_factors = correction_factors[instances]
            features_factors = correction_factors[features]
            # fancy indexing returned a copy, so the in-place product does
            # not touch correction_factors itself
            instances_factors *= features_factors

            # NOTE(review): any other operator value (e.g. None) leaves the
            # matrix uncorrected although correction_factors is returned.
            if self.correctionOperator == '*':
                matrix.data *= instances_factors
            elif self.correctionOperator == '/':
                matrix.data /= instances_factors

    cut_intervals = []
    for values in cut_intervals_data_frame.values:
        cut_intervals.append(
            tuple([toString(values[0]), values[1], values[2], 1.0]))

    # try to restore nan_bins.
    try:
        shape = matrix.shape[0] if matrix.shape[0] < matrix.shape[1] else matrix.shape[1]
        # bins below the smaller matrix dimension that never occur as a
        # column index
        nan_bins = np.setdiff1d(np.arange(shape), matrix.indices)
    except Exception:
        nan_bins = None

    distance_counts = None

    # matrix = hiCMatrix.fillLowerTriangle(matrix)

    return matrix, cut_intervals, nan_bins, distance_counts, correction_factors