def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):
    """Write all held single-cell matrices into one scool file.

    Two data sources are supported:

    * ``self.coolObjectsList`` is not ``None``: every object's
      ``matrixFile.create_cooler_input`` is called and its bins/pixel
      tables are stored under ``matrixFile.matrixFileName``. The pixel
      dtypes returned for the last object are the ones passed on.
    * otherwise: ``self.pixel_list``, ``self.name_list`` and ``self.bins``
      are used; every cell shares ``self.bins`` and the ``count`` dtype is
      taken from the first pixel table.

    Parameters
    ----------
    pFileName : str
        Path of the scool file to create. Its directory is also handed to
        cooler as ``temp_dir``.
    pSymmetric : bool
        Forwarded to ``create_cooler_input``.
    pApplyCorrection : bool
        Forwarded to ``create_cooler_input``.
    """
    pixel_dict = {}
    bins_dict = {}
    # Bind up front: with an empty ``coolObjectsList`` or a failure in the
    # fallback branch the name was never assigned, and the call at the
    # bottom died with an unrelated UnboundLocalError. ``None`` is the
    # default of cooler's ``dtypes`` argument.
    dtype_pixel = None

    if self.coolObjectsList is not None:
        for coolObject in self.coolObjectsList:
            matrix_file = coolObject.matrixFile
            bins_data_frame, matrix_data_frame, dtype_pixel, _info = \
                matrix_file.create_cooler_input(pSymmetric=pSymmetric,
                                                pApplyCorrection=pApplyCorrection)
            bins_dict[matrix_file.matrixFileName] = bins_data_frame
            pixel_dict[matrix_file.matrixFileName] = matrix_data_frame
    else:
        # Best effort, as before: problems are logged and whatever was
        # collected so far is still handed to cooler.
        try:
            dtype_pixel = {'bin1_id': np.int32,
                           'bin2_id': np.int32,
                           'count': self.pixel_list[0]['count'].dtype}
            for i, pixels in enumerate(self.pixel_list):
                cell_name = self.name_list[i]
                bins_dict[cell_name] = self.bins
                pixel_dict[cell_name] = pixels
                log.debug('self.name_list[i] {}'.format(cell_name))
        except Exception as exp:
            log.debug('Exception {}'.format(str(exp)))

    # Use the directory of the output file for cooler's temporary files.
    local_temp_dir = os.path.dirname(os.path.realpath(pFileName))
    cooler.create_scool(cool_uri=pFileName,
                        bins=bins_dict,
                        cell_name_pixels_dict=pixel_dict,
                        dtypes=dtype_pixel,
                        ordered=True,
                        temp_dir=local_temp_dir)
def test_create_scool(fp):
    """Check that ``cooler.create_scool`` writes one group per cell.

    The scool file is created twice from the cooler at ``fp``: first with a
    per-cell bins dict, then with a single shared bins table. After each
    write the three expected ``/cells/<name>`` groups must be listed.
    """
    from copy import deepcopy

    source = cooler.Cooler(fp)
    bins = source.bins()[:]
    pixels = source.pixels()[:]

    # Give every cell different 'weight' and 'KR' columns to prove that only
    # chrom, start, end are linked and the rest is independent for each cell.
    n_bins = len(bins["start"])
    name_bins_dict = {}
    name_pixel_dict = {}
    for offset, cell_name in enumerate(['cell1', 'cell2', 'cell3']):
        cell_bins = deepcopy(bins)
        cell_bins['weight'] = np.array([offset] * n_bins)
        cell_bins['KR'] = np.array([offset + 3] * n_bins)
        name_bins_dict[cell_name] = cell_bins
        name_pixel_dict[cell_name] = pixels

    with isolated_filesystem():
        # Variant 1: one bins table per cell.
        cooler.create_scool('outfile_test.scool', name_bins_dict, name_pixel_dict)
        listed = cooler.fileops.list_scool_cells('outfile_test.scool')
        expected = ['/cells/cell1', '/cells/cell2', '/cells/cell3']
        for group in expected:
            assert group in listed

        # Variant 2: a single bins table shared by all cells.
        cooler.create_scool('outfile_test.scool', bins, name_pixel_dict)
        listed = cooler.fileops.list_scool_cells('outfile_test.scool')
        expected = ['/cells/cell1', '/cells/cell2', '/cells/cell3']
        for group in expected:
            assert group in listed
def scool_raw(config):
    """Export the raw per-cell contact maps to ``<temp_dir>/raw.scool``.

    For every chromosome in ``config['chrom_list']`` the file
    ``<temp_dir>/raw/<chrom>_sparse_adj.npy`` is loaded; entry ``i`` of the
    loaded array is treated as the matrix of cell ``i``. Only entries with
    ``bin2_id >= bin1_id`` are kept, and bin ids are shifted by the number
    of bins of all previously processed chromosomes so they index one
    genome-wide bin table built at ``config['resolution']``.

    Parameters
    ----------
    config : dict
        Must provide ``chrom_list``, ``temp_dir`` and ``resolution``.
    """
    import cooler

    chrom_list = config['chrom_list']
    temp_dir = config['temp_dir']
    raw_dir = os.path.join(temp_dir, "raw")
    res = config['resolution']

    off_set = 0
    bins_chrom = []
    bins_start = []
    bins_end = []
    # cell name -> list of per-chromosome pixel frames, concatenated once at
    # the end instead of re-copying the growing table for every chromosome.
    frames_per_cell = {}

    for chrom in chrom_list:
        # NOTE(review): allow_pickle=True executes pickled content; only
        # safe because the file is presumably written by this pipeline.
        origin_sparse = np.load(os.path.join(raw_dir, "%s_sparse_adj.npy" % chrom),
                                allow_pickle=True)
        size = origin_sparse[0].shape[0]
        starts = np.arange(size) * res
        bins_chrom += [chrom] * size
        bins_start.append(starts)
        bins_end.append(starts + res)

        for i, cell_matrix in enumerate(origin_sparse):
            xs, ys = cell_matrix.nonzero()
            v = np.array(cell_matrix.data).reshape((-1))
            mask = ys >= xs  # keep the upper triangle, diagonal included
            temp = pd.DataFrame(
                {'bin1_id': xs[mask] + off_set,
                 'bin2_id': ys[mask] + off_set,
                 'count': v[mask]})
            frames_per_cell.setdefault('cell_%d' % i, []).append(temp)
        off_set += size

    cell_name_pixels_dict = {
        cell_name: frames[0] if len(frames) == 1 else pd.concat(frames, axis=0)
        for cell_name, frames in frames_per_cell.items()
    }

    print("Start creating scool")
    bins = pd.DataFrame(
        {'chrom': bins_chrom,
         'start': np.concatenate(bins_start),
         'end': np.concatenate(bins_end)})
    cooler.create_scool(os.path.join(temp_dir, "raw.scool"), bins,
                        cell_name_pixels_dict,
                        dtypes={'count': 'float32'}, ordered=True)
def scool_rwr(config):
    """Export the imputed contact maps to ``<temp_dir>/rw/rw_impute.scool``.

    For every chromosome in ``config['impute_list']`` this opens
    ``<temp_dir>/rw/rw_<chrom>.hdf5``, reads its ``coordinates`` dataset and
    records, per chromosome, the bin count, the mask bounds returned by
    ``skip_start_end``, the open HDF5 handle, the coordinate columns, a
    zeroed ``size x size`` buffer and the genome-wide bin offset. That
    bookkeeping is wrapped in a ``HigashiDict`` which is passed to
    ``cooler.create_scool`` as the per-cell pixel source.

    Parameters
    ----------
    config : dict
        Must provide ``impute_list``, ``temp_dir`` and ``resolution``; it is
        also forwarded to ``skip_start_end``.
    """
    import cooler
    from Higashi2Scool import HigashiDict, skip_start_end

    chrom_list = config['impute_list']
    temp_dir = config['temp_dir']
    raw_dir = os.path.join(temp_dir, "raw")
    rw_dir = os.path.join(temp_dir, "rw")
    res = config['resolution']

    chrom2info = {}
    off_set = 0
    bins_chrom = []
    bins_start = []
    bins_end = []
    cell_list = []
    # The handles stay open while the scool is written because they are
    # handed to HigashiDict; they are closed afterwards (previously they
    # were never closed).
    opened_files = []

    try:
        for chrom_index, chrom in enumerate(chrom_list):
            impute_f = h5py.File(os.path.join(rw_dir, "rw_%s.hdf5" % (chrom)), "r")
            opened_files.append(impute_f)

            # The raw matrices are loaded only to learn the number of bins.
            origin_sparse = np.load(os.path.join(raw_dir, "%s_sparse_adj.npy" % chrom),
                                    allow_pickle=True)
            size = origin_sparse[0].shape[0]
            mask_start, mask_end = skip_start_end(config, chrom)
            del origin_sparse

            coordinates = np.array(impute_f['coordinates']).astype('int')
            xs, ys = coordinates[:, 0], coordinates[:, 1]
            m1 = np.zeros((size, size))
            chrom2info[chrom] = [size, mask_start, mask_end, impute_f, xs, ys, m1, off_set]
            off_set += size

            starts = np.arange(size) * res
            bins_chrom += [chrom] * size
            bins_start.append(starts)
            bins_end.append(starts + res)

            if chrom_index == 0:
                # One top-level key is not a cell (presumably 'coordinates'
                # -- TODO confirm), hence the "- 1".
                n_cells = len(impute_f.keys()) - 1
                cell_list.extend("cell_%d" % i for i in range(n_cells))

        bins = pd.DataFrame({'chrom': bins_chrom,
                             'start': np.concatenate(bins_start),
                             'end': np.concatenate(bins_end)})
        cell_name_pixels_dict = HigashiDict(chrom2info, cell_list, chrom_list)

        print("Start creating scool")
        cooler.create_scool(os.path.join(rw_dir, "rw_impute.scool"), bins,
                            cell_name_pixels_dict,
                            dtypes={'count': 'float32'}, ordered=True)
    finally:
        for handle in opened_files:
            handle.close()
def generate_scool_single_resolution(cell_path_dict, chrom_size_path, resolution,
                                     output_path, batch_n=20, cpu=1):
    """Build one scool file at a single resolution from many cells.

    Cells are split into batches of ``batch_n``. Each batch is submitted to
    ``generate_scool_batch_data`` in a process pool with
    ``output_path + '_<batch>'`` as its output; as each batch finishes, that
    file is read as a pandas HDF store, its tables are appended to the scool
    file at ``output_path`` and the batch file is deleted.

    Parameters
    ----------
    cell_path_dict : dict
        Maps cell id to the path of that cell's input.
    chrom_size_path : str
        Tab-separated file without header; the first column is used as the
        index (chromosome name), the remaining column as the size.
    resolution : int
        Forwarded to the bin/offset helpers and to the batch worker.
    output_path : str
        Path of the scool file; also the prefix of the batch files.
    batch_n : int
        Number of cells per batch.
    cpu : int
        Number of worker processes.
    """
    # parse chromosome sizes, prepare bin_df
    # ``read_csv(squeeze=True)`` was removed in pandas 2.0; squeezing the
    # resulting frame gives the same Series on old and new versions.
    chrom_sizes = pd.read_csv(chrom_size_path,
                              header=None,
                              index_col=0,
                              sep='\t').squeeze('columns').sort_index()
    chrom_offset = get_chrom_offsets(chrom_sizes, resolution)
    bins_df = generate_bins_df_from_chrom_sizes(chrom_sizes, resolution)

    chunk_dicts = defaultdict(dict)
    for i, (cell, path) in enumerate(cell_path_dict.items()):
        chunk_dicts[i // batch_n][cell] = path

    with ProcessPoolExecutor(cpu) as exe:
        futures = {}
        # A distinct loop name keeps the ``cell_path_dict`` argument intact.
        for batch, batch_cells in chunk_dicts.items():
            batch_output = output_path + f'_{batch}'
            f = exe.submit(generate_scool_batch_data,
                           cell_path_dict=batch_cells,
                           resolution=resolution,
                           chrom_offset=chrom_offset,
                           output_path=batch_output)
            futures[f] = batch_output

        for future in as_completed(futures):
            # batch finished; result() re-raises any worker exception
            batch_output = futures[future]
            future.result()

            # dump batch result into scool
            cell_pixel_dict = {}
            with pd.HDFStore(batch_output, mode='r') as hdf:
                for key in hdf.keys():
                    cell_id = key[1:]  # remove '/'
                    cell_pixel_dict[cell_id] = hdf[cell_id]
            # mode='a' appends the batch to the existing output file.
            create_scool(output_path,
                         bins=bins_df,
                         cell_name_pixels_dict=cell_pixel_dict,
                         ordered=True,
                         mode='a')
            subprocess.run(['rm', '-f', batch_output], check=True)
A table, given as a dataframe or a column-oriented dict, containing columns labeled bin1_id, bin2_id and count, sorted by (bin1_id, bin2_id). If additional columns are included in the pixel table, their names and dtypes must be specified using the columns and dtypes arguments. For larger input data, an iterable can be provided that yields the pixel data as a sequence of chunks. If the input is a dask DataFrame, it will also be processed one chunk at a time. columns (sequence of str, optional) – Customize which value columns from the input pixels to store in the cooler. Non-standard value columns will be given dtype float64 unless overridden using the dtypes argument. If None, we only attempt to store a value column named "count". dtypes (dict, optional) – Dictionary mapping column names to dtypes. Can be used to override the default dtypes of bin1_id, bin2_id or count or assign dtypes to custom value columns. Non-standard value columns given in dtypes must also be provided in the columns argument or they will be ignored. metadata (dict, optional) – Experiment metadata to store in the file. Must be JSON compatible. assembly (str, optional) – Name of genome assembly. ordered (bool, optional [default: False]) – If the input chunks of pixels are provided with correct triangularity and in ascending order of (bin1_id, bin2_id), set this to True to write the cooler in one step. If False (default), we create the cooler in two steps using an external sort mechanism. See Notes for more details. symmetric_upper (bool, optional [default: True]) – If True, sets the file’s storage-mode property to symmetric-upper: use this only if the input data references the upper triangle of a symmetric matrix! For all other cases, set this option to False. mode ({'w', 'a'}, optional [default: 'w']) – Write mode for the output file. ‘a’: if the output file exists, append the new cooler to it. ‘w’: if the output file exists, it will be truncated. Default is ‘w’. 
''' cooler.create_scool(os.path.join( temp_dir, "nbr_%d_impute.scool" % (neighbor_num if args.neighbor else 0)), bins, cell_name_pixels_dict, dtypes={'count': 'float32'}, ordered=True)