Example #1
    def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):
        """Write all cells of this object to a single scool file."""
        pixel_dict = {}
        bins_dict = {}

        if self.coolObjectsList is not None:
            # Collect the per-cell bins and pixel tables from the cool objects.
            for coolObject in self.coolObjectsList:
                bins_data_frame, matrix_data_frame, dtype_pixel, info = coolObject.matrixFile.create_cooler_input(
                    pSymmetric=pSymmetric, pApplyCorrection=pApplyCorrection)
                bins_dict[coolObject.matrixFile.matrixFileName] = bins_data_frame
                pixel_dict[coolObject.matrixFile.matrixFileName] = matrix_data_frame
        else:
            try:
                dtype_pixel = {'bin1_id': np.int32, 'bin2_id': np.int32, 'count': self.pixel_list[0]['count'].dtype}

                # All cells share the same bins; the pixel tables are given per cell.
                for i, pixels in enumerate(self.pixel_list):
                    bins_dict[self.name_list[i]] = self.bins
                    pixel_dict[self.name_list[i]] = pixels
                    log.debug('self.name_list[i] {}'.format(self.name_list[i]))
            except Exception as exp:
                log.debug('Exception {}'.format(str(exp)))

        # Let cooler write its temporary files next to the output file.
        local_temp_dir = os.path.dirname(os.path.realpath(pFileName))

        cooler.create_scool(cool_uri=pFileName, bins=bins_dict, cell_name_pixels_dict=pixel_dict,
                            dtypes=dtype_pixel,
                            ordered=True,
                            temp_dir=local_temp_dir)
Example #2
def test_create_scool(fp):
    c = cooler.Cooler(fp)
    # chromsizes = c.chromsizes
    bins = c.bins()[:]
    pixels = c.pixels()[:]

    # Give each cell different bin annotations to show that only chrom, start and end
    # are shared between cells; all other bin columns are stored independently per cell.
    from copy import deepcopy
    bins_cell1 = deepcopy(bins)
    bins_cell2 = deepcopy(bins)
    bins_cell3 = deepcopy(bins)
    bins_cell1['weight'] = np.array([0] * len(bins_cell1["start"]))
    bins_cell2['weight'] = np.array([1] * len(bins_cell1["start"]))
    bins_cell3['weight'] = np.array([2] * len(bins_cell1["start"]))

    bins_cell1['KR'] = np.array([3] * len(bins_cell1["start"]))
    bins_cell2['KR'] = np.array([4] * len(bins_cell1["start"]))
    bins_cell3['KR'] = np.array([5] * len(bins_cell1["start"]))

    name_pixel_dict = {'cell1': pixels, 'cell2': pixels, 'cell3': pixels}
    name_bins_dict = {'cell1': bins_cell1, 'cell2': bins_cell2, 'cell3': bins_cell3}

    with isolated_filesystem():
        cooler.create_scool('outfile_test.scool', name_bins_dict, name_pixel_dict)
        content_of_scool = cooler.fileops.list_scool_cells('outfile_test.scool')
        content_expected = ['/cells/cell1', '/cells/cell2', '/cells/cell3']
        for content in content_expected:
            assert content in content_of_scool

        # Passing a single bins table (instead of a per-cell dict) applies it to every cell.
        cooler.create_scool('outfile_test.scool', bins, name_pixel_dict)
        content_of_scool = cooler.fileops.list_scool_cells('outfile_test.scool')
        content_expected = ['/cells/cell1', '/cells/cell2', '/cells/cell3']
        for content in content_expected:
            assert content in content_of_scool
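A brief follow-up sketch (not part of the original test): assuming the outfile_test.scool written above is still on disk, individual cells can be opened through their scool cell URIs to confirm that the per-cell bin columns were stored independently.

import cooler

# Open two of the cells written by the test above via their scool cell URIs.
c1 = cooler.Cooler('outfile_test.scool::/cells/cell1')
c2 = cooler.Cooler('outfile_test.scool::/cells/cell2')

# chrom/start/end are shared, while 'weight' keeps the per-cell values set above (0 vs. 1).
assert (c1.bins()['weight'][:] == 0).all()
assert (c2.bins()['weight'][:] == 1).all()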
Example #3
def scool_raw(config):
	chrom_list = config['chrom_list']
	temp_dir = config['temp_dir']
	raw_dir = os.path.join(temp_dir, "raw")
	rw_dir = os.path.join(temp_dir, "rw")
	res = config['resolution']
	
	import cooler
	
	off_set = 0
	
	bins_chrom = []
	bins_start = []
	bins_end = []
	
	cell_list = []
	cell_name_pixels_dict = {}
	for chrom_index, chrom in enumerate(chrom_list):
		# Load the per-cell sparse contact matrices for this chromosome.
		origin_sparse = np.load(os.path.join(raw_dir, "%s_sparse_adj.npy" % chrom), allow_pickle=True)
		size = origin_sparse[0].shape[0]
		
		bins_chrom += [chrom] * size
		bins_start.append(np.arange(size) * res)
		bins_end.append(np.arange(size) * res + res)
		
		# Cell names only need to be generated once.
		if chrom_index == 0:
			for i in range(len(origin_sparse)):
				cell_list.append("cell_%d" % i)
		
		for i in range(len(origin_sparse)):
			xs, ys = origin_sparse[i].nonzero()
			v = np.array(origin_sparse[i].data).reshape((-1))
			
			# Keep the upper triangle only and shift bin ids by the chromosome offset.
			mask = ys >= xs
			temp = pd.DataFrame(
				{'bin1_id': xs[mask] + off_set, 'bin2_id': ys[mask] + off_set, 'count': v[mask]})
			if 'cell_%d' % i not in cell_name_pixels_dict:
				cell_name_pixels_dict['cell_%d' % i] = temp
			else:
				cell_name_pixels_dict['cell_%d' % i] = pd.concat([cell_name_pixels_dict['cell_%d' % i], temp], axis=0)
		off_set += size
	
	print("Start creating scool")
	
	bins = pd.DataFrame(
		{'chrom': bins_chrom, 'start': np.concatenate(bins_start), 'end': np.concatenate(bins_end)})
	cooler.create_scool(os.path.join(temp_dir, "raw.scool"), bins, cell_name_pixels_dict,
	                    dtypes={'count': 'float32'}, ordered=True)
Example #4
def scool_rwr(config):
	chrom_list = config['impute_list']
	temp_dir = config['temp_dir']
	raw_dir = os.path.join(temp_dir, "raw")
	rw_dir = os.path.join(temp_dir, "rw")
	res = config['resolution']
	
	import cooler
	from Higashi2Scool import HigashiDict, skip_start_end
	chrom2info = {}
	
	off_set = 0
	
	bins_chrom = []
	bins_start = []
	bins_end = []
	
	cell_list = []
	for chrom_index, chrom in enumerate(chrom_list):

		impute_f = h5py.File(os.path.join(rw_dir, "rw_%s.hdf5" % (chrom)), "r")
		
		origin_sparse = np.load(os.path.join(raw_dir, "%s_sparse_adj.npy" % chrom), allow_pickle=True)
		size = origin_sparse[0].shape[0]
		mask_start, mask_end = skip_start_end(config, chrom)
		del origin_sparse
		
		coordinates = np.array(impute_f['coordinates']).astype('int')
		xs, ys = coordinates[:, 0], coordinates[:, 1]
		# Preallocate a dense buffer and record per-chromosome info for later reconstruction.
		m1 = np.zeros((size, size))
		chrom2info[chrom] = [size, mask_start, mask_end, impute_f, xs, ys, m1, off_set]
		off_set += size
		bins_chrom += [chrom] * size
		bins_start.append(np.arange(size) * res)
		bins_end.append(np.arange(size) * res + res)
		
		if chrom_index == 0:
			# Every key in the imputation file except 'coordinates' corresponds to one cell.
			for i in range(len(list(impute_f.keys())) - 1):
				cell_list.append("cell_%d" % i)
	
	bins = pd.DataFrame({'chrom': bins_chrom, 'start': np.concatenate(bins_start), 'end': np.concatenate(bins_end)})
	cell_name_pixels_dict = HigashiDict(chrom2info, cell_list, chrom_list)
	
	print("Start creating scool")
	
	cooler.create_scool(os.path.join(rw_dir, "rw_impute.scool"), bins,
	                    cell_name_pixels_dict, dtypes={'count': 'float32'}, ordered=True)
Example #5
def generate_scool_single_resolution(cell_path_dict,
                                     chrom_size_path,
                                     resolution,
                                     output_path,
                                     batch_n=20,
                                     cpu=1):
    # parse chromosome sizes, prepare bin_df
    chrom_sizes = pd.read_csv(chrom_size_path,
                              header=None,
                              index_col=0,
                              sep='\t',
                              squeeze=True).sort_index()
    chrom_offset = get_chrom_offsets(chrom_sizes, resolution)
    bins_df = generate_bins_df_from_chrom_sizes(chrom_sizes, resolution)

    chunk_dicts = defaultdict(dict)
    for i, (cell, path) in enumerate(cell_path_dict.items()):
        chunk_dicts[i // batch_n][cell] = path

    with ProcessPoolExecutor(cpu) as exe:
        futures = {}
        for batch, batch_cell_path_dict in chunk_dicts.items():
            batch_output = output_path + f'_{batch}'
            f = exe.submit(generate_scool_batch_data,
                           cell_path_dict=batch_cell_path_dict,
                           resolution=resolution,
                           chrom_offset=chrom_offset,
                           output_path=batch_output)
            futures[f] = batch_output

        for future in as_completed(futures):
            # batch finished
            batch_output = futures[future]
            future.result()

            # dump batch result into scool
            cell_pixel_dict = {}
            with pd.HDFStore(batch_output, mode='r') as hdf:
                for cell_id in hdf.keys():
                    cell_id = cell_id[1:]  # remove '/'
                    cell_pixel_dict[cell_id] = hdf[cell_id]
            create_scool(output_path,
                         bins=bins_df,
                         cell_name_pixels_dict=cell_pixel_dict,
                         ordered=True,
                         mode='a')
            subprocess.run(['rm', '-f', batch_output], check=True)
    return
Example #6
	A table, given as a dataframe or a column-oriented dict, containing columns labeled
	bin1_id, bin2_id and count, sorted by (bin1_id, bin2_id). If additional columns are
	included in the pixel table, their names and dtypes must be specified using the
	columns and dtypes arguments. For larger input data, an iterable can be provided
	that yields the pixel data as a sequence of chunks. If the input is a dask
	DataFrame, it will also be processed one chunk at a time.
	
	columns (sequence of str, optional) – Customize which value columns from the input
	pixels to store in the cooler. Non-standard value columns will be given dtype
	float64 unless overridden using the dtypes argument. If None, we only attempt to
	store a value column named "count".
	
	dtypes (dict, optional) – Dictionary mapping column names to dtypes. Can be used to
	override the default dtypes of bin1_id, bin2_id or count, or to assign dtypes to
	custom value columns. Non-standard value columns given in dtypes must also be
	provided in the columns argument or they will be ignored.
	
	metadata (dict, optional) – Experiment metadata to store in the file. Must be JSON
	compatible.
	
	assembly (str, optional) – Name of genome assembly.
	
	ordered (bool, optional [default: False]) – If the input chunks of pixels are
	provided with correct triangularity and in ascending order of (bin1_id, bin2_id),
	set this to True to write the cooler in one step. If False (default), we create the
	cooler in two steps using an external sort mechanism. See Notes for more details.
	
	symmetric_upper (bool, optional [default: True]) – If True, sets the file's
	storage-mode property to symmetric-upper: use this only if the input data
	references the upper triangle of a symmetric matrix! For all other cases, set this
	option to False.
	
	mode ({'w', 'a'}, optional [default: 'w']) – Write mode for the output file.
	'a': if the output file exists, append the new cooler to it.
	'w': if the output file exists, it will be truncated. Default is 'w'.
	'''

    cooler.create_scool(os.path.join(
        temp_dir,
        "nbr_%d_impute.scool" % (neighbor_num if args.neighbor else 0)),
                        bins,
                        cell_name_pixels_dict,
                        dtypes={'count': 'float32'},
                        ordered=True)
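The parameter notes above state that non-standard value columns must be named in columns and typed in dtypes. The following is a minimal, self-contained sketch of that combination; the toy bins/pixels data and the extra 'score' column are illustrative assumptions, not part of any example above.

import numpy as np
import pandas as pd
import cooler

# Four 10 kb bins on a single chromosome.
bins = pd.DataFrame({
    'chrom': ['chr1'] * 4,
    'start': np.arange(4) * 10000,
    'end': (np.arange(4) + 1) * 10000,
})

# Upper-triangle pixels, sorted by (bin1_id, bin2_id), with an extra value column 'score'.
pixels = pd.DataFrame({
    'bin1_id': [0, 0, 1],
    'bin2_id': [1, 2, 3],
    'count': [5, 2, 7],
    'score': [0.5, 0.2, 0.7],
})

cooler.create_scool(
    'toy.scool',
    bins,
    {'cell1': pixels, 'cell2': pixels},
    columns=['count', 'score'],                     # store the extra value column
    dtypes={'count': 'int32', 'score': 'float32'},  # assign dtypes to value columns
    ordered=True,                                   # input is already sorted upper-triangle data
)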