def create_zarr_count_assay(z: zarr.Group, assay_name: str, chunk_size: Tuple[int, int], n_cells: int, feat_ids: Union[np.ndarray, List[str]], feat_names: Union[np.ndarray, List[str]], dtype: str = 'uint32') -> zarr.hierarchy:
    """
    Create an assay group under ``z`` and return its freshly allocated 'counts' array.

    The group is created (overwriting any existing group of the same name) with
    ``is_assay``/``misc`` attributes, feature ids/names are written under
    ``featureData``, and a boolean validity vector ``featureData/I`` is
    initialized to all-True.

    Args:
        z (zarr.Group): Parent group to create the assay under.
        assay_name (str): Name of the new assay group.
        chunk_size (Tuple[int, int]): Chunk shape for the counts array.
        n_cells (int): Number of rows (cells) in the counts array.
        feat_ids (Union[np.ndarray, List[str]]): Feature identifiers; also fixes the column count.
        feat_names (Union[np.ndarray, List[str]]): Human-readable feature names.
        dtype (str): Element dtype of the counts array. Defaults to 'uint32'.

    Returns:
        The newly created 'counts' Zarr array of shape (n_cells, len(feat_ids)).
    """
    assay_grp = z.create_group(assay_name, overwrite=True)
    assay_grp.attrs['is_assay'] = True
    assay_grp.attrs['misc'] = {}
    n_feats = len(feat_ids)
    create_zarr_obj_array(assay_grp, 'featureData/ids', feat_ids)
    create_zarr_obj_array(assay_grp, 'featureData/names', feat_names)
    # Every feature starts out marked valid.
    create_zarr_obj_array(assay_grp, 'featureData/I', [True] * n_feats, 'bool')
    return create_zarr_dataset(assay_grp, 'counts', chunk_size, dtype, (n_cells, n_feats), overwrite=True)
def write_csr(self, group: zarr.Group, name: str, matrix: csr_matrix) -> None:
    """
    Persist a scipy CSR matrix as a Zarr subgroup named ``name`` under ``group``.

    The subgroup is tagged with ``data_type='csr_matrix'`` and the logical
    matrix shape, and the three CSR component arrays (``data``, ``indices``,
    ``indptr``) are each stored as a compressed, chunked dataset.
    """
    dest = group.create_group(name, overwrite=True)
    dest.attrs.update(data_type='csr_matrix', shape=matrix.shape)
    # The three component arrays are written identically, so loop instead of
    # repeating the create_dataset call three times.
    for part in ('data', 'indices', 'indptr'):
        arr = getattr(matrix, part)
        dest.create_dataset(
            part,
            data=arr,
            shape=arr.shape,
            chunks=calc_chunk(arr.shape),
            dtype=arr.dtype,
            compressor=COMPRESSOR,
            overwrite=True,
        )
def write_dataframe(self, group: zarr.Group, name: str, df: Union[pd.DataFrame, np.recarray]) -> None:
    """
    Persist a pandas DataFrame or numpy record array as a Zarr subgroup.

    Each column is written via ``self.write_series``. DataFrames additionally
    get their index written under ``'_index'`` and an empty ``'_categories'``
    subgroup reserved for categorical-column keys. The subgroup's attributes
    record the container type, the column names and (for DataFrames) the
    index name.
    """
    is_frame = isinstance(df, pd.DataFrame)
    data_type = 'data_frame' if is_frame else 'record_array'
    dest = group.create_group(name, overwrite=True)
    columns = list(df.columns) if is_frame else list(df.dtype.names)
    attrs = {'data_type': data_type, 'columns': columns}
    if is_frame:
        attrs['index_name'] = df.index.name if df.index.name is not None else 'index'
        # Group reserved for storing category keys of categorical columns.
        dest.create_group('_categories', overwrite=True)
        self.write_series(dest, '_index', df.index.values, data_type)
    for col in columns:
        values = df[col].values if is_frame else df[col]
        self.write_series(dest, col, values, data_type)
    dest.attrs.update(**attrs)
def conv_chrom(fname: str, block_size: int, root: Group, chrom: int) -> None:
    """
    Convert one chromosome's rows of a TPED-style text file into Zarr arrays.

    Two passes over ``fname`` (which is assumed sorted by chromosome): the
    first counts the rows for ``chrom`` so the 'calls' array can be allocated
    with its final shape; the second encodes each row's per-sample genotype
    calls and writes them in blocks of ``block_size`` rows.

    Args:
        fname: Path to the space-separated input file; column 0 is the
            chromosome, column 3 the position, columns 4+ the calls
            (two per sample).
        block_size: Number of rows to buffer before flushing to 'calls'.
        root: Zarr group under which 'chromosome-<chrom>' is created.
        chrom: Chromosome number to extract.

    Returns:
        None. Side effect: creates 'calls', 'positions' and 'alleles'
        arrays under root['chromosome-<chrom>'].
    """
    # First pass: count rows for this chromosome (file is sorted by chrom,
    # so we can stop at the first larger chromosome).
    num_positions = 0
    with open(fname) as tfam:
        for line in tfam:
            l_chrom = int(line.rstrip().split(' ')[0])
            if l_chrom < chrom:
                continue
            if l_chrom > chrom:
                break
            num_positions += 1

    chrom_group = root.create_group(f'chromosome-{chrom}')
    all_calls = chrom_group.zeros('calls', shape=(num_positions, NUM_SAMPLES), dtype='B')
    block = []
    all_positions = []
    all_alleles = []
    current_position = 0
    # Second pass: fix — use a context manager so the handle is closed
    # (the original left this open(fname) unclosed).
    with open(fname) as tfam:
        for line in tfam:
            tokens = line.rstrip().split(' ')
            l_chrom = int(tokens[0])
            if l_chrom < chrom:
                continue
            if l_chrom > chrom:
                break
            all_positions.append(int(tokens[3]))
            calls = tokens[4:]
            # Fix: derive the allele set from ALL calls — the original used
            # calls[4:], double-applying the tokens[4:] slice and silently
            # ignoring the first two samples' calls. '0' means "missing".
            # Sorted so the allele string (and hence the encoding) is
            # deterministic across runs despite hash randomization.
            alleles = ''.join(sorted(set(calls) - {'0'}))
            if len(alleles) == 1:
                # Monomorphic site: duplicate the single allele so the
                # string always has length 2.
                alleles += alleles
            all_alleles.append(alleles)
            sample_calls = np.empty(shape=NUM_SAMPLES, dtype='B')
            for sample in range(NUM_SAMPLES):
                a1, a2 = calls[2 * sample: 2 * sample + 2]
                try:
                    sample_calls[sample] = encode_alleles(a1, a2, alleles)
                except Exception:
                    # Report the offending coordinates before propagating.
                    print(chrom, current_position, sample)
                    raise
            block.append(sample_calls)
            current_position += 1
            if current_position % block_size == 0:
                all_calls[current_position - block_size:current_position, :] = block
                block = []
    # Flush the final partial block, if any.
    if block:
        all_calls[-len(block):, :] = block
    chrom_group.array('positions', all_positions)
    chrom_group.array('alleles', all_alleles)