Exemplo n.º 1
0
def create_zarr_count_assay(z: zarr.Group,
                            assay_name: str,
                            chunk_size: Tuple[int, int],
                            n_cells: int,
                            feat_ids: Union[np.ndarray, List[str]],
                            feat_names: Union[np.ndarray, List[str]],
                            dtype: str = 'uint32') -> zarr.hierarchy:
    """
    Creates and returns a Zarr array with name 'counts'.

    Args:
        z (zarr.Group): parent group under which the assay group is created.
        assay_name (str): name of the new assay group (overwritten if present).
        chunk_size (Tuple[int, int]): chunk shape for the 'counts' dataset.
        n_cells (int): number of cells, i.e. rows of the counts matrix.
        feat_ids (Union[np.ndarray, List[str]]): feature identifiers; their
            count fixes the number of columns of the counts matrix.
        feat_names (Union[np.ndarray, List[str]]): feature display names.
        dtype (str = 'uint32'): numeric dtype of the counts dataset.

    Returns:
        A Zarr array.
    """
    assay_grp = z.create_group(assay_name, overwrite=True)
    assay_grp.attrs['is_assay'] = True
    assay_grp.attrs['misc'] = {}
    n_feats = len(feat_ids)
    create_zarr_obj_array(assay_grp, 'featureData/ids', feat_ids)
    create_zarr_obj_array(assay_grp, 'featureData/names', feat_names)
    # Boolean inclusion mask: every feature starts out marked as valid.
    create_zarr_obj_array(assay_grp, 'featureData/I', [True] * n_feats, 'bool')
    return create_zarr_dataset(assay_grp, 'counts', chunk_size, dtype,
                               (n_cells, n_feats), overwrite=True)
Exemplo n.º 2
0
 def write_csr(self, group: zarr.Group, name: str,
               matrix: csr_matrix) -> None:
     """Store a scipy CSR matrix under *group* as a sub-group named *name*.

     The sub-group is tagged with ``data_type='csr_matrix'`` and the matrix
     shape, and holds one compressed dataset per CSR component
     ('data', 'indices', 'indptr').
     """
     csr_grp = group.create_group(name, overwrite=True)
     csr_grp.attrs.update(data_type='csr_matrix', shape=matrix.shape)
     # The three CSR component arrays are written identically, so drive
     # the dataset creation from their attribute names.
     for part in ('data', 'indices', 'indptr'):
         arr = getattr(matrix, part)
         csr_grp.create_dataset(part,
                                data=arr,
                                shape=arr.shape,
                                chunks=calc_chunk(arr.shape),
                                dtype=arr.dtype,
                                compressor=COMPRESSOR,
                                overwrite=True)
Exemplo n.º 3
0
    def write_dataframe(self, group: zarr.Group, name: str,
                        df: Union[pd.DataFrame, np.recarray]) -> None:
        """Store a pandas DataFrame or numpy record array under *group*.

        A sub-group *name* is created and tagged with its data type and
        column list; DataFrames additionally get their index name, a
        '_categories' group (category keys for categorical columns) and an
        '_index' series. Every column is then written via ``write_series``.
        """
        is_df = isinstance(df, pd.DataFrame)
        data_type = 'data_frame' if is_df else 'record_array'
        columns = list(df.columns) if is_df else list(df.dtype.names)

        sub_group = group.create_group(name, overwrite=True)
        attrs = {'data_type': data_type, 'columns': columns}
        if is_df:
            attrs['index_name'] = 'index' if df.index.name is None else df.index.name
            # Group that stores category keys for categorical columns.
            sub_group.create_group('_categories', overwrite=True)
            self.write_series(sub_group, '_index', df.index.values, data_type)

        for column in columns:
            values = df[column].values if is_df else df[column]
            self.write_series(sub_group, column, values, data_type)

        sub_group.attrs.update(**attrs)
Exemplo n.º 4
0
def conv_chrom(fname: str, block_size: int,
               root: Group, chrom: int) -> None:
    """Convert one chromosome's rows of a TPED file into Zarr arrays.

    Creates ``chromosome-{chrom}`` under *root* containing:
    'calls'     -- (num_positions, NUM_SAMPLES) uint8 genotype codes,
    'positions' -- base-pair position of each row,
    'alleles'   -- observed allele string per row (homozygous sites doubled).

    Args:
        fname: path to a space-separated TPED file, assumed sorted by
            chromosome (we stop scanning at the first larger chromosome).
        block_size: number of encoded rows buffered before flushing to the
            'calls' array.
        root: Zarr group in which the chromosome group is created.
        chrom: chromosome number to extract.
    """
    def _chrom_rows(fobj):
        # Yield token lists for this chromosome only; relies on the file
        # being sorted by chromosome so we can break early.
        for line in fobj:
            tokens = line.rstrip().split(' ')
            l_chrom = int(tokens[0])
            if l_chrom < chrom:
                continue
            if l_chrom > chrom:
                break
            yield tokens

    # First pass: count rows so 'calls' can be pre-allocated.
    with open(fname) as tfam:
        num_positions = sum(1 for _ in _chrom_rows(tfam))

    chrom_group = root.create_group(f'chromosome-{chrom}')
    all_calls = chrom_group.zeros('calls', shape=(num_positions, NUM_SAMPLES),
                                  dtype='B')
    block = []
    all_positions = []
    all_alleles = []
    current_position = 0
    # Second pass: encode genotypes block by block. Using `with` fixes the
    # original leak where the second open() was never closed.
    with open(fname) as tfam:
        for tokens in _chrom_rows(tfam):
            all_positions.append(int(tokens[3]))
            calls = tokens[4:]
            # BUGFIX: the original computed set(calls[4:]), double-slicing
            # past tokens[4:] and so ignoring the first two samples when
            # collecting the observed alleles for this position.
            alleles = ''.join(set(calls) - set(['0']))
            if len(alleles) == 1:
                alleles += alleles  # homozygous site: duplicate the allele
            all_alleles.append(alleles)
            sample_calls = np.empty(shape=NUM_SAMPLES, dtype='B')
            for sample in range(NUM_SAMPLES):
                a1, a2 = calls[2 * sample: 2 * sample + 2]
                try:
                    sample_calls[sample] = encode_alleles(a1, a2, alleles)
                except Exception:
                    # Report the offending row/sample before re-raising so
                    # bad input lines can be located.
                    print(chrom, current_position, sample)
                    raise
            block.append(sample_calls)
            current_position += 1
            if current_position % block_size == 0:
                all_calls[current_position - block_size:current_position, :] = block
                block = []
    if block:
        # Flush the final partial block.
        all_calls[-len(block):, :] = block
    chrom_group.array('positions', all_positions)
    chrom_group.array('alleles', all_alleles)