Example #1
def _read(filename, backed=False, sheet=None, ext=None, delimiter=None,
          first_column_names=None, backup_url=None, cache=False,
          suppress_cache_warning=False):
    if ext is not None and ext not in avail_exts:
        raise ValueError('Please provide one of the available extensions.\n'
                         '{}'.format(avail_exts))
    else:
        ext = is_valid_filename(filename, return_ext=True)
    is_present = check_datafile_present_and_download(filename,
                                                     backup_url=backup_url)
    if not is_present:
        logg.msg('... did not find original file', filename)
    # read hdf5 files
    if ext in {'h5', 'h5ad'}:
        if sheet is None:
            return read_h5ad(filename, backed=backed)
        else:
            logg.msg('reading sheet', sheet, 'from file', filename, v=4)
            return read_hdf(filename, sheet)
    # read other file types
    filename_cache = (settings.cachedir + filename.lstrip(
        './').replace('/', '-').replace('.' + ext, '.h5ad'))
    if cache and os.path.exists(filename_cache):
        logg.info('... reading from cache file', filename_cache)
        adata = read_h5ad(filename_cache, backed=False)
    else:
        if not is_present:
            raise FileNotFoundError('Did not find file {}.'.format(filename))
        logg.msg('reading', filename, v=4)
        if not cache and not suppress_cache_warning:
            logg.hint('This might be very slow. Consider passing `cache=True`, '
                      'which enables much faster reading from a cache file.')
        # do the actual reading
        if ext == 'xlsx' or ext == 'xls':
            if sheet is None:
                raise ValueError(
                    'Provide `sheet` parameter when reading \'.xlsx\' files.')
            else:
                adata = read_excel(filename, sheet)
        elif ext == 'mtx':
            adata = read_mtx(filename)
        elif ext == 'csv':
            adata = read_csv(filename, first_column_names=first_column_names)
        elif ext in {'txt', 'tab', 'data', 'tsv'}:
            if ext == 'data':
                logg.msg('... assuming \'.data\' means tab or white-space '
                         'separated text file', v=3)
                logg.hint('change this by passing `ext` to sc.read')
            adata = read_text(filename, delimiter, first_column_names)
        elif ext == 'soft.gz':
            adata = _read_softgz(filename)
        else:
            raise ValueError('Unknown extension {}.'.format(ext))
        if cache:
            logg.info('... writing an', settings.file_format_data,
                      'cache file to speed up reading next time')
            if not os.path.exists(os.path.dirname(filename_cache)):
                os.makedirs(os.path.dirname(filename_cache))
            # write for faster reading when calling the next time
            adata.write(filename_cache)
    return adata
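
This private helper is the dispatcher behind scanpy's public sc.read. A minimal usage sketch with hypothetical file paths:

import scanpy as sc

# Hypothetical paths; the extension decides which reader handles the file.
adata = sc.read('data/counts.csv', cache=True)    # cached as .h5ad for rereads
adata = sc.read('data/counts.data', ext='txt')    # override extension detection
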
Example #2
def load_file(filepath):
    if filepath == 'default' or filepath == 'datasets/user_uploaded/default':
        filepath = join_root("../datasets/default.csv")
    elif filepath == 'test':
        filepath = join_root('../../datasets/server/testdataset.h5ad')

    dataset = os.path.basename(filepath)
    dataset = os.path.splitext(dataset)[0]

    try:
        if filepath[-4:] == 'h5ad':
            adata = anndata.read_h5ad(filepath)
        elif filepath[-3:] == 'csv':
            # TODO remove transpose
            adata = anndata.read_csv(filepath).T
        elif filepath[-4:] == 'xlsx':
            adata = anndata.read_excel(filepath)
        elif filepath[-3:] == 'mtx':
            adata = anndata.read_mtx(filepath)
        elif filepath[-3:] in ('txt', 'tab') or filepath[-4:] == 'data':
            adata = anndata.read_text(filepath)
        elif filepath[-2:] == 'h5':
            adata = anndata.read_hdf(filepath)
        elif filepath[-4:] == 'loom':
            adata = anndata.read_loom(filepath)
        else:
            # Unknown extension: fail here instead of hitting a NameError below.
            raise ValueError('Unsupported file extension')
    except Exception as e:
        print(str(e))
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.")

    adata.uns['dataset'] = dataset
    return adata
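
A hypothetical call; join_root and IncorrectFileFormat belong to the surrounding project, not anndata:

try:
    adata = load_file('datasets/user_uploaded/sample.h5ad')  # hypothetical upload
except IncorrectFileFormat as err:
    print('Rejected upload:', err)
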
Example #3
def load_data(path, dtype='dge'):
    if dtype == 'dge':
        dataset = ad.read_text(path)
    elif dtype == '10x':
        dataset = sc.read_10x_mtx(path, var_names='gene_symbols', cache=True)
    else:
        # Reject unknown dtypes instead of raising UnboundLocalError below.
        raise ValueError("dtype must be 'dge' or '10x'")
    return dataset
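
A usage sketch with hypothetical paths: 'dge' expects a plain-text expression matrix, while '10x' expects a Cell Ranger matrix directory:

# Hypothetical input locations for the two branches.
dge = load_data('data/expression_dge.txt', dtype='dge')
tenx = load_data('data/filtered_feature_bc_matrix/', dtype='10x')
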
Example #4
def load_matrix(self):
    if self.raw_filename[-5:] == '.h5ad':
        adata = sc.read_h5ad(
            os.path.join(self.data_root, self.raw_filename))
    elif self.raw_filename[-4:] == '.tsv':
        adata = ad.read_text(os.path.join(self.data_root,
                                          self.raw_filename),
                             delimiter='\t',
                             first_column_names=True,
                             dtype='int')
    else:
        raise ImportError("Input format error!")
    return adata
Example #5
def load_file(filepath):
    t_flag = False
    if filepath == 'default' or filepath == 'datasets/user_uploaded/default':
        filepath = join_root("../datasets/default.csv")
        t_flag = True
    elif filepath == 'test':
        filepath = join_root('../../datasets/server/testdataset.h5ad')

    dataset = os.path.basename(filepath)
    dataset = os.path.splitext(dataset)[0]

    try:
        if filepath[-4:] == 'h5ad':
            adata = anndata.read_h5ad(filepath)
        elif filepath[-3:] == 'csv':
            adata = anndata.read_csv(filepath)
            if t_flag:
                adata = adata.T
        elif filepath[-4:] == 'xlsx':
            adata = anndata.read_excel(filepath)
        elif filepath[-3:] == 'mtx':
            adata = anndata.read_mtx(filepath)
        elif filepath[-3:] in ('txt', 'tab') or filepath[-4:] == 'data':
            adata = anndata.read_text(filepath)
        elif filepath[-2:] == 'h5':
            adata = anndata.read_hdf(filepath)
        elif filepath[-4:] == 'loom':
            adata = anndata.read_loom(filepath)
        else:
            # Unknown extension: fail here instead of hitting a NameError below.
            raise ValueError('Unsupported file extension')
    except Exception as e:
        print(str(e))
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.")

    # Make sure cluster names are in proper format
    if 'cluster_names' in adata.uns:
        adata.uns['cluster_names'] = bidict(adata.uns['cluster_names'])
        for key in list(adata.uns['cluster_names'].keys()):
            adata.uns['cluster_names'][int(key)] = \
                adata.uns['cluster_names'].pop(key, None)

    adata.uns['dataset'] = dataset
    return adata
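
The int-cast above exists because .h5ad storage round-trips the integer keys in .uns as strings; a minimal sketch of the same normalization, assuming the bidict package:

from bidict import bidict

# Keys read back from .h5ad arrive as strings; restore the original int keys.
names = bidict({'0': 'T cells', '1': 'B cells'})
for key in list(names.keys()):
    names[int(key)] = names.pop(key)
print(names)  # bidict({0: 'T cells', 1: 'B cells'})
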
Example #6
def test_read_tsv_iter():
    with (HERE / "adata-comments.tsv").open() as f:
        adata = ad.read_text(f, "\t")
    assert adata.obs_names.tolist() == ["r1", "r2", "r3"]
    assert adata.var_names.tolist() == ["c1", "c2"]
    assert adata.X.tolist() == X_list
Example #7
def test_read_tsv_strpath():
    adata = ad.read_text(str(HERE / "adata-comments.tsv"), "\t")
    assert adata.obs_names.tolist() == ["r1", "r2", "r3"]
    assert adata.var_names.tolist() == ["c1", "c2"]
    assert adata.X.tolist() == X_list
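
The two tests above cover an open handle and a str path; read_text also accepts a pathlib.Path. A self-contained sketch with synthetic data (not the real adata-comments.tsv fixture):

import os
import tempfile

import anndata as ad

# Headerless TSV with row names in the first column (synthetic data).
with tempfile.NamedTemporaryFile('w', suffix='.tsv', delete=False) as f:
    f.write('r1\t1\t2\nr2\t3\t4\nr3\t5\t6\n')
adata = ad.read_text(f.name, '\t', first_column_names=True)
assert adata.shape == (3, 2)
os.unlink(f.name)
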
Example #8
def _read(
    filename: Path,
    backed=None,
    sheet=None,
    ext=None,
    delimiter=None,
    first_column_names=None,
    backup_url=None,
    cache=False,
    cache_compression=None,
    suppress_cache_warning=False,
    **kwargs,
):
    if ext is not None and ext not in avail_exts:
        raise ValueError('Please provide one of the available extensions.\n'
                         f'{avail_exts}')
    else:
        ext = is_valid_filename(filename, return_ext=True)
    is_present = _check_datafile_present_and_download(
        filename,
        backup_url=backup_url,
    )
    if not is_present:
        logg.debug(f'... did not find original file {filename}')
    # read hdf5 files
    if ext in {'h5', 'h5ad'}:
        if sheet is None:
            return read_h5ad(filename, backed=backed)
        else:
            logg.debug(f'reading sheet {sheet} from file {filename}')
            return read_hdf(filename, sheet)
    # read other file types
    path_cache = settings.cachedir / _slugify(filename).replace(
        '.' + ext, '.h5ad')  # type: Path
    if path_cache.suffix in {'.gz', '.bz2'}:
        path_cache = path_cache.with_suffix('')
    if cache and path_cache.is_file():
        logg.info(f'... reading from cache file {path_cache}')
        return read_h5ad(path_cache)

    if not is_present:
        raise FileNotFoundError(f'Did not find file {filename}.')
    logg.debug(f'reading {filename}')
    if not cache and not suppress_cache_warning:
        logg.hint('This might be very slow. Consider passing `cache=True`, '
                  'which enables much faster reading from a cache file.')
    # do the actual reading
    if ext == 'xlsx' or ext == 'xls':
        if sheet is None:
            raise ValueError(
                "Provide `sheet` parameter when reading '.xlsx' files.")
        else:
            adata = read_excel(filename, sheet)
    elif ext in {'mtx', 'mtx.gz'}:
        adata = read_mtx(filename)
    elif ext == 'csv':
        adata = read_csv(filename, first_column_names=first_column_names)
    elif ext in {'txt', 'tab', 'data', 'tsv'}:
        if ext == 'data':
            logg.hint(
                "... assuming '.data' means tab or white-space "
                'separated text file'
            )
            logg.hint('change this by passing `ext` to sc.read')
        adata = read_text(filename, delimiter, first_column_names)
    elif ext == 'soft.gz':
        adata = _read_softgz(filename)
    elif ext == 'loom':
        adata = read_loom(filename=filename, **kwargs)
    else:
        raise ValueError(f'Unknown extension {ext}.')
    if cache:
        logg.info(f'... writing an {settings.file_format_data} '
                  'cache file to speed up reading next time')
        if cache_compression is _empty:
            cache_compression = settings.cache_compression
        if not path_cache.parent.is_dir():
            path_cache.parent.mkdir(parents=True)
        # write for faster reading when calling the next time
        adata.write(path_cache, compression=cache_compression)
    return adata
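
A hedged sketch of the caching path with hypothetical filenames: the first read parses the text file and writes an .h5ad under settings.cachedir, and the repeat call short-circuits to read_h5ad on that cache:

import scanpy as sc

sc.settings.cachedir = './cache'  # hypothetical cache location
# First call parses the TSV and writes a gzip-compressed .h5ad cache;
# the second call returns straight from the cache file.
adata = sc.read('data/counts.tsv', cache=True, cache_compression='gzip')
adata = sc.read('data/counts.tsv', cache=True)
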
Example #9
def load_matrix(self):
    adata = ad.read_text(os.path.join(self.data_root, self.raw_filename),
                         delimiter='\t',
                         first_column_names=True,
                         dtype='int')
    return adata
Example #10
def load_shareseq_data(tissue: str,
                       dirname: str,
                       mode: str = "RNA") -> AnnData:
    """Load the SHAREseq data"""
    assert os.path.isdir(dirname)
    atac_fname_dict = {
        "skin": [
            "GSM4156597_skin.late.anagen.barcodes.txt.gz",
            "GSM4156597_skin.late.anagen.counts.txt.gz",
            "GSM4156597_skin.late.anagen.peaks.bed.gz",
        ],
        "brain": [
            "GSM4156599_brain.barcodes.txt.gz",
            "GSM4156599_brain.counts.txt.gz",
            "GSM4156599_brain.peaks.bed.gz",
        ],
        "lung": [
            "GSM4156600_lung.barcodes.txt.gz",
            "GSM4156600_lung.counts.txt.gz",
            "GSM4156600_lung.peaks.bed.gz",
        ],
    }
    rna_fname_dict = {
        "skin": "GSM4156608_skin.late.anagen.rna.counts.txt.gz",
        "brain": "GSM4156610_brain.rna.counts.txt.gz",
        "lung": "GSM4156611_lung.rna.counts.txt.gz",
    }
    assert atac_fname_dict.keys() == rna_fname_dict.keys()
    assert tissue in atac_fname_dict.keys(), f"Unrecognized tissue: {tissue}"

    atac_barcodes_fname, atac_counts_fname, atac_peaks_fname = atac_fname_dict[
        tissue]
    assert "barcodes" in atac_barcodes_fname  # Check fnames are unpacked correctly
    assert "counts" in atac_counts_fname
    assert "peaks" in atac_peaks_fname
    atac_cell_barcodes = pd.read_csv(
        os.path.join(dirname, atac_barcodes_fname),
        delimiter="\t",
        index_col=0,
        header=None,
    )
    atac_cell_barcodes.index = [
        i.replace(",", ".") for i in atac_cell_barcodes.index
    ]

    # Load in RNA data
    if mode == "RNA":
        retval = ad.read_text(os.path.join(dirname, rna_fname_dict[tissue])).T
        # Ensure that we return a sparse matrix as the underlying datatype
        retval.X = scipy.sparse.csr_matrix(retval.X)
        # Fix formatting of obs names where commas were used for periods
        retval.obs.index = [i.replace(",", ".") for i in retval.obs.index]
        intersected_barcodes = [
            bc for bc in retval.obs_names
            if bc in set(atac_cell_barcodes.index)
        ]
        assert intersected_barcodes, f"No common barcodes between RNA/ATAC for {tissue}"
        logging.info(
            f"RNA {tissue} intersects {len(intersected_barcodes)}/{len(retval.obs_names)} barcodes with ATAC"
        )
        retval = retval[intersected_barcodes]

    elif mode == "ATAC":
        # Load in ATAC data
        # read_mtx automatically gives us a sparse matrix
        retval = ad.read_mtx(os.path.join(dirname, atac_counts_fname)).T
        # Attach metadata
        retval.obs = atac_cell_barcodes
        atac_peaks = pd.read_csv(
            os.path.join(dirname, atac_peaks_fname),
            delimiter="\t",
            header=None,
            names=["chrom", "start", "end"],
        )
        atac_peaks.index = [
            f"{c}:{s}-{e}" for _i, c, s, e in atac_peaks.itertuples()
        ]
        retval.var = atac_peaks
    else:
        raise ValueError("mode must be either RNA or ATAC")
    assert isinstance(retval.X, scipy.sparse.csr_matrix)
    return retval
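
Hypothetical usage of the loader, assuming the GEO supplementary files named in the dicts were downloaded into one directory:

# Hypothetical download directory containing the GSM files listed above.
rna = load_shareseq_data('skin', dirname='/data/shareseq', mode='RNA')
atac = load_shareseq_data('skin', dirname='/data/shareseq', mode='ATAC')
print(rna.shape, atac.shape)
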
Example #11
df_csv = pd.read_csv(file_csv, index_col=0)
df_csv = df_csv.T
colnames = [x.replace('-', ':', 1) for x in df_csv.columns]
df_csv.columns = colnames
file_count_tsv = os.path.join(path_data_root, 'counts.tsv')
df_csv.to_csv(file_count_tsv, sep='\t')

file_meta_csv = os.path.join(path_human_brain,
                             'GSM5289636_s3atac.hg38.metadata.csv')
df_meta_csv = pd.read_csv(file_meta_csv, index_col=0)
df_meta_csv = df_meta_csv.loc[:, ['cellID', 'celltype']]
file_meta_tsv = os.path.join(path_data_root, 'metadata.tsv')
df_meta_csv.to_csv(file_meta_tsv, sep='\t')

adata = ad.read_text(file_count_tsv,
                     delimiter='\t',
                     first_column_names=True,
                     dtype='int')
df_meta = pd.read_csv(file_meta_tsv, sep='\t', index_col=0)
adata.obs['celltype'] = df_meta.loc[adata.obs.index, 'celltype']

print(np.max(adata.X))
if np.max(adata.X) > 1:
    epi.pp.binarize(adata)
    print(np.max(adata.X))
epi.pp.filter_cells(adata, min_features=1)
epi.pp.filter_features(adata, min_cells=1)
# QC
adata.obs['log_nb_features'] = [np.log10(x) for x in adata.obs['nb_features']]
epi.pl.violin(adata, ['nb_features'])
epi.pl.violin(adata, ['log_nb_features'])
epi.pp.coverage_cells(adata,
Example #12
    print('Include different subset of cell types in atlas')
    nat = [14, 15, 16, 17]
    csts = adata.obs['cell_ontology_class'].value_counts()
    asub = anndata.read_text(
        '../data/for_scmap/TBS_kidney_atlas_subsample_20_counts.tsv',
        delimiter='\t',
    )
    asub.obs['CellType'] = pd.read_csv(
        '../data/for_scmap/TBS_kidney_atlas_subsample_20_metadata.tsv',
        sep='\t',
        index_col=0)

    for na in nat:
        csti = csts.index[:na]
        idx = asub.obs['CellType'].isin(csti).values.nonzero()[0]
        asubr = asub[idx]

        asubr.to_df().to_csv(
            '../data/for_scmap/TBS_kidney_atlas_subsample_20_counts_na_{:}.tsv'
            .format(na),
            sep='\t', index=True)