Example #1
def load_file(filepath):
    if filepath == 'default' or filepath == 'datasets/user_uploaded/default':
        filepath = join_root("../datasets/default.csv")
    elif filepath == 'test':
        filepath = join_root('../../datasets/server/testdataset.h5ad')

    dataset = os.path.basename(filepath)
    dataset = os.path.splitext(dataset)[0]

    try:
        if filepath[-4:] == 'h5ad':
            adata = anndata.read_h5ad(filepath)
        if filepath[-3:] == 'csv':
            # TODO remove transpose
            adata = anndata.read_csv(filepath).T
        if filepath[-4:] == 'xlsx':
            adata = anndata.read_excel(filepath)
        if filepath[-3:] == 'mtx':
            adata = anndata.read_mtx(filepath)
        if filepath[-3:] == 'txt' or filepath[-3:] == 'tab' or filepath[-4:] == 'data':
            adata = anndata.read_text(filepath)
        if filepath[-2:] == 'h5':
            adata = anndata.read_hdf(filepath)
        if filepath[-4:] == 'loom':
            adata = anndata.read_loom(filepath)
    except Exception as e:
        print(str(e))
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.")

    adata.uns['dataset'] = dataset
    return adata
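A minimal usage sketch for this load_file (hypothetical call; 'default' is resolved inside the function to the bundled ../datasets/default.csv, which must exist for the call to succeed):

# Hypothetical usage: the 'default' alias is mapped to the bundled CSV,
# read transposed, and tagged with its dataset name in adata.uns.
adata = load_file('default')
print(adata.uns['dataset'])   # 'default'
print(adata.shape)            # (n_obs, n_vars)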
Example #2
def _read(filename, backed=False, sheet=None, ext=None, delimiter=None,
          first_column_names=None, backup_url=None, cache=False,
          suppress_cache_warning=False):
    if ext is not None and ext not in avail_exts:
        raise ValueError('Please provide one of the available extensions.\n'
                         + str(avail_exts))
    else:
        ext = is_valid_filename(filename, return_ext=True)
    is_present = check_datafile_present_and_download(filename,
                                                     backup_url=backup_url)
    if not is_present:
        logg.msg('... did not find original file', filename)
    # read hdf5 files
    if ext in {'h5', 'h5ad'}:
        if sheet is None:
            return read_h5ad(filename, backed=backed)
        else:
            logg.msg('reading sheet', sheet, 'from file', filename, v=4)
            return read_hdf(filename, sheet)
    # read other file types
    filename_cache = (settings.cachedir
                      + filename.lstrip('./').replace('/', '-').replace('.' + ext, '.h5ad'))
    if cache and os.path.exists(filename_cache):
        logg.info('... reading from cache file', filename_cache)
        adata = read_h5ad(filename_cache, backed=False)
    else:
        if not is_present:
            raise FileNotFoundError('Did not find file {}.'.format(filename))
        logg.msg('reading', filename, v=4)
        if not cache and not suppress_cache_warning:
            logg.hint('This might be very slow. Consider passing `cache=True`, '
                      'which enables much faster reading from a cache file.')
        # do the actual reading
        if ext == 'xlsx' or ext == 'xls':
            if sheet is None:
                raise ValueError(
                    'Provide `sheet` parameter when reading \'.xlsx\' files.')
            else:
                adata = read_excel(filename, sheet)
        elif ext == 'mtx':
            adata = read_mtx(filename)
        elif ext == 'csv':
            adata = read_csv(filename, first_column_names=first_column_names)
        elif ext in {'txt', 'tab', 'data', 'tsv'}:
            if ext == 'data':
                logg.msg('... assuming \'.data\' means tab or white-space '
                         'separated text file', v=3)
                logg.hint('change this by passing `ext` to sc.read')
            adata = read_text(filename, delimiter, first_column_names)
        elif ext == 'soft.gz':
            adata = _read_softgz(filename)
        else:
            raise ValueError('Unknown extension {}.'.format(ext))
        if cache:
            logg.info('... writing an', settings.file_format_data,
                      'cache file to speedup reading next time')
            if not os.path.exists(os.path.dirname(filename_cache)):
                os.makedirs(os.path.dirname(filename_cache))
            # write for faster reading when calling the next time
            adata.write(filename_cache)
    return adata
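A sketch of how this _read is typically used (hedged: in scanpy it is reached through the public sc.read() wrapper; the file paths below are illustrative):

# Illustrative calls; all paths are hypothetical.
adata = _read('data/matrix.mtx', cache=True)        # cached under settings.cachedir as 'data-matrix.h5ad' after the first read
adata = _read('data/counts.xlsx', sheet='counts')   # Excel input requires an explicit `sheet`
adata = _read('data/profile.h5ad', backed='r')      # h5/h5ad files are returned directly and never cached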
Example #3
def load_file(filepath):
    t_flag = False
    if filepath == 'default' or filepath == 'datasets/user_uploaded/default':
        filepath = join_root("../datasets/default.csv")
        t_flag = True
    elif filepath == 'test':
        filepath = join_root('../../datasets/server/testdataset.h5ad')

    dataset = os.path.basename(filepath)
    dataset = os.path.splitext(dataset)[0]

    try:
        if filepath[-4:] == 'h5ad':
            adata = anndata.read_h5ad(filepath)
        if filepath[-3:] == 'csv':
            adata = anndata.read_csv(filepath)
            if t_flag:
                adata = adata.T
        if filepath[-4:] == 'xlsx':
            adata = anndata.read_excel(filepath)
        if filepath[-3:] == 'mtx':
            adata = anndata.read_mtx(filepath)
        if filepath[-3:] == 'txt' or filepath[-3:] == 'tab' or filepath[-4:] == 'data':
            adata = anndata.read_text(filepath)
        if filepath[-2:] == 'h5':
            adata = anndata.read_hdf(filepath)
        if filepath[-4:] == 'loom':
            adata = anndata.read_loom(filepath)
    except Exception as e:
        print(str(e))
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.")

    # Make sure cluster names are in proper format
    if 'cluster_names' in adata.uns:
        adata.uns['cluster_names'] = bidict(adata.uns['cluster_names'])
        for key in list(adata.uns['cluster_names'].keys()):
            adata.uns['cluster_names'][int(key)] = \
                adata.uns['cluster_names'].pop(key, None)

    adata.uns['dataset'] = dataset
    return adata
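The cluster-name normalization above turns stringified keys back into integers while keeping the two-way mapping; a small illustration (assuming the bidict package, with made-up cluster labels):

from bidict import bidict

# Hypothetical stored mapping; serialization tends to stringify the integer keys.
names = bidict({'0': 'T cells', '1': 'B cells'})
for key in list(names.keys()):
    names[int(key)] = names.pop(key, None)
print(names)                      # bidict({0: 'T cells', 1: 'B cells'})
print(names.inverse['B cells'])   # 1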
Example #4
def _read(
    filename: Path,
    backed=None,
    sheet=None,
    ext=None,
    delimiter=None,
    first_column_names=None,
    backup_url=None,
    cache=False,
    cache_compression=None,
    suppress_cache_warning=False,
    **kwargs,
):
    if ext is not None and ext not in avail_exts:
        raise ValueError('Please provide one of the available extensions.\n'
                         f'{avail_exts}')
    else:
        ext = is_valid_filename(filename, return_ext=True)
    is_present = _check_datafile_present_and_download(
        filename,
        backup_url=backup_url,
    )
    if not is_present:
        logg.debug(f'... did not find original file {filename}')
    # read hdf5 files
    if ext in {'h5', 'h5ad'}:
        if sheet is None:
            return read_h5ad(filename, backed=backed)
        else:
            logg.debug(f'reading sheet {sheet} from file {filename}')
            return read_hdf(filename, sheet)
    # read other file types
    path_cache = settings.cachedir / _slugify(filename).replace(
        '.' + ext, '.h5ad')  # type: Path
    if path_cache.suffix in {'.gz', '.bz2'}:
        path_cache = path_cache.with_suffix('')
    if cache and path_cache.is_file():
        logg.info(f'... reading from cache file {path_cache}')
        return read_h5ad(path_cache)

    if not is_present:
        raise FileNotFoundError(f'Did not find file {filename}.')
    logg.debug(f'reading {filename}')
    if not cache and not suppress_cache_warning:
        logg.hint('This might be very slow. Consider passing `cache=True`, '
                  'which enables much faster reading from a cache file.')
    # do the actual reading
    if ext == 'xlsx' or ext == 'xls':
        if sheet is None:
            raise ValueError(
                "Provide `sheet` parameter when reading '.xlsx' files.")
        else:
            adata = read_excel(filename, sheet)
    elif ext in {'mtx', 'mtx.gz'}:
        adata = read_mtx(filename)
    elif ext == 'csv':
        adata = read_csv(filename, first_column_names=first_column_names)
    elif ext in {'txt', 'tab', 'data', 'tsv'}:
        if ext == 'data':
            logg.hint(
                "... assuming '.data' means tab or white-space "
                'separated text file'
            )
            logg.hint('change this by passing `ext` to sc.read')
        adata = read_text(filename, delimiter, first_column_names)
    elif ext == 'soft.gz':
        adata = _read_softgz(filename)
    elif ext == 'loom':
        adata = read_loom(filename=filename, **kwargs)
    else:
        raise ValueError(f'Unknown extension {ext}.')
    if cache:
        logg.info(f'... writing an {settings.file_format_data} '
                  'cache file to speedup reading next time')
        if cache_compression is _empty:
            cache_compression = settings.cache_compression
        if not path_cache.parent.is_dir():
            path_cache.parent.mkdir(parents=True)
        # write for faster reading when calling the next time
        adata.write(path_cache, compression=cache_compression)
    return adata
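A sketch of how the cache path is derived in this version (the slugify below is an illustrative stand-in for the module's _slugify; the directory and file names are made up):

from pathlib import Path

def slugify(path) -> str:
    # Illustrative stand-in: flatten the relative path into a single file name.
    return str(path).lstrip('./').replace('/', '-')

cachedir = Path('cache')            # assumed value of settings.cachedir
filename = Path('data/pbmc3k.mtx')
path_cache = cachedir / slugify(filename).replace('.mtx', '.h5ad')
print(path_cache)                   # cache/data-pbmc3k.h5ad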
Example #5
def read_dataset(path):
    path = str(path)
    tmp_path = None
    if path.startswith('gs://'):
        tmp_path = download_gs_url(path)
        path = tmp_path
    basename_and_extension = get_filename_and_extension(path)
    ext = basename_and_extension[1]
    if ext == 'mtx':
        x = scipy.io.mmread(path)
        x = scipy.sparse.csr_matrix(x.T)
        # look for .barcodes.txt and .genes.txt
        import itertools
        sp = os.path.split(path)
        obs = None

        for sep_ext in itertools.product(['.', '_', '-'], ['tsv', 'txt']):
            for prefix in ['', basename_and_extension[0] + sep_ext[0]]:
                f = os.path.join(sp[0], prefix + 'barcodes.' + sep_ext[1])
                if os.path.isfile(f) or os.path.isfile(f + '.gz'):
                    obs = pd.read_table(f if os.path.isfile(f) else f + '.gz', index_col=0, sep='\t',
                                        header=None)
                    break
        var = None
        for sep_ext in itertools.product(['.', '_', '-'], ['tsv', 'txt']):
            for prefix in ['', basename_and_extension[0] + sep_ext[0]]:
                f = os.path.join(sp[0], prefix + 'genes.' + sep_ext[1])
                if os.path.isfile(f) or os.path.isfile(f + '.gz'):
                    var = pd.read_table(f if os.path.isfile(f) else f + '.gz', index_col=0, sep='\t',
                                        header=None)
                    break

        if var is None:
            print(basename_and_extension[0] + '.genes.txt not found')
            var = pd.DataFrame(index=pd.RangeIndex(start=0, stop=x.shape[1], step=1))
        if obs is None:
            print(basename_and_extension[0] + '.barcodes.txt not found')
            obs = pd.DataFrame(index=pd.RangeIndex(start=0, stop=x.shape[0], step=1))

        cell_count, gene_count = x.shape
        if len(obs) != cell_count:
            raise ValueError("Wrong number of cells: matrix has {} cells, barcodes file has {}"
                             .format(cell_count, len(obs)))
        if len(var) != gene_count:
            raise ValueError("Wrong number of genes: matrix has {} genes, genes file has {}"
                             .format(gene_count, len(var)))

        return anndata.AnnData(X=x, obs=obs, var=var)
    elif ext == 'npz':
        obj = np.load(path)
        if tmp_path is not None:
            os.remove(tmp_path)
        return anndata.AnnData(X=obj['x'], obs=pd.DataFrame(index=obj['rid']), var=pd.DataFrame(index=obj['cid']))
    elif ext == 'npy':
        x = np.load(path)
        if tmp_path is not None:
            os.remove(tmp_path)
        return anndata.AnnData(X=x, obs=pd.DataFrame(index=pd.RangeIndex(start=0, stop=x.shape[0], step=1)),
                               var=pd.DataFrame(index=pd.RangeIndex(start=0, stop=x.shape[1], step=1)))
    elif ext == 'loom':
        # in loom file, convention is rows are genes :(
        # return anndata.read_loom(path, X_name='matrix', sparse=True)
        f = h5py.File(path, 'r')
        x = f['/matrix']
        is_x_sparse = x.attrs.get('sparse')
        if is_x_sparse:
            # read in blocks of 1000
            chunk_start = 0
            nrows = x.shape[0]
            chunk_step = min(nrows, 1000)
            chunk_stop = chunk_step
            nchunks = int(np.ceil(max(1, nrows / chunk_step)))
            sparse_arrays = []
            for chunk in range(nchunks):
                chunk_stop = min(nrows, chunk_stop)
                subset = scipy.sparse.csr_matrix(x[chunk_start:chunk_stop])
                sparse_arrays.append(subset)
                chunk_start += chunk_step
                chunk_stop += chunk_step

            x = scipy.sparse.vstack(sparse_arrays)
        else:
            x = x[()]
        row_meta = {}
        row_attrs = f['/row_attrs']
        for key in row_attrs:
            values = row_attrs[key][()]
            if values.dtype.kind == 'S':
                values = values.astype(str)
            row_meta[key] = values
        row_meta = pd.DataFrame(data=row_meta)
        if row_meta.get('id') is not None:
            row_meta.set_index('id', inplace=True)

        col_meta = {}
        col_attrs = f['/col_attrs']
        for key in col_attrs:
            values = col_attrs[key][()]
            if values.dtype.kind == 'S':
                values = values.astype(str)
            col_meta[key] = values
        col_meta = pd.DataFrame(data=col_meta)
        if col_meta.get('id') is not None:
            col_meta.set_index('id', inplace=True)
        f.close()
        return anndata.AnnData(X=x, obs=row_meta, var=col_meta)
    elif ext == 'h5ad':
        return anndata.read_h5ad(path)
    elif ext == 'hdf5' or ext == 'h5':
        return anndata.read_hdf(path)
    elif ext == 'gct':
        ds = wot.io.read_gct(path)
        if tmp_path is not None:
            os.remove(tmp_path)
        return ds
    else:  # txt
        with open(path) as fp:
            row_ids = []
            header = fp.readline()
            sep = None
            for s in ['\t', ',', ' ']:
                test_tokens = header.split(s)
                if len(test_tokens) > 1:
                    sep = s
                    column_ids = test_tokens
                    break
            if sep is None:
                # No recognized separator found in the header; fall back to tab.
                sep = '\t'
                column_ids = header.split(sep)
            column_ids = column_ids[1:]
            if column_ids:
                column_ids[-1] = column_ids[-1].rstrip()

            i = 0
            np_arrays = []
            for line in fp:
                line = line.rstrip()
                if line != '':
                    tokens = line.split(sep)
                    row_ids.append(tokens[0])
                    np_arrays.append(np.array(tokens[1:], dtype=np.float64))
                    i += 1
            if tmp_path is not None:
                os.remove(tmp_path)
            return anndata.AnnData(X=np.array(np_arrays),
                                   obs=pd.DataFrame(index=row_ids),
                                   var=pd.DataFrame(index=column_ids))
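A hypothetical call pattern for read_dataset (paths are illustrative; the mtx branch also looks for matching barcodes/genes sidecar files next to the matrix):

# Illustrative calls; all paths are hypothetical.
adata = read_dataset('expression.mtx')                # picks up expression.barcodes.txt / genes.tsv style sidecars if present
adata = read_dataset('gs://bucket/expression.loom')   # gs:// inputs are downloaded to a temporary file first
print(adata.shape)                                     # (n_cells, n_genes)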