def _get_schema(self):
    """Build the source schema, opening the raster when a path is set.

    Returns an intake ``Schema``; when ``self.path`` is empty, a
    placeholder schema is cached on ``self._schema`` and returned.
    """
    # BUG FIX: the original used `if self.path is not ''`, which tests
    # object identity against a string literal (implementation-defined,
    # emits SyntaxWarning on modern CPython). Use equality instead.
    if self.path != '':
        xarr = xr.open_rasterio(self.path)
        ds2 = xr.Dataset({'raster': xarr})
        metadata = {
            'dims': dict(ds2.dims),
            'data_vars': {k: list(ds2[k].coords)
                          for k in ds2.data_vars.keys()},
            'coords': tuple(ds2.coords.keys()),
            'array': 'raster'
        }
        # Copy through well-known rasterio attributes when present.
        atts = ['transform', 'crs', 'res', 'is_tiled', 'nodatavals']
        for att in atts:
            if att in xarr.attrs:
                metadata[att] = xarr.attrs[att]
        return Schema(datashape=None,
                      dtype=str(xarr.dtype),
                      shape=xarr.shape,
                      npartitions=1,
                      extra_metadata=metadata)
    else:
        self._schema = Schema(datashape=None, dtype=None, shape=None,
                              npartitions=1, extra_metadata={})
        return self._schema
def _load_metadata(self):
    """Lazily assemble a dask dataframe over every matched file and
    derive the schema (partition count, column dtypes, shape) from it."""
    import dask.dataframe as dd
    import dask.delayed
    from dask.bytes import open_files

    self.files = open_files(self.url, **self.storage_options)

    def read_a_file(open_file, reader, kwargs):
        # Parse one file and tag its rows with the originating path.
        with open_file as of:
            df = reader(of, **kwargs)
            df['path'] = open_file.path
            return df

    if self.dataframe is None:
        self.parts = [dask.delayed(read_a_file)(f, self.reader, self.kwargs)
                      for f in self.files]
        self.dataframe = dd.from_delayed(self.parts)
        self.npartitions = self.dataframe.npartitions
        # Row count is unknown until compute; only columns are known.
        self.shape = (None, len(self.dataframe.columns))
        self.dtype = self.dataframe.dtypes.to_dict()
        self._schema = Schema(npartitions=self.npartitions,
                              extra_metadata=self.metadata,
                              dtype=self.dtype,
                              shape=self.shape,
                              datashape=None)
    return self._schema
def _get_schema(self): """Make schema object, which embeds xarray object and some details""" from .xarray_container import serialize_zarr_ds self.urlpath = self._get_cache(self.urlpath)[0] if self._ds is None: self._open_dataset() metadata = { 'dims': dict(self._ds.dims), 'data_vars': {k: list(self._ds[k].coords) for k in self._ds.data_vars.keys()}, 'coords': tuple(self._ds.coords.keys()), } if getattr(self, 'on_server', False): serialized = serialize_zarr_ds(self._ds) metadata['internal'] = serialized # The zarr serialization imposes a certain chunking, which will # be reflected in the xarray.Dataset object constructed on the # client side. We need to use that same chunking here on the # server side. Extract it from the serialized zarr metadata. self._chunks = {k.rsplit('/', 1)[0]: json.loads(v.decode())['chunks'] for k, v in serialized.items() if k.endswith('/.zarray')} metadata.update(self._ds.attrs) self._schema = Schema( datashape=None, dtype=None, shape=None, npartitions=None, extra_metadata=metadata) return self._schema
def _get_schema(self): """Make schema object, which embeds xarray object and some details""" from intake.source.base import Schema self.urlpath = self._get_cache(self.urlpath)[0] if self._ds is None: self._open_dataset() if isinstance(self._ds, xr.Dataset): metadata = { 'dims': dict(self._ds.dims), 'data_vars': { k: list(self._ds[k].coords) for k in self._ds.data_vars.keys() }, 'coords': tuple(self._ds.coords.keys()), } metadata.update(self._ds.attrs) else: metadata = {} self._schema = Schema(datashape=None, dtype=None, shape=None, npartitions=None, extra_metadata=metadata) return self._schema
def _get_schema(self):
    """Infer the tabular schema by peeking at the first record.

    Opens the dataset if needed, reads one parsed record from the first
    contig, and uses its field count to decide which of the canonical
    BED-like columns are present.
    """
    if self._dataset is None:
        self._open_dataset()
    self._chroms = list(self._dataset.contigs)
    # Peek at a single record to learn how many columns the file has.
    rec = next(self._dataset.fetch(self._chroms[0], parser=asTuple()))
    num_fields = len(rec)
    chrom_coord_dtype = np.int64
    dtypes = {
        # BUG FIX: was pd.CategorialDtype (misspelled), which raises
        # AttributeError on every pandas version. "NULL" is an extra
        # sentinel category (same convention as the BAM source).
        "chrom": pd.CategoricalDtype(self._chroms + ["NULL"], ordered=True),
        "start": chrom_coord_dtype,
        "end": chrom_coord_dtype,
        "name": str,
        "score": np.float32,
        "strand": bool,
    }
    # Keep only the first `num_fields` canonical columns, in order.
    self._dtype = {
        key: dtypes[key] for key in list(dtypes.keys())[:num_fields]
    }
    return Schema(
        datashape=None,
        dtype=self._dtype,
        shape=(None, len(self._dtype)),
        npartitions=len(self._chroms),
        extra_metadata={},
    )
def _get_schema(self): """Make schema object, which embeds xarray object and some details""" from .xarray_container import serialize_zarr_ds self.urlpath = self._get_cache(self.urlpath)[0] if self._ds is None: self._open_dataset() metadata = { 'dims': dict(self._ds.dims), 'data_vars': {k: list(self._ds[k].coords) for k in self._ds.data_vars.keys()}, 'coords': tuple(self._ds.coords.keys()), } if getattr(self, 'on_server', False): metadata['internal'] = serialize_zarr_ds(self._ds) metadata.update(self._ds.attrs) self._schema = Schema( datashape=None, dtype=None, shape=None, npartitions=None, extra_metadata=metadata) return self._schema
def _get_schema(self):
    """Open the dataset (unconditionally) and return a minimal schema."""
    from intake.source.base import Schema

    self._open_dataset()
    placeholder = dict(datashape=None, dtype=None, shape=None,
                       npartitions=None, extra_metadata={})
    self._schema = Schema(**placeholder)
    return self._schema
def _get_schema(self): """Make schema object, which embeds iris cubelist and some details""" metadata = {} self._schema = Schema(datashape=None, dtype=None, shape=len(self.cubelist), npartitions=len(self.cubelist), extra_metadata=metadata) return self._schema
def _get_schema(self):
    """Fetch the Metabase question result once and describe it."""
    if self._df is None:
        # Materialise the Metabase card (saved question) as a dataframe.
        self._df = self._metabase.get_card(self.question)
    n_cols = len(self._df.columns)
    return Schema(datashape=None,
                  dtype=self._df.dtypes,
                  shape=(None, n_cols),
                  npartitions=1,
                  extra_metadata={})
def get_scheme(self):
    """Build the dataframe if needed and return its schema.

    NOTE(review): the name looks like a typo of ``get_schema`` — kept
    unchanged because external callers may depend on it.
    """
    if self.df is None:
        self.df = self._make_df()
    # Stringify dtypes so the schema is JSON-serialisable.
    dtypes = {col: str(dt) for col, dt in self.df.dtypes.to_dict().items()}
    return Schema(dtype=dtypes,
                  shape=self.df.shape,
                  extra_metadata=self.metadata,
                  npartitions=1)
def _get_schema(self): """Make schema object, which embeds iris cube and some details""" metadata = {} self._schema = Schema(datashape=self.cube.shape, dtype=self.cube.dtype, shape=self.cube.shape, npartitions=self.cube.lazy_data().chunks, extra_metadata=metadata) return self._schema
def _get_schema(self):
    """Describe the Stripe resource table (column info only, no data)."""
    if self._df_schema is None:
        # Ask Stripe for just the table schema, not the rows.
        self._df_schema = self._stripe.get_table(resource=self.resource,
                                                 schema=True)
    n_cols = len(self._df_schema.columns)
    return Schema(datashape=None,
                  dtype=self._df_schema,
                  shape=(None, n_cols),
                  npartitions=1,
                  extra_metadata={})
def __init__(self, url, headers, **kwargs):
    """Set up a remote sequence source.

    Parameters
    ----------
    url : str
        Address of the intake server.
    headers : dict
        HTTP headers to pass with requests.
    **kwargs
        Recognised keys: ``npartitions`` (int, default 1), ``metadata``
        (dict, default {}).
    """
    self.url = url
    # BUG FIX: only the misspelled key 'npartition' was read before, so a
    # caller passing 'npartitions' (as RemoteDataFrame does) was silently
    # ignored. Accept the correct spelling first; keep the old key as a
    # backward-compatible fallback.
    self.npartitions = kwargs.get('npartitions',
                                  kwargs.get('npartition', 1))
    self.partition_access = self.npartitions > 1
    self.headers = headers
    self.metadata = kwargs.get('metadata', {})
    self._schema = Schema(npartitions=self.npartitions,
                          extra_metadata=self.metadata)
    self.bag = None
    super(RemoteSequenceSource, self).__init__(url, headers, **kwargs)
def _get_schema(self): """Reconstruct xarray arrays The schema returned is not very informative as a representation, this method fetches coordinates data and creates dask arrays. """ import dask.array as da if self._schema is None: metadata = { 'dims': dict(self._ds.dims), 'data_vars': {k: list(self._ds[k].coords) for k in self._ds.data_vars.keys()}, 'coords': tuple(self._ds.coords.keys()), } if getattr(self, 'on_server', False): metadata['internal'] = serialize_zarr_ds(self._ds) metadata.update(self._ds.attrs) self._schema = Schema( datashape=None, dtype=None, shape=None, npartitions=None, extra_metadata=metadata) # aparently can't replace coords in-place # we immediately fetch the values of coordinates # TODO: in the future, these could be functions from the metadata? self._ds = self._ds.assign_coords(**{c: self._get_partition((c, )) for c in metadata['coords']}) for var in list(self._ds.data_vars): # recreate dask arrays name = '-'.join(['remote-xarray', var, self._source_id]) arr = self._ds[var].data chunks = arr.chunks nparts = (range(len(n)) for n in chunks) if self.metadata.get('array', False): # original was an array, not dataset - no variable name extra = () else: extra = (var, ) dask = { (name, ) + part: (get_partition, self.url, self.headers, self._source_id, self.container, extra + part) for part in itertools.product(*nparts) } self._ds[var].data = da.Array( dask, name, chunks, dtype=arr.dtype, shape=arr.shape) if self.metadata.get('array', False): self._ds = self._ds[self.metadata.get('array')] return self._schema
def _get_schema(self):
    """Describe the lazily-opened dask dataframe (row count unknown)."""
    if self._dataframe is None:
        self._open_dataset()
    # _meta carries the column dtypes without triggering any compute.
    meta_dtypes = self._dataframe._meta.dtypes.to_dict()
    dtypes = {col: str(dt) for col, dt in meta_dtypes.items()}
    return Schema(datashape=None,
                  dtype=dtypes,
                  shape=(None, len(dtypes)),
                  npartitions=self._dataframe.npartitions,
                  extra_metadata={})
def _get_schema(self):
    """Start the Spark job if needed and derive a schema from a sample."""
    if self.ref is None:
        self.ref = self.holder.setup()
    self.npartitions = self.ref.rdd.getNumPartitions()
    # Take a handful of rows to map Spark types onto pandas dtypes.
    sample = self.ref.take(10)
    self.dtype = pandas_dtypes(self.ref.schema, sample)
    self.shape = (None, len(self.dtype))
    return Schema(npartitions=self.npartitions,
                  extra_metadata=self.metadata,
                  dtype=self.dtype,
                  shape=self.shape)
def _get_schema(self):
    """Load the IDX label/image files and describe the image array.

    ``_get_cache(...)[0]`` yields the raw urlpath (a str) when no cache
    is configured, but a list of local files when one is.
    """
    if self.labels is None:
        lfile = self._get_cache(self.lfile)[0]
        ifile = self._get_cache(self.ifile)[0]
        # BUG FIX: the old code always indexed again (lfile[0]), which is
        # correct for the cached list case but indexes the first
        # *character* of a plain path string in the uncached case.
        # Normalise to a single path before opening.
        if isinstance(lfile, (list, tuple)):
            lfile = lfile[0]
        if isinstance(ifile, (list, tuple)):
            ifile = ifile[0]
        self.labels = parse_idx(open(lfile, 'rb'))
        self.images = parse_idx(open(ifile, 'rb'))
    return Schema(datashape=None,
                  dtype=self.images.dtype,
                  shape=self.images.shape,
                  npartitions=1,
                  extra_metadata={})
def _get_schema(self):
    """Sniff the sklearn version recorded inside the pickled payload."""
    as_binary = self._load()
    # Locate the pickled '_sklearn_version' attribute and capture the
    # version string that follows it.
    match = re.search(
        b'_sklearn_versionq(.*\x00)((\d+\.)?(\d+\.)?(\*|\d+))q', as_binary)
    sklearn_version = match.group(2).decode() if match else None
    self._schema = Schema(
        npartitions=1,
        extra_metadata={'sklearn_version': sklearn_version})
    return self._schema
def _get_schema(self):
    """Build the dask dataframe on first use and summarise its layout."""
    if self._df is None:
        self._df = self._to_dask()
    # _meta exposes dtypes without computing; stringify for portability.
    dtypes = dict((col, str(dt))
                  for col, dt in self._df._meta.dtypes.items())
    self._schema = Schema(datashape=None,
                          dtype=dtypes,
                          shape=(None, len(self._df.columns)),
                          npartitions=self._df.npartitions,
                          extra_metadata={})
    return self._schema
def __init__(self, url, headers, **kwargs):
    """Remote dataframe proxy: record the schema the server reported.

    Requires ``npartitions``, ``shape``, ``metadata`` and ``dtype`` in
    ``kwargs``; the dataframe itself is fetched lazily.
    """
    super(RemoteDataFrame, self).__init__(url, headers, **kwargs)
    self.npartitions = kwargs['npartitions']
    self.shape = tuple(kwargs['shape'])
    self.metadata = kwargs['metadata']
    self.dtype = kwargs['dtype']
    schema_kwargs = dict(npartitions=self.npartitions,
                         extra_metadata=self.metadata,
                         dtype=self.dtype,
                         shape=self.shape,
                         datashape=None)
    self._schema = Schema(**schema_kwargs)
    self.dataframe = None
def _get_schema(self):
    """Open the dataset on first use and report its column dtypes."""
    if self._dataframe is None:
        self._open_dataset()
    n_cols = len(self._dtypes)
    return Schema(
        datashape=None,
        dtype=self._dtypes,
        shape=(None, n_cols),
        npartitions=1,
        extra_metadata={},
    )
def _get_schema(self): """Make schema object, which embeds iris object and some details""" if self._ds is None: self._open_dataset() metadata = {} self._schema = Schema(datashape=None, dtype=None, shape=None, npartitions=None, extra_metadata=metadata) return self._schema
def _get_schema(self) -> Schema:
    """Open the dataset if needed and return a placeholder schema."""
    if self._ds is None:
        self._open_dataset()
    # Empty structural summary: no dims, variables or coords reported.
    empty_summary = {'dims': {}, 'data_vars': {}, 'coords': ()}
    self._schema = Schema(
        datashape=None,
        dtype=None,
        shape=None,
        npartitions=None,
        extra_metadata=empty_summary,
    )
    return self._schema
def _get_schema(self):
    """Describe the BAM alignment table; chrom becomes a categorical."""
    if self._af is None:
        self._open_dataset()
    chrom_names = list(self._af.references)
    # "NULL" is reserved as the sentinel category for unmapped reads,
    # so it must not collide with a real reference name.
    assert "NULL" not in chrom_names
    dtype = BamEntryDf.DTYPE.copy()
    dtype["chrom"] = pd.CategoricalDtype(chrom_names + ["NULL"],
                                         ordered=True)
    self._dtype = dtype
    return Schema(datashape=None,
                  dtype=dtype,
                  shape=(None, len(dtype)),
                  npartitions=None,
                  extra_metadata={})
def _get_schema(self):
    """Lazily build a chunked dask array over the FITS data.

    On first call this opens the file(s), reads the header, dtype, shape
    and WCS of the selected HDU from the first file, and constructs a
    dask task graph whose tasks each read one section of the data.
    Multi-file sets gain a leading axis of length len(files).
    """
    from dask.bytes import open_files
    import dask.array as da
    from dask.base import tokenize
    url = self._get_cache(self.url)[0]
    if self.arr is None:
        self.files = open_files(url, **self.storage_options)
        # Header/dtype/shape/WCS are taken from the first file only;
        # presumably all files of a set share them — TODO confirm.
        self.header, self.dtype, self.shape, self.wcs = _get_header(
            self.files[0], self.ext)
        name = 'fits-array-' + tokenize(url, self.chunks, self.ext)
        # Requested chunking; default is one chunk spanning each axis.
        ch = self.chunks if self.chunks is not None else self.shape
        chunks = []
        for c, s in zip(ch, self.shape):
            # Split axis of size s into chunks of size c, with a smaller
            # trailing chunk when s is not a multiple of c.
            num = s // c
            part = [c] * num
            if s % c:
                part.append(s % c)
            chunks.append(tuple(part))
        # Cumulative chunk boundaries per axis, used to build slices.
        cums = tuple((0, ) + tuple(accumulate(ch)) for ch in chunks)
        dask = {}
        if len(self.files) > 1:
            # multi-file set: prepend a files axis, one chunk per file.
            self.shape = (len(self.files), ) + self.shape
            chunks.insert(0, (1, ) * len(self.files))
            inds = tuple(range(len(ch)) for ch in chunks)
            for (fi, *bits) in product(*inds):
                slices = tuple(slice(i[bit], i[bit + 1])
                               for (i, bit) in zip(cums, bits))
                dask[(name, fi) + tuple(bits)] = (
                    _get_section, self.files[fi], self.ext, slices, False
                )
        else:
            # single-file set
            inds = tuple(range(len(ch)) for ch in chunks)
            for bits in product(*inds):
                slices = tuple(slice(i[bit], i[bit+1])
                               for (i, bit) in zip(cums, bits))
                dask[(name,) + bits] = (
                    _get_section, self.files[0], self.ext, slices, True
                )
        self.arr = da.Array(dask, name, chunks, dtype=self.dtype,
                            shape=self.shape)
    # Expose the header key/values as schema metadata.
    self._schema = Schema(
        dtype=self.dtype,
        shape=self.shape,
        extra_metadata=dict(self.header.items()),
        npartitions=self.arr.npartitions,
        chunks=self.arr.chunks
    )
    return self._schema
def _parse_open_response(self, response):
    """Record schema details from the server's 'open' response.

    Parameters
    ----------
    response : dict
        Server payload with keys 'dtype', 'shape', 'npartitions',
        'metadata' and 'source_id'.
    """
    dtype_descr = response['dtype']
    if isinstance(dtype_descr, list):
        # Reformat because NumPy needs list of tuples
        dtype_descr = [tuple(x) for x in response['dtype']]
    self.dtype = dtype_descr
    self.shape = tuple(response['shape'] or ())
    self.npartitions = response['npartitions']
    self.metadata = response['metadata']
    # FIX: the metadata was passed as `metadata=`, while every other
    # Schema construction in this codebase uses `extra_metadata=`;
    # readers of schema.extra_metadata would not see it otherwise.
    self._schema = Schema(datashape=None, dtype=self.dtype,
                          shape=self.shape,
                          npartitions=self.npartitions,
                          extra_metadata=self.metadata)
    self._source_id = response['source_id']
def _get_schema(self):
    """One partition per reference sequence; record name/length pairs."""
    if self._dataset is None:
        self._open_dataset()
    self._chroms = list(self._dataset.references)
    chrom_lengths = [
        {"chrom": ref, "length": ln}
        for ref, ln in zip(self._dataset.references, self._dataset.lengths)
    ]
    return Schema(
        datashape=None,
        dtype=None,
        shape=None,
        npartitions=len(self._chroms),
        extra_metadata={"chroms": chrom_lengths},
    )
def _get_schema(self): """Make schema object, which embeds xarray object and some details""" from intake.source.base import Schema self.urlpath = self._get_cache(self.urlpath)[0] if self._ds is None: self._open_dataset() metadata = {} self._schema = Schema( datashape=None, dtype=None, shape=None, npartitions=None, extra_metadata=metadata ) return self._schema
def _get_schema(self):
    """Execute the query once to discover column names and dtypes.

    Caches the full result as ``self._dataframe`` and its dtypes as
    ``self._dtypes`` so subsequent calls are free.
    """
    if self._dtypes is None:
        cursor = self._make_cursor()
        try:
            records = cursor.fetchall()
            columns = [d.name for d in cursor.description]
            self._dataframe = pandas.DataFrame.from_records(
                records, columns=columns)
            self._dtypes = self._dataframe.dtypes
        finally:
            # Close even when fetch/parse fails so the connection's
            # cursor is not leaked.
            cursor.close()
    return Schema(
        # FIX: was the literal string "datashape" — a placeholder left
        # in by mistake; every other source passes None here.
        datashape=None,
        dtype=self._dtypes,
        shape=(None, len(self._dtypes)),
        npartitions=1,
        extra_metadata={},
    )
def _get_schema(self):
    """Summarise the xarray dataset: dims, variables and coordinates."""
    if self._ds is None:
        self._open_dataset()
    ds = self._ds
    summary = dict(
        dims=dict(ds.dims),
        data_vars={v: list(ds[v].coords) for v in ds.data_vars.keys()},
        coords=tuple(ds.coords.keys()),
    )
    self._schema = Schema(
        datashape=None,
        dtype=None,
        shape=None,
        npartitions=None,
        extra_metadata=summary,
    )
    return self._schema