def discover_dataframe(df):
    """Return the datashape of ``df`` as ``len(df) * Record``.

    Every entry of ``df.dtypes`` is converted with
    ``datashape.CType.from_numpy_dtype``. A converted type equal to
    ``datashape.coretypes.object_`` is reported as ``datashape.string``.
    Field names are taken from ``df.columns`` in order.
    """
    object_type = datashape.coretypes.object_
    fields = []
    for name, np_dtype in zip(list(df.columns), df.dtypes):
        measure = datashape.CType.from_numpy_dtype(np_dtype)
        if measure == object_type:
            # Object columns are described as strings in the schema.
            measure = datashape.string
        fields.append((name, measure))
    schema = datashape.Record(fields)
    return len(df) * schema
def discover_dataframe(df):
    """Return the datashape of ``df`` as ``len(df) * Record``.

    Every entry of ``df.dtypes`` is converted with
    ``datashape.CType.from_numpy_dtype``. A converted type equal to
    ``object_`` becomes ``string``, and any resulting type found in
    ``possibly_missing`` is wrapped in ``Option``.
    """
    fields = []
    for name, np_dtype in zip(list(df.columns), df.dtypes):
        measure = datashape.CType.from_numpy_dtype(np_dtype)
        if measure == object_:
            measure = string
        # The membership check runs on the type *after* the string swap.
        if measure in possibly_missing:
            measure = Option(measure)
        fields.append((name, measure))
    schema = datashape.Record(fields)
    return len(df) * schema
def discover_sqlcontext(ctx):
    """Return a DataShape holding one Record field per table in ``ctx``.

    Table names come from ``ctx.tableNames()``. If that raises
    ``AttributeError``, they are read from
    ``ctx._ssql_ctx.catalog().tables().keySet()`` and converted with
    ``scala_set_to_set``. Names are sorted, and each table is described by
    calling ``discover`` on ``ctx.table(name)``.
    """
    try:
        names = [str(name) for name in ctx.tableNames()]
    except AttributeError:
        key_set = ctx._ssql_ctx.catalog().tables().keySet()
        names = list(scala_set_to_set(ctx, key_set))
    ordered_names = sorted(names)
    fields = [(name, discover(ctx.table(name))) for name in ordered_names]
    return datashape.DataShape(datashape.Record(fields))
def dshape_from_dask(df):
    """Build a ``datashape.var * Record`` description of a dask dataframe.

    Categorical columns whose ``.cat.known`` is falsy are first passed to
    ``df.categorize(..., index=False)``. Each column is then described by
    ``dshape_from_pandas_helper``.
    """
    categorical_types = (
        type(pd.Categorical.dtype),
        pd.api.types.CategoricalDtype,
    )

    def _needs_categorize(series):
        # ``.cat`` is only touched once the dtype check has passed.
        if not isinstance(series.dtype, categorical_types):
            return False
        return not series.cat.known

    unknown_cats = [name for name in df.columns if _needs_categorize(df[name])]
    df = df.categorize(unknown_cats, index=False)
    fields = [(name, dshape_from_pandas_helper(df[name])) for name in df.columns]
    return datashape.var * datashape.Record(fields)
def dshape_from_dask(df):
    """Build a ``datashape.var * Record`` description of a dask dataframe.

    Categorical columns whose ``.cat`` accessor reports a falsy ``known``
    (a missing ``known`` attribute is treated as ``True``) are first passed
    to ``df.categorize(..., index=False)``. Each column is then described by
    ``dshape_from_pandas_helper`` applied to its first partition.
    """
    categorical_types = (
        type(pd.Categorical.dtype),
        pd.api.types.CategoricalDtype,
    )

    def _needs_categorize(series):
        # ``.cat`` is only touched once the dtype check has passed.
        if not isinstance(series.dtype, categorical_types):
            return False
        return not getattr(series.cat, 'known', True)

    unknown_cats = [name for name in df.columns if _needs_categorize(df[name])]
    df = df.categorize(unknown_cats, index=False)

    fields = []
    for name in df.columns:
        # Only partition 0 is inspected because categories are sometimes
        # repeated for dask-cudf DataFrames with multiple partitions.
        first_partition = df[name].get_partition(0)
        fields.append((name, dshape_from_pandas_helper(first_partition)))
    return datashape.var * datashape.Record(fields)
def discover_h5py_dataset(d):
    """Return the datashape of ``d`` with object measures shown as strings.

    The base shape comes from ``datashape.from_numpy(d.shape, d.dtype)``.
    For record measures, ``record_dshape_replace`` swaps
    ``datashape.object_`` for ``datashape.string`` in the fields. For
    non-record measures, a datashape equal to ``datashape.object_`` is
    rebuilt with a ``datashape.string`` measure; anything else is returned
    unchanged.
    """
    dshape = datashape.from_numpy(d.shape, d.dtype)
    dims = dshape.shape
    measure = dshape.measure

    if isrecord(measure):
        fields = list(
            record_dshape_replace(measure, datashape.object_, datashape.string))
        return DataShape(*(dims + (datashape.Record(fields), )))

    # NOTE(review): this compares the whole datashape, not just ``measure``,
    # against ``object_`` -- confirm that is intended for non-scalar datasets.
    if dshape == datashape.object_:
        return DataShape(*(dims + (datashape.string, )))
    return dshape
def get_datashape(odo_resource):
    """Discover ``odo_resource`` and return ``ds.var * Record`` of Options.

    The datashape reported by ``odo.discover(odo_resource, **odo_args)`` is
    rendered to text, and the segment between the first and second ``*`` is
    rewritten into a dict literal mapping field name to type name. Each type
    has ``?`` removed, ``"bool"`` is replaced by ``"int32"``, and the result
    is wrapped in ``ds.Option``.

    Parameters
    ----------
    odo_resource
        Anything accepted by ``odo.discover``.

    Returns
    -------
    ``ds.var * ds.Record([...])`` with one ``[name, ds.Option(type)]`` entry
    per discovered field, in the order the fields appear in the text.

    Raises
    ------
    IndexError
        If the rendered datashape contains no ``*``.
    ValueError, SyntaxError
        If the rewritten text is not a plain dict literal.
    """
    import ast

    discovered = odo.discover(odo_resource, **odo_args)

    # ``split()`` + ``join`` strips every whitespace character, so no further
    # space removal is needed before quoting names and types.
    compact = ''.join(str(discovered).split("*")[1].split())
    quoted = (compact
              .replace(":", "\":\"")
              .replace(",", "\",\"")
              .replace("{", "{\"")
              .replace("}", "\"}"))

    # SECURITY: the text derives from the discovered resource (e.g. column
    # names), so it is parsed as a literal instead of being run through
    # ``eval``, which would execute any expression smuggled into a name.
    type_by_field = ast.literal_eval(quoted)

    field_names = [
        item.split(":")[0].replace("\"", "")
        for item in quoted.replace("{", "").replace("}", "").split(",")
    ]

    fields = []
    for name in field_names:
        type_name = type_by_field[name].strip().replace("?", "")
        if type_name == "bool":
            type_name = "int32"
        fields.append([name, ds.Option(type_name)])
    return ds.var * ds.Record(fields)
def dshape_from_xarray_dataset(xr_ds):
    """Build a ``datashape.var * Record`` description of an xarray Dataset.

    Fields are listed in the order of ``xr_ds.data_vars`` followed by
    ``xr_ds.coords``; each is described by ``dshape_from_pandas_helper``.
    """
    field_names = list(xr_ds.data_vars)
    field_names = field_names + list(xr_ds.coords)
    fields = []
    for name in field_names:
        fields.append((name, dshape_from_pandas_helper(xr_ds[name])))
    return datashape.var * datashape.Record(fields)
def dshape_from_pandas(df):
    """Build a ``len(df) * Record`` datashape for a pandas dataframe.

    One field is produced per entry of ``df.columns``, described by
    ``dshape_from_pandas_helper`` applied to that column.
    """
    row_count = len(df)
    fields = []
    for column in df.columns:
        fields.append((column, dshape_from_pandas_helper(df[column])))
    return row_count * datashape.Record(fields)
def out_dshape(self, input_dshape):
    """Return a Record datashape with one field per value in ``range(self.modulo)``.

    Every field shares the datashape reported by
    ``self.reduction.out_dshape(input_dshape)``.
    """
    categories = range(self.modulo)
    per_category = self.reduction.out_dshape(input_dshape)
    record = datashape.Record([(cat, per_category) for cat in categories])
    return datashape.util.dshape(record)
def discover_sqlcontext(ctx):
    """Return a DataShape holding one Record field per table in ``ctx``.

    Names from ``ctx.tableNames()`` are converted to ``str`` and sorted;
    each table is described by calling ``discover`` on ``ctx.table(name)``.
    """
    names = [str(name) for name in ctx.tableNames()]
    names.sort()
    fields = [(name, discover(ctx.table(name))) for name in names]
    record = datashape.Record(fields)
    return datashape.DataShape(record)
def dshape_from_pandas(df):
    """Build a ``len(df) * Record`` datashape for ``df``.

    One field is produced per entry of ``df.columns``, described by
    ``dshape_from_pandas_helper`` applied to that column.
    """
    row_count = len(df)
    fields = []
    for column in df.columns:
        fields.append((column, dshape_from_pandas_helper(df[column])))
    return row_count * datashape.Record(fields)
def discover_dataframe(df):
    """Build a ``len(df) * Record`` datashape for ``df``.

    One field is produced per entry of ``df.columns``, described by
    ``dshape_from_pandas`` applied to that column.
    """
    row_count = len(df)
    fields = []
    for column in df.columns:
        fields.append((column, dshape_from_pandas(df[column])))
    return row_count * datashape.Record(fields)
def discover_dataframe(df):
    """Build a ``len(df) * Record`` datashape for ``df``.

    Entries of ``df.columns`` are paired positionally with entries of
    ``df.dtypes``; each dtype is described by ``dshape_from_pandas``.
    """
    row_count = len(df)
    fields = [
        (column, dshape_from_pandas(dtype))
        for column, dtype in zip(df.columns, df.dtypes)
    ]
    return row_count * datashape.Record(fields)
def discover(t):
    """Return ``t.shape[0] * Record`` with one field per name in ``t.colnames``.

    Each field is described by calling ``discover`` on the attribute of
    ``t.cols`` with that name.
    """
    row_count = t.shape[0]
    fields = []
    for col in t.colnames:
        column = getattr(t.cols, col)
        fields.append([col, discover(column)])
    return row_count * ds.Record(fields)