if extension in ("xlsx", "xltx"): return load_workbook(f) raise Exception( f"Deserialization: file extension {extension} is not supported by openpyxl_workbook type." ) def copy(self, data): return self.from_bytes(self.as_bytes(data)[0]) def data_characteristics(self, data): return dict( description=f"Excel workbook with {len(data.sheetnames)} sheets.") OPENPYXL_WORKBOOK_STATE_TYPE = OpenpyxlWorkbookStateType() register_state_type(Workbook, OPENPYXL_WORKBOOK_STATE_TYPE) @command def workbook(data, index=True, header=True, context=None): """Convert bytes or a dataframe to a workbook""" context = get_context(context) if type(data) == bytes: context.info("Workbook from bytes") return OPENPYXL_WORKBOOK_STATE_TYPE.from_bytes(data) elif isinstance(data, pd.DataFrame): context.info("Workbook from pandas DataFrame") wb = Workbook() ws = wb.active for r in dataframe_to_rows(df, index=index, header=header): ws.append(r)
) def data_characteristics(self, data): s = StringIO() summary=keras_summary(data) return dict(description=f"Keras model", summary=summary) def copy(self, data): print ("KERAS MODEL COPY") model = clone_model(data) model.set_weights(data.get_weights()) return model KERASMODEL_STATE_TYPE = KerasModelStateType() register_state_type(Model, KERASMODEL_STATE_TYPE) register_state_type(Functional, KERASMODEL_STATE_TYPE) @command def keras_plot_model( model, show_shapes: bool = False, show_layer_names: bool = True, rankdir: str = "TB", expand_nested: bool = False, dpi: int = 96, ): "Keras plot model as png" print("************* keras_plot_model") assert isinstance(model, Model)
return StoredDataframeIterator.from_dict( json.loads(b.decode("utf-8"))) raise Exception( f"Deserialization: file extension {extension} is not supported by dataframe type." ) def copy(self, data): return data.copy() def data_characteristics(self, data): return dict(description= f"Dataframe iterator with {len(data.item_keys)} batches.") STORED_DATAFRAME_ITERATOR_STATE_TYPE = StoredDataframeIteratorStateType() register_state_type(StoredDataframeIterator, STORED_DATAFRAME_ITERATOR_STATE_TYPE) def _store_batches(idf, key, max_batches=None, context=None): """Store iterator of dataframes (batches) in a store. The key specifies a directory in the store where the items will be stored. Helper function yielding StoredDataframeIterator object and dataframes. """ context = get_context(context) context.info(f"Store iterator") batch_number = 0 if max_batches in ("0", "", None): max_batches = 0 else: max_batches = int(max_batches) store = context.store()
if extension in ("pickle", "pkl"): return pickle.loads(b) raise Exception( f"Deserialization: file extension {extension} is not supported by Matplotlib Figure type." ) def copy(self, data): return data def data_characteristics(self, data): return dict(description=f"Matplotlib figure") MATPLOTLIB_FIGURE_STATE_TYPE = MatplotlibFigureStateType() register_state_type(plt.Figure, MATPLOTLIB_FIGURE_STATE_TYPE) @command def mpl(state, *series): """Matplotlib chart""" fig = plt.figure(figsize=(8, 6), dpi=300) axis = fig.add_subplot(1, 1, 1) series = list(reversed(list(series))) df = state.get() extension = None while len(series): t = series.pop() if t in ["jpg", "png", "svg"]: extension = t
if extension == "csv": return hxl.data(f) raise Exception( f"Deserialization: file extension {extension} is not supported by HXL dataset type." ) def copy(self, data): """Make a deep copy of the data""" return data def data_characteristics(self, data): return dict(description=f"HXL dataset") HXL_DATASET_STATE_TYPE = HxlStateType() register_state_type(hxl.Dataset, HXL_DATASET_STATE_TYPE) register_state_type(hxl.io.HXLReader, HXL_DATASET_STATE_TYPE) @first_command def hxl_from(url): """Load data from URL""" return hxl.data(url) @command def hxl2df(data): """Convert hxl dataset to pandas dataframe""" f = BytesIO() for line in data.gen_csv(show_headers=True, show_tags=True): f.write(line.encode("utf-8"))
f"Deserialization: file extension {extension} is not supported by dataframe type." ) def copy(self, data): return data.copy() def data_characteristics(self, data): return dict(description=f"Dataframe with {len(data.columns)} columns and {len(data)} rows.", columns=[str(c) for c in data.columns], number_of_columns = len(data.columns), number_of_rows = len(data), ) DATAFRAME_STATE_TYPE = DataframeStateType() register_state_type(pd.DataFrame, DATAFRAME_STATE_TYPE) @command def to_df(data): "Convert data to DataFrame; data should be list of dicts or dict of lists." return pd.DataFrame(data) @first_command def df_from(url, extension=None): """Load data from URL""" if extension is None: extension = url.split(".")[-1] if extension not in "csv tsv xls xlsx msgpack".split(): extension = "csv"
f"Serialization: file extension {extension} is not supported by DataFusion data-frame type." ) def from_bytes(self, b: bytes, extension=None): raise Exception( f"Deserialization is not supported by DataFusion data-frame type.") def copy(self, data): return self.from_bytes(self.as_bytes(data)[0]) def data_characteristics(self, data): return dict(description=f"DataFusion data-frame") DATAFUSION_DATAFRAME_STATE_TYPE = DatafusionDataframeStateType() register_state_type(daf.DataFrame, DATAFUSION_DATAFRAME_STATE_TYPE) class DatafusionContextStateType(StateType): def identifier(self): return "datafusion_context" def default_extension(self): return "pickle" def is_type_of(self, data): return isinstance(data, daf.ExecutionContext) def as_bytes(self, data, extension=None): raise Exception( f"Serialization is not supported by DataFusion ExecutionContext type."
return pl.read_csv(f) elif extension == "parquet": return pl.read_parquet(f) raise Exception( f"Deserialization: file extension {extension} is not supported by polars data-frame type." ) def copy(self, data): return self.from_bytes(self.as_bytes(data)[0]) def data_characteristics(self, data): return dict(description=f"Polars data-frame with {len(data.columns)} and {len(data)} rows.") POLARS_DATAFRAME_STATE_TYPE = PolarsDataframeStateType() register_state_type(pl.DataFrame, POLARS_DATAFRAME_STATE_TYPE) @command def polars_df(data, extension=None, context=None): """Convert bytes or a dataframe to a workbook""" context=get_context(context) if type(data)==bytes: context.info(f"Polars data-frame from bytes. Extension:'{extension}'") return POLARS_DATAFRAME_STATE_TYPE.from_bytes(data, extension=extension) elif isinstance(data,pd.DataFrame): context.info("Polars data-frame from Pandas data-frame") return pl.DataFrame(data) elif isinstance(data,pl.DataFrame): context.info("Polars data-frame kept as it is") return data raise Exception(f"Unsupported polars dataframe type: {type(data)}")
) else: raise Exception( f"Deserialization: file extension {extension} is not supported by PIL Image type." ) def copy(self, data): return data.copy() def data_characteristics(self, data): width, height = data.size return dict(description=f"Image {width}x{height}") PIL_IMAGE_STATE_TYPE = PILImageStateType() register_state_type(PIL.Image.Image, PIL_IMAGE_STATE_TYPE) @command(ns="pil") def resize(image, width, height, resample=None): """Resize image""" resample = dict( nearest=PIL.Image.NEAREST, box=PIL.Image.BOX, bilinear=PIL.Image.BILINEAR, hamming=PIL.Image.HAMMING, bicubic=PIL.Image.BICUBIC, lanczos=PIL.Image.LANCZOS, ).get(str(resample).lower()) return image.copy().resize((int(width), int(height)), resample=resample)