def store_dataframes(
    self,
    store: StoreInput,
    dataset_uuid: str,
    df_serializer: Optional[DataFrameSerializer] = None,
) -> "MetaPartition":
    """
    Stores all dataframes of the MetaPartition and registers the saved
    file under the `file` attribute. The dataframe itself is deleted from
    memory.

    Parameters
    ----------
    store
        If it is a function, the result of calling it must be a KeyValueStore.
    dataset_uuid
        The dataset UUID the partition will be assigned to.
    df_serializer
        Serializer to be used to store the dataframe.

    Returns
    -------
    MetaPartition
    """
    df_serializer = (
        df_serializer if df_serializer is not None else default_serializer()
    )
    key = get_partition_file_prefix(
        partition_label=self.label,
        dataset_uuid=dataset_uuid,
        metadata_version=self.metadata_version,
        table=self.table_name,
    )
    if self.data is not None:
        df = self.data
        try:
            file = df_serializer.store(store, key, df)
        except Exception as exc:
            # Log diagnostics about the failing dataframe, but always
            # re-raise the original exception.
            try:
                if isinstance(df, pd.DataFrame):
                    buf = io.StringIO()
                    df.info(buf=buf)
                    LOGGER.error(
                        "Writing dataframe failed.\n%s\n%s\n%s",
                        exc,
                        buf.getvalue(),
                        df.head(),
                    )
                else:
                    LOGGER.error("Storage of dask dataframe failed.")
            finally:
                raise

        new_metapartition = self.copy(file=file, data=None)

        return new_metapartition
    else:
        return self
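# The sketch below is illustrative only and not part of the module: it shows
# the intended round trip of `store_dataframes`, assuming that `MetaPartition`
# accepts `label` and `data` keyword arguments matching the attributes used
# above and that a minimalkv in-memory store is available. The label and
# dataset UUID are hypothetical.


def _example_store_single_partition():  # pragma: no cover - usage sketch
    import pandas as pd
    from minimalkv.memory import DictStore

    store = DictStore()
    mp = MetaPartition(label="part_0", data=pd.DataFrame({"x": [1, 2, 3]}))
    stored = mp.store_dataframes(store=store, dataset_uuid="example_dataset")
    assert stored.data is None  # the dataframe is released after writing
    assert stored.file in store  # the serialized key is registered on the copy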
def _maybe_infer_files_attribute(metapartition, dataset_uuid):
    new_mp = metapartition.as_sentinel()
    for mp in metapartition:
        if len(mp.files) == 0:
            if mp.data is None or len(mp.data) == 0:
                raise ValueError(
                    "Trying to commit partitions without `data` or `files` "
                    "information. Either one is necessary to infer the "
                    "dataset tables."
                )
            new_files = {}
            for table in mp.data:
                new_files[table] = (
                    get_partition_file_prefix(
                        dataset_uuid=dataset_uuid,
                        partition_label=mp.label,
                        table=table,
                        metadata_version=mp.metadata_version,
                    )
                    + PARQUET_FILE_SUFFIX  # noqa: W503 line break before binary operator
                )
            mp = mp.copy(files=new_files)

        new_mp = new_mp.add_metapartition(mp)
    return new_mp
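# Illustrative sketch (not part of the module): committing a partition that
# carries `data` but no `files`. It assumes the layout this helper expects,
# where `data` maps table names to dataframes; the table name, label, and
# dataset UUID are hypothetical.


def _example_infer_files():  # pragma: no cover - usage sketch
    import pandas as pd

    mp = MetaPartition(label="part_0", data={"core": pd.DataFrame({"x": [1]})})
    inferred = _maybe_infer_files_attribute(mp, dataset_uuid="example_dataset")
    # Each contained partition now carries a derived storage key per table,
    # i.e. the partition file prefix for "core" plus the parquet suffix.
    assert all(sub.files for sub in inferred)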