Example #1
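Note: this is an excerpt from a MetaPartition method; `io`, `pd` (pandas), `LOGGER`, and the helpers `StoreInput`, `DataFrameSerializer`, `default_serializer`, and `get_partition_file_prefix` are assumed to be imported at module level.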
    def store_dataframes(
        self,
        store: StoreInput,
        dataset_uuid: str,
        df_serializer: Optional[DataFrameSerializer] = None,
    ) -> "MetaPartition":
        """
        Store the dataframe of this MetaPartition and register the saved
        file under the `file` attribute. The dataframe itself is released
        from memory afterwards (`data` is set to `None`).

        Parameters
        ----------
        store
            Store to write to. If a callable is passed, the result of
            calling it must be a KeyValueStore.
        dataset_uuid
            The dataset UUID the partition will be assigned to
        df_serializer
            Serializer used to store the dataframe; defaults to
            `default_serializer()` if not given.

        Returns
        -------
        MetaPartition
        """
        df_serializer = (df_serializer if df_serializer is not None else
                         default_serializer())

        key = get_partition_file_prefix(
            partition_label=self.label,
            dataset_uuid=dataset_uuid,
            metadata_version=self.metadata_version,
            table=self.table_name,
        )
        if self.data is not None:
            df = self.data
            try:
                file = df_serializer.store(store, key, df)
            except Exception as exc:
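                # Best-effort diagnostics: log the frame's schema and head,
                # then re-raise; the ``finally`` below guarantees the failure
                # propagates even if logging itself raises.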
                try:
                    if isinstance(df, pd.DataFrame):
                        buf = io.StringIO()
                        df.info(buf=buf)
                        LOGGER.error(
                            "Writing dataframe failed.\n"
                            "%s\n"
                            "%s\n"
                            "%s",
                            exc,
                            buf.getvalue(),
                            df.head(),
                        )
                    else:
                        LOGGER.error("Storage of dask dataframe failed.")
                finally:
                    raise

            new_metapartition = self.copy(file=file, data=None)

            return new_metapartition
        else:
            return self
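
A minimal, self-contained sketch of the error-handling pattern above: capture
`df.info()` into a buffer (it prints to stdout otherwise) and log it together
with the head of the frame before re-raising. The helper name
`store_with_diagnostics` and the `serializer.store(store, key, df)` signature
are hypothetical stand-ins for the serializer used in the example.

import io
import logging

import pandas as pd

LOGGER = logging.getLogger(__name__)


def store_with_diagnostics(store, key, df, serializer):
    # Hypothetical helper mirroring Example #1: serialize ``df`` under ``key``;
    # on failure, log dtype/memory info plus the first rows, then re-raise.
    try:
        return serializer.store(store, key, df)
    except Exception as exc:
        if isinstance(df, pd.DataFrame):
            buf = io.StringIO()
            df.info(buf=buf)  # redirect the summary into ``buf``
            LOGGER.error(
                "Writing dataframe failed.\n%s\n%s\n%s",
                exc,
                buf.getvalue(),
                df.head(),
            )
        raise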
Example #2
def _maybe_infer_files_attribute(metapartition, dataset_uuid):
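    # ``as_sentinel()`` is assumed to yield an empty MetaPartition that keeps
    # the input's metadata; sub-partitions are re-added one by one below.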
    new_mp = metapartition.as_sentinel()
    for mp in metapartition:
        if len(mp.files) == 0:
            if mp.data is None or len(mp.data) == 0:
                raise ValueError(
                    "Trying to commit partitions without `data` or `files` information. "
                    "Either one is necessary to infer the dataset tables."
                )
            new_files = {}
            for table in mp.data:
                prefix = get_partition_file_prefix(
                    dataset_uuid=dataset_uuid,
                    partition_label=mp.label,
                    table=table,
                    metadata_version=mp.metadata_version,
                )
                new_files[table] = prefix + PARQUET_FILE_SUFFIX
            mp = mp.copy(files=new_files)

        new_mp = new_mp.add_metapartition(mp)
    return new_mp
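
For reference, a sketch of the file key this inference produces. The
`<dataset_uuid>/<table>/<label>` prefix layout and the ".parquet" suffix are
illustrative assumptions; the real values come from `get_partition_file_prefix`
and `PARQUET_FILE_SUFFIX`.

def _example_file_key(dataset_uuid: str, table: str, label: str) -> str:
    # Illustrative stand-in for get_partition_file_prefix(...) + PARQUET_FILE_SUFFIX.
    prefix = f"{dataset_uuid}/{table}/{label}"  # assumed key layout
    return prefix + ".parquet"


assert _example_file_key("my_dataset", "core", "part_0") == "my_dataset/core/part_0.parquet"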