Пример #1
0
 def to_orc(df: ddDataFrame, filepath: str, overwrite: bool = True,
            **kwargs) -> None:
     """
     Write a dataframe to `filepath` through its `to_orc` method

     :param df: the dataframe to serialize
     :param filepath: destination handed to `df.to_orc`
     :param overwrite: when False and a file already exists at `filepath`,
         nothing is written and the existing file is left untouched
     :param kwargs: forwarded unchanged to `df.to_orc`
     """
     # A previously serialized file only blocks the write when overwriting
     # has been explicitly disabled
     already_serialized = not overwrite and isfile(filepath)
     if already_serialized:
         return
     df.to_orc(filepath, **kwargs)
Пример #2
0
 def to_csv(cls, df: ddDataFrame, filepath: str, overwrite: bool = True,
            **kwargs) -> None:
     """
     Write a dataframe to `filepath` as CSV, labelling the index column
     with `cls.INDEX_COLUMN`

     :param df: the dataframe to serialize
     :param filepath: destination handed to `df.to_csv`
     :param overwrite: when False and a file already exists at `filepath`,
         nothing is written and the existing file is left untouched
     :param kwargs: forwarded unchanged to `df.to_csv`
     """
     # A previously serialized file only blocks the write when overwriting
     # has been explicitly disabled
     already_serialized = not overwrite and isfile(filepath)
     if already_serialized:
         return
     df.to_csv(filepath, index_label=cls.INDEX_COLUMN, **kwargs)
Пример #3
0
 def to_json(cls,
             df: ddDataFrame,
             filepath: str,
             overwrite: bool = True,
             **kwargs) -> None:
     """
     Serialize a dataframe to JSON, making sure the index is written as the
     `cls.INDEX_COLUMN` column

     :param df: the dataframe to serialize
     :param filepath: destination handed to `to_json`
     :param overwrite: when False and a file already exists at `filepath`,
         nothing is written and the existing file is left untouched
     :param kwargs: forwarded unchanged to `to_json`
     """
     if not overwrite and isfile(filepath):
         # File was already serialized
         return

     if cls.INDEX_COLUMN in df.columns:
         # The index column is already part of the data, write as is
         df.to_json(filepath, **kwargs)
         return

     # json records do not include the index, so artificially inject it as a
     # column. The injected column is named after the index when the index has
     # a name (and "index" otherwise), so detect it instead of assuming "index"
     original_columns = set(df.columns)
     flattened = df.reset_index(drop=False)
     injected = [
         col for col in flattened.columns if col not in original_columns
     ]
     # Only rename when the injected column is unambiguous; several injected
     # columns cannot all be mapped onto the single INDEX_COLUMN name
     if len(injected) == 1:
         flattened = flattened.rename(columns={injected[0]: cls.INDEX_COLUMN})
     flattened.to_json(filepath, **kwargs)
Пример #4
0
 def _dataframe(self, df: ddDataFrame) -> None:
     """
     Setter method for self._external_file
     Persists dask computations to new futures so retrieval calls do not
     reexecute the computation graph
     """
     self._external_file = df.persist()
Пример #5
0
    def _get(dataframe: ddDataFrame, columns: List[str],
             split: str) -> ddDataFrame:
        """
        Internal method to extract data subsets from a dataframe

        :param dataframe: the dataframe to subset from
        :param columns: List of columns to slice from the dataframe
        :param split: row identifiers to slice rows (in internal column mapped to `DATAFRAME_SPLIT_COLUMN`)
        """
        if split is not None:  # Return the full dataset (all splits) - already a copy
            # query automatically returns a copy wisth a weakref
            if DATAFRAME_SPLIT_COLUMN not in dataframe.columns:
                raise DatasetError(
                    f"Cannot retrieve dataset split `{split}` from dataframe without `{DATAFRAME_SPLIT_COLUMN}` column"
                )
            dataframe = dataframe.query("{}=='{}'".format(
                DATAFRAME_SPLIT_COLUMN, split))

        # drop extra columns
        drop_columns = [col for col in dataframe.columns if col not in columns]
        if drop_columns:
            dataframe = dataframe.drop(drop_columns, axis=1)

        return dataframe
Пример #6
0
 def to_sql(df: ddDataFrame, **kwargs) -> None:
     """
     Write a dataframe to a SQL destination through its `to_sql` method

     :param df: the dataframe to serialize
     :param kwargs: forwarded unchanged to `df.to_sql`; the destination is
         defined entirely by these keyword arguments
     """
     writer = df.to_sql
     writer(**kwargs)
Пример #7
0
 def squeeze_dataframe(df: ddDataFrame) -> ddSeries:
     """
     Helper method to squeeze a dataframe along its column axis

     :param df: the dataframe to squeeze
     :return: the result of `df.squeeze(axis=1)` (a series is expected)
     """
     squeezed = df.squeeze(axis=1)
     return squeezed