def to_orc(df: ddDataFrame, filepath: str, overwrite: bool = True, **kwargs) -> None:
    """
    Serialize a dataframe to ORC at the given path

    :param df: the dataframe to write
    :param filepath: destination path handed to `df.to_orc`
    :param overwrite: when False, the write is skipped if a file already
        exists at `filepath`
    :param kwargs: extra keyword arguments forwarded to `df.to_orc`
    """
    # NOTE(review): `isfile` is False for directories, so a directory already
    # present at `filepath` does not count as "already serialized" - confirm
    # this matches the layout the writer produces.
    already_serialized = not overwrite and isfile(filepath)
    if already_serialized:
        return

    df.to_orc(filepath, **kwargs)
def to_csv(cls, df: ddDataFrame, filepath: str, overwrite: bool = True, **kwargs) -> None:
    """
    Serialize a dataframe to CSV, labelling the index with `cls.INDEX_COLUMN`

    :param df: the dataframe to write
    :param filepath: destination path handed to `df.to_csv`
    :param overwrite: when False, the write is skipped if a file already
        exists at `filepath`
    :param kwargs: extra keyword arguments forwarded to `df.to_csv`
    """
    # NOTE(review): `isfile` is False for directories, so a directory already
    # present at `filepath` does not count as "already serialized" - confirm
    # this matches the layout the writer produces.
    already_serialized = not overwrite and isfile(filepath)
    if already_serialized:
        return

    df.to_csv(filepath, index_label=cls.INDEX_COLUMN, **kwargs)
def to_json(cls, df: ddDataFrame, filepath: str, overwrite: bool = True, **kwargs) -> None:
    """
    Serialize a dataframe to JSON, making sure `cls.INDEX_COLUMN` is present

    JSON records do not carry the dataframe index, so when the dataframe has
    no `cls.INDEX_COLUMN` column the index is materialized as a column and
    renamed to `cls.INDEX_COLUMN` before writing.

    :param df: the dataframe to write
    :param filepath: destination path handed to `to_json`
    :param overwrite: when False, the write is skipped if a file already
        exists at `filepath`
    :param kwargs: extra keyword arguments forwarded to `to_json`
    """
    # NOTE(review): `isfile` is False for directories, so a directory already
    # present at `filepath` does not count as "already serialized" - confirm
    # this matches the layout the writer produces.
    if not overwrite and isfile(filepath):
        return

    if cls.INDEX_COLUMN in df.columns:
        # Index column already materialized - write as is
        df.to_json(filepath, **kwargs)
        return

    with_index = df.reset_index(drop=False)
    # `reset_index` inserts the former index as the first column. Its label is
    # the index name when one is set, otherwise "index" (or "level_0" if an
    # "index" column already exists), so rename by position instead of
    # assuming the label is "index".
    injected_label = with_index.columns[0]
    with_index.rename(columns={
        injected_label: cls.INDEX_COLUMN
    }).to_json(filepath, **kwargs)
def _dataframe(self, df: ddDataFrame) -> None:
    """
    Setter method for self._external_file

    The dataframe is persisted before being stored, so the dask computations
    are held as new futures and later retrieval calls do not reexecute the
    computation graph.

    :param df: the dataframe to persist and store
    """
    persisted = df.persist()
    self._external_file = persisted
def _get(dataframe: ddDataFrame, columns: List[str], split: str) -> ddDataFrame:
    """
    Internal method to extract a subset of rows and columns from a dataframe

    :param dataframe: the dataframe to subset from
    :param columns: List of columns to keep; every other column is dropped
    :param split: row identifier used to select rows whose
        `DATAFRAME_SPLIT_COLUMN` value equals it. When None, no row filtering
        is applied and all splits are kept
    :raises DatasetError: if a split is requested but the dataframe has no
        `DATAFRAME_SPLIT_COLUMN` column
    """
    subset = dataframe

    if split is not None:
        # Row filtering needs the split column to be present
        if DATAFRAME_SPLIT_COLUMN not in subset.columns:
            raise DatasetError(
                f"Cannot retrieve dataset split `{split}` from dataframe without `{DATAFRAME_SPLIT_COLUMN}` column"
            )
        row_filter = "{}=='{}'".format(DATAFRAME_SPLIT_COLUMN, split)
        subset = subset.query(row_filter)

    # Anything not explicitly requested is dropped
    unwanted = [name for name in subset.columns if name not in columns]
    return subset.drop(unwanted, axis=1) if unwanted else subset
def to_sql(df: ddDataFrame, **kwargs) -> None:
    """
    Write a dataframe through its own `to_sql` method

    :param df: the dataframe to write
    :param kwargs: keyword arguments forwarded unchanged to `df.to_sql`
    """
    write = df.to_sql
    write(**kwargs)
def squeeze_dataframe(df: ddDataFrame) -> ddSeries:
    """
    Helper method to squeeze a dataframe along its column axis

    :param df: the dataframe to squeeze
    :return: the result of `df.squeeze(axis=1)`
    """
    squeezed = df.squeeze(axis=1)
    return squeezed