import os
from pathlib import Path

import dask.dataframe as dd


def write_data_by_file_extension(data: dd.DataFrame = None, file_path: Path = None):
    """
    Write a dask dataframe to file, choosing the output format from the
    file extension of the given path.

    :param data: dask dataframe to write
    :param file_path: path of the output file
    :return: None
    """
    # Materialize the dask dataframe as a pandas dataframe before writing.
    data = data.compute()
    map_file_extension_to_write_function = {
        '.csv': 'to_csv',
        '.parquet': 'to_parquet'
    }
    name, extension = os.path.splitext(file_path)
    if extension.lower() in map_file_extension_to_write_function:
        # Look up the matching pandas writer method by name.
        write_function = getattr(
            data, map_file_extension_to_write_function[extension.lower()])
        write_function(file_path, index=False)
    else:
        raise ValueError(f"File extension {extension} not recognized")
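def _example_write_usage():
    # Minimal usage sketch for write_data_by_file_extension (not part of the
    # original module): builds a tiny dask dataframe and writes it in both
    # supported formats. The output file names are hypothetical, and the
    # parquet call assumes pyarrow or fastparquet is installed.
    import pandas as pd

    pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
    ddf = dd.from_pandas(pdf, npartitions=1)
    write_data_by_file_extension(ddf, Path("output.csv"))      # writes CSV
    write_data_by_file_extension(ddf, Path("output.parquet"))  # writes Parquet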
# Assumes module-level imports `from pandas import concat` and
# `from dask import dataframe`, matching the names used below.
def remove_outliers(self, data: dataframe.DataFrame, threshold: float):
    """
    Drop rows whose z-score in any continuous column exceeds the threshold.
    """
    # Materialize the dask dataframe as a pandas dataframe.
    data = data.compute(num_workers=self.workers)
    stats: dict = {
        "mean": data[self.cols["CONTINUOUS"]].mean(axis=0),
        "std_dev": data[self.cols["CONTINUOUS"]].std(axis=0)
    }
    # Compute a z-score column, named "z" + <column>, for each continuous column.
    z_cols = list(map(lambda col: "z" + col, self.cols["CONTINUOUS"]))
    zdata = data[self.cols["CONTINUOUS"]].apply(
        lambda col: (col - stats["mean"][col.name]) / (stats["std_dev"][col.name]),
        axis=0)
    zdata.columns = z_cols
    data = concat([data, zdata], axis=1)
    # Keep only rows whose z-scores all fall within [-threshold, threshold].
    for z_col in z_cols:
        data = data[data[z_col].between(-1 * threshold, threshold)]
    # Drop the helper z-score columns and convert back to a dask dataframe.
    return dataframe.from_pandas(
        data.drop(columns=z_cols).reset_index(drop=True),
        npartitions=self.workers)
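def _example_remove_outliers_usage():
    # Usage sketch (not part of the original module): remove_outliers takes
    # `self`, so this builds a minimal stand-in object carrying the `workers`
    # and `cols` attributes the method expects. The stand-in and its attribute
    # values are assumptions for illustration, and the call below assumes the
    # method is reachable at module level as shown above.
    from types import SimpleNamespace

    import pandas as pd
    from dask import dataframe

    owner = SimpleNamespace(workers=2, cols={"CONTINUOUS": ["height", "weight"]})
    pdf = pd.DataFrame({
        "height": [170, 172, 168, 171, 169, 500],  # 500 is an obvious outlier
        "weight": [70, 72, 69, 71, 70, 71],
    })
    ddf = dataframe.from_pandas(pdf, npartitions=2)
    # With threshold=2.0, the height value 500 has |z| > 2 and its row is dropped.
    cleaned = remove_outliers(owner, ddf, threshold=2.0)
    print(cleaned.compute())  # 5 rows remain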
def null_data_check(data: dd.DataFrame = None) -> bool:
    """
    Check whether the dataframe contains any null values.

    :param data: dask dataframe to check
    :return: True if any value is null, False otherwise
    """
    # Materialize the dask dataframe and scan all values for nulls.
    data = data.compute()
    return data.isnull().values.any()
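def _example_null_check_usage():
    # Usage sketch (not part of the original module): one dataframe with a
    # missing value and one without, showing both return values.
    import numpy as np
    import pandas as pd

    with_null = dd.from_pandas(
        pd.DataFrame({"a": [1.0, np.nan, 3.0]}), npartitions=1)
    without_null = dd.from_pandas(
        pd.DataFrame({"a": [1.0, 2.0, 3.0]}), npartitions=1)
    print(null_data_check(with_null))     # True
    print(null_data_check(without_null))  # False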