def table_summary(context: MLClientCtx,
                  dask_client: Union[DataItem, str],
                  dask_key: str = 'my_dask_dataframe',
                  target_path: str = '',
                  name: str = 'table_summary.csv',
                  key: str = 'table_summary') -> None:
    """Summarize a table

    :param context:     the function context
    :param dask_client: path to the dask client scheduler json file, as string or artifact
    :param dask_key:    key of dataframe in dask client 'datasets' attribute
    :param target_path: destination folder for table summary file
    :param name:        name of table summary file (with extension like .csv)
    :param key:         key of table summary in artifact store
    """
    print(context.__dict__)
    dask_client = Client(scheduler_file=str(dask_client))

    # look up the published dataframe by its key, not the literal string 'dask_key'
    df = dask_client.get_dataset(dask_key)
    print(df.head())

    dscr = df.describe()

    filepath = os.path.join(target_path, name)
    dd.to_csv(dscr, filepath, single_file=True, index=False)
    context.log_artifact(key, target_path=filepath)
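# A hedged producer-side sketch for table_summary() above, assuming a Dask
# scheduler started with --scheduler-file and a hypothetical input CSV:
# it publishes a dataframe under the key that get_dataset() later reads.
import dask.dataframe as dd
from dask.distributed import Client

client = Client(scheduler_file="scheduler.json")   # hypothetical path
ddf = dd.read_csv("data.csv")                       # hypothetical input file
client.publish_dataset(my_dask_dataframe=ddf)       # name matches the dask_key default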
def create_diffdiff():
    print("\ngenerating spreads vs spreads...")
    diff_diff = triu(dateless_df, True)

    # merge abbreviations together, e.g. "XXXX (PBF) - XXXX (SSF)" to "XXXX - XXXX (PBF-SSF)"
    diff_diff.columns = [
        f'{h.split(" (")[0]}{h.split(" (")[1].split(")")[1]} ({h.split(" (")[1].split(")")[0]}-{h.split(" (")[2]}'
        for h in diff_diff.columns
    ]
    # verbose version of the list comprehension above
    # new_headers = []
    # for header in diff_diff.columns:
    #     split_header = header.split(" (")
    #     product_A = split_header[0]
    #     split_section = split_header[1].split(")")
    #     product_B = split_section[1]
    #     new_headers.append(f"{product_A}{product_B} ({split_section[0]}-{split_header[2]}")
    # diff_diff.columns = new_headers

    diff_diff = triu(diff_diff, False)
    diff_diff = diff_diff.repartition(npartitions=200)
    diff_diff = diff_diff.reset_index(drop=True)

    dd_date_col = dd.from_array(date_col)
    dd_date_col = dd_date_col.repartition(npartitions=200)
    dd_date_col = dd_date_col.reset_index(drop=True)

    diff_diff = diff_diff.assign(date=dd_date_col)
    diff_diff = dd.melt(diff_diff,
                        id_vars="date",
                        var_name="product_diff",
                        value_name="price_diff").dropna().reset_index(drop=True)
    diff_diff["product_diff"] = diff_diff["product_diff"].astype("category")
    diff_diff["differential_A"] = diff_diff["product_diff"].str.partition(" - ")[0]
    diff_diff["differential_B"] = diff_diff["product_diff"].str.partition(" - ")[2]

    print(f"\nsaving file... ({round((time.time() - starttime), 2)}s)")
    dd.to_csv(df=diff_diff,
              filename=os.path.join(os.getcwd(), "cleaned_data", "diff_diff.csv"),
              index=False,
              single_file=True,
              encoding="utf-8-sig",
              chunksize=10000)
    print(f"[diff_diff.csv] saved successfully... ({round((time.time() - starttime), 2)}s)")
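# A minimal sketch of the wide-to-long reshape used in create_diffdiff(),
# on a tiny hypothetical frame: dd.melt() keeps "date" as the identifier
# and stacks the remaining spread columns into product/price pairs.
import pandas as pd
import dask.dataframe as dd

wide = dd.from_pandas(
    pd.DataFrame({"date": ["2021-01-01", "2021-01-02"],
                  "AAA - BBB (PBF-SSF)": [1.0, 2.0],
                  "AAA - CCC (PBF-CFR)": [3.0, None]}),
    npartitions=1)
long = dd.melt(wide, id_vars="date",
               var_name="product_diff", value_name="price_diff").dropna()
print(long.compute())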
def resample_at_path(s3_in_url, s3_out_url, s3_options, group, index_col, out_file_prefix='out'):
    aggr_func: Callable
    filter_by_key: str = resample_map['filter_by']['key']
    filter_by_val: int = resample_map['filter_by']['value']
    resample_freq: str = resample_map['freq']

    df = dd.read_parquet(path=s3_in_url,
                         storage_options=s3_options,
                         engine='fastparquet')

    # filter
    if filter_by_key == 'weekday':
        df = df.loc[df[index_col].dt.weekday == filter_by_val]

    if group['compute']:
        grouper_cols = group['by_cols']
        aggr_func = group['aggr_func']
        meta_cols = group['meta']
        cols = list(meta_cols.keys())
        print('meta_cols %s' % meta_cols)

        # resample using the frequency and aggregate function specified
        df = df.groupby([pd.Grouper(key=index_col, freq=resample_freq)] + grouper_cols)[cols]. \
            apply(aggr_func, meta=meta_cols)
        # df = df.resample(resample_freq).sum()
        # print('after resampling')

    print('after grouping and resampling %s' % str(df.shape))

    # save in out bucket
    dd.to_csv(df=df,
              filename=s3_out_url,
              name_function=lambda i: out_file_prefix + '_' + str(i),
              storage_options=s3_options)

    # s3_out_url: str = 's3://' + out_bucket + '/' + year + '/'
    # dd.to_parquet(df=df,
    #               path=s3_out_url,
    #               engine='fastparquet',
    #               compute=True,
    #               compression='lz4',
    #               storage_options=s3_options)
    return
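# A hedged sketch of the resample_map configuration resample_at_path()
# assumes (keys inferred from the lookups above; values illustrative only).
resample_map = {
    "freq": "1M",                                  # pandas offset alias
    "filter_by": {"key": "weekday", "value": 5},   # e.g. keep Saturdays only
}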
def persist(self, output):
    assert output is not None
    if type(output) == df.core.DataFrame:
        if self.ext == "csv":
            df.to_csv(output, self.loc, index=False, encoding="utf-8")
        elif self.ext == "json":
            df.to_json(output, self.loc, encoding="utf-8")
        else:
            raise Exception(self.ext + " not supported")
    else:
        if type(output) != db.core.Bag:
            logging.getLogger("system").warning("WARNING: converting to bag")
            assert isinstance(output, collections.abc.Iterable)
            output = db.from_sequence(output, npartitions=self.npartitions)
        output.map(json.dumps).to_textfiles(self.loc)
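# A minimal sketch of the bag fallback path in persist() above, with a
# hypothetical output location: each partition is JSON-serialised and
# written to its own text file.
import json
import os
import dask.bag as db

os.makedirs("out", exist_ok=True)
records = [{"id": 1}, {"id": 2}, {"id": 3}]
bag = db.from_sequence(records, npartitions=2)
bag.map(json.dumps).to_textfiles("out/records-*.json")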
def transform(self, input_scores, calibrated_scores):
    """
    Calibrates a score

    Parameters
    ----------

    input_scores: list
        Input score files to be calibrated

    calibrated_scores: list
        Output score files
    """
    assert isinstance(input_scores, (list, tuple))
    assert isinstance(calibrated_scores, (list, tuple))
    assert len(calibrated_scores) == len(input_scores)

    for file_name, output_file_name in zip(input_scores, calibrated_scores):
        # Fetching scores
        dataframe = dask.dataframe.read_csv(file_name)
        dataframe = dataframe.compute()
        X = dataframe["score"].to_numpy()

        calibrated_scores = np.vstack(
            [fitter.predict_proba(X) for fitter in self._categorical_fitters]).T
        calibrated_scores = self.reduction_function(calibrated_scores, axis=1)

        dataframe["score"] = calibrated_scores
        dataframe.to_csv(output_file_name, index=False)

    return calibrated_scores
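# A minimal numpy sketch of the score-fusion step above, with hypothetical
# per-fitter probabilities: columns are stacked per fitter and reduced
# along axis=1 (a mean here, standing in for reduction_function).
import numpy as np

per_fitter = [np.array([0.2, 0.8, 0.5]), np.array([0.4, 0.6, 0.7])]
stacked = np.vstack(per_fitter).T          # shape (n_scores, n_fitters)
fused = np.mean(stacked, axis=1)
print(fused)                               # [0.3 0.7 0.6]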
def _write(self, collection, path, **kwargs):
    """
    This method implements CSV writing.

    If the parent directory is missing, the function creates it, as well as
    all of its ancestor directories, to align with the behavior of
    ParquetTarget.

    Args:
        collection: dask dataframe to be written to disk
        path: str, full path of the target
        **kwargs: dictionary, named arguments to be passed to to_csv

    Returns:
        list of the file names written, or a list of delayed tasks
    """
    if not self._exists(path):
        self.fs.mkdirs(path)
    new_path = super()._join(path, self.glob)
    return to_csv(collection, new_path, **kwargs)
def _write(self, collection, path, **kwargs):
    if not self._exists(path):
        self.fs.mkdirs(path)
    full_path = super()._join(path, self.glob)
    return to_csv(collection, full_path, **kwargs)
def _write(cls, collection, path, **kwargs):
    return to_csv(collection, path, **kwargs)
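# A minimal sketch of the glob convention the _write() targets above rely
# on (hypothetical local path): dask.dataframe.to_csv() expands "*" to one
# file per partition and returns the list of file names it wrote.
import os
import pandas as pd
import dask.dataframe as dd

os.makedirs("out", exist_ok=True)
ddf = dd.from_pandas(pd.DataFrame({"x": range(10)}), npartitions=2)
written = dd.to_csv(ddf, "out/part-*.csv", index=False)
print(written)   # e.g. ['out/part-0.csv', 'out/part-1.csv']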
def perform_tsfare_dask(task_type: str, years: List[str]) -> bool:
    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']
    status: bool = False

    try:
        client: Client = dask.create_dask_client(num_workers=8)
        s3_options: Dict = ps.fetch_s3_options()

        month_st: int = 1
        month_end: int = 13
        calendar: cal.Calendar = cal.Calendar()

        for year in years:
            usecols = ['date', 'STATION', 'FF', 'SEN/DIS', '7-D AFAS UNL',
                       '30-D AFAS/RMF UNL', 'JOINT RR TKT', '7-D UNL', '30-D UNL',
                       '14-D RFM UNL', '1-D UNL', '14-D UNL', '7D-XBUS PASS', 'TCMC',
                       'RF 2 TRIP', 'RR UNL NO TRADE', 'TCMC ANNUAL MC', 'MR EZPAY EXP',
                       'MR EZPAY UNL', 'PATH 2-T', 'AIRTRAIN FF', 'AIRTRAIN 30-D',
                       'AIRTRAIN 10-T', 'AIRTRAIN MTHLY', 'STUDENTS']

            url_part1: str = 's3://' + in_bucket + '/fares_'
            url_part2: str = ".csv"

            # urls for all Saturdays in the month range for the year
            urls: List[str] = [url_part1 + year[2:] + prefix_zero(month)
                               + prefix_zero(day_tuple[0]) + url_part2
                               for month in range(month_st, month_end)
                               for day_tuple in calendar.itermonthdays2(int(year), month)
                               if day_tuple[0] in range(1, 32) and day_tuple[1] == 5]

            # for url in urls:
            #     print(url)

            df = dd.read_csv(urlpath=urls,
                             storage_options=s3_options,
                             header=0,
                             usecols=usecols,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             parse_dates=['date'],
                             converters={
                                 'STATION': str.strip,
                                 'FF': row_ops.clean_num,
                                 'SEN/DIS': row_ops.clean_num,
                                 '7-D AFAS UNL': row_ops.clean_num,
                                 '30-D AFAS/RMF UNL': row_ops.clean_num,
                                 'JOINT RR TKT': row_ops.clean_num,
                                 '7-D UNL': row_ops.clean_num,
                                 '30-D UNL': row_ops.clean_num,
                                 '14-D RFM UNL': row_ops.clean_num,
                                 '1-D UNL': row_ops.clean_num,
                                 '14-D UNL': row_ops.clean_num,
                                 '7D-XBUS PASS': row_ops.clean_num,
                                 'TCMC': row_ops.clean_num,
                                 'RF 2 TRIP': row_ops.clean_num,
                                 'RR UNL NO TRADE': row_ops.clean_num,
                                 'TCMC ANNUAL MC': row_ops.clean_num,
                                 'MR EZPAY EXP': row_ops.clean_num,
                                 'MR EZPAY UNL': row_ops.clean_num,
                                 'PATH 2-T': row_ops.clean_num,
                                 'AIRTRAIN FF': row_ops.clean_num,
                                 'AIRTRAIN 30-D': row_ops.clean_num,
                                 'AIRTRAIN 10-T': row_ops.clean_num,
                                 'AIRTRAIN MTHLY': row_ops.clean_num,
                                 'STUDENTS': row_ops.clean_num
                             },
                             encoding='utf-8')

            # to_parquet(df=df, out_bucket=out_bucket, folder=year + '/', compute=True)
            dd.to_csv(df=df,
                      filename='s3://' + out_bucket + '/' + year + '/',
                      # name_function=lambda i: out_file_prefix + '_' + str(i),
                      storage_options=s3_options)

    except Exception as err:
        raise err

    else:
        return status
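# A hedged sketch of the prefix_zero() helper perform_tsfare_dask() assumes:
# zero-padding month/day numbers so the generated keys match fares_YYMMDD.csv.
def prefix_zero(n: int) -> str:
    return f"{n:02d}"

print(prefix_zero(3), prefix_zero(12))   # 03 12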
def _write(cls, collection, path, storage_type, **kwargs):
    if storage_type == "parquet":
        return to_parquet(collection, path, engine="fastparquet", **kwargs)
    elif storage_type == "csv":
        path = "{}/export-*.{}".format(path, storage_type)
        return to_csv(collection, path, **kwargs)
def perform_dask(task_type: str, years: List[str]) -> bool:
    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']
    group: Dict = task_type_map['group']
    index_col: str = task_type_map['index']['col']
    aggr_func: Callable
    filter_by_key: str = resample_map['filter_by']['key']
    filter_by_val: int = resample_map['filter_by']['value']
    resample_freq: str = resample_map['freq']

    s3_options: Dict = ps.fetch_s3_options()
    client: Client = dask.create_dask_client(num_workers=8)

    try:
        for year in years:
            s3_in_url: str = 's3://' + in_bucket + '/' + year + '/'
            s3_out_url: str = 's3://' + out_bucket + '/' + year + '/' \
                              + resample_freq + '/' + filter_by_key + str(filter_by_val) + '/'
            path: str = ''
            print('s3 url %s' % s3_in_url)

            if task_type in ['rs-gcabs', 'rs-ycabs']:
                if int(year) >= 2016:
                    path = '/special/'
                elif int(year) < 2016:
                    path = '/normal/'
                # resample_at_path(s3_in_url + path,
                #                  s3_out_url,
                #                  s3_options,
                #                  group,
                #                  index_col)

            df = dd.read_parquet(path=s3_in_url + path,
                                 storage_options=s3_options,
                                 engine='fastparquet')

            if task_type in ['rs-gcabs', 'rs-ycabs'] and int(year) == 2016:
                # resample_at_path(s3_in_url + '/normal/',
                #                  s3_out_url,
                #                  s3_options,
                #                  group,
                #                  index_col,
                #                  'out2')
                df_2 = dd.read_parquet(path=s3_in_url + '/normal/',
                                       storage_options=s3_options,
                                       engine='fastparquet')
                df = dd.concat([df, df_2], axis=0)

            partitions = df.npartitions
            if partitions < 5:
                print('repartitioning to 5')
                df = df.repartition(npartitions=5)
            client.persist(df)

            # filter
            if filter_by_key == 'weekday':
                df = df.loc[df[index_col].dt.weekday == filter_by_val]

            if group['compute']:
                grouper_cols = group['by_cols']
                aggr_func = group['aggr_func']
                meta_cols = group['meta']
                cols = [col for col in meta_cols.keys()
                        if col not in grouper_cols + [index_col]]
                meta_types = [meta_cols[key] for key in meta_cols.keys()
                              if key not in grouper_cols + [index_col]]
                print('meta_cols %s' % meta_cols)

                index = [index_col] + grouper_cols
                index_levels: List[List] = [[] for level in index]
                meta: pd.DataFrame = pd.DataFrame(columns=cols,
                                                  index=pd.MultiIndex(index_levels,
                                                                      index_levels,
                                                                      names=index))

                # resample using the frequency and aggregate function specified
                df = df.groupby([pd.Grouper(key=index_col, freq=resample_freq)] + grouper_cols)[cols]. \
                    apply(aggr_func, meta=meta).reset_index()
                # df = df.resample(resample_freq).sum()
                # print('after resampling')

            print('after grouping and resampling %s' % str(df.shape))

            # save in out bucket
            dd.to_csv(df=df,
                      filename=s3_out_url,
                      # name_function=lambda i: out_file_prefix + '_' + str(i),
                      storage_options=s3_options)
            # dd.to_parquet(df=df,
            #               path=s3_out_url,
            #               engine='fastparquet',
            #               compute=True,
            #               write_index=True,
            #               compression='lz4',
            #               storage_options=s3_options)

    except Exception as err:
        print('error in perform_dask %s' % str(err))
        client.close()
        raise err

    client.close()
    return True
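# A minimal pandas-only sketch of the resample-by-group pattern used in
# perform_dask() above, with hypothetical columns: pd.Grouper buckets the
# timestamp column by frequency while also grouping on a key column.
import pandas as pd

pdf = pd.DataFrame({
    "pickup": pd.date_range("2016-01-01", periods=6, freq="12H"),
    "station": ["A", "B", "A", "B", "A", "B"],
    "riders": [1, 2, 3, 4, 5, 6],
})
out = (pdf.groupby([pd.Grouper(key="pickup", freq="1D"), "station"])["riders"]
       .sum()
       .reset_index())
print(out)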
# df.to_dask_array(lengths=True) is used to compute chunk sizes; otherwise
# df.values has shape (nan, 6)
print(df_in.to_dask_array(lengths=True))
# shape_in = (number of lines in the input file, number of columns = 6)
shape_in = df_in.to_dask_array(lengths=True).shape

############################ Processing ###############################
# Distribute each line to the launch_rk4_API function on the Dask client
futures = client.map(launch_rk4_API, df_in.to_dask_array(lengths=True))
# Block until the results are ready; `results` is a list of dask arrays
results = client.gather(futures)

########################### Write Output File ###########################
# Create a dask array from the first three values of each result
data = [results[i][0:3] for i in range(len(results))]
da_output = da.concatenate(data, axis=0)
da_output = da_output.reshape((shape_in[0], 3))

# Convert to a dataframe
df_out = dd.from_dask_array(da_output)

# Write into output_file
dd.to_csv(df_out, args.output_file, single_file=True, sep=" ", index=False, header=False)
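# A minimal sketch of the map/gather round-trip used above, with a
# hypothetical worker function and a local in-process cluster: one future
# per input element, gathered back in submission order.
from dask.distributed import Client

def square(x):
    return x * x

client = Client(processes=False)       # small local cluster for the sketch
futures = client.map(square, [1, 2, 3])
print(client.gather(futures))          # [1, 4, 9]
client.close()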
datasets["test"] = pre_proc.remove_outliers( data=datasets["test"], columns=config["COLUMNS"], threshold=config["HYPER_PARAMS"]["Z_THRESHOLD"]) print("Outliers removed!") # Encode data print("\nEncoding data...") datasets["train"], train_encoding = pre_proc.encode_data( data=datasets["train"]) datasets["test"], test_encoding = pre_proc.encode_data( data=datasets["test"], encoding=train_encoding, is_train=False) print("Data encoded!") # Save processed data dataframe.to_csv(datasets["train"].compute(num_workers=cpu_count()), filename=config["PATH"]["PROCESSED"]["TRAIN"]) dataframe.to_csv(datasets["test"].compute(num_workers=cpu_count()), filename=config["PATH"]["PROCESSED"]["TEST"]) # Split train data datasets["train"], datasets["validation"] = dataframe.DataFrame\ .random_split(datasets["train"], frac=[config["HYPER_PARAMS"]["TRAIN_SPLIT"], 1 - config["HYPER_PARAMS"]["TRAIN_SPLIT"]], random_state=config["HYPER_PARAMS"]["SEED"], shuffle=True) # Dump data to memory print("\nDumping data to memory...") datasets["train"] = datasets["train"].compute(num_workers=cpu_count()) datasets["validation"] = datasets["validation"].compute(