    dropnacols=["dhdt_slope"],
)

# %%
# Read in Antarctic Drainage Basin Boundaries shapefile into a GeoDataFrame
ice_boundaries: gpd.GeoDataFrame = (
    deepicedrain.catalog.measures_antarctic_boundaries.read())
drainage_basins: gpd.GeoDataFrame = ice_boundaries.query(expr="TYPE == 'GR'")

# %% [markdown]
# ## Load in ICESat-2 data (x, y, dhdt) and do initial trimming

# %%
# Read in raw x, y, dhdt_slope and referencegroundtrack data into the GPU
cudf_raw: cudf.DataFrame = cudf.read_parquet(
    filepath_or_buffer="ATLXI/df_dhdt_antarctica.parquet",
    columns=["x", "y", "dhdt_slope", "referencegroundtrack"],
)
# Filter to points whose dhdt_slope is less than -0.2 m/yr or more than +0.2 m/yr
cudf_many = cudf_raw.loc[abs(cudf_raw.dhdt_slope) > 0.2]
print(f"Trimmed {len(cudf_raw)} -> {len(cudf_many)}")

# %%
# Clip outlier values to 3 sigma (standard deviations) from the mean
_mean = cudf_many.dhdt_slope.mean()
_std = cudf_many.dhdt_slope.std()
cudf_many.dhdt_slope.clip(
    lower=np.float32(_mean - 3 * _std),
    upper=np.float32(_mean + 3 * _std),
    inplace=True,
)

# %% [markdown]
# ## Label ICESat-2 points according to their drainage basin
def test_parquet_reader_local_filepath():
    fname = "~/TestLocalFile.parquet"
    # NOTE: os.path.isfile does not expand "~", so this check only passes when
    # run from a directory containing a literal "~" folder; otherwise the test
    # is skipped.
    if not os.path.isfile(fname):
        pytest.skip("Local .parquet file is not found")

    cudf.read_parquet(fname)
def read_dists(dist_files: Generator, pcs: cudf.DataFrame, ndvi) -> cudf.DataFrame:
    # One DataFrame per POI distance file, indexed by postcode, with the
    # distance column renamed to the POI name parsed from the file name
    dfs = [
        cudf.read_csv(file).drop_duplicates("postcode").set_index("postcode")
        .rename(columns={"distance": re.split(r"_|\.", file.name)[1]})
        for file in dist_files
    ]
    dfs = cudf.concat(dfs, axis=1).reset_index().pipe(fix_postcodes)
    return (dfs.set_index("postcode").join(ndvi).join(pcs)
            .reset_index().groupby("lsoa11").median())


if __name__ == "__main__":
    dist_files = list(Path(Config.OUT_DATA).glob("distances_*.csv"))
    pcs = cudf.read_parquet(Config.PROCESSED_DATA /
                            "postcodes.parquet").set_index("postcode")
    gspassive = (cudf.read_csv(
        Config.RAW_DATA / "ndvi" / "sentinel_postcode_ndvi_20210419.csv"
    ).rename(columns={
        "PCDS": "postcode",
        "NDVI_MEDIAN": "gspassive",
    })[["postcode", "gspassive"]].set_index("postcode"))

    dists = read_dists(dist_files, pcs, gspassive)
    dists.to_csv(Config.OUT_DATA / "median_dists.csv")
def test_hugectr(
    tmpdir, client, df, dataset, output_format, engine, op_columns, num_io_threads, use_client
):
    client = client if use_client else None

    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    ext = ""
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)

    # process data
    processor = nvt.Workflow(
        client=client, cat_names=cat_names, cont_names=cont_names, label_name=label_names
    )
    processor.add_feature(
        [
            ops.FillMissing(columns=op_columns),
            ops.Clip(min_value=0, columns=op_columns),
            ops.LogOp(),
        ]
    )
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    # apply the workflow and write out the dataset
    processor.apply(
        dataset,
        output_path=outdir,
        out_files_per_proc=nfiles,
        output_format=output_format,
        shuffle=None,
        num_io_threads=num_io_threads,
    )

    # Check for _file_list.txt
    assert os.path.isfile(outdir + "/_file_list.txt")

    # Check for _metadata.json
    assert os.path.isfile(outdir + "/_metadata.json")

    # Check contents of _metadata.json
    data = {}
    col_summary = {}
    with open(outdir + "/_metadata.json", "r") as fil:
        for k, v in json.load(fil).items():
            data[k] = v
    assert "cats" in data
    assert "conts" in data
    assert "labels" in data
    assert "file_stats" in data
    # Parenthesize the expected value so the comparison (not the assert) is conditional
    assert len(data["file_stats"]) == (
        nfiles if not client else nfiles * len(client.cluster.workers)
    )
    for cdata in data["cats"] + data["conts"] + data["labels"]:
        col_summary[cdata["index"]] = cdata["col_name"]

    # Check that data files exist
    ext = ""
    if output_format == "parquet":
        ext = "parquet"
    elif output_format == "hugectr":
        ext = "data"

    data_files = [
        os.path.join(outdir, filename) for filename in os.listdir(outdir) if filename.endswith(ext)
    ]

    # Make sure the columns in "_metadata.json" make sense
    if output_format == "parquet":
        # data_files entries are already full paths
        df_check = cudf.read_parquet(data_files[0])
        for i, name in enumerate(df_check.columns):
            if i in col_summary:
                assert col_summary[i] == name
def read_dists(dist_files, uprn):
    dfs = (cudf.concat(
        [
            cudf.read_csv(file).set_index("postcode").rename(
                columns={"distance": re.split(r"_|\.", file.name)[1]})
            for file in dist_files
        ],
        axis=1,
    ).reset_index().pipe(fix_postcodes))
    dfs = dfs.merge(uprn)
    # Weight each POI distance (and passive greenspace) by the postcode's UPRN
    # count, sum within LSOAs, then divide back out to get the weighted mean
    for poi in Config.POI_LIST + ["gspassive"]:
        dfs[poi] = dfs[poi] * dfs["uprn_count"]
    dfs = dfs.groupby("lsoa11").sum()
    for poi in Config.POI_LIST + ["gspassive"]:
        dfs[poi] = dfs[poi] / dfs["uprn_count"]
    return dfs.drop("uprn_count", axis=1)


if __name__ == "__main__":
    dist_files = list(Path(Config.OUT_DATA).glob("distances_*.csv"))
    uprn = (cudf.read_parquet(
        Config.PROCESSED_DATA / "uprn_pcs.parquet").rename(columns={
            "lsoa11cd": "lsoa11"
        }).drop(["oa11cd"], axis=1).pipe(fix_postcodes))
    uprn = uprn[~uprn["lsoa11"].str.contains(r"^\d.*")]

    dists = read_dists(dist_files, uprn)
    dists.to_csv(Config.OUT_DATA / "weighted_mean_dists.csv")
def parquet_reader_test(parquet_buffer):
    pdf = pd.read_parquet(parquet_buffer)
    gdf = cudf.read_parquet(parquet_buffer)

    assert_eq(gdf, pdf)
def test_gpu_dl(tmpdir, df, dataset, batch_size, part_mem_fraction, engine, devices):
    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    processor.add_feature([ops.FillMedian()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())

    output_train = os.path.join(tmpdir, "train/")
    os.mkdir(output_train)

    processor.apply(
        dataset,
        apply_offline=True,
        record_stats=True,
        shuffle=nvt.io.Shuffle.PER_PARTITION,
        output_path=output_train,
        out_files_per_proc=2,
    )

    tar_paths = [
        os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")
    ]

    nvt_data = nvt.Dataset(tar_paths[0], engine="parquet", part_mem_fraction=part_mem_fraction)
    data_itr = torch_dataloader.TorchAsyncItr(
        nvt_data,
        batch_size=batch_size,
        cats=cat_names,
        conts=cont_names,
        labels=["label"],
        devices=devices,
    )

    columns = mycols_pq
    df_test = cudf.read_parquet(tar_paths[0])[columns]
    df_test.columns = [x for x in range(0, len(columns))]
    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(tar_paths[0])
    rows = 0
    # works with iterator alone, needs to test inside torch dataloader
    for idx, chunk in enumerate(data_itr):
        if devices is None:
            assert float(df_test.iloc[rows][0]) == float(chunk[0][0][0])
        rows += len(chunk[0])
        del chunk

    # accounts for incomplete batches at the end of chunks
    # that don't necessarily have the full batch_size
    assert rows == num_rows

    def gen_col(batch):
        batch = batch[0]
        return batch[0], batch[1], batch[2]

    t_dl = torch_dataloader.DLDataLoader(
        data_itr, collate_fn=gen_col, pin_memory=False, num_workers=0
    )
    rows = 0
    for idx, chunk in enumerate(t_dl):
        if devices is None:
            assert float(df_test.iloc[rows][0]) == float(chunk[0][0][0])
        rows += len(chunk[0])

    if os.path.exists(output_train):
        shutil.rmtree(output_train)
        # Append to distances from a previous run if a log file already exists,
        # otherwise start fresh (assumes self.log_file is a pathlib.Path)
        if self.log_file.exists():
            self.distances = cudf.read_csv(self.log_file).append(pc_dist)
        else:
            self.distances = pc_dist[["vertex", "distance", "idx"]]
        # Keep only the shortest distance recorded per vertex
        self.distances = (self.distances.sort_values(
            "distance").drop_duplicates("vertex").reset_index()[[
                "vertex", "distance", "idx"
            ]])
        self.distances.to_csv(self.log_file, index=False)


if __name__ == "__main__":
    print("Starting Routing!")
    print("Reading graph and postcodes.")
    edges = cudf.read_parquet(Config.OS_GRAPH / "edges.parquet")
    nodes = cudf.read_parquet(Config.OS_GRAPH / "nodes.parquet")
    postcodes = cudf.read_parquet(Config.PROCESSED_DATA / "postcodes.parquet")
    print("Finished reading nodes, edges and postcodes.")

    print(f"Starting Routing for {Config.POI_LIST}.")
    for poi in Config.POI_LIST:
        df = pd.read_parquet(Config.PROCESSED_DATA / f"{poi}.parquet")
        OUT_FILE = Config.OUT_DATA / f"distances_{poi}.csv"
        if not OUT_FILE.exists():
            routing = Routing(
                name=poi,
                edges=edges,
                nodes=nodes,
                postcodes=postcodes,
def load_data(self, filename='dataset.orc', col_labels=None, y_label='ArrDelayBinary'):
    """
    Load the data into the object from the filename, keeping only the columns
    that we are interested in. Also generates y_label from the 'ArrDelay'
    column to convert this into a binary classification problem.

    Parameters
    ----------
    filename : string
        the path of the dataset to be loaded
    col_labels : list of strings
        The input columns that we are interested in. None selects all the columns
    y_label : string
        The column to perform the prediction task in.

    Returns
    ----------
    dataset : dataframe (Pandas, cudf or dask-cudf)
        Ingested dataset in the format of a dataframe
    col_labels : list of strings
        The input columns selected
    y_label : string
        The generated y_label name for binary classification
    duration : float
        The time it took to execute the function
    """
    target_filename = filename
    self.log_to_file(f'\n> Loading dataset from {target_filename}')

    with PerfTimer() as ingestion_timer:
        if 'CPU' in self.compute_type:
            # CPU reading options
            self.log_to_file(f'\n\tCPU read')
            if self.data_type == 'ORC':
                with open(target_filename, mode='rb') as file:
                    dataset = pyarrow_orc.ORCFile(file).read().to_pandas()
            elif self.data_type == 'CSV':
                dataset = pd.read_csv(target_filename, names=col_labels)
            elif self.data_type == 'Parquet':
                if 'single' in self.compute_type:
                    dataset = pd.read_parquet(target_filename)
                elif 'multi' in self.compute_type:
                    self.log_to_file(f'\n\tReading using dask dataframe')
                    # NOTE: originally passed an undefined `columns` variable;
                    # `col_labels` is the intended column selection
                    dataset = dask.dataframe.read_parquet(target_filename,
                                                          columns=col_labels)
        elif 'GPU' in self.compute_type:
            # GPU reading options
            self.log_to_file(f'\n\tGPU read')
            if self.data_type == 'ORC':
                dataset = cudf.read_orc(target_filename)
            elif self.data_type == 'CSV':
                dataset = cudf.read_csv(target_filename, names=col_labels)
            elif self.data_type == 'Parquet':
                if 'single' in self.compute_type:
                    dataset = cudf.read_parquet(target_filename)
                elif 'multi' in self.compute_type:
                    self.log_to_file(f'\n\tReading using dask_cudf')
                    dataset = dask_cudf.read_parquet(target_filename,
                                                     columns=col_labels)

    # cast all columns to float32
    for col in dataset.columns:
        dataset[col] = dataset[col].astype(np.float32)  # needed for random forest

    # Adding y_label column if it is not present
    if y_label not in dataset.columns:
        dataset[y_label] = 1.0 * (dataset['ArrDelay'] > 10)

    dataset[y_label] = dataset[y_label].astype(np.int32)  # needed for cuml RF

    dataset = dataset.fillna(0.0)  # fill the null values; needed for dask-cudf

    self.log_to_file(f'\n\tIngestion completed in {ingestion_timer.duration}')
    self.log_to_file(f'\n\tDataset descriptors: {dataset.shape}\n\t{dataset.dtypes}')
    return dataset, col_labels, y_label, ingestion_timer.duration
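# A minimal sketch of the y_label binarization used in load_data above, run on
# a toy frame so the pattern can be checked without the flight dataset. The
# values here are hypothetical; it assumes only that cudf and numpy are
# installed.
import cudf
import numpy as np

toy = cudf.DataFrame({"ArrDelay": [-3.0, 5.0, 42.0]})
# 1.0 * (bool Series) yields floats {0.0, 1.0}, then cast down for the RF
toy["ArrDelayBinary"] = (1.0 * (toy["ArrDelay"] > 10)).astype(np.int32)
print(toy["ArrDelayBinary"].to_pandas().tolist())  # [0, 0, 1]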
def read_partition(fs, piece, columns, index, categories=(), partitions=(), **kwargs):
    if columns is not None:
        columns = [c for c in columns]
    if isinstance(index, list):
        columns += index

    if isinstance(piece, str):
        # `piece` is a file-path string
        piece = pq.ParquetDatasetPiece(piece, open_file_func=partial(fs.open, mode="rb"))
    else:
        # `piece` = (path, row_group, partition_keys)
        (path, row_group, partition_keys) = piece
        piece = pq.ParquetDatasetPiece(
            path,
            row_group=row_group,
            partition_keys=partition_keys,
            open_file_func=partial(fs.open, mode="rb"),
        )

    strings_to_cats = kwargs.get("strings_to_categorical", False)
    if cudf.utils.ioutils._is_local_filesystem(fs):
        df = cudf.read_parquet(
            piece.path,
            engine="cudf",
            columns=columns,
            row_group=piece.row_group,
            strings_to_categorical=strings_to_cats,
            **kwargs.get("read", {}),
        )
    else:
        with fs.open(piece.path, mode="rb") as f:
            df = cudf.read_parquet(
                f,
                engine="cudf",
                columns=columns,
                row_group=piece.row_group,
                strings_to_categorical=strings_to_cats,
                **kwargs.get("read", {}),
            )

    if index and index[0] in df.columns:
        df = df.set_index(index[0])

    if len(piece.partition_keys) > 0:
        if partitions is None:
            raise ValueError("Must pass partition sets")
        for i, (name, index2) in enumerate(piece.partition_keys):
            categories = [val.as_py() for val in partitions.levels[i].dictionary]
            sr = cudf.Series(index2).astype(type(index2)).repeat(len(df))
            df[name] = build_categorical_column(
                categories=categories,
                codes=as_column(sr._column.base_data, dtype=sr._column.dtype),
                size=sr._column.size,
                offset=sr._column.offset,
                ordered=False,
            )

    return df
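# The branching above hinges on cudf.read_parquet accepting either a plain
# file path (the fast local-filesystem case) or an open binary file-like
# object (the remote-filesystem case). A minimal round-trip sketch of that
# dual interface, using a hypothetical local file name:
import cudf

df = cudf.DataFrame({"a": [1, 2, 3]})
df.to_parquet("example.parquet")

from_path = cudf.read_parquet("example.parquet")  # path string
with open("example.parquet", "rb") as f:
    from_fobj = cudf.read_parquet(f)              # file-like object
assert from_path.equals(from_fobj)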
def _read_paths(
    cls,
    paths,
    fs,
    columns=None,
    row_groups=None,
    strings_to_categorical=None,
    partitions=None,
    partitioning=None,
    partition_keys=None,
    open_file_options=None,
    **kwargs,
):
    # Simplify row_groups if all None
    if row_groups == [None for path in paths]:
        row_groups = None

    with ExitStack() as stack:
        # Non-local filesystem handling
        paths_or_fobs = paths
        if not _is_local_filesystem(fs):
            paths_or_fobs = _open_remote_files(
                paths_or_fobs,
                fs,
                context_stack=stack,
                **_default_open_file_options(open_file_options, columns, row_groups),
            )

        # Use cudf to read in data
        df = cudf.read_parquet(
            paths_or_fobs,
            engine="cudf",
            columns=columns,
            row_groups=row_groups if row_groups else None,
            strings_to_categorical=strings_to_categorical,
            **kwargs,
        )

    if partitions and partition_keys is None:
        # Use `HivePartitioning` by default
        partitioning = partitioning or {"obj": pa_ds.HivePartitioning}
        ds = pa_ds.dataset(
            paths,
            filesystem=fs,
            format="parquet",
            partitioning=partitioning["obj"].discover(
                *partitioning.get("args", []),
                **partitioning.get("kwargs", {}),
            ),
        )
        frag = next(ds.get_fragments())
        if frag:
            # Extract hive-partition keys, and make sure they
            # are ordered the same as they are in `partitions`
            raw_keys = pa_ds._get_partition_keys(frag.partition_expression)
            partition_keys = [
                (hive_part.name, raw_keys[hive_part.name]) for hive_part in partitions
            ]

    if partition_keys:
        if partitions is None:
            raise ValueError("Must pass partition sets")
        for i, (name, index2) in enumerate(partition_keys):
            # Build the column from `codes` directly
            # (since the category is often a larger dtype)
            codes = as_column(
                partitions[i].keys.index(index2),
                length=len(df),
            )
            df[name] = build_categorical_column(
                categories=partitions[i].keys,
                codes=codes,
                size=codes.size,
                offset=codes.offset,
                ordered=False,
            )

    return df
def test_dask_workflow_api_dlrm(
    client,
    tmpdir,
    datasets,
    freq_threshold,
    part_mem_fraction,
    engine,
    cat_cache,
    on_host,
    shuffle,
    cpu,
):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    paths = sorted(paths)
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)
    df0 = df0.to_pandas() if cpu else df0

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    cats = cat_names >> ops.Categorify(
        freq_threshold=freq_threshold, out_path=str(tmpdir), cat_cache=cat_cache, on_host=on_host
    )
    conts = cont_names >> ops.FillMissing() >> ops.Clip(min_value=0) >> ops.LogOp()

    workflow = Workflow(cats + conts + label_name, client=client)

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, cpu=cpu, part_mem_fraction=part_mem_fraction)
    else:
        dataset = Dataset(paths, cpu=cpu, names=allcols_csv, part_mem_fraction=part_mem_fraction)

    output_path = os.path.join(tmpdir, "processed")

    transformed = workflow.fit_transform(dataset)
    transformed.to_parquet(output_path=output_path, shuffle=shuffle, out_files_per_proc=1)

    result = transformed.to_ddf().compute()
    assert len(df0) == len(result)
    assert result["x"].min() == 0.0
    assert result["x"].isna().sum() == 0
    assert result["y"].min() == 0.0
    assert result["y"].isna().sum() == 0

    # Check categories. Need to sort first to make sure we are comparing
    # "apples to apples"
    expect = df0.sort_values(["label", "x", "y", "id"]).reset_index(drop=True).reset_index()
    got = result.sort_values(["label", "x", "y", "id"]).reset_index(drop=True).reset_index()
    dfm = expect.merge(got, on="index", how="inner")[["name-string_x", "name-string_y"]]
    dfm_gb = dfm.groupby(["name-string_x", "name-string_y"]).agg(
        {"name-string_x": "count", "name-string_y": "count"}
    )
    if freq_threshold:
        dfm_gb = dfm_gb[dfm_gb["name-string_x"] >= freq_threshold]
    assert_eq(dfm_gb["name-string_x"], dfm_gb["name-string_y"], check_names=False)

    # Read back from disk
    if cpu:
        df_disk = dd_read_parquet(output_path).compute()
    else:
        df_disk = dask_cudf.read_parquet(output_path).compute()

    # we don't have a deterministic ordering here, especially when using
    # a dask client with multiple workers - so we need to sort the values here
    columns = ["label", "x", "y", "id"] + cat_names
    got = result.sort_values(columns).reset_index(drop=True)
    expect = df_disk.sort_values(columns).reset_index(drop=True)
    assert_eq(got, expect)
def parquet_reader_test(parquet_buffer):
    pdf = pd.read_parquet(parquet_buffer)
    gdf = cudf.read_parquet(parquet_buffer)

    compare_dataframe(gdf, pdf)
def ETL(self, columns=None, label_column=None, random_seed=0):
    """
    Perform ETL on a target dataset to prepare for model training.
        1. Ingest parquet compressed dataset
        2. Rebalance/re-partition [ for multi-CPU and multi-GPU ]
        3. Drop samples with missing data [ predominantly cancelled flights ]
        4. Split dataset into train and test subsets
    """
    with PerfTimer('ETL'):
        if 'single' in self.compute_type:
            if 'CPU' in self.compute_type:
                from sklearn.model_selection import train_test_split
                dataset = pandas.read_parquet(self.target_files,
                                              columns=columns,
                                              engine='pyarrow')
                dataset = dataset.dropna()
                X_train, X_test, y_train, y_test = train_test_split(
                    dataset.loc[:, dataset.columns != label_column],
                    dataset[label_column],
                    random_state=random_seed)
            elif 'GPU' in self.compute_type:
                from cuml.preprocessing.model_selection import train_test_split
                dataset = cudf.read_parquet(self.target_files, columns=columns)
                dataset = dataset.dropna()
                X_train, X_test, y_train, y_test = train_test_split(
                    dataset, label_column, random_state=random_seed)
            return X_train, X_test, y_train, y_test

        elif 'multi' in self.compute_type:
            from dask_ml.model_selection import train_test_split
            if 'CPU' in self.compute_type:
                dataset = dask.dataframe.read_parquet(self.target_files,
                                                      columns=columns,
                                                      engine='pyarrow')
            elif 'GPU' in self.compute_type:
                dataset = dask_cudf.read_parquet(self.target_files,
                                                 columns=columns)

            # drop missing values [ ~2.5% -- predominantly cancelled flights ]
            dataset = dataset.dropna()

            # repartition [ inplace ], rebalance ratio of workers & data partitions
            initial_npartitions = dataset.npartitions
            dataset = dataset.repartition(npartitions=self.n_workers)

            # split [ always runs, regardless of whether dataset is cached ]
            train, test = train_test_split(dataset, random_state=random_seed)

            # build X [ features ], y [ labels ] for the train and test subsets
            y_train = train[label_column].astype('int32')
            X_train = train.drop(label_column, axis=1).astype('float32')
            y_test = test[label_column].astype('int32')
            X_test = test.drop(label_column, axis=1).astype('float32')

            # return [ CPU/GPU ] dask dataframes
            return X_train, X_test, y_train, y_test

    return None
def diff_time(train, valid):
    gf1 = cudf.from_pandas(
        train[["timestamp", "a_user_id", "b_user_id", "tweet_id", "no_tweet"]]
    ).reset_index(drop=True)
    gf2 = cudf.from_pandas(
        valid[["timestamp", "a_user_id", "b_user_id", "tweet_id", "no_tweet"]]
    ).reset_index(drop=True)
    gf = cudf.concat([gf1, gf2], axis=0)
    gf = dask_cudf.from_cudf(gf, npartitions=16)
    gf["timestamp"] = gf["timestamp"].astype("int64") / 1e9

    gf_unique = gf[["timestamp", "a_user_id", "tweet_id"]].drop_duplicates()
    gf_unique.columns = ["tmp_timestamp", "tmp_a_user_id", "tmp_tweet_id"]

    gf = gf[gf["no_tweet"] != 0]
    gf = gf.drop("no_tweet", axis=1)
    gf = gf.drop("a_user_id", axis=1)
    gf = gf.merge(gf_unique, how="left", left_on="b_user_id", right_on="tmp_a_user_id")
    gf = gf[gf["tweet_id"] != gf["tmp_tweet_id"]]
    gf = gf[~gf["tmp_a_user_id"].isna()]
    gf["diff_timestamp_prev"] = gf["timestamp"] - gf["tmp_timestamp"]
    gf["diff_timestamp_after"] = gf["tmp_timestamp"] - gf["timestamp"]
    # Non-positive gaps are replaced with a 15-day sentinel (in seconds)
    gf["diff_timestamp_after"] = gf.diff_timestamp_after.where(
        gf["diff_timestamp_after"] > 0, 15 * 24 * 3600)
    gf["diff_timestamp_prev"] = gf.diff_timestamp_prev.where(
        gf["diff_timestamp_prev"] > 0, 15 * 24 * 3600)
    gf = (gf[["tweet_id", "b_user_id", "diff_timestamp_prev", "diff_timestamp_after"]]
          .groupby(["tweet_id", "b_user_id"]).min().reset_index())
    gf.to_parquet("/tmp/gf")

    del gf
    del gf_unique
    del gf1
    del gf2
    gc.collect()

    gf = cudf.read_parquet("/tmp/gf/part.0.parquet")

    gf1 = cudf.from_pandas(train[["b_user_id", "tweet_id"]]).reset_index(drop=True)
    gf1["idx"] = gf1.index
    gf1 = gf1.merge(
        gf,
        how="left",
        left_on=["tweet_id", "b_user_id"],
        right_on=["tweet_id", "b_user_id"],
    )
    gf1 = gf1.sort_values("idx")
    train["diff_timestamp_prev"] = (
        gf1["diff_timestamp_prev"].fillna(15 * 24 * 3600).astype("int32").to_array())
    train["diff_timestamp_after"] = (
        gf1["diff_timestamp_after"].fillna(15 * 24 * 3600).astype("int32").to_array())
    del gf1
    gc.collect()

    gf1 = cudf.from_pandas(valid[["b_user_id", "tweet_id"]]).reset_index(drop=True)
    gf1["idx"] = gf1.index
    gf1 = gf1.merge(
        gf,
        how="left",
        left_on=["tweet_id", "b_user_id"],
        right_on=["tweet_id", "b_user_id"],
    )
    gf1 = gf1.sort_values("idx")
    valid["diff_timestamp_prev"] = (
        gf1["diff_timestamp_prev"].fillna(15 * 24 * 3600).astype("int32").to_array())
    valid["diff_timestamp_after"] = (
        gf1["diff_timestamp_after"].fillna(15 * 24 * 3600).astype("int32").to_array())
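# diff_time tags rows with an "idx" column before merging and sorts on it
# afterwards because cudf merges do not preserve row order. A minimal sketch of
# that order-restoration pattern on toy frames (hypothetical data; assumes only
# cudf):
import cudf

left = cudf.DataFrame({"key": [2, 1, 2]})
left["idx"] = left.index  # remember the original row order
right = cudf.DataFrame({"key": [1, 2], "val": [10, 20]})

merged = left.merge(right, on="key", how="left").sort_values("idx")
# Row order now matches `left`, so columns can be copied back positionally
print(merged["val"].to_pandas().tolist())  # [20, 10, 20]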
def test_hugectr(tmpdir, client, df, dataset, output_format, engine, op_columns,
                 num_io_threads, use_client):
    client = client if use_client else None

    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    ext = ""
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)
    outdir = str(outdir)

    conts = nvt.ColumnGroup(cont_names) >> ops.Normalize
    cats = nvt.ColumnGroup(cat_names) >> ops.Categorify

    workflow = nvt.Workflow(conts + cats + label_names)
    transformed = workflow.fit_transform(dataset)

    if output_format == "hugectr":
        transformed.to_hugectr(
            cats=cat_names,
            conts=cont_names,
            labels=label_names,
            output_path=outdir,
            out_files_per_proc=nfiles,
            num_threads=num_io_threads,
        )
    else:
        transformed.to_parquet(
            output_path=outdir,
            out_files_per_proc=nfiles,
            num_threads=num_io_threads,
        )

    # Check for _file_list.txt
    assert os.path.isfile(outdir + "/_file_list.txt")

    # Check for _metadata.json
    assert os.path.isfile(outdir + "/_metadata.json")

    # Check contents of _metadata.json
    data = {}
    col_summary = {}
    with open(outdir + "/_metadata.json", "r") as fil:
        for k, v in json.load(fil).items():
            data[k] = v
    assert "cats" in data
    assert "conts" in data
    assert "labels" in data
    assert "file_stats" in data
    # Parenthesize the expected value so the comparison (not the assert) is conditional
    assert len(data["file_stats"]) == (
        nfiles if not client else nfiles * len(client.cluster.workers))
    for cdata in data["cats"] + data["conts"] + data["labels"]:
        col_summary[cdata["index"]] = cdata["col_name"]

    # Check that data files exist
    ext = ""
    if output_format == "parquet":
        ext = "parquet"
    elif output_format == "hugectr":
        ext = "data"
    data_files = [
        os.path.join(outdir, filename) for filename in os.listdir(outdir)
        if filename.endswith(ext)
    ]

    # Make sure the columns in "_metadata.json" make sense
    if output_format == "parquet":
        # data_files entries are already full paths
        df_check = cudf.read_parquet(data_files[0])
        for i, name in enumerate(df_check.columns):
            if i in col_summary:
                assert col_summary[i] == name
import cudf as dd
from feature_engineering_2 import (
    pos_cash,
    process_unified,
    process_bureau_and_balance,
    process_previous_applications,
    installments_payments,
    credit_card_balance,
)

# initiating mem management
# this allows for spilling out of the gpu ram
dd.set_allocator("managed")

### Load datasets
print("loading data")
bureau_balance = dd.read_parquet('raw_data/bureau_balance.parquet')
bureau = dd.read_parquet('raw_data/bureau.parquet')
cc_balance = dd.read_parquet('raw_data/cc_balance.parquet')
payments = dd.read_parquet('raw_data/payments.parquet')
pc_balance = dd.read_parquet('raw_data/pc_balance.parquet')
prev = dd.read_parquet('raw_data/prev.parquet')
train = dd.read_parquet('raw_data/train.parquet')
test = dd.read_parquet('raw_data/test.parquet')

train_index = train.index
test_index = test.index
train_target = train['TARGET']
unified = dd.concat([train.drop('TARGET', axis=1), test])

print("starting processing")
unified_feat = process_unified(unified, dd)
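# cudf.set_allocator("managed") above switches cudf to CUDA unified (managed)
# memory, letting allocations spill to host RAM instead of failing when GPU
# memory runs out. A sketch of the equivalent setup through RMM directly --
# offered as an assumption for newer RAPIDS releases where set_allocator has
# been deprecated in favour of configuring RMM:
import rmm

rmm.reinitialize(managed_memory=True)  # route allocations through managed memory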
"buffer": distances["dist"].values }) buffers = buffers.sort_values("buffer", ascending=False).drop_duplicates("node_id") buffers["buffer"] = buffers["buffer"].astype("int") # this will drop rows that did not appear in the KNN i.e unneeded poi # BUG: Unsure this works, bluespace retains ~140,000 points return poi_nn.merge(buffers, on="node_id", how="left").dropna() if __name__ == "__main__": print("Starting routing data processing...") print("Reading and cleaning data...") nodes = cudf.read_parquet(Config.OS_GRAPH / "nodes.parquet") pcs: cudf.DataFrame = cudf.read_parquet( Config.PROCESSED_DATA / "postcodes.parquet").set_index("postcode") retail: cudf.DataFrame = clean_retail(path=Config.RAW_DATA / "LDC_Secure_Snapshot_2020_01.csv", postcodes=pcs) fast_food: cudf.DataFrame = clean_fast_food(retail=retail) gambling: cudf.DataFrame = clean_gambling(retail=retail) offlicences: cudf.DataFrame = clean_offlicences(retail=retail) pubs: cudf.DataFrame = clean_pubs(retail=retail) tobacconists: cudf.DataFrame = clean_tobacconists(retail=retail) leisure: cudf.DataFrame = clean_leisure(retail=retail) print("Finding nearest node to postcodes...") postcodes = nearest_nodes(df=pcs.reset_index(), nodes=nodes)
def test_hugectr(tmpdir, df, dataset, output_format, engine, op_columns):
    cat_names = ["name-cat", "name-string"] if engine == "parquet" else ["name-string"]
    cont_names = ["x", "y"]
    label_names = ["label"]

    # set variables
    nfiles = 10
    ext = ""
    outdir = tmpdir + "/hugectr"
    os.mkdir(outdir)

    # process data
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names,
                             label_name=label_names)
    processor.add_feature([ops.ZeroFill(columns=op_columns), ops.LogOp()])
    processor.add_preprocess(ops.Normalize())
    processor.add_preprocess(ops.Categorify())
    processor.finalize()

    # Need to collect statistics first (for now)
    processor.update_stats(dataset)

    # Second "online" pass to write HugeCTR output
    processor.apply(
        dataset,
        apply_offline=False,
        record_stats=False,
        output_path=outdir,
        out_files_per_proc=nfiles,
        output_format=output_format,
        shuffle=False,
    )

    # Check for _file_list.txt
    assert os.path.isfile(outdir + "/_file_list.txt")

    # Check for _metadata.json
    assert os.path.isfile(outdir + "/_metadata.json")

    # Check contents of _metadata.json
    data = {}
    col_summary = {}
    with open(outdir + "/_metadata.json", "r") as fil:
        for k, v in json.load(fil).items():
            data[k] = v
    assert "cats" in data
    assert "conts" in data
    assert "labels" in data
    assert "file_stats" in data
    assert len(data["file_stats"]) == nfiles
    for cdata in data["cats"] + data["conts"] + data["labels"]:
        col_summary[cdata["index"]] = cdata["col_name"]

    # Check that data files exist
    ext = ""
    if output_format == "parquet":
        ext = "parquet"
    elif output_format == "hugectr":
        ext = "data"
    for n in range(nfiles):
        assert os.path.isfile(os.path.join(outdir, str(n) + "." + ext))

    # Make sure the columns in "_metadata.json" make sense
    if output_format == "parquet":
        df_check = cudf.read_parquet(os.path.join(outdir, "0.parquet"))
        for i, name in enumerate(df_check.columns):
            if i in col_summary:
                assert col_summary[i] == name
"buffer": distances["dist"].values }) buffers = buffers.sort_values("buffer", ascending=False).drop_duplicates("node_id") buffers["buffer"] = buffers["buffer"].astype("int") # this will drop rows that did not appear in the KNN i.e unneeded poi return (poi_nn.merge(buffers, on="node_id", how="left").dropna().drop_duplicates("node_id")) if __name__ == "__main__": logger.info("Starting routing data processing...") logger.debug("Reading and cleaning data...") nodes: cudf.DataFrame = cudf.read_parquet(Config.OS_GRAPH / "nodes.parquet") pcs: cudf.DataFrame = clean_postcodes(path=Config.RAW_DATA / "onspd" / "ONSPD_FEB_2022_UK.csv", current=True) all_pcs: cudf.DataFrame = clean_postcodes(path=Config.RAW_DATA / "onspd" / "ONSPD_FEB_2022_UK.csv", current=False).drop("lsoa11", axis=1) all_pcs.reset_index().to_parquet(Config.PROCESSED_DATA / "all_pcs.parquet") gpp: cudf.DataFrame = clean_gpp( england=Config.RAW_DATA / "nhs" / "epraccur.csv", scotland=Config.RAW_DATA / "nhs" / "scotland" / "gpp.csv", postcodes=all_pcs, )
def read_partition(fs, piece, columns, index, categories=(), partitions=(), **kwargs):
    if columns is not None:
        columns = [c for c in columns]
    if isinstance(index, list):
        columns += index

    if isinstance(piece, str):
        path = piece
        row_group = None
        partition_keys = []
    else:
        (path, row_group, partition_keys) = piece

    strings_to_cats = kwargs.get("strings_to_categorical", False)
    if cudf.utils.ioutils._is_local_filesystem(fs):
        df = cudf.read_parquet(
            path,
            engine="cudf",
            columns=columns,
            row_groups=row_group,
            strings_to_categorical=strings_to_cats,
            **kwargs.get("read", {}),
        )
    else:
        with fs.open(path, mode="rb") as f:
            df = cudf.read_parquet(
                f,
                engine="cudf",
                columns=columns,
                row_groups=row_group,
                strings_to_categorical=strings_to_cats,
                **kwargs.get("read", {}),
            )

    if index and (index[0] in df.columns):
        df = df.set_index(index[0])
    elif index is False and set(df.index.names).issubset(columns):
        # If index=False, we need to make sure all of the
        # names in `columns` are actually in `df.columns`
        df.reset_index(inplace=True)

    if partition_keys:
        if partitions is None:
            raise ValueError("Must pass partition sets")
        for i, (name, index2) in enumerate(partition_keys):
            categories = [val.as_py() for val in partitions.levels[i].dictionary]

            col = as_column(index2).as_frame().repeat(len(df))._data[None]
            df[name] = build_categorical_column(
                categories=categories,
                codes=as_column(col.base_data, dtype=col.dtype),
                size=col.size,
                offset=col.offset,
                ordered=False,
            )

    return df
def test_dask_groupby_stats(client, tmpdir, datasets, part_mem_fraction):
    engine = "parquet"
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    df1 = cudf.read_parquet(paths[0])[mycols_pq]
    df2 = cudf.read_parquet(paths[1])[mycols_pq]
    df0 = cudf.concat([df1, df2], axis=0)

    cat_names = ["name-cat", "name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(client=client,
                         cat_names=cat_names,
                         cont_names=cont_names,
                         label_name=label_name)

    processor.add_preprocess(
        ops.JoinGroupby(cont_names=cont_names,
                        stats=["count", "sum", "std", "min"],
                        out_path=str(tmpdir)))
    processor.finalize()

    dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    processor.apply(dataset)
    result = processor.get_ddf().compute(scheduler="synchronous")

    # Validate result
    assert len(df0) == len(result)
    assert "name-cat_x_std" in result.columns
    assert "name-cat_x_var" not in result.columns
    assert "name-string_x_std" in result.columns
    assert "name-string_x_var" not in result.columns

    # Check "count"
    assert_eq(
        result[["name-cat", "name-cat_count"]]
        .drop_duplicates().sort_values("name-cat")["name-cat_count"],
        df0.groupby("name-cat").agg({"x": "count"})["x"].astype(np.int64),
        check_index=False,
        check_dtype=False,  # May get int64 vs int32
        check_names=False,
    )

    # Check "min"
    assert_eq(
        result[["name-string", "name-string_x_min"]]
        .drop_duplicates().sort_values("name-string")["name-string_x_min"],
        df0.groupby("name-string").agg({"x": "min"})["x"],
        check_index=False,
        check_names=False,
    )

    # Check "std"
    assert_eq(
        result[["name-string", "name-string_x_std"]]
        .drop_duplicates().sort_values("name-string")["name-string_x_std"],
        df0.groupby("name-string").agg({"x": "std"})["x"],
        check_index=False,
        check_names=False,
    )
def test_gpu_workflow_config(tmpdir, datasets, dump, gpu_memory_frac, engine, replace):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])

    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    else:
        df1 = cudf.read_csv(paths[0], header=False, names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=False, names=allcols_csv)[mycols_csv]
    df = cudf.concat([df1, df2], axis=0)
    df["id"] = df["id"].astype("int64")

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
        columns = mycols_pq
    else:
        cat_names = ["name-string"]
        columns = mycols_csv
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    config = nvt.workflow.get_new_config()
    # add operators with dependencies
    config["FE"]["continuous"] = [[ops.FillMissing(replace=replace), ops.LogOp()]]
    config["PP"]["continuous"] = [[ops.LogOp(replace=replace), ops.Normalize()]]
    config["PP"]["categorical"] = [ops.Categorify()]

    processor = nvt.Workflow(
        cat_names=cat_names,
        cont_names=cont_names,
        label_name=label_name,
        config=config,
        to_cpu=False,
    )

    data_itr = nvt.io.GPUDatasetIterator(
        paths,
        columns=columns,
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
        names=allcols_csv,
    )

    processor.update_stats(data_itr)

    if dump:
        config_file = tmpdir + "/temp.yaml"
        processor.save_stats(config_file)
        processor.clear_stats()
        processor.load_stats(config_file)

    def get_norms(tar: cudf.Series):
        ser_median = tar.dropna().quantile(0.5, interpolation="linear")
        gdf = tar.fillna(ser_median)
        gdf = np.log(gdf + 1)
        return gdf

    # Check mean and std - No good right now we have to add all other changes; Zerofill, Log
    concat_ops = "_FillMissing_LogOp"
    if replace:
        concat_ops = ""
    assert math.isclose(
        get_norms(df.x).mean(), processor.stats["means"]["x" + concat_ops], rel_tol=1e-1
    )
    assert math.isclose(
        get_norms(df.y).mean(), processor.stats["means"]["y" + concat_ops], rel_tol=1e-1
    )
    assert math.isclose(
        get_norms(df.x).std(), processor.stats["stds"]["x" + concat_ops], rel_tol=1e-1
    )
    assert math.isclose(
        get_norms(df.y).std(), processor.stats["stds"]["y" + concat_ops], rel_tol=1e-1
    )

    # Check that categories match
    if engine == "parquet":
        cats_expected0 = df["name-cat"].unique().values_to_string()
        cats0 = processor.stats["encoders"]["name-cat"].get_cats().values_to_string()
        # adding the None entry as a string because of move from gpu
        assert cats0 == ["None"] + cats_expected0
    cats_expected1 = df["name-string"].unique().values_to_string()
    cats1 = processor.stats["encoders"]["name-string"].get_cats().values_to_string()
    # adding the None entry as a string because of move from gpu
    assert cats1 == ["None"] + cats_expected1

    # Write to new "shuffled" and "processed" dataset
    processor.write_to_dataset(tmpdir, data_itr, nfiles=10, shuffle=True, apply_ops=True)

    data_itr_2 = nvtabular.io.GPUDatasetIterator(
        glob.glob(str(tmpdir) + "/ds_part.*.parquet"),
        use_row_groups=True,
        gpu_memory_frac=gpu_memory_frac,
    )

    df_pp = None
    for chunk in data_itr_2:
        # explicit None check: DataFrame truthiness is ambiguous and would raise
        df_pp = cudf.concat([df_pp, chunk], axis=0) if df_pp is not None else chunk

    if engine == "parquet":
        assert df_pp["name-cat"].dtype == "int64"
    assert df_pp["name-string"].dtype == "int64"

    num_rows, num_row_groups, col_names = cudf.io.read_parquet_metadata(
        str(tmpdir) + "/_metadata")
    assert num_rows == len(df_pp)
    return processor.ds_exports
def test_dask_workflow_api_dlrm(client, tmpdir, datasets, freq_threshold,
                                part_mem_fraction, engine, cat_cache, on_host, shuffle):
    paths = glob.glob(str(datasets[engine]) + "/*." + engine.split("-")[0])
    if engine == "parquet":
        df1 = cudf.read_parquet(paths[0])[mycols_pq]
        df2 = cudf.read_parquet(paths[1])[mycols_pq]
    elif engine == "csv":
        df1 = cudf.read_csv(paths[0], header=0)[mycols_csv]
        df2 = cudf.read_csv(paths[1], header=0)[mycols_csv]
    else:
        df1 = cudf.read_csv(paths[0], names=allcols_csv)[mycols_csv]
        df2 = cudf.read_csv(paths[1], names=allcols_csv)[mycols_csv]
    df0 = cudf.concat([df1, df2], axis=0)

    if engine == "parquet":
        cat_names = ["name-cat", "name-string"]
    else:
        cat_names = ["name-string"]
    cont_names = ["x", "y", "id"]
    label_name = ["label"]

    processor = Workflow(client=client,
                         cat_names=cat_names,
                         cont_names=cont_names,
                         label_name=label_name)

    processor.add_feature([ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()])
    processor.add_preprocess(
        ops.Categorify(
            freq_threshold=freq_threshold,
            out_path=str(tmpdir),
            cat_cache=cat_cache,
            on_host=on_host,
        ))
    processor.finalize()

    if engine in ("parquet", "csv"):
        dataset = Dataset(paths, part_mem_fraction=part_mem_fraction)
    else:
        dataset = Dataset(paths, names=allcols_csv, part_mem_fraction=part_mem_fraction)

    output_path = os.path.join(tmpdir, "processed")
    processor.apply(dataset, output_path=output_path, shuffle=shuffle)

    # Can still access the final ddf if we didn't shuffle
    if not shuffle:
        result = processor.get_ddf().compute()
        assert len(df0) == len(result)
        assert result["x"].min() == 0.0
        assert result["x"].isna().sum() == 0
        assert result["y"].min() == 0.0
        assert result["y"].isna().sum() == 0

        # Check category counts
        cat_expect = df0.groupby("name-string").agg(
            {"name-string": "count"}).reset_index(drop=True)
        cat_result = result.groupby("name-string").agg(
            {"name-string": "count"}).reset_index(drop=True)
        if freq_threshold:
            cat_expect = cat_expect[cat_expect["name-string"] >= freq_threshold]
            # Note that we may need to skip the 0th element in result (null mapping)
            assert_eq(
                cat_expect,
                cat_result.iloc[1:] if len(cat_result) > len(cat_expect) else cat_result,
                check_index=False,
            )
        else:
            assert_eq(cat_expect, cat_result)

        # Read back from disk
        df_disk = dask_cudf.read_parquet(output_path, index=False).compute()
        for col in df_disk:
            assert_eq(result[col], df_disk[col])
    else:
        # Read back from disk
        df_disk = dask_cudf.read_parquet(output_path, index=False).compute()
        assert len(df0) == len(df_disk)
def test_parquet_reader_filenotfound(tmpdir):
    with pytest.raises(FileNotFoundError):
        cudf.read_parquet("TestMissingFile.parquet")

    with pytest.raises(FileNotFoundError):
        cudf.read_parquet(tmpdir.mkdir("cudf_parquet"))
def _get_random_movielens_data(tmpdir, rows, dataset="movie", valid=None):
    if dataset == "movie":
        json_sample_movie = {
            "conts": {},
            "cats": {
                "genres": {
                    "dtype": None,
                    "cardinality": 50,
                    "min_entry_size": 1,
                    "max_entry_size": 5,
                    "multi_min": 2,
                    "multi_max": 4,
                    "multi_avg": 3,
                },
                "movieId": {
                    "dtype": None,
                    "cardinality": 500,
                    "min_entry_size": 1,
                    "max_entry_size": 5,
                },
            },
        }
        cols = datagen._get_cols_from_schema(json_sample_movie)
    if dataset == "ratings":
        json_sample_ratings = {
            "conts": {},
            "cats": {
                "movieId": {
                    "dtype": None,
                    "cardinality": 500,
                    "min_entry_size": 1,
                    "max_entry_size": 5,
                },
                "userId": {
                    "dtype": None,
                    "cardinality": 500,
                    "min_entry_size": 1,
                    "max_entry_size": 5,
                },
            },
            "labels": {"rating": {"dtype": None, "cardinality": 5}},
        }
        cols = datagen._get_cols_from_schema(json_sample_ratings)

    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.1)
    target_path = tmpdir
    df_gen.full_df_create(rows, cols, output=target_path)

    if dataset == "movie":
        movies_converted = cudf.read_parquet(os.path.join(tmpdir, "dataset_0.parquet"))
        movies_converted = movies_converted.drop_duplicates(["movieId"], keep="first")
        movies_converted.to_parquet(os.path.join(tmpdir, "movies_converted.parquet"))
    elif dataset == "ratings" and not valid:
        os.rename(os.path.join(tmpdir, "dataset_0.parquet"),
                  os.path.join(tmpdir, "train.parquet"))
    else:
        os.rename(os.path.join(tmpdir, "dataset_0.parquet"),
                  os.path.join(tmpdir, "valid.parquet"))
def test_parquet_reader_filepath_or_buffer(parquet_path_or_buf, src):
    expect = pd.read_parquet(parquet_path_or_buf("filepath"))
    got = cudf.read_parquet(parquet_path_or_buf(src))

    assert_eq(expect, got)
def test_parquet_write_bytes_io(simple_gdf):
    output = BytesIO()
    simple_gdf.to_parquet(output)
    assert_eq(cudf.read_parquet(output), simple_gdf)