    def test_clip(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)

        clip_tbl = feature_tbl.clip(["col_1", "col_2", "col_3"], min=2, max=None)
        assert isinstance(clip_tbl, FeatureTable), "clip_tbl should be a FeatureTable"
        assert feature_tbl.df.filter("col_1 < 2").count() != 0 and feature_tbl \
            .df.filter("col_2 < 2").count() != 0, "feature_tbl should not be changed"
        assert clip_tbl.df.filter("col_1 < 2").count() == 0, "col_1 should be >= 2"
        assert clip_tbl.df.filter("col_2 < 2").count() == 0, "col_2 should be >= 2"
        assert clip_tbl.df.filter("col_3 < 2").count() == 0, "col_3 should be >= 2"
        with self.assertRaises(Exception) as context:
            feature_tbl.clip(None, 2)
        self.assertTrue('columns should be str or list of str, but got None.'
                        in str(context.exception))

        feature_tbl = FeatureTable.read_parquet(file_path)
        clip_tbl = feature_tbl.clip(["col_1", "col_2", "col_3"], min=None, max=1)
        assert isinstance(clip_tbl, FeatureTable), "clip_tbl should be a FeatureTable"
        assert feature_tbl.df.filter("col_1 > 1").count() != 0 and feature_tbl \
            .df.filter("col_2 > 1").count() != 0, "feature_tbl should not be changed"
        assert clip_tbl.df.filter("col_1 > 1").count() == 0, "col_1 should be <= 1"
        assert clip_tbl.df.filter("col_2 > 1").count() == 0, "col_2 should be <= 1"
        assert clip_tbl.df.filter("col_3 > 1").count() == 0, "col_3 should be <= 1"

        feature_tbl = FeatureTable.read_parquet(file_path)
        clip_tbl = feature_tbl.clip(["col_1", "col_2", "col_3"], min=0, max=1)
        assert isinstance(clip_tbl, FeatureTable), "clip_tbl should be a FeatureTable"
        assert feature_tbl.df.filter("col_1 > 1 or col_1 < 0").count() != 0 and feature_tbl \
            .df.filter("col_2 > 1 or col_2 < 0").count() != 0, \
            "feature_tbl should not be changed"
        assert clip_tbl.df.filter("col_1 < 0").count() == 0, "col_1 should be >= 0"
        assert clip_tbl.df.filter("col_2 > 1").count() == 0, "col_2 should be <= 1"
        assert clip_tbl.df.filter("col_3 < 0 or col_3 > 1").count() == 0, \
            "col_3 should be >= 0 and <= 1"
    def test_write_parquet(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        feature_tbl.write_parquet("saved.parquet")
        loaded_tbl = FeatureTable.read_parquet("saved.parquet")
        # Verify the round trip instead of discarding loaded_tbl.
        assert loaded_tbl.df.count() == feature_tbl.df.count(), \
            "loaded_tbl should have the same number of rows as feature_tbl"
        if os.path.exists("saved.parquet"):
            shutil.rmtree("saved.parquet")
    def test_merge(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        merged_tbl = feature_tbl.merge_cols(["col_1", "col_2", "col_3"], "int_cols")
        assert "col_1" not in merged_tbl.df.columns, \
            "col_1 shouldn't be a column of merged_tbl"
        assert "int_cols" in merged_tbl.df.columns, "int_cols should be a column of merged_tbl"
        assert "col_1" in feature_tbl.df.columns, "col_1 should be a column of feature_tbl"
    def test_dropna(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        dropped_tbl = feature_tbl.dropna(["col_1", "col_4"])
        assert isinstance(dropped_tbl, FeatureTable), "dropped_tbl should be a FeatureTable"
        assert feature_tbl.df.filter("col_1 is null").count() != 0 and feature_tbl \
            .df.filter("col_4 is null").count() != 0, "feature_tbl should not be changed"
        assert dropped_tbl.df.filter("col_1 is null").count() == 0, \
            "col_1 null values should be dropped"
        assert dropped_tbl.df.filter("col_4 is null").count() == 0, \
            "col_4 null values should be dropped"
        assert 0 < dropped_tbl.df.count() < feature_tbl.df.count(), \
            "the number of rows should be decreased"

        dropped_tbl = feature_tbl.dropna(["col_1", "col_4"], how="all")
        assert dropped_tbl.df.filter("col_1 is null and col_4 is null").count() == 0, \
            "col_1 and col_4 should not both have null values"
        dropped_tbl = feature_tbl.dropna(["col_2", "col_4"], how="all")
        assert dropped_tbl.df.filter("col_2 is null").count() > 0, \
            "col_2 should still have null values after dropna with how=all"
        dropped_tbl = feature_tbl.dropna(["col_2", "col_3", "col_5"], thresh=2)
        assert dropped_tbl.df.filter("col_2 is null").count() > 0, \
            "col_2 should still have null values after dropna with thresh=2"
        assert dropped_tbl.df.filter("col_3 is null and col_5 is null").count() == 0, \
            "col_3 and col_5 should not both have null values"
    def test_gen_string_idx(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        string_idx_list = feature_tbl.gen_string_idx(["col_4", "col_5"], freq_limit="1")
        assert string_idx_list[0].count() == 3, "col_4 should have 3 indices"
        assert string_idx_list[1].count() == 2, "col_5 should have 2 indices"
        with tempfile.TemporaryDirectory() as local_path:
            for str_idx in string_idx_list:
                str_idx.write_parquet(local_path)
                str_idx_log = str_idx.log(["id"])
                assert str_idx.df.filter("id == 1").count() == 1, \
                    "str_idx should contain one row with id == 1"
                assert str_idx_log.df.filter("id == 1").count() == 0, \
                    "str_idx_log should contain no row with id == 1 after log"
            assert os.path.isdir(local_path + "/col_4.parquet")
            assert os.path.isdir(local_path + "/col_5.parquet")
            new_col_4_idx = StringIndex.read_parquet(local_path + "/col_4.parquet")
            assert "col_4" in new_col_4_idx.df.columns, \
                "col_4 should be a column of new_col_4_idx"
            with self.assertRaises(Exception) as context:
                StringIndex.read_parquet(local_path + "/col_5.parquet", "col_4")
            self.assertTrue('col_4 should be a column of the DataFrame' in str(
                context.exception))
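    # Not a test: a minimal usage sketch of the gen_string_idx -> encode_string
    # round trip, using only APIs exercised elsewhere in this file (encode_string
    # appears in the preprocessing scripts below). encode_string replaces each
    # string value with the integer id assigned by its StringIndex:
    #
    #     tbl = FeatureTable.read_parquet(file_path)
    #     idx_list = tbl.gen_string_idx(["col_4", "col_5"], freq_limit="1")
    #     encoded = tbl.encode_string(["col_4", "col_5"], idx_list)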
    def test_group_by(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data2.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)

        groupby_tbl1 = feature_tbl.group_by("col_4", agg={"col_1": ["sum", "count"]})
        assert groupby_tbl1.df.filter("col_4 == 'a' and `sum(col_1)` == 3").count() == 1, \
            "the sum of col_1 with col_4 = 'a' should be 3"
        assert groupby_tbl1.df.filter("col_4 == 'b' and `count(col_1)` == 5").count() == 1, \
            "the count of col_1 with col_4 = 'b' should be 5"

        groupby_tbl2 = feature_tbl.group_by(agg={"target": "avg", "col_2": "last"})
        assert groupby_tbl2.df.collect()[0]["avg(target)"] == 0.9, \
            "the mean of target should be 0.9"

        groupby_tbl3 = feature_tbl.group_by("col_5", agg=["max", "min"], join=True)
        assert len(groupby_tbl3.df.columns) == len(feature_tbl.df.columns) + 10, \
            "groupby_tbl3 should have (#columns - #group-by columns) * #agg = 10 more columns"
        assert groupby_tbl3.df.filter("col_5 == 'cc' and `max(col_2)` == 9").count() == \
            feature_tbl.df.filter("col_5 == 'cc'").count(), \
            "max of col_2 should be 9 for all rows with col_5 = 'cc' in groupby_tbl3"
        assert groupby_tbl3.df.filter("col_5 == 'aa' and `min(col_3)` == 1.0").count() == \
            feature_tbl.df.filter("col_5 == 'aa'").count(), \
            "min of col_3 should be 1.0 for all rows with col_5 = 'aa' in groupby_tbl3"

        groupby_tbl4 = feature_tbl.group_by(["col_4", "col_5"], agg="first", join=True)
        assert groupby_tbl4.df.filter("col_4 == 'b' and col_5 == 'dd' and `first(col_1)` == 0") \
            .count() == feature_tbl.df.filter("col_4 == 'b' and col_5 == 'dd'").count(), \
            "first of col_1 should be 0 for all rows with col_4 = 'b' and col_5 = 'dd' " \
            "in groupby_tbl4"
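    # Not a test: for reference, group_by("col_4", agg={"col_1": ["sum", "count"]})
    # conceptually corresponds to the plain PySpark below (an assumption about the
    # behavior, not necessarily the implementation), which produces the
    # `sum(col_1)` and `count(col_1)` columns asserted above:
    #
    #     import pyspark.sql.functions as F
    #     feature_tbl.df.groupBy("col_4").agg(F.sum("col_1"), F.count("col_1"))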
    def test_filter(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        filtered_tbl = feature_tbl.filter(feature_tbl.col_1 == 1)
        assert filtered_tbl.size() == 3, "only 3 out of 5 rows have value 1 for col_1"
        filtered_tbl2 = feature_tbl.filter(
            (feature_tbl.col("col_1") == 1) & (feature_tbl.col_2 == 1))
        assert filtered_tbl2.size() == 1, \
            "only 1 out of 5 rows has value 1 for both col_1 and col_2"
    def test_gen_string_idx_none(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        string_idx_list = feature_tbl.gen_string_idx(["col_4", "col_5"], freq_limit=None)
        assert string_idx_list[0].size() == 3, "col_4 should have 3 indices"
        assert string_idx_list[1].size() == 2, "col_5 should have 2 indices"
    def test_columns(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        col_names = feature_tbl.columns
        assert isinstance(col_names, list), "col_names should be a list of strings"
        assert col_names == ["col_1", "col_2", "col_3", "col_4", "col_5"], \
            "column names are incorrect"
    def test_rename(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        name_dict = {"col_1": "new_col1", "col_4": "new_col4"}
        rename_tbl = feature_tbl.rename(name_dict)
        cols = rename_tbl.df.columns
        assert isinstance(rename_tbl, FeatureTable), "rename_tbl should be a FeatureTable"
        assert "col_1" in feature_tbl.df.columns, "feature_tbl should not be changed"
        assert "new_col1" in cols, "new_col1 should be a column of the renamed tbl."
        assert "new_col4" in cols, "new_col4 should be a column of the renamed tbl."
    def test_log(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        log_tbl = feature_tbl.log(["col_1", "col_2", "col_3"])
        assert isinstance(log_tbl, FeatureTable), "log_tbl should be a FeatureTable"
        assert feature_tbl.df.filter("col_1 == 1").count() != 0 and feature_tbl \
            .df.filter("col_2 == 1").count() != 0, "feature_tbl should not be changed"
        assert log_tbl.df.filter("col_1 == 1").count() == 0, \
            "col_1 should not contain 1 after log"
        assert log_tbl.df.filter("col_2 == 1").count() == 0, \
            "col_2 should not contain 1 after log"
        assert log_tbl.df.filter("col_3 == 1").count() == 0, \
            "col_3 should not contain 1 after log"
    def test_gen_string_idx_dict(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        string_idx_list = feature_tbl.gen_string_idx(["col_4", "col_5"],
                                                     freq_limit={"col_4": 1, "col_5": 3})
        with self.assertRaises(Exception) as context:
            feature_tbl.gen_string_idx(["col_4", "col_5"], freq_limit="col_4:1,col_5:3")
        self.assertTrue('freq_limit only supports int, dict or None, but get str' in str(
            context.exception))
        assert string_idx_list[0].size() == 3, "col_4 should have 3 indices"
        assert string_idx_list[1].size() == 1, "col_5 should have 1 index"
    def test_select(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        select_tbl = feature_tbl.select("col_1", "col_2")
        assert "col_1" in select_tbl.df.columns, "col_1 should be selected"
        assert "col_2" in select_tbl.df.columns, "col_2 should be selected"
        assert "col_3" not in select_tbl.df.columns, "col_3 should not be selected"
        assert feature_tbl.size() == select_tbl.size(), \
            "the selected table should have the same number of rows"
        with self.assertRaises(Exception) as context:
            feature_tbl.select()
        self.assertTrue("cols should be str or a list of str, but got None."
                        in str(context.exception))
    def test_fillna_long(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        filled_tbl = feature_tbl.fillna(3, ["col_1", "col_2", "col_3"])
        assert isinstance(filled_tbl, FeatureTable), "filled_tbl should be a FeatureTable"
        assert feature_tbl.df.filter("col_2 is null").count() != 0 and feature_tbl \
            .df.filter("col_3 is null").count() != 0, "feature_tbl should not be changed"
        assert filled_tbl.df.filter("col_1 is null").count() == 0, \
            "col_1 null values should be filled"
        assert filled_tbl.df.filter("col_2 is null").count() == 0, \
            "col_2 null values should be filled"
        assert filled_tbl.df.filter("col_3 is null").count() == 0, \
            "col_3 null values should be filled"
    def test_fillna_string(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        with self.assertRaises(Exception) as context:
            feature_tbl.fillna(3.2, ["col_4", "col_5"])
        self.assertTrue('numeric does not match the type of column col_4'
                        in str(context.exception))
        filled_tbl = feature_tbl.fillna("bb", ["col_4", "col_5"])
        assert isinstance(filled_tbl, FeatureTable), "filled_tbl should be a FeatureTable"
        assert filled_tbl.df.filter("col_4 is null").count() == 0, \
            "col_4 null values should be filled"
        assert filled_tbl.df.filter("col_5 is null").count() == 0, \
            "col_5 null values should be filled"
    def test_fill_median(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        with self.assertRaises(Exception) as context:
            feature_tbl.fill_median(["col_4", "col_5"])
        self.assertTrue('col_4 with data type StringType is not supported'
                        in str(context.exception))
        filled_tbl = feature_tbl.fill_median(["col_1", "col_2"])
        assert isinstance(filled_tbl, FeatureTable), "filled_tbl should be a FeatureTable"
        assert filled_tbl.df.filter("col_1 is null").count() == 0, \
            "col_1 null values should be filled"
        assert filled_tbl.df.filter("col_2 is null").count() == 0, \
            "col_2 null values should be filled"
    def test_cross(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path).fillna(0, ["col_2", "col_3"])
        crossed_tbl = feature_tbl.cross_columns([["col_2", "col_3"]], [100])
        assert "col_2_col_3" in crossed_tbl.df.columns, "crossed column is not created"
        max_value = crossed_tbl.df.select("col_2_col_3") \
            .agg(max(col("col_2_col_3")).alias("max")) \
            .rdd.map(lambda row: row['max']).collect()[0]
        min_value = crossed_tbl.df.select("col_2_col_3") \
            .agg(min(col("col_2_col_3")).alias("min")) \
            .rdd.map(lambda row: row['min']).collect()[0]
        assert max_value <= 100, "cross values shouldn't be more than 100 after cross"
        assert min_value > 0, "cross values should be positive after cross"
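    # Not a test: cross_columns hashes each (col_2, col_3) combination into a fixed
    # number of buckets, which is why col_2_col_3 is asserted to fall in (0, 100].
    # A rough, purely illustrative analogue (the exact hash is an assumption):
    #
    #     from pyspark.sql.functions import abs, concat_ws, hash
    #     df.withColumn("col_2_col_3",
    #                   abs(hash(concat_ws("_", "col_2", "col_3"))) % 100 + 1)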
    # Clip with a positional min argument only; renamed from test_clip so it does
    # not shadow the keyword-argument test of the same name above.
    def test_clip_min(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        clip_tbl = feature_tbl.clip(["col_1", "col_2", "col_3"], 2)
        assert isinstance(clip_tbl, FeatureTable), "clip_tbl should be a FeatureTable"
        assert feature_tbl.df.filter("col_1 < 2").count() != 0 and feature_tbl \
            .df.filter("col_2 < 2").count() != 0, "feature_tbl should not be changed"
        assert clip_tbl.df.filter("col_1 < 2").count() == 0, "col_1 should be >= 2"
        assert clip_tbl.df.filter("col_2 < 2").count() == 0, "col_2 should be >= 2"
        assert clip_tbl.df.filter("col_3 < 2").count() == 0, "col_3 should be >= 2"
    def test_median(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        with self.assertRaises(Exception) as context:
            feature_tbl.median(["col_4", "col_5"])
        self.assertTrue('col_4 with data type StringType is not supported'
                        in str(context.exception))
        median_tbl = feature_tbl.median(["col_1", "col_2", "col_3"])
        assert isinstance(median_tbl, FeatureTable), "median_tbl should be a FeatureTable"
        assert median_tbl.df.count() == 3, \
            "median_tbl should have one row per specified column"
        assert median_tbl.df.filter("column == 'col_1'").count() == 1, \
            "col_1 should exist in the 'column' column of median_tbl"
        assert median_tbl.df.filter("column == 'col_2'").filter("median == 1.0").count() == 1, \
            "the median of col_2 should be 1.0"
    def test_norm(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path).fillna(0, ["col_2", "col_3"])
        normalized_tbl = feature_tbl.normalize(["col_2"])
        max_value = normalized_tbl.df.select("col_2") \
            .agg(max(col("col_2")).alias("max")) \
            .rdd.map(lambda row: row['max']).collect()[0]
        min_value = normalized_tbl.df.select("col_2") \
            .agg(min(col("col_2")).alias("min")) \
            .rdd.map(lambda row: row['min']).collect()[0]
        assert max_value <= 1, "col_2 shouldn't be more than 1 after normalization"
        assert min_value >= 0, "col_2 shouldn't be less than 0 after normalization"

        tbl2 = FeatureTable(feature_tbl.df.withColumn("col2-col3", array(["col_2", "col_3"])))
        normalized_tbl2 = tbl2.normalize(["col_2", "col2-col3"])
        normalized_tbl2.compute()
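    # Not a test: the bounds asserted above are consistent with min-max scaling,
    # i.e. x' = (x - min) / (max - min) per column; whether normalize uses exactly
    # this formula is an assumption based only on the observed [0, 1] range, e.g.
    #
    #     scaled = (F.col("col_2") - min_value) / (max_value - min_value)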
    def test_fillna_int(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        filled_tbl = feature_tbl.fillna(5, ["col_2", "col_3"])
        assert isinstance(filled_tbl, FeatureTable), "filled_tbl should be a FeatureTable"
        assert feature_tbl.df.filter("col_2 is null").count() != 0 and feature_tbl \
            .df.filter("col_3 is null").count() != 0, "feature_tbl should not be changed"
        assert filled_tbl.df.filter("col_2 == 5").count() == 1, \
            "col_2 null values should be filled with 5"
        assert filled_tbl.df.filter("col_3 == 5").count() == 1, \
            "col_3 null values should be filled with 5"

        filled_tbl = feature_tbl.fillna(5, None)
        assert filled_tbl.df.filter("col_2 == 5").count() == 1, \
            "col_2 null values should be filled with 5"
        assert filled_tbl.df.filter("col_3 == 5").count() == 1, \
            "col_3 null values should be filled with 5"

        with self.assertRaises(Exception) as context:
            feature_tbl.fillna(0, ["col_2", "col_3", "col_8"])
        self.assertTrue('do not exist in this Table' in str(context.exception))
"len_links", "hashtags", "present_links", "present_domains" ] cat_cols = [ "engaged_with_user_is_verified", "enaging_user_is_verified", "present_media", "tweet_type", "language" ] ratio_cols = [ "engaged_with_user_follower_following_ratio", "enaging_user_follower_following_ratio" ] embed_cols = [ "enaging_user_id", "engaged_with_user_id", "hashtags", "present_links", "present_domains" ] useful_cols = num_cols + cat_cols + embed_cols train_tbl = FeatureTable.read_parquet(args.data_dir + "/train_parquet") test_tbl = FeatureTable.read_parquet(args.data_dir + "/test_parquet") full_tbl = train_tbl.concat(test_tbl, "outer") reindex_tbls = full_tbl.gen_reindex_mapping( embed_cols, freq_limit=args.frequency_limit) train_tbl, test_tbl, user_info, item_info = prepare_features( train_tbl, test_tbl, reindex_tbls) output_dir = args.data_dir + "/embed_reindex" for i, c in enumerate(embed_cols): reindex_tbls[i].write_parquet(output_dir + "_c") train_config = { "lr": 1e-3, "user_col_info": user_info, "item_col_info": item_info,
if __name__ == '__main__':
    args = _parse_args()
    if args.cluster_mode == "local":
        init_orca_context("local", cores=args.executor_cores, memory=args.executor_memory)
    elif args.cluster_mode == "standalone":
        init_orca_context("standalone", master=args.master, cores=args.executor_cores,
                          num_nodes=args.num_executor, memory=args.executor_memory,
                          driver_cores=args.driver_cores, driver_memory=args.driver_memory,
                          conf=conf)
    elif args.cluster_mode == "yarn":
        init_orca_context("yarn-client", cores=args.executor_cores,
                          num_nodes=args.num_executor, memory=args.executor_memory,
                          driver_cores=args.driver_cores, driver_memory=args.driver_memory,
                          conf=conf)

    time_start = time()
    paths = [os.path.join(args.input_folder, 'day_%d.parquet' % i) for i in args.day_range]
    tbl = FeatureTable.read_parquet(paths)
    idx_list = tbl.gen_string_idx(CAT_COLS, freq_limit=args.frequency_limit)

    tbl_all_data = FeatureTable.read_parquet(paths[:-1])
    tbl_all_data = tbl_all_data.encode_string(CAT_COLS, idx_list) \
        .fillna(0, INT_COLS + CAT_COLS).log(INT_COLS)
    tbl_all_data = tbl_all_data.merge_cols(INT_COLS, "X_int").merge_cols(CAT_COLS, "X_cat")
    tbl_all_data.compute()
    time_end = time()
    print("Train data loading and preprocessing time: ", time_end - time_start)
    tbl_all_data.show(5)
    print("Finished")
    stop_orca_context()
if args.cluster_mode == "local": init_orca_context("local", cores=args.cores, memory=args.memory) elif args.cluster_mode == "standalone": init_orca_context("standalone", master=args.master, cores=args.cores, num_nodes=args.num_nodes, memory=args.memory, driver_cores=args.driver_cores, driver_memory=args.driver_memory, conf=conf) elif args.cluster_mode == "yarn": init_orca_context("yarn-client", cores=args.cores, num_nodes=args.num_nodes, memory=args.memory, driver_cores=args.driver_cores, driver_memory=args.driver_memory, conf=conf) time_start = time() paths = [os.path.join(args.input_folder, "day_%d.parquet" % i) for i in args.day_range] tbl = FeatureTable.read_parquet(paths) idx_list = tbl.gen_string_idx(CAT_COLS, freq_limit=args.frequency_limit) train_data = FeatureTable.read_parquet(paths[:-1]) train_preprocessed = preprocess_and_save(train_data, idx_list, "train", args.output_folder) if args.days == 24: # Full Criteo dataset test_data = FeatureTable.read_parquet( os.path.join(args.input_folder, "day_23_test.parquet")) test_preprocessed = preprocess_and_save(test_data, idx_list, "test", args.output_folder) time_end = time() print("Total preprocessing time: ", time_end - time_start) train_preprocessed.show(5) if args.output_folder:
                          conf=conf)
    elif args.cluster_mode == "yarn":
        init_orca_context("yarn-client", cores=args.executor_cores,
                          num_nodes=args.num_executor, memory=args.executor_memory,
                          driver_cores=args.driver_cores, driver_memory=args.driver_memory,
                          conf=conf)

    time_start = time()
    paths = [os.path.join(args.input_folder, 'day_%d.parquet' % i) for i in args.day_range]
    tbl = FeatureTable.read_parquet(paths)
    # Rename all columns from the default _c0.._c39 to c0..c39.
    columns = dict([("_c{}".format(i), "c{}".format(i)) for i in range(40)])
    tbl = tbl.rename(columns)

    idx_list = tbl.gen_string_idx(CAT_COLS, freq_limit=args.frequency_limit)
    cat_sizes = [idx.size() for idx in idx_list]
    cross_sizes = args.cross_sizes

    tbl_all_data = tbl.encode_string(CAT_COLS, idx_list) \
        .fillna(0, INT_COLS + CAT_COLS) \
        .normalize(INT_COLS) \
        .cross_columns(crossed_columns=[CAT_COLS[0:2], CAT_COLS[2:4]],
                       bucket_sizes=cross_sizes)
    tbl_all_data.compute()
    time_end = time()