Example #1
    def test_clip(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
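        # clip with min=2 and max=None applies only a lower bound: the returned table is
        # expected to contain no values below 2, while the original feature_tbl is unchanged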
        clip_tbl = feature_tbl.clip(["col_1", "col_2", "col_3"], min=2, max=None)
        assert isinstance(clip_tbl, FeatureTable), "clip_tbl should be a FeatureTable"
        assert feature_tbl.df.filter("col_1 < 2").count() != 0 and feature_tbl \
            .df.filter("col_2 < 2").count() != 0, "feature_tbl should not be changed"
        assert clip_tbl.df.filter("col_1 < 2").count() == 0, "col_1 should >= 2"
        assert clip_tbl.df.filter("col_2 < 2").count() == 0, "col_2 should >= 2"
        assert clip_tbl.df.filter("col_3 < 2").count() == 0, "col_3 should >= 2"
        with self.assertRaises(Exception) as context:
            feature_tbl.clip(None, 2)
        self.assertTrue('columns should be str or list of str, but got None.'
                        in str(context.exception))

        feature_tbl = FeatureTable.read_parquet(file_path)
        clip_tbl = feature_tbl.clip(["col_1", "col_2", "col_3"], min=None, max=1)
        assert isinstance(clip_tbl, FeatureTable), "clip_tbl should be a FeatureTable"
        assert feature_tbl.df.filter("col_1 > 1").count() != 0 and feature_tbl \
            .df.filter("col_2 > 1").count() != 0, "feature_tbl should not be changed"
        assert clip_tbl.df.filter("col_1 > 1").count() == 0, "col_1 should <= 1"
        assert clip_tbl.df.filter("col_2 > 1").count() == 0, "col_2 should <= 1"
        assert clip_tbl.df.filter("col_3 > 1").count() == 0, "col_3 should <= 1"

        feature_tbl = FeatureTable.read_parquet(file_path)
        clip_tbl = feature_tbl.clip(["col_1", "col_2", "col_3"], min=0, max=1)
        assert isinstance(clip_tbl, FeatureTable), "clip_tbl should be a FeatureTable"
        assert feature_tbl.df.filter("col_1 > 1 or col_1 < 0").count() != 0 and feature_tbl \
            .df.filter("col_2 > 1 or col_2 < 0").count() != 0, "feature_tbl should not be changed"
        assert clip_tbl.df.filter("col_1 < 0").count() == 0, "col_1 should >= 0"
        assert clip_tbl.df.filter("col_2 > 1").count() == 0, "col_2 should <= 1"
        assert clip_tbl.df.filter("col_3 < 0 or col_3 > 1").count() == 0, "col_3 should >=0 " \
                                                                          "and <= 1"
Example #2
 def test_write_parquet(self):
     file_path = os.path.join(self.resource_path,
                              "friesian/feature/parquet/data1.parquet")
     feature_tbl = FeatureTable.read_parquet(file_path)
     feature_tbl.write_parquet("saved.parquet")
     loaded_tbl = FeatureTable.read_parquet("saved.parquet")
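     # the round-tripped table is expected to keep the same number of rows
     assert loaded_tbl.size() == feature_tbl.size(), \
         "loaded_tbl should have the same number of rows as feature_tbl"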
     if os.path.exists("saved.parquet"):
         shutil.rmtree("saved.parquet")
Example #3
 def test_merge(self):
     file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
     feature_tbl = FeatureTable.read_parquet(file_path)
     merged_tbl = feature_tbl.merge_cols(["col_1", "col_2", "col_3"], "int_cols")
     assert "col_1" not in merged_tbl.df.columns, "col_1 shouldn't be a column of merged_tbl"
     assert "int_cols" in merged_tbl.df.columns, "int_cols should be a column of merged_tbl"
     assert "col_1" in feature_tbl.df.columns, "col_1 should be a column of feature_tbl"
Example #4
    def test_dropna(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        dropped_tbl = feature_tbl.dropna(["col_1", "col_4"])
        assert isinstance(dropped_tbl, FeatureTable), "dropped_tbl should be a FeatureTable"
        assert feature_tbl.df.filter("col_1 is null").count() != 0 and feature_tbl\
            .df.filter("col_4 is null").count() != 0, "feature_tbl should not be changed"
        assert dropped_tbl.df.filter("col_1 is null").count() == 0, "col_1 null values should " \
                                                                    "be dropped"
        assert dropped_tbl.df.filter("col_4 is null").count() == 0, "col_4 null values should " \
                                                                    "be dropped"
        assert 0 < dropped_tbl.df.count() < feature_tbl.df.count(), "the number of rows should " \
                                                                    "be decreased"

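        # how="all" should drop a row only when every listed column is null;
        # thresh=2 should keep a row only if at least 2 of the listed columns are non-null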
        dropped_tbl = feature_tbl.dropna(["col_1", "col_4"], how="all")
        assert dropped_tbl.df.filter("col_1 is null and col_4 is null").count() == 0, \
            "col_1 and col_4 should not both have null values"
        dropped_tbl = feature_tbl.dropna(["col_2", "col_4"], how="all")
        assert dropped_tbl.df.filter("col_2 is null").count() > 0, \
            "col_2 should still have null values after dropna with how=all"

        dropped_tbl = feature_tbl.dropna(["col_2", "col_3", "col_5"], thresh=2)
        assert dropped_tbl.df.filter("col_2 is null").count() > 0, \
            "col_2 should still have null values after dropna with thresh=2"
        assert dropped_tbl.df.filter("col_3 is null and col_5 is null").count() == 0, \
            "col_3 and col_5 should not both have null values"
Example #5
 def test_gen_string_idx(self):
     file_path = os.path.join(self.resource_path,
                              "friesian/feature/parquet/data1.parquet")
     feature_tbl = FeatureTable.read_parquet(file_path)
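     # gen_string_idx builds a StringIndex (string category -> integer id) per column;
     # freq_limit="1" keeps categories that occur at least once, so nothing is filtered here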
     string_idx_list = feature_tbl.gen_string_idx(["col_4", "col_5"],
                                                  freq_limit="1")
     assert string_idx_list[0].count() == 3, "col_4 should have 3 indices"
     assert string_idx_list[1].count() == 2, "col_5 should have 2 indices"
     with tempfile.TemporaryDirectory() as local_path:
         for str_idx in string_idx_list:
             str_idx.write_parquet(local_path)
             str_idx_log = str_idx.log(["id"])
             assert str_idx.df.filter(
                 "id == 1").count() == 1, "str_idx should have exactly one row with id == 1"
             assert str_idx_log.df.filter("id == 1").count() == 0, \
                 "str_idx_log should have no rows with id == 1"
         assert os.path.isdir(local_path + "/col_4.parquet")
         assert os.path.isdir(local_path + "/col_5.parquet")
         new_col_4_idx = StringIndex.read_parquet(local_path +
                                                  "/col_4.parquet")
         assert "col_4" in new_col_4_idx.df.columns, "col_4 should be a column of new_col_4_idx"
         with self.assertRaises(Exception) as context:
             StringIndex.read_parquet(local_path + "/col_5.parquet",
                                      "col_4")
         self.assertTrue('col_4 should be a column of the DataFrame' in str(
             context.exception))
Example #6
    def test_group_by(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data2.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)

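        # agg can be a dict mapping column -> aggregate function(s); the aggregated
        # columns are named like sum(col_1) and count(col_1)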
        groupby_tbl1 = feature_tbl.group_by("col_4", agg={"col_1": ["sum", "count"]})
        assert groupby_tbl1.df.filter("col_4 == 'a' and `sum(col_1)` == 3").count() == 1, \
            "the sum of col_1 with col_4 = 'a' should be 3"
        assert groupby_tbl1.df.filter("col_4 == 'b' and `count(col_1)` == 5").count() == 1, \
            "the count of col_1 with col_4 = 'b' should be 5"

        groupby_tbl2 = feature_tbl.group_by(agg={"target": "avg", "col_2": "last"})
        assert groupby_tbl2.df.collect()[0]["avg(target)"] == 0.9, \
            "the mean of target should be 0.9"

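        # with join=True the aggregated values are joined back onto every original row,
        # adding one column per (aggregated column, agg function) pair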
        groupby_tbl3 = feature_tbl.group_by("col_5", agg=["max", "min"], join=True)
        assert len(groupby_tbl3.df.columns) == len(feature_tbl.df.columns) + 10, \
            "groupby_tbl3 should have (#columns - #group columns) * len(agg) = 10 more columns"
        assert groupby_tbl3.df.filter("col_5 == 'cc' and `max(col_2)` == 9").count() == \
            feature_tbl.df.filter("col_5 == 'cc'").count(), \
            "max of col_2 should be 9 for all col_5 = 'cc' in groupby_tbl3"
        assert groupby_tbl3.df.filter("col_5 == 'aa' and `min(col_3)` == 1.0").count() == \
            feature_tbl.df.filter("col_5 == 'aa'").count(), \
            "min of col_3 should be 1.0 for all col_5 = 'aa' in groupby_tbl3"

        groupby_tbl4 = feature_tbl.group_by(["col_4", "col_5"], agg="first", join=True)
        assert groupby_tbl4.df.filter("col_4 == 'b' and col_5 == 'dd' and `first(col_1)` == 0") \
            .count() == feature_tbl.df.filter("col_4 == 'b' and col_5 == 'dd'").count(), \
            "first of col_1 should be 0 for all col_4 = 'b' and col_5 = 'dd' in groupby_tbl4"
Example #7
 def test_filter(self):
     file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
     feature_tbl = FeatureTable.read_parquet(file_path)
     filtered_tbl = feature_tbl.filter(feature_tbl.col_1 == 1)
     assert filtered_tbl.size() == 3, "Only 3 out of 5 rows have value 1 for col_1"
     filtered_tbl2 = feature_tbl.filter(
         (feature_tbl.col("col_1") == 1) & (feature_tbl.col_2 == 1))
     assert filtered_tbl2.size() == 1, "Only 1 out of 5 rows has value 1 for col_1 and col_2"
Example #8
 def test_gen_string_idx_none(self):
     file_path = os.path.join(self.resource_path,
                              "friesian/feature/parquet/data1.parquet")
     feature_tbl = FeatureTable.read_parquet(file_path)
     string_idx_list = feature_tbl.gen_string_idx(["col_4", "col_5"],
                                                  freq_limit=None)
     assert string_idx_list[0].size() == 3, "col_4 should have 3 indices"
     assert string_idx_list[1].size() == 2, "col_5 should have 2 indices"
Example #9
 def test_columns(self):
     file_path = os.path.join(self.resource_path,
                              "friesian/feature/parquet/data1.parquet")
     feature_tbl = FeatureTable.read_parquet(file_path)
     col_names = feature_tbl.columns
     assert isinstance(col_names,
                       list), "col_names should be a list of strings"
     assert col_names == ["col_1", "col_2", "col_3", "col_4", "col_5"], \
         "column names are incorrect"
Example #10
 def test_rename(self):
     file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
     feature_tbl = FeatureTable.read_parquet(file_path)
     name_dict = {"col_1": "new_col1", "col_4": "new_col4"}
     rename_tbl = feature_tbl.rename(name_dict)
     cols = rename_tbl.df.columns
     assert isinstance(rename_tbl, FeatureTable), "rename_tbl should be a FeatureTable"
     assert "col_1" in feature_tbl.df.columns, "feature_tbl should not be changed"
     assert "new_col1" in cols, "new_col1 should be a column of the renamed tbl."
     assert "new_col4" in cols, "new_col4 should be a column of the renamed tbl."
Example #11
 def test_log(self):
     file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
     feature_tbl = FeatureTable.read_parquet(file_path)
     log_tbl = feature_tbl.log(["col_1", "col_2", "col_3"])
     assert isinstance(log_tbl, FeatureTable), "log_tbl should be a FeatureTable"
     assert feature_tbl.df.filter("col_1 == 1").count() != 0 and feature_tbl \
         .df.filter("col_2 == 1").count() != 0, "feature_tbl should not be changed"
     assert log_tbl.df.filter("col_1 == 1").count() == 0, "col_1 should != 1"
     assert log_tbl.df.filter("col_2 == 1").count() == 0, "col_2 should != 1"
     assert log_tbl.df.filter("col_3 == 1").count() == 0, "col_3 should != 1"
Example #12
 def test_gen_string_idx_dict(self):
     file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
     feature_tbl = FeatureTable.read_parquet(file_path)
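     # freq_limit can also be a per-column dict: categories whose frequency is below the
     # limit get no index, which is why col_5 ends up with a single index below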
     string_idx_list = feature_tbl.gen_string_idx(["col_4", "col_5"], freq_limit={"col_4": 1,
                                                                                  "col_5": 3})
     with self.assertRaises(Exception) as context:
         feature_tbl.gen_string_idx(["col_4", "col_5"], freq_limit="col_4:1,col_5:3")
     self.assertTrue('freq_limit only supports int, dict or None, but get str' in str(
         context.exception))
     assert string_idx_list[0].size() == 3, "col_4 should have 3 indices"
     assert string_idx_list[1].size() == 1, "col_5 should have 1 index"
Example #13
 def test_select(self):
     file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
     feature_tbl = FeatureTable.read_parquet(file_path)
     select_tbl = feature_tbl.select("col_1", "col_2")
     assert "col_1" in select_tbl.df.columns, "col_1 should be selected"
     assert "col_2" in select_tbl.df.columns, "col_2 should be selected"
     assert "col_3" not in select_tbl.df.columns, "col_3 should not be selected"
     assert feature_tbl.size() == select_tbl.size(), \
         "the selected table should have the same number of rows"
     with self.assertRaises(Exception) as context:
         feature_tbl.select()
     self.assertTrue("cols should be str or a list of str, but got None."
                     in str(context.exception))
Example #14
 def test_fillna_long(self):
     file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
     feature_tbl = FeatureTable.read_parquet(file_path)
     filled_tbl = feature_tbl.fillna(3, ["col_1", "col_2", "col_3"])
     assert isinstance(filled_tbl, FeatureTable), "filled_tbl should be a FeatureTable"
     assert feature_tbl.df.filter("col_2 is null").count() != 0 and feature_tbl \
         .df.filter("col_3 is null").count() != 0, "feature_tbl should not be changed"
     assert filled_tbl.df.filter("col_1 is null").count() == 0, "col_1 null values should be " \
                                                                "filled"
     assert filled_tbl.df.filter("col_2 is null").count() == 0, "col_2 null values should be " \
                                                                "filled"
     assert filled_tbl.df.filter("col_3 is null").count() == 0, "col_3 null values should be " \
                                                                "filled"
Example #15
    def test_fillna_string(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        with self.assertRaises(Exception) as context:
            feature_tbl.fillna(3.2, ["col_4", "col_5"])
        self.assertTrue('numeric does not match the type of column col_4'
                        in str(context.exception))

        filled_tbl = feature_tbl.fillna("bb", ["col_4", "col_5"])
        assert isinstance(filled_tbl, FeatureTable), "filled_tbl should be a FeatureTable"
        assert filled_tbl.df.filter("col_4 is null").count() == 0, "col_4 null values should be " \
                                                                   "filled"
        assert filled_tbl.df.filter("col_5 is null").count() == 0, "col_5 null values should be " \
                                                                   "filled"
Example #16
    def test_fill_median(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        with self.assertRaises(Exception) as context:
            feature_tbl.fill_median(["col_4", "col_5"])
        self.assertTrue('col_4 with data type StringType is not supported' in
                        str(context.exception))

        filled_tbl = feature_tbl.fill_median(["col_1", "col_2"])
        assert isinstance(filled_tbl, FeatureTable), "filled_tbl should be a FeatureTable"
        assert filled_tbl.df.filter("col_1 is null").count() == 0, "col_1 null values should be " \
                                                                   "filled"
        assert filled_tbl.df.filter("col_2 is null").count() == 0, "col_2 null values should be " \
                                                                   "filled"
Example #17
    def test_cross(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path).fillna(0, ["col_2", "col_3"])
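        # cross_columns combines each (col_2, col_3) pair into a single bucketed
        # "col_2_col_3" column whose values are bounded by the bucket size of 100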
        crossed_tbl = feature_tbl.cross_columns([["col_2", "col_3"]], [100])
        assert "col_2_col_3" in crossed_tbl.df.columns, "crossed column is not created"
        max_value = crossed_tbl.df.select("col_2_col_3") \
            .agg(max(col("col_2_col_3")).alias("max")) \
            .rdd.map(lambda row: row['max']).collect()[0]
        min_value = crossed_tbl.df.select("col_2_col_3") \
            .agg(min(col("col_2_col_3")).alias("min")) \
            .rdd.map(lambda row: row['min']).collect()[0]

        assert max_value <= 100, "cross value shouldn't be more than 100 after cross"
        assert min_value > 0, "cross value should be greater than 0 after cross"
Example #18
 def test_clip(self):
     file_path = os.path.join(self.resource_path,
                              "friesian/feature/parquet/data1.parquet")
     feature_tbl = FeatureTable.read_parquet(file_path)
     clip_tbl = feature_tbl.clip(["col_1", "col_2", "col_3"], 2)
     assert isinstance(clip_tbl,
                       FeatureTable), "clip_tbl should be a FeatureTable"
     assert feature_tbl.df.filter("col_1 < 2").count() != 0 and feature_tbl \
         .df.filter("col_2 < 2").count() != 0, "feature_tbl should not be changed"
     assert clip_tbl.df.filter(
         "col_1 < 2").count() == 0, "col_1 should >= 2"
     assert clip_tbl.df.filter(
         "col_2 < 2").count() == 0, "col_2 should >= 2"
     assert clip_tbl.df.filter(
         "col_3 < 2").count() == 0, "col_3 should >= 2"
Example #19
    def test_median(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path)
        with self.assertRaises(Exception) as context:
            feature_tbl.median(["col_4", "col_5"])
        self.assertTrue('col_4 with data type StringType is not supported' in
                        str(context.exception))

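        # median returns a small table with one row per requested column, holding the
        # column name and its median value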
        median_tbl = feature_tbl.median(["col_1", "col_2", "col_3"])
        assert isinstance(median_tbl, FeatureTable), "median_tbl should be a FeatureTable"
        assert median_tbl.df.count() == 3, "the number of rows of median_tbl should be equal to " \
                                           "the number of specified columns"
        assert median_tbl.df.filter("column == 'col_1'").count() == 1, "col_1 should exist in " \
                                                                       "'column' of median_tbl"
        assert median_tbl.df.filter("column == 'col_2'").filter("median == 1.0").count() == 1, \
            "the median of col_2 should be 1.0"
Example #20
    def test_norm(self):
        file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
        feature_tbl = FeatureTable.read_parquet(file_path).fillna(0, ["col_2", "col_3"])
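        # normalize is expected to rescale col_2 so that all values fall within [0, 1]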
        normalized_tbl = feature_tbl.normalize(["col_2"])
        max_value = normalized_tbl.df.select("col_2") \
            .agg(max(col("col_2")).alias("max")) \
            .rdd.map(lambda row: row['max']).collect()[0]
        min_value = normalized_tbl.df.select("col_2") \
            .agg(min(col("col_2")).alias("min")) \
            .rdd.map(lambda row: row['min']).collect()[0]

        assert max_value <= 1, "col_2 shouldn't be more than 1 after normalization"
        assert min_value >= 0, "col_2 shouldn't be less than 0 after normalization"

        tbl2 = FeatureTable(feature_tbl.df.withColumn("col2-col3", array(["col_2", "col_3"])))
        normalized_tbl2 = tbl2.normalize(["col_2", "col2-col3"])
        normalized_tbl2.compute()
Example #21
 def test_fillna_int(self):
     file_path = os.path.join(self.resource_path, "friesian/feature/parquet/data1.parquet")
     feature_tbl = FeatureTable.read_parquet(file_path)
     filled_tbl = feature_tbl.fillna(5, ["col_2", "col_3"])
     assert isinstance(filled_tbl, FeatureTable), "filled_tbl should be a FeatureTable"
     assert feature_tbl.df.filter("col_2 is null").count() != 0 and feature_tbl \
         .df.filter("col_3 is null").count() != 0, "feature_tbl should not be changed"
     assert filled_tbl.df.filter("col_2 == 5").count() == 1, "col_2 null values should be " \
                                                             "filled with 5"
     assert filled_tbl.df.filter("col_3 == 5").count() == 1, "col_3 null values should be " \
                                                             "filled with 5"
     filled_tbl = feature_tbl.fillna(5, None)
     assert filled_tbl.df.filter("col_2 == 5").count() == 1, "col_2 null values should be " \
                                                             "filled with 5"
     assert filled_tbl.df.filter("col_3 == 5").count() == 1, "col_3 null values should be " \
                                                             "filled with 5"
     with self.assertRaises(Exception) as context:
         feature_tbl.fillna(0, ["col_2", "col_3", "col_8"])
     self.assertTrue('do not exist in this Table' in str(context.exception))
Example #22
        "len_links", "hashtags", "present_links", "present_domains"
    ]
    cat_cols = [
        "engaged_with_user_is_verified", "enaging_user_is_verified",
        "present_media", "tweet_type", "language"
    ]
    ratio_cols = [
        "engaged_with_user_follower_following_ratio",
        "enaging_user_follower_following_ratio"
    ]
    embed_cols = [
        "enaging_user_id", "engaged_with_user_id", "hashtags", "present_links",
        "present_domains"
    ]
    useful_cols = num_cols + cat_cols + embed_cols
    train_tbl = FeatureTable.read_parquet(args.data_dir + "/train_parquet")
    test_tbl = FeatureTable.read_parquet(args.data_dir + "/test_parquet")
    full_tbl = train_tbl.concat(test_tbl, "outer")
    reindex_tbls = full_tbl.gen_reindex_mapping(
        embed_cols, freq_limit=args.frequency_limit)
    train_tbl, test_tbl, user_info, item_info = prepare_features(
        train_tbl, test_tbl, reindex_tbls)

    output_dir = args.data_dir + "/embed_reindex"
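    # persist each column's reindex mapping to its own parquet output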
    for i, c in enumerate(embed_cols):
        reindex_tbls[i].write_parquet(output_dir + "_" + c)

    train_config = {
        "lr": 1e-3,
        "user_col_info": user_info,
        "item_col_info": item_info,
Example #23
if __name__ == '__main__':
    args = _parse_args()
    if args.cluster_mode == "local":
        init_orca_context("local", cores=args.executor_cores, memory=args.executor_memory)
    elif args.cluster_mode == "standalone":
        init_orca_context("standalone", master=args.master,
                          cores=args.executor_cores, num_nodes=args.num_executor,
                          memory=args.executor_memory,
                          driver_cores=args.driver_cores,
                          driver_memory=args.driver_memory, conf=conf)
    elif args.cluster_mode == "yarn":
        init_orca_context("yarn-client", cores=args.executor_cores,
                          num_nodes=args.num_executor, memory=args.executor_memory,
                          driver_cores=args.driver_cores, driver_memory=args.driver_memory,
                          conf=conf)
    time_start = time()
    paths = [os.path.join(args.input_folder, 'day_%d.parquet' % i) for i in args.day_range]
    tbl = FeatureTable.read_parquet(paths)
    idx_list = tbl.gen_string_idx(CAT_COLS, freq_limit=args.frequency_limit)
    tbl_all_data = FeatureTable.read_parquet(paths[:-1])
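    # encode the categorical columns with the generated string indices, fill missing
    # values with 0, and log-transform the integer columns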
    tbl_all_data = tbl_all_data.encode_string(CAT_COLS, idx_list)\
        .fillna(0, INT_COLS + CAT_COLS).log(INT_COLS)
    tbl_all_data = tbl_all_data.merge_cols(INT_COLS, "X_int").merge_cols(CAT_COLS, "X_cat")
    tbl_all_data.compute()
    time_end = time()
    print("Train data loading and preprocessing time: ", time_end - time_start)
    tbl_all_data.show(5)
    print("Finished")
    stop_orca_context()
    if args.cluster_mode == "local":
        init_orca_context("local", cores=args.cores, memory=args.memory)
    elif args.cluster_mode == "standalone":
        init_orca_context("standalone", master=args.master, cores=args.cores,
                          num_nodes=args.num_nodes, memory=args.memory,
                          driver_cores=args.driver_cores, driver_memory=args.driver_memory,
                          conf=conf)
    elif args.cluster_mode == "yarn":
        init_orca_context("yarn-client", cores=args.cores,
                          num_nodes=args.num_nodes, memory=args.memory,
                          driver_cores=args.driver_cores, driver_memory=args.driver_memory,
                          conf=conf)
    time_start = time()

    paths = [os.path.join(args.input_folder, "day_%d.parquet" % i) for i in args.day_range]
    tbl = FeatureTable.read_parquet(paths)
    idx_list = tbl.gen_string_idx(CAT_COLS, freq_limit=args.frequency_limit)

    train_data = FeatureTable.read_parquet(paths[:-1])
    train_preprocessed = preprocess_and_save(train_data, idx_list, "train", args.output_folder)

    if args.days == 24:  # Full Criteo dataset
        test_data = FeatureTable.read_parquet(
            os.path.join(args.input_folder, "day_23_test.parquet"))
        test_preprocessed = preprocess_and_save(test_data, idx_list, "test", args.output_folder)

    time_end = time()
    print("Total preprocessing time: ", time_end - time_start)
    train_preprocessed.show(5)

    if args.output_folder:
    elif args.cluster_mode == "yarn":
        init_orca_context("yarn-client",
                          cores=args.executor_cores,
                          num_nodes=args.num_executor,
                          memory=args.executor_memory,
                          driver_cores=args.driver_cores,
                          driver_memory=args.driver_memory,
                          conf=conf)

    time_start = time()
    paths = [
        os.path.join(args.input_folder, 'day_%d.parquet' % i)
        for i in args.day_range
    ]
    tbl = FeatureTable.read_parquet(paths)
    # change name for all columns
    columns = dict([("_c{}".format(i), "c{}".format(i)) for i in range(40)])
    tbl = tbl.rename(columns)
    idx_list = tbl.gen_string_idx(CAT_COLS, freq_limit=args.frequency_limit)
    cat_sizes = [idx.size() for idx in idx_list]

    cross_sizes = args.cross_sizes

    tbl_all_data = tbl.encode_string(CAT_COLS, idx_list)\
        .fillna(0, INT_COLS + CAT_COLS)\
        .normalize(INT_COLS)\
        .cross_columns(crossed_columns=[CAT_COLS[0:2], CAT_COLS[2:4]],
                       bucket_sizes=cross_sizes)
    tbl_all_data.compute()
    time_end = time()