def test_read_csv(self): file_path = os.path.join(self.resource_path, "friesian/feature/data.csv") feature_tbl = FeatureTable.read_csv(file_path, header=True) assert feature_tbl.size() == 4 columns = feature_tbl.columns assert columns == ["col1", "col2", "col3"] records = feature_tbl.df.collect() assert isinstance(records[0][0], float) assert isinstance(records[0][1], str) and isinstance( records[0][1], str) file_path2 = os.path.join(self.resource_path, "friesian/feature/data_no_header.csv") feature_tbl2 = FeatureTable.read_csv(file_path2, names=["col1", "_col2", "col3"], dtype={"col1": "int"}) assert feature_tbl2.size() == 4 columns2 = feature_tbl2.columns assert columns2 == ["col1", "_col2", "col3"] records2 = feature_tbl2.df.collect() assert isinstance(records2[0][0], int) assert isinstance(records2[0][1], str) and isinstance( records2[0][1], str) feature_tbl3 = FeatureTable.read_csv(file_path, header=True, dtype=["int", "str", "str"]) records3 = feature_tbl3.df.collect() assert isinstance(records3[0][0], int) assert isinstance(records3[0][1], str) and isinstance( records3[0][1], str)
def test_split(self): file_path = os.path.join(self.resource_path, "orca/learn/ncf.csv") feature_tbl = FeatureTable.read_csv(file_path, header=True, dtype="int") tbl1, tbl2 = feature_tbl.split([0.8, 0.2], seed=1128) total_size = feature_tbl.size() size1 = tbl1.size() size2 = tbl2.size() assert size1 + size2 == total_size
def test_append_column(self): file_path = os.path.join(self.resource_path, "friesian/feature/") df = FeatureTable.read_csv(file_path + "data.csv", header=True) df = df.append_column("z", 0) assert df.select("z").size() == 4 assert df.filter("z == 0").size() == 4 df = df.append_column("str", "a") assert df.select("str").size() == 4 assert df.filter("str == 'a'").size() == 4 df = df.append_column("float", 1.2) assert df.select("float").size() == 4 assert df.filter("float == 1.2").size() == 4
def test_category_encode_and_one_hot_encode(self): file_path = os.path.join(self.resource_path, "friesian/feature/data.csv") feature_tbl = FeatureTable.read_csv(file_path, header=True) feature_tbl, indices = feature_tbl.category_encode( columns=["col2", "col3"]) assert isinstance(indices, list) and len(indices) == 2 assert isinstance(indices[0], StringIndex) and isinstance( indices[1], StringIndex) assert indices[0].size() == 3 and indices[1].size() == 4 dict1 = indices[0].to_dict() dict2 = indices[1].to_dict() records = feature_tbl.df.collect() assert records[0][1] == dict1["x"] and records[0][2] == dict2["abc"] assert records[3][1] == dict1["z"] and records[2][2] == dict2["aaa"] feature_tbl = feature_tbl.one_hot_encode(columns=["col2", "col3"], prefix=["o1", "o2"]) feature_tbl.show() columns = feature_tbl.columns assert columns == [ "col1", "o1_0", "o1_1", "o1_2", "o1_3", "o2_0", "o2_1", "o2_2", "o2_3", "o2_4" ] records = feature_tbl.df.collect() record = records[0] value1 = dict1["x"] value2 = dict2["abc"] for i in range(1, 4): if i == value1: assert record[i + 1] == 1 else: assert record[i + 1] == 0 for i in range(1, 5): if i == value2: assert record[i + 5] == 1 else: assert record[i + 5] == 0