def test_purge(self):
    key = "patient_id"
    fset = fs.FeatureSet("purge", entities=[Entity(key)], timestamp_key="timestamp")
    path = os.path.relpath(str(self.assets_path / "testdata.csv"))
    source = CSVSource(
        "mycsv",
        path=path,
        time_field="timestamp",
    )
    targets = [
        CSVTarget(),
        CSVTarget(name="specified-path", path="v3io:///bigdata/csv-purge-test.csv"),
        ParquetTarget(partitioned=True, partition_cols=["timestamp"]),
        NoSqlTarget(),
    ]
    fset.set_targets(
        targets=targets,
        with_defaults=False,
    )

    # ingest and purge all targets
    fs.ingest(fset, source)
    verify_purge(fset, targets)

    # ingest again and purge all targets except the last one (NoSqlTarget)
    fs.ingest(fset, source)
    targets_to_purge = targets[:-1]
    verify_purge(fset, targets_to_purge)
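# `verify_purge` is a helper defined elsewhere in this test module. A minimal
# sketch of the behaviour it is assumed to have (illustrative only, not the
# module's actual implementation): confirm the targets were written, purge the
# requested targets, and confirm they are gone from the feature set status.
#
#     def verify_purge(fset, targets):
#         # every requested target should exist before the purge
#         for target in targets:
#             assert fset.get_target_path(name=target.name)
#         # purge only the requested targets, then reload the status
#         fset.purge_targets(target_names=[t.name for t in targets])
#         fset.reload(update_spec=False)
#         for target in targets:
#             assert target.name not in fset.status.targets.keys()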
def test_ordered_pandas_asof_merge(self):
    targets = [ParquetTarget(), NoSqlTarget()]
    left_set, left = prepare_feature_set(
        "left", "ticker", trades, timestamp_key="time", targets=targets
    )
    right_set, right = prepare_feature_set(
        "right", "ticker", quotes, timestamp_key="time", targets=targets
    )

    features = ["left.*", "right.*"]
    feature_vector = fs.FeatureVector("test_fv", features, description="test FV")
    res = fs.get_offline_features(feature_vector, entity_timestamp_column="time")
    res = res.to_dataframe()
    assert res.shape[0] == left.shape[0]
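# `prepare_feature_set` is a helper defined elsewhere in this test module. A
# minimal sketch of what it is assumed to do (illustrative only): build a
# feature set for the given name/entity, ingest the dataframe into the given
# targets, and return both the feature set and the ingested dataframe.
#
#     def prepare_feature_set(name, entity, data, timestamp_key=None, targets=None):
#         fset = fs.FeatureSet(
#             name, entities=[fs.Entity(entity)], timestamp_key=timestamp_key
#         )
#         fset.set_targets(targets)
#         result_df = fs.ingest(fset, data, targets=targets)
#         return fset, result_df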
def test_overwrite(self):
    df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
    df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

    fset = fs.FeatureSet(name="overwrite-fs", entities=[fs.Entity("name")])
    fs.ingest(fset, df1, targets=[CSVTarget(), ParquetTarget(), NoSqlTarget()])

    features = ["overwrite-fs.*"]
    fvec = fs.FeatureVector("overwrite-vec", features=features)

    csv_path = fset.get_target_path(name="csv")
    csv_df = pd.read_csv(csv_path)
    assert (
        df1.set_index(keys="name")
        .sort_index()
        .equals(csv_df.set_index(keys="name").sort_index())
    )

    parquet_path = fset.get_target_path(name="parquet")
    parquet_df = pd.read_parquet(parquet_path)
    assert df1.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

    svc = fs.get_online_feature_service(fvec)
    resp = svc.get(entity_rows=[{"name": "GHI"}])
    assert resp[0]["value"] == 3
    svc.close()

    # ingest df2 with the default targets only, overwriting the previous data
    fs.ingest(fset, df2)

    # the CSV target still holds df1
    csv_path = fset.get_target_path(name="csv")
    csv_df = pd.read_csv(csv_path)
    assert (
        df1.set_index(keys="name")
        .sort_index()
        .equals(csv_df.set_index(keys="name").sort_index())
    )

    # the parquet and online targets now hold df2
    parquet_path = fset.get_target_path(name="parquet")
    parquet_df = pd.read_parquet(parquet_path)
    assert df2.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

    svc = fs.get_online_feature_service(fvec)
    resp = svc.get(entity_rows=[{"name": "GHI"}])
    assert resp[0] is None
    resp = svc.get(entity_rows=[{"name": "PQR"}])
    assert resp[0]["value"] == 6
    svc.close()
def test_overwrite_specified_nosql_path(self):
    df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
    df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

    targets = [NoSqlTarget(path="v3io:///bigdata/overwrite-spec")]

    fset = fs.FeatureSet(name="overwrite-spec-path", entities=[fs.Entity("name")])
    features = ["overwrite-spec-path.*"]
    fvec = fs.FeatureVector("overwrite-spec-path-fvec", features=features)

    fs.ingest(fset, df1, targets=targets)
    fs.ingest(fset, df2, targets=targets)

    # after the overwrite, only df2 keys should be available online
    svc = fs.get_online_feature_service(fvec)
    resp = svc.get(entity_rows=[{"name": "PQR"}])
    assert resp[0]["value"] == 6
    resp = svc.get(entity_rows=[{"name": "ABC"}])
    assert resp[0] is None
    svc.close()
def test_schedule_on_filtered_by_time(self, partitioned):
    name = f"sched-time-{str(partitioned)}"
    now = datetime.now()
    path = "v3io:///bigdata/bla.parquet"
    fsys = fsspec.filesystem(v3iofs.fs.V3ioFS.protocol)
    pd.DataFrame(
        {
            "time": [
                pd.Timestamp("2021-01-10 10:00:00"),
                pd.Timestamp("2021-01-10 11:00:00"),
            ],
            "first_name": ["moshe", "yosi"],
            "data": [2000, 10],
        }
    ).to_parquet(path=path, filesystem=fsys)

    cron_trigger = "*/2 * * * *"
    source = ParquetSource(
        "myparquet", path=path, time_field="time", schedule=cron_trigger
    )

    feature_set = fs.FeatureSet(
        name=name,
        entities=[fs.Entity("first_name")],
        timestamp_key="time",
        engine="spark",
    )

    if partitioned:
        targets = [
            NoSqlTarget(),
            ParquetTarget(
                name="tar1",
                path="v3io:///bigdata/fs1/",
                partitioned=True,
                partition_cols=["time"],
            ),
        ]
    else:
        targets = [
            ParquetTarget(
                name="tar2", path="v3io:///bigdata/fs2/", partitioned=False
            ),
            NoSqlTarget(),
        ]

    fs.ingest(
        feature_set,
        source,
        run_config=fs.RunConfig(local=False),
        targets=targets,
        spark_context=self.spark_service,
    )

    # ingest starts every second minute and takes ~90 seconds to finish;
    # wait for the first scheduled run to complete
    if (now.minute % 2) == 0:
        sleep(60 - now.second + 60 + 90)
    else:
        sleep(60 - now.second + 90)

    features = [f"{name}.*"]
    vec = fs.FeatureVector("sched_test-vec", features)

    svc = fs.get_online_feature_service(vec)

    resp = svc.get([{"first_name": "yosi"}, {"first_name": "moshe"}])
    assert resp[0]["data"] == 10
    assert resp[1]["data"] == 2000

    # overwrite the source parquet with new records (same v3io filesystem)
    pd.DataFrame(
        {
            "time": [
                pd.Timestamp("2021-01-10 12:00:00"),
                pd.Timestamp("2021-01-10 13:00:00"),
                now + pd.Timedelta(minutes=10),
                pd.Timestamp("2021-01-09 13:00:00"),
            ],
            "first_name": ["moshe", "dina", "katya", "uri"],
            "data": [50, 10, 25, 30],
        }
    ).to_parquet(path=path, filesystem=fsys)

    sleep(120)
    resp = svc.get(
        [
            {"first_name": "yosi"},
            {"first_name": "moshe"},
            {"first_name": "katya"},
            {"first_name": "dina"},
            {"first_name": "uri"},
        ]
    )
    assert resp[0]["data"] == 10
    assert resp[1]["data"] == 50
    # katya and uri fall outside the ingestion time window, so they were filtered out
    assert resp[2] is None
    assert resp[3]["data"] == 10
    assert resp[4] is None

    svc.close()

    # check offline
    resp = fs.get_offline_features(vec)
    offline_df = resp.to_dataframe()
    assert len(offline_df) == 4
    assert "uri" not in offline_df.values and "katya" not in offline_df.values