def test_check_permissions():
    data = pd.DataFrame(
        {
            "time_stamp": [
                pd.Timestamp("2021-06-09 09:30:06.008"),
                pd.Timestamp("2021-06-09 10:29:07.009"),
                pd.Timestamp("2021-06-09 09:29:08.010"),
            ],
            "data": [10, 20, 30],
            "string": ["ab", "cd", "ef"],
        }
    )
    data_set1 = fs.FeatureSet("fs1", entities=[Entity("string")])

    # make every authorization check fail, so each API call below must be denied
    mlrun.db.FileRunDB.verify_authorization = unittest.mock.Mock(
        side_effect=mlrun.errors.MLRunAccessDeniedError("")
    )

    with pytest.raises(mlrun.errors.MLRunAccessDeniedError):
        fs.preview(
            data_set1,
            data,
            entity_columns=[Entity("string")],
            timestamp_key="time_stamp",
        )

    with pytest.raises(mlrun.errors.MLRunAccessDeniedError):
        fs.ingest(data_set1, data, infer_options=fs.InferOptions.default())

    features = ["fs1.*"]
    feature_vector = fs.FeatureVector("test", features)

    with pytest.raises(mlrun.errors.MLRunAccessDeniedError):
        fs.get_offline_features(feature_vector, entity_timestamp_column="time_stamp")

    with pytest.raises(mlrun.errors.MLRunAccessDeniedError):
        fs.get_online_feature_service(feature_vector)

    with pytest.raises(mlrun.errors.MLRunAccessDeniedError):
        fs.deploy_ingestion_service(featureset=data_set1)

    with pytest.raises(mlrun.errors.MLRunAccessDeniedError):
        data_set1.purge_targets()
def test_overwrite(self):
    df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
    df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

    fset = fs.FeatureSet(name="overwrite-fs", entities=[fs.Entity("name")])
    fs.ingest(fset, df1, targets=[CSVTarget(), ParquetTarget(), NoSqlTarget()])

    features = ["overwrite-fs.*"]
    fvec = fs.FeatureVector("overwrite-vec", features=features)

    csv_path = fset.get_target_path(name="csv")
    csv_df = pd.read_csv(csv_path)
    assert (
        df1.set_index(keys="name")
        .sort_index()
        .equals(csv_df.set_index(keys="name").sort_index())
    )

    parquet_path = fset.get_target_path(name="parquet")
    parquet_df = pd.read_parquet(parquet_path)
    assert df1.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

    svc = fs.get_online_feature_service(fvec)
    resp = svc.get(entity_rows=[{"name": "GHI"}])
    assert resp[0]["value"] == 3
    svc.close()

    # second ingest (no explicit targets): parquet and NoSQL now hold only df2,
    # while the CSV target still holds the data from the first ingest
    fs.ingest(fset, df2)

    csv_path = fset.get_target_path(name="csv")
    csv_df = pd.read_csv(csv_path)
    assert (
        df1.set_index(keys="name")
        .sort_index()
        .equals(csv_df.set_index(keys="name").sort_index())
    )

    parquet_path = fset.get_target_path(name="parquet")
    parquet_df = pd.read_parquet(parquet_path)
    assert df2.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

    svc = fs.get_online_feature_service(fvec)
    resp = svc.get(entity_rows=[{"name": "GHI"}])
    assert resp[0] is None
    resp = svc.get(entity_rows=[{"name": "PQR"}])
    assert resp[0]["value"] == 6
    svc.close()
def test_unaggregated_columns(self):
    test_base_time = datetime(2020, 12, 1, 17, 33, 15)

    data = pd.DataFrame(
        {
            "time": [test_base_time, test_base_time - pd.Timedelta(minutes=1)],
            "first_name": ["moshe", "yosi"],
            "last_name": ["cohen", "levi"],
            "bid": [2000, 10],
        }
    )

    name = f"measurements_{uuid.uuid4()}"

    # write to kv
    data_set = fs.FeatureSet(name, entities=[Entity("first_name")])
    data_set.add_aggregation(
        name="bids",
        column="bid",
        operations=["sum", "max"],
        windows="1h",
        period="10m",
    )
    fs.ingest(data_set, data, return_df=True)

    features = [f"{name}.bids_sum_1h", f"{name}.last_name"]
    vector = fs.FeatureVector("my-vec", features)
    svc = fs.get_online_feature_service(vector)

    resp = svc.get([{"first_name": "moshe"}])
    expected = {"bids_sum_1h": 2000.0, "last_name": "cohen"}
    assert resp[0] == expected
    svc.close()
def test_override_false(self):
    df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
    df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})
    df3 = pd.concat([df1, df2])

    fset = fs.FeatureSet(name="override-false", entities=[fs.Entity("name")])
    fs.ingest(fset, df1)

    features = ["override-false.*"]
    fvec = fs.FeatureVector("override-false-vec", features=features)

    off1 = fs.get_offline_features(fvec).to_dataframe()
    assert df1.set_index(keys="name").sort_index().equals(off1.sort_index())

    fs.ingest(fset, df2, overwrite=False)

    off2 = fs.get_offline_features(fvec).to_dataframe()
    assert df3.set_index(keys="name").sort_index().equals(off2.sort_index())

    fs.ingest(fset, df1, targets=[ParquetTarget()])

    off1 = fs.get_offline_features(fvec).to_dataframe()
    assert df1.set_index(keys="name").sort_index().equals(off1.sort_index())

    svc = fs.get_online_feature_service(fvec)
    resp = svc.get(entity_rows=[{"name": "PQR"}])
    assert resp[0]["value"] == 6
    svc.close()

    # ingesting with overwrite=False into a CSV target is expected to fail
    with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
        fs.ingest(fset, df1, targets=[CSVTarget()], overwrite=False)

    fset.set_targets(targets=[CSVTarget()])
    with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
        fs.ingest(fset, df1, overwrite=False)
def test_multiple_entities(self):
    name = f"measurements_{uuid.uuid4()}"
    current_time = pd.Timestamp.now()
    data = pd.DataFrame(
        {
            "time": [
                current_time,
                current_time - pd.Timedelta(minutes=1),
                current_time - pd.Timedelta(minutes=2),
                current_time - pd.Timedelta(minutes=3),
                current_time - pd.Timedelta(minutes=4),
                current_time - pd.Timedelta(minutes=5),
            ],
            "first_name": ["moshe", "yosi", "yosi", "yosi", "moshe", "yosi"],
            "last_name": ["cohen", "levi", "levi", "levi", "cohen", "levi"],
            "bid": [2000, 10, 11, 12, 2500, 14],
        }
    )

    # write to kv
    data_set = fs.FeatureSet(
        name, entities=[Entity("first_name"), Entity("last_name")]
    )
    data_set.add_aggregation(
        name="bids",
        column="bid",
        operations=["sum", "max"],
        windows="1h",
        period="10m",
        emit_policy=EmitAfterMaxEvent(1),
    )
    fs.infer_metadata(
        data_set,
        data,  # source
        entity_columns=["first_name", "last_name"],
        timestamp_key="time",
        options=fs.InferOptions.default(),
    )

    data_set.plot(
        str(self.results_path / "pipe.png"), rankdir="LR", with_targets=True
    )
    fs.ingest(data_set, data, return_df=True)

    features = [
        f"{name}.bids_sum_1h",
    ]

    vector = fs.FeatureVector("my-vec", features)
    svc = fs.get_online_feature_service(vector)

    resp = svc.get([{"first_name": "yosi", "last_name": "levi"}])
    assert resp[0]["bids_sum_1h"] == 47.0

    svc.close()
def _get_online_features(self, features, features_size):
    # test real-time query
    vector = fs.FeatureVector("my-vec", features)
    svc = fs.get_online_feature_service(vector)

    resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
    resp = svc.get([{"ticker": "AAPL"}])
    assert (
        resp[0]["name"] == "Apple Inc" and resp[0]["exchange"] == "NASDAQ"
    ), "unexpected online result"

    resp2 = svc.get([{"ticker": "AAPL"}], as_list=True)
    assert len(resp2[0]) == features_size - 1, "unexpected online vector size"  # -1 label

    svc.close()
def test_none_value(self):
    data = pd.DataFrame(
        {"first_name": ["moshe", "yossi"], "bid": [2000, 10], "bool": [True, None]}
    )

    # write to kv
    data_set = fs.FeatureSet("tests2", entities=[Entity("first_name")])
    fs.ingest(data_set, data, return_df=True)

    features = ["tests2.*"]
    vector = fs.FeatureVector("my-vec", features)
    svc = fs.get_online_feature_service(vector)

    resp = svc.get([{"first_name": "yossi"}])
    assert resp[0] == {"bid": 10, "bool": None}
    svc.close()
def test_realtime_query():
    init_store()

    features = [
        "stock-quotes.bid",
        "stock-quotes.asks_sum_5h",
        "stock-quotes.ask as mycol",
        "stocks.*",
    ]
    features_size = (
        len(features) + 1 + 1
    )  # (*) returns 2 features, label adds 1 feature

    resp = fs.get_offline_features(
        features,
        entity_rows=trades,
        entity_timestamp_column="time",
        label_feature="stock-quotes.xx",
    )
    vector = resp.vector
    assert len(vector.spec.features) == len(
        features
    ), "unexpected num of requested features"
    assert (
        len(vector.status.features) == features_size
    ), "unexpected num of returned features"
    assert len(vector.status.stats) == features_size, "unexpected num of feature stats"
    assert vector.status.label_column == "xx", "unexpected label_column name"

    df = resp.to_dataframe()
    columns = trades.shape[1] + features_size - 2  # - 2 keys
    assert df.shape[1] == columns, "unexpected num of returned df columns"
    resp.to_parquet(results_dir + "query.parquet")

    # test real-time query
    vector = fs.FeatureVector("my-vec", features)
    svc = fs.get_online_feature_service(vector)

    resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
    resp = svc.get([{"ticker": "AAPL"}])
    assert (
        resp[0]["name"] == "Apple Inc" and resp[0]["exchange"] == "NASDAQ"
    ), "unexpected online result"

    resp2 = svc.get([{"ticker": "AAPL"}], as_list=True)
    assert len(resp2[0]) == features_size - 1, "unexpected online vector size"  # -1 label

    svc.close()
def test_overwrite_specified_nosql_path(self):
    df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
    df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

    targets = [NoSqlTarget(path="v3io:///bigdata/overwrite-spec")]

    fset = fs.FeatureSet(name="overwrite-spec-path", entities=[fs.Entity("name")])
    features = ["overwrite-spec-path.*"]
    fvec = fs.FeatureVector("overwrite-spec-path-fvec", features=features)

    fs.ingest(fset, df1, targets=targets)
    fs.ingest(fset, df2, targets=targets)

    svc = fs.get_online_feature_service(fvec)
    resp = svc.get(entity_rows=[{"name": "PQR"}])
    assert resp[0]["value"] == 6
    resp = svc.get(entity_rows=[{"name": "ABC"}])
    assert resp[0] is None
    svc.close()
def test_realtime_query():
    init_store()

    features = [
        "stock-quotes.bid",
        "stock-quotes.asks_sum_5h",
        "stock-quotes.ask as mycol",
        "stocks.*",
    ]

    resp = fs.get_offline_features(
        features, entity_rows=trades, entity_timestamp_column="time"
    )
    vector = resp.vector
    assert len(vector.spec.features) == len(
        features
    ), "unexpected num of requested features"
    # stocks (*) returns 2 features
    assert (
        len(vector.status.features) == len(features) + 1
    ), "unexpected num of returned features"
    assert (
        len(vector.status.stats) == len(features) + 1
    ), "unexpected num of feature stats"

    df = resp.to_dataframe()
    columns = trades.shape[1] + len(features) + 1
    assert df.shape[1] == columns, "unexpected num of returned df columns"
    resp.to_parquet(results_dir + "query.parquet")

    # test real-time query
    vector = fs.FeatureVector("my-vec", features)
    svc = fs.get_online_feature_service(vector)

    resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
    print(resp)
    resp = svc.get([{"ticker": "AAPL"}])
    assert (
        resp[0]["ticker"] == "AAPL" and resp[0]["exchange"] == "NASDAQ"
    ), "unexpected online result"
    svc.close()
def test_schedule_on_filtered_by_time(self, partitioned):
    name = f"sched-time-{str(partitioned)}"
    now = datetime.now()
    path = "v3io:///bigdata/bla.parquet"
    fsys = fsspec.filesystem(v3iofs.fs.V3ioFS.protocol)
    pd.DataFrame(
        {
            "time": [
                pd.Timestamp("2021-01-10 10:00:00"),
                pd.Timestamp("2021-01-10 11:00:00"),
            ],
            "first_name": ["moshe", "yosi"],
            "data": [2000, 10],
        }
    ).to_parquet(path=path, filesystem=fsys)

    cron_trigger = "*/2 * * * *"

    source = ParquetSource(
        "myparquet", path=path, time_field="time", schedule=cron_trigger
    )

    feature_set = fs.FeatureSet(
        name=name,
        entities=[fs.Entity("first_name")],
        timestamp_key="time",
        engine="spark",
    )

    if partitioned:
        targets = [
            NoSqlTarget(),
            ParquetTarget(
                name="tar1",
                path="v3io:///bigdata/fs1/",
                partitioned=True,
                partition_cols=["time"],
            ),
        ]
    else:
        targets = [
            ParquetTarget(
                name="tar2", path="v3io:///bigdata/fs2/", partitioned=False
            ),
            NoSqlTarget(),
        ]

    fs.ingest(
        feature_set,
        source,
        run_config=fs.RunConfig(local=False),
        targets=targets,
        spark_context=self.spark_service,
    )

    # ingest starts every second minute, and it takes ~90 seconds to finish
    if now.minute % 2 == 0:
        sleep(60 - now.second + 60 + 90)
    else:
        sleep(60 - now.second + 90)

    features = [f"{name}.*"]
    vec = fs.FeatureVector("sched_test-vec", features)

    svc = fs.get_online_feature_service(vec)

    resp = svc.get([{"first_name": "yosi"}, {"first_name": "moshe"}])
    assert resp[0]["data"] == 10
    assert resp[1]["data"] == 2000

    pd.DataFrame(
        {
            "time": [
                pd.Timestamp("2021-01-10 12:00:00"),
                pd.Timestamp("2021-01-10 13:00:00"),
                now + pd.Timedelta(minutes=10),  # future timestamp - expected to be filtered out
                pd.Timestamp("2021-01-09 13:00:00"),  # predates already-ingested data - expected to be filtered out
            ],
            "first_name": ["moshe", "dina", "katya", "uri"],
            "data": [50, 10, 25, 30],
        }
    ).to_parquet(path=path)

    sleep(120)
    resp = svc.get(
        [
            {"first_name": "yosi"},
            {"first_name": "moshe"},
            {"first_name": "katya"},
            {"first_name": "dina"},
            {"first_name": "uri"},
        ]
    )
    assert resp[0]["data"] == 10
    assert resp[1]["data"] == 50
    assert resp[2] is None
    assert resp[3]["data"] == 10
    assert resp[4] is None

    svc.close()

    # check offline
    resp = fs.get_offline_features(vec)
    offline_df = resp.to_dataframe()
    assert len(offline_df) == 4
    assert "uri" not in offline_df and "katya" not in offline_df