def test_parquet_target_vector_overwrite(self):
    df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
    fset = fs.FeatureSet(name="fvec-parquet-fset", entities=[fs.Entity("name")])
    fs.ingest(fset, df1)

    features = ["fvec-parquet-fset.*"]
    fvec = fs.FeatureVector("fvec-parquet", features=features)
    target = ParquetTarget()

    off1 = fs.get_offline_features(fvec, target=target)
    dfout1 = pd.read_parquet(target._target_path)
    assert (
        df1.set_index(keys="name")
        .sort_index()
        .equals(off1.to_dataframe().sort_index())
    )
    assert df1.set_index(keys="name").sort_index().equals(dfout1.sort_index())

    df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})
    fs.ingest(fset, df2)
    off2 = fs.get_offline_features(fvec, target=target)
    dfout2 = pd.read_parquet(target._target_path)
    assert (
        df2.set_index(keys="name")
        .sort_index()
        .equals(off2.to_dataframe().sort_index())
    )
    assert df2.set_index(keys="name").sort_index().equals(dfout2.sort_index())
def test_override_false(self):
    df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
    df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})
    df3 = pd.concat([df1, df2])

    fset = fs.FeatureSet(name="override-false", entities=[fs.Entity("name")])
    fs.ingest(fset, df1)

    features = ["override-false.*"]
    fvec = fs.FeatureVector("override-false-vec", features=features)

    off1 = fs.get_offline_features(fvec).to_dataframe()
    assert df1.set_index(keys="name").sort_index().equals(off1.sort_index())

    fs.ingest(fset, df2, overwrite=False)
    off2 = fs.get_offline_features(fvec).to_dataframe()
    assert df3.set_index(keys="name").sort_index().equals(off2.sort_index())

    fs.ingest(fset, df1, targets=[ParquetTarget()])
    off1 = fs.get_offline_features(fvec).to_dataframe()
    assert df1.set_index(keys="name").sort_index().equals(off1.sort_index())

    svc = fs.get_online_feature_service(fvec)
    resp = svc.get(entity_rows=[{"name": "PQR"}])
    assert resp[0]["value"] == 6
    svc.close()

    with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
        fs.ingest(fset, df1, targets=[CSVTarget()], overwrite=False)

    fset.set_targets(targets=[CSVTarget()])
    with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
        fs.ingest(fset, df1, overwrite=False)
def prepare_feature_set(name: str, entity: str, data: pd.DataFrame, timestamp_key=None):
    df_source = mlrun.datastore.sources.DataFrameSource(data, entity, timestamp_key)

    feature_set = fs.FeatureSet(
        name, entities=[fs.Entity(entity)], timestamp_key=timestamp_key
    )
    feature_set.set_targets()
    df = fs.ingest(feature_set, df_source, infer_options=fs.InferOptions.default())
    return feature_set, df
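# Several tests below reference module-level sample frames (`quotes`, `stocks`) that
# are not part of this excerpt. A minimal sketch of what they could look like,
# assuming the ticker/bid/ask layout implied by the graph steps and entity keys;
# the timestamps and values here are illustrative, not the originals:
quotes = pd.DataFrame(
    {
        "time": pd.to_datetime(
            ["2016-05-25 13:30:00.023", "2016-05-25 13:30:00.041"]
        ),
        "ticker": ["GOOG", "MSFT"],
        "bid": [720.50, 51.95],
        "ask": [720.93, 51.96],
    }
)
stocks = pd.DataFrame(
    {"ticker": ["GOOG", "MSFT"], "name": ["Alphabet Inc", "Microsoft Corporation"]}
)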
def verify_target_list_fail(targets, with_defaults=None):
    feature_set = fs.FeatureSet(name="target-list-fail", entities=[fs.Entity("ticker")])
    with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
        if with_defaults:
            feature_set.set_targets(targets=targets, with_defaults=with_defaults)
        else:
            feature_set.set_targets(targets=targets)
    with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
        fs.ingest(feature_set, quotes, targets=targets)
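# `test_post_aggregation_step` below uses two names defined elsewhere in the original
# module: the `aggregates_step` constant (the default name MLRun gives the aggregation
# step) and a custom storey map class `MyMap`. A minimal sketch under those
# assumptions -- the body of `do` is reconstructed from the step's `field` and
# `multiplier` arguments and is illustrative, not the original implementation:
from storey import MapClass

aggregates_step = "Aggregates"


class MyMap(MapClass):
    def __init__(self, multiplier=1, field="multi1", **kwargs):
        super().__init__(**kwargs)
        self._multiplier = multiplier
        self._field = field

    def do(self, event):
        # Write the scaled bid into the configured output field.
        event[self._field] = event["bid"] * self._multiplier
        return event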
def test_post_aggregation_step(self):
    quotes_set = fs.FeatureSet("post-aggregation", entities=[fs.Entity("ticker")])
    agg_step = quotes_set.add_aggregation("asks", "ask", ["sum", "max"], "1h", "10m")
    agg_step.to("MyMap", "somemap1", field="multi1", multiplier=3)

    # Make sure the map step was added right after the aggregation step
    assert len(quotes_set.graph.states) == 2
    assert quotes_set.graph.states[aggregates_step].after is None
    assert quotes_set.graph.states["somemap1"].after == [aggregates_step]
def test_overwrite_single_parquet_file(self):
    df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
    df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

    targets = [ParquetTarget(path="v3io:///bigdata/overwrite-pq-spec/my.parquet")]

    fset = fs.FeatureSet(name="overwrite-pq-spec-path", entities=[fs.Entity("name")])
    fs.ingest(fset, df1, targets=targets)
    # Appending (overwrite=False) to a single-file parquet target is not allowed.
    with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
        fs.ingest(fset, df2, targets=targets, overwrite=False)
def test_split_graph(self):
    quotes_set = fs.FeatureSet("stock-quotes", entities=[fs.Entity("ticker")])

    quotes_set.graph.to("MyMap", "somemap1", field="multi1", multiplier=3).to(
        "storey.Extend", _fn="({'extra': event['bid'] * 77})"
    ).to("storey.Filter", "filter", _fn="(event['bid'] > 70)").to(
        FeaturesetValidator()
    )

    side_step_name = "side-step"
    quotes_set.graph.to(
        "storey.Extend", name=side_step_name, _fn="({'extra2': event['bid'] * 17})"
    )
    with pytest.raises(mlrun.errors.MLRunPreconditionFailedError):
        fs.infer_metadata(quotes_set, quotes)

    non_default_target_name = "side-target"
    quotes_set.set_targets(
        targets=[CSVTarget(name=non_default_target_name, after_state=side_step_name)],
        default_final_state="FeaturesetValidator",
    )

    quotes_set.plot(with_targets=True)

    inf_out = fs.infer_metadata(quotes_set, quotes)
    ing_out = fs.ingest(quotes_set, quotes, return_df=True)

    default_file_path = quotes_set.get_target_path(TargetTypes.parquet)
    side_file_path = quotes_set.get_target_path(non_default_target_name)

    side_file_out = pd.read_csv(side_file_path)
    default_file_out = pd.read_parquet(default_file_path)
    self._split_graph_expected_default.set_index("ticker", inplace=True)

    assert all(self._split_graph_expected_default == default_file_out.round(2))
    assert all(self._split_graph_expected_default == ing_out.round(2))
    assert all(self._split_graph_expected_default == inf_out.round(2))

    assert all(
        self._split_graph_expected_side.sort_index(axis=1)
        == side_file_out.sort_index(axis=1).round(2)
    )
def test_overwrite(self):
    df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
    df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

    fset = fs.FeatureSet(name="overwrite-fs", entities=[fs.Entity("name")])
    fs.ingest(fset, df1, targets=[CSVTarget(), ParquetTarget(), NoSqlTarget()])

    features = ["overwrite-fs.*"]
    fvec = fs.FeatureVector("overwrite-vec", features=features)

    csv_path = fset.get_target_path(name="csv")
    csv_df = pd.read_csv(csv_path)
    assert (
        df1.set_index(keys="name")
        .sort_index()
        .equals(csv_df.set_index(keys="name").sort_index())
    )

    parquet_path = fset.get_target_path(name="parquet")
    parquet_df = pd.read_parquet(parquet_path)
    assert df1.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

    svc = fs.get_online_feature_service(fvec)
    resp = svc.get(entity_rows=[{"name": "GHI"}])
    assert resp[0]["value"] == 3
    svc.close()

    # Re-ingest df2: the parquet and nosql targets are overwritten, while the CSV
    # file still holds the data from the first ingest.
    fs.ingest(fset, df2)

    csv_path = fset.get_target_path(name="csv")
    csv_df = pd.read_csv(csv_path)
    assert (
        df1.set_index(keys="name")
        .sort_index()
        .equals(csv_df.set_index(keys="name").sort_index())
    )

    parquet_path = fset.get_target_path(name="parquet")
    parquet_df = pd.read_parquet(parquet_path)
    assert df2.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

    svc = fs.get_online_feature_service(fvec)
    resp = svc.get(entity_rows=[{"name": "GHI"}])
    assert resp[0] is None
    resp = svc.get(entity_rows=[{"name": "PQR"}])
    assert resp[0]["value"] == 6
    svc.close()
def test_basic_remote_spark_ingest(self):
    key = "patient_id"
    measurements = fs.FeatureSet(
        "measurements",
        entities=[fs.Entity(key)],
        timestamp_key="timestamp",
        engine="spark",
    )
    source = ParquetSource("myparquet", path=self.get_remote_pq_source_path())
    fs.ingest(
        measurements,
        source,
        return_df=True,
        spark_context=self.spark_service,
        run_config=fs.RunConfig(local=False),
    )
def test_backwards_compatibility_step_vs_state():
    quotes_set = fs.FeatureSet("post-aggregation", entities=[fs.Entity("ticker")])
    agg_step = quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m")
    agg_step.to("MyMap", "somemap1", field="multi1", multiplier=3)

    quotes_set.set_targets(
        targets=[ParquetTarget("parquet1", after_state="somemap1")],
        with_defaults=False,
    )

    feature_set_dict = quotes_set.to_dict()
    # Make sure we're backwards compatible: "states" and "after_state" are the old
    # names of "steps" and "after_step".
    feature_set_dict["spec"]["graph"]["states"] = feature_set_dict["spec"][
        "graph"
    ].pop("steps")
    feature_set_dict["spec"]["targets"][0]["after_state"] = feature_set_dict["spec"][
        "targets"
    ][0].pop("after_step")

    from_dict_feature_set = fs.FeatureSet.from_dict(feature_set_dict)
    assert (
        deepdiff.DeepDiff(from_dict_feature_set.to_dict(), quotes_set.to_dict()) == {}
    )
def test_error_flow(self):
    df = pd.DataFrame(
        {
            "name": ["Jean", "Jacques", "Pierre"],
            "last_name": ["Dubois", "Dupont", "Lavigne"],
        }
    )

    measurements = fs.FeatureSet(
        "measurements",
        entities=[fs.Entity("name")],
        engine="spark",
    )

    # A DataFrame source cannot be ingested with a remote (non-local) spark run.
    with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
        fs.ingest(
            measurements,
            df,
            return_df=True,
            spark_context=self.spark_service,
            run_config=fs.RunConfig(local=False),
        )
def test_overwrite_specified_nosql_path(self):
    df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
    df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

    targets = [NoSqlTarget(path="v3io:///bigdata/overwrite-spec")]

    fset = fs.FeatureSet(name="overwrite-spec-path", entities=[fs.Entity("name")])
    features = ["overwrite-spec-path.*"]
    fvec = fs.FeatureVector("overwrite-spec-path-fvec", features=features)

    fs.ingest(fset, df1, targets=targets)
    fs.ingest(fset, df2, targets=targets)

    svc = fs.get_online_feature_service(fvec)
    resp = svc.get(entity_rows=[{"name": "PQR"}])
    assert resp[0]["value"] == 6
    resp = svc.get(entity_rows=[{"name": "ABC"}])
    assert resp[0] is None
    svc.close()
def test_featureset_uri(self):
    stocks_set = fs.FeatureSet("stocks01", entities=[fs.Entity("ticker")])
    stocks_set.save()
    fs.ingest(stocks_set.uri, stocks)
@pytest.mark.parametrize("partitioned", [True, False])
def test_schedule_on_filtered_by_time(self, partitioned):
    name = f"sched-time-{partitioned}"

    now = datetime.now()

    path = "v3io:///bigdata/bla.parquet"
    fsys = fsspec.filesystem(v3iofs.fs.V3ioFS.protocol)
    pd.DataFrame(
        {
            "time": [
                pd.Timestamp("2021-01-10 10:00:00"),
                pd.Timestamp("2021-01-10 11:00:00"),
            ],
            "first_name": ["moshe", "yosi"],
            "data": [2000, 10],
        }
    ).to_parquet(path=path, filesystem=fsys)

    cron_trigger = "*/2 * * * *"

    source = ParquetSource(
        "myparquet", path=path, time_field="time", schedule=cron_trigger
    )

    feature_set = fs.FeatureSet(
        name=name,
        entities=[fs.Entity("first_name")],
        timestamp_key="time",
        engine="spark",
    )

    if partitioned:
        targets = [
            NoSqlTarget(),
            ParquetTarget(
                name="tar1",
                path="v3io:///bigdata/fs1/",
                partitioned=True,
                partition_cols=["time"],
            ),
        ]
    else:
        targets = [
            ParquetTarget(name="tar2", path="v3io:///bigdata/fs2/", partitioned=False),
            NoSqlTarget(),
        ]

    fs.ingest(
        feature_set,
        source,
        run_config=fs.RunConfig(local=False),
        targets=targets,
        spark_context=self.spark_service,
    )

    # Ingestion starts every even minute and takes ~90 seconds to finish, so wait
    # until the next scheduled run has completed.
    if now.minute % 2 == 0:
        sleep(60 - now.second + 60 + 90)
    else:
        sleep(60 - now.second + 90)

    features = [f"{name}.*"]
    vec = fs.FeatureVector("sched_test-vec", features)

    svc = fs.get_online_feature_service(vec)

    resp = svc.get([{"first_name": "yosi"}, {"first_name": "moshe"}])
    assert resp[0]["data"] == 10
    assert resp[1]["data"] == 2000

    pd.DataFrame(
        {
            "time": [
                pd.Timestamp("2021-01-10 12:00:00"),
                pd.Timestamp("2021-01-10 13:00:00"),
                now + pd.Timedelta(minutes=10),
                pd.Timestamp("2021-01-09 13:00:00"),
            ],
            "first_name": ["moshe", "dina", "katya", "uri"],
            "data": [50, 10, 25, 30],
        }
    ).to_parquet(path=path)

    sleep(120)
    resp = svc.get(
        [
            {"first_name": "yosi"},
            {"first_name": "moshe"},
            {"first_name": "katya"},
            {"first_name": "dina"},
            {"first_name": "uri"},
        ]
    )
    assert resp[0]["data"] == 10
    assert resp[1]["data"] == 50
    # katya's timestamp is in the future and uri's predates the filter window, so
    # neither should have been ingested.
    assert resp[2] is None
    assert resp[3]["data"] == 10
    assert resp[4] is None

    svc.close()

    # check offline
    resp = fs.get_offline_features(vec)
    df = resp.to_dataframe()
    assert len(df) == 4
    # The filtered-out entities must not appear anywhere in the offline result.
    assert not df.isin(["uri", "katya"]).any().any()