Example #1
    def test_parquet_target_vector_overwrite(self):
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        fset = fs.FeatureSet(name="fvec-parquet-fset", entities=[fs.Entity("name")])
        fs.ingest(fset, df1)

        features = ["fvec-parquet-fset.*"]
        fvec = fs.FeatureVector("fvec-parquet", features=features)

        target = ParquetTarget()
        off1 = fs.get_offline_features(fvec, target=target)
        dfout1 = pd.read_parquet(target._target_path)

        assert (
            df1.set_index(keys="name")
            .sort_index()
            .equals(off1.to_dataframe().sort_index())
        )
        assert df1.set_index(keys="name").sort_index().equals(dfout1.sort_index())

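        # re-ingesting with the default overwrite behavior replaces the offline
        # target, so the parquet file should now contain only df2's rows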
        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})
        fs.ingest(fset, df2)
        off2 = fs.get_offline_features(fvec, target=target)
        dfout2 = pd.read_parquet(target._target_path)
        assert (
            df2.set_index(keys="name")
            .sort_index()
            .equals(off2.to_dataframe().sort_index())
        )
        assert df2.set_index(keys="name").sort_index().equals(dfout2.sort_index())
Example #2
    def test_override_false(self):
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})
        df3 = pd.concat([df1, df2])

        fset = fs.FeatureSet(name="override-false", entities=[fs.Entity("name")])
        fs.ingest(fset, df1)

        features = ["override-false.*"]
        fvec = fs.FeatureVector("override-false-vec", features=features)

        off1 = fs.get_offline_features(fvec).to_dataframe()
        assert df1.set_index(keys="name").sort_index().equals(off1.sort_index())

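        # overwrite=False appends to the existing parquet target instead of
        # replacing it, so the vector should now return df1 + df2 (df3)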
        fs.ingest(fset, df2, overwrite=False)

        off2 = fs.get_offline_features(fvec).to_dataframe()
        assert df3.set_index(keys="name").sort_index().equals(off2.sort_index())

        fs.ingest(fset, df1, targets=[ParquetTarget()])

        off1 = fs.get_offline_features(fvec).to_dataframe()
        assert df1.set_index(keys="name").sort_index().equals(off1.sort_index())

        svc = fs.get_online_feature_service(fvec)
        resp = svc.get(entity_rows=[{"name": "PQR"}])
        assert resp[0]["value"] == 6
        svc.close()

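        # appending (overwrite=False) is presumably unsupported for CSV targets,
        # whether passed explicitly or configured as the feature set's target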
        with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
            fs.ingest(fset, df1, targets=[CSVTarget()], overwrite=False)

        fset.set_targets(targets=[CSVTarget()])
        with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
            fs.ingest(fset, df1, overwrite=False)
Example #3
def prepare_feature_set(name: str, entity: str, data: pd.DataFrame, timestamp_key=None):
    df_source = mlrun.datastore.sources.DataFrameSource(data, entity, timestamp_key)

    feature_set = fs.FeatureSet(
        name, entities=[fs.Entity(entity)], timestamp_key=timestamp_key
    )
    feature_set.set_targets()
    df = fs.ingest(feature_set, df_source, infer_options=fs.InferOptions.default())
    return feature_set, df
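
A minimal usage sketch for this helper (the sample DataFrame, column names, and values below are illustrative assumptions, not taken from the original tests):

stocks = pd.DataFrame(
    {
        "ticker": ["GOOG", "MSFT"],  # entity column
        "time": [pd.Timestamp("2021-01-10 10:00:00"), pd.Timestamp("2021-01-10 11:00:00")],
        "price": [720.50, 51.95],
    }
)
feature_set, ingested_df = prepare_feature_set("stocks", "ticker", stocks, timestamp_key="time")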
Example #4
def verify_target_list_fail(targets, with_defaults=None):
    feature_set = fs.FeatureSet(name="target-list-fail", entities=[fs.Entity("ticker")])
    with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
        if with_defaults:
            feature_set.set_targets(targets=targets, with_defaults=with_defaults)
        else:
            feature_set.set_targets(targets=targets)
    with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
        fs.ingest(feature_set, quotes, targets=targets)
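
For example, the helper might be exercised with target lists that mlrun's validation rejects; the specific invalid inputs below (duplicate target names) are an assumption for illustration:

verify_target_list_fail([CSVTarget(name="dup"), CSVTarget(name="dup")])
verify_target_list_fail([ParquetTarget(name="dup"), ParquetTarget(name="dup")], with_defaults=True)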
Example #5
    def test_post_aggregation_step(self):
        quotes_set = fs.FeatureSet("post-aggregation", entities=[fs.Entity("ticker")])
        agg_step = quotes_set.add_aggregation(
            "asks", "ask", ["sum", "max"], "1h", "10m"
        )
        agg_step.to("MyMap", "somemap1", field="multi1", multiplier=3)

        # Make sure the map step was added right after the aggregation step
        assert len(quotes_set.graph.states) == 2
        assert quotes_set.graph.states[aggregates_step].after is None
        assert quotes_set.graph.states["somemap1"].after == [aggregates_step]
Example #6
    def test_overwrite_single_parquet_file(self):
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

        targets = [ParquetTarget(path="v3io:///bigdata/overwrite-pq-spec/my.parquet")]

        fset = fs.FeatureSet(name="overwrite-pq-spec-path", entities=[fs.Entity("name")])

        fs.ingest(fset, df1, targets=targets)
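        # a target pointing at a single parquet file (rather than a directory)
        # presumably cannot be appended to, so overwrite=False is expected to raise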
        with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
            fs.ingest(fset, df2, targets=targets, overwrite=False)
Example #7
    def test_split_graph(self):
        quotes_set = fs.FeatureSet("stock-quotes",
                                   entities=[fs.Entity("ticker")])

        quotes_set.graph.to("MyMap", "somemap1", field="multi1",
                            multiplier=3).to(
                                "storey.Extend",
                                _fn="({'extra': event['bid'] * 77})").to(
                                    "storey.Filter",
                                    "filter",
                                    _fn="(event['bid'] > 70)").to(
                                        FeaturesetValidator())

        side_step_name = "side-step"
        quotes_set.graph.to("storey.Extend",
                            name=side_step_name,
                            _fn="({'extra2': event['bid'] * 17})")
        with pytest.raises(mlrun.errors.MLRunPreconditionFailedError):
            fs.infer_metadata(quotes_set, quotes)

        non_default_target_name = "side-target"
        quotes_set.set_targets(
            targets=[CSVTarget(name=non_default_target_name, after_state=side_step_name)],
            default_final_state="FeaturesetValidator",
        )

        quotes_set.plot(with_targets=True)

        inf_out = fs.infer_metadata(quotes_set, quotes)
        ing_out = fs.ingest(quotes_set, quotes, return_df=True)

        default_file_path = quotes_set.get_target_path(TargetTypes.parquet)
        side_file_path = quotes_set.get_target_path(non_default_target_name)

        side_file_out = pd.read_csv(side_file_path)
        default_file_out = pd.read_parquet(default_file_path)
        self._split_graph_expected_default.set_index("ticker", inplace=True)

        assert all(self._split_graph_expected_default == default_file_out.round(2))
        assert all(self._split_graph_expected_default == ing_out.round(2))
        assert all(self._split_graph_expected_default == inf_out.round(2))

        assert all(
            self._split_graph_expected_side.sort_index(axis=1)
            == side_file_out.sort_index(axis=1).round(2)
        )
Example #8
    def test_overwrite(self):
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

        fset = fs.FeatureSet(name="overwrite-fs", entities=[fs.Entity("name")])
        fs.ingest(fset, df1, targets=[CSVTarget(), ParquetTarget(), NoSqlTarget()])

        features = ["overwrite-fs.*"]
        fvec = fs.FeatureVector("overwrite-vec", features=features)

        csv_path = fset.get_target_path(name="csv")
        csv_df = pd.read_csv(csv_path)
        assert (
            df1.set_index(keys="name")
            .sort_index()
            .equals(csv_df.set_index(keys="name").sort_index())
        )

        parquet_path = fset.get_target_path(name="parquet")
        parquet_df = pd.read_parquet(parquet_path)
        assert df1.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

        svc = fs.get_online_feature_service(fvec)
        resp = svc.get(entity_rows=[{"name": "GHI"}])
        assert resp[0]["value"] == 3
        svc.close()

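        # ingest without explicit targets writes only the feature set's default
        # targets: the csv target keeps df1, while the parquet target and the
        # online store are overwritten with df2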
        fs.ingest(fset, df2)

        csv_path = fset.get_target_path(name="csv")
        csv_df = pd.read_csv(csv_path)
        assert (
            df1.set_index(keys="name")
            .sort_index()
            .equals(csv_df.set_index(keys="name").sort_index())
        )

        parquet_path = fset.get_target_path(name="parquet")
        parquet_df = pd.read_parquet(parquet_path)
        assert df2.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

        svc = fs.get_online_feature_service(fvec)
        resp = svc.get(entity_rows=[{"name": "GHI"}])
        assert resp[0] is None

        resp = svc.get(entity_rows=[{"name": "PQR"}])
        assert resp[0]["value"] == 6
        svc.close()
Example #9
    def test_basic_remote_spark_ingest(self):
        key = "patient_id"
        measurements = fs.FeatureSet(
            "measurements",
            entities=[fs.Entity(key)],
            timestamp_key="timestamp",
            engine="spark",
        )
        source = ParquetSource("myparquet", path=self.get_remote_pq_source_path())
        fs.ingest(
            measurements,
            source,
            return_df=True,
            spark_context=self.spark_service,
            run_config=fs.RunConfig(local=False),
        )
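
Note: run_config=fs.RunConfig(local=False) submits the ingestion as a remote job, and spark_context here names the Spark service to run against rather than a local SparkContext.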
Example #10
def test_backwards_compatibility_step_vs_state():
    quotes_set = fs.FeatureSet("post-aggregation",
                               entities=[fs.Entity("ticker")])
    agg_step = quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m")
    agg_step.to("MyMap", "somemap1", field="multi1", multiplier=3)
    quotes_set.set_targets(
        targets=[ParquetTarget("parquet1", after_state="somemap1")],
        with_defaults=False,
    )

    feature_set_dict = quotes_set.to_dict()
    # Make sure we're backwards compatible
    feature_set_dict["spec"]["graph"]["states"] = feature_set_dict["spec"][
        "graph"].pop("steps")
    feature_set_dict["spec"]["targets"][0]["after_state"] = feature_set_dict[
        "spec"]["targets"][0].pop("after_step")

    from_dict_feature_set = fs.FeatureSet.from_dict(feature_set_dict)
    assert deepdiff.DeepDiff(from_dict_feature_set.to_dict(), quotes_set.to_dict()) == {}
Example #11
    def test_error_flow(self):
        df = pd.DataFrame({
            "name": ["Jean", "Jacques", "Pierre"],
            "last_name": ["Dubois", "Dupont", "Lavigne"],
        })

        measurements = fs.FeatureSet(
            "measurements",
            entities=[fs.Entity("name")],
            engine="spark",
        )

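        # ingesting an in-memory DataFrame with a remote (non-local) run config is
        # presumably invalid, since the remote Spark job cannot access the DataFrame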
        with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
            fs.ingest(
                measurements,
                df,
                return_df=True,
                spark_context=self.spark_service,
                run_config=fs.RunConfig(local=False),
            )
Example #12
    def test_overwrite_specified_nosql_path(self):
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

        targets = [NoSqlTarget(path="v3io:///bigdata/overwrite-spec")]

        fset = fs.FeatureSet(name="overwrite-spec-path",
                             entities=[fs.Entity("name")])
        features = ["overwrite-spec-path.*"]
        fvec = fs.FeatureVector("overwrite-spec-path-fvec", features=features)

        fs.ingest(fset, df1, targets=targets)

        fs.ingest(fset, df2, targets=targets)

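        # the second ingest overwrote the nosql table at the specified path,
        # so only df2's keys should be found by the online service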
        svc = fs.get_online_feature_service(fvec)
        resp = svc.get(entity_rows=[{"name": "PQR"}])
        assert resp[0]["value"] == 6
        resp = svc.get(entity_rows=[{"name": "ABC"}])
        assert resp[0] is None
        svc.close()
Example #13
    def test_featureset_uri(self):
        stocks_set = fs.FeatureSet("stocks01", entities=[fs.Entity("ticker")])
        stocks_set.save()
        fs.ingest(stocks_set.uri, stocks)
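
A saved feature set can be passed to ingest by its store URI (stocks_set.uri) instead of the feature set object itself.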
Example #14
    def test_schedule_on_filtered_by_time(self, partitioned):
        name = f"sched-time-{str(partitioned)}"

        now = datetime.now()

        path = "v3io:///bigdata/bla.parquet"
        fsys = fsspec.filesystem(v3iofs.fs.V3ioFS.protocol)
        pd.DataFrame({
            "time": [
                pd.Timestamp("2021-01-10 10:00:00"),
                pd.Timestamp("2021-01-10 11:00:00"),
            ],
            "first_name": ["moshe", "yosi"],
            "data": [2000, 10],
        }).to_parquet(path=path, filesystem=fsys)

        cron_trigger = "*/2 * * * *"

        source = ParquetSource("myparquet",
                               path=path,
                               time_field="time",
                               schedule=cron_trigger)

        feature_set = fs.FeatureSet(
            name=name,
            entities=[fs.Entity("first_name")],
            timestamp_key="time",
            engine="spark",
        )

        if partitioned:
            targets = [
                NoSqlTarget(),
                ParquetTarget(
                    name="tar1",
                    path="v3io:///bigdata/fs1/",
                    partitioned=True,
                    partition_cols=["time"],
                ),
            ]
        else:
            targets = [
                ParquetTarget(name="tar2",
                              path="v3io:///bigdata/fs2/",
                              partitioned=False),
                NoSqlTarget(),
            ]

        fs.ingest(
            feature_set,
            source,
            run_config=fs.RunConfig(local=False),
            targets=targets,
            spark_context=self.spark_service,
        )
        # ingest is scheduled every two minutes and takes ~90 seconds to finish,
        # so wait until the next scheduled run has had time to complete
        if (now.minute % 2) == 0:
            sleep(60 - now.second + 60 + 90)
        else:
            sleep(60 - now.second + 90)

        features = [f"{name}.*"]
        vec = fs.FeatureVector("sched_test-vec", features)

        svc = fs.get_online_feature_service(vec)

        resp = svc.get([{"first_name": "yosi"}, {"first_name": "moshe"}])
        assert resp[0]["data"] == 10
        assert resp[1]["data"] == 2000

        pd.DataFrame({
            "time": [
                pd.Timestamp("2021-01-10 12:00:00"),
                pd.Timestamp("2021-01-10 13:00:00"),
                now + pd.Timedelta(minutes=10),
                pd.Timestamp("2021-01-09 13:00:00"),
            ],
            "first_name": ["moshe", "dina", "katya", "uri"],
            "data": [50, 10, 25, 30],
        }).to_parquet(path=path)
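        # katya's timestamp lies in the future and uri's predates the already-ingested
        # window, so the time-filtered scheduled ingest should skip both rows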

        sleep(120)
        resp = svc.get([
            {"first_name": "yosi"},
            {"first_name": "moshe"},
            {"first_name": "katya"},
            {"first_name": "dina"},
            {"first_name": "uri"},
        ])
        assert resp[0]["data"] == 10
        assert resp[1]["data"] == 50
        assert resp[2] is None
        assert resp[3]["data"] == 10
        assert resp[4] is None

        svc.close()

        # check offline
        resp = fs.get_offline_features(vec)
        df = resp.to_dataframe()
        assert len(df) == 4
        assert "uri" not in df.index and "katya" not in df.index