Example #1
    def test_csv_parquet_index_alignment(self):
        targets = [CSVTarget()]
        csv_align_set, _ = prepare_feature_set("csv-align",
                                               "ticker",
                                               quotes,
                                               timestamp_key="time",
                                               targets=targets)
        csv_df = csv_align_set.to_dataframe()

        features = ["csv-align.*"]
        csv_vec = fs.FeatureVector("csv-align-vector", features)
        resp = fs.get_offline_features(csv_vec)
        csv_vec_df = resp.to_dataframe()

        targets = [ParquetTarget()]
        parquet_align_set, _ = prepare_feature_set("parquet-align",
                                                   "ticker",
                                                   quotes,
                                                   timestamp_key="time",
                                                   targets=targets)
        parquet_df = parquet_align_set.to_dataframe()
        features = ["parquet-align.*"]
        parquet_vec = fs.FeatureVector("parquet-align-vector", features)
        resp = fs.get_offline_features(parquet_vec)
        parquet_vec_df = resp.to_dataframe()

        assert (csv_df == parquet_df).all().all()
        assert (csv_vec_df == parquet_vec_df).all().all()
Example #2
    def test_ingest_twice_with_nulls(self):
        name = f"test_ingest_twice_with_nulls_{uuid.uuid4()}"
        key = "key"

        measurements = fs.FeatureSet(
            name, entities=[Entity(key)], timestamp_key="my_time"
        )
        columns = [key, "my_string", "my_time"]
        df = pd.DataFrame(
            [["mykey1", "hello", pd.Timestamp("2019-01-26 14:52:37")]], columns=columns
        )
        df.set_index("my_string")
        source = DataFrameSource(df)
        measurements.set_targets(
            targets=[ParquetTarget(partitioned=True)], with_defaults=False,
        )
        resp1 = fs.ingest(measurements, source)
        assert resp1.to_dict() == {
            "my_string": {"mykey1": "hello"},
            "my_time": {"mykey1": pd.Timestamp("2019-01-26 14:52:37")},
        }

        features = [
            f"{name}.*",
        ]
        vector = fs.FeatureVector("myvector", features)
        resp2 = fs.get_offline_features(vector)
        resp2 = resp2.to_dataframe()
        assert resp2.to_dict() == {"my_string": {"mykey1": "hello"}}

        measurements = fs.FeatureSet(
            name, entities=[Entity(key)], timestamp_key="my_time"
        )
        columns = [key, "my_string", "my_time"]
        df = pd.DataFrame(
            [["mykey2", None, pd.Timestamp("2019-01-26 14:52:37")]], columns=columns
        )
        df.set_index("my_string")
        source = DataFrameSource(df)
        measurements.set_targets(
            targets=[ParquetTarget(partitioned=True)], with_defaults=False,
        )
        resp1 = fs.ingest(measurements, source, overwrite=False)
        assert resp1.to_dict() == {
            "my_string": {"mykey2": None},
            "my_time": {"mykey2": pd.Timestamp("2019-01-26 14:52:37")},
        }

        features = [
            f"{name}.*",
        ]
        vector = fs.FeatureVector("myvector", features)
        resp2 = fs.get_offline_features(vector)
        resp2 = resp2.to_dataframe()
        assert resp2.to_dict() == {"my_string": {"mykey1": "hello", "mykey2": None}}
Example #3
    def test_unaggregated_columns(self):
        test_base_time = datetime(2020, 12, 1, 17, 33, 15)

        data = pd.DataFrame({
            "time": [test_base_time, test_base_time - pd.Timedelta(minutes=1)],
            "first_name": ["moshe", "yosi"],
            "last_name": ["cohen", "levi"],
            "bid": [2000, 10],
        })

        name = f"measurements_{uuid.uuid4()}"

        # write to kv
        data_set = fs.FeatureSet(name, entities=[Entity("first_name")])

        data_set.add_aggregation(
            name="bids",
            column="bid",
            operations=["sum", "max"],
            windows="1h",
            period="10m",
        )

        fs.ingest(data_set, data, return_df=True)

        features = [f"{name}.bids_sum_1h", f"{name}.last_name"]

        vector = fs.FeatureVector("my-vec", features)
        svc = fs.get_online_feature_service(vector)

        resp = svc.get([{"first_name": "moshe"}])
        expected = {"bids_sum_1h": 2000.0, "last_name": "cohen"}
        assert resp[0] == expected
        svc.close()
Example #4
    def test_parquet_target_vector_overwrite(self):
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        fset = fs.FeatureSet(name="fvec-parquet-fset", entities=[fs.Entity("name")])
        fs.ingest(fset, df1)

        features = ["fvec-parquet-fset.*"]
        fvec = fs.FeatureVector("fvec-parquet", features=features)

        target = ParquetTarget()
        off1 = fs.get_offline_features(fvec, target=target)
        dfout1 = pd.read_parquet(target._target_path)

        assert (
            df1.set_index(keys="name")
            .sort_index()
            .equals(off1.to_dataframe().sort_index())
        )
        assert df1.set_index(keys="name").sort_index().equals(dfout1.sort_index())

        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})
        fs.ingest(fset, df2)
        off2 = fs.get_offline_features(fvec, target=target)
        dfout2 = pd.read_parquet(target._target_path)
        assert (
            df2.set_index(keys="name")
            .sort_index()
            .equals(off2.to_dataframe().sort_index())
        )
        assert df2.set_index(keys="name").sort_index().equals(dfout2.sort_index())
Example #5
    def test_override_false(self):
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})
        df3 = pd.concat([df1, df2])

        fset = fs.FeatureSet(name="override-false", entities=[fs.Entity("name")])
        fs.ingest(fset, df1)

        features = ["override-false.*"]
        fvec = fs.FeatureVector("override-false-vec", features=features)

        off1 = fs.get_offline_features(fvec).to_dataframe()
        assert df1.set_index(keys="name").sort_index().equals(off1.sort_index())

        fs.ingest(fset, df2, overwrite=False)

        off2 = fs.get_offline_features(fvec).to_dataframe()
        assert df3.set_index(keys="name").sort_index().equals(off2.sort_index())

        fs.ingest(fset, df1, targets=[ParquetTarget()])

        off1 = fs.get_offline_features(fvec).to_dataframe()
        assert df1.set_index(keys="name").sort_index().equals(off1.sort_index())

        svc = fs.get_online_feature_service(fvec)
        resp = svc.get(entity_rows=[{"name": "PQR"}])
        assert resp[0]["value"] == 6
        svc.close()

        with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
            fs.ingest(fset, df1, targets=[CSVTarget()], overwrite=False)

        fset.set_targets(targets=[CSVTarget()])
        with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
            fs.ingest(fset, df1, overwrite=False)
Example #6
    def _get_offline_vector(self, features, features_size):
        vector = fs.FeatureVector("myvector", features, "stock-quotes.xx")
        resp = fs.get_offline_features(
            vector,
            entity_rows=trades,
            entity_timestamp_column="time",
        )
        assert len(vector.spec.features) == len(features), "unexpected num of requested features"
        assert len(vector.status.features) == features_size, "unexpected num of returned features"
        assert len(vector.status.stats) == features_size, "unexpected num of feature stats"
        assert vector.status.label_column == "xx", "unexpected label_column name"

        df = resp.to_dataframe()
        columns = trades.shape[1] + features_size - 2  # - 2 keys
        assert df.shape[1] == columns, "unexpected num of returned df columns"
        resp.to_parquet(str(self.results_path / "query.parquet"))

        # check simple api without join with other df
        resp = fs.get_offline_features(vector)
        df = resp.to_dataframe()
        assert df.shape[1] == features_size, "unexpected num of returned df columns"
Example #7
def test_check_permissions():
    data = pd.DataFrame({
        "time_stamp": [
            pd.Timestamp("2021-06-09 09:30:06.008"),
            pd.Timestamp("2021-06-09 10:29:07.009"),
            pd.Timestamp("2021-06-09 09:29:08.010"),
        ],
        "data": [10, 20, 30],
        "string": ["ab", "cd", "ef"],
    })
    data_set1 = fs.FeatureSet("fs1", entities=[Entity("string")])

    mlrun.db.FileRunDB.verify_authorization = unittest.mock.Mock(
        side_effect=mlrun.errors.MLRunAccessDeniedError(""))

    try:
        fs.preview(
            data_set1,
            data,
            entity_columns=[Entity("string")],
            timestamp_key="time_stamp",
        )
        assert False
    except mlrun.errors.MLRunAccessDeniedError:
        pass

    try:
        fs.ingest(data_set1, data, infer_options=fs.InferOptions.default())
        assert False
    except mlrun.errors.MLRunAccessDeniedError:
        pass

    features = ["fs1.*"]
    feature_vector = fs.FeatureVector("test", features)
    try:
        fs.get_offline_features(feature_vector,
                                entity_timestamp_column="time_stamp")
        assert False
    except mlrun.errors.MLRunAccessDeniedError:
        pass

    try:
        fs.get_online_feature_service(feature_vector)
        assert False
    except mlrun.errors.MLRunAccessDeniedError:
        pass

    try:
        fs.deploy_ingestion_service(featureset=data_set1)
        assert False
    except mlrun.errors.MLRunAccessDeniedError:
        pass

    try:
        data_set1.purge_targets()
        assert False
    except mlrun.errors.MLRunAccessDeniedError:
        pass
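
For comparison, the same access-denied expectations could be written more compactly with pytest.raises, the idiom already used in Example #5. A minimal illustrative rewrite of two of the calls above, not part of the original test:

    with pytest.raises(mlrun.errors.MLRunAccessDeniedError):
        fs.ingest(data_set1, data, infer_options=fs.InferOptions.default())

    with pytest.raises(mlrun.errors.MLRunAccessDeniedError):
        fs.get_online_feature_service(feature_vector)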
Example #8
    def test_multiple_entities(self):
        name = f"measurements_{uuid.uuid4()}"
        current_time = pd.Timestamp.now()
        data = pd.DataFrame(
            {
                "time": [
                    current_time,
                    current_time - pd.Timedelta(minutes=1),
                    current_time - pd.Timedelta(minutes=2),
                    current_time - pd.Timedelta(minutes=3),
                    current_time - pd.Timedelta(minutes=4),
                    current_time - pd.Timedelta(minutes=5),
                ],
                "first_name": ["moshe", "yosi", "yosi", "yosi", "moshe", "yosi"],
                "last_name": ["cohen", "levi", "levi", "levi", "cohen", "levi"],
                "bid": [2000, 10, 11, 12, 2500, 14],
            }
        )

        # write to kv
        data_set = fs.FeatureSet(
            name, entities=[Entity("first_name"), Entity("last_name")]
        )

        data_set.add_aggregation(
            name="bids",
            column="bid",
            operations=["sum", "max"],
            windows="1h",
            period="10m",
            emit_policy=EmitAfterMaxEvent(1),
        )
        fs.infer_metadata(
            data_set,
            data,  # source
            entity_columns=["first_name", "last_name"],
            timestamp_key="time",
            options=fs.InferOptions.default(),
        )

        data_set.plot(
            str(self.results_path / "pipe.png"), rankdir="LR", with_targets=True
        )
        fs.ingest(data_set, data, return_df=True)

        features = [
            f"{name}.bids_sum_1h",
        ]

        vector = fs.FeatureVector("my-vec", features)
        svc = fs.get_online_feature_service(vector)

        resp = svc.get([{"first_name": "yosi", "last_name": "levi"}])
        assert resp[0]["bids_sum_1h"] == 47.0

        svc.close()
Example #9
    def test_ordered_pandas_asof_merge(self):
        left_set, left = prepare_feature_set(
            "left", "ticker", trades, timestamp_key="time"
        )
        right_set, right = prepare_feature_set(
            "right", "ticker", quotes, timestamp_key="time"
        )

        features = ["left.*", "right.*"]
        feature_vector = fs.FeatureVector("test_fv", features, description="test FV")
        res = fs.get_offline_features(feature_vector, entity_timestamp_column="time")
        res = res.to_dataframe()
        assert res.shape[0] == left.shape[0]
Example #10
    def _get_online_features(self, features, features_size):
        # test real-time query
        vector = fs.FeatureVector("my-vec", features)
        svc = fs.get_online_feature_service(vector)

        resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
        resp = svc.get([{"ticker": "AAPL"}])
        assert (resp[0]["name"] == "Apple Inc" and resp[0]["exchange"]
                == "NASDAQ"), "unexpected online result"
        resp2 = svc.get([{"ticker": "AAPL"}], as_list=True)
        assert (len(resp2[0]) == features_size -
                1), "unexpected online vector size"  # -1 label
        svc.close()
Example #11
    def test_overwrite(self):
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

        fset = fs.FeatureSet(name="overwrite-fs", entities=[fs.Entity("name")])
        fs.ingest(fset, df1, targets=[CSVTarget(), ParquetTarget(), NoSqlTarget()])

        features = ["overwrite-fs.*"]
        fvec = fs.FeatureVector("overwrite-vec", features=features)

        csv_path = fset.get_target_path(name="csv")
        csv_df = pd.read_csv(csv_path)
        assert (
            df1.set_index(keys="name")
            .sort_index()
            .equals(csv_df.set_index(keys="name").sort_index())
        )

        parquet_path = fset.get_target_path(name="parquet")
        parquet_df = pd.read_parquet(parquet_path)
        assert df1.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

        svc = fs.get_online_feature_service(fvec)
        resp = svc.get(entity_rows=[{"name": "GHI"}])
        assert resp[0]["value"] == 3
        svc.close()

        fs.ingest(fset, df2)

        csv_path = fset.get_target_path(name="csv")
        csv_df = pd.read_csv(csv_path)
        assert (
            df1.set_index(keys="name")
            .sort_index()
            .equals(csv_df.set_index(keys="name").sort_index())
        )

        parquet_path = fset.get_target_path(name="parquet")
        parquet_df = pd.read_parquet(parquet_path)
        assert df2.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

        svc = fs.get_online_feature_service(fvec)
        resp = svc.get(entity_rows=[{"name": "GHI"}])
        assert resp[0] is None

        resp = svc.get(entity_rows=[{"name": "PQR"}])
        assert resp[0]["value"] == 6
        svc.close()
Example #12
    def test_none_value(self):
        data = pd.DataFrame(
            {"first_name": ["moshe", "yossi"], "bid": [2000, 10], "bool": [True, None]}
        )

        # write to kv
        data_set = fs.FeatureSet("tests2", entities=[Entity("first_name")])
        fs.ingest(data_set, data, return_df=True)
        features = ["tests2.*"]
        vector = fs.FeatureVector("my-vec", features)
        svc = fs.get_online_feature_service(vector)

        resp = svc.get([{"first_name": "yossi"}])
        assert resp[0] == {"bid": 10, "bool": None}

        svc.close()
Example #13
def test_realtime_query():
    init_store()

    features = [
        "stock-quotes.bid",
        "stock-quotes.asks_sum_5h",
        "stock-quotes.ask as mycol",
        "stocks.*",
    ]
    features_size = len(features) + 1 + 1  # (*) returns 2 features, label adds 1 feature

    resp = fs.get_offline_features(
        features,
        entity_rows=trades,
        entity_timestamp_column="time",
        label_feature="stock-quotes.xx",
    )
    vector = resp.vector
    assert len(vector.spec.features) == len(features), "unexpected num of requested features"
    assert len(vector.status.features) == features_size, "unexpected num of returned features"
    assert len(vector.status.stats) == features_size, "unexpected num of feature stats"
    assert vector.status.label_column == "xx", "unexpected label_column name"

    df = resp.to_dataframe()
    columns = trades.shape[1] + features_size - 2  # - 2 keys
    assert df.shape[1] == columns, "unexpected num of returned df columns"
    resp.to_parquet(results_dir + "query.parquet")

    # test real-time query
    vector = fs.FeatureVector("my-vec", features)
    svc = fs.get_online_feature_service(vector)

    resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
    resp = svc.get([{"ticker": "AAPL"}])
    assert (resp[0]["name"] == "Apple Inc"
            and resp[0]["exchange"] == "NASDAQ"), "unexpected online result"
    resp2 = svc.get([{"ticker": "AAPL"}], as_list=True)
    assert (len(resp2[0]) == features_size -
            1), "unexpected online vector size"  # -1 label
    svc.close()
Example #14
    def test_feature_vector_db(self):
        name = "fvec-test"
        fvec = fs.FeatureVector(name=name)

        db = mlrun.get_run_db()

        # TODO: Using to_dict due to a bug in httpdb api which will be fixed in another PR
        db.create_feature_vector(feature_vector=fvec.to_dict(),
                                 project=self.project_name)

        vecs = db.list_feature_vectors(self.project_name, name)
        assert len(vecs) == 1, "bad number of results"

        feature_vec = db.get_feature_vector(name, self.project_name)
        assert feature_vec.metadata.name == name, "bad feature set response"

        fs.delete_feature_vector(name, self.project_name)
        vecs = db.list_feature_vectors(self.project_name, name)
        assert not vecs, "Feature vector should be deleted"
Example #15
def test_right_not_ordered_pandas_asof_merge():
    init_store()

    right = quotes.sort_values(by="bid")

    left_set, left = prepare_feature_set("left",
                                         "ticker",
                                         trades,
                                         timestamp_key="time")
    right_set, right = prepare_feature_set("right",
                                           "ticker",
                                           right,
                                           timestamp_key="time")

    features = ["left.*", "right.*"]
    feature_vector = fs.FeatureVector("test_fv", features, "test FV")
    res = fs.get_offline_features(feature_vector,
                                  entity_timestamp_column="time")
    res = res.to_dataframe()
    assert res.shape[0] == left.shape[0]
Example #16
    def test_overwrite_specified_nosql_path(self):
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

        targets = [NoSqlTarget(path="v3io:///bigdata/overwrite-spec")]

        fset = fs.FeatureSet(name="overwrite-spec-path",
                             entities=[fs.Entity("name")])
        features = ["overwrite-spec-path.*"]
        fvec = fs.FeatureVector("overwrite-spec-path-fvec", features=features)

        fs.ingest(fset, df1, targets=targets)

        fs.ingest(fset, df2, targets=targets)

        svc = fs.get_online_feature_service(fvec)
        resp = svc.get(entity_rows=[{"name": "PQR"}])
        assert resp[0]["value"] == 6
        resp = svc.get(entity_rows=[{"name": "ABC"}])
        assert resp[0] is None
        svc.close()
Example #17
    def test_offline_features_filter_non_partitioned(self):
        data = pd.DataFrame({
            "time_stamp": [
                pd.Timestamp("2021-06-09 09:30:06.008"),
                pd.Timestamp("2021-06-09 10:29:07.009"),
                pd.Timestamp("2021-06-09 09:29:08.010"),
            ],
            "data": [10, 20, 30],
            "string": ["ab", "cd", "ef"],
        })

        data_set1 = fs.FeatureSet("fs1", entities=[Entity("string")])
        fs.ingest(data_set1, data, infer_options=fs.InferOptions.default())
        features = ["fs1.*"]
        vector = fs.FeatureVector("vector", features)
        resp = fs.get_offline_features(
            vector,
            entity_timestamp_column="time_stamp",
            start_time=datetime(2021, 6, 9, 9, 30),
            end_time=datetime(2021, 6, 9, 10, 30),
        )
        assert len(resp.to_dataframe()) == 2
Example #18
def test_realtime_query():
    init_store()

    features = [
        "stock-quotes.bid",
        "stock-quotes.asks_sum_5h",
        "stock-quotes.ask as mycol",
        "stocks.*",
    ]

    resp = fs.get_offline_features(features,
                                   entity_rows=trades,
                                   entity_timestamp_column="time")
    vector = resp.vector
    assert len(vector.spec.features) == len(features), "unexpected num of requested features"
    # stocks (*) returns 2 features
    assert len(vector.status.features) == len(features) + 1, "unexpected num of returned features"
    assert len(vector.status.stats) == len(features) + 1, "unexpected num of feature stats"

    df = resp.to_dataframe()
    columns = trades.shape[1] + len(features) + 1
    assert df.shape[1] == columns, "unexpected num of returned df columns"
    resp.to_parquet(results_dir + "query.parquet")

    # test real-time query
    vector = fs.FeatureVector("my-vec", features)
    svc = fs.get_online_feature_service(vector)

    resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
    print(resp)
    resp = svc.get([{"ticker": "AAPL"}])
    assert (resp[0]["ticker"] == "AAPL"
            and resp[0]["exchange"] == "NASDAQ"), "unexpected online result"
    svc.close()
Example #19
    def test_schedule_on_filtered_by_time(self, partitioned):
        name = f"sched-time-{str(partitioned)}"

        now = datetime.now()

        path = "v3io:///bigdata/bla.parquet"
        fsys = fsspec.filesystem(v3iofs.fs.V3ioFS.protocol)
        pd.DataFrame({
            "time": [
                pd.Timestamp("2021-01-10 10:00:00"),
                pd.Timestamp("2021-01-10 11:00:00"),
            ],
            "first_name": ["moshe", "yosi"],
            "data": [2000, 10],
        }).to_parquet(path=path, filesystem=fsys)

        cron_trigger = "*/2 * * * *"

        source = ParquetSource("myparquet",
                               path=path,
                               time_field="time",
                               schedule=cron_trigger)

        feature_set = fs.FeatureSet(
            name=name,
            entities=[fs.Entity("first_name")],
            timestamp_key="time",
            engine="spark",
        )

        if partitioned:
            targets = [
                NoSqlTarget(),
                ParquetTarget(
                    name="tar1",
                    path="v3io:///bigdata/fs1/",
                    partitioned=True,
                    partition_cols=["time"],
                ),
            ]
        else:
            targets = [
                ParquetTarget(name="tar2",
                              path="v3io:///bigdata/fs2/",
                              partitioned=False),
                NoSqlTarget(),
            ]

        fs.ingest(
            feature_set,
            source,
            run_config=fs.RunConfig(local=False),
            targets=targets,
            spark_context=self.spark_service,
        )
        # ingest starts every second minute and it takes ~90 seconds to finish.
        if (now.minute % 2) == 0:
            sleep(60 - now.second + 60 + 90)
        else:
            sleep(60 - now.second + 90)

        features = [f"{name}.*"]
        vec = fs.FeatureVector("sched_test-vec", features)

        svc = fs.get_online_feature_service(vec)

        resp = svc.get([{"first_name": "yosi"}, {"first_name": "moshe"}])
        assert resp[0]["data"] == 10
        assert resp[1]["data"] == 2000

        pd.DataFrame({
            "time": [
                pd.Timestamp("2021-01-10 12:00:00"),
                pd.Timestamp("2021-01-10 13:00:00"),
                now + pd.Timedelta(minutes=10),
                pd.Timestamp("2021-01-09 13:00:00"),
            ],
            "first_name": ["moshe", "dina", "katya", "uri"],
            "data": [50, 10, 25, 30],
        }).to_parquet(path=path)

        sleep(120)
        resp = svc.get([
            {"first_name": "yosi"},
            {"first_name": "moshe"},
            {"first_name": "katya"},
            {"first_name": "dina"},
            {"first_name": "uri"},
        ])
        assert resp[0]["data"] == 10
        assert resp[1]["data"] == 50
        assert resp[2] is None
        assert resp[3]["data"] == 10
        assert resp[4] is None

        svc.close()

        # check offline
        resp = fs.get_offline_features(vec)
        df = resp.to_dataframe()
        assert len(df) == 4
        assert "uri" not in df and "katya" not in df
Example #20
def feature_selection(context,
                      df_artifact,
                      k: int = 5,
                      min_votes: float = 0.5,
                      label_column: str = None,
                      stat_filters: list = ['f_classif', 'mutual_info_classif', 'chi2', 'f_regression'],
                      model_filters: dict = {'LinearSVC': 'LinearSVC',
                                             'LogisticRegression': 'LogisticRegression',
                                             'ExtraTreesClassifier': 'ExtraTreesClassifier'},
                      max_scaled_scores: bool = True,
                      sample_ratio: float = None,
                      output_vector_name: str = None,
                      ignore_type_errors: bool = False,
                      is_feature_vector: bool = False):
    """Applies selected feature selection statistical functions
    or models on our 'df_artifact'.

    Each statistical function or model will vote for it's best K selected features.
    If a feature has >= 'min_votes' votes, it will be selected.

    :param context:           the function context.
    
    :param k:                 number of top features to select from each statistical
                              function or model.
                              
    :param min_votes:         minimal number of votes (from a model or by statistical
                              function) needed for a feature to be selected.
                              Can be specified by percentage of votes or absolute
                              number of votes.
                              
    :param label_column:      ground-truth (y) labels.
    
    :param stat_filters:      statistical functions to apply to the features
                              (from sklearn.feature_selection).
                              
    :param model_filters:     models to use for feature evaluation, can be specified by
                              model name (ex. LinearSVC), formalized json (contains 'CLASS',
                              'FIT', 'META') or a path to such json file.
                              
    :param max_scaled_scores: produce feature scores table scaled with max_scaler.

    :param sample_ratio: percentage of the dataset the user whishes to compute the feature selection process on.
    
    :param output_vector_name: creates a new feature vector containing only the identifies features.
    
    :param ignore_type_errors: skips datatypes that are neither float or int within the feature vector.
    
    :param is_feature_vector: bool stating if the data is passed as a feature vector.
    """
        
    # Check if df.meta is valid, if it is, look for a feature vector
    if df_artifact.meta:
        if df_artifact.meta.kind == mlrun.api.schemas.ObjectKind.feature_vector:
            is_feature_vector = True
    
    # Look inside meta.spec.label_feature to identify the label_column if the user did not specify it
    if label_column is None:
        if is_feature_vector:
            label_column = df_artifact.meta.spec.label_feature.split('.')[1]
        else:
            raise ValueError('No label_column was given, please add a label_column.')
    
    # Use the feature vector as dataframe
    df = df_artifact.as_df()
    
    # Ensure k is not bigger than the total number of features
    if k > df.shape[1]:
        raise ValueError(f'K cannot be bigger than the total number of features ({df.shape[1]}). Please choose a smaller K.')
    elif k < 1:
        raise ValueError('K cannot be smaller than 1. Please choose a bigger K.')
        
    # Create a sample dataframe of the original feature vector
    if sample_ratio:
        df = df.groupby(label_column).apply(lambda x: x.sample(frac=sample_ratio)).reset_index(drop=True)
        df = df.dropna()
        
    # Set feature vector and labels
    y = df.pop(label_column)
    X = df
    
    if object in list(X.dtypes) and ignore_type_errors is False:
        raise ValueError(f"{df.select_dtypes(include=['object']).columns.tolist()} are neither float nor int.")
        
    # Create selected statistical estimators
    stat_functions_list = {stat_name: SelectKBest(create_class(f'sklearn.feature_selection.{stat_name}'), k=k)
                           for stat_name in stat_filters}
    # Statistical functions that require non-negative input values
    requires_abs = ['chi2']

    # Run statistic filters
    selected_features_agg = {}
    stats_df = pd.DataFrame(index=X.columns).dropna()
                
    for stat_name, stat_func in stat_functions_list.items():
        try:
            # chi2 requires non-negative input, so apply abs(X) only for the stats that need it
            params = (abs(X), y) if stat_name in requires_abs else (X, y)
            stat = stat_func.fit(*params)

            # Collect stat function results
            stat_df = pd.DataFrame(index=X.columns,
                                   columns=[stat_name],
                                   data=stat.scores_)
            plot_stat(context, stat_name, stat_df)
            stats_df = stats_df.join(stat_df)

            # Select K Best features
            selected_features = X.columns[stat_func.get_support()]
            selected_features_agg[stat_name] = selected_features
            
        except Exception as e:
            context.logger.info(f"Couldn't calculate {stat_name} because of: {e}")

    # Create models from class name / json file / json params
    all_sklearn_estimators = dict(all_estimators()) if len(model_filters) > 0 else {}
    selected_models = {}
    for model_name, model in model_filters.items():
        if isinstance(model, str) and '.json' in model:
            with open(model, 'r') as model_file:
                current_model = json.load(model_file)
            ClassifierClass = create_class(current_model["META"]["class"])
            selected_models[model_name] = ClassifierClass(**current_model["CLASS"])
        elif model in all_sklearn_estimators:
            selected_models[model_name] = all_sklearn_estimators[model]()
        else:
            try:
                # parse a json string, otherwise assume the value is already a dict of params
                current_model = json.loads(model) if isinstance(model, str) else model
                ClassifierClass = create_class(current_model["META"]["class"])
                selected_models[model_name] = ClassifierClass(**current_model["CLASS"])
            except Exception:
                context.logger.info(f'unable to load {model}')

    # Run model filters
    models_df = pd.DataFrame(index=X.columns)
    for model_name, model in selected_models.items():
        if model_name == 'LogisticRegression':
            model.set_params(solver='liblinear')

        # Train model and get feature importance
        select_from_model = SelectFromModel(model).fit(X, y)
        feature_idx = select_from_model.get_support()
        feature_names = X.columns[feature_idx]
        selected_features_agg[model_name] = feature_names.tolist()

        # Collect model feature importance (coefficients for linear models,
        # impurity-based importances for tree ensembles)
        if hasattr(select_from_model.estimator_, 'coef_'):
            scores = select_from_model.estimator_.coef_[0]
        elif hasattr(select_from_model.estimator_, 'feature_importances_'):
            scores = select_from_model.estimator_.feature_importances_

        stat_df = pd.DataFrame(index=X.columns,
                               columns=[model_name],
                               data=scores)
        models_df = models_df.join(stat_df)

        plot_stat(context, model_name, stat_df)

    # Create feature_scores DF with stat & model filters scores
    result_matrix_df = pd.concat([stats_df, models_df], axis=1, sort=False)
    context.log_dataset(key='feature_scores',
                        df=result_matrix_df,
                        local_path='feature_scores.parquet',
                        format='parquet')
    if max_scaled_scores:
        normalized_df = result_matrix_df.replace([np.inf, -np.inf], np.nan).values
        min_max_scaler = MinMaxScaler()
        normalized_df = min_max_scaler.fit_transform(normalized_df)
        normalized_df = pd.DataFrame(data=normalized_df,
                                     columns=result_matrix_df.columns,
                                     index=result_matrix_df.index)
        context.log_dataset(key='max_scaled_scores_feature_scores',
                            df=normalized_df,
                            local_path='max_scaled_scores_feature_scores.parquet',
                            format='parquet')

    # Create feature count DataFrame
    for test_name in selected_features_agg:
        result_matrix_df[test_name] = [1 if x in selected_features_agg[test_name] else 0 for x in X.columns]
    result_matrix_df.loc[:, 'num_votes'] = result_matrix_df.sum(axis=1)
    context.log_dataset(key='selected_features_count',
                        df=result_matrix_df,
                        local_path='selected_features_count.parquet',
                        format='parquet')

    # How many votes are needed for a feature to be selected?
    if isinstance(min_votes, int):
        votes_needed = min_votes
    else:
        num_filters = len(stat_filters) + len(model_filters)
        votes_needed = int(np.floor(num_filters * max(min(min_votes, 1), 0)))
    context.logger.info(f'votes needed to be selected: {votes_needed}')

    # Create final feature dataframe
    selected_features = result_matrix_df[result_matrix_df.num_votes >= votes_needed].index.tolist()
    good_feature_df = df.loc[:, selected_features]
    final_df = pd.concat([good_feature_df, y], axis=1)
    context.log_dataset(key='selected_features',
                        df=final_df,
                        local_path='selected_features.parquet',
                        format='parquet')
    
    # Creating a new feature vector containing only the identified top features
    if is_feature_vector and df_artifact.meta.spec.features and output_vector_name:

        # Selecting the top K features from our top feature dataframe
        selected_features = result_matrix_df.head(k).index

        # Match the selected feature names to the FS Feature annotations
        matched_selections = [
            feature
            for feature in list(df_artifact.meta.spec.features)
            for selected in list(selected_features)
            if feature.endswith(selected)
        ]

        # Defining our new feature vector
        top_features_fv = fs.FeatureVector(output_vector_name,
                                           matched_selections,
                                           label_feature="labels.label",
                                           description='feature vector composed strictly of our top features')

        # Saving
        top_features_fv.save()
        fs.get_offline_features(top_features_fv, target=ParquetTarget())

        # Logging our new feature vector URI
        context.log_result('top_features_vector', top_features_fv.uri)
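
For context, here is a minimal sketch of how a handler like feature_selection might be launched through MLRun. The file name, image, store URI and parameter values are illustrative assumptions, not part of the original function:

    import mlrun

    # package the (hypothetical) file containing the handler above as an MLRun job
    fn = mlrun.code_to_function(
        name="feature-selection",
        filename="feature_selection.py",   # assumed file name
        kind="job",
        image="mlrun/mlrun",
        handler="feature_selection",
    )

    run = fn.run(
        inputs={"df_artifact": "store://feature-vectors/my-project/my-vector"},  # assumed store URI
        params={
            "k": 5,
            "min_votes": 0.5,
            "label_column": "label",
            "output_vector_name": "top-features-vector",
        },
        local=True,
    )

    # the handler logs the new feature vector URI as a run result
    print(run.outputs.get("top_features_vector"))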
Example #21
    def test_ingest_with_column_conversion(self):
        orig_df = source = pd.DataFrame(
            {
                "time_stamp": [
                    pd.Timestamp("2002-04-01 04:32:34.000"),
                    pd.Timestamp("2002-04-01 15:05:37.000"),
                    pd.Timestamp("2002-03-31 23:46:07.000"),
                ],
                "ssrxbtok": [488441267876, 438975336749, 298802679370],
                "nkxuonfx": [0.241233, 0.160264, 0.045345],
                "xzvipbmo": [True, False, None],
                "bikyseca": ["ONE", "TWO", "THREE"],
                "napxsuhp": [True, False, True],
                "oegndrxe": [
                    pd.Timestamp("2002-04-01 04:32:34.000"),
                    pd.Timestamp("2002-04-01 05:06:34.000"),
                    pd.Timestamp("2002-04-01 05:38:34.000"),
                ],
                "aatxnkgx": [-227504700006, -470002151801, -33193685176],
                "quupyoxi": ["FOUR", "FIVE", "SIX"],
                "temdojgz": [0.570031, 0.677182, 0.276053],
            },
            index=None,
        )

        fset = fs.FeatureSet(
            "rWQTKqbhje",
            timestamp_key="time_stamp",
            entities=[
                Entity("{}".format(k["name"])) for k in [
                    {
                        "dtype": "float",
                        "null_values": False,
                        "name": "temdojgz",
                        "df_dtype": "float64",
                    },
                    {
                        "dtype": "str",
                        "null_values": False,
                        "name": "bikyseca",
                        "df_dtype": "object",
                    },
                    {
                        "dtype": "float",
                        "null_values": False,
                        "name": "nkxuonfx",
                        "df_dtype": "float64",
                    },
                ]
            ],
        )

        fset.graph.to(name="s1", handler="my_func")
        ikjqkfcz = ParquetTarget(path="v3io:///bigdata/ifrlsjvxgv",
                                 partitioned=False)
        fs.ingest(fset, source, targets=[ikjqkfcz])

        features = ["rWQTKqbhje.*"]
        vector = fs.FeatureVector("WPAyrYux", features)
        vector.spec.with_indexes = False
        resp = fs.get_offline_features(vector)
        off_df = resp.to_dataframe()
        del orig_df["time_stamp"]
        if None in list(orig_df.index.names):
            orig_df.set_index(["temdojgz", "bikyseca", "nkxuonfx"],
                              inplace=True)
        orig_df = orig_df.sort_values(
            by=["temdojgz", "bikyseca", "nkxuonfx"]).sort_index(axis=1)
        off_df = off_df.sort_values(
            by=["temdojgz", "bikyseca", "nkxuonfx"]).sort_index(axis=1)
        pd.testing.assert_frame_equal(
            off_df,
            orig_df,
            check_dtype=True,
            check_index_type=True,
            check_column_type=True,
            check_like=True,
            check_names=True,
        )
Example #22
    def test_ingest_partitioned_by_key_and_time(
        self, key_bucketing_number, partition_cols, time_partitioning_granularity
    ):
        key = "patient_id"
        name = f"measurements_{uuid.uuid4()}"
        measurements = fs.FeatureSet(name, entities=[Entity(key)])
        source = CSVSource(
            "mycsv",
            path=os.path.relpath(str(self.assets_path / "testdata.csv")),
            time_field="timestamp",
        )
        measurements.set_targets(
            targets=[
                ParquetTarget(
                    partitioned=True,
                    key_bucketing_number=key_bucketing_number,
                    partition_cols=partition_cols,
                    time_partitioning_granularity=time_partitioning_granularity,
                )
            ],
            with_defaults=False,
        )
        resp1 = fs.ingest(measurements, source)

        features = [
            f"{name}.*",
        ]
        vector = fs.FeatureVector("myvector", features)
        resp = fs.get_offline_features(vector)
        resp2 = resp.to_dataframe()

        assert resp1.to_dict() == resp2.to_dict()

        file_system = fsspec.filesystem("v3io")
        kind = TargetTypes.parquet
        path = f"{get_default_prefix_for_target(kind)}/sets/{name}-latest"
        path = path.format(name=name, kind=kind, project="system-test-project")
        dataset = pq.ParquetDataset(path, filesystem=file_system,)
        partitions = [key for key, _ in dataset.pieces[0].partition_keys]

        if key_bucketing_number is None:
            expected_partitions = []
        elif key_bucketing_number == 0:
            expected_partitions = ["igzpart_key"]
        else:
            expected_partitions = [f"igzpart_hash{key_bucketing_number}_key"]
        expected_partitions += partition_cols or []
        if all(
            value is None
            for value in [
                key_bucketing_number,
                partition_cols,
                time_partitioning_granularity,
            ]
        ):
            time_partitioning_granularity = "hour"
        if time_partitioning_granularity:
            for unit in ["year", "month", "day", "hour"]:
                expected_partitions.append(f"igzpart_{unit}")
                if unit == time_partitioning_granularity:
                    break

        assert partitions == expected_partitions

        resp = fs.get_offline_features(
            vector,
            start_time=datetime(2020, 12, 1, 17, 33, 15),
            end_time=datetime(2020, 12, 1, 17, 33, 16),
            entity_timestamp_column="timestamp",
        )
        resp2 = resp.to_dataframe()
        assert len(resp2) == 10