Example #1
File: metrics.py  Project: hirzel/lale
    def to_monoid(self, batch) -> _DIorSPDData:
        if len(batch) == 2:  # a batch is either (X, y_pred) or (y_true, y_pred, X)
            X, y_pred = batch
            y_true = None
        else:
            y_true, y_pred, X = batch
        assert y_pred is not None and X is not None, batch
        y_pred = self._y_pred_series(y_true, y_pred, X)
        encoded_X, y_pred = self.prot_attr_enc.transform(X, y_pred)
        df = pd.concat([encoded_X, y_pred], axis=1)
        pa_names = self.privileged_groups[0].keys()
        # Count rows for every combination of protected-attribute values and
        # predicted label; the result is indexed by those group keys.
        pipeline = GroupBy(
            by=[it[pa] for pa in pa_names] + [it[y_pred.name]]
        ) >> Aggregate(columns={"count": count(it[y_pred.name])})
        agg_df = pipeline.transform(df)

        def count2(priv, fav):
            # Read one cell of the grouped counts; absent groups count as 0.
            row = (priv, ) * len(pa_names) + (fav, )
            return agg_df.at[row, "count"] if row in agg_df.index else 0

        return _DIorSPDData(
            priv0_fav0=count2(priv=0, fav=0),
            priv0_fav1=count2(priv=0, fav=1),
            priv1_fav0=count2(priv=1, fav=0),
            priv1_fav1=count2(priv=1, fav=1),
        )
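The GroupBy >> Aggregate pipeline above behaves much like a pandas groupby; a minimal sketch with hypothetical data and a single protected attribute "pa" mirrors the count2 lookup:

import pandas as pd

df = pd.DataFrame({"pa": [0, 0, 1, 1, 1], "y_pred": [0, 1, 0, 1, 1]})
agg_df = df.groupby(["pa", "y_pred"]).agg(count=("y_pred", "size"))
# agg_df is indexed by (pa, y_pred) tuples, so one cell reads as:
priv1_fav1 = agg_df.at[(1, 1), "count"] if (1, 1) in agg_df.index else 0  # -> 2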
Example #2
 def _lift(X, hyperparams):
     feature_names_in = X.columns
     # Row count of this batch, computed via the first column.
     count_op = Aggregate(columns={"count": count(it[feature_names_in[0]])})
     count_data = lale.helpers._ensure_pandas(count_op.transform(X))
     n_samples_seen = count_data.loc[0, "count"]
     if hyperparams["with_mean"] or hyperparams["with_std"]:
         sum1_op = Aggregate(
             columns={c: agg_sum(it[c])
                      for c in feature_names_in})
         sum1_data = lale.helpers._ensure_pandas(sum1_op.transform(X))
         sum1 = [sum1_data.loc[0, c] for c in feature_names_in]
     else:
         sum1 = None
     if hyperparams["with_std"]:
         sum2_op = Map(
             columns={c: it[c] * it[c]
                      for c in feature_names_in}
         ) >> Aggregate(
             columns={c: agg_sum(it[c])
                      for c in feature_names_in})
         sum2_data = lale.helpers._ensure_pandas(sum2_op.transform(X))
         sum2 = [sum2_data.loc[0, c] for c in feature_names_in]
     else:
         sum2 = None
     return feature_names_in, n_samples_seen, sum1, sum2
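The lifted triple (n_samples_seen, sum1, sum2) is all the scaler needs to finish its statistics later. A sketch of that finishing step under the usual one-pass identities (the finish name is illustrative, not lale's API):

import numpy as np

def finish(n_samples_seen, sum1, sum2):
    mean = np.array(sum1) / n_samples_seen           # E[x]
    var = np.array(sum2) / n_samples_seen - mean**2  # E[x^2] - E[x]^2
    return mean, np.sqrt(var)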
Example #3
 def to_monoid(self, v):
     X, _ = v
     hyperparams = self._hyperparams
     feature_names_in = get_columns(X)
     count_op = Aggregate(columns={"count": count(it[feature_names_in[0]])})
     count_data = lale.helpers._ensure_pandas(count_op.transform(X))
     n_samples_seen = count_data.loc[0, "count"]
     if hyperparams["with_mean"] or hyperparams["with_std"]:
         sum1_op = Aggregate(columns={c: agg_sum(it[c]) for c in feature_names_in})
         sum1_data = lale.helpers._ensure_pandas(sum1_op.transform(X))
         sum1 = [sum1_data.loc[0, c] for c in feature_names_in]
     else:
         sum1 = None
     if hyperparams["with_std"]:
         sum2_op = Map(
             columns={c: it[c] * it[c] for c in feature_names_in}
         ) >> Aggregate(columns={c: agg_sum(it[c]) for c in feature_names_in})
         sum2_data = lale.helpers._ensure_pandas(sum2_op.transform(X))
         sum2 = [sum2_data.loc[0, c] for c in feature_names_in]
     else:
         sum2 = None
     return _StandardScalerMonoid(
         feature_names_in_=feature_names_in,
         n_samples_seen_=n_samples_seen,
         _sum1=sum1,
         _sum2=sum2,
     )
Example #4
File: test_relational.py  Project: IBM/lale
 def test_fit_error(self):
     relational = Relational(operator=(
         Scan(table=it.main) & Scan(table=it.delay)) >> Join(pred=[
             it.main.TrainId == it.delay.TrainId,
             it.main["Arrival time"] >= it.delay.TimeStamp,
         ]) >> Aggregate(columns=[count(it.Delay)], group_by=it.MessageId))
     with self.assertRaises(ValueError):
         _ = relational.fit([self.X_train], self.y_train)  # note the extra list around X_train
Example #5
File: test_relational.py  Project: IBM/lale
 def test_fit_transform(self):
     relational = Relational(operator=(
         Scan(table=it.main) & Scan(table=it.delay)) >> Join(pred=[
             it.main.TrainId == it.delay.TrainId,
             it.main["Arrival time"] >= it.delay.TimeStamp,
         ]) >> Aggregate(columns=[count(it.Delay)], group_by=it.MessageId))
     trained_relational = relational.fit(self.X_train, self.y_train)
     _ = trained_relational.transform(self.X_test)
Example #6
File: test_relational.py  Project: IBM/lale
 def test_fit_transform_in_pipeline(self):
     relational = Relational(operator=(
         Scan(table=it.main) & Scan(table=it.delay)) >> Join(pred=[
             it.main.TrainId == it.delay.TrainId,
             it.main["Arrival time"] >= it.delay.TimeStamp,
         ]) >> Aggregate(columns=[count(it.Delay)], group_by=it.MessageId))
     pipeline = relational >> LogisticRegression()
     trained_pipeline = pipeline.fit(self.X_train, self.y_train)
     _ = trained_pipeline.predict(self.X_test)
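These tests assume self.X_train is a collection of named tables. A sketch of such a fixture, assuming the add_table_name helper from lale.datasets.data_schemas attaches the names that Scan(table=it.main) and Scan(table=it.delay) look up:

import pandas as pd
from lale.datasets.data_schemas import add_table_name

main = add_table_name(pd.DataFrame(
    {"TrainId": [1, 2], "MessageId": [10, 11], "Arrival time": [5.0, 7.0]}), "main")
delay = add_table_name(pd.DataFrame(
    {"TrainId": [1, 2], "TimeStamp": [4.0, 8.0], "Delay": [0.5, 1.5]}), "delay")
X_train = [main, delay]  # hypothetical stand-in for self.X_train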
Example #7
File: metrics.py  Project: hirzel/lale
    def __init__(self):
        from lale.lib.rasl.concat_features import ConcatFeatures

        self._pipeline_suffix = (
            ConcatFeatures
            >> Map(columns={"match": astype("int", it.y_true == it.y_pred)})  # type: ignore
            >> Aggregate(columns={"match": sum(it.match), "total": count(it.match)})
        )
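Given the two aggregates, accuracy finishes as match / total. A sketch on a hypothetical one-row result of the suffix pipeline:

import pandas as pd

agg = pd.DataFrame({"match": [42], "total": [50]})  # hypothetical aggregate output
accuracy = agg.at[0, "match"] / agg.at[0, "total"]  # -> 0.84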
Example #8
File: metrics.py  Project: hirzel/lale
    def __init__(self):
        from lale.lib.rasl.concat_features import ConcatFeatures

        self._pipeline_suffix = (
            ConcatFeatures
            >> Map(columns={
                "y": it.y_true,  # observed values
                "f": it.y_pred,  # predicted values
                "y2": it.y_true * it.y_true,  # squares
                "e2": (it.y_true - it.y_pred) * (it.y_true - it.y_pred),  # type: ignore  # squared errors
            })
            >> Aggregate(columns={
                "n": count(it.y),
                "sum": sum(it.y),
                "sum_sq": sum(it.y2),
                "res_sum_sq": sum(it.e2),  # residual sum of squares
            })
        )
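From these four aggregates, R^2 finishes without revisiting the data, because the total sum of squares equals sum_sq - sum^2 / n. A sketch with hypothetical values:

import pandas as pd

agg = pd.DataFrame({"n": [4], "sum": [10.0], "sum_sq": [30.0], "res_sum_sq": [1.0]})
n, s = agg.at[0, "n"], agg.at[0, "sum"]
tss = agg.at[0, "sum_sq"] - (s * s) / n  # total sum of squares: 30 - 25 = 5
r2 = 1.0 - agg.at[0, "res_sum_sq"] / tss  # -> 0.8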
Example #9
File: metrics.py  Project: hirzel/lale
    def to_monoid(self, batch) -> _AODorEODData:
        if len(batch) == 2:
            X, y_pred = batch
            y_true = None
        else:
            y_true, y_pred, X = batch
        assert y_pred is not None and X is not None, batch
        y_pred = self._y_pred_series(y_true, y_pred, X)
        encoded_X, y_pred = self.prot_attr_enc.transform(X, y_pred)

        def is_fresh(col_name):
            assert y_true is not None and isinstance(y_true, pd.Series), batch
            return col_name not in encoded_X.columns and col_name != y_true.name

        if is_fresh("y_pred"):
            y_pred_name = "y_pred"
        else:
            y_pred_name = next(f"y_pred_{i}" for i in itertools.count(0)
                               if is_fresh(f"y_pred_{i}"))
        y_pred = pd.Series(y_pred, y_pred.index, name=y_pred_name)
        _, y_true = self.prot_attr_enc.transform(X, y_true)
        df = pd.concat([y_true, y_pred, encoded_X], axis=1)
        pa_names = self.privileged_groups[0].keys()
        # Count rows for every (true label, predicted label, protected attrs) combination.
        pipeline = GroupBy(
            by=[it[y_true.name], it[y_pred_name]] + [it[pa] for pa in pa_names]
        ) >> Aggregate(columns={"count": count(it[y_pred_name])})
        agg_df = pipeline.transform(df)

        def count3(tru, pred, priv):
            # Absent groups contribute a count of 0.
            row = (tru, pred) + (priv, ) * len(pa_names)
            return agg_df.at[row, "count"] if row in agg_df.index else 0

        return _AODorEODData(
            tru0_pred0_priv0=count3(tru=0, pred=0, priv=0),
            tru0_pred0_priv1=count3(tru=0, pred=0, priv=1),
            tru0_pred1_priv0=count3(tru=0, pred=1, priv=0),
            tru0_pred1_priv1=count3(tru=0, pred=1, priv=1),
            tru1_pred0_priv0=count3(tru=1, pred=0, priv=0),
            tru1_pred0_priv1=count3(tru=1, pred=0, priv=1),
            tru1_pred1_priv0=count3(tru=1, pred=1, priv=0),
            tru1_pred1_priv1=count3(tru=1, pred=1, priv=1),
        )
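The is_fresh loop above is a small fresh-name generator. Isolated, with a hypothetical set of taken names, the pattern reads:

import itertools

taken = {"y_pred", "y_pred_0"}  # hypothetical names already in use
name = "y_pred" if "y_pred" not in taken else next(
    f"y_pred_{i}" for i in itertools.count(0) if f"y_pred_{i}" not in taken)
# name == "y_pred_1", the first candidate that collides with nothing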
Example #10
 def _lift(X, hyperparams):
     feature_names_in_ = get_columns(X)
     strategy = hyperparams["strategy"]
     if strategy == "constant":
         fill_value = _SimpleImputerImpl._get_fill_value(X, hyperparams)
         agg_data = [[fill_value for col in get_columns(X)]]
         lifted_statistics = pd.DataFrame(agg_data, columns=get_columns(X))
     elif strategy == "mean":
         agg_op_sum = Aggregate(
             columns={c: sum(it[c]) for c in get_columns(X)},
             exclude_value=hyperparams["missing_values"],
         )
         agg_op_count = Aggregate(
             columns={c: count(it[c]) for c in get_columns(X)},
             exclude_value=hyperparams["missing_values"],
         )
         lifted_statistics = {}
         # Collect Spark results to pandas so the lifted values are local.
         agg_sum = agg_op_sum.transform(X)
         if agg_sum is not None and _is_spark_df(agg_sum):
             agg_sum = agg_sum.toPandas()
         agg_count = agg_op_count.transform(X)
         if agg_count is not None and _is_spark_df(agg_count):
             agg_count = agg_count.toPandas()
         lifted_statistics["sum"] = agg_sum
         lifted_statistics["count"] = agg_count
     else:
         raise ValueError(
             "_lift is only supported for the imputation strategies `mean` and `constant`."
         )
     return (
         feature_names_in_,
         lifted_statistics,
         strategy,
     )  # strategy is added so that _combine can use it
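Returning raw sums and counts rather than means pays off when combining batches: both statistics merge by plain addition. A sketch of what the corresponding _combine step could look like for strategy "mean" (names and shapes assumed, not lale's actual code):

def combine_mean_statistics(lifted_a, lifted_b):
    # Sums and counts form a monoid under elementwise addition,
    # so two batches merge without revisiting their rows.
    return {
        "sum": lifted_a["sum"] + lifted_b["sum"],
        "count": lifted_a["count"] + lifted_b["count"],
    }

Once every batch is merged, the mean fill values are simply sum / count.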
Example #11
File: test_relational.py  Project: IBM/lale
    def test_with_hyperopt2(self):
        from lale.expressions import (
            count,
            it,
            max,
            mean,
            min,
            string_indexer,
            sum,
            variance,
        )

        wrap_imported_operators()
        scan = Scan(table=it["main"])
        scan_0 = Scan(table=it["customers"])
        join = Join(pred=[(it["main"]["group_customer_id"] == it["customers"]
                           ["group_customer_id"])])
        map = Map(
            columns={
                "[main](group_customer_id)[customers]|number_children|identity": it["number_children"],
                "[main](group_customer_id)[customers]|name|identity": it["name"],
                "[main](group_customer_id)[customers]|income|identity": it["income"],
                "[main](group_customer_id)[customers]|address|identity": it["address"],
                "[main](group_customer_id)[customers]|age|identity": it["age"],
            },
            remainder="drop",
        )
        pipeline_4 = join >> map
        scan_1 = Scan(table=it["purchase"])
        join_0 = Join(
            pred=[(it["main"]["group_id"] == it["purchase"]["group_id"])],
            join_limit=50.0,
        )
        aggregate = Aggregate(
            columns={
                "[main](group_id)[purchase]|price|variance": variance(it["price"]),
                "[main](group_id)[purchase]|time|sum": sum(it["time"]),
                "[main](group_id)[purchase]|time|mean": mean(it["time"]),
                "[main](group_id)[purchase]|time|min": min(it["time"]),
                "[main](group_id)[purchase]|price|sum": sum(it["price"]),
                "[main](group_id)[purchase]|price|count": count(it["price"]),
                "[main](group_id)[purchase]|price|mean": mean(it["price"]),
                "[main](group_id)[purchase]|price|min": min(it["price"]),
                "[main](group_id)[purchase]|price|max": max(it["price"]),
                "[main](group_id)[purchase]|time|max": max(it["time"]),
                "[main](group_id)[purchase]|time|variance": variance(it["time"]),
            },
            group_by=it["row_id"],
        )
        pipeline_5 = join_0 >> aggregate
        map_0 = Map(
            columns={
                "[main]|group_customer_id|identity": it["group_customer_id"],
                "[main]|transaction_id|identity": it["transaction_id"],
                "[main]|group_id|identity": it["group_id"],
                "[main]|comments|identity": it["comments"],
                "[main]|id|identity": it["id"],
                "prefix_0_id": it["prefix_0_id"],
                "next_purchase": it["next_purchase"],
                "[main]|time|identity": it["time"],
            },
            remainder="drop",
        )
        scan_2 = Scan(table=it["transactions"])
        scan_3 = Scan(table=it["products"])
        join_1 = Join(pred=[
            (it["main"]["transaction_id"] == it["transactions"]["transaction_id"]),
            (it["transactions"]["product_id"] == it["products"]["product_id"]),
        ])
        map_1 = Map(
            columns={
                "[main](transaction_id)[transactions](product_id)[products]|price|identity": it["price"],
                "[main](transaction_id)[transactions](product_id)[products]|type|identity": it["type"],
            },
            remainder="drop",
        )
        pipeline_6 = join_1 >> map_1
        join_2 = Join(pred=[(it["main"]["transaction_id"] == it["transactions"]
                             ["transaction_id"])])
        map_2 = Map(
            columns={
                "[main](transaction_id)[transactions]|description|identity": it["description"],
                "[main](transaction_id)[transactions]|product_id|identity": it["product_id"],
            },
            remainder="drop",
        )
        pipeline_7 = join_2 >> map_2
        map_3 = Map(columns=[
            string_indexer(it["[main]|comments|identity"]),
            string_indexer(it["[main](transaction_id)[transactions]|description|identity"]),
            string_indexer(it["[main](transaction_id)[transactions](product_id)[products]|type|identity"]),
            string_indexer(it["[main](group_customer_id)[customers]|name|identity"]),
            string_indexer(it["[main](group_customer_id)[customers]|address|identity"]),
        ])
        pipeline_8 = ConcatFeatures() >> map_3
        relational = Relational(operator=make_pipeline_graph(
            steps=[
                scan,
                scan_0,
                pipeline_4,
                scan_1,
                pipeline_5,
                map_0,
                scan_2,
                scan_3,
                pipeline_6,
                pipeline_7,
                pipeline_8,
            ],
            edges=[
                (scan, pipeline_4),
                (scan, pipeline_5),
                (scan, map_0),
                (scan, pipeline_6),
                (scan, pipeline_7),
                (scan_0, pipeline_4),
                (pipeline_4, pipeline_8),
                (scan_1, pipeline_5),
                (pipeline_5, pipeline_8),
                (map_0, pipeline_8),
                (scan_2, pipeline_6),
                (scan_2, pipeline_7),
                (scan_3, pipeline_6),
                (pipeline_6, pipeline_8),
                (pipeline_7, pipeline_8),
            ],
        ))
        pipeline = relational >> (KNeighborsClassifier | LogisticRegression)
        from sklearn.datasets import load_iris

        X, y = load_iris(return_X_y=True)
        from lale.lib.lale import Hyperopt

        opt = Hyperopt(estimator=pipeline, max_evals=2)
        opt.fit(X, y)