Example #1
    def update(self, x, y_pred, centers, sample_weight=1.0):

        self._minimum_separation = self._find_minimum_separation(centers)

        self._center_centers = {i: stats.Mean() for i in x}

        for i in self._center_centers:
            for j in centers:
                self._center_centers[i].update(centers[j][i], w=sample_weight)

        center_centers = {
            i: self._center_centers[i].get()
            for i in self._center_centers
        }
        beta_t = stats.Mean()
        for i in centers:
            beta_t.update(
                utils.math.minkowski_distance(centers[i], center_centers, 2))
        self._beta_t = beta_t.get()

        try:
            self._n_points_by_cluster[y_pred] += 1
        except KeyError:
            self._n_points_by_cluster[y_pred] = 1

        self._n_clusters = len(centers)

        return self
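
A minimal sketch of the per-feature running-mean pattern used above, with made-up feature values:

from river import stats

x = {"x1": 2.0, "x2": 6.0}
center = {i: stats.Mean() for i in x}           # one running mean per feature
for i in center:
    center[i].update(x[i], w=0.5)               # weighted update, as in the snippet
print({i: center[i].get() for i in center})     # {'x1': 2.0, 'x2': 6.0}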
Example #2
File: ssb.py  Project: AdilZouitine/creme
    def update(self, x, y_pred, centers, sample_weight=1.0):

        if not self._initialized:
            self._center_all_points = {i: stats.Mean() for i in x}
            self._initialized = True

        for i in self._center_all_points:
            self._center_all_points[i].update(x[i], w=sample_weight)
        center_all_points = {
            i: self._center_all_points[i].get()
            for i in self._center_all_points
        }

        self._n_points += 1

        try:
            self._n_points_by_clusters[y_pred] += 1
        except KeyError:
            self._n_points_by_clusters[y_pred] = 1

        for i in centers:
            self._squared_distances[i] = utils.math.minkowski_distance(
                centers[i], center_all_points, 2)

        return self
Example #3
File: i_index.py  Project: Leo-VK/creme
    def update(self, x, y_pred, centers, sample_weight=1.0):

        self._furthest_cluster_distance = self._find_furthest_cluster_distance(centers)

        if not self._initialized:
            self._center_all_points = {i: stats.Mean() for i in x}
            self._dim = len(x)
            self._initialized = True

        for i in self._center_all_points:
            self._center_all_points[i].update(x[i], w=sample_weight)
        center_all_points = {
            i: self._center_all_points[i].get() for i in self._center_all_points
        }

        distance_point_cluster_center = math.sqrt(
            utils.math.minkowski_distance(centers[y_pred], x, 2)
        )
        distance_point_center = math.sqrt(
            utils.math.minkowski_distance(center_all_points, x, 2)
        )
        self._ssq_points_cluster_centers += distance_point_cluster_center
        self._ssq_points_center += distance_point_center
        self._n_clusters = len(centers)

        # Keep this sample's contributions so the update can be traced back
        self.sample_correction = {
            "distance_point_cluster_center": distance_point_cluster_center,
            "distance_point_center": distance_point_center,
        }

        return self
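
utils.math.minkowski_distance returns the sum of |a_i - b_i|**p without taking the final root, which is why the snippet above wraps it in math.sqrt to obtain a Euclidean distance. A quick check with two hand-made points:

import math
from river import utils

a = {"x": 0.0, "y": 0.0}
b = {"x": 3.0, "y": 4.0}
print(utils.math.minkowski_distance(a, b, 2))             # 25.0, no root taken
print(math.sqrt(utils.math.minkowski_distance(a, b, 2)))  # 5.0, the Euclidean distance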
Example #4
    def __init__(
        self,
        optimizer: optim.Optimizer = None,
        loss: optim.losses.Loss = None,
        l2=0.0,
        initializer: optim.initializers.Initializer = None,
        clip_gradient=1e12,
        seed=None,
    ):
        super().__init__(seed=seed)

        self.optimizer = optim.SGD() if optimizer is None else copy.deepcopy(optimizer)
        self.u_optimizer = optim.SGD() if optimizer is None else copy.deepcopy(optimizer)
        self.i_optimizer = optim.SGD() if optimizer is None else copy.deepcopy(optimizer)
        self.loss = optim.losses.Squared() if loss is None else loss
        self.l2 = l2

        if initializer is None:
            initializer = optim.initializers.Zeros()
        self.initializer = initializer

        self.clip_gradient = clip_gradient
        self.global_mean = stats.Mean()
        self.u_biases: typing.DefaultDict[
            int, optim.initializers.Initializer] = collections.defaultdict(
                initializer)
        self.i_biases: typing.DefaultDict[
            int, optim.initializers.Initializer] = collections.defaultdict(
                initializer)
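
The defaultdict construction above assumes the initializer is callable with no arguments, so that an unseen user or item id receives a freshly initialized bias on first access. A minimal sketch of the pattern with a plain zero-returning factory standing in for optim.initializers.Zeros():

import collections

zeros = lambda: 0.0                       # stand-in for the zero-bias initializer
u_biases = collections.defaultdict(zeros)
u_biases["user_42"] += 0.1                # an unseen key is created with the default value
print(u_biases["user_42"])                # 0.1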
Example #5
    def __init__(
        self,
        n_factors=10,
        bias_optimizer: optim.Optimizer = None,
        latent_optimizer: optim.Optimizer = None,
        loss: optim.losses.Loss = None,
        l2_bias=0.0,
        l2_latent=0.0,
        weight_initializer: optim.initializers.Initializer = None,
        latent_initializer: optim.initializers.Initializer = None,
        clip_gradient=1e12,
        seed: int = None,
    ):

        self.n_factors = n_factors
        self.u_bias_optimizer = (
            optim.SGD() if bias_optimizer is None else copy.deepcopy(bias_optimizer)
        )
        self.i_bias_optimizer = (
            optim.SGD() if bias_optimizer is None else copy.deepcopy(bias_optimizer)
        )
        self.u_latent_optimizer = (
            optim.SGD() if latent_optimizer is None else copy.deepcopy(latent_optimizer)
        )
        self.i_latent_optimizer = (
            optim.SGD() if latent_optimizer is None else copy.deepcopy(latent_optimizer)
        )
        self.loss = optim.losses.Squared() if loss is None else loss
        self.l2_bias = l2_bias
        self.l2_latent = l2_latent

        if weight_initializer is None:
            weight_initializer = optim.initializers.Zeros()
        self.weight_initializer = weight_initializer

        if latent_initializer is None:
            latent_initializer = optim.initializers.Normal(sigma=0.1, seed=seed)
        self.latent_initializer = latent_initializer

        self.clip_gradient = clip_gradient
        self.seed = seed
        self.global_mean = stats.Mean()

        self.u_biases: typing.DefaultDict[
            int, optim.initializers.Initializer
        ] = collections.defaultdict(weight_initializer)
        self.i_biases: typing.DefaultDict[
            int, optim.initializers.Initializer
        ] = collections.defaultdict(weight_initializer)

        random_latents = functools.partial(
            self.latent_initializer, shape=self.n_factors
        )
        self.u_latents: typing.DefaultDict[
            int, optim.initializers.Initializer
        ] = collections.defaultdict(random_latents)
        self.i_latents: typing.DefaultDict[
            int, optim.initializers.Initializer
        ] = collections.defaultdict(random_latents)
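
functools.partial(self.latent_initializer, shape=self.n_factors) turns the latent initializer into a zero-argument factory, so the defaultdict can mint a fresh latent vector for every unseen id. A sketch with a hypothetical random-vector factory standing in for the Normal initializer:

import collections
import functools
import random

def normal_vector(shape, sigma=0.1):      # hypothetical stand-in for optim.initializers.Normal
    return [random.gauss(0.0, sigma) for _ in range(shape)]

random_latents = functools.partial(normal_vector, shape=10)
u_latents = collections.defaultdict(random_latents)
print(len(u_latents[42]))                 # 10: a fresh latent vector for an unseen user id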
Example #6
    def update(self, x, y_pred, centers, sample_weight=1.0):

        self._minimum_separation = self._find_minimum_separation(centers)

        distance = math.sqrt(utils.math.minkowski_distance(centers[y_pred], x, 2))

        if y_pred in self._avg_cp_by_clusters:
            self._avg_cp_by_clusters[y_pred].update(distance, w=sample_weight)
        else:
            self._avg_cp_by_clusters[y_pred] = stats.Mean()
            self._avg_cp_by_clusters[y_pred].update(distance, w=sample_weight)

        return self
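
The if/else above lazily creates one running mean per cluster. An equivalent, slightly more compact sketch of the same pattern uses dict.setdefault (made-up values below); note that setdefault builds a throwaway stats.Mean() even when the key already exists, which is the small cost the explicit if/else avoids:

from river import stats

avg_cp_by_clusters = {}
y_pred, distance, sample_weight = 0, 1.5, 1.0   # made-up values
avg_cp_by_clusters.setdefault(y_pred, stats.Mean()).update(distance, w=sample_weight)
print(avg_cp_by_clusters[y_pred].get())         # 1.5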
Example #7
    def __init__(self,
                 x: float,
                 y: typing.Union[float, utils.VectorDict],
                 weight: float = 1.0):
        self.x_stats = stats.Mean()
        self.x_stats.update(x, weight)

        self.y_stats: typing.Union[stats.Var, utils.VectorDict]

        self._update_estimator: typing.Callable[
            [typing.Union[float, utils.VectorDict], float], None]
        self.is_single_target = True

        self._init_estimator(y)
        self._update_estimator(y, weight)
Example #8
def load_stats():
    for _, obj in inspect.getmembers(importlib.import_module("river.stats"), inspect.isclass):
        try:

            if issubclass(obj, stats.Link):
                yield obj(stats.Shift(1), stats.Mean())
                continue

            sig = inspect.signature(obj)
            yield obj(
                **{
                    param.name: param.default if param.default != param.empty else 1
                    for param in sig.parameters.values()
                }
            )
        except ValueError:
            yield obj()
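
load_stats is a generator, so it can feed pytest's parametrization directly (Example #19 below does exactly that). A minimal sketch that exercises it on its own, assuming it is importable from the test module:

for stat in load_stats():
    print(stat.__class__.__name__)   # every statistic in river.stats, built with defaults or dummy values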
Example #9
    def eval_relations(self, model, dataset):
        metrics = collections.OrderedDict({
            f"{metric}": stats.Mean()
            for metric in ["MRR", "MR", "HITS@1", "HITS@3", "HITS@10"]
        })

        with torch.no_grad():

            metrics = self.compute_score(
                model=model,
                test_set=self.get_relation_stream(dataset),
                metrics=metrics,
                device=self.device,
            )

        return {
            f"{name}_relations": round(metric.get(), 4)
            for name, metric in metrics.items()
        }
Example #10
    def eval(self, model, dataset):
        """Evaluate selected model with the metrics: MRR, MR, HITS@1, HITS@3, HITS@10"""
        metrics = collections.OrderedDict({
            metric: stats.Mean()
            for metric in ["MRR", "MR", "HITS@1", "HITS@3", "HITS@10"]
        })

        with torch.no_grad():

            for test_set in self.get_entity_stream(dataset):

                metrics = self.compute_score(model=model,
                                             test_set=test_set,
                                             metrics=metrics,
                                             device=self.device)

        return {
            name: round(metric.get(), 4)
            for name, metric in metrics.items()
        }
Example #11
File: r2.py  Project: AdilZouitine/creme
    def update(self, x, y_pred, centers, sample_weight=1.0):

        if not self._initialized:
            self._center_all_points = {i: stats.Mean() for i in x}
            self._initialized = True
        for i in self._center_all_points:
            self._center_all_points[i].update(x[i], w=sample_weight)
        center_all_points = {
            i: self._center_all_points[i].get()
            for i in self._center_all_points
        }

        squared_distance_center = utils.math.minkowski_distance(
            x, center_all_points, 2)
        squared_distance_cluster_center = utils.math.minkowski_distance(
            x, centers[y_pred], 2)

        self._ssq_point_center += squared_distance_center
        self._ssq_point_cluster_centers += squared_distance_cluster_center

        return self
Example #12
 def _unit_test_params(cls):
     return {"statistic": stats.Mean()}
Example #13
File: normal.py  Project: online-ml/river
 def __init__(self, seed=None):
     super().__init__(seed=seed)
     self.variance = stats.Var()
     self.mean = stats.Mean()
     self.seed = seed
Example #14
 def __init__(self, regressor: base.Regressor, window_size: int = None):
     self.regressor = regressor
     self.window_size = window_size
     self.mean = (stats.Mean() if self.window_size is None else
                  stats.RollingMean(self.window_size))
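
The conditional above chooses between a mean over the whole stream and a mean over a fixed-size window. A small comparison of the two statistics:

from river import stats

mean = stats.Mean()
rolling = stats.RollingMean(window_size=2)
for v in [1.0, 2.0, 3.0]:
    mean.update(v)
    rolling.update(v)
print(mean.get())     # 2.0, averaged over the whole stream
print(rolling.get())  # 2.5, averaged over the last two values only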
Example #15
 def __init__(self):
     self.x_m = stats.Mean()
     self.g_var = stats.Var()
     self.h_var = stats.Var()
     self.gh_cov = stats.Cov()

Example #16

from pprint import pprint
import datetime as dt
import itertools

from river import (compose, datasets, evaluate, feature_extraction,
                   linear_model, metrics, preprocessing, stats)


# Referenced by the pipeline below; the definition is assumed here, following the
# usual Bikes example: pull the hour out of the 'moment' timestamp.
def get_hour(x):
    x['hour'] = x['moment'].hour
    return x


X_y = iter(datasets.Bikes())
# Peek at one observation
# for x, y in X_y:
#     pprint(x)
#     print(f'Number of available bikes: {y}')
#     break

# Construct a model pipeline
model = compose.Select('clouds', 'humidity', 'pressure', 'temperature', 'wind')

# Feature extraction: running average of bikes per (station, hour), plus an
# exponentially weighted average per station
model += (
    get_hour
    | (
        feature_extraction.TargetAgg(by=['station', 'hour'], how=stats.Mean())
        + feature_extraction.TargetAgg(by='station', how=stats.EWMean(0.5))
    )
)

model |= preprocessing.StandardScaler()
model |= linear_model.LinearRegression()

evaluate.progressive_val_score(dataset=datasets.Bikes(),
                               model=model,
                               metric=metrics.MAE(),
                               moment='moment',
                               delay=dt.timedelta(minutes=30),
                               print_every=20_000)

for x, y in itertools.islice(X_y, 10000):
Example #17
import copy
import functools
import math
import random

import numpy as np
import pytest

from river import stats


@pytest.mark.parametrize(
    "stat",
    [
        pytest.param(stat, id=stat.__class__.__name__)
        for stat in [stats.Mean(), stats.Var(ddof=0), stats.Var(ddof=1)]
    ],
)
def test_add(stat):
    A = copy.deepcopy(stat)
    B = copy.deepcopy(stat)
    C = copy.deepcopy(stat)

    X = [random.random() for _ in range(30)]
    Y = [random.random() for _ in range(30)]
    W = [random.random() for _ in range(30)]

    for x, y, w in zip(X, Y, W):
        A.update(x, w)
        B.update(y, w)
Example #18
    assert isinstance(pickle.loads(pickle.dumps(stat)), stat.__class__)
    assert isinstance(copy.deepcopy(stat), stat.__class__)

    # Check the statistic has a working __str__ and name method
    assert isinstance(str(stat), str)

    if isinstance(stat, stats.Univariate):
        assert isinstance(stat.name, str)


@pytest.mark.parametrize(
    'stat, func',
    [
        (stats.Kurtosis(bias=True), sp_stats.kurtosis),
        (stats.Kurtosis(bias=False), functools.partial(sp_stats.kurtosis, bias=False)),
        (stats.Mean(), statistics.mean),
        (stats.Skew(bias=True), sp_stats.skew),
        (stats.Skew(bias=False), functools.partial(sp_stats.skew, bias=False)),
        (stats.Var(ddof=0), np.var),
        (stats.Var(), functools.partial(np.var, ddof=1)),
    ],
)
def test_univariate(stat, func):

    # Silence NumPy warnings
    np.warnings.filterwarnings('ignore')

    X = [random.random() for _ in range(30)]

    for i, x in enumerate(X):
        stat.update(x)
        if i >= 1:
            assert math.isclose(stat.get(), func(X[:i + 1]), abs_tol=1e-10)
Example #19
    if isinstance(stat, stats.Univariate):
        assert isinstance(stat.name, str)


@pytest.mark.parametrize("stat", load_stats(), ids=lambda stat: stat.__class__.__name__)
def test_repr_with_no_updates(stat):
    assert isinstance(repr(stat), str)
    assert isinstance(str(stat), str)


@pytest.mark.parametrize(
    "stat, func",
    [
        (stats.Kurtosis(bias=True), sp_stats.kurtosis),
        (stats.Kurtosis(bias=False), functools.partial(sp_stats.kurtosis, bias=False)),
        (stats.Mean(), statistics.mean),
        (stats.Skew(bias=True), sp_stats.skew),
        (stats.Skew(bias=False), functools.partial(sp_stats.skew, bias=False)),
        (stats.Var(ddof=0), np.var),
        (stats.Var(), functools.partial(np.var, ddof=1)),
    ],
)
def test_univariate(stat, func):

    # Silence NumPy warnings
    np.warnings.filterwarnings("ignore")

    X = [random.random() for _ in range(30)]

    for i, x in enumerate(X):
        stat.update(x)
Example #20
 def _unit_test_params(cls):
     yield {"statistic": stats.Mean()}
Example #21
 def __init__(self, seed=None):
     super().__init__()
     self.variance = stats.Var()
     self.mean = stats.Mean()
     self.seed = seed
     self._rng = random.Random(seed)
Example #22
 def __init__(self):
     self.mean = stats.Mean()
Example #23
    def detail_eval(self, model, dataset, threshold=1.5):
        """
        Divide input dataset relations into different categories (i.e. ONE-TO-ONE, ONE-TO-MANY,
        MANY-TO-ONE and MANY-TO-MANY) according to the mapping properties of relationships.

        Reference:
            1. [Bordes, Antoine, et al. "Translating embeddings for modeling multi-relational data." Advances in neural information processing systems. 2013.](http://papers.nips.cc/paper/5071-translating-embeddings-for-modeling-multi-relational-data.pdf)

        """
        mapping_type_relations = self.types_relations(model=model,
                                                      dataset=dataset,
                                                      threshold=threshold)

        mapping_type_relations = {
            self.relations[key]: value
            for key, value in mapping_type_relations.items()
        }

        types_relations = ["1_1", "1_M", "M_1", "M_M"]

        metrics = collections.OrderedDict({
            "head-batch": collections.OrderedDict({}),
            "tail-batch": collections.OrderedDict({}),
        })

        for mode in ["head-batch", "tail-batch"]:

            for type_relation in types_relations:

                metrics[mode][type_relation] = collections.OrderedDict({
                    f"{metric}": stats.Mean()
                    for metric in ["MRR", "MR", "HITS@1", "HITS@3", "HITS@10"]
                })

        with torch.no_grad():

            for test_set in self.get_entity_stream(dataset):

                metrics = self.compute_detailled_score(
                    model=model,
                    test_set=test_set,
                    metrics=metrics,
                    types_relations=mapping_type_relations,
                    device=self.device,
                )

        for mode in ["head-batch", "tail-batch"]:
            for type_relation in types_relations:
                for metric in ["MRR", "MR", "HITS@1", "HITS@3", "HITS@10"]:
                    metrics[mode][type_relation][metric] = round(
                        metrics[mode][type_relation][metric].get(), 4)

        results = pd.DataFrame(metrics)

        head = pd.DataFrame(results["head-batch"].values.tolist())
        tail = pd.DataFrame(results["tail-batch"].values.tolist())

        head.columns = pd.MultiIndex.from_product([["head"], head.columns])
        tail.columns = pd.MultiIndex.from_product([["tail"], tail.columns])
        results = pd.concat([head, tail], axis="columns")
        results = results.set_index(pd.Series(["1_1", "1_M", "M_1", "M_M"]))
        results.index.name = "relation"

        # Add frequency of each type of relation:
        frequency = collections.OrderedDict()
        for type_relation in types_relations:
            frequency[type_relation] = 0
        for _, type_relation in mapping_type_relations.items():
            frequency[type_relation] += 1
        for type_relation in types_relations:
            frequency[type_relation] /= len(mapping_type_relations)

        frequency = pd.DataFrame.from_dict(frequency,
                                           orient="index",
                                           columns=["frequency"])

        frequency.columns = pd.MultiIndex.from_product([["metadata"],
                                                        frequency.columns])

        results = pd.concat([results, frequency], axis="columns")

        return results
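
The column-nesting step above hinges on pd.MultiIndex.from_product. A minimal sketch of what it does to the head and tail frames, with made-up numbers:

import pandas as pd

head = pd.DataFrame({"MRR": [0.41], "MR": [120.0]})
head.columns = pd.MultiIndex.from_product([["head"], head.columns])
print(head.columns.tolist())  # [('head', 'MRR'), ('head', 'MR')]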