Example no. 1
    def __init__(self,
                 epochs,
                 eval_every=2000,
                 early_stopping_rounds=3,
                 device="cpu"):
        self.epochs = epochs
        self.eval_every = eval_every
        self.early_stopping_rounds = early_stopping_rounds
        self.device = device

        # Rolling mean of the training loss over the last 1,000 steps.
        self.metric_loss = stats.RollingMean(1000)

        # Early-stopping counters for validation and test evaluations.
        self.round_without_improvement_valid = 0
        self.round_without_improvement_test = 0

        # Best scores seen so far; defaultdict(float) starts each metric at 0.
        self.history_valid = collections.defaultdict(float)
        self.history_test = collections.defaultdict(float)

        self.valid_scores = {}
        self.test_scores = {}
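
The `RollingMean` used for `metric_loss` keeps a windowed average rather than a global one, so the reported loss reflects only recent steps. A minimal sketch of that behaviour, assuming a creme/river-style stats API (mkb relies on a similar helper):

# Sketch of the RollingMean behaviour assumed above (river's API shown here).
from river import stats

rolling = stats.RollingMean(window_size=3)
for loss in [4.0, 2.0, 6.0, 10.0]:
    rolling.update(loss)

print(rolling.get())  # 6.0: the mean of the last 3 values, (2 + 6 + 10) / 3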
Example no. 2
@pytest.mark.parametrize('stat, func',
                         [  # parameter list truncated in this excerpt
                          (stats.Var(), functools.partial(np.var, ddof=1))])
def test_univariate(stat, func):

    # Silence NumPy warnings (np.warnings was removed in NumPy 1.24+).
    warnings.filterwarnings('ignore')

    X = [random.random() for _ in range(30)]

    for i, x in enumerate(X):
        stat.update(x)
        if i >= 1:
            assert math.isclose(stat.get(), func(X[:i + 1]), abs_tol=1e-10)


@pytest.mark.parametrize('stat, func',
                         [(stats.RollingMean(3), statistics.mean),
                          (stats.RollingMean(10), statistics.mean),
                          (stats.RollingVar(3, ddof=0), np.var),
                          (stats.RollingVar(10, ddof=0), np.var)])
def test_rolling_univariate(stat, func):

    # Silence NumPy warnings, as in test_univariate above.
    warnings.filterwarnings('ignore')

    def tail(iterable, n):
        # deque(iterable, maxlen=n) keeps only the last n elements.
        return collections.deque(iterable, maxlen=n)

    n = stat.window_size
    X = [random.random() for _ in range(30)]

    for i, x in enumerate(X):
        stat.update(x)
        if i >= 1:
            # Body reconstructed to mirror test_univariate above: compare the
            # rolling statistic against the batch statistic over the window.
            assert math.isclose(stat.get(), func(tail(X[:i + 1], n)), abs_tol=1e-10)
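
The `tail` helper leans on a standard-library guarantee: a `deque` built with `maxlen=n` silently drops all but the last `n` items it is fed, which is exactly the window a rolling statistic sees. A quick standalone check:

import collections

def tail(iterable, n):
    return collections.deque(iterable, maxlen=n)

print(tail([1, 2, 3, 4, 5], 3))  # deque([3, 4, 5], maxlen=3)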
Example no. 3
def learn(
    model,
    dataset,
    optimizer,
    loss,
    evaluation,
    negative_sampling_size,
    device,
    epochs,
    eval_every,
    early_stopping_rounds,
):
    """Pipeline dedicated to automate training model.

    Parameters
    ----------
    model
        Transformer-based model.
    dataset
        Dataset used to train the model.
    optimizer
        Optimizer in charge of updating the model's parameters.
    loss
        Loss function.
    evaluation
        Evaluation module used on the validation and test sets.
    negative_sampling_size
        Number of negative triples sampled per positive triple.
    device
        Either "cpu" or "cuda".
    epochs
        Number of epochs to train the model.
    eval_every
        Evaluate the model every `eval_every` steps.
    early_stopping_rounds
        Number of evaluations without improvement before stopping early.

    Examples
    --------

    >>> from mkb import losses, evaluation, datasets, text, models
    >>> from transformers import AutoTokenizer, AutoModel

    >>> import torch
    >>> _ = torch.manual_seed(42)

    >>> train = [
    ...    ("jaguar", "cousin", "cat"),
    ...    ("tiger", "cousin", "cat"),
    ...    ("dog", "cousin", "wolf"),
    ...    ("dog", "angry_against", "cat"),
    ...    ("wolf", "angry_against", "jaguar"),
    ... ]

    >>> valid = [
    ...     ("cat", "cousin", "jaguar"),
    ...     ("cat", "cousin", "tiger"),
    ...     ("dog", "angry_against", "tiger"),
    ... ]

    >>> test = [
    ...     ("wolf", "angry_against", "tiger"),
    ...     ("wolf", "angry_against", "cat"),
    ... ]

    >>> dataset = datasets.Dataset(
    ...     batch_size = 5,
    ...     train = train,
    ...     valid = valid,
    ...     test = test,
    ...     seed = 42,
    ... )

    >>> device = "cpu"

    >>> model = models.SentenceTransformer(
    ...    model = AutoModel.from_pretrained("sentence-transformers/all-mpnet-base-v2"),
    ...    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2"),
    ...    entities = dataset.entities,
    ...    relations = dataset.relations,
    ...    gamma = 9,
    ...    device = device,
    ... )

    >>> model = model.to(device)

    >>> optimizer = torch.optim.Adam(
    ...     filter(lambda p: p.requires_grad, model.parameters()),
    ...     lr = 0.00005,
    ... )

    >>> evaluation = evaluation.TransformerEvaluation(
    ...     entities = dataset.entities,
    ...     relations = dataset.relations,
    ...     true_triples = dataset.train + dataset.valid + dataset.test,
    ...     batch_size = 2,
    ...     device = device,
    ... )

    >>> model = text.learn(
    ...     model = model,
    ...     dataset = dataset,
    ...     evaluation = evaluation,
    ...     optimizer = optimizer,
    ...     loss = losses.Adversarial(alpha=0.5),
    ...     negative_sampling_size = 5,
    ...     epochs = 1,
    ...     eval_every = 5,
    ...     early_stopping_rounds = 3,
    ...     device = device,
    ... )
    Validation:
        MRR: 0.2639
        MR: 3.8333
        HITS@1: 0.0
        HITS@3: 0.1667
        HITS@10: 1.0
        MRR_relations: 0.6667
        MR_relations: 1.6667
        HITS@1_relations: 0.3333
        HITS@3_relations: 1.0
        HITS@10_relations: 1.0
    Test:
        MRR: 0.3542
        MR: 3.0
        HITS@1: 0.0
        HITS@3: 0.75
        HITS@10: 1.0
        MRR_relations: 1.0
        MR_relations: 1.0
        HITS@1_relations: 1.0
        HITS@3_relations: 1.0
        HITS@10_relations: 1.0

    """
    metric_loss = stats.RollingMean(1000)
    round_without_improvement_valid, round_without_improvement_test = 0, 0
    history_valid, history_test = collections.defaultdict(
        float), collections.defaultdict(float)
    valid_scores, test_scores = {}, {}
    evaluation_done = False
    step = 0

    # Index every known true head and tail to filter out false negatives.
    true_head, true_tail = positive_triples(
        triples=dataset.train + dataset.valid + dataset.test)

    # Reverse mapping: entity id -> entity label.
    entities = {id_e: e for e, id_e in dataset.entities.items()}

    for epoch in range(epochs):

        bar = utils.Bar(dataset=dataset, update_every=10)

        for data in bar:

            sample = data["sample"].to(device)
            weight = data["weight"].to(device)
            mode = data["mode"]

            # Convert the tensor batch into a list of (h, r, t) id triples.
            triples = []
            for h, r, t in sample:
                h, r, t = h.item(), r.item(), t.item()
                triples.append((h, r, t))

            negative = in_batch_negative_triples(
                triples,
                negative_sampling_size=negative_sampling_size,
                mode=mode,
                true_head=true_head,
                true_tail=true_tail,
            )

            # Skip the batch if no in-batch negatives could be built.
            if not negative[0]:
                continue

            # Encode heads and tails in a single pass, interleaved as
            # [h0, t0, h1, t1, ...], remembering each triple's position.
            mapping_heads, mapping_tails = {}, {}
            e_encode = []

            for index, (h, r, t) in enumerate(triples):
                e_encode.append(entities[h])
                e_encode.append(entities[t])
                mapping_heads[h] = index
                mapping_tails[t] = index

            embeddings = model.encoder(e_encode)

            # De-interleave: even indices are heads, odd indices are tails.
            heads = torch.stack(
                [e for index, e in enumerate(embeddings) if index % 2 == 0],
                dim=0).unsqueeze(1)
            tails = torch.stack(
                [e for index, e in enumerate(embeddings) if index % 2 != 0],
                dim=0).unsqueeze(1)

            relations = torch.index_select(model.relation_embedding,
                                           dim=0,
                                           index=sample[:, 1]).unsqueeze(1)

            score = model.scoring(
                head=heads.to(device),
                relation=relations.to(device),
                tail=tails.to(device),
                mode=mode,
                gamma=model.gamma,
            )

            # Score each group of negatives, reusing the encoded heads/tails.
            negative_scores = []
            for index, negative_sample in enumerate(negative):

                tensor_h = []
                tensor_r = []
                tensor_t = []

                for h, r, t in negative_sample:
                    tensor_h.append(heads[mapping_heads[h]])
                    tensor_r.append(relations[index])
                    tensor_t.append(tails[mapping_tails[t]])

                tensor_h = torch.stack(tensor_h, dim=0)
                tensor_r = torch.stack(tensor_r, dim=0)
                tensor_t = torch.stack(tensor_t, dim=0)

                negative_scores.append(
                    model.scoring(
                        head=tensor_h.to(device),
                        relation=tensor_r.to(device),
                        tail=tensor_t.to(device),
                        mode=mode,
                        gamma=model.gamma,
                    ).T)

            negative_scores = torch.stack(negative_scores, dim=1).squeeze(0)

            error = loss(score, negative_scores, weight)

            error.backward()

            _ = optimizer.step()

            optimizer.zero_grad()

            metric_loss.update(error.item())

            bar.set_description(
                f"Epoch: {epoch}, loss: {metric_loss.get():.4f}")

            # Track whether the current parameters have already been evaluated,
            # so the final evaluation after the loop can be skipped if so.
            evaluation_done = False
            step += 1

            if evaluation is not None and not evaluation_done:

                if (step + 1) % eval_every == 0:

                    update_embeddings = True
                    evaluation_done = True

                    print(f"\n Epoch: {epoch}, step {step}.")

                    if dataset.valid:

                        valid_scores = evaluation.eval(
                            model=model,
                            dataset=dataset.valid,
                            update_embeddings=update_embeddings,
                        )

                        update_embeddings = False

                        valid_scores.update(
                            evaluation.eval_relations(
                                model=model,
                                dataset=dataset.valid,
                                update_embeddings=update_embeddings,
                            ))

                        print_metrics(description="Validation:",
                                      metrics=valid_scores)

                    if dataset.test:

                        test_scores = evaluation.eval(
                            model=model,
                            dataset=dataset.test,
                            update_embeddings=update_embeddings,
                        )

                        update_embeddings = False

                        test_scores.update(
                            evaluation.eval_relations(
                                model=model,
                                dataset=dataset.test,
                                update_embeddings=update_embeddings,
                            ))

                        print_metrics(description="Test:", metrics=test_scores)

                        if (history_test["HITS@3"] > test_scores["HITS@3"]
                                and history_test["HITS@1"] > test_scores["HITS@1"]):
                            round_without_improvement_test += 1
                        else:
                            round_without_improvement_test = 0
                            history_test = test_scores
                    else:
                        if (history_valid["HITS@3"] > valid_scores["HITS@3"]
                                and history_valid["HITS@1"] > valid_scores["HITS@1"]):
                            round_without_improvement_valid += 1
                        else:
                            round_without_improvement_valid = 0
                            history_valid = valid_scores

                    if (round_without_improvement_valid
                            == early_stopping_rounds
                            or round_without_improvement_test
                            == early_stopping_rounds):

                        print(
                            f"\n Early stopping at epoch {epoch}, step {step}."
                        )

                        return model

    update_embeddings = True

    if dataset.valid and not evaluation_done and evaluation is not None:

        valid_scores = evaluation.eval(model=model,
                                       dataset=dataset.valid,
                                       update_embeddings=update_embeddings)

        update_embeddings = False

        valid_scores.update(
            evaluation.eval_relations(model=model, dataset=dataset.valid))

        print_metrics(description="Validation:", metrics=valid_scores)

    if dataset.test and not evaluation_done and evaluation is not None:

        test_scores = evaluation.eval(model=model,
                                      dataset=dataset.test,
                                      update_embeddings=update_embeddings)

        update_embeddings = False

        test_scores.update(
            evaluation.eval_relations(model=model, dataset=dataset.test))

        print_metrics(description="Test:", metrics=test_scores)

    return model
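
`learn` relies on two helpers defined elsewhere in the module, `positive_triples` and `in_batch_negative_triples`, whose bodies are not shown in this excerpt. As a rough, hypothetical sketch (not mkb's actual code), a `positive_triples` indexer in the style of RotatE-like codebases could look like this:

import collections

def positive_triples(triples):
    # Hypothetical sketch: index every true head per (r, t) and every true
    # tail per (h, r), so negative sampling can avoid false negatives.
    true_head = collections.defaultdict(set)
    true_tail = collections.defaultdict(set)
    for h, r, t in triples:
        true_head[(r, t)].add(h)
        true_tail[(h, r)].add(t)
    return true_head, true_tail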
Example no. 4
    def __init__(
        self,
        models,
        datasets,
        lr,
        alpha_kl,
        alpha_adv,
        negative_sampling_size,
        batch_size_entity,
        batch_size_relation,
        n_random_entities,
        n_random_relations,
        update_distillation_every=500,
        device="cuda",
        seed=None,
        warm_step=500,
    ):

        self.alpha_kl = alpha_kl
        self.batch_size_entity = batch_size_entity
        self.batch_size_relation = batch_size_relation
        self.n_random_entities = n_random_entities
        self.n_random_relations = n_random_relations
        self.update_distillation_every = update_distillation_every
        self.device = device
        self.seed = seed
        self._rng = np.random.RandomState(self.seed)  # pylint: disable=no-member
        self.warm_step = warm_step

        self.loss_function = collections.OrderedDict()

        for id_dataset, dataset in datasets.items():

            if dataset.classification:
                self.loss_function[id_dataset] = BCEWithLogitsLoss()

            else:
                self.loss_function[id_dataset] = Adversarial(
                    alpha=alpha_adv[id_dataset])

        self.optimizers = collections.OrderedDict()
        for id_dataset, learning_rate in lr.items():
            self.optimizers[id_dataset] = torch.optim.Adam(
                filter(lambda p: p.requires_grad,
                       models[id_dataset].parameters()),
                lr=learning_rate,
            )

        self.distillation = collections.OrderedDict()

        for id_dataset_teacher, dataset_teacher in datasets.items():

            for id_dataset_student, dataset_student in datasets.items():

                if id_dataset_teacher != id_dataset_student:

                    self.distillation[
                        f"{id_dataset_teacher}_{id_dataset_student}"
                    ] = self._init_distillation(
                        sampling_method=FastTopKSampling,
                        teacher=models[id_dataset_teacher],
                        dataset_teacher=dataset_teacher,
                        dataset_student=dataset_student,
                        batch_size_entity=self.batch_size_entity[id_dataset_teacher],
                        batch_size_relation=self.batch_size_relation[id_dataset_teacher],
                        n_random_entities=self.n_random_entities[id_dataset_teacher],
                        n_random_relations=self.n_random_relations[id_dataset_teacher],
                        seed=self.seed,
                        device=self.device,
                    )

        self.negative_sampling = collections.OrderedDict()

        self.validation = collections.OrderedDict()

        for id_dataset, dataset in datasets.items():

            if not dataset.classification:

                self.negative_sampling[id_dataset] = NegativeSampling(
                    size=negative_sampling_size[id_dataset],
                    entities=dataset.entities,
                    relations=dataset.relations,
                    train_triples=dataset.train_triples,
                    seed=seed,
                )

            self.validation[id_dataset] = Evaluation(
                entities=dataset.entities,
                relations=dataset.relations,
                batch_size=2,
                true_triples=dataset.true_triples,
                device=device,
            )

        self.metrics = {
            id_dataset: stats.RollingMean(1000)
            for id_dataset, _ in datasets.items()
        }
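
Note that most hyperparameters here (`lr`, `alpha_adv`, `negative_sampling_size`, `batch_size_entity`, ...) are dictionaries keyed by dataset identifier, as the `lr.items()` and `alpha_adv[id_dataset]` lookups above show. A hypothetical configuration (dataset ids, models, and values are illustrative, and `Pipeline` stands in for the class this `__init__` belongs to):

# Hypothetical configuration; ids, models, and values are illustrative only.
pipeline = Pipeline(
    models={"wn18rr": model_wn18rr, "fb15k237": model_fb15k237},
    datasets={"wn18rr": dataset_wn18rr, "fb15k237": dataset_fb15k237},
    lr={"wn18rr": 5e-4, "fb15k237": 5e-4},
    alpha_kl=0.98,
    alpha_adv={"wn18rr": 0.5, "fb15k237": 0.5},
    negative_sampling_size={"wn18rr": 256, "fb15k237": 256},
    batch_size_entity={"wn18rr": 20, "fb15k237": 20},
    batch_size_relation={"wn18rr": 20, "fb15k237": 20},
    n_random_entities={"wn18rr": 20, "fb15k237": 20},
    n_random_relations={"wn18rr": 20, "fb15k237": 20},
    device="cuda",
    seed=42,
)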
Example no. 5
    def __init__(self, regressor: base.Regressor, window_size: int = None):
        self.regressor = regressor
        self.window_size = window_size
        # Global mean when no window is given, otherwise a rolling mean over
        # the last `window_size` observations.
        self.mean = (stats.Mean() if self.window_size is None
                     else stats.RollingMean(self.window_size))
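
The conditional picks between a statistic over the full history and one over a sliding window. A standalone illustration of the difference, again assuming a creme/river-style `stats` module:

from river import stats

global_mean = stats.Mean()
windowed_mean = stats.RollingMean(window_size=2)

for y in [1.0, 2.0, 9.0]:
    global_mean.update(y)
    windowed_mean.update(y)

print(global_mean.get())    # 4.0: mean over all three values
print(windowed_mean.get())  # 5.5: mean over the last two, (2 + 9) / 2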