Example No. 1
def make_update(model: Model, docs: Iterable[Doc], optimizer: Optimizer,
                objective_func: Callable) -> float:
    """Perform an update over a single batch of documents.

    model (Model): The model to update.
    docs (Iterable[Doc]): A batch of `Doc` objects.
    optimizer (Optimizer): An optimizer.
    objective_func (Callable): Computes the loss and the gradient of the loss
        with respect to the model's output for the batch.
    RETURNS (float): The loss for the batch.
    """
    predictions, backprop = model.begin_update(docs)
    loss, gradients = objective_func(model.ops, docs, predictions)
    backprop(gradients)
    model.finish_update(optimizer)
    # Don't want to return a cupy object here
    # The gradients are modified in-place by the BERT MLM,
    # so we get an accurate loss
    return float(loss)
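For reference, `objective_func` is expected to take the model's ops, the batch of docs and the model's predictions, and return a (loss, gradients) pair. Below is a minimal sketch of a call, assuming the predictions come back as a single array; the names `squared_error_objective`, `model` and `batch_of_docs`, and the Adam settings, are placeholders, not part of the original code:

from thinc.api import Adam

def squared_error_objective(ops, docs, predictions):
    # Hypothetical target of all zeros, purely for illustration.
    target = ops.xp.zeros_like(predictions)
    gradients = predictions - target
    loss = ((predictions - target) ** 2).sum()
    return loss, gradients

loss = make_update(model, batch_of_docs, Adam(0.001), squared_error_objective)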
Example No. 2
def debug_model(
    config,
    resolved_train_config,
    nlp,
    model: Model,
    *,
    print_settings: Optional[Dict[str, Any]] = None,
):
    if not isinstance(model, Model):
        msg.fail(
            f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
            exits=1,
        )
    if print_settings is None:
        print_settings = {}

    # STEP 0: Printing before training
    msg.info(f"Analysing model with ID {model.id}")
    if print_settings.get("print_before_training"):
        msg.divider(f"STEP 0 - before training")
        _print_model(model, print_settings)

    # STEP 1: Initializing the model and printing again
    X = _get_docs()
    # The output vector might differ from the official type of the output layer
    with data_validation(False):
        try:
            dot_names = [resolved_train_config["train_corpus"]]
            with show_validation_error():
                (train_corpus,) = resolve_dot_names(config, dot_names)
                nlp.initialize(lambda: train_corpus(nlp))
            msg.info("Initialized the model with the training corpus.")
        except ValueError:
            try:
                _set_output_dim(nO=7, model=model)
                with show_validation_error():
                    nlp.initialize(
                        lambda: [Example.from_dict(x, {}) for x in X])
                msg.info("Initialized the model with dummy data.")
            except Exception:
                msg.fail(
                    "Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.",
                    exits=1,
                )

    if print_settings.get("print_after_init"):
        msg.divider(f"STEP 1 - after initialization")
        _print_model(model, print_settings)

    # STEP 2: Updating the model and printing again
    optimizer = Adam(0.001)
    set_dropout_rate(model, 0.2)
    # ugly hack to deal with Tok2Vec listeners
    tok2vec = None
    if model.has_ref("tok2vec") and model.get_ref(
            "tok2vec").name == "tok2vec-listener":
        tok2vec = nlp.get_pipe("tok2vec")
    goldY = None
    for e in range(3):
        if tok2vec:
            tok2vec.update([Example.from_dict(x, {}) for x in X])
        Y, get_dX = model.begin_update(X)
        if goldY is None:
            goldY = _simulate_gold(Y)
        dY = get_gradient(goldY, Y, model.ops)
        get_dX(dY)
        model.finish_update(optimizer)
    if print_settings.get("print_after_training"):
        msg.divider(f"STEP 2 - after training")
        _print_model(model, print_settings)

    # STEP 3: the final prediction
    prediction = model.predict(X)
    if print_settings.get("print_prediction"):
        msg.divider(f"STEP 3 - prediction")
        msg.info(str(prediction))

    msg.good(f"Succesfully ended analysis - model looks good.")
Example No. 3
def train_model(
    model: Model,
    *,
    train: Sequence[Tuple[str, str]],
    test: Sequence[Tuple[str, str]],
    n_iter: int,
    batch_size: int | thinc.types.Generator = 32,
    learn_rate: float | List[float] | thinc.types.Generator = 0.001,
) -> Model:
    """
    Args:
        model
        train
        test
        n_iter
        batch_size
        learn_rate
    """
    # binarize language labels
    # NOTE: thinc seems to require "float32" arrays for training labels;
    # it errors otherwise
    lb = sklearn.preprocessing.LabelBinarizer()
    lb.fit([lang for _, lang in train])
    # THIS NEXT LINE IS CRITICAL: we need to save the training class labels,
    # but don't want to keep the label binarizer around, so attach the classes to the model
    model.layers[-1].attrs["classes"] = lb.classes_

    Y_train = lb.transform([lang for _, lang in train]).astype("float32")
    Y_test = lb.transform([lang for _, lang in test])

    # make sure data is on the right device?
    # Y_train = self.model.ops.asarray(Y_train)
    # Y_test = self.model.ops.asarray(Y_test)

    X_train = [text for text, _ in train]
    X_test = [text for text, _ in test]

    losser = thinc.api.CategoricalCrossentropy(normalize=True)
    optimizer = thinc.api.Adam(learn_rate)

    model.initialize(X=X_train[:10], Y=Y_train[:10])
    print(f"{'epoch':>5}  {'loss':>8}  {'score':>8}")
    # iterate over epochs
    for n in range(n_iter):
        loss = 0.0
        # iterate over batches
        batches = model.ops.multibatch(batch_size,
                                       X_train,
                                       Y_train,
                                       shuffle=True)
        for X, Y in tqdm(batches, leave=False):
            Yh, backprop = model.begin_update(X)
            dYh, loss_batch = losser(Yh, Y)
            loss += loss_batch
            backprop(dYh)
            model.finish_update(optimizer)
            optimizer.step_schedules()

        if optimizer.averages:
            with model.use_params(optimizer.averages):
                score = evaluate_model(model,
                                       X_test=X_test,
                                       Y_test=Y_test,
                                       batch_size=1000)
        else:
            score = evaluate_model(model,
                                   X_test=X_test,
                                   Y_test=Y_test,
                                   batch_size=1000)
        print(f"{n:>5}  {loss:>8.3f}  {score:>8.3f}")

    if optimizer.averages:
        with model.use_params(optimizer.averages):
            pred_langs = models.get_model_preds(
                model, X_test, model.layers[-1].attrs["classes"])
    else:
        pred_langs = models.get_model_preds(model, X_test,
                                            model.layers[-1].attrs["classes"])
    true_langs = list(lb.inverse_transform(Y_test))
    print(sklearn.metrics.classification_report(true_langs, pred_langs))
    return model
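A minimal sketch of calling this trainer on (text, language) pairs; the build_lang_id_model factory and the tiny datasets below are placeholders, not part of the original code:

train_data = [("the cat sat on the mat", "en"), ("le chat est noir", "fr")]
test_data = [("a dog in the park", "en"), ("un chien dans le parc", "fr")]

# hypothetical factory: any Thinc model mapping texts to class probabilities would do
model = build_lang_id_model()
model = train_model(
    model,
    train=train_data,
    test=test_data,
    n_iter=3,
    batch_size=2,
    learn_rate=0.001,
)

After training, the class labels live on the model's last layer under attrs["classes"] and are passed to models.get_model_preds to map predictions back to language codes.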