import pytest
from thinc.api import CategoricalCrossentropy, SequenceCategoricalCrossentropy

# Tolerance for the approximate comparisons below (value assumed; defined
# alongside the fixtures in the original test module).
eps = 0.0001


def test_sequence_categorical_crossentropy(guesses, labels):
    # The fixtures supply two sequences; unpack them so the shape checks
    # below compare against the right arrays.
    guesses1, guesses2 = guesses
    d_scores = SequenceCategoricalCrossentropy(normalize=False).get_grad(
        guesses, labels
    )
    d_scores1 = d_scores[0]
    d_scores2 = d_scores[1]
    assert d_scores1.shape == guesses1.shape
    assert d_scores2.shape == guesses2.shape
    assert d_scores1[1][0] == pytest.approx(0.4, eps)
    assert d_scores1[1][1] == pytest.approx(-0.4, eps)
    # The normalization divides the difference (e.g. 0.4) by the number of seqs
    d_scores = SequenceCategoricalCrossentropy(normalize=True).get_grad(
        guesses, labels
    )
    d_scores1 = d_scores[0]
    d_scores2 = d_scores[1]
    assert d_scores1[1][0] == pytest.approx(0.2, eps)
    assert d_scores1[1][1] == pytest.approx(-0.2, eps)
    # The third vector predicted all labels, but only the first one was correct
    assert d_scores1[2][0] == pytest.approx(0, eps)
    assert d_scores1[2][1] == pytest.approx(0.5, eps)
    assert d_scores1[2][2] == pytest.approx(0.5, eps)
    # The fourth vector predicted no labels but should have predicted the last one
    assert d_scores1[3][0] == pytest.approx(0, eps)
    assert d_scores1[3][1] == pytest.approx(0, eps)
    assert d_scores1[3][2] == pytest.approx(-0.5, eps)
    # Test the second batch
    assert d_scores2[0][0] == pytest.approx(0.1, eps)
    assert d_scores2[0][1] == pytest.approx(-0.35, eps)

    loss = SequenceCategoricalCrossentropy(normalize=True).get_loss(
        guesses, labels
    )
    assert loss == pytest.approx(1.09, eps)
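
# A minimal, self-contained sketch (hypothetical data, not the fixtures used
# above) of the identity the test relies on: with normalize=False, the gradient
# for each sequence is simply `guesses - one_hot(labels)`.
import numpy

toy_guesses = [numpy.asarray([[0.1, 0.9], [0.6, 0.4]], dtype="float32")]
toy_labels = [numpy.asarray([1, 1], dtype="i")]
toy_grad = SequenceCategoricalCrossentropy(normalize=False).get_grad(
    toy_guesses, toy_labels
)
print(toy_grad[0])  # guesses minus one-hot targets: [[0.1, -0.1], [0.6, -0.6]]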
def test_loss():
    d_scores = CategoricalCrossentropy().get_grad(scores0, labels0)
    assert d_scores.dtype == "float32"
    assert d_scores.shape == scores0.shape
    d_scores = SequenceCategoricalCrossentropy().get_grad([scores0], [labels0])
    assert d_scores[0].dtype == "float32"
    assert d_scores[0].shape == scores0.shape
    assert SequenceCategoricalCrossentropy().get_grad([], []) == []
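
# Toy fixtures (assumed shapes and values, not copied from the source) that
# would satisfy test_loss above. The test pins down three behaviours: the
# gradient keeps the float32 dtype, it keeps the input shape, and an empty
# list of sequences yields an empty gradient list.
import numpy

scores0 = numpy.asarray(
    [[0.9, 0.1, 0.0], [0.2, 0.7, 0.1]], dtype="float32"
)  # hypothetical (n_samples, n_classes) model scores
labels0 = numpy.asarray([0, 1], dtype="i")  # hypothetical gold class indices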
# Imports for this example script. The helpers minibatch_by_words and
# evaluate_sequences are defined elsewhere in the same file; a sketch of the
# CONFIG string follows after this function.
from typing import Optional
from pathlib import Path

import ml_datasets
import thinc
import tqdm
from thinc.api import Config, SequenceCategoricalCrossentropy, prefer_gpu
from thinc.api import use_pytorch_for_gpu_memory


def main(path: Optional[Path] = None, out_dir: Optional[Path] = None):
    if prefer_gpu():
        print("Using gpu!")
        use_pytorch_for_gpu_memory()
    # You can edit the CONFIG string within the file, or copy it out to
    # a separate file and pass in the path.
    if path is None:
        config = Config().from_str(CONFIG)
    else:
        config = Config().from_disk(path)
    # make_from_config constructs objects whenever you have blocks with an @ key.
    # In the optimizer block we write @optimizers = "Adam.v1". This tells Thinc
    # to use registry.optimizers to fetch the "Adam.v1" function. You can
    # register your own functions as well and build up trees of objects.
    C = thinc.registry.make_from_config(config)
    words_per_subbatch = C["training"]["words_per_subbatch"]
    n_epoch = C["training"]["n_epoch"]
    batch_size = C["training"]["batch_size"]
    model = C["model"]
    optimizer = C["optimizer"]
    calculate_loss = SequenceCategoricalCrossentropy()
    (train_X, train_Y), (dev_X, dev_Y) = ml_datasets.ud_ancora_pos_tags()
    # Convert the outputs to cupy (if we're using that)
    train_Y = list(map(model.ops.asarray, train_Y))
    dev_Y = list(map(model.ops.asarray, dev_Y))
    # Pass in a small batch of data, to fill in missing shapes
    model.initialize(X=train_X[:5], Y=train_Y[:5])
    for epoch in range(n_epoch):
        # Transformers often learn best with large batch sizes -- larger than
        # fits in GPU memory. But you don't have to backprop the whole batch
        # at once. Here we consider the "logical" batch size (number of examples
        # per update) separately from the physical batch size.
        batches = model.ops.multibatch(batch_size, train_X, train_Y, shuffle=True)
        for outer_batch in tqdm.tqdm(batches, leave=False):
            # For the physical batch size, what we care about is the number
            # of words (considering padding too). We also want to sort by
            # length, for efficiency.
            for batch in minibatch_by_words(outer_batch, words_per_subbatch):
                inputs, truths = zip(*batch)
                guesses, backprop = model(inputs, is_train=True)
                backprop(calculate_loss.get_grad(guesses, truths))
            # At the end of the batch, we call the optimizer with the accumulated
            # gradients, and advance the learning rate schedules.
            model.finish_update(optimizer)
            optimizer.step_schedules()
        # You might want to evaluate more often than once per epoch; that's up
        # to you.
        score = evaluate_sequences(model, dev_X, dev_Y, 128)
        print(epoch, f"{score:.3f}")
        if out_dir:
            model.to_disk(out_dir / f"{epoch}.bin")
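
# A hedged sketch of the kind of CONFIG string main() falls back to when no
# path is given. The [training] keys and '@optimizers = "Adam.v1"' come from
# the code and comments above; the model block and all concrete values here
# are assumptions -- substitute whatever model factory you have registered.
CONFIG = """
[model]
@layers = "TransformersTagger.v1"
starter = "bert-base-multilingual-cased"

[optimizer]
@optimizers = "Adam.v1"

[training]
batch_size = 128
words_per_subbatch = 2000
n_epoch = 10
"""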
def get_loss(
    self, examples: Iterable[Example], scores: List[Floats2d]
) -> Tuple[float, List[Floats2d]]:
    validate_examples(examples, "EditTreeLemmatizer.get_loss")
    loss_func = SequenceCategoricalCrossentropy(normalize=False, missing_value=-1)

    truths = []
    for eg in examples:
        eg_truths = []
        for (predicted, gold_lemma) in zip(
            eg.predicted, eg.get_aligned("LEMMA", as_string=True)
        ):
            if gold_lemma is None:
                label = -1
            else:
                tree_id = self.trees.add(predicted.text, gold_lemma)
                label = self.tree2label.get(tree_id, 0)
            eg_truths.append(label)
        truths.append(eg_truths)

    d_scores, loss = loss_func(scores, truths)
    if self.model.ops.xp.isnan(loss):
        raise ValueError(Errors.E910.format(name=self.name))

    return float(loss), d_scores
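
# Hedged illustration (toy data, standalone) of the missing_value=-1 convention
# get_loss relies on: tokens without an aligned gold lemma are labelled -1, and
# the loss zeroes their gradient so they do not contribute to the update.
import numpy
from thinc.api import SequenceCategoricalCrossentropy

toy_loss_func = SequenceCategoricalCrossentropy(normalize=False, missing_value=-1)
toy_scores = [numpy.asarray([[0.3, 0.7], [0.8, 0.2]], dtype="float32")]
toy_truths = [[1, -1]]  # second token has no gold lemma
toy_grad, toy_loss = toy_loss_func(toy_scores, toy_truths)
print(toy_grad[0])  # first row: [0.3, -0.3]; second (missing) row: all zeros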
def test_sequence_categorical_missing_negative(guesses, labels, names):
    d_scores = SequenceCategoricalCrossentropy(
        normalize=False, names=names, neg_prefix="!", missing_value=""
    ).get_grad(guesses, labels)
    d_scores0 = d_scores[0]

    # [0.1, 0.5, 0.6] should be A
    assert d_scores0[0][0] == pytest.approx(-0.9, eps)
    assert d_scores0[0][1] == pytest.approx(0.5, eps)
    assert d_scores0[0][2] == pytest.approx(0.6, eps)

    # [0.4, 0.6, 0.3] should NOT be A
    assert d_scores0[1][0] == pytest.approx(0.4, eps)
    assert d_scores0[1][1] == pytest.approx(0.0, eps)
    assert d_scores0[1][2] == pytest.approx(0.0, eps)

    # [1, 1, 1] has a missing gold label
    assert d_scores0[2][0] == pytest.approx(0.0, eps)
    assert d_scores0[2][1] == pytest.approx(0.0, eps)
    assert d_scores0[2][2] == pytest.approx(0.0, eps)

    # [0.0, 0.0, 0.0] should NOT be C
    assert d_scores0[3][0] == pytest.approx(0.0, eps)
    assert d_scores0[3][1] == pytest.approx(0.0, eps)
    assert d_scores0[3][2] == pytest.approx(0.0, eps)
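
# A small standalone sketch (toy data) of the label conventions exercised
# above: `names` maps score columns to string labels, a "!" prefix marks a
# negative label ("this is NOT class X"), and "" marks a missing gold label.
# For a positive label the gradient is `guess - one_hot`; for a negative label
# only the negated class's column carries gradient; missing rows are zeroed.
import numpy
from thinc.api import SequenceCategoricalCrossentropy

toy_names = ["A", "B", "C"]
toy_guesses = [numpy.asarray([[0.1, 0.5, 0.6], [0.4, 0.6, 0.3]], dtype="float32")]
toy_labels = [["A", "!A"]]  # first token is A; second token is anything but A
toy_grad = SequenceCategoricalCrossentropy(
    normalize=False, names=toy_names, neg_prefix="!", missing_value=""
).get_grad(toy_guesses, toy_labels)
print(toy_grad[0])  # approx [[-0.9, 0.5, 0.6], [0.4, 0.0, 0.0]]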