Exemplo n.º 1
0
def test_d1_3_sentence_tensor():
    # Character vocabulary built from the module-level ``tiny_corpus``;
    # only the char -> index half is needed here.
    char_to_idx, _idx_to_char = vocab.build_vocab(tiny_corpus)

    sentence = "This is a sentence."
    n_chars = len(sentence)

    # Without padding: a rank-2 tensor with a single row and one column
    # per character of the input string.
    plain = vocab.sentence_to_tensor(sentence, char_to_idx)
    eq_(plain.shape[0], 1)
    eq_(plain.shape[1], n_chars)
    eq_(len(plain.shape), 2)

    # With BOS/EOS padding requested: two extra columns, the first of
    # which must hold the BOS symbol's index.
    padded = vocab.sentence_to_tensor(sentence, char_to_idx, True)
    eq_(padded.shape[1], n_chars + 2)
    eq_(padded[0, 0].item(), char_to_idx[vocab.BOS_SYM])
Exemplo n.º 2
0
def predict_one(model, s, c2i, i2l):
    """
    Run a sentence, s, through the model and return the predicted label.

    The forward pass runs under ``torch.no_grad()`` so no autograd graph is
    built for inference.
    See https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html#gradients for discussion

    :param model: The LangID model to use for prediction
    :param s: The sentence to pass through, as a string
    :param c2i: The dictionary to use to map from character to index
    :param i2l: The dictionary for mapping from output index to label
    :returns: The predicted label for s
    :rtype: str
    """

    with torch.no_grad():
        scores = model(vocab.sentence_to_tensor(s, c2i))

        # Take the arg-max directly on the tensor rather than converting to
        # a Python list and searching it. Without a ``dim`` argument the
        # index refers to the flattened tensor, so an output that still
        # carries a leading batch dimension of size 1 yields the right
        # class index too.
        best_index = int(torch.argmax(scores))

    return i2l[best_index]
Exemplo n.º 3
0
def train(model, epochs, training_data, c2i):
    """
    Train model for the specified number of epochs, over the provided training data.

    The data is reshuffled at the beginning of each epoch. Shuffling is done
    on a private copy, so the caller's ``training_data`` is not reordered.

    Losses are accumulated over roughly ``loss_batch_size`` items before each
    optimizer step; whatever is left at the end of an epoch gets one final
    step (skipped when nothing is pending).

    :param model: Network called as ``model(x, model.init_hidden())`` and
        returning ``(y_hat, hidden)``; ``y_hat`` is fed to ``NLLLoss``, so it
        is assumed to hold log-probabilities with a leading batch dim of 1
    :param epochs: Number of passes over the training data
    :param training_data: Iterable of strings to train on
    :param c2i: The dictionary to use to map from character to index
    :returns: The same ``model`` object, after training
    """

    opt = torch.optim.Adam(model.parameters())

    loss_function = torch.nn.NLLLoss()

    loss_batch_size = 100

    # Work on a copy so shuffling does not mutate the caller's sequence.
    examples = list(training_data)

    for _epoch in range(epochs):

        random.shuffle(examples)

        loss = 0
        n_pending = 0  # items accumulated into ``loss`` since the last step

        for idx, name in enumerate(examples):

            # Pad once, then shift: the input is everything but the last
            # symbol and the target is everything but the first, so every
            # position (including BOS -> first char and last char -> EOS)
            # is trained to predict its successor.
            padded = vocab.sentence_to_tensor(name, c2i, True)
            x_tens = padded[:, :-1]
            y_tens = padded[:, 1:]

            y_hat, _hidden = model(x_tens, model.init_hidden())

            # Index 0 drops the size-1 batch dimension.
            loss += loss_function(y_hat[0], y_tens[0])
            n_pending += 1

            if idx % 1000 == 0:
                print(f"{idx}/{len(examples)} average per-item loss: {loss / loss_batch_size}")

            if idx % loss_batch_size == 0 and idx > 0:
                # clear gradients left over from the previous step, then
                # send back gradients:
                opt.zero_grad()
                loss.backward()
                # now, tell the optimizer to update our weights:
                opt.step()
                loss = 0
                n_pending = 0

        # One last step for the leftover items. Guarded because ``loss`` is
        # still the plain int 0 (no ``backward``) when nothing is pending.
        if n_pending:
            opt.zero_grad()
            loss.backward()
            opt.step()

    return model
Exemplo n.º 4
0
    def test_train_model(self):
        # Trains a small LangID model for one epoch on the bilingual
        # sentence data and checks held-out accuracy; an untrained model of
        # the same size is evaluated as well, for comparison in the output.

        # Adding prefix _ to local variables
        # Noticed some variation in the scores every time I run it.
        # Don't want to pollute my variables with globals when running all tests
        _bi_text = pd.read_csv("../../data/sentences_bilingual.csv")
        _bi_text_train, _bi_text_test = train_test_split(_bi_text,
                                                         test_size=0.2)
        # Build both vocabularies from the frame loaded above, so the test
        # does not depend on a module-level ``bi_text`` existing.
        _c2i, _i2c = vocab.build_vocab(_bi_text.sentence.values)
        _l2i, _i2l = vocab.build_label_vocab(_bi_text.lang.values)

        _li = lang_id.LangID(input_vocab_n=len(_c2i),
                             embedding_dims=10,
                             hidden_dims=20,
                             lstm_layers=1,
                             output_class_n=2)

        _trained_model = lang_id.train_model(model=_li,
                                             n_epochs=1,
                                             training_data=_bi_text_train,
                                             c2i=_c2i,
                                             i2c=_i2c,
                                             l2i=_l2i,
                                             i2l=_i2l)

        # Smoke check: the trained model accepts freshly encoded sentences.
        _trained_model(vocab.sentence_to_tensor("this is a sentence", _c2i))
        _trained_model(vocab.sentence_to_tensor("quien estas", _c2i))

        _acc, _y_hat = lang_id.eval_acc(_trained_model, _bi_text_test, _c2i,
                                        _i2c, _l2i, _i2l)
        print(f"Trained Accuracy: {_acc}")

        # Baseline: identical architecture, no training.
        _untrained = lang_id.LangID(input_vocab_n=len(_c2i),
                                    embedding_dims=10,
                                    hidden_dims=20,
                                    lstm_layers=1,
                                    output_class_n=2)

        _acc_untrained, _y_hat_untrained = lang_id.eval_acc(
            _untrained, _bi_text_test, _c2i, _i2c, _l2i, _i2l)
        print(f"Untrained Accuracy: {_acc_untrained}")

        assert_greater(_acc, 0.89)
Exemplo n.º 5
0
def train_model(model, n_epochs, training_data, c2i, i2c, l2i, i2l):
    """
    Train using the Adam optimizer.

    Losses are accumulated over roughly ``loss_batch_size`` items before each
    optimizer step; whatever is left at the end of an epoch gets one final
    step (skipped when nothing is pending).

    :param model: Classifier called as ``model(x_tens)``; its output is fed
        to ``NLLLoss``, so it is assumed to be a vector of log-probabilities
    :param n_epochs: Number of passes over the training data
    :param training_data: Object exposing ``.sentence.values`` and
        ``.lang.values`` (e.g. a pandas DataFrame with those columns)
    :param c2i: The dictionary to use to map from character to index
    :param i2c: Unused; kept for interface compatibility
    :param l2i: The dictionary for mapping from label to output index
    :param i2l: Unused; kept for interface compatibility
    :returns: The trained model (the same object that was passed in)
    """

    opt = torch.optim.Adam(model.parameters())

    loss_func = torch.nn.NLLLoss(
    )  # since our model gives negative log probs on the output side

    loss_batch_size = 100

    # The (sentence, label) pairs do not change between epochs; build the
    # list once and reshuffle it at the start of each epoch.
    pairs = list(zip(training_data.sentence.values,
                     training_data.lang.values))

    for _epoch in range(n_epochs):

        random.shuffle(pairs)

        loss = 0
        n_pending = 0  # items accumulated into ``loss`` since the last step

        for x_idx, (x, y) in enumerate(pairs):

            x_tens = vocab.sentence_to_tensor(x, c2i)

            y_hat = model(x_tens)

            y_tens = torch.tensor(l2i[y])

            # NLLLoss wants a batch dimension on both input and target.
            loss += loss_func(y_hat.unsqueeze(0), y_tens.unsqueeze(0))
            n_pending += 1

            if x_idx % 1000 == 0:
                print(
                    f"{x_idx}/{len(pairs)} average per-item loss: {loss / loss_batch_size}"
                )

            if x_idx % loss_batch_size == 0 and x_idx > 0:
                # clear gradients left over from the previous step, then
                # send back gradients:
                opt.zero_grad()
                loss.backward()
                # now, tell the optimizer to update our weights:
                opt.step()
                loss = 0
                n_pending = 0

        # One last step for the leftover items. Guarded because ``loss`` is
        # still the plain int 0 (no ``backward``) when nothing is pending.
        if n_pending:
            opt.zero_grad()
            loss.backward()
            opt.step()

    return model
Exemplo n.º 6
0
    def test_train_model_embed2_hidden2(self):
        # Trains a deliberately tiny LangID model (2 embedding dims, 2 hidden
        # dims) for one epoch, writes its held-out predictions to a CSV
        # deliverable, and checks held-out accuracy.

        # Adding prefix _ to local variables
        # Noticed some variation in the scores every time I run it.
        # Don't want to pollute my variables with globals when running all tests
        _bi_text = pd.read_csv("../../data/sentences_bilingual.csv")
        _bi_text_train, _bi_text_test = train_test_split(_bi_text,
                                                         test_size=0.2)
        # Build both vocabularies from the frame loaded above, so the test
        # does not depend on a module-level ``bi_text`` existing.
        _c2i, _i2c = vocab.build_vocab(_bi_text.sentence.values)
        _l2i, _i2l = vocab.build_label_vocab(_bi_text.lang.values)

        _li = lang_id.LangID(input_vocab_n=len(_c2i),
                             embedding_dims=2,
                             hidden_dims=2,
                             lstm_layers=1,
                             output_class_n=2)

        _trained_model = lang_id.train_model(model=_li,
                                             n_epochs=1,
                                             training_data=_bi_text_train,
                                             c2i=_c2i,
                                             i2c=_i2c,
                                             l2i=_l2i,
                                             i2l=_i2l)

        # Smoke check: the trained model accepts freshly encoded sentences.
        _trained_model(vocab.sentence_to_tensor("this is a sentence", _c2i))
        _trained_model(vocab.sentence_to_tensor("quien estas", _c2i))

        _acc, _y_hat = lang_id.eval_acc(_trained_model, _bi_text_test, _c2i,
                                        _i2c, _l2i, _i2l)
        print(f"Trained Accuracy: {_acc}")

        # ``assign`` returns a new frame with the extra column, so the
        # train/test split slice is never written to and there is no
        # chained-assignment warning to silence globally.
        _bi_text_test = _bi_text_test.assign(
            predicted=np.asarray(_y_hat))[["sentence", "lang", "predicted"]]
        _bi_text_test.to_csv("../../data/deliverable_2.4.csv", index=False)

        assert_greater(_acc, 0.89)
Exemplo n.º 7
0
def test_d2_1_forward():
    # A small LangID classifier whose input size matches the module-level
    # ``bt_c2i`` vocabulary and which has two output classes.
    classifier = lang_id.LangID(
        input_vocab_n=len(bt_c2i),
        embedding_dims=10,
        hidden_dims=20,
        lstm_layers=1,
        output_class_n=2,
    )

    encoded = vocab.sentence_to_tensor("this is a sentence", bt_c2i)
    scores = classifier(encoded)

    # The forward pass must yield one entry per output class.
    eq_(scores.shape[0], 2)
Exemplo n.º 8
0
def compute_prob(model, sentence, c2i):
    """
    Compute the negative log probability of p(sentence)

    The sentence is padded with BOS/EOS, the model is fed every symbol but
    the last, and the per-position negative log-likelihoods of the
    corresponding next symbols are summed.

    Equivalent to equation 3.3 in Jurafsky & Martin.

    :param model: Network called as ``model(x, model.init_hidden())`` and
        returning ``(y_hat, hidden)``; ``y_hat`` is assumed to hold
        log-probabilities with a leading batch dimension of 1
    :param sentence: The sentence to score, as a string
    :param c2i: The dictionary to use to map from character to index
    :returns: The summed negative log-likelihood, as a Python float
    """

    nll = nn.NLLLoss(reduction='sum')

    with torch.no_grad():
        s_tens = vocab.sentence_to_tensor(sentence, c2i, True)
        x = s_tens[:, :-1]
        y = s_tens[:, 1:]
        y_hat, _ = model(x, model.init_hidden())
        # Drop only the batch dimension. A bare ``squeeze()`` would also
        # remove the sequence dimension when there is a single time step
        # (e.g. an empty sentence, where only BOS -> EOS remains).
        return nll(y_hat.squeeze(0), y.squeeze(0)).item()
Exemplo n.º 9
0
def test_d3_1_setup():
    # Input and output vocabularies are both the module-level ``c2i``.
    vocab_size = len(c2i)

    generator = lm.NameGenerator(
        input_vocab_size=vocab_size,
        n_embedding_dims=25,
        n_hidden_dims=50,
        n_lstm_layers=1,
        output_vocab_size=vocab_size,
    )

    encoded = vocab.sentence_to_tensor("hello there", c2i, True)
    initial_hidden = generator.init_hidden()
    y_hat, hidden_state = generator(encoded, initial_hidden)

    # Output size check: 11 characters plus <bos> and <eos> gives 13
    # positions.
    expected_shape = torch.Size([1, 13, 77])
    eq_(expected_shape, y_hat.shape)

    # Exponentiating the first position's outputs should give values that
    # sum to 1.
    first_position = y_hat.squeeze()[0]
    sum_of_probs_at_first_pos = first_position.exp().sum().item()
    assert_almost_equals(sum_of_probs_at_first_pos, 1.0, places=3)
Exemplo n.º 10
0
def predict_one(model, s, c2i, i2l):
    """
    Predict the label of a single sentence.

    The model is switched to evaluation mode and the forward pass runs under
    ``torch.no_grad()``.
    See https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html#gradients for discussion

    :param model: The LangID model to use for prediction
    :param s: The sentence to pass through, as a string
    :param c2i: The dictionary to use to map from character to index
    :param i2l: The dictionary for mapping from output index to label
    :returns: The predicted label for s
    :rtype: str
    """
    model.eval()
    with torch.no_grad():
        encoded = vocab.sentence_to_tensor(s, c2i)
        scores = model(encoded)
        # Position of the highest score, as a plain int for the lookup.
        best_index = int(scores.argmax())
        return i2l[best_index]