def test_d1_3_sentence_tensor():
    global tiny_corpus

    c2i, i2c = vocab.build_vocab(tiny_corpus)

    s = "This is a sentence."
    s_tens = vocab.sentence_to_tensor(s, c2i)

    eq_(s_tens.shape[0], 1)
    eq_(s_tens.shape[1], len(s))
    eq_(len(s_tens.shape), 2)

    # does BOS/EOS padding work?
    with_padding = vocab.sentence_to_tensor(s, c2i, True)
    eq_(with_padding.shape[1], len(s) + 2)
    eq_(with_padding[0, 0].item(), c2i[vocab.BOS_SYM])
def predict_one(model, s, c2i, i2l):
    """
    Runs a sentence, s, through the model, and returns the predicted label.

    Make sure to use "torch.no_grad()"!
    See https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html#gradients for discussion

    :param model: The LangID model to use for prediction
    :param s: The sentence to pass through, as a string
    :param c2i: The dictionary to use to map from character to index
    :param i2l: The dictionary for mapping from output index to label

    :returns: The predicted label for s
    :rtype: str
    """
    with torch.no_grad():
        sentence_tensor = vocab.sentence_to_tensor(s, c2i)
        y_pred = model(sentence_tensor)

        # find the index with the highest log probability and map it back to its label
        list_predictions = y_pred.tolist()
        max_value = max(list_predictions)
        max_index = list_predictions.index(max_value)
        predicted_label = i2l[max_index]

    return predicted_label
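# A minimal sketch of how predict_one() could be used to spot-check accuracy over a
# handful of labeled examples. The `sample_pairs` list and the already-trained
# `model`, `c2i`, and `i2l` objects are assumptions for illustration; this is not
# necessarily how the project's eval_acc is implemented.
def sketch_accuracy_check(model, sample_pairs, c2i, i2l):
    """Return the fraction of (sentence, gold_label) pairs that predict_one gets right."""
    n_correct = 0
    for sentence, gold_label in sample_pairs:
        if predict_one(model, sentence, c2i, i2l) == gold_label:
            n_correct += 1
    return n_correct / len(sample_pairs)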
def train(model, epochs, training_data, c2i):
    """
    Train model for the specified number of epochs, over the provided training data.

    Make sure to shuffle the training data at the beginning of each epoch!
    """
    opt = torch.optim.Adam(model.parameters())
    loss_function = torch.nn.NLLLoss()

    loss_batch_size = 100

    for _ in range(epochs):
        random.shuffle(training_data)

        loss = 0
        for idx, name in enumerate(training_data):
            if idx % loss_batch_size == 0:
                opt.zero_grad()

            # predict each character from the ones that precede it
            x_tens = vocab.sentence_to_tensor(name[:-1], c2i, True)
            y_tens = vocab.sentence_to_tensor(name[1:], c2i, True)

            y_hat, _ = model(x_tens, model.init_hidden())
            loss += loss_function(y_hat[-1], y_tens[-1])

            if idx % 1000 == 0:
                print(f"{idx}/{len(training_data)} average per-item loss: {loss / loss_batch_size}")

            if idx % loss_batch_size == 0 and idx > 0:
                # send back gradients:
                loss.backward()
                # now, tell the optimizer to update our weights:
                opt.step()
                loss = 0

        # now one last time, for any leftover partial batch
        # (skip if the loss was just reset and is still the plain int 0):
        if torch.is_tensor(loss):
            loss.backward()
            opt.step()

    return model
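# A minimal usage sketch for train() above, assuming a list of name strings and the
# lm.NameGenerator class exercised in the tests below. The `names` argument and the
# hyperparameter values are illustrative assumptions, not part of the assignment.
def sketch_train_name_generator(names):
    c2i, i2c = vocab.build_vocab(names)
    mod = lm.NameGenerator(input_vocab_size=len(c2i),
                           n_embedding_dims=25,
                           n_hidden_dims=50,
                           n_lstm_layers=1,
                           output_vocab_size=len(c2i))
    # train() shuffles its training data in place, so pass a mutable copy
    return train(mod, epochs=5, training_data=list(names), c2i=c2i)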
def test_train_model(self):
    # Adding prefix _ to local variables
    # Noticed some variation in the scores every time I run it.
    # Don't want to pollute my variables with globals when running all tests
    _bi_text = pd.read_csv("../../data/sentences_bilingual.csv")
    _bi_text_train, _bi_text_test = train_test_split(_bi_text, test_size=0.2)

    _c2i, _i2c = vocab.build_vocab(_bi_text.sentence.values)
    _l2i, _i2l = vocab.build_label_vocab(_bi_text.lang.values)

    _li = lang_id.LangID(input_vocab_n=len(_c2i),
                         embedding_dims=10,
                         hidden_dims=20,
                         lstm_layers=1,
                         output_class_n=2)

    _trained_model = lang_id.train_model(model=_li,
                                         n_epochs=1,
                                         training_data=_bi_text_train,
                                         c2i=_c2i,
                                         i2c=_i2c,
                                         l2i=_l2i,
                                         i2l=_i2l)

    _trained_model(vocab.sentence_to_tensor("this is a sentence", _c2i))
    _trained_model(vocab.sentence_to_tensor("quien estas", _c2i))

    _acc, _y_hat = lang_id.eval_acc(_trained_model, _bi_text_test, _c2i, _i2c, _l2i, _i2l)
    print(f"Trained Accuracy: {_acc}")

    _untrained = lang_id.LangID(input_vocab_n=len(_c2i),
                                embedding_dims=10,
                                hidden_dims=20,
                                lstm_layers=1,
                                output_class_n=2)
    _acc_untrained, _y_hat_untrained = lang_id.eval_acc(
        _untrained, _bi_text_test, _c2i, _i2c, _l2i, _i2l)
    print(f"Untrained Accuracy: {_acc_untrained}")

    assert_greater(_acc, 0.89)
def train_model(model, n_epochs, training_data, c2i, i2c, l2i, i2l):
    """
    Train using the Adam optimizer.

    :returns: The trained model, as well as a list of average loss values from during
        training (for visualizing loss stability, etc.)
    """
    opt = torch.optim.Adam(model.parameters())
    # since our model gives negative log probs on the output side
    loss_func = torch.nn.NLLLoss()

    loss_batch_size = 100

    for i in range(n_epochs):
        x_train = training_data.sentence.values
        y_train = training_data.lang.values

        # There's a more pandas-ish way to do this...
        pairs = list(zip(x_train, y_train))
        random.shuffle(pairs)

        loss = 0
        for x_idx, (x, y) in enumerate(pairs):
            if x_idx % loss_batch_size == 0:
                opt.zero_grad()

            x_tens = vocab.sentence_to_tensor(x, c2i)
            y_hat = model(x_tens)
            y_tens = torch.tensor(l2i[y])

            # NLLLoss expects a batch dimension on both the prediction and the target
            loss += loss_func(y_hat.unsqueeze(0), y_tens.unsqueeze(0))

            if x_idx % 1000 == 0:
                print(f"{x_idx}/{len(pairs)} average per-item loss: {loss / loss_batch_size}")

            if x_idx % loss_batch_size == 0 and x_idx > 0:
                # send back gradients:
                loss.backward()
                # now, tell the optimizer to update our weights:
                opt.step()
                loss = 0

        # now one last time, for any leftover partial batch
        # (skip if the loss was just reset and is still the plain int 0):
        if torch.is_tensor(loss):
            loss.backward()
            opt.step()

    return model
def test_train_model_embed2_hidden2(self):
    # Adding prefix _ to local variables
    # Noticed some variation in the scores every time I run it.
    # Don't want to pollute my variables with globals when running all tests
    _bi_text = pd.read_csv("../../data/sentences_bilingual.csv")
    _bi_text_train, _bi_text_test = train_test_split(_bi_text, test_size=0.2)

    _c2i, _i2c = vocab.build_vocab(_bi_text.sentence.values)
    _l2i, _i2l = vocab.build_label_vocab(_bi_text.lang.values)

    _li = lang_id.LangID(input_vocab_n=len(_c2i),
                         embedding_dims=2,
                         hidden_dims=2,
                         lstm_layers=1,
                         output_class_n=2)

    _trained_model = lang_id.train_model(model=_li,
                                         n_epochs=1,
                                         training_data=_bi_text_train,
                                         c2i=_c2i,
                                         i2c=_i2c,
                                         l2i=_l2i,
                                         i2l=_i2l)

    _trained_model(vocab.sentence_to_tensor("this is a sentence", _c2i))
    _trained_model(vocab.sentence_to_tensor("quien estas", _c2i))

    _acc, _y_hat = lang_id.eval_acc(_trained_model, _bi_text_test, _c2i, _i2c, _l2i, _i2l)
    print(f"Trained Accuracy: {_acc}")

    # save the per-sentence predictions alongside the gold labels
    pred_label_list = np.asarray(_y_hat)
    pd.options.mode.chained_assignment = None
    _bi_text_test["predicted"] = pred_label_list
    _bi_text_test = _bi_text_test[["sentence", "lang", "predicted"]]
    _bi_text_test.to_csv("../../data/deliverable_2.4.csv", index=False)

    assert_greater(_acc, 0.89)
def test_d2_1_forward():
    global bt_c2i

    li = lang_id.LangID(input_vocab_n=len(bt_c2i),
                        embedding_dims=10,
                        hidden_dims=20,
                        lstm_layers=1,
                        output_class_n=2)

    out = li(vocab.sentence_to_tensor("this is a sentence", bt_c2i))
    eq_(out.shape[0], 2)
def compute_prob(model, sentence, c2i):
    """
    Compute the negative log probability of p(sentence)

    Equivalent to equation 3.3 in Jurafsky & Martin.
    """
    nll = nn.NLLLoss(reduction='sum')

    with torch.no_grad():
        s_tens = vocab.sentence_to_tensor(sentence, c2i, True)

        # predict each character from the ones that precede it
        x = s_tens[:, :-1]
        y = s_tens[:, 1:]

        y_hat, _ = model(x, model.init_hidden())

        # squeeze() gets rid of the first (batch) dimension of each tensor
        return nll(y_hat.squeeze(), y.squeeze()).item()
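# A small sketch built on compute_prob(): since it returns the summed negative log
# probability of a sentence, dividing by the number of predicted characters and
# exponentiating gives a per-character perplexity, which is handier for comparing
# sentences of different lengths. The trained `model` and `c2i` are assumed to exist.
import math

def sketch_perplexity(model, sentence, c2i):
    total_nll = compute_prob(model, sentence, c2i)
    # with BOS/EOS padding, the model predicts every position after <bos>,
    # i.e. len(sentence) + 1 targets
    n_predictions = len(sentence) + 1
    return math.exp(total_nll / n_predictions)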
def test_d3_1_setup():
    global c2i, i2c

    mod = lm.NameGenerator(input_vocab_size=len(c2i),
                           n_embedding_dims=25,
                           n_hidden_dims=50,
                           n_lstm_layers=1,
                           output_vocab_size=len(c2i))

    x = vocab.sentence_to_tensor("hello there", c2i, True)
    y_hat, hidden_state = mod(x, mod.init_hidden())

    # is the output the proper size?
    eq_(torch.Size([1, 13, 77]), y_hat.shape)  # counting <bos> and <eos>

    # do things add up to 1?
    sum_of_probs_at_first_pos = y_hat.squeeze()[0].exp().sum().item()
    assert_almost_equals(sum_of_probs_at_first_pos, 1.0, places=3)
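# A hedged generation sketch for the NameGenerator tested above: feed <bos>, then
# repeatedly sample the next character from the model's log-probability output.
# It assumes vocab defines an EOS_SYM counterpart to BOS_SYM and that the forward
# pass returns (log_probs, hidden_state) as in the test above; treat the details as
# illustrative rather than the assignment's reference implementation.
def sketch_sample_name(mod, c2i, i2c, max_len=30):
    with torch.no_grad():
        hidden = mod.init_hidden()
        next_idx = torch.tensor([[c2i[vocab.BOS_SYM]]])  # shape [1, 1]
        out_chars = []
        for _ in range(max_len):
            log_probs, hidden = mod(next_idx, hidden)
            # sample from the distribution at the single position we just fed in
            dist = log_probs.squeeze(0)[-1].exp()
            next_idx = torch.multinomial(dist, 1).unsqueeze(0)  # back to [1, 1]
            ch = i2c[next_idx.item()]
            if ch == vocab.EOS_SYM:
                break
            out_chars.append(ch)
        return "".join(out_chars)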
def predict_one(model, s, c2i, i2l):
    """
    Runs a sentence, s, through the model, and returns the predicted label.

    Make sure to use "torch.no_grad()"!
    See https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html#gradients for discussion

    :param model: The LangID model to use for prediction
    :param s: The sentence to pass through, as a string
    :param c2i: The dictionary to use to map from character to index
    :param i2l: The dictionary for mapping from output index to label

    :returns: The predicted label for s
    :rtype: str
    """
    model.eval()
    with torch.no_grad():
        sent_vect = vocab.sentence_to_tensor(s, c2i)
        log_likes = model(sent_vect)
        # argmax over the log likelihoods gives the index of the most probable label
        label = i2l[int(log_likes.argmax())]
    return label