def train(embedding_dimension, number_of_hidden_layers, hidden_layer_dimension,
          activation_function, number_of_training_epochs, loss_function_choice,
          optimizer_choice, learning_rate):
    train_losses = []
    train_accuracies = []
    validation_accuracies = []
    cbow = CBOW(vocab_size=len(vocab),
                num_classes=len(language_set),
                embedding_dim=embedding_dimension,
                hidden_dim=hidden_layer_dimension,
                number_of_hidden_layers=number_of_hidden_layers,
                activation_function=activation_function)
    for epoch in range(number_of_training_epochs):
        train_losses.append(
            train_epoch(epoch, cbow, X_train, y_train,
                        loss_function=loss_function_choice(),
                        optimizer=optimizer_choice(cbow.parameters(), lr=learning_rate)))
        train_accuracies.append(evaluate(cbow, X_train, y_train))
        validation_accuracies.append(evaluate(cbow, X_validation, y_validation))
    print(f"Training accuracy: {evaluate(cbow, X_train, y_train)}")
    print(f"Validation accuracy: {evaluate(cbow, X_validation, y_validation)}")
    print(f"Test accuracy: {evaluate(cbow, X_test, y_test)}")
    return evaluate(cbow, X_validation, y_validation)
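# A minimal variant of the loop above (a sketch reusing the same train_epoch,
# X_train and y_train from this script): building the loss function and
# optimizer once, outside the epoch loop, keeps optimizer state such as Adam's
# moment estimates across epochs instead of recreating it every epoch.
def train_with_persistent_optimizer(model, number_of_training_epochs,
                                    loss_function_choice, optimizer_choice,
                                    learning_rate):
    loss_function = loss_function_choice()
    optimizer = optimizer_choice(model.parameters(), lr=learning_rate)
    losses = []
    for epoch in range(number_of_training_epochs):
        losses.append(train_epoch(epoch, model, X_train, y_train,
                                  loss_function=loss_function,
                                  optimizer=optimizer))
    return losses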
def train_cbow():
    losses = []
    loss_fn = nn.NLLLoss()
    model = CBOW(vocab_size, embed_size, CONTEXT_SIZE, hidden_size)
    print(model)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    cbow_train = create_cbow_dataset(text)
    for epoch in range(n_epoch):
        total_loss = 0.0
        for context, target in cbow_train:
            ctx_idxs = [w2i[w] for w in context]
            ctx_var = Variable(torch.LongTensor(ctx_idxs))
            model.zero_grad()
            log_probs = model(ctx_var)
            loss = loss_fn(log_probs, Variable(torch.LongTensor([w2i[target]])))
            loss.backward()
            optimizer.step()
            total_loss += float(loss)
        losses.append(total_loss)
    return model, losses
from utils import *
from model import CBOW
import numpy as np
import six.moves.cPickle as pickle
import matplotlib.pyplot as plt

with open('idx2word.pkl', 'rb') as f:
    idx2word = pickle.load(f)
vocab_size = len(idx2word)
emb_size = 128

[context, target], loss, params = CBOW(vocab_size, emb_size)
load_params("model-1-epoch", params)
embeddings = params[0].get_value()
# L2-normalise each embedding row.
norm = np.sqrt(np.sum(np.square(embeddings), axis=1, keepdims=True))
normalized_embeddings = embeddings / norm


# Step 6: Visualize the embeddings.
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.savefig(filename)
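# A possible way to produce the 2-D layout consumed by plot_with_labels
# (a sketch: the t-SNE settings and the number of plotted words are
# assumptions, not part of the original script).
from sklearn.manifold import TSNE

plot_only = 500  # visualise the first 500 vocabulary entries
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
low_dim_embs = tsne.fit_transform(normalized_embeddings[:plot_only, :])
labels = [idx2word[i] for i in range(plot_only)]
plot_with_labels(low_dim_embs, labels)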
data = helper.create_dataset(CONTEXT_SIZE)
train_data, test_data = helper.split_dataset(data, train_ratio)
word_to_ix = helper.create_dictionary()
VOCAB_SIZE = len(word_to_ix)

test_inputs, test_labels = helper.make_batch(test_data, -1)
train_inputs, train_labels = helper.make_batch(train_data, batch_size)
test_inputs = Variable(helper.vectorize_data(test_inputs, word_to_ix), requires_grad=False)
test_labels = Variable(helper.vectorize_data(test_labels, word_to_ix).view(-1), requires_grad=False)

model = CBOW(VOCAB_SIZE, EMB_SIZE)
optimizer = optim.SGD(model.parameters(), lr=lr)
criterion = nn.NLLLoss()

# Before training
print("TESTING BEFORE TRAINING---------")
model.eval()
print('LOSS: ' + str(evaluate.eval_loss(model, test_inputs, test_labels, criterion)))
model.zero_grad()

print('TRAINING STARTS------------------')
model.train()
if load:
    model.load_state_dict(torch.load(save_file)['model'])
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.autograd import Variable

EMB_DIM = 50

article = get_words()
tokens = check(PreProcessor().clean(article))
# vec_txt = encode(tokens)
x, y = prep_train(tokens)
print("We out here!")

embs, w2x = w2v_dict_to_torch_emb(w2v)
print("Made it!")

cbow = CBOW(len(w2v), EMB_DIM, embs)
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(cbow.parameters(), lr=0.001)
print("Almost there!")

# train
# TODO: save model with torch every few epochs
for epoch in range(50):
    losses = []
    total_loss = 0
    for context, target in tqdm(zip(x, y)):
        cbow.zero_grad()
        context = list(map(lambda w: w2x[w], context))
        log_probs = cbow(Variable(torch.LongTensor(context)))
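        # Sketch of the usual remaining steps in the loop body (the w2x[target]
        # lookup for the label index is an assumption, not from the original file):
        loss = loss_function(log_probs, Variable(torch.LongTensor([w2x[target]])))
        loss.backward()
        optimizer.step()
        total_loss += float(loss)
    print("epoch %d: loss %.3f" % (epoch, total_loss))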
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])


def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return autograd.Variable(torch.LongTensor(idxs))


losses = []
loss_function = nn.NLLLoss()
model = CBOW(vocab_size, EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
    total_loss = torch.Tensor([0])
    for context, target in data:
        # Step 1. Prepare the inputs to be passed to the model (i.e. turn the
        # words into integer indices and wrap them in variables).
        context_var = make_context_vector(context, word_to_ix)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in
        # a new instance, zero out the gradients from the old instance.
        model.zero_grad()

        # Step 3. Run the forward pass.
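        # Steps 3-5, a sketch of the usual pattern: forward pass, NLL loss
        # against the target word's index, backward pass, and an optimizer step.
        log_probs = model(context_var)
        loss = loss_function(log_probs,
                             autograd.Variable(torch.LongTensor([word_to_ix[target]])))
        loss.backward()
        optimizer.step()
        total_loss += loss.data
    losses.append(total_loss)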
# coding:utf-8
from dataset import DataSet
from model import CBOW
from torch import optim
import torch

HIDDEN = 100
LR = 0.001
LOG_EVERY = 10
EPOCH = 4
BATCH = 200
WINDOW = 2
NNEG = 4

if __name__ == '__main__':
    dataset = DataSet(nepoch=EPOCH, nbatch=BATCH, window=WINDOW, nneg=NNEG)
    model = CBOW(dataset.nvocab, 100)
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    for targets, contexts, negtives in dataset:
        optimizer.zero_grad()
        loss = model(targets, contexts, negtives)
        loss.backward()
        optimizer.step()
        if dataset.iter % LOG_EVERY == 0:
            print("[iter %-4d epoch %-2d batch %-3d] loss %-.3f"
                  % (dataset.iter, dataset.epoch, dataset.batch, loss.item()))
    torch.save(model.wordemb, 'data/wordemb.pth')
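# Follow-up sketch (not part of the script above): reload the saved embedding
# table and query nearest neighbours by cosine similarity. Whether `wordemb`
# is an nn.Embedding or a raw tensor is an assumption handled below.
import torch

wordemb = torch.load('data/wordemb.pth')
W = wordemb.weight.data if hasattr(wordemb, 'weight') else wordemb  # (nvocab, ndim)
W = W / W.norm(dim=1, keepdim=True)

def nearest(word_idx, k=5):
    sims = W @ W[word_idx]                          # cosine similarity to every row
    return sims.topk(k + 1).indices[1:].tolist()    # drop the query word itself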
researchName = list(map(lambda x: x.lower(), researchName))

trainingDataRaw = []
for i in range(2, len(researchName) - 2):
    context = [researchName[i - 2], researchName[i - 1],
               researchName[i + 1], researchName[i + 2]]
    target = researchName[i]
    trainingDataRaw.append((context, target))

#### lookup table for vector ####
vocabs = set(researchName)
wordToIndex = {word: i for i, word in enumerate(vocabs)}

#### Model Initialization ####
criterion = nn.NLLLoss()
model = CBOW(len(vocabs), 5, 4)
optimizer = optim.Adam(model.parameters(), lr=0.01)

#### Training loop ####
for epoch in range(3):
    for context, target in trainingDataRaw:
        contextIdx = torch.tensor([wordToIndex[word] for word in context], dtype=torch.long)
        model.zero_grad()
        output = model(contextIdx)
        loss = criterion(output, torch.tensor([wordToIndex[target]], dtype=torch.long))
        loss.backward()
        optimizer.step()

print(model.getEmbeddingMatrix(contextIdx))
parser = argparse.ArgumentParser()
parser.add_argument('--idx2word_file', default='idx2word.pkl', type=str)
parser.add_argument('--params_file', default='model-1-epoch', type=str)
parser.add_argument('--emb_size', default=128, type=int)
parser.add_argument('--eval_data', default='questions-words.txt', type=str)
args = parser.parse_args()

emb_size = args.emb_size
with open(args.idx2word_file, 'rb') as f:
    idx2word = pickle.load(f)
vocab_size = len(idx2word)
word2idx = {idx2word[idx]: idx for idx in idx2word}

_, _, params = CBOW(vocab_size, emb_size)
load_params(args.params_file, params)
embeddings = params[0]
norm = T.sqrt(T.sum(T.sqr(embeddings), axis=1, keepdims=True))
normalized_embeddings = embeddings / norm
predict = get_analogy_prediction_model(normalized_embeddings, emb_size, vocab_size)

"""Evaluate analogy questions and reports accuracy."""
# How many questions we get right at precision@1.
correct = 0

analogy_data = read_analogies(args.eval_data, word2idx)
analogy_questions = analogy_data[:, :3]
answers = analogy_data[:, 3]
class Word2Vec:
    def __init__(self, input_file_name, output_file_name, emb_dimension=100,
                 batch_size=100, window_size=5, iteration=5, initial_lr=0.025,
                 min_count=5, using_hs=False, using_neg=False, context_size=2,
                 hidden_size=128, cbow=None, skip_gram=None):
        print("\nInput File loading......\n")
        self.data = InputData(input_file_name, min_count)
        print("\nInput File loaded.\n")
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.context_size = context_size
        self.hidden_size = hidden_size
        self.using_hs = using_hs
        self.using_neg = using_neg
        self.cbow = cbow
        self.skip_gram = skip_gram
        if self.skip_gram is not None and self.skip_gram:
            self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
            print("skip_gram_model", self.skip_gram_model)
            self.optimizer = optim.SGD(self.skip_gram_model.parameters(), lr=self.initial_lr)
        if self.cbow is not None and self.cbow:
            self.cbow_model = CBOW(self.emb_size, self.emb_dimension)
            print("CBOW_model", self.cbow_model)
            self.optimizer = optim.SGD(self.cbow_model.parameters(), lr=self.initial_lr)

    def skip_gram_train(self):
        """Multiple training.

        Returns:
            None.
        """
        print("Skip_Gram Training......")
        pair_count = self.data.evaluate_pair_count(self.window_size)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            'skip_gram_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size, self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_pairs_by_huffman(pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_pairs_by_neg_sampling(pos_pairs, 5)
            pos_u = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [int(pair[0]) for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]
            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()
            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f"
                % (loss.item(), self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f"
                  % (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        print("Skip_Gram Trained and Saving File......")
        self.skip_gram_model.save_embedding(self.data.id2word, self.output_file_name)
        print("Skip_Gram Trained and Saved File.")

    def cbow_train(self):
        print("CBOW Training......")
        self.cbow_model.save_embedding(self.data.id2word, 'cbow_begin_embedding.txt')
        pos_all_pairs = self.data.get_cbow_batch_all_pairs(self.batch_size, self.context_size)
        pair_count = len(pos_all_pairs)
        process_bar = tqdm(range(int(pair_count / self.batch_size)))
        for _ in process_bar:
            pos_pairs = self.data.get_cbow_batch_pairs(self.batch_size, self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_huffman(pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_neg_sampling(pos_pairs, self.context_size)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]
            self.optimizer.zero_grad()
            loss = self.cbow_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()
        print("CBOW Trained and Saving File......")
        self.cbow_model.save_embedding(self.data.id2word, self.output_file_name)
        print("CBOW Trained and Saved File.")
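# Minimal usage sketch for the class above; the corpus and output file names
# are placeholders, not from the original project.
w2v = Word2Vec(input_file_name='corpus.txt',
               output_file_name='cbow_embedding.txt',
               emb_dimension=100,
               cbow=True)
w2v.cbow_train()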
class Word2Vec:
    def __init__(self, input_file_name, output_file_name, emb_dimension=100,
                 batch_size=100, window_size=5, iteration=5, initial_lr=0.025,
                 min_count=5, using_hs=False, using_neg=False, context_size=2,
                 hidden_size=128, cbow=None, skip_gram=None):
        """Initialize class parameters.

        Args:
            input_file_name: Name of a text data file. Each line is a sentence split on spaces.
            output_file_name: Name of the final embedding file.
            emb_dimension: Embedding dimension, typically from 50 to 500.
            batch_size: The count of word pairs for one forward pass.
            window_size: Max skip length between words.
            iteration: Controls the number of training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency; words with lower frequency are filtered out.
            using_hs: Whether to use hierarchical softmax.

        Returns:
            None.
        """
        print("\nInput File loading......\n")
        self.data = InputData(input_file_name, min_count)
        print("\nInput File loaded.\n")
        print("Input Data", self.data)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        print("emb_size", self.emb_size)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.context_size = context_size
        self.hidden_size = hidden_size
        self.using_hs = using_hs
        self.using_neg = using_neg
        self.cbow = cbow
        self.skip_gram = skip_gram
        if self.skip_gram is not None and self.skip_gram:
            self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
            print("skip_gram_model", self.skip_gram_model)
            self.optimizer = optim.SGD(self.skip_gram_model.parameters(), lr=self.initial_lr)
        if self.cbow is not None and self.cbow:
            # self.cbow_model = CBOW(self.emb_size, self.context_size, self.emb_dimension, self.hidden_size)
            self.cbow_model = CBOW(self.emb_size, self.emb_dimension)
            print("CBOW_model", self.cbow_model)
            self.optimizer = optim.SGD(self.cbow_model.parameters(), lr=self.initial_lr)

    # @profile
    def skip_gram_train(self):
        """Multiple training.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        self.skip_gram_model.save_embedding(self.data.id2word,
                                            'skip_gram_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size, self.window_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_pairs_by_huffman(pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_pairs_by_neg_sampling(pos_pairs, 5)
            pos_u = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [int(pair[0]) for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]
            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()
            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f"
                % (loss.item(), self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f"
                  % (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(self.data.id2word, self.output_file_name)

    def cbow_train(self):
        print("CBOW Training......")
        pair_count = self.data.evaluate_pair_count(self.context_size * 2 + 1)
        print("pair_count", pair_count)
        batch_count = self.iteration * pair_count / self.batch_size
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        self.cbow_model.save_embedding(self.data.id2word, 'cbow_begin_embedding.txt')
        for i in process_bar:
            pos_pairs = self.data.get_cbow_batch_all_pairs(self.batch_size, self.context_size)
            if self.using_hs:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_huffman(pos_pairs)
            else:
                pos_pairs, neg_pairs = self.data.get_cbow_pairs_by_neg_sampling(pos_pairs, self.context_size)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]
            self.optimizer.zero_grad()
            loss = self.cbow_model.forward(pos_u, pos_v, neg_u, neg_v)
            # loss = self.cbow_model.forwards(pos_v, pos_u, neg_v, neg_u)
            loss.backward()
            self.optimizer.step()
            process_bar.set_description(
                "Loss: %0.8f, lr: %0.6f"
                % (loss.item(), self.optimizer.param_groups[0]['lr']))
            print("Loss: %0.8f, lr: %0.6f"
                  % (loss.item(), self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        print("CBOW Trained and Saving File......")
        self.cbow_model.save_embedding(self.data.id2word, self.output_file_name)
        print("CBOW Trained and Saved File.")
N_EPOCH = 10000

if __name__ == '__main__':
    # dataset, dataloader
    trainset = PTBdata(path='ptb.train.txt', window=WINDOW, limit=100)
    trainloader = DataLoader(dataset=trainset, batch_size=N_BATCH, shuffle=True)

    # make vocab
    vocab_size = trainset.vocab_size
    word2idx = trainset.word2idx
    idx2word = trainset.idx2word

    # model, loss, optimizer
    model = CBOW(vocab_size, N_EMBED)
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    for epoch in range(N_EPOCH):
        running_loss = 0
        for i, data in enumerate(trainloader):
            targets = data['target']
            contexts = data['context']

            prob = model(contexts)
            loss = criterion(prob, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
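            # A likely continuation (a sketch, not from the original excerpt):
            # accumulate the batch losses and log the per-epoch average.
            running_loss += loss.item()
        print('epoch %d, loss %.4f' % (epoch, running_loss / (i + 1)))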
def train(embedding_dimension, number_of_hidden_layers, hidden_layer_dimension,
          activation_function, number_of_training_epochs, loss_function_choice,
          optimizer_choice, learning_rate, use_rnn=False, use_LSTM=False, use_GRU=False):
    train_losses = []
    train_accuracies = []
    validation_accuracies = []

    # Dynamically select the model.
    if use_rnn:
        print("Using Seq2Vec...")
        RNN_layer = nn.RNN
        if use_LSTM:
            print("Using LSTM...")
            RNN_layer = nn.LSTM
        elif use_GRU:
            print("Using GRU...")
            RNN_layer = nn.GRU
        model = Seq2Vec(vocab_size=len(vocab),
                        num_classes=len(language_set),
                        embedding_dim=embedding_dimension,
                        hidden_dim=hidden_layer_dimension,
                        number_of_hidden_layers=number_of_hidden_layers,
                        activation_function=activation_function,
                        RNN_layer=RNN_layer)
    else:
        model = CBOW(vocab_size=len(vocab),
                     num_classes=len(language_set),
                     embedding_dim=embedding_dimension,
                     hidden_dim=hidden_layer_dimension,
                     number_of_hidden_layers=number_of_hidden_layers,
                     activation_function=activation_function)

    for epoch in range(number_of_training_epochs):
        train_losses.append(
            train_epoch(epoch, model, X_train, y_train,
                        loss_function=loss_function_choice(),
                        optimizer=optimizer_choice(model.parameters(), lr=learning_rate)))
        train_accuracies.append(evaluate(model, X_train, y_train))
        validation_accuracies.append(evaluate(model, X_validation, y_validation))

    # Print the train, validation, and test accuracies from the final epoch.
    train_accuracy = train_accuracies[-1]
    validation_accuracy = validation_accuracies[-1]
    test_accuracy = evaluate(model, X_test, y_test)
    print(f"Training accuracy: {train_accuracy}")
    print(f"Validation accuracy: {validation_accuracy}")
    print(f"Test accuracy: {test_accuracy}")
    print("")
    print(f"Training accuracies: {train_accuracies}")
    print(f"Validation accuracies: {validation_accuracies}")
    return train_accuracy, validation_accuracy, test_accuracy, train_accuracies, validation_accuracies
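# Hypothetical call of the function above; the hyperparameter values are
# placeholders, not the settings used in the original experiments.
train(embedding_dimension=64,
      number_of_hidden_layers=1,
      hidden_layer_dimension=128,
      activation_function=nn.ReLU,
      number_of_training_epochs=10,
      loss_function_choice=nn.CrossEntropyLoss,
      optimizer_choice=optim.Adam,
      learning_rate=1e-3,
      use_rnn=False)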
def __init__(self, mode, vocab_dim, embed_dim, sparse):
    self.mode = mode
    if self.mode == 'cbow':
        self.model = CBOW(vocab_dim, embed_dim, sparse)
    elif self.mode == 'skip-gram':
        self.model = SkipGram(vocab_dim, embed_dim, sparse)