def __init__(self, batch_size=2, steps=100, validate=False):
    """Set up data loaders, model, optimizer and loss for the POS-tagging task.

    Args:
        batch_size: mini-batch size; must not exceed the dataset size.
        steps: number of training steps (kept for interface compatibility;
            not read in this constructor).
        validate: when True, hold out the last 10% of the dataset as a
            validation split.
    """
    super(BaseTask, self).__init__()
    dataset = SimplePOSTaggerDataset()
    n_total = len(dataset)
    assert batch_size <= n_total, "`batch_size` is greater than size of dataset"
    if validate:
        # Deterministic 90/10 train/validation split over index ranges.
        n_train = int(n_total * 0.9)
        self.train_loader = DataLoader(
            Subset(dataset, range(n_train)),
            batch_size=batch_size,
            collate_fn=collate_wrapper,
        )
        self.val_loader = DataLoader(
            Subset(dataset, range(n_train, n_total)),
            batch_size=batch_size,
            collate_fn=collate_wrapper,
        )
    else:
        self.train_loader = DataLoader(
            dataset, batch_size=batch_size, collate_fn=collate_wrapper
        )
        self.val_loader = None
    # Vocabulary = union of all word types appearing in the dataset.
    vocab_set = set.union(*[set(v) for v in dataset.data])
    vocab_to_ix = {vocab: i for i, vocab in enumerate(vocab_set)}
    tagset_size = len(dataset.tag_to_ix)
    self.batch_size = batch_size
    self.model = LSTMTagger(6, 6, tagset_size, vocab_to_ix)
    self.optimizer = optim.SGD(self.model.parameters(), lr=1e-5)
    self.criterion = nn.NLLLoss()
    # BUG FIX: the original hard-coded torch.device("cuda"), which crashes on
    # CPU-only machines; fall back to CPU when CUDA is unavailable.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def main():
    """Entry point: load the labelled corpus, build the tagger and train it."""
    opts = parse_args()
    dataset = LabeledDataset(opts.train_file)
    batches = DataLoader(dataset, batch_size=opts.batch_size)
    tagger = LSTMTagger(len(dataset.x_vocab), len(dataset.y_vocab), 20, 64, 1)
    if use_cuda:
        tagger = tagger.cuda()
    train(tagger, batches, epochs=2)
def __init__(self, PATH):
    """Load serialized torchtext fields and a trained tagger for inference.

    Args:
        PATH: path to the saved model ``state_dict``.
    """
    # BUG FIX: pickle.load(open(...)) leaked the file handles; use context
    # managers so the files are closed deterministically.
    with open('TEXT.pkl', 'rb') as f:
        self.TEXT = pickle.load(f)
    with open('LABELS.pkl', 'rb') as f:
        self.LABELS = pickle.load(f)
    self.BATCH_SIZE = 1
    self.INPUT_DIM = len(self.TEXT.vocab)
    self.EMBEDDING_DIM = 100
    self.HIDDEN_DIM = 256
    self.OUTPUT_DIM = len(self.LABELS.vocab)
    self.criterion = nn.CrossEntropyLoss()
    self.criterion = self.criterion.to(device)
    self.PATH = PATH
    self.model = LSTMTagger(self.EMBEDDING_DIM, self.HIDDEN_DIM,
                            self.INPUT_DIM, self.OUTPUT_DIM)
    self.model.load_state_dict(torch.load(PATH))
    self.model.to(device)
    # Inference-only object: switch off dropout/batch-norm training behavior.
    self.model.eval()
def train():
    """Train the LSTM tagger, printing loss and train/dev accuracy per epoch."""
    # BUG FIX: the original called torch.initial_seed(), which only *returns*
    # the current seed and does not make training reproducible.
    torch.manual_seed(1)
    print_hyperparameters()
    # Loading data and preprocessing.
    print('Reading data...')
    data_raw = readtrain()
    data_raw_dev = readdev()
    print('Training Data size', len(data_raw))
    print('Dev data size', len(data_raw_dev))
    print('Preparing data...')
    tag_to_index, word_to_index, index_to_tag, index_to_word = prepare_embedding(
        data_raw)
    idxs = [tag_to_index, word_to_index, index_to_tag, index_to_word]
    dataset = get_loader(data_raw, idxs)
    dev_dataset = get_loader(data_raw_dev, idxs)
    # Save the index mappings so predict() can reload them later.
    with open('data.pickle', 'wb') as f:
        pickle.dump([tag_to_index, word_to_index, index_to_tag, index_to_word],
                    f, pickle.HIGHEST_PROTOCOL)
    # Create an instance of the NN.
    model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_index),
                       len(tag_to_index))
    print('Source size:', len(word_to_index))
    print('Target size:', len(tag_to_index))
    # size_average=True was the default; the kwarg itself is deprecated.
    loss_function = nn.NLLLoss()
    if USE_CUDA:
        model = model.cuda()
    if OPTIMIZER == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
    else:
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    # Train.
    print('Train with', len(dataset), 'batches.')
    for epoch in range(EPOCHS):
        print(f'Starting epoch {epoch}.')
        loss_sum = 0
        y_true = list()
        y_pred = list()
        for batch, lengths, targets, lengths2 in tqdm(dataset):
            model.zero_grad()
            pred = model(autograd.Variable(batch), lengths.cpu().numpy())
            # Flatten (batch, seq, tags) / (batch, seq) for token-level NLL.
            loss = loss_function(
                pred.view(-1, pred.size()[2]),
                autograd.Variable(targets).view(-1, 1).squeeze(1))
            loss.backward()
            optimizer.step()
            # BUG FIX: loss.data[0] was removed in torch >= 0.5; .item() works
            # from 0.4 onwards.
            loss_sum += loss.item()
            # NOTE(review): pred is (batch, seq, tags), so dim=2 looks like the
            # intended argmax axis; y_pred is only consumed by the (removed)
            # commented-out accuracy code, so behavior is preserved as-is.
            pred_idx = torch.max(pred, 1)[1]
            y_true += list(targets.int())
            y_pred += list(pred_idx.data.int())
        loss_total = loss_sum / len(dataset)
        print('>>> Loss:', loss_total)
        acc = predict(dataset, model_=model, idxs=idxs)
        print("Accuracy on train:", acc)
        acc = predict(dev_dataset, model_=model, idxs=idxs)
        print("Accuracy on dev:", acc)
class EstTokenizer:
    """Character-level LSTM tokenizer: loads a trained tagger and segments
    raw text into sentences of tokens, optionally as CoNLL-U lines."""

    def __init__(self, PATH):
        """Load serialized torchtext fields and the model checkpoint at PATH."""
        # BUG FIX: pickle.load(open(...)) leaked the file handles.
        with open('TEXT.pkl', 'rb') as f:
            self.TEXT = pickle.load(f)
        with open('LABELS.pkl', 'rb') as f:
            self.LABELS = pickle.load(f)
        self.BATCH_SIZE = 1
        self.INPUT_DIM = len(self.TEXT.vocab)
        self.EMBEDDING_DIM = 100
        self.HIDDEN_DIM = 256
        self.OUTPUT_DIM = len(self.LABELS.vocab)
        self.criterion = nn.CrossEntropyLoss()
        self.criterion = self.criterion.to(device)
        self.PATH = PATH
        self.model = LSTMTagger(self.EMBEDDING_DIM, self.HIDDEN_DIM,
                                self.INPUT_DIM, self.OUTPUT_DIM)
        self.model.load_state_dict(torch.load(PATH))
        self.model.to(device)
        self.model.eval()

    def __binary_accuracy(self, preds, y):
        """
        Returns accuracy per batch, i.e. if you get 8/10 right,
        this returns 0.8, NOT 8
        """
        # Argmax over the class axis after a sigmoid squash.
        _, rounded_preds = torch.max(torch.sigmoid(preds), 1)
        correct = (rounded_preds == y).float()  # convert into float for division
        acc = correct.sum() / len(correct)
        return acc

    def evaluate(self, iterator):
        """Return (mean loss, mean accuracy) of the model over `iterator`."""
        epoch_loss = 0
        epoch_acc = 0
        self.model.eval()
        with torch.no_grad():
            for batch in iterator:
                t, l = batch.text
                predictions = self.model(t, l)
                # Flatten (batch, seq, classes) to (batch*seq, classes).
                predictions = predictions.reshape(-1, predictions.size()[-1])
                predictions = predictions.float()
                labels = batch.labels.reshape(-1)
                labels = labels.long()
                loss = self.criterion(predictions, labels)
                acc = self.__binary_accuracy(predictions, labels)
                epoch_loss += loss.item()
                epoch_acc += acc.item()
        return epoch_loss / len(iterator), epoch_acc / len(iterator)

    def tokenize(self, text, output='conllu'):
        """Segment `text` (newline-separated sentences) into tokens.

        Returns a list of sentences; each sentence is a list of tokens, or of
        (token, space_after) pairs when output == 'conllu'.
        """
        text = [t for t in text.split("\n") if len(t) > 0]
        examples = [data.Example().fromlist([t], fields=[('text', self.TEXT)])
                    for t in text]
        dataset = data.Dataset(examples, fields=[('text', self.TEXT)])
        data_iter = data.BucketIterator(dataset, batch_size=self.BATCH_SIZE,
                                        sort_key=lambda x: len(x.text),
                                        sort_within_batch=True, shuffle=False,
                                        device=device)
        with torch.no_grad():
            preds = []
            for batch in data_iter:
                t, l = batch.text
                predictions = self.model(t, l)
                predictions = predictions.float()
                # Per-character tag: 0 = inside token, 1 = token ends here,
                # other = token + sentence-internal boundary (see loop below).
                _, rounded_preds = torch.max(torch.sigmoid(predictions), 2)
                preds.append(rounded_preds)
        sents = []
        tokens = []
        # NOTE(review): predictions are zipped against the sentences in
        # *reversed* order; this only realigns correctly if the iterator
        # yielded sentences back-to-front — confirm against BucketIterator
        # ordering.
        for sent_text, tags in list(zip(text, preds[::-1])):
            token = ''
            for i in tqdm(range(len(tags[0]))):
                if int(tags[0][i]) == 0:
                    token += sent_text[i]
                elif int(tags[0][i]) == 1:
                    token += sent_text[i]
                    if output == 'conllu':
                        # BUG FIX: guard the lookahead — the original raised
                        # IndexError when a token ended on the last character.
                        space_after = 1 if i + 1 < len(sent_text) and sent_text[i + 1] == ' ' else 0
                        tokens.append((token.strip(), space_after))
                    else:
                        tokens.append(token.strip())
                    token = ''
                else:
                    token += sent_text[i]
                    if output == 'conllu':
                        tokens.append((token.strip(), 0))
                    else:
                        tokens.append(token.strip())
                    token = ''
            sents.append(tokens)
            tokens = []
        return sents

    def write_conllu(self, sents, filename='lstm_tokenizer_output.conllu'):
        """Write sentences of (token, space_after) pairs to a CoNLL-U file."""
        with open(filename, 'w', encoding='utf-8') as f:
            for s_id, sent in enumerate(sents):
                sent_text = ''
                token_lines = []
                for i, token_info in enumerate(sent):
                    token, space_after = token_info[0], token_info[1]
                    if space_after == 1:
                        sent_text += token + ' '
                        token_line = '{}\t{}\t_\t_\t_\t_\t_\t_\t_\t_'.format(i + 1, token)
                        token_lines.append(token_line)
                    else:
                        sent_text += token
                        token_line = '{}\t{}\t_\t_\t_\t_\t_\t_\t_\tSpaceAfter=No'.format(i + 1, token)
                        token_lines.append(token_line)
                f.write('# sent_id = {}\n'.format(s_id + 1))
                f.write('# text = {}\n'.format(sent_text))
                f.write('\n'.join(token_lines))
                f.write('\n\n')
# Build vocabularies, load GloVe embeddings, and convert both splits to ids.
get_pos_vocab(conll_train, conll_val, conll_test, output=pos_vocab_file)
vocab, embedding = load_glove(glove, dim=embed_dim, save_dir='dataset')
pos_vocab = load_pos_vocab(pos_vocab_file)

# convert words to indices for train and test respectively
# (unknown words map to index 1).
words, pos = read_conll(conll_train)
word_ids = [[vocab.get(word, 1) for word in sentence] for sentence in words]
pos_ids = [[pos_vocab.get(p) for p in sentence] for sentence in pos]

test_words, test_pos = read_conll(conll_test)
# BUG FIX: the original indexed the *training* `words`/`pos` here, so the test
# split was never actually converted to ids.
test_word_ids = [[vocab.get(word, 1) for word in sentence]
                 for sentence in test_words]
test_pos_ids = [[pos_vocab.get(p) for p in sentence] for sentence in test_pos]

embedding = torch.from_numpy(embedding).float()
model = LSTMTagger(embedding, embed_dim, 100, 2, len(pos_vocab)).to(device)
# Optimize only the trainable parameters (the embedding matrix may be frozen).
optimizer = optim.Adam(
    [param for param in model.parameters() if param.requires_grad], lr=0.001)
criterion = nn.NLLLoss()

for epoch in range(num_epochs):
    batch = Batch(word_ids, pos_ids, batch_size=batch_size)
    total_step = len(batch)
    i = 0
    for inputs, labels in batch:
        i += 1
        # Pad each mini-batch to a fixed width of 100 tokens.
        pad_words_obj = PadSequence(inputs, [len(inputs), 100])
        padded_inputs = torch.Tensor(pad_words_obj.embedding).long().to(device)
        padded_inputs_lens = torch.Tensor(
            pad_words_obj.lengths).long().to(device)
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from model import LSTMTagger
from config import LSTMConfig
from util import *
import time

# Configuration plus word/tag vocabularies loaded from disk.
configs = LSTMConfig()
word_dict = make_dict(configs.WORD_FILE)
tag_dict = make_dict(configs.TAG_FILE)
# Inverse mappings: id -> word / tag string, for decoding model output.
id2word_dict = {v: k for k, v in word_dict.id.items()}
id2tag_dict = {v: k for k, v in tag_dict.id.items()}
model = LSTMTagger(configs.word_dim, configs.hidden_dim, word_dict.size,
                   tag_dict.size)
# Prefer the second GPU when CUDA is available, otherwise run on CPU.
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# NOTE(review): `configs.lr_decay` is passed as SGD *momentum* — looks like a
# mix-up between the two hyper-parameters; confirm against the config class.
optimizer = optim.SGD(model.parameters(), lr=configs.lr,
                      momentum=configs.lr_decay)
loss_function = nn.NLLLoss()
data = PosDataset(configs.POS_FILE, word_dict, tag_dict)


def makeDataset(test_rate=0.2, validation_rate=0):
    # Split the first `test_rate` fraction of the dataset off as the test set.
    # (The rest of this definition runs past the visible chunk.)
    test_id = int(data.__len__() * test_rate)
    test_data = DataLoader([data[i] for i in range(test_id)], batch_size=1,
                           num_workers=configs.num_workers)
def train_model(data_folder, patience, max_epoch, model_path):
    """Train the character-level tagger with early stopping.

    Args:
        data_folder: directory containing ``_train.tsv`` / ``_dev.tsv`` /
            ``_test.tsv``.
        patience: early-stopping patience (epochs without val-loss progress).
        max_epoch: maximum number of training epochs.
        model_path: where to save the trained ``state_dict``.
    """
    TEXT = data.Field(tokenize=list, include_lengths=True, batch_first=True)
    LABELS = data.Field(dtype=torch.float, tokenize=list, pad_token=None,
                        unk_token=None, batch_first=True)
    # BUG FIX: the original passed the literal string 'data_folder' as `path`,
    # silently ignoring the function argument.
    train_data, val_data, test_data = data.TabularDataset.splits(
        path=data_folder, train='_train.tsv', validation='_dev.tsv',
        test='_test.tsv', format='tsv',
        fields=[('text', TEXT), ('labels', LABELS)],
        csv_reader_params={"quotechar": '|'})
    TEXT.build_vocab(train_data)
    LABELS.build_vocab(train_data)
    # Persist the fields so inference code can rebuild identical vocabularies.
    # (Also fixes the original's leaked file handles from pickle.dump(open(...)).)
    with open('TEXT.pkl', 'wb') as f:
        pickle.dump(TEXT, f)
    with open('LABELS.pkl', 'wb') as f:
        pickle.dump(LABELS, f)
    BATCH_SIZE = 64
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train_data, val_data, test_data), batch_size=BATCH_SIZE,
        sort_key=lambda x: len(x.text), sort_within_batch=True, device=device)
    INPUT_DIM = len(TEXT.vocab)
    EMBEDDING_DIM = 100
    HIDDEN_DIM = 256
    OUTPUT_DIM = len(LABELS.vocab)
    model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, INPUT_DIM, OUTPUT_DIM)
    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()
    early_stop = EarlyStopping(patience=patience)
    model = model.to(device)
    criterion = criterion.to(device)
    N_EPOCHS = max_epoch
    train_losses = []
    val_losses = []
    for epoch in range(N_EPOCHS):
        try:
            train_loss, train_acc = train(model, train_iter, optimizer,
                                          criterion)
            train_losses.append(train_loss)
        except (TypeError, ValueError):
            # Training raised: dump the traceback and stop, but still save
            # whatever we have below.
            print("Exception in user code:")
            print("-" * 60)
            traceback.print_exc(file=sys.stdout)
            print("-" * 60)
            break
        valid_loss, valid_acc = evaluate(model, val_iter, criterion)
        val_losses.append(valid_loss)
        if early_stop.step(valid_loss):
            print('Stopped learning due to lack of progress.')
            break
        print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')
    torch.save(model.state_dict(), model_path)
    test_loss, test_acc = evaluate(model, test_iter, criterion)
    print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')
def train(data, gpu):
    # Train the (char-aware, optionally bidirectional) LSTM tagger one
    # instance at a time, evaluating on dev each epoch and on test whenever
    # the dev F-score improves.  NOTE: Python 2 code (print statements).
    vocabulary_size = data.word_alphabet.size()
    label_size = data.label_alphabet.size()
    EMBEDDING_DIM = data.word_emb_dim
    HIDDEN_DIM = 100
    dropout = 0.2
    lstm_layer = 1
    bilstm = True
    use_char = True
    model = LSTMTagger(data, HIDDEN_DIM, dropout, lstm_layer, bilstm,
                       use_char, gpu)
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    best_dev = -1  # best dev F-score seen so far
    ## start training
    for idx in range(100):
        epoch_start = time.time()
        temp_start = epoch_start
        print "Epoch:", idx
        instance_count = 0
        sample_id = 0
        sample_loss = 0
        right_token = 0
        whole_token = 0
        random.shuffle(data.train_Ids)
        # One training instance (sentence) at a time — no mini-batching.
        for words, chars, label in data.train_Ids:
            instance_count += 1
            # if instance_count > 1000:
            #     continue
            label = autograd.Variable(torch.LongTensor(label))
            if gpu:
                label = label.cuda()
            model.zero_grad()
            # model.hidden = model.init_hidden(gpu)
            # Model computes its own loss plus the best-scoring tag sequence.
            loss, pred_score, tag_seq = model.neg_log_likelihood(
                [words, chars], label, gpu)
            print pred_score
            print "tagseq:", tag_seq
            print "label:", label
            # loss = loss_function(pred_score, label)
            if gpu:
                # Move results back to CPU for the accuracy bookkeeping below.
                pred_score = pred_score.cpu()
                label = label.cpu()
                loss = loss.cpu()
            right, whole = predict_check(pred_score, label)
            right_token += right
            whole_token += whole
            sample_loss += loss.data.numpy()[0]
            if instance_count % 500 == 0:
                # Periodic progress report; sample_loss resets afterwards.
                temp_time = time.time()
                temp_cost = temp_time - temp_start
                temp_start = temp_time
                print(
                    "     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"
                    % (instance_count, temp_cost, sample_loss, right_token,
                       whole_token, (right_token + 0.) / whole_token))
                sys.stdout.flush()
                sample_loss = 0
            loss.backward()
            optimizer.step()
        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print("Epoch: %s training finished. Time: %.2fs" % (idx, epoch_cost))
        acc, p, r, f = evaluate(data, model, "dev", gpu)
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish
        print("Dev: time:%.2fs; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" %
              (dev_cost, acc, p, r, f))
        if f > best_dev:
            print "Exceed best f, previous best f:", best_dev
            best_dev = f
            # ## decode test
            acc, p, r, f = evaluate(data, model, "test", gpu)
            test_finish = time.time()
            test_cost = test_finish - dev_finish
            print("Test: time: %.2fs; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" %
                  (test_cost, acc, p, r, f))
# Build the word -> index mapping from the toy training corpus.
word_to_ix = {}
for sent, tag in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)
tag_to_ix = {'DET': 0, 'NN': 1, 'V': 2}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6
losses = []
loss_function = nn.NLLLoss()
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)
print(tag_scores)
score_to_tag(tag_scores, tag_to_ix)

for epoch in range(300):
    total_loss = torch.Tensor([0])
    # (The remainder of this loop body runs past the visible chunk.)
    for sentence, tags in training_data:
        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in variables)
        sentence_in = prepare_sequence(sentence, word_to_ix)
# Batched corpus and its dictionary.
# NOTE(review): `dict` shadows the builtin — consider renaming if no code
# beyond this chunk depends on the name.
corpus = d.Corpus(args.data, device, args.batch_size, args.seq_len)
dict = corpus.dictionary
num_of_train_batches = corpus.total_num_of_train_batches


def accuracy(pred, target):
    # Count correct tag predictions, ignoring positions whose target id is 2
    # (presumably the padding index — TODO confirm against the corpus class).
    mask = target != 2
    total_num = mask.sum()
    p, i = pred.max(2)  # argmax over the tag dimension
    num_correct = (target[mask] == i[mask]).sum()
    return num_correct.item(), total_num.item()


#### train ####
model = LSTMTagger(args.emsize, args.nhid, args.batch_size, len(dict),
                   TAG_CLASS, args.nlayers, args.bidirect,
                   args.dropout).to(device)
# size_average=False sums per-token losses instead of averaging them
# (kwarg is deprecated in newer torch; reduction='sum' is the replacement).
loss_fn = nn.CrossEntropyLoss(size_average=False).to(device)
optimizer = optim.Adam(model.parameters())


def evaluate():
    # Report tagging accuracy on the pre-batched test split.
    model.eval()
    with torch.no_grad():
        data = corpus.test_data_batched
        labels = corpus.test_label_batched
        pred = model(data)
        correct, total = accuracy(pred, labels)
        print('accuracy = {:.4f}'.format(correct / total))
def main():
    """Train the GloVe-initialized tagger, early-stopping when the 10-epoch
    average loss stops improving, then save the model."""
    progress_bar = ProgressBar()
    data_iterator, glove_embeddings, word_to_ix, ix_to_word = load_data()
    logger.info("Building model...")
    model = LSTMTagger(cf.EMBEDDING_DIM, cf.HIDDEN_DIM, len(word_to_ix),
                       cf.BATCH_SIZE, cf.MAX_SENT_LENGTH, glove_embeddings)
    # Ensure the word embeddings aren't modified during training.
    optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                 model.parameters()), lr=0.1)
    model.cuda()
    num_batches = len(data_iterator)
    loss_list = []  # loss history: one value per epoch
    for epoch in range(1, cf.MAX_EPOCHS + 1):
        epoch_start_time = time.time()
        for (i, (batch_x, batch_y)) in enumerate(data_iterator):
            # Ignore batch if it is not the same size as the others
            # (happens at the end sometimes).
            if len(batch_x) != cf.BATCH_SIZE:
                continue
            batch_x = batch_x.to(device)
            # PyTorch accumulates gradients; clear them before each batch,
            # and reset the LSTM hidden state so it is detached from the
            # previous batch's history.
            model.zero_grad()
            model.hidden = model.init_hidden()
            batch_x_lengths = []
            for x in batch_x:
                batch_x_lengths.append(len(x))
            tag_scores = model(batch_x, batch_x_lengths)
            loss = modified_loss(tag_scores, batch_y, batch_x_lengths,
                                 word_to_ix)
            loss.backward()
            optimizer.step()
            progress_bar.draw_bar(i, epoch, num_batches, cf.MAX_EPOCHS,
                                  epoch_start_time)
        progress_bar.draw_completed_epoch(loss, loss_list, epoch,
                                          cf.MAX_EPOCHS, epoch_start_time)
        # BUG FIX: the original appended the loss *tensor*, which keeps every
        # epoch's autograd graph alive for the whole run; store a plain float.
        loss_list.append(loss.item())
        if epoch % 10 == 0:
            avg_loss = sum(loss_list[epoch - 10:]) / 10
            logger.info("Average loss over past 10 epochs: %.6f" % avg_loss)
            if epoch >= 20:
                prev_avg_loss = sum(loss_list[epoch - 20:epoch - 10]) / 10
                if avg_loss >= prev_avg_loss:
                    logger.info(
                        "Average loss has not improved over past 10 epochs. Stopping early."
                    )
                    evaluate_model(model, ix_to_word)
                    break
        if epoch == 1 or epoch % 10 == 0 or epoch == cf.MAX_EPOCHS:
            evaluate_model(model, ix_to_word)
    logger.info("Saving model...")
    torch.save(model.state_dict(), "asset/model_trained")
    logger.info("Model saved to %s." % "asset/model_trained")
def predict(data, model_name='', model_=None, idxs=None, out=False):
    """Compute tagging accuracy of a model over a batched dataset.

    Args:
        data: iterable yielding (batch, lengths, targets, lengths2) tuples.
        model_name: checkpoint name; when given, a fresh model is built and
            the checkpoint loaded into it (takes priority over `model_`).
        model_: an already-constructed model to evaluate.
        idxs: [tag_to_index, word_to_index, index_to_tag, index_to_word];
            loaded from 'data.pickle' when omitted.
        out: when True, print each correctly predicted tag.

    Returns:
        float: fraction of non-special target positions predicted correctly.

    Raises:
        ValueError: when neither `model_name` nor `model_` is given.
    """
    if idxs:
        [tag_to_index, word_to_index, index_to_tag, index_to_word] = idxs
    else:
        with open('data.pickle', 'rb') as f:
            [tag_to_index, word_to_index, index_to_tag,
             index_to_word] = pickle.load(f)
    if model_name:
        model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_index),
                           len(tag_to_index))
        load_checkpoint(model_name, model)
    elif not model_:
        raise ValueError('No model specified.')
    else:
        model = model_
    correct = 0
    total = 0
    # (A never-called nested helper `out_gen` and several commented-out
    # code paths were removed from the original.)
    for batch, lengths, targets, lengths2 in data:
        pred = model(autograd.Variable(batch), lengths.cpu().numpy())
        # Argmax over the tag dimension of (batch, seq, tags).
        _, pred = torch.max(pred, dim=2)
        pred = pred.data
        for p, g in zip(pred, targets):
            for idx in range(len(g)):
                if index_to_tag[g[idx]] in [SOS, PAD]:
                    continue  # start/padding markers do not count
                elif index_to_tag[g[idx]] == EOS:
                    break  # remainder of this row is padding
                elif index_to_tag[g[idx]] == index_to_tag[p[idx]]:
                    correct += 1
                    if out:
                        print(index_to_tag[p[idx]], end=' ')
                    total += 1
                else:
                    total += 1
    return correct / total