def main(args):
    """Classify every transcript segment and write the labelled results.

    Rebuilds the text vocabulary from the training TSV, loads the saved
    ``LSTMClassifier`` weights, then for each pickle file found in
    ``args.transcript_segments_folder``:

    * runs the classifier on each segment's ``'transcript'`` text,
    * stores the prediction in the segment under ``'is_background'``,
    * writes a human-readable ``<name>.tsv`` (tokenized sentence, label) and
      a pickle of the updated segments (same file name as the input) into
      ``args.output_transcript_segments_folder``.

    Args:
        args: Object exposing ``train_data_tsv_file``, ``saved_model_path``,
            ``transcript_segments_folder`` and
            ``output_transcript_segments_folder`` attributes.
    """
    batch_size = 32
    output_size = 2
    hidden_size = 256
    embedding_length = 300

    def tokenize(x):
        return x.split()

    # The vocabulary must be rebuilt from the training data so that the token
    # indices match the ones the saved model was trained with.
    TEXT = data.Field(sequential=True,
                      tokenize=tokenize,
                      lower=True,
                      include_lengths=True,
                      batch_first=True,
                      fix_length=50)
    LABEL = data.LabelField(tensor_type=torch.FloatTensor)
    train_data = data.TabularDataset(path=args.train_data_tsv_file,
                                     format='tsv',
                                     fields=[('text', TEXT), ('label', LABEL)],
                                     skip_header=True)
    TEXT.build_vocab(train_data, vectors=GloVe('840B', 300))
    LABEL.build_vocab(train_data)
    word_embeddings = TEXT.vocab.vectors
    vocab_size = len(TEXT.vocab)

    model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size,
                           embedding_length, word_embeddings)
    model.load_state_dict(torch.load(args.saved_model_path))
    model.cuda()
    model.eval()

    for segments_pkl in os.listdir(args.transcript_segments_folder):
        print(segments_pkl)

        segments_path = os.path.join(args.transcript_segments_folder,
                                     segments_pkl)
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # this script at segment files from a trusted source.
        with open(segments_path, 'rb') as segments_file:
            all_segments = pickle.load(segments_file)

        readable_output_path = os.path.join(
            args.output_transcript_segments_folder,
            os.path.splitext(segments_pkl)[0] + '.tsv')
        with open(readable_output_path, 'w') as readable_output_file:
            for segments in all_segments.values():
                # Each segment is updated in place, so ``all_segments`` picks
                # up the new key without any explicit write-back.
                for segment in segments:
                    sentence = word_tokenize(segment['transcript'].lower())
                    test_sent = [[TEXT.vocab.stoi[x] for x in sentence]]
                    test_tensor = torch.LongTensor(
                        np.asarray(test_sent)).cuda()

                    # Inference only: skip autograd bookkeeping.
                    with torch.no_grad():
                        output = model(test_tensor, 1)
                        out = F.softmax(output, 1)

                    # Class index 1 is written out as is_background = 0 and
                    # every other index as 1.
                    # NOTE(review): presumably this inversion mirrors the
                    # LABEL vocab ordering -- confirm against training data.
                    pred_label = 0 if torch.argmax(out[0]) == 1 else 1

                    segment['is_background'] = pred_label
                    readable_output_file.write(
                        '%s\t%d\n' % (' '.join(sentence), pred_label))

        output_pkl_path = os.path.join(args.output_transcript_segments_folder,
                                       segments_pkl)
        with open(output_pkl_path, 'wb') as output_pkl_file:
            pickle.dump(all_segments, output_pkl_file)
def main(train_data_path: str, model_path: str):
    """Train an ``LSTMClassifier``, report metrics, and save its weights.

    Loads the dataset through ``load_data.load_dataset``, trains for 10
    epochs (printing train/validation loss and accuracy after each one),
    evaluates on the test iterator, classifies one hard-coded example
    sentence as a smoke test, and finally saves the model's ``state_dict``.

    Args:
        train_data_path: Path handed to ``load_data.load_dataset``.
        model_path: Destination file for ``torch.save`` of the state dict.
    """
    TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = (
        load_data.load_dataset(train_data_path))

    batch_size = 32
    output_size = 2
    hidden_size = 256
    embedding_length = 300

    # TODO: try other types of learning algorithms
    model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size,
                           embedding_length, word_embeddings)

    for epoch in range(10):
        train_loss, train_acc = train_model(model, train_iter, epoch)
        val_loss, val_acc = eval_model(model, valid_iter)

        # Every loss uses ``.3f`` so the columns are formatted consistently.
        print(
            f'Epoch: {epoch + 1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:.3f}, Val. Acc: {val_acc:.2f}%'
        )

    test_loss, test_acc = eval_model(model, test_iter)
    print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

    # Let us now predict the sentiment on a single sentence just for the
    # testing purpose.
    test_sen1 = "This is one of the best creation of Nolan. I can say, it's his magnum opus. Loved the soundtrack and especially those creative dialogues."

    test_sen1 = TEXT.preprocess(test_sen1)
    test_sen1 = [[TEXT.vocab.stoi[x] for x in test_sen1]]

    test_sen = np.asarray(test_sen1)
    test_sen = torch.from_numpy(test_sen)
    if torch.cuda.is_available():
        test_sen = test_sen.cuda()

    model.eval()
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output = model(test_sen, 1)
        out = F.softmax(output, 1)

    if torch.argmax(out[0]) == 1:
        print("Sentiment: Positive")
    else:
        print("Sentiment: Negative")

    # save the model
    torch.save(model.state_dict(), model_path)
def do_inference(sentences, TEXT, vocab_size, word_embeddings):
    """Score a list of sentences with the saved classifier.

    Builds an ``LSTMClassifier`` sized to ``len(sentences)``, loads the
    weights stored at the module-level ``save_path``, and runs all sentences
    through the model as a single batch.

    Args:
        sentences: Raw texts to classify; each becomes one example in the
            ``'headline'`` field.
        TEXT: The torchtext field used to process the ``'headline'`` column.
        vocab_size: Vocabulary size passed to ``LSTMClassifier``.
        word_embeddings: Embedding weights passed to ``LSTMClassifier``.

    Returns:
        The mean of the predicted class indices (argmax over dimension 1 of
        the model output) as a Python float, or ``None`` when ``sentences``
        is empty.

    Note:
        Relies on module-level ``output_size``, ``hidden_size``,
        ``embedding_length``, ``conv_hidden`` and ``save_path``.
    """
    # Nothing to score: skip building the model and loading the checkpoint.
    if not sentences:
        return None

    ## Load model for inference
    batch_size = len(sentences)
    model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size,
                           embedding_length, word_embeddings, conv_hidden, 0.0)
    model.cuda()
    state_dict = torch.load(save_path)
    model.load_state_dict(state_dict)
    model.eval()

    ## prepare data
    data_field = [('headline', TEXT)]
    examples = [data.Example.fromlist([text], data_field)
                for text in sentences]
    infer_data = data.Dataset(examples, data_field, filter_pred=None)
    infer_iter = data.Iterator(dataset=infer_data,
                               batch_size=batch_size,
                               train=False,
                               sort=False,
                               device=0)

    score = None
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        # batch_size == len(sentences), so this is expected to run once; if
        # several batches are produced, the last batch's score is returned.
        for batch in infer_iter:
            text = batch.headline[0]
            prediction = model(text)
            score = torch.max(prediction, 1)[1].float().mean().item()
    return score
te_acc.append(val_acc) if train_loss<criteria: break; print('Epoch:', epoch+1, 'Train Loss:', train_loss, 'Train Acc:', train_acc, 'Val. Loss:', val_loss, 'Val. Acc:', val_acc) test_loss, test_acc = eval_model(model, test_iter) print('Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%') assert False ''' Let us now predict the sentiment on a single sentence just for the testing purpose. ''' test_sen1 = "This is one of the best creation of Nolan. I can say, it's his magnum opus. Loved the soundtrack and especially those creative dialogues." test_sen2 = "Ohh, such a ridiculous movie. Not gonna recommend it to anyone. Complete waste of time and money." test_sen1 = TEXT.preprocess(test_sen1) test_sen1 = [[TEXT.vocab.stoi[x] for x in test_sen1]] test_sen2 = TEXT.preprocess(test_sen2) test_sen2 = [[TEXT.vocab.stoi[x] for x in test_sen2]] test_sen = np.asarray(test_sen1) test_sen = torch.LongTensor(test_sen) test_tensor = Variable(test_sen, volatile=True) test_tensor = test_tensor.cuda() model.eval() output = model(test_tensor, 1) out = F.softmax(output, 1) if (torch.argmax(out[0]) == 1): print ("Sentiment: Positive") else: print ("Sentiment: Negative")