def main(args):
    """Classify transcript segments as background/non-background with a trained LSTM.

    Rebuilds the training-time vocabulary (needed for token -> id mapping),
    loads the saved classifier, then for every pickle of segments in
    ``args.transcript_segments_folder`` writes:
      * an updated pickle (each segment gains an ``is_background`` key) into
        ``args.output_transcript_segments_folder``
      * a human-readable ``.tsv`` of ``tokenized_sentence<TAB>pred_label``.

    Args:
        args: namespace with ``train_data_tsv_file``, ``saved_model_path``,
            ``transcript_segments_folder`` and
            ``output_transcript_segments_folder`` attributes.

    Requires a CUDA device (model and inputs are moved to GPU).
    """
    batch_size = 32
    output_size = 2
    hidden_size = 256
    embedding_length = 300

    # Vocabulary must be rebuilt from the *training* data so that token ids
    # match the ones the saved model was trained with.
    tokenize = lambda x: x.split()
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True,
                      include_lengths=True, batch_first=True, fix_length=50)
    LABEL = data.LabelField(tensor_type=torch.FloatTensor)
    train_data = data.TabularDataset(path=args.train_data_tsv_file,
                                     format='tsv',
                                     fields=[('text', TEXT), ('label', LABEL)],
                                     skip_header=True)
    TEXT.build_vocab(train_data, vectors=GloVe('840B', 300))
    LABEL.build_vocab(train_data)
    word_embeddings = TEXT.vocab.vectors
    vocab_size = len(TEXT.vocab)

    model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size,
                           embedding_length, word_embeddings)
    model.load_state_dict(torch.load(args.saved_model_path))
    model.cuda()
    model.eval()

    for segments_pkl in os.listdir(args.transcript_segments_folder):
        print(segments_pkl)
        # Use context managers so every handle is closed even on error
        # (the original leaked all three file handles).
        with open(os.path.join(args.transcript_segments_folder,
                               segments_pkl), 'rb') as f:
            all_segments = pickle.load(f)

        readable_path = os.path.join(
            args.output_transcript_segments_folder,
            os.path.splitext(segments_pkl)[0] + '.tsv')
        with open(readable_path, 'w') as readable_output_file:
            for video_id, segments in all_segments.items():
                for i, segment in enumerate(segments):
                    sentence = word_tokenize(segment['transcript'].lower())
                    token_ids = np.asarray(
                        [[TEXT.vocab.stoi[x] for x in sentence]])
                    test_tensor = torch.LongTensor(token_ids).cuda()
                    # torch.no_grad() replaces the deprecated
                    # Variable(..., volatile=True) inference idiom.
                    with torch.no_grad():
                        output = model(test_tensor, 1)
                    out = F.softmax(output, 1)
                    # NOTE(review): class index 1 maps to label 0 and vice
                    # versa — looks intentional (label inversion), preserved.
                    pred_label = 0 if torch.argmax(out[0]) == 1 else 1
                    segment['is_background'] = pred_label
                    all_segments[video_id][i] = segment
                    readable_output_file.write(
                        '%s\t%d\n' % (' '.join(sentence), pred_label))

        with open(os.path.join(args.output_transcript_segments_folder,
                               segments_pkl), 'wb') as f:
            pickle.dump(all_segments, f)
def get_gen_score(batch_data, TEXT, vocab_size, word_embeddings):
    """Score a batch of generated headlines with the pretrained classifier.

    Builds an in-memory torchtext ``Dataset`` from ``batch_data`` (one
    example per token sequence), runs the saved LSTM classifier over it and
    returns the softmax class probabilities.

    Args:
        batch_data: iterable of token iterables; an empty/falsy entry is
            replaced by a single-space placeholder sentence.
        TEXT: the torchtext ``Field`` whose vocab encodes the tokens, or
            ``None`` to skip scoring entirely.
        vocab_size: size of ``TEXT``'s vocabulary.
        word_embeddings: pretrained embedding matrix for the classifier.

    Returns:
        A ``(len(batch_data), num_classes)`` tensor of softmax scores, or
        ``0`` when ``TEXT`` is ``None``.

    NOTE(review): ``batch_size``, ``output_size``, ``hidden_size``,
    ``embedding_length``, ``conv_hidden`` and ``save_path`` are module-level
    globals defined elsewhere in this file — confirm they are set before
    this is called.
    """
    if TEXT is None:  # `is None`, not `== None`
        return 0

    LABEL = data.LabelField(tensor_type=torch.FloatTensor)
    model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size,
                           embedding_length, word_embeddings, conv_hidden,
                           0.1)
    state_dict = torch.load(save_path)
    model.load_state_dict(state_dict)

    test_datafields = [("headline", TEXT),
                       ("comment", LABEL),
                       ("share", None)]

    examples = [None] * len(batch_data)
    for i, tokens in enumerate(batch_data):
        if tokens:  # if the entry is not empty
            # join with a trailing space to match the original
            # token-by-token concatenation ("tok1 tok2 ... tokN ")
            sentence = "".join(tok + " " for tok in tokens)
            fields = [sentence, 1, 1]
        else:
            fields = [" ", 1, 1]
            print("[info] empty sentence for classifier")
        examples[i] = data.Example.fromlist(fields, test_datafields)

    test_data = data.Dataset(examples, fields=test_datafields)
    LABEL.build_vocab(test_data)
    test_iter = data.BucketIterator(test_data,
                                    batch_size=len(test_data),
                                    sort_key=lambda x: len(x.headline),
                                    repeat=False,
                                    shuffle=True)

    gen_score = test_model(model, test_iter)
    gen_score = torch.softmax(gen_score, dim=1)
    return gen_score
def do_inference(sentences, TEXT, vocab_size, word_embeddings):
    """Run the saved LSTM classifier over ``sentences`` and return a score.

    All sentences are put into a single batch (``batch_size ==
    len(sentences)``); the returned score is the mean of the argmax class
    indices over that batch, as a Python float.

    Args:
        sentences: list of pre-tokenized texts compatible with ``TEXT``.
        TEXT: torchtext ``Field`` providing the vocabulary.
        vocab_size: size of ``TEXT``'s vocabulary.
        word_embeddings: pretrained embedding matrix for the classifier.

    Returns:
        Mean predicted class index as ``float``, or ``None`` if the
        iterator yields no batches.

    NOTE(review): ``output_size``, ``hidden_size``, ``embedding_length``,
    ``conv_hidden`` and ``save_path`` are module-level globals defined
    elsewhere — confirm before calling. Requires a CUDA device.
    """
    batch_size = len(sentences)
    model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size,
                           embedding_length, word_embeddings, conv_hidden,
                           0.0)
    model.cuda()
    state_dict = torch.load(save_path)
    model.load_state_dict(state_dict)
    model.eval()

    data_field = [('headline', TEXT)]
    examples = [data.Example.fromlist([text], data_field)
                for text in sentences]
    infer_data = data.Dataset(examples, data_field, filter_pred=None)
    # device=0 is the legacy torchtext way of saying "GPU 0".
    infer_iter = data.Iterator(dataset=infer_data,
                               batch_size=batch_size,
                               train=False,
                               sort=False,
                               device=0)

    score = None
    for batch in infer_iter:
        text = batch.headline[0]
        # Inference only — no_grad avoids building the autograd graph.
        with torch.no_grad():
            prediction = model(text)
        score = torch.max(prediction, 1)[1].float().mean().item()
    return score