def run_lstm_crf(train=True):
    train_word_lists, train_tag_lists = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev")
    test_word_lists, test_tag_lists = build_corpus("test")

    word2id, tag2id = build_vocab(train_word_lists, train_tag_lists)
    # For the LSTM with a CRF layer, <start> and <end> also have to be added
    # to the maps (they are needed during decoding).
    crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
    # Some additional data preprocessing is required as well.
    train_word_lists, train_tag_lists = prepocess_data_for_lstmcrf(
        train_word_lists, train_tag_lists)
    dev_word_lists, dev_tag_lists = prepocess_data_for_lstmcrf(
        dev_word_lists, dev_tag_lists)
    test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
        test_word_lists, test_tag_lists, test=True)

    if train:
        print("Training the Bi-LSTM+CRF model...")
        bilstm_crf_train((train_word_lists, train_tag_lists),
                         (dev_word_lists, dev_tag_lists),
                         crf_word2id, crf_tag2id)
    print("Evaluating the Bi-LSTM+CRF model...")
    pred = bilstm_crf_eval((test_word_lists, test_tag_lists),
                           crf_word2id, crf_tag2id)
    return pred

def main(args):
    """
    Entry point: train or test.
    """
    json.dump(vars(args),
              open(os.path.join(args.output_dir, 'config.json'), 'w'))
    if args.gpu_id == -1:
        ctx = mx.cpu()
    else:
        ctx = mx.gpu(args.gpu_id)
    mx.random.seed(args.seed, ctx=ctx)

    if args.mode == 'train':
        train_dataset = read_dataset(args, 'train_file')
        val_dataset = read_dataset(args, 'test_file')
        vocab_path = os.path.join(args.output_dir, 'vocab.jsons')
        if os.path.exists(vocab_path):
            vocab = nlp.Vocab.from_json(open(vocab_path).read())
        else:
            vocab = build_vocab(train_dataset)
            with open(vocab_path, 'w') as fout:
                fout.write(vocab.to_json())
        glove = nlp.embedding.create(args.embedding,
                                     source=args.embedding_source)
        vocab.set_embedding(glove)
        train_data_loader = prepare_data_loader(args, train_dataset, vocab)
        val_data_loader = prepare_data_loader(args, val_dataset, vocab,
                                              test=True)
        model = NLIModel(len(vocab), args.embedding_size, args.hidden_size,
                         args.dropout, args.intra_attention)
        train_model(model, train_data_loader, val_data_loader,
                    vocab.embedding, ctx, args)
    elif args.mode == 'test':
        model_args = argparse.Namespace(**json.load(
            open(os.path.join(args.model_dir, 'config.json'))))
        vocab = nlp.Vocab.from_json(
            open(os.path.join(args.model_dir, 'vocab.jsons')).read())
        val_dataset = read_dataset(args, 'test_file')
        val_data_loader = prepare_data_loader(args, val_dataset, vocab,
                                              test=True)
        model = NLIModel(len(vocab), model_args.embedding_size,
                         model_args.hidden_size, 0.,
                         model_args.intra_attention)
        model.load_parameters(os.path.join(args.model_dir, 'checkpoints',
                                           'valid_best.params'), ctx=ctx)
        loss_func = gluon.loss.SoftmaxCrossEntropyLoss()
        logger.info('Test on {}'.format(args.test_file))
        loss, acc = test_model(model, val_data_loader, loss_func, ctx)
        logger.info('loss={:.4f} acc={:.4f}'.format(loss, acc))

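# A minimal sketch of the argument parser this NLI entry point appears to expect,
# inferred from the attributes accessed in main() above; it assumes argparse is
# imported as in the surrounding file, the flag names mirror those attributes,
# and all defaults here are assumptions for illustration only.
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=['train', 'test'], default='train')
    parser.add_argument('--output_dir', default='output')
    parser.add_argument('--model_dir', default='output')
    parser.add_argument('--gpu_id', type=int, default=-1)  # -1 runs on CPU
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--train_file')
    parser.add_argument('--test_file')
    parser.add_argument('--embedding', default='glove')
    parser.add_argument('--embedding_source', default='glove.840B.300d')
    parser.add_argument('--embedding_size', type=int, default=300)
    parser.add_argument('--hidden_size', type=int, default=200)
    parser.add_argument('--dropout', type=float, default=0.2)
    parser.add_argument('--intra_attention', action='store_true')
    return parser.parse_args()
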
def run_crf(train=True):
    train_word_lists, train_tag_lists = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev")
    test_word_lists, test_tag_lists = build_corpus("test")

    word2id, tag2id = build_vocab(train_word_lists, train_tag_lists)

    if train:
        print("Training the CRF model...")
        crf_train((train_word_lists, train_tag_lists))
    print("Evaluating the CRF model on the dev set...")
    crf_dev((dev_word_lists, dev_tag_lists))
    print("Testing the CRF model...")
    pred = crf_eval((test_word_lists, test_tag_lists))
    return pred

def main(product):
    TRAIN_FILE = "../data/ABSA-15_{}_Train_Data.xml".format(product)
    TEST_FILE = "../data/ABSA15_{}_Test.xml".format(product)
    # load data set
    training_reviews = load_dataset(TRAIN_FILE)
    testing_reviews = load_dataset(TEST_FILE)
    # build vocab
    vocab = build_vocab(training_reviews, TOPN=1000)
    vocab_index = list2dict(vocab)
    cate_index = get_all_categories(training_reviews)
    cates = dict2list(cate_index)
    n_cates = len(cates)

    train_X = get_X(training_reviews, vocab_index)
    test_X = get_X(testing_reviews, vocab_index)
    train_labels = get_labels(training_reviews, cate_index)
    test_labels = get_labels(testing_reviews, cate_index)

    # transform the multi-label problem into a mono-label one
    M = len(train_X)
    X = []
    Y = []
    for i in range(M):
        if not train_labels[i]:
            # category indices run from 0 to n_cates-1; n_cates marks the None label
            Y.append(n_cates)
            X.append(train_X[i])
        else:
            for y in train_labels[i]:
                Y.append(y)
                X.append(list(train_X[i]))

    clf_model = MultinomialNB()
    clf_model.fit(X, np.array(Y))

    # predict
    output = predict(test_X, clf_model, threshold=0.2)

    # evaluation
    p, r, f = microF1(output, test_labels)

    # output
    out_dir = "../data/bow_nb/"
    out_file = out_dir + "laptop.txt"
    with open(out_file, 'w') as out:
        out.write("Precision:\t{}\nRecall:\t{}\nF1:\t{}\n".format(p, r, f))
    print("{}\n{}\n{}".format(p, r, f))

def run_hmm(train=True):
    train_word_lists, train_tag_lists = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev")
    test_word_lists, test_tag_lists = build_corpus("test")

    word2id, tag2id = build_vocab(train_word_lists, train_tag_lists)

    if train:
        print("Training the HMM model...")
        hmm_train((train_word_lists, train_tag_lists), word2id, tag2id)
    print("Evaluating the HMM model on the dev set...")
    hmm_dev((dev_word_lists, dev_tag_lists), word2id, tag2id)
    print("Testing the HMM model...")
    pred = hmm_eval((test_word_lists, test_tag_lists), word2id, tag2id)
    return pred

def run_lstm(train=True):
    train_word_lists, train_tag_lists = build_corpus("train")
    dev_word_lists, dev_tag_lists = build_corpus("dev")
    test_word_lists, test_tag_lists = build_corpus("test")

    word2id, tag2id = build_vocab(train_word_lists, train_tag_lists)
    # When training the LSTM model, PAD and UNK have to be added to word2id and tag2id.
    bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)

    if train:
        print("Training the Bi-LSTM model...")
        bilstm_train((train_word_lists, train_tag_lists),
                     (dev_word_lists, dev_tag_lists),
                     bilstm_word2id, bilstm_tag2id)
    print("Evaluating the Bi-LSTM model...")
    pred = bilstm_eval((test_word_lists, test_tag_lists),
                       bilstm_word2id, bilstm_tag2id)
    return pred

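# A minimal driver sketch tying the four tagger entry points above together
# (run_hmm, run_crf, run_lstm, run_lstm_crf); collecting their predictions into
# a dict is an assumption for illustration, not part of the original code.
if __name__ == "__main__":
    predictions = {
        "hmm": run_hmm(),
        "crf": run_crf(),
        "bilstm": run_lstm(),
        "bilstm_crf": run_lstm_crf(),
    }
    for name in predictions:
        print("Finished {}.".format(name))
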
def main(product):
    TRAIN_FILE = "../data/ABSA-15_{}_Train_Data.xml".format(product)
    TEST_FILE = "../data/ABSA15_{}_Test.xml".format(product)
    # load data set
    training_reviews = load_dataset(TRAIN_FILE)
    testing_reviews = load_dataset(TEST_FILE)
    # build vocab
    vocab = build_vocab(training_reviews, TOPN=1000)
    vocab_index = list2dict(vocab)
    cate_index = get_all_categories(training_reviews)
    cates = dict2list(cate_index)
    n_cates = len(cates)

    print("Loading alignment model")
    align_model = load_align_model("s2t64.actual.ti.final")
    print("Get prior")
    prior = get_prior(training_reviews)
    print("Training level 2 model...")
    lev2_model = train_pola_clf(training_reviews, vocab_index, cate_index)

    print("Predicting...")
    results = []
    for review in testing_reviews:
        for sent in review.sentences:
            pairs_predict = predict(sent, align_model, prior, lev2_model,
                                    vocab_index, cate_index)
            results.append(pairs_predict)

    print("Evaluation")
    opinions = []
    for review in testing_reviews:
        for sent in review.sentences:
            # opis = [(cate_index[opi.category], opi.polarity) for opi in sent.opinions]
            opis = []
            for opi in sent.opinions:
                if opi.category in cate_index:
                    opis.append((cate_index[opi.category], opi.polarity))
            opinions.append(opis)

    # Category-only (aspect) evaluation
    TP1 = 0.0
    FP1 = 0.0
    FN1 = 0.0
    for i in range(len(opinions)):
        o = set([pair[0] for pair in results[i]])
        g = set([pair[0] for pair in opinions[i]])
        TP1 += len(o & g)
        FP1 += len(o - g)
        FN1 += len(g - o)
    p = TP1 / (TP1 + FP1)
    r = TP1 / (TP1 + FN1)
    if p + r == 0:
        f = 0
    else:
        f = 2. * p * r / (p + r)
    print("{} {} {}".format(p, r, f))

    # (Category, polarity) pair evaluation
    TP2 = 0.0
    FP2 = 0.0
    FN2 = 0.0
    for i in range(len(opinions)):
        o = set(results[i])
        g = set(opinions[i])
        TP2 += len(o & g)
        FP2 += len(o - g)
        FN2 += len(g - o)
    p = TP2 / (TP2 + FP2)
    r = TP2 / (TP2 + FN2)
    if p + r == 0:
        f = 0
    else:
        f = 2. * p * r / (p + r)
    print("{} {} {}".format(p, r, f))

def init_hidden(self, batch_size):
    weight = next(self.parameters()).data
    return (Variable(weight.new(self.n_layers, batch_size,
                                self.hidden_dim).uniform_()),
            Variable(weight.new(self.n_layers, batch_size,
                                self.hidden_dim).uniform_()))


if __name__ == "__main__":
    EMBEDDING_DIM = 100
    HIDDEN_DIM = 100
    BATCH_SIZE = 256

    vocab = build_vocab('data')
    word_vocab, label_vocab = vocab
    train_dataset = NERDataset('data', vocab, type='/train')
    train_loader = DataLoader(train_dataset,
                              batch_size=BATCH_SIZE,
                              num_workers=2,
                              collate_fn=custom_collate,
                              shuffle=True)
    sample_data, sample_target, sample_len = next(iter(train_loader))
    sample_data = sample_data.long()

    model = RNN(EMBEDDING_DIM, HIDDEN_DIM, len(word_vocab), len(label_vocab))
    hidden = model.init_hidden(BATCH_SIZE)
    with torch.no_grad():
        tag_scores = model(sample_data, hidden)

def caption_image(img_path, beam_size=3):
    # transforms
    tt = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    # English
    vocab = build_vocab('data.json')
    checkpoint = load_checkpoint(
        r'E:\GP\Image-Captioning\models\BEST_checkpoint_flickr8k_finetune.pth.tar',
        cpu=True)
    # Arabic
    # vocab = build_vocab('ar_data.json')
    # checkpoint = load_checkpoint(r'E:\GP\Image-Captioning\models\BEST_checkpoint_flickr8k_ar_finetune.pth.tar', cpu=True)

    addit_tokens = [
        vocab.stoi['<sos>'], vocab.stoi['<eos>'], vocab.stoi['<pad>']
    ]
    device = torch.device('cpu')
    encoder = checkpoint['encoder'].to(device)
    decoder = checkpoint['decoder'].to(device)

    # def cap_image(encoder, decoder, image_path, vocab):
    vocab_size = len(vocab)

    img = Image.open(img_path).convert("RGB")
    img = tt(img).unsqueeze(0)  # transform and batch
    image = img.to(device)

    # encoder
    encoder_out = encoder(image)  # (1, enc_image_size, enc_image_size, encoder_dim)
    enc_image_size = encoder_out.size(1)
    encoder_dim = encoder_out.size(3)

    # Flatten encoding
    encoder_out = encoder_out.view(1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
    num_pixels = encoder_out.size(1)

    k = beam_size
    # We'll treat the problem as having a batch size of k
    encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

    # Tensor to store top k previous words at each step; now they're just <start>
    k_prev_words = torch.LongTensor([[vocab.stoi['<sos>']]] * k).to(device)  # (k, 1)

    # Tensor to store top k sequences; now they're just <start>
    seqs = k_prev_words  # (k, 1)

    # Tensor to store top k sequences' scores; now they're just 0
    top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

    # Tensor to store top k sequences' alphas; now they're just 1s
    seqs_alpha = torch.ones(k, 1, enc_image_size, enc_image_size).to(device)  # (k, 1, enc_image_size, enc_image_size)

    # Lists to store completed sequences, their alphas and scores
    complete_seqs = list()
    complete_seqs_alpha = list()
    complete_seqs_scores = list()

    # Start decoding
    step = 1
    h, c = decoder.init_hidden_state(encoder_out)

    # s is a number less than or equal to k, because sequences are removed
    # from this process once they hit <end>
    while True:
        embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)

        awe, alpha = decoder.attention(encoder_out, h)  # (s, encoder_dim), (s, num_pixels)
        alpha = alpha.view(-1, enc_image_size, enc_image_size)  # (s, enc_image_size, enc_image_size)

        gate = decoder.sigmoid(decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
        awe = gate * awe

        h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c))  # (s, decoder_dim)

        scores = decoder.fc(h)  # (s, vocab_size)
        scores = F.log_softmax(scores, dim=1)

        # Add
        scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

        # For the first step, all k points will have the same scores
        # (since same k previous words, h, c)
        if step == 1:
            top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)  # (s)
        else:
            # Unroll and find top scores, and their unrolled indices
            top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)  # (s)

        # Convert unrolled indices to actual indices of scores
        prev_word_inds = top_k_words // vocab_size  # (s)
        next_word_inds = top_k_words % vocab_size  # (s)

        # Add new words to sequences, alphas
        seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)],
                         dim=1)  # (s, step+1)
        seqs_alpha = torch.cat(
            [seqs_alpha[prev_word_inds], alpha[prev_word_inds].unsqueeze(1)],
            dim=1)  # (s, step+1, enc_image_size, enc_image_size)
        # print(seqs[prev_word_inds], prev_word_inds)
        # if step == 5:
        #     return seqs

        # Which sequences are incomplete (didn't reach <end>)?
        incomplete_inds = [
            ind for ind, next_word in enumerate(next_word_inds)
            if next_word != vocab.stoi['<eos>']
        ]
        complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

        # Set aside complete sequences
        if len(complete_inds) > 0:
            complete_seqs.extend(seqs[complete_inds].tolist())
            complete_seqs_alpha.extend(seqs_alpha[complete_inds].tolist())
            complete_seqs_scores.extend(top_k_scores[complete_inds])
        k -= len(complete_inds)  # reduce beam length accordingly

        # Proceed with incomplete sequences
        if k == 0:
            break
        seqs = seqs[incomplete_inds]
        seqs_alpha = seqs_alpha[incomplete_inds]
        h = h[prev_word_inds[incomplete_inds]]
        c = c[prev_word_inds[incomplete_inds]]
        encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
        top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
        k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

        # Break if things have been going on too long
        if step > 50:
            break
        step += 1

    i = complete_seqs_scores.index(max(complete_seqs_scores))
    seq = complete_seqs[i]
    alphas = complete_seqs_alpha[i]

    print(complete_seqs_scores)
    # print(seq)

    all_caps = [
        " ".join([vocab.itos[i] for i in sent if i not in addit_tokens])
        for sent in complete_seqs
    ]
    all_b_caps = ""
    z = 1
    for cap in all_caps:
        all_b_caps += str(z) + ". " + cap + " || <br> "
        z += 1
    # all_b_caps = [" || ".join(all_caps)][0]

    # return seq, alphas, complete_seqs, i
    # return [" ".join([vocab.itos[i] for i in seq if i not in addit_tokens])][0]
    # return all_b_caps
    return all_caps

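# A minimal usage sketch for caption_image above, assuming the checkpoint and
# 'data.json' vocabulary it loads are available locally; the image path below is
# a hypothetical placeholder, not part of the original code.
if __name__ == "__main__":
    captions = caption_image("example.jpg", beam_size=5)
    for rank, cap in enumerate(captions, start=1):
        print("{}. {}".format(rank, cap))
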
def main(product, pred_threshold, desc_threshold):
    global PRED_THRESHOLD, DESC_THRESHOLD
    PRED_THRESHOLD = pred_threshold
    DESC_THRESHOLD = desc_threshold

    TRAIN_FILE = "../data/ABSA-15_{}_Train_Data.xml".format(product)
    TEST_FILE = "../data/ABSA15_{}_Test.xml".format(product)
    # load data set
    training_reviews = load_dataset(TRAIN_FILE)
    training_sentences = unwrap(training_reviews)
    testing_reviews = load_dataset(TEST_FILE)
    testing_sentences = unwrap(testing_reviews)
    # build vocab
    vocab = build_vocab(training_sentences, TOPN=1000)
    feature_names, align_model = load_align_model(product + ".t2s64.actual.ti.final")
    vocab_index = list2dict(list(vocab))
    cate_index = get_all_categories(training_sentences)
    cates = dict2list(cate_index)
    n_cates = len(cates)

    print("Training level 1 model...")
    lev1_model = train_level1_clf(training_sentences, vocab_index, cate_index,
                                  align_model)
    print("Training level 2 model...")
    lev2_model = train_pola_clf(training_sentences, vocab_index, cate_index)

    print("Predicting...")
    (feat_names, entattri_indexes) = get_entity_attribute(training_sentences)
    results = []
    for sent in testing_sentences:
        pairs_predict = predict(sent, lev1_model, lev2_model, vocab_index,
                                cate_index, feat_names, align_model)
        results.append(pairs_predict)

    print("Evaluation")
    opinions = []
    for sent in testing_sentences:
        opis = []
        for opi in sent.opinions:
            if opi.category in cate_index:
                opis.append((cate_index[opi.category], opi.polarity))
        opinions.append(opis)

    # Category-only (aspect) evaluation
    TP1 = 0.0
    FP1 = 0.0
    FN1 = 0.0
    for i in range(len(opinions)):
        o = set([pair[0] for pair in results[i]])
        g = set([pair[0] for pair in opinions[i]])
        TP1 += len(o & g)
        FP1 += len(o - g)
        FN1 += len(g - o)
    p = TP1 / (TP1 + FP1)
    r = TP1 / (TP1 + FN1)
    if p + r == 0:
        f = 0
    else:
        f = 2. * p * r / (p + r)
    print("{} {} {}".format(p, r, f))

    # (Category, polarity) pair evaluation
    TP2 = 0.0
    FP2 = 0.0
    FN2 = 0.0
    for i in range(len(opinions)):
        o = set(results[i])
        g = set(opinions[i])
        TP2 += len(o & g)
        FP2 += len(o - g)
        FN2 += len(g - o)
    p = TP2 / (TP2 + FP2)
    r = TP2 / (TP2 + FN2)
    if p + r == 0:
        f = 0
    else:
        f = 2. * p * r / (p + r)
    print("{} {} {}".format(p, r, f))

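# Both ABSA entry points above repeat the same micro-averaged precision/recall/F1
# computation inline. A minimal sketch of a helper that could factor it out; the
# name micro_prf and its signature are assumptions, not part of the original code.
def micro_prf(predicted_sets, gold_sets):
    """Micro-averaged precision, recall and F1 over parallel lists of predictions/golds."""
    tp = fp = fn = 0.0
    for pred, gold in zip(predicted_sets, gold_sets):
        o, g = set(pred), set(gold)
        tp += len(o & g)
        fp += len(o - g)
        fn += len(g - o)
    p = tp / (tp + fp) if tp + fp else 0.0
    r = tp / (tp + fn) if tp + fn else 0.0
    f = 2.0 * p * r / (p + r) if p + r else 0.0
    return p, r, f
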