import numpy as np
from gensim import models

import preprocess


def getwv():
    """Build an embedding matrix from the trained Word2Vec model."""
    i2v = preprocess.load_obj('index2vocab')
    model = models.Word2Vec.load('w2v200.model.bin')
    embedding = np.zeros((len(i2v), model.wv.vector_size))
    missing = 0
    for i in range(len(i2v)):
        try:
            embedding[i] = model.wv[i2v[i]]
        except KeyError:
            # Words absent from the Word2Vec vocabulary keep a zero vector.
            print(i2v[i], 'not in w2v')
            missing += 1
    print(missing, 'words not in w2v')
    return embedding
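# A minimal usage sketch (not in the source): seed an nn.Embedding layer with
# the pretrained matrix returned by getwv(). Freezing the weights is a choice,
# not something the original code prescribes.
def build_embedding_layer(freeze=False):
    import torch
    import torch.nn as nn
    weights = torch.FloatTensor(getwv())
    return nn.Embedding.from_pretrained(weights, freeze=freeze)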
import os

import torch

from model import BiLSTM_CRF
from preprocess import load_obj

# TEST_SENTENCE (a test sentence in "word/tag ..." form), CUR_MODEL (the
# checkpoint path) and UNK (the unknown-word token) are module-level constants
# defined elsewhere in the project.


def get_tag(model, sentence, idx_to_tag):
    # Add a batch dimension: (seq_len,) -> (seq_len, 1).
    sentence = sentence.unsqueeze(1)
    # Mask out padding (index 0).
    mask = sentence.ne(0)
    best_tag_ids = model.decode(sentence, mask)
    tags = [idx_to_tag[idx] for idx in best_tag_ids[0]]
    return tags


if __name__ == '__main__':
    print(TEST_SENTENCE)
    data_dir = 'data/chinese/processed'
    word_to_idx = load_obj(os.path.join(data_dir, 'word_to_idx.pkl'))
    tag_to_idx = load_obj(os.path.join(data_dir, 'tag_to_idx.pkl'))
    idx_to_tag = {v: k for k, v in tag_to_idx.items()}

    model = BiLSTM_CRF(len(word_to_idx), len(tag_to_idx), 100, 200, 0.1)
    model.load_state_dict(
        torch.load(CUR_MODEL, map_location=torch.device('cuda')))
    model.eval()

    # Strip the gold "/tag" suffix from each token before prediction.
    processed_sen = [i.split('/')[0] for i in TEST_SENTENCE.split()]
    sentence = torch.LongTensor(
        [word_to_idx.get(w, word_to_idx[UNK]) for w in processed_sen])
    best_tags = get_tag(model, sentence, idx_to_tag)
    print(' '.join(best_tags))
import random

import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm

import preprocess

# EncoderRNN, BAttnDecoderRNN, VanillaDecoderRNN, getDataLoader and to_var
# are defined elsewhere in this project.


def main(args):
    teacher_forcing_ratio = 0.5
    schedule_sampling_ratio = 0.7
    num_layers = 2

    features, _ = preprocess.readfeat()
    #labels, v_size = preprocess.label2onehot_single_sentence(preprocess.readlabel())
    labels, v_size = preprocess.label2onehot(preprocess.readlabel(), limit=12)
    i2v = preprocess.load_obj('index2vocab')
    print('feature shape =', features.shape)
    print('label shape =', labels.shape)
    dataloader = getDataLoader(features, labels, single_sentence=False,
                               batch_size=128)

    encoder = EncoderRNN(4096, 512, num_layers=num_layers, bidirectional=True)
    #decoder = VanillaDecoderRNN(512, v_size, num_layers=1)
    decoder = BAttnDecoderRNN(512, v_size, num_layers=num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    print(encoder)
    print(decoder)

    c = nn.CrossEntropyLoss()
    #c = nn.NLLLoss()
    params = list(decoder.parameters()) + list(encoder.parameters())
    optimizer = torch.optim.Adam(params, lr=0.0003)

    epochs = 16
    for epoch in range(epochs):
        losses = []
        desc = 'Epoch [{}/{}]'.format(epoch + 1, epochs)
        for images, inputs, targets, lengths in tqdm(dataloader, desc=desc):
            images, inputs, targets = to_var(images), to_var(inputs), to_var(
                targets)
            batch_size, caption_len = inputs.size(0), inputs.size(1)
            loss = 0
            encoder.zero_grad()
            decoder.zero_grad()

            encoder_output, encoder_hidden = encoder(images)
            # Seed the decoder with the encoder's final hidden state.
            decoder_hidden = encoder_hidden[:num_layers]
            decoder_outputs = torch.autograd.Variable(
                torch.zeros(batch_size, caption_len, v_size))
            if torch.cuda.is_available():
                decoder_outputs = decoder_outputs.cuda()

            # Use teacher forcing or not.
            if args.learntype == 'teacher_forcing':
                # Flip a coin per batch: feed ground truth or model output.
                use_teacher_forcing = random.random() < teacher_forcing_ratio
                if use_teacher_forcing:
                    for wordindex in range(caption_len):
                        inputword = inputs[:, wordindex]
                        output, decoder_hidden = decoder(
                            inputword, decoder_hidden, encoder_output)
                        decoder_outputs[:, wordindex, :] = output
                else:
                    inputword = inputs[:, 0]
                    for wordindex in range(caption_len):
                        output, decoder_hidden = decoder(
                            inputword, decoder_hidden, encoder_output)
                        # Greedy decoding: feed the argmax token back in.
                        maxkey = output.data.max(1)[1]
                        inputword = to_var(maxkey)
                        decoder_outputs[:, wordindex, :] = output
            # Scheduled sampling.
            else:
                # Per step, feed the ground-truth token with probability
                # schedule_sampling_ratio, otherwise the model's prediction.
                inputword = inputs[:, 0]
                for wordindex in range(caption_len):
                    output, decoder_hidden = decoder(inputword, decoder_hidden,
                                                     encoder_output)
                    decoder_outputs[:, wordindex, :] = output
                    if (random.random() < schedule_sampling_ratio
                            and wordindex < caption_len - 1):
                        inputword = inputs[:, wordindex + 1]
                    else:
                        maxkey = output.data.max(1)[1]
                        inputword = to_var(maxkey)

            for i in range(batch_size):
                loss += c(decoder_outputs[i], targets[i])
            loss.backward()
            optimizer.step()
            losses.append(loss.item() / batch_size)

        if (epoch + 1) % 2 == 0:
            torch.save(encoder.state_dict(),
                       args.model + '/encoder_epoch{}.pt'.format(epoch + 1))
            torch.save(decoder.state_dict(),
                       args.model + '/decoder_epoch{}.pt'.format(epoch + 1))
        print('loss={:.4f}'.format(np.average(losses)))
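# Design note (not from the source): since every caption in a batch has the
# same padded length, the per-sample loss loop above is equivalent, up to a
# constant factor of batch_size, to a single flattened call:
#
#   loss = c(decoder_outputs.view(-1, v_size), targets.view(-1))
#
# which avoids the Python-level loop over the batch.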
import argparse
import os

import torch

from model import BiLSTM_CRF
from preprocess import load_obj

# UNK (the unknown-word token) is defined elsewhere in the project.

# args is referenced below; this parser reconstructs the two flags it uses.
parser = argparse.ArgumentParser()
parser.add_argument("--sentence", type=str, default=None)
parser.add_argument("--model", type=str, default=None)
args = parser.parse_args()


def tagging(model, sentence, ix_to_tag):
    # Add a batch dimension and mask out padding (index 0).
    sentence = sentence.unsqueeze(1)
    mask = sentence.ne(0)
    best_tag_ids = model.decode(sentence, mask)
    tags = [ix_to_tag[idx] for idx in best_tag_ids[0]]
    return tags


if __name__ == "__main__":
    if args.sentence is None:
        raise ValueError("Please input a sentence")
    if args.model is None:
        raise ValueError("Please specify the model file path")

    data_dir = "data/msra/processed"
    word_to_ix = load_obj(os.path.join(data_dir, "word_to_ix.pkl"))
    tag_to_ix = load_obj(os.path.join(data_dir, "tag_to_ix.pkl"))
    ix_to_tag = {v: k for k, v in tag_to_ix.items()}

    # Load trained model
    model = BiLSTM_CRF(len(word_to_ix), len(tag_to_ix), 100, 200, 0.1)
    model.load_state_dict(
        torch.load(args.model, map_location=torch.device("cpu")))
    model.eval()

    # Predict: MSRA is character-level, so iterate over the characters.
    sentence = torch.LongTensor(
        [word_to_ix.get(w, word_to_ix[UNK]) for w in args.sentence])
    best_tags = tagging(model, sentence, ix_to_tag)
    print(" ".join(best_tags))
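# Example invocation (script name and sentence are hypothetical; the
# checkpoint path follows the training script's save pattern):
#   python predict.py --sentence 我在北京工作 --model model_result/9.pt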
import torch as th

from preprocess import tokenize_message, pad_features, load_obj

# Load dictionaries
index_to_word = load_obj("data/index_to_word")
word_to_index = load_obj("data/word_to_index")


def predict(net, message, use_gpu, sequence_length=200):
    """
    net: PyTorch RNN model, without encryption
    message: plain-text string message to classify
    use_gpu: if True, use GPU computation
    sequence_length: int, length of the padded sequences
    """
    if len(message) == 0:
        return None

    net.eval()
    # Tokenize the message into word indices.
    test_ints = tokenize_message(message, word_to_index)
    # Pad/truncate the tokenized sequence to a fixed length.
    seq_length = sequence_length
    features = pad_features(test_ints, seq_length)
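    # The source snippet ends here; the remaining steps below are a sketch,
    # assuming the common sentiment-RNN interface: net(inputs, hidden) with an
    # init_hidden(batch_size) helper and a sigmoid output in [0, 1].
    feature_tensor = th.from_numpy(features)
    if use_gpu:
        feature_tensor = feature_tensor.cuda()
    h = net.init_hidden(feature_tensor.size(0))
    output, h = net(feature_tensor, h)
    # Round the probability to a hard 0/1 class prediction.
    return th.round(output.squeeze())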
import numpy as np
import torch
from torch.nn.utils.rnn import pad_packed_sequence

import preprocess

# EncoderRNN, LAttnDecoderRNN, getDataLoader, getwv and to_var are defined
# elsewhere in this project.


def main(args):
    fout = open(args.output, 'w')
    maxlen = 20
    num_layers = 1
    dim = 512
    quotes_set = set(['[', ']', '{', '}', '!', '?', '。'])

    sentences, v_size = preprocess.label2onehot(args.input)
    data = getDataLoader(sentences)
    v = preprocess.load_obj('vocabindex')
    i2v = preprocess.load_obj('index2vocab')
    print(sentences)
    v_size = len(v)
    print('sentences shape =', sentences.shape)

    we = getwv()
    encoder = EncoderRNN(v_size, dim, we, num_layers=num_layers,
                         bidirectional=False)
    #decoder = VanillaDecoderRNN(dim, v_size, num_layers=num_layers)
    #decoder = BAttnDecoderRNN(dim, v_size, num_layers=num_layers)
    decoder = LAttnDecoderRNN(dim, v_size, we, num_layers=num_layers)
    encoder.load_state_dict(torch.load(args.encoder))
    decoder.load_state_dict(torch.load(args.decoder))
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    print(encoder)
    print(decoder)

    for sentence in sentences:
        print(sentence)
        s = to_var(torch.LongTensor(sentence).view(1, -1))
        encoder_hidden = encoder.initHidden(num_layers, 1)
        encoder_o, encoder_hidden = encoder(s, [s.size(1)], encoder_hidden)
        # The encoder returns a packed sequence; unpack it for attention.
        encoder_o = pad_packed_sequence(encoder_o, batch_first=True)[0]
        decoder_hidden = encoder_hidden[:num_layers]

        # Greedy decoding from <start> until <end>, <pad>, '.' or maxlen.
        inputword = v['<start>']
        flag = True
        words = []
        length = 0
        while flag:
            inputword = to_var(torch.LongTensor([inputword]).view(1, -1))
            output, decoder_hidden = decoder(inputword, decoder_hidden,
                                             encoder_o)
            maxkey = output[0].data.argmax().item()
            inputword = maxkey
            word = i2v[maxkey]
            length += 1
            if length > maxlen:
                flag = False
            if word == '<end>' or word == '<pad>':
                flag = False
            elif word == '<unk>':
                continue
            elif word == '.' or word == '。':
                #decoded = decoded[:-1]
                #decoded += word
                flag = False
            else:
                # Skip repeated punctuation and immediate duplicate words.
                if word in words and word in quotes_set:
                    continue
                if len(words) == 0 or word != words[-1]:
                    words.append(word)
        decoded = ' '.join(words)
        print(decoded)
        fout.write(decoded + '\n')
    fout.close()
import torch
from torch.utils.data import Dataset

from preprocess import load_obj


class NERDataset(Dataset):
    def __init__(self, dataset_pkl):
        super(NERDataset, self).__init__()
        self.dataset = load_obj(dataset_pkl)
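    # Not in the source: DataLoader also needs __len__ and __getitem__. A
    # minimal sketch, assuming each pickled element is a (sequence, tags)
    # pair of index lists:
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        seq, tags = self.dataset[index]
        return torch.LongTensor(seq), torch.LongTensor(tags)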
import os

import torch
import torch.optim as optim
from torch.utils.data import DataLoader

# NERDataset, BatchPadding, BiLSTM_CRF, load_obj and the upper-case
# hyperparameters (DATASET, BATCH_SIZE, EMBEDDING_DIM, HIDDEN_DIM, DROPOUT,
# LEARN_RATE, EPOCHS, LOG_INTERVAL, PATIENCE) come from this project's own
# modules and config.


def my_train():
    os.makedirs("model_result", exist_ok=True)
    torch.manual_seed(1)
    device = torch.device('cuda')
    data_dir = f"data/{DATASET}/processed"

    # Load datasets and vocabularies
    train_data = NERDataset(os.path.join(data_dir, "train.pkl"))
    test_data = NERDataset(os.path.join(data_dir, "test.pkl"))
    dev_data = NERDataset(os.path.join(data_dir, "dev.pkl"))
    word_to_idx = load_obj(os.path.join(data_dir, "word_to_idx.pkl"))
    tag_to_idx = load_obj(os.path.join(data_dir, "tag_to_idx.pkl"))
    idx_to_tag = {v: k for k, v in tag_to_idx.items()}

    train_loader = DataLoader(
        train_data,
        batch_size=BATCH_SIZE,
        collate_fn=BatchPadding(),
        shuffle=True,
        num_workers=2,
        pin_memory=True,
    )
    dev_loader = DataLoader(
        dev_data,
        batch_size=BATCH_SIZE,
        collate_fn=BatchPadding(),
        shuffle=True,
        num_workers=2,
        pin_memory=True,
    )
    test_loader = DataLoader(
        test_data,
        batch_size=BATCH_SIZE,
        collate_fn=BatchPadding(),
        shuffle=True,
        num_workers=2,
        pin_memory=True,
    )

    # Build the model
    model = BiLSTM_CRF(len(word_to_idx), len(tag_to_idx), EMBEDDING_DIM,
                       HIDDEN_DIM, DROPOUT).to(device)
    print(model)
    optimizer = optim.Adam(model.parameters(), lr=LEARN_RATE)

    print("\nStart training")
    f1_max = 0
    cur_patience = 0  # early-stopping patience counter, to avoid overfitting
    for epoch in range(EPOCHS):
        model.train()
        for i, (seqs, tags, masks) in enumerate(train_loader, 1):
            optimizer.zero_grad()
            loss = model.loss(seqs.to(device), tags.to(device),
                              masks.to(device))
            loss.backward()
            optimizer.step()
            if i % LOG_INTERVAL == 0:
                print("epoch {}: {:.0f}%\t\tLoss: {:.6f}".format(
                    epoch, 100.0 * i / len(train_loader), loss.item()))

        dev_precision, dev_recall, dev_f1 = evaluate(model, dev_loader,
                                                     idx_to_tag)
        test_precision, test_recall, test_f1 = evaluate(model, test_loader,
                                                        idx_to_tag)
        print(f"\ndev\tprecision: {dev_precision}, recall: {dev_recall}, "
              f"f1: {dev_f1}")
        print(f"test\tprecision: {test_precision}, recall: {test_recall}, "
              f"f1: {test_f1}\n")
        torch.save(model.state_dict(), f"model_result/{epoch}.pt")

        # Early stopping: track the best dev F1 to detect overfitting.
        if dev_f1 > f1_max:
            f1_max = dev_f1
            cur_patience = 0
            if dev_f1 > 0.9 and test_f1 > 0.9:
                break
        else:
            cur_patience += 1
            if cur_patience >= PATIENCE:
                # Stop after repeatedly failing to beat the best dev F1.
                break
    print("Best dev F1: ", f1_max)
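# evaluate() is used above but not shown in this snippet. A minimal sketch
# using seqeval's entity-level metrics, assuming model.decode(seqs, masks)
# returns a list of tag-id lists (as in the tagging script) and that
# BatchPadding yields (seqs, tags, masks) batches as in the training loop.
from seqeval.metrics import f1_score, precision_score, recall_score


def evaluate(model, loader, idx_to_tag):
    device = next(model.parameters()).device
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for seqs, tags, masks in loader:
            best_ids = model.decode(seqs.to(device), masks.to(device))
            for pred, gold, mask in zip(best_ids, tags, masks):
                length = int(mask.sum())
                y_pred.append([idx_to_tag[int(p)] for p in pred[:length]])
                y_true.append([idx_to_tag[int(t)] for t in gold[:length]])
    return (precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            f1_score(y_true, y_pred))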
import preprocess


def index2sentence(s):
    """Print the words corresponding to a tensor of vocabulary indices."""
    v = preprocess.load_obj('index2vocab')
    for w in s:
        print(v[w.item()], end=' ')
    print()