def print_vectors():
    vocab = Vocab()
    vocab.build(TRAIN_PATH)

    # torch.tensor([2764, 64])
    pre_trained_embedding = vocab.load_weight(EMBED_PATH)

    train_data, test_data, test_answer = ev.prepare_evaluate()

    obj = train_data
    for d in obj.keys():
        t_in = prep.tensorFromSentence(vocab, obj[d])
        embedded = get_sent_embed(t_in, pre_trained_embedding)
        print("%s\t%s" % (d, ' '.join([str(e) for e in embedded.squeeze().data.tolist()])))
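get_sent_embed is called above but not defined in this excerpt. A minimal sketch, assuming it simply averages the pre-trained embedding rows indexed by the input token tensor (the signature is inferred from the call site, not taken from the project):

import torch

# Hypothetical sketch of get_sent_embed; assumes t_in is a LongTensor of token
# ids and pre_trained_embedding is the [vocab_size, dim] weight tensor.
def get_sent_embed(t_in, pre_trained_embedding):
    vectors = pre_trained_embedding[t_in.view(-1)]       # [seq_len, dim]
    return torch.mean(vectors, dim=0, keepdim=True)       # [1, dim] sentence vector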
def evaluate():
    vocab = Vocab()
    vocab.build(TRAIN_PATH)

    # torch.tensor([2764, 64])
    pre_trained_embedding = vocab.load_weight(EMBED_PATH)

    train_data, test_data, test_answer = ev.prepare_evaluate()
    train_embed = get_embed(vocab, train_data, pre_trained_embedding)

    # evaluation
    print("[INFO] start evaluating!")
    total = len(test_data)
    answer5 = 0
    answer1 = 0
    for tk in test_data:
        print("Q.%s %s" % (tk, pretty_printer2(test_data[tk])))
        test_in = prep.tensorFromSentence(vocab, test_data[tk])
        embedded = get_sent_embed(test_in, pre_trained_embedding)

        temp = {}
        for candi in train_embed.keys():
            t = train_embed[candi]
            e = embedded
            temp[candi] = cosine_similarity(t, e)

        top_n = get_top_n(temp, 5)
        for e in top_n.keys():
            print("%.4f %4s %s" % (top_n[e], e, pretty_printer2(train_data[e])))
            if ev.isAnswer(e, test_answer[tk]):
                answer5 += 1
                break

        top1 = list(top_n.keys())[0]
        if ev.isAnswer(top1, test_answer[tk]):
            answer1 += 1
        print("------------------------------------------")

    accuracy_at_5 = answer5 / total * 100
    accuracy_at_1 = answer1 / total * 100
    print("total: %d, accuracy@5: %.4f, accuracy@1: %.4f" % (total, accuracy_at_5, accuracy_at_1))
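cosine_similarity and get_top_n are used by evaluate() but defined elsewhere. Plausible sketches, assuming 1-D (or [1, dim]) tensor inputs and a dict of candidate scores; these are guesses based on the call sites, not the project's actual helpers:

import torch
import torch.nn.functional as F

# Hypothetical similarity helper: flattens both vectors and returns a float score.
def cosine_similarity(a, b):
    return F.cosine_similarity(a.view(1, -1), b.view(1, -1)).item()

# Hypothetical ranking helper: keeps the n best-scoring candidates,
# ordered from highest to lowest similarity.
def get_top_n(scores, n):
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:n]
    return dict(ranked)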
def process(self):
    tok = Tokenizer()

    # consider entire corpus as text (train + test text columns)
    if self.test_csv:
        text = list(self.df.loc[:, self.text_cols].values) + list(
            self.test_df.loc[:, self.text_cols].values)
    else:
        text = list(self.df.loc[:, self.text_cols].values)

    self.tokens = [tok.tokenizer(x) for x in text]
    self.vocab = Vocab.create(self.tokens, self.max_vocab, self.min_freq)
    self.ntokens = [self.vocab.numericalize(t) for t in self.tokens]

    # only full training
    if self.valid_pct == 0 and self.test_csv is None:
        self.trn_ds = (self.ntokens, self.df.loc[:, self.label_cols].values)
        self.vld_ds = ([], [])
        self.tst_ds = ([], [])
    # holdout
    elif self.valid_pct > 0 and self.test_csv is None:
        self.trn_ds = (self.ntokens[self.cut:],
                       self.df.loc[:, self.label_cols].values[self.cut:])
        self.vld_ds = (self.ntokens[:self.cut],
                       self.df.loc[:, self.label_cols].values[:self.cut])
        self.tst_ds = ([], [])
    # holdout and test prediction
    elif self.valid_pct > 0 and self.test_csv is not None:
        self.trn_tokens = self.ntokens[:len(self.df)]
        self.tst_ds = (self.ntokens[len(self.df):], [])
        trn_tokens = self.trn_tokens[self.cut:]
        vld_tokens = self.trn_tokens[:self.cut]
        self.trn_ds = (trn_tokens, self.df.loc[:, self.label_cols].values[self.cut:])
        self.vld_ds = (vld_tokens, self.df.loc[:, self.label_cols].values[:self.cut])
    # full training and test prediction
    else:
        self.trn_ds = (self.ntokens[:len(self.df)],
                       self.df.loc[:, self.label_cols].values)
        self.vld_ds = ([], [])
        self.tst_ds = (self.ntokens[len(self.df):], [])

    return self.vocab, self.trn_ds, self.vld_ds, self.tst_ds
def CaptchaGenerator4(samples, batch_size):
    # to determine dimensions
    # while True:
    batch = np.random.choice(samples, batch_size)
    X = []
    y = []
    for sample in batch:
        img = np.asarray(Image.open(sample))
        text = Vocab().text_to_one_hots(sample[-8:-4])
        X.append(img)
        y.append(text)
    X = np.asarray(X)
    y = np.asarray(y)
    for i in range(4):
        print(y[:, i])
    yield X, [y[:, i] for i in range(4)]
def CaptchaGenerator(samples, batch_size):
    while True:
        batch = np.random.choice(samples, batch_size)
        X = []
        y = []
        for sample in batch:
            img = np.asarray(Image.open(sample))
            text = Vocab().text_to_one_hot(sample[-8:-7])
            X.append(img)
            y.append(text)
        X = np.asarray(X)
        y = np.asarray(y)
        # print("data:")
        # print(X.shape)
        # print(y.shape)
        yield X, y
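A minimal usage sketch for the generator above, assuming a compiled single-character Keras model named `model` and a list of training image paths `train_samples` (both names are assumptions, not part of this file):

# Hypothetical training call; the generator yields batches indefinitely,
# so steps_per_epoch controls how many batches make up one epoch.
train_gen = CaptchaGenerator(train_samples, batch_size=32)
model.fit_generator(train_gen, steps_per_epoch=100, epochs=10)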
def evaluate(args):
    vocab = Vocab()
    vocab.build(train_file)

    batch_size = args.batch_size
    hidden_size = args.hidden_size
    w_embed_size = args.w_embed_size

    if args.pre_trained_embed == 'n':
        encoder = Encoder(vocab.n_words, w_embed_size, hidden_size, batch_size).to(device)
        decoder = AttentionDecoder(vocab.n_words, w_embed_size, hidden_size, batch_size).to(device)
        # decoder = Decoder(vocab.n_words, w_embed_size, hidden_size, batch_size).to(device)
    else:
        # load pre-trained embedding
        weight = vocab.load_weight(path="data/komoran_hd_2times.vec")
        encoder = Encoder(vocab.n_words, w_embed_size, hidden_size, batch_size, weight).to(device)
        decoder = AttentionDecoder(vocab.n_words, w_embed_size, hidden_size, batch_size, weight).to(device)
        # decoder = Decoder(vocab.n_words, w_embed_size, hidden_size, batch_size, weight).to(device)

    if args.encoder:
        encoder.load_state_dict(torch.load(args.encoder))
        print("[INFO] load encoder with %s" % args.encoder)
    if args.decoder:
        decoder.load_state_dict(torch.load(args.decoder))
        print("[INFO] load decoder with %s" % args.decoder)

    # evaluate_similarity(encoder, vocab, batch_size, decoder=decoder)
    pre_trained_embedding = vocab.load_weight(EMBED_PATH)
    eval_sim_lc(encoder, vocab, batch_size, pre_trained_embedding, decoder=decoder)
def main(args):
    global batch_size
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    w_embed_size = args.w_embed_size
    lr = args.lr
    train_file = 'data/train_data_nv.txt'

    vocab = Vocab()
    vocab.build(train_file)

    if args.pre_trained_embed == 'n':
        encoder = Encoder(vocab.n_words, w_embed_size, hidden_size, batch_size).to(device)
        decoder = AttentionDecoder(vocab.n_words, w_embed_size, hidden_size, batch_size).to(device)
    else:
        # load pre-trained embedding
        weight = vocab.load_weight(path="data/komoran_hd_2times.vec")
        encoder = Encoder(vocab.n_words, w_embed_size, hidden_size, batch_size, weight).to(device)
        decoder = AttentionDecoder(vocab.n_words, w_embed_size, hidden_size, batch_size, weight).to(device)

    if args.encoder:
        encoder.load_state_dict(torch.load(args.encoder))
        print("[INFO] load encoder with %s" % args.encoder)
    if args.decoder:
        decoder.load_state_dict(torch.load(args.decoder))
        print("[INFO] load decoder with %s" % args.decoder)

    train_data = prep.read_train_data(train_file)
    train_loader = data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

    # ev.evaluateRandomly(encoder, decoder, train_data, vocab, batch_size)
    # ev.evaluate_with_print(encoder, vocab, batch_size)

    # initialize
    max_a_at_5, max_a_at_1 = ev.evaluate_similarity(encoder, vocab, batch_size, decoder=decoder)
    # max_a_at_5, max_a_at_1 = 0, 0
    max_bleu = 0

    total_epoch = args.epoch
    print(args)

    for epoch in range(1, total_epoch + 1):
        random.shuffle(train_data)
        trainIters(args, epoch, encoder, decoder, total_epoch, train_data, vocab,
                   train_loader, print_every=2, learning_rate=lr)

        if epoch % 20 == 0:
            a_at_5, a_at_1 = ev.evaluate_similarity(encoder, vocab, batch_size, decoder=decoder)
            if a_at_1 > max_a_at_1:
                max_a_at_1 = a_at_1
                print("[INFO] New record! accuracy@1: %.4f" % a_at_1)
            if a_at_5 > max_a_at_5:
                max_a_at_5 = a_at_5
                print("[INFO] New record! accuracy@5: %.4f" % a_at_5)
                if args.save == 'y':
                    torch.save(encoder.state_dict(), 'encoder-max.model')
                    torch.save(decoder.state_dict(), 'decoder-max.model')
                    print("[INFO] new model saved")

            bleu = ev.evaluateRandomly(encoder, decoder, train_data, vocab, batch_size)
            if bleu > max_bleu:
                max_bleu = bleu
                if args.save == 'y':
                    torch.save(encoder.state_dict(), 'encoder-max-bleu.model')
                    torch.save(decoder.state_dict(), 'decoder-max-bleu.model')
                    print("[INFO] new model saved")

    print("Done! max accuracy@5: %.4f, max accuracy@1: %.4f" % (max_a_at_5, max_a_at_1))
    print("max bleu: %.2f" % max_bleu)

    if args.save == 'y':
        torch.save(encoder.state_dict(), 'encoder-last.model')
        torch.save(decoder.state_dict(), 'decoder-last.model')
parser.add_argument('--hidden_size', type=int, default=128)
parser.add_argument('--w_embed_size', type=int, default=64)
parser.add_argument('--lr', type=float, default=0.001)
parser.add_argument('--epoch', type=int, default=400)
parser.add_argument('--save', choices=['y', 'n'], default='n')
parser.add_argument('--pre_trained_embed', choices=['y', 'n'], default='y')
args = parser.parse_args()

global batch_size
batch_size = args.batch_size
hidden_size = args.hidden_size
w_embed_size = args.w_embed_size
train_file = 'data/train_data_nv.txt'

vocab = Vocab()
vocab.build(train_file)

if args.pre_trained_embed == 'n':
    encoder = Encoder(vocab.n_words, w_embed_size, hidden_size, batch_size).to(device)
    decoder = AttentionDecoder(vocab.n_words, w_embed_size, hidden_size, batch_size).to(device)
    # decoder = Decoder(vocab.n_words, w_embed_size, hidden_size, batch_size).to(device)
else:
    # load pre-trained embedding
    weight = vocab.load_weight(path="data/komoran_hd_2times.vec")
    encoder = Encoder(vocab.n_words, w_embed_size, hidden_size, batch_size, weight).to(device)
    decoder = AttentionDecoder(vocab.n_words, w_embed_size, hidden_size, batch_size, weight).to(device)
# imports below (glob, numpy, PIL, load_model) are assumed from the surrounding script
import glob
import numpy as np
from PIL import Image
from keras.models import load_model
from preprocess import Vocab
import matplotlib.pyplot as plt  # plt is used to display images

model = load_model('capcha_model_one_char.h5')
samples = glob.glob('data/test/*.jpg')
batch_size = 10

batch = np.random.choice(samples, batch_size)
print(batch)
for sample in batch:
    img = np.asarray(Image.open(sample)).reshape(1, 60, 240, 3)
    plt.imshow(Image.open(sample))
    text = sample[-8:-4]
    pre_text = Vocab().one_hot_to_text(model.predict(img)[0])
    print(model.predict(img))
    print('prediction is: {}'.format(pre_text))
    print('real is: {}'.format(text))

# batch = np.random.choice(samples, batch_size)
# X = []
# y = []
# for sample in batch:
#     img = np.asarray(Image.open(sample))
#     text = Vocab().text_to_one_hot(sample[-8:-7])
#     X.append(img)
#     y.append(text)
# X = np.asarray(X)
# y = np.asarray(y)
#
# y1 = [y[:, i] for i in range(4)]
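Vocab.one_hot_to_text is defined elsewhere in the project; a hypothetical sketch of the decoding step it performs, assuming a fixed character set and an argmax over the model's per-character scores (both the character set and the function body are assumptions):

import numpy as np

# Hypothetical alphabet; the project's actual character set may differ.
CHARSET = '0123456789abcdefghijklmnopqrstuvwxyz'

def one_hot_to_text(probs):
    # probs: 1-D array of per-character scores; return the most likely symbol.
    return CHARSET[int(np.argmax(probs))]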
with open(trainfile, 'r') as file:
    line = file.readline()
    while line:
        s, t, f = line.split('\t')
        data[0].append((s, t, f.split(';')))
        line = file.readline()

with open(parsefile, 'r') as file:
    line = file.readline()
    while line:
        s, t, f = line.split('\t')
        data[1].append((s, t, f.split(';')))
        line = file.readline()

# Prepare vocabulary. Words that appear only in the evaluation set are not added.
vocab = Vocab(data[0])
vocab.add_parsefile(data[1])

# Instantiate a model.
mdl = model.Model(char_dim=config.char_dim,
                  feat_dim=config.feat_dim,
                  hidden_dim=config.hidden_dim,
                  char_size=len(vocab._char_dict.x2i),
                  feat_sizes=[len(fd.x2i) for fd in vocab._feat_dicts])

# Train and validate the model. Training stops when the maximum accuracy on the
# validation set has not improved for more than the specified number of epochs.
max_acc = 0
has_not_been_updated_for = 0
for epc in range(config.epochs):
    for i in test_in[1:]:
        x = torch.cat((x, _pre_trained_embedding[i].view(1, -1)), 0)
    return x


def get_sentence_embed(vocab, sentence, pre_trained_embedding):
    """ represent sentence by averaging word embeddings """
    we_matrix = get_word_embed_matrix(vocab, sentence, pre_trained_embedding)
    return torch.mean(we_matrix, 0)


def get_sentence_embed_sa(vocab, sentence, pre_trained_embedding):
    we_matrix = get_word_embed_matrix(vocab, sentence, pre_trained_embedding)
    applied_sent = scaled_dot_product_attn(we_matrix, we_matrix, we_matrix)
    return applied_sent


if __name__ == "__main__":
    vocab = Vocab()
    vocab.build(train_file)
    pre_trained_embedding = vocab.load_weight(EMBED_PATH)

    sentence = '에어컨/NNG 작동/NNG 시/NNB 냉방/NNG 성능/NNG 떨어지/VV 그렇/VA 모르/VV 어떻/VA 하/VV 하/VX'
    # sentence = '에어컨/NNG 시원/XR 나오/VV 않/VX 그렇/VA 자동차/NNG 고장/NNG 아니/VCN 하/VV 연락/NNG 드리/VV'

    # data = get_word_embed_matrix(vocab, sentence, pre_trained_embedding)
    data = get_sentence_embed_sa(vocab, sentence, pre_trained_embedding)
    print(data.size())
    sns.heatmap(data)
    plt.show()
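scaled_dot_product_attn is referenced above but not shown in this excerpt. A minimal sketch of standard scaled dot-product attention, softmax(QK^T / sqrt(d_k))V, with the signature assumed from the call above (not necessarily the project's exact implementation):

import math
import torch
import torch.nn.functional as F

# Minimal sketch assuming q, k, v are 2-D tensors of shape [seq_len, dim].
def scaled_dot_product_attn(q, k, v):
    d_k = q.size(-1)
    scores = torch.matmul(q, k.transpose(0, 1)) / math.sqrt(d_k)  # [seq, seq]
    weights = F.softmax(scores, dim=-1)                           # attention weights
    return torch.matmul(weights, v)                               # [seq, dim]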
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import os

from config import config
from preprocess import Vocab, Preprocess, dataset
from model import Encoder, Attention, Decoder
from torch.utils.data import DataLoader, Dataset

if __name__ == "__main__":
    print('==> Loading config......')
    cfg = config()

    print('==> Preprocessing data......')
    voc = Vocab(cfg)
    voc.gen_counter_dict()
    voc.gen_vocab()
    cfg.vocab_len = voc.vocab_len
    print('The length of vocab is: {}'.format(cfg.vocab_len))

    prep = Preprocess(cfg, voc.vocab)
    pairs = prep.gen_pair_sen()
    print('pairs sentences generated.')
    pairs = prep.tokenize(pairs)
    print('sentences tokenized.')

    traindataset = dataset(pairs, voc.vocab)
    traindataloader = DataLoader(traindataset, batch_size=5, shuffle=False)
    one_iter = next(iter(traindataloader))
print("total: %d, accuracy@5: %.4f, accuracy@1: %.4f" % (total, accuracy_at_5, accuracy_at_1)) """ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--encoder', help='load exisiting model') parser.add_argument('--decoder', help='load exisiting model') parser.add_argument('--batch_size', type=int, default=40) parser.add_argument('--hidden_size', type=int, default=128) parser.add_argument('--w_embed_size', type=int, default=64) parser.add_argument('--pre_trained_embed', choices=['y', 'n'], default='n') args = parser.parse_args() vocab = Vocab() vocab.build(train_file) batch_size = args.batch_size hidden_size = args.hidden_size w_embed_size = args.w_embed_size if args.pre_trained_embed == 'n': encoder = Encoder(vocab.n_words, w_embed_size, hidden_size, batch_size).to(device) decoder = AttentionDecoder(vocab.n_words, w_embed_size, hidden_size, batch_size).to(device) # decoder = Decoder(vocab.n_words, w_embed_size, hidden_size, batch_size).to(device) else: # load pre-trained embedding weight = vocab.load_weight(path="data/komoran_hd_2times.vec")