def test_empty_vocab():
    """ Nothing is present in an empty word list """
    vocab = Vocab([])
    assert vocab.as_list() == []
    assert not vocab.has("sheep")
def train(corpus_file, out_file, mode, dim_size, window, min_count, negative, epoch, pool_size, chunk_size):
    with bz2.BZ2File(corpus_file) as f:
        sentences = LineSentence(f)
        sg = int(mode == 'sg')
        model = Word2Vec(sentences, size=dim_size, window=window, min_count=min_count,
                         workers=pool_size, iter=epoch, negative=negative, sg=sg)

    words = []
    entities = []
    for (w, _) in model.vocab.iteritems():
        if w.startswith(MARKER):
            entities.append(w[len(MARKER):].replace(u'_', u' '))
        else:
            words.append(w)

    vocab = Vocab(Trie(words), Trie(entities))

    word_embedding = np.zeros((len(words), dim_size), dtype=np.float32)
    entity_embedding = np.zeros((len(entities), dim_size), dtype=np.float32)

    for word in words:
        word_embedding[vocab.get_word_index(word)] = model[word]
    for entity in entities:
        entity_embedding[vocab.get_entity_index(entity)] = model[MARKER + entity.replace(u' ', u'_')]

    ret = dict(
        word_embedding=word_embedding,
        entity_embedding=entity_embedding,
        vocab=vocab,
    )
    joblib.dump(ret, out_file, compress=False)
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(filtered_num, vocab.size()))

    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
def train_worker(vec_size, window_size, k, alpha, queue, results_queue,
                 sent_dic, sent_vecs, vocab_dic, vocab_vecs,
                 table, win_count_dic, lock):
    # change shared Array to numpy array
    sent_vecs = Arr(np.frombuffer(sent_vecs.get_obj()), vec_size)
    vocab_vecs = Arr(np.frombuffer(vocab_vecs.get_obj()), vec_size)
    # init objects
    sent = Sent(vec_size, sent_dic, sent_vecs)
    vocab = Vocab(vec_size, vocab_dic, vocab_vecs)
    window_table = WindowTable(vocab=vocab, size=window_size, table=table, win_count_dic=win_count_dic)
    # get a task
    sentence = queue.get()
    while sentence != None:
        if sentence == CURRENT_TURN_END_TOKEN:
            results_queue.put(None)
            sentence = queue.get()
            continue
        Jn = 0
        windows = gen_windows_from_sentence(sentence, window_size)
        v = sent[sentence]
        for wn, window in enumerate(windows):
            window_key = "-".join([str(vocab.vocab[hash(w)]) for w in window])
            h = vocab.get_window_vec(word_index=window_key)
            # noises
            noises = window_table.get_samples(k)
            e_vT_h = np.e ** np.dot(v.T, h)
            update_v = h / (1. + e_vT_h)
            update_h = v / (1. + e_vT_h)
            # add positive window's loss
            Jn += math.log(1. / (1. + 1. / e_vT_h))
            update_window(vocab, window_key, update_h, lock, alpha)
            for idx, key in noises:
                n_h = vocab.get_window_vec(word_index=key)
                e_vT_h = np.e ** np.dot(v, n_h)
                frac_e_v_h = 1 - 1 / (1 + e_vT_h)
                # accumulate the gradient
                update_v += - n_h * frac_e_v_h
                update_n_h = - v * frac_e_v_h
                update_window(vocab, key, update_n_h, lock, alpha)
                # add noise's loss
                Jn += math.log(1 / (1 + e_vT_h))
            update_v /= (1 + k)
            update_sent_vec(v, update_v, lock, alpha)
        # return Jn
        results_queue.put(Jn)
        current = mp.current_process()
        # print "%s Jn: %f" % (current.name, Jn)
        sentence = queue.get()
        show_status(results_queue)
    print "process %s exit!" % current.name
    logging.warning("process %s exit!" % current.name)
def load_data(small=True, char_based=False, batch_size=20, vocab_size=10000,
              history_len=5, max_tokens=50, null_mark=False):
    vocab_path = os.path.join(resource_dir, "ptb.train.txt")
    valid_path = os.path.join(resource_dir, "ptb.valid.txt")
    if small:
        train_path = os.path.join(resource_dir, "ptb.train.10k.txt")
    else:
        train_path = os.path.join(resource_dir, "ptb.train.txt")
    vocab = Vocab(char_based=char_based, null_mark=null_mark)
    vocab.load(vocab_path, max_size=vocab_size)
    lmdata = LMDataset(vocab, train_path, valid_path, history_len=-1,
                       char_based=char_based, max_tokens=max_tokens)
    batch = BunchSequences(lmdata, batch_size=batch_size, fragment_length=history_len)
    return vocab, batch
def initialize_vocab_and_tags(tags, vocab):
    if not vocab:
        vocab = Vocab()
        vocab.add("#OOV")
        learn_vocab = True
    else:
        learn_vocab = False
    if not tags:
        tags = Vocab()
        learn_tags = True
    else:
        learn_tags = False
    return learn_tags, learn_vocab, tags, vocab
def __init__(self, path="", vec_size=50, k=20, alpha=0.1, n_workers=1):
    '''
    :parameters:
        @path: string
            path to dataset, should be a single file
        @vec_size: int
            size of sentence vector and word vector
        @k: int
            number of negative samples for a window
        @alpha: float
            learning rate
    '''
    self.k = k
    self.vec_size = vec_size
    self.n_workers = n_workers
    self.alpha = alpha
    self.vocab = Vocab()
    self.sent = Sent()
    self.window_table = WindowTable(self.vocab, SIZE)
    self.dataset = Dataset(path)
    if path:
        self.create_vocab()
        self.create_sent()
        self.create_window_table()
def test_add(self):
    v = Vocab()
    v.add('a')
    v.add('b')
    v.add('c')
    v.add('d')
    v.add('a')
    self.assertEqual(len(v), 4)
    self.assertEqual(v['a'], 0)
    self.assertEqual(v.rev(3), 'd')
    self.assertEqual(v.rev(0), 'a')
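This test pins down a small index-assigning interface (add, __len__, __getitem__, rev). A minimal sketch that would pass it, assuming nothing beyond what the assertions require (the project's actual class is not shown in these snippets), might look like:

# Hypothetical minimal Vocab for the test above; only a sketch, not the real implementation.
class Vocab(object):
    def __init__(self):
        self._word2id = {}   # word -> integer id, assigned in insertion order
        self._id2word = []   # integer id -> word

    def add(self, word):
        # re-adding an existing word is a no-op, so ids stay stable
        if word not in self._word2id:
            self._word2id[word] = len(self._id2word)
            self._id2word.append(word)
        return self._word2id[word]

    def __len__(self):
        return len(self._id2word)

    def __getitem__(self, word):
        return self._word2id[word]

    def rev(self, idx):
        # reverse lookup: id -> word
        return self._id2word[idx]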
def initialize_vocab_and_tags(tags, vocab, alphabet):
    if not vocab:
        vocab = Vocab()
        vocab.add('#OOV')
        alphabet = Vocab()
        alphabet.add('#OOA')
        learn_vocab = True
    else:
        learn_vocab = False
    if not tags:
        tags = Vocab()
        tags.add("#OOT")
        learn_tags = True
    else:
        learn_tags = False
    return learn_tags, learn_vocab, tags, vocab, alphabet
def getembd():
    vocab_file = 'data/vocab.txt'
    vocab = Vocab(filename=vocab_file)
    emb_file = os.path.join('data/', 'webkbb_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
        print(emb.size())
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(os.path.join('data/glove', 'glove.6B.200d'))
        print('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.zeros(vocab.size(), glove_emb.size(1))
        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(word)]
            else:
                emb[vocab.getIndex(word)] = torch.Tensor(emb[vocab.getIndex(word)].size()).normal_(-0.05, 0.05)
        torch.save(emb, emb_file)
        is_preprocessing_data = True  # flag to quit
        print('done creating emb, quit')
def test_small_vocab():
    l = ["eeny", "moe", "miney", "meeny"]
    vocab = Vocab(l)
    assert vocab.has("moe")
    assert vocab.has("eeny")
    assert vocab.has("miney")
    assert vocab.has("meeny")
    assert not vocab.has("many")
    assert sorted(vocab.as_list()) == sorted(l)
def main(unused_args):
    ''' Generates data from a trained model (fun!) '''
    if not FLAGS.load_model:
        print('--load_model is required')
        return -1

    with tf.Graph().as_default(), tf.Session() as session:
        ''' load parameters of the model '''
        with tf.variable_scope("params"):
            num_layers_var = tf.Variable(0, name='num_layers')
            hidden_size_var = tf.Variable(0, name='hidden_size')
            vocab_size_var = tf.Variable(0, name='vocab_size')
            tf.train.Saver([num_layers_var, hidden_size_var, vocab_size_var]).restore(session, FLAGS.load_model)
            vocab_var = tf.Variable([0] * vocab_size_var.eval(), name='vocab')
            tf.train.Saver([vocab_var]).restore(session, FLAGS.load_model)

        FLAGS.num_layers = np.asscalar(num_layers_var.eval())
        FLAGS.hidden_size = np.asscalar(hidden_size_var.eval())
        vocab = Vocab.from_array(vocab_var.eval())

        print('Loaded model from file', FLAGS.load_model)
        print('\tnum_layers:', FLAGS.num_layers)
        print('\thidden_size:', FLAGS.hidden_size)
        print('\tvocab_size', vocab.size)

        ''' load inference graph '''
        with tf.variable_scope("model", reuse=None):
            m = graph.inference_graph(vocab.size, FLAGS.num_layers, FLAGS.hidden_size)
        tf.train.Saver().restore(session, FLAGS.load_model)

        logits = np.ones((vocab.size,))
        state = session.run(m.initial_state)
        for i in range(FLAGS.sample_size):
            logits = logits / FLAGS.temperature
            prob = np.exp(logits)
            prob /= np.sum(prob)
            prob = prob.ravel()
            ix = np.random.choice(range(len(prob)), p=prob)
            print(vocab.decode(ix), end='')
            logits, state = session.run([m.logits, m.final_state],
                                        {m.input_data: np.array([[ix]]), m.initial_state: state})
def read_datasets(input_data, train_fraction=0.95, valid_fraction=0.05, vocab=None, vocab_size=128):
    print('Reading data from', input_data, '...')
    with open(input_data, 'rb') as f:
        data = f.read()
    if vocab is None:
        vocab = Vocab.from_data(data, vocab_size=vocab_size)
    train_size = int(math.floor(len(data) * train_fraction))
    valid_size = int(math.floor(len(data) * valid_fraction))
    train_data = data[:train_size]
    valid_data = data[train_size:train_size + valid_size]
    test_data = data[train_size + valid_size:]
    return ([vocab.encode(c) for c in train_data],
            [vocab.encode(c) for c in valid_data],
            [vocab.encode(c) for c in test_data],
            vocab)
def test_from_simulated_file():
    from io import StringIO
    l = StringIO(initial_value="""
    #comment
    # another comment line
    sheep
    rats #comment
    squirrels
    """)
    vocab = Vocab(l)
    assert sorted(vocab.as_list()) == ["rats", "sheep", "squirrels"]
    assert vocab.has("sheep")
    assert vocab.has("rats")
    assert vocab.has("squirrels")
    assert not vocab.has("#comment")
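The three list/file-based tests above (test_empty_vocab, test_small_vocab, test_from_simulated_file) pin down a constructor that accepts either a list of words or a file-like object of lines, plus has() and as_list(). A minimal sketch that would satisfy them, assuming the real class simply strips comments and blank lines (it is not shown in these snippets), might look like:

# Hypothetical minimal Vocab for the list/file-based tests; a sketch only.
class Vocab(object):
    def __init__(self, source):
        # `source` may be a list of words or a file-like object yielding lines
        self._words = set()
        for line in source:
            # drop full-line and inline '#' comments, plus surrounding whitespace
            word = line.split("#", 1)[0].strip()
            if word:
                self._words.add(word)

    def has(self, word):
        return word in self._words

    def as_list(self):
        return list(self._words)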
def main():
    args = parse_args()
    print(args)
    num_classes = 7
    data_dir = args.data_dir
    train_file = os.path.join(data_dir, 'train_data.pth')
    # val_dir = args.val_data
    val_file = os.path.join(data_dir, 'val_data.pth')
    vocab_file = "../data/vocab.txt"
    vocab = Vocab(filename=vocab_file)

    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = WebKbbDataset(vocab, num_classes,
                                      os.path.join(data_dir, 'train_texts.blk'),
                                      os.path.join(data_dir, 'train_labels.blk'))
        torch.save(train_dataset, train_file)
    if os.path.isfile(val_file):
        val_dataset = torch.load(val_file)
    else:
        val_dataset = WebKbbDataset(vocab, num_classes,
                                    os.path.join(data_dir, 'val_texts.blk'),
                                    os.path.join(data_dir, 'val_labels.blk'))
        torch.save(val_dataset, val_file)

    vocab_size = vocab.size()
    in_dim = 200
    mem_dim = 200
    hidden_dim = 200
    num_classes = 7
    sparsity = True
    freeze = args.freeze_emb
    epochs = args.epochs
    lr = args.lr
    pretrain = args.pretrain
    cuda_flag = torch.cuda.is_available()

    model = DomTreeLSTM(vocab_size, in_dim, mem_dim, hidden_dim, num_classes, sparsity, freeze)
    criterion = nn.CrossEntropyLoss()

    if pretrain:
        emb_file = os.path.join('../data', 'emb.pth')
        if os.path.isfile(emb_file):
            emb = torch.load(emb_file)
            print(emb.size())
            print("Embedding weights loaded")
        else:
            print("Embedding file not found")
        model.emb.weight.data.copy_(emb)

    optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
    trainer = Trainer(model, criterion, optimizer, train_dataset, val_dataset, cuda_flag=cuda_flag)
    for epoch in range(epochs):
        trainer.train()
        trainer.test()
def main():
    args = parse_args()

    # input files
    train_file = args.data_dir + '/train.json'
    test_file = args.data_dir + '/test.json'

    # output files
    vocab_tok_file = args.vocab_dir + '/vocab_tok.vocab'    # token
    vocab_post_file = args.vocab_dir + '/vocab_post.vocab'  # position
    vocab_pos_file = args.vocab_dir + '/vocab_pos.vocab'    # pos_tag
    vocab_dep_file = args.vocab_dir + '/vocab_dep.vocab'    # dep_rel
    vocab_pol_file = args.vocab_dir + '/vocab_pol.vocab'    # polarity

    # load files
    print("loading files...")
    train_tokens, train_pos, train_dep, train_max_len = load_tokens(train_file)
    test_tokens, test_pos, test_dep, test_max_len = load_tokens(test_file)

    # lower tokens
    if args.lower:
        train_tokens, test_tokens = [[t.lower() for t in tokens]
                                     for tokens in (train_tokens, test_tokens)]

    # counters
    token_counter = Counter(train_tokens + test_tokens)
    pos_counter = Counter(train_pos + test_pos)
    dep_counter = Counter(train_dep + test_dep)
    max_len = max(train_max_len, test_max_len)
    post_counter = Counter(list(range(-max_len, max_len)))
    pol_counter = Counter(['positive', 'negative', 'neutral'])

    # build vocab
    print("building vocab...")
    token_vocab = Vocab(token_counter, specials=['<pad>', '<unk>'])
    pos_vocab = Vocab(pos_counter, specials=['<pad>', '<unk>'])
    dep_vocab = Vocab(dep_counter, specials=['<pad>', '<unk>'])
    post_vocab = Vocab(post_counter, specials=['<pad>', '<unk>'])
    pol_vocab = Vocab(pol_counter, specials=[])
    print("token_vocab: {}, pos_vocab: {}, dep_vocab: {}, post_vocab: {}, pol_vocab: {}".format(
        len(token_vocab), len(pos_vocab), len(dep_vocab), len(post_vocab), len(pol_vocab)))

    print("dumping to files...")
    token_vocab.save_vocab(vocab_tok_file)
    pos_vocab.save_vocab(vocab_pos_file)
    dep_vocab.save_vocab(vocab_dep_file)
    post_vocab.save_vocab(vocab_post_file)
    pol_vocab.save_vocab(vocab_pol_file)
    print("all done.")
import os
from collections import namedtuple
import sys
import json
import dynet

config = sys.argv[1]
model = sys.argv[2]
testfile = sys.argv[3]
vocabfile = os.path.dirname(model) + "/vocab.txt"

d = json.load(open(config))
config = namedtuple("options", d.keys())(*d.values())

vocab = Vocab(vocabfile)

if "embeds" in config:
    tagger = SimpleBiltyTagger(
        config.in_dim,
        config.h_dim,
        config.c_in_dim,
        config.h_layers,
        embeds_file=config.embeds,
        word2id=vocab.word2id,
    )
else:
    tagger = SimpleBiltyTagger(config.in_dim, config.h_dim, config.c_in_dim, config.h_layers,
from batcher import Dataset
from char2vec import CharCNN as Char2Vec
from vocab import Vocab

parser = argparse.ArgumentParser()
parser.add_argument('expdir')
args = parser.parse_args()

config = tf.ConfigProto(inter_op_parallelism_threads=10,
                        intra_op_parallelism_threads=10)

dataset = Dataset(10, preshuffle=False)
dataset.ReadData('../data/tweetlid/training.tsv.gz', 'all', 'tweet')

input_vocab = Vocab.MakeFromData(dataset.GetSentences(), min_count=1)
char_vocab = Vocab.Load(os.path.join(args.expdir, 'char_vocab.pickle'))

max_word_len = max([len(x) for x in input_vocab.GetWords()]) + 2
print 'max word len {0}'.format(max_word_len)

with open(os.path.join(args.expdir, 'model_params.json'), 'r') as f:
    model_params = json.load(f)

c2v = Char2Vec(char_vocab, model_params, max_sequence_len=max_word_len)
the_words, word_lengths = c2v.MakeMat(input_vocab, pad_len=max_word_len)

saver = tf.train.Saver(tf.all_variables())
session = tf.Session(config=config)
saver.restore(session, os.path.join(args.expdir, 'model.bin'))
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = Dataset(args.max_p_num, args.max_p_len, args.max_q_len, args.max_w_len,
                       args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    for word in brc_data.word_iter('dev'):
        vocab.add(word)
    for word in brc_data.word_iter('test'):
        vocab.add(word)

    logger.info('Assigning embeddings...')
    vocab.load_pretrained_char_embeddings(args.char_embed)
    vocab.load_pretrained_word_embeddings(args.word_embed)
    vocab.randomly_init_embeddings(args.pos_embed_dim)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
def test():
    embed = None
    if args.embed_path is not None and os.path.exists(args.embed_path):
        print('Loading pretrained word embedding...')
        embed = {}
        with open(args.embed_path, 'r') as f:
            f.readline()
            for line in f.readlines():
                line = line.strip().split()
                vec = [float(_) for _ in line[1:]]
                embed[line[0]] = vec

    vocab = Vocab(args, embed)

    train_data, val_data, test_data = [], [], []
    fns = os.listdir(args.train_dir)
    fns.sort(key=lambda p: int(p.split('.')[0]))
    for fn in tqdm(fns):
        f = open(args.train_dir + fn, 'r')
        train_data.append(json.load(f))
        f.close()
        vocab.add_sentence(train_data[-1]['reviewText'].split())
        vocab.add_sentence(train_data[-1]['summary'].split())
        vocab.add_user(train_data[-1]['userID'])
        vocab.add_product(train_data[-1]['productID'])
    fns = os.listdir(args.valid_dir)
    fns.sort(key=lambda p: int(p.split('.')[0]))
    for fn in tqdm(fns):
        f = open(args.valid_dir + fn, 'r')
        val_data.append(json.load(f))
        f.close()
        vocab.add_sentence(val_data[-1]['reviewText'].split())
        vocab.add_sentence(val_data[-1]['summary'].split())
        vocab.add_user(val_data[-1]['userID'])
        vocab.add_product(val_data[-1]['productID'])
    fns = os.listdir(args.test_dir)
    fns.sort(key=lambda p: int(p.split('.')[0]))
    for fn in tqdm(fns):
        f = open(args.test_dir + fn, 'r')
        test_data.append(json.load(f))
        f.close()
        vocab.add_sentence(test_data[-1]['reviewText'].split())
        vocab.add_sentence(test_data[-1]['summary'].split())
        vocab.add_user(test_data[-1]['userID'])
        vocab.add_product(test_data[-1]['productID'])

    embed = vocab.trim()
    args.embed_num = len(embed)
    args.embed_dim = len(embed[0])

    test_dataset = Dataset(test_data)
    test_iter = DataLoader(dataset=test_dataset, batch_size=args.batch_size, shuffle=False)

    print('Loading model...')
    checkpoint = torch.load(args.save_path + args.load_model)
    net = EncoderDecoder(checkpoint['args'], embed)
    net.load_state_dict(checkpoint['model'])
    if args.use_cuda:
        net.cuda()
    criterion = nn.NLLLoss(ignore_index=vocab.PAD_IDX, reduction='sum')

    print('Begin testing...')
    loss, r1, r2, rl = evaluate(net, criterion, vocab, test_iter, False)
    print('Loss: %f Rouge-1: %f Rouge-2: %f Rouge-l: %f' % (loss, r1, r2, rl))
tf.set_random_seed(666)

baseline = False
batch_size = 25
dataset = Dataset(batch_size, preshuffle=mode == 'train')
und_symbol = 'und'
dataset.ReadData(args.data, mode, args.model)

# Make the input vocabulary (words that appear in data)
if baseline:
    # The baseline is to use fixed word embeddings.
    if mode == 'train':
        # The input vocab is fixed during training.
        input_vocab = Vocab.MakeFromData(dataset.GetSentences(), min_count=2)
        input_vocab.Save(os.path.join(args.expdir, 'input_vocab.pickle'))
    else:
        # During testing we need to load the saved input vocab.
        input_vocab = Vocab.Load(os.path.join(args.expdir, 'input_vocab.pickle'))
else:
    # The open vocabulary can be regenerated with each run.
    min_count = 1
    if mode == 'debug':
        min_count = 10  # When visualizing word embeddings hide rare words
    maxlens = {'word': 40, 'char': 150, 'tweet': 40}
    input_vocab = Vocab.MakeFromData(dataset.GetSentences(), min_count=min_count,
                                     max_length=maxlens[args.model])
def train(args: Dict):
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                input_feed=args['--input-feed'],
                label_smoothing=float(args['--label-smoothing']),
                vocab=vocab)
    model.train()

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    while True:
        epoch += 1

        for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
            train_iter += 1

            optimizer.zero_grad()

            batch_size = len(src_sents)

            example_losses = -model(src_sents, tgt_sents)  # (batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f '
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' %
                      (epoch, train_iter, report_loss / report_examples,
                       math.exp(report_loss / report_tgt_words), cum_examples,
                       report_tgt_words / (time.time() - train_time),
                       time.time() - begin_time), file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' %
                      (epoch, train_iter, cum_loss / cum_examples,
                       np.exp(cum_loss / cum_tgt_words), cum_examples), file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)  # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(), model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(args['--lr-decay'])
                        print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                        # load model
                        params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers', file=sys.stderr)
                        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

                if epoch == int(args['--max-epoch']):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    exit(0)
def main(unused_args):
    ''' Trains model from data '''
    if not FLAGS.input_data:
        raise ValueError("Must set --input_data to the filename of input dataset")
    if not FLAGS.train_dir:
        raise ValueError("Must set --train_dir to the directory where training files will be saved")
    if not os.path.exists(FLAGS.train_dir):
        os.mkdir(FLAGS.train_dir)

    with tf.Graph().as_default(), tf.Session() as session:
        ''' To make tf.train.Saver write parameters as part of the saved file,
        add params to the graph as variables (hackish? - MK) '''
        with tf.variable_scope("params", reuse=None):
            num_layers_var = tf.Variable(FLAGS.num_layers, trainable=False, name='num_layers')
            hidden_size_var = tf.Variable(FLAGS.hidden_size, trainable=False, name='hidden_size')

            ''' If pre-trained model loaded from file, use loaded vocabulary and NN geometry.
            Else, compute vocabulary and use command-line params for num_layers and hidden_size '''
            if FLAGS.load_model:
                vocab_size_var = tf.Variable(0, trainable=False, name='vocab_size')
                tf.train.Saver([num_layers_var, hidden_size_var, vocab_size_var]).restore(session, FLAGS.load_model)
                vocab_var = tf.Variable([0] * vocab_size_var.eval(), trainable=False, name='vocab')
                tf.train.Saver([vocab_var]).restore(session, FLAGS.load_model)
                FLAGS.num_layers = np.asscalar(num_layers_var.eval())  # need np.asscalar to upcast np.int32 to Python int
                FLAGS.hidden_size = np.asscalar(hidden_size_var.eval())
                vocab = Vocab.from_array(vocab_var.eval())
                train_data, valid_data, test_data, vocab = reader.read_datasets(FLAGS.input_data, FLAGS.train_fraction, FLAGS.valid_fraction, vocab=vocab)
            else:
                train_data, valid_data, test_data, vocab = reader.read_datasets(FLAGS.input_data, FLAGS.train_fraction, FLAGS.valid_fraction, vocab_size=FLAGS.vocab_size)
                vocab_size_var = tf.Variable(vocab.size, trainable=False, name='vocab_size')
                vocab_var = tf.Variable(vocab.to_array(), trainable=False, name='vocab')

        ''' build training graph '''
        initializer = tf.random_uniform_initializer(-FLAGS.init_scale, FLAGS.init_scale)
        with tf.variable_scope("model", initializer=initializer):
            m = graph.inference_graph(vocab.size, FLAGS.num_layers, FLAGS.hidden_size,
                                      FLAGS.batch_size, FLAGS.num_steps, FLAGS.dropout_rate)
            m.update(graph.cost_graph(m.logits, FLAGS.batch_size, FLAGS.num_steps, vocab.size))
            m.update(graph.training_graph(m.cost, FLAGS.grad_clip))

        # create saver before creating more graph nodes, so that we do not save any vars defined below
        saver = tf.train.Saver(max_to_keep=50)

        ''' build graph for validation and testing (shares parameters with the training graph!) '''
        with tf.variable_scope("model", reuse=True):
            mvalid = graph.inference_graph(vocab.size, FLAGS.num_layers, FLAGS.hidden_size,
                                           FLAGS.batch_size, FLAGS.num_steps)
            mvalid.update(graph.cost_graph(mvalid.logits, FLAGS.batch_size, FLAGS.num_steps, vocab.size))

        if FLAGS.load_model:
            saver.restore(session, FLAGS.load_model)
            print('Loaded model from', FLAGS.load_model)
        else:
            print('Created model')
        print('\tnum_layers:', FLAGS.num_layers)
        print('\thidden_size:', FLAGS.hidden_size)
        print('\tvocab_size:', vocab.size)
        print()
        print('Training parameters')
        print('\tbatch_size:', FLAGS.batch_size)
        print('\tnum_steps:', FLAGS.num_steps)
        print('\tlearning_rate:', FLAGS.learning_rate)
        print('\tbeta1:', FLAGS.beta1)
        print('\tbeta2:', FLAGS.beta2)
        print()
        print('Datasets')
        print('\ttraining dataset size:', len(train_data))
        print('\tvalidation dataset size:', len(valid_data))
        print('\ttest dataset size:', len(test_data))
        print()

        ''' create two summaries: training cost and validation cost '''
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, graph=session.graph)
        summary_train = summary_graph('Training cost', ema_decay=0.95)
        summary_valid = summary_graph('Validation cost')

        session.run([
            m.lr.initializer,
            m.beta1.initializer,
            m.beta2.initializer,
        ])
        tf.initialize_all_variables().run()
        session.run([
            tf.assign(m.lr, FLAGS.learning_rate),
            tf.assign(m.beta1, FLAGS.beta1),
            tf.assign(m.beta2, FLAGS.beta2),
        ])

        state = session.run(m.initial_state)
        iterations = len(train_data) // FLAGS.batch_size // FLAGS.num_steps * FLAGS.max_epochs
        for i, (x, y) in enumerate(reader.next_batch(train_data, FLAGS.batch_size, FLAGS.num_steps)):
            if i >= iterations:
                break
            start_time = time.time()
            cost, state, _ = session.run([m.cost, m.final_state, m.train_op], {
                m.input_data: x,
                m.targets: y,
                m.initial_state: state
            })
            epoch = float(i) / (len(train_data) // FLAGS.batch_size // FLAGS.num_steps)
            time_elapsed = time.time() - start_time
            print('%d/%d (epoch %.3f), train_loss = %6.8f, time/batch = %.4fs' %
                  (i + 1, iterations, epoch, cost, time_elapsed))
            session.run([summary_train.update], {summary_train.x: cost})

            if (i + 1) % FLAGS.eval_val_every == 0 or i == iterations - 1:
                # evaluate loss on validation data
                cost = run_test(session, mvalid, valid_data, FLAGS.batch_size, FLAGS.num_steps)
                print("validation cost = %6.8f" % cost)
                save_as = '%s/epoch%.2f_%.4f.model' % (FLAGS.train_dir, epoch, cost)
                saver.save(session, save_as)

                ''' write out summary events '''
                buffer, = session.run([summary_train.summary])
                summary_writer.add_summary(buffer, i)
                session.run([summary_valid.update], {summary_valid.x: cost})
                buffer, = session.run([summary_valid.summary])
                summary_writer.add_summary(buffer, i)
                summary_writer.flush()

        if len(test_data) > FLAGS.batch_size * FLAGS.num_steps:
            cost = run_test(session, mvalid, test_data, FLAGS.batch_size, FLAGS.num_steps)
            print("Test cost: %.3f" % cost)
def main():
    global args
    args = parse_args()
    # global logger
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s:%(message)s")
    # file logger
    fh = logging.FileHandler(os.path.join(args.save, args.expname) + '.log', mode='w')
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    # console logger
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    # argument validation
    args.cuda = args.cuda and torch.cuda.is_available()
    if args.sparse and args.wd != 0:
        logger.error('Sparsity and weight decay are incompatible, pick one!')
        exit()
    logger.debug(args)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # write unique words from all token files
    sick_vocab_file = os.path.join(args.data, 'sick.vocab')
    if not os.path.isfile(sick_vocab_file):
        token_files_b = [os.path.join(split, 'b.toks') for split in [train_dir, dev_dir, test_dir]]
        token_files_a = [os.path.join(split, 'a.toks') for split in [train_dir, dev_dir, test_dir]]
        token_files = token_files_a + token_files_b
        sick_vocab_file = os.path.join(args.data, 'sick.vocab')
        build_vocab(token_files, sick_vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=sick_vocab_file,
                  data=[Constants.PAD_WORD, Constants.UNK_WORD,
                        Constants.BOS_WORD, Constants.EOS_WORD])
    logger.debug('==> SICK vocabulary size : %d ' % vocab.size())

    # load SICK dataset splits
    train_file = os.path.join(args.data, 'sick_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SICKDataset(train_dir, vocab, args.num_classes)
        torch.save(train_dataset, train_file)
    logger.debug('==> Size of train data : %d ' % len(train_dataset))
    dev_file = os.path.join(args.data, 'sick_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes)
        torch.save(dev_dataset, dev_file)
    logger.debug('==> Size of dev data : %d ' % len(dev_dataset))
    test_file = os.path.join(args.data, 'sick_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SICKDataset(test_dir, vocab, args.num_classes)
        torch.save(test_dataset, test_file)
    logger.debug('==> Size of test data : %d ' % len(test_dataset))

    # initialize model, criterion/loss_function, optimizer
    model = SimilarityTreeLSTM(
        vocab.size(),
        args.input_dim,
        args.mem_dim,
        args.hidden_dim,
        args.num_classes,
        args.sparse,
        args.freeze_embed)
    criterion = nn.KLDivLoss()
    if args.cuda:
        model.cuda(), criterion.cuda()
    if args.optim == 'adam':
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                               lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()),
                                  lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
                              lr=args.lr, weight_decay=args.wd)
    metrics = Metrics(args.num_classes)

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'sick_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(os.path.join(args.glove, 'glove.840B.300d'))
        logger.debug('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.Tensor(vocab.size(), glove_emb.size(1)).normal_(-0.05, 0.05)
        # zero out the embeddings for padding and other special words if they are absent in vocab
        for idx, item in enumerate([Constants.PAD_WORD, Constants.UNK_WORD,
                                    Constants.BOS_WORD, Constants.EOS_WORD]):
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(word)]
        torch.save(emb, emb_file)
    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
    model.emb.weight.data.copy_(emb)

    # create trainer object for training and testing
    trainer = Trainer(args, model, criterion, optimizer)

    best = -float('inf')
    for epoch in range(args.epochs):
        train_loss = trainer.train(train_dataset)
        train_loss, train_pred = trainer.test(train_dataset)
        dev_loss, dev_pred = trainer.test(dev_dataset)
        test_loss, test_pred = trainer.test(test_dataset)

        train_pearson = metrics.pearson(train_pred, train_dataset.labels)
        train_mse = metrics.mse(train_pred, train_dataset.labels)
        logger.info('==> Epoch {}, Train \tLoss: {}\tPearson: {}\tMSE: {}'.format(
            epoch, train_loss, train_pearson, train_mse))
        dev_pearson = metrics.pearson(dev_pred, dev_dataset.labels)
        dev_mse = metrics.mse(dev_pred, dev_dataset.labels)
        logger.info('==> Epoch {}, Dev \tLoss: {}\tPearson: {}\tMSE: {}'.format(
            epoch, dev_loss, dev_pearson, dev_mse))
        test_pearson = metrics.pearson(test_pred, test_dataset.labels)
        test_mse = metrics.mse(test_pred, test_dataset.labels)
        logger.info('==> Epoch {}, Test \tLoss: {}\tPearson: {}\tMSE: {}'.format(
            epoch, test_loss, test_pearson, test_mse))

        if best < test_pearson:
            best = test_pearson
            checkpoint = {
                'model': trainer.model.state_dict(),
                'optim': trainer.optimizer,
                'pearson': test_pearson, 'mse': test_mse,
                'args': args, 'epoch': epoch
            }
            logger.debug('==> New optimum found, checkpointing everything now...')
            torch.save(checkpoint, '%s.pt' % os.path.join(args.save, args.expname))
def train(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line
    """
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab,
                no_char_decoder=args['--no-char-decoder'])
    model.train()

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    while True:
        epoch += 1

        for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
            train_iter += 1

            optimizer.zero_grad()

            batch_size = len(src_sents)

            example_losses = -model(src_sents, tgt_sents)  # (batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f '
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' %
                      (epoch, train_iter, report_loss / report_examples,
                       math.exp(report_loss / report_tgt_words), cum_examples,
                       report_tgt_words / (time.time() - train_time),
                       time.time() - begin_time), file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' %
                      (epoch, train_iter, cum_loss / cum_examples,
                       np.exp(cum_loss / cum_tgt_words), cum_examples), file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                # dev batch size can be a bit larger
                dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(), model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(args['--lr-decay'])
                        print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                        # load model
                        params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers', file=sys.stderr)
                        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

                if epoch == int(args['--max-epoch']):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    exit(0)
        return preds

    # (continuation of the Inference class)
    def decode(self, code: List[int]):
        code = [int(c.detach().cpu().numpy()) for c in code]
        result = self.tokenizer.decode(code, remove_special_token=False)
        return result

    def encode(self, doc, maxlen):
        code = self.tokenizer.encode(doc)
        if len(code) > maxlen:
            code = code[:maxlen]
            code[-1] = self.tokenizer.eos_id
        elif len(code) < maxlen:
            pad_size = maxlen - len(code)
            code = code + pad_size * [self.tokenizer.pad_token_id]
        assert len(code) == maxlen
        return code

if __name__ == "__main__":
    # from pprint import pprint
    inf = Inference(tokenizer=Vocab.from_pretrained('./model/vocab.txt'),
                    model_path='model/28.ckpt',
                    config_file='model/config.json',
                    device='cpu')
    # sample (Chinese) news article used as input for the summarization model
    print(inf.infer([
        '國際中心/綜合報導英國薩福克郡32歲的巴萊塔(Becky[UNK]Barletta)在新婚不久後,被診斷出罹患失智症,是英國最年輕的失智症患者之一,壽命可能只剩5年。巴萊塔的父親難過的表示,據《每日郵報》報導,巴萊塔目前住在自己的娘家,因為她已經無法自理生活,需要家人全天候的照顧。巴萊塔2015年10月結婚,但在2016年性情大變,當年8月確診罹患「上額顳葉失智症」(Frontotemporal[UNK]dementia)。巴萊塔罹病後,外在的行為表現、情緒、社交及語言能力都受影響。事實上,巴萊塔原是一名滑雪教練,學生們都非常喜歡她,怎料結婚後突然改變,讓家人不能接受。據了解,巴萊塔的叔叔及母親的表弟都死於失智症,因此家人非常擔心她的狀況。巴萊塔的妹妹蘇菲(Sophie)難過的表示,其實姊姊在結婚前,,「她以前是個很棒的老師,尤其對孩子特別好,大家都很喜歡她。」專家表示,若是巴萊塔的病情持續惡化,未來就連吃飯、說話都會有問題,甚至活不過10年。蘇菲補充,「姊姊會突然向街上的人說話,問他們能不能發出些好笑的聲音。大家都不明白為什麼她的外表看起來如此正常,卻會對人如此沒禮貌。」蘇菲目前正在向各方發起募款活動,希望能夠讓外界更'
    ]))
class Sent2Vec(object):
    def __init__(self, path="", vec_size=50, k=20, alpha=0.1, n_workers=1):
        '''
        :parameters:
            @path: string
                path to dataset, should be a single file
            @vec_size: int
                size of sentence vector and word vector
            @k: int
                number of negative samples for a window
            @alpha: float
                learning rate
        '''
        self.k = k
        self.vec_size = vec_size
        self.n_workers = n_workers
        self.alpha = alpha
        self.vocab = Vocab()
        self.sent = Sent()
        self.window_table = WindowTable(self.vocab, SIZE)
        self.dataset = Dataset(path)
        if path:
            self.create_vocab()
            self.create_sent()
            self.create_window_table()

    def create_vocab(self):
        for sent in self.dataset.sents:
            sent = sent.split()
            self.vocab.add_from_sent(sent)
        self.vocab.init_vecs()

    def create_sent(self):
        for sent in self.dataset.sents:
            self.sent.add(sent)
        self.sent.init_vecs()

    def create_window_table(self):
        ''' for negative sampling '''
        self.window_table(self.dataset.sents)

    def multi_thread_train(self):
        ''' use mini-batch to train '''
        jobs = Queue(maxsize=9 * self.n_workers)
        lock = threading.Lock()
        start, next_report = time.time(), [1.0]
        self.Js = []

        def worker_train():
            while True:
                # get sentence
                sent = jobs.get()
                if sent is None:
                    break
                Jn = self.train_sent(sent, lock)
                self.Js.append(Jn)

        workers = [threading.Thread(target=worker_train) for _ in xrange(self.n_workers)]
        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()
        # put dataset to Queue
        for sent in self.dataset.sents:
            jobs.put(sent)
        # put None to tell all threads to exit
        for _ in xrange(self.n_workers):
            jobs.put(None)
        for thread in workers:
            thread.join()
        print 'Js: ', np.mean(self.Js)
        elapsed = time.time() - start
        print 'used time', elapsed

    def train(self):
        ''' use mini-batch to train '''
        Js = []
        for no, sent in enumerate(self.dataset.sents):
            # calculate Jn for this sentence
            Jn = self.train_sent(sent)
            Js.append(Jn)
        mean_Js = np.mean(np.array(Js))
        print 'total J', mean_Js
        return mean_Js

    def train_sent(self, sent, lock=None):
        # the loss
        Jn = 0
        # get windows from the sent
        windows = gen_windows_from_sentence(sent, SIZE)
        # get sentence vector
        v = self.sent[sent]
        for wn, window in enumerate(windows):
            window_key = "-".join([str(self.vocab.vocab[hash(w)]) for w in window])
            h = self.vocab.get_window_vec(word_index=window_key)
            # noises
            noises = self.window_table.get_samples(self.k)
            # for a positive sample
            e_vT_h = np.e ** np.dot(v.T, h)
            update_v = h / (1 + e_vT_h)
            update_h = v / (1 + e_vT_h)
            # add positive window's loss
            Jn += math.log(1 / (1 + 1 / e_vT_h))
            self.update_window(window_key, update_h, lock)
            # for each negative window sample
            for idx, key in noises:
                n_h = self.vocab.get_window_vec(word_index=key)
                e_vT_h = np.e ** np.dot(v, n_h)
                frac_e_v_h = 1 - 1 / (1 + e_vT_h)
                # accumulate the gradient
                update_v += - n_h * frac_e_v_h
                update_n_h = - v * frac_e_v_h
                self.update_window(key, update_n_h, lock)
                # add noise's loss
                Jn += math.log(1 / (1 + e_vT_h))
            update_v /= (1 + self.k)
            # update sentence vector for each window
            # TODO change to a single turn?
            self.update_sent_vec(v, update_v, lock)
        # add loss to total Jn
        return Jn

    def update_sent_vec(self, sent_vec, grad, lock=None):
        if lock:
            with lock:
                sent_vec += self.alpha * grad
                sent_vec /= LA.norm(sent_vec)
        else:
            sent_vec += self.alpha * grad
            sent_vec /= LA.norm(sent_vec)

    def update_window(self, key, grad, lock=None):
        '''
        update each word's vector in a window and norm the vectors
        :parameters:
            @key: string like '19-32-2'
            @grad: numpy.array
                the gradient
        '''
        word_ids = [int(id) for id in key.split('-')]
        for id in word_ids:
            word_vec = self.vocab.vecs[id]
            if lock:
                with lock:
                    word_vec += self.alpha * grad
                    word_vec /= LA.norm(word_vec)
            else:
                word_vec += self.alpha * grad
                word_vec /= LA.norm(word_vec)

    def tofile(self, path):
        ''' save model to file '''
        mod2file(self, path)

    @staticmethod
    def fromfile(path):
        return mod_from_file(path)
def train():
    embed = None
    if args.embed_path is not None and os.path.exists(args.embed_path):
        print('Loading pretrained word embedding...')
        embed = {}
        with open(args.embed_path, 'r') as f:
            f.readline()
            for line in f.readlines():
                line = line.strip().split()
                vec = [float(_) for _ in line[1:]]
                embed[line[0]] = vec

    vocab = Vocab(args, embed)

    print('Loading datasets...')
    train_data, val_data, test_data = [], [], []
    fns = os.listdir(args.train_dir)
    fns.sort(key=lambda p: int(p.split('.')[0]))
    for fn in tqdm(fns):
        f = open(args.train_dir + fn, 'r')
        train_data.append(json.load(f))
        f.close()
        vocab.add_sentence(train_data[-1]['reviewText'].split())
        vocab.add_sentence(train_data[-1]['summary'].split())
        vocab.add_user(train_data[-1]['userID'])
        vocab.add_product(train_data[-1]['productID'])
    fns = os.listdir(args.valid_dir)
    fns.sort(key=lambda p: int(p.split('.')[0]))
    for fn in tqdm(fns):
        f = open(args.valid_dir + fn, 'r')
        val_data.append(json.load(f))
        f.close()
        vocab.add_sentence(val_data[-1]['reviewText'].split())
        vocab.add_sentence(val_data[-1]['summary'].split())
        vocab.add_user(val_data[-1]['userID'])
        vocab.add_product(val_data[-1]['productID'])
    fns = os.listdir(args.test_dir)
    fns.sort(key=lambda p: int(p.split('.')[0]))
    for fn in tqdm(fns):
        f = open(args.test_dir + fn, 'r')
        test_data.append(json.load(f))
        f.close()
        vocab.add_sentence(test_data[-1]['reviewText'].split())
        vocab.add_sentence(test_data[-1]['summary'].split())
        vocab.add_user(test_data[-1]['userID'])
        vocab.add_product(test_data[-1]['productID'])

    print('Deleting rare words...')
    embed = vocab.trim()
    args.embed_num = len(embed)
    args.embed_dim = len(embed[0])

    train_dataset = Dataset(train_data)
    val_dataset = Dataset(val_data)
    train_iter = DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=True)
    val_iter = DataLoader(dataset=val_dataset, batch_size=args.batch_size, shuffle=False)

    net = EncoderDecoder(args, embed)
    if args.use_cuda:
        net.cuda()
    criterion = nn.NLLLoss(ignore_index=vocab.PAD_IDX, reduction='sum')
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)

    print('Begin training...')
    for epoch in range(1, args.epochs + 1):
        if epoch >= args.lr_decay_start:
            adjust_learning_rate(optim, epoch - args.lr_decay_start + 1)
        for i, batch in enumerate(train_iter):
            src, trg, src_embed, trg_embed, src_mask, src_lens, trg_lens, _1, _2 = vocab.read_batch(batch)
            output = net(src, trg, src_embed, trg_embed, vocab.word_num, src_mask, src_lens, trg_lens)
            output = torch.log(output.view(-1, output.size(-1)) + 1e-20)
            trg_output = trg.view(-1)
            loss = criterion(output, trg_output) / len(src_lens)
            loss.backward()
            clip_grad_norm_(net.parameters(), args.max_norm)
            optim.step()
            optim.zero_grad()

            cnt = (epoch - 1) * len(train_iter) + i
            if cnt % args.print_every == 0:
                print('EPOCH [%d/%d]: BATCH_ID=[%d/%d] loss=%f' %
                      (epoch, args.epochs, i, len(train_iter), loss.data))
            if cnt % args.valid_every == 0 and cnt / args.valid_every >= 0:
                print('Begin valid... Epoch %d, Batch %d' % (epoch, i))
                cur_loss, r1, r2, rl = evaluate(net, criterion, vocab, val_iter, True)
                save_path = args.save_path + 'valid_%d_%.4f_%.4f_%.4f_%.4f' % (
                    cnt / args.valid_every, cur_loss, r1, r2, rl)
                net.save(save_path)
                print('Epoch: %2d Cur_Val_Loss: %f Rouge-1: %f Rouge-2: %f Rouge-l: %f' %
                      (epoch, cur_loss, r1, r2, rl))
    return
from vocab import Vocab
from lmdataset import LMDataset
from lm import NeuralLM
from deepy.dataset import SequentialMiniBatches
from deepy.trainers import SGDTrainer, LearningRateAnnealer
from deepy.layers import RNN, Dense

logging.basicConfig(level=logging.INFO)

resource_dir = os.path.abspath(os.path.dirname(__file__)) + os.sep + "resources"
vocab_path = os.path.join(resource_dir, "ptb.train.txt")
train_path = os.path.join(resource_dir, "ptb.train.txt")
valid_path = os.path.join(resource_dir, "ptb.valid.txt")

vocab = Vocab(char_based=True)
vocab.load(vocab_path, max_size=1000)

model = NeuralLM(input_dim=vocab.size, input_tensor=3)
model.stack(
    RNN(hidden_size=100, output_type="sequence"),
    RNN(hidden_size=100, output_type="sequence"),
    Dense(vocab.size, "softmax"),
)

if __name__ == "__main__":
    ap = ArgumentParser()
    ap.add_argument("--model", default=os.path.join(os.path.dirname(__file__), "models", "char_rnn_model1.gz"))
    ap.add_argument("--sample", default="")
    args = ap.parse_args()
    return strip_eos(sents)

def calc_ppl(sents, m):
    batches, _ = get_batches(sents, vocab, args.batch_size, device)
    total_nll = 0
    with torch.no_grad():
        for inputs, targets in batches:
            total_nll += model.nll_is(inputs, targets, m).sum().item()
    n_words = sum(len(s) + 1 for s in sents)  # include <eos>
    return total_nll / len(sents), np.exp(total_nll / n_words)

if __name__ == '__main__':
    args = parser.parse_args()
    vocab = Vocab(os.path.join(args.checkpoint, 'vocab.txt'))
    set_seed(args.seed)
    cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if cuda else "cpu")
    model = get_model(os.path.join(args.checkpoint, 'model.pt'))

    if args.evaluate:
        sents = load_sent(args.data)
        batches, _ = get_batches(sents, vocab, args.batch_size, device)
        meters = evaluate(model, batches)
        print(' '.join(['{} {:.2f},'.format(k, meter.avg) for k, meter in meters.items()]))

    if args.ppl:
        sents = load_sent(args.data)
parser.add_argument('--hidden-dim', default=50, type=int,
                    help='hidden node dimensionality')
parser.add_argument('--l2-penalty', default=0.0001, type=float,
                    help='l2 penalty for params')
parser.add_argument('--gru-initial-bias', default=2, type=int,
                    help='initial gru bias for r & z. higher => more like SimpleRnn')
opts = parser.parse_args()
print >>sys.stderr, opts

NUM_LABELS = 3

def log(s):
    print >>sys.stderr, util.dts(), s

# slurp training data, including converting of tokens -> ids
vocab = Vocab()
train_x, train_y, train_stats = util.load_data(opts.train_set, vocab,
                                               update_vocab=True,
                                               max_egs=int(opts.num_from_train))
log("train_stats %s %s" % (len(train_x), train_stats))
dev_x, dev_y, dev_stats = util.load_data(opts.dev_set, vocab,
                                         update_vocab=False,
                                         max_egs=int(opts.num_from_dev))
log("dev_stats %s %s" % (len(dev_x), dev_stats))

# input/output example vars
s1_idxs = T.ivector('s1')   # sequence for sentence one
s2_idxs = T.ivector('s2')   # sequence for sentence two
actual_y = T.ivector('y')   # single for sentence pair label; 0, 1 or 2

# keep track of different "layers" that handle their own gradients.
def run(test_dir, test_srcs, test_src_caps, checkpoint, vocab_src, vocab_tgt,
        out="captions.out.txt", batch_size=16, max_seq_len=MAX_LEN,
        hidden_dim=HIDDEN_DIM, emb_dim=EMB_DIM, enc_seq_len=ENC_SEQ_LEN,
        enc_dim=ENC_DIM, attn_activation="relu", deep_out=False,
        decoder=2, attention=3):
    if decoder == 1:
        decoder = mmt.AttentionDecoder_1
    elif decoder == 2:
        decoder = mmt.AttentionDecoder_2
    elif decoder == 3:
        decoder = mmt.AttentionDecoder_3
    elif decoder == 4:
        decoder = mmt.AttentionDecoder_4

    if attention == 1:
        attention = attentions.AdditiveAttention
    elif attention == 2:
        attention = attentions.GeneralAttention
    elif attention == 3:
        attention = attentions.ScaledGeneralAttention

    # load vocabulary
    vocabulary_src = Vocab()
    vocabulary_src.load(vocab_src)
    vocabulary_tgt = Vocab()
    vocabulary_tgt.load(vocab_tgt)

    # load test instances file paths
    srcs = open(test_srcs).read().strip().split('\n')
    srcs = [os.path.join(test_dir, s) for s in srcs]
    src_caps = open(test_src_caps, encoding='utf-8').read().strip().split('\n')

    # load model
    net = MMTNetwork(
        src_emb_dim=emb_dim, tgt_emb_dim=emb_dim,
        enc_dim=hidden_dim, dec_dim=hidden_dim,
        src_dim=vocabulary_src.n_words, out_dim=vocabulary_tgt.n_words,
        img_attn_dim=512, src_cap_attn_dim=512,
        sos_token=0, eos_token=1, pad_token=2,
        max_seq_len=max_seq_len, deep_out=deep_out,
        attention=attention, decoder=decoder)
    net.to(DEVICE)
    net.load_state_dict(torch.load(checkpoint))
    net.eval()

    with torch.no_grad():
        # run inference
        num_instances = len(srcs)
        i = 0
        captions = []
        while i < num_instances:
            srcs_batch = srcs[i:i + batch_size]
            batch = _load_batch(srcs_batch)
            batch = batch.to(DEVICE)

            caps_in = src_caps[i:i + batch_size]
            caps_in = [vocabulary_src.sentence_to_tensor(y, max_seq_len) for y in caps_in]
            caps_in = torch.stack(caps_in, dim=0)
            caps_in = caps_in.permute(1, 0, 2)
            caps_in = caps_in.to(DEVICE)

            tokens, _ = net(source_captions=caps_in, image_features=batch,
                            targets=None, max_len=max_seq_len)
            tokens = tokens.permute(1, 0, 2).detach()
            _, topi = tokens.topk(1, dim=2)
            topi = topi.squeeze(2)

            # decode token output from the model
            for j in range(len(srcs_batch)):
                c = vocabulary_tgt.tensor_to_sentence(topi[j])
                c = ' '.join(c)
                captions.append(c)

            i += len(srcs_batch)

    out_f = open(out, mode='w', encoding='utf-8')
    for c in captions:
        out_f.write(c + '\n')
    return
def preprocess_data(self):
    print('Preprocessing data')
    raw_data = json.loads(open(self.raw_data_path).read().lower())
    db_data = self.db_json
    sw_ent, mw_ent = self._value_key_map(db_data)
    vocab = Vocab(cfg.vocab_size, self.otlg.special_tokens)

    # delexicalization
    dialogs = {}
    for dial_id, dial in enumerate(raw_data):
        dialogs[dial_id] = {}
        dialogs[dial_id]['goal'] = dial['goal']
        turns = []
        for turn in dial['dial']:
            turn_num = turn['turn']
            constraint = dict((slot, []) for slot in self.informable_slots)
            constraint_flat, user_request, sys_request = [], [], []
            for slot_values in turn['usr']['slu']:
                if slot_values['act'] == 'inform':
                    slot, value = slot_values['slots'][0][0], slot_values['slots'][0][1]
                    slot = 'restaurant-' + slot
                    if slot != 'restaurant-slot' and value not in ['dontcare', 'none']:
                        constraint[slot].extend(self.word_tokenize(value))
                        constraint_flat.extend(self.word_tokenize(value))
                    if value == 'dontcare':
                        constraint[slot].extend(['dontcare'])
                        constraint_flat.extend(['dontcare'])
                elif slot_values['act'] == 'request':
                    user_request.append('[value_%s]' % slot_values['slots'][0][1])
                    # constraint[slot].extend(['do', "n't", 'care'])
            if turn['sys']['da']:
                for s in turn['sys']['da']:
                    s = ['price', 'range'] if s == 'pricerange' else [s]
                    if s == [["area, centre"]]:
                        s = ['area']
                    sys_request.extend(s)
            user = self.word_tokenize(turn['usr']['transcript'])
            resp = ' '.join(self.word_tokenize(turn['sys']['sent']))
            resp = self._replace_entity(resp, sw_ent, mw_ent, constraint_flat)
            resp = resp.replace('[value_phone].', '[value_phone] .').replace('ok.', 'ok .')
            resp = resp.split()
            turns.append({
                'turn': turn_num,
                'user': ' '.join(user),
                'response': ' '.join(resp),
                'constraint': json.dumps(constraint),
                'user_request': ' '.join(user_request),
                'sys_request': ' '.join(sys_request),
                'db_match': len(self.db_json_search(constraint)),
            })
            for word in user + resp:
                vocab.add_word(word)
        dialogs[dial_id]['log'] = turns

    # save preprocessed data
    with open(self.data_path, 'w') as f:
        json.dump(dialogs, f, indent=2)

    # construct vocabulary
    vocab.construct()
    vocab.save_vocab(self.dataset_path + 'vocab')
    return dialogs
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    for dir_path in [args.vocab_dir, args.model_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.train_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    # unfiltered_vocab_size = vocab.size()
    print("vocab size is ", vocab.size())
    vocab.filter_tokens_by_cnt(min_cnt=2)
    print("after filtered vocab size is ", vocab.size())
    # filtered_num = unfiltered_vocab_size - vocab.size()

    vocab.randomly_init_embeddings(args.embed_size)
    if args.use_pre_train:
        vocab.load_pretrained_embeddings(args.pre_train_file)

    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
def main(): global args args = parse_args() # global logger logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) formatter = logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s:%(message)s") # file logger fh = logging.FileHandler(os.path.join(args.save, args.expname)+'.log', mode='w') fh.setLevel(logging.INFO) fh.setFormatter(formatter) logger.addHandler(fh) # console logger ch = logging.StreamHandler() ch.setLevel(logging.DEBUG) ch.setFormatter(formatter) logger.addHandler(ch) # argument validation args.cuda = args.cuda and torch.cuda.is_available() if args.sparse and args.wd!=0: logger.error('Sparsity and weight decay are incompatible, pick one!') exit() logger.debug(args) #torch.manual_seed(args.seed) #random.seed(args.seed) if args.cuda: #torch.cuda.manual_seed(args.seed) torch.backends.cudnn.benchmark = True if not os.path.exists(args.save): os.makedirs(args.save) train_dir = os.path.join(args.data,'train/') dev_dir = os.path.join(args.data,'dev/') test_dir = os.path.join(args.data,'test/') # write unique words from all token files sick_vocab_file = os.path.join(args.data,'sick.vocab') if not os.path.isfile(sick_vocab_file): token_files_a = [os.path.join(split,'a.toks') for split in [train_dir,dev_dir,test_dir]] token_files_b = [os.path.join(split,'b.toks') for split in [train_dir,dev_dir,test_dir]] token_files = token_files_a+token_files_b sick_vocab_file = os.path.join(args.data,'sick.vocab') build_vocab(token_files, sick_vocab_file) # get vocab object from vocab file previously written vocab = Vocab(filename=sick_vocab_file, data=[Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD]) logger.debug('==> SICK vocabulary size : %d ' % vocab.size()) # load SICK dataset splits train_file = os.path.join(args.data,'sick_train.pth') if os.path.isfile(train_file): train_dataset = torch.load(train_file) else: train_dataset = SICKDataset(train_dir, vocab, args.num_classes) torch.save(train_dataset, train_file) logger.debug('==> Size of train data : %d ' % len(train_dataset)) dev_file = os.path.join(args.data,'sick_dev.pth') if os.path.isfile(dev_file): dev_dataset = torch.load(dev_file) else: dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes) torch.save(dev_dataset, dev_file) logger.debug('==> Size of dev data : %d ' % len(dev_dataset)) test_file = os.path.join(args.data,'sick_test.pth') if os.path.isfile(test_file): test_dataset = torch.load(test_file) else: test_dataset = SICKDataset(test_dir, vocab, args.num_classes) torch.save(test_dataset, test_file) logger.debug('==> Size of test data : %d ' % len(test_dataset)) # initialize model, criterion/loss_function, optimizer model = SimilarityTreeLSTM( vocab.size(), args.input_dim, args.mem_dim, args.hidden_dim, args.num_classes, args.sparse) criterion = nn.KLDivLoss() parameters = filter(lambda p: p.requires_grad, model.parameters()) if args.cuda: model.cuda(), criterion.cuda() if args.optim=='adam': optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=args.wd) elif args.optim=='adagrad': optimizer = optim.Adagrad(parameters, lr=args.lr, weight_decay=args.wd) elif args.optim=='sgd': optimizer = optim.SGD(parameters, lr=args.lr, weight_decay=args.wd) elif args.optim == 'adadelta': optimizer = optim.Adadelta(parameters, lr=args.lr, weight_decay=args.wd) metrics = Metrics(args.num_classes) # for words common to dataset vocab and GLOVE, use GLOVE vectors # for other words in dataset vocab, use random normal vectors emb_file = os.path.join(args.data, 'sick_embed.pth') if 
os.path.isfile(emb_file): emb = torch.load(emb_file) else: # load glove embeddings and vocab glove_vocab, glove_emb = load_word_vectors(os.path.join(args.glove, 'glove.840B.300d')) logger.debug('==> GLOVE vocabulary size: %d ' % glove_vocab.size()) emb = torch.Tensor(vocab.size(), glove_emb.size(1)).normal_(-0.05, 0.05) # zero out the embeddings for padding and other special words if they are absent in vocab for idx, item in enumerate([Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD]): emb[idx].zero_() for word in vocab.labelToIdx.keys(): if glove_vocab.getIndex(word): emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(word)] torch.save(emb, emb_file) # plug these into embedding matrix inside model if args.cuda: emb = emb.cuda() model.emb.weight.data.copy_(emb) # create trainer object for training and testing trainer = Trainer(args, model, criterion, optimizer) best = -float('inf') for epoch in range(args.epochs): train_loss = trainer.train(train_dataset) train_loss, train_pred,act_idxs_train = trainer.test(train_dataset) dev_loss, dev_pred,act_idxs_dev = trainer.test(dev_dataset) test_loss, test_pred,act_idxs_test = trainer.test(test_dataset) train_pred = torch.index_select(train_pred,0,act_idxs_train) train_dataset_labels = torch.index_select(train_dataset.labels,0,act_idxs_train) train_pearson = metrics.pearson(train_pred,train_dataset_labels) train_spearmann = metrics.spearmann(train_pred, train_dataset_labels) train_mse = metrics.mse(train_pred,train_dataset_labels) logger.info('==> Epoch {}, Train \tLoss: {}\tPearson: {}\tSpearman: {}\tMSE: {}'.format(epoch, train_loss, train_pearson, train_spearmann, train_mse)) dev_pred = torch.index_select(dev_pred, 0, act_idxs_dev) dev_dataset_labels = torch.index_select(dev_dataset.labels, 0, act_idxs_dev) dev_pearson = metrics.pearson(dev_pred,dev_dataset_labels) dev_spearmann = metrics.spearmann(dev_pred, dev_dataset_labels) dev_mse = metrics.mse(dev_pred,dev_dataset_labels) logger.info('==> Epoch {}, Dev \tLoss: {}\tPearson: {}\tSpearman: {}\tMSE: {}'.format(epoch, dev_loss, dev_pearson, dev_spearmann, dev_mse)) test_pred = torch.index_select(test_pred, 0, act_idxs_test) test_dataset_labels = torch.index_select(test_dataset.labels, 0, act_idxs_test) test_pearson = metrics.pearson(test_pred,test_dataset_labels) test_spearmann = metrics.spearmann(test_pred, test_dataset_labels) test_mse = metrics.mse(test_pred,test_dataset_labels) logger.info('==> Epoch {}, Test \tLoss: {}\tPearson: {}\tSpearman: {}\tMSE: {}'.format(epoch, test_loss, test_pearson, test_spearmann,test_mse)) if best < test_pearson: best = test_pearson checkpoint = {'model': trainer.model.state_dict(), 'optim': trainer.optimizer, 'pearson': test_pearson, 'mse': test_mse, 'args': args, 'epoch': epoch } logger.debug('==> New optimum found, checkpointing everything now...') torch.save(checkpoint, '%s.pt' % os.path.join(args.save, args.expname)) np.savetxt("test_pred.csv", test_pred.numpy(), delimiter=",")
def __init__(self, model, *args, **kwargs): """""" if args: if len(args) > 1: raise TypeError('Parser takes at most one argument') kwargs['name'] = kwargs.pop('name', model.__name__) super(Network, self).__init__(*args, **kwargs) if not os.path.isdir(self.save_dir): os.mkdir(self.save_dir) with open(os.path.join(self.save_dir, 'config.cfg'), 'w') as f: self._config.write(f) # objectives = ['pos_loss', 'trigger_loss', 'actual_parse_loss', 'srl_loss', 'multitask_loss_sum'] # self._global_steps = {o: tf.Variable(0., trainable=False) for o in objectives} self._global_step = tf.Variable(0., trainable=False) self._global_epoch = tf.Variable(0., trainable=False) # todo what is this?? # self._model = model(self._config, global_step=self.global_step) self._model = model(self._config) self._vocabs = [] if self.conll: vocab_files = [(self.word_file, 1, 'Words'), (self.tag_file, [3, 4], 'Tags'), (self.rel_file, 7, 'Rels')] elif self.conll2012: vocab_files = [(self.word_file, 3, 'Words'), (self.tag_file, [5, 4], 'Tags'), # auto, gold (self.rel_file, 7, 'Rels'), (self.srl_file, range(14, 50), 'SRLs'), (self.trig_file, [10, 4] if self.joint_pos_predicates else 10, 'Trigs'), (self.domain_file, 0, 'Domains')] print("Loading vocabs") sys.stdout.flush() for i, (vocab_file, index, name) in enumerate(vocab_files): vocab = Vocab(vocab_file, index, self._config, name=name, cased=self.cased if not i else True, use_pretrained=(not i)) self._vocabs.append(vocab) print("Predicates vocab: ") for l, i in sorted(self._vocabs[4].iteritems(), key=operator.itemgetter(1)): print("%s: %d" % (l, i)) print("predicate_true_start_idx", self._vocabs[4].predicate_true_start_idx) print("Loading data") sys.stdout.flush() self._trainset = Dataset(self.train_file, self._vocabs, model, self._config, name='Trainset') self._validset = Dataset(self.valid_file, self._vocabs, model, self._config, name='Validset') self._testset = Dataset(self.test_file, self._vocabs, model, self._config, name='Testset') self._ops = self._gen_ops() self._save_vars = filter(lambda x: u'Pretrained' not in x.name, tf.global_variables()) self.history = { 'train_loss': [], 'train_accuracy': [], 'valid_loss': [], 'valid_accuracy': [], 'test_accuracy': 0 } return
def prepro(args): logger = logging.getLogger("QANet") logger.info("====== preprocessing ======") logger.info('Checking the data files...') for data_path in args.train_files + args.dev_files + args.test_files: assert os.path.exists(data_path), '{} file does not exist.'.format(data_path) logger.info('Preparing the directories...') for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]: if not os.path.exists(dir_path): os.makedirs(dir_path) logger.info('Building vocabulary...') dataloader = DataLoader(args.max_p_num, args.max_p_len, args.max_q_len, args.max_ch_len, args.train_files, args.dev_files, args.test_files) vocab = Vocab(lower=True) for word in dataloader.word_iter('train'): vocab.add_word(word) [vocab.add_char(ch) for ch in word] unfiltered_vocab_size = vocab.word_size() vocab.filter_words_by_cnt(min_cnt=2) filtered_num = unfiltered_vocab_size - vocab.word_size() logger.info('After filter {} tokens, the final vocab size is {}, char size is {}'.format(filtered_num, vocab.word_size(), vocab.char_size())) unfiltered_vocab_char_size = vocab.char_size() vocab.filter_chars_by_cnt(min_cnt=2) filtered_char_num = unfiltered_vocab_char_size - vocab.char_size() logger.info('After filter {} chars, the final char vocab size is {}'.format(filtered_char_num, vocab.char_size())) logger.info('Assigning embeddings...') if args.pretrained_word_path is not None: vocab.load_pretrained_word_embeddings(args.pretrained_word_path) else: vocab.randomly_init_word_embeddings(args.word_embed_size) if args.pretrained_char_path is not None: vocab.load_pretrained_char_embeddings(args.pretrained_char_path) else: vocab.randomly_init_char_embeddings(args.char_embed_size) logger.info('Saving vocab...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout: pickle.dump(vocab, fout) logger.info('====== Done with preparing! ======')
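# A rough sketch of how a QANet-style loader could turn one token into a word id plus a
# fixed-length character id window once the vocab above is built; get_word_id / get_char_id
# and the padding scheme are assumptions here, not this project's actual API.
def token_to_ids(vocab, token, max_ch_len, pad_id=0):
    word_id = vocab.get_word_id(token)                                # assumed word lookup
    char_ids = [vocab.get_char_id(ch) for ch in token[:max_ch_len]]   # assumed char lookup
    char_ids += [pad_id] * (max_ch_len - len(char_ids))               # pad the char window to max_ch_len
    return word_id, char_ids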
class Captcha: ''' size: width, height in pixels font: font family (string), size (in points) and font color (in "#rrggbb" format) bgcolor: in "#rrggbb" format ''' def __init__(self, size, font, bgcolor, length = 4): # todo: add param check and transform here path = r'D:\AI\font\arial\Arial.ttf' self.width, self.height = size self.font_family, self.font_size, self.font_color = font self.bgcolor = bgcolor self.len = length self.vocab = Vocab() self.font = ImageFont.truetype(path, self.font_size) def get_text(self): return self.vocab.rand_string(self.len) # by default, draw center-aligned text def draw_text(self, text): dr = ImageDraw.Draw(self.im) font_width, font_height = self.font.getsize(text) # oddly, dividing by 3 rather than 2 centres the text better here dr.text(((self.width - font_width) / 3, (self.height - font_height) / 3), text, fill = self.font_color, font = self.font) def draw_background(self): pass def transform(self): params = [1 - float(random.randint(1, 2)) / 100, 0, 0, 0, 1 - float(random.randint(1, 10)) / 100, float(random.randint(1, 2)) / 500, 0.001, float(random.randint(1, 2)) / 500 ] self.im = self.im.transform((self.width, self.height), Image.PERSPECTIVE, params) def filter(self): # Image.filter returns a new image, so keep the result self.im = self.im.filter(ImageFilter.EDGE_ENHANCE_MORE) # by default, add no noise def add_noise(self): pass # get the captcha (image plus its text) def get_captcha(self): self.im = Image.new("RGB", (self.width, self.height), (self.bgcolor)) self.draw_background() text = self.get_text() self.draw_text(text) self.add_noise() self.transform() self.filter() #self.im.save("D:\pic\pic2.jpg") return self.im, text # custom helper: load an image, convert the 4-channel PNG to 3 channels, return the image and its text def get_myImage(self): # the image files live on drive D # ims = Image.open("D:\AI\pythonPack\pic\反馈意见.png")  (screenshot text: stay logged in for ten days / QR-code login) ims = Image.open(r"D:\AI\pythonPack\pic4\邮箱中心.png") bg = Image.new("RGB", ims.size, (255, 255, 255)) # print('bg', bg) # bg.paste(ims, ims) return bg, '邮箱中心'
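# Hedged usage sketch for the Captcha class above; the geometry, font tuple and colours are
# placeholder values, and the output file name is made up for illustration.
if __name__ == '__main__':
    captcha = Captcha((160, 60), ('arial', 28, '#202020'), '#ffffff', length=4)
    image, text = captcha.get_captcha()
    image.save('captcha_%s.jpg' % text)
    print('generated captcha text:', text)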
sentence_str = '_sentence' if args.split_sentences else '' tokenized_data_fn = '{}{}{}.json'.format(args.tokenized_fp, debug_str, sentence_str) with open(tokenized_data_fn, 'r') as fd: tokenized_data = json.load(fd) token_counts_fn = '{}{}{}.json'.format(args.token_counts_fp, debug_str, sentence_str) with open(token_counts_fn, 'r') as fd: token_counts = json.load(fd) N = float(token_counts['__ALL__']) print('Subsampling {} tokens'.format(N)) # Store subsampled data tokenized_subsampled_data = [] # And vocabulary with word counts vocab = Vocab() num_docs = len(tokenized_data) sections = set() categories = set() for doc_idx in tqdm(range(num_docs)): tokenized_doc_str = tokenized_data[doc_idx] subsampled_doc = [] prev_token = 'dummy' doc_tokens = tokenized_doc_str.split() for tidx, token in enumerate(doc_tokens): if prev_token == token: continue wc = token_counts[token] is_section_header = 'header=' in token is_doc_header = 'document=' in token if is_section_header:
def preprocess_data(self): """ Somerrthing to note: We define requestable and informable slots as below in further experiments (including other baselines): :param raw_data: :param add_to_vocab: :param data_type: :return: """ vocab = Vocab(cfg.vocab_size, self.otlg.special_tokens) for data_type in ['train', 'dev', 'test']: print('Preprocessing %s data'%data_type) raw_data = json.loads(open(self.raw_data_path[data_type], 'r').read().lower()) precessed_dialogs = {} state_dump = {} for dial_id, raw_dial in enumerate(raw_data): precessed_dialog = [] prev_utter = '' single_turn = {} constraint_flat = [] constraint_dict = {} intent = raw_dial['scenario']['task']['intent'] if cfg.domain != 'all' and cfg.domain != intent: if intent not in ['navigate','weather','schedule']: raise ValueError('what is %s intent bro?' % intent) else: continue for turn_num,dial_turn in enumerate(raw_dial['dialogue']): state_dump[(dial_id, turn_num)] = {} if dial_turn['turn'] == 'driver': u = self._lemmatize(self._tokenize(dial_turn['data']['utterance'])) u = re.sub(r'(\d+) ([ap]m)', lambda x: x.group(1) + x.group(2), u) single_turn['user'] = u prev_utter += u elif dial_turn['turn'] == 'assistant': s = dial_turn['data']['utterance'] # find entities and replace them s = re.sub(r'(\d+) ([ap]m)', lambda x: x.group(1) + x.group(2), s) s = self._replace_entity(s, prev_utter, intent) single_turn['response'] = s # get constraints for s,v in dial_turn['data']['slots'].items(): constraint_dict[intent + '-' + s] = v constraint_dict = self._clean_constraint_dict(constraint_dict, intent) constraint_flat = list(constraint_dict.values()) single_turn['constraint'] = json.dumps(constraint_dict) single_turn['turn_num'] = len(precessed_dialog) single_turn['dial_id'] = dial_id if 'user' in single_turn: state_dump[(dial_id, len(precessed_dialog))]['constraint'] = constraint_dict precessed_dialog.append(single_turn) single_turn = {} for single_turn in precessed_dialog: for word_token in constraint_flat + \ single_turn['user'].split() + single_turn['response'].split(): vocab.add_word(word_token) precessed_dialogs[dial_id] = precessed_dialog with open(self.data_path[data_type],'w') as f: json.dump(precessed_dialogs,f,indent=2) # construct vocabulary vocab.construct() vocab.save_vocab(self.dataset_path + 'vocab') return
# furthermore these are only valid if tied embeddings (at least for now that's all # implemented) if opts.vocab_file and not opts.tied_embeddings: raise Exception("must set --tied-embeddings if using pre initialised embeddings") # sanity check other opts assert opts.keep_prob >= 0.0 and opts.keep_prob <= 1.0 NUM_LABELS = 3 def log(s): print >>sys.stderr, util.dts(), s # slurp training data, including converting of tokens -> ids # if opts.vocab_file set read from that file, otherwise populate lookups as used vocab = Vocab(opts.vocab_file) train_x, train_y, train_stats = util.load_data(opts.train_set, vocab, update_vocab=True, max_egs=int(opts.num_from_train), parse_mode=opts.parse_mode) log("train_stats %s %s" % (len(train_x), train_stats)) dev_x, dev_y, dev_stats = util.load_data(opts.dev_set, vocab, update_vocab=False, max_egs=int(opts.num_from_dev), parse_mode=opts.parse_mode) log("dev_stats %s %s" % (len(dev_x), dev_stats)) # input/output example vars s1_idxs = T.ivector('s1') # sequence for sentence one s2_idxs = T.ivector('s2') # sequence for sentence two actual_y = T.ivector('y') # single for sentence pair label; 0, 1 or 2
type=str, default='./saved_models', help='Root dir for saving models.') parser.add_argument('--seed', type=int, default=0) args = parser.parse_args() # set random seed torch.manual_seed(args.seed) np.random.seed(args.seed) random.seed(args.seed) torch.cuda.manual_seed(args.seed) helper.print_arguments(args) # load vocab print("Loading vocab...") token_vocab = Vocab.load_vocab(args.vocab_dir + '/vocab_tok.vocab') # token post_vocab = Vocab.load_vocab(args.vocab_dir + '/vocab_post.vocab') # position pos_vocab = Vocab.load_vocab(args.vocab_dir + '/vocab_pos.vocab') # POS dep_vocab = Vocab.load_vocab(args.vocab_dir + '/vocab_dep.vocab') # deprel pol_vocab = Vocab.load_vocab(args.vocab_dir + '/vocab_pol.vocab') # polarity vocab = (token_vocab, post_vocab, pos_vocab, dep_vocab, pol_vocab) print( "token_vocab: {}, post_vocab: {}, pos_vocab: {}, dep_vocab: {}, pol_vocab: {}" .format(len(token_vocab), len(post_vocab), len(pos_vocab), len(dep_vocab), len(pol_vocab))) args.tok_size = len(token_vocab) args.post_size = len(post_vocab) args.pos_size = len(pos_vocab) # load pretrained word emb print("Loading pretrained word emb...")
def main(): global args args = parse_args() # local directory if not os.path.exists(args.save): os.makedirs(args.save) # global logger logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) formatter = logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s:%(message)s") # file logger fh = logging.FileHandler(os.path.join(args.save, args.expname)+'.log', mode='w') fh.setLevel(logging.INFO) fh.setFormatter(formatter) logger.addHandler(fh) # console logger ch = logging.StreamHandler() ch.setLevel(logging.DEBUG) ch.setFormatter(formatter) logger.addHandler(ch) # argument validation # single-GPU run; switch to multi-GPU if this is too slow args.cuda = args.cuda and torch.cuda.is_available() device = parallel.get_device(args.device) if args.cuda else torch.device("cpu") args.shard_size = len(args.device) if args.cuda else 1 # control randomness logger.debug(args) torch.manual_seed(args.seed) random.seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) torch.backends.cudnn.benchmark = True train_dir = os.path.join(args.data, 'train/') dev_dir = os.path.join(args.data, 'dev/') test_dir = os.path.join(args.data, 'test/') # prepare vocabularies vocab_file = os.path.join(args.data, 'vocab.txt') assert os.path.isfile(vocab_file) src_vocab_file = os.path.join(args.data, 'src_vocab.txt') if args.use_src: assert os.path.isfile(src_vocab_file) # get vocab object from vocab file previously written vocab = Vocab(vocab_file) logger.debug('==> vocabulary size : %d ' % vocab.size()) if args.use_src: src_vocab = Vocab(src_vocab_file) logger.debug('==> source vocabulary size : %d ' % src_vocab.size()) # initialize model, criterion/loss_function, optimizer model = TreeLSTMAutoEncoder( vocab.size(), # vocabulary size, word embeddings args.input_dim, # word embedding size args.mem_dim, # hidden size in tree bits_number=args.bit_number, # the number of hashing bits filter_size=args.filter_size, # the internal state size for unbottleneck max_num_children=args.max_num_children, # maximum allowed children number noise_dev=args.noise_dev, # the deviation of injected noise startup_steps=args.startup_size, # warmup steps discrete_mix=args.discrete_mix, # mix ratio for discretization use_bottleneck=args.use_bottleneck, # whether to use the discretization model src_vocab_size=src_vocab.size() if args.use_src else None, # source vocabulary size atn_num_layer=args.num_layer, # transformer layer number atn_num_heads=args.num_head, # attention heads atn_dp=args.atn_dp, # dropout for transformer ) logger.info(model) model.to(device) if args.optim == 'adam': optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) elif args.optim == 'adagrad': optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) elif args.optim == 'sgd': optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd) else: raise Exception("Unrecognized/Unsupported model optimizer {}".format(args.optim)) start_epoch = 0 global_step = 0 best = -float('inf') # backup from saved checkpoints saved_checkpoints = '%s.pt' % os.path.join(args.save, args.expname) tmp_saved_checkpoints = '%s.tmp.pt' % os.path.join(args.save, args.expname) if os.path.isfile(saved_checkpoints): saved_states = torch.load(saved_checkpoints, map_location=device) logger.info("checkpoint detected and loaded") model.load_state_dict(saved_states['model']) optimizer.load_state_dict(saved_states['optim']) start_epoch = 
saved_states['epoch'] + 1 global_step = saved_states['global_step'] best = saved_states['best'] elif os.path.isfile(tmp_saved_checkpoints): saved_states = torch.load(tmp_saved_checkpoints, map_location=device) logger.info("temporary checkpoint detected and loaded") model.load_state_dict(saved_states['model']) optimizer.load_state_dict(saved_states['optim']) start_epoch = saved_states['epoch'] + 1 global_step = saved_states['global_step'] zglobal.global_update("global_step", global_step) # create trainer object for training and testing trainer = Trainer(args, model, optimizer, device, logger, epoch=start_epoch) if args.mode == "train": # load dataset splits train_dataset = TreeDataset( train_dir, vocab, src_path=train_dir if args.use_src else None, src_vocab=src_vocab if args.use_src else None, tree_depth_limit=args.max_depth, tree_size_limit=args.max_tree_size, src_len_limit=args.max_src_len, ) logger.debug('==> Loading train data') dev_dataset = TreeDataset( dev_dir, vocab, src_path=dev_dir if args.use_src else None, src_vocab=src_vocab if args.use_src else None, tree_depth_limit=args.max_depth, tree_size_limit=args.max_tree_size, src_len_limit=args.max_src_len, ) logger.debug('==> Loading dev data') logger.debug('Start training from epoch {}'.format(start_epoch)) for epoch in range(start_epoch, args.epochs): train_loss = trainer.train(train_dataset) dev_loss, dev_acc, dev_pred, dev_repr = trainer.test(dev_dataset) global_step = zglobal.global_get('global_step') logger.info('==> Epoch {}, Step {}, Train \tLoss: {}'.format(epoch, global_step, train_loss)) logger.info('==> Epoch {}, Step {}, Dev \tLoss: {}, ACC: {}'.format(epoch, global_step, dev_loss, dev_acc)) # select best model according to accuracy, rather than development loss dev_score = dev_acc # - dev_loss if best < dev_score: best = dev_score checkpoint = { 'model': trainer.model.state_dict(), 'optim': trainer.optimizer.state_dict(), 'args': args, 'epoch': epoch, 'global_step': zglobal.global_get('global_step'), 'best': best, } logger.debug('==> New optimum found, checkpointing everything now...') torch.save(checkpoint, '%s.pt' % os.path.join(args.save, args.expname)) torch.save(checkpoint, '%s.%spt' % (os.path.join(args.save, args.expname), epoch)) torch.save(dev_pred, '%s.dev.pred.%spt' % (os.path.join(args.save, args.expname), epoch)) torch.save(dev_repr, '%s.dev.repr.%spt' % (os.path.join(args.save, args.expname), epoch)) elif args.mode == "eval": test_dataset = TreeDataset( test_dir, vocab, src_path=test_dir if args.use_src else None, src_vocab=src_vocab if args.use_src else None, tree_depth_limit=args.max_depth, tree_size_limit=args.max_tree_size, src_len_limit=args.max_src_len, ) logger.debug('==> Loading test data') # evaluating the test set test_loss, test_acc, test_pred, test_repr = trainer.test(test_dataset) torch.save(test_pred, '%s.test.pred.th' % os.path.join(args.save, args.expname)) torch.save(test_repr, '%s.test.repr.th' % os.path.join(args.save, args.expname)) else: raise Exception("Invalid training mode {}".format(args.mode)) logger.debug('Ending')
def prepare(args): """ checks data, creates the directories, prepare the vocabulary and embeddings """ logger = logging.getLogger(args.algo) logger.info('Preparing the directories...') for dir_path in [ args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir ]: if not os.path.exists(dir_path): os.makedirs(dir_path) logger.info('Building vocabulary...') data = Dataset(args.train_files, args.dev_files, args.test_files, args.max_p_len, args.max_q_len) vocab = Vocab() for word in data.word_iter('train'): vocab.add(word) unfiltered_vocab_size = vocab.size() vocab.filtered_tokens(min_cnt=2) filtered_num = unfiltered_vocab_size - vocab.size() logger.info('After filter {} tokens, the final vocab size is {}'.format( filtered_num, vocab.size())) logger.info('Assigning embeddings...') if args.use_embe: vocab.load_pretrained_embeddings( embedding_path= '/home/home1/dmyan/codes/tensorflow/data/word2vec/300_ver_not_pure.bin' ) else: vocab.random_init_embeddings(args.embed_size) logger.info('Saving vocab...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout: pickle.dump(vocab, fout) logger.info('Done with preparing!')
phrase_str = '_phrase' if args.combine_phrases else '' tokenized_data_fn = '{}{}{}.json'.format(args.tokenized_fp, debug_str, phrase_str) with open(tokenized_data_fn, 'r') as fd: tokenized_data = json.load(fd) token_counts_fn = '{}{}{}.json'.format(args.token_counts_fp, debug_str, phrase_str) with open(token_counts_fn, 'r') as fd: token_counts = json.load(fd) N = float(token_counts['__ALL__']) print('Subsampling {} tokens'.format(N)) # Store subsampled data tokenized_subsampled_data = [] # And vocabulary with word counts vocab = Vocab() num_docs = len(tokenized_data) sections = set() for doc_idx in tqdm(range(num_docs)): category, tokenized_doc_str = tokenized_data[doc_idx] subsampled_doc = [] for token in tokenized_doc_str.split(): wc = token_counts[token] is_section_header = re.match(r'header=[A-Z]+', token) is_phrase = '_' in token if is_section_header: subsampled_doc.append(token) sections.add(token) else: threshold = args.min_phrase_count if is_phrase else args.min_token_count
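# The loop above decides, token by token, whether to keep an occurrence; the snippet is
# truncated before the decision itself, so this is only a sketch of a word2vec-style
# subsampling rule it appears to lead up to (the exact thresholding used is an assumption).
import math
import random

def keep_token(count, total, t=1e-5):
    # frequent tokens are kept with probability ~ sqrt(t / f), where f = count / total
    freq = count / float(total)
    keep_prob = min(1.0, math.sqrt(t / freq)) if freq > 0 else 1.0
    return random.random() < keep_prob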
if char in input_vocab_counter: input_vocab_counter[char] += 1 else: input_vocab_counter[char] = 1 for target_text in data_target: target_text = target_text.strip() target_texts.append(target_text) for char in target_text: if char in target_vocab_counter: target_vocab_counter[char] += 1 else: target_vocab_counter[char] = 1 input_vocab = Vocab(list(input_vocab_counter.keys())) target_vocab = Vocab(list(target_vocab_counter.keys())) # Generate train, eval, and test batches seed = 1 zipped_texts = list(zip(input_texts, target_texts)) random.Random(seed).shuffle(zipped_texts) # train - 90%, eval - 7%, test - 3% train_texts = zipped_texts[0:int(len(zipped_texts)*0.9)] eval_texts = zipped_texts[int(len(zipped_texts)*0.9):int(len(zipped_texts)*0.97)] test_texts = zipped_texts[int(len(zipped_texts)*0.97):] # prepare batches
def prepare(args): """ checks data, creates the directories, prepare the vocabulary and embeddings """ logger = logging.getLogger("brc") logger.info('Checking the data files...') for data_path in args.train_files + args.dev_files + args.test_files: assert os.path.exists(data_path), '{} file does not exist.'.format( data_path) logger.info('Preparing the directories...') logger.info('Building vocabulary...') brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.train_files, args.dev_files, args.test_files) vocab = Vocab(lower=True) for word in brc_data.word_iter('train'): #构建词典只包含训练集 vocab.add(word) unfiltered_vocab_size = vocab.size() vocab.filter_tokens_by_cnt(min_cnt=2) filtered_num = unfiltered_vocab_size - vocab.size() logger.info('After filter {} tokens, the final vocab size is {}'.format( filtered_num, vocab.size())) logger.info('Assigning embeddings...') # vocab.randomly_init_embeddings(args.embed_size)#TODO-load_pretrained_embeddings vocab.load_pretrained_embeddings(args.embedding_path) #glove pre-trained logger.info('Saving vocab...') # with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout: with open(args.vocab_path, 'wb') as fout: #不区分search&zhidao pickle.dump(vocab, fout) logger.info('Done with preparing!')
config = dict() # User-set parameters config['maxastnodes'] = 100 config['asthops'] = 2 if modeltype is None: modeltype = modelfile.split('_')[0].split('/')[-1] os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = gpu data_dir = '../data_set/' + args.lang # load vocab vocab = Vocab(data_dir=data_dir) vocab.load_vocab() # load data test_code_data, test_ast_data, test_edges, test_nl = load_data( data_dir, 'test') test_ids = list(range(len(test_code_data))) # code vocab size config['tdatvocabsize'] = len(vocab.code2index) # comment vocab size config['comvocabsize'] = len(vocab.nl2index) # ast vocab size config['smlvocabsize'] = len(vocab.ast2index) # set sequence lengths
traindocuments = parserNcbiTxtFile_simple(opt.train_file) devdocuments = parserNcbiTxtFile_simple(opt.dev_file) testdocuments = parserNcbiTxtFile_simple(opt.test_file) entityAbbres = loadAbbreviations(opt.abbre_file) preprocessMentions(traindocuments, devdocuments, testdocuments, entityAbbres) mesh_dict = load_dict(opt.dict_file) meshlabels, meshlabel_to_ix, dict_words = utils.parser_dict(mesh_dict) corpus_words = utils.parser_corpus(traindocuments, devdocuments, testdocuments) word_to_ix, all_words, char_to_ix = utils.generate_word_alphabet(corpus_words, dict_words) if opt.random_emb: opt.emb_filename = '' vocab = Vocab(word_to_ix, opt.emb_filename, opt.word_emb_size) dict_instances = norm_dataset.getDictInstance(mesh_dict, vocab, meshlabel_to_ix, char_to_ix) train_instances = norm_dataset.getNormInstance(traindocuments, vocab, meshlabel_to_ix, char_to_ix) dev_instances = norm_dataset.getNormInstance(devdocuments, vocab, meshlabel_to_ix, char_to_ix) test_instances = norm_dataset.getNormInstance(testdocuments, vocab, meshlabel_to_ix, char_to_ix) logging.info('dict_instances_len {}'.format(len(dict_instances))) logging.info('train_instance_len {}'.format(len(train_instances))) my_collate = utils.sorted_collate dict_loader = DataLoader(dict_instances, opt.batch_size, shuffle=True, collate_fn=my_collate) train_loader = DataLoader(train_instances, opt.batch_size, shuffle=True, collate_fn=my_collate) dev_loader = DataLoader(dev_instances, opt.batch_size, shuffle=False, collate_fn=my_collate) test_loader = DataLoader(test_instances, opt.batch_size, shuffle=False, collate_fn=my_collate)
### # Globals ### app = flask.Flask(__name__) CONFIG = config.configuration() app.secret_key = CONFIG.SECRET_KEY # Should allow using session variables # # One shared 'Vocab' object, read-only after initialization, # shared by all threads and instances. Otherwise we would have to # store it in the browser and transmit it on each request/response cycle, # or else read it from the file on each request/response cycle, # neither of which would be suitable for responding keystroke by keystroke. WORDS = Vocab(CONFIG.VOCAB) ### # Pages ### @app.route("/") @app.route("/index") def index(): """The main page of the application""" flask.g.vocab = WORDS.as_list() flask.session["target_count"] = min( len(flask.g.vocab), CONFIG.SUCCESS_AT_COUNT) flask.session["jumble"] = jumbled( flask.g.vocab, flask.session["target_count"])
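# The comment above motivates keeping WORDS in memory: the server can answer keystroke-level
# checks without re-reading the vocab file. A hypothetical AJAX endpoint sketch; the route
# name and query parameter are assumptions, not part of this app.
@app.route("/_check_word")
def check_word():
    attempt = flask.request.args.get("attempt", "", type=str).strip().lower()
    # report whether the attempted word is in the shared vocabulary
    return flask.jsonify(in_vocab=WORDS.has(attempt))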
def main(): wandb.init(project="nlp_course", config=args) # data preprocessing # train_path = './data/train.txt' # dev_path = './data/dev.txt' # dp = DataProcessor(train_path, dev_path) # dp.tokenize() # additional data processing train_file = Path('./tmp/train.txt') vocab_file = Path('./tmp/vocab.txt') if not vocab_file.exists(): train_corpus = (line.strip() for line in train_file.open()) vocab = Vocab.from_text(train_corpus, max_types=MAX_TYPES, min_freq=MIN_FREQ) vocab.save(vocab_file) else: vocab = Vocab.load(vocab_file) log.info(f'Vocab has {len(vocab)} types') train_data = TextDataset(vocab=vocab, path=train_file) dev_data = TextDataset(vocab=vocab, path=Path('./tmp/dev.txt')) # model = FNN_LM(vocab_size=len(vocab), n_class=len(vocab)) # losses = train(model, n_epochs=5, batch_size=BATCH_SIZE, train_data=train_data, # valid_data=dev_data) if args.model == "RNN": model = RNN_LM(vocab_size=len(vocab), n_class=len(vocab), emb_dim=args.emb_dim, hid=args.hid, dropout_rate=args.dropout_ratio, num_layers=args.num_layers) losses = train(model, n_epochs=args.n_epochs, batch_size=BATCH_SIZE, train_data=train_data, valid_data=dev_data) # state_dict must be saved to a file rather than the run directory; the file name is a placeholder torch.save(model.state_dict(), Path(wandb.run.dir) / 'model.pt') elif args.model == "LSTM": model = LSTM_LM(vocab_size=len(vocab), n_class=len(vocab), emb_dim=args.emb_dim, hid=args.hid, dropout_rate=args.dropout_ratio, num_layers=args.num_layers) losses = train(model, n_epochs=args.n_epochs, batch_size=BATCH_SIZE, train_data=train_data, valid_data=dev_data) torch.save(model.state_dict(), Path(wandb.run.dir) / 'model.pt') elif args.model == "BiLSTM-ATT": model = BiLSTM_ATT_LM(vocab_size=len(vocab), n_class=len(vocab), emb_dim=args.emb_dim, hid=args.hid, dropout_rate=args.dropout_ratio, num_layers=args.num_layers) losses = train(model, n_epochs=args.n_epochs, batch_size=BATCH_SIZE, train_data=train_data, valid_data=dev_data) torch.save(model.state_dict(), Path(wandb.run.dir) / 'model.pt')
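# The three branches above differ only in the model class; a compact alternative with the same
# constructor keyword arguments (a sketch, assuming the same class names and args fields).
MODEL_CLASSES = {"RNN": RNN_LM, "LSTM": LSTM_LM, "BiLSTM-ATT": BiLSTM_ATT_LM}

def build_model(name, vocab):
    # look the class up once instead of repeating the keyword arguments per branch
    model_cls = MODEL_CLASSES[name]
    return model_cls(vocab_size=len(vocab), n_class=len(vocab), emb_dim=args.emb_dim,
                     hid=args.hid, dropout_rate=args.dropout_ratio, num_layers=args.num_layers)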
opts = parser.parse_args() print >>sys.stderr, opts seq_len = int(opts.hack_max_len) hidden_dim = int(opts.hidden_dim) embedding_dim = int(opts.embedding_dim) batch_size = int(opts.batch_size) # check that if one of --vocab-file or --initial-embeddings is set, they are both set. assert not ((opts.vocab_file is None) ^ (opts.initial_embeddings is None)), "must set both --vocab-file & --initial-embeddings" def log(s): print >>sys.stderr, util.dts(), s # build vocab and load data log("loading data") vocab = Vocab(opts.vocab_file) train_x, train_y, train_stats = util.load_data(opts.train_set, vocab, update_vocab=True, max_records=opts.num_from_train, max_len=seq_len, batch_size=batch_size) log("train_stats %s %s" % (len(train_x), train_stats)) dev_x, dev_y, dev_stats = util.load_data(opts.dev_set, vocab, update_vocab=False, max_records=opts.num_from_dev, max_len=seq_len, batch_size=batch_size) log("dev_stats %s %s" % (len(dev_x), dev_stats)) log("|VOCAB| %s" % vocab.size()) log("building model")
def test_single_vocab(): vocab = Vocab([ "moe" ]) assert vocab.as_list() == [ "moe" ] assert vocab.has("moe") assert not vocab.has("meeny")
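# A further check in the same style; assumes the list constructor and has() behave as in the
# tests above, and deliberately avoids relying on the order in which as_list() returns words.
def test_multi_word_vocab():
    vocab = Vocab(["eeny", "meeny", "miny"])
    assert vocab.has("meeny")
    assert not vocab.has("moe")
    assert len(vocab.as_list()) == 3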