def test(): """ Background test method. """ config = Config() device = torch.device("cpu") # if config.use_cuda: # device = torch.cuda.set_device(config.gpu[0]) print('loading corpus') vocab_mask = load_vocab(config.vocab) label_dic = load_vocab(config.label_file) model = TransXL(tag_vocab=dict([val, key] for key, val in label_dic.items()), bert_config=config.bert_path) model = load_train_model(model) # model.crf.use_cuda = False model.to(device) model.eval() while True: line = input("input sentence, please:") mems = None feature = get_text_line_feature(line, vocab_mask, max_length=512) input_id = torch.LongTensor(feature.input_id).unsqueeze(0) ids, mems = model.predict(input_id, mems)["pred"] ids = ids.squeeze(0).numpy().tolist() pre_tags = id2tag(label_dic, ids) if config.label_mode == "BIOSE": result = decode_tags_io(line, pre_tags[1:-1]) else: result = decode_tags_bio(line, pre_tags[1:-1]) print(result)
def __init__(self, txt_path, in_vocab_path, out_vocab_path):
    """Read txt file, input vocab and output vocab (punc vocab)."""
    self.txt_seqs = open(txt_path, encoding='utf8').readlines()
    self.num_seqs = len(self.txt_seqs)
    self.word2id = utils.load_vocab(in_vocab_path,
                                    extra_word_list=["<UNK>", "<END>"])
    self.punc2id = utils.load_vocab(out_vocab_path,
                                    extra_word_list=[" "])
def test(): """Test Model in test file""" # load config config = Config() print('settings:\n', config) # load corpus print('loading corpus') vocab = load_vocab(config.vocab) label_dic = load_vocab(config.tri_cls_label_file) # load train and dev and test dataset test_data = read_corpus_tri_cls(config.tri_cls_test_file, max_length=config.max_length, vocab=vocab) test_ids = torch.LongTensor([temp[0] for temp in test_data]) test_masks = torch.LongTensor([temp[1] for temp in test_data]) test_types = torch.LongTensor([temp[2] for temp in test_data]) test_tags = torch.LongTensor([temp[3] for temp in test_data]) test_dataset = TensorDataset(test_ids, test_masks, test_types, test_tags) test_loader = DataLoader(test_dataset, shuffle=False, batch_size=config.batch_size) # init model model = BertQA(config.bert_path, 2) model = load_model(model, name=config.load_tri_cls_path) if config.use_cuda and torch.cuda.is_available(): model.cuda() # test model evaluate(model, test_loader, 0, config)
def __init__(self, txt_path, in_vocab_path, out_vocab_path):
    """Read txt file, input vocab and output vocab (punc vocab)."""
    self.txt_seqs = open(txt_path, encoding='utf8', errors='ignore').readlines()
    self.word2id = utils.load_vocab(in_vocab_path,
                                    extra_word_list=["<UNK>", "<END>"])
    self.punc2id = utils.load_vocab(out_vocab_path,
                                    extra_word_list=[" "])
    self.class2punc = {k: v for (v, k) in self.punc2id.items()}
def train(**kwargs):
    config = Config()
    config.update(**kwargs)
    print('current config:\n', config)
    if config.use_cuda:
        torch.cuda.set_device(config.gpu)
    print('loading corpus')
    vocab = load_vocab(config.vocab)
    label_dic = load_vocab(config.label_file, user_define=USER_DEFINE)
    tagset_size = len(label_dic)
    train_data = read_corpus(config.train_file, max_length=config.max_length,
                             label_dic=label_dic, vocab=vocab,
                             user_define=USER_DEFINE)
    dev_data = read_corpus(config.dev_file, max_length=config.max_length,
                           label_dic=label_dic, vocab=vocab,
                           user_define=USER_DEFINE)
    train_ids = torch.LongTensor([temp.input_id for temp in train_data])
    train_masks = torch.LongTensor([temp.input_mask for temp in train_data])
    train_tags = torch.LongTensor([temp.label_id for temp in train_data])
    train_dataset = TensorDataset(train_ids, train_masks, train_tags)
    train_loader = DataLoader(train_dataset, shuffle=True,
                              batch_size=config.batch_size)
    dev_ids = torch.LongTensor([temp.input_id for temp in dev_data])
    dev_masks = torch.LongTensor([temp.input_mask for temp in dev_data])
    dev_tags = torch.LongTensor([temp.label_id for temp in dev_data])
    dev_dataset = TensorDataset(dev_ids, dev_masks, dev_tags)
    dev_loader = DataLoader(dev_dataset, shuffle=True,
                            batch_size=config.batch_size)
    model = BERT_LSTM_CRF(config.bert_path, tagset_size, config.bert_embedding,
                          config.rnn_hidden, config.rnn_layer,
                          dropout_ratio=config.dropout_ratio,
                          dropout1=config.dropout1,
                          use_cuda=config.use_cuda)
    if config.load_model:
        assert config.load_path is not None
        model = load_model(model, name=config.load_path)
    if config.use_cuda:
        model.to(DEVICE)
    model.train()
    optimizer = getattr(optim, config.optim)
    optimizer = optimizer(model.parameters(), lr=config.lr,
                          weight_decay=config.weight_decay)
    eval_loss = 10000
    for epoch in range(config.base_epoch):
        step = 0
        for i, batch in enumerate(tqdm(train_loader)):
            step += 1
            model.zero_grad()
            inputs, masks, tags = batch
            inputs, masks, tags = Variable(inputs), Variable(masks), Variable(tags)
            if config.use_cuda:
                inputs, masks, tags = inputs.to(DEVICE), masks.to(DEVICE), tags.to(DEVICE)
            feats = model(inputs, masks)
            loss = model.loss(feats, masks, tags)
            loss.backward()
            optimizer.step()
            if step % 50 == 0:
                print('step: {} | epoch: {} | loss: {}'.format(step, epoch, loss.item()))
        loss_temp = dev(model, dev_loader, epoch, config)
        if loss_temp < eval_loss:
            eval_loss = loss_temp
            save_model(model, epoch)
def load_data(self):
    vocab_event = load_vocab(self.dir_data + 'vocab_event.txt', hasPad=False)
    self.vocab_event = dict({'O': 0})
    for key in vocab_event:
        if key[2:] not in self.vocab_event and key != 'O':
            self.vocab_event.update({key[2:]: len(self.vocab_event)})
    # 34 classes, including the event types plus the None type
    self.vocab_ner = load_vocab(self.dir_data + 'vocab_ner_tail.txt')
    self.num_class_events = len(self.vocab_event)
    self.num_class_entities = len(self.vocab_ner)
def __init__(self, txt_path, in_vocab_path, out_vocab_path, sort=True):
    """Read txt file, input vocab and output vocab (punc vocab)."""
    self.txt_seqs = open(txt_path, encoding='utf8', errors='ignore').readlines()
    self.word2id = utils.load_vocab(in_vocab_path,
                                    extra_word_list=["<UNK>", "<END>"])
    self.punc2id = utils.load_vocab(out_vocab_path,
                                    extra_word_list=[" "])
    if sort:
        # Also need to sort in collate_fn because the sentence lengths
        # will change after self.preprocess()
        self.txt_seqs.sort(key=lambda x: len(x.split()), reverse=True)
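# The three dataset constructors above all call utils.load_vocab(path,
# extra_word_list=...) to build a word -> id map. That helper is not shown in
# this section; the sketch below is one plausible reconstruction, assuming the
# vocab file lists one token per line. It is an illustration, not the
# repository's actual implementation.
def load_vocab(vocab_path, extra_word_list=None):
    """Map each token (one per line) to its line index, then append extras."""
    with open(vocab_path, encoding='utf8') as f:
        words = [line.strip() for line in f if line.strip()]
    # Extra symbols such as "<UNK>" or "<END>" get the next free ids.
    words.extend(extra_word_list or [])
    return {word: idx for idx, word in enumerate(words)}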
def inspect_dataset(args):
    """Count the number of samples that have tokens that are not in the
    common vocab subset."""
    model_vocab = utils.load_vocab(args.model_vocab_file)
    print('Model vocab size:', len(model_vocab))
    common_vocab = utils.load_vocab(args.common_vocab_file)
    print('Common vocab size:', len(common_vocab))
    for file in os.listdir(args.data_dir):
        filename = os.fsdecode(file)
        rel_name = filename.replace('.jsonl', '')
        if filename.endswith('.jsonl'):
            # TODO: Following line is broken..
            facts = utils.load_TREx_data(args, os.path.join(args.data_dir, filename))
            num_common = 0   # samples in the common vocab subset, which is a subset of the model vocab
            num_model = 0    # samples in the model vocab but not in the common vocab
            num_neither = 0  # samples in neither the model nor the common vocab
            for fact in tqdm(facts):
                sub, obj = fact
                # First check if the object is in the common vocab
                if obj in common_vocab:
                    num_common += 1
                elif obj in model_vocab:
                    # Not in the common vocab, but in the model vocab
                    num_model += 1
                else:
                    # In neither the common nor the model vocab
                    num_neither += 1
            assert len(facts) == num_common + num_model + num_neither
            print('{} -> num facts: {}, num common: {}, num model: {}, num neither: {}'
                  .format(rel_name, len(facts), num_common, num_model, num_neither))
            # Plot the distribution of gold objects
            obj_set = Counter([obj for sub, obj in facts])
            top_obj_set = obj_set.most_common(10)
            print(top_obj_set)
            print()
            gold_objs = pd.DataFrame(top_obj_set, columns=['obj', 'freq'])
            fig, ax = plt.subplots()
            gold_objs.sort_values(by='freq').plot.barh(x='obj', y='freq', ax=ax)
            plt.savefig(os.path.join(args.out_dir, rel_name + '.png'),
                        bbox_inches='tight')
            plt.close()
def create_duo_word_clouds(infold, outfold, sub, city1, city2, stopwords):
    vocab1 = utils.load_vocab('vocabs/{}/{}.vocab'.format(sub, city1), 3)
    vocab2 = utils.load_vocab('vocabs/{}/{}.vocab'.format(sub, city2), 3)
    thres1, thres2 = utils.get_threshold(sub, city1, city2)
    results1, results2 = utils.compare_vocabs(vocab1, vocab2, city1, city2,
                                              thres1, thres2)
    text1 = create_text_wc(results1)
    text2 = create_text_wc(results2)
    # frequencies1 = utils.filter_stopwords(results1, stopwords, filter_unprintable=True)
    # frequencies2 = utils.filter_stopwords(results2, stopwords, filter_unprintable=True)
    create_duo_word_clouds_helper(outfold, sub, text1, city1, text2, city2,
                                  stopwords)
def get_baseline_model(args):
    vocab = utils.load_vocab(args.vocab_json)
    if args.baseline_start_from is not None:
        model, kwargs = utils.load_model(args.baseline_start_from)
    elif args.model_type == 'LSTM':
        kwargs = {
            'vocab': vocab,
            'rnn_wordvec_dim': args.rnn_wordvec_dim,
            'rnn_dim': args.rnn_hidden_dim,
            'rnn_num_layers': args.rnn_num_layers,
            'rnn_dropout': args.rnn_dropout,
            'fc_dims': parse_int_list(args.classifier_fc_dims),
            'fc_use_batchnorm': args.classifier_batchnorm == 1,
            'fc_dropout': args.classifier_dropout,
        }
        model = LstmModel(**kwargs)
    elif args.model_type == 'CNN+LSTM':
        kwargs = {
            'vocab': vocab,
            'rnn_wordvec_dim': args.rnn_wordvec_dim,
            'rnn_dim': args.rnn_hidden_dim,
            'rnn_num_layers': args.rnn_num_layers,
            'rnn_dropout': args.rnn_dropout,
            'cnn_feat_dim': parse_int_list(args.feature_dim),
            'cnn_num_res_blocks': args.cnn_num_res_blocks,
            'cnn_res_block_dim': args.cnn_res_block_dim,
            'cnn_proj_dim': args.cnn_proj_dim,
            'cnn_pooling': args.cnn_pooling,
            'fc_dims': parse_int_list(args.classifier_fc_dims),
            'fc_use_batchnorm': args.classifier_batchnorm == 1,
            'fc_dropout': args.classifier_dropout,
        }
        model = CnnLstmModel(**kwargs)
    elif args.model_type == 'CNN+LSTM+SA':
        kwargs = {
            'vocab': vocab,
            'rnn_wordvec_dim': args.rnn_wordvec_dim,
            'rnn_dim': args.rnn_hidden_dim,
            'rnn_num_layers': args.rnn_num_layers,
            'rnn_dropout': args.rnn_dropout,
            'cnn_feat_dim': parse_int_list(args.feature_dim),
            'stacked_attn_dim': args.stacked_attn_dim,
            'num_stacked_attn': args.num_stacked_attn,
            'fc_dims': parse_int_list(args.classifier_fc_dims),
            'fc_use_batchnorm': args.classifier_batchnorm == 1,
            'fc_dropout': args.classifier_dropout,
        }
        model = CnnLstmSaModel(**kwargs)
    if model.rnn.token_to_idx != vocab['question_token_to_idx']:
        # Make sure the new vocab is a superset of the old one
        for k, v in model.rnn.token_to_idx.items():
            assert k in vocab['question_token_to_idx']
            assert vocab['question_token_to_idx'][k] == v
        for token, idx in vocab['question_token_to_idx'].items():
            model.rnn.token_to_idx[token] = idx
        kwargs['vocab'] = vocab
        model.rnn.expand_vocab(vocab['question_token_to_idx'])
    model.cuda()
    model.train()
    return model, kwargs
def main():
    vocab_words = load_vocab(constants.ALL_WORDS)
    train = Dataset(constants.RAW_DATA + 'train', vocab_words)
    validation = Dataset(constants.RAW_DATA + 'dev', vocab_words)
    test = Dataset(constants.RAW_DATA + 'test', vocab_words)
    # get pre-trained embeddings
    embeddings = get_trimmed_w2v_vectors(constants.TRIMMED_FASTTEXT_W2V)
    model = LstmCnnModel(
        model_name=constants.MODEL_NAMES.format('sud', constants.JOB_IDENTITY),
        embeddings=embeddings,
        batch_size=constants.BATCH_SIZE,
        constants=constants,
    )
    # train, evaluate and interact
    model.build()
    model.load_data(train=train, validation=validation)
    model.run_train(epochs=constants.EPOCHS,
                    early_stopping=constants.EARLY_STOPPING,
                    patience=constants.PATIENCE)
    y_pred = model.predict(test)
    preds = []
    labels = []
    for pred, label in zip(y_pred, test.labels):
        labels.extend(label)
        preds.extend(pred[:len(label)])
    p, r, f1, _ = precision_recall_fscore_support(labels, preds,
                                                  average='binary')
    print('Result:\tP={:.2f}%\tR={:.2f}%\tF1={:.2f}%'.format(
        p * 100, r * 100, f1 * 100))
def __init__(self, path_to_map, path_to_vocab, path_to_index2sense):
    """
    Important note: during processing, new words are added to the OntoNotes
    senses and to the vocabulary, so they need to be exported again.

    :param path_to_map: OntoNotes sense to WordNet sense mappings
    :param path_to_vocab: index2word mapping [.csv]
    :param path_to_index2sense: index2sense mapping [.pickle]
    """
    # sense mappings
    self.on2wn = load_sense_mappings_pickle(path_to_map)
    self.index2sense = load_sense_mappings_pickle(path_to_index2sense)
    self.sense2index = dict()
    for key, value in self.index2sense.items():
        self.sense2index[value.replace("-", ".")] = key
    self.n_onsenses = len(self.sense2index)
    # get rid of that, no longer needed, save memory
    # del self.index2sense
    # Vocab
    self.word2index = load_vocab(path_to_vocab)
    self.n_words = len(self.word2index)
    # test
    print("Sense")
    print(self.sense2index["open.v.2"])
    print(self.on2wn["elaborate.v.1"])
    print(self.on2wn["elaborate.v.1"][0])
    print("Vocab:")
    print(self.word2index["elaborate"])
async def main(args):
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                              do_lower_case=False)
    # Load the common vocab subset
    common_vocab = utils.load_vocab(args.common_vocab_file)
    # Go through the TREx test set and save every sub-obj pair/fact in a set
    trex_set = set()
    with open(args.trex_file, 'r') as f_in:
        lines = f_in.readlines()
        for line in tqdm(lines):
            line = json.loads(line)
            trex_set.add((line['sub_uri'], line['obj_uri']))
    # Get the relation ID, e.g. P108
    filename = os.path.basename(os.path.normpath(args.in_file))
    rel_id = filename.split('.')[0]
    queries = []
    with open(args.in_file, 'r') as f_in:
        queries = f_in.readlines()
    with open(args.out_file, 'a+') as f_out:
        await map_async(
            lambda q: get_fact(q, args, tokenizer, trex_set, common_vocab, f_out),
            queries, args.count, args.max_tasks, args.sleep_time)
def main(args):
    # path to save checkpoint
    if not os.path.isdir(args.checkpoint_path):
        os.mkdir(args.checkpoint_path)
    args.checkpoint_path += '/checkpoint.pt'
    vocab = utils.load_vocab(args.vocab_json)
    train_loader_kwargs = {
        'question_h5': args.train_question_h5,
        'feature_h5': args.train_features_h5,
        'vocab': vocab,
        'batch_size': args.batch_size,
        'shuffle': args.shuffle_train_data == 1,
        'max_samples': args.num_train_samples,
        'num_workers': args.loader_num_workers,
    }
    val_loader_kwargs = {
        'question_h5': args.val_question_h5,
        'feature_h5': args.val_features_h5,
        'vocab': vocab,
        'batch_size': args.batch_size,
        'max_samples': args.num_val_samples,
        'num_workers': args.loader_num_workers,
    }
    with ClevrDataLoader(**train_loader_kwargs) as train_loader, \
         ClevrDataLoader(**val_loader_kwargs) as val_loader:
        train_loop(args, train_loader, val_loader)
def __init__(self, args, train_dataset=None, dev_dataset=None, test_dataset=None):
    self.args = args
    self.train_dataset = train_dataset
    self.dev_dataset = dev_dataset
    self.test_dataset = test_dataset
    self.label_lst = get_labels(args)
    self.num_labels = len(self.label_lst)
    # Use the cross-entropy ignore index as the padding label id so that
    # only real label ids contribute to the loss later
    self.pad_token_label_id = args.ignore_index
    self.word_vocab, self.char_vocab, _, _ = load_vocab(args)
    self.pretrained_word_matrix = None
    if not args.no_w2v:
        self.pretrained_word_matrix = load_word_matrix(args, self.word_vocab)
    self.model = BiLSTM_CNN_CRF(args, self.pretrained_word_matrix)
    # GPU or CPU
    self.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
    self.model.to(self.device)
    self.test_texts = None
    if args.write_pred:
        self.test_texts = get_test_texts(args)
        # Empty the original prediction files
        if os.path.exists(args.pred_dir):
            shutil.rmtree(args.pred_dir)
def test(args):
    test_path = args['--test-src']
    model_path = args['--model-path']
    batch_size = int(args['--batch-size'])
    total_examples = 0
    total_correct = 0
    vocab_path = args['--vocab-src']
    softmax = torch.nn.Softmax(dim=1)
    if args['--data'] == 'quora':
        test_data = utils.read_data(test_path, 'quora')
    vocab_data = utils.load_vocab(vocab_path)
    network = Model(args, vocab_data, 2)
    network.model = torch.load(model_path)
    if args['--cuda'] == str(1):
        network.model = network.model.cuda()
        softmax = softmax.cuda()
    network.model.eval()
    for labels, p1, p2, idx in utils.batch_iter(test_data, batch_size):
        total_examples += len(labels)
        print(total_examples)
        pred, _ = network.forward(labels, p1, p2)
        pred = softmax(pred)
        _, pred = pred.max(dim=1)
        label = network.get_label(labels)
        total_correct += (pred == label).sum().float()
    final_acc = total_correct / total_examples
    print('Accuracy of the model is %.2f' % final_acc, file=sys.stderr)
def main(args):
    eval_name = str(os.path.basename(args.data).split('.')[0])
    config = tf.estimator.RunConfig(model_dir=args.model_dir)
    hparams = utils.create_hparams(args)
    vocab_list = utils.load_vocab(args.vocab)
    binf2phone_np = None
    binf2phone = None
    if hparams.decoder.binary_outputs:
        binf2phone = utils.load_binf2phone(args.binf_map, vocab_list)
        binf2phone_np = binf2phone.values

    def model_fn(features, labels, mode, config, params):
        return las_model_fn(features, labels, mode, config, params,
                            binf2phone=binf2phone_np, run_name=eval_name)

    model = tf.estimator.Estimator(model_fn=model_fn, config=config,
                                   params=hparams)
    tf.logging.info('Evaluating on {}'.format(eval_name))
    model.evaluate(lambda: input_fn(args.data, args.vocab, args.norm,
                                    num_channels=args.num_channels,
                                    batch_size=args.batch_size,
                                    binf2phone=None),
                   name=eval_name)
def _create_pretrained_emb_from_txt(vocab_file, embed_file,
                                    num_trainable_tokens=3, dtype=tf.float32,
                                    scope=None):
    """Load a pretrained embedding from an embed file and return an embedding matrix.

    Args:
        vocab_file: vocab file path.
        embed_file: embed file path.
        num_trainable_tokens: make the first n tokens in the vocab file
            trainable variables. Default is 3, i.e. "<unk>", "<s>" and "</s>".
    """
    _, vocab, _ = load_vocab(vocab_file)
    trainable_tokens = vocab[:num_trainable_tokens]
    print("Using pretrained embedding: %s." % embed_file)
    print(" with trainable tokens:")
    emb_dict, emb_size = _load_embed_txt(embed_file)
    for token in trainable_tokens:
        print(" %s" % token)
    for token in vocab:
        if token not in emb_dict:
            emb_dict[token] = [0.0] * emb_size
    emb_mat = np.array([emb_dict[token] for token in vocab],
                       dtype=dtype.as_numpy_dtype())
    emb_mat = tf.constant(emb_mat)
    emb_mat_const = tf.slice(emb_mat, [num_trainable_tokens, 0], [-1, -1])
    with tf.variable_scope(scope or "pretrain_embeddings", dtype=dtype):
        emb_mat_var = tf.get_variable("emb_mat_var",
                                      [num_trainable_tokens, emb_size])  # TODO
    return tf.concat([emb_mat_var, emb_mat_const], 0)
def main():
    parser = argparse.ArgumentParser(description="Running Simlex test")
    parser.add_argument(
        "--vocab_file_pattern", type=str, default=None,
        help="vocab path file or file pattern in case of multiple files",
        required=True)
    parser.add_argument(
        "--vector_file_pattern", type=str, default=None,
        help="vector path file or file pattern in case of multiple files",
        required=True)
    parser.add_argument("--output_file", type=str, default=None,
                        help="file to write output to", required=True)
    args = parser.parse_args()
    vocab_files = glob.glob(str(args.vocab_file_pattern))
    vector_files = glob.glob(str(args.vector_file_pattern))
    with open(os.path.join(ROOT_DIR, f'simlex/{args.output_file}'), 'w') as f:
        for voc, vec in zip(vocab_files, vector_files):
            file_name = os.path.splitext(os.path.basename(voc))[0][4:]
            vocab = load_vocab(voc)
            vectors = load_vectors(vec)
            simlex_score = eval_simlex(simlex_pairs, vocab, vectors)
            f.write('{}: {}'.format(file_name, simlex_score))
            f.write('\n')
def main():
    opt.load_full = not opt.debug
    word2id, id2word = utils.load_vocab()
    glove_emb = utils.load_glove_emb(word2id)
    word_emb = nn.Embedding.from_pretrained(
        torch.tensor(glove_emb, dtype=torch.float))
    model = HierarchicalSeq2seq(word_emb=word_emb, word_emb_dim=300,
                                word_vocab_size=len(id2word)).cuda()
    if opt.mode == "train":
        optimizer = optim.Adagrad(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=opt.learning_rate,
            initial_accumulator_value=opt.init_accum)
        train_data = utils.load_train_data(demo=opt.demo)
        run_training(model, train_data, optimizer, word2id, id2word, opt)
    elif opt.mode == "inference":
        test_data = utils.load_test_data(setup=opt.setup, demo=opt.demo)
        run_inference(model, test_data, word2id, id2word, opt)
def __init__(self, data_dir, vocab_path, random_seed=None):
    self.data_dir = data_dir
    self.vocab = load_vocab(vocab_path)
    self.num_examples = {"train": -1, "dev": -1, "infer": -1}
    np.random.seed(random_seed)
def main(args):
    print('loading trained model from %s' % (args.load_path + '/checkpoint.pt'))
    model, kwargs = utils.load_model(args.load_path + '/checkpoint.pt')
    model.cuda()
    model.eval()
    vocab = utils.load_vocab(args.vocab_json)
    test_loader_kwargs = {
        'question_h5': args.test_question_h5,
        'feature_h5': args.test_features_h5,
        'vocab': vocab,
        'batch_size': args.batch_size,
        'max_samples': None,
        'num_workers': args.loader_num_workers,
    }
    print('loading test data')
    with ClevrDataLoader(**test_loader_kwargs) as test_loader:
        print('%d samples in the test set' % len(test_loader.dataset))
        print('checking test accuracy...')
        acc = check_accuracy(args, model, test_loader)
        print('test accuracy = %.4f' % acc)
    with open(args.load_path + '/checkpoint.pt.json') as f:
        info = json.load(f)
    with open(args.load_path + '/result.txt', 'w') as res:
        res.write('test accuracy: %.4f\n' % acc)
        res.write('best val accuracy: %.4f\n' % info['best_val_acc'])
        res.write('arguments: \n')
        for k, v in vars(args).items():
            res.write(str(k) + ': ' + str(v) + '\n')
def main(args):
    config = tf.estimator.RunConfig(model_dir=args.model_dir)
    hparams = utils.create_hparams(args)
    hparams.decoder.set_hparam('beam_width', args.beam_width)
    vocab_list = utils.load_vocab(args.vocab)
    vocab_list_orig = vocab_list
    binf2phone_np = None
    binf2phone = None
    mapping = None
    if hparams.decoder.binary_outputs:
        if args.mapping is not None:
            vocab_list, mapping = utils.get_mapping(args.mapping, args.vocab)
            hparams.del_hparam('mapping')
            hparams.add_hparam('mapping', mapping)
        binf2phone = utils.load_binf2phone(args.binf_map, vocab_list)
        binf2phone_np = binf2phone.values

    def model_fn(features, labels, mode, config, params):
        return las_model_fn(features, labels, mode, config, params,
                            binf2phone=binf2phone_np)

    model = tf.estimator.Estimator(model_fn=model_fn, config=config,
                                   params=hparams)
    phone_pred_key = ('sample_ids_phones_binf' if args.use_phones_from_binf
                      else 'sample_ids')
    predict_keys = [phone_pred_key, 'embedding', 'alignment']
    if args.use_phones_from_binf:
        predict_keys.append('logits_binf')
        predict_keys.append('alignment_binf')
    audio, _ = librosa.load(args.waveform, sr=SAMPLE_RATE, mono=True)
    features = [calculate_acoustic_features(args, audio)]
    predictions = model.predict(
        input_fn=lambda: input_fn(features, args.vocab, args.norm,
                                  num_channels=features[0].shape[-1],
                                  batch_size=args.batch_size),
        predict_keys=predict_keys)
    predictions = list(predictions)
    for p in predictions:
        beams = p[phone_pred_key].T
        if len(beams.shape) > 1:
            i = beams[0]
        else:
            i = beams
        i = i.tolist() + [utils.EOS_ID]
        i = i[:i.index(utils.EOS_ID)]
        text = to_text(vocab_list, i)
        text = text.split(args.delimiter)
        print(text)
def main(args):
    vocab_list = np.array(utils.load_vocab(args.vocab))
    vocab_size = len(vocab_list)
    config = tf.estimator.RunConfig(model_dir=args.model_dir)
    hparams = utils.create_hparams(
        args, vocab_size, utils.SOS_ID, utils.EOS_ID)
    hparams.decoder.set_hparam('beam_width', args.beam_width)
    model = tf.estimator.Estimator(
        model_fn=las_model_fn, config=config, params=hparams)
    predictions = model.predict(
        input_fn=lambda: input_fn(
            args.data, args.vocab, num_channels=args.num_channels,
            batch_size=args.batch_size, num_epochs=1),
        predict_keys='sample_ids')
    if args.beam_width > 0:
        predictions = [vocab_list[y['sample_ids'][:, 0]].tolist() + [utils.EOS]
                       for y in predictions]
    else:
        predictions = [vocab_list[y['sample_ids']].tolist() + [utils.EOS]
                       for y in predictions]
    predictions = [' '.join(y[:y.index(utils.EOS)]) for y in predictions]
    with open(args.save, 'w') as f:
        f.write('\n'.join(predictions))
def word_sents_to_lword_id_AND_casing_id_sents(word_sents):
    vocab = load_vocab()
    lword_id_sents = []
    casing_id_sents = []
    for word_sent in word_sents:
        lword_id_sent = []
        casing_id_sent = []
        for word in word_sent:
            lword = word.lower()
            lword_id = vocab[lword] if lword in vocab else 1
            lword_id_sent.append(lword_id)
            casing_id = casing_to_id[word_to_casing(word)]
            casing_id_sent.append(casing_id)
        lword_id_sents.append(lword_id_sent)
        casing_id_sents.append(casing_id_sent)
    # Pad sentences
    max_sent_len = len(max(word_sents, key=len))
    lword_id_sents = pad_sequences(lword_id_sents, maxlen=max_sent_len)
    casing_id_sents = pad_sequences(casing_id_sents, maxlen=max_sent_len)
    return lword_id_sents, casing_id_sents
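# word_to_casing and casing_to_id are used above but not defined in this
# section. Below is a minimal hypothetical sketch of what they might look
# like; the actual casing categories in the source repository may differ.
casing_to_id = {'lower': 0, 'upper': 1, 'title': 2, 'digit': 3, 'other': 4}

def word_to_casing(word):
    """Map a token to a coarse casing category."""
    if word.isdigit():
        return 'digit'
    if word.islower():
        return 'lower'
    if word.isupper():
        return 'upper'
    if word.istitle():
        return 'title'
    return 'other'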
def __init__(self, config, embeddings, ntags, nchars=None):
    self.config = config
    self.nchars = nchars
    self.ntags = ntags
    self.embeddings = embeddings
    self.logger = config.logger  # now instantiated in config
    self.vocab_words = load_vocab(self.config.words_filename)
    self.vocab_labels = load_vocab(self.config.labels_filename)
    self.idx_to_word = {idx: word for word, idx in self.vocab_words.items()}
    self.idx_to_tag = {idx: word for word, idx in self.vocab_labels.items()}
def __init__(self, filepath):
    vlist, vdict = load_vocab()
    with open(filepath) as f:
        data = f.read().splitlines()
    data = [sent.split(' ') for sent in data]
    self.data = [[vdict[x] for x in sent] for sent in data]
    self.lengths = torch.tensor([len(x) + 1 for x in self.data])
    self.size = len(self.lengths)
def main():
    timer = Timer()
    timer.start('Load word2vec models...')
    vocab = load_vocab(config.VOCAB_DATA)
    embeddings = get_trimmed_w2v_vectors(config.W2V_DATA)
    timer.stop()

    timer.start('Load data...')
    train = process_data(opt.train, vocab)
    if opt.val is not None:
        if opt.val != '1vs9':
            validation = process_data(opt.val, vocab)
        else:
            validation, train = train.one_vs_nine()
    else:
        validation = None
    if opt.test is not None:
        test = process_data(opt.test, vocab)
    else:
        test = None
    timer.stop()

    timer.start('Build model...')
    model = CnnModel(embeddings=embeddings)
    model.build()
    timer.stop()

    timer.start('Train model...')
    epochs = opt.e
    batch_size = opt.b
    early_stopping = True if opt.p != 0 else False
    patience = opt.p
    pre_train = opt.pre if opt.pre != '' else None
    model_name = opt.name
    model.train(
        model_name,
        train=train,
        validation=validation,
        epochs=epochs,
        batch_size=batch_size,
        early_stopping=early_stopping,
        patience=patience,
        cont=pre_train,
    )
    timer.stop()

    if test is not None:
        timer.start('Test model...')
        preds = model.predict(test, model_name)
        labels = test.labels
        p, r, f1, _ = precision_recall_fscore_support(labels, preds,
                                                      average='binary')
        print('Testing result:\tP={}\tR={}\tF1={}'.format(p, r, f1))
        timer.stop()
def __init__(self, log_dir, cfg):
    self.path = log_dir
    self.cfg = cfg
    if cfg.TRAIN.FLAG:
        self.model_dir = os.path.join(self.path, 'Model')
        self.log_dir = os.path.join(self.path, 'Log')
        mkdir_p(self.model_dir)
        mkdir_p(self.log_dir)
        self.writer = SummaryWriter(log_dir=self.log_dir)
        sys.stdout = Logger(logfile=os.path.join(self.path, "logfile.log"))
    self.data_dir = cfg.DATASET.DATA_DIR
    self.max_epochs = cfg.TRAIN.MAX_EPOCHS
    self.snapshot_interval = cfg.TRAIN.SNAPSHOT_INTERVAL
    s_gpus = cfg.GPU_ID.split(',')
    self.gpus = [int(ix) for ix in s_gpus]
    self.num_gpus = len(self.gpus)
    self.batch_size = cfg.TRAIN.BATCH_SIZE
    self.lr = cfg.TRAIN.LEARNING_RATE
    torch.cuda.set_device(self.gpus[0])
    cudnn.benchmark = True

    # load dataset
    self.dataset = ClevrDataset(data_dir=self.data_dir, split="train")
    self.dataloader = DataLoader(dataset=self.dataset,
                                 batch_size=cfg.TRAIN.BATCH_SIZE,
                                 shuffle=True, num_workers=cfg.WORKERS,
                                 drop_last=True, collate_fn=collate_fn)
    self.dataset_val = ClevrDataset(data_dir=self.data_dir, split="val")
    self.dataloader_val = DataLoader(dataset=self.dataset_val, batch_size=200,
                                     drop_last=True, shuffle=False,
                                     num_workers=cfg.WORKERS,
                                     collate_fn=collate_fn)

    # load model
    self.vocab = load_vocab(cfg)
    self.model, self.model_ema = mac.load_MAC(cfg, self.vocab)
    self.weight_moving_average(alpha=0)
    self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
    self.previous_best_acc = 0.0
    self.previous_best_epoch = 0
    self.total_epoch_loss = 0
    self.prior_epoch_loss = 10
    self.print_info()
    self.loss_fn = torch.nn.CrossEntropyLoss().cuda()
def main(args):
    if args.randomize_checkpoint_path == 1:
        name, ext = os.path.splitext(args.checkpoint_path)
        num = random.randint(1, 1000000)
        args.checkpoint_path = '%s_%06d%s' % (name, num, ext)

    # load the vocabulary
    vocab = utils.load_vocab(args.vocab_json)

    if args.use_local_copies == 1:
        shutil.copy(args.train_question_h5, '/tmp/train_questions.h5')
        shutil.copy(args.train_features_h5, '/tmp/train_features.h5')
        shutil.copy(args.val_question_h5, '/tmp/val_questions.h5')
        shutil.copy(args.val_features_h5, '/tmp/val_features.h5')
        args.train_question_h5 = '/tmp/train_questions.h5'
        args.train_features_h5 = '/tmp/train_features.h5'
        args.val_question_h5 = '/tmp/val_questions.h5'
        args.val_features_h5 = '/tmp/val_features.h5'

    question_families = None
    if args.family_split_file is not None:
        with open(args.family_split_file, 'r') as f:
            question_families = json.load(f)

    # ** Dataloaders creation **
    # Create two ClevrDataLoaders for training and validation
    trainer_loader_kwargs = {
        'question_h5': args.train_question_h5,
        'feature_h5': args.train_features_h5,
        'vocab': vocab,
        'shuffle': args.shuffle_train_data == 1,
        'question_families': question_families,
        'max_samples': args.num_train_samples,
        'num_workers': args.loader_num_workers,
        'images_path': args.train_images,
    }
    val_loader_kwargs = {
        'question_h5': args.val_question_h5,
        'feature_h5': args.val_features_h5,
        'vocab': vocab,
        'question_families': question_families,
        'max_samples': args.num_val_samples,
        'num_workers': args.loader_num_workers,
        'images_path': args.val_images,
    }
    train_loader = ClevrDataLoader(**trainer_loader_kwargs)
    val_loader = ClevrDataLoader(**val_loader_kwargs)
    # ** Dataloaders created **

    train_loop(args, vocab, train_loader, val_loader)

    if args.use_local_copies == 1 and args.cleanup_local_copies == 1:
        os.remove('/tmp/train_questions.h5')
        os.remove('/tmp/train_features.h5')
        os.remove('/tmp/val_questions.h5')
        os.remove('/tmp/val_features.h5')
def generate(start_word, length):
    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab_file", type=str, default="data/vocab.pkl",
                        help="Vocabulary dictionary")
    parser.add_argument("--vocab_size", type=int, default=2854,
                        help="Vocabulary size")
    parser.add_argument("--embedding_dim", type=int, default=256,
                        help="Dimensionality of the word embeddings")
    parser.add_argument("--rnn_size", type=int, default=128,
                        help="Hidden units of the rnn layer")
    parser.add_argument("--num_layers", type=int, default=2,
                        help="Number of rnn layers")
    parser.add_argument("--batch_size", type=int, default=1,
                        help="Minibatch size")
    args, _ = parser.parse_known_args()

    vocab_dict = utils.load_vocab(args.vocab_file)
    index2word = dict(zip(vocab_dict.values(), vocab_dict.keys()))
    text = [start_word]
    text_data = utils.transform(text, vocab_dict)

    checkpoint_dir = os.path.abspath(
        os.path.join(os.path.curdir, "checkpoints"))
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)

    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session()
        with sess.as_default():
            rnn = RNNLM(vocab_size=args.vocab_size,
                        embedding_dim=args.embedding_dim,
                        rnn_size=args.rnn_size,
                        num_layers=args.num_layers,
                        batch_size=args.batch_size,
                        training=False)
            saver = tf.train.Saver()
            saver.restore(sess, checkpoint_file)
            for _ in range(length):
                data = np.array([text_data])
                predictions = sess.run(rnn.prediction,
                                       feed_dict={rnn.input_data: data})
                text_data.append(predictions[-1])
    content = [index2word[index] for index in text_data]
    return "".join(content)
def filter_by_freq(filename, freq=100):
    vocab = load_vocab('../gen_data/vocab.1b')
    fout = open(filename + '.f' + str(freq), 'w')
    with open(filename) as fin:
        for line in fin:
            arr = line.strip().split('\t')
            if len(arr) != 2:
                logging.info("error line: %s" % ('\t'.join(arr)))
                continue
            if arr[0] not in vocab or arr[1] not in vocab:
                logging.info("not in vocab: %s" % ('\t'.join(arr)))
                continue
            if vocab[arr[0]] < freq and vocab[arr[1]] < freq:
                logging.info("low freq: %s" % ('\t'.join(arr)))
            else:
                fout.write(line)
    fout.close()
def filter_easy_freebase_by_vocab(filename, vocabfile):
    logging.info('BEGIN: cleaning freebase file: %s' % filename)
    vocab = load_vocab(vocabfile)
    fout = open(filename + '.freq', 'w')
    with open(filename) as fin:
        for line in fin:
            arr = line.strip().split('\t')
            head = arr[0].lower()
            rel = arr[1].lower()
            tail = arr[2].lower()
            if head not in vocab or tail not in vocab:
                continue
            if len(head) < 2 or len(tail) < 2:
                continue
            if vocab[head] < 50 or vocab[tail] < 50:
                continue
            if rel == 'is-a' or '/' in rel or '(' in rel:
                continue
            rel = rel.replace(' ', '_')
            fout.write('%s\t%s\t%s\n' % (head, rel, tail))
    fout.close()
    logging.info('END')
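# The two filter functions above index the loaded vocab by word to read a
# corpus frequency (e.g. vocab[head] < 50), so their load_vocab presumably
# returns a word -> count mapping rather than word -> id. A minimal sketch
# under the assumption of a whitespace-separated "word count" file format;
# load_vocab_freq is a hypothetical name, not the repository's API.
def load_vocab_freq(vocab_path):
    """Read 'word count' lines into a dict mapping word -> int frequency."""
    freqs = {}
    with open(vocab_path, encoding='utf8') as fin:
        for line in fin:
            parts = line.strip().split()
            if len(parts) != 2:
                continue  # skip malformed lines
            word, count = parts
            freqs[word] = int(count)
    return freqs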
from model import Model
from config import Config
from utils import build_data, load_vocab, get_processing_word, Dataset, \
    clear_data_path, get_trimmed_glove_vectors, write_clear_data_pd
import sys

with open(sys.argv[1], "r") as f:
    pipeline = '\t'.join(f.readlines())

config = Config(pipeline)

# load vocabs
vocab_words = load_vocab(config.words_filename)
vocab_labels = load_vocab(config.labels_filename)
vocab_chars = load_vocab(config.chars_filename)

# get processing functions
processing_word = get_processing_word(vocab_words, vocab_chars,
                                      lowercase=True, chars=config.chars)
processing_label = get_processing_word(vocab_labels, lowercase=False,
                                       label_vocab=True)

# get pre-trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

test_filepath, _ = write_clear_data_pd(config.test_filename, config.DEFAULT,
                                       domain=config.domain)
test = Dataset(test_filepath, processing_word, processing_label,
               config.max_iter)

# build model
model = Model(config, embeddings, ntags=len(vocab_labels),
              nchars=len(vocab_chars))
if __name__=="__main__": image_paths = {} root_path = "/srv/data/datasets/mscoco/images/" for split in 'train val'.split(): image_ids_path = "datasets/vqa/"+split+"/img_ids.txt" image_ids = set([int(x.strip()) for x in open(image_ids_path).readlines()]) print(split,len(image_ids)) for x in image_ids: name = 'COCO_'+split+'2014_'+format(x, '012')+'.jpg' path = join(root_path,split+"2014",name) image_paths[x] = path q_i2w, q_w2i = load_vocab('datasets/vqa/train/questions.vocab') a_i2w, a_w2i = load_vocab('datasets/vqa/train/answers.vocab') train_set = Dataset('datasets/vqa/train/dataset.h5',image_paths) max_mc = train_set.multiple_choice.shape[-1] max_q = train_set.max_q val_set = Dataset('datasets/vqa/val/dataset.h5',image_paths,max_q=max_q,max_mc=max_mc) Nq = len(q_i2w) Na = len(a_i2w) tf.reset_default_graph() # Read the model with open("tensorflow-vgg16/vgg16.tfmodel", mode='rb') as f: fileContent = f.read() graph_def = tf.GraphDef()
d1 = 512
demb = 50
model_name = "marginloss_wholeimage"

image_paths = {}
root_path = "/srv/data/datasets/mscoco/images/"
for split in 'train val'.split():
    image_ids_path = "datasets/vqa/" + split + "/img_ids.txt"
    image_ids = set([int(x.strip()) for x in open(image_ids_path).readlines()])
    print(split, len(image_ids))
    for x in image_ids:
        name = 'COCO_' + split + '2014_' + format(x, '012') + '.jpg'
        path = join(root_path, split + "2014", name)
        image_paths[x] = path

i2w, w2i = load_vocab('datasets/vqa/vocabulary.txt')

train_set = Dataset('datasets/vqa/train/dataset_cleaner.h5', image_paths)
max_mc = train_set.multiple_choice.shape[-1]
max_q = train_set.max_q
val_set = Dataset('datasets/vqa/val/dataset_cleaner.h5', image_paths,
                  max_q=max_q, max_mc=max_mc)
Nvocab = len(i2w)

tf.reset_default_graph()
# Read the model
with open("tensorflow-vgg16/vgg16.tfmodel", mode='rb') as f:
    fileContent = f.read()
def train_lstm():
    optimizer = adam  # only adam is supported for now.
    options = locals().copy()
    with open(prm.outpath, "a") as fout:
        fout.write("parameters:" + str(options) + str(prm.__dict__))
    print "loading dictionary..."
    vocab = utils.load_vocab(prm.vocab_path, prm.n_words)
    options['vocab'] = vocab
    options['vocabinv'] = {}
    for k, v in vocab.items():
        options['vocabinv'][v] = k
    print 'Loading data...'
    options['wiki'] = wiki.Wiki(prm.pages_path)
    options['wikiemb'] = wiki_emb.WikiEmb(prm.pages_emb_path)
    qpp = qp.QP(prm.qp_path)
    q_train, q_valid, q_test = qpp.get_queries()
    a_train, a_valid, a_test = qpp.get_paths()
    print 'Building model'
    # This creates the initial parameters as np ndarrays.
    # Dict name (string) -> np ndarray
    params, exclude_params = init_params()
    if prm.reload_model:
        load_params(prm.reload_model, params)
    if prm.wordemb_path:
        print 'loading pre-trained weights for word embeddings'
        params = load_wemb(params, vocab)
        options['W'] = params['W']
    # This creates Theano shared variables from the parameters.
    # Dict name (string) -> Theano tensor shared variable.
    # params and tparams have different copies of the weights.
    tparams = init_tparams(params)
    mean = theano.shared(np.zeros((prm.dim_proj,)).astype(config.floatX))  # avg of the training set
    std = theano.shared(np.zeros((prm.dim_proj,)).astype(config.floatX))  # std of the training set
    t_samples = theano.shared(np.zeros((1,)).astype(config.floatX))  # total number of samples so far
    stats_vars = {'mean': mean, 'std': std, 't_samples': t_samples}
    if prm.supervised:
        baseline_vars = {}
    else:
        R_mean = theano.shared(0.71 * np.ones((1,)), name='R_mean')
        R_std = theano.shared(np.ones((1,)), name='R_std')
        baseline_vars = {'R_mean': R_mean, 'R_std': R_std}
    is_train, sup, max_hops, k_beam, tq, tq_m, troot_pages, tacts_p, f_pred, cost, \
        scan_updates, baseline_updates, stats_updates, consider_constant, \
        opt_out = \
        build_model(tparams, baseline_vars, stats_vars, options)
    if prm.decay_c > 0.:
        decay_c = theano.shared(np_floatX(prm.decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U'] ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay
    # get only parameters that are not in the exclude_params list
    tparams_ = OrderedDict([(kk, vv) for kk, vv in tparams.iteritems()
                            if kk not in exclude_params])
    grads = tensor.grad(cost, wrt=itemlist(tparams_),
                        consider_constant=consider_constant)
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams_, grads, tq, tq_m,
                                        troot_pages, tacts_p, cost,
                                        scan_updates, baseline_updates,
                                        stats_updates,
                                        opt_out=[opt_out['R'],
                                                 opt_out['page_idx'],
                                                 opt_out['best_answer'],
                                                 opt_out['best_page_idx']])
    print 'Optimization'
    if prm.train_size == -1:
        train_size = len(q_train)
    else:
        train_size = prm.train_size
    if prm.valid_size == -1:
        valid_size = len(q_valid)
    else:
        valid_size = prm.valid_size
    if prm.test_size == -1:
        test_size = len(q_test)
    else:
        test_size = prm.test_size
    with open(prm.outpath, "a") as fout:
        fout.write("\n%d train examples" % len(q_train))
    with open(prm.outpath, "a") as fout:
        fout.write("\n%d valid examples" % len(q_valid))
    with open(prm.outpath, "a") as fout:
        fout.write("\n%d test examples" % len(q_test))
    history_errs = []
    best_p = None
    if prm.validFreq == -1:
        validFreq = len(q_train) / prm.batch_size_train
    else:
        validFreq = prm.validFreq
    if prm.saveFreq == -1:
        saveFreq = len(q_train) / prm.batch_size_train
    else:
        saveFreq = prm.saveFreq
    uidx = 0  # the number of updates done
    estop = False  # early stop
    start_time = time.time()
    try:
        for eidx in xrange(prm.max_epochs):
            n_samples = 0
            # Get a new shuffled index for the training set.
            kf = get_minibatches_idx(len(q_train), prm.batch_size_train,
                                     shuffle=True)
            for _, train_index in kf:
                st = time.time()
                uidx += 1
                is_train.set_value(1.)
                max_hops.set_value(prm.max_hops_train)  # select training dataset
                k_beam.set_value(1)  # training does not use beam search
                # Select the random examples for this minibatch
                queries = [q_train[t].lower() for t in train_index]
                actions = [a_train[t] for t in train_index]
                if prm.supervised == 1:
                    sup_ = True
                elif prm.supervised > 1:
                    if uidx % (int(uidx / prm.supervised) + 1) == 0:
                        sup_ = True
                    else:
                        sup_ = False
                else:
                    sup_ = False
                if sup_:
                    sup.set_value(1.)  # select supervised mode
                    # Get correct actions (supervision signal)
                    acts_p = get_acts(actions, prm.max_hops_train, k_beam=1)
                else:
                    sup.set_value(0.)  # select non-supervised mode
                    acts_p = -np.ones((prm.max_hops_train + 1, len(queries)),
                                      dtype=np.float32)
                root_pages = get_root_pages(actions)
                # Get the BoW for the queries
                q_bow, q_m = utils.BOW2(queries, vocab,
                                        prm.max_words_query * prm.n_consec)
                n_samples += len(queries)
                cost, R, pagesidx, best_answer, best_page_idx = \
                    f_grad_shared(q_bow, q_m, root_pages, acts_p)
                f_update(prm.lrate)
                if np.isnan(cost) or np.isinf(cost):
                    print 'NaN detected'
                    return 1., 1., 1.
                if np.mod(uidx, prm.dispFreq) == 0:
                    with open(prm.outpath, "a") as fout:
                        fout.write("\n\nQuery: " + queries[-1].replace("\n", " "))
                        fout.write('\nBest Answer: ' + utils.idx2text(best_answer[-1], options['vocabinv']))
                        fout.write('\nBest page: ' + options['wiki'].get_article_title(best_page_idx[-1]))
                        for i, pageidx in enumerate(pagesidx[:, -1]):
                            fout.write('\niteration: ' + str(i) + " page idx " + str(pageidx) +
                                       ' title: ' + options['wiki'].get_article_title(pageidx))
                        fout.write('\nEpoch ' + str(eidx) + ' Update ' + str(uidx) + ' Cost ' + str(cost) +
                                   ' Reward Mean ' + str(R.mean()) + ' Reward Max ' + str(R.max()) +
                                   ' Reward Min ' + str(R.min()))
                        fout.write("\nTime per Minibatch Update: " + str(time.time() - st))
                if prm.saveto and np.mod(uidx, saveFreq) == 0:
                    print 'Saving...',
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(prm.saveto, history_errs=history_errs, **params)
                    pkl.dump(options, open('%s.pkl' % prm.saveto, 'wb'), -1)
                    print 'Done'
                if np.mod(uidx, validFreq) == 0:
                    kf_train = get_minibatches_idx(len(q_train), prm.batch_size_pred,
                                                   shuffle=True, max_samples=train_size)
                    kf_valid = get_minibatches_idx(len(q_valid), prm.batch_size_pred,
                                                   shuffle=True, max_samples=valid_size)
                    kf_test = get_minibatches_idx(len(q_test), prm.batch_size_pred,
                                                  shuffle=True, max_samples=test_size)
                    is_train.set_value(0.)
                    sup.set_value(0.)  # supervised mode off
                    max_hops.set_value(prm.max_hops_pred)
                    k_beam.set_value(prm.k)
                    with open(prm.outpath, 'a') as fout:
                        fout.write('\n\nComputing Error Training Set')
                    train_err, train_R, train_accp = pred_error(f_pred, q_train, a_train, options, kf_train)
                    with open(prm.outpath, 'a') as fout:
                        fout.write('\n\nComputing Error Validation Set')
                    valid_err, valid_R, valid_accp = pred_error(f_pred, q_valid, a_valid, options, kf_valid)
                    with open(prm.outpath, 'a') as fout:
                        fout.write('\n\nComputing Error Test Set')
                    test_err, test_R, test_accp = pred_error(f_pred, q_test, a_test, options, kf_test)
                    history_errs.append([valid_err[-1], test_err[-1]])
                    if (uidx == 0 or valid_err[-1] <= np.array(history_errs)[:, 0].min()):
                        best_p = unzip(tparams)
                        bad_counter = 0
                    with open(prm.outpath, "a") as fout:
                        fout.write('\n[{per hop}, Avg] Train err ' + str(train_err) +
                                   ' Valid err ' + str(valid_err) + ' Test err ' + str(test_err))
                        fout.write('\n[{per hop}, Avg] Train R ' + str(train_R) +
                                   ' Valid R ' + str(valid_R) + ' Test R ' + str(test_R))
                        fout.write('\nAccuracy Page Actions Train ' + str(train_accp) +
                                   ' Valid ' + str(valid_accp) + ' Test ' + str(test_accp))
                    if (len(history_errs) > prm.patience and
                            valid_err[-1] >= np.array(history_errs)[:-prm.patience, 0].min()):
                        bad_counter += 1
                        if bad_counter > prm.patience:
                            print 'Early Stop!'
                            estop = True
                            break
            with open(prm.outpath, "a") as fout:
                fout.write('\nSeen %d samples' % n_samples)
            if estop:
                break
    except KeyboardInterrupt:
        print "Training interrupted"
    end_time = time.time()
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)
    is_train.set_value(0.)
    sup.set_value(0.)  # supervised mode off
    max_hops.set_value(prm.max_hops_pred)
    k_beam.set_value(prm.k)
    kf_train_sorted = get_minibatches_idx(len(q_train), prm.batch_size_train)
    train_err, train_R, train_accp = pred_error(f_pred, q_train, a_train, options, kf_train_sorted)
    valid_err, valid_R, valid_accp = pred_error(f_pred, q_valid, a_valid, options, kf_valid)
    test_err, test_R, test_accp = pred_error(f_pred, q_test, a_test, options, kf_test)
    with open(prm.outpath, "a") as fout:
        fout.write('\n[{per hop}, Avg] Train err ' + str(train_err) +
                   ' Valid err ' + str(valid_err) + ' Test err ' + str(test_err))
        fout.write('\n[{per hop}, Avg] Train R ' + str(train_R) +
                   ' Valid R ' + str(valid_R) + ' Test R ' + str(test_R))
        fout.write('\nAccuracy Page Actions Train ' + str(train_accp) +
                   ' Valid ' + str(valid_accp) + ' Test ' + str(test_accp))
    if prm.saveto:
        np.savez(prm.saveto, train_err=train_err, valid_err=valid_err,
                 test_err=test_err, history_errs=history_errs, **best_p)
    with open(prm.outpath, "a") as fout:
        fout.write('\nThe code ran for %d epochs, with %f sec/epoch' %
                   ((eidx + 1), (end_time - start_time) / (1. * (eidx + 1))))
    with open(prm.outpath, "a") as fout:
        fout.write('\nTraining took %.1fs' % (end_time - start_time))
    return train_err, valid_err, test_err
'''
Compute the Inverse Document Frequency (IDF) of Wikipedia articles
using the vocabulary defined in <vocab_path>.
'''
import cPickle as pkl
import numpy as np
import random
import utils
from collections import OrderedDict
from nltk.tokenize import wordpunct_tokenize
import re
import parameters as prm

print 'loading vocabulary'
vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

textbegin = False
title = ''
text = ''
n = 0
f = open(prm.dump_path, "rb")
print 'creating IDF'
m = 0    # number of documents
df = {}  # word-document frequency
while True:
    line = f.readline()
    if (line == ''):
        break
def train_lstm():
    optimizer = adam  # only adam is supported for now.
    options = locals().copy()
    with open(prm.outpath, "a") as fout:
        fout.write("parameters:" + str(options) + str(prm.__dict__))
    print "loading dictionary..."
    vocab = utils.load_vocab(prm.vocab_path, prm.n_words)
    options['vocab'] = vocab
    options['vocabinv'] = {}
    for k, v in vocab.items():
        options['vocabinv'][v] = k
    print 'Loading data...'
    options['wiki'] = wiki.Wiki(prm.pages_path)
    options['wikiemb'] = wiki_emb.WikiEmb(prm.pages_emb_path)
    # load the Q&A Wiki dataset
    qpp = qp.QP(prm.qp_path)
    q_train, q_valid, q_test = qpp.get_queries()
    a_train, a_valid, a_test = qpp.get_paths()
    print 'Building model'
    # This creates the initial parameters as np ndarrays.
    # Dict name (string) -> np ndarray
    params, exclude_params = init_params()
    if prm.wordemb_path:
        print 'loading pre-trained weights for word embeddings'
        params = load_wemb(params, vocab)
        options['W'] = params['W']
    if prm.reload_model:
        load_params(prm.reload_model, params)
    params_next = OrderedDict()
    if prm.learning.lower() == 'q_learning' and prm.update_freq > 0:
        # copy params to params_next
        for kk, kv in params.items():
            params_next[kk] = kv.copy()
    # This creates Theano shared variables from the parameters.
    # Dict name (string) -> Theano tensor shared variable.
    # params and tparams have different copies of the weights.
    tparams = init_tparams(params)
    if prm.update_freq > 0:
        tparams_next = init_tparams(params_next)
    else:
        tparams_next = None
    if prm.learning.lower() == 'reinforce':
        R_mean = theano.shared(0.71 * np.ones((1,)), name='R_mean')
        R_std = theano.shared(np.ones((1,)), name='R_std')
        baseline_vars = {'R_mean': R_mean, 'R_std': R_std}
    else:
        baseline_vars = {}
    iin, out, updates, is_train, sup, max_hops, k_beam, mixer, f_pred, consider_constant \
        = build_model(tparams, tparams_next, baseline_vars, options)
    # get only parameters that are not in the exclude_params list
    tparams_ = OrderedDict([(kk, vv) for kk, vv in tparams.iteritems()
                            if kk not in exclude_params])
    grads = tensor.grad(out[0], wrt=itemlist(tparams_),
                        consider_constant=consider_constant)
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams_, grads, iin, out, updates)
    print 'Optimization'
    if prm.train_size == -1:
        train_size = len(q_train)
    else:
        train_size = prm.train_size
    if prm.valid_size == -1:
        valid_size = len(q_valid)
    else:
        valid_size = prm.valid_size
    if prm.test_size == -1:
        test_size = len(q_test)
    else:
        test_size = prm.test_size
    with open(prm.outpath, "a") as fout:
        fout.write("\n%d train examples" % len(q_train))
    with open(prm.outpath, "a") as fout:
        fout.write("\n%d valid examples" % len(q_valid))
    with open(prm.outpath, "a") as fout:
        fout.write("\n%d test examples" % len(q_test))
    history_errs = []
    best_p = None
    if prm.validFreq == -1:
        validFreq = len(q_train) / prm.batch_size_train
    else:
        validFreq = prm.validFreq
    if prm.saveFreq == -1:
        saveFreq = len(q_train) / prm.batch_size_train
    else:
        saveFreq = prm.saveFreq
    uidx = 0  # the number of updates done
    estop = False  # early stop
    start_time = time.time()
    experience = deque(maxlen=prm.replay_mem_size)    # experience replay memory as a circular buffer.
    experience_r = deque(maxlen=prm.replay_mem_size)  # reward of each entry in the replay memory.
    try:
        for eidx in xrange(prm.max_epochs):
            n_samples = 0
            # Get a new shuffled index for the training set.
            kf = get_minibatches_idx(len(q_train), prm.batch_size_train,
                                     shuffle=True)
            for _, train_index in kf:
                st = time.time()
                uidx += 1
                is_train.set_value(1.)
                max_hops.set_value(prm.max_hops_train)  # select training dataset
                k_beam.set_value(1)  # training does not use beam search
                # Select the random examples for this minibatch
                queries = [q_train[t].lower() for t in train_index]
                actions = [a_train[t] for t in train_index]
                if prm.learning.lower() == 'supervised':
                    sup.set_value(1.)  # select supervised mode
                else:
                    sup.set_value(0.)
                # Get correct actions (supervision signal)
                acts_p = get_acts(actions, prm.max_hops_train, k_beam=1)
                # MIXER
                if prm.mixer > 0 and prm.learning.lower() == 'reinforce':
                    mixer.set_value(max(0, prm.max_hops_train - uidx // prm.mixer))
                else:
                    if prm.learning.lower() == 'supervised':
                        mixer.set_value(prm.max_hops_train + 1)
                    else:
                        mixer.set_value(0)
                root_pages = get_root_pages(actions)
                # Get the BoW for the queries.
                q_i, q_m = utils.text2idx2(queries, vocab,
                                           prm.max_words_query * prm.n_consec)
                n_samples += len(queries)
                if uidx > 1 and prm.learning.lower() == 'q_learning':
                    # Randomly select experiences and convert them to numpy arrays.
                    idxs = np.random.choice(np.arange(len(experience)), size=len(queries))
                    rvs = []
                    for j in range(len(experience[idxs[0]])):
                        rv = []
                        for idx in idxs:
                            rv.append(experience[idx][j])
                        rvs.append(np.asarray(rv))
                else:
                    rvs = [np.zeros((len(queries), prm.max_words_query * prm.n_consec), dtype=np.float32),  # rs_q
                           np.zeros((len(queries), prm.max_words_query * prm.n_consec), dtype=np.float32),  # rs_q_m
                           np.zeros((len(queries), prm.max_hops_train + 1), dtype=np.int32),    # rl_idx
                           np.zeros((len(queries), prm.max_hops_train + 1), dtype=np.float32),  # rt
                           np.zeros((len(queries), prm.max_hops_train + 1), dtype=np.float32)   # rr
                           ]
                cost, R, l_idx, pages_idx, best_page_idx, best_answer, mask, dist \
                    = f_grad_shared(q_i, q_m, root_pages, acts_p, uidx, *rvs)
                f_update(prm.lrate)
                if prm.learning.lower() == 'q_learning':
                    # update weights of the next_q_val network.
                    if (prm.update_freq > 0 and uidx % prm.update_freq == 0) or (uidx == prm.replay_start):
                        for tk, tv in tparams.items():
                            if tk in tparams_next:
                                tparams_next[tk].set_value(tv.get_value().copy())
                # Only update the memory after freeze_mem or before replay_start.
                if (uidx < prm.replay_start or uidx > prm.freeze_mem) and prm.learning.lower() == 'q_learning':
                    # Update the replay memory.
                    t = np.zeros((len(queries), prm.max_hops_train + 1))
                    rR = np.zeros((len(queries), prm.max_hops_train + 1))
                    for i in range(len(queries)):
                        j = np.minimum(mask[i].sum(), prm.max_hops_train)
                        # If the agent chooses to stop or the episode ends,
                        # the reward will be the reward obtained with the chosen document.
                        rR[i, j] = R[i]
                        t[i, j] = 1.
                        add = True
                        if prm.selective_mem >= 0 and uidx > 1:
                            # Selective memory: keep the percentage of memories
                            # with reward=1 approximately equal to <selective_mem>.
                            pr = float(np.asarray(experience_r).sum()) / max(1., float(len(experience_r)))
                            if (pr < prm.selective_mem) ^ (rR[i, j] == 1.):  # xor
                                add = False
                        if add:
                            experience.append([q_i[i], q_m[i], l_idx[i], t[i], rR[i]])
                            experience_r.append(rR[i])
                if np.isnan(cost) or np.isinf(cost):
                    print 'NaN detected'
                    return 1., 1., 1.
                # if uidx % 100 == 0:
                #     vis_att(pages_idx[:, -1], queries[-1], alpha[:, -1, :], uidx, options)
                if np.mod(uidx, prm.dispFreq) == 0:
                    with open(prm.outpath, "a") as fout:
                        fout.write("\n\nQuery: " + queries[-1].replace("\n", " "))
                        fout.write('\nBest Answer: ' + utils.idx2text(best_answer[-1], options['vocabinv']))
                        fout.write('\nBest page: ' + options['wiki'].get_article_title(best_page_idx[-1]))
                        for i, page_idx in enumerate(pages_idx[:, -1]):
                            fout.write('\niteration: ' + str(i) + " page idx " + str(page_idx) +
                                       ' title: ' + options['wiki'].get_article_title(page_idx))
                        fout.write('\nEpoch ' + str(eidx) + ' Update ' + str(uidx) + ' Cost ' + str(cost) +
                                   ' Reward Mean ' + str(R.mean()) + ' Reward Max ' + str(R.max()) +
                                   ' Reward Min ' + str(R.min()) +
                                   ' Q-Value Max (avg per sample) ' + str(dist.max(2).mean()) +
                                   ' Q-Value Mean ' + str(dist.mean()))
                        # fout.write("\nCost Supervised: " + str(cost_sup))
                        # fout.write("\nCost RL: " + str(cost_RL))
                        fout.write("\nTime per Minibatch Update: " + str(time.time() - st))
                if prm.saveto and np.mod(uidx, saveFreq) == 0:
                    print 'Saving...',
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(prm.saveto, history_errs=history_errs, **params)
                    pkl.dump(options, open('%s.pkl' % prm.saveto, 'wb'), -1)
                    print 'Done'
                if np.mod(uidx, validFreq) == 0 or uidx == 1:
                    if prm.visited_pages_path:
                        shuffle = False
                    else:
                        shuffle = True
                    kf_train = get_minibatches_idx(len(q_train), prm.batch_size_pred,
                                                   shuffle=shuffle, max_samples=train_size)
                    kf_valid = get_minibatches_idx(len(q_valid), prm.batch_size_pred,
                                                   shuffle=shuffle, max_samples=valid_size)
                    kf_test = get_minibatches_idx(len(q_test), prm.batch_size_pred,
                                                  shuffle=shuffle, max_samples=test_size)
                    is_train.set_value(0.)
                    sup.set_value(0.)  # supervised mode off
                    mixer.set_value(0)  # no supervision
                    max_hops.set_value(prm.max_hops_pred)
                    k_beam.set_value(prm.k)
                    with open(prm.outpath, 'a') as fout:
                        fout.write('\n\nComputing Error Training Set')
                    train_err, train_R, train_accp, visited_pages_train = pred_error(f_pred, q_train, a_train, options, kf_train)
                    with open(prm.outpath, 'a') as fout:
                        fout.write('\n\nComputing Error Validation Set')
                    valid_err, valid_R, valid_accp, visited_pages_valid = pred_error(f_pred, q_valid, a_valid, options, kf_valid)
                    with open(prm.outpath, 'a') as fout:
                        fout.write('\n\nComputing Error Test Set')
                    test_err, test_R, test_accp, visited_pages_test = pred_error(f_pred, q_test, a_test, options, kf_test)
                    if prm.visited_pages_path:
                        pkl.dump([visited_pages_train, visited_pages_valid, visited_pages_test],
                                 open(prm.visited_pages_path, 'wb'))
                    history_errs.append([valid_err[-1], test_err[-1]])
                    if (uidx == 0 or valid_err[-1] <= np.array(history_errs)[:, 0].min()):
                        best_p = unzip(tparams)
                        bad_counter = 0
                    with open(prm.outpath, "a") as fout:
                        fout.write('\n[{per hop}, Avg] Train err ' + str(train_err) +
                                   ' Valid err ' + str(valid_err) + ' Test err ' + str(test_err))
                        fout.write('\n[{per hop}, Avg] Train R ' + str(train_R) +
                                   ' Valid R ' + str(valid_R) + ' Test R ' + str(test_R))
                        fout.write('\nAccuracy Page Actions Train ' + str(train_accp) +
                                   ' Valid ' + str(valid_accp) + ' Test ' + str(test_accp))
                    if (len(history_errs) > prm.patience and
                            valid_err[-1] >= np.array(history_errs)[:-prm.patience, 0].min()):
                        bad_counter += 1
                        if bad_counter > prm.patience:
                            print 'Early Stop!'
                            estop = True
                            break
            with open(prm.outpath, "a") as fout:
                fout.write('\nSeen %d samples' % n_samples)
            if estop:
                break
    except KeyboardInterrupt:
        print "Training interrupted"
    end_time = time.time()
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)
    is_train.set_value(0.)
    sup.set_value(0.)  # supervised mode off
    mixer.set_value(0)  # no supervision
    max_hops.set_value(prm.max_hops_pred)
    k_beam.set_value(prm.k)
    kf_train_sorted = get_minibatches_idx(len(q_train), prm.batch_size_train)
    train_err, train_R, train_accp, visited_pages_train = pred_error(f_pred, q_train, a_train, options, kf_train_sorted)
    valid_err, valid_R, valid_accp, visited_pages_valid = pred_error(f_pred, q_valid, a_valid, options, kf_valid)
    test_err, test_R, test_accp, visited_pages_test = pred_error(f_pred, q_test, a_test, options, kf_test)
    with open(prm.outpath, "a") as fout:
        fout.write('\n[{per hop}, Avg] Train err ' + str(train_err) +
                   ' Valid err ' + str(valid_err) + ' Test err ' + str(test_err))
        fout.write('\n[{per hop}, Avg] Train R ' + str(train_R) +
                   ' Valid R ' + str(valid_R) + ' Test R ' + str(test_R))
        fout.write('\nAccuracy Page Actions Train ' + str(train_accp) +
                   ' Valid ' + str(valid_accp) + ' Test ' + str(test_accp))
    if prm.saveto:
        np.savez(prm.saveto, train_err=train_err, valid_err=valid_err,
                 test_err=test_err, history_errs=history_errs, **best_p)
    with open(prm.outpath, "a") as fout:
        fout.write('\nThe code ran for %d epochs, with %f sec/epoch' %
                   ((eidx + 1), (end_time - start_time) / (1. * (eidx + 1))))
    with open(prm.outpath, "a") as fout:
        fout.write('\nTraining took %.1fs' % (end_time - start_time))
    return train_err, valid_err, test_err