def main(args):
    """Evaluation-oriented entry point: load dev data, restore a pickled
    dictionary, build the model and run an initial dev-set evaluation,
    additionally extracting attention weights via ``attention_func``.

    Returns (dev_acc, n_examples, prediction, all_examples, alpha) when
    ``args.test_only`` is set; otherwise falls through with no return.
    """
    logging.info('-' * 50)
    logging.info('Load data files..')
    question_belong = []
    if args.debug:
        # Debug mode: cap both splits at 100 examples for a quick run.
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, 100, relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, 100, relabeling=args.relabeling, question_belong=question_belong)
    else:
        # Training-data loading is disabled in this variant; only dev is used.
        # logging.info('*' * 10 + ' Train')
        # train_examples = utils.load_data(args.train_file, relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, args.max_dev, relabeling=args.relabeling, question_belong=question_belong)
    # args.num_train = len(train_examples[0])
    args.num_dev = len(dev_examples[0])
    logging.info('-' * 50)
    logging.info('Build dictionary..')
    # Dictionary is loaded from a fixed pickle rather than rebuilt from data.
    # word_dict = utils.build_dict(train_examples[0] + train_examples[1] + train_examples[2], args.max_vocab_size)
    word_dict = pickle.load(open("../../obj/dict.pkl", "rb"))
    logging.info('-' * 50)
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size, args.embedding_file)
    # gen_embeddings may change the effective vocab/embedding sizes; sync args.
    (args.vocab_size, args.embedding_size) = embeddings.shape
    logging.info('Compile functions..')
    # build_fn here also returns an attention function (5-tuple) — this
    # differs from the other main() variants in this file.
    train_fn, test_fn, attention_fn, params, all_params = build_fn(args, embeddings)
    logging.info('Done.')
    logging.info('-' * 50)
    logging.info(args)
    logging.info('-' * 50)
    logging.info('Intial test..')
    dev_x1, dev_x2, dev_x3, dev_y = utils.vectorize(dev_examples, word_dict, sort_by_len=not args.test_only, concat=args.concat)
    word_dict_r = {}
    word_dict_r[0] = "unk"
    assert len(dev_x1) == args.num_dev
    all_dev = gen_examples(dev_x1, dev_x2, dev_x3, dev_y, args.batch_size, args.concat)
    dev_acc, n_examples, prediction, all_examples = eval_acc(test_fn, all_dev)
    logging.info('Dev accuracy: %.2f %%' % dev_acc.mean())
    print(dev_acc.mean())
    # Extract per-example attention weights for analysis.
    alpha = attention_func(attention_fn, all_dev)
    if args.test_only:
        return dev_acc, n_examples, prediction, all_examples, alpha
def main(args):
    """Train/evaluate with hand-crafted features: split ``args.all_file``
    into train/dev by random sampling, run PCA-style feature preprocessing
    (``Prepocessing_func``), then train and track the best dev accuracy.

    Returns (best_dev_acc, best_train_acc); in ``test_only`` mode returns
    (dev_acc, 0) after the initial evaluation.
    """
    logging.info('-' * 50)
    logging.info('Load data files..')
    if not (args.test_only):
        logging.info('*' * 10 + ' All')
        # NOTE(review): max-examples is hard-coded to 100 here — presumably a
        # leftover debug cap; confirm against intended full-data behavior.
        all_examples = utils.load_data(args.all_file, 100, relabeling=args.relabeling)
        dev_ratio = args.dev_ratio
        sample_index = np.arange(len(all_examples[0]))
        # Fixed seed -> reproducible train/dev split across runs.
        random.seed(1000)
        dev_index = random.sample(sample_index, int(dev_ratio * len(sample_index)))
        train_index = np.setdiff1d(sample_index, dev_index)
        dev_examples = tuple_part(all_examples, dev_index)
        train_examples = tuple_part(all_examples, train_index)
        # Feature preprocessing: flatten, fit transform on train, apply to
        # both splits, then merge back into the example tuples.
        train_fea_flat_np = FeaExtract(train_examples[-1])
        dev_fea_flat_np = FeaExtract(dev_examples[-1])
        train_fea_flat_np2, dev_fea_flat_np2 = Prepocessing_func(
            train_fea_flat_np, dev_fea_flat_np, varian_ratio_tol=args.pca_ratio)
        train_fea_merge = FeaMerge(train_fea_flat_np2, train_examples[-1])
        dev_fea_merge = FeaMerge(dev_fea_flat_np2, dev_examples[-1])
        train_examples = train_examples[:-1] + (train_fea_merge, )
        dev_examples = dev_examples[:-1] + (dev_fea_merge, )
        args.num_train = len(train_examples[0])
    else:
        # logging.info('*' * 10 + ' Train')
        # train_examples = utils.load_data(args.train_file, relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, args.max_dev, relabeling=args.relabeling)
        # Apply a previously-fitted preprocessing transform (no fitting here).
        dev_fea_flat_np = FeaExtract(dev_examples[-1])
        dev_fea_flat_np2 = PrepocessingApply_func(dev_fea_flat_np)
        dev_fea_merge = FeaMerge(dev_fea_flat_np2, dev_examples[-1])
        dev_examples = dev_examples[:-1] + (dev_fea_merge, )
    args.num_dev = len(dev_examples[0])
    # Number of extra measurement features per example (last feature dim).
    args.mea_num = dev_examples[4][0].shape[-1]
    logging.info('-' * 50)
    logging.info('Build dictionary..')
    word_dict = pickle.load(open("../../obj/dict.pkl", "rb"))
    logging.info('-' * 50)
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size, args.embedding_file)
    (args.vocab_size, args.embedding_size) = embeddings.shape
    logging.info('Compile functions..')
    train_fn, test_fn, params, all_params = build_fn(args, embeddings)
    logging.info('Done.')
    logging.info('-' * 50)
    logging.info(args)
    logging.info('-' * 50)
    logging.info('Intial test..')
    dev_x1, dev_x2, dev_x3, dev_y, dev_x4 = utils.vectorize(
        dev_examples, word_dict, sort_by_len=not args.test_only, concat=args.concat)
    word_dict_r = {}
    word_dict_r[0] = "unk"
    assert len(dev_x1) == args.num_dev
    all_dev = gen_examples(dev_x1, dev_x2, dev_x3, dev_y, dev_x4, args.batch_size, args.concat)
    dev_acc, rediction = eval_acc(test_fn, all_dev)
    logging.info('Dev accuracy: %.2f %%' % dev_acc.mean())
    print(dev_acc.mean())
    best_dev_acc = dev_acc
    best_train_acc = 0
    if args.test_only:
        return dev_acc, best_train_acc
    utils.save_params(args.model_file, all_params, epoch=0, n_updates=0)
    # Training
    logging.info('-' * 50)
    logging.info('Start training..')
    train_x1, train_x2, train_x3, train_y, train_x4 = utils.vectorize(
        train_examples, word_dict, concat=args.concat)
    assert len(train_x1) == args.num_train
    start_time = time.time()
    n_updates = 0
    all_train = gen_examples(train_x1, train_x2, train_x3, train_y, train_x4, args.batch_size, args.concat)
    for epoch in range(args.num_epoches):
        np.random.shuffle(all_train)
        for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_x3, mb_mask3, mb_y, mb_x4, mb_mask4) in enumerate(all_train):
            # NOTE(review): x2 (question) minibatch inputs are not passed to
            # train_fn in this variant — presumably this model ignores them;
            # confirm against build_fn's signature.
            train_loss = train_fn(mb_x1, mb_mask1, mb_x3, mb_mask3, mb_y, mb_x4)
            # if idx % 100 == 0:
            # if epoch % 100 == 0:
            #     logging.info('#Examples = %d, max_len = %d' % (len(mb_x1), mb_x1.shape[1]))
            #     logging.info('Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)' % (epoch, idx, len(all_train), train_loss, time.time() - start_time))
            n_updates += 1
            if n_updates % args.eval_iter == 0:
                # Dump current parameter values for inspection.
                print([x.get_value() for x in params])
                print([x.get_value() for x in all_params])
                # Evaluate on a random train subsample of dev-set size.
                samples = sorted(
                    np.random.choice(args.num_train,
                                     min(args.num_train, args.num_dev),
                                     replace=False))
                sample_train = gen_examples(
                    [train_x1[k] for k in samples],
                    [train_x2[k] for k in samples],
                    # 4 candidate answers per question -> 4 x3 rows per sample.
                    [train_x3[k * 4 + o] for k in samples for o in range(4)],
                    [train_y[k] for k in samples],
                    [train_x4[k] for k in samples],
                    args.batch_size, args.concat)
                acc, pred = eval_acc(test_fn, sample_train)
                logging.info('Train accuracy: %.2f %%' % acc)
                train_acc, pred = eval_acc(test_fn, all_train)
                logging.info('train accuracy: %.2f %%' % train_acc)
                dev_acc, pred = eval_acc(test_fn, all_dev)
                logging.info('Dev accuracy: %.2f %%' % dev_acc)
                if dev_acc > best_dev_acc:
                    # New best on dev: record accuracies and checkpoint.
                    best_dev_acc = dev_acc
                    logging.info(
                        'Best dev accuracy: epoch = %d, n_udpates = %d, acc = %.2f %%'
                        % (epoch, n_updates, best_dev_acc))
                    best_train_acc = acc
                    logging.info(
                        'Best train accuracy: epoch = %d, n_udpates = %d, acc = %.2f %%'
                        % (epoch, n_updates, best_train_acc))
                    utils.save_params(
                        args.model_file,
                        all_params,
                        epoch=epoch,
                        n_updates=n_updates,
                    )
    return best_dev_acc, best_train_acc
def main(args):
    """Baseline multiple-choice training loop: load train/dev data, restore
    the pickled dictionary, compile Theano functions via ``build_fn``, then
    train for ``args.num_epoches`` epochs, checkpointing whenever dev
    accuracy improves. Returns early (None) when ``args.test_only``.
    """
    logging.info('-' * 50)
    logging.info('Load data files..')
    question_belong = []
    if args.debug:
        # Debug mode: cap both splits at 100 examples.
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, 100, relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, 100, relabeling=args.relabeling, question_belong=question_belong)
    else:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, args.max_dev, relabeling=args.relabeling, question_belong=question_belong)
    args.num_train = len(train_examples[0])
    args.num_dev = len(dev_examples[0])
    logging.info('-' * 50)
    logging.info('Build dictionary..')
    # Dictionary loaded from pickle instead of being rebuilt from the data.
    #word_dict = utils.build_dict(train_examples[0] + train_examples[1] + train_examples[2], args.max_vocab_size)
    word_dict = pickle.load(open("../obj/dict.pkl", "rb"))
    logging.info('-' * 50)
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size, args.embedding_file)  # EMBEDDING
    (args.vocab_size, args.embedding_size) = embeddings.shape
    logging.info('Compile functions..')
    train_fn, test_fn, params, all_params = build_fn(args, embeddings)
    logging.info('Done.')
    logging.info('-' * 50)
    logging.info(args)
    logging.info('-' * 50)
    logging.info('Intial test..')
    dev_x1, dev_x2, dev_x3, dev_y = utils.vectorize(
        dev_examples, word_dict, sort_by_len=not args.test_only, concat=args.concat)
    word_dict_r = {}
    word_dict_r[0] = "unk"
    assert len(dev_x1) == args.num_dev
    all_dev = gen_examples(dev_x1, dev_x2, dev_x3, dev_y, args.batch_size, args.concat)
    dev_acc, pred = eval_acc(test_fn, all_dev)
    logging.info('Dev accuracy: %.2f %%' % dev_acc)
    best_acc = dev_acc
    if args.test_only:
        return
    # Save initial parameters so a checkpoint always exists.
    utils.save_params(args.model_file, all_params, epoch=0, n_updates=0)
    # Training
    logging.info('-' * 50)
    logging.info('Start training..')
    train_x1, train_x2, train_x3, train_y = utils.vectorize(train_examples, word_dict, concat=args.concat)
    assert len(train_x1) == args.num_train
    start_time = time.time()
    n_updates = 0
    all_train = gen_examples(train_x1, train_x2, train_x3, train_y, args.batch_size, args.concat)
    for epoch in range(args.num_epoches):
        np.random.shuffle(all_train)
        for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_x3, mb_mask3, mb_y) in enumerate(all_train):
            train_loss = train_fn(mb_x1, mb_mask1, mb_x2, mb_mask2, mb_x3, mb_mask3, mb_y)
            if idx % 100 == 0:
                logging.info('#Examples = %d, max_len = %d' % (len(mb_x1), mb_x1.shape[1]))
                logging.info(
                    'Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)'
                    % (epoch, idx, len(all_train), train_loss, time.time() - start_time))
            n_updates += 1
            if n_updates % args.eval_iter == 0:
                # Estimate train accuracy on a random subsample of dev size.
                samples = sorted(
                    np.random.choice(args.num_train,
                                     min(args.num_train, args.num_dev),
                                     replace=False))
                sample_train = gen_examples(
                    [train_x1[k] for k in samples],
                    [train_x2[k] for k in samples],
                    # 4 answer options per question in the flattened x3 list.
                    [train_x3[k * 4 + o] for k in samples for o in range(4)],
                    [train_y[k] for k in samples],
                    args.batch_size, args.concat)
                acc, pred = eval_acc(test_fn, sample_train)
                logging.info('Train accuracy: %.2f %%' % acc)
                dev_acc, pred = eval_acc(test_fn, all_dev)
                logging.info('Dev accuracy: %.2f %%' % dev_acc)
                if dev_acc > best_acc:
                    # New best dev accuracy: checkpoint the model.
                    best_acc = dev_acc
                    logging.info(
                        'Best dev accuracy: epoch = %d, n_udpates = %d, acc = %.2f %%'
                        % (epoch, n_updates, dev_acc))
                    utils.save_params(args.model_file, all_params, epoch=epoch, n_updates=n_updates)
def main(args):
    """Entity-prediction (cloze-style) training loop: build vocabulary and
    entity dictionary from the training data, compile model functions, then
    train, checkpointing whenever dev accuracy improves.

    Returns None; progress and accuracies are reported via ``logging``.

    Bug fix vs. the original: inside the periodic evaluation the original
    logged ``eval_acc(test_fn, all_dev)`` without assigning it, then compared
    the *stale* ``dev_acc`` from the initial test against ``best_acc``
    (initialized to that same value), so the best-checkpoint branch could
    never fire. We now assign ``dev_acc`` before comparing — matching the
    corrected sibling variant of this function elsewhere in this file.
    """
    logging.info('-' * 50)
    logging.info('Load data files..')
    if args.debug:
        # Debug mode: cap both splits at 100 examples.
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, 100, relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, 100, relabeling=args.relabeling)
    else:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, args.max_dev, relabeling=args.relabeling)
    args.num_train = len(train_examples[0])
    args.num_dev = len(dev_examples[0])
    logging.info('-' * 50)
    logging.info('Build dictionary..')
    # Vocabulary from documents + questions; entity markers become the label set.
    word_dict = utils.build_dict(train_examples[0] + train_examples[1])
    entity_markers = list(
        set([w for w in word_dict.keys() if w.startswith('@entity')] + train_examples[2]))
    entity_markers = ['<unk_entity>'] + entity_markers  # index 0 = unknown entity
    entity_dict = {w: index for (index, w) in enumerate(entity_markers)}
    logging.info('Entity markers: %d' % len(entity_dict))
    args.num_labels = len(entity_dict)
    logging.info('-' * 50)
    # Load embedding file
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size, args.embedding_file)
    (args.vocab_size, args.embedding_size) = embeddings.shape
    logging.info('Compile functions..')
    train_fn, test_fn, params = build_fn(args, embeddings)
    logging.info('Done.')
    logging.info('-' * 50)
    logging.info(args)
    logging.info('-' * 50)
    logging.info('Intial test..')
    dev_x1, dev_x2, dev_l, dev_y = utils.vectorize(dev_examples, word_dict, entity_dict)
    assert len(dev_x1) == args.num_dev
    all_dev = gen_examples(dev_x1, dev_x2, dev_l, dev_y, args.batch_size)
    dev_acc = eval_acc(test_fn, all_dev)
    logging.info('Dev accuracy: %.2f %%' % dev_acc)
    best_acc = dev_acc
    if args.test_only:
        return
    # Save initial parameters so a checkpoint always exists.
    utils.save_params(args.model_file, params, epoch=0, n_updates=0)
    # Training
    logging.info('-' * 50)
    logging.info('Start training..')
    train_x1, train_x2, train_l, train_y = utils.vectorize(train_examples, word_dict, entity_dict)
    assert len(train_x1) == args.num_train
    start_time = time.time()
    n_updates = 0
    all_train = gen_examples(train_x1, train_x2, train_l, train_y, args.batch_size)
    for epoch in range(args.num_epoches):
        np.random.shuffle(all_train)
        for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y) in enumerate(all_train):
            logging.info('#Examples = %d, max_len = %d' % (len(mb_x1), mb_x1.shape[1]))
            train_loss = train_fn(mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y)
            logging.info('Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)' %
                         (epoch, idx, len(all_train), train_loss, time.time() - start_time))
            n_updates += 1
            if n_updates % args.eval_iter == 0:
                # Estimate train accuracy on a random subsample of dev size.
                samples = sorted(np.random.choice(args.num_train,
                                                  min(args.num_train, args.num_dev),
                                                  replace=False))
                sample_train = gen_examples([train_x1[k] for k in samples],
                                            [train_x2[k] for k in samples],
                                            train_l[samples],
                                            [train_y[k] for k in samples],
                                            args.batch_size)
                logging.info('Train accuracy: %.2f %%' % eval_acc(test_fn, sample_train))
                # FIX: assign the fresh dev accuracy before comparing; the
                # original only logged it, leaving dev_acc stale forever.
                dev_acc = eval_acc(test_fn, all_dev)
                logging.info('Dev accuracy: %.2f %%' % dev_acc)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    logging.info('Best dev accuracy: epoch = %d, n_udpates = %d, acc = %.2f %%' %
                                 (epoch, n_updates, dev_acc))
                    utils.save_params(args.model_file, params, epoch=epoch, n_updates=n_updates)
def main(args):
    """Entity-prediction training loop (variant with ``prepare_model``):
    builds vocabulary and entity dictionary from training data, compiles
    the model, and trains with periodic dev evaluation and checkpointing.

    Returns (train_fn, test_fn, params) when ``args.prepare_model`` is set;
    None otherwise.
    """
    logging.info('-' * 50)
    logging.info('Load data files..')
    if args.debug:
        # Debug mode: cap both splits at 100 examples.
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, 100, relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, 100, relabeling=args.relabeling)
    else:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, relabeling=args.relabeling)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, args.max_dev, relabeling=args.relabeling)
    args.num_train = len(train_examples[0])
    args.num_dev = len(dev_examples[0])
    logging.info('-' * 50)
    logging.info('Build dictionary..')
    # Vocabulary from documents + questions; '@entity*' tokens plus the
    # answer list form the label set, with index 0 reserved for unknowns.
    word_dict = utils.build_dict(train_examples[0] + train_examples[1])
    entity_markers = list(
        set([w for w in word_dict.keys() if w.startswith('@entity')] + train_examples[2]))
    entity_markers = ['<unk_entity>'] + entity_markers
    entity_dict = {w: index for (index, w) in enumerate(entity_markers)}
    logging.info('Entity markers: %d' % len(entity_dict))
    args.num_labels = len(entity_dict)
    logging.info('-' * 50)
    # Load embedding file
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size, args.embedding_file)
    (args.vocab_size, args.embedding_size) = embeddings.shape
    logging.info('Compile functions..')
    train_fn, test_fn, params = build_fn(args, embeddings)
    logging.info('Done.')
    if args.prepare_model:
        # Caller only wants the compiled functions, not a training run.
        return train_fn, test_fn, params
    logging.info('-' * 50)
    logging.info(args)
    logging.info('-' * 50)
    logging.info('Intial test..')
    dev_x1, dev_x2, dev_l, dev_y = utils.vectorize(dev_examples, word_dict, entity_dict)
    assert len(dev_x1) == args.num_dev
    all_dev = gen_examples(dev_x1, dev_x2, dev_l, dev_y, args.batch_size)
    dev_acc = eval_acc(test_fn, all_dev)
    logging.info('Dev accuracy: %.2f %%' % dev_acc)
    best_acc = dev_acc
    if args.test_only:
        return
    # Save initial parameters so a checkpoint always exists.
    utils.save_params(args.model_file, params, epoch=0, n_updates=0)
    # Training
    logging.info('-' * 50)
    logging.info('Start training..')
    train_x1, train_x2, train_l, train_y = utils.vectorize(
        train_examples, word_dict, entity_dict)
    assert len(train_x1) == args.num_train
    start_time = time.time()
    n_updates = 0
    all_train = gen_examples(train_x1, train_x2, train_l, train_y, args.batch_size)
    for epoch in range(args.num_epoches):
        np.random.shuffle(all_train)
        for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y) in enumerate(all_train):
            logging.info('#Examples = %d, max_len = %d' % (len(mb_x1), mb_x1.shape[1]))
            train_loss = train_fn(mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y)
            logging.info(
                'Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)'
                % (epoch, idx, len(all_train), train_loss, time.time() - start_time))
            n_updates += 1
            if n_updates % args.eval_iter == 0:
                # Estimate train accuracy on a random subsample of dev size.
                samples = sorted(
                    np.random.choice(args.num_train,
                                     min(args.num_train, args.num_dev),
                                     replace=False))
                sample_train = gen_examples([train_x1[k] for k in samples],
                                            [train_x2[k] for k in samples],
                                            train_l[samples],
                                            [train_y[k] for k in samples],
                                            args.batch_size)
                logging.info('Train accuracy: %.2f %%' % eval_acc(test_fn, sample_train))
                dev_acc = eval_acc(test_fn, all_dev)
                logging.info('Dev accuracy: %.2f %%' % dev_acc)
                if dev_acc > best_acc:
                    # New best dev accuracy: checkpoint the model.
                    best_acc = dev_acc
                    logging.info(
                        'Best dev accuracy: epoch = %d, n_udpates = %d, acc = %.2f %%'
                        % (epoch, n_updates, dev_acc))
                    utils.save_params(args.model_file, params, epoch=epoch, n_updates=n_updates)
def main(args):
    """K-fold cross-validation training with feature preprocessing, live
    matplotlib diagnostics (loss/accuracy and parameter/weight curves), and
    early stopping after ``args.update_fail_tol`` non-improving evaluations.

    Returns four parallel lists (one entry per fold):
    (best_dev_acc_vals, best_train_acc_vals, best_all_acc_vals,
    best_n_updates_vals).
    """
    logging.info('-' * 50)
    logging.info('Build dictionary..')
    word_dict = pickle.load(open("../../obj/dict.pkl", "rb"))
    logging.info('-' * 50)
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size, args.embedding_file)
    (args.vocab_size, args.embedding_size) = embeddings.shape
    logging.info('-' * 50)
    logging.info('Load data files..')
    # Per-fold bests, accumulated across the cross-validation loop.
    best_dev_acc_vals = []
    best_train_acc_vals = []
    best_all_acc_vals = []
    best_n_updates_vals = []
    for val_id in range(args.cross_val):
        # Fresh model per fold.
        logging.info('Compile functions..')
        train_fn, test_fn, params, all_params = build_fn(args, embeddings)
        logging.info('Done.')
        logging.info('-' * 50)
        logging.info(args)
        if not (args.test_only):
            logging.info('*' * 10 + ' All')
            all_examples = utils.load_data(args.all_file, args, relabeling=args.relabeling)
            sample_index = np.arange(len(all_examples[0]))
            # dev_ratio = args.dev_ratio
            # random.seed(args.random_seed)
            # dev_index= random.sample(sample_index, int(dev_ratio*len(sample_index)))
            # Contiguous slice of 1/cross_val of the data is this fold's dev set;
            # the last fold absorbs any remainder.
            val_sample_num = len(sample_index) * (1. / args.cross_val)
            if (val_id + 1) == args.cross_val:
                dev_index = sample_index[int(val_id * val_sample_num):]
            else:
                dev_index = sample_index[int(val_id * val_sample_num):int(
                    (val_id + 1) * val_sample_num)]
            train_index = np.setdiff1d(sample_index, dev_index)
            dev_examples = tuple_part(all_examples, dev_index)
            train_examples = tuple_part(all_examples, train_index)
            # Feature preprocessing: fit on train, apply to both splits,
            # then merge the transformed features back into the tuples.
            train_fea_flat_np = FeaExtract(train_examples[-1])
            dev_fea_flat_np = FeaExtract(dev_examples[-1])
            train_fea_flat_np2, dev_fea_flat_np2 = Prepocessing_func(
                train_fea_flat_np, dev_fea_flat_np, args)
            # train_fea_flat_np2 = train_fea_flat_np
            # dev_fea_flat_np2 = dev_fea_flat_np
            train_fea_merge = FeaMerge(train_fea_flat_np2, train_examples[-1])
            dev_fea_merge = FeaMerge(dev_fea_flat_np2, dev_examples[-1])
            train_examples = train_examples[:-1] + (train_fea_merge, )
            dev_examples = dev_examples[:-1] + (dev_fea_merge, )
            args.num_train = len(train_examples[0])
        else:
            logging.info('*' * 10 + ' Dev')
            dev_examples = utils.load_data(args.dev_file, args, args.max_dev, relabeling=args.relabeling)
            dev_fea_flat_np = FeaExtract(dev_examples[-1])
            dev_fea_flat_np2 = PrepocessingApply_func(dev_fea_flat_np, args)
            # NOTE(review): the next line discards the preprocessed features,
            # reverting to the raw ones — looks intentional for debugging,
            # but confirm before relying on PrepocessingApply_func here.
            dev_fea_flat_np2 = dev_fea_flat_np
            dev_fea_merge = FeaMerge(dev_fea_flat_np2, dev_examples[-1])
            dev_examples = dev_examples[:-1] + (dev_fea_merge, )
        args.num_dev = len(dev_examples[0])
        logging.info('-' * 50)
        logging.info('Intial test..')
        dev_x1, dev_x2, dev_x3, dev_y, dev_x4 = utils.vectorize(
            dev_examples, word_dict, sort_by_len=not args.test_only, concat=args.concat)
        word_dict_r = {}
        word_dict_r[0] = "unk"
        assert len(dev_x1) == args.num_dev
        all_dev = gen_examples(dev_x1, dev_x2, dev_x3, dev_y, dev_x4,
                               args.batch_size, args.concat)
        dev_acc, _, prediction, test_probs, weights, _ = eval_acc(
            test_fn, all_dev)
        logging.info('Dev accuracy: %.2f %%' % dev_acc.mean())
        print(dev_acc.mean())
        best_dev_acc = dev_acc
        best_train_acc = 0
        best_all_acc = 0
        if args.test_only:
            # Test-only mode: record the single evaluation and return.
            best_dev_acc_vals.append(best_dev_acc)
            best_train_acc_vals.append(best_train_acc)
            best_all_acc_vals.append(best_all_acc)
            best_n_updates_vals.append(0)
            return best_dev_acc_vals, best_train_acc_vals, best_all_acc_vals, best_n_updates_vals
        utils.save_params(args.model_file, all_params, epoch=0, n_updates=0)
        # Training
        logging.info('-' * 50)
        logging.info('Start training..')
        train_x1, train_x2, train_x3, train_y, train_x4 = utils.vectorize(
            train_examples, word_dict, concat=args.concat)
        assert len(train_x1) == args.num_train
        start_time = time.time()
        n_updates = 0
        all_train = gen_examples(train_x1, train_x2, train_x3, train_y, train_x4,
                                 args.batch_size, args.concat)
        # Baseline (pre-training) metrics, also dumped to ini.pickle.
        ini_train_acc, ini_train_label, _, ini_train_probs, train_weight, train_loss = eval_acc(
            test_fn, all_train)
        logging.info('initial train accuracy: acc = %.2f %%' % (ini_train_acc))
        pickle.dump(
            {
                'train_acc': ini_train_acc,
                'train_label': ini_train_label,
                'train_probs': ini_train_probs
            }, open('ini.pickle', 'wb'))
        ini_dev_acc, _, _, ini_dev_probs, _, _ = eval_acc(test_fn, all_dev)
        logging.info('initial dev accuracy: acc = %.2f %%' % (ini_dev_acc))
        ini_all_acc, _, _, ini_all_probs, _, _ = eval_acc(
            test_fn, all_train + all_dev)
        logging.info('initial all accuracy: acc = %.2f %%' % (ini_all_acc))
        best_dev_acc = 0
        best_n_updates = n_updates + 0
        fail_update_num = 0   # consecutive evals without dev improvement
        break_epoch = False   # set by early stopping to exit the epoch loop
        # Curves accumulated for the diagnostic plots / final.pickle dump.
        loss_curve = [train_loss]
        train_acc_curve = [ini_train_acc]
        dev_acc_curve = [ini_dev_acc]
        weight_curve = []
        weight_curve.append(train_weight)
        para_curve = []
        para_curve.append([x.get_value() for x in params])
        logging.info([x.get_value() for x in params])
        for epoch in range(args.num_epoches):
            if break_epoch:
                fig_train.savefig(args.Type + str(val_id) + '.png')
                break
            # np.random.shuffle(all_train)
            for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_x3, mb_mask3, mb_y,
                      mb_x4, mb_mask4) in enumerate(all_train):
                # early stopping
                if fail_update_num > args.update_fail_tol:
                    break_epoch = True
                    break
                train_loss = train_fn(mb_x1, mb_mask1, mb_x3, mb_mask3, mb_y,
                                      mb_x4)
                #if idx % 100 == 0:
                # if n_updates % int(args.eval_iter/4) == 0:
                # if n_updates % args.eval_iter == 0:
                # NOTE(review): evaluation is gated on epoch (not n_updates)
                # modulo eval_iter, so it fires for every batch of qualifying
                # epochs — confirm this is the intended cadence.
                if epoch % args.eval_iter == 0:
                    logging.info('#Examples = %d, max_len = %d' %
                                 (len(mb_x1), mb_x1.shape[1]))
                    logging.info(
                        'Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)'
                        % (epoch, idx, len(all_train), train_loss,
                           time.time() - start_time))
                n_updates += 1
                # if n_updates % args.eval_iter == 0:
                if epoch % args.eval_iter == 0:
                    logging.info([x.get_value() for x in params])
                    #print([x.get_value() for x in all_params])
                    samples = sorted(
                        np.random.choice(args.num_train,
                                         min(args.num_train, args.num_dev),
                                         replace=False))
                    sample_train = gen_examples([train_x1[k] for k in samples],
                                                [train_x2[k] for k in samples], [
                                                    train_x3[k * 4 + o]
                                                    for k in samples
                                                    for o in range(4)
                                                ], [train_y[k] for k in samples],
                                                [train_x4[k] for k in samples],
                                                args.batch_size, args.concat)
                    acc, _, pred, _, _, _ = eval_acc(test_fn, sample_train)
                    #logging.info('Train accuracy: %.2f %%' % acc)
                    train_acc, train_label, pred, train_probs, train_weight, train_loss = eval_acc(
                        test_fn, all_train)
                    logging.info('train accuracy: %.2f %%' % train_acc)
                    dev_acc, _, pred, _, weights, _ = eval_acc(
                        test_fn, all_dev)
                    logging.info('Dev accuracy: %.2f %%' % dev_acc)
                    all_acc, _, _, _, _, _ = eval_acc(test_fn,
                                                      all_train + all_dev)
                    logging.info('All accuracy: %.2f %%' % all_acc)
                    if args.show_loss:
                        # Append to and periodically redraw the loss/accuracy figure.
                        loss_curve.append(train_loss)
                        train_acc_curve.append(train_acc)
                        dev_acc_curve.append(dev_acc)
                        if (epoch % (20 * args.eval_iter) == 0) & (idx == 0):
                            fig_train = plt.figure(num='training')
                            plt.clf()
                            plt.subplot(121)
                            plt.plot(-np.array(loss_curve), '.--')
                            plt.title('train_loss')
                            plt.subplot(122)
                            plt.title('accuracy')
                            plt.plot(train_acc_curve, '.--', label='train_acc')
                            plt.plot(dev_acc_curve, '.--', label='dev_acc')
                            plt.legend()
                            plt.pause(1)
                    weight_curve.append(train_weight)
                    para_curve.append([x.get_value() for x in params])
                    if (epoch % (20 * args.eval_iter) == 0) & (idx == 0):
                        # Parameter-trajectory figure: MLP weights/bias and word weights.
                        fig_train_para = plt.figure(num='para')
                        plt.clf()
                        plt.subplot(121)
                        mlp_w_curve = np.array([
                            para_tmp[0].reshape((-1)) for para_tmp in para_curve
                        ])
                        # Column -5 is plotted separately in black; the rest in grey.
                        mlp_w_curve0 = np.concatenate(
                            (mlp_w_curve[:, :-5], mlp_w_curve[:, -4:]), axis=1)
                        mlp_w_curve1 = mlp_w_curve[:, -5]
                        plt.plot(mlp_w_curve0, '.--', color='grey')
                        plt.plot(mlp_w_curve1, '.--', color='k')
                        mlp_b_curve = np.array([
                            para_tmp[1].reshape((-1)) for para_tmp in para_curve
                        ])
                        plt.plot(mlp_b_curve, '.--', color='r')
                        plt.title('mlp_w')
                        plt.subplot(122)
                        word_weight = np.array([
                            np.array(weight_tmp[0]).reshape((-1))
                            for weight_tmp in weight_curve
                        ])
                        plt.plot(word_weight, '.--')
                        plt.title('word_weight')
                        plt.pause(1)
                    #print(weights[0])
                    if dev_acc > best_dev_acc:
                        # New best on dev: reset the failure counter,
                        # record all metrics, and checkpoint.
                        fail_update_num = 0
                        best_n_updates = n_updates + 0
                        best_train_acc = train_acc
                        logging.info(
                            'Best train accuracy: epoch = %d, n_udpates = %d, acc = %.2f %%'
                            % (epoch, n_updates, best_train_acc))
                        best_dev_acc = dev_acc
                        logging.info(
                            'Best dev accuracy: epoch = %d, n_udpates = %d, acc = %.2f %%'
                            % (epoch, n_updates, best_dev_acc))
                        best_all_acc = all_acc
                        logging.info(
                            'Best all accuracy: epoch = %d, n_udpates = %d, acc = %.2f %%'
                            % (epoch, n_updates, best_all_acc))
                        utils.save_params(
                            args.model_file,
                            all_params,
                            epoch=epoch,
                            n_updates=n_updates,
                        )
                    else:
                        # updates that dev accuracy does not increase
                        fail_update_num = fail_update_num + 1
                        logging.info('failed updates: ' + str(fail_update_num))
        logging.info('*' * 50)
        # Record this fold's results and persist figures/curves.
        best_dev_acc_vals.append(best_dev_acc)
        best_train_acc_vals.append(best_train_acc)
        best_all_acc_vals.append(best_all_acc)
        best_n_updates_vals.append(best_n_updates)
        fig_train.savefig(args.Type + str(val_id) + '.png')
        fig_train_para.savefig(args.Type + str(val_id) + '_para.png')
        pickle.dump(
            {
                'loss_curve': loss_curve,
                'train_acc_curve': train_acc_curve,
                'dev_acc_curve': dev_acc_curve,
                'weight_curve': weight_curve,
                'para_curve': para_curve
            }, open('final.pickle', 'wb'))
    return best_dev_acc_vals, best_train_acc_vals, best_all_acc_vals, best_n_updates_vals
def main(args):
    """Cloze-style entity-prediction training with optional CNN-format data,
    external scoring of written predictions, and early stopping after three
    consecutive epochs without dev improvement. Returns None.
    """
    logging.info('-' * 50)
    logging.info('Load data files..')
    if args.debug:
        # Debug mode: tiny caps (5 train / 100 dev examples).
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, 5, relabeling=args.relabeling,
                                         remove_notfound=args.remove_notfound)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, 100, relabeling=args.relabeling,
                                       remove_notfound=False)
    #elif args.test_only:
    #    logging.info('*' * 10 + ' Train')
    #    #train_examples = utils.load_cnn_data(args.train_file, relabeling=args.relabeling)  # docs, qs, ans
    #    train_examples = utils.load_data(args.train_file, relabeling=args.relabeling, remove_notfound=args.remove_notfound)  # docs, qs, ans
    #    logging.info('*' * 10 + ' Dev')
    #    dev_examples = utils.load_data(args.dev_file, args.max_dev, relabeling=args.relabeling,
    #                                   remove_notfound=False)
    elif args.cnn_train:
        # CNN/DailyMail-format corpus.
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_cnn_data(args.train_file, relabeling=args.relabeling,
                                             has_ids=args.train_has_ids)  # docs, qs, ans
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_cnn_data(args.dev_file, args.max_dev, relabeling=args.relabeling,
                                           has_ids=args.dev_has_ids)
    else:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, relabeling=args.relabeling,
                                         remove_notfound=args.remove_notfound)  # docs, qs, ans
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, args.max_dev, relabeling=args.relabeling,
                                       remove_notfound=False)
    args.num_train = len(train_examples[0])
    args.num_dev = len(dev_examples[0])
    logging.info('-' * 50)
    logging.info('Build dictionary..')
    word_dict = utils.build_dict(train_examples[0] + train_examples[1],
                                 # + dev_examples[0] + dev_examples[1],
                                 max_words=args.max_words)  # docs+qs
    # '@entity*' tokens plus answers form the label set; index 0 = unknown.
    entity_markers = list(set([w for w in word_dict.keys()
                               if w.startswith('@entity')] + train_examples[2]))
    entity_markers = ['<unk_entity>'] + entity_markers
    entity_dict = {w: index for (index, w) in enumerate(entity_markers)}
    # Inverse mapping used to turn predicted indices back into entity strings.
    inv_entity_dict = {index: w for w, index in entity_dict.items()}
    assert len(entity_dict) == len(inv_entity_dict)
    logging.info('Entity markers: %d' % len(entity_dict))
    args.num_labels = len(entity_dict)
    logging.info('-' * 50)
    # Load embedding file
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size, args.embedding_file)
    (args.vocab_size, args.embedding_size) = embeddings.shape
    logging.info('Compile functions..')
    train_fn, test_fn, params = build_fn(args, embeddings)
    logging.info('Done.')
    logging.info('-' * 50)
    logging.info(args)
    logging.info('-' * 50)
    logging.info('Intial test..')
    dev_x1, dev_x2, dev_l, dev_y, dev_ids = utils.vectorize(dev_examples, word_dict, entity_dict,
                                                            remove_notfound=False,
                                                            relabeling=args.relabeling)
    if dev_ids is not None:
        assert len(dev_y) == len(dev_ids)
    assert len(dev_x1) == args.num_dev
    all_dev = gen_examples(dev_x1, dev_x2, dev_l, dev_y, args.batch_size)
    dev_acc, dev_preds = eval_acc(test_fn, all_dev)
    if dev_ids is not None:
        assert len(dev_ids) == len(dev_preds) == len(dev_y)
        dev_preds_data = to_output_preds(dev_ids, dev_preds, inv_entity_dict, args.relabeling)
    logging.info('Dev accuracy: %.2f %%' % dev_acc)
    best_acc = dev_acc
    if args.log_file is not None:
        # run_name (log file minus ".log") prefixes all output artifacts.
        assert args.log_file.endswith(".log")
        run_name = args.log_file[:args.log_file.find(".log")]
        if dev_ids is not None:
            # Write predictions and score them with the external evaluator;
            # treat the run as "test" when the dev filename contains "test".
            preds_file_name = run_name + ".preds"
            utils.write_preds(dev_preds_data, preds_file_name)
            utils.external_eval(preds_file_name, run_name + ".preds.scores",
                                eval_data="test" if "test" in os.path.basename(args.dev_file) else "dev")
    if args.test_only:
        return
    if args.log_file is not None:
        utils.save_params(run_name + ".model", params, epoch=0, n_updates=0)
    # Training
    logging.info('-' * 50)
    logging.info('Start training..')
    train_x1, train_x2, train_l, train_y, train_ids = utils.vectorize(train_examples, word_dict, entity_dict,
                                                                      remove_notfound=args.remove_notfound,
                                                                      relabeling=args.relabeling)
    assert len(train_x1) == args.num_train
    start_time = time.time()
    n_updates = 0
    # Accuracy histories feeding utils.update_plot.
    train_accs = []
    dev_accs = []
    all_train = gen_examples(train_x1, train_x2, train_l, train_y, args.batch_size)
    improved = []  # one flag per epoch: did dev accuracy improve?
    for epoch in range(args.num_epoches):
        ep_acc_improved = False
        np.random.shuffle(all_train)
        for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y) in enumerate(all_train):
            logging.info('#Examples = %d, max_len = %d' % (len(mb_x1), mb_x1.shape[1]))
            train_loss = train_fn(mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y)
            logging.info('Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)' %
                         (epoch, idx, len(all_train), train_loss, time.time() - start_time))
            n_updates += 1
            if n_updates % args.eval_iter == 0:
                # Estimate train accuracy on a random subsample of dev size.
                samples = sorted(np.random.choice(args.num_train,
                                                  min(args.num_train, args.num_dev),
                                                  replace=False))
                sample_train = gen_examples([train_x1[k] for k in samples],
                                            [train_x2[k] for k in samples],
                                            train_l[samples],
                                            [train_y[k] for k in samples],
                                            args.batch_size)
                train_acc, train_preds = eval_acc(test_fn, sample_train)
                train_accs.append(train_acc)
                logging.info('Train accuracy: %.2f %%' % train_acc)
                dev_acc, dev_preds = eval_acc(test_fn, all_dev)
                dev_accs.append(dev_acc)
                logging.info('Dev accuracy: %.2f %%' % dev_acc)
                # NOTE(review): update_plot uses args.log_file directly and
                # will fail if log_file is None — confirm log_file is required.
                utils.update_plot(args.eval_iter, train_accs, dev_accs,
                                  file_name=args.log_file + ".html")
                if dev_acc > best_acc:
                    ep_acc_improved = True
                    best_acc = dev_acc
                    logging.info('Best dev accuracy: epoch = %d, n_udpates = %d, acc = %.2f %%' %
                                 (epoch, n_updates, dev_acc))
                    if args.log_file is not None:
                        utils.save_params(run_name + ".model", params,
                                          epoch=epoch, n_updates=n_updates)
                    if dev_ids is not None:
                        # NOTE(review): preds_file_name/run_name are only bound
                        # when log_file was set at the initial test — this
                        # branch presumably assumes log_file is not None too.
                        dev_preds_data = to_output_preds(dev_ids, dev_preds,
                                                         inv_entity_dict, args.relabeling)
                        utils.write_preds(dev_preds_data, preds_file_name)
                        utils.external_eval(preds_file_name, run_name + ".preds.scores",
                                            eval_data="dev")
        improved.append(ep_acc_improved)
        # early stop
        if len(improved) > 25 and sum(improved[-3:]) == 0:
            break
def main(args):
    """Train/evaluate a bidirectional-RNN + bilinear-attention reader (TF 1.x).

    Loads train/dev data, builds the word/entity dictionaries, constructs the
    computation graph, then either evaluates a saved model (``args.test_only``)
    or trains with periodic dev evaluation and checkpointing.

    Fixes vs. original: the missing-test-file error is now *raised* instead of
    being returned as a value (which callers would silently ignore), and the
    ``None`` check uses identity comparison.
    """
    logging.info('-' * 50 + '')
    logging.info('Loading data...')
    if args.debug:
        # Debug mode caps both splits at 100 examples for fast iteration.
        train_examples = utils.load_data(args.train_file, 100)
        dev_examples = utils.load_data(args.dev_file, 100)
    else:
        train_examples = utils.load_data(args.train_file)
        dev_examples = utils.load_data(args.dev_file)
    args.num_train = len(train_examples[1])
    args.num_dev = len(dev_examples[1])

    logging.info('-' * 50)
    logging.info('Building dictionary...')
    # Vocabulary from documents + questions; answers (examples[2]) contribute
    # only entity markers below.
    word_dict = utils.build_dict(train_examples[0] + train_examples[1])
    entity_markers = list(
        set([w for w in word_dict.keys() if w.startswith('@entity')] +
            train_examples[2]))
    # Index 0 is reserved for unknown entities.
    entity_markers = ['<entity_unk>'] + entity_markers
    entity_dict = {w: i for (i, w) in enumerate(entity_markers)}
    logging.info('# of Entity Markers: %d' % len(entity_dict))
    args.num_labels = len(entity_dict)

    logging.info('-' * 50)
    logging.info('Generating embedding...')
    embeddings = utils.gen_embeddings(word_dict, args.embedding_size,
                                      args.embedding_file)
    embeddings = embeddings.astype('float32')
    # gen_embeddings may change the embedding size (e.g. from the file), so
    # refresh both vocab_size and embedding_size from the actual matrix.
    args.vocab_size, args.embedding_size = embeddings.shape

    logging.info('-' * 50)
    logging.info('Creating TF computation graph...')
    if args.rnn_type == 'lstm':
        logging.info('Using LSTM Cells')
    elif args.rnn_type == 'gru':
        logging.info('Using GRU Cells')

    # tf.reset_default_graph()
    d_input = tf.placeholder(dtype=tf.int32, shape=(None, None),
                             name="d_input")
    q_input = tf.placeholder(
        dtype=tf.int32, shape=(None, None),
        name="q_input")  # [batch_size, max_seq_length_for_batch]
    l_mask = tf.placeholder(dtype=tf.float32, shape=(None, None),
                            name="l_mask")  # [batch_size, entity num]
    y = tf.placeholder(dtype=tf.int32, shape=None,
                       name="label")  # batch size vector
    y_1hot = tf.placeholder(
        dtype=tf.float32, shape=(None, None),
        name="label_1hot")  # onehot encoding of y [batch_size, entitydict]
    training = tf.placeholder(dtype=tf.bool)

    word_embeddings = tf.get_variable(
        "glove", shape=(args.vocab_size, args.embedding_size),
        initializer=tf.constant_initializer(embeddings))
    W_bilinear = tf.Variable(
        tf.random_uniform((2 * args.hidden_size, 2 * args.hidden_size),
                          minval=-0.01, maxval=0.01))

    with tf.variable_scope(
            'd_encoder'):  # Encoding Step for Passage (d_ for document)
        # Apply embeddings: [batch, max passage length in batch, GloVe Dim]
        d_embed = tf.nn.embedding_lookup(word_embeddings, d_input)
        d_embed_dropout = tf.layers.dropout(
            d_embed, rate=args.dropout_rate,
            training=training)  # Apply Dropout to embedding layer
        if args.rnn_type == 'lstm':
            d_cell_fw = rnn.LSTMCell(args.hidden_size)
            d_cell_bw = rnn.LSTMCell(args.hidden_size)
        elif args.rnn_type == 'gru':
            d_cell_fw = rnn.GRUCell(
                args.hidden_size
            )  # TODO: kernel_initializer=tf.random_normal_initializer(0,0.1) not working for 1.1
            d_cell_bw = rnn.GRUCell(args.hidden_size)
        d_outputs, _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw,
                                                       d_embed_dropout,
                                                       dtype=tf.float32)
        d_output = tf.concat(
            d_outputs, axis=-1
        )  # [batch, len, h], len is the max passage length, h is hidden size

    with tf.variable_scope('q_encoder'):  # Encoding Step for Question
        q_embed = tf.nn.embedding_lookup(word_embeddings, q_input)
        q_embed_dropout = tf.layers.dropout(q_embed, rate=args.dropout_rate,
                                            training=training)
        if args.rnn_type == 'lstm':
            q_cell_fw = rnn.LSTMCell(args.hidden_size)
            q_cell_bw = rnn.LSTMCell(args.hidden_size)
        elif args.rnn_type == 'gru':
            q_cell_fw = rnn.GRUCell(args.hidden_size)
            q_cell_bw = rnn.GRUCell(args.hidden_size)
        q_outputs, q_laststates = tf.nn.bidirectional_dynamic_rnn(
            q_cell_fw, q_cell_bw, q_embed_dropout, dtype=tf.float32)
        # For LSTM the last state is an (c, h) tuple; take h from each
        # direction. For GRU the state itself is the output.
        if args.rnn_type == 'lstm':
            q_output = tf.concat([q_laststates[0][-1], q_laststates[1][-1]],
                                 axis=-1)  # (batch, h)
        elif args.rnn_type == 'gru':
            q_output = tf.concat(q_laststates, axis=-1)  # (batch, h)

    with tf.variable_scope('bilinear'):  # Bilinear Layer (Attention Step)
        # M computes the similarity between each passage word and the entire
        # question encoding
        M = d_output * tf.expand_dims(tf.matmul(q_output, W_bilinear),
                                      axis=1)  # [batch, h] -> [batch, 1, h]
        # alpha represents the normalized weights representing how relevant
        # the passage word is to the question
        alpha = tf.nn.softmax(tf.reduce_sum(M, axis=2))  # [batch, len]
        # this output contains the weighted combination of all contextual
        # embeddings
        bilinear_output = tf.reduce_sum(
            d_output * tf.expand_dims(alpha, axis=2), axis=1)  # [batch, h]

    with tf.variable_scope('dense'):  # Prediction Step
        # the final output has dimension [batch, entity#], giving the
        # probabilities of an entity being the answer for examples
        final_prob = tf.layers.dense(
            bilinear_output, units=args.num_labels, activation=tf.nn.softmax,
            kernel_initializer=tf.random_uniform_initializer(
                minval=-0.01, maxval=0.01))  # [batch, entity#]

    pred = final_prob * l_mask  # ignore entities that don't appear in the passage
    train_pred = pred / tf.expand_dims(
        tf.reduce_sum(pred, axis=1),
        axis=1)  # redistribute probabilities ignoring certain labels
    # Clip to avoid log(0) in the cross-entropy below.
    train_pred = tf.clip_by_value(train_pred, 1e-7, 1.0 - 1e-7)

    test_pred = tf.cast(tf.argmax(pred, axis=-1), tf.int32)
    # Count of correct predictions in the batch (not a ratio).
    acc = tf.reduce_sum(tf.cast(tf.equal(test_pred, y), tf.int32))

    loss_op = tf.reduce_mean(
        -tf.reduce_sum(y_1hot * tf.log(train_pred), reduction_indices=[1]))
    optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=args.learning_rate)
    train_op = optimizer.minimize(loss_op)
    logging.info('Done!')

    logging.info('-' * 50)
    logging.info('Printing args...')
    logging.info(args)

    logging.info('-' * 50)
    logging.info('Initial Test...')
    dev_x1, dev_x2, dev_l, dev_y = utils.vectorize(dev_examples, word_dict,
                                                   entity_dict)
    all_dev = gen_examples(dev_x1, dev_x2, dev_l, dev_y, args.batch_size)
    dev_acc = 0.
    # TODO: first dev accuracy displays here
    logging.info('Dev Accuracy: %.2f %%' % dev_acc)
    best_acc = dev_acc

    saver = tf.train.Saver()

    logging.info('-' * 50)
    logging.info('Testing...')
    if args.test_only:
        # Fix: raise (not return) so a missing test file actually fails.
        if args.test_file is None:
            raise ValueError("No test file specified")
        test_examples = utils.load_data(args.test_file)
        test_x1, test_x2, test_l, test_y = utils.vectorize(
            test_examples, word_dict, entity_dict)
        all_test = gen_examples(test_x1, test_x2, test_l, test_y,
                                args.batch_size)
        with tf.Session() as sess:
            # saver = tf.train.import_meta_graph(args.model_path + '.meta')
            saver.restore(sess, args.model_path)  # TODO: which file to restore?
            correct = 0
            n_examples = 0
            for t_x1, t_mask1, t_x2, t_mask2, t_l, t_y in all_test:
                correct += sess.run(acc, feed_dict={
                    d_input: t_x1,
                    q_input: t_x2,
                    y: t_y,
                    l_mask: t_l,
                    training: False
                })
                n_examples += len(t_x1)
            test_acc = correct * 100. / n_examples
            logging.info('Test Accuracy: %.2f %%' % test_acc)
        return

    logging.info('-' * 50)
    logging.info('Start training...')
    train_x1, train_x2, train_l, train_y = utils.vectorize(
        train_examples, word_dict, entity_dict)
    all_train = gen_examples(train_x1, train_x2, train_l, train_y,
                             args.batch_size)
    init = tf.global_variables_initializer()
    start_time = time.time()
    n_updates = 0
    with tf.Session() as sess:
        sess.run(init)
        for e in range(args.num_epoches):
            np.random.shuffle(all_train)
            for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l,
                      mb_y) in enumerate(all_train):
                logging.info(
                    'Batch Size = %d, # of Examples = %d, max_len = %d' %
                    (mb_x1.shape[0], len(mb_x1), mb_x1.shape[1]))
                y_label = np.zeros((mb_x1.shape[0], args.num_labels))
                for r, i in enumerate(
                        mb_y):  # convert (batch) -> (batch, entity_size)
                    y_label[r][i] = 1.
                _, train_loss = sess.run([train_op, loss_op], feed_dict={
                    d_input: mb_x1,
                    q_input: mb_x2,
                    y_1hot: y_label,
                    l_mask: mb_l,
                    training: True
                })
                logging.info(
                    'Epoch = %d, Iter = %d (max = %d), Loss = %.2f, Elapsed Time = %.2f (s)'
                    % (e, idx, len(all_train), train_loss,
                       time.time() - start_time))
                n_updates += 1
                if n_updates % args.eval_iter == 0:
                    saver.save(sess, args.model_path, global_step=e)
                    correct = 0
                    n_examples = 0
                    for d_x1, d_mask1, d_x2, d_mask2, d_l, d_y in all_dev:
                        correct += sess.run(acc, feed_dict={
                            d_input: d_x1,
                            q_input: d_x2,
                            y: d_y,
                            l_mask: d_l,
                            training: False
                        })
                        n_examples += len(d_x1)
                    dev_acc = correct * 100. / n_examples
                    logging.info('Dev Accuracy: %.2f %%' % dev_acc)
                    if dev_acc > best_acc:
                        best_acc = dev_acc
                        logging.info(
                            'Best Dev Accuracy: epoch = %d, n_updates (iter) = %d, acc = %.2f %%'
                            % (e, n_updates, dev_acc))
        logging.info('-' * 50)
        logging.info('Training Finished...')
        logging.info("Model saved in file: %s" %
                     saver.save(sess, args.model_path))
def main(args):
    """Train/evaluate a multi-relation classifier with top-k accuracy.

    Loads train/dev/test data, builds per-relation dictionaries, compiles
    train/test functions via ``build_fn``, runs an initial dev/test
    evaluation, then (unless ``args.test_only``) trains with oversampling,
    periodic dev evaluation, checkpointing on improvement, and early exit
    after ``MAX_NO_PROGRESS`` evaluations without progress.

    Fixes vs. original: the dictionary-building block was executed twice
    verbatim (duplicate ``utils.build_dict`` call); the duplicate and an
    unused ``question_belong`` local were removed.
    """
    logging.info('-' * 50)
    logging.info('Load data files..')
    if args.debug:
        # Debug mode caps both splits at 100 examples and reuses dev as test.
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, 100)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, 100)
        test_examples = dev_examples
    else:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file)
        test_examples = utils.load_data(args.test_file)
    args.num_train = len(train_examples)
    args.num_dev = len(dev_examples)
    args.relations = len(train_examples[0])

    logging.info('-' * 50)
    logging.info('Build dictionary..')
    # Fix: this build was duplicated in the original; run it once.
    word_dicts, inv_word_dicts = utils.build_dict(train_examples, args.max_cat)
    # Per-relation index of the empty string, used as the default/pad value.
    default_value = []
    for word_dict in word_dicts:
        default_value.append(word_dict[''])
    #logging.info(word_dicts[1])
    #logging.info(inv_word_dicts[1])
    #utils.store_labels_to_pkl(inv_word_dicts)
    #sys.exit(0)
    args.default_value = default_value
    embeddings = utils.gen_embeddings(word_dicts, args.embedding_size)
    train_fn, test_fn, params = build_fn(args, embeddings)
    logging.info('Done.')
    logging.info('-' * 50)
    logging.info(args)

    topk_acc = args.topk_accuracy
    #topk_acc=1
    labels_data = []
    if args.test_print_allowed:
        # NOTE(review): `labels_file` is not defined in this function —
        # presumably a module-level constant; verify before enabling.
        labels_data = pickle.load(open(labels_file, 'rb'))

    logging.info('-' * 50)
    logging.info('Intial test..')
    dev_data, dev_mask = utils.vectorize(dev_examples, word_dicts, args)
    all_dev = gen_examples(dev_data, dev_mask, args.batch_size)
    dev_acc = eval_acc(test_fn, all_dev, inv_word_dicts, topk_acc)
    logging.info('Dev accuracy: %s %%' % str(dev_acc))
    test_data, test_mask = utils.vectorize(test_examples, word_dicts, args,
                                           args.test_print_allowed,
                                           labels_data)
    all_test = gen_examples(test_data, test_mask, args.batch_size)
    test_acc = eval_acc(test_fn, all_test, inv_word_dicts, topk_acc,
                        args.test_print_allowed, labels_data)
    logging.info('Test accuracy: %s %%' % str(test_acc))
    best_acc = dev_acc
    if args.test_only:
        return
    # Save the untrained parameters so a checkpoint always exists.
    utils.save_params(args.model_file, params, epoch=0, n_updates=0)
    #utils.store_labels_to_pkl(inv_word_dicts)

    # Training
    if args.num_epoches > 0:
        logging.info('-' * 50)
        logging.info('Start training..')
        train_data, train_mask = utils.vectorize(train_examples, word_dicts,
                                                 args)
        start_time = time.time()
        n_updates = 0
        all_train_old = gen_examples(train_data, train_mask, args.batch_size)
        all_train = utils.oversample(all_train_old, args)
        no_progress = 0
        for epoch in range(args.num_epoches):
            np.random.shuffle(all_train)
            for idx, inps in enumerate(all_train):
                train_loss = train_fn(*inps)
                if idx % 1000 == 0:
                    #logging.info('#Examples = %d, max_len = %d' % (len(mb_x1), mb_x1.shape[1]))
                    logging.info(
                        'Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)'
                        % (epoch, idx, len(all_train), train_loss,
                           time.time() - start_time))
                n_updates += 1
                if n_updates % args.eval_iter == 0:
                    # A train subsample is built (for the commented-out train
                    # accuracy check below) but only dev accuracy is used.
                    samples = sorted(
                        np.random.choice(args.num_train,
                                         min(args.num_train, args.num_dev),
                                         replace=False))
                    train_data_sample = [
                        train_data[j][samples] for j in range(args.relations)
                    ]
                    train_mask_sample = [
                        train_mask[j][samples] for j in range(args.relations)
                    ]
                    sample_train = gen_examples(train_data_sample,
                                                train_mask_sample,
                                                args.batch_size)
                    #acc = eval_acc(test_fn, sample_train)
                    #logging.info('Train accuracy: %s %%' % str(acc))
                    dev_acc = eval_acc(test_fn, all_dev, inv_word_dicts,
                                       topk_acc)
                    logging.info('Dev accuracy: %s %%' % str(dev_acc))
                    #test_acc = eval_acc(test_fn, all_test)
                    #logging.info('Test accuracy: %s %%' % str(test_acc))
                    if dev_acc > best_acc:
                        best_acc = dev_acc
                        logging.info('Best dev accuracy!')
                        utils.save_params(args.model_file, params,
                                          epoch=epoch, n_updates=n_updates)
                        no_progress = 0
                    else:
                        no_progress += 1
                        logging.info(
                            'Dev accuracy has not improved in the past %d evaluations'
                            % no_progress)
                        if no_progress >= MAX_NO_PROGRESS:
                            logging.info(
                                "Reached the limit of stagnation. Exiting now...")
                            sys.exit(0)
def main(args):
    """Train/evaluate a multi-relation classifier using pre-built dictionaries.

    Variant of the sibling ``main`` above: dictionaries are loaded from
    ``train_dicts.pickle`` / ``train_inv_dicts.pickle`` in ``args.data_dir``
    instead of being built, and minibatch oversampling is done inline here
    rather than via ``utils.oversample``.
    """
    logging.info('-' * 50)
    logging.info('Load data files..')
    if args.debug:
        # Debug mode caps both splits at 100 examples and reuses dev as test.
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, False, 100)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, False, 100)
        test_examples = dev_examples
    else:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(args.train_file, False)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(args.dev_file, False)
        test_examples = utils.load_data(args.test_file, False)
    args.num_train = len(train_examples)
    args.num_dev = len(dev_examples)
    #args.relations = len(train_examples[0])
    logging.info('-' * 50)
    logging.info('Build dictionary..')
    # Dictionaries are loaded, not built — NOTE(review): assumes args.relations
    # is already set elsewhere, since the assignment above is commented out.
    word_dicts = pickle.load(
        open('%s/train_dicts.pickle' % args.data_dir, 'rb'))
    inv_word_dicts = pickle.load(
        open('%s/train_inv_dicts.pickle' % args.data_dir, 'rb'))
    # Per-relation index of the empty string, used as the default/pad value.
    default_value = []
    for word_dict in word_dicts:
        default_value.append(word_dict[''])
    #logging.info(word_dicts[1])
    #logging.info(inv_word_dicts[1])
    #utils.store_labels_to_pkl(inv_word_dicts)
    #sys.exit(0)
    args.default_value = default_value
    embeddings = utils.gen_embeddings(word_dicts, args.embedding_size)
    train_fn, test_fn, params = build_fn(args, embeddings)
    logging.info('Done.')
    logging.info('-' * 50)
    logging.info(args)
    topk_acc = args.topk_accuracy
    #topk_acc=1
    labels_data = []
    #if args.test_print_allowed:
    #    labels_data=pickle.load(open(labels_file, 'rb'))
    logging.info('-' * 50)
    logging.info('Intial test..')
    dev_data, dev_mask = utils.vectorize(dev_examples, word_dicts, args)
    all_dev = gen_examples(dev_data, dev_mask, args.batch_size)
    dev_acc = eval_acc(test_fn, all_dev, inv_word_dicts, topk_acc)
    logging.info('Dev accuracy: %s %%' % str(dev_acc))
    test_data, test_mask = utils.vectorize(test_examples, word_dicts, args,
                                           args.test_print_allowed,
                                           labels_data)
    all_test = gen_examples(test_data, test_mask, args.batch_size)
    test_acc = eval_acc(test_fn, all_test, inv_word_dicts, topk_acc,
                        args.test_print_allowed, labels_data)
    logging.info('Test accuracy: %s %%' % str(test_acc))
    best_acc = dev_acc
    if args.test_only:
        return
    # Save the untrained parameters so a checkpoint always exists.
    utils.save_params(args.model_file, params, epoch=0, n_updates=0)
    #utils.store_labels_to_pkl(inv_word_dicts)

    # Training
    if args.num_epoches > 0:
        logging.info('-' * 50)
        logging.info('Start training..')
        train_data, train_mask = utils.vectorize(train_examples, word_dicts,
                                                 args)
        start_time = time.time()
        n_updates = 0
        all_train_old = gen_examples(train_data, train_mask, args.batch_size)
        logging.info("start oversampling")
        # Inline oversampling: for every minibatch and every relation whose
        # mask row is all-zero (no labeled example for that relation in the
        # batch), borrow one labeled column from some other batch that has
        # one, so every relation contributes at least one example per batch.
        all_train = []
        # NOTE(review): np.copy of a list of minibatch tuples presumably
        # yields an object array (shuffled below without touching the
        # originals) — confirm shapes are heterogeneous as this assumes.
        all_train_old2 = np.copy(all_train_old)
        for idx, inps in enumerate(all_train_old):
            #tmp_inp=[]
            new_inps = np.copy(inps)
            for i in range(args.relations):
                # inps[args.relations + i] is the mask for relation i;
                # a sum < 1 means no example of that relation in this batch.
                if inps[args.relations + i].sum() < 1:
                    # Shuffle so the donor batch is picked at random.
                    np.random.shuffle(all_train_old2)
                    #inps[args.relations + i][0] = 1
                    for idx2, inps2 in enumerate(all_train_old2):
                        if inps2[args.relations + i].sum() >= 1:
                            random_index = utils.get_random_example_index(
                                inps2[args.relations + i])
                            # One full column (data + mask for every
                            # relation) of the donor example.
                            my_col = [
                                inps2[j][random_index]
                                for j in range(2 * args.relations)
                            ]
                            # Append the donor column to this minibatch.
                            new_inps = np.insert(new_inps, len(new_inps[i]),
                                                 my_col, axis=1)
                            #new_inps[j+args.relations]=np.append(inps[j+args.relations], inps2[j+args.relations][random_index])
                            break
            all_train += [new_inps.astype(np.int32)]
        logging.info("done oversampling")
        #sys.exit(0)
        no_progress = 0
        for epoch in range(args.num_epoches):
            np.random.shuffle(all_train)
            for idx, inps in enumerate(all_train):
                train_loss = train_fn(*inps)
                if idx % 1000 == 0:
                    #logging.info('#Examples = %d, max_len = %d' % (len(mb_x1), mb_x1.shape[1]))
                    logging.info(
                        'Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)'
                        % (epoch, idx, len(all_train), train_loss,
                           time.time() - start_time))
                n_updates += 1
                if n_updates % args.eval_iter == 0:
                    # A train subsample is built (for the commented-out train
                    # accuracy check below) but only dev accuracy is used.
                    samples = sorted(
                        np.random.choice(args.num_train,
                                         min(args.num_train, args.num_dev),
                                         replace=False))
                    train_data_sample = [
                        train_data[j][samples] for j in range(args.relations)
                    ]
                    train_mask_sample = [
                        train_mask[j][samples] for j in range(args.relations)
                    ]
                    sample_train = gen_examples(train_data_sample,
                                                train_mask_sample,
                                                args.batch_size)
                    #acc = eval_acc(test_fn, sample_train)
                    #logging.info('Train accuracy: %s %%' % str(acc))
                    dev_acc = eval_acc(test_fn, all_dev, inv_word_dicts,
                                       topk_acc)
                    logging.info('Dev accuracy: %s %%' % str(dev_acc))
                    #test_acc = eval_acc(test_fn, all_test)
                    #logging.info('Test accuracy: %s %%' % str(test_acc))
                    if dev_acc > best_acc:
                        best_acc = dev_acc
                        logging.info('Best dev accuracy!')
                        utils.save_params(args.model_file, params,
                                          epoch=epoch, n_updates=n_updates)
                        no_progress = 0
                    else:
                        no_progress += 1
                        logging.info(
                            'Dev accuracy has not improved in the past %d evaluations'
                            % no_progress)
                        if no_progress >= MAX_NO_PROGRESS:
                            logging.info(
                                "Reached the limit of stagnation. Exiting now...")
                            sys.exit(0)
val_file_name = '/Users/yangsun/Desktop/dataset/validation_cnn.txt' model_path = './model_path' documents, questions, answers = utils.load_data(file_name, 10) word_dict = utils.build_dict(documents + questions) documents_val, questions_val, answers_val = utils.load_data(val_file_name, 100) word_dict_val = utils.build_dict(documents_val + questions_val) entity_markers = list(set([w for w in word_dict.keys() if w.startswith('@entity')] + answers)) entity_markers = ['<unk_entity>'] + entity_markers entity_dict = {w: index for (index, w) in enumerate(entity_markers)} num_labels = len(entity_dict) embeddings = utils.gen_embeddings(word_dict, embedding_size, embedding_file) vocab_size, embedding_size = embeddings.shape # tf.reset_default_graph() d_input = tf.placeholder(dtype=tf.int32, shape=(None, None), name="d_input") q_input = tf.placeholder(dtype=tf.int32, shape=(None, None), name="q_input") # [batch_size, max_seq_length_for_batch] l_mask = tf.placeholder(dtype=tf.float32, shape=(None, None), name="l_mask") # [batch_size, entity num] y = tf.placeholder(dtype=tf.int32, shape=None, name="label") # batch size vector y_1hot= tf.placeholder(dtype=tf.float32, shape=(None, None), name="label_1hot") # onehot encoding of y [batch_size, entitydict] training = tf.placeholder(dtype=tf.bool) word_embeddings = tf.get_variable("glove", shape=(vocab_size, embedding_size), initializer=tf.constant_initializer(embeddings))
def main(args):
    """Train a Model on GPU 1, evaluating on the test split every
    ``args.eval_iter`` updates and checkpointing on improvement.

    With ``args.load`` set, first restores a hard-coded checkpoint, reports
    test accuracy, and writes an answer file for the dev split; training
    then proceeds regardless (the early ``return`` is commented out).
    """
    with tf.device("/gpu:1"):  # pin graph construction to the second GPU
        print('-' * 50)
        print('Load data files..')
        if args.debug:
            # Debug mode caps every split at 100 examples.
            print('*' * 10 + ' Train')
            train_data = utils.load_data(args, 'train', 100)
            print('*' * 10 + ' Test')
            test_data = utils.load_data(args, 'test', 100)
            print('*' * 10 + ' Dev')
            dev_data = utils.load_data(args, 'dev', 100)
        else:
            print('*' * 10 + ' Train')
            train_data = utils.load_data(args, 'train')
            print('*' * 10 + ' Test')
            test_data = utils.load_data(args, 'test')
            print('*' * 10 + ' Dev')
            dev_data = utils.load_data(args, 'dev')
        print('-' * 50)
        print('Build dictionary..')
        # Word/char dictionaries are built over ALL splits (train+test+dev).
        args.word_dict, args.char_dict = utils.build_dict(
            train_data.data[0] + train_data.data[1] + test_data.data[0] +
            test_data.data[1] + dev_data.data[0] + dev_data.data[1])
        print('-' * 50)
        # Load embedding file
        args.embeddings = utils.gen_embeddings(args.word_dict,
                                               args.embedding_size,
                                               args.embedding_file)
        # Refresh sizes from the actual matrix.
        (args.word_voc_size, args.embedding_size) = args.embeddings.shape
        args.char_voc_size = len(args.char_dict)
        train_data.vectorize(args.word_dict, args.char_dict)
        test_data.vectorize(args.word_dict, args.char_dict)
        # allow_soft_placement lets ops without a GPU kernel fall back to CPU.
        config_gpu = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)
        config_gpu.gpu_options.allow_growth = True
        with tf.Session(config=config_gpu) as sess:
            model = Model(args)
            trainer = Trainer(args, model)
            evaluator = Evaluator(args, model)
            tf.global_variables_initializer().run()

            # Checkpoints go to a fresh timestamped directory per run.
            timestamp = str(int(time.time()))
            out_dir = os.path.join(args.out_dir, timestamp)
            checkpoint_dir = os.path.join(out_dir, "checkpoints")
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables())

            if args.load:
                # NOTE(review): restore path is hard-coded to one specific
                # past run — parameterize before reuse.
                cpkl = os.path.join(args.out_dir,
                                    "1506152182/checkpoints/model-17000")
                saver.restore(sess, cpkl)
                test_acc = evaluator.get_evaluation(
                    sess, test_data.gen_minbatches(args.batch_size))
                print('Test accuracy: %.2f %%' % test_acc)
                dev_data.vectorize(args.word_dict, args.char_dict)
                answers = evaluator.get_answers(
                    sess, dev_data.gen_minbatches(args.batch_size))
                gen_answer_file(answers)
                # return

            # Training
            print('-' * 50)
            print('Start training..')
            best_acc = 0
            start_time = time.time()
            last_time = start_time
            n_updates = 0
            batch100_time = 0  # accumulated wall time since last evaluation
            for epoch in range(args.num_epoches):
                for idx, batch in enumerate(
                        train_data.gen_minbatches(args.batch_size,
                                                  shuffle=True)):
                    train_loss, train_op = trainer.step(sess, batch)
                    batch_time = time.time() - last_time
                    if idx % 20 == 0:
                        print(
                            'Epoch = %d, iter = %d, loss = %.2f, batch time = %.2f (s)'
                            % (epoch, idx, train_loss, batch_time))
                    n_updates += 1
                    batch100_time = batch100_time + batch_time
                    # Evalution
                    if n_updates % args.eval_iter == 0:
                        print('time pre 100 batches: %.2f (s)' %
                              (batch100_time))
                        batch100_time = 0
                        # Train accuracy is measured on a random contiguous
                        # slice of the train split, sized like the test split.
                        start_examples = np.random.randint(
                            0,
                            train_data.num_examples - test_data.num_examples)
                        end_examples = start_examples + test_data.num_examples
                        train_acc = evaluator.get_evaluation(
                            sess,
                            train_data.gen_minbatches(args.batch_size,
                                                      start_examples,
                                                      end_examples))
                        print('Epoch = %d, iter = %d, train_acc = %.2f %%' %
                              (epoch, idx, train_acc))
                        test_acc = evaluator.get_evaluation(
                            sess, test_data.gen_minbatches(args.batch_size))
                        print(
                            'Epoch = %d, iter = %d, test_acc = %.2f %%, Best test accuracy: %.2f %%'
                            % (epoch, idx, test_acc, best_acc))
                        if test_acc > best_acc:
                            best_acc = test_acc
                            print(
                                'Best test accuracy: epoch = %d, n_udpates = %d, acc = %.2f %%'
                                % (epoch, n_updates, test_acc))
                            path = saver.save(sess, checkpoint_prefix,
                                              global_step=n_updates)
                            print(
                                "Saved model checkpoint to {}\n".format(path))
                    last_time = time.time()
def init():
    """Prepare everything needed for training/evaluation.

    Resolves data-file paths under ``config.data_path``, configures logging,
    loads the train/dev splits, builds the word and entity dictionaries,
    loads embeddings, logs every config flag, and vectorizes the data.

    Returns:
        ``(embeddings, all_dev, all_train)`` — ``all_train`` is ``None``
        when ``config.test_only`` is set.
    """
    # Resolve every data file relative to the configured data directory.
    data_dir = config.data_path
    for attr in ('embedding_file', 'train_file', 'dev_file', 'test_file'):
        setattr(config, attr, os.path.join(data_dir, getattr(config, attr)))
    config.embedding_size = utils.get_dim(config.embedding_file)

    # Config log — to stderr by default, to config.log_file when given.
    log_kwargs = dict(level=logging.DEBUG,
                      format='%(asctime)s %(message)s',
                      datefmt='%m-%d %H:%M')
    if config.log_file is not None:
        log_kwargs.update(filename=config.log_file, filemode='w')
    logging.basicConfig(**log_kwargs)

    # Load data
    logging.info('-' * 50)
    logging.info('Load data files..')
    train_limit, dev_limit = ((1000, 100) if config.debug else (None, None))
    logging.info('*' * 10 + ' Train')
    if train_limit is None:
        train_examples = utils.load_data(config.train_file)
    else:
        train_examples = utils.load_data(config.train_file, train_limit)
    logging.info('*' * 10 + ' Dev')
    if dev_limit is None:
        dev_examples = utils.load_data(config.dev_file)
    else:
        dev_examples = utils.load_data(config.dev_file, dev_limit)
    config.num_train = len(train_examples[0])
    config.num_dev = len(dev_examples[0])

    # Build dictionary over documents + questions; entities come from the
    # '@entity*' vocabulary tokens plus the answer labels, with index 0
    # reserved for unknown entities.
    logging.info('-' * 50)
    logging.info('Build dictionary..')
    word_dict = utils.build_dict(train_examples[0] + train_examples[1])
    marker_set = set(w for w in word_dict.keys() if w.startswith('@entity'))
    marker_set.update(train_examples[2])
    entity_markers = ['<unk_entity>'] + list(marker_set)
    entity_dict = {w: index for (index, w) in enumerate(entity_markers)}
    logging.info('Entity markers: %d' % len(entity_dict))
    config.num_labels = len(entity_dict)

    logging.info('-' * 50)
    logging.info('Load embedding file..')
    embeddings = utils.gen_embeddings(word_dict, config.embedding_size,
                                      config.embedding_file)
    # Refresh both sizes from the matrix actually produced.
    (config.vocab_size, config.embedding_size) = embeddings.shape

    # Log parameters — one tab-indented "key: value" line per flag.
    flags = config.__dict__['__flags']
    logging.info("\n" + "".join("\t%s:\t%s\n" % (k, flags[k])
                                for k in flags))

    # Vectorize test data
    # d: document, q: question, a: answer
    # l: whether the entity label occurs in the document
    logging.info('-' * 50)
    logging.info('Vectorize test data..')
    dev_d, dev_q, dev_l, dev_a = utils.vectorize(dev_examples, word_dict,
                                                 entity_dict)
    assert len(dev_d) == config.num_dev
    all_dev = utils.gen_examples(dev_d, dev_q, dev_l, dev_a,
                                 config.batch_size)
    if config.test_only:
        return embeddings, all_dev, None

    # Vectorize training data
    logging.info('-' * 50)
    logging.info('Vectorize training data..')
    train_d, train_q, train_l, train_a = utils.vectorize(
        train_examples, word_dict, entity_dict)
    assert len(train_d) == config.num_train
    all_train = utils.gen_examples(train_d, train_q, train_l, train_a,
                                   config.batch_size)
    return embeddings, all_dev, all_train