def get_joint_datasets(args):
    vocab = data_utils.get_vocab()
    train_gen_list = []
    valid_gen_list = []
    if args.mode == 'train':
        if not args.remove_open and not args.only_crowd:
            train_gen_list.append(
                #("open", get_data_gen('train/open*.json', 'train', args, vocab, "open")))
                ("open", get_data_gen('/distant_supervision/headword_train.json', 'train', args, vocab, "open")))
            valid_gen_list.append(
                ("open", get_data_gen('/distant_supervision/headword_dev.json', 'dev', args, vocab, "open")))
        if not args.remove_el and not args.only_crowd:
            valid_gen_list.append(
                ("wiki", get_data_gen('/distant_supervision/el_dev.json', 'dev', args, vocab,
                                      "wiki" if args.multitask else "open")))
            train_gen_list.append(
                ("wiki", get_data_gen('/distant_supervision/el_train.json', 'train', args, vocab,
                                      "wiki" if args.multitask else "open")))
                #get_data_gen('train/el_train.json', 'train', args, vocab, "wiki" if args.multitask else "open")))
        if args.add_crowd or args.only_crowd:
            train_gen_list.append(
                ("open", get_data_gen('/crowd/train_m.json', 'train', args, vocab, "open")))
    crowd_dev_gen = get_data_gen('/crowd/dev.json', 'dev', args, vocab, "open")
    return train_gen_list, valid_gen_list, crowd_dev_gen
def train(args):
    vocab = data_utils.get_vocab(vocab_file=args.vocab_file, min_freq=args.min_vocab_freq)
    # vocab = {}
    # with open(args.vocab_file, mode='r') as infile:
    #     for line in infile:
    #         w, w_id = line.split('\t')
    #         vocab[w] = int(w_id)
    print('Vocab loaded...')
    print('VOCAB SIZE = ', len(vocab))
    if args.model_type == 'transformer':
        transformer = Transformer(args=args, vocab=vocab)
        transformer.train_generator()
    elif args.model_type == 'rnn':
        rnn_params = {'rec_cell': 'lstm',
                      'encoder_dim': 800,
                      'decoder_dim': 800,
                      'num_encoder_layers': 2,
                      'num_decoder_layers': 2}
        rnn = RNNSeq2Seq(args=args, rnn_params=rnn_params, vocab=vocab)
        # rnn.train()
        rnn.train_keras()
    elif args.model_type == 'han_rnn':
        han_rnn = HanRnnSeq2Seq(args=args, vocab=vocab)
        han_rnn.train()
    elif args.model_type == 'cnn':
        cnn = ConvSeq2Seq(args=args, vocab=vocab)
        cnn.train_keras()
    return
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        # During beam sampling, all beams are fed in together to produce the
        # probability of the next word for each beam.
        model.batch_size = FLAGS.beam_num
        # Load vocabularies.
        q_vocab, r_vocab = data_utils.get_vocab(FLAGS.data_dir)
        r_vocab_reversed = data_utils.get_reverse_vocab_dict(r_vocab)
        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        question = sys.stdin.readline()
        while question:
            # readline() keeps the trailing newline, so strip it before comparing.
            if question.strip() == "quit":
                quit()
            # Get token-ids for the input sentence.
            src_i = data_utils.tokenize_sentence(q_vocab, question)
            response = model.generate_response(sess, src_i, _buckets, FLAGS.beam_num,
                                               FLAGS.samples_per_beam, FLAGS.segment_length)
            # Print out the response corresponding to the outputs.
            print(" ".join([tf.compat.as_str(r_vocab_reversed[response_word])
                            for response_word in response]))
            print("> ", end="")
            sys.stdout.flush()
            question = sys.stdin.readline()
def get_datasets(data_lists, args):
    data_gen_list = []
    vocab_set = data_utils.get_vocab()
    for dataname, mode, goal in data_lists:
        data_gen_list.append(get_data_gen(dataname, mode, args, vocab_set, goal))
    return data_gen_list
def get_datasets(data_lists, args, eval_epoch=1):
    data_gen_list = []
    vocab_set = data_utils.get_vocab(args.embed_source)
    for dataname, mode, goal in data_lists:
        data_gen_list.append(get_data_gen(dataname, mode, args, vocab_set, goal, eval_epoch))
    return data_gen_list
def train(args):
    vocab = data_utils.get_vocab(vocab_file=args.vocab_file, min_freq=args.min_vocab_freq)
    print('Vocab loaded...')
    print('VOCAB SIZE = ', len(vocab))
    if args.model_type == 'rnn':
        print('Training RNN...')
        rnn = RNNSeq2Seq(args=args, vocab=vocab)
        rnn.train()
    elif args.model_type == 'han_rnn':
        print('Training HAN-RNN...')
        han_rnn = HanRnnSeq2Seq(args=args, vocab=vocab)
        han_rnn.train()
def get_datasets(data_lists, args):
    data_gen_list = []
    if args.elmo:
        vocab = (constant.CHAR_DICT, None)  # dummy empty dict
        elmo = data_utils.init_elmo()
        bert = None
    elif args.bert:
        vocab = (constant.CHAR_DICT, None)  # dummy empty dict
        elmo = None
        bert = None
    else:
        vocab = data_utils.get_vocab()
        elmo = None
        bert = None
    for dataname, mode, goal in data_lists:
        data_gen_list.append(get_data_gen(dataname, mode, args, vocab, goal, elmo=elmo, bert=bert))
    return data_gen_list, elmo
def get_joint_datasets(args):
    if args.elmo:
        vocab = (constant.CHAR_DICT, None)  # dummy empty dict
        elmo = data_utils.init_elmo()
        bert = None
    elif args.bert:
        vocab = (constant.CHAR_DICT, None)  # dummy empty dict
        elmo = None
        bert = None
    else:  # glove
        vocab = data_utils.get_vocab()
        elmo = None
        bert = None
    train_gen_list = []
    valid_gen_list = []
    if args.mode in ['train', 'train_labeler']:
        if not args.remove_open and not args.only_crowd:
            train_gen_list.append(
                ("open", get_data_gen('train_full/open_train_tree*.json', 'train', args, vocab, "open",
                                      elmo=elmo, bert=bert)))
                #("open", get_data_gen('distant_supervision/headword_train_tree.json', 'train', args, vocab, "open", elmo=elmo, bert=bert)))
            valid_gen_list.append(
                ("open", get_data_gen('distant_supervision/headword_dev_tree.json', 'dev', args, vocab, "open",
                                      elmo=elmo, bert=bert)))
        if not args.remove_el and not args.only_crowd:
            valid_gen_list.append(
                ("wiki", get_data_gen('distant_supervision/el_dev_tree.json', 'dev', args, vocab,
                                      "wiki" if args.multitask else "open", elmo=elmo, bert=bert)))
            train_gen_list.append(
                ("wiki",
                 #get_data_gen('distant_supervision/el_train_tree.json', 'train', args, vocab, "wiki" if args.multitask else "open", elmo=elmo, bert=bert)))
                 get_data_gen('train_full/el_train_full_tree.json', 'train', args, vocab,
                              "wiki" if args.multitask else "open", elmo=elmo, bert=bert)))
        if args.add_crowd or args.only_crowd:
            train_gen_list.append(
                ("open", get_data_gen('crowd/train_m_tree.json', 'train', args, vocab, "open",
                                      elmo=elmo, bert=bert)))
        if args.add_expanded_head:
            train_gen_list.append(
                ("open", get_data_gen('train_full/open_train_1m_cls_relabeled.json', 'train', args, vocab, "open",
                                      elmo=elmo, bert=bert)))
        if args.add_expanded_el:
            train_gen_list.append(
                ("wiki", get_data_gen('train_full/el_train_1m_cls_relabeled.json', 'train', args, vocab,
                                      "wiki" if args.multitask else "open", elmo=elmo, bert=bert)))
    #crowd_dev_gen = get_data_gen('crowd/dev.json', 'dev', args, vocab, "open")
    crowd_dev_gen = None  # get_data_gen('crowd/dev_tree.json', 'dev', args, vocab, "open", elmo=elmo, bert=bert)
    return train_gen_list, valid_gen_list, crowd_dev_gen, elmo, bert, vocab
FLAGS.use_subsampled_dataset = True

# dir ubuntu
FLAGS.raw_data_dir = "/home/usuario/datasets"

if args.dataset == 'eus':
    FLAGS.max_audio_length = 680  # obtained from sequence lengths histogram
    FLAGS.max_freq_length = 201
elif args.dataset == 'quz':
    FLAGS.max_audio_length = 100  # TBD
    FLAGS.max_freq_length = 100

# set sentence, doc length to maximum
output = open("tunning_" + FLAGS.data_mode + "/" + FLAGS.data_mode +
              "_hp_grid_tuning_%s.txt" % args.file_suffix, 'w')

vocab_dict, inverted_vocab = get_vocab()
train_data = DataProcessor(vocab_dict, inverted_vocab, data_type="train")
val_batch = batch_load_data(DataProcessor(vocab_dict, inverted_vocab, data_type="val"))

setup_by_id = {}
results_by_id_wer = {}
results_by_id_cer = {}
setup_id = 0
best_global_wer = 200
best_global_cer = 200
best_setup_id_wer = -1
best_setup_id_cer = -1

## FLAGS.___ = ___  # set as constant so it doesn't clutter output
#FLAGS.use_conv2d = True
#FLAGS.use_dropout = False
def main():
    with open('config.yaml', 'r') as infile:
        config = yaml.load(infile)

    dataset = get_nli_dataset(path=config['data']['nli_data_dir'])
    all_sentences = []
    for split in ['train', 'test', 'dev']:
        all_sentences += dataset[split]['s1'] + dataset[split]['s2']
    max_seq_len = max(len(s) for s in all_sentences)
    print('Max sentence length: {0}'.format(max_seq_len))

    vocab = get_vocab(all_sentences)
    word_embedding_matrix = get_embeddings_matrix(
        embeddings_path=config['data']['embeddings_path'], vocab=vocab)
    print('Embedding Matrix Shape: {0}'.format(word_embedding_matrix.shape))

    try:
        num_train_samples = int(config['data']['num_train_samples'])
    except ValueError:
        num_train_samples = len(dataset['train']['s1']) + 1
    try:
        num_dev_samples = int(config['data']['num_dev_samples'])
    except ValueError:
        num_dev_samples = len(dataset['dev']['s1']) + 1
    try:
        num_test_samples = int(config['data']['num_test_samples'])
    except ValueError:
        num_test_samples = len(dataset['test']['s1']) + 1

    s1_train_x = get_sequences(sentences=dataset['train']['s1'], max_seq_len=max_seq_len,
                               vocab=vocab)[:num_train_samples]
    s2_train_x = get_sequences(sentences=dataset['train']['s2'], max_seq_len=max_seq_len,
                               vocab=vocab)[:num_train_samples]
    s1_dev_x = get_sequences(sentences=dataset['dev']['s1'], max_seq_len=max_seq_len,
                             vocab=vocab)[:num_dev_samples]
    s2_dev_x = get_sequences(sentences=dataset['dev']['s2'], max_seq_len=max_seq_len,
                             vocab=vocab)[:num_dev_samples]
    s1_test_x = get_sequences(sentences=dataset['test']['s1'], max_seq_len=max_seq_len,
                              vocab=vocab)[:num_test_samples]
    s2_test_x = get_sequences(sentences=dataset['test']['s2'], max_seq_len=max_seq_len,
                              vocab=vocab)[:num_test_samples]
    print('Train Shape: S1: {0}, S2: {1}'.format(s1_train_x.shape, s2_train_x.shape))
    print('Dev Shape: S1: {0}, S2: {1}'.format(s1_dev_x.shape, s2_dev_x.shape))
    print('Test Shape: S1: {0}, S2: {1}'.format(s1_test_x.shape, s2_test_x.shape))

    train_y = to_categorical(dataset['train']['target'], num_classes=3)[:num_train_samples]
    dev_y = to_categorical(dataset['dev']['target'], num_classes=3)[:num_dev_samples]
    test_y = to_categorical(dataset['test']['target'], num_classes=3)[:num_test_samples]
    print('Y Train Shape: {0}'.format(train_y.shape))
    print('Y Dev Shape: {0}'.format(dev_y.shape))
    print('Y Test Shape: {0}'.format(test_y.shape))

    if not os.path.exists(config['model']['path']):
        os.makedirs(config['model']['path'])

    nli_classifier = NLIClassifier(config=config['model'], vocab_size=len(vocab),
                                   embedding_matrix=word_embedding_matrix,
                                   max_seq_len=max_seq_len)

    fit_start_time = time()
    history = nli_classifier.fit(s1_train_x=s1_train_x, s2_train_x=s2_train_x,
                                 s1_dev_x=s1_dev_x, s2_dev_x=s2_dev_x,
                                 train_y=train_y, dev_y=dev_y)
    fit_end_time = time()
    fit_time = fit_end_time - fit_start_time
    print('Fit time: {0}'.format(round(fit_time, 3)))

    make_plots(
        history=history,
        path='{0}/NLITraining_{1}.png'.format(config['model']['path'],
                                              config['model']['encoder']['type']),
        title='NLI Training - {0}'.format(config['model']['encoder']['type']),
        epochs=config['model']['training']['epochs'])

    pred_y = nli_classifier.predict(s1_x=s1_test_x, s2_x=s2_test_x)
    test_y = np.argmax(test_y, axis=1)
    pred_y = np.argmax(pred_y, axis=1)
    print()
    print('Accuracy - {0}'.format(round(accuracy_score(test_y, pred_y), 3)))
    print('Classification Report - ')
    print(classification_report(y_true=test_y, y_pred=pred_y))

    encode_start_time = time()
    encoded_s1 = nli_classifier.encode(s1_x=s1_dev_x)
    encode_end_time = time()
    print('Encoded S1 shape: {0}'.format(encoded_s1.shape))
    encoding_time = encode_end_time - encode_start_time
    print('Encoding time for Dev Set: {0}'.format(round(encoding_time, 3)))
    print('Encoding time per sample: {0}'.format(
        round(encoding_time / len(s1_dev_x), 3)))
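# A minimal sketch of what a get_sequences helper with the signature used above could
# look like (assumption: it maps tokens to vocab indices and zero-pads to max_seq_len;
# the project's real helper may handle OOV tokens or truncation differently).
import numpy as np

def get_sequences_sketch(sentences, max_seq_len, vocab):
    # One padded row of vocabulary indices per input sentence.
    seqs = np.zeros((len(sentences), max_seq_len), dtype='int32')
    for i, sent in enumerate(sentences):
        ids = [vocab[w] for w in sent if w in vocab][:max_seq_len]
        seqs[i, :len(ids)] = ids
    return seqs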
train_tgt = [
    line.strip().split()
    for line in codecs.open(data_path_train_tgt, 'r', encoding='utf-8')
]
dev_src = [
    line.strip().split()
    for line in codecs.open(data_path_dev_src, 'r', encoding='utf-8')
]
dev_tgt = [
    line.strip().split()
    for line in codecs.open(data_path_dev_tgt, 'r', encoding='utf-8')
]

# Get training and dev vocabularies
src_vocab, src_word2ind, src_ind2word = get_vocab(train_src)
tgt_vocab, tgt_word2ind, tgt_ind2word = get_vocab(train_tgt)

logging.info('Running experiment with seed %s ...' % (args.seed))
logging.info('Finished reading data ...')
logging.info('Number of training sentence-pairs : %d ' % (len(train_src)))
logging.info('Number of validation sentence-pairs : %d ' % (len(dev_src)))

# Create symbolic variables
src_inp = T.imatrix()
tgt_inp = T.imatrix()
tgt_op = T.imatrix()
src_lens = T.ivector()
tgt_mask = T.fmatrix()
src_mask = T.fmatrix()
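# For reference, a minimal sketch of a get_vocab helper with the three-value return
# signature used above (vocab, word2ind, ind2word). This is an assumption for
# illustration; the project's actual implementation may sort by frequency or reserve
# ids for special tokens.
def get_vocab_sketch(tokenized_sentences):
    vocab = sorted({w for sent in tokenized_sentences for w in sent})
    word2ind = {w: i for i, w in enumerate(vocab)}
    ind2word = {i: w for w, i in word2ind.items()}
    return vocab, word2ind, ind2word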
weight_path = utils.mkpath('weight/{}/{}'.format(MODEL_NAME, p))
last_weight, last_epoch = utils.get_last_epoch(weight_path)

# move on to next prompt if epoch not greater than last one saved
if args.epoch <= last_epoch:
    continue

train_df = data_utils.load_data(p, 'train')
val_df = data_utils.load_data(p, 'val')
# test_df = data_utils.load_data(p, 'test')
print(train_df.shape)
print(val_df.shape)
# print(test_df.shape)

vocab = data_utils.get_vocab(p, train_df)
glove_path = 'glove/glove.6B.50d.txt'
emb_matrix = data_utils.load_glove_embedding(glove_path, vocab)

K.clear_session()
model = models.build_glove_model(p, len(vocab), emb_matrix,
                                 glove_trainable=args.ft,
                                 drop_rate=args.drop)
if last_weight:
    print('Loading weight :', last_weight)
    model.load_weights(last_weight)

train_gen = data_utils.gen(MODEL_NAME,
test_df = data_utils.load_data(p, 'test')
print(test_df.shape)

K.clear_session()
if MODEL_NAME.startswith('elmo'):
    vocab = None
    model = models.build_elmo_model_full(p,
                                         elmo_trainable=args.ft,
                                         use_mask=args.mask,
                                         lstm_units=args.re,
                                         drop_rate=args.drop,
                                         summary=False)
elif MODEL_NAME.startswith('glove'):
    vocab = data_utils.get_vocab(p)
    glove_path = 'glove/glove.6B.50d.txt'
    emb_matrix = data_utils.load_glove_embedding(glove_path, vocab)
    model = models.build_glove_model(p, len(vocab), emb_matrix,
                                     glove_trainable=args.ft,
                                     drop_rate=args.drop,
                                     summary=False)

print('Loading weight :', weight)
model.load_weights(weight)

test_gen = data_utils.gen(MODEL_NAME, p, test_df,
def train():
    print('Applying Parameters:')
    for k, v in FLAGS.__dict__['__flags'].iteritems():
        print('%s: %s' % (k, str(v)))
    print('Preparing data in %s' % FLAGS.data_dir)
    in_seq_train, out_seq_train, in_seq_dev, in_seq_test, vocab_path = \
        data_utils.get_vocab(FLAGS.data_dir, FLAGS.in_vocab_size, FLAGS.out_vocab_size)

    result_dir = FLAGS.train_dir + 'train/test_result'
    if not os.path.isdir(result_dir):
        os.makedirs(result_dir)

    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

    # make sure result is repeatable ???
    random.seed(42)
    np.random.seed(42)
    tf.set_random_seed(42)

    sess = tf.Session()
    sess.as_default()

    # Create model.
    print("Max sequence length: %d." % _buckets[0][0])
    print("Creating %d layers of %d units." % (FLAGS.num_layer, FLAGS.size))
    model, model_test = create_model(sess, rev_vocab, len(vocab))
    print("Creating model with source_vocab_size=%d" % len(rev_vocab))

    # Read data into buckets and compute their sizes.
    print("Reading train data (train set limit: %d)." % FLAGS.max_train_data_size)
    train_set = read_data(in_seq_train, out_seq_train)

    # Train Loop
    step_time, loss = 0.0, 0.0
    current_step = 0
    best_valid_score, best_test_score = 0, 0
    train_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/train', sess.graph)
    valid_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/valid')
    while model.global_step.eval(sess) < FLAGS.max_traing_step:
        # Get a batch and make a step.
        start_time = time.time()
        encoder_inputs, decoder_inputs, batch_sequence_length = model.get_batch(train_set)
        _, step_loss, summary, decoder_logits = model.get_input_feed(
            sess, encoder_inputs, _buckets, decoder_inputs)
        train_writer.add_summary(summary, current_step)
        step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
        loss += step_loss / FLAGS.steps_per_checkpoint
        current_step += 1
        # Once in a while, we save checkpoint, print statistics, and run evals.
        if current_step % FLAGS.steps_per_checkpoint == 0:
            checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
            model.saver.save(sess, checkpoint_path)
            step_time, loss = 0.0, 0.0
    return sess, model_test, vocab
import pickle

"""
The issue is that the special symbols defined in this dataset differ from the ones
used in the original GNMT code, so the vocabularies need to be updated.
"""


def update_dict(dict_ori, src_of_vocab):
    """
    Args:
        dict_ori: the dict that needs to be updated.
        src_of_vocab: str, "q" or "r".
    Returns:
        An updated dictionary that adds the special tags.
    """
    dict_new = {}
    dict_new[data_utils._PAD] = data_utils.PAD_ID  # 0
    dict_new[data_utils._GO] = data_utils.GO_ID  # 1
    dict_new[data_utils._EOS] = data_utils.EOS_ID  # 2
    dict_new[data_utils._UNK] = data_utils.UNK_ID  # 3
    for (k, v) in dict_ori.items():
        dict_new[k] = v + 3  # originally, values start from 1
    fw = open(src_of_vocab + "_train_vocab.pkl", 'wb')
    pickle.dump(dict_new, fw, pickle.HIGHEST_PROTOCOL)
    fw.close()
    print("the %s dict contains %d words" % (src_of_vocab, len(dict_new.keys())))


q_dict, r_dict = data_utils.get_vocab("dialog_data_new")
update_dict(q_dict, "q")
update_dict(r_dict, "r")
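# Usage sketch (an assumption, not part of the original script): once update_dict has
# run, the remapped vocabularies can be reloaded from the pickle files it writes
# ("q_train_vocab.pkl" / "r_train_vocab.pkl").
with open("q_train_vocab.pkl", "rb") as f:
    q_vocab_loaded = pickle.load(f)
print("reloaded q vocab with %d entries" % len(q_vocab_loaded))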