def main():
    trained_model = "checkpoints/model.ckpt"
    embedding_size = 100        # word embedding dimension
    epochs = 10
    batch_size = 64             # batch size
    rnn_size = 50               # number of hidden-layer neurons
    sequence_length = 300       # sentence length
    learning_rate = 0.01
    lrdownRate = 0.9            # learning-rate decay applied after each epoch
    margin = 0.1
    attention_matrix_size = 100
    gpu_mem_usage = 0.75
    gpu_device = "/gpu:0"
    cpu_device = "/cpu:0"

    embeddings, word2idx = data_helpers.load_embedding('vectors.nobin')
    voc = data_helpers.load_vocab('D:\\DataMining\\Datasets\\insuranceQA\\V1\\vocabulary')
    all_answers = data_helpers.load_answers(
        'D:\\DataMining\\Datasets\\insuranceQA\\V1\\answers.label.token_idx', voc)
    questions, pos_answers, neg_answers = data_helpers.load_train_data(
        'D:\\DataMining\\Datasets\\insuranceQA\\V1\\question.train.token_idx.label',
        all_answers, voc, word2idx, sequence_length)

    # shuffle the training triples with one shared permutation so the
    # (question, positive, negative) alignment is preserved
    data_size = len(questions)
    permutation = np.random.permutation(data_size)
    questions = questions[permutation, :]
    pos_answers = pos_answers[permutation, :]
    neg_answers = neg_answers[permutation, :]

    with tf.Graph().as_default(), tf.device(gpu_device):
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_mem_usage)
        session_conf = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
        model = QALSTM(batch_size, sequence_length, embeddings, embedding_size,
                       rnn_size, margin, attention_matrix_size)
        with tf.Session(config=session_conf).as_default() as sess:
            saver = tf.train.Saver()
            print("Start training")
            sess.run(tf.global_variables_initializer())  # initialize all variables
            for epoch in range(epochs):
                print("Training epoch %d/%d" % (epoch + 1, epochs))
                batch_number = 1
                for question, pos_answer, neg_answer in data_helpers.batch_iter(
                        questions, pos_answers, neg_answers, batch_size):
                    start_time = time.time()
                    feed_dict = {
                        model.q: question,
                        model.ap: pos_answer,
                        model.an: neg_answer,
                        model.lr: learning_rate
                    }
                    _, loss, acc = sess.run([model.train_op, model.loss, model.acc],
                                            feed_dict)
                    duration = time.time() - start_time
                    print('Epoch: [%d][%d/%d]\tTime %.3f\tLoss %2.3f\tAcc %2.3f' %
                          (epoch + 1, batch_number * batch_size, data_size,
                           duration, loss, acc))
                    batch_number += 1
                learning_rate *= lrdownRate  # decay the learning rate each epoch
            saver.save(sess, trained_model)
            print("End of training")
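# The training loop above relies on data_helpers.batch_iter, which is not
# shown in this snippet. A minimal sketch of what it presumably does,
# assuming it walks the three aligned arrays in fixed-size steps and drops
# the final partial batch (the graph is built with a fixed batch_size):
def batch_iter(questions, pos_answers, neg_answers, batch_size):
    """Hypothetical re-implementation of data_helpers.batch_iter."""
    num_batches = len(questions) // batch_size  # drop the trailing partial batch
    for b in range(num_batches):
        lo, hi = b * batch_size, (b + 1) * batch_size
        yield questions[lo:hi], pos_answers[lo:hi], neg_answers[lo:hi]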
def main():
    trained_model = "checkpoints/model.ckpt"
    embedding_size = 100        # word embedding dimension
    batch_size = 128            # batch size
    sequence_length = 300       # sentence length
    rnn_size = 50               # number of hidden-layer neurons
    attention_matrix_size = 100
    margin = 0.1
    gpu_mem_usage = 0.75
    gpu_device = "/gpu:0"

    embeddings, word2idx = data_helpers.load_embedding('vectors.nobin')
    voc = data_helpers.load_vocab('D:\\DataMining\\Datasets\\insuranceQA\\V1\\vocabulary')
    all_answers = data_helpers.load_answers(
        'D:\\DataMining\\Datasets\\insuranceQA\\V1\\answers.label.token_idx', voc)
    questions, answers, labels, qids, aids = data_helpers.load_test_data(
        'D:\\DataMining\\Datasets\\insuranceQA\\V1\\question.test1.label.token_idx.pool',
        all_answers, voc, word2idx, sequence_length)

    with tf.Graph().as_default(), tf.device(gpu_device):
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_mem_usage)
        session_conf = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
        model = QALSTM(batch_size, sequence_length, embeddings, embedding_size,
                       rnn_size, margin, attention_matrix_size)
        with tf.Session(config=session_conf).as_default() as sess:
            saver = tf.train.Saver()
            print("Start loading the model")
            saver.restore(sess, trained_model)
            print("Model loaded")
            scores = []
            for question, answer in data_helpers.test_batch_iter(questions, answers,
                                                                 batch_size):
                feed_dict = {model.qtest: question, model.atest: answer}
                score = sess.run([model.scores], feed_dict)
                scores.extend(score[0].tolist())
            MAP, MRR = eval_map_mrr(qids, aids, scores, labels)
            print('MAP %2.3f\tMRR %2.3f' % (MAP, MRR))
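# eval_map_mrr is defined elsewhere in the project. As a reference, here is a
# minimal sketch of the standard computation, assuming labels holds 1 for
# correct answers: group candidate scores per question id, rank by score, then
# average precision-at-hit (MAP) and the reciprocal first-hit rank (MRR).
from collections import defaultdict

def eval_map_mrr(qids, aids, scores, labels):
    """Hypothetical re-implementation of the MAP/MRR evaluation."""
    per_question = defaultdict(list)
    for qid, aid, score, label in zip(qids, aids, scores, labels):
        per_question[qid].append((score, label))
    ap_sum, rr_sum = 0.0, 0.0
    for candidates in per_question.values():
        candidates.sort(key=lambda x: x[0], reverse=True)  # rank by score
        hits, precisions, rr = 0, [], 0.0
        for rank, (_, label) in enumerate(candidates, start=1):
            if label == 1:
                hits += 1
                precisions.append(hits / rank)
                if rr == 0.0:
                    rr = 1.0 / rank  # reciprocal rank of the first hit
        ap_sum += sum(precisions) / max(hits, 1)
        rr_sum += rr
    n = len(per_question)
    return ap_sum / n, rr_sum / n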
    # FLAGS._parse_flags()
    # FLAGS(sys.argv)
    print("\nParameters:")
    print(FLAGS)

    # Load data
    print("Loading data...")
    trainset = Dataset('../../data/' + FLAGS.dataset + '/train.ss')
    devset = Dataset('../../data/' + FLAGS.dataset + '/dev.ss')
    testset = Dataset('../../data/' + FLAGS.dataset + '/test.ss')
    alldata = np.concatenate([trainset.t_docs, devset.t_docs, testset.t_docs], axis=0)
    embeddingpath = '../../data/' + FLAGS.dataset + '/embedding.txt'
    embeddingfile, wordsdict = data_helpers.load_embedding(embeddingpath, alldata,
                                                           FLAGS.embedding_dim)
    del alldata
    print("Loading data finished...")

    usrdict, prddict = trainset.get_usr_prd_dict()
    trainbatches = trainset.batch_iter(usrdict, prddict, wordsdict, FLAGS.n_class,
                                       FLAGS.batch_size, FLAGS.num_epochs,
                                       FLAGS.max_sen_len, FLAGS.max_doc_len)
    devset.genBatch(usrdict, prddict, wordsdict, FLAGS.batch_size,
                    FLAGS.max_sen_len, FLAGS.max_doc_len, FLAGS.n_class)
    testset.genBatch(usrdict, prddict, wordsdict, FLAGS.batch_size,
                     FLAGS.max_sen_len, FLAGS.max_doc_len, FLAGS.n_class)

    with tf.Graph().as_default():
        session_config = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            # the original snippet breaks off here; the matching ConfigProto
            # pattern elsewhere in this file continues with:
            log_device_placement=FLAGS.log_device_placement)
def main(argv):
    parser = argparse.ArgumentParser(description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset', dest='trainset', action='store',
                        metavar='TRAINSET', required=True, help='')
    parser.add_argument('--devset', dest='devset', action='store',
                        metavar='DEVSET', required=True, help='')
    parser.add_argument('--testset', dest='testset', action='store',
                        metavar='TESTSET', required=True, help='')
    parser.add_argument('--dataroot', dest='dataroot', action='store',
                        required=True, metavar='PATH', help='')
    args = parser.parse_args()

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)

    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot,
                                             labels=True, translations=True)
    devset = dataset_walker.dataset_walker(args.devset, dataroot=args.dataroot,
                                           labels=True, translations=True)
    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot,
                                            labels=True, translations=True)
    train_utters, dev_utters, test_utters = data_helpers.load_dstc5_dataset_multitask(
        trainset, devset, testset)
    train_utters += dev_utters

    context_case = 1
    # TODO: write the code that builds the previous-labels context here!
    # 1) the previous N speech acts (regardless of speaker)
    # 2) all speech acts (n of them) of the other speaker's utterances in the
    #    previous turn
    if context_case == 1:
        pass
    else:
        pass

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])
    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # build vocabulary (character-level)
    utters = [[char for char in utter[0]] for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    utters = [[char for char in utter[0]] for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    # build labels for the three tasks: category, attribute, full speech act
    train_labels_category = [utter[3] for utter in train_utters]
    test_labels_category = [utter[3] for utter in test_utters]
    train_labels_attr = [utter[4] for utter in train_utters]
    test_labels_attr = [utter[4] for utter in test_utters]
    train_labels_sa = [utter[5] for utter in train_utters]
    test_labels_sa = [utter[5] for utter in test_utters]

    label_binarizer_category = preprocessing.MultiLabelBinarizer()
    label_binarizer_category.fit(train_labels_category + test_labels_category)
    label_binarizer_attr = preprocessing.MultiLabelBinarizer()
    label_binarizer_attr.fit(train_labels_attr + test_labels_attr)
    label_binarizer_sa = preprocessing.MultiLabelBinarizer()
    label_binarizer_sa.fit(train_labels_sa + test_labels_sa)

    train_labels_category = label_binarizer_category.transform(train_labels_category)
    test_labels_category = label_binarizer_category.transform(test_labels_category)
    train_labels_attr = label_binarizer_attr.transform(train_labels_attr)
    test_labels_attr = label_binarizer_attr.transform(test_labels_attr)
    train_labels_sa = label_binarizer_sa.transform(train_labels_sa)
    test_labels_sa = label_binarizer_sa.transform(test_labels_sa)

    # split speakers into two sets
    tourist_train_indices = [i for i, utter in enumerate(train_utters)
                             if utter[1].lower() == 'tourist']
    guide_train_indices = [i for i, utter in enumerate(train_utters)
                           if utter[1].lower() == 'guide']
    tourist_test_indices = [i for i, utter in enumerate(test_utters)
                            if utter[1].lower() == 'tourist']
    guide_test_indices = [i for i, utter in enumerate(test_utters)
                          if utter[1].lower() == 'guide']

    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    # np.random.shuffle(tourist_test_indices)
    # np.random.shuffle(guide_test_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_labels_category = train_labels_category[tourist_train_indices]
    tourist_train_labels_attr = train_labels_attr[tourist_train_indices]
    tourist_train_labels_sa = train_labels_sa[tourist_train_indices]
    tourist_train_labels = (tourist_train_labels_category,
                            tourist_train_labels_attr,
                            tourist_train_labels_sa)

    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_labels_category = train_labels_category[guide_train_indices]
    guide_train_labels_attr = train_labels_attr[guide_train_indices]
    guide_train_labels_sa = train_labels_sa[guide_train_indices]
    guide_train_labels = (guide_train_labels_category,
                          guide_train_labels_attr,
                          guide_train_labels_sa)

    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_labels_category = test_labels_category[tourist_test_indices]
    tourist_test_labels_attr = test_labels_attr[tourist_test_indices]
    tourist_test_labels_sa = test_labels_sa[tourist_test_indices]
    tourist_test_labels = (tourist_test_labels_category,
                           tourist_test_labels_attr,
                           tourist_test_labels_sa)

    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_labels_category = test_labels_category[guide_test_indices]
    guide_test_labels_attr = test_labels_attr[guide_test_indices]
    guide_test_labels_sa = test_labels_sa[guide_test_indices]
    guide_test_labels = (guide_test_labels_category,
                         guide_test_labels_attr,
                         guide_test_labels_sa)

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    run_slu_task(embedding_matrix, vocabulary, label_binarizer_sa,
                 tourist_train_inputs, tourist_train_labels,
                 tourist_test_inputs, tourist_test_labels)
    run_slu_task(embedding_matrix, vocabulary, label_binarizer_sa,
                 guide_train_inputs, guide_train_labels,
                 guide_test_inputs, guide_test_labels)
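# Each of the three label sets above is a list of string-label lists, which
# MultiLabelBinarizer turns into a fixed-width 0/1 indicator matrix. A small
# standalone example of the scikit-learn API being used (the labels below are
# invented for illustration):
from sklearn import preprocessing

mlb = preprocessing.MultiLabelBinarizer()
mlb.fit([['QST_WHAT', 'INI_OPENING'], ['RES_WHAT']])
print(mlb.classes_)                               # ['INI_OPENING' 'QST_WHAT' 'RES_WHAT']
print(mlb.transform([['RES_WHAT', 'QST_WHAT']]))  # [[0 1 1]]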
def main(argv):
    parser = argparse.ArgumentParser(description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset', dest='trainset', action='store',
                        metavar='TRAINSET', required=True, help='')
    parser.add_argument('--testset', dest='testset', action='store',
                        metavar='TESTSET', required=True, help='')
    parser.add_argument('--dataroot', dest='dataroot', action='store',
                        required=True, metavar='PATH', help='')
    args = parser.parse_args()

    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot,
                                             labels=True, translations=True)
    sys.stderr.write('Loading training instances ... ')
    for call in trainset:
        context_utters = []
        context_utter_str = '<PAD/>'
        context_labels = []
        context_label = ['INI_OPENING']
        last_speaker = None
        for (log_utter, translations, label_utter) in call:
            transcript = data_helpers.tokenize_and_lower(log_utter['transcript'])
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr)
                                  for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))

            # on a speaker change, freeze the accumulated utterances and labels
            # into the context available to the new speaker's turn
            if last_speaker is not None and log_utter['speaker'] != last_speaker:
                if len(context_utters) > 0:
                    context_utter_str = ' <pause> '.join(context_utters)
                    context_label = context_labels[-1]
                else:
                    context_utter_str = '<PAD/>'
                    context_label = ['INI_OPENING']
                context_utters = []
                context_labels = []
                last_speaker = None

            if last_speaker is None or log_utter['speaker'] == last_speaker:
                context_utters += [transcript]   # accumulate context utterances
                context_labels += [sa_label_list]

            last_speaker = log_utter['speaker']
            train_utters += [(transcript, context_utter_str, log_utter['speaker'],
                              sa_label_list, log_utter['utter_index'], context_label)]
            # train_utters += [(transcript, context_utter_str, log_utter['speaker'],
            #                   sa_label_list, log_utter['utter_index'], sa_label_list)]
    sys.stderr.write('Done\n')

    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot,
                                            labels=True, translations=True)
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        context_utters = []
        context_utter_str = '<PAD/>'
        context_labels = []
        context_label = ['INI_OPENING']
        last_speaker = None
        for (log_utter, translations, label_utter) in call:
            try:
                translation = data_helpers.tokenize_and_lower(
                    translations['translated'][0]['hyp'])
            except Exception:
                translation = ''
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr)
                                  for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))

            if last_speaker is not None and log_utter['speaker'] != last_speaker:
                if len(context_utters) > 0:
                    context_utter_str = ' <pause> '.join(context_utters)
                    context_label = context_labels[-1]
                else:
                    context_utter_str = ''
                    context_label = ['INI_OPENING']
                context_utters = []
                context_labels = []
                last_speaker = None

            if last_speaker is None or log_utter['speaker'] == last_speaker:
                context_utters += [translation]   # accumulate context utterances
                context_labels += [sa_label_list]

            last_speaker = log_utter['speaker']
            test_utters += [(translation, context_utter_str, log_utter['speaker'],
                             sa_label_list, log_utter['utter_index'], context_label)]
            # test_utters += [(translation, context_utter_str, log_utter['speaker'],
            #                  sa_label_list, log_utter['utter_index'], sa_label_list)]

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])
    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)

    # build vocabulary
    utters = [utter[0].split(' ') for utter in train_utters]
    ctx_utters = [utter[1].split(' ') for utter in train_utters]
    print("max context utter length: %d" %
          max([len(ctx_utter) for ctx_utter in ctx_utters]))
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    pad_ctx_utters = data_helpers.pad_sentences(ctx_utters, max_sent_len)
    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_ctx_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    train_ctx_inputs = data_helpers.build_input_data(pad_ctx_utters, vocabulary)
    utters = [utter[0].split(' ') for utter in test_utters]
    ctx_utters = [utter[1].split(' ') for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    pad_ctx_utters = data_helpers.pad_sentences(ctx_utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    test_ctx_inputs = data_helpers.build_input_data(pad_ctx_utters, vocabulary)

    # build labels
    sa_train_labels = [utter[3] for utter in train_utters]
    sa_test_labels = [utter[3] for utter in test_utters]
    sa_train_ctx_labels = [utter[5] for utter in train_utters]
    sa_test_ctx_labels = [utter[5] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)
    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)
    train_ctx_labels = label_binarizer.transform(sa_train_ctx_labels)
    test_ctx_labels = label_binarizer.transform(sa_test_ctx_labels)

    # split speakers into two sets
    tourist_train_indices = [i for i, utter in enumerate(train_utters)
                             if utter[2].lower() == 'tourist']
    guide_train_indices = [i for i, utter in enumerate(train_utters)
                           if utter[2].lower() == 'guide']
    tourist_test_indices = [i for i, utter in enumerate(test_utters)
                            if utter[2].lower() == 'tourist']
    guide_test_indices = [i for i, utter in enumerate(test_utters)
                          if utter[2].lower() == 'guide']

    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_ctx_inputs = train_ctx_inputs[tourist_train_indices]
    tourist_train_labels = train_labels[tourist_train_indices]
    tourist_train_ctx_labels = train_ctx_labels[tourist_train_indices]

    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_ctx_inputs = train_ctx_inputs[guide_train_indices]
    guide_train_labels = train_labels[guide_train_indices]
    guide_train_ctx_labels = train_ctx_labels[guide_train_indices]

    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_ctx_inputs = test_ctx_inputs[tourist_test_indices]
    tourist_test_labels = test_labels[tourist_test_indices]
    tourist_test_ctx_labels = test_ctx_labels[tourist_test_indices]

    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_ctx_inputs = test_ctx_inputs[guide_test_indices]
    guide_test_labels = test_labels[guide_test_indices]
    guide_test_ctx_labels = test_ctx_labels[guide_test_indices]

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 tourist_train_inputs, tourist_train_ctx_inputs,
                 tourist_train_labels, tourist_train_ctx_labels,
                 tourist_test_inputs, tourist_test_ctx_inputs,
                 tourist_test_labels, tourist_test_ctx_labels)
    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 guide_train_inputs, guide_train_ctx_inputs,
                 guide_train_labels, guide_train_ctx_labels,
                 guide_test_inputs, guide_test_ctx_inputs,
                 guide_test_labels, guide_test_ctx_labels)
    print("")
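# The speaker-change bookkeeping above is the subtle part of this loader: each
# utterance is paired with the previous speaker's whole turn, joined with
# '<pause>', plus that turn's last speech-act label. A stripped-down,
# self-contained trace of just that logic on toy turns (dialogue and labels
# are invented; the same-speaker guard is folded away since it always holds
# after a reset):
turns = [('Guide', 'hello', ['INI_OPENING']),
         ('Guide', 'how can i help', ['QST_WHAT']),
         ('Tourist', 'i need a hotel', ['RES_WHAT'])]

context_utters, context_labels = [], []
context_str, context_label = '<PAD/>', ['INI_OPENING']
last_speaker = None
for speaker, text, labels in turns:
    if last_speaker is not None and speaker != last_speaker:
        # freeze the finished turn as context for the new speaker
        context_str = ' <pause> '.join(context_utters)
        context_label = context_labels[-1]
        context_utters, context_labels = [], []
    context_utters.append(text)
    context_labels.append(labels)
    last_speaker = speaker
    print(speaker, '| context:', context_str, '| context label:', context_label)
# The Tourist line prints context 'hello <pause> how can i help'
# with context label ['QST_WHAT'].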
def train():
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # load the vocab and embedding files
            vocab_table, vocab, vocab_size = load_vocab(FLAGS.vocab_file)
            embeddings = load_embedding(FLAGS.embed_file, vocab)

            train_iterator, train_next_batch = get_iterator(
                FLAGS.train_data_file, vocab_table, FLAGS.batch_size,
                FLAGS.max_seq_len, padding=True)
            dev_iterator, dev_next_batch = get_iterator(
                FLAGS.dev_data_file, vocab_table, 10000000,
                FLAGS.max_seq_len, padding=True)

            mode = tf.estimator.ModeKeys.TRAIN
            mymodel = model(vocab_size, l2_reg_lambda=FLAGS.l2_reg_lambda, mode=mode)

            global_step = tf.Variable(0, name="global_step", trainable=False)
            learning_rate = 0.001
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            grads_and_vars = optimizer.compute_gradients(mymodel.loss)
            # clip the gradient values (skip variables that get no gradient,
            # since tf.clip_by_value cannot handle None)
            clipped_gvs = [(tf.clip_by_value(grad, -1., 1.), var)
                           for grad, var in grads_and_vars if grad is not None]
            train_op = optimizer.apply_gradients(clipped_gvs,
                                                 global_step=global_step)

            # keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # output directory for models and summaries
            # timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, tf.flags.FLAGS.model + "_runs"))
            print("Writing to {}\n".format(out_dir))

            # summaries for loss
            loss_summary = tf.summary.scalar("loss", mymodel.loss)

            # train summaries
            train_summary_op = tf.summary.merge([loss_summary,
                                                 grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir,
                                                         sess.graph)

            # dev summaries
            dev_summary_op = tf.summary.merge([loss_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            # Checkpoint directory. TensorFlow assumes this directory already
            # exists, so we need to create it.
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            def train_step():
                """Run a single training step."""
                [batch] = sess.run([train_next_batch])
                feed_dict = {
                    mymodel.tokens: batch['tokens'],
                    mymodel.surf_features: batch['features'],
                    mymodel.input_y: batch['scores'],
                    mymodel.batchsize: batch['tokens'].shape[0]
                }
                _, step, summaries, loss = sess.run(
                    [train_op, global_step, train_summary_op, mymodel.loss],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}".format(time_str, step, loss))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(step, writer=None):
                """Evaluate the model on the dev set."""
                sess.run(dev_iterator.initializer)
                while True:
                    try:
                        [batch] = sess.run([dev_next_batch])
                        feed_dict = {
                            mymodel.tokens: batch['tokens'],
                            mymodel.surf_features: batch['features'],
                            mymodel.input_y: batch['scores'],
                            mymodel.batchsize: batch['tokens'].shape[0]
                        }
                        summaries, loss = sess.run(
                            [dev_summary_op, mymodel.loss], feed_dict)
                        print('--- dev loss: ', loss)
                        if writer:
                            writer.add_summary(summaries, step)
                    except tf.errors.OutOfRangeError:
                        print("End of dataset")
                        break
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}".format(time_str, step, loss))
                if writer:
                    writer.add_summary(summaries, step)

            # initialize all variables and lookup tables
            init_ops = [
                tf.global_variables_initializer(),
                tf.local_variables_initializer(),
                tf.tables_initializer()
            ]
            sess.run(init_ops)

            for epoch in range(FLAGS.num_epochs):
                # (re)initialize the pass over the training dataset
                sess.run(train_iterator.initializer)
                while True:
                    try:
                        train_step()
                        current_step = tf.train.global_step(sess, global_step)
                        # evaluate on the dev set
                        if current_step % FLAGS.evaluate_every == 0:
                            print("\nEvaluation:")
                            dev_step(current_step, writer=dev_summary_writer)
                            print("")
                        if current_step % FLAGS.checkpoint_every == 0:
                            path = saver.save(sess, checkpoint_prefix,
                                              global_step=current_step)
                            print("Saved model checkpoint to {}\n".format(path))
                    except tf.errors.OutOfRangeError:
                        print("End of dataset")
                        break
                print('-' * 100)
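# load_vocab and get_iterator come from elsewhere in this project. As a
# reference point, here is a minimal sketch of what load_vocab plausibly does
# with the TF 1.x lookup API, assuming one token per line in the vocab file
# (the exact return contract is an assumption):
import tensorflow as tf

def load_vocab(vocab_file):
    """Hypothetical sketch: build an id-lookup table from a token-per-line file."""
    with open(vocab_file) as f:
        vocab = [line.strip() for line in f]
    # maps string tokens -> integer ids inside the graph; unknown tokens fall
    # into a single OOV bucket at index len(vocab)
    vocab_table = tf.contrib.lookup.index_table_from_file(vocab_file,
                                                          num_oov_buckets=1)
    return vocab_table, vocab, len(vocab)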
def main(argv):
    parser = argparse.ArgumentParser(description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset', dest='trainset', action='store',
                        metavar='TRAINSET', required=True, help='')
    parser.add_argument('--testset', dest='testset', action='store',
                        metavar='TESTSET', required=True, help='')
    parser.add_argument('--dataroot', dest='dataroot', action='store',
                        required=True, metavar='PATH', help='')
    args = parser.parse_args()

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)
    ctx_len = int(params['context_length'])

    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot,
                                             labels=True, translations=True)
    sys.stderr.write('Loading training instances ... ')
    transcript_contexts = []
    for call in trainset:
        for i, (log_utter, translations, label_utter) in enumerate(call):
            transcript = data_helpers.tokenize_and_lower(log_utter['transcript'])
            transcript_contexts += [transcript]
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr)
                                  for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            # train_utters += [(transcript_contexts[max(0, i+1-ctx_len):i+1],
            #                   log_utter['speaker'], sa_label_list,
            #                   log_utter['utter_index'])]
            train_utters += [(transcript, log_utter['speaker'], sa_label_list,
                              log_utter['utter_index'])]
    sys.stderr.write('Done\n')

    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot,
                                            labels=True, translations=True)
    sys.stderr.write('Loading testing instances ... ')
    transcript_contexts = []
    for call in testset:
        for i, (log_utter, translations, label_utter) in enumerate(call):
            try:
                translation = data_helpers.tokenize_and_lower(
                    translations['translated'][0]['hyp'])
            except Exception:
                translation = ''
            transcript_contexts += [translation]
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr)
                                  for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            # test_utters += [(transcript_contexts[max(0, i+1-ctx_len):i+1],
            #                  log_utter['speaker'], sa_label_list,
            #                  log_utter['utter_index'])]
            test_utters += [(translation, log_utter['speaker'], sa_label_list,
                             log_utter['utter_index'])]

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])
    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # build vocabulary
    utters = [utter[0].split(' ') for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    utters = [utter[0].split(' ') for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    # make windowed input data as context
    train_inputs = data_helpers.build_windowed_input(train_inputs, ctx_len)
    test_inputs = data_helpers.build_windowed_input(test_inputs, ctx_len)

    # build labels
    sa_train_labels = [utter[2] for utter in train_utters]
    sa_test_labels = [utter[2] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)
    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)

    # split speakers into two sets
    tourist_train_indices = [i for i, utter in enumerate(train_utters)
                             if utter[1].lower() == 'tourist']
    guide_train_indices = [i for i, utter in enumerate(train_utters)
                           if utter[1].lower() == 'guide']
    tourist_test_indices = [i for i, utter in enumerate(test_utters)
                            if utter[1].lower() == 'tourist']
    guide_test_indices = [i for i, utter in enumerate(test_utters)
                          if utter[1].lower() == 'guide']

    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    # np.random.shuffle(tourist_test_indices)
    # np.random.shuffle(guide_test_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_labels = train_labels[tourist_train_indices]
    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_labels = train_labels[guide_train_indices]
    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_labels = test_labels[tourist_test_indices]
    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_labels = test_labels[guide_test_indices]

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    run_slu_sequence_task(embedding_matrix, vocabulary, label_binarizer,
                          tourist_train_inputs, tourist_train_labels,
                          tourist_test_inputs, tourist_test_labels)
    run_slu_sequence_task(embedding_matrix, vocabulary, label_binarizer,
                          guide_train_inputs, guide_train_labels,
                          guide_test_inputs, guide_test_labels)
    print("")
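# build_windowed_input is the step that turns per-utterance rows into
# ctx_len-sized context windows for the sequence task. A plausible sketch,
# assuming it stacks each row with its ctx_len - 1 predecessors and left-pads
# the start of the dialogue with zero rows (the real helper lives in
# data_helpers):
import numpy as np

def build_windowed_input(inputs, ctx_len):
    """Hypothetical sketch: (n, max_sent_len) -> (n, ctx_len, max_sent_len)."""
    n, sent_len = inputs.shape
    windowed = np.zeros((n, ctx_len, sent_len), dtype=inputs.dtype)
    for i in range(n):
        lo = max(0, i - ctx_len + 1)
        window = inputs[lo:i + 1]                      # row plus its predecessors
        windowed[i, ctx_len - len(window):] = window   # left-pad with zero rows
    return windowed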
def main(argv):
    parser = argparse.ArgumentParser(description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset', dest='trainset', action='store',
                        metavar='TRAINSET', required=True, help='')
    parser.add_argument('--testset', dest='testset', action='store',
                        metavar='TESTSET', required=True, help='')
    parser.add_argument('--dataroot', dest='dataroot', action='store',
                        required=True, metavar='PATH', help='')
    parser.add_argument('--roletype', dest='roletype', action='store',
                        choices=['guide', 'tourist'], required=True, help='speaker')
    args = parser.parse_args()

    threshold_predictor = None

    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot,
                                             labels=True, translations=True)
    sys.stderr.write('Loading training instances ... ')
    for call in trainset:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() != args.roletype:
                continue
            transcript = data_helpers.tokenize_and_lower(log_utter['transcript'])
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr)
                                  for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            train_utters += [(transcript, log_utter['speaker'], sa_label_list)]
    sys.stderr.write('Done\n')

    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot,
                                            labels=True, translations=True)
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() != args.roletype:
                continue
            try:
                translation = data_helpers.tokenize_and_lower(
                    translations['translated'][0]['hyp'])
            except Exception:
                translation = ''
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr)
                                  for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            test_utters += [(translation, log_utter['speaker'], sa_label_list)]

    pprint(train_utters[:2])
    pprint(test_utters[:2])

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)
    num_epochs = int(params['num_epochs'])
    validation_split = float(params['validation_split'])
    batch_size = int(params['batch_size'])
    multilabel = params['multilabel'] == "true"

    # build vocabulary
    sents = [utter[0].split(' ') for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_sents = data_helpers.pad_sentences(sents, max_sent_len)
    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_sents)
    print("vocabulary size: %d" % len(vocabulary))
    # params['max_sent_len'] = max_sent_len

    # build inputs
    train_inputs = data_helpers.build_input_data(pad_sents, vocabulary)
    test_sents = [utter[0].split(' ') for utter in test_utters]
    test_pad_sents = data_helpers.pad_sentences(test_sents, max_sent_len)
    test_inputs = data_helpers.build_input_data(test_pad_sents, vocabulary)

    # build labels
    sa_train_labels = [utter[2] for utter in train_utters]
    sa_test_labels = [utter[2] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)
    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)

    # split and shuffle data
    indices = np.arange(train_inputs.shape[0])
    np.random.shuffle(indices)
    train_inputs = train_inputs[indices]
    train_labels = train_labels[indices]
    num_validation = int(validation_split * train_inputs.shape[0])
    # x_train = train_inputs[:-num_validation]
    # y_train = train_labels[:-num_validation]
    # x_val = train_inputs[-num_validation:]
    # y_val = train_labels[-num_validation:]
    x_train = train_inputs
    y_train = train_labels
    x_test = test_inputs
    y_test = test_labels

    # construct PyTorch data loaders
    x_train = torch.from_numpy(x_train).long()
    y_train = torch.from_numpy(y_train).float()
    dataset_tensor = data_utils.TensorDataset(x_train, y_train)
    train_loader = data_utils.DataLoader(dataset_tensor, batch_size=batch_size,
                                         shuffle=True, num_workers=4,
                                         pin_memory=False)

    x_test = torch.from_numpy(x_test).long()
    y_test = torch.from_numpy(y_test).long()
    dataset_tensor = data_utils.TensorDataset(x_test, y_test)
    test_loader = data_utils.DataLoader(dataset_tensor, batch_size=batch_size,
                                        shuffle=False, num_workers=4,
                                        pin_memory=False)

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    # load model
    model = SluConvNet(params, embedding_matrix, len(vocabulary), y_train.shape[1])
    if torch.cuda.is_available():
        model = model.cuda()

    learning_rate = float(params['learning_rate'])
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = nn.MultiLabelSoftMarginLoss()
    # loss_fn = nn.BCEWithLogitsLoss()

    for epoch in range(num_epochs):
        model.train()  # set the model to training mode (apply dropout etc.)
        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = autograd.Variable(inputs), autograd.Variable(labels)
            if torch.cuda.is_available():
                inputs, labels = inputs.cuda(), labels.cuda()

            preds = model(inputs)  # already on GPU when the model is
            loss = loss_fn(preds, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if i % 100 == 0:
                print("current loss: %.4f" % loss)

        model.eval()  # set the model to evaluation mode
        # if threshold_predictor is None:
        threshold_predictor = train_threshold(model, train_loader, y_train.numpy())
        # count_predictor = train_count(model, train_loader, y_train.numpy())
        true_acts, pred_acts, metrics = evaluate(model, label_binarizer,
                                                 test_loader, y_test, multilabel,
                                                 threshold_predictor)
        # true_acts, pred_acts, metrics = evaluate_count(model, label_binarizer,
        #                                                test_loader, y_test,
        #                                                multilabel, count_predictor)
        print("Precision: %.4f\tRecall: %.4f\tF1-score: %.4f\n"
              % (metrics[0], metrics[1], metrics[2]))

    # end of training
    true_acts, pred_acts, metrics = evaluate(model, label_binarizer, test_loader,
                                             y_test, multilabel)
    print("Precision: %.4f\tRecall: %.4f\tF1-score: %.4f\n"
          % (metrics[0], metrics[1], metrics[2]))

    with open("pred_result_%s.txt" % args.roletype, "w") as f:
        for pred_act, true_act in zip(pred_acts, true_acts):
            f.write("pred: %s\ntrue: %s\n\n"
                    % (', '.join(pred_act), ', '.join(true_act)))
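# train_threshold and evaluate are project helpers that are not shown. As a
# reference point, a minimal sketch of a multilabel evaluation that skips the
# learned threshold and simply cuts sigmoid outputs at 0.5, scoring with
# scikit-learn; it assumes the model returns logits, and its signature is
# simplified relative to the real helper:
import numpy as np
import torch
from sklearn.metrics import precision_recall_fscore_support

def evaluate_fixed_threshold(model, label_binarizer, test_loader):
    """Hypothetical sketch: multilabel evaluation with a fixed 0.5 cutoff."""
    model.eval()
    all_true, all_pred = [], []
    with torch.no_grad():
        for inputs, labels in test_loader:
            if torch.cuda.is_available():
                inputs = inputs.cuda()
            probs = torch.sigmoid(model(inputs)).cpu().numpy()
            all_pred.append((probs >= 0.5).astype(int))
            all_true.append(labels.numpy())
    y_true = np.vstack(all_true)
    y_pred = np.vstack(all_pred)
    p, r, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='micro')
    true_acts = label_binarizer.inverse_transform(y_true)
    pred_acts = label_binarizer.inverse_transform(y_pred)
    return true_acts, pred_acts, (p, r, f1)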
def main(argv):
    parser = argparse.ArgumentParser(description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset', dest='trainset', action='store',
                        metavar='TRAINSET', required=True, help='')
    parser.add_argument('--devset', dest='devset', action='store',
                        metavar='DEVSET', required=True, help='')
    parser.add_argument('--testset', dest='testset', action='store',
                        metavar='TESTSET', required=True, help='')
    parser.add_argument('--dataroot', dest='dataroot', action='store',
                        required=True, metavar='PATH', help='')
    args = parser.parse_args()

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)

    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot,
                                             labels=True, translations=True)
    devset = dataset_walker.dataset_walker(args.devset, dataroot=args.dataroot,
                                           labels=True, translations=True)
    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot,
                                            labels=True, translations=True)
    train_utters, dev_utters, test_utters = data_helpers.load_dstc5_dataset(
        trainset, devset, testset)
    train_utters += dev_utters

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])
    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # build vocabulary (character-level)
    utters = [[char for char in utter[0]] for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    utters = [[char for char in utter[0]] for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    # build labels
    sa_train_labels = [utter[2] for utter in train_utters]
    sa_test_labels = [utter[2] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)
    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)

    # split speakers into two sets
    tourist_train_indices = [i for i, utter in enumerate(train_utters)
                             if utter[1].lower() == 'tourist']
    guide_train_indices = [i for i, utter in enumerate(train_utters)
                           if utter[1].lower() == 'guide']
    tourist_test_indices = [i for i, utter in enumerate(test_utters)
                            if utter[1].lower() == 'tourist']
    guide_test_indices = [i for i, utter in enumerate(test_utters)
                          if utter[1].lower() == 'guide']

    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    # np.random.shuffle(tourist_test_indices)
    # np.random.shuffle(guide_test_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_labels = train_labels[tourist_train_indices]
    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_labels = train_labels[guide_train_indices]
    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_labels = test_labels[tourist_test_indices]
    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_labels = test_labels[guide_test_indices]

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(
        vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 tourist_train_inputs, tourist_train_labels,
                 tourist_test_inputs, tourist_test_labels)
    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 guide_train_inputs, guide_train_labels,
                 guide_test_inputs, guide_test_labels)
    print("")
    logger.addHandler(console)
    logger.addHandler(handler)

    loss_save = 100.0
    patience = 0

    # word_index = data_helpers.load_wordindex("./conf/char.tsv")
    word_index = data_helpers.load_wordindex("./conf/word.tsv")
    sent_end_id = word_index["</s>"]
    # train_data = data_helpers.load_data(
    #     open(FLAGS.traindata_file, "r").readlines(), word_index)
    query_list, candidate_list, labels, test_data = data_helpers.load_data(
        open(FLAGS.testdata_file, "r").readlines(), word_index)
    embedding_mat = data_helpers.load_embedding(FLAGS.pretrain_embeddingfile,
                                                FLAGS.embedding_dim)
    assert len(word_index) == len(embedding_mat)
    embedding_mat = np.array(embedding_mat, dtype=np.float32)
    print("embedding_mat.shape")
    print(embedding_mat.shape)

    # Training
    model_ckpt_path = os.path.join(FLAGS.model_ckpt, "model")
    logger.info("logger test")

    def average_gradients(tower_grads):
        """Calculate the average gradient for each shared variable across all towers.

        Note that this function provides a synchronization point across all towers.

        Args:
            tower_grads: List of lists of (gradient, variable) tuples. The
                outer list is over individual gradients; the inner list is
                over the gradient calculation for each tower.
        Returns:
            List of (gradient, variable) pairs where the gradient has been
            averaged across all towers.
        """
        # The original snippet breaks off after "Args:"; the body below
        # follows the canonical TensorFlow multi-GPU example.
        average_grads = []
        for grad_and_vars in zip(*tower_grads):
            # each grad_and_vars is ((grad0_gpu0, var0_gpu0), ..., (grad0_gpuN, var0_gpuN))
            grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
            grad = tf.reduce_mean(tf.concat(grads, 0), 0)
            # variables are shared across towers, so the first tower's copy suffices
            average_grads.append((grad, grad_and_vars[0][1]))
        return average_grads
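# For context, a minimal sketch of how average_gradients is typically driven
# in the multi-tower pattern this fragment sets up; the loss function and the
# tower count are placeholders, not part of the original script:
NUM_GPUS = 2  # placeholder tower count

def tower_loss(x):
    """Placeholder per-tower loss; the real model lives elsewhere."""
    w = tf.get_variable("w", shape=[], initializer=tf.zeros_initializer())
    return tf.reduce_mean(tf.square(x - w))

optimizer = tf.train.AdamOptimizer(1e-3)
tower_grads = []
with tf.variable_scope(tf.get_variable_scope()):
    for i in range(NUM_GPUS):
        with tf.device("/gpu:%d" % i), tf.name_scope("tower_%d" % i):
            loss = tower_loss(tf.random_normal([8]))
            tf.get_variable_scope().reuse_variables()  # share weights across towers
            tower_grads.append(optimizer.compute_gradients(loss))
train_op = optimizer.apply_gradients(average_gradients(tower_grads))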
    temp = FLAGS.checkpoint
    restore = FLAGS.restore
    temp_num_epochs = FLAGS.num_epochs  # keep the epoch count requested on this run
    FLAGS = pk.load(open(FLAGS.checkpoint + "/FLAGS", "rb"))
    FLAGS.checkpoint = temp
    FLAGS.restore = restore
    starting_epoch = FLAGS.num_epochs   # epochs already completed by the saved run
    FLAGS.num_epochs = temp_num_epochs
    vocabulary = pk.load(open(FLAGS.checkpoint + "/vocabulary", "rb"))
    vocabulary_inv = pk.load(open(FLAGS.checkpoint + "/vocabulary_inv", "rb"))

    print("\nparameters:")
    for attr, value in sorted(FLAGS.__dict__.items()):
        print("  {} = {}".format(attr, value))

    # Load embeddings
    pretrained_embedding, FLAGS.embedding_dim = data_helpers.load_embedding(
        vocabulary, FLAGS.pretrained_embedding, FLAGS.embedding_dim)
    print("  dim. of word vector by setting", FLAGS.embedding_dim)

    # Training
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement,
        gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=FLAGS.gpu_memory))
    with tf.Session(config=session_conf) as sess:
        cnn = textNN(
            sequence_length_ment1=FLAGS.sequence_length_ment1,
            sequence_length_ment2=FLAGS.sequence_length_ment2,
            sequence_length_sents_ment1=FLAGS.sequence_length_sents_ment1,
            sequence_length_sents_ment2=FLAGS.sequence_length_sents_ment2,
            sequence_length_add_ment1=FLAGS.sequence_length_add_ment1,