# Cross-lingual STS evaluation tracks supported by this script.
tracks = ['AR-AR', 'AR-EN', 'SP-SP', 'SP-EN', 'SP-EN-WMT', 'EN-EN', 'EN-TR']

print("loading data...")

# Load (source sentence, target sentence, gold score) triples per split.
train_sources, train_targets, train_scores = data_helper.load_sts_data(train_path)
dev_sources, dev_targets, dev_scores = data_helper.load_sts_data(dev_path)
test_sources, test_targets, test_scores = data_helper.load_sts_data(test_path)

# Vocabulary index and pre-trained word vectors.
word2idx, word_embeddings = data_helper.load_embedding2(embedding_path, True)

# Map each sentence to a fixed-length id sequence plus its true length.
train_sources, train_sources_length = utils.word2id(train_sources, word2idx, seq_length)
train_targets, train_targets_length = utils.word2id(train_targets, word2idx, seq_length)
dev_sources, dev_sources_length = utils.word2id(dev_sources, word2idx, seq_length)
dev_targets, dev_targets_length = utils.word2id(dev_targets, word2idx, seq_length)
test_sources, test_sources_length = utils.word2id(test_sources, word2idx, seq_length)
test_targets, test_targets_length = utils.word2id(test_targets, word2idx, seq_length)

# Gold scores expanded into probability distributions over `class_num` bins.
# ("build_porbs" is the helper's own spelling in utils.)
train_score_probs = utils.build_porbs(train_scores, class_num)
dev_score_probs = utils.build_porbs(dev_scores, class_num)
test_score_probs = utils.build_porbs(test_scores, class_num)


def kl_distance(y_true, y_pred):
    # NOTE(review): this definition is truncated in this chunk -- only the
    # first statement of the body is visible here; the rest continues
    # beyond the visible text.
    y_true = kb.clip(y_true, 1e-10, 1.)
embedding_size = 300 filter_sizes = [1, 2, 3] filter_num = 300 batch_size = 64 tune_epochs_num = 300 drop_out_rate = 0.5 regularizer_rate = 0.004 print("loading data...") graph_sources, graph_targets, graph_scores = data_helper.load_cross_lang_sentence_data( graph_path, False) word2idx, word_embeddings = data_helper.load_embedding(embedding_path, True) graph_sources, graph_sources_length = utils.word2id(graph_sources, word2idx, seq_length) graph_targets, graph_targets_length = utils.word2id(graph_targets, word2idx, seq_length) graph_score_probs = utils.build_porbs(graph_scores, class_num) def kl_distance(y_true, y_pred): y_true = kb.clip(y_true, 1e-10, 1.) y_pred = kb.clip(y_pred, 1e-10, 1.) avg_distance = (kb.sum(y_true * kb.log(y_true / y_pred), axis=1) + kb.sum(y_pred * kb.log(y_pred / y_true), axis=1)) / 2.0 return kb.mean(avg_distance) def pearson(y_true, y_pred):
train_sources, train_targets, train_scores = data_helper.load_cross_lang_sentence_data( FLAGS.train_path) dev_sources, dev_targets, dev_scores = data_helper.load_cross_lang_sentence_data( FLAGS.dev_path) test_sources, test_targets, test_scores = data_helper.load_cross_lang_sentence_data( FLAGS.test_path) print 'load source embedding...' source_word2idx, source_word_embedding = data_helper.load_embedding( FLAGS.source_embedding_path, True) print 'load target embedding...' target_word2idx, target_word_embedding = data_helper.load_embedding( FLAGS.target_embedding_path, True) train_sources, train_sources_length = utils.word2id(train_sources, source_word2idx, FLAGS.seq_length) train_targets, train_targets_length = utils.word2id(train_targets, target_word2idx, FLAGS.seq_length) dev_sources, dev_sources_length = utils.word2id(dev_sources, source_word2idx, FLAGS.seq_length) dev_targets, dev_targets_length = utils.word2id(dev_targets, target_word2idx, FLAGS.seq_length) test_sources, test_sources_length = utils.word2id(test_sources, source_word2idx, FLAGS.seq_length) test_targets, test_targets_length = utils.word2id(test_targets, target_word2idx,
# Parse command-line flags and echo every (name, value) pair.
FLAGS = tf.flags.FLAGS
# NOTE(review): _parse_flags()/__flags are internal TF APIs, removed in later
# TF releases; kept as-is for the TF version this script targets.
FLAGS._parse_flags()
# print() function form replaces the Py2-only `print` statements: same output,
# and consistent with the print() calls already used in this chunk.
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load SICK sentence pairs with relatedness scores, then map each sentence
# to a fixed-length id sequence over the embedding vocabulary.
print("Loading data...")
sources, targets, scores = data_helper.load_sick_data(FLAGS.data_path)
word2idx, word_embedding = data_helper.load_embedding(FLAGS.embedding_path)
sources = utils.word2id(sources, word2idx, FLAGS.seq_length)
targets = utils.word2id(targets, word2idx, FLAGS.seq_length)

# Split train/test set: 5-fold cross-validation.
sample_num = len(scores)
fold_num = int(sample_num / 5)
# Fold 1 held out for dev, folds 2-5 for training, e.g.:
# train_sources, dev_sources = sources[fold_num:], sources[:fold_num]
# train_targets, dev_targets = targets[fold_num:], targets[:fold_num]
# train_scores, dev_scores = scores[fold_num:], scores[:fold_num]
# Load (source sentence, target sentence, gold score) triples per STS split.
print("Loading data...")
train_sources, train_targets, train_scores = data_helper.load_sts_data(
    FLAGS.train_path)
dev_sources, dev_targets, dev_scores = data_helper.load_sts_data(
    FLAGS.dev_path)
test_sources, test_targets, test_scores = data_helper.load_sts_data(
    FLAGS.test_path)

# Shared vocabulary and pre-trained embeddings for both sides of each pair.
word2idx, word_embeddings = data_helper.load_embedding(FLAGS.embedding_path, True)

# Map every sentence to a fixed-length id sequence.
train_sources = utils.word2id(train_sources, word2idx, FLAGS.seq_length)
train_targets = utils.word2id(train_targets, word2idx, FLAGS.seq_length)
dev_sources = utils.word2id(dev_sources, word2idx, FLAGS.seq_length)
dev_targets = utils.word2id(dev_targets, word2idx, FLAGS.seq_length)
test_sources = utils.word2id(test_sources, word2idx, FLAGS.seq_length)
test_targets = utils.word2id(test_targets, word2idx, FLAGS.seq_length)

# Gold scores as probability distributions over FLAGS.class_num bins.
# NOTE(review): only dev/test are converted here -- train probabilities are
# presumably built later (e.g. per batch); confirm against the training loop.
dev_score_probs = utils.build_porbs(dev_scores, FLAGS.class_num)
test_score_probs = utils.build_porbs(test_scores, FLAGS.class_num)

print("Train/Dev split: {:d}/{:d}".format(len(train_scores), len(dev_scores)))

# Unique id for this run's output directories/checkpoints.
time_stamp = str(int(time.time()))

# Training
# ==================================================