# ---- Example #1 ----
# Supported STS evaluation tracks (presumably the SemEval 2017 language
# pairs — confirm against the task definition).
tracks = ['AR-AR', 'AR-EN', 'SP-SP', 'SP-EN', 'SP-EN-WMT', 'EN-EN', 'EN-TR']


print ("loading data...")
# NOTE(review): earlier cross-lingual loader calls, kept commented out for reference.
# train_sources, train_targets, train_scores = data_helper.load_cross_lang_sentence_data(train_path, True)
# dev_sources, dev_targets, dev_scores = data_helper.load_cross_lang_sentence_data(dev_path, True)
# test_sources, test_targets, test_scores = data_helper.load_cross_lang_sentence_data(test_path, False)
# Load (source sentence, target sentence, similarity score) triples per split.
train_sources, train_targets, train_scores = data_helper.load_sts_data(train_path)
dev_sources, dev_targets, dev_scores = data_helper.load_sts_data(dev_path)
test_sources, test_targets, test_scores = data_helper.load_sts_data(test_path)

# Vocabulary map and pretrained embedding matrix; the boolean flag's meaning
# is defined in data_helper.load_embedding2 — TODO confirm.
word2idx, word_embeddings = data_helper.load_embedding2(embedding_path, True)


# word to id
# Convert token sequences to id sequences bounded by seq_length; the second
# return value appears to be per-sentence lengths — confirm in utils.word2id.
train_sources, train_sources_length = utils.word2id(train_sources, word2idx, seq_length)
train_targets, train_targets_length = utils.word2id(train_targets, word2idx, seq_length)


dev_sources, dev_sources_length = utils.word2id(dev_sources, word2idx, seq_length)
dev_targets, dev_targets_length = utils.word2id(dev_targets, word2idx, seq_length)

test_sources, test_sources_length = utils.word2id(test_sources, word2idx, seq_length)
test_targets, test_targets_length = utils.word2id(test_targets, word2idx, seq_length)

# Turn scalar similarity scores into probability distributions over class_num
# bins ("porbs" is a typo in the utils helper's name; the call must match it).
train_score_probs = utils.build_porbs(train_scores, class_num)
dev_score_probs = utils.build_porbs(dev_scores, class_num)
test_score_probs = utils.build_porbs(test_scores, class_num)
def kl_distance(y_true, y_pred):
    """Symmetric KL divergence between two batches of probability distributions.

    Both inputs are clipped to [1e-10, 1] so the division and log stay finite.

    NOTE(review): the original block here was truncated after the first clip
    and implicitly returned None; completed to match the full implementation
    of the same function that appears later in this file.

    Args:
        y_true: tensor of target distributions, presumably (batch, classes).
        y_pred: tensor of predicted distributions, same shape.

    Returns:
        Scalar tensor: batch mean of (KL(true||pred) + KL(pred||true)) / 2.
    """
    y_true = kb.clip(y_true, 1e-10, 1.)
    y_pred = kb.clip(y_pred, 1e-10, 1.)
    avg_distance = (kb.sum(y_true * kb.log(y_true / y_pred), axis=1) +
                    kb.sum(y_pred * kb.log(y_pred / y_true), axis=1)) / 2.0
    return kb.mean(avg_distance)
# Model hyperparameters.
embedding_size = 300  # dimensionality of the word vectors
filter_sizes = [1, 2, 3]  # presumably CNN window widths — confirm in the model code
filter_num = 300  # presumably feature maps per filter size — confirm in the model code
batch_size = 64
tune_epochs_num = 300  # number of epochs for the tuning phase
drop_out_rate = 0.5  # NOTE(review): confirm whether this is keep- or drop-probability
regularizer_rate = 0.004  # weight of the regularization term

print("loading data...")

# Load (source, target, score) triples; the False flag's meaning is defined
# in data_helper.load_cross_lang_sentence_data — TODO confirm.
graph_sources, graph_targets, graph_scores = data_helper.load_cross_lang_sentence_data(
    graph_path, False)

# Vocabulary map and pretrained embedding matrix (flag semantics defined in data_helper).
word2idx, word_embeddings = data_helper.load_embedding(embedding_path, True)

# Map tokens to ids bounded by seq_length; the second return value appears
# to be per-sentence lengths — confirm in utils.word2id.
graph_sources, graph_sources_length = utils.word2id(graph_sources, word2idx,
                                                    seq_length)
graph_targets, graph_targets_length = utils.word2id(graph_targets, word2idx,
                                                    seq_length)

# Scores as probability distributions over class_num bins ("porbs" is a typo
# in the utils helper's name; the call must match it).
graph_score_probs = utils.build_porbs(graph_scores, class_num)

def kl_distance(y_true, y_pred):
    """Batch mean of the symmetrized KL divergence between two distributions.

    Inputs are clipped into [1e-10, 1] first so that neither the division
    nor the logarithm can blow up on zero probabilities.
    """
    y_true = kb.clip(y_true, 1e-10, 1.)
    y_pred = kb.clip(y_pred, 1e-10, 1.)
    # Forward and reverse KL terms, summed over the class axis per sample.
    forward = kb.sum(y_true * kb.log(y_true / y_pred), axis=1)
    backward = kb.sum(y_pred * kb.log(y_pred / y_true), axis=1)
    # Average the two directions, then average over the batch.
    return kb.mean((forward + backward) / 2.0)


def pearson(y_true, y_pred):
    pass  # body of pearson() lost in extraction — ---- Example #3 ---- follows
# Load (source, target, score) triples for each split of the cross-lingual task.
train_sources, train_targets, train_scores = data_helper.load_cross_lang_sentence_data(
    FLAGS.train_path)
dev_sources, dev_targets, dev_scores = data_helper.load_cross_lang_sentence_data(
    FLAGS.dev_path)
test_sources, test_targets, test_scores = data_helper.load_cross_lang_sentence_data(
    FLAGS.test_path)

# Source and target languages use separate vocabularies/embedding matrices.
# print() calls for Python 3 compatibility (the rest of this file uses print()).
print('load source embedding...')
source_word2idx, source_word_embedding = data_helper.load_embedding(
    FLAGS.source_embedding_path, True)
print('load target embedding...')
target_word2idx, target_word_embedding = data_helper.load_embedding(
    FLAGS.target_embedding_path, True)

# Map tokens to ids bounded by FLAGS.seq_length; the second return value
# appears to be per-sentence lengths — confirm in utils.word2id.
train_sources, train_sources_length = utils.word2id(train_sources,
                                                    source_word2idx,
                                                    FLAGS.seq_length)
train_targets, train_targets_length = utils.word2id(train_targets,
                                                    target_word2idx,
                                                    FLAGS.seq_length)

dev_sources, dev_sources_length = utils.word2id(dev_sources, source_word2idx,
                                                FLAGS.seq_length)
dev_targets, dev_targets_length = utils.word2id(dev_targets, target_word2idx,
                                                FLAGS.seq_length)

test_sources, test_sources_length = utils.word2id(test_sources,
                                                  source_word2idx,
                                                  FLAGS.seq_length)
# NOTE(review): this call was truncated in the original source; completed by
# analogy with the parallel train/dev calls above.
test_targets, test_targets_length = utils.word2id(test_targets,
                                                  target_word2idx,
                                                  FLAGS.seq_length)
# ---- Example #4 ----
# Parse command-line flags and dump the effective configuration.
# All prints normalized to print() — the original mixed Python 2 print
# statements with print() calls in the same block.
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()  # forces parsing in the older tf.flags API
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
sources, targets, scores = data_helper.load_sick_data(FLAGS.data_path)
word2idx, word_embedding = data_helper.load_embedding(FLAGS.embedding_path)
# Map token sequences to id sequences bounded by FLAGS.seq_length.
sources = utils.word2id(sources, word2idx, FLAGS.seq_length)
targets = utils.word2id(targets, word2idx, FLAGS.seq_length)

# dev_sources, dev_targets, dev_scores = data_helper.load_sts_data(FLAGS.dev_path)
# dev_sources = utils.word2id(dev_sources, word2idx, FLAGS.seq_length)
# dev_targets = utils.word2id(dev_targets, word2idx, FLAGS.seq_length)

# Split train/test set
# 5 fold cross-validation
sample_num = len(scores)
fold_num = sample_num // 5  # size of one fold (floor division, same as int(n / 5) for n >= 0)
# 1 - 2, 3, 4, 5
# train_sources, dev_sources = sources[fold_num:], sources[:fold_num]
# train_targets, dev_targets = targets[fold_num:], targets[:fold_num]
# train_scores, dev_scores = scores[fold_num:], scores[:fold_num]
# ---- Example #5 ----
# Load data
print("Loading data...")
# (source sentences, target sentences, gold similarity scores) per split.
train_sources, train_targets, train_scores = data_helper.load_sts_data(
    FLAGS.train_path)
dev_sources, dev_targets, dev_scores = data_helper.load_sts_data(
    FLAGS.dev_path)
test_sources, test_targets, test_scores = data_helper.load_sts_data(
    FLAGS.test_path)

# Hand-crafted lexical features, currently disabled:
# train_source_features, train_target_features = utils.get_all_handcraft_features(train_sources, train_targets, FLAGS.seq_length)
# dev_source_features, dev_target_features = utils.get_all_handcraft_features(dev_sources, dev_targets, FLAGS.seq_length)
# test_source_features, test_target_features = utils.get_all_handcraft_features(test_sources, test_targets, FLAGS.seq_length)

# Vocabulary map and pretrained embedding matrix (flag semantics defined in data_helper).
word2idx, word_embeddings = data_helper.load_embedding(FLAGS.embedding_path,
                                                       True)
# Map token sequences to id sequences bounded by FLAGS.seq_length.
train_sources = utils.word2id(train_sources, word2idx, FLAGS.seq_length)
train_targets = utils.word2id(train_targets, word2idx, FLAGS.seq_length)
dev_sources = utils.word2id(dev_sources, word2idx, FLAGS.seq_length)
dev_targets = utils.word2id(dev_targets, word2idx, FLAGS.seq_length)
test_sources = utils.word2id(test_sources, word2idx, FLAGS.seq_length)
test_targets = utils.word2id(test_targets, word2idx, FLAGS.seq_length)

# Gold scores as probability distributions over FLAGS.class_num bins
# ("porbs" is a typo in the utils helper's name; the call must match it).
dev_score_probs = utils.build_porbs(dev_scores, FLAGS.class_num)
test_score_probs = utils.build_porbs(test_scores, FLAGS.class_num)

print("Train/Dev split: {:d}/{:d}".format(len(train_scores), len(dev_scores)))

# Timestamp, presumably used to name this run's output artifacts — confirm below.
time_stamp = str(int(time.time()))
# Training
# ==================================================