                                            FLAGS.word_embedding_size)
print("finish loading dep embedding")

fast_embedding_path = "fast-text/wiki.simple.vec"
fast_embedding_index = load_embedding(fast_embedding_path)
fast_embedding_matrix = get_embedding_matrix(word_index, fast_embedding_index,
                                             FLAGS.word_embedding_size)
print("finish loading fast embedding")

embedding_path = "glove.6B/glove.6B.{}d.txt".format(FLAGS.word_embedding_size)
embedding_index = load_embedding(embedding_path)
embedding_matrix = get_embedding_matrix(word_index, embedding_index,
                                        FLAGS.word_embedding_size)
print("finish loading linear embedding")

vocab_size = len(word_index)

# convert words to indices including padding and cutting
train_x = tokens_to_indices(word_index, train_tokens, MAXLEN)
valid_x = tokens_to_indices(word_index, valid_tokens, MAXLEN)
test_x = tokens_to_indices(word_index, test_tokens, MAXLEN)

train_data = zip(train_x, train_sq_len, train_ch, train_y)
valid_data = zip(valid_x, valid_sq_len, valid_ch, valid_y)
test_data = zip(test_x, test_sq_len, test_ch, test_y)
print("Embedding loaded")

# Create a model
graph = tf.Graph()
with graph.as_default():
    with tf.Session() as sess:
        mem_net = MemNet(vocab_size=vocab_size,
                         statment_size=MAXLEN,
                         word_embedding_size=FLAGS.word_embedding_size,
                         num_hops=FLAGS.num_hops,
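# The helpers load_embedding and get_embedding_matrix are used above but not
# defined in this listing. The sketch below shows one plausible implementation
# (read a GloVe/fastText-style text file into a {word: vector} dict, then build
# a matrix whose rows line up with word_index); the signatures and details are
# assumptions for illustration, not the original code.
import numpy as np

def load_embedding(path):
    """Read a whitespace-separated embedding file into {word: vector}."""
    embedding_index = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            if len(parts) <= 2:  # skip a fastText-style header line, if present
                continue
            embedding_index[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return embedding_index

def get_embedding_matrix(word_index, embedding_index, dim):
    """Build a (len(word_index) + 1, dim) matrix aligned with word_index.
    Row 0 is assumed to be the padding index; unknown words stay all-zero."""
    matrix = np.zeros((len(word_index) + 1, dim), dtype=np.float32)
    for word, i in word_index.items():
        vector = embedding_index.get(word)
        if vector is not None and vector.shape[0] == dim:
            matrix[i] = vector
    return matrix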
# text
train_tokens = texts_to_tokens(train_statement)
valid_tokens = texts_to_tokens(valid_statement)
test_tokens = texts_to_tokens(test_statement)

# text sequence lengths
train_sq_len = get_sequence_length(train_tokens)
valid_sq_len = get_sequence_length(valid_tokens)
test_sq_len = get_sequence_length(test_tokens)

# create word vocabulary from the data itself
wordlist = itertools.chain.from_iterable(train_tokens)
word_index, _ = build_vocab(wordlist)
vocab_size = len(word_index)

# convert words to indices including padding and cutting
train_x = tokens_to_indices(word_index, train_tokens, MAXLEN)
valid_x = tokens_to_indices(word_index, valid_tokens, MAXLEN)
test_x = tokens_to_indices(word_index, test_tokens, MAXLEN)

# convert topics to indices
max_topic = 5
train_topic = tokens_to_indices(topic_index, train_topic, max_topic)
valid_topic = tokens_to_indices(topic_index, valid_topic, max_topic)
test_topic = tokens_to_indices(topic_index, test_topic, max_topic)

# get topic sequence lengths
train_tp_sq = np.array([np.count_nonzero(t) for t in train_topic])
valid_tp_sq = np.array([np.count_nonzero(t) for t in valid_topic])
test_tp_sq = np.array([np.count_nonzero(t) for t in test_topic])

# speaker
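# texts_to_tokens, get_sequence_length, build_vocab and tokens_to_indices are
# not shown in this listing. The sketch below illustrates the assumed behaviour
# (whitespace tokenization, frequency-ordered vocabulary with index 0 reserved
# for padding, pad-or-cut to a fixed length); names and details are
# illustrative assumptions rather than the original implementation.
import collections
import numpy as np

def texts_to_tokens(texts):
    """Lowercase and split each statement into a list of tokens."""
    return [text.lower().split() for text in texts]

def get_sequence_length(token_lists):
    """Number of tokens in each statement (before padding/cutting)."""
    return np.array([len(tokens) for tokens in token_lists])

def build_vocab(wordlist):
    """Map words to indices by frequency; index 0 is left for padding."""
    counts = collections.Counter(wordlist)
    word_index = {w: i + 1 for i, (w, _) in enumerate(counts.most_common())}
    index_word = {i: w for w, i in word_index.items()}
    return word_index, index_word

def tokens_to_indices(index, token_lists, maxlen):
    """Convert token lists to an index matrix, cutting at maxlen and
    zero-padding shorter sequences; out-of-vocabulary tokens map to 0."""
    out = np.zeros((len(token_lists), maxlen), dtype=np.int32)
    for row, tokens in enumerate(token_lists):
        for col, token in enumerate(tokens[:maxlen]):
            out[row, col] = index.get(token, 0)
    return out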