Example #1
def create_complete_dataset():
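    # Build the vocabulary and embeddings, write the train, validation and test
    # record files, then save the resulting embedding matrix.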
    embeddings = load_and_create_vocab()
    create_data_set(movie.cfg.QA_JSON, 'train', data_conf.TRAIN_RECORD_PATH, embeddings)
    create_data_set(data_conf.EVAL_FILE, 'val', data_conf.EVAL_RECORD_PATH, embeddings)
    create_data_set(movie.cfg.QA_JSON, 'test', data_conf.TEST_RECORD_PATH, embeddings)
    print("saving embeddings")
    util.save_embeddings(data_conf.EMBEDDING_DIR, data_conf.EMBEDDING_SIZE)
Example #2
def create_200_random_validation_dataset(qa_ids_file):
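    # Build a validation set restricted to the QA ids read from qa_ids_file and
    # write the records to a dedicated 'val_random_200' folder.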
    embeddings = load_and_create_vocab()

    outfolder = os.path.join(data_conf.RECORD_DIR, 'val_random_200')
    if not os.path.exists(outfolder):
        os.makedirs(outfolder)

    qa_ids = read_qa_ids(qa_ids_file)

    create_movieqa_data(movie.cfg.QA_JSON, 'val', outfolder, embeddings, qa_ids)

    print("saving embeddings")
    util.save_embeddings(data_conf.EMBEDDING_DIR, data_conf.EMBEDDING_SIZE)
Example #3
def create_validation_dataset(split):
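    # Recreate the validation records for the given split and return the size of
    # the vocabulary after any new words have been added.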
    print("Prepare embeddings for modified input ...")
    embeddings = load_and_create_vocab()

    create_movieqa_data(data_conf.EVAL_FILE, split, data_conf.EVAL_RECORD_PATH, embeddings)
    # save updated vocab file with additional new words
    new_vocab_size = util.save_embeddings(data_conf.EMBEDDING_DIR, data_conf.EMBEDDING_SIZE)
    return new_vocab_size
Example #4
import os
import sys

present_path = os.path.dirname(os.path.realpath(sys.argv[0]))
sys.path.append(os.path.join(present_path, '../../'))

import core.util as util
import movieqa.data_conf as data_conf
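
# Load the pretrained GloVe model used to look up vectors for new words, and
# restore the previously saved vocabulary so it can be extended below.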

glove = util.loadGloveModel(data_conf.PRETRAINED_EMBEDDINGS_PATH)

#vectors, vocab = util.load_embeddings(data_conf.EMBEDDING_DIR)

util.restore_vocab(data_conf.EMBEDDING_DIR)

print("Restored vocab")

#rev_vocab = dict(zip(vocab.values(), vocab.keys()))
#print("Current vocabulary %s with %d entries" % (str(rev_vocab), len(rev_vocab)))

# Look up a vector for every word in the common-English word list; unseen words
# are expected to be added to the restored vocabulary by util.get_word_vector.
filename = "adversarial_addAny/common_english.txt"
with open(filename, encoding="utf8") as fin:
    for line in fin:
        word = line.replace('\n', '')
        print("get word vector for %s" % word)
        vec = util.get_word_vector(glove, word, data_conf.EMBEDDING_SIZE)

vsize = util.save_embeddings(data_conf.EMBEDDING_DIR, data_conf.EMBEDDING_SIZE)
print("New vocabulary size %d" % vsize)