def create_dictionary(dataroot):
    """Build a vocabulary from all VQA v2 question files plus the imSitu
    word list found under ``dataroot``.

    Args:
        dataroot: Directory containing the VQA question JSON files and
            ``allnverbs_imsitu_words_nl2vqamatching.json``.

    Returns:
        A ``Dictionary`` populated with every token seen (the ``True``
        flag presumably adds unseen words to the vocabulary — confirm
        against ``Dictionary.tokenize``).
    """
    dictionary = Dictionary()
    files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json',
        'v2_OpenEnded_mscoco_test2015_questions.json',
        'v2_OpenEnded_mscoco_test-dev2015_questions.json'
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        # Context manager closes the file promptly instead of leaking it.
        with open(question_path) as f:
            qs = json.load(f)['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)

    print('words coming from vqa ', len(dictionary))

    # Add all collected words from imSitu; contains both overlaps with
    # VQA as well as new words.
    imsitu_words_path = os.path.join(
        dataroot, 'allnverbs_imsitu_words_nl2vqamatching.json')
    with open(imsitu_words_path) as f:
        imsitu_words = json.load(f)

    for _label, eng_name in imsitu_words.items():
        dictionary.tokenize(eng_name, True)

    print(' with words coming from imsitu ', len(dictionary))

    return dictionary
def create_dictionary(dataroot):
    """Build a vocabulary from imSitu role questions plus the imSitu
    word-to-GloVe matching list under ``dataroot``.

    Args:
        dataroot: Directory containing ``imsitu_questions_prev.json`` and
            ``allnverbsall_imsitu_words_nl2glovematching.json``.

    Returns:
        A ``Dictionary`` populated with every token seen (the ``True``
        flag presumably adds unseen words — confirm against
        ``Dictionary.tokenize``).
    """
    dictionary = Dictionary()
    files = ['imsitu_questions_prev.json']

    for path in files:
        question_path = os.path.join(dataroot, path)
        # Context manager closes the file promptly instead of leaking it.
        with open(question_path) as f:
            q_data = json.load(f)

        # Expected schema: {verb: {'roles': {role: {'question': str}}}}
        for _verb, values in q_data.items():
            for _role, info in values['roles'].items():
                dictionary.tokenize(info['question'], True)

    # Add all collected words from imSitu; contains both overlaps with
    # VQA as well as new words.
    imsitu_words_path = os.path.join(
        dataroot, 'allnverbsall_imsitu_words_nl2glovematching.json')
    with open(imsitu_words_path) as f:
        imsitu_words = json.load(f)

    for _label, eng_name in imsitu_words.items():
        dictionary.tokenize(eng_name, True)

    print(' with words coming from imsitu ', len(dictionary))

    return dictionary
# --- Example 3 ---
def create_question_explain_dictionary(dataroot, thres):
    """Build a vocabulary from VQA-E explanations (frequency-thresholded)
    plus all VQA v2 questions.

    Explanation words are only added when they occur at least ``thres``
    times; question words are always added. Special tokens ``<pad>``,
    ``<start>``, ``<end>``, ``<unk>`` are inserted before any corpus word
    so they receive the lowest indices.

    Args:
        dataroot: Directory containing the VQA question files and the
            VQA-E train/val JSON files.
        thres: Minimum count for an explanation word to enter the
            vocabulary.

    Returns:
        A populated ``Dictionary``.
    """
    dictionary = Dictionary()
    counter = Counter()
    question_files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json',
        'v2_OpenEnded_mscoco_test2015_questions.json',
        'v2_OpenEnded_mscoco_test-dev2015_questions.json'
    ]
    explain_files = [
        'VQA-E_train_set.json',
        'VQA-E_val_set.json',
    ]
    for path in explain_files:
        explain_path = os.path.join(dataroot, path)
        # Context manager closes the file promptly instead of leaking it.
        with open(explain_path) as f:
            es = json.load(f)
        for e in es:
            # Only the first explanation per entry is counted.
            counter.update(dictionary.word_token(e['explanation'][0]))

    # Special tokens first so they occupy fixed low indices.
    dictionary.add_word('<pad>')
    dictionary.add_word('<start>')
    dictionary.add_word('<end>')
    dictionary.add_word('<unk>')
    for word, cnt in counter.items():
        if cnt >= thres:
            dictionary.add_word(word)
    for path in question_files:
        question_path = os.path.join(dataroot, path)
        with open(question_path) as f:
            qs = json.load(f)['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)

    return dictionary
# --- Example 4 ---
def create_dictionary2(dataroot):
    """Build a vocabulary from newline-separated plain-text question files.

    Args:
        dataroot: Directory containing ``train/questions.txt``.

    Returns:
        A ``Dictionary`` populated with the tokens of every line.
    """
    dictionary = Dictionary()
    # NOTE(review): the same file appears twice — one entry was likely
    # meant to be a val/test split. Kept as-is to preserve behavior;
    # tokenizing twice is redundant but harmless.
    files = ['train/questions.txt', 'train/questions.txt']
    for path in files:
        question_path = os.path.join(dataroot, path)
        # Context manager closes the file instead of leaking the handle.
        with open(question_path) as f:
            qs = f.read().split("\n")
        for q in qs:
            dictionary.tokenize(q, True)
    return dictionary
def create_dictionary(dataroot):
    """Build a vocabulary from the captions in pickled VQA caption datasets.

    Args:
        dataroot: Directory containing ``VQA_caption_traindataset.pkl``
            and ``VQA_caption_valdataset.pkl``.

    Returns:
        A ``Dictionary`` populated with every caption token.
    """
    dictionary = Dictionary()
    files = ['VQA_caption_traindataset.pkl', 'VQA_caption_valdataset.pkl']
    for path in files:
        question_path = os.path.join(dataroot, path)
        # Context manager closes the file instead of leaking the handle.
        with open(question_path, 'rb') as f:
            dataset = cPickle.load(f)
        # Each dataset entry presumably maps 'caption' to a list of
        # caption strings — confirm against the pickle's producer.
        for entry in dataset:
            for cap in entry['caption']:
                dictionary.tokenize(cap, True)
    return dictionary
# --- Example 6 ---
def create_dictionary(dataroot):
    """Build a vocabulary from the verb-question word list
    ``allwords4verbq1.json``.

    Args:
        dataroot: Directory containing ``allwords4verbq1.json``
            (label -> English name mapping).

    Returns:
        A ``Dictionary`` populated with every tokenized English name.
    """
    dictionary = Dictionary()
    files = ['allwords4verbq1.json']

    for path in files:
        question_path = os.path.join(dataroot, path)
        # Context manager closes the file promptly instead of leaking it.
        with open(question_path) as f:
            q_data = json.load(f)

        for _label, eng_name in q_data.items():
            dictionary.tokenize(eng_name, True)

    return dictionary
def create_dictionary(dataroot):
    """Build a vocabulary from imSitu role questions.

    Args:
        dataroot: Directory containing ``imsitu_questions_prev.json``
            with schema {verb: {'roles': {role: {'question': str}}}}.

    Returns:
        A ``Dictionary`` populated with every question token.
    """
    dictionary = Dictionary()
    files = [
        'imsitu_questions_prev.json'
    ]

    for path in files:
        question_path = os.path.join(dataroot, path)
        # Context manager closes the file promptly instead of leaking it.
        with open(question_path) as f:
            q_data = json.load(f)

        for _verb, values in q_data.items():
            for _role, info in values['roles'].items():
                dictionary.tokenize(info['question'], True)

    return dictionary
def create_dictionary(dataroot):
    """Build a vocabulary from the abstract-scenes VQA question files
    (both OpenEnded and MultipleChoice splits).

    Args:
        dataroot: Directory containing the abstract_v002 question JSONs.

    Returns:
        A ``Dictionary`` populated with every question token.
    """
    dictionary = Dictionary()
    files = [
        'OpenEnded_abstract_v002_test2015_questions.json',
        'OpenEnded_abstract_v002_train2015_questions.json',
        'OpenEnded_abstract_v002_val2015_questions.json',
        'MultipleChoice_abstract_v002_test2015_questions.json',
        'MultipleChoice_abstract_v002_train2015_questions.json',
        'MultipleChoice_abstract_v002_val2015_questions.json'
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        # Context manager closes the file promptly instead of leaking it.
        with open(question_path) as f:
            qs = json.load(f)['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)
    return dictionary
def create_dictionary(dataroot):
    """Build a vocabulary from the VQA-CP v2 question files.

    For the train split, the original question text (``orig_question``)
    is also tokenized when present.

    Args:
        dataroot: Directory containing ``vqacp_v2_train_questions.json``
            and ``vqacp_v2_test_questions.json`` (top-level JSON lists).

    Returns:
        A ``Dictionary`` populated with every question token.
    """
    dictionary = Dictionary()
    files = ['vqacp_v2_train_questions.json',
        'vqacp_v2_test_questions.json'
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        # Context manager closes the file promptly instead of leaking it.
        with open(question_path) as f:
            qs = json.load(f)
        for q in qs:
            dictionary.tokenize(q['question'], True)
            if 'train' in path:
                # Narrowed from a bare except: only a missing key is an
                # expected, ignorable condition here.
                try:
                    dictionary.tokenize(q['orig_question'], True)
                except KeyError:
                    continue

    return dictionary
# --- Example 10 ---
def create_dictionary(dataroot):
    """Build a vocabulary from corrected imSitu role names.

    Each role in ``imsitu_questions_prev.json`` is mapped through the
    name-corrector file ``data/roles_namecorrected.json`` and the
    corrected name is tokenized (roles repeat across verbs, so names are
    tokenized multiple times; presumed harmless for vocabulary building).

    Args:
        dataroot: Directory containing ``imsitu_questions_prev.json``.

    Returns:
        A ``Dictionary`` populated with every corrected role-name token.
    """
    dictionary = Dictionary()
    # NOTE: hard-coded relative path — resolved from the working
    # directory, not from dataroot.
    role_name_corrector = 'data/roles_namecorrected.json'
    with open(role_name_corrector) as f:
        role_name_dict = json.load(f)
    files = [
        'imsitu_questions_prev.json'
    ]

    for path in files:
        question_path = os.path.join(dataroot, path)
        # Context manager closes the file promptly instead of leaking it.
        with open(question_path) as f:
            q_data = json.load(f)

        for _verb, values in q_data.items():
            for role in values['roles']:
                dictionary.tokenize(role_name_dict[role], True)

    return dictionary
# --- Example 11 ---
def create_dictionary(dataroot):
    """Build a vocabulary from three imSitu sources: role questions,
    verb abstract descriptions, and the full label mapping.

    Args:
        dataroot: Directory containing ``imsitu_questions_prev.json``,
            ``role_abstracts.txt`` (one description per line), and
            ``all_label_mapping.json`` (label -> English name).

    Returns:
        A ``Dictionary`` populated with tokens from all three sources.
    """
    dictionary = Dictionary()

    # 1) General role questions.
    files = [
        'imsitu_questions_prev.json'
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        # Context manager closes the file promptly instead of leaking it.
        with open(question_path) as f:
            q_data = json.load(f)

        for _verb, values in q_data.items():
            for _role, info in values['roles'].items():
                dictionary.tokenize(info['question'], True)

    # 2) Templated verb descriptions (one per line).
    with open(os.path.join(dataroot, 'role_abstracts.txt')) as f:
        verb_desc = [line.strip() for line in f]

    for desc in verb_desc:
        dictionary.tokenize(desc, True)

    # 3) Label names.
    label_path = os.path.join(dataroot, 'all_label_mapping.json')
    with open(label_path) as f:
        label_data = json.load(f)

    for _label, eng_name in label_data.items():
        dictionary.tokenize(eng_name, True)

    return dictionary
# --- Example 12 ---
from dataset import Dictionary

if __name__ == '__main__':
    # Smoke test: fetch two sentences from the Dictionary and tokenize
    # them without adding new words (False flag).
    # Fixed: Python 2 `print` statements were syntax errors under
    # Python 3, which the surrounding code targets (print() calls).
    d = Dictionary()
    all_sent = d.get_all_sentence()

    print(all_sent[0], all_sent[1])
    token1 = d.tokenize(all_sent[0], False)
    token2 = d.tokenize(all_sent[1], False)
    print(token1, token2)
def create_dictionary(question):
    """Create a Dictionary seeded with the tokens of a single question.

    Args:
        question: The question string to tokenize.

    Returns:
        A ``Dictionary`` containing the question's tokens.
    """
    vocab = Dictionary()
    vocab.tokenize(question, True)
    return vocab