# Shared imports assumed by the dictionary builders below; Dictionary is the
# project's tokenizer class (imported from its dataset module, as in the demo
# near the end of this file).
import json
import os
from collections import Counter

from dataset import Dictionary


def create_dictionary(dataroot):
    dictionary = Dictionary()
    files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json',
        'v2_OpenEnded_mscoco_test2015_questions.json',
        'v2_OpenEnded_mscoco_test-dev2015_questions.json'
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        qs = json.load(open(question_path))['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)
    print('words coming from vqa ', len(dictionary))

    # Add all collected words from imSitu; contains both overlaps with VQA
    # and new words.
    imsitu_words_path = os.path.join(
        dataroot, 'allnverbs_imsitu_words_nl2vqamatching.json')
    imsitu_words = json.load(open(imsitu_words_path))
    for label, eng_name in imsitu_words.items():
        dictionary.tokenize(eng_name, True)
    print('with words coming from imsitu ', len(dictionary))
    return dictionary
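# A minimal sketch of the Dictionary interface the builders in this file rely
# on. This is a hypothetical reconstruction for reference only, not the real
# class from the dataset module; it covers just the calls made here (tokenize,
# add_word, word_token, __len__), and the tokenization rule is an assumption.
class DictionarySketch(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def __len__(self):
        return len(self.idx2word)

    def add_word(self, word):
        # Register a word once and return its index.
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def word_token(self, sentence):
        # Lower-case, strip simple punctuation, and split into word tokens.
        sentence = sentence.lower().replace(',', '').replace('?', '')
        return sentence.replace("'s", " 's").split()

    def tokenize(self, sentence, add_word):
        # Map a sentence to word indices; optionally grow the vocabulary.
        # Unknown words are dropped when add_word is False.
        tokens = []
        for w in self.word_token(sentence):
            if add_word:
                tokens.append(self.add_word(w))
            elif w in self.word2idx:
                tokens.append(self.word2idx[w])
        return tokens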
def create_dictionary(dataroot):
    dictionary = Dictionary()
    files = ['imsitu_questions_prev.json']
    for path in files:
        question_path = os.path.join(dataroot, path)
        q_data = json.load(open(question_path))
        for verb, values in q_data.items():
            roles = values['roles']
            for role, info in roles.items():
                question = info['question']
                dictionary.tokenize(question, True)

    # Add all collected words from imSitu; contains both overlaps with VQA
    # and new words.
    imsitu_words_path = os.path.join(
        dataroot, 'allnverbsall_imsitu_words_nl2glovematching.json')
    imsitu_words = json.load(open(imsitu_words_path))
    for label, eng_name in imsitu_words.items():
        dictionary.tokenize(eng_name, True)
    print('with words coming from imsitu ', len(dictionary))
    return dictionary
def create_question_explain_dictionary(dataroot, thres):
    dictionary = Dictionary()
    counter = Counter()
    question_files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json',
        'v2_OpenEnded_mscoco_test2015_questions.json',
        'v2_OpenEnded_mscoco_test-dev2015_questions.json'
    ]
    explain_files = [
        'VQA-E_train_set.json',
        'VQA-E_val_set.json',
    ]

    # Count word frequencies over the VQA-E explanations.
    for path in explain_files:
        explain_path = os.path.join(dataroot, path)
        es = json.load(open(explain_path))
        for e in es:
            counter.update(dictionary.word_token(e['explanation'][0]))

    # Special tokens first, then explanation words above the frequency threshold.
    dictionary.add_word('<pad>')
    dictionary.add_word('<start>')
    dictionary.add_word('<end>')
    dictionary.add_word('<unk>')
    for word, cnt in counter.items():
        if cnt >= thres:
            dictionary.add_word(word)

    # Question words are added unconditionally.
    for path in question_files:
        question_path = os.path.join(dataroot, path)
        qs = json.load(open(question_path))['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)
    return dictionary
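# Minimal usage sketch for the explanation dictionary. The 'data' root and the
# threshold value are assumptions, not values taken from the source; words
# seen fewer than `thres` times in the explanations are excluded from the
# vocabulary and later fall back to the '<unk>' token.
if __name__ == '__main__':
    explain_dict = create_question_explain_dictionary('data', thres=5)
    print('vocabulary size:', len(explain_dict))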
def create_dictionary2(dataroot):
    dictionary = Dictionary()
    files = ['train/questions.txt']
    for path in files:
        question_path = os.path.join(dataroot, path)
        with open(question_path) as f:
            qs = f.read().split("\n")
        for q in qs:
            dictionary.tokenize(q, True)
    return dictionary
try:
    import cPickle  # Python 2
except ImportError:
    import pickle as cPickle  # Python 3 fallback


def create_dictionary(dataroot):
    dictionary = Dictionary()
    files = ['VQA_caption_traindataset.pkl', 'VQA_caption_valdataset.pkl']
    for path in files:
        question_path = os.path.join(dataroot, path)
        with open(question_path, 'rb') as f:
            dataset = cPickle.load(f)
        for idx in range(len(dataset)):
            captions = dataset[idx]['caption']
            for cap in captions:
                dictionary.tokenize(cap, True)
    return dictionary
def create_dictionary(dataroot):
    dictionary = Dictionary()
    files = ['allwords4verbq1.json']
    for path in files:
        question_path = os.path.join(dataroot, path)
        q_data = json.load(open(question_path))
        for label, eng_name in q_data.items():
            dictionary.tokenize(eng_name, True)
    return dictionary
def create_dictionary(dataroot):
    dictionary = Dictionary()
    files = ['imsitu_questions_prev.json']
    for path in files:
        question_path = os.path.join(dataroot, path)
        q_data = json.load(open(question_path))
        for verb, values in q_data.items():
            roles = values['roles']
            for role, info in roles.items():
                question = info['question']
                dictionary.tokenize(question, True)
    return dictionary
def create_dictionary(dataroot):
    dictionary = Dictionary()
    files = [
        'OpenEnded_abstract_v002_test2015_questions.json',
        'OpenEnded_abstract_v002_train2015_questions.json',
        'OpenEnded_abstract_v002_val2015_questions.json',
        'MultipleChoice_abstract_v002_test2015_questions.json',
        'MultipleChoice_abstract_v002_train2015_questions.json',
        'MultipleChoice_abstract_v002_val2015_questions.json'
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        qs = json.load(open(question_path))['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)
    return dictionary
def create_dictionary(dataroot):
    dictionary = Dictionary()
    files = ['vqacp_v2_train_questions.json',
             'vqacp_v2_test_questions.json']
    for path in files:
        question_path = os.path.join(dataroot, path)
        qs = json.load(open(question_path))
        for q in qs:
            dictionary.tokenize(q['question'], True)
            if 'train' in path:
                # Some VQA-CP train entries lack 'orig_question'; skip those.
                try:
                    dictionary.tokenize(q['orig_question'], True)
                except KeyError:
                    continue
    return dictionary
def create_dictionary(dataroot):
    dictionary = Dictionary()
    role_name_corrector = 'data/roles_namecorrected.json'
    role_name_dict = json.load(open(role_name_corrector))
    files = ['imsitu_questions_prev.json']
    for path in files:
        question_path = os.path.join(dataroot, path)
        q_data = json.load(open(question_path))
        for verb, values in q_data.items():
            roles = values['roles']
            for role, info in roles.items():
                question = role_name_dict[role]
                dictionary.tokenize(question, True)
    return dictionary
def create_dictionary(dataroot):
    dictionary = Dictionary()

    # General questions.
    files = ['imsitu_questions_prev.json']
    for path in files:
        question_path = os.path.join(dataroot, path)
        q_data = json.load(open(question_path))
        for verb, values in q_data.items():
            roles = values['roles']
            for role, info in roles.items():
                question = info['question']
                dictionary.tokenize(question, True)

    # Templated words from the verb abstracts.
    with open(os.path.join(dataroot, 'role_abstracts.txt')) as f:
        content = f.readlines()
    verb_desc = [x.strip() for x in content]
    for desc in verb_desc:
        dictionary.tokenize(desc, True)

    # Labels.
    question_path = os.path.join(dataroot, 'all_label_mapping.json')
    q_data = json.load(open(question_path))
    for label, eng_name in q_data.items():
        dictionary.tokenize(eng_name, True)
    return dictionary
from dataset import Dictionary

if __name__ == '__main__':
    d = Dictionary()
    all_sent = d.get_all_sentence()
    print(all_sent[0], all_sent[1])
    token1 = d.tokenize(all_sent[0], False)
    token2 = d.tokenize(all_sent[1], False)
    print(token1, token2)
def create_dictionary(question):
    dictionary = Dictionary()
    dictionary.tokenize(question, True)
    return dictionary
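# Minimal end-to-end sketch for the per-question variant above (the question
# strings are illustrative assumptions): build a vocabulary from one question,
# then encode a related sentence without growing the vocabulary, so words the
# dictionary has not seen are simply dropped.
if __name__ == '__main__':
    d = create_dictionary('what is the man doing?')
    print(d.tokenize('what is the woman doing?', False))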