def __init__(self, args, logger):
    """Set up the experiment: load the dataset dictionary and build the model.

    Args:
        args: parsed CLI namespace (data_path, dataset, task_type, model_type,
            item_emb_size, no_cuda, ...).
        logger: logger used for progress messages.
    """
    self.args = args
    self.logger = logger
    Dict = Dictionary(
        data_path=os.path.join(args.data_path, args.dataset),
        task_type=args.task_type)
    self.dict = Dict.dict
    self.attr_len = Dict.attr_len
    # Total number of attribute combinations = product of per-attribute sizes.
    self.all_the_poss = reduce(mul, Dict.attr_len, 1)
    self.logger.info("Experiment initializing . . . ")

    # Build model on GPU when available and not explicitly disabled.
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    if args.model_type == 'POP':
        # Popularity baseline: no network is needed, just a marker string.
        self.model = 'POP'
    elif args.model_type in ('ETN', 'ETNA'):
        # was: any([True if args.model_type == m else False for m in [...]])
        self.model = ETNADemoPredictor(
            logger, args.model_type, len(self.dict),
            args.item_emb_size, Dict.attr_len, args.no_cuda).to(device)
    else:
        sys.exit()

    # POP has no trainable parameters, so it gets no optimizer.
    if args.model_type != 'POP':
        self.select_optimizer(self.model)
    self.logger.info(self.model)
    self.step_count = 0
def create_dictionary(dataroot):
    """Build a Dictionary from imSitu questions, role abstracts and labels."""
    dictionary = Dictionary()
    # General questions.
    files = ['imsitu_questions_prev.json']
    for path in files:
        question_path = os.path.join(dataroot, path)
        with open(question_path) as f:  # was json.load(open(...)): leaked the handle
            q_data = json.load(f)
        for verb, values in q_data.items():
            roles = values['roles']
            for role, info in roles.items():
                dictionary.tokenize(info['question'], True)
    # Templated words from the role abstracts.
    with open(os.path.join(dataroot, 'role_abstracts.txt')) as f:
        verb_desc = [x.strip() for x in f.readlines()]
    for desc in verb_desc:
        dictionary.tokenize(desc, True)
    # Labels.
    with open(os.path.join(dataroot, 'all_label_mapping.json')) as f:
        label_data = json.load(f)
    for label, eng_name in label_data.items():
        dictionary.tokenize(eng_name, True)
    return dictionary
def create_question_explain_dictionary(dataroot, thres):
    """Build a Dictionary from VQA-E explanations plus VQA v2 questions.

    Explanation words are kept only when they occur at least `thres` times;
    question words are always added.
    """
    dictionary = Dictionary()
    counter = Counter()
    question_files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json',
        'v2_OpenEnded_mscoco_test2015_questions.json',
        'v2_OpenEnded_mscoco_test-dev2015_questions.json',
    ]
    explain_files = [
        'VQA-E_train_set.json',
        'VQA-E_val_set.json',
    ]
    for path in explain_files:
        explain_path = os.path.join(dataroot, path)
        with open(explain_path) as f:  # was json.load(open(...)): leaked the handle
            es = json.load(f)
        for e in es:
            counter.update(dictionary.word_token(e['explanation'][0]))
    # Special tokens first so they get fixed, low indices.
    dictionary.add_word('<pad>')
    dictionary.add_word('<start>')
    dictionary.add_word('<end>')
    dictionary.add_word('<unk>')
    for word, cnt in counter.items():
        if cnt >= thres:
            dictionary.add_word(word)
    for path in question_files:
        question_path = os.path.join(dataroot, path)
        with open(question_path) as f:
            qs = json.load(f)['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)
    return dictionary
def create_dictionary(dataroot):
    """Build a Dictionary from imSitu questions plus glove-matched imSitu words."""
    dictionary = Dictionary()
    files = ['imsitu_questions_prev.json']
    for path in files:
        question_path = os.path.join(dataroot, path)
        with open(question_path) as f:  # was json.load(open(...)): leaked the handle
            q_data = json.load(f)
        for verb, values in q_data.items():
            roles = values['roles']
            for role, info in roles.items():
                dictionary.tokenize(info['question'], True)
    # Add all collected words from imSitu: contains both overlaps with VQA
    # as well as new words.
    imsitu_words_path = os.path.join(
        dataroot, 'allnverbsall_imsitu_words_nl2glovematching.json')
    with open(imsitu_words_path) as f:
        imsitu_words = json.load(f)
    for label, eng_name in imsitu_words.items():
        dictionary.tokenize(eng_name, True)
    print(' with words coming from imsitu ', len(dictionary))  # was .__len__()
    return dictionary
def create_dictionary(dataroot, dataset, old_dictionary=None, args=None):
    """Build a question Dictionary, optionally seeded from an existing one.

    Args:
        dataroot: root directory containing the 'vqa2' question files.
        dataset: dataset name; 'vqa2' additionally pulls in test-dev questions.
        old_dictionary: if given, its word<->index maps seed the new dictionary.
        args: optional namespace; when args.combine_with is set, question files
            from args.combine_with_dataroot are processed too.
    """
    dictionary = Dictionary()
    if old_dictionary is not None:
        print("Copying old dictionary to new dictionary")
        dictionary.word2idx = old_dictionary.word2idx
        dictionary.idx2word = old_dictionary.idx2word
    file_names = [
        'train_questions.json', 'val_questions.json', 'test_questions.json'
    ]
    if dataset.lower() == 'vqa2':
        file_names.append('test_dev_questions.json')
    files = [os.path.join(dataroot, 'vqa2', f) for f in file_names]
    # BUG FIX: args defaults to None but was dereferenced unconditionally,
    # which raised AttributeError whenever the caller omitted it.
    if args is not None and args.combine_with is not None:
        for cs in args.combine_with_splits:
            files.append(
                os.path.join(args.combine_with_dataroot, 'vqa2',
                             cs + "_questions.json"))
    print("files to process {}".format(files))
    for question_path in files:
        with open(question_path) as f:  # was json.load(open(...)): leaked the handle
            qs = json.load(f)['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)
    return dictionary
def create_dictionary(dataroot, task='vqa'):
    """Build a Dictionary from VQA v2 questions or Flickr30k entity phrases.

    task: 'vqa' tokenizes the four VQA v2 question files; 'flickr' tokenizes
    sentence phrases for the image ids listed in the split pickle files.
    """
    dictionary = Dictionary()
    if task == 'vqa':
        files = [
            'v2_OpenEnded_mscoco_train2014_questions.json',
            'v2_OpenEnded_mscoco_val2014_questions.json',
            'v2_OpenEnded_mscoco_test2015_questions.json',
            'v2_OpenEnded_mscoco_test-dev2015_questions.json',
        ]
        for path in files:
            question_path = os.path.join(dataroot, path)
            with open(question_path) as f:  # was json.load(open(...)): leaked the handle
                qs = json.load(f)['questions']
            for q in qs:
                dictionary.tokenize(q['question'], True)
    elif task == 'flickr':
        files = [
            'train_ids.pkl',
            'val_ids.pkl',
            'test_ids.pkl',
        ]
        sentence_dir = os.path.join(dataroot, 'Flickr30kEntities/Sentences')
        for path in files:
            ids_file = os.path.join(dataroot, path)
            with open(ids_file, 'rb') as f:
                imgids = cPickle.load(f)
            for image_id in imgids:
                question_path = os.path.join(sentence_dir, '%d.txt' % image_id)
                phrases = get_sent_data(question_path)
                for phrase in phrases:
                    dictionary.tokenize(phrase, True)
    return dictionary
def create_dictionary(dataroot):
    """Build a Dictionary from VQA v2 questions plus matched imSitu words."""
    dictionary = Dictionary()
    files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json',
        'v2_OpenEnded_mscoco_test2015_questions.json',
        'v2_OpenEnded_mscoco_test-dev2015_questions.json',
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        with open(question_path) as f:  # was json.load(open(...)): leaked the handle
            qs = json.load(f)['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)
    print('words coming from vqa ', len(dictionary))  # was .__len__()
    # Add all collected words from imSitu: contains both overlaps with VQA
    # as well as new words.
    imsitu_words_path = os.path.join(
        dataroot, 'allnverbs_imsitu_words_nl2vqamatching.json')
    with open(imsitu_words_path) as f:
        imsitu_words = json.load(f)
    for label, eng_name in imsitu_words.items():
        dictionary.tokenize(eng_name, True)
    print(' with words coming from imsitu ', len(dictionary))
    return dictionary
def create_dictionary2(dataroot):
    """Build a Dictionary from newline-separated question text files."""
    dictionary = Dictionary()
    # NOTE(review): both entries point at train/questions.txt — the second was
    # probably meant to be a val/test split; confirm before changing the path.
    files = ['train/questions.txt', 'train/questions.txt']
    for path in files:
        question_path = os.path.join(dataroot, path)
        with open(question_path) as f:  # was left open (leak)
            qs = f.read().split("\n")
        for q in qs:
            dictionary.tokenize(q, True)
    return dictionary
def create_dictionary(dataroot):
    """Build a Dictionary from pickled VQA caption datasets."""
    dictionary = Dictionary()
    files = ['VQA_caption_traindataset.pkl', 'VQA_caption_valdataset.pkl']
    for path in files:
        question_path = os.path.join(dataroot, path)
        with open(question_path, 'rb') as f:  # was cPickle.load(open(...)): leaked the handle
            dataset = cPickle.load(f)
        for idx in range(len(dataset)):
            captions = dataset[idx]['caption']
            for cap in captions:
                dictionary.tokenize(cap, True)
    return dictionary
def dispatch(cls, key, request):
    """Validate inputs, wrap the request params in a Dictionary, and dispatch.

    Raises:
        Exception: if either key or request is missing.
    """
    if key is None or request is None:
        # Same exception type as before, but with a diagnosable message.
        raise Exception("dispatch requires both key and request")
    kwargs = get_params(request)
    params = Dictionary()
    for k, v in kwargs.items():  # was: index lookup per key
        params.set(k, v)
    params.filter()
    return cls.hand_logic(params, key, request)
def create_dictionary(dataroot):
    """Build a Dictionary from the verb-question word mapping file."""
    dictionary = Dictionary()
    files = ['allwords4verbq1.json']
    for path in files:
        question_path = os.path.join(dataroot, path)
        with open(question_path) as f:  # was json.load(open(...)): leaked the handle
            q_data = json.load(f)
        for label, eng_name in q_data.items():
            dictionary.tokenize(eng_name, True)
    return dictionary
def create_dictionary(dataroot):
    """Build a Dictionary from the four VQA v2 question files."""
    dictionary = Dictionary()
    files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json',
        'v2_OpenEnded_mscoco_test2015_questions.json',
        'v2_OpenEnded_mscoco_test-dev2015_questions.json',
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        with open(question_path) as f:  # was json.load(open(...)): leaked the handle
            qs = json.load(f)['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)
    return dictionary
def create_dictionary(dataroot):
    """Build a Dictionary from the abstract-scenes VQA question files."""
    dictionary = Dictionary()
    files = [
        'OpenEnded_abstract_v002_test2015_questions.json',
        'OpenEnded_abstract_v002_train2015_questions.json',
        'OpenEnded_abstract_v002_val2015_questions.json',
        'MultipleChoice_abstract_v002_test2015_questions.json',
        'MultipleChoice_abstract_v002_train2015_questions.json',
        'MultipleChoice_abstract_v002_val2015_questions.json',
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        with open(question_path) as f:  # was json.load(open(...)): leaked the handle
            qs = json.load(f)['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)
    return dictionary
def load_model_data(config, is_train=True, eval_name="val"):
    """Construct datasets, embedding weights and the model.

    Returns (model, train_dset, eval_dset, embedding_weight, test_dset)
    when training, otherwise (model, eval_dset).
    """
    dictionary = Dictionary()
    embedding_weight = dictionary.create_glove_embedding_init(
        pre=True, pre_dir='../data/vocabs/embedding_weight.npy')

    if not is_train:
        # Evaluation-only path: a single split and the model built on it.
        eval_dset = TextVQA(eval_name, dictionary)
        model = build_model(eval_dset, config['model_attributes'])
        return model, eval_dset

    train_dset = TextVQA('train', dictionary)
    eval_dset = TextVQA('val', dictionary)
    # The test split is only materialized when it will be evaluated.
    test_dset = TextVQA('test', dictionary) if eval_name == "test" else None
    model = build_model(train_dset, config['model_attributes'])
    return model, train_dset, eval_dset, embedding_weight, test_dset
def create_dictionary(dataroot, tk='mecab'):
    """Build a Dictionary from KVQA annotations with a Korean tokenizer.

    Args:
        dataroot: directory containing the KVQA annotation files.
        tk: morphological analyzer to use, 'mecab' or 'kkma'.

    Raises:
        ValueError: for an unknown tokenizer name (previously this fell
            through silently and crashed later with a NameError).
    """
    dictionary = Dictionary()
    if tk == 'mecab':
        tokenizer = Mecab()
    elif tk == 'kkma':
        tokenizer = Kkma()
    else:
        raise ValueError(
            "unknown tokenizer %r (expected 'mecab' or 'kkma')" % tk)
    files = [
        'KVQA_annotations_train.json',
        'KVQA_annotations_val.json',
        'KVQA_annotations_test.json',
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        with open(question_path, encoding='utf-8') as f:  # was left open (leak)
            qs = json.load(f)
        for q in qs:
            dictionary.tokenize(
                tokenize_kvqa(q['question']), True, tokenizer.morphs)
    return dictionary
def create_dictionary(dataroot):
    """Build a Dictionary from the imSitu role questions."""
    dictionary = Dictionary()
    files = ['imsitu_questions_prev.json']
    for path in files:
        question_path = os.path.join(dataroot, path)
        with open(question_path) as f:  # was json.load(open(...)): leaked the handle
            q_data = json.load(f)
        for verb, values in q_data.items():
            roles = values['roles']
            for role, info in roles.items():
                dictionary.tokenize(info['question'], True)
    return dictionary
def create_dictionary(dataroot):
    """Build a Dictionary from VQA-CP v2 questions.

    Train entries may carry an 'orig_question' alongside 'question'; it is
    tokenized when present.
    """
    dictionary = Dictionary()
    files = ['vqacp_v2_train_questions.json', 'vqacp_v2_test_questions.json']
    for path in files:
        question_path = os.path.join(dataroot, path)
        with open(question_path) as f:  # was json.load(open(...)): leaked the handle
            qs = json.load(f)
        for q in qs:
            dictionary.tokenize(q['question'], True)
            if 'train' in path:
                try:
                    dictionary.tokenize(q['orig_question'], True)
                except KeyError:  # was a bare except: only a missing key is expected
                    continue
    return dictionary
def create_dictionary(dataroot):
    """Build a Dictionary from corrected role names of the imSitu questions."""
    dictionary = Dictionary()
    role_name_corrector = 'data/roles_namecorrected.json'
    with open(role_name_corrector) as f:  # was json.load(open(...)): leaked the handle
        role_name_dict = json.load(f)
    files = ['imsitu_questions_prev.json']
    for path in files:
        question_path = os.path.join(dataroot, path)
        with open(question_path) as f:
            q_data = json.load(f)
        for verb, values in q_data.items():
            roles = values['roles']
            for role, info in roles.items():
                # Tokenize the corrected role name, not the raw question text.
                dictionary.tokenize(role_name_dict[role], True)
    return dictionary
def create_dictionary(dataroot):
    """Build a Dictionary from VQA v2 questions plus HowMany-QA questions."""
    dictionary = Dictionary()
    files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json',
        'v2_OpenEnded_mscoco_test2015_questions.json',
        'v2_OpenEnded_mscoco_test-dev2015_questions.json',
        'how_many_qa/HowMany-QA/qzcreate.json',
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        with open(question_path) as f:  # was json.load(open(...)): leaked the handle
            qs = json.load(f)
        # VQA files wrap the list in a 'questions' key; HowMany-QA does not.
        if "HowMany-QA" not in path:
            qs = qs['questions']
        for q in qs:
            if 'question' in q:
                dictionary.tokenize(q['question'], True)
        print(path, " is ok")
    return dictionary
def create_explain_dictionary(dataroot, thres):
    """Build a Dictionary of VQA-E explanation words occurring >= thres times."""
    dictionary = Dictionary()
    counter = Counter()
    files = [
        'VQA-E_train_set.json',
        'VQA-E_val_set.json',
    ]
    for path in files:
        explain_path = os.path.join(dataroot, path)
        with open(explain_path) as f:  # was json.load(open(...)): leaked the handle
            es = json.load(f)
        for e in es:
            counter.update(dictionary.word_token(e['explanation'][0]))
    # Special tokens first so they get fixed, low indices.
    dictionary.add_word('<pad>')
    dictionary.add_word('<start>')
    dictionary.add_word('<end>')
    dictionary.add_word('<unk>')
    for word, cnt in counter.items():
        if cnt >= thres:
            dictionary.add_word(word)
    return dictionary
def create_caption_dictionary(dataroot, thres):
    """Build a Dictionary of COCO caption words occurring >= thres times."""
    dictionary = Dictionary()
    counter = Counter()
    files = [
        'captions_train2014.json',
        'captions_val2014.json',
    ]
    for path in files:
        caption_path = os.path.join(dataroot, path)
        with open(caption_path) as f:  # was json.load(open(...)): leaked the handle
            qs = json.load(f)['annotations']
        for q in qs:
            counter.update(dictionary.word_token(q['caption']))
    # Special tokens first so they get fixed, low indices.
    dictionary.add_word('<pad>')
    dictionary.add_word('<start>')
    dictionary.add_word('<end>')
    dictionary.add_word('<unk>')
    for word, cnt in counter.items():
        if cnt >= thres:
            dictionary.add_word(word)
    return dictionary
def create_VQAX_explain_dictionary(dataroot, thres):
    """Build a Dictionary of VQA-X explanation words occurring >= thres times."""
    dictionary = Dictionary()
    counter = Counter()
    files = [
        'train_exp_anno.json',
        'val_exp_anno.json',
        'test_exp_anno.json',
    ]
    for path in files:
        explain_path = os.path.join(dataroot, path)
        with open(explain_path) as f:  # was json.load(open(...)): leaked the handle
            es = json.load(f)
        # was: .items() with only the value used
        for explanations in es.values():
            for explanation in explanations:
                counter.update(dictionary.word_token(explanation))
    # Special tokens first so they get fixed, low indices.
    dictionary.add_word('<pad>')
    dictionary.add_word('<start>')
    dictionary.add_word('<end>')
    dictionary.add_word('<unk>')
    for word, cnt in counter.items():
        if cnt >= thres:
            dictionary.add_word(word)
    return dictionary
def create_dictionary(dataroot, only_image_questions):
    """Build a Dictionary from the AAAI-split VisDial data and VQA v2 questions.

    Args:
        dataroot: root directory for the json files.
        only_image_questions: when True, keep only q_type == 'image' examples
            from the AAAI split; otherwise keep all examples with an image.
    """
    dictionary = Dictionary()
    files = [
        'official_aaai_split_train_data.json',
        'v2_OpenEnded_mscoco_train2014_questions.json',
    ]

    # Hoisted out of the per-file loop: the original reloaded and retokenized
    # visdial_1.0_train.json once per question file — pure repeated work
    # (assuming Dictionary.tokenize re-adding known words is a no-op — confirm).
    with open(os.path.join(dataroot, 'visdial_1.0_train.json')) as f:
        dialogs = json.load(f)['data']['dialogs']
    for dia in dialogs:
        dictionary.tokenize(dia['caption'], True)

    for path in files:
        question_path = os.path.join(dataroot, path)
        with open(question_path) as f:  # was json.load(open(...)): leaked the handle
            data = json.load(f)
        if path == 'official_aaai_split_train_data.json':
            if only_image_questions:
                qs = [example for example in data
                      if example['q_type'] == 'image']
            else:
                qs = [example for example in data
                      if example['image'] is not None]
        else:
            qs = data['questions']
        for example in qs:
            dictionary.tokenize(example['question'], True)
            if path == 'official_aaai_split_train_data.json':
                dictionary.tokenize(example['image']['caption'], True)
    return dictionary
def __init__(self):
    """Create an empty item node: default name, no id, no attributes, no children."""
    self.__attr = Dictionary()   # attribute map
    self.__children = List()     # ordered child nodes
    self.__name = "item"         # default node name
    self.__id = ""               # empty until assigned
# --- ESIM training setup (script top level) ---
args = get_args()
print(args)
# set the random seed manually for reproducibility
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        torch.cuda.manual_seed(args.seed)
# Load Dictionary
assert os.path.exists(args.train_data)
assert os.path.exists(args.val_data)
# Vocabulary is built from the training CSV; its size feeds the model config.
dictionary = Dictionary(join_path(data_dir,'data/atec_nlp_sim_train.csv'))
args.vocab_size = len(dictionary)
best_val_loss = None
best_f1 = None
n_token = len(dictionary)
model = ESIM(args)
if torch.cuda.is_available():
    model = model.cuda()
print(model)
print('Begin to load data.')
# Both splits share the dictionary's word->index map and sequence length.
train_data = MyDataset(args.train_data, args.sequence_length, dictionary.word2idx, args.char_model)
val_data = MyDataset(args.val_data, args.sequence_length, dictionary.word2idx, args.char_model)
train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True, num_workers=16)
# Validation is evaluated one example at a time, in order.
val_loader = DataLoader(val_data, batch_size=1, shuffle=False)
# NOTE(review): the body of this try block continues beyond this chunk.
try:
# --- baseline action-recognition training setup (script top level) ---
import baseline
from train import train
import utils
import config
import os

dirs_list = ['./info', './save_models']

if __name__ == '__main__':
    opt = config.parse_opt()
    torch.cuda.set_device(1)
    # Seed both CPU and GPU RNGs for reproducibility.
    torch.manual_seed(opt.SEED)
    torch.cuda.manual_seed(opt.SEED)
    # NOTE(review): 'bechmark' is a typo — this sets a nonexistent attribute
    # and silently does nothing; it should be torch.backends.cudnn.benchmark.
    torch.backends.cudnn.bechmark = True
    dictionary = Dictionary({'Yes': 0}, ['Yes'])
    dictionary.init_dict()
    train_set = FeatureDataset('Action', dictionary, 'Train')
    test_set = FeatureDataset('Action', dictionary, 'Test')
    constructor = 'build_baseline'
    model = getattr(baseline, constructor)(train_set, opt).cuda()
    model.w_emb.init_embedding()
    train_loader = DataLoader(train_set, opt.BATCH_SIZE, shuffle=True, num_workers=1)
    # NOTE(review): this call is cut off here — its remaining arguments
    # continue beyond this chunk.
    test_loader = DataLoader(test_set, opt.BATCH_SIZE, shuffle=True,
# NOTE(review): the lines below are the tail of a glove-embedding loader
# (presumably create_glove_embedding_init); its def line and the code that
# builds `entries`, `idx2word` and `word2emb` are outside this chunk.
    emb_dim = len(entries[0].split(' ')) - 1
    print('embedding dim is %d' % emb_dim)
    weights = np.zeros((len(idx2word), emb_dim), dtype=np.float32)
    for entry in entries:
        vals = entry.split(' ')
        word = vals[0]
        # NOTE(review): under Python 3, map() is lazy — np.array(map(...))
        # would not produce a float vector; this looks like Python 2 code.
        vals = map(float, vals[1:])
        word2emb[word] = np.array(vals)
    for idx, word in enumerate(idx2word):
        # Words without a pretrained vector keep their zero row.
        if word not in word2emb:
            continue
        weights[idx] = word2emb[word]
    return weights, word2emb

if __name__ == '__main__':
    caption_dictionary = Dictionary()
    # Reserve fixed indices for padding and unknown tokens.
    caption_dictionary.add_word('<pad>')
    caption_dictionary.add_word('<unk>')
    caption_dictionary = create_dictionary(caption_dictionary)
    caption_dictionary.dump_to_file('caption_dictionary.pkl')
    emb_dim = 300
    glove_file = 'h5data/glove/glove.6B.%dd.txt' % emb_dim
    #with open('/data/wujial/Attention-on-Attention-for-VQA/data/cache/trainval_label2ans.pkl', 'rb') as f:
    #    x = pickle.load(f)
    weights, word2emb = create_glove_embedding_init(
        caption_dictionary.idx2word, glove_file)
    np.save('glove6b_caption_init_%dd.npy' % emb_dim, weights)
# --- Siamese-CNN training setup (script top level) ---
args = get_args()
print(args)
# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        torch.cuda.manual_seed(args.seed)
# Load Dictionary
assert os.path.exists(args.train_data)
assert os.path.exists(args.val_data)
print('Begin to load the dictionary.')
# Vocabulary is built from the training CSV; its size feeds the model config.
dictionary = Dictionary('../data/atec_nlp_sim_train.csv')
args.vocab_size = len(dictionary)
best_val_loss = None
best_f1 = None
n_token = len(dictionary)
# Twin embedding networks share weights inside the SiameseNet wrapper.
embedding_net = EmbeddingCNN(args)
print("embedding_net: {}".format(embedding_net))
model = SiameseNet(embedding_net)
print(model)
print('Begin to load data.')
# Both splits share the dictionary's word->index map and sequence length.
train_data = MyDataset(args.train_data, args.sequence_length, dictionary.word2idx, args.char_model)
val_data = MyDataset(args.val_data, args.sequence_length, dictionary.word2idx, args.char_model)
def create_dictionary(question):
    """Return a fresh Dictionary seeded with the tokens of one question."""
    result = Dictionary()
    result.tokenize(question, True)
    return result
# --- quick manual smoke test for Dictionary tokenization ---
# NOTE(review): this snippet uses Python 2 print statements.
from dataset import Dictionary

if __name__ == '__main__':
    d = Dictionary()
    all_sent = d.get_all_sentence()
    # Show the first two sentences and their token sequences side by side.
    print all_sent[0], all_sent[1]
    token1 = d.tokenize(all_sent[0], False)
    token2 = d.tokenize(all_sent[1], False)
    print token1, token2