def create_question_explain_dictionary(dataroot, thres):
    """Build a Dictionary covering VQA-E explanations and VQA-v2 questions.

    Explanation words are counted first and only those occurring at least
    ``thres`` times are added (after the special tokens); question words are
    always added via ``dictionary.tokenize``.

    Args:
        dataroot: directory containing the VQA-v2 question JSONs and the
            VQA-E explanation JSONs.
        thres: minimum corpus frequency for an explanation word to be kept.

    Returns:
        The populated ``Dictionary``.
    """
    dictionary = Dictionary()
    counter = Counter()
    question_files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json',
        'v2_OpenEnded_mscoco_test2015_questions.json',
        'v2_OpenEnded_mscoco_test-dev2015_questions.json'
    ]
    explain_files = [
        'VQA-E_train_set.json',
        'VQA-E_val_set.json',
    ]
    for path in explain_files:
        explain_path = os.path.join(dataroot, path)
        # Context manager closes the file promptly; the original
        # json.load(open(...)) leaked the handle until GC.
        with open(explain_path) as f:
            es = json.load(f)
        for e in es:
            # 'explanation' is a list; only its first entry is counted.
            counter.update(dictionary.word_token(e['explanation'][0]))
    # Special tokens go in first so they get the lowest indices.
    dictionary.add_word('<pad>')
    dictionary.add_word('<start>')
    dictionary.add_word('<end>')
    dictionary.add_word('<unk>')
    for word, cnt in counter.items():
        if cnt >= thres:
            dictionary.add_word(word)
    for path in question_files:
        question_path = os.path.join(dataroot, path)
        with open(question_path) as f:
            qs = json.load(f)['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)
    return dictionary
def create_question_dictionary(dataroot):
    """Build a Dictionary from all VQA-v2 question files.

    Args:
        dataroot: directory containing the four VQA-v2 question JSONs.

    Returns:
        The populated ``Dictionary`` with ``<pad>`` at index 0.
    """
    dictionary = Dictionary()
    # Note: removed unused local `questions = []` from the original.
    files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json',
        'v2_OpenEnded_mscoco_test2015_questions.json',
        'v2_OpenEnded_mscoco_test-dev2015_questions.json'
    ]
    dictionary.add_word('<pad>')
    for path in files:
        question_path = os.path.join(dataroot, path)
        # Context manager closes the file promptly; the original
        # json.load(open(...)) leaked the handle until GC.
        with open(question_path) as f:
            qs = json.load(f)['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)
    return dictionary
def create_explain_dictionary(dataroot, thres):
    """Build a Dictionary from VQA-E explanation files.

    Words are frequency-filtered: only those occurring at least ``thres``
    times are added, after the special tokens.

    Args:
        dataroot: directory containing the VQA-E JSON files.
        thres: minimum corpus frequency for a word to be kept.

    Returns:
        The populated ``Dictionary``.
    """
    dictionary = Dictionary()
    counter = Counter()
    files = [
        'VQA-E_train_set.json',
        'VQA-E_val_set.json',
    ]
    for path in files:
        explain_path = os.path.join(dataroot, path)
        # Context manager closes the file promptly; the original
        # json.load(open(...)) leaked the handle until GC.
        with open(explain_path) as f:
            es = json.load(f)
        for e in es:
            # 'explanation' is a list; only its first entry is counted.
            counter.update(dictionary.word_token(e['explanation'][0]))
    # Special tokens go in first so they get the lowest indices.
    dictionary.add_word('<pad>')
    dictionary.add_word('<start>')
    dictionary.add_word('<end>')
    dictionary.add_word('<unk>')
    for word, cnt in counter.items():
        if cnt >= thres:
            dictionary.add_word(word)
    return dictionary
def create_caption_dictionary(dataroot, thres):
    """Build a Dictionary from COCO caption annotation files.

    Words are frequency-filtered: only those occurring at least ``thres``
    times are added, after the special tokens.

    Args:
        dataroot: directory containing the COCO caption JSON files.
        thres: minimum corpus frequency for a word to be kept.

    Returns:
        The populated ``Dictionary``.
    """
    dictionary = Dictionary()
    counter = Counter()
    files = [
        'captions_train2014.json',
        'captions_val2014.json',
    ]
    for path in files:
        caption_path = os.path.join(dataroot, path)
        # Context manager closes the file promptly; the original
        # json.load(open(...)) leaked the handle until GC.
        with open(caption_path) as f:
            annotations = json.load(f)['annotations']
        for ann in annotations:
            counter.update(dictionary.word_token(ann['caption']))
    # Special tokens go in first so they get the lowest indices.
    dictionary.add_word('<pad>')
    dictionary.add_word('<start>')
    dictionary.add_word('<end>')
    dictionary.add_word('<unk>')
    for word, cnt in counter.items():
        if cnt >= thres:
            dictionary.add_word(word)
    return dictionary
def create_VQAX_explain_dictionary(dataroot, thres):
    """Build a Dictionary from VQA-X explanation annotation files.

    Each JSON file maps an id to a list of explanation strings; every
    string in every list is tokenized and counted. Only words occurring
    at least ``thres`` times are added, after the special tokens.

    Args:
        dataroot: directory containing the VQA-X annotation JSONs.
        thres: minimum corpus frequency for a word to be kept.

    Returns:
        The populated ``Dictionary``.
    """
    dictionary = Dictionary()
    counter = Counter()
    files = [
        'train_exp_anno.json',
        'val_exp_anno.json',
        'test_exp_anno.json',
    ]
    for path in files:
        explain_path = os.path.join(dataroot, path)
        # Context manager closes the file promptly; the original
        # json.load(open(...)) leaked the handle until GC.
        with open(explain_path) as f:
            es = json.load(f)
        # The original iterated es.items() but only used the values.
        for explanations in es.values():
            for explanation in explanations:
                counter.update(dictionary.word_token(explanation))
    # Special tokens go in first so they get the lowest indices.
    dictionary.add_word('<pad>')
    dictionary.add_word('<start>')
    dictionary.add_word('<end>')
    dictionary.add_word('<unk>')
    for word, cnt in counter.items():
        if cnt >= thres:
            dictionary.add_word(word)
    return dictionary
# NOTE(review): this is a FRAGMENT — the tail of a GloVe-embedding loader
# (presumably `create_glove_embedding_init`; its `def` line is not visible in
# this chunk, so the bare `return` below belongs to that out-of-view function)
# fused with the script's `if __name__ == '__main__':` entry point.
# NOTE(review): `vals = map(float, vals[1:])` then `np.array(vals)` — under
# Python 3, map() yields an iterator and np.array(<map>) produces a 0-d
# object array, which would break `weights[idx] = word2emb[word]`; this code
# appears written for Python 2 (or needs `list(map(...))`) — TODO confirm.
# NOTE(review): `create_dictionary(caption_dictionary)` is called in __main__
# but is not defined in this chunk — presumably lives elsewhere in the file.
# Line kept byte-identical below; indentation was lost in this copy.
emb_dim = len(entries[0].split(' ')) - 1 print('embedding dim is %d' % emb_dim) weights = np.zeros((len(idx2word), emb_dim), dtype=np.float32) for entry in entries: vals = entry.split(' ') word = vals[0] vals = map(float, vals[1:]) word2emb[word] = np.array(vals) for idx, word in enumerate(idx2word): if word not in word2emb: continue weights[idx] = word2emb[word] return weights, word2emb if __name__ == '__main__': caption_dictionary = Dictionary() caption_dictionary.add_word('<pad>') caption_dictionary.add_word('<unk>') caption_dictionary = create_dictionary(caption_dictionary) caption_dictionary.dump_to_file('caption_dictionary.pkl') emb_dim = 300 glove_file = 'h5data/glove/glove.6B.%dd.txt' % emb_dim #with open('/data/wujial/Attention-on-Attention-for-VQA/data/cache/trainval_label2ans.pkl', 'rb') as f: # x = pickle.load(f) weights, word2emb = create_glove_embedding_init( caption_dictionary.idx2word, glove_file) np.save('glove6b_caption_init_%dd.npy' % emb_dim, weights)
# NOTE(review): this is a FRAGMENT of script-level code. It references names
# not defined in this chunk (`raw_dict`, `overall_dict`, `dfs_search`,
# `tot_words`, `pun`, `path_out`, `path_in`, `threshold`) and the trailing
# `def output(filename):` is cut off mid-body — the rest of that function is
# outside this view.
# What is visible: for every word in `raw_dict` at/above `threshold` that is
# missing from `overall_dict`, `dfs_search(word)` is used to find substitute
# word(s); each substitute is counted into `overall_dict` once per original
# occurrence via `add_word_f`, then the threshold is set and
# `sememe_word_visit` is invoked. `tot_sememe_missing` is initialised but
# never updated in the visible span.
# Indentation was lost in this copy, so loop nesting (e.g. whether
# `raw_dict.add_word(single_word)` sits inside the `for j` loop) cannot be
# reconstructed with certainty — line kept byte-identical below.
tot_sememe_missing = 0 for word in raw_dict.idx2word: if raw_dict.idx2freq[raw_dict.word2idx[ word]] >= threshold and not overall_dict.exist(word): #if word in pun: # continue search_words = dfs_search(word) if search_words is None: print(word + ': Not found') #raw_dict.add_word(word) else: for single_word in search_words: for j in range(raw_dict.idx2freq[raw_dict.word2idx[word]]): overall_dict.add_word_f(single_word) raw_dict.add_word(single_word) tot_words += (len(search_words) - 1) * raw_dict.idx2freq[raw_dict.word2idx[word]] overall_dict.set_threshold(threshold) overall_dict.sememe_word_visit(raw_dict.word2idx) c_tot_words = 0 delete_word = [] def output(filename): of = open(path_out + filename, 'w') f = open(path_in + filename) ctw = 0 for line in f.readlines(): words = line.split()
class SememeDictionary(object):
    """Word-to-sememe dictionary built from a HowNet text dump.

    Each word maps to a list of *senses*; each sense is a list of sememe ids
    drawn from the auxiliary ``self.sememe_dict``. Occurrence counts are
    accumulated per word (``idx2freq``) and per sememe
    (``sememe_dict.idx2freq``) by ``add_word_f``.

    NOTE(review): indentation was lost in this copy of the file; the layout
    below is a faithful reconstruction — every code token is unchanged.
    """

    def __init__(self, path=None):
        """Load HowNet from ``path`` (default ``data/HowNet.txt``).

        Pre-registers special tokens and punctuation with hand-assigned
        sememes, then parses the dump with a small phase machine.
        """
        # NOTE(review): `path == None` — idiomatic form is `path is None`
        # (behavior identical here; left unchanged).
        if path == None:
            path = 'data/HowNet.txt'
        self.word2idx = {}
        self.idx2word = []
        self.idx2freq = []      # per-word occurrence counts (see add_word_f)
        self.idx2senses = []    # per-word list of senses (lists of sememe ids)
        self.threshold = -1
        self.sememe_dict = Dictionary()
        # NOTE(review): overwrites the -1 assigned four lines above.
        self.threshold = 0
        # NOTE(review): file handle is never closed; `file` also shadows the
        # Python 2 builtin. Left unchanged.
        file = open(path)
        phase = 0
        # Matches runs of NON-Chinese characters (anything outside the CJK
        # Unified Ideographs range), used to split a DEF line into sememes.
        re_chn = re.compile(u'[^\u4e00-\u9fa5]')
        cur_word = ''
        # add sememe for special tokens
        self.add_word('<blank>', ['<blank>'])  # padding
        self.add_word('<s>', ['<s>'])  # start
        self.add_word('</s>', ['</s>'])  # end
        # Punctuation and symbols all share the single sememe '标点'
        # ("punctuation"); circled digits also carry '基数' ("cardinal").
        self.add_word('…', ['标点'])
        self.add_word('?', ['标点'])
        self.add_word(':', ['标点'])
        self.add_word('·', ['标点'])
        self.add_word(';', ['标点'])
        self.add_word('%', ['标点'])
        self.add_word('•', ['标点'])
        self.add_word('-', ['标点'])
        self.add_word('!', ['标点'])
        self.add_word('.', ['标点'])
        self.add_word('「', ['标点'])
        self.add_word('」', ['标点'])
        self.add_word('.', ['标点'])
        self.add_word('/', ['标点'])
        self.add_word('→', ['标点'])
        self.add_word('❶', ['标点', '基数'])
        self.add_word('❷', ['标点', '基数'])
        self.add_word('❸', ['标点', '基数'])
        self.add_word('❹', ['标点', '基数'])
        self.add_word('❺', ['标点', '基数'])
        self.add_word('❻', ['标点', '基数'])
        self.add_word('❼', ['标点', '基数'])
        self.add_word('❽', ['标点', '基数'])
        self.add_word('❾', ['标点', '基数'])
        self.add_word('❿', ['标点', '基数'])
        self.add_word('<unk>', ['<unk>'])
        self.add_word('<eos>', ['<eos>'])
        # Placeholders for numbers/dates: 基数=cardinal, 时间=time,
        # 年=year, 月=month, 时=hour, 特定=specific.
        self.add_word('<N>', ['基数'])
        self.add_word('<year>', ['时间', '年', '特定'])
        self.add_word('<date>', ['时间', '月', '特定'])
        self.add_word('<hour>', ['时间', '时', '特定'])
        self.add_word('(', ['标点'])
        self.add_word('『', ['标点'])
        self.add_word('……', ['标点'])
        self.add_word('●', ['标点'])
        self.add_word('《', ['标点'])
        self.add_word('—', ['标点'])
        self.add_word('———', ['标点'])
        self.add_word('』', ['标点'])
        self.add_word('》', ['标点'])
        self.add_word('△', ['标点'])
        self.add_word('、', ['标点'])
        self.add_word(')', ['标点'])
        self.add_word('℃', ['标点'])
        self.add_word('▲', ['标点'])
        # Phase machine over the HowNet record format:
        #   'NO.' starts a record      -> phase 1
        #   'W_C' carries the word     -> phase 2 (empty word resets to 0)
        #   'DEF' carries the sememes  -> phase 3, word+sense registered
        for line in file.readlines():
            if line[0:3] == 'NO.':
                phase = 1
                continue
            # new word
            if phase == 1 and line[0:3] == 'W_C':
                phase = 2
                word = line[4:-1]  # drop the 4-char tag prefix and newline
                if word == '':
                    phase = 0
                else:
                    cur_word = word
                continue
            if phase == 2 and line[0:3] == 'DEF':
                phase = 3
                content = line[4:-1]
                # Split on non-CJK runs; keep only non-empty sememe tokens.
                sememes = re_chn.split(content)
                sememe_bag = []
                for sememe in sememes:
                    if sememe != '':
                        sememe_bag += [sememe]
                if cur_word != '':
                    self.add_word(cur_word, sememe_bag)
        # One zeroed frequency slot per sememe discovered above.
        self.sememe_dict.idx2freq = [0] * len(self.sememe_dict)

    def senses_belong(self, sememes_bag, senses_bag):
        """Return True if ``sememes_bag`` equals (as a set) any sense in
        ``senses_bag``; used to deduplicate senses in add_word."""
        for i in range(len(senses_bag)):
            # Equal length and no new elements in the union => same set.
            if len(set(sememes_bag + senses_bag[i])) == len(sememes_bag)\
                    and len(sememes_bag) == len(senses_bag[i]):
                return True
        return False

    def add_word(self, word, sememes_bag):
        """Register ``word`` with one sense (a bag of sememe strings).

        Creates the word entry on first sight; appends the sense only if an
        identical sememe set is not already recorded. Returns the word id.
        """
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.idx2senses.append([])
            self.idx2freq.append(0)
            self.word2idx[word] = len(self.idx2word) - 1
        idx = self.word2idx[word]
        # Map sememe strings to ids (creating them as needed), dedup'd.
        sememe_bag_idx = []
        for sememe in sememes_bag:
            sememe_bag_idx.append(self.sememe_dict.add_word(sememe))
        sememe_bag_idx = list(set(sememe_bag_idx))
        if not self.senses_belong(sememe_bag_idx, self.idx2senses[idx]):
            self.idx2senses[idx].append(sememe_bag_idx)
        return self.word2idx[word]

    def __len__(self):
        # Number of distinct words.
        return len(self.idx2word)

    def summary(self, print_sememes=False):
        """Print corpus statistics (counts and senses/sememes ratios);
        optionally dump the full sememe vocabulary."""
        print('=' * 69)
        print('-' * 31 + 'SUMMARY' + '-' * 31)
        print('Number of Sememes: {}'.format(len(self.sememe_dict)))
        print('Number of Words: {}'.format(len(self.idx2word)))
        tot_senses = 0
        tot_sememes = 0
        for i in range(len(self.idx2word)):
            tot_senses += len(self.idx2senses[i])
            for j in range(len(self.idx2senses[i])):
                tot_sememes += len(self.idx2senses[i][j])
        # `+ 0.0` forces float division (Python 2 compatible style).
        ws_ratio = (tot_senses + 0.0) / len(self.idx2word)
        ss_ratio = (tot_sememes + 0.0) / tot_senses
        print('Mean Senses per Word: {}'.format(ws_ratio))
        print('Mean Sememes per Sense: {}'.format(ss_ratio))
        print('=' * 69)
        if print_sememes:
            print(','.join(self.sememe_dict.idx2word))

    def exist(self, word):
        """Return True if ``word`` is registered."""
        return word in self.word2idx

    def add_word_f(self, word):
        """Record one occurrence of ``word``, bumping its frequency and the
        frequency of every sememe in every one of its senses. Unknown words
        are first registered with the single sememe '<unk>'."""
        if word not in self.word2idx:
            self.add_word(word, ['<unk>'])
            # print(word)
        idx = self.word2idx[word]
        for sense in self.idx2senses[idx]:
            for sememe in sense:
                self.sememe_dict.idx2freq[sememe] += 1
        self.idx2freq[self.word2idx[word]] += 1

    def query_count(self, word):
        """Return the recorded frequency of ``word``; raise if unknown."""
        if word not in self.word2idx:
            raise ValueError("Word don't exist")
        return self.idx2freq[self.word2idx[word]]

    def freq_le(self, k):
        """Count words with frequency strictly below ``k``."""
        tot = 0
        for idx in range(len(self.idx2word)):
            if self.idx2freq[idx] < k:
                tot += 1
        return tot

    def freq_ge(self, k):
        """Count words with frequency at least ``k``."""
        tot = 0
        for idx in range(len(self.idx2word)):
            if self.idx2freq[idx] >= k:
                tot += 1
        return tot

    def set_threshold(self, threshold):
        # Frequency cutoff used by sememe_word_visit.
        self.threshold = threshold

    def sememe_word_visit(self, word_dict):
        """Build sememe<->word/sense pairing tables for words whose
        frequency meets ``self.threshold``.

        ``word_dict`` maps word string -> external word id. Returns a tuple
        ``(sememe_word_pair, sememe_idx, sememe_sense_pair, word_sense)``:
        parallel (sememe, word-id) index lists, a per-sememe compacted index
        (-1 when the sememe pairs with no word), parallel (sememe, sense-id)
        lists, and per-external-word deduplicated sense-id lists.
        """
        sememe_word = []
        sememe_sense = []
        for i in range(len(self.sememe_dict)):
            sememe_word.append([])
            sememe_sense.append([])
        maximum_senses = 0
        tot_senses = 0
        # Invert the word->senses mapping: for each qualifying word, record
        # the word id and a globally-numbered sense id under every sememe.
        for word_id in range(len(self.word2idx)):
            if self.idx2freq[word_id] >= self.threshold:
                maximum_senses = max(maximum_senses,
                                     len(self.idx2senses[word_id]))
                for sense in self.idx2senses[word_id]:
                    for sememe in sense:
                        sememe_word[sememe].append(word_id)
                        sememe_sense[sememe].append(tot_senses)
                    tot_senses += 1
        tot = 0
        tot_sememes = 0
        max_words = 0
        a = []
        sememe_word_pair = [[], []]
        sememe_sense_pair = [[], []]
        sememe_idx = []
        word_sense = []
        for i in range(len(word_dict)):
            word_sense.append([])
        for i in range(len(self.sememe_dict)):
            # cur_str accumulates a "sememe: w1,w2,..." line; it is built
            # but never printed or returned in the visible code.
            cur_str = self.sememe_dict.idx2word[i]
            cur_str += ': '
            words = []
            for j in range(len(sememe_word[i])):
                word_id = sememe_word[i][j]
                sense_id = sememe_sense[i][j]
                words.append(self.idx2word[word_id])
                # tot_sememes is the compacted index of this sememe (only
                # sememes that pair with at least one word advance it).
                sememe_word_pair[0].append(tot_sememes)
                sememe_word_pair[1].append(word_dict[self.idx2word[word_id]])
                sememe_sense_pair[0].append(tot_sememes)
                sememe_sense_pair[1].append(sense_id)
                word_sense[word_dict[self.idx2word[word_id]]].append(sense_id)
            tot += len(sememe_word[i])
            max_words = max(max_words, len(sememe_word[i]))
            a += sememe_word[i]
            cur_str += ','.join(words)
            if len(set(sememe_word[i])) > 0:
                sememe_idx.append(tot_sememes)
            else:
                sememe_idx.append(-1)
            # Boolean adds as 0/1: advance only for non-empty sememes.
            tot_sememes += len(sememe_word[i]) > 0
        for i in range(len(word_dict)):
            word_sense[i] = list(set(word_sense[i]))
        print('Total words: {}'.format(len(set(a))))
        print('Maximum words per sememe: {}'.format(max_words))
        print('Maximum sense per word: {}'.format(maximum_senses))
        # NOTE(review): 'semems' typo is in the original runtime string;
        # left unchanged.
        print('Total respective semems: {}'.format(tot_sememes))
        print('Total sememe-word pairs: {}'.format(tot))
        return sememe_word_pair, sememe_idx, sememe_sense_pair, word_sense

    def visit(self, word, mode='full'):
        """Print the sememes of ``word``.

        mode='sbag' prints the flat, deduplicated sememe bag;
        mode='full' prints each sense on its own line. Raises ValueError
        for unknown words.
        """
        if word not in self.word2idx:
            raise ValueError('No word!')
        idx = self.word2idx[word]
        if mode == 'sbag':
            sememes = []
            for sense in self.idx2senses[idx]:
                for sememe in sense:
                    sememes.append(sememe)
            sememes = set(sememes)
            sememes_str = []
            for sememe in sememes:
                sememes_str.append(self.sememe_dict.idx2word[sememe])
            print(word + ':' + ','.join(sememes_str))
        if mode == 'full':
            print('Word: ' + word +
                  ', total {} means'.format(len(self.idx2senses[idx])))
            for i in range(len(self.idx2senses[idx])):
                sememes_list = []
                for j in range(len(self.idx2senses[idx][i])):
                    sememes_list.append(
                        self.sememe_dict.idx2word[self.idx2senses[idx][i][j]])
                sememes = ','.join(sememes_list)
                print('Sense #{}: '.format(i + 1) + sememes)