Example #1
import os
import json
from collections import Counter

# Dictionary is the project's vocabulary helper (it provides add_word,
# tokenize, word_token, idx2word and dump_to_file); it is assumed to be
# importable from the surrounding codebase.


def create_question_explain_dictionary(dataroot, thres):
    """Build a joint vocabulary over VQA-E explanations and VQA v2 questions.

    Explanation words are kept only when they appear at least `thres` times;
    all question files are then tokenized into the same dictionary.
    """
    dictionary = Dictionary()
    counter = Counter()
    question_files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json',
        'v2_OpenEnded_mscoco_test2015_questions.json',
        'v2_OpenEnded_mscoco_test-dev2015_questions.json'
    ]
    explain_files = [
        'VQA-E_train_set.json',
        'VQA-E_val_set.json',
    ]
    for path in explain_files:
        explain_path = os.path.join(dataroot, path)
        es = json.load(open(explain_path))
        for e in es:
            counter.update(dictionary.word_token(e['explanation'][0]))

    dictionary.add_word('<pad>')
    dictionary.add_word('<start>')
    dictionary.add_word('<end>')
    dictionary.add_word('<unk>')
    for word, cnt in counter.items():
        if cnt >= thres:
            dictionary.add_word(word)
    for path in question_files:
        question_path = os.path.join(dataroot, path)
        qs = json.load(open(question_path))['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)

    return dictionary
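
A minimal usage sketch for the function above, assuming the VQA v2 question files and VQA-E annotation files sit under a local data/ directory and that Dictionary exposes the dump_to_file method used in Example #6; the path and threshold below are illustrative only.

if __name__ == '__main__':
    # illustrative path and threshold; adjust to the actual data layout
    d = create_question_explain_dictionary('data', thres=5)
    print('vocabulary size: %d' % len(d.idx2word))
    d.dump_to_file('question_explain_dictionary.pkl')  # assumed Dictionary method
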
Example #2
def create_question_dictionary(dataroot):
    dictionary = Dictionary()
    questions = []
    files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json',
        'v2_OpenEnded_mscoco_test2015_questions.json',
        'v2_OpenEnded_mscoco_test-dev2015_questions.json'
    ]
    dictionary.add_word('<pad>')
    for path in files:
        question_path = os.path.join(dataroot, path)
        qs = json.load(open(question_path))['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)
    return dictionary
Example #3
def create_explain_dictionary(dataroot, thres):
    dictionary = Dictionary()
    counter = Counter()
    files = [
        'VQA-E_train_set.json',
        'VQA-E_val_set.json',
    ]
    for path in files:
        explain_path = os.path.join(dataroot, path)
        es = json.load(open(explain_path))
        for e in es:
            counter.update(dictionary.word_token(e['explanation'][0]))

    dictionary.add_word('<pad>')
    dictionary.add_word('<start>')
    dictionary.add_word('<end>')
    dictionary.add_word('<unk>')
    for word, cnt in counter.items():
        if cnt >= thres:
            dictionary.add_word(word)
    return dictionary
Example #4
def create_caption_dictionary(dataroot, thres):
    dictionary = Dictionary()
    counter = Counter()
    files = [
        'captions_train2014.json',
        'captions_val2014.json',
    ]
    for path in files:
        caption_path = os.path.join(dataroot, path)
        qs = json.load(open(caption_path))['annotations']
        for q in qs:
            counter.update(dictionary.word_token(q['caption']))

    dictionary.add_word('<pad>')
    dictionary.add_word('<start>')
    dictionary.add_word('<end>')
    dictionary.add_word('<unk>')
    for word, cnt in counter.items():
        if cnt >= thres:
            dictionary.add_word(word)
    return dictionary
Example #5
def create_VQAX_explain_dictionary(dataroot, thres):
    dictionary = Dictionary()
    counter = Counter()
    files = [
        'train_exp_anno.json',
        'val_exp_anno.json',
        'test_exp_anno.json',
    ]
    for path in files:
        explain_path = os.path.join(dataroot, path)
        es = json.load(open(explain_path))
        for explanations in es.values():
            for exp in explanations:
                counter.update(dictionary.word_token(exp))

    dictionary.add_word('<pad>')
    dictionary.add_word('<start>')
    dictionary.add_word('<end>')
    dictionary.add_word('<unk>')
    for word, cnt in counter.items():
        if cnt >= thres:
            dictionary.add_word(word)
    return dictionary
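
The nested loop above implies that each VQA-X annotation file is a JSON object mapping an id to a list of explanation strings. A hypothetical entry, with a made-up key and sentence, would be traversed like this:

# hypothetical shape of train_exp_anno.json; the key and text are invented
example_anno = {
    "262148000": ["the man is holding a surfboard next to the ocean"],
}
for explanations in example_anno.values():
    for exp in explanations:
        print(exp)  # each string would be fed to dictionary.word_token(exp)
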
Example #6
import numpy as np


def create_glove_embedding_init(idx2word, glove_file):
    word2emb = {}
    # each GloVe line is 'word v1 v2 ... vd'
    with open(glove_file) as f:
        entries = f.readlines()
    emb_dim = len(entries[0].split(' ')) - 1
    print('embedding dim is %d' % emb_dim)
    weights = np.zeros((len(idx2word), emb_dim), dtype=np.float32)

    for entry in entries:
        vals = entry.split(' ')
        word = vals[0]
        vals = list(map(float, vals[1:]))
        word2emb[word] = np.array(vals)
    for idx, word in enumerate(idx2word):
        if word not in word2emb:
            continue
        weights[idx] = word2emb[word]
    return weights, word2emb


if __name__ == '__main__':

    caption_dictionary = Dictionary()
    caption_dictionary.add_word('<pad>')
    caption_dictionary.add_word('<unk>')
    # create_dictionary is assumed to be defined elsewhere in the original script
    caption_dictionary = create_dictionary(caption_dictionary)
    caption_dictionary.dump_to_file('caption_dictionary.pkl')
    emb_dim = 300
    glove_file = 'h5data/glove/glove.6B.%dd.txt' % emb_dim
    #with open('/data/wujial/Attention-on-Attention-for-VQA/data/cache/trainval_label2ans.pkl', 'rb') as f:
    #    x = pickle.load(f)
    weights, word2emb = create_glove_embedding_init(
        caption_dictionary.idx2word, glove_file)
    np.save('glove6b_caption_init_%dd.npy' % emb_dim, weights)
Example #7
    tot_sememe_missing = 0
    for word in raw_dict.idx2word:
        if raw_dict.idx2freq[raw_dict.word2idx[
                word]] >= threshold and not overall_dict.exist(word):
            #if word in pun:
            #    continue
            search_words = dfs_search(word)
            if search_words is None:
                print(word + ': Not found')
                #raw_dict.add_word(word)
            else:
                for single_word in search_words:
                    for j in range(raw_dict.idx2freq[raw_dict.word2idx[word]]):
                        overall_dict.add_word_f(single_word)
                        raw_dict.add_word(single_word)
                tot_words += (len(search_words) -
                              1) * raw_dict.idx2freq[raw_dict.word2idx[word]]

    overall_dict.set_threshold(threshold)

    overall_dict.sememe_word_visit(raw_dict.word2idx)
    c_tot_words = 0
    delete_word = []

    def output(filename):
        of = open(path_out + filename, 'w')
        f = open(path_in + filename)
        ctw = 0
        for line in f.readlines():
            words = line.split()
Example #8
import re

# Dictionary is the plain word-index vocabulary class used elsewhere in the
# project; it is assumed to be importable from the surrounding codebase.


class SememeDictionary(object):
    def __init__(self, path=None):
        if path is None:
            path = 'data/HowNet.txt'
        self.word2idx = {}
        self.idx2word = []
        self.idx2freq = []
        self.idx2senses = []
        self.sememe_dict = Dictionary()
        self.threshold = 0
        hownet_file = open(path)
        phase = 0
        # matches every non-Chinese character; used to split a HowNet DEF line
        # into its Chinese sememe terms
        re_chn = re.compile(u'[^\u4e00-\u9fa5]')
        cur_word = ''

        # add sememe for special tokens
        self.add_word('<blank>', ['<blank>'])  # padding
        self.add_word('<s>', ['<s>'])  # start
        self.add_word('</s>', ['</s>'])  # end
        # punctuation-like symbols all map to the sememe '标点' (punctuation)
        self.add_word('…', ['标点'])
        self.add_word('?', ['标点'])
        self.add_word(':', ['标点'])
        self.add_word('·', ['标点'])
        self.add_word(';', ['标点'])
        self.add_word('%', ['标点'])
        self.add_word('•', ['标点'])
        self.add_word('-', ['标点'])
        self.add_word('!', ['标点'])
        self.add_word('.', ['标点'])
        self.add_word('「', ['标点'])
        self.add_word('」', ['标点'])
        self.add_word('.', ['标点'])
        self.add_word('/', ['标点'])
        self.add_word('→', ['标点'])
        # circled digits carry both '标点' (punctuation) and '基数' (cardinal number)
        self.add_word('❶', ['标点', '基数'])
        self.add_word('❷', ['标点', '基数'])
        self.add_word('❸', ['标点', '基数'])
        self.add_word('❹', ['标点', '基数'])
        self.add_word('❺', ['标点', '基数'])
        self.add_word('❻', ['标点', '基数'])
        self.add_word('❼', ['标点', '基数'])
        self.add_word('❽', ['标点', '基数'])
        self.add_word('❾', ['标点', '基数'])
        self.add_word('❿', ['标点', '基数'])

        self.add_word('<unk>', ['<unk>'])
        self.add_word('<eos>', ['<eos>'])
        self.add_word('<N>', ['基数'])  # cardinal number
        self.add_word('<year>', ['时间', '年', '特定'])  # time / year / specific
        self.add_word('<date>', ['时间', '月', '特定'])  # time / month / specific
        self.add_word('<hour>', ['时间', '时', '特定'])  # time / hour / specific
        self.add_word('(', ['标点'])
        self.add_word('『', ['标点'])
        self.add_word('……', ['标点'])
        self.add_word('●', ['标点'])
        self.add_word('《', ['标点'])
        self.add_word('—', ['标点'])
        self.add_word('———', ['标点'])
        self.add_word('』', ['标点'])
        self.add_word('》', ['标点'])
        self.add_word('△', ['标点'])
        self.add_word('、', ['标点'])
        self.add_word(')', ['标点'])
        self.add_word('℃', ['标点'])
        self.add_word('▲', ['标点'])

        # HowNet entries are parsed with a small state machine: 'NO.' starts a
        # new entry, 'W_C' gives the Chinese word, and 'DEF' lists the sememes
        # of one sense of that word.
        for line in hownet_file.readlines():
            if line[0:3] == 'NO.':
                phase = 1
                continue  # new word
            if phase == 1 and line[0:3] == 'W_C':
                phase = 2
                word = line[4:-1]
                if word == '':
                    phase = 0
                else:
                    cur_word = word
                continue
            if phase == 2 and line[0:3] == 'DEF':
                phase = 3
                content = line[4:-1]
                sememes = re_chn.split(content)
                sememe_bag = []
                for sememe in sememes:
                    if sememe != '':
                        sememe_bag += [sememe]
                if cur_word != '':
                    self.add_word(cur_word, sememe_bag)
        hownet_file.close()
        # every sememe starts at frequency zero; counts are accumulated later
        # by add_word_f
        self.sememe_dict.idx2freq = [0] * len(self.sememe_dict)

    def senses_belong(self, sememes_bag, senses_bag):
        # a sense counts as already present if some existing sense contains
        # exactly the same set of sememes
        for i in range(len(senses_bag)):
            if len(set(sememes_bag + senses_bag[i])) == len(sememes_bag)\
                    and len(sememes_bag) == len(senses_bag[i]):
                return True
        return False

    def add_word(self, word, sememes_bag):
        # register `word` if it is new and store `sememes_bag` as one of its
        # senses, unless an identical set of sememes is already recorded
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.idx2senses.append([])
            self.idx2freq.append(0)
            self.word2idx[word] = len(self.idx2word) - 1

        idx = self.word2idx[word]
        sememe_bag_idx = []
        for sememe in sememes_bag:
            sememe_bag_idx.append(self.sememe_dict.add_word(sememe))
        sememe_bag_idx = list(set(sememe_bag_idx))
        if not self.senses_belong(sememe_bag_idx, self.idx2senses[idx]):
            self.idx2senses[idx].append(sememe_bag_idx)

        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)

    def summary(self, print_sememes=False):
        print('=' * 69)
        print('-' * 31 + 'SUMMARY' + '-' * 31)
        print('Number of Sememes: {}'.format(len(self.sememe_dict)))
        print('Number of Words: {}'.format(len(self.idx2word)))
        tot_senses = 0
        tot_sememes = 0
        for i in range(len(self.idx2word)):
            tot_senses += len(self.idx2senses[i])
            for j in range(len(self.idx2senses[i])):
                tot_sememes += len(self.idx2senses[i][j])
        ws_ratio = (tot_senses + 0.0) / len(self.idx2word)
        ss_ratio = (tot_sememes + 0.0) / tot_senses
        print('Mean Senses per Word: {}'.format(ws_ratio))
        print('Mean Sememes per Sense: {}'.format(ss_ratio))
        print('=' * 69)
        if print_sememes:
            print(','.join(self.sememe_dict.idx2word))

    def exist(self, word):
        return word in self.word2idx

    def add_word_f(self, word):
        # count one occurrence of `word`: bump its frequency and the frequency
        # of every sememe appearing in any of its senses
        if word not in self.word2idx:
            self.add_word(word, ['<unk>'])
            # print(word)
        idx = self.word2idx[word]
        for sense in self.idx2senses[idx]:
            for sememe in sense:
                self.sememe_dict.idx2freq[sememe] += 1
        self.idx2freq[self.word2idx[word]] += 1

    def query_count(self, word):
        if word not in self.word2idx:
            raise ValueError("Word don't exist")
        return self.idx2freq[self.word2idx[word]]

    def freq_le(self, k):
        tot = 0
        for idx in range(len(self.idx2word)):
            if self.idx2freq[idx] < k:
                tot += 1
        return tot

    def freq_ge(self, k):
        tot = 0
        for idx in range(len(self.idx2word)):
            if self.idx2freq[idx] >= k:
                tot += 1
        return tot

    def set_threshold(self, threshold):
        self.threshold = threshold

    def sememe_word_visit(self, word_dict):
        # build sememe-word and sememe-sense index pairs, restricted to words
        # whose frequency reaches the threshold, and collect the sense ids
        # attached to each word in word_dict
        sememe_word = []
        sememe_sense = []
        for i in range(len(self.sememe_dict)):
            sememe_word.append([])
            sememe_sense.append([])
        maximum_senses = 0
        tot_senses = 0
        for word_id in range(len(self.word2idx)):
            if self.idx2freq[word_id] >= self.threshold:
                maximum_senses = max(maximum_senses,
                                     len(self.idx2senses[word_id]))
                for sense in self.idx2senses[word_id]:
                    for sememe in sense:
                        sememe_word[sememe].append(word_id)
                        sememe_sense[sememe].append(tot_senses)
                    tot_senses += 1
        tot = 0
        tot_sememes = 0
        max_words = 0
        a = []
        sememe_word_pair = [[], []]
        sememe_sense_pair = [[], []]
        sememe_idx = []
        word_sense = []
        for i in range(len(word_dict)):
            word_sense.append([])
        for i in range(len(self.sememe_dict)):
            cur_str = self.sememe_dict.idx2word[i]
            cur_str += ': '
            words = []
            for j in range(len(sememe_word[i])):
                word_id = sememe_word[i][j]
                sense_id = sememe_sense[i][j]
                words.append(self.idx2word[word_id])
                sememe_word_pair[0].append(tot_sememes)
                sememe_word_pair[1].append(word_dict[self.idx2word[word_id]])
                sememe_sense_pair[0].append(tot_sememes)
                sememe_sense_pair[1].append(sense_id)
                word_sense[word_dict[self.idx2word[word_id]]].append(sense_id)
            tot += len(sememe_word[i])
            max_words = max(max_words, len(sememe_word[i]))
            a += sememe_word[i]
            cur_str += ','.join(words)
            if len(set(sememe_word[i])) > 0:
                sememe_idx.append(tot_sememes)
            else:
                sememe_idx.append(-1)
            tot_sememes += len(sememe_word[i]) > 0
        for i in range(len(word_dict)):
            word_sense[i] = list(set(word_sense[i]))
        print('Total words: {}'.format(len(set(a))))
        print('Maximum words per sememe: {}'.format(max_words))
        print('Maximum senses per word: {}'.format(maximum_senses))
        print('Total sememes in use: {}'.format(tot_sememes))
        print('Total sememe-word pairs: {}'.format(tot))
        return sememe_word_pair, sememe_idx, sememe_sense_pair, word_sense

    def visit(self, word, mode='full'):
        # mode='sbag' prints the union of sememes over all senses;
        # mode='full' prints the sememes of each sense separately
        if word not in self.word2idx:
            raise ValueError('No word!')
        idx = self.word2idx[word]
        if mode == 'sbag':
            sememes = []
            for sense in self.idx2senses[idx]:
                for sememe in sense:
                    sememes.append(sememe)
            sememes = set(sememes)
            sememes_str = []
            for sememe in sememes:
                sememes_str.append(self.sememe_dict.idx2word[sememe])
            print(word + ':' + ','.join(sememes_str))
        if mode == 'full':
            print('Word: ' + word +
                  ', total {} senses'.format(len(self.idx2senses[idx])))
            for i in range(len(self.idx2senses[idx])):
                sememes_list = []
                for j in range(len(self.idx2senses[idx][i])):
                    sememes_list.append(
                        self.sememe_dict.idx2word[self.idx2senses[idx][i][j]])
                sememes = ','.join(sememes_list)
                print('Sense #{}: '.format(i + 1) + sememes)
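
A minimal usage sketch for SememeDictionary, assuming a local copy of data/HowNet.txt; the query word is illustrative only.

if __name__ == '__main__':
    sememe_dict = SememeDictionary()  # reads data/HowNet.txt by default
    sememe_dict.summary()             # word / sense / sememe statistics
    query = '苹果'                    # illustrative query word
    if sememe_dict.exist(query):
        sememe_dict.visit(query, mode='full')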