Exemplo n.º 1
0
def tfidf_from_questions(names, args, dictionary, dataroot='data', target=['rad']):
    """Build a row-stochastic token co-occurrence matrix plus GloVe weights.

    Every question in the selected splits is tokenized; each unordered pair
    of tokens in a question contributes (up to) two entries to a sparse
    matrix. Each entry is weighted by the inverse of its column token's
    count and rows are then normalised to sum to one.

    Args:
        names: iterable of split names; each must be 'train' or 'test'.
        args: object exposing ``use_RAD`` and ``RAD_dir``. When ``use_RAD``
            is truthy, ``RAD_dir`` replaces ``dataroot``.
        dictionary: object providing ``tokenize(text, flag)``, ``__len__``
            and ``idx2word``. NOTE(review): the ``True`` flag presumably
            asks tokenize to add unseen words - confirm against Dictionary.
        dataroot: directory holding ``<name>set.json`` and ``glove/``.
        target: collection of dataset tags; questions are only read when it
            contains 'rad'.

    Returns:
        (tfidf, weights): the coalesced sparse matrix, and the first value
        returned by ``utils.create_glove_embedding_init``.

    Raises:
        AssertionError: if a split name is not 'train' or 'test'.
    """
    inds = [[], []]  # rows, cols of the uncoalesced sparse matrix
    df = dict()      # token id -> number of occurrences seen so far
    # Size of the dictionary *before* tokenizing; only ids below this
    # threshold are allowed to become row indices.
    N = len(dictionary)
    if args.use_RAD:
        dataroot = args.RAD_dir

    def populate(inds, df, text):
        """Record token counts and pairwise co-occurrences for one text."""
        tokens = dictionary.tokenize(text, True)
        for t in tokens:
            df[t] = df.get(t, 0) + 1
        for first, second in itertools.combinations(tokens, 2):
            # Add both directions of the pair, but only for rows < N.
            if first < N:
                inds[0].append(first)
                inds[1].append(second)
            if second < N:
                inds[0].append(second)
                inds[1].append(first)

    if 'rad' in target:
        for name in names:
            assert name in ['train', 'test']
            question_path = os.path.join(dataroot, name + 'set.json')
            # Context manager so the file handle is closed deterministically.
            with open(question_path) as f:
                questions = json.load(f)
            for question in questions:
                populate(inds, df, question['question'])

    # TF-IDF: one value per (row, col) entry, scaled by the column token's
    # count. The value list is indexed by entry position (idx), NOT by the
    # token id (col); indexing by col scaled the wrong entries.
    vals = [1.0] * len(inds[1])
    for idx, col in enumerate(inds[1]):
        assert df[col] >= 1, 'document frequency should be greater than zero!'
        vals[idx] /= df[col]

    # Make stochastic matrix
    def normalize(inds, vals):
        """Divide every entry by the sum of its row (in place)."""
        row_sums = dict()
        for row, val in zip(inds[0], vals):
            row_sums[row] = row_sums.get(row, 0) + val
        for idx, row in enumerate(inds[0]):
            vals[idx] /= row_sums[row]
        return vals

    vals = normalize(inds, vals)

    tfidf = torch.sparse.FloatTensor(torch.LongTensor(inds), torch.FloatTensor(vals))
    # coalesce() sums duplicate (row, col) entries into a single value.
    tfidf = tfidf.coalesce()

    # Latent word embeddings
    emb_dim = 300
    glove_file = os.path.join(dataroot, 'glove', 'glove.6B.%dd.txt' % emb_dim)
    # Only the words at positions >= N in idx2word are passed on.
    weights, word2emb = utils.create_glove_embedding_init(dictionary.idx2word[N:], glove_file)
    print('tf-idf stochastic matrix (%d x %d) is generated.' % (tfidf.size(0), tfidf.size(1)))

    return tfidf, weights
Exemplo n.º 2
0
def tfidf_from_questions(names, dictionary, dataroot='data', target=['vqa', 'vg', 'cap']):
    """Build a row-stochastic token co-occurrence matrix plus GloVe weights.

    Text is gathered from up to three sources selected by ``target``
    (VQA 2.0 questions, Visual Genome questions, MSCOCO captions). Each
    unordered pair of tokens in a text contributes (up to) two entries to a
    sparse matrix. Each entry is weighted by the inverse of its column
    token's count and rows are then normalised to sum to one.

    Args:
        names: iterable of VQA split names; each must be one of 'train',
            'val', 'test-dev2015', 'test2015'. Only used for 'vqa'.
        dictionary: object providing ``tokenize(text, flag)``, ``__len__``
            and ``idx2word``. NOTE(review): the ``True`` flag presumably
            asks tokenize to add unseen words - confirm against Dictionary.
        dataroot: directory holding the VQA and Visual Genome JSON files.
        target: collection of dataset tags among 'vqa', 'vg', 'cap'.

    Returns:
        (tfidf, weights): the coalesced sparse matrix, and the first value
        returned by ``utils.create_glove_embedding_init``.

    Raises:
        AssertionError: if a VQA split name is not recognised.
    """
    inds = [[], []]  # rows, cols of the uncoalesced sparse matrix
    df = dict()      # token id -> number of occurrences seen so far
    # Size of the dictionary *before* tokenizing; only ids below this
    # threshold are allowed to become row indices.
    N = len(dictionary)

    def populate(inds, df, text):
        """Record token counts and pairwise co-occurrences for one text."""
        tokens = dictionary.tokenize(text, True)
        for t in tokens:
            df[t] = df.get(t, 0) + 1
        for first, second in itertools.combinations(tokens, 2):
            # Add both directions of the pair, but only for rows < N.
            if first < N:
                inds[0].append(first)
                inds[1].append(second)
            if second < N:
                inds[0].append(second)
                inds[1].append(first)

    if 'vqa' in target:  # VQA 2.0
        for name in names:
            assert name in ['train', 'val', 'test-dev2015', 'test2015']
            # train/val file names carry a '2014' suffix; test splits already
            # include their year in the split name.
            split = name if name.startswith('test') else name + '2014'
            question_path = os.path.join(
                dataroot, 'v2_OpenEnded_mscoco_%s_questions.json' % split)
            with open(question_path) as f:
                questions = json.load(f)['questions']

            for question in questions:
                populate(inds, df, question['question'])

    if 'vg' in target:  # Visual Genome
        question_path = os.path.join(dataroot, 'question_answers.json')
        with open(question_path, 'r') as f:
            vgq = json.load(f)
        for vg in vgq:
            for q in vg['qas']:
                populate(inds, df, q['question'])

    if 'cap' in target:  # MSCOCO Caption
        for split in ['train2017', 'val2017']:
            # NOTE(review): this path is hard-coded under 'data/' and does
            # not follow `dataroot`; kept as-is to avoid breaking callers.
            with open('data/annotations/captions_%s.json' % split, 'r') as f:
                captions = json.load(f)
            for caps in captions['annotations']:
                populate(inds, df, caps['caption'])

    # TF-IDF: one value per (row, col) entry, scaled by the column token's
    # count. The value list is indexed by entry position (idx), NOT by the
    # token id (col); indexing by col scaled the wrong entries.
    vals = [1.0] * len(inds[1])
    for idx, col in enumerate(inds[1]):
        assert df[col] >= 1, 'document frequency should be greater than zero!'
        vals[idx] /= df[col]

    # Make stochastic matrix
    def normalize(inds, vals):
        """Divide every entry by the sum of its row (in place)."""
        row_sums = dict()
        for row, val in zip(inds[0], vals):
            row_sums[row] = row_sums.get(row, 0) + val
        for idx, row in enumerate(inds[0]):
            vals[idx] /= row_sums[row]
        return vals

    vals = normalize(inds, vals)

    tfidf = torch.sparse.FloatTensor(torch.LongTensor(inds), torch.FloatTensor(vals))
    # coalesce() sums duplicate (row, col) entries into a single value.
    tfidf = tfidf.coalesce()

    # Latent word embeddings
    emb_dim = 300
    # NOTE(review): hard-coded under 'data/' rather than `dataroot`; kept
    # unchanged for backward compatibility.
    glove_file = 'data/glove/glove.6B.%dd.txt' % emb_dim
    # Only the words at positions >= N in idx2word are passed on.
    weights, word2emb = utils.create_glove_embedding_init(dictionary.idx2word[N:], glove_file)
    print('tf-idf stochastic matrix (%d x %d) is generated.' % (tfidf.size(0), tfidf.size(1)))

    return tfidf, weights
def tfidf_from_questions_gqa(names, dictionary, dataroot='data', target=['gqa']):
    """Build a row-stochastic token co-occurrence matrix from GQA questions.

    Each unordered pair of tokens in a question contributes (up to) two
    entries to a sparse matrix. Each entry is weighted by the inverse of its
    column token's count and rows are then normalised to sum to one.

    Args:
        names: iterable of GQA split names ('train_all', 'val_balanced', ...).
            Each is loaded from
            ``<dataroot>/gqa/questions/<name>_questions.pkl``.
        dictionary: object providing ``tokenize(text, flag)``, ``__len__``
            and ``idx2word``. NOTE(review): the ``True`` flag presumably
            asks tokenize to add unseen words - confirm against Dictionary.
        dataroot: directory holding the ``gqa/`` and ``glove/`` folders.
        target: collection of dataset tags; questions are only read when it
            contains 'gqa'.

    Returns:
        (tfidf, weights): the coalesced sparse matrix, and the first value
        returned by ``utils.create_glove_embedding_init``.

    Raises:
        AssertionError: if a split name is not one of the known GQA splits.
    """
    valid_names = ['train_all', 'train_balanced', 'val_all', 'val_balanced',
                   'challenge_all', 'challenge_balanced', 'testdev_all',
                   'testdev_balanced', 'test_all', 'test_balanced']

    inds = [[], []]  # rows, cols of the uncoalesced sparse matrix
    df = dict()      # token id -> number of occurrences seen so far
    # Size of the dictionary *before* tokenizing; only ids below this
    # threshold are allowed to become row indices.
    N = len(dictionary)

    def populate(inds, df, text):
        """Record token counts and pairwise co-occurrences for one text."""
        tokens = dictionary.tokenize(text, True)
        for t in tokens:
            df[t] = df.get(t, 0) + 1
        for first, second in itertools.combinations(tokens, 2):
            # Add both directions of the pair, but only for rows < N.
            if first < N:
                inds[0].append(first)
                inds[1].append(second)
            if second < N:
                inds[0].append(second)
                inds[1].append(first)

    # GQA
    if 'gqa' in target:
        for name in names:
            assert name in valid_names

            # Every split lives in a file named after the split itself, so a
            # single formatted path replaces the per-split branches.
            questions_path = os.path.join(
                dataroot, 'gqa', 'questions', '%s_questions.pkl' % name)
            # NOTE: pickle can execute arbitrary code while loading; only
            # point this at trusted, locally generated files.
            with open(questions_path, 'rb') as f:
                questions = pickle.load(f)

            print(name)
            count = 0
            for question in questions:
                count += 1
                populate(inds, df, question['question'])

            print(count)

    # TF-IDF: one value per (row, col) entry, scaled by the column token's
    # count. The value array is indexed by entry position (idx), NOT by the
    # token id (col); indexing by col scaled the wrong entries.
    vals = np.ones((len(inds[1])))
    for idx, col in enumerate(inds[1]):
        assert df[col] >= 1, 'document frequency should be greater than zero!'
        vals[idx] /= df[col]

    # Make stochastic matrix
    def normalize(inds, vals):
        """Divide every entry by the sum of its row (in place)."""
        row_sums = dict()
        for row, val in zip(inds[0], vals):
            row_sums[row] = row_sums.get(row, 0) + val
        for idx, row in enumerate(inds[0]):
            vals[idx] /= row_sums[row]
        return vals

    vals = normalize(inds, vals)

    tfidf = torch.sparse.FloatTensor(torch.LongTensor(inds),
                                     torch.FloatTensor(vals))
    # coalesce() sums duplicate (row, col) entries into a single value.
    tfidf = tfidf.coalesce()

    # Latent word embeddings
    emb_dim = 300
    glove_file = dataroot + '/glove/glove.6B.%dd.txt' % emb_dim
    # Only the words at positions >= N in idx2word are passed on.
    weights, word2emb = utils.create_glove_embedding_init(dictionary.idx2word[N:], glove_file)
    print('tf-idf stochastic matrix (%d x %d) is generated.' % (tfidf.size(0), tfidf.size(1)))

    return tfidf, weights