Example #1
def preprocess(model):
    if args.sw_name.startswith("clevr"):
        program_prefix = vr.programs.list_to_prefix(model["program"])
    else:
        program_prefix = clevr_util.parse_program(mode=0, model=model)
    program_str = vr.programs.list_to_str(program_prefix)
    program_tokens = tokenize(program_str)
    program_encoded = encode(program_tokens, program_token_to_idx)
    # Pad with <NULL> tokens up to the fixed program length of 27
    program_encoded += [
        program_token_to_idx["<NULL>"]
        for _ in range(27 - len(program_encoded))
    ]
    return np.asarray(program_encoded, dtype=np.int64)
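
Example #1 pads every encoded program with <NULL> tokens up to a fixed length of 27 before returning it as an int64 array. A minimal, self-contained sketch of that padding step, using a hypothetical token mapping in place of the module-level program_token_to_idx:

import numpy as np

# Hypothetical stand-ins for the module-level names used in Example #1.
program_token_to_idx = {"<NULL>": 0, "<START>": 1, "<END>": 2, "scene": 3, "count": 4}
MAX_PROGRAM_LENGTH = 27  # fixed length the downstream model expects

def pad_program(program_encoded):
    # Append <NULL> ids until the sequence reaches the fixed length.
    padding = [program_token_to_idx["<NULL>"]] * (MAX_PROGRAM_LENGTH - len(program_encoded))
    return np.asarray(program_encoded + padding, dtype=np.int64)

print(pad_program([1, 3, 4, 2]).shape)  # (27,)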
Example #2
def run_single_example(args, model, dtype, question_raw, feats_var=None):
    interactive = feats_var is not None
    if not interactive:
        feats_var = extract_image_features(args, dtype)

    # Tokenize the question
    vocab = load_vocab(args)
    question_tokens = tokenize(question_raw,
                               punct_to_keep=[';', ','],
                               punct_to_remove=['?', '.'])
    if args.enforce_clevr_vocab == 1:
        for word in question_tokens:
            if word not in vocab['question_token_to_idx']:
                print(
                    colored(
                        'No one taught me what "%s" means :( Try me again!' %
                        (word), 'magenta'))
                return
    question_encoded = encode(question_tokens,
                              vocab['question_token_to_idx'],
                              allow_unk=True)
    question_encoded = torch.LongTensor(question_encoded).view(1, -1)
    question_encoded = question_encoded.type(dtype).long()
    question_var = Variable(question_encoded, volatile=False)

    # Run the model
    scores = None
    predicted_program = None
    if type(model) is tuple:
        pg, ee = model
        pg.type(dtype)
        pg.eval()
        ee.type(dtype)
        ee.eval()
        if args.model_type == 'FiLM':
            predicted_program = pg(question_var)
        else:
            predicted_program = pg.reinforce_sample(
                question_var,
                temperature=args.temperature,
                argmax=(args.sample_argmax == 1))
        programs[question_raw] = predicted_program
        if args.debug_every <= -1:
            pdb.set_trace()
        scores = ee(feats_var, predicted_program, save_activations=True)
    else:
        model.type(dtype)
        scores = model(question_var, feats_var)

    # Print results
    predicted_probs = scores.data.cpu()
    _, predicted_answer_idx = predicted_probs[0].max(dim=0)
    predicted_probs = F.softmax(Variable(predicted_probs[0])).data
    predicted_answer = vocab['answer_idx_to_token'][predicted_answer_idx[0]]

    answers_to_probs = {}
    for i in range(len(vocab['answer_idx_to_token'])):
        answers_to_probs[vocab['answer_idx_to_token'][i]] = predicted_probs[i]
    answers_to_probs_sorted = sorted(answers_to_probs.items(),
                                     key=lambda x: x[1])
    answers_to_probs_sorted.reverse()
    for i in range(len(answers_to_probs_sorted)):
        if (answers_to_probs_sorted[i][1] >= 1e-3
                and args.debug_every < float('inf')):
            print("%s: %.1f%%" % (answers_to_probs_sorted[i][0].capitalize(),
                                  100 * answers_to_probs_sorted[i][1]))

    if not interactive:
        print(colored('Question: "%s"' % question_raw, 'cyan'))
    print(colored(str(predicted_answer).capitalize(), 'magenta'))

    if interactive:
        return

    # Visualize Gradients w.r.t. output
    cf_conv = ee.classifier[0](ee.cf_input)
    cf_bn = ee.classifier[1](cf_conv)
    pre_pool = ee.classifier[2](cf_bn)
    pooled = ee.classifier[3](pre_pool)

    pre_pool_max_per_c = pre_pool.max(2)[0].max(3)[0].expand_as(pre_pool)
    pre_pool_masked = (pre_pool_max_per_c == pre_pool).float() * pre_pool
    pool_feat_locs = (pre_pool_masked > 0).float().sum(1)
    if args.debug_every <= 1:
        pdb.set_trace()

    if args.output_viz_dir != 'NA':
        viz_dir = args.output_viz_dir + question_raw + ' ' + predicted_answer
        if not os.path.isdir(viz_dir):
            os.mkdir(viz_dir)
        args.viz_dir = viz_dir
        print('Saving visualizations to ' + args.viz_dir)

        # Backprop w.r.t. sum of output scores - What affected prediction most?
        ee.feats.register_hook(save_grad('stem'))
        for i in range(ee.num_modules):
            ee.module_outputs[i].register_hook(save_grad('m' + str(i)))
        scores_sum = scores.sum()
        scores_sum.backward()

        # Visualizations!
        visualize(feats_var, args, 'resnet101')
        visualize(ee.feats, args, 'conv-stem')
        visualize(grads['stem'], args, 'grad-conv-stem')
        for i in range(ee.num_modules):
            visualize(ee.module_outputs[i], args, 'resblock' + str(i))
            visualize(grads['m' + str(i)], args, 'grad-resblock' + str(i))
        visualize(pre_pool, args, 'pre-pool')
        visualize(pool_feat_locs, args, 'pool-feature-locations')

    if (predicted_program is not None) and (args.model_type != 'FiLM'):
        print()
        print('Predicted program:')
        program = predicted_program.data.cpu()[0]
        num_inputs = 1
        for fn_idx in program:
            fn_str = vocab['program_idx_to_token'][fn_idx]
            num_inputs += vr.programs.get_num_inputs(fn_str) - 1
            print(fn_str)
            if num_inputs == 0:
                break
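
Example #2's visualization block assumes a module-level grads dict and a save_grad helper defined elsewhere in the original script. A minimal sketch of that register_hook pattern, with grads and save_grad as assumed names:

import torch

grads = {}  # assumed module-level store referenced in Example #2

def save_grad(name):
    # Return a hook that records the gradient flowing back into a tensor.
    def hook(grad):
        grads[name] = grad
    return hook

x = torch.randn(1, 4, requires_grad=True)
y = (x * 3).sum()
x.register_hook(save_grad('x'))
y.backward()
print(grads['x'])  # filled with 3s: dy/dx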
Example #3
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        answer_token_to_idx = {}  # stays empty for splits without answers
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab((q['answer'] for q in questions))
        question_token_to_idx = build_vocab((q['question'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q: continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    questions_encoded_bert = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    types = []
    for orig_idx, q in enumerate(questions):
        question = q['question']
        if 'program' in q:
            types += [q['program'][-1]['function']]

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)
        questions_encoded_bert.append(bert_tokenizer.encode(question.lower()))

        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens,
                                     vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    max_question_length_bert = max(len(x) for x in questions_encoded_bert)
    pad_token_bert = 0  # [PAD] has id 0 in the bert-base-uncased vocabulary
    for qe in questions_encoded_bert:
        while len(qe) < max_question_length_bert:
            qe.append(pad_token_bert)

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    questions_encoded_bert = np.asarray(questions_encoded_bert, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(questions_encoded_bert.shape)
    print(programs_encoded.shape)

    mapping = {}
    for i, t in enumerate(set(types)):
        mapping[t] = i

    print(mapping)

    types_coded = []
    for t in types:
        types_coded += [mapping[t]]

    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('questions_bert', data=questions_encoded_bert)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))

        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
        if len(types) > 0:
            f.create_dataset('types', data=np.asarray(types_coded))
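
The HDF5 file written above can be read back with h5py; a short sketch (the path is a placeholder for whatever was passed as --output_h5_file, and the optional datasets are only present if they were written):

import h5py

with h5py.File('train_questions.h5', 'r') as f:  # placeholder path
    questions = f['questions'][:]    # (num_questions, max_question_length), int32
    image_idxs = f['image_idxs'][:]
    answers = f['answers'][:] if 'answers' in f else None
    print(questions.shape, image_idxs.shape)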
Example #4
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data from', args.input_questions_json)
    if args.q_family_shift and len(args.q_family_shift):
        if len(args.q_family_shift) != len(args.input_questions_json):
            raise ValueError("shift must be provided for each question file")
        q_family_shifts = args.q_family_shift
    else:
        q_family_shifts = [0] * len(args.input_questions_json)
    questions = []
    for q_file, shift in zip(args.input_questions_json, q_family_shifts):
        print(q_file)
        with open(q_file, 'r') as f:
            more_questions = json.load(f)['questions']
            for q in more_questions:
                q['question_family_index'] += shift
            questions.extend(more_questions)

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        answer_token_to_idx = {}  # stays empty for splits without answers
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab((q['answer'] for q in questions))
        question_token_to_idx = build_vocab((q['question'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

        def arity(name):
            if name == 'scene':
                return 0
            if 'equal' in name or name in [
                    'union', 'intersect', 'less_than', 'greater_than'
            ]:
                return 2
            return 1

        vocab['program_token_arity'] = {
            name: arity(name)
            for name in program_token_to_idx
        }
    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    types = []
    for orig_idx, q in enumerate(questions):
        question = q['question']
        if 'program' in q:
            types += [q['program'][-1]['function']]

        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens,
                                     vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)

    mapping = {}
    for i, t in enumerate(set(types)):
        mapping[t] = i

    print(mapping)

    types_coded = []
    for t in types:
        types_coded += [mapping[t]]

    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))

        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
        if len(types) > 0:
            f.create_dataset('types', data=np.asarray(types_coded))
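
The nested arity helper in Example #4 assigns each program function its number of inputs; a few illustrative calls, with the helper repeated here so the snippet runs on its own (the token names follow the CLEVR program vocabulary):

def arity(name):
    # Same rule as in Example #4: 'scene' is a source, comparisons and set ops are binary.
    if name == 'scene':
        return 0
    if 'equal' in name or name in ['union', 'intersect', 'less_than', 'greater_than']:
        return 2
    return 1

assert arity('scene') == 0
assert arity('union') == 2
assert arity('equal_color') == 2
assert arity('filter_shape[cube]') == 1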
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode',
                        default='prefix',
                        choices=['chain', 'prefix', 'postfix'])
    parser.add_argument('--shapes_data',
                        type=str,
                        help="Path to the SHAPES dataset")
    parser.add_argument('--size',
                        type=str,
                        help="Which version of the training set to use")

    args = parser.parse_args()
    parts = ['train', 'val', 'test']
    part_prefixes = ['train.' + args.size, 'val', 'test']
    part_prefixes = [
        os.path.join(args.shapes_data, prefix) for prefix in part_prefixes
    ]

    for part, prefix in zip(parts, part_prefixes):
        image_path = prefix + '.input.npy'
        images = numpy.load(image_path)

        questions_path = prefix + '.query_str.txt'
        questions_encoded = []
        with open(questions_path) as src:
            questions = [str_ for str_ in src]
            if part == 'train':
                question_vocab = build_vocab(questions, delim=None)
            for qe in questions:
                tkn = tokenize(qe, delim=None)
                questions_encoded.append(
                    encode(tkn, question_vocab, allow_unk=True))
        max_question_length = max(len(x) for x in questions_encoded)
        for qe in questions_encoded:
            while len(qe) < max_question_length:
                qe.append(question_vocab['<NULL>'])

        answers_path = prefix + '.output'
        with open(answers_path) as src:
            answers = [1 if w.strip() == 'true' else 0 for w in src]

        programs_path = prefix + '.query'
        all_program_strs = []
        with open(programs_path) as src:
            for line in src:
                line = line.strip()
                program = layout_tree(layout_from_parsing(parse_tree(line)))
                program_str = program_to_str(program, args.mode)
                if program_str is not None:
                    all_program_strs.append(program_str)
        if part == 'train':
            program_vocab = build_vocab(all_program_strs)

        programs_encoded = []
        programs_arities = []
        programs_depths = []

        with open(programs_path) as src:
            for line in src:
                line = line.strip()
                program = layout_tree(layout_from_parsing(parse_tree(line)))
                program_str = program_to_str(program, args.mode)
                program_tokens = tokenize(program_str, delim=None)
                program_encoded = encode(program_tokens,
                                         program_vocab,
                                         allow_unk=True)
                programs_encoded.append(program_encoded)

                programs_arities.append(program_to_arity(program, args.mode))
                programs_depths.append(program_to_depth(program, args.mode))

        if len(programs_encoded) > 0:
            max_program_length = max(len(x) for x in programs_encoded)
            for pe in programs_encoded:
                while len(pe) < max_program_length:
                    pe.append(program_vocab['<NULL>'])

            max_program_arity_length = max(len(x) for x in programs_arities)
            for ar in programs_arities:
                while len(ar) < max_program_arity_length:
                    ar.append(-1)

            max_program_depth_length = max(len(x) for x in programs_depths)
            for de in programs_depths:
                while len(de) < max_program_depth_length:
                    de.append(-1)

            assert (max_program_length == max_program_arity_length) and (
                max_program_length == max_program_depth_length)

        # Create h5 file
        print('Writing output')
        questions_encoded = numpy.asarray(questions_encoded, dtype=numpy.int32)
        programs_encoded = numpy.asarray(programs_encoded, dtype=numpy.int32)
        programs_arities = numpy.asarray(programs_arities, dtype=numpy.int32)
        programs_depths = numpy.asarray(programs_depths, dtype=numpy.int32)
        print(questions_encoded.shape)
        print(programs_encoded.shape)
        print(programs_arities.shape)
        print(programs_depths.shape)

        with h5py.File(part + '_features.h5', 'w') as f:
            features = images.transpose(0, 3, 1, 2) / 255.0
            features_dataset = f.create_dataset('features', (features.shape),
                                                dtype=numpy.float32)
            features_dataset[:] = features

        with h5py.File(part + '_questions.h5', 'w') as f:
            f.create_dataset('questions', data=questions_encoded)

            image_idxs_dataset = f.create_dataset('image_idxs',
                                                  (len(questions_encoded), ),
                                                  dtype=numpy.int32)
            image_idxs_dataset[:] = range(len(questions_encoded))

            if len(programs_encoded) > 0:
                f.create_dataset('programs', data=programs_encoded)
                f.create_dataset('programs_arities', data=programs_arities)
                f.create_dataset('programs_depths', data=programs_depths)

            if len(answers) > 0:
                f.create_dataset('answers', data=numpy.asarray(answers))

    with open('vocab.json', 'w') as f:
        json.dump(
            {
                'question_token_to_idx': question_vocab,
                'program_token_to_idx': program_vocab,
                'answer_token_to_idx': {
                    'false': 0,
                    'true': 1
                }
            }, f)
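
Example #5 finishes by dumping the question, program, and answer vocabularies to vocab.json; downstream code can reload it and invert the answer mapping, roughly as sketched below:

import json

with open('vocab.json') as f:
    vocab = json.load(f)

# Invert the answer mapping so predicted indices map back to strings.
answer_idx_to_token = {idx: tok for tok, idx in vocab['answer_token_to_idx'].items()}
print(answer_idx_to_token)  # {0: 'false', 1: 'true'}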
Example #6
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = []
        for line in f:
            questions.append(json.loads(line))

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        answer_token_to_idx = {}  # stays empty for splits without labels
        if 'label' in questions[0]:
            answer_token_to_idx = build_vocab((q['label'] for q in questions))
        question_token_to_idx = build_vocab((q['sentence'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q: continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    types = []
    for orig_idx, q in enumerate(questions):
        question = q['sentence']

        orig_idxs.append(orig_idx)
        if "LEFT" in q["image_attention"]:  # LEFT IMG
            image_idxs.append(
                int(''.join(c for c in (q['identifier'] + "-img0")
                            if c in digits)))
        else:  # RIGHT IMG
            image_idxs.append(
                int(''.join(c for c in (q['identifier'] + "-img1")
                            if c in digits)))
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'label' in q:
            answers.append(vocab['answer_token_to_idx'][q['label']])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)

    mapping = {}
    for i, t in enumerate(set(types)):
        mapping[t] = i

    print(mapping)

    types_coded = []
    for t in types:
        types_coded += [mapping[t]]

    with h5py.File(args.output_h5_file, 'w') as f:
        print(image_idxs)
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))

        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families',
                             data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
        if len(types) > 0:
            f.create_dataset('types', data=np.asarray(types_coded))
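
The image_idxs logic in Example #6 derives a numeric index by keeping only the digits of the identifier plus an "-img0"/"-img1" suffix. A small illustration with a made-up NLVR-style identifier:

from string import digits

identifier = 'dev-850-3-1'  # hypothetical identifier
left_idx = int(''.join(c for c in (identifier + '-img0') if c in digits))
right_idx = int(''.join(c for c in (identifier + '-img1') if c in digits))
print(left_idx, right_idx)  # 850310 850311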