Example #1
# Standard-library import used below; `text` and `coco_name_format` are
# project-local helpers assumed to be available in the surrounding module.
from collections import Counter

def process_vqa_dataset(questions, annotations, split, args, maps=None):
    """
    Process the questions and annotations into a consolidated dataset.
    This is done only for the training set.
    :param questions:
    :param annotations:
    :param split:
    :param args:
    :param maps: Dict containing various mappings such as word_to_wid, wid_to_word, ans_to_aid and aid_to_ans
    :return: The processed dataset ready to be used

    """
    dataset = []
    for idx, q in enumerate(questions):
        d = {}
        d["question_id"] = q["question_id"]
        d["question"] = q["question"]
        d["image_id"] = q["image_id"]
        d["image_name"] = coco_name_format(q["image_id"], "train")

        d["answer"] = annotations[idx]["multiple_choice_answer"]
        answers = []
        for ans in annotations[idx]['answers']:
            answers.append(ans['answer'])
        d['answers_occurence'] = Counter(answers).most_common()

        dataset.append(d)

    # Get the top `args.top_answer_limit` answers so we can filter the dataset to only questions with these answers
    top_answers = text.get_top_answers(dataset, args.top_answer_limit)
    dataset = text.filter_dataset(dataset, top_answers)

    # Process the questions
    dataset = text.preprocess_questions(dataset)
    vocab = text.get_vocabulary(dataset)
    dataset = text.remove_tail_words(dataset, vocab)

    if split == "train":
        word_to_wid = {w: i for i, w in enumerate(vocab)}
        wid_to_word = [w for w in vocab]

        ans_to_aid = {a: i for i, a in enumerate(top_answers)}
        aid_to_ans = [a for a in top_answers]

    else:  # split == "val":
        word_to_wid = maps["word_to_wid"]
        wid_to_word = maps["wid_to_word"]
        ans_to_aid = maps["ans_to_aid"]
        aid_to_ans = maps["aid_to_ans"]

    dataset = text.encode_questions(dataset, word_to_wid, args.max_length)
    dataset = text.encode_answers(dataset, ans_to_aid)

    return dataset, vocab, word_to_wid, wid_to_word, ans_to_aid, aid_to_ans
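
A minimal usage sketch for this variant, assuming the question and annotation lists have already been loaded from the VQA JSON files; the variable names (`train_questions`, `val_annotations`, etc.) and the `args` namespace are placeholders, not part of the original code.

# Hypothetical driver: build the training set first, then reuse its mappings
# for the validation split (all input variables below are placeholders).
from argparse import Namespace

args = Namespace(top_answer_limit=1000, max_length=26)

train_set, vocab, word_to_wid, wid_to_word, ans_to_aid, aid_to_ans = \
    process_vqa_dataset(train_questions, train_annotations, "train", args)

maps = {"word_to_wid": word_to_wid, "wid_to_word": wid_to_word,
        "ans_to_aid": ans_to_aid, "aid_to_ans": aid_to_ans}
val_set, _, _, _, _, _ = process_vqa_dataset(
    val_questions, val_annotations, "val", args, maps=maps)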
Example #2
# Standard-library imports used below; `text` and `coco_name_format` are
# project-local helpers assumed to be available in the surrounding module.
import json
import os
import pickle
from collections import Counter

def process_vqa_dataset(questions_file,
                        annotations_file,
                        split,
                        maps=None,
                        top_answer_limit=1000,
                        max_length=26,
                        year=2014):
    """
    Process the questions and annotations into a consolidated dataset.
    This is done only for the training set.
    :param questions_file:
    :param annotations_file:
    :param split: The dataset split.
    :param maps: Dict containing various mappings such as word_to_wid, wid_to_word, ans_to_aid and aid_to_ans.
    :param top_answer_limit:
    :param max_length: The maximum quetsion length. Taken from the VQA sample code.
    :param year: COCO Dataset release year.
    :return: The processed dataset ready to be used

    """
    cache_file = "vqa_{0}_dataset_cache.pickle".format(split)

    # Check if preprocessed cache exists. If yes, load it up, else preprocess the data
    if os.path.exists(cache_file):
        print("Found {0} set cache! Loading...".format(split))
        with open(cache_file, 'rb') as f:
            dataset, vocab, word_to_wid, wid_to_word, ans_to_aid, aid_to_ans = pickle.load(f)

    else:
        # load the annotations and questions files
        print("Loading {0} annotations".format(split))
        with open(annotations_file) as ann:
            j = json.load(ann)
            annotations = j["annotations"]

        print("Loading {0} questions".format(split))
        with open(questions_file) as q:
            j = json.load(q)
            questions = j["questions"]

        # load up the dataset
        dataset = []
        for idx, q in enumerate(questions):
            d = dict()
            d["question_id"] = q["question_id"]
            d["question"] = q["question"]
            d["image_id"] = q["image_id"]
            d["image_name"] = coco_name_format(q["image_id"], split, year)

            d["answer"] = annotations[idx]["multiple_choice_answer"]
            answers = []
            for ans in annotations[idx]['answers']:
                answers.append(ans['answer'])
            d['answers_occurence'] = Counter(answers).most_common()

            d["question_type"] = annotations[idx]["question_type"]
            d["answer_type"] = annotations[idx]["answer_type"]

            dataset.append(d)

        # Get the top N answers so we can filter the dataset to only questions with these answers
        top_answers = text.get_top_answers(dataset, top_answer_limit)
        dataset = text.filter_dataset(dataset, top_answers)

        # Process the questions
        dataset = text.preprocess_questions(dataset)

        if split == "train":
            vocab = text.get_vocabulary(dataset)
            word_to_wid = {w: i + 1
                           for i, w in enumerate(vocab)
                           }  # 0 is used for padding
            wid_to_word = {i + 1: w for i, w in enumerate(vocab)}
            ans_to_aid = {a: i for i, a in enumerate(top_answers)}
            aid_to_ans = {i: a for i, a in enumerate(top_answers)}

        else:  # split == "val":
            vocab = maps["vocab"]
            word_to_wid = maps["word_to_wid"]
            wid_to_word = maps["wid_to_word"]
            ans_to_aid = maps["ans_to_aid"]
            aid_to_ans = maps["aid_to_ans"]

            dataset = text.remove_tail_words(dataset, vocab)

        dataset = text.encode_questions(dataset, word_to_wid, max_length)
        dataset = text.encode_answers(dataset, ans_to_aid)

        print("Caching the processed data")
        with open(cache_file, 'wb') as f:
            pickle.dump(
                [dataset, vocab, word_to_wid, wid_to_word, ans_to_aid, aid_to_ans], f)

    return dataset, vocab, word_to_wid, wid_to_word, ans_to_aid, aid_to_ans
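
A minimal usage sketch for this variant. The file names below follow the standard VQA v2 release naming and are used here only as placeholder paths; the first call for each split caches its output to a pickle, so a repeated call loads the cache instead of reprocessing.

# Hypothetical driver: process the training split, then pass its mappings
# (including the vocabulary) when processing the validation split.
train_questions_file = "v2_OpenEnded_mscoco_train2014_questions.json"  # placeholder path
train_annotations_file = "v2_mscoco_train2014_annotations.json"        # placeholder path

train_set, vocab, word_to_wid, wid_to_word, ans_to_aid, aid_to_ans = \
    process_vqa_dataset(train_questions_file, train_annotations_file, "train")

maps = {"vocab": vocab, "word_to_wid": word_to_wid, "wid_to_word": wid_to_word,
        "ans_to_aid": ans_to_aid, "aid_to_ans": aid_to_ans}
val_set, *_ = process_vqa_dataset("v2_OpenEnded_mscoco_val2014_questions.json",
                                  "v2_mscoco_val2014_annotations.json",
                                  "val", maps=maps)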