import json
import os
from collections import Counter

import numpy as np
import tqdm

# tokenize, get_objects, extract_image_features, merge_train, and
# preprocess_answer are assumed to be defined elsewhere in this module.


def get_imdb(file_path):
    """Convert one GQA questions JSON file into a VQA-style imdb array."""
    imdb = [{"dataset_name": "gqa"}]
    with open(file_path, "r") as f:
        questions = json.load(f)
    print("Processing file {}".format(file_path))
    for qid, item in tqdm.tqdm(questions.items()):
        entry = {
            # Fixed: was `item["imageId"] + "jpg"`, which dropped the dot
            # and produced names like "2386321jpg".
            "image_name": item["imageId"] + ".jpg",
            "image_id": item["imageId"],
            "question_id": qid,
            "question_str": item["question"],
            "question_tokens": tokenize(item["question"]),
        }
        # Only annotated splits carry answers and semantic-program metadata;
        # test/challenge questions have neither.
        if "answer" in item:
            # Replicate the single GQA answer 10 times to match the
            # 10-annotator answer format VQA imdbs expect.
            entry["all_answers"] = [item["answer"] for _ in range(10)]
            entry["valid_answers"] = [item["answer"] for _ in range(10)]
            entry["semantic_string"] = (item["semanticStr"],)
            entry["gt_object_ids"] = (get_objects(item["semanticStr"]),)
            entry["meta_data"] = item["types"]
        imdb.append(entry)
    return np.array(imdb)
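# A minimal sanity-check helper (a sketch, not part of the original pipeline):
# load a saved imdb .npy and print its first few entries. The function name and
# signature are assumptions for illustration. Entries are plain dicts stored in
# an object array, so NumPy needs allow_pickle=True to deserialize them.
def inspect_imdb(imdb_path, n=3):
    imdb = np.load(imdb_path, allow_pickle=True)
    print("header:", imdb[0])  # first record is the header: {"dataset_name": "gqa"}
    for entry in imdb[1 : n + 1]:
        print(entry["question_id"], entry["question_str"])
        if "all_answers" in entry:
            print("  answer:", entry["all_answers"][0])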
def convert_gqa_to_vqa(gqa_dir, out_dir):
    """
    Takes the GQA dataset and converts it into VQA format.

    Assumes a GQA dir structure as:
    - gqa_dir/
        - images/
            - images/
            - objects/
            - spatial/
        - questions/
        - scenegraphs/
    """
    image_feat_path = os.path.join(gqa_dir, "images")
    extract_image_features(image_feat_path, out_dir)

    questions_dir = os.path.join(gqa_dir, "questions")
    # GQA ships train questions in multiple chunks; merge them once into a
    # single train_all_questions.json unless it already exists.
    if os.path.isfile(os.path.join(questions_dir, "train_all_questions.json")):
        print("Using previously generated train_all_questions.json file")
    else:
        merge_train(os.path.join(gqa_dir, "questions", "train_all_questions"))

    # Build one imdb .npy per split, in both the "all" and "balanced" variants.
    split_mapping = {
        "test": "test_all_questions.json",
        "val": "val_all_questions.json",
        "challenge": "challenge_all_questions.json",
        "train": "train_all_questions.json",
    }
    for split in split_mapping:
        for balance_type in ["balanced", "all"]:
            filename = split_mapping[split]
            csplit = split
            if balance_type == "balanced":
                filename = filename.replace("_all", "_balanced")
                csplit = split + "_balanced"
            file_path = os.path.join(questions_dir, filename)
            imdb = get_imdb(file_path)
            save_path = os.path.join(out_dir, "imdb_{}.npy".format(csplit))
            np.save(save_path, imdb)

    # Collect vocabulary statistics over the annotated splits only
    # (test/challenge questions have no answers).
    splits = ["val", "train"]
    split_type = ["balanced", "all"]
    global_answer = Counter()
    global_q = Counter()
    question_len = Counter()
    for s in splits:
        for st in split_type:
            questions_json = os.path.join(
                questions_dir, "{}_{}_questions.json".format(s, st)
            )
            with open(questions_json, "r") as f:
                questions = json.load(f)
            print("Processing split {}_{}".format(s, st))
            # Per-split counters, tracked alongside the global ones.
            answers = Counter()
            q_tokens = Counter()
            for _, q in tqdm.tqdm(questions.items()):
                tokens = tokenize(q["question"])
                q_tokens.update(tokens)
                global_q.update(tokens)
                answers.update([q["answer"].lower()])
                global_answer.update([q["answer"].lower()])
                question_len.update([len(tokens)])

    print("N unique answers :", len(global_answer))
    print("N unique q tokens:", len(global_q))
    print("Min Q length", min(question_len))
    print("Max Q length", max(question_len))
    print("Q length distribution", question_len)

    # Save question vocabulary, with <unk> as index 0
    q_vocabulary = ["<unk>"] + sorted(global_q)
    vocab_file = os.path.join(out_dir, "vocabulary_gqa.txt")
    with open(vocab_file, "w") as f:
        f.writelines([w + "\n" for w in q_vocabulary])

    # Save answer vocabulary, normalized and de-blanked, with <unk> first
    answer_list = [preprocess_answer(ans) for ans in global_answer]
    answer_list = [t.strip() for t in answer_list if len(t.strip()) > 0]
    answer_list.sort()
    if "<unk>" not in answer_list:
        answer_list = ["<unk>"] + answer_list
    answer_file = os.path.join(out_dir, "answers_gqa.txt")
    with open(answer_file, "w") as fp:
        fp.writelines([w + "\n" for w in answer_list])
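# A hypothetical command-line entry point (a sketch; the original script may
# wire this up differently). Both flag names below are assumptions.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Convert the GQA dataset into VQA imdb format"
    )
    parser.add_argument("--gqa_dir", required=True, help="Root of the GQA download")
    parser.add_argument("--out_dir", required=True, help="Output dir for imdbs/vocabs")
    args = parser.parse_args()
    convert_gqa_to_vqa(args.gqa_dir, args.out_dir)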