def get_tokens(self, sentences):
    """Tokenize one sentence or a list of sentences.

    A single (non-list) sentence is wrapped in a list first, so the
    return value is always a list of token lists.

    Args:
        sentences: A sentence, or a list of sentences, to tokenize.
            The element type is whatever `tokenize` accepts
            (presumably `str` — confirm against callers).

    Returns:
        list: One token list (output of `tokenize`) per input sentence.
    """
    if not isinstance(sentences, list):
        sentences = [sentences]
    # Comprehension replaces the original enumerate/append loop;
    # the index was never used.
    return [tokenize(sentence) for sentence in sentences]
def get_imdb(ann_path: str, quest_path: str, split: str, answer_vocab_path: str) -> np.ndarray:
    """Build an OK-VQA imdb from annotation/question JSON files.

    The first entry is a header dict naming the dataset; every following
    entry describes one question, its image, and its ground-truth answers.
    Answers not present in the answer vocabulary are replaced by a single
    "<unk>" answer (the raw answers are still kept in ``all_answers``).

    Args:
        ann_path: Path to the annotations JSON (``{"annotations": [...]}``).
        quest_path: Path to the questions JSON (``{"questions": [...]}``).
        split: Split name used to reconstruct COCO image file names.
        answer_vocab_path: Path to a newline-separated answer vocabulary.

    Returns:
        np.ndarray: Object array of dicts (header first, then one per question).
    """
    imdb = [{"dataset_name": "okvqa"}]

    with PathManager.open(answer_vocab_path, "r") as f:
        answer_vocab = set(f.read().splitlines())
    with PathManager.open(ann_path, "r") as f:
        annotations = json.load(f)["annotations"]
    with PathManager.open(quest_path, "r") as f:
        questions = json.load(f)["questions"]

    # Map question_id -> list of raw answer records.
    gt_answers = {ann["question_id"]: ann["answers"] for ann in annotations}

    count = 0  # questions whose answers are all out-of-vocabulary
    for quest in tqdm(questions):
        image_name = f"COCO_{split}_{quest['image_id']:012d}"
        q_id = quest["question_id"]
        all_answers = [item['answer'] for item in gt_answers[q_id]]
        in_vocab = [ans for ans in all_answers if ans in answer_vocab]
        if not in_vocab:
            in_vocab = ["<unk>"]
            count += 1
        imdb.append(
            {
                "image_name": image_name,
                "image_id": quest["image_id"],
                "feature_path": f"{image_name}.npy",
                "question_id": q_id,
                "question_str": quest["question"],
                "question_tokens": tokenize(quest["question"]),
                "answers": in_vocab,
                "all_answers": all_answers,
            }
        )

    print("Unknown questions:", count)
    return np.array(imdb)
def extract(self):
    """Build and save a frequency-filtered vocabulary from the corpus.

    Tokenizes every text returned by ``self.get_text()``, keeps words that
    occur at least ``self.min_freq`` times, saves the sorted vocabulary,
    and prints the min/max text lengths (in tokens).
    """
    os.makedirs(self.out_dir, exist_ok=True)

    texts = self.get_text()
    word_count = Counter()
    text_lengths = []
    for text in texts:
        words = tokenize(text)
        text_lengths.append(len(words))
        word_count.update(words)

    # The UNK token is not stored here; the Vocab class in core/text
    # adds it on the fly.
    vocabulary = sorted(
        word for word, freq in word_count.items() if freq >= self.min_freq
    )
    self.save_vocabulary(vocabulary)

    print("min text len=", min(text_lengths))
    print("max text len=", max(text_lengths))
def __getitem__(self, idx):
    """Return the idx-th example as a Sample.

    Each Sample produced here is later collated by the special batch
    collator into a SampleList — essentially an attribute-based batch.
    Fields set: ``text`` (processed question), ``answers`` / ``targets``
    (processed answer and its score vector), and ``image`` (CHW float
    tensor scaled to [0, 1]).
    """
    entry = self.questions[idx]
    sample = Sample()

    # Question: tokenize (keeping some punctuation, dropping sentence
    # terminators) and run through the text processor.
    question_tokens = tokenize(entry["question"], keep=[";", ","], remove=["?", "."])
    sample.text = self.text_processor({"tokens": question_tokens})["text"]

    # Answer: processor yields both the answer ids and their score targets.
    answer_out = self.answer_processor({"answers": [entry["answer"]]})
    sample.answers = answer_out["answers"]
    sample.targets = answer_out["answers_scores"]

    # Image: load RGB, scale to [0, 1] float32, HWC -> CHW.
    image_path = os.path.join(self.image_path, entry["image_filename"])
    image = np.true_divide(Image.open(image_path).convert("RGB"), 255).astype(np.float32)
    sample.image = torch.from_numpy(image.transpose(2, 0, 1))

    return sample
def test_tokenize(self):
    """tokenize() on the example string yields the expected token list."""
    result = text_utils.tokenize(self.TOKENIZE_EXAMPLE)
    self.assertEqual(list(result), self.TOKENS)