def get_item(self, idx):
    data = self.vqamb_data[idx]

    current_sample = Sample()

    # store question and image id
    current_sample.img_id = data['id']
    current_sample.qa_id = data['qa_id']

    # process question
    question = data["question"]
    tokens = tokenize(question, remove=["?"], keep=["'s"])

    processed = self.text_processor({"tokens": tokens})
    current_sample.text = processed["text"]

    # process answers
    processed = self.answer_processor({"answers": [data['answer']]})
    current_sample.answers = processed["answers"]
    current_sample.targets = processed["answers_scores"][1:]  # remove unknown index

    # Detectron features ----------------
    # TODO: read in detectron image instead if detectron is to be built
    detectron_path = self.detectron_folder + str(data['id'])
    if self.config.spatial:
        point = data['point']
        # current_sample.point = point
        detectron_path += ',' + str(point['x']) + ',' + str(point['y'])
    detectron_path += '.pt'

    detectron_feat = torch.load(detectron_path, map_location=torch.device('cpu'))

    # pad features to a fixed length of 100 regions
    if self.config.pad_detectron:
        if detectron_feat.shape[0] > 100:
            detectron_feat = detectron_feat[:100]
        elif detectron_feat.shape[0] < 100:
            pad = torch.zeros(100 - detectron_feat.shape[0], detectron_feat.shape[1])
            detectron_feat = torch.cat([detectron_feat, pad], dim=0)

    current_sample.image_feature_0 = detectron_feat
    # ---------------------------------------------

    return current_sample
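# The truncate-or-zero-pad block above recurs in several of the get_item
# methods below. A minimal standalone sketch of the same logic; the helper
# name pad_or_truncate is hypothetical, not part of this codebase.
import torch

def pad_or_truncate(feat, max_len=100):
    # Clamp an (N, D) feature tensor to exactly max_len rows: drop extra
    # rows, zero-fill missing ones.
    if feat.shape[0] > max_len:
        return feat[:max_len]
    if feat.shape[0] < max_len:
        pad = torch.zeros(max_len - feat.shape[0], feat.shape[1])
        return torch.cat([feat, pad], dim=0)
    return feat

# e.g. pad_or_truncate(torch.randn(37, 2048)).shape == torch.Size([100, 2048])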
def extract(self):
    os.makedirs(self.out_dir, exist_ok=True)

    word_count = Counter()

    texts = self.get_text()
    text_lengths = [None] * len(texts)

    for inx, text in enumerate(texts):
        words = tokenize(text)
        text_lengths[inx] = len(words)
        word_count.update(words)

    # the UNK token will be added on the fly if you use the Vocab class in core/text
    vocabulary = [w[0] for w in word_count.items() if w[1] >= self.min_freq]
    vocabulary.sort()

    self.save_vocabulary(vocabulary)

    print("min text len=", min(text_lengths))
    print("max text len=", max(text_lengths))
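# Toy illustration of the frequency filter at the heart of extract(),
# assuming only that tokenize() splits text into words (stubbed here with
# str.split); with min_freq=2, words seen only once are dropped.
from collections import Counter

texts = ["the cat sat", "the dog sat", "a cat"]
word_count = Counter()
for text in texts:
    word_count.update(text.split())

min_freq = 2
vocabulary = sorted(w for w, c in word_count.items() if c >= min_freq)
print(vocabulary)  # ['cat', 'sat', 'the']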
def build(self, annotations):
    targets = []

    for idx, annotation in enumerate(annotations):
        image_id = annotation[0].split('.')[0]
        image_name = image_id
        caption_str = annotation[1]
        caption_tokens = tokenize(caption_str)
        caption_tokens = ['<s>'] + caption_tokens + ['</s>']
        reference_tokens = [caption_tokens]
        feature_path = image_id + '.npy'

        target = {
            # 'image_id': image_id,
            'image_id': idx,
            'image_name': image_name,
            'caption_str': caption_str,
            'caption_tokens': caption_tokens,
            'reference_tokens': reference_tokens,
            'feature_path': feature_path,
        }
        targets.append(target)

    return targets
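# Hypothetical input/output pair for build(), assuming each annotation is an
# (image_filename, caption) tuple as the indexing above suggests; the
# filename and caption values are invented for illustration.
annotation = ("COCO_val2014_000000001.jpg", "a man riding a horse")

# For idx == 0, build() would produce roughly:
target = {
    'image_id': 0,  # the running index, not the file id
    'image_name': 'COCO_val2014_000000001',
    'caption_str': 'a man riding a horse',
    'caption_tokens': ['<s>', 'a', 'man', 'riding', 'a', 'horse', '</s>'],
    'reference_tokens': [['<s>', 'a', 'man', 'riding', 'a', 'horse', '</s>']],
    'feature_path': 'COCO_val2014_000000001.npy',
}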
def get_item(self, idx):
    data = self.questions[idx]

    # Each call to get_item from the dataloader returns a Sample object, which is
    # collated by our special batch collator into a SampleList -- basically an
    # attribute-based batch, in layman's terms.
    current_sample = Sample()

    question = data["question"]
    tokens = tokenize(question, keep=[";", ","], remove=["?", "."])
    processed = self.text_processor({"tokens": tokens})
    current_sample.text = processed["text"]

    processed = self.answer_processor({"answers": [data["answer"]]})
    current_sample.answers = processed["answers"]
    current_sample.targets = processed["answers_scores"]

    image_path = os.path.join(self.image_path, data["image_filename"])
    image = np.true_divide(Image.open(image_path).convert("RGB"), 255)
    image = image.astype(np.float32)
    current_sample.image = torch.from_numpy(image.transpose(2, 0, 1))

    return current_sample
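# Minimal sketch of the image preprocessing above: load as RGB, scale pixel
# values to [0, 1] as float32, then reorder HWC -> CHW for PyTorch. The
# helper name load_image_tensor is hypothetical.
import numpy as np
import torch
from PIL import Image

def load_image_tensor(path):
    image = np.true_divide(Image.open(path).convert("RGB"), 255)
    image = image.astype(np.float32)
    return torch.from_numpy(image.transpose(2, 0, 1))  # (3, H, W)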
def load_item(self, idx):
    sample = Sample()
    image_id = self.annotations[idx][0]
    image_folder = image_id.split('_')[0]
    caption = self.annotations[idx][1]
    tokens = tokenize(caption)
    tokens = ['<s>'] + tokens + ['</s>']

    # use text_processor to process the caption: it pads the sequence,
    # converts tokens to indices and adds the SOS/EOS tokens
    # (text_processor already contains a pre-processor to tokenize the caption)
    caption_p = self.text_processor({'tokens': tokens})
    sample.text = caption_p['text']
    sample.caption_len = torch.tensor(len(tokens), dtype=torch.int)

    # sample.target = caption_p['text']
    sample.answers = torch.stack([caption_p['text']])

    # generate image features
    image_path = os.path.join(self.image_dir, image_folder, image_id)
    image, image_scale = self._image_transform(image_path)

    with torch.no_grad():
        image_features = self.feature_extractor([image], [image_scale])
    image_features = image_features[0]

    sample.image_feature_0 = image_features.cpu()

    return sample
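# Sketch of the no-grad feature extraction pattern used above, with a
# stand-in torch module in place of the real feature_extractor: inside
# torch.no_grad() no autograd graph is built, so the extracted features can
# be stored on the sample without holding onto gradients.
import torch

feature_extractor = torch.nn.Linear(4, 2)  # stand-in for the real extractor
image = torch.randn(1, 4)
with torch.no_grad():
    features = feature_extractor(image)
print(features.requires_grad)  # False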
def get_item(self, idx):
    data = self.vqamb_data[idx]

    current_sample = Sample()

    # store question and image id
    current_sample.img_id = data['id']
    # current_sample.qa_id = data['qa_id']

    # store points
    current_sample.point = data['point']  # data['points']
    bbox = data['bbox']
    current_sample.gt_bbox = torch.Tensor([bbox['x'], bbox['y'],
                                           bbox['x'] + bbox['w'],
                                           bbox['y'] + bbox['h']])

    # process question
    question = data["pt_question"]
    tokens = tokenize(question, remove=["?"], keep=["'s"])

    processed = self.text_processor({"tokens": tokens})
    current_sample.text = processed["text"]

    # process answers
    processed = self.answer_processor({"answers": [data['ans']]})
    current_sample.answers = processed["answers"]
    current_sample.targets = processed["answers_scores"][1:]  # remove unknown index

    # Detectron features ----------------
    # TODO: read in detectron image instead if detectron is to be built
    detectron_path = self.detectron_folder + str(data['id'])
    point = data['point']  # point = data['points'][0]
    if 'pt' in self.detectron_folder:
        detectron_path += ',' + str(point['x']) + ',' + str(point['y'])
    detectron_path += '.pt'

    detectron_feat = torch.load(detectron_path, map_location=torch.device('cpu'))

    # pad features to a fixed length of 100 regions
    if self.config.pad_detectron:
        if detectron_feat.shape[0] > 100:
            detectron_feat = detectron_feat[:100]
        elif detectron_feat.shape[0] < 100:
            pad = torch.zeros(100 - detectron_feat.shape[0], detectron_feat.shape[1])
            detectron_feat = torch.cat([detectron_feat, pad], dim=0)

    current_sample.image_feature_0 = detectron_feat
    # ---------------------------------------------

    # read in point-conditioned bounding boxes (path hardcoded for now)
    bbox_path = ''
    bbox_path += str(data['id']) + ',' + str(point['x']) + ',' + str(point['y']) + '.pt'

    bboxes = torch.load(bbox_path, map_location=torch.device('cpu'))

    if bboxes.shape[0] > 100:
        bboxes = bboxes[:100]
    elif bboxes.shape[0] < 100:
        pad = torch.zeros(100 - bboxes.shape[0], bboxes.shape[1])
        bboxes = torch.cat([bboxes, pad], dim=0)

    current_sample.pt_bbox = bboxes

    # read in image-level bounding boxes (path hardcoded for now)
    bbox_path = ''
    bbox_path += str(data['id']) + '.pt'  # + ',' + str(point['x']) + ',' + str(point['y']) + '.pt'

    bboxes = torch.load(bbox_path, map_location=torch.device('cpu'))

    if bboxes.shape[0] > 100:
        bboxes = bboxes[:100]
    elif bboxes.shape[0] < 100:
        pad = torch.zeros(100 - bboxes.shape[0], bboxes.shape[1])
        bboxes = torch.cat([bboxes, pad], dim=0)

    current_sample.img_bbox = bboxes

    # Context features --------------------
    if self.config.use_context:
        context_path = self.context_folder + str(data['id'])
        context_path += ',' + str(point['x']) + ',' + str(point['y'])
        context_path += '.pt'

        context_feat = torch.load(context_path, map_location=torch.device('cpu'))
        context_feat = context_feat.squeeze()

        if self.config.pad_context:
            if context_feat.shape[0] > 100:
                context_feat = context_feat[:100]
            elif context_feat.shape[0] < 100:
                pad = torch.zeros(100 - context_feat.shape[0], context_feat.shape[1])
                context_feat = torch.cat([context_feat, pad], dim=0)

        current_sample.context_feature_0 = context_feat
    # ---------------------------------------------

    return current_sample
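# Sketch of the feature-file naming convention read above: point-conditioned
# features live at "<folder><id>,<x>,<y>.pt", image-level ones at
# "<folder><id>.pt". The folder and values below are made up for illustration.
detectron_folder = 'features/pt/'  # hypothetical
data = {'id': 12345, 'point': {'x': 210, 'y': 98}}

point = data['point']
path = detectron_folder + str(data['id'])
path += ',' + str(point['x']) + ',' + str(point['y'])
path += '.pt'
print(path)  # features/pt/12345,210,98.pt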
def build(self):
    annotations_file = self.args.data_file
    image_dir = self.args.image_root

    with open(annotations_file, "r") as f:
        data = json.load(f)

    all_data = []
    all_data.append({"metadata": 'youcookII'})

    counter = 0
    for video in data["database"]:
        for i in data["database"][video]["annotations"]:
            vid_seg = str(video) + "_" + str(i['id'])
            feature_path = glob.glob(os.path.join(image_dir, vid_seg + "*.npy"))
            if len(feature_path) == 0:
                continue

            feature_path = os.path.basename(feature_path[0])
            # strip the ".npy" suffix; note that rstrip(".npy") would be wrong,
            # since it strips any trailing '.', 'n', 'p', 'y' characters
            image_name = feature_path[:-len(".npy")]

            caption_str = i["sentence"]
            caption_token_list = tokenize(caption_str)
            caption_token_list.insert(0, "<s>")
            caption_token_list.append("</s>")

            # data["database"][video]["subset"] distinguishes training from
            # validation segments if separate output files are ever needed
            all_data.append({
                "reference_tokens": [caption_token_list],
                "caption_tokens": caption_token_list,
                "caption_str": caption_str,
                "caption_id": counter,
                "image_id": counter,
                "image_name": image_name,
                "feature_path": feature_path,
            })
            counter += 1

    np.save(self.args.out_file, np.array(all_data))
def test_tokenize(self):
    tokens = text_utils.tokenize(self.TOKENIZE_EXAMPLE)
    self.assertEqual(list(tokens), self.TOKENS)
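# The fixtures TOKENIZE_EXAMPLE and TOKENS live elsewhere in the test class;
# a hedged illustration of the shape such a pair might take (values invented,
# assuming tokenize() lowercases, keeps "'s" and drops "?"):
#
# TOKENIZE_EXAMPLE = "What's on the table?"
# TOKENS = ["what", "'s", "on", "the", "table"]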
def get_item(self, idx):
    data = self.vqamb_data[idx]

    current_sample = Sample()

    # store question and image id
    current_sample.img_id = data['id']
    current_sample.qa_id = data['qa_index']

    # store points
    current_sample.points = data['points']

    obj = data['all_objs'][0]
    xmin, ymin = obj['x'], obj['y']
    xmax, ymax = obj['x'] + obj['w'], obj['y'] + obj['h']
    current_sample.gt_bbox = torch.Tensor([xmin, ymin, xmax, ymax])

    # process question
    question = data["question"]
    tokens = tokenize(question, remove=["?"])
    processed = self.text_processor({"tokens": tokens})
    current_sample.text = processed["text"]

    # process answers
    processed = self.answer_processor({"answers": data['all_ans']})
    current_sample.answers = processed["answers"]
    current_sample.targets = processed["answers_scores"][1:]  # remove unknown index

    # Detectron features ----------------
    # TODO: read in detectron image instead if detectron is to be built
    detectron_path = self.detectron_folder + str(data['id'])
    bbox_path = self.bbox_folder + str(data['id'])
    if 'pt' in self.detectron_folder:
        point = data['points'][0]
        detectron_path += ',' + str(point['x']) + ',' + str(point['y'])
        bbox_path += ',' + str(point['x']) + ',' + str(point['y'])
    detectron_path += '.pt'
    bbox_path += '.pt'

    detectron_feat = torch.load(detectron_path, map_location=torch.device('cpu')).squeeze()
    # bbox_feat = torch.load(bbox_path, map_location=torch.device('cpu')).squeeze()

    # preprocessing for grid features only:
    # detectron_feat = detectron_feat.view(detectron_feat.shape[0], -1).T

    # grid features are flattened to (H*W, C) rows, so they need a larger
    # cap than the 100 regions used for detected boxes
    if self.config.grid:
        MAX_FEAT = 608
    else:
        MAX_FEAT = 100

    # pad features to a fixed length; the bbox_feat lines stay commented out
    # to match the disabled bbox_feat load above
    if self.config.pad_detectron:
        if detectron_feat.shape[0] > MAX_FEAT:
            detectron_feat = detectron_feat[:MAX_FEAT]
            # bbox_feat = bbox_feat[:MAX_FEAT]
        elif detectron_feat.shape[0] < MAX_FEAT:
            pad = torch.zeros(MAX_FEAT - detectron_feat.shape[0], detectron_feat.shape[1])
            detectron_feat = torch.cat([detectron_feat, pad], dim=0)
            # pad = torch.zeros(MAX_FEAT - bbox_feat.shape[0], bbox_feat.shape[1])
            # bbox_feat = torch.cat([bbox_feat, pad], dim=0)

    # current_sample.bbox = bbox_feat
    current_sample.image_feature_0 = detectron_feat
    # ---------------------------------------------

    return current_sample
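# Sketch of the commented-out grid preprocessing above: a (C, H, W) grid
# feature map is flattened to (H*W, C) so each spatial cell acts as one
# "region". The 19 x 32 map size is an assumption chosen to match
# MAX_FEAT = 608; the real grid shape may differ.
import torch

grid = torch.randn(2048, 19, 32)
regions = grid.view(grid.shape[0], -1).T
print(regions.shape)  # torch.Size([608, 2048])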
def get_item(self, idx):
    data = self.objpart_data[idx]

    current_sample = Sample()

    # store question and image id
    current_sample.img_id = data['id']
    # current_sample.qa_id = data['qa_id']

    current_sample.part = 1 if data['ans'] == 'part' else 0

    # store points
    current_sample.point = data['point']

    # process question
    question = data["question"]
    tokens = tokenize(question, remove=["?"])
    processed = self.text_processor({"tokens": tokens})
    current_sample.text = processed["text"]

    # process answers
    processed = self.answer_processor({"answers": [data['ans']]})
    current_sample.answers = processed["answers"]
    current_sample.targets = processed["answers_scores"][1:]  # remove unknown index

    # Detectron features ----------------
    # TODO: read in detectron image instead if detectron is to be built
    detectron_path = self.detectron_folder + str(data['id'])
    if 'pt' in self.detectron_folder:  # hacky way of detecting point supervision
        point = data['point']
        detectron_path += ',' + str(point['x']) + ',' + str(point['y'])
    detectron_path += '.pt'

    detectron_feat = torch.load(detectron_path, map_location=torch.device('cpu')).squeeze()

    # pad features to a fixed length
    MAX_FEAT = 100
    if self.config.pad_detectron:
        if detectron_feat.shape[0] > MAX_FEAT:
            detectron_feat = detectron_feat[:MAX_FEAT]
        elif detectron_feat.shape[0] < MAX_FEAT:
            pad = torch.zeros(MAX_FEAT - detectron_feat.shape[0], detectron_feat.shape[1])
            detectron_feat = torch.cat([detectron_feat, pad], dim=0)

    current_sample.image_feature_0 = detectron_feat
    # ---------------------------------------------

    return current_sample
def test_tokenize(self):
    tokens = text_utils.tokenize(self.SENTENCE)
    self.assertEqual(list(tokens), self.TOKENS)