def __init__(self, h5_path, base_path, mode, image_size=(64, 64), mask_size=0,
             normalize_images=True, max_objects=10, max_samples=None,
             include_dummies=False, include_relationships=True,
             use_orphaned_objects=True, debug=False, learned_converse=False,
             use_transitivity=False, use_converse=False,
             learned_transitivity=False, learned_symmetry=False,
             dense_scenes=False, sort_ids=None, eval_func=None):
    super(CLEVRDialogDataset, self).__init__()
    self.image_dir = os.path.join(base_path, 'images')
    self.image_size = image_size
    self.use_transitivity = use_transitivity
    self.mode = mode

    # objects
    self.vocab = {}
    self.vocab["use_object_embedding"] = False

    # predicates: spatial relations plus special tokens
    self.vocab['pred_name_to_idx'] = {
        '__in_image__': 0,
        'right': 1,
        'behind': 2,
        'front': 3,
        'left': 4,
        '__padding__': 5
    }
    self.vocab['pred_idx_to_name'] = {
        v: k for k, v in self.vocab['pred_name_to_idx'].items()
    }

    # attributes, currently ignored.
    self.vocab["attributes"] = {}
    self.vocab["attributes"]['shape'] = {
        '__image__': 0, 'cube': 1, 'sphere': 2, 'cylinder': 3
    }
    self.vocab["attributes"]["color"] = {
        '__image__': 0, 'gray': 1, 'red': 2, 'blue': 3, 'green': 4,
        'brown': 5, 'purple': 6, 'cyan': 7, 'yellow': 8
    }
    self.vocab["attributes"]["material"] = {
        '__image__': 0, 'rubber': 1, 'metal': 2
    }
    self.vocab["attributes"]["size"] = {
        '__image__': 0, 'small': 1, 'large': 2
    }
    self.vocab["reverse_attributes"] = {}
    for attr in self.vocab["attributes"].keys():
        self.vocab["reverse_attributes"][attr] = {
            v: k for k, v in self.vocab["attributes"][attr].items()
        }

    # Flat object vocabulary built from all attribute labels; index 0 is
    # reserved for the special '__image__' token.
    self.vocab['object_name_to_idx'] = {}
    ind = 0
    for attr in self.vocab["attributes"].keys():
        for attr_label in self.vocab["attributes"][attr].keys():
            if ind != 0:
                key = "{}_{}".format(attr_label, ind)
                self.vocab['object_name_to_idx'][key] = ind
            else:
                # __image__
                self.vocab['object_name_to_idx'][attr_label] = ind
            ind += 1

    self.use_orphaned_objects = use_orphaned_objects
    self.max_objects = max_objects
    self.max_samples = max_samples
    self.include_relationships = include_relationships
    self.image_paths = []

    transform = [Resize(image_size), T.ToTensor()]
    if normalize_images:
        # transform.append(imagenet_preprocess())
        transform.append(encode_image())
    self.transform = T.Compose(transform)

    # Load scene and dialog annotations, either from small pickled samples
    # (debug) or from the full CLEVR JSON files.
    if debug:
        self.clevr_data = pickle.load(open("clevr_data_sample.pkl", 'rb'))
        self.dialog_data = pickle.load(open("dialog_data_sample.pkl", 'rb'))
    else:
        self.clevr_data = json.load(
            open(
                os.path.join(
                    base_path,
                    'scenes/CLEVR_{mode}_scenes.json'.format(mode=mode)),
                'r'))
        self.dialog_data = json.load(
            open(os.path.join(base_path, h5_path), 'r'))

    if dense_scenes:
        self.keep_dense_scenes()
    if sort_ids:
        # Replace scenes
        self.keep_scenes_per_id(sort_ids, eval_func)
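# Hypothetical usage sketch (not part of the original source): the paths, file
# names, and mode below are placeholder assumptions, shown only to illustrate
# the CLEVRDialogDataset constructor defined above.
#
#   dataset = CLEVRDialogDataset(h5_path='dialogs/CLEVR_train_dialogs.json',
#                                base_path='/data/clevr',
#                                mode='train',
#                                image_size=(64, 64),
#                                debug=False)
#   print(len(dataset.clevr_data['scenes']))  # assumes the standard CLEVR scenes JSON layout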
def __init__(self, h5_path, base_path, mode, image_size=(256, 256), mask_size=0,
             normalize_images=True, min_objects=0, max_objects=1000,
             max_samples=None, include_relationships=True,
             use_orphaned_objects=True, use_transitivity=False,
             learned_transitivity=False, include_dummies=True,
             learned_symmetry=False, use_converse=False, learned_converse=False):
    super(PackedVGSceneGraphDataset, self).__init__()
    self.include_dummies = include_dummies
    self.learned_transitivity = learned_transitivity
    self.image_dir = os.path.join(base_path, "images")
    self.image_size = image_size
    self.mask_size = mask_size
    self.vocab = json.load(open(os.path.join(base_path, 'vocab.json')))
    self.num_objects = len(self.vocab['object_idx_to_name'])
    self.use_orphaned_objects = use_orphaned_objects
    self.max_objects = max_objects
    self.max_samples = max_samples
    self.include_relationships = include_relationships
    self.min_objects = min_objects
    self.learned_symmetry = learned_symmetry
    self.learned_converse = learned_converse

    transform = [Resize(image_size), T.ToTensor()]
    if normalize_images:
        transform.append(encode_image())
    self.transform = T.Compose(transform)

    if use_transitivity or use_converse:
        raise NotImplementedError()

    # Load pretrained data from the packed HDF5 file.
    self.data = {}
    with h5py.File(os.path.join(base_path, h5_path), 'r') as f:
        for k, v in f.items():
            if k == 'image_paths':
                self.image_paths = list(v)
            else:
                self.data[k] = torch.IntTensor(np.asarray(v))

    # Optionally keep only images that contain at least min_objects objects.
    if self.min_objects > 0:
        col_len = len(self.data["objects_per_image"])
        objects_mask = (
            self.data['objects_per_image'] >= self.min_objects).nonzero()[:, 0]
        cols = [
            col for col in self.data.keys()
            if len(self.data[col]) == col_len
        ]
        for col in cols:
            self.data[col] = self.data[col][objects_mask]
        self.image_paths = np.array(self.image_paths)[objects_mask]

    self.vocab["attributes"] = {}
    self.vocab["attributes"]['objects'] = self.vocab['object_name_to_idx']
    self.vocab["reverse_attributes"] = {}
    for attr in self.vocab["attributes"].keys():
        self.vocab["reverse_attributes"][attr] = {
            v: k for k, v in self.vocab["attributes"][attr].items()
        }
    self.register_augmented_relations()
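# Hypothetical usage sketch (not part of the original source): file names and
# paths are placeholder assumptions illustrating the PackedVGSceneGraphDataset
# constructor above; base_path is expected to contain vocab.json, images/, and
# the packed HDF5 file named by h5_path.
#
#   dataset = PackedVGSceneGraphDataset(h5_path='train.h5',
#                                       base_path='/data/visual_genome',
#                                       mode='train',
#                                       min_objects=3)
#   print(dataset.num_objects)  # size of the object vocabulary from vocab.json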
def __init__(self, base_path, mode, image_size=(64, 64), mask_size=0,
             normalize_images=True, min_objects=10, max_objects=10,
             max_samples=None, include_relationships=True,
             use_orphaned_objects=True, debug=False, learned_transitivity=False,
             include_dummies=True, use_transitivity=False,
             use_all_relations=False, use_converse=False,
             learned_symmetry=False, learned_converse=False):
    super(PackedGenCLEVRDataset, self).__init__()
    self.image_dir = os.path.join(base_path, 'images')
    self.image_size = image_size
    self.mask_size = mask_size
    self.learned_transitivity = learned_transitivity
    self.learned_symmetry = learned_symmetry
    self.learned_converse = learned_converse
    self.include_dummies = include_dummies
    self.use_transitivity = use_transitivity
    self.use_all_relations = use_all_relations
    self.use_converse = use_converse
    self.use_orphaned_objects = use_orphaned_objects
    self.max_objects = max_objects
    self.min_objects = min_objects
    self.max_samples = max_samples
    self.include_relationships = include_relationships
    self.mode = mode

    # objects
    self.vocab = {}
    self.vocab["use_object_embedding"] = False

    # predicates
    self.register_augmented_relations()

    # attributes, currently ignored.
    self.vocab["attributes"] = {}
    self.vocab["attributes"]['shape'] = {
        '__image__': 0, 'cube': 1, 'sphere': 2, 'cylinder': 3
    }
    self.vocab["attributes"]["color"] = {
        '__image__': 0, 'gray': 1, 'red': 2, 'blue': 3, 'green': 4,
        'brown': 5, 'purple': 6, 'cyan': 7, 'yellow': 8
    }
    self.vocab["attributes"]["material"] = {
        '__image__': 0, 'rubber': 1, 'metal': 2
    }
    self.vocab["attributes"]["size"] = {
        '__image__': 0, 'small': 1, 'large': 2
    }
    self.vocab["reverse_attributes"] = {}
    for attr in self.vocab["attributes"].keys():
        self.vocab["reverse_attributes"][attr] = {
            v: k for k, v in self.vocab["attributes"][attr].items()
        }

    # Flat object vocabulary built from all attribute labels; index 0 is
    # reserved for the special '__image__' token.
    self.vocab['object_name_to_idx'] = {}
    ind = 0
    for attr in self.vocab["attributes"].keys():
        for attr_label in self.vocab["attributes"][attr].keys():
            if ind != 0:
                key = "{}_{}".format(attr_label, ind)
                self.vocab['object_name_to_idx'][key] = ind
            else:
                # __image__
                self.vocab['object_name_to_idx'][attr_label] = ind
            ind += 1
    self.vocab['object_idx_to_name'] = {}
    for k, v in self.vocab['object_name_to_idx'].items():
        self.vocab['object_idx_to_name'][v] = k

    self.image_paths = []
    transform = [Resize(image_size), T.ToTensor()]
    if normalize_images:
        # transform.append(imagenet_preprocess())
        transform.append(encode_image())
    self.transform = T.Compose(transform)

    # Load data
    if debug:
        self.data = self.create_packed_sgs()
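# Hypothetical usage sketch (not part of the original source): arguments are
# placeholder assumptions illustrating the PackedGenCLEVRDataset constructor
# above. With debug=True the packed scene graphs are built via
# create_packed_sgs(); the non-debug loading path is not shown above.
#
#   dataset = PackedGenCLEVRDataset(base_path='/data/clevr',
#                                   mode='train',
#                                   debug=True)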