def get_lexicon(self):
    """
    Prepares the lexicon: creates it if needed and loads it.

    :return: The loaded lexicon.
    """
    self.logger.info("Preparing lexicon...")
    current_dir = Path()
    dir_path = current_dir / "data" / "break_data" / "lexicon_by_logical"
    file_name = "lexicon"
    if self.domain_split:
        file_name += "_domain_split"
    elif self.length_split:
        file_name += "_length_split"
    file_name += ".pkl"

    # Create the lexicon file on first use, then load it from disk.
    if not (dir_path / file_name).is_file():
        self.create_matching_lexicon(dir_path, file_name)
    data = load_obj(dir_path, file_name)

    # TODO delete this?
    # for type in data:
    #     for ex in data[type]:
    #         data[type][ex] = ast.literal_eval(data[type][ex])

    self.logger.info("Lexicon ready.")
    return data
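# The loaders in this file rely on `save_obj` / `load_obj` helpers that are not
# shown in this section. Below is a minimal sketch of what they are assumed to do
# (pickle an object under dir_path / file_name); the real helpers may differ, so
# the sketch uses distinct names to avoid clashing with them.
import pickle
from pathlib import Path


def _save_obj_sketch(dir_path, obj, file_name):
    """Pickle `obj` to dir_path/file_name, creating the directory if needed."""
    dir_path = Path(dir_path)
    dir_path.mkdir(parents=True, exist_ok=True)
    with open(dir_path / file_name, 'wb') as f:
        pickle.dump(obj, f)


def _load_obj_sketch(dir_path, file_name):
    """Load a pickled object from dir_path/file_name."""
    with open(Path(dir_path) / file_name, 'rb') as f:
        return pickle.load(f)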
def load_vocab(dir_path, file_name):
    """
    Loads a vocabulary from a file.

    :param dir_path: The path of the directory.
    :param file_name: The name of the vocab file.
    :return: The loaded Vocab.
    """
    properties = load_obj(dir_path, file_name)
    vocab = Vocab(properties['counter'], specials=properties['specials'])
    return vocab
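# Hypothetical companion to load_vocab above: a sketch of how the pickled
# properties file could be produced. `build_and_save_vocab` is illustrative and
# not part of the original code; it assumes the torchtext-style
# Vocab(counter, specials=...) constructor used by load_vocab.
from collections import Counter


def build_and_save_vocab(token_lists, dir_path, file_name,
                         specials=('<unk>', '<pad>', '<sos>', '<eos>')):
    """Count tokens and save the counter plus specials in the format load_vocab expects."""
    counter = Counter(token for tokens in token_lists for token in tokens)
    properties = {'counter': counter, 'specials': list(specials)}
    save_obj(dir_path, properties, file_name)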
def load_domain_split_dataset(self, data_dir, logger=None):
    """
    Loads the Break dataset with a domain split:
    train on the text domain, validate and test on the DB and image domains.

    :param data_dir: The path of the directory where the preprocessed dataset
                     should be saved to or loaded from.
    :param logger: A logger for logging events.
    :return: The loaded dataset.
    """
    current_dir = Path()
    dir_path = current_dir / "data" / "break_data" / "preprocessed"
    file_name = "dataset_preprocessed_domain_split.pkl"
    if not (dir_path / file_name).is_file():
        if logger:
            logger.info('Creating domain split dataset...')
        text_domain_dataset_prefixes = ('COMQA', 'CWQ', 'DROP', 'HOTP')
        image_domain_dataset_prefixes = ('CLEVR', 'NLVR2')
        DB_domain_dataset_prefixes = ('ACADEMIC', 'ATIS', 'GEO', 'SPIDER')
        image_plus_DB = image_domain_dataset_prefixes + DB_domain_dataset_prefixes

        # Route examples to splits by the source dataset encoded in question_id.
        train_filtered = pd.DataFrame()
        validation_filtered = pd.DataFrame()
        test_filtered = pd.DataFrame()
        for example in self.dataset_logical['train']:
            if example['question_id'].startswith(text_domain_dataset_prefixes):
                train_filtered = train_filtered.append(example, ignore_index=True)
        for example in self.dataset_logical['validation']:
            if example['question_id'].startswith(image_plus_DB):
                validation_filtered = validation_filtered.append(example, ignore_index=True)
        for example in self.dataset_logical['test']:
            if example['question_id'].startswith(image_plus_DB):
                test_filtered = test_filtered.append(example, ignore_index=True)

        # TODO delete this?
        # train_dataset = self.dataset_logical['train'].filter(
        #     lambda example: example['question_id'].startswith(text_domain_dataset_prefixes))
        # validation_dataset = self.dataset_logical['validation'].filter(
        #     lambda example: example['question_id'].startswith(image_plus_DB))
        # test_dataset = self.dataset_logical['test'].filter(
        #     lambda example: example['question_id'].startswith(image_plus_DB))
        # train_filtered_ds = Dataset.from_pandas(train_filtered)

        to_save = {
            'train': Dataset.from_pandas(train_filtered),
            'validation': Dataset.from_pandas(validation_filtered),
            'test': Dataset.from_pandas(test_filtered)
        }
        save_obj(dir_path, to_save, file_name)
    dataset = load_obj(dir_path, file_name)
    return dataset
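# A hedged alternative to the pandas append loop above: the same prefix routing
# can be done directly with datasets.Dataset.filter, as in the commented-out
# block. `filter_split_by_prefix` is illustrative and not part of the original code.
def filter_split_by_prefix(split_dataset, prefixes):
    """Keep only examples whose question_id starts with one of `prefixes`."""
    return split_dataset.filter(
        lambda example: example['question_id'].startswith(tuple(prefixes)))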
def get_programs(self):
    """
    Loads the programs from a file.

    :return: The loaded programs.
    """
    self.logger.info("Preparing programs...")
    current_dir = Path()
    dir_path = current_dir / "data" / "break_data" / "programs"
    file_name = "programs_" + self.dataset_split + ".pkl"
    if not (dir_path / file_name).is_file():
        self.create_matching_programs(dir_path, file_name)
    data = load_obj(dir_path, file_name)
    self.logger.info("Programs ready.")
    return data
def load_length_split_dataset(self, data_dir, logger=None):
    """
    Loads the Break dataset with a length split based on the number of operators:
    train on examples with at most 4 steps, validate and test on examples with
    5 or more steps.

    :param data_dir: The path of the directory where the preprocessed dataset
                     should be saved to or loaded from.
    :param logger: A logger for logging events.
    :return: The loaded dataset.
    """
    # TODO: is data_dir required in the signature?
    current_dir = Path()
    dir_path = current_dir / "data" / "break_data" / "preprocessed"
    file_name = "dataset_preprocessed_length_split.pkl"
    if not (dir_path / file_name).is_file():
        if logger:
            logger.info('Creating length split dataset...')
        threshold_amount_ops = 4

        # 'operators' is a comma-separated string, so its comma count equals
        # (number of steps - 1).
        train_filtered = pd.DataFrame()
        validation_filtered = pd.DataFrame()
        test_filtered = pd.DataFrame()
        for example in self.dataset_logical['train']:
            if example['operators'].count(',') < threshold_amount_ops:
                train_filtered = train_filtered.append(example, ignore_index=True)
        for example in self.dataset_logical['validation']:
            if example['operators'].count(',') >= threshold_amount_ops:
                validation_filtered = validation_filtered.append(example, ignore_index=True)
        for example in self.dataset_logical['test']:
            if example['operators'].count(',') >= threshold_amount_ops:
                test_filtered = test_filtered.append(example, ignore_index=True)

        to_save = {
            'train': Dataset.from_pandas(train_filtered),
            'validation': Dataset.from_pandas(validation_filtered),
            'test': Dataset.from_pandas(test_filtered)
        }
        save_obj(dir_path, to_save, file_name)
    dataset = load_obj(dir_path, file_name)
    return dataset
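# Worked example of the step-count heuristic used above: comma count + 1 equals
# the number of decomposition steps. `num_steps` is illustrative, not part of
# the original code.
def num_steps(operators_str):
    """Number of steps implied by a comma-separated operators string."""
    return operators_str.count(',') + 1


assert num_steps("select,filter,project,aggregate") == 4               # 3 commas -> train (< 4)
assert num_steps("select,filter,project,comparative,aggregate") == 5   # 4 commas -> validation/test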
def load_dataset(data_dir, dataset_split, logger=None):
    """
    Loads the requested Break dataset from Hugging Face.

    :param data_dir: The path of the directory where the preprocessed dataset
                     should be saved to or loaded from.
    :param dataset_split: The Break configuration to download from Hugging Face.
    :param logger: A logger for logging events.
    :return: The loaded dataset.
    """
    current_dir = Path()
    dir_path = current_dir / "data" / "break_data" / "preprocessed"
    file_name = "dataset_preprocessed_" + dataset_split + ".pkl"
    if not (dir_path / file_name).is_file():
        # Download and preprocess the Break dataset (logical form and lexicon),
        # and save the preprocessed data.
        if logger:
            logger.info('Downloading and preparing datasets...')
        # The call below is intended to be Hugging Face's `datasets.load_dataset`;
        # if this function lives at module level under the same name, the import
        # should be aliased to avoid shadowing it.
        dataset_logical = load_dataset('break_data', dataset_split, cache_dir=data_dir)
        save_obj(dir_path, dataset_logical, file_name)
    # Load the saved preprocessed data.
    dataset = load_obj(dir_path, file_name)
    return dataset
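# Hypothetical usage sketch for load_dataset above. 'logical-forms' is one of the
# Hugging Face `break_data` configurations; the cache directory is an assumption,
# and if load_dataset is actually defined inside a class it would be called via
# that class instead.
if __name__ == '__main__':
    break_dataset = load_dataset(data_dir='data/hf_cache', dataset_split='logical-forms')
    print(break_dataset['train'][0])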
    saver_pose_seg_depth.restore(sess, restore_dir)
    print('Restore pose_seg_depth succeeded')
else:
    print('Restore pose_seg_depth failed')
    raise SystemExit

restore_dir = tf.train.latest_checkpoint(params.normal_dir)
if restore_dir:
    saver_normal.restore(sess, restore_dir)
    print('Restore normal succeeded')
else:
    print('Restore normal failed')
    raise SystemExit

# Initialize the data list and preallocate batch buffers.
meanstd = util.load_obj(params.meanRgb_dir)
dataset = glob.glob(params.test_dir + '/*.jpg')
img_batch = np.zeros((params.batch_size, 256, 256, 3), dtype=np.float32)
mask_batch = np.zeros((params.batch_size, 256, 256), dtype=bool)  # np.bool is deprecated; plain bool is equivalent
segmap = np.zeros((params.batch_size, 256, 256), dtype=np.int32)
testimg = np.zeros((256, 256, 3), dtype=np.float32)

images = os.listdir(params.test_dir)
for each in images:
    i = 0
    imgname = os.path.join(params.test_dir, each)
    try:
        im = Image.open(imgname)
        # do stuff
    except IOError: