def create_matching_lexicon(self, dir_path, file_name):
    """
    Creates the lexicons for the questions.
    :param dir_path: The path in which to save the created lexicon.
    :param file_name: The name of the lexicon file.
    """
    # There are more examples in the lexicon dataset than in the logical-forms dataset.
    # This function creates a one-to-one mapping between them and stores the lexicon dict in a file.
    self.logger.info('Creating lexicon...')
    dataset_qdmr_lexicon = self.load_dataset(dir_path, 'QDMR-lexicon', self.logger)
    lexicon_lists = {'train': [], 'validation': [], 'test': []}
    for data_split in self.dataset_logical:
        lex_idx = 0
        lexicon_split = dataset_qdmr_lexicon[data_split]
        for example in self.dataset_logical[data_split]:
            question = example['question_text']
            lexicon_found = False
            for j in range(lex_idx, len(lexicon_split)):
                lexicon_example = lexicon_split[j]
                if lexicon_example['source'] == question:
                    str_lex = lexicon_example['allowed_tokens']
                    lexicon_lists[data_split].append(str_lex)
                    lex_idx = j + 1
                    lexicon_found = True
                    break
            if not lexicon_found:
                # No matching lexicon entry was found for this question in the lexicon file.
                raise ValueError(f'No matching lexicon entry found for question: {question}')
    save_obj(dir_path, lexicon_lists, file_name)
    self.logger.info('Done creating lexicon.')
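# The dataset-side save_obj / load_obj helpers used throughout this module are not
# shown in this section. A minimal sketch of what they might look like, assuming
# they are plain pickle wrappers keyed by directory and file name (the names and
# implementation below are hypothetical):
import pickle
from pathlib import Path


def save_obj_sketch(dir_path, obj, file_name):
    # Create the target directory if needed and pickle the object into it.
    dir_path = Path(dir_path)
    dir_path.mkdir(parents=True, exist_ok=True)
    with open(dir_path / file_name, 'wb') as f:
        pickle.dump(obj, f)


def load_obj_sketch(dir_path, file_name):
    # Load a previously pickled object from the given directory.
    with open(Path(dir_path) / file_name, 'rb') as f:
        return pickle.load(f)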
def save_obj(self, filename, vertices, textures):
    '''
    Saves a mesh as an .obj file.
    vertices: [nv, 3], tensor
    textures: [3, h, w], tensor
    '''
    util.save_obj(filename, vertices, self.faces[0], textures=textures,
                  uvcoords=self.raw_uvcoords[0], uvfaces=self.uvfaces[0])
def load_domain_split_dataset(self, data_dir, logger=None):
    """
    Loads the Break dataset with a domain split:
    train on the text domain, validation + test on the DB + image domains.
    :param data_dir: The path of the directory where the preprocessed dataset
                     should be saved to or loaded from.
    :param logger: A logger for logging events.
    :return: The loaded dataset.
    """
    current_dir = Path()
    dir_path = current_dir / "data" / "break_data" / "preprocessed"
    file_name = "dataset_preprocessed_domain_split.pkl"
    if not (dir_path / file_name).is_file():
        if logger:
            logger.info('Creating domain split dataset...')
        text_domain_dataset_prefixes = ('COMQA', 'CWQ', 'DROP', 'HOTP')
        image_domain_dataset_prefixes = ('CLEVR', 'NLVR2')
        DB_domain_dataset_prefixes = ('ACADEMIC', 'ATIS', 'GEO', 'SPIDER')
        image_plus_DB = image_domain_dataset_prefixes + DB_domain_dataset_prefixes

        # Collect the matching examples per split, then build the DataFrames once
        # (DataFrame.append was removed in pandas 2.0).
        train_rows, validation_rows, test_rows = [], [], []
        for example in self.dataset_logical['train']:
            if example['question_id'].startswith(text_domain_dataset_prefixes):
                train_rows.append(example)
        for example in self.dataset_logical['validation']:
            if example['question_id'].startswith(image_plus_DB):
                validation_rows.append(example)
        for example in self.dataset_logical['test']:
            if example['question_id'].startswith(image_plus_DB):
                test_rows.append(example)

        # TODO delete this? Equivalent filter-based variant:
        # train_dataset = self.dataset_logical['train'].filter(
        #     lambda example: example['question_id'].startswith(text_domain_dataset_prefixes))
        # validation_dataset = self.dataset_logical['validation'].filter(
        #     lambda example: example['question_id'].startswith(image_plus_DB))
        # test_dataset = self.dataset_logical['test'].filter(
        #     lambda example: example['question_id'].startswith(image_plus_DB))

        to_save = {
            'train': Dataset.from_pandas(pd.DataFrame(train_rows)),
            'validation': Dataset.from_pandas(pd.DataFrame(validation_rows)),
            'test': Dataset.from_pandas(pd.DataFrame(test_rows))
        }
        save_obj(dir_path, to_save, file_name)
    dataset = load_obj(dir_path, file_name)
    return dataset
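# A sketch of the filter-based alternative hinted at in the commented-out block above,
# assuming each split in dataset_logical is a Hugging Face Dataset; the helper name
# below is hypothetical:
def domain_split_with_filter(dataset_logical):
    text_prefixes = ('COMQA', 'CWQ', 'DROP', 'HOTP')
    image_plus_db = ('CLEVR', 'NLVR2', 'ACADEMIC', 'ATIS', 'GEO', 'SPIDER')
    return {
        'train': dataset_logical['train'].filter(
            lambda ex: ex['question_id'].startswith(text_prefixes)),
        'validation': dataset_logical['validation'].filter(
            lambda ex: ex['question_id'].startswith(image_plus_db)),
        'test': dataset_logical['test'].filter(
            lambda ex: ex['question_id'].startswith(image_plus_db)),
    }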
def create_matching_programs(self, dir_path, file_name):
    """
    Creates the matching programs for all the question QDMRs.
    :param dir_path: The directory in which to save the created programs.
    :param file_name: The name of the file with the programs.
    """
    self.logger.info('Creating programs...')
    programs = []
    for gold in self.dataset_logical[self.dataset_split]["decomposition"]:
        builder = QDMRProgramBuilder(gold)
        builder.build()
        programs.append(str(builder))
    save_obj(dir_path, programs, file_name)
    self.logger.info('Done creating programs.')
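# A minimal sketch of how a single gold decomposition becomes a program string with
# QDMRProgramBuilder, exactly as used in the loop above; the example decomposition
# in the comment is an illustrative placeholder, not taken from the dataset:
def decomposition_to_program(gold_decomposition):
    builder = QDMRProgramBuilder(gold_decomposition)
    builder.build()
    # The builder's string form is the textual program for the QDMR decomposition.
    return str(builder)


# Example (hypothetical decomposition text):
# decomposition_to_program('return touchdowns ;return number of #1')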
def load_length_split_dataset(self, data_dir, logger=None):
    """
    Loads the Break dataset with a length split based on the number of operators:
    train on decompositions with at most 4 steps, validation + test on longer ones.
    :param data_dir: The path of the directory where the preprocessed dataset
                     should be saved to or loaded from.
    :param logger: A logger for logging events.
    :return: The loaded dataset.
    """
    # TODO: data_dir is currently unused; is it required in the signature?
    current_dir = Path()
    dir_path = current_dir / "data" / "break_data" / "preprocessed"
    file_name = "dataset_preprocessed_length_split.pkl"
    if not (dir_path / file_name).is_file():
        if logger:
            logger.info('Creating length split dataset...')
        threshold_amount_ops = 4

        # Collect the matching examples per split, then build the DataFrames once
        # (DataFrame.append was removed in pandas 2.0).
        train_rows, validation_rows, test_rows = [], [], []
        for example in self.dataset_logical['train']:
            if example['operators'].count(',') < threshold_amount_ops:
                train_rows.append(example)
        for example in self.dataset_logical['validation']:
            if example['operators'].count(',') >= threshold_amount_ops:
                validation_rows.append(example)
        for example in self.dataset_logical['test']:
            if example['operators'].count(',') >= threshold_amount_ops:
                test_rows.append(example)

        to_save = {
            'train': Dataset.from_pandas(pd.DataFrame(train_rows)),
            'validation': Dataset.from_pandas(pd.DataFrame(validation_rows)),
            'test': Dataset.from_pandas(pd.DataFrame(test_rows))
        }
        save_obj(dir_path, to_save, file_name)
    dataset = load_obj(dir_path, file_name)
    return dataset
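# The length split keys off the 'operators' field, a string in which operator names
# are separated by commas: an example with k operators contains k - 1 commas, so
# count(',') < 4 keeps decompositions with at most 4 steps. A small illustration with
# made-up operator strings:
ops_short = 'select,filter,project,aggregate'               # 4 steps, 3 commas
ops_long = 'select,filter,project,aggregate,comparative'    # 5 steps, 4 commas
assert ops_short.count(',') < 4    # would land in the train split
assert ops_long.count(',') >= 4    # would land in validation / test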
def build_vocab(specials, dir_path, file_name, sents, tokenizer):
    """
    Builds a vocabulary from the questions of the split.
    The created vocabulary is saved to a file for future usage.
    :param specials: Special tokens to include in the vocabulary.
    :param dir_path: The directory in which to save the vocabulary counter.
    :param file_name: The name of the vocabulary file.
    :param sents: The sentences (questions) to build the vocabulary from.
    :param tokenizer: The tokenizer used to split sentences into tokens.
    :return: The Vocab object.
    """
    if not (dir_path / file_name).is_file():
        # Loading the spaCy model here ensures 'en_core_web_sm' is available;
        # the returned model object itself is not used directly.
        spacy.load('en_core_web_sm')
        # Build the vocabulary from the questions; note that the questions
        # currently come from a single split only.
        counter = Counter()
        for sent in sents:
            counter.update(tokenizer(sent))
        # Save the counter and the specials.
        to_save = {'counter': counter, 'specials': specials}
        save_obj(dir_path, to_save, file_name)
    return load_vocab(dir_path, file_name)
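# A usage sketch for build_vocab. The specials, directory, file name, and example
# questions below are illustrative assumptions; the tokenizer is a simple
# spaCy-backed callable:
import spacy
from pathlib import Path

nlp = spacy.load('en_core_web_sm')


def tokenize(text):
    return [tok.text for tok in nlp(text)]


specials = ['<unk>', '<pad>', '<sos>', '<eos>']
vocab_dir = Path('data') / 'break_data' / 'vocabs'
questions = [
    'How many touchdowns were scored overall?',
    'What color is the large sphere?',
]  # illustrative stand-ins for example['question_text'] values
vocab = build_vocab(specials, vocab_dir, 'vocab_example.pkl', questions, tokenize)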
def load_dataset(data_dir, dataset_split, logger=None):
    """
    Loads the requested Break dataset from Hugging Face.
    :param data_dir: The path of the directory where the preprocessed dataset
                     should be saved to or loaded from.
    :param dataset_split: The Break configuration to download from HF (e.g. 'QDMR-lexicon').
    :param logger: A logger for logging events.
    :return: The loaded dataset.
    """
    current_dir = Path()
    dir_path = current_dir / "data" / "break_data" / "preprocessed"
    file_name = "dataset_preprocessed_" + dataset_split + ".pkl"
    if not (dir_path / file_name).is_file():
        # Download and preprocess the BREAK dataset (logical form and lexicon),
        # and save the preprocessed data.
        if logger:
            logger.info('Downloading and preparing datasets...')
        # Call the Hugging Face loader under an alias so it is not shadowed by this
        # function's own name: from datasets import load_dataset as hf_load_dataset
        dataset_logical = hf_load_dataset('break_data', dataset_split, cache_dir=data_dir)
        save_obj(dir_path, dataset_logical, file_name)
    # Load the saved preprocessed data.
    dataset = load_obj(dir_path, file_name)
    return dataset
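# A usage sketch for the loader above. 'QDMR-lexicon' is the configuration name the
# lexicon-matching code passes in, 'data/break_data' is an assumed cache location, and
# the fields 'source' and 'allowed_tokens' are the ones read by create_matching_lexicon.
lexicon_dataset = load_dataset('data/break_data', 'QDMR-lexicon')
first = lexicon_dataset['train'][0]
print(first['source'])          # the original question text
print(first['allowed_tokens'])  # the allowed-tokens lexicon for that question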