def create_matching_lexicon(self, dir_path, file_name):
        """
        Creates the lexicons for the questions.
        :param dir_path: The path in which to save the created lexicon.
        :param file_name: The name of the file to save the lexicon to.
        """
        # There are more examples in the lexicon dataset than in the logical-forms dataset.
        # This function creates a one-to-one mapping between them and stores the lexicon dict in a file.
        self.logger.info('Creating lexicon...')
        dataset_qdmr_lexicon = self.load_dataset(dir_path, 'QDMR-lexicon',
                                                 self.logger)

        lexicon_lists = {'train': [], 'validation': [], 'test': []}
        for data_split in self.dataset_logical:
            lex_idx = 0
            lexicon_split = dataset_qdmr_lexicon[data_split]
            for i, example in enumerate(self.dataset_logical[data_split]):
                question = example['question_text']
                lexicon_found = False
                for j in range(lex_idx, len(lexicon_split)):
                    lexicon_example = lexicon_split[j]
                    if lexicon_example['source'] == question:
                        str_lex = lexicon_example['allowed_tokens']
                        lexicon_lists[data_split].append(str_lex)
                        lex_idx = j + 1
                        lexicon_found = True
                        break
                # If we get here, no matching lexicon was found in the lexicon file.
                if not lexicon_found:
                    raise ValueError(f'No matching lexicon entry found for question: {question}')
        save_obj(dir_path, lexicon_lists, file_name)
        self.logger.info('Done creating lexicon.')
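A brief usage sketch (an assumption, not code from the original repository; preprocessor stands for an instance of this class): once create_matching_lexicon has run, the saved lists are parallel to the logical-form examples of each split and can be loaded back with load_obj.

lexicon_lists = load_obj(dir_path, file_name)
# The matching pass above guarantees exactly one allowed-tokens string per logical-form example.
assert len(lexicon_lists['train']) == len(preprocessor.dataset_logical['train'])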
    def save_obj(self, filename, vertices, textures):
        """
        Exports a textured mesh to a Wavefront .obj file via util.save_obj
        (unrelated to the pickle-based save_obj helper used elsewhere in these snippets).
        :param vertices: [nv, 3] tensor.
        :param textures: [3, h, w] tensor.
        """
        util.save_obj(filename,
                      vertices,
                      self.faces[0],
                      textures=textures,
                      uvcoords=self.raw_uvcoords[0],
                      uvfaces=self.uvfaces[0])
    def load_domain_split_dataset(self, data_dir, logger=None):
        """
        Loads the Break dataset with a domain split: train on text-domain examples,
        validation and test on database- and image-domain examples.
        :param data_dir:    The path of the directory where the preprocessed dataset should be saved to or loaded from.
        :param logger:      A logger for logging events.
        :return:            The loaded dataset.
        """
        current_dir = Path()
        dir_path = current_dir / "data" / "break_data" / "preprocessed"
        file_name = "dataset_preprocessed_domain_split.pkl"
        if not (dir_path / file_name).is_file():
            if logger:
                logger.info('Creating domain split dataset...')
            text_domain_dataset_prefixes = ('COMQA', 'CWQ', 'DROP', 'HOTP')
            image_domain_dataset_prefixes = ('CLEVR', 'NLVR2')
            DB_domain_dataset_prefixes = ('ACADEMIC', 'ATIS', 'GEO', 'SPIDER')
            image_plus_DB = image_domain_dataset_prefixes + DB_domain_dataset_prefixes
            train_rows, validation_rows, test_rows = [], [], []

            # Route each example by the domain encoded in its question_id prefix.
            for example in self.dataset_logical['train']:
                if example['question_id'].startswith(
                        text_domain_dataset_prefixes):
                    train_rows.append(example)
            for example in self.dataset_logical['validation']:
                if example['question_id'].startswith(image_plus_DB):
                    validation_rows.append(example)
            for example in self.dataset_logical['test']:
                if example['question_id'].startswith(image_plus_DB):
                    test_rows.append(example)

            # Collect rows in plain lists and build each DataFrame once.
            train_filtered = pd.DataFrame(train_rows)
            validation_filtered = pd.DataFrame(validation_rows)
            test_filtered = pd.DataFrame(test_rows)

            # TODO delete this?
            # train_dataset = self.dataset_logical['train'].filter(
            #     lambda example: example['question_id'].startswith(text_domain_dataset_prefixes))
            # validation_dataset = self.dataset_logical['validation'].filter(
            #     lambda example: example['question_id'].startswith(image_plus_DB))
            # test_dataset = self.dataset_logical['test'].filter(
            #     lambda example: example['question_id'].startswith(image_plus_DB))
            # train_filtererd_ds = Dataset.from_pandas(train_filtererd)
            to_save = {
                'train': Dataset.from_pandas(train_filtered),
                'validation': Dataset.from_pandas(validation_filtered),
                'test': Dataset.from_pandas(test_filtered)
            }
            save_obj(dir_path, to_save, file_name)

        dataset = load_obj(dir_path, file_name)
        return dataset
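A small, self-contained illustration (not from the original code) of the prefix routing used above: str.startswith accepts a tuple, so a single call checks every dataset prefix of a domain at once.

text_prefixes = ('COMQA', 'CWQ', 'DROP', 'HOTP')
image_plus_db = ('CLEVR', 'NLVR2', 'ACADEMIC', 'ATIS', 'GEO', 'SPIDER')

assert 'DROP_train_1234'.startswith(text_prefixes)    # kept in the train split
assert 'CLEVR_dev_42'.startswith(image_plus_db)       # kept in validation/test
assert not 'CLEVR_dev_42'.startswith(text_prefixes)   # excluded from train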
    def create_matching_programs(self, dir_path, file_name):
        """
        Creates the matching programs for all the question QDMRs.
        :param dir_path: The directory in which to save the created programs.
        :param file_name: The name of the file with the programs.
        """
        self.logger.info('Creating programs...')
        programs = []

        for gold in self.dataset_logical[self.dataset_split]["decomposition"]:
            builder = QDMRProgramBuilder(gold)
            builder.build()
            programs.append(str(builder))

        save_obj(dir_path, programs, file_name)
        self.logger.info('Done creating programs.')
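A usage sketch, assuming a preprocessor instance of this class; the directory and file name are illustrative only, and the saved file holds one program string per gold decomposition of the configured split.

programs_dir = Path() / "data" / "break_data" / "programs"
preprocessor.create_matching_programs(programs_dir, "programs.pkl")
programs = load_obj(programs_dir, "programs.pkl")
print(programs[0])  # the operator-sequence program built from the first gold decomposition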
    def load_length_split_dataset(self, data_dir, logger=None):
        """
        Loads the Break dataset with a length split based on the number of operators:
        train on examples with at most 4 steps, validation and test on examples with more than 4 steps.
        :param data_dir: The path of the directory where the preprocessed dataset should be saved to or loaded from.
        :param logger: A logger for logging events.
        :return: The loaded dataset.
        """
        # TODO datadir required in signature?
        current_dir = Path()
        dir_path = current_dir / "data" / "break_data" / "preprocessed"
        file_name = "dataset_preprocessed_length_split.pkl"

        if not (dir_path / file_name).is_file():
            if logger:
                logger.info('Creating length split dataset...')
            threshold_amount_ops = 4

            train_rows, validation_rows, test_rows = [], [], []

            # The 'operators' field is a comma-separated string, so its comma count is
            # the number of steps minus one.
            for example in self.dataset_logical['train']:
                if example['operators'].count(',') < threshold_amount_ops:
                    train_rows.append(example)
            for example in self.dataset_logical['validation']:
                if example['operators'].count(',') >= threshold_amount_ops:
                    validation_rows.append(example)
            for example in self.dataset_logical['test']:
                if example['operators'].count(',') >= threshold_amount_ops:
                    test_rows.append(example)

            train_filtered = pd.DataFrame(train_rows)
            validation_filtered = pd.DataFrame(validation_rows)
            test_filtered = pd.DataFrame(test_rows)

            to_save = {
                'train': Dataset.from_pandas(train_filtered),
                'validation': Dataset.from_pandas(validation_filtered),
                'test': Dataset.from_pandas(test_filtered)
            }
            save_obj(dir_path, to_save, file_name)

        dataset = load_obj(dir_path, file_name)
        return dataset
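A self-contained example (not from the original code) of the operator-count threshold: the 'operators' field is a comma-separated string, so counting commas gives the number of steps minus one; fewer than four commas therefore means at most four steps, which keeps the example in train.

threshold_amount_ops = 4
short_ops = "select,filter,project"                        # 3 steps, 2 commas -> train
long_ops = "select,filter,project,aggregate,comparative"   # 5 steps, 4 commas -> validation/test
assert short_ops.count(',') < threshold_amount_ops
assert long_ops.count(',') >= threshold_amount_ops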
def build_vocab(specials, dir_path, file_name, sents, tokenizer):
    """
    Builds a vocabulary from the questions of the split.
    The created vocabulary is saved to a file for future usage.
    :param specials: The special tokens (e.g. unknown and padding tokens) to include in the vocabulary.
    :param dir_path: The directory in which to save the vocabulary counter.
    :param file_name: The name of the file to save the counter to.
    :param sents: The sentences (questions of a single split) to build the vocabulary from.
    :param tokenizer: A callable that splits a sentence into tokens.
    :return: The Vocab object.
    """
    if not (dir_path / file_name).is_file():
        # Build the vocabulary from the questions; note that the sentences belong to a single split.
        counter = Counter()
        for sent in sents:
            counter.update(tokenizer(sent))
        # Save the counter and the specials for later vocabulary construction.
        to_save = {'counter': counter, 'specials': specials}
        save_obj(dir_path, to_save, file_name)

    return load_vocab(dir_path, file_name)
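A usage sketch for build_vocab with assumed inputs: a spaCy-based tokenizer and typical special tokens; the path, file name, and sentences are illustrative. load_vocab (called at the end of build_vocab) is expected to turn the saved counter and specials into a Vocab object.

from pathlib import Path

import spacy

nlp = spacy.load('en_core_web_sm')

def tokenize(text):
    return [token.text for token in nlp.tokenizer(text)]

specials = ['<unk>', '<pad>', '<sos>', '<eos>']
sents = ["How many states border Texas?", "Return the cubes that are blue."]

vocab = build_vocab(specials, Path("data") / "break_data" / "vocabs",
                    "questions_vocab.pkl", sents, tokenizer=tokenize)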
    @staticmethod
    def load_dataset(data_dir, dataset_split, logger=None):
        """
        Loads the requested Break dataset from Hugging Face.
        :param data_dir: The path of the directory where the preprocessed dataset should be saved to or loaded from.
        :param dataset_split: The type of dataset to download from HF.
        :param logger: A logger for logging events.
        :return: The loaded dataset.
        """
        current_dir = Path()
        dir_path = current_dir / "data" / "break_data" / "preprocessed"
        file_name = "dataset_preprocessed_" + dataset_split + ".pkl"
        if not (dir_path / file_name).is_file():
            # Download and preprocess the BREAK dataset (logical form and lexicon), and save the preprocessed data.
            if logger:
                logger.info('Downloading and preparing datasets...')
            # Calls the Hugging Face datasets.load_dataset function (assumed to be imported at module level), not this method.
            dataset_logical = load_dataset('break_data',
                                           dataset_split,
                                           cache_dir=data_dir)
            save_obj(dir_path, dataset_logical, file_name)

        # Load the saved preprocessed data.
        dataset = load_obj(dir_path, file_name)
        return dataset
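save_obj and load_obj are used throughout these snippets but are not defined in them; a minimal pickle-based sketch of what they might look like (an assumption matching the call sites, not the original implementation):

import pickle
from pathlib import Path


def save_obj(dir_path, obj, file_name):
    """Pickle obj to dir_path/file_name, creating the directory if needed."""
    dir_path = Path(dir_path)
    dir_path.mkdir(parents=True, exist_ok=True)
    with open(dir_path / file_name, 'wb') as f:
        pickle.dump(obj, f)


def load_obj(dir_path, file_name):
    """Load a previously pickled object from dir_path/file_name."""
    with open(Path(dir_path) / file_name, 'rb') as f:
        return pickle.load(f)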