def get_sequence_files(self, article, n_article_paragraphs):
    files = []
    for sequence_ix in range(n_article_paragraphs):
        # Assign the article's paragraphs to a split and select the current
        # subsplit folder (folder_size files per subsplit).
        if article["id"] in self.dev_ids:
            split_name = "development"
            subsplit = self.n_development_files // self.folder_size
            self.n_development_files += 1
        elif article["id"] in self.test_ids:
            split_name = "test"
            subsplit = self.n_test_files // self.folder_size
            self.n_test_files += 1
        else:
            split_name = "training"
            subsplit = self.n_training_files // self.folder_size
            self.n_training_files += 1
        subsplit = ("%." + str(SUBSPLIT_ID_LEN) + "i") % subsplit
        folder = self.out_directory + split_name + "/" + self.benchmark_name \
            + "/texts/" + subsplit + "/"
        if not path_exists(folder):
            make_directory(folder)
        # <paragraph_id>_<article_id>_<title>_<sub_sequence_id>.txt, with slashes
        # in the title replaced so they cannot break the path.
        file_name_pattern = "%." + str(PARAGRAPH_ID_LEN) + "i_%s_%s_%." \
            + str(SUB_SEQUENCE_ID_LEN) + "i.txt"
        file_name = file_name_pattern % (self.n_paragraphs,
                                         article["id"],
                                         article["title"][:MAX_TITLE_LEN].replace('/', '_'),
                                         sequence_ix)
        files.append(folder + file_name)
        self.n_paragraphs += 1
    return files

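# Illustrative example (the ID-length constants are assumed, not taken from the
# source): with SUBSPLIT_ID_LEN = 4, PARAGRAPH_ID_LEN = 8, SUB_SEQUENCE_ID_LEN = 4,
# two paragraphs of a training article {"id": "12", "title": "Anarchism"} map to
#   <out_directory>training/<benchmark_name>/texts/0000/00000000_12_Anarchism_0000.txt
#   <out_directory>training/<benchmark_name>/texts/0000/00000001_12_Anarchism_0001.txt
# ("%.8i" pads the integer with leading zeros to at least 8 digits).
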
def _save_encoder(self):
    """
    Stores the encoder at the model directory.
    File name is encoder.pkl.
    :return:
    """
    make_directory(self.model_dir())
    dump_object(self.encoder, self.model_dir() + "/encoder.pkl")

def _save_specification(self):
    """
    Stores the specification at the model directory.
    File name is specification.pkl.
    :return:
    """
    make_directory(self.model_dir())
    dump_object(self.specification, self.model_dir() + "/specification.pkl")

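# Hypothetical counterpart (assumption: dump_object pickles with pickle.dump, as
# the .pkl extension suggests). load_pickled is an illustrative helper, not part
# of this module.
import pickle

def load_pickled(path: str):
    # restore an object saved by one of the _save_* methods above
    with open(path, "rb") as file:
        return pickle.load(file)
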
def _results_folder(self):
    folder = paths.RESULTS_DIR
    if self.benchmark_name is not None:
        folder += self.benchmark_name + "/"
    if not path_exists(folder):
        make_directory(folder)
    folder += self.time_string if self.approach_name is None else self.approach_name
    if not path_exists(folder):
        make_directory(folder)
    return folder

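# Example (names are illustrative): with benchmark_name = "wikipedia" and
# approach_name = "my_approach", this resolves to
# paths.RESULTS_DIR + "wikipedia/my_approach"; with approach_name = None, the
# run's time_string is used instead, so every run gets its own folder.
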
def write_predicted_sequence_files(self):
    out_dir = self._results_folder() + "/predicted/"
    make_directory(out_dir)
    # First pass: truncate all target files, since several sequence results can
    # share a file and a rerun must not append to stale content.
    for sequence_result in self.sequence_results:
        with open(out_dir + sequence_result.file_name[0], 'w') as file:
            pass
    # Second pass: append each predicted sequence to its file.
    for sequence_result in self.sequence_results:
        with open(out_dir + sequence_result.file_name[0], 'a') as file:
            file.write(sequence_result.predicted_sequence + '\n')

def prepare_directories(self):
    for split in ["training", "development", "test"]:
        split_path = self.out_directory + split
        if not path_exists(split_path):
            make_directory(split_path)
        benchmark_split_path = split_path + "/" + self.benchmark_name
        if not path_exists(benchmark_split_path):
            make_directory(benchmark_split_path)
        texts_path = benchmark_split_path + "/texts"
        if not path_exists(texts_path):
            make_directory(texts_path)

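# Sketch of an equivalent (assumes path_exists/make_directory wrap
# os.path.isdir/os.mkdir): os.makedirs with exist_ok=True creates the whole
# chain in one call. The name ensure_benchmark_dirs is hypothetical, not part
# of this module.
import os

def ensure_benchmark_dirs(out_directory: str, benchmark_name: str):
    for split in ["training", "development", "test"]:
        # creates <out_directory><split>/<benchmark_name>/texts and any missing parents
        os.makedirs(out_directory + split + "/" + benchmark_name + "/texts",
                    exist_ok=True)
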
def corrupt_dataset(directory: str, p: float, splits: List[str], seed: int):
    """Generates a corrupt dataset in paragraph format.

    The format is as follows:
    directory
    ---| training
    -------| <benchmark_name>
    -----------| texts
    ---------------| 0000
    -------------------| <sequence_file_name>
    -------------------| ...
    ---------------| ...
    ---| development
    ---| test

    :param directory: output directory
    :param p: corruption probability
    :param splits: subset of {training, development, test}, provided as a list
    :param seed: corruption random seed
    """
    corruptor = _corruptor(p, seed)
    benchmark_name = _benchmark_name(p)
    for split in splits:
        benchmark_split_dir = directory + split + "/" + benchmark_name + "/"
        if not path_exists(benchmark_split_dir):
            make_directory(benchmark_split_dir)
        text_dir = benchmark_split_dir + "texts/"
        if not path_exists(text_dir):
            make_directory(text_dir)
        for file in Wikipedia.file_iterator(benchmark_name="correct", split=split):
            sequence = Wikipedia.get_sequence(file)
            corrupt = corruptor.corrupt(sequence)
            # Swap the benchmark component of the relative path ("correct" sits
            # four components from the end) for the corrupt benchmark's name.
            path_split = file.split('/')
            path_split[-4] = benchmark_name
            folder = paths.WIKI_DIR + '/'.join(path_split[:-1])
            if not path_exists(folder):
                make_directory(folder)
            path = paths.WIKI_DIR + '/'.join(path_split)
            write_file(path, corrupt)
    corruptor.print_summary()

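# Usage sketch (parameter values are illustrative assumptions, not from the source):
#
#     corrupt_dataset(paths.WIKI_DIR, p=0.1, splits=["development", "test"], seed=42)
#
# This reads every file of the "correct" benchmark for the chosen splits and
# writes a corrupted copy under <split>/<_benchmark_name(0.1)>/texts/ in
# paths.WIKI_DIR.
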
def unify_spacing(text: str) -> str:
    while '  ' in text:
        # str.replace returns a new string; reassign, or the loop never terminates
        text = text.replace('  ', ' ')
    return text


def preprocess_sentence(sentence: str) -> str:
    sentence = unify_quotation_marks(sentence)
    sentence = unify_spacing(sentence)
    return sentence


if __name__ == "__main__":
    TRAINING = sys.argv[1] == "training"
    random.seed(42)
    base_dir = paths.WIKI_DIR + "text/"
    make_directory(paths.WIKI_SENTENCES_DIR)
    training_ids, development_ids, test_ids = get_article_ids()
    tuning_ids = set(random.sample(training_ids, 10000))
    training_ids = set(training_ids)
    development_ids = set(development_ids)
    test_ids = set(test_ids)
    evaluation_ids = development_ids.union(test_ids).union(tuning_ids)
    tuning_sentences, development_sentences, test_sentences = [], [], []
    sentence_splitter = WikiPunktTokenizer()
    if TRAINING:
        training_file = open(paths.WIKI_TRAINING_SENTENCES, 'w',