Example #1
 def _results_folder(self):
     dir = paths.RESULTS_DIR
     if self.benchmark_name is not None:
         dir += self.benchmark_name + "/"
         if not path_exists(dir):
             make_directory(dir)
     dir += self.time_string if self.approach_name is None else self.approach_name
     if not path_exists(dir):
         make_directory(dir)
     return dir
Example #2
 def prepare_directories(self):
     for split in ["training", "development", "test"]:
         split_path = self.out_directory + split
         if not path_exists(split_path):
             make_directory(split_path)
         benchmark_split_path = split_path + "/" + self.benchmark_name
         if not path_exists(benchmark_split_path):
             make_directory(benchmark_split_path)
         texts_path = benchmark_split_path + "/texts"
         if not path_exists(texts_path):
             make_directory(texts_path)
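The three nested existence checks can also be collapsed into one call per split. A minimal standalone sketch, assuming the standard library's os.makedirs in place of the project's path_exists/make_directory helpers (the function name here is illustrative):

    import os

    def make_benchmark_dirs(out_directory: str, benchmark_name: str) -> None:
        # Creates out_directory/<split>/<benchmark_name>/texts for every split,
        # including any missing parent directories, with a single call per split.
        for split in ("training", "development", "test"):
            os.makedirs(os.path.join(out_directory, split, benchmark_name, "texts"),
                        exist_ok=True)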
Example #3
 def get_sequence_files(self, article, n_article_paragraphs):
     files = []
     for sequence_ix in range(n_article_paragraphs):
         if article["id"] in self.dev_ids:
             split_name = "development"
             subsplit = self.n_development_files // self.folder_size
             self.n_development_files += 1
         elif article["id"] in self.test_ids:
             split_name = "test"
             subsplit = self.n_test_files // self.folder_size
             self.n_test_files += 1
         else:
             split_name = "training"
             subsplit = self.n_training_files // self.folder_size
             self.n_training_files += 1
         subsplit = ("%." + str(SUBSPLIT_ID_LEN) + "i") % subsplit
         folder = self.out_directory + split_name + "/" + self.benchmark_name + "/texts/" + subsplit + "/"
         if not path_exists(folder):
             make_directory(folder)
         file_name_pattern = "%." + str(PARAGRAPH_ID_LEN) + "i_%s_%s_%." + str(SUB_SEQUENCE_ID_LEN) + "i.txt"
         file_name = file_name_pattern % (self.n_paragraphs,
                                          article["id"],
                                          article["title"][:MAX_TITLE_LEN].replace('/', '_'),
                                          sequence_ix)
         files.append(folder + file_name)
         self.n_paragraphs += 1
     return files
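The zero-padded IDs come from printf-style integer precision: "%.4i" % 3 yields "0003". A small worked example of the naming scheme; the constant values below are assumptions, not the project's actual SUBSPLIT_ID_LEN, PARAGRAPH_ID_LEN, SUB_SEQUENCE_ID_LEN and MAX_TITLE_LEN:

    SUBSPLIT_ID_LEN = 4      # assumed value
    PARAGRAPH_ID_LEN = 6     # assumed value
    SUB_SEQUENCE_ID_LEN = 2  # assumed value
    MAX_TITLE_LEN = 20       # assumed value

    subsplit = ("%." + str(SUBSPLIT_ID_LEN) + "i") % 3  # "0003"
    file_name_pattern = "%." + str(PARAGRAPH_ID_LEN) + "i_%s_%s_%." + str(SUB_SEQUENCE_ID_LEN) + "i.txt"
    file_name = file_name_pattern % (1234,
                                     "42",
                                     "Some/Article_Title"[:MAX_TITLE_LEN].replace('/', '_'),
                                     0)
    # file_name == "001234_42_Some_Article_Title_00.txt"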
Example #4
def find_wikipedia_dir() -> str:
    """Searches for the extracted Wikipedia files at pre-defined paths.

    :return: Absolute path to the extracted Wikipedia files.
    """
    dirs = anonymous_paths.EXTRACTED_WIKI_DIRS
    for dir in dirs:
        if path_exists(dir):
            return dir
    raise Exception("Could not find extracted Wikipedia directory.")
Example #5
def corrupt_dataset(directory: str,
                    p: float,
                    splits: List[str],
                    seed: int):
    """Generates a corrupt dataset in paragraph format.

    The format is as follows:
    directory
    ---| training
    -------| <benchmark_name>
    -----------| texts
    ---------------| 0000
    -------------------| <sequence_file_name>
    -------------------| ...
    ---------------| ...
    ---| development
    ---| test

    :param directory: output directory
    :param p: corruption probability
    :param splits: subset of {training, development, test}, provided as a list
    :param seed: corruption random seed
    """
    corruptor = _corruptor(p, seed)
    benchmark_name = _benchmark_name(p)
    for split in splits:
        benchmark_split_dir = directory + split + "/" + benchmark_name + "/"
        if not path_exists(benchmark_split_dir):
            make_directory(benchmark_split_dir)
        text_dir = benchmark_split_dir + "texts/"
        if not path_exists(text_dir):
            make_directory(text_dir)
        for file in Wikipedia.file_iterator(benchmark_name="correct", split=split):
            sequence = Wikipedia.get_sequence(file)
            corrupt = corruptor.corrupt(sequence)
            path_split = file.split('/')
            path_split[-4] = benchmark_name
            folder = paths.WIKI_DIR + '/'.join(path_split[:-1])
            if not path_exists(folder):
                make_directory(folder)
            path = paths.WIKI_DIR + '/'.join(path_split)
            write_file(path, corrupt)
    corruptor.print_summary()
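The path rewrite assumes that Wikipedia.file_iterator yields paths of the form <split>/<benchmark>/texts/<subsplit>/<file name> relative to paths.WIKI_DIR, so the component at index -4 is the benchmark name. A worked example with made-up values:

    file = "training/correct/texts/0000/000000_42_Some_Title_00.txt"  # made-up relative path
    benchmark_name = "corrupt_p0.1"                                   # made-up benchmark name
    path_split = file.split('/')
    path_split[-4] = benchmark_name  # replaces the benchmark component ("correct")
    corrupt_file = '/'.join(path_split)
    # corrupt_file == "training/corrupt_p0.1/texts/0000/000000_42_Some_Title_00.txt"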
Example #6
 def initialize(self, specification, encoder):
     """
     Initializes a new estimator model.
     Binds the given specification and encoder to self.
     Removes an existing estimator model with the same name if one is stored in the default model directory given by model_dir().
     Creates an Estimator and binds it to self.
     :param specification: specification holding the model's hyperparameters
     :param encoder: encoder that encodes text into subword unit labels (and decodes them back)
     """
     self.specification = specification
     self.encoder = encoder
     if path_exists(self.model_dir()):
         remove_dir(self.model_dir())
     self.estimator = self._make_estimator()
     self._save_specification()
     self._save_encoder()
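The remove-if-exists step corresponds to this standard-library pattern; a minimal sketch with shutil.rmtree standing in for the project's remove_dir helper (the function name is illustrative):

    import os
    import shutil

    def clear_model_dir(model_dir: str) -> None:
        # Deletes a previously stored model with the same name, if any, so the
        # newly created estimator starts from an empty model directory.
        if os.path.exists(model_dir):
            shutil.rmtree(model_dir)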
Example #7
from typing import Optional

from src.helper.files import path_exists, make_directory
from src.benchmark.subset import Subset

# BASE DIRECTORY
DUMP_DIRS = [
    #"/home/hertel/tokenization-repair-dumps/data_naacl2021/",  # repro
    "/home/hertel/tokenization-repair-dumps/data/",  # wunderfitz
    "/local/data/hertelm/tokenization-repair-dumps/data/",  # sirba
    "/data/1/matthias-hertel/tokenization-repair-dumps/data/",  # polyaxon
    "/external/"  # docker
]
DUMP_DIR = None
for dir in DUMP_DIRS:
    if path_exists(dir):
        DUMP_DIR = dir
        print("Located data folder: %s" % DUMP_DIR)
        break
if DUMP_DIR is None:
    raise Exception("Unable to locate data folder.")

# MODEL DIRECTORY FOR SERVER
MODEL_FOLDER = DUMP_DIR + "models_server/"

# ESTIMATOR DIRECTORY
ESTIMATORS_DIR = DUMP_DIR + "estimators/"

# DATA DIRECTORY
DATA_DIR = DUMP_DIR + "data/"
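Both the DUMP_DIR resolution above and find_wikipedia_dir in Example #4 follow the same "first existing candidate" pattern; a generic sketch of it, with a name chosen here purely for illustration:

    import os
    from typing import Sequence

    def first_existing_dir(candidates: Sequence[str]) -> str:
        # Returns the first candidate directory that exists on the current machine,
        # so the same code runs unchanged on different hosts and inside containers.
        for candidate in candidates:
            if os.path.isdir(candidate):
                return candidate
        raise FileNotFoundError("None of the candidate directories exists: %s" % list(candidates))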