Example #1
    def file_iterator(benchmark_name: str = "correct",
                      split: str = "training") -> Iterator[str]:
        """Iterates over the paragraph files of a given benchmark and partition.

        :param benchmark_name: name of the benchmark, equals the benchmark folder name
        :param split: name of the partition, either training, development or test
        :return: iterator over file paths, relative from the Wikipedia directory defined in src.settings.paths
        """
        dir = split + "/" + benchmark_name + "/texts/"
        subdirs = sorted(get_files(paths.WIKI_DIR + dir))
        for subdir in subdirs:
            files = sorted(get_files(paths.WIKI_DIR + dir + subdir))
            for file in files:
                yield dir + subdir + "/" + file
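A hedged usage sketch for file_iterator, assuming paths.WIKI_DIR is configured as in the repository settings (not part of the original module):

# Illustrative sketch only: print the full paths of the development
# paragraphs of the "correct" benchmark.
for relative_path in file_iterator("correct", split="development"):
    print(paths.WIKI_DIR + relative_path)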
Example #2
def get_files_depth_two(directory):
    """
    Returns the full paths to all files located in the immediate
    subdirectories of the given directory.

    :param directory: a directory whose subdirectories contain the wanted files
    :return: list of full paths to all files at depth two
    """
    subdirs = get_files(directory)
    files = []
    for subdir in sorted(subdirs):
        path = directory + "/" + subdir + "/"
        subdir_files = get_files(path)
        for file in sorted(subdir_files):
            files.append(path + file)
    return files
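All of these examples rely on helpers from src.helper.files whose implementations are not shown here. A minimal sketch of what they might look like, assuming they are thin wrappers around the os module (the real implementations may differ):

import os


def get_files(directory):
    # Assumed behaviour: return the names (not full paths) of the entries
    # directly inside the given directory.
    return os.listdir(directory)


def file_exists(path):
    # Assumed behaviour: True if the path points to an existing regular file.
    return os.path.isfile(path)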
Example #3
    def _remove_saved_models(self):
        # Delete every subdirectory of model_dir() that contains an exported
        # saved_model.pb, i.e. remove all previously saved models.
        files = get_files(self.model_dir())
        saved_models = [
            f for f in files
            if file_exists(self.model_dir() + "/" + f + "/saved_model.pb")
        ]
        for model_name in saved_models:
            remove_dir(self.model_dir() + "/" + model_name)
Example #4
def latest_saved_model_dir(path):
    """
    Finds the most recent timestamped subfolder at the given path.
    Assumes folders are named by a timestamp.
    :param path: path containing at least one timestamped subfolder
    :return: string consisting of path/latest_timestamped_subfolder, or None if no saved model exists
    """
    files = get_files(path)
    saved_model_dirs = [
        f for f in files
        if "temp" not in f and file_exists(path + "/" + f + "/saved_model.pb")
    ]
    if len(saved_model_dirs) == 0:
        return None
    latest = sorted([int(model_dir) for model_dir in saved_model_dirs])[-1]
    return path + "/" + str(latest)
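A hedged usage sketch for latest_saved_model_dir, assuming a TensorFlow-style export layout with timestamped subfolders (the directory below is hypothetical, for illustration only):

# Illustrative sketch: pick the newest export under a hypothetical directory.
export_root = "/tmp/exports"  # hypothetical path
latest = latest_saved_model_dir(export_root)
if latest is None:
    print("no saved model found under", export_root)
else:
    print("loading saved model from", latest)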
import random

import project
from src.helper.files import get_files, read_lines
from select_acl_articles import get_year


def preprocess(line: str):
    tokens = [token for token in line.split() if len(token) > 0]
    line = " ".join(tokens)
    return line


if __name__ == "__main__":
    random.seed(42)

    acl_dir = "/home/hertel/tokenization-repair-dumps/nastase/acl-201302_word-resegmented/raw/"

    files = sorted(get_files(acl_dir))

    files = [file for file in files if get_year(file) >= 2005]
    examples = []

    for file in files:
        lines = read_lines(acl_dir + file)
        lines = [preprocess(line) for line in lines]
        lines = [line for line in lines if len(line) > 0]
        examples.extend(lines)

    random.shuffle(examples)
    for line in examples:
        print(line)
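The script above filters files by publication year via get_year from select_acl_articles, whose implementation is not shown here. One plausible sketch, assuming ACL Anthology style file names such as "P05-1001.txt" where the two digits after the leading letter encode the year:

def get_year(filename):
    # Assumed sketch: map the two-digit year in an ACL Anthology id to a
    # four-digit year, e.g. "P05-1001.txt" -> 2005, "W99-0201.txt" -> 1999.
    two_digits = int(filename[1:3])
    return 2000 + two_digits if two_digits < 50 else 1900 + two_digits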
import project
from src.settings import paths
from src.helper.files import get_files
from src.helper.pickle import load_object

if __name__ == "__main__":
    for file in sorted(get_files(paths.THRESHOLD_FITTER_DIR)):
        fitter = load_object(paths.THRESHOLD_FITTER_DIR + file)
        print(file, fitter.n_sequences)
def _remove_training_split_files():
    for file in get_files(paths.WIKI_TRAINING_SPLIT_DIR):
        remove_file(paths.WIKI_TRAINING_SPLIT_DIR + file)
    tuning_ids = set(random.sample(training_ids, 10000))
    training_ids = set(training_ids)
    development_ids = set(development_ids)
    test_ids = set(test_ids)
    evaluation_ids = development_ids.union(test_ids).union(tuning_ids)

    tuning_sentences, development_sentences, test_sentences = [], [], []

    sentence_splitter = WikiPunktTokenizer()

    if TRAINING:
        training_file = open(paths.WIKI_TRAINING_SENTENCES,
                             'w',
                             encoding="utf8")

    for sub_dir in sorted(get_files(base_dir)):
        print(sub_dir)
        for file in sorted(get_files(base_dir + sub_dir)):
            path = base_dir + sub_dir + "/" + file
            for line in read_sequences(path):
                article = json.loads(line)
                id = article["id"]
                if not TRAINING and id in evaluation_ids:
                    sentences = split_article(article["text"],
                                              sentence_splitter)
                    sentences = filter_sentences(sentences)
                    if len(sentences) > 0:
                        selected_sentence = random.choice(sentences)
                        selected_sentence = preprocess_sentence(
                            selected_sentence)
                        if id in tuning_ids:
    def get_all_prediction_files(self) -> List[str]:
        # Return the names of all .txt prediction files in results_dir().
        files = get_files(self.results_dir())
        filtered = [file for file in files if file.endswith(".txt")]
        return filtered
import random
import shutil
import sys

import project
from src.helper.files import get_files, read_lines, write_lines
from src.settings import paths, symbols
from src.helper.pickle import dump_object

if __name__ == "__main__":
    random.seed(42)

    step = sys.argv[1]

    if step == "split":
        path = "/home/hertel/tokenization-repair-dumps/nastase/acl-201302_word-resegmented/raw/"
        files = sorted(get_files(path))
        print(len(files), "files")

        random.shuffle(files)
        n_test = 100

        out_path = "/home/hertel/tokenization-repair-dumps/acl_corpus/"
        for i, filename in enumerate(files):
            print(filename)
            if i < n_test:
                subdir = "development/"
            elif i < 2 * n_test:
                subdir = "test/"
            else:
                subdir = "training/"
            shutil.copy(path + filename, out_path + subdir + filename)