from typing import Iterator

from src.settings import paths
from src.helper.files import get_files


def file_iterator(benchmark_name: str = "correct", split: str = "training") -> Iterator[str]:
    """Iterates over the paragraph files of a given benchmark and partition.

    :param benchmark_name: name of the benchmark, equals the benchmark folder name
    :param split: name of the partition, either training, development or test
    :return: iterator over file paths, relative to the Wikipedia directory
        defined in src.settings.paths
    """
    benchmark_dir = split + "/" + benchmark_name + "/texts/"
    subdirs = sorted(get_files(paths.WIKI_DIR + benchmark_dir))
    for subdir in subdirs:
        files = sorted(get_files(paths.WIKI_DIR + benchmark_dir + subdir))
        for file in files:
            yield benchmark_dir + subdir + "/" + file
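# Hypothetical usage sketch: print the first three paragraph files of the
# "correct" benchmark's training split. Assumes paths.WIKI_DIR points at a
# prepared dump with the training/correct/texts/<subdir>/<file> layout
# described in the docstring above.
from itertools import islice

for relative_path in islice(file_iterator("correct", "training"), 3):
    print(relative_path)  # e.g. training/correct/texts/<subdir>/<file>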
from src.helper.files import get_files


def get_files_depth_two(directory):
    """
    Returns the paths to all files from the directories in the given directory.

    :param directory: a directory with directories containing the wanted files
    :return: list of full paths to all files at depth two
    """
    subdirs = get_files(directory)
    files = []
    for subdir in sorted(subdirs):
        path = directory + "/" + subdir + "/"
        subdir_files = get_files(path)
        for file in sorted(subdir_files):
            files.append(path + file)
    return files
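# For reference, a standard-library sketch of the same depth-two listing,
# assuming get_files simply returns the entry names of a directory. This
# equivalent is an assumption for illustration, not part of the project.
from pathlib import Path

def get_files_depth_two_stdlib(directory: str) -> list:
    # "*/*" matches entries exactly one subdirectory below the given directory
    return sorted(str(p) for p in Path(directory).glob("*/*") if p.is_file())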
def _remove_saved_models(self):
    """Deletes every subdirectory of the model directory that contains an
    exported SavedModel (identified by its saved_model.pb file)."""
    files = get_files(self.model_dir())
    saved_models = [
        f for f in files
        if file_exists(self.model_dir() + "/" + f + "/saved_model.pb")
    ]
    for model_name in saved_models:
        remove_dir(self.model_dir() + "/" + model_name)
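# The snippets in this section rely on helpers from src.helper.files. A
# minimal standard-library sketch of their assumed semantics (an assumption
# for readability, not the project's actual implementation):
import os
import shutil

def get_files(directory):
    # entry names (files and subdirectories) of a directory, not full paths
    return os.listdir(directory)

def file_exists(path):
    return os.path.isfile(path)

def remove_dir(path):
    shutil.rmtree(path)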
def latest_saved_model_dir(path):
    """
    Finds the most recent timestamped subfolder at the given path.
    Assumes folders are named by a timestamp.

    :param path: path containing at least one timestamped subfolder
    :return: string consisting of path/latest_timestamped_subfolder,
        or None if no saved model exists
    """
    files = get_files(path)
    saved_model_dirs = [
        f for f in files
        if "temp" not in f and file_exists(path + "/" + f + "/saved_model.pb")
    ]
    if len(saved_model_dirs) == 0:
        return None
    latest = max(int(model_dir) for model_dir in saved_model_dirs)
    return path + "/" + str(latest)
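# Hypothetical usage: resolve the newest exported model and fall back when
# none exists. The export path below is made up for illustration.
model_dir = latest_saved_model_dir("/models/my_estimator/export")
if model_dir is None:
    print("no saved model found, train one first")
else:
    print("loading", model_dir)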
import random

import project
from src.helper.files import get_files, read_lines
from select_acl_articles import get_year


def preprocess(line: str) -> str:
    """Collapses all whitespace runs in a line into single spaces."""
    tokens = [token for token in line.split() if len(token) > 0]
    return " ".join(tokens)


if __name__ == "__main__":
    random.seed(42)
    acl_dir = "/home/hertel/tokenization-repair-dumps/nastase/acl-201302_word-resegmented/raw/"
    files = sorted(get_files(acl_dir))
    files = [file for file in files if get_year(file) >= 2005]
    examples = []
    for file in files:
        lines = read_lines(acl_dir + file)
        lines = [preprocess(line) for line in lines]
        lines = [line for line in lines if len(line) > 0]
        examples.extend(lines)
    random.shuffle(examples)
    for line in examples:
        print(line)
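# Worked example of the whitespace normalization performed by preprocess:
# split() without arguments splits on any whitespace run and drops empty
# tokens, so tabs, newlines and repeated spaces all collapse to one space.
assert preprocess("Attention  Is \t All You\nNeed ") == "Attention Is All You Need"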
import project
from src.settings import paths
from src.helper.files import get_files
from src.helper.pickle import load_object


if __name__ == "__main__":
    for file in sorted(get_files(paths.THRESHOLD_FITTER_DIR)):
        fitter = load_object(paths.THRESHOLD_FITTER_DIR + file)
        print(file, fitter.n_sequences)
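# src.helper.pickle is assumed to be a thin wrapper around the standard
# pickle module, roughly like this sketch (not the project's actual code):
import pickle

def load_object(path):
    with open(path, "rb") as f:
        return pickle.load(f)

def dump_object(obj, path):
    with open(path, "wb") as f:
        pickle.dump(obj, f)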
def _remove_training_split_files():
    """Deletes all files in the Wikipedia training split directory."""
    for file in get_files(paths.WIKI_TRAINING_SPLIT_DIR):
        remove_file(paths.WIKI_TRAINING_SPLIT_DIR + file)
tuning_ids = set(random.sample(training_ids, 10000))
training_ids = set(training_ids)
development_ids = set(development_ids)
test_ids = set(test_ids)
evaluation_ids = development_ids.union(test_ids).union(tuning_ids)
tuning_sentences, development_sentences, test_sentences = [], [], []
sentence_splitter = WikiPunktTokenizer()
if TRAINING:
    training_file = open(paths.WIKI_TRAINING_SENTENCES, 'w', encoding="utf8")
for sub_dir in sorted(get_files(base_dir)):
    print(sub_dir)
    for file in sorted(get_files(base_dir + sub_dir)):
        path = base_dir + sub_dir + "/" + file
        for line in read_sequences(path):
            article = json.loads(line)
            id = article["id"]
            if not TRAINING and id in evaluation_ids:
                sentences = split_article(article["text"], sentence_splitter)
                sentences = filter_sentences(sentences)
                if len(sentences) > 0:
                    selected_sentence = random.choice(sentences)
                    selected_sentence = preprocess_sentence(selected_sentence)
                    if id in tuning_ids:
                        ...
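# Toy illustration of the ID bookkeeping above: tuning IDs are sampled from
# the training IDs, and evaluation_ids collects every article ID that must
# be held out of plain training. The numbers below are made up.
import random

random.seed(42)
train = list(range(100))
dev, test = [100, 101], [102, 103]
tuning = set(random.sample(train, 10))
evaluation = set(dev) | set(test) | tuning
assert all(t in evaluation for t in tuning)  # tuning articles are held out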
def get_all_prediction_files(self) -> List[str]:
    """Returns the names of all .txt prediction files in the results directory."""
    files = get_files(self.results_dir())
    return [file for file in files if file.endswith(".txt")]
import random
import shutil
import sys

import project
from src.helper.files import get_files, read_lines, write_lines
from src.settings import paths, symbols
from src.helper.pickle import dump_object


if __name__ == "__main__":
    random.seed(42)
    step = sys.argv[1]
    if step == "split":
        path = "/home/hertel/tokenization-repair-dumps/nastase/acl-201302_word-resegmented/raw/"
        files = sorted(get_files(path))
        print(len(files), "files")
        random.shuffle(files)
        n_test = 100
        out_path = "/home/hertel/tokenization-repair-dumps/acl_corpus/"
        for i, filename in enumerate(files):
            print(filename)
            if i < n_test:
                subdir = "development/"
            elif i < 2 * n_test:
                subdir = "test/"
            else:
                subdir = "training/"
            shutil.copy(path + filename, out_path + subdir + filename)
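# Hypothetical invocation of the "split" step above (the script file name is
# made up; it is not given in this snippet):
#   python split_acl_corpus.py split
# After the shuffle, the first 100 files become the development set, the
# next 100 the test set, and all remaining files the training set.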