def get_encoder(n: int = 0) -> CharacterEncoder:
    """Create a character encoder.

    :param n: number of characters taken from the front of the sorted
        character frequency dictionary; if n is not positive, the encoder
        dictionary stored at paths.WIKI_ENCODER_DICT is loaded instead
    :return: CharacterEncoder wrapping a symbol -> index dictionary
    """
    if n > 0:
        char_frequencies = load_object(paths.CHARACTER_FREQUENCY_DICT)
        ranked = sort_dict_by_value(char_frequencies)
        # The first n ranked characters come first, the special symbols last.
        code_symbols = [char for char, _ in ranked[:n]]
        code_symbols += [symbols.SOS, symbols.EOS, symbols.UNKNOWN]
        symbol_to_index = dict(zip(code_symbols, range(len(code_symbols))))
        return CharacterEncoder(symbol_to_index)
    return CharacterEncoder(load_object(paths.WIKI_ENCODER_DICT))
def get_article_ids_split(out_directory: str) -> Tuple[List[int], List[int], List[int]]:
    """Load the article IDs of the training, development and test partitions.

    :param out_directory: directory where the IDs are stored; the file names
        are appended directly, so it has to end with a path separator
    :return: lists of training, development and test article IDs
    """
    id_files = (
        "training_article_ids.pkl",
        "development_article_ids.pkl",
        "test_article_ids.pkl",
    )
    training_ids, development_ids, test_ids = [
        load_object(out_directory + id_file) for id_file in id_files
    ]
    return training_ids, development_ids, test_ids
def load(self, name):
    """Load an exported model.

    Specification and encoder are read from the subfolder
    paths.MODEL_DIR + name, then the estimator and the prediction function
    are rebuilt from them.

    :param name: name of the model, defines the model subfolder
    """
    model_folder = paths.MODEL_DIR + name
    self.specification = load_object(model_folder + "/specification.pkl")
    self.encoder = load_object(model_folder + "/encoder.pkl")
    # The estimator is created only after specification and encoder are set.
    self.estimator = self._make_estimator()
    self._update_saved_model()
    self.predict_fn = tf.contrib.predictor.from_saved_model(
        latest_saved_model_dir(self.model_dir())
    )
def most_frequent_wiki_and_all_aspell_word_counts(k_most_frequent):
    """Return the counts of the most frequent tokens and of all Aspell words.

    The result is cached: if the file paths.WIKI_AND_ASPELL_TOKEN_COUNTERS
    (formatted with k_most_frequent) exists it is loaded, otherwise the
    dictionary is computed and written to that file.

    :param k_most_frequent: passed on to most_frequent_tokens
    :return: dictionary word -> count; words from paths.ASPELL_WORD_FILE that
        have no entry in the token counters get the count 0
    """
    path = paths.WIKI_AND_ASPELL_TOKEN_COUNTERS % k_most_frequent
    if file_exists(path):
        return load_object(path)
    words = most_frequent_tokens(k_most_frequent)
    wiki_word_counters = load_object(paths.WIKI_TOKEN_COUNTERS)
    with open(paths.ASPELL_WORD_FILE) as f:
        for line in f:
            # Remove only the line terminator. Slicing off the last character
            # would truncate the final word of a file that does not end with
            # a newline.
            word = line.rstrip("\n")
            if word not in words:
                words[word] = wiki_word_counters.get(word, 0)
    dump_object(words, path)
    return words
def corrupt_dataset_single(p, splits, seed):
    """Create a corrupt dataset in single-file format.

    For each split, every sequence of the split's text file is corrupted and
    written as one UTF-8 line to the benchmark's text file. A Sequence record
    with the byte offset, byte length and character length of each written
    line is collected and pickled per split. The corruptor's summary is
    printed once all splits are processed.

    :param p: corruption probability
    :param splits: subset of {training, development, test}, provided as a list
    :param seed: corruption random seed
    """
    corruptor = _corruptor(p, seed)
    benchmark_name = _benchmark_name(p)
    for split in splits:
        # Every split other than training/development uses the test sequences.
        reference_path = (
            paths.WIKI_TRAINING_SEQUENCES if split == "training"
            else paths.WIKI_DEVELOPMENT_SEQUENCES if split == "development"
            else paths.WIKI_TEST_SEQUENCES
        )
        correct_sequences = load_object(reference_path)
        corrupt_sequences = []
        in_path = paths.WIKI_SINGLE_DIR + split + ".txt"
        out_path = paths.WIKI_SINGLE_DIR + "%s_%s.txt" % (benchmark_name, split)
        line_start = 0
        with open(out_path, 'wb') as out_file:
            for s_i, sequence in enumerate(read_sequences(in_path)):
                corrupt = corruptor.corrupt(sequence)
                s_id = correct_sequences[s_i].id
                out_file.write((corrupt + '\n').encode("utf8"))
                # The file position after the write marks the end of this line.
                line_end = out_file.tell()
                corrupt_sequences.append(
                    Sequence(s_id, split, line_start, line_end - line_start, len(corrupt))
                )
                line_start = line_end
        corrupt_sequences_path = paths.WIKI_OUT_DIR + "%s_%s_sequences.pkl" % (benchmark_name, split)
        dump_object(corrupt_sequences, corrupt_sequences_path)
    corruptor.print_summary()
def __init__(self, n, use_aspell=True, postprocessing_method=None, verbose=False):
    """Load the token counters and, if requested, a split model.

    :param n: passed on to the token counter loading function
    :param use_aspell: if True, the counters come from
        most_frequent_wiki_and_all_aspell_word_counts, otherwise from
        most_frequent_tokens
    :param postprocessing_method: "SVM" or "RF" load the corresponding split
        model; for any other value no split model is loaded
    :param verbose: stored as self.verbose
    """
    if use_aspell:
        self.token_counters = most_frequent_wiki_and_all_aspell_word_counts(n)
    else:
        self.token_counters = most_frequent_tokens(n)
    self.postprocessing_method = postprocessing_method
    # Always define the attribute, so that instances created without a
    # recognized postprocessing method do not raise AttributeError when
    # split_model is accessed.
    self.split_model = None
    if postprocessing_method == "SVM":
        self.split_model = load_object(paths.SVM_SPLIT_MODEL)
    elif postprocessing_method == "RF":
        self.split_model = load_object(paths.RF_SPLIT_MODEL)
    self.verbose = verbose
def __init__(self, n=None):
    """Load the token set from paths.TOKEN_FREQUENCY_DICT.

    :param n: number of tokens kept from the front of the dictionary sorted
        by sort_dict_by_value; None keeps all tokens
    """
    token_frequencies = load_object(paths.TOKEN_FREQUENCY_DICT)
    if n is None:
        self.tokens = set(token_frequencies)
    else:
        ranked = sort_dict_by_value(token_frequencies)
        self.tokens = {token for token, _ in ranked[:n]}
    # default=0 covers an empty token set (n == 0 or an empty dictionary),
    # for which max() would otherwise raise ValueError.
    self.max_token_len = max((len(token) for token in self.tokens), default=0)
def load_most_frequent(n):
    """Load the merged unigram frequencies, optionally restricted by n.

    The frequencies from paths.UNIGRAM_DELIM_FREQUENCY_DICT and
    paths.UNIGRAM_NO_DELIM_FREQUENCY_DICT are summed per token. If n is given,
    the selection is cached at paths.MOST_FREQUENT_UNIGRAMS_DICT % n.

    :param n: passed on to select_most_frequent; None returns all merged
        frequencies without reading or writing the cache
    :return: dictionary token -> frequency
    """
    cache_path = None if n is None else paths.MOST_FREQUENT_UNIGRAMS_DICT % n
    if n is not None and file_exists(cache_path):
        return load_object(cache_path)
    # The dictionary with delimiters is extended in place by the other one.
    merged = load_object(paths.UNIGRAM_DELIM_FREQUENCY_DICT)
    no_delim = load_object(paths.UNIGRAM_NO_DELIM_FREQUENCY_DICT)
    for token in no_delim:
        if token in merged:
            merged[token] += no_delim[token]
        else:
            merged[token] = no_delim[token]
    if n is None:
        return merged
    selected = select_most_frequent(merged, n)
    dump_object(selected, cache_path)
    return selected
def test_sequences(n_sequences: Optional[int] = None, seed: Optional[int] = None):
    """Read the correct test sequences stored in single-file format.

    :param n_sequences: number of sequences, set None to retrieve all
    :param seed: seed for shuffling, set None for unshuffled
    :return: result of Wikipedia._read_sequences for the test file
    """
    sequence_records = load_object(paths.WIKI_TEST_SEQUENCES)
    return Wikipedia._read_sequences(
        paths.WIKI_TEST_FILE, sequence_records, seed, n_sequences
    )
def get_evaluation_samples(file_name):
    """Load the evaluation samples belonging to a file name.

    :param file_name: name prefix of the evaluation files, relative to
        paths.WIKI_EVALUATION_DIR
    :return: list of (original line, corrupt line, corruptions) tuples
    """

    def non_empty_lines(path):
        # Split the file content at newlines and drop empty lines.
        return [line for line in read_file(path).split('\n') if line]

    prefix = paths.WIKI_EVALUATION_DIR + file_name
    original_lines = non_empty_lines(prefix + file_names.ORIGINAL_SUFFIX)
    corrupt_lines = non_empty_lines(prefix + file_names.CORRUPT_SUFFIX)
    corruptions = load_object(prefix + file_names.CORRUPTIONS_SUFFIX)
    # All three sources must describe the same number of samples.
    n_samples = len(original_lines)
    assert len(corrupt_lines) == n_samples
    assert len(corruptions) == n_samples
    return list(zip(original_lines, corrupt_lines, corruptions))
def __init__(self, fitting_method: FittingMethod = FittingMethod.GREEDY, autosave: bool = True):
    """Load the decision threshold dictionary of a fitting method.

    :param fitting_method: key into THRESHOLD_FILES, selects the file
    :param autosave: stored as self.autosave
    """
    self.file = THRESHOLD_FILES[fitting_method]
    if not file_exists(self.file):
        # Start with an empty dictionary when nothing was stored yet.
        print(
            "WARNING: could not locate %s. A new, empty decision threshold dictionary was created instead." % self.file)
        self.threshold_dict = {}
    else:
        self.threshold_dict = load_object(self.file)
    self.autosave = autosave
def most_frequent_tokens(n):
    """Load the token counters, optionally restricted to the first n tokens.

    Intermediate results are cached on disk: the sorted counters at
    paths.WIKI_SORTED_TOKEN_COUNTERS and the selection at
    paths.WIKI_MOST_FREQUENT_TOKENS % n.

    :param n: number of entries taken from the front of the sorted counters;
        None returns all counters from paths.WIKI_TOKEN_COUNTERS
    :return: the loaded counters, or the dictionary built by
        sorted_word_counters_to_dict from the first n sorted counters
    """
    if n is None:
        print("loading counters...")
        return load_object(paths.WIKI_TOKEN_COUNTERS)
    top_n_path = paths.WIKI_MOST_FREQUENT_TOKENS % n
    if file_exists(top_n_path):
        print("loading most frequent counters...")
        return load_object(top_n_path)
    sorted_path = paths.WIKI_SORTED_TOKEN_COUNTERS
    if not file_exists(sorted_path):
        # No sorted counters on disk yet: sort all counters and store them.
        print("loading counters...")
        all_counters = load_object(paths.WIKI_TOKEN_COUNTERS)
        print("sorting counters...")
        ranked = sort_word_counters(all_counters)
        pickle_dump(ranked, sorted_path)
    else:
        print("loading sorted counters...")
        ranked = load_object(sorted_path)
    top_n = sorted_word_counters_to_dict(ranked[:n])
    if not file_exists(top_n_path):
        pickle_dump(top_n, top_n_path)
    return top_n
def __init__(self, two_pass: bool = False, seq_acc: bool = False, autosave: bool = True):
    """Load the beam search penalties from the file selected by the flags.

    :param two_pass: selects paths.TWO_PASS_BEAM_SEARCH_PENALTY_FILE, unless
        seq_acc is set
    :param seq_acc: selects paths.SEQ_ACC_BEAM_SEARCH_PENALTY_FILE
    :param autosave: stored as self.autosave
    """
    # seq_acc takes precedence over two_pass.
    self.file = (
        paths.SEQ_ACC_BEAM_SEARCH_PENALTY_FILE if seq_acc
        else paths.TWO_PASS_BEAM_SEARCH_PENALTY_FILE if two_pass
        else paths.BEAM_SEARCH_PENALTY_FILE
    )
    # A missing file results in an empty penalty dictionary.
    self.penalties = load_object(self.file) if file_exists(self.file) else {}
    self.autosave = autosave
def get_stump_dict(unigrams: UnigramHolder) -> Dict[str, Set[str]]:
    """Load the stump dictionary, or build and store it if it does not exist.

    :param unigrams: unigram holder providing the tokens and their frequencies
    :return: dictionary stump -> set of tokens that have this stump
    """
    if file_exists(paths.STUMP_DICT):
        return load_object(paths.STUMP_DICT)
    stump_to_tokens = {}
    for token in unigrams.frequencies:
        # Only alphabetic tokens with at least the minimum frequency count.
        if not token.isalpha() or unigrams.get(token) < MIN_TOKEN_FREQUENCY:
            continue
        for stump in get_stumps(token):
            stump_to_tokens.setdefault(stump, set()).add(token)
    dump_object(stump_to_tokens, paths.STUMP_DICT)
    return stump_to_tokens
def __init__(self, trained_abbreviations: bool = False, extended_abbreviations: bool = True):
    """
    The default Punkt tokenizer with additional abbreviations.

    :param trained_abbreviations: Add the abbreviations of the tokenizer
        returned by self._load_trained_tokenizer().
    :param extended_abbreviations: Add the lowercased abbreviations stored at
        paths.EXTENDED_PUNKT_ABBREVIATIONS.
    """
    self.tokenizer = load_default_nltk_tokenizer()
    if trained_abbreviations:
        wiki_tokenizer = self._load_trained_tokenizer()
        self.tokenizer._params.abbrev_types.update(wiki_tokenizer._params.abbrev_types)
    if extended_abbreviations:
        self.tokenizer._params.abbrev_types.update(
            abbr.lower() for abbr in load_object(paths.EXTENDED_PUNKT_ABBREVIATIONS)
        )
def training_batches(batch_size: int, seed: int = 42) -> Iterator[Batch]:
    """Iterates over batches of correct training sequences stored in split format.

    Loads the number of sequences of equal length from disk and splits the
    sequences into batches of the same length. Reads the split files in
    shuffled order. Actually, only the order of the files is shuffled, but the
    batches are fixed, and the order of batches of the same sequence length is
    fixed. For training for multiple epochs, one should really shuffle the
    sequences and batches.

    :param batch_size: maximum number of sequences per batch
    :param seed: seed to shuffle the files
    :return: iterator over batches
    """
    num_sequences = load_object(paths.WIKI_TRAINING_SEQUENCE_COUNTS)
    # One entry per batch, holding the sequence length the batch is read for.
    batches = []
    for seq_len in sorted(num_sequences):
        n_batches = math.ceil(num_sequences[seq_len] / batch_size)
        batches += [seq_len] * n_batches
    random.Random(seed).shuffle(batches)
    # Remembers per sequence length where the next batch starts in its file.
    file_positions = {}
    for seq_len in batches:
        start = file_positions.get(seq_len, 0)
        path = paths.WIKI_TRAINING_SPLIT_DIR + "%i.txt" % seq_len
        with open(path) as file:
            batch_sequences = []
            file.seek(start)
            for _ in range(batch_size):
                line = file.readline()
                if line == "":
                    break
                # Remove only the line terminator. Slicing off the last
                # character would truncate the final sequence of a file that
                # does not end with a newline.
                batch_sequences.append(line.rstrip("\n"))
            file_positions[seq_len] = file.tell()
        # The file is closed before the batch is handed to the consumer.
        yield Batch(batch_sequences, len(batch_sequences), seq_len)
import sys

from project import src
from src.helper.pickle import load_object
from src.settings import paths


if __name__ == "__main__":
    # Print the test article IDs if "test" is among the command line
    # arguments, otherwise the development article IDs, in ascending order.
    ids_path = (
        paths.WIKI_TEST_ARTICLE_IDS if "test" in sys.argv
        else paths.WIKI_DEVELOPMENT_ARTICLE_IDS
    )
    article_ids = sorted(int(article_id) for article_id in load_object(ids_path))
    for article_id in article_ids:
        print(article_id)
EPSILON = 1e-16 lookahead = 2 labeling = labeling_model_name != "0" title = "%s (%s)" % (approach, benchmark_name) #BENCHMARKS = ["0_0.1", "0.1_0.1", "arxiv-910k", "nastase-big"] BENCHMARKS = [benchmark_name] all_insertion_intervals = [] all_deletion_intervals = [] for benchmark in BENCHMARKS: cases_path = paths.CASES_FILE_NOISY if benchmark.startswith("0.1") else paths.CASES_FILE_CLEAN cases_path = cases_path % (model_name, "wikipedia" if benchmark.startswith("0") else benchmark) sequence_cases = load_object(cases_path) print(len(sequence_cases)) if labeling_model_name != "0": from src.estimator.bidirectional_labeling_estimator import BidirectionalLabelingEstimator labeling_model = BidirectionalLabelingEstimator() labeling_model.load(labeling_model_name) benchmark = Benchmark(benchmark, Subset.TUNING) case_db = [] correct_sequences = benchmark.get_sequences(BenchmarkFiles.CORRECT) corrupt_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT) for s_i, (correct, corrupt) in enumerate(zip(correct_sequences, corrupt_sequences)):
trainer = PunktTrainer() trainer.INCLUDE_ALL_COLLOCS = True trainer.ABBREV = 0.3 trainer.train(text, verbose=True) del text print("building tokenizer...") tokenizer = PunktSentenceTokenizer(trainer.get_params()) abbrevs = tokenizer._params.abbrev_types print(sorted(abbrevs)) print("%i abbreviations" % len(abbrevs)) target_abbrevs = [ "i.e", "e.g", "prof", "dr", "m.sc", "no", "nos", "mr", "mrs", "ms", "seq", "o.r.s" ] for target in target_abbrevs: print(target, target in abbrevs, score(trainer, target)) print("saving...") dump_object(tokenizer, PICKLE_FILE) elif MODE == "starters": tokenizer = load_object(PICKLE_FILE) for starter in sorted(tokenizer._params.sent_starters): print(starter) elif MODE == "starters-nltk": tokenizer = NLTKSentenceSplitter() for starter in sorted(tokenizer.tokenizer._params.sent_starters): print(starter)
def __init__(self):
    """Load the results dictionary, or start with an empty one.

    The dictionary is read from paths.RESULTS_DICT if that file exists.
    """
    results_path = paths.RESULTS_DICT
    self.results = load_object(results_path) if file_exists(results_path) else {}
import sys

import project
from src.helper.pickle import load_object, dump_object
from src.settings.paths import ESTIMATORS_DIR


if __name__ == "__main__":
    # Set the name attribute of a stored specification to the estimator name
    # given as first command line argument, and write the specification back.
    estimator_name = sys.argv[1]
    specification_file = ESTIMATORS_DIR + estimator_name + "/specification.pkl"
    spec = load_object(specification_file)
    spec.name = estimator_name
    dump_object(spec, specification_file)
from src.data.raw_wikipedia import get_article_jsons
from src.data.preprocessing import preprocess_sequence


def select_random_paragraph(text: str) -> str:
    """Select a random non-empty paragraph of a text.

    :param text: text whose paragraphs are separated by newline characters
    :return: one randomly chosen paragraph, after preprocess_sequence was
        applied to it
    """
    paragraphs = text.split('\n')
    paragraphs = [preprocess_sequence(paragraph) for paragraph in paragraphs]
    # Drop the paragraphs that are empty after preprocessing.
    paragraphs = [paragraph for paragraph in paragraphs if len(paragraph) > 0]
    # NOTE(review): random.choice raises IndexError if no paragraph is left.
    selected = random.choice(paragraphs)
    return selected


if __name__ == "__main__":
    # Fixed seed, so that the same paragraphs are selected on every run.
    random.seed(1998)
    development_ids = set(load_object(paths.WIKI_DEVELOPMENT_ARTICLE_IDS))
    test_ids = set(load_object(paths.WIKI_TEST_ARTICLE_IDS))
    print(development_ids)
    print(test_ids)
    development_paragraphs = []
    test_paragraphs = []
    for article in get_article_jsons():
        id = article["id"]
        is_dev = id in development_ids
        # An article that is in both ID sets counts as development article.
        is_test = (not is_dev) and id in test_ids
        if is_dev or is_test:
            paragraph = select_random_paragraph(article["text"])
            if is_dev:
                development_paragraphs.append(paragraph)
import sys

import project
from src.settings import paths
from src.helper.pickle import load_object
from src.helper.data_structures import sort_dict_by_value


if __name__ == "__main__":
    # Print rank (starting at 1), character and frequency for every entry of
    # the character frequency dictionary, in the order of sort_dict_by_value.
    frequencies = load_object(paths.CHARACTER_FREQUENCY_DICT)
    ranked = sort_dict_by_value(frequencies)
    for rank, (char, frequency) in enumerate(ranked, start=1):
        print(rank, char, frequency)
def __init__(self, model, backward):
    """Store the model and the backward flag, and load the encoder dictionary.

    :param model: stored as self.model
    :param backward: stored as self.backward
    """
    self.model = model
    # The encoder is the object stored at paths.WIKI_ENCODER_DICT.
    encoder_dict = load_object(paths.WIKI_ENCODER_DICT)
    self.encoder = encoder_dict
    self.backward = backward
def get_dictionaries():
    """Load the encoder and the decoder dictionary from disk.

    :return: tuple of the objects stored at paths.WIKI_ENCODER_DICT and
        paths.WIKI_DECODER_DICT, in this order
    """
    return load_object(paths.WIKI_ENCODER_DICT), load_object(paths.WIKI_DECODER_DICT)
def load():
    """Load the object stored at paths.BIGRAM_HOLDER.

    :return: the unpickled object
    """
    holder = load_object(paths.BIGRAM_HOLDER)
    return holder
def get_character_counts() -> Dict[str, int]:
    """Load the character counts from disk.

    The file location is paths.WIKI_CHARACTER_COUNT_DICT.

    :return: dictionary char -> count
    """
    character_counts = load_object(paths.WIKI_CHARACTER_COUNT_DICT)
    return character_counts
def get_encoder_from_dict(path: str) -> CharacterEncoder:
    """Create a character encoder from a stored encoder dictionary.

    :param path: path of the pickled encoder dictionary
    :return: CharacterEncoder wrapping the loaded dictionary
    """
    return CharacterEncoder(load_object(path))
def load(self):
    """Load the object stored at the path returned by self._file().

    :return: the unpickled object
    """
    file_path = self._file()
    return load_object(file_path)
import project
from src.settings import paths
from src.helper.files import get_files
from src.helper.pickle import load_object


if __name__ == "__main__":
    # Print the file name and the n_sequences attribute of every object
    # stored in the threshold fitter directory, sorted by file name.
    fitter_dir = paths.THRESHOLD_FITTER_DIR
    for fitter_file in sorted(get_files(fitter_dir)):
        fitter = load_object(fitter_dir + fitter_file)
        print(fitter_file, fitter.n_sequences)