def corrupt_dataset_single(p, splits, seed):
    """Creates a corrupt dataset in single-file format.

    For every requested split the sequences of the split's text file are
    corrupted one by one and written, UTF-8 encoded and newline-terminated,
    to ``<benchmark name>_<split>.txt``. For each written line a ``Sequence``
    record (ID of the corresponding correct sequence, split, byte offset,
    byte length, character length) is collected, and the list of records is
    pickled once the split is finished.

    :param p: corruption probability
    :param splits: subset of {training, development, test}, provided as a list
    :param seed: corruption random seed
    """
    corruptor = _corruptor(p, seed)
    benchmark_name = _benchmark_name(p)
    for split in splits:
        # Every split name other than "training"/"development" is treated as test.
        if split == "training":
            reference_path = paths.WIKI_TRAINING_SEQUENCES
        elif split == "development":
            reference_path = paths.WIKI_DEVELOPMENT_SEQUENCES
        else:
            reference_path = paths.WIKI_TEST_SEQUENCES
        reference_sequences = load_object(reference_path)

        source_path = paths.WIKI_SINGLE_DIR + split + ".txt"
        target_path = paths.WIKI_SINGLE_DIR + "%s_%s.txt" % (benchmark_name, split)

        records = []
        offset = 0
        with open(target_path, 'wb') as target_file:
            for index, original in enumerate(read_sequences(source_path)):
                corrupted = corruptor.corrupt(original)
                # The corrupt sequence inherits the ID of its correct counterpart.
                sequence_id = reference_sequences[index].id
                target_file.write((corrupted + '\n').encode("utf8"))
                # Byte length is measured from the file position, so it
                # includes the trailing newline.
                n_bytes = target_file.tell() - offset
                records.append(
                    Sequence(sequence_id, split, offset, n_bytes, len(corrupted)))
                offset += n_bytes

        records_path = paths.WIKI_OUT_DIR + "%s_%s_sequences.pkl" % (benchmark_name, split)
        dump_object(records, records_path)
    corruptor.print_summary()
def count_unigrams(n_sequences: int):
    """Counts token occurrences in the Wikipedia training sequences.

    Two dictionaries mapping token text to count are filled: one for tokens
    whose ``delimiter_before`` attribute is truthy and one for all other
    tokens. Progress is printed and the dictionaries are pickled at fixed
    sequence intervals.

    :param n_sequences: argument handed to ``Wikipedia.training_sequences``
    """
    # NOTE(review): the nesting below was reconstructed from whitespace-collapsed
    # source. As laid out, the counts are only dumped when the number of
    # processed sequences is a multiple of M - confirm against the original.
    total_start = timestamp()
    tokenizer = Tokenizer()
    counts_delim = {}
    counts_no_delim = {}
    tokenization_time = 0
    for s_i, sequence in enumerate(Wikipedia.training_sequences(n_sequences)):
        # Only the tokenizer call is included in the tokenization time.
        start = timestamp()
        tokens = tokenizer.tokenize(sequence)
        # The first token of a sequence is always counted as preceded by a delimiter.
        # NOTE(review): raises IndexError if the tokenizer returns no tokens.
        tokens[0].delimiter_before = True
        tokenization_time += time_diff(start)
        for token in tokens:
            # Select the dictionary matching the token's delimiter flag.
            counts = counts_delim if token.delimiter_before else counts_no_delim
            if token.text not in counts:
                counts[token.text] = 1
            else:
                counts[token.text] += 1
        # Progress report every K10 sequences.
        if (s_i + 1) % K10 == 0:
            print("%ik sequences, %.2f s total time, %.2f s tokenization" % ((s_i + 1) / K, time_diff(total_start), tokenization_time))
        # Checkpoint both dictionaries every M sequences.
        if (s_i + 1) % M == 0:
            print("saving...")
            dump_object(counts_delim, paths.UNIGRAM_DELIM_FREQUENCY_DICT)
            dump_object(counts_no_delim, paths.UNIGRAM_NO_DELIM_FREQUENCY_DICT)
def _save_encoder(self):
    """
    Pickles the encoder into the model directory (created first if
    necessary via make_directory). The file name is encoder.pkl.
    :return:
    """
    make_directory(self.model_dir())
    encoder = self.encoder
    encoder_path = self.model_dir() + "/encoder.pkl"
    dump_object(encoder, encoder_path)
def _save_specification(self):
    """
    Pickles the specification into the model directory (created first if
    necessary via make_directory). The file name is specification.pkl.
    :return:
    """
    make_directory(self.model_dir())
    specification = self.specification
    specification_path = self.model_dir() + "/specification.pkl"
    dump_object(specification, specification_path)
def most_frequent_wiki_and_all_aspell_word_counts(k_most_frequent):
    """Returns counts for the most frequent tokens plus every Aspell word.

    The result is cached: if the file for ``k_most_frequent`` already exists
    it is loaded and returned. Otherwise the mapping returned by
    ``most_frequent_tokens`` is extended with every word of the Aspell word
    file that it does not contain yet, using the word's Wikipedia token count
    or 0 if the word has none. The extended mapping is pickled and returned.

    :param k_most_frequent: passed to ``most_frequent_tokens`` and used to
        format the cache file path
    :return: mapping from word to count
    """
    path = paths.WIKI_AND_ASPELL_TOKEN_COUNTERS % k_most_frequent
    if file_exists(path):
        return load_object(path)
    words = most_frequent_tokens(k_most_frequent)
    wiki_word_counters = load_object(paths.WIKI_TOKEN_COUNTERS)
    with open(paths.ASPELL_WORD_FILE) as f:
        for line in f:
            # Remove only the line terminator. Slicing off the last character
            # unconditionally would truncate the final word when the file
            # does not end with a newline.
            word = line.rstrip("\n")
            if word not in words:
                words[word] = wiki_word_counters.get(word, 0)
    dump_object(words, path)
    return words
def get_stump_dict(unigrams: UnigramHolder) -> Dict[str, Set[str]]:
    """Returns the mapping from stumps to the tokens that produce them.

    A previously pickled dictionary is loaded if it exists. Otherwise the
    dictionary is built from all purely alphabetic tokens in
    ``unigrams.frequencies`` whose frequency is at least
    ``MIN_TOKEN_FREQUENCY``, pickled, and returned.

    :param unigrams: holder providing the tokens and their frequencies
    :return: dictionary mapping each stump to the set of its tokens
    """
    if file_exists(paths.STUMP_DICT):
        return load_object(paths.STUMP_DICT)

    tokens_by_stump = {}
    for token in unigrams.frequencies:
        # Skip non-alphabetic and rare tokens.
        if not token.isalpha() or unigrams.get(token) < MIN_TOKEN_FREQUENCY:
            continue
        for stump in get_stumps(token):
            tokens_by_stump.setdefault(stump, set()).add(token)
    dump_object(tokens_by_stump, paths.STUMP_DICT)
    return tokens_by_stump
def load_most_frequent(n):
    """Loads unigram frequencies, optionally restricted to the n most frequent.

    The delimiter and no-delimiter frequency dictionaries are loaded and
    summed per token. If ``n`` is given, the result is reduced with
    ``select_most_frequent`` and cached on disk; an existing cache file for
    the same ``n`` is returned directly.

    :param n: number of unigrams to keep, or None to keep all of them
    :return: dictionary mapping tokens to frequencies
    """
    use_cache = n is not None
    cache_path = paths.MOST_FREQUENT_UNIGRAMS_DICT % n if use_cache else None
    if use_cache and file_exists(cache_path):
        return load_object(cache_path)

    # The delimiter dictionary is extended in place with the other counts.
    merged = load_object(paths.UNIGRAM_DELIM_FREQUENCY_DICT)
    extra = load_object(paths.UNIGRAM_NO_DELIM_FREQUENCY_DICT)
    for token, count in extra.items():
        merged[token] = merged.get(token, 0) + count

    if use_cache:
        merged = select_most_frequent(merged, n)
        dump_object(merged, cache_path)
    return merged
def split_training_set():
    """Splits the training data set in single-file format into files of sequences of equal length.

    Existing split files are removed first. Every line of the training file
    is appended to the file ``<length>.txt`` in the split directory, where
    ``<length>`` is the number of characters of the line without its line
    terminator. The number of sequences per length is pickled at the end.
    """
    _remove_training_split_files()
    length_counts = {}
    # NOTE(review): the input is read with the platform default encoding while
    # the split files are written as UTF-8 - confirm both are meant to agree.
    with open(paths.WIKI_TRAINING_FILE) as training_file:
        for line in training_file:
            # Strip only the terminator. Slicing off the last character would
            # shorten a final line that has no trailing newline and file it
            # under the wrong length.
            sequence = line.rstrip("\n")
            seq_len = len(sequence)
            path = paths.WIKI_TRAINING_SPLIT_DIR + "%i.txt" % seq_len
            # Append mode: the per-length file is reopened for every sequence,
            # which keeps the number of simultaneously open files at one.
            with open(path, 'a', encoding="utf8") as file:
                file.write(sequence + "\n")
            length_counts[seq_len] = length_counts.get(seq_len, 0) + 1
    dump_object(length_counts, paths.WIKI_TRAINING_SEQUENCE_COUNTS)
def split_dataset(wiki_text_directory, out_directory, n_split):
    """
    Splits the article IDs of an extracted wikipedia dump into training,
    development and test sets via split_article_ids, pickles the three ID
    collections into the output directory and prints their sizes.

    :param wiki_text_directory: Link to a directory created by the WikiExtractor script.
        Assumes subdirectories to contain files where each line corresponds to an article json.
    :param out_directory: directory where the ID lists are stored; file names are
        appended directly, so it must end with a path separator
    :param n_split: number of articles of the training and test sets
    """
    training_ids, development_ids, test_ids = split_article_ids(
        wiki_text_directory, n_split)
    # Pair every ID collection with its output file.
    targets = [
        (training_ids, out_directory + "training_article_ids.pkl"),
        (development_ids, out_directory + "development_article_ids.pkl"),
        (test_ids, out_directory + "test_article_ids.pkl"),
    ]
    for ids, ids_path in targets:
        dump_object(ids, ids_path)
    sizes = tuple(len(ids) for ids, _ in targets)
    print(
        "Split dataset into %i training articles, %i development articles, %i test articles."
        % sizes)
def save(self):
    """Pickles ``self.threshold_dict`` to ``self.file``."""
    thresholds = self.threshold_dict
    target = self.file
    dump_object(thresholds, target)
def write_dataset_split_files(wiki_text_directory, dev_ids, test_ids):
    """
    Reads the articles from an extracted wikipedia dump and writes three files,
    each containing the paragraphs of one partition, one paragraph per line.
    Also dumps three lists containing the sequences as Sequence-objects
    (sequence ID, split, byte offset, byte length, character length).
    The output file names are defined in src.settings.paths.

    Articles whose ID is in neither ``dev_ids`` nor ``test_ids`` go to the
    training partition. Sequence IDs are assigned consecutively across all
    partitions in reading order.

    :param wiki_text_directory: directory of the wikipedia dump, containing folders with files
        containing articles as jsons
    :param dev_ids: set of article IDs for the development set
    :param test_ids: set of article IDs for the test set
    :return:
    """
    articles = get_article_jsons(wiki_text_directory)
    training_sequences = []
    development_sequences = []
    test_sequences = []
    sequence_id = 0
    # The context manager closes all three files even if an article fails.
    with open(paths.WIKI_TRAINING_FILE, 'wb') as training_file, \
            open(paths.WIKI_DEVELOPMENT_FILE, 'wb') as development_file, \
            open(paths.WIKI_TEST_FILE, 'wb') as test_file:
        for article in articles:
            article_id = article["id"]
            # The partition depends on the article only, so it is chosen once
            # per article. Each sequence is labelled with the split of the
            # file it is actually written to.
            if article_id in dev_ids:
                out_file = development_file
                sequences = development_sequences
                split = DatasetSplit.DEVELOPMENT
            elif article_id in test_ids:
                out_file = test_file
                sequences = test_sequences
                split = DatasetSplit.TEST
            else:
                out_file = training_file
                sequences = training_sequences
                split = DatasetSplit.TRAINING
            for paragraph in get_paragraphs(article):
                char_len = len(paragraph)
                encoded = (paragraph + '\n').encode(FILE_ENCODING)
                byte_offset = out_file.tell()
                out_file.write(encoded)
                # Measured from the file position; includes the newline.
                byte_len = out_file.tell() - byte_offset
                sequences.append(
                    Sequence(sequence_id, split, byte_offset, byte_len, char_len))
                sequence_id += 1
    # Drop the reference to the articles before pickling the sequence lists.
    del articles
    dump_object(training_sequences, paths.WIKI_TRAINING_SEQUENCES)
    dump_object(development_sequences, paths.WIKI_DEVELOPMENT_SEQUENCES)
    dump_object(test_sequences, paths.WIKI_TEST_SEQUENCES)
CANDIDATE_FILE = paths.DUMP_DIR + "tmp_punkt_candidates.pkl" COUNT_FILE = paths.DUMP_DIR + "tmp_punkt_counts.pkl" ABBREVIATIONS_FILE = paths.EXTENDED_PUNKT_ABBREVIATIONS if MODE == "candidates": all_tokens = set() for sequence in sequences(): for token in sequence.split(): all_tokens.add(token) candidates = set() for token in all_tokens: is_candidate, candidate = abbreviation_candidate(token) if is_candidate: candidates.add(candidate) dump_object(candidates, CANDIDATE_FILE) elif MODE == "print-candidates": candidates = load_object(CANDIDATE_FILE) for c in sorted(candidates): print(c) elif MODE == "count": counts = { candidate: [0, 0] for candidate in load_object(CANDIDATE_FILE) } for sequence in sequences(): tokens = sequence.split() for token in tokens: if token in counts:
space_state = model.step(state, space_label, include_sequence=False) no_space_state = state p_after_space = [] p_after_no_space = [] for j, label in enumerate(next_labels): char = model.encoder.decode_label(label) space_p = space_state["probabilities"][label] no_space_p = no_space_state["probabilities"][label] #print("", j, label, char, space_p, no_space_p) p_after_space.append(space_p) p_after_no_space.append(no_space_p) if j < LOOKAHEAD: space_state = model.step(space_state, label, include_sequence=False) no_space_state = model.step(no_space_state, label, include_sequence=False) case = Case(sequence_index=s_i, position=i, true_space=is_space, p_space=p_space, p_after_space=p_after_space, p_after_no_space=p_after_no_space) cases[-1].append(case) if model.specification.backward: cases[-1] = cases[-1][::-1] if (s_i + 1) % 1000 == 0: dump_object(cases, path) print("saved at", path) dump_object(cases, path) print("saved at", path)
def dump(self, fitter: ThresholdFitter):
    """Pickles the given fitter to the location returned by ``self._file()``.

    :param fitter: the threshold fitter to store
    """
    target = self._file()
    dump_object(fitter, target)
lines = [ line for line in lines if sum(1 if c == "?" else 0 for c in line) < 4 ] # remove lines with many ?s print(len(lines), "lines") write_lines(paths.ACL_CORPUS_DIR + split + ".txt", lines) random.shuffle(lines) write_lines(paths.ACL_CORPUS_DIR + split + "_shuffled.txt", lines) elif step == "dict": char_frequencies = {} for line in read_lines(paths.ACL_CORPUS_TRAINING_FILE): for char in line: if char not in char_frequencies: char_frequencies[char] = 1 else: char_frequencies[char] += 1 print("== FREQUENCIES ==") for char in sorted(char_frequencies): print(char, char_frequencies[char]) print("== ENCODER DICT ==") encoder_dict = {} for char in sorted(char_frequencies): if char_frequencies[char] > 10: encoder_dict[char] = len(encoder_dict) encoder_dict[symbols.SOS] = len(encoder_dict) encoder_dict[symbols.EOS] = len(encoder_dict) encoder_dict[symbols.UNKNOWN] = len(encoder_dict) print(encoder_dict) dump_object(encoder_dict, paths.ACL_ENCODER_DICT)
def save(self):
    """Pickles this object to ``paths.BIGRAM_HOLDER``."""
    target = paths.BIGRAM_HOLDER
    dump_object(self, target)
def save_training_results(self):
    """Pickles ``self.training_results`` to the file ``training_results.pkl``,
    located via ``self._path_to_file``."""
    results = self.training_results
    target = self._path_to_file("training_results.pkl")
    dump_object(results, target)
trainer = PunktTrainer() trainer.INCLUDE_ALL_COLLOCS = True trainer.ABBREV = 0.3 trainer.train(text, verbose=True) del text print("building tokenizer...") tokenizer = PunktSentenceTokenizer(trainer.get_params()) abbrevs = tokenizer._params.abbrev_types print(sorted(abbrevs)) print("%i abbreviations" % len(abbrevs)) target_abbrevs = [ "i.e", "e.g", "prof", "dr", "m.sc", "no", "nos", "mr", "mrs", "ms", "seq", "o.r.s" ] for target in target_abbrevs: print(target, target in abbrevs, score(trainer, target)) print("saving...") dump_object(tokenizer, PICKLE_FILE) elif MODE == "starters": tokenizer = load_object(PICKLE_FILE) for starter in sorted(tokenizer._params.sent_starters): print(starter) elif MODE == "starters-nltk": tokenizer = NLTKSentenceSplitter() for starter in sorted(tokenizer.tokenizer._params.sent_starters): print(starter)
print(len(training_lines), "lines") write_lines(paths.ARXIV_TRAINING_LINES, training_lines) print(sum(1 for line in training_lines if len(line) > 256), "length > 256") training_sentences = [] for line in training_lines: sentences = split_sentences(line) training_sentences.extend(sentences) print(len(training_sentences), "sentences") write_lines(paths.ARXIV_TRAINING_SEQUENCES, training_sentences) char_frequencies = {} for sentence in training_sentences: for char in sentence: if char not in char_frequencies: char_frequencies[char] = 1 else: char_frequencies[char] += 1 encoder = { char: i for i, char in enumerate( sorted(select_most_frequent(char_frequencies, 200))) } encoder[symbols.SOS] = len(encoder) encoder[symbols.EOS] = len(encoder) encoder[symbols.UNKNOWN] = len(encoder) print(encoder) dump_object(encoder, paths.ARXIV_ENCODER_DICT)
import project
from src.datasets.wikipedia import Wikipedia
from src.settings import paths
from src.helper.pickle import dump_object

# Progress is printed every K sequences; M scales the printed numbers to millions.
K = 100000
M = 10 * K


if __name__ == "__main__":
    # Maps every whitespace-separated token of the training sequences to its
    # number of occurrences.
    token_frequencies = {}
    for s_i, sequence in enumerate(Wikipedia.training_sequences()):
        if s_i % K == 0:
            n_unique = len(token_frequencies)
            print("%.1fM sequences, %.1fM tokens" % (s_i / M, n_unique / M))
        for token in sequence.split():
            token_frequencies[token] = token_frequencies.get(token, 0) + 1
    # Store the complete dictionary once all sequences are counted.
    dump_object(token_frequencies, paths.TOKEN_FREQUENCY_DICT)
import sys

import project
from src.helper.pickle import load_object, dump_object
from src.settings.paths import ESTIMATORS_DIR


if __name__ == "__main__":
    # The first command line argument is the estimator's directory name; it is
    # written into the stored specification's ``name`` attribute.
    name = sys.argv[1]
    path = ESTIMATORS_DIR + name + "/specification.pkl"
    specification = load_object(path)
    setattr(specification, "name", name)
    # Overwrite the pickle with the updated specification.
    dump_object(specification, path)
def save(self):
    """Pickles ``self.penalties`` to ``self.file``."""
    penalties = self.penalties
    target = self.file
    dump_object(penalties, target)
def save(self):
    """Pickles ``self.results`` to ``paths.RESULTS_DICT``."""
    results = self.results
    dump_object(results, paths.RESULTS_DICT)
def save(self):
    """Pickles this object to the file returned by ``self.benchmark_file()``."""
    target = self.benchmark_file()
    dump_object(self, target)
import sys

from project import src
from src.helper.files import read_sequences
from src.helper.data_structures import select_most_frequent
from src.settings import symbols
from src.helper.pickle import dump_object


if __name__ == "__main__":
    # Usage: <script> TEXT_FILE OUT_FILE
    text_file = sys.argv[1]
    out_file = sys.argv[2]

    # Count the characters of at most the first 10,000,000 lines.
    char_frequencies = {}
    for n_lines, line in enumerate(read_sequences(text_file), start=1):
        for char in line:
            char_frequencies[char] = char_frequencies.get(char, 0) + 1
        if n_lines % 100000 == 0:
            print(n_lines, "lines", len(char_frequencies), "unique characters")
        if n_lines == 10000000:
            break

    # The encoder assigns consecutive labels to the sorted selected characters,
    # followed by the three special symbols.
    chars = select_most_frequent(char_frequencies, 200)
    symbs = [symbols.SOS, symbols.EOS, symbols.UNKNOWN]
    encoder = {}
    for label, symbol in enumerate(sorted(chars) + symbs):
        encoder[symbol] = label
    dump_object(encoder, out_file)
"""
Counts the occurrences of all characters in the Wikipedia training set
and stores the counts on disk.
"""

import sys

from project import src
from src.data.wikipedia import Wikipedia
from src.helper.pickle import dump_object
from src.settings import paths


def count_chars(counters, sequence):
    """Adds the character counts of a sequence to a counter dictionary in place.

    :param counters: dictionary mapping characters to counts; modified in place
    :param sequence: the string whose characters are counted
    """
    for char in sequence:
        counters[char] = counters.get(char, 0) + 1


if __name__ == "__main__":
    # The first command line argument is handed to Wikipedia.training_sequences.
    n_sequences = int(sys.argv[1])
    char_counters = {}
    for n_done, sequence in enumerate(Wikipedia.training_sequences(n_sequences), start=1):
        count_chars(char_counters, sequence)
        if n_done % 100000 == 0:
            print("%i sequences processed" % n_done)
    print(char_counters)
    dump_object(char_counters, paths.CHARACTER_FREQUENCY_DICT)