def initialize(self):
    directory = self.benchmark_dir()
    correct_sequences = read_lines(directory + "correct.txt")
    corrupt_sequences = read_lines(directory + "corrupt.txt")
    self.sequences.extend([
        BenchmarkSequence(correct_sequence, corrupt_sequence)
        for correct_sequence, corrupt_sequence
        in zip(correct_sequences, corrupt_sequences)
    ])

def main(args):
    corrector = HunspellSpellChecker()
    if args.input is None:
        sequences = interactive_sequences()
    else:
        sequences = read_lines(args.input)
    for sequence in sequences:
        corrected = corrector.correct(sequence)
        print(corrected)

def read_typos(typo_dict, test: bool):
    file_name = "typos_test.txt" if test else "typos_training.txt"
    for line in read_lines(TYPO_DIR + file_name):
        vals = line.split(" ")
        correct = vals[0]
        for i in range(1, len(vals), 2):
            misspelling = vals[i]
            frequency = int(vals[i + 1])
            add_typo(typo_dict, correct, misspelling, frequency)

def read_typos(path):
    typos = []
    for line in read_lines(path):
        vals = line.split(" ")
        correct = vals[0]
        for i in range(1, len(vals), 2):
            misspelling = vals[i]
            frequency = int(vals[i + 1])
            for _ in range(frequency):
                typos.append((correct, misspelling))
    return typos

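# Usage sketch, not part of the original scripts (the file name is
# hypothetical). Because read_typos repeats each (correct, misspelling)
# pair `frequency` times, random.choice effectively samples misspellings
# proportionally to their observed frequency:
import random

typos = read_typos("typos_training.txt")
correct, misspelling = random.choice(typos)  # frequency-weighted draw
print(correct, "->", misspelling)
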
def get_article_jsons(wiki_text_directory):
    """
    Reads all articles as jsons from an extracted Wikipedia dump.

    :param wiki_text_directory: Link to a directory created by the
        WikiExtractor script. Assumes subdirectories to contain files where
        each line corresponds to an article json.
    :return: iterator over article jsons
    """
    json_files = get_files_depth_two(wiki_text_directory)
    for file in json_files:
        lines = read_lines(file)
        for line in lines:
            article = json.loads(line)
            yield article

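# Hypothetical usage (the directory name is made up): WikiExtractor's --json
# mode writes one JSON object per article, typically with "title" and "text"
# fields, so the generator can be consumed like this:
for article in get_article_jsons("wikipedia_extracted/"):
    print(article["title"], len(article["text"]))
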
def read_error_dict(tsv_file: str,
                    min_frequency: int = 1) -> Dict[str, List[Tuple[str, int]]]:
    errors = {}
    for line in read_lines(tsv_file):
        wrong, correct, freq = line.split("\t")
        freq = int(freq)
        if freq >= min_frequency:
            if len(correct) <= 3 and " " not in wrong and " " not in correct:
                if correct not in errors:
                    errors[correct] = [(wrong, freq)]
                else:
                    errors[correct].append((wrong, freq))
    return errors

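# Usage sketch under the assumption that the TSV stores one
# wrong<TAB>correct<TAB>frequency triple per line (e.g. "rn\tm\t42");
# the file name is illustrative. Only corrections of up to 3 characters
# are kept, and the empty-string key corresponds to character insertions:
errors = read_error_dict("ocr_errors.tsv", min_frequency=5)
for correct, candidates in errors.items():
    print(repr(correct), candidates[:3])
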
def main(args):
    inducer = ACLNoiseInducer(args.p, 0.2079, args.seed)
    if args.print_insertion_prob:
        error_dict = inducer.error_dict
        total_count = 0
        insertion_count = 0
        for correct in error_dict:
            for wrong, freq in error_dict[correct]:
                total_count += freq
                if correct == "":
                    insertion_count += freq
        insertion_prob = insertion_count / total_count
        print(len([e for e, f in error_dict[""] if f >= 0]), "insertions")
        print(f"{insertion_prob * 100:.2f}% char insertions ({insertion_count}/{total_count})")
    if args.runtime:
        sequence = "Tokenization Repair in the Presence of Spelling Errors"
        start_time = timestamp()
        corrupt_sequences = []
        for _ in range(100):
            corrupt_sequences.append(inducer.induce_noise(sequence))
        runtime = time_diff(start_time)
        for s in corrupt_sequences:
            print(s)
        print(runtime)
    elif args.input_file:
        out_file = open(args.output_file, "w") if args.output_file else None
        lines = read_lines(args.input_file)
        for line in lines:
            corrupt = inducer.induce_noise(line)
            print(corrupt)
            if out_file is not None:
                out_file.write(corrupt + "\n")
        if out_file is not None:
            out_file.close()
    else:
        while True:
            sequence = input("> ")
            for _ in range(100):
                corrupt = inducer.induce_noise(sequence)
                print(corrupt)

def read_data(path: str) -> List[float]:
    lines = read_lines(path)
    data = [float(line) for line in lines]
    return data

tokenization_error_rates = [
    rate for rate in tokenization_error_rates if rate > 0
]
params_ocr = fit_distribution(ocr_error_rates, distribution, fscale)
print("ocr", params_ocr)
params_tokenization = fit_distribution(tokenization_error_rates, distribution, fscale)
print("tokenization", params_tokenization)
ocr_noise_inducer = ACLNoiseInducer(p=0, insertion_prob=0.2079, seed=seed)
hyphenator = HyphenationIntroducer(hyphenation_rate)
with open(out_dir + "/correct.txt", "w") as correct_file, \
        open(out_dir + "/corrupt.txt", "w") as corrupt_file:
    for sequence in read_lines(in_file):
        print(sequence)
        spans = create_sequence_spans(sequence)
        misspelled_spans = []
        mistokenized_spans = []
        for span in spans:
            p_ocr = sample_distribution(params_ocr, distribution)
            ocr_noise_inducer.p = p_ocr
            p_space = sample_distribution(params_tokenization, distribution)
            misspelled = hyphenator.introduce_hyphens(span)
            if not (zero and flip_coin(random, ocr_p_zero)):
                misspelled = ocr_noise_inducer.induce_noise(misspelled)
            if not (zero and flip_coin(random, tokenization_p_zero)):
                mistokenized = corrupt_tokenization(misspelled, p_space)
            else:
                # assumption: the span stays unchanged when tokenization noise is skipped
                mistokenized = misspelled

files = os.listdir(directory)
prefix_input = "[OCR_toInput] "
prefix_aligned = "[OCR_aligned] "
prefix_truth = "[ GS_aligned] "
NO_GS_SYMBOL = "#"
ALIGNMENT_SYMBOL = "@"
if out_dir is not None:
    corrupt_file = open(out_dir + "/corrupt.txt", "w")
    ground_truth_file = open(out_dir + "/spelling.txt", "w")
for file in files:
    print(file)
    corrupt, aligned, truth = read_lines(directory + "/" + file)
    corrupt = corrupt[len(prefix_input):]
    aligned = aligned[len(prefix_aligned):]
    truth = truth[len(prefix_truth):]
    print(len(corrupt), len(aligned), len(truth))
    input_sequences, ground_truth_sequences = create_input(corrupt, aligned, truth)
    for s_in, s_true in zip(input_sequences, ground_truth_sequences):
        print(s_in)
        print(s_true)
        if out_dir is not None:
            corrupt_file.write(s_in + "\n")
            ground_truth_file.write(s_true + "\n")
if out_dir is not None:
    corrupt_file.close()
    ground_truth_file.close()

    split = [val for val in split if len(val) > 0]
    split_pt = 0
    acc_len = 0
    while acc_len < len(input):
        acc_len += len(split[split_pt])
        split_pt += 1
    predicted = ' '.join(split[:split_pt])
    correct = ' '.join(split[(split_pt + 1):])
    return predicted, correct


if __name__ == "__main__":
    in_file = sys.argv[1]
    lines_per_case = int(sys.argv[2])
    mode = sys.argv[3]  # correct, corrupt, predicted
    lines = read_lines(in_file)
    lines = [line[:-1] if line.endswith('\t') else line for line in lines]
    n_cases = len(lines) // lines_per_case
    for i in range(n_cases):
        if lines_per_case == 3:
            input = lines[lines_per_case * i]
            result = lines[lines_per_case * i + 1]
        else:
            result = lines[lines_per_case * i]
            input = ''.join(result.split(" vs ")[0].split(' ')[1:])
        if result.startswith(CORRECT_MARKER):
            correct = predicted = result[(len(CORRECT_MARKER) + 1):]
        elif result.startswith(FALSE_MARKER):
            predicted, correct = extract_sequences(result, input)
        else:
            raise Exception()

        positions_per_token=constants.POSITIONS_PER_TOKEN,
        token_pairs_per_token=constants.TOKEN_PAIRS_PER_TOKEN,
        seed=13052021)
    benchmark_name += ".spaces"
else:
    corruptor = SpaceRemover()
    benchmark_name += ".no_spaces"
out_dir = "/home/hertel/tokenization-repair-dumps/data/benchmarks/" + benchmark_name + "/"
if not os.path.exists(out_dir):
    os.mkdir(out_dir)
i = 0
for subset in ("tuning", "development", "test"):
    if typos:
        test = subset == "test"
        typo_inducer = TypoNoiseInducer(0.1, seed=20210513 + i, test=test)
        i += 1
    subdir = out_dir + subset + "/"
    if not os.path.exists(subdir):
        os.mkdir(subdir)
    with open(subdir + "correct.txt", "w") as correct_file, \
            open(subdir + "corrupt.txt", "w") as corrupt_file:
        sequences = read_lines(in_dir + subset + ".txt")
        for correct in sequences:
            if typos:
                correct = typo_inducer.corrupt(correct)
            corrupt = corruptor.corrupt(correct)
            print(corrupt)
            correct_file.write(correct + "\n")
            corrupt_file.write(corrupt + "\n")

import numpy as np
import sys

import project
from src.helper.files import read_lines, file_exists
from src.benchmark.benchmark import all_benchmarks, ERROR_PROBABILITIES, Subset, BenchmarkFiles


if __name__ == "__main__":
    file_name = sys.argv[1]
    per_chars = 1000
    t_mean = []
    t_normalized = []
    for benchmark in all_benchmarks(Subset.TEST):
        print("== %s ==" % benchmark.name)
        path = benchmark.get_results_directory() + file_name
        total_runtime = float(read_lines(path)[-1]) if file_exists(path) else 0
        mean_runtime = total_runtime / 10000  # assumes 10,000 sequences per benchmark
        t_mean.append(mean_runtime)
        print("mean = %.2f" % mean_runtime)
        n_chars = sum(len(sequence) for sequence in benchmark.get_sequences(BenchmarkFiles.CORRUPT))
        normalized_runtime = total_runtime / n_chars * per_chars
        t_normalized.append(normalized_runtime)
        print("normalized(%i chars) = %.2f" % (per_chars, normalized_runtime))
    print("== total ==")
    print("mean = %.2f" % np.mean(t_mean))
    print("normalized(%i chars) = %.2f" % (per_chars, np.mean(t_normalized)))

import random

import project
from src.helper.files import get_files, read_lines
from select_acl_articles import get_year


def preprocess(line: str):
    tokens = [token for token in line.split() if len(token) > 0]
    line = " ".join(tokens)
    return line


if __name__ == "__main__":
    random.seed(42)
    acl_dir = "/home/hertel/tokenization-repair-dumps/nastase/acl-201302_word-resegmented/raw/"
    files = sorted(get_files(acl_dir))
    files = [file for file in files if get_year(file) >= 2005]
    examples = []
    for file in files:
        lines = read_lines(acl_dir + file)
        lines = [preprocess(line) for line in lines]
        lines = [line for line in lines if len(line) > 0]
        examples.extend(lines)
    random.shuffle(examples)
    for line in examples:
        print(line)

import project
from src.helper.files import read_lines, write_lines
from src.settings import paths


if __name__ == "__main__":
    path = paths.BENCHMARKS_DIR + "doval/test/"
    corrupt_sequences = [
        line.replace(' ', '') for line in read_lines(path + "correct.txt")
    ]
    write_lines(path + "corrupt.txt", corrupt_sequences)

def load(self):
    # The results file holds one predicted sequence per line; the last line
    # stores the total runtime.
    lines = read_lines(self.file)
    self.predicted_sequences = lines[:-1]
    self.runtime = float(lines[-1])

import sys

import project
from src.helper.files import read_lines


if __name__ == "__main__":
    lines = read_lines(sys.argv[1])
    for line in lines:
        print(line.replace(" ", ""))

def is_word(token):
    if len(token) == 0:
        return False  # assumption: empty tokens never count as words
    n_letters = len([char for char in token if char.isalpha()])
    return n_letters / len(token) >= THRESHOLD


def remove_symbols(word):
    return "".join(char for char in word if char.isalnum())


if __name__ == "__main__":
    input_file = sys.argv[1]
    tokenized_file = sys.argv[2]
    n = -1 if len(sys.argv) < 4 else int(sys.argv[3])
    input_lines = read_lines(input_file)
    tokenized_lines = read_lines(tokenized_file)
    if n > 0:
        input_lines = input_lines[:n]
    for sequence, tokenized in zip(input_lines, tokenized_lines):
        input_spaces = get_space_positions(sequence)
        tokenized_spaces = get_space_positions(tokenized)
        tokens = get_tokens(sequence, input_spaces, tokenized_spaces)
        postprocessed_tokens = []
        for token, space_removed in tokens:
            prefix, word, suffix = strip_token(token)
            if space_removed and is_word(word):
                removed = remove_symbols(word)
                postprocessed_tokens.append(prefix + removed + suffix)
            else:
                # assumption: tokens without a removed space are kept unchanged
                postprocessed_tokens.append(token)

        TokenErrorType.TOKENIZATION_ERROR,
        TokenErrorType.OCR_ERROR,
        TokenErrorType.MIXED
    }
    error_name_label = "Total error"
elif analysis_type == "spelling":
    error_types = {TokenErrorType.OCR_ERROR, TokenErrorType.MIXED}
    error_name_label = "Spelling error"
else:
    error_types = {TokenErrorType.TOKENIZATION_ERROR}
    error_name_label = "Tokenization error"
absolute_values = []
error_rates = []
for s_i, (correct, corrupt) in enumerate(
        zip(read_lines(folder + "spelling.txt"), read_lines(folder + "corrupt.txt"))):
    token_errors = get_token_edit_labels(correct, corrupt)
    n_tokens = len(token_errors)
    if n_tokens < 30:  # or n_tokens > 40
        continue
    n_spelling_errors = len([error for error in token_errors if error in error_types])
    error_rate = n_spelling_errors / n_tokens
    print(n_spelling_errors, n_tokens, error_rate)
    absolute_values.append(n_spelling_errors)
    error_rates.append(error_rate)
    if s_i + 1 == n:
        break
subtitle = "ACL development set (%i sequences)" % n

removed = "" while i < len(sequence): if i > 0 and i + 2 < len(sequence) and sequence[i - 1].isalpha() and sequence[i:(i + 2)] in ("- ", "‑ ") \ and sequence[i + 2].isalpha(): i += 2 else: removed += sequence[i] i += 1 return removed def correct_ground_truth(sequence): sequence = sequence.replace("ſ", "s") sequence = sequence.replace("fi", "fi") sequence = sequence.replace("ff", "ff") sequence = sequence.replace("ffi", "ffi") for punctuation in ",;?!:)”": sequence = sequence.replace(" " + punctuation, punctuation) for punctuation in "(“": sequence = sequence.replace(punctuation + " ", punctuation) sequence = remove_hyphenation(sequence) return sequence if __name__ == "__main__": in_file = sys.argv[1] for line in read_lines(in_file): correct = correct_ground_truth(line) print(correct)
import sys
import random

from project import src
from src.helper.files import read_lines, write_lines
from src.settings import paths
from src.arxiv.dataset import match_lines, to_input_file


if __name__ == "__main__":
    test = "test" in sys.argv
    random.seed(20201026)
    files_file = paths.ARXIV_TEST_FILES if test else paths.ARXIV_DEVELOPMENT_FILES
    subset_name = "test" if test else "development"
    files = read_lines(files_file)
    pairs = []
    for file in files:
        true_path = paths.ARXIV_GROUND_TRUTH_DIR + file
        input_path = paths.PDF_EXTRACT_DIR + to_input_file(file)
        matched = match_lines(true_path, input_path)
        pairs.extend(matched)
    random.shuffle(pairs)
    path = paths.BENCHMARKS_DIR + "arxiv/" + subset_name + "/"
    correct_sequences = [correct for _, correct in pairs]
    corrupt_sequences = [corrupt for corrupt, _ in pairs]
    write_lines(path + "correct.txt", correct_sequences)
    write_lines(path + "corrupt.txt", corrupt_sequences)

import random

from project import src
from src.helper.files import read_lines, write_lines


if __name__ == "__main__":
    random.seed(20201108)
    files_file = "/home/hertel/tokenization-repair-dumps/claudius/groundtruth-with-normalized-formulas/training.txt"
    base_dir = "/".join(files_file.split("/")[:-1]) + "/"
    print(base_dir)
    out_file = base_dir + "training_paragraphs.txt"
    files = read_lines(files_file)
    paragraphs = []
    print("reading...")
    for file in files:
        lines = read_lines(base_dir + file)
        lines = [" ".join(line.split()).strip() for line in lines]
        lines = [line for line in lines if len(line) > 0 and line != "[formula]"]
        paragraphs.extend(lines)
    print("shuffling...")
    random.shuffle(paragraphs)
    print("writing...")
    write_lines(out_file, paragraphs)

if __name__ == "__main__":
    path = "benchmarks/ACL/development/"
    corrupt_file = path + "corrupt.txt"
    ground_truth_file = path + "spelling.txt"
    out_path = "char_error_distributions/"
    out_file_ocr = out_path + "span_ocr_error_rates.txt"
    out_file_tokenization = out_path + "span_tokenization_error_rates.txt"
    out_file_spans = out_path + "spans.txt"
    out_file_ocr = open(out_file_ocr, "w")
    out_file_tokenization = open(out_file_tokenization, "w")
    out_file_spans = open(out_file_spans, "w")
    for i, (corrupt, correct) in enumerate(
            zip(read_lines(corrupt_file), read_lines(ground_truth_file))):
        # print(i)
        # print(corrupt)
        # print(correct)
        erroneous_spans = get_erroneous_spans(corrupt, correct)
        for corrupt_span, correct_span in erroneous_spans:
            if corrupt_span.replace("-", "") != correct_span and len(correct_span) > 0:
                n_tokens = correct_span.count(" ") + 1
                char_edits = get_ocr_character_edits(corrupt_span, correct_span)
                n_chars = len(correct_span)
                n_space_edits = 0
                n_space_insertions = 0
                n_space_deletions = 0
                n_ocr_edits = 0

import sys

from hyphen import Hyphenator  # assumed import: Hyphenator matches the PyHyphen API

from src.helper.files import read_lines
from acl_cleaned_get_ocr_errors import get_ocr_errors
from acl_cleaned_analyse_ocr_errors import get_ocr_character_edits


if __name__ == "__main__":
    raw_file = sys.argv[1]
    clean_file = sys.argv[2]
    out_file = sys.argv[3] if len(sys.argv) > 3 else None
    hyphenator = Hyphenator()
    char_error_rates = []
    total_hyphen_edits = 0
    total_tokens = 0
    hyphenable_tokens = 0
    for i, (corrupt, correct) in enumerate(zip(read_lines(raw_file), read_lines(clean_file))):
        corrupt_tokens = corrupt.split()
        correct_tokens = correct.split()
        total_tokens += len(correct_tokens)
        for t in correct_tokens:
            try:
                if len(hyphenator.pairs(t)) > 0:
                    hyphenable_tokens += 1
            except IndexError:
                pass
        ocr_errors = get_ocr_errors(corrupt_tokens, correct_tokens)
        print(i + 1, ocr_errors)
        n_char_edits = 0
        n_hyphen_edits = 0
        for erroneous, corrected in ocr_errors:
            char_edits = get_ocr_character_edits(erroneous, corrected)

    else:
        deletion_positions.add(pos)
    deduced = ""
    for i, char in enumerate(corrupt):
        if i in insertion_positions:
            deduced += " "
        if i not in deletion_positions:
            deduced += char
    return deduced


if __name__ == "__main__":
    benchmark = sys.argv[1]
    subset = sys.argv[2]
    input_file = paths.BENCHMARKS_DIR + benchmark + "/" + subset + "/corrupt.txt"
    predicted_file = paths.DUMP_DIR + "spelling/" + benchmark + "/" + subset + "/google.txt"
    out_file = paths.RESULTS_DIR + benchmark + "/" + subset + "/google_deduced.txt"
    input_sequences = read_lines(input_file)
    predicted_sequences = read_lines(predicted_file)
    with open(out_file, "w") as f:
        for corrupt, predicted in zip(input_sequences, predicted_sequences):
            tokenized = deduce_tokenization(corrupt, predicted)
            if corrupt != predicted:
                print(corrupt)
                print(predicted)
                print(tokenized)
            f.write(tokenized + "\n")

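# Illustration of the intent, inferred from the surrounding script rather
# than taken from the original: only the space edits of the spell checker's
# prediction are applied to the corrupt text, while its character edits are
# discarded. For example, corrupt "algo rithn" with prediction "algorithm"
# would be deduced as "algorithn".
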
        return True
    return False


def split_sentences(text):
    remerged = []
    for sentence in sent_tokenize(text):
        if len(remerged) == 0 or not is_wrong_split(sentence):
            remerged.append(sentence)
        else:
            remerged[-1] = remerged[-1] + sentence
    return remerged


if __name__ == "__main__":
    training_files = read_lines(paths.ARXIV_TRAINING_FILES)
    training_lines = []
    for file in training_files[1:]:
        lines = read_training_lines(paths.ARXIV_GROUND_TRUTH_DIR + file)
        training_lines += lines
    training_lines = [
        line for line in training_lines
        if line not in ("=", "[formula]", ".125in") and ".25in" not in line
    ]
    print(len(training_lines), "lines")
    write_lines(paths.ARXIV_TRAINING_LINES, training_lines)
    print(sum(1 for line in training_lines if len(line) > 256), "length > 256")

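# Hypothetical usage (the input text is made up): sentences that NLTK's
# sent_tokenize splits off incorrectly, as judged by is_wrong_split, are
# merged back into their predecessor:
from nltk.tokenize import sent_tokenize

for sentence in split_sentences("Results are shown in Fig. 3 and discussed below."):
    print(sentence)
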
import sys

import project
from src.helper.files import read_lines


if __name__ == "__main__":
    n_files = len(sys.argv) // 2
    weighted_files = [(sys.argv[2 * i + 1], int(sys.argv[2 * (i + 1)])) for i in range(n_files)]
    ocr_error_frequencies = {}
    for file, weight in weighted_files:
        for line in read_lines(file):
            corrupt, correct, frequency = line.split("\t")
            frequency = int(frequency) * weight
            pair = (corrupt, correct)
            if pair in ocr_error_frequencies:
                ocr_error_frequencies[pair] += frequency
            else:
                ocr_error_frequencies[pair] = frequency
    pairs = sorted(ocr_error_frequencies,
                   key=lambda pair: ocr_error_frequencies[pair],
                   reverse=True)
    for pair in pairs:
        print("\t".join([pair[0], pair[1], str(ocr_error_frequencies[pair])]))

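# Hypothetical invocation (the script and file names are made up): merge two
# OCR error files, weighting the first twice as much as the second:
#   python3 merge_ocr_error_frequencies.py acl_errors.tsv 2 icdar_errors.tsv 1
# Each input line is corrupt<TAB>correct<TAB>frequency; the merged counts are
# printed in the same TSV format, sorted by descending frequency.
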
    char_edits = []
    for gap_raw, gap_clean in gaps:
        err_raw = raw[gap_raw[0]:(gap_raw[1] + 1)]
        err_clean = cleaned[gap_clean[0]:(gap_clean[1] + 1)]
        char_edits.append((err_raw, err_clean))
    return char_edits


if __name__ == "__main__":
    # in_file = "/home/hertel/tokenization-repair-dumps/nastase/ocr_errors.txt"
    in_file = sys.argv[1]  # "icdar_ocr_errors.txt"
    print_readable = False
    error_frequencies = {}
    for l_i, line in enumerate(read_lines(in_file)):
        if "\t" in line:
            raw, cleaned = line.split("\t")
            char_edits = get_ocr_character_edits(raw, cleaned)
            for err_raw, err_clean in char_edits:
                # print(f"'{err_raw}' -> '{err_clean}'", gap_raw, gap_clean)
                if (err_raw, err_clean) not in error_frequencies:
                    error_frequencies[(err_raw, err_clean)] = 1
                else:
                    error_frequencies[(err_raw, err_clean)] += 1
        else:
            pass  # print(line)
    errors = sorted(error_frequencies,
                    key=lambda x: error_frequencies[x],
                    reverse=True)

import sys

from hyphen import Hyphenator  # assumed import: Hyphenator matches the PyHyphen API

from src.helper.files import read_lines
from acl_cleaned_get_ocr_errors import get_ocr_errors
from acl_cleaned_analyse_ocr_errors import get_ocr_character_edits


if __name__ == "__main__":
    raw_file = sys.argv[1]
    clean_file = sys.argv[2]
    out_file = sys.argv[3] if len(sys.argv) > 3 else None
    hyphenator = Hyphenator()
    error_frequencies = {}
    for i, (corrupt, correct) in enumerate(zip(read_lines(raw_file), read_lines(clean_file))):
        print(f"** SEQUENCE {i} **")
        corrupt_tokens = corrupt.split()
        correct_tokens = correct.split()
        ocr_errors = get_ocr_errors(corrupt_tokens, correct_tokens)
        for corrupt, correct in ocr_errors:
            corrupt_parts = corrupt.split(" ")
            correct_parts = correct.split(" ")
            if len(corrupt_parts) != len(correct_parts):
                continue
            for corrupt_part, correct_part in zip(corrupt_parts, correct_parts):
                edits = get_ocr_character_edits(correct_part, corrupt_part)
                edits = [e for e in edits if e != ("", "-")]
                if len(edits) > 0:
                    n_char_edits = []
                    token_len = len(correct_part)

    tuning_path = base_path + "tuning/"
    development_path = base_path + "development/"
    test_path = base_path + "test/"
    return tuning_path, development_path, test_path


def file_paths(dir_path: str):
    correct_path = dir_path + "correct.txt"
    corrupt_path = dir_path + "corrupt.txt"
    return correct_path, corrupt_path


if __name__ == "__main__":
    from src.helper.files import read_lines

    tuning_sequences = read_lines(paths.WIKI_TUNING_SENTENCES)
    development_sequences = read_lines(paths.WIKI_DEVELOPMENT_SENTENCES)
    test_sequences = read_lines(paths.WIKI_TEST_SENTENCES)
    for noise_level in NOISE_LEVELS:
        tuning_ground_truth_sequences = insert_noise(tuning_sequences, noise_level)
        development_ground_truth_sequences = insert_noise(development_sequences, noise_level)
        test_ground_truth_sequences = insert_noise(test_sequences, noise_level)
        for p in ERROR_PROBABILITIES:
            print(noise_level, p)
            tuning_corrupt_sequences = corrupt_tokenization(tuning_ground_truth_sequences, p)
            development_corrupt_sequences = corrupt_tokenization(
                development_ground_truth_sequences, p)
