# Builds the arXiv benchmark: matches ground-truth lines with the corresponding
# PDF-extracted lines and writes shuffled correct/corrupt pairs for the
# development or test subset.
import sys
import random

from project import src
from src.helper.files import read_lines, write_lines
from src.settings import paths
from src.arxiv.dataset import match_lines, to_input_file


if __name__ == "__main__":
    test = "test" in sys.argv

    random.seed(20201026)

    files_file = paths.ARXIV_TEST_FILES if test else paths.ARXIV_DEVELOPMENT_FILES
    subset_name = "test" if test else "development"
    files = read_lines(files_file)

    # Collect (corrupt, correct) line pairs from all files of the subset.
    pairs = []
    for file in files:
        true_path = paths.ARXIV_GROUND_TRUTH_DIR + file
        input_path = paths.PDF_EXTRACT_DIR + to_input_file(file)
        matched = match_lines(true_path, input_path)
        pairs.extend(matched)

    random.shuffle(pairs)

    path = paths.BENCHMARKS_DIR + "arxiv/" + subset_name + "/"
    correct_sequences = [correct for _, correct in pairs]
    corrupt_sequences = [corrupt for corrupt, _ in pairs]
    write_lines(path + "correct.txt", correct_sequences)
    write_lines(path + "corrupt.txt", corrupt_sequences)
def save(self):
    # Write the predicted sequences to the results file, with the runtime as the last line.
    lines = self.predicted_sequences + [str(self.runtime)]
    write_lines(self.file, lines)
test_ground_truth_sequences = insert_noise(test_sequences, noise_level)
for p in ERROR_PROBABILITIES:
    print(noise_level, p)
    tuning_corrupt_sequences = corrupt_tokenization(tuning_ground_truth_sequences, p)
    development_corrupt_sequences = corrupt_tokenization(development_ground_truth_sequences, p)
    test_corrupt_sequences = corrupt_tokenization(test_ground_truth_sequences, p)

    tuning_path, development_path, test_path = benchmark_directories(noise_level, p)
    make_directory_recursive(tuning_path)
    make_directory_recursive(development_path)
    make_directory_recursive(test_path)

    tune_correct_path, tune_corrupt_path = file_paths(tuning_path)
    dev_correct_path, dev_corrupt_path = file_paths(development_path)
    test_correct_path, test_corrupt_path = file_paths(test_path)

    if TUNING:
        write_lines(tune_correct_path, tuning_ground_truth_sequences)
        write_lines(tune_corrupt_path, tuning_corrupt_sequences)
    if DEVELOPMENT:
        write_lines(dev_correct_path, development_ground_truth_sequences)
        write_lines(dev_corrupt_path, development_corrupt_sequences)
    if TEST:
        write_lines(test_correct_path, test_ground_truth_sequences)
        write_lines(test_corrupt_path, test_corrupt_sequences)
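# Hedged illustration (not taken from the project): corrupt_tokenization above is used
# as a black box, but a tokenization-corruption step of this kind can be pictured as
# randomly deleting existing spaces and inserting spurious ones with probability p.
# The function name and example below are assumptions for illustration only.
import random


def corrupt_spaces(sequence, p):
    chars = []
    for i, char in enumerate(sequence):
        if char == " ":
            if random.random() < p:
                continue  # delete an existing space
            chars.append(char)
        else:
            chars.append(char)
            # insert a spurious space, but never directly before an existing one
            if random.random() < p and i + 1 < len(sequence) and sequence[i + 1] != " ":
                chars.append(" ")
    return "".join(chars)


random.seed(20201026)
print(corrupt_spaces("the quick brown fox", p=0.2))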
if __name__ == "__main__":
    training_files = read_lines(paths.ARXIV_TRAINING_FILES)

    # Collect the ground-truth lines of all training files (the first file is skipped)
    # and filter out formula and spacing artifacts.
    training_lines = []
    for file in training_files[1:]:
        lines = read_training_lines(paths.ARXIV_GROUND_TRUTH_DIR + file)
        training_lines += lines
    training_lines = [line for line in training_lines
                      if line not in ("=", "[formula]", ".125in") and ".25in" not in line]
    print(len(training_lines), "lines")
    write_lines(paths.ARXIV_TRAINING_LINES, training_lines)
    print(sum(1 for line in training_lines if len(line) > 256), "length > 256")

    # Split the lines into sentences, which serve as the training sequences.
    training_sentences = []
    for line in training_lines:
        sentences = split_sentences(line)
        training_sentences.extend(sentences)
    print(len(training_sentences), "sentences")
    write_lines(paths.ARXIV_TRAINING_SEQUENCES, training_sentences)

    # Count character frequencies over the training sentences.
    char_frequencies = {}
    for sentence in training_sentences:
        for char in sentence:
            if char not in char_frequencies:
                char_frequencies[char] = 1
            else:
                char_frequencies[char] += 1
if __name__ == "__main__":
    random.seed(20201026)

    files = get_files()

    matched_files = []
    unmatched_files = []
    for file in files:
        truth_file = paths.ARXIV_GROUND_TRUTH_DIR + file
        input_file = paths.PDF_EXTRACT_DIR + to_input_file(file)
        matched = match_lines(truth_file, input_file)
        print(truth_file, input_file, len(matched))
        if len(matched) > 0:
            matched_files.append(file)
        else:
            unmatched_files.append(file)

    print("%i matched" % len(matched_files))
    print("%i unmatched" % len(unmatched_files))

    random.shuffle(matched_files)
    write_lines(paths.ARXIV_DEVELOPMENT_FILES, sorted(matched_files[:1000]))
    write_lines(paths.ARXIV_TEST_FILES, sorted(matched_files[1000:2000]))
    training_files = sorted(matched_files[2000:] + unmatched_files)
    write_lines(paths.ARXIV_TRAINING_FILES, training_files)
print("** " + file + " **")
sequences = read_lines(in_dir + file)
sequences = [s.strip() for s in sequences]

repaired_sequences = []
i = 0
while i < len(sequences):
    # Join consecutive lines into one batch as long as a line ends with a hyphen.
    batch_sequences = [sequences[i]]
    while batch_sequences[-1][-1] == "-" and i + 1 < len(sequences):
        i += 1
        batch_sequences.append(sequences[i])
    i += 1
    batch = "".join(batch_sequences)
    predicted = corrector.correct(batch)
    if len(batch_sequences) == 1:
        repaired_sequences.append(predicted)
    else:
        # Split the corrected batch back into the original lines. Assuming the corrector
        # changes only the spaces, the cumulative counts of non-space characters per
        # original line mark the split positions in the corrected text.
        split_positions = set(np.cumsum([len(seq.replace(" ", "")) for seq in batch_sequences]))
        start = 0
        nospace_chars = 0
        for pos in range(len(predicted)):
            if predicted[pos] != " ":
                nospace_chars += 1
                if nospace_chars in split_positions:
                    seq = predicted[start:(pos + 1)].strip()
                    repaired_sequences.append(seq)
                    start = pos + 1
write_lines(out_dir + file, repaired_sequences)
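# Minimal self-contained sketch of the splitting step above (the function name and the
# example are illustrative, not part of the project): assuming the corrector changes
# only the spaces, the cumulative counts of non-space characters per original line
# identify where each line ends inside the corrected batch.
import numpy as np


def split_repaired_batch(original_lines, repaired_batch):
    boundaries = set(np.cumsum([len(line.replace(" ", "")) for line in original_lines]))
    parts, start, nospace_chars = [], 0, 0
    for pos, char in enumerate(repaired_batch):
        if char != " ":
            nospace_chars += 1
            if nospace_chars in boundaries:
                parts.append(repaired_batch[start:(pos + 1)].strip())
                start = pos + 1
    return parts


# Two hyphen-joined lines whose corrected version got an additional space:
print(split_repaired_batch(["tokeni-", "zationrepair"], "tokeni-zation repair"))
# ['tokeni-', 'zation repair']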
if __name__ == "__main__":
    random.seed(1998)

    development_ids = set(load_object(paths.WIKI_DEVELOPMENT_ARTICLE_IDS))
    test_ids = set(load_object(paths.WIKI_TEST_ARTICLE_IDS))
    print(development_ids)
    print(test_ids)

    development_paragraphs = []
    test_paragraphs = []
    for article in get_article_jsons():
        id = article["id"]
        is_dev = id in development_ids
        is_test = (not is_dev) and id in test_ids
        if is_dev or is_test:
            paragraph = select_random_paragraph(article["text"])
            if is_dev:
                development_paragraphs.append(paragraph)
            elif is_test:
                test_paragraphs.append(paragraph)

    print("%i dev, %i test" % (len(development_paragraphs), len(test_paragraphs)))

    random.shuffle(development_paragraphs)
    random.shuffle(test_paragraphs)
    write_lines(paths.WIKI_DEVELOPMENT_FILE, development_paragraphs)
    write_lines(paths.WIKI_TEST_FILE, test_paragraphs)
import random

import project
from src.helper.files import read_lines, write_lines
from src.settings import paths


if __name__ == "__main__":
    random.seed(42)
    print("reading...")
    lines = read_lines(paths.WIKI_TRAINING_SENTENCES)
    print("shuffling...")
    random.shuffle(lines)
    print("writing...")
    write_lines(paths.WIKI_TRAINING_SENTENCES_SHUFFLED, lines)
                                         sentence_splitter)
            sentences = filter_sentences(sentences)
            if len(sentences) > 0:
                selected_sentence = random.choice(sentences)
                selected_sentence = preprocess_sentence(selected_sentence)
                if id in tuning_ids:
                    tuning_sentences.append(selected_sentence)
                elif id in development_ids:
                    development_sentences.append(selected_sentence)
                else:
                    test_sentences.append(selected_sentence)
        elif TRAINING and id in training_ids:
            sentences = split_article(article["text"], sentence_splitter)
            sentences = filter_sentences(sentences)
            sentences = [preprocess_sentence(sentence) for sentence in sentences]
            training_file.write('\n'.join(sentences + [""]))

    if TRAINING:
        training_file.close()
    else:
        for sentence_list in (tuning_sentences, development_sentences, test_sentences):
            random.shuffle(sentence_list)
        write_lines(paths.WIKI_TUNING_SENTENCES, tuning_sentences)
        write_lines(paths.WIKI_DEVELOPMENT_SENTENCES, development_sentences)
        write_lines(paths.WIKI_TEST_SENTENCES, test_sentences)
elif step == "lines":
    for split in ["training", "development", "test"]:
        path = paths.ACL_CORPUS_DIR + split + "/"
        lines = []
        for filename in sorted(get_files(path)):
            lines.extend(read_lines(path + filename))
        lines = [line.strip() for line in lines]
        lines = [line for line in lines if len(line) > 0]
        lines = [' '.join(line.split()) for line in lines]
        # remove lines with many ?s
        lines = [line for line in lines
                 if sum(1 if c == "?" else 0 for c in line) < 4]
        print(len(lines), "lines")
        write_lines(paths.ACL_CORPUS_DIR + split + ".txt", lines)
        random.shuffle(lines)
        write_lines(paths.ACL_CORPUS_DIR + split + "_shuffled.txt", lines)
elif step == "dict":
    char_frequencies = {}
    for line in read_lines(paths.ACL_CORPUS_TRAINING_FILE):
        for char in line:
            if char not in char_frequencies:
                char_frequencies[char] = 1
            else:
                char_frequencies[char] += 1
    print("== FREQUENCIES ==")
    for char in sorted(char_frequencies):
        print(char, char_frequencies[char])
    print("== ENCODER DICT ==")
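# Hedged sketch (an assumption, not the project's code) of how the encoder dictionary
# announced by the print above could be derived from the character frequencies:
# frequent characters receive consecutive indices, everything else would later map to an
# out-of-vocabulary symbol. The cut-off value and names are illustrative only.
MIN_CHAR_FREQUENCY = 10  # assumed cut-off, not a project constant


def build_encoder_dict(char_frequencies, min_frequency=MIN_CHAR_FREQUENCY):
    frequent_chars = sorted(c for c, f in char_frequencies.items() if f >= min_frequency)
    return {char: index for index, char in enumerate(frequent_chars)}


print(build_encoder_dict({" ": 900, "a": 120, "e": 300, "ß": 4}))
# {' ': 0, 'a': 1, 'e': 2}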