def get_raw_text_for_trees(treebank_root, splits, tree_files):
    """Return, for each parsed sentence, the span of raw text aligned to it.

    Raw lines are read from the files yielded by ``glob_raw_files`` and walked
    in step with the sentences read from ``tree_files``. For every sentence the
    raw text is extended or skipped until all of its tokens can be aligned,
    and the aligned prefix is cut off and stored.

    Args:
        treebank_root: Forwarded to ``glob_raw_files``.
        splits: Forwarded to ``glob_raw_files``.
        tree_files: File id(s) handed to ``BracketParseCorpusReader``.

    Returns:
        A list of raw-text strings, one per sentence in ``tree_files``.

    Raises:
        AssertionError: If a sentence cannot be fully aligned, or if the number
            of collected spans differs from the number of sentences.
        StopIteration: If the raw text runs out before the sentences do.
    """

    def parsed_to_raw(text, tokens):
        # Backticks are mapped to apostrophes before aligning. Only the
        # token -> raw-character-index half of the alignment is used here.
        _, token_spans = tokenizations.get_alignments(
            text.replace("`", "'"), tokens)
        return token_spans

    raw_lines = []
    for path in glob_raw_files(treebank_root, splits):
        with open(path, 'r', encoding="windows-1252") as handle:
            for raw in handle:
                if not raw.strip() or raw.startswith('.START'):
                    continue
                # Delete invalid characters caused by encoding issues
                raw_lines.append(raw.replace("Õ", "").replace("å", ""))

    target_sents = BracketParseCorpusReader('.', tree_files).sents()

    # Raw lines carrying one of these captions are followed by a parsed
    # sentence with an extra trailing '.' that has to be dropped (see below).
    player_captions = (
        '-- Freshman football player',
        '-- Sophomore football player',
        '-- Junior football player',
        '-- Senior football player',
        '-- Graduate-student football player',
        '-- Football player',
        '-- Freshman basketball player',
        '-- Sophomore basketball player',
        '-- Junior basketball player',
        '-- Senior basketball player',
        '-- Basketball player',
    )

    line_iter = iter(raw_lines)
    line = ""
    spans = []
    for parsed in target_sents:
        # Pull a fresh raw line once the previous one has been used up
        if not line.strip():
            line = next(line_iter)

        # Handle PTB-style escaping mismatches
        tokens = [standardize_form(tok) for tok in parsed]

        # Handle transpositions: the raw text sometimes transposes punctuation
        # while the parsed version has it cleaned up, so the tokens are
        # distorted to look like the raw text for alignment purposes.
        if 'U.S..' in ''.join(tokens):
            tokens = [tok.replace('U.S.', 'U.S') for tok in tokens]
        if 'Co.,' in ''.join(tokens) and 'Co,.' in line:
            tokens = [tok.replace('Co.', 'Co') for tok in tokens]
        if "But that 's" in ' '.join(tokens) and "But's that" in line:
            tokens = [
                tok.replace("that", "tha").replace("'s", "t") for tok in tokens
            ]
        has_caption = any(caption in line for caption in player_captions)
        if has_caption and ('" .' in ' '.join(tokens) and tokens[-1] == '.'):
            tokens = tokens[:-1]

        # Attempt to align raw and parsed text
        p2r = parsed_to_raw(line, tokens)

        # Handle skips: some lines in the raw data are not parsed
        while not all(p2r):
            unparsed_line = (
                (line.startswith('(See') and '-- WSJ' in line)
                or line == 'San Diego '
                or line == '" '
            )
            if not unparsed_line:
                break
            line = next(line_iter)
            p2r = parsed_to_raw(line, tokens)

        # Handle line breaks in the raw format that fall in the middle of a
        # sentence (such as mid-sentence line breaks in poems). At most 12
        # extra lines are appended; the limit is there to aid debugging.
        for _ in range(12):
            if all(p2r):
                break
            line = line + next(line_iter)
            p2r = parsed_to_raw(line, tokens)

        assert all(p2r)
        end = max(max(char_ids) for char_ids in p2r) + 1

        # Trim excess raw text at the start: drop leading lines for as long as
        # doing so leaves the number of aligned characters unchanged.
        line_to_save = line[:end]
        aligned_total = sum(map(len, parsed_to_raw(line_to_save, tokens)))
        while True:
            without_first = '\n'.join(line_to_save.splitlines()[1:])
            alt_total = sum(map(len, parsed_to_raw(without_first, tokens)))
            if alt_total != aligned_total:
                break
            line_to_save = without_first

        spans.append(line_to_save)
        line = line[end:]

    assert len(spans) == len(target_sents)
    return spans
def _report_reconstruction_mismatch(line, target_sent, have_space_after, p2r):
    """Print the raw and rebuilt text when they still differ after known fixes.

    Args:
        line: Raw text for the sentence.
        target_sent: Tokens of the sentence.
        have_space_after: Per-token whitespace flags (truthy means a space is
            emitted after the token when rebuilding the text).
        p2r: Per-token lists of indices into ``line``.
    """
    # Raw characters lying inside a token's index range without being aligned
    # to that token are removed before comparing.
    to_delete = set()
    for indices in p2r:
        if not indices:
            continue
        to_delete |= set(range(min(indices), max(indices) + 1)) - set(indices)
    raw = "".join(ch for pos, ch in enumerate(line) if pos not in to_delete)
    # Collapse every run of whitespace to a single space
    raw = " ".join(raw.split())

    guess = "".join(
        word + (" " if space else "")
        for word, space in zip(target_sent, have_space_after))

    if "filings policy-making" in guess:
        # The parsed version of this sentence drops an entire span from the raw
        # text. Maybe we shouldn't be training on this bad example, but for now
        # we'll just skip validating it.
        return

    # Fix some issues with the raw text that are corrected in the parsed version
    raw = raw.replace("`", "'")
    raw = raw.replace("and <Tourism", "and Tourism")
    raw = raw.replace("staf reporter", "staff reporter")
    if " S$" in raw and " S$" not in guess:
        raw = raw.replace(" S$", " US$")
    raw = raw.replace("16/ 64-inch opening", "16 64-inch opening")
    if raw != guess and raw.replace('."', '".') == guess:
        raw = raw.replace('."', '".')

    # Mismatches are reported rather than asserted, so one bad sentence does
    # not abort the whole run.
    if raw != guess:
        print(raw)
        print(guess)
        print()


def get_words_and_whitespace(treebank_root, splits, tree_files):
    """Pair each sentence's tokens with "is followed by whitespace" flags.

    For every sentence in ``tree_files`` the tokens are aligned against the raw
    text returned by ``get_raw_text_for_trees``, and the raw character right
    after each token is inspected to decide whether whitespace follows it.
    After each sentence the text rebuilt from tokens and flags is compared with
    the raw text, and any remaining difference is printed.

    Args:
        treebank_root: Forwarded to ``get_raw_text_for_trees``.
        splits: Forwarded to ``get_raw_text_for_trees``.
        tree_files: File id(s) handed to ``BracketParseCorpusReader``.

    Returns:
        A list with one ``(tokens, have_space_after)`` tuple per sentence.
        ``tokens`` are the standardized word forms (with a few corrections) and
        ``have_space_after`` holds one entry per token: ``True``/``False``, or
        ``None`` when the token could not be aligned to any raw character.
    """
    reader = BracketParseCorpusReader('.', tree_files)
    target_sents = reader.sents()
    raw_sents = get_raw_text_for_trees(treebank_root, splits, tree_files)

    # Applied in order after every 'U.S.' has been turned into 'U.S..': these
    # undo the doubling in the contexts where the raw text must stay as it was.
    us_exceptions = (
        ('U.S.. market', 'U.S. market'),
        ('U.S.. agenda', 'U.S. agenda'),
        ('U.S.. even', 'U.S. even'),
        ('U.S.. counterpart', 'U.S. counterpart'),
        ('U.S.. unit', 'U.S. unit'),
        ('U.S..,', 'U.S.,'),
    )

    pairs = []
    for line, target_sent in zip(raw_sents, target_sents):
        # Fix some errors in the raw text that are also fixed in the parsed trees
        line = line.replace("But's that just", "But that's just")
        line = line.replace('Co,.', 'Co.,')
        if 'U.S..' in ''.join(target_sent):
            # Address cases where underlying "U.S." got tokenized as "U.S." ".".
            # This is expected in the sentence-final position, but it seems to
            # occur in other places, too.
            line = line.replace('U.S.', 'U.S..')
            for doubled, restored in us_exceptions:
                line = line.replace(doubled, restored)

        target_sent = [
            standardize_form(word).replace("``", '"') for word in target_sent
        ]

        # Backtick -> apostrophe keeps the string length unchanged, so the
        # indices in p2r can be used to index `line` directly.
        _, p2r = tokenizations.get_alignments(line.replace("`", "'"),
                                              target_sent)
        last_char_for_parsed = [max(x) if x else None for x in p2r]

        have_space_after = [None] * len(target_sent)
        for i, word in enumerate(target_sent):
            last_char = last_char_for_parsed[i]
            if last_char is None:
                # Token has no aligned raw characters; leave its flag as None
                continue

            # Empty slice (token ends the raw text) yields False
            char_after_word = line[last_char + 1:last_char + 2]
            have_space_after[i] = char_after_word.isspace()

            # Fix the few cases where the word form in the parsed data is
            # incorrect. Slices are used for look-ahead so that a match on the
            # final token cannot raise IndexError.
            if word == "'T-" and target_sent[i + 1:i + 2] == ['is']:
                target_sent[i] = "'T"
            if word == "16" and target_sent[i + 1:i + 5] == [
                    '64', '-', 'inch', 'opening'
            ]:
                # This error occurs in the test set, and moreover fixing the
                # form (to "16/") would affect tokenization by introducing an
                # extra '/', so only the whitespace flag is set.
                have_space_after[i] = True
            if word == "Gaming" and target_sent[i - 1:i + 2] == [
                    'and', 'Gaming', 'company'
            ]:
                target_sent[i] = "gaming"

        pairs.append((target_sent, have_space_after))

        # For each token in the treebank, we have now queried the raw string to
        # determine if the token should have whitespace following it. The call
        # below is a sanity check that the reconstructed text matches the raw
        # version as closely as possible.
        _report_reconstruction_mismatch(line, target_sent, have_space_after,
                                        p2r)

    return pairs