def test_extract_plaintext_with_equations():
    """Inline and display equations should be wrapped in depth-0 equation markers."""
    tex = "This sentence includes a symbol $x$ and equation $$y = x$$."
    expected = (
        "This sentence includes a symbol EQUATION_DEPTH_0_START x EQUATION_DEPTH_0_END "
        "and equation EQUATION_DEPTH_0_START y = x EQUATION_DEPTH_0_END."
    )

    plaintext = extract_plaintext("main.tex", tex)

    assert plaintext == expected
def test_extract_plaintext_no_add_space_before_equation_possessive():
    """No space should be inserted between an equation's end marker and a trailing 's."""
    tex = "This sentence includes a possessive equation $x$'s."
    expected = (
        r"This sentence includes a possessive equation EQUATION_DEPTH_0_START x "
        "EQUATION_DEPTH_0_END's."
    )

    plaintext = extract_plaintext("main.tex", tex)

    assert plaintext == expected
def test_extract_plaintext_with_equations():
    """Equations should be replaced by numbered ``<<equation-N>>`` placeholders."""
    # NOTE(review): an earlier test in the visible source has this same name but expects
    # EQUATION_DEPTH markers instead. If both live in one module, this later definition
    # replaces the earlier one and only this one runs -- confirm which is intended.
    tex = "This sentence includes a symbol $x$ and equation $$y = x$$."
    expected = (
        "This sentence includes a symbol <<equation-0>> and equation <<equation-1>>."
    )

    plaintext = extract_plaintext("main.tex", tex)

    assert plaintext == expected
def test_extract_plaintext_with_nested_equations():
    """An equation nested inside another should get depth-1 markers inside depth-0 ones."""
    tex = r"This sentence contains an equation \(x = \textrm{$y$}\)."
    expected = (
        r"This sentence contains an equation EQUATION_DEPTH_0_START x = \textrm{ "
        "EQUATION_DEPTH_1_START y EQUATION_DEPTH_1_END } EQUATION_DEPTH_0_END."
    )

    plaintext = extract_plaintext("main.tex", tex)

    assert plaintext == expected
def parse(self, tex_path: str, tex: str) -> Iterator[Sentence]:
    """
    Split the TeX in 'tex' into sentences and yield one 'Sentence' per detected sentence.

    The TeX is first converted to plaintext with 'extract_plaintext', the plaintext is
    segmented with pysbd, and each sentence span is mapped back to character offsets in
    the original TeX. For every sentence, regular expressions are run over the surrounding
    TeX (from 'get_context') to collect section / environment / macro features, a validity
    guess and a cleanliness flag are computed, and a sanitized copy of the sentence is
    produced in which citation keys, URLs, and reference labels are replaced with fixed
    placeholder words.

    Args:
        tex_path: path of the TeX file; passed to 'extract_plaintext' and stored on each
            yielded 'Sentence'.
        tex: contents of the TeX file.

    Yields:
        A 'Sentence' for each detected sentence whose plaintext span could be mapped back
        to offsets in 'tex'. Sentences whose offsets cannot be mapped are skipped with a
        logged warning.
    """
    check_for_pysbd_reserved_characters(tex)

    # Extract plaintext from TeX.
    plaintext = extract_plaintext(tex_path, tex)

    # Segment the plaintext. Return offsets for each sentence relative to the TeX input.
    segmenter = pysbd.Segmenter(language="en", clean=False, char_span=True)

    # As each sentence is scanned, keep track of what sections and environments the
    # sentence appears within. This state deliberately persists across sentences.
    section_name = None
    in_figure = False
    in_table = False
    in_itemize = False

    # The pysbd module has several open bugs and issues which are addressed below.
    # As of 3/23/20 we know the module will fail in the following ways:
    # 1. pysbd will not break up the sentence when it starts with a punctuation mark or space.
    #    ex: ". hello. world. hi."
    #    sol: check for sentences being longer than 1000 characters. Also, see the
    #         plaintext extraction function, which attempts to clean up the text so that
    #         consecutive periods are removed before segmentation.
    # 2. pysbd uses reserved characters for splitting sentences
    #    ex: see PYSBD_RESERVED_CHARACTERS list.
    #    sol: throw a warning if the sentence contains any of these characters.
    sentence_ranges: List[CharacterRange] = []
    sentence_start: Optional[int] = None

    for span in segmenter.segment(str(plaintext)):
        if sentence_start is None:
            # Strip leading whitespace from sentence.
            sentence_start = span.start + regex.search(r"^(\s*)", span.sent).end()

        # Don't detect a sentence boundary in the middle of an equation: if an equation
        # start marker has no matching end marker yet, keep accumulating spans.
        is_boundary_in_equation = regex.search(
            r"EQUATION_DEPTH_0_START(?!.*EQUATION_DEPTH_0_END)",
            str(plaintext[sentence_start:span.end]),
            flags=regex.DOTALL,
        )
        if not is_boundary_in_equation:
            # Strip trailing whitespace from sentence.
            end = span.start + regex.search(r"(\s*)$", span.sent).start()
            sentence_ranges.append(CharacterRange(sentence_start, end))
            sentence_start = None

    for i, sentence_range in enumerate(sentence_ranges):
        tex_start, tex_end = plaintext.initial_offsets(
            sentence_range.start, sentence_range.end
        )
        if tex_start is None or tex_end is None:
            logging.warning(
                "The span bounds (%d, %d) from pysbd for a sentence could not be mapped "
                "back to character offsets in the LaTeX for an unknown reason.",
                sentence_range.start,
                sentence_range.end,
            )
            continue
        sentence_tex = tex[tex_start:tex_end]

        # Save the sentence as a journaled string, which will allow the mapping of the cleaned
        # sentence text to the original TeX.
        sentence = plaintext.substring(
            sentence_range.start,
            sentence_range.end,
            # These truncation options are important for preserving the mapping from offsets in
            # the edited sentence to the initial offsets before the edits.
            include_truncated_left=False,
            include_truncated_right=False,
        )
        if len(sentence) > 1000:
            logging.warning(
                "Exceptionally long sentence (length %d). This might indicate the sentence "
                "extractor failed to properly split text into sentences.",
                len(sentence),
            )

        # Extract TeX around sentence to understand the environment in which it appears.
        context_tex = get_context(tex, tex_start, tex_end)

        # Detect features describing the context the sentence appears in (i.e., the section
        # it's in, or if it's in a figure, etc.) using regular expressions.
        section = regex.findall(
            r"\\(?:sub)*section[*]*\{[A-Za-z0-9 \{\}\\_.,:-]*\}", context_tex
        )
        abstract_begin = regex.findall(r"\\begin\{abstract\}", context_tex)
        abstract_end = regex.findall(r"\\end\{abstract\}", context_tex)
        table_begin = regex.findall(r"\\begin\{tabular\}", context_tex)
        table_end = regex.findall(r"\\end\{tabular\}", context_tex)
        figure_begin = regex.findall(r"\\begin\{figure[*]*\}", context_tex)
        figure_end = regex.findall(r"\\end\{figure[*]*\}", context_tex)
        itemize_begin = regex.findall(r"\\begin\{itemize[*]*\}", context_tex)
        itemize_end = regex.findall(r"\\end\{itemize[*]*\}", context_tex)
        cite = regex.findall(
            r"\\cite[A-Za-z0-9 \\_\[\].,:-]*\{[A-Za-z0-9 \\_.,:-]*\}", context_tex
        )
        url = regex.findall(
            r"\\url\{[A-Za-z0-9 \{\}/\\_.,:-]*\}", context_tex, overlapped=False
        )
        label = regex.findall(r"\\label\{[A-Za-z0-9 \\_.,:-]*\}", context_tex)
        ref = regex.findall(r"\\ref\{[A-Za-z0-9 \\_.,:-]*\}", context_tex)
        tex_macros = set(
            regex.findall(
                r"\\[A-Za-z0-9\\\[\]_.,:-]*[\{[A-Za-z0-9 \\_.,:-]*\}]*", context_tex
            )
        )

        # Save a list of other TeX macros that aren't captured by the environment and
        # citation categories listed below.
        named_macros = {
            macro
            for macro_list in [
                abstract_begin,
                abstract_end,
                table_begin,
                table_end,
                figure_begin,
                figure_end,
                itemize_begin,
                itemize_end,
                cite,
            ]
            for macro in macro_list
        }
        other_tex_macros: List[str] = list(tex_macros - named_macros)

        # Save section name. An explicit \section in the context takes precedence over
        # the abstract markers because it is checked last.
        if abstract_begin:
            section_name = "ABSTRACT"
        if abstract_end:
            section_name = None
        if section:
            section_name = extract_text_from_tex_group(section[0])

        # Save information about whether a sentence is in a figure, table, or other environment.
        # TODO(dykang): considering using \label{} in table/figure to improve matching.
        if figure_begin:
            in_figure = True
        if figure_end:
            in_figure = False
        if table_begin:
            in_table = True
        if table_end:
            in_table = False
        if itemize_begin:
            in_itemize = True
        if itemize_end:
            in_itemize = False

        # Use heuristics about the surrounding text to determine whether or not this
        # sentence is valid. These heuristics have a number of limitations, and should be
        # replaced with more mature rules for detecting whether the sentence is indeed in
        # a named section, the abstract, a figure, a table, etc. See documentation of its
        # limitations here: https://github.com/allenai/scholar-reader/issues/138#issue-678432430
        validity_guess = all(
            [
                # Sentence should appear in a named section.
                (not self.from_named_sections_only) or section_name,
                # Sentence should not appear in a figure or table.
                # TODO(dykang, andrewhead): eventually, this should be rewritten to permit the
                # extraction of sentences from captions.
                not in_figure,
                not in_table,
                # If the sentence contained regular expression patterns for the start or end of
                # an environment, it's probably not a sentence, but rather just TeX macros.
                not abstract_begin,
                not abstract_end,
                not section,
                not table_end,
                not figure_end,
                not itemize_begin,
                not itemize_end,
            ]
        )

        tokens = regex.split(r"[\s,;.!?()]+", str(sentence))
        contains_common_english_word = any(
            len(t) > 1 and t.lower() in self.english_words for t in tokens
        )
        ends_with_stop = bool(regex.search(r"[,.:;!?]\s*$", str(sentence)))
        is_clean = contains_common_english_word and ends_with_stop

        # Sanitize the text, replacing macros and unwanted TeX with text that will be easier
        # for the text processing algorithms to process.
        sanitized = sentence
        replace_patterns: List[Tuple[str, str]] = []

        # Replace citations with "CITATION".
        for citation in cite:
            citation_text = extract_text_from_tex_group(citation)
            for key in citation_text.split(","):
                replace_patterns.append((key, "CITATION"))

        # Replace URLs with "URL".
        for url_item in url:
            url_text = extract_text_from_tex_group(url_item)
            replace_patterns.append((url_text, "URL"))

        # Replace references to text elements like figures and tables with a single
        # known word for each type of element. Currently depends on idiomatic patterns
        # for naming elements, like \ref{{fig,tab,sec,eq}:XXX}, to distinguish between
        # element types. Also, the code keeps the token ahead of the reference (e.g.,
        # the word "Table" in "Table\ref{...}"), although it might duplicate the
        # information in the replaced label.
        reference_types = (
            ("tab", "TABLE"),
            ("fig", "FIGURE"),
            ("sec", "SECTION"),
            ("eq", "EQUATION"),
        )
        for reference in ref:
            reference_text = extract_text_from_tex_group(reference)
            for raw_label in reference_text.split(","):
                # The element type is encoded in the label itself. The full match always
                # starts with "\ref{", so testing it (as was done previously) never
                # recognized any type and no reference was ever replaced.
                ref_label = raw_label.strip()
                label_lower = ref_label.lower()
                for prefix, placeholder in reference_types:
                    if label_lower.startswith(prefix):
                        replace_patterns.append((ref_label, placeholder))
                        break

        # Substitute patterns with replacements.
        for pattern, replacement in replace_patterns:
            if pattern == "":
                continue
            match_start = 0
            while True:
                match_offset = sanitized.find(pattern, match_start)
                if match_offset == -1:
                    break
                sanitized = sanitized.edit(
                    match_offset, match_offset + len(pattern), replacement
                )
                # Resume searching just past the inserted replacement. Advancing by the
                # pattern length instead could re-match inside the replacement text (and
                # loop forever) or skip over text that follows a shorter replacement.
                match_start = match_offset + len(replacement)

        yield Sentence(
            id_=str(i),
            tex_path=tex_path,
            start=tex_start,
            end=tex_end,
            text=str(sentence),
            text_journal=sentence,
            sanitized=str(sanitized),
            sanitized_journal=sanitized,
            tex=sentence_tex,
            context_tex=context_tex,
            validity_guess=validity_guess,
            is_clean=is_clean,
            section_name=section_name,
            in_figure=in_figure,
            in_table=in_table,
            in_itemize=in_itemize,
            label=label,
            ref=ref,
            cite=cite,
            url=url,
            others=other_tex_macros,
        )
def test_extract_plaintext_separate_section_header():
    """A section header should be set off by blank lines and terminated with a period."""
    tex_lines = ["Line 1", r"\section{Section header}", "Line 3"]
    expected_lines = ["Line 1", "", "Section header.", "", "Line 3"]

    plaintext = extract_plaintext("main.tex", "\n".join(tex_lines))

    assert plaintext == "\n".join(expected_lines)
def test_extract_plaintext_consolidate_periods_across_groups():
    """A period right after a group that already ends in a period should be collapsed."""
    tex = "\\footnote{Sentence.}. Next sentence."
    expected = "Sentence. Next sentence."

    plaintext = extract_plaintext("main.tex", tex)

    assert plaintext == expected
def test_extract_plaintext_consolidate_periods():
    """Stray runs of periods between sentences should be collapsed away."""
    tex = "Sentence. .. Next sentence."
    expected = "Sentence. Next sentence."

    plaintext = extract_plaintext("main.tex", tex)

    assert plaintext == expected
def test_extract_plaintext_leave_initial_intact():
    """The result's ``initial`` attribute should still hold the unmodified input TeX."""
    original_tex = "\\documentclass{article}"

    result = extract_plaintext("main.tex", original_tex)

    assert result.initial == original_tex
def test_extract_plaintext_skip_input():
    """File-inclusion macros should leave nothing but whitespace in the plaintext."""
    inclusion_macros = [r"\input file", r"\input{file}", r"\include{file}"]
    tex = "\n".join(inclusion_macros)

    result = extract_plaintext("main.tex", tex)

    assert result.isspace()
def test_extract_plaintext_remove_comments():
    """Whole-line and trailing TeX comments should be dropped from the plaintext."""
    tex = "% comment\nText% comment\n% comment"
    expected = "Text"

    plaintext = extract_plaintext("main.tex", tex)

    assert plaintext == expected