Example #1
def test_extract_phrases_from_formatted_text():
    extractor = PhraseExtractor(["two-token phrase"])
    phrases = list(
        extractor.parse(
            "main.tex",
            r"In this \textbf{two-token phrase}, something happens."))
    assert len(phrases) == 1
Example #2
def test_extract_phrases_starting_with_symbol():
    # This example is from arXiv paper 1811.11889.
    extractor = PhraseExtractor(["+D&M"])
    phrases = list(
        extractor.parse("main.tex", r"This sentence contains +D\&M."))
    assert len(phrases) == 1
    assert phrases[0].text == "+D&M"
Example #3
def test_extract_phrases_containing_ampersands():
    # This example is from arXiv paper 1811.11889.
    extractor = PhraseExtractor(["D&M"])
    phrases = list(extractor.parse("main.tex", r"This sentence contains D\&M."))
    assert len(phrases) == 1
    assert phrases[0].text == "D&M"
    assert phrases[0].tex == r"D\&M"
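A note on the expected tex value in this test: text holds the plain phrase while tex preserves the original TeX escape, so the expected string contains a literal backslash. Writing it as a raw string avoids Python's invalid-escape-sequence warning, as this small sketch (not part of the test suite) shows:

# "\&" is not a recognized Python escape, so a plain string still yields a
# literal backslash, but newer Python versions warn about it. A raw string
# says what it means.
assert r"D\&M" == "D\\&M"
assert len(r"D\&M") == 4  # D, backslash, &, M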
Example #4
def test_extract_phrases():
    extractor = PhraseExtractor(["word", "two-token phrase"])
    phrases = list(
        extractor.parse("main.tex",
                        "This sentence contains word and a two-token phrase."))

    phrase1 = phrases[0]
    assert phrase1.start == 23
    assert phrase1.end == 27
    assert phrase1.text == "word"

    phrase2 = phrases[1]
    assert phrase2.start == 34
    assert phrase2.end == 50
    assert phrase2.text == "two-token phrase"
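The start and end values asserted above are plain character offsets into the input string (with end exclusive). A quick way to double-check such expectations, sketched here outside the test suite, is to derive them with str.index:

sentence = "This sentence contains word and a two-token phrase."

start = sentence.index("word")
assert (start, start + len("word")) == (23, 27)

start = sentence.index("two-token phrase")
assert (start, start + len("two-token phrase")) == (34, 50)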
Example #5
    def parse(self, tex_path: str, tex: str) -> Iterator[Term]:
        phrase_extractor = PhraseExtractor(list(self.glossary.keys()))
        for i, phrase in enumerate(phrase_extractor.parse(tex_path, tex)):
            entries = self.glossary[phrase.text]
            definitions = [e.definition for e in entries]
            sources = [e.source for e in entries]
            yield Term(
                id_=f"glossary-term-{i}",
                start=phrase.start,
                end=phrase.end,
                tex=phrase.tex,
                text=phrase.text,
                type_=None,
                tex_path=tex_path,
                context_tex=phrase.context_tex,
                definitions=definitions,
                sources=sources,
                sentence_id=None,
            )
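For context, parse() above expects self.glossary to map each phrase to a list of entries carrying definition and source attributes. A minimal sketch of that wiring, using a hypothetical GlossaryEntry type in place of whatever entry class the project actually defines:

from dataclasses import dataclass
from typing import Dict, List

@dataclass
class GlossaryEntry:  # hypothetical stand-in for the real entry type
    definition: str
    source: str

glossary: Dict[str, List[GlossaryEntry]] = {
    "two-token phrase": [
        GlossaryEntry(
            definition="a phrase made up of two tokens",
            source="example-glossary",
        )
    ]
}

# parse() looks up self.glossary[phrase.text] for each extracted phrase and
# copies the entries' definitions and sources into the yielded Term.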
Example #6
def test_extract_phrase_containing_single_letter():
    extractor = PhraseExtractor(["T"])
    phrases = list(
        extractor.parse("main.tex", "This sentence contains the letter T."))
    assert len(phrases) == 1
    assert phrases[0].text == "T"
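The property this test pins down is that only the standalone "T" matches; the "T" inside "This" does not, so matching is evidently token-level rather than substring-level. A word-boundary regex reproduces the same behavior (a sketch only; the extractor's actual tokenization may differ):

import re

matches = re.findall(r"\bT\b", "This sentence contains the letter T.")
assert matches == ["T"]  # the "T" in "This" is not a standalone token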
Example #7
    def process(
        self, item: DetectDefinitionsTask
    ) -> Iterator[Union[Definiendum, Definition, TermReference]]:
        sentences_ordered = sorted(item.sentences, key=lambda s: s.start)
        num_sentences = len(sentences_ordered)

        if num_sentences == 0:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No sentences found for arXiv ID %s. Skipping detection of sentences "
                + "that contain entities.",
                item.arxiv_id,
            )
            return

        end_position_of_last_sentence = sentences_ordered[-1].end

        # Load the pre-trained definition detection model.
        prediction_type = "DocDef2+AI2020+W00"
        model = DefinitionDetectionModel(prediction_type)

        definition_index = 0
        features = []
        sentences: List[EmbellishedSentence] = []

        definiendums: Dict[TermName, List[Definiendum]] = defaultdict(list)
        term_phrases: List[str] = []
        abbreviations: List[str] = []
        symbol_nicks: List[str] = []
        definitions: Dict[DefinitionId, Definition] = {}

        with tqdm(total=num_sentences,
                  disable=(not self.args.show_progress)) as progress:

            for sentence_index, sentence in enumerate(sentences_ordered):
                progress.update(1)

                # Only attempt to process sentences that have been marked as likely to be
                # proper plaintext. Note that this means sentences that didn't pass the
                # sentence extractor's heuristics may be skipped.
                if not sentence.validity_guess:
                    continue

                # Extract features from raw text.
                featurized_text = model.featurize(
                    sentence.legacy_definition_input)
                features.append(featurized_text)
                sentences.append(sentence)

                # Process sentences in batches.
                if (len(features) >= self.args.batch_size
                        or sentence_index == num_sentences - 1):

                    # Detect terms and definitions in each sentence with a pre-trained definition
                    # extraction model, from the featurized text.

                    (_, slots, slots_confidence) = model.predict_batch(
                        cast(List[Dict[Any, Any]], features))

                    # Package extracted terms and definitions into a representation that's
                    # easier to process.
                    for (
                            s,
                            sentence_features,
                            termdef_sentence_slots,
                            termdef_sentence_slots_confidence,
                            abbrexp_sentence_slots,
                            abbrexp_sentence_slots_confidence,
                            symnick_sentence_slots,
                            symnick_sentence_slots_confidence,
                    ) in zip(
                            sentences,
                            features,
                            slots["W00"],
                            slots_confidence["W00"],
                            slots["AI2020"],
                            slots_confidence["AI2020"],
                            slots["DocDef2"],
                            slots_confidence["DocDef2"],
                    ):
                        # Extract TeX for each symbol from a parallel representation of the
                        # sentence, so that the TeX for symbols can be saved.
                        # Types of [term and definition] pairs.
                        #   [nickname and definition] for symbols.
                        #   [abbreviation and expansion] for abbreviations.
                        #   [term and definition] for other types.
                        symbol_texs = get_symbol_texs(
                            s.legacy_definition_input, s.with_formulas_marked)

                        # Only process slots when they include both 'TERM' and 'DEF'.
                        if ("TERM" not in termdef_sentence_slots
                                or "DEF" not in termdef_sentence_slots):
                            term_definition_pairs = []
                        else:
                            term_definition_pairs = consolidate_keyword_definitions(
                                s.legacy_definition_input,
                                sentence_features["tokens"],
                                termdef_sentence_slots,
                                termdef_sentence_slots_confidence,
                                "W00",
                            )

                        if ("TERM" not in abbrexp_sentence_slots
                                or "DEF" not in abbrexp_sentence_slots):
                            abbreviation_expansion_pairs = []
                        else:
                            abbreviation_expansion_pairs = consolidate_keyword_definitions(
                                s.legacy_definition_input,
                                sentence_features["tokens"],
                                abbrexp_sentence_slots,
                                abbrexp_sentence_slots_confidence,
                                "AI2020",
                            )

                        if ("TERM" not in symnick_sentence_slots
                                or "DEF" not in symnick_sentence_slots):
                            symbol_nickname_pairs = []
                        else:
                            symbol_nickname_pairs = consolidate_keyword_definitions(
                                s.legacy_definition_input,
                                sentence_features["tokens"],
                                symnick_sentence_slots,
                                symnick_sentence_slots_confidence,
                                "DocDef2",
                            )

                        pairs = (term_definition_pairs +
                                 symbol_nickname_pairs +
                                 abbreviation_expansion_pairs)
                        for pair in pairs:
                            tex_path = s.tex_path
                            definiendum_id = (
                                f"definiendum-{tex_path}-{definition_index}")
                            definition_id = f"definition-{tex_path}-{definition_index}"
                            definiendum_text = pair.term_text
                            definiendum_type = pair.term_type
                            definition_type = pair.definition_type

                            definiendum_confidence = pair.term_confidence
                            definition_confidence = pair.definition_confidence

                            # Map definiendum and definition start and end positions back to
                            # their original positions in the TeX.
                            offsets = s.legacy_definition_input_journal.initial_offsets(
                                pair.term_start, pair.term_end)
                            if offsets[0] is None or offsets[1] is None:
                                logging.warning(  # pylint: disable=logging-not-lazy
                                    "Could not find offsets of definiendum %s in original TeX "
                                    + "(from sentence %s, file %s, arXiv ID %s). Definiendum will not be saved.",
                                    pair.term_text,
                                    s.id_,
                                    s.tex_path,
                                    item.arxiv_id,
                                )
                                continue
                            definiendum_start = s.start + offsets[0]
                            definiendum_end = s.start + offsets[1]

                            offsets = s.legacy_definition_input_journal.initial_offsets(
                                pair.definition_start, pair.definition_end)
                            if offsets[0] is None or offsets[1] is None:
                                logging.warning(  # pylint: disable=logging-not-lazy
                                    "Could not find offsets of definition %s in original TeX "
                                    + "(from sentence %s, file %s, arXiv ID %s). Definition will not be saved.",
                                    pair.definition_text,
                                    s.id_,
                                    s.tex_path,
                                    item.arxiv_id,
                                )
                                continue
                            definition_start = s.start + offsets[0]
                            definition_end = s.start + offsets[1]

                            # Extract document-level features from sentence.
                            position_ratio = (definiendum_start /
                                              end_position_of_last_sentence)
                            section_name = s.section_name

                            try:
                                tex = item.tex_by_file[tex_path]
                            except KeyError:
                                logging.warning(  # pylint: disable=logging-not-lazy
                                    "Could not find TeX for %s. TeX will not be included in "
                                    + "the output data for definition '%s' for term '%s'",
                                    tex_path,
                                    pair.definition_text,
                                    definiendum_text,
                                )
                                definiendum_tex = "NOT AVAILABLE"
                                definition_tex = "NOT AVAILABLE"
                            else:
                                if (definiendum_type == "symbol"
                                        and symbol_texs is not None
                                        and pair.term_start in symbol_texs):
                                    definiendum_tex = symbol_texs[
                                        pair.term_start]
                                    definiendum_text = definiendum_tex
                                else:
                                    definiendum_tex = tex.contents[
                                        definiendum_start:definiendum_end]
                                definition_tex = tex.contents[
                                    definition_start:definition_end]

                            # Save the definition to file.
                            definition = Definition(
                                id_=definition_id,
                                start=definition_start,
                                end=definition_end,
                                definiendum=definiendum_text,
                                type_=definition_type,
                                tex_path=tex_path,
                                tex=definition_tex,
                                text=pair.definition_text,
                                context_tex=sentence.context_tex,
                                sentence_id=sentence.id_,
                                intent=True,
                                confidence=definition_confidence,
                            )
                            definitions[definition_id] = definition
                            yield definition

                            # Don't save the definiendum to file yet. Save it in memory first, and then
                            # save it to file once it's done being processed. It will need
                            # to be associated with other definitions. Also, other references
                            # to the term will be detected before this method is over.
                            definiendum = Definiendum(
                                id_=definiendum_id,
                                text=definiendum_text,
                                type_=definiendum_type,
                                confidence=definiendum_confidence,
                                # Link the definiendum to the text that defined it.
                                definition_id=definition_id,
                                # Because a term can be defined in multiple places in the paper,
                                # these lists of definition data will be filled out once all of
                                # the definitions have been found.
                                definition_ids=[],
                                definitions=[],
                                definition_texs=[],
                                sources=[],
                                start=definiendum_start,
                                end=definiendum_end,
                                tex_path=tex_path,
                                tex=definiendum_tex,
                                context_tex=sentence.context_tex,
                                sentence_id=sentence.id_,
                                # Document-level features below.
                                position_ratio=position_ratio,
                                position_ratios=[],
                                section_name=section_name,
                                section_names=[],
                            )
                            definiendums[definiendum_text].append(definiendum)
                            if definiendum.type_ == "term":
                                term_phrases.append(definiendum.text)
                            if definiendum.type_ == "abbreviation":
                                abbreviations.append(definiendum.text)
                            if definiendum.type_ == "symbol":
                                symbol_nicks.append(definiendum.text)

                            definition_index += 1

                    features = []
                    sentences = []

        logging.debug(
            "Finished detecting definitions for paper %s. Now finding references to defined terms.",
            item.arxiv_id,
        )

        all_definiendums: List[Definiendum] = []
        for definiendum_list in definiendums.values():
            all_definiendums.extend(definiendum_list)

        definition_ids: Dict[TermName, List[DefinitionId]] = {}
        definition_texs: Dict[TermName, List[str]] = {}
        definition_texts: Dict[TermName, List[str]] = {}
        sources: Dict[TermName, List[str]] = {}
        position_ratios: Dict[TermName, List[float]] = {}
        section_names: Dict[TermName, List[str]] = {}

        # Associate terms with all definitions that apply to them.
        for term, definiendum_list in definiendums.items():
            definition_ids[term] = [
                definiendum.definition_id for definiendum in definiendum_list
            ]
            definition_texs[term] = [
                definitions[definiendum.definition_id].tex
                for definiendum in definiendum_list
            ]
            definition_texts[term] = [
                definitions[definiendum.definition_id].text
                for definiendum in definiendum_list
            ]
            sources[term] = ["model"] * len(definition_ids[term])
            position_ratios[term] = [
                definiendum.position_ratio for definiendum in definiendum_list
            ]
            section_names[term] = [
                definiendum.section_name for definiendum in definiendum_list
                if definiendum.section_name is not None
            ]

        # Associate each definiendum with all applicable definitions, and save them to file.
        for definiendum_list in definiendums.values():
            for definiendum in definiendum_list:
                definiendum.definition_ids.extend(
                    definition_ids[definiendum.text])
                definiendum.definition_texs.extend(
                    definition_texs[definiendum.text])
                definiendum.definitions.extend(
                    definition_texts[definiendum.text])
                definiendum.sources.extend(sources[definiendum.text])
                definiendum.position_ratios.extend(
                    position_ratios[definiendum.text])
                definiendum.section_names.extend(
                    section_names[definiendum.text])
                yield definiendum

        # Detect all other references to the defined terms. Detect references to textual
        # terms and abbreviations. References to symbols need not be found here; they
        # will be detected automatically in the symbol extraction code.
        term_index = 0

        for tex_path, file_contents in item.tex_by_file.items():
            term_extractor = PhraseExtractor(term_phrases + abbreviations)
            for t in term_extractor.parse(tex_path, file_contents.contents):

                # Don't save term references if they are already in the definiendums.
                if any(overlaps(definiendum, t)
                       for definiendum in all_definiendums):
                    continue

                logging.debug(
                    "Found reference to term %s at (%d, %d) in %s for arXiv ID %s",
                    t.text,
                    t.start,
                    t.end,
                    t.tex_path,
                    item.arxiv_id,
                )
                type_ = ("abbreviation" if t.text in abbreviations else
                         "term" if t.text in term_phrases else
                         "symbol" if t.text in symbol_nicks else "unknown")
                yield TermReference(
                    id_=f"term-{t.tex_path}-{term_index}",
                    text=t.text,
                    type_=type_,
                    definition_ids=definition_ids[t.text],
                    definitions=definition_texts[t.text],
                    definition_texs=definition_texs[t.text],
                    sources=sources[t.text],
                    position_ratios=position_ratios[t.text],
                    section_names=section_names[t.text],
                    start=t.start,
                    end=t.end,
                    tex_path=t.tex_path,
                    tex=t.tex,
                    context_tex=t.context_tex,
                )
                term_index += 1
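Stripped of the model-specific details, the core control flow in process() above is an accumulate-and-flush batching loop. A minimal standalone sketch of that pattern, with featurize and predict_batch as stand-ins for the model calls rather than the project's actual API:

from typing import Any, Callable, Iterator, List, Sequence, Tuple

def predict_in_batches(
    items: Sequence[Any],
    featurize: Callable[[Any], Any],
    predict_batch: Callable[[List[Any]], List[Any]],
    batch_size: int,
) -> Iterator[Tuple[Any, Any]]:
    # Accumulate per-item features until the batch is full, run one batched
    # prediction, then pair each prediction back up with its source item.
    pending: List[Any] = []
    features: List[Any] = []
    for item in items:
        pending.append(item)
        features.append(featurize(item))
        if len(features) >= batch_size:
            yield from zip(pending, predict_batch(features))
            pending, features = [], []
    if features:  # flush the final partial batch
        yield from zip(pending, predict_batch(features))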
Example #8
    def process(
        self, item: DetectDefinitionsTask
    ) -> Iterator[Union[Definiendum, Definition, TermReference]]:
        sentences_ordered = sorted(item.sentences, key=lambda s: s.start)
        num_sentences = len(sentences_ordered)

        if len(item.sentences) == 0:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No sentences found for arXiv ID %s. Skipping detection of sentences "
                + "that contain entities.",
                item.arxiv_id,
            )
            return

        # Load the pre-trained definition detection model.
        model = DefinitionDetectionModel()

        definition_index = 0
        features = []
        sentences = []

        definiendums: Dict[TermName, List[Definiendum]] = defaultdict(list)
        definitions: Dict[DefinitionId, Definition] = {}

        with tqdm(
            total=num_sentences, disable=(not self.args.show_progress)
        ) as progress:

            for si, sentence in enumerate(sentences_ordered):
                progress.update(1)

                # Only attempt to process sentences that have been marked as likely to be
                # proper plaintext. Note that this means sentences that didn't pass the
                # sentence extractor's heuristics may be skipped.
                if not sentence.validity_guess:
                    continue

                # Extract features from raw text.
                featurized_text = model.featurize(sentence.legacy_definition_input)
                features.append(featurized_text)
                sentences.append(sentence)

                # Process sentences in batches.
                if len(features) >= self.args.batch_size or si == num_sentences - 1:

                    # Detect terms and definitions in each sentence with a pre-trained definition
                    # extraction model, from the featurized text.
                    intents, slots = model.predict_batch(
                        cast(List[Dict[Any, Any]], features)
                    )

                    for s, sentence_features, intent, sentence_slots in zip(
                        sentences, features, intents, slots
                    ):
                        # Only process slots when they include both 'TERM' and 'DEF'.
                        if "TERM" not in sentence_slots or "DEF" not in sentence_slots:
                            continue

                        # Package extracted terms and definitions into a representation that's
                        # easier to process.
                        pairs = get_term_definition_pairs(
                            s.legacy_definition_input,
                            sentence_features,
                            sentence_slots,
                        )

                        # Extract TeX for each symbol from a parallel representation of the
                        # sentence, so that the TeX for symbols can be saved.
                        symbol_texs = get_symbol_texs(
                            s.legacy_definition_input, s.with_equation_tex
                        )

                        for pair in pairs:

                            tex_path = s.tex_path
                            definiendum_id = (
                                f"definiendum-{tex_path}-{definition_index}"
                            )
                            definition_id = f"definition-{tex_path}-{definition_index}"
                            definiendum_text = pair.term_text
                            definiendum_type = (
                                "symbol" if "SYMBOL" in definiendum_text else "term"
                            )

                            # Map definiendum and definition start and end positions back to
                            # their original positions in the TeX.
                            offsets = s.legacy_definition_input_journal.initial_offsets(
                                pair.term_start, pair.term_end
                            )
                            if offsets[0] is None or offsets[1] is None:
                                logging.warning(  # pylint: disable=logging-not-lazy
                                    "Could not find offsets of definiendum %s in original TeX "
                                    + "(from sentence %s, file %s, arXiv ID %s). Definiendum will not be saved.",
                                    pair.term_text,
                                    s.id_,
                                    s.tex_path,
                                    item.arxiv_id,
                                )
                                continue
                            definiendum_start = s.start + offsets[0]
                            definiendum_end = s.start + offsets[1]

                            offsets = s.legacy_definition_input_journal.initial_offsets(
                                pair.definition_start, pair.definition_end
                            )
                            if offsets[0] is None or offsets[1] is None:
                                logging.warning(  # pylint: disable=logging-not-lazy
                                    "Could not find offsets of definition %s in original TeX "
                                    + "(from sentence %s, file %s, arXiv ID %s). Definiendum will not be saved.",
                                    pair.definition_text,
                                    s.id_,
                                    s.tex_path,
                                    item.arxiv_id,
                                )
                                continue
                            definition_start = s.start + offsets[0]
                            definition_end = s.start + offsets[1]

                            try:
                                tex = item.tex_by_file[tex_path]
                            except KeyError:
                                logging.warning(  # pylint: disable=logging-not-lazy
                                    "Could not find TeX for %s. TeX will not be included in "
                                    + "the output data for definition '%s' for term '%s'",
                                    tex_path,
                                    pair.definition_text,
                                    definiendum_text,
                                )
                                definiendum_tex = "NOT AVAILABLE"
                                definition_tex = "NOT AVAILABLE"
                            else:
                                if (
                                    definiendum_type == "symbol"
                                    and symbol_texs is not None
                                    and pair.term_start in symbol_texs
                                ):
                                    definiendum_tex = symbol_texs[pair.term_start]
                                    definiendum_text = definiendum_tex
                                else:
                                    definiendum_tex = tex.contents[
                                        definiendum_start:definiendum_end
                                    ]
                                definition_tex = tex.contents[
                                    definition_start:definition_end
                                ]

                            # Save the definition to file.
                            definition = Definition(
                                id_=definition_id,
                                start=definition_start,
                                end=definition_end,
                                definiendum=definiendum_text,
                                type_=None,
                                tex_path=tex_path,
                                tex=definition_tex,
                                text=pair.definition_text,
                                context_tex=s.context_tex,
                                sentence_id=s.id_,
                                intent=bool(intent),
                                confidence=None,
                            )
                            definitions[definition_id] = definition
                            yield definition

                            # Don't save the definiendum to file yet. Save it in memory first, and then
                            # save it to file once it's done being processed. It will need
                            # to be associated with other definitions. Also, other references
                            # to the term will be detected before this method is over.
                            definiendums[definiendum_text].append(
                                Definiendum(
                                    id_=definiendum_id,
                                    text=definiendum_text,
                                    type_=definiendum_type,
                                    confidence=None,
                                    # Link the definiendum to the text that defined it.
                                    definition_id=definition_id,
                                    # Because a term can be defined in multiple places in the paper,
                                    # these lists of definition data will be filled out once all of
                                    # the definitions have been found.
                                    definition_ids=[],
                                    definitions=[],
                                    definition_texs=[],
                                    sources=[],
                                    start=definiendum_start,
                                    end=definiendum_end,
                                    tex_path=tex_path,
                                    tex=definiendum_tex,
                                    context_tex=s.context_tex,
                                    sentence_id=s.id_,
                                )
                            )
                            definition_index += 1

                    features = []
                    sentences = []

        logging.debug(
            "Finished detecting definitions for paper %s. Now finding references to defined terms.",
            item.arxiv_id,
        )

        all_definiendums: List[Definiendum] = []
        for definiendum_list in definiendums.values():
            all_definiendums.extend(definiendum_list)
        term_phrases: List[TermName] = list(definiendums.keys())
        definition_ids: Dict[TermName, List[DefinitionId]] = {}
        definition_texs: Dict[TermName, List[str]] = {}
        definition_texts: Dict[TermName, List[str]] = {}
        sources: Dict[TermName, List[str]] = {}

        # Associate terms with all definitions that apply to them.
        for term, definiendum_list in definiendums.items():
            definition_ids[term] = [d.definition_id for d in definiendum_list]
            definition_texs[term] = [
                definitions[d.definition_id].tex for d in definiendum_list
            ]
            definition_texts[term] = [
                definitions[d.definition_id].text for d in definiendum_list
            ]
            sources[term] = ["model"] * len(definition_ids[term])

        # Associate each definiendum with all applicable definitions, and save them to file.
        for definiendum_list in definiendums.values():
            for d in definiendum_list:
                d.definition_ids.extend(definition_ids[d.text])
                d.definition_texs.extend(definition_texs[d.text])
                d.definitions.extend(definition_texts[d.text])
                d.sources.extend(sources[d.text])
                yield d

        # Detect all other references to the defined terms.
        term_index = 0
        sentence_entities: List[SerializableEntity] = cast(
            List[SerializableEntity], item.sentences
        )

        for tex_path, file_contents in item.tex_by_file.items():
            term_extractor = PhraseExtractor(term_phrases)
            for t in term_extractor.parse(tex_path, file_contents.contents):
                t_sentence = get_containing_entity(t, sentence_entities)

                # Don't save term references if they are already in the definiendums
                if any(overlaps(d, t) for d in all_definiendums):
                    continue

                logging.debug(
                    "Found reference to term %s at (%d, %d) in %s for arXiv ID %s",
                    t.text,
                    t.start,
                    t.end,
                    t.tex_path,
                    item.arxiv_id,
                )
                yield TermReference(
                    id_=f"term-{t.tex_path}-{term_index}",
                    text=t.text,
                    type_=None,
                    definition_ids=definition_ids[t.text],
                    definitions=definition_texts[t.text],
                    definition_texs=definition_texs[t.text],
                    sources=sources[t.text],
                    start=t.start,
                    end=t.end,
                    tex_path=t.tex_path,
                    tex=t.tex,
                    context_tex=t.context_tex,
                    sentence_id=t_sentence.id_ if t_sentence is not None else None,
                )
                term_index += 1
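Both versions of process() lean on two small span predicates: overlaps(), used to drop term references that coincide with a definiendum, and get_containing_entity(), used in this version to find the sentence a term falls inside. A hedged sketch of what those checks amount to, assuming entities expose integer start/end character offsets and a tex_path (the project's actual implementations may differ):

def overlaps(a, b) -> bool:
    # Two half-open [start, end) spans in the same file intersect.
    return a.tex_path == b.tex_path and a.start < b.end and b.start < a.end

def get_containing_entity(entity, candidates):
    # Return the first candidate whose span fully contains the entity, if any.
    for candidate in candidates:
        if (candidate.tex_path == entity.tex_path
                and candidate.start <= entity.start
                and entity.end <= candidate.end):
            return candidate
    return None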