Example #1
    def save(self, item: SymbolData, _: None) -> None:
        symbols_with_ids = item.symbols_with_ids
        boxes = item.boxes
        matches = item.matches
        symbol_contexts = item.symbol_contexts
        mathml_contexts = item.mathml_contexts
        symbol_formulas = item.symbol_formulas
        mathml_formulas = item.mathml_formulas

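        # Map each symbol object's identity (via id()) to its symbol ID. Object
        # identity is presumably used as the key because symbol objects are not
        # hashable, and because repeated, visually identical symbols must still
        # resolve to distinct IDs.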
        symbol_ids_by_symbol_object_ids = {}
        for symbol_with_id in symbols_with_ids:
            symbol_object_id = id(symbol_with_id.symbol)
            symbol_ids_by_symbol_object_ids[symbol_object_id] = symbol_with_id.symbol_id

        entity_infos = []

        for symbol_with_id in symbols_with_ids:
            symbol = symbol_with_id.symbol
            # TODO(andrewhead): move this filtering condition into 'parse_equation'
            if symbol.tex in ["$|$", "|"]:
                continue

            symbol_id = symbol_with_id.symbol_id

            # Get context and formula of the symbol, and other matching ones.
            context = symbol_contexts.get(symbol_id)
            matching_contexts = mathml_contexts.get(symbol.mathml, [])
            other_context_texs = []
            other_context_sentence_ids = []
            for c in matching_contexts:
                matching_sentence_id = f"{c.tex_path}-{c.sentence_id}"
                if (matching_sentence_id not in other_context_sentence_ids
                        # and c.sentence_id != context.sentence_id
                    ):
                    other_context_texs.append(c.snippet)
                    other_context_sentence_ids.append(matching_sentence_id)

            formula = symbol_formulas.get(symbol_id)
            matching_formulas = mathml_formulas.get(symbol.mathml, [])
            other_formula_texs = []
            other_formula_ids = []
            for f in matching_formulas:
                equation_id = f"{f.tex_path}-{f.equation_id}"
                if (equation_id not in other_formula_ids
                        # and (formula is None or equation_id != formula.equation_id)
                    ):
                    other_formula_texs.append(f.tex)
                    other_formula_ids.append(equation_id)

            box = boxes.get(symbol_id)
            if box is None:
                continue

            data: EntityData = {
                "tex": f"${symbol.tex}$",
                "tex_start": symbol.start,
                "tex_end": symbol.end,
                "mathml": symbol.mathml,
                "mathml_near_matches": [
                    m.matching_mathml for m in matches[symbol.mathml]
                ],
                # "snippet": context.snippet,
                "snippets": other_context_texs,
                "defining_formulas": other_formula_texs,
                "is_definition": symbol.is_definition or False,
            }
            # if formula is not None:
            #     data['formula'] = formula.tex

            def create_symbol_id_string(sid: SymbolId) -> str:
                return f"{sid.tex_path}-{sid.equation_index}-{sid.symbol_index}"

            sentence_id = (f"{context.tex_path}-{context.sentence_id}"
                           if context is not None else None)

            parent_id: Optional[str] = None
            for other_symbol_with_id in symbols_with_ids:
                other_symbol_id = other_symbol_with_id.symbol_id
                other_symbol = other_symbol_with_id.symbol
                # The parent is any symbol that lists this symbol among its children.
                if symbol in other_symbol.children:
                    parent_id = create_symbol_id_string(other_symbol_id)
                    break

            child_ids = []
            for child_symbol in symbol.children:
                child_symbol_id = symbol_ids_by_symbol_object_ids[id(child_symbol)]
                child_ids.append(create_symbol_id_string(child_symbol_id))

            relationships: EntityRelationships = {
                "equation": EntityReference(
                    type_="equation",
                    id_=f"{symbol_id.tex_path}-{symbol_id.equation_index}",
                ),
                "parent": EntityReference(type_="symbol", id_=parent_id),
                "children": [
                    EntityReference(type_="symbol", id_=id_) for id_ in child_ids
                ],
                "sentence": EntityReference(type_="sentence", id_=sentence_id)
                if sentence_id is not None
                else EntityReference(type_="sentence", id_=None),
                "defining_formula_equations": [
                    EntityReference(type_="equation", id_=id_)
                    for id_ in other_formula_ids
                ],
                "snippet_sentences": [
                    EntityReference(type_="sentence", id_=id_)
                    for id_ in other_context_sentence_ids
                ],
                # "snippet_sentence": EntityReference(
                #     type_="sentence", id_=f"{symbol_id.tex_path}-{context.sentence_id}"
                # )
                # if context is not None
                # else None,
                # "formula_equation": EntityReference(
                #     type_="equation",
                #     id_=f"{symbol_id.tex_path}-{formula.equation_id}"
                #     if formula is not None
                #     else None,
                # ),
            }

            entity_information = EntityInformation(
                id_=(
                    f"{symbol_id.tex_path}-{symbol_id.equation_index}"
                    f"-{symbol_id.symbol_index}"
                ),
                type_="symbol",
                bounding_boxes=[box],
                data=data,
                relationships=relationships,
            )
            entity_infos.append(entity_information)

        upload_entities(item.s2_id, item.arxiv_id, entity_infos,
                        self.args.data_version)
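
The snippet and formula lists above are deduplicated with linear membership tests on plain lists, which is quadratic in the number of matches. A set-backed, order-preserving variant is sketched below; dedup_ordered is a hypothetical helper, not part of this pipeline.

from typing import Iterable, List, Tuple


def dedup_ordered(pairs: Iterable[Tuple[str, str]]) -> Tuple[List[str], List[str]]:
    # Deduplicate (id, payload) pairs by ID, preserving first-seen order.
    # The set gives O(1) membership checks; the lists keep the ordering.
    seen = set()
    ids: List[str] = []
    payloads: List[str] = []
    for id_, payload in pairs:
        if id_ not in seen:
            seen.add(id_)
            ids.append(id_)
            payloads.append(payload)
    return ids, payloads


# Example: build other_context_sentence_ids / other_context_texs in one pass.
# other_context_sentence_ids, other_context_texs = dedup_ordered(
#     (f"{c.tex_path}-{c.sentence_id}", c.snippet) for c in matching_contexts
# )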
Example #2
def upload_symbols(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:

    arxiv_id = processing_summary.arxiv_id
    entities = [es.entity for es in processing_summary.entities]
    symbols = cast(List[SerializableSymbol], entities)
    symbols_by_id = {sid(s): s for s in symbols}

    entity_infos: List[EntityUploadInfo] = []

    # Load MathML matches for partial matching of symbols.
    matches: Matches = {}
    matches_path = os.path.join(
        directories.arxiv_subdir("symbol-matches", processing_summary.arxiv_id),
        "matches.csv",
    )
    if os.path.exists(matches_path):
        for match in file_utils.load_from_csv(matches_path, Match):
            if match.queried_mathml not in matches:
                matches[match.queried_mathml] = []
            matches[match.queried_mathml].append(match)
    else:
        logging.warning(
            "Could not find symbol matches information for paper %s.", arxiv_id,
        )

    # Load parent-child relationships for symbols.
    children: Dict[SymbolId, List[SymbolId]] = defaultdict(list)
    parents: Dict[SymbolId, SymbolId] = {}
    children_path = os.path.join(
        directories.arxiv_subdir("detected-symbols", arxiv_id), "symbol_children.csv"
    )
    if os.path.exists(children_path):
        for parent in file_utils.load_from_csv(children_path, SerializableChild):
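            # Symbol IDs are serialized as "{tex_path}-{equation_index}-{symbol_index}";
            # the child ID substitutes the child's symbol index into the same scheme.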
            pid = f"{parent.tex_path}-{parent.equation_index}-{parent.symbol_index}"
            cid = f"{parent.tex_path}-{parent.equation_index}-{parent.child_index}"
            parents[cid] = pid
            children[pid].append(cid)
    else:
        logging.warning(
            "Could not find file mapping from symbol to their children for paper %s.",
            arxiv_id,
        )

    # Load contexts that the symbols appear in. Sort them by the symbol MathML.
    context_data_missing = False
    contexts_path = os.path.join(
        directories.arxiv_subdir("contexts-for-symbols", arxiv_id), "contexts.csv",
    )
    if not os.path.exists(contexts_path):
        logging.warning(  # pylint: disable=logging-not-lazy
            "Contexts have not been found for symbols for arXiv paper %s. "
            + "Symbol data will be uploaded without contexts.",
            arxiv_id,
        )
        context_data_missing = True

    symbol_contexts = {}
    mathml_contexts = defaultdict(list)
    if not context_data_missing:
        for context in file_utils.load_from_csv(contexts_path, Context):
            tex_path = context.tex_path
            symbol_id = f"{tex_path}-{context.entity_id}"
            symbol_contexts[symbol_id] = context
            symbol = symbols_by_id[symbol_id]
            mathml_contexts[symbol.mathml].append(context)

    # Prepare collections of formulae that each symbol was found in.
    symbol_formulas = {}
    mathml_formulas: Dict[str, Set[DefiningFormula]] = defaultdict(set)
    for symbol in symbols:
        if (
            symbol.is_definition
            and symbol.equation is not None
            and symbol.relative_start is not None
            and symbol.relative_end is not None
        ):
            highlighted = wrap_span(
                symbol.equation,
                symbol.relative_start,
                symbol.relative_end,
                before=r"\htmlClass{match-highlight}{",
                after="}",
                braces=True,
            )
            formula = DefiningFormula(
                tex=highlighted,
                tex_path=symbol.tex_path,
                equation_id=str(symbol.equation_index),
            )
            symbol_formulas[sid(symbol)] = formula
            mathml_formulas[symbol.mathml].add(formula)

    for localized_entity in processing_summary.entities:

        symbol = cast(SerializableSymbol, localized_entity.entity)
        boxes = [
            BoundingBox(l.left, l.top, l.width, l.height, l.page)
            for l in localized_entity.locations
        ]

        # Get context and formula of the symbol, and other matching ones.
        symbol_context = symbol_contexts.get(sid(symbol))
        matching_contexts = mathml_contexts.get(symbol.mathml, [])
        other_context_texs = []
        other_context_sentence_ids = []
        for c in matching_contexts:
            matching_sentence_id = f"{c.tex_path}-{c.sentence_id}"
            if matching_sentence_id not in other_context_sentence_ids:
                other_context_texs.append(c.snippet)
                other_context_sentence_ids.append(matching_sentence_id)

        matching_formulas = mathml_formulas.get(symbol.mathml, set())
        other_formula_texs = []
        other_formula_ids = []
        for f in matching_formulas:
            equation_id = f"{f.tex_path}-{f.equation_id}"
            if equation_id not in other_formula_ids:
                other_formula_texs.append(f.tex)
                other_formula_ids.append(equation_id)

        # Package up data for the symbol.
        tags: List[str] = []
        MAX_BOX_HEIGHT = 0.1
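        # Box dimensions appear to be expressed as fractions of the page size (an
        # inference from the threshold's magnitude), so 0.1 flags boxes taller
        # than 10% of the page.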
        for b in boxes:
            if b.height > MAX_BOX_HEIGHT:
                logging.debug(  # pylint: disable=logging-not-lazy
                    "Detected large bounding box for symbol with height %f for entity %s of paper "
                    + "%s. Entity will be given a tag indicating it is unexpectedly large.",
                    b.height,
                    f"{localized_entity.entity.tex_path}-{localized_entity.entity.id_}",
                    arxiv_id,
                )
                tags.append("large")
                break

        data: EntityData = {
            "tex": f"${symbol.tex}$",
            "tex_start": symbol.start,
            "tex_end": symbol.end,
            "type": symbol.type_,
            "mathml": symbol.mathml,
            "mathml_near_matches": [m.matching_mathml for m in matches[symbol.mathml]],
            "snippets": other_context_texs,
            "defining_formulas": other_formula_texs,
            "is_definition": symbol.is_definition or False,
            "tags": tags,
        }

        # Create links between this symbol, its sentence, and related symbols.
        sentence_id = (
            f"{symbol_context.tex_path}-{symbol_context.sentence_id}"
            if symbol_context is not None
            else None
        )

        parent_id = parents.get(sid(symbol))
        child_ids = children.get(sid(symbol), [])

        relationships: EntityRelationships = {
            "equation": EntityReference(
                type_="equation", id_=f"{symbol.tex_path}-{symbol.equation_index}",
            ),
            "parent": EntityReference(type_="symbol", id_=parent_id),
            "children": [EntityReference(type_="symbol", id_=id_) for id_ in child_ids],
            "sentence": EntityReference(type_="sentence", id_=sentence_id)
            if sentence_id is not None
            else EntityReference(type_="sentence", id_=None),
            "defining_formula_equations": [
                EntityReference(type_="equation", id_=id_) for id_ in other_formula_ids
            ],
            "snippet_sentences": [
                EntityReference(type_="sentence", id_=id_)
                for id_ in other_context_sentence_ids
            ],
        }

        # Save all data for this symbol
        entity_information = EntityUploadInfo(
            id_=sid(symbol),
            type_="symbol",
            bounding_boxes=boxes,
            data=data,
            relationships=relationships,
        )
        entity_infos.append(entity_information)

    upload_entities(
        processing_summary.s2_id, arxiv_id, entity_infos, data_version,
    )
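
For context, wrap_span (used above to build the highlighted formula) inserts delimiter strings around a character span of the equation TeX. The following is a rough sketch of that behavior under stated assumptions; it ignores the braces option, whose exact semantics are an assumption here.

def wrap_span_sketch(tex: str, start: int, end: int, before: str, after: str) -> str:
    # Wrap tex[start:end] in the given delimiters, e.g. a KaTeX
    # \htmlClass{match-highlight}{...} wrapper. The real wrap_span also
    # handles brace balancing (braces=True), which this sketch omits.
    return tex[:start] + before + tex[start:end] + after + tex[end:]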
Example #3
def upload_terms(processing_summary: PaperProcessingResult,
                 data_version: Optional[int]) -> None:

    arxiv_id = processing_summary.arxiv_id
    contexts = file_utils.load_from_csv(
        os.path.join(
            directories.arxiv_subdir("contexts-for-glossary-terms", arxiv_id),
            "contexts.csv",
        ),
        Context,
    )
    contexts_by_entity = {(c.tex_path, c.entity_id): c for c in contexts}

    # Assemble contexts that should be shown for each term.
    contexts_by_term: Dict[str, List[Context]] = defaultdict(list)
    for entity_and_location in processing_summary.localized_entities:
        term = cast(Term, entity_and_location.entity)
        if (term.tex_path, term.id_) in contexts_by_entity:
            contexts_by_term[term.text].append(
                contexts_by_entity[(term.tex_path, term.id_)])

    entity_infos = []
    for entity_and_location in processing_summary.localized_entities:
        term = cast(Term, entity_and_location.entity)
        context = contexts_by_entity.get((term.tex_path, term.id_))
        boxes = [cast(BoundingBox, l) for l in entity_and_location.locations]

        # Cluster bounding boxes, in case any of these terms are defined as a macro
        # (in which case all appearances of that term on the same page will have
        # been lumped together).
        clusters = cluster_boxes(boxes, vertical_split=0.005)
        for i, cluster in enumerate(clusters):
            entity_info = EntityInformation(
                id_=f"{term.tex_path}-{term.id_}-{i}",
                type_="term",
                bounding_boxes=list(cluster),
                data={
                    "name": term.text,
                    "definitions": term.definitions,
                    "definition_texs": term.definitions,
                    "sources": term.sources,
                    "snippets": [
                        c.snippet for c in contexts_by_term.get(term.text, [])
                    ],
                },
                relationships={
                    "sentence": EntityReference(
                        type_="sentence",
                        id_=f"{context.tex_path}-{context.sentence_id}"
                        if context is not None
                        else None,
                    ),
                    "snippet_sentences": [
                        EntityReference(
                            type_="sentence", id_=f"{c.tex_path}-{c.sentence_id}"
                        )
                        for c in contexts_by_term.get(term.text, [])
                    ],
                },
            )
            entity_infos.append(entity_info)

    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        entity_infos,
        data_version,
    )
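
cluster_boxes itself is not shown here. The sketch below is a guess at its core idea, grouping boxes that are vertically close; the real implementation may differ (for example, in how it treats pages or horizontal position), and cluster_boxes_sketch is a hypothetical name.

from typing import Iterable, Iterator, List


def cluster_boxes_sketch(
    boxes: Iterable[BoundingBox], vertical_split: float
) -> Iterator[List[BoundingBox]]:
    # Assumes the BoundingBox type used above (left/top/width/height/page).
    # Sort boxes top-to-bottom and start a new cluster whenever the vertical
    # gap to the previous box exceeds the threshold.
    ordered = sorted(boxes, key=lambda b: (b.page, b.top))
    cluster: List[BoundingBox] = []
    for box in ordered:
        if cluster and (
            box.page != cluster[-1].page
            or box.top - cluster[-1].top > vertical_split
        ):
            yield cluster
            cluster = []
        cluster.append(box)
    if cluster:
        yield cluster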
Example #4
def upload_definitions(processing_summary: PaperProcessingResult,
                       data_version: Optional[int]) -> None:

    term_infos = []
    definition_infos = []
    for entity_and_location in processing_summary.localized_entities:
        boxes = [cast(BoundingBox, l) for l in entity_and_location.locations]
        entity = entity_and_location.entity

        if entity.id_.startswith("definition"):
            definition = cast(Definition, entity)
            definition_info = EntityInformation(
                id_=definition.id_,
                type_="definition",
                bounding_boxes=boxes,
                data={
                    "definiendum": definition.definiendum,
                    "definition": definition.text,
                    "tex": definition.tex,
                },
                relationships={
                    "sentence": EntityReference(
                        type_="sentence",
                        id_=f"{definition.tex_path}-{definition.sentence_id}"
                        if definition.sentence_id is not None
                        else None,
                    ),
                },
            )
            definition_infos.append(definition_info)

        if entity.id_.startswith("definiendum") or entity.id_.startswith(
                "term-reference"):
            term = cast(TermReference, entity)
            term_info = EntityInformation(
                id_=term.id_,
                type_="term",
                bounding_boxes=boxes,
                data={
                    "name": term.text,
                    "definitions": term.definitions,
                    "definition_texs": term.definition_texs,
                    "sources": term.sources,
                    "term_type": term.type_ or "unknown"
                },
                relationships={
                    "sentence": EntityReference(
                        type_="sentence",
                        id_=f"{term.tex_path}-{term.sentence_id}"
                        if term.sentence_id is not None
                        else None,
                    ),
                    "definitions": [
                        EntityReference(type_="definition", id_=d)
                        for d in term.definition_ids
                    ],
                },
            )
            term_infos.append(term_info)

    # Upload definitions before terms, because terms hold references to definitions that can
    # only be resolved once the definitions have been uploaded.
    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        definition_infos,
        data_version,
    )
    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        term_infos,
        data_version,
    )
Example #5
    def save(self, item: SymbolData, _: None) -> None:
        symbols_with_ids = item.symbols_with_ids
        boxes = item.boxes
        matches = item.matches
        symbol_sentences = item.symbol_sentences

        symbol_ids_by_symbol_object_ids = {}
        for symbol_with_id in symbols_with_ids:
            symbol_object_id = id(symbol_with_id.symbol)
            symbol_ids_by_symbol_object_ids[symbol_object_id] = symbol_with_id.symbol_id

        entity_infos = []

        for symbol_with_id in symbols_with_ids:
            symbol = symbol_with_id.symbol
            symbol_id = symbol_with_id.symbol_id

            box = boxes.get(symbol_id)
            if box is None:
                continue

            data: EntityData = {
                "tex": f"${symbol.tex}$",
                "tex_start": symbol.start,
                "tex_end": symbol.end,
                "mathml": symbol.mathml,
                "mathml_near_matches": [
                    m.matching_mathml for m in matches[symbol.mathml]
                ],
            }

            sentence_key = symbol_sentences.get(symbol_id)
            sentence_id = (
                f"{sentence_key.tex_path}-{sentence_key.sentence_id}"
                if sentence_key is not None else None)

            child_ids = []
            for child_symbol in symbol.children:
                child_symbol_id = symbol_ids_by_symbol_object_ids[id(child_symbol)]
                string_id = (
                    f"{child_symbol_id.tex_path}-{child_symbol_id.equation_index}"
                    f"-{child_symbol_id.symbol_index}"
                )
                child_ids.append(string_id)

            relationships: EntityRelationships = {
                "children": [
                    EntityReference(type_="symbol", id_=id_) for id_ in child_ids
                ],
                "sentence": EntityReference(type_="sentence", id_=None)
                if sentence_id is None
                else EntityReference(type_="sentence", id_=sentence_id),
            }

            entity_information = EntityInformation(
                id_=(
                    f"{symbol_id.tex_path}-{symbol_id.equation_index}"
                    f"-{symbol_id.symbol_index}"
                ),
                type_="symbol",
                bounding_boxes=[box],
                data=data,
                relationships=relationships,
            )
            entity_infos.append(entity_information)

        upload_entities(item.s2_id, item.arxiv_id, entity_infos,
                        self.args.data_version)
Example #6
def upload_term_definitions(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:
    " Upload textual terms and their definitions. "

    # Group contextual snippets for each term.
    term_infos = []
    contexts_by_term_name: Dict[TermName, List[Context]] = defaultdict(list)
    for entity_summary in processing_summary.entities:
        entity = entity_summary.entity
        context = entity_summary.context
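        # is_textual_term presumably checks that the entity is a term-like type
        # with a 'text' attribute; the 'type: ignore' below is needed because
        # mypy cannot narrow the entity's type through that helper.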
        if is_textual_term(entity) and context is not None:
            contexts_by_term_name[entity.text].append(context)  # type: ignore

    # Construct mapping from definitions to the sentences that contain them.
    contexts_by_definition: Dict[EntityId, Context] = {}
    for entity_summary in processing_summary.entities:
        entity_id = entity_summary.entity.id_
        context = entity_summary.context
        if (entity_id.startswith("definition")) and context is not None:
            contexts_by_definition[entity_id] = context

    # Upload information for each term.
    for entity_summary in processing_summary.entities:
        boxes = [cast(BoundingBox, l) for l in entity_summary.locations]
        entity = entity_summary.entity
        context = entity_summary.context

        if not is_textual_term(entity):
            continue

        term = cast(TermReference, entity)

        # Assemble list of snippets that include this term.
        contexts_matching_term = contexts_by_term_name.get(term.text, [])
        snippets = [c.snippet for c in contexts_matching_term]
        snippet_sentences = [
            f"{c.tex_path}-{c.sentence_id}" for c in contexts_matching_term
        ]

        # Create links to the sentences containing definitions for this term.
        definition_sentences: List[Optional[str]] = []
        for definition_id in term.definition_ids:
            if definition_id not in contexts_by_definition:
                definition_sentences.append(None)
                # Skip the lookup below, which would otherwise raise a KeyError.
                continue
            definition_context = contexts_by_definition[definition_id]
            definition_sentences.append(
                f"{definition_context.tex_path}-{definition_context.sentence_id}"
            )

        term_info = EntityUploadInfo(
            id_=term.id_,
            type_="term",
            bounding_boxes=boxes,
            data={
                "name": term.text,
                "term_type": term.type_ or "unknown",
                "definitions": term.definitions,
                "definition_texs": term.definition_texs,
                "sources": term.sources,
                # A list of all the other sentences in the paper in which the term appears.
                "snippets": snippets,
            },
            relationships={
                # Link the term to the sentence it belongs to. This link is necessary to enable
                # visual filtering in the UI where, when a term is clicked, the sentence is
                # highlighted and all others are lowlighted.
                "sentence": EntityReference(
                    type_="sentence",
                    id_=f"{context.tex_path}-{context.sentence_id}"
                    if context is not None
                    else None,
                ),
                # IDs of the sentences that contain each of the definitions for a term. These IDs
                # can be used to establish links that take a user to the site of a definition.
                "definition_sentences": [
                    EntityReference(type_="sentence", id_=id_)
                    for id_ in definition_sentences
                ],
                # The IDs of each sentence where the term appears elsewhere in the paper (i.e.,
                # for each of the 'snippets' in the entity data above). Used to link from a snippet
                # that is shown in a list of snippets to where that snippet appears in the paper.
                "snippet_sentences": [
                    EntityReference(type_="sentence", id_=id_)
                    for id_ in snippet_sentences
                ],
            },
        )
        term_infos.append(term_info)

    upload_entities(
        processing_summary.s2_id, processing_summary.arxiv_id, term_infos, data_version,
    )
Example #7
def upload_symbol_definitions(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:
    " Upload symbols and their definitions. "

    # Associate definitions with symbols as follows:
    # Definitions will be associated with entire equations as per the current implementation
    # of the definition detector. Conservatively, associate a definition for an equation
    # with a single symbol only if that symbol is the *only* top-level symbol in that equation.

    # Load symbols from files. Group symbols by equation to make it easy to detect whether a
    # symbol is the only top-level symbol in the equation.
    symbols_by_equation: Dict[
        Tuple[TexPath, EquationIndex], List[Symbol]
    ] = defaultdict(list)
    symbols: List[Symbol] = []

    symbols_with_ids = file_utils.load_symbols(processing_summary.arxiv_id)
    if symbols_with_ids is None:
        logging.info(  # pylint: disable=logging-not-lazy
            "No symbols were loaded for paper %s. Therefore, no definitions for symbols "
            + "will be uploaded for this paper.",
            processing_summary.arxiv_id,
        )
        return

    for _, symbol in symbols_with_ids:
        symbols_by_equation[symbol.tex_path, symbol.equation_index].append(symbol)
        symbols.append(symbol)

    # Group symbols by their MathML. These groups will be used to propagate definitions from
    # one defined symbol to all other appearances of that symbol.
    symbols_by_mathml: Dict[MathML, List[Symbol]] = defaultdict(list)
    for symbol in symbols:
        symbols_by_mathml[symbol.mathml].append(symbol)

    # Construct map from definitions to the sentences that contain them.
    contexts_by_definition: Dict[EntityId, Context] = {}
    for entity_summary in processing_summary.entities:
        entity_id = entity_summary.entity.id_
        context = entity_summary.context
        if (entity_id.startswith("definition")) and context is not None:
            contexts_by_definition[entity_id] = context

    # Fetch rows for all entities for this paper that have already been uploaded to the database.
    # This allows lookup of the row IDs for the sentences that contain definitions of symbols.
    entity_models = fetch_entity_models(processing_summary.s2_id, data_version)

    # Create a list of rows to insert into the database containing definition data.
    entity_data_models: List[EntityDataModel] = []
    for entity_summary in processing_summary.entities:
        entity = entity_summary.entity
        if not entity.id_.startswith("definiendum"):
            continue

        # Attempt to match definienda (defined terms) to symbols that are being defined.
        definiendum = cast(Definiendum, entity)
        defined_symbol = None
        for symbol in symbols:
            # Is the definiendum a symbol?
            if definiendum.type_ != "symbol":
                continue
            # Does the symbol fall within the range of characters being defined?
            if symbol.start < definiendum.start or symbol.end > definiendum.end:
                continue
            # Is the symbol a top-level symbol?
            if symbol.parent is not None:
                continue
            # Is it the *only* top-level symbol in its equation?
            top_level_symbols_in_equation = filter(
                # Top-level symbols are those without a parent.
                lambda s: s.parent is None,
                symbols_by_equation[(symbol.tex_path, symbol.equation_index)],
            )
            if len(list(top_level_symbols_in_equation)) > 1:
                continue

            defined_symbol = symbol
            logging.debug(  # pylint: disable=logging-not-lazy
                "Matched definiedum %s at position (%d, %d) to symbol %s at position "
                + "(%s, %s) for paper %s. A definition for this symbol will be uploaded.",
                definiendum.tex,
                definiendum.start,
                definiendum.end,
                symbol.tex,
                symbol.start,
                symbol.end,
                processing_summary.arxiv_id,
            )
            break

        if defined_symbol is None:
            continue

        # Assemble data about definitions for the symbol.
        definitions = definiendum.definitions
        definition_texs = definiendum.definition_texs
        sources = definiendum.sources
        definition_sentence_ids: List[Optional[str]] = []
        for definition_id in definiendum.definition_ids:
            context = contexts_by_definition.get(definition_id)
            if context is None:
                definition_sentence_ids.append(None)
            else:
                definition_sentence_ids.append(
                    f"{context.tex_path}-{context.sentence_id}"
                )

        # Find all symbols that are the same (i.e., that have the same MathML
        # representation), then save definition data so that it applies to all of
        # those symbols.
        matching_symbols = symbols_by_mathml.get(defined_symbol.mathml)
        if matching_symbols is not None:
            for s in matching_symbols:
                entity_model = entity_models.get(("symbol", sid(s)))
                data: EntityData = {
                    "definitions": definitions,
                    "definition_texs": definition_texs,
                    "sources": sources,
                }
                entity_data_models.extend(make_data_models(None, entity_model, data))

                relationships: EntityRelationships = {
                    "definition_sentences": [
                        EntityReference(type_="sentence", id_=id_)
                        for id_ in definition_sentence_ids
                    ],
                }
                entity_data_models.extend(
                    make_relationship_models(
                        ("symbol", sid(s)), relationships, entity_models
                    )
                )

    with output_database.atomic():
        EntityDataModel.bulk_create(entity_data_models, 200)
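
The propagation step above hinges on the symbols_by_mathml grouping: a definition matched to one occurrence of a symbol is written for every occurrence that shares its MathML. A toy illustration of that grouping, with Sym as a hypothetical stand-in for the pipeline's Symbol type:

from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class Sym:  # hypothetical stand-in for the pipeline's Symbol type
    mathml: str
    tex: str


occurrences = [Sym("<mi>x</mi>", "x"), Sym("<mi>y</mi>", "y"), Sym("<mi>x</mi>", "x")]
by_mathml: Dict[str, List[Sym]] = defaultdict(list)
for s in occurrences:
    by_mathml[s.mathml].append(s)

# A definition detected for either "x" occurrence applies to both:
assert len(by_mathml["<mi>x</mi>"]) == 2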