Example #1
def upload_entities(
    s2_id: S2Id,
    arxiv_id: ArxivId,
    entities: List[EntityUploadInfo],
    data_version: Optional[int] = None,
) -> None:
    """
    Before uploading entities, make sure to upload all other entities that the entity depends on.
    For example, if symbols have a data field that references sentences, make sure to upload the
    sentence entities before uploading the symbol entities. Otherwise, the references won't
    resolve to valid Postgres IDs and this function will crash.

    Set 'data_version' to 'None' (or don't specify it) if you want to tag the entities with the
    default data version. The default is to set the version index to a unique version number
    corresponding to this run of the pipeline.
    """

    try:
        paper = Paper.get(Paper.s2_id == s2_id)
    except Paper.DoesNotExist:
        paper = Paper.create(s2_id=s2_id, arxiv_id=arxiv_id)

    # Generate the version number of data for this table if one wasn't specified. This relies on
    # a paper having already been created, because the version table references the paper table.
    if data_version is None:
        data_version = get_or_create_data_version(paper.s2_id)

    entity_models = []
    bounding_box_models = []
    entity_data_models = []
    for entity in entities:

        # Create model for entity.
        entity_model = Entity(
            version=data_version,
            paper=paper,
            type=entity.type_,
            within_paper_id=entity.id_,
        )
        entity_models.append(entity_model)

        # Create models for bounding boxes.
        for box in entity.bounding_boxes:
            box_model = BoundingBoxModel(
                entity=entity_model,
                page=box.page,
                left=box.left,
                top=box.top,
                width=box.width,
                height=box.height,
            )
            bounding_box_models.append(box_model)

        # Create models for all entity data.
        if entity.data is not None:
            entity_data_models.extend(
                make_data_models(entity_model, None, entity.data))

    # Save models. This will assign unique IDs to each entity.
    with output_database.atomic():
        Entity.bulk_create(entity_models, 200)
    with output_database.atomic():
        BoundingBoxModel.bulk_create(bounding_box_models, 100)
    with output_database.atomic():
        EntityDataModel.bulk_create(entity_data_models, 200)

    # Upload entity relationships (i.e., references from the entities to other entities). This
    # should happen after the entity models are created, because the relationships may include
    # references between entities (e.g., some symbols may be children of others), and we need to
    # know the row IDs of the uploaded entities to make links between them in the database.
    all_entity_models = fetch_entity_models(s2_id, data_version)
    for entity_model in entity_models:
        all_entity_models[(entity_model.type,
                           entity_model.within_paper_id)] = entity_model

    entity_relationship_models: List[EntityDataModel] = []
    for entity in entities:
        if entity.relationships is not None:
            entity_relationship_models.extend(
                make_relationship_models((entity.type_, entity.id_),
                                         entity.relationships,
                                         all_entity_models))

    with output_database.atomic():
        EntityDataModel.bulk_create(entity_relationship_models, 200)
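
For orientation, here is a minimal, hypothetical call. The field names of EntityUploadInfo (type_, id_, bounding_boxes, data, relationships) are inferred from how upload_entities reads them above; the IDs and coordinates are made up. Per the docstring, any entities this one references must already be uploaded.

from types import SimpleNamespace

# Hypothetical usage sketch; values are invented, and the bounding box only
# needs the .page/.left/.top/.width/.height attributes read above.
box = SimpleNamespace(page=0, left=0.12, top=0.40, width=0.05, height=0.02)
entity = EntityUploadInfo(
    type_="symbol",
    id_="main.tex-sym-1",
    bounding_boxes=[box],
    data={"tex": "x_i"},
    relationships=None,
)
upload_entities(s2_id="0123abcd", arxiv_id="1811.12889", entities=[entity])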
Example #2
def upload_sentences(processing_summary: PaperProcessingResult) -> None:

    arxiv_id = processing_summary.arxiv_id
    s2_id = processing_summary.s2_id

    # Create entry for the paper if it does not yet exist
    try:
        paper = Paper.get(Paper.s2_id == s2_id)
    except Paper.DoesNotExist:
        paper = Paper.create(s2_id=s2_id, arxiv_id=arxiv_id)

    locations_by_sentence_id: Dict[SentenceKey, List[HueLocationInfo]] = {}
    sentences: Dict[SentenceKey, SentenceEntity] = {}
    sentence_models: Dict[SentenceKey, Sentence] = {}

    for entity_and_location in processing_summary.localized_entities:
        sentence = cast(SentenceEntity, entity_and_location.entity)
        sentence_model = Sentence(paper=paper, text=sentence.text)

        sentence_key = SentenceKey(sentence.tex_path, sentence.id_)
        locations_by_sentence_id[sentence_key] = entity_and_location.locations
        sentence_models[sentence_key] = sentence_model
        sentences[sentence_key] = sentence

    with output_database.atomic():
        Sentence.bulk_create(sentence_models.values(), 100)

    # Save the IDs for the sentence models so that they can be used in downstream
    # tasks, like uploading links between symbols and the sentences they appear in.
    model_ids_dir = directories.arxiv_subdir("sentences-model-ids", arxiv_id)
    if os.path.exists(model_ids_dir):
        file_utils.clean_directory(model_ids_dir)
    else:
        os.makedirs(model_ids_dir)
    output_ids_path = os.path.join(model_ids_dir, "model_ids.csv")
    for id_, sentence_entity in sentences.items():
        file_utils.append_to_csv(
            output_ids_path,
            SentenceIdAndModelId(
                tex_path=sentence_entity.tex_path,
                entity_id=sentence_entity.id_,
                model_id=sentence_models[id_].id,
            ),
        )

    entities = []
    entity_bounding_boxes = []
    bounding_boxes = []

    for sentence_id, sentence_model in sentence_models.items():

        entity = Entity(type="sentence",
                        source="tex-pipeline",
                        entity_id=sentence_model.id)
        entities.append(entity)

        for location in locations_by_sentence_id[sentence_id]:
            bounding_box = BoundingBox(
                page=location.page,
                left=location.left,
                top=location.top,
                width=location.width,
                height=location.height,
            )
            bounding_boxes.append(bounding_box)

            entity_bounding_box = EntityBoundingBox(bounding_box=bounding_box,
                                                    entity=entity)
            entity_bounding_boxes.append(entity_bounding_box)

    with output_database.atomic():
        BoundingBox.bulk_create(bounding_boxes, 100)
    with output_database.atomic():
        Entity.bulk_create(entities, 300)
    with output_database.atomic():
        EntityBoundingBox.bulk_create(entity_bounding_boxes, 300)
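
A downstream task can rebuild the sentence-to-row-ID map from model_ids.csv. A minimal sketch, assuming file_utils.append_to_csv writes a header row with the SentenceIdAndModelId field names:

import csv
from typing import Dict, Tuple

# Sketch of a downstream reader for model_ids.csv; assumes a header row with
# the SentenceIdAndModelId field names.
model_ids: Dict[Tuple[str, str], str] = {}
with open(output_ids_path) as ids_file:
    for row in csv.DictReader(ids_file):
        model_ids[(row["tex_path"], row["entity_id"])] = row["model_id"]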
Example #3
def upload_citations(item: CitationData, source: str = "tex-pipeline") -> None:
    arxiv_id = item.arxiv_id
    s2_id = item.s2_id
    citation_locations = item.citation_locations
    key_s2_ids = item.key_s2_ids
    s2_data = item.s2_data

    try:
        paper = Paper.get(Paper.s2_id == s2_id)
    except Paper.DoesNotExist:
        paper = Paper.create(s2_id=s2_id, arxiv_id=arxiv_id)

    for citation_key, locations in citation_locations.items():
        cited_paper = None
        if citation_key in key_s2_ids:
            # The cited paper's S2 ID, kept distinct from the citing paper's s2_id.
            cited_s2_id = key_s2_ids[citation_key]
            if cited_s2_id in s2_data:
                paper_data = s2_data[cited_s2_id]

                try:
                    cited_paper = Paper.get(Paper.s2_id == cited_s2_id)
                except Paper.DoesNotExist:
                    cited_paper = Paper.create(s2_id=cited_s2_id,
                                               arxiv_id=paper_data.arxivId
                                               or None)

                try:
                    Summary.get(Summary.paper == cited_paper)
                except Summary.DoesNotExist:
                    Summary.create(
                        paper=cited_paper,
                        title=paper_data.title,
                        authors=paper_data.authors,
                        doi=paper_data.doi or None,
                        venue=paper_data.venue or None,
                        year=paper_data.year,
                    )

        for location_set in locations.values():
            citation = Citation.create(paper=paper)
            if cited_paper:
                try:
                    with output_database.atomic():
                        CitationPaper.create(paper=cited_paper,
                                             citation=citation)
                except IntegrityError:
                    logging.warning(  # pylint: disable=logging-not-lazy
                        ("Cited paper %s and citation %s are already linked. This suggests a bug in "
                         +
                         "the citation resolution code; perhaps multiple citation keys were "
                         + "matched to the same paper?"),
                        cited_paper.arxiv_id,
                        citation.id,
                    )

            entity = Entity.create(type="citation",
                                   source=source,
                                   entity_id=citation.id)

            for location in location_set:
                bounding_box = BoundingBoxModel.create(
                    page=location.page,
                    left=location.left,
                    top=location.top,
                    width=location.width,
                    height=location.height,
                )

                EntityBoundingBox.create(bounding_box=bounding_box,
                                         entity=entity)
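
The try/except Paper.DoesNotExist idiom that recurs in these examples can also be written with peewee's built-in get_or_create, which is roughly equivalent:

# Roughly equivalent to the try/except pattern above. 'defaults' supplies
# fields that are only used when a new row has to be created.
paper, created = Paper.get_or_create(s2_id=s2_id, defaults={"arxiv_id": arxiv_id})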
Example #4
def upload_entities(
    s2_id: S2Id,
    arxiv_id: ArxivId,
    entities: List[EntityInformation],
    data_version: Optional[int] = None,
) -> None:
    """
    Before uploading entities, make sure to upload all other entities that the entity depends on.
    For example, if symbols have a data field that references sentences, make sure to upload the
    sentence entities before uploading the symbol entities. Otherwise, the references won't
    resolve to valid Postgres IDs and this function will crash.

    Set 'data_version' to 'None' (or don't specify it) if you want to tag the entities with the
    default data version. The default is to set the version index to a unique version number
    corresponding to this run of the pipeline.
    """

    try:
        paper = Paper.get(Paper.s2_id == s2_id)
    except Paper.DoesNotExist:
        paper = Paper.create(s2_id=s2_id, arxiv_id=arxiv_id)

    # Generate the version number of data for this table if one wasn't specified. This relies on
    # a paper having already been created, because the version table references the paper table.
    if data_version is None:
        data_version = get_or_create_data_version(paper.s2_id)

    entity_models = []
    bounding_box_models = []
    entity_data_models = []
    for entity in entities:

        # Create model for entity.
        entity_model = Entity(
            version=data_version,
            paper=paper,
            type=entity.type_,
            within_paper_id=entity.id_,
        )
        entity_models.append(entity_model)

        # Create models for bounding boxes.
        for box in entity.bounding_boxes:
            box_model = BoundingBoxModel(
                entity=entity_model,
                page=box.page,
                left=box.left,
                top=box.top,
                width=box.width,
                height=box.height,
            )
            bounding_box_models.append(box_model)

        # Create models for each field in the entity data.
        if entity.data is not None:
            for key, value in entity.data.items():

                # The value may have been specified as a list, or as a single scalar value.
                # Unpack all of the values for this key into a list.
                values: List[Any] = []
                if isinstance(value, list):
                    of_list = True
                    values.extend(value)
                else:
                    of_list = False
                    values.append(value)

                value_types = {type(v) for v in values}
                if len(value_types) != 1:
                    logging.warning(  # pylint: disable=logging-not-lazy
                        "Attempted to upload multiple primitive types of data for key %s. "
                        + "Types were %s. Not permitted. Skipping this value.",
                        key,
                        value_types,
                    )
                    continue

                # Create a new row for each value, with information of the base data type
                # and whether that row belongs to a list. If casting of values needs to occur
                # for values based on type to make them appropriate for insertion in Postgres
                # (e.g., casting booleans to 0 / 1), that should happen here.
                for v in values:
                    type_ = None
                    # Check for boolean needs to come before check for int, because booleans
                    # will pass the check 'isinstance(v, int)'.
                    if isinstance(v, bool):
                        type_ = "boolean"
                        v = 1 if v else 0
                    elif isinstance(v, int):
                        type_ = "integer"
                    elif isinstance(v, float):
                        type_ = "float"
                    elif isinstance(v, str):
                        type_ = "string"

                    if type_ is not None:
                        entity_data_models.append(
                            EntityDataModel(
                                entity=entity_model,
                                key=key,
                                value=v,
                                item_type=type_,
                                of_list=of_list,
                                relation_id=None,
                            )
                        )

    # Save models. This will assign unique IDs to each entity.
    with output_database.atomic():
        Entity.bulk_create(entity_models, 200)
    with output_database.atomic():
        BoundingBoxModel.bulk_create(bounding_box_models, 100)
    with output_database.atomic():
        EntityDataModel.bulk_create(entity_data_models, 200)

    # Upload entity relationships (i.e., references from the entities to other entities). This
    # should happen after the entity models are created, because the relationships may include
    # references between entities (e.g., some symbols may be children of others), and we need to
    # know the row IDs of the uploaded entities to make links between them in the database.
    uploaded_entity_models: Dict[Tuple[str, str], Entity] = {}

    # Build a map from the within-paper entity IDs to database row IDs. It is assumed that the
    # number of entities already uploaded for this paper won't be so many that they can't all
    # be stored in memory.
    uploaded_entities = Entity.select().where(
        Entity.paper_id == s2_id, Entity.version == data_version
    )
    for entity_model in uploaded_entities:
        uploaded_entity_models[
            (entity_model.type, entity_model.within_paper_id)
        ] = entity_model

    def resolve_model(entity_type: str, within_paper_id: str) -> Optional[Entity]:
        " Helper for resolving an entity ID into a database row ID. "
        try:
            return uploaded_entity_models[(entity_type, within_paper_id)]
        except KeyError:
            logging.warning(  # pylint: disable=logging-not-lazy
                "Could not upload reference to entity %s of type "
                + "%s, because no database row ID could be found for the entity %s of type "
                + "%s. Check to make sure the data for entities of type %s have been uploaded.",
                within_paper_id,
                entity_type,
                within_paper_id,
                entity_type,
                entity_type,
            )
            return None

    entity_relationship_models: List[EntityDataModel] = []
    for entity in entities:

        # Get the row ID for the entity.
        model = resolve_model(entity.type_, entity.id_)
        if model is None or entity.relationships is None:
            continue

        for k, v in entity.relationships.items():
            if isinstance(v, EntityReference):
                referenced_model = resolve_model(v.type_, v.id_)
                if referenced_model is not None:
                    entity_relationship_models.append(
                        EntityDataModel(
                            entity_id=model,
                            key=k,
                            value=referenced_model.id,
                            item_type="relation-id",
                            of_list=False,
                            relation_type=v.type_,
                        )
                    )
            elif (
                isinstance(v, list) and len(v) > 0 and isinstance(v[0], EntityReference)
            ):
                for reference in v:
                    referenced_model = resolve_model(reference.type_, reference.id_)
                    if referenced_model is not None:
                        entity_relationship_models.append(
                            EntityDataModel(
                                entity_id=model,
                                key=k,
                                value=referenced_model.id,
                                item_type="relation-id",
                                of_list=True,
                                relation_type=reference.type_,
                            )
                        )

    with output_database.atomic():
        EntityDataModel.bulk_create(entity_relationship_models, 200)
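
The order of the type checks above matters because bool is a subclass of int in Python, so a boolean value also satisfies isinstance(v, int); the boolean branch must come first in the if/elif chain:

# bool is a subclass of int, so booleans pass the integer check too; testing
# for bool first (and chaining with elif) keeps item_type == "boolean".
assert isinstance(True, int)
assert isinstance(True, bool)
assert not isinstance(1, bool)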
Example #5
def upload_symbol_definitions(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:
    " Upload symbols and their definitions. "

    # Associate definitions with symbols as follows:
    # Definitions will be associated with entire equations as per the current implementation
    # of the definition detector. Conservatively, associate a definition for an equation
    # with a single symbol only if that symbol is the *only* top-level symbol in that equation.

    # Load symbols from files. Group symbols by equation to make it easy to detect whether a
    # symbol is the only top-level symbol in the equation.
    symbols_by_equation: Dict[
        Tuple[TexPath, EquationIndex], List[Symbol]
    ] = defaultdict(list)
    symbols: List[Symbol] = []

    symbols_with_ids = file_utils.load_symbols(processing_summary.arxiv_id)
    if symbols_with_ids is None:
        logging.info(  # pylint: disable=logging-not-lazy
            "No symbols were loaded for paper %s. Therefore, no definitions for symbols "
            + "will be uploaded for this paper.",
            processing_summary.arxiv_id,
        )
        return

    for _, symbol in symbols_with_ids:
        symbols_by_equation[symbol.tex_path, symbol.equation_index].append(symbol)
        symbols.append(symbol)

    # Group symbols by their MathML. These groups will be used to propagate definitions from
    # one defined symbol to all other appearances of that symbol.
    symbols_by_mathml: Dict[MathML, List[Symbol]] = defaultdict(list)
    for symbol in symbols:
        symbols_by_mathml[symbol.mathml].append(symbol)

    # Construct map from definitions to the sentences that contain them.
    contexts_by_definition: Dict[EntityId, Context] = {}
    for entity_summary in processing_summary.entities:
        entity_id = entity_summary.entity.id_
        context = entity_summary.context
        if entity_id.startswith("definition") and context is not None:
            contexts_by_definition[entity_id] = context

    # Fetch rows for all entities for this paper that have already been uploaded to the database.
    # This allows lookup of the row IDs for the sentences that contain definitions of symbols.
    entity_models = fetch_entity_models(processing_summary.s2_id, data_version)

    # Create a list of rows to insert into the database containing definition data.
    entity_data_models: List[EntityDataModel] = []
    for entity_summary in processing_summary.entities:
        entity = entity_summary.entity
        if not entity.id_.startswith("definiendum"):
            continue

        # Attempt to match definienda (defined terms) to symbols that are being defined.
        definiendum = cast(Definiendum, entity)
        defined_symbol = None
        for symbol in symbols:
            # Is the definiendum a symbol?
            if definiendum.type_ != "symbol":
                continue
            # Does the symbol fall within the range of characters being defined?
            if symbol.start < definiendum.start or symbol.end > definiendum.end:
                continue
            # Is the symbol a top-level symbol?
            if symbol.parent is not None:
                continue
            # Is it the *only* top-level symbol in its equation?
            top_level_symbols_in_equation = filter(
                lambda s: s.parent is None,
                symbols_by_equation[(symbol.tex_path, symbol.equation_index)],
            )
            if len(list(top_level_symbols_in_equation)) > 1:
                continue

            defined_symbol = symbol
            logging.debug(  # pylint: disable=logging-not-lazy
                "Matched definiedum %s at position (%d, %d) to symbol %s at position "
                + "(%s, %s) for paper %s. A definition for this symbol will be uploaded.",
                definiendum.tex,
                definiendum.start,
                definiendum.end,
                symbol.tex,
                symbol.start,
                symbol.end,
                processing_summary.arxiv_id,
            )
            break

        if defined_symbol is None:
            continue

        # Assemble data about definitions for the symbol.
        definitions = definiendum.definitions
        definition_texs = definiendum.definition_texs
        sources = definiendum.sources
        definition_sentence_ids: List[Optional[str]] = []
        for definition_id in definiendum.definition_ids:
            context = contexts_by_definition.get(definition_id)
            if context is None:
                definition_sentence_ids.append(None)
            else:
                definition_sentence_ids.append(
                    f"{context.tex_path}-{context.sentence_id}"
                )

        # Find all symbols that are the same (i.e., that have the same MathML representation).
        # Then save the definition data so that it applies to all of those symbols.
        matching_symbols = symbols_by_mathml.get(defined_symbol.mathml)
        if matching_symbols is not None:
            for s in matching_symbols:
                entity_model = entity_models.get(("symbol", sid(s)))
                data: EntityData = {
                    "definitions": definitions,
                    "definition_texs": definition_texs,
                    "sources": sources,
                }
                entity_data_models.extend(make_data_models(None, entity_model, data))

                relationships: EntityRelationships = {
                    "definition_sentences": [
                        EntityReference(type_="sentence", id_=id_)
                        for id_ in definition_sentence_ids
                    ],
                }
                entity_data_models.extend(
                    make_relationship_models(
                        ("symbol", sid(s)), relationships, entity_models
                    )
                )

    with output_database.atomic():
        EntityDataModel.bulk_create(entity_data_models, 200)
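
To make the "only top-level symbol" rule concrete, here is a tiny self-contained check. FakeSymbol is a hypothetical stand-in that mirrors only the parent field inspected above:

from dataclasses import dataclass
from typing import Optional

@dataclass
class FakeSymbol:
    parent: Optional["FakeSymbol"] = None

x = FakeSymbol()
x_sub = FakeSymbol(parent=x)  # a subscript, i.e., a child of x

# Equation "x_i": exactly one top-level symbol, so a definition spanning the
# equation can be attached to x.
assert len([s for s in [x, x_sub] if s.parent is None]) == 1

# Equation "x + y": two top-level symbols, so the association is ambiguous
# and no definition is attached.
y = FakeSymbol()
assert len([s for s in [x, y] if s.parent is None]) == 2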
Example #6
def upload_symbols(item: SymbolData, source: str = "tex-pipeline") -> None:
    arxiv_id = item.arxiv_id
    s2_id = item.s2_id
    symbols_with_ids = item.symbols_with_ids
    boxes = item.boxes
    matches = item.matches

    try:
        paper = Paper.get(Paper.s2_id == s2_id)
    except Paper.DoesNotExist:
        paper = Paper.create(s2_id=s2_id, arxiv_id=arxiv_id)

    # Load MathML models into cache; they will be needed for creating multiple symbols.
    mathml_cache: Dict[MathML, MathMlModel] = {}
    mathml_equations = {swi.symbol.mathml for swi in symbols_with_ids}
    for mathml, mathml_matches in matches.items():
        mathml_equations.add(mathml)
        mathml_equations.update(match.matching_mathml for match in mathml_matches)
    for mathml in mathml_equations:
        if mathml not in mathml_cache:
            try:
                mathml_model = MathMlModel.get(MathMlModel.mathml == mathml)
            except MathMlModel.DoesNotExist:
                mathml_model = MathMlModel.create(mathml=mathml)
            mathml_cache[mathml] = mathml_model

    # Upload MathML search results.
    mathml_match_models = []
    for mathml, mathml_matches in matches.items():
        for match in mathml_matches:
            mathml_match_models.append(
                MathMlMatch(
                    paper=paper,
                    mathml=mathml_cache[mathml],
                    match=mathml_cache[match.matching_mathml],
                    rank=match.rank,
                ))
    with output_database.atomic():
        MathMlMatch.bulk_create(mathml_match_models, 200)

    # Create all symbols in bulk. This lets us resolve their IDs before we start referring to
    # them from other tables. It also lets us refer to their models in the parent-child table.
    symbol_models: Dict[SymbolId, SymbolModel] = {}
    symbol_models_by_symbol_object_id: Dict[int, SymbolModel] = {}

    for symbol_with_id in symbols_with_ids:
        symbol = symbol_with_id.symbol
        symbol_id = symbol_with_id.symbol_id
        mathml_model = mathml_cache[symbol.mathml]
        symbol_model = SymbolModel(paper=paper, mathml=mathml_model)
        symbol_models[symbol_id] = symbol_model
        symbol_models_by_symbol_object_id[id(symbol)] = symbol_model

    with output_database.atomic():
        SymbolModel.bulk_create(symbol_models.values(), 300)

    # Upload bounding boxes for symbols. 'bulk_create' must have already been called
    # on the symbol models to make sure their model IDs can be used here.
    entities = []
    entity_bounding_boxes = []
    bounding_boxes = []
    for symbol_with_id in symbols_with_ids:

        symbol_id = symbol_with_id.symbol_id
        symbol_model = symbol_models[symbol_id]

        box = boxes.get(symbol_id)
        if box is not None:
            entity = Entity(type="symbol",
                            source=source,
                            entity_id=symbol_model.id)
            entities.append(entity)
            bounding_box = BoundingBoxModel(
                page=box.page,
                left=box.left,
                top=box.top,
                width=box.width,
                height=box.height,
            )
            bounding_boxes.append(bounding_box)

            entity_bounding_box = EntityBoundingBox(bounding_box=bounding_box,
                                                    entity=entity)
            entity_bounding_boxes.append(entity_bounding_box)

    with output_database.atomic():
        BoundingBoxModel.bulk_create(bounding_boxes, 100)
    with output_database.atomic():
        Entity.bulk_create(entities, 300)
    with output_database.atomic():
        EntityBoundingBox.bulk_create(entity_bounding_boxes, 300)

    # Upload parent-child relationships between symbols.
    symbol_child_models = []
    for symbol_with_id in symbols_with_ids:

        symbol = symbol_with_id.symbol
        symbol_id = symbol_with_id.symbol_id
        symbol_model = symbol_models[symbol_id]

        for child in symbol.children:
            child_model = symbol_models_by_symbol_object_id[id(child)]
            symbol_child_models.append(
                SymbolChild(parent=symbol_model, child=child_model))
    with output_database.atomic():
        SymbolChild.bulk_create(symbol_child_models, 300)

    # Upload links between symbols and the sentences they appear in.
    symbol_sentence_models = []
    for symbol_id, sentence_model_id in item.symbol_sentence_model_ids.items():
        symbol_model = symbol_models[symbol_id]
        symbol_sentence_models.append(
            SymbolSentence(symbol=symbol_model, sentence_id=sentence_model_id))

    with output_database.atomic():
        SymbolSentence.bulk_create(symbol_sentence_models, 300)
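
upload_symbols keys symbol_models_by_symbol_object_id by id(symbol) rather than by value, so the exact instances referenced in symbol.children can be looked up even when two symbols have identical content. A short illustration, with plain lists standing in for Symbol objects:

# Object identity vs. value equality: equal objects still have distinct ids,
# so id() distinguishes specific instances (and works for unhashable values).
a = [1, 2]
b = [1, 2]
assert a == b            # equal by value
assert id(a) != id(b)    # but distinct objects
registry = {id(a): "model-for-a", id(b): "model-for-b"}
assert registry[id(a)] == "model-for-a"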