Example #1
    def save(self, item: DetectionTask, result: SerializableEntity) -> None:
        results_dir = directories.arxiv_subdir(self.get_output_base_dirkey(),
                                               item.arxiv_id)
        if not os.path.exists(results_dir):
            os.makedirs(results_dir)
        entities_path = os.path.join(results_dir, "entities.csv")
        file_utils.append_to_csv(entities_path, result)
Example #2
    def save(self, item: SearchTask, result: HueLocation) -> None:
        logging.debug(
            "Found bounding box for %s, iteration %s, hue %f",
            item.relative_file_path,
            item.iteration,
            result.hue,
        )

        output_dir = directories.arxiv_subdir(self.get_output_base_dirkey(),
                                              item.arxiv_id)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        output_path = os.path.join(output_dir, "hue_locations.csv")

        file_utils.append_to_csv(
            output_path,
            HueLocationInfo(
                tex_path=item.search.record.tex_path,
                iteration=item.iteration,
                hue=result.hue,
                entity_id=item.search.record.entity_id,
                page=result.box.page,
                left=result.box.left,
                top=result.box.top,
                width=result.box.width,
                height=result.box.height,
                relative_file_path=item.relative_file_path,
            ),
        )
Example #3
def update_compilation_log(
    output_dir_key: str,
    arxiv_id: ArxivId,
    stdout: bytes,
    source_path: RelativePath,
    success: bool,
) -> None:

    arxiv_id_output_root = directories.arxiv_subdir(output_dir_key, arxiv_id)
    results_path = os.path.join(arxiv_id_output_root, "compilation_results.csv")

    missing_driver = is_driver_unimplemented(stdout)
    errors = list(get_errors(stdout))
    if missing_driver:
        logging.warning(  # pylint: disable=logging-not-lazy
            "Could not compile arXiv ID %s because colorization commands are missing for the"
            + "driver needed to compile that TeX project.",
            arxiv_id,
        )

    # Write the compilation result to the log.
    file_utils.append_to_csv(
        results_path,
        CompilationSummaryEntry(
            outcome="SUCCESS" if success else "FAILURE",
            source_path=source_path,
            missing_driver=missing_driver,
            errors=[e.decode("utf-8", "ignore") for e in errors],
        ),
    )
Example #4
    def save(self, item: LocationTask, result: CitationLocation) -> None:
        output_dir = directories.arxiv_subdir("citation-locations", item.arxiv_id)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        locations_path = os.path.join(output_dir, "citation_locations.csv")
        file_utils.append_to_csv(locations_path, result)
Example #5
    def save(self, item: Task, result: Context) -> None:
        output_dir = directories.arxiv_subdir(
            f"contexts-for-{self.get_entity_name()}", item.arxiv_id)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        contexts_path = os.path.join(output_dir, "contexts.csv")
        file_utils.append_to_csv(contexts_path, result)
Example #6
    def save(self, item: Locations, result: None) -> None:
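        # All locations come from the item itself; 'result' is unused (always None).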
        output_dir = directories.arxiv_subdir("symbols-locations", item.arxiv_id)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        output_path = os.path.join(output_dir, "entity_locations.csv")
        for location in item.locations:
            file_utils.append_to_csv(output_path, location)
Example #7
    def save(self, item: MatchTask, result: BibitemMatch) -> None:
        resolutions_dir = directories.arxiv_subdir("bibitem-resolutions",
                                                   item.arxiv_id)
        if not os.path.exists(resolutions_dir):
            os.makedirs(resolutions_dir)

        resolutions_path = os.path.join(resolutions_dir, "resolutions.csv")
        file_utils.append_to_csv(resolutions_path, result)
Example #8
    def save(self, item: ExtractionTask, result: Bibitem) -> None:
        logging.debug("Extracted bibitem %s from file %s", result,
                      item.file_contents.path)
        results_dir = directories.arxiv_subdir("detected-citations",
                                               item.arxiv_id)
        if not os.path.exists(results_dir):
            os.makedirs(results_dir)
        results_path = os.path.join(results_dir, "entities.csv")
        file_utils.append_to_csv(results_path, result)
Example #9
    def save(self, item: Task, result: EmbellishedSentence) -> None:

        output_dir = directories.arxiv_subdir("embellished-sentences",
                                              item.arxiv_id)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        output_path = os.path.join(output_dir, "sentences.csv")
        file_utils.append_to_csv(output_path, result)
Example #10
    def save(self, item: MathMLForPaper, result: Matches) -> None:
        output_dir = directories.arxiv_subdir("symbol-matches", item.arxiv_id)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        matches_path = os.path.join(output_dir, "matches.csv")
        for matches in result.values():
            for match in matches:
                file_utils.append_to_csv(matches_path, match)
Example #11
    def save(self, item: FindSentencesTask,
             result: EntitySentencePair) -> None:
        output_dir = directories.arxiv_subdir(self.get_output_base_dirkey(),
                                              item.arxiv_id)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        entity_sentences_path = os.path.join(
            output_dir,
            "entity_sentences.csv",
        )
        file_utils.append_to_csv(
            entity_sentences_path,
            EntitySentencePairIds(item.tex_path, result.entity.id_,
                                  result.sentence.id_),
        )
Example #12
    def save(self, item: LocationTask, result: HueLocationInfo) -> None:
        logging.debug(
            "Found bounding box for %s entity %s in iteration %s, hue %f",
            item.arxiv_id,
            result.entity_id,
            result.iteration,
            result.hue,
        )

        output_dir = directories.arxiv_subdir(
            self.output_base_dirs["entity-locations"], item.arxiv_id)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        output_path = os.path.join(output_dir, "entity_locations.csv")

        file_utils.append_to_csv(output_path, result)
Example #13
    def save(self, item: Task, result: TokenizedSentence) -> None:

        # Only save sentence and token data for sentences that are 'clean,' i.e.,
        # that look like they contain natural language.
        if not item.sentence.is_clean:
            return

        output_dir = directories.arxiv_subdir("sentence-tokens", item.arxiv_id)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        sentence_data_path = os.path.join(output_dir, "sentences.csv")
        file_utils.append_to_csv(sentence_data_path,
                                 result.embellished_sentence)

        token_list_path = os.path.join(output_dir, "tokens.csv")
        for token in result.tokens:
            file_utils.append_to_csv(token_list_path, token)
Example #14
    def save(self, item: SymbolSentencesTask,
             result: SymbolSentencePair) -> None:
        output_dir = directories.arxiv_subdir("sentences-for-symbols",
                                              item.arxiv_id)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        entity_sentences_path = os.path.join(
            output_dir,
            "entity_sentences.csv",
        )
        file_utils.append_to_csv(
            entity_sentences_path,
            EntitySentencePairIds(
                result.tex_path,
                f"{result.equation_index}-{result.symbol_index}",
                result.sentence_id,
            ),
        )
Example #15
    def save(self, item: TexAndTokens, result: ColorizationResult) -> None:
        iteration = result.iteration
        iteration_id = f"all-files-{iteration}"
        output_sources_path = directories.iteration(
            "sources-with-colorized-equation-tokens",
            item.arxiv_id,
            iteration_id,
        )
        logging.debug("Outputting to %s", output_sources_path)

        # Create new directory for each colorization iteration.
        unpack_path = unpack(item.arxiv_id, output_sources_path)
        sources_unpacked = unpack_path is not None
        if unpack_path is None:
            logging.warning("Could not unpack sources into %s",
                            output_sources_path)

        if sources_unpacked:
            for tex_path, colorized_tex in result.result.colorized_files.items():
                full_tex_path = os.path.join(output_sources_path, tex_path)
                with open(full_tex_path, "w",
                          encoding=colorized_tex.encoding) as tex_file:
                    tex_file.write(colorized_tex.contents)

            hues_path = os.path.join(output_sources_path, "entity_hues.csv")
            for colorized_token in result.result.colorized_tokens:
                file_utils.append_to_csv(
                    hues_path,
                    EquationTokenColorizationRecord(
                        entity_id=(str(colorized_token.equation_index) + "-" +
                                   str(colorized_token.token_index)),
                        hue=colorized_token.hue,
                        tex_path=colorized_token.tex_path,
                        iteration=str(iteration),
                        equation_index=colorized_token.equation_index,
                        token_index=colorized_token.token_index,
                        start=colorized_token.start,
                        end=colorized_token.end,
                        text=colorized_token.text,
                    ),
                )
Example #16
    def save(self, item: ColorizationTask, result: ColorizationResult) -> None:
        iteration = result.iteration
        colorized_tex = result.tex
        colorized_citations = result.colorized_citations

        iteration_id = directories.tex_iteration(item.tex_path, str(iteration))
        output_sources_path = directories.iteration(
            "sources-with-colorized-citations",
            item.arxiv_id,
            iteration_id,
        )
        logging.debug("Outputting to %s", output_sources_path)

        # Create new directory for each colorization iteration for each TeX file.
        unpack_path = unpack(item.arxiv_id, output_sources_path)
        sources_unpacked = unpack_path is not None
        if unpack_path is None:
            logging.warning("Could not unpack sources into %s",
                            output_sources_path)

        if sources_unpacked:
            tex_path = os.path.join(output_sources_path, item.tex_path)
            with open(tex_path, "w",
                      encoding=item.file_contents.encoding) as tex_file:
                tex_file.write(colorized_tex)

            hues_path = os.path.join(output_sources_path, "entity_hues.csv")

            # TODO(andrewhead): It might be better to save this CSV data with the same
            # encoding as the file the TeX was read from, for the citations, for the
            # equations, and for the symbols. There might be some gotchas for character
            # positions not lining up between the ones we save using Unicode here and the
            # positions in the intended encoding in the original files.
            for c in colorized_citations:
                record = ColorizationRecord(
                    hue=c.hue,
                    entity_id=c.key,
                    tex_path=item.tex_path,
                    iteration=iteration_id,
                )
                file_utils.append_to_csv(hues_path, record)
Example #17
    def save(self, item: LocationTask, result: BoundingBox) -> None:
        output_dir = directories.arxiv_subdir("symbol-locations",
                                              item.arxiv_id)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        locations_path = os.path.join(output_dir, "symbol_locations.csv")
        symbol_id = item.symbol_with_id.symbol_id
        file_utils.append_to_csv(
            locations_path,
            SymbolLocation(
                tex_path=symbol_id.tex_path,
                equation_index=symbol_id.equation_index,
                symbol_index=symbol_id.symbol_index,
                page=result.page,
                left=result.left,
                top=result.top,
                width=result.width,
                height=result.height,
            ),
        )
Example #18
    def save(self, item: ArxivId, result: S2Metadata) -> None:
        s2_metadata_dir = directories.arxiv_subdir("s2-metadata", item)
        if not os.path.exists(s2_metadata_dir):
            os.makedirs(s2_metadata_dir)

        references_path = os.path.join(s2_metadata_dir, "references.csv")
        for r in result.references:
            serializable = SerializableReference(
                s2_id=r.s2_id,
                arxivId=r.arxivId,
                doi=r.doi,
                title=r.title,
                authors=str([dataclasses.asdict(a) for a in r.authors]),
                venue=r.venue,
                year=r.year,
            )
            file_utils.append_to_csv(references_path, serializable)

        s2_id_path = os.path.join(s2_metadata_dir, "s2_id")
        with open(s2_id_path, "w") as s2_id_file:
            s2_id_file.write(result.s2_id)
Example #19
    def save(self, item: ColorizationTask, result: ColorizationResult) -> None:
        iteration = result.iteration
        colorized_tex = result.tex
        entity_hues = result.entity_hues

        iteration_id = directories.tex_iteration(item.tex_path, str(iteration))
        output_sources_path = directories.iteration(
            self.get_output_base_dirkey(),
            item.arxiv_id,
            iteration_id,
        )
        logging.debug("Outputting to %s", output_sources_path)

        # Each colorization batch gets a new sources directory.
        unpack_path = unpack(item.arxiv_id, output_sources_path)
        sources_unpacked = unpack_path is not None
        if unpack_path is None:
            logging.warning("Could not unpack sources into %s",
                            output_sources_path)

        if sources_unpacked:
            # Rewrite the TeX with the colorized TeX.
            tex_path = os.path.join(output_sources_path, item.tex_path)
            with open(tex_path, "w",
                      encoding=item.file_contents.encoding) as tex_file:
                tex_file.write(colorized_tex)

            # Save a log of which hues were assigned to which entities.
            hues_path = os.path.join(output_sources_path, "entity_hues.csv")
            for (hue, entity) in entity_hues:
                file_utils.append_to_csv(
                    hues_path,
                    ColorizationRecord(
                        tex_path=item.tex_path,
                        iteration=str(iteration),
                        hue=hue,
                        entity_id=entity.id_,
                    ),
                )
Example #20
def save_colorized_tex(
    arxiv_id: ArxivId,
    output_sources_path: RelativePath,
    tex_path: RelativePath,
    iteration: str,
    tex: str,
    encoding: str,
    entity_hues: Dict[str, float],
) -> bool:
    logging.debug("Outputting colorized TeX to %s.", output_sources_path)

    # Each colorization batch gets a new sources directory.
    unpack_path = unpack(arxiv_id, output_sources_path)
    if unpack_path is None:
        logging.warning("Could not unpack sources into %s.",
                        output_sources_path)
        return False

    # Rewrite the TeX with the colorized TeX.
    tex_path = os.path.join(output_sources_path, tex_path)
    with open(tex_path, "w", encoding=encoding) as tex_file:
        tex_file.write(tex)

    # Save a log of which hues were assigned to which entities.
    hues_path = os.path.join(output_sources_path, "entity_hues.csv")
    for entity_id, hue in entity_hues.items():
        file_utils.append_to_csv(
            hues_path,
            ColorizationRecord(
                tex_path=tex_path,
                iteration=str(iteration),
                hue=hue,
                entity_id=entity_id,
            ),
        )

    return True
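
A minimal call sketch for the function above (all values are hypothetical; `colorized` stands for the already-colorized TeX contents):

    save_colorized_tex(
        arxiv_id="1601.00001",  # hypothetical arXiv ID
        output_sources_path="colorized-sources/1601.00001/0",  # hypothetical output dir
        tex_path="main.tex",  # relative to the unpacked sources root
        iteration="0",
        tex=colorized,
        encoding="utf-8",
        entity_hues={"citation-0": 0.25},  # entity ID -> hue (float)
    )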
Example #21
def save_colorized_tex(
    arxiv_id: ArxivId,
    output_sources_path: RelativePath,
    tex_path: RelativePath,
    iteration: str,
    tex: str,
    encoding: str,
    entity_hues: Dict[str, float],
) -> bool:
    logging.debug("Outputting colorized TeX to %s.", output_sources_path)

    # Each colorization batch gets a new sources directory.
    shutil.copytree(
        directories.arxiv_subdir("normalized-sources", arxiv_id),
        output_sources_path,
    )

    # Rewrite the TeX with the colorized TeX.
    tex_path = os.path.join(output_sources_path, tex_path)
    with open(tex_path, "w", encoding=encoding) as tex_file:
        tex_file.write(tex)

    # Save a log of which hues were assigned to which entities.
    hues_path = os.path.join(output_sources_path, "entity_hues.csv")
    for entity_id, hue in entity_hues.items():
        file_utils.append_to_csv(
            hues_path,
            ColorizationRecord(
                tex_path=tex_path,
                iteration=str(iteration),
                hue=hue,
                entity_id=entity_id,
            ),
        )

    return True
Example #22
    def save(
        self,
        item: DetectDefinitionsTask,
        result: Union[Definiendum, Definition, TermReference],
    ) -> None:

        output_dir = directories.arxiv_subdir("detected-definitions", item.arxiv_id)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        definiendums_path = os.path.join(output_dir, "entities-definiendums.csv")
        definitions_path = os.path.join(output_dir, "entities-definitions.csv")
        term_references_path = os.path.join(output_dir, "entities-term-references.csv")

        if isinstance(result, Definiendum):
            file_utils.append_to_csv(definiendums_path, result)
        elif isinstance(result, Definition):
            file_utils.append_to_csv(definitions_path, result)
        elif isinstance(result, TermReference):
            file_utils.append_to_csv(term_references_path, result)
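        # Results of any other type are silently ignored.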
Example #23
    def save(self, item: PaperTokens, _: None) -> None:

        output_dir = directories.arxiv_subdir("annotation-files",
                                              item.arxiv_id)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Write tokens simultaneously to a file listing each token, and a file including
        # the transformed, cleaned text ready for human annotation.
        token_list_path = os.path.join(output_dir, "tokens.csv")
        sentence_list_path = os.path.join(output_dir,
                                          "sentences_for_annotation.txt")
        annotation_file_path = os.path.join(output_dir,
                                            "initial_annotations.ann")

        offset_in_file = 0
        offset_in_sentence = 0
        is_symbol = False
        brat_annotation_index = 1

        with open(sentence_list_path, mode="w", encoding="utf-8") as sentence_file, \
                open(annotation_file_path, mode="w", encoding="utf-8") as ann_file:
            last_sentence_id = None

            for token in item.tokens:

                # Keep track of whether an upcoming token is a symbol. If it is, a tentative
                # annotation can be saved to a brat annotation file.
                if not is_symbol and token.text == "SYMBOL_START":
                    is_symbol = True
                elif is_symbol and token.text == "SYMBOL_END":
                    is_symbol = False

                # Skip tokens that lack a clean readable representation.
                if token.text_for_annotation is None:
                    annotation_token = self._create_annotation_token(
                        token,
                        start=None,
                        end=None,
                        start_in_sentence=None,
                        end_in_sentence=None,
                    )
                    file_utils.append_to_csv(token_list_path, annotation_token)
                    continue

                # Break lines between sentences.
                if (token.sentence_id != last_sentence_id
                        and last_sentence_id is not None):
                    sentence_file.write("\n")
                    offset_in_file += 1
                    offset_in_sentence = 0

                # Insert spaces between tokens.
                if token.sentence_id == last_sentence_id:
                    sentence_file.write(" ")
                    offset_in_file += 1
                    offset_in_sentence += 1

                start = offset_in_file
                start_in_sentence = offset_in_sentence

                # Write token to the text meant for annotation.
                sentence_file.write(token.text_for_annotation)
                offset_in_file += len(token.text_for_annotation)
                offset_in_sentence += len(token.text_for_annotation)

                end = offset_in_file
                end_in_sentence = offset_in_sentence
                last_sentence_id = token.sentence_id

                # Save a record of this token and its positions in the TeX and annotation file.
                annotation_token = self._create_annotation_token(
                    token,
                    start=start,
                    end=end,
                    start_in_sentence=start_in_sentence,
                    end_in_sentence=end_in_sentence,
                )
                file_utils.append_to_csv(token_list_path, annotation_token)

                # Save an annotation for each symbol. Annotations are stored to a '.ann' file in
                # brat standoff format. See more details about the format here:
                # https://brat.nlplab.org/standoff.html
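                # For example, the line "T1\tSymbol 10 12\tx" marks the span
                # [10, 12) of the annotation file as a Symbol.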
                if is_symbol:
                    ann_file.write(f"T{brat_annotation_index}\t" +
                                   f"Symbol {start} {end}\t" +
                                   f"{token.text_for_annotation}\n")
                    brat_annotation_index += 1
Example #24
    def save(self, item: ArxivId, result: SymbolData) -> None:
        tokens_dir = directories.arxiv_subdir("detected-equation-tokens", item)
        if not os.path.exists(tokens_dir):
            os.makedirs(tokens_dir)
        symbols_dir = directories.arxiv_subdir("detected-symbols", item)
        if not os.path.exists(symbols_dir):
            os.makedirs(symbols_dir)

        if result.success and result.symbols is not None:
            logging.debug(
                "Successfully extracted %d symbols for equation %s.",
                len(result.symbols),
                result.equation,
            )
        else:
            logging.warning(
                "Could not parse equation %s. See logs in %s.",
                result.equation,
                tokens_dir,
            )

        parse_results_path = os.path.join(tokens_dir, "parse_results.csv")
        file_utils.append_to_csv(
            parse_results_path,
            ParseResult(
                arxiv_id=result.arxiv_id,
                success=result.success,
                equation_index=result.equation_index,
                tex_path=result.tex_path,
                equation=result.equation,
                errorMessage=result.errorMessage,
            ),
        )

        # Save symbol data, including parent-child relationships between symbols, and which tokens
        # were found in each symbol.
        if result.symbols is not None and len(result.symbols) > 0:
            symbols = result.symbols

            tokens_path = os.path.join(tokens_dir, "entities.csv")
            symbols_path = os.path.join(symbols_dir, "entities.csv")
            symbol_tokens_path = os.path.join(symbols_dir, "symbol_tokens.csv")
            symbol_children_path = os.path.join(symbols_dir, "symbol_children.csv")

            # The list of symbol children might be empty, e.g., for a paper with only
            # very simple symbols. Make sure there's at least an empty file, as later stages expect
            # to be able to read the list of symbol children at this path.
            open(symbol_children_path, "a").close()

            all_tokens = set()
            for symbol in symbols:
                symbol_index = symbols.index(symbol)

                if len(symbol.tokens) == 0:
                    continue

                # Collect extra information about the symbol.
                def get_tex(s: Node, equation: str) -> Tuple[str, int, int]:
                    """
                    Extract approximate TeX for the symbol. It's estimated to be the span of TeX
                    that covers all of the tokens, including extra curly braces needed to close
                    opened curly braces (which often aren't included in the token start and end
                    character indexes). While these positions aren't used for colorization (and
                    hence don't have to be super precise), they are useful for:
                    1. Ordering the symbols
                    2. Rendering the symbols in the user interface
                    Hence it is helpful to extract a self-contained span of TeX
                    that can be used to render the symbol.
                    """
                    start = min([t.start for t in s.tokens])
                    end = max([t.end for t in s.tokens])

                    # Grab the macro right before the symbol if there is one. This ensures that the
                    # rendered 'tex' field will include, for instance, `\mathrm` commands that are
                    # used to style the math.
                    for match in re.finditer(r"\\((math|text)\w+)\{", equation):
                        if match.end() == start:
                            start = match.start()

                    # Adjust the end position to after curly braces are closed.
                    open_brace_count = 0
                    for i, c in enumerate(equation[start:], start=start):
                        if c == "{":
                            open_brace_count += 1
                        elif c == "}" and open_brace_count > 0:
                            open_brace_count -= 1
                        if (i + 1) >= end and open_brace_count == 0:
                            end = i + 1
                            break

                    return (equation[start:end], start, end)

                symbol_tex, relative_start, relative_end = get_tex(
                    symbol, result.equation
                )
                start = result.equation_start + relative_start
                end = result.equation_start + relative_end

                # Save a record of this symbol.
                file_utils.append_to_csv(
                    symbols_path,
                    SerializableSymbol(
                        id_=f"{result.equation_index}-{symbol_index}",
                        tex_path=result.tex_path,
                        equation_index=result.equation_index,
                        equation=result.equation,
                        symbol_index=symbol_index,
                        start=start,
                        end=end,
                        tex=symbol_tex,
                        context_tex=result.context_tex,
                        mathml=str(symbol.element),
                        is_definition=symbol.defined or False,
                        relative_start=relative_start,
                        relative_end=relative_end
                    ),
                )

                # Save the relationships between this symbol and its tokens.
                all_tokens.update(symbol.tokens)
                for token in symbol.tokens:
                    file_utils.append_to_csv(
                        symbol_tokens_path,
                        SerializableSymbolToken(
                            tex_path=result.tex_path,
                            equation_index=result.equation_index,
                            symbol_index=symbol_index,
                            token_index=token.token_index,
                        ),
                    )

                # Save the relationships between this symbol and its children.
                for child in symbol.child_symbols:
                    child_index = symbols.index(child)
                    file_utils.append_to_csv(
                        symbol_children_path,
                        SerializableChild(
                            tex_path=result.tex_path,
                            equation_index=result.equation_index,
                            equation=result.equation,
                            symbol_index=symbol_index,
                            child_index=child_index,
                        ),
                    )

            # Write record of all tokens to file.
            for token in all_tokens:
                file_utils.append_to_csv(
                    tokens_path,
                    SerializableToken(
                        tex_path=result.tex_path,
                        id_=f"{result.equation_index}-{token.token_index}",
                        equation_index=result.equation_index,
                        token_index=token.token_index,
                        start=result.equation_start + token.start,
                        end=result.equation_start + token.end,
                        relative_start=token.start,
                        relative_end=token.end,
                        tex=result.equation[token.start : token.end],
                        context_tex=result.context_tex,
                        text=token.text,
                        equation=result.equation,
                        equation_depth=result.equation_depth,
                    ),
                )
Example #25
def upload_sentences(processing_summary: PaperProcessingResult) -> None:

    arxiv_id = processing_summary.arxiv_id
    s2_id = processing_summary.s2_id

    # Create entry for the paper if it does not yet exist
    try:
        paper = Paper.get(Paper.s2_id == s2_id)
    except Paper.DoesNotExist:
        paper = Paper.create(s2_id=s2_id, arxiv_id=arxiv_id)

    locations_by_sentence_id: Dict[SentenceKey, List[HueLocationInfo]] = {}
    sentences: Dict[SentenceKey, SentenceEntity] = {}
    sentence_models: Dict[SentenceKey, Sentence] = {}

    for entity_and_location in processing_summary.localized_entities:
        sentence = cast(SentenceEntity, entity_and_location.entity)
        sentence_model = Sentence(paper=paper, text=sentence.text)

        sentence_key = SentenceKey(sentence.tex_path, sentence.id_)
        locations_by_sentence_id[sentence_key] = entity_and_location.locations
        sentence_models[sentence_key] = sentence_model
        sentences[sentence_key] = sentence

    with output_database.atomic():
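        # peewee's bulk_create inserts the rows in batches (here, 100 per query).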
        Sentence.bulk_create(sentence_models.values(), 100)

    # Save the IDs for the sentence models so that they can be used in downstream tasks,
    # like uploading which sentences symbols belong to.
    model_ids_dir = directories.arxiv_subdir("sentences-model-ids", arxiv_id)
    if os.path.exists(model_ids_dir):
        file_utils.clean_directory(model_ids_dir)
    else:
        os.makedirs(model_ids_dir)
    output_ids_path = os.path.join(model_ids_dir, "model_ids.csv")
    for id_, sentence_entity in sentences.items():
        file_utils.append_to_csv(
            output_ids_path,
            SentenceIdAndModelId(
                tex_path=sentence_entity.tex_path,
                entity_id=sentence_entity.id_,
                model_id=sentence_models[id_].id,
            ),
        )

    entities = []
    entity_bounding_boxes = []
    bounding_boxes = []

    for sentence_id, sentence_model in sentence_models.items():

        entity = Entity(type="sentence",
                        source="tex-pipeline",
                        entity_id=sentence_model.id)
        entities.append(entity)

        for location in locations_by_sentence_id[sentence_id]:
            bounding_box = BoundingBox(
                page=location.page,
                left=location.left,
                top=location.top,
                width=location.width,
                height=location.height,
            )
            bounding_boxes.append(bounding_box)

            entity_bounding_box = EntityBoundingBox(bounding_box=bounding_box,
                                                    entity=entity)
            entity_bounding_boxes.append(entity_bounding_box)

    with output_database.atomic():
        BoundingBox.bulk_create(bounding_boxes, 100)
    with output_database.atomic():
        Entity.bulk_create(entities, 300)
    with output_database.atomic():
        EntityBoundingBox.bulk_create(entity_bounding_boxes, 300)
Example #26
    def save(self, item: ArxivId, result: SymbolData) -> None:
        tokens_dir = directories.arxiv_subdir("detected-equation-tokens", item)
        if not os.path.exists(tokens_dir):
            os.makedirs(tokens_dir)

        if result.success:
            logging.debug("Successfully extracted characters: %s",
                          str(result.characters))
        else:
            logging.warning(
                "Could not parse equation %s. See logs in %s.",
                result.equation,
                tokens_dir,
            )

        parse_results_path = os.path.join(tokens_dir, "parse_results.csv")
        file_utils.append_to_csv(
            parse_results_path,
            ParseResult(
                arxiv_id=result.arxiv_id,
                success=result.success,
                equation_index=result.equation_index,
                tex_path=result.tex_path,
                equation=result.equation,
                errorMessage=result.errorMessage,
            ),
        )

        # Save string representations of every character extracted from the equation
        if result.characters is not None and len(result.characters) > 0:
            tokens_path = os.path.join(tokens_dir, "entities.csv")
            for token in result.characters:
                file_utils.append_to_csv(
                    tokens_path,
                    SerializableToken(
                        tex_path=result.tex_path,
                        id_=f"{result.equation_index}-{token.i}",
                        equation_index=result.equation_index,
                        token_index=token.i,
                        start=result.equation_start + token.start,
                        end=result.equation_start + token.end,
                        relative_start=token.start,
                        relative_end=token.end,
                        tex=result.equation[token.start:token.end],
                        # Just make the token's context TeX the equation's context TeX
                        context_tex=result.context_tex,
                        text=token.text,
                        equation=result.equation,
                        equation_depth=result.equation_depth,
                    ),
                )

        if result.symbols is not None and len(result.symbols) > 0:
            symbols_path = os.path.join(tokens_dir, "symbols.csv")
            symbol_tokens_path = os.path.join(tokens_dir, "symbol_tokens.csv")
            symbol_children_path = os.path.join(tokens_dir,
                                                "symbol_children.csv")

            # The list of symbol children might be empty, e.g., for a paper with only
            # very simple symbols. Make sure there's at least an empty file, as later stages expect
            # to be able to read the list of symbol children at this path.
            open(symbol_children_path, "a").close()

            for symbol_index, symbol in enumerate(result.symbols):
                # Save data for the symbol
                file_utils.append_to_csv(
                    symbols_path,
                    SerializableSymbol(
                        tex_path=result.tex_path,
                        equation_index=result.equation_index,
                        equation=result.equation,
                        symbol_index=symbol_index,
                        mathml=symbol.mathml,
                    ),
                )

                # Save the symbol's relationship to all its component characters
                for character in symbol.characters:
                    file_utils.append_to_csv(
                        symbol_tokens_path,
                        SerializableCharacter(
                            tex_path=result.tex_path,
                            equation_index=result.equation_index,
                            equation=result.equation,
                            symbol_index=symbol_index,
                            character_index=character,
                        ),
                    )

                # Save the symbol's relationship to its children
                for child in symbol.children:
                    file_utils.append_to_csv(
                        symbol_children_path,
                        SerializableChild(
                            tex_path=result.tex_path,
                            equation_index=result.equation_index,
                            equation=result.equation,
                            symbol_index=symbol_index,
                            child_index=result.symbols.index(child),
                        ),
                    )
Example #27
    def save(self, item: ArxivId, result: List[EquationSymbols]) -> None:
        tokens_dir = directories.arxiv_subdir("detected-equation-tokens", item)
        if not os.path.exists(tokens_dir):
            os.makedirs(tokens_dir)
        symbols_dir = directories.arxiv_subdir("detected-symbols", item)
        if not os.path.exists(symbols_dir):
            os.makedirs(symbols_dir)

        # Before saving any symbol, check that it hasn't already been saved. This check is needed
        # because sometimes the same symbol is extracted twice or more. This happens when a symbol is
        # in an equation nested within another equation (e.g., the symbol 'x' in
        # '\begin{equation}\begin{split}x\end{split}\end{equation}').
        saved_symbols: Set[SavedSymbol] = set()

        equations_from_inside_out = sorted(result,
                                           key=lambda e: e.equation_depth,
                                           reverse=True)
        for equation_symbols in equations_from_inside_out:

            success = equation_symbols.success
            tex_path = equation_symbols.tex_path
            equation_index = equation_symbols.equation_index
            equation = equation_symbols.equation
            equation_start = equation_symbols.equation_start
            equation_depth = equation_symbols.equation_depth
            context_tex = equation_symbols.context_tex
            symbols = equation_symbols.symbols
            error_message = equation_symbols.error_message

            if not success or not symbols:
                logging.warning(
                    "Could not parse equation %s. See logs in %s.",
                    equation,
                    tokens_dir,
                )

            parse_results_path = os.path.join(tokens_dir, "parse_results.csv")
            file_utils.append_to_csv(
                parse_results_path,
                ParseResult(
                    arxiv_id=item,
                    success=success,
                    equation_index=equation_index,
                    tex_path=tex_path,
                    equation=equation,
                    errorMessage=error_message,
                ),
            )

            # Save symbol data, including parent-child relationships between symbols, and which tokens
            # were found in each symbol.
            if symbols is None or len(symbols) == 0:
                continue

            tokens_path = os.path.join(tokens_dir, "entities.csv")
            symbols_path = os.path.join(symbols_dir, "entities.csv")
            symbol_tokens_path = os.path.join(symbols_dir, "symbol_tokens.csv")
            symbol_children_path = os.path.join(symbols_dir,
                                                "symbol_children.csv")

            # The list of symbol children might be empty, e.g., for a paper with only
            # very simple symbols. Make sure there's at least an empty file, as later stages expect
            # to be able to read the list of symbol children at this path.
            open(symbol_children_path, "a").close()

            all_tokens = set()
            for symbol in symbols:
                symbol_index = symbols.index(symbol)

                if len(symbol.tokens) == 0:
                    continue

                # Skip this symbol if it has already been saved.
                symbol_tex = equation[symbol.start:symbol.end]
                start_in_file = equation_start + symbol.start
                end_in_file = equation_start + symbol.end
                if SavedSymbol(symbol_tex, start_in_file,
                               end_in_file) in saved_symbols:
                    continue

                # Save a record of this symbol.
                file_utils.append_to_csv(
                    symbols_path,
                    SerializableSymbol(
                        id_=f"{equation_index}-{symbol_index}",
                        tex_path=tex_path,
                        equation_index=equation_index,
                        equation=equation,
                        symbol_index=symbol_index,
                        start=start_in_file,
                        end=end_in_file,
                        tex=symbol_tex,
                        context_tex=context_tex,
                        mathml=str(symbol.element),
                        is_definition=symbol.defined or False,
                        relative_start=symbol.start,
                        relative_end=symbol.end,
                        contains_affix=symbol.contains_affix_token,
                    ),
                )

                # Save the relationships between this symbol and its tokens.
                all_tokens.update(symbol.tokens)
                for token in symbol.tokens:
                    file_utils.append_to_csv(
                        symbol_tokens_path,
                        SerializableSymbolToken(
                            tex_path=tex_path,
                            equation_index=equation_index,
                            symbol_index=symbol_index,
                            start=token.start,
                            end=token.end,
                        ),
                    )

                # Save the relationships between this symbol and its children.
                for child in symbol.child_symbols:
                    child_index = symbols.index(child)
                    file_utils.append_to_csv(
                        symbol_children_path,
                        SerializableChild(
                            tex_path=tex_path,
                            equation_index=equation_index,
                            equation=equation,
                            symbol_index=symbol_index,
                            child_index=child_index,
                        ),
                    )

                saved_symbols.add(
                    SavedSymbol(symbol_tex, start_in_file, end_in_file))

            # Write record of all tokens to file.
            for token in all_tokens:
                file_utils.append_to_csv(
                    tokens_path,
                    SerializableToken(
                        tex_path=tex_path,
                        id_=f"{equation_index}-{token.start}-{token.end}",
                        equation_index=equation_index,
                        start=equation_start + token.start,
                        end=equation_start + token.end,
                        relative_start=token.start,
                        relative_end=token.end,
                        type_=token.type_,
                        tex=equation[token.start:token.end],
                        context_tex=context_tex,
                        text=token.text,
                        equation=equation,
                        equation_depth=equation_depth,
                    ),
                )