Example #1
# ... (imports and the beginning of the commands list are omitted in this snippet)
            adjust_color_positions=adjust_color_positions,
            braces=True,
            when=filter_symbols_with_affixes,
            group=divide_symbols_into_nonoverlapping_groups,
        ),
    ),
    LocateCompositeSymbols,
    CollectSymbolLocations,
    make_upload_entities_command("symbols",
                                 upload_symbols,
                                 DetectedEntityType=SerializableSymbol),
]


def make_digest(_: str, arxiv_id: ArxivId) -> EntityProcessingDigest:
    """
    Custom digest creator. Count the equation tokens instead of the symbols, as the
    default entity counters can be used for the outputs of the equation-token commands.
    """
    return make_default_paper_digest("equation-tokens", arxiv_id)


symbols_pipeline = EntityPipeline(
    "symbols",
    commands,
    depends_on=["equations"],
    optional_depends_on=["sentences"],
    make_digest=make_digest,
)
register_entity_pipeline(symbols_pipeline)
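
This pipeline declares a hard dependency on the equations pipeline and an optional one on sentences. It also supplies a custom make_digest so that progress is reported by counting equation tokens, whose outputs the default entity counters can handle, rather than symbols.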
Example #2
from common import directories
from common.colorize_tex import ColorizeOptions
from common.commands.base import CommandList
from entities.common import create_entity_localization_command_sequence
from scripts.pipelines import EntityPipeline, register_entity_pipeline

from .colorize import adjust_color_positions
from .extractor import SentenceExtractor
from .types import Sentence
from .upload import upload_sentences

commands = create_entity_localization_command_sequence(
    "sentences",
    SentenceExtractor,
    DetectedEntityType=Sentence,
    colorize_options=ColorizeOptions(
        adjust_color_positions=adjust_color_positions),
    upload_func=upload_sentences,
)

sentences_pipeline = EntityPipeline("sentences", commands)
register_entity_pipeline(sentences_pipeline)
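
Example #2 is the smallest complete wiring of an entity pipeline. As a minimal sketch of the same pattern applied to a new entity type, the snippet below uses only the calls shown in these examples; FootnoteExtractor, Footnote, and upload_footnotes are hypothetical placeholders, not names from the codebase:

from entities.common import create_entity_localization_command_sequence
from scripts.pipelines import EntityPipeline, register_entity_pipeline

# Hypothetical entity-specific pieces: a real pipeline defines its own
# extractor class, serializable entity type, and upload function.
from .extractor import FootnoteExtractor
from .types import Footnote
from .upload import upload_footnotes

commands = create_entity_localization_command_sequence(
    "footnotes",                   # the entity name, as in the examples above
    FootnoteExtractor,             # detects the entities in the TeX sources
    DetectedEntityType=Footnote,   # type the detected entities are read back as
    upload_func=upload_footnotes,  # persists the located entities
)

footnotes_pipeline = EntityPipeline("footnotes", commands)
register_entity_pipeline(footnotes_pipeline)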
Example #3
from common import directories
from common.commands.base import CommandList
from scripts.pipelines import EntityPipeline, register_entity_pipeline

# The imports below are assumptions reconstructed from the sibling examples;
# this snippet omits its import block, and these module paths are not shown.
from entities.common import make_locate_entities_command, make_upload_entities_command

from .commands.detect_definitions import DetectDefinitions
from .commands.embellish_sentences import EmbellishSentences
from .types import Definiendum, Definition, TermReference
from .upload import upload_definitions

directories.register("detected-definitions")
directories.register("sources-with-colorized-definitions")
directories.register("compiled-sources-with-colorized-definitions")
directories.register("paper-images-with-colorized-definitions")
directories.register("diffed-images-with-colorized-definitions")
directories.register("definitions-locations")

upload_command = make_upload_entities_command(
    "definitions",
    upload_definitions,
    DetectedEntityType={
        "entities-definiendums.csv": Definiendum,
        "entities-definitions.csv": Definition,
        "entities-term-references.csv": TermReference,
    },
)

commands: CommandList = [
    EmbellishSentences,
    DetectDefinitions,
    make_locate_entities_command("definitions"),
    upload_command,
]

definitions_pipeline = EntityPipeline(
    "definitions",
    commands,
    depends_on=["symbols", "sentences"],
)
register_entity_pipeline(definitions_pipeline)
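
Note that DetectedEntityType is a dictionary here rather than a single type: each output CSV file is mapped to its own entity class, which appears to let a single upload command handle all three kinds of detected entities (definiendums, definitions, and term references).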
Example #4
from common import directories
from common.commands.base import CommandList
from scripts.pipelines import EntityPipeline, register_entity_pipeline

# The imports below are assumptions reconstructed from the sibling examples;
# this snippet omits them, and these module paths are not shown.
from entities.common import make_locate_entities_command

from .colorize import colorize_citations
from .commands.extract_bibitems import ExtractBibitems
from .commands.locate_citations import LocateCitations
from .commands.resolve_bibitems import ResolveBibitems
from .commands.upload_citations import UploadCitations
from .make_digest import make_digest
from .types import Bibitem

directories.register("detected-citations")
directories.register("bibitem-resolutions")
directories.register("sources-with-colorized-citations")
directories.register("compiled-sources-with-colorized-citations")
directories.register("paper-images-with-colorized-citations")
directories.register("diffed-images-with-colorized-citations")
directories.register("citations-locations")
directories.register("citation-cluster-locations")
directories.register("sources-with-annotated-symbols")

commands: CommandList = [
    ExtractBibitems,
    ResolveBibitems,
    make_locate_entities_command("citations",
                                 DetectedEntityType=Bibitem,
                                 colorize_func=colorize_citations),
    LocateCitations,
    UploadCitations,
]

citations_pipeline = EntityPipeline("citations",
                                    commands,
                                    make_digest=make_digest)
register_entity_pipeline(citations_pipeline)
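
In this pipeline, colorization is customized through a colorize_func argument rather than a ColorizeOptions object, and the digest logic is imported from a separate make_digest module instead of being defined inline as in Example #1.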
Example #5
from common import directories
from common.types import ColorizeOptions, Term
from entities.common import create_entity_localization_command_sequence
from scripts.pipelines import EntityPipeline, register_entity_pipeline

# Assumed paths for the context-extraction helpers (TexWrapper and
# make_extract_contexts_command); this snippet omits its import block.
from common.types import TexWrapper
from entities.common import make_extract_contexts_command

from .colorize import adjust_color_positions
from .extractor import GlossaryTermExtractor
from .upload import upload_terms

commands = create_entity_localization_command_sequence(
    "glossary-terms",
    GlossaryTermExtractor,
    DetectedEntityType=Term,
    colorize_options=ColorizeOptions(adjust_color_positions=adjust_color_positions),
    upload_func=upload_terms,
)

# Before uploading entities, extract the contexts that each term appears in.
upload_command_index = len(commands)
for i, command in enumerate(commands):
    if command.get_name() == "upload-glossary-terms":
        upload_command_index = i

directories.register("contexts-for-glossary-terms")
commands.insert(
    upload_command_index,
    make_extract_contexts_command(
        "glossary-terms",
        EntityType=Term,
        tex_wrapper=TexWrapper(before="**", after="**"),
    ),
)

terms_pipeline = EntityPipeline("glossary-terms", commands)
register_entity_pipeline(terms_pipeline)
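
The index-scanning loop above locates the upload command so that the context-extraction command can be inserted just before it. Assuming command classes expose get_name() as shown, and that there is a single upload command, an equivalent lookup can be written with next(); this is a sketch of an alternative, not the project's own code:

# Find the upload command's position, falling back to the end of the
# list if no command named "upload-glossary-terms" is present.
upload_command_index = next(
    (i for i, command in enumerate(commands)
     if command.get_name() == "upload-glossary-terms"),
    len(commands),
)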
Example #6
from common.types import ColorizeOptions, Term
from entities.common import create_entity_localization_command_sequence
from scripts.pipelines import EntityPipeline, register_entity_pipeline

from .colorize import adjust_color_positions
from .extractor import GlossaryTermExtractor
from .upload import upload_terms

commands = create_entity_localization_command_sequence(
    "glossary-terms",
    GlossaryTermExtractor,
    extract_contexts=True,
    DetectedEntityType=Term,
    colorize_options=ColorizeOptions(
        adjust_color_positions=adjust_color_positions),
    upload_func=upload_terms,
)

terms_pipeline = EntityPipeline("glossary-terms",
                                commands,
                                depends_on=["sentences"])
register_entity_pipeline(terms_pipeline)
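
Compared with Example #5, which splices a context-extraction command into the list by hand, this revision passes extract_contexts=True and lets the factory add that step itself; the rest of the glossary-terms wiring is unchanged.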
Example #7
from common import directories
from common.types import SerializableEntity
from entities.common import create_entity_localization_command_sequence
from scripts.pipelines import EntityPipeline, register_entity_pipeline

from .colorize import get_term_color_positions
from .extractor import TermExtractor
from .types import Term
from .upload import upload_terms

commands = create_entity_localization_command_sequence(
    "terms",
    TermExtractor,
    DetectedEntityType=Term,
    get_color_positions=get_term_color_positions,
    upload_func=upload_terms,
)

terms_pipeline = EntityPipeline("terms", commands)
register_entity_pipeline(terms_pipeline)
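
Examples #7 and #8 pass a get_color_positions callback directly instead of wrapping it in ColorizeOptions; this appears to be an earlier variant of the same colorization hook used in the other examples.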
Example #8
from common import directories
from common.commands.base import CommandList
from entities.common import create_entity_localization_command_sequence
from scripts.pipelines import EntityPipeline, register_entity_pipeline

from .colorize import get_sentence_color_positions
from .extractor import SentenceExtractor
from .types import Sentence
from .upload import Sentence as SentenceModel
from .upload import upload_sentences

commands = create_entity_localization_command_sequence(
    "sentences",
    SentenceExtractor,
    DetectedEntityType=Sentence,
    get_color_positions=get_sentence_color_positions,
    upload_func=upload_sentences,
)

# Register additional directories to be used by the upload function
directories.register("sentences-model-ids")

sentences_pipeline = EntityPipeline("sentences",
                                    commands,
                                    database_models=[SentenceModel])
register_entity_pipeline(sentences_pipeline)
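
Two details are worth noting here: the upload module's Sentence is aliased to SentenceModel so that it does not shadow the detected-entity type of the same name, and database_models=[SentenceModel] presumably tells the pipeline which database tables the upload step writes to.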
Example #9
from typing import cast

from common.parse_tex import EquationExtractor
from common.types import CharacterRange, ColorizeOptions, Equation, SerializableEntity
from entities.common import create_entity_localization_command_sequence
from scripts.pipelines import EntityPipeline, register_entity_pipeline

from .upload import upload_equations


def colorize_equation_when(entity: SerializableEntity) -> bool:
    # Only colorize equations at depth 0, skipping equations nested
    # inside other equations.
    equation = cast(Equation, entity)
    return equation.depth == 0


def adjust_color_positions(entity: SerializableEntity) -> CharacterRange:
    # Color only the equation's contents, leaving its delimiters uncolored.
    equation = cast(Equation, entity)
    return CharacterRange(equation.content_start, equation.content_end)


commands = create_entity_localization_command_sequence(
    "equations",
    EquationExtractor,
    DetectedEntityType=Equation,
    colorize_options=ColorizeOptions(
        when=colorize_equation_when,
        adjust_color_positions=adjust_color_positions),
    upload_func=upload_equations,
)

equations_pipeline = EntityPipeline("equations", commands)
register_entity_pipeline(equations_pipeline)
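
The two callbacks above plug into ColorizeOptions: when filters which detected equations get colorized, and adjust_color_positions narrows the colored span. Both receive the generic SerializableEntity and cast it to the concrete Equation type before reading its fields.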
Example #10
# ... (imports and the beginning of the commands list are omitted in this snippet)
    FindSymbolMatches,
    make_find_entity_sentences_command("equation-tokens"),
    FindSymbolSentences,
    ColorizeEquationTokens,
    make_compile_tex_command("equation-tokens"),
    make_raster_pages_command("equation-tokens"),
    make_diff_images_command("equation-tokens"),
    make_locate_hues_command("equation-tokens"),
    LocateSymbols,
    UploadSymbols,
]


def make_digest(_: str, arxiv_id: ArxivId) -> EntityProcessingDigest:
    """
    Custom digest creator. Count the equation tokens instead of the symbols, as the
    default entity counters can be used for the outputs of the equation-token commands.
    """
    return make_default_paper_digest("equation-tokens", arxiv_id)


symbols_pipeline = EntityPipeline(
    "symbols",
    commands,
    depends_on=["equations"],
    optional_depends_on=["sentences"],
    database_models=[MathMl, MathMlMatch, Symbol, SymbolChild, SymbolSentence],
    make_digest=make_digest,
)
register_entity_pipeline(symbols_pipeline)
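
Where most of these examples rely on create_entity_localization_command_sequence, this pipeline spells out the localization steps as individual commands (colorize, compile, raster, diff images, locate hues), presumably the same sequence the factory bundles, and it declares its database models explicitly.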
Example #11
from common import directories
from common.commands.base import CommandList
from entities.common import create_entity_localization_command_sequence
from scripts.pipelines import EntityPipeline, register_entity_pipeline

from .extractor import AbbreviationExtractor
from .types import Abbreviation

commands = create_entity_localization_command_sequence(
    "abbreviations", AbbreviationExtractor, DetectedEntityType=Abbreviation)

abbreviations_pipeline = EntityPipeline("abbreviations", commands)
register_entity_pipeline(abbreviations_pipeline)
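
This is the sparsest wiring in the set: only the entity name, the extractor, and the detected-entity type are supplied, which suggests the factory's colorization and upload hooks are optional.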