SerializableEntity, SerializableSymbol, SerializableToken, ) from entities.sentences.commands.extract_contexts import make_extract_contexts_command from entities.sentences.types import TexWrapper from scripts.pipelines import EntityPipeline, register_entity_pipeline from .colorize import adjust_color_positions from .commands.collect_symbol_locations import CollectSymbolLocations from .commands.extract_symbols import ExtractSymbols from .commands.find_symbol_matches import FindSymbolMatches from .commands.locate_composite_symbols import LocateCompositeSymbols from .upload import upload_symbols directories.register("detected-equation-tokens") directories.register("detected-symbols") directories.register("symbol-matches") directories.register("contexts-for-symbols") directories.register("sources-with-colorized-equation-tokens") directories.register("compiled-sources-with-colorized-equation-tokens") directories.register("paper-images-with-colorized-equation-tokens") directories.register("diffed-images-with-colorized-equation-tokens") directories.register("equation-tokens-locations") directories.register("composite-symbols-locations") directories.register("sources-with-colorized-symbols-with-affixes") directories.register("compiled-sources-with-colorized-symbols-with-affixes") directories.register("paper-images-with-colorized-symbols-with-affixes") directories.register("diffed-images-with-colorized-symbols-with-affixes") directories.register("symbols-with-affixes-locations") directories.register("symbols-locations")
from common import directories from common.commands.base import Command, CommandList from common.commands.detect_entities import make_detect_entities_command from common.commands.locate_entities import make_locate_entities_command from common.commands.upload_entities import make_upload_entities_command from entities.common import create_entity_localization_command_sequence from scripts.pipelines import EntityPipeline, register_entity_pipeline # from .colorize import get_definition_color_positions from .commands.detect_definitions import DetectDefinitions from .commands.embellish_sentences import EmbellishSentences from .types import Definiendum, Definition, TermReference from .upload import upload_definitions # Register directories for output from intermediate pipeline stages. directories.register("embellished-sentences") directories.register("detected-definitions") directories.register("sources-with-colorized-definitions") directories.register("compiled-sources-with-colorized-definitions") directories.register("paper-images-with-colorized-definitions") directories.register("diffed-images-with-colorized-definitions") directories.register("definitions-locations") upload_command = make_upload_entities_command( "definitions", upload_definitions, DetectedEntityType={ "entities-definiendums.csv": Definiendum, "entities-definitions.csv": Definition, "entities-term-references.csv": TermReference, },
from common import directories from common.commands.base import CommandList from common.commands.locate_entities import make_locate_entities_command from scripts.pipelines import EntityPipeline, register_entity_pipeline from .colorize import colorize_citations from .commands.extract_bibitems import ExtractBibitems from .commands.locate_citations import LocateCitations from .commands.resolve_bibitems import ResolveBibitems from .commands.upload_citations import UploadCitations from .make_digest import make_digest from .types import Bibitem directories.register("detected-citations") directories.register("bibitem-resolutions") directories.register("sources-with-colorized-citations") directories.register("compiled-sources-with-colorized-citations") directories.register("paper-images-with-colorized-citations") directories.register("diffed-images-with-colorized-citations") directories.register("citations-locations") directories.register("citation-cluster-locations") directories.register("sources-with-annotated-symbols") commands: CommandList = [ ExtractBibitems, ResolveBibitems, make_locate_entities_command("citations", DetectedEntityType=Bibitem, colorize_func=colorize_citations), LocateCitations, UploadCitations,
from .colorize import adjust_color_positions
from .extractor import GlossaryTermExtractor
from .upload import upload_terms

# Build the standard detect / locate / upload command sequence for glossary
# terms, passing the package's own color-position adjustment through
# ColorizeOptions.
commands = create_entity_localization_command_sequence(
    "glossary-terms",
    GlossaryTermExtractor,
    DetectedEntityType=Term,
    colorize_options=ColorizeOptions(adjust_color_positions=adjust_color_positions),
    upload_func=upload_terms,
)

# Before uploading entities, extract contexts that each term appeared in.
# Find where the upload command sits so the context-extraction command can be
# placed in front of it. If no command is named "upload-glossary-terms", the
# index stays at len(commands) and the new command is appended at the end.
# There is no 'break', so if several commands matched, the last one would win.
upload_command_index = len(commands)
for i, command in enumerate(commands):
    if command.get_name() == "upload-glossary-terms":
        upload_command_index = i

# Register the output directory for the extracted contexts, then insert the
# context-extraction command at the upload command's position (i.e. directly
# before it).
# NOTE(review): 'tex_wrapper' presumably marks each term with "**" on both
# sides inside its context -- confirm against make_extract_contexts_command.
directories.register("contexts-for-glossary-terms")
commands.insert(
    upload_command_index,
    make_extract_contexts_command(
        "glossary-terms", EntityType=Term, tex_wrapper=TexWrapper(before="**", after="**"),
    ),
)

# Expose the finished command list as the "glossary-terms" entity pipeline.
terms_pipeline = EntityPipeline("glossary-terms", commands)
register_entity_pipeline(terms_pipeline)
from common import directories from common.commands.base import CommandList from common.commands.locate_entities import make_locate_entities_command from common.commands.upload_entities import make_upload_entities_command from common.types import ColorizeOptions, SerializableEntity from entities.sentences.commands.extract_contexts import make_extract_contexts_command from scripts.pipelines import EntityPipeline, register_entity_pipeline from .commands.create_annotation_files import CreateAnnotationFiles from .commands.detect_definitions import DetectDefinitions from .commands.tokenize_sentences import TokenizeSentences from .types import Definiendum, Definition, TermReference from .upload import upload_definitions # Register directories for output from intermediate pipeline stages. directories.register("sentence-tokens") directories.register("annotation-files") directories.register("detected-definitions") directories.register("contexts-for-definitions") directories.register("sources-with-colorized-definitions") directories.register("compiled-sources-with-colorized-definitions") directories.register("paper-images-with-colorized-definitions") directories.register("diffed-images-with-colorized-definitions") directories.register("definitions-locations") upload_command = make_upload_entities_command( "definitions", upload_definitions, DetectedEntityType={ "entities-definiendums.csv": Definiendum, "entities-definitions.csv": Definition,
from common import directories
from common.commands.base import CommandList
from entities.common import create_entity_localization_command_sequence
from scripts.pipelines import EntityPipeline, register_entity_pipeline

from .colorize import get_sentence_color_positions
from .extractor import SentenceExtractor
# 'Sentence' exists in both .types and .upload; the one from .upload is
# aliased to 'SentenceModel' so the two do not collide.
from .types import Sentence
from .upload import Sentence as SentenceModel
from .upload import upload_sentences

# Build the standard command sequence for sentences, using the package's own
# function for choosing color positions and its own upload function.
commands = create_entity_localization_command_sequence(
    "sentences",
    SentenceExtractor,
    DetectedEntityType=Sentence,
    get_color_positions=get_sentence_color_positions,
    upload_func=upload_sentences,
)

# Register additional directories to be used by the upload function.
directories.register("sentences-model-ids")

# Expose the command list as the "sentences" entity pipeline; SentenceModel is
# handed over through 'database_models'.
sentences_pipeline = EntityPipeline("sentences", commands, database_models=[SentenceModel])
register_entity_pipeline(sentences_pipeline)
from common.commands.raster_pages import make_raster_pages_command from common.make_digest import make_default_paper_digest from common.types import ArxivId, EntityProcessingDigest from scripts.pipelines import EntityPipeline, register_entity_pipeline from ..sentences.commands.find_entity_sentences import ( make_find_entity_sentences_command, ) from .commands.colorize_equation_tokens import ColorizeEquationTokens from .commands.extract_symbols import ExtractSymbols from .commands.find_symbol_matches import FindSymbolMatches from .commands.find_symbol_sentences import FindSymbolSentences from .commands.locate_symbols import LocateSymbols from .commands.upload_symbols import UploadSymbols directories.register("detected-equation-tokens") directories.register("symbol-matches") directories.register("sentences-for-equation-tokens") directories.register("sentences-for-symbols") directories.register("sources-with-colorized-equation-tokens") directories.register("compiled-sources-with-colorized-equation-tokens") directories.register("paper-with-colorized-equation-tokens-images") directories.register("diff-images-with-colorized-equation-tokens") directories.register("hue-locations-for-equation-tokens") directories.register("symbol-locations") commands = [ ExtractSymbols, FindSymbolMatches, make_find_entity_sentences_command("equation-tokens"),
def create_entity_localization_command_sequence(
    entity_name: str,
    EntityExtractorType: Type[EntityExtractor],
    extract_contexts: bool = False,
    DetectedEntityType: Optional[Type[SerializableEntity]] = None,
    upload_func: Optional[EntityUploadCallable] = None,
    colorize_options: ColorizeOptions = ColorizeOptions(),
    colorize_func: Optional[ColorizeFunc] = None,
) -> List[Type[Command]]:  # type: ignore
    """
    Assemble the list of commands used to detect and locate a new type of entity.

    In the simplest case you only need to provide an 'entity_name', which is
    used to name the registered output directories and is passed to every
    command factory, and an 'EntityExtractorType', which is handed to the
    detection command.

    The returned list contains, in order:

    1. a detection command;
    2. a context-extraction command, only when 'extract_contexts' is True
       (use this to collect the sentences in which the entities appear);
    3. a locate command, which receives 'DetectedEntityType',
       'colorize_options' and 'colorize_func' for fine-tuning;
    4. an upload command, only when 'upload_func' is provided.

    As a side effect, the output directories for each stage (detected
    entities, optional contexts, colorized sources, compiled sources, paper
    images, diffed images and locations) are registered under names derived
    from 'entity_name'.

    If you are trying to find the locations of a new type of entity, it is
    highly recommended that you use this convenience function instead of
    creating new commands yourself.
    """
    # Stage 1: detection.
    directories.register(f"detected-{entity_name}")
    detect_command = make_detect_entities_command(entity_name, EntityExtractorType)
    commands: CommandList = [detect_command]

    # Stage 2 (optional): contexts in which each entity appears.
    if extract_contexts:
        directories.register(f"contexts-for-{entity_name}")
        contexts_command = make_extract_contexts_command(entity_name)
        commands.append(contexts_command)

    # Stage 3: localization. Register every intermediate output directory
    # before the locate command itself is created.
    for directory_name in (
        f"sources-with-colorized-{entity_name}",
        f"compiled-sources-with-colorized-{entity_name}",
        f"paper-images-with-colorized-{entity_name}",
        f"diffed-images-with-colorized-{entity_name}",
        f"{entity_name}-locations",
    ):
        directories.register(directory_name)

    locate_command = make_locate_entities_command(
        entity_name,
        None,
        DetectedEntityType,
        colorize_options,
        colorize_func,
    )
    commands.append(locate_command)

    # Stage 4 (optional): upload.
    if upload_func is None:
        return commands

    commands.append(
        make_upload_entities_command(
            entity_name, upload_func, DetectedEntityType=DetectedEntityType
        )
    )
    return commands
from common import directories from common.commands.base import CommandList from common.commands.compile_tex import make_compile_tex_command from common.commands.diff_images import make_diff_images_command from common.commands.locate_hues import make_locate_hues_command from common.commands.raster_pages import make_raster_pages_command from scripts.pipelines import EntityPipeline, register_entity_pipeline from .commands.colorize_citations import ColorizeCitations from .commands.extract_bibitems import ExtractBibitems from .commands.locate_citations import LocateCitations from .commands.resolve_bibitems import ResolveBibitems from .commands.upload_citations import UploadCitations from .make_digest import make_digest directories.register("bibitems") directories.register("bibitem-resolutions") directories.register("sources-with-colorized-citations") directories.register("compiled-sources-with-colorized-citations") directories.register("paper-with-colorized-citations-images") directories.register("diff-images-with-colorized-citations") directories.register("hue-locations-for-citations") directories.register("citation-locations") directories.register("sources-with-annotated-symbols") commands: CommandList = [ ExtractBibitems, ResolveBibitems, ColorizeCitations, make_compile_tex_command("citations"),
def create_entity_localization_command_sequence(
    entity_name: str,
    EntityExtractorType: Type[EntityExtractor],
    DetectedEntityType: Optional[Type[SerializableEntity]] = None,
    upload_func: Optional[EntityUploadCallable] = None,
    colorize_entity_when: Optional[ColorWhenFunc] = None,
    get_color_positions: Optional[ColorPositionsFunc] = None,
) -> List[Type[Command]]:  # type: ignore
    """
    Assemble the list of commands used to detect and locate a new type of entity.

    In the simplest case you only need to provide an 'entity_name', which is
    used to name the registered output directories and is passed to every
    command factory, and an 'EntityExtractorType', which is handed to the
    detection command.

    The returned list contains, in order, commands that detect the entities,
    colorize the TeX, compile the TeX, raster the pages, diff the images and
    locate the hues, followed by an upload command when 'upload_func' is
    provided. The colorization command can be fine-tuned with
    'DetectedEntityType', 'colorize_entity_when' and 'get_color_positions'.

    As a side effect, the output directories of the intermediate stages are
    registered under names derived from 'entity_name'.

    If you are trying to find the locations of a new type of entity, it is
    highly recommended that you use this convenience function instead of
    creating new commands yourself.
    """
    # Register directories for output from intermediate pipeline stages.
    for directory_name in (
        f"detected-{entity_name}",
        f"sources-with-colorized-{entity_name}",
        f"compiled-sources-with-colorized-{entity_name}",
        f"paper-with-colorized-{entity_name}-images",
        f"diff-images-with-colorized-{entity_name}",
        f"hue-locations-for-{entity_name}",
    ):
        directories.register(directory_name)

    commands: CommandList = []

    # Detection and colorization take extra configuration.
    commands.append(make_detect_entities_command(entity_name, EntityExtractorType))
    commands.append(
        make_colorize_tex_command(
            entity_name=entity_name,
            DetectedEntityType=DetectedEntityType,
            when=colorize_entity_when,
            get_color_positions=get_color_positions,
        )
    )

    # The remaining stages are each built from the entity name alone.
    for make_stage_command in (
        make_compile_tex_command,
        make_raster_pages_command,
        make_diff_images_command,
        make_locate_hues_command,
    ):
        commands.append(make_stage_command(entity_name))

    # Uploading is optional.
    if upload_func is not None:
        commands.append(
            make_upload_entities_command(
                entity_name, upload_func, DetectedEntityType=DetectedEntityType
            )
        )

    return commands