def annotate_symbols_and_equations_for_file(
    tex: str,
    tex_path: RelativePath,
    symbols: SymbolDict,
    characters: CharacterDict,
) -> Tuple[str, Set[str]]:
    """Insert symbol/equation annotations into the TeX of a single file.

    Equations are extracted from ``tex`` with ``EquationExtractor``; for each
    one, ``_create_annotations_for_equation`` is asked for the annotations and
    the symbol TeX associated with it. All annotation texts are then spliced
    into a copy of ``tex`` at their recorded positions.

    Args:
        tex: contents of the TeX file.
        tex_path: path of the TeX file, used to build ``EquationId`` keys.
        symbols: symbols to annotate, grouped internally by equation.
        characters: passed through to ``_create_annotations_for_equation``.

    Returns:
        A ``(annotated_tex, symbol_tex)`` tuple: the TeX with every annotation
        inserted, and the union of the ``symbol_tex`` sets reported for the
        equations.
    """
    # Materialize the equations before doing anything else with the symbols.
    extractor = EquationExtractor()
    found_equations = list(extractor.parse(tex_path, tex))

    grouped_symbols = _group_by_equation(symbols)

    # Gather the annotations (and symbol TeX) produced for every equation.
    pending: List[Annotation] = []
    collected_symbol_tex: Set[str] = set()
    for eq in found_equations:
        eq_id = EquationId(tex_path, eq.i)
        result = _create_annotations_for_equation(
            tex,
            eq,
            eq_id,
            grouped_symbols.get(eq_id, []),
            characters,
        )
        pending.extend(result.annotations)
        collected_symbol_tex.update(result.symbol_tex)

    # Splice from the highest position down to the lowest: an insertion only
    # shifts the text after it, so positions still to be processed stay valid.
    output = tex
    for note in sorted(pending, key=lambda n: n.position, reverse=True):
        cut = note.position
        output = "".join((output[:cut], note.text, output[cut:]))

    return output, collected_symbol_tex
def _group_by_equation(symbols: SymbolDict) -> SymbolsByEquationId:
    """Bucket symbols by the equation they belong to.

    The equation key is built from the ``tex_path`` and ``equation_index``
    attributes of each symbol ID.

    Args:
        symbols: mapping from symbol ID to symbol.

    Returns:
        A dict mapping each ``EquationId`` to the list of its symbols, in the
        iteration order of ``symbols``.
    """
    grouped: SymbolsByEquationId = {}
    for sym_id, sym in symbols.items():
        key = EquationId(sym_id.tex_path, sym_id.equation_index)
        # setdefault creates the bucket the first time an equation is seen.
        grouped.setdefault(key, []).append(sym)
    return grouped
def load_equations(arxiv_id: ArxivId) -> Optional[Dict[EquationId, Equation]]:
    """Load the detected equations of a paper, keyed by ``EquationId``.

    Reads ``entities.csv`` from the paper's ``detected-equations`` directory.

    Args:
        arxiv_id: identifier of the paper whose equations should be loaded.

    Returns:
        A dict from ``EquationId`` to ``Equation``, or ``None`` (after logging
        a warning) if the CSV file does not exist. If several rows share an
        ID, the last one wins.
    """
    data_dir = directories.arxiv_subdir("detected-equations", arxiv_id)
    csv_path = os.path.join(data_dir, "entities.csv")

    if not os.path.exists(csv_path):
        logging.warning("No equation data found for paper %s. Skipping.", arxiv_id)
        return None

    # The index is passed through int() so keys are built from integers.
    return {
        EquationId(tex_path=row.tex_path, equation_index=int(row.i)): row
        for row in load_from_csv(csv_path, Equation)
    }
def colorize_equation_tokens(
    file_contents: Dict[TexFileName, FileContents],
    tokens: List[SerializableToken],
    insert_color_macros: bool = True,
    preset_hue: Optional[float] = None,
) -> Iterator[TokenColorizationBatch]:
    """Yield batches of TeX files in which equation tokens have been colorized.

    Each pass colorizes tokens in every file that has equations, delegating
    the per-equation work to ``_colorize_tokens_for_equation``. After a batch
    is yielded, the number of tokens to skip grows by ``NUM_HUES`` and another
    pass is made over the *original* file contents. Iteration stops at the
    first pass that colorizes no tokens.

    Args:
        file_contents: original contents of each TeX file, keyed by file name.
        tokens: tokens to colorize. Only tokens with ``equation_depth == 0``
            are handed to the colorizer.
        insert_color_macros: if true, ``add_color_macros`` is applied to each
            file after all of its equations have been processed.
        preset_hue: forwarded unchanged to ``_colorize_tokens_for_equation``.

    Yields:
        ``TokenColorizationBatch`` objects holding the colorized files and the
        tokens colorized in that pass.
    """
    equation_ids_per_file: Dict[TexFileName, Set[EquationId]] = {}
    top_level_tokens: Dict[EquationId, List[SerializableToken]] = {}

    for tok in tokens:
        eq_id = EquationId(tok.tex_path, tok.equation_index)
        bucket = top_level_tokens.setdefault(eq_id, [])
        # Tokens of nested equations (equations inside other equations) are
        # left out: coloring them again on a second pass would invalidate the
        # positions at which the tokens are expected to be found.
        if tok.equation_depth == 0:
            bucket.append(tok)
        equation_ids_per_file.setdefault(tok.tex_path, set()).add(eq_id)

    # How many tokens to skip in each equation. It starts at zero and grows
    # with every pass, since the supply of distinct hues runs out quickly.
    # All equations of all files are handled in the same pass; colors are
    # later searched for within each equation's own bounding boxes.
    skip = 0
    while True:
        batch_files: Dict[TexFileName, FileContents] = {}
        batch_tokens = []

        for tex_filename, original in file_contents.items():
            equation_ids = equation_ids_per_file.get(tex_filename)
            if equation_ids is None:
                continue

            tex = original.contents
            # Equations are visited from the highest index to the lowest.
            ordered_ids = sorted(
                equation_ids, key=lambda eq: eq.equation_index, reverse=True
            )
            for eq_id in ordered_ids:
                eq_tokens = top_level_tokens.get(eq_id)
                if eq_tokens is None:
                    continue
                tex, newly_colorized = _colorize_tokens_for_equation(
                    tex, eq_tokens, skip, preset_hue
                )
                batch_tokens.extend(newly_colorized)

            # Macros are added only once every token has been wrapped: they
            # will likely land at the start of the file, and adding them
            # earlier would likely shift the expected character positions.
            if insert_color_macros:
                tex = add_color_macros(tex)

            batch_files[tex_filename] = FileContents(
                original.path, tex, original.encoding
            )

        # A pass that colorized nothing means every token has been handled.
        if not batch_tokens:
            return

        yield TokenColorizationBatch(batch_files, batch_tokens)
        skip += NUM_HUES
def load_hues(self, arxiv_id: ArxivId, iteration: str) -> List[HueSearchRegion]:
    """Build the hue search regions for the equation tokens of a paper.

    Two CSV files are read: ``hue_locations.csv`` (bounding boxes of each
    equation) and ``entity_hues.csv`` (colorization records of the tokens for
    the given iteration). For every (equation, file) pair with bounding boxes,
    one ``HueSearchRegion`` is produced per token record of that equation. The
    region's masks are the equation's boxes in that file, grouped by page.

    Args:
        arxiv_id: identifier of the paper.
        iteration: name of the colorization iteration to load records for.

    Returns:
        The list of search regions. Equations with bounding boxes but no
        token records contribute nothing.
    """
    boxes_csv = os.path.join(
        directories.arxiv_subdir("hue-locations-for-equations", arxiv_id),
        "hue_locations.csv",
    )

    # equation -> file -> bounding boxes of the equation in that file.
    boxes_by_equation: Dict[EquationId, BoundingBoxesByFile] = {}
    for info in file_utils.load_from_csv(boxes_csv, HueLocationInfo):
        eq_id = EquationId(
            tex_path=info.tex_path,
            equation_index=int(info.entity_id),
        )
        per_file = boxes_by_equation.setdefault(eq_id, {})
        per_file.setdefault(info.relative_file_path, []).append(
            BoundingBox(
                page=info.page,
                left=info.left,
                top=info.top,
                width=info.width,
                height=info.height,
            )
        )

    hues_csv = os.path.join(
        directories.iteration(
            "sources-with-colorized-equation-tokens",
            arxiv_id,
            iteration,
        ),
        "entity_hues.csv",
    )

    # equation -> token index -> colorization record. A later record with the
    # same token index replaces the earlier one.
    records_by_equation: Dict[
        EquationId, Dict[int, EquationTokenColorizationRecord]
    ] = {}
    for record in file_utils.load_from_csv(
        hues_csv, EquationTokenColorizationRecord
    ):
        # NOTE(review): unlike the bounding-box keys above, equation_index is
        # not passed through int() here. If the loader yields it as a string
        # these keys would never match the bounding-box keys -- confirm.
        eq_id = EquationId(
            tex_path=record.tex_path, equation_index=record.equation_index
        )
        token_index = int(record.token_index)
        records_by_equation.setdefault(eq_id, {})[token_index] = record

    regions = []
    for eq_id, boxes_by_file in boxes_by_equation.items():
        records = records_by_equation.get(eq_id)
        for file_path, boxes in boxes_by_file.items():
            # Group the equation's boxes by page.
            masks: MasksForPages = {}
            for box in boxes:
                masks.setdefault(box.page, []).append(
                    Rectangle(box.left, box.top, box.width, box.height)
                )

            if records is None:
                continue

            # Every token of the equation shares the same masks object.
            for record in records.values():
                regions.append(
                    HueSearchRegion(
                        hue=record.hue,
                        record=record,
                        relative_file_path=file_path,
                        masks=masks,
                    )
                )

    return regions