def test_two_raw_text(self):
    """
    A level built by ``HierarchyLevel.create_raw_text()`` must satisfy ``==``,
    ``>=`` and ``<=`` against another ``create_raw_text()`` level and against
    a level constructed explicitly with ``paragraph_type="raw_text"``.
    """
    reference = HierarchyLevel.create_raw_text()
    from_factory = HierarchyLevel.create_raw_text()
    built_explicitly = HierarchyLevel(
        level_1=1,
        level_2=2,
        can_be_multiline=False,
        paragraph_type="raw_text",
    )

    # The same three operators are checked against each counterpart, in the
    # order: factory-built level first, explicitly built level second.
    for counterpart in (from_factory, built_explicitly):
        self.assertTrue(reference == counterpart)
        self.assertTrue(reference >= counterpart)
        self.assertTrue(reference <= counterpart)
def test_raw_text_greater_than_any_other(self):
    """
    A ``list_item`` level must compare strictly below the level returned by
    ``HierarchyLevel.create_raw_text()`` for every comparison operator.
    """
    item_level = HierarchyLevel(
        level_1=2,
        level_2=1,
        can_be_multiline=False,
        paragraph_type="list_item",
    )
    raw_level = HierarchyLevel.create_raw_text()

    # Operators that must NOT hold for list_item vs raw_text.
    self.assertFalse(item_level > raw_level)
    self.assertFalse(item_level >= raw_level)
    self.assertFalse(item_level == raw_level)

    # Operators that must hold for list_item vs raw_text.
    self.assertTrue(item_level < raw_level)
    self.assertTrue(item_level <= raw_level)
def _get_lines_with_meta(
        self, hierarchy_level_extractor: HierarchyLevelExtractor
) -> List[LineWithMeta]:
    """
    Build a ``LineWithMeta`` for every paragraph in ``self.paragraph_list``.

    :param hierarchy_level_extractor: object whose ``get_hierarchy_level``
        method is called on the assembled list; its result is returned as is
    :return: list of LineWithMeta
    """
    # Maps the annotation name (item 0 of each raw annotation) to the class
    # instantiated with the remaining items. It does not depend on the
    # paragraph, so it is built once instead of on every iteration.
    dict2annotations = {
        "bold": BoldAnnotation,
        "italic": ItalicAnnotation,
        "underlined": UnderlinedAnnotation,
        "size": SizeAnnotation,
        "indentation": IndentationAnnotation,
        "alignment": AlignmentAnnotation,
        "style": StyleAnnotation,
    }

    lines_with_meta = []
    for i, paragraph in enumerate(self.paragraph_list):
        # line with meta:
        # {"text": "",
        #  "type": ""("paragraph" ,"list_item", "raw_text", "style_header"),
        #  "level": (1,1) or None (hierarchy_level),
        #  "annotations": [["size", start, end, size], ["bold", start, end, "True"], ...]}
        line_with_meta = ParagraphInfo(paragraph).get_info()

        text = line_with_meta["text"]
        paragraph_type = line_with_meta["type"]
        level = line_with_meta["level"]

        # A falsy level (e.g. None) means no hierarchy information is
        # available, so the line falls back to the raw-text level.
        if level:
            hierarchy_level = HierarchyLevel(level[0], level[1], False, paragraph_type)
        else:
            hierarchy_level = HierarchyLevel.create_raw_text()

        annotations = [
            dict2annotations[annotation[0]](*annotation[1:])
            for annotation in line_with_meta["annotations"]
        ]

        # Images and diagrams referenced by this paragraph index are attached
        # over the whole text span of the line.
        for object_dict in [self.image_refs, self.diagram_refs]:
            if i in object_dict:
                annotations.extend(
                    AttachAnnotation(attach_uid=object_uid, start=0, end=len(text))
                    for object_uid in object_dict[i]
                )

        # Tables referenced by this paragraph index, also over the whole span.
        if i in self.table_refs:
            annotations.extend(
                TableAnnotation(name=table_uid, start=0, end=len(text))
                for table_uid in self.table_refs[i]
            )

        # line_id is 1-based: it equals the paragraph index plus one, the
        # same value the previous pre-incremented counter produced.
        metadata = ParagraphMetadata(paragraph_type=paragraph_type,
                                     predicted_classes=None,
                                     page_id=0,
                                     line_id=i + 1)

        lines_with_meta.append(
            LineWithMeta(line=text,
                         hierarchy_level=hierarchy_level,
                         metadata=metadata,
                         annotations=annotations,
                         uid=paragraph.uid))

    lines_with_meta = hierarchy_level_extractor.get_hierarchy_level(
        lines_with_meta)
    return lines_with_meta