def test_root_annotations(self):
    """Root lines passed to TreeNode.create must keep their annotations.

    The test expects the annotation of the second line to be shifted by the
    length of the first line (10 characters) in the resulting root node.
    """
    sources = [
        ("bold text\n", BoldAnnotation(start=0, end=10, value="True")),
        ("italic text\n", ItalicAnnotation(start=0, end=12, value="True")),
    ]
    lines = [
        LineWithMeta(
            line=content,
            hierarchy_level=HierarchyLevel.create_root(),
            metadata=ParagraphMetadata(paragraph_type="root", predicted_classes=None, page_id=0, line_id=index),
            annotations=[annotation],
        )
        for index, (content, annotation) in enumerate(sources)
    ]

    root = TreeNode.create(lines=lines).get_root()
    node_annotations = root.annotations
    # sort in place so the bold annotation (start=0) comes first
    node_annotations.sort(key=lambda item: item.start)
    self.assertEqual(2, len(node_annotations))

    bold, italic = node_annotations
    self.assertEqual(
        (BoldAnnotation.name, "True", 0, 10),
        (bold.name, bold.value, bold.start, bold.end),
    )
    self.assertEqual(
        (ItalicAnnotation.name, "True", 10, 22),
        (italic.name, italic.value, italic.start, italic.end),
    )
def _get_lines_with_meta(self, paragraph_list: List[Paragraph]) -> List[LineWithMeta]:
    """
    Convert paragraphs into LineWithMeta objects and pass them through
    ``self.hierarchy_level_extractor``.

    :param paragraph_list: list of Paragraph
    :return: list of LineWithMeta returned by ``self.hierarchy_level_extractor.get_hierarchy_level``
    """
    # Built once for all paragraphs: maps the annotation name (first element of a raw
    # annotation) to the annotation class that is called with the remaining elements.
    dict2annotations = {
        "bold": BoldAnnotation,
        "italic": ItalicAnnotation,
        "underlined": UnderlinedAnnotation,
        "size": SizeAnnotation,
        "indentation": IndentationAnnotation,
        "alignment": AlignmentAnnotation,
        "style": StyleAnnotation
    }

    lines_with_meta = []
    # line ids are 1-based
    for paragraph_id, paragraph in enumerate(paragraph_list, start=1):
        # line with meta:
        # {"text": "",
        #  "type": ""("paragraph" ,"list_item", "raw_text", "style_header"),
        #  "level": (1,1) or None (hierarchy_level),
        #  "annotations": [["size", start, end, size], ["bold", start, end, "True"], ...]}
        line_with_meta = ParagraphInfo(paragraph).get_info()

        text = line_with_meta["text"]
        uid = '{}_{}'.format(self.path_hash, line_with_meta["uid"])
        paragraph_type = line_with_meta["type"]
        level = line_with_meta["level"]

        if level:
            hierarchy_level = HierarchyLevel(level[0], level[1], False, paragraph_type)
        else:
            # no level information: the line is treated as raw text
            hierarchy_level = HierarchyLevel(None, None, False, "raw_text")

        annotations = [
            dict2annotations[annotation[0]](*annotation[1:])
            for annotation in line_with_meta["annotations"]
        ]

        metadata = ParagraphMetadata(paragraph_type=paragraph_type,
                                     predicted_classes=None,
                                     page_id=0,
                                     line_id=paragraph_id)
        lines_with_meta.append(LineWithMeta(line=text,
                                            hierarchy_level=hierarchy_level,
                                            metadata=metadata,
                                            annotations=annotations,
                                            uid=uid))

    return self.hierarchy_level_extractor.get_hierarchy_level(lines_with_meta)
def test_equal(self):
    """Two levels built from identical arguments must compare as equal."""
    params = dict(level_1=3, level_2=3, can_be_multiline=True, paragraph_type="header")
    first = HierarchyLevel(**params)
    second = HierarchyLevel(**params)

    self.assertFalse(first < second)
    self.assertLessEqual(first, second)
    self.assertFalse(first > second)
    self.assertGreaterEqual(first, second)
    self.assertEqual(first, second)
def test_one_greater_than_other_level2(self):
    """With equal level_1, the level with the smaller level_2 must compare as lesser."""
    shared = dict(level_1=2, can_be_multiline=False, paragraph_type="list_item")
    lesser = HierarchyLevel(level_2=1, **shared)
    greater = HierarchyLevel(level_2=2, **shared)

    self.assertLess(lesser, greater)
    self.assertLessEqual(lesser, greater)
    self.assertFalse(lesser > greater)
    self.assertFalse(lesser >= greater)
    self.assertFalse(lesser == greater)
def test_raw_text_greater_than_any_other(self):
    """A raw_text level must compare as strictly greater than a list_item level."""
    item_level = HierarchyLevel(level_1=2,
                                level_2=1,
                                can_be_multiline=False,
                                paragraph_type="list_item")
    raw_level = HierarchyLevel.create_raw_text()

    self.assertFalse(item_level > raw_level)
    self.assertFalse(item_level >= raw_level)
    self.assertFalse(item_level == raw_level)
    self.assertLess(item_level, raw_level)
    self.assertLessEqual(item_level, raw_level)
def test_two_raw_text(self):
    """Raw-text levels must compare as equal, even when explicit level numbers are given."""
    default_raw = HierarchyLevel.create_raw_text()
    another_raw = HierarchyLevel.create_raw_text()
    numbered_raw = HierarchyLevel(level_1=1,
                                  level_2=2,
                                  can_be_multiline=False,
                                  paragraph_type="raw_text")

    self.assertEqual(default_raw, another_raw)
    self.assertGreaterEqual(default_raw, another_raw)
    self.assertLessEqual(default_raw, another_raw)

    self.assertEqual(default_raw, numbered_raw)
    self.assertGreaterEqual(default_raw, numbered_raw)
    self.assertLessEqual(default_raw, numbered_raw)
def create(lines: List[LineWithMeta] = None) -> "TreeNode":
    """
    Creates a root node with given text

    :param lines: these lines should be the title of the document (or should be empty or None
        for documents without title)
    :return: root of the document tree
    """
    # The signature defaults to None, so a missing argument means "no title lines"
    # rather than a TypeError on len(None).
    if lines is None:
        lines = []

    # the root takes the smallest page_id / line_id of its lines, 0 when there are none
    page_id = min((line.metadata.page_id for line in lines), default=0)
    line_id = min((line.metadata.line_id for line in lines), default=0)

    annotations = []
    text_length = 0
    for line in lines:
        # text_length is the total length of the lines that precede this one in the joined text
        annotations.extend(TreeNode.__shift_annotations(line=line, text_length=text_length))
        text_length += len(line.line)

    text = "".join(line.line for line in lines)
    metadata = ParagraphMetadata(paragraph_type="root",
                                 page_id=page_id,
                                 line_id=line_id,
                                 predicted_classes=None)
    return TreeNode("0",
                    text,
                    annotations=annotations,
                    metadata=metadata,
                    subparagraphs=[],
                    hierarchy_level=HierarchyLevel.create_root(),
                    parent=None)
def insert_table(self, document: UnstructuredDocument) -> UnstructuredDocument:
    """
    Takes a document as the input and inserts table cells into the paragraphs list.
    Does not insert a table if it already was inserted.

    Note: raw-text lines of the input document get a new hierarchy level assigned in place.
    """
    # tables that still have to be placed, by uid
    pending_tables = {}
    for table in document.tables:
        if not table.metadata.is_inserted:
            pending_tables[table.metadata.uid] = table

    # the largest level_1 among non raw-text lines (0 when there are none)
    hierarchy_level = max(
        (line.hierarchy_level.level_1 for line in document.lines if not line.hierarchy_level.is_raw_text()),
        default=0,
    )
    hierarchy_level_raw_text = HierarchyLevel(level_1=hierarchy_level + 1,
                                              level_2=0,
                                              can_be_multiline=True,
                                              paragraph_type=HierarchyLevel.raw_text)

    paragraphs = []
    for line in document.lines:
        if line.hierarchy_level.is_raw_text():
            line.set_hierarchy_level(hierarchy_level_raw_text)
        paragraphs.append(line)

        # put each referenced table right after the line that refers to it
        for annotation in line.annotations:
            if annotation.name != TableAnnotation.name:
                continue
            table_id = annotation.value
            if table_id not in pending_tables:
                continue
            paragraphs.extend(
                self._create_paragraphs_from_table(table=pending_tables[table_id], hierarchy_level=hierarchy_level)
            )
            pending_tables.pop(table_id)

    # tables that no line refers to are appended at the end
    for table in pending_tables.values():
        paragraphs.extend(self._create_paragraphs_from_table(table=table, hierarchy_level=hierarchy_level))

    return UnstructuredDocument(lines=paragraphs, tables=document.tables, attachments=document.attachments)
def __get_line(self, text: str, level1: int, level2: int, hl: str = "list") -> LineWithMeta:
    """
    Build a LineWithMeta without annotations.

    :param text: text of the line
    :param level1: level_1 of the hierarchy level
    :param level2: level_2 of the hierarchy level
    :param hl: paragraph type of the hierarchy level
    :return: line whose metadata has the "list_item" paragraph type
    """
    return LineWithMeta(
        text,
        hierarchy_level=HierarchyLevel(level1, level2, False, hl),
        metadata=ParagraphMetadata("list_item", None, 0, None),
        annotations=[],
    )
def __create_list_line(line: LineWithMeta):
    """
    Build an empty line of type "list" for the given line.

    The new line copies level_1, page_id and line_id of the given line;
    its level_2 is 0.5 less than level_2 of the given line.

    :param line: line the list line is created for
    :return: LineWithMeta with empty text and no annotations
    """
    source_level = line.hierarchy_level
    # the fractional level_2 is intentional for lists
    list_level = HierarchyLevel(
        level_1=source_level.level_1,
        level_2=source_level.level_2 - 0.5,  # noqa
        paragraph_type="list",
        can_be_multiline=False,
    )
    list_metadata = ParagraphMetadata(
        paragraph_type="list",
        page_id=line.metadata.page_id,
        line_id=line.metadata.line_id,
        predicted_classes=None,
    )
    return LineWithMeta(line="", hierarchy_level=list_level, metadata=list_metadata, annotations=[])
def _create_cell_line(table: Table, hierarchy_level: int, cell: str) -> LineWithMeta:
    """
    Build a line of type "table_cell" holding the text of one cell.

    :param table: table the cell belongs to (its page_id is copied to the line metadata)
    :param hierarchy_level: base level; the cell line gets level_1 = hierarchy_level + 3
    :param cell: text of the cell
    :return: LineWithMeta without annotations
    """
    cell_level = HierarchyLevel(level_1=hierarchy_level + 3,
                                level_2=0,
                                can_be_multiline=False,
                                paragraph_type="table_cell")
    cell_metadata = ParagraphMetadata(paragraph_type="table_cell",
                                      predicted_classes=None,
                                      page_id=table.metadata.page_id,
                                      line_id=None)
    return LineWithMeta(line=cell,
                        hierarchy_level=cell_level,
                        metadata=cell_metadata,
                        annotations=[])
def _create_table_line(table: Table, hierarchy_level: int) -> LineWithMeta:
    """
    Build an empty line of type "table" that represents the table itself.

    :param table: table to represent (its page_id and uid are used)
    :param hierarchy_level: base level; the table line gets level_1 = hierarchy_level + 1
    :return: LineWithMeta without text and annotations, with uid "table_<table uid>"
    """
    table_level = HierarchyLevel(level_1=hierarchy_level + 1,
                                 level_2=0,
                                 can_be_multiline=False,
                                 paragraph_type="table")
    table_metadata = ParagraphMetadata(paragraph_type="table",
                                       predicted_classes=None,
                                       page_id=table.metadata.page_id,
                                       line_id=None)
    line_uid = "table_{}".format(table.metadata.uid)
    return LineWithMeta(line="",
                        hierarchy_level=table_level,
                        metadata=table_metadata,
                        annotations=[],
                        uid=line_uid)
def create(texts: Iterable[str]) -> "TreeNode":
    """
    Creates a root node with given text

    :param texts: this text should be the title of the document (or should be empty for documents
        without title); the items are joined with a newline
    :return: root of the document tree
    """
    title = "\n".join(texts)
    root_metadata = ParagraphMetadata(paragraph_type="root",
                                      page_id=0,
                                      line_id=0,
                                      predicted_classes=None)
    root_level = HierarchyLevel(0, 0, True, paragraph_type="root")
    return TreeNode("0",
                    title,
                    annotations=[],
                    metadata=root_metadata,
                    subparagraphs=[],
                    hierarchy_level=root_level,
                    parent=None)
def __handle_one_element(self, depth: int, value, paragraph_type: str, paragraph_type_meta):
    """
    Build a line without annotations for one element.

    :param depth: depth of the element, used as level_1 unless the element is a title at depth 1
    :param value: element content, converted to text with ``self.__get_text``
    :param paragraph_type: paragraph type stored in the line metadata
    :param paragraph_type_meta: paragraph type stored in the hierarchy level
    :return: LineWithMeta for the element
    """
    # a title at depth 1 gets level (0, 0); every other element gets (depth, 1)
    is_top_title = depth == 1 and paragraph_type == "title"
    level1, level2 = (0, 0) if is_top_title else (depth, 1)

    hierarchy_level = HierarchyLevel(level_1=level1,
                                     level_2=level2,
                                     can_be_multiline=False,
                                     paragraph_type=paragraph_type_meta)
    metadata = ParagraphMetadata(paragraph_type=paragraph_type,
                                 predicted_classes=None,
                                 page_id=0,
                                 line_id=None)
    return LineWithMeta(line=self.__get_text(value),
                        hierarchy_level=hierarchy_level,
                        metadata=metadata,
                        annotations=[])
def _get_lines_with_meta(self, hierarchy_level_extractor: HierarchyLevelExtractor) -> List[LineWithMeta]:
    """
    Convert ``self.paragraph_list`` into LineWithMeta objects, attach annotations for the
    referenced images, diagrams and tables, and pass the lines through the given extractor.

    :param hierarchy_level_extractor: extractor whose ``get_hierarchy_level`` is applied to the built lines
    :return: list of LineWithMeta returned by ``hierarchy_level_extractor.get_hierarchy_level``
    """
    # Built once for all paragraphs: maps the annotation name (first element of a raw
    # annotation) to the annotation class that is called with the remaining elements.
    dict2annotations = {
        "bold": BoldAnnotation,
        "italic": ItalicAnnotation,
        "underlined": UnderlinedAnnotation,
        "size": SizeAnnotation,
        "indentation": IndentationAnnotation,
        "alignment": AlignmentAnnotation,
        "style": StyleAnnotation,
    }

    lines_with_meta = []
    for i, paragraph in enumerate(self.paragraph_list):
        # line with meta:
        # {"text": "",
        #  "type": ""("paragraph" ,"list_item", "raw_text", "style_header"),
        #  "level": (1,1) or None (hierarchy_level),
        #  "annotations": [["size", start, end, size], ["bold", start, end, "True"], ...]}
        line_with_meta = ParagraphInfo(paragraph).get_info()

        text = line_with_meta["text"]
        paragraph_type = line_with_meta["type"]
        level = line_with_meta["level"]

        if level:
            hierarchy_level = HierarchyLevel(level[0], level[1], False, paragraph_type)
        else:
            hierarchy_level = HierarchyLevel.create_raw_text()

        annotations = [
            dict2annotations[annotation[0]](*annotation[1:])
            for annotation in line_with_meta["annotations"]
        ]

        # references are keyed by the 0-based paragraph index; each one covers the whole line
        for object_dict in [self.image_refs, self.diagram_refs]:
            if i in object_dict:
                for object_uid in object_dict[i]:
                    annotations.append(AttachAnnotation(attach_uid=object_uid, start=0, end=len(text)))

        if i in self.table_refs:
            for table_uid in self.table_refs[i]:
                annotations.append(TableAnnotation(name=table_uid, start=0, end=len(text)))

        # line ids are 1-based
        metadata = ParagraphMetadata(paragraph_type=paragraph_type,
                                     predicted_classes=None,
                                     page_id=0,
                                     line_id=i + 1)
        lines_with_meta.append(LineWithMeta(line=text,
                                            hierarchy_level=hierarchy_level,
                                            metadata=metadata,
                                            annotations=annotations,
                                            uid=paragraph.uid))

    return hierarchy_level_extractor.get_hierarchy_level(lines_with_meta)
# but unstructured document consist of flat list of lines with text and metadata # hierarchy structure hidden in HierarchyLevel attribute of LineWithMeta # lets build firs line, it is document tree root: text = "DOCUMENT TITLE" metadata = ParagraphMetadata(paragraph_type="title", predicted_classes=None, page_id=0, line_id=0) # hierarchy level define position of this line in document tree. hierarchy_level = HierarchyLevel( # most important parameters of HierarchyLevel is level_1 and level_2 # hierarchy level compares by tuple (level_1, level_2) lesser -> closer to the root of the tree level_1=0, level_2=0, # can_be_multiline and paragraph_type - some parts of the document (for example title) may take more # than one line # if can_be_multiline is true than several lines in a row with same level_1, level_2 and paragraph_type # will be merged in one tree node can_be_multiline=True, paragraph_type="title") # Annotations: one may specify some information about some part of the text, for example that some word # written in italic font. annotations = [] line1 = LineWithMeta(line=text, hierarchy_level=hierarchy_level, metadata=metadata, annotations=annotations)