def read(self,
         path: str,
         document_type: Optional[str] = None,
         parameters: Optional[dict] = None) -> UnstructuredDocument:
    """
    Read a presentation and collect the text and the tables found in its shapes.

    Slides are numbered from 1 and the number is stored as ``page_id``; shapes are numbered
    from 1 inside each slide and the number is stored as ``line_id`` of the produced line.

    :param path: path of the presentation, handed to ``Presentation``
    :param document_type: not referenced in this method
    :param parameters: not referenced in this method
    :return: document holding the lines (after hierarchy level extraction), the tables and no attachments
    """
    presentation = Presentation(path)
    collected_lines = []
    collected_tables = []

    for slide_number, slide in enumerate(presentation.slides, start=1):
        for shape_number, shape in enumerate(slide.shapes, start=1):
            # a shape may yield a text line, a table, both, or nothing
            if shape.has_text_frame:
                line_metadata = ParagraphMetadata(paragraph_type="raw_text",
                                                  predicted_classes=None,
                                                  page_id=slide_number,
                                                  line_id=shape_number)
                text_line = LineWithMeta(line=shape.text,
                                         hierarchy_level=None,
                                         metadata=line_metadata,
                                         annotations=[])
                collected_lines.append(text_line)

            if shape.has_table:
                rows = []
                for table_row in shape.table.rows:
                    rows.append([cell.text for cell in table_row.cells])
                table_metadata = TableMetadata(page_id=slide_number)
                collected_tables.append(Table(cells=rows, metadata=table_metadata))

    # hierarchy levels are assigned only after all lines of all slides are gathered
    collected_lines = self.hierarchy_level_extractor.get_hierarchy_level(collected_lines)
    return UnstructuredDocument(lines=collected_lines, tables=collected_tables, attachments=[])
def get_api_dict(api: Api) -> Model:
    """
    Register and return the API model named 'Table'.

    :param api: object whose ``model`` method is used to build the model; it is also
        passed to ``TableMetadata.get_api_dict`` for the nested metadata model
    :return: the value returned by ``api.model``
    """
    # cells: a list of rows, each row being a list of string cells
    cell_field = fields.String(description="Cell contains text")
    row_field = fields.List(cell_field)
    cells_field = fields.List(row_field, description="matrix of cells")

    metadata_model = TableMetadata.get_api_dict(api)
    metadata_field = fields.Nested(metadata_model, readonly=True, description='Table meta information')

    table_schema = {
        'cells': cells_field,
        'metadata': metadata_field,
    }
    return api.model('Table', table_schema)
def _handle_table_xml(self, paragraph_xml: BeautifulSoup):
    """
    Store the table described by the given xml and bind it to the most recent paragraph.

    The table is appended to ``self.tables`` and its uid is appended to ``self.table_refs``
    under the index of the last element of ``self.paragraph_list``. When no paragraph has been
    collected yet, an empty one is created first so that such an index exists.

    :param paragraph_xml: xml node passed to ``DocxTable`` together with ``self.styles_extractor``
    """
    docx_table = DocxTable(paragraph_xml, self.styles_extractor)
    table_metadata = TableMetadata(page_id=None, uid=docx_table.uid)
    parsed_table = Table(cells=docx_table.get_cells(), metadata=table_metadata)
    self.tables.append(parsed_table)
    uid = docx_table.uid

    if not self.paragraph_list:
        # no paragraph to attach the table to yet: insert an empty placeholder paragraph
        placeholder_soup = BeautifulSoup('<w:p></w:p>')
        placeholder_xml = placeholder_soup.body.contents[0]
        self.paragraph_list.append(self.__xml2paragraph(placeholder_xml))

    last_paragraph_index = len(self.paragraph_list) - 1
    self.table_refs[last_paragraph_index].append(uid)
def __parse_sheet(self, sheet_id: int, sheet: Sheet) -> Table:
    """
    Turn one sheet into a table by reading every cell value row by row.

    :param sheet_id: stored as ``page_id`` in the table metadata
    :param sheet: sheet providing ``nrows``, ``ncols`` and ``cell_value(rowx=..., colx=...)``
    :return: table whose cells form an ``nrows`` x ``ncols`` matrix of the sheet's cell values
    """
    row_count, column_count = sheet.nrows, sheet.ncols
    cells = [
        [sheet.cell_value(rowx=row_index, colx=column_index) for column_index in range(column_count)]
        for row_index in range(row_count)
    ]
    return Table(cells=cells, metadata=TableMetadata(page_id=sheet_id))
def read(self,
         path: str,
         document_type: Optional[str] = None,
         parameters: Optional[dict] = None) -> UnstructuredDocument:
    """
    Read a delimiter-separated text file into a document that contains a single table.

    The delimiter is taken from ``parameters["delimiter"]`` when given; otherwise a tab is used
    for paths ending with ".tsv" and ``self.default_separator`` for everything else.

    :param path: path of the file to read
    :param document_type: not referenced in this method
    :param parameters: optional settings; only the "delimiter" key is read here, ``None`` means no settings
    :return: document with no lines, one table (``page_id=0``) holding all parsed rows, and no attachments
    """
    # `parameters` defaults to None, so it must be normalised before `.get` is called on it
    if parameters is None:
        parameters = {}

    delimiter = parameters.get("delimiter")
    if delimiter is None:
        delimiter = "\t" if path.endswith(".tsv") else self.default_separator

    # newline="" lets the csv module do its own newline handling, as its documentation requires;
    # otherwise line breaks embedded in quoted fields are rewritten before the parser sees them
    with open(path, errors="ignore", newline="") as file:
        csv_reader = csv.reader(file, delimiter=delimiter)
        data = list(csv_reader)

    table_metadata = TableMetadata(page_id=0)
    tables = [Table(cells=data, metadata=table_metadata)]
    return UnstructuredDocument(lines=[], tables=tables, attachments=[])
# in this example we create UnstructuredDocument, lets construct document corresponding to example.docx from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.paragraph_metadata import ParagraphMetadata from dedoc.data_structures.table import Table from dedoc.data_structures.table_metadata import TableMetadata # First of all lets create some table, table consist of cells (list of rows, and row is a list of strings from dedoc.structure_parser.heirarchy_level import HierarchyLevel table_cells = [ ["N", "Second name", "Name", "Organization", "Phone", "Notes"], ["1", "Ivanov", "Ivan", "ISP RAS", "8-800"], ] # table also has some metadata, lets assume that our table is on first page table_metadata = TableMetadata(page_id=0) # finally lets build table table = Table(cells=table_cells, metadata=table_metadata) # Documents also contain some text. # Logical structure of document may be represented by tree (see example_tree.png) # but unstructured document consist of flat list of lines with text and metadata # hierarchy structure hidden in HierarchyLevel attribute of LineWithMeta # lets build firs line, it is document tree root: text = "DOCUMENT TITLE" metadata = ParagraphMetadata(paragraph_type="title", predicted_classes=None, page_id=0, line_id=0) # hierarchy level define position of this line in document tree.
def _process_table(table: DocxTable) -> Table:
    """
    Convert a docx table into a ``Table`` made of the text of its cells.

    :param table: object exposing ``rows``, each row exposing ``cells`` with a ``text`` attribute
    :return: table with one list of cell texts per row and metadata whose ``page_id`` is ``None``
    """
    text_rows = []
    for table_row in table.rows:
        text_rows.append([cell.text for cell in table_row.cells])
    return Table(cells=text_rows, metadata=TableMetadata(page_id=None))