def test_extract_multi_line_text_between_list_items():
    """Free text wedged between <li> items is emitted as separate text blocks."""
    blocks = []
    texts = []
    extractor = ListExtractor()
    markup = ''' <ul> <li>First <a href="#">link</a> item</li> <h2>List heading</h2> <p>Second<br> line</p> Third line <li>Two</li> <li>Three</li> </ul> '''
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert len(texts) == 7
    assert len(blocks[5]['items']) == 3
    assert texts[0] == 'First link item'
    assert texts[1] == 'List heading'
    assert texts[3] == 'line'
    assert blocks[2]['type'] == 'text'
    assert blocks[2]['text'] == 'Second'
    assert blocks[5]['type'] == 'list'
    assert blocks[5]['items'][0] == 'First [[link]] item'
    assert blocks[5]['items'][1] == 'Two'
    assert 'heading' not in blocks[2]
def test_extract_table_with_column_1_headers():
    """<th> cells used as row labels in column 1 still land in the table body."""
    blocks = []
    texts = []
    extractor = TableExtractor()
    markup = ''' <table> <tr> <th>Column Heading 1</th> <th>Column Heading 2</th> <th>Column Heading 3</th> </tr> <tr> <th>Row 1 Column 1</th> <td>Row 1 Column 2</td> <td>Row 1 Column 3</td> </tr> <tr> <th>Row 2 Column 1</th> <td>Row 2 Column 2</td> <td>Row 2 Column 3</td> </tr> </table> '''
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert len(texts) == 3
    assert len(blocks[0]['body'][0]) == 3
    assert texts[0] == r'Column Heading 1\tColumn Heading 2\tColumn Heading 3'
    assert blocks[0]['type'] == 'table'
    assert blocks[0]['head'][0][0] == 'Column Heading 1'
    assert blocks[0]['body'][0][0] == 'Row 1 Column 1'
def test_extract_untagged_text_at_end_of_list():
    """Stray text after the final <li> becomes its own text block, not a list item."""
    blocks = []
    texts = []
    extractor = ListExtractor()
    markup = ''' <ul> <li>First <a href="#">link</a> item</li> <li>Two</li> <li>Three</li> List heading </ul> '''
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert len(texts) == 4
    assert len(blocks[2]['items']) == 3
    assert texts[2] == 'Three'
    assert texts[3] == 'List heading'
    assert blocks[1]['type'] == 'text'
    assert blocks[1]['text'] == 'List heading'
    assert blocks[2]['type'] == 'list'
    assert blocks[2]['items'][2] == 'Three'
    assert 'heading' not in blocks[2]
def test_extract_enclosed_text2():
    """Text inside a <div>, before and after a <p>, is extracted in document order."""
    blocks = []
    texts = []
    extractor = TextExtractor()
    markup = ''' <h1>My Heading</h1> <div> First line <p>My <font color="#ccc">colored</font> <a href="#">text</a> line</p> Last line </div> <ul><li>List</li></ul> '''
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert len(texts) == 3
    assert texts[0] == 'First line'
    assert texts[1] == 'My colored text line'
    assert texts[2] == 'Last line'
    assert blocks[2]['type'] == 'text'
    assert blocks[2]['text'] == 'My colored [[text]] line'
    assert blocks[3]['type'] == 'text'
    assert blocks[3]['text'] == 'Last line'
def test_extract_heading_and_text_combo():
    """Heading and text extractors run together without double-counting content."""
    blocks = []
    texts = []
    headings = HeadingExtractor()
    prose = TextExtractor(
        excluded_tags=['ul', 'ol', 'title', 'h1', 'h2', 'h3', 'h4'])
    markup = ''' <h1>My Heading</h1> <div> First line <p>My <font color="#ccc">colored</font> <a href="#">text</a> line</p> Last line </div> <ul> <li>First <a href="#">link</a> item</li> <li>Two</li> <li>Three</li> </ul> Trailing line '''
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        headings.extract(node, event, blocks, texts)
        prose.extract(node, event, blocks, texts)
    assert len(texts) == 5
    assert texts[0] == 'My Heading'
    assert texts[2] == 'My colored text line'
    assert texts[4] == 'Trailing line'
    assert blocks[0]['type'] == 'heading'
    assert blocks[0]['text'] == 'My Heading'
    assert blocks[4]['type'] == 'text'
    assert blocks[4]['text'] == 'Last line'
def test_extract_text_and_list_combo():
    """Text and list extractors cooperate: lists excluded from text extraction."""
    blocks = []
    texts = []
    prose = TextExtractor(excluded_tags=['ul', 'ol'])
    lists = ListExtractor()
    markup = ''' <h1>My Heading</h1> <div> First line <p>My <font color="#ccc">colored</font> <a href="#">text</a> line</p> Last line </div> <ul> <li>First <a href="#">link</a> item</li> <li>Two</li> <li>Three</li> </ul> Trailing line '''
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        prose.extract(node, event, blocks, texts)
        lists.extract(node, event, blocks, texts)
    assert len(texts) == 8
    assert texts[4] == 'First link item'
    assert texts[7] == 'Trailing line'
    assert blocks[4]['type'] == 'text'
    assert blocks[4]['text'] == 'Last line'
    assert blocks[6]['type'] == 'list'
    assert blocks[6]['items'][0] == 'First [[link]] item'
def test_extract_text_with_line_breaks_at_head_of_list():
    """Mixed text before the first <li> is merged into the list's heading."""
    blocks = []
    texts = []
    extractor = ListExtractor()
    markup = ''' <ul> <h2>List heading</h2> <p>Second<br> line</p> Third line <li>First <a href="#">link</a> item</li> <li>Two</li> <li>Three</li> </ul> '''
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert len(texts) == 7
    assert len(blocks[1]['items']) == 3
    assert texts[0] == 'List heading'
    assert texts[2] == 'line'
    assert blocks[1]['type'] == 'list'
    assert blocks[1]['heading'] == 'List heading Second line Third line'
    assert blocks[1]['items'][0] == 'First [[link]] item'
def test_extract_embedded_heading():
    """Inline <span> inside an <h1> does not split the extracted heading text."""
    blocks = []
    texts = []
    extractor = HeadingExtractor()
    markup = '<p>First</p><h1>My <span>Head</span>ing</h1><div>Last</div>'
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert texts[0] == 'My Heading'
    assert blocks[0]['type'] == 'heading'
    assert blocks[0]['text'] == 'My Heading'
def test_extract_heading_with_line_break():
    """A <br> inside an <h1> collapses into a single-space join."""
    blocks = []
    texts = []
    extractor = HeadingExtractor()
    markup = '<h1>My<br> Heading</h1>'
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert texts[0] == 'My Heading'
    assert blocks[0]['type'] == 'heading'
    assert blocks[0]['text'] == 'My Heading'
def test_extract_basic_text():
    """A lone <p> produces one text block and one text-list entry."""
    blocks = []
    texts = []
    extractor = TextExtractor()
    markup = '<p>My text</p>'
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert texts[0] == 'My text'
    assert blocks[0]['type'] == 'text'
    assert blocks[0]['text'] == 'My text'
def test_extract_complex_text():
    """Anchors inside a <p> yield a link block plus [[...]]-marked text."""
    blocks = []
    texts = []
    extractor = TextExtractor()
    markup = '<p>My <font color="#ccc">colored</font> <a href="#">text</a> line</p>'
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert texts[0] == 'My colored text line'
    assert blocks[0]['type'] == 'link'
    assert blocks[0]['text'] == 'text'
    assert blocks[0]['url'] == '#'
    assert blocks[1]['type'] == 'text'
    assert blocks[1]['text'] == 'My colored [[text]] line'
def test_extract_anchor_from_heading3():
    """An anchor in an <h1> emits a link block before the heading block."""
    blocks = []
    texts = []
    extractor = HeadingExtractor()
    markup = '<h1>My <a href="link-url">Heading</a> text</h1>'
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert texts[0] == 'My Heading text'
    assert blocks[0]['type'] == 'link'
    assert blocks[0]['text'] == 'Heading'
    assert blocks[0]['url'] == 'link-url'
    assert blocks[1]['type'] == 'heading'
    assert blocks[1]['text'] == 'My [[Heading]] text'
def test_extract_text_and_table_combo():
    """Text and table extractors cooperate: table content excluded from text."""
    blocks = []
    texts = []
    prose = TextExtractor(excluded_tags=['table'])
    tables = TableExtractor()
    markup = ''' <h1>My Heading</h1> <div> First line <p>My <font color="#ccc">colored</font> <a href="#">text</a> line</p> Last line </div> <table> <tr> <th>Column Heading 1</th> <th>Column Heading 2</th> <th>Column Heading 3</th> </tr> <tr> <th>Row 1 Column 1</th> <td>Row 1 Column 2</td> <td>Row 1 Column 3</td> </tr> <tr> <th>Row 2 Column 1</th> <td>Row 2 Column 2</td> <td>Row 2 Column 3</td> </tr> </table> Trailing line '''
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        prose.extract(node, event, blocks, texts)
        tables.extract(node, event, blocks, texts)
    assert len(texts) == 8
    assert texts[2] == 'My colored text line'
    assert texts[5] == r'Row 1 Column 1\tRow 1 Column 2\tRow 1 Column 3'
    assert texts[7] == 'Trailing line'
    assert blocks[4]['type'] == 'text'
    assert blocks[4]['text'] == 'Last line'
    assert blocks[5]['type'] == 'table'
    assert blocks[5]['body'][0][0] == 'Row 1 Column 1'
def test_exclude_heading_in_list():
    """Headings nested in excluded list tags are skipped entirely."""
    blocks = []
    texts = []
    extractor = HeadingExtractor(excluded_tags=['ul', 'ol'])
    markup = ''' <ul> <h2>List heading</h2> <li>One</li> <li>Two</li> </ul> '''
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert not texts
    assert not blocks
def test_extract_list_with_heading_and_no_items():
    """A <ul> containing only a heading yields an empty-item list block."""
    blocks = []
    texts = []
    extractor = ListExtractor()
    markup = ''' <ul> <h2>List heading</h2> </ul> '''
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert len(texts) == 1
    assert len(blocks[0]['items']) == 0
    assert texts[0] == 'List heading'
    assert blocks[0]['type'] == 'list'
    assert blocks[0]['heading'] == 'List heading'
def test_extract_complex_heading_2():
    """Heavily styled heading markup still reduces to clean heading text."""
    blocks = []
    texts = []
    extractor = HeadingExtractor()
    markup = """ <h2 style="margin-top:0cm;margin-right:30.05pt;margin-bottom:0cm;margin-left:22.4pt;margin-bottom:.0001pt"> <span style="font-family:calibri,sans-serif; font-size:11pt"> Please <u>STOP</u> using the AIM process for this issue. </span> </h2> """
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert texts[0] == 'Please STOP using the AIM process for this issue.'
    assert blocks[0]['type'] == 'heading'
    assert blocks[0]['text'] == 'Please STOP using the AIM process for this issue.'
def test_extract_table_with_multiple_header_rows_using_head_tag():
    """All rows inside <thead> count as head rows; <tbody> rows as body."""
    blocks = []
    texts = []
    extractor = TableExtractor()
    # noinspection SpellCheckingInspection
    markup = ''' <table> <thead> <tr> <td>Column Heading 1</td> <td>Column Heading 2</td> <td>Column Heading 3</td> </tr> <tr> <td>Row 1 Column 1</td> <td>Row 1 Column 2</td> <td>Row 1 Column 3</td> </tr> </thead> <tbody> <tr> <td>Row 2 Column 1</td> <td>Row 2 Column 2</td> <td>Row 2 Column 3</td> </tr> </tbody> </table> '''
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert len(texts) == 3
    assert len(blocks[0]['head']) == 2
    assert len(blocks[0]['body']) == 1
    assert texts[0] == r'Column Heading 1\tColumn Heading 2\tColumn Heading 3'
    assert blocks[0]['type'] == 'table'
    assert blocks[0]['head'][1][0] == 'Row 1 Column 1'
    assert blocks[0]['body'][0][0] == 'Row 2 Column 1'
def test_exclude_text_in_list():
    """Text nested anywhere inside an excluded <ul> is not extracted."""
    blocks = []
    texts = []
    extractor = TextExtractor(
        excluded_tags=['ul', 'ol', 'table', 'title', 'h1', 'h2', 'h3', 'h4'])
    markup = ''' <ul> <h2>List heading</h2> <p>Second line</p> Third line <li>One</li> <li>Two</li> </ul> '''
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert not texts
    assert not blocks
def test_extract_basic_ordered_list():
    """An <ol> is extracted as a list block with items in order."""
    blocks = []
    texts = []
    extractor = ListExtractor()
    markup = ''' <ol> <li>One</li> <li>Two</li> <li>Three</li> </ol> '''
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert len(texts) == 3
    assert len(blocks[0]['items']) == 3
    assert texts[1] == 'Two'
    assert blocks[0]['type'] == 'list'
    assert blocks[0]['items'][2] == 'Three'
def test_extract_complex_list_items():
    """Anchors inside <li> items are marked with [[...]] in the list block."""
    blocks = []
    texts = []
    extractor = ListExtractor()
    markup = ''' <ul> <li>First <a href="#">link</a> item</li> <li>Two</li> <li>Three</li> </ul> '''
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert len(texts) == 3
    assert len(blocks[1]['items']) == 3
    assert texts[0] == 'First link item'
    assert blocks[1]['type'] == 'list'
    assert blocks[1]['items'][0] == 'First [[link]] item'
def test_extract_trailing_text_at_eod():
    """Bare text after the last element is captured at end of document."""
    blocks = []
    texts = []
    extractor = TextExtractor()
    markup = ''' <h1>My Heading</h1> <div> First line <p>My <font color="#ccc">colored</font> <a href="#">text</a> line</p> Last line </div> Trailing line '''
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert len(texts) == 4
    assert texts[3] == 'Trailing line'
    assert blocks[4]['type'] == 'text'
    assert blocks[4]['text'] == 'Trailing line'
def test_extract_anchor_from_basic_table():
    """An anchor in a table cell yields a link block and [[...]]-marked cell text."""
    blocks = []
    texts = []
    extractor = TableExtractor()
    markup = ''' <table> <tr> <th>Column Heading 1</th> <th>Column Heading 2</th> <th>Column Heading 3</th> </tr> <tr> <td>Row 1 Column 1</td> <td>Row 1 <a href="link-url">Column 2</a></td> <td>Row 1 Column 3</td> </tr> <tr> <td>Row 2 Column 1</td> <td>Row 2 Column 2</td> <td>Row 2 Column 3</td> </tr> </table> '''
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert len(texts) == 3
    assert len(blocks[1]['body'][0]) == 3
    assert texts[0] == r'Column Heading 1\tColumn Heading 2\tColumn Heading 3'
    assert blocks[0]['type'] == 'link'
    assert blocks[0]['text'] == 'Column 2'
    assert blocks[0]['url'] == 'link-url'
    assert blocks[1]['type'] == 'table'
    assert blocks[1]['head'][0][0] == 'Column Heading 1'
    assert blocks[1]['body'][0][1] == 'Row 1 [[Column 2]]'
def test_extract_table_with_embedded_tags():
    """Nested markup inside cells (strong, lists, divs) flattens to cell text."""
    blocks = []
    texts = []
    extractor = TableExtractor()
    markup = ''' <table> <tr> <th>Column Heading 1</th> <th>Column Heading 2</th> <th>Column Heading 3</th> </tr> <tr> <td><strong>Row 1</strong> <a href="#">Column</a> 1</td> <td><ul><li>Row 1</li> <li>Column 2</li></ul></td> <td>Row 1 Column 3</td> </tr> <tr> <td><div>Row 2</div> Column 1</td> <td>Row 2 Column 2</td> <td>Row 2 Column 3</td> </tr> </table> '''
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert len(texts) == 3
    assert len(blocks[1]['body'][0]) == 3
    assert texts[1] == r'Row 1 Column 1\tRow 1 Column 2\tRow 1 Column 3'
    assert texts[2] == r'Row 2 Column 1\tRow 2 Column 2\tRow 2 Column 3'
    assert blocks[1]['type'] == 'table'
    assert blocks[1]['body'][0][0] == 'Row 1 [[Column]] 1'
    assert blocks[1]['body'][0][1] == 'Row 1 Column 2'
    assert blocks[1]['body'][1][0] == 'Row 2 Column 1'
def test_extract_anchor_from_basic_unordered_list():
    """An anchor inside an <li> is emitted as a link block before the list."""
    blocks = []
    texts = []
    extractor = ListExtractor()
    markup = ''' <ul> <li>One</li> <li>Two <a href="link-url">embedded</a> link</li> <li>Three</li> </ul> '''
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, node in etree.iterparse(source, events=('start', 'end'), html=True):
        extractor.extract(node, event, blocks, texts)
    assert len(texts) == 3
    assert len(blocks[1]['items']) == 3
    assert texts[1] == 'Two embedded link'
    assert blocks[0]['type'] == 'link'
    assert blocks[0]['text'] == 'embedded'
    assert blocks[0]['url'] == 'link-url'
    assert blocks[1]['type'] == 'list'
    assert blocks[1]['items'][1] == 'Two [[embedded]] link'
def process_doc(self, text: str, a: Dict[str, Any]) -> None:
    """Parse ``text`` as HTML, extract structured content and plain text,
    and store the result under ``a['data']``.

    :param text: raw document text (treated as HTML; lxml wraps plain text
        in p/body/html tags automatically)
    :param a: accumulator dict, mutated in place; receives a 'data' key
    """
    # treat all text as html
    # lxml will automatically wrap plain text in a para, body and html tags
    structured_content = []
    text_list = []
    # One extractor per content kind; exclusions prevent double extraction
    # of list/table/heading content by the generic text extractor.
    extractors = [
        ListExtractor(excluded_tags=['table']),
        TableExtractor(),
        TextExtractor(excluded_tags=[
            'ul', 'ol', 'table', 'title', 'h1', 'h2', 'h3', 'h4'
        ]),
        HeadingExtractor(excluded_tags=['ul', 'ol', 'table'])
    ]
    stream: IO[AnyStr] = BytesIO(fix_content(text).encode('utf-8'))
    for ev, elem in self.element_iterator(stream, html=True):
        process_html_element(elem, ev, extractors, structured_content,
                             text_list, self.__nlp)
    # re-extract content in single column tables used for layout purposes only
    html = None  # memoized parsed document; built lazily on first layout table
    # k[i] = number of text_list entries produced by structured_content[i];
    # used below to map a structured index onto a text_list slice position
    k = []
    for i, c in enumerate(structured_content):
        typ = c['type']
        if typ in ['text', 'heading']:
            k.append(1)
        elif typ == 'list':
            k.append(len(c.get('items', [])))
        elif typ == 'table':
            k.append(len(c.get('head', [])) + len(c.get('body', [])))
        # NOTE(review): 'fields' appears to mark single-column layout tables;
        # only items carrying exactly one field are re-extracted — confirm
        # against the extractor that sets this key.
        if len(c.get('fields', [])) == 1:
            if not html:
                # reset stream to reiterate
                stream.seek(0)
                # read stream into str and parse as html
                html = lxml.html.fromstring(stream.read())
            # find single column layout table (c['index'] is 1-based in XPath)
            contents = html.xpath(
                ('/descendant::table[{0}]/tbody/tr/td/*|' +
                 '/descendant::table[{0}]/tr/td/*').format(c['index']))
            root = etree.Element('div')
            root.extend(contents)
            sc = []
            tl = []
            for evt, ele in etree.iterwalk(root, events=('start', 'end')):
                process_html_element(ele, evt, extractors, sc, tl,
                                     self.__nlp)
            # j accounts for preceding reference entries attached to this item
            j = len(c.get('references', []))
            # splice the re-extracted content in place of the layout table
            structured_content = flatten([
                structured_content[:(i - j)], sc,
                structured_content[(i + 1):]
            ])
            text_list = flatten([
                text_list[:sum(k[:(i - j)])], tl,
                text_list[sum(k[:(i + 1)]):]
            ])
    data = {}
    # single-entry text lists are unwrapped to a bare string
    if len(text_list) == 1:
        data['text'] = text_list[0]
    else:
        data['text'] = text_list
    if structured_content:
        data['structured_content'] = structured_content
    a['data'] = data
def extract_text(c: Dict[str, Any],
                 # logger: Logger,
                 a: Dict[str, Any],
                 excluded_tags: List[str],
                 output_handler: Callable,
                 f: TextIO) -> str:
    """Stream-parse an XML export file, collect document metadata and HTML
    content into ``a``, write the result via ``output_handler`` and return
    the output path.

    :param c: job configuration; reads c['job']['write_root_dir']
    :param a: accumulator dict, mutated in place (metadata, data,
        files_processed, files_output)
    :param excluded_tags: XML tags to skip entirely during iteration
    :param output_handler: callable(path, content) that persists the result
    :param f: open XML file object
    :return: path of the written output file

    NOTE(review): this function references ``self.name`` near the end but has
    no ``self`` parameter — as written that line raises NameError. It also
    assumes ``a['files_processed']`` and ``a['files_output']`` already exist;
    neither is created by the ``a.update`` below. Confirm against callers.
    """
    # logger.debug('process file: {}'.format(f.name))
    a.update({
        'data': {},
        'is_data': False,
        'metadata': {'doc_type': None, 'record_id': None}
    })
    it = etree.iterparse(f, events=('start', 'end'))
    # lazily filter out excluded elements while streaming
    stream = ((event, el) for event, el in it if el.tag not in excluded_tags)
    for event, el in stream:
        if el.tag == 'CONTENT' and event == 'end':
            a['metadata']['record_id'] = el.get('RECORDID')
        # NOTE(review): 'MASTERIDENTIFER' is presumably the literal (misspelt)
        # tag name in the source feed — do not "fix" without checking the data.
        elif el.tag == 'MASTERIDENTIFER' and event == 'end':
            a['metadata']['title'] = el.text
        elif el.tag == 'TYPE' and event == 'end':
            a['metadata']['doc_type'] = el.text
        elif el.tag == 'DOCUMENTID' and event == 'end':
            a['metadata']['doc_id'] = el.text
        elif el.tag == 'VERSION' and event == 'end':
            a['metadata']['version'] = el.text
        elif el.tag == 'AUTHOR' and event == 'end':
            a['metadata']['author'] = el.text
        elif el.tag == 'ENDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['end_timestamp_millis'] = millis
            a['metadata']['end_time'] = get_iso_datetime_from_millis(millis)
        elif el.tag == 'STARTTIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['start_timestamp_millis'] = millis
            a['metadata']['start_time'] = get_iso_datetime_from_millis(millis)
        elif el.tag == 'CREATETIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['create_timestamp_millis'] = millis
            a['metadata']['create_time'] = get_iso_datetime_from_millis(millis)
        elif el.tag == 'LASTMODIFIEDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['last_modified_timestamp_millis'] = millis
            a['metadata']['last_modified_time'] = get_iso_datetime_from_millis(millis)
        elif el.tag == 'RESOURCEPATH' and event == 'end':
            a['metadata']['doc_location_path'] = el.text
        elif el.tag == 'PUBLISHEDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['published_timestamp_millis'] = millis
            a['metadata']['published_time'] = get_iso_datetime_from_millis(millis)
        # the element named after doc_type brackets the data payload:
        # its 'start' turns data mode on, its 'end' turns it off
        elif el.tag == a['metadata']['doc_type']:
            a['is_data'] = (event == 'start')
        elif a['is_data'] and event == 'end' and el.text:
            # treat all text as html
            # lxml will automatically wrap plain text in a para, body and html tags
            structured_content = []
            text_list = []
            list_extractor = ListExtractor(excluded_tags=['table'])
            table_extractor = TableExtractor()
            text_extractor = TextExtractor(excluded_tags=['ul', 'ol', 'table', 'title', 'h1', 'h2', 'h3', 'h4'])
            heading_extractor = HeadingExtractor(excluded_tags=['ul', 'ol', 'table'])
            stream = BytesIO(fix_content(el.text).encode('utf-8'))
            for ev, elem in etree.iterparse(stream, events=('start', 'end'), html=True):
                heading_extractor.extract(elem, ev, structured_content, text_list)
                text_extractor.extract(elem, ev, structured_content, text_list)
                list_extractor.extract(elem, ev, structured_content, text_list)
                table_extractor.extract(elem, ev, structured_content, text_list)
            data = {}
            # single-entry text lists are unwrapped to a bare string
            if len(text_list) == 1:
                data['text'] = text_list[0]
            else:
                data['text'] = text_list
            if structured_content:
                data['structured_content'] = structured_content
            a['data'][el.tag.lower()] = data
    now = datetime.utcnow().isoformat()
    a['files_processed'].append({
        'path': f.name,
        'time': now
    })
    write_root_dir = c['job']['write_root_dir']
    # NOTE(review): `self` is undefined in this function scope — see docstring.
    output_filename = '{}_{}.json'.format(convert_name_to_underscore(self.name), a['metadata']['record_id'])
    output_path = os.path.join(write_root_dir, output_filename)
    a['files_output'].append({
        'filename': output_filename,
        'path': output_path,
        'status': 'processed',
        'time': now
    })
    content = {'metadata': a['metadata'], 'data': a['data']}
    output_handler(output_path, content)
    return output_path
def process_xml_element(self, el: etree.ElementBase, event: str,
                        a: Dict[str, Any]) -> None:
    """Handle one XML parse event, accumulating metadata and extracted
    content into ``a`` (mutated in place).

    Metadata tags fill ``a['metadata']``; the element whose tag matches the
    detected doc_type brackets the data payload; payload element text is
    parsed as JSON if possible, otherwise as HTML via the extractors.

    :param el: current XML element
    :param event: parse event, 'start' or 'end'
    :param a: accumulator dict (metadata, is_data flag, data)
    """
    if el.tag == 'CONTENT' and event == 'end':
        a['metadata']['record_id'] = el.get('RECORDID')
    # NOTE(review): 'MASTERIDENTIFER' is presumably the literal (misspelt)
    # tag name in the source feed — confirm before "fixing".
    elif el.tag == 'MASTERIDENTIFER' and event == 'end':
        a['metadata']['title'] = clean_text(el.text)
    elif el.tag == 'TYPE' and event == 'end':
        a['metadata']['doc_type'] = clean_text(el.text)
    elif el.tag == 'DOCUMENTID' and event == 'end':
        a['metadata']['doc_id'] = clean_text(el.text)
    elif el.tag == 'VERSION' and event == 'end':
        a['metadata']['version'] = clean_text(el.text)
    elif el.tag == 'AUTHOR' and event == 'end':
        a['metadata']['author'] = clean_text(el.text)
    elif el.tag == 'ENDTIMESTAMP_MILLIS' and event == 'end':
        millis = int(clean_text(el.text))
        a['metadata']['end_timestamp_millis'] = millis
        a['metadata']['end_time'] = get_iso_datetime_from_millis(millis)
    elif el.tag == 'STARTTIMESTAMP_MILLIS' and event == 'end':
        millis = int(clean_text(el.text))
        a['metadata']['start_timestamp_millis'] = millis
        a['metadata']['start_time'] = get_iso_datetime_from_millis(millis)
    elif el.tag == 'CREATETIMESTAMP_MILLIS' and event == 'end':
        millis = int(clean_text(el.text))
        a['metadata']['create_timestamp_millis'] = millis
        a['metadata']['create_time'] = get_iso_datetime_from_millis(millis)
    elif el.tag == 'LASTMODIFIEDTIMESTAMP_MILLIS' and event == 'end':
        millis = int(clean_text(el.text))
        a['metadata']['last_modified_timestamp_millis'] = millis
        a['metadata']['last_modified_time'] = get_iso_datetime_from_millis(
            millis)
    elif el.tag == 'RESOURCEPATH' and event == 'end':
        a['metadata']['doc_location_path'] = clean_text(el.text)
    elif el.tag == 'PUBLISHEDTIMESTAMP_MILLIS' and event == 'end':
        millis = int(clean_text(el.text))
        a['metadata']['published_timestamp_millis'] = millis
        a['metadata']['published_time'] = get_iso_datetime_from_millis(
            millis)
    # element named after doc_type brackets the data payload:
    # 'start' switches data mode on, 'end' switches it off
    elif el.tag == a['metadata']['doc_type']:
        a['is_data'] = (event == 'start')
    elif a['is_data'] and event == 'end' and el.text:
        # treat all text as html
        # lxml will automatically wrap plain text in a para, body and html tags
        structured_content = []
        text_list = []
        try:
            # payload may be raw JSON; keep it structured if so
            maybe_json = json.loads(el.text)
            structured_content.append({'type': 'json', 'json': maybe_json})
        except (JSONDecodeError, ValueError):
            # not JSON — fall back to HTML extraction
            extractors = [
                ListExtractor(excluded_tags=['table']),
                TableExtractor(),
                TextExtractor(excluded_tags=[
                    'ul', 'ol', 'table', 'title', 'h1', 'h2', 'h3', 'h4'
                ]),
                HeadingExtractor(excluded_tags=['ul', 'ol', 'table'])
            ]
            stream: IO[AnyStr] = BytesIO(
                fix_content(el.text).encode('utf-8'))
            for ev, elem in self.element_iterator(stream, html=True):
                process_html_element(elem, ev, extractors,
                                     structured_content, text_list)
            # re-extract content in single column tables used for layout purposes only
            html = None  # memoized parsed doc; built lazily on first hit
            # k[i] = number of text_list entries contributed by item i;
            # used to locate the text_list slice for a structured index
            k = []
            for i, c in enumerate(structured_content):
                typ = c['type']
                if typ in ['text', 'heading']:
                    k.append(1)
                elif typ == 'list':
                    k.append(len(c.get('items', [])))
                elif typ == 'table':
                    k.append(
                        len(c.get('head', [])) + len(c.get('body', [])))
                # NOTE(review): 'fields' appears to mark single-column layout
                # tables — confirm against the extractor that sets this key.
                if len(c.get('fields', [])) == 1:
                    if not html:
                        # reset stream to reiterate
                        stream.seek(0)
                        # read stream into str and parse as html
                        html = lxml.html.fromstring(stream.read())
                    # find single column layout table
                    contents = html.xpath(
                        ('/descendant::table[{0}]/tbody/tr/td/*|' +
                         '/descendant::table[{0}]/tr/td/*').format(
                            c['index']))
                    root = etree.Element('div')
                    root.extend(contents)
                    sc = []
                    tl = []
                    for evt, ele in etree.iterwalk(root,
                                                   events=('start', 'end')):
                        process_html_element(ele, evt, extractors, sc, tl)
                    # j accounts for reference entries preceding this item
                    j = len(c.get('references', []))
                    # splice re-extracted content over the layout table
                    structured_content = flatten([
                        structured_content[:(i - j)], sc,
                        structured_content[(i + 1):]
                    ])
                    text_list = flatten([
                        text_list[:sum(k[:(i - j)])], tl,
                        text_list[sum(k[:(i + 1)]):]
                    ])
        data = {}
        # single-entry text lists are unwrapped to a bare string
        if len(text_list) == 1:
            data['text'] = text_list[0]
        else:
            data['text'] = text_list
        if structured_content:
            data['structured_content'] = structured_content
        a['data'][el.tag.lower()] = data
def process_xml_element(
        el: etree.ElementBase,
        event: str,
        accumulator: Dict[str, Any],
        excluded_html_tags: List[str],
) -> Dict[str, Any]:
    """
    Stateful, so cannot be parallelized.

    :param el: XML element
    :param event: event type [start, end]
    :param accumulator: accumulates state
    :param excluded_html_tags: XML tags to exclude
    :return: accumulated content as dict
    """
    # work on a copy so the caller's accumulator is never mutated;
    # the updated copy is returned (functional accumulation style)
    a = deepcopy(accumulator)
    if el.tag == 'CONTENT' and event == 'end':
        a['metadata']['record_id'] = el.get('RECORDID')
    # NOTE(review): 'MASTERIDENTIFER' is presumably the literal (misspelt)
    # tag name in the source feed — confirm before "fixing".
    elif el.tag == 'MASTERIDENTIFER' and event == 'end':
        a['metadata']['title'] = clean_text(el.text)
    elif el.tag == 'TYPE' and event == 'end':
        a['metadata']['doc_type'] = clean_text(el.text)
    elif el.tag == 'DOCUMENTID' and event == 'end':
        a['metadata']['doc_id'] = clean_text(el.text)
    elif el.tag == 'VERSION' and event == 'end':
        a['metadata']['version'] = clean_text(el.text)
    elif el.tag == 'AUTHOR' and event == 'end':
        a['metadata']['author'] = clean_text(el.text)
    elif el.tag == 'ENDTIMESTAMP_MILLIS' and event == 'end':
        millis = int(clean_text(el.text))
        a['metadata']['end_timestamp_millis'] = millis
        a['metadata']['end_time'] = get_iso_datetime_from_millis(millis)
    elif el.tag == 'STARTTIMESTAMP_MILLIS' and event == 'end':
        millis = int(clean_text(el.text))
        a['metadata']['start_timestamp_millis'] = millis
        a['metadata']['start_time'] = get_iso_datetime_from_millis(millis)
    elif el.tag == 'CREATETIMESTAMP_MILLIS' and event == 'end':
        millis = int(clean_text(el.text))
        a['metadata']['create_timestamp_millis'] = millis
        a['metadata']['create_time'] = get_iso_datetime_from_millis(millis)
    elif el.tag == 'LASTMODIFIEDTIMESTAMP_MILLIS' and event == 'end':
        millis = int(clean_text(el.text))
        a['metadata']['last_modified_timestamp_millis'] = millis
        a['metadata']['last_modified_time'] = get_iso_datetime_from_millis(
            millis)
    elif el.tag == 'RESOURCEPATH' and event == 'end':
        a['metadata']['doc_location_path'] = clean_text(el.text)
    elif el.tag == 'PUBLISHEDTIMESTAMP_MILLIS' and event == 'end':
        millis = int(clean_text(el.text))
        a['metadata']['published_timestamp_millis'] = millis
        a['metadata']['published_time'] = get_iso_datetime_from_millis(millis)
    # element named after doc_type brackets the data payload:
    # 'start' switches data mode on, 'end' switches it off
    elif el.tag == a['metadata']['doc_type']:
        a['is_data'] = (event == 'start')
    elif a['is_data'] and event == 'end' and el.text:
        # treat all text as html
        # lxml will automatically wrap plain text in a para, body and html tags
        structured_content = []
        text_list = []
        extractors = [
            ListExtractor(excluded_tags=['table']),
            TableExtractor(),
            TextExtractor(excluded_tags=[
                'ul', 'ol', 'table', 'title', 'h1', 'h2', 'h3', 'h4'
            ]),
            HeadingExtractor(excluded_tags=['ul', 'ol', 'table'])
        ]
        stream = BytesIO(fix_content(el.text).encode('utf-8'))
        for ev, elem in element_iterator(stream, excluded_html_tags,
                                         html=True):
            structured, text = process_html_element(elem, ev, extractors)
            structured_content.extend(structured)
            text_list.extend(text)
        data = {}
        # single-entry text lists are unwrapped to a bare string
        if len(text_list) == 1:
            data['text'] = text_list[0]
        else:
            data['text'] = text_list
        if structured_content:
            data['structured_content'] = structured_content
        a['data'][el.tag.lower()] = data
    return a