def test_extract_heading_and_text_combo():
    """Headings and paragraph text are extracted in document order while
    list items (excluded tags) are skipped by the text extractor."""
    sc = []
    texts = []
    extractors = [
        HeadingExtractor(),
        TextExtractor(
            excluded_tags=['ul', 'ol', 'title', 'h1', 'h2', 'h3', 'h4']),
    ]
    content = '''
<h1>My Heading</h1>
<div>
First line
<p>My <font color="#ccc">colored</font> <a href="#">text</a> line</p>
Last line
</div>
<ul>
<li>First <a href="#">link</a> item</li>
<li>Two</li>
<li>Three</li>
</ul>
Trailing line
'''
    source = BytesIO(fix_content(content).encode('utf-8'))
    for event, element in etree.iterparse(source,
                                          events=('start', 'end'),
                                          html=True):
        for extractor in extractors:
            extractor.extract(element, event, sc, texts)

    assert len(texts) == 5
    assert texts[0] == 'My Heading'
    assert texts[2] == 'My colored text line'
    assert texts[4] == 'Trailing line'
    assert sc[0]['type'] == 'heading'
    assert sc[0]['text'] == 'My Heading'
    assert sc[4]['type'] == 'text'
    assert sc[4]['text'] == 'Last line'
def test_extract_heading_with_line_break():
    """A heading containing a <br> is extracted as one whitespace-normalized
    string."""
    sc = []
    texts = []
    extractor = HeadingExtractor()
    markup = '<h1>My<br> Heading</h1>'
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, element in etree.iterparse(source,
                                          events=('start', 'end'),
                                          html=True):
        extractor.extract(element, event, sc, texts)

    assert texts[0] == 'My Heading'
    assert sc[0]['type'] == 'heading'
    assert sc[0]['text'] == 'My Heading'
def test_extract_embedded_heading():
    """Inline children (e.g. <span>) inside a heading are merged into the
    heading text; surrounding non-heading elements are ignored."""
    sc = []
    texts = []
    extractor = HeadingExtractor()
    markup = '<p>First</p><h1>My <span>Head</span>ing</h1><div>Last</div>'
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, element in etree.iterparse(source,
                                          events=('start', 'end'),
                                          html=True):
        extractor.extract(element, event, sc, texts)

    assert texts[0] == 'My Heading'
    assert sc[0]['type'] == 'heading'
    assert sc[0]['text'] == 'My Heading'
def test_extract_anchor_from_heading3():
    """An anchor inside a heading yields a separate 'link' entry, and the
    heading text marks the link span with [[...]]."""
    sc = []
    texts = []
    extractor = HeadingExtractor()
    markup = '<h1>My <a href="link-url">Heading</a> text</h1>'
    source = BytesIO(fix_content(markup).encode('utf-8'))
    for event, element in etree.iterparse(source,
                                          events=('start', 'end'),
                                          html=True):
        extractor.extract(element, event, sc, texts)

    assert texts[0] == 'My Heading text'
    assert sc[0]['type'] == 'link'
    assert sc[0]['text'] == 'Heading'
    assert sc[0]['url'] == 'link-url'
    assert sc[1]['type'] == 'heading'
    assert sc[1]['text'] == 'My [[Heading]] text'
def test_exclude_heading_in_list():
    """A heading nested inside an excluded tag (ul/ol) produces no output."""
    sc = []
    texts = []
    extractor = HeadingExtractor(excluded_tags=['ul', 'ol'])
    content = '''
<ul>
<h2>List heading</h2>
<li>One</li>
<li>Two</li>
</ul>
'''
    source = BytesIO(fix_content(content).encode('utf-8'))
    for event, element in etree.iterparse(source,
                                          events=('start', 'end'),
                                          html=True):
        extractor.extract(element, event, sc, texts)

    assert not texts
    assert not sc
def test_extract_complex_heading_2():
    """Style attributes and nested inline formatting (<span>, <u>) are
    stripped; only normalized heading text remains."""
    sc = []
    texts = []
    extractor = HeadingExtractor()
    content = """
<h2 style="margin-top:0cm;margin-right:30.05pt;margin-bottom:0cm;margin-left:22.4pt;margin-bottom:.0001pt">
<span style="font-family:calibri,sans-serif; font-size:11pt">
Please <u>STOP</u> using the AIM process for this issue.
</span>
</h2>
"""
    source = BytesIO(fix_content(content).encode('utf-8'))
    for event, element in etree.iterparse(source,
                                          events=('start', 'end'),
                                          html=True):
        extractor.extract(element, event, sc, texts)

    assert texts[0] == 'Please STOP using the AIM process for this issue.'
    assert sc[0]['type'] == 'heading'
    assert sc[0]['text'] == 'Please STOP using the AIM process for this issue.'
def process_doc(self, text: str, a: Dict[str, Any]) -> None:
    """Extract structured content and plain text from a document body.

    Runs the HTML extractors over ``text``, then re-extracts content found
    inside single-column tables (treated as layout-only tables), and stores
    the result under ``a['data']``.

    :param text: raw document body (treated as HTML)
    :param a: accumulator dict, mutated in place (``a['data']`` is set)
    """
    # treat all text as html
    # lxml will automatically wrap plain text in a para, body and html tags
    structured_content = []
    text_list = []
    extractors = [
        ListExtractor(excluded_tags=['table']),
        TableExtractor(),
        TextExtractor(excluded_tags=[
            'ul', 'ol', 'table', 'title', 'h1', 'h2', 'h3', 'h4'
        ]),
        HeadingExtractor(excluded_tags=['ul', 'ol', 'table'])
    ]
    stream: IO[AnyStr] = BytesIO(fix_content(text).encode('utf-8'))
    for ev, elem in self.element_iterator(stream, html=True):
        process_html_element(elem, ev, extractors, structured_content,
                             text_list, self.__nlp)
    # re-extract content in single column tables used for layout purposes only
    html = None  # memoize the parsed document across layout tables
    # k[i] = number of text_list entries produced by structured_content[i];
    # used below to map a structured index to a text_list offset
    k = []
    for i, c in enumerate(structured_content):
        # NOTE(review): structured_content/text_list are rebound inside this
        # loop, but enumerate() keeps iterating the ORIGINAL list, while the
        # slices below are taken from the NEW one — confirm the index
        # arithmetic (i, j, k) is still valid after the first replacement.
        typ = c['type']
        if typ in ['text', 'heading']:
            k.append(1)
        elif typ == 'list':
            k.append(len(c.get('items', [])))
        elif typ == 'table':
            k.append(len(c.get('head', [])) + len(c.get('body', [])))
            # a single-field table is assumed to be a layout-only table
            # (presumably 'fields' is the column list — TODO confirm)
            if len(c.get('fields', [])) == 1:
                if not html:
                    # reset stream to reiterate
                    stream.seek(0)
                    # read stream into str and parse as html
                    html = lxml.html.fromstring(stream.read())
                # find single column layout table (1-based XPath index
                # presumably matches c['index'] — verify against extractor)
                contents = html.xpath(
                    ('/descendant::table[{0}]/tbody/tr/td/*|' +
                     '/descendant::table[{0}]/tr/td/*').format(c['index']))
                root = etree.Element('div')
                root.extend(contents)
                sc = []
                tl = []
                # re-run the extractors over the table cell contents only
                for evt, ele in etree.iterwalk(root,
                                               events=('start', 'end')):
                    process_html_element(ele, evt, extractors, sc, tl,
                                         self.__nlp)
                j = len(c.get('references', []))
                # splice the re-extracted content in place of the table
                # entry (and its j preceding reference entries)
                structured_content = flatten([
                    structured_content[:(i - j)], sc,
                    structured_content[(i + 1):]
                ])
                text_list = flatten([
                    text_list[:sum(k[:(i - j)])], tl,
                    text_list[sum(k[:(i + 1)]):]
                ])
    data = {}
    # a single text entry is stored as a scalar, not a one-element list
    if len(text_list) == 1:
        data['text'] = text_list[0]
    else:
        data['text'] = text_list
    if structured_content:
        data['structured_content'] = structured_content
    a['data'] = data
def extract_text(c: Dict[str, Any],
                 # logger: Logger,
                 a: Dict[str, Any],
                 excluded_tags: List[str],
                 output_handler: Callable,
                 f: TextIO) -> str:
    """Parse a CONTENT XML export: collect document metadata, extract the
    HTML payload of the data element, and write the result via
    ``output_handler``.

    :param c: job config; ``c['job']['write_root_dir']`` is the output dir
    :param a: accumulator dict, mutated in place (metadata, data, file logs)
    :param excluded_tags: XML tags to skip entirely while parsing
    :param output_handler: callable(path, content) that persists the result
    :param f: open XML file to parse
    :return: path of the written output file
    """
    # logger.debug('process file: {}'.format(f.name))
    a.update({
        'data': {},
        'is_data': False,
        'metadata': {'doc_type': None, 'record_id': None}
    })
    it = etree.iterparse(f, events=('start', 'end'))
    # lazily filter out excluded tags before dispatching on tag name
    stream = ((event, el) for event, el in it if el.tag not in excluded_tags)
    for event, el in stream:
        if el.tag == 'CONTENT' and event == 'end':
            a['metadata']['record_id'] = el.get('RECORDID')
        elif el.tag == 'MASTERIDENTIFER' and event == 'end':
            # NOTE(review): 'MASTERIDENTIFER' looks misspelled — presumably
            # it matches the actual (misspelled) tag in the source XML;
            # confirm against a sample file before "fixing".
            a['metadata']['title'] = el.text
        elif el.tag == 'TYPE' and event == 'end':
            a['metadata']['doc_type'] = el.text
        elif el.tag == 'DOCUMENTID' and event == 'end':
            a['metadata']['doc_id'] = el.text
        elif el.tag == 'VERSION' and event == 'end':
            a['metadata']['version'] = el.text
        elif el.tag == 'AUTHOR' and event == 'end':
            a['metadata']['author'] = el.text
        elif el.tag == 'ENDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['end_timestamp_millis'] = millis
            a['metadata']['end_time'] = get_iso_datetime_from_millis(millis)
        elif el.tag == 'STARTTIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['start_timestamp_millis'] = millis
            a['metadata']['start_time'] = get_iso_datetime_from_millis(millis)
        elif el.tag == 'CREATETIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['create_timestamp_millis'] = millis
            a['metadata']['create_time'] = get_iso_datetime_from_millis(millis)
        elif el.tag == 'LASTMODIFIEDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['last_modified_timestamp_millis'] = millis
            a['metadata']['last_modified_time'] = get_iso_datetime_from_millis(millis)
        elif el.tag == 'RESOURCEPATH' and event == 'end':
            a['metadata']['doc_location_path'] = el.text
        elif el.tag == 'PUBLISHEDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(el.text)
            a['metadata']['published_timestamp_millis'] = millis
            a['metadata']['published_time'] = get_iso_datetime_from_millis(millis)
        elif el.tag == a['metadata']['doc_type']:
            # toggle data mode while inside the element named by doc_type
            a['is_data'] = (event == 'start')
        elif a['is_data'] and event == 'end' and el.text:
            # treat all text as html
            # lxml will automatically wrap plain text in a para, body and html tags
            structured_content = []
            text_list = []
            list_extractor = ListExtractor(excluded_tags=['table'])
            table_extractor = TableExtractor()
            text_extractor = TextExtractor(excluded_tags=['ul', 'ol', 'table', 'title', 'h1', 'h2', 'h3', 'h4'])
            heading_extractor = HeadingExtractor(excluded_tags=['ul', 'ol', 'table'])
            stream = BytesIO(fix_content(el.text).encode('utf-8'))
            for ev, elem in etree.iterparse(stream, events=('start', 'end'), html=True):
                heading_extractor.extract(elem, ev, structured_content, text_list)
                text_extractor.extract(elem, ev, structured_content, text_list)
                list_extractor.extract(elem, ev, structured_content, text_list)
                table_extractor.extract(elem, ev, structured_content, text_list)
            data = {}
            # a single text entry is stored as a scalar, not a list
            if len(text_list) == 1:
                data['text'] = text_list[0]
            else:
                data['text'] = text_list
            if structured_content:
                data['structured_content'] = structured_content
            a['data'][el.tag.lower()] = data
    now = datetime.utcnow().isoformat()
    a['files_processed'].append({
        'path': f.name,
        'time': now
    })
    write_root_dir = c['job']['write_root_dir']
    # NOTE(review): `self` is undefined in this free function — this line
    # raises NameError when reached. Likely copied from a method version;
    # the processor name should be passed in as a parameter.
    output_filename = '{}_{}.json'.format(convert_name_to_underscore(self.name),
                                          a['metadata']['record_id'])
    output_path = os.path.join(write_root_dir, output_filename)
    a['files_output'].append({
        'filename': output_filename,
        'path': output_path,
        'status': 'processed',
        'time': now
    })
    content = {'metadata': a['metadata'], 'data': a['data']}
    output_handler(output_path, content)
    return output_path
def process_xml_element(self, el: etree.ElementBase, event: str,
                        a: Dict[str, Any]) -> None:
    """Handle one XML parse event: record document metadata, or — once
    inside the data element — extract structured content from its HTML
    (or JSON) payload into ``a['data']``.

    :param el: current XML element
    :param event: event type ('start' or 'end')
    :param a: accumulator dict, mutated in place
    """
    if el.tag == 'CONTENT' and event == 'end':
        a['metadata']['record_id'] = el.get('RECORDID')
    elif el.tag == 'MASTERIDENTIFER' and event == 'end':
        # NOTE(review): spelling presumably matches the source XML tag
        a['metadata']['title'] = clean_text(el.text)
    elif el.tag == 'TYPE' and event == 'end':
        a['metadata']['doc_type'] = clean_text(el.text)
    elif el.tag == 'DOCUMENTID' and event == 'end':
        a['metadata']['doc_id'] = clean_text(el.text)
    elif el.tag == 'VERSION' and event == 'end':
        a['metadata']['version'] = clean_text(el.text)
    elif el.tag == 'AUTHOR' and event == 'end':
        a['metadata']['author'] = clean_text(el.text)
    elif el.tag == 'ENDTIMESTAMP_MILLIS' and event == 'end':
        millis = int(clean_text(el.text))
        a['metadata']['end_timestamp_millis'] = millis
        a['metadata']['end_time'] = get_iso_datetime_from_millis(millis)
    elif el.tag == 'STARTTIMESTAMP_MILLIS' and event == 'end':
        millis = int(clean_text(el.text))
        a['metadata']['start_timestamp_millis'] = millis
        a['metadata']['start_time'] = get_iso_datetime_from_millis(millis)
    elif el.tag == 'CREATETIMESTAMP_MILLIS' and event == 'end':
        millis = int(clean_text(el.text))
        a['metadata']['create_timestamp_millis'] = millis
        a['metadata']['create_time'] = get_iso_datetime_from_millis(millis)
    elif el.tag == 'LASTMODIFIEDTIMESTAMP_MILLIS' and event == 'end':
        millis = int(clean_text(el.text))
        a['metadata']['last_modified_timestamp_millis'] = millis
        a['metadata']['last_modified_time'] = get_iso_datetime_from_millis(
            millis)
    elif el.tag == 'RESOURCEPATH' and event == 'end':
        a['metadata']['doc_location_path'] = clean_text(el.text)
    elif el.tag == 'PUBLISHEDTIMESTAMP_MILLIS' and event == 'end':
        millis = int(clean_text(el.text))
        a['metadata']['published_timestamp_millis'] = millis
        a['metadata']['published_time'] = get_iso_datetime_from_millis(
            millis)
    elif el.tag == a['metadata']['doc_type']:
        # toggle data mode while inside the element named by doc_type
        a['is_data'] = (event == 'start')
    elif a['is_data'] and event == 'end' and el.text:
        # treat all text as html
        # lxml will automatically wrap plain text in a para, body and html tags
        structured_content = []
        text_list = []
        try:
            # a payload that parses as JSON is stored verbatim
            maybe_json = json.loads(el.text)
            structured_content.append({'type': 'json', 'json': maybe_json})
        except (JSONDecodeError, ValueError):
            extractors = [
                ListExtractor(excluded_tags=['table']),
                TableExtractor(),
                TextExtractor(excluded_tags=[
                    'ul', 'ol', 'table', 'title', 'h1', 'h2', 'h3', 'h4'
                ]),
                HeadingExtractor(excluded_tags=['ul', 'ol', 'table'])
            ]
            stream: IO[AnyStr] = BytesIO(
                fix_content(el.text).encode('utf-8'))
            for ev, elem in self.element_iterator(stream, html=True):
                process_html_element(elem, ev, extractors,
                                     structured_content, text_list)
            # re-extract content in single column tables used for layout purposes only
            html = None  # memoize the parsed document
            # k[i] = number of text_list entries produced by
            # structured_content[i]; maps structured index -> text offset
            k = []
            for i, c in enumerate(structured_content):
                # NOTE(review): structured_content/text_list are rebound
                # inside this loop while enumerate() iterates the original
                # list — confirm the index arithmetic after a replacement.
                typ = c['type']
                if typ in ['text', 'heading']:
                    k.append(1)
                elif typ == 'list':
                    k.append(len(c.get('items', [])))
                elif typ == 'table':
                    k.append(
                        len(c.get('head', [])) + len(c.get('body', [])))
                    # a single-field table is assumed to be layout-only
                    if len(c.get('fields', [])) == 1:
                        if not html:
                            # reset stream to reiterate
                            stream.seek(0)
                            # read stream into str and parse as html
                            html = lxml.html.fromstring(stream.read())
                        # find single column layout table
                        contents = html.xpath(
                            ('/descendant::table[{0}]/tbody/tr/td/*|' +
                             '/descendant::table[{0}]/tr/td/*').format(
                                 c['index']))
                        root = etree.Element('div')
                        root.extend(contents)
                        sc = []
                        tl = []
                        for evt, ele in etree.iterwalk(
                                root, events=('start', 'end')):
                            process_html_element(ele, evt, extractors,
                                                 sc, tl)
                        j = len(c.get('references', []))
                        # splice re-extracted content over the table entry
                        structured_content = flatten([
                            structured_content[:(i - j)], sc,
                            structured_content[(i + 1):]
                        ])
                        text_list = flatten([
                            text_list[:sum(k[:(i - j)])], tl,
                            text_list[sum(k[:(i + 1)]):]
                        ])
        data = {}
        # a single text entry is stored as a scalar, not a list
        if len(text_list) == 1:
            data['text'] = text_list[0]
        else:
            data['text'] = text_list
        if structured_content:
            data['structured_content'] = structured_content
        a['data'][el.tag.lower()] = data
def process_xml_element(
        el: etree.ElementBase,
        event: str,
        accumulator: Dict[str, Any],
        excluded_html_tags: List[str],
) -> Dict[str, Any]:
    """
    Stateful, so cannot be parallelized.

    Records document metadata from the XML element, toggles data mode on
    the doc-type element, and extracts HTML content from the data payload.

    :param el: XML element
    :param event: event type [start, end]
    :param accumulator: accumulates state
    :param excluded_html_tags: XML tags to exclude
    :return: accumulated content as dict
    """
    a = deepcopy(accumulator)
    meta = a['metadata']
    # tag -> metadata key for simple text fields
    text_fields = {
        'MASTERIDENTIFER': 'title',
        'TYPE': 'doc_type',
        'DOCUMENTID': 'doc_id',
        'VERSION': 'version',
        'AUTHOR': 'author',
        'RESOURCEPATH': 'doc_location_path',
    }
    # tag -> (millis key, iso-time key) for timestamp fields
    millis_fields = {
        'ENDTIMESTAMP_MILLIS': ('end_timestamp_millis', 'end_time'),
        'STARTTIMESTAMP_MILLIS': ('start_timestamp_millis', 'start_time'),
        'CREATETIMESTAMP_MILLIS': ('create_timestamp_millis', 'create_time'),
        'LASTMODIFIEDTIMESTAMP_MILLIS': ('last_modified_timestamp_millis',
                                         'last_modified_time'),
        'PUBLISHEDTIMESTAMP_MILLIS': ('published_timestamp_millis',
                                      'published_time'),
    }
    if el.tag == 'CONTENT' and event == 'end':
        meta['record_id'] = el.get('RECORDID')
    elif el.tag in text_fields and event == 'end':
        meta[text_fields[el.tag]] = clean_text(el.text)
    elif el.tag in millis_fields and event == 'end':
        millis_key, time_key = millis_fields[el.tag]
        millis = int(clean_text(el.text))
        meta[millis_key] = millis
        meta[time_key] = get_iso_datetime_from_millis(millis)
    elif el.tag == meta['doc_type']:
        # inside the doc-type element: data mode is on between start and end
        a['is_data'] = (event == 'start')
    elif a['is_data'] and event == 'end' and el.text:
        # treat all text as html
        # lxml will automatically wrap plain text in a para, body and html tags
        structured_content = []
        text_list = []
        extractors = [
            ListExtractor(excluded_tags=['table']),
            TableExtractor(),
            TextExtractor(excluded_tags=[
                'ul', 'ol', 'table', 'title', 'h1', 'h2', 'h3', 'h4'
            ]),
            HeadingExtractor(excluded_tags=['ul', 'ol', 'table'])
        ]
        payload = BytesIO(fix_content(el.text).encode('utf-8'))
        for evt, node in element_iterator(payload, excluded_html_tags,
                                          html=True):
            structured, text = process_html_element(node, evt, extractors)
            structured_content.extend(structured)
            text_list.extend(text)
        # a single text entry is stored as a scalar, not a one-element list
        data = {'text': text_list[0] if len(text_list) == 1 else text_list}
        if structured_content:
            data['structured_content'] = structured_content
        a['data'][el.tag.lower()] = data
    return a