def test_table_cell_separator():
    """Render a two-column table with the default and a custom cell separator.

    The same HTML is converted twice: once with a default ``ParserConfig`` and
    once with ``table_cell_separator='\\t'``; each output is compared with the
    expected text.
    """
    html = '<html><body><table><tr><td>Hallo<br>Eins</td><td>Echo<br>Zwei</td></tr></table></html>'

    # (ParserConfig keyword arguments, expected text) - evaluated in order so
    # that each configuration is created right before it is used.
    cases = (
        ({}, 'Hallo Echo\nEins Zwei\n'),
        ({'table_cell_separator': '\t'}, 'Hallo\tEcho\nEins \tZwei\n'),
    )
    for config_kwargs, expected in cases:
        config = ParserConfig(**config_kwargs)
        assert get_text(html, config) == expected
def test_html_annotations(filter_str=''):
    """Check annotated-text extraction against the stored JSON references.

    Every file matching ``TESTCASE_PATTERN`` whose path contains
    ``filter_str`` is loaded as the reference; the HTML snippet next to it
    (same name, ``.html`` instead of ``.json``) is wrapped into a minimal
    document and converted with both the ``strict`` and the ``relaxed`` CSS
    profile.

    Args:
        filter_str: only test cases whose path contains this substring are
            run; the empty default selects all of them.
    """
    for json_path in glob(TESTCASE_PATTERN):
        if filter_str not in json_path:
            continue

        with open(json_path) as json_file:
            expected = load(json_file)

        with open(json_path.replace('.json', '.html')) as html_file:
            print(html_file.name)
            html = '<html><body>{}</body></html>'.format(html_file.read())

        for indentation_strategy in ('strict', 'relaxed'):
            parser_config = ParserConfig(
                css=CSS_PROFILES[indentation_strategy],
                annotation_rules=expected['annotation_rules'])
            result = get_annotated_text(html, parser_config)

            # turn every label entry into [label, annotated surface form]
            converted = []
            for entry in result['label']:
                surface = result['text'][entry[0]:entry[1]]
                converted.append([entry[2], surface])

            # print both versions to ease debugging of a mismatch
            if expected['result'] != converted:
                print("Reference:")
                print(expected['result'])
                print("\nConverted (indentation strategy: {})".format(
                    indentation_strategy))
                print(converted)

            # the strict profile must match exactly; every other profile is
            # compared with the whitespace-insensitive helper
            if indentation_strategy != 'strict':
                assert_equal_ignoring_whitespace(expected['result'],
                                                 converted)
            else:
                assert expected['result'] == converted
def test_html_snippets(filter_str=''):
    """Convert the HTML snippets and compare them with their text references.

    Every file matching ``TESTCASE_PATTERN`` whose path contains
    ``filter_str`` is read as the reference text; the HTML snippet next to it
    (same name, ``.html`` instead of ``.txt``) is wrapped into a minimal
    document and converted with the ``strict`` CSS profile. Trailing
    whitespace is stripped from both sides before the comparison.

    On a mismatch, diagnostics are printed and both versions are written to
    ``reference.txt`` and ``converted.txt`` in the current working directory
    so that they can be diffed.

    Args:
        filter_str: only test cases whose path contains this substring are
            run; the empty default selects all of them.
    """
    for testcase_txt in glob(TESTCASE_PATTERN):
        if filter_str not in testcase_txt:
            continue

        with open(testcase_txt) as f:
            reference_txt = f.read().rstrip()

        with open(testcase_txt.replace('.txt', '.html')) as f:
            print(f.name)
            html = '<html><body>{}</body></html>'.format(f.read())

        converted_txt = get_text(
            html, ParserConfig(css=CSS_PROFILES['strict'])).rstrip()

        if converted_txt != reference_txt:
            print('File:{}\nHTML:\n{}\n\nReference:\n{}\n\nConverted:\n{}'.
                  format(testcase_txt, html, reference_txt, converted_txt))
            print('HTML file:', testcase_txt.replace('.txt', '.html'))
            print("Visualize differences with `vimdiff reference.txt "
                  "converted.txt`")
            # Use context managers so that the dump files are flushed and
            # closed before the assertion below aborts the test.
            with open("reference.txt", "w") as dump:
                dump.write(reference_txt)
            with open("converted.txt", "w") as dump:
                dump.write(converted_txt)

        assert converted_txt == reference_txt
def test_display_anchors():
    """Check the output produced with ``display_anchors`` enabled.

    The named anchor is expected as ``[first](first)`` while the ``href``
    link is expected as plain text.
    """
    html = '''<html> <body> <a name="first">first</a> <a href="second">second</a> </body> </html> '''
    expected = '[first](first) second'

    rendered = get_text(html, ParserConfig(display_anchors=True))
    assert rendered.strip() == expected
def test_display_links():
    """Check the output produced with ``display_links`` enabled.

    Both ``href`` links are expected in ``[text](target)`` form while the
    named anchor is expected as plain text.
    """
    html = '''<html> <body> <a href="first">first</a> <a href="second">second</a> <a name="third">third</a> </body> </html> '''
    expected = '[first](first) [second](second) third'

    rendered = get_text(html, ParserConfig(display_links=True))
    assert rendered.strip() == expected
def test_display_images_deduplicated():
    """Check image captions with ``deduplicate_captions`` enabled.

    Two of the three images share the same ``alt`` text; the expected output
    contains that caption only once, followed by the caption of the third
    image.
    """
    html = '''<html> <body> <img src="test1" alt="Ein Test Bild" title="Hallo" /> <img src="test2" alt="Ein Test Bild" title="Juhu" /> <img src="test3" alt="Ein zweites Bild" title="Echo" /> </body> </html> '''
    expected = '[Ein Test Bild] [Ein zweites Bild]'

    parser_config = ParserConfig(display_images=True,
                                 deduplicate_captions=True)
    rendered = get_text(html, parser_config)
    assert rendered.strip() == expected
def __init__(self, html_tree, config=None):
    """Initialise the parser state and convert ``html_tree`` immediately.

    Args:
        html_tree: the HTML tree that is passed on to
            ``_parse_html_tree``.
        config: an optional ``ParserConfig``; a default-constructed one is
            used when no (or a falsy) config object is provided.
    """
    self.config = config or ParserConfig()

    # call table for opening tags; the link and image handlers are only
    # registered when the configuration asks for them
    link_start = self._start_a if self.config.parse_a() else None
    image_start = self._start_img if self.config.display_images else None
    self.start_tag_handler_dict = dict(
        table=self._start_table,
        tr=self._start_tr,
        td=self._start_td,
        th=self._start_td,
        ul=self._start_ul,
        ol=self._start_ol,
        li=self._start_li,
        br=self._newline,
        a=link_start,
        img=image_start,
    )

    # call table for closing tags
    link_end = self._end_a if self.config.parse_a() else None
    self.end_tag_handler_dict = dict(
        table=self._end_table,
        ul=self._end_ul,
        ol=self._end_ol,
        td=self._end_td,
        th=self._end_td,
        a=link_end,
    )

    # parser state
    self.current_tag = [self.config.css['body']]
    self.current_line = [Line()]
    self.next_line = [Line()]

    # Canvases used for displaying text: index 0 is the root canvas. Tables
    # write into child canvases that are created for every table line and
    # merged with the root canvas at the end of a table.
    self.clean_text_lines = [[]]

    self.current_table = []
    self.li_counter = []
    self.li_level = 0
    self.last_caption = None

    # only used if display_links is enabled
    self.link_target = ''

    # walk the tree, then flush a pending, non-empty last line
    self._parse_html_tree(html_tree)
    pending_line = self.current_line[-1]
    if pending_line:
        self._write_line()
def test_limit_whitespace_affixes():
    """Check whitespace handling around inline elements and inside ``<pre>``.

    The snippet is converted with the relaxed CSS profile and the stripped
    result is compared with the expected text.
    """
    html = '''<html> <body> hallo<span>echo</span> <pre> def <span>hallo</span>(): print("echo") </pre> </body> </html> '''
    expected = ('hallo echo\n\n'
                'def hallo():\n'
                ' print("echo")')

    rendered = get_text(html, ParserConfig(css=RELAXED_CSS_PROFILE))
    assert rendered.strip() == expected
def __init__(self, html_tree: lxml.html.HtmlElement,
             config: ParserConfig = None):
    """Initialise the parser state and convert ``html_tree`` immediately.

    Args:
        html_tree: the HTML tree that is passed on to
            ``_parse_html_tree``.
        config: an optional ``ParserConfig``; a default-constructed one is
            used when no (or a falsy) config object is provided.
    """
    self.config = config or ParserConfig()

    # call table for opening tags; the link and image handlers are only
    # registered when the configuration asks for them
    link_start = self._start_a if self.config.parse_a() else None
    image_start = self._start_img if self.config.display_images else None
    self.start_tag_handler_dict = dict(
        table=self._start_table,
        tr=self._start_tr,
        td=self._start_td,
        th=self._start_td,
        ul=self._start_ul,
        ol=self._start_ol,
        li=self._start_li,
        br=self._newline,
        a=link_start,
        img=image_start,
    )

    # call table for closing tags
    link_end = self._end_a if self.config.parse_a() else None
    self.end_tag_handler_dict = dict(
        table=self._end_table,
        ul=self._end_ul,
        ol=self._end_ol,
        td=self._end_td,
        th=self._end_td,
        a=link_end,
    )

    # parser state
    self.canvas = Canvas()
    self.css = self.config.css
    self.apply_attributes = self.config.attribute_handler.apply_attributes

    # tag stack, seeded with the body style bound to the canvas
    body_tag = self.css['body'].set_canvas(self.canvas)
    self.tags = [body_tag]

    self.current_table = []
    self.li_counter = []
    self.last_caption = None

    # only used if display_links is enabled
    self.link_target = ''

    # walk the html tree
    self._parse_html_tree(html_tree)
def test_html_snippets(filter_str=''):
    """Convert the HTML snippets and compare them with their text references.

    Every file matching ``TESTCASE_PATTERN`` whose path contains
    ``filter_str`` provides the reference text; the HTML snippet next to it
    (same name, ``.html`` instead of ``.txt``) is wrapped into a minimal
    document and converted with the ``strict`` CSS profile. Trailing
    whitespace is stripped from both sides before the comparison.

    Args:
        filter_str: only test cases whose path contains this substring are
            run; the empty default selects all of them.
    """
    for testcase_txt in glob(TESTCASE_PATTERN):
        if filter_str not in testcase_txt:
            continue

        with open(testcase_txt) as reference_file:
            expected = reference_file.read().rstrip()

        html_path = testcase_txt.replace(".txt", ".html")
        with open(html_path) as html_file:
            print(html_file.name)
            html = "<html><body>{}</body></html>".format(html_file.read())

        strict_config = ParserConfig(css=CSS_PROFILES['strict'])
        actual = get_text(html, strict_config).rstrip()

        # show all ingredients of a failing comparison before asserting
        if actual != expected:
            report = "File:{}\nHTML:\n{}\n\nReference:\n{}\n\nConverted:\n{}"
            print(report.format(testcase_txt, html, expected, actual))

        assert actual == expected
#!/usr/bin/env python
# encoding: utf-8

"""
Tests different white-space handling.
"""

from inscriptis import get_text
from inscriptis.css_profiles import CSS_PROFILES
from inscriptis.model.config import ParserConfig

# all conversions in this module use the strict CSS profile
config = ParserConfig(css=CSS_PROFILES['strict'])


def test_white_space():
    """Check the text produced for different ``white-space`` style values.

    The same content (``<i>1</i>2\\n3``) is wrapped in a ``<span>`` using
    ``normal``, ``nowrap``, ``pre`` and ``pre-line``; each conversion is
    compared with its expected text.
    """
    # (html snippet, expected text) pairs, checked in order
    cases = (
        ((u'<body><span style="white-space: normal"><i>1</i>2\n3</span>'
          u'</body>'),
         u'12 3'),
        ((u'<body><span style="white-space: nowrap"><i>1</i>2\n3</span>'
          u'</body>'),
         u'12 3'),
        ((u'<body><span style="white-space: pre"><i>1</i>2\n3</span>'
          u'</body>'),
         u'12\n3'),
        ((u'<body><span style="white-space: pre-line"><i>1</i>2\n3</span>'
          u'</body>'),
         u'12\n3'),
    )
    for html, expected in cases:
        assert get_text(html, config) == expected
}, # SemanticElementType.FIGURE: { # "img", # "figure", # "picture", # } } STYLE_HTML_ELEMENTS = {} INSCRIPTIS_ANNOTATION_RULES = { t: (k, ) for (k, v) in SEMANTIC_HTML_ELEMENTS.items() for t in v } INSCRIPTIS_CONFIG = ParserConfig( css=STRICT_CSS_PROFILE, display_images=False, deduplicate_captions=True, display_links=False, annotation_rules=INSCRIPTIS_ANNOTATION_RULES, ) class StructuredHtmlParser(Inscriptis): """Subclass of ```inscriptis.Inscriptis``` to provide the position of structural elements.""" __slots__ = [ "link_range_to_target", "anchors", "styled_elements", ] @staticmethod
print('\nInscript comes with ABSOLUTELY NO WARRANTY.') print('This is free software and you are welcome to redistribute it ' 'under the terms of the {}.'.format(__license__)) sys.exit(0) if not args.input: html_content = sys.stdin.read() elif isfile(args.input): with open(args.input, encoding=args.encoding, errors='ignore') as f: html_content = f.read() elif args.input.startswith("http://") or args.input.startswith("https://"): html_content = requests.get(args.input).text else: print("ERROR: Cannot open input file '{}'.\n".format(args.input)) parser.print_help() sys.exit(-1) css_profile = CSS_PROFILES['relaxed'] if args.indentation == 'extended' \ else CSS_PROFILES['strict'] config = ParserConfig(css=css_profile, display_images=args.display_image_captions, deduplicate_captions=args.deduplicate_image_captions, display_links=args.display_link_targets, display_anchors=args.display_anchor_urls) text = get_text(html_content, config) if args.output: with open(args.output, 'w', encoding=args.encoding) as open_file: open_file.write(text) else: print(text)
with Path(args.annotation_rules).open() as f: annotation_rules = load(f) except IOError: print("ERROR: Cannot open annotation rule file '{0}'.".format( args.annotation_rules )) sys.exit(-1) else: annotation_rules = None css_profile = CSS_PROFILES['relaxed'] if args.indentation == 'extended' \ else CSS_PROFILES['strict'] config = ParserConfig(css=css_profile, display_images=args.display_image_captions, deduplicate_captions=args.deduplicate_image_captions, display_links=args.display_link_targets, display_anchors=args.display_anchor_urls, annotation_rules=annotation_rules, table_cell_separator=args.table_cell_separator) if not annotation_rules: output = get_text(html_content, config) else: output = args.postprocessor( get_annotated_text(html_content, config)) if hasattr(args.postprocessor, 'verbatim') \ and not args.postprocessor.verbatim: output = dumps(output) if args.output: with Path(args.output).open('w', encoding=DEFAULT_ENCODING) as f: f.write(output)
#!/usr/bin/env python3 # coding:utf-8 ''' Inscriptis Web Service ''' from flask import request, Response, Flask from inscriptis import get_text, __version__ from inscriptis.css_profiles import RELAXED_CSS_PROFILE from inscriptis.model.config import ParserConfig app = Flask(__name__) CONFIG = ParserConfig(css=RELAXED_CSS_PROFILE, display_images=True, deduplicate_captions=True, display_links=False) @app.route("/") def index(): return "Hello" @app.route("/get_text", methods=['POST']) def get_text_call(): ''' Returns: the text representation of the given HTML content. ''' content_type = request.headers['Content-type'] if '; encoding=' in content_type: