def check_extract_article(test_filename, expected_filename, content_digests=False, node_indexes=False, use_readability_js=False):
    """Test end-to-end article extraction. Ensure that HTML from file matches JSON from file after parsing is applied."""
    test_data_dir = "data"
    # Read HTML test file
    test_filepath = os.path.join(os.path.dirname(__file__), test_data_dir, test_filename)
    with open(test_filepath) as h:
        html = h.read()
    # Extract simplified article HTML
    if use_readability_js:
        article_json = simple_json_from_html_string(html, content_digests, node_indexes, use_readability=True)
    else:
        article_json = simple_json_from_html_string(html, content_digests, node_indexes)
    # Get expected simplified article JSON
    expected_filepath = os.path.join(os.path.dirname(__file__), test_data_dir, expected_filename)
    with open(expected_filepath) as h:
        expected_article_json = json.loads(h.read())
    # Test full JSON matches (checks for unexpected fields in either actual or expected JSON)
    assert article_json == expected_article_json
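# A minimal sketch of how check_extract_article might be driven from pytest.
# The fixture filenames below are hypothetical placeholders, not files that
# necessarily exist under the "data" directory.
import pytest

@pytest.mark.parametrize("use_readability_js", [False, True])
def test_extract_article_example(use_readability_js):
    check_extract_article(
        "example_full_page.html",      # hypothetical input fixture
        "example_full_article.json",   # hypothetical expected output
        use_readability_js=use_readability_js,
    )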
def make_readable(request_html):
    """Use an extraction method to get the main article html

    This function checks if ReadabiliPy is installed with NodeJS support,
    as that generally yields better results. If that is not available,
    it falls back on readability.
    """
    have_readabilipy_js = False
    try:
        import readabilipy
        have_readabilipy_js = readabilipy.simple_json.have_node()
    except ImportError:
        pass
    if have_readabilipy_js:
        logger.info("Converting HTML using Readability.js")
        article = readabilipy.simple_json_from_html_string(
            request_html, use_readability=True)
        title = article["title"]
        raw_html = article["content"]
    else:
        logger.info("Converting HTML using readability")
        doc = readability.Document(request_html)
        title = doc.title()
        raw_html = doc.summary(html_partial=True)
    return title, raw_html
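# Example use of make_readable, assuming `requests` is available; the URL is a
# placeholder. Whichever backend is installed, the caller only ever sees the
# (title, raw_html) pair.
import requests

response = requests.get("https://example.com/article")  # placeholder URL
title, raw_html = make_readable(response.text)
print(title)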
def check_html_output_does_not_contain_tag(test_fragment, vetoed_tag):
    """Check that vetoed tag is not present when parsing HTML fragment."""
    article_json = simple_json_from_html_string(test_fragment)
    # Check that neither <tag> nor </tag> appear in the output
    # (test the value for None before stringifying, since str(None) == "None"
    # would make the guard always pass)
    content = article_json["plain_content"]
    if content is not None:
        content = str(content)
        for element in ["<{}>".format(vetoed_tag), "</{}>".format(vetoed_tag)]:
            assert element not in content
def run_readabilipy(htmlstring):
    '''try with the ReadabiliPy module'''
    try:
        article = simple_json_from_html_string(htmlstring, use_readability=True)
    except (TypeError, ValueError):
        return ''
    returnlist = [textelem['text'] for textelem in article['plain_text']]
    return '\n'.join(returnlist)  # sanitize(content)
def extract_text_rpy(url):
    save_article = False
    parse_res = 1
    article = None
    try:
        response = requests.get(url, timeout=5)
    except requests.exceptions.ConnectionError:
        parse_res = 4
        return save_article, parse_res, article
    except requests.exceptions.Timeout:
        parse_res = 7
        return save_article, parse_res, article
    if 'pdf' in url[-4:]:
        parse_res = 3
        return save_article, parse_res, article
    # Extract the source code of the page
    html_string = response.text
    if 'pdf' in html_string.lower()[:5]:  # %PDF magic bytes
        parse_res = 3
        return save_article, parse_res, article
    # use_readability=True -> Mozilla's Readability.js
    try:
        article = simple_json_from_html_string(html_string, use_readability=True)
    except Exception:
        parse_res = 5
        return save_article, parse_res, article
    html_simple = article.get('content', None)
    text_list = article.get('plain_text', None)
    if text_list is None or text_list == []:
        parse_res = 6
        return save_article, parse_res, article
    stitle = article.get('title', '')
    sdate = article.get('date', None)
    byline = article.get('byline', None)  # author information
    if sdate:
        sdate = str(sdate)[:100]
    if stitle is None:
        stitle_art = ''
    else:
        stitle_art = stitle
        stitle = stitle[:150]  # max length
    txt_str, parse_res = parse_textli(text_list, parse_res)
    # 'textContent' and 'siteName' are also included in the article and could
    # be used instead; here 'textContent' overrides the stitched list.
    txt_str = article['textContent']
    text_str = stitle_art + '\n' + txt_str
    article = (text_str, stitle, sdate, byline, html_simple)
    save_article = True
    return save_article, parse_res, article
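# Sketch of how extract_text_rpy's three-tuple might be consumed. The parse_res
# codes (3 = PDF, 4 = connection error, 5 = parse failure, 6 = no text,
# 7 = timeout) are read off the branches above, so treat them as inferred.
save_article, parse_res, article = extract_text_rpy("https://example.com/story")  # placeholder URL
if save_article:
    text_str, stitle, sdate, byline, html_simple = article
    print(stitle, len(text_str))
else:
    print("extraction failed with code", parse_res)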
def run_readabilipy(htmlstring):
    '''try with the ReadabiliPy module'''
    try:
        article = simple_json_from_html_string(htmlstring, use_readability=True)
        returnlist = [textelem['text'] for textelem in article['plain_text']]
        return '\n'.join(returnlist)  # sanitize(content)
    except Exception as err:
        # print('Readabilipy exception:', err)
        return ''
def get_normalised_html_output(test_fragment, expected_output=None):
    """Get normalised HTML output."""
    if expected_output is None:
        expected_output = test_fragment
    article_json = simple_json_from_html_string(test_fragment)
    content = str(article_json["plain_content"])
    # Check that expected output is present after simplifying the HTML
    normalised_expectation = strip_html_whitespace(expected_output)
    normalised_result = strip_html_whitespace(content)
    print("expectation:", normalised_expectation)
    print("result:", normalised_result)
    return (normalised_expectation, normalised_result)
def get_paragraphs_readabilipy(str_text, mode):
    """using ReadabiliPy, requires NodeJS version >= 10"""
    try:
        article = simple_json_from_html_string(str_text, use_readability=True)
    except (TypeError, ValueError):
        return ['']
    returnlist = []
    for textelem in article['plain_text']:
        returnlist.append(textelem['text'])
    return returnlist
def readability_parser(path):
    with open(path, 'r', encoding="utf8") as f:
        html_string = f.read()
    article = simple_json_from_html_string(html_string, content_digests=False, node_indexes=False, use_readability=True)
    # article_json = json.dumps(article)
    parsed_article = {}
    text = ''
    for item in article['plain_text']:
        text += item['text']
    parsed_article['title'] = article['title']
    parsed_article['author'] = article['byline']
    parsed_article['content'] = text
    return parsed_article
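# Example call to readability_parser on a saved page; "page.html" is a
# placeholder path, not a file shipped with the snippet.
parsed = readability_parser("page.html")
print(parsed['title'], parsed['author'])
print(parsed['content'][:200])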
def check_html_has_no_output(test_fragment):
    """Check that no output is present when parsing HTML fragment."""
    article_json = simple_json_from_html_string(test_fragment)
    # Check that there is no output
    assert article_json["plain_content"] is None or article_json["plain_content"] == "<div></div>"
if args.urls_file:
    urls = read_file(args.urls_file)
else:
    urls = args.input_urls

book = epub.EpubBook()

# add metadata
book.set_identifier('web-to-epub')
book.set_title('Book created with web-to-epub')
book.set_language('en')
book.add_author('Computer Program')

chapters = []
for num, url in enumerate(urls, start=1):
    req = requests.get(url)
    article = simple_json_from_html_string(req.text, use_readability=True)
    content = f"<h1>Chapter {num}</h1>"
    content += f"<h1>{article['title']}</h1>"
    content += article['plain_content']
    title = f"{num} - {article['title']}"
    chapter = epub.EpubHtml(title=title, file_name=f"{num}.xhtml", lang='en')
    chapter.content = content
    chapters.append(chapter)

source_url_content = '<h1>Source URLs</h1>'
for url in urls:
    source_url_content += f"<br/>{url}"
source_url_chapter = epub.EpubHtml(title='Source URLs', file_name='source_urls.xhtml', lang='en')
source_url_chapter.content = source_url_content
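# A hedged sketch of how the book might then be assembled and written using
# ebooklib's standard API; the output filename is illustrative.
for chapter in chapters:
    book.add_item(chapter)
book.add_item(source_url_chapter)
book.toc = chapters + [source_url_chapter]
book.add_item(epub.EpubNcx())  # navigation files required by EPUB readers
book.add_item(epub.EpubNav())
book.spine = ['nav'] + chapters + [source_url_chapter]
epub.write_epub('web-to-epub.epub', book, {})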
def test_contentless_page():
    """Contentless pages should return an empty <div>."""
    html = "<html></html>"
    parsed_content = simple_json_from_html_string(html)
    assert parsed_content["content"] == "<div></div>"
def test_empty_page():
    """Empty pages should return an empty <div>."""
    html = ""
    parsed_content = simple_json_from_html_string(html)
    assert parsed_content["content"] == "<div></div>"