예제 #1
0
def check_extract_article(test_filename,
                          expected_filename,
                          content_digests=False,
                          node_indexes=False,
                          use_readability_js=False):
    """Test end-to-end article extraction. Ensure that HTML from file matches JSON from file after parsing is applied."""
    data_dir = os.path.join(os.path.dirname(__file__), "data")

    # Load the HTML input fixture
    with open(os.path.join(data_dir, test_filename)) as handle:
        html = handle.read()

    # Extract simplified article HTML, optionally via Readability.js
    extra_kwargs = {"use_readability": True} if use_readability_js else {}
    article_json = simple_json_from_html_string(html, content_digests,
                                                node_indexes, **extra_kwargs)

    # Load the expected simplified article JSON
    with open(os.path.join(data_dir, expected_filename)) as handle:
        expected_article_json = json.load(handle)

    # Compare full JSON documents (catches unexpected fields on either side)
    assert article_json == expected_article_json
예제 #2
0
def make_readable(request_html):
    """Use an extraction method to get the main article html

    This function checks if ReadabiliPy is installed with NodeJS support, as
    that generally yields better results. If that is not available, it falls
    back on readability.
    """
    # Probe for ReadabiliPy with a working NodeJS backend.
    try:
        import readabilipy
        have_readabilipy_js = readabilipy.simple_json.have_node()
    except ImportError:
        have_readabilipy_js = False

    if not have_readabilipy_js:
        # Fallback: pure-Python readability port.
        logger.info("Converting HTML using readability")
        doc = readability.Document(request_html)
        return doc.title(), doc.summary(html_partial=True)

    logger.info("Converting HTML using Readability.js")
    article = readabilipy.simple_json_from_html_string(
        request_html, use_readability=True)
    return article["title"], article["content"]
예제 #3
0
def check_html_output_does_not_contain_tag(test_fragment, vetoed_tag):
    """Check that vetoed tag is not present when parsing HTML fragment.

    :param test_fragment: HTML fragment to simplify.
    :param vetoed_tag: tag name (without angle brackets) that must not
        appear in the simplified output, either opening or closing.
    """
    article_json = simple_json_from_html_string(test_fragment)
    content = article_json["plain_content"]
    # BUG FIX: the original did str(content) first, so the None check was
    # dead code and the literal string "None" was scanned instead. Only
    # inspect the output when the parser actually produced some.
    if content is not None:
        content = str(content)
        for element in ("<{}>".format(vetoed_tag), "</{}>".format(vetoed_tag)):
            assert element not in content
예제 #4
0
def run_readabilipy(htmlstring):
    '''try with the dragnet module'''
    try:
        article = simple_json_from_html_string(htmlstring,
                                               use_readability=True)
    except (TypeError, ValueError):
        # Extraction failed: signal with an empty string.
        return ''
    # Join every extracted text block into one newline-separated string.
    return '\n'.join(block['text'] for block in article['plain_text'])  # sanitize(content)
예제 #5
0
def extract_text_rpy(url):
    """Download *url* and extract its main article text with ReadabiliPy.

    Returns a tuple ``(save_article, parse_res, article)``:
    - ``save_article``: True only when extraction fully succeeded.
    - ``parse_res`` status codes: 1 = ok, 3 = PDF content, 4 = connection
      error, 5 = extraction failure, 6 = no text extracted, 7 = timeout.
    - ``article``: ``(text, title, date, byline, simplified_html)`` on
      success; otherwise None (or the raw extraction dict for code 6).
    """
    save_article = False
    parse_res = 1
    article = None
    try:
        response = requests.get(url, timeout=5)
    except requests.exceptions.ConnectionError:
        parse_res = 4
        return save_article, parse_res, article
    except (TimeoutError, requests.exceptions.Timeout):
        # BUG FIX: requests signals timeouts with requests.exceptions.Timeout,
        # which is NOT a subclass of the builtin TimeoutError the original
        # caught, so read timeouts previously propagated uncaught.
        parse_res = 7
        return save_article, parse_res, article

    # Cheap PDF detection from the URL suffix.
    if 'pdf' in url[-4:]:
        parse_res = 3
        return save_article, parse_res, article

    # Extracting the source code of the page.
    html_string = response.text
    if 'pdf' in html_string.lower()[:5]:  # %pdf magic number
        parse_res = 3
        return save_article, parse_res, article
    # use_readability=True -> Mozilla's Readability.js
    try:
        article = simple_json_from_html_string(html_string,
                                               use_readability=True)
    except Exception:  # narrowed from bare except: keep Ctrl-C/SystemExit working
        parse_res = 5
        return save_article, parse_res, article

    html_simple = article.get('content', None)
    text_list = article.get('plain_text', None)
    if text_list is None or text_list == []:
        parse_res = 6
        return save_article, parse_res, article

    stitle = article.get('title', '')
    sdate = article.get('date', None)
    byline = article.get('byline', None)  # author information

    if sdate:
        sdate = str(sdate)[:100]

    if stitle is None:
        stitle_art = ''
    else:
        stitle_art = stitle
        stitle = stitle[:150]  # max length
    txt_str, parse_res = parse_textli(text_list, parse_res)
    # NOTE(review): parse_textli's text is deliberately discarded in favour of
    # Readability's own 'textContent' field; only parse_res is kept. Confirm
    # this is intentional before removing the parse_textli call.
    txt_str = article['textContent']
    text_str = stitle_art + '\n' + txt_str
    article = (text_str, stitle, sdate, byline, html_simple)
    save_article = True
    return save_article, parse_res, article
예제 #6
0
def run_readabilipy(htmlstring):
    '''try with the readability.py module'''
    try:
        article = simple_json_from_html_string(htmlstring,
                                               use_readability=True)
        # Stitch all extracted paragraphs together, one per line.
        return '\n'.join(elem['text'] for elem in article['plain_text'])  # sanitize(content)
    except Exception:
        # Any failure during extraction yields an empty result.
        return ''
예제 #7
0
def get_normalised_html_output(test_fragment, expected_output=None):
    """Get normalised HTML output."""
    # When no explicit expectation is given, the fragment should round-trip.
    expected = test_fragment if expected_output is None else expected_output
    article_json = simple_json_from_html_string(test_fragment)
    simplified = str(article_json["plain_content"])
    # Normalise whitespace on both sides before returning them for comparison.
    normalised_expectation = strip_html_whitespace(expected)
    normalised_result = strip_html_whitespace(simplified)
    print("expectation:", normalised_expectation)
    print("result:", normalised_result)
    return (normalised_expectation, normalised_result)
예제 #8
0
def get_paragraphs_readabilipy(str_text, mode):
    """
    using ReadabiliPy, requires NodeJS version >= 10
    """
    # `mode` is accepted for interface compatibility but unused here.
    try:
        article = simple_json_from_html_string(str_text, use_readability=True)
    except (TypeError, ValueError):
        # Keep the contract of always returning a non-empty list.
        return ['']
    return [elem['text'] for elem in article['plain_text']]
예제 #9
0
def readability_parser(path):
    """Parse the HTML file at *path* with ReadabiliPy (Readability.js).

    :param path: filesystem path to a UTF-8 encoded HTML file.
    :return: dict with 'title', 'author' (byline) and 'content' (concatenated
        plain text of the article).
    """
    # Close the file as soon as the content is read; parsing does not need it.
    with open(path, 'r', encoding="utf8") as f:
        html_string = f.read()
    article = simple_json_from_html_string(html_string,
                                           content_digests=False,
                                           node_indexes=False,
                                           use_readability=True)
    # str.join avoids the quadratic cost of repeated `text +=` concatenation.
    text = ''.join(item['text'] for item in article['plain_text'])
    return {
        'title': article['title'],
        'author': article['byline'],
        'content': text,
    }
예제 #10
0
def check_html_has_no_output(test_fragment):
    """Check that no output is present when parsing HTML fragment."""
    article_json = simple_json_from_html_string(test_fragment)
    plain = article_json["plain_content"]
    # "No output" means either a missing result or an empty wrapper div.
    assert plain is None or plain == "<div></div>"
예제 #11
0
    urls = read_file(args.urls_file)
else:
    urls = args.input_urls

# Assemble an EPUB whose chapters are readability-simplified web articles.
book = epub.EpubBook()

# add metadata describing the generated book
book.set_identifier('web-to-epub')
book.set_title('Book created with web-to-epub')
book.set_language('en')
book.add_author('Computer Program')

# Fetch each URL and convert its extracted article into one EPUB chapter.
chapters = []
for num, url in enumerate(urls, start=1):
    req = requests.get(url)
    # use_readability=True runs Mozilla's Readability.js for extraction
    article = simple_json_from_html_string(req.text, use_readability=True)
    content = f"<h1>Chapter {num}</h1>"
    content += f"<h1>{article['title']}</h1>"
    content += article['plain_content']
    title = f"{num} - {article['title']}"
    chapter = epub.EpubHtml(title=title, file_name=f"{num}.xhtml", lang='en')
    chapter.content = content
    chapters.append(chapter)

# Append a final chapter listing every source URL for attribution.
source_url_content = '<h1>Source URLs</h1>'
for url in urls:
    source_url_content += f"<br/>{url}"
source_url_chapter = epub.EpubHtml(title='Source URLs',
                                   file_name='source_urls.xhtml',
                                   lang='en')
source_url_chapter.content = source_url_content
def test_contentless_page():
    """Contentless pages should return an empty <div>."""
    parsed_content = simple_json_from_html_string("<html></html>")
    assert parsed_content["content"] == "<div></div>"
def test_empty_page():
    """Empty pages should return an empty <div>."""
    parsed_content = simple_json_from_html_string("")
    assert parsed_content["content"] == "<div></div>"