Example #1
def parseArticles(listoflinks):
    """
	input list of links, returned as list of plain txt
	"""
    text = []
    for link in links:
        # clean up html, getting rid of unwanted text
        html1 = urllib.urlopen(link).read()
        readable_title = readability.Document(html1).short_title()
        readable_article = readability.Document(html1).summary()

        # parse html
        article_soupify = BeautifulSoup(readable_article, "lxml")
        text.append(readable_title + article_soupify.get_text())
    return text
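The function above uses the Python 2 urllib.urlopen API. A rough Python 3 equivalent of the same idea (a sketch, not taken from the original source) might look like this:

# Sketch only: Python 3 rewrite of parseArticles using urllib.request.
import urllib.request

import readability
from bs4 import BeautifulSoup


def parse_articles(links):
    """Take a list of article links and return a list of plain-text articles."""
    texts = []
    for link in links:
        html = urllib.request.urlopen(link).read()
        doc = readability.Document(html)
        soup = BeautifulSoup(doc.summary(), "lxml")
        texts.append(doc.short_title() + soup.get_text())
    return texts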
Example #2
    def retrieve_pdf(self, pdf_url, filename):
        """Turn the HTML article in a clean pdf file"""
        # Steps
        # 1. Pull the HTML page using requests
        # 2. Extract the article part of the page using readability
        # 3. Convert the article HTML to markdown using html2text
        # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
        # 5. Convert the HTML to PDF, pulling in images where needed
        # 6. Save the PDF to the specified filename.
        request_text = get_page_with_retry(pdf_url, return_text=True)
        doc = readability.Document(request_text)
        title = doc.title()
        raw_html = doc.summary(html_partial=True)

        h2t = html2text.HTML2Text()
        h2t.wrap_links = False
        text = h2t.handle(raw_html)

        # Add the title back to the document
        article = "# {title}\n\n{text}".format(title=title, text=text)

        # Convert to html, fixing relative image urls.
        md = markdown.Markdown()
        md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
        html_article = md.convert(article)

        if self.debug:
            with open("./paper.html", "w") as fp:
                fp.write(html_article)

        font_config = weasyprint.fonts.FontConfiguration()
        html = weasyprint.HTML(string=html_article, url_fetcher=url_fetcher)
        css = weasyprint.CSS(string=CSS, font_config=font_config)

        html.write_pdf(filename, stylesheets=[css], font_config=font_config)
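get_page_with_retry, ImgProcessor, CSS, and url_fetcher are project-specific helpers that are not shown here. A stripped-down sketch of the same requests → readability → html2text → markdown → weasyprint pipeline, with those helpers omitted (the function name and the bare weasyprint call are assumptions, not the original code):

# Minimal sketch of the HTML-to-PDF pipeline; retries, image URL fixing and
# the project's own CSS are left out.
import html2text
import markdown
import readability
import requests
import weasyprint


def html_to_pdf(url, filename):
    page_html = requests.get(url).text                 # 1. pull the page
    doc = readability.Document(page_html)              # 2. extract the article
    title = doc.title()
    raw_html = doc.summary(html_partial=True)

    h2t = html2text.HTML2Text()
    h2t.wrap_links = False
    text = h2t.handle(raw_html)                        # 3. HTML -> markdown

    article = "# {title}\n\n{text}".format(title=title, text=text)
    html_article = markdown.markdown(article)          # 4. markdown -> sanitized HTML

    weasyprint.HTML(string=html_article).write_pdf(filename)  # 5. HTML -> PDF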
Example #3
def main():
    #url = "http://johnpaulett.com/2009/10/15/html-to-restructured-text-in-python-using-pandoc/"
    url = "http://antirez.com/post/take-advantage-of-redis-adding-it-to-your-stack.html"
    html = retrive_page(url)
    readable_html = readability.Document(html).summary()
    text = html2text(readable_html)
    print text
Example #4
def make_readable(request_html):
    """Use an extraction method to get the main article html

    This function checks if ReadabiliPy is installed with NodeJS support, as
    that generally yields better results. If that is not available, it falls
    back on readability.
    """

    have_readabilipy_js = False
    try:
        import readabilipy

        have_readabilipy_js = readabilipy.simple_json.have_node()
    except ImportError:
        pass

    if have_readabilipy_js:
        logger.info("Converting HTML using Readability.js")
        article = readabilipy.simple_json_from_html_string(
            request_html, use_readability=True)
        title = article["title"]
        raw_html = article["content"]
    else:
        logger.info("Converting HTML using readability")
        doc = readability.Document(request_html)
        title = doc.title()
        raw_html = doc.summary(html_partial=True)
    return title, raw_html
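A possible way to call make_readable on a fetched page (the URL and the requests-based fetch are illustrative assumptions):

# Hypothetical usage of make_readable; the URL is only an example.
import requests

html = requests.get("https://example.com/some-article").text
title, raw_html = make_readable(html)
print(title)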
Example #5
def getRequirements(url: str, sourcetype: str) -> list:
    """Runs the single-link main function."""
    result = str()
    results = list()
    try:
        if sourcetype == "html":
            parser = newspaper.build(url)
            for article in parser.articles:
                a = newspaper.Article(article.url)
                a.download()
                a.parse()
                a.nlp()
                doc = readability.Document(a.html)
                print(doc)
                # print(doc.summary())
                # results = extractRequirements(doc.summary())
                results = extractRequirements(doc)
        elif sourcetype == "text":
            bytesText = simpleGet(url)
            results = extractRequirements(bytesText.decode("utf-8"))
    except Exception as e:
        logging.exception(e)
    finally:
        print(result)
        # result = "".join(results) + "\n"
        # return result
        return results
Example #6
def get_content(html):
    document = readability.Document(html)
    content_html = document.summary()
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    short_title = document.short_title()

    return short_title, content_text
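This readability-plus-lxml pattern recurs in several of the examples below. A hedged usage sketch (the URL is an assumption):

# Hypothetical call to get_content on a fetched page.
import requests

html = requests.get("https://example.com/article").text
title, body = get_content(html)
print(title)
print(body[:200])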
Example #7
def get_content(html: str) -> Tuple[str, str]:
    document = readability.Document(html)
    content_html = document.summary()
    # Strip the HTML tags
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    short_title = document.short_title()

    return short_title, content_text
Example #8
    def parse_article(self, url, html):
        rdoc = readability.Document(html)
        summary = rdoc.summary()
        lang_id, _ = langid.classify(summary)
        article = newspaper.Article(url, config=self.config, language=lang_id)
        article.set_html(html)
        article.parse()
        return article
Example #9
def extract_fulltext(url):
    resp = requests.get(url)
    doc = readability.Document(resp.text)

    summary = doc.summary()
    body = lxml.html.document_fromstring(summary)

    return body.text_content()
Example #10
def write_readable(base_path, fetcher, url_map, url):
    orig = fetcher.urlread(url)

    options = {'url': url, 'urlfetch': fetcher}
    rdbl_doc = readability.Document(orig, **options)
    summary = rdbl_doc.summary()

    path = make_readable_path(base_path, url_map, url)
    return write_file(path, summary.html)
Example #11
def store_pretty(url):
    r = requests.get(url)
    html = r.text
    doc = rd.Document(html)
    article = doc.summary()
    soup = BeautifulSoup(article, 'html.parser')
    uni = soup.get_text(strip=True)
    cleaner = unidecode(uni)
    return cleaner
Example #12
def get_content(html):
    # Get a (title, body) tuple from the HTML string
    document = readability.Document(html)
    content_html = document.summary()

    # Strip the HTML tags and keep only the body text
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    short_title = document.short_title()
    return short_title, content_text
Example #13
def summarize_html(html_text: str) -> str:
    """
    Uses readability to summarize the HTML response into a summary
    """
    if html_text.strip() == "":
        raise URLCacheException("No html provided to summarize")
    doc: readability.Document = readability.Document(html_text)
    summary: str = doc.summary()
    return summary
Example #14
def get_content(html):
    document = readability.Document(html)
    content_html = document.summary()
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    content_clean = re.sub('[\t\r\n]', '', content_text)
    content_strip = " ".join(content_clean.split())
    #content_final=re.sub(r'\D{2}\s\d{4}.\d{2}.\d{2}\s.{5}','',content_strip)

    return content_strip
Example #15
def fetch_url(url):
    """
    Fetch a URL and extract its title and text with readability.
    """
    html = urllib.request.urlopen(url).read()
    readable_html = readability.Document(html)
    readable_article = readable_html.summary()
    title = readable_html.short_title()
    text = BeautifulSoup(readable_article, "lxml").get_text()
    return title, text
Example #16
def get_content(html: str) -> Tuple[str, str]:
    """
    HTMLの文字列から (タイトル, 本文) のタプルを取得する。
    """
    document = readability.Document(html)
    content_html = document.summary()
    # Strip the HTML tags and keep only the body text.
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    short_title = document.short_title()

    return short_title, content_text
Example #17
def get_content(html):
    """
    HTMLから,タプルとして(タイトル, 本文)を取り出す.
    """
    document = readability.Document(html)
    content_html = document.summary()

    content_text = lxml.html.fromstring(content_html).text_content().strip()
    title = document.short_title()

    return title, content_text
Example #18
def get_content(html):
    """
    HTML 문자열에서 (<제목>, <본문>) 형태의 튜플을 찾은 뒤 반환합니다. 
    """
    document = readability.Document(html)
    content_html = document.summary()
    # Strip the HTML tags and extract only the text.
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    short_title = document.short_title()
    
    return short_title, content_text
Example #19
def parse_html_string(html_string):
    # Parse out title and body text
    document = readability.Document(html_string)

    # TODO(ajayjain): use document.short_title()?
    title = document.title()
    body_html = document.summary(html_partial=True)
    body_text = BeautifulSoup(body_html, 'lxml').get_text().strip()
    parsed = ParsedDocument(title=title, content=body_text)

    return parsed
Example #20
def fprocess(entry):
    guid = entry.guid
    title = entry.title.split(" - ")[0]
    published = entry.published
    source = entry.source.title
    link = entry.link

    web_content = readability.Document(requests.get(link).text)
    summary = translate_html(web_content.summary())

    newsStory = NewsStory(guid, title, summary, published, source, link)
    return newsStory
Example #21
    def get_content(self):
        """
        HTML の文字列から タイトル, 本文 を取得します。
        """
        document = readability.Document(self.response.content)
        title = document.title()
        content_html = document.summary()
        content_text = lxml.html.fromstring(
            content_html).text_content().strip()

        self.title = title
        self.body = content_text
Example #22
def extract_article_text(article):
    resp = requests.get(article['url'])
    doc = readability.Document(resp.text)

    summary = doc.summary()
    body = lxml.html.document_fromstring(summary)

    return {
        'title': doc.title(),
        'clean_html': summary,
        'body_text': body.text_content()
    }
Example #23
def execute_test(test_data):
    if test_data is None:
        return None
    else:
        base_path = os.path.join(TEST_DATA_PATH, test_data.test.name)
        fetcher = urlfetch.MockUrlFetch(base_path, test_data.test.url_map)
        doc = readability.Document(test_data.orig_html,
                                   url=test_data.test.url,
                                   urlfetch=fetcher)
        summary = doc.summary()
        diff = lxml.html.diff.htmldiff(test_data.rdbl_html, summary.html)
        return ReadabilityTestResult(test_data, summary.html, diff)
Example #24
    def extract(self, url: str, html_text: str):
        doc = readability.Document(html_text)
        self.content = {
            'url': url,
            'text': re.sub('<[^<]+?>', '',
                           doc.summary()),  # need to remove any tags
            'title': doc.title(),
            'publish_date': None,
            'top_image_url': None,
            'authors': None,
            'extraction_method': METHOD_READABILITY,
        }
Example #25
def get_clean_text(html):
    """
    generate clean text for given html
    """
    doc = readability.Document(html)
    try:
        doc._html()
        clean = doc.get_clean_html()
    except Exception as e:
        print(e)
        clean = html
    bsObj = bs(clean)
    return bsObj.get_text()
Example #26
    def get_filename(self, abs_url):
        request_text = get_page_with_retry(abs_url, return_text=True)
        doc = readability.Document(request_text)
        title = doc.title()

        # Clean the title and make it titlecase
        title = clean_string(title)
        title = titlecase.titlecase(title)
        title = title.replace(" ", "_")
        title = clean_string(title)
        name = title.strip("_") + ".pdf"
        name = unidecode.unidecode(name)
        logger.info("Created filename: %s" % name)
        return name
Example #27
def retrieve_url(url):

    # set a "real" user agent
    firefox = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:79.0) Gecko/20100101 Firefox/79.0"

    # retrieve the main text section from the url using the readability module and a Firefox user agent
    req = requests.get(url, headers={'User-Agent': firefox})
    doc = readability.Document(req.text)
    rawhtml = doc.summary(html_partial=True)

    # remove any html tags from output
    soup = BeautifulSoup(rawhtml, 'html.parser')
    cleantext = soup.get_text().strip('\n').encode('utf-8')

    return str(rawhtml), str(cleantext)
Example #28
def retrieve_url(url):

    # get a "real" user agent
    ua = fake_useragent.UserAgent()
    chrome = ua.chrome

    # retrieve the main text section from the url using the readability module and the Chrome user agent
    req = requests.get(url, headers={'User-Agent': chrome})
    doc = readability.Document(req.text)
    rawhtml = doc.summary(html_partial=True)

    # remove any html tags from output
    soup = BeautifulSoup(rawhtml, 'html.parser')
    cleantext = soup.get_text().encode('utf-8')

    return str(rawhtml), str(cleantext)
Example #29
File: PageClassfier.py, Project: hp027/gist
def readability_test(idxs, dist_path="pages"):
    lite_pages = []
    fat_pages = []
    for idx in idxs:
        c = file("%s/%s" % (dist_path, idx['md5'])).read()
        l = len(readability.Document(c).summary())
        if l < 200:
            lite_pages.append((l, idx['url']))
        elif l > 400:
            fat_pages.append((l, idx['url']))
            # print idx['url']
    for l in lite_pages:
        print l
    print "________________________________________________"
    for f in fat_pages:
        print f
Example #30
def summarize_html(html_text: str) -> str:
    """
    Uses readability to summarize the HTML response into a summary,
    then lxml to remove unnecessary attributes on all elements
    """
    doc: readability.Document = readability.Document(html_text)
    summary: str = doc.summary()
    # remove class/id attributes
    tree = lxml.html.fromstring(summary)
    ctree = cleaner.clean_html(tree)
    # clean_html may replace the top-level
    # element with a div; set it back to html if possible
    if ctree.tag == "div":
        ctree.tag = "html"
    html_bytes: bytes = lxml.html.tostring(ctree)
    # should html.unescape be called here? Or should that be handled
    # elsewhere/when parsing into text
    return html_bytes.decode("utf-8")