Example #1
import re

import BeautifulSoup  # BeautifulSoup 3.x module
import hyphenate  # hyphenation helper providing hyphenate_word()


def _MungeHyphenate(root_tag):
    """Insert soft hyphens into every text node under root_tag, skipping <pre> blocks."""
    for text in root_tag.findAll(text=True):
        if text.findParent('pre'):
            continue
        text_parts = re.split(r'(&[^;]{2,6};)', text)
        new_text = []
        for text_part in text_parts:
            if not text_part:
                continue
            if '&' == text_part[0]:
                new_text.append(text_part)
            else:
                words = re.split(r'\s+', text_part)
                # The join string below is a literal U+00AD (Unicode soft hyphen)
                # -- only two UTF-8 bytes, and it doesn't clutter up the source view!
                words = [
                    u'­'.join(hyphenate.hyphenate_word(word)) for word in words
                ]
                new_text.append(' '.join(words))
        text.replaceWith(BeautifulSoup.NavigableString(''.join(new_text)))
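
A minimal usage sketch (the HTML fragment below is invented for illustration; it assumes BeautifulSoup 3 and the hyphenate module are importable):

soup = BeautifulSoup.BeautifulSoup(
    '<p>internationalization</p><pre>leave this verbatim</pre>')
_MungeHyphenate(soup)
print unicode(soup)  # the <p> text now carries U+00AD soft hyphens; the <pre> is untouched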
Example #2
import re

import BeautifulSoup  # BeautifulSoup 3.x module
import util  # project-local helper module (provides util.Strip); import path assumed

# _ExtractFromHtmlGeneric (the generic fallback extractor) is defined elsewhere in this module.


def ExtractFromHtml(url, html):
    """Given a string of HTML, remove nasty bits, score and pick bit to keep."""
    if re.search(r'^http://(www\.)?reddit\.com/.*/comments/', url, re.I):
        strainer = BeautifulSoup.SoupStrainer(
            attrs={'class': re.compile(r'thing.*link|usertext border')})
        soup = BeautifulSoup.BeautifulSoup(html, parseOnlyThese=strainer)
        body = soup.find(attrs={'class': re.compile(r'\busertext-body\b')})
        if not body:
            body = soup.find('a', attrs={'class': re.compile(r'\btitle\b')})
            body = body and body.text or soup
        return soup, body
    elif re.search(r'^http://(www\.)?xkcd\.com/\d+', url, re.I):
        soup = BeautifulSoup.BeautifulSoup(html)
        img = soup.find(alt=True, title=True)
        cont = img.parent.parent
        for tag in cont.findAll(('br', 'div')):
            util.Strip(tag)
        return soup, cont
    elif re.search(r'^http://groups\.google\.com/', url, re.I):
        strainer = BeautifulSoup.SoupStrainer(attrs={'class': 'maincontbox'})
        soup = BeautifulSoup.BeautifulSoup(html, parseOnlyThese=strainer)
        return _ExtractFromHtmlGeneric(url, unicode(soup))
    elif re.search(r'^http://(www\.)?nytimes\.com/', url, re.I):
        soup = BeautifulSoup.BeautifulSoup(html)
        bodies = soup.findAll(attrs={'class': 'articleBody'})
        if bodies:
            # Put the first "article body" contents into the second -- for some
            # reason NYT splits the lead-in text into its own "body".
            while bodies[0].contents:
                bodies[1].insert(0, bodies[0].contents[-1])
        return _ExtractFromHtmlGeneric(url, unicode(soup))
    elif re.search(r'\.txt(\?|$)', url, re.I):
        soup = BeautifulSoup.BeautifulSoup()
        pre = BeautifulSoup.Tag(soup, 'pre')
        pre.insert(0, BeautifulSoup.NavigableString(html))
        soup.insert(0, pre)
        return soup, soup
    else:
        return _ExtractFromHtmlGeneric(url, html)
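
A hypothetical call site (the url and html values are invented for illustration), exercising the plain-text branch that wraps the content in a <pre> tag:

url = 'http://example.com/notes.txt'
html = 'Plain text fetched from a .txt URL.'
soup, body = ExtractFromHtml(url, html)
print unicode(body)  # -> <pre>Plain text fetched from a .txt URL.</pre>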