示例#1
0
def html(value):
    """Sanitize an HTML fragment and convert its newlines to ``<br />``.

    Steps, in order:

    1. Normalize ``\\r\\n`` line endings to ``\\n`` and parse the markup.
    2. Remove every HTML comment node.
    3. For every tag: mark it hidden when its name is not in ``valid_tags``,
       keep only attributes whose name is in ``valid_attrs`` and whose value
       passes the ``url`` helper, add ``rel="nofollow"`` to ``<a>`` tags, and
       replace ``<pre>`` tags by an escaped plain-text version.
    4. Render the tree and put ``<br />`` in front of every newline, except
       the newlines that were inside ``<pre>``.

    Args:
        value: The HTML markup to clean, as a string.

    Returns:
        The cleaned markup, decoded from UTF-8.
    """
    value = value.replace("\r\n", "\n")
    soup = BeautifulSoup(value)

    # remove HTML comments
    for comment in soup.findAll(
            text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # limit tags and attributes, 'nofollow' links, escape <pre> contents
    for tag in soup.findAll(True):
        if tag.name not in valid_tags:
            tag.hidden = True
        tag.attrs = [(attr, val) for attr, val in tag.attrs
                     if attr in valid_attrs and url(val)]
        if tag.name == 'a':
            tag.attrs.append(['rel', 'nofollow'])
        if tag.name == 'pre':
            # An empty <pre></pre> has no children; indexing contents[0]
            # unconditionally would raise IndexError on such input.
            # NOTE(review): only the first child is kept, as before; any
            # further children of the <pre> are dropped -- confirm intended.
            first_child = tag.contents[0] if tag.contents else ''
            # Convert < into &lt; and temporarily turn \n into \r so the
            # linebreaking step below does not add <br />s inside <pre>.
            tag.replaceWith(
                '<pre>%s</pre>'
                % first_child.replace('<', '&lt;').replace('\n', '\r'))

    # Add a <br /> before every remaining newline, then restore the
    # newlines that were protected as \r inside <pre>.
    rendered = soup.renderContents().decode('utf8')
    rendered = rendered.replace("\n", "<br />\n")
    rendered = rendered.replace('\r', '\n')

    return rendered
示例#2
0
    def fetch_burger_string(self):
        """Fetch the html from the TASTY_URL and scrape it to extract the
        available burger for the current friday.

        The page is searched for text nodes containing "burger" preceded by
        a non-letter character. Of those, the last one that mentions neither
        "veggie" nor "neu" is used. If that node sits inside a ``<p>``
        element, the concatenated text of the whole paragraph is the result;
        otherwise the text node itself is.

        Returns:
            The scraped burger text, or an empty string when the response
            status is not 200 or when any error occurs (errors are logged,
            not raised).
        """

        # return value
        burger_string = ''

        try:
            # fetch html from the given url
            url_fetch_response = urlfetch.fetch(TASTY_URL)
            if url_fetch_response.status_code == 200:
                # parse html document
                soup = BeautifulSoup(url_fetch_response.content)

                # find all elements with the text 'burger' in it
                burgers = soup.findAll(
                    text=re.compile('[^A-Za-z]burger', re.IGNORECASE))

                # find an element which contains neither 'veggie' nor 'neu';
                # there is no break, so the last such element wins
                burger_element = None
                for item in burgers:
                    if re.search(r'veggie|neu', item, re.IGNORECASE) is None:
                        burger_element = item
                # stop here if there is no valid element
                if burger_element is None:
                    raise Exception('Could not find a burger element')

                # find the parent 'p' element
                parent_p_burger_element = burger_element.findParent(name='p')

                if parent_p_burger_element:
                    # get all text without the html tags and set the
                    # burger_string
                    burger_string = ''.join(
                        parent_p_burger_element.findAll(text=True))
                else:
                    burger_string = burger_element

                # TODO: look at the next p element if there is a second half

        except Exception as e:
            # Best effort: log the failure and fall through to return
            # whatever burger_string holds (normally the empty default).
            logging.error('UpdateHandler::fetch_burger_string() - ' + str(e))

        # The scraped value was previously computed but never handed back.
        return burger_string
示例#3
0
import re
import urllib

from vendor.BeautifulSoup import BeautifulSoup


# Page that is scraped for the current burger offer.
URL = "http://www.tasty-babelsberg.de"

# Text nodes mentioning one of these words are skipped.
excluded_words = re.compile(r'veggie|neu', re.IGNORECASE)

soup = BeautifulSoup(urllib.urlopen(URL))

# Every text node containing "burger" preceded by a non-letter character.
burgerArray = soup.findAll(text=re.compile('[^A-Za-z]burger', re.IGNORECASE))
print(burgerArray)

# Walk all hits without stopping early, so the last acceptable one is kept.
burger_element = None
for item in burgerArray:
    if excluded_words.search(item) is None:
        burger_element = item

if burger_element:
    parent_burger_p = burger_element.findParent(name='p')
    if not parent_burger_p:
        # No enclosing paragraph: print the bare text node.
        print(unicode(burger_element))
    else:
        # Print the paragraph's text on a single line, without markup.
        string = ''.join(parent_burger_p.findAll(text=True))
        print(unicode(string.replace('\n', ' ').strip()))