Example #1
from BeautifulSoup import BeautifulSoup, Comment


def html(value):
    value = value.replace("\r\n","\n")
    soup = BeautifulSoup(value)

    # remove HTML comments
    for comment in soup.findAll(
            text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # limit tags and attributes, 'nofollow' links, escape <pre> contents
    for tag in soup.findAll(True):
        if tag.name not in valid_tags:
            tag.hidden = True
        tag.attrs = [(attr, val) for attr, val in tag.attrs
                     if attr in valid_attrs and url(val)]
        if tag.name == 'a':
            tag.attrs.append(['rel', 'nofollow'])
        if tag.name == 'pre':
            # escape < and turn \n into \r so the linebreaking step below
            # doesn't insert <br /> tags inside <pre> blocks
            escaped = tag.contents[0].replace('<', '&lt;').replace('\n', '\r')
            tag.replaceWith('<pre>%s</pre>' % escaped)

    # add a linebreak whenever there are two returns
    html = soup.renderContents().decode('utf8')
    html = html.replace("\n", "<br />\n")
    html = html.replace('\r', '\n')

    return html
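The function relies on three module-level names that the excerpt does not show: valid_tags, valid_attrs and a url() attribute-value check. A minimal usage sketch with assumed stand-ins for those names (not the original project's values):

# Hypothetical whitelists and URL check assumed by html() above.
valid_tags = ['p', 'i', 'strong', 'b', 'u', 'a', 'h1', 'h2', 'h3', 'pre', 'br']
valid_attrs = ['href', 'src']

def url(value):
    # assumed check: only allow plain http(s) attribute values
    return value.startswith('http://') or value.startswith('https://')

print html('See <a href="http://example.com" onclick="evil()">this</a>\n\nthanks')
# roughly: 'See <a href="http://example.com" rel="nofollow">this</a><br />\n<br />\nthanks'
# the onclick attribute is dropped, rel="nofollow" is appended, and blank lines become <br /> tags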
Example #2
import re
import urlparse
import httplib

from BeautifulSoup import BeautifulSoup


def guess_type(url):
    # config, init_type(), url_fix() and CLIENT_TIMEOUT are provided by the
    # surrounding project modules and are not part of this excerpt
    url = urlparse.urlsplit(url_fix(url))
    if url.scheme == "https":
        client = httplib.HTTPSConnection(url.netloc, httplib.HTTPS_PORT,
                                         timeout=CLIENT_TIMEOUT)
    else:
        client = httplib.HTTPConnection(url.netloc, httplib.HTTP_PORT,
                                        timeout=CLIENT_TIMEOUT)
    type = None

    # 
    # from location
    # 

    ### wordpress / wordpress.com ###

    if type is None:
        if re.match(config.WORDPRESS_URL_MATCH, url.netloc):
            type = init_type(config.WORDPRESS, config.WORDPRESS_VERSION_COM, client)
            type.factory("http://" + url.netloc)

    # 
    # from body
    # 

    body = None
    response = None

    if type is None:
        try:
            # build the request path; the query string needs its '?' separator
            # and the fragment is never sent to the server
            path = url.path or "/"
            if url.query:
                path += "?" + url.query
            client.request("GET", path)
            response = client.getresponse()
            if response.status == 200:
                body = BeautifulSoup(response.read())
        except Exception:
            pass

    ### wordpress / wordpress.com | wordpress domain ###

    if type is None and body:
        _meta = str(body.find("meta", content=config.WORDPRESS_BODY_META_FIND))
        if re.search(config.WORDPRESS_BODY_META_SEARCH, _meta):
            version = (config.WORDPRESS_VERSION_COM
                       if re.match(config.WORDPRESS_URL_MATCH, url.netloc)
                       else config.WORDPRESS_VERSION_DOMAIN)
            type = init_type(config.WORDPRESS, version, client)
            type.url_accepted = True # avoid url re-check
            type.factory("http://" + url.netloc)

    ### etc... ###

    # if type == None and body:
    #     TODO
    
    ### unsupported ###

    if type is None and response:
        if response.status == 200:
            type = init_type(config.UNSUPORTED, config.UNSUPORTED_VERSION_UNSUPORTED)
            type.url_accepted = True


    return type
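The body-based WordPress check keys off the <meta name="generator"> tag that WordPress pages emit. A small sketch of that lookup with assumed stand-ins for config.WORDPRESS_BODY_META_FIND and config.WORDPRESS_BODY_META_SEARCH (the real patterns live in the project's config module):

import re
from BeautifulSoup import BeautifulSoup

# assumed stand-ins for the config values used above
META_FIND = re.compile(r'WordPress', re.IGNORECASE)
META_SEARCH = r'content="WordPress'

page = BeautifulSoup('<html><head>'
                     '<meta name="generator" content="WordPress 3.3.1" />'
                     '</head><body></body></html>')
meta = str(page.find("meta", content=META_FIND))
print bool(re.search(META_SEARCH, meta))  # True -> treated as a WordPress install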
Example #3
    def fetch_burger_string(self):
        """Fetch the html from the TASTY_URL and scrape it to extract the
        available burger for the current friday."""

        # return value
        burger_string = ''

        try:
            # fetch html from the given url
            url_fetch_response = urlfetch.fetch(TASTY_URL)
            if url_fetch_response.status_code == 200:
                # parse html document
                soup = BeautifulSoup(url_fetch_response.content)

                # find all elements whose text contains 'burger'
                burgers = soup.findAll(
                    text=re.compile('[^A-Za-z]burger', re.IGNORECASE))

                # find an element that mentions neither 'veggie' nor 'neu'
                burger_element = None
                for item in burgers:
                    if re.search(r'veggie|neu', item, re.IGNORECASE) is None:
                        burger_element = item
                # stop here if there is no valid element
                if burger_element is None:
                    raise Exception('Could not find a burger element')

                # find the parent 'p' element
                parent_p_burger_element = burger_element.findParent(name='p')

                if parent_p_burger_element:
                    # get all text without the html tags and set the
                    # burger_string
                    burger_string = ''.join(
                        parent_p_burger_element.findAll(text=True))
                else:
                    burger_string = burger_element

                # TODO: look at the next p element if there is a second half

        except Exception, e:
            logging.error('UpdateHandler::fetch_burger_string() - ' + str(e))

        # empty string when the fetch or the scraping failed
        return burger_string
Example #4
import re
import urllib

from vendor.BeautifulSoup import BeautifulSoup


URL = "http://www.tasty-babelsberg.de"


soup = BeautifulSoup(urllib.urlopen(URL))

burgerArray = soup.findAll(text=re.compile('[^A-Za-z]burger', re.IGNORECASE))
print burgerArray

burger_element = None
for item in burgerArray:
    if re.search(r'veggie|neu', item, re.IGNORECASE) is None:
        burger_element = item

if burger_element:
    parent_burger_p = burger_element.findParent(name='p')
    if parent_burger_p:
        string = ''.join(parent_burger_p.findAll(text=True))
        print unicode(string.replace('\n', ' ').strip())
    else:
        print unicode(burger_element)
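For reference, the '[^A-Za-z]burger' pattern used in both scrapers requires a non-letter character directly before 'burger', so compound words and text nodes that start with 'Burger' do not match:

import re

pattern = re.compile('[^A-Za-z]burger', re.IGNORECASE)
print bool(pattern.search(u'Freitag: Burger mit Pommes'))  # True  (space before 'Burger')
print bool(pattern.search(u'Burger des Tages'))            # False (nothing precedes 'Burger')
print bool(pattern.search(u'Cheeseburger deluxe'))         # False (letter precedes 'burger')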