Example #1
0
def extract_feed_links(response):
    """Yield feed URLs advertised in *response*'s HTML.

    Looks for ``<link rel="alternate" type="...">`` tags whose ``type``
    is one of the known feed MIME types, and rebuilds each ``href``
    against the response URL so relative links become absolute.
    """
    url_parts = urllib.parse.urlsplit(response.url)
    soup = get_soup(response.content, response.encoding)
    for mimetype in FEED_MIMETYPES:
        matcher = _check_keys(rel=['alternate'], type=[mimetype])
        for link_tag in soup.find_all(matcher):
            yield rebuild_url(link_tag.attrs['href'], url_parts)
Example #2
0
def try_get_icon_url(url, *splits):
    """Return the first fetchable icon URL built from *url* and a split.

    For each non-None url-split, rebuild an absolute URL and try to
    fetch it. A candidate is accepted when the fetch succeeds, the body
    is non-empty, and the content type is not HTML.

    :param url: candidate icon href (possibly relative).
    :param splits: ``urllib.parse.SplitResult`` bases to resolve against;
                   ``None`` entries are skipped.
    :return: the final (post-redirect) URL of the first working icon,
             or ``None`` if no candidate works.
    """
    for split in splits:
        if split is None:
            continue
        rebuilt = rebuild_url(url, split)
        # Keep the try body minimal: only the network call and the
        # status check can raise here.
        try:
            response = jarr_get(rebuilt, conf.crawler.timeout,
                                conf.crawler.user_agent)
            response.raise_for_status()
        except Exception:
            # Best-effort crawl: log the failure and try the next split.
            logger.exception('something went wrong while fetching %r',
                             rebuilt)
            continue
        content_type = response.headers.get('content-type', '')
        # if html in content-type, we assume it's a fancy 404 page.
        # raise_for_status() already guaranteed an ok status, so the
        # original `response.ok` re-check was redundant and is dropped.
        if 'html' not in content_type and response.content:
            return response.url
    return None