def from_html(htmlstring, source=None):
    """
    Extract all img urls from an html string.
    """
    if not htmlstring:
        return []
    soup = make_soup(htmlstring)
    out_imgs = []
    for tag, attr in IMG_TAGS:
        for el in soup.find_all(tag):
            img_url = el.attrs.get(attr)
            if not img_url:
                continue
            # protocol-relative embeds, e.g. "//cdn.example.com/img.png"
            if img_url.startswith('//'):
                img_url = "http:{}".format(img_url)
            # only take images with known formats.
            fmt = url.is_image(img_url)
            if not fmt:
                continue
            # absolutify relative images if we know their source;
            # otherwise skip them.
            if img_url.startswith('/') or not img_url.startswith('http'):
                if source:
                    img_url = urljoin(source, img_url)
                else:
                    continue
            out_imgs.append(img_url)
    return uniq(out_imgs)
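
# A minimal usage sketch for from_html above, assuming IMG_TAGS includes
# ('img', 'src') and url.is_image() keys off the file extension; the
# markup and source url are hypothetical.
_html = (
    '<img src="/img/logo.png">'
    '<img src="//cdn.example.com/photo.jpg">'
)
imgs = from_html(_html, source='http://example.com/story')
# -> ['http://example.com/img/logo.png', 'http://cdn.example.com/photo.jpg']
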
def make_abs(htmlstring, source_url):
    """
    Make "src" and "href" attributes absolute.
    """
    soup = make_soup(htmlstring)

    # links
    for a in soup.find_all('a'):
        href = a.attrs.get('href')
        if not href:
            continue
        # fragment-only links are meaningless off the page; drop them.
        if href.startswith('#'):
            a.attrs.pop('href')
        elif href.startswith('/') or not href.startswith('http'):
            if source_url:
                a['href'] = urljoin(source_url, href)

    # images
    for img in soup.find_all('img'):
        src = img.attrs.get('src')
        if not src:
            continue
        # protocol-relative embeds, e.g. "//cdn.example.com/img.png"
        if src.startswith('//'):
            img['src'] = "http:{}".format(src)
        elif src.startswith('/') or not src.startswith('http'):
            if source_url:
                img['src'] = urljoin(source_url, src)

    return soup
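
# Sketch of make_abs in use; the markup and source url are hypothetical.
_soup = make_abs(
    '<a href="/about">about</a><a href="#top">top</a><img src="/logo.png">',
    'http://example.com')
# the first <a> now points at http://example.com/about, the fragment-only
# link loses its href, and the <img> src becomes http://example.com/logo.png.
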
def extract(soup, tags=PESSIMISTIC_TAGS, attrs=TAG_ATTRS, vals=TAG_VALS):
    """
    Extract author attrs from meta tags.
    Only works for english articles.
    """
    # soupify
    if not isinstance(soup, BeautifulSoup):
        soup = make_soup(soup)

    # search popular author tags for authors.
    matches = []
    _authors = []
    for tag in tags:
        for attr in attrs:
            for val in vals:
                found = soup.find_all(tag, {attr: val})
                matches.extend(found)

    for match in matches:
        content = u''
        m = match.attrs.get('content', None)
        if m:
            content = m
        else:
            # any other tag: fall back on its text content.
            content = match.text or u''
        if len(content) > 0:
            _authors.extend(parse(content))

    return _format(_authors)
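
# Sketch: pulling an author out of a conventional meta tag, assuming the
# default PESSIMISTIC_TAGS / TAG_ATTRS / TAG_VALS cover
# ('meta', 'name', 'author') and that parse()/_format() normalize a
# single name.
authors = extract(make_soup('<meta name="author" content="Jane Doe">'))
# -> e.g. ['Jane Doe']
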
def _bypass_bitly_warning(url):
    """
    Sometimes bitly blocks unshorten attempts; this bypasses that by
    following the warning page's clickthrough link.
    """
    html_string = network.get(url)
    soup = make_soup(html_string)
    a = soup.find('a', {'id': 'clickthrough'})
    if a:
        return a.attrs.get('href')
    return url
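
# Hypothetical example: if bitly serves its interstitial warning page,
# the html contains <a id="clickthrough" href="...">, and that target is
# returned; otherwise the short url comes back unchanged.
long_url = _bypass_bitly_warning('http://bit.ly/xxxxxx')
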
def body_via_article_tag(soup, source_url):
    """
    Extract content from an "article" tag.
    """
    if not isinstance(soup, BeautifulSoup):
        soup = make_soup(soup)
    articles = soup.find_all('article')
    if len(articles):
        # take the first article tag on the page.
        raw_html = html.get_inner(articles[0])
        body = html.prepare(raw_html, source_url)
        return body, raw_html
    return None, None
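
# Sketch: pages that wrap their content in an <article> tag yield both a
# cleaned body and the tag's raw inner html; the markup is hypothetical.
body, raw = body_via_article_tag(
    make_soup('<article><p>Hello world.</p></article>'),
    'http://example.com/post')
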
def from_html(htmlstring, **kw):
    """
    Extract urls from an html string, optionally reconciling
    relative urls, embeds, and redirects.
    """
    source = kw.get('source', None)
    exclude_images = kw.get('excl_img', True)
    if not htmlstring:
        return []
    final_urls = []
    if source:
        source_domain = get_domain(source)
    soup = make_soup(htmlstring)
    for tag in URL_TAGS:
        for el in soup.find_all(tag):
            for attr in URL_ATTRS:
                href = el.attrs.get(attr, None)
                if not href:
                    continue
                url = reconcile_embed(href)
                if source:
                    url = redirect_back(url, source_domain)
                    if not is_abs(url):
                        url = urljoin(source, url)
                if not is_valid(url):
                    continue
                # optionally exclude images.
                if exclude_images and is_image(url):
                    continue
                final_urls.append(url)
    return uniq(final_urls)
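
# Sketch of url extraction with a source for absolutifying, assuming
# URL_TAGS / URL_ATTRS cover ('a', 'href'); the markup is hypothetical.
links = from_html(
    '<a href="/about">about</a><a href="http://other.com/x">x</a>',
    source='http://example.com')
# -> ['http://example.com/about', 'http://other.com/x']; image urls are
#    excluded by default (excl_img=True).
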
def extract(source_url, **kw):
    """
    Article extraction. The method is as follows:

    1. Get html from the url.
    2. Canonicalize the url.
    3. If not canonical, prepare the url.
    4. Extract meta tags.
    5. If embedly is active, use it for content extraction.
    6. If embedly doesn't return content or is not active, use readability.
    7. If readability doesn't return content, use the article tag.
    8. If authors aren't detected from meta tags, detect them in the
       article body.
    """
    type = kw.get('type', 'article')

    # fetch page
    page_html = network.get(source_url)

    # something failed.
    if not page_html:
        log.warning("Failed to extract html from {}".format(source_url))
        return None

    soup = make_soup(page_html)

    # get canonical url
    canonical_url = meta.canonical_url(soup)
    if not canonical_url:
        canonical_url = url.prepare(
            source_url, source=source_url, canonicalize=False)

    # domain
    domain = url.get_domain(canonical_url)

    # get meta tags + other data
    data = {
        'url': canonical_url,
        'domain': domain,
        'title': meta.title(soup, canonical_url),
        'description': meta.description(soup, canonical_url),
        'img_url': meta.img_url(soup, canonical_url),
        'created': meta.publish_date(soup, canonical_url),
        'favicon': meta.favicon(soup, canonical_url),
        'site_name': meta.site_name(soup, canonical_url),
        'authors': author.extract(soup),
        'type': type,
        'body': None
    }

    # embed videos
    if url.is_video(canonical_url):
        data['type'] = 'video'
        data['body'] = embed.video(canonical_url)
        return data

    # extract article body
    if data['type'] == 'article':
        if settings.EMBEDLY_ENABLED:
            data['body'] = body_via_embedly(canonical_url)
        if not data['body']:
            data['body'] = body_via_readability(page_html, canonical_url)

    # extract body from article tag
    body, raw_html = body_via_article_tag(soup, canonical_url)

    # merge body
    if not data['body']:
        data['body'] = body

    # get creators from raw article html
    if not len(data['authors']) and raw_html:
        data['authors'] = author.extract(
            raw_html, tags=author.OPTIMISTIC_TAGS)

    # remove site name from authors
    if data.get('site_name'):
        data['authors'] = [
            a.replace(data['site_name'].upper(), "").strip()
            for a in data['authors']
        ]

    # get links from raw_html + content
    links = [u for u in url.from_any(data['body']) if source_url not in u]
    for u in url.from_any(raw_html, source=source_url):
        if u not in links and (u != source_url or not u.startswith(source_url)):
            links.append(u)

    # split out internal / external / article links
    data['links'] = links
    return data
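
# Sketch of the pipeline end-to-end; the url is hypothetical and the
# field values depend on the page and on whether embedly is enabled.
article = extract('http://example.com/news/some-story')
if article:
    print(article['title'])
    print(article['authors'])
    print(len(article['links']))
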
def prepare(url, source=None, canonicalize=True,
            expand=True, keep_params=KEEP_PARAMS):
    """
    Operations that unshorten a url, reconcile embeds, resolve
    redirects, strip parameters (with optional ones to keep), and then
    attempt to canonicalize the url by checking the page source's
    metadata.

    All urls that enter `merlynne` are first treated with this function.
    """
    if not url or url == "":
        return None

    # encode.
    url = url.encode('utf-8', errors='ignore')

    # reconcile embeds.
    url = reconcile_embed(url)

    # reconcile redirects.
    url = redirect_back(url, source)

    # check for non-absolute urls.
    if source:
        source_domain = get_domain(source)
        # if the domain is in the source, attempt to absolutify it.
        if source_domain in url:
            if not is_abs(url):
                url = urljoin(source, url)

    # check for a missing scheme.
    if not get_scheme(url):
        url = "http://" + url

    # expand short urls.
    if expand:
        if is_shortened(url):
            url = unshorten(url, attempts=1)

    # canonicalize via the page's own metadata.
    if canonicalize:
        page_html = network.get(url)
        if page_html:
            soup = make_soup(page_html)
            _url = meta.canonical_url(soup)
            if _url:
                url = _url

    # if it got converted to None along the way, bail.
    if not url:
        return None

    # remove arguments, with optional parameters to keep.
    url = remove_args(url, keep_params)

    # remove index.html.
    url = re_index_html.sub('', url)

    # always remove the trailing slash.
    if url.endswith('/'):
        url = url[:-1]

    return url
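
# Sketch: prepare() normalizing a messy url without touching the network
# (canonicalize/expand off). Assumes reconcile_embed and redirect_back
# pass a plain url through unchanged and that utm_* params are not in
# KEEP_PARAMS.
clean = prepare(
    'http://example.com/a/index.html?utm_source=rss&utm_medium=feed',
    canonicalize=False, expand=False)
# -> 'http://example.com/a'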