def extract_opengraph(tree):
    '''Collect page metadata from OpenGraph meta tags (https://ogp.me/).

    Only <meta> elements located directly under <head> whose "property"
    attribute starts with "og:" are inspected. When a property occurs
    several times, the last usable occurrence wins.

    Returns a 5-tuple (title, author, url, description, site_name) in which
    every item has been passed through trim(). The og:type and og:locale
    properties are not extracted.
    '''
    # properties whose content is copied over without further checks
    plain_fields = {
        'og:site_name': 'site_name',
        'og:title': 'title',
        'og:description': 'description',
        'og:author': 'author',
        'og:article:author': 'author',
    }
    found = dict.fromkeys(
        ('title', 'author', 'url', 'description', 'site_name'))
    for meta in tree.xpath('//head/meta[starts-with(@property, "og:")]'):
        value = meta.get('content')
        # safeguard: ignore tags whose content is missing or empty
        if not value:
            continue
        prop = meta.get('property')
        if prop == 'og:url':
            # the original URL is only kept if validate_url() accepts it
            if validate_url(value)[0] is True:
                found['url'] = value
        elif prop in plain_fields:
            found[plain_fields[prop]] = value
    return (
        trim(found['title']),
        trim(found['author']),
        trim(found['url']),
        trim(found['description']),
        trim(found['site_name']),
    )
def extract_url(tree, default_url=None):
    '''Extract the page URL from the <link> elements of the document head.

    Resolution order:
      1. the href of the first rel="canonical" link, if it matches
         URL_COMP_CHECK;
      2. otherwise the href of a rel="alternate" link whose hreflang is
         "x-default" and whose href matches URL_COMP_CHECK (if several
         qualify, the last one wins);
      3. otherwise default_url.

    A candidate starting with "/" is completed with the scheme and host
    taken from the first og:/twitter: meta tag whose content starts with
    http:// or https://. The result is finally checked with validate_url().

    Args:
        tree: parsed HTML tree supporting find() and iterfind().
        default_url: fallback used when no suitable link is found.

    Returns:
        The output of normalize_url() for the validated URL, or None if
        there is no candidate or validate_url() rejects it.
    '''
    # https://www.tutorialrepublic.com/html-reference/html-base-tag.php
    # default url as fallback
    url = default_url
    # try canonical link first
    element = tree.find('.//head//link[@rel="canonical"]')
    if (element is not None and 'href' in element.attrib
            and URL_COMP_CHECK.match(element.attrib['href'])):
        url = element.attrib['href']
    # try default language link
    else:
        for element in tree.iterfind('.//head//link[@rel="alternate"]'):
            if element.get('hreflang') != 'x-default':
                continue
            # an alternate link is not guaranteed to carry an href:
            # look it up safely instead of indexing attrib directly
            href = element.get('href')
            if href is not None and URL_COMP_CHECK.match(href):
                LOGGER.debug(
                    html.tostring(element,
                                  pretty_print=False,
                                  encoding='unicode').strip())
                url = href
    # add domain name if it's missing
    if url is not None and url.startswith('/'):
        for element in tree.iterfind('.//head//meta[@content]'):
            # the "name" attribute takes precedence over "property"
            if 'name' in element.attrib:
                attrtype = element.attrib['name']
            elif 'property' in element.attrib:
                attrtype = element.attrib['property']
            else:
                continue
            if attrtype.startswith(('og:', 'twitter:')):
                domain_match = re.match(r'https?://[^/]+',
                                        element.attrib['content'])
                if domain_match:
                    # prepend scheme and host to the relative URL
                    url = domain_match.group(0) + url
                    break
    # sanity check: don't return invalid URLs
    if url is not None:
        validation_result, parsed_url = validate_url(url)
        if validation_result is False:
            url = None
        else:
            url = normalize_url(parsed_url)
    return url
def examine_meta(tree):
    '''Gather page metadata from the <meta> elements of the document head.

    The values returned by extract_opengraph() are used as a starting
    point. If all five of them are truthy they are returned right away
    (the "tags" entry then stays None). Otherwise the remaining meta tags
    are scanned and only fill in fields that are still None, while tags
    and keywords are accumulated in a list.

    Returns a dict with the keys title, author, url, hostname, description,
    sitename, date, categories and tags; keys not handled here stay None.
    '''
    metadata = dict.fromkeys([
        'title', 'author', 'url', 'hostname', 'description', 'sitename',
        'date', 'categories', 'tags'
    ])
    # bootstrap from potential OpenGraph tags
    title, author, url, description, site_name = extract_opengraph(tree)
    # shortcut: nothing left to look for if every value is already defined
    if all((title, author, url, description, site_name)):
        metadata.update(title=title, author=author, url=url,
                        description=description, sitename=site_name)
        return metadata
    tags = []
    # skim through meta tags
    for meta in tree.xpath('//head/meta[@content]'):
        value = meta.get('content')
        # skip empty content
        if not value:
            continue
        attrs = meta.attrib
        if 'property' in attrs:
            prop = meta.get('property')
            # no opengraph a second time
            if prop.startswith('og:'):
                continue
            if prop == 'article:tag':
                tags.append(value)
            elif prop in ('author', 'article:author'):
                if author is None:
                    author = value
        elif 'name' in attrs:
            name = meta.get('name')
            if name in ('author', 'byl', 'dc.creator', 'sailthru.author'):
                if author is None:
                    author = value
            elif name in ('title', 'dc.title', 'sailthru.title',
                          'twitter:title'):
                if title is None:
                    title = value
            elif name in ('description', 'dc.description', 'dc:description',
                          'sailthru.description', 'twitter:description'):
                if description is None:
                    description = value
            elif name in ('publisher', 'DC.publisher', 'twitter:site',
                          'application-name') or 'twitter:app:name' in name:
                if site_name is None:
                    site_name = value
            elif name == 'twitter:url':
                # only accept the URL if it validates
                if url is None and validate_url(value)[0] is True:
                    url = value
            elif name == 'keywords':
                tags.append(value)
        elif 'itemprop' in attrs:
            itemprop = meta.get('itemprop')
            if itemprop == 'author':
                if author is None:
                    author = value
            elif itemprop == 'description':
                if description is None:
                    description = value
        # other types: log anything which is not a known technical tag
        elif ('charset' not in attrs and 'http-equiv' not in attrs
              and 'property' not in attrs):
            LOGGER.debug(
                html.tostring(meta, pretty_print=False,
                              encoding='unicode').strip())
    metadata.update(title=title, author=author, url=url,
                    description=description, sitename=site_name, tags=tags)
    return metadata
def examine_meta(tree):
    '''Search meta tags for relevant information.

    The values returned by extract_opengraph() are used as a starting
    point. If all five of them are truthy they are returned right away
    (the "tags" entry is then left at its initial None). Otherwise the
    remaining <head>/<meta> elements with a content attribute are scanned:
    already known values are kept, missing ones are filled in, and tags
    and keywords are accumulated in a list.

    Args:
        tree: parsed HTML tree supporting iterfind().

    Returns:
        A dict initialised from METADATA_LIST in which title, author, url,
        description, sitename and tags have been set.
    '''
    metadata = dict.fromkeys(METADATA_LIST)
    # bootstrap from potential OpenGraph tags
    title, author, url, description, site_name = extract_opengraph(tree)
    # test if all return values have been assigned
    if all((title, author, url, description, site_name)):
        metadata['title'], metadata['author'], metadata['url'], metadata[
            'description'], metadata[
                'sitename'] = title, author, url, description, site_name
        return metadata
    tags, backup_sitename = [], None
    # skim through meta tags
    for elem in tree.iterfind('.//head/meta[@content]'):
        # skip empty content
        if not elem.get('content'):
            continue
        content_attr = elem.get('content')
        # property
        if 'property' in elem.attrib:
            # no opengraph a second time
            if elem.get('property').startswith('og:'):
                continue
            if elem.get('property') == 'article:tag':
                tags.append(content_attr)
            elif elem.get('property') in ('author', 'article:author'):
                author = author or content_attr
        # name attribute
        elif 'name' in elem.attrib:
            # every comparison below is made on the lower-cased name
            name_attr = elem.get('name').lower()
            # author
            if name_attr in METANAME_AUTHOR:
                author = author or content_attr
            # title
            elif name_attr in METANAME_TITLE:
                title = title or content_attr
            # description
            elif name_attr in METANAME_DESCRIPTION:
                description = description or content_attr
            # site name
            elif name_attr in METANAME_PUBLISHER:
                site_name = site_name or content_attr
            # weaker site name hints, only used as a fallback (last one wins);
            # the substring test uses name_attr so that it is
            # case-insensitive like the other checks
            elif name_attr in ('twitter:site', 'application-name'
                               ) or 'twitter:app:name' in name_attr:
                backup_sitename = content_attr
            # url
            elif name_attr == 'twitter:url':
                if url is None and validate_url(content_attr)[0] is True:
                    url = content_attr
            # keywords
            elif name_attr == 'keywords':  # 'page-topic'
                tags.append(content_attr)
        elif 'itemprop' in elem.attrib:
            if elem.get('itemprop') == 'author':
                author = author or content_attr
            elif elem.get('itemprop') == 'description':
                description = description or content_attr
            elif elem.get('itemprop') == 'headline':
                title = title or content_attr
            # to verify: itemprop == 'name' as a possible title source
        # other types
        else:
            if not any(key in elem.attrib
                       for key in ('charset', 'http-equiv', 'property')):
                LOGGER.debug(
                    'unknown attribute: %s',
                    html.tostring(elem, pretty_print=False,
                                  encoding='unicode').strip())
    # backups
    if site_name is None and backup_sitename is not None:
        site_name = backup_sitename
    # copy
    metadata['title'], metadata['author'], metadata['url'], metadata[
        'description'], metadata['sitename'], metadata[
            'tags'] = title, author, url, description, site_name, tags
    return metadata
def examine_meta(tree):
    '''Search meta tags for relevant information.

    The values returned by extract_opengraph() are used as a starting
    point. If all five of them are truthy they are stored and returned
    right away, without assigning the tags attribute. Otherwise the
    remaining <head>/<meta> elements with a content attribute are scanned,
    after removing whatever HTML_STRIP_TAG matches from their content:
    authors are merged through normalize_authors(), tags are collected
    through normalize_tags(), and the other fields are only filled in when
    still empty.

    Args:
        tree: parsed HTML tree supporting iterfind().

    Returns:
        A Document instance whose title, author, url, description,
        sitename and (unless the shortcut applies) tags attributes are set.
    '''
    metadata = Document()  # alt: Metadata()
    # bootstrap from potential OpenGraph tags
    title, author, url, description, site_name = extract_opengraph(tree)
    # test if all return values have been assigned
    if all((title, author, url, description, site_name)):
        metadata.title = title
        metadata.author = author
        metadata.url = url
        metadata.description = description
        metadata.sitename = site_name
        return metadata
    tags, backup_sitename = [], None
    # skim through meta tags
    for elem in tree.iterfind('.//head/meta[@content]'):
        # skip empty content
        if not elem.get('content'):
            continue
        content_attr = HTML_STRIP_TAG.sub('', elem.get('content'))
        # property
        if 'property' in elem.attrib:
            # no opengraph a second time
            if elem.get('property').startswith('og:'):
                continue
            if elem.get('property') == 'article:tag':
                tags.append(normalize_tags(content_attr))
            elif elem.get('property') in PROPERTY_AUTHOR:
                author = normalize_authors(author, content_attr)
            elif elem.get('property') == 'article:publisher':
                site_name = site_name or content_attr
        # name attribute
        elif 'name' in elem.attrib:
            # every comparison below is made on the lower-cased name
            name_attr = elem.get('name').lower()
            # author
            if name_attr in METANAME_AUTHOR:
                author = normalize_authors(author, content_attr)
            # title
            elif name_attr in METANAME_TITLE:
                title = title or content_attr
            # description
            elif name_attr in METANAME_DESCRIPTION:
                description = description or content_attr
            # site name
            elif name_attr in METANAME_PUBLISHER:
                site_name = site_name or content_attr
            # weaker site name hints, only used as a fallback (last one wins);
            # the substring test uses name_attr so that it is
            # case-insensitive like the other checks
            elif name_attr in TWITTER_ATTRS or 'twitter:app:name' in name_attr:
                backup_sitename = content_attr
            # url
            elif name_attr == 'twitter:url':
                if url is None and validate_url(content_attr)[0] is True:
                    url = content_attr
            # keywords
            elif name_attr in METANAME_TAG:  # 'page-topic'
                tags.append(normalize_tags(content_attr))
        elif 'itemprop' in elem.attrib:
            if elem.get('itemprop') == 'author':
                author = normalize_authors(author, content_attr)
            elif elem.get('itemprop') == 'description':
                description = description or content_attr
            elif elem.get('itemprop') == 'headline':
                title = title or content_attr
            # to verify: itemprop == 'name' as a possible title source
        # other types
        elif all(key not in elem.attrib for key in EXTRA_META):
            LOGGER.debug(
                'unknown attribute: %s',
                tostring(elem, pretty_print=False, encoding='unicode').strip())
    # backups
    if site_name is None and backup_sitename is not None:
        site_name = backup_sitename
    # copy
    metadata.title = title
    metadata.author = author
    metadata.url = url
    metadata.description = description
    metadata.sitename = site_name
    metadata.tags = tags
    return metadata