def _from_url(url):  # pragma: no cover
    """Return the set of social-media links/handles found on the page at *url*.

    - url: address of the page to fetch and scan (str)
    """
    import requests
    from html_to_etree import parse_html_bytes
    # Fix: find_links_tree was referenced but never brought into scope even
    # though the function lazily imports everything else it uses; import it
    # here so the function is self-contained.
    from extract_social_media import find_links_tree

    res = requests.get(url)
    tree = parse_html_bytes(res.content, res.headers.get('content-type'))
    return set(find_links_tree(tree))
def get_social_media_links(self, link, DEFAULT_SM, default_dict):
    """Fetch *link* and bucket any social-media URLs found on the page.

    Each discovered URL containing a key from DEFAULT_SM is appended to the
    matching list in *default_dict*; the (mutated) dict is returned.
    NOTE(review): verify=False disables TLS certificate checking — confirm
    this is intentional for the target hosts.
    """
    response = requests.get(link, verify=False, timeout=30)
    tree = parse_html_bytes(response.content,
                            response.headers.get('content-type'))
    for found in find_links_tree(tree):
        for network in DEFAULT_SM:
            if network in found:
                default_dict[network].append(found)
    return default_dict
def find_social_links(url) -> set:
    """Return the set of social media links found on the webpage at *url*.

    - url: the address of the webpage to search (str)
    """
    res = requests.get(url)
    content_type = res.headers.get('content-type')
    return set(find_links_tree(parse_html_bytes(res.content, content_type)))
def extract_social_media_from_response(self, content, header):
    """Map each tracked social network in self.metas to a link in *content*.

    - content: raw HTML bytes of a response body
    - header: mapping of response headers (only 'content-type' is read)

    Returns a dict {meta: link} with at most one link per entry in
    self.metas. When several links match the same meta, which one is kept
    is arbitrary (set iteration order) — same as the original behavior.
    """
    tree = parse_html_bytes(content, header.get('content-type'))
    # Hoisted out of the loop: the original rebuilt list(set(...)) for every
    # entry in self.metas even though the link set never changes.
    links = set(find_links_tree(tree))
    result = {}
    for meta in self.metas:
        for link in links:
            if meta in link:
                result[meta] = link
    return result
def get_social_media(url):
    """Fetch *url* and return {network_name: link} for known social networks.

    Only one matching link per network is kept (later matches overwrite
    earlier ones via dict assignment), mirroring the original behavior.
    """
    media = [
        'facebook', 'linkedin', 'twitter', 'youtube', 'github',
        'google plus', 'pinterest', 'instagram', 'snapchat', 'flipboard',
        'flickr', 'weibo', 'periscope', 'telegram', 'soundcloud',
        'feedburner', 'vimeo', 'slideshare', 'vkontakte', 'xing',
    ]
    res = requests.get(url)
    tree = parse_html_bytes(res.content, res.headers.get('content-type'))
    links = set(find_links_tree(tree))
    social = dict()
    # Iterate the names directly instead of the `range(len(media))` index loop.
    for name in media:
        for link in links:
            if name in link:
                social[name] = link
    return social
def test_tree():
    """For every fixture record, parse the page and check that the expected
    substring appears in the extracted text.

    Records whose cleaned HTML is shorter than 100 characters are logged
    and skipped.
    """
    for record in get_data():
        meta = record['meta']
        page_url, expected = meta['url'], meta['contains']
        webdata = record['webdata']
        raw_body = webdata['byte_body']
        if len(clean_html(raw_body)) < 100:
            logging.warning('skipping %s', page_url)
            continue
        tree = parse_html_bytes(body=raw_body,
                                content_type=webdata['content-type'])
        assert expected in extract_text(tree), (
            page_url, expected, etree.tostring(tree, encoding='utf-8'))
def _request_html(self, url):
    """Fetch *url* and return its parsed element tree.

    Fixes two issues: a leftover debug ``print`` of the raw page bytes, and
    a missing content-type argument to ``parse_html_bytes`` — every other
    call site in this file passes the response content-type so the parser
    can detect the encoding.
    """
    res = requests.get(url)
    return parse_html_bytes(res.content, res.headers.get('content-type'))
def audit_html_bytes(body, content_type=''):
    """Audit HTML given its raw bytestring *body* and header *content_type*."""
    logging.debug('parse_html_bytes')
    return audit_etree(parse_html_bytes(body, content_type))
import requests
from html_to_etree import parse_html_bytes
from extract_social_media import find_links_tree

# Fetch the repository page, parse it, and print every social link found.
response = requests.get('https://github.com/HarshCasper/Rotten-Scripts')
parsed = parse_html_bytes(response.content,
                          response.headers.get('content-type'))
links = set(find_links_tree(parsed))
print(links)