Пример #1
0
 def get_social_media_links(self, link, DEFAULT_SM, default_dict):
     """Collect links from a page under each social media name they contain.

     - link: address of the page to download.
     - DEFAULT_SM: iterable of names; each one is tested with ``in`` against
       every item produced by ``find_links_tree``.
     - default_dict: mapping that is updated in place; it must already be able
       to supply a list-like value for every matching name, because matches
       are added with ``+=``.

     Returns the same ``default_dict`` object after it has been updated.
     """
     # NOTE(review): verify=False switches off TLS certificate verification
     # for this request -- confirm that this is intentional.
     response = requests.get(link, verify=False, timeout=30)
     body = response.content
     content_type = response.headers.get('content-type')
     document = parse_html_bytes(body, content_type)

     # Materialise every link first so the mapping is only touched once the
     # whole document has been scanned.
     found_links = list(find_links_tree(document))
     for found in found_links:
         for name in DEFAULT_SM:
             if name not in found:
                 continue
             default_dict[name] += [found]
     return default_dict
Пример #2
0
def find_social_links(url, timeout=30) -> set:
    """
    Find the social media links on a webpage.

    - url: The url of the webpage to search (String)
    - timeout: Seconds to wait for the server before giving up; handed
      straight to ``requests.get``. Defaults to 30.

    Returns the set of distinct items yielded by ``find_links_tree`` for the
    parsed page. Any exception raised by ``requests.get`` (including a
    timeout) propagates to the caller.
    """
    # Without an explicit timeout requests waits indefinitely on a host that
    # accepts the connection but never answers.
    response = requests.get(url, timeout=timeout)
    tree = parse_html_bytes(response.content,
                            response.headers.get('content-type'))
    return set(find_links_tree(tree))
def test_href():
    """Expect five links from markup mixing several href-like attributes.

    The fixture contains plain ``href`` attributes on ``<a>`` and
    ``<fb:like>`` elements plus ``data-count-href`` and ``data-href``
    attributes.
    """
    markup = """
        <a href="http://feeds.feedburner.com/TnsGlobalPressReleases">
        <fb:like href="http://www.facebook.com/elDiarioEs">
        <a class="twitter-follow-button" href="https://twitter.com/NASA">
        <a class="github-button"
            href="https://github.com/igrigorik/githubarchive.org"
            data-count-href="/igrigorik/githubarchive.org/stargazers">
        <div class="fb-page" data-href="https://www.facebook.com/facebook"
              data-tabs="timeline" data-small-header="false">
    """
    document = etree.HTML(markup)
    extracted = list(find_links_tree(document))
    # The parsed tree is used as the failure message, as in the other tests.
    assert len(extracted) == 5, document
Пример #4
0
def get_social_media(url, timeout=30):
    """Map known social media names to a matching link found on a webpage.

    - url: address of the page to download and scan.
    - timeout: seconds to wait for the server before giving up; handed
      straight to ``requests.get``. Defaults to 30.

    Returns a dict whose keys are the entries of the built-in ``media`` list
    that occur as a substring of at least one extracted link, in the order of
    that list. When several links contain the same name, the
    lexicographically smallest one is stored, so repeated runs over the same
    page give the same answer.
    """
    # NOTE(review): 'google plus' contains a space and is therefore unlikely
    # to occur inside a URL -- confirm whether a different token was intended.
    media = [
        'facebook', 'linkedin', 'twitter', 'youtube', 'github', 'google plus',
        'pinterest', 'instagram', 'snapchat', 'flipboard', 'flickr', 'weibo',
        'periscope', 'telegram', 'soundcloud', 'feedburner', 'vimeo',
        'slideshare', 'vkontakte', 'xing'
    ]
    # Without an explicit timeout requests waits indefinitely on a host that
    # never answers.
    res = requests.get(url, timeout=timeout)
    tree = parse_html_bytes(res.content, res.headers.get('content-type'))

    # De-duplicate, then sort: iterating a bare set would make the link
    # chosen for a name depend on hash ordering.
    links = sorted(set(find_links_tree(tree)))

    social = dict()
    for name in media:
        matches = [link for link in links if name in link]
        if matches:
            social[name] = matches[0]
    return social
def test_broken_href():
    """Expect no links from an anchor whose ``href`` has no value."""
    markup = """
        <a href>
    """
    document = etree.HTML(markup)
    extracted = list(find_links_tree(document))
    assert len(extracted) == 0, document
def test_twitter():
    """Expect two results from ``twitter:site`` and ``twitter:creator`` metas."""
    markup = """
        <meta name="twitter:site" content="@fluquid_ds">
        <meta name="twitter:creator" content="@fluquid_ds">
    """
    document = etree.HTML(markup)
    extracted = list(find_links_tree(document))
    assert len(extracted) == 2, document
Пример #7
0
import requests
from html_to_etree import parse_html_bytes
from extract_social_media import find_links_tree

# Download the repository page. The timeout keeps the script from hanging
# forever if the server stops responding.
res = requests.get('https://github.com/HarshCasper/Rotten-Scripts', timeout=30)

# Parse the raw body, passing the Content-Type header along with it.
tree = parse_html_bytes(res.content, res.headers.get('content-type'))

# Collapse duplicates before printing.
links = set(find_links_tree(tree))
print(links)