Example No. 1
def main():
    arguments = docopt(__doc__,
                       version='Social shares ' + socialshares.__version__)
    url = arguments['<url>']
    attempts = int(arguments['--retry']) + 1
    plain = arguments['--plain']
    strict = arguments['--exit']
    platforms = arguments['<platforms>'] or socialshares.platforms.default

    try:
        counts = socialshares.fetch(url,
                                    platforms,
                                    attempts=attempts,
                                    strict=strict)
    except IOError:
        sys.exit(1)

    if plain:
        l = []
        for platform in platforms:
            count = counts[platform]
            if isinstance(count, dict):
                l.extend(count.values())
            else:
                l.append(count)
        print(" ".join(map(str, l)))
    else:
        print(json.dumps(counts, indent=2))
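This main() assumes a module docstring that defines the CLI for docopt, plus a few imports. Below is a minimal sketch consistent with the option names used above; the real socialshares CLI's wording and defaults may differ.

"""Social shares.

Usage:
  socialshares <url> [<platforms>...] [--retry=<n>] [--plain] [--exit]

Options:
  --retry=<n>  Number of times to retry failed requests [default: 0].
  --plain      Print counts as plain space-separated values.
  --exit       Exit with a non-zero status when a platform fails.
"""
import json
import sys

from docopt import docopt

import socialshares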
Example No. 2
 def test_facebookfql(self):
     counts = socialshares.fetch(
         url,
         ['facebookfql'],
     )
     self.assertIn('facebookfql', counts)
     self.assertIsInstance(counts['facebookfql'], dict)
Example No. 3
 def get_share_count(self):
     self.shares = socialshares.fetch(
         self.url, ['facebook', 'pinterest', 'google', 'linkedin', 'reddit'])
     return ((self.shares['reddit']['ups'] if 'reddit' in self.shares else 0)
             + (self.shares['facebook']['share_count'] if 'facebook' in self.shares else 0)
             + (self.shares['google'] if 'google' in self.shares else 0)
             + (self.shares['pinterest'] if 'pinterest' in self.shares else 0)
             + (self.shares['linkedin'] if 'linkedin' in self.shares else 0))
Example No. 4
def socialShareExtraction(jsonList):
    # Extract share counts for the first ten URLs; stats() is an
    # external helper that aggregates the JSON records by URL.
    urlList = list(stats(jsonList)["url"].keys())
    ss = []
    socialmedia = ['facebook', 'twitter', 'linkedin', 'google',
                   'pinterest', 'reddit', 'facebookfql']
    for i in range(10):
        counts = socialshares.fetch(urlList[i], socialmedia)
        ss.append(counts)
    return ss
Example No. 5
    def test_cli_json(self):
        py = socialshares.fetch(url)
        cli_raw = subprocess.check_output('socialshares {url}'.format(url=url), shell=True)
        cli = json.loads(cli_raw.decode('utf-8'))

        for k, v in py.items():
            self.assertIn(k, cli)
            self.assertIsInstance(v, int)
            self.assertTrue(is_close(py[k], cli[k]))
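The is_close() helper these CLI tests call is not shown. A plausible definition, assuming the intent is to tolerate share counts drifting slightly between the two fetches (the tolerance value is a guess):

def is_close(a, b, tolerance=0.1):
    # Counts can change between the Python and CLI fetches, so compare
    # within a relative tolerance rather than for exact equality.
    return abs(a - b) <= tolerance * max(abs(a), abs(b), 1)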
Example No. 6
    def test_cli_json(self):
        py = socialshares.fetch(url, ['facebook'])['facebook']
        cli_raw = subprocess.check_output('socialshares {url}'.format(url=url),
                                          shell=True)
        cli = json.loads(cli_raw.decode('utf-8'))['facebook']

        for k, v in py.items():
            self.assertIn(k, cli)
            self.assertTrue(is_close(py[k], cli[k]))
Example No. 7
    def set_social_media_share_counts(self):
        shares = socialshares.fetch(
            self.url,
            ['facebook', 'pinterest', 'google', 'linkedin', 'reddit'])

        self.shares['reddit'] = (shares['reddit']['ups']
                                 if 'reddit' in shares else 0)
        self.shares['facebook'] = (shares['facebook']['share_count']
                                   if 'facebook' in shares else 0)
        self.shares['google'] = (shares['google'] if 'google' in shares else 0)
        self.shares['pinterest'] = (shares['pinterest']
                                    if 'pinterest' in shares else 0)
        self.shares['linkedin'] = (shares['linkedin']
                                   if 'linkedin' in shares else 0)
Example No. 8
def main():
    arguments = docopt(__doc__, version='Social shares 0.1')
    url = arguments['<url>']
    attempts = int(arguments['--retry']) + 1
    plain = arguments['--plain']
    strict = arguments['--exit']
    platforms = arguments['<platforms>'] or socialshares.platforms.default
    todo = set(platforms)

    counts = {}
    attempt = 0
    while todo and attempt < attempts:
        attempt += 1
        # Only retry the platforms that are still missing a count.
        partial = socialshares.fetch(url, list(todo))
        todo = todo.difference(partial)
        counts.update(partial)

    if strict and len(counts) < len(platforms):
        sys.exit(1)
    elif plain:
        print " ".join(map(str, counts.values()))
    else:
        print json.dumps(counts, indent=2)
Example No. 9
    def update_social_data(self) -> None:
        if not self.url_raw:
            message = 'News ID={} has no url_raw field'.format(self.id)
            logger_crawlers.warning(message)
            return

        social_data = {}
        try:
            social_data = socialshares.fetch(
                self.url_raw, ['pinterest', 'linkedin', 'google', 'reddit'])
            fb_data = self._get_fb_data(self.url_raw)
            social_data.update(fb_data)
        except Exception:
            message = 'Error fetching social shares for {}'.format(
                self.url_raw)
            logger_crawlers.error(message, exc_info=True)

        self.social = social_data
        try:
            self.save()
        except Exception:
            message = 'Error saving News ID={} while updating social data'.format(
                self.id)
            logger_crawlers.error(message)
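The _get_fb_data() helper is defined elsewhere in that project. A hypothetical version that queries the Facebook Graph API's engagement field; the endpoint, token handling, and returned shape are all assumptions, not the project's actual code.

import os

import requests

# Hypothetical: the token is supplied via the environment.
FB_ACCESS_TOKEN = os.environ.get('FB_ACCESS_TOKEN', '')

def _get_fb_data(self, url):
    # Hypothetical helper (a method on the same News model): ask the
    # Graph API for engagement counts for the given URL.
    resp = requests.get('https://graph.facebook.com/', params={
        'id': url,
        'fields': 'engagement',
        'access_token': FB_ACCESS_TOKEN,
    })
    resp.raise_for_status()
    engagement = resp.json().get('engagement', {})
    return {'facebook': engagement.get('share_count', 0)}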
Example No. 10
 def test_pinterest(self):
     counts = socialshares.fetch(url, ['pinterest'], **self.defaults)
     self.assertIn('pinterest', counts)
     self.assertIsInstance(counts['pinterest'], int)
Example No. 11
 def test_linkedin(self):
     counts = socialshares.fetch(url, ['linkedin'], **self.defaults)
     self.assertIn('linkedin', counts)
     self.assertIsInstance(counts['linkedin'], int)
Example No. 12
 def test_google(self):
     counts = socialshares.fetch(url, ['google'], **self.defaults)
     self.assertIn('google', counts)
     self.assertIsInstance(counts['google'], int)
Example No. 13
 def test_default(self):
     counts = socialshares.fetch(url, **self.lax_defaults)
     self.assertEqual(set(counts.keys()),
                      set(socialshares.platforms.default))
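The self.defaults and self.lax_defaults fixtures these tests rely on are not shown. Plausible values, assuming they map onto the attempts and strict keywords that fetch() takes in the other examples:

import unittest

class SocialsharesTests(unittest.TestCase):
    def setUp(self):
        # Hypothetical fixtures: retry a few times; only the strict
        # variant fails when a platform returns nothing.
        self.defaults = {'attempts': 3, 'strict': True}
        self.lax_defaults = {'attempts': 3, 'strict': False}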
Example No. 14
 def update_shares(self):
     orig_shares = self.shares
     self.shares = socialshares.fetch(
         self.content_object.get_full_url(), ['facebook'],
         attempts=3).get('facebook', 0)
     if self.shares != orig_shares:
         self.save()
Example No. 15
def scrape_and_aggregate(domains):

    from bs4 import BeautifulSoup
    import urllib.request
    # import requests
    import socialshares
    import pandas as pd

    # Try to convert each url into a html document
    urls = []
    inaccessible = []
    for domain in domains:
        try:
            with urllib.request.urlopen(domain) as response:
                html = response.read()
        except Exception:
            inaccessible.append(domain)
            continue

        soup = BeautifulSoup(html, 'html.parser')

        # Get a list of urls from the domain
        for link in soup.find_all('a'):
            url = link.get('href')

            # For internal links, prepend the domain address
            try:
                if url.startswith('http'):
                    urls.append(url)

                elif url.startswith('/http'):
                    urls.append(url[1:])
                else:
                    urls.append(domain + url)

            # If the url is empty, skip it
            except AttributeError:
                continue

    # Run each url through SM apis to get share counts. Collect in a dictionary
    dct = {
        'URL': [],
        'Facebook': [],
        'Pinterest': [],
        'Google': [],
        'Linkedin': []
    }

    for url in urls:
        # Collapse doubled slashes after the scheme: the first '/' found
        # belongs to 'http://', so everything past it is host + path.
        idx = url.find('/')
        cleaned_url = url[:idx + 1] + url[idx + 1:].replace('//', '/')

        try:
            # reddit could also be queried here, but it is unreliable
            counts = socialshares.fetch(
                cleaned_url, ['facebook', 'pinterest', 'google', 'linkedin'])

        # If no data, skip that url
        except TypeError:
            continue

        dct['URL'].append(cleaned_url)

        # try:
        #     header = requests.head(cleaned_url).headers
        #     dct['Modified date'].append(header['Last-Modified'])
        # except:
        #     dct['Modified date'].append('n/a')

        if 'facebook' in counts:
            dct['Facebook'].append(counts['facebook']['share_count'])
        else:
            dct['Facebook'].append('n/a')
        if 'pinterest' in counts:
            dct['Pinterest'].append(counts['pinterest'])
        else:
            dct['Pinterest'].append('n/a')
        if 'google' in counts:
            dct['Google'].append(counts['google'])
        else:
            dct['Google'].append('n/a')
        if 'linkedin' in counts:
            dct['Linkedin'].append(counts['linkedin'])
        else:
            dct['Linkedin'].append('n/a')

    dataframe = pd.DataFrame(dct)
    dataframe = dataframe.set_index('URL').reset_index()  # keep 'URL' as the first column

    return dataframe, inaccessible
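A usage sketch for the function above; the domain list and output filename are illustrative:

if __name__ == '__main__':
    frame, failed = scrape_and_aggregate(['https://example.com/'])
    frame.to_csv('aggregated_shares.csv', index=False)
    print('Inaccessible domains:', failed)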
Example No. 16
 def test_facebook(self):
     counts = socialshares.fetch(url, ['facebook'], **self.defaults)
     self.assertIn('facebook', counts)
     self.assertIsInstance(counts['facebook']['share_count'], int)
Example No. 17
# Fragment of a scraping script; assumes urllib.request, BeautifulSoup,
# socialshares, and pandas (as pd) are imported, and a list of domains
# to crawl is defined.
names = []
for domain in domains:
    try:
        html = urllib.request.urlopen(domain).read()
    except Exception:
        print('unable to parse %s' % domain)
        continue

    soup = BeautifulSoup(html, 'html.parser')

    # Get a list of urls from the domain
    for link in soup.find_all('a'):
        names.append(link.get('href'))

urls = [name for name in names if name and name.startswith('http')]

# Run each url through SM apis to get share counts. Collect in a dictionary
dct = {'url': [], 'facebook': [], 'pinterest': [], 'google': [],
       'reddit ups': [], 'reddit downs': [], 'linkedin': []}

for url in urls:
    counts = socialshares.fetch(url, ['facebook', 'pinterest', 'google', 'reddit', 'linkedin'])

    dct['url'].append(url)
    if 'facebook' in counts:
        dct['facebook'].append(counts['facebook']['share_count'])
    else:
        dct['facebook'].append('na')
    # Guard the remaining platforms the same way in case a fetch fails.
    dct['pinterest'].append(counts.get('pinterest', 'na'))
    dct['google'].append(counts.get('google', 'na'))
    dct['reddit ups'].append(counts['reddit']['ups'] if 'reddit' in counts else 'na')
    dct['reddit downs'].append(counts['reddit']['downs'] if 'reddit' in counts else 'na')
    dct['linkedin'].append(counts.get('linkedin', 'na'))

# Convert to CSV and save (the output filename here is illustrative)
dataframe = pd.DataFrame(dct)
dataframe.to_csv('share_counts.csv', index=False)
Example No. 18
 def test_all(self):
     counts = socialshares.fetch(url, socialshares.platforms.supported,
                                 **self.lax_defaults)
     self.assertTrue(len(counts.keys()))
Example No. 19
 def test_reddit(self):
     counts = socialshares.fetch(url, ['reddit'], **self.defaults)
     self.assertIn('reddit', counts)
     self.assertIsInstance(counts['reddit'], dict)
Example No. 20
 def test_cli_plain(self):
     py = socialshares.fetch(url, ['pinterest'])
     cli_raw = subprocess.check_output(
         'socialshares {url} pinterest --plain'.format(url=url), shell=True)
     cli = int(cli_raw)
     self.assertEqual(py['pinterest'], cli)
Example No. 21
 def test_twitter(self):
     counts = socialshares.fetch(url, ['twitter'], **self.defaults)
     self.assertIn('twitter', counts)
     self.assertIsInstance(counts['twitter'], int)
Example No. 22
 def test_cli_plain(self):
     py = socialshares.fetch(url, ['twitter'])
     cli_raw = subprocess.check_output('socialshares {url} twitter --plain'.format(url=url), shell=True)
     cli = int(cli_raw)
     self.assertEqual(py['twitter'], cli)
Example No. 23
 def test_facebook(self):
     counts = socialshares.fetch(url, ['facebook'], **self.defaults)
     self.assertIn('facebook', counts)
     self.assertIsInstance(counts['facebook'], int)
Example No. 24
            # Fragment of a blog-post scraper: val, soup, links, and data
            # are defined earlier in the enclosing loop.
            s1 = str(val[0])
            s2 = str(val[1])
            author = s1 + " " + s2
            sharecount = soup.find("p", attrs={"class": "postInfo color-grey mt-5 fr"})
            val1 = sharecount.text.split()
            date = str(val1[1])
            month = str(val1[2])
            year = str(val1[3])
            val3 = soup.find("div", attrs={'class': 'ys_post_content text'})
            # bleach.clean() expects text, so stringify the tag first
            contents = bleach.clean(str(val3), tags=[], styles=[], strip=True)
            contents = contents.encode('utf-8').decode('ascii', 'ignore')

            counts = socialshares.fetch(links, ['facebook', 'linkedin'])
            facebook = counts['facebook']
            linkedin = counts['linkedin']

            with open('index1.csv', 'a') as csv_file:
                writer = csv.writer(csv_file)
                writer.writerow([date, month, year, author, data,
                                 facebook, linkedin, contents])

        except Exception:
            continue