import sys
import json

from docopt import docopt

import socialshares


def main():
    arguments = docopt(__doc__, version='Social shares ' + socialshares.__version__)
    url = arguments['<url>']
    attempts = int(arguments['--retry']) + 1
    plain = arguments['--plain']
    strict = arguments['--exit']
    platforms = arguments['<platforms>'] or socialshares.platforms.default

    try:
        counts = socialshares.fetch(url, platforms, attempts=attempts, strict=strict)
    except IOError:
        sys.exit(1)

    if plain:
        # Flatten nested counts (e.g. facebook returns a dict) into a flat list.
        # Note: list + dict.values() raises a TypeError on Python 3, so extend instead.
        l = []
        for platform in platforms:
            count = counts[platform]
            if isinstance(count, dict):
                l.extend(count.values())
            else:
                l.append(count)
        print(" ".join(map(str, l)))
    else:
        print(json.dumps(counts, indent=2))
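# docopt(__doc__) above parses the module docstring at the top of the file,
# which these snippets do not show. A minimal sketch of what that usage string
# might look like -- the option names match the keys read above, but the real
# socialshares help text may differ:
"""Social shares.

Usage:
  socialshares <url> [<platforms>...] [--plain] [--exit] [--retry=<n>]

Options:
  --plain      Print plain space-separated counts instead of JSON.
  --exit       Exit with a nonzero status if any platform fails.
  --retry=<n>  Number of retries per platform [default: 0].
"""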
def test_facebookfql(self):
    counts = socialshares.fetch(
        url,
        ['facebookfql'],
    )
    self.assertIn('facebookfql', counts)
    self.assertIsInstance(counts['facebookfql'], dict)
def get_share_count(self):
    self.shares = socialshares.fetch(
        self.url, ['facebook', 'pinterest', 'google', 'linkedin', 'reddit'])
    # Sum the per-platform counts, treating any missing platform as zero.
    return (self.shares['reddit']['ups'] if 'reddit' in self.shares else 0) \
        + (self.shares['facebook']['share_count'] if 'facebook' in self.shares else 0) \
        + (self.shares['google'] if 'google' in self.shares else 0) \
        + (self.shares['pinterest'] if 'pinterest' in self.shares else 0) \
        + (self.shares['linkedin'] if 'linkedin' in self.shares else 0)
def socialShareExtraction(jsonList):
    # Extract share counts from all the social media platforms.
    # dict.keys() returns a non-indexable view on Python 3, so materialize a list.
    urlList = list(stats(jsonList)["url"].keys())
    ss = []
    socialmedia = ['facebook', 'twitter', 'linkedin', 'google',
                   'pinterest', 'reddit', 'facebookfql']
    # xrange is Python 2 only; use range, and don't assume at least ten urls.
    for i in range(min(10, len(urlList))):
        counts = socialshares.fetch(urlList[i], socialmedia)
        ss.append(counts)
    return ss
def test_cli_json(self):
    py = socialshares.fetch(url)
    cli_raw = subprocess.check_output(
        'socialshares {url}'.format(url=url), shell=True)
    cli = json.loads(cli_raw.decode('utf-8'))
    for k, v in py.items():
        self.assertIn(k, cli)
        self.assertIsInstance(v, int)
        self.assertTrue(is_close(py[k], cli[k]))
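# The CLI tests call an is_close() helper that these snippets do not show.
# A plausible sketch, assuming counts may drift slightly between the two
# fetches (the real helper and its tolerance may differ):
def is_close(a, b, tolerance=0.1):
    # Treat two counts as equal if they differ by at most `tolerance` of the
    # larger value, since live share counts can change between requests.
    return abs(a - b) <= tolerance * max(abs(a), abs(b), 1)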
def test_cli_json(self):
    py = socialshares.fetch(url, ['facebook'])['facebook']
    cli_raw = subprocess.check_output(
        'socialshares {url}'.format(url=url), shell=True)
    cli = json.loads(cli_raw.decode('utf-8'))['facebook']
    for k, v in py.items():
        self.assertIn(k, cli)
        self.assertTrue(is_close(py[k], cli[k]))
def set_social_media_share_counts(self):
    shares = socialshares.fetch(
        self.url, ['facebook', 'pinterest', 'google', 'linkedin', 'reddit'])
    # Fall back to zero for any platform the fetch did not return.
    self.shares['reddit'] = shares['reddit']['ups'] if 'reddit' in shares else 0
    self.shares['facebook'] = shares['facebook']['share_count'] if 'facebook' in shares else 0
    self.shares['google'] = shares['google'] if 'google' in shares else 0
    self.shares['pinterest'] = shares['pinterest'] if 'pinterest' in shares else 0
    self.shares['linkedin'] = shares['linkedin'] if 'linkedin' in shares else 0
import sys
import json

from docopt import docopt

import socialshares


def main():
    arguments = docopt(__doc__, version='Social shares 0.1')
    url = arguments['<url>']
    attempts = int(arguments['--retry']) + 1
    plain = arguments['--plain']
    strict = arguments['--exit']
    platforms = arguments['<platforms>'] or socialshares.platforms.default

    todo = set(platforms)
    counts = {}
    attempt = 0
    while len(todo) and attempt < attempts:
        # Bug fix: the original incremented `attempts` here, so the loop never
        # terminated; increment the attempt counter instead, and only refetch
        # the platforms that are still missing.
        attempt = attempt + 1
        partial = socialshares.fetch(url, list(todo))
        todo = todo.difference(partial)
        counts.update(partial)

    if strict and len(counts) < len(platforms):
        sys.exit(1)
    elif plain:
        print(" ".join(map(str, counts.values())))
    else:
        print(json.dumps(counts, indent=2))
def update_social_data(self) -> None:
    if not self.url_raw:
        message = 'News ID={} has no url_raw field'.format(self.id)
        logger_crawlers.warning(message)
        return

    social_data = {}
    try:
        social_data = socialshares.fetch(
            self.url_raw, ['pinterest', 'linkedin', 'google', 'reddit'])
        fb_data = self._get_fb_data(self.url_raw)
        social_data.update(fb_data)
    except Exception:
        message = 'Error while fetching social shares for {}'.format(self.url_raw)
        logger_crawlers.error(message, exc_info=True)

    self.social = social_data
    try:
        self.save()
    except Exception:
        message = 'Error saving News ID={} while updating social data'.format(self.id)
        logger_crawlers.error(message)
def test_pinterest(self):
    counts = socialshares.fetch(url, ['pinterest'], **self.defaults)
    self.assertIn('pinterest', counts)
    self.assertIsInstance(counts['pinterest'], int)
def test_linkedin(self):
    counts = socialshares.fetch(url, ['linkedin'], **self.defaults)
    self.assertIn('linkedin', counts)
    self.assertIsInstance(counts['linkedin'], int)
def test_google(self):
    counts = socialshares.fetch(url, ['google'], **self.defaults)
    self.assertIn('google', counts)
    self.assertIsInstance(counts['google'], int)
def test_default(self):
    counts = socialshares.fetch(url, **self.lax_defaults)
    self.assertEqual(set(counts.keys()), set(socialshares.platforms.default))
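# These tests rely on a module-level `url` plus `defaults`/`lax_defaults`
# fixtures that the snippets do not show. A hypothetical sketch -- the keyword
# names match the fetch() calls above, but the class name, url, and concrete
# values are guesses, not the library's actual test configuration:
import json
import subprocess
import unittest

import socialshares

url = 'http://www.example.com/'  # any stable, widely shared page

class ShareCountTests(unittest.TestCase):
    defaults = dict(attempts=3, strict=True)       # retry, fail on any error
    lax_defaults = dict(attempts=3, strict=False)  # tolerate failing platforms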
def update_shares(self):
    orig_shares = self.shares
    self.shares = socialshares.fetch(
        self.content_object.get_full_url(),
        ['facebook'], attempts=3).get('facebook', 0)
    # Only hit the database when the count actually changed.
    if self.shares != orig_shares:
        self.save()
def scrape_and_aggregate(domains):
    from bs4 import BeautifulSoup
    import urllib.request
    # import requests
    import socialshares
    import pandas as pd

    # Try to convert each url into an html document.
    urls = []
    inaccessible = []
    for domain in domains:
        try:
            with urllib.request.urlopen(domain) as response:
                html = response.read()
        except Exception:
            inaccessible.append(domain)
            continue
        soup = BeautifulSoup(html, 'html.parser')
        # Get a list of urls from the domain
        for link in soup.find_all('a'):
            url = link.get('href')
            # For internal links, prepend the domain address
            try:
                if url.startswith('http'):
                    urls.append(url)
                elif url.startswith('/http'):
                    urls.append(url[1:])
                else:
                    urls.append(domain + url)
            # If the url is empty, skip it
            except AttributeError:
                continue

    # Run each url through SM apis to get share counts. Collect in a dictionary
    dct = {
        'URL': [],
        'Facebook': [],
        'Pinterest': [],
        'Google': [],
        'Linkedin': []
    }
    for url in urls:
        # Find any '//' after http:// and convert to '/'
        idx = url.find('/')
        cleaned_url = url[:idx + 1] + url[idx + 1:].replace('//', '/')
        try:
            counts = socialshares.fetch(
                cleaned_url,
                ['facebook', 'pinterest', 'google', 'linkedin'])  # can also query reddit, but unreliable
        # If no data, skip that url
        except TypeError:
            continue
        dct['URL'].append(cleaned_url)
        # try:
        #     header = requests.head(cleaned_url).headers
        #     dct['Modified date'].append(header['Last-Modified'])
        # except:
        #     dct['Modified date'].append('n/a')
        if 'facebook' in counts:
            dct['Facebook'].append(counts['facebook']['share_count'])
        else:
            dct['Facebook'].append('n/a')
        if 'pinterest' in counts:
            dct['Pinterest'].append(counts['pinterest'])
        else:
            dct['Pinterest'].append('n/a')
        if 'google' in counts:
            dct['Google'].append(counts['google'])
        else:
            dct['Google'].append('n/a')
        if 'linkedin' in counts:
            dct['Linkedin'].append(counts['linkedin'])
        else:
            dct['Linkedin'].append('n/a')

    dataframe = pd.DataFrame(dct)
    dataframe = dataframe.set_index('URL').reset_index()
    return dataframe, inaccessible
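# Example use of the scraper above. The domain is illustrative; any list of
# reachable pages works:
df, failed = scrape_and_aggregate(['http://example.com'])
print(df.head())
print('Could not fetch:', failed)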
def test_facebook(self):
    counts = socialshares.fetch(url, ['facebook'], **self.defaults)
    self.assertIn('facebook', counts)
    self.assertIsInstance(counts['facebook']['share_count'], int)
# Fragment: continues an except clause inside a loop over `domains`; the
# enclosing try block and the initialization of `names` are not shown.
        print('unable to parse %s' % domain)
        continue
    soup = BeautifulSoup(html, 'html.parser')
    # Get a list of urls from the domain
    for link in soup.find_all('a'):
        names.append(link.get('href'))

urls = [name for name in names if name.startswith('http')]

# Run each url through SM apis to get share counts. Collect in a dictionary
dct = {'url': [], 'facebook': [], 'pinterest': [], 'google': [],
       'reddit ups': [], 'reddit downs': [], 'linkedin': []}
for url in urls:
    counts = socialshares.fetch(
        url, ['facebook', 'pinterest', 'google', 'reddit', 'linkedin'])
    dct['url'].append(url)
    if 'facebook' in counts:
        dct['facebook'].append(counts['facebook']['share_count'])
    else:
        dct['facebook'].append('na')
    dct['pinterest'].append(counts['pinterest'])
    dct['google'].append(counts['google'])
    dct['reddit ups'].append(counts['reddit']['ups'])
    dct['reddit downs'].append(counts['reddit']['downs'])
    dct['linkedin'].append(counts['linkedin'])

# Convert to CSV and save
dataframe = pd.DataFrame(dct)
def test_all(self):
    counts = socialshares.fetch(
        url, socialshares.platforms.supported, **self.lax_defaults)
    self.assertTrue(len(counts.keys()))
def test_reddit(self):
    counts = socialshares.fetch(url, ['reddit'], **self.defaults)
    self.assertIn('reddit', counts)
    self.assertIsInstance(counts['reddit'], dict)
def test_cli_plain(self):
    py = socialshares.fetch(url, ['pinterest'])
    cli_raw = subprocess.check_output(
        'socialshares {url} pinterest --plain'.format(url=url), shell=True)
    cli = int(cli_raw)
    self.assertEqual(py['pinterest'], cli)
def test_twitter(self):
    counts = socialshares.fetch(url, ['twitter'], **self.defaults)
    self.assertIn('twitter', counts)
    self.assertIsInstance(counts['twitter'], int)
def test_cli_plain(self):
    py = socialshares.fetch(url, ['twitter'])
    cli_raw = subprocess.check_output(
        'socialshares {url} twitter --plain'.format(url=url), shell=True)
    cli = int(cli_raw)
    self.assertEqual(py['twitter'], cli)
def test_facebook(self):
    counts = socialshares.fetch(url, ['facebook'], **self.defaults)
    self.assertIn('facebook', counts)
    self.assertIsInstance(counts['facebook'], int)
# Fragment: continues a try block inside a scraping loop; `soup`, `val`,
# `data`, and `links` are defined in code not shown, and the snippet assumes
# `import csv` and `import bleach` at the top of the file.
            s1 = str(val[0])
            s2 = str(val[1])
            author = s1 + " " + s2
            sharecount = soup.find("p", attrs={"class": "postInfo color-grey mt-5 fr"})
            val1 = sharecount.text.split()
            date = str(val1[1])
            month = str(val1[2])
            year = str(val1[3])
            val3 = soup.find("div", attrs={'class': 'ys_post_content text'})
            # bleach.clean() expects a string, not a bs4 Tag, so coerce first.
            contents = bleach.clean(str(val3), tags=[], styles=[], strip=True)
            contents = contents.encode('utf-8').decode('ascii', 'ignore')
            counts = socialshares.fetch(links, ['facebook', 'linkedin'])
            facebook = counts['facebook']
            linkedin = counts['linkedin']
            with open('index1.csv', 'a') as csv_file:
                writer = csv.writer(csv_file)
                writer.writerow([date, month, year, author, data,
                                 facebook, linkedin, contents])
        except Exception:
            continue
        # f = open(name, "a")
        # if soup.title.string:
        #     print(soup.title.string)
        #     f.write(soup.title.string)