-
Notifications
You must be signed in to change notification settings - Fork 0
/
sm_aggregator.py
51 lines (40 loc) · 1.58 KB
/
sm_aggregator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from bs4 import BeautifulSoup
import urllib.request
import socialshares
import pandas as pd
# Enter list of domains to be scraped
domains = ['http://www.peoplesmomentum.com/','https://fullfact.org/' ]
# Enter a path to save final CSV file to
filename = 'test.csv'
# Try to convert each url into a html document
names=[]
for domain in domains:
try:
with urllib.request.urlopen(domain) as response:
html = response.read()
except:
print('unable to parse %s' % domain)
continue
soup = BeautifulSoup(html,'html.parser')
# Get a list of urls from the domain
for link in soup.find_all('a'):
names.append(link.get('href'))
urls = [name for name in names if name.startswith('http')]
# Run each url through SM apis to get share counts. Collect in a dictionary
dct = {'url':[], 'facebook':[], 'pinterest':[], 'google':[], 'reddit ups':[], 'reddit downs':[], 'linkedin':[]}
for url in urls:
counts = socialshares.fetch(url, ['facebook', 'pinterest', 'google', 'reddit', 'linkedin'])
dct['url'].append(url)
if 'facebook' in counts:
dct['facebook'].append(counts['facebook']['share_count'])
else:
dct['facebook'].append('na')
dct['pinterest'].append(counts['pinterest'])
dct['google'].append(counts['google'])
dct['reddit ups'].append(counts['reddit']['ups'])
dct['reddit downs'].append(counts['reddit']['downs'])
dct['linkedin'].append(counts['linkedin'])
# Convert to CSV and save
dataframe = pd.DataFrame(dct)
dataframe = dataframe.set_index('url').reset_index()
dataframe.to_csv(filename, index=False)