/
sattascrapsite.py
53 lines (50 loc) · 1.52 KB
/
sattascrapsite.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# Archive years the scraper walks; each (year, month) pair becomes one request.
years=[2015,2016,2017,2018]
# All calendar months; combined with `years` to build ?month=&year= query strings.
months=[1,2,3,4,5,6,7,8,9,10,11,12]
import requests
from bs4 import BeautifulSoup as bsp
def a2m(ahi):
    """Map a 'Record Chart' anchor tag to a metadata dict.

    ahi -- a BeautifulSoup <a> element; its surrounding markup is expected
    to carry the game name (parent <h2>), the draw time (two siblings back),
    and today's value (grandparent <h3>).  -- layout assumed from the site,
    TODO confirm if the markup changes.

    Returns a dict with keys 'game', 'time', 'url', 'tvalue'.
    (Was a lambda assigned to a name; PEP 8 E731 prefers a def.)
    """
    return {
        'game': ahi.parent.h2.text,
        'time': ahi.previousSibling.previousSibling.text,
        'url': ahi['href'],
        'tvalue': ahi.parent.parent.h3.text
    }
def saveobjs(objs,at="./export.csv"):
    '''Append rows to a CSV file (db handler, csv for now).

    objs -- iterable of row sequences; each cell is stringified by the writer.
    at   -- destination path; appended to, created if missing.
    Returns 1 so callers can use the call as a truthy "flushed" signal.
    '''
    # Local import keeps the file's existing style (see urlpath below).
    import csv
    # csv.writer quotes/escapes cells containing commas, quotes or newlines,
    # which the previous ','.join(map(str, ...)) silently corrupted.
    # lineterminator='\n' preserves the original newline convention.
    with open(at, 'a', newline='') as f:
        csv.writer(f, lineterminator='\n').writerows(objs)
    return 1
def scrapeHomePage():
    """Fetch the landing page and index every game's record-chart link.

    Returns a dict mapping chart URL -> [game name, draw time, today's value],
    built from each anchor whose text is exactly "Record Chart".
    """
    response = requests.get('https://sattakingdarbar.com/')
    soup = bsp(response.text, 'html.parser')
    chart_anchors = [a for a in soup.findAll('a') if a.text == "Record Chart"]
    result = {}
    for meta in (a2m(a) for a in chart_anchors):
        result[meta['url']] = [meta['game'], meta['time'], meta['tvalue']]
    return result
def urlpath(url):
    """Return *url* with its query string (and fragment) stripped.

    Keeps only scheme://netloc/path; used to rebuild chart URLs with
    fresh ?month=&year= parameters.
    """
    import sys
    # Legacy Python 2 path retained for parity with the original script.
    if sys.version_info[0] < 3:
        import urllib
        return urllib.splitquery(url)[0]
    from urllib.parse import urlparse, urlunparse
    parts = urlparse(url)
    return urlunparse((parts.scheme, parts.netloc, parts.path, '', '', ''))
def scrapeThisPage(url, year_list=None, month_list=None):
    """Scrape one game's record chart across every (year, month) pair.

    url        -- any URL on the chart page; its query string is replaced.
    year_list  -- years to fetch; defaults to the module-level `years`.
    month_list -- months to fetch; defaults to the module-level `months`.
                  (Backward-compatible generalization of the old hard-coded
                  globals.)

    Returns a list of rows: [day, month, year, game name, value, ''].
    """
    ys = years if year_list is None else year_list
    ms = months if month_list is None else month_list
    objs = []
    hitu = urlpath(url) + "?month={}&year={}"
    for y in ys:
        for m in ms:
            ri = requests.get(hitu.format(m, y))
            spi = bsp(ri.text, 'html.parser')
            dates = spi.findAll('td', attrs={'class': 'day'})
            vals = spi.findAll('td', attrs={'class': 'number'})
            names = spi.findAll('th', attrs={'class': 'name'})
            ln = len(names)
            # BUG FIX: an empty/missing table made i % ln raise
            # ZeroDivisionError; skip months with no header row or no days.
            if ln == 0 or not dates:
                continue
            for i, iv in enumerate(vals):
                # Values are laid out row-major: one row per date, one
                # column per game name -- assumed from the site's table,
                # TODO confirm against live markup.
                objs.append([dates[i // ln].text, m, y, names[i % ln].text, iv.text, ''])
    return objs
def scrapAll():
    """Scrape every game's record chart and persist rows in batches.

    Rows accumulate in memory and are appended to the CSV via saveobjs()
    once more than 500 are buffered; the remainder is flushed at the end.
    """
    allobjs = []
    alllinks = scrapeHomePage()
    for u in alllinks:
        allobjs += scrapeThisPage(u)
        # Flush in batches to bound memory; saveobjs returns 1 (truthy).
        if len(allobjs) > 500 and saveobjs(allobjs):
            allobjs = []
    # BUG FIX: the final batch (<=500 rows) was previously dropped because
    # it never tripped the in-loop size check.
    if allobjs:
        saveobjs(allobjs)

# Guard the entry point so importing this module doesn't kick off a scrape.
if __name__ == "__main__":
    scrapAll()