-
Notifications
You must be signed in to change notification settings - Fork 0
/
crow.py
120 lines (100 loc) · 2.69 KB
/
crow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from xgoogle.search import GoogleSearch,SearchError
from BeautifulSoup import BeautifulSoup
import urllib,urllib2
from urllib2 import URLError
import urlrule
from log import log
from Pipeline import SqlitePipeline
from twisted.internet import reactor
class ExecutionQueue:
    """FIFO queue of Link objects with URL-based de-duplication.

    Consumed links are never removed; ``current`` is a read cursor over
    ``queue``, so the full crawl history remains available.
    """

    def __init__(self, queue=None):
        # The original used a mutable default (queue=[]), which is shared
        # between every instance created without an argument; use None as
        # the sentinel and build a fresh list per instance instead.
        self.queue = queue if queue is not None else []
        self.current = 0
        # Seed the dedup set from any initial links so the same URL cannot
        # be re-enqueued later (the original left initial links untracked).
        self.paths = set(link.url for link in self.queue)

    def size(self):
        """Return the total number of links ever enqueued."""
        return len(self.queue)

    def add_link(self, link):
        """Append ``link`` unless a link with the same URL was already seen."""
        if link.url not in self.paths:
            self.paths.add(link.url)
            self.queue.append(link)

    def get_next_link(self):
        """Return the next unconsumed link, or None when exhausted."""
        if self.size() > self.current:
            result = self.queue[self.current]
            self.current += 1
            return result
        return None
class Link:
    """A crawl target: a URL plus the depth at which it was discovered."""

    def __init__(self, url, depth):
        self.url = url      # absolute URL to fetch
        self.depth = depth  # distance (in link hops) from the seed URL

    def __repr__(self):
        # Added for debuggability; plain data holders should be printable.
        return "Link(url=%r, depth=%r)" % (self.url, self.depth)
class Crow:
    """A small web crawler.

    Starting from a seed URL it fetches pages, hands each page to a
    pipeline for storage, and enqueues outgoing links that match the
    configured rule, until ``count`` pages have succeeded or the queue
    is exhausted.  Fluent configuration: ``Crow(url).select(rule).to(pipe)``.
    """

    cid = 0  # class-level counter; gives each crawler instance a unique id

    def __init__(self, url):
        Crow.cid += 1
        self.cid = Crow.cid
        log("[init Crow%d]" % self.cid)
        # NOTE: renamed from `async` — that word became a reserved keyword
        # in Python 3.7+, making the original attribute a syntax error there.
        self.async_mode = False
        self.queue = ExecutionQueue([Link(url, 1)])
        self.count = 20   # default page budget; overridden by start(count)
        self.success = 0  # pages fetched, parsed and stored so far
        # Defined up front so start() can safely test them for None
        # (the original AttributeError'd if to()/select() were skipped).
        self.pipe = None  # set via to()
        self.rule = None  # set via select()

    def to(self, pipe):
        """Set the storage pipeline (fluent; returns self)."""
        self.pipe = pipe
        return self

    def select(self, rule):
        """Set the link rule passed to BeautifulSoup.findAll (fluent; returns self)."""
        self.rule = rule
        return self

    def start(self, count):
        """Crawl until ``count`` pages succeed or the queue runs dry.

        Returns self (via stat()), or self immediately if the crawler
        was not fully configured.
        """
        if self.async_mode:
            # Each reactor thread gets its own pipeline instance.
            self.pipe = SqlitePipeline()
        if self.pipe is None or self.rule is None:
            # The original printed this and fell through to a crash;
            # bail out explicitly instead.
            print("please set the target and rules")
            return self
        self.count = count
        while self.count > self.success:
            link = self.queue.get_next_link()
            if link is None:
                break
            response = None
            try:
                log("[Crow%d crawling]%s in depth:%d " % (self.cid, link.url, link.depth))
                response = urllib2.urlopen(link.url, timeout=5)
                content_type = response.headers.get("content-type")
                # Only parse 200 responses whose content-type matches the filter.
                if urlrule.filter.search(content_type) and response.getcode() == 200:
                    self.parse_link(link, response)
                    self.success += 1
            except URLError as e:
                log(str(e))
            except Exception as e:
                # Broad catch is deliberate: one bad page must not kill the crawl.
                log(str(e))
            finally:
                # Always release the connection (the original leaked sockets).
                if response is not None:
                    response.close()
        return self.stat()

    def stat(self):
        """Log total and per-session stored data sizes in MB; return self."""
        # Float division: the original's integer division truncated
        # sub-megabyte sizes to 0 despite the %.2f format.
        mb = lambda size: size / 1024.0 / 1024.0
        log("[Crow%d]total %.2f MB data in database" % (self.cid, mb(self.pipe.size())))
        log("[Crow%d]%.2f MB saved in this session" % (self.cid, mb(self.pipe.session_size())))
        return self

    def parse_link(self, base_url, html):
        """Store the fetched page and enqueue its matching outgoing links.

        ``base_url`` is actually a Link object (name kept for
        compatibility); ``html`` is the open HTTP response.
        """
        soup = BeautifulSoup(html)
        self.pipe.process(self, base_url, soup)
        depth = base_url.depth + 1
        for ref in soup.findAll(self.rule, href=True):
            url = urlrule.get_abs_url(base_url.url, ref["href"])
            if urlrule.match(url):
                self.queue.add_link(Link(url, depth))

    def async_start(self, count):
        """Run start(count) on a reactor thread pool (non-blocking)."""
        self.async_mode = True
        reactor.callInThread(self.start, count)

    @staticmethod
    def run():
        """Start the twisted reactor; blocks until the reactor stops."""
        reactor.run()
def main():
    """Google-search a hardcoded query and crawl each result asynchronously."""
    gs = GoogleSearch("computer")
    # xgoogle's attribute is `results_per_page`; the original assigned a
    # typo'd `result_per_page`, which had no effect on the search.
    gs.results_per_page = 10
    results = gs.get_results()
    for r in results:
        # One crawler per result, each running start(50) on a reactor thread.
        Crow(r.url).select("a").to(SqlitePipeline()).async_start(50)
    # Blocks until the reactor is stopped.
    Crow.run()
    # Removed a stray `f.close()`: no file `f` was ever opened, so it
    # raised NameError once the reactor stopped.

if __name__ == "__main__":
    main()