# -*- coding: utf-8 -*-
from collections import OrderedDict

from frontera import Backend
from frontera.core.components import States


class CommonBackend(Backend):
    """
    The simplest possible backend, performing a one-time crawl: once a page
    has been crawled, it will not be crawled again.
    """
    component_name = 'Common Backend'

    @classmethod
    def from_manager(cls, manager):
        return cls(manager)

    def frontier_start(self):
        self.metadata.frontier_start()
        self.queue.frontier_start()
        self.states.frontier_start()
        # Track the queue length locally so finished() can answer cheaply.
        self.queue_size = self.queue.count()

    def frontier_stop(self):
        self.metadata.frontier_stop()
        self.queue.frontier_stop()
        self.states.frontier_stop()

    def add_seeds(self, seeds):
        for seed in seeds:
            seed.meta['depth'] = 0
        self.metadata.add_seeds(seeds)
        self.states.fetch([seed.meta['fingerprint'] for seed in seeds])
        self.states.set_states(seeds)
        self._schedule(seeds)

    def _schedule(self, requests):
        batch = []
        queue_incr = 0
        for request in requests:
            # Schedule only requests that were never crawled or that failed.
            schedule = request.meta['state'] in [States.NOT_CRAWLED, States.ERROR, None]
            batch.append((request.meta['fingerprint'], self._get_score(request), request, schedule))
            if schedule:
                queue_incr += 1
                request.meta['state'] = States.QUEUED
        self.queue.schedule(batch)
        self.metadata.update_score(batch)
        self.queue_size += queue_incr

    def _get_score(self, obj):
        return obj.meta.get('score', 1.0)

    def get_next_requests(self, max_next_requests, **kwargs):
        partitions = kwargs.pop('partitions', [0])  # TODO: Collect from all known partitions
        batch = []
        for partition_id in partitions:
            batch.extend(self.queue.get_next_requests(max_next_requests, partition_id, **kwargs))
        self.queue_size -= len(batch)
        return batch

    def page_crawled(self, response, links):
        response.meta['state'] = States.CRAWLED
        self.states.update_cache(response)
        depth = response.meta.get('depth', 0) + 1
        # Deduplicate extracted links by fingerprint, preserving order.
        to_fetch = OrderedDict()
        for link in links:
            to_fetch[link.meta['fingerprint']] = link
            link.meta['depth'] = depth
        self.states.fetch(to_fetch.keys())
        self.states.set_states(links)
        unique_links = to_fetch.values()
        self.metadata.page_crawled(response, unique_links)
        self._schedule(unique_links)
        self.states.update_cache(unique_links)

    def request_error(self, request, error):
        request.meta['state'] = States.ERROR
        self.metadata.request_error(request, error)
        self.states.update_cache(request)

    def finished(self):
        return self.queue_size == 0
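

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module). CommonBackend never builds
# its own storage: it delegates to ``self.metadata``, ``self.queue`` and
# ``self.states``, which a concrete subclass is expected to expose (compare
# frontera's memory-based backends). Everything below is a hypothetical,
# minimal in-memory illustration of that contract: the component classes and
# the ``InMemoryBackend`` name are assumptions, not frontera API. They
# implement only the calls CommonBackend makes above.
# ---------------------------------------------------------------------------

class _SketchMetadata(object):
    """Hypothetical no-op metadata store."""
    def frontier_start(self): pass
    def frontier_stop(self): pass
    def add_seeds(self, seeds): pass
    def update_score(self, batch): pass
    def page_crawled(self, response, links): pass
    def request_error(self, request, error): pass


class _SketchStates(object):
    """Hypothetical state store keyed by request fingerprint."""
    def __init__(self):
        self._states = {}

    def frontier_start(self): pass
    def frontier_stop(self): pass

    def fetch(self, fingerprints):
        pass  # nothing to prefetch: everything already lives in memory

    def set_states(self, objs):
        # Unseen fingerprints resolve to None, which _schedule() above
        # treats as "not crawled yet".
        for obj in objs:
            obj.meta['state'] = self._states.get(obj.meta['fingerprint'])

    def update_cache(self, objs):
        objs = objs if hasattr(objs, '__iter__') else [objs]
        for obj in objs:
            self._states[obj.meta['fingerprint']] = obj.meta['state']


class _SketchQueue(object):
    """Hypothetical FIFO queue; single partition only."""
    def __init__(self):
        self._queue = []

    def frontier_start(self): pass
    def frontier_stop(self): pass

    def count(self):
        return len(self._queue)

    def schedule(self, batch):
        # Batch items are (fingerprint, score, request, schedule_flag)
        # tuples, as built in CommonBackend._schedule() above.
        self._queue.extend(request for _, _, request, schedule in batch if schedule)

    def get_next_requests(self, max_n_requests, partition_id, **kwargs):
        requests, self._queue = self._queue[:max_n_requests], self._queue[max_n_requests:]
        return requests


class InMemoryBackend(CommonBackend):
    """Hypothetical concrete backend wiring the three components together.

    A FrontierManager would normally obtain it via
    ``InMemoryBackend.from_manager(manager)`` (inherited above), e.g. by
    pointing the BACKEND setting at this class.
    """
    def __init__(self, manager):
        self.manager = manager
        self._metadata = _SketchMetadata()
        self._states = _SketchStates()
        self._queue = _SketchQueue()

    @property
    def metadata(self):
        return self._metadata

    @property
    def states(self):
        return self._states

    @property
    def queue(self):
        return self._queue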