-
Notifications
You must be signed in to change notification settings - Fork 1
/
tornado_client_page.py
127 lines (103 loc) · 5.55 KB
/
tornado_client_page.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import logging
import urlparse
from lxml import html
from tornado.gen import coroutine, Return
from tornado.httpclient import AsyncHTTPClient, HTTPRequest, HTTPError
from config import PAGE_TIMEOUT
from util import decode_to_unicode, obtain_domain_with_subdomain_for_page
from web_page import WebPage
# Module metadata: author of this file.
__author__ = 'jayesh'
# Module-level logger. Records logged through it at DEBUG or above are written
# to 'page.log'. The handler is created at import time with mode='w', so the
# file is truncated every time this module is imported.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.FileHandler('page.log', mode='w'))
# Tornado's own 'tornado.general' logger is likewise set to DEBUG and given a
# file handler on 'tornado-requests.log' (also truncated at import time).
tornado_logger = logging.getLogger('tornado.general')
tornado_logger.setLevel(logging.DEBUG)
tornado_logger.addHandler(logging.FileHandler('tornado-requests.log', mode='w'))
class TornadoClientPage(WebPage):
    """A ``WebPage`` that is fetched asynchronously with Tornado's HTTP client.

    The work is split into steps: ``make_head_request`` issues a HEAD request,
    ``_process_head_response`` decides whether a GET is worthwhile (and issues
    it through ``_make_get_request``), and ``process_get_response`` extracts
    the links from the downloaded HTML. Whenever a step determines that no
    further work will happen for this page it calls
    ``self.finalize_process(self.spider)``.

    NOTE(review): the order in which the steps are invoked is decided by the
    caller (presumably the spider passed to ``process``) -- confirm there.
    """

    # Browser-like User-Agent header sent with every request.
    _USER_AGENT = ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 "
                   "(KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1")
    # Upper bound on the redirects Tornado follows for a single request.
    _MAX_REDIRECTS = 10
    # Tornado uses 599 for failures where no HTTP response was received
    # (timeouts, connection errors); reused here for non-HTTP exceptions.
    _NO_RESPONSE_CODE = 599

    def process(self, spider):
        """Remember *spider* so later steps can pass it to ``finalize_process``."""
        logger.debug("Called {} for {}".format('process', self.encoded_url))
        self.spider = spider

    def _build_request(self, method):
        """Return an ``HTTPRequest`` for ``self.url`` using the given HTTP *method*."""
        return HTTPRequest(method=method, url=self.url, request_timeout=PAGE_TIMEOUT,
                           follow_redirects=True,
                           headers={"User-Agent": self._USER_AGENT},
                           max_redirects=self._MAX_REDIRECTS)

    def _record_fetch_failure(self, request_kind, ex):
        """Log a failed fetch, store its code/message on the page and finalize it.

        *request_kind* is the word used in the log line (``'head'`` or ``'get'``).
        *ex* may be any exception: only Tornado's ``HTTPError`` carries ``code``,
        so other exceptions fall back to ``_NO_RESPONSE_CODE``.
        """
        # ``message`` may be missing or empty on non-HTTP errors; fall back to
        # the exception's string form so the failure is still described.
        message = getattr(ex, 'message', None) or str(ex)
        logger.debug(
            u"Error processing %s request for : %s with error : %s " % (
                request_kind, self.encoded_url, str(message)))
        self.response_code = getattr(ex, 'code', self._NO_RESPONSE_CODE)
        self.failure_message = decode_to_unicode(message)
        self.finalize_process(self.spider)

    @coroutine
    def make_head_request(self):
        """Issue a HEAD request for ``self.url``.

        Returns (via ``Return``) the Tornado response, or ``None`` when the
        request failed with an ``HTTPError``; in that case the failure is
        recorded on the page and the page is finalized. Exceptions other than
        ``HTTPError`` are not caught here and propagate to the caller.
        """
        logger.debug("Called %s for %s " % ('make_head_request', self.encoded_url))
        request = self._build_request('HEAD')
        try:
            response = yield AsyncHTTPClient().fetch(request)
        except HTTPError as ex:
            self._record_fetch_failure('head', ex)
            raise Return(None)
        raise Return(response)

    @coroutine
    def _process_head_response(self, response):
        """Inspect the HEAD *response* and fetch the page body when appropriate.

        A GET is issued only when the effective (post-redirect) URL is internal
        and the Content-Type mentions ``text/html``; its result is returned.
        Otherwise the page is finalized and ``None`` is returned. A falsy
        *response* yields ``None`` without any further action.
        """
        if not response:
            raise Return(None)
        logger.debug(u"Called {} for {} ".format('_process_head_response', self.encoded_url))
        self.response_code = response.code
        # Joining onto a unicode literal coerces the header value to unicode.
        self.content_type = u"".join(response.headers.get('Content-Type', ''))
        effective_url = response.effective_url if response.effective_url else self.url
        if self.is_page_internal(effective_url) and u'text/html' in self.content_type:
            get_response = yield self._make_get_request()
            raise Return(get_response)
        self.finalize_process(self.spider)
        raise Return(None)

    @coroutine
    def _make_get_request(self):
        """Issue a GET request for ``self.url``.

        Returns (via ``Return``) the Tornado response, or ``None`` when the
        fetch raised any exception; in that case the failure is recorded on
        the page and the page is finalized.
        """
        logger.debug(u"Called {} for {} ".format('_make_get_request', self.encoded_url))
        request = self._build_request('GET')
        try:
            response = yield AsyncHTTPClient().fetch(request)
        except Exception as ex:
            # Not every exception is an HTTPError (e.g. socket errors), so the
            # helper must not assume ``ex.code`` exists.
            self._record_fetch_failure('get', ex)
            raise Return(None)
        raise Return(response)

    def process_get_response(self, response):
        """Extract links from a GET *response* and finalize the page.

        A ``None`` response is ignored: every code path in this class that
        produces ``None`` has already finalized the page. When the response
        carries an error, its reason is stored as ``failure_message``.
        """
        logger.debug(u"Called {} for {} ".format('process_get_response', self.encoded_url))
        if response is None:
            return
        if response.error:
            logger.debug(u"Error processing get request: {} with error : {} ( {} )".format(
                self.encoded_url, response.error, response.reason))
            self.failure_message = response.reason
        else:
            html_source = decode_to_unicode(response.body)
            if self.is_page_internal():
                self._collect_links(html_source)
        self.finalize_process(self.spider)

    def _collect_links(self, html_source):
        """Add a child ``TornadoClientPage`` to ``self.links`` for each usable ``<a href>``."""
        dom = html.fromstring(html_source)
        for href_value in dom.xpath('//a/@href'):
            href_value = decode_to_unicode(href_value)
            logger.debug(u"Entering for loop for for {} with href {}".format(self.encoded_url, href_value))
            self._process_hardcoded_url(href_value)
            link = self._format_link(href_value)
            logger.debug(u"obtained link object{} for {}".format(link, self.encoded_url))
            if not link:
                continue
            parsed_link = obtain_domain_with_subdomain_for_page(link)
            if parsed_link in self.domains_to_skip:
                continue
            link_page = TornadoClientPage(link, self, self.base_site, self.base_domain,
                                          self.domains_to_skip)
            self.links.add(link_page)
            link_page.parent = self

    def _format_link(self, href_value):
        """Turn a raw ``href`` into an absolute URL, or ``None`` if it should be skipped.

        Hrefs containing ``javascript:void`` or starting with ``mailto`` are
        skipped. A pure fragment (``#...``) maps to this page's own URL.
        Everything else is joined onto ``self.url``.
        """
        href_value = decode_to_unicode(href_value.strip())
        if 'javascript:void' in href_value or href_value.startswith('mailto'):
            # Return None directly instead of passing None to decode_to_unicode.
            return None
        if href_value.startswith('#'):
            return decode_to_unicode(self.url)
        if href_value.startswith(".."):
            # NOTE(review): this removes every ".." in the href rather than
            # letting urljoin resolve it; kept as-is to preserve behavior.
            href_value = href_value.replace("..", "")
        link = urlparse.urljoin(self.url, href_value, allow_fragments=False)
        return decode_to_unicode(link)