def __init__(self, request, timeout=180, download_size=0):
    """Prepare the factory state for downloading ``request``.

    The url is stored without its fragment, the request headers are wrapped
    in a new ``Headers`` object and the ``Host``, ``Content-Length`` and
    ``Connection`` headers are filled in. ``timeout`` and ``download_size``
    are stored unchanged.
    """
    # Limits and bookkeeping.
    self.timeout = timeout
    self.download_size = download_size
    self.invalid_headers = []
    self.response_headers = None

    # What is going to be sent.
    url_without_fragment = urldefrag(request.url)[0]
    self.url = url_without_fragment
    self.method = request.method
    self.body = request.body or None
    self.headers = Headers(request.headers)

    # The deferred fires with the response built by _build_response.
    self.start_time = time()
    result = defer.Deferred()
    result.addCallback(self._build_response, request)
    self.deferred = result

    # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
    # to have _disconnectedDeferred. See Twisted r32329.
    # As Crawlmi implements its own logic to handle redirects, there is no
    # need to add the callback _waitForDisconnect.
    # Specifically this avoids the AttributeError exception when
    # clientConnectionFailed method is called.
    self._disconnectedDeferred = defer.Deferred()

    self._set_connection_attributes(request)

    # set Host header based on url
    self.headers.setdefault('Host', self.netloc)

    if self.body is None:
        return
    # set Content-Length based len of body
    self.headers['Content-Length'] = len(self.body)
    # just in case a broken http/1.1 decides to keep connection alive
    self.headers.setdefault('Connection', 'close')
def test_from_args(self):
    """``from_args`` selects the response class from the url and headers."""
    html_headers = Headers({'Content-Type': ['text/html; charset=utf-8']})
    attachment_headers = Headers({
        'Content-Disposition': ['attachment; filename="data.xml.gz"'],
    })
    cases = (
        (TextResponse, {'url': 'http://www.example.com/data.csv'}),
        # headers takes precedence over url
        (HtmlResponse, {'headers': html_headers,
                        'url': 'http://www.example.com/item/'}),
        (Response, {'headers': attachment_headers,
                    'url': 'http://www.example.com/page/'}),
    )
    for expected, kwargs in cases:
        actual = from_args(**kwargs)
        self.assertIs(
            actual, expected,
            'Expected: %s Received: %s' % (expected.__name__,
                                           actual.__name__))
def test_multivalue(self):
    """A list assigned to a header: lookups give the last value, and
    ``getlist`` gives an equal list that is not the assigned object."""
    name = 'X-Forwarded-For'
    assigned = ['ip1', 'ip2']
    headers = Headers()
    headers[name] = assigned

    self.assertEqual(headers[name], 'ip2')
    self.assertEqual(headers.get(name), 'ip2')
    self.assertEqual(headers.getlist(name), assigned)
    self.assertIsNot(headers.getlist(name), assigned)
def test_copy(self):
    """``copy.copy`` gives an equal ``Headers`` whose value lists are
    distinct objects."""
    original = Headers({'header1': ['value1', 'value2']}, encoding='ascii')
    duplicate = copy.copy(original)

    self.assertEqual(original, duplicate)
    self.assertEqual(original.encoding, duplicate.encoding)
    self.assertEqual(original.getlist('header1'),
                     duplicate.getlist('header1'))
    self.assertIsNot(original.getlist('header1'),
                     duplicate.getlist('header1'))
    self.assertIsInstance(duplicate, Headers)
def test_iterables(self):
    """Check the dict-like views: ``keys``, ``items``, ``iteritems``, ``values``.

    The headers are built from a plain dict, whose iteration order is an
    implementation detail, so the content of each view is compared after
    sorting instead of against one hard-coded order. The views are still
    required to agree with each other on the order.
    """
    idict = {'Content-Type': 'text/html', 'X-Forwarded-For': ['ip1', 'ip2']}
    h = Headers(idict)
    expected_items = [('Content-Type', ['text/html']),
                      ('X-Forwarded-For', ['ip1', 'ip2'])]

    self.assertEqual(dict(h), dict(expected_items))
    self.assertEqual(sorted(h.keys()), ['Content-Type', 'X-Forwarded-For'])
    self.assertEqual(sorted(h.items()), expected_items)
    self.assertEqual(sorted(h.iteritems()), expected_items)
    # values() gives the last value of every header
    self.assertEqual(sorted(h.values()), ['ip2', 'text/html'])

    # whatever the order is, all views must use the same one
    self.assertEqual(list(h.iteritems()), h.items())
    self.assertEqual(h.keys(), [name for name, _ in h.items()])
def test_appendlist(self):
    """``appendlist`` extends an existing header and creates a missing one."""
    expected = ['value1', 'value3']

    prefilled = Headers({'header1': 'value1'})
    prefilled.appendlist('header1', 'value3')
    self.assertListEqual(prefilled.getlist('header1'), expected)

    empty = Headers()
    for value in ('value1', 'value3'):
        empty.appendlist('header1', value)
    self.assertListEqual(empty.getlist('header1'), expected)
def test_setdefault(self):
    """``setdefault`` returns the list object that ``getlist`` then returns.

    When a list is given as default, that list object itself is not the one
    returned by ``getlist``.
    """
    name = 'X-Forwarded-For'

    # list default
    given = ['ip1', 'ip2']
    headers = Headers()
    returned = headers.setdefault(name, given)
    self.assertIsNot(headers.getlist(name), given)
    self.assertIs(headers.getlist(name), returned)

    # single value default
    headers = Headers()
    returned = headers.setdefault(name, 'ip1')
    self.assertEqual(headers.getlist(name), ['ip1'])
    self.assertIs(headers.getlist(name), returned)
def connectionMade(self):
    """Send the request described by the factory over the new connection.

    The command line, one header line per header value, the end of the
    headers and the body (when the factory has one) are written, in that
    order.
    """
    self.headers = Headers()
    factory = self.factory

    # method command
    self.sendCommand(factory.method, factory.path)

    # headers: a multi-valued header is sent once per value
    for name, values in factory.headers.iteritems():
        for value in values:
            self.sendHeader(name, value)
    self.endHeaders()

    # body
    payload = factory.body
    if payload is not None:
        self.transport.write(payload)
def test_http_encoding_header(self):
    """Only a valid ``charset`` on ``Content-Type`` yields an encoding."""
    with_charset = Headers({'Content-Type': 'text/html; charset=ISO-8859-4'})
    self.assertEqual(get_encoding_from_headers(with_charset), 'iso8859-4')

    without_encoding = (
        # charset given on a header other than Content-Type
        {'Something-else': 'text/html; charset=ISO-8859-4'},
        # no charset parameter: None is expected, not a fallback such as
        # 'cp1252'
        {'Content-Type': 'text/html'},
        {'Content-Type': 'application/xhtml+xml'},
        # invalid encoding
        {'Content-Type': 'text/html; charset='},
    )
    for raw in without_encoding:
        headers = Headers(raw)
        self.assertIsNone(get_encoding_from_headers(headers))
def test_non_standard_line_endings(self):
    """Response lines terminated by a bare LF are parsed into headers."""
    factory = CrawlmiHTPPClientFactory(Request(url='http://foo/bar'))
    protocol = CrawlmiHTTPClient()
    protocol.factory = factory
    protocol.headers = Headers()

    received = (
        'HTTP/1.0 200 OK\n',
        'Hello: World\n',
        'Foo: Bar\n',
        '\n',
    )
    for chunk in received:
        protocol.dataReceived(chunk)

    expected = Headers({'Hello': ['World'], 'Foo': ['Bar']})
    self.assertEqual(protocol.headers, expected)
def _cb_body_done(self, result, request, url):
    """Build the response object from a finished download.

    ``result`` is unpacked as ``(txresponse, body, flags)``. The response
    class is chosen by ``resp_factory.from_args`` from the headers and url.
    """
    # NOTE(review): flags is unpacked but not passed on to the response -
    # confirm this is intentional.
    txresponse, body, flags = result
    status_code = int(txresponse.code)
    raw_headers = txresponse.headers.getAllRawHeaders()
    headers = Headers(raw_headers)
    make_response = resp_factory.from_args(headers=headers, url=url)
    return make_response(
        url=url,
        status=status_code,
        headers=headers,
        body=body,
        request=request)
def __init__(self, url, callback=None, method='GET', headers={}, params={},
             body='', cookies=None, meta={}, errback=None, proxy=None,
             priority=0, history=[], encoding='utf-8'):
    """Store the request description.

    ``headers`` is wrapped in a ``Headers`` object using ``encoding``;
    ``meta`` and ``history`` are stored as a new dict and a new list.
    ``method``, ``url`` (together with ``params``) and ``body`` go through
    the ``_prepare_*`` helpers and are kept in underscore-prefixed
    attributes.
    """
    # mutable attributes
    self.headers = Headers(headers, encoding)
    self.meta = dict(meta)
    self.history = list(history)
    self.cookies = cookies if cookies else {}
    self.callback, self.errback = callback, errback
    self.proxy, self.priority = proxy, priority

    # following attributes are immutable
    # (the encoding is stored before the _prepare_* helpers are called)
    self._encoding = encoding
    self._method = self._prepare_method(method)
    self._url = self._prepare_url(url, params)
    self._body = self._prepare_body(body)
def test_add(self):
    """``add`` extends an existing header and creates a missing one."""
    expected = ['value1', 'value3']

    prefilled = Headers({'header1': 'value1'})
    prefilled.add('header1', 'value3')
    self.assertEqual(prefilled.getlist('header1'), expected)

    empty = Headers()
    for value in ('value1', 'value3'):
        empty.add('header1', value)
    self.assertEqual(empty.getlist('header1'), expected)
def retrieve_response(self, request):
    """Rebuild the stored response for ``request``.

    Returns ``None`` when ``_read_data`` has no data for the request;
    otherwise returns an instance of the class chosen by ``from_args``.
    """
    cached = self._read_data(request)
    if cached is None:
        return  # not cached

    url, status = cached['url'], cached['status']
    headers = Headers(cached['headers'])
    body = cached['body']

    response_cls = from_args(headers=headers, url=url)
    return response_cls(url=url, headers=headers, status=status, body=body)
def test_to_string(self):
    """``to_string`` renders one ``Name: value`` line per header, CRLF-joined.

    The headers are built from a plain dict, so the order of the rendered
    lines follows the dict iteration order, which is an implementation
    detail. The output is therefore split on CRLF and compared as a sorted
    list of lines rather than against one hard-coded order.
    """
    expected_lines = ['Accept: gzip', 'Content-Type: text/html']
    sources = (
        # single values
        {'Content-type': 'text/html', 'Accept': 'gzip'},
        # the same values given as lists
        {'Content-type': ['text/html'], 'Accept': ['gzip']},
    )
    for source in sources:
        rendered = Headers(source).to_string()
        self.assertEqual(sorted(rendered.split('\r\n')), expected_lines)
def test_invalid_status(self):
    """A non-numeric status code fails the download with BadHttpHeaderError."""
    factory = CrawlmiHTPPClientFactory(Request(url='http://foo/bar'))
    protocol = CrawlmiHTTPClient()
    protocol.factory = factory
    protocol.makeConnection(StringTransport())
    protocol.headers = Headers()

    received = (
        'HTTP/1.0 BUG OK\r\n',
        'Hello: World\r\n',
        'Foo: Bar\r\n',
        '\r\n',
    )
    for chunk in received:
        protocol.dataReceived(chunk)
    protocol.handleResponse('')

    return self.assertFailure(factory.deferred, BadHttpHeaderError)
def test_update(self):
    """``update`` accepts both single values and lists of values."""
    new_values = {
        'Content-Type': 'text/html',
        'X-Forwarded-For': ['ip1', 'ip2'],
    }
    headers = Headers()
    headers.update(new_values)

    self.assertEqual(headers.getlist('Content-Type'), ['text/html'])
    self.assertEqual(headers.getlist('X-Forwarded-For'), ['ip1', 'ip2'])
def test_basics(self):
    """Membership and lookups for present and missing headers."""
    h = Headers({'Content-Type': 'text/html', 'Content-Length': 1234})
    for name in ('Content-Type', 'Content-Length'):
        self.assertIn(name, h)

    # a missing header: item access raises, get/getlist use the default
    with self.assertRaises(KeyError):
        h['Accept']
    self.assertEqual(h.get('Accept'), None)
    self.assertEqual(h.getlist('Accept'), [])
    self.assertEqual(h.get('Accept', '*/*'), '*/*')
    self.assertEqual(h.getlist('Accept', '*/*'), ['*/*'])
    list_default = ['text/html', 'images/jpeg']
    self.assertEqual(h.getlist('Accept', list_default),
                     ['text/html', 'images/jpeg'])
def __init__(self, url, status=200, headers={}, body=None, request=None,
             flags=None):
    """Store the response description.

    ``status`` is converted with ``int``, ``headers`` is wrapped in a
    ``Headers`` object, ``flags`` is stored as a new list (empty when
    ``None``) and a falsy ``body`` is stored as ``''``.
    """
    self.url = url
    self.status = int(status)
    self.headers = Headers(headers)
    self.request = request
    if flags is None:
        self.flags = []
    else:
        self.flags = list(flags)

    # following attributes are immutable
    self._body = body if body else ''
def test_from_headers(self):
    """``from_headers`` selects the response class from the headers alone."""
    cases = (
        (HtmlResponse, {'Content-Type': ['text/html; charset=utf-8']}),
        (TextResponse, {
            'Content-Type': ['application/octet-stream'],
            'Content-Disposition': ['attachment; filename=data.txt'],
        }),
        (Response, {
            'Content-Type': ['text/html; charset=utf-8'],
            'Content-Encoding': ['gzip'],
        }),
    )
    for expected, raw in cases:
        actual = from_headers(Headers(raw))
        self.assertIs(
            actual, expected,
            'Expected: %s Received: %s' % (expected.__name__,
                                           actual.__name__))
def test_iterables(self):
    """Check the dict-like views: ``keys``, ``items``, ``iteritems``, ``values``.

    The headers are built from a plain dict, whose iteration order is an
    implementation detail, so the content of each view is compared after
    sorting instead of against one hard-coded order. The views are still
    required to agree with each other on the order.
    """
    idict = {
        'Content-Type': 'text/html',
        'X-Forwarded-For': ['ip1', 'ip2']
    }
    h = Headers(idict)
    expected_items = [
        ('Content-Type', ['text/html']),
        ('X-Forwarded-For', ['ip1', 'ip2']),
    ]

    self.assertEqual(dict(h), dict(expected_items))
    self.assertEqual(sorted(h.keys()), ['Content-Type', 'X-Forwarded-For'])
    self.assertEqual(sorted(h.items()), expected_items)
    self.assertEqual(sorted(h.iteritems()), expected_items)
    # values() gives the last value of every header
    self.assertEqual(sorted(h.values()), ['ip2', 'text/html'])

    # whatever the order is, all views must use the same one
    self.assertEqual(list(h.iteritems()), h.items())
    self.assertEqual(h.keys(), [name for name, _ in h.items()])
def test_encode_latin1(self):
    """A unicode value is stored encoded with the ``latin1`` encoding."""
    headers = Headers({u'key': u'\xa3'}, encoding='latin1')
    stored = dict(headers).items()
    _name, values = stored[0]
    self.assertEqual(values[0], '\xa3')
class CrawlmiHTPPClientFactory(HTTPClientFactory):
    """Client factory that performs one HTTP request.

    ``self.deferred`` gets ``_build_response`` as its callback, so it fires
    with the response object built from the downloaded body.
    """

    protocol = CrawlmiHTTPClient
    waiting = 1
    noisy = False
    followRedirect = False
    afterFoundGet = False

    def __init__(self, request, timeout=180, download_size=0):
        """Prepare the factory state for downloading ``request``.

        The url is stored without its fragment, the request headers are
        wrapped in a new ``Headers`` object and the ``Host``,
        ``Content-Length`` and ``Connection`` headers are filled in.
        """
        # Limits and bookkeeping.
        self.timeout = timeout
        self.download_size = download_size
        self.invalid_headers = []
        self.response_headers = None

        # What is going to be sent.
        url_without_fragment = urldefrag(request.url)[0]
        self.url = url_without_fragment
        self.method = request.method
        self.body = request.body or None
        self.headers = Headers(request.headers)

        self.start_time = time()
        result = defer.Deferred()
        result.addCallback(self._build_response, request)
        self.deferred = result

        # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329.
        # As Crawlmi implements its own logic to handle redirects, there is
        # no need to add the callback _waitForDisconnect.
        # Specifically this avoids the AttributeError exception when
        # clientConnectionFailed method is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        if self.body is None:
            return
        # set Content-Length based len of body
        self.headers['Content-Length'] = len(self.body)
        # just in case a broken http/1.1 decides to keep connection alive
        self.headers.setdefault('Connection', 'close')

    def _build_response(self, body, request):
        """Create the response object for the downloaded ``body``.

        Raises ``BadHttpHeaderError`` when any invalid header line was
        recorded in ``self.invalid_headers``.
        """
        if self.invalid_headers:
            raise BadHttpHeaderError('Invalid headers received: %s' %
                                     self.invalid_headers)

        headers = self.response_headers
        response_cls = resp_factory.from_args(headers=headers, url=self.url)
        response = response_cls(
            url=self.url,
            status=self.status,
            headers=headers,
            body=body,
            request=request)
        # time between creating the factory and receiving the headers
        response.download_latency = self.headers_time - self.start_time
        return response

    def _set_connection_attributes(self, request):
        """Derive scheme, netloc, host, port and path from the request.

        With a proxy, scheme, host and port come from the proxy url and the
        path becomes the full request url.
        """
        parsed = _parse_url_args(request.url)
        self.scheme, self.netloc, self.host, self.port, self.path = parsed
        if not request.proxy:
            return
        proxy_parts = _parse_url_args(request.proxy)
        self.scheme, _, self.host, self.port, _ = proxy_parts
        self.path = self.url

    def gotStatus(self, version, status, message):
        """Store the parts of the status line; ``status`` is stored as int."""
        status_code = int(status)
        self.version = version
        self.status = status_code
        self.message = message

    def gotHeaders(self, headers):
        """Store the response headers and the time they were received."""
        self.headers_time = time()
        self.response_headers = headers
def __init__(self, body, encoding):
    """Store ``body`` and empty headers; a truthy ``encoding`` is
    advertised in the ``Content-Type`` header."""
    self.body = body
    self.headers = Headers()
    if not encoding:
        return
    content_type = 'text/html; charset=' + encoding
    self.headers['Content-Type'] = content_type
class CrawlmiHTTPClient(HTTPClient):
    """HTTP client protocol that sends the factory's request and reports back.

    The status line, the collected response headers and the body (or the
    failure) are handed over to ``self.factory``.
    """

    # Lines are split on a bare LF; lineReceived strips the trailing
    # whitespace, so CRLF and LF terminated responses are both handled.
    delimiter = '\n'

    def __init__(self):
        # number of body bytes received so far (see handleResponsePart)
        self.body_size = 0

    def connectionMade(self):
        """Send the request line, the headers and the optional body."""
        self.headers = Headers()

        # method command
        self.sendCommand(self.factory.method, self.factory.path)
        # headers: a multi-valued header is sent once per value
        for key, values in self.factory.headers.iteritems():
            for value in values:
                self.sendHeader(key, value)
        self.endHeaders()
        # body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def extractHeader(self, header):
        """Split a raw ``name: value`` line and pass it to ``handleHeader``.

        A ``Content-Length`` header additionally sets ``self.length``.
        Raises ``ValueError`` for a line without a colon or for a
        non-integer content length.
        """
        key, val = header.split(':', 1)
        val = val.lstrip()
        self.handleHeader(key, val)
        if key.lower() == 'content-length':
            self.length = int(val)

    def lineReceived(self, line):
        """Parse one received line.

        If parsing raises, the line is recorded in the factory's
        ``invalid_headers`` and the connection is dropped.
        """
        try:
            HTTPClient.lineReceived(self, line.rstrip())
        except Exception:
            # Only real errors mark the line as invalid. A bare ``except``
            # would also swallow KeyboardInterrupt and SystemExit.
            self.factory.invalid_headers.append(line)
            # prefer an immediate abort when the transport offers one
            if hasattr(self.transport, 'abortConnection'):
                self.transport.abortConnection()
            else:
                self.transport.loseConnection()

    def handleHeader(self, key, value):
        """Collect a response header value."""
        self.headers.add(key, value)

    def handleStatus(self, version, status, message):
        """Forward the status line to the factory."""
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        """Hand the collected response headers to the factory."""
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        """Remember why the connection was lost and report it to the factory."""
        # stored first: handleResponse reads it when reporting a failure
        self._connection_lost_reason = reason
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        """Deliver the finished response (or a failure) to the factory.

        A HEAD request always yields an empty page. When ``self.length`` is
        still positive the download is reported as failed with the reason
        the connection was lost.
        """
        if self.factory.method.upper() == 'HEAD':
            self.factory.page('')
        elif self.length is not None and self.length > 0:
            # NOTE(review): presumably length holds the number of bytes
            # still expected - confirm against twisted's HTTPClient.
            self.factory.noPage(self._connection_lost_reason)
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        """Drop the connection and fail the download with a TimeoutError."""
        self.transport.loseConnection()
        self.factory.noPage(
            defer.TimeoutError('Getting %s took longer than %s seconds.' %
                               (self.factory.url, self.factory.timeout)))

    def handleResponsePart(self, data):
        """Account for received body data and enforce the size limit.

        A falsy ``factory.download_size`` disables the limit.
        """
        HTTPClient.handleResponsePart(self, data)
        self.body_size += len(data)
        if (self.factory.download_size and
                self.body_size > self.factory.download_size):
            self.transport.loseConnection()
            self.factory.noPage(
                DownloadSizeError('Response exceeded %s bytes.' %
                                  self.factory.download_size))
def test_early_headers(self):
    """Check the raw request produced for several kinds of requests."""

    def multivalued():
        # a fresh dict for every request that needs these headers
        return {
            'X-Meta-Single': 'single',
            'X-Meta-Multivalued': ['value1', 'value2']
        }

    # basic test stolen from twisted HTTPageGetter
    request = Request(url='http://foo/bar', body='some data', headers={
        'Host': 'example.net',
        'User-Agent': 'fooble',
        'Cookie': 'blah blah',
        'Content-Length': '12981',
        'Useful': 'value'
    })
    expected = (
        'GET /bar HTTP/1.0\r\n'
        'Content-Length: 9\r\n'
        'Useful: value\r\n'
        'Connection: close\r\n'
        'User-Agent: fooble\r\n'
        'Host: example.net\r\n'
        'Cookie: blah blah\r\n'
        '\r\n'
        'some data')
    self._test(CrawlmiHTPPClientFactory(request), expected)

    # test minimal sent headers
    request = Request('http://foo/bar')
    expected = (
        'GET /bar HTTP/1.0\r\n'
        'Host: foo\r\n'
        '\r\n')
    self._test(CrawlmiHTPPClientFactory(request), expected)

    # test a simple POST with body and content-type
    request = Request(
        method='POST',
        url='http://foo/bar',
        body='name=value',
        headers={'Content-Type': 'application/x-www-form-urlencoded'})
    expected = (
        'POST /bar HTTP/1.0\r\n'
        'Host: foo\r\n'
        'Connection: close\r\n'
        'Content-Type: application/x-www-form-urlencoded\r\n'
        'Content-Length: 10\r\n'
        '\r\n'
        'name=value')
    self._test(CrawlmiHTPPClientFactory(request), expected)

    # the next two requests must produce the same raw output
    expected = (
        'GET /bar HTTP/1.0\r\n'
        'Host: foo\r\n'
        'X-Meta-Multivalued: value1\r\n'
        'X-Meta-Multivalued: value2\r\n'
        'X-Meta-Single: single\r\n'
        '\r\n')

    # test with single and multivalued headers
    request = Request(url='http://foo/bar', headers=multivalued())
    self._test(CrawlmiHTPPClientFactory(request), expected)

    # same test with single and multivalued headers but using Headers class
    request = Request(url='http://foo/bar', headers=Headers(multivalued()))
    self._test(CrawlmiHTPPClientFactory(request), expected)
def test_single_value(self):
    """A single assigned value is returned as-is, and as a one-item list
    by ``getlist``."""
    name = 'Content-Type'
    headers = Headers()
    headers[name] = 'text/html'

    self.assertEqual(headers[name], 'text/html')
    self.assertEqual(headers.get(name), 'text/html')
    self.assertEqual(headers.getlist(name), ['text/html'])
def test_clear(self):
    """``clear`` removes the headers that were present."""
    headers = Headers({'a': 'b'})
    # present before clearing ...
    self.assertIn('a', headers)
    headers.clear()
    # ... and gone afterwards
    self.assertNotIn('a', headers)
class CrawlmiHTPPClientFactory(HTTPClientFactory):
    """Client factory that performs one HTTP request.

    ``self.deferred`` gets ``_build_response`` as its callback, so it fires
    with the response object built from the downloaded body.
    """

    protocol = CrawlmiHTTPClient
    waiting = 1
    noisy = False
    followRedirect = False
    afterFoundGet = False

    def __init__(self, request, timeout=180, download_size=0):
        """Prepare the factory state for downloading ``request``.

        The url is stored without its fragment, the request headers are
        wrapped in a new ``Headers`` object and the ``Host``,
        ``Content-Length`` and ``Connection`` headers are filled in.
        """
        # Limits and bookkeeping.
        self.timeout = timeout
        self.download_size = download_size
        self.invalid_headers = []
        self.response_headers = None

        # What is going to be sent.
        url_without_fragment = urldefrag(request.url)[0]
        self.url = url_without_fragment
        self.method = request.method
        self.body = request.body or None
        self.headers = Headers(request.headers)

        self.start_time = time()
        result = defer.Deferred()
        result.addCallback(self._build_response, request)
        self.deferred = result

        # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329.
        # As Crawlmi implements its own logic to handle redirects, there is
        # no need to add the callback _waitForDisconnect.
        # Specifically this avoids the AttributeError exception when
        # clientConnectionFailed method is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        if self.body is None:
            return
        # set Content-Length based len of body
        self.headers['Content-Length'] = len(self.body)
        # just in case a broken http/1.1 decides to keep connection alive
        self.headers.setdefault('Connection', 'close')

    def _build_response(self, body, request):
        """Create the response object for the downloaded ``body``.

        Raises ``BadHttpHeaderError`` when any invalid header line was
        recorded in ``self.invalid_headers``.
        """
        if self.invalid_headers:
            raise BadHttpHeaderError('Invalid headers received: %s' %
                                     self.invalid_headers)

        headers = self.response_headers
        response_cls = resp_factory.from_args(headers=headers, url=self.url)
        response = response_cls(
            url=self.url,
            status=self.status,
            headers=headers,
            body=body,
            request=request)
        # time between creating the factory and receiving the headers
        response.download_latency = self.headers_time - self.start_time
        return response

    def _set_connection_attributes(self, request):
        """Derive scheme, netloc, host, port and path from the request.

        With a proxy, scheme, host and port come from the proxy url and the
        path becomes the full request url.
        """
        parsed = _parse_url_args(request.url)
        self.scheme, self.netloc, self.host, self.port, self.path = parsed
        if not request.proxy:
            return
        proxy_parts = _parse_url_args(request.proxy)
        self.scheme, _, self.host, self.port, _ = proxy_parts
        self.path = self.url

    def gotStatus(self, version, status, message):
        """Store the parts of the status line; ``status`` is stored as int."""
        status_code = int(status)
        self.version = version
        self.status = status_code
        self.message = message

    def gotHeaders(self, headers):
        """Store the response headers and the time they were received."""
        self.headers_time = time()
        self.response_headers = headers
def test_delete_and_contains(self):
    """``del`` removes a header, which ``in`` then no longer reports."""
    name = 'Content-Type'
    headers = Headers()
    headers[name] = 'text/html'
    self.assertIn(name, headers)

    del headers[name]
    self.assertNotIn(name, headers)
def test_encode_utf8(self):
    """Unicode keys and values are stored as ``str``, encoded as utf-8."""
    headers = Headers({u'key': u'\xa3'}, encoding='utf-8')
    stored = dict(headers).items()
    name, values = stored[0]
    first_value = values[0]

    self.assertIsInstance(name, str)
    self.assertIsInstance(first_value, str)
    self.assertEqual(first_value, '\xc2\xa3')
def test_update(self):
    """``update`` accepts both single values and lists of values."""
    new_values = {
        'Content-Type': 'text/html',
        'X-Forwarded-For': ['ip1', 'ip2'],
    }
    headers = Headers()
    headers.update(new_values)

    self.assertEqual(headers.getlist('Content-Type'), ['text/html'])
    self.assertEqual(headers.getlist('X-Forwarded-For'), ['ip1', 'ip2'])
def test_encode_multiple(self):
    """Unicode values given inside a list are encoded as utf-8 too."""
    headers = Headers({u'key': [u'\xa3']}, encoding='utf-8')
    stored = dict(headers).items()
    _name, values = stored[0]
    self.assertEqual(values[0], '\xc2\xa3')