def test_appendlist(self):
    """appendlist() accumulates extra values under an existing key and
    creates the key on first use; getlist() returns them in order."""
    # Key already present: new value is appended after the existing one.
    hdrs = Headers({'header1': 'value1'})
    hdrs.appendlist('header1', 'value3')
    self.assertEqual(hdrs.getlist('header1'), ['value1', 'value3'])

    # Key absent: appendlist creates it, then accumulates.
    hdrs = Headers()
    hdrs.appendlist('header1', 'value1')
    hdrs.appendlist('header1', 'value3')
    self.assertEqual(hdrs.getlist('header1'), ['value1', 'value3'])
def test_appendlist(self):
    """appendlist() accumulates values under a key (creating it when
    missing); getlist() yields them, normalized to bytes, in order."""
    # Append to an existing key.
    hdrs = Headers({"header1": "value1"})
    hdrs.appendlist("header1", "value3")
    self.assertEqual(hdrs.getlist("header1"), [b"value1", b"value3"])

    # Append starting from an empty Headers object.
    hdrs = Headers()
    hdrs.appendlist("header1", "value1")
    hdrs.appendlist("header1", "value3")
    self.assertEqual(hdrs.getlist("header1"), [b"value1", b"value3"])
class ScrapyHTTPPageGetter(HTTPClient):
    """Twisted HTTP client protocol that issues the request described by
    ``self.factory`` and reports status line, headers and body back to the
    factory. Malformed header lines are recorded via
    ``factory.add_invalid_header`` instead of aborting the download.
    """

    # Response lines are split on bare LF; CR is stripped in lineReceived.
    delimiter = '\n'

    def connectionMade(self):
        self.headers = Headers()  # bucket for response headers
        # Method command
        self.sendCommand(self.factory.method, self.factory.path)
        # Headers (a single key may carry multiple values)
        for key, values in self.factory.headers.items():
            for value in values:
                self.sendHeader(key, value)
        self.endHeaders()
        # Body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def extractHeader(self, header):
        # Split "Name: value" at the first colon only (values may contain
        # colons, e.g. in URLs) and remember Content-Length so
        # handleResponse knows the expected body size.
        key, val = header.split(':', 1)
        val = val.lstrip()
        self.handleHeader(key, val)
        if key.lower() == 'content-length':
            self.length = int(val)

    def lineReceived(self, line):
        try:
            HTTPClient.lineReceived(self, line.rstrip())
        except Exception:
            # FIX: was a bare ``except:`` which also swallowed SystemExit
            # and KeyboardInterrupt. A header line that cannot be parsed
            # is recorded as invalid instead of killing the download.
            self.factory.add_invalid_header(line)

    def handleHeader(self, key, value):
        self.headers.appendlist(key, value)

    def handleStatus(self, version, status, message):
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        # HEAD responses carry no body by definition.
        if self.factory.method.upper() == 'HEAD':
            self.factory.page('')
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        self.transport.loseConnection()
        self.factory.noPage(
            defer.TimeoutError("Getting %s took longer than %s seconds." %
                               (self.factory.url, self.factory.timeout)))
class ScrapyHTTPPageGetter(HTTPClient):
    """Twisted HTTP client protocol (bytes-oriented variant) that sends the
    request described by ``self.factory`` and reports status line, headers
    and body back through factory callbacks.
    """

    # Response lines are split on bare LF (bytes, as required by the
    # bytes-based protocol API); CR is stripped in lineReceived.
    delimiter = b'\n'

    def connectionMade(self):
        self.headers = Headers()  # bucket for response headers

        # Method command
        self.sendCommand(self.factory.method, self.factory.path)
        # Headers (a single key may carry multiple values)
        for key, values in self.factory.headers.items():
            for value in values:
                self.sendHeader(key, value)
        self.endHeaders()
        # Body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def lineReceived(self, line):
        # Strip trailing CR/whitespace before handing off to the base
        # class parser.
        return HTTPClient.lineReceived(self, line.rstrip())

    def handleHeader(self, key, value):
        self.headers.appendlist(key, value)

    def handleStatus(self, version, status, message):
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        # Remember why the connection dropped so handleResponse can report
        # a truncated body as a failure rather than a successful page.
        self._connection_lost_reason = reason
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        if self.factory.method.upper() == b'HEAD':
            # HEAD responses carry no body by definition.
            self.factory.page(b'')
        elif self.length is not None and self.length > 0:
            # self.length is the number of body bytes still expected (set
            # by the base class from Content-Length); non-zero here means
            # the body was truncated -- report the disconnect reason.
            self.factory.noPage(self._connection_lost_reason)
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        self.transport.loseConnection()

        # transport cleanup needed for HTTPS connections
        if self.factory.url.startswith(b'https'):
            self.transport.stopProducing()

        self.factory.noPage(\
            defer.TimeoutError("Getting %s took longer than %s seconds." % \
                (self.factory.url, self.factory.timeout)))
class ScrapyHTTPPageGetter(HTTPClient):
    """Twisted HTTP client protocol that issues the request described by
    ``self.factory`` and reports status line, headers and body back to the
    factory. A body shorter than the announced Content-Length is reported
    as a ``PartialDownloadError``.
    """

    # Response lines are split on bare LF; CR is stripped in lineReceived.
    delimiter = '\n'

    def connectionMade(self):
        self.headers = Headers()  # bucket for response headers
        # Method command
        self.sendCommand(self.factory.method, self.factory.path)
        # Headers (a single key may carry multiple values)
        for key, values in self.factory.headers.items():
            for value in values:
                self.sendHeader(key, value)
        self.endHeaders()
        # Body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def lineReceived(self, line):
        return HTTPClient.lineReceived(self, line.rstrip())

    def handleHeader(self, key, value):
        self.headers.appendlist(key, value)

    def handleStatus(self, version, status, message):
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        if self.factory.method.upper() == 'HEAD':
            # HEAD responses carry no body by definition.
            self.factory.page('')
        elif self.length is not None and self.length != 0:
            # FIX: was ``self.length != None`` -- None must be tested with
            # an identity check. A non-zero remaining length means the
            # body arrived truncated.
            self.factory.noPage(failure.Failure(
                PartialDownloadError(self.factory.status, None, response)))
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        self.transport.loseConnection()
        self.factory.noPage(
            defer.TimeoutError("Getting %s took longer than %s seconds." %
                               (self.factory.url, self.factory.timeout)))
def test_netscape_example_2(self):
    """Second example from the Netscape cookie spec: path-scoped cookies
    produce two same-named name/value pairs for a more specific path.

    Uses ``assertEqual``/``assertTrue`` -- the ``assertEquals``/``assert_``
    aliases are deprecated and removed in Python 3.12.
    """
    # Second Example transaction sequence:
    #
    # Assume all mappings from above have been cleared.
    #
    # Client receives:
    #
    #       Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/
    #
    # When client requests a URL in path "/" on this server, it sends:
    #
    #       Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001
    #
    # Client receives:
    #
    #       Set-Cookie: PART_NUMBER=RIDING_ROCKET_0023; path=/ammo
    #
    # When client requests a URL in path "/ammo" on this server, it sends:
    #
    #       Cookie: PART_NUMBER=RIDING_ROCKET_0023; PART_NUMBER=ROCKET_LAUNCHER_0001
    #
    # NOTE: There are two name/value pairs named "PART_NUMBER" due to
    # the inheritance of the "/" mapping in addition to the "/ammo" mapping.
    c = CookieJar()
    headers = Headers({'Set-Cookie': 'PART_NUMBER=ROCKET_LAUNCHER_0001; path=/'})
    req = Request("http://www.acme.com/")
    res = Response("http://www.acme.com/", headers=headers)

    c.extract_cookies(res, req)

    req = Request("http://www.acme.com/")
    c.add_cookie_header(req)

    self.assertEqual(req.headers.get("Cookie"),
                     "PART_NUMBER=ROCKET_LAUNCHER_0001")

    headers.appendlist("Set-Cookie", "PART_NUMBER=RIDING_ROCKET_0023; path=/ammo")
    res = Response("http://www.acme.com/", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://www.acme.com/ammo")
    c.add_cookie_header(req)

    # Both PART_NUMBER cookies must be present, most specific path first.
    self.assertTrue(re.search(r"PART_NUMBER=RIDING_ROCKET_0023;\s*"
                              "PART_NUMBER=ROCKET_LAUNCHER_0001",
                              req.headers.get("Cookie")))
def test_session_cookies(self):
    """CookieJar.clear_session_cookies() must delete session cookies
    while leaving permanent (expiring) cookies intact.

    Uses ``assertEqual``/``assertNotEqual`` instead of the deprecated
    ``assert_`` alias (removed in Python 3.12), and splits the original
    negated compound assertion into three precise checks.
    """
    year_plus_one = time.localtime()[0] + 1

    # Check session cookies are deleted properly by
    # CookieJar.clear_session_cookies method
    req = Request('http://www.perlmeister.com/scripts')
    headers = Headers()
    headers.appendlist("Set-Cookie", "s1=session;Path=/scripts")
    headers.appendlist(
        "Set-Cookie",
        "p1=perm; Domain=.perlmeister.com;Path=/;expires=Fri, 02-Feb-%d 23:24:20 GMT" % year_plus_one)
    headers.appendlist(
        "Set-Cookie",
        "p2=perm;Path=/;expires=Fri, 02-Feb-%d 23:24:20 GMT" % year_plus_one)
    headers.appendlist("Set-Cookie", "s2=session;Path=/scripts;"
                       "Domain=.perlmeister.com")
    headers.appendlist('Set-Cookie2', 's3=session;Version=1;Discard;Path="/"')
    res = Response('http://www.perlmeister.com/scripts', headers=headers)

    c = CookieJar()
    c.extract_cookies(res, req)

    # Count session/permanent cookies before and after clearing; the
    # cookie *value* encodes which kind it is ("session" or "perm").
    counter = {"session_after": 0,
               "perm_after": 0,
               "session_before": 0,
               "perm_before": 0}
    for cookie in c:
        key = "%s_before" % cookie.value
        counter[key] += 1

    c.clear_session_cookies()

    # How many now?
    for cookie in c:
        key = "%s_after" % cookie.value
        counter[key] += 1

    # no permanent cookie got lost accidentally
    self.assertEqual(counter["perm_after"], counter["perm_before"])
    # every session cookie has been cleared
    self.assertEqual(counter["session_after"], 0)
    # we did have session cookies in the first place
    self.assertNotEqual(counter["session_before"], 0)
def test_netscape_misc(self):
    """Additional Netscape cookie behaviours: dotted host parts are
    allowed in ``domain``, and strange characters in values are sent back
    unquoted.

    Uses ``assertIn`` instead of the deprecated ``assert_`` alias
    (removed in Python 3.12) for clearer failure messages.
    """
    c = CookieJar()
    headers = Headers()
    req = Request("http://foo.bar.acme.com/foo")

    # Netscape allows a host part that contains dots
    headers.appendlist("Set-Cookie", "Customer=WILE_E_COYOTE; domain=.acme.com")
    res = Response("http://www.acme.com/foo", headers=headers)
    c.extract_cookies(res, req)

    # and that the domain is the same as the host without adding a leading
    # dot to the domain. Should not quote even if strange chars are used
    # in the cookie value.
    headers.appendlist("Set-Cookie", "PART_NUMBER=3,4; domain=foo.bar.acme.com")
    res = Response("http://www.acme.com/foo", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://foo.bar.acme.com/foo")
    c.add_cookie_header(req)

    cookie = req.headers.get("Cookie")
    self.assertIn("PART_NUMBER=3,4", cookie)
    self.assertIn("Customer=WILE_E_COYOTE", cookie)
class ScrapyHTTPPageGetter(HTTPClient):
    """HTTP client protocol with optional CONNECT-tunnel support: when
    ``factory.use_tunnel`` is set, a CONNECT request is sent to the proxy
    first, and the real request is issued only after the tunnel is
    established and TLS has been started on the transport.
    """

    # Response lines are split on bare LF; CR is stripped in lineReceived.
    delimiter = '\n'

    def connectionMade(self):
        self.headers = Headers()  # bucket for response headers
        if self.factory.use_tunnel:
            # Ask the proxy to open a raw tunnel to the target host:port;
            # the real request is sent later, from startTunnel().
            log.msg("Sending CONNECT", log.DEBUG)
            self.tunnel_started = False
            self.sendCommand("CONNECT", "%s:%s" % (self.factory.tunnel_to_host,
                                                   self.factory.tunnel_to_port))
            # Only proxy-relevant headers go with CONNECT; drop
            # Proxy-Connection afterwards so it is not re-sent through
            # the tunnel.
            self.sendHeaders(only=['Host','Proxy-Connection', 'User-Agent'])
            del self.factory.headers['Proxy-Connection']
        else:
            self.sendEverything()

    def sendCommand(self, command, path):
        # CONNECT is sent as HTTP/1.1; the tunnelled request itself (and
        # the non-tunnel case) uses HTTP/1.0.
        if self.factory.use_tunnel and not self.tunnel_started:
            http_version = "1.1"
        else:
            http_version = "1.0"
        self.transport.write('%s %s HTTP/%s\r\n' % (command, path, http_version))

    def sendEverything(self):
        # Full request: request line, headers, then body.
        self.sendMethod()
        self.sendHeaders()
        self.sendBody()

    def sendMethod(self):
        # Method command
        self.sendCommand(self.factory.method, self.factory.path)

    def sendHeaders(self, only=None):
        # Note: it's a Headers object, not a dict
        keys = only if only is not None else self.factory.headers.keys()
        for key in keys:
            for value in self.factory.headers.getlist(key):
                self.sendHeader(key, value)
        self.endHeaders()

    def sendBody(self):
        # Body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def lineReceived(self, line):
        if self.factory.use_tunnel and not self.tunnel_started:
            log.msg("LINE: %s" % line)
        if self.factory.use_tunnel and not self.tunnel_started and not line.rstrip():
            # End of headers from the proxy in response to our CONNECT request.
            # Skip the call to HTTPClient.lineReceived for now, since otherwise
            # it would switch to raw mode.
            self.startTunnel()
        else:
            return HTTPClient.lineReceived(self, line.rstrip())

    def startTunnel(self):
        log.msg("starting Tunnel")

        # We'll get a new batch of headers through the tunnel. This sets us
        # up to capture them.
        self.firstLine = True
        self.tunnel_started = True

        # Switch to SSL
        ctx = ClientContextFactory()
        self.transport.startTLS(ctx, self.factory)

        # And send the normal request:
        self.sendEverything()

    def handleHeader(self, key, value):
        if self.factory.use_tunnel and not self.tunnel_started:
            pass # maybe log headers for CONNECT request?
        else:
            self.headers.appendlist(key, value)

    def handleStatus(self, version, status, message):
        if self.factory.use_tunnel and not self.tunnel_started:
            # Status of the CONNECT exchange itself, not of the real
            # request -- keep it separate from the factory's status.
            self.tunnel_status = status
        else:
            self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        # HEAD responses carry no body by definition.
        if self.factory.method.upper() == 'HEAD':
            self.factory.page('')
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        self.transport.loseConnection()
        self.factory.noPage(\
            defer.TimeoutError("Getting %s took longer than %s seconds." % \
                (self.factory.url, self.factory.timeout)))
def test_netscape_example_1(self):
    """First example from the original Netscape cookie spec: cookies are
    returned for matching paths, most specific path first.

    Uses ``assertIn``/``assertNotIn``/``assertTrue`` instead of the
    deprecated ``assert_`` alias (removed in Python 3.12).
    """
    #-------------------------------------------------------------------
    # First we check that it works for the original example at
    # http://www.netscape.com/newsref/std/cookie_spec.html

    # Client requests a document, and receives in the response:
    #
    #       Set-Cookie: CUSTOMER=WILE_E_COYOTE; path=/; expires=Wednesday, 09-Nov-99 23:12:40 GMT
    #
    # When client requests a URL in path "/" on this server, it sends:
    #
    #       Cookie: CUSTOMER=WILE_E_COYOTE
    #
    # Client requests a document, and receives in the response:
    #
    #       Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/
    #
    # When client requests a URL in path "/" on this server, it sends:
    #
    #       Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001
    #
    # Client receives:
    #
    #       Set-Cookie: SHIPPING=FEDEX; path=/fo
    #
    # When client requests a URL in path "/" on this server, it sends:
    #
    #       Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001
    #
    # When client requests a URL in path "/foo" on this server, it sends:
    #
    #       Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001; SHIPPING=FEDEX
    #
    # The last Cookie is buggy, because both specifications say that the
    # most specific cookie must be sent first. SHIPPING=FEDEX is the
    # most specific and should thus be first.

    year_plus_one = time.localtime()[0] + 1

    c = CookieJar(DefaultCookiePolicy(rfc2965=True))

    #req = Request("http://1.1.1.1/",
    #              headers={"Host": "www.acme.com:80"})
    req = Request("http://www.acme.com:80/",
                  headers={"Host": "www.acme.com:80"})

    headers = Headers()
    headers['Set-Cookie'] = ('CUSTOMER=WILE_E_COYOTE; path=/ ; '
                             'expires=Wednesday, 09-Nov-%d 23:12:40 GMT'
                             % year_plus_one)
    res = Response("http://www.acme.com/", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://www.acme.com/")
    c.add_cookie_header(req)

    self.assertEqual(req.headers.get("Cookie"), "CUSTOMER=WILE_E_COYOTE")
    self.assertEqual(req.headers.get("Cookie2"), '$Version="1"')

    headers.appendlist("Set-Cookie", "PART_NUMBER=ROCKET_LAUNCHER_0001; path=/")
    res = Response("http://www.acme.com/", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://www.acme.com/foo/bar")
    c.add_cookie_header(req)

    h = req.headers.get("Cookie")
    self.assertIn("PART_NUMBER=ROCKET_LAUNCHER_0001", h)
    self.assertIn("CUSTOMER=WILE_E_COYOTE", h)

    headers.appendlist('Set-Cookie', 'SHIPPING=FEDEX; path=/foo')
    res = Response("http://www.acme.com", headers=headers)
    c.extract_cookies(res, req)

    req = Request("http://www.acme.com/")
    c.add_cookie_header(req)

    # SHIPPING is scoped to /foo and must not be sent for "/".
    h = req.headers.get("Cookie")
    self.assertIn("PART_NUMBER=ROCKET_LAUNCHER_0001", h)
    self.assertIn("CUSTOMER=WILE_E_COYOTE", h)
    self.assertNotIn("SHIPPING=FEDEX", h)

    req = Request("http://www.acme.com/foo/")
    c.add_cookie_header(req)

    # For "/foo/" the most specific cookie (SHIPPING) must come first.
    h = req.headers.get("Cookie")
    self.assertIn("PART_NUMBER=ROCKET_LAUNCHER_0001", h)
    self.assertIn("CUSTOMER=WILE_E_COYOTE", h)
    self.assertTrue(h.startswith("SHIPPING=FEDEX;"))