Exemplo n.º 1
0
 def handleResponse(self, response):
     """Archive the buffered response as a WARC record, then delegate.

     ``response`` is appended to ``self.block_buffer``; the buffer's full
     contents become the block of a ``WarcResponseRecord`` for
     ``self.factory.url``, which is written to ``self._warcout`` before the
     parent class handles the response.
     """
     buf = self.block_buffer
     buf.write(response)
     payload = buf.getvalue()

     warc_record = warcrecords.WarcResponseRecord(
         url=self.factory.url,
         block=payload,
     )
     self._warcout.write_record(warc_record)

     # Let the regular page getter finish processing the response.
     ScrapyHTTPPageGetter.handleResponse(self, response)
Exemplo n.º 2
0
 def handleHeader(self, key, value):
     """Record the header, then drop the connection unless it is HTML.

     The header is first passed to ``ScrapyHTTPPageGetter.handleHeader``.
     For a ``Content-Type`` header only the media type is compared with
     ``text/html``; parameters such as ``; charset=utf-8`` are ignored so
     that ordinary HTML responses are not rejected.

     :param key: header name, matched case-insensitively.
     :param value: raw header value.
     """
     ScrapyHTTPPageGetter.handleHeader(self, key, value)
     if key.lower() != 'content-type':
         return
     # Strip optional parameters ("; charset=...") before comparing.
     media_type = value.split(';', 1)[0].strip().lower()
     if media_type != 'text/html':
         self.connectionLost('unqualified_type')
Exemplo n.º 3
0
 def connectionMade(self):
     """Capture the outgoing request, send it, and archive it as WARC.

     ``self.transport`` is temporarily replaced by an in-memory ``StringIO``
     so that whatever ``ScrapyHTTPPageGetter.connectionMade`` writes can be
     captured. The captured text is then written to the real transport and
     stored as a ``WarcRequestRecord`` for ``self.factory.url`` via
     ``self._warcout``.
     """
     real_transport = self.transport
     fake_transport = StringIO()
     self.transport = fake_transport
     try:
         ScrapyHTTPPageGetter.connectionMade(self)
     finally:
         # Always put the real transport back, even if the parent raised,
         # so later calls on self.transport never hit the StringIO stand-in.
         self.transport = real_transport

     send_string = fake_transport.getvalue()
     real_transport.write(send_string)

     record = warcrecords.WarcRequestRecord(url=self.factory.url, block=send_string)
     self._warcout.write_record(record)
Exemplo n.º 4
0
 def handleHeader(self, key, value):
     """Record the header and drop the connection on an oversized body.

     After delegating to ``ScrapyHTTPPageGetter.handleHeader``, a
     ``Content-Length`` header whose integer value exceeds
     ``MAX_RESPONSE_SIZE`` triggers ``self.connectionLost('oversized')``.
     """
     ScrapyHTTPPageGetter.handleHeader(self, key, value)
     if key.lower() != 'content-length':
         return
     if int(value) > MAX_RESPONSE_SIZE:
         self.connectionLost('oversized')
Exemplo n.º 5
0
 def handleHeader(self, key, value):
     """Record the header and abort GET requests whose body is too large.

     After delegating to ``ScrapyHTTPPageGetter.handleHeader``, the
     connection is dropped when the factory's method is GET and the
     ``Content-Length`` value, read as an integer, exceeds
     ``MAX_RESPONSE_SIZE``.
     """
     ScrapyHTTPPageGetter.handleHeader(self, key, value)
     # Guards run in the same order as a single short-circuiting condition.
     if self.factory.method.upper() != 'GET':
         return
     if key.lower() != 'content-length':
         return
     if int(value) > MAX_RESPONSE_SIZE:
         self.connectionLost('response_too_big: %s' % value)
 def handleHeader(self, key, value):
     """Record the header; drop oversized responses to GET requests.

     The header is passed to ``ScrapyHTTPPageGetter.handleHeader`` first.
     If the factory's method is GET and this is a ``Content-Length`` header
     whose integer value is greater than ``MAX_RESPONSE_SIZE``,
     ``connectionLost`` is called with a ``response_too_big`` reason.
     """
     ScrapyHTTPPageGetter.handleHeader(self, key, value)
     is_get = self.factory.method.upper() == 'GET'
     if is_get and key.lower() == 'content-length':
         declared_size = int(value)
         if declared_size > MAX_RESPONSE_SIZE:
             self.connectionLost('response_too_big: %s' % value)
Exemplo n.º 7
0
 def lineReceived(self, line):
     """Archive a received line with a CRLF terminator, then delegate.

     Trailing whitespace is stripped from ``line``; the stripped text plus
     a CRLF is appended to ``self.block_buffer`` and the stripped text is
     handed to ``ScrapyHTTPPageGetter.lineReceived``, whose result is
     returned.
     """
     # The incoming line lacks its "\n"; rstrip() removes the leftover "\r"
     # (and any other trailing whitespace) before CRLF is added back.
     stripped = line.rstrip()
     self.block_buffer.write(stripped + '\r\n')
     return ScrapyHTTPPageGetter.lineReceived(self, stripped)
Exemplo n.º 8
0
 def handleHeader(self, key, value):
     """Record the header; give up when the declared length is too big.

     Delegates to ``ScrapyHTTPPageGetter.handleHeader`` and then calls
     ``self.connectionLost('oversized')`` if the header is
     ``Content-Length`` and its integer value exceeds
     ``MAX_RESPONSE_SIZE``.
     """
     ScrapyHTTPPageGetter.handleHeader(self, key, value)
     is_length_header = key.lower() == 'content-length'
     # int(value) is only evaluated for Content-Length headers.
     if is_length_header and int(value) > MAX_RESPONSE_SIZE:
         self.connectionLost('oversized')