def handleResponse(self, response):
    """Archive the response as a WARC response record, then handle it.

    ``response`` is appended to ``self.block_buffer``; the buffer's full
    contents become the block of a ``WarcResponseRecord`` for
    ``self.factory.url``, which is written to ``self._warcout``. The
    response is then passed on to ``ScrapyHTTPPageGetter.handleResponse``.
    """
    buf = self.block_buffer
    buf.write(response)

    # The record block is everything accumulated in the buffer so far,
    # not just the ``response`` argument.
    payload = buf.getvalue()
    archived = warcrecords.WarcResponseRecord(
        url=self.factory.url,
        block=payload,
    )
    self._warcout.write_record(archived)

    ScrapyHTTPPageGetter.handleResponse(self, response)
def handleHeader(self, key, value):
    """Process one response header and reject non-HTML responses.

    The header is first passed to ``ScrapyHTTPPageGetter.handleHeader``.
    If it is a Content-Type header whose media type is not ``text/html``,
    ``connectionLost`` is called with the reason ``'unqualified_type'``.

    Only the media type itself is compared: parameters such as
    ``; charset=utf-8`` and surrounding whitespace are ignored, so a value
    like ``text/html; charset=UTF-8`` is accepted.
    """
    ScrapyHTTPPageGetter.handleHeader(self, key, value)

    # NOTE: a Content-Length based size check used to live here but was
    # disabled; it is intentionally not reinstated by this method.
    if key.lower() != 'content-type':
        return

    # Content-Type may carry parameters after ';' (e.g. charset). Compare
    # only the media type, otherwise ordinary HTML pages get rejected.
    media_type = value.split(';', 1)[0].strip().lower()
    if media_type != 'text/html':
        self.connectionLost('unqualified_type')
def connectionMade(self):
    """Capture the outgoing request, send it, and archive it as WARC.

    The real transport is temporarily replaced by an in-memory buffer while
    ``ScrapyHTTPPageGetter.connectionMade`` runs, so whatever the parent
    writes to ``self.transport`` is collected rather than sent. The
    collected text is then written to the real transport and stored as a
    ``WarcRequestRecord`` for ``self.factory.url``.
    """
    wire = self.transport
    capture = StringIO()

    # Divert the parent's writes into the in-memory buffer, then put the
    # real transport back before anything is actually sent.
    self.transport = capture
    ScrapyHTTPPageGetter.connectionMade(self)
    self.transport = wire

    request_text = capture.getvalue()
    wire.write(request_text)

    archived = warcrecords.WarcRequestRecord(
        url=self.factory.url,
        block=request_text,
    )
    self._warcout.write_record(archived)
def handleHeader(self, key, value):
    """Process one response header and drop oversized responses.

    The header is first passed to ``ScrapyHTTPPageGetter.handleHeader``.
    If it is a Content-Length header whose integer value exceeds
    ``MAX_RESPONSE_SIZE``, ``connectionLost`` is called with the reason
    ``'oversized'``.
    """
    ScrapyHTTPPageGetter.handleHeader(self, key, value)

    # Only Content-Length is of interest; the value is parsed as an
    # integer only for that header.
    if key.lower() != 'content-length':
        return
    if int(value) > MAX_RESPONSE_SIZE:
        self.connectionLost('oversized')
def handleHeader(self, key, value):
    """Process one response header and abort oversized GET responses.

    The header is first passed to ``ScrapyHTTPPageGetter.handleHeader``.
    For requests whose ``self.factory.method`` is GET (case-insensitive),
    a Content-Length value greater than ``MAX_RESPONSE_SIZE`` results in
    ``connectionLost`` being called with ``'response_too_big: <value>'``.
    """
    ScrapyHTTPPageGetter.handleHeader(self, key, value)

    # Checks run in the same order as a short-circuit ``and`` chain:
    # request method first, then header name, then the declared size.
    if self.factory.method.upper() != 'GET':
        return
    if key.lower() != 'content-length':
        return
    if int(value) > MAX_RESPONSE_SIZE:
        self.connectionLost('response_too_big: %s' % value)
def handleHeader(self, key, value):
    """Forward the header to the parent, then enforce the GET size limit.

    When the request method on ``self.factory`` is GET and the header is
    Content-Length with an integer value above ``MAX_RESPONSE_SIZE``,
    ``connectionLost`` is invoked with a ``'response_too_big: ...'`` reason
    that includes the raw header value.
    """
    ScrapyHTTPPageGetter.handleHeader(self, key, value)

    method_is_get = self.factory.method.upper() == 'GET'
    if method_is_get:
        header_name = key.lower()
        if header_name == 'content-length':
            # The value is parsed only once the header is known to be
            # Content-Length on a GET request.
            declared_size = int(value)
            if declared_size > MAX_RESPONSE_SIZE:
                self.connectionLost('response_too_big: %s' % value)
def lineReceived(self, line):
    """Record a received line in the block buffer and pass it on.

    Trailing whitespace is stripped from ``line`` (the original author's
    note says the line arrives without its ``\\n`` but still ends in
    ``\\r``). The stripped line plus ``'\\r\\n'`` is appended to
    ``self.block_buffer``, and the stripped line is handed to
    ``ScrapyHTTPPageGetter.lineReceived``, whose result is returned.
    """
    # rstrip() with no argument removes every trailing whitespace
    # character, not only the carriage return.
    stripped = line.rstrip()
    self.block_buffer.write(stripped + '\r\n')
    return ScrapyHTTPPageGetter.lineReceived(self, stripped)