def test_post_extract_wrong_method(self):
    """A PUT request with a form-encoded body must leave the URL unchanged."""
    payload = self.post_data
    canonicalizer = MethodQueryCanonicalizer(
        'PUT',
        'application/x-www-form-urlencoded',
        len(payload),
        BytesIO(payload),
    )

    url = 'http://example.com/'
    assert canonicalizer.append_query(url) == url
def test_post_extract_1(self):
    """Form-encoded POST data is appended to the URL as query parameters.

    Checked both for a URL without a query string and for one that
    already carries a query.
    """
    payload = self.post_data
    canonicalizer = MethodQueryCanonicalizer(
        'POST',
        'application/x-www-form-urlencoded',
        len(payload),
        BytesIO(payload),
    )

    without_query = canonicalizer.append_query('http://example.com/')
    assert without_query == 'http://example.com/?foo=bar&dir=/baz'

    with_query = canonicalizer.append_query('http://example.com/?123=ABC')
    assert with_query == 'http://example.com/?123=ABC&foo=bar&dir=/baz'
def test_post_extract_no_boundary_in_multipart_form_mimetype(self):
    """A 'multipart/form-data' content type given without a boundary.

    The expected URL carries ``__wb_method=POST`` and a ``__wb_post_data``
    parameter rather than individual form fields.
    """
    payload = self.post_data
    canonicalizer = MethodQueryCanonicalizer(
        'POST',
        'multipart/form-data',
        len(payload),
        BytesIO(payload),
    )

    expected = (
        'http://example.com/'
        '?__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
    )
    assert canonicalizer.append_query('http://example.com/') == expected
def test_post_extract_json(self):
    """A JSON POST body is expected to be flattened into query parameters."""
    json_body = b'{"a": "b", "c": {"a": 2}, "d": "e"}'
    canonicalizer = MethodQueryCanonicalizer(
        'POST',
        'application/json',
        len(json_body),
        BytesIO(json_body),
    )

    expected = 'http://example.com/?__wb_method=POST&a=b&a.2_=2&d=e'
    assert canonicalizer.append_query('http://example.com/') == expected
def test_post_extract_length_too_short(self):
    """A declared length 4 bytes short of the body yields a truncated query."""
    payload = self.post_data
    declared_length = len(payload) - 4
    canonicalizer = MethodQueryCanonicalizer(
        'POST',
        'application/x-www-form-urlencoded',
        declared_length,
        BytesIO(payload),
    )

    expected = 'http://example.com/?foo=bar&dir=%2'
    assert canonicalizer.append_query('http://example.com/') == expected
def test_post_extract_non_form_data_2(self):
    """A 'text/plain' POST body appended to a URL that already has a query."""
    payload = self.post_data
    canonicalizer = MethodQueryCanonicalizer(
        'POST',
        'text/plain',
        len(payload),
        BytesIO(payload),
    )

    # base64 encoded data
    expected = (
        'http://example.com/pathbar?id=123'
        '&__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
    )
    actual = canonicalizer.append_query('http://example.com/pathbar?id=123')
    assert actual == expected
def test_post_extract_non_form_data_1(self):
    """An 'application/octet-stream' POST body is carried in __wb_post_data."""
    payload = self.post_data
    canonicalizer = MethodQueryCanonicalizer(
        'POST',
        'application/octet-stream',
        len(payload),
        BytesIO(payload),
    )

    # base64 encoded data
    expected = (
        'http://example.com/'
        '?__wb_method=POST&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
    )
    assert canonicalizer.append_query('http://example.com/') == expected
def test_post_extract_malformed_form_data(self):
    """Form content type with ``self.binary_post_data`` as the body.

    The expected URL uses the ``__wb_post_data`` parameter instead of
    decoded form fields.
    """
    raw_body = self.binary_post_data
    canonicalizer = MethodQueryCanonicalizer(
        'POST',
        'application/x-www-form-urlencoded',
        len(raw_body),
        BytesIO(raw_body),
    )

    # base64 encoded data
    expected = (
        'http://example.com/'
        '?__wb_method=POST&__wb_post_data=gTZsYEygNFAO4HICtYkZAGZQ2w6wAiw='
    )
    assert canonicalizer.append_query('http://example.com/') == expected
def test_post_extract_length_invalid_ignore(self):
    """A zero or non-numeric length must leave the URL unchanged."""
    url = 'http://example.com/'

    # Same scenario for both unusable length values: 0 and a non-number.
    for unusable_length in (0, 'abc'):
        canonicalizer = MethodQueryCanonicalizer(
            'POST',
            'application/x-www-form-urlencoded',
            unusable_length,
            BytesIO(self.post_data),
        )
        assert canonicalizer.append_query(url) == url
def test_amf_parse(self):
    """AMF envelopes that differ only in request body must parse differently."""
    canonicalizer = MethodQueryCanonicalizer(
        'POST', 'application/x-amf', 0, BytesIO())

    def build_envelope(body):
        # Request is created before its envelope, as in the original flow.
        request = Request(target='t', body=body)
        envelope = Envelope(AMF3)
        envelope['/0'] = request
        return envelope

    empty_envelope = build_envelope("")
    filled_envelope = build_envelope("alt_content")

    parsed_empty = canonicalizer.amf_parse(
        encode(empty_envelope).getvalue(), None)
    parsed_filled = canonicalizer.amf_parse(
        encode(filled_envelope).getvalue(), None)

    assert parsed_empty != parsed_filled
def create_record_iter(self, raw_iter):
    """Yield one index entry per record of ``raw_iter`` that passes filtering.

    Options read from ``self.options``: ``append_post``, ``include_all``,
    ``surt_ordered`` (defaults to True) and ``minimal``.

    For each yielded entry the record payload is read in ``BUFF_SIZE``
    chunks and handed to ``self.handle_payload``, bracketed by
    ``self.begin_payload`` / ``self.end_payload``.

    Raises:
        Exception: if both ``minimal`` and ``append_post`` are set. Since
            this is a generator, the error surfaces on first iteration.
    """
    options = self.options
    append_post = options.get('append_post')
    include_all = options.get('include_all')
    surt_ordered = options.get('surt_ordered', True)
    minimal = options.get('minimal')

    if append_post and minimal:
        raise Exception('Sorry, minimal index option and ' +
                        'append POST options can not be used together')

    for record in raw_iter:
        # The status check applies only when neither include_all nor
        # minimal is set.
        if not (include_all or minimal):
            if record.http_headers.get_statuscode() == '-':
                continue

        if record.rec_type == 'arc_header':
            continue

        entry = None
        if record.format == 'warc':
            if not include_all:
                # request/warcinfo records are kept only for append_post.
                if (record.rec_type in ('request', 'warcinfo')
                        and not append_post):
                    continue
                if record.content_type == 'application/warc-fields':
                    continue
            entry = self.parse_warc_record(record)
        elif record.format == 'arc':
            entry = self.parse_arc_record(record)

        if not entry:
            continue

        if entry.get('url') and not entry.get('urlkey'):
            entry['urlkey'] = canonicalize(entry['url'], surt_ordered)

        # A digest is computed only when the entry has none and the record
        # type is not one of revisit/request/warcinfo.
        compute_digest = (
            entry.get('digest', '-') == '-'
            and record.rec_type not in ('revisit', 'request', 'warcinfo')
        )

        if (not compute_digest and not minimal
                and record.rec_type == 'request' and append_post):
            http_method = record.http_headers.protocol
            content_length = record.http_headers.get_header('Content-Length')
            entry['_post_query'] = MethodQueryCanonicalizer(
                http_method,
                entry.get('_content_type'),
                content_length,
                record.raw_stream,
            )

        entry.record = record

        self.begin_payload(compute_digest, entry)

        # Drain the payload chunk by chunk until the stream is exhausted.
        chunk = record.raw_stream.read(BUFF_SIZE)
        while chunk:
            self.handle_payload(chunk)
            chunk = record.raw_stream.read(BUFF_SIZE)

        raw_iter.read_to_end(record)

        entry.set_rec_info(*raw_iter.member_info)
        self.end_payload(entry)

        yield entry
def test_post_extract_non_form_data_1(self):
    """An 'application/octet-stream' POST body is carried in __wb_post_data."""
    payload = self.post_data
    canonicalizer = MethodQueryCanonicalizer(
        'POST',
        'application/octet-stream',
        len(payload),
        BytesIO(payload),
    )

    # base64 encoded data
    expected = 'http://example.com/?__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
    assert canonicalizer.append_query('http://example.com/') == expected
def test_options(self):
    """An OPTIONS request with no body only gains the __wb_method parameter."""
    canonicalizer = MethodQueryCanonicalizer('OPTIONS', '', 0, BytesIO())

    expected = 'http://example.com/?__wb_method=OPTIONS'
    assert canonicalizer.append_query('http://example.com/') == expected
def test_head(self):
    """A HEAD request with no body only gains the __pywb_method parameter."""
    canonicalizer = MethodQueryCanonicalizer('HEAD', '', 0, BytesIO())

    expected = 'http://example.com/?__pywb_method=head'
    assert canonicalizer.append_query('http://example.com/') == expected
def test_options(self):
    """An OPTIONS request with no body only gains the __pywb_method parameter."""
    canonicalizer = MethodQueryCanonicalizer('OPTIONS', '', 0, BytesIO())

    expected = 'http://example.com/?__pywb_method=options'
    assert canonicalizer.append_query('http://example.com/') == expected
def test_post_extract_length_too_short(self):
    """A declared length 4 bytes short of the body yields a truncated query."""
    payload = self.post_data
    declared_length = len(payload) - 4
    canonicalizer = MethodQueryCanonicalizer(
        'POST',
        'application/x-www-form-urlencoded',
        declared_length,
        BytesIO(payload),
    )

    expected = 'http://example.com/?foo=bar&dir=%2'
    assert canonicalizer.append_query('http://example.com/') == expected
def test_post_extract_non_form_data_2(self):
    """A 'text/plain' POST body appended to a URL that already has a query."""
    payload = self.post_data
    canonicalizer = MethodQueryCanonicalizer(
        'POST',
        'text/plain',
        len(payload),
        BytesIO(payload),
    )

    # base64 encoded data
    expected = (
        'http://example.com/pathbar?id=123'
        '&__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
    )
    actual = canonicalizer.append_query('http://example.com/pathbar?id=123')
    assert actual == expected
def test_head(self):
    """A HEAD request with no body only gains the __wb_method parameter."""
    canonicalizer = MethodQueryCanonicalizer('HEAD', '', 0, BytesIO())

    expected = 'http://example.com/?__wb_method=HEAD'
    assert canonicalizer.append_query('http://example.com/') == expected