def test_do_not_archive():
    """Check that a url flagged ``do_not_archive=True`` gets no warc records.

    Two urls are pushed through a ``WarcWriterProcessor`` constructed with no
    options. The first one must come out of ``outq`` with ``warc_records``
    set; the second one, created with ``do_not_archive=True``, must come out
    with no ``warc_records``.
    """
    # Remember the starting directory so the chdir below neither leaks into
    # other tests nor leaves the process inside a directory that is about to
    # be deleted.
    try:
        previous_cwd = os.getcwd()
    except OSError:
        # The current directory no longer exists (for example because an
        # earlier test chdir'ed into a temporary directory that has since
        # been removed); fall back to a directory that is known to exist.
        previous_cwd = tempfile.gettempdir()

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        # presumably the writer's default output location is relative to the
        # working directory -- TODO confirm
        os.chdir(tmpdir)
        try:
            wwt = warcprox.writerthread.WarcWriterProcessor()
            wwt.inq = queue.Queue(maxsize=1)
            wwt.outq = queue.Queue(maxsize=1)
            try:
                wwt.start()

                # to be written -- default do_not_archive False
                recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
                recorder.read()
                wwt.inq.put(RecordedUrl(
                    url='http://example.com/yes',
                    content_type='text/plain',
                    status=200,
                    client_ip='127.0.0.2',
                    request_data=b'abc',
                    response_recorder=recorder,
                    remote_ip='127.0.0.3',
                    timestamp=datetime.utcnow(),
                    payload_digest=recorder.block_digest))

                # not to be written -- do_not_archive set True
                # NOTE(review): this url also carries warc-prefix '-', which
                # test_special_dont_write_prefix shows is enough on its own
                # to suppress writing, so the assertion below does not
                # isolate do_not_archive. Consider dropping warcprox_meta.
                recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
                recorder.read()
                wwt.inq.put(RecordedUrl(
                    url='http://example.com/no',
                    content_type='text/plain',
                    status=200,
                    client_ip='127.0.0.2',
                    request_data=b'abc',
                    response_recorder=recorder,
                    remote_ip='127.0.0.3',
                    timestamp=datetime.utcnow(),
                    payload_digest=recorder.block_digest,
                    warcprox_meta={'warc-prefix': '-'},
                    do_not_archive=True))

                # results are checked in the order the urls were queued
                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records

                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records

                assert wwt.outq.empty()
            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # step out of tmpdir before TemporaryDirectory removes it
            os.chdir(previous_cwd)
def test_special_dont_write_prefix():
    """Exercise the special ``'-'`` warc prefix and the blackout period.

    Three scenarios are checked, all inside one temporary directory:

    1. A processor whose default prefix is ``'-'``: a url without
       ``warcprox_meta`` gets no warc records, while a url whose
       ``warcprox_meta`` names a normal prefix does.
    2. A processor whose default prefix is ``'foo'``: a url without
       ``warcprox_meta`` gets warc records, while a url whose
       ``warcprox_meta`` prefix is ``'-'`` does not.
    3. The same processor, created with ``blackout_period=60``: a url whose
       ``dedup_info`` date is an hour old gets warc records, one whose
       ``dedup_info`` date is five seconds old does not.
    """
    # Remember the starting directory so the chdir below neither leaks into
    # other tests nor leaves the process inside a directory that is about to
    # be deleted.
    try:
        previous_cwd = os.getcwd()
    except OSError:
        # The current directory no longer exists (for example because an
        # earlier test chdir'ed into a temporary directory that has since
        # been removed); fall back to a directory that is known to exist.
        previous_cwd = tempfile.gettempdir()

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)
        try:
            # --- scenario 1: default prefix is '-' ---
            wwt = warcprox.writerthread.WarcWriterProcessor(
                Options(prefix='-'))
            wwt.inq = queue.Queue(maxsize=1)
            wwt.outq = queue.Queue(maxsize=1)
            try:
                wwt.start()

                # not to be written due to default prefix
                recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
                recorder.read()
                wwt.inq.put(RecordedUrl(
                    url='http://example.com/no',
                    content_type='text/plain',
                    status=200,
                    client_ip='127.0.0.2',
                    request_data=b'abc',
                    response_recorder=recorder,
                    remote_ip='127.0.0.3',
                    timestamp=datetime.utcnow(),
                    payload_digest=recorder.block_digest))

                # to be written due to warcprox-meta prefix
                recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
                recorder.read()
                wwt.inq.put(RecordedUrl(
                    url='http://example.com/yes',
                    content_type='text/plain',
                    status=200,
                    client_ip='127.0.0.2',
                    request_data=b'abc',
                    response_recorder=recorder,
                    remote_ip='127.0.0.3',
                    timestamp=datetime.utcnow(),
                    payload_digest=recorder.block_digest,
                    warcprox_meta={'warc-prefix': 'normal-warc-prefix'}))

                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records

                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records

                assert wwt.outq.empty()
            finally:
                wwt.stop.set()
                wwt.join()

            # --- scenarios 2 and 3: default prefix is 'foo' ---
            wwt = warcprox.writerthread.WarcWriterProcessor(
                Options(blackout_period=60, prefix='foo'))
            wwt.inq = queue.Queue(maxsize=1)
            wwt.outq = queue.Queue(maxsize=1)
            try:
                wwt.start()

                # to be written due to default prefix
                recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
                recorder.read()
                wwt.inq.put(RecordedUrl(
                    url='http://example.com/yes',
                    content_type='text/plain',
                    status=200,
                    client_ip='127.0.0.2',
                    request_data=b'abc',
                    response_recorder=recorder,
                    remote_ip='127.0.0.3',
                    timestamp=datetime.utcnow(),
                    payload_digest=recorder.block_digest))

                # not to be written due to warcprox-meta prefix
                recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
                recorder.read()
                wwt.inq.put(RecordedUrl(
                    url='http://example.com/no',
                    content_type='text/plain',
                    status=200,
                    client_ip='127.0.0.2',
                    request_data=b'abc',
                    response_recorder=recorder,
                    remote_ip='127.0.0.3',
                    timestamp=datetime.utcnow(),
                    payload_digest=recorder.block_digest,
                    warcprox_meta={'warc-prefix': '-'}))

                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records

                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records

                assert wwt.outq.empty()

                # test blackout_period option. Write first revisit record
                # because it's outside the blackout_period (60). Do not write
                # the second because it's inside the blackout_period.
                recorder = ProxyingRecorder(io.BytesIO(b'test1'), None)
                recorder.read()
                old = datetime.utcnow() - timedelta(0, 3600)
                ru = RecordedUrl(
                    url='http://example.com/dup',
                    content_type='text/plain',
                    status=200,
                    client_ip='127.0.0.2',
                    request_data=b'abc',
                    response_recorder=recorder,
                    remote_ip='127.0.0.3',
                    timestamp=datetime.utcnow(),
                    payload_digest=recorder.block_digest)
                ru.dedup_info = dict(
                    id=b'1',
                    url=b'http://example.com/dup',
                    date=old.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8'))
                wwt.inq.put(ru)
                recorded_url = wwt.outq.get(timeout=10)
                # checked right away so a failure points at the first url
                assert recorded_url.warc_records

                recorder = ProxyingRecorder(io.BytesIO(b'test2'), None)
                recorder.read()
                recent = datetime.utcnow() - timedelta(0, 5)
                ru = RecordedUrl(
                    url='http://example.com/dup',
                    content_type='text/plain',
                    status=200,
                    client_ip='127.0.0.2',
                    request_data=b'abc',
                    response_recorder=recorder,
                    remote_ip='127.0.0.3',
                    timestamp=datetime.utcnow(),
                    payload_digest=recorder.block_digest)
                ru.dedup_info = dict(
                    id=b'2',
                    url=b'http://example.com/dup',
                    date=recent.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8'))
                wwt.inq.put(ru)
                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records

                assert wwt.outq.empty()
            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # step out of tmpdir before TemporaryDirectory removes it
            os.chdir(previous_cwd)
def test_close_for_prefix(tmpdir):
    """Check that ``close_for_prefix`` closes only the requested warc.

    Records are written alternately to the default prefix and to
    ``'some-prefix'``, with a ``close_for_prefix`` request in between. After
    each write the file names in ``tmpdir`` are inspected: a warc that is
    still being written ends in ``.warc.open``, a closed one in ``.warc``.

    Args:
        tmpdir: directory object supplied by the caller (presumably pytest's
            ``tmpdir`` fixture); the test uses its ``listdir()`` method and
            passes ``str(tmpdir)`` as the writer's output directory.
    """
    wwp = warcprox.writerthread.WarcWriterProcessor(
        Options(directory=str(tmpdir)))
    wwp.inq = queue.Queue(maxsize=1)
    wwp.outq = queue.Queue(maxsize=1)

    try:
        wwp.start()

        # write a record to the default prefix
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        wwp.inq.put(RecordedUrl(
            url='http://example.com/1',
            content_type='text/plain',
            status=200,
            client_ip='127.0.0.2',
            request_data=b'abc',
            response_recorder=recorder,
            remote_ip='127.0.0.3',
            timestamp=datetime.utcnow(),
            payload_digest=recorder.block_digest))
        time.sleep(0.5)
        # wait for it to finish; the timeout turns a stalled writer thread
        # into a test failure instead of an indefinite hang
        rurl = wwp.outq.get(timeout=10)
        assert rurl.url == b'http://example.com/1'
        entries = tmpdir.listdir()
        assert len(entries) == 1
        assert entries[0].basename.startswith('warcprox-')
        assert entries[0].basename.endswith('-00000.warc.open')
        assert entries[0].basename == (
            wwp.writer_pool.default_warc_writer.finalname + '.open')

        # request close of default warc
        wwp.close_for_prefix()

        # write a record to some other prefix
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        wwp.inq.put(RecordedUrl(
            url='http://example.com/2',
            content_type='text/plain',
            status=200,
            client_ip='127.0.0.2',
            request_data=b'abc',
            response_recorder=recorder,
            remote_ip='127.0.0.3',
            timestamp=datetime.utcnow(),
            payload_digest=recorder.block_digest,
            warcprox_meta={'warc-prefix': 'some-prefix'}))
        time.sleep(0.5)
        rurl = wwp.outq.get(timeout=10)  # wait for it to finish
        assert rurl.url == b'http://example.com/2'
        assert len(tmpdir.listdir()) == 2
        # sorted, so 'some-prefix-...' comes before 'warcprox-...'
        basenames = sorted(f.basename for f in tmpdir.listdir())
        assert basenames[0].startswith('some-prefix-')
        assert basenames[0].endswith('-00000.warc.open')
        assert basenames[1].startswith('warcprox-')
        assert basenames[1].endswith('-00000.warc')

        # request close of warc with prefix
        wwp.close_for_prefix('some-prefix')

        # write another record to the default prefix
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        wwp.inq.put(RecordedUrl(
            url='http://example.com/3',
            content_type='text/plain',
            status=200,
            client_ip='127.0.0.2',
            request_data=b'abc',
            response_recorder=recorder,
            remote_ip='127.0.0.3',
            timestamp=datetime.utcnow(),
            payload_digest=recorder.block_digest))
        time.sleep(0.5)
        rurl = wwp.outq.get(timeout=10)  # wait for it to finish
        assert rurl.url == b'http://example.com/3'
        # now some-prefix warc is closed and a new default prefix warc is open
        basenames = sorted(f.basename for f in tmpdir.listdir())
        assert len(basenames) == 3
        assert basenames[0].startswith('some-prefix-')
        assert basenames[0].endswith('-00000.warc')
        assert basenames[1].startswith('warcprox-')
        assert basenames[1].endswith('-00000.warc')
        assert basenames[2].startswith('warcprox-')
        assert basenames[2].endswith('-00001.warc.open')

        # write another record with prefix "some-prefix"
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        wwp.inq.put(RecordedUrl(
            url='http://example.com/4',
            content_type='text/plain',
            status=200,
            client_ip='127.0.0.2',
            request_data=b'abc',
            response_recorder=recorder,
            remote_ip='127.0.0.3',
            timestamp=datetime.utcnow(),
            payload_digest=recorder.block_digest,
            warcprox_meta={'warc-prefix': 'some-prefix'}))
        time.sleep(0.5)
        rurl = wwp.outq.get(timeout=10)  # wait for it to finish
        assert rurl.url == b'http://example.com/4'
        # new some-prefix warc will have a new random token and start over at
        # serial 00000
        basenames = sorted(f.basename for f in tmpdir.listdir())
        assert len(basenames) == 4
        assert basenames[0].startswith('some-prefix-')
        assert basenames[1].startswith('some-prefix-')
        # order of these two warcs depends on random token so we don't know
        # which is which; the last five characters differ because one name
        # ends in '.warc' and the other in '.open'
        assert basenames[0][-5:] != basenames[1][-5:]
        assert '-00000.' in basenames[0]
        assert '-00000.' in basenames[1]

        assert basenames[2].startswith('warcprox-')
        assert basenames[2].endswith('-00000.warc')
        assert basenames[3].startswith('warcprox-')
        assert basenames[3].endswith('-00001.warc.open')

    finally:
        wwp.stop.set()
        wwp.join()