def test_warc_writer_filename(tmpdir):
    """Test if WarcWriter is writing WARC files with custom filenames.

    A single record is written through a ``WarcWriter`` configured with the
    ``warc_filename`` template
    ``{timestamp17}_{prefix}_{timestamp14}_{serialno}`` and prefix ``foo``.
    The test then checks that an open WARC file shows up in the target
    directory and that the writer's current WARC path follows the template.
    """
    recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com')
    recorded_url = RecordedUrl(
        url='http://example.com', content_type='text/plain', status=200,
        client_ip='127.0.0.2', request_data=b'abc',
        response_recorder=recorder, remote_ip='127.0.0.3',
        timestamp=datetime.utcnow())

    # dirname is the *parent* of the freshly created 'test-warc-writer'
    # directory, so it always contains at least that subdirectory.
    dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
    wwriter = WarcWriter(Options(
        directory=dirname, prefix='foo',
        warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}',
        writer_threads=1))
    wwriter.write_records(recorded_url)

    # Only count WARC files still being written; a bare listdir() would be
    # non-empty regardless because of the subdirectory created above.
    warcs = [fn for fn in os.listdir(dirname) if fn.endswith('.warc.open')]
    assert warcs

    # Dots are escaped and the pattern is anchored so that only a literal
    # '.warc.open' suffix is accepted.
    warc_path = wwriter._available_warcs.queue[0].path
    assert re.search(r'\d{17}_foo_\d{14}_00000\.warc\.open$', warc_path)
def test_warc_writer_locking(tmpdir):
    """Test if WarcWriter is locking WARC files.

    When we don't have the .open suffix, WarcWriter locks the file, so an
    external process running ``lock_file`` on it reports
    ``'FAILED TO OBTAIN LOCK'``. Once the writer has been closed the same
    attempt must report ``'OBTAINED LOCK'``.
    """
    # NOTE(review): this file defines test_warc_writer_locking a second time
    # further down; the later definition rebinds the name, so pytest would
    # only collect that one -- confirm which version is intended.
    recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com')
    recorded_url = RecordedUrl(
        url='http://example.com', content_type='text/plain', status=200,
        client_ip='127.0.0.2', request_data=b'abc',
        response_recorder=recorder, remote_ip='127.0.0.3',
        timestamp=datetime.utcnow())

    dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
    wwriter = WarcWriter(Options(
        directory=dirname, no_warc_open_suffix=True, writer_threads=1))
    wwriter.write_records(recorded_url)

    warcs = [fn for fn in os.listdir(dirname) if fn.endswith('.warc')]
    assert warcs
    target_warc = os.path.join(dirname, warcs[0])

    # Named 'results' rather than 'queue' so the stdlib module of that name
    # is not shadowed inside this function.
    results = Queue()

    def attempt_lock():
        """Run ``lock_file`` in a child process and return what it reports."""
        child = Process(target=lock_file, args=(results, target_warc))
        child.start()
        # Drain the queue *before* joining: joining a process that still has
        # queued data to flush can deadlock. The timeout turns a child that
        # never reports into a test failure instead of a hang.
        outcome = results.get(timeout=10)
        child.join()
        return outcome

    # launch another process and try to lock WARC file
    assert attempt_lock() == 'FAILED TO OBTAIN LOCK'

    wwriter.close_writer()

    # locking must succeed after writer has closed the WARC file.
    assert attempt_lock() == 'OBTAINED LOCK'
def test_do_not_archive():
    """Check that a record flagged ``do_not_archive=True`` is not written.

    Two records are pushed through a ``WarcWriterProcessor`` running in a
    temporary working directory: the first uses the defaults and must come
    out with ``warc_records`` set, the second is flagged ``do_not_archive``
    and must come out without any.
    """
    # Remember where we started so the process is not left inside the
    # temporary directory after it has been deleted.
    try:
        previous_cwd = os.getcwd()
    except OSError:
        # The current directory may itself no longer exist (for instance if
        # an earlier test chdir'ed into a directory that was then removed).
        previous_cwd = None

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)
        try:
            wwt = warcprox.writerthread.WarcWriterProcessor(
                Options(writer_threads=1))
            wwt.inq = warcprox.TimestampedQueue(maxsize=1)
            wwt.outq = warcprox.TimestampedQueue(maxsize=1)
            try:
                wwt.start()

                # to be written -- default do_not_archive False
                recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
                recorder.read()
                wwt.inq.put(RecordedUrl(
                    url='http://example.com/yes', content_type='text/plain',
                    status=200, client_ip='127.0.0.2', request_data=b'abc',
                    response_recorder=recorder, remote_ip='127.0.0.3',
                    timestamp=datetime.utcnow(),
                    payload_digest=recorder.block_digest))

                # not to be written -- do_not_archive set True
                # NOTE(review): this record also carries warc-prefix '-',
                # which test_special_dont_write_prefix treats as "do not
                # write"; it may therefore not isolate do_not_archive --
                # confirm.
                recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
                recorder.read()
                wwt.inq.put(RecordedUrl(
                    url='http://example.com/no', content_type='text/plain',
                    status=200, client_ip='127.0.0.2', request_data=b'abc',
                    response_recorder=recorder, remote_ip='127.0.0.3',
                    timestamp=datetime.utcnow(),
                    payload_digest=recorder.block_digest,
                    warcprox_meta={'warc-prefix': '-'},
                    do_not_archive=True))

                # Results come back in submission order.
                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records
                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records
                assert wwt.outq.empty()
            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # Step out of the temporary directory before it is cleaned up.
            if previous_cwd is not None:
                os.chdir(previous_cwd)
def test_warc_writer_locking(tmpdir):
    """Test if WarcWriter is locking WARC files.

    When we don't have the .open suffix, WarcWriter locks the file, so an
    external process running ``lock_file`` on it reports
    ``'FAILED TO OBTAIN LOCK'``. Once the writer has been closed the same
    attempt must report ``'OBTAINED LOCK'``.
    """
    # NOTE(review): an earlier test_warc_writer_locking exists in this file
    # and is shadowed by this definition -- confirm which version is
    # intended to run.

    # The record to write is built locally; the function previously used
    # ``recorded_url`` without ever defining it.
    recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com')
    recorded_url = RecordedUrl(
        url='http://example.com', content_type='text/plain', status=200,
        client_ip='127.0.0.2', request_data=b'abc',
        response_recorder=recorder, remote_ip='127.0.0.3',
        timestamp=datetime.utcnow())

    dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
    wwriter = WarcWriter(Options(directory=dirname, no_warc_open_suffix=True))
    wwriter.write_records(recorded_url)

    warcs = [fn for fn in os.listdir(dirname) if fn.endswith('.warc')]
    assert warcs
    target_warc = os.path.join(dirname, warcs[0])

    # Named 'results' rather than 'queue' so the stdlib module of that name
    # is not shadowed inside this function.
    results = Queue()

    def attempt_lock():
        """Run ``lock_file`` in a child process and return what it reports."""
        child = Process(target=lock_file, args=(results, target_warc))
        child.start()
        # Drain the queue *before* joining: joining a process that still has
        # queued data to flush can deadlock. The timeout turns a child that
        # never reports into a test failure instead of a hang.
        outcome = results.get(timeout=10)
        child.join()
        return outcome

    # launch another process and try to lock WARC file
    assert attempt_lock() == 'FAILED TO OBTAIN LOCK'

    wwriter.close_writer()

    # locking must succeed after writer has closed the WARC file.
    assert attempt_lock() == 'OBTAINED LOCK'
def test_close_for_prefix(tmpdir):
    """Exercise ``WarcWriterProcessor.close_for_prefix``.

    Records are written alternately to the default prefix and to
    ``some-prefix``, with a close requested for one of them in between.
    After each write the directory listing is checked to confirm which WARC
    files have been finalised (``.warc``) and which are still open
    (``.warc.open``), and that a prefix reopened after a close starts a new
    file.
    """
    wwp = warcprox.writerthread.WarcWriterProcessor(
        Options(directory=str(tmpdir)))
    wwp.inq = queue.Queue(maxsize=1)
    wwp.outq = queue.Queue(maxsize=1)

    def write_and_wait(url, **extra):
        """Queue one record for ``url`` and return it once processed.

        ``extra`` is forwarded verbatim to ``RecordedUrl`` (used here for
        ``warcprox_meta``).
        """
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        wwp.inq.put(RecordedUrl(
            url=url, content_type='text/plain', status=200,
            client_ip='127.0.0.2', request_data=b'abc',
            response_recorder=recorder, remote_ip='127.0.0.3',
            timestamp=datetime.utcnow(),
            payload_digest=recorder.block_digest, **extra))
        time.sleep(0.5)
        # wait for it to finish; the timeout (same value the sibling tests
        # use) keeps a stuck writer from hanging the test run forever
        return wwp.outq.get(timeout=10)

    def current_basenames():
        """Return the sorted basenames currently present in ``tmpdir``."""
        return sorted(f.basename for f in tmpdir.listdir())

    try:
        wwp.start()

        # write a record to the default prefix
        rurl = write_and_wait('http://example.com/1')
        assert rurl.url == b'http://example.com/1'
        basenames = current_basenames()
        assert len(basenames) == 1
        assert basenames[0].startswith('warcprox-')
        assert basenames[0].endswith('-00000.warc.open')
        assert basenames[0] == (
            wwp.writer_pool.default_warc_writer.finalname + '.open')

        # request close of default warc
        wwp.close_for_prefix()

        # write a record to some other prefix
        rurl = write_and_wait(
            'http://example.com/2',
            warcprox_meta={'warc-prefix': 'some-prefix'})
        assert rurl.url == b'http://example.com/2'
        basenames = current_basenames()
        assert len(basenames) == 2
        assert basenames[0].startswith('some-prefix-')
        assert basenames[0].endswith('-00000.warc.open')
        assert basenames[1].startswith('warcprox-')
        assert basenames[1].endswith('-00000.warc')

        # request close of warc with prefix
        wwp.close_for_prefix('some-prefix')

        # write another record to the default prefix
        rurl = write_and_wait('http://example.com/3')
        assert rurl.url == b'http://example.com/3'
        # now some-prefix warc is closed and a new default prefix warc is open
        basenames = current_basenames()
        assert len(basenames) == 3
        assert basenames[0].startswith('some-prefix-')
        assert basenames[0].endswith('-00000.warc')
        assert basenames[1].startswith('warcprox-')
        assert basenames[1].endswith('-00000.warc')
        assert basenames[2].startswith('warcprox-')
        assert basenames[2].endswith('-00001.warc.open')

        # write another record with prefix "some-prefix"
        rurl = write_and_wait(
            'http://example.com/4',
            warcprox_meta={'warc-prefix': 'some-prefix'})
        assert rurl.url == b'http://example.com/4'
        # new some-prefix warc will have a new random token and start over at
        # serial 00000
        basenames = current_basenames()
        assert len(basenames) == 4
        assert basenames[0].startswith('some-prefix-')
        assert basenames[1].startswith('some-prefix-')
        # order of these two warcs depends on random token so we don't know
        # which is which; their last five characters must differ (one name
        # is expected to end in '.warc', the other in '.open')
        assert basenames[0][-5:] != basenames[1][-5:]
        assert '-00000.' in basenames[0]
        assert '-00000.' in basenames[1]

        assert basenames[2].startswith('warcprox-')
        assert basenames[2].endswith('-00000.warc')
        assert basenames[3].startswith('warcprox-')
        assert basenames[3].endswith('-00001.warc.open')
    finally:
        wwp.stop.set()
        wwp.join()
def test_special_dont_write_prefix():
    """Check the special ``'-'`` WARC prefix and the ``blackout_period`` option.

    Three scenarios are run inside a temporary working directory:

    1. Default prefix ``'-'``: a plain record is not written, while a record
       whose ``warcprox_meta`` selects ``normal-warc-prefix`` is.
    2. Default prefix ``'foo'``: a plain record is written, while a record
       whose ``warcprox_meta`` selects ``'-'`` is not.
    3. ``blackout_period=60``: a record whose dedup date is an hour old is
       written; one whose dedup date is five seconds old is not.
    """
    # NOTE(review): this file defines test_special_dont_write_prefix a second
    # time further down; the later definition rebinds the name, so pytest
    # would only collect that one -- confirm which version is intended.

    def make_recorded_url(url, payload=b'some payload', **extra):
        """Build a fully-read ``RecordedUrl`` for ``url`` with ``payload``.

        ``extra`` is forwarded verbatim to ``RecordedUrl``.
        """
        recorder = ProxyingRecorder(io.BytesIO(payload), None)
        recorder.read()
        return RecordedUrl(
            url=url, content_type='text/plain', status=200,
            client_ip='127.0.0.2', request_data=b'abc',
            response_recorder=recorder, remote_ip='127.0.0.3',
            timestamp=datetime.utcnow(),
            payload_digest=recorder.block_digest, **extra)

    # Remember where we started so the process is not left inside the
    # temporary directory after it has been deleted.
    try:
        previous_cwd = os.getcwd()
    except OSError:
        # The current directory may itself no longer exist (for instance if
        # an earlier test chdir'ed into a directory that was then removed).
        previous_cwd = None

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)
        try:
            wwt = warcprox.writerthread.WarcWriterProcessor(
                Options(prefix='-'))
            wwt.inq = queue.Queue(maxsize=1)
            wwt.outq = queue.Queue(maxsize=1)
            try:
                wwt.start()
                # not to be written due to default prefix
                wwt.inq.put(make_recorded_url('http://example.com/no'))
                # to be written due to warcprox-meta prefix
                wwt.inq.put(make_recorded_url(
                    'http://example.com/yes',
                    warcprox_meta={'warc-prefix': 'normal-warc-prefix'}))

                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records
                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records
                assert wwt.outq.empty()
            finally:
                wwt.stop.set()
                wwt.join()

            wwt = warcprox.writerthread.WarcWriterProcessor(
                Options(blackout_period=60, prefix='foo'))
            wwt.inq = queue.Queue(maxsize=1)
            wwt.outq = queue.Queue(maxsize=1)
            try:
                wwt.start()
                # to be written due to default prefix
                wwt.inq.put(make_recorded_url('http://example.com/yes'))
                # not to be written due to warcprox-meta prefix
                wwt.inq.put(make_recorded_url(
                    'http://example.com/no',
                    warcprox_meta={'warc-prefix': '-'}))

                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records
                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records
                assert wwt.outq.empty()

                # test blackout_period option. Write first revisit record
                # because it's outside the blackout_period (60). Do not write
                # the second because it's inside the blackout_period.
                old = datetime.utcnow() - timedelta(0, 3600)
                ru = make_recorded_url(
                    'http://example.com/dup', payload=b'test1')
                ru.dedup_info = dict(
                    id=b'1', url=b'http://example.com/dup',
                    date=old.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8'))
                wwt.inq.put(ru)
                recorded_url = wwt.outq.get(timeout=10)
                # Checked right away, before the next record is queued, so a
                # failure points at the record that caused it.
                assert recorded_url.warc_records

                recent = datetime.utcnow() - timedelta(0, 5)
                ru = make_recorded_url(
                    'http://example.com/dup', payload=b'test2')
                ru.dedup_info = dict(
                    id=b'2', url=b'http://example.com/dup',
                    date=recent.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8'))
                wwt.inq.put(ru)
                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records
                assert wwt.outq.empty()
            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # Step out of the temporary directory before it is cleaned up.
            if previous_cwd is not None:
                os.chdir(previous_cwd)
def test_special_dont_write_prefix():
    """Check that the special ``'-'`` WARC prefix suppresses writing.

    A ``WarcWriterThread`` is run twice inside a temporary working directory
    with a listener that collects every ``(recorded_url, records)``
    notification:

    1. With default prefix ``'-'``: a plain record yields no records, while
       a record whose ``warcprox_meta`` selects ``normal-warc-prefix`` does.
    2. With default options: a plain record yields records, while a record
       whose ``warcprox_meta`` selects ``'-'`` does not.
    """
    # NOTE(review): an earlier test_special_dont_write_prefix exists in this
    # file and is shadowed by this definition -- confirm which version is
    # intended to run.

    class NotifyMe:
        """Listener that remembers every notification it receives."""

        def __init__(self):
            self.the_list = []

        def notify(self, recorded_url, records):
            self.the_list.append((recorded_url, records))

    def enqueue(target_q, url, **extra):
        """Put a fully-read ``RecordedUrl`` for ``url`` on ``target_q``.

        ``extra`` is forwarded verbatim to ``RecordedUrl``.
        """
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        target_q.put(RecordedUrl(
            url=url, content_type='text/plain', status=200,
            client_ip='127.0.0.2', request_data=b'abc',
            response_recorder=recorder, remote_ip='127.0.0.3',
            timestamp=datetime.utcnow(),
            payload_digest=recorder.block_digest, **extra))

    # Remember where we started so the process is not left inside the
    # temporary directory after it has been deleted.
    try:
        previous_cwd = os.getcwd()
    except OSError:
        # The current directory may itself no longer exist (for instance if
        # an earlier test chdir'ed into a directory that was then removed).
        previous_cwd = None

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)
        try:
            q = warcprox.TimestampedQueue(maxsize=1)
            listener = NotifyMe()
            wwt = warcprox.writerthread.WarcWriterThread(
                recorded_url_q=q, options=Options(prefix='-'),
                listeners=[listener])
            try:
                wwt.start()
                # not to be written due to default prefix
                enqueue(q, 'http://example.com/no')
                # to be written due to warcprox-meta prefix
                enqueue(
                    q, 'http://example.com/yes',
                    warcprox_meta={'warc-prefix': 'normal-warc-prefix'})

                wait(lambda: len(listener.the_list) == 2, 10.0)
                assert not listener.the_list[0][1]
                assert listener.the_list[1][1]
            finally:
                wwt.stop.set()
                wwt.join()

            q = warcprox.TimestampedQueue(maxsize=1)
            listener = NotifyMe()
            wwt = warcprox.writerthread.WarcWriterThread(
                recorded_url_q=q, listeners=[listener])
            try:
                wwt.start()
                # to be written due to default prefix
                enqueue(q, 'http://example.com/yes')
                # not to be written due to warcprox-meta prefix
                enqueue(
                    q, 'http://example.com/no',
                    warcprox_meta={'warc-prefix': '-'})

                wait(lambda: len(listener.the_list) == 2, 10.0)
                assert listener.the_list[0][1]
                assert not listener.the_list[1][1]
            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # Step out of the temporary directory before it is cleaned up.
            if previous_cwd is not None:
                os.chdir(previous_cwd)