예제 #1
0
def test_do_not_archive():
    """Check that a url flagged ``do_not_archive=True`` gets no warc records.

    Two urls are pushed through a ``WarcWriterProcessor``: the first uses the
    default ``do_not_archive`` and is expected to come out with
    ``warc_records`` set, the second sets ``do_not_archive=True`` and is
    expected to come out without any.

    The test runs inside a temporary working directory (presumably because
    the writer, built without options, writes relative to the current
    directory -- TODO confirm). The previous working directory is restored
    before the temporary directory is removed.
    """
    def make_recorded_url(url, **extra):
        # Build a RecordedUrl whose response body has been fully read, so
        # that the recorder's block digest is available.
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        return RecordedUrl(
            url=url, content_type='text/plain',
            status=200, client_ip='127.0.0.2', request_data=b'abc',
            response_recorder=recorder, remote_ip='127.0.0.3',
            timestamp=datetime.utcnow(),
            payload_digest=recorder.block_digest, **extra)

    try:
        previous_cwd = os.getcwd()
    except OSError:
        # An earlier test may have left the process in a directory that has
        # since been deleted; fall back to one that is sure to exist.
        previous_cwd = tempfile.gettempdir()

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)
        try:
            wwt = warcprox.writerthread.WarcWriterProcessor()
            wwt.inq = queue.Queue(maxsize=1)
            wwt.outq = queue.Queue(maxsize=1)
            try:
                wwt.start()
                # to be written -- default do_not_archive False
                wwt.inq.put(make_recorded_url('http://example.com/yes'))
                # not to be written -- do_not_archive set True
                # NOTE(review): this url also carries warc-prefix '-', which
                # on its own appears to suppress writing (see
                # test_special_dont_write_prefix), so the assertion below
                # cannot tell the two mechanisms apart -- consider dropping
                # the prefix here.
                wwt.inq.put(make_recorded_url(
                    'http://example.com/no',
                    warcprox_meta={'warc-prefix': '-'},
                    do_not_archive=True))

                written = wwt.outq.get(timeout=10)
                assert written.warc_records
                skipped = wwt.outq.get(timeout=10)
                assert not skipped.warc_records
                assert wwt.outq.empty()
            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # Step out of the temporary directory before it is deleted, so
            # later tests do not inherit a working directory that is gone.
            os.chdir(previous_cwd)
예제 #2
0
def test_special_dont_write_prefix():
    """Check the special warc prefix ``'-'`` and the ``blackout_period`` option.

    Phase 1 builds the writer with default prefix ``'-'``: a url without
    warcprox-meta is expected to produce no warc records, while a url whose
    warcprox-meta names a normal prefix is expected to produce some.

    Phase 2 builds the writer with default prefix ``'foo'`` and
    ``blackout_period=60``: the expectations for the prefix are reversed, and
    then two duplicates of the same url are submitted, one whose dedup date
    is an hour old (expected to be written) and one whose dedup date is five
    seconds old (expected not to be written).

    The test runs inside a temporary working directory; the previous working
    directory is restored before the temporary directory is removed.
    """
    def make_recorded_url(url, payload=b'some payload', **extra):
        # Build a RecordedUrl whose response body has been fully read, so
        # that the recorder's block digest is available.
        recorder = ProxyingRecorder(io.BytesIO(payload), None)
        recorder.read()
        return RecordedUrl(
            url=url, content_type='text/plain',
            status=200, client_ip='127.0.0.2', request_data=b'abc',
            response_recorder=recorder, remote_ip='127.0.0.3',
            timestamp=datetime.utcnow(),
            payload_digest=recorder.block_digest, **extra)

    def dedup_date(moment):
        # Format a datetime the way the dedup_info 'date' field is given here.
        return moment.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8')

    try:
        previous_cwd = os.getcwd()
    except OSError:
        # An earlier test may have left the process in a directory that has
        # since been deleted; fall back to one that is sure to exist.
        previous_cwd = tempfile.gettempdir()

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)
        try:
            # ---- phase 1: default prefix is the special "don't write" one
            wwt = warcprox.writerthread.WarcWriterProcessor(
                    Options(prefix='-'))
            wwt.inq = queue.Queue(maxsize=1)
            wwt.outq = queue.Queue(maxsize=1)
            try:
                wwt.start()
                # not to be written due to default prefix
                wwt.inq.put(make_recorded_url('http://example.com/no'))
                # to be written due to warcprox-meta prefix
                wwt.inq.put(make_recorded_url(
                    'http://example.com/yes',
                    warcprox_meta={'warc-prefix': 'normal-warc-prefix'}))

                skipped = wwt.outq.get(timeout=10)
                assert not skipped.warc_records
                written = wwt.outq.get(timeout=10)
                assert written.warc_records
                assert wwt.outq.empty()
            finally:
                wwt.stop.set()
                wwt.join()

            # ---- phase 2: normal default prefix, plus blackout_period
            wwt = warcprox.writerthread.WarcWriterProcessor(
                    Options(blackout_period=60, prefix='foo'))
            wwt.inq = queue.Queue(maxsize=1)
            wwt.outq = queue.Queue(maxsize=1)
            try:
                wwt.start()
                # to be written due to default prefix
                wwt.inq.put(make_recorded_url('http://example.com/yes'))
                # not to be written due to warcprox-meta prefix
                wwt.inq.put(make_recorded_url(
                    'http://example.com/no',
                    warcprox_meta={'warc-prefix': '-'}))

                written = wwt.outq.get(timeout=10)
                assert written.warc_records
                skipped = wwt.outq.get(timeout=10)
                assert not skipped.warc_records
                assert wwt.outq.empty()

                # test blackout_period option. Write the first revisit record
                # because it's outside the blackout_period (60). Do not write
                # the second because it's inside the blackout_period.
                old = datetime.utcnow() - timedelta(0, 3600)
                ru = make_recorded_url(
                    'http://example.com/dup', payload=b'test1')
                ru.dedup_info = dict(
                    id=b'1', url=b'http://example.com/dup',
                    date=dedup_date(old))
                wwt.inq.put(ru)
                outside_blackout = wwt.outq.get(timeout=10)
                # checked right away, next to the get it belongs to
                assert outside_blackout.warc_records

                recent = datetime.utcnow() - timedelta(0, 5)
                ru = make_recorded_url(
                    'http://example.com/dup', payload=b'test2')
                ru.dedup_info = dict(
                    id=b'2', url=b'http://example.com/dup',
                    date=dedup_date(recent))
                wwt.inq.put(ru)
                inside_blackout = wwt.outq.get(timeout=10)
                assert not inside_blackout.warc_records
                assert wwt.outq.empty()
            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # Step out of the temporary directory before it is deleted, so
            # later tests do not inherit a working directory that is gone.
            os.chdir(previous_cwd)
예제 #3
0
def test_close_for_prefix(tmpdir):
    """Check that ``close_for_prefix`` closes only the requested warc.

    Records are written alternately to the default prefix and to
    ``'some-prefix'``, with a close requested in between. After each write
    the file names in ``tmpdir`` are inspected: a name ending in ``.open``
    is taken to be a warc still being written, and the serial number
    (``-00000``, ``-00001``) shows whether a new file was started.

    Args:
        tmpdir: pytest's temporary directory fixture; the writer is
            configured to put its warcs there.
    """
    wwp = warcprox.writerthread.WarcWriterProcessor(
            Options(directory=str(tmpdir)))
    wwp.inq = queue.Queue(maxsize=1)
    wwp.outq = queue.Queue(maxsize=1)

    def write_record(url, warc_prefix=None):
        # Queue one url (optionally with a warcprox-meta warc prefix) and
        # return it once the writer has passed it on to the output queue.
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        extra = {}
        if warc_prefix is not None:
            extra['warcprox_meta'] = {'warc-prefix': warc_prefix}
        wwp.inq.put(RecordedUrl(
            url=url, content_type='text/plain',
            status=200, client_ip='127.0.0.2', request_data=b'abc',
            response_recorder=recorder, remote_ip='127.0.0.3',
            timestamp=datetime.utcnow(),
            payload_digest=recorder.block_digest, **extra))
        # NOTE(review): the blocking get below already waits for the writer;
        # the sleep is kept from the original in case something else relies
        # on it -- confirm before removing.
        time.sleep(0.5)
        # Bounded wait, like the other writer tests: a stuck writer thread
        # then fails the test with queue.Empty instead of hanging the run.
        return wwp.outq.get(timeout=10)

    def warc_basenames():
        # Sorted file names currently present in the warc directory.
        return sorted(f.basename for f in tmpdir.listdir())

    try:
        wwp.start()

        # write a record to the default prefix
        rurl = write_record('http://example.com/1')

        assert rurl.url == b'http://example.com/1'
        basenames = warc_basenames()
        assert len(basenames) == 1
        assert basenames[0].startswith('warcprox-')
        assert basenames[0].endswith('-00000.warc.open')
        assert basenames[0] == (
            wwp.writer_pool.default_warc_writer.finalname + '.open')

        # request close of default warc
        wwp.close_for_prefix()

        # write a record to some other prefix
        rurl = write_record('http://example.com/2', 'some-prefix')

        assert rurl.url == b'http://example.com/2'
        basenames = warc_basenames()
        assert len(basenames) == 2
        assert basenames[0].startswith('some-prefix-')
        assert basenames[0].endswith('-00000.warc.open')
        assert basenames[1].startswith('warcprox-')
        assert basenames[1].endswith('-00000.warc')

        # request close of warc with prefix
        wwp.close_for_prefix('some-prefix')

        # write another record to the default prefix
        rurl = write_record('http://example.com/3')

        assert rurl.url == b'http://example.com/3'
        # now some-prefix warc is closed and a new default prefix warc is open
        basenames = warc_basenames()
        assert len(basenames) == 3
        assert basenames[0].startswith('some-prefix-')
        assert basenames[0].endswith('-00000.warc')
        assert basenames[1].startswith('warcprox-')
        assert basenames[1].endswith('-00000.warc')
        assert basenames[2].startswith('warcprox-')
        assert basenames[2].endswith('-00001.warc.open')

        # write another record with prefix "some-prefix"
        rurl = write_record('http://example.com/4', 'some-prefix')

        assert rurl.url == b'http://example.com/4'
        # new some-prefix warc will have a new random token and start over at
        # serial 00000
        basenames = warc_basenames()
        assert len(basenames) == 4
        assert basenames[0].startswith('some-prefix-')
        assert basenames[1].startswith('some-prefix-')
        # order of these two warcs depends on random token so we don't know
        # which is which; the last five characters differ when one name ends
        # in '.warc' and the other in '.open'
        assert basenames[0][-5:] != basenames[1][-5:]
        assert '-00000.' in basenames[0]
        assert '-00000.' in basenames[1]

        assert basenames[2].startswith('warcprox-')
        assert basenames[2].endswith('-00000.warc')
        assert basenames[3].startswith('warcprox-')
        assert basenames[3].endswith('-00001.warc.open')

    finally:
        wwp.stop.set()
        wwp.join()