Exemplo n.º 1
0
def test_warc_writer_filename(tmpdir):
    """Check that WarcWriter honours a custom ``warc_filename`` template.

    One record is written with prefix ``foo`` and the template
    ``{timestamp17}_{prefix}_{timestamp14}_{serialno}``; the path of the first
    WARC held by the writer must then follow that layout and carry the
    ``.warc.open`` suffix.
    """
    recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com')
    recorded_url = RecordedUrl(url='http://example.com',
                               content_type='text/plain',
                               status=200,
                               client_ip='127.0.0.2',
                               request_data=b'abc',
                               response_recorder=recorder,
                               remote_ip='127.0.0.3',
                               timestamp=datetime.utcnow())

    # dirname() of the new sub-directory is its parent, so the WARC is written
    # next to 'test-warc-writer' rather than inside it.
    dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
    wwriter = WarcWriter(
        Options(
            directory=dirname,
            prefix='foo',
            warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}',
            writer_threads=1))
    try:
        wwriter.write_records(recorded_url)
        # The directory always contains the sub-directory created above, so
        # look for an actual open WARC instead of "any entry at all".
        assert any(fn.endswith('.warc.open') for fn in os.listdir(dirname))
        # Dots are escaped so they only match the literal suffix separators.
        assert re.search(r'\d{17}_foo_\d{14}_00000\.warc\.open',
                         wwriter._available_warcs.queue[0].path)
    finally:
        # Release the open WARC even when an assertion fails.
        wwriter.close_writer()
Exemplo n.º 2
0
def test_warc_writer_locking(tmpdir):
    """Check that WarcWriter keeps its WARC file locked while it is open.

    The writer is created with ``no_warc_open_suffix=True``. A separate
    process running ``lock_file`` on the ``.warc`` file must report
    ``'FAILED TO OBTAIN LOCK'`` while the writer is open and
    ``'OBTAINED LOCK'`` once the writer has been closed.
    """
    recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com')
    recorded_url = RecordedUrl(url='http://example.com',
                               content_type='text/plain',
                               status=200,
                               client_ip='127.0.0.2',
                               request_data=b'abc',
                               response_recorder=recorder,
                               remote_ip='127.0.0.3',
                               timestamp=datetime.utcnow())

    dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
    wwriter = WarcWriter(
        Options(directory=dirname, no_warc_open_suffix=True, writer_threads=1))
    # Named ``results`` so the local does not shadow the ``queue`` module.
    results = Queue()
    try:
        wwriter.write_records(recorded_url)
        warcs = [fn for fn in os.listdir(dirname) if fn.endswith('.warc')]
        assert warcs
        target_warc = os.path.join(dirname, warcs[0])
        # launch another process and try to lock WARC file
        p = Process(target=lock_file, args=(results, target_warc))
        p.start()
        p.join()
        # Bounded wait: if the child died without reporting, fail instead of
        # blocking the test run forever.
        assert results.get(timeout=10) == 'FAILED TO OBTAIN LOCK'
    finally:
        # Always close the writer so a failed assertion does not leave the
        # WARC open and locked.
        wwriter.close_writer()

    # locking must succeed after writer has closed the WARC file.
    p = Process(target=lock_file, args=(results, target_warc))
    p.start()
    p.join()
    assert results.get(timeout=10) == 'OBTAINED LOCK'
Exemplo n.º 3
0
def test_do_not_archive():
    """Check that a RecordedUrl flagged ``do_not_archive`` yields no records.

    Two URLs are pushed through a WarcWriterProcessor running in a temporary
    working directory: the first uses the default settings and must come out
    with ``warc_records`` set, the second is created with
    ``do_not_archive=True`` and must come out without any.
    """
    def make_recorded_url(url, **extra):
        """Build a RecordedUrl for *url* with a fully read b'some payload'."""
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        return RecordedUrl(url=url,
                           content_type='text/plain',
                           status=200,
                           client_ip='127.0.0.2',
                           request_data=b'abc',
                           response_recorder=recorder,
                           remote_ip='127.0.0.3',
                           timestamp=datetime.utcnow(),
                           payload_digest=recorder.block_digest,
                           **extra)

    # Remember where we started so the process is not left inside a directory
    # that gets deleted when the ``with`` block exits. getcwd() itself fails
    # if the current directory has already been removed; in that case there
    # is nothing to go back to.
    try:
        original_cwd = os.getcwd()
    except OSError:
        original_cwd = None

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)
        try:
            wwt = warcprox.writerthread.WarcWriterProcessor(
                Options(writer_threads=1))
            wwt.inq = warcprox.TimestampedQueue(maxsize=1)
            wwt.outq = warcprox.TimestampedQueue(maxsize=1)
            try:
                wwt.start()
                # to be written -- default do_not_archive False
                wwt.inq.put(make_recorded_url('http://example.com/yes'))
                # not to be written -- do_not_archive set True
                wwt.inq.put(
                    make_recorded_url('http://example.com/no',
                                      warcprox_meta={'warc-prefix': '-'},
                                      do_not_archive=True))
                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records
                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records
                assert wwt.outq.empty()
            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # Restore only after the processor has been joined, so nothing is
            # still running against the temporary working directory.
            if original_cwd is not None:
                os.chdir(original_cwd)
Exemplo n.º 4
0
def test_warc_writer_locking(tmpdir):
    """Check that WarcWriter keeps its WARC file locked while it is open.

    The writer is created with ``no_warc_open_suffix=True``. A separate
    process running ``lock_file`` on the ``.warc`` file must report
    ``'FAILED TO OBTAIN LOCK'`` while the writer is open and
    ``'OBTAINED LOCK'`` once the writer has been closed.

    NOTE(review): ``recorded_url`` is not defined in this function; it is
    presumably a module-level fixture object -- confirm against the module.
    """
    dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
    wwriter = WarcWriter(Options(directory=dirname, no_warc_open_suffix=True))
    # Named ``results`` so the local does not shadow the ``queue`` module.
    results = Queue()
    try:
        wwriter.write_records(recorded_url)
        warcs = [fn for fn in os.listdir(dirname) if fn.endswith('.warc')]
        assert warcs
        target_warc = os.path.join(dirname, warcs[0])
        # launch another process and try to lock WARC file
        p = Process(target=lock_file, args=(results, target_warc))
        p.start()
        p.join()
        # Bounded wait: if the child died without reporting, fail instead of
        # blocking the test run forever.
        assert results.get(timeout=10) == 'FAILED TO OBTAIN LOCK'
    finally:
        # Always close the writer so a failed assertion does not leave the
        # WARC open and locked.
        wwriter.close_writer()

    # locking must succeed after writer has closed the WARC file.
    p = Process(target=lock_file, args=(results, target_warc))
    p.start()
    p.join()
    assert results.get(timeout=10) == 'OBTAINED LOCK'
Exemplo n.º 5
0
def test_close_for_prefix(tmpdir):
    """Check that ``close_for_prefix`` closes only the requested WARC.

    Records are written alternately to the default prefix and to
    ``some-prefix``. After each close request the corresponding file must
    lose its ``.open`` suffix, and the next record for that prefix must land
    in a new ``.open`` file.
    """
    wwp = warcprox.writerthread.WarcWriterProcessor(
        Options(directory=str(tmpdir)))
    wwp.inq = queue.Queue(maxsize=1)
    wwp.outq = queue.Queue(maxsize=1)

    def write_and_wait(url, **extra):
        """Queue a RecordedUrl for *url* and return it once processed."""
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        wwp.inq.put(
            RecordedUrl(url=url,
                        content_type='text/plain',
                        status=200,
                        client_ip='127.0.0.2',
                        request_data=b'abc',
                        response_recorder=recorder,
                        remote_ip='127.0.0.3',
                        timestamp=datetime.utcnow(),
                        payload_digest=recorder.block_digest,
                        **extra))
        time.sleep(0.5)
        # Bounded wait, so a stuck processor fails the test rather than
        # hanging it.
        return wwp.outq.get(timeout=10)

    def sorted_basenames():
        """Return the sorted file names currently present in *tmpdir*."""
        return sorted(f.basename for f in tmpdir.listdir())

    try:
        wwp.start()

        # write a record to the default prefix
        rurl = write_and_wait('http://example.com/1')

        assert rurl.url == b'http://example.com/1'
        entries = tmpdir.listdir()
        assert len(entries) == 1
        assert entries[0].basename.startswith('warcprox-')
        assert entries[0].basename.endswith('-00000.warc.open')
        assert entries[0].basename == (
            wwp.writer_pool.default_warc_writer.finalname + '.open')

        # request close of default warc
        wwp.close_for_prefix()

        # write a record to some other prefix
        rurl = write_and_wait('http://example.com/2',
                              warcprox_meta={'warc-prefix': 'some-prefix'})

        assert rurl.url == b'http://example.com/2'
        basenames = sorted_basenames()
        assert len(basenames) == 2
        assert basenames[0].startswith('some-prefix-')
        assert basenames[0].endswith('-00000.warc.open')
        assert basenames[1].startswith('warcprox-')
        assert basenames[1].endswith('-00000.warc')

        # request close of warc with prefix
        wwp.close_for_prefix('some-prefix')

        # write another record to the default prefix
        rurl = write_and_wait('http://example.com/3')

        assert rurl.url == b'http://example.com/3'
        # now some-prefix warc is closed and a new default prefix warc is open
        basenames = sorted_basenames()
        assert len(basenames) == 3
        assert basenames[0].startswith('some-prefix-')
        assert basenames[0].endswith('-00000.warc')
        assert basenames[1].startswith('warcprox-')
        assert basenames[1].endswith('-00000.warc')
        assert basenames[2].startswith('warcprox-')
        assert basenames[2].endswith('-00001.warc.open')

        # write another record with prefix "some-prefix"
        rurl = write_and_wait('http://example.com/4',
                              warcprox_meta={'warc-prefix': 'some-prefix'})

        assert rurl.url == b'http://example.com/4'
        # new some-prefix warc will have a new random token and start over at
        # serial 00000
        basenames = sorted_basenames()
        assert len(basenames) == 4
        assert basenames[0].startswith('some-prefix-')
        assert basenames[1].startswith('some-prefix-')
        # order of these two warcs depends on random token so we don't know
        # which is which; the last five characters must differ
        assert basenames[0][-5:] != basenames[1][-5:]
        assert '-00000.' in basenames[0]
        assert '-00000.' in basenames[1]

        assert basenames[2].startswith('warcprox-')
        assert basenames[2].endswith('-00000.warc')
        assert basenames[3].startswith('warcprox-')
        assert basenames[3].endswith('-00001.warc.open')

    finally:
        wwp.stop.set()
        wwp.join()
Exemplo n.º 6
0
def test_special_dont_write_prefix():
    """Check the special ``'-'`` WARC prefix and the ``blackout_period`` option.

    Phase 1 runs a processor whose default prefix is ``'-'``: a plain URL
    must produce no records, while a URL whose ``warcprox_meta`` carries a
    normal prefix must be written.

    Phase 2 runs a processor with prefix ``'foo'`` and ``blackout_period=60``:
    a plain URL must be written and a URL with ``'warc-prefix': '-'`` must
    not. Then two duplicates are sent: the one whose ``dedup_info`` date is an
    hour old must be written, the one dated five seconds ago must not.
    """
    def make_recorded_url(url, payload=b'some payload', **extra):
        """Build a RecordedUrl for *url* whose recorder has read *payload*."""
        recorder = ProxyingRecorder(io.BytesIO(payload), None)
        recorder.read()
        return RecordedUrl(url=url,
                           content_type='text/plain',
                           status=200,
                           client_ip='127.0.0.2',
                           request_data=b'abc',
                           response_recorder=recorder,
                           remote_ip='127.0.0.3',
                           timestamp=datetime.utcnow(),
                           payload_digest=recorder.block_digest,
                           **extra)

    def make_dedup_info(record_id, when):
        """Build the dedup_info dict for the '/dup' URL dated at *when*."""
        return dict(
            id=record_id,
            url=b'http://example.com/dup',
            date=when.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8'))

    # Remember where we started so the process is not left inside a directory
    # that gets deleted when the ``with`` block exits. getcwd() itself fails
    # if the current directory has already been removed; in that case there
    # is nothing to go back to.
    try:
        original_cwd = os.getcwd()
    except OSError:
        original_cwd = None

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)
        try:
            wwt = warcprox.writerthread.WarcWriterProcessor(
                Options(prefix='-'))
            wwt.inq = queue.Queue(maxsize=1)
            wwt.outq = queue.Queue(maxsize=1)
            try:
                wwt.start()
                # not to be written due to default prefix
                wwt.inq.put(make_recorded_url('http://example.com/no'))
                # to be written due to warcprox-meta prefix
                wwt.inq.put(
                    make_recorded_url(
                        'http://example.com/yes',
                        warcprox_meta={'warc-prefix': 'normal-warc-prefix'}))
                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records
                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records
                assert wwt.outq.empty()
            finally:
                wwt.stop.set()
                wwt.join()

            wwt = warcprox.writerthread.WarcWriterProcessor(
                Options(blackout_period=60, prefix='foo'))
            wwt.inq = queue.Queue(maxsize=1)
            wwt.outq = queue.Queue(maxsize=1)
            try:
                wwt.start()
                # to be written due to default prefix
                wwt.inq.put(make_recorded_url('http://example.com/yes'))
                # not to be written due to warcprox-meta prefix
                wwt.inq.put(
                    make_recorded_url('http://example.com/no',
                                      warcprox_meta={'warc-prefix': '-'}))
                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records
                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records
                assert wwt.outq.empty()

                # test blackout_period option. Write first revisit record
                # because it is outside the blackout_period (60). Do not
                # write the second because it is inside the blackout_period.
                ru = make_recorded_url('http://example.com/dup',
                                       payload=b'test1')
                old = datetime.utcnow() - timedelta(0, 3600)
                ru.dedup_info = make_dedup_info(b'1', old)
                wwt.inq.put(ru)
                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records

                ru = make_recorded_url('http://example.com/dup',
                                       payload=b'test2')
                recent = datetime.utcnow() - timedelta(0, 5)
                ru.dedup_info = make_dedup_info(b'2', recent)
                wwt.inq.put(ru)
                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records
                assert wwt.outq.empty()

            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # Restore only after both processors have been joined, so nothing
            # is still running against the temporary working directory.
            if original_cwd is not None:
                os.chdir(original_cwd)
Exemplo n.º 7
0
def test_special_dont_write_prefix():
    """Check that the special ``'-'`` WARC prefix suppresses writing.

    A listener collects ``(recorded_url, records)`` pairs from a
    WarcWriterThread. With the default prefix set to ``'-'`` a plain URL must
    produce no records while a URL with a normal ``warc-prefix`` must; with
    default options the roles are reversed for a URL whose ``warc-prefix``
    is ``'-'``.
    """
    class NotifyMe:
        """Listener that records every notification it receives."""
        def __init__(self):
            self.the_list = []
        def notify(self, recorded_url, records):
            self.the_list.append((recorded_url, records))

    def make_recorded_url(url, **extra):
        """Build a RecordedUrl for *url* with a fully read b'some payload'."""
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        return RecordedUrl(
            url=url, content_type='text/plain',
            status=200, client_ip='127.0.0.2', request_data=b'abc',
            response_recorder=recorder, remote_ip='127.0.0.3',
            timestamp=datetime.utcnow(),
            payload_digest=recorder.block_digest,
            **extra)

    # Remember where we started so the process is not left inside a directory
    # that gets deleted when the ``with`` block exits. getcwd() itself fails
    # if the current directory has already been removed; in that case there
    # is nothing to go back to.
    try:
        original_cwd = os.getcwd()
    except OSError:
        original_cwd = None

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)
        try:
            q = warcprox.TimestampedQueue(maxsize=1)
            listener = NotifyMe()
            wwt = warcprox.writerthread.WarcWriterThread(
                    recorded_url_q=q, options=Options(prefix='-'),
                    listeners=[listener])
            try:
                wwt.start()
                # not to be written due to default prefix
                q.put(make_recorded_url('http://example.com/no'))
                # to be written due to warcprox-meta prefix
                q.put(make_recorded_url(
                    'http://example.com/yes',
                    warcprox_meta={'warc-prefix': 'normal-warc-prefix'}))
                wait(lambda: len(listener.the_list) == 2, 10.0)
                # Explicit length check: gives a clear failure instead of an
                # IndexError if fewer notifications arrived.
                assert len(listener.the_list) == 2
                assert not listener.the_list[0][1]
                assert listener.the_list[1][1]
            finally:
                wwt.stop.set()
                wwt.join()

            q = warcprox.TimestampedQueue(maxsize=1)
            listener = NotifyMe()
            wwt = warcprox.writerthread.WarcWriterThread(
                    recorded_url_q=q, listeners=[listener])
            try:
                wwt.start()
                # to be written due to default prefix
                q.put(make_recorded_url('http://example.com/yes'))
                # not to be written due to warcprox-meta prefix
                q.put(make_recorded_url(
                    'http://example.com/no',
                    warcprox_meta={'warc-prefix': '-'}))
                wait(lambda: len(listener.the_list) == 2, 10.0)
                assert len(listener.the_list) == 2
                assert listener.the_list[0][1]
                assert not listener.the_list[1][1]
            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # Restore only after both threads have been joined, so nothing is
            # still running against the temporary working directory.
            if original_cwd is not None:
                os.chdir(original_cwd)