Exemplo n.º 1
0
def test_do_not_archive():
    '''
    Feeds two urls through a `WarcWriterProcessor` and checks that the one
    created with `do_not_archive=True` comes out with no warc records, while
    the other one comes out with warc records.

    Runs in a temporary working directory. The previous working directory is
    restored before the temporary one is deleted, so the process is not left
    sitting in a directory that no longer exists.
    '''
    def make_recorded_url(url, **extra):
        # every url gets its own recorder, fully read before the url is
        # queued; the recorder's block digest is passed as payload digest
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        return RecordedUrl(url=url,
                           content_type='text/plain',
                           status=200,
                           client_ip='127.0.0.2',
                           request_data=b'abc',
                           response_recorder=recorder,
                           remote_ip='127.0.0.3',
                           timestamp=datetime.utcnow(),
                           payload_digest=recorder.block_digest,
                           **extra)

    try:
        original_cwd = os.getcwd()
    except OSError:
        # the current directory may already be gone (e.g. removed by earlier
        # code that chdir'd into a temp dir); then there is nothing to restore
        original_cwd = None

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)
        try:
            wwt = warcprox.writerthread.WarcWriterProcessor(
                Options(writer_threads=1))
            wwt.inq = warcprox.TimestampedQueue(maxsize=1)
            wwt.outq = warcprox.TimestampedQueue(maxsize=1)
            try:
                wwt.start()
                # to be written -- default do_not_archive False
                wwt.inq.put(make_recorded_url('http://example.com/yes'))
                # not to be written -- do_not_archive set True
                # NOTE(review): this url also carries warc-prefix '-', which
                # may by itself suppress writing -- confirm that the flag is
                # what is actually being exercised here
                wwt.inq.put(
                    make_recorded_url('http://example.com/no',
                                      warcprox_meta={'warc-prefix': '-'},
                                      do_not_archive=True))
                # results come out of outq in the order they were queued
                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records
                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records
                assert wwt.outq.empty()
            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # leave tmpdir before TemporaryDirectory removes it
            if original_cwd is not None:
                os.chdir(original_cwd)
Exemplo n.º 2
0
 def chain(self, processor0, processor1):
     '''
     Connects two processors through one shared queue.

     Sets `processor0.outq` = `processor1.inq` = a new
     `warcprox.TimestampedQueue` with `maxsize=self.options.queue_size`.
     Both attributes are asserted to be falsy beforehand.
     '''
     assert not processor0.outq
     assert not processor1.inq
     # one queue object, assigned to both ends (outq first, then inq)
     processor0.outq = processor1.inq = warcprox.TimestampedQueue(
         maxsize=self.options.queue_size)
Exemplo n.º 3
0
    def __init__(self,
                 stats_db=None,
                 status_callback=None,
                 options=warcprox.Options()):
        '''
        Stores the collaborators, pushes option values onto
        `WarcProxyHandler` class attributes, binds and activates the http
        server, then sets up the certificate authority, the recorded-url
        queue and the running stats.

        Fallbacks used when an option is unset: address 'localhost', port
        8000, digest algorithm 'sha1', ca file 'warcprox-ca.pem', certs dir
        './warcprox-ca', queue size 1000.
        '''
        self.status_callback = status_callback
        self.stats_db = stats_db
        self.options = options

        # never fewer than 200 pools; more only if max_threads / 6 exceeds it
        pool_count = 200
        if options.max_threads:
            pool_count = max(round(options.max_threads / 6), 200)
        self.remote_connection_pool = PoolManager(num_pools=pool_count)

        bind_host = options.address or 'localhost'
        bind_port = 8000 if options.port is None else options.port

        tor_proxy = options.onion_tor_socks_proxy
        if tor_proxy:
            try:
                tor_host, tor_port = tor_proxy.split(':')
                tor_port = int(tor_port)
            except ValueError:
                # not exactly "host:port" with an integer port: keep the
                # whole string as the host and leave the port unset
                tor_host, tor_port = tor_proxy, None
            WarcProxyHandler.onion_tor_socks_proxy_host = tor_host
            WarcProxyHandler.onion_tor_socks_proxy_port = tor_port

        # each truthy option overrides the handler class attribute of the
        # same name prefixed with an underscore
        for option_name in ('socket_timeout', 'max_resource_size',
                            'tmp_file_max_memory_size'):
            option_value = getattr(options, option_name)
            if option_value:
                setattr(WarcProxyHandler, '_' + option_name, option_value)

        http_server.HTTPServer.__init__(
            self, (bind_host, bind_port), WarcProxyHandler,
            bind_and_activate=True)

        self.digest_algorithm = options.digest_algorithm or 'sha1'

        # ca name is cut off at 64 characters
        hostname = socket.gethostname()
        self.ca = CertificateAuthority(
            ca_file=options.cacert or 'warcprox-ca.pem',
            certs_dir=options.certs_dir or './warcprox-ca',
            ca_name=('Warcprox CA on %s' % hostname)[:64])

        queue_size = options.queue_size or 1000
        self.recorded_url_q = warcprox.TimestampedQueue(maxsize=queue_size)

        self.running_stats = warcprox.stats.RunningStats()
Exemplo n.º 4
0
    def __init__(self,
                 ca=None,
                 recorded_url_q=None,
                 stats_db=None,
                 options=warcprox.Options()):
        '''
        Pushes the tor socks proxy setting onto `WarcProxyHandler`, binds
        and activates the http server, then stores the collaborators.

        A `CertificateAuthority` and a `warcprox.TimestampedQueue` are
        created here when `ca` or `recorded_url_q` is None. Fallbacks used
        when an option is unset: address 'localhost', port 8000, digest
        algorithm 'sha1', queue size 1000.
        '''
        bind_host = options.address or 'localhost'
        bind_port = 8000 if options.port is None else options.port

        tor_proxy = options.onion_tor_socks_proxy
        if tor_proxy:
            try:
                tor_host, tor_port = tor_proxy.split(':')
                tor_port = int(tor_port)
            except ValueError:
                # not exactly "host:port" with an integer port: keep the
                # whole string as the host and leave the port unset
                tor_host, tor_port = tor_proxy, None
            WarcProxyHandler.onion_tor_socks_proxy_host = tor_host
            WarcProxyHandler.onion_tor_socks_proxy_port = tor_port

        http_server.HTTPServer.__init__(
            self, (bind_host, bind_port), WarcProxyHandler,
            bind_and_activate=True)

        self.digest_algorithm = options.digest_algorithm or 'sha1'

        if ca is None:
            # ca name is cut off at 64 characters
            hostname = socket.gethostname()
            ca = CertificateAuthority(
                ca_file='warcprox-ca.pem',
                certs_dir='./warcprox-ca',
                ca_name='Warcprox CA on {}'.format(hostname)[:64])
        self.ca = ca

        if recorded_url_q is None:
            recorded_url_q = warcprox.TimestampedQueue(
                maxsize=options.queue_size or 1000)
        self.recorded_url_q = recorded_url_q

        self.stats_db = stats_db
        self.options = options

        self.running_stats = warcprox.stats.RunningStats()
Exemplo n.º 5
0
def test_special_dont_write_prefix():
    '''
    Checks how the special warc prefix '-' affects a `WarcWriterProcessor`:

    1. default prefix '-': a plain url yields no warc records, a url whose
       warcprox-meta sets a normal prefix does.
    2. default prefix 'foo': a plain url yields warc records, a url whose
       warcprox-meta sets prefix '-' does not.
    3. `blackout_period=60`: a url with dedup info dated an hour ago yields
       warc records, the same url with dedup info dated 5 seconds ago does
       not.

    Runs in a temporary working directory. The previous working directory is
    restored before the temporary one is deleted, so the process is not left
    sitting in a directory that no longer exists.
    '''
    def make_recorded_url(url, payload=b'some payload', **extra):
        # every url gets its own recorder, fully read before the url is
        # queued; the recorder's block digest is passed as payload digest
        recorder = ProxyingRecorder(io.BytesIO(payload), None)
        recorder.read()
        return RecordedUrl(url=url,
                           content_type='text/plain',
                           status=200,
                           client_ip='127.0.0.2',
                           request_data=b'abc',
                           response_recorder=recorder,
                           remote_ip='127.0.0.3',
                           timestamp=datetime.utcnow(),
                           payload_digest=recorder.block_digest,
                           **extra)

    def dedup_date(when):
        # dedup_info dates are utf-8 bytes in this timestamp format
        return when.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8')

    try:
        original_cwd = os.getcwd()
    except OSError:
        # the current directory may already be gone (e.g. removed by earlier
        # code that chdir'd into a temp dir); then there is nothing to restore
        original_cwd = None

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)
        try:
            wwt = warcprox.writerthread.WarcWriterProcessor(
                Options(prefix='-', writer_threads=1))
            wwt.inq = warcprox.TimestampedQueue(maxsize=1)
            wwt.outq = warcprox.TimestampedQueue(maxsize=1)
            try:
                wwt.start()
                # not to be written due to default prefix
                wwt.inq.put(make_recorded_url('http://example.com/no'))
                # to be written due to warcprox-meta prefix
                wwt.inq.put(
                    make_recorded_url(
                        'http://example.com/yes',
                        warcprox_meta={'warc-prefix': 'normal-warc-prefix'}))
                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records
                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records
                assert wwt.outq.empty()
            finally:
                wwt.stop.set()
                wwt.join()

            wwt = warcprox.writerthread.WarcWriterProcessor(
                Options(writer_threads=1, blackout_period=60, prefix='foo'))
            wwt.inq = warcprox.TimestampedQueue(maxsize=1)
            wwt.outq = warcprox.TimestampedQueue(maxsize=1)
            try:
                wwt.start()
                # to be written due to default prefix
                wwt.inq.put(make_recorded_url('http://example.com/yes'))
                # not to be written due to warcprox-meta prefix
                wwt.inq.put(
                    make_recorded_url('http://example.com/no',
                                      warcprox_meta={'warc-prefix': '-'}))
                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records
                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records
                assert wwt.outq.empty()

                # test blackout_period option. Write first revisit record
                # because it's outside the blackout_period (60). Do not write
                # the second because it's inside the blackout_period.
                old = datetime.utcnow() - timedelta(0, 3600)
                ru = make_recorded_url('http://example.com/dup',
                                       payload=b'test1')
                ru.dedup_info = dict(id=b'1',
                                     url=b'http://example.com/dup',
                                     date=dedup_date(old))
                wwt.inq.put(ru)
                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records

                recent = datetime.utcnow() - timedelta(0, 5)
                ru = make_recorded_url('http://example.com/dup',
                                       payload=b'test2')
                ru.dedup_info = dict(id=b'2',
                                     url=b'http://example.com/dup',
                                     date=dedup_date(recent))
                wwt.inq.put(ru)
                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records
                assert wwt.outq.empty()
            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # leave tmpdir before TemporaryDirectory removes it
            if original_cwd is not None:
                os.chdir(original_cwd)
Exemplo n.º 6
0
def init_controller(args):
    '''
    Creates a warcprox.controller.WarcproxController configured according to
    the supplied arguments (normally the result of parse_args(sys.argv)).

    Builds, in order: the optional dedup db, the optional stats db, the
    recorded-url queue, the certificate authority, the proxy, the optional
    playback proxy and its index db, the optional crawl logger, any plugin
    listeners, the warc writer threads and the optional service registry.

    Exits the process with status 1 if `args.digest_algorithm` is not
    accepted by `hashlib.new()` or if a plugin class cannot be loaded.
    '''
    options = warcprox.Options(**vars(args))

    # fail early on a digest algorithm hashlib cannot instantiate
    try:
        hashlib.new(args.digest_algorithm)
    except Exception as e:
        logging.fatal(e)
        # sys.exit rather than the builtin exit(): the latter is installed by
        # the site module and is not guaranteed to be available
        sys.exit(1)

    # everything appended here is handed to every warc writer thread
    listeners = []

    # the first configured dedup backend wins
    if args.rethinkdb_dedup_url:
        dedup_db = warcprox.dedup.RethinkDedupDb(options=options)
    elif args.rethinkdb_big_table_url:
        dedup_db = warcprox.bigtable.RethinkCapturesDedup(options=options)
    elif args.rethinkdb_trough_db_url:
        dedup_db = warcprox.dedup.TroughDedupDb(options)
    elif args.cdxserver_dedup:
        dedup_db = warcprox.dedup.CdxServerDedup(cdx_url=args.cdxserver_dedup)
    elif args.dedup_db_file in (None, '', '/dev/null'):
        logging.info('deduplication disabled')
        dedup_db = None
    else:
        dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file, options=options)
    if dedup_db:
        listeners.append(dedup_db)

    if args.rethinkdb_stats_url:
        stats_db = warcprox.stats.RethinkStatsDb(options=options)
        listeners.append(stats_db)
    elif args.stats_db_file in (None, '', '/dev/null'):
        logging.info('statistics tracking disabled')
        stats_db = None
    else:
        stats_db = warcprox.stats.StatsDb(args.stats_db_file, options=options)
        listeners.append(stats_db)

    # shared by the proxy and by every warc writer thread created below
    recorded_url_q = warcprox.TimestampedQueue(maxsize=args.queue_size)

    # ca name is cut off at 64 characters
    ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
    ca = certauth.certauth.CertificateAuthority(args.cacert,
                                                args.certs_dir,
                                                ca_name=ca_name)

    proxy = warcprox.warcproxy.WarcProxy(ca=ca,
                                         recorded_url_q=recorded_url_q,
                                         stats_db=stats_db,
                                         options=options)

    if args.playback_port is not None:
        playback_index_db = warcprox.playback.PlaybackIndexDb(
            args.playback_index_db_file, options=options)
        playback_proxy = warcprox.playback.PlaybackProxy(
            ca=ca, playback_index_db=playback_index_db, options=options)
        listeners.append(playback_index_db)
    else:
        playback_index_db = None
        playback_proxy = None

    if args.crawl_log_dir:
        listeners.append(
            warcprox.crawl_log.CrawlLogger(args.crawl_log_dir,
                                           options=options))

    # plugins are given as fully qualified class names ("package.mod.Class")
    # and instantiated without arguments
    for qualname in args.plugins or []:
        try:
            (module_name, class_name) = qualname.rsplit('.', 1)
            module_ = importlib.import_module(module_name)
            class_ = getattr(module_, class_name)
            listener = class_()
            listener.notify  # make sure it has this method
            listeners.append(listener)
        except Exception as e:
            logging.fatal('problem with plugin class %r: %s', qualname, e)
            sys.exit(1)

    writer_pool = warcprox.writer.WarcWriterPool(options=options)
    # number of warc writer threads = sqrt(proxy.max_threads)
    # I came up with this out of thin air because it strikes me as reasonable
    # 1=>1 2=>1 5=>2 10=>3 50=>7 100=>10 200=>14 500=>22 1000=>32 2000=>45
    num_writer_threads = args.writer_threads or int(proxy.max_threads**0.5)
    logging.debug('initializing %d warc writer threads', num_writer_threads)
    warc_writer_threads = [
        warcprox.writerthread.WarcWriterThread(name='WarcWriterThread%03d' % i,
                                               recorded_url_q=recorded_url_q,
                                               writer_pool=writer_pool,
                                               dedup_db=dedup_db,
                                               listeners=listeners,
                                               options=options)
        for i in range(num_writer_threads)
    ]

    if args.rethinkdb_services_url:
        parsed = doublethink.parse_rethinkdb_url(
            options.rethinkdb_services_url)
        rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
        svcreg = doublethink.ServiceRegistry(rr, table=parsed.table)
    else:
        svcreg = None

    controller = warcprox.controller.WarcproxController(
        proxy,
        warc_writer_threads,
        playback_proxy,
        service_registry=svcreg,
        options=options)

    return controller
Exemplo n.º 7
0
def test_special_dont_write_prefix():
    '''
    Checks how the special warc prefix '-' affects a `WarcWriterThread`, by
    looking at the records handed to a listener:

    1. default prefix '-': a plain url is notified with no records, a url
       whose warcprox-meta sets a normal prefix is notified with records.
    2. no prefix option: a plain url is notified with records, a url whose
       warcprox-meta sets prefix '-' is notified with no records.

    Runs in a temporary working directory. The previous working directory is
    restored before the temporary one is deleted, so the process is not left
    sitting in a directory that no longer exists.
    '''
    class NotifyMe:
        # listener that just remembers every (recorded_url, records) pair
        def __init__(self):
            self.the_list = []
        def notify(self, recorded_url, records):
            self.the_list.append((recorded_url, records))

    def make_recorded_url(url, **extra):
        # every url gets its own recorder, fully read before the url is
        # queued; the recorder's block digest is passed as payload digest
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        return RecordedUrl(
                url=url, content_type='text/plain',
                status=200, client_ip='127.0.0.2', request_data=b'abc',
                response_recorder=recorder, remote_ip='127.0.0.3',
                timestamp=datetime.utcnow(),
                payload_digest=recorder.block_digest, **extra)

    try:
        original_cwd = os.getcwd()
    except OSError:
        # the current directory may already be gone (e.g. removed by earlier
        # code that chdir'd into a temp dir); then there is nothing to restore
        original_cwd = None

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)
        try:
            q = warcprox.TimestampedQueue(maxsize=1)
            listener = NotifyMe()
            wwt = warcprox.writerthread.WarcWriterThread(
                    recorded_url_q=q, options=Options(prefix='-'),
                    listeners=[listener])
            try:
                wwt.start()
                # not to be written due to default prefix
                q.put(make_recorded_url('http://example.com/no'))
                # to be written due to warcprox-meta prefix
                q.put(make_recorded_url(
                    'http://example.com/yes',
                    warcprox_meta={'warc-prefix': 'normal-warc-prefix'}))
                # give the thread up to 10 seconds to notify for both urls
                wait(lambda: len(listener.the_list) == 2, 10.0)
                assert not listener.the_list[0][1]
                assert listener.the_list[1][1]
            finally:
                wwt.stop.set()
                wwt.join()

            q = warcprox.TimestampedQueue(maxsize=1)
            listener = NotifyMe()
            wwt = warcprox.writerthread.WarcWriterThread(
                    recorded_url_q=q, listeners=[listener])
            try:
                wwt.start()
                # to be written due to default prefix
                q.put(make_recorded_url('http://example.com/yes'))
                # not to be written due to warcprox-meta prefix
                q.put(make_recorded_url(
                    'http://example.com/no',
                    warcprox_meta={'warc-prefix': '-'}))
                wait(lambda: len(listener.the_list) == 2, 10.0)
                assert listener.the_list[0][1]
                assert not listener.the_list[1][1]
            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # leave tmpdir before TemporaryDirectory removes it
            if original_cwd is not None:
                os.chdir(original_cwd)