def test_do_not_archive():
    '''
    Feeds two urls through a `WarcWriterProcessor` and checks that the one
    created with `do_not_archive=True` comes out with no warc records, while
    the one that leaves `do_not_archive` at its default does get warc records.

    The test runs inside a temporary directory and changes back to the
    previous working directory before that temporary directory is removed.
    '''
    def _recorded_url(url, **extra):
        # read the payload through the recorder before taking block_digest
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        return RecordedUrl(
                url=url, content_type='text/plain', status=200,
                client_ip='127.0.0.2', request_data=b'abc',
                response_recorder=recorder, remote_ip='127.0.0.3',
                timestamp=datetime.utcnow(),
                payload_digest=recorder.block_digest, **extra)

    try:
        original_cwd = os.getcwd()
    except OSError:
        # the current working directory no longer exists, so there is
        # nothing to go back to afterwards
        original_cwd = None

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)
        try:
            wwt = warcprox.writerthread.WarcWriterProcessor(
                    Options(writer_threads=1))
            wwt.inq = warcprox.TimestampedQueue(maxsize=1)
            wwt.outq = warcprox.TimestampedQueue(maxsize=1)
            try:
                wwt.start()

                # to be written -- default do_not_archive False
                wwt.inq.put(_recorded_url('http://example.com/yes'))

                # not to be written -- do_not_archive set True
                # NOTE(review): this url also carries warc-prefix '-', which
                # test_special_dont_write_prefix treats as "not to be
                # written" on its own, so this case may not isolate
                # do_not_archive -- confirm whether the prefix is intended
                wwt.inq.put(_recorded_url(
                    'http://example.com/no',
                    warcprox_meta={'warc-prefix': '-'},
                    do_not_archive=True))

                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records

                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records

                assert wwt.outq.empty()
            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # step out of tmpdir before TemporaryDirectory deletes it, so
            # the process is not left in a removed working directory
            if original_cwd is not None:
                os.chdir(original_cwd)
def chain(self, processor0, processor1):
    '''
    Links two processors through one newly created queue.

    A `warcprox.TimestampedQueue` with `maxsize=self.options.queue_size` is
    created and assigned to both `processor0.outq` and `processor1.inq`.
    Asserts that neither of those attributes is already set to something
    truthy.
    '''
    assert not processor0.outq
    assert not processor1.inq
    # a single queue object shared by both ends of the link
    processor0.outq = processor1.inq = warcprox.TimestampedQueue(
            maxsize=self.options.queue_size)
def __init__(
        self, stats_db=None, status_callback=None,
        options=warcprox.Options()):
    '''
    Configures `WarcProxyHandler` from `options` and initializes the
    underlying http server.

    Args:
        stats_db: stored as `self.stats_db`
        status_callback: stored as `self.status_callback`
        options: settings read here are `max_threads`, `address`, `port`,
            `onion_tor_socks_proxy`, `socket_timeout`, `max_resource_size`,
            `tmp_file_max_memory_size`, `digest_algorithm`, `cacert`,
            `certs_dir` and `queue_size`

    Note that several settings are written to class attributes of
    `WarcProxyHandler`, not to this instance.
    '''
    self.status_callback = status_callback
    self.stats_db = stats_db
    self.options = options

    # never fewer than 200 pools, whatever max_threads is
    if options.max_threads:
        num_pools = max(round(options.max_threads / 6), 200)
    else:
        num_pools = 200
    self.remote_connection_pool = PoolManager(num_pools=num_pools)

    listen_host = options.address or 'localhost'
    # port 0 is a legitimate value, so only None selects the default
    listen_port = 8000 if options.port is None else options.port
    server_address = (listen_host, listen_port)

    tor_proxy = options.onion_tor_socks_proxy
    if tor_proxy:
        try:
            tor_host, tor_port = tor_proxy.split(':')
            WarcProxyHandler.onion_tor_socks_proxy_host = tor_host
            WarcProxyHandler.onion_tor_socks_proxy_port = int(tor_port)
        except ValueError:
            # not of the form "host:port" with an integer port; use the
            # whole value as the host and leave the port unset
            WarcProxyHandler.onion_tor_socks_proxy_host = tor_proxy
            WarcProxyHandler.onion_tor_socks_proxy_port = None

    # handler settings that are overridden only when the option is truthy
    handler_overrides = (
        ('_socket_timeout', options.socket_timeout),
        ('_max_resource_size', options.max_resource_size),
        ('_tmp_file_max_memory_size', options.tmp_file_max_memory_size),
    )
    for attr_name, value in handler_overrides:
        if value:
            setattr(WarcProxyHandler, attr_name, value)

    http_server.HTTPServer.__init__(
            self, server_address, WarcProxyHandler, bind_and_activate=True)

    self.digest_algorithm = options.digest_algorithm or 'sha1'

    # the name is cut off at 64 characters
    ca_name = ('Warcprox CA on %s' % socket.gethostname())[:64]
    ca_file = options.cacert or 'warcprox-ca.pem'
    certs_dir = options.certs_dir or './warcprox-ca'
    self.ca = CertificateAuthority(
            ca_file=ca_file, certs_dir=certs_dir, ca_name=ca_name)

    queue_size = options.queue_size or 1000
    self.recorded_url_q = warcprox.TimestampedQueue(maxsize=queue_size)

    self.running_stats = warcprox.stats.RunningStats()
def __init__(self, ca=None, recorded_url_q=None, stats_db=None,
        options=warcprox.Options()):
    '''
    Configures `WarcProxyHandler` from `options` and initializes the
    underlying http server.

    Args:
        ca: certificate authority to use; if None, a `CertificateAuthority`
            is created from `options.cacert` and `options.certs_dir`,
            falling back to 'warcprox-ca.pem' and './warcprox-ca' when
            those options are not set
        recorded_url_q: queue to use as `self.recorded_url_q`; if None, a
            `warcprox.TimestampedQueue` of size `options.queue_size`
            (default 1000) is created
        stats_db: stored as `self.stats_db`
        options: settings read here are `address`, `port`,
            `onion_tor_socks_proxy`, `digest_algorithm`, `cacert`,
            `certs_dir` and `queue_size`
    '''
    server_address = (
            options.address or 'localhost',
            # port 0 is a legitimate value, so only None selects the default
            options.port if options.port is not None else 8000)

    if options.onion_tor_socks_proxy:
        try:
            host, port = options.onion_tor_socks_proxy.split(':')
            WarcProxyHandler.onion_tor_socks_proxy_host = host
            WarcProxyHandler.onion_tor_socks_proxy_port = int(port)
        except ValueError:
            # not of the form "host:port" with an integer port; use the
            # whole value as the host and leave the port unset
            WarcProxyHandler.onion_tor_socks_proxy_host = (
                    options.onion_tor_socks_proxy)
            WarcProxyHandler.onion_tor_socks_proxy_port = None

    http_server.HTTPServer.__init__(
            self, server_address, WarcProxyHandler, bind_and_activate=True)

    self.digest_algorithm = options.digest_algorithm or 'sha1'

    if ca is not None:
        self.ca = ca
    else:
        # the name is cut off at 64 characters
        ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
        # honor the configured CA locations instead of always using the
        # hard-coded defaults
        self.ca = CertificateAuthority(
                ca_file=options.cacert or 'warcprox-ca.pem',
                certs_dir=options.certs_dir or './warcprox-ca',
                ca_name=ca_name)

    if recorded_url_q is not None:
        self.recorded_url_q = recorded_url_q
    else:
        self.recorded_url_q = warcprox.TimestampedQueue(
                maxsize=options.queue_size or 1000)

    self.stats_db = stats_db
    self.options = options
    self.running_stats = warcprox.stats.RunningStats()
def test_special_dont_write_prefix():
    '''
    Exercises the warc prefix '-' and the `blackout_period` option of
    `WarcWriterProcessor`.

    Three things are asserted:
    - with default prefix '-', a url has no warc records unless its
      warcprox-meta supplies another prefix
    - with default prefix 'foo', a url has warc records unless its
      warcprox-meta supplies prefix '-'
    - with `blackout_period=60`, a url whose dedup info is dated an hour ago
      has warc records, and one whose dedup info is dated five seconds ago
      does not

    The test runs inside a temporary directory and changes back to the
    previous working directory before that temporary directory is removed.
    '''
    def _recorded_url(url, payload=b'some payload', **extra):
        # read the payload through the recorder before taking block_digest
        recorder = ProxyingRecorder(io.BytesIO(payload), None)
        recorder.read()
        return RecordedUrl(
                url=url, content_type='text/plain', status=200,
                client_ip='127.0.0.2', request_data=b'abc',
                response_recorder=recorder, remote_ip='127.0.0.3',
                timestamp=datetime.utcnow(),
                payload_digest=recorder.block_digest, **extra)

    def _dedup_info(record_id, captured_at):
        return dict(
                id=record_id, url=b'http://example.com/dup',
                date=captured_at.strftime(
                    '%Y-%m-%dT%H:%M:%SZ').encode('utf-8'))

    try:
        original_cwd = os.getcwd()
    except OSError:
        # the current working directory no longer exists, so there is
        # nothing to go back to afterwards
        original_cwd = None

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)
        try:
            wwt = warcprox.writerthread.WarcWriterProcessor(
                    Options(prefix='-', writer_threads=1))
            wwt.inq = warcprox.TimestampedQueue(maxsize=1)
            wwt.outq = warcprox.TimestampedQueue(maxsize=1)
            try:
                wwt.start()

                # not to be written due to default prefix
                wwt.inq.put(_recorded_url('http://example.com/no'))

                # to be written due to warcprox-meta prefix
                wwt.inq.put(_recorded_url(
                    'http://example.com/yes',
                    warcprox_meta={'warc-prefix': 'normal-warc-prefix'}))

                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records

                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records

                assert wwt.outq.empty()
            finally:
                wwt.stop.set()
                wwt.join()

            wwt = warcprox.writerthread.WarcWriterProcessor(
                    Options(writer_threads=1, blackout_period=60,
                            prefix='foo'))
            wwt.inq = warcprox.TimestampedQueue(maxsize=1)
            wwt.outq = warcprox.TimestampedQueue(maxsize=1)
            try:
                wwt.start()

                # to be written due to default prefix
                wwt.inq.put(_recorded_url('http://example.com/yes'))

                # not to be written due to warcprox-meta prefix
                wwt.inq.put(_recorded_url(
                    'http://example.com/no',
                    warcprox_meta={'warc-prefix': '-'}))

                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records

                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records

                assert wwt.outq.empty()

                # test blackout_period option. Write the first revisit
                # record because it's outside the blackout_period (60).
                # Do not write the second because it's inside the
                # blackout_period.
                ru = _recorded_url('http://example.com/dup', b'test1')
                old = datetime.utcnow() - timedelta(0, 3600)
                ru.dedup_info = _dedup_info(b'1', old)
                wwt.inq.put(ru)
                recorded_url = wwt.outq.get(timeout=10)
                assert recorded_url.warc_records

                ru = _recorded_url('http://example.com/dup', b'test2')
                recent = datetime.utcnow() - timedelta(0, 5)
                ru.dedup_info = _dedup_info(b'2', recent)
                wwt.inq.put(ru)
                recorded_url = wwt.outq.get(timeout=10)
                assert not recorded_url.warc_records

                assert wwt.outq.empty()
            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # step out of tmpdir before TemporaryDirectory deletes it, so
            # the process is not left in a removed working directory
            if original_cwd is not None:
                os.chdir(original_cwd)
def init_controller(args):
    '''
    Creates a warcprox.controller.WarcproxController configured according to
    the supplied arguments (normally the result of parse_args(sys.argv)).

    Builds the dedup db, stats db, proxy, optional playback proxy, optional
    crawl logger, plugin listeners, warc writer threads and optional service
    registry that `args` asks for, and hands them to the controller.

    Exits the process with status 1 if `hashlib.new` does not accept
    `args.digest_algorithm`, or if a class named in `args.plugins` cannot be
    imported, instantiated without arguments, or has no `notify` attribute.

    Returns:
        the new warcprox.controller.WarcproxController
    '''
    options = warcprox.Options(**vars(args))

    try:
        hashlib.new(args.digest_algorithm)
    except Exception as e:
        logging.fatal(e)
        # sys.exit, not the `exit` builtin: `exit` is installed by the site
        # module and is missing when python runs without it
        sys.exit(1)

    listeners = []

    # the first configured dedup backend wins
    if args.rethinkdb_dedup_url:
        dedup_db = warcprox.dedup.RethinkDedupDb(options=options)
    elif args.rethinkdb_big_table_url:
        dedup_db = warcprox.bigtable.RethinkCapturesDedup(options=options)
    elif args.rethinkdb_trough_db_url:
        dedup_db = warcprox.dedup.TroughDedupDb(options)
    elif args.cdxserver_dedup:
        dedup_db = warcprox.dedup.CdxServerDedup(cdx_url=args.cdxserver_dedup)
    elif args.dedup_db_file in (None, '', '/dev/null'):
        logging.info('deduplication disabled')
        dedup_db = None
    else:
        dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file, options=options)
    if dedup_db:
        listeners.append(dedup_db)

    if args.rethinkdb_stats_url:
        stats_db = warcprox.stats.RethinkStatsDb(options=options)
        listeners.append(stats_db)
    elif args.stats_db_file in (None, '', '/dev/null'):
        logging.info('statistics tracking disabled')
        stats_db = None
    else:
        stats_db = warcprox.stats.StatsDb(args.stats_db_file, options=options)
        listeners.append(stats_db)

    # this one queue is given to the proxy and to every warc writer thread
    recorded_url_q = warcprox.TimestampedQueue(maxsize=args.queue_size)

    # the name is cut off at 64 characters
    ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
    ca = certauth.certauth.CertificateAuthority(args.cacert, args.certs_dir,
                                                ca_name=ca_name)

    proxy = warcprox.warcproxy.WarcProxy(ca=ca, recorded_url_q=recorded_url_q,
            stats_db=stats_db, options=options)

    if args.playback_port is not None:
        playback_index_db = warcprox.playback.PlaybackIndexDb(
                args.playback_index_db_file, options=options)
        playback_proxy = warcprox.playback.PlaybackProxy(
                ca=ca, playback_index_db=playback_index_db, options=options)
        listeners.append(playback_index_db)
    else:
        playback_index_db = None
        playback_proxy = None

    if args.crawl_log_dir:
        listeners.append(
                warcprox.crawl_log.CrawlLogger(
                    args.crawl_log_dir, options=options))

    # each plugin is named by a dotted path "package.module.ClassName"
    for qualname in args.plugins or []:
        try:
            (module_name, class_name) = qualname.rsplit('.', 1)
            module_ = importlib.import_module(module_name)
            class_ = getattr(module_, class_name)
            listener = class_()
            listener.notify  # make sure it has this method
            listeners.append(listener)
        except Exception as e:
            logging.fatal('problem with plugin class %r: %s', qualname, e)
            sys.exit(1)

    writer_pool = warcprox.writer.WarcWriterPool(options=options)
    # number of warc writer threads = sqrt(proxy.max_threads)
    # I came up with this out of thin air because it strikes me as reasonable
    # 1=>1 2=>1 5=>2 10=>3 50=>7 100=>10 200=>14 500=>22 1000=>32 2000=>45
    num_writer_threads = args.writer_threads or int(proxy.max_threads**0.5)
    logging.debug('initializing %d warc writer threads', num_writer_threads)
    warc_writer_threads = [
        warcprox.writerthread.WarcWriterThread(
            name='WarcWriterThread%03d' % i, recorded_url_q=recorded_url_q,
            writer_pool=writer_pool, dedup_db=dedup_db,
            listeners=listeners, options=options)
        for i in range(num_writer_threads)]

    if args.rethinkdb_services_url:
        parsed = doublethink.parse_rethinkdb_url(
                options.rethinkdb_services_url)
        rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
        svcreg = doublethink.ServiceRegistry(rr, table=parsed.table)
    else:
        svcreg = None

    controller = warcprox.controller.WarcproxController(
            proxy, warc_writer_threads, playback_proxy,
            service_registry=svcreg, options=options)

    return controller
def test_special_dont_write_prefix():
    '''
    Exercises the warc prefix '-' with `WarcWriterThread`, observing the
    result through a listener.

    Two things are asserted, using the records passed to the listener's
    `notify`:
    - with default prefix '-', a url gets no records unless its
      warcprox-meta supplies another prefix
    - with no prefix option given, a url gets records unless its
      warcprox-meta supplies prefix '-'

    The test runs inside a temporary directory and changes back to the
    previous working directory before that temporary directory is removed.
    '''
    class NotifyMe:
        # collects every (recorded_url, records) pair it is notified of
        def __init__(self):
            self.the_list = []
        def notify(self, recorded_url, records):
            self.the_list.append((recorded_url, records))

    def _recorded_url(url, **extra):
        # read the payload through the recorder before taking block_digest
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        return RecordedUrl(
                url=url, content_type='text/plain', status=200,
                client_ip='127.0.0.2', request_data=b'abc',
                response_recorder=recorder, remote_ip='127.0.0.3',
                timestamp=datetime.utcnow(),
                payload_digest=recorder.block_digest, **extra)

    try:
        original_cwd = os.getcwd()
    except OSError:
        # the current working directory no longer exists, so there is
        # nothing to go back to afterwards
        original_cwd = None

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)
        try:
            q = warcprox.TimestampedQueue(maxsize=1)
            listener = NotifyMe()
            wwt = warcprox.writerthread.WarcWriterThread(
                    recorded_url_q=q, options=Options(prefix='-'),
                    listeners=[listener])
            try:
                wwt.start()

                # not to be written due to default prefix
                q.put(_recorded_url('http://example.com/no'))

                # to be written due to warcprox-meta prefix
                q.put(_recorded_url(
                    'http://example.com/yes',
                    warcprox_meta={'warc-prefix': 'normal-warc-prefix'}))

                wait(lambda: len(listener.the_list) == 2, 10.0)
                assert not listener.the_list[0][1]
                assert listener.the_list[1][1]
            finally:
                wwt.stop.set()
                wwt.join()

            q = warcprox.TimestampedQueue(maxsize=1)
            listener = NotifyMe()
            wwt = warcprox.writerthread.WarcWriterThread(
                    recorded_url_q=q, listeners=[listener])
            try:
                wwt.start()

                # to be written due to default prefix
                q.put(_recorded_url('http://example.com/yes'))

                # not to be written due to warcprox-meta prefix
                q.put(_recorded_url(
                    'http://example.com/no',
                    warcprox_meta={'warc-prefix': '-'}))

                wait(lambda: len(listener.the_list) == 2, 10.0)
                assert listener.the_list[0][1]
                assert not listener.the_list[1][1]
            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # step out of tmpdir before TemporaryDirectory deletes it, so
            # the process is not left in a removed working directory
            if original_cwd is not None:
                os.chdir(original_cwd)