예제 #1
0
    def __init__(self, rethinkdb_trough_db_url, promotion_interval=None):
        '''
        TroughClient constructor

        Args:
            rethinkdb_trough_db_url: url with schema rethinkdb:// pointing to
                trough configuration database
            promotion_interval: if specified, `TroughClient` will spawn a
                thread that "promotes" (pushes to hdfs) "dirty" trough segments
                (segments that have received writes) periodically, sleeping for
                `promotion_interval` seconds between cycles (default None)
        '''
        parsed = doublethink.parse_rethinkdb_url(rethinkdb_trough_db_url)
        self.rr = doublethink.Rethinker(servers=parsed.hosts,
                                        db=parsed.database)
        self.svcreg = doublethink.ServiceRegistry(self.rr)
        # per-segment caches of trough write/read endpoint urls
        self._write_url_cache = {}
        self._read_url_cache = {}
        # segments that have received writes, pending promotion
        self._dirty_segments = set()
        self._dirty_segments_lock = threading.RLock()

        self.promotion_interval = promotion_interval
        self._promoter_thread = None
        if promotion_interval:
            # daemon=True so this background thread never blocks interpreter
            # shutdown; passing it to the constructor replaces the deprecated
            # Thread.setDaemon() call
            self._promoter_thread = threading.Thread(
                target=self._promotrix, name='TroughClient-promoter',
                daemon=True)
            self._promoter_thread.start()
예제 #2
0
 def __init__(self, options=warcprox.Options()):
     '''
     Parse the rethinkdb dedup url from `options`, open the connection,
     and call self._ensure_db_table() so the dedup table exists.
     '''
     url_info = doublethink.parse_rethinkdb_url(options.rethinkdb_dedup_url)
     self.rr = doublethink.Rethinker(
             servers=url_info.hosts, db=url_info.database)
     self.table = url_info.table
     self._ensure_db_table()
     self.options = options
예제 #3
0
파일: dedup.py 프로젝트: nlevitt/warcprox
 def __init__(self, options=warcprox.Options()):
     '''
     Connect to the rethinkdb cluster configured by
     `options.rethinkdb_dedup_url` and ensure the dedup table exists.
     '''
     conf = doublethink.parse_rethinkdb_url(options.rethinkdb_dedup_url)
     self.rr = doublethink.Rethinker(servers=conf.hosts, db=conf.database)
     self.table = conf.table
     self._ensure_db_table()
     self.options = options
예제 #4
0
파일: trough.py 프로젝트: ukwa/warcprox
    def __init__(self, rethinkdb_trough_db_url, promotion_interval=None):
        '''
        TroughClient constructor

        Args:
            rethinkdb_trough_db_url: url with schema rethinkdb:// pointing to
                trough configuration database
            promotion_interval: if specified, `TroughClient` will spawn a
                thread that "promotes" (pushes to hdfs) "dirty" trough segments
                (segments that have received writes) periodically, sleeping for
                `promotion_interval` seconds between cycles (default None)
        '''
        parsed = doublethink.parse_rethinkdb_url(rethinkdb_trough_db_url)
        self.rr = doublethink.Rethinker(
                servers=parsed.hosts, db=parsed.database)
        self.svcreg = doublethink.ServiceRegistry(self.rr)
        # per-segment caches of trough write/read endpoint urls
        self._write_url_cache = {}
        self._read_url_cache = {}
        # segments that have received writes, pending promotion
        self._dirty_segments = set()
        self._dirty_segments_lock = threading.RLock()

        self.promotion_interval = promotion_interval
        self._promoter_thread = None
        if promotion_interval:
            # daemon=True replaces the deprecated Thread.setDaemon() call and
            # keeps the promoter from blocking interpreter shutdown
            self._promoter_thread = threading.Thread(
                    target=self._promotrix, name='TroughClient-promoter',
                    daemon=True)
            self._promoter_thread.start()
예제 #5
0
 def service_registry(options):
     '''
     Build a doublethink.ServiceRegistry from
     `options.rethinkdb_services_url`; returns None when no url is set.
     '''
     if not options.rethinkdb_services_url:
         return None
     conf = doublethink.parse_rethinkdb_url(options.rethinkdb_services_url)
     rethinker = doublethink.Rethinker(
             servers=conf.hosts, db=conf.database)
     return doublethink.ServiceRegistry(rethinker, table=conf.table)
예제 #6
0
 def service_registry(options):
     '''
     Create a doublethink.ServiceRegistry for the table named by
     `options.rethinkdb_services_url`, or return None if it is unset.
     '''
     url = options.rethinkdb_services_url
     if url:
         conf = doublethink.parse_rethinkdb_url(url)
         rethinker = doublethink.Rethinker(
                 servers=conf.hosts, db=conf.database)
         return doublethink.ServiceRegistry(rethinker, table=conf.table)
     else:
         return None
예제 #7
0
파일: stats.py 프로젝트: ukwa/warcprox
    def __init__(self, options=warcprox.Options()):
        '''
        Chain up to StatsProcessor, then open the rethinkdb connection
        described by `options.rethinkdb_stats_url`.
        '''
        StatsProcessor.__init__(self, options)

        conf = doublethink.parse_rethinkdb_url(options.rethinkdb_stats_url)
        self.rr = doublethink.Rethinker(servers=conf.hosts, db=conf.database)
        self.table = conf.table
        # replicate the table across at most 3 rethinkdb servers
        self.replicas = min(3, len(self.rr.servers))
예제 #8
0
    def __init__(self, options=warcprox.Options()):
        '''
        Initialize the base StatsProcessor and connect to the rethinkdb
        stats table from `options.rethinkdb_stats_url`.
        '''
        StatsProcessor.__init__(self, options)

        url_info = doublethink.parse_rethinkdb_url(options.rethinkdb_stats_url)
        self.rr = doublethink.Rethinker(
                servers=url_info.hosts, db=url_info.database)
        self.table = url_info.table
        # cap replication at 3 even if the cluster has more servers
        self.replicas = min(3, len(self.rr.servers))
예제 #9
0
파일: stats.py 프로젝트: nlevitt/warcprox
    def __init__(self, options=warcprox.Options()):
        '''
        Open the rethinkdb connection from `options.rethinkdb_stats_url`,
        ensure the stats table exists, and set up batching state.
        '''
        conf = doublethink.parse_rethinkdb_url(options.rethinkdb_stats_url)
        self.rr = doublethink.Rethinker(servers=conf.hosts, db=conf.database)
        self.table = conf.table
        # replicate the table across at most 3 rethinkdb servers
        self.replicas = min(3, len(self.rr.servers))
        self._ensure_db_table()
        self.options = options

        self._stop = threading.Event()
        self._batch_lock = threading.RLock()
        # initialize the pending batch under its lock
        with self._batch_lock:
            self._batch = {}
        self._timer = None
예제 #10
0
    def __init__(self, options=warcprox.Options()):
        '''
        Connect to the rethinkdb "big table" configured by
        `options.rethinkdb_big_table_url` and prepare batching state.
        '''
        url_info = doublethink.parse_rethinkdb_url(
                options.rethinkdb_big_table_url)
        self.rr = doublethink.Rethinker(
                servers=url_info.hosts, db=url_info.database)
        self.table = url_info.table
        self.options = options
        self._ensure_db_table()

        self._stop = threading.Event()
        self._batch_lock = threading.RLock()
        # initialize the pending batch under its lock
        with self._batch_lock:
            self._batch = []
        self._timer = None
예제 #11
0
def ensure_rethinkdb_tables(argv=None):
    '''
    Creates rethinkdb tables if they don't already exist. Warcprox normally
    creates the tables it needs on demand at startup, but if multiple instances
    are starting up at the same time, you can end up with duplicate broken
    tables. So it's a good idea to use this utility at an early step when
    spinning up a cluster.

    Args:
        argv: command line arguments (defaults to sys.argv)
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
        '--rethinkdb-stats-url',
        dest='rethinkdb_stats_url',
        help=('rethinkdb stats table url, e.g. rethinkdb://db0.foo.org,'
              'db1.foo.org:38015/my_warcprox_db/my_stats_table'))
    # dedup-related backends are mutually exclusive: at most one may be given
    group = arg_parser.add_mutually_exclusive_group()
    group.add_argument(
        '--rethinkdb-dedup-url',
        dest='rethinkdb_dedup_url',
        help=('rethinkdb dedup url, e.g. rethinkdb://db0.foo.org,'
              'db1.foo.org:38015/my_warcprox_db/my_dedup_table'))
    group.add_argument(
        '--rethinkdb-big-table-url',
        dest='rethinkdb_big_table_url',
        help=('rethinkdb big table url (table will be populated with '
              'various capture information and is suitable for use as '
              'index for playback), e.g. rethinkdb://db0.foo.org,'
              'db1.foo.org:38015/my_warcprox_db/captures'))
    group.add_argument(
        '--rethinkdb-trough-db-url',
        dest='rethinkdb_trough_db_url',
        help=('🐷   url pointing to trough configuration rethinkdb database, '
              'e.g. rethinkdb://db0.foo.org,db1.foo.org:38015'
              '/trough_configuration'))
    arg_parser.add_argument(
        '--rethinkdb-services-url',
        dest='rethinkdb_services_url',
        help=('rethinkdb service registry table url; if provided, warcprox '
              'will create and heartbeat entry for itself'))
    arg_parser.add_argument('-q',
                            '--quiet',
                            dest='log_level',
                            action='store_const',
                            default=logging.INFO,
                            const=logging.WARN)
    arg_parser.add_argument('-v',
                            '--verbose',
                            dest='log_level',
                            action='store_const',
                            default=logging.INFO,
                            const=logging.DEBUG)
    args = arg_parser.parse_args(args=argv[1:])

    logging.basicConfig(
        stream=sys.stdout,
        level=args.log_level,
        format=('%(asctime)s %(levelname)s %(name)s.%(funcName)s'
                '(%(filename)s:%(lineno)d) %(message)s'))

    options = warcprox.Options(**vars(args))

    # each constructor below creates its rethinkdb table(s) as a side effect
    did_something = False
    if args.rethinkdb_services_url:
        parsed = doublethink.parse_rethinkdb_url(
            options.rethinkdb_services_url)
        rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
        svcreg = doublethink.ServiceRegistry(rr, table=parsed.table)
        did_something = True
    if args.rethinkdb_stats_url:
        stats_db = warcprox.stats.RethinkStatsProcessor(options=options)
        stats_db._ensure_db_table()
        did_something = True
    if args.rethinkdb_dedup_url:
        dedup_db = warcprox.dedup.RethinkDedupDb(options=options)
        did_something = True
    if args.rethinkdb_big_table_url:
        dedup_db = warcprox.bigtable.RethinkCapturesDedup(options=options)
        did_something = True
    if args.rethinkdb_trough_db_url:
        dedup_db = warcprox.dedup.TroughDedupDb(options)
        # logging.warn() is a deprecated alias; use logging.warning()
        logging.warning(
            'trough is responsible for creating most of the rethinkdb '
            'tables that it uses')
        did_something = True

    if not did_something:
        logging.error('nothing to do, no --rethinkdb-* options supplied')
예제 #12
0
def ensure_rethinkdb_tables(argv=None):
    '''
    Creates rethinkdb tables if they don't already exist. Warcprox normally
    creates the tables it needs on demand at startup, but if multiple instances
    are starting up at the same time, you can end up with duplicate broken
    tables. So it's a good idea to use this utility at an early step when
    spinning up a cluster.

    Args:
        argv: command line arguments (defaults to sys.argv)
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
            '--rethinkdb-stats-url', dest='rethinkdb_stats_url', help=(
                'rethinkdb stats table url, e.g. rethinkdb://db0.foo.org,'
                'db1.foo.org:38015/my_warcprox_db/my_stats_table'))
    # dedup-related backends are mutually exclusive: at most one may be given
    group = arg_parser.add_mutually_exclusive_group()
    group.add_argument(
            '--rethinkdb-dedup-url', dest='rethinkdb_dedup_url', help=(
                'rethinkdb dedup url, e.g. rethinkdb://db0.foo.org,'
                'db1.foo.org:38015/my_warcprox_db/my_dedup_table'))
    group.add_argument(
            '--rethinkdb-big-table-url', dest='rethinkdb_big_table_url', help=(
                'rethinkdb big table url (table will be populated with '
                'various capture information and is suitable for use as '
                'index for playback), e.g. rethinkdb://db0.foo.org,'
                'db1.foo.org:38015/my_warcprox_db/captures'))
    group.add_argument(
            '--rethinkdb-trough-db-url', dest='rethinkdb_trough_db_url', help=(
                '🐷   url pointing to trough configuration rethinkdb database, '
                'e.g. rethinkdb://db0.foo.org,db1.foo.org:38015'
                '/trough_configuration'))
    arg_parser.add_argument(
            '--rethinkdb-services-url', dest='rethinkdb_services_url', help=(
                'rethinkdb service registry table url; if provided, warcprox '
                'will create and heartbeat entry for itself'))
    arg_parser.add_argument(
            '-q', '--quiet', dest='log_level',
            action='store_const', default=logging.INFO, const=logging.WARN)
    arg_parser.add_argument(
            '-v', '--verbose', dest='log_level',
            action='store_const', default=logging.INFO, const=logging.DEBUG)
    args = arg_parser.parse_args(args=argv[1:])

    logging.basicConfig(
            stream=sys.stdout, level=args.log_level, format=(
                '%(asctime)s %(levelname)s %(name)s.%(funcName)s'
                '(%(filename)s:%(lineno)d) %(message)s'))

    options = warcprox.Options(**vars(args))

    # each constructor below creates its rethinkdb table(s) as a side effect;
    # the objects themselves are discarded
    did_something = False
    if args.rethinkdb_services_url:
        parsed = doublethink.parse_rethinkdb_url(
                options.rethinkdb_services_url)
        rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
        svcreg = doublethink.ServiceRegistry(rr, table=parsed.table)
        did_something = True
    if args.rethinkdb_stats_url:
        stats_db = warcprox.stats.RethinkStatsProcessor(options=options)
        stats_db._ensure_db_table()
        did_something = True
    if args.rethinkdb_dedup_url:
        dedup_db = warcprox.dedup.RethinkDedupDb(options=options)
        did_something = True
    if args.rethinkdb_big_table_url:
        dedup_db = warcprox.bigtable.RethinkCapturesDedup(options=options)
        did_something = True
    if args.rethinkdb_trough_db_url:
        dedup_db = warcprox.dedup.TroughDedupDb(options)
        logging.warning(
                'trough is responsible for creating most of the rethinkdb '
                'tables that it uses')
        did_something = True

    if not did_something:
        logging.error('nothing to do, no --rethinkdb-* options supplied')
예제 #13
0
파일: main.py 프로젝트: nlevitt/warcprox
def init_controller(args):
    '''
    Creates a warcprox.controller.WarcproxController configured according to
    the supplied arguments (normally the result of parse_args(sys.argv)).

    Args:
        args: parsed command line arguments

    Returns:
        warcprox.controller.WarcproxController
    '''
    options = warcprox.Options(**vars(args))

    # fail fast if the configured digest algorithm is not available
    try:
        hashlib.new(args.digest_algorithm)
    except Exception as e:
        logging.fatal(e)
        exit(1)

    listeners = []

    # select exactly one dedup backend; precedence is the order checked here
    if args.rethinkdb_dedup_url:
        dedup_db = warcprox.dedup.RethinkDedupDb(options=options)
    elif args.rethinkdb_big_table_url:
        dedup_db = warcprox.bigtable.RethinkCapturesDedup(options=options)
    elif args.rethinkdb_trough_db_url:
        dedup_db = warcprox.dedup.TroughDedupDb(options)
    elif args.cdxserver_dedup:
        dedup_db = warcprox.dedup.CdxServerDedup(cdx_url=args.cdxserver_dedup)
    elif args.dedup_db_file in (None, '', '/dev/null'):
        logging.info('deduplication disabled')
        dedup_db = None
    else:
        dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file, options=options)
    if dedup_db:
        listeners.append(dedup_db)

    if args.rethinkdb_stats_url:
        stats_db = warcprox.stats.RethinkStatsDb(options=options)
        listeners.append(stats_db)
    elif args.stats_db_file in (None, '', '/dev/null'):
        logging.info('statistics tracking disabled')
        stats_db = None
    else:
        stats_db = warcprox.stats.StatsDb(args.stats_db_file, options=options)
        listeners.append(stats_db)

    recorded_url_q = warcprox.TimestampedQueue(maxsize=args.queue_size)

    # certificate CN is limited to 64 characters, hence the truncation
    ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
    ca = certauth.certauth.CertificateAuthority(args.cacert, args.certs_dir,
                                                ca_name=ca_name)

    proxy = warcprox.warcproxy.WarcProxy(ca=ca, recorded_url_q=recorded_url_q,
            stats_db=stats_db, options=options)

    if args.playback_port is not None:
        playback_index_db = warcprox.playback.PlaybackIndexDb(
                args.playback_index_db_file, options=options)
        playback_proxy = warcprox.playback.PlaybackProxy(
                ca=ca, playback_index_db=playback_index_db, options=options)
        listeners.append(playback_index_db)
    else:
        playback_index_db = None
        playback_proxy = None

    if args.crawl_log_dir:
        listeners.append(warcprox.crawl_log.CrawlLogger(
            args.crawl_log_dir, options=options))

    # load plugin listeners by dotted name; each plugin class must be
    # importable, take no constructor args, and expose a notify attribute
    for qualname in args.plugins or []:
        try:
            (module_name, class_name) = qualname.rsplit('.', 1)
            module_ = importlib.import_module(module_name)
            class_ = getattr(module_, class_name)
            listener = class_()
            listener.notify  # make sure it has this method
            listeners.append(listener)
        except Exception as e:
            logging.fatal('problem with plugin class %r: %s', qualname, e)
            sys.exit(1)

    writer_pool = warcprox.writer.WarcWriterPool(options=options)
    # number of warc writer threads = sqrt(proxy.max_threads)
    # I came up with this out of thin air because it strikes me as reasonable
    # 1=>1 2=>1 5=>2 10=>3 50=>7 100=>10 200=>14 500=>22 1000=>32 2000=>45
    num_writer_threads = args.writer_threads or int(proxy.max_threads ** 0.5)
    logging.debug('initializing %d warc writer threads', num_writer_threads)
    warc_writer_threads = [
            warcprox.writerthread.WarcWriterThread(
                name='WarcWriterThread%03d' % i, recorded_url_q=recorded_url_q,
                writer_pool=writer_pool, dedup_db=dedup_db,
                listeners=listeners, options=options)
            for i in range(num_writer_threads)]

    if args.rethinkdb_services_url:
        parsed = doublethink.parse_rethinkdb_url(
                options.rethinkdb_services_url)
        rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
        svcreg = doublethink.ServiceRegistry(rr, table=parsed.table)
    else:
        svcreg = None

    controller = warcprox.controller.WarcproxController(
            proxy, warc_writer_threads, playback_proxy,
            service_registry=svcreg, options=options)

    return controller
예제 #14
0
파일: main.py 프로젝트: d235j/warcprox
def init_controller(args):
    '''
    Creates a warcprox.controller.WarcproxController configured according to
    the supplied arguments (normally the result of parse_args(sys.argv)).

    Args:
        args: parsed command line arguments

    Returns:
        warcprox.controller.WarcproxController
    '''
    options = warcprox.Options(**vars(args))

    # fail fast if the configured digest algorithm is not available
    try:
        hashlib.new(args.digest_algorithm)
    except Exception as e:
        logging.fatal(e)
        exit(1)

    listeners = []

    # select exactly one dedup backend; precedence is the order checked here
    if args.rethinkdb_dedup_url:
        dedup_db = warcprox.dedup.RethinkDedupDb(options=options)
    elif args.rethinkdb_big_table_url:
        dedup_db = warcprox.bigtable.RethinkCapturesDedup(options=options)
    elif args.rethinkdb_trough_db_url:
        dedup_db = warcprox.dedup.TroughDedupDb(options)
    elif args.cdxserver_dedup:
        dedup_db = warcprox.dedup.CdxServerDedup(cdx_url=args.cdxserver_dedup)
    elif args.dedup_db_file in (None, '', '/dev/null'):
        logging.info('deduplication disabled')
        dedup_db = None
    else:
        dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file, options=options)
    if dedup_db:
        listeners.append(dedup_db)

    if args.rethinkdb_stats_url:
        stats_db = warcprox.stats.RethinkStatsDb(options=options)
        listeners.append(stats_db)
    elif args.stats_db_file in (None, '', '/dev/null'):
        logging.info('statistics tracking disabled')
        stats_db = None
    else:
        stats_db = warcprox.stats.StatsDb(args.stats_db_file, options=options)
        listeners.append(stats_db)

    recorded_url_q = warcprox.TimestampedQueue(maxsize=args.queue_size)

    # certificate CN is limited to 64 characters, hence the truncation
    ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
    ca = certauth.certauth.CertificateAuthority(args.cacert,
                                                args.certs_dir,
                                                ca_name=ca_name)

    proxy = warcprox.warcproxy.WarcProxy(ca=ca,
                                         recorded_url_q=recorded_url_q,
                                         stats_db=stats_db,
                                         options=options)

    if args.playback_port is not None:
        playback_index_db = warcprox.playback.PlaybackIndexDb(
            args.playback_index_db_file, options=options)
        playback_proxy = warcprox.playback.PlaybackProxy(
            ca=ca, playback_index_db=playback_index_db, options=options)
        listeners.append(playback_index_db)
    else:
        playback_index_db = None
        playback_proxy = None

    if args.crawl_log_dir:
        listeners.append(
            warcprox.crawl_log.CrawlLogger(args.crawl_log_dir,
                                           options=options))

    # load plugin listeners by dotted name; each plugin class must be
    # importable, take no constructor args, and expose a notify attribute
    for qualname in args.plugins or []:
        try:
            (module_name, class_name) = qualname.rsplit('.', 1)
            module_ = importlib.import_module(module_name)
            class_ = getattr(module_, class_name)
            listener = class_()
            listener.notify  # make sure it has this method
            listeners.append(listener)
        except Exception as e:
            logging.fatal('problem with plugin class %r: %s', qualname, e)
            sys.exit(1)

    writer_pool = warcprox.writer.WarcWriterPool(options=options)
    # number of warc writer threads = sqrt(proxy.max_threads)
    # I came up with this out of thin air because it strikes me as reasonable
    # 1=>1 2=>1 5=>2 10=>3 50=>7 100=>10 200=>14 500=>22 1000=>32 2000=>45
    num_writer_threads = args.writer_threads or int(proxy.max_threads**0.5)
    logging.debug('initializing %d warc writer threads', num_writer_threads)
    warc_writer_threads = [
        warcprox.writerthread.WarcWriterThread(name='WarcWriterThread%03d' % i,
                                               recorded_url_q=recorded_url_q,
                                               writer_pool=writer_pool,
                                               dedup_db=dedup_db,
                                               listeners=listeners,
                                               options=options)
        for i in range(num_writer_threads)
    ]

    if args.rethinkdb_services_url:
        parsed = doublethink.parse_rethinkdb_url(
            options.rethinkdb_services_url)
        rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
        svcreg = doublethink.ServiceRegistry(rr, table=parsed.table)
    else:
        svcreg = None

    controller = warcprox.controller.WarcproxController(
        proxy,
        warc_writer_threads,
        playback_proxy,
        service_registry=svcreg,
        options=options)

    return controller