def brozzler_worker(argv=None):
    '''
    Main entry point for brozzler, gets sites and pages to brozzle from
    rethinkdb, brozzles them.

    Parses the command line, builds a `brozzler.worker.BrozzlerWorker`
    backed by a rethinkdb frontier and service registry, installs signal
    handlers (SIGQUIT dumps the stack of every thread to the log, SIGTERM
    and SIGINT call `worker.stop()`), then runs the worker in a thread
    named 'BrozzlerWorkerThread' and blocks until that thread finishes.

    Args:
        argv: full argument vector including the program name at index 0;
            falls back to `sys.argv` when None or empty
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            formatter_class=BetterArgumentDefaultsHelpFormatter)
    add_rethinkdb_options(arg_parser)
    arg_parser.add_argument(
            '-e', '--chrome-exe', dest='chrome_exe',
            default=suggest_default_chrome_exe(),
            help='executable to use to invoke chrome')
    arg_parser.add_argument(
            '-n', '--max-browsers', dest='max_browsers', default='1',
            help='max number of chrome instances simultaneously browsing pages')
    arg_parser.add_argument(
            '--proxy', dest='proxy', default=None, help='http proxy')
    arg_parser.add_argument(
            '--warcprox-auto', dest='warcprox_auto', action='store_true',
            help=(
                'when needed, choose an available instance of warcprox from '
                'the rethinkdb service registry'))
    # the --skip-* switches are hidden from --help (argparse.SUPPRESS)
    arg_parser.add_argument(
            '--skip-extract-outlinks', dest='skip_extract_outlinks',
            action='store_true', help=argparse.SUPPRESS)
    arg_parser.add_argument(
            '--skip-visit-hashtags', dest='skip_visit_hashtags',
            action='store_true', help=argparse.SUPPRESS)
    arg_parser.add_argument(
            '--skip-youtube-dl', dest='skip_youtube_dl',
            action='store_true', help=argparse.SUPPRESS)
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    def dump_state(signum, frame):
        '''
        SIGQUIT handler: logs the name and current stack of every thread.
        '''
        # ignore SIGQUIT for the duration of the dump (presumably so that a
        # second signal cannot re-enter this handler); the handler is
        # reinstalled in the finally clause below
        signal.signal(signal.SIGQUIT, signal.SIG_IGN)
        try:
            state_strs = []
            frames = sys._current_frames()
            threads = {t.ident: t for t in threading.enumerate()}
            for ident in frames:
                # sys._current_frames() may report thread ids that
                # threading.enumerate() does not know about, so look the
                # thread up with .get() instead of indexing, which would
                # raise KeyError and abort the whole dump
                thread = threads.get(ident)
                if thread is not None:
                    state_strs.append(str(thread))
                else:
                    state_strs.append('<???:thread:ident=%s>' % ident)
                stack = traceback.format_stack(frames[ident])
                state_strs.append(''.join(stack))
            logging.info(
                    'dumping state (caught signal %s)\n%s',
                    signum, '\n'.join(state_strs))
        except BaseException as e:
            # best effort: a failure to dump state must never take the
            # worker down
            logging.error('exception dumping state: %s', e)
        finally:
            signal.signal(signal.SIGQUIT, dump_state)

    rr = rethinker(args)
    frontier = brozzler.RethinkDbFrontier(rr)
    service_registry = doublethink.ServiceRegistry(rr)
    worker = brozzler.worker.BrozzlerWorker(
            frontier, service_registry,
            # --max-browsers is parsed as a string, convert it here
            max_browsers=int(args.max_browsers),
            chrome_exe=args.chrome_exe, proxy=args.proxy,
            warcprox_auto=args.warcprox_auto,
            skip_extract_outlinks=args.skip_extract_outlinks,
            skip_visit_hashtags=args.skip_visit_hashtags,
            skip_youtube_dl=args.skip_youtube_dl)

    signal.signal(signal.SIGQUIT, dump_state)
    signal.signal(signal.SIGTERM, lambda s, f: worker.stop())
    signal.signal(signal.SIGINT, lambda s, f: worker.stop())

    # the worker runs in its own thread while this (main) thread waits in
    # join(); python delivers signals to the main thread
    th = threading.Thread(target=worker.run, name='BrozzlerWorkerThread')
    th.start()
    th.join()
    logging.info('brozzler-worker is all done, exiting')
def brozzler_worker(argv=None):
    '''
    Main entry point for brozzler, gets sites and pages to brozzle from
    rethinkdb, brozzles them.

    Parses the command line, calls `brozzler.chrome.check_version` on the
    selected chrome executable, builds a `brozzler.worker.BrozzlerWorker`
    backed by a rethinkdb frontier and service registry, installs signal
    handlers (SIGQUIT dumps the stack of every thread to the log, SIGTERM
    and SIGINT call `worker.stop()`), then runs the worker in a thread
    named 'BrozzlerWorkerThread' and blocks until that thread finishes.

    Args:
        argv: full argument vector including the program name at index 0;
            falls back to `sys.argv` when None or empty
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            formatter_class=BetterArgumentDefaultsHelpFormatter)
    add_rethinkdb_options(arg_parser)
    arg_parser.add_argument(
            '-e', '--chrome-exe', dest='chrome_exe',
            default=suggest_default_chrome_exe(),
            help='executable to use to invoke chrome')
    arg_parser.add_argument(
            '-n', '--max-browsers', dest='max_browsers', default='1',
            help='max number of chrome instances simultaneously browsing pages')
    arg_parser.add_argument(
            '--proxy', dest='proxy', default=None, help='http proxy')
    arg_parser.add_argument(
            '--warcprox-auto', dest='warcprox_auto', action='store_true',
            help=(
                'when needed, choose an available instance of warcprox from '
                'the rethinkdb service registry'))
    # the --skip-* switches are hidden from --help (argparse.SUPPRESS)
    arg_parser.add_argument(
            '--skip-extract-outlinks', dest='skip_extract_outlinks',
            action='store_true', help=argparse.SUPPRESS)
    arg_parser.add_argument(
            '--skip-visit-hashtags', dest='skip_visit_hashtags',
            action='store_true', help=argparse.SUPPRESS)
    arg_parser.add_argument(
            '--skip-youtube-dl', dest='skip_youtube_dl',
            action='store_true', help=argparse.SUPPRESS)
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)
    # NOTE(review): presumably rejects an unsupported chrome version before
    # any work starts -- confirm against brozzler.chrome.check_version
    brozzler.chrome.check_version(args.chrome_exe)

    def dump_state(signum, frame):
        '''
        SIGQUIT handler: logs the name and current stack of every thread.
        '''
        # ignore SIGQUIT for the duration of the dump (presumably so that a
        # second signal cannot re-enter this handler); the handler is
        # reinstalled in the finally clause below
        signal.signal(signal.SIGQUIT, signal.SIG_IGN)
        try:
            state_strs = []
            frames = sys._current_frames()
            threads = {t.ident: t for t in threading.enumerate()}
            for ident in frames:
                # sys._current_frames() may report thread ids that
                # threading.enumerate() does not know about, so look the
                # thread up with .get() instead of indexing, which would
                # raise KeyError and abort the whole dump
                thread = threads.get(ident)
                if thread is not None:
                    state_strs.append(str(thread))
                else:
                    state_strs.append('<???:thread:ident=%s>' % ident)
                stack = traceback.format_stack(frames[ident])
                state_strs.append(''.join(stack))
            logging.info(
                    'dumping state (caught signal %s)\n%s',
                    signum, '\n'.join(state_strs))
        except BaseException as e:
            # best effort: a failure to dump state must never take the
            # worker down
            logging.error('exception dumping state: %s', e)
        finally:
            signal.signal(signal.SIGQUIT, dump_state)

    rr = rethinker(args)
    frontier = brozzler.RethinkDbFrontier(rr)
    service_registry = doublethink.ServiceRegistry(rr)
    worker = brozzler.worker.BrozzlerWorker(
            frontier, service_registry,
            # --max-browsers is parsed as a string, convert it here
            max_browsers=int(args.max_browsers),
            chrome_exe=args.chrome_exe, proxy=args.proxy,
            warcprox_auto=args.warcprox_auto,
            skip_extract_outlinks=args.skip_extract_outlinks,
            skip_visit_hashtags=args.skip_visit_hashtags,
            skip_youtube_dl=args.skip_youtube_dl)

    signal.signal(signal.SIGQUIT, dump_state)
    signal.signal(signal.SIGTERM, lambda s, f: worker.stop())
    signal.signal(signal.SIGINT, lambda s, f: worker.stop())

    # the worker runs in its own thread while this (main) thread waits in
    # join(); python delivers signals to the main thread
    th = threading.Thread(target=worker.run, name='BrozzlerWorkerThread')
    th.start()
    th.join()
    logging.info('brozzler-worker is all done, exiting')