def one(ctx, interactive, enable_phantomjs, scripts): """ One mode not only means all-in-one, it runs every thing in one process over tornado.ioloop, for debug purpose """ ctx.obj['debug'] = False g = ctx.obj g['testing_mode'] = True if scripts: from spider.database.local.projectdb import ProjectDB g['projectdb'] = ProjectDB(scripts) if g.get('is_taskdb_default'): g['taskdb'] = connect_database('sqlite+taskdb://') if g.get('is_resultdb_default'): g['resultdb'] = None if enable_phantomjs: phantomjs_config = g.config.get('phantomjs', {}) phantomjs_obj = ctx.invoke(phantomjs, **phantomjs_config) if phantomjs_obj: g.setdefault('phantomjs_proxy', '127.0.0.1:%s' % phantomjs_obj.port) else: phantomjs_obj = None result_worker_config = g.config.get('result_worker', {}) if g.resultdb is None: result_worker_config.setdefault('result_cls', 'spider.result.OneResultWorker') result_worker_obj = ctx.invoke(result_worker, **result_worker_config) processor_config = g.config.get('processor', {}) processor_config.setdefault('enable_stdout_capture', False) processor_obj = ctx.invoke(processor, **processor_config) fetcher_config = g.config.get('fetcher', {}) fetcher_config.setdefault('xmlrpc', False) fetcher_obj = ctx.invoke(fetcher, **fetcher_config) scheduler_config = g.config.get('scheduler', {}) scheduler_config.setdefault('xmlrpc', False) scheduler_config.setdefault('scheduler_cls', 'spider.scheduler.OneScheduler') scheduler_obj = ctx.invoke(scheduler, **scheduler_config) scheduler_obj.init_one(ioloop=fetcher_obj.ioloop, fetcher=fetcher_obj, processor=processor_obj, result_worker=result_worker_obj, interactive=interactive) if scripts: for project in g.projectdb.projects: scheduler_obj.trigger_on_start(project) try: scheduler_obj.run() finally: scheduler_obj.quit() if phantomjs_obj: phantomjs_obj.quit()
def cli(ctx, **kwargs): """ A powerful spider system in python. """ logging.config.fileConfig(kwargs['logging_config']) # get db from env for db in ('taskdb', 'projectdb', 'resultdb'): if kwargs[db] is not None: continue if os.environ.get('MYSQL_NAME'): kwargs[db] = utils.Get(lambda db=db: connect_database( 'sqlalchemy+mysql+%s://%s:%s/%s' % ( db, os.environ['MYSQL_PORT_3306_TCP_ADDR'], os.environ['MYSQL_PORT_3306_TCP_PORT'], db))) elif os.environ.get('MONGODB_NAME'): kwargs[db] = utils.Get(lambda db=db: connect_database( 'mongodb+%s://%s:%s/%s' % ( db, os.environ['MONGODB_PORT_27017_TCP_ADDR'], os.environ['MONGODB_PORT_27017_TCP_PORT'], db))) elif ctx.invoked_subcommand == 'bench': if kwargs['data_path'] == './data': kwargs['data_path'] += '/bench' shutil.rmtree(kwargs['data_path'], ignore_errors=True) os.mkdir(kwargs['data_path']) if db in ('taskdb', 'resultdb'): kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s://' % (db))) else: kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % ( db, kwargs['data_path'], db[:-2]))) else: if not os.path.exists(kwargs['data_path']): os.mkdir(kwargs['data_path']) kwargs[db] = utils.Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % ( db, kwargs['data_path'], db[:-2]))) kwargs['is_%s_default' % db] = True # create folder for counter.dump if not os.path.exists(kwargs['data_path']): os.mkdir(kwargs['data_path']) # message queue, compatible with old version if kwargs.get('message_queue'): pass elif kwargs.get('amqp_url'): kwargs['message_queue'] = kwargs['amqp_url'] elif os.environ.get('RABBITMQ_NAME'): kwargs['message_queue'] = ("amqp://*****:*****@%(RABBITMQ_PORT_5672_TCP_ADDR)s" ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ) elif kwargs.get('beanstalk'): kwargs['message_queue'] = "beanstalk://%s/" % kwargs['beanstalk'] for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher', 'fetcher2processor', 'processor2result'): if kwargs.get('message_queue'): kwargs[name] = utils.Get(lambda name=name: connect_message_queue( name, kwargs.get('message_queue'), kwargs['queue_maxsize'])) else: kwargs[name] = connect_message_queue(name, kwargs.get('message_queue'), kwargs['queue_maxsize']) # phantomjs-proxy if kwargs.get('phantomjs_proxy'): pass elif os.environ.get('PHANTOMJS_NAME'): kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT_25555_TCP'][len('tcp://'):] ctx.obj = utils.ObjectDict(ctx.obj or {}) ctx.obj['instances'] = [] ctx.obj.update(kwargs) if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'): ctx.invoke(all) return ctx
def connect_db(ctx, param, value): if not value: return return utils.Get(lambda: connect_database(value))