def fetcher(ctx, xmlrpc, xmlrpc_host, xmlrpc_port, poolsize, proxy, user_agent, timeout,
            Fetcher=Fetcher):
    """Build a fetcher on the shared queues, register it, and run it.

    Args:
        ctx: Context whose ``obj`` carries the shared state (queues,
            ``phantomjs_proxy``, ``instances``).
        xmlrpc: When truthy, also start the fetcher's XML-RPC endpoint.
        xmlrpc_host: Bind address handed to ``xmlrpc_run``.
        xmlrpc_port: Port handed to ``xmlrpc_run``.
        poolsize: Forwarded to the ``Fetcher`` constructor.
        proxy: Forwarded to the ``Fetcher`` constructor.
        user_agent: If truthy, overrides the instance's ``user_agent``.
        timeout: If truthy, overrides ``default_options['timeout']``.
        Fetcher: Class (or factory) used to create the fetcher.

    Returns:
        The fetcher instance when ``testing_mode`` is set on the shared
        state; otherwise ``None`` after ``run()`` returns.
    """
    shared = ctx.obj
    instance = Fetcher(
        inqueue=shared.scheduler2fetcher,
        outqueue=shared.fetcher2processor,
        poolsize=poolsize,
        proxy=proxy,
    )
    instance.phantomjs_proxy = shared.phantomjs_proxy

    if user_agent:
        instance.user_agent = user_agent
    if timeout:
        # Work on a copy so the mapping the instance started with is not modified.
        instance.default_options = dict(instance.default_options)
        instance.default_options['timeout'] = timeout

    shared.instances.append(instance)

    # In testing mode the caller drives the instance itself.
    if shared.get('testing_mode'):
        return instance

    if xmlrpc:
        run_in_thread(instance.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    instance.run()
def run_fetcher(g=g):
    """Create a Fetcher on the shared queues, expose it over XML-RPC, and run it.

    Args:
        g: Shared state object providing the queues, ``phantomjs_proxy``,
            ``fetcher_xmlrpc_port`` and ``webui_host``.
    """
    from pyspider.fetcher.tornado_fetcher import Fetcher

    worker = Fetcher(
        inqueue=g.scheduler2fetcher,
        outqueue=g.fetcher2processor,
    )
    worker.phantomjs_proxy = g.phantomjs_proxy

    # The XML-RPC endpoint is started through run_in_thread and binds to the
    # same host as the web UI.
    run_in_thread(
        worker.xmlrpc_run,
        port=g.fetcher_xmlrpc_port,
        bind=g.webui_host,
    )
    worker.run()
def webui(ctx, host, port, cdn, scheduler_rpc, fetcher_rpc, max_rate, max_burst,
          username, password, need_auth, app=app):
    """Configure the web UI application and run it.

    Args:
        ctx: Context whose ``obj`` carries the shared state (databases,
            ``phantomjs_proxy``, ``debug``, ``instances``).
        host: Address passed to ``app.run``.
        port: Port passed to ``app.run``.
        cdn: Value stored in ``app.config['cdn']``.
        scheduler_rpc: Scheduler RPC proxy, a URL string to connect to, or
            ``None`` to fall back to the environment / localhost default.
        fetcher_rpc: Fetcher RPC proxy, a URL string to connect to, or
            ``None`` to fetch with an in-process ``Fetcher``.
        max_rate: If truthy, stored in ``app.config['max_rate']``.
        max_burst: If truthy, stored in ``app.config['max_burst']``.
        username: If truthy, stored in ``app.config['webui_username']``.
        password: If truthy, stored in ``app.config['webui_password']``.
        need_auth: Stored in ``app.config['need_auth']``.
        app: The web application object to configure and run.

    Returns:
        ``app`` when ``testing_mode`` is set on the shared state; otherwise
        ``None`` after ``app.run`` returns.
    """
    g = ctx.obj
    app.config['taskdb'] = g.taskdb
    app.config['projectdb'] = g.projectdb
    app.config['resultdb'] = g.resultdb
    app.config['cdn'] = cdn

    if max_rate:
        app.config['max_rate'] = max_rate
    if max_burst:
        app.config['max_burst'] = max_burst
    if username:
        app.config['webui_username'] = username
    if password:
        app.config['webui_password'] = password
    # Previously this parameter was accepted but silently ignored.
    # NOTE(review): the config key is assumed to be 'need_auth' -- confirm
    # against the code in the web UI that enforces authentication.
    app.config['need_auth'] = need_auth

    # fetcher rpc
    if isinstance(fetcher_rpc, six.string_types):
        fetcher_rpc = connect_rpc(ctx, None, fetcher_rpc)
    if fetcher_rpc is None:
        # `async` is a reserved keyword since Python 3.7, so writing it as a
        # literal keyword argument is a SyntaxError there. Unpacking a dict
        # passes the very same argument by name.
        # NOTE(review): if Fetcher has renamed this parameter, update the key.
        fetcher = Fetcher(inqueue=None, outqueue=None, **{'async': False})
        fetcher.phantomjs_proxy = g.phantomjs_proxy
        app.config['fetch'] = lambda x: fetcher.fetch(x)[1]
    else:
        import umsgpack
        app.config['fetch'] = lambda x: umsgpack.unpackb(fetcher_rpc.fetch(x).data)

    # scheduler rpc
    if isinstance(scheduler_rpc, six.string_types):
        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)
    if scheduler_rpc is None and os.environ.get('SCHEDULER_NAME'):
        # Strip the 'tcp://' scheme from the environment-provided address.
        app.config['scheduler_rpc'] = connect_rpc(
            ctx, None,
            'http://%s/' % (os.environ['SCHEDULER_PORT_23333_TCP'][len('tcp://'):]))
    elif scheduler_rpc is None:
        app.config['scheduler_rpc'] = connect_rpc(ctx, None, 'http://localhost:23333/')
    else:
        app.config['scheduler_rpc'] = scheduler_rpc

    app.debug = g.debug
    g.instances.append(app)
    if g.get('testing_mode'):
        return app

    app.run(host=host, port=port)
def fetcher(ctx, xmlrpc, xmlrpc_host, xmlrpc_port):
    """Create a Fetcher on the shared queues, register it, and run it.

    Args:
        ctx: Context whose ``obj`` carries the shared state (queues,
            ``phantomjs_proxy``, ``instances``).
        xmlrpc: When truthy, also start the fetcher's XML-RPC endpoint.
        xmlrpc_host: Bind address handed to ``xmlrpc_run``.
        xmlrpc_port: Port handed to ``xmlrpc_run``.
    """
    from pyspider.fetcher.tornado_fetcher import Fetcher

    shared = ctx.obj
    instance = Fetcher(
        inqueue=shared.scheduler2fetcher,
        outqueue=shared.fetcher2processor,
    )
    instance.phantomjs_proxy = shared.phantomjs_proxy
    shared.instances.append(instance)

    if xmlrpc:
        run_in_thread(instance.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    instance.run()
def webui(ctx, host, port, cdn, scheduler_rpc, fetcher_rpc, max_rate, max_burst,
          username, password):
    """Configure the web UI application and run it.

    Args:
        ctx: Context whose ``obj`` carries the shared state (databases,
            ``phantomjs_proxy``, ``debug``).
        host: Address passed to ``app.run``.
        port: Port passed to ``app.run``.
        cdn: Value stored in ``app.config['cdn']``.
        scheduler_rpc: Scheduler RPC proxy, a URL string to connect to, or
            ``None`` to fall back to the environment / localhost default.
        fetcher_rpc: Fetcher RPC proxy, a URL string to connect to, or
            ``None`` to fetch with an in-process ``Fetcher``.
        max_rate: If truthy, stored in ``app.config['max_rate']``.
        max_burst: If truthy, stored in ``app.config['max_burst']``.
        username: If truthy, stored in ``app.config['webui_username']``.
        password: If truthy, stored in ``app.config['webui_password']``.

    Returns:
        The app when ``testing_mode`` is set on the shared state; otherwise
        ``None`` after ``app.run`` returns.
    """
    # `basestring` only exists on Python 2; six.string_types covers both
    # major versions.
    import six

    from pyspider.webui.app import app

    g = ctx.obj
    app.config['taskdb'] = g.taskdb
    app.config['projectdb'] = g.projectdb
    app.config['resultdb'] = g.resultdb
    app.config['cdn'] = cdn

    if max_rate:
        app.config['max_rate'] = max_rate
    if max_burst:
        app.config['max_burst'] = max_burst
    if username:
        app.config['webui_username'] = username
    if password:
        app.config['webui_password'] = password

    # fetcher rpc
    if isinstance(fetcher_rpc, six.string_types):
        fetcher_rpc = connect_rpc(ctx, None, fetcher_rpc)
    if fetcher_rpc is None:
        from pyspider.fetcher.tornado_fetcher import Fetcher
        # `async` is a reserved keyword since Python 3.7, so writing it as a
        # literal keyword argument is a SyntaxError there. Unpacking a dict
        # passes the very same argument by name.
        # NOTE(review): if Fetcher has renamed this parameter, update the key.
        fetcher = Fetcher(inqueue=None, outqueue=None, **{'async': False})
        fetcher.phantomjs_proxy = g.phantomjs_proxy
        app.config['fetch'] = lambda x: fetcher.fetch(x)[1]
    else:
        import umsgpack
        app.config['fetch'] = lambda x: umsgpack.unpackb(fetcher_rpc.fetch(x).data)

    # scheduler rpc
    if isinstance(scheduler_rpc, six.string_types):
        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)
    if scheduler_rpc is None and os.environ.get('SCHEDULER_NAME'):
        # Strip the 'tcp://' scheme from the environment-provided address.
        app.config['scheduler_rpc'] = connect_rpc(ctx, None, 'http://%s/' % (
            os.environ['SCHEDULER_PORT_23333_TCP'][len('tcp://'):]))
    elif scheduler_rpc is None:
        app.config['scheduler_rpc'] = connect_rpc(ctx, None, 'http://localhost:23333/')
    else:
        app.config['scheduler_rpc'] = scheduler_rpc

    app.debug = g.debug
    if g.get('testing_mode'):
        return app

    app.run(host=host, port=port)
def run_webui(g=g):
    """Configure the web UI application from the shared state and run it.

    Page fetches are served by an in-process ``Fetcher``. Rate limits are
    tightened when ``g.demo_mode`` is set, and credentials are read from the
    ``WEBUI_USERNAME`` / ``WEBUI_PASSWORD`` environment variables when the
    former is present.

    Args:
        g: Shared state object providing the databases, ``scheduler_rpc``,
            ``phantomjs_proxy``, ``demo_mode``, ``debug``, ``webui_host``
            and ``webui_port``.
    """
    from pyspider.fetcher.tornado_fetcher import Fetcher
    from pyspider.webui.app import app

    # `async` is a reserved keyword since Python 3.7, so writing it as a
    # literal keyword argument is a SyntaxError there. Unpacking a dict
    # passes the very same argument by name.
    # NOTE(review): if Fetcher has renamed this parameter, update the key.
    fetcher = Fetcher(inqueue=None, outqueue=None, **{'async': False})
    fetcher.phantomjs_proxy = g.phantomjs_proxy

    app.config['taskdb'] = g.taskdb
    app.config['projectdb'] = g.projectdb
    app.config['resultdb'] = g.resultdb
    app.config['fetch'] = lambda x: fetcher.fetch(x)[1]
    app.config['scheduler_rpc'] = g.scheduler_rpc
    # app.config['cdn'] = '//cdnjs.cloudflare.com/ajax/libs/'

    if g.demo_mode:
        app.config['max_rate'] = 0.2
        app.config['max_burst'] = 3.0

    if 'WEBUI_USERNAME' in os.environ:
        app.config['webui_username'] = os.environ['WEBUI_USERNAME']
        # A missing password variable falls back to the empty string.
        app.config['webui_password'] = os.environ.get('WEBUI_PASSWORD', '')

    # app.debug is left untouched when the shared state sets `all_in_one`.
    if not getattr(g, 'all_in_one', False):
        app.debug = g.debug

    app.run(host=g.webui_host, port=g.webui_port)