def setUpClass(self):
    """Boot a complete pyspider stack (scheduler, fetcher, processor,
    result worker, webui) plus a local httpbin, all backed by fresh
    SQLAlchemy/SQLite databases under ./data/tests.
    """
    # Start from a clean data directory so previous runs can't interfere.
    shutil.rmtree('./data/tests', ignore_errors=True)
    os.makedirs('./data/tests')

    # Imported for their side effects: test webpage fixtures and the
    # bench test webui module.
    import tests.data_test_webpage
    import httpbin
    from pyspider.webui import bench_test  # flake8: noqa

    # httpbin runs in a subprocess and serves the fetch targets.
    self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)
    self.httpbin = 'http://127.0.0.1:14887'

    # Build the shared CLI context with SQLAlchemy+SQLite databases.
    ctx = run.cli.make_context('test', [
        '--taskdb', 'sqlalchemy+sqlite+taskdb:///data/tests/task.db',
        '--projectdb', 'sqlalchemy+sqlite+projectdb:///data/tests/projectdb.db',
        '--resultdb', 'sqlalchemy+sqlite+resultdb:///data/tests/resultdb.db',
    ], None, obj=ObjectDict(testing_mode=True))
    self.ctx = run.cli.invoke(ctx)

    # Every component runs in its own thread; keep handles for teardown.
    self.threads = []

    ctx = run.scheduler.make_context('scheduler', [], self.ctx)
    self.scheduler = scheduler = run.scheduler.invoke(ctx)
    self.threads.append(run_in_thread(scheduler.xmlrpc_run))
    self.threads.append(run_in_thread(scheduler.run))

    # Fetcher exposes its own XML-RPC endpoint on port 24444.
    ctx = run.fetcher.make_context('fetcher', [
        '--xmlrpc', '--xmlrpc-port', '24444',
    ], self.ctx)
    fetcher = run.fetcher.invoke(ctx)
    self.threads.append(run_in_thread(fetcher.xmlrpc_run))
    self.threads.append(run_in_thread(fetcher.run))

    ctx = run.processor.make_context('processor', [], self.ctx)
    processor = run.processor.invoke(ctx)
    self.threads.append(run_in_thread(processor.run))

    ctx = run.result_worker.make_context('result_worker', [], self.ctx)
    result_worker = run.result_worker.invoke(ctx)
    self.threads.append(run_in_thread(result_worker.run))

    # The webui talks to the scheduler over RPC; tests drive it through
    # Flask's test client rather than a real HTTP server.
    ctx = run.webui.make_context(
        'webui', ['--scheduler-rpc', 'http://localhost:23333/'], self.ctx)
    app = run.webui.invoke(ctx)
    app.debug = True
    self.app = app.test_client()
    self.rpc = app.config['scheduler_rpc']

    # Give the background threads a moment to come up before tests run.
    time.sleep(1)
def test_30_cli_command_line(self):
    """A database URL passed on the command line overrides the default."""
    # Point projectdb at a MongoDB port where nothing is listening.
    args = ['--projectdb', 'mongodb+projectdb://localhost:23456/projectdb']
    context = run.cli.make_context(
        'test', args, None, obj=ObjectDict(testing_mode=True))
    context = run.cli.invoke(context)

    # The connection is lazy: touching the property triggers the
    # (failing) connect attempt.
    from pymongo.errors import ConnectionFailure
    with self.assertRaises(ConnectionFailure):
        context.obj.projectdb
def test_40_cli_env(self):
    """A database URL can also be supplied via an environment variable."""
    try:
        os.environ['RESULTDB'] = 'sqlite+resultdb://'
        context = run.cli.make_context(
            'test', [], None, obj=ObjectDict(testing_mode=True))
        context = run.cli.invoke(context)

        # The env var should have selected the SQLite resultdb backend.
        from pyspider.database.sqlite import resultdb
        self.assertIsInstance(context.obj.resultdb, resultdb.ResultDB)
    finally:
        # Never leak the env var into other tests.
        del os.environ['RESULTDB']
def test_10_cli(self):
    """With no arguments the CLI wires up default databases and queues."""
    context = run.cli.make_context(
        'test', [], None, obj=ObjectDict(testing_mode=True))
    context = run.cli.invoke(context)

    self.assertEqual(context.obj.debug, False)
    # Every database handle and every inter-component queue must exist.
    for attr in ('taskdb', 'projectdb', 'resultdb',
                 'newtask_queue', 'status_queue', 'scheduler2fetcher',
                 'fetcher2processor', 'processor2result'):
        self.assertIsNotNone(getattr(context.obj, attr))
    # No components have been started yet.
    self.assertEqual(len(context.obj.instances), 0)
def test_80_docker_phantomjs(self):
    """PHANTOMJS_* docker-link variables configure the phantomjs proxy.

    The CLI strips the ``tcp://`` scheme prefix from ``PHANTOMJS_PORT``
    and keeps the remaining ``host:port`` as ``phantomjs_proxy``.
    """
    try:
        os.environ['PHANTOMJS_NAME'] = 'phantomjs'
        # Fixed: was 'tpc://...', a typo for docker's 'tcp://' scheme.
        # It went unnoticed because the CLI strips the prefix by length,
        # but the test should exercise the real docker-link format.
        os.environ['PHANTOMJS_PORT'] = 'tcp://binux:25678'
        ctx = run.cli.make_context(
            'test', [], None, obj=ObjectDict(testing_mode=True))
        ctx = run.cli.invoke(ctx)
        self.assertEqual(ctx.obj.phantomjs_proxy, 'binux:25678')
    # Dropped the former `except Exception as e: self.assertIsNone(e)`:
    # it converted any error (including the assertion itself) into an
    # uninformative failure and discarded the traceback. Letting the
    # exception propagate reports the real error.
    finally:
        del os.environ['PHANTOMJS_NAME']
        del os.environ['PHANTOMJS_PORT']
def test_70_docker_mysql(self):
    """MYSQL_* docker-link env variables select MySQL-backed databases."""
    try:
        os.environ['MYSQL_NAME'] = 'mysql'
        os.environ['MYSQL_PORT_3306_TCP_ADDR'] = 'localhost'
        os.environ['MYSQL_PORT_3306_TCP_PORT'] = '3306'
        context = run.cli.invoke(run.cli.make_context(
            'test', [], None, obj=ObjectDict(testing_mode=True)))
        # Touch the lazy handle so the MySQL connection is attempted.
        context.obj.resultdb
    except Exception as e:
        self.assertIsNone(e)
    finally:
        # Clean the env up whatever happened above.
        del os.environ['MYSQL_NAME']
        del os.environ['MYSQL_PORT_3306_TCP_ADDR']
        del os.environ['MYSQL_PORT_3306_TCP_PORT']
def test_60_docker_mongodb(self):
    """MONGODB_* docker-link env variables select MongoDB-backed databases."""
    try:
        os.environ['MONGODB_NAME'] = 'mongodb'
        os.environ['MONGODB_PORT_27017_TCP_ADDR'] = 'localhost'
        os.environ['MONGODB_PORT_27017_TCP_PORT'] = '27017'
        context = run.cli.invoke(run.cli.make_context(
            'test', [], None, obj=ObjectDict(testing_mode=True)))
        # Touch the lazy handle so the MongoDB connection is attempted.
        context.obj.resultdb
    except Exception as e:
        self.assertIsNone(e)
    finally:
        # Clean the env up whatever happened above.
        del os.environ['MONGODB_NAME']
        del os.environ['MONGODB_PORT_27017_TCP_ADDR']
        del os.environ['MONGODB_PORT_27017_TCP_PORT']
def test_50_docker_rabbitmq(self):
    """RABBITMQ_* docker-link env variables route queues through RabbitMQ."""
    try:
        os.environ['RABBITMQ_NAME'] = 'rabbitmq'
        os.environ['RABBITMQ_PORT_5672_TCP_ADDR'] = 'localhost'
        os.environ['RABBITMQ_PORT_5672_TCP_PORT'] = '5672'
        context = run.cli.invoke(run.cli.make_context(
            'test', [], None, obj=ObjectDict(testing_mode=True)))
        # Push one item through, then remove the queue from the broker.
        task_queue = context.obj.newtask_queue
        task_queue.put('abc')
        task_queue.delete()
    except Exception as e:
        self.assertIsNone(e)
    finally:
        # Clean the env up whatever happened above.
        del os.environ['RABBITMQ_NAME']
        del os.environ['RABBITMQ_PORT_5672_TCP_ADDR']
        del os.environ['RABBITMQ_PORT_5672_TCP_PORT']
def test_90_docker_scheduler(self):
    """SCHEDULER_* docker-link variables point the webui at the scheduler RPC.

    The webui reads ``SCHEDULER_PORT_23333_TCP`` and uses the
    ``host:port`` part (after the ``tcp://`` scheme) as the RPC host.
    """
    try:
        os.environ['SCHEDULER_NAME'] = 'scheduler'
        # Fixed: was 'tpc://...', a typo for docker's 'tcp://' scheme.
        # Same length, so the scheme-stripping behavior is unchanged,
        # but the test now uses the real docker-link format.
        os.environ['SCHEDULER_PORT_23333_TCP'] = 'tcp://binux:25678'
        ctx = run.cli.make_context(
            'test', [], None, obj=ObjectDict(testing_mode=True))
        ctx = run.cli.invoke(ctx)
        webui = run.cli.get_command(ctx, 'webui')
        webui_ctx = webui.make_context('webui', [], ctx)
        app = webui.invoke(webui_ctx)
        rpc = app.config['scheduler_rpc']
        self.assertEqual(rpc._ServerProxy__host, 'binux:25678')
    # Dropped the former `except Exception as e: self.assertIsNone(e)`:
    # it also caught the AssertionError from assertEqual above and
    # re-reported it as an uninformative assertIsNone failure, masking
    # the real comparison message and the traceback.
    finally:
        del os.environ['SCHEDULER_NAME']
        del os.environ['SCHEDULER_PORT_23333_TCP']
def setUpClass(self):
    """Start a minimal pyspider stack (scheduler, fetcher, processor,
    result worker, webui) in background threads, backed by fresh SQLite
    databases under ./data/tests.
    """
    # Start from a clean data directory so previous runs can't interfere.
    shutil.rmtree('./data/tests', ignore_errors=True)
    os.makedirs('./data/tests')

    # Build the shared CLI context with SQLite databases.
    ctx = run.cli.make_context('test', [
        '--taskdb', 'sqlite+taskdb:///data/tests/task.db',
        '--projectdb', 'sqlite+projectdb:///data/tests/projectdb.db',
        '--resultdb', 'sqlite+resultdb:///data/tests/resultdb.db',
    ], None, obj=ObjectDict(testing_mode=True))
    self.ctx = run.cli.invoke(ctx)

    # Scheduler: both its XML-RPC endpoint and its main loop run in
    # daemon threads (thread handles are not kept here).
    ctx = run.scheduler.make_context('scheduler', [], self.ctx)
    scheduler = run.scheduler.invoke(ctx)
    run_in_thread(scheduler.xmlrpc_run)
    run_in_thread(scheduler.run)

    ctx = run.fetcher.make_context('fetcher', [], self.ctx)
    fetcher = run.fetcher.invoke(ctx)
    run_in_thread(fetcher.run)

    ctx = run.processor.make_context('processor', [], self.ctx)
    processor = run.processor.invoke(ctx)
    run_in_thread(processor.run)

    ctx = run.result_worker.make_context('result_worker', [], self.ctx)
    result_worker = run.result_worker.invoke(ctx)
    run_in_thread(result_worker.run)

    # The webui talks to the scheduler over RPC; tests drive it through
    # Flask's test client rather than a real HTTP server.
    ctx = run.webui.make_context(
        'webui', ['--scheduler-rpc', 'http://localhost:23333/'], self.ctx)
    app = run.webui.invoke(ctx)
    app.debug = True
    self.app = app.test_client()
    self.rpc = app.config['scheduler_rpc']

    # Give the background threads a moment to come up before tests run.
    time.sleep(1)
def test_20_cli_config(self):
    """Settings can be loaded from a JSON file via --config."""
    config = {
        'debug': True,
        'taskdb': 'mysql+taskdb://localhost:23456/taskdb',
        'amqp-url': 'amqp://*****:*****@localhost:23456/%%2F'
    }
    with open('./data/tests/config.json', 'w') as fp:
        json.dump(config, fp)

    context = run.cli.make_context(
        'test', ['--config', './data/tests/config.json'],
        None, obj=ObjectDict(testing_mode=True))
    context = run.cli.invoke(context)

    self.assertEqual(context.obj.debug, True)

    # Both URLs point at a closed port, so the lazy connections must fail.
    import mysql.connector
    with self.assertRaises(mysql.connector.InterfaceError):
        context.obj.taskdb
    with self.assertRaisesRegexp(Exception, 'Connection refused'):
        context.obj.newtask_queue
def cli(ctx, **kwargs):
    """
    A powerful spider system in python.
    """
    logging.config.fileConfig(os.path.join(os.path.dirname(__file__), "logging.conf"))

    # get db from env
    # Each database is wrapped in Get(...) — presumably a lazy accessor
    # so the connection is only opened on first use (TODO confirm
    # against the Get definition elsewhere in this module).
    for db in ('taskdb', 'projectdb', 'resultdb'):
        # Explicit command-line/config value wins over the environment.
        if kwargs[db] is not None:
            continue
        # Docker-link env vars (MYSQL_* / MONGODB_*) select the backend.
        if os.environ.get('MYSQL_NAME'):
            kwargs[db] = Get(lambda db=db: connect_database('mysql+%s://%s:%s/%s' % (
                db, os.environ['MYSQL_PORT_3306_TCP_ADDR'],
                os.environ['MYSQL_PORT_3306_TCP_PORT'], db)))
        elif os.environ.get('MONGODB_NAME'):
            kwargs[db] = Get(lambda db=db: connect_database('mongodb+%s://%s:%s/%s' % (
                db, os.environ['MONGODB_PORT_27017_TCP_ADDR'],
                os.environ['MONGODB_PORT_27017_TCP_PORT'], db)))
        elif ctx.invoked_subcommand == 'bench':
            # Benchmarks get a scratch data directory of their own.
            if kwargs['data_path'] == './data':
                kwargs['data_path'] += '/bench'
                shutil.rmtree(kwargs['data_path'], ignore_errors=True)
                os.mkdir(kwargs['data_path'])
            if db in ('taskdb', 'resultdb'):
                # In-memory SQLite (no path) for the hot databases.
                kwargs[db] = Get(lambda db=db: connect_database('sqlite+%s://' % (db)))
            else:
                kwargs[db] = Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (
                    db, kwargs['data_path'], db[:-2])))
        else:
            # Default: file-backed SQLite under data_path, e.g. task.db
            # (db[:-2] strips the trailing 'db' from the option name).
            if not os.path.exists(kwargs['data_path']):
                os.mkdir(kwargs['data_path'])
            kwargs[db] = Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (
                db, kwargs['data_path'], db[:-2])))

    # queue
    # Inter-component queues: RabbitMQ if an AMQP URL (or docker-link
    # env) is given, otherwise plain multiprocessing queues.
    if kwargs.get('amqp_url'):
        from pyspider.libs.rabbitmq import Queue
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = Get(lambda name=name: Queue(name, amqp_url=kwargs['amqp_url'],
                                                       maxsize=kwargs['queue_maxsize']))
    elif os.environ.get('RABBITMQ_NAME'):
        from pyspider.libs.rabbitmq import Queue
        # Build the broker URL from the docker-link RABBITMQ_* env vars.
        amqp_url = ("amqp://*****:*****@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                    ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = Get(lambda name=name: Queue(name, amqp_url=amqp_url,
                                                       maxsize=kwargs['queue_maxsize']))
    else:
        from multiprocessing import Queue
        for name in ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                     'fetcher2processor', 'processor2result'):
            kwargs[name] = Queue(kwargs['queue_maxsize'])

    # phantomjs-proxy
    if kwargs.get('phantomjs_proxy'):
        pass
    elif os.environ.get('PHANTOMJS_NAME'):
        # Docker-link value looks like 'tcp://host:port'; keep host:port.
        kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT'][len('tcp://'):]

    # Expose everything resolved above on the click context object.
    ctx.obj = ObjectDict(ctx.obj or {})
    ctx.obj['instances'] = []
    ctx.obj.update(kwargs)

    # Bare `pyspider` invocation runs everything — presumably `all` is
    # the subcommand defined elsewhere in this module (it shadows the
    # builtin here).
    if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'):
        ctx.invoke(all)
    return ctx
def main():
    """Console-script entry point: run the CLI with a fresh context object."""
    cli(obj=ObjectDict(), default_map={})