Exemplo n.º 1
0
    def setUpClass(self):
        # Bring up a complete in-process stack (httpbin, scheduler, fetcher,
        # processor, result worker, webui) backed by sqlalchemy+sqlite files
        # under ./data/tests.  The start-up order below is significant.
        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        # `tests.data_test_webpage` and `bench_test` are not referenced below;
        # presumably they are imported for their import-time effects.
        import tests.data_test_webpage
        import httpbin
        from pyspider.webui import bench_test  # flake8: noqa
        self.httpbin_thread = utils.run_in_subprocess(
            httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'

        database_args = [
            '--taskdb',
            'sqlalchemy+sqlite+taskdb:///data/tests/task.db',
            '--projectdb',
            'sqlalchemy+sqlite+projectdb:///data/tests/projectdb.db',
            '--resultdb',
            'sqlalchemy+sqlite+resultdb:///data/tests/resultdb.db',
        ]
        root_ctx = run.cli.make_context(
            'test', database_args, None, obj=ObjectDict(testing_mode=True))
        self.ctx = run.cli.invoke(root_ctx)

        # Every background thread is recorded here as soon as it is started.
        self.threads = []

        scheduler_ctx = run.scheduler.make_context('scheduler', [], self.ctx)
        scheduler = run.scheduler.invoke(scheduler_ctx)
        self.scheduler = scheduler
        self.threads.append(run_in_thread(scheduler.xmlrpc_run))
        self.threads.append(run_in_thread(scheduler.run))

        fetcher_args = ['--xmlrpc', '--xmlrpc-port', '24444']
        fetcher_ctx = run.fetcher.make_context('fetcher', fetcher_args, self.ctx)
        fetcher = run.fetcher.invoke(fetcher_ctx)
        self.threads.append(run_in_thread(fetcher.xmlrpc_run))
        self.threads.append(run_in_thread(fetcher.run))

        processor_ctx = run.processor.make_context('processor', [], self.ctx)
        processor = run.processor.invoke(processor_ctx)
        self.threads.append(run_in_thread(processor.run))

        worker_ctx = run.result_worker.make_context(
            'result_worker', [], self.ctx)
        result_worker = run.result_worker.invoke(worker_ctx)
        self.threads.append(run_in_thread(result_worker.run))

        # The webui is pointed at http://localhost:23333/ as its scheduler RPC
        # endpoint; only its test client and RPC proxy are kept.
        webui_ctx = run.webui.make_context(
            'webui', ['--scheduler-rpc', 'http://localhost:23333/'], self.ctx)
        webui_app = run.webui.invoke(webui_ctx)
        webui_app.debug = True
        self.app = webui_app.test_client()
        self.rpc = webui_app.config['scheduler_rpc']

        # Short pause after launching the threads (presumably to let the
        # components finish starting before the first test runs).
        time.sleep(1)
Exemplo n.º 2
0
    def test_30_cli_command_line(self):
        # A --projectdb URL given on the command line is accepted by the cli
        # invocation itself; the ConnectionFailure is expected only once the
        # `projectdb` attribute is actually read.
        argv = ['--projectdb', 'mongodb+projectdb://localhost:23456/projectdb']
        cli_ctx = run.cli.make_context(
            'test', argv, None, obj=ObjectDict(testing_mode=True))
        invoked_ctx = run.cli.invoke(cli_ctx)

        from pymongo.errors import ConnectionFailure
        self.assertRaises(
            ConnectionFailure, getattr, invoked_ctx.obj, 'projectdb')
Exemplo n.º 3
0
    def test_40_cli_env(self):
        # With RESULTDB set in the environment and no arguments, the resulting
        # `resultdb` is expected to be the sqlite ResultDB implementation.
        try:
            os.environ['RESULTDB'] = 'sqlite+resultdb://'
            env_ctx = run.cli.invoke(
                run.cli.make_context(
                    'test', [], None, obj=ObjectDict(testing_mode=True)))

            from pyspider.database.sqlite import resultdb as sqlite_resultdb
            self.assertIsInstance(
                env_ctx.obj.resultdb, sqlite_resultdb.ResultDB)
        finally:
            # Remove the variable again whatever the outcome above.
            del os.environ['RESULTDB']
Exemplo n.º 4
0
 def test_10_cli(self):
     # Invoking the cli with no arguments should leave debug off, populate all
     # databases and queues, and register no component instances.
     bare_ctx = run.cli.invoke(
         run.cli.make_context(
             'test', [], None, obj=ObjectDict(testing_mode=True)))
     settings = bare_ctx.obj
     self.assertEqual(settings.debug, False)

     # Databases first, then queues -- the same order the checks always ran in.
     populated_attrs = (
         'taskdb', 'projectdb', 'resultdb',
         'newtask_queue', 'status_queue', 'scheduler2fetcher',
         'fetcher2processor', 'processor2result',
     )
     for attr in populated_attrs:
         self.assertIsNotNone(getattr(settings, attr))

     self.assertEqual(len(settings.instances), 0)
Exemplo n.º 5
0
 def test_80_docker_phantomjs(self):
     # With PHANTOMJS_NAME / PHANTOMJS_PORT set (docker-link style variables),
     # the cli is expected to expose the host:port part as `phantomjs_proxy`.
     #
     # Exceptions are deliberately NOT caught here: letting them propagate makes
     # the runner report the real traceback instead of a secondary
     # "<exception> is not None" assertion failure.
     try:
         os.environ['PHANTOMJS_NAME'] = 'phantomjs'
         os.environ['PHANTOMJS_PORT'] = 'tcp://binux:25678'
         ctx = run.cli.make_context('test', [],
                                    None,
                                    obj=ObjectDict(testing_mode=True))
         ctx = run.cli.invoke(ctx)
         self.assertEqual(ctx.obj.phantomjs_proxy, 'binux:25678')
     finally:
         # pop() with a default cannot raise, so cleanup never masks the
         # original error even if a variable is already gone.
         os.environ.pop('PHANTOMJS_NAME', None)
         os.environ.pop('PHANTOMJS_PORT', None)
Exemplo n.º 6
0
 def test_70_docker_mysql(self):
     # With the MYSQL_* docker-link style variables set, reading `resultdb`
     # from the cli context must succeed (presumably this needs a MySQL server
     # reachable at localhost:3306 -- confirm against the CI setup).
     #
     # Exceptions are deliberately NOT caught here: letting them propagate makes
     # the runner report the real traceback instead of a secondary
     # "<exception> is not None" assertion failure.
     try:
         os.environ['MYSQL_NAME'] = 'mysql'
         os.environ['MYSQL_PORT_3306_TCP_ADDR'] = 'localhost'
         os.environ['MYSQL_PORT_3306_TCP_PORT'] = '3306'
         ctx = run.cli.make_context('test', [],
                                    None,
                                    obj=ObjectDict(testing_mode=True))
         ctx = run.cli.invoke(ctx)
         # Attribute access is the operation under test.
         ctx.obj.resultdb
     finally:
         # pop() with a default cannot raise, so cleanup never masks the
         # original error even if a variable is already gone.
         os.environ.pop('MYSQL_NAME', None)
         os.environ.pop('MYSQL_PORT_3306_TCP_ADDR', None)
         os.environ.pop('MYSQL_PORT_3306_TCP_PORT', None)
Exemplo n.º 7
0
 def test_60_docker_mongodb(self):
     # With the MONGODB_* docker-link style variables set, reading `resultdb`
     # from the cli context must succeed (presumably this needs a MongoDB
     # server reachable at localhost:27017 -- confirm against the CI setup).
     #
     # Exceptions are deliberately NOT caught here: letting them propagate makes
     # the runner report the real traceback instead of a secondary
     # "<exception> is not None" assertion failure.
     try:
         os.environ['MONGODB_NAME'] = 'mongodb'
         os.environ['MONGODB_PORT_27017_TCP_ADDR'] = 'localhost'
         os.environ['MONGODB_PORT_27017_TCP_PORT'] = '27017'
         ctx = run.cli.make_context('test', [],
                                    None,
                                    obj=ObjectDict(testing_mode=True))
         ctx = run.cli.invoke(ctx)
         # Attribute access is the operation under test.
         ctx.obj.resultdb
     finally:
         # pop() with a default cannot raise, so cleanup never masks the
         # original error even if a variable is already gone.
         os.environ.pop('MONGODB_NAME', None)
         os.environ.pop('MONGODB_PORT_27017_TCP_ADDR', None)
         os.environ.pop('MONGODB_PORT_27017_TCP_PORT', None)
Exemplo n.º 8
0
 def test_50_docker_rabbitmq(self):
     # With the RABBITMQ_* docker-link style variables set, the queue obtained
     # from the cli context must accept a put() and a delete() (presumably this
     # needs a RabbitMQ broker at localhost:5672 -- confirm against CI setup).
     #
     # Exceptions are deliberately NOT caught here: letting them propagate makes
     # the runner report the real traceback instead of a secondary
     # "<exception> is not None" assertion failure.
     try:
         os.environ['RABBITMQ_NAME'] = 'rabbitmq'
         os.environ['RABBITMQ_PORT_5672_TCP_ADDR'] = 'localhost'
         os.environ['RABBITMQ_PORT_5672_TCP_PORT'] = '5672'
         ctx = run.cli.make_context('test', [],
                                    None,
                                    obj=ObjectDict(testing_mode=True))
         ctx = run.cli.invoke(ctx)
         queue = ctx.obj.newtask_queue
         queue.put('abc')
         queue.delete()
     finally:
         # pop() with a default cannot raise, so cleanup never masks the
         # original error even if a variable is already gone.
         os.environ.pop('RABBITMQ_NAME', None)
         os.environ.pop('RABBITMQ_PORT_5672_TCP_ADDR', None)
         os.environ.pop('RABBITMQ_PORT_5672_TCP_PORT', None)
Exemplo n.º 9
0
 def test_90_docker_scheduler(self):
     # With the SCHEDULER_* docker-link style variables set, the webui's
     # scheduler RPC proxy is expected to target the host:port taken from
     # SCHEDULER_PORT_23333_TCP.
     #
     # Exceptions are deliberately NOT caught here: letting them propagate makes
     # the runner report the real traceback instead of a secondary
     # "<exception> is not None" assertion failure.
     try:
         os.environ['SCHEDULER_NAME'] = 'scheduler'
         os.environ['SCHEDULER_PORT_23333_TCP'] = 'tcp://binux:25678'
         ctx = run.cli.make_context('test', [],
                                    None,
                                    obj=ObjectDict(testing_mode=True))
         ctx = run.cli.invoke(ctx)
         webui = run.cli.get_command(ctx, 'webui')
         webui_ctx = webui.make_context('webui', [], ctx)
         app = webui.invoke(webui_ctx)
         rpc = app.config['scheduler_rpc']
         # Name-mangled private attribute of the RPC proxy holding its host.
         self.assertEqual(rpc._ServerProxy__host, 'binux:25678')
     finally:
         # pop() with a default cannot raise, so cleanup never masks the
         # original error even if a variable is already gone.
         os.environ.pop('SCHEDULER_NAME', None)
         os.environ.pop('SCHEDULER_PORT_23333_TCP', None)
Exemplo n.º 10
0
    def setUpClass(self):
        # Bring up scheduler, fetcher, processor, result worker and webui in
        # one process, all sharing sqlite files under ./data/tests.
        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        base_ctx = run.cli.make_context(
            'test',
            [
                '--taskdb', 'sqlite+taskdb:///data/tests/task.db',
                '--projectdb', 'sqlite+projectdb:///data/tests/projectdb.db',
                '--resultdb', 'sqlite+resultdb:///data/tests/resultdb.db',
            ],
            None,
            obj=ObjectDict(testing_mode=True))
        self.ctx = run.cli.invoke(base_ctx)

        # The scheduler is started first and additionally serves XML-RPC.
        scheduler = run.scheduler.invoke(
            run.scheduler.make_context('scheduler', [], self.ctx))
        run_in_thread(scheduler.xmlrpc_run)
        run_in_thread(scheduler.run)

        # The remaining workers follow one pattern: build the sub-command
        # context without extra arguments, invoke it, run the result's `run`
        # in a background thread.  Order matches the pipeline direction.
        for component_name in ('fetcher', 'processor', 'result_worker'):
            command = getattr(run, component_name)
            component = command.invoke(
                command.make_context(component_name, [], self.ctx))
            run_in_thread(component.run)

        # The webui is not served; only its test client and RPC proxy are kept.
        webui_app = run.webui.invoke(
            run.webui.make_context(
                'webui',
                ['--scheduler-rpc', 'http://localhost:23333/'],
                self.ctx))
        webui_app.debug = True
        self.app = webui_app.test_client()
        self.rpc = webui_app.config['scheduler_rpc']

        # Short pause after launching the threads (presumably to let the
        # components finish starting before the first test runs).
        time.sleep(1)
Exemplo n.º 11
0
    def test_20_cli_config(self):
        # Settings supplied through a --config JSON file must reach the cli
        # context: `debug` becomes True, and the database / queue URLs (which
        # point at port 23456) fail only when the attributes are read.
        with open('./data/tests/config.json', 'w') as fp:
            json.dump(
                {
                    'debug': True,
                    'taskdb': 'mysql+taskdb://localhost:23456/taskdb',
                    'amqp-url': 'amqp://*****:*****@localhost:23456/%%2F'
                }, fp)
        ctx = run.cli.make_context('test',
                                   ['--config', './data/tests/config.json'],
                                   None,
                                   obj=ObjectDict(testing_mode=True))
        ctx = run.cli.invoke(ctx)
        self.assertEqual(ctx.obj.debug, True)

        import mysql.connector
        with self.assertRaises(mysql.connector.InterfaceError):
            ctx.obj.taskdb

        # `assertRaisesRegexp` is the legacy spelling and was removed in
        # Python 3.12; `assertRaisesRegex` does not exist on Python 2.7.  Pick
        # whichever this interpreter provides so the test runs on both.
        assert_raises_regex = (getattr(self, 'assertRaisesRegex', None)
                               or self.assertRaisesRegexp)
        with assert_raises_regex(Exception, 'Connection refused'):
            ctx.obj.newtask_queue
Exemplo n.º 12
0
def cli(ctx, **kwargs):
    """
    A powerful spider system in python.
    """
    # NOTE: the docstring above is left verbatim on purpose -- this looks like a
    # CLI group callback (its decorators are not visible here), in which case
    # the docstring is presumably shown as the command's help text.
    #
    # What this callback does:
    #   * loads the logging configuration shipped next to this module;
    #   * fills in any database option that is still None, from the docker-link
    #     style MYSQL_* / MONGODB_* variables or with sqlite defaults;
    #   * creates the five inter-component queues (RabbitMQ when an AMQP URL or
    #     RABBITMQ_* variables are present, multiprocessing queues otherwise);
    #   * derives `phantomjs_proxy` from PHANTOMJS_PORT when not given;
    #   * stores everything on `ctx.obj` and returns `ctx`.
    logging.config.fileConfig(os.path.join(os.path.dirname(__file__), "logging.conf"))

    # get db from env
    for db in ('taskdb', 'projectdb', 'resultdb'):
        if kwargs[db] is not None:
            continue
        # `db=db` binds the current name as a default argument so each lazily
        # evaluated lambda keeps its own database name.
        if os.environ.get('MYSQL_NAME'):
            kwargs[db] = Get(lambda db=db: connect_database('mysql+%s://%s:%s/%s' % (
                db, os.environ['MYSQL_PORT_3306_TCP_ADDR'],
                os.environ['MYSQL_PORT_3306_TCP_PORT'], db)))
        elif os.environ.get('MONGODB_NAME'):
            kwargs[db] = Get(lambda db=db: connect_database('mongodb+%s://%s:%s/%s' % (
                db, os.environ['MONGODB_PORT_27017_TCP_ADDR'],
                os.environ['MONGODB_PORT_27017_TCP_PORT'], db)))
        elif ctx.invoked_subcommand == 'bench':
            # Only the default data path is redirected (and wiped); after the
            # first pass the path no longer equals './data', so this runs once.
            if kwargs['data_path'] == './data':
                kwargs['data_path'] += '/bench'
                shutil.rmtree(kwargs['data_path'], ignore_errors=True)
                # makedirs (not mkdir): './data' itself may not exist yet.
                os.makedirs(kwargs['data_path'])
            if db in ('taskdb', 'resultdb'):
                kwargs[db] = Get(lambda db=db: connect_database('sqlite+%s://' % (db)))
            else:
                kwargs[db] = Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (
                    db, kwargs['data_path'], db[:-2])))
        else:
            if not os.path.exists(kwargs['data_path']):
                # makedirs (not mkdir) so a nested data path can be created.
                os.makedirs(kwargs['data_path'])
            kwargs[db] = Get(lambda db=db: connect_database('sqlite+%s:///%s/%s.db' % (
                db, kwargs['data_path'], db[:-2])))

    # queue
    queue_names = ('newtask_queue', 'status_queue', 'scheduler2fetcher',
                   'fetcher2processor', 'processor2result')
    if kwargs.get('amqp_url'):
        from pyspider.libs.rabbitmq import Queue
        for name in queue_names:
            kwargs[name] = Get(lambda name=name: Queue(name, amqp_url=kwargs['amqp_url'],
                                                       maxsize=kwargs['queue_maxsize']))
    elif os.environ.get('RABBITMQ_NAME'):
        from pyspider.libs.rabbitmq import Queue
        amqp_url = ("amqp://*****:*****@%(RABBITMQ_PORT_5672_TCP_ADDR)s"
                    ":%(RABBITMQ_PORT_5672_TCP_PORT)s/%%2F" % os.environ)
        for name in queue_names:
            kwargs[name] = Get(lambda name=name: Queue(name, amqp_url=amqp_url,
                                                       maxsize=kwargs['queue_maxsize']))
    else:
        from multiprocessing import Queue
        for name in queue_names:
            kwargs[name] = Queue(kwargs['queue_maxsize'])

    # phantomjs-proxy
    # An explicit option wins.  Otherwise the first len('tcp://') characters of
    # PHANTOMJS_PORT are dropped without checking what they actually are.
    if not kwargs.get('phantomjs_proxy') and os.environ.get('PHANTOMJS_NAME'):
        kwargs['phantomjs_proxy'] = os.environ['PHANTOMJS_PORT'][len('tcp://'):]

    ctx.obj = ObjectDict(ctx.obj or {})
    ctx.obj['instances'] = []
    ctx.obj.update(kwargs)

    # With no sub-command given (and outside testing mode) run the `all` command.
    if ctx.invoked_subcommand is None and not ctx.obj.get('testing_mode'):
        ctx.invoke(all)
    return ctx
Exemplo n.º 13
0
def main():
    """Invoke ``cli`` with a fresh ``ObjectDict`` as ``obj`` and an empty ``default_map``."""
    entry_options = {'obj': ObjectDict(), 'default_map': {}}
    cli(**entry_options)