Example #1
    def setUpClass(self):
        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'

        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = '127.0.0.1:25555'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux',
                                              '--password=123456', '--port=14830',
                                              '--debug'], close_fds=True)
        self.proxy = '127.0.0.1:14830'
        try:
            self.phantomjs = subprocess.Popen(['phantomjs',
                os.path.join(os.path.dirname(__file__),
                    '../pyspider/fetcher/phantomjs_fetcher.js'),
                '25555'])
        except OSError:
            self.phantomjs = None
        time.sleep(0.5)
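
The fixtures above (and most of the examples below) rely on two small helpers from pyspider.libs.utils: run_in_thread and run_in_subprocess. A minimal sketch of what they appear to do, assuming the usual daemon-thread / daemon-process wrappers rather than quoting the actual implementation:

import threading
import multiprocessing


def run_in_thread(func, *args, **kwargs):
    # Assumed behaviour: start func(*args, **kwargs) in a daemon thread
    # and return the Thread object so the caller can join() it later.
    thread = threading.Thread(target=func, args=args, kwargs=kwargs)
    thread.daemon = True
    thread.start()
    return thread


def run_in_subprocess(func, *args, **kwargs):
    # Assumed behaviour: same idea with a separate process, returning the
    # Process object so the caller can terminate() and join() it.
    process = multiprocessing.Process(target=func, args=args, kwargs=kwargs)
    process.daemon = True
    process.start()
    return process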
Example #2
    def setUpClass(self):
        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
Example #3
def scheduler(
    ctx, xmlrpc, xmlrpc_host, xmlrpc_port, inqueue_limit, delete_time, active_tasks, loop_limit, scheduler_cls
):
    g = ctx.obj
    Scheduler = load_cls(None, None, scheduler_cls)

    scheduler = Scheduler(
        taskdb=g.taskdb,
        projectdb=g.projectdb,
        resultdb=g.resultdb,
        newtask_queue=g.newtask_queue,
        status_queue=g.status_queue,
        out_queue=g.scheduler2fetcher,
        data_path=g.get("data_path", "data"),
    )
    scheduler.INQUEUE_LIMIT = inqueue_limit
    scheduler.DELETE_TIME = delete_time
    scheduler.ACTIVE_TASKS = active_tasks
    scheduler.LOOP_LIMIT = loop_limit

    g.instances.append(scheduler)
    if g.get("testing_mode"):
        return scheduler

    if xmlrpc:
        utils.run_in_thread(scheduler.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    scheduler.run()
Example #4
def bloomfilter(ctx, xmlrpc, xmlrpc_host, xmlrpc_port, key, capacity, error, redis):
    """
    Run bloomfilter, only one bloomfilter is allowed.
    """
    g = ctx.obj

    if os.name == 'nt':
        from pyspider.filter import BloomFilter
        bloomfilter = BloomFilter(key, capacity, error)
    else:
        from pyspider.filter import RedisBloomFilter
        from six.moves.urllib.parse import urlparse
        parsed = urlparse(redis)
        # ParseResult(scheme='', netloc='127.0.0.1:6379', path='/0', params='', query='', fragment='')
        bloomfilter = RedisBloomFilter(key, capacity, error,
            parsed.hostname, parsed.port, int(parsed.path.strip('/') or 0))

    g.instances.append(bloomfilter)
    if g.get('testing_mode'):
        return bloomfilter


    if xmlrpc:
        utils.run_in_thread(bloomfilter.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    bloomfilter.run()
Example #5
def scheduler(ctx, xmlrpc, xmlrpc_host, xmlrpc_port,
              inqueue_limit, delete_time, active_tasks, loop_limit, scheduler_cls,
              threads):
    """
    Run Scheduler, only one scheduler is allowed.
    """
    g = ctx.obj
    Scheduler = load_cls(None, None, scheduler_cls)

    kwargs = dict(taskdb=g.taskdb, projectdb=g.projectdb, resultdb=g.resultdb,
                  newtask_queue=g.newtask_queue, status_queue=g.status_queue,
                  out_queue=g.scheduler2fetcher, data_path=g.get('data_path', 'data'))
    if threads:
        kwargs['threads'] = int(threads)

    scheduler = Scheduler(**kwargs)
    scheduler.INQUEUE_LIMIT = inqueue_limit
    scheduler.DELETE_TIME = delete_time
    scheduler.ACTIVE_TASKS = active_tasks
    scheduler.LOOP_LIMIT = loop_limit

    g.instances.append(scheduler)
    if g.get('testing_mode'):
        return scheduler

    if xmlrpc:
        utils.run_in_thread(scheduler.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    scheduler.run()
Example #6
    def setUpClass(self):
        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887)
        self.httpbin = "http://127.0.0.1:14887"

        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = "127.0.0.1:25555"
        self.rpc = xmlrpc_client.ServerProxy("http://localhost:%d" % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        self.proxy_thread = subprocess.Popen(
            ["pyproxy", "--username=binux", "--password=123456", "--port=14830", "--debug"], close_fds=True
        )
        self.proxy = "127.0.0.1:14830"
        try:
            self.phantomjs = subprocess.Popen(
                [
                    "phantomjs",
                    os.path.join(os.path.dirname(__file__), "../pyspider/fetcher/phantomjs_fetcher.js"),
                    "25555",
                ]
            )
        except OSError:
            self.phantomjs = None
        time.sleep(0.5)
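
A fixture like the one above is typically exercised by handing a task dict to the fetcher and checking the returned result. A hypothetical test method as a sketch; Fetcher.sync_fetch exists in pyspider's tornado fetcher, but the exact task and result fields shown here are assumptions, not taken from the snippets on this page:

    def test_fetch_get(self):
        # Hypothetical task; only the base URL comes from the fixture above.
        task = {
            'taskid': 'test_fetch_get',
            'project': 'test',
            'url': self.httpbin + '/get',
            'fetch': {'method': 'GET'},
        }
        # sync_fetch performs the request directly, without going through
        # inqueue/outqueue or the XML-RPC server started in setUpClass.
        result = self.fetcher.sync_fetch(task)
        self.assertEqual(result['status_code'], 200)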
Example #7
def run_fetcher(g=g):
    from pyspider.fetcher.tornado_fetcher import Fetcher
    fetcher = Fetcher(inqueue=g.scheduler2fetcher, outqueue=g.fetcher2processor)
    fetcher.phantomjs_proxy = g.phantomjs_proxy

    run_in_thread(fetcher.xmlrpc_run, port=g.fetcher_xmlrpc_port, bind=g.webui_host)
    fetcher.run()
Example #8
    def setUpClass(self):
        shutil.rmtree("./data/tests", ignore_errors=True)
        os.makedirs("./data/tests")

        ctx = run.cli.make_context(
            "test",
            [
                "--taskdb",
                "sqlite+taskdb:///data/tests/task.db",
                "--projectdb",
                "sqlite+projectdb:///data/tests/projectdb.db",
                "--resultdb",
                "sqlite+resultdb:///data/tests/resultdb.db",
            ],
            None,
            obj=dict(testing_mode=True),
        )
        self.ctx = run.cli.invoke(ctx)

        ctx = run.scheduler.make_context("scheduler", [], self.ctx)
        scheduler = run.scheduler.invoke(ctx)
        utils.run_in_thread(scheduler.xmlrpc_run)
        utils.run_in_thread(scheduler.run)

        time.sleep(1)
Example #9
def fetcher(ctx, xmlrpc, xmlrpc_host, xmlrpc_port, poolsize, proxy, user_agent,
            timeout, phantomjs_endpoint, splash_endpoint, fetcher_cls,
            async_mode=True, get_object=False, no_input=False):
    """
    Run Fetcher.
    """
    g = ctx.obj
    Fetcher = load_cls(None, None, fetcher_cls)

    if no_input:
        inqueue = None
        outqueue = None
    else:
        inqueue = g.scheduler2fetcher
        outqueue = g.fetcher2processor
    fetcher = Fetcher(inqueue=inqueue, outqueue=outqueue,
                      poolsize=poolsize, proxy=proxy, async_mode=async_mode)
    fetcher.phantomjs_proxy = phantomjs_endpoint or g.phantomjs_proxy
    fetcher.splash_endpoint = splash_endpoint
    if user_agent:
        fetcher.user_agent = user_agent
    if timeout:
        fetcher.default_options = copy.deepcopy(fetcher.default_options)
        fetcher.default_options['timeout'] = timeout

    g.instances.append(fetcher)
    if g.get('testing_mode') or get_object:
        return fetcher

    if xmlrpc:
        utils.run_in_thread(fetcher.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    fetcher.run()
Example #10
def run_scheduler(g=g):
    from pyspider.scheduler import Scheduler
    scheduler = Scheduler(taskdb=g.taskdb, projectdb=g.projectdb, resultdb=g.resultdb,
            newtask_queue=g.newtask_queue, status_queue=g.status_queue,
            out_queue=g.scheduler2fetcher)
    g.scheduler = scheduler
    run_in_thread(scheduler.xmlrpc_run)
    scheduler.run()
Example #11
def fetcher(ctx, xmlrpc, xmlrpc_host, xmlrpc_port):
    g = ctx.obj
    from pyspider.fetcher.tornado_fetcher import Fetcher
    fetcher = Fetcher(inqueue=g.scheduler2fetcher, outqueue=g.fetcher2processor)
    fetcher.phantomjs_proxy = g.phantomjs_proxy
    g.instances.append(fetcher)

    if xmlrpc:
        run_in_thread(fetcher.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    fetcher.run()
Example #12
def run_scheduler(g=g):
    from pyspider.scheduler import Scheduler
    scheduler = Scheduler(taskdb=g.taskdb, projectdb=g.projectdb, resultdb=g.resultdb,
            newtask_queue=g.newtask_queue, status_queue=g.status_queue,
            out_queue=g.scheduler2fetcher)
    if g.demo_mode:
        scheduler.INQUEUE_LIMIT = 1000

    run_in_thread(scheduler.xmlrpc_run, port=g.scheduler_xmlrpc_port, bind=g.webui_host)
    scheduler.run()
Example #13
    def test_40_multiple_threading_error(self):
        def put(q):
            for i in range(100):
                q.put("DATA_%d" % i)

        def get(q):
            for i in range(100):
                q.get()

        utils.run_in_thread(put, self.q3)
        get(self.q3)
Example #14
    def setUpClass(self):
        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = 'localhost:25555'
        self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        self.phantomjs = subprocess.Popen(['phantomjs',
            os.path.join(os.path.dirname(__file__),
                '../pyspider/fetcher/phantomjs_fetcher.js'),
            '25555'])
Example #15
    def run_scheduler():
        scheduler = Scheduler(taskdb=get_taskdb(), projectdb=get_projectdb(),
                              newtask_queue=self.newtask_queue, status_queue=self.status_queue,
                              out_queue=self.scheduler2fetcher, data_path="./data/tests/",
                              resultdb=get_resultdb())
        scheduler.UPDATE_PROJECT_INTERVAL = 0.1
        scheduler.LOOP_INTERVAL = 0.1
        scheduler.INQUEUE_LIMIT = 10
        Scheduler.DELETE_TIME = 0
        scheduler._last_tick = int(time.time())  # not dispatch cronjob
        run_in_thread(scheduler.xmlrpc_run, port=self.scheduler_xmlrpc_port)
        scheduler.run()
Example #16
    def setUpClass(self):
        shutil.rmtree("./data/tests", ignore_errors=True)
        os.makedirs("./data/tests")

        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887)
        self.httpbin = "http://127.0.0.1:14887"

        ctx = run.cli.make_context(
            "test",
            [
                "--taskdb",
                "sqlite+taskdb:///data/tests/task.db",
                "--projectdb",
                "sqlite+projectdb:///data/tests/projectdb.db",
                "--resultdb",
                "sqlite+resultdb:///data/tests/resultdb.db",
            ],
            None,
            obj=ObjectDict(testing_mode=True),
        )
        self.ctx = run.cli.invoke(ctx)

        ctx = run.scheduler.make_context("scheduler", [], self.ctx)
        scheduler = run.scheduler.invoke(ctx)
        run_in_thread(scheduler.xmlrpc_run)
        run_in_thread(scheduler.run)

        ctx = run.fetcher.make_context("fetcher", [], self.ctx)
        fetcher = run.fetcher.invoke(ctx)
        run_in_thread(fetcher.run)

        ctx = run.processor.make_context("processor", [], self.ctx)
        processor = run.processor.invoke(ctx)
        run_in_thread(processor.run)

        ctx = run.result_worker.make_context("result_worker", [], self.ctx)
        result_worker = run.result_worker.invoke(ctx)
        run_in_thread(result_worker.run)

        ctx = run.webui.make_context("webui", ["--scheduler-rpc", "http://localhost:23333/"], self.ctx)
        app = run.webui.invoke(ctx)
        app.debug = True
        self.app = app.test_client()
        self.rpc = app.config["scheduler_rpc"]

        time.sleep(1)
Example #17
def scheduler(ctx, xmlrpc, xmlrpc_host, xmlrpc_port,
        inqueue_limit, delete_time, active_tasks):
    g = ctx.obj
    from pyspider.scheduler import Scheduler
    scheduler = Scheduler(taskdb=g.taskdb, projectdb=g.projectdb, resultdb=g.resultdb,
            newtask_queue=g.newtask_queue, status_queue=g.status_queue,
            out_queue=g.scheduler2fetcher)
    scheduler.INQUEUE_LIMIT = inqueue_limit
    scheduler.DELETE_TIME = delete_time
    scheduler.ACTIVE_TASKS = active_tasks
    g.instances.append(scheduler)

    if xmlrpc:
        run_in_thread(scheduler.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    scheduler.run()
Example #18
    def setUpClass(self):
        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        def get_taskdb():
            return taskdb.TaskDB(self.taskdb_path)
        self.taskdb = get_taskdb()
        def get_projectdb():
            return projectdb.ProjectDB(self.projectdb_path)
        self.projectdb = get_projectdb()
        def get_resultdb():
            return resultdb.ResultDB(self.resultdb_path)
        self.resultdb = get_resultdb()

        self.newtask_queue = Queue(10)
        self.status_queue = Queue(10)
        self.scheduler2fetcher = Queue(10)
        self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % self.scheduler_xmlrpc_port)

        def run_scheduler():
            scheduler = Scheduler(taskdb=get_taskdb(), projectdb=get_projectdb(),
                    newtask_queue=self.newtask_queue, status_queue=self.status_queue,
                    out_queue=self.scheduler2fetcher, data_path="./data/tests/",
                    resultdb=get_resultdb())
            scheduler.UPDATE_PROJECT_INTERVAL = 0.1
            scheduler.LOOP_INTERVAL = 0.1
            scheduler.INQUEUE_LIMIT = 10
            Scheduler.DELETE_TIME = 0
            scheduler._last_tick = int(time.time()) # not dispatch cronjob
            run_in_thread(scheduler.xmlrpc_run, port=self.scheduler_xmlrpc_port)
            scheduler.run()

        self.process = run_in_thread(run_scheduler)
        time.sleep(1)
Example #19
    def setUpClass(self):
        import easywebdav

        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        ctx = run.cli.make_context('test', [
            '--taskdb', 'sqlite+taskdb:///data/tests/task.db',
            '--projectdb', 'sqlite+projectdb:///data/tests/projectdb.db',
            '--resultdb', 'sqlite+resultdb:///data/tests/resultdb.db',
        ], None, obj=utils.ObjectDict(testing_mode=True))
        self.ctx = run.cli.invoke(ctx)

        ctx = run.webui.make_context('webui', [
            '--username', 'binux',
            '--password', '4321',
            '--need-auth',
        ], self.ctx)
        self.app = run.webui.invoke(ctx)
        self.app_thread = utils.run_in_thread(self.app.run)
        time.sleep(5)

        self.webdav = easywebdav.connect('localhost', port=5000, path='dav')
        self.webdav_up = easywebdav.connect('localhost', port=5000, path='dav',
                                            username='******', password='******')
Example #20
    def setUpClass(self):
        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        ctx = run.cli.make_context('test', [
            '--taskdb', 'sqlite+taskdb:///data/tests/task.db',
            '--projectdb', 'sqlite+projectdb:///data/tests/projectdb.db',
            '--resultdb', 'sqlite+resultdb:///data/tests/resultdb.db',
        ], None, obj=dict(testing_mode=True))
        self.ctx = run.cli.invoke(ctx)

        ctx = run.scheduler.make_context('scheduler', [], self.ctx)
        scheduler = run.scheduler.invoke(ctx)
        utils.run_in_thread(scheduler.xmlrpc_run)
        utils.run_in_thread(scheduler.run)

        time.sleep(1)
Example #21
def scheduler(ctx, xmlrpc, xmlrpc_host, xmlrpc_port,
              inqueue_limit, delete_time, active_tasks, Scheduler=Scheduler):
    g = ctx.obj
    scheduler = Scheduler(taskdb=g.taskdb, projectdb=g.projectdb, resultdb=g.resultdb,
                          newtask_queue=g.newtask_queue, status_queue=g.status_queue,
                          out_queue=g.scheduler2fetcher, data_path=g.get('data_path', 'data'))
    scheduler.INQUEUE_LIMIT = inqueue_limit
    scheduler.DELETE_TIME = delete_time
    scheduler.ACTIVE_TASKS = active_tasks

    g.instances.append(scheduler)
    if g.get('testing_mode'):
        return scheduler

    if xmlrpc:
        run_in_thread(scheduler.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    scheduler.run()
Example #22
    def setUpClass(self):
        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, host='0.0.0.0', port=14887, passthrough_errors=False)
        self.httpbin = 'http://' + socket.gethostbyname(socket.gethostname()) + ':14887'

        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.splash_endpoint = 'http://127.0.0.1:8050/execute'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux',
                                              '--password=123456', '--port=14830',
                                              '--debug'], close_fds=True)
        self.proxy = '127.0.0.1:14830'
Example #23
def fetcher(ctx, xmlrpc, xmlrpc_host, xmlrpc_port, poolsize, proxy, user_agent, timeout, Fetcher=Fetcher):
    g = ctx.obj
    fetcher = Fetcher(inqueue=g.scheduler2fetcher, outqueue=g.fetcher2processor,
                      poolsize=poolsize, proxy=proxy)
    fetcher.phantomjs_proxy = g.phantomjs_proxy
    if user_agent:
        fetcher.user_agent = user_agent
    if timeout:
        fetcher.default_options = dict(fetcher.default_options)
        fetcher.default_options['timeout'] = timeout

    g.instances.append(fetcher)
    if g.get('testing_mode'):
        return fetcher

    if xmlrpc:
        run_in_thread(fetcher.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    fetcher.run()
Example #24
def fetcher(ctx, xmlrpc, xmlrpc_host, xmlrpc_port, poolsize, proxy, user_agent, timeout, fetcher_cls):
    g = ctx.obj
    Fetcher = load_cls(None, None, fetcher_cls)

    fetcher = Fetcher(inqueue=g.scheduler2fetcher, outqueue=g.fetcher2processor, poolsize=poolsize, proxy=proxy)
    fetcher.phantomjs_proxy = g.phantomjs_proxy
    if user_agent:
        fetcher.user_agent = user_agent
    if timeout:
        fetcher.default_options = copy.deepcopy(fetcher.default_options)
        fetcher.default_options["timeout"] = timeout

    g.instances.append(fetcher)
    if g.get("testing_mode"):
        return fetcher

    if xmlrpc:
        utils.run_in_thread(fetcher.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    fetcher.run()
Example #25
    def setUpClass(self):
        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        import tests.data_test_webpage
        import httpbin
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887)
        self.httpbin = 'http://127.0.0.1:14887'

        ctx = run.cli.make_context('test', [
            '--taskdb', 'sqlite+taskdb:///data/tests/task.db',
            '--projectdb', 'sqlite+projectdb:///data/tests/projectdb.db',
            '--resultdb', 'sqlite+resultdb:///data/tests/resultdb.db',
        ], None, obj=ObjectDict(testing_mode=True))
        self.ctx = run.cli.invoke(ctx)

        ctx = run.scheduler.make_context('scheduler', [], self.ctx)
        scheduler = run.scheduler.invoke(ctx)
        run_in_thread(scheduler.xmlrpc_run)
        run_in_thread(scheduler.run)

        ctx = run.fetcher.make_context('fetcher', [], self.ctx)
        fetcher = run.fetcher.invoke(ctx)
        run_in_thread(fetcher.run)

        ctx = run.processor.make_context('processor', [], self.ctx)
        processor = run.processor.invoke(ctx)
        run_in_thread(processor.run)

        ctx = run.result_worker.make_context('result_worker', [], self.ctx)
        result_worker = run.result_worker.invoke(ctx)
        run_in_thread(result_worker.run)

        ctx = run.webui.make_context('webui', [
            '--scheduler-rpc', 'http://localhost:23333/'
        ], self.ctx)
        app = run.webui.invoke(ctx)
        app.debug = True
        self.app = app.test_client()
        self.rpc = app.config['scheduler_rpc']

        time.sleep(1)
Example #26
    def test_60_timeout_in_thread(self):
        base_task = self.base_task
        fetch_result = self.fetch_result
        base_task['process']['callback'] = 'sleep'
        base_task['process']['process_time_limit'] = 0.5
        fetch_result['save'] = 2

        start_time = time.time()
        thread = utils.run_in_thread(lambda self=self: self.instance.run_task(self.module, base_task, fetch_result))
        thread.join()
        self.assertGreaterEqual(time.time() - start_time, 2)
Example #27
    def test_40_multiple_threading_error(self):
        def put(q):
            for i in range(100):
                q.put("DATA_{0:d}".format(i))

        def get(q):
            for i in range(100):
                q.get()

        t = utils.run_in_thread(put, self.q3)
        get(self.q3)
        t.join()
Example #28
def scheduler(ctx, xmlrpc, xmlrpc_host, xmlrpc_port,
              inqueue_limit, delete_time, active_tasks, loop_limit, scheduler_cls,
              threads, bloomfilter_on, bloomfilter_rpc, get_object=False):
    """
    Run Scheduler, only one scheduler is allowed.
    """
    g = ctx.obj
    Scheduler = load_cls(None, None, scheduler_cls)

    kwargs = dict(taskdb=g.taskdb, projectdb=g.projectdb, resultdb=g.resultdb,
                  newtask_queue=g.newtask_queue, status_queue=g.status_queue,
                  out_queue=g.scheduler2fetcher, data_path=g.get('data_path', 'data'))
    if threads:
        kwargs['threads'] = int(threads)

    if bloomfilter_on:
        bloomfilter_config = g.config.get('bloomfilter', {})
        bloomfilter_config.setdefault('xmlrpc_host', '127.0.0.1')
        bloomfilter_config.setdefault('xmlrpc_port', 13100)
        bloomfilter_rpc = connect_rpc(ctx, None,
                                    'http://%(xmlrpc_host)s:%(xmlrpc_port)s/' % bloomfilter_config)
    else:
        bloomfilter_rpc = None

    scheduler = Scheduler(**kwargs)

    scheduler.INQUEUE_LIMIT = inqueue_limit
    scheduler.DELETE_TIME = delete_time
    scheduler.ACTIVE_TASKS = active_tasks
    scheduler.LOOP_LIMIT = loop_limit

    g.instances.append(scheduler)
    if g.get('testing_mode') or get_object:
        return scheduler

    if xmlrpc:
        utils.run_in_thread(scheduler.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)

    scheduler.run()
Example #29
    def setUpClass(self):
        shutil.rmtree("./data/tests", ignore_errors=True)
        os.makedirs("./data/tests")

        run_in_thread(run.run_scheduler, g=run.g)
        run_in_thread(run.run_fetcher, g=run.g)
        run_in_thread(run.run_processor, g=run.g)
        run_in_thread(run.run_result_worker, g=run.g)
        time.sleep(1)

        app.config["taskdb"] = run.g.taskdb
        app.config["projectdb"] = run.g.projectdb
        app.config["resultdb"] = run.g.resultdb
        app.config["scheduler_rpc"] = xmlrpclib.ServerProxy("http://localhost:23333")
        self.app = app.test_client()
Example #30
    def setUpClass(self):
        shutil.rmtree('./data/tests/', ignore_errors=True)
        os.makedirs('./data/tests/')

        def get_resultdb():
            return resultdb.ResultDB(self.resultdb_path)
        self.resultdb = get_resultdb()
        self.inqueue = Queue(10)

        def run_result_worker():
            self.result_worker = ResultWorker(get_resultdb(), self.inqueue)
            self.result_worker.run()
        self.process = run_in_thread(run_result_worker)
        time.sleep(1)
Example #31
    fetcher = Fetcher(inqueue=g.scheduler2fetcher,
                      outqueue=g.fetcher2processor,
                      poolsize=poolsize,
                      proxy=proxy,
                      async=async)
    fetcher.phantomjs_proxy = g.phantomjs_proxy
    if user_agent:
        fetcher.user_agent = user_agent
    if timeout:
        fetcher.default_options = copy.deepcopy(fetcher.default_options)
        fetcher.default_options['timeout'] = timeout

    g.instances.append(fetcher)
    if g.get('testing_mode'):
        return fetcher

    if xmlrpc:
        utils.run_in_thread(fetcher.xmlrpc_run,
                            port=xmlrpc_port,
                            bind=xmlrpc_host)
    fetcher.run()


@cli.command()
@click.option('--processor-cls',
              default='pyspider.processor.Processor',
              callback=load_cls,
              help='Processor class to be used.')
@click.pass_context
def processor(ctx, processor_cls, enable_stdout_capture=True):
    """
    Run Processor.
    """
    g = ctx.obj
Example #32
    def setUpClass(self):
        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        import tests.data_test_webpage
        import httpbin
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887)
        self.httpbin = 'http://127.0.0.1:14887'

        ctx = run.cli.make_context('test', [
            '--taskdb', 'sqlalchemy+sqlite+taskdb:///data/tests/task.db',
            '--projectdb', 'sqlalchemy+sqlite+projectdb:///data/tests/projectdb.db',
            '--resultdb', 'sqlalchemy+sqlite+resultdb:///data/tests/resultdb.db',
        ], None, obj=ObjectDict(testing_mode=True))
        self.ctx = run.cli.invoke(ctx)

        ctx = run.scheduler.make_context('scheduler', [], self.ctx)
        scheduler = run.scheduler.invoke(ctx)
        run_in_thread(scheduler.xmlrpc_run)
        run_in_thread(scheduler.run)

        ctx = run.fetcher.make_context('fetcher', [
            '--xmlrpc',
            '--xmlrpc-port', '24444',
        ], self.ctx)
        fetcher = run.fetcher.invoke(ctx)
        run_in_thread(fetcher.xmlrpc_run)
        run_in_thread(fetcher.run)

        ctx = run.processor.make_context('processor', [], self.ctx)
        processor = run.processor.invoke(ctx)
        run_in_thread(processor.run)

        ctx = run.result_worker.make_context('result_worker', [], self.ctx)
        result_worker = run.result_worker.invoke(ctx)
        run_in_thread(result_worker.run)

        ctx = run.webui.make_context('webui', [
            '--scheduler-rpc', 'http://localhost:23333/'
        ], self.ctx)
        app = run.webui.invoke(ctx)
        app.debug = True
        self.app = app.test_client()
        self.rpc = app.config['scheduler_rpc']

        time.sleep(1)