示例#1
0
class EggStorageTest(unittest.TestCase):
    """Put/list/get/delete scenario for ``FilesystemEggStorage``."""

    def setUp(self):
        """Build a storage whose ``eggs_dir`` is the path from ``mktemp()``."""
        d = self.mktemp()
        config = Config(values={'eggs_dir': d})
        self.eggst = FilesystemEggStorage(config)

    def test_interface(self):
        """The storage object must provide ``IEggStorage``."""
        verifyObject(IEggStorage, self.eggst)

    def test_put_get_list_delete(self):
        """Store three versions, then list, fetch and delete them."""
        # Versions are stored out of order; list() is asserted sorted below.
        self.eggst.put(StringIO("egg01"), 'mybot', '01')
        self.eggst.put(StringIO("egg03"), 'mybot', '03')
        self.eggst.put(StringIO("egg02"), 'mybot', '02')

        self.assertEqual(self.eggst.list('mybot'), ['01', '02', '03'])
        self.assertEqual(self.eggst.list('mybot2'), [])

        # Without an explicit version, get() is expected to return '03'.
        v, f = self.eggst.get('mybot')
        try:
            self.assertEqual(v, "03")
            self.assertEqual(f.read(), "egg03")
        finally:
            # Close the handle even when an assertion fails, so the open
            # file cannot outlive the test.
            f.close()

        v, f = self.eggst.get('mybot', '02')
        try:
            self.assertEqual(v, "02")
            self.assertEqual(f.read(), "egg02")
        finally:
            f.close()

        # Deleting one version leaves the others in place.
        self.eggst.delete('mybot', '02')
        self.assertEqual(self.eggst.list('mybot'), ['01', '03'])

        # Deleting without a version removes everything for the project.
        self.eggst.delete('mybot')
        self.assertEqual(self.eggst.list('mybot'), [])
示例#2
0
class EggStorageTest(unittest.TestCase):
    """Round-trip checks for ``FilesystemEggStorage``."""

    def setUp(self):
        """Create the storage under test with ``eggs_dir`` from ``mktemp()``."""
        eggs_dir = self.mktemp()
        self.eggst = FilesystemEggStorage(Config(values={"eggs_dir": eggs_dir}))

    def test_interface(self):
        """The storage instance satisfies ``IEggStorage``."""
        verifyObject(IEggStorage, self.eggst)

    def _check_egg(self, expected_version, expected_content, *get_args):
        """Call ``get(*get_args)`` and verify the version and file content."""
        version, egg_file = self.eggst.get(*get_args)
        self.assertEqual(version, expected_version)
        self.assertEqual(egg_file.read(), expected_content)
        egg_file.close()

    def test_put_get_list_delete(self):
        """Put three eggs, then list, get and delete them."""
        # Stored out of order; the listing below is asserted in sorted order.
        uploads = (
            ("egg01", "01"),
            ("egg03", "03"),
            ("egg02", "02"),
        )
        for content, version in uploads:
            self.eggst.put(StringIO(content), "mybot", version)

        self.assertEqual(self.eggst.list("mybot"), ["01", "02", "03"])
        self.assertEqual(self.eggst.list("mybot2"), [])

        # No version given -> "03" is expected back.
        self._check_egg("03", "egg03", "mybot")
        self._check_egg("02", "egg02", "mybot", "02")

        self.eggst.delete("mybot", "02")
        self.assertEqual(self.eggst.list("mybot"), ["01", "03"])

        # Deleting without a version empties the project.
        self.eggst.delete("mybot")
        self.assertEqual(self.eggst.list("mybot"), [])
示例#3
0
class EggStorageTest(unittest.TestCase):
    """Put/get/list/delete scenario for ``FilesystemEggStorage``."""

    def setUp(self):
        """Point a new storage at the path returned by ``mktemp()``."""
        self.eggst = FilesystemEggStorage(
            Config(values={'eggs_dir': self.mktemp()}))

    def test_interface(self):
        """``IEggStorage`` must be provided by the storage object."""
        verifyObject(IEggStorage, self.eggst)

    def test_put_get_list_delete(self):
        """Walk through storing, listing, reading and deleting eggs."""
        storage = self.eggst
        expect = self.assertEqual

        # Upload order is 01, 03, 02; list() is asserted sorted afterwards.
        storage.put(StringIO("egg01"), 'mybot', '01')
        storage.put(StringIO("egg03"), 'mybot', '03')
        storage.put(StringIO("egg02"), 'mybot', '02')

        expect(storage.list('mybot'), ['01', '02', '03'])
        expect(storage.list('mybot2'), [])

        # get() without a version: '03' is expected.
        version, egg = storage.get('mybot')
        expect(version, "03")
        expect(egg.read(), "egg03")
        egg.close()

        # get() with an explicit version.
        version, egg = storage.get('mybot', '02')
        expect(version, "02")
        expect(egg.read(), "egg02")
        egg.close()

        storage.delete('mybot', '02')
        expect(storage.list('mybot'), ['01', '03'])

        # No version -> every egg of the project is removed.
        storage.delete('mybot')
        expect(storage.list('mybot'), [])
示例#4
0
    def test_egg(self, eggf):
        """Install an egg's requirements, then list the spiders it contains.

        The egg is stored as version ``'1'`` of ``self.project_name`` in an
        ``eggs`` directory under a new temporary directory, which is
        remembered in ``self.temp_dir`` and is not removed here.

        :param eggf: seekable file object holding the egg
        :return: future resolved with the ``spider_list`` result, or failed
                 with the exception raised by either asynchronous step
        """
        outcome = Future()
        workdir = tempfile.mkdtemp('scrapydd-egg-%s' % self.project_name)
        self.temp_dir = workdir
        eggf.seek(0)
        storage_config = scrapyd.config.Config(
            {'eggs_dir': os.path.join(workdir, 'eggs')})
        storage = FilesystemEggStorage(storage_config)
        storage.put(eggf, project=self.project_name, version='1')
        eggf.seek(0)

        # 'scrapyd' is always installed in addition to the egg's own needs.
        requirements = self._read_egg_requirements(eggf) + ['scrapyd']

        def forward_failure(finished):
            """Copy an exception from *finished* to the outcome, if any."""
            error = finished.exception()
            if error is None:
                return False
            outcome.set_exception(error)
            return True

        def on_spiders_listed(finished):
            logger.debug('after_spider_list')
            if not forward_failure(finished):
                outcome.set_result(finished.result())

        def on_requirements_installed(finished):
            logger.debug('after_pip_install')
            if forward_failure(finished):
                return
            listing = self.spider_list(self.project_name, cwd=workdir)
            listing.add_done_callback(on_spiders_listed)

        installing = self.pip_install(requirements)
        installing.add_done_callback(on_requirements_installed)

        return outcome
示例#5
0
class EggStorageTest(unittest.TestCase):
    """Scenario test for ``FilesystemEggStorage`` with unusual version names."""

    def setUp(self):
        """Create the storage with ``eggs_dir`` taken from ``mktemp()``."""
        eggs_dir = self.mktemp()
        self.eggst = FilesystemEggStorage(Config(values={'eggs_dir': eggs_dir}))

    def test_interface(self):
        """The storage must provide ``IEggStorage``."""
        verifyObject(IEggStorage, self.eggst)

    def test_put_get_list_delete(self):
        """Store, list, fetch and delete eggs whose versions need escaping."""
        uploads = (
            (b"egg01", '01'),
            (b"egg03", '03/ver'),
            (b"egg02", '02_my branch'),
        )
        for content, version in uploads:
            self.eggst.put(BytesIO(content), 'mybot', version)

        # '/' and ' ' in the stored versions are listed as '_'.
        expected_versions = ['01', '02_my_branch', '03_ver']
        self.assertEqual(self.eggst.list('mybot'), expected_versions)
        self.assertEqual(self.eggst.list('mybot2'), [])

        # (arguments to get(), expected version, expected content)
        lookups = (
            (('mybot',), "03_ver", b"egg03"),
            (('mybot', '02_my branch'), "02_my branch", b"egg02"),
            (('mybot', '02_my_branch'), "02_my_branch", b"egg02"),
        )
        for get_args, version, content in lookups:
            found_version, egg = self.eggst.get(*get_args)
            self.assertEqual(found_version, version)
            self.assertEqual(egg.read(), content)
            egg.close()

        # (arguments to delete(), versions expected to remain afterwards)
        removals = (
            (('mybot', '02_my branch'), ['01', '03_ver']),
            (('mybot', '03_ver'), ['01']),
            (('mybot',), []),
        )
        for delete_args, remaining in removals:
            self.eggst.delete(*delete_args)
            self.assertEqual(self.eggst.list('mybot'), remaining)
示例#6
0
class EggStorageTest(unittest.TestCase):
    """Scenario test for ``FilesystemEggStorage`` using packaged egg files."""

    def setUp(self):
        """Create the storage with ``eggs_dir`` taken from ``mktemp()``."""
        self.eggst = FilesystemEggStorage(
            Config(values={'eggs_dir': self.mktemp()}))

    def test_interface(self):
        """The storage must provide ``IEggStorage``."""
        verifyObject(IEggStorage, self.eggst)

    def test_put_get_list_delete(self):
        """Store, list, fetch and delete the bundled test eggs."""
        storage = self.eggst

        def load(filename):
            """Read one egg fixture from the ``scrapyd.tests`` package."""
            return get_data("scrapyd.tests", filename)

        storage.put(BytesIO(load("mybot.egg")), 'mybot', '01')
        storage.put(BytesIO(load("mybot3.egg")), 'mybot', '03/ver')
        storage.put(BytesIO(load("mybot2.egg")), 'mybot', '02_my branch')

        # '/' and ' ' in the stored versions are listed as '_'.
        listed = storage.list('mybot')
        self.assertEqual(listed, ['01', '02_my_branch', '03_ver'])
        self.assertEqual(storage.list('mybot2'), [])

        # No version requested: '03_ver' is expected.
        version, egg = storage.get('mybot')
        self.assertEqual(version, "03_ver")
        self.assertEqual(egg.read(), load("mybot3.egg"))
        egg.close()

        # The version is echoed back exactly as it was requested.
        version, egg = storage.get('mybot', '02_my branch')
        self.assertEqual(version, "02_my branch")
        self.assertEqual(egg.read(), load("mybot2.egg"))
        egg.close()

        version, egg = storage.get('mybot', '02_my_branch')
        self.assertEqual(version, "02_my_branch")
        self.assertEqual(egg.read(), load("mybot2.egg"))
        egg.close()

        storage.delete('mybot', '02_my branch')
        self.assertEqual(storage.list('mybot'), ['01', '03_ver'])

        storage.delete('mybot', '03_ver')
        self.assertEqual(storage.list('mybot'), ['01'])

        # Deleting without a version empties the project.
        storage.delete('mybot')
        self.assertEqual(storage.list('mybot'), [])
示例#7
0
class ProjectWorkspace(object):
    """Per-project virtualenv workspace plus access to the project's eggs.

    The virtualenv directory is ``workspace/<project_name>``, made absolute
    against the working directory at construction time.  ``init``,
    ``pip_install``, ``spider_list`` and ``test_egg`` return a future; the
    child processes they start are polled once per second through
    ``IOLoop.current().call_later``.
    """
    # Paths of the virtualenv's pip / python executables (set in __init__).
    pip = None
    python = None
    process = None
    project_workspace_dir = None
    project_check = None
    # Scratch directory created by test_egg(); removed by clearup().
    temp_dir = None

    def __init__(self, project_name):
        """Compute the workspace paths for *project_name*.

        Nothing is created on disk here; call ``init()`` for that.

        :raises NotImplementedError: if ``sys.platform`` starts with neither
                                     ``linux`` nor ``win``
        """
        project_workspace_dir = os.path.abspath(
            os.path.join('workspace', project_name))
        self.project_workspace_dir = project_workspace_dir
        self.project_name = project_name
        self.egg_storage = FilesystemEggStorage(scrapyd.config.Config())
        if sys.platform.startswith('linux'):
            self.pip = os.path.join(project_workspace_dir, 'bin', 'pip')
            self.python = os.path.join(project_workspace_dir, 'bin', 'python')
        elif sys.platform.startswith('win'):
            self.pip = os.path.join(project_workspace_dir, 'Scripts',
                                    'pip.exe')
            self.python = os.path.join(project_workspace_dir, 'Scripts',
                                       'python.exe')
        else:
            raise NotImplementedError('Unsupported system %s' % sys.platform)

    def _start_process(self, args, **popen_kwargs):
        """Start *args* with stdout/stderr captured in anonymous temp files.

        Temp files are used instead of pipes because the output is only read
        after the child has exited: a child that fills a pipe buffer would
        block on write and ``poll()`` would never report completion.

        :return: ``(process, out_file, err_file)``
        """
        out_file = tempfile.TemporaryFile()
        try:
            err_file = tempfile.TemporaryFile()
            try:
                process = Popen(args, stdout=out_file, stderr=err_file,
                                **popen_kwargs)
            except Exception:
                err_file.close()
                raise
        except Exception:
            out_file.close()
            raise
        return process, out_file, err_file

    def _watch_process(self, started, on_exit, poll_message='poll'):
        """Poll a child once per second and report its exit.

        :param started: ``(process, out_file, err_file)`` as returned by
                        ``_start_process``
        :param on_exit: called as ``on_exit(retcode, std_output, err_output)``
                        once the process has terminated
        :param poll_message: debug message logged on every poll
        """
        process, out_file, err_file = started

        def check_process():
            logger.debug(poll_message)
            retcode = process.poll()
            if retcode is None:
                # Still running: look again in one second.
                IOLoop.current().call_later(1, check_process)
                return
            try:
                out_file.seek(0)
                err_file.seek(0)
                std_output = out_file.read()
                err_output = err_file.read()
            finally:
                out_file.close()
                err_file.close()
            on_exit(retcode, std_output, err_output)

        check_process()

    def init(self):
        '''
        init project isolated workspace,
        :return: future resolved with this workspace once the virtualenv
                 exists, or failed with ProcessFailed / the spawn error
        '''
        future = Future()
        # Both executables present: the virtualenv is treated as ready.
        if os.path.exists(self.pip) and os.path.exists(self.python):
            future.set_result(self)
            return future

        logger.debug('start creating virtualenv.')
        try:
            started = self._start_process([
                'virtualenv', '--system-site-packages',
                self.project_workspace_dir
            ])
        except Exception as e:
            future.set_exception(e)
            return future

        def on_exit(retcode, std_output, err_output):
            if retcode == 0:
                future.set_result(self)
            else:
                future.set_exception(
                    ProcessFailed('Error when init workspace virtualenv ',
                                  std_output=std_output,
                                  err_output=err_output))

        self._watch_process(started, on_exit,
                            poll_message='create virtualenv process poll.')
        return future

    def _requirements_from_egg(self, eggf, prefix):
        """Return the requirement strings declared by the egg in *eggf*.

        The egg is copied into a temporary ``.egg`` file whose path is handed
        to ``pkg_resources.find_distributions``; the temporary file is always
        removed afterwards.

        :param eggf: seekable file object holding the egg
        :param prefix: file-name prefix for the temporary copy
        :raises ValueError: if no distribution is found in the egg
        """
        # mkstemp runs before the try block so the cleanup below can never
        # see an unbound path when creating the file itself fails.
        fd, eggpath = tempfile.mkstemp(prefix=prefix, suffix='.egg')
        try:
            logger.debug('tmp egg file saved to %s', eggpath)
            with os.fdopen(fd, 'wb') as lf:
                eggf.seek(0)
                shutil.copyfileobj(eggf, lf)
            try:
                # The next() builtin works for Python 2 and 3 iterators.
                d = next(pkg_resources.find_distributions(eggpath))
            except StopIteration:
                raise ValueError("Unknown or corrupt egg")
            return [str(x) for x in d.requires()]
        finally:
            os.remove(eggpath)

    def find_project_requirements(self, project, egg_storage=None, eggf=None):
        """Return the requirement strings of *project*'s egg.

        :param project: project name, used for the storage lookup and the
                        temporary file name
        :param egg_storage: storage to fetch the egg from when *eggf* is not
                            given; defaults to a storage built from the
                            default scrapyd config
        :param eggf: already opened egg file; when given, no lookup happens
                     and the file is left open for the caller
        :raises ValueError: if the egg contains no distribution
        """
        opened_here = eggf is None
        if opened_here:
            if egg_storage is None:
                egg_storage = FilesystemEggStorage(scrapyd.config.Config())
            _version, eggf = egg_storage.get(project)
        try:
            return self._requirements_from_egg(eggf, '%s-nover-' % (project))
        finally:
            # Only close a file this method obtained itself.
            if opened_here and eggf is not None:
                eggf.close()

    def test_egg(self, eggf):
        """Install an egg's requirements, then list the spiders it contains.

        The egg is stored as version ``'1'`` in an ``eggs`` directory under a
        new temporary directory, which is remembered in ``self.temp_dir`` so
        that ``clearup()`` can remove it later.

        :param eggf: seekable file object holding the egg
        :return: future resolved with the spider list, or failed with the
                 exception of the pip / listing step
        """
        future = Future()
        # Passed positionally, so this string is mkdtemp's *suffix*.
        temp_dir = tempfile.mkdtemp('scrapydd-egg-%s' % self.project_name)
        self.temp_dir = temp_dir
        eggf.seek(0)
        egg_storage = FilesystemEggStorage(
            scrapyd.config.Config({'eggs_dir': os.path.join(temp_dir,
                                                            'eggs')}))
        egg_storage.put(eggf, project=self.project_name, version='1')
        eggf.seek(0)

        # 'scrapyd' is always installed in addition to the egg's own needs.
        requirements = self._read_egg_requirements(eggf) + ['scrapyd']

        def after_spider_list(callback_future):
            logger.debug('after_spider_list')
            exc = callback_future.exception()
            if exc is not None:
                future.set_exception(exc)
                return
            future.set_result(callback_future.result())

        def after_pip_install(callback_future):
            logger.debug('after_pip_install')
            exc = callback_future.exception()
            if exc is not None:
                future.set_exception(exc)
                return

            self.spider_list(self.project_name,
                             cwd=temp_dir).add_done_callback(after_spider_list)

        self.pip_install(requirements).add_done_callback(after_pip_install)

        return future

    def _read_egg_requirements(self, eggf):
        """Return the requirement strings declared by the egg in *eggf*.

        :raises ValueError: if the egg contains no distribution
        """
        prefix = '%s-%s-' % (self.project_name, 0)
        return self._requirements_from_egg(eggf, prefix)

    def pip_install(self, requirements):
        """Install *requirements* with the workspace's pip.

        :param requirements: list of requirement strings
        :return: future resolved with this workspace on success, failed with
                 ProcessFailed (carrying the captured output) on a non-zero
                 exit code, or with the error raised while spawning pip
        """
        logger.debug('installing requirements: %s', requirements)
        future = Future()
        if not requirements:
            # "pip install" without any requirement exits with an error, so
            # an egg that declares no dependencies is an immediate success.
            future.set_result(self)
            return future
        try:
            started = self._start_process([self.pip, 'install'] +
                                          list(requirements))
        except Exception as e:
            future.set_exception(e)
            return future

        def on_exit(retcode, std_output, err_output):
            if retcode == 0:
                future.set_result(self)
            else:
                future.set_exception(
                    ProcessFailed(std_output=std_output,
                                  err_output=err_output))

        self._watch_process(started, on_exit)
        return future

    def spider_list(self, project, cwd=None):
        """Run ``scrapyd.runner list`` with the workspace's python.

        :param project: value exported as ``SCRAPY_PROJECT`` to the child
        :param cwd: working directory of the child process
        :return: future resolved with the lines of the child's stdout, failed
                 with InvalidProjectEgg (carrying stderr) on a non-zero exit
                 code, or with the error raised while spawning the process
        """
        future = Future()
        try:
            env = os.environ.copy()
            env['SCRAPY_PROJECT'] = project
            started = self._start_process(
                [self.python, '-m', 'scrapyd.runner', 'list'],
                env=env,
                cwd=cwd)
        except Exception as e:
            logger.error(e)
            future.set_exception(e)
            return future

        def on_exit(retcode, std_output, err_output):
            if retcode == 0:
                future.set_result(std_output.splitlines())
            else:
                future.set_exception(InvalidProjectEgg(detail=err_output))

        self._watch_process(started, on_exit)
        return future

    def clearup(self):
        '''
        clean up temp files.
        :return:
        '''
        if self.temp_dir and os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)

    def put_egg(self, eggfile, version):
        """Store *eggfile* as *version* of this workspace's project."""
        eggfile.seek(0)
        self.egg_storage.put(eggfile=eggfile,
                             project=self.project_name,
                             version=version)

    def get_egg(self, version=None):
        """Return the storage's ``get`` result for this workspace's project."""
        return self.egg_storage.get(self.project_name, version=version)

    def delete_egg(self, project, version=None):
        """Delete eggs of *project* (not necessarily this workspace's)."""
        logger.info('deleting project eggs')
        return self.egg_storage.delete(project, version)

    def list_versions(self, project):
        """Return the stored versions of *project*."""
        return self.egg_storage.list(project)
示例#8
0
class TaskExecutor():
    """Run one spider task in a private temporary workspace.

    The workspace (a ``ddjob-<project>-<task id>-`` temp directory) holds the
    task log ``<task id>.log``, the ``eggs`` directory used by the egg
    storage and, once the crawl starts, the ``<task id>.jl`` items file.
    The whole directory is removed when the executor is garbage collected.
    """
    def __init__(self, task, config=None):
        '''
        @type task: SpiderTask
        @param config: agent configuration; a fresh AgentConfig when omitted
        '''
        self.task = task
        if config is None:
            config = AgentConfig()
        # Prefer HTTPS whenever an HTTPS port is configured.
        if config.get('server_https_port'):
            self.service_base = 'https://%s:%d' % (config.get('server'), config.getint('server_https_port'))
        else:
            self.service_base = 'http://%s:%d' % (config.get('server'), config.getint('server_port'))
        self._f_output = None
        self.output_file = None
        self.p = None
        self.check_process_callback = None
        self.items_file = None
        self.ret_code = None
        self.workspace_dir = tempfile.mkdtemp(prefix='ddjob-%s-%s-' % (task.project_name, task.id))
        if not os.path.exists(self.workspace_dir):
            os.makedirs(self.workspace_dir)
        self.output_file = str(os.path.join(self.workspace_dir, '%s.log' % self.task.id))
        self._f_output = open(self.output_file, 'w')

        eggs_dir = os.path.join(self.workspace_dir, 'eggs')
        if not os.path.exists(eggs_dir):
            os.mkdir(eggs_dir)
        self.egg_storage = FilesystemEggStorage(scrapyd.config.Config(values={'eggs_dir': eggs_dir}))
        # Optional hook called as on_subprocess_start(task, pid).
        self.on_subprocess_start = None

    @staticmethod
    def _error_text(exc):
        """Return a printable message for *exc*.

        ``exc.message`` is used when it is present and non-empty; otherwise
        ``str(exc)`` is used, since ``message`` may be missing or empty.
        """
        return getattr(exc, 'message', None) or str(exc)

    @gen.coroutine
    def execute(self):
        """Prepare the workspace, download the egg and run the crawl.

        Every failure is turned into a ``complete_with_error`` result rather
        than an exception.

        :return: (via ``gen.Return``) this executor, with ``ret_code`` set
        """
        try:
            workspace = ProjectWorkspace(self.task.project_name)
            yield workspace.init()
            logger.debug('begin download egg.')
            egg_request_url = urlparse.urljoin(self.service_base, '/spiders/%d/egg' % self.task.spider_id)
            request = HTTPRequest(egg_request_url)
            client = AsyncHTTPClient()
            response = yield client.fetch(request)
            self.egg_storage.put(response.buffer, self.task.project_name, self.task.project_version)
            logger.debug('download egg done.')
            requirements = workspace.find_project_requirements(self.task.project_name, egg_storage=self.egg_storage)
            yield workspace.pip_install(requirements)
            result = yield self.execute_subprocess()
        except ProcessFailed as e:
            logger.error(e)
            error_log = self._error_text(e)
            if e.std_output:
                logger.error(e.std_output)
                error_log += e.std_output
            result = self.complete_with_error(error_log)
        except Exception as e:
            logger.error(e)
            result = self.complete_with_error(self._error_text(e))
        raise gen.Return(result)

    def check_process(self):
        """Poll the crawl subprocess once and finish the task if it exited."""
        execute_result = self.p.poll()
        logger.debug('check process')
        if execute_result is not None:
            logger.info('task complete')
            self.complete(execute_result)

    def execute_subprocess(self):
        """Start the crawl subprocess and watch it until it exits.

        :return: future resolved with this executor once the subprocess has
                 exited, or immediately (with ``ret_code`` 1) when the
                 subprocess could not be started
        """
        future = Future()
        # init items file
        workspace = ProjectWorkspace(self.task.project_name)
        self.items_file = os.path.join(self.workspace_dir, '%s.%s' % (self.task.id, 'jl'))
        python = workspace.python
        runner = 'scrapyd.runner'
        pargs = [python, '-m', runner, 'crawl', self.task.spider_name]
        for spider_parameter_key, spider_parameter_value in self.task.spider_parameters.items():
            pargs += [
                        '-s',
                        '%s=%s' % (spider_parameter_key, spider_parameter_value)
                      ]

        env = os.environ.copy()
        env['SCRAPY_PROJECT'] = str(self.task.project_name)
        env['SCRAPY_JOB'] = str(self.task.id)
        env['SCRAPY_FEED_URI'] = str(path_to_file_uri(self.items_file))
        try:
            self.p = subprocess.Popen(pargs, env=env, stdout=self._f_output, cwd=self.workspace_dir, stderr=self._f_output)
            if self.on_subprocess_start:
                self.on_subprocess_start(self.task, self.p.pid)

        except Exception as e:
            # Callers yield the return value, so the failure has to be
            # delivered through the future rather than returned directly.
            future.set_result(self.complete_with_error('Error when starting crawl subprocess : %s' % e))
            return future
        logger.info('job %s started on pid: %d' % (self.task.id, self.p.pid))

        def check_process():
            execute_result = self.p.poll()
            logger.debug('check process')
            if execute_result is not None:
                logger.info('task complete')
                future.set_result(self.complete(execute_result))

        # Poll once per second; complete() stops the callback.
        self.check_process_callback = PeriodicCallback(check_process, 1*1000)
        self.check_process_callback.start()
        return future

    def result(self):
        """Return the object handed to callers as the task result (self)."""
        return self

    def complete(self, ret_code):
        """Record *ret_code*, close the log and stop polling.

        :return: this executor
        """
        self._f_output.close()
        self.ret_code = ret_code
        # The periodic callback only exists after execute_subprocess().
        if self.check_process_callback is not None:
            self.check_process_callback.stop()
            self.check_process_callback = None
        return self.result()

    def complete_with_error(self, error_message):
        """Log *error_message*, append it to the task log and set ret_code 1.

        :return: this executor
        """
        logger.error(error_message)
        # The log may already be closed if the task was completed earlier.
        if not self._f_output.closed:
            self._f_output.write(error_message)
            self._f_output.close()
        self.ret_code = 1
        return self.result()

    def __del__(self):
        logger.debug('delete task executor for task %s' % self.task.id)
        # getattr: __init__ may have failed before these attributes existed.
        f_output = getattr(self, '_f_output', None)
        if f_output is not None and not f_output.closed:
            # Close the log first so the directory can actually be removed.
            f_output.close()
        workspace_dir = getattr(self, 'workspace_dir', None)
        if workspace_dir and os.path.exists(workspace_dir):
            shutil.rmtree(workspace_dir)

    @gen.coroutine
    def kill(self):
        """Terminate the crawl subprocess, killing it after a 10 s grace.

        Does nothing when no subprocess was started or it already exited.
        """
        logger.info('killing job %s' % self.task.id)
        if not self.p:
            return
        if self.p.poll() is None:
            self.p.terminate()

        # gen.sleep() only waits when its future is yielded.
        yield gen.sleep(10)
        # Escalate only if the process ignored the terminate request.
        if self.p.poll() is None:
            self.p.kill()