def get_application(config):
    app = Application('Scrapyd')
    http_port = config.getint('http_port', 6800)
    bind_address = config.get('bind_address', '0.0.0.0')
    poll_interval = config.getfloat('poll_interval', 5)

    poller = QueuePoller(config)
    eggstorage = FilesystemEggStorage(config)
    scheduler = SpiderScheduler(config)
    environment = Environment(config)

    app.setComponent(IPoller, poller)
    app.setComponent(IEggStorage, eggstorage)
    app.setComponent(ISpiderScheduler, scheduler)
    app.setComponent(IEnvironment, environment)

    laupath = config.get('launcher', 'scrapyd_mongodb.launcher.Launcher')
    laucls = load_object(laupath)
    launcher = laucls(config, app)

    timer = TimerService(poll_interval, poller.poll)
    webservice = TCPServer(http_port, server.Site(Root(config, app)),
                           interface=bind_address)
    # format must be a plain string, not a tuple, for Twisted's log formatting
    log.msg(format=('Scrapyd web console available at '
                    'http://%(bind_address)s:%(http_port)s/'),
            bind_address=bind_address, http_port=http_port)

    launcher.setServiceParent(app)
    timer.setServiceParent(app)
    webservice.setServiceParent(app)

    return app
def application(config): app = Application("Scrapyd") http_port = int(environ.get('PORT', config.getint('http_port', 6800))) config.cp.set('scrapyd', 'database_url', environ.get('DATABASE_URL')) poller = Psycopg2QueuePoller(config) eggstorage = FilesystemEggStorage(config) scheduler = Psycopg2SpiderScheduler(config) environment = Environment(config) app.setComponent(IPoller, poller) app.setComponent(IEggStorage, eggstorage) app.setComponent(ISpiderScheduler, scheduler) app.setComponent(IEnvironment, environment) launcher = Launcher(config, app) timer = TimerService(5, poller.poll) webservice = TCPServer(http_port, server.Site(Root(config, app))) log.msg("Scrapyd web console available at http://localhost:%s/ (HEROKU)" % http_port) launcher.setServiceParent(app) timer.setServiceParent(app) webservice.setServiceParent(app) return app
def project_environment(project):
    config = Config()
    eggstorage = FilesystemEggStorage(config)
    version, eggfile = eggstorage.get(project)
    if eggfile:
        prefix = '%s-%s-' % (project, version)
        fd, eggpath = tempfile.mkstemp(prefix=prefix, suffix='.egg')
        lf = os.fdopen(fd, 'wb')
        shutil.copyfileobj(eggfile, lf)
        lf.close()
        activate_egg(eggpath)
    else:
        eggpath = None
    try:
        assert 'scrapy.conf' not in sys.modules, "Scrapy settings already loaded"
        yield
    finally:
        if eggpath:
            os.remove(eggpath)
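A minimal usage sketch for the generator above. It assumes the function is meant to be used as a context manager (scrapyd's runner decorates it with contextlib.contextmanager, which this snippet omits); the project name 'myproject' is illustrative.

from contextlib import contextmanager

# In scrapyd's runner the function itself carries the @contextmanager decorator;
# wrapping it here keeps the snippet above unchanged.
project_env = contextmanager(project_environment)

with project_env('myproject'):   # 'myproject' is an illustrative project name
    # The project's latest egg (if any) is activated for the duration of this
    # block; the temporary egg copy is deleted on exit.
    pass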
def application(config): app = Application("Scrapyd") http_port = config.getint('http_port', 6800) bind_address = config.get('bind_address', '0.0.0.0') poll_interval = config.getfloat('poll_interval', 5) poller = QueuePoller(config) eggstorage = FilesystemEggStorage(config) schedpath = config.get('scheduler', 'scrapyd.scheduler.SpiderScheduler') schedCls = load_object(schedpath) scheduler = schedCls(config, app) environment = Environment(config) pubsub_path = config.get('pubsub', 'scrapyd.pubsub.BasePubSub') pubsubCls = load_object(pubsub_path) pubsub = pubsubCls(config, app) app.setComponent(IPoller, poller) app.setComponent(IEggStorage, eggstorage) app.setComponent(ISpiderScheduler, scheduler) app.setComponent(IEnvironment, environment) app.setComponent(IPubSub, pubsub) laupath = config.get('launcher', 'scrapyd.launcher.Launcher') laucls = load_object(laupath) launcher = laucls(config, app) timer = TimerService(poll_interval, poller.poll) webservice = TCPServer(http_port, server.Site(Root(config, app)), interface=bind_address) log.msg( format= "Scrapyd web console available at http://%(bind_address)s:%(http_port)s/", bind_address=bind_address, http_port=http_port) pubsub.setServiceParent(app) launcher.setServiceParent(app) timer.setServiceParent(app) webservice.setServiceParent(app) return app
def application(config): app = Application("Scrapyd") http_port = config.getint('http_port', 6800) if 'PORT' in os.environ: http_port = int(os.environ.get('PORT')) bind_address = '0.0.0.0' if 'PORT' in os.environ else config.get('bind_address', '127.0.0.1') poll_interval = config.getfloat('poll_interval', 5) poller = QueuePoller(config) eggstorage = FilesystemEggStorage(config) scheduler = SpiderScheduler(config) environment = Environment(config) app.setComponent(IPoller, poller) app.setComponent(IEggStorage, eggstorage) app.setComponent(ISpiderScheduler, scheduler) app.setComponent(IEnvironment, environment) laupath = config.get('launcher', 'scrapyd.launcher.Launcher') laucls = load_object(laupath) launcher = laucls(config, app) webpath = config.get('webroot', 'scrapyd.website.Root') webcls = load_object(webpath) timer = TimerService(poll_interval, poller.poll) webservice = TCPServer(http_port, server.Site(webcls(config, app)), interface=bind_address) log.msg(format="Scrapyd web console available at http://%(bind_address)s:%(http_port)s/", bind_address=bind_address, http_port=http_port) launcher.setServiceParent(app) timer.setServiceParent(app) webservice.setServiceParent(app) return app
class EggStorageTest(unittest.TestCase):

    def setUp(self):
        d = self.mktemp()
        config = Config(values={'eggs_dir': d})
        self.eggst = FilesystemEggStorage(config)

    def test_interface(self):
        verifyObject(IEggStorage, self.eggst)

    def test_put_get_list_delete(self):
        self.eggst.put(BytesIO(b"egg01"), 'mybot', '01')
        self.eggst.put(BytesIO(b"egg03"), 'mybot', '03/ver')
        self.eggst.put(BytesIO(b"egg02"), 'mybot', '02_my branch')
        self.assertEqual(self.eggst.list('mybot'),
                         ['01', '02_my_branch', '03_ver'])
        self.assertEqual(self.eggst.list('mybot2'), [])
        v, f = self.eggst.get('mybot')
        self.assertEqual(v, "03_ver")
        self.assertEqual(f.read(), b"egg03")
        f.close()
        v, f = self.eggst.get('mybot', '02_my branch')
        self.assertEqual(v, "02_my branch")
        self.assertEqual(f.read(), b"egg02")
        f.close()
        v, f = self.eggst.get('mybot', '02_my_branch')
        self.assertEqual(v, "02_my_branch")
        self.assertEqual(f.read(), b"egg02")
        f.close()
        self.eggst.delete('mybot', '02_my branch')
        self.assertEqual(self.eggst.list('mybot'), ['01', '03_ver'])
        self.eggst.delete('mybot', '03_ver')
        self.assertEqual(self.eggst.list('mybot'), ['01'])
        self.eggst.delete('mybot')
        self.assertEqual(self.eggst.list('mybot'), [])
class EggStorageTest(unittest.TestCase): def setUp(self): d = self.mktemp() config = Config(values={'eggs_dir': d}) self.eggst = FilesystemEggStorage(config) def test_interface(self): verifyObject(IEggStorage, self.eggst) def test_put_get_list_delete(self): self.eggst.put(StringIO("egg01"), 'mybot', '01') self.eggst.put(StringIO("egg03"), 'mybot', '03/ver') self.eggst.put(StringIO("egg02"), 'mybot', '02_my branch') self.assertEqual(self.eggst.list('mybot'), [ '01', '02_my_branch', '03_ver' ]) self.assertEqual(self.eggst.list('mybot2'), []) v, f = self.eggst.get('mybot') self.assertEqual(v, "03_ver") self.assertEqual(f.read(), "egg03") f.close() v, f = self.eggst.get('mybot', '02_my branch') self.assertEqual(v, "02_my branch") self.assertEqual(f.read(), "egg02") f.close() v, f = self.eggst.get('mybot', '02_my_branch') self.assertEqual(v, "02_my_branch") self.assertEqual(f.read(), "egg02") f.close() self.eggst.delete('mybot', '02_my branch') self.assertEqual(self.eggst.list('mybot'), ['01', '03_ver']) self.eggst.delete('mybot', '03_ver') self.assertEqual(self.eggst.list('mybot'), ['01']) self.eggst.delete('mybot') self.assertEqual(self.eggst.list('mybot'), [])
# Upload every project's latest egg from a local scrapyd eggs directory to
# another scrapyd instance via its addversion.json endpoint (Python 2 script
# using urllib2 and poster).
import os
from scrapyd.eggstorage import FilesystemEggStorage
from scrapyd.config import Config
import urllib2
from poster.encode import multipart_encode
from poster.streaminghttp import register_openers

register_openers()

source_dir = '/kf/scrapyd'
dest_url = 'http://localhost:6801/addversion.json'

source_eggs_dir = os.path.join(source_dir, 'eggs')
source_config = Config({'eggs_dir': source_eggs_dir})
source_egg_storage = FilesystemEggStorage(source_config)

for dir in os.listdir(source_eggs_dir):
    #print dir
    project = dir
    version, egg = source_egg_storage.get(project)
    print project, version
    post_data = {
        'egg': egg,
        'project': project,
        'version': version,
    }
    datagen, headers = multipart_encode(post_data)
    request = urllib2.Request(url=dest_url, headers=headers, data=datagen)
    try:
        res = urllib2.urlopen(request)
    except urllib2.HTTPError as e:
        print 'HTTPError: %s' % e
class TaskExecutor():
    def __init__(self, task, config=None):
        '''
        @type task: SpiderTask
        '''
        self.task = task
        if config is None:
            config = AgentConfig()
        if config.get('server_https_port'):
            self.service_base = 'https://%s:%d' % (config.get('server'),
                                                   config.getint('server_https_port'))
        else:
            self.service_base = 'http://%s:%d' % (config.get('server'),
                                                  config.getint('server_port'))
        self._f_output = None
        self.output_file = None
        self.p = None
        self.check_process_callback = None
        self.items_file = None
        self.ret_code = None
        self.workspace_dir = tempfile.mkdtemp(prefix='ddjob-%s-%s-' % (task.project_name,
                                                                       task.id))
        if not os.path.exists(self.workspace_dir):
            os.makedirs(self.workspace_dir)
        self.output_file = str(os.path.join(self.workspace_dir, '%s.log' % self.task.id))
        self._f_output = open(self.output_file, 'w')
        eggs_dir = os.path.join(self.workspace_dir, 'eggs')
        if not os.path.exists(eggs_dir):
            os.mkdir(eggs_dir)
        self.egg_storage = FilesystemEggStorage(
            scrapyd.config.Config(values={'eggs_dir': eggs_dir}))
        self.on_subprocess_start = None

    @gen.coroutine
    def execute(self):
        try:
            workspace = ProjectWorkspace(self.task.project_name)
            yield workspace.init()
            logger.debug('begin download egg.')
            egg_request_url = urlparse.urljoin(self.service_base,
                                               '/spiders/%d/egg' % self.task.spider_id)
            request = HTTPRequest(egg_request_url)
            client = AsyncHTTPClient()
            response = yield client.fetch(request)
            self.egg_storage.put(response.buffer, self.task.project_name,
                                 self.task.project_version)
            logger.debug('download egg done.')
            requirements = workspace.find_project_requirements(self.task.project_name,
                                                               egg_storage=self.egg_storage)
            yield workspace.pip_install(requirements)
            result = yield self.execute_subprocess()
        except ProcessFailed as e:
            logger.error(e)
            error_log = e.message
            if e.std_output:
                logger.error(e.std_output)
                error_log += e.std_output
            result = self.complete_with_error(error_log)
        except Exception as e:
            logger.error(e)
            error_log = e.message
            result = self.complete_with_error(error_log)
        raise gen.Return(result)

    def check_process(self):
        execute_result = self.p.poll()
        logger.debug('check process')
        if execute_result is not None:
            logger.info('task complete')
            self.complete(execute_result)

    def execute_subprocess(self):
        future = Future()
        # init items file
        workspace = ProjectWorkspace(self.task.project_name)
        self.items_file = os.path.join(self.workspace_dir, '%s.%s' % (self.task.id, 'jl'))
        python = workspace.python
        runner = 'scrapyd.runner'
        pargs = [python, '-m', runner, 'crawl', self.task.spider_name]
        for spider_parameter_key, spider_parameter_value in self.task.spider_parameters.items():
            pargs += [
                '-s',
                '%s=%s' % (spider_parameter_key, spider_parameter_value)
            ]
        env = os.environ.copy()
        env['SCRAPY_PROJECT'] = str(self.task.project_name)
        env['SCRAPY_JOB'] = str(self.task.id)
        env['SCRAPY_FEED_URI'] = str(path_to_file_uri(self.items_file))
        try:
            self.p = subprocess.Popen(pargs, env=env, stdout=self._f_output,
                                      cwd=self.workspace_dir, stderr=self._f_output)
            if self.on_subprocess_start:
                self.on_subprocess_start(self.task, self.p.pid)
        except Exception as e:
            return self.complete_with_error('Error when starting crawl subprocess : %s' % e)
        logger.info('job %s started on pid: %d' % (self.task.id, self.p.pid))

        def check_process():
            execute_result = self.p.poll()
            logger.debug('check process')
            if execute_result is not None:
                logger.info('task complete')
                future.set_result(self.complete(execute_result))

        self.check_process_callback = PeriodicCallback(check_process, 1 * 1000)
        self.check_process_callback.start()
        return future

    def result(self):
        return self

    def complete(self, ret_code):
        self._f_output.close()
        self.ret_code = ret_code
        self.check_process_callback.stop()
        self.check_process_callback = None
        return self.result()

    def complete_with_error(self, error_message):
        logger.error(error_message)
        self._f_output.write(error_message)
        self._f_output.close()
        self.ret_code = 1
        return self.result()

    def __del__(self):
        logger.debug('delete task executor for task %s' % self.task.id)
        if self.workspace_dir and os.path.exists(self.workspace_dir):
            shutil.rmtree(self.workspace_dir)

    @gen.coroutine
    def kill(self):
        logger.info('killing job %s' % self.task.id)
        if self.p:
            self.p.terminate()
        # give the process time to exit before force-killing it
        yield gen.sleep(10)
        if self.p:
            self.p.kill()
class ProjectWorkspace(object):
    pip = None
    python = None
    process = None
    project_workspace_dir = None
    project_check = None
    temp_dir = None

    def __init__(self, project_name):
        project_workspace_dir = os.path.abspath(
            os.path.join('workspace', project_name))
        self.project_workspace_dir = project_workspace_dir
        self.project_name = project_name
        self.egg_storage = FilesystemEggStorage(scrapyd.config.Config())
        if sys.platform.startswith('linux'):
            self.pip = os.path.join(project_workspace_dir, 'bin', 'pip')
            self.python = os.path.join(project_workspace_dir, 'bin', 'python')
        elif sys.platform.startswith('win'):
            self.pip = os.path.join(project_workspace_dir, 'Scripts', 'pip.exe')
            self.python = os.path.join(project_workspace_dir, 'Scripts', 'python.exe')
        else:
            raise NotImplementedError('Unsupported system %s' % sys.platform)

    def init(self):
        '''
        init project isolated workspace,
        :return: future
        '''
        future = Future()
        if os.path.exists(self.pip) and os.path.exists(self.python):
            future.set_result(self)
            return future

        logger.debug('start creating virtualenv.')
        try:
            process = Popen(['virtualenv', '--system-site-packages',
                             self.project_workspace_dir],
                            stdout=PIPE, stderr=PIPE)
        except Exception as e:
            future.set_exception(e)
            return future

        def check_process():
            logger.debug('create virtualenv process poll.')
            retcode = process.poll()
            if retcode is not None:
                if retcode == 0:
                    future.set_result(self)
                else:
                    std_output = process.stdout.read()
                    err_output = process.stderr.read()
                    future.set_exception(
                        ProcessFailed('Error when init workspace virtualenv ',
                                      std_output=std_output, err_output=err_output))
                return
            IOLoop.current().call_later(1, check_process)

        check_process()
        return future

    def find_project_requirements(self, project, egg_storage=None, eggf=None):
        if eggf is None:
            if egg_storage is None:
                egg_storage = FilesystemEggStorage(scrapyd.config.Config())
            version, eggf = egg_storage.get(project)
        try:
            prefix = '%s-nover-' % (project)
            fd, eggpath = tempfile.mkstemp(prefix=prefix, suffix='.egg')
            logger.debug('tmp egg file saved to %s' % eggpath)
            lf = os.fdopen(fd, 'wb')
            eggf.seek(0)
            shutil.copyfileobj(eggf, lf)
            lf.close()
            try:
                d = pkg_resources.find_distributions(eggpath).next()
            except StopIteration:
                raise ValueError("Unknown or corrupt egg")
            requirements = [str(x) for x in d.requires()]
            return requirements
        finally:
            if eggpath:
                os.remove(eggpath)

    def test_egg(self, eggf):
        future = Future()
        temp_dir = tempfile.mkdtemp('scrapydd-egg-%s' % self.project_name)
        self.temp_dir = temp_dir
        eggf.seek(0)
        egg_storage = FilesystemEggStorage(
            scrapyd.config.Config({'eggs_dir': os.path.join(temp_dir, 'eggs')}))
        egg_storage.put(eggf, project=self.project_name, version='1')
        eggf.seek(0)
        requirements = self._read_egg_requirements(eggf) + ['scrapyd']

        def after_spider_list(callback_future):
            logger.debug('after_spider_list')
            exc = callback_future.exception()
            if exc is not None:
                future.set_exception(exc)
                return
            spider_list = callback_future.result()
            #os.removedirs(temp_dir)
            future.set_result(spider_list)

        def after_pip_install(callback_future):
            logger.debug('after_pip_install')
            exc = callback_future.exception()
            if exc is not None:
                future.set_exception(exc)
                return
            self.spider_list(self.project_name,
                             cwd=temp_dir).add_done_callback(after_spider_list)

        self.pip_install(requirements).add_done_callback(after_pip_install)
        return future

    def _read_egg_requirements(self, eggf):
        try:
            prefix = '%s-%s-' % (self.project_name, 0)
            fd, eggpath = tempfile.mkstemp(prefix=prefix, suffix='.egg')
            logger.debug('tmp egg file saved to %s' % eggpath)
            lf = os.fdopen(fd, 'wb')
            eggf.seek(0)
            shutil.copyfileobj(eggf, lf)
            lf.close()
            try:
                d = pkg_resources.find_distributions(eggpath).next()
            except StopIteration:
                raise ValueError("Unknown or corrupt egg")
            requirements = [str(x) for x in d.requires()]
            return requirements
        finally:
            if eggpath:
                os.remove(eggpath)

    def pip_install(self, requirements):
        logger.debug('installing requirements: %s' % requirements)
        future = Future()
        try:
            process = Popen([self.pip, 'install'] + requirements,
                            stdout=PIPE, stderr=PIPE)
        except Exception as e:
            future.set_exception(e)
            return future

        def check_process():
            logger.debug('poll')
            retcode = process.poll()
            if retcode is not None:
                if retcode == 0:
                    future.set_result(self)
                else:
                    std_out = process.stdout.read()
                    err_out = process.stderr.read()
                    future.set_exception(
                        ProcessFailed(std_output=std_out, err_output=err_out))
                return
            IOLoop.current().call_later(1, check_process)

        check_process()
        return future

    def spider_list(self, project, cwd=None):
        future = Future()
        try:
            env = os.environ.copy()
            env['SCRAPY_PROJECT'] = project
            process = Popen([self.python, '-m', 'scrapyd.runner', 'list'],
                            env=env, cwd=cwd, stdout=PIPE, stderr=PIPE)
        except Exception as e:
            logger.error(e)
            future.set_exception(e)
            return future

        def check_process():
            logger.debug('poll')
            retcode = process.poll()
            if retcode is not None:
                if retcode == 0:
                    future.set_result(process.stdout.read().splitlines())
                else:
                    #future.set_exception(ProcessFailed(std_output=process.stdout.read(), err_output=process.stderr.read()))
                    future.set_exception(
                        InvalidProjectEgg(detail=process.stderr.read()))
                return
            IOLoop.current().call_later(1, check_process)

        check_process()
        return future

    def clearup(self):
        '''
        clean up temp files.
        :return:
        '''
        if self.temp_dir and os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)

    def put_egg(self, eggfile, version):
        eggfile.seek(0)
        self.egg_storage.put(eggfile=eggfile, project=self.project_name,
                             version=version)

    def get_egg(self, version=None):
        return self.egg_storage.get(self.project_name, version=version)

    def delete_egg(self, project, version=None):
        logger.info('deleting project eggs')
        return self.egg_storage.delete(project, version)

    def list_versions(self, project):
        return self.egg_storage.list(project)
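A condensed sketch of driving ProjectWorkspace from a Tornado coroutine, mirroring how TaskExecutor.execute uses it above. It assumes the project's egg is already present in the default scrapyd eggs directory; the project name is illustrative and error handling is omitted.

from tornado import gen
from tornado.ioloop import IOLoop

@gen.coroutine
def prepare_workspace():
    workspace = ProjectWorkspace('myproject')   # illustrative project name
    yield workspace.init()                      # create the virtualenv if it is missing
    requirements = workspace.find_project_requirements('myproject')
    yield workspace.pip_install(requirements)   # install deps into the virtualenv
    raise gen.Return(workspace)

workspace = IOLoop.current().run_sync(prepare_workspace)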
class EggStorageTest(unittest.TestCase):

    def setUp(self):
        d = self.mktemp()
        config = Config(values={"eggs_dir": d})
        self.eggst = FilesystemEggStorage(config)

    def test_interface(self):
        verifyObject(IEggStorage, self.eggst)

    def test_put_get_list_delete(self):
        self.eggst.put(StringIO("egg01"), "mybot", "01")
        self.eggst.put(StringIO("egg03"), "mybot", "03")
        self.eggst.put(StringIO("egg02"), "mybot", "02")
        self.assertEqual(self.eggst.list("mybot"), ["01", "02", "03"])
        self.assertEqual(self.eggst.list("mybot2"), [])
        v, f = self.eggst.get("mybot")
        self.assertEqual(v, "03")
        self.assertEqual(f.read(), "egg03")
        f.close()
        v, f = self.eggst.get("mybot", "02")
        self.assertEqual(v, "02")
        self.assertEqual(f.read(), "egg02")
        f.close()
        self.eggst.delete("mybot", "02")
        self.assertEqual(self.eggst.list("mybot"), ["01", "03"])
        self.eggst.delete("mybot")
        self.assertEqual(self.eggst.list("mybot"), [])
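A minimal standalone sketch of the same FilesystemEggStorage API the tests above exercise, assuming a scrapyd installation; the eggs directory and project name are illustrative.

from io import BytesIO

from scrapyd.config import Config
from scrapyd.eggstorage import FilesystemEggStorage

# 'my_eggs' and 'myproject' are illustrative names, not taken from the snippets above.
storage = FilesystemEggStorage(Config(values={'eggs_dir': 'my_eggs'}))
storage.put(BytesIO(b'fake egg bytes'), 'myproject', '1.0')

version, egg_file = storage.get('myproject')   # latest stored version
print(storage.list('myproject'))               # -> ['1.0']
egg_file.close()

storage.delete('myproject')                    # remove all versions of the project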