def backoff_request(self, url, method, payload=None, num_attempts=1, initial_wait=0, need_result=True):
    """Issue a GET or POST to ``url``, retrying with a randomised, growing delay.

    Parameters:
        url: address to contact.
        method: "GET" or "POST". Any other value raises inside the retry
            loop and is therefore treated like a failed attempt.
        payload: body passed to the POST helpers; unused for GET.
        num_attempts: maximum number of tries.
        initial_wait: seconds to wait after the first failed attempt. A
            falsy or non-positive value (including the default of 0) selects
            5 seconds, because a zero delay would never grow under the
            multiplicative back-off below.
        need_result: for a single-attempt POST, False selects the
            fire-and-forget helper, whose outcome is delivered to
            ``self.master_post_result_callback``.

    Returns:
        ``(200, content)`` on success, or None when the fire-and-forget POST
        path was taken.

    Raises:
        WorkerShutdownException: all attempts were used up (or skipped) and
            ``self.stop_event`` is set.
        MasterNotRespondingException: all attempts were used up and
            ``self.stop_event`` is not set.
    """
    if initial_wait and initial_wait > 0:
        wait = initial_wait
    else:
        wait = 5

    for _ in range(num_attempts):
        if self.stop_event.is_set():
            break
        try:
            try:
                if method == "POST":
                    # The fire-and-forget helper is only used when a single
                    # attempt was requested; otherwise the blocking call is made.
                    if need_result or num_attempts > 1:
                        content = post_string(url, payload)
                    else:
                        post_string_noreturn(url, payload, result_callback=self.master_post_result_callback)
                        return
                elif method == "GET":
                    content = get_string(url)
                else:
                    raise Exception("Invalid method %s" % method)
                return 200, content
            except Exception as e:
                ciel.log("Backoff-request failed with exception %s; re-raising MasterNotResponding" % e, "MASTER_PROXY", logging.ERROR)
                raise MasterNotRespondingException()
        except Exception:
            # Only ordinary errors are retried; KeyboardInterrupt and
            # SystemExit propagate instead of being swallowed.
            ciel.log.error("Error contacting master", "MSTRPRXY", logging.WARN, True)
        # Waiting on the event (rather than sleeping) lets a shutdown request
        # cut the delay short.
        self.stop_event.wait(wait)
        wait += wait * random.uniform(0.5, 1.5)

    ciel.log.error("Given up trying to contact master", "MSTRPRXY", logging.ERROR, True)
    if self.stop_event.is_set():
        raise WorkerShutdownException()
    else:
        raise MasterNotRespondingException()
def create_job_for_process(self, record):
    """Submit a root 'proc' task for ``record`` to the master's job endpoint.

    The reference returned by ``self.get_reference_for_process(record)`` is
    used both as the task's only dependency and as its private data. The
    descriptor is serialised with ``SWReferenceJSONEncoder`` and POSTed to
    ``control/job/`` relative to ``self.worker.master_url``. The response
    body is not used.

    Raises:
        Exception: whatever serialisation or the POST raised, re-raised
            unchanged after a warning has been logged.
    """
    ref = self.get_reference_for_process(record)
    root_task_descriptor = {'handler': 'proc', 'dependencies': [ref], 'task_private': ref}
    master_task_submit_uri = urlparse.urljoin(self.worker.master_url, "control/job/")
    try:
        message = simplejson.dumps(root_task_descriptor, cls=SWReferenceJSONEncoder)
        post_string(master_task_submit_uri, message)
    except Exception:
        ciel.log('Network error submitting process job to master', 'PROCESSPOOL', logging.WARN)
        # A bare raise keeps the original traceback; "raise e" would restart
        # it from this line under Python 2.
        raise
def backoff_request(self, url, method, payload=None, need_result=True, callback=None):
    """Make one GET or POST request to ``url``; no retries are attempted.

    Parameters:
        url: address to contact.
        method: "GET" or "POST"; any other value raises ``Exception``.
        payload: body passed to the POST helpers; unused for GET.
        need_result: for POST, True makes the blocking call and returns its
            content; False hands the request to the no-return helper.
        callback: result callback for the no-return POST; defaults to
            ``self.master_post_result_callback`` when None.

    Returns:
        ``(200, content)`` for a GET or a POST with ``need_result``. None if
        ``self.stop_event`` is already set or the no-return POST was used.

    Raises:
        Whatever the request (or the invalid-method check) raised; the error
        is logged and then re-raised unchanged.
    """
    # Nothing is sent once shutdown has been requested.
    if self.stop_event.is_set():
        return None

    try:
        if method == "POST":
            if not need_result:
                on_result = self.master_post_result_callback if callback is None else callback
                post_string_noreturn(url, payload, result_callback=on_result)
                return None
            return 200, post_string(url, payload)
        if method == "GET":
            return 200, get_string(url)
        raise Exception("Invalid method %s" % method)
    except:
        ciel.log("Error attempting to contact master, aborting", "MSTRPRXY", logging.WARNING, True)
        raise
def master_main(options):
    """Build the master's components, start the engine and block on it.

    In order, this creates and subscribes the deferred-work plugin, worker
    pool, job pool, backup sender, optional recovery monitor and recovery
    manager, prepares the block store directory, mounts ``MasterRoot`` in
    CherryPy (optionally behind a lighttpd/FastCGI adapter), starts
    ``ciel.engine``, optionally announces this master to every worker URL
    listed in ``options.workerlist``, and finally calls
    ``ciel.engine.block()``.

    Parameters:
        options: parsed options object. The attributes read here are
            ``journaldir``, ``hostname``, ``blockstore``, ``master``,
            ``staticbase``, ``lighty_conf`` and ``workerlist``.
    """
    create_pycurl_thread(ciel.engine)

    deferred_worker = DeferredWorkPlugin(ciel.engine)
    deferred_worker.subscribe()

    worker_pool = WorkerPool(ciel.engine, deferred_worker, None)
    worker_pool.subscribe()

    task_failure_investigator = TaskFailureInvestigator(worker_pool, deferred_worker)

    job_pool = JobPool(ciel.engine, options.journaldir, None, task_failure_investigator, deferred_worker, worker_pool)
    job_pool.subscribe()

    # The job pool needs the worker pool at construction time, so the
    # reference in the other direction is attached once both exist.
    worker_pool.job_pool = job_pool

    backup_sender = BackupSender(cherrypy.engine)
    backup_sender.subscribe()

    if options.hostname is not None:
        local_hostname = options.hostname
    else:
        local_hostname = socket.getfqdn()
    local_port = cherrypy.config.get('server.socket_port')
    master_netloc = '%s:%d' % (local_hostname, local_port)
    ciel.log('Local port is %d' % local_port, 'STARTUP', logging.INFO)

    if options.blockstore is None:
        # NOTE(review): the TEMP environment variable, when set, is used as
        # the mkdtemp *prefix* rather than as the parent directory; confirm
        # this is intended.
        static_content_root = tempfile.mkdtemp(prefix=os.getenv('TEMP', default='/tmp/sw-files-'))
    else:
        static_content_root = options.blockstore
    block_store_dir = os.path.join(static_content_root, "data")
    try:
        os.mkdir(block_store_dir)
    except OSError as e:
        # An already-existing directory is the expected case and is ignored.
        # Any other failure is reported instead of being hidden, but startup
        # still continues as it did before.
        if not os.path.isdir(block_store_dir):
            ciel.log('Unable to create block store directory %s: %s' % (block_store_dir, e), 'STARTUP', logging.WARNING)

    block_store = BlockStore(local_hostname, local_port, block_store_dir)
    block_store.build_pin_set()
    block_store.check_local_blocks()

    if options.master is not None:
        monitor = MasterRecoveryMonitor(cherrypy.engine, 'http://%s/' % master_netloc, options.master, job_pool)
        monitor.subscribe()
    else:
        monitor = None

    recovery_manager = RecoveryManager(ciel.engine, job_pool, block_store, deferred_worker)
    recovery_manager.subscribe()

    root = MasterRoot(worker_pool, block_store, job_pool, backup_sender, monitor)

    cherrypy.config.update({"server.thread_pool": 50})

    cherrypy_conf = dict()
    if options.staticbase is not None:
        cherrypy_conf["/skyweb"] = {"tools.staticdir.on": True, "tools.staticdir.dir": options.staticbase}

    app = cherrypy.tree.mount(root, "", cherrypy_conf)

    lighty_conf_template = options.lighty_conf
    if lighty_conf_template is not None:
        lighty = LighttpdAdapter(ciel.engine, lighty_conf_template, static_content_root, local_port)
        lighty.subscribe()
        # Zap CherryPy's original flavour server
        cherrypy.server.unsubscribe()
        server = cherrypy.process.servers.FlupFCGIServer(application=app, bindAddress=lighty.socket_path)
        adapter = cherrypy.process.servers.ServerAdapter(cherrypy.engine, httpserver=server, bind_addr=lighty.socket_path)
        # Insert a FastCGI server in its place
        adapter.subscribe()

    if hasattr(ciel.engine, "signal_handler"):
        ciel.engine.signal_handler.subscribe()
    if hasattr(ciel.engine, "console_control_handler"):
        ciel.engine.console_control_handler.subscribe()

    ciel.engine.start()

    if options.workerlist is not None:
        master_details = {'netloc': master_netloc}
        master_details_as_json = simplejson.dumps(master_details)
        with open(options.workerlist, "r") as f:
            for line in f:
                # Drop the line terminator so it cannot end up inside the
                # URL, and skip blank lines rather than posting to them.
                worker_url = line.strip()
                if not worker_url:
                    continue
                try:
                    post_string(urllib2.urlparse.urljoin(worker_url, 'control/master/'), master_details_as_json)
                    # Worker will be created by a callback.
                except Exception:
                    # One unreachable worker must not stop the others from
                    # being contacted.
                    ciel.log.error("Error adding worker: %s" % (worker_url, ), "WORKER", logging.WARNING)

    ciel.engine.block()