def smart_delay(self, *args, **kwargs):
    """Return cached result if it exists, send job to celery if needed"""
    # Derive a deterministic cache key from task identity + arguments.
    id_str = json.dumps([self.task_key, args, kwargs])
    cache_key = b64encode(id_str)
    cached = self.memcache.get(cache_key)
    if not cached:
        # Nothing cached: schedule the task, caller gets no payload yet.
        self.delay(*args, **kwargs)
        return
    age = time() - cached['timestamp']
    if age > self.result_fresh:
        # Result is stale: refresh it in the background.
        amqp_log("%s: scheduling task" % id_str)
        self.delay(*args, **kwargs)
    if age < self.result_expires:
        # Still within the expiry window: serve the cached payload.
        amqp_log("%s: smart delay cache hit" % id_str)
        return cached['payload']
def on_stats(self, backend_id, machine_id, start, stop, step, requestID):
    """Fetch monitoring stats for a machine and emit them to the client."""
    try:
        # Pad the window by 50s on each side; step arrives in ms.
        data = get_stats(self.user, backend_id, machine_id,
                         start - 50, stop + 50, step / 1000)
    except Exception as exc:
        # Best effort: log and bail, emit nothing on failure.
        amqp_log("Error getting stats: %r" % exc)
        return
    self.emit('stats', {
        'backend_id': backend_id,
        'machine_id': machine_id,
        'start': start,
        'stop': stop,
        'requestID': requestID,
        'metrics': data,
    })
def on_stats(self, backend_id, machine_id, start, stop, step, request_id,
             metrics):
    """Fetch monitoring stats and emit them, reporting bad-request errors.

    NOTE(review): the `metrics` argument is accepted but not used here —
    presumably part of the client protocol; kept for interface compat.
    """
    error = False
    try:
        data = get_stats(self.user, backend_id, machine_id, start, stop, step)
    except BadRequestError as exc:
        # Invalid request parameters: report the error with empty metrics.
        error = str(exc)
        data = []
    except Exception as exc:
        # Unexpected failure: log and emit nothing.
        amqp_log("Error getting stats: %r" % exc)
        return
    ret = {
        'backend_id': backend_id,
        'machine_id': machine_id,
        'start': start,
        'stop': stop,
        'request_id': request_id,
        'metrics': data,
    }
    if error:
        ret['error'] = error
    self.emit('stats', ret)
def run_deploy_script(self, email, backend_id, machine_id, command,
                      key_id=None, username=None, password=None, port=22):
    """Run a deployment command on a machine over ssh and notify the user.

    Retries via celery (up to 5 times) while the machine has no public
    IPv4 address or ssh is unavailable.

    Fixes over the previous revision:
    - unused ``ssh_command`` import dropped;
    - the outer error handler reports ``machine_id`` instead of
      ``node.id``, since ``node`` is unbound (NameError) when the
      failure happens before the node lookup completes.
    """
    from mist.io.methods import connect_provider
    from mist.io.methods import notify_user, notify_admin

    user = user_from_email(email)
    try:
        # find the node we're looking for and get its hostname
        conn = connect_provider(user.backends[backend_id])
        nodes = conn.list_nodes()
        node = None
        for n in nodes:
            if n.id == machine_id:
                node = n
                break
        if node and len(node.public_ips):
            # filter out IPv6 addresses
            ips = [ip for ip in node.public_ips if ':' not in ip]
            host = ips[0]
        else:
            # machine not found or no public IP yet: retry later
            raise self.retry(exc=Exception(), countdown=60, max_retries=5)
        try:
            from mist.io.shell import Shell
            shell = Shell(host)
            key_id, ssh_user = shell.autoconfigure(user, backend_id, node.id,
                                                   key_id, username, password,
                                                   port)
            start_time = time()
            retval, output = shell.command(command)
            execution_time = time() - start_time
            shell.disconnect()
            msg = """
Command: %s
Return value: %s
Duration: %s seconds
Output:
%s""" % (command, retval, execution_time, output)
            if retval:
                notify_user(user, "Deployment script failed for machine "
                                  "%s (%s)" % (node.name, node.id), msg)
                amqp_log("Deployment script failed for user %s machine "
                         "%s (%s): %s" % (user, node.name, node.id, msg))
            else:
                notify_user(user, "Deployment script succeeded for machine "
                                  "%s (%s)" % (node.name, node.id), msg)
                amqp_log("Deployment script succeeded for user %s machine "
                         "%s (%s): %s" % (user, node.name, node.id, msg))
        except ServiceUnavailableError as exc:
            # ssh not reachable yet: let celery reschedule us
            raise self.retry(exc=exc, countdown=60, max_retries=5)
    except Exception as exc:
        if str(exc).startswith('Retry'):
            # celery Retry bubbling through: nothing to report
            return
        # Use machine_id, not node.id: node may be unbound here.
        amqp_log("Deployment script failed for machine %s in backend %s by "
                 "user %s after 5 retries: %s" % (machine_id, backend_id,
                                                  email, repr(exc)))
        notify_user(user, "Deployment script failed for machine %s after 5 "
                          "retries" % machine_id)
        notify_admin("Deployment script failed for machine %s in backend %s "
                     "by user %s after 5 retries" % (machine_id, backend_id,
                                                     email), repr(exc))
def post_deploy_steps(self, email, backend_id, machine_id, monitoring,
                      command, key_id=None, username=None, password=None,
                      port=22):
    """Run post-deploy steps on a newly created machine.

    Connects over ssh even when there is no command (to create the key
    association needed to enable monitoring), optionally runs the
    deployment command, then optionally enables monitoring. Retries via
    celery while the machine has no public IPv4 address or ssh is
    unavailable.

    Fixes over the previous revision:
    - dropped the unused ``ssh_command`` import and the unconditional
      ``enable_monitoring`` import (it was immediately shadowed by the
      ``multi_user`` conditional import below);
    - ``print`` call form instead of the py2-only statement;
    - the outer error handler reports ``machine_id`` instead of
      ``node.id``, since ``node`` is unbound when the failure happens
      before the node lookup completes.
    """
    from mist.io.methods import connect_provider
    from mist.io.methods import notify_user, notify_admin
    # Pick the enable_monitoring implementation matching this deployment.
    if multi_user:
        from mist.core.methods import enable_monitoring
    else:
        from mist.io.methods import enable_monitoring

    user = user_from_email(email)
    try:
        # find the node we're looking for and get its hostname
        conn = connect_provider(user.backends[backend_id])
        nodes = conn.list_nodes()
        node = None
        for n in nodes:
            if n.id == machine_id:
                node = n
                break
        if node and len(node.public_ips):
            # filter out IPv6 addresses
            ips = [ip for ip in node.public_ips if ':' not in ip]
            host = ips[0]
        else:
            # machine not found or no public IP yet: retry later
            raise self.retry(exc=Exception(), countdown=120, max_retries=5)
        try:
            from mist.io.shell import Shell
            shell = Shell(host)
            # connect with ssh even if no command, to create association
            # to be able to enable monitoring
            key_id, ssh_user = shell.autoconfigure(
                user, backend_id, node.id, key_id, username, password, port
            )
            if command:
                start_time = time()
                retval, output = shell.command(command)
                execution_time = time() - start_time
                output = output.decode('utf-8', 'ignore')
                msg = ("Command: %s\n"
                       "Return value: %s\n"
                       "Duration: %d seconds\n"
                       "Output:%s\n") % (command, retval, execution_time,
                                         output)
                msg = msg.encode('utf-8', 'ignore')
                msg_title = "Deployment script %s for machine %s (%s)" % (
                    'failed' if retval else 'succeeded', node.name, node.id
                )
                notify_user(user, msg_title, msg)
            shell.disconnect()
            if monitoring:
                try:
                    enable_monitoring(
                        user, backend_id, node.id,
                        name=node.name,
                        dns_name=node.extra.get('dns_name', ''),
                        public_ips=ips, no_ssh=False, dry=False,
                    )
                except Exception as e:
                    # best effort: machine deployed fine, just report the
                    # monitoring failure
                    print(repr(e))
                    notify_user(user, "Enable monitoring failed for machine "
                                      "%s (%s)" % (node.name, node.id),
                                repr(e))
                    notify_admin('Enable monitoring on creation failed for '
                                 'user %s machine %s: %r' % (email, node.name,
                                                             e))
        except (ServiceUnavailableError, SSHException) as exc:
            # ssh not reachable yet: let celery reschedule us
            raise self.retry(exc=exc, countdown=60, max_retries=5)
    except Exception as exc:
        if str(exc).startswith('Retry'):
            # celery Retry: re-raise so the task gets rescheduled
            raise
        # Use machine_id, not node.id: node may be unbound here.
        amqp_log("Deployment script failed for machine %s in backend %s by "
                 "user %s after 5 retries: %s" % (machine_id, backend_id,
                                                  email, repr(exc)))
        notify_user(user, "Deployment script failed for machine %s after 5 "
                          "retries" % machine_id)
        notify_admin("Deployment script failed for machine %s in backend %s "
                     "by user %s after 5 retries" % (machine_id, backend_id,
                                                     email), repr(exc))
def run(self, *args, **kwargs):
    """Execute the task, cache/publish its result and self-reschedule.

    Results are cached in memcache and pushed to the user's amqp
    exchange; if ``self.polling`` is set, the task reschedules itself
    every ``self.result_fresh`` seconds. A ``seq_id`` (popped from
    kwargs) identifies one chain of reruns so concurrent chains with the
    same arguments detect each other and stop.

    Fixes the previously acknowledged-broken handling of a cached error
    written by a different sequence (the old code just did ``pass`` with
    a "This is not working!" note): a rerun belonging to another
    sequence now stops, while a fresh external call takes over the
    error flow.
    """
    email = args[0]
    # seq_id is an id for the sequence of periodic tasks, to avoid
    # running multiple concurrent sequences of the same task with the
    # same arguments. it is empty on first run, constant afterwards
    seq_id = kwargs.pop('seq_id', '')
    id_str = json.dumps([self.task_key, args, kwargs])
    cache_key = b64encode(id_str)
    cached_err = self.memcache.get(cache_key + 'error')
    if cached_err:
        # task has been failing recently
        if seq_id != cached_err['seq_id']:
            if seq_id:
                # other sequence of tasks has taken over
                return
            else:
                # taking over from other sequence; the cached error will
                # be deleted or overwritten below
                cached_err = None
    if not amqp_user_listening(email):
        # noone is waiting for result, stop trying, but flush cached errors
        self.memcache.delete(cache_key + 'error')
        return
    # check cache to stop iteration if other sequence has started
    cached = self.memcache.get(cache_key)
    if cached:
        if seq_id and seq_id != cached['seq_id']:
            amqp_log("%s: found new cached seq_id [%s], "
                     "stopping iteration of [%s]" % (id_str,
                                                     cached['seq_id'],
                                                     seq_id))
            return
        elif not seq_id and time() - cached['timestamp'] < self.result_fresh:
            amqp_log("%s: fresh task submitted with fresh cached result "
                     ", dropping" % id_str)
            return
    if not seq_id:
        # this task is called externally, not a rerun, create a seq_id
        amqp_log("%s: fresh task submitted [%s]" % (id_str, seq_id))
        seq_id = uuid4().hex
    # actually run the task
    try:
        data = self.execute(*args, **kwargs)
    except Exception as exc:
        # error handling: record failure timestamps (relative to the
        # first failure) and ask error_rerun_handler for a retry delay;
        # None means give up.
        now = time()
        if not cached_err:
            cached_err = {'seq_id': seq_id, 'timestamps': []}
        cached_err['timestamps'].append(now)
        x0 = cached_err['timestamps'][0]
        rel_points = [x - x0 for x in cached_err['timestamps']]
        rerun = self.error_rerun_handler(exc, rel_points, *args, **kwargs)
        if rerun is not None:
            self.memcache.set(cache_key + 'error', cached_err)
            kwargs['seq_id'] = seq_id
            self.apply_async(args, kwargs, countdown=rerun)
        else:
            self.memcache.delete(cache_key + 'error')
        amqp_log("%s: error %r, rerun %s" % (id_str, exc, rerun))
        return
    else:
        # success: clear any recorded error history
        self.memcache.delete(cache_key + 'error')
        cached = {'timestamp': time(), 'payload': data, 'seq_id': seq_id}
        ok = amqp_publish_user(email, routing_key=self.task_key, data=data)
        if not ok:
            # echange closed, no one gives a shit, stop repeating, why try?
            amqp_log("%s: exchange closed" % id_str)
            return
        kwargs['seq_id'] = seq_id
        self.memcache.set(cache_key, cached)
        if self.polling:
            amqp_log("%s: will rerun in %d secs [%s]" % (id_str,
                                                         self.result_fresh,
                                                         seq_id))
            self.apply_async(args, kwargs, countdown=self.result_fresh)
def run(self, *args, **kwargs):
    """Execute the task, cache/publish its result and self-reschedule.

    Results are cached in memcache and pushed to the user's amqp
    exchange; if ``self.polling`` is set, the task reschedules itself
    every ``self.result_fresh`` seconds. A ``seq_id`` (popped from
    kwargs) identifies one chain of reruns so that concurrent chains
    with the same arguments can detect each other and stop.
    """
    # First positional arg is the user's email (used for amqp routing).
    email = args[0]
    # seq_id is an id for the sequence of periodic tasks, to avoid
    # running multiple concurrent sequences of the same task with the
    # same arguments. it is empty on first run, constant afterwards
    seq_id = kwargs.pop('seq_id', '')
    # Cache key derives from task identity + args (seq_id was popped
    # above so it doesn't affect the key).
    id_str = json.dumps([self.task_key, args, kwargs])
    cache_key = b64encode(id_str)
    cached_err = self.memcache.get(cache_key + 'error')
    if cached_err:
        # task has been failing recently
        if seq_id != cached_err['seq_id']:
            # other sequence of task already handling this error flow
            # This is not working! Passing instead
            #return
            # NOTE(review): known-broken branch (see comment above) —
            # a later revision of this method returns/takes-over here.
            pass
    if not amqp_user_listening(email):
        # noone is waiting for result, stop trying, but flush cached erros
        if cached_err:
            self.memcache.delete(cache_key + 'error')
        return
    # check cache to stop iteration if other sequence has started
    cached = self.memcache.get(cache_key)
    if cached:
        if seq_id and seq_id != cached['seq_id']:
            # A different rerun chain owns the cache: stop this one.
            amqp_log("%s: found new cached seq_id [%s], "
                     "stopping iteration of [%s]" % (id_str,
                                                     cached['seq_id'],
                                                     seq_id))
            return
        elif not seq_id and time() - cached['timestamp'] < self.result_fresh:
            # Fresh external submission while a fresh result is cached:
            # an active chain is already serving results, drop this one.
            amqp_log("%s: fresh task submitted with fresh cached result "
                     ", dropping" % id_str)
            return
    if not seq_id:
        # this task is called externally, not a rerun, create a seq_id
        amqp_log("%s: fresh task submitted [%s]" % (id_str, seq_id))
        seq_id = uuid4().hex
    # actually run the task
    try:
        data = self.execute(*args, **kwargs)
    except Exception as exc:
        # error handling: record failure timestamps relative to the
        # first failure and ask error_rerun_handler for a retry delay;
        # None means give up and clear the error record.
        now = time()
        if not cached_err:
            cached_err = {'seq_id': seq_id, 'timestamps': []}
        cached_err['timestamps'].append(now)
        x0 = cached_err['timestamps'][0]
        rel_points = [x - x0 for x in cached_err['timestamps']]
        rerun = self.error_rerun_handler(exc, rel_points, *args, **kwargs)
        if rerun is not None:
            self.memcache.set(cache_key + 'error', cached_err)
            kwargs['seq_id'] = seq_id
            self.apply_async(args, kwargs, countdown=rerun)
        else:
            self.memcache.delete(cache_key + 'error')
        amqp_log("%s: error %r, rerun %s" % (id_str, exc, rerun))
        return
    else:
        # Success: clear any recorded error history, cache and publish.
        if cached_err:
            self.memcache.delete(cache_key + 'error')
        cached = {'timestamp': time(), 'payload': data, 'seq_id': seq_id}
        ok = amqp_publish_user(email, routing_key=self.task_key, data=data)
        if not ok:
            # echange closed, no one gives a shit, stop repeating, why try?
            amqp_log("%s: exchange closed" % id_str)
            return
        kwargs['seq_id'] = seq_id
        self.memcache.set(cache_key, cached)
        if self.polling:
            # Periodic mode: schedule the next iteration of this chain.
            amqp_log("%s: will rerun in %d secs [%s]" % (id_str,
                                                         self.result_fresh,
                                                         seq_id))
            self.apply_async(args, kwargs, countdown=self.result_fresh)
def run_deploy_script(self, email, backend_id, machine_id, command,
                      key_id=None, username=None, password=None, port=22):
    """Run a deployment command on a machine over ssh and notify the user.

    Retries via celery (up to 5 times) while the machine has no public
    IPv4 address or ssh is unavailable.

    Fixes over the previous revision:
    - unused ``ssh_command`` import dropped;
    - the outer error handler reports ``machine_id`` instead of
      ``node.id``, since ``node`` is unbound (NameError) when the
      failure happens before the node lookup completes.
    """
    from mist.io.methods import connect_provider
    from mist.io.methods import notify_user, notify_admin

    user = user_from_email(email)
    try:
        # find the node we're looking for and get its hostname
        conn = connect_provider(user.backends[backend_id])
        nodes = conn.list_nodes()
        node = None
        for n in nodes:
            if n.id == machine_id:
                node = n
                break
        if node and len(node.public_ips):
            # filter out IPv6 addresses
            ips = [ip for ip in node.public_ips if ':' not in ip]
            host = ips[0]
        else:
            # machine not found or no public IP yet: retry later
            raise self.retry(exc=Exception(), countdown=60, max_retries=5)
        try:
            from mist.io.shell import Shell
            shell = Shell(host)
            key_id, ssh_user = shell.autoconfigure(user, backend_id, node.id,
                                                   key_id, username, password,
                                                   port)
            start_time = time()
            retval, output = shell.command(command)
            execution_time = time() - start_time
            shell.disconnect()
            msg = """
Command: %s
Return value: %s
Duration: %s seconds
Output:
%s""" % (command, retval, execution_time, output)
            if retval:
                notify_user(
                    user,
                    "Deployment script failed for machine %s (%s)"
                    % (node.name, node.id), msg)
                amqp_log(
                    "Deployment script failed for user %s machine %s (%s): %s"
                    % (user, node.name, node.id, msg))
            else:
                notify_user(
                    user,
                    "Deployment script succeeded for machine %s (%s)"
                    % (node.name, node.id), msg)
                amqp_log(
                    "Deployment script succeeded for user %s machine "
                    "%s (%s): %s" % (user, node.name, node.id, msg))
        except ServiceUnavailableError as exc:
            # ssh not reachable yet: let celery reschedule us
            raise self.retry(exc=exc, countdown=60, max_retries=5)
    except Exception as exc:
        if str(exc).startswith('Retry'):
            # celery Retry bubbling through: nothing to report
            return
        # Use machine_id, not node.id: node may be unbound here.
        amqp_log(
            "Deployment script failed for machine %s in backend %s by "
            "user %s after 5 retries: %s"
            % (machine_id, backend_id, email, repr(exc)))
        notify_user(
            user,
            "Deployment script failed for machine %s after 5 retries"
            % machine_id)
        notify_admin(
            "Deployment script failed for machine %s in backend %s by "
            "user %s after 5 retries" % (machine_id, backend_id, email),
            repr(exc))
def run(self, *args, **kwargs):
    """Execute the task, cache/publish its result and self-reschedule.

    Results are cached in memcache and pushed to the user's amqp
    exchange; if ``self.polling`` is set, the task reschedules itself
    every ``self.result_fresh`` seconds. A ``seq_id`` (popped from
    kwargs) identifies one chain of reruns so that concurrent chains
    with the same arguments can detect each other and stop.
    """
    # First positional arg is the user's email (used for amqp routing).
    email = args[0]
    # seq_id is an id for the sequence of periodic tasks, to avoid
    # running multiple concurrent sequences of the same task with the
    # same arguments. it is empty on first run, constant afterwards
    seq_id = kwargs.pop("seq_id", "")
    # Cache key derives from task identity + args (seq_id was popped
    # above so it doesn't affect the key).
    id_str = json.dumps([self.task_key, args, kwargs])
    cache_key = b64encode(id_str)
    cached_err = self.memcache.get(cache_key + "error")
    if cached_err:
        # task has been failing recently
        if seq_id != cached_err["seq_id"]:
            if seq_id:
                # other sequence of tasks has taken over
                return
            else:
                # taking over from other sequence
                cached_err = None
                # cached err will be deleted or overwritten in a while
                # self.memcache.delete(cache_key + 'error')
    if not amqp_user_listening(email):
        # noone is waiting for result, stop trying, but flush cached erros
        self.memcache.delete(cache_key + "error")
        return
    # check cache to stop iteration if other sequence has started
    cached = self.memcache.get(cache_key)
    if cached:
        if seq_id and seq_id != cached["seq_id"]:
            # A different rerun chain owns the cache: stop this one.
            amqp_log(
                "%s: found new cached seq_id [%s], "
                "stopping iteration of [%s]" % (id_str, cached["seq_id"],
                                                seq_id)
            )
            return
        elif not seq_id and time() - cached["timestamp"] < self.result_fresh:
            # Fresh external submission while a fresh result is cached:
            # an active chain is already serving results, drop this one.
            amqp_log("%s: fresh task submitted with fresh cached result "
                     ", dropping" % id_str)
            return
    if not seq_id:
        # this task is called externally, not a rerun, create a seq_id
        amqp_log("%s: fresh task submitted [%s]" % (id_str, seq_id))
        seq_id = uuid4().hex
    # actually run the task
    try:
        data = self.execute(*args, **kwargs)
    except Exception as exc:
        # error handling: record failure timestamps relative to the
        # first failure and ask error_rerun_handler for a retry delay;
        # None means give up and clear the error record.
        if isinstance(exc, SoftTimeLimitExceeded):
            # task exceeded celery's soft time limit; log it explicitly
            log.error("SoftTimeLimitExceeded: %s", id_str)
        now = time()
        if not cached_err:
            cached_err = {"seq_id": seq_id, "timestamps": []}
        cached_err["timestamps"].append(now)
        x0 = cached_err["timestamps"][0]
        rel_points = [x - x0 for x in cached_err["timestamps"]]
        rerun = self.error_rerun_handler(exc, rel_points, *args, **kwargs)
        if rerun is not None:
            self.memcache.set(cache_key + "error", cached_err)
            kwargs["seq_id"] = seq_id
            self.apply_async(args, kwargs, countdown=rerun)
        else:
            self.memcache.delete(cache_key + "error")
        amqp_log("%s: error %r, rerun %s" % (id_str, exc, rerun))
        return
    else:
        # Success: clear any recorded error history, cache and publish.
        self.memcache.delete(cache_key + "error")
        cached = {"timestamp": time(), "payload": data, "seq_id": seq_id}
        ok = amqp_publish_user(email, routing_key=self.task_key, data=data)
        if not ok:
            # echange closed, no one gives a shit, stop repeating, why try?
            amqp_log("%s: exchange closed" % id_str)
            return
        kwargs["seq_id"] = seq_id
        self.memcache.set(cache_key, cached)
        if self.polling:
            # Periodic mode: schedule the next iteration of this chain.
            amqp_log("%s: will rerun in %d secs [%s]" % (id_str,
                                                         self.result_fresh,
                                                         seq_id))
            self.apply_async(args, kwargs, countdown=self.result_fresh)