def install_python_gdeploy():
    """Install python-gdeploy on this node through an ansible module.

    ``NS.config.data['package_source_type']`` selects the source:
    ``pip`` installs the upstream master tarball, ``rpm`` installs the
    ``python-gdeploy`` package through the yum module.

    Raises:
        FlowExecutionFailedError: if the source type is neither ``pip``
            nor ``rpm``, if the module result is flagged ``failed``, or
            if the ansible executable could not be generated.
    """
    source_type = NS.config.data['package_source_type']
    if source_type == 'pip':
        module_path = "packaging/language/pip.py"
        module_args = {
            "name": "https://github.com/Tendrl/python-gdeploy/"
                    "archive/master.tar.gz",
            "editable": "false",
        }
    elif source_type == 'rpm':
        module_path = "packaging/os/yum.py"
        module_args = {"name": "python-gdeploy"}
    else:
        raise FlowExecutionFailedError(
            "Failed to install python-gdeploy. Invalid package source type")

    try:
        runner = ansible_module_runner.AnsibleRunner(
            module_path, **module_args)
    except ansible_module_runner.AnsibleModuleNotFound:
        # Backward compat ansible<=2.2: modules live under "core/"
        runner = ansible_module_runner.AnsibleRunner(
            "core/" + module_path, **module_args)

    try:
        result, _err = runner.run()
    except ansible_module_runner.AnsibleExecutableGenerationFailed:
        raise FlowExecutionFailedError("Failed to install python-gdeploy")

    if result.get('failed', None):
        raise FlowExecutionFailedError(
            "Failed to install python-gdeploy. %s" % result['msg'])
def install_gdeploy():
    """Install the ``gdeploy`` package on this node via the yum module.

    Raises:
        FlowExecutionFailedError: if the module result is flagged
            ``failed`` or the ansible executable could not be generated.
    """
    module_path = "packaging/os/yum.py"
    module_args = {"name": "gdeploy"}

    try:
        runner = ansible_module_runner.AnsibleRunner(
            module_path, **module_args)
    except ansible_module_runner.AnsibleModuleNotFound:
        # Backward compat ansible<=2.2: modules live under "core/"
        runner = ansible_module_runner.AnsibleRunner(
            "core/" + module_path, **module_args)

    try:
        result, _err = runner.run()
    except ansible_module_runner.AnsibleExecutableGenerationFailed:
        raise FlowExecutionFailedError("Failed to install gdeploy")

    if result.get('failed', None):
        raise FlowExecutionFailedError(
            "Failed to install gdeploy. %s" % result['msg'])
def wait_for_task(task_id):
    """Poll a ceph-installer task until it ends or the wait runs out.

    The provisioner plugin is asked for the task status every 10 seconds,
    at most 90 times.

    Args:
        task_id: identifier passed to ``plugin.task_status``.

    Returns:
        None, as soon as the task reports both ``ended`` and
        ``succeeded``.

    Raises:
        FlowExecutionFailedError: carrying a dict with the task id,
            stdout and stderr, when the task ends without succeeding or
            does not end within the polling budget.
    """
    def _failure(resp, default_stderr):
        # Build the error payload from whatever the plugin reported.
        return FlowExecutionFailedError(
            dict(ceph_installer_task_id=task_id,
                 ceph_installer_task_stdout=resp.get("stdout", ""),
                 ceph_installer_task_stderr=resp.get(
                     "stderr", default_stderr)))

    plugin = NS.ceph_provisioner.get_plugin()
    resp = {}
    for _attempt in range(90):
        gevent.sleep(10)
        resp = plugin.task_status(task_id)
        if resp and resp["ended"]:
            if resp["succeeded"]:
                return
            raise _failure(
                resp,
                "ceph-installer task_id %s "
                "failed and did not complete" % task_id)

    # The loop tolerates a falsy status response, so the last one may not
    # be a dict at all; fall back to an empty dict so the timeout is
    # reported as a flow failure rather than an AttributeError.
    raise _failure(
        resp or {},
        "ceph-installer task_id %s timed out and did "
        "not complete" % task_id)
def acquire_node_lock(parameters):
    """Lock every node listed in ``parameters['Node[]']`` for this job.

    Three passes are made over the node list:

    1. verify that a NodeContext exists for every node;
    2. for a child job, verify that any existing lock is held by its
       parent job or by a job that already finished or failed;
    3. write this job's id into ``locked_by`` of every node that is not
       already locked by the parent job, and log the acquisition.

    Args:
        parameters: flow parameters; reads ``Node[]``, ``job_id`` and
            ``flow_id``.

    Raises:
        FlowExecutionFailedError: for an unknown node, or for a node
            locked by another job that is still active.
    """
    node_ids = parameters['Node[]']

    # check node_id is present
    for node in node_ids:
        if not NS.tendrl.objects.NodeContext(node_id=node).exists():
            raise FlowExecutionFailedError(
                "Unknown Node %s, cannot lock" % node)

    # check job is parent or child
    this_job = NS.tendrl.objects.Job(job_id=parameters['job_id']).load()
    parent_job_id = None
    if "parent" in this_job.payload:
        parent_job_id = this_job.payload['parent']

    for node_id in node_ids:
        node_ctx = NS.tendrl.objects.NodeContext(node_id=node_id).load()
        try:
            owner_id = node_ctx.locked_by
            if parent_job_id is None or owner_id is None:
                continue
            # If the parent job has acquired the lock on participating
            # nodes, the child job may proceed.
            if parent_job_id == owner_id:
                continue
            # If the lock owner job is already finished or failed, other
            # flows are allowed to acquire the lock.
            owner_job = NS.tendrl.objects.Job(job_id=owner_id).load()
            if owner_job and owner_job.status in ["finished", "failed"]:
                continue
            raise FlowExecutionFailedError(
                "Cannot proceed further, "
                "Node (%s) is already locked "
                "by Job (%s)" % (node_id, owner_id))
        except EtcdKeyNotFound:
            # Nothing recorded for this node; keep checking the others
            continue

    for node_id in node_ids:
        node_ctx = NS.tendrl.objects.NodeContext(node_id=node_id).load()
        owner_id = node_ctx.locked_by
        held_by_parent = (
            parent_job_id is not None and
            owner_id is not None and
            parent_job_id == owner_id
        )
        if held_by_parent:
            continue
        owner_id = str(parameters["job_id"])
        node_ctx.locked_by = owner_id
        node_ctx.save()
        logger.log(
            "info",
            NS.publisher_id,
            {"message": "Acquired lock (%s) on (%s)" % (owner_id, node_id)},
            job_id=parameters['job_id'],
            flow_id=parameters['flow_id'])
def run(self):
    """Create the graphite alias symlink for a cluster and fix ownership.

    After running the parent flow, a ``names`` directory is created next
    to the graphite data dir if missing, and a symlink named after the
    cluster short name (or the integration id when no short name is set)
    is pointed at the cluster's data directory. The graphite storage
    directory is then handed to the ``carbon`` user and group.

    Returns:
        True on success.

    Raises:
        FlowExecutionFailedError: if the alias directory cannot be
            created or the storage directory ownership cannot be changed.
    """
    super(SetupClusterAlias, self).run()
    integration_id = self.parameters["TendrlContext.integration_id"]
    short_name = self.parameters.get("Cluster.short_name")
    alias_dir_path = "%snames" % graphite_utils.get_data_dir_path()
    if not os.path.exists(alias_dir_path):
        try:
            os.makedirs(str(alias_dir_path))
        except OSError as ex:
            raise FlowExecutionFailedError(
                "Failed to create cluster alias dir: (%s)"
                " .Error: (%s)" % (str(alias_dir_path), ex)
            )
    if short_name in [None, ""]:
        short_name = integration_id
    link_target = "%s/clusters/%s" % (
        graphite_utils.get_data_dir_path(),
        integration_id
    )
    link_path = "%s/%s" % (alias_dir_path, short_name)
    os.symlink(link_target, link_path)

    # Assign permission for carbon user.
    # Bound before the try so the handler below can always format its
    # message, even when the path lookup itself is what failed.
    storage_dir_path = None
    try:
        storage_dir_path = graphite_utils.get_graphite_path(
            "cache",
            "storage_dir"
        )
        graphite_utils.change_owner(
            storage_dir_path,
            "carbon",
            "carbon",
            recursive=True
        )
    except (KeyError, OSError, TypeError) as ex:
        raise FlowExecutionFailedError(
            "Unable to modify the ownership of %s. Error: (%s)" % (
                storage_dir_path, ex
            )
        )
    logger.log(
        "debug",
        NS.publisher_id,
        {
            "message": "Link %s -> %s created" % (
                "%s/%s" % (alias_dir_path, short_name),
                "%s/clusters/%s" % (
                    graphite_utils.get_data_dir_path(),
                    integration_id
                )
            )
        },
        job_id=self.parameters['job_id'],
        flow_id=self.parameters['flow_id']
    )
    return True
def expand_gluster(parameters):
    """Set up gluster on the new nodes and add each one to the cluster.

    Progress is published as info-level ``Event`` messages tied to the
    job and flow ids found in ``parameters``.

    Args:
        parameters: flow parameters; reads ``job_id``, ``flow_id`` and
            ``TendrlContext.integration_id`` (node IPs are resolved by
            ``get_node_ips``).

    Raises:
        FlowExecutionFailedError: if node setup does not return True or
            if expansion fails on one or more nodes.
    """
    def _publish(text):
        # Emit one info-level progress event for this job/flow
        Event(
            Message(
                job_id=parameters['job_id'],
                flow_id=parameters['flow_id'],
                priority="info",
                publisher=NS.publisher_id,
                payload={"message": text}
            )
        )

    node_ips = get_node_ips(parameters)
    plugin = NS.gluster_provisioner.get_plugin()

    _publish(
        "Setting up gluster nodes %s" %
        parameters['TendrlContext.integration_id']
    )
    setup_ok = plugin.setup_gluster_node(
        node_ips,
        repo=NS.config.data.get('glusterfs_repo', None)
    )
    if setup_ok is not True:
        raise FlowExecutionFailedError("Error setting up gluster node")

    _publish(
        "Expanding gluster cluster %s" %
        parameters['TendrlContext.integration_id']
    )
    failed_nodes = [
        node for node in node_ips
        if not plugin.expand_gluster_cluster(node)
    ]
    if failed_nodes:
        raise FlowExecutionFailedError(
            "Error expanding gluster cluster. Following nodes failed: %s" %
            ",".join(failed_nodes))

    _publish(
        "Expanded Gluster Cluster %s."
        " New nodes are: %s" % (
            parameters['TendrlContext.integration_id'],
            ",".join(node_ips)
        )
    )
def acquire_node_lock(parameters):
    """Lock every node in ``parameters['Node[]']`` for the calling job.

    Works directly on etcd keys: ``/nodes/<node>`` must exist for every
    node, and ``/nodes/<node>/locked_by`` holds the id of the job owning
    the lock. A node already locked by this job's parent is accepted
    as-is; a node locked by any other job aborts the flow.

    Args:
        parameters: flow parameters; reads ``Node[]``, ``job_id`` and
            ``flow_id``.

    Raises:
        FlowExecutionFailedError: for an unknown node, or for a node
            that is locked by a job other than the parent.
    """
    nodes = parameters['Node[]']

    # check node_id is present
    for node in nodes:
        try:
            NS._int.client.read("/nodes/%s" % node)
        except EtcdKeyNotFound:
            raise FlowExecutionFailedError(
                "Unknown Node %s, cannot lock" % node)

    # check job is parent or child
    job = Job(job_id=parameters['job_id']).load()
    p_job_id = None
    if "parent" in job.payload:
        p_job_id = job.payload['parent']

    # First make sure none of the nodes is locked by an unrelated job
    for node in nodes:
        key = "/nodes/%s/locked_by" % node
        try:
            lock_owner_job = NS._int.client.read(key).value
        except EtcdKeyNotFound:
            # Not locked; keep checking the remaining nodes
            continue
        # If the parent job has acquired the lock on participating
        # nodes, the child job may proceed.
        if p_job_id == lock_owner_job:
            continue
        raise FlowExecutionFailedError("Cannot proceed further, "
                                       "Node (%s) is already locked "
                                       "by Job (%s)" % (node,
                                                        lock_owner_job))

    for node in nodes:
        # The key must be rebuilt for each node before reading: reusing
        # the value left over from a previous iteration checks the wrong
        # node and leaves later nodes unlocked.
        key = "/nodes/%s/locked_by" % node
        try:
            NS._int.client.read(key)
        except EtcdKeyNotFound:
            lock_owner_job = str(parameters["job_id"])
            NS._int.client.write(key, lock_owner_job)
            Event(
                Message(
                    job_id=parameters['job_id'],
                    flow_id=parameters['flow_id'],
                    priority="info",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "Acquired lock (%s) for Node (%s)" % (
                            lock_owner_job, node)
                    }
                )
            )
        # An existing lock was validated above and is left untouched.
def gluster_create_ssh_setup_jobs(parameters, skip_current_node=False):
    """Create one ``AuthorizeSshKey`` child job per remote node.

    The provisioner plugin's ``setup()`` supplies the ssh key. Unless
    ``skip_current_node`` is set, the key is also authorized on the
    current node, which is then removed from the working node list.

    Args:
        parameters: flow parameters; reads ``Node[]``, ``job_id`` and
            ``flow_id``. A shallow copy is used as the base of every
            child job's parameters.
        skip_current_node: when True, the key is not authorized locally.

    Returns:
        List of the job ids that were created.

    Raises:
        FlowExecutionFailedError: if key generation or local key
            authorization reports an error.
    """
    def _fail(text):
        # Log the error against this job/flow, then abort the flow
        logger.log("error",
                   NS.publisher_id, {"message": text},
                   job_id=parameters['job_id'],
                   flow_id=parameters['flow_id'])
        raise FlowExecutionFailedError(text)

    pending_nodes = copy.deepcopy(parameters['Node[]'])
    created_job_ids = []

    ssh_key, err = NS.gluster_provisioner.get_plugin().setup()
    if err != "":
        _fail("Error generating ssh key on node %s" %
              NS.node_context.node_id)

    if not skip_current_node:
        authorized, err = authorize_key.AuthorizeKey(ssh_key).run()
        if authorized is not True or err != "":
            _fail("Error adding authorized key for node %s" %
                  NS.node_context.node_id)
        pending_nodes.remove(NS.node_context.node_id)

    for node in pending_nodes:
        if node == NS.node_context.node_id:
            continue
        child_params = parameters.copy()
        child_params['Node[]'] = [node]
        child_params['ssh_key'] = ssh_key
        # Create same flow for each node from list except this one
        child_job_id = str(uuid.uuid4())
        NS.tendrl.objects.Job(
            job_id=child_job_id,
            status="new",
            payload={
                "tags": ["tendrl/node_%s" % node],
                "run": "tendrl.flows.AuthorizeSshKey",
                "status": "new",
                "parameters": child_params,
                "parent": parameters['job_id'],
                "type": "node"
            }
        ).save()
        created_job_ids.append(child_job_id)
        logger.log(
            "info",
            NS.publisher_id,
            {
                "message": "Created SSH setup job(jobID: %s) for %s" %
                           (child_job_id, node)
            },
            job_id=parameters['job_id'],
            flow_id=parameters['flow_id'])
    return created_job_ids
def run(self):
    """Drop the cluster's alert dashboards and archive its carbon data.

    The cluster's graphite data directory is moved under
    ``<graphite_archive_path>/clusters`` with the integration id and an
    ISO timestamp as the directory name.

    Returns:
        True on success.

    Raises:
        FlowExecutionFailedError: if the archive directory cannot be
            created or the data cannot be moved.
    """
    integration_id = self.parameters.get("TendrlContext.integration_id")

    # Delete the cluster related alert dashboards
    grafana_utils.delete_panel(integration_id)

    # Archive the carbon data for the cluster
    archive_root = NS.config.data.get(
        "graphite_archive_path",
        "/usr/share/tendrl/graphite/archive"
    )
    archive_base_path = "%s/clusters" % archive_root
    if not os.path.exists(archive_base_path):
        try:
            os.makedirs(str(archive_base_path))
        except OSError as ex:
            raise FlowExecutionFailedError(
                "Failed to create archive dir: (%s)"
                "for monitoring data. Error: (%s)" %
                (str(archive_base_path), ex)
            )

    timestamp = str(datetime.datetime.now().isoformat())
    archive_path = "%s/%s_%s" % (
        archive_base_path, integration_id, timestamp
    )
    resource_path = "%s/clusters/%s" % (
        graphite_utils.get_data_dir_path(), integration_id
    )
    try:
        shutil.move(resource_path, archive_path)
    except Exception as ex:
        raise FlowExecutionFailedError(
            "Failed to archive the monitoring data. Error: (%s)" % ex
        )

    # Log an event mentioning the archive data location
    logger.log(
        "debug",
        NS.publisher_id,
        {
            "message": "Cluster %s moved to un-managed state.\n"
                       "The archived monitoring data available at: %s" %
                       (integration_id, archive_path)
        }
    )
    return True
def create_gluster(parameters):
    """Set up gluster on the given nodes and form a cluster from them.

    Progress is published as info-level ``Event`` messages tied to the
    job and flow ids found in ``parameters``.

    Args:
        parameters: flow parameters; reads ``job_id``, ``flow_id`` and
            ``TendrlContext.integration_id`` (node IPs are resolved by
            ``get_node_ips``).

    Raises:
        FlowExecutionFailedError: if node setup or cluster creation does
            not return True.
    """
    def _publish(text):
        # Emit one info-level progress event for this job/flow
        Event(
            Message(
                job_id=parameters['job_id'],
                flow_id=parameters['flow_id'],
                priority="info",
                publisher=NS.publisher_id,
                payload={"message": text}
            )
        )

    node_ips = get_node_ips(parameters)
    plugin = NS.gluster_provisioner.get_plugin()

    _publish(
        "Setting up gluster nodes %s" %
        parameters['TendrlContext.integration_id']
    )
    setup_ok = plugin.setup_gluster_node(
        node_ips,
        repo=NS.config.data.get('glusterfs_repo', None)
    )
    if setup_ok is not True:
        raise FlowExecutionFailedError("Error setting up gluster node")

    _publish(
        "Creating gluster cluster %s" %
        parameters['TendrlContext.integration_id']
    )
    created_ok = plugin.create_gluster_cluster(node_ips)
    if created_ok is not True:
        raise FlowExecutionFailedError("Error creating gluster cluster")

    _publish(
        "Created Gluster Cluster %s" %
        parameters['TendrlContext.integration_id']
    )
def run(self):
    """Create the graphite alias symlink for a cluster.

    After running the parent flow, a ``names`` directory is created next
    to the graphite data dir if missing, and a symlink named after the
    cluster short name (or the integration id when no short name is set)
    is pointed at the cluster's data directory.

    Returns:
        True on success.

    Raises:
        FlowExecutionFailedError: if the alias directory cannot be
            created.
    """
    super(SetupClusterAlias, self).run()

    integration_id = self.parameters["TendrlContext.integration_id"]
    short_name = self.parameters.get("Cluster.short_name")
    alias_dir_path = "%snames" % graphite_utils.get_data_dir_path()

    if not os.path.exists(alias_dir_path):
        try:
            os.makedirs(str(alias_dir_path))
        except OSError as ex:
            raise FlowExecutionFailedError(
                "Failed to create cluster alias dir: (%s)"
                " .Error: (%s)" % (str(alias_dir_path), ex))

    # Fall back to the integration id when no short name was given
    if short_name in [None, ""]:
        short_name = integration_id

    link_target = "%s/clusters/%s" % (
        graphite_utils.get_data_dir_path(), integration_id)
    link_path = "%s/%s" % (alias_dir_path, short_name)
    os.symlink(link_target, link_path)

    logged_link = "%s/%s" % (alias_dir_path, short_name)
    logged_target = "%s/clusters/%s" % (
        graphite_utils.get_data_dir_path(), integration_id)
    logger.log(
        "debug",
        NS.publisher_id,
        {"message": "Link %s -> %s created" % (logged_link, logged_target)},
        job_id=self.parameters['job_id'],
        flow_id=self.parameters['flow_id'])
    return True
def run(self):
    """Run the parent flow for the ``collectd`` service.

    The cluster identified by ``NS.tendrl_context.integration_id`` must
    have ``is_managed`` equal to ``"yes"``.

    Raises:
        FlowExecutionFailedError: if the cluster is not managed.
    """
    cluster = NS.tendrl.objects.Cluster(
        integration_id=NS.tendrl_context.integration_id
    ).load()
    if cluster.is_managed != "yes":
        raise FlowExecutionFailedError('Cluster is not managed')

    # The parent flow reads the service to configure from the parameters
    self.parameters['Service.name'] = 'collectd'
    super(ConfigureMonitoring, self).run()
def run(self):
    """Prepare the import parameters and run the parent import flow.

    When ``Node[]`` is not supplied, the node ids are read from the
    integration-id tag index in etcd, this node is tagged with
    ``provisioner/<integration_id>``, and the cluster's
    ``enable_volume_profiling`` setting is stored from the parameters.

    Raises:
        FlowExecutionFailedError: if the integration-id index key does
            not exist.
    """
    integration_id = self.parameters['TendrlContext.integration_id']

    if "Node[]" not in self.parameters:
        index_key = "indexes/tags/tendrl/integration/%s" % integration_id
        try:
            raw_node_ids = NS._int.client.read(index_key).value
            self.parameters["Node[]"] = json.loads(raw_node_ids)
        except etcd.EtcdKeyNotFound:
            raise FlowExecutionFailedError("Cluster with "
                                           "integration_id "
                                           "(%s) not found, cannot "
                                           "import" % integration_id)

        # TODO(shtripat) ceph-installer is auto detected and
        # provisioner/$integration_id
        # tag is set , below is not required for ceph
        existing_tags = list(NS.node_context.tags)
        merged_tags = list(
            set(['provisioner/%s' % integration_id] + existing_tags)
        )
        if merged_tags != existing_tags:
            NS.node_context.tags = merged_tags
            NS.node_context.save()

        cluster = NS.tendrl.objects.Cluster(
            integration_id=NS.tendrl_context.integration_id
        ).load()
        cluster.enable_volume_profiling = self.parameters[
            'Cluster.enable_volume_profiling']
        cluster.save()

    super(ImportCluster, self).run()
def expand_gluster(parameters):
    """Set up gluster on the new nodes and add each one to the cluster.

    Progress is logged at info level against the job and flow ids found
    in ``parameters``, using the cluster's short name.

    Args:
        parameters: flow parameters; reads ``job_id``, ``flow_id`` and
            ``TendrlContext.integration_id`` (node IPs are resolved by
            ``get_node_ips``).

    Raises:
        FlowExecutionFailedError: if node setup does not return True or
            if expansion fails on one or more nodes.
    """
    def _log_info(text):
        # One info-level log entry tied to this job/flow
        logger.log(
            "info",
            NS.publisher_id,
            {"message": text},
            job_id=parameters['job_id'],
            flow_id=parameters['flow_id'])

    node_ips = get_node_ips(parameters)
    plugin = NS.gluster_provisioner.get_plugin()
    cluster = NS.tendrl.objects.Cluster(
        integration_id=parameters['TendrlContext.integration_id']
    ).load()

    _log_info(
        "Setting up gluster nodes for cluster %s" % cluster.short_name)
    setup_ok = plugin.setup_gluster_node(
        node_ips,
        repo=NS.config.data.get('glusterfs_repo', None)
    )
    if setup_ok is not True:
        raise FlowExecutionFailedError("Error setting up gluster node")

    _log_info("Expanding gluster cluster %s" % cluster.short_name)
    failed_nodes = [
        node for node in node_ips
        if not plugin.expand_gluster_cluster(node)
    ]
    if failed_nodes:
        raise FlowExecutionFailedError(
            "Error expanding gluster cluster. Following nodes failed: %s" %
            ",".join(failed_nodes))

    _log_info(
        "Expanded Gluster Cluster %s"
        " with nodes %s" % (cluster.short_name, ",".join(node_ips)))
def run(self):
    """Run the parent flow while holding a job lock on the volume.

    The volume named by ``Volume.vol_id`` is marked as locked by this
    job with ``current_job`` status ``in_progress``. After the parent
    flow, the lock is cleared and the status becomes ``finished``, or
    ``failed`` when the parent flow raised (the exception is re-raised).

    Returns:
        True on success.

    Raises:
        FlowExecutionFailedError: if another job is in progress on the
            volume.
    """
    def _load_volume():
        # Fetch a fresh copy of the volume object from the store
        return NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=self.parameters['Volume.vol_id']
        ).load()

    def _job_record(state):
        # current_job payload describing this flow in the given state
        return {
            'job_id': self.job_id,
            'job_name': self.__class__.__name__,
            'status': state
        }

    volume = _load_volume()
    busy = (
        'job_id' in volume.locked_by and
        'status' in volume.current_job and
        volume.current_job['status'] in ['in_progress']
    )
    if busy:
        raise FlowExecutionFailedError(
            "Another job in progress for volume."
            " Please wait till the job finishes "
            "(job_id: %s) (volume: %s) (integration_id: %s) " % (
                volume.current_job['job_id'],
                volume.name,
                NS.tendrl_context.integration_id
            )
        )

    volume.locked_by = {
        'node_id': NS.node_context.node_id,
        'fqdn': NS.node_context.fqdn,
        'tags': NS.node_context.tags,
        'type': NS.type,
        'job_name': self.__class__.__name__,
        'job_id': self.job_id
    }
    volume.current_job = _job_record("in_progress")
    volume.save()

    try:
        super(StopProfiling, self).run()
    except (FlowExecutionFailedError,
            AtomExecutionFailedError,
            Exception) as ex:
        # Release the lock and record the failure before propagating
        volume = _load_volume()
        volume.current_job = _job_record("failed")
        volume.locked_by = {}
        volume.save(update=False)
        raise ex

    volume = _load_volume()
    volume.current_job = _job_record("finished")
    volume.locked_by = {}
    volume.save(update=False)
    return True
def run(self):
    """Report whether every node of the cluster has status ``UP``.

    Only a parent job (no ``parent`` entry in its payload) performs the
    check; a child job returns True without checking anything. Node ids
    come from the integration-id tag index in etcd.

    Returns:
        True when all nodes are up (or for a child job), else False.

    Raises:
        FlowExecutionFailedError: on a missing etcd key or a TypeError
            while checking.
    """
    try:
        everything_up = True
        # check job is parent or child
        current_job = NS.tendrl.objects.Job(
            job_id=self.parameters['job_id']
        ).load()
        if "parent" not in current_job.payload:
            # fetch node id using integration_id
            integration_id = self.parameters[
                'TendrlContext.integration_id']
            index_key = "indexes/tags/tendrl/integration/%s" % \
                integration_id
            node_ids = json.loads(etcd_utils.read(index_key).value)

            # identifying node status using node_id
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Checking if nodes %s are up" % str(node_ids)},
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'])

            up_nodes = []
            down_nodes = []
            for raw_id in node_ids:
                node_id = str(raw_id)
                # if node_context not found it will give status DOWN
                ctx = NS.tendrl.objects.NodeContext(
                    node_id=node_id, status='DOWN'
                ).load()
                if ctx.status == "UP":
                    up_nodes.append(node_id)
                else:
                    everything_up = False
                    down_nodes.append(node_id)

            if everything_up:
                summary = "Status of nodes %s are up" % up_nodes
            else:
                summary = "Status of nodes %s are down" % down_nodes
            logger.log(
                "info",
                NS.publisher_id,
                {"message": summary},
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'])
        # no need to check for child job
        return everything_up
    except (etcd.EtcdKeyNotFound, TypeError) as ex:
        raise FlowExecutionFailedError(
            "Error checking status of nodes .error: %s" % str(ex))
def _execute_atom(self, atom_fqdn):
    """Resolve an atom from its dotted name and run it.

    ``atom_fqdn`` is split on ``.atoms.`` and ``.objects.`` to obtain
    the namespace, object name and atom name. The namespace is looked up
    under ``NS.integrations`` when its path contains ``integrations``,
    otherwise directly under ``NS``.

    Args:
        atom_fqdn: name of the form
            ``<ns path>.objects.<object>.atoms.<atom>``.

    Returns:
        The atom's ``run()`` result, or False when a KeyError or
        AttributeError was raised while resolving or running it.

    Raises:
        FlowExecutionFailedError: wrapping the formatted traceback when
            the atom raises AtomExecutionFailedError.
    """
    try:
        object_path, atom_name = atom_fqdn.split(".atoms.")
        ns_path, obj_name = object_path.split(".objects.")
        ns_leaf = ns_path.split(".")[-1]
        ns_root = NS.integrations if "integrations" in ns_path else NS
        namespace = getattr(ns_root, ns_leaf)
        atom_cls = namespace.ns.get_atom(obj_name, atom_name)
        try:
            return atom_cls(parameters=self.parameters).run()
        except AtomExecutionFailedError:
            raise FlowExecutionFailedError(
                str(traceback.format_exception(*sys.exc_info())))
    except (KeyError, AttributeError) as ex:
        not_found_msg = "Could not find atom {0}".format(atom_fqdn)
        logger.log("error",
                   NS.publisher_id, {"message": not_found_msg},
                   job_id=self.job_id,
                   flow_id=self.parameters['flow_id'])
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": not_found_msg,
                    "exception": ex
                }
            )
        )
    return False
def run(self):
    """Set up SSH on every configured node, then add OSDs.

    One ``SetupSsh`` child job is created per key of
    ``Cluster.node_configuration``. Their status keys are polled every
    3 seconds until all are ``finished``; ``add_osds`` is then called
    with this flow's parameters.

    Raises:
        FlowExecutionFailedError: if the integration id is None, the
            node configuration is missing, or any SSH setup job fails.
    """
    def _publish(text):
        # Emit one info-level progress event for this job/flow
        Event(
            Message(
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
                priority="info",
                publisher=NS.publisher_id,
                payload={"message": text}
            )
        )

    integration_id = self.parameters['TendrlContext.integration_id']
    if integration_id is None:
        raise FlowExecutionFailedError(
            "TendrlContext.integration_id cannot be empty")
    if "Cluster.node_configuration" not in self.parameters.keys():
        raise FlowExecutionFailedError(
            "Cluster.node_configuration cannot be empty")

    ssh_job_ids = []
    ssh_setup_script = NS.ceph_provisioner.get_plugin().setup()
    for node_id in self.parameters["Cluster.node_configuration"].keys():
        child_params = {
            'Node[]': [node_id],
            'ssh_setup_script': ssh_setup_script,
        }
        child_job_id = str(uuid.uuid4())
        Job(
            job_id=child_job_id,
            status="new",
            payload={
                "tags": ["tendrl/node_%s" % node_id],
                "run": "tendrl.flows.SetupSsh",
                "status": "new",
                "parameters": child_params,
                "parent": self.parameters['job_id'],
                "type": "node"
            }
        ).save()
        ssh_job_ids.append(child_job_id)
        _publish(
            "Created SSH setup job %s for node"
            " %s" % (child_job_id, node_id))

    while True:
        time.sleep(3)
        # noinspection PyUnresolvedReferences
        statuses = dict(
            (jid, NS._int.client.read("/queue/%s/status" % jid).value)
            for jid in ssh_job_ids
        )
        failed_jobs = dict(
            (jid, state) for jid, state in statuses.items()
            if state == "failed"
        )
        if failed_jobs:
            raise FlowExecutionFailedError(
                "SSH setup failed for jobs %s cluster %s" %
                (str(failed_jobs), integration_id))
        if all(state == "finished" for state in statuses.values()):
            _publish("SSH setup completed for all nodes")
            break

    _publish("Adding OSDs to ceph cluster %s" % integration_id)
    add_osds(self.parameters)
def run(self):
    """Import newly detected peer nodes into an already managed cluster.

    The cluster is locked with status ``expanding``, an
    ``ImportCluster`` child job is created for every node of the
    integration-id index that is not the current node and is not yet
    managed, and the child jobs are polled every 10 seconds. On any
    failure the cluster is unlocked with status ``expand_pending`` and
    a failed ``current_job``.

    Returns:
        True once every child job has finished successfully.

    Raises:
        FlowExecutionFailedError: if another job holds the cluster, the
            index key is missing, the wait budget (36 polls per child
            job) is exhausted, or any child job failed.
    """
    def _reload_cluster():
        # Fetch a fresh copy of the cluster object from the store
        return NS.tendrl.objects.Cluster(
            integration_id=integration_id
        ).load()

    integration_id = self.parameters['TendrlContext.integration_id']
    _cluster = _reload_cluster()
    status_known = _cluster.status is not None and _cluster.status != ""
    if status_known and \
            _cluster.status in ["importing", "unmanaging", "expanding"]:
        raise FlowExecutionFailedError(
            "Another job in progress for cluster, please wait till "
            "the job finishes (job_id: %s) (integration_id: %s) " %
            (_cluster.current_job['job_id'], integration_id))

    _cluster.locked_by = {
        'node_id': NS.node_context.node_id,
        'fqdn': NS.node_context.fqdn,
        'tags': NS.node_context.tags,
        'type': NS.type,
        'job_name': self.__class__.__name__,
        'job_id': self.job_id
    }
    _cluster.status = "expanding"
    _cluster.current_job = {
        'job_id': self.job_id,
        'job_name': self.__class__.__name__,
        'status': 'in_progress'
    }
    _cluster.save()

    try:
        index_key = "indexes/tags/tendrl/integration/%s" % integration_id
        node_ids = json.loads(etcd_utils.read(index_key).value)
    except etcd.EtcdKeyNotFound:
        _cluster = _reload_cluster()
        _cluster.locked_by = {}
        _cluster.status = "expand_pending"
        _cluster.current_job = {
            'job_id': self.job_id,
            'job_name': self.__class__.__name__,
            'status': 'failed'
        }
        _cluster.save()
        raise FlowExecutionFailedError("Cluster with integration_id "
                                       "(%s) not found, cannot "
                                       "import" % integration_id)

    child_job_ids = []
    new_peers = []
    # Remove the current node from list as its already participating
    # in cluster for sure
    node_ids.remove(NS.node_context.node_id)
    for node_id in node_ids:
        node_ctx = NS.tendrl.objects.ClusterNodeContext(
            node_id=node_id
        ).load()
        already_managed = (
            node_ctx.is_managed not in [None, ""] and
            node_ctx.is_managed.lower() == "yes"
        )
        if already_managed:
            continue

        child_params = {
            'TendrlContext.integration_id': integration_id,
            'Node[]': [node_id],
            'Cluster.volume_profiling_flag':
                _cluster.volume_profiling_flag
        }
        child_payload = {
            "tags": ["tendrl/node_%s" % node_id],
            "run": "tendrl.flows.ImportCluster",
            "status": "new",
            "parent": self.parameters['job_id'],
            "parameters": child_params,
            "type": "node"
        }
        child_job_id = str(uuid.uuid4())
        NS.tendrl.objects.Job(
            job_id=child_job_id, status="new", payload=child_payload
        ).save()
        logger.log(
            "info",
            NS.publisher_id,
            {
                "message": "ImportCluster %s (jobID: %s) : "
                           "importing host %s" %
                           (_cluster.short_name, child_job_id, node_id)
            },
            job_id=self.parameters['job_id'])
        child_job_ids.append(child_job_id)
        new_peers.append(node_id)

    polls_done = 0
    # Wait for (no of nodes) * 6 minutes for import to complete
    max_polls = len(child_job_ids) * 36
    while True:
        failed_children = []
        if polls_done >= max_polls:
            logger.log(
                "info",
                NS.publisher_id,
                {
                    "message": "Import jobs not yet complete "
                               "on all new nodes %s on cluster %s. "
                               "Timing out. " %
                               (str(node_ids), _cluster.short_name)
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'])
            _cluster = _reload_cluster()
            _cluster.locked_by = {}
            _cluster.status = "expand_pending"
            _cluster.current_job = {
                'job_id': self.job_id,
                'job_name': self.__class__.__name__,
                'status': 'failed'
            }
            _cluster.save()
            raise FlowExecutionFailedError(
                "Failed to expand cluster with integration_id "
                "(%s)" % integration_id)

        time.sleep(10)
        all_done = True
        for child_id in child_job_ids:
            child = NS.tendrl.objects.Job(job_id=child_id).load()
            if child.status not in ["finished", "failed"]:
                all_done = False
            elif child.status == "failed":
                failed_children.append(child.job_id)
        if all_done:
            break
        polls_done += 1

    if len(failed_children) > 0:
        failure_msg = "Child jobs failed are %s" % failed_children
        logger.log("error",
                   NS.publisher_id, {"message": failure_msg},
                   job_id=self.parameters['job_id'],
                   flow_id=self.parameters['flow_id'])
        _cluster = _reload_cluster()
        _cluster.status = "expand_pending"
        _cluster.locked_by = {}
        _cluster.current_job = {
            'status': "failed",
            'job_name': self.__class__.__name__,
            'job_id': self.job_id
        }
        _cluster.save()
        raise FlowExecutionFailedError(
            "Failed to expand cluster with integration_id "
            "(%s)" % integration_id)

    _cluster = _reload_cluster()
    _cluster.status = ""
    _cluster.locked_by = {}
    _cluster.current_job = {
        'status': "finished",
        'job_name': self.__class__.__name__,
        'job_id': self.job_id
    }
    _cluster.save()
    logger.log(
        "info",
        NS.publisher_id,
        {
            "message": "Newly detected nodes: %s added to the "
                       "cluster %s)" %
                       (str(new_peers), _cluster.short_name),
        },
        job_id=self.parameters['job_id'],
        flow_id=self.parameters['flow_id'])
    return True
def run(self):
    """Expand a gluster cluster onto new nodes and queue their import.

    Steps, all performed while holding the node locks:

    1. validate the integration id and the SDS name;
    2. create SSH setup child jobs and poll them every 3 seconds until
       all have finished;
    3. run ``gluster_help.expand_gluster``;
    4. poll every 3 seconds until every node in ``Node[]`` has a
       DetectedCluster entry, all with the same detected cluster id;
    5. release the node locks and create an ``ImportCluster`` child job
       covering the new nodes.

    Any exception is published as an error event and re-raised so the
    job is marked failed. The node locks are released in all cases.

    Raises:
        FlowExecutionFailedError: for an empty integration id, an
            unsupported SDS, a failed SSH setup job, or nodes reporting
            different detected cluster ids.
    """
    try:
        # Lock nodes
        flow_utils.acquire_node_lock(self.parameters)
        integration_id = self.parameters['TendrlContext.integration_id']
        if integration_id is None:
            raise FlowExecutionFailedError(
                "TendrlContext.integration_id cannot be empty")

        supported_sds = NS.compiled_definitions.get_parsed_defs(
        )['namespace.tendrl']['supported_sds']
        sds_name = self.parameters["TendrlContext.sds_name"]
        if sds_name not in supported_sds:
            raise FlowExecutionFailedError("SDS (%s) not supported" %
                                           sds_name)

        ssh_job_ids = flow_utils.gluster_create_ssh_setup_jobs(
            self.parameters,
            skip_current_node=True
        )

        while True:
            time.sleep(3)
            all_status = {}
            for job_id in ssh_job_ids:
                job = NS.tendrl.objects.Job(job_id=job_id).load()
                all_status[job_id] = job.status

            _failed = {
                _jid: status
                for _jid, status in all_status.items()
                if status == "failed"
            }
            if _failed:
                raise FlowExecutionFailedError(
                    "SSH setup failed for jobs %s cluster %s" %
                    (str(_failed), integration_id))
            if all(status == "finished"
                   for status in all_status.values()):
                logger.log(
                    "info",
                    NS.publisher_id,
                    {
                        "message": "SSH setup completed for all "
                                   "nodes in cluster %s" % integration_id
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'])
                break

        # SSH setup jobs finished above, now install sds
        # bits and create cluster
        logger.log(
            "info",
            NS.publisher_id,
            {
                "message": "Expanding Gluster Storage"
                           " Cluster %s" % integration_id
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id'])
        gluster_help.expand_gluster(self.parameters)
        logger.log(
            "info",
            NS.publisher_id,
            {
                "message": "SDS install/config completed on newly "
                           "expanded nodes, Please wait while "
                           "tendrl-node-agents detect sds details on "
                           "the newly "
                           "expanded nodes %s" % self.parameters['Node[]']
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id'])

        # Wait till detected cluster in populated for nodes
        while True:
            time.sleep(3)
            all_status = []
            detected_cluster = ""
            different_cluster_id = False
            dc = ""
            for node in self.parameters['Node[]']:
                try:
                    dc = NS.tendrl.objects.DetectedCluster(
                        node_id=node).load()
                    if not detected_cluster:
                        detected_cluster = dc.detected_cluster_id
                    else:
                        if detected_cluster != dc.detected_cluster_id:
                            all_status.append(False)
                            different_cluster_id = True
                            break
                    all_status.append(True)
                except etcd.EtcdKeyNotFound:
                    all_status.append(False)
            if different_cluster_id:
                raise FlowExecutionFailedError(
                    "Seeing different detected cluster id in"
                    " different nodes. %s and %s" %
                    (detected_cluster, dc.detected_cluster_id))
            if all_status:
                if all(all_status):
                    break

        # Create the params list for import cluster flow
        new_params = dict()
        new_params['Node[]'] = self.parameters['Node[]']
        new_params['TendrlContext.integration_id'] = integration_id
        # Get node context for one of the nodes from list
        dc = NS.tendrl.objects.DetectedCluster(
            node_id=self.parameters['Node[]'][0]).load()
        sds_pkg_name = dc.sds_pkg_name
        new_params['import_after_expand'] = True
        sds_pkg_version = dc.sds_pkg_version
        new_params['DetectedCluster.sds_pkg_name'] = \
            sds_pkg_name
        new_params['DetectedCluster.sds_pkg_version'] = \
            sds_pkg_version

        tags = []
        for node in self.parameters['Node[]']:
            tags.append("tendrl/node_%s" % node)
        payload = {
            "tags": tags,
            "run": "tendrl.flows.ImportCluster",
            "status": "new",
            "parameters": new_params,
            "parent": self.parameters['job_id'],
            "type": "node"
        }
        _job_id = str(uuid.uuid4())
        # release lock before import cluster
        flow_utils.release_node_lock(self.parameters)
        NS.tendrl.objects.Job(job_id=_job_id,
                              status="new",
                              payload=payload).save()
        logger.log(
            "info",
            NS.publisher_id,
            {
                "message": "Please wait while Tendrl imports ("
                           "job_id: %s) newly expanded "
                           "%s storage nodes in cluster %s" %
                           (_job_id, sds_pkg_name,
                            NS.tendrl.objects.Cluster(
                                integration_id=integration_id
                            ).load().short_name)
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id'])
    except Exception as ex:
        # Not every exception carries ``message``; reading it blindly
        # would raise AttributeError here and hide the real failure.
        if hasattr(ex, 'message'):
            error_text = ex.message
        else:
            error_text = str(ex)
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": error_text,
                    "exception": ex
                }
            )
        )
        # raising exception to mark job as failed
        raise ex
    finally:
        # Runs on success as well as on failure, so the lock is never
        # left behind when an exception interrupts the flow.
        flow_utils.release_node_lock(self.parameters)
def run(self):
    """Un-manage the cluster identified by TendrlContext.integration_id.

    The cluster object is locked and marked "unmanaging", the parent
    flow is executed, and the cluster record is then updated with the
    outcome ("finished" or "failed").

    Raises:
        FlowExecutionFailedError: if the cluster is already un-managed,
            if another job currently holds the cluster, or if the parent
            flow fails (the formatted traceback becomes the message).
    """
    flow_name = self.__class__.__name__
    integration_id = self.parameters['TendrlContext.integration_id']
    _cluster = NS.tendrl.objects.Cluster(
        integration_id=integration_id
    ).load()

    # A previous run of this very flow already completed.
    already_unmanaged = (
        _cluster.is_managed == "no" and
        _cluster.current_job['job_name'] == flow_name and
        _cluster.current_job['status'] == 'finished'
    )
    if already_unmanaged:
        raise FlowExecutionFailedError(
            "Cluster is already in un-managed state")

    # Refuse to run while another job holds the cluster lock.
    if _cluster.current_job['status'] == 'in_progress':
        lock_owner = _cluster.locked_by
        lock_held = 'job_id' in lock_owner and lock_owner['job_id'] != ""
        if lock_held and _cluster.status in (
                'importing', 'unmanaging', 'expanding'):
            raise FlowExecutionFailedError(
                "Another job in progress for cluster."
                " Please wait till the job finishes "
                "(job_id: %s) (integration_id: %s) " % (
                    _cluster.current_job['job_id'],
                    _cluster.integration_id
                )
            )

    # Take the lock and publish the in-progress state.
    _cluster.locked_by = {
        'node_id': NS.node_context.node_id,
        'fqdn': NS.node_context.fqdn,
        'tags': NS.node_context.tags,
        'type': NS.type,
        'job_name': flow_name,
        'job_id': self.job_id
    }
    _cluster.status = "unmanaging"
    _cluster.current_job = {
        'job_id': self.job_id,
        'job_name': flow_name,
        'status': "in_progress"
    }
    _cluster.save()

    try:
        super(UnmanageCluster, self).run()
        # Reload before writing so the saved record is based on the
        # current stored state.
        _cluster = NS.tendrl.objects.Cluster(
            integration_id=integration_id
        ).load()
        _cluster.status = ""
        _cluster.is_managed = "no"
        _cluster.locked_by = {}
        _cluster.errors = []
        _cluster.current_job = {
            'status': "finished",
            'job_name': flow_name,
            'job_id': self.job_id
        }
        _cluster.save()
    except (FlowExecutionFailedError,
            AtomExecutionFailedError,
            Exception) as ex:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        _cluster = NS.tendrl.objects.Cluster(
            integration_id=integration_id
        ).load()
        _cluster.status = ""
        _cluster.locked_by = {}
        _cluster.current_job = {
            'status': "failed",
            'job_name': flow_name,
            'job_id': self.job_id
        }
        if hasattr(ex, 'message'):
            _cluster.errors = [ex.message]
        else:
            _cluster.errors = [str(ex)]
        _cluster.save()
        formatted = traceback.format_exception(
            exc_type, exc_value, exc_traceback)
        raise FlowExecutionFailedError(str(formatted))
def run(self):
    """Import this node into the cluster and fan the import out to peers.

    Steps, in order:
      1. Acquire the node lock.
      2. For a gluster import that does not follow a create/expand
         (and when neither ``gdeploy_provisioned`` is set nor
         ``self._probe_and_mark_provisioner`` returns truthy): install
         gdeploy, wait for the SSH setup jobs, and tag this node as
         ``provisioner/<integration_id>``.
      3. Copy the detected cluster details into ``NS.tendrl_context``.
      4. When Node[] lists more than one node, queue one child
         ImportCluster job for every other node.
      5. Check the detected ceph/gluster version against the minimum
         from the compiled definitions, then call ``import_ceph`` or
         ``import_gluster``.
      6. Poll until every node in Node[] is registered in the cluster.

    Returns:
        True once the import completed.

    Raises:
        AtomExecutionFailedError: an SSH setup job reported "failed".
        FlowExecutionFailedError: the detected storage version is lower
            than the required minimum.
        Exception: any other failure is published as an
            ExceptionMessage and re-raised.
    """
    try:
        # Lock nodes
        create_cluster_utils.acquire_node_lock(self.parameters)
        integration_id = self.parameters['TendrlContext.integration_id']
        sds_name = self.parameters['DetectedCluster.sds_pkg_name']

        if not self.parameters.get('import_after_expand', False) and \
                not self.parameters.get('import_after_create', False):
            # check if gdeploy in already provisioned in this cluster
            # if no it has to be provisioned here
            if sds_name.find("gluster") > -1 and \
                    not self.parameters.get(
                        "gdeploy_provisioned", False) and \
                    not self._probe_and_mark_provisioner(
                        self.parameters["Node[]"], integration_id
                    ):
                create_cluster_utils.install_gdeploy()
                create_cluster_utils.install_python_gdeploy()
                ssh_job_ids = \
                    create_cluster_utils.gluster_create_ssh_setup_jobs(
                        self.parameters
                    )

                while True:
                    gevent.sleep(3)
                    all_status = {}
                    for job_id in ssh_job_ids:
                        all_status[job_id] = NS._int.client.read(
                            "/queue/%s/status" % job_id
                        ).value

                    _failed = {
                        _jid: status
                        for _jid, status in all_status.items()
                        if status == "failed"
                    }
                    if _failed:
                        raise AtomExecutionFailedError(
                            "SSH setup failed for jobs %s cluster %s" %
                            (str(_failed), integration_id)
                        )
                    if all([
                        status == "finished"
                        for status in all_status.values()
                    ]):
                        Event(
                            Message(
                                job_id=self.parameters['job_id'],
                                flow_id=self.parameters['flow_id'],
                                priority="info",
                                publisher=NS.publisher_id,
                                payload={
                                    "message":
                                    "SSH setup completed for all nodes "
                                    "in cluster %s" % integration_id
                                }
                            )
                        )
                        # set this node as gluster provisioner
                        tags = ["provisioner/%s" % integration_id]
                        NS.node_context = NS.node_context.load()
                        tags += NS.node_context.tags
                        NS.node_context.tags = list(set(tags))
                        NS.node_context.save()

                        # set gdeploy_provisioned to true so that no
                        # other nodes tries to configure gdeploy
                        self.parameters['gdeploy_provisioned'] = True
                        break

        # Register this node with the cluster being imported.
        NS.tendrl_context = NS.tendrl_context.load()
        NS.tendrl_context.integration_id = integration_id
        _detected_cluster = NS.tendrl.objects.DetectedCluster().load()
        NS.tendrl_context.cluster_id = \
            _detected_cluster.detected_cluster_id
        NS.tendrl_context.cluster_name = \
            _detected_cluster.detected_cluster_name
        NS.tendrl_context.sds_name = _detected_cluster.sds_pkg_name
        NS.tendrl_context.sds_version = _detected_cluster.sds_pkg_version
        NS.tendrl_context.save()
        Event(
            Message(
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Registered Node %s with cluster %s" %
                    (NS.node_context.node_id,
                     NS.tendrl_context.integration_id)
                }
            )
        )

        node_list = self.parameters['Node[]']
        cluster_nodes = []
        if len(node_list) > 1:
            # This is the master node for this flow
            for node in node_list:
                if NS.node_context.node_id != node:
                    new_params = self.parameters.copy()
                    new_params['Node[]'] = [node]
                    # create same flow for each node in node list
                    # except $this
                    payload = {
                        "tags": ["tendrl/node_%s" % node],
                        "run": "tendrl.flows.ImportCluster",
                        "status": "new",
                        "parameters": new_params,
                        "parent": self.parameters['job_id'],
                        "type": "node"
                    }
                    _job_id = str(uuid.uuid4())
                    cluster_nodes.append(_job_id)
                    Job(job_id=_job_id,
                        status="new",
                        payload=payload).save()
                    Event(
                        Message(
                            job_id=self.parameters['job_id'],
                            flow_id=self.parameters['flow_id'],
                            priority="info",
                            publisher=NS.publisher_id,
                            payload={
                                "message":
                                "Importing (job: %s) Node %s to "
                                "cluster %s" %
                                (_job_id, node, integration_id)
                            }
                        )
                    )

        if "ceph" in sds_name.lower():
            node_context = NS.node_context.load()
            is_mon = False
            for tag in node_context.tags:
                mon_tag = NS.compiled_definitions.get_parsed_defs(
                )['namespace.tendrl']['tags']['ceph-mon']
                if mon_tag in tag:
                    is_mon = True
            if is_mon:
                # Check if minimum required version of underlying ceph
                # cluster met. If not fail the import task
                detected_cluster = NS.tendrl.objects.DetectedCluster(
                ).load()
                detected_cluster_ver = \
                    detected_cluster.sds_pkg_version.split('.')
                maj_ver = detected_cluster_ver[0]
                min_ver = detected_cluster_ver[1]
                reqd_ceph_ver = NS.compiled_definitions.get_parsed_defs(
                )['namespace.tendrl']['min_reqd_ceph_ver']
                req_maj_ver, req_min_ver, req_rel = \
                    reqd_ceph_ver.split('.')
                Event(
                    Message(
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "Check: Minimum required version "
                            "(%s.%s.%s) of Ceph Storage" %
                            (req_maj_ver, req_min_ver, req_rel)
                        }
                    )
                )
                # Compare (major, minor) as an ordered pair. Checking
                # each component on its own wrongly rejected e.g. 4.0
                # when 3.12 is the required minimum.
                if (int(maj_ver), int(min_ver)) < \
                        (int(req_maj_ver), int(req_min_ver)):
                    Event(
                        Message(
                            job_id=self.parameters['job_id'],
                            flow_id=self.parameters['flow_id'],
                            priority="error",
                            publisher=NS.publisher_id,
                            payload={
                                "message":
                                "Error: Minimum required version "
                                "(%s.%s.%s) "
                                "doesnt match that of detected Ceph "
                                "Storage (%s.%s.%s)" %
                                (req_maj_ver, req_min_ver, req_rel,
                                 maj_ver, min_ver, 0)
                            }
                        )
                    )
                    raise FlowExecutionFailedError(
                        "Detected ceph version: %s"
                        " is lesser than required version: %s" %
                        (detected_cluster.sds_pkg_version, reqd_ceph_ver)
                    )
            import_ceph(self.parameters)
        else:
            # Check if minimum required version of underlying gluster
            # cluster met. If not fail the import task
            detected_cluster = NS.tendrl.objects.DetectedCluster().load()
            detected_cluster_ver = \
                detected_cluster.sds_pkg_version.split('.')
            maj_ver = detected_cluster_ver[0]
            min_ver = detected_cluster_ver[1]
            reqd_gluster_ver = NS.compiled_definitions.get_parsed_defs(
            )['namespace.tendrl']['min_reqd_gluster_ver']
            req_maj_ver, req_min_ver, req_rel = \
                reqd_gluster_ver.split('.')
            Event(
                Message(
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'],
                    priority="info",
                    publisher=NS.publisher_id,
                    payload={
                        "message":
                        "Check: Minimum required version "
                        "(%s.%s.%s) of Gluster Storage" %
                        (req_maj_ver, req_min_ver, req_rel)
                    }
                )
            )
            # Ordered (major, minor) comparison, see the ceph branch.
            if (int(maj_ver), int(min_ver)) < \
                    (int(req_maj_ver), int(req_min_ver)):
                Event(
                    Message(
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                        priority="error",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "Error: Minimum required version "
                            "(%s.%s.%s) "
                            "doesnt match that of detected Gluster "
                            "Storage (%s.%s.%s)" %
                            (req_maj_ver, req_min_ver, req_rel,
                             maj_ver, min_ver, 0)
                        }
                    )
                )
                raise FlowExecutionFailedError(
                    "Detected gluster version: %s"
                    " is lesser than required version: %s" %
                    (detected_cluster.sds_pkg_version, reqd_gluster_ver)
                )
            import_gluster(self.parameters)

        Event(
            Message(
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Waiting for participant nodes %s to be "
                    "imported %s" % (node_list, integration_id)
                }
            )
        )

        # An import is successful once all Node[] register to
        # /clusters/:integration_id/nodes/:node_id
        while True:
            _all_node_status = []
            gevent.sleep(3)
            for node_id in self.parameters['Node[]']:
                _status = NS.tendrl.objects.ClusterNodeContext(
                    node_id=node_id
                ).exists() and NS.tendrl.objects.ClusterTendrlContext(
                    integration_id=integration_id
                ).exists()
                _all_node_status.append(_status)
            if _all_node_status:
                if all(_all_node_status):
                    Event(
                        Message(
                            job_id=self.parameters['job_id'],
                            flow_id=self.parameters['flow_id'],
                            priority="info",
                            publisher=NS.publisher_id,
                            payload={
                                "message":
                                "Import Cluster completed for all nodes "
                                "in cluster %s" % integration_id
                            }
                        )
                    )
                    break

        Event(
            Message(
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Sucessfully imported cluster %s" %
                    integration_id
                }
            )
        )
    except Exception as ex:
        # For traceback
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": ex.message,
                    "exception": ex
                }
            )
        )
        # raising exception to mark job as failed
        raise ex
    finally:
        # release lock
        create_cluster_utils.release_node_lock(self.parameters)
    return True
def run(self):
    """Enable or disable profiling on every gluster volume of the cluster.

    The requested action is read from ``Cluster.volume_profiling_flag``
    and must be a key of ``VOL_PROFILE_ACTIONS``. The cluster object is
    locked for the duration; for each volume ``gluster volume profile``
    is run and the volume's ``profiling_enabled`` field is updated on
    success. Volumes whose command failed are only logged; the flow
    still finishes and releases the lock.

    Returns:
        True once all volumes have been processed.

    Raises:
        FlowExecutionFailedError: if the requested action is not a key
            of ``VOL_PROFILE_ACTIONS``.
    """
    action = self.parameters["Cluster.volume_profiling_flag"]
    if action not in VOL_PROFILE_ACTIONS:
        # The separating space before "cluster" was missing, which
        # rendered as "forcluster" in the error text.
        raise FlowExecutionFailedError(
            "Invalid value of Cluster.volume_profiling_flag "
            "(%s) while enable/disable volume profiling for "
            "cluster (%s). Valid values are enable/disable" %
            (action, NS.tendrl_context.integration_id)
        )

    # Lock the cluster and publish the in-progress job.
    _cluster = NS.tendrl.objects.Cluster(
        integration_id=NS.tendrl_context.integration_id
    ).load()
    _lock_details = {
        'node_id': NS.node_context.node_id,
        'tags': NS.node_context.tags,
        'type': NS.type,
        'job_name': self.__class__.__name__,
        'job_id': self.job_id
    }
    _cluster.locked_by = _lock_details
    _cluster.status = "set_volume_profiling"
    _cluster.current_job = {
        'job_id': self.job_id,
        'job_name': self.__class__.__name__,
        'status': 'in_progress'
    }
    _cluster.save()

    volumes = NS.tendrl.objects.GlusterVolume(
        NS.tendrl_context.integration_id
    ).load_all() or []
    failed_vols = []
    for volume in volumes:
        out, err, rc = cmd_utils.Command(
            "gluster volume profile %s %s" %
            (volume.name, VOL_PROFILE_ACTIONS[action])
        ).run()
        if err != "" or rc != 0:
            logger.log(
                "info",
                NS.publisher_id,
                {
                    "message": "%s profiling failed for volume: %s."
                    " Error: %s" % (action, volume.name, err)
                },
                job_id=self.parameters["job_id"],
                flow_id=self.parameters["flow_id"]
            )
            failed_vols.append(volume.name)
        else:
            if action == "enable":
                volume.profiling_enabled = "yes"
            else:
                volume.profiling_enabled = "no"
            volume.save()

    if failed_vols:
        logger.log(
            "info",
            NS.publisher_id,
            {
                "message": "%s profiling failed for "
                "volumes: %s" % (action, str(failed_vols))
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters["flow_id"]
        )

    # Release the lock and record the resulting profiling state
    # ("enabled" / "disabled").
    _cluster = NS.tendrl.objects.Cluster(
        integration_id=NS.tendrl_context.integration_id
    ).load()
    _cluster.status = ""
    _cluster.locked_by = {}
    _cluster.current_job = {
        'status': "finished",
        'job_name': self.__class__.__name__,
        'job_id': self.job_id
    }
    _cluster.volume_profiling_state = "%sd" % action
    _cluster.save()
    return True
def run(self):
    """Execute the flow: mandatory-input check, pre-runs, atoms, post-runs.

    Each stage is read from ``self._defs`` ("pre_run", "atoms",
    "post_run"); a stage whose definition is None is skipped. Every
    atom is run through ``self._execute_atom`` and a falsy result
    aborts the flow.

    Raises:
        FlowExecutionFailedError: a mandatory input parameter is absent.
        AtomExecutionFailedError: an atom in any stage returned falsy.
    """
    def _debug(text):
        # Debug events carry no job/flow identifiers.
        Event(
            Message(priority="debug",
                    publisher=NS.publisher_id,
                    payload={"message": text}))

    def _report(level, text):
        # Job-scoped event shown against this flow's job.
        Event(
            Message(job_id=self.job_id,
                    flow_id=self.parameters['flow_id'],
                    priority=level,
                    publisher=NS.publisher_id,
                    payload={"message": text}))

    def _run_stage(atom_fqns, start_fmt, failed_fmt, done_fmt, error_text):
        # Run every atom of one stage, failing fast on a falsy result.
        if atom_fqns is None:
            return
        for atom_fqn in atom_fqns:
            _debug(start_fmt % atom_fqn)
            if not self._execute_atom(atom_fqn):
                _report("error",
                        failed_fmt % (atom_fqn, self._defs['help']))
                raise AtomExecutionFailedError(error_text(atom_fqn))
            _debug(done_fmt % (atom_fqn, self._defs['help']))

    # Execute the pre runs for the flow
    _debug("Processing pre-runs for flow: %s" % self.to_str)

    # Check for mandatory parameters
    if 'mandatory' in self._defs.get('inputs', {}):
        for item in self._defs['inputs']['mandatory']:
            if item in self.parameters:
                continue
            _report("warning",
                    "Mandatory parameter %s not provided" % item)
            raise FlowExecutionFailedError("Mandatory parameter %s "
                                           "not provided" % item)

    _run_stage(
        self._defs.get("pre_run"),
        "Start pre-run : %s",
        "Failed pre-run: %s for flow: %s",
        "Finished pre-run: %s for flow: %s",
        lambda fqn: "Error executing pre run function: %s for flow: %s" %
        (fqn, self._defs['help'])
    )

    # Execute the atoms for the flow
    _debug("Processing atoms for flow: %s" % self._defs['help'])
    _run_stage(
        self._defs.get("atoms"),
        "Start atom : %s",
        "Failed atom: %s on flow: %s",
        'Finished atom %s for flow: %s',
        lambda fqn: "Error executing atom: %s on flow: %s" %
        (fqn, self._defs['help'])
    )

    # Execute the post runs for the flow
    _debug("Processing post-runs for flow: %s" % self._defs['help'])
    _run_stage(
        self._defs.get("post_run"),
        "Start post-run : %s",
        "Failed post-run: %s for flow: %s",
        "Finished post-run: %s for flow: %s",
        lambda fqn: "Error executing post run function: %s" % fqn
    )
def run(self):
    """Run the import-cluster flow, bootstrapping it when no Node[] given.

    When the job parameters carry no "Node[]" entry: the cluster is
    marked as import-in-progress, the node list is read from the
    integration tag index in etcd, the volume-profiling preference is
    stored on the cluster, and this node tries to claim the
    ``provisioner/<integration_id>`` tag. The parent flow is then run;
    on any failure the cluster's import status is set to "failed"
    together with the error text, and the exception is re-raised.

    Raises:
        FlowExecutionFailedError: an import is already recorded for the
            cluster, or the integration id has no node index in etcd.
    """
    if "Node[]" not in self.parameters:
        integration_id = self.parameters['TendrlContext.integration_id']
        _cluster = NS.tendrl.objects.Cluster(
            integration_id=NS.tendrl_context.integration_id
        ).load()

        job_recorded = (_cluster.import_job_id is not None and
                        _cluster.import_job_id != "")
        if job_recorded or _cluster.import_status in [
                "in_progress", "done", "failed"]:
            raise FlowExecutionFailedError(
                "Cluster already being imported by another Job, please "
                "wait till "
                "the job finishes (job_id: %s) (integration_id: %s) " %
                (_cluster.import_job_id, _cluster.integration_id)
            )

        _cluster.import_status = "in_progress"
        _cluster.import_job_id = self.job_id
        _cluster.save()

        try:
            node_index_key = \
                "indexes/tags/tendrl/integration/%s" % integration_id
            raw_node_ids = NS._int.client.read(node_index_key).value
            self.parameters["Node[]"] = json.loads(raw_node_ids)
        except etcd.EtcdKeyNotFound:
            raise FlowExecutionFailedError("Cluster with "
                                           "integration_id "
                                           "(%s) not found, cannot "
                                           "import" % integration_id)

        # Only reached when the node list was read successfully (the
        # handler above always raises).
        _cluster = NS.tendrl.objects.Cluster(
            integration_id=NS.tendrl_context.integration_id
        ).load()
        _cluster.enable_volume_profiling = self.parameters[
            'Cluster.enable_volume_profiling']
        _cluster.save()

        # Try to claim "provisioner/:integration_id" tag
        try:
            _tag = "provisioner/%s" % _cluster.integration_id
            _index_key = "/indexes/tags/%s" % _tag
            _node_id = json.dumps([NS.node_context.node_id])
            # prevExist=False: the write fails if the key already exists.
            NS._int.wclient.write(_index_key, _node_id, prevExist=False)
            # TODO(shtripat) ceph-installer is auto detected and
            # provisioner/$integration_id
            # tag is set , below is not required for ceph
            current_tags = list(NS.node_context.tags)
            new_tags = list(
                set(['provisioner/%s' % integration_id] + current_tags))
            if new_tags != current_tags:
                NS.node_context.tags = new_tags
                NS.node_context.save()
        except etcd.EtcdAlreadyExist:
            # The provisioner tag is already claimed; nothing to do.
            pass

    try:
        super(ImportCluster, self).run()
    except (FlowExecutionFailedError,
            AtomExecutionFailedError,
            Exception) as ex:
        _cluster = NS.tendrl.objects.Cluster(
            integration_id=NS.tendrl_context.integration_id
        ).load()
        _cluster.import_status = "failed"
        if hasattr(ex, 'message'):
            _cluster.errors = [ex.message]
        else:
            _cluster.errors = [str(ex)]
        _cluster.save()
        raise ex
def run(self):
    """Authorize the SSH key given in the ``ssh_key`` job parameter.

    Returns:
        True when ``AuthorizeKey.run`` reports success with no error text.

    Raises:
        FlowExecutionFailedError: carrying the reported error otherwise.
    """
    outcome, error_text = authorize_key.AuthorizeKey(
        self.parameters['ssh_key']
    ).run()
    if outcome is True and error_text == "":
        return True
    raise FlowExecutionFailedError(error_text)
def run(self):
    """Import the gluster cluster on this node and wait for peer imports.

    Under the node lock: queue a child ImportCluster job for every
    other node in Node[], verify the detected gluster version against
    the configured minimum, run ``import_gluster`` locally, and (when
    more than one node is involved) poll the child jobs until all are
    "finished" or the wait budget is spent.

    Returns:
        True on success; False when ``import_gluster`` returns falsy or
        the child jobs do not finish in time.

    Raises:
        FlowExecutionFailedError: the detected gluster version is lower
            than the required minimum.
        Exception: any other failure is published and re-raised.
    """
    try:
        integration_id = self.parameters['TendrlContext.integration_id']

        # Lock nodes
        create_cluster_utils.acquire_node_lock(self.parameters)
        NS.tendrl_context = NS.tendrl_context.load()

        # TODO(team) when Tendrl supports create/expand/shrink cluster
        # setup passwordless ssh for all gluster nodes with given
        # integration_id (check
        # /indexes/tags/tendrl/integration/$integration_id for list of
        # nodes in cluster

        node_list = self.parameters['Node[]']
        multi_node = len(node_list) > 1
        child_job_ids = []
        if multi_node:
            # This is the master node for this flow
            for node in node_list:
                if NS.node_context.node_id == node:
                    continue
                child_params = self.parameters.copy()
                child_params['Node[]'] = [node]
                # create same flow for each node in node list except
                # $this
                payload = {"tags": ["tendrl/node_%s" % node],
                           "run": "tendrl.flows.ImportCluster",
                           "status": "new",
                           "parameters": child_params,
                           "parent": self.parameters['job_id'],
                           "type": "node"
                           }
                _job_id = str(uuid.uuid4())
                child_job_ids.append(_job_id)
                Job(job_id=_job_id,
                    status="new",
                    payload=payload).save()
                Event(
                    Message(
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={
                            "message": "Importing (job: %s) Node %s "
                                       "to cluster %s" %
                                       (_job_id, node, integration_id)
                        }
                    )
                )

        # Check if minimum required version of underlying gluster
        # cluster met. If not fail the import task
        version_parts = NS.tendrl_context.sds_version.split('.')
        maj_ver = version_parts[0]
        # Keep only the leading run of digits of the minor component.
        min_ver = re.findall(r'\d+', version_parts[1])[0]
        reqd_gluster_ver = NS.compiled_definitions.get_parsed_defs()[
            'namespace.tendrl'
        ]['min_reqd_gluster_ver']
        req_maj_ver, req_min_ver, req_rel = reqd_gluster_ver.split('.')
        Event(
            Message(
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Check: Minimum required version ("
                               "%s.%s.%s) of Gluster Storage" %
                               (req_maj_ver, req_min_ver, req_rel)
                }
            )
        )

        # Major decides first; minor only matters on an equal major.
        major, required_major = int(maj_ver), int(req_maj_ver)
        ver_check_failed = major < required_major or (
            major == required_major and
            int(min_ver) < int(req_min_ver)
        )
        if ver_check_failed:
            Event(
                Message(
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'],
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "Error: Minimum required version "
                                   "(%s.%s.%s) "
                                   "doesnt match that of detected Gluster "
                                   "Storage (%s.%s.%s)" %
                                   (req_maj_ver, req_min_ver, req_rel,
                                    maj_ver, min_ver, 0)
                    }
                )
            )
            raise FlowExecutionFailedError(
                "Detected gluster version: %s"
                " is lesser than required version: %s" %
                (NS.tendrl_context.sds_version, reqd_gluster_ver)
            )

        if not import_gluster(self.parameters):
            return False

        if multi_node:
            Event(
                Message(
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'],
                    priority="info",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "Waiting for participant nodes %s to "
                                   "be "
                                   "imported %s" %
                                   (node_list, integration_id)
                    }
                )
            )
            # Wait for (no of nodes) * 6 minutes for import to complete:
            # 36 polls of 10 seconds per remaining node.
            wait_count = (len(node_list) - 1) * 36
            loop_count = 0
            while True:
                parent_job = Job(job_id=self.parameters['job_id']).load()
                if loop_count >= wait_count:
                    Event(
                        Message(
                            job_id=self.parameters['job_id'],
                            flow_id=self.parameters['flow_id'],
                            priority="info",
                            publisher=NS.publisher_id,
                            payload={
                                "message": "Import jobs not yet complete "
                                           "on all nodes. Timing out. "
                                           "(%s, %s)" %
                                           (str(node_list), integration_id)
                            }
                        )
                    )
                    return False
                time.sleep(10)
                # all() stops at the first unfinished child job.
                if all(
                    Job(job_id=child_job_id).load().status == "finished"
                    for child_job_id in parent_job.children
                ):
                    break
                loop_count += 1
    except Exception as ex:
        # For traceback
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": ex.message,
                    "exception": ex
                }
            )
        )
        # raising exception to mark job as failed
        raise ex
    finally:
        # release lock
        create_cluster_utils.release_node_lock(self.parameters)
    return True
def mock_invoke_flow(flow, job):
    """Stand-in for flow invocation that always fails.

    Both arguments are accepted for signature compatibility and ignored.

    Raises:
        FlowExecutionFailedError: unconditionally.
    """
    failure = FlowExecutionFailedError("Flow Execution failed")
    raise failure
def process_job(job):
    """Claim one queued job for this node, run its flow, record the result.

    The job is ignored when it is already locked, already "finished" or
    "processing", not of this node's type, or carries no tag matching
    this node. On a node tagged ``tendrl/monitor``, jobs whose timeout
    flag is "yes" get a ``valid_until`` stamp (now + 10 minutes) on
    first sight and are marked "failed" once that stamp has passed
    while the status is still "new".

    Claiming is done with an etcd compare-and-swap on the status key
    ("new" -> "processing"), so only one agent runs a given job.

    Args:
        job: etcd result for the job; only ``job.key`` is read before
            the full Job object is loaded.

    Raises:
        FlowExecutionFailedError: if the final status transition
            ("processing" -> "finished"/"failed") is rejected by etcd.
    """
    jid = job.key.split('/')[-1]
    job_status_key = "/queue/%s/status" % jid
    job_lock_key = "/queue/%s/locked_by" % jid
    NS.node_context = NS.node_context.load()

    # Check job not already locked by some agent
    try:
        _locked_by = etcd_utils.read(job_lock_key).value
        if _locked_by:
            return
    except etcd.EtcdKeyNotFound:
        pass

    # Check job not already "finished", or "processing"
    try:
        _status = etcd_utils.read(job_status_key).value
        if _status in ["finished", "processing"]:
            return
    except etcd.EtcdKeyNotFound:
        pass

    _timeout = None
    try:
        _job_timeout_key = "/queue/%s/timeout" % jid
        _timeout = etcd_utils.read(_job_timeout_key).value
        if _timeout:
            _timeout = _timeout.lower()
    except etcd.EtcdKeyNotFound:
        pass

    # tendrl-node-agent tagged as tendrl/monitor will ensure
    # >10 min old "new" jobs are timed out and marked as
    # "failed" (the parent job of these jobs will also be
    # marked as "failed")
    if "tendrl/monitor" in NS.node_context.tags and \
            _timeout == "yes":
        _job_valid_until_key = "/queue/%s/valid_until" % jid
        _valid_until = None
        try:
            _valid_until = etcd_utils.read(
                _job_valid_until_key).value
        except etcd.EtcdKeyNotFound:
            pass

        if _valid_until:
            _now_epoch = (
                time_utils.now() -
                datetime.datetime(1970, 1, 1).replace(tzinfo=utc)
            ).total_seconds()
            if int(_now_epoch) >= int(_valid_until):
                # Job has "new" status since 10 minutes,
                # mark status as "failed" and Job.error =
                # "Timed out"
                try:
                    etcd_utils.write(job_status_key,
                                     "failed",
                                     prevValue="new")
                except etcd.EtcdCompareFailed:
                    # Status is no longer "new"; nothing to time out.
                    pass
                else:
                    job = NS.tendrl.objects.Job(job_id=jid).load()
                    _msg = str("Timed-out (>10min as 'new')")
                    job.errors = _msg
                    job.save()
                    if job.payload.get('parent') is None:
                        alert_utils.alert_job_status(
                            "failed",
                            "Job timed out (job_id: %s)" % jid,
                            integration_id=NS.tendrl_context.integration_id
                            or job.payload['parameters'].get(
                                'TendrlContext.integration_id'
                            ),
                            cluster_name=NS.tendrl_context.cluster_name
                            or job.payload['parameters'].get(
                                'TendrlContext.cluster_name'
                            )
                        )
                    return
        else:
            # First sighting: stamp the job with its expiry time.
            _now_plus_10 = time_utils.now() + datetime.timedelta(minutes=10)
            _epoch_start = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)
            # noinspection PyTypeChecker
            _now_plus_10_epoch = (_now_plus_10 -
                                  _epoch_start).total_seconds()
            etcd_utils.write(_job_valid_until_key,
                             int(_now_plus_10_epoch))

    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.payload["type"] == NS.type and \
            job.status == "new":
        # Job routing
        # Flows created by tendrl-api use 'tags' from flow
        # definition to target jobs
        _tag_match = any(
            flow_tag in NS.node_context.tags
            for flow_tag in job.payload.get("tags") or []
        )

        if not _tag_match:
            _job_tags = ", ".join(job.payload.get("tags", []))
            _msg = "Node (%s)(type: %s)(tags: %s) will not " \
                   "process job-%s (tags: %s)" % \
                   (NS.node_context.node_id, NS.type,
                    NS.node_context.tags, jid,
                    _job_tags)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": _msg}
            )
            return

        job_status_key = "/queue/%s/status" % job.job_id
        job_lock_key = "/queue/%s/locked_by" % job.job_id
        try:
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             tags=NS.node_context.tags,
                             type=NS.type)
            # Compare-and-swap: only one agent wins "new" -> "processing".
            etcd_utils.write(job_status_key, "processing",
                             prevValue="new")
            etcd_utils.write(job_lock_key, json.dumps(lock_info))
        except etcd.EtcdCompareFailed:
            # job is already being processed by some tendrl
            # agent
            return

        the_flow = None
        try:
            current_ns, flow_name, obj_name = \
                _extract_fqdn(job.payload['run'])

            if obj_name:
                runnable_flow = current_ns.ns.get_obj_flow(
                    obj_name, flow_name)
            else:
                runnable_flow = current_ns.ns.get_flow(flow_name)

            the_flow = runnable_flow(
                parameters=job.payload['parameters'],
                job_id=job.job_id)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Processing Job %s" % job.job_id},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )

            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Running Flow %s" % job.payload['run']},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )
            the_flow.run()
            try:
                etcd_utils.write(job_status_key,
                                 "finished",
                                 prevValue="processing")
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'finished', " \
                       "current job status invalid"
                raise FlowExecutionFailedError(_msg)

            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Job (%s): Finished "
                            "Flow %s" % (
                                job.job_id,
                                job.payload['run'])},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id'],
            )
            if job.payload.get('parent') is None:
                alert_utils.alert_job_status(
                    "finished",
                    "Job finished successfully (job_id: %s)" % job.job_id,
                    integration_id=NS.tendrl_context.integration_id or
                    job.payload['parameters'].get(
                        'TendrlContext.integration_id'
                    ),
                    cluster_name=NS.tendrl_context.cluster_name or
                    job.payload['parameters'].get(
                        'TendrlContext.cluster_name'
                    )
                )
        except (FlowExecutionFailedError,
                AtomExecutionFailedError,
                Exception) as e:
            # format_exc() formats the exception being handled; its only
            # positional argument is a frame limit, so the exception
            # object must not be passed to it.
            _trace = str(traceback.format_exc())
            _msg = "Failure in Job %s Flow %s with error:" % \
                   (job.job_id, job.payload['run'])
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={"message": _msg + _trace,
                             "exception": e
                             }
                )
            )
            if the_flow:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace},
                    job_id=job.job_id,
                    flow_id=the_flow.parameters['flow_id']
                )
            else:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace}
                )

            try:
                etcd_utils.write(job_status_key,
                                 "failed",
                                 prevValue="processing")
            except etcd.EtcdCompareFailed:
                # This should not happen!
                # (A separating space was missing between the two
                # literals: "currentjob".)
                _msg = "Cannot mark job as 'failed', current " \
                       "job status invalid"
                raise FlowExecutionFailedError(_msg)
            else:
                job = job.load()
                job.errors = _trace
                if job.payload.get('parent') is None:
                    alert_utils.alert_job_status(
                        "failed",
                        "Job failed (job_id: %s)" % job.job_id,
                        integration_id=NS.tendrl_context.integration_id or
                        job.payload['parameters'].get(
                            'TendrlContext.integration_id'
                        ),
                        cluster_name=NS.tendrl_context.cluster_name or
                        job.payload['parameters'].get(
                            'TendrlContext.cluster_name'
                        )
                    )
                job.save()
def run(self):
    """Stop and disable each service listed in the Services[] parameter.

    A service that is not running is skipped (neither stopped nor
    disabled). Failures of ``systemctl stop`` / ``systemctl disable``
    are logged at debug level and do not abort the flow.

    Returns:
        True after all services have been processed.

    Raises:
        FlowExecutionFailedError: when a service is reported as not
            running and its status object carries an error.
    """
    super(StopServices, self).run()

    def _log(level, text):
        # Job/flow ids are looked up per call, as each log entry needs them.
        logger.log(
            level,
            NS.publisher_id,
            {"message": text},
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id'],
        )

    # (command template, failure message template), run in this order.
    systemctl_steps = (
        ("systemctl stop %s",
         "Could not stop %s"
         " service on %s. Error: %s"),
        ("systemctl disable %s",
         "Could not disable %s"
         " service on %s. Error: %s"),
    )

    for service in self.parameters['Services[]']:
        _log("info",
             "Stopping service %s on node %s" %
             (service, NS.node_context.fqdn))
        srv = NS.tendrl.objects.Service(service=service)
        if not srv.running:
            if len(srv.error) > 0:
                raise FlowExecutionFailedError(
                    "Unable to check status of service %s "
                    "on %s. Error: %s" %
                    (service, NS.node_context.node_id, srv.error))
            _log("debug",
                 "%s not running on "
                 "%s" % (service, NS.node_context.fqdn))
            continue

        for command_fmt, failure_fmt in systemctl_steps:
            _, err, _ = cmd_utils.Command(command_fmt % service).run()
            if err:
                _log("debug",
                     failure_fmt % (service, NS.node_context.fqdn, err))
    return True