def run(self):
    vol_id = self.parameters['Volume.vol_id']
    volume = NS.gluster.objects.Volume(vol_id=vol_id).load()
    command = "gluster volume profile %s stop" % volume.name
    cmd = cmd_utils.Command(command)
    out, err, rc = cmd.run()
    if rc != 0:
        raise AtomExecutionFailedError(
            "Error while disabling profiling "
            "for volume: %s in cluster: %s. Error: %s" %
            (volume.name, NS.tendrl_context.integration_id, err))

    loop_count = 0
    while True:
        if loop_count >= 24:
            raise AtomExecutionFailedError(
                "Could not disable profiling for volume: %s "
                "under cluster: %s. Timed out" %
                (volume.name, NS.tendrl_context.integration_id))
        out, err, rc = cmd_utils.Command(
            "gluster volume profile %s info" % volume.name).run()
        if rc == 1:
            break
        else:
            time.sleep(5)
            loop_count += 1

    volume.profiling_enabled = "no"
    volume.save()
    return True
def run(self): Event( Message( priority="info", publisher=NS.publisher_id, payload={"message": "Checking if update parameters are valid"}, job_id=self.parameters['job_id'], flow_id=self.parameters['flow_id'], cluster_id=NS.tendrl_context.integration_id, )) if 'Pool.poolname' in self.parameters and \ ('Pool.pg_num' in self.parameters or 'Pool.size' in self.parameters or 'Pool.pg_num' in self.parameters or 'Pool.min_size' in self.parameters or 'Pool.quota_enabled' in self.parameters): Event( Message( priority="info", publisher=NS.publisher_id, payload={ "message": "Invalid combination of pool update parameters. " "Pool name shouldnt be updated with other parameters." }, job_id=self.parameters['job_id'], flow_id=self.parameters['flow_id'], cluster_id=NS.tendrl_context.integration_id, )) raise AtomExecutionFailedError( "Invalid combination of pool update parameters. " "Pool name shoulnt be update with other parameters.") if 'Pool.pg_num' in self.parameters: fetched_pool = Pool(pool_id=self.parameters['Pool.pool_id']).load() if self.parameters['Pool.pg_num'] <= fetched_pool.pg_num: Event( Message( priority="info", publisher=NS.publisher_id, payload={ "message": "New pg-num cannot be less than " "existing value" }, job_id=self.parameters['job_id'], flow_id=self.parameters['flow_id'], cluster_id=NS.tendrl_context.integration_id, )) raise AtomExecutionFailedError( "New pg-num cannot be less than existing value") return True
def run(self):
    node_ids = self.parameters.get('Node[]')
    if not node_ids or len(node_ids) == 0:
        raise AtomExecutionFailedError("Node[] cannot be empty")
    for node_id in node_ids:
        # Check if node has the OS details populated
        try:
            os_details = etcd_utils.read("nodes/%s/Os" % node_id)
            if os_details.leaves is None:
                raise AtomExecutionFailedError(
                    "Node doesn't have OS details populated")
        except etcd.EtcdKeyNotFound:
            raise AtomExecutionFailedError(
                "Node doesn't have OS details populated")

        # Check if node has the CPU details populated
        try:
            cpu_details = etcd_utils.read("nodes/%s/Cpu" % node_id)
            if cpu_details.leaves is None:
                raise AtomExecutionFailedError(
                    "Node doesn't have CPU details populated")
        except etcd.EtcdKeyNotFound:
            raise AtomExecutionFailedError(
                "Node doesn't have CPU details populated")

        # Check if node has the Memory details populated
        try:
            memory_details = etcd_utils.read("nodes/%s/Memory" % node_id)
            if memory_details.leaves is None:
                raise AtomExecutionFailedError(
                    "Node doesn't have Memory details populated")
        except etcd.EtcdKeyNotFound:
            raise AtomExecutionFailedError(
                "Node doesn't have Memory details populated")

        # Check if node has the network details populated
        try:
            networks = etcd_utils.read("nodes/%s/Networks" % node_id)
            if networks.leaves is None:
                raise AtomExecutionFailedError(
                    "Node doesn't have network details populated")
        except etcd.EtcdKeyNotFound:
            raise AtomExecutionFailedError(
                "Node doesn't have network details populated")
    return True
def run(self): Event( Message( priority="info", publisher=NS.publisher_id, payload={"message": "Checking if update parameters are valid"}, job_id=self.parameters['job_id'], flow_id=self.parameters['flow_id'], cluster_id=NS.tendrl_context.integration_id, )) if 'Pool.pg_num' in self.parameters: fetched_pool = Pool(pool_id=self.parameters['Pool.pool_id']).load() if self.parameters['Pool.pg_num'] <= int(fetched_pool.pg_num): Event( Message( priority="error", publisher=NS.publisher_id, payload={ "message": "New pg-num cannot be less than " "existing value" }, job_id=self.parameters['job_id'], flow_id=self.parameters['flow_id'], cluster_id=NS.tendrl_context.integration_id, )) raise AtomExecutionFailedError( "New pg-num cannot be less than existing value") return True
def run(self):
    retry_count = 0
    while True:
        volumes = None
        try:
            volumes = NS._int.client.read(
                "clusters/%s/Volumes" % NS.tendrl_context.integration_id)
        except etcd.EtcdKeyNotFound:
            # ignore as no volumes available till now
            pass
        if volumes:
            for entry in volumes.leaves:
                volume = NS.gluster.objects.Volume(
                    vol_id=entry.key.split("Volumes/")[-1]).load()
                if volume.name == self.parameters['Volume.volname']:
                    return True
        retry_count += 1
        time.sleep(1)
        if retry_count == 600:
            logger.log(
                "error",
                NS.publisher_id,
                {
                    "message": "Volume %s not reflected in tendrl"
                    " yet. Timing out" %
                    self.parameters['Volume.volname']
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
                integration_id=NS.tendrl_context.integration_id)
            raise AtomExecutionFailedError(
                "Volume %s not reflected in tendrl yet. Timing out" %
                self.parameters['Volume.volname'])
def run(self): cmd = self.parameters.get("Node.cmd_str") logger.log("info", NS.publisher_id, { "message": "Executing %s on node %s" % (cmd, self.parameters.get("fqdn")) }, job_id=self.parameters['job_id'], flow_id=self.parameters['flow_id']) out, err, rc = Command(cmd).run() if not err and rc == 0: logger.log("info", NS.publisher_id, { "message": "Successfully executed %s on node %s" % (cmd, self.parameters.get("fqdn")) }, job_id=self.parameters['job_id'], flow_id=self.parameters['flow_id']) return True else: logger.log("error", NS.publisher_id, { "message": "Failed to execute %s on node %s." "Error %s" % (cmd, self.parameters.get("fqdn"), err) }, job_id=self.parameters['job_id'], flow_id=self.parameters['flow_id']) raise AtomExecutionFailedError(err)
def run(self):
    retry_count = 0
    while True:
        _cluster = None
        try:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=self.parameters[
                    "TendrlContext.integration_id"]).load()
        except etcd.EtcdKeyNotFound:
            # pass and continue the time out below
            pass
        if _cluster and _cluster.exists() and _cluster.is_managed == "yes":
            return True
        retry_count += 1
        time.sleep(1)
        if retry_count == 600:
            logger.log(
                "error",
                NS.publisher_id,
                {
                    "message": "Cluster data sync still incomplete. "
                    "Timing out"
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
                integration_id=NS.tendrl_context.integration_id)
            raise AtomExecutionFailedError(
                "Cluster data sync still incomplete. Timing out")
def run(self):
    retry_count = 0
    while True:
        try:
            NS.ceph.objects.Rbd(
                pool_id=self.parameters['Rbd.pool_id'],
                name=self.parameters['Rbd.name']).load()
            return True
        except etcd.EtcdKeyNotFound:
            retry_count += 1
            gevent.sleep(1)
            if retry_count == 600:
                Event(
                    Message(
                        priority="error",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "Rbd %s not reflected in tendrl yet. "
                            "Timing out" % self.parameters['Rbd.name']
                        },
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                        cluster_id=NS.tendrl_context.integration_id,
                    ))
                raise AtomExecutionFailedError(
                    "Rbd %s not reflected in tendrl yet. Timing out" %
                    self.parameters['Rbd.name'])
def run(self):
    retry_count = 0
    while True:
        _cluster = None
        try:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=self.parameters[
                    "TendrlContext.integration_id"]).load()
        except etcd.EtcdKeyNotFound:
            # pass and continue the time out below
            pass
        if _cluster and _cluster.status == "done":
            return True
        retry_count += 1
        gevent.sleep(1)
        if retry_count == 600:
            Event(
                Message(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={
                        "message":
                        "Cluster data sync still incomplete. Timing out"
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'],
                    cluster_id=NS.tendrl_context.integration_id,
                ))
            raise AtomExecutionFailedError(
                "Cluster data sync still incomplete. Timing out")
def run(self):
    integration_id = self.parameters['TendrlContext.integration_id']
    # Wait for /indexes/tags/tendrl/integration/$integration_id
    # to appear. This means the cluster is import ready.
    wait_count = 6
    loop_count = 0
    while True:
        if loop_count >= wait_count:
            raise AtomExecutionFailedError(
                "Cluster: %s is not yet marked as "
                "import ready. Timing out." % integration_id)
        try:
            integration_id_index_key = \
                "indexes/tags/tendrl/integration/%s" % integration_id
            _node_ids = NS._int.client.read(
                integration_id_index_key).value
            if _node_ids:
                return True
        except etcd.EtcdKeyNotFound:
            pass
        time.sleep(5)
        loop_count += 1
def run(self):
    self.parameters['Service.name'] = 'collectd'
    plugin_config_success = True
    graphite_host = (NS.config.data.get('graphite_host') or
                     NS.config.data['etcd_connection'])
    graphite_port = NS.config.data.get('graphite_port') or 2003
    plugin_params = {
        "graphite_host": graphite_host,
        "graphite_port": graphite_port,
        "hostname": NS.node_context.fqdn,
        "integration_id": NS.tendrl_context.integration_id,
        "node_id": NS.node_context.node_id,
        "logging_socket_path": NS.config.data['logging_socket_path'],
        "interval": NS.config.data['sync_interval'],
        "interface": self.get_node_interface(NS.node_context.fqdn),
        "etcd_host": NS.config.data['etcd_connection'],
        "etcd_port": NS.config.data['etcd_port']
    }

    etcd_ca_cert_file = NS.config.data.get("etcd_ca_cert_file")
    etcd_cert_file = NS.config.data.get("etcd_cert_file")
    etcd_key_file = NS.config.data.get("etcd_key_file")
    if etcd_ca_cert_file and str(etcd_ca_cert_file) != "" \
            and etcd_cert_file and str(etcd_cert_file) != "" \
            and etcd_key_file and str(etcd_key_file) != "":
        plugin_params.update({
            "etcd_ca_cert_file": NS.config.data['etcd_ca_cert_file'],
            "etcd_cert_file": NS.config.data['etcd_cert_file'],
            "etcd_key_file": NS.config.data['etcd_key_file']
        })

    for node_plugin in NODE_PLUGINS:
        plugin_config_success &= self._configure_plugin(
            node_plugin, plugin_params)

    if NS.tendrl_context.sds_name == 'gluster':
        plugin_params['is_provisioner_node'] = False
        if "provisioner/%s" % (
                NS.tendrl_context.integration_id) in NS.node_context.tags:
            plugin_params['is_provisioner_node'] = True
        for gluster_plugin in GLUSTER_CLUSTER_PLUGINS:
            plugin_config_success &= self._configure_plugin(
                gluster_plugin, plugin_params)

    if not plugin_config_success:
        raise AtomExecutionFailedError(
            "Collectd configuration failed for node %s from cluster %s" %
            (NS.node_context.fqdn, NS.tendrl_context.integration_id))

    err, success = Service(
        'collectd',
        publisher_id='node_agent',
        node_id=NS.node_context.node_id,
        socket_path=NS.config.data['logging_socket_path'],
        enabled=True).restart()

    _cluster = NS.tendrl.objects.Cluster(
        integration_id=NS.tendrl_context.integration_id).load()
    _cluster.import_status = "done"
    _cluster.save()
    return True
def run(self):
    self.parameters['Service.name'] = 'collectd'
    plugin_config_success = True
    graphite_host = (
        NS.config.data.get('graphite_host') or
        NS.config.data['etcd_connection']
    )
    graphite_port = (
        NS.config.data.get('graphite_port') or 2003
    )
    plugin_params = {
        "graphite_host": graphite_host,
        "graphite_port": graphite_port,
        "hostname": NS.node_context.fqdn,
        "integration_id": NS.tendrl_context.integration_id,
        "node_id": NS.node_context.node_id,
        "logging_socket_path": NS.config.data['logging_socket_path'],
        "interval": NS.config.data['sync_interval'],
        "interface": self.get_node_interface(NS.node_context.fqdn)
    }

    for node_plugin in NODE_PLUGINS:
        plugin_config_success &= self._configure_plugin(
            node_plugin, plugin_params
        )

    if NS.tendrl_context.sds_name == 'gluster':
        plugins = GLUSTER_CLUSTER_PLUGINS.get('node_plugins', [])
        if "provisioner/%s" % (
            NS.tendrl_context.integration_id
        ) in NS.node_context.tags:
            plugins.update(
                GLUSTER_CLUSTER_PLUGINS.get(
                    'cluster_plugins', []
                )
            )
        for gluster_plugin in plugins:
            plugin_config_success &= self._configure_plugin(
                gluster_plugin, plugin_params
            )

    if not plugin_config_success:
        raise AtomExecutionFailedError(
            "Collectd configuration failed for node %s from cluster %s" % (
                NS.node_context.fqdn,
                NS.tendrl_context.integration_id
            )
        )

    err, success = Service(
        'collectd',
        publisher_id='node_agent',
        node_id=NS.node_context.node_id,
        socket_path=NS.config.data['logging_socket_path'],
        enabled=True
    ).restart()
    return True
def run(self): cmd = self.parameters.get("Node.cmd_str") Event( Message( priority="info", publisher=NS.publisher_id, payload={ "message": "Executing %s on node %s" % (cmd, self.parameters.get("fqdn")) }, job_id=self.parameters['job_id'], flow_id=self.parameters['flow_id'], )) out, err, rc = Command(cmd).run() if not err and rc == 0: Event( Message( priority="info", publisher=NS.publisher_id, payload={ "message": "Successfully executed %s on node %s" % (cmd, self.parameters.get("fqdn")) }, job_id=self.parameters['job_id'], flow_id=self.parameters['flow_id'], )) return True else: Event( Message( priority="error", publisher=NS.publisher_id, payload={ "message": "Failed to execute %s on node %s." "Error %s" % (cmd, self.parameters.get("fqdn"), err) }, job_id=self.parameters['job_id'], flow_id=self.parameters['flow_id'], )) raise AtomExecutionFailedError(err)
def run(self):
    vol_id = self.parameters['Volume.vol_id']
    volume = NS.gluster.objects.Volume(vol_id=vol_id).load()
    command = "gluster volume profile %s stop" % volume.name
    cmd = cmd_utils.Command(command)
    out, err, rc = cmd.run()
    if rc != 0:
        raise AtomExecutionFailedError(
            "Error while disabling profiling "
            "for volume: %s in cluster: %s. Error: %s" %
            (volume.name, NS.tendrl_context.integration_id, err))

    while True:
        volume = NS.gluster.objects.Volume(vol_id=vol_id).load()
        if volume.profiling_enabled == "no":
            break
        time.sleep(5)
    return True
def run(self):
    retry_count = 0
    while True:
        pools = None
        try:
            pools = NS._int.client.read(
                "clusters/%s/Pools" % NS.tendrl_context.integration_id)
        except etcd.EtcdKeyNotFound:
            # ignore as no pools available till now
            pass
        if pools:
            for entry in pools.leaves:
                try:
                    pool = Pool(
                        pool_id=entry.key.split("Pools/")[-1]).load()
                    if pool.pool_name == self.parameters['Pool.poolname']:
                        return True
                except etcd.EtcdKeyNotFound:
                    continue
        retry_count += 1
        gevent.sleep(1)
        if retry_count == 600:
            Event(
                Message(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={
                        "message":
                        "Pool %s not reflected in tendrl yet. Timing out" %
                        self.parameters['Pool.poolname']
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'],
                    cluster_id=NS.tendrl_context.integration_id,
                ))
            raise AtomExecutionFailedError(
                "Pool %s not reflected in tendrl yet. Timing out" %
                self.parameters['Pool.poolname'])
def run(self):
    if not self.parameters.get('import_after_expand', False) and \
            not self.parameters.get('import_after_create', False):
        # Above condition means this is a fresh import.
        # Check if nodes participate in some existing cluster
        try:
            for entry in self.parameters["Node[]"]:
                _node_tc = NS.tendrl.objects.TendrlContext(
                    node_id=entry).load()
                Event(
                    Message(
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "Check: Node %s not part of any other "
                            "cluster" % entry
                        }))
                if _node_tc.integration_id != "":
                    _msg = "Error: Node %s is already part of other " \
                        "cluster %s" % (entry, _node_tc.integration_id)
                    Event(
                        Message(
                            job_id=self.parameters['job_id'],
                            flow_id=self.parameters['flow_id'],
                            priority="error",
                            publisher=NS.publisher_id,
                            payload={"message": _msg}))
                    return False
        except etcd.EtcdKeyNotFound:
            raise AtomExecutionFailedError(
                "Error while checking pre-participation of nodes in any "
                "cluster")
    return True
def run(self):
    integration_id = self.parameters['TendrlContext.integration_id']
    # wait for 360 sec to complete the first round of sync of
    # cluster data
    loop_count = 0
    while True:
        if loop_count >= 72:
            raise AtomExecutionFailedError(
                "Timing out import job, cluster data still not "
                "fully updated (node: %s) "
                "(integration_id: %s)" %
                (NS.node_context.node_id, integration_id))
        time.sleep(5)
        try:
            _cnc = NS.tendrl.objects.ClusterNodeContext(
                node_id=NS.node_context.node_id).load()
            if _cnc.first_sync_done is not None and \
                    _cnc.first_sync_done.lower() == "yes":
                break
        except etcd.EtcdKeyNotFound:
            pass
        loop_count += 1
    return True
def run(self):
    node_ids = self.parameters.get('Node[]')
    if not node_ids or len(node_ids) == 0:
        raise AtomExecutionFailedError("Node[] cannot be empty")
    for node_id in node_ids:
        # Check if node has the OS details populated
        try:
            os_details = etcd_utils.read("nodes/%s/Os" % node_id)
            if os_details.leaves is None:
                logger.log(
                    "error",
                    NS.get("publisher_id", None),
                    {
                        "message": "Node %s doesn't have OS details "
                        "populated" % NS.node_context.fqdn
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                return False
        except etcd.EtcdKeyNotFound:
            logger.log(
                "error",
                NS.get("publisher_id", None),
                {
                    "message": "Node %s doesn't have OS details "
                    "populated" % NS.node_context.fqdn
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id']
            )
            return False

        # Check if node has the CPU details populated
        try:
            cpu_details = etcd_utils.read("nodes/%s/Cpu" % node_id)
            if cpu_details.leaves is None:
                logger.log(
                    "error",
                    NS.get("publisher_id", None),
                    {
                        "message": "Node %s doesn't have CPU details "
                        "populated" % NS.node_context.fqdn
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                return False
        except etcd.EtcdKeyNotFound:
            logger.log(
                "error",
                NS.get("publisher_id", None),
                {
                    "message": "Node %s doesn't have CPU details "
                    "populated" % NS.node_context.fqdn
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id']
            )
            return False

        # Check if node has the Memory details populated
        try:
            memory_details = etcd_utils.read(
                "nodes/%s/Memory" % node_id
            )
            if memory_details.leaves is None:
                logger.log(
                    "error",
                    NS.get("publisher_id", None),
                    {
                        "message": "Node %s doesn't have Memory details "
                        "populated" % NS.node_context.fqdn
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                return False
        except etcd.EtcdKeyNotFound:
            logger.log(
                "error",
                NS.get("publisher_id", None),
                {
                    "message": "Node %s doesn't have Memory details "
                    "populated" % NS.node_context.fqdn
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id']
            )
            return False

        # Check if node has the network details populated
        try:
            networks = etcd_utils.read("nodes/%s/Networks" % node_id)
            if networks.leaves is None:
                logger.log(
                    "error",
                    NS.get("publisher_id", None),
                    {
                        "message": "Node %s doesn't have network details "
                        "populated" % NS.node_context.fqdn
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                return False
        except etcd.EtcdKeyNotFound:
            logger.log(
                "error",
                NS.get("publisher_id", None),
                {
                    "message": "Node %s doesn't have network details "
                    "populated" % NS.node_context.fqdn
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id']
            )
            return False
    return True
def run(self):
    try:
        # Lock nodes
        create_cluster_utils.acquire_node_lock(self.parameters)
        integration_id = self.parameters['TendrlContext.integration_id']
        sds_name = self.parameters['DetectedCluster.sds_pkg_name']

        if not self.parameters.get('import_after_expand', False) and \
                not self.parameters.get('import_after_create', False):
            # Above condition means this is a fresh import.
            # Check if gdeploy is already provisioned in this cluster,
            # if not it has to be provisioned here
            if sds_name.find("gluster") > -1 and \
                not self.parameters.get("gdeploy_provisioned", False) and \
                not self._probe_and_mark_provisioner(
                    self.parameters["Node[]"], integration_id):
                create_cluster_utils.install_gdeploy()
                create_cluster_utils.install_python_gdeploy()
                ssh_job_ids = \
                    create_cluster_utils.gluster_create_ssh_setup_jobs(
                        self.parameters)

                while True:
                    gevent.sleep(3)
                    all_status = {}
                    for job_id in ssh_job_ids:
                        all_status[job_id] = NS._int.client.read(
                            "/queue/%s/status" % job_id).value

                    _failed = {
                        _jid: status
                        for _jid, status in all_status.iteritems()
                        if status == "failed"
                    }
                    if _failed:
                        raise AtomExecutionFailedError(
                            "SSH setup failed for jobs %s cluster %s" %
                            (str(_failed), integration_id))
                    if all([status == "finished"
                            for status in all_status.values()]):
                        Event(
                            Message(
                                job_id=self.parameters['job_id'],
                                flow_id=self.parameters['flow_id'],
                                priority="info",
                                publisher=NS.publisher_id,
                                payload={
                                    "message":
                                    "SSH setup completed for all nodes "
                                    "in cluster %s" % integration_id
                                }))
                        # set this node as gluster provisioner
                        tags = ["provisioner/%s" % integration_id]
                        NS.node_context = NS.node_context.load()
                        tags += NS.node_context.tags
                        NS.node_context.tags = list(set(tags))
                        NS.node_context.save()

                        # set gdeploy_provisioned to true so that no other
                        # nodes try to configure gdeploy
                        self.parameters['gdeploy_provisioned'] = True
                        break

        NS.tendrl_context = NS.tendrl_context.load()
        NS.tendrl_context.integration_id = integration_id
        _detected_cluster = NS.tendrl.objects.DetectedCluster().load()
        NS.tendrl_context.cluster_id = \
            _detected_cluster.detected_cluster_id
        NS.tendrl_context.cluster_name = \
            _detected_cluster.detected_cluster_name
        NS.tendrl_context.sds_name = _detected_cluster.sds_pkg_name
        NS.tendrl_context.sds_version = _detected_cluster.sds_pkg_version
        NS.tendrl_context.save()
        Event(
            Message(
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Registered Node %s with cluster %s" %
                    (NS.node_context.node_id,
                     NS.tendrl_context.integration_id)
                }))

        node_list = self.parameters['Node[]']
        cluster_nodes = []
        if len(node_list) > 1:
            # This is the master node for this flow
            for node in node_list:
                if NS.node_context.node_id != node:
                    new_params = self.parameters.copy()
                    new_params['Node[]'] = [node]
                    # create same flow for each node in node list
                    # except $this
                    payload = {
                        "tags": ["tendrl/node_%s" % node],
                        "run": "tendrl.flows.ImportCluster",
                        "status": "new",
                        "parameters": new_params,
                        "parent": self.parameters['job_id'],
                        "type": "node"
                    }
                    _job_id = str(uuid.uuid4())
                    cluster_nodes.append(_job_id)
                    Job(job_id=_job_id,
                        status="new",
                        payload=payload).save()
                    Event(
                        Message(
                            job_id=self.parameters['job_id'],
                            flow_id=self.parameters['flow_id'],
                            priority="info",
                            publisher=NS.publisher_id,
                            payload={
                                "message":
                                "Importing (job: %s) Node %s to "
                                "cluster %s" %
                                (_job_id, node, integration_id)
                            }))

        if "ceph" in sds_name.lower():
            node_context = NS.node_context.load()
            is_mon = False
            for tag in node_context.tags:
                mon_tag = NS.compiled_definitions.get_parsed_defs()[
                    'namespace.tendrl']['tags']['ceph-mon']
                if mon_tag in tag:
                    is_mon = True
            if is_mon:
                # Check if minimum required version of underlying ceph
                # cluster met. If not fail the import task
                detected_cluster = \
                    NS.tendrl.objects.DetectedCluster().load()
                detected_cluster_ver = \
                    detected_cluster.sds_pkg_version.split('.')
                maj_ver = detected_cluster_ver[0]
                min_ver = detected_cluster_ver[1]
                reqd_ceph_ver = NS.compiled_definitions.get_parsed_defs()[
                    'namespace.tendrl']['min_reqd_ceph_ver']
                req_maj_ver, req_min_ver, req_rel = \
                    reqd_ceph_ver.split('.')
                Event(
                    Message(
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "Check: Minimum required version (%s.%s.%s) "
                            "of Ceph Storage" %
                            (req_maj_ver, req_min_ver, req_rel)
                        }))
                if int(maj_ver) < int(req_maj_ver) or \
                        int(min_ver) < int(req_min_ver):
                    Event(
                        Message(
                            job_id=self.parameters['job_id'],
                            flow_id=self.parameters['flow_id'],
                            priority="error",
                            publisher=NS.publisher_id,
                            payload={
                                "message":
                                "Error: Minimum required version "
                                "(%s.%s.%s) doesn't match that of "
                                "detected Ceph Storage (%s.%s.%s)" %
                                (req_maj_ver, req_min_ver, req_rel,
                                 maj_ver, min_ver, 0)
                            }))
                    raise FlowExecutionFailedError(
                        "Detected ceph version: %s"
                        " is lesser than required version: %s" %
                        (detected_cluster.sds_pkg_version,
                         reqd_ceph_ver))
                import_ceph(self.parameters)
        else:
            # Check if minimum required version of underlying gluster
            # cluster met. If not fail the import task
            detected_cluster = NS.tendrl.objects.DetectedCluster().load()
            detected_cluster_ver = \
                detected_cluster.sds_pkg_version.split('.')
            maj_ver = detected_cluster_ver[0]
            min_ver = detected_cluster_ver[1]
            reqd_gluster_ver = NS.compiled_definitions.get_parsed_defs()[
                'namespace.tendrl']['min_reqd_gluster_ver']
            req_maj_ver, req_min_ver, req_rel = \
                reqd_gluster_ver.split('.')
            Event(
                Message(
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'],
                    priority="info",
                    publisher=NS.publisher_id,
                    payload={
                        "message":
                        "Check: Minimum required version (%s.%s.%s) "
                        "of Gluster Storage" %
                        (req_maj_ver, req_min_ver, req_rel)
                    }))
            if int(maj_ver) < int(req_maj_ver) or \
                    int(min_ver) < int(req_min_ver):
                Event(
                    Message(
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                        priority="error",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "Error: Minimum required version "
                            "(%s.%s.%s) doesn't match that of "
                            "detected Gluster Storage (%s.%s.%s)" %
                            (req_maj_ver, req_min_ver, req_rel,
                             maj_ver, min_ver, 0)
                        }))
                raise FlowExecutionFailedError(
                    "Detected gluster version: %s"
                    " is lesser than required version: %s" %
                    (detected_cluster.sds_pkg_version,
                     reqd_gluster_ver))
            import_gluster(self.parameters)

        Event(
            Message(
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Waiting for participant nodes %s to be "
                    "imported %s" % (node_list, integration_id)
                }))

        # An import is successful once all Node[] register to
        # /clusters/:integration_id/nodes/:node_id
        while True:
            _all_node_status = []
            gevent.sleep(3)
            for node_id in self.parameters['Node[]']:
                _status = NS.tendrl.objects.ClusterNodeContext(
                    node_id=node_id).exists() \
                    and NS.tendrl.objects.ClusterTendrlContext(
                        integration_id=integration_id).exists()
                _all_node_status.append(_status)
            if _all_node_status:
                if all(_all_node_status):
                    Event(
                        Message(
                            job_id=self.parameters['job_id'],
                            flow_id=self.parameters['flow_id'],
                            priority="info",
                            publisher=NS.publisher_id,
                            payload={
                                "message":
                                "Import Cluster completed for all nodes "
                                "in cluster %s" % integration_id
                            }))
                    break

        Event(
            Message(
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Successfully imported cluster %s" %
                    integration_id
                }))
    except Exception as ex:
        # For traceback
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": ex.message,
                    "exception": ex
                }))
        # raising exception to mark job as failed
        raise ex
    finally:
        # release lock
        create_cluster_utils.release_node_lock(self.parameters)
    return True
def run(self):
    try:
        # Locking nodes
        create_cluster_utils.acquire_node_lock(self.parameters)
        integration_id = self.parameters['TendrlContext.integration_id']
        sds_name = self.parameters["TendrlContext.sds_name"]

        ssh_job_ids = []
        if "ceph" in sds_name:
            ssh_job_ids = create_cluster_utils.ceph_create_ssh_setup_jobs(
                self.parameters)
        else:
            ssh_job_ids = \
                create_cluster_utils.gluster_create_ssh_setup_jobs(
                    self.parameters
                )

        while True:
            gevent.sleep(3)
            all_status = {}
            for job_id in ssh_job_ids:
                # noinspection PyUnresolvedReferences
                all_status[job_id] = NS._int.client.read(
                    "/queue/%s/status" % job_id).value

            _failed = {
                _jid: status
                for _jid, status in all_status.iteritems()
                if status == "failed"
            }
            if _failed:
                raise AtomExecutionFailedError(
                    "SSH setup failed for jobs %s cluster %s" %
                    (str(_failed), integration_id))
            if all([status == "finished"
                    for status in all_status.values()]):
                Event(
                    Message(
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={
                            "message": "SSH setup completed for all "
                            "nodes in cluster %s" % integration_id
                        }))
                # set this node as gluster provisioner
                if "gluster" in self.parameters["TendrlContext.sds_name"]:
                    tags = ["provisioner/%s" % integration_id]
                    NS.node_context = NS.node_context.load()
                    tags += NS.node_context.tags
                    NS.node_context.tags = list(set(tags))
                    NS.node_context.save()
                break

        Event(
            Message(
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Starting SDS install and config %s" %
                    integration_id
                }))

        # SSH setup jobs finished above, now install sds bits and create
        # cluster
        if "ceph" in sds_name:
            Event(
                Message(
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'],
                    priority="info",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "Creating Ceph Storage Cluster "
                        "%s" % integration_id
                    }))
            self.parameters.update({'create_mon_secret': True})
            create_ceph(self.parameters)
        else:
            Event(
                Message(
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'],
                    priority="info",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "Creating Gluster Storage "
                        "Cluster %s" % integration_id
                    }))
            create_gluster(self.parameters)
    except Exception as ex:
        # For traceback
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": ex.message,
                    "exception": ex
                }))
        # raising exception to mark job as failed
        raise ex
    finally:
        # releasing nodes if any exception came
        create_cluster_utils.release_node_lock(self.parameters)
    return True
def run(self):
    try:
        integration_id = self.parameters['TendrlContext.integration_id']

        # Lock nodes
        flow_utils.acquire_node_lock(self.parameters)
        NS.tendrl_context = NS.tendrl_context.load()

        # TODO(team) when Tendrl supports create/expand/shrink cluster
        # setup passwordless ssh for all gluster nodes with given
        # integration_id (check
        # /indexes/tags/tendrl/integration/$integration_id for list of
        # nodes in cluster)
        node_list = self.parameters['Node[]']
        cluster_nodes = []
        if len(node_list) > 1:
            # This is the master node for this flow
            for node in node_list:
                if NS.node_context.node_id != node:
                    new_params = self.parameters.copy()
                    new_params['Node[]'] = [node]
                    # create same flow for each node in node list except
                    # $this
                    payload = {
                        "tags": ["tendrl/node_%s" % node],
                        "run": "tendrl.flows.ImportCluster",
                        "status": "new",
                        "parameters": new_params,
                        "parent": self.parameters['job_id'],
                        "type": "node"
                    }
                    _job_id = str(uuid.uuid4())
                    cluster_nodes.append(_job_id)
                    NS.tendrl.objects.Job(
                        job_id=_job_id,
                        status="new",
                        payload=payload).save()
                    logger.log(
                        "info",
                        NS.publisher_id,
                        {
                            "message": "Importing (job: %s) Node %s "
                            "to cluster %s" %
                            (_job_id, node, integration_id)
                        },
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'])

        # Check if minimum required version of underlying gluster
        # cluster met. If not fail the import task
        cluster_ver = NS.tendrl_context.sds_version.split('.')
        maj_ver = cluster_ver[0]
        min_ver = re.findall(r'\d+', cluster_ver[1])[0]
        reqd_gluster_ver = NS.compiled_definitions.get_parsed_defs()[
            'namespace.tendrl']['min_reqd_gluster_ver']
        req_maj_ver, req_min_ver, req_rel = reqd_gluster_ver.split('.')
        logger.log(
            "info",
            NS.publisher_id,
            {
                "message": "Check: Minimum required version "
                "(%s.%s.%s) of Gluster Storage" %
                (req_maj_ver, req_min_ver, req_rel)
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id'])
        ver_check_failed = False
        if int(maj_ver) < int(req_maj_ver):
            ver_check_failed = True
        else:
            if int(maj_ver) == int(req_maj_ver) and \
                    int(min_ver) < int(req_min_ver):
                ver_check_failed = True
        if ver_check_failed:
            logger.log(
                "error",
                NS.publisher_id,
                {
                    "message": "Error: Minimum required version "
                    "(%s.%s.%s) doesn't match that of detected Gluster "
                    "Storage (%s.%s.%s)" %
                    (req_maj_ver, req_min_ver, req_rel,
                     maj_ver, min_ver, 0)
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'])
            raise AtomExecutionFailedError(
                "Detected gluster version: %s"
                " is lesser than required version: %s" %
                (NS.tendrl_context.sds_version, reqd_gluster_ver))

        ret_val, err = import_gluster(self.parameters)
        if not ret_val:
            raise AtomExecutionFailedError(
                "Error importing the cluster (integration_id: %s). "
                "Error: %s" % (integration_id, err))

        if len(node_list) > 1:
            logger.log(
                "info",
                NS.publisher_id,
                {
                    "message": "Waiting for participant nodes %s to be "
                    "imported %s" % (node_list, integration_id)
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'])
            loop_count = 0
            # Wait for (no of nodes) * 6 minutes for import to complete
            wait_count = (len(node_list) - 1) * 36
            while True:
                parent_job = NS.tendrl.objects.Job(
                    job_id=self.parameters['job_id']).load()
                if loop_count >= wait_count:
                    logger.log(
                        "info",
                        NS.publisher_id,
                        {
                            "message": "Import jobs not yet complete "
                            "on all nodes. Timing out. (%s, %s)" %
                            (str(node_list), integration_id)
                        },
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'])
                    return False
                time.sleep(10)
                finished = True
                for child_job_id in parent_job.children:
                    child_job = NS.tendrl.objects.Job(
                        job_id=child_job_id).load()
                    if child_job.status != "finished":
                        finished = False
                        break
                if finished:
                    break
                else:
                    loop_count += 1
                    continue
    except Exception as ex:
        # For traceback
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": ex.message,
                    "exception": ex
                }))
        # raising exception to mark job as failed
        raise ex
    finally:
        # release lock
        flow_utils.release_node_lock(self.parameters)
    return True
def run(self):
    try:
        integration_id = self.parameters['TendrlContext.integration_id']
        _cluster = NS.tendrl.objects.Cluster(
            integration_id=integration_id).load()

        # Lock nodes
        flow_utils.acquire_node_lock(self.parameters)
        NS.tendrl_context = NS.tendrl_context.load()

        # TODO(team) when Tendrl supports create/expand/shrink cluster
        # setup passwordless ssh for all gluster nodes with given
        # integration_id (check
        # /indexes/tags/tendrl/integration/$integration_id for list of
        # nodes in cluster)
        node_list = self.parameters['Node[]']
        cluster_nodes = []
        if len(node_list) > 1:
            # This is the master node for this flow
            for node in node_list:
                if NS.node_context.node_id != node:
                    new_params = self.parameters.copy()
                    new_params['Node[]'] = [node]
                    # create same flow for each node in node list except
                    # $this
                    payload = {
                        "tags": ["tendrl/node_%s" % node],
                        "run": "tendrl.flows.ImportCluster",
                        "status": "new",
                        "parameters": new_params,
                        "parent": self.parameters['job_id'],
                        "type": "node"
                    }
                    _job_id = str(uuid.uuid4())
                    cluster_nodes.append(_job_id)
                    NS.tendrl.objects.Job(
                        job_id=_job_id,
                        status="new",
                        payload=payload).save()
                    logger.log(
                        "info",
                        NS.publisher_id,
                        {
                            "message": "ImportCluster %s (jobID: %s): "
                            "importing host %s" %
                            (_cluster.short_name, _job_id, node)
                        },
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'])

        # Check if minimum required version of underlying gluster
        # cluster met. If not fail the import task.
        # A sample output from "rpm -qa | grep glusterfs-server"
        # looks as below
        # `glusterfs-server-3.8.4-54.4.el7rhgs.x86_64`
        # In case of upstream build the format could be as below
        # `glusterfs-server-4.1dev-0.203.gitc3e1a2e.el7.centos.x86_64`
        # `glusterfs-server-3.12.8-0.0.el7.centos.x86_64.rpm`
        cmd = subprocess.Popen(
            'rpm -q glusterfs-server',
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        out, err = cmd.communicate()
        if out in [None, ""] or err:
            raise AtomExecutionFailedError(
                "Failed to detect underlying cluster version")
        lines = out.split('\n')
        build_no = None
        req_build_no = None
        ver_det = lines[0].split('glusterfs-server-')[-1].split('.')
        maj_ver = ver_det[0]
        min_ver = ver_det[1]
        if 'dev' in min_ver:
            min_ver = min_ver[0]
        rel = ver_det[2]
        if '-' in rel:
            build_no = rel.split('-')[-1]
            rel = rel.split('-')[0]
        reqd_gluster_ver = NS.compiled_definitions.get_parsed_defs()[
            'namespace.tendrl']['min_reqd_gluster_ver']
        req_maj_ver, req_min_ver, req_rel = reqd_gluster_ver.split('.')
        if '-' in req_rel:
            req_build_no = req_rel.split('-')[-1]
            req_rel = req_rel.split('-')[0]
        logger.log(
            "info",
            NS.publisher_id,
            {
                "message": "Checking minimum required version "
                "(%s.%s.%s) of Gluster Storage" %
                (req_maj_ver, req_min_ver, req_rel)
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id'])
        ver_check_failed = False
        if int(maj_ver) < int(req_maj_ver):
            ver_check_failed = True
        else:
            if int(maj_ver) == int(req_maj_ver):
                if int(min_ver) < int(req_min_ver):
                    ver_check_failed = True
                else:
                    if int(min_ver) == int(req_min_ver):
                        if int(rel) < int(req_rel):
                            ver_check_failed = True
                        else:
                            if int(rel) == int(req_rel):
                                if build_no is not None and \
                                        req_build_no is not None and \
                                        int(build_no) < int(req_build_no):
                                    ver_check_failed = True
        if ver_check_failed:
            logger.log(
                "error",
                NS.publisher_id,
                {
                    "message": "Error: Minimum required version "
                    "(%s.%s.%s) doesn't match that of detected Gluster "
                    "Storage (%s.%s.%s)" %
                    (req_maj_ver, req_min_ver, req_rel,
                     maj_ver, min_ver, 0)
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'])
            raise AtomExecutionFailedError(
                "Detected gluster version: %s"
                " is lesser than required version: %s" %
                (NS.tendrl_context.sds_version, reqd_gluster_ver))

        ret_val, err = import_gluster(self.parameters)
        if not ret_val:
            raise AtomExecutionFailedError(
                "Error importing the cluster (integration_id: %s). "
                "Error: %s" % (integration_id, err))

        if len(node_list) > 1:
            logger.log(
                "info",
                NS.publisher_id,
                {
                    "message": "ImportCluster %s waiting for hosts %s "
                    "to be imported" %
                    (_cluster.short_name, node_list)
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'])
            loop_count = 0
            # Wait for (no of nodes) * 6 minutes for import to complete
            wait_count = (len(node_list) - 1) * 36
            while True:
                parent_job = NS.tendrl.objects.Job(
                    job_id=self.parameters['job_id']).load()
                if loop_count >= wait_count:
                    logger.log(
                        "info",
                        NS.publisher_id,
                        {
                            "message": "Import jobs on cluster(%s) not "
                            "yet complete on all nodes(%s). Timing out." %
                            (_cluster.short_name, str(node_list))
                        },
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'])
                    return False
                time.sleep(10)
                finished = True
                for child_job_id in parent_job.children:
                    child_job = NS.tendrl.objects.Job(
                        job_id=child_job_id).load()
                    if child_job.status != "finished":
                        finished = False
                        break
                if finished:
                    break
                else:
                    loop_count += 1
                    continue
    except Exception as ex:
        # For traceback
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": ex.message,
                    "exception": ex
                }))
        # raising exception to mark job as failed
        raise ex
    finally:
        # release lock
        flow_utils.release_node_lock(self.parameters)
    return True
def run(self):
    # Execute the pre runs for the flow
    msg = "Processing pre-runs for flow: %s" % self.to_str
    Event(
        Message(
            priority="debug",
            publisher=NS.publisher_id,
            payload={"message": msg}))

    # Check for mandatory parameters
    if 'mandatory' in self._defs.get('inputs', {}):
        for item in self._defs['inputs']['mandatory']:
            if item not in self.parameters:
                msg = "Mandatory parameter %s not provided" % item
                Event(
                    Message(
                        job_id=self.job_id,
                        flow_id=self.parameters['flow_id'],
                        priority="warning",
                        publisher=NS.publisher_id,
                        payload={"message": msg}))
                raise FlowExecutionFailedError(
                    "Mandatory parameter %s not provided" % item)

    if self._defs.get("pre_run") is not None:
        for atom_fqn in self._defs.get("pre_run"):
            msg = "Start pre-run : %s" % atom_fqn
            Event(
                Message(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={"message": msg}))

            ret_val = self._execute_atom(atom_fqn)

            if not ret_val:
                msg = "Failed pre-run: %s for flow: %s" % \
                    (atom_fqn, self._defs['help'])
                Event(
                    Message(
                        job_id=self.job_id,
                        flow_id=self.parameters['flow_id'],
                        priority="error",
                        publisher=NS.publisher_id,
                        payload={"message": msg}))
                raise AtomExecutionFailedError(
                    "Error executing pre run function: %s for flow: %s" %
                    (atom_fqn, self._defs['help']))
            else:
                msg = "Finished pre-run: %s for flow: %s" % \
                    (atom_fqn, self._defs['help'])
                Event(
                    Message(
                        priority="debug",
                        publisher=NS.publisher_id,
                        payload={"message": msg}))

    # Execute the atoms for the flow
    msg = "Processing atoms for flow: %s" % self._defs['help']
    Event(
        Message(
            priority="debug",
            publisher=NS.publisher_id,
            payload={"message": msg}))

    if self._defs.get("atoms") is not None:
        for atom_fqn in self._defs.get("atoms"):
            msg = "Start atom : %s" % atom_fqn
            Event(
                Message(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={"message": msg}))

            ret_val = self._execute_atom(atom_fqn)

            if not ret_val:
                msg = "Failed atom: %s on flow: %s" % \
                    (atom_fqn, self._defs['help'])
                Event(
                    Message(
                        job_id=self.job_id,
                        flow_id=self.parameters['flow_id'],
                        priority="error",
                        publisher=NS.publisher_id,
                        payload={"message": msg}))
                raise AtomExecutionFailedError(
                    "Error executing atom: %s on flow: %s" %
                    (atom_fqn, self._defs['help']))
            else:
                msg = 'Finished atom %s for flow: %s' % \
                    (atom_fqn, self._defs['help'])
                Event(
                    Message(
                        priority="debug",
                        publisher=NS.publisher_id,
                        payload={"message": msg}))

    # Execute the post runs for the flow
    msg = "Processing post-runs for flow: %s" % self._defs['help']
    Event(
        Message(
            priority="debug",
            publisher=NS.publisher_id,
            payload={"message": msg}))
    if self._defs.get("post_run") is not None:
        for atom_fqn in self._defs.get("post_run"):
            msg = "Start post-run : %s" % atom_fqn
            Event(
                Message(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={"message": msg}))

            ret_val = self._execute_atom(atom_fqn)

            if not ret_val:
                msg = "Failed post-run: %s for flow: %s" % \
                    (atom_fqn, self._defs['help'])
                Event(
                    Message(
                        job_id=self.job_id,
                        flow_id=self.parameters['flow_id'],
                        priority="error",
                        publisher=NS.publisher_id,
                        payload={"message": msg}))
                raise AtomExecutionFailedError(
                    "Error executing post run function: %s" % atom_fqn)
            else:
                msg = "Finished post-run: %s for flow: %s" % \
                    (atom_fqn, self._defs['help'])
                Event(
                    Message(
                        priority="debug",
                        publisher=NS.publisher_id,
                        payload={"message": msg}))
def run(self):
    pool_id = self.parameters['Pool.pool_id']
    attrs = {}
    if 'Pool.pg_num' in self.parameters:
        fetched_obj = Pool(pool_id=self.parameters['Pool.pool_id']).load()
        attrs['pg_num'] = self.parameters.get('Pool.pg_num')
        if attrs['pg_num'] <= int(fetched_obj.pg_num):
            raise AtomExecutionFailedError(
                "New pg-num cannot be less than existing value")
    if 'Pool.size' in self.parameters:
        attrs['size'] = self.parameters.get('Pool.size')
    if 'Pool.min_size' in self.parameters:
        attrs['min_size'] = self.parameters.get('Pool.min_size')
    if 'Pool.quota_enabled' in self.parameters and \
            self.parameters['Pool.quota_enabled'] is True:
        attrs['quota_max_objects'] = \
            self.parameters.get('Pool.quota_max_objects')
        attrs['quota_max_bytes'] = \
            self.parameters.get('Pool.quota_max_bytes')

    Event(
        Message(
            priority="info",
            publisher=NS.publisher_id,
            payload={
                "message": "Updating details for pool-id %s. "
                "Attributes: %s" %
                (self.parameters['Pool.pool_id'], str(attrs))
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id'],
            cluster_id=NS.tendrl_context.integration_id,
        ))
    crud = Crud()
    resp = crud.update("pool", pool_id, attrs)
    try:
        crud.sync_request_status(resp['request'])
    except RequestStateError as ex:
        Event(
            Message(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "Failed to update pool %s. Error: %s" %
                    (self.parameters['Pool.pool_id'], ex)
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
                cluster_id=NS.tendrl_context.integration_id,
            ))
        return False

    Event(
        Message(
            priority="info",
            publisher=NS.publisher_id,
            payload={
                "message": "Pool %s successfully updated" %
                (self.parameters['Pool.pool_id'])
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id'],
            cluster_id=NS.tendrl_context.integration_id,
        ))
    return True