def terminate_unneeded_clusters(self, ctx):
    """Terminate idle transient clusters.

    Scans all 'Active' clusters and, for each transient cluster with no
    unfinished job executions that has been idle longer than the configured
    minimum, either terminates it (identity API v3) or marks it
    'AwaitingTermination'.

    NOTE(review): the incoming ``ctx`` argument is immediately replaced by a
    fresh admin context — presumably this runs as a periodic task; confirm
    against the scheduler that registers it.

    :param ctx: request context (ignored; an admin context is used instead)
    """
    LOG.debug('Terminating unneeded clusters')
    # Run the whole sweep under an admin context; it is cleared at the end.
    ctx = context.get_admin_context()
    context.set_ctx(ctx)
    for cluster in conductor.cluster_get_all(ctx, status='Active'):
        # Only transient clusters are candidates for automatic termination.
        if not cluster.is_transient:
            continue
        # end_time=None selects job executions that have not finished yet;
        # skip clusters that still have work in flight.
        jc = conductor.job_execution_count(ctx, end_time=None,
                                           cluster_id=cluster.id)
        if jc > 0:
            continue
        # Compute how long the cluster has been idle since its last update.
        cluster_updated_at = timeutils.normalize_time(
            timeutils.parse_isotime(cluster.updated_at))
        current_time = timeutils.utcnow()
        spacing = timeutils.delta_seconds(cluster_updated_at, current_time)
        if spacing < CONF.min_transient_cluster_active_time:
            continue
        if CONF.use_identity_api_v3:
            # With Keystone v3 we can act on behalf of the user via a trust,
            # so the cluster can be terminated directly.
            trusts.use_os_admin_auth_token(cluster)
            api.terminate_cluster(cluster.id)
            LOG.debug('Terminated cluster %s with id %s' %
                      (cluster.name, cluster.id))
        else:
            # Without v3 trusts we cannot terminate on the user's behalf;
            # flag the cluster so the owner's own requests can finish it.
            if cluster.status != 'AwaitingTermination':
                conductor.cluster_update(
                    ctx, cluster,
                    {'status': 'AwaitingTermination'})
    context.set_ctx(None)
def _inner():
    """Greenthread body of a fixed-interval looping call.

    Repeatedly invokes ``self.f`` every ``interval`` seconds until
    ``self._running`` is cleared or the callee raises ``LoopingCallDone``.

    NOTE(review): ``initial_delay``, ``interval``, ``done``, and ``self`` are
    closed over from the enclosing scope, which is not visible here —
    presumably the surrounding ``start`` method of a looping-call class.
    """
    if initial_delay:
        greenthread.sleep(initial_delay)
    try:
        while self._running:
            # Time the task so the sleep can be shortened to keep a
            # fixed period between starts.
            start = timeutils.utcnow()
            self.f(*self.args, **self.kw)
            end = timeutils.utcnow()
            if not self._running:
                break
            delay = interval - timeutils.delta_seconds(start, end)
            if delay <= 0:
                # The task ran longer than its interval; log the overrun.
                LOG.warn(_LW('task run outlasted interval by %s sec')
                         % -delay)
            # Never sleep a negative amount; run again immediately instead.
            greenthread.sleep(delay if delay > 0 else 0)
    except LoopingCallDone as e:
        # Callee requested a clean stop; hand its result to the waiter.
        self.stop()
        done.send(e.retvalue)
    except Exception:
        # Unexpected failure: propagate the exception to the waiter.
        LOG.exception(_LE('in fixed duration looping call'))
        done.send_exception(*sys.exc_info())
        return
    else:
        # Loop exited normally (``self._running`` cleared).
        done.send(True)
def _start_cloudera_manager(cluster):
    """Start Cloudera Manager on the cluster and wait for it to answer.

    Starts the manager database and service over SSH, then polls the
    manager's API port until a TCP connection succeeds or the timeout
    expires, in which case HadoopProvisionError is raised.
    """
    manager = pu.get_manager(cluster)
    with manager.remote() as mgr_remote:
        cmd.start_cloudera_db(mgr_remote)
        cmd.start_manager(mgr_remote)

    timeout = 300
    LOG.debug("Waiting %(timeout)s seconds for Manager to start : " % {
        'timeout': timeout})
    started_at = timeutils.utcnow()
    connected = False
    # Probe the CM API port until it accepts a connection or time runs out.
    while not connected and (
            timeutils.delta_seconds(started_at, timeutils.utcnow()) < timeout):
        try:
            probe = telnetlib.Telnet(manager.management_ip, CM_API_PORT)
        except IOError:
            # Manager not listening yet; retry shortly.
            context.sleep(2)
        else:
            probe.close()
            connected = True

    if not connected:
        message = _("Cloudera Manager failed to start in %(timeout)s minutes "
                    "on node '%(node)s' of cluster '%(cluster)s'") % {
            'timeout': timeout / 60,
            'node': manager.management_ip,
            'cluster': cluster.name}
        raise ex.HadoopProvisionError(message)
    LOG.info(_LI("Cloudera Manager has been started"))
def _detach_volume(instance, volume_id):
    """Detach a Cinder volume from an instance and wait for completion.

    Issues the detach call (best-effort: failures are logged, not raised),
    then polls the volume until its status becomes 'available' or 'error',
    logging a warning if that does not happen within the configured timeout.
    """
    volume = cinder.get_volume(volume_id)
    try:
        LOG.debug("Detaching volume %s from instance %s" % (
            volume_id, instance.instance_name))
        nova.client().volumes.delete_server_volume(instance.instance_id,
                                                   volume_id)
    except Exception:
        # Best-effort: record the failure and still wait below, since the
        # detach may have partially succeeded.
        LOG.exception(_LE("Can't detach volume %s"), volume.id)

    detach_timeout = CONF.detach_volume_timeout
    LOG.debug("Waiting %d seconds to detach %s volume" % (detach_timeout,
                                                          volume_id))
    poll_start = tu.utcnow()
    while tu.delta_seconds(poll_start, tu.utcnow()) < detach_timeout:
        volume = cinder.get_volume(volume_id)
        if volume.status in ['available', 'error']:
            LOG.debug("Volume %s has been detached" % volume_id)
            return
        context.sleep(2)
    # Timed out without reaching a detached state.
    LOG.warn(_LW("Can't detach volume %(volume)s. "
                 "Current status of volume: %(status)s"),
             {'volume': volume_id, 'status': volume.status})
def transient_cluster_testing(self, plugin_config, floating_ip_pool,
                              internal_neutron_net):
    """Integration test: a transient cluster is deleted automatically.

    Creates a single-node transient cluster from a throwaway template and
    polls the Sahara API until the cluster disappears; fails the test if
    it still exists after TRANSIENT_CLUSTER_TIMEOUT minutes.

    :param plugin_config: plugin configuration used to create the cluster
    :param floating_ip_pool: floating IP pool for the node group
    :param internal_neutron_net: Neutron network id for the cluster
    """
    cluster_template_id = self.create_cluster_template(
        name='test-transient-cluster-template-vanilla',
        plugin_config=self.vanilla_config,
        description=('test cluster template for transient cluster '
                     'of Vanilla plugin'),
        cluster_configs={},
        node_groups=[
            dict(
                name='single-node',
                flavor_id=self.flavor_id,
                node_processes=['namenode'],
                floating_ip_pool=floating_ip_pool,
                count=1)
        ],
        net_id=internal_neutron_net
    )
    try:
        try:
            cluster_name = (self.common_config.CLUSTER_NAME +
                            '-transient-' + plugin_config.PLUGIN_NAME)
            self.create_cluster(
                name=cluster_name,
                plugin_config=plugin_config,
                cluster_template_id=cluster_template_id,
                description='test transient cluster',
                cluster_configs={},
                is_transient=True
            )
        except Exception:
            # Clean up anything partially created before re-raising.
            self.delete_objects(cluster_id=self.cluster_id)
            raise
        cluster_info = self.get_cluster_info(plugin_config)
        # set timeout in seconds (config value is in minutes)
        timeout = self.common_config.TRANSIENT_CLUSTER_TIMEOUT * 60
        s_time = timeutils.utcnow()
        raise_failure = True
        # Poll until Sahara reports the cluster as gone ('not found').
        while timeutils.delta_seconds(
                s_time, timeutils.utcnow()) < timeout:
            try:
                self.sahara.clusters.get(cluster_info['cluster_id'])
            except sab.APIException as api_ex:
                if 'not found' in api_ex.message:
                    # Cluster was auto-deleted as expected.
                    raise_failure = False
                    break
            time.sleep(2)
        if raise_failure:
            # Cluster survived the timeout: delete it manually and fail.
            self.delete_objects(cluster_id=cluster_info['cluster_id'])
            self.fail('Transient cluster has not been deleted within %s '
                      'minutes.'
                      % self.common_config.TRANSIENT_CLUSTER_TIMEOUT)
    finally:
        self.delete_objects(cluster_template_id=cluster_template_id)
def transient_cluster_testing(self, plugin_config, floating_ip_pool,
                              internal_neutron_net):
    """Integration test: verify a transient cluster is torn down by itself.

    Builds a single-node transient cluster from a temporary template and
    polls the API until the cluster is reported 'not found', failing the
    test if it is still present after the configured timeout.
    """
    single_node_group = dict(name='single-node',
                             flavor_id=self.flavor_id,
                             node_processes=['namenode'],
                             floating_ip_pool=floating_ip_pool,
                             count=1)
    template_id = self.create_cluster_template(
        name='test-transient-cluster-template-vanilla',
        plugin_config=self.vanilla_config,
        description=('test cluster template for transient cluster '
                     'of Vanilla plugin'),
        cluster_configs={},
        node_groups=[single_node_group],
        net_id=internal_neutron_net)
    try:
        try:
            transient_name = (self.common_config.CLUSTER_NAME +
                              '-transient-' + plugin_config.PLUGIN_NAME)
            self.create_cluster(name=transient_name,
                                plugin_config=plugin_config,
                                cluster_template_id=template_id,
                                description='test transient cluster',
                                cluster_configs={},
                                is_transient=True)
        except Exception:
            # Remove whatever was partially created, then re-raise.
            self.delete_objects(cluster_id=self.cluster_id)
            raise
        cluster_info = self.get_cluster_info(plugin_config)

        # set timeout in seconds (config value is in minutes)
        poll_budget = self.common_config.TRANSIENT_CLUSTER_TIMEOUT * 60
        poll_start = timeutils.utcnow()
        cluster_still_exists = True
        while timeutils.delta_seconds(poll_start,
                                      timeutils.utcnow()) < poll_budget:
            try:
                self.sahara.clusters.get(cluster_info['cluster_id'])
            except sab.APIException as api_ex:
                if 'not found' in api_ex.message:
                    # Cluster was auto-deleted as expected.
                    cluster_still_exists = False
                    break
            time.sleep(2)

        if cluster_still_exists:
            # Cluster outlived the timeout: clean it up and fail the test.
            self.delete_objects(cluster_id=cluster_info['cluster_id'])
            self.fail('Transient cluster has not been deleted within %s '
                      'minutes.'
                      % self.common_config.TRANSIENT_CLUSTER_TIMEOUT)
    finally:
        self.delete_objects(cluster_template_id=template_id)
def _check_decommission(cluster, instances, check_func, timeout):
    """Wait until every instance is reported as decommissioned.

    Polls ``check_func(cluster)`` (a mapping of FQDN -> status) every five
    seconds until all given instances report 'decommissioned'.

    :param cluster: cluster being decommissioned
    :param instances: instances expected to reach 'decommissioned'
    :param check_func: callable returning per-FQDN statuses for the cluster
    :param timeout: maximum time to wait, in seconds
    :raises SaharaException: if decommission does not finish in time

    BUG FIX: the original built ``ex.SaharaException(...)`` on timeout but
    never raised it, so a failed decommission was silently ignored.
    """
    s_time = timeutils.utcnow()
    while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
        statuses = check_func(cluster)
        # Done as soon as every instance reports 'decommissioned'.
        if all(statuses[instance.fqdn()] == 'decommissioned'
               for instance in instances):
            return
        context.sleep(5)
    # Timed out: the exception must actually be raised, not just created.
    raise ex.SaharaException(
        "Cannot finish decommission in %d seconds" % timeout)
def _check_decommission(cluster, instances, check_func, timeout):
    """Wait until every instance is reported as decommissioned.

    Polls ``check_func(cluster)`` (a mapping of FQDN -> status) every five
    seconds until all given instances report 'decommissioned'.

    :param cluster: cluster being decommissioned
    :param instances: instances expected to reach 'decommissioned'
    :param check_func: callable returning per-FQDN statuses for the cluster
    :param timeout: maximum time to wait, in seconds
    :raises DecommissionError: if decommission does not finish in time

    BUG FIX: the original built ``ex.DecommissionError(...)`` on timeout but
    never raised it, so a failed decommission was silently ignored.
    """
    s_time = timeutils.utcnow()
    while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
        statuses = check_func(cluster)
        # Done as soon as every instance reports 'decommissioned'.
        if all(statuses[instance.fqdn()] == 'decommissioned'
               for instance in instances):
            return
        context.sleep(5)
    # Timed out: the exception must actually be raised, not just created.
    raise ex.DecommissionError(
        "Cannot finish decommission of cluster %s in %d seconds" %
        (cluster, timeout))
def terminate_unneeded_clusters(self, ctx):
    """Terminate idle transient clusters.

    Scans all 'Active' clusters and, for each transient cluster with no
    unfinished job executions that has been idle longer than the configured
    minimum, either terminates it (identity API v3, failures logged) or
    marks it 'AwaitingTermination'.

    NOTE(review): the incoming ``ctx`` argument is immediately replaced by a
    fresh admin context — presumably this runs as a periodic task; confirm
    against the scheduler that registers it.

    :param ctx: request context (ignored; an admin context is used instead)
    """
    LOG.debug('Terminating unneeded transient clusters')
    # Run the whole sweep under an admin context; it is cleared at the end.
    ctx = context.get_admin_context()
    context.set_ctx(ctx)
    for cluster in conductor.cluster_get_all(ctx, status='Active'):
        # Only transient clusters are candidates for automatic termination.
        if not cluster.is_transient:
            continue
        # end_time=None selects job executions that have not finished yet;
        # skip clusters that still have work in flight.
        jc = conductor.job_execution_count(ctx, end_time=None,
                                           cluster_id=cluster.id)
        if jc > 0:
            continue
        # Compute how long the cluster has been idle since its last update.
        cluster_updated_at = timeutils.normalize_time(
            timeutils.parse_isotime(cluster.updated_at))
        current_time = timeutils.utcnow()
        spacing = timeutils.delta_seconds(cluster_updated_at, current_time)
        if spacing < CONF.min_transient_cluster_active_time:
            continue
        if CONF.use_identity_api_v3:
            # With Keystone v3 we can act on behalf of the user via a trust,
            # so the cluster can be terminated directly.
            trusts.use_os_admin_auth_token(cluster)
            LOG.info(_LI('Terminating transient cluster %(cluster)s '
                         'with id %(id)s'),
                     {'cluster': cluster.name, 'id': cluster.id})
            try:
                api.terminate_cluster(cluster.id)
            except Exception as e:
                # Best-effort: log and continue with the next cluster so one
                # failing termination does not abort the whole sweep.
                LOG.info(_LI('Failed to terminate transient cluster '
                             '%(cluster)s with id %(id)s: %(error)s.'),
                         {'cluster': cluster.name, 'id': cluster.id,
                          'error': six.text_type(e)})
        else:
            # Without v3 trusts we cannot terminate on the user's behalf;
            # flag the cluster so the owner's own requests can finish it.
            if cluster.status != 'AwaitingTermination':
                conductor.cluster_update(
                    ctx, cluster,
                    {'status': 'AwaitingTermination'})
    context.set_ctx(None)
def _await_agents(instances):
    """Wait until every instance's agent is registered with Cloudera Manager.

    Polls the Cloudera Manager host list every five seconds until all
    instance FQDNs appear in it, or the timeout expires.

    :param instances: instances whose agents must connect (non-empty)
    :raises HadoopProvisionError: if any agent fails to connect in time

    IMPROVEMENT: the expected FQDN list never changes, so it is now computed
    once before the loop instead of on every poll iteration, and the manual
    ``is_ok`` flag loop is replaced with ``all()``.
    """
    api = cu.get_api_client(instances[0].node_group.cluster)
    timeout = 300
    LOG.debug("Waiting %(timeout)s seconds for agent connected to manager" % {
        'timeout': timeout})
    # Loop-invariant: compute the expected hostnames once.
    hostnames = [i.fqdn() for i in instances]
    s_time = timeutils.utcnow()
    while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
        hostnames_to_manager = [h.hostname
                                for h in api.get_all_hosts('full')]
        if all(hostname in hostnames_to_manager for hostname in hostnames):
            break
        context.sleep(5)
    else:
        # while/else: reached only when the timeout expired without a break.
        raise ex.HadoopProvisionError(_("Cloudera agents failed to connect to"
                                        " Cloudera Manager"))
def decommission_dn(nn, inst_to_be_deleted, survived_inst):
    """Decommission datanodes and wait for HDFS to confirm it.

    Writes the instances to decommission into the namenode's exclude file,
    refreshes HDFS, then polls ``dfsadmin -report`` until every excluded
    datanode shows 'Decommissioned'. On success the include/exclude files
    are rewritten to reflect the surviving instances.

    :param nn: namenode instance
    :param inst_to_be_deleted: instances being removed from the cluster
    :param survived_inst: instances that remain in the cluster
    :raises DecommissionError: if decommission does not finish in time

    BUG FIX: the original built ``ex.DecommissionError(...)`` on timeout but
    never raised it, so a failed decommission was silently ignored.
    """
    with remote.get_remote(nn) as r:
        r.write_file_to('/etc/hadoop/dn.excl',
                        utils.generate_fqdn_host_names(inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(nn), "dfsadmin")
        context.sleep(3)

        timeout = config_helper.get_decommissioning_timeout(
            nn.node_group.cluster)
        s_time = timeutils.utcnow()
        all_found = False
        while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
            cmd = r.execute_command(
                "sudo su -c 'hadoop dfsadmin -report' hadoop")
            all_found = True
            datanodes_info = parse_dfs_report(cmd[1])
            # A node still appearing with any status other than
            # 'Decommissioned' means we have to keep waiting.
            for i in inst_to_be_deleted:
                for dn in datanodes_info:
                    if (dn["Name"].startswith(i.internal_ip)) and (
                            dn["Decommission Status"] != "Decommissioned"):
                        all_found = False
                        break
            if all_found:
                # All nodes decommissioned: install the new include list and
                # clear the exclude file.
                r.write_files_to({
                    '/etc/hadoop/dn.incl':
                        utils.generate_fqdn_host_names(survived_inst),
                    '/etc/hadoop/dn.excl': "",
                })
                break
            context.sleep(3)

        if not all_found:
            # Timed out: the exception must actually be raised.
            raise ex.DecommissionError(
                "Cannot finish decommission of cluster %s in %d seconds" %
                (nn.node_group.cluster, timeout))
def decommission_dn(nn, inst_to_be_deleted, survived_inst):
    """Decommission datanodes and wait for HDFS to confirm it.

    Writes the instances to decommission into the namenode's exclude file,
    refreshes HDFS, then polls ``dfsadmin -report`` until every excluded
    datanode shows 'Decommissioned'. On success the include/exclude files
    are rewritten to reflect the surviving instances.

    :param nn: namenode instance
    :param inst_to_be_deleted: instances being removed from the cluster
    :param survived_inst: instances that remain in the cluster
    :raises DecommissionError: if decommission does not finish in time

    BUG FIX: the original built ``ex.DecommissionError(...)`` on timeout but
    never raised it, so a failed decommission was silently ignored.
    """
    with remote.get_remote(nn) as r:
        r.write_file_to('/etc/hadoop/dn.excl',
                        utils.generate_fqdn_host_names(
                            inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(nn), "dfsadmin")
        context.sleep(3)

        timeout = c_helper.get_decommissioning_timeout(
            nn.node_group.cluster)
        s_time = timeutils.utcnow()
        all_found = False
        while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
            cmd = r.execute_command(
                "sudo -u hdfs hadoop dfsadmin -report")
            all_found = True
            datanodes_info = parse_dfs_report(cmd[1])
            # A node still appearing with any status other than
            # 'Decommissioned' means we have to keep waiting.
            for i in inst_to_be_deleted:
                for dn in datanodes_info:
                    if (dn["Name"].startswith(i.internal_ip)) and (
                            dn["Decommission Status"] != "Decommissioned"):
                        all_found = False
                        break
            if all_found:
                # All nodes decommissioned: install the new include list and
                # clear the exclude file.
                r.write_files_to({'/etc/hadoop/dn.incl': utils.
                                  generate_fqdn_host_names(survived_inst),
                                  '/etc/hadoop/dn.excl': "",
                                  })
                break
            context.sleep(3)

        if not all_found:
            # Timed out: the exception must actually be raised.
            raise ex.DecommissionError(
                "Cannot finish decommission of cluster %s in %d seconds" %
                (nn.node_group.cluster, timeout))
def transient_cluster_testing(self, plugin_config, floating_ip_pool,
                              internal_neutron_net):
    """Integration test: EDP works on a transient cluster and the cluster
    is deleted automatically afterwards.

    Creates a two-node transient cluster, runs a Pig EDP job on it, then
    polls the Sahara API until the cluster disappears; fails the test if
    it still exists after TRANSIENT_CLUSTER_TIMEOUT minutes.

    :param plugin_config: plugin configuration used to create the cluster
    :param floating_ip_pool: floating IP pool for the node groups
    :param internal_neutron_net: Neutron network id for the cluster

    BUG FIX: the EDP fixture files were read via ``open(...).read()``
    without ever being closed, leaking two file handles per run; they are
    now read inside ``with`` blocks.
    """
    cluster_template_id = self.create_cluster_template(
        name='test-transient-cluster-template-vanilla',
        plugin_config=self.vanilla_config,
        description=('test cluster template for transient cluster '
                     'of Vanilla plugin'),
        cluster_configs={},
        node_groups=[
            dict(
                name='master-node',
                flavor_id=self.flavor_id,
                node_processes=['namenode', 'oozie', 'jobtracker'],
                floating_ip_pool=floating_ip_pool,
                count=1),
            dict(
                name='worker-node',
                flavor_id=self.flavor_id,
                node_processes=['datanode', 'tasktracker'],
                floating_ip_pool=floating_ip_pool,
                count=1)
        ],
        net_id=internal_neutron_net
    )
    try:
        # create a transient cluster
        try:
            cluster_name = (self.common_config.CLUSTER_NAME +
                            '-transient-' + plugin_config.PLUGIN_NAME)
            self.create_cluster(
                name=cluster_name,
                plugin_config=plugin_config,
                cluster_template_id=cluster_template_id,
                description='test transient cluster',
                cluster_configs={},
                is_transient=True
            )
        except Exception:
            # Clean up anything partially created before re-raising.
            self.delete_objects(cluster_id=self.cluster_id)
            raise
        # check EDP
        path = 'sahara/tests/integration/tests/resources/'
        # Read the fixtures with context managers so the handles are closed.
        with open(path + 'edp-job.pig') as pig_job_file:
            pig_job_data = pig_job_file.read()
        with open(path + 'edp-lib.jar') as pig_lib_file:
            pig_lib_data = pig_lib_file.read()
        self.edp_testing(job_type=utils_edp.JOB_TYPE_PIG,
                         job_data_list=[{'pig': pig_job_data}],
                         lib_data_list=[{'jar': pig_lib_data}])
        # set timeout in seconds (config value is in minutes)
        timeout = self.common_config.TRANSIENT_CLUSTER_TIMEOUT * 60
        s_time = timeutils.utcnow()
        raise_failure = True
        # wait for cluster deleting
        while timeutils.delta_seconds(
                s_time, timeutils.utcnow()) < timeout:
            try:
                self.sahara.clusters.get(self.cluster_id)
            except sab.APIException as api_ex:
                if 'not found' in api_ex.message:
                    # Cluster was auto-deleted as expected.
                    raise_failure = False
                    break
            time.sleep(2)
        if raise_failure:
            # Cluster survived the timeout: delete it manually and fail.
            self.delete_objects(cluster_id=self.cluster_id)
            self.fail('Transient cluster has not been deleted within %s '
                      'minutes.'
                      % self.common_config.TRANSIENT_CLUSTER_TIMEOUT)
    finally:
        self.delete_objects(cluster_template_id=cluster_template_id)