def _detach_volume(instance, volume_id):
    """Ask Nova to detach a volume, then poll Cinder until it is released.

    A failure of the detach request is logged and the polling still takes
    place.  The volume is re-read every two seconds until its status is
    'available' or 'error'; if ``CONF.detach_volume_timeout`` seconds pass
    first, a warning with the last observed status is logged.

    :param instance: instance the volume is attached to
    :param volume_id: id of the volume to detach
    """
    volume = cinder.get_volume(volume_id)
    try:
        LOG.debug("Detaching volume %s from instance %s" % (
            volume_id, instance.instance_name))
        nova.client().volumes.delete_server_volume(instance.instance_id,
                                                   volume_id)
    except Exception:
        # Best effort: the wait below still runs after a failed request.
        LOG.exception(_LE("Can't detach volume %s"), volume.id)

    wait_limit = CONF.detach_volume_timeout
    LOG.debug("Waiting %d seconds to detach %s volume" % (wait_limit,
                                                          volume_id))

    started_at = tu.utcnow()
    settled_statuses = ['available', 'error']
    while tu.delta_seconds(started_at, tu.utcnow()) < wait_limit:
        volume = cinder.get_volume(volume_id)
        if volume.status in settled_statuses:
            LOG.debug("Volume %s has been detached" % volume_id)
            return
        context.sleep(2)

    # Reached only when the polling loop ran out of time.
    LOG.warn(_LW("Can't detach volume %(volume)s. "
                 "Current status of volume: %(status)s"),
             {'volume': volume_id, 'status': volume.status})
def _start_cloudera_manager(cluster):
    """Start Cloudera Manager and wait until its API port accepts connections.

    The manager database and the manager itself are started over the
    manager node's remote, then a Telnet connection to ``CM_API_PORT`` is
    attempted every two seconds for up to 300 seconds.

    :param cluster: cluster whose manager node should be started
    :raises ex.HadoopProvisionError: if no connection succeeds in time
    """
    manager = pu.get_manager(cluster)
    with manager.remote() as r:
        cmd.start_cloudera_db(r)
        cmd.start_manager(r)

    timeout = 300
    LOG.debug("Waiting %(timeout)s seconds for Manager to start : " % {
        'timeout': timeout})

    started_at = timeutils.utcnow()
    while timeutils.delta_seconds(started_at, timeutils.utcnow()) < timeout:
        try:
            probe = telnetlib.Telnet(manager.management_ip, CM_API_PORT)
            probe.close()
        except IOError:
            # Port not reachable yet; retry shortly.
            context.sleep(2)
        else:
            break
    else:
        # The loop expired without a single successful connection.
        details = {'timeout': timeout / 60,
                   'node': manager.management_ip,
                   'cluster': cluster.name}
        message = _("Cloudera Manager failed to start in %(timeout)s minutes "
                    "on node '%(node)s' of cluster '%(cluster)s'") % details
        raise ex.HadoopProvisionError(message)

    LOG.info(_LI("Cloudera Manager has been started"))
def _inner():
    """Call ``self.f`` repeatedly, aiming for one call per ``interval``.

    ``self``, ``initial_delay``, ``interval`` and ``done`` are taken from
    the enclosing scope.  The outcome is reported through ``done``:
    ``True`` on a normal stop, the ``retvalue`` of ``LoopingCallDone`` when
    the callable raises it, or the exception info for any other exception.
    """
    if initial_delay:
        greenthread.sleep(initial_delay)

    try:
        while self._running:
            began = timeutils.utcnow()
            self.f(*self.args, **self.kw)
            finished = timeutils.utcnow()
            if not self._running:
                break

            delay = interval - timeutils.delta_seconds(began, finished)
            pause = delay
            if delay <= 0:
                LOG.warn(_LW('task run outlasted interval by %s sec') %
                         -delay)
                # Never sleep for a negative amount of time.
                pause = 0
            greenthread.sleep(pause)
    except LoopingCallDone as e:
        self.stop()
        done.send(e.retvalue)
    except Exception:
        LOG.exception(_LE('in fixed duration looping call'))
        done.send_exception(*sys.exc_info())
        return
    else:
        done.send(True)
def transient_cluster_testing(self, plugin_config, floating_ip_pool,
                              internal_neutron_net):
    """Check that a transient cluster disappears within the configured time.

    A single-node cluster template is created, a transient cluster is
    launched from it, and the cluster is then polled every two seconds
    until the API reports it as 'not found'.  The test fails if that does
    not happen within ``TRANSIENT_CLUSTER_TIMEOUT`` minutes.  The cluster
    template is always deleted at the end.
    """
    single_node_group = dict(
        name='single-node',
        flavor_id=self.flavor_id,
        node_processes=['namenode'],
        floating_ip_pool=floating_ip_pool,
        count=1)
    cluster_template_id = self.create_cluster_template(
        name='test-transient-cluster-template-vanilla',
        plugin_config=self.vanilla_config,
        description=('test cluster template for transient cluster '
                     'of Vanilla plugin'),
        cluster_configs={},
        node_groups=[single_node_group],
        net_id=internal_neutron_net
    )

    try:
        try:
            cluster_name = (self.common_config.CLUSTER_NAME + '-transient-'
                            + plugin_config.PLUGIN_NAME)
            self.create_cluster(
                name=cluster_name,
                plugin_config=plugin_config,
                cluster_template_id=cluster_template_id,
                description='test transient cluster',
                cluster_configs={},
                is_transient=True
            )
        except Exception:
            self.delete_objects(cluster_id=self.cluster_id)
            raise

        cluster_info = self.get_cluster_info(plugin_config)

        # The configured timeout is in minutes; convert it to seconds.
        wait_limit = self.common_config.TRANSIENT_CLUSTER_TIMEOUT * 60
        started_at = timeutils.utcnow()
        while timeutils.delta_seconds(
                started_at, timeutils.utcnow()) < wait_limit:
            try:
                self.sahara.clusters.get(cluster_info['cluster_id'])
            except sab.APIException as api_ex:
                if 'not found' in api_ex.message:
                    # The cluster is gone, which is the expected outcome.
                    break
            time.sleep(2)
        else:
            # Timed out while the cluster still existed.
            self.delete_objects(cluster_id=cluster_info['cluster_id'])
            self.fail('Transient cluster has not been deleted within %s '
                      'minutes.'
                      % self.common_config.TRANSIENT_CLUSTER_TIMEOUT)
    finally:
        self.delete_objects(cluster_template_id=cluster_template_id)
def transient_cluster_testing(self, plugin_config, floating_ip_pool,
                              internal_neutron_net):
    """Verify that a transient cluster is removed before the timeout.

    Creates a one-node cluster template and a transient cluster based on
    it, then polls the cluster every two seconds.  The wait ends
    successfully once the API answers with a 'not found' error.  Otherwise
    the cluster is deleted explicitly and the test fails.  The template is
    removed in every case.
    """
    template_args = dict(
        name='test-transient-cluster-template-vanilla',
        plugin_config=self.vanilla_config,
        description=('test cluster template for transient cluster '
                     'of Vanilla plugin'),
        cluster_configs={},
        node_groups=[
            dict(name='single-node',
                 flavor_id=self.flavor_id,
                 node_processes=['namenode'],
                 floating_ip_pool=floating_ip_pool,
                 count=1)
        ],
        net_id=internal_neutron_net)
    cluster_template_id = self.create_cluster_template(**template_args)

    try:
        try:
            cluster_name = (self.common_config.CLUSTER_NAME + '-transient-'
                            + plugin_config.PLUGIN_NAME)
            self.create_cluster(name=cluster_name,
                                plugin_config=plugin_config,
                                cluster_template_id=cluster_template_id,
                                description='test transient cluster',
                                cluster_configs={},
                                is_transient=True)
        except Exception:
            self.delete_objects(cluster_id=self.cluster_id)
            raise

        cluster_info = self.get_cluster_info(plugin_config)
        cluster_id = cluster_info['cluster_id']

        # TRANSIENT_CLUSTER_TIMEOUT is given in minutes.
        timeout_seconds = self.common_config.TRANSIENT_CLUSTER_TIMEOUT * 60
        poll_start = timeutils.utcnow()
        while timeutils.delta_seconds(poll_start,
                                      timeutils.utcnow()) < timeout_seconds:
            try:
                self.sahara.clusters.get(cluster_id)
            except sab.APIException as api_ex:
                if 'not found' in api_ex.message:
                    break
            time.sleep(2)
        else:
            # No 'not found' answer arrived in time.
            self.delete_objects(cluster_id=cluster_id)
            self.fail('Transient cluster has not been deleted within %s '
                      'minutes.'
                      % self.common_config.TRANSIENT_CLUSTER_TIMEOUT)
    finally:
        self.delete_objects(cluster_template_id=cluster_template_id)
def terminate_unneeded_clusters(self, ctx):
    """Terminate active transient clusters that have no unfinished jobs.

    The ``ctx`` argument is ignored; an admin context is created and
    installed for the duration of the call.  A cluster is skipped when it
    is not transient, when it still has job executions without an end time,
    or when it was updated less than
    ``CONF.min_transient_cluster_active_time`` seconds ago.  With identity
    API v3 the cluster is terminated directly; otherwise it is only moved
    to the 'AwaitingTermination' status.
    """
    LOG.debug('Terminating unneeded clusters')
    ctx = context.get_admin_context()
    context.set_ctx(ctx)
    try:
        for cluster in conductor.cluster_get_all(ctx, status='Active'):
            if not cluster.is_transient:
                continue

            jc = conductor.job_execution_count(ctx,
                                               end_time=None,
                                               cluster_id=cluster.id)
            if jc > 0:
                continue

            cluster_updated_at = timeutils.normalize_time(
                timeutils.parse_isotime(cluster.updated_at))
            current_time = timeutils.utcnow()
            spacing = timeutils.delta_seconds(cluster_updated_at,
                                              current_time)
            if spacing < CONF.min_transient_cluster_active_time:
                continue

            if CONF.use_identity_api_v3:
                trusts.use_os_admin_auth_token(cluster)
                api.terminate_cluster(cluster.id)
                LOG.debug('Terminated cluster %s with id %s' %
                          (cluster.name, cluster.id))
            elif cluster.status != 'AwaitingTermination':
                conductor.cluster_update(
                    ctx,
                    cluster,
                    {'status': 'AwaitingTermination'})
    finally:
        # Always drop the admin context, even if a call above raised;
        # otherwise it would stay installed after this task.
        context.set_ctx(None)
def test_cluster_terminate(self, terminate_cluster, get_job_status):
    """Only a cluster whose job executions have all ended is terminated.

    Cluster '1' gets one job execution with an end time, cluster '2' gets
    two without one.  After the periodic task runs, ``terminate_cluster``
    must have been called exactly once, for cluster '1'.
    """
    self.override_config("use_identity_api_v3", True)
    ctx = context.ctx()
    job = self.api.job_create(ctx, te.SAMPLE_JOB)
    ds = self.api.data_source_create(ctx, te.SAMPLE_DATA_SOURCE)

    # The same dict is reused for both clusters; only id and name change.
    cluster_values = tc.SAMPLE_CLUSTER.copy()
    cluster_values.update({"status": "Active", "id": "1", "name": "1"})
    cluster_values['updated_at'] = timeutils.utcnow()
    self.api.cluster_create(ctx, cluster_values)
    cluster_values.update({"id": "2", "name": "2"})
    self.api.cluster_create(ctx, cluster_values)

    job_executions = [
        {"end_time": datetime.datetime.now(), "id": 1, "cluster_id": "1"},
        {"end_time": None, "id": 2, "cluster_id": "2"},
        {"end_time": None, "id": 3, "cluster_id": "2"},
    ]
    for values in job_executions:
        self._create_job_execution(values, job, ds, ds)

    p.SaharaPeriodicTasks().terminate_unneeded_clusters(None)

    self.assertEqual(terminate_cluster.call_count, 1)
    terminate_cluster.assert_has_calls([mock.call(u'1')])
def test_cluster_terminate(self, terminate_cluster, utcnow):
    """Only a cluster whose job executions have all ended is terminated.

    ``utcnow`` is set to a fixed time while the fixtures are created and
    moved one minute forward before the periodic task runs.  Cluster '1'
    has a finished job execution and cluster '2' has two unfinished ones,
    so ``terminate_cluster`` must be called exactly once, for '1'.
    """
    utcnow.return_value = datetime.datetime(2005, 2, 1, 0, 0)

    ctx = context.ctx()
    job = self.api.job_create(ctx, te.SAMPLE_JOB)
    ds = self.api.data_source_create(ctx, te.SAMPLE_DATA_SOURCE)

    for cluster_id in ('1', '2'):
        self._make_cluster(cluster_id)

    job_executions = [
        {"end_time": timeutils.utcnow(), "id": 1, "cluster_id": "1"},
        {"end_time": None, "id": 2, "cluster_id": "2"},
        {"end_time": None, "id": 3, "cluster_id": "2"},
    ]
    for values in job_executions:
        self._create_job_execution(values, job, ds, ds)

    # Advance the reported time by one minute before running the task.
    utcnow.return_value = datetime.datetime(2005, 2, 1, 0, 1)
    p.SaharaPeriodicTasks().terminate_unneeded_clusters(None)

    self.assertEqual(terminate_cluster.call_count, 1)
    terminate_cluster.assert_has_calls([mock.call(u'1')])
def test_cluster_terminate(self, terminate_cluster, get_job_status):
    """Check that only the cluster without unfinished jobs is terminated.

    Two active clusters are created.  Cluster '1' has a job execution with
    an end time; cluster '2' has two job executions with none.  The
    periodic task is expected to call ``terminate_cluster`` once, with the
    id of cluster '1'.
    """
    self.override_config("use_identity_api_v3", True)
    ctx = context.ctx()
    job = self.api.job_create(ctx, te.SAMPLE_JOB)
    ds = self.api.data_source_create(ctx, te.SAMPLE_DATA_SOURCE)

    c = tc.SAMPLE_CLUSTER.copy()
    c.update({"status": "Active", "id": "1", "name": "1"})
    c['updated_at'] = timeutils.utcnow()
    self.api.cluster_create(ctx, c)
    # The second cluster reuses the same values with a new id and name.
    c.update({"id": "2", "name": "2"})
    self.api.cluster_create(ctx, c)

    finished_job = {"end_time": datetime.datetime.now(),
                    "id": 1,
                    "cluster_id": "1"}
    self._create_job_execution(finished_job, job, ds, ds)
    for execution_id in (2, 3):
        running_job = {"end_time": None,
                       "id": execution_id,
                       "cluster_id": "2"}
        self._create_job_execution(running_job, job, ds, ds)

    p.SaharaPeriodicTasks().terminate_unneeded_clusters(None)

    self.assertEqual(terminate_cluster.call_count, 1)
    terminate_cluster.assert_has_calls([mock.call(u'1')])
def _check_decommission(cluster, instances, check_func, timeout):
    """Wait until every given instance is reported as decommissioned.

    ``check_func(cluster)`` is called every five seconds; its result is
    indexed by each instance's FQDN and compared with 'decommissioned'.

    :param cluster: cluster passed through to ``check_func``
    :param instances: instances whose status is awaited
    :param check_func: callable returning a mapping of FQDN to status
    :param timeout: maximum time to wait, in seconds
    :raises ex.SaharaException: if the timeout expires first
    """
    s_time = timeutils.utcnow()
    while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
        statuses = check_func(cluster)
        if all(statuses[instance.fqdn()] == 'decommissioned'
               for instance in instances):
            return
        context.sleep(5)

    # The original built this exception without raising it, so a timeout
    # went unnoticed by the caller.
    raise ex.SaharaException("Cannot finish decommission in %d seconds" %
                             timeout)
def _check_decommission(cluster, instances, check_func, timeout):
    """Wait until every given instance is reported as decommissioned.

    ``check_func(cluster)`` is called every five seconds; its result is
    indexed by each instance's FQDN and compared with 'decommissioned'.

    :param cluster: cluster passed through to ``check_func`` and named in
        the error message
    :param instances: instances whose status is awaited
    :param check_func: callable returning a mapping of FQDN to status
    :param timeout: maximum time to wait, in seconds
    :raises ex.DecommissionError: if the timeout expires first
    """
    s_time = timeutils.utcnow()
    while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
        statuses = check_func(cluster)
        if all(statuses[instance.fqdn()] == 'decommissioned'
               for instance in instances):
            return
        context.sleep(5)

    # The original built this exception without raising it, so a timeout
    # went unnoticed by the caller.
    raise ex.DecommissionError(
        "Cannot finish decommission of cluster %s in %d seconds" %
        (cluster, timeout))
def soft_delete(self, synchronize_session='evaluate'):
    """Mark the matched rows as deleted through a single ``update`` call.

    ``deleted`` is set to the row's ``id`` column, ``updated_at`` is
    assigned its own column value, and ``deleted_at`` is set to the current
    UTC time.

    :param synchronize_session: forwarded unchanged to ``self.update``
    :returns: whatever ``self.update`` returns
    """
    soft_delete_values = {
        'deleted': literal_column('id'),
        # Writing the column back onto itself leaves its value unchanged.
        'updated_at': literal_column('updated_at'),
        'deleted_at': timeutils.utcnow(),
    }
    return self.update(soft_delete_values,
                       synchronize_session=synchronize_session)
def _make_cluster(self, id_name):
    """Create an 'Active' sample cluster using ``id_name`` as id and name.

    The cluster values start as a copy of ``tc.SAMPLE_CLUSTER``;
    ``updated_at`` is set to the current UTC time.
    """
    ctx = context.ctx()
    cluster_values = tc.SAMPLE_CLUSTER.copy()
    cluster_values.update({"status": "Active",
                           "id": id_name,
                           "name": id_name})
    cluster_values['updated_at'] = timeutils.utcnow()
    self.api.cluster_create(ctx, cluster_values)
def drop_old_duplicate_entries_from_table(migrate_engine, table_name,
                                          use_soft_delete, *uc_column_names):
    """Drop all old rows having the same values for columns in uc_columns.

    For every group of rows sharing the same values in the given columns,
    the row with the biggest id is kept.  The other rows that are not
    already soft-deleted are removed, or marked as deleted when
    ``use_soft_delete`` is True.

    :param migrate_engine: Sqlalchemy engine
    :param table_name: Table with duplicates
    :param use_soft_delete: If True - values will be marked as `deleted`,
                            if False - values will be removed from table
    :param uc_column_names: Unique constraint columns
    """
    meta = MetaData()
    meta.bind = migrate_engine
    table = Table(table_name, meta, autoload=True)

    group_columns = [table.c[name] for name in uc_column_names]
    selected_columns = [func.max(table.c.id)] + group_columns

    duplicates_query = sqlalchemy.sql.select(
        selected_columns,
        group_by=group_columns,
        having=func.count(table.c.id) > 1
    )

    for newest in migrate_engine.execute(duplicates_query):
        # NOTE(boris-42): Do not remove row that has the biggest ID.
        is_none = None  # workaround for pyflakes
        condition = table.c.id != newest[0]
        condition = condition & (table.c.deleted_at == is_none)
        for name in uc_column_names:
            condition = condition & (table.c[name] == newest[name])

        # Log every row that the statement below is going to affect.
        doomed_query = sqlalchemy.sql.select([table.c.id]).where(condition)
        for doomed in migrate_engine.execute(doomed_query).fetchall():
            LOG.info(
                _LI("Deleting duplicated row with id: %(id)s from table: "
                    "%(table)s") % {'id': doomed[0], 'table': table_name}
            )

        if use_soft_delete:
            soft_delete_values = {
                "deleted": literal_column("id"),
                "updated_at": literal_column("updated_at"),
                "deleted_at": timeutils.utcnow(),
            }
            statement = (table.update()
                         .where(condition)
                         .values(soft_delete_values))
        else:
            statement = table.delete().where(condition)
        migrate_engine.execute(statement)
def _await_agents(instances):
    """Wait until every instance's host is known to Cloudera Manager.

    The manager's host list is fetched every five seconds, for up to 300
    seconds, and compared with the FQDNs of the given instances.

    :param instances: instances to wait for; the API client is obtained
        from the cluster of the first one
    :raises ex.HadoopProvisionError: if some host is still missing when
        the timeout expires
    """
    api = cu.get_api_client(instances[0].node_group.cluster)
    timeout = 300
    LOG.debug("Waiting %(timeout)s seconds for agent connected to manager" % {
        'timeout': timeout})

    started_at = timeutils.utcnow()
    while timeutils.delta_seconds(started_at, timeutils.utcnow()) < timeout:
        expected_hosts = [inst.fqdn() for inst in instances]
        known_hosts = [host.hostname for host in api.get_all_hosts('full')]
        if all(name in known_hosts for name in expected_hosts):
            break
        context.sleep(5)
    else:
        # The loop ran out of time with at least one host still missing.
        raise ex.HadoopProvisionError(_("Cloudera agents failed to connect to"
                                        " Cloudera Manager"))
def drop_old_duplicate_entries_from_table(migrate_engine, table_name,
                                          use_soft_delete, *uc_column_names):
    """Drop all old rows having the same values for columns in uc_columns.

    Within each group of rows that share the same values in the given
    columns, the row with the biggest id survives.  The remaining rows
    that are not soft-deleted yet are deleted, or marked as deleted when
    ``use_soft_delete`` is True.

    :param migrate_engine: Sqlalchemy engine
    :param table_name: Table with duplicates
    :param use_soft_delete: If True - values will be marked as `deleted`,
                            if False - values will be removed from table
    :param uc_column_names: Unique constraint columns
    """
    meta = MetaData()
    meta.bind = migrate_engine
    table = Table(table_name, meta, autoload=True)

    grouping = [table.c[name] for name in uc_column_names]
    selection = [func.max(table.c.id)] + grouping
    duplicates_query = select(selection,
                              group_by=grouping,
                              having=func.count(table.c.id) > 1)

    for keeper in migrate_engine.execute(duplicates_query):
        # NOTE(boris-42): Do not remove row that has the biggest ID.
        is_none = None  # workaround for pyflakes
        stale_rows = table.c.id != keeper[0]
        stale_rows = stale_rows & (table.c.deleted_at == is_none)
        for name in uc_column_names:
            stale_rows = stale_rows & (table.c[name] == keeper[name])

        # Report each row that is about to be dropped.
        stale_ids_query = select([table.c.id]).where(stale_rows)
        for stale in migrate_engine.execute(stale_ids_query).fetchall():
            LOG.info(_("Deleting duplicated row with id: %(id)s from table: "
                       "%(table)s") % {'id': stale[0], 'table': table_name})

        if use_soft_delete:
            marked_deleted = {
                'deleted': literal_column('id'),
                'updated_at': literal_column('updated_at'),
                'deleted_at': timeutils.utcnow(),
            }
            statement = table.update().where(stale_rows).values(
                marked_deleted)
        else:
            statement = table.delete().where(stale_rows)
        migrate_engine.execute(statement)
def decommission_dn(nn, inst_to_be_deleted, survived_inst):
    """Decommission datanodes and wait until HDFS reports them as done.

    The FQDNs of the instances being removed are written to
    ``/etc/hadoop/dn.excl`` on ``nn`` and the node list is refreshed.  The
    dfsadmin report is then polled every three seconds until none of the
    removed instances is listed with a status other than 'Decommissioned'.
    On success the include file is rewritten with the surviving instances
    and the exclude file is emptied.

    :param nn: node on which the files are written and commands are run
    :param inst_to_be_deleted: instances being decommissioned
    :param survived_inst: instances that remain in the cluster
    :raises ex.DecommissionError: if the configured decommissioning
        timeout expires first
    """
    with remote.get_remote(nn) as r:
        r.write_file_to('/etc/hadoop/dn.excl',
                        utils.generate_fqdn_host_names(
                            inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(nn), "dfsadmin")
        context.sleep(3)

        timeout = c_helper.get_decommissioning_timeout(
            nn.node_group.cluster)
        s_time = timeutils.utcnow()
        all_found = False

        while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
            cmd = r.execute_command(
                "sudo -u hdfs hadoop dfsadmin -report")
            # cmd[1] is handed to the report parser (presumably the
            # command's stdout - confirm against execute_command).
            datanodes_info = parse_dfs_report(cmd[1])
            all_found = not any(
                dn["Name"].startswith(i.internal_ip)
                and dn["Decommission Status"] != "Decommissioned"
                for i in inst_to_be_deleted
                for dn in datanodes_info)

            if all_found:
                r.write_files_to({
                    '/etc/hadoop/dn.incl':
                        utils.generate_fqdn_host_names(survived_inst),
                    '/etc/hadoop/dn.excl': "",
                })
                break
            context.sleep(3)

        if not all_found:
            # The original built this exception without raising it, so a
            # timed-out decommission was silently treated as a success.
            raise ex.DecommissionError(
                "Cannot finish decommission of cluster %s in %d seconds" %
                (nn.node_group.cluster, timeout))
def decommission_dn(nn, inst_to_be_deleted, survived_inst):
    """Decommission datanodes and wait until HDFS reports them as done.

    The FQDNs of the instances being removed are written to
    ``/etc/hadoop/dn.excl`` on ``nn`` and the node list is refreshed.  The
    dfsadmin report is then polled every three seconds until none of the
    removed instances is listed with a status other than 'Decommissioned'.
    On success the include file is rewritten with the surviving instances
    and the exclude file is emptied.

    :param nn: node on which the files are written and commands are run
    :param inst_to_be_deleted: instances being decommissioned
    :param survived_inst: instances that remain in the cluster
    :raises ex.DecommissionError: if the configured decommissioning
        timeout expires first
    """
    with remote.get_remote(nn) as r:
        r.write_file_to('/etc/hadoop/dn.excl',
                        utils.generate_fqdn_host_names(inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(nn), "dfsadmin")
        context.sleep(3)

        timeout = config_helper.get_decommissioning_timeout(
            nn.node_group.cluster)
        s_time = timeutils.utcnow()
        all_found = False

        while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
            cmd = r.execute_command(
                "sudo su -c 'hadoop dfsadmin -report' hadoop")
            # cmd[1] is handed to the report parser (presumably the
            # command's stdout - confirm against execute_command).
            datanodes_info = parse_dfs_report(cmd[1])
            all_found = not any(
                dn["Name"].startswith(i.internal_ip)
                and dn["Decommission Status"] != "Decommissioned"
                for i in inst_to_be_deleted
                for dn in datanodes_info)

            if all_found:
                r.write_files_to({
                    '/etc/hadoop/dn.incl':
                        utils.generate_fqdn_host_names(survived_inst),
                    '/etc/hadoop/dn.excl': "",
                })
                break
            context.sleep(3)

        if not all_found:
            # The original built this exception without raising it, so a
            # timed-out decommission was silently treated as a success.
            raise ex.DecommissionError(
                "Cannot finish decommission of cluster %s in %d seconds" %
                (nn.node_group.cluster, timeout))
def terminate_unneeded_clusters(self, ctx):
    """Terminate active transient clusters that have no unfinished jobs.

    The ``ctx`` argument is ignored; an admin context is created and
    installed for the duration of the call.  A cluster is skipped when it
    is not transient, when it still has job executions without an end time,
    or when it was updated less than
    ``CONF.min_transient_cluster_active_time`` seconds ago.  With identity
    API v3 the cluster is terminated and a failure to do so is only logged;
    otherwise the cluster is moved to the 'AwaitingTermination' status.
    """
    LOG.debug('Terminating unneeded transient clusters')
    ctx = context.get_admin_context()
    context.set_ctx(ctx)
    try:
        for cluster in conductor.cluster_get_all(ctx, status='Active'):
            if not cluster.is_transient:
                continue

            jc = conductor.job_execution_count(ctx,
                                               end_time=None,
                                               cluster_id=cluster.id)
            if jc > 0:
                continue

            cluster_updated_at = timeutils.normalize_time(
                timeutils.parse_isotime(cluster.updated_at))
            current_time = timeutils.utcnow()
            spacing = timeutils.delta_seconds(cluster_updated_at,
                                              current_time)
            if spacing < CONF.min_transient_cluster_active_time:
                continue

            if CONF.use_identity_api_v3:
                trusts.use_os_admin_auth_token(cluster)

                LOG.info(_LI('Terminating transient cluster %(cluster)s '
                             'with id %(id)s'),
                         {'cluster': cluster.name, 'id': cluster.id})

                try:
                    api.terminate_cluster(cluster.id)
                except Exception as e:
                    # One failing cluster must not stop the others.
                    LOG.info(_LI('Failed to terminate transient cluster '
                                 '%(cluster)s with id %(id)s: %(error)s.'),
                             {'cluster': cluster.name,
                              'id': cluster.id,
                              'error': six.text_type(e)})
            elif cluster.status != 'AwaitingTermination':
                conductor.cluster_update(
                    ctx,
                    cluster,
                    {'status': 'AwaitingTermination'})
    finally:
        # Always drop the admin context, even if a conductor or trusts
        # call raised; otherwise it would stay installed after this task.
        context.set_ctx(None)
def soft_delete(self, synchronize_session='evaluate'):
    """Soft-delete the matched rows with one ``update`` call.

    The update sets ``deleted`` to the ``id`` column, assigns
    ``updated_at`` its own column value and stamps ``deleted_at`` with the
    current UTC time.

    :param synchronize_session: passed straight through to ``self.update``
    :returns: the result of ``self.update``
    """
    changes = {}
    changes['deleted'] = literal_column('id')
    # Assigning the column to itself leaves its value unchanged.
    changes['updated_at'] = literal_column('updated_at')
    changes['deleted_at'] = timeutils.utcnow()
    return self.update(changes, synchronize_session=synchronize_session)
def soft_delete(self, session=None):
    """Flag this object as deleted and save it.

    ``deleted`` receives the object's own id and ``deleted_at`` the current
    UTC time; the object is then saved with the given session.

    :param session: forwarded to ``self.save``
    """
    self.deleted = self.id
    deletion_time = timeutils.utcnow()
    self.deleted_at = deletion_time
    self.save(session=session)
class TimestampMixin(object):
    """Mixin declaring ``created_at`` and ``updated_at`` DateTime columns."""

    # Both callables are lambdas, so ``timeutils.utcnow`` is looked up each
    # time they are invoked instead of being bound when the class is defined.
    # Column default: the current UTC time.
    created_at = Column(DateTime, default=lambda: timeutils.utcnow())
    # Column onupdate value: the current UTC time.
    updated_at = Column(DateTime, onupdate=lambda: timeutils.utcnow())
def transient_cluster_testing(self, plugin_config, floating_ip_pool,
                              internal_neutron_net):
    """Run an EDP job on a transient cluster and wait for its removal.

    A master/worker cluster template is created and a transient cluster is
    launched from it.  A Pig job is run through ``edp_testing``, then the
    cluster is polled every two seconds until the API reports it as
    'not found'.  The test fails if that does not happen within
    ``TRANSIENT_CLUSTER_TIMEOUT`` minutes.  The cluster template is always
    deleted at the end.
    """
    cluster_template_id = self.create_cluster_template(
        name='test-transient-cluster-template-vanilla',
        plugin_config=self.vanilla_config,
        description=('test cluster template for transient cluster '
                     'of Vanilla plugin'),
        cluster_configs={},
        node_groups=[
            dict(
                name='master-node',
                flavor_id=self.flavor_id,
                node_processes=['namenode', 'oozie', 'jobtracker'],
                floating_ip_pool=floating_ip_pool,
                count=1),
            dict(
                name='worker-node',
                flavor_id=self.flavor_id,
                node_processes=['datanode', 'tasktracker'],
                floating_ip_pool=floating_ip_pool,
                count=1)
        ],
        net_id=internal_neutron_net
    )

    try:
        # create a transient cluster
        try:
            cluster_name = (self.common_config.CLUSTER_NAME + '-transient-'
                            + plugin_config.PLUGIN_NAME)
            self.create_cluster(
                name=cluster_name,
                plugin_config=plugin_config,
                cluster_template_id=cluster_template_id,
                description='test transient cluster',
                cluster_configs={},
                is_transient=True
            )
        except Exception:
            self.delete_objects(cluster_id=self.cluster_id)
            raise

        # check EDP
        path = 'sahara/tests/integration/tests/resources/'
        # Read the fixtures inside ``with`` blocks so the file handles are
        # closed deterministically instead of being leaked.
        # NOTE(review): the jar is still opened in the default text mode,
        # as before - confirm whether binary mode is needed on Python 3.
        with open(path + 'edp-job.pig') as pig_job_file:
            pig_job_data = pig_job_file.read()
        with open(path + 'edp-lib.jar') as pig_lib_file:
            pig_lib_data = pig_lib_file.read()
        self.edp_testing(job_type=utils_edp.JOB_TYPE_PIG,
                         job_data_list=[{'pig': pig_job_data}],
                         lib_data_list=[{'jar': pig_lib_data}])

        # set timeout in seconds
        timeout = self.common_config.TRANSIENT_CLUSTER_TIMEOUT * 60
        s_time = timeutils.utcnow()
        raise_failure = True
        # wait for cluster deleting
        while timeutils.delta_seconds(
                s_time, timeutils.utcnow()) < timeout:
            try:
                self.sahara.clusters.get(self.cluster_id)
            except sab.APIException as api_ex:
                if 'not found' in api_ex.message:
                    raise_failure = False
                    break
            time.sleep(2)

        if raise_failure:
            self.delete_objects(cluster_id=self.cluster_id)
            self.fail('Transient cluster has not been deleted within %s '
                      'minutes.'
                      % self.common_config.TRANSIENT_CLUSTER_TIMEOUT)
    finally:
        self.delete_objects(cluster_template_id=cluster_template_id)