def _write_result(self, status, description):
    cond.cluster_health_check_update(
        context.ctx(), self.health_check_id,
        {'status': status, 'description': description})
    self.health_check = cond.cluster_health_check_get(
        context.ctx(), self.health_check_id)
    sender.health_notify(self.cluster, self.health_check)
def execute_job(job_id, data):
    # Elements common to all job types
    cluster_id = data['cluster_id']
    configs = data.get('job_configs', {})
    interface = data.get('interface', {})

    # Not in Java job types but present for all others
    input_id = data.get('input_id', None)
    output_id = data.get('output_id', None)

    # Since we will use a unified class in the database, we pass
    # a superset for all job types
    job_ex_dict = {'input_id': input_id, 'output_id': output_id,
                   'job_id': job_id, 'cluster_id': cluster_id,
                   'info': {'status': edp.JOB_STATUS_PENDING},
                   'job_configs': configs, 'extra': {},
                   'interface': interface}
    job_execution = conductor.job_execution_create(context.ctx(), job_ex_dict)
    context.set_current_job_execution_id(job_execution.id)

    # check to use proxy user
    if p.job_execution_requires_proxy_user(job_execution):
        try:
            p.create_proxy_user_for_job_execution(job_execution)
        except ex.SaharaException as e:
            LOG.error(_LE("Can't run job execution. "
                          "(Reasons: {reason})").format(reason=e))
            conductor.job_execution_destroy(context.ctx(), job_execution)
            raise e

    OPS.run_edp_job(job_execution.id)

    return job_execution
def test_verification_start(self, get_health_checks):
    cluster = self._cluster_sample()
    get_health_checks.return_value = [Check]
    verification_base.handle_verification(cluster, {
        'verification': {'status': 'START'}})
    cluster = self.api.cluster_get(context.ctx(), cluster)
    ver = cluster.verification
    self.assertEqual('GREEN', ver['status'])
    self.assertEqual(1, len(ver['checks']))
    self.assertEqual('No criminality', ver.checks[0]['description'])
    id = ver['id']

    get_health_checks.return_value = [YellowCheck, Check, Check]
    verification_base.handle_verification(cluster, {
        'verification': {'status': 'START'}})
    cluster = self.api.cluster_get(context.ctx(), cluster)
    ver = cluster.verification
    self.assertEqual('YELLOW', ver['status'])
    self.assertEqual(3, len(ver['checks']))
    self.assertNotEqual(ver['id'], id)

    get_health_checks.return_value = [RedCheck, YellowCheck]
    verification_base.handle_verification(cluster, {
        'verification': {'status': 'START'}})
    cluster = self.api.cluster_get(context.ctx(), cluster)
    ver = cluster.verification
    self.assertEqual('RED', ver['status'])
    self.assertEqual(2, len(ver['checks']))
    self.assertNotEqual(ver['id'], id)
    self.assertEqual("James bond check", ver['checks'][0]['name'])
def job_execution_requires_proxy_user(job_execution):
    '''Returns True if the job execution requires a proxy user.'''

    def _check_values(values):
        return any(value.startswith(
            su.SWIFT_INTERNAL_PREFIX) for value in values if (
                isinstance(value, six.string_types)))

    if CONF.use_domain_for_proxy_users is False:
        return False

    paths = [conductor.data_source_get(context.ctx(),
                                       job_execution.output_id),
             conductor.data_source_get(context.ctx(),
                                       job_execution.input_id)]
    if _check_values(ds.url for ds in paths if ds):
        return True

    if _check_values(six.itervalues(
            job_execution.job_configs.get('configs', {}))):
        return True

    if _check_values(six.itervalues(
            job_execution.job_configs.get('params', {}))):
        return True

    if _check_values(job_execution.job_configs.get('args', [])):
        return True

    job = conductor.job_get(context.ctx(), job_execution.job_id)
    if _check_values(main.url for main in job.mains):
        return True

    if _check_values(lib.url for lib in job.libs):
        return True

    # We did the simple checks, now if data_source referencing is
    # enabled and we have values that could be a name or uuid,
    # query for data_sources that match and contain a swift path
    by_name, by_uuid = job_utils.may_contain_data_source_refs(
        job_execution.job_configs)

    if by_name:
        names = tuple(job_utils.find_possible_data_source_refs_by_name(
            job_execution.job_configs))
        # do a query here for name in names and path starts with swift-prefix
        if names and conductor.data_source_count(
                context.ctx(),
                name=names,
                url=su.SWIFT_INTERNAL_PREFIX + '%') > 0:
            return True

    if by_uuid:
        uuids = tuple(job_utils.find_possible_data_source_refs_by_uuid(
            job_execution.job_configs))
        # do a query here for id in uuids and path starts with swift-prefix
        if uuids and conductor.data_source_count(
                context.ctx(),
                id=uuids,
                url=su.SWIFT_INTERNAL_PREFIX + '%') > 0:
            return True

    return False
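# A minimal standalone sketch (not part of the module above) isolating the
# swift-prefix predicate that _check_values applies to every candidate
# collection; the 'swift://' literal is an assumption used only for this
# illustration in place of su.SWIFT_INTERNAL_PREFIX.
SWIFT_PREFIX_SKETCH = 'swift://'  # assumed value for illustration


def _contains_swift_path_sketch(values):
    # Non-string values (e.g. ints in job args) are skipped, exactly as
    # the isinstance() guard in _check_values does above.
    return any(isinstance(v, str) and v.startswith(SWIFT_PREFIX_SKETCH)
               for v in values)


# _contains_swift_path_sketch(['swift://container/object', 42])  -> True
# _contains_swift_path_sketch(['hdfs://namenode:9000/path'])     -> False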
def _generate_heat_stack_name(cluster):
    cluster = conductor.cluster_get(context.ctx(), cluster)
    hsn = cluster.name + cluster.id[:8]
    extra = cluster.extra.to_dict() if cluster.extra else {}
    extra['heat_stack_name'] = hsn
    conductor.cluster_update(context.ctx(), cluster, {'extra': extra})
    return conductor.cluster_get(context.ctx(), cluster)
def test_get_hadoop_ssh_keys(self):
    cluster_dict = {
        'name': 'cluster1',
        'plugin_name': 'mock_plugin',
        'hadoop_version': 'mock_version',
        'default_image_id': 'initial',
        'node_groups': [tu.make_ng_dict("ng1", "f1", ["s1"], 1)]}

    cluster1 = conductor.cluster_create(context.ctx(), cluster_dict)
    (private_key1, public_key1) = c_h.get_hadoop_ssh_keys(cluster1)

    # should store keys for old cluster
    cluster1 = conductor.cluster_get(context.ctx(), cluster1)
    (private_key2, public_key2) = c_h.get_hadoop_ssh_keys(cluster1)

    self.assertEqual(public_key1, public_key2)
    self.assertEqual(private_key1, private_key2)

    # should generate new keys for new cluster
    cluster_dict.update({'name': 'cluster2'})
    cluster2 = conductor.cluster_create(context.ctx(), cluster_dict)
    (private_key3, public_key3) = c_h.get_hadoop_ssh_keys(cluster2)

    self.assertNotEqual(public_key1, public_key3)
    self.assertNotEqual(private_key1, private_key3)
def get_oozie_password(cluster):
    cluster = conductor.cluster_get(context.ctx(), cluster)
    extra = cluster.extra.to_dict()
    if 'oozie_pass_id' not in extra:
        extra['oozie_pass_id'] = u.generate_random_password()
        conductor.cluster_update(context.ctx(), cluster, {'extra': extra})
    return castellan.get_secret(extra['oozie_pass_id'])
def update_plugin(self, plugin_name, values):
    ctx = context.ctx()
    current = self.get_label_details(plugin_name)
    if not conductor.plugin_get(ctx, plugin_name):
        current['name'] = plugin_name
        conductor.plugin_create(ctx, current)
        del current['name']
    if values.get(PLUGIN_LABELS_SCOPE):
        for label in values.get(PLUGIN_LABELS_SCOPE).keys():
            current[PLUGIN_LABELS_SCOPE][label].update(
                values.get(PLUGIN_LABELS_SCOPE).get(label))
    else:
        del current[PLUGIN_LABELS_SCOPE]
    if values.get(VERSION_LABELS_SCOPE):
        vl = values.get(VERSION_LABELS_SCOPE)
        for version in vl.keys():
            for label in vl.get(version).keys():
                current[VERSION_LABELS_SCOPE][version][label].update(
                    vl[version][label])
    else:
        del current[VERSION_LABELS_SCOPE]
    conductor.plugin_update(context.ctx(), plugin_name, current)
def test_get_instances(self):
    cluster = self._make_sample()
    ctx = context.ctx()
    idx = 0
    ids = []
    for ng in cluster.node_groups:
        for i in range(ng.count):
            idx += 1
            ids.append(self.api.instance_add(context.ctx(), ng, {
                'instance_id': str(idx),
                'instance_name': str(idx),
            }))
    cluster = self.api.cluster_get(ctx, cluster)
    instances = general.get_instances(cluster, ids)
    ids = set()
    for inst in instances:
        ids.add(inst.instance_id)
    self.assertEqual(idx, len(ids))
    for i in range(1, idx):
        self.assertIn(str(i), ids)

    instances = general.get_instances(cluster)
    ids = set()
    for inst in instances:
        ids.add(inst.instance_id)
    self.assertEqual(idx, len(ids))
    for i in range(1, idx):
        self.assertIn(str(i), ids)
def check_data_sources_are_different(data_source_1_id, data_source_2_id):
    ds1 = conductor.data_source_get(context.ctx(), data_source_1_id)
    ds2 = conductor.data_source_get(context.ctx(), data_source_2_id)

    if ds1.type == ds2.type and ds1.url == ds2.url:
        raise ex.InvalidDataException(_('Provided input and output '
                                        'DataSources reference the same '
                                        'location: %s') % ds1.url)
def clean_verification_data(cluster):
    cluster = cond.cluster_get(context.ctx(), cluster)
    if verification_exists(cluster):
        try:
            vid = cluster.verification.id
            cond.cluster_verification_delete(context.ctx(), vid)
        except exceptions.NotFoundException:
            LOG.debug("Verification data already cleaned")
def _indicate_start(self):
    vid = self.cluster.verification.id
    self.health_check_id = cond.cluster_health_check_add(
        context.ctx(), vid, {'status': common.HEALTH_STATUS_CHECKING,
                             'name': self.get_health_check_name()}).id
    self.health_check = cond.cluster_health_check_get(
        context.ctx(), self.health_check_id)
    sender.health_notify(self.cluster, self.health_check)
def test_apply_recommended_configs(self, cond_cluster, cond_node_group,
                                   fake_flavor):
    fake_flavor.return_value = FakeObject(ram=2048, vcpus=1)
    to_tune = {
        'cluster_configs': {
            'dfs.replication': ('dfs', 'replica')
        },
        'node_configs': {
            'mapreduce.task.io.sort.mb': ('bond', 'extra_name')
        }
    }

    fake_plugin_configs = [
        FakeObject(applicable_target='dfs', name='replica',
                   default_value=3)]
    fake_ng = FakeObject(
        use_autoconfig=True,
        count=2,
        node_processes=['dog_datanode'],
        flavor_id='fake_id',
        node_configs=Configs({
            'bond': {
                'name': 'james'
            }
        })
    )
    fake_cluster = FakeObject(
        cluster_configs=Configs({
            'cat': {
                'talk': 'meow',
            }
        }),
        node_groups=[fake_ng],
        use_autoconfig=True,
    )
    v = ru.HadoopAutoConfigsProvider(
        to_tune, fake_plugin_configs, fake_cluster,
        {'datanode_process_name': "dog_datanode"})
    v.apply_recommended_configs()
    self.assertEqual([mock.call(context.ctx(), fake_cluster, {
        'cluster_configs': {
            'cat': {
                'talk': 'meow'
            },
            'dfs': {
                'replica': 2
            }
        }
    })], cond_cluster.call_args_list)
    self.assertEqual([mock.call(context.ctx(), fake_ng, {
        'node_configs': {
            'bond': {
                'name': 'james',
                'extra_name': 102
            }
        }
    })], cond_node_group.call_args_list)
def update_cluster(id, values):
    if "update_keypair" in values:
        if values["update_keypair"]:
            api.OPS.update_keypair(id)
        values.pop("update_keypair")
    if verification_base.update_verification_required(values):
        api.OPS.handle_verification(id, values)
        return conductor.cluster_get(context.ctx(), id)
    return conductor.cluster_update(context.ctx(), id, values)
def get_raw_binary(job_binary):
    url = job_binary.url
    if url.startswith("internal-db://"):
        res = db.get_raw_data(context.ctx(), job_binary)

    # TODO(mattf): remove support for OLD_SWIFT_INTERNAL_PREFIX
    if url.startswith(su.SWIFT_INTERNAL_PREFIX) or (
            url.startswith(su.OLD_SWIFT_INTERNAL_PREFIX)):
        res = i_swift.get_raw_data(context.ctx(), job_binary)

    return res
def _set_cluster_info(self, cluster):
    nn = vu.get_namenode(cluster)
    rm = vu.get_resourcemanager(cluster)
    hs = vu.get_historyserver(cluster)
    oo = vu.get_oozie(cluster)

    info = {}

    if rm:
        info['YARN'] = {
            'Web UI': 'http://%s:%s' % (rm.management_ip, '8088'),
            'ResourceManager': 'http://%s:%s' % (rm.management_ip, '8032')
        }

    if nn:
        info['HDFS'] = {
            'Web UI': 'http://%s:%s' % (nn.management_ip, '50070'),
            'NameNode': 'hdfs://%s:%s' % (nn.hostname(), '9000')
        }

    if oo:
        info['JobFlow'] = {
            'Oozie': 'http://%s:%s' % (oo.management_ip, '11000')
        }

    if hs:
        info['MapReduce JobHistory Server'] = {
            'Web UI': 'http://%s:%s' % (hs.management_ip, '19888')
        }

    ctx = context.ctx()
    conductor.cluster_update(ctx, cluster, {'info': info})
def _await_networks(self, cluster, instances):
    if not instances:
        return

    ips_assigned = set()
    while len(ips_assigned) != len(instances):
        if not g.check_cluster_exists(cluster):
            return
        for instance in instances:
            if instance.id not in ips_assigned:
                if networks.init_instances_ips(instance):
                    ips_assigned.add(instance.id)

        context.sleep(1)

    LOG.info(
        _LI("Cluster '%s': all instances have IPs assigned"), cluster.id)

    cluster = conductor.cluster_get(context.ctx(), cluster)
    instances = g.get_instances(cluster, ips_assigned)

    with context.ThreadGroup() as tg:
        for instance in instances:
            tg.spawn("wait-for-ssh-%s" % instance.instance_name,
                     self._wait_until_accessible, instance)

    LOG.info(_LI("Cluster '%s': all instances are accessible"), cluster.id)
def test_transient_cluster_terminate(self, terminate_cluster,
                                     use_os_admin_auth_token):
    timeutils.set_time_override(datetime.datetime(2005, 2, 1, 0, 0))

    ctx = context.ctx()
    job = self.api.job_create(ctx, te.SAMPLE_JOB)
    ds = self.api.data_source_create(ctx, te.SAMPLE_DATA_SOURCE)

    self._make_cluster('1')
    self._make_cluster('2')

    self._create_job_execution({"end_time": timeutils.utcnow(),
                                "id": 1,
                                "cluster_id": "1"},
                               job, ds, ds)
    self._create_job_execution({"end_time": None,
                                "id": 2,
                                "cluster_id": "2"},
                               job, ds, ds)
    self._create_job_execution({"end_time": None,
                                "id": 3,
                                "cluster_id": "2"},
                               job, ds, ds)

    timeutils.set_time_override(datetime.datetime(2005, 2, 1, 0, 1))

    p._make_periodic_tasks().terminate_unneeded_transient_clusters(None)
    self.assertEqual(1, terminate_cluster.call_count)
    terminate_cluster.assert_has_calls([mock.call(u'1')])
    self.assertEqual(1, use_os_admin_auth_token.call_count)
def apply_node_configs(self, node_group):
    """Applies the node configs calculated by recommend_node_configs
    to the given node group via the conductor api.

    :param node_group: NodeGroup Sahara resource.
    :return: None.
    """
    if not node_group.use_autoconfig or not self.cluster.use_autoconfig:
        return
    to_update = self.node_configs_to_update
    recommended_node_configs = self._get_recommended_node_configs(
        node_group)
    if not recommended_node_configs:
        # Nothing to configure
        return
    current_dict = node_group.node_configs.to_dict()
    configuration = {}
    for ncfg in six.iterkeys(to_update):
        if ncfg not in recommended_node_configs:
            continue
        n_section = to_update[ncfg][0]
        n_name = to_update[ncfg][1]
        proposed_config_value = recommended_node_configs[ncfg]
        if n_section not in configuration:
            configuration.update({n_section: {}})
        configuration[n_section].update({n_name: proposed_config_value})
    current_dict = self._merge_configs(current_dict, configuration)
    conductor.node_group_update(context.ctx(), node_group,
                                {'node_configs': current_dict})
def apply_cluster_configs(self):
    """Applies the cluster configs calculated by
    recommend_cluster_configs to the cluster via the conductor api.

    :return: None.
    """
    cluster = self.cluster
    if not cluster.use_autoconfig:
        return
    to_update = self.cluster_configs_to_update
    recommended_cluster_configs = self._get_recommended_cluster_configs()
    if not recommended_cluster_configs:
        # Nothing to configure
        return
    current_dict = cluster.cluster_configs.to_dict()
    configuration = {}
    for ncfg in six.iterkeys(to_update):
        if ncfg not in recommended_cluster_configs:
            continue
        n_section = to_update[ncfg][0]
        n_name = to_update[ncfg][1]
        proposed_config_value = recommended_cluster_configs[ncfg]
        if n_section not in configuration:
            configuration.update({n_section: {}})
        configuration[n_section].update({n_name: proposed_config_value})
    current_dict = self._merge_configs(current_dict, configuration)
    conductor.cluster_update(context.ctx(), cluster,
                             {'cluster_configs': current_dict})
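# A hedged sketch (not from the module above) of the section-wise merge
# that _merge_configs is expected to perform in apply_cluster_configs and
# apply_node_configs. _merge_configs itself is not shown in this section,
# so the conflict rule assumed here (proposed values win) is an assumption;
# the non-overlapping cases match test_apply_recommended_configs above.
def _merge_configs_sketch(current, proposed):
    # Copy the current per-section dicts, then layer the proposed
    # values on top section by section.
    result = {section: dict(cfgs) for section, cfgs in current.items()}
    for section, cfgs in proposed.items():
        result.setdefault(section, {}).update(cfgs)
    return result


# _merge_configs_sketch({'bond': {'name': 'james'}},
#                       {'bond': {'extra_name': 102}})
# -> {'bond': {'name': 'james', 'extra_name': 102}}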
def _await_networks(self, cluster, instances):
    if not instances:
        return

    cpo.add_provisioning_step(cluster.id, _("Assign IPs"), len(instances))

    ips_assigned = set()
    self._ips_assign(ips_assigned, cluster, instances)

    LOG.info(
        _LI("Cluster {cluster_id}: all instances have IPs assigned")
        .format(cluster_id=cluster.id))

    cluster = conductor.cluster_get(context.ctx(), cluster)
    instances = g.get_instances(cluster, ips_assigned)

    cpo.add_provisioning_step(
        cluster.id, _("Wait for instance accessibility"), len(instances))

    with context.ThreadGroup() as tg:
        for instance in instances:
            tg.spawn("wait-for-ssh-%s" % instance.instance_name,
                     self._wait_until_accessible, instance)

    LOG.info(_LI("Cluster {cluster_id}: all instances are accessible")
             .format(cluster_id=cluster.id))
def finalize_autoconfiguration(self):
    if not self.cluster.use_autoconfig:
        return
    cluster_extra = self._get_cluster_extra()
    cluster_extra['auto-configured'] = True
    conductor.cluster_update(
        context.ctx(), self.cluster, {'extra': cluster_extra})
def test_data_source_count_in(self):
    ctx = context.ctx()
    ctx.tenant_id = SAMPLE_DATA_SOURCE['tenant_id']
    src = copy.copy(SAMPLE_DATA_SOURCE)
    self.api.data_source_create(ctx, src)

    cnt = self.api.data_source_count(ctx, name='ngt_test')
    self.assertEqual(1, cnt)

    cnt = self.api.data_source_count(ctx, name=('ngt_test',
                                                'test2', 'test3'))
    self.assertEqual(1, cnt)

    cnt = self.api.data_source_count(ctx, name=('test1',
                                                'test2', 'test3'))
    self.assertEqual(0, cnt)

    lst = self.api.data_source_get_all(ctx, name='ngt_test')
    myid = lst[0]['id']
    cnt = self.api.data_source_count(ctx,
                                     name=('ngt_test', 'test2', 'test3'),
                                     id=myid)
    self.assertEqual(1, cnt)

    cnt = self.api.data_source_count(ctx,
                                     name=('ngt_test', 'test2', 'test3'),
                                     id=(myid, '2'))
    self.assertEqual(1, cnt)
def test_job_execution_search(self):
    ctx = context.ctx()
    job = self.api.job_create(ctx, SAMPLE_JOB)
    ds_input = self.api.data_source_create(ctx, SAMPLE_DATA_SOURCE)
    SAMPLE_DATA_OUTPUT = copy.copy(SAMPLE_DATA_SOURCE)
    SAMPLE_DATA_OUTPUT['name'] = 'output'
    ds_output = self.api.data_source_create(ctx, SAMPLE_DATA_OUTPUT)

    SAMPLE_JOB_EXECUTION['job_id'] = job['id']
    SAMPLE_JOB_EXECUTION['input_id'] = ds_input['id']
    SAMPLE_JOB_EXECUTION['output_id'] = ds_output['id']

    ctx.tenant_id = SAMPLE_JOB_EXECUTION['tenant_id']
    self.api.job_execution_create(ctx, SAMPLE_JOB_EXECUTION)

    lst = self.api.job_execution_get_all(ctx)
    self.assertEqual(1, len(lst))

    kwargs = {'tenant_id': SAMPLE_JOB_EXECUTION['tenant_id']}
    lst = self.api.job_execution_get_all(ctx, **kwargs)
    self.assertEqual(1, len(lst))

    # Valid field but no matching value
    kwargs = {'job_id': SAMPLE_JOB_EXECUTION['job_id'] + "foo"}
    lst = self.api.job_execution_get_all(ctx, **kwargs)
    self.assertEqual(0, len(lst))

    # Invalid field
    self.assertRaises(sa_exc.InvalidRequestError,
                      self.api.job_execution_get_all,
                      ctx, **{'badfield': 'somevalue'})
def change_cluster_status_description(cluster, status_description):
    try:
        ctx = context.ctx()
        return conductor.cluster_update(
            ctx, cluster, {'status_description': status_description})
    except e.NotFoundException:
        return None
def change_cluster_status(cluster, status, status_description=None):
    ctx = context.ctx()

    # Update cluster status. Race conditions with deletion are still
    # possible, but this at least reduces the probability.
    cluster = conductor.cluster_get(ctx, cluster) if cluster else None

    if status_description is not None:
        change_cluster_status_description(cluster, status_description)

    # 'Deleting' is final and can't be changed
    if cluster is None or cluster.status == CLUSTER_STATUS_DELETING:
        return cluster

    update_dict = {"status": status}
    cluster = conductor.cluster_update(ctx, cluster, update_dict)
    conductor.cluster_provision_progress_update(ctx, cluster.id)

    LOG.info(_LI("Cluster status has been changed. New status="
                 "{status}").format(status=cluster.status))

    sender.notify(ctx, cluster.id, cluster.name, cluster.status, "update")

    return cluster
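# For illustration only: a hypothetical call sequence showing how the
# guard above behaves. The rule that 'Deleting' is final comes from the
# comment in change_cluster_status; the status strings are examples.
#
#   cluster = change_cluster_status(cluster, CLUSTER_STATUS_DELETING)
#   # any later transition attempt returns the cluster unchanged:
#   cluster = change_cluster_status(cluster, "Active")  # still Deleting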
def _set_cluster_info(self, cluster):
    nn = vu.get_namenode(cluster)
    jt = vu.get_jobtracker(cluster)
    oozie = vu.get_oozie(cluster)
    info = {}

    if jt:
        ui_port = c_helper.get_port_from_config(
            "MapReduce", "mapred.job.tracker.http.address", cluster)
        jt_port = c_helper.get_port_from_config(
            "MapReduce", "mapred.job.tracker", cluster)

        info["MapReduce"] = {
            "Web UI": "http://%s:%s" % (jt.management_ip, ui_port),
            "JobTracker": "%s:%s" % (jt.hostname(), jt_port),
        }

    if nn:
        ui_port = c_helper.get_port_from_config("HDFS", "dfs.http.address",
                                                cluster)
        nn_port = c_helper.get_port_from_config("HDFS", "fs.default.name",
                                                cluster)

        info["HDFS"] = {
            "Web UI": "http://%s:%s" % (nn.management_ip, ui_port),
            "NameNode": "hdfs://%s:%s" % (nn.hostname(), nn_port),
        }

    if oozie:
        # TODO(yrunts) change from hardcoded value
        info["JobFlow"] = {
            "Oozie": "http://%s:11000" % oozie.management_ip
        }

    ctx = context.ctx()
    conductor.cluster_update(ctx, cluster, {"info": info})
def _run_job(job_execution_id):
    ctx = context.ctx()
    job_execution = conductor.job_execution_get(ctx, job_execution_id)
    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster.status != c_u.CLUSTER_STATUS_ACTIVE:
        return

    eng = _get_job_engine(cluster, job_execution)
    if eng is None:
        raise e.EDPError(_("Cluster does not support job type %s")
                         % _get_job_type(job_execution))

    job_execution = _update_job_execution_extra(cluster, job_execution)

    # Job id is a string
    # Status is a string
    # Extra is a dictionary to add to extra in the job_execution
    jid, status, extra = eng.run_job(job_execution)

    # Set the job id and the start time
    # Optionally, update the status and the 'extra' field
    update_dict = {'oozie_job_id': jid,
                   'start_time': datetime.datetime.now()}
    if status:
        update_dict['info'] = {'status': status}
    if extra:
        curr_extra = job_execution.extra.copy()
        curr_extra.update(extra)
        update_dict['extra'] = curr_extra

    job_execution = conductor.job_execution_update(
        ctx, job_execution, update_dict)
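# For illustration only: a hypothetical engine stub showing the
# (jid, status, extra) contract that _run_job expects back from
# eng.run_job. The values are invented; only the tuple shape and the
# meaning of each element come from the comments in _run_job above.
class FakeJobEngineSketch(object):
    def run_job(self, job_execution):
        jid = 'fake-job-0001'              # job id, a string
        status = 'RUNNING'                 # optional status, a string
        extra = {'workflow-dir': '/tmp'}   # optional dict merged into extra
        return jid, status, extra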
def _shutdown_instance(self, instance):
    ctx = context.ctx()

    if instance.node_group.floating_ip_pool:
        try:
            networks.delete_floating_ip(instance.instance_id)
        except nova_exceptions.NotFound:
            LOG.warn(_LW("Attempted to delete non-existent floating IP in "
                         "pool %(pool)s from instance %(instance)s"),
                     {'pool': instance.node_group.floating_ip_pool,
                      'instance': instance.instance_id})

    try:
        volumes.detach_from_instance(instance)
    except Exception:
        LOG.warn(_LW("Detaching volumes from instance %s failed"),
                 instance.instance_id)

    try:
        nova.client().servers.delete(instance.instance_id)
    except nova_exceptions.NotFound:
        LOG.warn(_LW("Attempted to delete non-existent instance %s"),
                 instance.instance_id)

    conductor.instance_remove(ctx, instance)
def scale_cluster(self, cluster, node_group_id_map):
    ctx = context.ctx()
    cluster = g.change_cluster_status(cluster, "Scaling")

    instance_ids = self._scale_cluster_instances(cluster,
                                                 node_group_id_map)

    self._update_rollback_strategy(cluster, instance_ids=instance_ids)

    cluster = conductor.cluster_get(ctx, cluster)
    g.clean_cluster_from_empty_ng(cluster)

    cluster = conductor.cluster_get(ctx, cluster)
    instances = g.get_instances(cluster, instance_ids)

    self._await_active(cluster, instances)

    self._assign_floating_ips(instances)

    self._await_networks(cluster, instances)

    cluster = conductor.cluster_get(ctx, cluster)

    volumes.attach_to_instances(
        g.get_instances(cluster, instance_ids))

    # We should be here with a valid cluster: if instance creation
    # was not successful, all extra instances were removed above
    if instance_ids:
        self._configure_instances(cluster)

    self._update_rollback_strategy(cluster)

    return instance_ids
def update_job_execution(id, values):
    _update_status(values.pop("info", None))
    return conductor.job_execution_update(context.ctx(), id, values)


def get_data_sources(**kwargs):
    return conductor.data_source_get_all(context.ctx(),
                                         regex_search=True, **kwargs)


def update_job(id, values):
    return conductor.job_update(context.ctx(), id, values)


def get_data_source(id):
    return conductor.data_source_get(context.ctx(), id)


def create_job_binary_internal(values):
    return conductor.job_binary_internal_create(context.ctx(), values)


def _setup_trust_for_cluster(cluster):
    cluster = conductor.cluster_get(context.ctx(), cluster)
    trusts.create_trust_for_cluster(cluster)
    trusts.use_os_admin_auth_token(cluster)


def register_data_source(values):
    return conductor.data_source_create(context.ctx(), values)


def get_cluster_templates(**kwargs):
    return conductor.cluster_template_get_all(context.ctx(),
                                              regex_search=True, **kwargs)


def update_cluster(id, values):
    if verification_base.update_verification_required(values):
        api.OPS.handle_verification(id, values)
        return conductor.cluster_get(context.ctx(), id)
    return conductor.cluster_update(context.ctx(), id, values)


def data_source_update(id, values):
    return conductor.data_source_update(context.ctx(), id, values)
def _set_cluster_info(self, cluster):
    ambari_ip = plugin_utils.get_instance(
        cluster, p_common.AMBARI_SERVER).get_ip_or_dns_name()
    ambari_port = "8080"
    info = {
        p_common.AMBARI_SERVER: {
            "Web UI": "http://{host}:{port}".format(host=ambari_ip,
                                                    port=ambari_port),
            "Username": "******",
            "Password": cluster.extra["ambari_password"]
        }
    }
    nns = plugin_utils.get_instances(cluster, p_common.NAMENODE)
    info[p_common.NAMENODE] = {}
    for idx, namenode in enumerate(nns):
        info[p_common.NAMENODE]["Web UI %s" % (idx + 1)] = (
            "http://%s:50070" % namenode.get_ip_or_dns_name())

    rms = plugin_utils.get_instances(cluster, p_common.RESOURCEMANAGER)
    info[p_common.RESOURCEMANAGER] = {}
    for idx, resourcemanager in enumerate(rms):
        info[p_common.RESOURCEMANAGER]["Web UI %s" % (idx + 1)] = (
            "http://%s:8088" % resourcemanager.get_ip_or_dns_name())

    historyserver = plugin_utils.get_instance(cluster,
                                              p_common.HISTORYSERVER)
    if historyserver:
        info[p_common.HISTORYSERVER] = {
            "Web UI": "http://%s:19888" %
                      historyserver.get_ip_or_dns_name()
        }
    atlserver = plugin_utils.get_instance(cluster,
                                          p_common.APP_TIMELINE_SERVER)
    if atlserver:
        info[p_common.APP_TIMELINE_SERVER] = {
            "Web UI": "http://%s:8188" % atlserver.get_ip_or_dns_name()
        }
    oozie = plugin_utils.get_instance(cluster, p_common.OOZIE_SERVER)
    if oozie:
        info[p_common.OOZIE_SERVER] = {
            "Web UI": "http://%s:11000/oozie" % oozie.get_ip_or_dns_name()
        }
    hbase_master = plugin_utils.get_instance(cluster,
                                             p_common.HBASE_MASTER)
    if hbase_master:
        info[p_common.HBASE_MASTER] = {
            "Web UI": "http://%s:16010" % hbase_master.get_ip_or_dns_name()
        }
    falcon = plugin_utils.get_instance(cluster, p_common.FALCON_SERVER)
    if falcon:
        info[p_common.FALCON_SERVER] = {
            "Web UI": "http://%s:15000" % falcon.get_ip_or_dns_name()
        }
    storm_ui = plugin_utils.get_instance(cluster, p_common.STORM_UI_SERVER)
    if storm_ui:
        info[p_common.STORM_UI_SERVER] = {
            "Web UI": "http://%s:8744" % storm_ui.get_ip_or_dns_name()
        }
    ranger_admin = plugin_utils.get_instance(cluster,
                                             p_common.RANGER_ADMIN)
    if ranger_admin:
        info[p_common.RANGER_ADMIN] = {
            "Web UI": "http://%s:6080" % ranger_admin.get_ip_or_dns_name(),
            "Username": "******",
            "Password": "******"
        }
    spark_hs = plugin_utils.get_instance(cluster,
                                         p_common.SPARK_JOBHISTORYSERVER)
    if spark_hs:
        info[p_common.SPARK_JOBHISTORYSERVER] = {
            "Web UI": "http://%s:18080" % spark_hs.get_ip_or_dns_name()
        }
    info.update(cluster.info.to_dict())
    ctx = context.ctx()
    conductor.cluster_update(ctx, cluster, {"info": info})
    cluster = conductor.cluster_get(ctx, cluster.id)
def test_job_execution_advanced_search(self):
    ctx = context.ctx()
    job = self.api.job_create(ctx, SAMPLE_JOB)
    ds_input = self.api.data_source_create(ctx, SAMPLE_DATA_SOURCE)
    SAMPLE_DATA_OUTPUT = copy.copy(SAMPLE_DATA_SOURCE)
    SAMPLE_DATA_OUTPUT['name'] = 'output'
    ds_output = self.api.data_source_create(ctx, SAMPLE_DATA_OUTPUT)

    # Create a cluster
    cl1 = self.api.cluster_create(ctx, test_clusters.SAMPLE_CLUSTER)

    # Create a second cluster with a different name
    cl2_vals = copy.copy(test_clusters.SAMPLE_CLUSTER)
    cl2_vals['name'] = 'test_cluster2'
    cl2 = self.api.cluster_create(ctx, cl2_vals)

    my_sample_job_exec = copy.copy(SAMPLE_JOB_EXECUTION)

    my_sample_job_exec['job_id'] = job['id']
    my_sample_job_exec['input_id'] = ds_input['id']
    my_sample_job_exec['output_id'] = ds_output['id']
    my_sample_job_exec['cluster_id'] = cl1['id']

    # Run job on cluster 1
    self.api.job_execution_create(ctx, my_sample_job_exec)

    # Run the same job on cluster 2 and set status
    my_sample_job_exec['cluster_id'] = cl2['id']
    my_sample_job_exec['info'] = {'status': 'KiLLeD'}
    self.api.job_execution_create(ctx, my_sample_job_exec)

    # Search only with job execution fields (finds both)
    lst = self.api.job_execution_get_all(ctx, **{'return_code': 1})
    self.assertEqual(2, len(lst))

    # Search on cluster name
    kwargs = {'cluster.name': cl1['name'],
              'return_code': 1}
    lst = self.api.job_execution_get_all(ctx, **kwargs)
    self.assertEqual(1, len(lst))

    # Search on cluster name and job name
    kwargs = {'cluster.name': cl1['name'],
              'job.name': SAMPLE_JOB['name'],
              'return_code': 1}
    lst = self.api.job_execution_get_all(ctx, **kwargs)
    self.assertEqual(1, len(lst))

    # Search on cluster name, job name, and status
    kwargs = {'cluster.name': cl2['name'],
              'job.name': SAMPLE_JOB['name'],
              'status': 'killed',
              'return_code': 1}
    lst = self.api.job_execution_get_all(ctx, **kwargs)
    self.assertEqual(1, len(lst))

    # Search on job name (finds both)
    kwargs = {'job.name': SAMPLE_JOB['name'],
              'return_code': 1}
    lst = self.api.job_execution_get_all(ctx, **kwargs)
    self.assertEqual(2, len(lst))

    # Invalid cluster name value
    kwargs = {'cluster.name': cl1['name'] + 'foo',
              'job.name': SAMPLE_JOB['name']}
    lst = self.api.job_execution_get_all(ctx, **kwargs)
    self.assertEqual(0, len(lst))

    # Invalid job name value
    kwargs = {'cluster.name': cl1['name'],
              'job.name': SAMPLE_JOB['name'] + 'foo'}
    lst = self.api.job_execution_get_all(ctx, **kwargs)
    self.assertEqual(0, len(lst))

    # Invalid status value
    kwargs = {'cluster.name': cl1['name'],
              'status': 'PENDING'}
    lst = self.api.job_execution_get_all(ctx, **kwargs)
    self.assertEqual(0, len(lst))
def get_script_name(self, job):
    return conductor.job_main_name(context.ctx(), job)


def test_duplicate_job_binary_create(self):
    ctx = context.ctx()
    self.api.job_binary_create(ctx, SAMPLE_JOB_BINARY)
    with testtools.ExpectedException(ex.DBDuplicateEntry):
        self.api.job_binary_create(ctx, SAMPLE_JOB_BINARY)


def get_job_binary_data(id):
    job_binary = conductor.job_binary_get(context.ctx(), id)
    return dispatch.get_raw_binary(job_binary, with_context=True)


def test_duplicate_data_source_create(self):
    ctx = context.ctx()
    self.api.data_source_create(ctx, SAMPLE_DATA_SOURCE)
    with testtools.ExpectedException(ex.DBDuplicateEntry):
        self.api.data_source_create(ctx, SAMPLE_DATA_SOURCE)


def get_job_binary_internal_data(id):
    return conductor.job_binary_internal_get_raw_data(context.ctx(), id)


def _nullify_ng_counts(self, cluster):
    ctx = context.ctx()

    for node_group in cluster.node_groups:
        conductor.node_group_update(ctx, node_group, {"count": 0})


def create_job(values):
    return conductor.job_create(context.ctx(), values)


def update_job_binary_internal(id, values):
    return conductor.job_binary_internal_update(context.ctx(), id, values)


def get_job_binary_internal(id):
    return conductor.job_binary_internal_get(context.ctx(), id)


def delete_job_binary_internal(id):
    conductor.job_binary_internal_destroy(context.ctx(), id)


def cancel_job_execution(id):
    context.set_current_job_execution_id(id)
    job_execution = conductor.job_execution_get(context.ctx(), id)
    OPS.cancel_job_execution(id)

    return job_execution


def get_job_binary_internals(**kwargs):
    return conductor.job_binary_internal_get_all(context.ctx(),
                                                 regex_search=True, **kwargs)


def get_job(id):
    return conductor.job_get(context.ctx(), id)


def delete_job(job_id):
    return conductor.job_destroy(context.ctx(), job_id)


def delete_data_source(id):
    conductor.data_source_destroy(context.ctx(), id)


def get_job_binary(id):
    return conductor.job_binary_get(context.ctx(), id)
def run_job(self, job_execution):
    ctx = context.ctx()
    job = conductor.job_get(ctx, job_execution.job_id)

    proxy_configs = job_execution.job_configs.get('proxy_configs')

    # We'll always run the driver program on the master
    master = plugin_utils.get_instance(self.cluster, "master")

    # TODO(tmckay): wf_dir should probably be configurable.
    # The only requirement is that the dir is writable by the image user
    wf_dir = job_utils.create_workflow_dir(master, '/tmp/spark-edp', job,
                                           job_execution.id)
    paths = job_utils.upload_job_files(master, wf_dir, job,
                                       libs_subdir=False,
                                       proxy_configs=proxy_configs)

    # We can shorten the paths in this case since we'll run out of wf_dir
    paths = [os.path.basename(p) for p in paths]

    # TODO(tmckay): for now, paths[0] is always assumed to be the app
    # jar and we generate paths in order (mains, then libs).
    # When we have a Spark job type, we can require a "main" and set
    # the app jar explicitly to be "main"
    app_jar = paths.pop(0)

    # The rest of the paths will be passed with --jars
    additional_jars = ",".join(paths)
    if additional_jars:
        additional_jars = "--jars " + additional_jars

    # Launch the spark job using spark-submit and deploy_mode = client
    host = master.hostname()
    port = c_helper.get_config_value("Spark", "Master port", self.cluster)
    spark_submit = os.path.join(
        c_helper.get_config_value("Spark", "Spark home", self.cluster),
        "bin/spark-submit")

    job_class = job_execution.job_configs.configs["edp.java.main_class"]

    # TODO(tmckay): we need to clean up wf_dirs on long running clusters
    # TODO(tmckay): probably allow for general options to spark-submit
    args = " ".join(job_execution.job_configs.get('args', []))

    # The redirects of stdout and stderr will preserve output in the wf_dir
    cmd = "%s %s --class %s %s --master spark://%s:%s %s" % (
        spark_submit,
        app_jar,
        job_class,
        additional_jars,
        host,
        port,
        args)

    job_execution = conductor.job_execution_get(ctx, job_execution.id)
    if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
        return (None, edp.JOB_STATUS_KILLED, None)

    # If an exception is raised here, the job_manager will mark
    # the job failed and log the exception
    with remote.get_remote(master) as r:
        # Upload the command launch script
        launch = os.path.join(wf_dir, "launch_command")
        r.write_file_to(launch, self._job_script())
        r.execute_command("chmod +x %s" % launch)
        ret, stdout = r.execute_command(
            "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!"
            % (wf_dir, cmd))

    if ret == 0:
        # Success, we'll add the wf_dir in job_execution.extra and store
        # pid@instance_id as the job id
        # We know the job is running so return "RUNNING"
        return (stdout.strip() + "@" + master.id,
                edp.JOB_STATUS_RUNNING,
                {'spark-path': wf_dir})

    # Hmm, no exception but something failed.
    # Since we're using backgrounding with redirect, this is unlikely.
    raise e.EDPError(_("Spark job execution failed. Exit status = "
                       "%(status)s, stdout = %(stdout)s") %
                     {'status': ret, 'stdout': stdout})
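# For illustration only: with hypothetical values (Spark home '/opt/spark',
# master hostname 'master-001', master port 7077, app_jar 'app.jar',
# job_class 'org.example.Main', and no extra jars or args), the cmd
# assembled above would look roughly like:
#
#   /opt/spark/bin/spark-submit app.jar --class org.example.Main \
#       --master spark://master-001:7077
#
# and on success run_job returns '<pid>@<master.id>' as the job id, with
# the workflow dir stored under 'spark-path' in the extra dict.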
def get_jobs(**kwargs):
    return conductor.job_get_all(context.ctx(), regex_search=True, **kwargs)