def install_packages(self, instances, packages):  # instances non-empty
    cpo.add_provisioning_step(instances[0].cluster_id,
                              _("Install packages"), len(instances))
    with context.ThreadGroup() as tg:
        for i in instances:
            tg.spawn("cdh-inst-pkgs-%s" % i.instance_name,
                     self._install_pkgs, i, packages)
def mount_to_instances(instances):
    if len(instances) == 0:
        return

    use_xfs = _can_use_xfs(instances)

    for instance in instances:
        with context.set_current_instance_id(instance.instance_id):
            devices = _find_instance_devices(instance)

            if devices:
                cpo.add_provisioning_step(
                    instance.cluster_id,
                    _("Mount volumes to {inst_name} instance").format(
                        inst_name=instance.instance_name), len(devices))

                formatted_devices = []
                lock = threading.Lock()
                with context.ThreadGroup() as tg:
                    # Since formatting can take several minutes (for large
                    # disks) and can be done in parallel, launch one thread
                    # per disk.
                    for device in devices:
                        tg.spawn('format-device-%s' % device, _format_device,
                                 instance, device, use_xfs,
                                 formatted_devices, lock)

                conductor.instance_update(
                    context.current(), instance,
                    {"storage_devices_number": len(formatted_devices)})
                for idx, dev in enumerate(formatted_devices):
                    _mount_volume_to_node(instance, idx + 1, dev, use_xfs)
def _await_networks(self, cluster, instances):
    if not instances:
        return

    cpo.add_provisioning_step(cluster.id, _("Assign IPs"), len(instances))

    ips_assigned = set()
    self._ips_assign(ips_assigned, cluster, instances)

    LOG.info(
        _LI("Cluster {cluster_id}: all instances have IPs assigned")
        .format(cluster_id=cluster.id))

    cluster = conductor.cluster_get(context.ctx(), cluster)
    instances = g.get_instances(cluster, ips_assigned)

    cpo.add_provisioning_step(
        cluster.id, _("Wait for instance accessibility"), len(instances))

    with context.ThreadGroup() as tg:
        for instance in instances:
            tg.spawn("wait-for-ssh-%s" % instance.instance_name,
                     self._wait_until_accessible, instance)

    LOG.info(_LI("Cluster {cluster_id}: all instances are accessible")
             .format(cluster_id=cluster.id))
def _provision_cluster(self, name, cluster_spec, ambari_info,
                       servers, version):
    # TODO(jspeidel): encapsulate in another class
    if servers:
        cpo.add_provisioning_step(
            servers[0].cluster_id,
            _("Provision cluster via Ambari"), len(servers))

    with context.ThreadGroup() as tg:
        for server in servers:
            with context.set_current_instance_id(
                    server.instance['instance_id']):
                tg.spawn(
                    "hdp-provision-instance-%s" %
                    server.instance.hostname(),
                    server.provision_ambari, ambari_info, cluster_spec)

    handler = self.version_factory.get_version_handler(version)
    ambari_client = handler.get_ambari_client()
    ambari_client.wait_for_host_registrations(len(servers), ambari_info)
    self._set_ambari_credentials(cluster_spec, ambari_info, version)

    ambari_client.provision_cluster(
        cluster_spec, servers, ambari_info, name)

    LOG.info(_LI('Cluster provisioned via Ambari Server: {server_ip}')
             .format(server_ip=ambari_info.get_address()))
def _scale_cluster_instances(self, cluster, node_group_id_map):
    ctx = context.ctx()

    aa_group = None
    old_aa_groups = None
    if cluster.anti_affinity:
        aa_group = self._find_aa_server_group(cluster)
        if not aa_group:
            old_aa_groups = self._generate_anti_affinity_groups(cluster)

    instances_to_delete = []
    node_groups_to_enlarge = set()
    node_groups_to_delete = set()

    for node_group in cluster.node_groups:
        new_count = node_group_id_map[node_group.id]

        if new_count < node_group.count:
            instances_to_delete += node_group.instances[
                new_count:node_group.count]
            if new_count == 0:
                node_groups_to_delete.add(node_group.id)
        elif new_count > node_group.count:
            node_groups_to_enlarge.add(node_group.id)
            if node_group.count == 0 and node_group.auto_security_group:
                self._create_auto_security_group(node_group)

    if instances_to_delete:
        cluster = c_u.change_cluster_status(
            cluster, c_u.CLUSTER_STATUS_DELETING_INSTANCES)

        for instance in instances_to_delete:
            with context.set_current_instance_id(instance.instance_id):
                self._shutdown_instance(instance)

        self._await_deleted(cluster, instances_to_delete)
        for ng in cluster.node_groups:
            if ng.id in node_groups_to_delete:
                self._delete_auto_security_group(ng)

    cluster = conductor.cluster_get(ctx, cluster)
    instances_to_add = []
    if node_groups_to_enlarge:
        cpo.add_provisioning_step(
            cluster.id, _("Add instances"),
            self._count_instances_to_scale(
                node_groups_to_enlarge, node_group_id_map, cluster))

        cluster = c_u.change_cluster_status(
            cluster, c_u.CLUSTER_STATUS_ADDING_INSTANCES)
        for ng in cluster.node_groups:
            if ng.id in node_groups_to_enlarge:
                count = node_group_id_map[ng.id]
                for idx in six.moves.xrange(ng.count + 1, count + 1):
                    instance_id = self._start_instance(
                        cluster, ng, idx, aa_group, old_aa_groups)
                    instances_to_add.append(instance_id)

    return instances_to_add
def start_cloudera_agents(self, instances):  # instances non-empty
    cpo.add_provisioning_step(instances[0].cluster_id,
                              _("Start Cloudera Agents"), len(instances))
    with context.ThreadGroup() as tg:
        for i in instances:
            tg.spawn("cdh-agent-start-%s" % i.instance_name,
                     self._start_cloudera_agent, i)
def install_swift_integration(self, servers):
    if servers:
        cpo.add_provisioning_step(servers[0].cluster_id,
                                  _("Install Swift integration"),
                                  len(servers))

    for server in servers:
        server.install_swift_integration()
def configure_os(self, instances):  # instances non-empty
    cpo.add_provisioning_step(
        instances[0].cluster_id, _("Configure OS"), len(instances))
    with context.ThreadGroup() as tg:
        for inst in instances:
            tg.spawn('cdh-repo-conf-%s' % inst.instance_name,
                     self._configure_repo_from_inst, inst)
def start_secondarynamenodes(self, cluster):
    snns = vu.get_secondarynamenodes(cluster)
    if len(snns) == 0:
        return
    cpo.add_provisioning_step(
        cluster.id,
        utils.start_process_event_message("SecondaryNameNodes"),
        len(snns))

    for snn in snns:
        self._start_secondarynamenode(snn)
def update_configs(self, instances):  # instances non-empty
    cpo.add_provisioning_step(
        instances[0].cluster_id, _("Update configs"), len(instances))
    with context.ThreadGroup() as tg:
        for instance in instances:
            tg.spawn("update-configs-%s" % instance.instance_name,
                     self._update_configs, instance)
def install_swift_integration(self, servers):
    if servers:
        cpo.add_provisioning_step(
            servers[0].cluster_id, _("Install Swift integration"),
            len(servers))

    for server in servers:
        server.install_swift_integration()
def update_configs(self, instances):  # instances non-empty
    cpo.add_provisioning_step(
        instances[0].cluster_id, _("Update configs"), len(instances))
    with context.ThreadGroup() as tg:
        for instance in instances:
            tg.spawn("update-configs-%s" % instance.instance_name,
                     self._update_configs, instance)
def configure_os(self, instances):  # instances non-empty
    cpo.add_provisioning_step(instances[0].cluster_id,
                              _("Configure OS"), len(instances))
    with context.ThreadGroup() as tg:
        for inst in instances:
            tg.spawn('cdh-repo-conf-%s' % inst.instance_name,
                     self._configure_repo_from_inst, inst)
def _setup_agents(instances, manager_address):
    cpo.add_provisioning_step(
        instances[0].cluster.id, _("Set up Ambari agents"), len(instances))
    with context.ThreadGroup() as tg:
        for inst in instances:
            tg.spawn("hwx-agent-setup-%s" % inst.id,
                     _setup_agent, inst, manager_address)
    LOG.debug("Ambari agents have been installed")
def _scale_cluster_instances(self, cluster, node_group_id_map):
    ctx = context.ctx()

    aa_group = None
    old_aa_groups = None
    if cluster.anti_affinity:
        aa_group = self._find_aa_server_group(cluster)
        if not aa_group:
            old_aa_groups = self._generate_anti_affinity_groups(cluster)

    instances_to_delete = []
    node_groups_to_enlarge = set()
    node_groups_to_delete = set()

    for node_group in cluster.node_groups:
        new_count = node_group_id_map[node_group.id]

        if new_count < node_group.count:
            instances_to_delete += node_group.instances[
                new_count:node_group.count]
            if new_count == 0:
                node_groups_to_delete.add(node_group.id)
        elif new_count > node_group.count:
            node_groups_to_enlarge.add(node_group.id)
            if node_group.count == 0 and node_group.auto_security_group:
                self._create_auto_security_group(node_group)

    if instances_to_delete:
        cluster = g.change_cluster_status(cluster, "Deleting Instances")

        for instance in instances_to_delete:
            with context.set_current_instance_id(instance.instance_id):
                self._shutdown_instance(instance)

        self._await_deleted(cluster, instances_to_delete)
        for ng in cluster.node_groups:
            if ng.id in node_groups_to_delete:
                self._delete_auto_security_group(ng)

    cluster = conductor.cluster_get(ctx, cluster)
    instances_to_add = []
    if node_groups_to_enlarge:
        cpo.add_provisioning_step(
            cluster.id, _("Add instances"),
            self._count_instances_to_scale(node_groups_to_enlarge,
                                           node_group_id_map, cluster))

        cluster = g.change_cluster_status(cluster, "Adding Instances")
        for ng in cluster.node_groups:
            if ng.id in node_groups_to_enlarge:
                count = node_group_id_map[ng.id]
                for idx in six.moves.xrange(ng.count + 1, count + 1):
                    instance_id = self._start_instance(
                        cluster, ng, idx, aa_group, old_aa_groups)
                    instances_to_add.append(instance_id)

    return instances_to_add
def start_cloudera_agents(self, instances):  # instances non-empty
    cpo.add_provisioning_step(instances[0].cluster_id,
                              _("Start Cloudera Agents"), len(instances))
    with context.ThreadGroup() as tg:
        for i in instances:
            tg.spawn('cdh-agent-start-%s' % i.instance_name,
                     self._start_cloudera_agent, i)
def configure_instances(pctx, instances):
    if len(instances) == 0:
        return

    cpo.add_provisioning_step(
        instances[0].cluster_id, _("Configure instances"), len(instances))

    for instance in instances:
        with context.set_current_instance_id(instance.instance_id):
            _configure_instance(pctx, instance)
def install_packages(self, instances, packages):  # instances non-empty
    cpo.add_provisioning_step(instances[0].cluster_id,
                              _("Install packages"), len(instances))
    with context.ThreadGroup() as tg:
        for i in instances:
            tg.spawn('cdh-inst-pkgs-%s' % i.instance_name,
                     self._install_pkgs, i, packages)
def _await_deleted(self, cluster, instances):
    """Wait until all instances are deleted."""
    if not instances:
        return
    cpo.add_provisioning_step(
        cluster.id, _("Wait for instances to be deleted"), len(instances))

    deleted_ids = set()
    self._check_deleted(deleted_ids, cluster, instances)
def stop(self, cluster_context, instances=None):
    instances = instances or cluster_context.get_instances()
    zookeepers = cluster_context.filter_instances(instances, mng.ZOOKEEPER)
    cpo.add_provisioning_step(cluster_context.cluster.id,
                              _("Stop ZooKeeper nodes"), len(zookeepers))
    self._stop_zk_nodes(zookeepers)
    cpo.add_provisioning_step(cluster_context.cluster.id,
                              _("Stop Warden nodes"), len(instances))
    self._stop_warden_on_nodes(instances)
def configure_instances(pctx, instances):
    if len(instances) == 0:
        return

    cpo.add_provisioning_step(
        instances[0].cluster_id, _("Configure instances"), len(instances))

    for instance in instances:
        _configure_instance(pctx, instance)
def prepare_policy_files(cluster, instances=None):
    if instances is None:
        instances = pl_utils.get_instances(cluster)

    remote_url = get_policy_url(cluster)
    cpo.add_provisioning_step(cluster.id, _("Preparing policy files"),
                              len(instances))
    with context.ThreadGroup() as tg:
        for inst in instances:
            tg.spawn('policy-files', _prepare_policy_files, inst, remote_url)
def _start_slave_processes(self, sl_instances):
    if len(sl_instances) == 0:
        return
    cpo.add_provisioning_step(
        sl_instances[0].cluster_id,
        utils.start_process_event_message("Slave"), len(sl_instances))

    with context.ThreadGroup() as tg:
        for i in sl_instances:
            tg.spawn("storm-start-sl-%s" % i.instance_name,
                     self._start_slaves, i)
def configure_swift(self, cluster, instances=None):
    if self.c_helper.is_swift_enabled(cluster):
        if not instances:
            instances = u.get_instances(cluster)
        cpo.add_provisioning_step(
            cluster.id, _("Configure Swift"), len(instances))
        with context.ThreadGroup() as tg:
            for i in instances:
                tg.spawn("cdh-swift-conf-%s" % i.instance_name,
                         self._configure_swift_to_inst, i)
        swift_helper.install_ssl_certs(instances)
def configure_rack_awareness(cluster, instances):
    if not t_helper.is_data_locality_enabled():
        return

    with _get_ambari_client(cluster) as client:
        cpo.add_provisioning_step(cluster.id, _("Configure rack awareness"),
                                  len(instances))
        for inst in instances:
            _configure_topology_data(cluster, inst, client)
        _restart_hdfs_and_mapred_services(cluster, client)
def install_swift_integration(self, servers):
    if servers:
        cpo.add_provisioning_step(
            servers[0].cluster_id, _("Install Swift integration"),
            len(servers))

    for server in servers:
        with context.set_current_instance_id(
                server.instance['instance_id']):
            server.install_swift_integration()
def start_secondarynamenodes(self, cluster):
    snns = vu.get_secondarynamenodes(cluster)
    if len(snns) == 0:
        return
    cpo.add_provisioning_step(
        snns[0].cluster_id,
        utils.start_process_event_message("SecondaryNameNodes"),
        len(snns))

    for snn in snns:
        self._start_secondarynamenode(snn)
def start_zk_server(instances):
    cpo.add_provisioning_step(instances[0].cluster_id,
                              pu.start_process_event_message("ZooKeeper"),
                              len(instances))

    with context.ThreadGroup() as tg:
        for instance in instances:
            with context.set_current_instance_id(instance.instance_id):
                tg.spawn('ZK-start-processes-%s' % instance.instance_name,
                         _start_zk_processes, instance, 'start')
def install_swift_integration(self, servers):
    if servers:
        cpo.add_provisioning_step(servers[0].cluster_id,
                                  _("Install Swift integration"),
                                  len(servers))

    for server in servers:
        with context.set_current_instance_id(
                server.instance['instance_id']):
            server.install_swift_integration()
def _start_slave_processes(self, sl_instances):
    if len(sl_instances) == 0:
        return
    cpo.add_provisioning_step(
        sl_instances[0].cluster_id,
        utils.start_process_event_message("Slave"), len(sl_instances))

    with context.ThreadGroup() as tg:
        for i in sl_instances:
            tg.spawn('storm-start-sl-%s' % i.instance_name,
                     self._start_slaves, i)
def wrapped(*args, **kwargs):
    cluster_context = _find_argument(cluster_context_reference,
                                     *args, **kwargs)
    instances = _find_argument(instances_reference, *args, **kwargs)

    cluster_id = cluster_context.cluster.id
    instance_count = len(instances)

    cpo.add_provisioning_step(cluster_id, name, instance_count)

    return function(*args, **kwargs)
def wrapped(*args, **kwargs):
    cluster_context = _find_argument(
        cluster_context_reference, *args, **kwargs)
    instances = _find_argument(instances_reference, *args, **kwargs)

    cluster_id = cluster_context.cluster.id
    instance_count = len(instances)

    cpo.add_provisioning_step(cluster_id, name, instance_count)

    return function(*args, **kwargs)
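# The two `wrapped` closures above only make sense inside a decorator
# factory: `name`, `function`, `cluster_context_reference`, and
# `instances_reference` are free variables bound by enclosing `def`s.
# A minimal sketch of such a factory follows, assuming `cpo` is the same
# cluster-progress-ops module used throughout these snippets; the factory
# name `provision_event` and this simplified `_find_argument` are
# illustrative, not necessarily the project's exact API:
import functools


def _find_argument(reference, *args, **kwargs):
    # Resolve a positional index or a keyword name against the call site.
    if isinstance(reference, int):
        return args[reference]
    return kwargs[reference]


def provision_event(name, cluster_context_reference, instances_reference):
    def decorator(function):
        @functools.wraps(function)
        def wrapped(*args, **kwargs):
            cluster_context = _find_argument(
                cluster_context_reference, *args, **kwargs)
            instances = _find_argument(instances_reference, *args, **kwargs)
            cpo.add_provisioning_step(
                cluster_context.cluster.id, name, len(instances))
            return function(*args, **kwargs)
        return wrapped
    return decorator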
def _start_datanode_processes(self, dn_instances):
    if len(dn_instances) == 0:
        return

    cpo.add_provisioning_step(
        dn_instances[0].cluster_id,
        utils.start_process_event_message("DataNodes"), len(dn_instances))

    with context.ThreadGroup() as tg:
        for i in dn_instances:
            tg.spawn("spark-start-dn-%s" % i.instance_name,
                     self._start_datanode, i)
def _start_zookeeper_processes(self, zk_instances):
    if len(zk_instances) == 0:
        return

    cpo.add_provisioning_step(
        zk_instances[0].cluster_id,
        utils.start_process_event_message("Zookeeper"), len(zk_instances))

    with context.ThreadGroup() as tg:
        for i in zk_instances:
            tg.spawn("storm-start-zk-%s" % i.instance_name,
                     self._start_zookeeper, i)
def post_install(self, cluster_context, instances):
    LOG.debug('Initializing MapR FS')
    instances = instances or cluster_context.get_instances()
    file_servers = cluster_context.filter_instances(instances, FILE_SERVER)
    cpo.add_provisioning_step(cluster_context.cluster.id,
                              _("Initializing MapR-FS"), len(file_servers))
    with context.ThreadGroup() as tg:
        for instance in file_servers:
            tg.spawn('init-mfs-%s' % instance.id,
                     self._init_mfs_instance, instance)
    LOG.info('MapR FS successfully initialized')
def _configure_topology_for_cluster(self, cluster, servers):
    if CONF.enable_data_locality:
        cpo.add_provisioning_step(
            cluster.id, _("Enable data locality for cluster"),
            len(servers))
        topology_data = th.generate_topology_map(
            cluster, CONF.enable_hypervisor_awareness)
        topology_str = "\n".join(
            [k + " " + v for k, v in topology_data.items()]) + "\n"
        for server in servers:
            server.configure_topology(topology_str)
def _configure_topology_for_cluster(self, cluster, servers):
    if CONF.enable_data_locality:
        cpo.add_provisioning_step(cluster.id,
                                  _("Enable data locality for cluster"),
                                  len(servers))
        topology_data = th.generate_topology_map(
            cluster, CONF.enable_hypervisor_awareness)
        topology_str = "\n".join(
            [k + " " + v for k, v in topology_data.items()]) + "\n"
        for server in servers:
            server.configure_topology(topology_str)
def configure_swift(self, cluster, instances=None):
    if self.c_helper.is_swift_enabled(cluster):
        if not instances:
            instances = u.get_instances(cluster)
        cpo.add_provisioning_step(
            cluster.id, _("Configure Swift"), len(instances))
        with context.ThreadGroup() as tg:
            for i in instances:
                tg.spawn('cdh-swift-conf-%s' % i.instance_name,
                         self._configure_swift_to_inst, i)
def _start_zookeeper_processes(self, zk_instances):
    if len(zk_instances) == 0:
        return

    cpo.add_provisioning_step(
        zk_instances[0].cluster_id,
        utils.start_process_event_message("Zookeeper"), len(zk_instances))

    with context.ThreadGroup() as tg:
        for i in zk_instances:
            tg.spawn('storm-start-zk-%s' % i.instance_name,
                     self._start_zookeeper, i)
def prepare_policy_files(cluster, instances=None):
    if instances is None:
        instances = pl_utils.get_instances(cluster)

    remote_url = get_policy_url(cluster)
    cpo.add_provisioning_step(
        cluster.id, _("Preparing policy files"), len(instances))
    with context.ThreadGroup() as tg:
        for inst in instances:
            tg.spawn(
                'policy-files', _prepare_policy_files, inst, remote_url)
def _start_datanode_processes(self, dn_instances):
    if len(dn_instances) == 0:
        return

    cpo.add_provisioning_step(
        dn_instances[0].cluster_id,
        utils.start_process_event_message("DataNodes"), len(dn_instances))

    with context.ThreadGroup() as tg:
        for i in dn_instances:
            tg.spawn('spark-start-dn-%s' % i.instance_name,
                     self._start_datanode, i)
def _await_active(self, cluster, instances):
    """Wait until all instances are in Active status and available."""
    if not instances:
        return
    cpo.add_provisioning_step(
        cluster.id, _("Wait for instances to become active"),
        len(instances))

    active_ids = set()
    self._check_active(active_ids, cluster, instances)

    LOG.info(_LI("All instances are active"))
def wrapped(*args, **kwargs):
    event_instance = instance or _find_argument(
        instance_reference, *args, **kwargs)
    if name:
        cpo.add_provisioning_step(
            event_instance.node_group.cluster.id, name, 1)
    try:
        result = function(*args, **kwargs)
        cpo.add_successful_event(event_instance)
        return result
    except Exception as exception:
        cpo.add_fail_event(event_instance, exception)
        raise exception
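# A hedged usage sketch for the closure above: assuming it is returned by a
# decorator factory (called `event_wrapper` here; the real name and
# signature are assumptions), a per-instance operation gets a one-event
# provisioning step, and its success or failure is recorded automatically:
@event_wrapper(name=_("Start process"), instance_reference=1)
def start_process(remote, instance):
    # instance_reference=1 points the wrapper at the second positional
    # argument, so events are attributed to this instance.
    remote.execute_command('service my-process start', run_as_root=True)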
def setup_clients(cluster, server=None, instances=None):
    if not instances:
        instances = cl_utils.get_instances(cluster)

    server_ip = None
    cpo.add_provisioning_step(cluster.id, _("Setting Up Kerberos clients"),
                              len(instances))
    if not server:
        server_ip = get_kdc_server_ip(cluster)
    with context.ThreadGroup() as tg:
        for instance in instances:
            tg.spawn('setup-client-%s' % instance.instance_name,
                     _setup_client_node, cluster, instance,
                     server, server_ip)
def _push_configs_to_nodes(self, cluster, extra, new_instances):
    all_instances = utils.get_instances(cluster)
    cpo.add_provisioning_step(cluster.id, _("Push configs to nodes"),
                              len(all_instances))
    with context.ThreadGroup() as tg:
        for instance in all_instances:
            if instance in new_instances:
                tg.spawn('spark-configure-%s' % instance.instance_name,
                         self._push_configs_to_new_node, cluster,
                         extra, instance)
            else:
                tg.spawn('spark-reconfigure-%s' % instance.instance_name,
                         self._push_configs_to_existing_node, cluster,
                         extra, instance)
def setup_clients(cluster, server=None, instances=None):
    if not instances:
        instances = cl_utils.get_instances(cluster)

    server_ip = None
    cpo.add_provisioning_step(
        cluster.id, _("Setting Up Kerberos clients"), len(instances))
    if not server:
        server_ip = get_kdc_server_ip(cluster)
    with context.ThreadGroup() as tg:
        for instance in instances:
            tg.spawn('setup-client-%s' % instance.instance_name,
                     _setup_client_node, cluster, instance,
                     server, server_ip)
def _push_configs_to_nodes(self, cluster, extra, new_instances):
    all_instances = utils.get_instances(cluster)
    cpo.add_provisioning_step(
        cluster.id, _("Push configs to nodes"), len(all_instances))
    with context.ThreadGroup() as tg:
        for instance in all_instances:
            if instance in new_instances:
                tg.spawn('spark-configure-%s' % instance.instance_name,
                         self._push_configs_to_new_node, cluster,
                         extra, instance)
            else:
                tg.spawn('spark-reconfigure-%s' % instance.instance_name,
                         self._push_configs_to_existing_node, cluster,
                         extra, instance)
def attach_to_instances(instances):
    instances_to_attach = _count_instances_to_attach(instances)
    if instances_to_attach == 0:
        return

    cpo.add_provisioning_step(
        instances[0].cluster_id, _("Attach volumes to instances"),
        instances_to_attach)

    with context.ThreadGroup() as tg:
        for instance in instances:
            if instance.node_group.volumes_per_node > 0:
                tg.spawn(
                    'attach-volumes-for-instance-%s'
                    % instance.instance_name,
                    _attach_volumes_to_node, instance.node_group, instance)
def _configure_instances(self, cluster):
    """Configure active instances.

    * generate /etc/hosts
    * set up passwordless login
    * etc.
    """
    hosts_file = g.generate_etc_hosts(cluster)

    cpo.add_provisioning_step(cluster.id, _("Configure instances"),
                              g.count_instances(cluster))

    with context.ThreadGroup() as tg:
        for node_group in cluster.node_groups:
            for instance in node_group.instances:
                tg.spawn("configure-instance-%s" % instance.instance_name,
                         self._configure_instance, instance, hosts_file)
def _configure_instances(self, cluster):
    """Configure active instances.

    * generate /etc/hosts
    * set up passwordless login
    * etc.
    """
    hosts_file = g.generate_etc_hosts(cluster)

    cpo.add_provisioning_step(
        cluster.id, _("Configure instances"), g.count_instances(cluster))

    with context.ThreadGroup() as tg:
        for node_group in cluster.node_groups:
            for instance in node_group.instances:
                tg.spawn("configure-instance-%s" % instance.instance_name,
                         self._configure_instance, instance, hosts_file)
def refresh_zk_servers(cluster, to_delete_instances=None):
    instances = vu.get_zk_servers(cluster)
    if to_delete_instances:
        for instance in to_delete_instances:
            if instance in instances:
                instances.remove(instance)

    cpo.add_provisioning_step(cluster.id,
                              pu.start_process_event_message("ZooKeeper"),
                              len(instances))

    with context.ThreadGroup() as tg:
        for instance in instances:
            with context.set_current_instance_id(instance.instance_id):
                tg.spawn('ZK-restart-processes-%s' % instance.instance_name,
                         _start_zk_processes, instance, 'restart')
def _start_tt_dn_processes(self, instances):
    tt_dn_names = ["datanode", "tasktracker"]

    instances = utils.instances_with_services(instances, tt_dn_names)

    if not instances:
        return

    cpo.add_provisioning_step(
        instances[0].cluster_id,
        utils.start_process_event_message("DataNodes, TaskTrackers"),
        len(instances))

    with context.ThreadGroup() as tg:
        for i in instances:
            processes = set(i.node_group.node_processes)
            tt_dn_procs = processes.intersection(tt_dn_names)
            tg.spawn("vanilla-start-tt-dn-%s" % i.instance_name,
                     self._start_tt_dn, i, list(tt_dn_procs))
def _create_instances(self, cluster):
    ctx = context.ctx()

    cluster = self._create_auto_security_groups(cluster)

    aa_group = None
    if cluster.anti_affinity:
        aa_group = self._create_aa_server_group(cluster)

    cpo.add_provisioning_step(
        cluster.id, _("Run instances"), g.count_instances(cluster))

    for node_group in cluster.node_groups:
        count = node_group.count
        conductor.node_group_update(ctx, node_group, {'count': 0})
        for idx in six.moves.xrange(1, count + 1):
            self._start_instance(
                cluster, node_group, idx, aa_group=aa_group)
def mount_to_instances(instances):
    if len(instances) == 0:
        return

    cpo.add_provisioning_step(
        instances[0].cluster_id,
        _("Mount volumes to instances"), _count_volumes_to_mount(instances))

    with context.ThreadGroup() as tg:
        for instance in instances:
            devices = _find_instance_volume_devices(instance)

            # Since formatting can take several minutes (for large disks)
            # and can be done in parallel, launch one thread per disk.
            for idx in range(0, instance.node_group.volumes_per_node):
                tg.spawn('mount-volume-%d-to-node-%s' %
                         (idx, instance.instance_name),
                         _mount_volume_to_node, instance, idx, devices[idx])
def start_dn_nm_processes(instances):
    filternames = ['datanode', 'nodemanager']
    instances = pu.instances_with_services(instances, filternames)

    if len(instances) == 0:
        return

    cpo.add_provisioning_step(
        instances[0].cluster_id,
        pu.start_process_event_message("DataNodes, NodeManagers"),
        len(instances))

    with context.ThreadGroup() as tg:
        for instance in instances:
            with context.set_current_instance_id(instance.instance_id):
                processes = set(instance.node_group.node_processes)
                processes = processes.intersection(filternames)
                tg.spawn('vanilla-start-processes-%s'
                         % instance.instance_name,
                         _start_processes, instance, list(processes))