def launch_instances(self, ctx, cluster, target_count):
    """Create cluster instances through Heat and prepare them for use.

    Walks the cluster through the three status values in ``self.STAGES``:
    stack creation, network readiness, and instance preparation.  Stores
    the ids of the populated instances on ``self.inst_ids`` as a side
    effect, so later stages (and callers) can look them up.

    :param ctx: request context used for all conductor calls
    :param cluster: cluster object to provision
    :param target_count: desired instance counts passed to the template
    """
    # create all instances
    cluster = conductor.cluster_update(ctx, cluster,
                                       {"status": self.STAGES[0]})
    LOG.info(g.format_cluster_status(cluster))

    # Build the Heat template and create (or update, when
    # self.UPDATE_STACK is set) the stack, blocking until it is active.
    tmpl = heat.ClusterTemplate(cluster)
    self._configure_template(ctx, tmpl, cluster, target_count)
    stack = tmpl.instantiate(update_existing=self.UPDATE_STACK)
    stack.wait_till_active()

    # Record the newly created instances in the DB; ids kept on self.
    self.inst_ids = self._populate_cluster(ctx, cluster, stack)

    # wait for all instances are up and networks ready
    cluster = conductor.cluster_update(ctx, cluster,
                                       {"status": self.STAGES[1]})
    LOG.info(g.format_cluster_status(cluster))

    instances = g.get_instances(cluster, self.inst_ids)
    self._await_networks(cluster, instances)

    # prepare all instances
    cluster = conductor.cluster_update(ctx, cluster,
                                       {"status": self.STAGES[2]})
    LOG.info(g.format_cluster_status(cluster))

    # Re-fetch instances: _await_networks may have refreshed their state.
    instances = g.get_instances(cluster, self.inst_ids)
    volumes.mount_to_instances(instances)

    self._configure_instances(cluster)
def scale_cluster(self, cluster, node_group_id_map):
    """Scale cluster infrastructure to the counts in node_group_id_map.

    Creates/deletes instances, waits for them to become active and
    networked, and attaches volumes.  On any failure the scaling is
    rolled back, the cluster status is repaired, and the original
    exception is re-raised via ``save_and_reraise_exception``.

    :param cluster: cluster object being scaled
    :param node_group_id_map: mapping of node group id -> desired count
    :returns: list of ids of the newly created instances (empty if
        creation failed and was rolled back)
    """
    ctx = context.ctx()

    instance_ids = []
    try:
        instance_ids = self._scale_cluster_instances(cluster,
                                                     node_group_id_map)

        # Re-read the cluster after instance changes, then drop node
        # groups that ended up with zero instances.
        cluster = conductor.cluster_get(ctx, cluster)
        g.clean_cluster_from_empty_ng(cluster)

        cluster = conductor.cluster_get(ctx, cluster)
        instances = g.get_instances(cluster, instance_ids)

        self._await_active(cluster, instances)

        self._assign_floating_ips(instances)

        self._await_networks(cluster, instances)

        cluster = conductor.cluster_get(ctx, cluster)

        volumes.attach_to_instances(
            g.get_instances(cluster, instance_ids))

    except Exception as ex:
        # The with-block runs the rollback and then re-raises ex, so the
        # code after this except clause executes only on success.
        with excutils.save_and_reraise_exception():
            self._log_operation_exception(
                "Can't scale cluster '%s' (reason: %s)", cluster, ex)

            cluster = conductor.cluster_get(ctx, cluster)
            self._rollback_cluster_scaling(
                cluster, g.get_instances(cluster, instance_ids), ex)
            instance_ids = []

            cluster = conductor.cluster_get(ctx, cluster)
            g.clean_cluster_from_empty_ng(cluster)
            # 'Decommissioning' means the failure happened before the
            # plugin finished removing nodes — the cluster is broken.
            if cluster.status == 'Decommissioning':
                cluster = conductor.cluster_update(ctx, cluster,
                                                   {"status": "Error"})
            else:
                cluster = conductor.cluster_update(ctx, cluster,
                                                   {"status": "Active"})

            LOG.info(g.format_cluster_status(cluster))

    # we should be here with valid cluster: if instances creation
    # was not successful all extra-instances will be removed above
    if instance_ids:
        self._configure_instances(cluster)
    return instance_ids
def scale_cluster(self, cluster, node_group_id_map):
    """Resize the cluster to the per-node-group counts requested.

    On failure, scaling is rolled back and the exception re-raised; on
    success the new instances are configured and their ids returned.
    """
    ctx = context.ctx()
    new_ids = []

    try:
        new_ids = self._scale_cluster_instances(cluster,
                                                node_group_id_map)

        cluster = conductor.cluster_get(ctx, cluster)
        g.clean_cluster_from_empty_ng(cluster)
        cluster = conductor.cluster_get(ctx, cluster)

        added_instances = g.get_instances(cluster, new_ids)
        self._await_active(cluster, added_instances)
        self._assign_floating_ips(added_instances)
        self._await_networks(cluster, added_instances)

        cluster = conductor.cluster_get(ctx, cluster)
        volumes.attach_to_instances(g.get_instances(cluster, new_ids))
    except Exception as exc:
        with excutils.save_and_reraise_exception():
            self._log_operation_exception(
                "Can't scale cluster '%s' (reason: %s)", cluster, exc)

            cluster = conductor.cluster_get(ctx, cluster)
            self._rollback_cluster_scaling(
                cluster, g.get_instances(cluster, new_ids), exc)
            new_ids = []

            cluster = conductor.cluster_get(ctx, cluster)
            g.clean_cluster_from_empty_ng(cluster)

            # Interrupted mid-decommission -> cluster is unusable.
            final_status = ("Error"
                            if cluster.status == 'Decommissioning'
                            else "Active")
            cluster = conductor.cluster_update(ctx, cluster,
                                               {"status": final_status})
            LOG.info(g.format_cluster_status(cluster))

    # we should be here with valid cluster: if instances creation
    # was not successful all extra-instances will be removed above
    if new_ids:
        self._configure_instances(cluster)
    return new_ids
def _await_networks(self, cluster, instances):
    """Wait until every instance has an IP and is reachable over SSH.

    Polls ``networks.init_instances_ips`` once a second until each
    instance has its IPs assigned, then waits (in parallel threads) for
    each instance to become accessible.  Returns early, silently, if
    there are no instances or the cluster has been deleted meanwhile.

    :param cluster: cluster the instances belong to (used for logging)
    :param instances: instance objects to wait for
    """
    if not instances:
        return

    ips_assigned = set()
    while len(ips_assigned) != len(instances):
        # Bail out if the cluster was deleted while we were polling.
        if not g.check_cluster_exists(instances[0].node_group.cluster):
            return
        for instance in instances:
            # Merged nested ifs; init_instances_ips returns truthy once
            # the instance's IPs are available.
            if (instance.id not in ips_assigned
                    and networks.init_instances_ips(instance)):
                ips_assigned.add(instance.id)

        # Only sleep when there is still something to wait for — avoids
        # a pointless final 1-second delay after the last IP arrives.
        if len(ips_assigned) != len(instances):
            context.sleep(1)

    # Lazy %-style args instead of eager string formatting.
    LOG.info("Cluster '%s': all instances have IPs assigned", cluster.id)

    ctx = context.ctx()
    cluster = conductor.cluster_get(ctx, instances[0].node_group.cluster)
    instances = g.get_instances(cluster, ips_assigned)

    # Wait for SSH accessibility concurrently, one thread per instance.
    with context.ThreadGroup() as tg:
        for instance in instances:
            tg.spawn("wait-for-ssh-%s" % instance.instance_name,
                     self._wait_until_accessible, instance)

    LOG.info("Cluster '%s': all instances are accessible", cluster.id)
def shutdown_cluster(self, cluster):
    """Shutdown specified cluster and all related resources.

    Deletes the backing Heat stack (tolerating an already-missing
    stack), cleans up job executions, and removes every instance record
    from the database.

    :param cluster: cluster to tear down
    """
    try:
        heat.client().stacks.delete(cluster.name)
    except heat_exc.HTTPNotFound:
        # Stack already gone — log and continue with DB cleanup.
        # Fixed grammar ("Did not found" -> "Did not find"), switched
        # deprecated LOG.warn to LOG.warning, and use lazy log args.
        LOG.warning("Did not find stack for cluster %s", cluster.name)

    self._clean_job_executions(cluster)

    ctx = context.ctx()
    instances = g.get_instances(cluster)
    for inst in instances:
        conductor.instance_remove(ctx, inst)
def shutdown_cluster(self, cluster):
    """Shutdown specified cluster and all related resources."""
    try:
        heat.client().stacks.delete(cluster.name)
    except heat_exc.HTTPNotFound:
        # Stack may already be gone; that's fine for a shutdown.
        LOG.warn('Did not found stack for cluster %s' % cluster.name)

    self._clean_job_executions(cluster)

    request_ctx = context.ctx()
    for instance in g.get_instances(cluster):
        conductor.instance_remove(request_ctx, instance)
def _populate_cluster(self, ctx, cluster, stack):
    """Record stack instances that are not yet known to the DB.

    Compares the nova ids reported by the Heat stack with the instances
    already stored for the cluster and registers any new ones via the
    conductor.

    :param ctx: request context for conductor calls
    :param cluster: cluster whose node groups are inspected
    :param stack: Heat stack wrapper exposing get_node_group_instances
    :returns: list of DB ids of the newly added instances
    """
    # Set instead of list: membership is tested inside a nested loop,
    # so this turns an O(groups * stack * known) scan into O(1) lookups.
    old_ids = {i.instance_id for i in g.get_instances(cluster)}

    new_ids = []
    for node_group in cluster.node_groups:
        nova_ids = stack.get_node_group_instances(node_group)
        for name, nova_id in nova_ids:
            if nova_id not in old_ids:
                instance_id = conductor.instance_add(
                    ctx, node_group,
                    {"instance_id": nova_id, "instance_name": name})
                new_ids.append(instance_id)
    return new_ids
def _provision_scaled_cluster(id, node_group_id_map):
    """Drive a full scale operation for the cluster with the given id.

    Status flow: Decommissioning (plugin removes surplus nodes) ->
    Scaling (infrastructure resized) -> Configuring (plugin sets up new
    nodes) -> Active.  On a plugin configuration failure the cluster is
    marked Error and the function returns early.

    :param id: cluster id (parameter name kept for API compatibility,
        although it shadows the builtin)
    :param node_group_id_map: mapping of node group id -> desired count
    """
    ctx = context.ctx()
    cluster = conductor.cluster_get(ctx, id)
    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)

    # Decommissioning surplus nodes with the plugin
    cluster = conductor.cluster_update(ctx, cluster,
                                       {"status": "Decommissioning"})
    LOG.info(g.format_cluster_status(cluster))

    # Collect the tail of each node group's instance list beyond the
    # newly requested count; those are the nodes to remove.
    instances_to_delete = []
    for node_group in cluster.node_groups:
        new_count = node_group_id_map[node_group.id]
        if new_count < node_group.count:
            instances_to_delete += node_group.instances[new_count:
                                                        node_group.count]

    if instances_to_delete:
        plugin.decommission_nodes(cluster, instances_to_delete)

    # Scaling infrastructure
    cluster = conductor.cluster_update(ctx, cluster, {"status": "Scaling"})
    LOG.info(g.format_cluster_status(cluster))

    instances = INFRA.scale_cluster(cluster, node_group_id_map)

    # Setting up new nodes with the plugin
    if instances:
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Configuring"})
        LOG.info(g.format_cluster_status(cluster))
        try:
            instances = g.get_instances(cluster, instances)
            plugin.scale_cluster(cluster, instances)
        except Exception as ex:
            LOG.exception("Can't scale cluster '%s' (reason: %s)",
                          cluster.name, ex)
            cluster = conductor.cluster_update(ctx, cluster,
                                               {"status": "Error"})
            LOG.info(g.format_cluster_status(cluster))
            # Leave the cluster in Error state; do not mark Active below.
            return

    cluster = conductor.cluster_update(ctx, cluster, {"status": "Active"})
    LOG.info(g.format_cluster_status(cluster))
def _provision_scaled_cluster(id, node_group_id_map):
    """Scale the cluster: decommission extras, resize, configure, activate."""
    ctx = context.ctx()
    cluster = conductor.cluster_get(ctx, id)
    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)

    # Decommissioning surplus nodes with the plugin
    cluster = conductor.cluster_update(ctx, cluster,
                                       {"status": "Decommissioning"})
    LOG.info(g.format_cluster_status(cluster))

    surplus_instances = []
    for ng in cluster.node_groups:
        desired_count = node_group_id_map[ng.id]
        if desired_count < ng.count:
            surplus_instances.extend(ng.instances[desired_count:ng.count])

    if surplus_instances:
        plugin.decommission_nodes(cluster, surplus_instances)

    # Scaling infrastructure
    cluster = conductor.cluster_update(ctx, cluster, {"status": "Scaling"})
    LOG.info(g.format_cluster_status(cluster))

    new_instances = INFRA.scale_cluster(cluster, node_group_id_map)

    # Setting up new nodes with the plugin
    if new_instances:
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Configuring"})
        LOG.info(g.format_cluster_status(cluster))
        try:
            plugin.scale_cluster(cluster,
                                 g.get_instances(cluster, new_instances))
        except Exception as ex:
            LOG.exception("Can't scale cluster '%s' (reason: %s)",
                          cluster.name, ex)
            cluster = conductor.cluster_update(ctx, cluster,
                                               {"status": "Error"})
            LOG.info(g.format_cluster_status(cluster))
            return

    cluster = conductor.cluster_update(ctx, cluster, {"status": "Active"})
    LOG.info(g.format_cluster_status(cluster))
def _populate_cluster(self, ctx, cluster, stack):
    """Add any stack instances missing from the DB; return their new ids."""
    known_nova_ids = [inst.instance_id
                      for inst in g.get_instances(cluster)]

    added_ids = []
    for ng in cluster.node_groups:
        for name, nova_id in stack.get_node_group_instances(ng):
            if nova_id in known_nova_ids:
                continue  # already recorded for this cluster
            added_ids.append(conductor.instance_add(
                ctx, ng,
                {"instance_id": nova_id, "instance_name": name}))
    return added_ids
def test_delete_floating_ips(self, novaclient):
    """Shutting down instances must release their floating IPs too."""
    nova = _create_nova_mock(novaclient)

    node_groups = [_make_ng_dict("test_group_1", "test_flavor",
                                 ["data node", "test tracker"], 2,
                                 "pool")]
    cluster = _create_cluster_mock(node_groups, ["datanode"])
    self.engine._create_instances(cluster)

    ctx = context.ctx()
    cluster = conductor.cluster_get(ctx, cluster)
    self.engine._assign_floating_ips(g.get_instances(cluster))

    self.engine._shutdown_instances(cluster)

    self.assertEqual(nova.floating_ips.delete.call_count, 2,
                     "Not expected floating IPs number found in delete")
    self.assertEqual(nova.servers.delete.call_count, 2, "Not expected")
def test_delete_floating_ips(self):
    """Floating IPs assigned to instances are deleted on shutdown."""
    node_groups = [_make_ng_dict("test_group_1", "test_flavor",
                                 ["data node", "test tracker"], 2,
                                 'pool')]
    cluster = _create_cluster_mock(node_groups, ["datanode"])
    self.engine._create_instances(cluster)

    ctx = context.ctx()
    cluster = conductor.cluster_get(ctx, cluster)
    created_instances = g.get_instances(cluster)
    self.engine._assign_floating_ips(created_instances)

    self.engine._shutdown_instances(cluster)

    self.assertEqual(self.nova.floating_ips.delete.call_count, 2,
                     "Not expected floating IPs number found in delete")
    self.assertEqual(self.nova.servers.delete.call_count, 2,
                     "Not expected")
def create_cluster(self, cluster):
    """Provision a new cluster end to end.

    Status flow: Spawning -> Waiting -> Preparing.  On any failure the
    cluster is marked Error (with the exception text as description),
    creation is rolled back, and the exception is re-raised.

    :param cluster: cluster object to create
    """
    ctx = context.ctx()
    try:
        # create all instances
        # Fix: capture the updated cluster so the status logged below is
        # "Spawning", not the stale pre-update value (every other
        # cluster_update call site in this file assigns the result).
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Spawning"})
        LOG.info(g.format_cluster_status(cluster))
        self._create_instances(cluster)

        # wait for all instances are up and networks ready
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Waiting"})
        LOG.info(g.format_cluster_status(cluster))

        instances = g.get_instances(cluster)

        self._await_active(cluster, instances)
        self._assign_floating_ips(instances)
        self._await_networks(cluster, instances)

        cluster = conductor.cluster_get(ctx, cluster)

        # attach volumes
        volumes.attach(cluster)

        # prepare all instances
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Preparing"})
        LOG.info(g.format_cluster_status(cluster))

        self._configure_instances(cluster)
    except Exception as ex:
        # Mark Error, roll back, then re-raise the original exception.
        with excutils.save_and_reraise_exception():
            self._log_operation_exception(
                "Can't start cluster '%s' (reason: %s)", cluster, ex)

            cluster = conductor.cluster_update(
                ctx, cluster, {"status": "Error",
                               "status_description": str(ex)})
            LOG.info(g.format_cluster_status(cluster))
            self._rollback_cluster_creation(cluster, ex)
def create_cluster(self, cluster):
    """Create all instances for the cluster and bring it to readiness.

    Moves the cluster through Spawning, Waiting, and Preparing; any
    failure sets status Error, rolls back, and re-raises.

    :param cluster: cluster object to create
    """
    ctx = context.ctx()
    try:
        # create all instances
        # Fix: assign the cluster_update result back to ``cluster`` so
        # the subsequent status log is accurate — the original dropped
        # the updated object, unlike every other call site here.
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Spawning"})
        LOG.info(g.format_cluster_status(cluster))
        self._create_instances(cluster)

        # wait for all instances are up and networks ready
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Waiting"})
        LOG.info(g.format_cluster_status(cluster))

        instances = g.get_instances(cluster)
        self._await_active(cluster, instances)
        self._assign_floating_ips(instances)
        self._await_networks(cluster, instances)

        cluster = conductor.cluster_get(ctx, cluster)

        # attach volumes
        volumes.attach(cluster)

        # prepare all instances
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Preparing"})
        LOG.info(g.format_cluster_status(cluster))

        self._configure_instances(cluster)
    except Exception as ex:
        with excutils.save_and_reraise_exception():
            self._log_operation_exception(
                "Can't start cluster '%s' (reason: %s)", cluster, ex)
            cluster = conductor.cluster_update(
                ctx, cluster,
                {"status": "Error", "status_description": str(ex)})
            LOG.info(g.format_cluster_status(cluster))
            self._rollback_cluster_creation(cluster, ex)
def test_ip_assignment_use_no_floating(self):
    """Only node groups with a floating IP pool get floating IPs."""
    self.override_config("use_floating_ips", False)

    node_groups = [
        _make_ng_dict("test_group_1", "test_flavor",
                      ["data node", "test tracker"], 2, 'pool'),
        _make_ng_dict("test_group_2", "test_flavor",
                      ["name node", "test tracker"], 1),
    ]
    cluster = _create_cluster_mock(node_groups, ["data node"])
    self.engine._create_instances(cluster)

    ctx = context.ctx()
    cluster = conductor.cluster_get(ctx, cluster)
    self.engine._assign_floating_ips(g.get_instances(cluster))

    # Two instances in the pooled group -> exactly two IP creations.
    self.nova.floating_ips.create.assert_has_calls(
        [mock.call("pool"), mock.call("pool")])
    self.assertEqual(self.nova.floating_ips.create.call_count, 2,
                     "Not expected floating IPs number found.")
def test_ip_assignment_use_no_floating(self, cfg, novaclient):
    """With use_floating_ips off, only pooled node groups get IPs."""
    cfg.CONF.set_override("use_floating_ips", False)
    try:
        nova = _create_nova_mock(novaclient)

        node_groups = [
            _make_ng_dict("test_group_1", "test_flavor",
                          ["data node", "test tracker"], 2, "pool"),
            _make_ng_dict("test_group_2", "test_flavor",
                          ["name node", "test tracker"], 1),
        ]
        cluster = _create_cluster_mock(node_groups, ["data node"])
        self.engine._create_instances(cluster)

        ctx = context.ctx()
        cluster = conductor.cluster_get(ctx, cluster)
        self.engine._assign_floating_ips(g.get_instances(cluster))

        # Exactly the two pooled instances should request an IP.
        nova.floating_ips.create.assert_has_calls(
            [mock.call("pool"), mock.call("pool")])
        self.assertEqual(nova.floating_ips.create.call_count, 2,
                         "Not expected floating IPs number found.")
    finally:
        # Always restore the config flag for other tests.
        cfg.CONF.clear_override("use_floating_ips")