Example #1
    def _check_swift_availability(self, cluster_info):
        plugin_config = cluster_info['plugin_config']
        # Make unique name of Swift container during Swift testing
        swift_container_name = 'Swift-test-' + str(uuid.uuid4())[:8]
        extra_script_parameters = {
            'OS_TENANT_NAME': self.common_config.OS_TENANT_NAME,
            'OS_USERNAME': self.common_config.OS_USERNAME,
            'OS_PASSWORD': self.common_config.OS_PASSWORD,
            'HADOOP_USER': plugin_config.HADOOP_USER,
            'SWIFT_CONTAINER_NAME': swift_container_name
        }
        namenode_ip = cluster_info['node_info']['namenode_ip']
        self.open_ssh_connection(namenode_ip, plugin_config.SSH_USERNAME)
        try:
            self.transfer_helper_script_to_node(
                'swift_test_script.sh', parameter_list=extra_script_parameters
            )

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(str(e))
        swift = self.connect_to_swift()
        swift.put_container(swift_container_name)
        try:
            self.execute_command('./script.sh')

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(str(e))

        finally:
            self.delete_swift_container(swift, swift_container_name)
        self.close_ssh_connection()
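
Every snippet on this page follows the same pattern: cleanup work runs inside excutils.save_and_reraise_exception(), and the original exception (with its traceback) is re-raised automatically when the with block exits. Below is a minimal, self-contained sketch of that pattern; risky_call and cleanup are placeholder names, and the import path shown is the modern oslo.utils location (Savanna-era code imports the oslo-incubator copy instead).

from oslo_utils import excutils  # Savanna-era code: from savanna.openstack.common import excutils


def risky_call():
    raise RuntimeError("boom")


def cleanup():
    print("cleanup ran before the re-raise")


try:
    risky_call()
except Exception:
    with excutils.save_and_reraise_exception():
        # Cleanup runs first; the original RuntimeError is then
        # re-raised automatically when this block exits.
        cleanup()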
Example #2
    def consume_in_thread(self):
        """Runs the ZmqProxy service."""
        ipc_dir = CONF.rpc_zmq_ipc_dir
        consume_in = "tcp://%s:%s" % \
            (CONF.rpc_zmq_bind_address,
             CONF.rpc_zmq_port)
        consumption_proxy = InternalContext(None)

        try:
            os.makedirs(ipc_dir)
        except os.error:
            if not os.path.isdir(ipc_dir):
                with excutils.save_and_reraise_exception():
                    LOG.error(_("Required IPC directory does not exist at"
                                " %s") % (ipc_dir, ))
        try:
            self.register(consumption_proxy,
                          consume_in,
                          zmq.PULL)
        except zmq.ZMQError:
            if os.access(ipc_dir, os.X_OK):
                with excutils.save_and_reraise_exception():
                    LOG.error(_("Permission denied to IPC directory at"
                                " %s") % (ipc_dir, ))
            with excutils.save_and_reraise_exception():
                LOG.error(_("Could not create ZeroMQ receiver daemon. "
                            "Socket may already be in use."))

        super(ZmqProxy, self).consume_in_thread()
Example #3
    def consume_in_thread(self):
        """Runs the ZmqProxy service."""
        ipc_dir = CONF.rpc_zmq_ipc_dir
        consume_in = "tcp://%s:%s" % \
            (CONF.rpc_zmq_bind_address,
             CONF.rpc_zmq_port)
        consumption_proxy = InternalContext(None)

        try:
            os.makedirs(ipc_dir)
        except os.error:
            if not os.path.isdir(ipc_dir):
                with excutils.save_and_reraise_exception():
                    LOG.error(
                        _("Required IPC directory does not exist at"
                          " %s") % (ipc_dir, ))
        try:
            self.register(consumption_proxy, consume_in, zmq.PULL)
        except zmq.ZMQError:
            if os.access(ipc_dir, os.X_OK):
                with excutils.save_and_reraise_exception():
                    LOG.error(
                        _("Permission denied to IPC directory at"
                          " %s") % (ipc_dir, ))
            with excutils.save_and_reraise_exception():
                LOG.error(
                    _("Could not create ZeroMQ receiver daemon. "
                      "Socket may already be in use."))

        super(ZmqProxy, self).consume_in_thread()
Example #4
    def check_swift_availability(self, cluster_info):
        plugin_config = cluster_info['plugin_config']
        # Make unique name of Swift container during Swift testing
        swift_container_name = 'Swift-test-' + str(uuid.uuid4())[:8]
        extra_script_parameters = {
            'OS_TENANT_NAME': self.common_config.OS_TENANT_NAME,
            'OS_USERNAME': self.common_config.OS_USERNAME,
            'OS_PASSWORD': self.common_config.OS_PASSWORD,
            'HADOOP_USER': plugin_config.HADOOP_USER,
            'SWIFT_CONTAINER_NAME': swift_container_name
        }
        namenode_ip = cluster_info['node_info']['namenode_ip']
        self.open_ssh_connection(namenode_ip, plugin_config.SSH_USERNAME)
        try:
            self.transfer_helper_script_to_node(
                'swift_test_script.sh', parameter_list=extra_script_parameters)

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(str(e))
        swift = self.connect_to_swift()
        swift.put_container(swift_container_name)
        try:
            self.execute_command('./script.sh')

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(str(e))

        finally:
            self.delete_swift_container(swift, swift_container_name)
        self.close_ssh_connection()
Example #5
    def create_cluster_and_get_info(self, plugin_config, cluster_template_id,
                                    description, cluster_configs,
                                    node_groups=None, anti_affinity=None,
                                    net_id=None, is_transient=False):
        self.cluster_id = None
        data = self.savanna.clusters.create(
            self.common_config.CLUSTER_NAME + '-' + plugin_config.PLUGIN_NAME,
            plugin_config.PLUGIN_NAME, plugin_config.HADOOP_VERSION,
            cluster_template_id, plugin_config.IMAGE_ID, is_transient,
            description, cluster_configs, node_groups,
            self.common_config.USER_KEYPAIR_ID, anti_affinity, net_id)
        self.cluster_id = data.id
        self.poll_cluster_state(self.cluster_id)
        node_ip_list_with_node_processes = (
            self.get_cluster_node_ip_list_with_node_processes(self.cluster_id))
        try:
            node_info = self.get_node_info(node_ip_list_with_node_processes,
                                           plugin_config)

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(
                    '\nFailure during check of node process deployment '
                    'on cluster node: ' + str(e)
                )
        try:
            self.await_active_workers_for_namenode(node_info, plugin_config)

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(
                    '\nFailure while active worker waiting for namenode: '
                    + str(e)
                )
        # For example, the method "create_cluster_and_get_info" returns
        # {
        #       'node_info': {
        #               'tasktracker_count': 3,
        #               'node_count': 6,
        #               'namenode_ip': '172.18.168.242',
        #               'datanode_count': 3
        #               },
        #       'cluster_id': 'bee5c6a1-411a-4e88-95fc-d1fbdff2bb9d',
        #       'node_ip_list': {
        #               '172.18.168.153': ['tasktracker', 'datanode'],
        #               '172.18.168.208': ['secondarynamenode', 'oozie'],
        #               '172.18.168.93': ['tasktracker'],
        #               '172.18.168.101': ['tasktracker', 'datanode'],
        #               '172.18.168.242': ['namenode', 'jobtracker'],
        #               '172.18.168.167': ['datanode']
        #       },
        #       'plugin_config': <oslo.config.cfg.GroupAttr object at 0x215d9d>
        # }
        return {
            'cluster_id': self.cluster_id,
            'node_ip_list': node_ip_list_with_node_processes,
            'node_info': node_info,
            'plugin_config': plugin_config
        }
Example #6
    def create_cluster_and_get_info(self, plugin_config, cluster_template_id,
                                    description, cluster_configs,
                                    node_groups=None, anti_affinity=None,
                                    net_id=None, is_transient=False):
        self.cluster_id = None
        data = self.savanna.clusters.create(
            self.common_config.CLUSTER_NAME + '-' + plugin_config.PLUGIN_NAME,
            plugin_config.PLUGIN_NAME, plugin_config.HADOOP_VERSION,
            cluster_template_id, plugin_config.IMAGE_ID, is_transient,
            description, cluster_configs, node_groups,
            self.common_config.USER_KEYPAIR_ID, anti_affinity, net_id)
        self.cluster_id = data.id
        self.poll_cluster_state(self.cluster_id)
        node_ip_list_with_node_processes = (
            self.get_cluster_node_ip_list_with_node_processes(self.cluster_id))
        try:
            node_info = self.get_node_info(node_ip_list_with_node_processes,
                                           plugin_config)

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(
                    '\nFailure during check of node process deployment '
                    'on cluster node: ' + str(e)
                )
        try:
            self.await_active_workers_for_namenode(node_info, plugin_config)

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(
                    '\nFailure while active worker waiting for namenode: '
                    + str(e)
                )
        # For example, the method "create_cluster_and_get_info" returns
        # {
        #       'node_info': {
        #               'tasktracker_count': 3,
        #               'node_count': 6,
        #               'namenode_ip': '172.18.168.242',
        #               'datanode_count': 3
        #               },
        #       'cluster_id': 'bee5c6a1-411a-4e88-95fc-d1fbdff2bb9d',
        #       'node_ip_list': {
        #               '172.18.168.153': ['tasktracker', 'datanode'],
        #               '172.18.168.208': ['secondarynamenode', 'oozie'],
        #               '172.18.168.93': ['tasktracker'],
        #               '172.18.168.101': ['tasktracker', 'datanode'],
        #               '172.18.168.242': ['namenode', 'jobtracker'],
        #               '172.18.168.167': ['datanode']
        #       },
        #       'plugin_config': <oslo.config.cfg.GroupAttr object at 0x215d9d>
        # }
        return {
            'cluster_id': self.cluster_id,
            'node_ip_list': node_ip_list_with_node_processes,
            'node_info': node_info,
            'plugin_config': plugin_config
        }
Example #7
    def _map_reduce_testing(self, cluster_info):
        plugin_config = cluster_info['plugin_config']
        node_count = cluster_info['node_info']['node_count']
        extra_script_parameters = {
            'HADOOP_VERSION': plugin_config.HADOOP_VERSION,
            'HADOOP_DIRECTORY': plugin_config.HADOOP_DIRECTORY,
            'HADOOP_LOG_DIRECTORY': plugin_config.HADOOP_LOG_DIRECTORY,
            'HADOOP_USER': plugin_config.HADOOP_USER,
            'NODE_COUNT': node_count,
            'PLUGIN_NAME': plugin_config.PLUGIN_NAME
        }
        node_ip_and_process_list = cluster_info['node_ip_list']
        try:
            self.transfer_helper_script_to_nodes(
                node_ip_and_process_list, plugin_config.SSH_USERNAME,
                'map_reduce_test_script.sh',
                parameter_list=extra_script_parameters
            )

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(str(e))
        namenode_ip = cluster_info['node_info']['namenode_ip']
        self.open_ssh_connection(namenode_ip, plugin_config.SSH_USERNAME)
        self.__run_pi_job()
        job_name = self.__get_name_of_completed_pi_job()
        self.close_ssh_connection()
        # Check that the cluster used every "tasktracker" node while the PI
        # job was running. The number of map and reduce tasks set in the
        # helper script guarantees that the job touches each of those nodes.
        try:
            for node_ip, process_list in node_ip_and_process_list.items():
                if plugin_config.PROCESS_NAMES['tt'] in process_list:
                    self.open_ssh_connection(
                        node_ip, plugin_config.SSH_USERNAME
                    )
                    self.execute_command(
                        './script.sh check_directory -job_name %s' % job_name
                    )
                    self.close_ssh_connection()

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(
                    '\nLog file of completed \'PI\' job on \'tasktracker\' '
                    'cluster node not found: ' + str(e)
                )
                self.close_ssh_connection()
                self.open_ssh_connection(
                    namenode_ip, plugin_config.SSH_USERNAME
                )
                self.capture_error_log_from_cluster_node(
                    '/tmp/MapReduceTestOutput/log.txt'
                )
        self.open_ssh_connection(namenode_ip, plugin_config.SSH_USERNAME)
        self.__run_wordcount_job()
        self.close_ssh_connection()
Example #8
    def cluster_scaling(self, cluster_info, change_list):
        scale_body = {'add_node_groups': [], 'resize_node_groups': []}
        for change in change_list:
            if change['operation'] == 'resize':
                node_group_name = change['info'][0]
                node_group_size = change['info'][1]
                self._add_new_field_to_scale_body_while_ng_resizing(
                    scale_body, node_group_name, node_group_size)
                self._change_node_info_while_ng_resizing(
                    node_group_name, node_group_size, cluster_info)
            if change['operation'] == 'add':
                node_group_name = change['info'][0]
                node_group_size = change['info'][1]
                node_group_id = change['info'][2]
                self._add_new_field_to_scale_body_while_ng_adding(
                    scale_body, node_group_id, node_group_size,
                    node_group_name)
                self._change_node_info_while_ng_adding(node_group_id,
                                                       node_group_size,
                                                       cluster_info)
        self.savanna.clusters.scale(cluster_info['cluster_id'], scale_body)
        self.poll_cluster_state(cluster_info['cluster_id'])
        new_node_ip_list = self.get_cluster_node_ip_list_with_node_processes(
            cluster_info['cluster_id'])
        try:
            new_node_info = self.get_node_info(new_node_ip_list,
                                               cluster_info['plugin_config'])

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print('\nFailure during check of node process deployment '
                      'on cluster node: ' + str(e))
        expected_node_info = cluster_info['node_info']
        self.assertEqual(
            expected_node_info, new_node_info,
            'Failure while node info comparison.\n'
            'Expected node info after cluster scaling: %s.\n'
            'Actual node info after cluster scaling: %s.' %
            (expected_node_info, new_node_info))
        try:
            self.await_active_workers_for_namenode(
                new_node_info, cluster_info['plugin_config'])

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print('\nFailure while active worker waiting for namenode: ' +
                      str(e))
        return {
            'cluster_id': cluster_info['cluster_id'],
            'node_ip_list': new_node_ip_list,
            'node_info': new_node_info,
            'plugin_config': cluster_info['plugin_config']
        }
Example #9
    def _check_swift_availability(self, cluster_info):
        plugin_config = cluster_info["plugin_config"]
        # Make unique name of Swift container during Swift testing
        swift_container_name = "Swift-test-" + str(uuid.uuid4())
        extra_script_parameters = {
            "OS_TENANT_NAME": self.common_config.OS_TENANT_NAME,
            "OS_USERNAME": self.common_config.OS_USERNAME,
            "OS_PASSWORD": self.common_config.OS_PASSWORD,
            "HADOOP_USER": plugin_config.HADOOP_USER,
            "SWIFT_CONTAINER_NAME": swift_container_name,
        }
        namenode_ip = cluster_info["node_info"]["namenode_ip"]
        self.open_ssh_connection(namenode_ip, plugin_config.NODE_USERNAME)
        try:
            self.transfer_helper_script_to_node(
                "swift_test_script.sh", parameter_list=extra_script_parameters)
        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(str(e))
        swift = self.connect_to_swift()
        swift.put_container(swift_container_name)
        try:
            self.execute_command("./script.sh")
        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(str(e))
        finally:
            self.delete_swift_container(swift, swift_container_name)
        self.close_ssh_connection()
Example #10
def create_cluster(values):
    ctx = context.ctx()
    cluster = conductor.cluster_create(ctx, values)
    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)

    # validating cluster
    try:
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Validating"})
        LOG.info(g.format_cluster_status(cluster))

        plugin.validate(cluster)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            cluster = conductor.cluster_update(ctx, cluster,
                                               {"status": "Error",
                                                "status_description": str(e)})
            LOG.info(g.format_cluster_status(cluster))

    context.spawn("cluster-creating-%s" % cluster.id,
                  _provision_cluster, cluster.id)
    if CONF.use_identity_api_v3 and cluster.is_transient:
        trusts.create_trust(cluster)

    return conductor.cluster_get(ctx, cluster.id)
Example #11
    def _add_params_to_script_and_transfer_to_node(self,
                                                   cluster_info,
                                                   node_group,
                                                   node_with_volumes=False):
        plugin_config = cluster_info['plugin_config']
        hadoop_log_directory = plugin_config.HADOOP_LOG_DIRECTORY
        if node_with_volumes:
            hadoop_log_directory = (
                plugin_config.HADOOP_LOG_DIRECTORY_ON_VOLUME)
        extra_script_parameters = {
            'HADOOP_VERSION': plugin_config.HADOOP_VERSION,
            'HADOOP_DIRECTORY': plugin_config.HADOOP_DIRECTORY,
            'HADOOP_LOG_DIRECTORY': hadoop_log_directory,
            'HADOOP_USER': plugin_config.HADOOP_USER,
            'NODE_COUNT': cluster_info['node_info']['node_count'],
            'PLUGIN_NAME': plugin_config.PLUGIN_NAME
        }
        for instance in node_group['instances']:
            try:
                self.open_ssh_connection(instance['management_ip'],
                                         plugin_config.SSH_USERNAME)
                self.transfer_helper_script_to_node(
                    'map_reduce_test_script.sh', extra_script_parameters)
                self.close_ssh_connection()

            except Exception as e:
                with excutils.save_and_reraise_exception():
                    print(str(e))
Example #12
    def transfer_helper_script_to_node(self, script_name, parameter_list=None):
        script = open('savanna/tests/integration/tests/resources/%s'
                      % script_name).read()
        if parameter_list:
            for parameter, value in parameter_list.items():
                script = script.replace(
                    '%s=""' % parameter, '%s=%s' % (parameter, value))
        try:
            self.write_file_to('script.sh', script)
        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(
                    '\nFailure while helper script transferring '
                    'to cluster node: ' + str(e)
                )
        self.execute_command('chmod 777 script.sh')
Example #13
    def map_reduce_testing(self, cluster_info):
        self._transfer_helper_script_to_nodes(cluster_info)
        plugin_config = cluster_info['plugin_config']
        namenode_ip = cluster_info['node_info']['namenode_ip']
        self.open_ssh_connection(namenode_ip, plugin_config.SSH_USERNAME)
        self._run_pi_job()
        job_name = self._get_name_of_completed_pi_job()
        self.close_ssh_connection()
        # Check that the cluster used every "tasktracker" node while the PI
        # job was running. The number of map and reduce tasks set in the
        # helper script guarantees that the job touches each of those nodes.
        node_ip_and_process_list = cluster_info['node_ip_list']
        try:
            for node_ip, process_list in node_ip_and_process_list.items():
                if plugin_config.PROCESS_NAMES['tt'] in process_list:
                    self.open_ssh_connection(node_ip,
                                             plugin_config.SSH_USERNAME)
                    self.execute_command(
                        './script.sh check_directory -job_name %s' % job_name)
                    self.close_ssh_connection()

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print('\nLog file of completed \'PI\' job on \'tasktracker\' '
                      'cluster node not found: ' + str(e))
                self.close_ssh_connection()
                self.open_ssh_connection(namenode_ip,
                                         plugin_config.SSH_USERNAME)
                self.capture_error_log_from_cluster_node(
                    '/tmp/MapReduceTestOutput/log.txt')
        self.open_ssh_connection(namenode_ip, plugin_config.SSH_USERNAME)
        self._run_wordcount_job()
        self.close_ssh_connection()
Example #14
def scale_cluster(cluster_id, data):
    cluster = get_cluster(id=cluster_id)
    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)
    existing_node_groups = data.get("resize_node_groups", [])
    additional_node_groups = data.get("add_node_groups", [])

    # the next map is the main object we will work with
    # to_be_enlarged : {node_group_name: desired_amount_of_instances}
    to_be_enlarged = {}
    for ng in existing_node_groups:
        to_be_enlarged.update({ng["name"]: ng["count"]})

    additional = construct_ngs_for_scaling(additional_node_groups)

    try:
        context.model_update(cluster, status="Validating")
        plugin.validate_scaling(cluster, to_be_enlarged, additional)
    except Exception:
        with excutils.save_and_reraise_exception():
            context.model_update(cluster, status="Active")

    # If we are here, validation succeeded,
    # so update the DB and the to_be_enlarged map:
    for add_n_g in additional:
        cluster.node_groups.append(add_n_g)
        to_be_enlarged.update({add_n_g.name: additional[add_n_g]})
    context.model_save(cluster)

    context.spawn(_provision_nodes, cluster_id, to_be_enlarged)
    return cluster
Example #15
def create_cluster(cluster):
    ctx = context.ctx()
    try:
        # create all instances
        conductor.cluster_update(ctx, cluster, {"status": "Spawning"})
        LOG.info(g.format_cluster_status(cluster))
        _create_instances(cluster)

        # wait until all instances are up and accessible
        cluster = conductor.cluster_update(ctx, cluster, {"status": "Waiting"})
        LOG.info(g.format_cluster_status(cluster))
        cluster = _await_instances(cluster)

        # attach volumes
        volumes.attach(cluster)

        # prepare all instances
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Preparing"})
        LOG.info(g.format_cluster_status(cluster))

        _configure_instances(cluster)
    except Exception as ex:
        LOG.warn("Can't start cluster '%s' (reason: %s)", cluster.name, ex)
        with excutils.save_and_reraise_exception():
            cluster = conductor.cluster_update(ctx, cluster,
                                               {"status": "Error",
                                                "status_description": str(ex)})
            LOG.info(g.format_cluster_status(cluster))
            _rollback_cluster_creation(cluster, ex)
Example #16
def create_cluster(cluster):
    ctx = context.ctx()
    try:
        # create all instances
        conductor.cluster_update(ctx, cluster, {"status": "Spawning"})
        LOG.info(g.format_cluster_status(cluster))
        _create_instances(cluster)

        # wait until all instances are up and accessible
        cluster = conductor.cluster_update(ctx, cluster, {"status": "Waiting"})
        LOG.info(g.format_cluster_status(cluster))
        cluster = _await_instances(cluster)

        # attach volumes
        volumes.attach(cluster)

        # prepare all instances
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Preparing"})
        LOG.info(g.format_cluster_status(cluster))

        _configure_instances(cluster)
    except Exception as ex:
        LOG.warn("Can't start cluster '%s' (reason: %s)", cluster.name, ex)
        with excutils.save_and_reraise_exception():
            cluster = conductor.cluster_update(ctx, cluster, {
                "status": "Error",
                "status_description": str(ex)
            })
            LOG.info(g.format_cluster_status(cluster))
            _rollback_cluster_creation(cluster, ex)
Example #17
def _scale_cluster(cluster, target_count):
    ctx = context.ctx()

    rollback_count = _get_ng_counts(cluster)

    launcher = _ScaleLauncher()

    try:
        launcher.launch_instances(ctx, cluster, target_count)
    except Exception as ex:
        LOG.warn("Can't scale cluster '%s' (reason: %s)", cluster.name, ex)
        with excutils.save_and_reraise_exception():
            cluster = conductor.cluster_get(ctx, cluster)

            try:
                _rollback_cluster_scaling(ctx, cluster, rollback_count, target_count)
            except Exception:
                # if something fails during the rollback, we stop
                # doing anything further
                cluster = conductor.cluster_update(ctx, cluster, {"status": "Error"})
                LOG.info(g.format_cluster_status(cluster))
                LOG.error("Unable to complete rollback, aborting")
                raise

            cluster = conductor.cluster_update(ctx, cluster, {"status": "Active"})
            LOG.info(g.format_cluster_status(cluster))
            LOG.warn("Rollback successful. Throwing off an initial exception.")
    finally:
        cluster = conductor.cluster_get(ctx, cluster)
        _clean_cluster_from_empty_ng(cluster)

    return launcher.inst_ids
Example #18
def scale_cluster(cluster, node_group_names_map, plugin):
    # Now let's work with real node_groups, not names:
    node_groups_map = {}
    for ng in cluster.node_groups:
        if ng.name in node_group_names_map:
            node_groups_map.update({ng: node_group_names_map[ng.name]})
    instances_list = []
    try:
        instances_list = _scale_cluster_instances(
            cluster, node_groups_map, plugin)
        _clean_cluster_from_empty_ng(cluster)
        _await_instances(cluster)
        volumes.attach_to_instances(instances_list)

    except Exception as ex:
        LOG.warn("Can't scale cluster '%s' (reason: %s)", cluster.name, ex)
        with excutils.save_and_reraise_exception():
            _rollback_cluster_scaling(cluster, instances_list, ex)
            instances_list = []
            _clean_cluster_from_empty_ng(cluster)
            if cluster.status == 'Decommissioning':
                context.model_update(cluster, status='Error')
            else:
                context.model_update(cluster, status='Active')
    # we should be here with valid cluster: if instances creation
    # was not successful all extra-instances will be removed above
    if instances_list:
        _configure_instances(cluster)
    return instances_list
Example #19
def scale_cluster(id, data):
    ctx = context.ctx()

    cluster = conductor.cluster_get(ctx, id)
    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)
    existing_node_groups = data.get("resize_node_groups", [])
    additional_node_groups = data.get("add_node_groups", [])

    # the next map is the main object we will work with
    # to_be_enlarged : {node_group_id: desired_amount_of_instances}
    to_be_enlarged = {}
    for ng in existing_node_groups:
        ng_id = g.find(cluster.node_groups, name=ng["name"])["id"]
        to_be_enlarged.update({ng_id: ng["count"]})

    additional = construct_ngs_for_scaling(cluster, additional_node_groups)

    try:
        cluster = conductor.cluster_update(ctx, cluster, {"status": "Validating"})
        LOG.info(g.format_cluster_status(cluster))
        plugin.validate_scaling(cluster, to_be_enlarged, additional)
    except Exception:
        with excutils.save_and_reraise_exception():
            i.clean_cluster_from_empty_ng(cluster)
            cluster = conductor.cluster_update(ctx, cluster, {"status": "Active"})
            LOG.info(g.format_cluster_status(cluster))

    # If we are here, validation succeeded,
    # so update the to_be_enlarged map:
    to_be_enlarged.update(additional)

    context.spawn("cluster-scaling-%s" % id, _provision_nodes, id, to_be_enlarged)
    return conductor.cluster_get(ctx, id)
Example #20
    def _cluster_config_testing(self, cluster_info):
        cluster_id = cluster_info['cluster_id']
        data = self.savanna.clusters.get(cluster_id)
        self.__compare_configs(
            {'Enable Swift': True}, data.cluster_configs['general']
        )
        self.__compare_configs(
            CLUSTER_HDFS_CONFIG, data.cluster_configs['HDFS']
        )
        self.__compare_configs(
            CLUSTER_MR_CONFIG, data.cluster_configs['MapReduce']
        )
        node_groups = data.node_groups
        self.__check_configs_for_node_groups(node_groups)
        node_ip_list_with_node_processes = (
            self.get_cluster_node_ip_list_with_node_processes(cluster_id))
        try:
            self.transfer_helper_script_to_nodes(
                node_ip_list_with_node_processes,
                self.vanilla_config.SSH_USERNAME,
                'cluster_config_test_script.sh'
            )

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(str(e))
        self.__check_config_application_on_cluster_nodes(
            node_ip_list_with_node_processes
        )
Example #21
    def __enter__(self):
        _acquire_remote_semaphore()
        try:
            self.bulk = BulkInstanceInteropHelper(self.instance)
            return self.bulk
        except Exception:
            with excutils.save_and_reraise_exception():
                _release_remote_semaphore()
Example #22
    def __enter__(self):
        _acquire_remote_semaphore()
        try:
            self.bulk = BulkInstanceInteropHelper(self.instance)
            return self.bulk
        except Exception:
            with excutils.save_and_reraise_exception():
                _release_remote_semaphore()
Example #23
    def __init__(self, instance):
        super(BulkInstanceInteropHelper, self).__init__(instance)
        self.proc = procutils.start_subprocess()
        try:
            procutils.run_in_subprocess(self.proc, _connect,
                                        self._get_conn_params())
        except Exception:
            with excutils.save_and_reraise_exception():
                procutils.shutdown_subprocess(self.proc, _cleanup)
Example #24
    def _run_wordcount_job(self):
        try:
            self.execute_command('./script.sh run_wordcount_job')

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print('\nFailure while \'Wordcount\' job launch: ' + str(e))
                self.capture_error_log_from_cluster_node(
                    '/tmp/MapReduceTestOutput/log.txt')
Example #25
    def __init__(self, instance):
        super(BulkInstanceInteropHelper, self).__init__(instance)
        self.proc = procutils.start_subprocess()
        try:
            procutils.run_in_subprocess(self.proc, _connect,
                                        self._get_conn_params())
        except Exception:
            with excutils.save_and_reraise_exception():
                procutils.shutdown_subprocess(self.proc, _cleanup)
Example #26
    def __init__(self, instance, username):
        self.instance = instance
        self.username = username
        self.proc = procutils.start_subprocess()
        try:
            procutils.run_in_subprocess(self.proc, _connect,
                                        self._get_conn_params())
        except Exception:
            with excutils.save_and_reraise_exception():
                procutils.shutdown_subprocess(self.proc, _cleanup)
Example #27
    def __init__(self, instance, username):
        self.instance = instance
        self.username = username
        self.proc = procutils.start_subprocess()
        try:
            procutils.run_in_subprocess(self.proc, _connect,
                                        self._get_conn_params())
        except Exception:
            with excutils.save_and_reraise_exception():
                procutils.shutdown_subprocess(self.proc, _cleanup)
Example #28
    def __run_wordcount_job(self):
        try:
            self.execute_command('./script.sh run_wordcount_job')

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print('\nFailure while \'Wordcount\' job launch: ' + str(e))
                self.capture_error_log_from_cluster_node(
                    '/tmp/MapReduceTestOutput/log.txt'
                )
Example #29
    def try_telnet(self, host, port):
        try:
            telnetlib.Telnet(host, port)

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(
                    '\nTelnet has failed: ' + str(e) +
                    '  NODE IP: %s, PORT: %s. Passed %s minute(s).'
                    % (host, port, self.common_config.TELNET_TIMEOUT)
                )
Example #30
@contextlib.contextmanager
def remove_path_on_error(path):
    """Protect code that wants to operate on PATH atomically.
    Any exception will cause PATH to be removed.

    :param path: File to work with
    """
    try:
        yield
    except Exception:
        with excutils.save_and_reraise_exception():
            delete_if_exists(path)
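
A hedged usage sketch for the helper above (the file path and the simulated failure are made up for illustration): given the @contextlib.contextmanager decorator, any exception raised inside the with block deletes the partially written file via delete_if_exists() before the exception propagates to the caller.

path = '/tmp/staging.dat'
with remove_path_on_error(path):
    with open(path, 'w') as f:
        f.write('partial data')
        # Simulated failure: the file is removed, then ValueError propagates.
        raise ValueError('simulated failure')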
Example #31
    def _get_name_of_completed_pi_job(self):
        try:
            job_name = self.execute_command('./script.sh get_pi_job_name')

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print('\nFailure while name obtaining completed \'PI\' job: ' +
                      str(e))
                self.capture_error_log_from_cluster_node(
                    '/tmp/MapReduceTestOutput/log.txt')
        return job_name[1][:-1]
Example #32
        def try_get_image_id_and_savanna_username(parameter, value):
            try:
                return image.id, image.metadata['_savanna_username']
            except KeyError:
                with excutils.save_and_reraise_exception():
                    print_error_log(parameter, value)
Example #33
        def try_get_image_id_and_ssh_username(parameter, value):
            try:
                if not plugin_config.SSH_USERNAME:
                    return image.id, image.metadata['_savanna_username']

                else:
                    return image.id, plugin_config.SSH_USERNAME

            except KeyError:
                with excutils.save_and_reraise_exception():
                    print_error_log(parameter, value)
Example #34
        def try_get_image_id_and_ssh_username(parameter, value):
            try:
                if not plugin_config.SSH_USERNAME:
                    return image.id, image.metadata['_savanna_username']

                else:
                    return image.id, plugin_config.SSH_USERNAME

            except KeyError:
                with excutils.save_and_reraise_exception():
                    print_error_log(parameter, value)
Example #35
def _serialize(data):
    """Serialization wrapper.

    We prefer using JSON, but it cannot encode all types.
    Error if a developer passes us bad data.
    """
    try:
        return jsonutils.dumps(data, ensure_ascii=True)
    except TypeError:
        with excutils.save_and_reraise_exception():
            LOG.error(_("JSON serialization failed."))
Example #36
    def _compare_configs_on_cluster_node(self, config, value):
        config = config.replace(' ', '')
        try:
            self.execute_command('./script.sh %s -value %s' % (config, value))

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print('\nFailure while config comparison on cluster node: ' +
                      str(e))
                self.capture_error_log_from_cluster_node(
                    '/tmp/config-test-log.txt')
Example #37
@contextlib.contextmanager
def remove_path_on_error(path):
    """Protect code that wants to operate on PATH atomically.
    Any exception will cause PATH to be removed.

    :param path: File to work with
    """
    try:
        yield
    except Exception:
        with excutils.save_and_reraise_exception():
            delete_if_exists(path)
Example #38
def _serialize(data):
    """Serialization wrapper.

    We prefer using JSON, but it cannot encode all types.
    Error if a developer passes us bad data.
    """
    try:
        return jsonutils.dumps(data, ensure_ascii=True)
    except TypeError:
        with excutils.save_and_reraise_exception():
            LOG.error(_("JSON serialization failed."))
Example #39
    def try_telnet(self, host, port):
        try:
            telnetlib.Telnet(host, port)

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(
                    '\nTelnet has failed: ' + str(e) +
                    '  NODE IP: %s, PORT: %s. Passed %s minute(s).'
                    % (host, port, self.common_config.TELNET_TIMEOUT)
                )
Example #40
    def _run(self, func, *args, **kwargs):
        proc = procutils.start_subprocess()

        try:
            procutils.run_in_subprocess(proc, _connect,
                                        self._get_conn_params())
            return procutils.run_in_subprocess(proc, func, args, kwargs)
        except Exception:
            with excutils.save_and_reraise_exception():
                procutils.shutdown_subprocess(proc, _cleanup)
        finally:
            procutils.shutdown_subprocess(proc, _cleanup)
Example #41
    def _run(self, func, *args, **kwargs):
        proc = procutils.start_subprocess()

        try:
            procutils.run_in_subprocess(proc, _connect,
                                        self._get_conn_params())
            return procutils.run_in_subprocess(proc, func, args, kwargs)
        except Exception:
            with excutils.save_and_reraise_exception():
                procutils.shutdown_subprocess(proc, _cleanup)
        finally:
            procutils.shutdown_subprocess(proc, _cleanup)
Example #42
    def scale_cluster(self, cluster, node_group_id_map):
        ctx = context.ctx()

        instance_ids = []
        try:
            instance_ids = self._scale_cluster_instances(cluster,
                                                         node_group_id_map)

            cluster = conductor.cluster_get(ctx, cluster)
            g.clean_cluster_from_empty_ng(cluster)

            cluster = conductor.cluster_get(ctx, cluster)
            instances = g.get_instances(cluster, instance_ids)

            self._await_active(cluster, instances)

            self._assign_floating_ips(instances)

            self._await_networks(cluster, instances)

            cluster = conductor.cluster_get(ctx, cluster)

            volumes.attach_to_instances(
                g.get_instances(cluster, instance_ids))

        except Exception as ex:
            with excutils.save_and_reraise_exception():
                self._log_operation_exception(
                    "Can't scale cluster '%s' (reason: %s)", cluster, ex)

                cluster = conductor.cluster_get(ctx, cluster)
                self._rollback_cluster_scaling(
                    cluster, g.get_instances(cluster, instance_ids), ex)
                instance_ids = []

                cluster = conductor.cluster_get(ctx, cluster)
                g.clean_cluster_from_empty_ng(cluster)
                if cluster.status == 'Decommissioning':
                    cluster = conductor.cluster_update(ctx, cluster,
                                                       {"status": "Error"})
                else:
                    cluster = conductor.cluster_update(ctx, cluster,
                                                       {"status": "Active"})

                LOG.info(g.format_cluster_status(cluster))

        # we should be here with valid cluster: if instances creation
        # was not successful all extra-instances will be removed above
        if instance_ids:
            self._configure_instances(cluster)
        return instance_ids
Example #43
    def scale_cluster(self, cluster, node_group_id_map):
        ctx = context.ctx()

        instance_ids = []
        try:
            instance_ids = self._scale_cluster_instances(
                cluster, node_group_id_map)

            cluster = conductor.cluster_get(ctx, cluster)
            g.clean_cluster_from_empty_ng(cluster)

            cluster = conductor.cluster_get(ctx, cluster)
            instances = g.get_instances(cluster, instance_ids)

            self._await_active(cluster, instances)

            self._assign_floating_ips(instances)

            self._await_networks(cluster, instances)

            cluster = conductor.cluster_get(ctx, cluster)

            volumes.attach_to_instances(g.get_instances(cluster, instance_ids))

        except Exception as ex:
            with excutils.save_and_reraise_exception():
                self._log_operation_exception(
                    "Can't scale cluster '%s' (reason: %s)", cluster, ex)

                cluster = conductor.cluster_get(ctx, cluster)
                self._rollback_cluster_scaling(
                    cluster, g.get_instances(cluster, instance_ids), ex)
                instance_ids = []

                cluster = conductor.cluster_get(ctx, cluster)
                g.clean_cluster_from_empty_ng(cluster)
                if cluster.status == 'Decommissioning':
                    cluster = conductor.cluster_update(ctx, cluster,
                                                       {"status": "Error"})
                else:
                    cluster = conductor.cluster_update(ctx, cluster,
                                                       {"status": "Active"})

                LOG.info(g.format_cluster_status(cluster))

        # we should be here with valid cluster: if instances creation
        # was not successful all extra-instances will be removed above
        if instance_ids:
            self._configure_instances(cluster)
        return instance_ids
Example #44
    def __compare_configs_on_cluster_node(self, config, value):
        config = config.replace(' ', '')
        try:
            self.execute_command('./script.sh %s -value %s' % (config, value))

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(
                    '\nFailure while config comparison on cluster node: '
                    + str(e)
                )
                self.capture_error_log_from_cluster_node(
                    '/tmp/config-test-log.txt'
                )
Example #45
    def __get_name_of_completed_pi_job(self):
        try:
            job_name = self.execute_command('./script.sh get_pi_job_name')

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(
                    '\nFailure while name obtaining completed \'PI\' job: ' +
                    str(e)
                )
                self.capture_error_log_from_cluster_node(
                    '/tmp/MapReduceTestOutput/log.txt'
                )
        return job_name[1][:-1]
Example #46
def scale_cluster(id, data):
    ctx = context.ctx()

    cluster = conductor.cluster_get(ctx, id)
    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)
    existing_node_groups = data.get('resize_node_groups', [])
    additional_node_groups = data.get('add_node_groups', [])

    # the next map is the main object we will work with
    # to_be_enlarged : {node_group_id: desired_amount_of_instances}
    to_be_enlarged = {}
    for ng in existing_node_groups:
        ng_id = g.find(cluster.node_groups, name=ng['name'])['id']
        to_be_enlarged.update({ng_id: ng['count']})

    additional = construct_ngs_for_scaling(cluster, additional_node_groups)
    cluster = conductor.cluster_get(ctx, cluster)

    # update nodegroup image usernames
    for nodegroup in cluster.node_groups:
        if additional.get(nodegroup.id):
            image_username = INFRA.get_node_group_image_username(nodegroup)
            conductor.node_group_update(ctx, nodegroup,
                                        {"image_username": image_username})
    cluster = conductor.cluster_get(ctx, cluster)

    try:
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Validating"})
        LOG.info(g.format_cluster_status(cluster))
        plugin.validate_scaling(cluster, to_be_enlarged, additional)
    except Exception:
        with excutils.save_and_reraise_exception():
            g.clean_cluster_from_empty_ng(cluster)
            cluster = conductor.cluster_update(ctx, cluster,
                                               {"status": "Active"})
            LOG.info(g.format_cluster_status(cluster))

    # If we are here, validation succeeded,
    # so update the to_be_enlarged map:
    to_be_enlarged.update(additional)

    for node_group in cluster.node_groups:
        if node_group.id not in to_be_enlarged:
            to_be_enlarged[node_group.id] = node_group.count

    context.spawn("cluster-scaling-%s" % id, _provision_scaled_cluster, id,
                  to_be_enlarged)
    return conductor.cluster_get(ctx, id)
Example #47
def scale_cluster(id, data):
    ctx = context.ctx()

    cluster = conductor.cluster_get(ctx, id)
    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)
    existing_node_groups = data.get('resize_node_groups', [])
    additional_node_groups = data.get('add_node_groups', [])

    # the next map is the main object we will work with
    # to_be_enlarged : {node_group_id: desired_amount_of_instances}
    to_be_enlarged = {}
    for ng in existing_node_groups:
        ng_id = g.find(cluster.node_groups, name=ng['name'])['id']
        to_be_enlarged.update({ng_id: ng['count']})

    additional = construct_ngs_for_scaling(cluster, additional_node_groups)
    cluster = conductor.cluster_get(ctx, cluster)

    # update nodegroup image usernames
    for nodegroup in cluster.node_groups:
        if additional.get(nodegroup.id):
            image_username = INFRA.get_node_group_image_username(nodegroup)
            conductor.node_group_update(
                ctx, nodegroup, {"image_username": image_username})
    cluster = conductor.cluster_get(ctx, cluster)

    try:
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Validating"})
        LOG.info(g.format_cluster_status(cluster))
        plugin.validate_scaling(cluster, to_be_enlarged, additional)
    except Exception:
        with excutils.save_and_reraise_exception():
            INFRA.clean_cluster_from_empty_ng(cluster)
            cluster = conductor.cluster_update(ctx, cluster,
                                               {"status": "Active"})
            LOG.info(g.format_cluster_status(cluster))

    # If we are here, validation succeeded,
    # so update the to_be_enlarged map:
    to_be_enlarged.update(additional)

    for node_group in cluster.node_groups:
        if node_group.id not in to_be_enlarged:
            to_be_enlarged[node_group.id] = node_group.count

    context.spawn("cluster-scaling-%s" % id,
                  _provision_scaled_cluster, id, to_be_enlarged)
    return conductor.cluster_get(ctx, id)
Example #48
def create_cluster(values):
    cluster = s.create_cluster(values)
    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)

    # validating cluster
    try:
        context.model_update(cluster, status="Validating")
        plugin.validate(cluster)
    except Exception as ex:
        with excutils.save_and_reraise_exception():
            context.model_update(cluster, status="Error", status_description=str(ex))

    context.spawn(_provision_cluster, cluster.id)

    return cluster
Example #49
    def get_floating_ip_pool_id_for_neutron_net(self):
        # Find corresponding floating IP pool by its name and get its ID.
        # If pool not found then handle error
        try:
            floating_ip_pool = self.neutron.list_networks(
                name=self.common_config.FLOATING_IP_POOL)
            floating_ip_pool_id = floating_ip_pool['networks'][0]['id']
            return floating_ip_pool_id

        except IndexError:
            with excutils.save_and_reraise_exception():
                raise Exception(
                    '\nFloating IP pool \'%s\' not found in pool list. '
                    'Please, make sure you specified right floating IP pool.'
                    % self.common_config.FLOATING_IP_POOL
                )
Example #50
    def get_internal_neutron_net_id(self):
        # Find corresponding internal Neutron network by its name and get
        # its ID. If network not found then handle error
        try:
            internal_neutron_net = self.neutron.list_networks(
                name=self.common_config.INTERNAL_NEUTRON_NETWORK)
            internal_neutron_net_id = internal_neutron_net['networks'][0]['id']
            return internal_neutron_net_id

        except IndexError:
            with excutils.save_and_reraise_exception():
                raise Exception(
                    '\nInternal Neutron network \'%s\' not found in network '
                    'list. Please, make sure you specified right network name.'
                    % self.common_config.INTERNAL_NEUTRON_NETWORK
                )
Example #51
    def get_floating_ip_pool_id_for_neutron_net(self):
        # Find corresponding floating IP pool by its name and get its ID.
        # If pool not found then handle error
        try:
            floating_ip_pool = self.neutron.list_networks(
                name=self.common_config.FLOATING_IP_POOL)
            floating_ip_pool_id = floating_ip_pool['networks'][0]['id']
            return floating_ip_pool_id

        except IndexError:
            with excutils.save_and_reraise_exception():
                raise Exception(
                    '\nFloating IP pool \'%s\' not found in pool list. '
                    'Please, make sure you specified right floating IP pool.'
                    % self.common_config.FLOATING_IP_POOL
                )
Example #52
    def get_internal_neutron_net_id(self):
        # Find corresponding internal Neutron network by its name and get
        # its ID. If network not found then handle error
        try:
            internal_neutron_net = self.neutron.list_networks(
                name=self.common_config.INTERNAL_NEUTRON_NETWORK)
            internal_neutron_net_id = internal_neutron_net['networks'][0]['id']
            return internal_neutron_net_id

        except IndexError:
            with excutils.save_and_reraise_exception():
                raise Exception(
                    '\nInternal Neutron network \'%s\' not found in network '
                    'list. Please, make sure you specified right network name.'
                    % self.common_config.INTERNAL_NEUTRON_NETWORK
                )
Example #53
    def transfer_helper_script_to_node(self, script_name, parameter_list=None):
        script = open('savanna/tests/integration/tests/resources/%s'
                      % script_name).read()
        if parameter_list:
            for parameter, value in parameter_list.items():
                script = script.replace(
                    '%s=""' % parameter, '%s=%s' % (parameter, value))
        try:
            self.write_file_to('script.sh', script)

        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(
                    '\nFailure while helper script transferring '
                    'to cluster node: ' + str(e)
                )
        self.execute_command('chmod 777 script.sh')
Example #54
    def create_cluster(self, cluster):
        ctx = context.ctx()
        try:
            # create all instances
            conductor.cluster_update(ctx, cluster, {"status": "Spawning"})
            LOG.info(g.format_cluster_status(cluster))
            self._create_instances(cluster)

            # wait until all instances are up and the networks are ready
            cluster = conductor.cluster_update(ctx, cluster,
                                               {"status": "Waiting"})
            LOG.info(g.format_cluster_status(cluster))

            instances = g.get_instances(cluster)

            self._await_active(cluster, instances)

            self._assign_floating_ips(instances)

            self._await_networks(cluster, instances)

            cluster = conductor.cluster_get(ctx, cluster)

            # attach volumes
            volumes.attach(cluster)

            # prepare all instances
            cluster = conductor.cluster_update(ctx, cluster,
                                               {"status": "Preparing"})
            LOG.info(g.format_cluster_status(cluster))

            self._configure_instances(cluster)
        except Exception as ex:
            with excutils.save_and_reraise_exception():
                self._log_operation_exception(
                    "Can't start cluster '%s' (reason: %s)", cluster, ex)

                cluster = conductor.cluster_update(
                    ctx, cluster, {
                        "status": "Error",
                        "status_description": str(ex)
                    })
                LOG.info(g.format_cluster_status(cluster))
                self._rollback_cluster_creation(cluster, ex)
Example #55
    def __iter__(self):
        """Return a result until we get a reply with an 'ending' flag."""
        if self._done:
            raise StopIteration
        while True:
            try:
                data = self._dataqueue.get(timeout=self._timeout)
                result = self._process_data(data)
            except queue.Empty:
                self.done()
                raise rpc_common.Timeout()
            except Exception:
                with excutils.save_and_reraise_exception():
                    self.done()
            if self._got_ending:
                self.done()
                raise StopIteration
            if isinstance(result, Exception):
                self.done()
                raise result
            yield result
Example #56
def scale_cluster(cluster, node_group_id_map, plugin):
    ctx = context.ctx()

    instances_list = []
    try:
        instances_list = _scale_cluster_instances(cluster, node_group_id_map,
                                                  plugin)

        cluster = conductor.cluster_get(ctx, cluster)
        cluster = clean_cluster_from_empty_ng(cluster)

        cluster = _await_instances(cluster)

        volumes.attach_to_instances(get_instances(cluster, instances_list))

    except Exception as ex:
        LOG.warn("Can't scale cluster '%s' (reason: %s)", cluster.name, ex)
        with excutils.save_and_reraise_exception():
            cluster = conductor.cluster_get(ctx, cluster)
            _rollback_cluster_scaling(cluster,
                                      get_instances(cluster, instances_list),
                                      ex)
            instances_list = []

            cluster = conductor.cluster_get(ctx, cluster)
            clean_cluster_from_empty_ng(cluster)
            if cluster.status == 'Decommissioning':
                cluster = conductor.cluster_update(ctx, cluster,
                                                   {"status": "Error"})
            else:
                cluster = conductor.cluster_update(ctx, cluster,
                                                   {"status": "Active"})

            LOG.info(g.format_cluster_status(cluster))

    # we should be here with valid cluster: if instances creation
    # was not successful all extra-instances will be removed above
    if instances_list:
        _configure_instances(cluster)
    return instances_list
Example #57
    def scale_cluster(self, cluster, target_count):
        ctx = context.ctx()

        rollback_count = self._get_ng_counts(cluster)

        launcher = _ScaleLauncher()

        try:
            launcher.launch_instances(ctx, cluster, target_count)
        except Exception as ex:
            with excutils.save_and_reraise_exception():
                self._log_operation_exception(
                    "Can't scale cluster '%s' (reason: %s)", cluster, ex)

                cluster = conductor.cluster_get(ctx, cluster)

                try:
                    self._rollback_cluster_scaling(ctx, cluster,
                                                   rollback_count,
                                                   target_count)
                except Exception:
                    # if something fails during the rollback, we stop
                    # doing anything further
                    cluster = conductor.cluster_update(ctx, cluster,
                                                       {"status": "Error"})
                    LOG.info(g.format_cluster_status(cluster))
                    LOG.error("Unable to complete rollback, aborting")
                    raise

                cluster = conductor.cluster_update(ctx, cluster,
                                                   {"status": "Active"})
                LOG.info(g.format_cluster_status(cluster))
                LOG.warn(
                    "Rollback successful. Throwing off an initial exception.")
        finally:
            cluster = conductor.cluster_get(ctx, cluster)
            g.clean_cluster_from_empty_ng(cluster)

        return launcher.inst_ids
Example #58
    def _cluster_config_testing(self, cluster_info):
        cluster_id = cluster_info['cluster_id']
        data = self.savanna.clusters.get(cluster_id)
        self.__compare_configs(
            data.cluster_configs['general'], {'Enable Swift': True}
        )
        self.__compare_configs(
            data.cluster_configs['HDFS'], CLUSTER_HDFS_CONFIG
        )
        self.__compare_configs(
            data.cluster_configs['MapReduce'], CLUSTER_MR_CONFIG
        )
        node_groups = data.node_groups
        self.__check_configs_for_node_groups(node_groups)
        node_ip_list_with_node_processes = \
            self.get_cluster_node_ip_list_with_node_processes(cluster_id)
        try:
            self.transfer_helper_script_to_nodes(
                node_ip_list_with_node_processes,
                self.vanilla_config.NODE_USERNAME,
                'cluster_config_test_script.sh'
            )
        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(str(e))
        self.__check_config_application_on_cluster_nodes(
            node_ip_list_with_node_processes
        )
Example #59
    def create_cluster(self, cluster):
        ctx = context.ctx()

        launcher = _CreateLauncher()

        try:
            target_count = self._get_ng_counts(cluster)
            self._nullify_ng_counts(cluster)

            cluster = conductor.cluster_get(ctx, cluster)

            launcher.launch_instances(ctx, cluster, target_count)
        except Exception as ex:
            with excutils.save_and_reraise_exception():
                self._log_operation_exception(
                    "Can't start cluster '%s' (reason: %s)", cluster, ex)

                cluster = conductor.cluster_update(
                    ctx, cluster, {
                        "status": "Error",
                        "status_description": str(ex)
                    })
                LOG.info(g.format_cluster_status(cluster))
                self._rollback_cluster_creation(cluster)