Example #1
def main():
    # TODO(tmckay): Work on restricting the options
    # pulled in by imports which show up in the help.
    # If we find a nice way to do this the calls to
    # unregister_extra_cli_opt() can be removed
    CONF(project='sahara')

    # For some reason, this is necessary to clear cached values
    # and re-read configs.  For instance, if this is not done
    # here the 'plugins' value will not reflect the value from
    # the config file on the command line
    CONF.reload_config_files()
    log.setup(CONF, "sahara")

    # If we have to enforce extra option checks, like one option
    # requires another, do it here
    extra_option_checks()

    # Since this may be scripted, record the command in the log
    # so a user can know exactly what was done
    LOG.info(_LI("Command: {command}").format(command=' '.join(sys.argv)))

    api.set_logger(LOG)
    api.set_conf(CONF)

    CONF.command.func()

    LOG.info(_LI("Finished {command}").format(command=CONF.command.name))
Example #2
    def _await_networks(self, cluster, instances):
        if not instances:
            return

        cpo.add_provisioning_step(cluster.id, _("Assign IPs"), len(instances))

        ips_assigned = set()
        self._ips_assign(ips_assigned, cluster, instances)

        LOG.info(
            _LI("Cluster {cluster_id}: all instances have IPs assigned")
            .format(cluster_id=cluster.id))

        cluster = conductor.cluster_get(context.ctx(), cluster)
        instances = g.get_instances(cluster, ips_assigned)

        cpo.add_provisioning_step(
            cluster.id, _("Wait for instance accessibility"), len(instances))

        with context.ThreadGroup() as tg:
            for instance in instances:
                tg.spawn("wait-for-ssh-%s" % instance.instance_name,
                         self._wait_until_accessible, instance)

        LOG.info(_LI("Cluster {cluster_id}: all instances are accessible")
                 .format(cluster_id=cluster.id))
Example #3
    def start_cluster(self, cluster):
        nn_instance = utils.get_instance(cluster, "namenode")
        sm_instance = utils.get_instance(cluster, "master")
        dn_instances = utils.get_instances(cluster, "datanode")

        # Start the name node
        with remote.get_remote(nn_instance) as r:
            run.format_namenode(r)
            run.start_processes(r, "namenode")

        # start the data nodes
        self._start_slave_datanode_processes(dn_instances)

        LOG.info(_LI("Hadoop services in cluster %s have been started"),
                 cluster.name)

        with remote.get_remote(nn_instance) as r:
            r.execute_command("sudo -u hdfs hdfs dfs -mkdir -p /user/$USER/")
            r.execute_command("sudo -u hdfs hdfs dfs -chown $USER "
                              "/user/$USER/")

        # start spark nodes
        if sm_instance:
            with remote.get_remote(sm_instance) as r:
                run.start_spark_master(r, self._spark_home(cluster))
                LOG.info(_LI("Spark service at '%s' has been started"),
                         sm_instance.hostname())

        LOG.info(_LI('Cluster %s has been started successfully'), cluster.name)
        self._set_cluster_info(cluster)
Example #4
    def start_services(self, cluster_name, cluster_spec, ambari_info):
        start_url = ('http://{0}/api/v1/clusters/{1}/services?ServiceInfo/'
                     'state=INSTALLED'.format(ambari_info.get_address(),
                                              cluster_name))
        body = ('{"RequestInfo" : { "context" : "Start all services" },'
                '"Body" : {"ServiceInfo": {"state" : "STARTED"}}}')

        self._fire_service_start_notifications(cluster_name, cluster_spec,
                                               ambari_info)
        result = self._put(start_url, ambari_info, data=body)
        if result.status_code == 202:
            json_result = json.loads(result.text)
            request_id = json_result['Requests']['id']
            success = self._wait_for_async_request(
                self._get_async_request_uri(ambari_info, cluster_name,
                                            request_id), ambari_info)
            if success:
                LOG.info(_LI("Successfully started Hadoop cluster."))
                LOG.info(
                    _LI('Ambari server address: {server_address}').format(
                        server_address=ambari_info.get_address()))

            else:
                LOG.error(_LE('Failed to start Hadoop cluster.'))
                raise ex.HadoopProvisionError(
                    _('Start of Hadoop services failed.'))

        elif result.status_code != 200:
            LOG.error(
                _LE('Start command failed. Status: {status}, '
                    'response: {response}').format(status=result.status_code,
                                                   response=result.text))
            raise ex.HadoopProvisionError(
                _('Start of Hadoop services failed.'))
Example #5
    def _await_networks(self, cluster, instances):
        if not instances:
            return

        cpo.add_provisioning_step(cluster.id, _("Assign IPs"), len(instances))

        ips_assigned = set()
        self._ips_assign(ips_assigned, cluster, instances)

        LOG.info(_LI("All instances have IPs assigned"))

        cluster = conductor.cluster_get(context.ctx(), cluster)
        instances = g.get_instances(cluster, ips_assigned)

        cpo.add_provisioning_step(cluster.id,
                                  _("Wait for instance accessibility"),
                                  len(instances))

        with context.ThreadGroup() as tg:
            for instance in instances:
                with context.set_current_instance_id(instance.instance_id):
                    tg.spawn("wait-for-ssh-%s" % instance.instance_name,
                             self._wait_until_accessible, instance)

        LOG.info(_LI("All instances are accessible"))
Example #6
    def start_cluster(self, cluster):
        nn_instance = utils.get_instance(cluster, "namenode")
        dn_instances = utils.get_instances(cluster, "datanode")

        # Start the name node
        self._start_namenode(nn_instance)

        # start the data nodes
        self._start_datanode_processes(dn_instances)

        LOG.info(
            _LI("Hadoop services in cluster {cluster} have been started").
            format(cluster=cluster.name))

        with remote.get_remote(nn_instance) as r:
            r.execute_command("sudo -u hdfs hdfs dfs -mkdir -p /user/$USER/")
            r.execute_command("sudo -u hdfs hdfs dfs -chown $USER "
                              "/user/$USER/")

        # start spark nodes
        self.start_spark(cluster)

        LOG.info(
            _LI('Cluster {cluster} has been started successfully').format(
                cluster=cluster.name))
        self._set_cluster_info(cluster)
Example #7
    def wait_for_host_registrations(self, num_hosts, ambari_info):
        LOG.info(
            _LI('Waiting for all Ambari agents to register with server ...'))

        url = 'http://{0}/api/v1/hosts'.format(ambari_info.get_address())
        result = None
        json_result = None

        # TODO(jspeidel): timeout
        while result is None or len(json_result['items']) < num_hosts:
            context.sleep(5)
            try:
                result = self._get(url, ambari_info)
                json_result = json.loads(result.text)

                LOG.info(_LI('Registered Hosts: %(current_number)s of '
                             '%(final_number)s'),
                         {'current_number': len(json_result['items']),
                          'final_number': num_hosts})
                for hosts in json_result['items']:
                    LOG.debug('Registered Host: {0}'.format(
                        hosts['Hosts']['host_name']))
            except Exception:
                # TODO(jspeidel): max wait time
                LOG.info(_LI('Waiting to connect to ambari server ...'))
Example #8
File: plugin.py  Project: crobby/sahara
    def start_cluster(self, cluster):
        nn_instance = utils.get_instance(cluster, "namenode")
        dn_instances = utils.get_instances(cluster, "datanode")
        zep_instance = utils.get_instance(cluster, "zeppelin")

        # Start the name node
        self._start_namenode(nn_instance)

        # start the data nodes
        self._start_datanode_processes(dn_instances)

        LOG.info(_LI("Hadoop services have been started"))

        with remote.get_remote(nn_instance) as r:
            r.execute_command("sudo -u hdfs hdfs dfs -mkdir -p /user/$USER/")
            r.execute_command("sudo -u hdfs hdfs dfs -chown $USER "
                              "/user/$USER/")

        # start spark nodes
        self.start_spark(cluster)

        # start zeppelin, if necessary
        if zep_instance:
            self._start_zeppelin(zep_instance)

        LOG.info(_LI('Cluster has been started successfully'))
        self._set_cluster_info(cluster)
Example #9
    def _install_services(self, cluster_name, ambari_info):
        LOG.info(_LI('Installing required Hadoop services ...'))

        ambari_address = ambari_info.get_address()
        install_url = ('http://{0}/api/v1/clusters/{'
                       '1}/services?ServiceInfo/state=INIT'.format(
                           ambari_address, cluster_name))
        body = ('{"RequestInfo" : { "context" : "Install all services" },'
                '"Body" : {"ServiceInfo": {"state" : "INSTALLED"}}}')

        result = self._put(install_url, ambari_info, data=body)

        if result.status_code == 202:
            json_result = json.loads(result.text)
            request_id = json_result['Requests']['id']
            success = self._wait_for_async_request(self._get_async_request_uri(
                ambari_info, cluster_name, request_id),
                ambari_info)
            if success:
                LOG.info(_LI("Install of Hadoop stack successful."))
                self._finalize_ambari_state(ambari_info)
            else:
                LOG.critical(_LC('Install command failed.'))
                raise ex.HadoopProvisionError(
                    _('Installation of Hadoop stack failed.'))
        elif result.status_code != 200:
            LOG.error(
                _LE('Install command failed. {0}').format(result.text))
            raise ex.HadoopProvisionError(
                _('Installation of Hadoop stack failed.'))
Example #10
    def _await_networks(self, cluster, instances):
        if not instances:
            return

        ips_assigned = set()
        while len(ips_assigned) != len(instances):
            if not g.check_cluster_exists(cluster):
                return
            for instance in instances:
                if instance.id not in ips_assigned:
                    if networks.init_instances_ips(instance):
                        ips_assigned.add(instance.id)

            context.sleep(1)

        LOG.info(
            _LI("Cluster '%s': all instances have IPs assigned"), cluster.id)

        cluster = conductor.cluster_get(context.ctx(), cluster)
        instances = g.get_instances(cluster, ips_assigned)

        with context.ThreadGroup() as tg:
            for instance in instances:
                tg.spawn("wait-for-ssh-%s" % instance.instance_name,
                         self._wait_until_accessible, instance)

        LOG.info(_LI("Cluster '%s': all instances are accessible"), cluster.id)
Example #11
    def start_services(self, cluster_name, cluster_spec, ambari_info):
        start_url = ('http://{0}/api/v1/clusters/{1}/services?ServiceInfo/'
                     'state=INSTALLED'.format(
                         ambari_info.get_address(), cluster_name))
        body = ('{"RequestInfo" : { "context" : "Start all services" },'
                '"Body" : {"ServiceInfo": {"state" : "STARTED"}}}')

        self._fire_service_start_notifications(
            cluster_name, cluster_spec, ambari_info)
        result = self._put(start_url, ambari_info, data=body)
        if result.status_code == 202:
            json_result = json.loads(result.text)
            request_id = json_result['Requests']['id']
            success = self._wait_for_async_request(
                self._get_async_request_uri(ambari_info, cluster_name,
                                            request_id), ambari_info)
            if success:
                LOG.info(
                    _LI("Successfully started Hadoop cluster."))
                LOG.info(_LI('Ambari server address: {server_address}')
                         .format(server_address=ambari_info.get_address()))

            else:
                LOG.error(_LE('Failed to start Hadoop cluster.'))
                raise ex.HadoopProvisionError(
                    _('Start of Hadoop services failed.'))

        elif result.status_code != 200:
            LOG.error(
                _LE('Start command failed. Status: {status}, '
                    'response: {response}').format(status=result.status_code,
                                                   response=result.text))
            raise ex.HadoopProvisionError(
                _('Start of Hadoop services failed.'))
Example #12
File: cli.py  Project: thefuyang/sahara
def main():
    # TODO(tmckay): Work on restricting the options
    # pulled in by imports which show up in the help.
    # If we find a nice way to do this the calls to
    # unregister_extra_cli_opt() can be removed
    CONF(project="sahara")

    # For some reason, this is necessary to clear cached values
    # and re-read configs.  For instance, if this is not done
    # here the 'plugins' value will not reflect the value from
    # the config file on the command line
    CONF.reload_config_files()
    log.setup(CONF, "sahara")

    # If we have to enforce extra option checks, like one option
    # requires another, do it here
    extra_option_checks()

    # Since this may be scripted, record the command in the log
    # so a user can know exactly what was done
    LOG.info(_LI("Command: {command}").format(command=" ".join(sys.argv)))

    api.set_logger(LOG)
    api.set_conf(CONF)

    CONF.command.func()

    LOG.info(_LI("Finished {command}").format(command=CONF.command.name))
Example #13
    def start_cluster(self, cluster):
        nn_instance = utils.get_instance(cluster, "namenode")
        sm_instance = utils.get_instance(cluster, "master")
        dn_instances = utils.get_instances(cluster, "datanode")

        # Start the name node
        with remote.get_remote(nn_instance) as r:
            run.format_namenode(r)
            run.start_processes(r, "namenode")

        # start the data nodes
        self._start_slave_datanode_processes(dn_instances)

        LOG.info(_LI("Hadoop services in cluster %s have been started"), cluster.name)

        with remote.get_remote(nn_instance) as r:
            r.execute_command("sudo -u hdfs hdfs dfs -mkdir -p /user/$USER/")
            r.execute_command(("sudo -u hdfs hdfs dfs -chown $USER " "/user/$USER/"))

        # start spark nodes
        if sm_instance:
            with remote.get_remote(sm_instance) as r:
                run.start_spark_master(r, self._spark_home(cluster))
                LOG.info(_LI("Spark service at '%s' has been started"), sm_instance.hostname())

        LOG.info(_LI("Cluster %s has been started successfully"), cluster.name)
        self._set_cluster_info(cluster)
Example #14
File: scaling.py  Project: a9261/sahara
def remove_services(cluster, instances):
    LOG.info(_LI("Start remove all mapr services"))
    for instance in instances:
        with instance.remote() as r:
            r.execute_command(REMOVE_MAPR_PACKAGES_CMD, run_as_root=True)
            r.execute_command(REMOVE_MAPR_HOME_CMD, run_as_root=True)
            r.execute_command(REMOVE_MAPR_CORES_CMD, run_as_root=True)
    LOG.info(_LI("All mapr services removed"))
Example #15
def remove_services(cluster, instances):
    LOG.info(_LI("Start remove all mapr services"))
    for instance in instances:
        with instance.remote() as r:
            r.execute_command(REMOVE_MAPR_PACKAGES_CMD, run_as_root=True)
            r.execute_command(REMOVE_MAPR_HOME_CMD, run_as_root=True)
            r.execute_command(REMOVE_MAPR_CORES_CMD, run_as_root=True)
    LOG.info(_LI("All mapr services removed"))
Example #16
File: scaling.py  Project: a9261/sahara
def stop_services(cluster, instances):
    LOG.info(_LI("Stop warden and zookeeper"))
    for instance in instances:
        with instance.remote() as r:
            r.execute_command(STOP_WARDEN_CMD, run_as_root=True)
            if check_if_is_zookeeper_node(instance):
                r.execute_command(STOP_ZOOKEEPER_CMD, run_as_root=True)
    LOG.info(_LI("Warden and zookeeper stoped"))
Example #17
def stop_services(cluster, instances):
    LOG.info(_LI("Stop warden and zookeeper"))
    for instance in instances:
        with instance.remote() as r:
            r.execute_command(STOP_WARDEN_CMD, run_as_root=True)
            if check_if_is_zookeeper_node(instance):
                r.execute_command(STOP_ZOOKEEPER_CMD, run_as_root=True)
    LOG.info(_LI("Warden and zookeeper stoped"))
Example #18
def exec_configure_sh_on_instance(cluster, instance, script_string):
    LOG.info(_LI('START: Executing configure.sh'))
    if check_for_mapr_db(cluster):
        script_string += ' -M7'
    if not check_if_mapr_user_exist(instance):
        script_string += ' --create-user'
    LOG.debug('script_string = %s', script_string)
    instance.remote().execute_command(script_string, run_as_root=True)
    LOG.info(_LI('END: Executing configure.sh'))
Example #19
def move_node(cluster, instances):
    LOG.info(_LI("Start moving the node to the /decommissioned"))
    for instance in instances:
        with instance.remote() as r:
            command = GET_SERVER_ID_CMD % instance.management_ip
            ec, out = r.execute_command(command, run_as_root=True)
            command = MOVE_NODE_CMD % out.strip()
            r.execute_command(command, run_as_root=True)
    LOG.info(_LI("Nodes moved to the /decommissioned"))
Example #20
File: scaling.py  Project: a9261/sahara
def scale_cluster(cluster, instances, disk_setup_script_path, waiting_script,
                  context, configure_sh_string, is_node_awareness):
    LOG.info(_LI('START: Cluster scaling. Cluster = %s'), cluster.name)
    for inst in instances:
        start_helper.install_role_on_instance(inst, context)
    config.configure_instances(cluster, instances)
    start_services(cluster, instances, disk_setup_script_path,
                   waiting_script, configure_sh_string)
    LOG.info(_LI('END: Cluster scaling. Cluster = %s'), cluster)
Example #21
File: general.py  Project: turu/sahara
def format_cluster_deleted_message(cluster):
    msg = _LI("Cluster %(name)s (id=%(id)s) was deleted. "
              "Canceling current operation.")

    if cluster:
        return (msg, {'name': cluster.name,
                      'id': cluster.id})
    return (msg, {'name': _LI("Unknown"),
                  'id': _LI("Unknown")})
Example #22
def scale_cluster(cluster, instances, disk_setup_script_path, waiting_script,
                  context, configure_sh_string, is_node_awareness):
    LOG.info(_LI('START: Cluster scaling. Cluster = %s'), cluster.name)
    for inst in instances:
        start_helper.install_role_on_instance(inst, context)
    config.configure_instances(cluster, instances)
    start_services(cluster, instances, disk_setup_script_path, waiting_script,
                   configure_sh_string)
    LOG.info(_LI('END: Cluster scaling. Cluster = %s'), cluster)
Example #23
File: scaling.py  Project: a9261/sahara
def move_node(cluster, instances):
    LOG.info(_LI("Start moving the node to the /decommissioned"))
    for instance in instances:
        with instance.remote() as r:
            command = GET_SERVER_ID_CMD % instance.management_ip
            ec, out = r.execute_command(command, run_as_root=True)
            command = MOVE_NODE_CMD % out.strip()
            r.execute_command(command, run_as_root=True)
    LOG.info(_LI("Nodes moved to the /decommissioned"))
Example #24
def install_role_on_instance(instance, cluster_context):
    LOG.info(_LI('START: Installing roles on node '))
    roles_list = instance.node_group.node_processes
    exec_str = (cluster_context.get_install_manager()
                + cluster_context.get_roles_str(roles_list))
    LOG.debug('Executing "%(command)s" on %(instance)s',
              {'command': exec_str, 'instance': instance.instance_id})

    instance.remote().execute_command(exec_str, run_as_root=True, timeout=900)
    LOG.info(_LI('END: Installing roles on node '))
Example #25
def install_roles(cluster, cluster_context):
    LOG.info(_LI('START: Installing roles on cluster'))
    instances = utils.get_instances(cluster)
    with context.ThreadGroup(len(instances)) as tg:
        for instance in instances:
            tg.spawn('install_roles_%s' % instance.instance_id,
                     install_role_on_instance,
                     instance,
                     cluster_context)
    LOG.info(_LI('END: Installing roles on cluster'))
Example #26
    def execute_command(self, cmd, run_as_root=False, get_stderr=False,
                        raise_when_error=True, timeout=300):
        try:
            LOG.info(_LI("Issuing command: {cmd}").format(cmd=cmd))
            stdout = self.guest.sh(cmd)
            LOG.info(_LI("Received response: {stdout}").format(stdout=stdout))
            return 0, stdout
        except RuntimeError as ex:
            if raise_when_error:
                raise ex
            else:
                return 1, ex.message
Example #27
def cancel_job(job_execution_id):
    ctx = context.ctx()
    job_execution = conductor.job_execution_get(ctx, job_execution_id)
    if job_execution.info['status'] in edp.JOB_STATUSES_TERMINATED:
        return job_execution
    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster is None:
        return job_execution
    engine = _get_job_engine(cluster, job_execution)
    if engine is not None:
        job_execution = conductor.job_execution_update(
            ctx, job_execution_id,
            {'info': {
                'status': edp.JOB_STATUS_TOBEKILLED
            }})

        timeout = CONF.job_canceling_timeout
        s_time = timeutils.utcnow()
        while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
            if job_execution.info['status'] not in edp.JOB_STATUSES_TERMINATED:
                try:
                    job_info = engine.cancel_job(job_execution)
                except Exception as ex:
                    job_info = None
                    LOG.exception(
                        _LE("Error during cancel of job execution %(job)s: "
                            "%(error)s"), {
                                'job': job_execution.id,
                                'error': ex
                            })
                if job_info is not None:
                    job_execution = _write_job_status(job_execution, job_info)
                    LOG.info(_LI("Job execution %s was canceled successfully"),
                             job_execution.id)
                    return job_execution
                context.sleep(3)
                job_execution = conductor.job_execution_get(
                    ctx, job_execution_id)
                if not job_execution:
                    LOG.info(
                        _LI("Job execution %(job_exec_id)s was deleted. "
                            "Canceling current operation."),
                        {'job_exec_id': job_execution_id})
                    return job_execution
            else:
                LOG.info(
                    _LI("Job execution status %(job)s: %(status)s"), {
                        'job': job_execution.id,
                        'status': job_execution.info['status']
                    })
                return job_execution
        else:
            raise e.CancelingFailed(
                _('Job execution %s was not canceled') % job_execution.id)
Example #28
def cancel_job(job_execution_id):
    ctx = context.ctx()
    job_execution = conductor.job_execution_get(ctx, job_execution_id)
    if job_execution.info['status'] in edp.JOB_STATUSES_TERMINATED:
        LOG.info(
            _LI("Job execution is already finished and shouldn't be"
                " canceled"))
        return job_execution
    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster is None:
        LOG.info(_LI("Can not cancel this job on a non-existant cluster."))
        return job_execution
    engine = get_job_engine(cluster, job_execution)
    if engine is not None:
        job_execution = conductor.job_execution_update(
            ctx, job_execution_id,
            {'info': {
                'status': edp.JOB_STATUS_TOBEKILLED
            }})

        timeout = CONF.job_canceling_timeout
        s_time = timeutils.utcnow()
        while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
            if job_execution.info['status'] not in edp.JOB_STATUSES_TERMINATED:
                try:
                    job_info = engine.cancel_job(job_execution)
                except Exception as ex:
                    job_info = None
                    LOG.warning(
                        _LW("Error during cancel of job execution: "
                            "{error}").format(error=ex))
                if job_info is not None:
                    job_execution = _write_job_status(job_execution, job_info)
                    LOG.info(_LI("Job execution was canceled successfully"))
                    return job_execution
                context.sleep(3)
                job_execution = conductor.job_execution_get(
                    ctx, job_execution_id)
                if not job_execution:
                    LOG.info(
                        _LI("Job execution was deleted. "
                            "Canceling current operation."))
                    return job_execution
            else:
                LOG.info(
                    _LI("Job execution status: {status}").format(
                        status=job_execution.info['status']))
                return job_execution
        else:
            raise e.CancelingFailed(
                _('Job execution %s was not canceled') % job_execution.id)
Example #29
    def _install_components(self, ambari_info, auth, cluster_name, servers):
        # query for the host components on the given hosts that are in the
        # INIT state
        # TODO(jspeidel): provide request context
        body = '{"HostRoles": {"state" : "INSTALLED"}}'
        install_uri = ('http://{0}/api/v1/clusters/{'
                       '1}/host_components?HostRoles/state=INIT&'
                       'HostRoles/host_name.in({2})'.format(
                           ambari_info.get_address(), cluster_name,
                           self._get_host_list(servers)))
        self._exec_ambari_command(ambari_info, body, install_uri)
        LOG.info(_LI('Started Hadoop components while scaling up'))
        LOG.info(_LI('Ambari server ip {ip}')
                 .format(ip=ambari_info.get_address()))
Example #30
    def _install_components(self, ambari_info, auth, cluster_name, servers):
        # query for the host components on the given hosts that are in the
        # INIT state
        # TODO(jspeidel): provide request context
        body = '{"HostRoles": {"state" : "INSTALLED"}}'
        install_uri = ('http://{0}/api/v1/clusters/{'
                       '1}/host_components?HostRoles/state=INIT&'
                       'HostRoles/host_name.in({2})'.format(
                           ambari_info.get_address(), cluster_name,
                           self._get_host_list(servers)))
        self._exec_ambari_command(ambari_info, body, install_uri)
        LOG.info(_LI('Started Hadoop components while scaling up'))
        LOG.info(
            _LI('Ambari server ip {ip}').format(ip=ambari_info.get_address()))
Example #31
    def start_cluster(self, cluster):
        nn_instance = vu.get_namenode(cluster)
        with remote.get_remote(nn_instance) as r:
            run.format_namenode(r)
            run.start_processes(r, "namenode")

        for snn in vu.get_secondarynamenodes(cluster):
            run.start_processes(remote.get_remote(snn), "secondarynamenode")

        jt_instance = vu.get_jobtracker(cluster)
        if jt_instance:
            run.start_processes(remote.get_remote(jt_instance), "jobtracker")

        self._start_tt_dn_processes(utils.get_instances(cluster))

        self._await_datanodes(cluster)

        LOG.info(_LI("Hadoop services in cluster %s have been started"),
                 cluster.name)

        oozie = vu.get_oozie(cluster)
        if oozie:
            with remote.get_remote(oozie) as r:
                if c_helper.is_mysql_enable(cluster):
                    run.mysql_start(r, oozie)
                    run.oozie_create_db(r)
                run.oozie_share_lib(r, nn_instance.hostname())
                run.start_oozie(r)
                LOG.info(_LI("Oozie service at '%s' has been started"),
                         nn_instance.hostname())

        hive_server = vu.get_hiveserver(cluster)
        if hive_server:
            with remote.get_remote(hive_server) as r:
                run.hive_create_warehouse_dir(r)
                run.hive_copy_shared_conf(
                    r, edp.get_hive_shared_conf_path('hadoop'))

                if c_helper.is_mysql_enable(cluster):
                    if not oozie or hive_server.hostname() != oozie.hostname():
                        run.mysql_start(r, hive_server)
                    run.hive_create_db(r, cluster.extra['hive_mysql_passwd'])
                    run.hive_metastore_start(r)
                    LOG.info(
                        _LI("Hive Metastore server at %s has been "
                            "started"), hive_server.hostname())

        LOG.info(_LI('Cluster %s has been started successfully'), cluster.name)
        self._set_cluster_info(cluster)
Example #32
        def terminate_unneeded_clusters(self, ctx):
            LOG.debug('Terminating unneeded transient clusters')
            ctx = context.get_admin_context()
            context.set_ctx(ctx)
            for cluster in conductor.cluster_get_all(ctx, status='Active'):
                if not cluster.is_transient:
                    continue

                jc = conductor.job_execution_count(ctx,
                                                   end_time=None,
                                                   cluster_id=cluster.id)

                if jc > 0:
                    continue

                cluster_updated_at = timeutils.normalize_time(
                    timeutils.parse_isotime(cluster.updated_at))
                current_time = timeutils.utcnow()
                spacing = timeutils.delta_seconds(cluster_updated_at,
                                                  current_time)
                if spacing < CONF.min_transient_cluster_active_time:
                    continue

                if CONF.use_identity_api_v3:
                    trusts.use_os_admin_auth_token(cluster)

                    LOG.info(
                        _LI('Terminating transient cluster %(cluster)s '
                            'with id %(id)s'), {
                                'cluster': cluster.name,
                                'id': cluster.id
                            })

                    try:
                        ops.terminate_cluster(cluster.id)
                    except Exception as e:
                        LOG.info(
                            _LI('Failed to terminate transient cluster '
                                '%(cluster)s with id %(id)s: %(error)s.'), {
                                    'cluster': cluster.name,
                                    'id': cluster.id,
                                    'error': six.text_type(e)
                                })

                else:
                    if cluster.status != 'AwaitingTermination':
                        conductor.cluster_update(
                            ctx, cluster, {'status': 'AwaitingTermination'})
            context.set_ctx(None)
Example #33
    def start_cluster(self, cluster):
        nn_instance = vu.get_namenode(cluster)
        with remote.get_remote(nn_instance) as r:
            run.format_namenode(r)
            run.start_processes(r, "namenode")

        for snn in vu.get_secondarynamenodes(cluster):
            run.start_processes(remote.get_remote(snn), "secondarynamenode")

        jt_instance = vu.get_jobtracker(cluster)
        if jt_instance:
            run.start_processes(remote.get_remote(jt_instance), "jobtracker")

        self._start_tt_dn_processes(utils.get_instances(cluster))

        self._await_datanodes(cluster)

        LOG.info(_LI("Hadoop services in cluster %s have been started"),
                 cluster.name)

        oozie = vu.get_oozie(cluster)
        if oozie:
            with remote.get_remote(oozie) as r:
                if c_helper.is_mysql_enable(cluster):
                    run.mysql_start(r, oozie)
                    run.oozie_create_db(r)
                run.oozie_share_lib(r, nn_instance.hostname())
                run.start_oozie(r)
                LOG.info(_LI("Oozie service at '%s' has been started"),
                         nn_instance.hostname())

        hive_server = vu.get_hiveserver(cluster)
        if hive_server:
            with remote.get_remote(hive_server) as r:
                run.hive_create_warehouse_dir(r)
                run.hive_copy_shared_conf(
                    r, edp.get_hive_shared_conf_path('hadoop'))

                if c_helper.is_mysql_enable(cluster):
                    if not oozie or hive_server.hostname() != oozie.hostname():
                        run.mysql_start(r, hive_server)
                    run.hive_create_db(r)
                    run.hive_metastore_start(r)
                    LOG.info(_LI("Hive Metastore server at %s has been "
                                 "started"),
                             hive_server.hostname())

        LOG.info(_LI('Cluster %s has been started successfully'), cluster.name)
        self._set_cluster_info(cluster)
Example #34
def decommission_nodes(cluster, instances, configure_sh_string):
    LOG.info(_LI('Start decommission. Cluster = %s'), cluster.name)
    move_node(cluster, instances)
    stop_services(cluster, instances)
    context.sleep(names.WAIT_NODE_ALARM_NO_HEARTBEAT)
    remove_node(cluster, instances)
    remove_services(cluster, instances)
    if check_for_cldb_or_zookeeper_service(instances):
        all_instances = gen.get_instances(cluster)
        current_cluster_instances = [
            x for x in all_instances if x not in instances]
        for inst in current_cluster_instances:
            start_helper.exec_configure_sh_on_instance(
                cluster, inst, configure_sh_string)
    LOG.info(_LI('End decommission. Cluster = %s'), cluster.name)
Example #35
File: rpc.py  Project: msionkin/sahara
def setup():
    """Initialise the oslo_messaging layer."""
    global TRANSPORT, NOTIFIER

    messaging.set_transport_defaults('sahara')

    TRANSPORT = messaging.get_transport(cfg.CONF, aliases=_ALIASES)

    if not cfg.CONF.oslo_messaging_notifications.enable:
        LOG.info(_LI("Notifications disabled"))
        return
    LOG.info(_LI("Notifications enabled"))

    serializer = ContextSerializer(JsonPayloadSerializer())
    NOTIFIER = messaging.Notifier(TRANSPORT, serializer=serializer)
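(For context, a minimal sketch of how the NOTIFIER created by setup() might be used to emit an event; the publisher id, event type, and payload fields below are illustrative assumptions, not taken from the listing.)

def notify(context, cluster_id, cluster_name, cluster_status, ev_type):
    """Illustrative sketch: emit a cluster status notification."""
    if NOTIFIER is None:
        # setup() returned early because notifications are disabled
        return
    notifier = NOTIFIER.prepare(publisher_id='sahara')  # assumed publisher id
    notifier.info(context,
                  'sahara.cluster.%s' % ev_type,  # assumed event type format
                  {'cluster_id': cluster_id,
                   'cluster_name': cluster_name,
                   'cluster_status': cluster_status})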
Example #36
def create_disk_list_file(instance, path_to_disk_setup_script):
    LOG.info(_LI('START: Creating disk list file.'))
    script_path = '/tmp/disk_setup_script.sh'
    rmt = instance.remote()
    LOG.debug('Writing /tmp/disk_setup_script.sh')
    rmt.write_file_to(
        script_path, files.get_file_text(path_to_disk_setup_script))
    LOG.debug('Start executing command: chmod +x %s', script_path)
    rmt.execute_command('chmod +x ' + script_path, run_as_root=True)
    LOG.debug('Done for executing command.')
    args = ' '.join(instance.node_group.storage_paths())
    cmd = '%s %s' % (script_path, args)
    LOG.debug('Executing %s', cmd)
    rmt.execute_command(cmd, run_as_root=True)
    LOG.info(_LI('END: Creating disk list file.'))
Example #37
File: rpc.py  Project: egafford/sahara
def setup():
    """Initialise the oslo_messaging layer."""
    global TRANSPORT, NOTIFIER

    messaging.set_transport_defaults('sahara')

    TRANSPORT = messaging.get_transport(cfg.CONF, aliases=_ALIASES)

    if not cfg.CONF.enable_notifications:
        LOG.info(_LI("Notifications disabled"))
        return
    LOG.info(_LI("Notifications enabled"))

    serializer = ContextSerializer(JsonPayloadSerializer())
    NOTIFIER = messaging.Notifier(TRANSPORT, serializer=serializer)
Example #38
def cancel_job(job_execution_id):
    ctx = context.ctx()
    job_execution = conductor.job_execution_get(ctx, job_execution_id)
    if job_execution.info['status'] in edp.JOB_STATUSES_TERMINATED:
        return job_execution
    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster is None:
        return job_execution
    engine = _get_job_engine(cluster, job_execution)
    if engine is not None:
        job_execution = conductor.job_execution_update(
            ctx, job_execution_id,
            {'info': {'status': edp.JOB_STATUS_TOBEKILLED}})

        timeout = CONF.job_canceling_timeout
        s_time = timeutils.utcnow()
        while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
            if job_execution.info['status'] not in edp.JOB_STATUSES_TERMINATED:
                try:
                    job_info = engine.cancel_job(job_execution)
                except Exception as ex:
                    job_info = None
                    LOG.warning(
                        _LW("Error during cancel of job execution {job}: "
                            "{error}").format(job=job_execution.id,
                                              error=ex))
                if job_info is not None:
                    job_execution = _write_job_status(job_execution, job_info)
                    LOG.info(_LI("Job execution {job_id} was canceled "
                                 "successfully").format(
                                     job_id=job_execution.id))
                    return job_execution
                context.sleep(3)
                job_execution = conductor.job_execution_get(
                    ctx, job_execution_id)
                if not job_execution:
                    LOG.info(_LI("Job execution {job_exec_id} was deleted. "
                                 "Canceling current operation.").format(
                             job_exec_id=job_execution_id))
                    return job_execution
            else:
                LOG.info(_LI("Job execution status {job}: {status}").format(
                         job=job_execution.id,
                         status=job_execution.info['status']))
                return job_execution
        else:
            raise e.CancelingFailed(_('Job execution %s was not canceled')
                                    % job_execution.id)
Example #39
def start_server(app):
    sock = eventlet.listen((cfg.CONF.host, cfg.CONF.port), backlog=500)
    if sslutils.is_enabled():
        LOG.info(_LI("Using HTTPS for port %s"), cfg.CONF.port)
        sock = sslutils.wrap(sock)

    wsgi.server(sock, app, log=loggers.WritableLogger(LOG), debug=False)
Example #40
    def post_start(self, c_context, instances=None):
        instances = instances or c_context.get_instances()
        LOG.debug('Executing service post start hooks')
        for service in c_context.cluster_services:
            updated = c_context.filter_instances(instances, service=service)
            service.post_start(c_context, updated)
        LOG.info(_LI('Post start hooks successfully executed'))
Example #41
    def create_cluster(self, cluster):
        version = cluster.hadoop_version
        handler = self.version_factory.get_version_handler(version)

        cluster_spec = handler.get_cluster_spec(
            cluster, self._map_to_user_inputs(version,
                                              cluster.cluster_configs))
        hosts = self._get_servers(cluster)
        ambari_info = self.get_ambari_info(cluster_spec)
        self.cluster_ambari_mapping[cluster.name] = ambari_info
        rpm = self._get_rpm_uri(cluster_spec)

        servers = []
        for host in hosts:
            host_role = utils.get_host_role(host)
            servers.append(
                h.HadoopServer(host,
                               cluster_spec.node_groups[host_role],
                               ambari_rpm=rpm))

        self._provision_cluster(cluster.name, cluster_spec, ambari_info,
                                servers, cluster.hadoop_version)

        # add the topology data file and script if rack awareness is
        # enabled
        self._configure_topology_for_cluster(cluster, servers)

        LOG.info(_LI("Install of Hadoop stack successful."))
        # add service urls
        self._set_cluster_info(cluster, cluster_spec)

        # check if HDFS HA is enabled; set it up if so
        if cluster_spec.is_hdfs_ha_enabled(cluster):
            self.configure_hdfs_ha(cluster)
Example #42
    def post_start(self, c_context, instances=None):
        instances = instances or c_context.get_instances()
        LOG.debug('Executing service post start hooks')
        for service in c_context.cluster_services:
            updated = c_context.filter_instances(instances, service=service)
            service.post_start(c_context, updated)
        LOG.info(_LI('Post start hooks successfully executed'))
Example #43
def change_cluster_status(cluster, status, status_description=None):
    ctx = context.ctx()

    # Update cluster status. Race conditions with deletion are still possible,
    # but this reduces probability at least.
    cluster = conductor.cluster_get(ctx, cluster) if cluster else None

    if status_description is not None:
        change_cluster_status_description(cluster, status_description)

    # 'Deleting' is final and can't be changed
    if cluster is None or cluster.status == CLUSTER_STATUS_DELETING:
        return cluster

    update_dict = {"status": status}
    cluster = conductor.cluster_update(ctx, cluster, update_dict)
    conductor.cluster_provision_progress_update(ctx, cluster.id)

    LOG.info(_LI("Cluster status has been changed. New status="
                 "{status}").format(status=cluster.status))

    sender.notify(ctx, cluster.id, cluster.name, cluster.status,
                  "update")

    return cluster
Example #44
    def _exec_ambari_command(self, ambari_info, body, cmd_uri):

        LOG.debug('PUT URI: {0}'.format(cmd_uri))
        result = self._put(cmd_uri, ambari_info, data=body)
        if result.status_code == 202:
            LOG.debug(
                'PUT response: {0}'.format(result.text))
            json_result = json.loads(result.text)
            href = json_result['href'] + '/tasks?fields=Tasks/status'
            success = self._wait_for_async_request(href, ambari_info)
            if success:
                LOG.info(
                    _LI("Successfully changed state of Hadoop components "))
            else:
                LOG.critical(_LC('Failed to change state of Hadoop '
                                 'components'))
                raise ex.HadoopProvisionError(
                    _('Failed to change state of Hadoop components'))

        else:
            LOG.error(
                _LE('Command failed. Status: %(status)s, response: '
                    '%(response)s'),
                {'status': result.status_code, 'response': result.text})
            raise ex.HadoopProvisionError(_('Hadoop/Ambari command failed.'))
Example #45
    def _single_run(self, application, sock):
        """Start a WSGI server in a new green thread."""
        LOG.info(_LI("Starting single process server"))
        eventlet.wsgi.server(sock, application,
                             custom_pool=self.pool,
                             log=loggers.WritableLogger(LOG),
                             debug=False)
Example #46
    def _provision_cluster(self, name, cluster_spec, ambari_info,
                           servers, version):
        # TODO(jspeidel): encapsulate in another class

        if servers:
            cpo.add_provisioning_step(
                servers[0].cluster_id,
                _("Provision cluster via Ambari"), len(servers))

        with context.ThreadGroup() as tg:
            for server in servers:
                with context.set_current_instance_id(
                        server.instance['instance_id']):
                    tg.spawn(
                        "hdp-provision-instance-%s" %
                        server.instance.hostname(),
                        server.provision_ambari, ambari_info, cluster_spec)

        handler = self.version_factory.get_version_handler(version)
        ambari_client = handler.get_ambari_client()

        ambari_client.wait_for_host_registrations(len(servers), ambari_info)
        self._set_ambari_credentials(cluster_spec, ambari_info, version)

        ambari_client.provision_cluster(
            cluster_spec, servers, ambari_info, name)

        LOG.info(_LI('Cluster provisioned via Ambari Server: {server_ip}')
                 .format(server_ip=ambari_info.get_address()))
Example #47
    def create_cluster(self, cluster):
        version = cluster.hadoop_version
        handler = self.version_factory.get_version_handler(version)

        cluster_spec = handler.get_cluster_spec(
            cluster, self._map_to_user_inputs(
                version, cluster.cluster_configs))
        hosts = self._get_servers(cluster)
        ambari_info = self.get_ambari_info(cluster_spec)
        self.cluster_ambari_mapping[cluster.name] = ambari_info
        rpm = self._get_rpm_uri(cluster_spec)

        servers = []
        for host in hosts:
            host_role = utils.get_host_role(host)
            servers.append(
                h.HadoopServer(host, cluster_spec.node_groups[host_role],
                               ambari_rpm=rpm))

        self._provision_cluster(
            cluster.name, cluster_spec, ambari_info, servers,
            cluster.hadoop_version)

        # add the topology data file and script if rack awareness is
        # enabled
        self._configure_topology_for_cluster(cluster, servers)

        LOG.info(_LI("Install of Hadoop stack successful."))
        # add service urls
        self._set_cluster_info(cluster, cluster_spec)
Example #48
    def configure_hdfs_ha(self, cluster):
        LOG.debug("Configuring HDFS HA")
        version = cluster.hadoop_version
        handler = self.version_factory.get_version_handler(version)

        cluster_spec = handler.get_cluster_spec(
            cluster, self._map_to_user_inputs(version,
                                              cluster.cluster_configs))
        hosts = self._get_servers(cluster)
        ambari_info = self.get_ambari_info(cluster_spec)
        self.cluster_ambari_mapping[cluster.name] = ambari_info
        rpm = self._get_rpm_uri(cluster_spec)

        servers = []
        for host in hosts:
            host_role = utils.get_host_role(host)
            servers.append(
                h.HadoopServer(host,
                               cluster_spec.node_groups[host_role],
                               ambari_rpm=rpm))

        ambari_client = handler.get_ambari_client()
        ambari_client.setup_hdfs_ha(cluster_spec, servers, ambari_info,
                                    cluster.name)
        LOG.info(_LI("Configure HDFS HA successful."))
Example #49
def vm_awareness_mapred_config():
    c = x.load_hadoop_xml_defaults('topology/resources/mapred-template.xml')
    result = [cfg for cfg in c if cfg['value']]
    LOG.info(
        _LI("Vm awareness will add following configs in map-red "
            "params: %s"), result)
    return result
Example #50
def change_cluster_status(cluster, status, status_description=None):
    ctx = context.ctx()

    # Update cluster status. Race conditions with deletion are still possible,
    # but this reduces probability at least.
    cluster = conductor.cluster_get(ctx, cluster) if cluster else None

    # 'Deleting' is final and can't be changed
    if cluster is None or cluster.status == 'Deleting':
        return cluster

    update_dict = {"status": status}
    if status_description:
        update_dict["status_description"] = status_description

    cluster = conductor.cluster_update(ctx, cluster, update_dict)

    LOG.info(
        _LI("Cluster status has been changed: id=%(id)s, New status="
            "%(status)s"), {
                'id': cluster.id,
                'status': cluster.status
            })

    sender.notify(ctx, cluster.id, cluster.name, cluster.status, "update")

    return cluster
Example #51
    def _rollback_cluster_creation(self, cluster, ex):
        """Shutdown all instances and update cluster status."""
        LOG.info(_LI("Cluster '%(name)s' creation rollback "
                     "(reason: %(reason)s)"),
                 {'name': cluster.name, 'reason': ex})

        self.shutdown_cluster(cluster)