Example #1
    def start_cluster(self, cluster):
        nn_instance = utils.get_instance(cluster, "namenode")
        sm_instance = utils.get_instance(cluster, "master")
        dn_instances = utils.get_instances(cluster, "datanode")

        # Start the name node
        with remote.get_remote(nn_instance) as r:
            run.format_namenode(r)
            run.start_processes(r, "namenode")

        # start the data nodes
        self._start_slave_datanode_processes(dn_instances)

        LOG.info(_LI("Hadoop services in cluster %s have been started"),
                 cluster.name)

        with remote.get_remote(nn_instance) as r:
            r.execute_command("sudo -u hdfs hdfs dfs -mkdir -p /user/$USER/")
            r.execute_command("sudo -u hdfs hdfs dfs -chown $USER "
                              "/user/$USER/")

        # start spark nodes
        if sm_instance:
            with remote.get_remote(sm_instance) as r:
                run.start_spark_master(r, self._spark_home(cluster))
                LOG.info(_LI("Spark service at '%s' has been started"),
                         sm_instance.hostname())

        LOG.info(_LI('Cluster %s has been started successfully'), cluster.name)
        self._set_cluster_info(cluster)
Example #2
    def _extract_configs_to_extra(self, cluster):
        nn = utils.get_instance(cluster, "namenode")
        sp_master = utils.get_instance(cluster, "master")
        sp_slaves = utils.get_instances(cluster, "slave")

        extra = dict()

        config_master = config_slaves = ""
        if sp_master is not None:
            config_master = c_helper.generate_spark_env_configs(cluster)

        if sp_slaves is not None:
            slavenames = []
            for slave in sp_slaves:
                slavenames.append(slave.hostname())
            config_slaves = c_helper.generate_spark_slaves_configs(slavenames)
        else:
            config_slaves = "\n"

        for ng in cluster.node_groups:
            extra[ng.id] = {
                "xml": c_helper.generate_xml_configs(ng.configuration(), ng.storage_paths(), nn.hostname(), None),
                "setup_script": c_helper.generate_hadoop_setup_script(
                    ng.storage_paths(), c_helper.extract_hadoop_environment_confs(ng.configuration())
                ),
                "sp_master": config_master,
                "sp_slaves": config_slaves,
            }

        if c_helper.is_data_locality_enabled(cluster):
            topology_data = th.generate_topology_map(cluster, CONF.enable_hypervisor_awareness)
            extra["topology_data"] = "\n".join([k + " " + v for k, v in topology_data.items()]) + "\n"

        return extra
Example #3
    def start_cluster(self, cluster):
        nn_instance = utils.get_instance(cluster, "namenode")
        sm_instance = utils.get_instance(cluster, "master")
        dn_instances = utils.get_instances(cluster, "datanode")

        # Start the name node
        with remote.get_remote(nn_instance) as r:
            run.format_namenode(r)
            run.start_processes(r, "namenode")

        # start the data nodes
        self._start_slave_datanode_processes(dn_instances)

        LOG.info("Hadoop services in cluster %s have been started" %
                 cluster.name)

        with remote.get_remote(nn_instance) as r:
            r.execute_command("sudo -u hdfs hdfs dfs -mkdir -p /user/$USER/")
            r.execute_command(("sudo -u hdfs hdfs dfs -chown $USER "
                               "/user/$USER/"))

        # start spark nodes
        if sm_instance:
            with remote.get_remote(sm_instance) as r:
                run.start_spark_master(r, self._spark_home(cluster))
                LOG.info("Spark service at '%s' has been started",
                         sm_instance.hostname())

        LOG.info('Cluster %s has been started successfully' % cluster.name)
        self._set_cluster_info(cluster)
Example #4
    def scale_cluster(self, cluster, instances):
        master = utils.get_instance(cluster, "master")
        r_master = remote.get_remote(master)

        run.stop_spark(r_master, self._spark_home(cluster))

        self._setup_instances(cluster, instances)
        nn = utils.get_instance(cluster, "namenode")
        run.refresh_nodes(remote.get_remote(nn), "dfsadmin")
        self._start_slave_datanode_processes(instances)

        run.start_spark_master(r_master, self._spark_home(cluster))
        LOG.info(_LI("Spark master service at '%s' has been restarted"), master.hostname())
Example #5
    def scale_cluster(self, cluster, instances):
        master = utils.get_instance(cluster, "master")
        r_master = remote.get_remote(master)

        run.stop_spark(r_master, self._spark_home(cluster))

        self._setup_instances(cluster, instances)
        nn = utils.get_instance(cluster, "namenode")
        run.refresh_nodes(remote.get_remote(nn), "dfsadmin")
        self._start_slave_datanode_processes(instances)

        run.start_spark_master(r_master, self._spark_home(cluster))
        LOG.info(_LI("Spark master service at '%s' has been restarted"),
                 master.hostname())
Example #6
def scale_cluster(cluster, instances):
    scale_ins_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]
    to_scale_dn = []
    to_scale_tt = []
    for i in scale_ins_hosts:
        if i in dn_hosts:
            to_scale_dn.append(i)

        if i in tt_hosts:
            to_scale_tt.append(i)

    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)
    rack = '/Default'
    client.nodes.add(scale_ins_hosts, rack, 'hadoop',
                     '/home/hadoop/.ssh/id_rsa')
    client.cluster.install_software(scale_ins_hosts)

    if to_scale_tt:
        client.services.mapred.add_nodes('TaskTracker', to_scale_tt)

    if to_scale_dn:
        client.services.hdfs.add_nodes('DataNode', to_scale_dn)

    client.nodes.config()

    if to_scale_dn:
        client.services.hdfs.start()

    if to_scale_tt:
        client.services.mapred.start()
Example #7
def install_cluster(cluster):
    mng_instance = u.get_instance(cluster, 'manager')

    all_hosts = list(set([i.fqdn() for i in u.get_instances(cluster)]))

    client = c.IntelClient(mng_instance, cluster.name)

    LOG.info("Create cluster")
    client.cluster.create()

    LOG.info("Add nodes to cluster")
    rack = '/Default'
    client.nodes.add(all_hosts, rack, 'hadoop',
                     '/home/hadoop/.ssh/id_rsa')

    LOG.info("Install software")
    client.cluster.install_software(all_hosts)

    LOG.info("Configure services")
    _configure_services(client, cluster)

    LOG.info("Deploy cluster")
    client.nodes.config(force=True)

    LOG.info("Provisioning configs")
    # cinder and ephemeral drive support
    _configure_storage(client, cluster)
    # swift support
    _configure_swift(client, cluster)
    # user configs
    _add_user_params(client, cluster)

    LOG.info("Format HDFS")
    client.services.hdfs.format()
Example #8
def scale_cluster(cluster, instances):
    scale_ins_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]
    to_scale_dn = []
    to_scale_tt = []
    for i in scale_ins_hosts:
        if i in dn_hosts:
            to_scale_dn.append(i)

        if i in tt_hosts:
            to_scale_tt.append(i)

    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)
    rack = '/Default'
    client.nodes.add(scale_ins_hosts, rack, 'hadoop',
                     '/home/hadoop/.ssh/id_rsa')
    client.cluster.install_software(scale_ins_hosts)

    if to_scale_tt:
        client.services.mapred.add_nodes('TaskTracker', to_scale_tt)

    if to_scale_dn:
        client.services.hdfs.add_nodes('DataNode', to_scale_dn)

    client.nodes.config()

    if to_scale_dn:
        client.services.hdfs.start()

    if to_scale_tt:
        client.services.mapred.start()
Example #9
def install_cluster(cluster):
    mng_instance = u.get_instance(cluster, 'manager')

    all_hosts = list(set([i.fqdn() for i in u.get_instances(cluster)]))

    client = c.IntelClient(mng_instance, cluster.name)

    LOG.info("Create cluster")
    client.cluster.create()

    LOG.info("Add nodes to cluster")
    rack = '/Default'
    client.nodes.add(all_hosts, rack, 'hadoop', '/home/hadoop/.ssh/id_rsa')

    LOG.info("Install software")
    client.cluster.install_software(all_hosts)

    LOG.info("Configure services")
    _configure_services(client, cluster)

    LOG.info("Deploy cluster")
    client.nodes.config(force=True)

    LOG.info("Provisioning configs")
    # cinder and ephemeral drive support
    _configure_storage(client, cluster)
    # swift support
    _configure_swift(client, cluster)
    # user configs
    _add_user_params(client, cluster)

    LOG.info("Format HDFS")
    client.services.hdfs.format()
Example #10
def decommission_nodes(cluster, instances):
    dec_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    nm_hosts = [nm.fqdn() for nm in u.get_nodemanagers(cluster)]

    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)

    dec_dn_hosts = []
    for dec_host in dec_hosts:
        if dec_host in dn_hosts:
            dec_dn_hosts.append(dec_host)

    if dec_dn_hosts:
        client.services.hdfs.decommission_nodes(dec_dn_hosts)

        #TODO(alazarev) make timeout configurable (bug #1262897)
        timeout = 14400  # 4 hours
        cur_time = 0
        for host in dec_dn_hosts:
            while cur_time < timeout:
                if client.services.hdfs.get_datanode_status(
                        host) == 'Decomissioned':
                    break
                context.sleep(5)
                cur_time += 5
            else:
                LOG.warn("Failed to decomission node '%s' of cluster '%s' "
                         "in %s minutes" % (host, cluster.name, timeout / 60))

    client.nodes.stop(dec_hosts)

    # wait for the services to stop
    #TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600  # 10 minutes
    cur_time = 0
    for instance in instances:
        while cur_time < timeout:
            stopped = True

            if instance.fqdn() in dn_hosts:
                stopped = stopped and _is_hadoop_service_stopped(
                    instance, 'hadoop-hdfs-datanode')

            if instance.fqdn() in nm_hosts:
                stopped = stopped and _is_hadoop_service_stopped(
                    instance, 'hadoop-yarn-nodemanager')

            if stopped:
                break
            else:
                context.sleep(5)
                cur_time += 5
        else:
            LOG.warn("Failed to stop services on node '%s' of cluster '%s' "
                     "in %s minutes" % (instance, cluster.name, timeout / 60))

    for node in dec_hosts:
        LOG.info("Deleting node '%s' on cluster '%s'" % (node, cluster.name))
        client.nodes.delete(node)
Example #11
    def _set_cluster_info(self, cluster):
        nn = utils.get_instance(cluster, "namenode")
        sp_master = utils.get_instance(cluster, "master")
        info = {}

        if nn:
            address = c_helper.get_config_value("HDFS", "dfs.http.address", cluster)
            port = address[address.rfind(":") + 1 :]
            info["HDFS"] = {"Web UI": "http://%s:%s" % (nn.management_ip, port)}
            info["HDFS"]["NameNode"] = "hdfs://%s:8020" % nn.hostname()

        if sp_master:
            port = c_helper.get_config_value("Spark", "Master webui port", cluster)
            if port is not None:
                info["Spark"] = {"Web UI": "http://%s:%s" % (sp_master.management_ip, port)}
        ctx = context.ctx()
        conductor.cluster_update(ctx, cluster, {"info": info})
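
For reference, the info dictionary assembled by _set_cluster_info above takes a nested shape along the lines of the sketch below; the addresses, hostnames and ports are made up for illustration, the real values come from the cluster configuration.

# Hypothetical shape of the resulting info dict (illustrative values only)
info = {
    "HDFS": {
        "Web UI": "http://192.0.2.10:50070",
        "NameNode": "hdfs://namenode-001:8020",
    },
    "Spark": {
        "Web UI": "http://192.0.2.11:8080",
    },
}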
Example #12
    def _extract_configs_to_extra(self, cluster):
        nn = utils.get_instance(cluster, "namenode")
        sp_master = utils.get_instance(cluster, "master")
        sp_slaves = utils.get_instances(cluster, "slave")

        extra = dict()

        config_master = config_slaves = ''
        if sp_master is not None:
            config_master = c_helper.generate_spark_env_configs(cluster)

        if sp_slaves is not None:
            slavenames = []
            for slave in sp_slaves:
                slavenames.append(slave.hostname())
            config_slaves = c_helper.generate_spark_slaves_configs(slavenames)
        else:
            config_slaves = "\n"

        for ng in cluster.node_groups:
            extra[ng.id] = {
                'xml':
                c_helper.generate_xml_configs(
                    ng.configuration(),
                    ng.storage_paths(),
                    nn.hostname(),
                    None,
                ),
                'setup_script':
                c_helper.generate_hadoop_setup_script(
                    ng.storage_paths(),
                    c_helper.extract_hadoop_environment_confs(
                        ng.configuration())),
                'sp_master':
                config_master,
                'sp_slaves':
                config_slaves
            }

        if c_helper.is_data_locality_enabled(cluster):
            topology_data = th.generate_topology_map(
                cluster, CONF.enable_hypervisor_awareness)
            extra['topology_data'] = "\n".join(
                [k + " " + v for k, v in topology_data.items()]) + "\n"

        return extra
Example #13
    def decommission_nodes(self, cluster, instances):
        sls = utils.get_instances(cluster, "slave")
        dns = utils.get_instances(cluster, "datanode")
        decommission_dns = False
        decommission_sls = False

        for i in instances:
            if 'datanode' in i.node_group.node_processes:
                dns.remove(i)
                decommission_dns = True
            if 'slave' in i.node_group.node_processes:
                sls.remove(i)
                decommission_sls = True

        nn = utils.get_instance(cluster, "namenode")
        spark_master = utils.get_instance(cluster, "master")

        if decommission_sls:
            sc.decommission_sl(spark_master, instances, sls)
        if decommission_dns:
            sc.decommission_dn(nn, instances, dns)
Example #14
    def decommission_nodes(self, cluster, instances):
        sls = utils.get_instances(cluster, "slave")
        dns = utils.get_instances(cluster, "datanode")
        decommission_dns = False
        decommission_sls = False

        for i in instances:
            if 'datanode' in i.node_group.node_processes:
                dns.remove(i)
                decommission_dns = True
            if 'slave' in i.node_group.node_processes:
                sls.remove(i)
                decommission_sls = True

        nn = utils.get_instance(cluster, "namenode")
        spark_master = utils.get_instance(cluster, "master")

        if decommission_sls:
            sc.decommission_sl(spark_master, instances, sls)
        if decommission_dns:
            sc.decommission_dn(nn, instances, dns)
Example #15
    def _set_cluster_info(self, cluster):
        nn = utils.get_instance(cluster, "namenode")
        sp_master = utils.get_instance(cluster, "master")
        info = {}

        if nn:
            address = c_helper.get_config_value('HDFS', 'dfs.http.address',
                                                cluster)
            port = address[address.rfind(':') + 1:]
            info['HDFS'] = {
                'Web UI': 'http://%s:%s' % (nn.management_ip, port)
            }
            info['HDFS']['NameNode'] = 'hdfs://%s:8020' % nn.hostname()

        if sp_master:
            port = c_helper.get_config_value('Spark', 'Master webui port',
                                             cluster)
            if port is not None:
                info['Spark'] = {
                    'Web UI': 'http://%s:%s' % (sp_master.management_ip, port)
                }
        ctx = context.ctx()
        conductor.cluster_update(ctx, cluster, {'info': info})
Example #16
    def _set_cluster_info(self, cluster):
        nn = utils.get_instance(cluster, "namenode")
        sp_master = utils.get_instance(cluster, "master")
        info = {}

        if nn:
            address = c_helper.get_config_value(
                'HDFS', 'dfs.http.address', cluster)
            port = address[address.rfind(':') + 1:]
            info['HDFS'] = {
                'Web UI': 'http://%s:%s' % (nn.management_ip, port)
            }
            info['HDFS']['NameNode'] = 'hdfs://%s:8020' % nn.hostname()

        if sp_master:
            port = c_helper.get_config_value(
                'Spark', 'Master webui port', cluster)
            if port is not None:
                info['Spark'] = {
                    'Web UI': 'http://%s:%s' % (sp_master.management_ip, port)
                }
        ctx = context.ctx()
        conductor.cluster_update(ctx, cluster, {'info': info})
Example #17
    def start_cluster(self, cluster):
        sm_instance = utils.get_instance(cluster, "master")
        sl_instances = utils.get_instances(cluster, "slave")

        # start storm master
        if sm_instance:
            with remote.get_remote(sm_instance) as r:
                run.start_storm_master(r)
                LOG.info("Storm master at '%s' has been started",
                         sm_instance.hostname())

        # start storm slaves
        self._start_slave_processes(sl_instances)

        LOG.info('Cluster %s has been started successfully' % cluster.name)
        self._set_cluster_info(cluster)
Example #18
def start_cluster(cluster):
    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)

    LOG.debug("Starting hadoop services")
    client.services.hdfs.start()

    if u.get_jobtracker(cluster):
        client.services.mapred.start()

    if u.get_hiveserver(cluster):
        client.services.hive.start()

    if u.get_oozie(cluster):
        LOG.info("Setup oozie")
        _setup_oozie(cluster)

        client.services.oozie.start()
Example #19
def start_cluster(cluster):
    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)

    LOG.debug("Starting hadoop services")
    client.services.hdfs.start()

    if u.get_jobtracker(cluster):
        client.services.mapred.start()

    if u.get_hiveserver(cluster):
        client.services.hive.start()

    if u.get_oozie(cluster):
        LOG.info("Setup oozie")
        _setup_oozie(cluster)

        client.services.oozie.start()
Example #20
def scale_cluster(cluster, instances):
    scale_ins_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    nm_hosts = [nm.fqdn() for nm in u.get_nodemanagers(cluster)]
    to_scale_dn = []
    to_scale_nm = []
    for i in scale_ins_hosts:
        if i in dn_hosts:
            to_scale_dn.append(i)

        if i in nm_hosts:
            to_scale_nm.append(i)

    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)
    rack = '/Default'
    client.nodes.add(scale_ins_hosts, rack, 'hadoop',
                     '/home/hadoop/.ssh/id_rsa')
    client.cluster.install_software(scale_ins_hosts)

    if to_scale_nm:
        client.services.yarn.add_nodes('NodeManager', to_scale_nm)

    if to_scale_dn:
        client.services.hdfs.add_nodes('DataNode', to_scale_dn)

    # IDH 3.0.2 reset cluster parameters (bug #1300603)
    # Restoring them back
    LOG.info("Provisioning configs")
    # cinder and ephemeral drive support
    _configure_storage(client, cluster)
    # swift support
    _configure_swift(client, cluster)
    # user configs
    _add_user_params(client, cluster)

    client.nodes.config()

    if to_scale_dn:
        client.services.hdfs.start()

    if to_scale_nm:
        client.services.yarn.start()
Example #21
def generate_spark_env_configs(cluster):
    configs = []

    # master configuration
    sp_master = utils.get_instance(cluster, "master")
    configs.append('SPARK_MASTER_IP=' + sp_master.hostname())

    masterport = get_config_value("Spark", "Master port", cluster)
    if masterport and masterport != _get_spark_opt_default("Master port"):
        configs.append('SPARK_MASTER_PORT=' + str(masterport))

    masterwebport = get_config_value("Spark", "Master webui port", cluster)
    if masterwebport and \
            masterwebport != _get_spark_opt_default("Master webui port"):
        configs.append('SPARK_MASTER_WEBUI_PORT=' + str(masterwebport))

    # configuration for workers
    workercores = get_config_value("Spark", "Worker cores", cluster)
    if workercores and workercores != _get_spark_opt_default("Worker cores"):
        configs.append('SPARK_WORKER_CORES=' + str(workercores))

    workermemory = get_config_value("Spark", "Worker memory", cluster)
    if workermemory and \
            workermemory != _get_spark_opt_default("Worker memory"):
        configs.append('SPARK_WORKER_MEMORY=' + str(workermemory))

    workerport = get_config_value("Spark", "Worker port", cluster)
    if workerport and workerport != _get_spark_opt_default("Worker port"):
        configs.append('SPARK_WORKER_PORT=' + str(workerport))

    workerwebport = get_config_value("Spark", "Worker webui port", cluster)
    if workerwebport and \
            workerwebport != _get_spark_opt_default("Worker webui port"):
        configs.append('SPARK_WORKER_WEBUI_PORT=' + str(workerwebport))

    workerinstances = get_config_value("Spark", "Worker instances", cluster)
    if workerinstances and \
            workerinstances != _get_spark_opt_default("Worker instances"):
        configs.append('SPARK_WORKER_INSTANCES=' + str(workerinstances))
    return '\n'.join(configs)
Example #22
def generate_spark_env_configs(cluster):
    configs = []

    # master configuration
    sp_master = utils.get_instance(cluster, "master")
    configs.append('SPARK_MASTER_IP=' + sp_master.hostname())

    masterport = get_config_value("Spark", "Master port", cluster)
    if masterport and masterport != _get_spark_opt_default("Master port"):
        configs.append('SPARK_MASTER_PORT=' + str(masterport))

    masterwebport = get_config_value("Spark", "Master webui port", cluster)
    if (masterwebport and
            masterwebport != _get_spark_opt_default("Master webui port")):
        configs.append('SPARK_MASTER_WEBUI_PORT=' + str(masterwebport))

    # configuration for workers
    workercores = get_config_value("Spark", "Worker cores", cluster)
    if workercores and workercores != _get_spark_opt_default("Worker cores"):
        configs.append('SPARK_WORKER_CORES=' + str(workercores))

    workermemory = get_config_value("Spark", "Worker memory", cluster)
    if (workermemory and
            workermemory != _get_spark_opt_default("Worker memory")):
        configs.append('SPARK_WORKER_MEMORY=' + str(workermemory))

    workerport = get_config_value("Spark", "Worker port", cluster)
    if workerport and workerport != _get_spark_opt_default("Worker port"):
        configs.append('SPARK_WORKER_PORT=' + str(workerport))

    workerwebport = get_config_value("Spark", "Worker webui port", cluster)
    if (workerwebport and
            workerwebport != _get_spark_opt_default("Worker webui port")):
        configs.append('SPARK_WORKER_WEBUI_PORT=' + str(workerwebport))

    workerinstances = get_config_value("Spark", "Worker instances", cluster)
    if (workerinstances and
            workerinstances != _get_spark_opt_default("Worker instances")):
        configs.append('SPARK_WORKER_INSTANCES=' + str(workerinstances))
    return '\n'.join(configs)
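
generate_spark_env_configs returns the body of a spark-env.sh file as newline-joined KEY=value entries. Below is a minimal usage sketch, assuming a cluster object and a Spark master instance as in the surrounding examples; the target path and the sp_master_instance name are illustrative assumptions, not taken from the plugin.

# Sketch only: push the generated settings to the master node.
# sp_master_instance and the spark-env.sh path are assumptions.
sp_env = generate_spark_env_configs(cluster)
with remote.get_remote(sp_master_instance) as r:
    r.write_file_to('/opt/spark/conf/spark-env.sh', sp_env)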
Example #23
    def _extract_configs_to_extra(self, cluster):
        st_master = utils.get_instance(cluster, "master")
        st_slaves = utils.get_instances(cluster, "slave")
        zk_servers = utils.get_instances(cluster, "zookeeper")

        extra = dict()

        config_master = config_slaves = ''
        zknames = []
        if st_master is not None:
            if zk_servers is not None:
                for zk in zk_servers:
                    zknames.append(zk.hostname())

            config_master = c_helper.generate_storm_config(
                cluster, st_master.hostname(), zknames)

        # FIGURE OUT HOW TO GET IPS
        for ng in cluster.node_groups:
            extra[ng.id] = {
                'setup_script': c_helper.generate_hosts_setup_script(
                    ng.storage_paths(),
                    c_helper.extract_hadoop_environment_confs(
                        ng.configuration())
                ),
                'sp_master': config_master,
                'sp_slaves': config_slaves
            }

        if c_helper.is_data_locality_enabled(cluster):
            topology_data = th.generate_topology_map(
                cluster, CONF.enable_hypervisor_awareness)
            extra['topology_data'] = "\n".join(
                [k + " " + v for k, v in topology_data.items()]) + "\n"

        return extra
Example #24
def get_resourcemanager(cluster):
    return u.get_instance(cluster, 'RESOURCEMANAGER')
Example #25
def install_manager(cluster):
    LOG.info("Starting Install Manager Process")
    mng_instance = u.get_instance(cluster, 'manager')

    idh_tarball_path = c_helper.get_config_value(
        cluster.cluster_configs.get('general'), c_helper.IDH_TARBALL_URL)

    idh_tarball_filename = idh_tarball_path.rsplit('/', 1)[-1]
    idh_dir = idh_tarball_filename[:idh_tarball_filename.find('.tar.gz')]
    LOG.info("IDH tgz will be retrieved from: \'%s\'", idh_tarball_path)

    idh_repo = c_helper.get_config_value(
        cluster.cluster_configs.get('general'), c_helper.IDH_REPO_URL)

    os_repo = c_helper.get_config_value(cluster.cluster_configs.get('general'),
                                        c_helper.OS_REPO_URL)

    idh_install_cmd = 'sudo ./%s/install.sh --mode=silent 2>&1' % idh_dir

    with mng_instance.remote() as r:
        LOG.info("Download IDH manager ")
        try:
            r.execute_command('curl -O %s 2>&1' % idh_tarball_path)
        except Exception as e:
            raise RuntimeError(
                "Unable to download IDH manager from %s" % idh_tarball_path, e)

        # unpack archive
        LOG.info("Unpack manager %s ", idh_tarball_filename)
        try:
            r.execute_command('tar xzf %s 2>&1' % idh_tarball_filename)
        except Exception as e:
            raise RuntimeError("Unable to unpack tgz %s", idh_tarball_filename,
                               e)

        # install idh
        LOG.debug("Install manager with %s : ", idh_install_cmd)
        inst_conf = _INST_CONF_TEMPLATE % (os_repo, idh_repo)
        r.write_file_to('%s/ui-installer/conf' % idh_dir, inst_conf)
        #TODO(alazarev) make timeout configurable (bug #1262897)
        r.execute_command(idh_install_cmd, timeout=3600)

        # fix nginx permissions bug
        r.execute_command('sudo chmod o+x /var/lib/nginx/ /var/lib/nginx/tmp '
                          '/var/lib/nginx/tmp/client_body')

    # wait for the IDH manager to start
    #TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600
    LOG.debug("Waiting %s seconds for Manager to start : ", timeout)
    while timeout:
        try:
            telnetlib.Telnet(mng_instance.management_ip, 9443)
            break
        except IOError:
            timeout -= 2
            context.sleep(2)
    else:
        message = ("IDH Manager failed to start in %s minutes on node '%s' "
                   "of cluster '%s'" %
                   (timeout / 60, mng_instance.management_ip, cluster.name))
        LOG.error(message)
        raise iex.IntelPluginException(message)
Example #26
    def run_job(self, job_execution):
        ctx = context.ctx()
        job = conductor.job_get(ctx, job_execution.job_id)

        proxy_configs = job_execution.job_configs.get('proxy_configs')

        # We'll always run the driver program on the master
        master = plugin_utils.get_instance(self.cluster, "master")

        # TODO(tmckay): wf_dir should probably be configurable.
        # The only requirement is that the dir is writable by the image user
        wf_dir = job_utils.create_workflow_dir(master, '/tmp/spark-edp', job,
                                               job_execution.id)
        paths = job_utils.upload_job_files(master,
                                           wf_dir,
                                           job,
                                           libs_subdir=False,
                                           proxy_configs=proxy_configs)

        # We can shorten the paths in this case since we'll run from within wf_dir
        paths = [os.path.basename(p) for p in paths]

        # TODO(tmckay): for now, paths[0] is always assumed to be the app
        # jar and we generate paths in order (mains, then libs).
        # When we have a Spark job type, we can require a "main" and set
        # the app jar explicitly to be "main"
        app_jar = paths.pop(0)

        # The rest of the paths will be passed with --jars
        additional_jars = ",".join(paths)
        if additional_jars:
            additional_jars = "--jars " + additional_jars

        # Launch the spark job using spark-submit and deploy_mode = client
        host = master.hostname()
        port = c_helper.get_config_value("Spark", "Master port", self.cluster)
        spark_submit = os.path.join(
            c_helper.get_config_value("Spark", "Spark home", self.cluster),
            "bin/spark-submit")

        job_class = job_execution.job_configs.configs["edp.java.main_class"]

        # TODO(tmckay): we need to clean up wf_dirs on long running clusters
        # TODO(tmckay): probably allow for general options to spark-submit
        args = " ".join(job_execution.job_configs.get('args', []))

        # The redirects of stdout and stderr will preserve output in the wf_dir
        cmd = "%s %s --class %s %s --master spark://%s:%s %s" % (
            spark_submit, app_jar, job_class, additional_jars, host, port,
            args)

        # If an exception is raised here, the job_manager will mark
        # the job failed and log the exception
        with remote.get_remote(master) as r:
            # Upload the command launch script
            launch = os.path.join(wf_dir, "launch_command")
            r.write_file_to(launch, self._job_script())
            r.execute_command("chmod +x %s" % launch)
            ret, stdout = r.execute_command(
                "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!" %
                (wf_dir, cmd))

        if ret == 0:
            # Success, we'll add the wf_dir in job_execution.extra and store
            # pid@instance_id as the job id
            # We know the job is running so return "RUNNING"
            return (stdout.strip() + "@" + master.id, edp.JOB_STATUS_RUNNING, {
                'spark-path': wf_dir
            })

        # No exception was raised, but something still failed.
        # Since we're using backgrounding with redirect, this is unlikely.
        raise e.EDPError(
            _("Spark job execution failed. Exit status = "
              "%(status)s, stdout = %(stdout)s") % {
                  'status': ret,
                  'stdout': stdout
              })
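
The cmd string assembled above is a plain spark-submit invocation in client deploy mode. For a hypothetical job it would look roughly like the sketch below; every concrete path, class name, host and argument is made up.

# Illustrative value of cmd built by run_job (all values are assumptions)
cmd = ("/opt/spark/bin/spark-submit app.jar --class org.example.Main "
       "--jars lib1.jar,lib2.jar --master spark://master-001:7077 arg1 arg2")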
Example #27
def get_oozie(cluster):
    return u.get_instance(cluster, 'OOZIE_SERVER')
Example #28
def get_historyserver(cluster):
    return u.get_instance(cluster, 'historyserver')
Example #29
def get_manager(cluster):
    return u.get_instance(cluster, 'MANAGER')
Example #30
def get_jobtracker(cluster):
    instance = u.get_instance(cluster, "jobtracker")

    return instance
Example #31
def get_oozie(cluster):
    return u.get_instance(cluster, "oozie")
Example #32
def get_namenode(cluster):
    return u.get_instance(cluster, "namenode")
Example #33
def get_jobtracker(cluster):
    instance = u.get_instance(cluster, "jobtracker")

    return instance
Example #34
def install_manager(cluster):
    LOG.info("Starting Install Manager Process")
    mng_instance = u.get_instance(cluster, 'manager')

    idh_tarball_path = c_helper.get_config_value(
        cluster.cluster_configs.get('general'), c_helper.IDH_TARBALL_URL)

    idh_tarball_filename = idh_tarball_path.rsplit('/', 1)[-1]
    idh_dir = idh_tarball_filename[:idh_tarball_filename.find('.tar.gz')]
    LOG.info("IDH tgz will be retrieved from: \'%s\'", idh_tarball_path)

    idh_repo = c_helper.get_config_value(
        cluster.cluster_configs.get('general'), c_helper.IDH_REPO_URL)

    os_repo = c_helper.get_config_value(
        cluster.cluster_configs.get('general'), c_helper.OS_REPO_URL)

    idh_install_cmd = 'sudo ./%s/install.sh --mode=silent 2>&1' % idh_dir

    with mng_instance.remote() as r:
        LOG.info("Download IDH manager ")
        try:
            r.execute_command('curl -O %s 2>&1' % idh_tarball_path)
        except Exception as e:
            raise RuntimeError("Unable to download IDH manager from %s" %
                               idh_tarball_path, e)

        # unpack archive
        LOG.info("Unpack manager %s ", idh_tarball_filename)
        try:
            r.execute_command('tar xzf %s 2>&1' % idh_tarball_filename)
        except Exception as e:
            raise RuntimeError("Unable to unpack tgz %s",
                               idh_tarball_filename, e)

        # install idh
        LOG.debug("Install manager with %s : ", idh_install_cmd)
        inst_conf = _INST_CONF_TEMPLATE % (os_repo, idh_repo)
        r.write_file_to('%s/ui-installer/conf' % idh_dir, inst_conf)
        #TODO(alazarev) make timeout configurable (bug #1262897)
        r.execute_command(idh_install_cmd, timeout=3600)

        # fix nginx permissions bug
        r.execute_command('sudo chmod o+x /var/lib/nginx/ /var/lib/nginx/tmp '
                          '/var/lib/nginx/tmp/client_body')

    # wait for the IDH manager to start
    #TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600
    LOG.debug("Waiting %s seconds for Manager to start : ", timeout)
    while timeout:
        try:
            telnetlib.Telnet(mng_instance.management_ip, 9443)
            break
        except IOError:
            timeout -= 2
            context.sleep(2)
    else:
        message = ("IDH Manager failed to start in %s minutes on node '%s' "
                   "of cluster '%s'"
                   % (timeout / 60, mng_instance.management_ip, cluster.name))
        LOG.error(message)
        raise iex.IntelPluginException(message)
Example #35
    def get_oozie_server(self, cluster):
        return u.get_instance(cluster, "oozie")
Example #36
    def run_job(self, job_execution):
        ctx = context.ctx()
        job = conductor.job_get(ctx, job_execution.job_id)

        # We'll always run the driver program on the master
        master = plugin_utils.get_instance(self.cluster, "master")

        # TODO(tmckay): wf_dir should probably be configurable.
        # The only requirement is that the dir is writable by the image user
        wf_dir = job_utils.create_workflow_dir(master, '/tmp/spark-edp', job,
                                               job_execution.id)
        paths = job_utils.upload_job_files(master, wf_dir, job,
                                           libs_subdir=False)

        # We can shorten the paths in this case since we'll run from within wf_dir
        paths = [os.path.basename(p) for p in paths]

        # TODO(tmckay): for now, paths[0] is always assumed to be the app
        # jar and we generate paths in order (mains, then libs).
        # When we have a Spark job type, we can require a "main" and set
        # the app jar explicitly to be "main"
        app_jar = paths.pop(0)

        # The rest of the paths will be passed with --jars
        additional_jars = ",".join(paths)
        if additional_jars:
            additional_jars = "--jars " + additional_jars

        # Launch the spark job using spark-submit and deploy_mode = client
        host = master.hostname()
        port = c_helper.get_config_value("Spark", "Master port", self.cluster)
        spark_submit = os.path.join(
            c_helper.get_config_value("Spark",
                                      "Spark home",
                                      self.cluster),
            "bin/spark-submit")

        job_class = job_execution.job_configs.configs["edp.java.main_class"]

        # TODO(tmckay): we need to clean up wf_dirs on long running clusters
        # TODO(tmckay): probably allow for general options to spark-submit
        args = " ".join(job_execution.job_configs.get('args', []))

        # The redirects of stdout and stderr will preserve output in the wf_dir
        cmd = "%s %s --class %s %s --master spark://%s:%s %s" % (
            spark_submit,
            app_jar,
            job_class,
            additional_jars,
            host,
            port,
            args)

        # If an exception is raised here, the job_manager will mark
        # the job failed and log the exception
        with remote.get_remote(master) as r:
            # Upload the command launch script
            launch = os.path.join(wf_dir, "launch_command")
            r.write_file_to(launch, self._job_script())
            r.execute_command("chmod +x %s" % launch)
            ret, stdout = r.execute_command(
                "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!"
                % (wf_dir, cmd))

        if ret == 0:
            # Success, we'll add the wf_dir in job_execution.extra and store
            # pid@instance_id as the job id
            # We know the job is running so return "RUNNING"
            return (stdout.strip() + "@" + master.id,
                    edp.JOB_STATUS_RUNNING,
                    {'spark-path': wf_dir})

        # No exception was raised, but something still failed.
        # Since we're using backgrounding with redirect, this is unlikely.
        raise e.EDPError("Spark job execution failed. Exit status = %s, "
                         "stdout = %s" % (ret, stdout))
Example #37
def get_namenode(cluster):
    return u.get_instance(cluster, "namenode")
Example #38
def decommission_nodes(cluster, instances):
    dec_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]

    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)

    dec_dn_hosts = []
    for dec_host in dec_hosts:
        if dec_host in dn_hosts:
            dec_dn_hosts.append(dec_host)

    if dec_dn_hosts:
        client.services.hdfs.decommission_nodes(dec_dn_hosts)

        #TODO(alazarev) make timeout configurable (bug #1262897)
        timeout = 14400  # 4 hours
        cur_time = 0
        for host in dec_dn_hosts:
            while cur_time < timeout:
                if client.services.hdfs.get_datanode_status(
                        host) == 'Decomissioned':
                    break
                context.sleep(5)
                cur_time += 5
            else:
                LOG.warn("Failed to decomission node '%s' of cluster '%s' "
                         "in %s minutes" % (host, cluster.name, timeout / 60))

    client.nodes.stop(dec_hosts)

    # wait for the services to stop
    #TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600  # 10 minutes
    cur_time = 0
    for instance in instances:
        while cur_time < timeout:
            stopped = True
            if instance.fqdn() in dn_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-datanode status',
                    raise_when_error=False)
                if out.strip() != 'datanode is stopped':
                    stopped = False
                if out.strip() == 'datanode dead but pid file exists':
                    instance.remote().execute_command(
                        'sudo rm -f '
                        '/var/run/hadoop/hadoop-hadoop-datanode.pid')
            if instance.fqdn() in tt_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-tasktracker status',
                    raise_when_error=False)
                if out.strip() != 'tasktracker is stopped':
                    stopped = False
            if stopped:
                break
            else:
                context.sleep(5)
                cur_time += 5
        else:
            LOG.warn("Failed to stop services on node '%s' of cluster '%s' "
                     "in %s minutes" % (instance, cluster.name, timeout / 60))

    for node in dec_hosts:
        LOG.info("Deleting node '%s' on cluster '%s'" % (node, cluster.name))
        client.nodes.delete(node)
Example #39
def get_resourcemanager(cluster):
    return u.get_instance(cluster, 'resourcemanager')
Example #40
    def get_oozie_server(self, cluster):
        return u.get_instance(cluster, "oozie_server")
Example #41
def get_hiveserver(cluster):
    return u.get_instance(cluster, "hiveserver")
Example #42
def get_historyserver(cluster):
    return u.get_instance(cluster, 'JOBHISTORY')
Example #43
def get_namenode(cluster):
    return u.get_instance(cluster, "NAMENODE")
Example #44
def get_secondarynamenode(cluster):
    return u.get_instance(cluster, 'SECONDARYNAMENODE')
Example #45
def get_resourcemanager(cluster):
    return u.get_instance(cluster, 'resourcemanager')
Example #46
    def test_get_instance(self):
        self.assertIsNone(u.get_instance(self.c1, 'wrong-process'))
        self.assertEqual(u.get_instance(self.c1, 'nn'),
                         self.ng1.instances[0])
        with testtools.ExpectedException(ex.InvalidComponentCountException):
            u.get_instance(self.c1, 'dn')
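
The test above pins down the contract of get_instance: None when no instance in the cluster runs the given process, the single matching instance when exactly one does, and InvalidComponentCountException when several match. Below is a minimal sketch of a helper with that behaviour, not the actual sahara implementation; the exception's argument order is an assumption.

# Sketch only; the real helper lives in the plugin utils module.
def get_instance(cluster, node_process):
    instances = get_instances(cluster, node_process)
    if len(instances) > 1:
        # argument order here is illustrative
        raise ex.InvalidComponentCountException(
            node_process, '0 or 1', len(instances))
    return instances[0] if instances else None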
Example #47
def get_oozie(cluster):
    return u.get_instance(cluster, "oozie")
Example #48
def get_hiveserver(cluster):
    return u.get_instance(cluster, "hiveserver")
Example #49
def decommission_nodes(cluster, instances):
    dec_hosts = [i.fqdn() for i in instances]
    dn_hosts = [dn.fqdn() for dn in u.get_datanodes(cluster)]
    tt_hosts = [tt.fqdn() for tt in u.get_tasktrackers(cluster)]

    client = c.IntelClient(u.get_instance(cluster, 'manager'), cluster.name)

    dec_dn_hosts = []
    for dec_host in dec_hosts:
        if dec_host in dn_hosts:
            dec_dn_hosts.append(dec_host)

    if dec_dn_hosts:
        client.services.hdfs.decommission_nodes(dec_dn_hosts)

        #TODO(alazarev) make timeout configurable (bug #1262897)
        timeout = 14400  # 4 hours
        cur_time = 0
        for host in dec_dn_hosts:
            while cur_time < timeout:
                if client.services.hdfs.get_datanode_status(
                        host) == 'Decomissioned':
                    break
                context.sleep(5)
                cur_time += 5
            else:
                LOG.warn("Failed to decomission node '%s' of cluster '%s' "
                         "in %s minutes" % (host, cluster.name, timeout / 60))

    client.nodes.stop(dec_hosts)

    # wait for the services to stop
    #TODO(alazarev) make timeout configurable (bug #1262897)
    timeout = 600  # 10 minutes
    cur_time = 0
    for instance in instances:
        while cur_time < timeout:
            stopped = True
            if instance.fqdn() in dn_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-datanode status',
                    raise_when_error=False)
                if out.strip() != 'datanode is stopped':
                    stopped = False
                if out.strip() == 'datanode dead but pid file exists':
                    instance.remote().execute_command(
                        'sudo rm -f '
                        '/var/run/hadoop/hadoop-hadoop-datanode.pid')
            if instance.fqdn() in tt_hosts:
                code, out = instance.remote().execute_command(
                    'sudo /sbin/service hadoop-tasktracker status',
                    raise_when_error=False)
                if out.strip() != 'tasktracker is stopped':
                    stopped = False
            if stopped:
                break
            else:
                context.sleep(5)
                cur_time += 5
        else:
            LOG.warn("Failed to stop services on node '%s' of cluster '%s' "
                     "in %s minutes" % (instance, cluster.name, timeout / 60))

    for node in dec_hosts:
        LOG.info("Deleting node '%s' on cluster '%s'" % (node, cluster.name))
        client.nodes.delete(node)
Example #50
def get_historyserver(cluster):
    return u.get_instance(cluster, 'historyserver')
Example #51
    def get_oozie_server(self, cluster):
        return u.get_instance(cluster, "OOZIE_SERVER")
Example #52
    def get_oozie_server(self, cluster):
        return u.get_instance(cluster, "OOZIE_SERVER")