Example #1
    def run_task(self, ip_address, process_id, settings, run_settings):
        """
            Start the task on the instance, then hang and
            periodically check its state.
        """
        logger.debug("run_task %s" % ip_address)
        #ip = botocloudconnector.get_instance_ip(instance_id, settings)
        #ip = ip_address
        logger.debug("ip=%s" % ip_address)
        # curr_username = settings['username']
        #settings['username'] = '******'
        # ssh = sshconnector.open_connection(ip_address=ip,
        #                                    settings=settings)
        # settings['username'] = curr_username

        #relative_path = settings['type'] + '@' + settings['payload_destination'] + "/" + process_id
        relative_path_suffix = self.get_relative_output_path(settings)
        relative_path = settings['type'] + '@' + os.path.join(relative_path_suffix, process_id)
        destination = get_url_with_credentials(settings,
            relative_path,
            is_relative_path=True,
            ip_address=ip_address)
        makefile_path = get_make_path(destination)
        ssh = None
        try:
            ssh = open_connection(ip_address=ip_address, settings=settings)
            command, errs = run_make(ssh, makefile_path, 'start_running_process')
            logger.debug('execute_command=%s' % command)
        finally:
            # Guard: open_connection may fail before ssh is bound.
            if ssh:
                ssh.close()
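run_make is not shown in these snippets, but the scheduling examples below build the equivalent "cd <path>; make <target>" command by hand and pass it to run_command_with_status, so run_make is presumably a thin wrapper along these lines (a hedged sketch, not the project's actual source; the sudo handling is an assumption):

def run_make(ssh, makefile_path, target, sudo=False):
    # Hypothetical reconstruction: mirrors the hand-built commands used by
    # start_round_robin_schedule below; the real chiminey helper may differ.
    prefix = 'sudo ' if sudo else ''
    command = "cd %s; %smake %s" % (makefile_path, prefix, target)
    return run_command_with_status(ssh, command)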
Example #2
def start_round_robin_reschedule(nodes, procs_2b_rescheduled,
                                 current_procs, settings,
                                 output_storage_settings, relative_path_suffix):
    total_nodes = len(nodes)
    all_nodes = list(nodes)
    processes = len(procs_2b_rescheduled)
    if total_nodes > processes:
        total_nodes = processes
        all_nodes = nodes[:total_nodes]
    if total_nodes == 0:
        return
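    # Spread processes round-robin: each node gets processes // total_nodes,
    # and the first (processes % total_nodes) nodes take one extra.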
    proc_per_node = processes // total_nodes
    remaining_procs = processes % total_nodes
    index = 0
    new_processes = current_procs
    rescheduled_procs = list(procs_2b_rescheduled)
    for cur_node in all_nodes:
        logger.debug('Schedule here %s' % cur_node)
        ip_address = cur_node.ip_address
        if not ip_address:
            ip_address = cur_node.private_ip_address
        logger.debug('ip_address=%s' % ip_address)
        #relative_path = output_storage_settings['type'] + '@' + settings['payload_destination']
        relative_path = output_storage_settings['type'] + '@' + relative_path_suffix
        procs_on_cur_node = proc_per_node
        if remaining_procs:
            procs_on_cur_node = proc_per_node + 1
            remaining_procs -= 1
        logger.debug('procs_cur_node=%d' % procs_on_cur_node)
        ids = get_procs_ids(procs_on_cur_node,
                            rescheduled_procs=rescheduled_procs)
        #index += len(ids)
        #logger.debug('index=%d' % index)
        put_proc_ids(relative_path, ids, ip_address, settings)
        new_processes = construct_lookup_table(
            ids, ip_address, new_processes,
            status='reschedule_ready',
            maximum_retry=int(settings['maximum_retry']))
        destination = get_url_with_credentials(settings,
            relative_path,
            is_relative_path=True,
            ip_address=ip_address)
        logger.debug('schedule destination=%s' % destination)
        makefile_path = get_make_path(destination)
        logger.debug('makefile_path=%s' % makefile_path)

        command = "cd %s; make %s" % (makefile_path,
            'start_schedule %s %s %s' % (settings['payload_name'],
                                         settings['filename_for_PIDs'],
                                         settings['process_output_dirname'],
                                         settings['smart_connector_input']))
        command_out = ''
        errs = ''
        logger.debug("starting command for %s" % ip_address)
        ssh = None
        try:
            ssh = open_connection(ip_address=ip_address, settings=settings)
            command_out, errs = run_command_with_status(ssh, command)
        except Exception as e:
            logger.error(e)
        finally:
            if ssh:
                ssh.close()
Example #3
def start_round_robin_reschedule(nodes, procs_2b_rescheduled, current_procs,
                                 settings, output_storage_settings,
                                 relative_path_suffix):
    total_nodes = len(nodes)
    all_nodes = list(nodes)
    processes = len(procs_2b_rescheduled)
    if total_nodes > processes:
        total_nodes = processes
        all_nodes = nodes[:total_nodes]
    if total_nodes == 0:
        return
    proc_per_node = processes // total_nodes
    remaining_procs = processes % total_nodes
    index = 0
    new_processes = current_procs
    rescheduled_procs = list(procs_2b_rescheduled)
    for cur_node in all_nodes:
        logger.debug('Schedule here %s' % cur_node)
        ip_address = cur_node.ip_address
        if not ip_address:
            ip_address = cur_node.private_ip_address
        logger.debug('ip_address=%s' % ip_address)
        #relative_path = output_storage_settings['type'] + '@' + settings['payload_destination']
        relative_path = output_storage_settings['type'] + '@' + relative_path_suffix
        procs_on_cur_node = proc_per_node
        if remaining_procs:
            procs_on_cur_node = proc_per_node + 1
            remaining_procs -= 1
        logger.debug('procs_cur_node=%d' % procs_on_cur_node)
        ids = get_procs_ids(procs_on_cur_node,
                            rescheduled_procs=rescheduled_procs)
        #index += len(ids)
        #logger.debug('index=%d' % index)
        put_proc_ids(relative_path, ids, ip_address, settings)
        new_processes = construct_lookup_table(ids,
                                               ip_address,
                                               new_processes,
                                               status='reschedule_ready',
                                               maximum_retry=int(
                                                   settings['maximum_retry']))
        destination = get_url_with_credentials(settings,
                                               relative_path,
                                               is_relative_path=True,
                                               ip_address=ip_address)
        logger.debug('schedule destination=%s' % destination)
        makefile_path = get_make_path(destination)
        logger.debug('makefile_path=%s' % makefile_path)
        command = "cd %s; make %s" % (
            makefile_path, 'start_schedule PAYLOAD_NAME=%s IDS=%s' %
            (settings['payload_name'], settings['filename_for_PIDs']))
        command_out = ''
        errs = ''
        logger.debug("starting command for %s" % ip_address)
        ssh = None
        try:
            ssh = open_connection(ip_address=ip_address, settings=settings)
            command_out, errs = run_command_with_status(ssh, command)
        except Exception as e:
            logger.error(e)
        finally:
            if ssh:
                ssh.close()
Example #4
    def run_task(self, ip_address, process_id, settings, run_settings):
        """
            Start the task on the instance, then hang and
            periodically check its state.
        """
        logger.debug("run_task %s" % ip_address)
        #ip = botocloudconnector.get_instance_ip(instance_id, settings)
        #ip = ip_address
        logger.debug("ip=%s" % ip_address)
        # curr_username = settings['username']
        #settings['username'] = '******'
        # ssh = sshconnector.open_connection(ip_address=ip,
        #                                    settings=settings)
        # settings['username'] = curr_username

        #relative_path = settings['type'] + '@' + settings['payload_destination'] + "/" + process_id
        relative_path_suffix = self.get_relative_output_path(settings)
        relative_path = settings['type'] + '@' + \
            os.path.join(relative_path_suffix, process_id)
        destination = get_url_with_credentials(settings,
                                               relative_path,
                                               is_relative_path=True,
                                               ip_address=ip_address)
        makefile_path = get_make_path(destination)
        ssh = None
        try:
            ssh = open_connection(ip_address=ip_address, settings=settings)
            logger.debug(settings['process_output_dirname'])
            try:
                self.hadoop_input = 'HADOOP_INPUT_%s' % self.contextid
                self.hadoop_output = 'HADOOP_OUTPUT_%s' % self.contextid
                hadoop = run_settings['%s/input/system/compplatform/hadoop' %
                                      django_settings.SCHEMA_PREFIX]
                sudo = False
                options = '%s %s  %s %s %s ' % (
                    settings['smart_connector_input'],
                    settings['process_output_dirname'],
                    settings['hadoop_home_path'], self.hadoop_input,
                    self.hadoop_output)
                logger.debug('options = %s ' % options)
                optional_args = self.get_optional_args(run_settings)
                if optional_args:
                    options += " %s" % optional_args
                logger.debug('options = %s ' % options)
                command, errs = run_make(ssh,
                                         makefile_path,
                                         'start_running_process  %s' % options,
                                         sudo=sudo)
            except KeyError:
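                # Assumed from context: run_settings has no hadoop compplatform
                # entry, so fall back to a plain invocation, escalating via sudo.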
                sudo = True
                command, errs = run_make(ssh,
                                         makefile_path,
                                         'start_running_process %s %s' %
                                         (settings['smart_connector_input'],
                                          settings['process_output_dirname']),
                                         sudo=sudo)
            logger.debug('execute_command=%s' % command)
        finally:
            if ssh:
                ssh.close()
Example #5
def start_round_robin_schedule(nodes, processes, schedule_index, settings, relative_path_suffix):
    total_nodes = len(nodes)
    all_nodes = list(nodes)
    if total_nodes > processes:
        total_nodes = processes
        all_nodes = nodes[:total_nodes]
    if total_nodes == 0:
        return
    proc_per_node = processes // total_nodes
    remaining_procs = processes % total_nodes
    index = schedule_index
    new_processes = []

    for cur_node in all_nodes:
        ip_address = cur_node[1]
        #relative_path = settings['type'] + '@' + settings['payload_destination']
        relative_path = settings['type'] + '@' + relative_path_suffix
        procs_on_cur_node = proc_per_node
        if remaining_procs:
            procs_on_cur_node = proc_per_node + 1
            remaining_procs -= 1
        logger.debug('procs_cur_node=%d' % procs_on_cur_node)
        ids = get_procs_ids(procs_on_cur_node, index=index)
        index += len(ids)
        logger.debug('index=%d' % index)
        put_proc_ids(relative_path, ids, ip_address, settings)
        new_processes = construct_lookup_table(
            ids, ip_address, new_processes,
            maximum_retry=int(settings['maximum_retry']))

        destination = get_url_with_credentials(
            settings,
            relative_path,
            is_relative_path=True,
            ip_address=ip_address)
        logger.debug('schedule destination=%s' % destination)
        makefile_path = get_make_path(destination)
        logger.debug('makefile_path=%s' % makefile_path)

        command = "cd %s; make %s" % (makefile_path,
            'start_schedule %s %s %s %s' % (
            settings['payload_name'], settings['filename_for_PIDs'], settings['process_output_dirname'],
            settings['smart_connector_input']))

        command_out = ''
        errs = ''
        logger.debug("starting command for %s" % ip_address)
        ssh = None
        try:
            ssh = open_connection(ip_address=ip_address, settings=settings)
            command_out, errs = run_command_with_status(ssh, command)
        except Exception as e:
            logger.error(e)
        finally:
            if ssh:
                ssh.close()
Example #6
def start_round_robin_schedule(nodes, processes, schedule_index, settings, relative_path_suffix):
    total_nodes = len(nodes)
    all_nodes = list(nodes)
    if total_nodes > processes:
        total_nodes = processes
        all_nodes = nodes[:total_nodes]
    if total_nodes == 0:
        return
    proc_per_node = processes // total_nodes
    remaining_procs = processes % total_nodes
    index = schedule_index
    new_processes = []

    for cur_node in all_nodes:
        ip_address = cur_node[1]
        #relative_path = settings['type'] + '@' + settings['payload_destination']
        relative_path = settings['type'] + '@' + relative_path_suffix
        procs_on_cur_node = proc_per_node
        if remaining_procs:
            procs_on_cur_node = proc_per_node + 1
            remaining_procs -= 1
        logger.debug('procs_cur_node=%d' % procs_on_cur_node)
        ids = get_procs_ids(procs_on_cur_node, index=index)
        index += len(ids)
        logger.debug('index=%d' % index)
        put_proc_ids(relative_path, ids, ip_address, settings)
        new_processes = construct_lookup_table(
            ids, ip_address, new_processes,
            maximum_retry=int(settings['maximum_retry']))

        destination = get_url_with_credentials(
            settings,
            relative_path,
            is_relative_path=True,
            ip_address=ip_address)
        logger.debug('schedule destination=%s' % destination)
        makefile_path = get_make_path(destination)
        logger.debug('makefile_path=%s' % makefile_path)
        command = "cd %s; make %s" % (makefile_path,
            'start_schedule PAYLOAD_NAME=%s IDS=%s' % (
            settings['payload_name'], settings['filename_for_PIDs']))
        command_out = ''
        errs = ''
        logger.debug("starting command for %s" % ip_address)
        ssh = None
        try:
            ssh = open_connection(ip_address=ip_address, settings=settings)
            command_out, errs = run_command_with_status(ssh, command)
        except Exception as e:
            logger.error(e)
        finally:
            if ssh:
                ssh.close()
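The node/process split used by both schedulers is plain integer arithmetic: a base share per node, with the remainder spread one-per-node from the front. A minimal standalone sketch of the same distribution (function name hypothetical, independent of the chiminey helpers):

def split_round_robin(num_procs, num_nodes):
    # Each node gets the base share; the first `remainder` nodes get one extra.
    base, remainder = divmod(num_procs, num_nodes)
    return [base + 1 if i < remainder else base for i in range(num_nodes)]

print(split_round_robin(7, 3))  # [3, 2, 2]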
Example #7
    def run_task(self, ip_address, process_id, settings, run_settings):
        """
            Start the task on the instance, then hang and
            periodically check its state.
        """
        logger.debug("run_task %s" % ip_address)
        #ip = botocloudconnector.get_instance_ip(instance_id, settings)
        #ip = ip_address
        logger.debug("ip=%s" % ip_address)
        # curr_username = settings['username']
        #settings['username'] = '******'
        # ssh = sshconnector.open_connection(ip_address=ip,
        #                                    settings=settings)
        # settings['username'] = curr_username

        #relative_path = settings['type'] + '@' + settings['payload_destination'] + "/" + process_id
        relative_path_suffix = self.get_relative_output_path(settings)
        relative_path = settings['type'] + '@' + \
            os.path.join(relative_path_suffix, process_id)
        destination = get_url_with_credentials(settings,
                                               relative_path,
                                               is_relative_path=True,
                                               ip_address=ip_address)
        makefile_path = get_make_path(destination)
        ssh = None
        try:
            ssh = open_connection(ip_address=ip_address, settings=settings)
            logger.debug(settings['process_output_dirname'])
            try:
                self.hadoop_input = 'HADOOP_INPUT_%s' % self.contextid
                self.hadoop_output = 'HADOOP_OUTPUT_%s' % self.contextid
                hadoop = run_settings['%s/input/system/compplatform/hadoop' % django_settings.SCHEMA_PREFIX]
                sudo = False
                options = '%s %s  %s %s %s ' % (settings['smart_connector_input'], settings['process_output_dirname'], settings['hadoop_home_path'], self.hadoop_input, self.hadoop_output)
                logger.debug('options = %s ' % options)
                optional_args = self.get_optional_args(run_settings)
                if optional_args:
                    options += " %s" % optional_args
                logger.debug('options = %s ' % options)
                command, errs = run_make(ssh, makefile_path,
                                         'start_running_process %s' % options,
                                         sudo=sudo)
            except KeyError:
                sudo = True
                command, errs = run_make(ssh, makefile_path,
                                         'start_running_process %s %s' %
                                         (settings['smart_connector_input'],
                                          settings['process_output_dirname']),
                                         sudo=sudo)
            logger.debug('execute_command=%s' % command)
        finally:
            if ssh:
                ssh.close()
Example #8
def _is_bootstrap_complete(ip, settings, destination):
    """
        Return True if package job on instance_id has is_job_finished
    """
    ssh = open_connection(ip_address=ip, settings=settings)
    makefile_path = get_make_path(destination)
    (command_out, err) = run_make(ssh, makefile_path, 'bootstrap_done')
    if command_out:
        logger.debug("command_out = %s" % command_out)
        for line in command_out:
            if 'Environment Setup Completed' in line:
                return True
    else:
        logger.warning(err)
    return False
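_is_bootstrap_complete and the _is_schedule_complete variants below are one-shot predicates; the surrounding framework presumably polls them until the remote make target reports completion. A minimal polling sketch (the function name, interval, and timeout here are invented for illustration):

import time

def wait_until(predicate, interval=10, timeout=600):
    # Repeatedly evaluate a completion check such as _is_bootstrap_complete
    # until it succeeds or the (hypothetical) timeout expires.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if predicate():
            return True
        time.sleep(interval)
    return False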
Example #10
def _is_schedule_complete(ip, settings, destination):
    """
            Return True if package job on instance_id has is_job_finished
        """
    ssh = open_connection(ip_address=ip, settings=settings)
    makefile_path = get_make_path(destination)
    command = "cd %s; make %s" % (makefile_path, 'schedule_done IDS=%s' %
                                  (settings['filename_for_PIDs']))
    command_out, _ = run_command_with_status(ssh, command)
    logger.debug('command=%s' % command)
    if command_out:
        logger.debug("command_out = %s" % command_out)
        for line in command_out:
            if 'All processes are scheduled' in line:
                return True
    return False
Example #11
def _is_schedule_complete(ip, settings, destination):
        """
            Return True if package job on instance_id has is_job_finished
        """
        ssh = open_connection(ip_address=ip, settings=settings)
        makefile_path = get_make_path(destination)
        command = "cd %s; make %s" % (makefile_path,
                                      'schedule_done IDS=%s' % (
                                          settings['filename_for_PIDs']))
        command_out, _ = run_command_with_status(ssh, command)
        logger.debug('command=%s' % command)
        if command_out:
            logger.debug("command_out = %s" % command_out)
            for line in command_out:
                if 'All processes are scheduled' in line:
                    return True
        return False
Example #12
def _start_bootstrap(instance, ip, settings, source, destination):
    """
        Start the task on the instance, then return.
    """
    logger.info("run_task %s" % str(instance))
    copy_directories(source, destination)
    makefile_path = get_make_path(destination)
    # TODO, FIXME:  need to have timeout for yum install make
    # and then test can access, otherwise, loop.
    install_make = 'yum install -y make'
    command_out = ''
    errs = ''
    logger.debug("starting command for %s" % ip)
    ssh = None
    try:
        ssh = open_connection(ip_address=ip, settings=settings)
        command_out, errs = run_command_with_status(ssh, install_make)
        logger.debug("command_out1=(%s, %s)" % (command_out, errs))
        run_make(ssh, makefile_path, 'start_bootstrap')
    except Exception as e:  # FIXME: consider using the reliability framework
        logger.error(e)
        raise
Example #14
    def is_job_finished(self, wait_class, ip_address, process_id, retry_left,
                        settings, relative_path_suffix):
        """
            Return True if package job on instance_id has is_job_finished
        """
        # TODO: maybe this should be a reusable library method?
        ip = ip_address
        logger.debug("ip=%s" % ip)
        curr_username = settings['username']
        # settings['username'] = '******'
        #relative_path = settings['type'] + '@' + settings['payload_destination'] + "/" + process_id
        relative_path = settings['type'] + '@' + os.path.join(
            relative_path_suffix, process_id)
        destination = get_url_with_credentials(settings,
                                               relative_path,
                                               is_relative_path=True,
                                               ip_address=ip)
        makefile_path = get_make_path(destination)
        ssh = None
        try:
            logger.debug('trying ssh')
            ssh = open_connection(ip_address=ip, settings=settings)
            logger.debug('successful ssh')
            (command_out, errs) = run_make(ssh, makefile_path,
                                           "process_running_done")
            ssh.close()
            logger.debug("command_out2=(%s, %s)" % (command_out, errs))
            if command_out:
                logger.debug("command_out = %s" % command_out)
                for line in command_out:
                    if "stopped" in line:
                        return True
        except Exception as e:

            # Failure detection and then management
            logger.debug('error is = %s' % e)
            process_failed = False
            node_failed = False
            logger.debug('Is there error? %s' %
                         wait_class.failure_detector.failed_ssh_connection(e))
            if wait_class.failure_detector.failed_ssh_connection(e):
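                # created_nodes entries appear to be tuples whose second field
                # is the IP address; node[0][0] below would then be the node id.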
                node = [
                    x for x in wait_class.created_nodes if x[1] == ip_address
                ]
                wait_class.failed_processes = wait_class.ftmanager.manage_failed_process(
                    settings, process_id, node[0], node[0][0], ip_address,
                    wait_class.failed_nodes, wait_class.executed_procs,
                    wait_class.current_processes, wait_class.all_processes,
                    wait_class.procs_2b_rescheduled)
                #wait_class.procs_2b_rescheduled.extend(rescheduled_prcs)
                '''
                if wait_class.failure_detector.node_terminated(settings, node[0][0]):
                    if not wait_class.failure_detector.recorded_failed_node(
                            wait_class.failed_nodes, ip_address):
                        wait_class.failed_nodes.append(node[0])
                    node_failed = True
                else:
                    if not retry_left:
                        process_failed = True
                    else:
                        process_lists = [wait_class.executed_procs, wait_class.current_processes,
                                         wait_class.all_processes]
                        wait_class.ftmanager.decrease_max_retry(
                            process_lists, ip_address, process_id)
                # Failure management
                if node_failed or process_failed:
                    process_lists = [wait_class.executed_procs,
                                     wait_class.current_processes, wait_class.all_processes]
                    if node_failed:
                        wait_class.ftmanager.flag_all_processes(process_lists, ip_address)
                    elif process_failed:
                        wait_class.ftmanager.flag_this_process(
                            process_lists, ip_address, process_id)
                    wait_class.failed_processes = wait_class.ftmanager.\
                        get_total_failed_processes(wait_class.executed_procs)
                    if wait_class.reschedule_failed_procs:
                        wait_class.ftmanager.collect_failed_processes(
                            wait_class.executed_procs, wait_class.procs_2b_rescheduled)

                '''
            else:
                raise
Example #15
    def is_job_finished(self, ip_address, process_id, retry_left, settings, relative_path_suffix):
        """
            Return True if package job on instance_id has is_job_finished
        """
        # TODO: maybe this should be a reusable library method?
        ip = ip_address
        logger.debug("ip=%s" % ip)
        curr_username = settings['username']
        settings['username'] = '******'
        #relative_path = settings['type'] + '@' + settings['payload_destination'] + "/" + process_id
        relative_path = settings['type'] + '@' + os.path.join(relative_path_suffix, process_id)
        destination = get_url_with_credentials(settings,
            relative_path,
            is_relative_path=True,
            ip_address=ip)
        makefile_path = get_make_path(destination)
        ssh = None
        try:
            logger.debug('trying ssh')
            ssh = open_connection(ip_address=ip, settings=settings)
            logger.debug('successful ssh')
            (command_out, errs) = run_make(ssh, makefile_path, "process_running_done")
            ssh.close()
            logger.debug("command_out2=(%s, %s)" % (command_out, errs))
            if command_out:
                logger.debug("command_out = %s" % command_out)
                for line in command_out:
                    if "stopped" in line:
                        return True
        except Exception as e:

            # Failure detection and then management
            logger.debug('error is = %s' % e)
            process_failed = False
            node_failed = False
            logger.debug('Is there error? %s' % self.failure_detector.failed_ssh_connection(e))
            if self.failure_detector.failed_ssh_connection(e):
                node = [x for x in self.created_nodes if x[1] == ip_address]
                self.failed_processes = self.ftmanager.manage_failed_process(
                    settings, process_id, node[0], node[0][0], ip_address,
                    self.failed_nodes, self.executed_procs, self.current_processes,
                    self.all_processes, self.procs_2b_rescheduled)
                #self.procs_2b_rescheduled.extend(rescheduled_prcs)
                '''
                if self.failure_detector.node_terminated(settings, node[0][0]):
                    if not self.failure_detector.recorded_failed_node(
                            self.failed_nodes, ip_address):
                        self.failed_nodes.append(node[0])
                    node_failed = True
                else:
                    if not retry_left:
                        process_failed = True
                    else:
                        process_lists = [self.executed_procs, self.current_processes,
                                         self.all_processes]
                        self.ftmanager.decrease_max_retry(
                            process_lists, ip_address, process_id)
                # Failure management
                if node_failed or process_failed:
                    process_lists = [self.executed_procs,
                                     self.current_processes, self.all_processes]
                    if node_failed:
                        self.ftmanager.flag_all_processes(process_lists, ip_address)
                    elif process_failed:
                        self.ftmanager.flag_this_process(
                            process_lists, ip_address, process_id)
                    self.failed_processes = self.ftmanager.\
                        get_total_failed_processes(self.executed_procs)
                    if self.reschedule_failed_procs:
                        self.ftmanager.collect_failed_processes(
                            self.executed_procs, self.procs_2b_rescheduled)

                '''
            else:
                raise