Example #1
class TestComponent(Component):

    def method1(self):
        return "return1"
    method1 = exposed(method1)

    def method2(self):
        return "return2"
    method2 = exposed(method2)

    def method3(self):
        return "return3"
class EventSimulator(Component):
    """Event Simulator. Manages time stamps, events, and the advancing of the clock

    Definition of an event, which is a dictionary of following keys:
        machine -- 0, 1, 2 ... represent the system (e.g. Intrepid or Eureka) where the event occurs
        type -- I (init), Q (submit job), S (start job), E (end job),
        datetime -- the date time at which the event occurs
        unixtime -- the unix time form for datetime
        jobid -- the job id associated with the event
        location -- the location where the event occurs, represented by node list or partition list
    """

    implementation = "evsim"
    name = "event-manager"

    def __init__(self, *args, **kwargs):

        Component.__init__(self, *args, **kwargs)
        self.event_list = [{'unixtime':0}]
        self.time_stamp = 0

        self.finished = False

        ###
        # samnickolay
        self.jobs_queue_time_utilizations = {}
        self.utilization_records = []
        self.next_slowdown_threshold_time_step = None
        self.high_priority_nodes = 0
        self.low_priority_nodes = 0
        # samnickolay
        ###

        # dwang:
        print("[dw_evsim] simu_name: %s. " %kwargs.get("name"))
        print("[dw_evsim] simu_times: %d. " %kwargs.get("times"))
        print("[dw_evsim] checkpoint: %s. " %kwargs.get("checkpoint"))
        #
        print("[dw_evsim] checkp_dsize: %s. " %kwargs.get("checkp_dsize"))
        print("[dw_evsim] checkp_bw_write: %s. " %kwargs.get("checkp_w_bandwidth"))
        print("[dw_evsim] checkp_bw_read: %s. " %kwargs.get("checkp_r_bandwidth"))
        print("[dw_evsim] checkp_interval: %s. " %kwargs.get("checkp_t_internval"))
        # dwang
        self.bgsched = Sim_bg_Sched(**kwargs)
        #self.csched = Sim_Cluster_Sched()

        self.mmon = metricmon()

        self.go_next = True

    def set_go_next(self, bool_value):
        self.go_next = bool_value
    set_go_next = exposed(set_go_next)

    def get_go_next(self,):
        return self.go_next
    get_go_next = exposed(get_go_next)

    def events_length(self):
        return len(self.event_list)

    def add_event(self, ev_spec):
        '''insert time stamps in the same order'''

        time_sec = ev_spec.get('unixtime')
        if time_sec is None:
            #no unix time provided; cannot order the event
            return -1

        if 'jobid' not in ev_spec:
            ev_spec['jobid'] = 0
        if 'location' not in ev_spec:
            ev_spec['location'] = []

        pos = self.events_length()

        while time_sec < self.event_list[pos-1].get('unixtime'):
            pos = pos - 1

        self.event_list.insert(pos, ev_spec)
        #print "insert time stamp ", ev_spec, " at pos ", pos
        return pos
    add_event = exposed(add_event)

    # dwang:
    def del_event(self, jobid_sel):
        for temp_elem in self.event_list:
            if temp_elem.get('jobid') == jobid_sel:
                # print "[DEL_EVENT] temp_elem j_id: ", temp_elem.get('jobid')
                # print "[DEL_EVENT] temp_elem j_loc: ", temp_elem.get('location')
                # print "[DEL_EVENT] temp_elem j_unixt: ", temp_elem.get('unixtime')
                # print "[DEL_EVENT] temp_elem type: ", temp_elem.get('type')
                self.event_list.remove(temp_elem)
    del_event = exposed(del_event)
    # dwang

    def get_time_span(self):
        '''return the whole time span'''
        starttime = self.event_list[1].get('unixtime')
        endtime = self.event_list[-1].get('unixtime')
        timespan = endtime - starttime
        return timespan
    get_time_span = exposed(get_time_span)

    def get_current_time_stamp(self):
        '''return current time stamp'''
        return self.time_stamp

    def get_current_time(self):
        '''return current unix time'''
        return self.event_list[self.time_stamp].get('unixtime')
    get_current_time = exposed(get_current_time)

    def get_current_date_time(self):
        '''return current date time'''
        return self.event_list[self.time_stamp].get('datetime')
    get_current_date_time = exposed(get_current_date_time)

    def get_current_event_type(self):
        '''return current event type'''
        return self.event_list[self.time_stamp].get('type')
    get_current_event_type = exposed(get_current_event_type)

    def get_current_event_job(self):
        '''return current event job'''
        return self.event_list[self.time_stamp].get('jobid')
    get_current_event_job = exposed(get_current_event_job)

    def get_current_event_location(self):
        return self.event_list[self.time_stamp].get('location')
    get_current_event_location = exposed(get_current_event_location)

    def get_current_event_machine(self):
        '''return machine which the current event belongs to'''
        return self.event_list[self.time_stamp].get('machine')

    def get_current_event_all(self):
        '''return current event'''
        return self.event_list[self.time_stamp]

    def get_next_event_time_sec(self):
        '''return the next event time'''
        if self.time_stamp < len(self.event_list) - 1:
            return self.event_list[self.time_stamp + 1].get('unixtime')
        else:
            return -1
    get_next_event_time_sec = exposed(get_next_event_time_sec)


    def is_finished(self):
        return self.finished
    is_finished = exposed(is_finished)

    def clock_increment(self):
        '''the current time stamp increments by 1'''
        if self.time_stamp < len(self.event_list) - 1:
            self.time_stamp += 1
            if SHOW_SCREEN_LOG:
                print str(self.get_current_date_time()) + \
                "[%s]: Time stamp is incremented by 1, current time stamp: %s " % (self.implementation, self.time_stamp)
        else:
            self.finished = True

        return self.time_stamp
    clock_increment = exposed(clock_increment)

    def add_init_events(self, jobspecs, machine_id):   ###EVSIM change here
        """add initial submission events based on input jobs and machine id"""

        for jobspec in jobspecs:
            evspec = {}
            evspec['machine'] = machine_id
            evspec['type'] = "Q"
            evspec['unixtime'] = float(jobspec.get('submittime'))
            evspec['datetime'] = sec_to_date(float(jobspec.get('submittime')))
            evspec['jobid'] = jobspec.get('jobid')
            evspec['location'] = []
            self.add_event(evspec)
    add_init_events = exposed(add_init_events)


    def init_unhold_events(self, machine_id):
        """add unholding event"""
        if not self.event_list:
            return

        first_time_sec = self.event_list[1]['unixtime']
        last_time_sec = self.event_list[-1]['unixtime']

        unhold_point = first_time_sec + UNHOLD_INTERVAL + machine_id
        while unhold_point < last_time_sec:
            evspec = {}
            evspec['machine'] = machine_id
            evspec['type'] = "C"
            evspec['unixtime'] = unhold_point
            evspec['datetime'] = sec_to_date(unhold_point)
            self.add_event(evspec)

            unhold_point += UNHOLD_INTERVAL + machine_id
    init_unhold_events = exposed(init_unhold_events)

    def init_mmon_events(self):
        """add metrics monitor points into time stamps"""
        if not self.event_list:
            return

        first_time_sec = self.get_first_mmon_point(self.event_list[1]['datetime'])
        last_time_sec = self.event_list[-1]['unixtime']
        machine_id = MMON

        mmon_point = first_time_sec + MMON_INTERVAL
        while mmon_point < last_time_sec:
            evspec = {}
            evspec['machine'] = machine_id
            evspec['unixtime'] = mmon_point
            evspec['datetime'] = sec_to_date(mmon_point)
            self.add_event(evspec)
            mmon_point += MMON_INTERVAL
    init_mmon_events = exposed(init_mmon_events)

    def get_first_mmon_point(self, date_time):
        "based on the input date time (%m/%d/%Y %H:%M:%S), get the next epoch time that is at the beginning of an hour"
        segs = date_time.split()
        hours = segs[1].split(":")
        new_datetime = "%s %s:%s:%s" %  (segs[0], hours[0], '00', '00')
        new_epoch = date_to_sec(new_datetime) + 3600
        return new_epoch

    def print_events(self):
        print "total events:", len(self.event_list)
        i = 0
        for event in self.event_list:
            print event
            i += 1
            if i == 25:
                break

    # dwang:
    #def event_driver(self, preempt):
    #def event_driver(self, preempt,simu_name,simu_tid):
    def event_driver(self, preempt, fp_backf, fp_pre_bj, checkpoint_opt, checkp_dsize,
                     checkp_w_bandwidth, checkp_r_bandwidth, checkp_t_internval,
                     checkp_t_internval_pcent, checkp_heur_opt, job_length_type,
                     checkp_overhead_percent):
        # dwang
        """core part that drives the clock"""
        # print "[dw_evsim] current t_stamp: ", self.time_stamp
        # print "[dw_evsim] total t_stamp: ", len(self.event_list)
        # print "[dw_evsim] checkpoint_opt: ", checkpoint_opt
        
        if self.go_next:
            ##
            # samnickolay
            # if an RT job will hit the slowdown threshold before the next
            # scheduled time step, insert a synthetic time step, then clear
            # the next_slowdown_threshold_time_step variable
            if self.next_slowdown_threshold_time_step is not None and \
                            self.get_next_event_time_sec() > self.next_slowdown_threshold_time_step and \
                            self.get_current_time() < self.next_slowdown_threshold_time_step:
                evspec = {}
                evspec['unixtime'] = self.next_slowdown_threshold_time_step
                evspec['datetime'] = sec_to_date(self.next_slowdown_threshold_time_step)
                evspec['machine'] = INTREPID

                self.add_event(evspec)
                # global next_slowdown_threshold_time_step
                self.next_slowdown_threshold_time_step = None

            # record the utilization
            current_utilization = ComponentProxy("system").get_utilization_rate(0)
            old_time = self.bgsched.get_current_time()

            #the clock is incremented only when the go_next tag is true; this
            #lets the scheduler schedule multiple jobs at the same time stamp
            self.clock_increment()

            new_time = self.bgsched.get_current_time()
            from bqsim import TOTAL_NODES
            utilization_chunk = (old_time, new_time, current_utilization, self.low_priority_nodes, self.high_priority_nodes)
                                 # float(self.low_priority_nodes)/TOTAL_NODES, float(self.high_priority_nodes)/TOTAL_NODES)

            # from bqsim import utilization_records
            # global utilization_records

            if old_time > 0.0:
                self.utilization_records.append(utilization_chunk)

            if self.time_stamp % 50 == 0:
                print "t=" + str(self.time_stamp) + ' (' + str(len(self.event_list)) + ')'

            # record utilizations for when a job first arrives
            cur_event = self.get_current_event_type()
            if cur_event == "Q":
                cur_event_job = self.get_current_event_job()
                # from bqsim import jobs_queue_time_utilizations
                self.jobs_queue_time_utilizations[int(cur_event_job)] = current_utilization

            # samnickolay
            ###

            # print "[dw_evsim] current t_stamp: ", self.time_stamp
            # print "[dw_evsim] total t_stamp: ", len(self.event_list)


        machine = self.get_current_event_machine()
# 
#         print "event_[%s]: %s, machine=%s, event=%s, job=%s" % (
#                                            self.implementation,
#                                            self.get_current_date_time(),
#                                            self.get_current_event_machine(),
#                                            self.get_current_event_type(),
#                                            self.get_current_event_job(),
#                                            )
#
        if machine == INTREPID:
            # dwang:
            #self.bgsched.schedule_jobs(preempt)
            #self.bgsched.schedule_jobs(preempt,simu_name,simu_tid)

            slowdown_threshold_time_step = None
            # initialize dispatch results so the bookkeeping below cannot hit
            # a NameError when a non-"sam" checkpoint_opt is selected
            results = None
            low_priority_queue_jobs = None
            high_priority_queue_jobs = None

            # basic_highpQ
            if checkpoint_opt == "highpQ":
                self.bgsched.schedule_jobs(preempt,fp_backf)
            # highpQ_resv 
            elif checkpoint_opt == "highpQ_resv":
                self.bgsched.schedule_jobs_hpQ_resv(preempt,fp_backf)
            # wcheckp
            elif checkpoint_opt == "v0":
                self.bgsched.schedule_jobs_wcheckp_v0(preempt,fp_backf)
            # wcheckp_jrestart
            elif checkpoint_opt == "v1":
                self.bgsched.schedule_jobs_wcheckp_v1(preempt,fp_backf,fp_pre_bj,
                                                        checkp_heur_opt) 
            elif checkpoint_opt == "v1H_wth":
                self.bgsched.schedule_jobs_wcheckp_v1H_wth(preempt,fp_backf,fp_pre_bj,
                                                            checkp_heur_opt) 
            # wcheckp_jresume
            elif checkpoint_opt == "v2":
                self.bgsched.schedule_jobs_wcheckp_v2(preempt,fp_backf,fp_pre_bj,
                                                        checkp_dsize, checkp_w_bandwidth, checkp_r_bandwidth,
                                                        checkp_heur_opt)
            # wcheckp_jresume
            elif checkpoint_opt == "v2p":
                self.bgsched.schedule_jobs_wcheckp_v2p(preempt,fp_backf,fp_pre_bj,
                                                        checkp_dsize, checkp_w_bandwidth, checkp_r_bandwidth, checkp_t_internval,
                                                        checkp_heur_opt)
            # wcheckp_jresume
            elif checkpoint_opt == "v2p_app":
                self.bgsched.schedule_jobs_wcheckp_v2p_app(preempt,fp_backf,fp_pre_bj,
                                                            checkp_dsize, checkp_w_bandwidth, checkp_r_bandwidth, checkp_t_internval_pcent,
                                                            checkp_heur_opt)

            # ...
            # dwang

            ###
            # samnickolay
            elif checkpoint_opt == "baseline":
                self.bgsched.schedule_jobs_baseline(preempt, fp_backf)


            elif checkpoint_opt == "v2_sam_v1":
                results = self.bgsched.schedule_jobs_wcheckp_v2_sam_v1(preempt, fp_backf, fp_pre_bj,
                                                             checkp_dsize, checkp_w_bandwidth, checkp_r_bandwidth,
                                                             checkp_heur_opt, job_length_type)

            elif checkpoint_opt == "v2p_sam_v1":
                results = self.bgsched.schedule_jobs_wcheckp_v2p_sam_v1(preempt,fp_backf,fp_pre_bj, checkp_dsize, checkp_w_bandwidth,
                                                          checkp_r_bandwidth, checkp_t_internval, checkp_heur_opt,
                                                              job_length_type)
            elif checkpoint_opt == "v2p_app_sam_v1":
                results = self.bgsched.schedule_jobs_wcheckp_v2p_app_sam_v1(preempt, fp_backf, fp_pre_bj, checkp_dsize,
                                                                  checkp_w_bandwidth, checkp_r_bandwidth, checkp_heur_opt,
                                                                  job_length_type, checkp_overhead_percent)
            try:
                if results is not None:
                    slowdown_threshold_time_step, low_priority_queue_jobs, high_priority_queue_jobs = results

                if slowdown_threshold_time_step is not None:
                    # global next_slowdown_threshold_time_step
                    self.next_slowdown_threshold_time_step = slowdown_threshold_time_step

                if low_priority_queue_jobs is not None:
                    low_priority_nodes = []
                    for jobid, low_priority_queue_job in low_priority_queue_jobs.iteritems():
                        # low_priority_nodes += int(low_priority_queue_job.nodes)
                        low_priority_nodes.append(low_priority_queue_job.nodes)
                    self.low_priority_nodes = low_priority_nodes

                if high_priority_queue_jobs is not None:
                    high_priority_nodes = []
                    for jobid, high_priority_queue_job in high_priority_queue_jobs.iteritems():
                        # high_priority_nodes += int(high_priority_queue_job.nodes)
                        high_priority_nodes.append(high_priority_queue_job.nodes)
                    self.high_priority_nodes = high_priority_nodes
            except Exception:
                # tolerate scheduler variants whose return value does not
                # unpack as (threshold, low_priority_jobs, high_priority_jobs)
                pass

            # samnickolay
            ###


        if machine == EUREKA:
            # note: self.csched is commented out in __init__ above, so this
            # branch is currently inert
            self.csched.schedule_jobs()
        if machine == MMON:
            self.mmon.metric_monitor()

        if self.go_next:
            ComponentProxy("queue-manager").calc_loss_of_capacity()
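
add_event above keeps event_list ordered by 'unixtime', scanning backward so that a new event lands after any events with an equal timestamp. A standalone sketch of the same ordered insert using bisect (illustrative only; the function name is hypothetical):

import bisect

def insert_event_sorted(event_list, ev_spec):
    '''equivalent of EventSimulator.add_event's ordering: insert after
    any events sharing the same unixtime'''
    keys = [ev['unixtime'] for ev in event_list]
    pos = bisect.bisect_right(keys, ev_spec['unixtime'])
    event_list.insert(pos, ev_spec)
    return pos

events = [{'unixtime': 0}]
insert_event_sorted(events, {'unixtime': 30, 'jobid': 2})
insert_event_sorted(events, {'unixtime': 10, 'jobid': 1})
# events is now ordered by unixtime: 0, 10, 30
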
Example #3
class Qsimulator(Simulator):
    '''Cobalt Queue Simulator'''

    implementation = "qsim"
    name = "queue-manager"
    alias = Simulator.name

    def __init__(self, *args, **kwargs):

        print "kwargs= ",  kwargs

        #initialize partitions
        Simulator.__init__(self, *args, **kwargs)
        partnames = self._partitions.keys()
        self.init_partition(partnames)
        self.part_size_list = []

        for part in self.partitions.itervalues():
            if int(part.size) not in self.part_size_list:
                self.part_size_list.append(int(part.size))
        self.part_size_list.sort()

        #get command line parameters
        self.FAILURE_FREE = True
        self.FRACTION = kwargs.get("fraction", 1)
        self.workload_file =  kwargs.get("workload")
        self.output_log = kwargs.get("outputlog")
        self.failure_log = kwargs.get('failurelog')

        self.weibull = kwargs.get('weibull')
        if self.weibull:
            self.SCALE = float(kwargs.get('scale'))
            if self.SCALE == 0:
                self.SCALE = default_SCALE
            self.SHAPE = float(kwargs.get('shape'))
            if self.SHAPE == 0:
                self.SHAPE = default_SHAPE

        self.fault_aware = kwargs.get('faultaware')
        self.SENSITIVITY = default_SENSITIVITY
        self.SPECIFICITY = default_SPECIFICITY
        if self.fault_aware:
            self.SENSITIVITY = float(kwargs.get('sensitivity', default_SENSITIVITY))
            self.SPECIFICITY = float(kwargs.get('specificity', default_SPECIFICITY))

        if self.failure_log or self.weibull:
            self.FAILURE_FREE = False

        #initialize time stamps and job queues
        #time stamp format: ('EVENT', 'time_stamp_date', time_stamp_second, {'job_id':str(jobid), 'location':[partition1, partition2,...]})
        self.time_stamps = [('I', '0', 0, {})]
        self.cur_time_index = 0
        self.queues = SimQueueDict(policy=kwargs['policy'])
        self.init_queues()
        self.visible_jobs = []

        #initialize failures
        self.failure_dict = {}
        if not self.FAILURE_FREE:
            if self.failure_log:
                #if specified failure log, use log trace failure
                self.inject_failures()
            elif self.weibull:
                #else MAKE failures by Weibull distribution
                self.make_failures()

        #initialize PBS-style logger
        self.pbslog = PBSlogger(self.output_log)

        #initialize debug logger
        self.dbglog = PBSlogger(self.output_log+"-debug")

        #finish tag
        self.finished = False

        #tag for controlling time stamp increment
        self.increment_tag = True

        #register local alias "system" for this component
        local_components["system"] = self
        print "Simulation starts:"

    def register_alias(self):
        '''register an alternate name for the Qsimulator by registering the
        same location in slp under another name; in this case 'system' is the
        alternate name'''
        try:
            slp = Cobalt.Proxy.ComponentProxy("service-location", defer=False)
        except ComponentLookupError:
            print >> sys.stderr, "unable to find service-location"
            qsim_quit()
        svc_location = slp.locate(self.name)
        if svc_location:
            slp.register(self.alias, svc_location)
    register_alias = automatic(register_alias, 30)

    def is_finished(self):
        return self.finished
    is_finished = exposed(is_finished)

    def init_partition(self, namelist):
        '''add all partitions and apply activate and enable'''
        func = self.add_partitions
        args = ([{'tag':'partition', 'name':partname, 'size':"*",
                  'functional':False, 'scheduled':False, 'queue':"*",
                  'deps':[]} for partname in namelist],)
        func(*args)

        func = self.set_partitions
        args = ([{'tag':'partition', 'name':partname} for partname in namelist],
                {'scheduled':True, 'functional': True})
        func(*args)


    def get_current_time_event(self):
        return self.time_stamps[self.cur_time_index][0]

    def get_current_time(self):
        '''get current time in date format'''
        return self.time_stamps[self.cur_time_index][1]

    def get_current_time_sec(self):
        return self.time_stamps[self.cur_time_index][2]
    get_current_time_sec = exposed(get_current_time_sec)

    def get_current_time_job(self):
        ret = None
        if 'jobid' in self.time_stamps[self.cur_time_index][3]:
            ret = self.time_stamps[self.cur_time_index][3]['jobid']
        return ret

    def get_current_time_partition(self):
        if self.get_current_time_event() in set(["R","S"]):
            return self.time_stamps[self.cur_time_index][3]['location']
        else:
            return None

    def get_current_time_stamp(self):
        '''get current time stamp index'''
        return self.cur_time_index
    get_current_time_stamp = exposed(get_current_time_stamp)

    def get_current_time_stamp_tuple(self):
        return  self.time_stamps[self.cur_time_index]

    def time_increment(self):
        '''the current time stamp increments by 1'''
        if self.cur_time_index < len(self.time_stamps) - 1:
            self.cur_time_index += 1
            print " "
            print str(self.get_current_time()) + \
            " Time stamp is incremented by 1, current time stamp: " + \
            str(self.cur_time_index)
        else:
            print str(self.get_current_time()) + \
            " Reached maximum time stamp: %s, simulation finished! " \
            % (str(self.cur_time_index))
            self.finished = True
            self.pbslog.closeLog()
            qsim_quit()  #simulation completed, exit!!!
        return self.cur_time_index

    def insert_time_stamp(self, new_time_date, event, info):
        '''insert time stamps in the same order'''
        if event not in SET_event:
            print "invalid event type,", event
            return

        new_time_sec = date_to_sec(new_time_date)
        new_time_tuple = (event, new_time_date, new_time_sec, info)

        pos = len(self.time_stamps)

        while new_time_sec < self.time_stamps[pos-1][2]:
            pos = pos - 1

        self.time_stamps.insert(pos, new_time_tuple)
        #print "insert time stamp ", new_time_tuple, " at pos ", pos
        return pos

    def init_queues(self):
        '''parses the work load log file, initializes queues and sorted time
        stamp list'''

        raw_jobs = parse_work_load(self.workload_file)
        specs = []

        tag = 0
        for key in raw_jobs:
            spec = {'valid':True}
            tmp = raw_jobs[key]

            spec['jobid'] = tmp.get('jobid')
            spec['queue'] = tmp.get('queue')

            #convert submittime from "%m/%d/%Y %H:%M:%S" to Unix time sec
            format_sub_time = tmp.get('submittime')
            if format_sub_time:
                spec['submittime'] = date_to_sec(format_sub_time)
                spec['first_subtime'] = spec['submittime']  #set the first submit time
            else:
                spec['valid'] = False

            #convert walltime from 'hh:mm:ss' to minutes (stored as a string)
            format_walltime = tmp.get('Resource_List.walltime')
            spec['walltime'] = 0
            if format_walltime:
                segs = format_walltime.split(':')
                spec['walltime'] = str(int(segs[0])*60 + int(segs[1]))
            else:  #invalid job entry, discard
                spec['valid'] = False

            if tmp.get('start') and tmp.get('end'):
                act_run_time = float(tmp.get('end')) - float(tmp.get('start'))
                spec['runtime'] = str(round(act_run_time, 1))

                if IDEALWALLTIME:
                    wtime = (round(act_run_time / 60, 2) + float(spec['walltime']))/2
                    #wtime = act_run_time / 60
                    spec['walltime'] = str(round(wtime, 2))
            else:
                spec['valid'] = False

            if tmp.get('Resource_List.nodect'):
                spec['nodes'] = tmp.get('Resource_List.nodect')

            else:  #invalid job entry, discard
                spec['valid'] = False

            if tmp.get('user'):
                spec['user'] = tmp.get('user')
            if tmp.get('project'):
                spec['project'] = tmp.get('project')

            spec['state'] = 'invisible'
            spec['start_time'] = '0'
            spec['end_time'] = '0'

            #add the job spec to the spec list
            if spec['valid']:
                specs.append(spec)

        #adjust workload density
        if self.FRACTION != 1:
            tune_workload(specs, self.FRACTION)
            print "workload adjusted: last submit job=", specs[-1].get('submittime')

        print "Initializing jobs and time stamps list, wait one moment... ..."
        for spec in specs:
            format_sub_time = sec_to_date(spec['submittime'])
            if format_sub_time not in self.time_stamps:
                self.insert_time_stamp(format_sub_time, 'Q', {'jobid':str(spec['jobid'])})

        print "total job number:", len(specs)
        self.add_jobs(specs)

        return 0

    def log_job_event(self, eventtype, timestamp, spec):
        '''log job events(Queue,Start,End) to PBS-style log'''
        def len2(_input):
            '''zero-pad a value to at least two digits'''
            return str(_input).zfill(2)
        if eventtype == 'Q':  #submitted(queued) for the first time
            message = "%s;Q;%d;queue=%s" % (timestamp, spec['jobid'], spec['queue'])
        elif eventtype == 'R':  #resume running after failure recovery
            message = "%s;R;%s" % (timestamp, ":".join(spec['location']))
        else:
            wall_time = spec['walltime']
            walltime_minutes = len2(int(float(wall_time)) % 60)
            walltime_hours = len2(int(float(wall_time)) // 60)
            log_walltime = "%s:%s:00" % (walltime_hours, walltime_minutes)
            if eventtype == 'S':  #start running
                message = "%s;S;%d;queue=%s qtime=%s Resource_List.nodect=%s Resource_List.walltime=%s start=%s exec_host=%s" % \
                (timestamp, spec['jobid'], spec['queue'], spec['submittime'],
                 spec['nodes'], log_walltime, spec['start_time'], ":".join(spec['location']))
            elif eventtype == 'E':  #end
                message = "%s;E;%d;queue=%s qtime=%s Resource_List.nodect=%s Resource_List.walltime=%s start=%s end=%f exec_host=%s runtime=%s" % \
                (timestamp, spec['jobid'], spec['queue'], spec['submittime'], spec['nodes'], log_walltime, spec['start_time'],
                 round(float(spec['end_time']), 1), ":".join(spec['location']),
                 spec['runtime'])
            elif eventtype == 'F':  #failure
                frag_runtime = round(float(spec['failure_time']) - float(spec['start_time']), 1)  #running time before failure(after the latest start)
                message = "%s;F;%d;queue=%s qtime=%s Resource_List.nodect=%s Resource_List.walltime=%s exec_host=%s start=%s frag_runtime=%s complete=%f" % \
                (timestamp, spec['jobid'], spec['queue'], spec['submittime'],
                 spec['nodes'], log_walltime, ":".join(spec['location']), spec['start_time'],
                 frag_runtime, round(frag_runtime / float(spec['runtime']), 2)
                )
            elif eventtype == 'P':  #pending
                message = "%s;P;%d;queue=%s qtime=%s Resource_List.nodect=%s Resource_List.walltime=%s exec_host=%s start=%s" % \
                (timestamp, spec['jobid'], spec['queue'], spec['submittime'],
                 spec['nodes'], log_walltime, ":".join(spec['location']), spec['start_time'],
                )
            else:
                print "invalid event type, type=", type
                return
        self.pbslog.LogMessage(message)

    def get_new_states(self, jobspec):
        '''return the new state updates of a specific job at specific time
        stamp, including invisible->queued, running->ended'''

        updates = {}
        curstate = jobspec['state']
        newstate = curstate
        job_id = jobspec['jobid']

        cur_event = self.get_current_time_event()

        #handle job submission event
        if cur_event == 'Q' and curstate == "invisible":
            newstate = "queued"
            updates['is_runnable'] = True
            updates['is_visible'] = True
            self.log_job_event('Q', self.get_current_time(), jobspec)

        #handle job completion event
        elif cur_event == 'E' and curstate == "running":
            newstate = "ended"
            updates['is_runnable'] = False
            updates['has_resources'] = False
            updates['is_visible'] = False

            #release partition immediately
            partitions = jobspec['location']
            for partition in partitions:
                self.release_partition(partition)
            self.queues.del_jobs([{'jobid':job_id}])

            #write to output log
            if jobspec['end_time']:
                end = float(jobspec['end_time'])
            else:
                end = 0
            end_datetime = sec_to_date(end)
            self.log_job_event('E', end_datetime, jobspec)

        #handle job failure event
        elif cur_event == 'F' and curstate == "running":
            print "entered failure handling"

            #release partition
            partitions = jobspec['location']
            for partition in partitions:
                print "partition %s start repairing" % (partition)
                self.start_repair_partition(partition)

            #write to output log
            if jobspec['failure_time']:
                fail = float(jobspec['failure_time'])
            else:
                fail = 0
            failure_datetime = sec_to_date(fail)
            self.log_job_event('F', failure_datetime, jobspec)
            print self.get_current_time(), " job %d failed at %s!!" % (job_id, ":".join(jobspec['location']))

            rec_updates = self.recovery_mgr(jobspec)

            if rec_updates:
                updates.update(rec_updates)

            updates['has_resources'] = False

            if 'state' in updates:
                newstate = updates['state']

            if CHECKPOINT:
                print "enter checkpoint handling****"
                #runtime before failed after latest start
                frag_runtime = float(jobspec['failure_time']) - float(jobspec['start_time'])
                updates['remain_time'] = jobspec['remain_time'] - frag_runtime

        else:#other event
            pass

        if updates and curstate != newstate:
            print self.get_current_time(), "state changed, job", job_id, \
             ":", curstate, "->", newstate
            updates['state'] = newstate

        return updates

    def update_job_states(self, specs, updates):
        '''update the state of the jobs associated to the current time stamp'''

        def _update_job_states(job, newattr):
            '''callback function to update job states'''
            temp = job.to_rx()
            newattr = self.get_new_states(temp)
            if newattr:
                temp.update(newattr)
                job.update(newattr)

        ids_str = self.get_current_time_job()
        ids = ids_str.split(':')
        cur_event = self.get_current_time_event()
        for id in ids:
            for spec in specs:
                spec['jobid'] = int(id)
            # fetch/update the jobs matching this id; the callback applies
            # the new states computed by get_new_states
            ret_jobs = self.queues.get_jobs(specs, _update_job_states, updates)
            if cur_event == "Q":
                self.visible_jobs.extend(ret_jobs)
            elif cur_event=="E":
                self.visible_jobs = [j for j in self.visible_jobs if j not in ret_jobs]
        return 0

    def run_job_updates(self, jobspec, newattr):
        ''' return the state updates (including state queued -> running,
        setting the start_time, end_time)'''
        updates = {}

        #print "enter run_job_updates, jobspec=", jobspec

        start = self.get_current_time_sec()
        updates['start_time'] = start
        updates['starttime'] = start

        updates['state'] = 'running'
        updates['system_state'] = 'running'
        updates['is_runnable'] = False
        updates['has_resources'] = True

        print self.get_current_time(), "run job state change, job", jobspec['jobid'], \
             ":", jobspec['state'], "->", updates['state']

        #determine whether the job is going to fail before completion
        location = newattr['location']
        duration = jobspec['remain_time']
        #print "duration=", duration
        nearest_failure = self.get_next_failure(location, start, duration)
        if (nearest_failure):
            updates['failure_time'] = date_to_sec(nearest_failure)
            new_time_stamp = nearest_failure
            self.insert_time_stamp(new_time_stamp, 'F', {'jobid':str(jobspec['jobid'])})
        else:  # will complete
            end = start + duration
            updates['end_time'] = end
            new_time_stamp = sec_to_date(end)
            #print "new_time_stamp=", new_time_stamp
            self.insert_time_stamp(new_time_stamp, 'E', {'jobid':str(jobspec['jobid'])})

        updates.update(newattr)

        return updates

    def start_job(self, specs, updates):
        '''update the job state and start_time and end_time when cqadm --run
        is issued to a group of jobs'''
        partitions = updates['location']
        for partition in partitions:
            self.reserve_partition(partition)

        def _start_job(job, newattr):
            '''callback function to update job start/end time'''
            temp = job.to_rx()
            newattr = self.run_job_updates(temp, newattr)
            temp.update(newattr)
            job.update(newattr)
            self.log_job_event('S', self.get_current_time(), temp)
        return self.queues.get_jobs(specs, _start_job, updates)

    def add_jobs(self, specs):
        '''Add a job, currently for unit test only'''
        response = self.queues.add_jobs(specs)
        return response
    add_jobs = exposed(query(add_jobs))

    def get_jobs(self, specs):
        '''get a list of jobs, each time triggers time stamp increment and job
        states update'''

        jobs = []
        if self.increment_tag:
            self.time_increment()
            eventtype = self.get_current_time_event()
            print "current event type====", eventtype
            if eventtype == "R":
                self.release_repaired_partition()

                #if the repaired partition is associated with pending jobs,
                #return an empty list to the scheduler so the next time stamp
                #restarts the pending jobs rather than scheduling other jobs;
                #this avoids running multiple jobs on the same partition
                #(once a bug, now solved)
                if self.get_current_time_job():
                    return jobs

            elif eventtype == "S":

                self.restart_pending_job()
                return jobs

            else:
                self.update_job_states(specs, {})

        if len(self.recovering_jobs) > 0:
            self.update_recovering_jobs({})

        self.increment_tag = True

        jobs = self.visible_jobs
#        print "running jobs=", [job.jobid for job in self.running_jobs]
#        print "queueing jobs=", [job.jobid for job in self.queuing_jobs]
#        print "visible jobs=", [job.jobid for job in self.visible_jobs]
#        print "return jobs=", len(jobs)

        return jobs
    get_jobs = exposed(query(get_jobs))

    def update_recovering_jobs(self, updates):
        print "enter update_recovering_jobs()"

        def _update_recovering_jobs(job, newattr):
            '''callback function to update job states'''
            temp = job.to_rx()
            print "temp=", temp
            newattr = self.recovery_mgr(temp)
            print "update_recovering_jobs newattr=", newattr
            print "temp=", temp
            if newattr:
                temp.update(newattr)
                job.update(newattr)

        ids = [job.jobid for job in self.recovering_jobs]
        print "ids=", ids

        ret = self.queues.get_jobs([{'tag':"job", 'state': "recovering"}], _update_recovering_jobs, updates)
        return 0

    def _get_queuing_jobs(self):
        return [job for job in self.visible_jobs if job.is_runnable]
    queuing_jobs = property(_get_queuing_jobs)

    def _get_running_jobs(self):
        return [job for job in self.visible_jobs if job.has_resources]
    running_jobs = property(_get_running_jobs)

    def _get_recovering_jobs(self):
        return self.queues.get_jobs([{'jobid':"*", 'state':"recovering"}])
    recovering_jobs = property(_get_recovering_jobs)

    def get_visible_jobs(self):
        return self.visible_jobs
    get_visible_jobs = exposed(get_visible_jobs)

    def get_running_jobs(self):
        return [job for job in self.visible_jobs if job.has_resources]
    get_running_jobs = exposed(get_running_jobs)

    def get_queuing_jobs(self):
        return [job for job in self.visible_jobs if job.is_runnable]
    get_queuing_jobs = exposed(get_queuing_jobs)

    def _get_job_by_id(self, jobid):
        jobs = self.queues.get_jobs([{'jobid':jobid}])
        if len(jobs) == 1:
            return jobs[0]
        else:
            return None

    def add_queues(self, specs):
        '''add queues'''
        return self.queues.add_queues(specs)
    add_queues = exposed(query(add_queues))

    def get_queues(self, specs):
        '''get queues'''
        return self.queues.get_queues(specs)
    get_queues = exposed(query(get_queues))

    def run_jobs(self, specs, nodelist):
        '''run a queued job, by updating the job state, start_time and
        end_time'''
        print "run job specs=", specs, " on partion", nodelist
        if specs:
            self.start_job(specs, {'location': nodelist})
            #set tag false, enable scheduling another job at the same time
            self.increment_tag = False
        #print "current running jobs=", [job.jobid for job in self.running_jobs]
        return self.running_jobs
    run_jobs = exposed(query(run_jobs))


    def get_midplanes(self, partname):
        '''return a list of sub-partitions, each containing 512 nodes (one midplane)'''
        midplane_list = []
        partition = self._partitions[partname]

        if partition.size == MIDPLANE_SIZE:
            midplane_list.append(partname)
        elif partition.size > MIDPLANE_SIZE:
            children = partition.children
            for part in children:
                if self._partitions[part].size == MIDPLANE_SIZE:
                    midplane_list.append(part)
        else:
            parents = partition.parents
            for part in parents:
                if self._partitions[part].size == MIDPLANE_SIZE:
                    midplane_list.append(part)

        return midplane_list

    def get_next_failure(self, location, now, duration):
        '''return the next (closest) failure moment according to the partition failure list'''

        if (self.FAILURE_FREE):
            return None

        def _find_next_failure(partname, now):
            next = None
            failure_list = self.failure_dict[partname]
            if failure_list:
                for fail_time in failure_list:
                    if date_to_sec(fail_time) > now:
                        next = fail_time
                        break
            return next

        closest_fail_sec = MAXINT
        partitions = location

        midplanes = set()
        for partition in partitions:
            tmp_midplanes = self.get_midplanes(partition)
            for item in tmp_midplanes:
                if item not in midplanes:
                    midplanes.add(item)

        for midplane in midplanes:
            next = _find_next_failure(midplane, now)
            if (next):
                next_sec = date_to_sec(next)
                if next_sec < closest_fail_sec:
                    closest_fail_sec = next_sec

        if closest_fail_sec == MAXINT:
            next_failure_date = None
        else:
            job_end_sec = now + duration
            if closest_fail_sec < job_end_sec:
                next_failure_date = sec_to_date(closest_fail_sec)
            else:
                next_failure_date = None

        #print "next_failure_date=", next_failure_date

        return next_failure_date

    def will_job_fail(self, mtbf, nodes, hours):
        '''simulate a static failure chance, [not used]'''
        return False
        # unreachable below; kept for reference
        print "mtbf=%d, nodes=%d, hours=%f" % (mtbf,nodes,hours)
        failure_chance = 1 - (1 - hours * 1.0/mtbf) ** nodes
        if failure_chance > 0.7 :
            failure_chance = 0.7
        random_num = random.random()
        print "failure chance=%f, random_num=%f" % (failure_chance, random_num)
        if random_num < failure_chance:
            return True
        else:
            return False

    def nodes_static(self):
        '''tally the node counts requested by each job, [not used]'''
        jobs = self.queues.get_jobs([{'jobid':"*", 'queue':"*", 'nodes':"*"}])
        nodesdict = {}
        for job in jobs:
            nodes = int(job.nodes)
            nodesdict[nodes] = nodesdict.get(nodes, 0) + 1
        keys = nodesdict.keys()
        keys.sort()
        for key in keys:
            print key, ":", nodesdict[key]

    def gen_failure_list(self, scale, shape, startdate, enddate):
        '''generate a synthetic failure time list based on weibull distribution
         and start/end date time'''
        failure_moments = []
        ttf_list = []

        start = date_to_sec(startdate)
        end = date_to_sec(enddate)

        cur_failure = start

        while True:
            ttf = random.weibullvariate(scale,shape)
            cur_failure += ttf
            if cur_failure < end:
                ttf_list.append(ttf)
                failure_moments.append(sec_to_date(cur_failure))
            else:
                break
        return failure_moments, ttf_list

    def make_failures(self):
        '''generate failure lists for each 512-nodes partition'''
        ttf_dict = {}
        start = self.time_stamps[1][1]
        end = self.time_stamps[len(self.time_stamps)-1][1]

        for partition in self._partitions.values():
            if partition.size == MIDPLANE_SIZE:
                fl, ttfs = self.gen_failure_list(self.SCALE, self.SHAPE, start, end)
                self.failure_dict[partition.name] = fl
                ttf_dict[partition.name] = ttfs

        partnames = self.failure_dict.keys()
        partnames.sort()
        f = open(default_FAILURE_LOG, "w")
        total_f = 0
        mtbf = 0
        for part in partnames:
            f_list = self.failure_dict[part]
            print part, " ", f_list
            f.write("%s;%s\n" % (part, ";".join(f_list)))
            total_f +=  len(f_list)

            ttfs = ttf_dict[part]
            if len(ttfs) == 0:
                mtbf = 0
            else:
                total = 0
                for ttf in ttfs:
                    total += ttf
                mtbf = total / len(ttfs)
        start_sec = date_to_sec(start)
        end_sec = date_to_sec(end)
        f.write("Total=%d\nMTBF=%f" % (total_f, (end_sec-start_sec)/(total_f*3600)))

        f.close()

    def inject_failures(self):
        '''parse failure trace log to make failure list for each 1-midplane partition'''

        raw_job_dict = {}
        partnames = set(self._partitions.keys())
        flog = open(self.failure_log, "r")
        self.failure_dict = {}
        for line in flog:
            print "line=", line
            line = line.strip('\n')
            parsedline = line.split(";")
            print "parsedline=", parsedline
            failure_list = []
            part = parsedline[0]
            if part in partnames:
                for i in range(1, len(parsedline)):
                    failure_moment = parsedline[i]
                    if len(failure_moment) == 0:
                        continue
                    failure_list.append(failure_moment)
                self.failure_dict[part] = failure_list
        partnames = self.failure_dict.keys()
        partnames.sort()
        for part in partnames:
            f_list = self.failure_dict[part]
            print part, " ", f_list

    def get_failure_chance(self, location, duration):
        now = date_to_sec(self.get_current_time())
        next_fail = self.get_next_failure(location, now, duration)
        if next_fail is not None:
            return self.SENSITIVITY
        else:
            return 1 - self.SPECIFICITY
    get_failure_chance = exposed(get_failure_chance)

    def recovery_mgr(self, jobspec):
        """Recovery manager, this function can be extended to support various recovery options.
        at this version, the failed job is sent back to the rear of the queue. The extended code
        is ready and available at private code branch(wtang)."""

        updates = {}

        updates = self.handle_reque_rear(jobspec)

        recovery_option = jobspec['recovery_opt']
        print "rec_opt=", recovery_option

        #the if/else structure leaves room for extending the recovery options
        if recovery_option == 1:
            #option 1: resubmit the job, with the submit time changed to NOW
            updates = self.handle_reque_rear(jobspec)

        return updates

    def handle_reque_rear(self, jobspec):
        '''handle option 1 - resubmit the job to rear of waiting queue'''
        updates = {}
        updates['state'] = "queued"
        updates['start_time'] = 0
        updates['submittime'] = self.get_current_time_sec()
        return updates

    def start_repair_partition(self, partname):
        '''partition failed, assuming get repaired MTTR seconds later'''
        now = self.get_current_time_sec()
        time_to_repair = now + MTTR
        time_to_repair_date = sec_to_date(time_to_repair)
        self.insert_time_stamp(time_to_repair_date, "R", {'location':partname})

    def release_repaired_partition(self):
        '''release a partition once its repair has completed'''
        partition = self.get_current_time_partition()
        if partition is None:
            return False
        self.release_partition(partition)
        print "partition %s gets repaired" % (partition)
        self.log_job_event('R', self.get_current_time(), {'location':partition})
        return True

    def restart_pending_job(self):
        '''restart jobs that are pending on node repair'''
        partname = self.get_current_time_partition()
        print "enter restart_pending_job() partname=", partname

        ids_str = self.get_current_time_job()
        ids = ids_str.split(':')
        jobspecs = []
        for id in ids:
            spec = {'tag':'job', 'jobid':int(id)}
            jobspecs.append(spec)
        print "restart pending job ", jobspecs, " on repaired partition ", partname
        self.run_jobs(jobspecs, [partname])

    def possible_locations(self, job):
        '''find the partitions whose size best accommodates the job
        (returned partitions are not necessarily idle)'''
        locations = []
        proper_partsize = 64
        job_nodes = int(job['nodes'])

        for psize in self.part_size_list:
            if psize >= job_nodes:
                proper_partsize = psize
                break

        for part in self.cached_partitions.itervalues():
            if int(part.size) == proper_partsize:
                locations.append(part)

        return locations

    def _find_job_location(self, args, drain_partitions=set(), backfilling=False):
        jobid = args['jobid']
        nodes = args['nodes']
        queue = args['queue']
        utility_score = args['utility_score']
        walltime = args['walltime']
        forbidden = args.get("forbidden", [])
        required = args.get("required", [])

        best_score = sys.maxint
        best_partition = None

        # get partitions of proper size as the candidates
        candidate_partitions = self.possible_locations(args)
        #exclude the partitions already drained
        if drain_partitions:
            candidate_partitions = [part for part in candidate_partitions if part not in drain_partitions]

        now = self.get_current_time_sec()
        for partition in candidate_partitions:

            #skip partitions that are not "idle"
            if partition.state != "idle":
                continue

            if backfilling:
                #skip the partition with too short cutoff to backfill the job
                if 60*float(walltime) > (partition.backfill_time - now):
                    continue

            # let's check the impact on partitions that would become blocked
            score = 0
            for p in partition.parents:
                if self.cached_partitions[p].state == "idle" and self.cached_partitions[p].scheduled:
                    score += 1

            # the children loop and the fault-aware term apply once per
            # candidate partition, so they sit outside the parents loop
            for ch in partition.children:
                score += 0.01

            if (FAULTAWARE):
                Pf = self.get_failure_chance(partition.name, 60*float(walltime))
                score += Pf

            # the lower the score, the fewer new partitions will be blocked by this selection
            if score < best_score:
                best_score = score
                best_partition = partition
            elif score == best_score:
                if partition.name > best_partition.name:
                    best_partition = partition

        if best_partition:
            #print "return bestpartition=",{jobid: [best_partition.name, best_partition.state]}
            return {jobid: [best_partition.name]}

    def find_job_location(self, arg_list, end_times):

        best_partition_dict = {}

        if self.bridge_in_error:
            print "bridge_in_error"
            return {}

        self.cached_partitions = self.partitions

        # first, figure out backfilling cutoffs per partition (which we'll also
        # use for picking which partition to drain)
        job_end_times = {}
        for item in end_times:
            job_end_times[item[0][0]] = item[1]

        now = self.get_current_time_sec()
        for p in self.cached_partitions.itervalues():
            if p.state == "idle":
                p.backfill_time = now
            else:
                p.backfill_time = now + 5*60
            p.draining = False

        for p in self.cached_partitions.itervalues():
            if p.name in job_end_times:
                if job_end_times[p.name] > p.backfill_time:
                    p.backfill_time = job_end_times[p.name]

                for parent_name in p.parents:
                    parent_partition = self.cached_partitions[parent_name]
                    if p.backfill_time > parent_partition.backfill_time:
                        parent_partition.backfill_time = p.backfill_time

        for p in self.cached_partitions.itervalues():
            if p.backfill_time == now:
                continue

            for child_name in p.children:
                child_partition = self.cached_partitions[child_name]
                if child_partition.backfill_time == now or child_partition.backfill_time > p.backfill_time:
                    child_partition.backfill_time = p.backfill_time

        # first time through, try for starting jobs based on utility scores
        drain_partitions = set()
        # the sets draining_jobs and cannot_start are for efficiency, not correctness
        draining_jobs = set()
        cannot_start = set()

        for idx in range(len(arg_list)):
            winning_job = arg_list[idx]
            for jj in range(idx, len(arg_list)):
                job = arg_list[jj]

                # this job isn't good enough!
                if job['utility_score'] < winning_job['threshold']:
                    break

                if job['jobid'] not in cannot_start:
                    partition_name = self._find_job_location(job, drain_partitions)
                    if partition_name:
                        best_partition_dict.update(partition_name)
                        break

                cannot_start.add(job['jobid'])

                # we already picked a drain location for the winning job
                if winning_job['jobid'] in draining_jobs:
                    continue

                location = self._find_drain_partition(winning_job)
                if location is not None:
                    for p_name in location.parents:
                        drain_partitions.add(self.cached_partitions[p_name])
                    for p_name in location.children:
                        drain_partitions.add(self.cached_partitions[p_name])
                        self.cached_partitions[p_name].draining = True
                    drain_partitions.add(location)
                    #self.logger.info("job %s is draining %s" % (winning_job['jobid'], location.name))
                    #self.dbglog.LogMessage("job %s is draining %s" % (winning_job['jobid'], location.name))
                    location.draining = True
                    draining_jobs.add(winning_job['jobid'])

            # at this time, we only want to try launching one job at a time
            if best_partition_dict:
                #msg = "idx=%s, jj=%s, job=%s, partition=%s" % (idx, jj, job['jobid'], best_partition_dict[job['jobid']])
                #self.dbglog.LogMessage(msg)
                break

        # the next time through, try to backfill, but only if we couldn't find anything to start
        if not best_partition_dict:
            #arg_list.sort(self._walltimecmp)
            #self.dbglog.LogMessage("try to backfill jobs...")
            for args in arg_list:
                partition_name = self._find_job_location(args, backfilling=True)
                if partition_name:
                    msg = "backfilling job %s(%s)" % (args['jobid'], args['nodes'])
                    self.logger.info(msg)
                    self.dbglog.LogMessage(msg)
                    best_partition_dict.update(partition_name)
                    break

        # reserve the stuff in the best_partition_dict, as those partitions are allegedly going to
        # be running jobs very soon
        #
        # also, this is the only part of finding a job location where we need to lock anything
        #self._partitions_lock.acquire()
        try:
            for p in self.partitions.itervalues():
                # push the backfilling info from the local cache back to the real objects
                p.draining = self.cached_partitions[p.name].draining
                p.backfill_time = self.cached_partitions[p.name].backfill_time

            for partition_list in best_partition_dict.itervalues():
                part = self.partitions[partition_list[0]]
                ##part.reserved_until = self.get_current_time_sec() + 5*60
                part.state = "starting job"
                for p in part._parents:
                    if p.state == "idle":
                        p.state = "blocked by starting job"
                for p in part._children:
                    if p.state == "idle":
                        p.state = "blocked by starting job"
        except:
            self.logger.error("error in find_job_location", exc_info=True)
        #self._partitions_lock.release()

        #print "best_partition_dict=", best_partition_dict

        return best_partition_dict
    find_job_location = locking(exposed(find_job_location))
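
# --- Illustrative sketch (not part of the excerpt above) ---
# A minimal, self-contained rendering of the threshold cut in the first
# scheduling pass of find_job_location(): jobs are scanned in utility order
# and the scan stops at the first job whose score falls below the winning
# job's threshold.  The jobids, scores, and threshold here are made up.
arg_list = [
    {'jobid': 'a', 'utility_score': 90.0, 'threshold': 60.0},
    {'jobid': 'b', 'utility_score': 70.0},
    {'jobid': 'c', 'utility_score': 40.0},
]
winning_job = arg_list[0]
considered = []
for job in arg_list:
    if job['utility_score'] < winning_job['threshold']:
        break
    considered.append(job['jobid'])
assert considered == ['a', 'b']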
Exemplo n.º 4
0
            except:
                self.logger.error("task %s: unable to start",
                                  label,
                                  exc_info=1)
                os._exit(1)
        else:
            local_id = self.id_gen.next()
            kid = Child()
            kid.id = local_id
            kid.pid = child_pid
            kid.label = "%s/%s" % (label, local_id)
            self.children[local_id] = kid
            self.logger.info("task %s: forked with pid %s", kid.label, kid.pid)
            return local_id

    fork = exposed(fork)

    def signal(self, local_id, signame):
        """Signal a child process.
        
        Arguments:
        local_id -- id of the child to signal
        signame -- signal name
        """
        if not self.children.has_key(local_id):
            self.logger.error("signal found no child with id %s", local_id)
            return

        kid = self.children[local_id]
        self.logger.info("task %s: sending %s to pid %s", kid.label, signame,
                         kid.pid)
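
# --- Illustrative sketch (the excerpt above ends before the signal is sent) ---
# A minimal sketch of the usual delivery idiom, assuming delivery is done
# with os.kill: the signal name string is resolved to a signal number via
# getattr on the standard signal module.
import signal as _signal

signame = "SIGTERM"
signum = getattr(_signal, signame)
assert isinstance(signum, int) and signum > 0
# delivery would then be: os.kill(kid.pid, signum)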
Exemplo n.º 5
0
class BGSystem(BGBaseSystem):
    """Blue Gene system component.
    
    Methods:
    configure -- load partitions from the bridge API
    add_process_groups -- add (start) an mpirun process on the system (exposed, ~query)
    get_process_groups -- retrieve mpirun processes (exposed, query)
    wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query)
    signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query)
    update_partition_state -- update partition state from the bridge API (runs as a thread)
    """

    name = "system"
    implementation = "bgsystem"

    logger = logger

    _configfields = ['diag_script_location', 'diag_log_file', 'kernel']
    _config = ConfigParser.ConfigParser()
    _config.read(Cobalt.CONFIG_FILES)
    if not _config._sections.has_key('bgsystem'):
        print '''"bgsystem" section missing from cobalt config file'''
        sys.exit(1)
    config = _config._sections['bgsystem']
    mfields = [field for field in _configfields if not config.has_key(field)]
    if mfields:
        print "Missing option(s) in cobalt config file [bgsystem] section: %s" % (
            " ".join(mfields))
        sys.exit(1)
    if config.get('kernel') == "true":
        _kernel_configfields = ['bootprofiles', 'partitionboot']
        mfields = [
            field for field in _kernel_configfields
            if not config.has_key(field)
        ]
        if mfields:
            print "Missing option(s) in cobalt config file [bgsystem] section: %s" % (
                " ".join(mfields))
            sys.exit(1)

    def __init__(self, *args, **kwargs):
        BGBaseSystem.__init__(self, *args, **kwargs)
        sys.setrecursionlimit(5000)
        self.process_groups.item_cls = BGProcessGroup
        self.diag_pids = dict()
        self.configure()
        # initiate the process before starting any threads
        thread.start_new_thread(self.update_partition_state, tuple())

    def __getstate__(self):
        flags = {}
        for part in self._partitions.values():
            sched = None
            func = None
            queue = None
            if hasattr(part, 'scheduled'):
                sched = part.scheduled
            if hasattr(part, 'functional'):
                func = part.functional
            if hasattr(part, 'queue'):
                queue = part.queue
            flags[part.name] = (sched, func, queue)
        return {
            'managed_partitions': self._managed_partitions,
            'version': 1,
            'partition_flags': flags
        }

    def __setstate__(self, state):
        sys.setrecursionlimit(5000)
        self._managed_partitions = state['managed_partitions']
        self._partitions = PartitionDict()
        self.process_groups = BGProcessGroupDict()
        self.process_groups.item_cls = BGProcessGroup
        self.node_card_cache = dict()
        self._partitions_lock = thread.allocate_lock()
        self.pending_diags = dict()
        self.failed_diags = list()
        self.diag_pids = dict()
        self.pending_script_waits = sets.Set()
        self.bridge_in_error = False
        self.cached_partitions = None
        self.offline_partitions = []

        self.configure()
        if 'partition_flags' in state:
            for pname, flags in state['partition_flags'].items():
                if pname in self._partitions:
                    self._partitions[pname].scheduled = flags[0]
                    self._partitions[pname].functional = flags[1]
                    self._partitions[pname].queue = flags[2]
                else:
                    logger.info("Partition %s is no longer defined" % pname)

        self.update_relatives()
        # initiate the process before starting any threads
        thread.start_new_thread(self.update_partition_state, tuple())
        self.lock = threading.Lock()
        self.statistics = Statistics()

    def save_me(self):
        Component.save(self)

    save_me = automatic(save_me)

    def _get_node_card(self, name, state):
        if not self.node_card_cache.has_key(name):
            self.node_card_cache[name] = NodeCard(name, state)

        return self.node_card_cache[name]

    def _new_partition_dict(self, partition_def, bp_cache={}):
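        # note: bp_cache persists across calls that rely on the mutable default
        # argument; both call sites in this class pass a fresh dict explicitly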
        # that 32 is not really constant -- it needs to either be read from cobalt.conf or from the bridge API
        NODES_PER_NODECARD = 32

        node_list = []

        if partition_def.small:
            bp_name = partition_def.base_partitions[0].id
            for nc in partition_def._node_cards:
                node_list.append(
                    self._get_node_card(bp_name + "-" + nc.id, nc.state))
        else:
            try:
                for bp in partition_def.base_partitions:
                    if bp.id not in bp_cache:
                        bp_cache[bp.id] = []
                        for nc in Cobalt.bridge.NodeCardList.by_base_partition(
                                bp):
                            bp_cache[bp.id].append(
                                self._get_node_card(bp.id + "-" + nc.id,
                                                    nc.state))
                    node_list += bp_cache[bp.id]
            except BridgeException:
                print "Error communicating with the bridge during initial config.  Terminating."
                sys.exit(1)

        d = dict(
            name=partition_def.id,
            queue="default",
            size=NODES_PER_NODECARD * len(node_list),
            bridge_partition=partition_def,
            node_cards=node_list,
            switches=[s.id for s in partition_def.switches],
            state=_get_state(partition_def),
        )
        return d

    def _detect_wiring_deps(self, partition, wiring_cache={}):
        def _kernel():
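            # _kernel() closes over s1 (bound below, before any call) and over
            # the caller's loop variable p; it records the conflict on both
            # partitions when their switch sets intersect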
            s2 = sets.Set(p.switches)

            if s1.intersection(s2):
                p._wiring_conflicts.add(partition.name)
                partition._wiring_conflicts.add(p.name)
                self.logger.debug("%s and %s have a wiring conflict" %
                                  (partition.name, p.name))

        s1 = sets.Set(partition.switches)

        if wiring_cache.has_key(partition.size):
            for p in wiring_cache[partition.size]:
                if partition.name != p.name:
                    _kernel()
        else:
            wiring_cache[partition.size] = [partition]
            for p in self._partitions.values():
                if p.size == partition.size and partition.name != p.name:
                    wiring_cache[partition.size].append(p)
                    _kernel()

    def configure(self):
        """Read partition data from the bridge."""

        self.logger.info("configure()")
        try:
            system_def = Cobalt.bridge.PartitionList.by_filter()
        except BridgeException:
            print "Error communicating with the bridge during initial config.  Terminating."
            sys.exit(1)

        # initialize a new partition dict with all partitions
        #
        partitions = PartitionDict()

        tmp_list = []

        wiring_cache = {}
        bp_cache = {}

        for partition_def in system_def:
            tmp_list.append(self._new_partition_dict(partition_def, bp_cache))

        partitions.q_add(tmp_list)

        # update object state
        self._partitions.clear()
        self._partitions.update(partitions)

        # find the wiring deps
        start = time.time()
        for p in self._partitions.values():
            self._detect_wiring_deps(p, wiring_cache)

        end = time.time()
        self.logger.info("took %f seconds to find wiring deps" % (end - start))

        # update state information
        for p in self._partitions.values():
            if p.state != "busy":
                for nc in p.node_cards:
                    if nc.used_by:
                        p.state = "blocked (%s)" % nc.used_by
                        break
                for dep_name in p._wiring_conflicts:
                    if self._partitions[dep_name].state == "busy":
                        p.state = "blocked-wiring (%s)" % dep_name
                        break

    def update_partition_state(self):
        """Use the quicker bridge method that doesn't return nodecard information to update the states of the partitions"""
        def _start_partition_cleanup(p):
            self.logger.info("partition %s: marking partition for cleaning",
                             p.name)
            p.cleanup_pending = True
            partitions_cleanup.append(p)
            _set_partition_cleanup_state(p)
            p.reserved_until = False
            p.reserved_by = None
            p.used_by = None

        def _set_partition_cleanup_state(p):
            p.state = "cleanup"
            for part in p._children:
                if part.bridge_partition.state == "RM_PARTITION_FREE":
                    part.state = "blocked (%s)" % (p.name, )
                else:
                    part.state = "cleanup"
            for part in p._parents:
                if part.state == "idle":
                    part.state = "blocked (%s)" % (p.name, )

        while True:
            try:
                system_def = Cobalt.bridge.PartitionList.info_by_filter()
            except BridgeException:
                self.logger.error(
                    "Error communicating with the bridge to update partition state information."
                )
                self.bridge_in_error = True
                time.sleep(5)  # wait a little bit...
                continue  # then try again

            try:
                bg_object = Cobalt.bridge.BlueGene.by_serial()
                for bp in bg_object.base_partitions:
                    for nc in Cobalt.bridge.NodeCardList.by_base_partition(bp):
                        self.node_card_cache[bp.id + "-" +
                                             nc.id].state = nc.state
            except:
                self.logger.error(
                    "Error communicating with the bridge to update nodecard state information."
                )
                self.bridge_in_error = True
                time.sleep(5)  # wait a little bit...
                continue  # then try again

            self.bridge_in_error = False
            busted_switches = []
            for s in bg_object.switches:
                if s.state != "RM_SWITCH_UP":
                    busted_switches.append(s.id)

            # set all of the nodecards to not busy
            for nc in self.node_card_cache.values():
                nc.used_by = ''

            # update the state of each partition
            self._partitions_lock.acquire()
            now = time.time()
            partitions_cleanup = []
            self.offline_partitions = []
            missing_partitions = set(self._partitions.keys())
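            # start from every known partition and discard each one the bridge
            # still reports; whatever remains has disappeared from the machine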
            new_partitions = []
            try:
                for partition in system_def:
                    missing_partitions.discard(partition.id)
                    if self._partitions.has_key(partition.id):
                        p = self._partitions[partition.id]
                        p.state = _get_state(partition)
                        p.bridge_partition = partition
                        p._update_node_cards()
                        if p.reserved_until and now > p.reserved_until:
                            p.reserved_until = False
                            p.reserved_by = None
                    else:
                        new_partitions.append(partition)

                # remove the missing partitions and their wiring relations
                for pname in missing_partitions:
                    self.logger.info("missing partition removed: %s", pname)
                    p = self._partitions[pname]
                    for dep_name in p._wiring_conflicts:
                        self.logger.debug(
                            "removing wiring dependency from: %s", dep_name)
                        self._partitions[dep_name]._wiring_conflicts.discard(
                            p.name)
                    if p.name in self._managed_partitions:
                        self._managed_partitions.discard(p.name)
                    del self._partitions[p.name]

                bp_cache = {}
                wiring_cache = {}
                # throttle the adding of new partitions so updating of
                # machine state doesn't get bogged down
                for partition in new_partitions[:8]:
                    self.logger.info("new partition found: %s", partition.id)
                    bridge_p = Cobalt.bridge.Partition.by_id(partition.id)
                    self._partitions.q_add(
                        [self._new_partition_dict(bridge_p, bp_cache)])
                    p = self._partitions[bridge_p.id]
                    p.bridge_partition = partition
                    self._detect_wiring_deps(p, wiring_cache)

                # if partitions were added or removed, then update the relationships between partitions
                if len(missing_partitions) > 0 or len(new_partitions) > 0:
                    self.update_relatives()

                for p in self._partitions.values():
                    if p.cleanup_pending:
                        if p.used_by:
                            # if the partition has a pending cleanup request, then set the state so that cleanup will be
                            # performed
                            _start_partition_cleanup(p)
                        else:
                            # if the cleanup has already been initiated, then see how it's going
                            busy = []
                            parts = list(p._all_children)
                            parts.append(p)
                            for part in parts:
                                if part.bridge_partition.state != "RM_PARTITION_FREE":
                                    busy.append(part.name)
                            if len(busy) > 0:
                                _set_partition_cleanup_state(p)
                                self.logger.info(
                                    "partition %s: still cleaning; busy partition(s): %s",
                                    p.name, ", ".join(busy))
                            else:
                                p.cleanup_pending = False
                                self.logger.info(
                                    "partition %s: cleaning complete", p.name)
                    if p.state == "busy":
                        # when the partition becomes busy, if a script job isn't reserving it, then release the reservation
                        if not p.reserved_by:
                            p.reserved_until = False
                    elif p.state != "cleanup":
                        if p.reserved_until:
                            p.state = "allocated"
                            for part in p._parents:
                                if part.state == "idle":
                                    part.state = "blocked (%s)" % (p.name, )
                            for part in p._children:
                                if part.state == "idle":
                                    part.state = "blocked (%s)" % (p.name, )
                        elif p.bridge_partition.state == "RM_PARTITION_FREE" and p.used_by:
                            # if the job assigned to the partition has completed, then set the state so that cleanup will be
                            # performed
                            _start_partition_cleanup(p)
                            continue
                        for diag_part in self.pending_diags:
                            if p.name == diag_part.name or p.name in diag_part.parents or p.name in diag_part.children:
                                p.state = "blocked by pending diags"
                        for nc in p.node_cards:
                            if nc.used_by:
                                p.state = "blocked (%s)" % nc.used_by
                            if nc.state != "RM_NODECARD_UP":
                                p.state = "hardware offline: nodecard %s" % nc.id
                                self.offline_partitions.append(p.name)
                        for s in p.switches:
                            if s in busted_switches:
                                p.state = "hardware offline: switch %s" % s
                                self.offline_partitions.append(p.name)
                        for dep_name in p._wiring_conflicts:
                            if self._partitions[dep_name].state in [
                                    "busy", "allocated", "cleanup"
                            ]:
                                p.state = "blocked-wiring (%s)" % dep_name
                                break
                        for part_name in self.failed_diags:
                            part = self._partitions[part_name]
                            if p.name == part.name:
                                p.state = "failed diags"
                            elif p.name in part.parents or p.name in part.children:
                                p.state = "blocked by failed diags"
            except:
                self.logger.error("error in update_partition_state",
                                  exc_info=True)

            self._partitions_lock.release()

            # cleanup partitions and set their kernels back to the default (while _not_ holding the lock)
            pnames_cleaned = []
            for p in partitions_cleanup:
                self.logger.info(
                    "partition %s: starting partition destruction", p.name)
                pnames_destroyed = []
                parts = list(p._all_children)
                parts.append(p)
                for part in parts:
                    pnames_cleaned.append(part.name)
                    try:
                        bpart = part.bridge_partition
                        if bpart.state != "RM_PARTITION_FREE":
                            bpart.destroy()
                            pnames_destroyed.append(part.name)
                    except Cobalt.bridge.IncompatibleState:
                        pass
                    except:
                        self.logger.info(
                            "partition %s: an exception occurred while attempting to destroy partition %s",
                            p.name, part.name)
                if len(pnames_destroyed) > 0:
                    self.logger.info(
                        "partition %s: partition destruction initiated for %s",
                        p.name, ", ".join(pnames_destroyed))
                else:
                    self.logger.info(
                        "partition %s: no partition destruction was required",
                        p.name)
                try:
                    self._clear_kernel(p.name)
                    self.logger.info("partition %s: kernel settings cleared",
                                     p.name)
                except:
                    self.logger.error(
                        "partition %s: failed to clear kernel settings",
                        p.name)
            job_filter = Cobalt.bridge.JobFilter()
            job_filter.job_type = Cobalt.bridge.JOB_TYPE_ALL_FLAG
            jobs = Cobalt.bridge.JobList.by_filter(job_filter)
            for job in jobs:
                if job.partition_id in pnames_cleaned:
                    try:
                        job.cancel()
                        self.logger.info("partition %s: task %d canceled",
                                         job.partition_id, job.db_id)
                    except (Cobalt.bridge.IncompatibleState,
                            Cobalt.bridge.JobNotFound):
                        pass

            time.sleep(10)

    def _mark_partition_for_cleaning(self, pname, jobid):
        self._partitions_lock.acquire()
        try:
            p = self._partitions[pname]
            if p.used_by == jobid:
                p.cleanup_pending = True
                self.logger.info("partition %s: partition marked for cleanup",
                                 pname)
            elif p.used_by != None:
                self.logger.info("partition %s: job %s was not the current partition user (%s); partition not marked " + \
                    "for cleanup", pname, jobid, p.used_by)
        except:
            self.logger.exception(
                "partition %s: unexpected exception while marking the partition for cleanup",
                pname)
        self._partitions_lock.release()

    def _validate_kernel(self, kernel):
        if self.config.get('kernel') != 'true':
            return True
        kernel_dir = "%s/%s" % (os.path.expandvars(
            self.config.get('bootprofiles')), kernel)
        return os.path.exists(kernel_dir)

    def _set_kernel(self, partition, kernel):
        '''Set the kernel to be used by jobs run on the specified partition'''
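        # (a standalone sketch of the link swap below appears at the end of this example)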
        if self.config.get('kernel') != 'true':
            if kernel != "default":
                raise Exception("custom kernel capabilities disabled")
            return
        partition_link = "%s/%s" % (os.path.expandvars(
            self.config.get('partitionboot')), partition)
        kernel_dir = "%s/%s" % (os.path.expandvars(
            self.config.get('bootprofiles')), kernel)
        try:
            current = os.readlink(partition_link)
        except OSError:
            self.logger.error(
                "partition %s: failed to read partitionboot location %s" %
                (partition, partition_link))
            raise Exception("failed to read partitionboot location %s" %
                            (partition_link, ))
        if current != kernel_dir:
            if not self._validate_kernel(kernel):
                self.logger.error(
                    "partition %s: kernel directory \"%s\" does not exist" %
                    (partition, kernel_dir))
                raise Exception("kernel directory \"%s\" does not exist" %
                                (kernel_dir, ))
            self.logger.info(
                "partition %s: updating boot image; currently set to \"%s\"" %
                (partition, current.split('/')[-1]))
            try:
                os.unlink(partition_link)
                os.symlink(kernel_dir, partition_link)
            except OSError:
                self.logger.error(
                    "partition %s: failed to reset boot location" %
                    (partition, ))
                raise Exception("failed to reset boot location for partition" %
                                (partition, ))
            self.logger.info(
                "partition %s: boot image updated; now set to \"%s\"" %
                (partition, kernel))

    def _clear_kernel(self, partition):
        '''Set the kernel to be used by a partition to the default value'''
        if self.config.get('kernel') == 'true':
            try:
                self._set_kernel(partition, "default")
            except:
                logger.error("partition %s: failed to reset boot location" %
                             (partition, ))

    def generate_xml(self):
        """This method produces an XML file describing the managed partitions, suitable for use with the simulator."""
        ret = "<BG>\n"
        ret += "<PartitionList>\n"
        for p_name in self._managed_partitions:
            p = self._partitions[p_name]

            ret += "   <Partition name='%s'>\n" % p.name
            for nc in p.node_cards:
                ret += "      <NodeCard id='%s' />\n" % nc.id
            for s in p.switches:
                ret += "      <Switch id='%s' />\n" % s
            ret += "   </Partition>\n"

        ret += "</PartitionList>\n"

        ret += "</BG>\n"

        return ret

    generate_xml = exposed(generate_xml)

    def add_process_groups(self, specs):
        """Create a process group.
        
        Arguments:
        spec -- dictionary hash specifying a process group to start
        """

        self.logger.info("add_process_groups(%r)" % (specs))

        script_specs = []
        other_specs = []
        for spec in specs:
            if spec.get('mode', False) == "script":
                script_specs.append(spec)
            else:
                other_specs.append(spec)

        # start up script jobs
        script_pgroups = []
        if script_specs:
            for spec in script_specs:
                try:
                    self._set_kernel(
                        spec.get('location')[0], spec.get('kernel', "default"))
                except Exception, e:
                    new_pgroup = self.process_groups.q_add([spec])
                    pgroup = new_pgroup[0]
                    pgroup.nodect = self._partitions[pgroup.location[0]].size
                    pgroup.exit_status = 1
                    self.logger.info(
                        "process group %s: job %s/%s failed to set the kernel; %s",
                        pgroup.id, pgroup.jobid, pgroup.user, e)
                else:
                    try:
                        script_pgroup = ComponentProxy(
                            "script-manager").add_jobs([spec])
                    except (ComponentLookupError, xmlrpclib.Fault):
                        self._clear_kernel(spec.get('location')[0])
                        # FIXME: jobs that were already started are not reported
                        raise ProcessGroupCreationError(
                            "system::add_process_groups failed to communicate with script-manager"
                        )
                    new_pgroup = self.process_groups.q_add([spec])
                    pgroup = new_pgroup[0]
                    pgroup.script_id = script_pgroup[0]['id']
                    pgroup.nodect = self._partitions[pgroup.location[0]].size
                    self.logger.info(
                        "job %s/%s: process group %s created to track script",
                        pgroup.jobid, pgroup.user, pgroup.id)
                    self.reserve_resources_until(
                        spec['location'],
                        time.time() + 60 * float(spec['walltime']),
                        pgroup.jobid)
                    if pgroup.kernel != "default":
                        self.logger.info(
                            "process group %s: job %s/%s using kernel %s",
                            pgroup.id, pgroup.jobid, pgroup.user,
                            pgroup.kernel)
                    script_pgroups.append(pgroup)

        # start up non-script mode jobs
        process_groups = self.process_groups.q_add(other_specs)
        for pgroup in process_groups:
            pgroup.nodect = self._partitions[pgroup.location[0]].size
            self.logger.info(
                "job %s/%s: process group %s created to track mpirun status",
                pgroup.jobid, pgroup.user, pgroup.id)
            try:
                if not pgroup.true_mpi_args:
                    self._set_kernel(pgroup.location[0], pgroup.kernel)
            except Exception, e:
                # FIXME: setting exit_status to signal the job has failed isn't really the right thing to do.  another flag
                # should be added to the process group that wait_process_group uses to determine when a process group is no
                # longer active.  an error message should also be attached to the process group so that cqm can report the
                # problem to the user.
                pgroup.exit_status = 1
                self.logger.info(
                    "process group %s: job %s/%s failed to set the kernel; %s",
                    pgroup.id, pgroup.jobid, pgroup.user, e)
            else:
                if pgroup.kernel != "default" and not pgroup.true_mpi_args:
                    self.logger.info(
                        "process group %s: job %s/%s using kernel %s",
                        pgroup.id, pgroup.jobid, pgroup.user, pgroup.kernel)
                pgroup.start()
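
# --- Illustrative sketch (not part of the excerpt above) ---
# A self-contained rendering of the symlink swap performed by _set_kernel():
# read the current target and repoint the link only when it differs.  The
# paths are temporary stand-ins; the real ones come from the [bgsystem]
# config.  Note the unlink+symlink pair is not atomic, matching the code.
import os
import tempfile

root = tempfile.mkdtemp()
default_dir = os.path.join(root, "default")
custom_dir = os.path.join(root, "custom")
os.mkdir(default_dir)
os.mkdir(custom_dir)

partition_link = os.path.join(root, "partitionboot")
os.symlink(default_dir, partition_link)

if os.readlink(partition_link) != custom_dir:
    os.unlink(partition_link)
    os.symlink(custom_dir, partition_link)
assert os.readlink(partition_link) == custom_dir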
Exemplo n.º 6
0
class BGSched(Component):

    implementation = "bgsched"
    name = "scheduler"
    logger = logging.getLogger("Cobalt.Components.scheduler")

    _configfields = ['utility_file']
    _config = ConfigParser.ConfigParser()
    print Cobalt.CONFIG_FILES
    _config.read(Cobalt.CONFIG_FILES)
    if not _config._sections.has_key('bgsched'):
        print '''"bgsched" section missing from cobalt config file'''
        sys.exit(1)
    config = _config._sections['bgsched']
    mfields = [field for field in _configfields if not config.has_key(field)]
    if mfields:
        print "Missing option(s) in cobalt config file [bgsched] section: %s" % (
            " ".join(mfields))
        sys.exit(1)
    if config.get("default_reservation_policy"):
        global DEFAULT_RESERVATION_POLICY
        DEFAULT_RESERVATION_POLICY = config.get("default_reservation_policy")

    def __init__(self, *args, **kwargs):
        Component.__init__(self, *args, **kwargs)
        self.COMP_QUEUE_MANAGER = "queue-manager"
        self.COMP_SYSTEM = "system"
        self.reservations = ReservationDict()
        self.queues = QueueDict(self.COMP_QUEUE_MANAGER)
        self.jobs = JobDict(self.COMP_QUEUE_MANAGER)
        self.started_jobs = {}
        self.sync_state = Cobalt.Util.FailureMode("Foreign Data Sync")
        self.active = True

        self.get_current_time = time.time
        self.id_gen = IncrID()
        global bgsched_id_gen
        bgsched_id_gen = self.id_gen

        self.cycle_id_gen = IncrID()
        global bgsched_cycle_id_gen
        bgsched_cycle_id_gen = self.cycle_id_gen

    def __getstate__(self):
        return {
            'reservations': self.reservations,
            'version': 1,
            'active': self.active,
            'next_res_id': self.id_gen.idnum + 1,
            'next_cycle_id': self.cycle_id_gen.idnum + 1,
            'msg_queue': dbwriter.msg_queue,
            'overflow': dbwriter.overflow
        }

    def __setstate__(self, state):
        self.reservations = state['reservations']
        if 'active' in state:
            self.active = state['active']
        else:
            self.active = True

        self.id_gen = IncrID()
        self.id_gen.set(state['next_res_id'])
        global bgsched_id_gen
        bgsched_id_gen = self.id_gen

        self.cycle_id_gen = IncrID()
        self.cycle_id_gen.set(state['next_cycle_id'])
        global bgsched_cycle_id_gen
        bgsched_cycle_id_gen = self.cycle_id_gen

        self.queues = QueueDict(self.COMP_QUEUE_MANAGER)
        self.jobs = JobDict(self.COMP_QUEUE_MANAGER)
        self.started_jobs = {}
        self.sync_state = Cobalt.Util.FailureMode("Foreign Data Sync")

        self.get_current_time = time.time
        self.lock = threading.Lock()
        self.statistics = Statistics()

        if state.has_key('msg_queue'):
            dbwriter.msg_queue = state['msg_queue']
        if state.has_key('overflow') and (dbwriter.max_queued != None):
            dbwriter.overflow = state['overflow']

    # order the jobs with biggest utility first
    def utilitycmp(self, job1, job2):
        return -cmp(job1.score, job2.score)

    def prioritycmp(self, job1, job2):
        """Compare 2 jobs first using queue priority and then first-in, first-out."""

        val = cmp(self.queues[job1.queue].priority,
                  self.queues[job2.queue].priority)
        if val == 0:
            return self.fifocmp(job1, job2)
        else:
            # we want the higher priority first
            return -val

    def fifocmp(self, job1, job2):
        """Compare 2 jobs for first-in, first-out."""
        def fifo_value(job):
            if job.index is not None:
                return int(job.index)
            else:
                return job.jobid

        # Implement some simple variations on FIFO scheduling
        # within a particular queue, based on queue policy
        fifoval = cmp(fifo_value(job1), fifo_value(job2))
        if (job1.queue == job2.queue):
            qpolicy = self.queues[job1.queue].policy
            sizeval = cmp(int(job1.nodes), int(job2.nodes))
            wtimeval = cmp(int(job1.walltime), int(job2.walltime))
            if (qpolicy == 'largest-first' and sizeval):
                return -sizeval
            elif (qpolicy == 'smallest-first' and sizeval):
                return sizeval
            elif (qpolicy == 'longest-first' and wtimeval):
                return -wtimeval
            elif (qpolicy == 'shortest-first' and wtimeval):
                return wtimeval
            else:
                return fifoval
        else:
            return fifoval


    def save_me(self):
        Component.save(self)

    save_me = automatic(save_me)

    #user_name in this context is the user setting/modifying the res.
    def add_reservations(self, specs, user_name):
        self.logger.info("%s adding reservation: %r" % (user_name, specs))
        added_reservations = self.reservations.q_add(specs)
        for added_reservation in added_reservations:
            self.logger.info("Res %s/%s: %s adding reservation: %r" %
                             (added_reservation.res_id,
                              added_reservation.cycle_id, user_name, specs))
            dbwriter.log_to_db(user_name, "creating", "reservation",
                               added_reservation)
        return added_reservations

    add_reservations = exposed(query(add_reservations))

    def del_reservations(self, specs, user_name):
        self.logger.info("%s releasing reservation: %r" % (user_name, specs))
        del_reservations = self.reservations.q_del(specs)
        for del_reservation in del_reservations:
            self.logger.info("Res %s/%s/: %s releasing reservation: %r" %
                             (del_reservation.res_id, del_reservation.cycle_id,
                              user_name, specs))
            #dbwriter.log_to_db(user_name, "ending", "reservation", del_reservation)
        return del_reservations

    del_reservations = exposed(query(del_reservations))

    def get_reservations(self, specs):
        return self.reservations.q_get(specs)

    get_reservations = exposed(query(get_reservations))

    def set_reservations(self, specs, updates, user_name):
        log_str = "%s modifying reservation: %r with updates %r" % (
            user_name, specs, updates)
        self.logger.info(log_str)

        #handle defers as a special case:  have to log these, and not drop a mod record.
        def _set_reservations(res, newattr):
            res.update(newattr)

        updates['__cmd_user'] = user_name
        mod_reservations = self.reservations.q_get(specs, _set_reservations,
                                                   updates)
        for mod_reservation in mod_reservations:
            self.logger.info("Res %s/%s: %s modifying reservation: %r" %
                             (mod_reservation.res_id, mod_reservation.cycle_id,
                              user_name, specs))
        return mod_reservations

    set_reservations = exposed(query(set_reservations))

    def release_reservations(self, specs, user_name):
        self.logger.info("%s requested release of reservation: %r" %
                         (user_name, specs))
        self.logger.info("%s releasing reservation: %r" % (user_name, specs))
        rel_res = self.get_reservations(specs)
        for res in rel_res:
            dbwriter.log_to_db(user_name, "released", "reservation", res)
        del_reservations = self.reservations.q_del(specs)
        for del_reservation in del_reservations:
            self.logger.info("Res %s/%s/: %s releasing reservation: %r" %
                             (del_reservation.res_id, del_reservation.cycle_id,
                              user_name, specs))
        return del_reservations

    release_reservations = exposed(query(release_reservations))

    def check_reservations(self):
        ret = ""
        reservations = self.reservations.values()
        for i in range(len(reservations)):
            for j in range(i + 1, len(reservations)):
                # if at least one reservation is cyclic, we want *that* reservation to be the one getting its overlaps method
                # called
                if reservations[i].cycle is not None:
                    res1 = reservations[i]
                    res2 = reservations[j]
                else:
                    res1 = reservations[j]
                    res2 = reservations[i]

                # we subtract a little bit because the overlaps method isn't really meant to do this
                # it will report warnings when one reservation starts at the same time another ends
                if res1.overlaps(res2.start, res2.duration - 0.00001):
                    # now we need to check for overlap in space
                    results = ComponentProxy(self.COMP_SYSTEM).get_partitions(
                        [{
                            'name': p,
                            'children': '*',
                            'parents': '*'
                        } for p in res2.partitions.split(":")])
                    for p in res1.partitions.split(":"):
                        for r in results:
                            if p == r['name'] or p in r['children'] or p in r[
                                    'parents']:
                                ret += "Warning: reservation '%s' overlaps reservation '%s'\n" % (
                                    res1.name, res2.name)

        return ret

    check_reservations = exposed(check_reservations)

    def sync_data(self):
        started = self.get_current_time()
        for item in [self.jobs, self.queues]:
            try:
                item.Sync()
            except (ComponentLookupError, xmlrpclib.Fault):
                # the ForeignDataDicts already include FailureMode stuff
                pass
        # print "took %f seconds for sync_data" % (time.time() - started, )

    #sync_data = automatic(sync_data)

    def _run_reservation_jobs(self, reservations_cache):
        # handle each reservation separately, as they shouldn't be competing for resources
        for cur_res in reservations_cache.itervalues():
            #print "trying to run res jobs in", cur_res.name, self.started_jobs
            queue = cur_res.queue
            if not (self.queues.has_key(queue)
                    and self.queues[queue].state == 'running'):
                continue

            temp_jobs = self.jobs.q_get([{'is_runnable': True, 'queue': queue}])
            active_jobs = []
            for j in temp_jobs:
                if not self.started_jobs.has_key(
                        j.jobid) and cur_res.job_within_reservation(j):
                    active_jobs.append(j)

            if not active_jobs:
                continue
            active_jobs.sort(self.utilitycmp)

            job_location_args = []
            for job in active_jobs:
                job_location_args.append({
                    'jobid': str(job.jobid),
                    'nodes': job.nodes,
                    'queue': job.queue,
                    'required': cur_res.partitions.split(":"),
                    'utility_score': job.score,
                    'walltime': job.walltime,
                    'attrs': job.attrs,
                    'user': job.user,
                })

            # there's no backfilling in reservations
            try:
                best_partition_dict = ComponentProxy(
                    self.COMP_SYSTEM).find_job_location(job_location_args, [])
            except:
                self.logger.error("failed to connect to system component")
                best_partition_dict = {}

            for jobid in best_partition_dict:
                job = self.jobs[int(jobid)]
                self._start_job(job, best_partition_dict[jobid],
                                {str(job.jobid): cur_res.res_id})

    def _start_job(self, job, partition_list, resid=None):
        """Get the queue manager to start a job."""

        cqm = ComponentProxy(self.COMP_QUEUE_MANAGER)

        try:
            self.logger.info("trying to start job %d on partition %r" %
                             (job.jobid, partition_list))
            cqm.run_jobs([{
                'tag': "job",
                'jobid': job.jobid
            }], partition_list, None, resid)
        except ComponentLookupError:
            self.logger.error("failed to connect to queue manager")
            return

        self.started_jobs[job.jobid] = self.get_current_time()

    def schedule_jobs(self):
        '''look at the queued jobs, and decide which ones to start'''

        started_scheduling = self.get_current_time()

        if not self.active:
            return

        self.sync_data()

        # if we're missing information, don't bother trying to schedule jobs
        if not (self.queues.__oserror__.status
                and self.jobs.__oserror__.status):
            self.sync_state.Fail()
            return
        self.sync_state.Pass()

        self.lock.acquire()
        try:
            # cleanup any reservations which have expired
            for res in self.reservations.values():
                if res.is_over():
                    self.logger.info("reservation %s has ended; removing" %
                                     (res.name))
                    self.logger.info("Res %s/%s: Ending reservation: %r" %
                                     (res.res_id, res.cycle_id, res.name))
                    #dbwriter.log_to_db(None, "ending", "reservation",
                    #        res)
                    del_reservations = self.reservations.q_del([{'name': res.name}])

            reservations_cache = self.reservations.copy()
        except:
            # just to make sure we don't keep the lock forever
            self.logger.error("error in schedule_jobs", exc_info=True)
        self.lock.release()

        # clean up the started_jobs cached data
        # TODO: Make this tunable.
        now = self.get_current_time()
        for job_name in self.started_jobs.keys():
            if (now - self.started_jobs[job_name]) > 60:
                del self.started_jobs[job_name]

        active_queues = []
        spruce_queues = []
        res_queues = set()
        for item in reservations_cache.q_get([{'queue': '*'}]):
            if self.queues.has_key(item.queue):
                if self.queues[item.queue].state == 'running':
                    res_queues.add(item.queue)

        for queue in self.queues.itervalues():
            if queue.name not in res_queues and queue.state == 'running':
                if queue.policy == "high_prio":
                    spruce_queues.append(queue)
                else:
                    active_queues.append(queue)

        # handle the reservation jobs that might be ready to go
        self._run_reservation_jobs(reservations_cache)

        # figure out stuff about queue equivalence classes
        if __running_mode__ == "simulation":
            equiv = [{'reservations': [], 'queues': ['default']}]
        else:
            res_info = {}
            for cur_res in reservations_cache.values():
                res_info[cur_res.name] = cur_res.partitions
            try:
                equiv = ComponentProxy(
                    self.COMP_SYSTEM).find_queue_equivalence_classes(
                        res_info,
                        [q.name for q in active_queues + spruce_queues])
            except:
                self.logger.error("failed to connect to system component")
                return

        for eq_class in equiv:
            # recall that is_runnable is True for certain types of holds
            temp_jobs = self.jobs.q_get([{'is_runnable':True, 'queue':queue.name} for queue in active_queues \
                if queue.name in eq_class['queues']])
            active_jobs = []
            for j in temp_jobs:
                if not self.started_jobs.has_key(j.jobid):
                    active_jobs.append(j)

            temp_jobs = self.jobs.q_get([{'is_runnable':True, 'queue':queue.name} for queue in spruce_queues \
                if queue.name in eq_class['queues']])
            spruce_jobs = []
            for j in temp_jobs:
                if not self.started_jobs.has_key(j.jobid):
                    spruce_jobs.append(j)

            # if there are any pending jobs in high_prio queues, those are the only ones that can start
            if spruce_jobs:
                active_jobs = spruce_jobs

            # get the cutoff time for backfilling
            #
            # BRT: should we use 'has_resources' or 'is_active'?  has_resources returns to false once the resource epilogue
            # scripts have finished running while is_active only returns to false once the job (not just the running task) has
            # completely terminated.  the difference is likely to be slight unless the job epilogue scripts are heavy weight.
            temp_jobs = [job for job in self.jobs.q_get([{'has_resources': True}])
                         if job.queue in eq_class['queues']]
            end_times = []
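            # each end_times entry is a [location_list, end_time_seconds] pair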
            for job in temp_jobs:
                # take the max so that jobs which have gone overtime and are being killed
                # continue to cast a small backfilling shadow (we need this for the case
                # that the final job in a drained partition runs overtime -- which otherwise
                # allows things to be backfilled into the drained partition)

                ##*AdjEst*
                if running_job_walltime_prediction:
                    runtime_estimate = float(job.walltime_p)
                else:
                    runtime_estimate = float(job.walltime)

                end_time = max(
                    float(job.starttime) + 60 * runtime_estimate, now + 5 * 60)
                end_times.append([job.location, end_time])

            for res_name in eq_class['reservations']:
                cur_res = reservations_cache[res_name]

                if not cur_res.cycle:
                    end_time = float(cur_res.start) + float(cur_res.duration)
                else:
                    done_after = float(cur_res.duration) - (
                        (now - float(cur_res.start)) % float(cur_res.cycle))
                    if done_after < 0:
                        done_after += cur_res.cycle
                    end_time = now + done_after
                if cur_res.is_active():
                    for part_name in cur_res.partitions.split(":"):
                        end_times.append([[part_name], end_time])

            if not active_jobs:
                continue
            active_jobs.sort(self.utilitycmp)

            # now smoosh lots of data together to be passed to the allocator in the system component
            job_location_args = []
            for job in active_jobs:
                forbidden_locations = set()
                for res_name in eq_class['reservations']:
                    cur_res = reservations_cache[res_name]
                    if cur_res.overlaps(self.get_current_time(),
                                        60 * float(job.walltime) + SLOP_TIME):
                        forbidden_locations.update(
                            cur_res.partitions.split(":"))

                job_location_args.append({
                    'jobid': str(job.jobid),
                    'nodes': job.nodes,
                    'queue': job.queue,
                    'forbidden': list(forbidden_locations),
                    'utility_score': job.score,
                    'walltime': job.walltime,
                    'walltime_p': job.walltime_p,  #*AdjEst*
                    'attrs': job.attrs,
                    'user': job.user,
                })

            try:
                best_partition_dict = ComponentProxy(
                    self.COMP_SYSTEM).find_job_location(
                        job_location_args, end_times)
            except:
                self.logger.error("failed to connect to system component",
                                  exc_info=True)
                best_partition_dict = {}

            for jobid in best_partition_dict:
                job = self.jobs[int(jobid)]
                self._start_job(job, best_partition_dict[jobid])

        # print "took %f seconds for scheduling loop" % (time.time() - started_scheduling, )

    schedule_jobs = locking(automatic(schedule_jobs))

    def get_resid(self, queue_name):
        return None

    get_resid = exposed(get_resid)

    def enable(self, user_name):
        """Enable scheduling"""
        self.logger.info("%s enabling scheduling", user_name)
        self.active = True

    enable = exposed(enable)

    def disable(self, user_name):
        """Disable scheduling"""
        self.logger.info("%s disabling scheduling", user_name)
        self.active = False

    disable = exposed(disable)

    def set_res_id(self, id_num):
        """Set the reservation id number."""
        self.id_gen.set(id_num)
        logger.info("Reset res_id generator to %s." % id_num)

    set_res_id = exposed(set_res_id)

    def set_cycle_id(self, id_num):
        """Set the cycle id number."""
        self.cycle_id_gen.set(id_num)
        logger.info("Reset cycle_id generator to %s." % id_num)

    set_cycle_id = exposed(set_cycle_id)

    def force_res_id(self, id_num):
        """Override the id-generator and change the resid to id_num"""
        self.id_gen.idnum = id_num - 1
        logger.warning("Forced res_id generator to %s." % id_num)

    force_res_id = exposed(force_res_id)

    def force_cycle_id(self, id_num):
        """Override the id-generator and change the cycleid to id_num"""
        self.cycle_id_gen.idnum = id_num - 1
        logger.warning("Forced cycle_id generator to %s." % id_num)

    force_cycle_id = exposed(force_cycle_id)

    def get_next_res_id(self):
        """Get what the next resid number would be"""
        return self.id_gen.idnum + 1

    get_next_res_id = exposed(get_next_res_id)

    def get_next_cycle_id(self):
        """get what the next cycleid number would be"""
        return self.cycle_id_gen.idnum + 1

    get_next_cycle_id = exposed(get_next_cycle_id)

    def __flush_msg_queue(self):
        """Send queued messages to the database-writer component"""
        dbwriter.flush_queue()

    __flush_msg_queue = automatic(
        __flush_msg_queue, float(get_bgsched_config('db_flush_interval', 10)))
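
# --- Illustrative sketch (not part of the excerpt above) ---
# A self-contained rendering of the comparator mechanics used by BGSched:
# fifocmp() prefers an explicit job index and falls back to the jobid, and
# job lists are sorted with a cmp-style function (Python 2 list.sort).  The
# stub class and values below are made up for illustration.
class _StubJob(object):
    def __init__(self, jobid, index=None):
        self.jobid = jobid
        self.index = index

def _fifo_value(job):
    if job.index is not None:
        return int(job.index)
    return job.jobid

jobs = [_StubJob(7), _StubJob(3, index=1), _StubJob(5)]
jobs.sort(lambda a, b: cmp(_fifo_value(a), _fifo_value(b)))
assert [j.jobid for j in jobs] == [3, 5, 7]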
Exemplo n.º 7
0
class Simulator (ClusterBaseSystem):
    
    """Generic system simulator.
    
    Methods:
    configure -- load partitions from an xml file
    reserve_partition -- lock a partition for use by a process_group (exposed)
    release_partition -- release a locked (busy) partition (exposed)
    add_process_groups -- add (start) a process group on the system (exposed, query)
    get_process_groups -- retrieve process groups (exposed, query)
    wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query)
    signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query)
    update_partition_state -- simulates updating partition state from the bridge API (automatic)
    """
    
    name = "system"
    implementation = "cluster_simulator"
    
    logger = logger

    def __init__ (self, *args, **kwargs):
        ClusterBaseSystem.__init__(self, *args, **kwargs)
        self.process_groups.item_cls = ClusterProcessGroup
    
    
    def __setstate__(self, state):
        ClusterBaseSystem.__setstate__(self, state)
        self.process_groups.item_cls = ClusterProcessGroup
        
        
    def add_process_groups (self, specs):
        
        """Create a simulated process group.
        
        Arguments:
        spec -- dictionary hash specifying a process group to start
        """
        
        self.logger.info("add_process_groups(%r)" % (specs))
        process_groups = self.process_groups.q_add(specs)
        for process_group in process_groups:
            self.start(process_group)
        return process_groups
    add_process_groups = exposed(query(all_fields=True)(add_process_groups))
    
    def get_process_groups (self, specs):
        """Query process_groups from the simulator."""
        return self.process_groups.q_get(specs)
    get_process_groups = exposed(query(get_process_groups))
    
    def wait_process_groups (self, specs):
        """get process groups that have finished running."""
        self.logger.info("wait_process_groups(%r)" % (specs))
        process_groups = [pg for pg in self.process_groups.q_get(specs) if pg.exit_status is not None]
        for process_group in process_groups:
            self.logger.info("finished on hosts: %s", Cobalt.Util.merge_nodelist(self.process_groups[process_group.id].location))
            for host in self.process_groups[process_group.id].location:
                self.running_nodes.discard(host)
            del self.process_groups[process_group.id]
        return process_groups
    wait_process_groups = exposed(query(wait_process_groups))
    
    def signal_process_groups (self, specs, signame="SIGINT"):
        """Simulate the signaling of a process_group."""
        self.logger.info("signal_process_groups(%r, %r)" % (specs, signame))
        process_groups = self.process_groups.q_get(specs)
        for process_group in process_groups:
            process_group.signals.append(signame)
        return process_groups
    signal_process_groups = exposed(query(signal_process_groups))
    
    def start (self, process_group):
        thread.start_new_thread(self._mpirun, (process_group, ))
    
    def _mpirun (self, process_group):
        argv = process_group._get_argv()
        stdout = open(process_group.stdout or "/dev/null", "a")
        stderr = open(process_group.stderr or "/dev/null", "a")
        
        try:
            cobalt_log_file = open(process_group.cobalt_log_file or "/dev/null", "a")
            print >> cobalt_log_file, "%s\n" % " ".join(argv[1:])
            cobalt_log_file.close()
        except:
            logger.error("Job %s/%s:  unable to open cobaltlog file %s" % (process_group.id, process_group.user, process_group.cobalt_log_file))
        
        try:
            partition = argv[argv.index("-partition") + 1]
        except ValueError:
            print >> stderr, "ERROR: '-partition' is a required flag"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        except IndexError:
            print >> stderr, "ERROR: '-partition' requires a value"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        
        try:
            mode = argv[argv.index("-mode") + 1]
        except ValueError:
            print >> stderr, "ERROR: '-mode' is a required flag"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        except IndexError:
            print >> stderr, "ERROR: '-mode' requires a value"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        
        try:
            size = argv[argv.index("-np") + 1]
        except ValueError:
            print >> stderr, "ERROR: '-np' is a required flag"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        except IndexError:
            print >> stderr, "ERROR: '-np' requires a value"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        try:
            size = int(size)
        except ValueError:
            print >> stderr, "ERROR: '-np' got invalid value %r" % (size)
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        
        print >> stdout, "ENVIRONMENT"
        print >> stdout, "-----------"
        for key, value in process_group.env.iteritems():
            print >> stdout, "%s=%s" % (key, value)
        print >> stdout
        
        print >> stderr, "FE_MPI (Info) : Initializing MPIRUN"
        
        
        print >> stderr, "FE_MPI (Info) : process group with id", process_group.id
        print >> stderr, "FE_MPI (Info) : Waiting for process_group to terminate"
        
        print >> stdout, "Running process_group: %s" % " ".join(argv)
        
        start_time = time.time()
        run_time = random.randint(60, 180)
        my_exit_status = 0
         
        print "running for about %f seconds" % run_time
        while time.time() < (start_time + run_time):
            if "SIGKILL" in process_group.signals:
                process_group.exit_status = 1
                return
            elif "SIGTERM" in process_group.signals:
                print >> stderr, "FE_MPI (Info) : ProcessGroup got signal SIGTERM"
                my_exit_status = 1
                break
            else:
                time.sleep(1) # poll for signals once per second
        
        print >> stderr, "FE_MPI (Info) : ProcessGroup", process_group.id, "switched to state TERMINATED ('T')"
        print >> stderr, "FE_MPI (Info) : ProcessGroup sucessfully terminated"
        print >> stderr, "BE_MPI (Info) : Releasing partition", partition
        print >> stderr, "BE_MPI (Info) : Partition", partition, "switched to state FREE ('F')"
        print >> stderr, "BE_MPI (Info) : BE completed"
        print >> stderr, "FE_MPI (Info) : FE completed"
        print >> stderr, "FE_MPI (Info) : Exit status:", my_exit_status
        
        process_group.exit_status = my_exit_status
    
    
    
    def launch_diags(self, partition, test_name):
        exit_value = 0
        for nc in partition.node_cards:
            if nc.id in self.failed_components:
                exit_value = 1
        for switch in partition.switches:
            if switch in self.failed_components:
                exit_value = 2

        self.finish_diags(partition, test_name, exit_value)
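
The three near-identical try/except blocks in _mpirun above each pull one required flag's value out of argv. A minimal sketch of that pattern factored into a helper (hypothetical; not part of the simulator, shown only to make the parsing idiom explicit):

def get_required_flag(argv, flag):
    """Return (value, None) for the token after `flag`, or (None, error)."""
    try:
        return argv[argv.index(flag) + 1], None
    except ValueError:
        return None, "ERROR: '%s' is a required flag" % flag
    except IndexError:
        return None, "ERROR: '%s' requires a value" % flag

# e.g. get_required_flag(["mpirun", "-partition", "ANL-R00"], "-partition")
# -> ("ANL-R00", None)   (partition name is made up)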
Example No. 8
class HeckleSystem(Component):
    """
    Cobalt System component for handling / interacting with Heckle resource manager
    
    External Methods:
        add_process_groups -- allocates nodes
        get_process_groups -- get process groups based on specs
        signal_process_groups -- signal a process group
        wait_process_groups -- removed process groups based on specs
        
    Internal Methods:
        __init__:
        _start_pg:
        _check_builds_done:
        _wait:
        _release_resources:
        get_resources:
        
    Queue Manager Methods:
        validate_job:
        verify_locations:
        find_job_locations:
        find_queue_equivalence_classes:
    """

    name = "system"
    implementation = "HeckleBreadboard"
    queue_assignments = {}

    def __init__(self, *args, **kwargs):
        Component.__init__(self, *args, **kwargs)
        self.process_groups = ProcessGroupDict()
        self.process_groups.item_cls = HeckleProcessGroup
        self.queue_assignments["default"] = self.get_resources()
        self.hacky_forbidden_nodes = []  # temporary fix for the forbidden-nodes issue

    def __repr__(self):
        """
        printout representation of the class
        """
        indict = self.__dict__
        printstr = ""
        printstr += "Heckle System Object: Values"
        for element in indict:
            printstr += str(element) + "::"
            if indict[element] is None:
                printstr += "None, "
            else:
                printstr += str(indict[element]) + ", "
        printstr += "   Process Groups:"
        for element in self.process_groups:
            printstr += str(element) + "::" + \
            str(self.process_groups[element]) + ", "
        return printstr

    #####################
    # Main set of methods
    #####################
    def add_process_groups(self, specs):
        """
        This function takes the specs (a list of jobs) and initiates each job as
        a process group.
        The process group abstracts the actual job into an object, providing a 
        single point of control and interaction for all the nodes within that job.
        Each job is described by a dict.  Each dict contains:
            size:  
            kernel: a String, the name of the kernel image to load.
            executable: A string, the name of the command to execute upon the
                head node; this could be considered the actual job's file.
            stdin, stdout, stderr:  Three separate strings, each containing
                the file to use for standard communication with the job as it
                is running.  May be specified, or False.
            kerneloptions: A string containing various options for the kernel,
                or False.
            args: A list
            umask: An integer
            jobid: An integer
            cobalt_log_file: A string containing the log file to use in the
                initiation and running of the job itself.
            location:  List of strings of node / resource names
            env:  A dict of key:value strings, specifying the environment in
                which the job is to run on the node
            id: A number
            mode:
            nodect:
            cwd:  A string, specifying the current working directory in which
                to run the job on the node
            walltime:  Integer; the time, in minutes, allocated for the job
                to run on the node.
            user:  A string, the name of the user under which this job is to run.
        """
        logstr = "System:add_process_groups:"
        LOGGER.debug(logstr + "Specs are %s" % specs)
        return self.process_groups.q_add(specs)

    add_process_groups = exposed(query(add_process_groups))

    def get_process_groups(self, specs):
        """get a list of existing allocations"""
        LOGGER.debug("System:get_process_groups: specs are %s" % specs)
        self._wait()
        return self.process_groups.q_get(specs)

    get_process_groups = exposed(query(get_process_groups))

    def signal_process_groups(self, specs, sig):
        """Free the specified process group (set of allocated nodes)"""
        LOGGER.debug("System:signal_process_groups: Specs are %s, sig is %s"
                     % (specs, sig))
        return self.process_groups.q_get(specs, lambda x, y: x.signal(y), sig)

    signal_process_groups = exposed(query(signal_process_groups))

    def wait_process_groups(self, specs):
        """Remove terminated process groups"""
        LOGGER.debug("System:wait_process_groups; specs are %s" % specs)
        return self.process_groups.q_del(specs,
                                         lambda x, _: self._release_resources(x))

    wait_process_groups = exposed(query(wait_process_groups))

    #########################################
    # Methods for dealing with Process Groups
    #########################################

    def _check_builds_done(self):
        """
        Check to see if the nodes are done building
        Starts the process group if all nodes in them are done building
        """
        #LOGGER.debug( "System:Check Build Done: Waiting to Start..." )
        #sleep(20)
        exstr = "System:check_build_done:"
        retval = True
        pg_list = [x for x in self.process_groups.itervalues()
                   if len(x.pinging_nodes) > 0]
        hiccup = HeckleConnector()
        for pgp in pg_list:
            for nodename in list(pgp.pinging_nodes):  # copy; entries are removed while iterating
                teststr = hiccup.get_node_bootstate(nodename)
                if teststr == "READY":
                    if 'fakebuild' in pgp.__dict__ and pgp.fakebuild:
                        pgp.pinging_nodes.remove(nodename)
                        LOGGER.debug( exstr + "Node %s done building; "\
                             + "%s pinging nodes left" %\
                             ( nodename, len(pgp.pinging_nodes)-1 ) )
                    else:
                        LOGGER.debug( exstr + "Node %s not done yet" %\
                                          nodename )
                if teststr == "COMPLETED":
                    LOGGER.debug( exstr +
                         "Removing node %s...%i pinging nodes left" \
                              % (nodename, len(pgp.pinging_nodes)-1) )
                    pgp.pinging_nodes.remove(nodename)
                elif teststr in ["BOOTING", "", ""]:
                    LOGGER.debug(exstr + "Node %s not done yet." % nodename)
                elif teststr == "UNALLOCATED":
                    raise Exception(
                        exstr +
                        "Node 'UNALLOCATED'; Possible build error, or system timed out."
                    )
                elif teststr == "CRITFAIL":
                    raise Exception(
                        exstr +
                        "Node says, 'CRITFAIL'.  It timed out while building.")
                #####################
                ####     Need to figure a better way to fail gracefully
                #####################
            if len(pgp.pinging_nodes) == 0:
                LOGGER.debug(
                    "System:Check Build Done: No Pinging Nodes left, Start PG %s Running."
                    % pgp.jobid)
                pgp.start()
            else:
                retval = False
        return retval

    _check_builds_done = automatic(_check_builds_done)

    def _wait(self):
        """
        Calls the process group container's wait() method
        """
        waitlen = len(self.process_groups.keys())
        LOGGER.debug("System:_wait:%s process groups." % waitlen)
        for pgp in self.process_groups.itervalues():
            pgp.wait()
            for node in pgp.location:
                if node in self.hacky_forbidden_nodes:
                    self.hacky_forbidden_nodes.remove(node)

    _wait = automatic(_wait)

    def _release_resources(self, pgp):
        """
        Releases all the Heckle nodes, unreserving them
        """
        LOGGER.debug("System:release")
        LOGGER.debug("System:Locations are: %s" % pgp.location)
        hiccup = HeckleConnector()
        hiccup.free_reserved_node(uid=pgp.uid, node_list=pgp.location)
        for node in pgp.location:
            if node in self.hacky_forbidden_nodes:
                self.hacky_forbidden_nodes.remove(node)

    def get_resources(self, specs=None):
        """
        Returns a list of free resources (nodes) which match the given specs.
        Specs is a dict which describes a job
        """
        LOGGER.debug("System:get Resources")
        ##################################
        ###  Look at this as a future change
        ##################################
        hiccup = HeckleConnector()
        if not specs:
            return hiccup.node_list
        else:
            return hiccup.list_available_nodes(**specs)

    get_resources = exposed(query(get_resources))
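
    # Illustrative call (hypothetical attribute names, for orientation only):
    #   system.get_resources({'memory': 4096, 'kernel': 'default'})
    # would return the free Heckle nodes whose attributes match the spec,
    # while get_resources() with no spec returns every node Heckle knows about.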

    ##########################################################
    # Methods for interacting with scheduler and queue-manager
    ##########################################################

    def validate_job(self, spec):
        """
        Validates a job for submission
        -- will the job ever run under the current Heckle configuration?
        Steps:
            1)  Validate Kernel
            2)  Validate HW
            3)  Validate Job versus overall
        """
        LOGGER.debug("System:Validate Job: Specs are %s" % spec)
        hiccup = HeckleConnector()
        try:
            kernel = spec['kernel']
            valid_kernel = hiccup.validkernel(kernel)
            if not valid_kernel:
                raise Exception("System:Validate Job: Bad Kernel")
        except:
            spec['kernel'] = 'default'
        try:
            valid_hw = hiccup.validhw(**spec['attrs'])
            if not valid_hw:
                raise Exception("System:Validate Job: Bad Hardware Specs: %s" %
                                spec)
        except Exception as strec:
            raise Exception("System:Validate Job:  Validate Job: %s" % strec)
        #try:
        #valid_job = hiccup.validjob( **spec )
        #if not valid_job:
        #raise Exception(
        #"System: validate Job:  Never enough nodes")
        #except:
        #raise Exception("System: validate Job: Never enough nodes")
        return spec

    validate_job = exposed(validate_job)

    def verify_locations(self, location_list):
        """
        Makes sure a location list is valid
        location list is a list of fully qualified strings of node names
        ex:  nodename.mcs.anl.gov
        """
        LOGGER.debug("System:validate Job: Verify Locations")
        hiccup = HeckleConnector()
        heckle_set = set(hiccup.list_all_nodes())
        location_set = set(location_list)
        if heckle_set >= location_set:
            return location_list
        else:
            not_valid_list = list(location_set.difference(heckle_set))
            raise Exception(
                "System:VerifyLocations: Invalid location names: %s" %
                not_valid_list)

    verify_locations = exposed(verify_locations)

    def find_job_location(self, job_location_args, end_times):
        """
        Finds a group of not-busy nodes in which to run the job
        
        Arguments:
            job_location_args -- A list of dictionaries with info about the job
                jobid -- string identifier
                nodes -- int number of nodes
                queue -- string queue name
                required -- ??
                utility_score -- ??
                threshold -- ??
                walltime -- ??
                attrs -- dictionary of attributes to match against
            end_times -- supposed time the job will end
            
        Returns: Dictionary with list of nodes a job can run on, keyed by jobid
        """
        LOGGER.debug("System:find_job_location")
        locations = {}

        def jobsort(job):
            """Used to sort job list by utility score"""
            return job["utility_score"]

        job_location_args.sort(key=jobsort)

        #Try to match jobs to nodes which can run them
        hiccup = HeckleConnector()
        for job in job_location_args:
            if "attrs" not in job or job["attrs"] is None:
                job["attrs"] = {}
            print "Job is %s" % job
            tempjob = job.copy()
            if self.hacky_forbidden_nodes:
                if 'forbidden' not in tempjob.keys():
                    tempjob['forbidden'] = self.hacky_forbidden_nodes
                else:
                    tempjob['forbidden'].extend(self.hacky_forbidden_nodes)
            #############################
            ###  Look at this as point of change
            ###  Think:  For node in unreserved nodes
            ###            Choose node from list
            ###            Remove node from unreserved nodes
            #############################
            try:
                resources = hiccup.find_job_location(**job)  # get matching nodes
                if not resources:
                    continue
            except Exception as err:
                LOGGER.info("System:find_job_location: Error %s" % err)
                continue
            node_list = []
            # Build a list of appropriate nodes
            for node in resources:
                node_list.append(node)
                self.hacky_forbidden_nodes.append(node)
            locations[job["jobid"]] = node_list
        LOGGER.info("System:find_job_location: locations are %s" % locations)
        return locations

    find_job_location = exposed(find_job_location)


    def find_queue_equivalence_classes(self, reservation_dict, \
                                                        active_queue_names):
        """
        Finds equivalent queues
        An equivalent queue is a queue which can run upon the same partition(s)
        For now, with one partition (everything!) this is irrelevant.
        Returns: equiv= [{'reservations': [], 'queues': ['default']}]
        """
        #LOGGER.debug("System:find queue equivalence classes" )
        equiv = []
        #print "Reservation_Dict is: %s" % reservation_dict
        #print "Active_queue_names is %s" % active_queue_names
        #print "Queue assignments are: %s" % self.queue_assignments
        for queue in self.queue_assignments:
            # skip queues that aren't running
            if not queue in active_queue_names:
                continue
            found_a_match = False
            #print "Heckle Queue is %s" % queue
            for equ in equiv:
                print "Heckle Equ is %s" % equ
                if equ['data'].intersection(self.queue_assignments[queue]):
                    equ['queues'].add(queue)
                    equ['data'].update(self.queue_assignments[queue])
                    found_a_match = True
                    break
            if not found_a_match:
                equiv.append({
                    'queues': set([queue]),
                    'data': set(self.queue_assignments[queue]),
                    'reservations': set()
                })
        real_equiv = []
        for eq_class in equiv:
            found_a_match = False
            for equ in real_equiv:
                if equ['queues'].intersection(eq_class['queues']):
                    equ['queues'].update(eq_class['queues'])
                    equ['data'].update(eq_class['data'])
                    found_a_match = True
                    break
            if not found_a_match:
                real_equiv.append(eq_class)
        equiv = real_equiv
        for eq_class in equiv:
            for res_name in reservation_dict:
                for host_name in reservation_dict[res_name].split(":"):
                    if host_name in eq_class['data']:
                        eq_class['reservations'].add(res_name)
            for key in eq_class:
                eq_class[key] = list(eq_class[key])
            del eq_class['data']
        return equiv

    find_queue_equivalence_classes = exposed(find_queue_equivalence_classes)

    def get_partitions(self, locations):
        """
        Work-around to get the cqadm to run a single job on this system
        PRE:  locations is a list of dict of strings of possible node names
        POST:  if good, return locations
                if not good, raise exception and list bad nodes
        """
        logstr = "System:get_partition: "
        hiccup = HeckleConnector()
        heckle_node_set = set(hiccup.list_all_nodes())
        locs = locations[0]['name']
        LOGGER.debug(logstr + "raw is are: %s" % locations)
        LOGGER.debug(logstr + "vals are: %s" % locs)
        if type(locs) == ListType:
            locset = set(locs)
            badlocations = locset.difference(heckle_node_set)
            if badlocations:
                raise Exception(logstr +
                                "Bad Locations: %s " % list(badlocations))
        elif type(locs) == StringType:
            if locs not in heckle_node_set:
                raise Exception(logstr + "Bad Locations: %s" % locs)
        else:
            raise Exception(logstr +
                "location needs to be a string or list of strings; you provided %s: %s"
                % (type(locs), locs))
        return locations

    get_partitions = exposed(get_partitions)
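
find_queue_equivalence_classes above merges queues whose assigned node sets overlap, then runs a second pass (real_equiv) to merge classes the first pass left split. A standalone sketch of the first-pass merge on toy data (queue and node names are hypothetical):

def merge_queue_classes(queue_assignments):
    """Group queues into classes whose node sets intersect (first pass only)."""
    equiv = []
    for queue, nodes in queue_assignments.items():
        for cls in equiv:
            if cls['data'].intersection(nodes):
                cls['queues'].add(queue)
                cls['data'].update(nodes)
                break
        else:
            equiv.append({'queues': set([queue]), 'data': set(nodes)})
    return equiv

# merge_queue_classes({'default': set(['n1', 'n2']),
#                      'debug':   set(['n2']),
#                      'io':      set(['n9'])})
# -> one class with queues {'default', 'debug'} (data {'n1', 'n2'})
#    and one with queues {'io'} (data {'n9'})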
Example No. 9
        for p_name in self._managed_partitions:
            p = self._partitions[p_name]

            ret += "   <Partition name='%s'>\n" % p.name
            for nc in p.node_cards:
                ret += "      <NodeCard id='%s' />\n" % nc.id
            for s in p.switches:
                ret += "      <Switch id='%s' />\n" % s
            ret += "   </Partition>\n"

        ret += "</PartitionList>\n"

        ret += "</BG>\n"

        return ret

    generate_xml = exposed(generate_xml)

    def validate_job(self, spec):
        """
        validate a job for submission

        Arguments:
        spec -- job specification dictionary
        """
        spec = BGBaseSystem.validate_job(self, spec)
        if not self._validate_kernel(spec['kernel']):
            raise JobValidationError("kernel does not exist")
        return spec

    validate_job = exposed(validate_job)
Example No. 10
        for p_name in self._managed_partitions:
            p = self._partitions[p_name]

            ret += "   <Partition name='%s'>\n" % p.name
            for nc in p.node_cards:
                ret += "      <NodeCard id='%s' />\n" % nc.id
            for s in p.switches:
                ret += "      <Switch id='%s' />\n" % s
            ret += "   </Partition>\n"
        
        ret += "</PartitionList>\n"

        ret += "</BG>\n"
            
        return ret
    generate_xml = exposed(generate_xml)
    
    def validate_job(self, spec):
        """
        validate a job for submission

        Arguments:
        spec -- job specification dictionary
        """
        spec = BGBaseSystem.validate_job(self, spec)
        if not self._validate_kernel(spec['kernel']):
            raise JobValidationError("kernel does not exist")
        return spec
    validate_job = exposed(validate_job)

    def launch_diags(self, partition, test_name):
Example No. 11
class ClusterSystem(ClusterBaseSystem):
    """cluster system component.

    Methods:
    configure -- load partitions from the bridge API
    add_process_groups -- add (start) an mpirun process on the system (exposed, ~query)
    get_process_groups -- retrieve mpirun processes (exposed, query)
    wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query)
    signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query)
    update_partition_state -- update partition state from the bridge API (runs as a thread)
    """

    name = "system"
    implementation = "cluster_system"

    logger = logger

    def __init__(self, *args, **kwargs):
        ClusterBaseSystem.__init__(self, *args, **kwargs)
        self.process_groups.item_cls = ClusterProcessGroup

    def __getstate__(self):
        state = {}
        state.update(ClusterBaseSystem.__getstate__(self))
        # state.update({
        #         "cluster_system_version": 1 })
        return state

    def __setstate__(self, state):
        ClusterBaseSystem.__setstate__(self, state)
        self.process_groups.item_cls = ClusterProcessGroup

    def add_process_groups(self, specs):
        """Create a process group.

        Arguments:
        spec -- dictionary hash specifying a process group to start
        """

        self.logger.info("add_process_groups(%r)", specs)
        process_groups = self.process_groups.q_add(specs)
        for pgroup in process_groups:
            self.logger.info(
                "Job %s/%s: process group %s created to track script",
                pgroup.user, pgroup.jobid, pgroup.id)
        #System has started the job.  We need to remove them from the temporary
        #allocation array in cluster_base_system.
        self.apg_started = True
        for pgroup in process_groups:
            for location in pgroup.location:
                try:
                    del self.alloc_only_nodes[location]
                except KeyError:
                    logger.critical(
                        "%s already removed from alloc_only_nodes list",
                        location)
        return process_groups

    add_process_groups = exposed(query(add_process_groups))

    def get_process_groups(self, specs):
        self._get_exit_status()
        return self.process_groups.q_get(specs)

    get_process_groups = exposed(query(get_process_groups))

    def _get_exit_status(self):
        children = {}
        cleanup = {}
        for forker in ['user_script_forker']:
            try:
                for child in ComponentProxy(forker).get_children(
                        "process group", None):
                    children[(forker, child['id'])] = child
                    child['pg'] = None
                cleanup[forker] = []
            except ComponentLookupError, e:
                self.logger.error(
                    "failed to contact the %s component to obtain a list of children",
                    forker)
            except:
Example No. 12
    _get_exit_status = automatic(
        _get_exit_status,
        float(get_cluster_system_config('get_exit_status_interval', 10)))

    def wait_process_groups(self, specs):
        self._get_exit_status()
        process_groups = [
            pg for pg in self.process_groups.q_get(specs)
            if pg.exit_status is not None
        ]
        for process_group in process_groups:
            self.clean_nodes(process_group.location, process_group.user,
                             process_group.jobid)
        return process_groups

    wait_process_groups = locking(exposed(query(wait_process_groups)))

    def signal_process_groups(self, specs, signame="SIGINT"):
        my_process_groups = self.process_groups.q_get(specs)
        for pg in my_process_groups:
            if pg.exit_status is None:
                try:
                    ComponentProxy(pg.forker).signal(pg.head_pid, signame)
                except:
                    self.logger.error(
                        "Failed to communicate with forker when signalling job"
                    )

        return my_process_groups

    signal_process_groups = exposed(query(signal_process_groups))
Example No. 13
                    ComponentProxy(forker).cleanup_children(cleanup[forker])
                except ComponentLookupError:
                    self.logger.error("failed to contact the %s component to cleanup children", forker)
                except:
                    self.logger.error("unexpected exception while requesting that the %s component perform cleanup",
                        forker, exc_info=True)
    _get_exit_status = automatic(_get_exit_status,
            float(get_cluster_system_config('get_exit_status_interval', 10)))

    def wait_process_groups (self, specs):
        self._get_exit_status()
        process_groups = [pg for pg in self.process_groups.q_get(specs) if pg.exit_status is not None]
        for process_group in process_groups:
            self.clean_nodes(process_group.location, process_group.user, process_group.jobid)
        return process_groups
    wait_process_groups = locking(exposed(query(wait_process_groups)))

    def signal_process_groups (self, specs, signame="SIGINT"):
        my_process_groups = self.process_groups.q_get(specs)
        for pg in my_process_groups:
            if pg.exit_status is None:
                try:
                    ComponentProxy(pg.forker).signal(pg.head_pid, signame)
                except:
                    self.logger.error("Failed to communicate with forker when signalling job")

        return my_process_groups
    signal_process_groups = exposed(query(signal_process_groups))

    def del_process_groups(self, jobid):
        '''delete a process group and don't track it anymore.
Example No. 14
class BBSystem(Component):
    """Breadboard system component.

    Methods:
    add_process_groups -- allocates nodes
    get_process_groups -- get process groups based on specs
    signal_process_groups -- signal a process group
    wait_process_groups -- removed process groups based on specs
    """

    name = "system"
    implementation = "Breadboard"

    def __init__(self, *args, **kwargs):
        Component.__init__(self, *args, **kwargs)
        self.resources = ResourceDict()
        self.process_groups = ProcessGroupDict()
        self.process_groups.item_cls = BBProcessGroup
        self.queue_assignments = {}
        self.queue_assignments["default"] = sets.Set(self.resources)

    #####################
    # Main set of methods
    #####################
    def add_process_groups(self, specs):
        """Allocate nodes and add the list of those allocated to the PGDict"""
        return self.process_groups.q_add(specs, lambda x, _: self._start_pg(x))

    add_process_groups = exposed(query(add_process_groups))

    def get_process_groups(self, specs):
        """Get a list of existing allocations"""
        self._wait()
        return self.process_groups.q_get(specs)

    get_process_groups = exposed(query(get_process_groups))

    def signal_process_groups(self, specs, sig):
        """Free the specified process group (set of allocated nodes)"""
        return self.process_groups.q_get(specs, lambda x, y: x.signal(y), sig)

    signal_process_groups = exposed(query(signal_process_groups))

    def wait_process_groups(self, specs):
        """Remove terminated process groups"""
        return self.process_groups.q_del(
            specs, lambda x, _: self._release_resources(x))

    wait_process_groups = exposed(query(wait_process_groups))

    #########################################
    # Methods for dealing with Process Groups
    #########################################
    def _start_pg(self, pgp):
        """Starts a process group by initiating building/rebooting nodes"""

        ###########################################
        ### The following is for back-compatibility
        ### with bballoc (bbtools) until breadboard
        ### is switched entirely to run on cobalt
        ###########################################
        bbdata = bblib.BBConfig("/etc/bb.xml")
        bbdata.SetNodeAttr(pgp.location, {
            "user": pgp.user,
            "state": "Cobalt",
            "comment": "Managed by Cobalt"
        })
        bbdata.WriteAndClose()
        ###########################################
        ### End of back-compatibility
        ###########################################

        specs = [{"name": name, "attributes": "*"} for name in pgp.location]
        resources = self.get_resources(specs)
        action = "build-%s" % pgp.kernel
        for res in resources:
            # Set build action for each resource
            specs = [{"name": res.name}]
            new_attrs = {"attributes": {"action": action}}
            self.set_attributes(specs, new_attrs)
            mac = res.attributes["mac"]
            linkname = "/tftpboot/pxelinux.cfg/01-%s" \
                % mac.replace(":", "-").lower()
            if os.readlink(linkname) == action:
                continue
            os.unlink(linkname)
            os.symlink(action, linkname)
        for res in resources:
            # Cycle power
            os.system("/usr/sbin/pm -c %s" % res.name)
            # Add resource to list of building nodes
            pgp.building_nodes.append(res.name)
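
    # Illustrative helper (an assumption, not part of the original component):
    # shows how the pxelinux.cfg symlink name used in _start_pg above is
    # derived from a node's MAC address.
    def _pxe_linkname(self, mac):
        """Return the PXE config symlink path for a MAC like 00:1A:2B:3C:4D:5E."""
        return "/tftpboot/pxelinux.cfg/01-%s" % mac.replace(":", "-").lower()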

    def _check_builds_done(self):
        """Checks if nodes are done building for each process group and
        scripts can begin running"""
        for pgp in [
                x for x in self.process_groups.itervalues()
                if (len(x.building_nodes) > 0 or len(x.pinging_nodes) > 0)
        ]:
            specs = [{
                "name": name,
                "attributes": "*"
            } for name in pgp.building_nodes]
            building = self.get_resources(specs)
            build_action = "build-%s" % pgp.kernel
            for node in building:
                if node.attributes["action"] != build_action:
                    pgp.building_nodes.remove(node.name)
                    pgp.pinging_nodes.append(node.name)
            for nodename in pgp.pinging_nodes:
                if os.system("/bin/ping -c 1 -W 1 %s > /dev/null" % nodename):
                    continue
                pgp.pinging_nodes.remove(nodename)
            if len(pgp.building_nodes) == 0 and len(pgp.pinging_nodes) == 0:
                pgp.start()

    _check_builds_done = automatic(_check_builds_done)

    def node_done_building(self, node):
        """Sets a node as done building
        
        Arguments:
        node -- string name of node that is done building

        Returns: nothing
        """
        specs = [{"name": node, "attributes": "*"}]
        nodedata = self.get_resources(specs)
        if len(nodedata) > 0:
            buildimage = nodedata[0].attributes["action"]
            nodedata[0].attributes["action"] = buildimage.replace(
                "build-", "boot-")

    node_done_building = exposed(node_done_building)

    def _wait(self):
        """Calls the process group container's wait() method"""
        for pgp in self.process_groups.itervalues():
            pgp.wait()

    _wait = automatic(_wait)

    def _release_resources(self, pgp):
        """Releases the resources held by a process group"""
        os.system("/usr/sbin/pm -0 %s" % " ".join(pgp.location))
        specs = [{"name": name} for name in pgp.location]
        new_attrs = {"state": "idle"}
        self.set_attributes(specs, new_attrs)

        ###########################################
        ### The following is for back-compatibility
        ### with bballoc (bbtools) until breadboard
        ### is switched entirely to run on cobalt
        ###########################################
        bbdata = bblib.BBConfig("/etc/bb.xml")
        bbdata.SetNodeAttr(pgp.location, {"user": "******"})
        bbdata.WriteAndClose()
        ###########################################
        ### End of back-compatibility
        ###########################################

    ####################################
    # Methods for dealing with resources
    ####################################
    def add_resources(self, specs):
        """Add a resource to this system
        
        Arguments:
        specs -- A list of dictionaries with the attributes for the resources
        
        Returns: list of values added
        """
        try:
            ret = self.resources.q_add(specs)
            for res in ret:
                self.queue_assignments["default"].add(res)
        except KeyError:
            ret = "KeyError"
        return ret

    add_resources = exposed(query(add_resources))

    def remove_resources(self, specs):
        """Remove a resource from this system
        
        Arguments:
        specs -- A list of dictionaries with the attributes to pick which
                 resources to remove

        Returns: list of resources removed
        """
        ret = self.resources.q_del(specs)
        for res in ret:
            self.queue_assignments["default"].discard(res)
        return ret

    remove_resources = exposed(remove_resources)

    def get_resources(self, specs):
        """Returns a list of all the resources for this system matching the
        given specs (list of dictionaries)"""
        return self.resources.q_get(specs)

    get_resources = exposed(query(get_resources))

    def set_attributes(self, specs, newattrs):
        """Sets an attribute in specified resources

        Arguments:
        specs -- list of dictionaries with resource attributes to match
        newattrs -- a dictionary with key:val pairs of attributes to set

        Returns: a list of the changed resources
        """
        return self.resources.q_get(
            specs,
            lambda x, y: [set_attr(x, key, val) for key, val in y.iteritems()],
            newattrs)

    set_attributes = exposed(query(set_attributes))

    def remove_attributes(self, specs, attrs):
        """Removes other attributes in specified resources

        Arguments:
        specs -- list of dictionaries with resource attributes to match
        attrs -- list of names of attributes to remove from resource.attributes

        Returns: a list of the changed resources
        """
        return self.resources.q_get(
            specs, lambda x, y: [rem_attr(x, key) for key in y], attrs)

    remove_attributes = exposed(query(remove_attributes))

    ##########################################################
    # Methods for interacting with scheduler and queue-manager
    ##########################################################
    def validate_job(self, spec):
        """Validate a job for submission

        Arguments:
        spec -- job specification dictionary
        """
        max_nodes = len(
            self.get_resources([{
                "name": "*",
                "functional": True,
                "scheduled": True
            }]))
        try:
            spec["nodecount"] = int(spec["nodecount"])
        except ValueError:
            raise JobValidationError("Non-integer node count")
        if not 0 < spec["nodecount"] <= max_nodes:
            raise JobValidationError("Node count out of realistic range")
        if float(spec["time"]) < 15:
            raise JobValidationError("Walltime less than minimum 15 minutes")
        if "kernel" in spec:
            if not (os.path.exists(
                    "/tftpboot/pxelinux.cfg/build-%s" % spec["kernel"])
                    and os.path.exists(
                        "/tftpboot/pxelinux.cfg/boot-%s" % spec["kernel"])):
                raise JobValidationError(
                    "Specified image %s (from the -k 'kernel' flag) does not exist"
                    % spec["kernel"])
        if "attrs" in spec:
            matched_res = self.resources.get_attr_matched_resources(
                [{
                    "name": "*",
                    "functional": True,
                    "scheduled": True,
                    "attributes": "*"
                }], spec["attrs"])
            if spec["nodecount"] > len(matched_res):
                raise JobValidationError("Not enough nodes exist with the " +
                                         "attributes to match")
        return spec

    validate_job = exposed(validate_job)
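
    # Illustrative only: a spec of the shape validate_job above accepts.
    # Every value here is hypothetical; "kernel" must name images present
    # under /tftpboot/pxelinux.cfg as build-<kernel> and boot-<kernel>.
    _EXAMPLE_JOB_SPEC = {
        "nodecount": "4",     # coerced to int; must satisfy 0 < n <= free nodes
        "time": "30",         # walltime in minutes; at least 15 required
        "kernel": "default",  # optional
        "attrs": {},          # optional attribute constraints to match
    }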

    def verify_locations(self, location_list):
        """Makes sure a 'location string' is valid"""
        resources = self.get_resources([{"name": r} for r in location_list])
        return [r.name for r in resources]

    verify_locations = exposed(verify_locations)

    def find_job_location(self, job_location_args, end_times):
        """Finds and reserves a list of nodes in which the job can run
        
        Arguments:
        job_location_args -- A list of dictionaries with info about the job
            jobid -- string identifier
            nodes -- int number of nodes
            queue -- string queue name
            required -- ??
            utility_score -- ??
            threshold -- ??
            walltime -- ??
            attrs -- dictionary of attributes to match against
        end_times -- supposed time the job will end

        Returns: Dictionary with list of nodes a job can run on, keyed by jobid
        """
        locations = {}

        def jobsort(job):
            """Used to sort job list by utility score"""
            return job["utility_score"]

        job_location_args.sort(key=jobsort)
        for job in job_location_args:
            specs = [{
                "name": "*",
                "functional": True,
                "scheduled": True,
                "state": "idle",
                "attributes": "*"
            }]
            if "attrs" not in job or job["attrs"] is None:
                job["attrs"] = {}
            resources = self.resources.get_attr_matched_resources(
                specs, job["attrs"])
            if len(resources) < job["nodes"]:
                #Can't schedule job - not enough resources
                continue

            def namesort(res):
                """Used to sort resources by name"""
                return res.name

            resources.sort(key=namesort)
            used_resources = resources[:job["nodes"]]
            for res in used_resources:
                res.state = "busy"
            locations[job["jobid"]] = [r.name for r in used_resources]
        return locations

    find_job_location = exposed(find_job_location)

    def find_queue_equivalence_classes(self, reservation_dict,
                                       active_queue_names):
        """Finds equivalent queues"""
        equiv = []
        for queue in self.queue_assignments:
            # skip queues that aren't running
            if not queue in active_queue_names:
                continue
            found_a_match = False
            for equ in equiv:
                if equ['data'].intersection(self.queue_assignments[queue]):
                    equ['queues'].add(queue)
                    equ['data'].update(self.queue_assignments[queue])
                    found_a_match = True
                    break
            if not found_a_match:
                equiv.append({
                    'queues': set([queue]),
                    'data': set(self.queue_assignments[queue]),
                    'reservations': set()
                })
        real_equiv = []
        for eq_class in equiv:
            found_a_match = False
            for equ in real_equiv:
                if equ['queues'].intersection(eq_class['queues']):
                    equ['queues'].update(eq_class['queues'])
                    equ['data'].update(eq_class['data'])
                    found_a_match = True
                    break
            if not found_a_match:
                real_equiv.append(eq_class)
        equiv = real_equiv
        for eq_class in equiv:
            for res_name in reservation_dict:
                for host_name in reservation_dict[res_name].split(":"):
                    if host_name in eq_class['data']:
                        eq_class['reservations'].add(res_name)
            for key in eq_class:
                eq_class[key] = list(eq_class[key])
            del eq_class['data']
        return equiv

    find_queue_equivalence_classes = exposed(find_queue_equivalence_classes)
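
BBSystem.find_job_location above sorts the candidate jobs by utility score and then hands each job the first idle nodes in name order. A toy sketch of that node-picking step (resource names are made up):

def pick_nodes(free_node_names, count):
    """Take `count` nodes in name order, mirroring the namesort slice above."""
    if len(free_node_names) < count:
        return []  # not enough resources; the job is skipped this pass
    return sorted(free_node_names)[:count]

# pick_nodes(['bb07', 'bb03', 'bb12'], 2) -> ['bb03', 'bb07']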
Example No. 15
               #p = subprocess.check_call(args)
               self.logger.debug("Heckle_FORKER: Fork: About to exec with %s" % cmd)
               os.execvpe(cmd, (cmd, ), environ)
               #os.execl(*cmd)
               self.logger.debug("Heckle_FORKER: Fork: Finished Execution")
          else:
               self.logger.debug("Heckle_FORKER: Fork: Child")
               local_id = self.id_gen.next()
               kid = Child()
               kid.id = local_id
               kid.pid = child_pid
               kid.label = "%s/%s" % (label, local_id)
               self.children[local_id] = kid
               self.logger.info("task %s: forked with pid %s", kid.label, kid.pid)
               return local_id
    fork = exposed(fork)
    
    def signal (self, local_id, signame):
        """Signal a child process.
        
        Arguments:
        local_id -- id of the child to signal
        signame -- signal name
        """
        if not self.children.has_key(local_id):
            self.logger.error("signal found no child with id %s", local_id)
            return

        kid = self.children[local_id]
        self.logger.info("task %s: sending %s to pid %s", kid.label, signame, kid.pid)
        try:
Example No. 16
class OrcmBaseSystem(Component):
    """base system class.

    Methods:
    add_partitions -- tell the system to manage partitions (exposed, query)
    get_partitions -- retrieve partitions in the simulator (exposed, query)
    del_partitions -- tell the system not to manage partitions (exposed, query)
    set_partitions -- change random attributes of partitions (exposed, query)
    update_relatives -- should be called when partitions are added and removed from the managed list
    """
    def __init__(self, *args, **kwargs):
        orcm.orcmapi_init()
        Component.__init__(self, *args, **kwargs)
        self.process_groups = ProcessGroupDict()
        self.all_nodes = set()
        self.running_nodes = set()
        self.down_nodes = set()
        self.queue_assignments = {}
        self.node_order = {}

        self.configure()

        self.queue_assignments["default"] = set(self.all_nodes)
        self.alloc_only_nodes = {}  # nodename:starttime
        self.cleaning_processes = []
        #keep track of which jobs still have hosts being cleaned
        self.cleaning_host_count = {}  # jobid:count
        self.locations_by_jobid = {}  #jobid:[locations]
        self.jobid_to_user = {}  #jobid:username

        self.alloc_timeout = int(
            get_orcm_system_config("allocation_timeout", 300))

        self.logger.info("allocation timeout set to %d seconds." %
                         self.alloc_timeout)

    def __del__(self):
        orcm.orcmapi_finalize()

    def __getstate__(self):
        state = {}
        state.update(Component.__getstate__(self))
        state.update({
            "orcm_base_version": 1,
            "queue_assignments": self.queue_assignments,
            "down_nodes": self.down_nodes
        })
        return state

    def __setstate__(self, state):
        Component.__setstate__(self, state)
        self.all_nodes = set()
        self.node_order = {}
        self.configure()
        self.queue_assignments = state.get('queue_assignments', {})
        nonexistent_queues = []
        #make sure we can't try and schedule nodes that don't exist
        if self.queue_assignments == {}:
            self.queue_assignments["default"] = set(self.all_nodes)
        else:
            #remove nodes that have disappeared
            for queue, nodes in self.queue_assignments.iteritems():
                corrected_nodes = self.all_nodes & set(nodes)
                if corrected_nodes == set():
                    nonexistent_queues.append(queue)
                self.queue_assignments[queue] = corrected_nodes
            for queue in nonexistent_queues:
                del self.queue_assignments[queue]
        self.down_nodes = self.all_nodes & set(state.get('down_nodes', set()))
        self.process_groups = ProcessGroupDict()
        self.running_nodes = set()
        self.alloc_only_nodes = {}  # nodename:starttime
        if not state.has_key("cleaning_processes"):
            self.cleaning_processes = []
        self.cleaning_host_count = {}  # jobid:count
        self.locations_by_jobid = {}  #jobid:[locations]
        self.jobid_to_user = {}  #jobid:username

        self.alloc_timeout = int(
            get_orcm_system_config("allocation_timeout", 300))
        self.logger.info("allocation timeout set to %d seconds." %
                         self.alloc_timeout)

    def save_me(self):
        '''Automatically write statefiles.'''
        Component.save(self)

    save_me = automatic(save_me)

    def validate_job(self, spec):
        """validate a job for submission

        Arguments:
        spec -- job specification dictionary
        """
        # spec has {nodes, walltime*, procs, mode, kernel}

        max_nodes = len(self.all_nodes)
        sys_type = 'orcm'
        job_types = ['co', 'vn', 'smp', 'dual', 'script']
        spec['mode'] = 'script'
        try:
            spec['nodecount'] = int(spec['nodecount'])
        except ValueError:
            raise JobValidationError("Non-integer node count")
        if not 0 < spec['nodecount'] <= max_nodes:
            raise JobValidationError("Node count out of realistic range")
        if 0 < float(spec['time']) < 5:
            raise JobValidationError("Walltime less than minimum (5 minutes)")
        if spec['mode'] not in job_types:
            raise JobValidationError("%s is an invalid mode" % spec['mode'])
        if not spec['proccount']:
            spec['proccount'] = spec['nodecount']
        else:
            try:
                spec['proccount'] = int(spec['proccount'])
            except ValueError:
                raise JobValidationError("non-integer proccount")
            if spec['proccount'] < 1:
                raise JobValidationError("negative proccount")
            if spec['proccount'] > spec['nodecount']:
                if spec['mode'] not in ['vn', 'dual']:
                    raise JobValidationError("proccount too large")
        # need to handle kernel
        return spec

    validate_job = exposed(validate_job)

    def fail_partitions(self, specs):
        self.logger.error("Fail_partitions not used on orcm systems.")
        return ""

    fail_partitions = exposed(fail_partitions)

    def unfail_partitions(self, specs):
        self.logger.error("unfail_partitions not used on orcm systems.")
        return ""

    unfail_partitions = exposed(unfail_partitions)

    def _find_job_location(self, args):
        '''Get a list of nodes capable of running a job.

        '''
        nodes = args['nodes']
        jobid = args['jobid']

        available_nodes = self._get_available_nodes(args)

        if nodes <= len(available_nodes):
            return {jobid: [available_nodes.pop() for i in range(nodes)]}
        else:
            return None

    def _get_available_nodes(self, args):
        '''Get all nodes required for a job, ignoring forbidden ones (i.e. reserved nodes).

        '''
        queue = args['queue']
        forbidden = args.get("forbidden", [])
        required = args.get("required", [])

        if required:
            available_nodes = set(required)
        else:
            available_nodes = self.queue_assignments[queue].difference(
                forbidden)

        available_nodes = available_nodes.difference(self.running_nodes)
        available_nodes = available_nodes.difference(self.down_nodes)

        return available_nodes

    def _backfill_cmp(self, left, right):
        return cmp(left[1], right[1])

    # the argument "required" is used to pass in the set of locations allowed by a reservation;
    def find_job_location(self, arg_list, end_times):
        '''Find the best location for a job and start the job allocation process

        '''
        best_location_dict = {}
        winner = arg_list[0]

        jobid = None
        user = None

        # first time through, try for starting jobs based on utility scores
        for args in arg_list:
            location_data = self._find_job_location(args)
            if location_data:
                best_location_dict.update(location_data)
                jobid = int(args['jobid'])
                user = args['user']
                break

        # the next time through, try to backfill, but only if we couldn't find anything to start
        if not best_location_dict:
            job_end_times = {}
            total = 0
            for item in sorted(end_times, cmp=self._backfill_cmp):
                total += len(item[0])
                job_end_times[total] = item[1]

            needed = winner['nodes'] - len(self._get_available_nodes(winner))
            now = time.time()
            backfill_cutoff = 0
            for num in sorted(job_end_times):
                if needed <= num:
                    backfill_cutoff = job_end_times[num] - now

            for args in arg_list:
                if 60 * float(args['walltime']) > backfill_cutoff:
                    continue

                location_data = self._find_job_location(args)
                if location_data:
                    best_location_dict.update(location_data)
                    self.logger.info("backfilling job %s" % args['jobid'])
                    jobid = int(args['jobid'])
                    user = args['user']
                    break

        # reserve the nodes in best_location_dict, as those nodes are allegedly
        # going to be running jobs very soon
        for jobid_str, location_list in best_location_dict.iteritems():
            self.running_nodes.update(location_list)
            self.logger.info("Job %s: Allocating nodes: %s" %
                             (int(jobid_str), location_list))
            #just in case we're not going to be running a job soon, and have to
            #return this to the pool:
            self.jobid_to_user[jobid] = user
            alloc_time = time.time()
            for location in location_list:
                self.alloc_only_nodes[location] = alloc_time
            self.locations_by_jobid[jobid] = location_list

        return best_location_dict

    find_job_location = exposed(find_job_location)
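
    # Worked sketch of the backfill pass above, with illustrative numbers:
    # suppose end_times yields job_end_times == {2: now + 600, 3: now + 1200}
    # (2 nodes free in 10 minutes, 3 nodes free in 20) and the top job still
    # needs 3 nodes.  The loop leaves backfill_cutoff at 1200 seconds, so only
    # queued jobs with walltime <= 20 minutes are considered for backfilling.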

    def check_alloc_only_nodes(self):
        '''Check to see if nodes that we have allocated but not run yet should be freed.

        '''
        jobids = []
        check_time = time.time()
        dead_locations = []
        for location, start_time in self.alloc_only_nodes.iteritems():
            if int(check_time) - int(start_time) > self.alloc_timeout:
                self.logger.warning(
                    "Location: %s: released.  Time between allocation and run exceeded.",
                    location)
                dead_locations.append(location)

        if dead_locations == []:
            #well we don't have anything dying this time.
            return

        for jobid, locations in self.locations_by_jobid.iteritems():
            clear_from_dead_locations = False
            for location in locations:
                if location in dead_locations:
                    clear_from_dead_locations = True
                    if jobid not in jobids:
                        jobids.append(jobid)
            #bagging the jobid will cause all locs assoc with job to be
            #cleaned so clear them out to make this faster
            if clear_from_dead_locations:
                for location in locations:
                    if location in dead_locations:
                        dead_locations.remove(location)
            if dead_locations == []:
                #well we don't have anything dying this time.
                break
        self.invoke_node_cleanup(jobids)
        return

    check_alloc_only_nodes = automatic(
        check_alloc_only_nodes,
        get_orcm_system_config("automatic_method_interval", 10.0))

    def invoke_node_cleanup(self, jobids):
        '''Invoke cleanup for nodes that have exceeded their allocated time

        '''
        found_locations = set()
        for jobid in jobids:
            user = self.jobid_to_user[jobid]
            locations = self.locations_by_jobid[jobid]
            locations_to_clean = set()
            for location in locations:
                if location not in found_locations:
                    try:
                        del self.alloc_only_nodes[location]
                    except KeyError:
                        self.logger.warning(
                            'WARNING: Location: %s Jobid: %s; Location already removed from alloc_only_nodes',
                            location, jobid)
                    else:
                        locations_to_clean.add(location)
                        found_locations.add(location)

#            self.clean_nodes(list(locations_to_clean), user, jobid)

    def _walltimecmp(self, dict1, dict2):
        return -cmp(float(dict1['walltime']), float(dict2['walltime']))

    def find_queue_equivalence_classes(self,
                                       reservation_dict,
                                       active_queue_names,
                                       passthrough_partitions=[]):
        '''Take a dictionary of reservation information and a list of active
            queues, and return a list of dictionaries containing queues,
            partition associations, and reservation data.

            Input:
            reservation_dict: A dict of reservations and associated partitions
            active_queue_names: A list of queues that you can schedule jobs from
            passthrough_partitions: not used in this implementation

            Output:
            A list of dictionaries of queues and associated reservations that
            have resources in common with each other (a standalone sketch of
            this grouping follows this class).

        '''

        equiv = []
        for q in self.queue_assignments:
            # skip queues that aren't "running"
            if not q in active_queue_names:
                continue

            found_a_match = False
            for e in equiv:
                if e['data'].intersection(self.queue_assignments[q]):
                    e['queues'].add(q)
                    e['data'].update(self.queue_assignments[q])
                    found_a_match = True
                    break
            if not found_a_match:
                equiv.append({
                    'queues': set([q]),
                    'data': set(self.queue_assignments[q]),
                    'reservations': set()
                })

        real_equiv = []
        for eq_class in equiv:
            found_a_match = False
            for e in real_equiv:
                if e['queues'].intersection(eq_class['queues']):
                    e['queues'].update(eq_class['queues'])
                    e['data'].update(eq_class['data'])
                    found_a_match = True
                    break
            if not found_a_match:
                real_equiv.append(eq_class)

        equiv = real_equiv

        for eq_class in equiv:
            for res_name in reservation_dict:
                for host_name in reservation_dict[res_name].split(":"):
                    if host_name in eq_class['data']:
                        eq_class['reservations'].add(res_name)

            for key in eq_class:
                eq_class[key] = list(eq_class[key])
            del eq_class['data']

        return equiv

    find_queue_equivalence_classes = exposed(find_queue_equivalence_classes)

    def reserve_resources_until(self, location, time, jobid):
        '''hold onto resources until a timeout has passed, and then clear the
        nodes for another job.

        '''

        #WARNING: THIS IS VERY DIFFERENT FROM BLUE GENES!
        #THIS WILL FORCIBLY CLEAR THE NODE!

        if time is None:
            for host in location:
                self.running_nodes.discard(host)
                self.logger.info("hasty job kill: freeing %s" % host)
        else:
            self.logger.error("failed to reserve location '%r' until '%s'" %
                              (location, time))
            return True  #So we can progress.

    reserve_resources_until = exposed(reserve_resources_until)

    def nodes_up(self, node_list, user_name=None):
        changed = []
        for n in node_list:
            if n in self.down_nodes:
                self.down_nodes.remove(n)
                changed.append(n)
            if n in self.running_nodes:
                self.running_nodes.remove(n)
                changed.append(n)
        if changed:
            self.logger.info("%s marking nodes up: %s", user_name,
                             ", ".join(changed))
        return changed

    nodes_up = exposed(nodes_up)

    def nodes_down(self, node_list, user_name=None):
        changed = []
        for n in node_list:
            if n in self.all_nodes:
                self.down_nodes.add(n)
                changed.append(n)
        if changed:
            self.logger.info("%s marking nodes down: %s", user_name,
                             ", ".join(changed))
        return changed

    nodes_down = exposed(nodes_down)

    def get_node_status(self):
        def my_cmp(left, right):
            return cmp(left[2], right[2])

        self.configure()
        status_list = []
        for n in self.all_nodes:
            if n in self.running_nodes:
                status = "allocated"
            elif n in self.down_nodes:
                status = "down"
            else:
                status = "idle"

            status_list.append((n, status, self.node_order[n]))
        status_list.sort(my_cmp)
        return status_list

    get_node_status = exposed(get_node_status)

    def get_queue_assignments(self):
        ret = {}
        for q in self.queue_assignments:
            ret[q] = list(self.queue_assignments[q])
        return ret

    get_queue_assignments = exposed(get_queue_assignments)

    def set_queue_assignments(self, queue_names, node_list, user_name=None):
        checked_nodes = set()
        for n in node_list:
            if n in self.all_nodes:
                checked_nodes.add(n)

        queue_list = queue_names.split(":")
        for q in queue_list:
            if q not in self.queue_assignments:
                self.queue_assignments[q] = set()

        for q in self.queue_assignments.keys():
            if q not in queue_list:
                self.queue_assignments[q].difference_update(checked_nodes)
                if len(self.queue_assignments[q]) == 0:
                    del self.queue_assignments[q]
            else:
                self.queue_assignments[q].update(checked_nodes)
        self.logger.info("%s assigning queues %s to nodes %s", user_name,
                         queue_names, " ".join(checked_nodes))
        return list(checked_nodes)

    set_queue_assignments = exposed(set_queue_assignments)

    def verify_locations(self, location_list):
        """Providing a system agnostic interface for making sure a 'location string' is valid"""
        ret = []
        for l in location_list:
            if l in self.all_nodes:
                ret.append(l)
        return ret

    verify_locations = exposed(verify_locations)

    def configure(self):
        '''Add nodes from ORCM to Cobalt's configuration of tracked nodes.
        '''
        nodelist = PP_node_t()
        node_count = c_int(0)
        counter = 0
        orcm.orcmapi_get_nodes(byref(nodelist), byref(node_count))
        self.down_nodes.clear()
        for i in range(node_count.value):
            name = nodelist[i].contents.name
            state = nodelist[i].contents.state
            self.all_nodes.add(name)
            self.node_order[name] = counter
            if (state == ORCM_NODE_STATE_UNKNOWN) or (state
                                                      == ORCM_NODE_STATE_DOWN):
                self.down_nodes.add(name)
            counter += 1

    # this gets called by bgsched in order to figure out if there are partition overlaps;
    # it was written to provide the data that bgsched asks for and raises an exception
    # if you try to ask for more
    def get_partitions(self, specs):
        '''Fetch node information and their respective states.

        '''
        partitions = []
        for spec in specs:
            item = {}
            for node in self.all_nodes:
                if "name" in spec:
                    if spec["name"] == '*':
                        item.update({"name": node})
                    elif spec["name"] == node:
                        item.update({"name": node})

            if "name" in spec:
                spec.pop("name")
            if "children" in spec:
                item.update({"children": []})
                spec.pop("children")
            if "parents" in spec:
                item.update({"parents": []})
                spec.pop("parents")
            if spec:
                raise NotSupportedError(
                    "orcm clusters lack information on: %s" %
                    ", ".join(spec.keys()))
            if item:
                partitions.append(item)

        return partitions

    get_partitions = exposed(get_partitions)

    def launch_script(self, config_option, host, jobid, user, group_name):
        '''Start our script processes used for node prep and cleanup.

        '''
        #TODO: ORCM_launch

    def launch_session(self, jobid, nodes, nodelist, user):
        o_user = c_char_p(user)
        o_jobid = c_int(jobid)
        o_nodes = c_int(nodes)
        o_nodelist = c_char_p(nodelist)
        retval = orcm.orcmapi_launch_session(o_jobid, o_nodes, o_nodelist,
                                             o_user)
        return retval

    launch_session = exposed(launch_session)

    def cancel_session(self, jobid):
        o_jobid = c_int(jobid)
        retval = orcm.orcmapi_cancel_session(o_jobid)
        return retval

    cancel_session = exposed(cancel_session)

    def del_process_groups(self, jobid):
        '''Set actions to take when deleting process groups.  This must be
        overridden in the implementation.

        '''
        raise NotImplementedError("Must be overridden in child class")
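
# A standalone sketch of the grouping performed by find_queue_equivalence_classes
# above: active queues whose node sets overlap fall into the same class, and a
# reservation attaches to any class containing one of its hosts.  This is a
# single-pass illustration (the method above adds a second merge pass over the
# classes); the input shapes mirror how the method consumes its arguments.
def sketch_equivalence_classes(queue_assignments, reservation_dict, active_queue_names):
    equiv = []
    for q, nodes in queue_assignments.items():
        if q not in active_queue_names:
            continue
        for e in equiv:
            if e['data'].intersection(nodes):
                e['queues'].add(q)
                e['data'].update(nodes)
                break
        else:
            equiv.append({'queues': set([q]), 'data': set(nodes)})
    for e in equiv:
        e['reservations'] = [r for r, hosts in reservation_dict.items()
                             if set(hosts.split(":")) & e['data']]
        e['queues'] = list(e['queues'])
        del e['data']
    return equiv

# Example: 'default' and 'debug' share node2, so they form one class, and res1
# (which includes node3) attaches to it.  List ordering may vary.
#   sketch_equivalence_classes(
#       {'default': set(['node1', 'node2']), 'debug': set(['node2', 'node3'])},
#       {'res1': 'node3:node4'}, ['default', 'debug'])
#   => [{'queues': ['default', 'debug'], 'reservations': ['res1']}]
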
Exemplo n.º 17
0
class ClusterSystem (ClusterBaseSystem):
    
    """cluster system component.
    
    Methods:
    configure -- load partitions from the bridge API
    add_process_groups -- add (start) an mpirun process on the system (exposed, ~query)
    get_process_groups -- retrieve mpirun processes (exposed, query)
    wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query)
    signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query)
    update_partition_state -- update partition state from the bridge API (runs as a thread)
    """
    
    name = "system"
    implementation = "cluster_system"
    
    logger = logger

    
    def __init__ (self, *args, **kwargs):
        ClusterBaseSystem.__init__(self, *args, **kwargs)
        self.process_groups.item_cls = ClusterProcessGroup

        
    def __setstate__(self, state):
        ClusterBaseSystem.__setstate__(self, state)
        self.process_groups.item_cls = ClusterProcessGroup

    
    def add_process_groups (self, specs):
        """Create a process group.
        
        Arguments:
        spec -- dictionary hash specifying a process group to start
        """

        self.logger.info("add_process_groups(%r)", specs)
        process_groups = self.process_groups.q_add(specs)
        for pgroup in process_groups:
            self.logger.info("job %s/%s: process group %s created to track script", 
                    pgroup.jobid, pgroup.user, pgroup.id)

        return process_groups
    add_process_groups = exposed(query(add_process_groups))
    
    def get_process_groups (self, specs):
        self._get_exit_status()
        return self.process_groups.q_get(specs)
    get_process_groups = exposed(query(get_process_groups))
    
    def _get_exit_status (self):
        try:
            running = ComponentProxy("forker").active_list()
        except:
            self.logger.error("failed to contact forker component for list of running jobs")
            return

        for each in self.process_groups.itervalues():
            if each.head_pid not in running and each.exit_status is None:
                # FIXME: i bet we should consider a retry thing here -- if we fail enough times, just
                # assume the process is dead?  or maybe just say there's no exit code the first time it happens?
                # maybe the second choice is better
                try:
                    dead_dict = ComponentProxy("forker").get_status(each.head_pid)
                except Queue.Empty:
                    self.logger.error("failed call for get_status from forker component for pg %s", each.head_pid)
                    return
                
                if dead_dict is None:
                    self.logger.info("process group %i: job %s/%s exited with unknown status", each.id, each.jobid, each.user)
                    each.exit_status = 1234567
                else:
                    each.exit_status = dead_dict["exit_status"]
                    if dead_dict["signum"] == 0:
                        self.logger.info("process group %i: job %s/%s exited with status %i", 
                            each.id, each.jobid, each.user, each.exit_status)
                    else:
                        if dead_dict["core_dump"]:
                            core_dump_str = ", core dumped"
                        else:
                            core_dump_str = ""
                        self.logger.info("process group %i: job %s/%s terminated with signal %s%s", 
                            each.id, each.jobid, each.user, dead_dict["signum"], core_dump_str)
    
    _get_exit_status = automatic(_get_exit_status)
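
    # For reference, the dead_dict consumed by _get_exit_status above is assumed
    # to carry three keys from the forker: 'exit_status' (the raw status),
    # 'signum' (0 for a normal exit, else the fatal signal number), and
    # 'core_dump' (bool).  E.g. a job killed by SIGKILL might arrive as
    #     {'exit_status': 9, 'signum': 9, 'core_dump': False}
    # and be logged as "terminated with signal 9".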
    
    def wait_process_groups (self, specs):
        self._get_exit_status()
        process_groups = [pg for pg in self.process_groups.q_get(specs) if pg.exit_status is not None]
        for process_group in process_groups:
            thread.start_new_thread(self.clean_nodes, (process_group,))
        return process_groups
    wait_process_groups = locking(exposed(query(wait_process_groups)))
    
    def signal_process_groups (self, specs, signame="SIGINT"):
        my_process_groups = self.process_groups.q_get(specs)
        for pg in my_process_groups:
            if pg.exit_status is None:
                try:
                    ComponentProxy("forker").signal(pg.head_pid, signame)
                except:
                    self.logger.error("Failed to communicate with forker when signalling job")

        return my_process_groups
    signal_process_groups = exposed(query(signal_process_groups))

    def clean_nodes(self, pg):
        try:
            tmp_data = pwd.getpwnam(pg.user)
            groupid = tmp_data.pw_gid
            group_name = grp.getgrgid(groupid)[0]
        except KeyError:
            group_name = ""
            self.logger.error("Job %s/%s unable to determine group name for epilogue" % (pg.jobid, pg.user))
 
        processes = []
        for host in pg.location:
            h = host.split(":")[0]
            try:
                p = subprocess.Popen(["/usr/bin/ssh", h, pg.config.get("epilogue"), str(pg.jobid), pg.user, group_name], 
                                     stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                p.host = h
                processes.append(p)
            except:
                self.logger.error("Job %s/%s failed to run epilogue on host %s", pg.jobid, pg.user, h, exc_info=True)
        
        start = time.time()
        dirty_nodes = []
        while True:
            running = False
            for p in processes:
                if p.poll() is None:
                    running = True
                    break
            
            if not running:
                break
            
            if time.time() - start > float(pg.config.get("epilogue_timeout")):
                for p in processes:
                    if p.poll() is None:
                        try:
                            os.kill(p.pid, signal.SIGTERM)
                            dirty_nodes.append(p.host)
                            self.logger.error("Job %s/%s epilogue timed out on host %s" % (pg.jobid, pg.user, p.host))
                        except:
                            self.logger.error("epilogue for %s already terminated" %p.host)
                break
            else:
                time.sleep(5)

        for p in processes:
            if p.poll() > 0:
                self.logger.error("epilogue failed for host %s", p.host)
                self.logger.error("stderr from epilogue on host %s: [%s]", p.host, p.stderr.read().strip())
            
            
        self.lock.acquire()
        try:
            self.logger.info("job finished on %s", Cobalt.Util.merge_nodelist(pg.location))
            for host in pg.location:
                self.running_nodes.discard(host)
            
            if dirty_nodes:    
                for host in dirty_nodes:
                    self.down_nodes.add(host)
                    self.logger.info("epilogue timed out, marking host %s down" % host)
                p = subprocess.Popen([pg.config.get("epi_epilogue"), str(pg.jobid), pg.user, group_name] + dirty_nodes)
            
            del self.process_groups[pg.id]
        except:
            self.logger.error("error in clean_nodes", exc_info=True)
        self.lock.release()
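
# clean_nodes above uses a common poll-with-timeout pattern: launch one epilogue
# process per host, poll until all of them exit, and SIGTERM anything still
# running at the deadline.  A minimal standalone sketch of that pattern; the
# commands and timeout below are illustrative, not taken from the component's
# configuration.
import os
import signal
import subprocess
import time

def run_with_timeout(commands, timeout):
    procs = [subprocess.Popen(cmd) for cmd in commands]
    deadline = time.time() + timeout
    while any(p.poll() is None for p in procs):
        if time.time() > deadline:
            for p in procs:
                if p.poll() is None:
                    os.kill(p.pid, signal.SIGTERM)  # give up on stragglers
            break
        time.sleep(1)
    for p in procs:
        p.wait()  # reap, so returncode is populated even for killed processes
    return [p.returncode for p in procs]

# run_with_timeout([["/bin/sleep", "1"], ["/bin/sleep", "60"]], 5)
# => [0, -15]   (the second process is terminated at the deadline)
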
Exemplo n.º 18
0
class HeckleSystem(Component):
    """
     Cobalt System component for handling / interacting with Heckle resource manager
     
     External Methods:
          add_process_groups -- allocates nodes
          get_process_groups -- get process groups based on specs
          signal_process_groups -- signal a process group
          wait_process_groups -- removed process groups based on specs
          
     Internal Methods:
          __init__:
          _start_pg:
          _check_builds_done:
          _wait:
          _release_resources:
          get_resources:
          
     Queue Manager Methods:
          validate_job:
          verify_locations:
          find_job_locations:
          find_queue_equivalence_classes:
     """

    name = "system"
    implementation = "HeckleBreadboard"
    queue_assignments = {}

    def __init__(self, *args, **kwargs):
        logger.debug("heckle: System: init ... %s" %
                     threading.current_thread().getName())
        Component.__init__(self, *args, **kwargs)
        self.process_groups = ProcessGroupDict()
        self.process_groups.item_cls = HeckleProcessGroup
        self.resources = ResourceDict()
        self.queue_assignments["default"] = self.resources.keys()
        print "\n\n\n\n"
        print "Queue assignments are: %s" % self.queue_assignments

    def __repr__(self):
        """
          printout representation of the class
          """
        indict = self.__dict__
        printstr = ""
        printstr += "Heckle System Object: Values"
        for element in indict:
            printstr += str(element) + "::"
            if indict[element] is None:
                printstr += "None, "
            else:
                printstr += str(indict[element]) + ", "
        printstr += "   Process Groups:"
        for element in self.process_groups:
            printstr += str(element) + "::" + str(
                self.process_groups[element]) + ", "
        return printstr

    #####################
    # Main set of methods
    #####################
    def add_process_groups(self, specs):
        """
          Allocate nodes and add the list of those allocated to the PGDict
          specs is a list of dictionaries
          Each dictionary contains the specifications for all the nodes in the process group
          """
        #Debug - Take out to really rebuild
        ####    Need to check the environment variable for fakebuild
        try:
            specs[0]['fakebuild'] = specs[0]['env']['fakebuild']
            del specs[0]['env']['fakebuild']
        except:
            pass
        print "Heckle System:  add_process_groups: <<<<<<<<<<<<<<<<<<          OK< Debug< This< :  %s" % specs
        HICCUP = HeckleConnector()
        #try:
        reservation = HICCUP.make_reservation(**(specs[0]))
        heckle_res_id = reservation.id
        uid = specs[0]['user']
        logger.debug("Heckle System: heckle_res_id = %i" % heckle_res_id)
        specs[0]['heckle_res_id'] = heckle_res_id
        return self.process_groups.q_add(
            specs, lambda x, _: self._start_pg(
                x, heckle_res_id=heckle_res_id, uid=uid))
        #except Exception as hec_except:
        ## could do something here about problems
        ##    1)  Kill job, then resubmit job w/o node name(s)
        ##         Would require access to cqadm via api
        ##    2)  Put job / node in fail state
        ##    3)  Simply fail
        #raise Exception("Heckle System Object: add_process_groups: %s" % hec_except)

    add_process_groups = exposed(query(add_process_groups))

    def get_process_groups(self, specs):
        """get a list of existing allocations"""
        #logger.debug( "Heckle System: get_process_groups" )
        self._wait()
        return self.process_groups.q_get(specs)

    get_process_groups = exposed(query(get_process_groups))

    def signal_process_groups(self, specs, sig):
        """Free the specified process group (set of allocated nodes)"""
        logger.debug(
            "Heckle System: signal_process_groups: Specs are %s, sig is %s" %
            (specs, sig))
        return self.process_groups.q_get(specs, lambda x, y: x.signal(y), sig)

    signal_process_groups = exposed(query(signal_process_groups))

    def wait_process_groups(self, specs):
        """Remove terminated process groups"""
        logger.debug("Heckle System: wait_process_groups; specs are %s" %
                     specs)
        return self.process_groups.q_del(
            specs, lambda x, _: self._release_resources(x))

    wait_process_groups = exposed(query(wait_process_groups))

    #########################################
    # Methods for dealing with Process Groups
    #########################################

    def _start_pg(self, pgp, heckle_res_id, uid):
        """
          Populates the process group with its resources
               gets node information for nodes in process group
               Updates those attributes
               Places nodes in the pinging nodes list, to see if they're built
          """
        logger.debug("Heckle System: start_pg: PGP is %s" % pgp)
        nodelist = pgp.location
        for node in nodelist:
            node_attributes = self.resources[node]
            node_attributes['mac'] = node_attributes['mac'].replace("-", ":")
            node_attributes['heckle_res_id'] = heckle_res_id
            pgp.resource_attributes[node] = node_attributes._get_dict()
        pgp.uid = uid
        pgp.pinging_nodes.extend(nodelist)  # extend, not append: _check_builds_done iterates individual node names


    def _check_builds_done(self):
        """
          Check to see if the nodes are done building
          Starts the process group if all nodes in them are done building
          """
        #logger.debug( "heckle: System: Check Build Done: Waiting to Start..." )
        #sleep(20)
        retval = True
        pg_list = [
            x for x in self.process_groups.itervalues()
            if (len(x.pinging_nodes) > 0)
        ]
        self.resources.update()
        for pgp in pg_list:
            for nodename in pgp.pinging_nodes:
                teststr = self.resources[nodename]['bootstate']
                if teststr == "COMPLETED":
                    logger.debug(
                        "heckle: System: Check Build Done: Removing node %s...%i pinging nodes left"
                        % (nodename, len(pgp.pinging_nodes) - 1))
                    pgp.pinging_nodes.remove(nodename)
                elif teststr in ["BOOTING", "", ""]:
                    logger.debug(
                        "Heckle System: Check Build Done: Node %s not done yet."
                        % nodename)
                elif teststr == "UNALLOCATED":
                    raise Exception(
                        "HIC_SO: _check_builds_done: Node says, 'UNALLOCATED'.  Possible build error, or system timed out."
                    )
                elif teststr == "CRITFAIL":
                    raise Exception(
                        "HIC_SO: _check_builds_done: Node says, 'CRITFAIL'.  It timed out while building."
                    )
                    #####################
                    ####      Need to figure a better way to fail gracefully on this one...
                    #####################
                elif teststr == "READY":
                    raise Exception(
                        "HIC_SO: _check_builds_done: Node says, 'READY'.  The Heckle Reservation is already ready already, skipping pinging."
                    )
            if len(pgp.pinging_nodes) == 0:
                logger.debug(
                    "Heckle System: Check Build Done: No Pinging Nodes left, Start PG Running."
                )
                pgp.start()
            else:
                retval = False
        return retval

    _check_builds_done = automatic(_check_builds_done)

    def _wait(self):
        """
          Calls the process group container's wait() method
          """
        #logger.debug( "Heckle System: wait" )
        for pgp in self.process_groups.itervalues():
            pgp.wait()

    _wait = automatic(_wait)

    def _release_resources(self, pgp):
        """
          Releases all the Heckle nodes, unreserving them
          """
        logger.debug("Heckle System: Release %s" % pgp.location)
        #self.resources[pgp.location]['action']='Free'
        HICCUP = HeckleConnector()
        HICCUP.free_reserved_node(uid=pgp.uid, node_list=pgp.location)
        self.resources.free(nodes=pgp.location)

    def get_resources(self, specs=None):
        """
          Returns a list of names for all the FREE resources (nodes) which match the given specs.
          """
        if specs is None:
            specs = {}  # avoid a shared mutable default; this dict is modified below
        logger.debug("Heckle System: get Resources, specs are %s" % specs)
        ##################################
        ###  Look at this as a future change
        ##################################
        specs['current reservation'] = 9999999
        specs['allocatable'] = 'True'
        res_list = self.resources >= specs
        logger.debug("Heckle System: get Resources, resources are %s" %
                     res_list)
        return res_list

    get_resources = exposed(query(get_resources))

    ##########################################################
    # Methods for interacting with scheduler and queue-manager
    ##########################################################

    def validate_job(self, spec):
        """
          Validates a job for submission
          -- will the job ever run under the current Heckle configuration?
          Steps:
               1)  Validate Kernel
               2)  Validate HW
               3)  Validate Job versus overall
          """
        logger.debug("Heckle System: Validate Job: Specs are %s" % spec)
        try:
            checklist = spec['attrs']
        except:
            checklist = {}
        #del(checklist['action'])
        try:
            nodecount = spec['nodecount']
        except:
            nodecount = 1
        glossary = self.resources.glossary
        dnelist = []  # attributes not present in the glossary at all
        badlist = []  # glossary attributes whose requested value does not exist
        ##################################
        ###  Look at this as a future change
        ###  Think:  Refresh Resources Info
        ##################################
        #1st step:  Are there enough nodes at all?
        if nodecount > self.resources.node_count():
            raise Exception(
                "Validate Job: Not enough nodes; Requested %s, only have %s in the system."
                % (nodecount, self.resources.node_count()))
        for att in checklist:
            val = checklist[att]
            try:
                if val in glossary[att]:
                    pass
                else:
                    badlist.append("%s:%s" % (att, val))  # bad attribute value
            except:
                dnelist.append(att)  # attribute does not exist
        # The final check belongs after the loop: adding keys to checklist while
        # iterating over it would raise a RuntimeError, and the verdict needs
        # the complete badlist/dnelist.
        checklist['current reservation'] = 9999999
        checklist['allocatable'] = 'True'
        retlist = self.resources >= checklist
        retcount = len(retlist)
        goodlen = retcount >= nodecount
        if goodlen and not badlist and not dnelist:
            return spec  #Good Job!
        else:
            retstr = "Validate Job: "
            if badlist or dnelist:
                if badlist:
                    retstr += "No value for attribute: %s" % badlist
                if dnelist:
                    retstr += "Attributes Do Not Exist: %s" % dnelist
            else:
                retstr += "Need %s nodes, only have %s nodes:  %s" % (
                    nodecount, retcount, retlist)
            raise Exception(retstr)

    validate_job = exposed(validate_job)

    def verify_locations(self, location_list):
        """
          Makes sure a location list is valid
          location list is a list of fully qualified strings of node names
          ex:  nodename.mcs.anl.gov
          """
        logger.debug("heckle: System: Validate Job: Verify Locations")
        return [loc for loc in location_list if loc in self.resources.glossary]

    verify_locations = exposed(verify_locations)

    def find_job_location(self, job_location_args, end_times):
        """
          Finds a group of not-busy nodes in which to run the job
          
          Arguments:
               job_location_args -- A list of dictionaries with info about the job
                    jobid -- string identifier
                    nodes -- int number of nodes
                    queue -- string queue name
                    required -- ??
                    utility_score -- ??
                    threshold -- ??
                    walltime -- ??
                    attrs -- dictionary of attributes to match against
               end_times -- supposed time the job will end
               
          Returns: Dictionary with list of nodes a job can run on, keyed by jobid
          """
        logger.debug("heckle: System: find_job_location")
        locations = {}

        def jobsort(job):
            """Used to sort job list by utility score"""
            return job["utility_score"]

        job_location_args.sort(key=jobsort)
        #Try to match jobs to nodes which can run them
        for job in job_location_args:
            if "attrs" not in job or job["attrs"] is None:
                attrs = {}
            else:
                attrs = job['attrs']
            attrs['current reservation'] = 9999999
            attrs['allocatable'] = 'True'
            nodecount = int(job['nodes'])
            print "Heckle System: Find Job Location: Job is %s" % job
            #############################
            ###  Look at this as point of change
            ###  Think:  For node in unreserved nodes
            ###            Choose node from list
            ###            Remove node from unreserved nodes
            #############################
            print "Heckle System: Find Job Location: Free Nodes is %s" % self.resources.getfreenodes(
            )
            nodelist = (self.resources >= attrs)  # get Matching Node
            print "Nodelist at this stage is %s" % nodelist
            if len(nodelist) >= nodecount:
                print "Nodecount = %s" % nodecount
                retlist = nodelist[:nodecount]
                self.resources.allocate(retlist)
                print "Heckle System: Find Job Location: Remaining nodelist is %s" % retlist
            else:
                raise Exception(
                    "Heckle System: find_job_locations: Not Enough matching Nodes Available"
                )
            locations[job["jobid"]] = retlist
            print "Locations is now: %s" % locations
        logger.info("heckle: find_job_location: locations are %s" % locations)
        return locations

    find_job_location = exposed(find_job_location)

    def find_queue_equivalence_classes(self, reservation_dict,
                                       active_queue_names):
        """
        Finds equivalent queues
        An equivalent queue is a queue which can run upon the same partition(s)
        For now, with one partition (everything!) this is irrelevant.
        Returns: equiv= [{'reservations': [], 'queues': ['default']}]
        """
        logger.debug("Heckle System: find queue equivalence classes")
        equiv = []
        #print "Reservation_Dict is: %s" % reservation_dict
        #print "Active_queue_names is %s" % active_queue_names
        #print "Queue assignments are: %s" % self.queue_assignments
        for queue in self.queue_assignments:
            # skip queues that aren't running
            if not queue in active_queue_names:
                continue
            found_a_match = False
            print "Heckle Queue is %s" % queue
            for equ in equiv:
                print "Heckle Equ is %s" % equ
                if equ['data'].intersection(self.queue_assignments[queue]):
                    equ['queues'].add(queue)
                    equ['data'].update(self.queue_assignments[queue])
                    found_a_match = True
                    break
            if not found_a_match:
                equiv.append({
                    'queues': set([queue]),
                    'data': set(self.queue_assignments[queue]),
                    'reservations': set()
                })
        real_equiv = []
        for eq_class in equiv:
            found_a_match = False
            for equ in real_equiv:
                if equ['queues'].intersection(eq_class['queues']):
                    equ['queues'].update(eq_class['queues'])
                    equ['data'].update(eq_class['data'])
                    found_a_match = True
                    break
            if not found_a_match:
                real_equiv.append(eq_class)
        equiv = real_equiv
        for eq_class in equiv:
            for res_name in reservation_dict:
                for host_name in reservation_dict[res_name].split(":"):
                    if host_name in eq_class['data']:
                        eq_class['reservations'].add(res_name)
            for key in eq_class:
                eq_class[key] = list(eq_class[key])
            del eq_class['data']
        return equiv

    find_queue_equivalence_classes = exposed(find_queue_equivalence_classes)

    def get_partitions(self, locations):
        """
          Work-around to get the cqadm to run a single job on this system
          PRE:  locations is a list of strings of possible node names
          POST:  if good, return locations
                 if not good, raise exception and list bad nodes
          """
        nodelist = self.resources.glossary.nodelist
        logger.debug("Heckle System: get_partitions: raw locations are: %s" %
                     locations)
        badlocations = [loc for loc in locations if loc not in nodelist]
        if not badlocations:
            return locations
        else:
            raise Exception(
                "heckle: System: get_partition: Bad Locations: %s " %
                badlocations)

    get_partitions = exposed(get_partitions)
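
# find_job_location above reduces to: sort the requests by utility score, and
# for each one take the first `nodes` free nodes whose attributes match the
# job's attrs.  A standalone sketch with plain dicts; the attribute matching is
# a simple equality filter standing in for ResourceDict's `>=` operator, and
# all names here are illustrative.
def sketch_find_job_location(jobs, free_nodes):
    locations = {}
    for job in sorted(jobs, key=lambda j: j["utility_score"]):
        attrs = job.get("attrs") or {}
        matching = [name for name, node_attrs in free_nodes.items()
                    if all(node_attrs.get(k) == v for k, v in attrs.items())]
        count = int(job["nodes"])
        if len(matching) < count:
            raise Exception("not enough matching nodes for job %s" % job["jobid"])
        chosen = matching[:count]
        for name in chosen:
            del free_nodes[name]  # mirrors resources.allocate(): taken nodes leave the pool
        locations[job["jobid"]] = chosen
    return locations

# sketch_find_job_location(
#     [{"jobid": "7", "nodes": 1, "utility_score": 2.0, "attrs": {"kernel": "default"}}],
#     {"n1": {"kernel": "default"}, "n2": {"kernel": "profiling"}})
# => {"7": ["n1"]}
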
Exemplo n.º 19
0
class HistoryManager(Component):
    '''Historical Data Manager'''

    implementation = 'histm'
    name = 'history-manager'
    
    def __init__(self, *args, **kwargs):
        
        Component.__init__(self, *args, **kwargs)
        self.least_item = int(get_histm_config('least_item', 10))  # tunable
        self.lastDays = int(get_histm_config("last_days", 60))    # tunable
        self.jobinfo_file = get_histm_config("jobinfo_file", "jobinfo.hist")
        self.jobinfo_script = get_histm_config("jobinfo_script", "jobinfo.py")
        self.fraction = float(get_histm_config("fraction", 0.8))
        self.minimum_ap = float(get_histm_config("minimum_ap", 0.5))
        
        self.job_dict = {}   #historical job dictionary
        self.project_set = set([])  #distinct project names of historical jobs
        self.user_set = set([])     #distinct user names of historical jobs
        self.pair_set = set([])  #distinct (user, project) pair 
        
        self.Ap_dict_proj = {}  #dictionary of walltime adjusting parameters by project name
        self.Ap_dict_user = {}  #dictionary of walltime adjusting parameters by user name
        self.Ap_dict_paired = {} #dictionary of walltime adjusting parameters by double key (user, project)
        
        self.update_Ap_Dict()
                
    def update_job_dict(self):
        '''initialize/update job_dict from jobinfo_file'''
        try:
            input_file = open(self.jobinfo_file, "r")
        except IOError:
            logger.error("History manager: unable to open jobinfo file %s", self.jobinfo_file)
            return
                
        for line in input_file:
            line = line.strip('\n')
            jobspec = parse_jobinfo(line)
            jobid = jobspec.get('jobid')
            if not self.job_dict.has_key(jobid):
                self.job_dict[jobid] = {}
            self.job_dict[jobid] = jobspec
            self.project_set.add(jobspec.get('project'))
            self.user_set.add(jobspec.get('user'))
            key_pair = (jobspec.get('user'), jobspec.get('project'))
            self.pair_set.add(key_pair)
                    
        input_file.close()            
                        
    def update_Ap_Dict(self):
        '''Update dictionary Adjust Parameter (Ap), including project based Dict and user based Dict'''
        
        self.update_job_dict()
        
        for projectname in self.project_set:
            if not self.Ap_dict_proj.has_key(projectname):
                ap = self.calculate_Ap('project', projectname)
                self.Ap_dict_proj[projectname] = ap
                
        for username in self.user_set:
            if not self.Ap_dict_user.has_key(username):
                ap = self.calculate_Ap('user', username)
                self.Ap_dict_user[username] = ap
                
        for keypair in self.pair_set:
            if not self.Ap_dict_paired.has_key(keypair):
                ap = self.calculate_Ap_paired(keypair)
                keystr = "%s:%s" % (keypair[0], keypair[1])
                self.Ap_dict_paired[keystr] = ap                
      
        print "***********Adjusting Parameter Dict Updated***********"
        
    update_Ap_Dict = automatic(update_Ap_Dict, update_interval*3600)
                        
    def calculate_Ap(self, keyname, valname):
        '''get Adjust Parameter from dict, keyname: either 'project' or 'user', valname: value of the key'''
        Rlist = []  #list of R values
               
        for id in self.job_dict.keys():
            if self.job_dict[id][keyname] == valname:
                Rlist.append(float(self.job_dict[id]['Rvalue']))
                 
        if len(Rlist) > self.least_item:
            Rlist.sort()
            pos = int(self.fraction * len(Rlist))
            Ap = Rlist[pos]
        else:
            Ap = 1
        if Ap > 1:
            Ap = 1
        return Ap
    
    def calculate_Ap_paired(self, keypair):
        username = keypair[0]
        projectname = keypair[1]
        
        Rlist = [] 
        for id in self.job_dict.keys():
            if self.job_dict[id]['user'] == username and self.job_dict[id]['project'] == projectname:
                Rlist.append(float(self.job_dict[id]['Rvalue']))
            
        if len(Rlist) > self.least_item:
            Rlist.sort()
            pos = int(self.fraction * len(Rlist))
            Ap = Rlist[pos]
        else:
            Ap = 1
        if Ap < self.minimum_ap:
            Ap = self.minimum_ap
        if Ap > 1:
            Ap = 1
        return Ap
    
    def get_Ap(self, key, val):
        Ap = 1
        if key == 'user':
            Ap = self.Ap_dict_user.get(val, 1)
        if key == 'project':
            Ap = self.Ap_dict_proj.get(val, 1)
        return Ap
    get_Ap = exposed(get_Ap)
    
    def get_Ap_by_keypair(self, username, projectname):
        keypair = "%s:%s" % (username, projectname)
        Ap = self.Ap_dict_paired.get(keypair, 1)
        return Ap
    get_Ap_by_keypair = exposed(get_Ap_by_keypair)
    
    def get_Ap_dict(self, key):
        if key == 'project':
            return self.Ap_dict_proj
        elif key == 'user':
            return self.Ap_dict_user
        else:
            return None
    get_Ap_dict = exposed(get_Ap_dict)
    
    def is_alive(self):
        return True
    is_alive = exposed(is_alive)
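
# The adjusting parameter computed above is essentially a clamped percentile of
# historical R values: with enough samples, take the value at fraction*len of
# the sorted list, otherwise default to 1.  calculate_Ap clamps only the upper
# bound, while calculate_Ap_paired also enforces minimum_ap, as this compact
# standalone restatement does; the defaults mirror this component's config.
def adjust_parameter(r_values, fraction=0.8, least_item=10, minimum_ap=0.5):
    if len(r_values) <= least_item:
        return 1
    r_sorted = sorted(r_values)
    ap = r_sorted[int(fraction * len(r_sorted))]
    return min(1, max(minimum_ap, ap))

# adjust_parameter([0.3, 0.4, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.9, 1.1])
# => 0.8   (the 80th-percentile R value, already within [0.5, 1])
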
Exemplo n.º 20
0
class OrcmSystem (OrcmBaseSystem):


    """ORCM system component.

    Methods:
    add_process_groups -- add (start) an mpirun process on the system (exposed, ~query)
    get_process_groups -- retrieve mpirun processes (exposed, query)
    wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query)
    signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query)
    update_partition_state -- update partition state from the bridge API (runs as a thread)
    """

    name = "system"
    implementation = "orcm_system"

    logger = logger


    def __init__ (self, *args, **kwargs):
        OrcmBaseSystem.__init__(self, *args, **kwargs)
        self.process_groups.item_cls = OrcmProcessGroup

    def __del__ (self):
        OrcmBaseSystem.__del__(self)

    def __getstate__(self):
        state = {}
        state.update(OrcmBaseSystem.__getstate__(self))
        # state.update({
        #         "orcm_system_version": 1 })
        return state

    def __setstate__(self, state):
        OrcmBaseSystem.__setstate__(self, state)
        self.process_groups.item_cls = OrcmProcessGroup
    

    def add_process_groups (self, specs):
        """Create a process group.

        Arguments:
        spec -- dictionary hash specifying a process group to start
        """

        self.logger.info("add_process_groups(%r)", specs)
        process_groups = self.process_groups.q_add(specs)
        for pgroup in process_groups:
            self.logger.info("Job %s/%s: process group %s created to track script", pgroup.user, pgroup.jobid, pgroup.id)
        #System has started the job.  We need to remove these nodes from the
        #temporary allocation list in orcm_base_system.
        self.apg_started = True
        for pgroup in process_groups:
            for location in pgroup.location:
                try:
                    del self.alloc_only_nodes[location]
                except KeyError:
                    logger.critical("%s already removed from alloc_only_nodes list", location)
        return process_groups
    add_process_groups = exposed(query(add_process_groups))

    def get_process_groups (self, specs):
        self._get_exit_status()
        return self.process_groups.q_get(specs)
    get_process_groups = exposed(query(get_process_groups))

    def _get_exit_status (self):
        children = {}
        cleanup = {}
    _get_exit_status = automatic(_get_exit_status,
            float(get_orcm_system_config('get_exit_status_interval', 10)))

    def wait_process_groups (self, specs):
        process_groups = self.process_groups.q_get(specs)
        return process_groups
    wait_process_groups = locking(exposed(query(wait_process_groups)))

    def signal_process_groups (self, specs, signame="SIGINT"):
        my_process_groups = self.process_groups.q_get(specs)
        for pg in my_process_groups:
            OrcmBaseSystem.cancel_session(self, pg.jobid)
            pg.exit_status = 0
            for host in pg.location:
                self.running_nodes.discard(host)
        return my_process_groups
    signal_process_groups = exposed(query(signal_process_groups))

    def del_process_groups(self, jobid):
        '''delete a process group and don't track it anymore.

           jobid -- jobid associated with the process group we are removing

        '''
        del_items = self.process_groups.q_del([{'jobid':jobid}])
        if del_items == []:
            self.logger.warning("Job %s: Process group not found for this jobid.", jobid)
        else:
            self.logger.info("Job %s: Process group deleted.", jobid)
            return
Exemplo n.º 21
0
                        # until the job has exhausted its maximum allotted time
                        del self.process_groups[process_group.id]
                        raise
                    except:
                        self.logger.error("%s: an unexpected exception occurred while attempting to start the process group "
                            "using the %s component; releasing resources", pgroup.label, pgroup.forker, exc_info=True)
                        self.reserve_resources_until(pgroup.location, None, pgroup.jobid)
                        pgroup.exit_status = 255
                else:
                    self.logger.error("%s: the internal reservation on %s expired; job has been terminated", pgroup.label,
                        pgroup.location)
                    pgroup.exit_status = 255
        return process_groups

        
    add_process_groups = exposed(query(all_fields=True)(add_process_groups))
    
    def get_process_groups (self, specs):
        """Query process_groups from the simulator."""
        self._get_exit_status()
        return self.process_groups.q_get(specs)
    
    get_process_groups = exposed(query(get_process_groups))


    def _get_exit_status (self):

        #common to bgsystem

        running = []
        active_forker_components = []
Exemplo n.º 22
0
                # longer active.  an error message should also be attached to the process group so that cqm can report the
                # problem to the user.
                pgroup.exit_status = 1
                self.logger.info(
                    "process group %s: job %s/%s failed to set the kernel; %s",
                    pgroup.id, pgroup.jobid, pgroup.user, e)
            else:
                if pgroup.kernel != "default" and not pgroup.true_mpi_args:
                    self.logger.info(
                        "process group %s: job %s/%s using kernel %s",
                        pgroup.id, pgroup.jobid, pgroup.user, pgroup.kernel)
                pgroup.start()

        return script_pgroups + process_groups

    add_process_groups = exposed(query(add_process_groups))

    def get_process_groups(self, specs):
        self._get_exit_status()
        return self.process_groups.q_get(specs)

    get_process_groups = exposed(query(get_process_groups))

    def _get_exit_status(self):
        try:
            running = ComponentProxy("forker").active_list()
        except:
            self.logger.error(
                "failed to contact forker component for list of running jobs")
            return
Exemplo n.º 23
0
                            "%s: an unexpected exception occurred while attempting to start the process group "
                            "using the %s component; releasing resources",
                            pgroup.label,
                            pgroup.forker,
                            exc_info=True)
                        self.reserve_resources_until(pgroup.location, None,
                                                     pgroup.jobid)
                        pgroup.exit_status = 255
                else:
                    self.logger.error(
                        "%s: the internal reservation on %s expired; job has been terminated",
                        pgroup.label, pgroup.location)
                    pgroup.exit_status = 255
        return process_groups

    add_process_groups = exposed(query(all_fields=True)(add_process_groups))

    def get_process_groups(self, specs):
        """Query process_groups from the simulator."""
        self._get_exit_status()
        return self.process_groups.q_get(specs)

    get_process_groups = exposed(query(get_process_groups))

    def _get_exit_status(self):

        #common to bgsystem

        running = []
        active_forker_components = []
        for forker_component in ['bg_mpirun_forker', 'user_script_forker']:
Exemplo n.º 24
0
class Simulator(BGBaseSystem):
    """Generic system simulator.
    
    Methods:
    configure -- load partitions from an xml file
    reserve_partition -- lock a partition for use by a process_group (exposed)
    release_partition -- release a locked (busy) partition (exposed)
    add_process_groups -- add (start) a process group on the system (exposed, query)
    get_process_groups -- retrieve process groups (exposed, query)
    wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query)
    signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query)
    update_partition_state -- simulates updating partition state from the bridge API (automatic)
    """

    name = "system"
    implementation = "simulator"

    logger = logger

    bgsystem_config = BGBaseSystem.bgsystem_config

    def __init__(self, *args, **kwargs):
        BGBaseSystem.__init__(self, *args, **kwargs)
        sys.setrecursionlimit(5000)  #why this magic number?
        self.process_groups.item_cls = BGSimProcessGroup
        self.config_file = kwargs.get("config_file", None)
        self.failed_components = set()
        if self.config_file is not None:
            self.configure(self.config_file)

    def __getstate__(self):
        state = {}
        state.update(BGBaseSystem.__getstate__(self))
        state.update({'simulator_version': 3, 'config_file': self.config_file})
        return state

    def __setstate__(self, state):
        BGBaseSystem.__setstate__(self, state)
        self.config_file = state['config_file']
        self.process_groups.item_cls = BGSimProcessGroup
        if self.config_file is not None:
            self.configure(self.config_file)
        self.update_relatives()
        self._restore_partition_state(state)

    def save_me(self):
        Component.save(self)

    save_me = automatic(save_me,
                        float(bgsystem_config.get('save_me_interval', 10)))

    def configure(self, config_file):
        """Configure simulated partitions.
        
        Arguments:
        config_file -- xml configuration file
        """
        def _get_node_card(name):
            if not self.node_card_cache.has_key(name):
                self.node_card_cache[name] = NodeCard(name)

            return self.node_card_cache[name]

        self.logger.info("configure()")
        try:
            system_doc = ElementTree.parse(config_file)
        except IOError:
            self.logger.error("unable to open file: %r" % config_file)
            self.logger.error("exiting...")
            sys.exit(1)
        except:
            self.logger.error("problem loading data from file: %r" %
                              config_file)
            self.logger.error("exiting...")
            sys.exit(1)

        system_def = system_doc.getroot()
        if system_def.tag != "BG":
            self.logger.error("unexpected root element in %r: %r" %
                              (config_file, system_def.tag))
            self.logger.error("exiting...")
            sys.exit(1)

        # that 32 is not really constant -- it needs to either be read from cobalt.conf or from the bridge API
        NODES_PER_NODECARD = 32

        # initialize a new partition dict with all partitions
        #
        partitions = PartitionDict()

        tmp_list = []

        # this is going to hold partition objects from the bridge (not our own Partition)
        wiring_cache = {}
        bp_cache = {}

        for partition_def in system_def.getiterator("Partition"):
            node_list = []
            switch_list = []

            for nc in partition_def.getiterator("NodeCard"):
                node_list.append(_get_node_card(nc.get("id")))

            nc_count = len(node_list)

            if not wiring_cache.has_key(nc_count):
                wiring_cache[nc_count] = []
            wiring_cache[nc_count].append(partition_def.get("name"))

            for s in partition_def.getiterator("Switch"):
                switch_list.append(s.get("id"))

            tmp_list.append(
                dict(
                    name=partition_def.get("name"),
                    queue=partition_def.get("queue", "default"),
                    size=NODES_PER_NODECARD * nc_count,
                    node_cards=node_list,
                    switches=switch_list,
                    state="idle",
                ))

        partitions.q_add(tmp_list)

        # find the wiring deps
        for size in wiring_cache:
            for p in wiring_cache[size]:
                p = partitions[p]
                s1 = set(p.switches)
                for other in wiring_cache[size]:
                    other = partitions[other]
                    if (p.name == other.name):
                        continue

                    s2 = set(other.switches)

                    if s1.intersection(s2):
                        self.logger.info(
                            "found a wiring dep between %s and %s", p.name,
                            other.name)
                        partitions[p.name]._wiring_conflicts.add(other.name)

        # update object state
        self._partitions.clear()
        self._partitions.update(partitions)

    def reserve_partition(self, name, size=None):
        """Reserve a partition and block all related partitions.
        
        Arguments:
        name -- name of the partition to reserve
        size -- size of the process group reserving the partition (optional)
        """

        try:
            self._partitions_lock.acquire()

            try:
                partition = self.partitions[name]
            except KeyError:
                self.logger.error(
                    "reserve_partition(%r, %r) [does not exist]" %
                    (name, size))
                return False
            if partition.state != "allocated":
                self.logger.error("reserve_partition(%r, %r) [%s]" %
                                  (name, size, partition.state))
                return False
            if not partition.functional:
                self.logger.error(
                    "reserve_partition(%r, %r) [not functional]" %
                    (name, size))
                return False
            if size is not None and size > partition.size:
                self.logger.error("reserve_partition(%r, %r) [size mismatch]" %
                                  (name, size))
                return False

            partition.state = "busy"
            # partition.reserved_until = False
        finally:
            self._partitions_lock.release()

        # explicitly call this, since the above "busy" is instantaneously available
        self.update_partition_state()

        self.logger.info("reserve_partition(%r, %r)" % (name, size))
        return True

    reserve_partition = exposed(reserve_partition)

    def release_partition(self, name):
        """Release a reserved partition.
        
        Arguments:
        name -- name of the partition to release
        """
        try:
            self._partitions_lock.acquire()

            try:
                partition = self.partitions[name]
            except KeyError:
                self.logger.error("release_partition(%r) [already free]" %
                                  (name))
                return False
            if not partition.state == "busy":
                self.logger.info("release_partition(%r) [not busy]" % (name))
                return False

            if partition.used_by is not None:
                partition.state = "allocated"
            else:
                partition.state = "idle"
        finally:
            self._partitions_lock.release()

        # explicitly unblock the blocked partitions
        self.update_partition_state()

        self.logger.info("release_partition(%r)" % (name))
        return True

    release_partition = exposed(release_partition)

    def _mark_partition_for_cleaning(self, pname, jobid):
        pass

    def _set_kernel(self, partition, kernel):
        # TODO: allow the kernel set step to work in the simulator.  For now this doesn't fly.
        pass

    def update_partition_state(self):
        # first, set all of the nodecards to not busy
        for nc in self.node_card_cache.values():
            nc.used_by = ''

        self._partitions_lock.acquire()
        try:
            for p in self._partitions.values():
                p._update_node_cards()

            now = time.time()

            # since we don't have the bridge, a partition which isn't busy
            # should be set to idle and then blocked states can be derived
            for p in self._partitions.values():
                if p.state != "busy":
                    p.state = "idle"
                if p.reserved_until and now > p.reserved_until:
                    p.reserved_until = None
                    p.reserved_by = None

            for p in self._partitions.values():
                if p.state == "busy":
                    # when the partition becomes busy, if a script job isn't reserving it, then release the reservation
                    if not p.reserved_by:
                        p.reserved_until = False
                else:
                    if p.reserved_until:
                        p.state = "allocated"
                        for part in p._parents:
                            if part.state == "idle":
                                part.state = "blocked (%s)" % (p.name, )
                        for part in p._children:
                            if part.state == "idle":
                                part.state = "blocked (%s)" % (p.name, )
                    for diag_part in self.pending_diags:
                        if p.name == diag_part.name or p.name in diag_part.parents or p.name in diag_part.children:
                            p.state = "blocked by pending diags"
                    for nc in p.node_cards:
                        if nc.used_by:
                            p.state = "blocked (%s)" % nc.used_by
                            break
                    for dep_name in p._wiring_conflicts:
                        if self._partitions[dep_name].state in [
                                "allocated", "busy"
                        ]:
                            p.state = "blocked-wiring (%s)" % dep_name
                            break
                    for part_name in self.failed_diags:
                        part = self._partitions[part_name]
                        if p.name == part.name:
                            p.state = "failed diags"
                        elif p.name in part.parents or p.name in part.children:
                            p.state = "blocked by failed diags"
        except:
            self.logger.error("error in update_partition_state", exc_info=True)
        finally:
            self._partitions_lock.release()

    update_partition_state = automatic(update_partition_state)

    def add_failed_components(self, component_names):
        success = []
        for name in component_names:
            if self.node_card_cache.has_key(name):
                self.failed_components.add(name)
                success.append(name)
            else:
                for p in self._partitions.values():
                    if name in p.switches:
                        self.failed_components.add(name)
                        success.append(name)
                        break
        return success

    add_failed_components = exposed(add_failed_components)

    def del_failed_components(self, component_names):
        success = []
        for name in component_names:
            try:
                self.failed_components.remove(name)
                success.append(name)
            except KeyError:
                pass

        return success

    del_failed_components = exposed(del_failed_components)

    def list_failed_components(self, component_names):
        return list(self.failed_components)

    list_failed_components = exposed(list_failed_components)

    def launch_diags(self, partition, test_name):
        exit_value = 0
        for nc in partition.node_cards:
            if nc.id in self.failed_components:
                exit_value = 1
        for switch in partition.switches:
            if switch in self.failed_components:
                exit_value = 2

        self.finish_diags(partition, test_name, exit_value)
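    # Sketch of the diagnostics round trip (hypothetical names): a node card
    # placed in failed_components makes launch_diags hand exit_value 1 to
    # finish_diags, which either records the partition in failed_diags or
    # reruns diags on its children:
    #
    #   sim.add_failed_components(["R00-M0-N00"])
    #   sim.launch_diags(sim._partitions["ANL-R00-M0-512"], "smoke-test")
    #   print sim.failed_diags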
Example No. 25
class Simulator(BGBaseSystem):
    """Generic system simulator.
    
    Methods:
    configure -- load partitions from an xml file
    reserve_partition -- lock a partition for use by a process_group (exposed)
    release_partition -- release a locked (busy) partition (exposed)
    add_process_groups -- add (start) a process group on the system (exposed, query)
    get_process_groups -- retrieve process groups (exposed, query)
    wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query)
    signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query)
    update_partition_state -- simulates updating partition state from the bridge API (automatic)
    """

    name = "system"
    implementation = "simulator"

    logger = logger

    MIN_RUN_TIME = 60
    MAX_RUN_TIME = 180

    def __init__(self, *args, **kwargs):
        BGBaseSystem.__init__(self, *args, **kwargs)
        sys.setrecursionlimit(5000)  #why this magic number?
        self.process_groups.item_cls = BGSimProcessGroup
        self.config_file = kwargs.get("config_file", None)
        self.failed_components = set()
        if self.config_file is not None:
            self.configure(self.config_file)

    def __getstate__(self):
        flags = {}
        for part in self._partitions.values():
            sched = None
            func = None
            queue = None
            if hasattr(part, 'scheduled'):
                sched = part.scheduled
            if hasattr(part, 'functional'):
                func = part.functional
            if hasattr(part, 'queue'):
                queue = part.queue
            flags[part.name] = (sched, func, queue)
        return {
            'managed_partitions': self._managed_partitions,
            'version': 2,
            'config_file': self.config_file,
            'partition_flags': flags
        }

    def __setstate__(self, state):
        Cobalt.Util.fix_set(state)
        sys.setrecursionlimit(5000)
        self._managed_partitions = state['managed_partitions']
        self.config_file = state['config_file']
        self._partitions = PartitionDict()
        self.process_groups = BGProcessGroupDict()
        self.process_groups.item_cls = BGSimProcessGroup
        self.node_card_cache = dict()
        self._partitions_lock = thread.allocate_lock()
        self.failed_components = set()
        self.pending_diags = dict()
        self.failed_diags = list()
        self.bridge_in_error = False
        self.cached_partitions = None
        self.offline_partitions = []
        if self.config_file is not None:
            self.configure(self.config_file)

        if 'partition_flags' in state:
            for pname, flags in state['partition_flags'].items():
                if pname in self._partitions:
                    self._partitions[pname].scheduled = flags[0]
                    self._partitions[pname].functional = flags[1]
                    self._partitions[pname].queue = flags[2]
                else:
                    logger.info("Partition %s is no longer defined" % pname)

        self.update_relatives()
        self.lock = threading.Lock()
        self.statistics = Statistics()

    def save_me(self):
        Component.save(self)

    save_me = automatic(save_me)

    def configure(self, config_file):
        """Configure simulated partitions.
        
        Arguments:
        config_file -- xml configuration file
        """
        def _get_node_card(name):
            if not self.node_card_cache.has_key(name):
                self.node_card_cache[name] = NodeCard(name)

            return self.node_card_cache[name]

        self.logger.info("configure()")
        try:
            system_doc = ElementTree.parse(config_file)
        except IOError:
            self.logger.error("unable to open file: %r" % config_file)
            self.logger.error("exiting...")
            sys.exit(1)
        except:
            self.logger.error("problem loading data from file: %r" %
                              config_file)
            self.logger.error("exiting...")
            sys.exit(1)

        system_def = system_doc.getroot()
        if system_def.tag != "BG":
            self.logger.error("unexpected root element in %r: %r" %
                              (config_file, system_def.tag))
            self.logger.error("exiting...")
            sys.exit(1)

        # that 32 is not really constant -- it needs to either be read from cobalt.conf or from the bridge API
        NODES_PER_NODECARD = 32

        # initialize a new partition dict with all partitions
        #
        partitions = PartitionDict()

        tmp_list = []

        # this is going to hold partition objects from the bridge (not our own Partition)
        wiring_cache = {}
        bp_cache = {}

        for partition_def in system_def.getiterator("Partition"):
            node_list = []
            switch_list = []

            for nc in partition_def.getiterator("NodeCard"):
                node_list.append(_get_node_card(nc.get("id")))

            nc_count = len(node_list)

            if not wiring_cache.has_key(nc_count):
                wiring_cache[nc_count] = []
            wiring_cache[nc_count].append(partition_def.get("name"))

            for s in partition_def.getiterator("Switch"):
                switch_list.append(s.get("id"))

            tmp_list.append(
                dict(
                    name=partition_def.get("name"),
                    queue=partition_def.get("queue", "default"),
                    size=NODES_PER_NODECARD * nc_count,
                    node_cards=node_list,
                    switches=switch_list,
                    state="idle",
                ))

        partitions.q_add(tmp_list)

        # find the wiring deps
        for size in wiring_cache:
            for p in wiring_cache[size]:
                p = partitions[p]
                s1 = set(p.switches)
                for other in wiring_cache[size]:
                    other = partitions[other]
                    if (p.name == other.name):
                        continue

                    s2 = set(other.switches)

                    if s1.intersection(s2):
                        self.logger.info(
                            "found a wiring dep between %s and %s", p.name,
                            other.name)
                        partitions[p.name]._wiring_conflicts.add(other.name)

        # update object state
        self._partitions.clear()
        self._partitions.update(partitions)
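    # configure() expects an XML document shaped like the following sketch
    # (partition and component ids are hypothetical):
    #
    #   <BG>
    #     <Partition name="ANL-R00-M0-512" queue="default">
    #       <NodeCard id="R00-M0-N00"/>
    #       <NodeCard id="R00-M0-N01"/>
    #       <Switch id="R00-S0"/>
    #     </Partition>
    #   </BG>
    #
    # Each partition's size is NODES_PER_NODECARD times its NodeCard count,
    # and same-sized partitions sharing a Switch id become wiring conflicts.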

    def reserve_partition(self, name, size=None):
        """Reserve a partition and block all related partitions.
        
        Arguments:
        name -- name of the partition to reserve
        size -- size of the process group reserving the partition (optional)
        """

        try:
            partition = self.partitions[name]
        except KeyError:
            self.logger.error("reserve_partition(%r, %r) [does not exist]" %
                              (name, size))
            return False
        if partition.state != "allocated":
            self.logger.error("reserve_partition(%r, %r) [%s]" %
                              (name, size, partition.state))
            return False
        if not partition.functional:
            self.logger.error("reserve_partition(%r, %r) [not functional]" %
                              (name, size))
            return False
        if size is not None and size > partition.size:
            self.logger.error("reserve_partition(%r, %r) [size mismatch]" %
                              (name, size))
            return False

        self._partitions_lock.acquire()
        try:
            partition.state = "busy"
            partition.reserved_until = False
        except:
            self.logger.error("error in reserve_partition", exc_info=True)
        finally:
            self._partitions_lock.release()
        # explicitly call this, since the above "busy" is instantaneously available
        self.update_partition_state()

        self.logger.info("reserve_partition(%r, %r)" % (name, size))
        return True

    reserve_partition = exposed(reserve_partition)

    def release_partition(self, name):
        """Release a reserved partition.
        
        Arguments:
        name -- name of the partition to release
        """
        try:
            partition = self.partitions[name]
        except KeyError:
            self.logger.error("release_partition(%r) [already free]" % (name))
            return False
        if not partition.state == "busy":
            self.logger.info("release_partition(%r) [not busy]" % (name))
            return False

        self._partitions_lock.acquire()
        try:
            partition.state = "idle"
        except:
            self.logger.error("error in release_partition", exc_info=True)
        finally:
            self._partitions_lock.release()

        # explicitly unblock the blocked partitions
        self.update_partition_state()

        self.logger.info("release_partition(%r)" % (name))
        return True

    release_partition = exposed(release_partition)

    def add_process_groups(self, specs):
        """Create a simulated process group.
        
        Arguments:
        specs -- list of dictionaries, each specifying a process group to start
        """

        self.logger.info("add_process_groups(%r)" % (specs))

        # FIXME: setting exit_status to signal the job has failed isn't really the right thing to do.  another flag should be
        # added to the process group that wait_process_group uses to determine when a process group is no longer active.  an
        # error message should also be attached to the process group so that cqm can report the problem to the user.
        process_groups = self.process_groups.q_add(specs)
        for pgroup in process_groups:
            pgroup.label = "Job %s/%s/%s" % (pgroup.jobid, pgroup.user,
                                             pgroup.id)
            pgroup.nodect = self._partitions[pgroup.location[0]].size
            self.logger.info(
                "%s: process group %s created to track job status",
                pgroup.label, pgroup.id)
            try:
                #TODO: allow the kernel set step to work in the simulator.  For now this doesn't fly.
                pass
                #self._set_kernel(pgroup.location[0], pgroup.kernel)
            except Exception, e:
                self.logger.error("%s: failed to set the kernel; %s",
                                  pgroup.label, e)
                pgroup.exit_status = 255
            else:
                if pgroup.kernel != "default":
                    self.logger.info("%s: now using kernel %s", pgroup.label,
                                     pgroup.kernel)
                if pgroup.mode == "script":
                    pgroup.forker = 'user_script_forker'
                else:
                    pgroup.forker = 'bg_mpirun_forker'
                if self.reserve_resources_until(
                        pgroup.location,
                        float(pgroup.starttime) + 60 * float(pgroup.walltime),
                        pgroup.jobid):
                    try:
                        pgroup.start()
                        if pgroup.head_pid == None:
                            self.logger.error(
                                "%s: process group failed to start using the %s component; releasing resources",
                                pgroup.label, pgroup.forker)
                            self.reserve_resources_until(
                                pgroup.location, None, pgroup.jobid)
                            pgroup.exit_status = 255
                    except ComponentLookupError:
                        self.logger.error(
                            "%s: failed to contact the %s component",
                            pgroup.label, pgroup.forker)
                        # do not release the resources; instead re-raise the exception and allow cqm the opportunity to retry
                        # until the job has exhausted its maximum allotted time
                        del self.process_groups[pgroup.id]
                        raise
                    except xmlrpclib.Fault:
                        self.logger.error(
                            "%s: a fault occurred while attempting to start the process group using the %s "
                            "component", pgroup.label, pgroup.forker)
                        # do not release the resources; instead re-raise the exception and allow cqm the opportunity to retry
                        # until the job has exhausted its maximum allotted time
                        del self.process_groups[pgroup.id]
                        raise
                    except:
Example No. 26
class MessageQueue(Component):

    name = "cdbwriter"
    implementation = "cdbwriter"
    logger = logging.getLogger("Cobalt.Components.cdbwriter")

    _configfields = ['user', 'pwd', 'database', 'schema']
    _config = ConfigParser.ConfigParser()
    _config.read(Cobalt.CONFIG_FILES)
    if not _config._sections.has_key('cdbwriter'):
        logger.error('"cdbwriter" section missing from config file.')
    config = _config._sections['cdbwriter']
    mfields = [field for field in _configfields if not config.has_key(field)]
    if mfields:
        logger.error(
            "Missing option(s) in cobalt config file [cdbwriter] section: %s" %
            (" ".join(mfields)))
        sys.exit(1)

    def __init__(self, *args, **kwargs):
        Component.__init__(self, *args, **kwargs)
        self.sync_state = Cobalt.Util.FailureMode("Foreign Data Sync")
        self.connected = False
        self.msg_queue = []
        self.decoder = LogMessageDecoder()

        self.overflow = False
        self.overflow_filename = None
        self.overflow_file = None
        self.clearing_overflow = False

        self.max_queued = int(get_cdbwriter_config('max_queued_msgs', '-1'))

        if self.max_queued <= 0:
            logger.info("message queue set to unlimited.")
            self.max_queued = None
        else:
            self.overflow_filename = get_cdbwriter_config(
                'overflow_file', None)

        if self.max_queued and (self.overflow_filename == None):
            logger.warning(
                "No file given to catch maximum messages. Setting queue size to unlimited."
            )
            self.max_queued = None

    def __getstate__(self):
        state = {}
        state.update(Component.__getstate__(self))
        state.update({
            'cdbwriter_version': 1,
            'msg_queue': self.msg_queue,
            'overflow': self.overflow
        })
        return state

    def __setstate__(self, state):
        Component.__setstate__(self, state)

        self.msg_queue = state['msg_queue']
        self.connected = False
        self.decoder = LogMessageDecoder()
        self.clearing_overflow = False
        self.overflow_filename = None
        self.overflow_file = None
        self.max_queued = int(get_cdbwriter_config('max_queued_msgs', '-1'))

        if self.max_queued <= 0:
            logger.info("message queue set to unlimited.")
            self.max_queued = None
        else:
            self.overflow_filename = get_cdbwriter_config(
                'overflow_file', None)

        if self.max_queued and (self.overflow_filename == None):
            logger.warning(
                "No file given to catch maximum messages. Setting queue size to unlimited."
            )
            self.max_queued = None

        if state.has_key('overflow') and self.max_queued:
            self.overflow = state['overflow']
        else:
            self.overflow = False

    def init_database_connection(self):
        user = get_cdbwriter_config('user', None)
        pwd = get_cdbwriter_config('pwd', None)
        database = get_cdbwriter_config('database', None)
        schema = get_cdbwriter_config('schema', None)

        try:
            self.database_writer = DatabaseWriter(database, user, pwd, schema)
        except:
            logger.error("Unable to connect to %s as %s" % (database, user))
            self.connected = False
            logger.debug(traceback.format_exc())
        else:
            self.connected = True

    def iterate(self):
        """Go through the messages that are sitting on the queue and
        load them into the database."""

        #if we're not connected, try to reconnect to the database
        if not self.connected:
            logger.debug("Attempting reconnection.")
            self.init_database_connection()

        if self.connected and self.overflow:
            self.clearing_overflow = True
            self.open_overflow('r')
            if self.overflow_file:
                overflow_queue = [
                    self.decoder.decode(line) for line in self.overflow_file
                ]
                overflow_queue.extend(self.msg_queue)
                self.msg_queue = overflow_queue
                self.close_overflow()
                self.del_overflow()
                self.overflow = False

        while self.msg_queue and self.connected:
            msg = self.msg_queue[0]

            try:
                self.database_writer.addMessage(msg)
            except db2util.adapterError:
                logger.error(
                    "Error updating database.  Unable to add message due to adapter error. Message dropped."
                )
                logger.debug(traceback.format_exc())
                self.msg_queue.pop(0)
            except:
                logger.error("Error updating database.  Unable to add message.")
                logger.debug(traceback.format_exc())
                self.connected = False
                #if we were clearing an overflow, here we go again.
                if ((self.max_queued != None)
                        and (len(self.msg_queue) >= self.max_queued)):
                    self.overflow = True
                    self.open_overflow('a')
                    if self.overflow_file != None:
                        self.queue_to_overflow()
                        self.close_overflow()

                break
            else:
                #message added
                self.msg_queue.pop(0)

        self.clearing_overflow = False

    iterate = automatic(iterate)

    def add_message(self, msg):

        #keep the queue from consuming all memory
        if ((self.max_queued != None)
                and (len(self.msg_queue) >= self.max_queued)
                and (not self.clearing_overflow)):

            self.overflow = True
            self.open_overflow('a')
            if self.overflow_file == None:
                logger.critical("MESSAGE DROPPED: %s", msg)
            else:
                self.queue_to_overflow()
                self.close_overflow()
        #and now queue as normal

        msgDict = None

        try:
            msgDict = self.decoder.decode(msg)

        except ValueError:
            logger.error("Bad message received.  Failed to decode string %s" %
                         msg)
            return
        except:
            logger.debug(traceback.format_exc())
            return

        self.msg_queue.append(msgDict)

    add_message = exposed(add_message)

    def save_me(self):
        Component.save(self)

    save_me = automatic(save_me)

    def open_overflow(self, mode):

        try:
            self.overflow_file = open(self.overflow_filename, mode)
        except:
            self.logger.critical(
                "Unable to open overflow file!  Information to database will be lost!"
            )

    def close_overflow(self):
        if self.overflow and self.overflow_file:
            self.overflow_file.close()
            self.overflow_file = None

    def del_overflow(self):
        os.remove(self.overflow_filename)

    def queue_to_overflow(self):

        elements_written = 0
        if len(self.msg_queue) == 0:
            return

        while self.msg_queue:
            msg = self.msg_queue.pop(0)
            try:
                self.overflow_file.write(
                    json.dumps(msg, cls=LogMessageEncoder) + '\n')
                elements_written += 1
            except IOError:
                logger.error(
                    'Could only partially empty queue, %d messages written' %
                    elements_written)
                # pop() already removed the written messages, so just put the
                # unwritten one back and stop
                self.msg_queue.insert(0, msg)
                break

        return len(self.msg_queue)
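    # The queue spills to a JSON-lines overflow file once max_queued is
    # reached, and iterate() folds that file back into the queue after the
    # database connection returns.  A sketch of the cobalt config this reads
    # (option names taken from the get_cdbwriter_config calls above; values
    # are hypothetical):
    #
    #   [cdbwriter]
    #   user = db2inst1
    #   pwd = secret
    #   database = cobaltdb
    #   schema = cobalt
    #   max_queued_msgs = 5000
    #   overflow_file = /var/spool/cobalt/cdbwriter.overflow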
Example No. 27
class EventSimulator(Component):
    """Event Simulator. Manages time stamps, events, and the advancing of the clock
        
    Definition of an event, which is a dictionary of following keys:
        machine -- 0, 1, 2 ... represent the system (e.g. Intrepid or Eureka) where the event occurs
        type -- I (init), Q (submit job), S (start job), E (end job),
        datetime -- the date time at which the event occurs
        unixtime -- the unix time form for datetime
        jobid -- the job id associated with the event
        location -- the location where the event occurs, represented by node list or partition list 
    """

    implementation = "evsim"
    name = "event-manager"

    def __init__(self, *args, **kwargs):

        Component.__init__(self, *args, **kwargs)
        self.event_list = [{'unixtime': 0}]
        self.time_stamp = 0

        self.finished = False

        self.bgsched = Sim_bg_Sched(**kwargs)
        self.csched = Sim_Cluster_Sched()

        self.mmon = metricmon()

        self.go_next = True

    def set_go_next(self, bool_value):
        self.go_next = bool_value

    set_go_next = exposed(set_go_next)

    def get_go_next(self):
        return self.go_next

    get_go_next = exposed(get_go_next)

    def events_length(self):
        return len(self.event_list)

    def add_event(self, ev_spec):
        '''insert time stamps in the same order'''

        time_sec = ev_spec.get('unixtime')
        if time_sec == None:
            print "insert time stamp error: no unix time provided"
            return -1

        if not ev_spec.has_key('jobid'):
            ev_spec['jobid'] = 0
        if not ev_spec.has_key('location'):
            ev_spec['location'] = []

        pos = self.events_length()

        while time_sec < self.event_list[pos - 1].get('unixtime'):
            pos = pos - 1

        self.event_list.insert(pos, ev_spec)
        #print "insert time stamp ", ev_spec, " at pos ", pos
        return pos

    add_event = exposed(add_event)
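    # add_event keeps event_list sorted by 'unixtime' (index 0 holds the
    # sentinel {'unixtime': 0}); e.g. with hypothetical stamps, adding events
    # at 300, 100, then 200 leaves unixtimes [0, 100, 300] then
    # [0, 100, 200, 300], with the insertion position returned each time.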

    def get_time_span(self):
        '''return the whole time span'''
        starttime = self.event_list[1].get('unixtime')
        endtime = self.event_list[-1].get('unixtime')
        timespan = endtime - starttime
        return timespan

    get_time_span = exposed(get_time_span)

    def get_current_time_stamp(self):
        '''return current time stamp'''
        return self.time_stamp

    def get_current_time(self):
        '''return current unix time'''
        return self.event_list[self.time_stamp].get('unixtime')

    get_current_time = exposed(get_current_time)

    def get_current_date_time(self):
        '''return current date time'''
        return self.event_list[self.time_stamp].get('datetime')

    get_current_date_time = exposed(get_current_date_time)

    def get_current_event_type(self):
        '''return current event type'''
        return self.event_list[self.time_stamp].get('type')

    get_current_event_type = exposed(get_current_event_type)

    def get_current_event_job(self):
        '''return current event job'''
        return self.event_list[self.time_stamp].get('jobid')

    get_current_event_job = exposed(get_current_event_job)

    def get_current_event_location(self):
        return self.event_list[self.time_stamp].get('location')

    get_current_event_location = exposed(get_current_event_location)

    def get_current_event_machine(self):
        '''return machine which the current event belongs to'''
        return self.event_list[self.time_stamp].get('machine')

    def get_current_event_all(self):
        '''return current event'''
        return self.event_list[self.time_stamp]

    def get_next_event_time_sec(self):
        '''return the next event time'''
        if self.time_stamp < len(self.event_list) - 1:
            return self.event_list[self.time_stamp + 1].get('unixtime')
        else:
            return -1

    get_next_event_time_sec = exposed(get_next_event_time_sec)

    def is_finished(self):
        return self.finished

    is_finished = exposed(is_finished)

    def clock_increment(self):
        '''the current time stamp increments by 1'''
        if self.time_stamp < len(self.event_list) - 1:
            self.time_stamp += 1
            if SHOW_SCREEN_LOG:
                print str(self.get_current_date_time()) + \
                "[%s]: Time stamp is incremented by 1, current time stamp: %s " % (self.implementation, self.time_stamp)
        else:
            self.finished = True

        return self.time_stamp

    clock_increment = exposed(clock_increment)

    def add_init_events(self, jobspecs, machine_id):  ###EVSIM change here
        """add initial submission events based on input jobs and machine id"""

        for jobspec in jobspecs:
            evspec = {}
            evspec['machine'] = machine_id
            evspec['type'] = "Q"
            evspec['unixtime'] = float(jobspec.get('submittime'))
            evspec['datetime'] = sec_to_date(float(jobspec.get('submittime')))
            evspec['jobid'] = jobspec.get('jobid')
            evspec['location'] = []
            self.add_event(evspec)

    add_init_events = exposed(add_init_events)

    def init_unhold_events(self, machine_id):
        """add unholding event"""
        if not self.event_list:
            return

        first_time_sec = self.event_list[1]['unixtime']
        last_time_sec = self.event_list[-1]['unixtime']

        unhold_point = first_time_sec + UNHOLD_INTERVAL + machine_id
        while unhold_point < last_time_sec:
            evspec = {}
            evspec['machine'] = machine_id
            evspec['type'] = "C"
            evspec['unixtime'] = unhold_point
            evspec['datetime'] = sec_to_date(unhold_point)
            self.add_event(evspec)

            unhold_point += UNHOLD_INTERVAL + machine_id

    init_unhold_events = exposed(init_unhold_events)

    def init_mmon_events(self):
        """add metrics monitor points into time stamps"""
        if not self.event_list:
            return

        first_time_sec = self.get_first_mmon_point(
            self.event_list[1]['datetime'])
        last_time_sec = self.event_list[-1]['unixtime']
        machine_id = MMON

        mmon_point = first_time_sec + MMON_INTERVAL
        while mmon_point < last_time_sec:
            evspec = {}
            evspec['machine'] = machine_id
            evspec['unixtime'] = mmon_point
            evspec['datetime'] = sec_to_date(mmon_point)
            self.add_event(evspec)
            mmon_point += MMON_INTERVAL

    init_mmon_events = exposed(init_mmon_events)

    def get_first_mmon_point(self, date_time):
        "based on the input date time (%m/%d/%Y %H:%M:%S), get the next epoch time that is at the beginning of an hour"
        segs = date_time.split()
        hours = segs[1].split(":")
        new_datetime = "%s %s:%s:%s" % (segs[0], hours[0], '00', '00')
        new_epoch = date_to_sec(new_datetime) + 3600
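        # e.g. a hypothetical input "03/15/2010 13:45:30" truncates to
        # "03/15/2010 13:00:00"; the +3600 then yields the epoch second
        # for 14:00:00 on the same day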
        return new_epoch

    def print_events(self):
        print "total events:", len(self.event_list)
        i = 0
        for event in self.event_list:
            print event
            i += 1
            if i == 25:
                break

    def event_driver(self):
        """core part that drives the clock"""

        if self.go_next:
            # advance the clock only when go_next is true; this lets the
            # scheduler handle multiple jobs at the same time stamp
            self.clock_increment()

        machine = self.get_current_event_machine()
        #        print "[%s]: %s, machine=%s, event=%s, job=%s" % (
        #                                            self.implementation,
        #                                            self.get_current_date_time(),
        #                                            self.get_current_event_machine(),
        #                                            self.get_current_event_type(),
        #                                            self.get_current_event_job(),
        #                                            )

        if machine == INTREPID:
            self.bgsched.schedule_jobs()
        if machine == EUREKA:
            self.csched.schedule_jobs()
        if machine == MMON:
            self.mmon.metric_monitor()

        if self.go_next:
            ComponentProxy("queue-manager").calc_loss_of_capacity()
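    # A minimal driving loop, as a sketch (assumes the scheduler components
    # built in __init__ are available and a job trace has been parsed into
    # jobspecs):
    #
    #   evsim = EventSimulator()
    #   evsim.add_init_events(jobspecs, 0)   # machine 0, e.g. Intrepid
    #   while not evsim.is_finished():
    #       evsim.event_driver()             # advances at most one time stamp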
Example No. 28
                # FIXME: setting exit_status to signal the job has failed isn't really the right thing to do.  another flag
                # should be added to the process group that wait_process_group uses to determine when a process group is no
                # longer active.  an error message should also be attached to the process group so that cqm can report the
                # problem to the user.
                pgroup.exit_status = 1
                self.logger.info("process group %s: job %s/%s failed to set the kernel; %s", pgroup.id, pgroup.jobid, 
                    pgroup.user, e)
            else:
                if pgroup.kernel != "default" and not pgroup.true_mpi_args:
                    self.logger.info("process group %s: job %s/%s using kernel %s", pgroup.id, pgroup.jobid, pgroup.user,
                        pgroup.kernel)
                pgroup.start()
            
        return script_pgroups + process_groups
    
    add_process_groups = exposed(query(add_process_groups))
    
    def get_process_groups (self, specs):
        self._get_exit_status()
        return self.process_groups.q_get(specs)
    get_process_groups = exposed(query(get_process_groups))
    
    def _get_exit_status (self):
        try:
            running = ComponentProxy("forker").active_list()
        except:
            self.logger.error("failed to contact forker component for list of running jobs")
            return

        for each in self.process_groups.itervalues():
            if each.head_pid not in running and each.exit_status is None and each.mode != "script":
Example No. 29
class ClusterBaseSystem(Component):
    """base system class.
    
    Methods:
    add_partitions -- tell the system to manage partitions (exposed, query)
    get_partitions -- retrieve partitions in the simulator (exposed, query)
    del_partitions -- tell the system not to manage partitions (exposed, query)
    set_partitions -- change random attributes of partitions (exposed, query)
    update_relatives -- should be called when partitions are added and removed from the managed list
    """
    def __init__(self, *args, **kwargs):
        Component.__init__(self, *args, **kwargs)
        self.process_groups = ProcessGroupDict()
        self.pending_diags = dict()
        self.failed_diags = list()
        self.all_nodes = sets.Set()
        self.running_nodes = sets.Set()
        self.down_nodes = sets.Set()
        self.queue_assignments = {}
        self.node_order = {}
        try:
            self.configure(CP.get("cluster_system", "hostfile"))
        except:
            self.logger.error("unable to load hostfile")
        self.queue_assignments["default"] = sets.Set(self.all_nodes)

    def __getstate__(self):
        return {
            "queue_assignments": self.queue_assignments,
            "version": 1,
            "down_nodes": self.down_nodes
        }

    def __setstate__(self, state):
        self.queue_assignments = state["queue_assignments"]
        self.down_nodes = state["down_nodes"]

        self.process_groups = ProcessGroupDict()
        self.pending_diags = dict()
        self.failed_diags = list()
        self.all_nodes = sets.Set()
        self.running_nodes = sets.Set()
        self.node_order = {}
        try:
            self.configure(CP.get("cluster_system", "hostfile"))
        except:
            self.logger.error("unable to load hostfile")
        self.lock = threading.Lock()
        self.statistics = Statistics()

    def save_me(self):
        Component.save(self)

    save_me = automatic(save_me)

    def validate_job(self, spec):
        """validate a job for submission

        Arguments:
        spec -- job specification dictionary
        """
        # spec has {nodes, walltime*, procs, mode, kernel}

        max_nodes = len(self.all_nodes)
        # FIXME: is bgtype really needed for clusters?
        try:
            sys_type = CP.get('bgsystem', 'bgtype')
        except:
            sys_type = 'bgl'
        if sys_type == 'bgp':
            job_types = ['smp', 'dual', 'vn', 'script']
        else:
            job_types = ['co', 'vn', 'script']
        try:
            spec['nodecount'] = int(spec['nodecount'])
        except:
            raise JobValidationError("Non-integer node count")
        if not 0 < spec['nodecount'] <= max_nodes:
            raise JobValidationError("Node count out of realistic range")
        if float(spec['time']) < 5:
            raise JobValidationError("Walltime less than minimum")
        if not spec['mode']:
            if sys_type == 'bgp':
                spec['mode'] = 'smp'
            else:
                spec['mode'] = 'co'
        if spec['mode'] not in job_types:
            raise JobValidationError("Invalid mode")
        if not spec['proccount']:
            if spec.get('mode', 'co') == 'vn':
                if sys_type == 'bgl':
                    spec['proccount'] = str(2 * int(spec['nodecount']))
                elif sys_type == 'bgp':
                    spec['proccount'] = str(4 * int(spec['nodecount']))
                else:
                    self.logger.error("Unknown bgtype %s" % (sys_type))
            elif spec.get('mode', 'co') == 'dual':
                spec['proccount'] = 2 * int(spec['nodecount'])
            else:
                spec['proccount'] = spec['nodecount']
        else:
            try:
                spec['proccount'] = int(spec['proccount'])
            except:
                raise JobValidationError("non-integer proccount")
            if spec['proccount'] < 1:
                raise JobValidationError("negative proccount")
            if spec['proccount'] > spec['nodecount']:
                if spec['mode'] not in ['vn', 'dual']:
                    raise JobValidationError("proccount too large")
                if sys_type == 'bgl' and (spec['proccount'] >
                                          (2 * spec['nodecount'])):
                    raise JobValidationError("proccount too large")
                elif sys_type == 'bgp' and (spec['proccount'] >
                                            (4 * spec['nodecount'])):
                    raise JobValidationError("proccount too large")
        # need to handle kernel
        return spec

    validate_job = exposed(validate_job)
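    # e.g. (hypothetical values) on a 'bgp' system a spec of
    #   {'nodecount': '8', 'time': '30', 'mode': '', 'proccount': ''}
    # comes back with mode 'smp' and proccount equal to nodecount, while a
    # nodecount above len(self.all_nodes) or a walltime under 5 raises
    # JobValidationError.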

    def run_diags(self, partition_list, test_name):
        def size_cmp(left, right):
            return -cmp(left.size, right.size)

        def _find_covering(partition):
            kids = [self._partitions[c_name] for c_name in partition.children]
            kids.sort(size_cmp)
            n = len(kids)
            part_node_cards = sets.Set(partition.node_cards)
            # generate the power set, but try to use the big partitions first (hence the sort above)
            for i in xrange(1, 2**n + 1):
                test_cover = [kids[j] for j in range(n) if i & 2**j]

                test_node_cards = sets.Set()
                for t in test_cover:
                    test_node_cards.update(t.node_cards)

                if test_node_cards == part_node_cards:
                    return test_cover

            return []

        def _run_diags(partition):
            covering = _find_covering(partition)
            for child in covering:
                self.pending_diags[child] = test_name
            return [child.name for child in covering]

        results = []
        for partition_name in partition_list:
            p = self._partitions[partition_name]
            results.append(_run_diags(p))

        return results

    run_diags = exposed(run_diags)
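    # _find_covering enumerates subsets of a partition's children (largest
    # first) until one exactly tiles the parent's node cards; e.g. a
    # hypothetical 512-node partition with two disjoint 256-node children
    # yields those two children as the diagnostic cover.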

    def launch_diags(self, partition, test_name):
        '''override this method in derived classes!'''
        pass

    def finish_diags(self, partition, test_name, exit_value):
        '''call this method somewhere in your derived class where you deal with the exit values of diags'''
        if exit_value == 0:
            for dead in self.failed_diags[:]:
                if dead == partition.name or dead in partition.children:
                    self.failed_diags.remove(dead)
                    self.logger.info("removing %s from failed_diags list" %
                                     dead)
        else:
            if partition.children:
                self.run_diags([partition.name], test_name)
            else:
                self.failed_diags.append(partition.name)
                self.logger.info("adding %s to failed_diags list" %
                                 partition.name)

    def handle_pending_diags(self):
        for p in self.pending_diags.keys():
            if p.state in [
                    "idle", "blocked by pending diags", "failed diags",
                    "blocked by failed diags"
            ]:
                self.logger.info("launching diagnostics on %s" % p.name)
                self.launch_diags(p, self.pending_diags[p])
                del self.pending_diags[p]

    handle_pending_diags = automatic(handle_pending_diags)

    def fail_partitions(self, specs):
        parts = self.get_partitions(specs)
        if not parts:
            ret = "no matching partitions found\n"
        else:
            ret = ""
        for p in parts:
            if self.failed_diags.count(p.name) == 0:
                ret += "failing %s\n" % p.name
                self.failed_diags.append(p.name)
            else:
                ret += "%s is already marked as failing\n" % p.name

        return ret

    fail_partitions = exposed(fail_partitions)

    def unfail_partitions(self, specs):
        parts = self.get_partitions(specs)
        if not parts:
            ret = "no matching partitions found\n"
        else:
            ret = ""
        for p in self.get_partitions(specs):
            if self.failed_diags.count(p.name):
                ret += "unfailing %s\n" % p.name
                self.failed_diags.remove(p.name)
            else:
                ret += "%s is not currently failing\n" % p.name

        return ret

    unfail_partitions = exposed(unfail_partitions)

    def _find_job_location(self, args):
        nodes = int(args['nodes'])
        jobid = args['jobid']

        available_nodes = self._get_available_nodes(args)

        if nodes <= len(available_nodes):
            return {jobid: [available_nodes.pop() for i in range(nodes)]}
        else:
            return None

    def _get_available_nodes(self, args):
        queue = args['queue']
        forbidden = args.get("forbidden", [])
        required = args.get("required", [])

        if required:
            available_nodes = sets.Set(required)
        else:
            available_nodes = self.queue_assignments[queue].difference(
                forbidden)

        available_nodes = available_nodes.difference(self.running_nodes)
        available_nodes = available_nodes.difference(self.down_nodes)

        return available_nodes

    def _backfill_cmp(self, left, right):
        return cmp(left[1], right[1])

    # the argument "required" is used to pass in the set of locations allowed by a reservation;
    def find_job_location(self, arg_list, end_times):
        best_location_dict = {}
        winner = arg_list[0]

        # first time through, try for starting jobs based on utility scores
        for args in arg_list:
            location_data = self._find_job_location(args)
            if location_data:
                best_location_dict.update(location_data)
                break

        # the next time through, try to backfill, but only if we couldn't find anything to start
        if not best_location_dict:
            job_end_times = {}
            total = 0
            for item in sorted(end_times, cmp=self._backfill_cmp):
                total += len(item[0])
                job_end_times[total] = item[1]

            needed = winner['nodes'] - len(self._get_available_nodes(winner))
            now = time.time()
            backfill_cutoff = 0
            for num in sorted(job_end_times):
                if needed <= num:
                    # the earliest end time that frees enough nodes sets the window
                    backfill_cutoff = job_end_times[num] - now
                    break

            for args in arg_list:
                if 60 * float(args['walltime']) > backfill_cutoff:
                    continue

                location_data = self._find_job_location(args)
                if location_data:
                    best_location_dict.update(location_data)
                    self.logger.info("backfilling job %s" % args['jobid'])
                    break

        # reserve the stuff in the best_partition_dict, as those partitions are allegedly going to
        # be running jobs very soon
        for location_list in best_location_dict.itervalues():
            self.running_nodes.update(location_list)

        return best_location_dict

    find_job_location = exposed(find_job_location)
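    # Backfill sketch with hypothetical numbers: needing 4 more nodes and
    # end_times of [(['n1', 'n2'], now + 600), (['n3', 'n4'], now + 900)]
    # gives job_end_times {2: now + 600, 4: now + 900}, so the cutoff is
    # ~900s and only candidates asking for walltime <= 15 minutes are
    # considered for backfilling.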

    def _walltimecmp(self, dict1, dict2):
        return -cmp(float(dict1['walltime']), float(dict2['walltime']))

    def find_queue_equivalence_classes(self, reservation_dict,
                                       active_queue_names):
        equiv = []
        for q in self.queue_assignments:
            # skip queues that aren't "running"
            if not q in active_queue_names:
                continue

            found_a_match = False
            for e in equiv:
                if e['data'].intersection(self.queue_assignments[q]):
                    e['queues'].add(q)
                    e['data'].update(self.queue_assignments[q])
                    found_a_match = True
                    break
            if not found_a_match:
                equiv.append({
                    'queues': set([q]),
                    'data': set(self.queue_assignments[q]),
                    'reservations': set()
                })

        real_equiv = []
        for eq_class in equiv:
            found_a_match = False
            for e in real_equiv:
                if e['queues'].intersection(eq_class['queues']):
                    e['queues'].update(eq_class['queues'])
                    e['data'].update(eq_class['data'])
                    found_a_match = True
                    break
            if not found_a_match:
                real_equiv.append(eq_class)

        equiv = real_equiv

        for eq_class in equiv:
            for res_name in reservation_dict:
                for host_name in reservation_dict[res_name].split(":"):
                    if host_name in eq_class['data']:
                        eq_class['reservations'].add(res_name)

            for key in eq_class:
                eq_class[key] = list(eq_class[key])
            del eq_class['data']

        return equiv

    find_queue_equivalence_classes = exposed(find_queue_equivalence_classes)
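    # e.g. with hypothetical queues: if 'default' and 'debug' are assigned
    # overlapping node sets they merge into one class
    # {'queues': ['default', 'debug'], 'reservations': [...]}, a queue on
    # disjoint nodes forms its own class, and the internal 'data' node sets
    # are stripped before the classes are returned.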

    def reserve_resources_until(self, location, time, jobid):
        if time is None:
            for host in location:
                self.running_nodes.discard(host)
                self.logger.info("hasty job kill: freeing %s" % host)
        else:
            self.logger.error("failed to reserve location '%r' until '%s'" %
                              (location, time))

    reserve_resources_until = exposed(reserve_resources_until)

    def nodes_up(self, node_list, user_name=None):
        changed = []
        for n in node_list:
            if n in self.down_nodes:
                self.down_nodes.remove(n)
                changed.append(n)
            if n in self.running_nodes:
                self.running_nodes.remove(n)
                changed.append(n)
        if changed:
            self.logger.info("%s marking nodes up: %s", user_name,
                             ", ".join(changed))
        return changed

    nodes_up = exposed(nodes_up)

    def nodes_down(self, node_list, user_name=None):
        changed = []
        for n in node_list:
            if n in self.all_nodes:
                self.down_nodes.add(n)
                changed.append(n)
        if changed:
            self.logger.info("%s marking nodes down: %s", user_name,
                             ", ".join(changed))
        return changed

    nodes_down = exposed(nodes_down)

    def get_node_status(self):
        def my_cmp(left, right):
            return cmp(left[2], right[2])

        status_list = []
        for n in self.all_nodes:
            if n in self.running_nodes:
                status = "allocated"
            elif n in self.down_nodes:
                status = "down"
            else:
                status = "idle"

            status_list.append((n, status, self.node_order[n]))
        status_list.sort(my_cmp)
        return status_list

    get_node_status = exposed(get_node_status)

    def get_queue_assignments(self):
        ret = {}
        for q in self.queue_assignments:
            ret[q] = list(self.queue_assignments[q])
        return ret

    get_queue_assignments = exposed(get_queue_assignments)

    def set_queue_assignments(self, queue_names, node_list, user_name=None):
        checked_nodes = sets.Set()
        for n in node_list:
            if n in self.all_nodes:
                checked_nodes.add(n)

        queue_list = queue_names.split(":")
        for q in queue_list:
            if q not in self.queue_assignments:
                self.queue_assignments[q] = sets.Set()

        for q in self.queue_assignments.keys():
            if q not in queue_list:
                self.queue_assignments[q].difference_update(checked_nodes)
                if len(self.queue_assignments[q]) == 0:
                    del self.queue_assignments[q]
            else:
                self.queue_assignments[q].update(checked_nodes)
        self.logger.info("%s assigning queues %s to nodes %s", user_name,
                         queue_names, " ".join(checked_nodes))
        return list(checked_nodes)

    set_queue_assignments = exposed(set_queue_assignments)

    def verify_locations(self, location_list):
        """Provide a system-agnostic interface for validating a 'location string'."""
        ret = []
        for l in location_list:
            if l in self.all_nodes:
                ret.append(l)
        return ret

    verify_locations = exposed(verify_locations)

    def configure(self, filename):
        f = open(filename)

        counter = 0
        for line in f:
            name = line.strip()
            self.all_nodes.add(name)
            self.node_order[name] = counter
            counter += 1

        f.close()

    # this gets called by bgsched in order to figure out if there are partition overlaps;
    # it was written to provide the data that bgsched asks for and raises an exception
    # if you try to ask for more
    def get_partitions(self, specs):
        partitions = []
        for spec in specs:
            item = {}
            for n in self.all_nodes:
                if "name" in spec:
                    if spec["name"] == '*':
                        item.update({"name": n})
                    elif spec["name"] == n:
                        item.update({"name": n})

            if "name" in spec:
                spec.pop("name")
            if "children" in spec:
                item.update({"children": []})
                spec.pop("children")
            if "parents" in spec:
                item.update({"parents": []})
                spec.pop("parents")
            if spec:
                raise NotSupportedError("clusters lack information on: %s" %
                                        ", ".join(spec.keys()))
            if item:
                partitions.append(item)

        return partitions

    get_partitions = exposed(get_partitions)
Example No. 30
class BGBaseSystem(Component):
    """base system class.
    
    Methods:
    add_partitions -- tell the system to manage partitions (exposed, query)
    get_partitions -- retrieve partitions in the simulator (exposed, query)
    del_partitions -- tell the system not to manage partitions (exposed, query)
    set_partitions -- change random attributes of partitions (exposed, query)
    update_relatives -- should be called when partitions are added and removed from the managed list
    """
    def __init__(self, *args, **kwargs):
        Component.__init__(self, *args, **kwargs)
        self._partitions = PartitionDict()
        self._managed_partitions = set()
        self.process_groups = BGProcessGroupDict()
        self.node_card_cache = dict()
        self._partitions_lock = thread.allocate_lock()
        self.pending_diags = dict()
        self.failed_diags = list()
        self.bridge_in_error = False
        self.cached_partitions = None
        self.offline_partitions = []

    def _get_partitions(self):
        return PartitionDict([(partition.name, partition)
                              for partition in self._partitions.itervalues()
                              if partition.name in self._managed_partitions])

    partitions = property(_get_partitions)

    def add_partitions(self, specs, user_name=None):
        self.logger.info("%s called add_partitions(%r)", user_name, specs)
        specs = [{'name': spec.get("name")} for spec in specs]

        self._partitions_lock.acquire()
        try:
            partitions = [
                partition for partition in self._partitions.q_get(specs)
                if partition.name not in self._managed_partitions
            ]
        except:
            partitions = []
            self.logger.error("error in add_partitions", exc_info=True)
        finally:
            self._partitions_lock.release()

        self._managed_partitions.update(
            [partition.name for partition in partitions])
        self.update_relatives()
        return partitions

    add_partitions = exposed(query(add_partitions))

    def get_partitions(self, specs):
        """Query partitions on simulator."""
        self._partitions_lock.acquire()
        try:
            partitions = self.partitions.q_get(specs)
        except:
            partitions = []
            self.logger.error("error in get_partitions", exc_info=True)
        finally:
            self._partitions_lock.release()

        return partitions

    get_partitions = exposed(query(get_partitions))

    def verify_locations(self, location_list):
        """Provide a system-agnostic interface for validating a 'location string'."""
        parts = self.get_partitions([{'name': l} for l in location_list])
        return [p.name for p in parts]

    verify_locations = exposed(verify_locations)

    def del_partitions(self, specs, user_name=None):
        """Remove partitions from the list of managed partitions"""
        self.logger.info("%s called del_partitions(%r)", user_name, specs)

        self._partitions_lock.acquire()
        try:
            partitions = [
                partition for partition in self._partitions.q_get(specs)
                if partition.name in self._managed_partitions
            ]
        except:
            partitions = []
            self.logger.error("error in del_partitions", exc_info=True)
        finally:
            self._partitions_lock.release()

        self._managed_partitions -= set(
            [partition.name for partition in partitions])
        self.update_relatives()
        return partitions

    del_partitions = exposed(query(del_partitions))

    def set_partitions(self, specs, updates, user_name=None):
        """Update random attributes on matching partitions"""
        def _set_partitions(part, newattr):
            self.logger.info("%s updating partition %s: %r", user_name,
                             part.name, newattr)
            part.update(newattr)

        self._partitions_lock.acquire()
        try:
            partitions = self._partitions.q_get(specs, _set_partitions,
                                                updates)
        except:
            partitions = []
            self.logger.error("error in set_partitions", exc_info=True)
        finally:
            self._partitions_lock.release()
        return partitions

    set_partitions = exposed(query(set_partitions))

    def update_relatives(self):
        """Call this method after changing the contents of self._managed_partitions"""
        for p_name in self._managed_partitions:
            self._partitions[p_name]._parents = set()
            self._partitions[p_name]._children = set()

        for p in self._partitions.itervalues():
            p._all_children = set()

        for p_name in self._managed_partitions:
            p = self._partitions[p_name]

            #Check the wiring dependencies of our children.
            #Touching those would be bad. --PMR

            #            new_parents = []
            #            for par in p._parents:
            #                for dep_name in par._wiring_conflicts:
            #                    if dep_name in self._managed_partitions:
            #                        new_parents.append(self._partitions[dep_name])
            #            p._parents.union(set(new_parents))
            #
            #            for child in p._children:
            #                for dep_name in child._wiring_conflicts:
            #                    if dep_name in self._managed_partitions:
            #                        p._parents.add(self._partitions[dep_name])

            # toss the wiring dependencies in with the parents
            for dep_name in p._wiring_conflicts:
                if dep_name in self._managed_partitions:
                    p._parents.add(self._partitions[dep_name])

            for other in self._partitions.itervalues():
                if p.name == other.name:
                    continue

                p_set = set(p.node_cards)
                other_set = set(other.node_cards)

                if other.name in self._managed_partitions:
                    # if p is a subset of other, then p is a child; add other to p's list of managed parent partitions, and p to
                    # other's list of managed child partitions
                    if p_set.intersection(other_set) == p_set:
                        p._parents.add(other)
                        other._children.add(p)
                    # if p contains other, then p is a parent; add other to p's list of managed child partitions and p to other's
                    # list of managed parent partitions
                    elif p_set.union(other_set) == p_set:
                        p._children.add(other)
                        other._parents.add(p)

                # if p contains other, then p is a parent; add other to p's list of all child partitions
                if p_set.union(other_set) == p_set:
                    p._all_children.add(other)

        #Let's get the wiring conflicts for direct children as well,
        #we shouldn't be able to run on these either. --PMR
        for p_name in self._managed_partitions:

            #if p_name != "ANL-R10-R47-32768":
            #   continue

            p = self._partitions[p_name]
            for child in p._children:
                #print "Child %s:" % child.name
                for dep_name in child._wiring_conflicts:
                    #print "Conflict: %s" % dep_name
                    if dep_name in self._managed_partitions:
                        p._parents.add(self._partitions[dep_name])
                #we shouldn't be scheduling on the parents of our children either
                for par in child._parents:
                    #print "Parent: %s" % par.name
                    if ((par.name != p_name)
                            and (par.name in self._managed_partitions)):
                        p._parents.add(self._partitions[par.name])

        #for p_name in self._managed_partitions:

        #if p_name != "ANL-R10-R47-32768":
        #   continue
        #   print str(p_name) + ":"
        #   print "Parents: " + str(":".join([par.name for par in self._partitions[p_name]._parents]))
        #   print "Children:" + str(":".join([child.name for child in self._partitions[p_name]._children]))
        #   print "Conflicts:" +  str(":".join([con for con in self._partitions[p_name]._wiring_conflicts]))

    def validate_job(self, spec):
        """validate a job for submission

        Arguments:
        spec -- job specification dictionary
        """
        # spec has {nodes, walltime*, procs, mode, kernel}

        max_nodes = max([int(p.size) for p in self._partitions.values()])
        try:
            sys_type = CP.get('bgsystem', 'bgtype')
        except:
            sys_type = 'bgl'
        if sys_type == 'bgp':
            job_types = ['smp', 'dual', 'vn', 'script']
        else:
            job_types = ['co', 'vn', 'script']
        try:
            spec['nodecount'] = int(spec['nodecount'])
        except:
            raise JobValidationError("Non-integer node count")
        if not 0 < spec['nodecount'] <= max_nodes:
            raise JobValidationError("Node count out of realistic range")
        if float(spec['time']) < 5:
            raise JobValidationError("Walltime less than minimum")
        if not spec['mode']:
            if sys_type == 'bgp':
                spec['mode'] = 'smp'
            else:
                spec['mode'] = 'co'
        if spec['mode'] not in job_types:
            raise JobValidationError("Invalid mode")
        if spec['attrs'].has_key("location"):
            p_name = spec['attrs']['location']
            if not self.partitions.has_key(p_name):
                raise JobValidationError("Partition %s not found" % p_name)
        if not spec['proccount']:
            if spec.get('mode', 'co') == 'vn':
                if sys_type == 'bgl':
                    spec['proccount'] = str(2 * int(spec['nodecount']))
                elif sys_type == 'bgp':
                    spec['proccount'] = str(4 * int(spec['nodecount']))
                else:
                    self.logger.error("Unknown bgtype %s" % (sys_type))
            elif spec.get('mode', 'co') == 'dual':
                spec['proccount'] = 2 * int(spec['nodecount'])
            else:
                spec['proccount'] = spec['nodecount']
        else:
            try:
                spec['proccount'] = int(spec['proccount'])
            except:
                raise JobValidationError("non-integer proccount")
            if spec['proccount'] < 1:
                raise JobValidationError("negative proccount")
            if spec['proccount'] > spec['nodecount']:
                if spec['mode'] not in ['vn', 'dual']:
                    raise JobValidationError("proccount too large")
                if sys_type == 'bgl' and (spec['proccount'] >
                                          (2 * spec['nodecount'])):
                    raise JobValidationError("proccount too large")
                elif sys_type == 'bgp' and (spec['proccount'] >
                                             (4 * spec['nodecount'])):
                    raise JobValidationError("proccount too large")
        # need to handle kernel
        return spec

    validate_job = exposed(validate_job)
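
    # Example (hypothetical spec): on a bgp system, {'nodecount': '64',
    # 'time': '30', 'mode': '', 'proccount': '', 'attrs': {}} validates with
    # mode defaulted to 'smp' and proccount defaulted to the node count; in
    # 'vn' mode the default becomes 4 * 64 = 256 on bgp (2 * 64 on bgl).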

    def run_diags(self, partition_list, test_name, user_name=None):
        self.logger.info("%s running diags %s on partitions %s", user_name,
                         test_name, partition_list)

        def size_cmp(left, right):
            return -cmp(left.size, right.size)

        def _find_covering(partition):
            kids = [self._partitions[c_name] for c_name in partition.children]
            kids.sort(size_cmp)
            n = len(kids)
            part_node_cards = set(partition.node_cards)
            # generate the power set, but try to use the big partitions first (hence the sort above)
            for i in xrange(1, 2**n + 1):
                test_cover = [kids[j] for j in range(n) if i & 2**j]

                test_node_cards = set()
                for t in test_cover:
                    test_node_cards.update(t.node_cards)

                if test_node_cards.issubset(
                        part_node_cards) and test_node_cards.issuperset(
                            part_node_cards):
                    return test_cover

            return []

        def _run_diags(partition):
            covering = _find_covering(partition)
            for child in covering:
                self.pending_diags[child] = test_name
            return [child.name for child in covering]

        results = []
        for partition_name in partition_list:
            p = self._partitions[partition_name]
            results.append(_run_diags(p))

        return results

    run_diags = exposed(run_diags)

    def launch_diags(self, partition, test_name):
        '''override this method in derived classes!'''
        pass

    def finish_diags(self, partition, test_name, exit_value):
        '''call this method somewhere in your derived class where you deal with the exit values of diags'''
        if exit_value == 0:
            for dead in self.failed_diags[:]:
                if dead == partition.name or dead in partition.children:
                    self.failed_diags.remove(dead)
                    self.logger.info("removing %s from failed_diags list" %
                                     dead)
        else:
            if partition.children:
                self.run_diags([partition.name], test_name)
            else:
                self.failed_diags.append(partition.name)
                self.logger.info("adding %s to failed_diags list" %
                                 partition.name)

    def handle_pending_diags(self):
        for p in self.pending_diags.keys():
            if p.state in [
                    "idle", "blocked by pending diags", "failed diags",
                    "blocked by failed diags"
            ]:
                self.logger.info("launching diagnostics on %s" % p.name)
                self.launch_diags(p, self.pending_diags[p])
                del self.pending_diags[p]

    handle_pending_diags = automatic(handle_pending_diags)

    def fail_partitions(self, specs, user_name=None):
        self.logger.info("%s failing partition %s", user_name, specs)
        parts = self.get_partitions(specs)
        if not parts:
            ret = "no matching partitions found\n"
        else:
            ret = ""
        for p in parts:
            if self.failed_diags.count(p.name) == 0:
                ret += "failing %s\n" % p.name
                self.failed_diags.append(p.name)
            else:
                ret += "%s is already marked as failing\n" % p.name

        return ret

    fail_partitions = exposed(fail_partitions)

    def unfail_partitions(self, specs, user_name=None):
        self.logger.info("%s unfailing partition %s", user_name, specs)
        parts = self.get_partitions(specs)
        if not parts:
            ret = "no matching partitions found\n"
        else:
            ret = ""
        for p in parts:
            if self.failed_diags.count(p.name):
                ret += "unfailing %s\n" % p.name
                self.failed_diags.remove(p.name)
            else:
                ret += "%s is not currently failing\n" % p.name

        return ret

    unfail_partitions = exposed(unfail_partitions)

    def _find_job_location(self,
                           args,
                           drain_partitions=set(),
                           backfilling=False):
        jobid = args['jobid']
        nodes = args['nodes']
        queue = args['queue']
        utility_score = args['utility_score']
        walltime = args['walltime']
        walltime_p = args.get('walltime_p', walltime)  #*AdjEst*
        forbidden = args.get("forbidden", [])
        required = args.get("required", [])

        if walltime_prediction_enabled:  # *Adj_Est*
            runtime_estimate = float(walltime_p)
        else:
            runtime_estimate = float(walltime)

        best_score = sys.maxint
        best_partition = None

        available_partitions = set()

        requested_location = None
        if args['attrs'].has_key("location"):
            requested_location = args['attrs']['location']

        if required:
            # whittle down the list of required partitions to the ones of the proper size
            # this is a lot like the stuff in _build_locations_cache, but unfortunately,
            # reservation queues aren't assigned like real queues, so that code doesn't find
            # these
            for p_name in required:
                available_partitions.add(self.cached_partitions[p_name])
                available_partitions.update(
                    self.cached_partitions[p_name]._children)

            possible = set()
            for p in available_partitions:
                possible.add(p.size)

            desired_size = 0
            job_nodes = int(nodes)
            for psize in sorted(possible):
                if psize >= job_nodes:
                    desired_size = psize
                    break

            for p in available_partitions.copy():
                if p.size != desired_size:
                    available_partitions.remove(p)
                elif p.name in self._not_functional_set:
                    available_partitions.remove(p)
                elif requested_location and p.name != requested_location:
                    available_partitions.remove(p)
        else:
            for p in self.possible_locations(nodes, queue):
                skip = False
                for bad_name in forbidden:
                    if p.name == bad_name or bad_name in p.children or bad_name in p.parents:
                        skip = True
                        break
                if not skip:
                    if (not requested_location) or (p.name
                                                    == requested_location):
                        available_partitions.add(p)

        available_partitions -= drain_partitions
        now = time.time()

        for partition in available_partitions:
            # if the job needs more time than the partition currently has available, look elsewhere
            if backfilling:

                if partition.reserved_by:
                    #if the partition is reserved, we don't use predicted walltime to backfill
                    runtime_estimate = float(walltime)

                if 60 * runtime_estimate > (partition.backfill_time -
                                            now):  # *Adj_Est*
                    continue

                if 60 * float(walltime) > (partition.backfill_time - now):
                    continue

            if partition.state == "idle":
                # let's check the impact on partitions that would become blocked
                score = 0
                for p in partition.parents:
                    if self.cached_partitions[
                            p].state == "idle" and self.cached_partitions[
                                p].scheduled:
                        score += 1

                # the lower the score, the fewer new partitions will be blocked by this selection
                if score < best_score:
                    best_score = score
                    best_partition = partition

        if best_partition:
            return {jobid: [best_partition.name]}

    def _find_drain_partition(self, job):
        # if the user requested a particular partition, we only try to drain that one
        if job['attrs'].has_key("location"):
            target_name = job['attrs']['location']
            return self.cached_partitions.get(target_name, None)

        drain_partition = None
        locations = self.possible_locations(job['nodes'], job['queue'])

        for p in locations:
            if not drain_partition:
                drain_partition = p
            else:
                if p.backfill_time < drain_partition.backfill_time:
                    drain_partition = p

        if drain_partition:
            # don't try to drain for an entire weekend
            hours = (drain_partition.backfill_time - time.time()) / 3600.0
            if hours > max_drain_hours:
                drain_partition = None

        return drain_partition
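
    # Example (illustrative): with candidate partitions freeing up in 2, 10,
    # and 80 hours, the 2-hour one is chosen for draining; if even the
    # earliest exceeded max_drain_hours, nothing would be drained for this job.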

    def possible_locations(self, job_nodes, q_name):
        desired_size = 0
        job_nodes = int(job_nodes)
        if self._defined_sizes.has_key(q_name):
            for psize in self._defined_sizes[q_name]:
                if psize >= job_nodes:
                    desired_size = psize
                    break

        if self._locations_cache.has_key(q_name):
            return self._locations_cache[q_name].get(desired_size, [])
        else:
            return []

    # this function builds three things, namely a pair of dictionaries keyed by queue names, and a set of
    # partition names which are not functional
    #
    # self._defined_sizes maps queue names to an ordered list of partition sizes available in that queue
    #     for all schedulable partitions (even if currently offline and not functional)
    # self._locations_cache maps queue names to dictionaries which map partition sizes to partition objects;
    #     this structure will only contain partitions which are fully online, so we don't try to drain a
    #     broken partition
    # self._not_functional_set contains names of partitions which are not functional (either themselves, or
    #     a parent or child)
    def _build_locations_cache(self):
        per_queue = {}
        defined_sizes = {}
        not_functional_set = set()
        for target_partition in self.cached_partitions.itervalues():
            usable = True
            if target_partition.name in self.offline_partitions:
                usable = False
            else:
                for part in self.cached_partitions.itervalues():
                    if not part.functional:
                        not_functional_set.add(part.name)
                        if target_partition.name in part.children or target_partition.name in part.parents:
                            usable = False
                            not_functional_set.add(target_partition.name)
                            break

            for queue_name in target_partition.queue.split(":"):
                if not per_queue.has_key(queue_name):
                    per_queue[queue_name] = {}
                if not defined_sizes.has_key(queue_name):
                    defined_sizes[queue_name] = set()
                if target_partition.scheduled:
                    defined_sizes[queue_name].add(target_partition.size)
                if target_partition.scheduled and target_partition.functional and usable:
                    if not per_queue[queue_name].has_key(
                            target_partition.size):
                        per_queue[queue_name][target_partition.size] = []
                    per_queue[queue_name][target_partition.size].append(
                        target_partition)

        for q_name in defined_sizes:
            defined_sizes[q_name] = sorted(defined_sizes[q_name])

        self._defined_sizes = defined_sizes
        self._locations_cache = per_queue
        self._not_functional_set = not_functional_set
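
    # Illustrative shapes of the three structures (hypothetical names/sizes):
    #   self._defined_sizes      -> {'default': [512, 1024, 2048]}
    #   self._locations_cache    -> {'default': {512: [<Partition R00-512>, ...]}}
    #   self._not_functional_set -> set(['R23-1024'])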

    def find_job_location(self, arg_list, end_times):
        best_partition_dict = {}

        if self.bridge_in_error:
            return {}

        self._partitions_lock.acquire()
        try:
            self.cached_partitions = copy.deepcopy(self.partitions)
        except:
            self.logger.error("error in copy.deepcopy", exc_info=True)
            return {}
        finally:
            self._partitions_lock.release()

        # build the cached_partitions structure first
        self._build_locations_cache()

        # first, figure out backfilling cutoffs per partition (which we'll also use for picking which partition to drain)
        job_end_times = {}
        for item in end_times:
            job_end_times[item[0][0]] = item[1]

        now = time.time()
        for p in self.cached_partitions.itervalues():
            if p.state == "idle":
                p.backfill_time = now
            else:
                p.backfill_time = now + 5 * 60
            p.draining = False

        for p in self.cached_partitions.itervalues():
            if p.name in job_end_times:
                if job_end_times[p.name] > p.backfill_time:
                    p.backfill_time = job_end_times[p.name]

                for parent_name in p.parents:
                    parent_partition = self.cached_partitions[parent_name]
                    if p.backfill_time > parent_partition.backfill_time:
                        parent_partition.backfill_time = p.backfill_time

        for p in self.cached_partitions.itervalues():
            if p.backfill_time == now:
                continue

            for child_name in p.children:
                child_partition = self.cached_partitions[child_name]
                if child_partition.backfill_time == now or child_partition.backfill_time > p.backfill_time:
                    child_partition.backfill_time = p.backfill_time

        # first time through, try for starting jobs based on utility scores
        drain_partitions = set()

        for job in arg_list:
            partition_name = self._find_job_location(job, drain_partitions)
            if partition_name:
                best_partition_dict.update(partition_name)
                break

            location = self._find_drain_partition(job)
            if location is not None:
                for p_name in location.parents:
                    drain_partitions.add(self.cached_partitions[p_name])
                for p_name in location.children:
                    drain_partitions.add(self.cached_partitions[p_name])
                    self.cached_partitions[p_name].draining = True
                drain_partitions.add(location)
                #self.logger.info("job %s is draining %s" % (winning_job['jobid'], location.name))
                location.draining = True

        # the next time through, try to backfill, but only if we couldn't find anything to start
        if not best_partition_dict:

            # arg_list.sort(self._walltimecmp)

            for args in arg_list:
                partition_name = self._find_job_location(args,
                                                         backfilling=True)
                if partition_name:
                    self.logger.info("backfilling job %s" % args['jobid'])
                    best_partition_dict.update(partition_name)
                    break

        # reserve the stuff in the best_partition_dict, as those partitions are allegedly going to
        # be running jobs very soon
        #
        # also, this is the only part of finding a job location where we need to lock anything
        self._partitions_lock.acquire()
        try:
            for p in self.partitions.itervalues():
                # push the backfilling info from the local cache back to the real objects
                p.draining = self.cached_partitions[p.name].draining
                p.backfill_time = self.cached_partitions[p.name].backfill_time

            for jobid, partition_list in best_partition_dict.iteritems():
                part = self.partitions[partition_list[0]]
                # FIXME: use reserve_resources_until() here? --brt
                part.used_by = int(jobid)
                part.reserved_until = time.time() + 5 * 60
                part.state = "allocated"
                for p in part._parents:
                    if p.state == "idle":
                        p.state = "blocked (%s)" % (part.name, )
                for p in part._children:
                    if p.state == "idle":
                        p.state = "blocked (%s)" % (part.name, )
        except:
            self.logger.error("error in find_job_location", exc_info=True)
        self._partitions_lock.release()

        return best_partition_dict

    find_job_location = locking(exposed(find_job_location))

    def _walltimecmp(self, dict1, dict2):
        return -cmp(float(dict1['walltime']), float(dict2['walltime']))

    def find_queue_equivalence_classes(self, reservation_dict,
                                       active_queue_names):
        equiv = []
        for part in self.partitions.itervalues():
            if part.functional and part.scheduled:
                part_active_queues = []
                for q in part.queue.split(":"):
                    if q in active_queue_names:
                        part_active_queues.append(q)

                # go on to the next partition if there are no running
                # queues using this partition
                if not part_active_queues:
                    continue

                found_a_match = False
                for e in equiv:
                    if e['data'].intersection(part.node_card_names):
                        e['queues'].update(part_active_queues)
                        e['data'].update(part.node_card_names)
                        found_a_match = True
                        break
                if not found_a_match:
                    equiv.append({
                        'queues': set(part_active_queues),
                        'data': set(part.node_card_names),
                        'reservations': set()
                    })

        real_equiv = []
        for eq_class in equiv:
            found_a_match = False
            for e in real_equiv:
                if e['queues'].intersection(eq_class['queues']):
                    e['queues'].update(eq_class['queues'])
                    e['data'].update(eq_class['data'])
                    found_a_match = True
                    break
            if not found_a_match:
                real_equiv.append(eq_class)

        equiv = real_equiv

        for eq_class in equiv:
            for res_name in reservation_dict:
                skip = True
                for p_name in reservation_dict[res_name].split(":"):
                    p = self.partitions[p_name]
                    if eq_class['data'].intersection(p.node_card_names):
                        eq_class['reservations'].add(res_name)
                    for dep_name in p._wiring_conflicts:
                        if self.partitions.has_key(dep_name):
                            if eq_class['data'].intersection(
                                    self.partitions[dep_name].node_card_names):
                                eq_class['reservations'].add(res_name)
                                break

            for key in eq_class:
                eq_class[key] = list(eq_class[key])
            del eq_class['data']

        return equiv

    find_queue_equivalence_classes = exposed(find_queue_equivalence_classes)
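
    # Worked example (hypothetical queues): if partitions serving queue 'prod'
    # share node cards with partitions serving queue 'debug', their classes
    # merge into a single {'queues': [...], 'reservations': [...]} entry;
    # queues on disjoint hardware come back as separate equivalence classes.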

    def can_run(self, target_partition, node_count, partition_dict):
        if target_partition.state != "idle":
            return False
        desired = sys.maxint
        for part in partition_dict.itervalues():
            if not part.functional:
                if target_partition.name in part.children or target_partition.name in part.parents:
                    return False
            else:
                if part.scheduled:
                    if int(node_count) <= int(part.size) < desired:
                        desired = int(part.size)
        return target_partition.scheduled and target_partition.functional and int(
            target_partition.size) == desired

    def reserve_resources_until(self, location, new_time, jobid):
        rc = False
        partition_name = location[0]
        pg = self.process_groups.find_by_jobid(jobid)
        try:
            self._partitions_lock.acquire()
            used_by = self.partitions[partition_name].used_by
            if used_by == None:
                self.partitions[partition_name].used_by = jobid
                used_by = jobid
            if new_time:
                if used_by == jobid:
                    self.partitions[partition_name].reserved_until = new_time
                    self.partitions[partition_name].reserved_by = jobid
                    self.logger.info(
                        "job %s: partition '%s' now reserved until %s", jobid,
                        partition_name, time.asctime(time.gmtime(new_time)))
                    rc = True
                else:
                    self.logger.error(
                        "job %s wasn't allowed to update the reservation on partition %s (owner=%s)",
                        jobid, partition_name, used_by)
            else:
                if used_by == jobid:
                    self.partitions[partition_name].reserved_until = False
                    self.partitions[partition_name].reserved_by = None
                    self.logger.info(
                        "reservation on partition '%s' has been removed",
                        partition_name)
                    rc = True
                else:
                    self.logger.error(
                        "job %s wasn't allowed to clear the reservation on partition %s (owner=%s)",
                        jobid, partition_name, used_by)
        except:
            self.logger.exception(
                "an unexpected error occurred will adjusting the partition reservation time"
            )
        finally:
            self._partitions_lock.release()
        return rc

    reserve_resources_until = exposed(reserve_resources_until)
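
A minimal, standalone sketch (not Cobalt source; partition names, sizes, and times are hypothetical) of the selection heuristic in _find_job_location above: among idle partitions of the right size, prefer the one whose allocation blocks the fewest idle parents, and when backfilling, also require that the job fits inside the partition's backfill window.

import time

class Part(object):
    def __init__(self, name, size, state, parents=(), backfill_time=None):
        self.name = name
        self.size = size
        self.state = state
        self.parents = list(parents)              # names of parent partitions
        self.backfill_time = backfill_time if backfill_time else time.time()

def pick_partition(job_nodes, walltime_min, partitions, backfilling=False):
    now = time.time()
    best_score, best = None, None
    for p in partitions.values():
        if p.state != "idle" or p.size != job_nodes:
            continue
        if backfilling and 60 * walltime_min > (p.backfill_time - now):
            continue          # job would overrun the backfill window
        # the lower the score, the fewer partitions this choice blocks
        score = sum(1 for name in p.parents
                    if partitions[name].state == "idle")
        if best_score is None or score < best_score:
            best_score, best = score, p
    return best

parts = {
    'R00-512':  Part('R00-512', 512, 'idle', parents=['R00-1024']),
    'R01-512':  Part('R01-512', 512, 'idle'),
    'R00-1024': Part('R00-1024', 1024, 'idle'),
}
print(pick_partition(512, 30, parts).name)    # -> 'R01-512', blocks no parent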
Example #31
File: slp.py  Project: zzhou/Qsim
class ServiceLocator(Component):
    """Generic implementation of the service-location component.
    
    Methods:
    register -- register a service (exposed)
    unregister -- remove a service from the registry (exposed)
    locate -- retrieve the location of a service (exposed)
    get_services -- part of the query interface from DataSet (exposed)
    """

    name = "service-location"

    # A default logger for the class is placed here.
    # Assigning an instance-level logger is supported,
    # and expected in the case of multiple instances.
    logger = logging.getLogger("Cobalt.Components.ServiceLocator")

    def __init__(self, *args, **kwargs):
        """Initialize a new ServiceLocator.
        
        All arguments are passed to the component constructor.
        """
        Component.__init__(self, *args, **kwargs)
        self.services = ServiceDict()

    def register(self, service_name, location):
        """Register the availability of a service.
        
        Arguments:
        service_name -- name of the service to register
        location -- location of the service
        """
        try:
            service = self.services[service_name]
        except KeyError:
            service = Service(dict(name=service_name, location=location))
            self.services[service_name] = service
            self.logger.info("register(%r, %r)" % (service_name, location))
        else:
            service.location = location
            service.touch()

    register = exposed(register)

    def unregister(self, service_name):
        """Remove a service from the registry.
        
        Arguments:
        service_name -- name of the service to remove
        """
        try:
            del self.services[service_name]
        except KeyError:
            self.logger.info("unregister(%r) [not registered]" %
                             (service_name))
        else:
            self.logger.info("unregister(%r)" % (service_name))

    unregister = exposed(unregister)

    def locate(self, service_name):
        """Retrieve the location for a service.
        
        Arguments:
        service_name -- name of the service to look up
        """
        try:
            service = self.services[service_name]
        except KeyError:
            self.logger.debug("locate(%r) [not registered]" % (service_name))
            return ""
        return service.location

    locate = exposed(locate)

    def get_services(self, specs):
        """Query interface "Get" method."""
        return self.services.q_get(specs)

    get_services = exposed(query(get_services))
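
A hedged client-side sketch of the registry above (the service name and URL are hypothetical): components announce themselves through register() and find each other through locate(), assuming the usual Cobalt.Proxy.ComponentProxy helper seen elsewhere in this codebase.

from Cobalt.Proxy import ComponentProxy

slp = ComponentProxy("service-location")
slp.register("queue-manager", "https://localhost:51412")  # announce location
print(slp.locate("queue-manager"))     # -> 'https://localhost:51412'
print(slp.locate("no-such-service"))   # -> '' when nothing is registered
slp.unregister("queue-manager")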
Example #32
class Simulator(BGBaseSystem):
    """Generic system simulator.
    
    Methods:
    configure -- load partitions from an xml file
    reserve_partition -- lock a partition for use by a process_group (exposed)
    release_partition -- release a locked (busy) partition (exposed)
    add_process_groups -- add (start) a process group on the system (exposed, query)
    get_process_groups -- retrieve process groups (exposed, query)
    wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query)
    signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query)
    update_partition_state -- simulates updating partition state from the bridge API (automatic)
    """

    name = "system"
    implementation = "simulator"

    logger = logger

    def __init__(self, *args, **kwargs):
        BGBaseSystem.__init__(self, *args, **kwargs)
        sys.setrecursionlimit(5000)  #why this magic number?
        self.process_groups.item_cls = BGSimProcessGroup
        self.node_card_cache = dict()
        self.failed_components = set()
        self.config_file = kwargs.get(
            "config_file",
            get_config_option('bgsystem', 'system_def_file', None))
        if self.config_file is not None:
            self.logger.log(1, "init: loading machine configuration")
            self.configure(self.config_file)
            self.logger.log(1, "init: recomputing partition state")
            self._recompute_partition_state()

    def __getstate__(self):
        state = {}
        state.update(BGBaseSystem.__getstate__(self))
        state.update({
            'simulator_version': 4,
            'config_file': self.config_file,
            'failed_components': self.failed_components
        })
        return state

    def __setstate__(self, state):
        try:
            self.logger.log(1, "restart: initializing base system class")
            BGBaseSystem.__setstate__(self, state)
            self.process_groups.item_cls = BGSimProcessGroup
            self.node_card_cache = dict()
            try:
                self.failed_components = state['failed_components']
            except KeyError:
                self.failed_components = set()
            try:
                self.config_file = state['config_file']
            except KeyError:
                self.config_file = os.path.expandvars(
                    get_config_option('system', 'def_file', ""))
            if self.config_file:
                self.logger.log(1, "restart: loading machine configuration")
                self.configure(self.config_file)
                self.logger.log(1, "restart: restoring partition state")
                self._restore_partition_state(state)
                self.logger.log(1, "restart: recomputing partition state")
                self._recompute_partition_state()
        except:
            self.logger.error(
                "A fatal error occurred while restarting the system component",
                exc_info=True)
            print "A fatal error occurred while restarting the system component.  Terminating."
            sys.exit(1)

    def save_me(self):
        Component.save(self)

    save_me = automatic(
        save_me, float(get_config_option('bgsystem', 'save_me_interval', 10)))

    def _recompute_partition_state(self):
        self.offline_partitions = []

        for p in self._partitions.values():
            if p.state != 'idle':
                continue

            for part_name in self.failed_partitions:
                try:
                    part = self._partitions[part_name]
                except KeyError:
                    pass
                else:
                    if p == part:
                        p.state = "failed diags"
                        break
                    elif p in part._parents or p in part._children:
                        p.state = "blocked (%s)" % (part.name, )
                        break
            if p.state != 'idle':
                continue

            for nc in p.node_cards:
                if nc.id in self.failed_components:
                    p.state = "hardware offline: nodecard %s" % nc.id
                    self.offline_partitions.append(p.name)
                    break
                elif nc.used_by:
                    p.state = "blocked (%s)" % nc.used_by
                    break
            if p.state != 'idle':
                continue

            for s in p.switches:
                if s in self.failed_components:
                    p.state = "hardware offline: switch %s" % (s, )
                    self.offline_partitions.append(p.name)
                    break
            if p.state != 'idle':
                continue

            for w in p.wires:
                if w in self.failed_components:
                    p.state = "hardware offline: switch %s" % (w, )
                    self.offline_partitions.append(p.name)
                    break
            if p.state != 'idle':
                continue

            for dep_name in p._wiring_conflicts:
                try:
                    part = self._partitions[dep_name]
                except KeyError:
                    self.logger.warning(
                        "partition %s: wiring conflict %s does not exist in partition table",
                        p.name, dep_name)
                else:
                    if part.state == "busy" or part.used_by:
                        p.state = "blocked-wiring (%s)" % dep_name
                        break
            if p.state != 'idle':
                continue

            if p.used_by:
                p.state = "allocated"
                continue

            for part in p._parents.union(p._children):
                if part.used_by:
                    p.state = "blocked (%s)" % (part.name, )
                    break

    def configure(self, config_file):
        """
        Configure simulated partitions.

        Arguments:
        config_file -- xml configuration file
        """

        self.logger.log(1, "configure: opening machine configuration file")

        def _get_node_card(name):
            if not self.node_card_cache.has_key(name):
                self.node_card_cache[name] = NodeCard(name)

            return self.node_card_cache[name]

        try:
            system_doc = ElementTree.parse(config_file)
        except IOError:
            self.logger.error("unable to open file: %r" % config_file)
            self.logger.error("exiting...")
            sys.exit(1)
        except:
            self.logger.error("problem loading data from file: %r" %
                              config_file,
                              exc_info=True)
            self.logger.error("exiting...")
            sys.exit(1)

        system_def = system_doc.getroot()
        if system_def.tag != "BG":
            self.logger.error("unexpected root element in %r: %r" %
                              (config_file, system_def.tag))
            self.logger.error("exiting...")
            sys.exit(1)

        # that 32 is not really constant -- it needs to either be read from cobalt.conf or from the bridge API
        NODES_PER_NODECARD = 32

        # initialize a new partition dict with all partitions
        #
        partitions = PartitionDict()

        tmp_list = []

        # this is going to hold partition objects from the bridge (not our own Partition)
        self.logger.log(
            1,
            "configure: acquiring machine information and creating partition objects"
        )
        self._partitions.clear()
        for partition_def in system_def.getiterator("Partition"):
            node_list = []
            switch_list = []
            wire_list = []

            for nc in partition_def.getiterator("NodeCard"):
                node_list.append(_get_node_card(nc.get("id")))

            nc_count = len(node_list)

            for s in partition_def.getiterator("Switch"):
                switch_list.append(s.get("id"))

            for w in partition_def.getiterator("Wire"):
                wire_list.append(w.get("id"))

            self._partitions.q_add([
                dict(
                    name=partition_def.get("name"),
                    queue=partition_def.get("queue", "default"),
                    size=NODES_PER_NODECARD * nc_count,
                    node_cards=node_list,
                    switches=switch_list,
                    wires=wire_list,
                    state="idle",
                )
            ])

        # find the wiring deps
        self.logger.log(1, "configure: looking for wiring dependencies")
        for p in self._partitions.values():
            self._detect_wiring_deps(p)

        # update partition relationship lists
        self.logger.log(1, "configure: updating partition relationship lists")
        self.update_relatives()

    def update_partition_state(self):
        # first, set all of the nodecards to not busy
        for nc in self.node_card_cache.values():
            nc.used_by = ''

        self._partitions_lock.acquire()
        try:
            # first determine if the partition and associated node cards are in use
            now = time.time()
            for p in self._partitions.values():
                # since we don't have the bridge, a partition which isn't busy
                # should be set to idle and then blocked states can be derived
                if p.state != "busy":
                    p.state = "idle"

                # check whether the partition is no longer reserved or the reservation has expired
                if p.used_by:
                    if not p.reserved_until or now > p.reserved_until:
                        p.reserved_until = None
                        p.reserved_by = None
                        p.used_by = None
                        # for now, assume cleanup happens instantaneously
                        p.state = 'idle'

                p._update_node_cards()

            # then set partition states based on that usage as well as failed hardware, resource reservations, etc.
            self._recompute_partition_state()
        except:
            self.logger.error("error in update_partition_state", exc_info=True)

        self._partitions_lock.release()

    update_partition_state = automatic(update_partition_state)

    def _mark_partition_for_cleaning(self, pname, jobid):
        pass

    def _set_kernel(self, partition, kernel):
        # TODO: allow the kernel set step to work in the simulator.  For now this doesn't fly.
        pass

    def reserve_partition(self, name, size=None):
        """Reserve a partition and block all related partitions.
        
        Arguments:
        name -- name of the partition to reserve
        size -- size of the process group reserving the partition (optional)
        """

        try:
            self._partitions_lock.acquire()

            try:
                partition = self.partitions[name]
            except KeyError:
                self.logger.error(
                    "reserve_partition(%r, %r) [does not exist]" %
                    (name, size))
                return False
            if partition.state != "allocated":
                self.logger.error("reserve_partition(%r, %r) [%s]" %
                                  (name, size, partition.state))
                return False
            if not partition.functional:
                self.logger.error(
                    "reserve_partition(%r, %r) [not functional]" %
                    (name, size))
                return False
            if size is not None and size > partition.size:
                self.logger.error("reserve_partition(%r, %r) [size mismatch]" %
                                  (name, size))
                return False

            partition.state = "busy"
            # partition.reserved_until = False
        finally:
            self._partitions_lock.release()

        # explicitly call this, since the above "busy" is instantaneously available
        self.update_partition_state()

        self.logger.info("reserve_partition(%r, %r)" % (name, size))
        return True

    reserve_partition = exposed(reserve_partition)

    def release_partition(self, name):
        """Release a reserved partition.

        Arguments:
        name -- name of the partition to release
        """
        try:
            self._partitions_lock.acquire()

            try:
                partition = self.partitions[name]
            except KeyError:
                self.logger.error("release_partition(%r) [already free]" %
                                  (name))
                return False
            if not partition.state == "busy":
                self.logger.info("release_partition(%r) [not busy]" % (name))
                return False

            if partition.used_by is not None:
                partition.state = "allocated"
            else:
                partition.state = "idle"
        finally:
            self._partitions_lock.release()

        # explicitly unblock the blocked partitions
        self.update_partition_state()

        self.logger.info("release_partition(%r)" % (name))
        return True

    release_partition = exposed(release_partition)

    def add_failed_components(self, component_names):
        success = []
        for name in component_names:
            if self.node_card_cache.has_key(name):
                self.failed_components.add(name)
                success.append(name)
            else:
                for p in self._partitions.values():
                    if name in p.switches:
                        self.failed_components.add(name)
                        success.append(name)
                        break
        return success

    add_failed_component = exposed(add_failed_components)

    def del_failed_components(self, component_names):
        success = []
        for name in component_names:
            try:
                self.failed_components.remove(name)
                success.append(name)
            except KeyError:
                pass
        return success

    del_failed_components = exposed(del_failed_components)

    def list_failed_components(self, component_names):
        return list(self.failed_components)

    list_failed_components = exposed(list_failed_components)
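
A simplified, standalone sketch (hypothetical partition and node-card names) of the failed-component bookkeeping above: marking a node card as failed takes every partition containing it offline on the next state recomputation, mirroring add_failed_components plus _recompute_partition_state.

failed_components = set()
partitions = {
    'R00-512': {'state': 'idle', 'node_cards': ['R00-M0-N00', 'R00-M0-N01']},
    'R01-512': {'state': 'idle', 'node_cards': ['R01-M0-N00', 'R01-M0-N01']},
}

def add_failed_components(names):
    failed_components.update(names)

def recompute_state():
    offline = []
    for name, p in partitions.items():
        for nc in p['node_cards']:
            if nc in failed_components:
                p['state'] = "hardware offline: nodecard %s" % nc
                offline.append(name)
                break
    return offline

add_failed_components(['R00-M0-N01'])
print(recompute_state())   # -> ['R00-512']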
Example #33
class Simulator(BGBaseSystem):
    """Generic system simulator.
    
    Methods:
    configure -- load partitions from an xml file
    reserve_partition -- lock a partition for use by a process_group (exposed)
    release_partition -- release a locked (busy) partition (exposed)
    add_process_groups -- add (start) a process group on the system (exposed, query)
    get_process_groups -- retrieve process groups (exposed, query)
    wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query)
    signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query)
    update_partition_state -- simulates updating partition state from the bridge API (automatic)
    """

    name = "system"
    implementation = "simulator"

    logger = logger

    MIN_RUN_TIME = 60
    MAX_RUN_TIME = 180

    def __init__(self, *args, **kwargs):
        BGBaseSystem.__init__(self, *args, **kwargs)
        self.process_groups.item_cls = BGSimProcessGroup
        self.config_file = kwargs.get("config_file", None)
        self.failed_components = sets.Set()
        if self.config_file is not None:
            self.configure(self.config_file)

    def __getstate__(self):
        flags = {}
        for part in self._partitions.values():
            sched = None
            func = None
            queue = None
            if hasattr(part, 'scheduled'):
                sched = part.scheduled
            if hasattr(part, 'functional'):
                func = part.functional
            if hasattr(part, 'queue'):
                queue = part.queue
            flags[part.name] = (sched, func, queue)
        return {
            'managed_partitions': self._managed_partitions,
            'version': 2,
            'config_file': self.config_file,
            'partition_flags': flags
        }

    def __setstate__(self, state):
        self._managed_partitions = state['managed_partitions']
        self.config_file = state['config_file']
        self._partitions = PartitionDict()
        self.process_groups = BGProcessGroupDict()
        self.process_groups.item_cls = BGSimProcessGroup
        self.node_card_cache = dict()
        self._partitions_lock = thread.allocate_lock()
        self.failed_components = sets.Set()
        self.pending_diags = dict()
        self.failed_diags = list()
        self.bridge_in_error = False
        self.cached_partitions = None
        self.offline_partitions = []
        if self.config_file is not None:
            self.configure(self.config_file)

        if 'partition_flags' in state:
            for pname, flags in state['partition_flags'].items():
                if pname in self._partitions:
                    self._partitions[pname].scheduled = flags[0]
                    self._partitions[pname].functional = flags[1]
                    self._partitions[pname].queue = flags[2]
                else:
                    logger.info("Partition %s is no longer defined" % pname)

        self.update_relatives()
        self.lock = threading.Lock()
        self.statistics = Statistics()

    def save_me(self):
        Component.save(self)

    save_me = automatic(save_me)

    def configure(self, config_file):
        """Configure simulated partitions.
        
        Arguments:
        config_file -- xml configuration file
        """
        def _get_node_card(name):
            if not self.node_card_cache.has_key(name):
                self.node_card_cache[name] = NodeCard(name)

            return self.node_card_cache[name]

        self.logger.info("configure()")
        try:
            system_doc = ElementTree.parse(config_file)
        except IOError:
            self.logger.error("unable to open file: %r" % config_file)
            self.logger.error("exiting...")
            sys.exit(1)
        except:
            self.logger.error("problem loading data from file: %r" %
                              config_file)
            self.logger.error("exiting...")
            sys.exit(1)

        system_def = system_doc.getroot()
        if system_def.tag != "BG":
            self.logger.error("unexpected root element in %r: %r" %
                              (config_file, system_def.tag))
            self.logger.error("exiting...")
            sys.exit(1)

        # that 32 is not really constant -- it needs to either be read from cobalt.conf or from the bridge API
        NODES_PER_NODECARD = 32

        # initialize a new partition dict with all partitions
        #
        partitions = PartitionDict()

        tmp_list = []

        # this is going to hold partition objects from the bridge (not our own Partition)
        wiring_cache = {}
        bp_cache = {}

        for partition_def in system_def.getiterator("Partition"):
            if not partition_def.get("name").startswith("ANL"):
                continue

            node_list = []
            switch_list = []

            for nc in partition_def.getiterator("NodeCard"):
                node_list.append(_get_node_card(nc.get("id")))

            nc_count = len(node_list)

            # remove partitions which have less than 512 nodes
            if (NODES_PER_NODECARD * nc_count) < 512:
                continue
            if not wiring_cache.has_key(nc_count):
                wiring_cache[nc_count] = []
            wiring_cache[nc_count].append(partition_def.get("name"))

            for s in partition_def.getiterator("Switch"):
                switch_list.append(s.get("id"))

            tmp_list.append(
                dict(
                    name=partition_def.get("name"),
                    queue=partition_def.get("queue", "default"),
                    size=NODES_PER_NODECARD * nc_count,
                    node_cards=node_list,
                    switches=switch_list,
                    state="idle",
                ))

        partitions.q_add(tmp_list)

        # find the wiring deps
        for size in wiring_cache:
            for p in wiring_cache[size]:
                p = partitions[p]
                s1 = sets.Set(p.switches)
                for other in wiring_cache[size]:
                    other = partitions[other]
                    if (p.name == other.name):
                        continue

                    s2 = sets.Set(other.switches)

                    if s1.intersection(s2):
                        self.logger.info(
                            "found a wiring dep between %s and %s", p.name,
                            other.name)
                        partitions[p.name]._wiring_conflicts.add(other.name)

        # update object state
        self._partitions.clear()
        self._partitions.update(partitions)
        print "Total partitions: ", len(self._partitions)

    def reserve_partition(self, name, size=None):
        """Reserve a partition and block all related partitions.
        
        Arguments:
        name -- name of the partition to reserve
        size -- size of the process group reserving the partition (optional)
        """

        try:
            partition = self.partitions[name]
        except KeyError:
            self.logger.error("reserve_partition(%r, %r) [does not exist]" %
                              (name, size))
            return False
        if partition.state != "allocated":
            self.logger.error("reserve_partition(%r, %r) [%s]" %
                              (name, size, partition.state))
            return False
        if not partition.functional:
            self.logger.error("reserve_partition(%r, %r) [not functional]" %
                              (name, size))
            return False
        if size is not None and size > partition.size:
            self.logger.error("reserve_partition(%r, %r) [size mismatch]" %
                              (name, size))
            return False

        self._partitions_lock.acquire()
        try:
            partition.state = "busy"
            partition.reserved_until = False
        except:
            self.logger.error("error in reserve_partition", exc_info=True)
        self._partitions_lock.release()
        # explicitly call this, since the above "busy" is instantaneously available
        self.update_partition_state()

        self.logger.info("reserve_partition(%r, %r)" % (name, size))
        return True

    reserve_partition = exposed(reserve_partition)

    def release_partition(self, name):
        """Release a reserved partition.
        
        Arguments:
        name -- name of the partition to release
        """
        try:
            partition = self.partitions[name]
        except KeyError:
            self.logger.error("release_partition(%r) [already free]" % (name))
            return False
        if not partition.state == "busy":
            self.logger.info("release_partition(%r) [not busy]" % (name))
            return False

        self._partitions_lock.acquire()
        try:
            partition.state = "idle"
        except:
            self.logger.error("error in release_partition", exc_info=True)
        self._partitions_lock.release()

        # explicitly unblock the blocked partitions
        self.update_partition_state()

        self.logger.info("release_partition(%r)" % (name))
        return True

    release_partition = exposed(release_partition)

    def add_process_groups(self, specs):
        """Create a simulated process group.
        
        Arguments:
        specs -- list of dictionaries, each specifying a process group to start
        """

        self.logger.info("add_process_groups(%r)" % (specs))

        script_specs = []
        other_specs = []
        for spec in specs:
            if spec.get('mode') == "script":
                script_specs.append(spec)
            else:
                other_specs.append(spec)

        # start up script jobs
        new_pgroups = []
        if script_specs:
            try:
                for spec in script_specs:
                    script_pgroup = ComponentProxy("script-manager").add_jobs(
                        [spec])
                    new_pgroup = self.process_groups.q_add([spec])
                    new_pgroup[0].script_id = script_pgroup[0]['id']
                    self.reserve_resources_until(
                        spec['location'],
                        time.time() + 60 * float(spec['walltime']),
                        new_pgroup[0].jobid)
                    new_pgroups.append(new_pgroup[0])
            except (ComponentLookupError, xmlrpclib.Fault):
                raise ProcessGroupCreationError(
                    "system::add_process_groups failed to communicate with script-manager"
                )

        process_groups = self.process_groups.q_add(other_specs)
        for process_group in process_groups:
            self.start(process_group)

        return new_pgroups + process_groups

    add_process_groups = exposed(query(all_fields=True)(add_process_groups))
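
    # A sketch of the specs this method consumes (field values are
    # illustrative): script-mode specs are routed through the script-manager
    # component and their partitions reserved for the job's walltime, while
    # all other specs are started directly via start()/_mpirun.
    #
    #   specs = [{'mode': 'script', 'location': ['ANL-R00-M0-512'],
    #             'walltime': '30', 'jobid': 100, 'user': 'someuser'}]
    #   pgroups = simulator.add_process_groups(specs)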

    def get_process_groups(self, specs):
        """Query process_groups from the simulator."""
        return self.process_groups.q_get(specs)

    get_process_groups = exposed(query(get_process_groups))

    def wait_process_groups(self, specs):
        """get process groups that have finished running."""
        self.logger.info("wait_process_groups(%r)" % (specs))
        process_groups = [
            pg for pg in self.process_groups.q_get(specs)
            if pg.exit_status is not None
        ]
        for process_group in process_groups:
            # jobs that were launched on behalf of the script manager shouldn't release the partition
            if not process_group.true_mpi_args:
                self.reserve_resources_until(process_group.location, None,
                                             process_group.jobid)
            del self.process_groups[process_group.id]
        return process_groups

    wait_process_groups = exposed(query(wait_process_groups))

    def signal_process_groups(self, specs, signame="SIGINT"):
        """Simulate the signaling of a process_group."""
        self.logger.info("signal_process_groups(%r, %r)" % (specs, signame))
        process_groups = self.process_groups.q_get(specs)
        for process_group in process_groups:
            if process_group.mode == "script":
                try:
                    pgroup = ComponentProxy("script-manager").signal_jobs(
                        [{
                            'id': process_group.script_id
                        }], "SIGTERM")
                except (ComponentLookupError, xmlrpclib.Fault):
                    logger.error(
                        "Failed to communicate with script manager when killing job"
                    )
            else:
                process_group.signals.append(signame)
        return process_groups

    signal_process_groups = exposed(query(signal_process_groups))

    def start(self, process_group):
        thread.start_new_thread(self._mpirun, (process_group, ))

    def _mpirun(self, process_group):
        argv = process_group._get_argv()
        try:
            stdout = open(process_group.stdout or "/dev/null", "a")
        except:
            stdout = open("/dev/null", "a")
        try:
            stderr = open(process_group.stderr or "/dev/null", "a")
        except:
            stderr = open("/dev/null", "a")

        clfn = process_group.cobalt_log_file or "/dev/null"
        try:
            cobalt_log_file = open(clfn, "a")
            print >> cobalt_log_file, "%s\n" % " ".join(argv[1:])
            cobalt_log_file.close()
        except:
            logger.error("Job %s/%s: unable to open cobaltlog file %s",
                         process_group.id,
                         process_group.user,
                         clfn,
                         exc_info=True)

        try:
            partition = argv[argv.index("-partition") + 1]
        except ValueError:
            print >> stderr, "ERROR: '-partition' is a required flag"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        except IndexError:
            print >> stderr, "ERROR: '-partition' requires a value"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return

        try:
            mode = argv[argv.index("-mode") + 1]
        except ValueError:
            print >> stderr, "ERROR: '-mode' is a required flag"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        except IndexError:
            print >> stderr, "ERROR: '-mode' requires a value"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return

        try:
            size = argv[argv.index("-np") + 1]
        except ValueError:
            print >> stderr, "ERROR: '-np' is a required flag"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        except IndexError:
            print >> stderr, "ERROR: '-np' requires a value"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        try:
            size = int(size)
        except ValueError:
            print >> stderr, "ERROR: '-np' got invalid value %r" % (size)
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return

        print >> stdout, "ENVIRONMENT"
        print >> stdout, "-----------"
        for key, value in process_group.env.iteritems():
            print >> stdout, "%s=%s" % (key, value)
        print >> stdout

        print >> stderr, "FE_MPI (Info) : Initializing MPIRUN"
        reserved = self.reserve_partition(partition, size)
        if not reserved:
            print >> stderr, "BE_MPI (ERROR): Failed to run process on partition"
            print >> stderr, "BE_MPI (Info) : BE completed"
            print >> stderr, "FE_MPI (ERROR): Failure list:"
            print >> stderr, "FE_MPI (ERROR): - 1. ProcessGroup execution failed - unable to reserve partition", partition
            print >> stderr, "FE_MPI (Info) : FE completed"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return

        hardware_failure = False
        for nc in self.partitions[partition].node_cards:
            if nc.id in self.failed_components:
                hardware_failure = True
                break
        for switch in self.partitions[partition].switches:
            if switch in self.failed_components:
                hardware_failure = True
                break

        if hardware_failure:
            excuses = [
                "incorrectly polarized packet accelerator",
                "the Internet is full",
                "side fumbling detected",
                "unilateral phase detractors offline",
            ]
            print >> stderr, "BE_MPI (ERROR): Booting aborted - partition is in DEALLOCATING ('D') state"
            print >> stderr, "BE_MPI (ERROR): Partition has not reached the READY ('I') state"
            print >> stderr, "BE_MPI (Info) : Checking for block error text:"
            print >> stderr, "BE_MPI (ERROR): block error text '%s.'" % random.choice(
                excuses)
            print >> stderr, "BE_MPI (Info) : Starting cleanup sequence"
            time.sleep(20)
            self.release_partition(partition)
            print >> stderr, "BE_MPI (Info) : Partition", partition, "switched to state FREE ('F')"
            print >> stderr, "FE_MPI (ERROR): Failure list:"
            print >> stderr, "FE_MPI (ERROR): - 1.", partition, "couldn't boot."
            print >> stderr, "FE_MPI (Info) : FE completed"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return

        print >> stderr, "FE_MPI (Info) : process group with id", process_group.id
        print >> stderr, "FE_MPI (Info) : Waiting for process_group to terminate"

        print >> stdout, "Running process_group: %s" % " ".join(argv)

        start_time = time.time()
        run_time = random.randint(self.MIN_RUN_TIME, self.MAX_RUN_TIME)
        my_exit_status = 0

        self.logger.info("process group %d running for about %f seconds",
                         process_group.id, run_time)
        while time.time() < (start_time + run_time):
            if "SIGKILL" in process_group.signals:
                process_group.exit_status = 1
                return
            elif "SIGTERM" in process_group.signals:
                print >> stderr, "FE_MPI (Info) : ProcessGroup got signal SIGTERM"
                my_exit_status = 1
                break
            else:
                time.sleep(1)  # tumblers better than pumpers

        print >> stderr, "FE_MPI (Info) : ProcessGroup", process_group.id, "switched to state TERMINATED ('T')"
        print >> stderr, "FE_MPI (Info) : ProcessGroup sucessfully terminated"
        print >> stderr, "BE_MPI (Info) : Releasing partition", partition
        released = self.release_partition(partition)
        if not released:
            print >> stderr, "BE_MPI (ERROR): Partition", partition, "could not switch to state FREE ('F')"
            print >> stderr, "BE_MPI (Info) : BE completed"
            print >> stderr, "FE_MPI (Info) : FE completed"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        print >> stderr, "BE_MPI (Info) : Partition", partition, "switched to state FREE ('F')"
        print >> stderr, "BE_MPI (Info) : BE completed"
        print >> stderr, "FE_MPI (Info) : FE completed"
        print >> stderr, "FE_MPI (Info) : Exit status:", my_exit_status

        process_group.exit_status = my_exit_status

    def update_partition_state(self):
        # first, set all of the nodecards to not busy
        for nc in self.node_card_cache.values():
            nc.used_by = ''

        self._partitions_lock.acquire()
        try:
            for p in self._partitions.values():
                p._update_node_cards()

            now = time.time()

            # since we don't have the bridge, a partition which isn't busy
            # should be set to idle and then blocked states can be derived
            for p in self._partitions.values():
                if p.state != "busy":
                    p.state = "idle"
                if p.reserved_until and now > p.reserved_until:
                    p.reserved_until = None
                    p.reserved_by = None

            for p in self._partitions.values():
                if p.state == "busy":
                    # when the partition becomes busy, if a script job isn't reserving it, then release the reservation
                    if not p.reserved_by:
                        p.reserved_until = False
                else:
                    if p.reserved_until:
                        p.state = "allocated"
                        for part in p._parents:
                            if part.state == "idle":
                                part.state = "blocked (%s)" % (p.name, )
                        for part in p._children:
                            if part.state == "idle":
                                part.state = "blocked (%s)" % (p.name, )
                    for diag_part in self.pending_diags:
                        if p.name == diag_part.name or p.name in diag_part.parents or p.name in diag_part.children:
                            p.state = "blocked by pending diags"
                    for nc in p.node_cards:
                        if nc.used_by:
                            p.state = "blocked (%s)" % nc.used_by
                            break
                    for dep_name in p._wiring_conflicts:
                        if self._partitions[dep_name].state in [
                                "allocated", "busy"
                        ]:
                            p.state = "blocked-wiring (%s)" % dep_name
                            break
                    for part_name in self.failed_diags:
                        part = self._partitions[part_name]
                        if p.name == part.name:
                            p.state = "failed diags"
                        elif p.name in part.parents or p.name in part.children:
                            p.state = "blocked by failed diags"
        except:
            self.logger.error("error in update_partition_state", exc_info=True)

        self._partitions_lock.release()

    update_partition_state = automatic(update_partition_state)
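
    # Illustration of the derived states above (partition names are made up):
    # while "R00-M0-512" holds a live reservation it is marked "allocated",
    # and any idle parent or child partition flips to "blocked (R00-M0-512)";
    # a wiring conflict with an allocated or busy partition instead yields
    # "blocked-wiring (<conflicting name>)".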

    def add_failed_components(self, component_names):
        success = []
        for name in component_names:
            if self.node_card_cache.has_key(name):
                self.failed_components.add(name)
                success.append(name)
            else:
                for p in self._partitions.values():
                    if name in p.switches:
                        self.failed_components.add(name)
                        success.append(name)
                        break
        return success

    add_failed_component = exposed(add_failed_components)

    def del_failed_components(self, component_names):
        success = []
        for name in component_names:
            try:
                self.failed_components.remove(name)
                success.append(name)
            except KeyError:
                pass

        return success

    del_failed_components = exposed(del_failed_components)

    def list_failed_components(self, component_names):
        return list(self.failed_components)

    list_failed_components = exposed(list_failed_components)

    def launch_diags(self, partition, test_name):
        exit_value = 0
        for nc in partition.node_cards:
            if nc.id in self.failed_components:
                exit_value = 1
        for switch in partition.switches:
            if switch in self.failed_components:
                exit_value = 2

        self.finish_diags(partition, test_name, exit_value)
Example no. 34
0
class ClusterBaseSystem (Component):
    """base system class.
    
    Methods:
    add_partitions -- tell the system to manage partitions (exposed, query)
    get_partitions -- retrieve partitions in the simulator (exposed, query)
    del_partitions -- tell the system not to manage partitions (exposed, query)
    set_partitions -- change random attributes of partitions (exposed, query)
    update_relatives -- should be called when partitions are added and removed from the managed list
    """
    
    global cluster_hostfile
    
    def __init__ (self, *args, **kwargs):
        Component.__init__(self, *args, **kwargs)
        self.process_groups = ProcessGroupDict()
        self.all_nodes = set()
        self.running_nodes = set()
        self.down_nodes = set()
        self.queue_assignments = {}
        self.node_order = {}
    
        try:
            self.configure(cluster_hostfile)
        except:
            self.logger.error("unable to load hostfile")
        
        self.queue_assignments["default"] = set(self.all_nodes)
        self.alloc_only_nodes = {} # nodename:starttime
        self.cleaning_processes = []
        #keep track of which jobs still have hosts being cleaned
        self.cleaning_host_count = {} # jobid:count
        self.locations_by_jobid = {} #jobid:[locations]
        self.jobid_to_user = {} #jobid:username
        
        self.alloc_timeout = int(get_cluster_system_config("allocation_timeout", 300))

        self.logger.info("allocation timeout set to %d seconds." % self.alloc_timeout)

    def __getstate__(self):
        state = {}
        state.update(Component.__getstate__(self))
        state.update({
                "cluster_base_version": 1, 
                "queue_assignments": self.queue_assignments,
                "down_nodes": self.down_nodes })
        return state

    def __setstate__(self, state):
        Component.__setstate__(self, state)

        self.queue_assignments = state["queue_assignments"]
        self.down_nodes = state["down_nodes"]

        self.process_groups = ProcessGroupDict()
        self.all_nodes = set()
        self.running_nodes = set()
        self.node_order = {}
        try:
            self.configure(cluster_hostfile)
        except:
            self.logger.error("unable to load hostfile")
        self.alloc_only_nodes = {} # nodename:starttime
        if not state.has_key("cleaning_processes"):
            self.cleaning_processes = []
        self.cleaning_host_count = {} # jobid:count
        self.locations_by_jobid = {} #jobid:[locations]
        self.jobid_to_user = {} #jobid:username

        self.alloc_timeout = int(get_cluster_system_config("allocation_timeout", 300))
        self.logger.info("allocation timeout set to %d seconds." % self.alloc_timeout)

    def save_me(self):
        Component.save(self)
    save_me = automatic(save_me)


    def validate_job(self, spec):
        """validate a job for submission

        Arguments:
        spec -- job specification dictionary
        """
        # spec has {nodes, walltime*, procs, mode, kernel}
        
        max_nodes = len(self.all_nodes)
        # FIXME: is bgtype really needed for clusters?
        try:
            sys_type = CP.get('bgsystem', 'bgtype')
        except:
            sys_type = 'bgl'
        if sys_type == 'bgp':
            job_types = ['smp', 'dual', 'vn', 'script']
        else:
            job_types = ['co', 'vn', 'script']
        try:
            spec['nodecount'] = int(spec['nodecount'])
        except:
            raise JobValidationError("Non-integer node count")
        if not 0 < spec['nodecount'] <= max_nodes:
            raise JobValidationError("Node count out of realistic range")
        if float(spec['time']) < 5:
            raise JobValidationError("Walltime less than minimum")
        if not spec['mode']:
            if sys_type == 'bgp':
                spec['mode'] = 'smp'
            else:
                spec['mode'] = 'co'
        if spec['mode'] not in job_types:
            raise JobValidationError("Invalid mode")
        if not spec['proccount']:
            if spec.get('mode', 'co') == 'vn':
                if sys_type == 'bgl':
                    spec['proccount'] = str(2 * int(spec['nodecount']))
                elif sys_type == 'bgp':
                    spec['proccount'] = str(4 * int(spec['nodecount']))
                else:
                    self.logger.error("Unknown bgtype %s" % (sys_type))
            elif spec.get('mode', 'co') == 'dual':
                spec['proccount'] = 2 * int(spec['nodecount'])
            else:
                spec['proccount'] = spec['nodecount']
        else:
            try:
                spec['proccount'] = int(spec['proccount'])
            except:
                JobValidationError("non-integer proccount")
            if spec['proccount'] < 1:
                raise JobValidationError("negative proccount")
            if spec['proccount'] > spec['nodecount']:
                if spec['mode'] not in ['vn', 'dual']:
                    raise JobValidationError("proccount too large")
                if sys_type == 'bgl' and (spec['proccount'] > (2 * spec['nodecount'])):
                    raise JobValidationError("proccount too large")
                elif sys_type == 'bgp' and (spec['proccount'] > (4 * spec['nodecount'])):
                    raise JobValidationError("proccount too large")
        # need to handle kernel
        return spec
    validate_job = exposed(validate_job)
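
    # A worked example of the proccount defaulting above (hypothetical spec,
    # assuming bgtype is 'bgp' and the host list has at least 64 nodes):
    #
    #   spec = {'nodecount': '64', 'time': '30', 'mode': 'vn',
    #           'proccount': None}
    #   spec = system.validate_job(spec)
    #   assert spec['proccount'] == str(4 * 64)   # '256' on 'bgp'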
     

    #there is absolutely no reason for this to exist in cluster_systems at this point. --PMR
    def run_diags(self, partition_list, test_name):
     
        self.logger.error("Run_diags not used on cluster systems.")
        return None

    run_diags = exposed(run_diags)
    
    def launch_diags(self, partition, test_name):
        '''override this method in derived classes!'''
        raise NotImplementedError("launch_diags is not implemented by this class.")
    
    def finish_diags(self, partition, test_name, exit_value):
        '''call this method somewhere in your derived class where you deal with the exit values of diags'''
        raise NotImplementedError("Finish diags not implemented in this class.")

    def handle_pending_diags(self):
        '''implement to handle diags that are still running.

        '''
        raise NotImplementedError("handle_pending_diags not implemented in this class.")
    #can't automate what isn't there
    #handle_pending_diags = automatic(handle_pending_diags)
    
    def fail_partitions(self, specs):
        self.logger.error("Fail_partitions not used on cluster systems.")
        return ""
    fail_partitions = exposed(fail_partitions)
    
    def unfail_partitions(self, specs):
        self.logger.error("unfail_partitions not used on cluster systems.")
        return ""
    unfail_partitions = exposed(unfail_partitions)
    
    def _find_job_location(self, args):
        nodes = args['nodes']
        jobid = args['jobid']
        
        available_nodes = self._get_available_nodes(args)
            
        if nodes <= len(available_nodes):
            return {jobid: [available_nodes.pop() for i in range(nodes)]}
        else:
            return None


    def _get_available_nodes(self, args):
        queue = args['queue']
        forbidden = args.get("forbidden", [])
        required = args.get("required", [])
        
        if required:
            available_nodes = set(required)
        else:
            available_nodes = self.queue_assignments[queue].difference(forbidden)

        available_nodes = available_nodes.difference(self.running_nodes)
        available_nodes = available_nodes.difference(self.down_nodes)

        return available_nodes
    
    def _backfill_cmp(self, left, right):
        return cmp(left[1], right[1])
    
    # the argument "required" is used to pass in the set of locations allowed by a reservation;
    def find_job_location(self, arg_list, end_times):
        best_location_dict = {}
        winner = arg_list[0]

        jobid = None
        user = None
        
        # first time through, try for starting jobs based on utility scores
        for args in arg_list:
            location_data = self._find_job_location(args)
            if location_data:
                best_location_dict.update(location_data)
                jobid = int(args['jobid'])
                user = args['user']
                break
        
        # the next time through, try to backfill, but only if we couldn't find anything to start
        if not best_location_dict:
            job_end_times = {}
            total = 0
            for item in sorted(end_times, cmp=self._backfill_cmp):
                total += len(item[0])
                job_end_times[total] = item[1]
    
            needed = winner['nodes'] - len(self._get_available_nodes(winner))
            now = time.time()
            backfill_cutoff = 0
            for num in sorted(job_end_times):
                if needed <= num:
                    backfill_cutoff = job_end_times[num] - now

            for args in arg_list:
                if 60*float(args['walltime']) > backfill_cutoff:
                    continue
               
                location_data = self._find_job_location(args)
                if location_data:
                    best_location_dict.update(location_data)
                    self.logger.info("backfilling job %s" % args['jobid'])
                    jobid = int(args['jobid'])
                    user = args['user']
                    break

        # reserve the nodes in best_location_dict, since those locations are
        # expected to be running jobs very soon
        for jobid_str, location_list in best_location_dict.iteritems():
            self.running_nodes.update(location_list)
            self.logger.info("Job %s: Allocating nodes: %s" % (int(jobid_str), location_list))
            #just in case we're not going to be running a job soon, and have to
            #return this to the pool:
            self.jobid_to_user[jobid] = user
            alloc_time = time.time()
            for location in location_list:
                self.alloc_only_nodes[location] = alloc_time
            self.locations_by_jobid[jobid] = location_list
        
        
        return best_location_dict
    find_job_location = exposed(find_job_location)
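
    # A worked sketch of the backfill window above (numbers illustrative):
    # with end_times = [(['n1', 'n2'], now + 600)] and a top job that still
    # needs 2 nodes, job_end_times becomes {2: now + 600}, so backfill_cutoff
    # is 600 seconds and only queued jobs with 60 * walltime <= 600, i.e. a
    # walltime of at most 10 minutes, are candidates for backfilling.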
    
    def check_alloc_only_nodes(self):
        loc_to_release = []
        jobids = []
        check_time = time.time()
        dead_locations = []
        for location, start_time in self.alloc_only_nodes.iteritems():
            if int(check_time) - int(start_time) > self.alloc_timeout:
                self.logger.warning("Location: %s: released.  Time between "\
                        "allocation and run exceeded.", location)
                dead_locations.append(location)
        
        if dead_locations == []:
            #well we don't have anything dying this time.
            return

        for jobid, locations in self.locations_by_jobid.iteritems(): 
            clear_from_dead_locations = False
            for location in locations:
                if location in dead_locations:
                    clear_from_dead_locations = True
                    if jobid not in jobids:
                        jobids.append(jobid)
            #bagging the jobid will cause all locs assoc with job to be
            #cleaned so clear them out to make this faster
            if clear_from_dead_locations:
                for location in locations:
                    if location in dead_locations:
                        dead_locations.remove(location)
            if dead_locations == []:
                #well we don't have anything dying this time.
                break
        self.invoke_node_cleanup(jobids)

    check_alloc_only_nodes = automatic(check_alloc_only_nodes, 
            get_cluster_system_config("automatic_method_interval",10.0))

    def invoke_node_cleanup(self, jobids):
        '''Invoke cleanup for nodes that have exceeded their allocated time
           
        '''
        for jobid in jobids:
            user = self.jobid_to_user[jobid]
            locations = self.locations_by_jobid[jobid]
            for location in locations:
                del self.alloc_only_nodes[location]

            self.clean_nodes(locations, user, jobid)


    def _walltimecmp(self, dict1, dict2):
        return -cmp(float(dict1['walltime']), float(dict2['walltime']))


    def find_queue_equivalence_classes(self, reservation_dict, active_queue_names):
        equiv = []
        for q in self.queue_assignments:
            # skip queues that aren't "running"
            if not q in active_queue_names:
                continue

            found_a_match = False
            for e in equiv:
                if e['data'].intersection(self.queue_assignments[q]):
                    e['queues'].add(q)
                    e['data'].update(self.queue_assignments[q])
                    found_a_match = True
                    break
            if not found_a_match:
                equiv.append( { 'queues': set([q]), 'data': set(self.queue_assignments[q]), 'reservations': set() } )
        
        
        real_equiv = []
        for eq_class in equiv:
            found_a_match = False
            for e in real_equiv:
                if e['queues'].intersection(eq_class['queues']):
                    e['queues'].update(eq_class['queues'])
                    e['data'].update(eq_class['data'])
                    found_a_match = True
                    break
            if not found_a_match:
                real_equiv.append(eq_class)

        equiv = real_equiv
                
        for eq_class in equiv:
            for res_name in reservation_dict:
                skip = True
                for host_name in reservation_dict[res_name].split(":"):
                    if host_name in eq_class['data']:
                        eq_class['reservations'].add(res_name)

            for key in eq_class:
                eq_class[key] = list(eq_class[key])
            del eq_class['data']
        
        return equiv
    find_queue_equivalence_classes = exposed(find_queue_equivalence_classes)
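
    # A worked example of the merging above (queue and host names made up):
    # with queue_assignments = {'default': set(['a', 'b']),
    # 'debug': set(['b']), 'io': set(['c'])} and all three queues active,
    # 'default' and 'debug' share host 'b' and collapse into one class:
    #
    #   [{'queues': ['default', 'debug'], 'reservations': []},
    #    {'queues': ['io'], 'reservations': []}]
    #
    # ('data' is deleted from each class before it is returned).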
    
    def reserve_resources_until(self, location, time, jobid):
        if time is None:
            for host in location:
                self.running_nodes.discard(host)
                self.logger.info("hasty job kill: freeing %s" % host)
        else:
            self.logger.error("failed to reserve location '%r' until '%s'" % (location, time))
    reserve_resources_until = exposed(reserve_resources_until)


    def nodes_up(self, node_list, user_name=None):
        changed = []
        for n in node_list:
            if n in self.down_nodes:
                self.down_nodes.remove(n)
                changed.append(n)
            if n in self.running_nodes:
                self.running_nodes.remove(n)
                changed.append(n)
        if changed:
            self.logger.info("%s marking nodes up: %s", user_name, ", ".join(changed))
        return changed
    nodes_up = exposed(nodes_up)
        

    def nodes_down(self, node_list, user_name=None):
        changed = []
        for n in node_list:
            if n in self.all_nodes:
                self.down_nodes.add(n)
                changed.append(n)
        if changed:
            self.logger.info("%s marking nodes down: %s", user_name, ", ".join(changed))
        return changed
    nodes_down = exposed(nodes_down)

    def get_node_status(self):
        def my_cmp(left, right):
            return cmp(left[2], right[2])
        
        status_list = []
        for n in self.all_nodes:
            if n in self.running_nodes:
                status = "allocated"
            elif n in self.down_nodes:
                status = "down"
            else:
                status = "idle"
            
            status_list.append( (n, status, self.node_order[n]) )
        status_list.sort(my_cmp)
        return status_list
    get_node_status = exposed(get_node_status)

    def get_queue_assignments(self):
        ret = {}
        for q in self.queue_assignments:
            ret[q] = list(self.queue_assignments[q])
        return ret
    get_queue_assignments = exposed(get_queue_assignments)
    
    def set_queue_assignments(self, queue_names, node_list, user_name=None):
        checked_nodes = set()
        for n in node_list:
            if n in self.all_nodes:
                checked_nodes.add(n)
        
        queue_list = queue_names.split(":")
        for q in queue_list:
            if q not in self.queue_assignments:
                self.queue_assignments[q] = set()
                
        for q in self.queue_assignments.keys():
            if q not in queue_list:
                self.queue_assignments[q].difference_update(checked_nodes)
                if len(self.queue_assignments[q])==0:
                    del self.queue_assignments[q]
            else:
                self.queue_assignments[q].update(checked_nodes)
        self.logger.info("%s assigning queues %s to nodes %s", user_name, queue_names, " ".join(checked_nodes))
        return list(checked_nodes)
    set_queue_assignments = exposed(set_queue_assignments)

    def verify_locations(self, location_list):
        """Providing a system agnostic interface for making sure a 'location string' is valid"""
        ret = []
        for l in location_list:
            if l in self.all_nodes:
                ret.append(l)
        return ret
    verify_locations = exposed(verify_locations)

    def configure(self, filename):
        f = open(filename)
        
        counter = 0
        for line in f:
            name = line.strip()
            self.all_nodes.add(name)
            self.node_order[name] = counter
            counter += 1
        
        f.close()
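
    # configure() expects a plain hostfile with one node name per line; line
    # order defines node_order, which get_node_status() uses when sorting.
    # An illustrative hostfile (names are made up):
    #
    #   node01.example.com
    #   node02.example.com
    #   node03.example.com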

    # this gets called by bgsched in order to figure out if there are partition overlaps;
    # it was written to provide the data that bgsched asks for and raises an exception
    # if you try to ask for more
    def get_partitions (self, specs):
        
        partitions = []
        for spec in specs:
            item = {}
            for n in self.all_nodes:
                if "name" in spec:
                    if spec["name"] == '*':
                        item.update( {"name": n} )
                    elif spec["name"] == n:
                        item.update( {"name": n} )
            
            if "name" in spec:    
                spec.pop("name")
            if "children" in spec:
                item.update( {"children": []} )
                spec.pop("children")
            if "parents" in spec:
                item.update( {"parents": []} )
                spec.pop("parents")
            if spec:
                raise NotSupportedError("clusters lack information on: %s" % ", ".join(spec.keys()))
            if item:
                partitions.append(item)
        
        return partitions
    get_partitions = exposed(get_partitions)


    def clean_nodes(self, locations, user, jobid):
        """Given a process group, start cleaning the nodes that were involved.
        The rest of the cleanup is done in check_done_cleaning.
        
        """
        self.logger.info("Job %s/%s: starting node cleanup." , user, jobid)
        try:
            tmp_data = pwd.getpwnam(user)
            groupid = tmp_data.pw_gid
            group_name = grp.getgrgid(groupid)[0]
        except KeyError:
            group_name = ""
            self.logger.error("Job %s/%s: unable to determine group name for epilogue" % (user, jobid))
     
        self.cleaning_host_count[jobid] = 0
        for host in locations:
            h = host.split(":")[0]
            cleaning_process_info ={
                    "host": h, 
                    "cleaning_id": None, 
                    "user": user,
                    "jobid": jobid,
                    "group": group_name,
                    "start_time":time.time(), 
                    "completed":False, 
                    "retry":False,
                    }
            try:
                cleaning_id = self.launch_script("epilogue", h, jobid, user,
                        group_name)
                if cleaning_id == None:
                    #there was no script to run.
                    self.running_nodes.discard(cleaning_processes["host"]) 
                    return 
                self.cleaning_host_count[jobid] += 1
                cleaning_process_info["cleaning_id"] = cleaning_id
                self.cleaning_processes.append(cleaning_process_info)
            except ComponentLookupError:
                self.logger.warning("Job %s/%s: Error contacting forker "
                        "component.  Will Retry until timeout." % (user, jobid))
                cleaning_process_info["retry"] = True
                self.cleaning_processes.append(cleaning_process_info)
                self.cleaning_host_count[jobid] += 1
            except:
                self.logger.error("Job %s/%s: Failed to run epilogue on host "
                        "%s, marking node down", jobid, user, h, exc_info=True)
                self.down_nodes.add(h)
                self.running_nodes.discard(h)
    


    def launch_script(self, config_option, host, jobid, user, group_name):
        '''Start our script processes used for node prep and cleanup.

        '''
        script = get_cluster_system_config(config_option, None)
        if script == None:
            self.logger.error("Job %s/%s: %s not defined in the "\
                    "cluster_system section of the cobalt config file!",
                    user, jobid, config_option)
            return None
        else:
            cmd = ["/usr/bin/ssh", host, script, 
                    str(jobid), user, group_name]
            return ComponentProxy("system_script_forker").fork(cmd, "system epilogue", 
                    "Job %s/%s" % (jobid, user))

        


    
    def retry_cleaning_scripts(self):
        '''Continue retrying scripts in the event that we have lost contact 
        with the forker component.  Reset start-time to when script starts.

        '''
        for cleaning_process in self.cleaning_processes:
            if cleaning_process['retry'] == True:
                try:
                    cleaning_id = self.launch_script(
                            "epilogue",
                            cleaning_process["host"],
                            cleaning_process['jobid'],
                            cleaning_process['user'],
                            cleaning_process['group'])
                    cleaning_process["cleaning_id"] = cleaning_id
                    cleaning_process["start_time"] = time.time()
                    cleaning_process["retry"] = False
                except ComponentLookupError:
                    self.logger.warning("Job %s/%s: Error contacting forker "
                        "component." % (cleaning_process['jobid'],
                        cleaning_process['user']))
                except:
                    self.logger.error("Job %s/%s: Failed to run epilogue on "
                            "host %s, marking node down",
                            cleaning_process['jobid'],
                            cleaning_process['user'],
                            cleaning_process['host'], exc_info=True)
                    self.cleaning_host_count[cleaning_process['jobid']] -= 1
                    self.down_nodes.add(cleaning_process['host'])
                    self.running_nodes.discard(cleaning_process['host'])

    retry_cleaning_scripts = automatic(retry_cleaning_scripts,
            get_cluster_system_config("automatic_method_interval", 10.0))

    def check_done_cleaning(self):
        """Check to see if the processes we are using to clean up nodes 
        post-run are done. If they are, release the nodes back for general 
        consumption.  If the cleanup fails for some reason, then mark the node
        down and release it. 

        """
        
        if self.cleaning_processes == []:
            #don't worry if we have nothing to clean up
            return
        
        for cleaning_process in self.cleaning_processes: 

            #if we can't reach the forker, we've lost all the cleanup scripts.
            #don't try and recover, just assume all nodes that were being 
            #cleaned are down. --PMR
            if cleaning_process['retry'] == True:
                continue #skip this.  Try anyway, if component came back up.
            
            jobid = cleaning_process['jobid']
            user = cleaning_process['user']

            try:
                exit_status = ComponentProxy("system_script_forker").child_completed(
                        cleaning_process['cleaning_id'])
                ComponentProxy("system_script_forker").child_cleanup(
                        [cleaning_process['cleaning_id']])

            except ComponentLookupError:
                self.logger.error("Job %s/%s: Error contacting forker "
                        "component. Running child processes are "
                        "unrecoverable." % (jobid, user))
                return

            if exit_status != None:
                #we're done, this node is now free to be scheduled again.
                self.running_nodes.discard(cleaning_process["host"])
                cleaning_process["completed"] = True
                self.cleaning_host_count[jobid] -= 1
            else:
                #timeout exceeded
                if (time.time() - cleaning_process["start_time"] > 
                        float(get_cluster_system_config("epilogue_timeout", 60.0))): 
                    cleaning_process["completed"] = True
                    try:
                        forker = ComponentProxy("system_script_forker")
                        forker.signal(cleaning_process['cleaning_id'],
                                "SIGINT")
                        child_output = forker.get_child_data(
                            cleaning_process['cleaning_id'])
                        forker.child_cleanup([cleaning_process['cleaning_id']])
                            
                        #mark as dirty and arrange to mark down.
                        self.down_nodes.add(cleaning_process['host'])
                        self.running_nodes.discard(cleaning_process['host'])
                        self.logger.error("Job %s/%s: epilogue timed out on host %s, marking hosts down", 
                            user, jobid, cleaning_process['host'])
                        self.logger.error("Job %s/%s: stderr from epilogue on host %s: [%s]",
                            user, jobid,
                            cleaning_process['host'], 
                            child_output['stderr'].strip())
                        self.cleaning_host_count[jobid] -= 1
                    except ComponentLookupError:
                        self.logger.error("Job %s/%s: Error contacting forker "
                            "component. Running child processes are "
                            "unrecoverable." % (jobid, user))

            if self.cleaning_host_count[jobid] == 0:
                self.del_process_groups(jobid)
                #clean up other cleanup-monitoring stuff
                self.logger.info("Job %s/%s: job finished on %s",
                    user, jobid, Cobalt.Util.merge_nodelist(self.locations_by_jobid[jobid]))
                del self.locations_by_jobid[jobid]
                del self.jobid_to_user[jobid]
        
        self.cleaning_processes = [cleaning_process for cleaning_process in self.cleaning_processes 
                                    if cleaning_process["completed"] == False]
            
    check_done_cleaning = automatic(check_done_cleaning, 
            get_cluster_system_config("automatic_method_interval", 10.0))



    def del_process_groups(self, jobid):

        raise NotImplementedError("Must be overridden in child class")
Example no. 35
0
class ClusterSystem(ClusterBaseSystem):
    """cluster system component.
    
    Methods:
    configure -- load partitions from the bridge API
    add_process_groups -- add (start) an mpirun process on the system (exposed, ~query)
    get_process_groups -- retrieve mpirun processes (exposed, query)
    wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query)
    signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query)
    update_partition_state -- update partition state from the bridge API (runs as a thread)
    """

    name = "system"
    implementation = "cluster_system"

    logger = logger

    def __init__(self, *args, **kwargs):
        ClusterBaseSystem.__init__(self, *args, **kwargs)
        self.process_groups.item_cls = ClusterProcessGroup

    def __setstate__(self, state):
        ClusterBaseSystem.__setstate__(self, state)
        self.process_groups.item_cls = ClusterProcessGroup

    def add_process_groups(self, specs):
        """Create a process group.
        
        Arguments:
        spec -- dictionary hash specifying a process group to start
        """

        self.logger.info("add_process_groups(%r)", specs)
        process_groups = self.process_groups.q_add(specs)
        for pgroup in process_groups:
            self.logger.info(
                "Job %s/%s: process group %s created to track script",
                pgroup.user, pgroup.jobid, pgroup.id)
        #System has started the job.  We need to remove these locations from
        #the temporary allocation tracking in cluster_base_system.
        for pg in process_groups:
            for location in pg.location:
                del self.alloc_only_nodes[location]

        return process_groups

    add_process_groups = exposed(query(add_process_groups))

    def get_process_groups(self, specs):
        self._get_exit_status()
        return self.process_groups.q_get(specs)

    get_process_groups = exposed(query(get_process_groups))

    def _get_exit_status(self):
        try:
            running = ComponentProxy("forker").active_list("process group")
        except:
            self.logger.error(
                "failed to contact forker component for list of running jobs")
            return

        for each in self.process_groups.itervalues():
            if each.head_pid not in running and each.exit_status is None:
                # FIXME: i bet we should consider a retry thing here -- if we fail enough times, just
                # assume the process is dead?  or maybe just say there's no exit code the first time it happens?
                # maybe the second choice is better
                try:
                    dead_dict = ComponentProxy("forker").get_status(
                        each.head_pid)
                except Queue.Empty:
                    self.logger.error(
                        "failed call for get_status from forker component for pg %s",
                        each.head_pid)
                    return

                if dead_dict is None:
                    self.logger.info(
                        "Job %s/%s: process group %i: exited with unknown status",
                        each.user, each.jobid, each.id)
                    each.exit_status = 1234567
                else:
                    each.exit_status = dead_dict["exit_status"]
                    if dead_dict["signum"] == 0:
                        self.logger.info(
                            "process group %i: job %s/%s exited with status %i",
                            each.id, each.jobid, each.user, each.exit_status)
                    else:
                        if dead_dict["core_dump"]:
                            core_dump_str = ", core dumped"
                        else:
                            core_dump_str = ""
                        self.logger.info(
                            "process group %i: job %s/%s terminated with signal %s%s",
                            each.id, each.jobid, each.user,
                            dead_dict["signum"], core_dump_str)

    _get_exit_status = automatic(_get_exit_status)

    def wait_process_groups(self, specs):
        self._get_exit_status()
        process_groups = [
            pg for pg in self.process_groups.q_get(specs)
            if pg.exit_status is not None
        ]
        for process_group in process_groups:
            self.clean_nodes(
                process_group.location, process_group.user,
                process_group.jobid
            )  #FIXME: This call is a good place to look for problems
        return process_groups

    wait_process_groups = locking(exposed(query(wait_process_groups)))

    def signal_process_groups(self, specs, signame="SIGINT"):
        my_process_groups = self.process_groups.q_get(specs)
        for pg in my_process_groups:
            if pg.exit_status is None:
                try:
                    ComponentProxy("forker").signal(pg.head_pid, signame)
                except:
                    self.logger.error(
                        "Failed to communicate with forker when signalling job"
                    )

        return my_process_groups

    signal_process_groups = exposed(query(signal_process_groups))

    def del_process_groups(self, jobid):
        '''delete a process group and don't track it anymore.

           jobid -- jobid associated with the process group we are removing

        '''

        del_items = self.process_groups.q_del([{'jobid': jobid}])

        if del_items == []:
            self.logger.warning(
                "Job %s: Process group not found for this jobid.", jobid)
        else:
            self.logger.info("Job %s: Process group deleted.", jobid)