def flush_log_messages(msg=None):
    """
    Write each entry of msg to the PBS log at LOG_DEBUG level.

    msg is an iterable of messages; when it is None nothing is logged.
    """
    if msg is None:
        return
    for entry in msg:
        pbs.logmsg(pbs.LOG_DEBUG, entry)
Пример #2
0
 def _get_usage(self, job):
     """
     Return the energy figure from the job's monitor report, or None.

     The report comes from api.MonitorReport(job.id). It is only used when
     its first element is the string 'total_energy'; in that case the second
     element is logged against the job and returned.
     """
     pbs.logmsg(pbs.EVENT_DEBUG3, "SGI: %s get_usage" % (job.id))
     report = api.MonitorReport(job.id)
     usable = report is not None and report[0] == 'total_energy'
     if not usable:
         return None
     energy = report[1]
     pbs.logjobmsg(job.id, "SGI: energy %fkWh" % energy)
     return energy
Пример #3
0
 def _deactivate_profile(self, job):
     """
     Stop monitoring for job and delete its nodeset.

     Always returns False when no exception escapes.
     """
     pbs.logmsg(pbs.LOG_DEBUG, "SGI: deactivate")
     try:
         api.MonitorStop(job.id)
     finally:
         # The nodeset has to be removed even when stopping the monitor fails.
         api.NodesetDelete(job.id)
     return False
Пример #4
0
    def activate_profile(self, profile_name=None, job=None):
        """
        Activate a power profile for a job through the PMI backend.

        When job is None, the job of the current hook event is used. After a
        successful backend call with a named profile, current_eoe is set to
        profile_name on every vnode returned by _get_vnode_names(job); this
        is best effort and per-vnode failures are ignored.

        Returns whatever the backend's _activate_profile returns.

        Raises:
            BackendError: the backend call failed. Before re-raising, and only
                if the local vnode has power_provisioning set, the vnode's
                "eoe" resource is refreshed from a fresh profile query
                (errors during the refresh are ignored).
            InternalError: activation could not be done. The local vnode is
                marked offline before re-raising.
        """
        self._check_pmi()
        if job is None:
            job = pbs.event().job

        try:
            ret = self.__pmi._activate_profile(profile_name, job)
            if profile_name is not None:
                hosts = _get_vnode_names(job)
                for h in hosts:
                    try:
                        pbs.event().vnode_list[h].current_eoe = profile_name
                    except Exception:
                        # Best effort only. "except Exception" rather than a
                        # bare except so SystemExit/KeyboardInterrupt are not
                        # swallowed.
                        pass
            return ret
        except BackendError as e:
            # get fresh set of profile names, ignore errors
            mynode = pbs.event().vnode_list[pbs.get_local_nodename()]
            if mynode.power_provisioning:
                try:
                    profiles = self.__pmi._query(
                        pbs.Power.QUERY_PROFILE)
                    names = self._map_profile_names(profiles)
                    mynode.resources_available["eoe"] = names
                    pbs.logmsg(pbs.LOG_WARNING,
                               "PMI:activate: set eoe: %s" % names)
                except Exception:
                    # The refresh is optional; the original error is what
                    # gets reported below.
                    pass
            raise BackendError(e)
        except InternalError as e:
            # couldn't do activation so set vnode offline
            me = pbs.get_local_nodename()
            pbs.event().vnode_list[me].state += pbs.ND_OFFLINE
            pbs.logmsg(pbs.LOG_WARNING, "PMI:activate: set vnode offline")
            raise InternalError(e)
Пример #5
0
 def _connect(self, endpoint, port, job):
     """
     Log the connect request and verify the connection to the backend.

     endpoint and port are accepted but not used by this implementation.
     job may be None; when given, its id is included in the log message.
     """
     if job is None:
         message = "SGI: connect"
     else:
         message = "SGI: %s connect" % (job.id)
     pbs.logmsg(pbs.EVENT_DEBUG3, message)
     api.VerifyConnection()
     return
Пример #6
0
def setBudget(project, budget):
    """
    Set project_hours of the given project in the pbs_accounting database.

    project is matched against projects.project_name and budget is stored in
    projects.project_hours. Both are passed as query parameters. The change
    is committed before returning.
    """
    pbs.logmsg(pbs.LOG_DEBUG, "---> setBudget " + str(project) + " to " + str(budget))
    conn = psycopg2.connect(database="pbs_accounting", user = "******", password = "******", host = "mullis01.sns.it", port = "5432")
    try:
        cur = conn.cursor()
        try:
            cur.execute("UPDATE projects SET project_hours = %s WHERE project_name = %s;",(budget, project))
            conn.commit()
        finally:
            cur.close()
    finally:
        # Release the connection even when the UPDATE or the commit fails.
        conn.close()
Пример #7
0
 def _pmi_power_on(self, hosts):
     """
     Issue a "node_on" command for the nids that correspond to hosts.

     Always returns True when no exception escapes.
     """
     pbs.logmsg(pbs.LOG_DEBUG, "Cray: powering-on the node")
     # Only the first element of nidlist's result is needed here.
     nids, _ = nidlist(None, nodenids(hosts))
     launch("pmi_power_on", "node_on --nids " + nids)
     return True
Пример #8
0
 def TouchFile(self,fname,times=None):
     """
     Create fname if it does not exist and update its timestamps.

     times is passed straight to os.utime (None means "now").

     Returns True on success. Returns False, after logging, when the file
     cannot be opened or its times cannot be set.
     """
     try:
         with open(fname, 'a'):
             pass
         os.utime(fname, times)
         return True
     except (IOError, OSError):
         # os.utime raises OSError, which is distinct from IOError on
         # Python 2, so both are caught.
         pbs.logmsg(pbs.EVENT_DEBUG3,"Failed to touch file: %s"%(fname))
         return False
Пример #9
0
def isEntitled(user, project):
    """
    Tell whether user may charge jobs to project.

    Looks up users.user_projects for user_name == user in the pbs_accounting
    database and returns `project in <that value>`. Returns False when the
    user has no row.

    NOTE(review): if user_projects is stored as a plain string, the `in`
    test is a substring match (e.g. "proj" matches "proj2") - confirm the
    column type.
    """
    conn = psycopg2.connect(database="pbs_accounting", user = "******", password = "******", host = "mullis01.sns.it", port = "5432")
    try:
        cur = conn.cursor()
        try:
            # Pass user as a query parameter; never splice it into the SQL.
            cur.execute("SELECT user_projects FROM users WHERE user_name = %s;", (user,))
            result = cur.fetchone()
        finally:
            cur.close()
    finally:
        conn.close()

    if result is not None:
        pbs.logmsg(pbs.LOG_DEBUG, "---> isEntitled: user %s is part of project(s) %s, we are checking project %s" % (user, result[0], project))
        return (project in result[0])
    else:
        pbs.logmsg(pbs.LOG_DEBUG, "---> isEntitled: user %s is not on database" % user)
        return False
Пример #10
0
def getBudget(project):
    """
    Return the project_hours of project as a float, or None if not found.

    Reads projects.project_hours for project_name == project from the
    pbs_accounting database.
    """
    conn = psycopg2.connect(database="pbs_accounting", user = "******", password = "******", host = "mullis01.sns.it", port = "5432")
    try:
        cur = conn.cursor()
        try:
            # Pass project as a query parameter; never interpolate it into
            # the SQL text.
            cur.execute("SELECT project_hours FROM projects WHERE project_name = %s;", (project,))
            result = cur.fetchone()
        finally:
            cur.close()
    finally:
        conn.close()

    if result is not None:
        pbs.logmsg(pbs.LOG_DEBUG, "---> getBudget: project and hours are " + str(project) + " : " + str(result[0]))
        return float(result[0])
    else:
        pbs.logmsg(pbs.LOG_DEBUG, "---> getBudget: project not found")
        return None
Пример #11
0
    def ConvertToBytes(self,value):
        """
        Convert a configured disk-space threshold string into a number.

        If value contains '%', the percentage is returned as a float; the
        float type is what lets the caller tell a percentage from an absolute
        size. Otherwise, if value contains one of the suffixes kb/mb/gb/tb,
        the text before the last two characters is parsed as an int and
        multiplied by the unit size. The unit sizes are selected by
        nhc_cfg["disk_space"]["units"]: 'binary' (powers of 1024), 'decimal'
        (powers of 1000); any other setting is logged and treated as binary.
        A value with no '%' and no known suffix is returned lower-cased and
        otherwise unconverted.

        Returns False when the numeric part cannot be parsed.
        """
        binary_units = {'kb':1024,'mb':1048576,'gb':1073741824,'tb':1099511627776}

        # Determine what units the user would like to use.
        unit_style = self.nhc_cfg["disk_space"]["units"]
        if unit_style.lower() == 'binary':
            units = binary_units
        elif unit_style.lower() == 'decimal':
            units = {'kb':1000,'mb':1000000,'gb':1000000000,'tb':1000000000000}
        else:
            pbs.logmsg(pbs.EVENT_DEBUG3,"I'm not sure how to handle units: %s\nSo I will default to binary"%\
                                 (unit_style))
            units = binary_units

        value = value.lower()
        if value.find('%') !=-1:
            pbs.logmsg(pbs.EVENT_DEBUG3,"found a % symbol")
            # Returned as a float so that I can distinguish between percentage vs free space
            value = float(value.strip('%'))
            pbs.logmsg(pbs.EVENT_DEBUG3,"value: %s"%value)
        else:
            for key in units:
                if key in value:
                    try:
                        # The unit suffix is assumed to be the last two characters.
                        value = int(value[:-2].strip())*units[key]
                    except Exception:
                        pbs.logmsg(pbs.EVENT_DEBUG,"Error convertion value to int: %s\tkey: %s"%(value,key))
                        return False
                    break

        # The conversions above are useless unless the result is handed back.
        return value
Пример #12
0
    def ChkMountPoints(self):
        """
        Verify that every configured mount point is actually mounted.

        The mount points are the keys of nhc_cfg["mounts"]["mount_points"];
        the value stored for each key is the action to report on failure.

        Returns:
            True when the check is disabled (nhc_cfg['mounts']['check'] is
            False) or when every mount point checks out;
            [action, message] for the first path whose real path is not a
            mount point;
            False when checking a path raises an exception.
        """
        if self.nhc_cfg['mounts']['check'] == False:
            pbs.logmsg(pbs.EVENT_DEBUG3,"Skipping mounts check")
            return True

        mount_points = self.nhc_cfg["mounts"]["mount_points"]
        for mnt_pnt in mount_points:
            pbs.logmsg(pbs.EVENT_DEBUG3,"mount point: %s, %s"%(mnt_pnt,mount_points[mnt_pnt]))
            try:
                # Resolve symlinks first so a link to a mounted filesystem counts.
                if not os.path.ismount(os.path.realpath(mnt_pnt)):
                    pbs.logmsg(pbs.EVENT_DEBUG3,"Mount: %s\tAction: %s"%(mnt_pnt,mount_points[mnt_pnt]))
                    return [mount_points[mnt_pnt],'%s does not appear to be mounted'%mnt_pnt]
            except Exception as e:
                pbs.logmsg(pbs.EVENT_DEBUG,"Mount check error: %s"%e)
                return False
            pbs.logmsg(pbs.EVENT_DEBUG3,"mount point %s checked out"%(mnt_pnt))

        # Report success explicitly, like the skip path above.
        return True
Пример #13
0
 def _pmi_ramp_up(self, hosts):
     """
     Step the sleep-state limit of each node through its non-zero states.

     The available states are read with
     "get_sleep_state_limit_capabilities". For every returned nid entry that
     carries "data", the capability values are walked in reverse order and a
     "set_sleep_state_limit" command is launched for each value whose int()
     is not 0, followed by a random pause of 1 to 10 seconds.

     Always returns True when no exception escapes.
     """
     pbs.logmsg(pbs.LOG_DEBUG, "Cray: ramping up the node")
     func = "pmi_ramp_up"
     nids, _ = nidlist(None, nodenids(hosts))
     out = launch(func, "get_sleep_state_limit_capabilities --nids " + nids)
     for entry in out["nids"]:
         if "data" not in entry:
             continue
         nid = entry["nid"]
         states = entry["data"]["PWR_Attrs"][0]["PWR_AttrValueCapabilities"]
         for state in reversed(states):
             if int(state) == 0:
                 continue
             launch(func,
                    "set_sleep_state_limit --nids " + str(nid) +
                    " --limit " + str(state))
             # Random pause between the individual limit changes.
             time.sleep(random.randint(1, 10))
     return True
Пример #14
0
    def _get_usage(self, job):
        """
        Return the energy used by job since its start counter was recorded.

        The start value is read from the file named by energy_file(job). The
        current value comes from node_energy (periodic hook, all jobs at
        once) or job_energy (any other event). The difference is divided by
        3600000.0 before being returned (presumably joules to kWh - the log
        message labels the raw counter with "J").

        Returns None when the start file cannot be read or parsed, or when
        no current energy reading is available.
        """
        pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: %s get_usage" % (job.id))
        try:
            # Context manager: the file is closed even if int() fails.
            with open(energy_file(job), "r") as f:
                start = int(f.read())
        except Exception:
            return None

        e = pbs.event()
        if e.type == pbs.EXECHOST_PERIODIC:
            # This function will be called for each job in turn when
            # running from a periodic hook.  Here we fill in some
            # global variables just once and use the information
            # for each job in turn.  Save the result of calling capmc
            # for all running jobs in the variable ninfo.  Keep a
            # dictionary with the job id's as keys holding a set
            # of nid numbers.
            if Pmi.ninfo is None:
                allnids = set()
                for jobid in e.job_list.keys():
                    j = e.job_list[jobid]
                    nidset = jobnids(j)
                    allnids.update(nidset)
                    Pmi.nidarray[jobid] = nidset
                nids, cnt = nidlist(None, allnids)
                Pmi.ninfo = node_energy("all", nids, cnt)
            nidset = Pmi.nidarray[job.id]
            energy = None
            if Pmi.ninfo is not None and "nodes" in Pmi.ninfo:
                energy = 0
                for node in Pmi.ninfo["nodes"]:
                    if node["nid"] in nidset:  # owned by job of interest
                        energy += node["energy_ctr"]
                pbs.logjobmsg(job.id, "Cray: get_usage: energy %dJ" %
                              energy)
        else:
            nids, cnt = nidlist(job)
            energy = job_energy(job, nids, cnt)
        if energy is not None:
            return float(energy - start) / 3600000.0
        else:
            return None
Пример #15
0
    def rejectjob(reason, action=DEFAULT_ACTION):
        """
        Log a job rejection, update the job comment, then reject the event.

        reason is the human-readable cause. action selects what happens to
        the job first: RERUN calls job.rerun(), DELETE calls job.delete(),
        anything else only rejects. The reason is prefixed accordingly
        ("Requeued - ", "Deleted - " or "Rejected - ").

        Relies on job, hook_name, pbs_event and VERBOSE_USER_OUTPUT being
        available from the enclosing scope.
        """

        # Arguments to pbs.event().reject() do nothing in execjob events. Log a
        # warning instead, update the job comment, then reject the job.
        if action == RERUN:
            job.rerun()
            reason='Requeued - %s' % reason
        elif action == DELETE:
            job.delete()
            reason='Deleted - %s' % reason
        else:
            reason='Rejected - %s' % reason

        job.comment='%s: %s' % (hook_name, reason)
        pbs.logmsg(pbs.LOG_WARNING, ';'.join([hook_name, job.id, reason]))
        pbs.logjobmsg(job.id, reason) # Add a message that can be tracejob'd
        if VERBOSE_USER_OUTPUT:
            # Parenthesised form prints the same text on Python 2 and 3.
            print(reason)
        pbs_event.reject()
Пример #16
0
    def ChkDirFilePermissions(self):
        """
            Compare the permissions of each configured file/dir with the required ones.

            The paths are the keys of nhc_cfg["permissions"]["check_dirs_and_files"];
            each value holds the required permission string at index 0 and the
            action at index 1. The octal text of st_mode is compared on its last
            len(required) characters.

            Returns True if the check is disabled or all permissions match,
            [action, message] on the first mismatch or when os.stat raises OSError,
            and False on any other error.
        """
        perm_cfg = self.nhc_cfg["permissions"]
        if perm_cfg["check"] == False:
            pbs.logmsg(pbs.EVENT_DEBUG3,"Skipping permissions check")
            return True

        targets = perm_cfg["check_dirs_and_files"]
        for file_dir in targets:
            entry = targets[file_dir]
            pbs.logmsg(pbs.EVENT_DEBUG3,"File/Dir: %s\t%s"%(file_dir,str(entry[0])))
            try:
                mode_text = oct(os.stat(file_dir).st_mode)
                required = entry[0]
                # Only the trailing digits, as many as the requirement has, are compared.
                actual = mode_text[-len(required):]
                if actual != str(required):
                    pbs.logmsg(pbs.EVENT_DEBUG3,"Required permissions: %s\tpermissions: %s"%(str(required),actual))
                    return [entry[1],"File/Dir: %s\tRequired permissions: %s\tpermissions: %s"% \
                                     (file_dir, str(required), actual)]
            except OSError:
                return [entry[1],"Can not find file/dir: %s"%file_dir]
            except:
                return False

        return True
Пример #17
0
    def ChkProcesses(self):
        """
        Check that required processes run and forbidden ones do not.

        On Linux the process table is built from the output of "top -bn1":
        the first output line is skipped, and for every other non-empty line
        the basename of the last column is mapped to the second column.
        Names under nhc_cfg["processes"]["running"] must be present; names
        under nhc_cfg["processes"]["stopped"] must be absent.

        Returns True when the check is disabled or nothing is violated,
        otherwise [action, message] where action is taken from index 1 of
        the first violating entry.
        """
        proc_cfg = self.nhc_cfg["processes"]
        if proc_cfg["check"] == False:
            pbs.logmsg(pbs.EVENT_DEBUG3,"Skipping processes check")
            return True

        # List all of the processes
        procs = {}
        if platform.uname()[0] == 'Linux':
            out, err = subprocess.Popen(['top', '-bn1'], stdout=subprocess.PIPE).communicate()
            for raw in out.split('\n')[1:]:
                if raw == "":
                    continue
                fields = raw.split()
                # top -bn1 layout: command in the last column, value stored
                # from the second column.
                procs[os.path.basename(fields[-1].split()[0])] = fields[1]

        pbs.logmsg(pbs.EVENT_DEBUG3,"Processes: %s"%procs)

        # store procs that violate the checks
        chk_procs = {'running': [], 'stopped': []}
        chk_action = ""

        # Processes that must be running
        for proc in self.nhc_cfg["processes"]["running"]:
            if proc in procs:
                continue
            pbs.logmsg(pbs.EVENT_DEBUG,"Process: %s is not in the running process list but should be"%proc)
            chk_procs['running'].append(proc)
            if chk_action == "":
                chk_action = self.nhc_cfg['processes']['running'][proc][1]

        # Processes that must not be running
        for proc in self.nhc_cfg['processes']['stopped']:
            if proc not in procs:
                continue
            pbs.logmsg(pbs.EVENT_DEBUG,"Process: %s is in the stopped process list but was found to be running"%proc)
            chk_procs['stopped'].append(proc)
            if chk_action == "":
                chk_action = self.nhc_cfg['processes']['stopped'][proc][1]

        if chk_procs['running'] or chk_procs['stopped']:
            # join is a free name here (presumably string.join) - it is
            # called exactly as before.
            line = "running: %s\nstopped: %s"%(join(chk_procs['running'],','),join(chk_procs['stopped'],','))
            return [chk_action, "CheckProcesses: One or more processes were found which violates the check\n%s"%line]

        return True
Пример #18
0
    def CheckNodePeriodic(self):
        """
        Run the periodic node checks and return how many of them failed.

        The mount, disk-usage and file-permission checks of the object bound
        to the name c are run in that order; a check counts as failed when
        c.ContinueChk() returns a false value for its result.
        """
        pbs.logmsg(pbs.EVENT_DEBUG3,"Ready perform check node periodic")

        # (announcement logged before the check, name of the check on c)
        checks = (
            ("Ready to check the mounts", "ChkMountPoints"),
            ("Ready to check the disk usage", "ChkDiskUsage"),
            ("Ready to check the file permissions", "ChkDirFilePermissions"),
        )

        failCnt = 0
        for announcement, check_name in checks:
            pbs.logmsg(pbs.EVENT_DEBUG3,announcement)
            if not c.ContinueChk(getattr(c, check_name)()):
                failCnt += 1

        pbs.logmsg(pbs.EVENT_DEBUG3,"Exiting CheckNode function")

        return failCnt
Пример #19
0
 def _pmi_power_status(self, hosts):
     """
     Return the set of names from hosts whose nid is reported as ready.

     Runs "node_status" for the nids of hosts. If the result has no 'ready'
     key an empty set is returned. Otherwise each host's vnode is looked up
     with _svr_vnode and its "PBScraynid" resource, when present, is
     compared (as an int) against the ready list.
     """
     # Do a capmc node_status and return a list of ready nodes.
     pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: status of the nodes")
     nids, _ = nidlist(nidset=nodenids(hosts))
     out = launch("pmi_power_status", "node_status --nids " + nids)

     nodeset = set()
     if 'ready' not in out:
         return nodeset
     ready = out['ready']

     craynid = "PBScraynid"
     for vname in hosts:
         vnode = _svr_vnode(vname)
         if craynid not in vnode.resources_available:
             continue
         if int(vnode.resources_available[craynid]) in ready:
             nodeset.add(vname)
     return nodeset
Пример #20
0
    def stderr(self, msg):
        """
        Write msg to the appropriate destination for stderr output.

        For a non-interactive job on the mother superior (job.in_ms_mom()),
        msg is appended to the file self.stderr_log; otherwise it is written
        to sys.stderr. An IOError is handed to trace_hook().
        """
        import sys

        try:
            to_file = (not pbs.event().job.interactive
                       and pbs.event().job.in_ms_mom())
            if to_file:
                logfile=open(self.stderr_log, 'ab+')
            else:
                logfile=sys.stderr

            try:
                if DEBUG:
                    pbs.logmsg(pbs.EVENT_DEBUG3,
                        '%s;%s;[DEBUG3]: writing %s to %s' %
                            (pbs.event().hook_name,
                             pbs.event().job.id,
                             repr(msg),
                             logfile.name))

                logfile.write(msg)
                logfile.flush()
            finally:
                # Close only what was opened here: closing sys.stderr would
                # break every later write to it, and the log file must not
                # leak when the write fails.
                if to_file:
                    logfile.close()
        except IOError:
            trace_hook()
Пример #21
0
    def _activate_profile(self, profile_name, job):
        """
        Record the job's starting energy and apply its power caps.

        The current energy reading (job_energy) is written to the file named
        by energy_file(job) when available. If _running_excl(job) is true, a
        "set_power_cap" command is launched with the job's pcap_node and/or
        pcap_accelerator resources; when neither is set this is only logged.

        Returns False when nidlist reports no nids for the job, otherwise
        True. profile_name is only used in the log message.
        """
        pbs.logmsg(pbs.LOG_DEBUG,
                   "Cray: %s activate '%s'" % (job.id, str(profile_name)))

        nids, cnt = nidlist(job)
        if cnt == 0:
            pbs.logjobmsg(job.id, "Cray: no compute nodes for power setting")
            return False

        energy = job_energy(job, nids, cnt)
        if energy is not None:
            # Context manager: the file is closed even if the write fails.
            with open(energy_file(job), "w") as f:
                f.write(str(energy))

        # If this is the only job, set nodes to capped power.
        if _running_excl(job):
            cmd = "set_power_cap --nids " + nids
            doit = False

            # (resource name, label used in the log, command-line option)
            cap_kinds = (
                ('pcap_node', "node", " --node "),
                ('pcap_accelerator', "accel", " --accel "),
            )
            for resource, label, option in cap_kinds:
                pcap = job.Resource_List[resource]
                if pcap is not None:
                    pbs.logjobmsg(job.id, "Cray: pcap %s %d" % (label, pcap))
                    cmd += option + str(pcap)
                    doit = True

            if doit:
                launch(job.id, cmd)
            else:
                pbs.logjobmsg(job.id, "Cray: no power cap to set")

        return True
Пример #22
0
    def _activate_profile(self, profile_name, job):
        """
        Save the job's initial energy counter and set any requested power cap.

        When job_energy returns a reading it is stored in the file named by
        energy_file(job). When _running_excl(job) is true, the pcap_node and
        pcap_accelerator resources of the job, if set, are turned into one
        "set_power_cap" command; if neither is set that fact is logged.

        Returns False if nidlist reports a count of 0 for the job, True
        otherwise. profile_name only appears in the log message.
        """
        pbs.logmsg(pbs.LOG_DEBUG, "Cray: %s activate '%s'" %
                   (job.id, str(profile_name)))

        nids, cnt = nidlist(job)
        if cnt == 0:
            pbs.logjobmsg(job.id, "Cray: no compute nodes for power setting")
            return False

        energy = job_energy(job, nids, cnt)
        if energy is not None:
            # "with" guarantees the handle is released if write() raises.
            with open(energy_file(job), "w") as energy_fp:
                energy_fp.write(str(energy))

        # If this is the only job, set nodes to capped power.
        if _running_excl(job):
            cmd = "set_power_cap --nids " + nids
            doit = False

            pcap = job.Resource_List['pcap_node']
            if pcap is not None:
                pbs.logjobmsg(job.id, "Cray: pcap node %d" % pcap)
                cmd += " --node " + str(pcap)
                doit = True

            pcap = job.Resource_List['pcap_accelerator']
            if pcap is not None:
                pbs.logjobmsg(job.id, "Cray: pcap accel %d" % pcap)
                cmd += " --accel " + str(pcap)
                doit = True

            if doit:
                launch(job.id, cmd)
            else:
                pbs.logjobmsg(job.id, "Cray: no power cap to set")

        return True
Пример #23
0
def _pbs_conf(confvar):
    """
    Return the value of a PBS configuration setting, or None if unknown.

    An environment variable named confvar takes precedence. Otherwise the
    conf file ($PBS_CONF_FILE, default /etc/pbs.conf) is parsed once into
    the module-level dict pmi_pbsconf and later calls are answered from that
    dict. Blank lines, lines starting with '#', and lines without a
    non-empty value after '=' are ignored.

    If the conf file cannot be opened, None is returned and the (empty)
    dict is kept, so the file is not re-read on later calls.
    """
    if confvar in os.environ:
        return os.environ[confvar]

    global pmi_pbsconf
    if "pmi_pbsconf" not in globals():
        pmi_pbsconf = dict()
        pbsconf = os.environ.get("PBS_CONF_FILE", "/etc/pbs.conf")

        try:
            fp = open(pbsconf)
        except (IOError, OSError):
            pbs.logmsg(pbs.DEBUG, "%s: Unable to open conf file." % pbsconf)
            return None

        # Context manager: the file is closed even if reading fails.
        with fp:
            for line in fp:
                line = line.strip()
                # ignore empty lines or those beginning with '#'
                if line == "" or line[0] == "#":
                    continue
                var, _, val = line.partition('=')
                if val == "":
                    continue
                pmi_pbsconf[var] = val

    return pmi_pbsconf.get(confvar)
Пример #24
0
def check_express_project_code():
    """
    Validate the express project code of the job in the current event.

    The event is rejected when no project is set, when repr(project) does
    not match ^exp-[a-z0-9]+$, when test_group_membership([project]) is
    false, when the "enabled" API answers 200 with a body other than "1",
    or when no truthy response was obtained from the API at all.

    Returns repr(project).
    """
    project = pbs.event().job.project
    if not project:
        pbs.event().reject(
            "You must specify an express code with -P when submitting express jobs"
        )

    project = repr(project)
    if re.match("^exp-[a-z0-9]+$", project) is None:
        pbs.event().reject(
            "Invalid express code: these have the format 'exp-XXXX'")
    if not test_group_membership([project]):
        pbs.event().reject("You are not authorised to use this express code")

    # Any failure to import requests or to reach the API leaves this as None.
    response = None
    try:
        import requests
        url = "https://api.rcs.imperial.ac.uk/v1.0/express/%s/enabled" % (
            project, )
        response = requests.get(url)
    except:  #
        pass

    if not response:
        pbs.event().reject(
            "This express code cannot be used at this time. Please try later or contact [email protected]"
        )
    else:
        pbs.logmsg(pbs.LOG_ERROR, str(response.status_code))
        pbs.logmsg(pbs.LOG_ERROR, str(response.text))
        if (response.status_code == 200) and (response.text != "1"):
            pbs.event().reject(
                "This express code is not enabled. Please contact [email protected]"
            )

    return project
def send_notification(subject, email_message, job_owner_email_address):
    """
    Send an HTML email to the job owner through Amazon SES.

    The SES region and the sender address come from the module-level names
    ses_region and ses_sender_email. Any exception is logged, not raised.
    """
    try:
        ses_client = boto3.client('ses', region_name=ses_region)
        recipients = {'ToAddresses': [job_owner_email_address]}
        message = {
            'Subject': {'Data': subject},
            'Body': {'Html': {'Data': email_message}},
        }
        ses_client.send_email(Source=ses_sender_email,
                              Destination=recipients,
                              Message=message)
        pbs.logmsg(pbs.LOG_DEBUG,
                   'notify_job_status: SES output' + str(ses_client))
    except Exception as err:
        pbs.logmsg(pbs.LOG_DEBUG,
                   'notify_job_status: Error sending email' + str(err))
Пример #26
0
def _pbs_conf(confvar):
    """
    Look up a setting from the environment or from the pbs.conf file.

    If confvar is set in the environment, that value wins. Otherwise the
    conf file (path from $PBS_CONF_FILE, else /etc/pbs.conf) is read on the
    first call and cached in the global dict pmi_pbsconf; later calls use
    the cache. Lines that are empty, start with '#', or have nothing after
    '=' are skipped.

    Returns the value, or None when the setting is unknown or the conf file
    could not be opened (in which case the empty cache is kept and the file
    is not read again).
    """
    # Return the value of a setting in the pbs.conf file if it exists.
    # Save the values in a global dictionary for future use.

    if confvar in os.environ:
        return os.environ[confvar]

    global pmi_pbsconf
    if "pmi_pbsconf" not in globals():
        pmi_pbsconf = dict()
        cfile = "PBS_CONF_FILE"
        if cfile in os.environ:
            pbsconf = os.environ[cfile]
        else:
            pbsconf = "/etc/pbs.conf"

        try:
            fp = open(pbsconf)
        except (IOError, OSError):
            pbs.logmsg(pbs.DEBUG, "%s: Unable to open conf file." % pbsconf)
            return None
        else:
            # "with" closes the file even when reading a line fails.
            with fp:
                for line in fp:
                    line = line.strip()
                    # ignore empty lines or those beginning with '#'
                    if line == "" or line[0] == "#":
                        continue
                    var, eq, val = line.partition('=')
                    if val == "":
                        continue
                    pmi_pbsconf[var] = val
    if confvar in pmi_pbsconf:
        return pmi_pbsconf[confvar]
    else:
        return None
def find_users_in_ldap_group(group_dn):
    """
    Return the members of an LDAP group as a list of non-empty strings.

    If the ad_automation directory exists, an Active Directory ldapsearch
    is built from the cached join user, password and domain name; otherwise
    an OpenLDAP ldapsearch is used. The command is run through a shell and
    its output is split on newlines.

    NOTE(review): group_dn and the cached values are spliced into a shell
    command unescaped, and the cache files are used exactly as read
    (including any trailing newline) - confirm both are acceptable.
    """
    def read_cache(path):
        # Return the whole content of one cache file.
        with open(path, 'r') as handle:
            return handle.read()

    if os.path.isdir(
            "/apps/soca/%SOCA_CONFIGURATION/cluster_node_bootstrap/ad_automation"
    ):
        # Active Directory
        pbs.logmsg(
            pbs.LOG_DEBUG,
            'queue_acl: find_users_in_ldap_group: Detected Active Directory')
        ad_user = read_cache(
            '/apps/soca/%SOCA_CONFIGURATION/cluster_node_bootstrap/ad_automation/join_domain_user.cache')
        ad_password = read_cache(
            '/apps/soca/%SOCA_CONFIGURATION/cluster_node_bootstrap/ad_automation/join_domain.cache')
        domain_name = read_cache(
            '/apps/soca/%SOCA_CONFIGURATION/cluster_node_bootstrap/ad_automation/domain_name.cache')
        ldapsearch = "".join([
            'ldapsearch -x -h ', domain_name,
            ' -D "', ad_user, '@', domain_name,
            '" -w "', ad_password,
            '" -b "', group_dn,
            '" | grep member  | awk \'{print $2}\' | cut -d, -f1 | tr -d "CN="',
        ])
        # Never write the password itself to the log.
        pbs.logmsg(
            pbs.LOG_DEBUG, 'queue_acl: generated ldapsearch command: ' +
            ldapsearch.replace(ad_password, "<REDACTED_PASSWORD>"))
    else:
        # OpenLdap
        pbs.logmsg(pbs.LOG_DEBUG,
                   'queue_acl: find_users_in_ldap_group: Detected OpenLDAP')
        ldapsearch = "ldapsearch -x -b " + group_dn + " -LLL | grep memberUid | awk '{print $2}'"
        pbs.logmsg(pbs.LOG_DEBUG,
                   'queue_acl: generated ldapsearch command: ' + ldapsearch)

    users_in_group = os.popen(ldapsearch).read()  # nosec
    pbs.logmsg(pbs.LOG_DEBUG,
               'queue_acl: find_users_in_ldap_group' + str(users_in_group))
    return [user for user in users_in_group.split('\n') if user]
Пример #28
0
    def __execjob_end_handler(self):
        """
        Stop and then remove the docker container named after self.jid.

        Each command is logged before it is run with os.system; the exit
        status of the commands is not inspected.
        """
        pbs.logmsg(pbs.LOG_DEBUG, "Docker execjob_end handler start")

        for docker_cmd in ("docker stop ", "docker rm "):
            call = docker_cmd + str(self.jid)
            pbs.logmsg(pbs.LOG_DEBUG, "Call is: %s" % call)
            os.system(call)
Пример #29
0
    def ConvertToBytes(self, value):
        """
        Turn a disk-space threshold string into a number.

        A value containing '%' is returned as a float percentage; being a
        float is what lets the caller tell a percentage from free space.
        A value containing kb, mb, gb or tb has the text before its last two
        characters parsed as an int and multiplied by the unit size. The
        unit sizes follow nhc_cfg["disk_space"]["units"]: 'binary' uses
        powers of 1024, 'decimal' powers of 1000, anything else is logged
        and handled as binary. Any other value comes back lower-cased but
        otherwise unconverted.

        Returns False if the numeric part is not a valid int.
        """
        binary = {
            'kb': 1024,
            'mb': 1048576,
            'gb': 1073741824,
            'tb': 1099511627776
        }
        decimal = {
            'kb': 1000,
            'mb': 1000000,
            'gb': 1000000000,
            'tb': 1000000000000
        }

        # Determine what units the user would like to use.
        requested = self.nhc_cfg["disk_space"]["units"]
        if requested.lower() == 'binary':
            units = binary
        elif requested.lower() == 'decimal':
            units = decimal
        else:
            pbs.logmsg(pbs.EVENT_DEBUG3,"I'm not sure how to handle units: %s\nSo I will default to binary"%\
                                 (requested))
            units = binary

        value = value.lower()
        if value.find('%') != -1:
            pbs.logmsg(pbs.EVENT_DEBUG3, "found a % symbol")
            # Returned as a float so that I can distinguish between percentage vs free space
            value = float(value.strip('%'))
            pbs.logmsg(pbs.EVENT_DEBUG3, "value: %s" % value)
        else:
            for key in units:
                if value.find(key) != -1:
                    try:
                        # The last two characters are taken to be the unit.
                        value = int(value[:-2].strip()) * units[key]
                    except Exception:
                        pbs.logmsg(
                            pbs.EVENT_DEBUG,
                            "Error convertion value to int: %s\tkey: %s" %
                            (value, key))
                        return False
                    break

        # Hand the converted value back; without this it would be discarded.
        return value
Пример #30
0
    def ChkDirFilePermissions(self):
        """
            Check each configured file/dir against its required permissions.

            nhc_cfg["permissions"]["check_dirs_and_files"] maps a path to a
            sequence holding the required permission string (index 0) and the
            action (index 1). The octal text of the path's st_mode is compared
            on its trailing len(required) characters.

            Returns True when the check is switched off or everything matches,
            [action, message] for the first mismatch or for a path os.stat cannot
            find (OSError), and False for any other exception.
        """
        settings = self.nhc_cfg["permissions"]
        if not settings["check"]:
            pbs.logmsg(pbs.EVENT_DEBUG3, "Skipping permissions check")
            return True

        wanted = settings["check_dirs_and_files"]
        for file_dir in wanted:
            rule = wanted[file_dir]
            pbs.logmsg(pbs.EVENT_DEBUG3,
                       "File/Dir: %s\t%s" % (file_dir, str(rule[0])))
            try:
                st = os.stat(file_dir)
                permissions = oct(st.st_mode)
                expected = rule[0]
                # Compare just the trailing digits the rule specifies.
                found = permissions[-len(expected):]

                if found != str(expected):
                    pbs.logmsg(
                        pbs.EVENT_DEBUG3,
                        "Required permissions: %s\tpermissions: %s" %
                        (str(expected), found))
                    return [
                        rule[1],
                        "File/Dir: %s\tRequired permissions: %s\tpermissions: %s"
                        % (file_dir, str(expected), found)
                    ]
            except OSError:
                return [rule[1], "Can not find file/dir: %s" % file_dir]
            except BaseException:
                return False

        return True
Пример #31
0
    def call_hc(self, script):
        """
        Run a health-check script through the shell and record its outcome.

        The script runs with a fixed PATH. Its exit code is stored in
        self.rc and the last line of its stripped stdout in self.comment.
        The event self.e is rejected when the script cannot be run or when
        it wrote anything to stderr.
        """
        stdout = None
        stderr = None

        my_env = os.environ.copy()
        my_env[
            "PATH"] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/bin:/usr/sbin"

        try:
            # universal_newlines=True makes communicate() return text on
            # Python 3 as well; with bytes the "\n" handling below would
            # raise TypeError.
            proc = subprocess.Popen(script,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    env=my_env,
                                    shell=True,
                                    universal_newlines=True)

            stdout, stderr = proc.communicate()
            self.rc = proc.returncode
        except Exception as err:
            pbs.logmsg(
                pbs.EVENT_DEBUG,
                "Health-check hook; run script %s error: '%s'" % (script, err))
            self.e.reject()

        pbs.logmsg(pbs.EVENT_DEBUG,
                   "Health-check hook; finished with exit code: %d" % self.rc)

        if stdout or stderr:
            pbs.logmsg(
                pbs.EVENT_DEBUG,
                "Health-check hook; stdout: '%s' stderr: '%s'" %
                (str(stdout).replace("\n", " "), str(stderr).replace(
                    "\n", " ")))

        if stdout:
            # The last line of output becomes the comment.
            self.comment = stdout.strip().split("\n")[-1]

        if stderr:
            pbs.logmsg(pbs.EVENT_DEBUG,
                       "Health-check hook; stderr not empty, skipping")
            self.e.reject()
Пример #32
0
    def __setallresources_handler(self):
        """
        Run every getandset_* step and log each one that reports failure.

        A step counts as failed when its result compares equal to False.
        All steps are run regardless of earlier failures.
        """
        # (method to call on self, message format used when it fails)
        steps = (
            ("getandset_cgroups",
             "%s, failed to get and set cgroups resource"),
            ("getandset_cpu_flag",
             "%s, failed to get and set cpu_flag resource"),
            ("getandset_os",
             "%s, failed to get and set os resource"),
            ("getandset_cuda_version",
             "%s, failed to get and set cuda_version resource"),
        )
        for method_name, failure_fmt in steps:
            if getattr(self, method_name)() == False:
                pbs.logmsg(pbs.EVENT_DEBUG, failure_fmt % self.hook_name)
Пример #33
0
def parse_cfg():
    """
    Load and validate the scratch hook configuration.

    The file named by the PBS_HOOK_CONFIG_FILE environment variable is
    parsed as JSON.  The result is expected to be a mapping whose keys are
    scratch types (keys of the module-level ``scratch_types``) and whose
    values are mappings from a name to a list.

    Returns:
        The parsed configuration, or an empty dict when the environment
        variable is not set, the file cannot be read or parsed, or the
        content does not have the expected structure.
    """
    config = {}
    if 'PBS_HOOK_CONFIG_FILE' in os.environ:
        config_file = os.environ["PBS_HOOK_CONFIG_FILE"]
        try:
            # The context manager closes the file even when parsing fails.
            with open(config_file, 'r') as cfg_fh:
                config = json.load(cfg_fh)
        except Exception as err:
            pbs.logmsg(
                pbs.EVENT_DEBUG,
                "scratch hook; failed to open config file %s: %s" %
                (config_file, str(err)))
            return {}

    # Valid JSON need not be an object; reject it instead of failing on
    # .keys() below.
    if not isinstance(config, dict):
        pbs.logmsg(
            pbs.EVENT_DEBUG,
            "scratch hook; failed to parse config file, incorrect config structure"
        )
        return {}

    for i in config:
        if i not in scratch_types:
            pbs.logmsg(
                pbs.EVENT_DEBUG,
                "scratch hook; failed to parse config file, incorrect scratch type %s"
                % str(i))
            return {}

        if not isinstance(config[i], dict):
            pbs.logmsg(
                pbs.EVENT_DEBUG,
                "scratch hook; failed to parse config file, incorrect config structure"
            )
            return {}

        for j in config[i]:
            # NOTE(review): an unknown inner key is only logged here; the
            # configuration is not discarded, unlike the outer check.
            # Behaviour kept as found - confirm whether this is intended.
            if j not in scratch_types:
                pbs.logmsg(
                    pbs.EVENT_DEBUG,
                    "scratch hook; failed to parse config file, incorrect scratch type %s"
                    % str(j))

            if not isinstance(config[i][j], list):
                pbs.logmsg(
                    pbs.EVENT_DEBUG,
                    "scratch hook; failed to parse config file, incorrect nodes type"
                )
                return {}

    return config
Пример #34
0
def run_file(fpath):
    """
    Execute the external hook program *fpath* and log what it produced.

    The child process environment is the object returned by
    ``j.Variable_List`` (``j`` is the module-level job), to which JOBID,
    USER, GROUP and HOSTNAME are added.  Stdout, stderr and the exit code
    are written to the log.  Any exception is caught and logged; the
    function returns nothing.
    """
    try:
        pbs.logmsg(pbs.EVENT_DEBUG, "external hook started: %s" % fpath)

        # The job's variable list is used rather than a copy of os.environ.
        child_env = j.Variable_List
        child_env['JOBID'] = j.id
        child_env['USER'] = j.euser
        child_env['GROUP'] = j.egroup
        child_env['HOSTNAME'] = socket.gethostname()

        child = subprocess.Popen(fpath,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 env=child_env)
        captured_out, captured_err = child.communicate()

        # Fold multi-line output onto a single log line.
        flat_out = captured_out.replace("\n", ",")
        flat_err = captured_err.replace("\n", ",")
        pbs.logmsg(
            pbs.EVENT_DEBUG, "external hook %s stdout: '%s' stderr: '%s'" %
            (fpath, flat_out, flat_err))
        pbs.logmsg(pbs.EVENT_DEBUG,
                   "external hook ended with exitcode: %d" % child.returncode)
    except Exception as error:
        pbs.logmsg(pbs.EVENT_DEBUG,
                   "external hook %s failed: %s" % (fpath, str(error)))
Пример #35
0
    def file_check(self):
        """
        Check that self.script exists with the expected mode and ownership.

        The script must be a file, its permission bits (formatted with
        oct()) must equal self.allowed_permission, and its uid/gid must
        equal self.allowed_uid / self.allowed_gid.

        Returns:
            True when every check passes; otherwise False, after logging
            the reason.
        """
        script_path = self.script

        if not os.path.isfile(script_path):
            pbs.logmsg(pbs.EVENT_DEBUG, "Health-check hook; %s is not a file or not found" % script_path)
            return False

        # Permission bits come from lstat(), ownership below from stat().
        mode_bits = stat.S_IMODE(os.lstat(script_path).st_mode)
        file_permission = oct(mode_bits)
        if file_permission != self.allowed_permission:
            pbs.logmsg(pbs.EVENT_DEBUG, "Health-check hook; incorrect file permission: %s" % file_permission)
            return False

        owner = os.stat(script_path)
        if owner.st_uid != self.allowed_uid or owner.st_gid != self.allowed_gid:
            pbs.logmsg(pbs.EVENT_DEBUG, "Health-check hook; incorrect file owner: %d:%d" % (owner.st_uid, owner.st_gid))
            return False

        return True
Пример #36
0
    def __queuejob_handler(self):
        """
        Make sure every chunk of the job's select statement asks for docker.

        Chunks (separated by "+") that already contain docker=true/True are
        kept as they are; ":docker=true" is appended to the others.  A job
        without a select statement gets the single chunk "docker=true".
        The resource list is logged before and after the change.
        """
        pbs.logmsg(pbs.LOG_DEBUG, "Docker queuejob handler start")

        if "select" in self.j.Resource_List.keys():
            chunks = str(self.j.Resource_List["select"]).split("+")
            newselect = [
                chunk if re.search("docker=[Tt]{1}rue", chunk)
                else chunk + ":docker=true"
                for chunk in chunks
            ]
        else:
            newselect = ["docker=true"]

        pbs.logmsg(pbs.LOG_DEBUG, "Old select: %s" % str(self.j.Resource_List))
        rebuilt = "+".join(newselect)
        self.j.Resource_List["select"] = pbs.select(rebuilt)
        pbs.logmsg(pbs.LOG_DEBUG, "New select: %s" % str(self.j.Resource_List))
Пример #37
0
    def getandset_os(self):
        """
        Set the local vnode's "os" resource from /etc/os-release.

        The value is the ID field followed by the VERSION_ID field (quotes
        removed), replaced by its alias when one is defined in
        version_aliases.

        Returns:
            True when the resource was set; False when parsing raised or
            no ID value was found.  A failure to read the file is logged
            and then treated as an empty file.
        """
        version_aliases = {"rhel7.6": "centos7"}
        release_lines = []
        distro = ""
        release = ""

        try:
            for path in ["/etc/os-release"]:
                with open(path) as handle:
                    release_lines.extend(handle.readlines())
        except Exception as err:
            pbs.logmsg(
                pbs.EVENT_DEBUG,
                "%s, getandset_os error: %s" % (self.hook_name, str(err)))

        try:
            for entry in release_lines:
                fields = entry.split("=")
                key = fields[0]
                if key == "ID":
                    distro = fields[1].replace('"', '').strip()
                if key == "VERSION_ID":
                    release = fields[1].replace('"', '').strip()
        except Exception as err:
            pbs.logmsg(
                pbs.EVENT_DEBUG,
                "%s, getandset_os error: %s" % (self.hook_name, str(err)))
            return False

        if distro == "":
            return False

        combined = distro + release
        res_value = version_aliases.get(combined, combined)
        self.vnl[self.local_node].resources_available["os"] = res_value
        pbs.logmsg(
            pbs.EVENT_DEBUG,
            "%s, resource os set to: %s" % (self.hook_name, res_value))
        return True
def main():
    """
    Hook entry point: load the optional JSON config and dispatch the event.

    QUEUEJOB events go to hold_on_submit(), PERIODIC events to
    periodic_release_hook(); any other event type is logged as unknown.
    SystemExit and every other exception are logged and then re-raised.
    """
    try:
        hook_config = {}
        config_path = pbs.hook_config_filename
        if config_path:
            with open(config_path) as config_fh:
                loaded = json.load(config_fh)
            hook_config.update(loaded)

        event = pbs.event()
        if event.type == pbs.QUEUEJOB:
            hold_on_submit(hook_config, event.job)
        elif event.type == pbs.PERIODIC:
            periodic_release_hook(hook_config, event)
        else:
            pbs.logmsg(pbs.EVENT_ERROR, "Unknown event type %s" % event.type)
    except SystemExit:
        pbs.logmsg(pbs.LOG_DEBUG, "cycle - Exited with SystemExit")
        raise
    except:
        # Deliberately broad: the traceback is logged and the exception is
        # passed on unchanged.
        pbs.logmsg(pbs.EVENT_ERROR, "cycle - %s" % traceback.format_exc())
        raise
Пример #39
0
    def launch_job(self):
        """
        Rewrite the event's program and arguments into a "docker exec" call.

        The program becomes /usr/bin/docker and argv starts with
        "docker exec [-it] <jid>".  What follows depends on
        self.container_type:

        interactive -- /bin/bash
        script      -- /bin/bash -c <job_file> (only /bin/bash, plus a log
                       entry, when self.job_file is empty)
        service     -- /bin/sh
        executable  -- /bin/bash -c "<original argv joined by spaces>"

        Any other container type gets nothing appended after the job id.
        """
        original_args = copy.deepcopy(self.e.argv)
        pbs.logmsg(pbs.LOG_DEBUG, "args are %s" % self.e.progname)

        kind = self.container_type
        interactive = kind == "interactive"

        self.e.progname = "/usr/bin/docker"
        self.e.argv = []
        self.e.argv.append("docker")
        self.e.argv.append("exec")
        if interactive:
            self.e.argv.append("-it")
        self.e.argv.append(self.jid)

        if interactive:
            self.e.argv.append("/bin/bash")
        elif kind == "script":
            self.e.argv.append("/bin/bash")
            if self.job_file:
                self.e.argv.append("-c")
                self.e.argv.append(self.job_file)
            else:
                pbs.logmsg(pbs.LOG_DEBUG, "Job file is missing")
        elif kind == "service":
            self.e.argv.append("/bin/sh")
        elif kind == "executable":
            self.e.argv.append("/bin/bash")
            self.e.argv.append("-c")
            # Every original argument is followed by one space, the last
            # one included.
            command_line = ""
            for piece in original_args:
                command_line += piece + " "
                pbs.logmsg(pbs.LOG_DEBUG, "arg: %s" % piece)
            self.e.argv.append(command_line)
Пример #40
0
 def _connect(self, endpoint=None, port=None, job=None):
     """
     Log a connect request; the job id is included when a job is given.

     endpoint and port are accepted but not used here.  Returns None.
     """
     if job is not None:
         message = "Cray: %s connect" % (job.id)
     else:
         message = "Cray: connect"
     pbs.logmsg(pbs.EVENT_DEBUG3, message)
Пример #41
0
 def __init__(self, pyhome=None):
     """
     Log the "Cray: init" message.

     pyhome is accepted but not used here.
     """
     pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: init")
Пример #42
0
# or contact the Altair Legal Department.
#
# Altair’s dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of PBS Pro and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair’s trademarks, including but not limited to "PBS™",
# "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
# trademark licensing policies.

import pbs

e = pbs.event()
if e.type == pbs.RESVSUB:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is resvsub")
elif e.type == pbs.RESV_END:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is resv_end")
elif e.type == pbs.QUEUEJOB:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is queuejob")
elif e.type == pbs.MODIFYJOB:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is modifyjob")
elif e.type == pbs.MOVEJOB:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is movejob")
elif e.type == pbs.RUNJOB:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is runjob")
elif e.type == pbs.PERIODIC:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is periodic")
elif e.type == pbs.EXECJOB_BEGIN:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is execjob_begin")
elif e.type == pbs.EXECJOB_PROLOGUE:
Пример #43
0
# The failed nodes are offlined.  The 's' accounting record is generated.

# To register the hook, as root via qmgr:
# qmgr << RJS
# create hook rjs_hook
# set hook rjs_hook event = 'queuejob,execjob_launch'
# set hook rjs_hook enabled = true
# import hook rjs_hook application/x-python default ReliableJobStartup.py
# RJS

import pbs
# The event this hook was invoked for.
e = pbs.event()

if e.type == pbs.QUEUEJOB:
    # add a log entry in server logs
    pbs.logmsg(pbs.LOG_DEBUG, "queuejob hook executed")
    e.job.tolerate_node_failures = "job_start"

    # Save current select spec in resource 'site'
    selspec = e.job.Resource_List["select"]
    if selspec is None:
        # NOTE(review): the lines below use selspec unconditionally, so this
        # relies on reject() not returning - TODO confirm.
        e.reject("Event job does not have select spec!")
    e.job.Resource_List["site"] = str(selspec)

    # increment_chunks() can use a percentage argument or an integer. For
    # example add 1 chunk to each chunk (except the first) in the job's
    # select spec
    new_select = selspec.increment_chunks(1)
    e.job.Resource_List["select"] = new_select
    pbs.logmsg(pbs.LOG_DEBUG, "job's select spec changed to %s" % new_select)
Пример #44
0
def trace_hook(**kwargs):
    """Simple exception trace logger for PBS hooks

    Call this from inside an ``except`` block: the details of the exception
    being handled are read from sys.exc_info().

    loglevel=<int> (pbs.LOG_ERROR): log level to pass to pbs.logmsg()
    reject=<bool> (True): reject the event upon completion of logging trace
    trace_in_reject=<bool> (False): pass trace to pbs.event().reject()
    trace_in_reject=<str>: message to pass to pbs.event().reject() with trace
    """
    import sys

    loglevel = kwargs.get('loglevel', pbs.LOG_ERROR)
    reject = kwargs.get('reject', True)
    trace_in_reject = kwargs.get('trace_in_reject', False)

    # Associate hook events with the appropriate PBS constant. This is a list
    # of all hook events as of PBS Pro 13.0. Events that the running version
    # of PBS does not define are simply left out of the lookup table.
    hook_events = (
        'queuejob', 'modifyjob', 'movejob', 'runjob', 'execjob_begin',
        'execjob_prologue', 'execjob_launch', 'execjob_attach',
        'execjob_preterm', 'execjob_epilogue', 'execjob_end', 'resvsub',
        'provision', 'exechost_periodic', 'exechost_startup', 'execjob_resize',
        'execjob_abort'
    )

    # Two-way table: event code -> name, and name / NAME -> event code.
    missing = object()
    hook_event = {}
    for he in hook_events:
        event_code = getattr(pbs, he.upper(), missing)
        if event_code is missing:
            continue
        hook_event[event_code] = he
        hook_event[he] = event_code
        hook_event[he.upper()] = event_code

    exc_type, exc_value, exc_tb = sys.exc_info()
    trace = {
        'line': exc_tb.tb_lineno,
        'module': exc_tb.tb_frame.f_code.co_name,
        'exception': exc_type.__name__,
        # Exceptions without a .message attribute fall back to str().
        'message': getattr(exc_value, 'message', str(exc_value)),
    }

    event = pbs.event()
    # An event type missing from the table is reported by its raw value
    # rather than raising KeyError while an error is being reported.
    event_name = hook_event.get(event.type, event.type)
    tracemsg = '%s hook %s encountered an exception: Line %s in %s %s: %s' % (
        event_name, event.hook_name, trace['line'],
        trace['module'], trace['exception'], trace['message'])
    rejectmsg = "Hook Error: request rejected as filter hook '%s' encountered " \
        "an exception. Please inform Admin" % event.hook_name

    if not isinstance(loglevel, int):
        # Format the prefix on its own: tracemsg may itself contain '%'.
        prefix = 'trace_hook() called with invalid argument (loglevel=%s), ' \
            'setting to pbs.LOG_ERROR. ' % (loglevel,)
        tracemsg = prefix + tracemsg
        loglevel = pbs.LOG_ERROR

    pbs.logmsg(loglevel, tracemsg)

    if reject:
        tracemsg += ', request rejected'
        if isinstance(trace_in_reject, bool):
            if trace_in_reject:
                event.reject(tracemsg)
            else:
                event.reject(rejectmsg)
        else:
            event.reject(
                str(trace_in_reject) + 'Line %s in %s %s:\n%s' %
                (trace['line'], trace['module'], trace['exception'],
                 trace['message']))
Пример #45
0
    def _deactivate_profile(self, job):
        """
        Undo the job's power settings and record its final energy usage.

        Steps, in order:
          1. Give up if nidlist(job) reports no compute nodes.
          2. Delete the job's initial energy file (errors ignored).
          3. If _running_excl(job) is true, reset the node and/or
             accelerator power cap to 0 for the job's nids (errors from
             launch() ignored).
          4. Read the job's RUR file, sum the "energy_used" values of all
             "energy" plugin lines, then delete the file.
          5. Store the total, converted from joules to kWh, in
             job.resources_used["energy"] when no value is present yet or
             the new value is larger than the existing one.

        Returns:
            True when an energy line was found in the RUR file; False when
            there are no compute nodes, the RUR file cannot be opened, the
            file fails the ownership/permission check, or it contains no
            energy line.
        """
        pbs.logmsg(pbs.LOG_DEBUG, "Cray: deactivate %s" % job.id)
        nids, cnt = nidlist(job)
        if cnt == 0:
            pbs.logjobmsg(job.id, "Cray: no compute nodes for power setting")
            return False

        # remove initial energy file
        try:
            os.unlink(energy_file(job))
        except Exception:
            pass

        # If this is the only job, undo any power cap we set.
        if _running_excl(job):
            cmd = "set_power_cap --nids " + nids
            doit = False  # becomes True once there is a cap to reset

            pcap = job.Resource_List['pcap_node']
            if pcap is not None:
                pbs.logjobmsg(job.id, "Cray: remove pcap node %d" % pcap)
                cmd += " --node 0"
                doit = True
            pcap = job.Resource_List['pcap_accelerator']
            if pcap is not None:
                pbs.logjobmsg(job.id, "Cray: remove pcap accel %d" % pcap)
                cmd += " --accel 0"
                doit = True

            if doit:
                # Best effort: a failure to reset the cap is ignored.
                try:
                    launch(job.id, cmd)
                except Exception:
                    pass
            else:
                pbs.logjobmsg(job.id, "Cray: no power cap to remove")

        # Get final energy value from RUR data
        name = rur_file(job)
        try:
            rurfp = open(name, "r")
        except Exception:
            pbs.logjobmsg(job.id, "Cray: no RUR data")
            return False

        # Only accept a file owned by uid 0 that is not world-writable;
        # anything else is deleted unread.
        sbuf = os.fstat(rurfp.fileno())
        if (sbuf.st_uid != 0) or (sbuf.st_mode & stat.S_IWOTH):
            pbs.logjobmsg(job.id, "Cray: RUR file permission: %s" % name)
            rurfp.close()
            os.unlink(name)
            return False

        pbs.logjobmsg(job.id, "Cray: reading RUR file: %s" % name)
        energy = 0  # running total of energy_used over all energy lines
        seen = False  # track if energy plugin is seen
        for line in rurfp:
            # Lines are split as "<plugin> : <apid> : <metric list>".
            plugin, _, rest = line.partition(" : ")
            if plugin != "energy":  # check that the plugin is energy
                continue

            apid, _, metstr = rest.partition(" : ")
            seen = True
            try:  # parse the metric list
                # NOTE(review): eval() executes text read from the RUR file;
                # the ownership/permission check above is the only guard.
                # If the metric list is always a plain literal,
                # ast.literal_eval would be safer - TODO confirm.
                metlist = eval(metstr, {})
                # The flat list is consumed as consecutive (name, value)
                # pairs.
                metrics = dict(metlist[i:i + 2]
                               for i in range(0, len(metlist), 2))
                joules = metrics["energy_used"]
                energy += joules
                pbs.logjobmsg(
                    job.id,
                    'Cray:RUR: {"apid":%s,"apid_energy":%dJ,"job_energy":%dJ}'
                    % (apid, joules, energy))
            except Exception as e:
                # A line that cannot be parsed is logged and skipped.
                pbs.logjobmsg(job.id,
                              "Cray:RUR: energy_used not found: %s" % str(e))

        rurfp.close()
        os.unlink(name)

        if not seen:
            pbs.logjobmsg(job.id, "Cray:RUR: no energy plugin")
            return False

        old_energy = job.resources_used["energy"]
        # 3600000 joules per kWh.
        new_energy = float(energy) / 3600000.0
        if old_energy is None:
            pbs.logjobmsg(job.id, "Cray:RUR: energy %fkWh" % new_energy)
            job.resources_used["energy"] = new_energy
        elif new_energy > old_energy:
            pbs.logjobmsg(
                job.id,
                "Cray:RUR: energy %fkWh replaces periodic energy %fkWh" %
                (new_energy, old_energy))
            job.resources_used["energy"] = new_energy
        else:
            # The existing value is kept when it is not smaller.
            pbs.logjobmsg(
                job.id, "Cray:RUR: energy %fkWh last periodic usage %fkWh" %
                (new_energy, old_energy))
        return True
Пример #46
0
 def power_status(self, hosts=None):
     """
     Return the power status reported by the PMI object for hosts.

     Calls self._check_pmi() first, logs the request, then returns the
     result of the PMI object's _pmi_power_status(hosts).
     """
     self._check_pmi()
     pbs.logmsg(pbs.EVENT_DEBUG3, "PMI:powerstatus: status of nodes")
     return self.__pmi._pmi_power_status(hosts)
Пример #47
0
def debug(msg):
    """
    Log msg at pbs.EVENT_DEBUG3, prefixed with "LA debug: ".
    """
    pbs.logmsg(pbs.EVENT_DEBUG3, 'LA debug: %s' % msg)
Пример #48
0
def log_function_name():
    """
    Log the caller's name

    Writes "<hook name>:<caller_name(2)>: Method called" at
    pbs.EVENT_DEBUG4, where the hook name comes from the current event.
    """
    pbs.logmsg(pbs.EVENT_DEBUG4, '%s:%s: Method called' %
               (pbs.event().hook_name, caller_name(2)))
Пример #49
0
	# If it's a system user accept the job
	if pbs.event().requestor in ["PBS_Server", "Scheduler", "pbs_mom"]:
		pbs.event().accept()

	# Check if project has been set
	if pbs.event().job.project is not None:
		project = str(pbs.event().job.project)
	else:
		project = "_pbs_project_default"

	# Accept if it's default project
	# TODO change it in production
	if project == "_pbs_project_default":
		pbs.event().accept()

	pbs.logmsg(pbs.LOG_DEBUG, "---> Queuejob Hook Start! Requestor is %s and project is %s" % (pbs.event().requestor, project))
	pbs.logmsg(pbs.LOG_DEBUG, "---> select line is %s" % pbs.event().job.Resource_List.select)

	# Check if user is part of the project
	if not isEntitled(pbs.event().requestor, project):
		pbs.logmsg(pbs.LOG_DEBUG, "---> user " + str(pbs.event().requestor) + " is not part of project " + str(project))
		pbs.event().reject("You are not allowed to use the budget of project " + str(project))

	#
	myQueue = str(pbs.event().job.queue)
	if myQueue == "":
		pbs.event().reject("No queue selected, please select a queue")
	if myQueue == "workq":
		pbs.event().reject("Queue workq is not enabled")

	#
Пример #50
0
 def _disconnect(self, job=None):
     """
     Log a disconnect request; the job id is included when a job is given.

     Returns None.
     """
     if job is not None:
         message = "Cray: %s disconnect" % (job.id)
     else:
         message = "Cray: disconnect"
     pbs.logmsg(pbs.EVENT_DEBUG3, message)
Пример #51
0
def _config_positive_int(config, key, default):
    """Return config[key] as a positive int, logging it; *default* if absent or not positive."""
    if key not in config:
        return default
    value = int(config[key])
    if not value or value < 0:
        value = default
    pbs.logmsg(pbs.EVENT_DEBUG3, "%s is set to %d" % (key, value))
    return value


def parse_config_file():
    """
    Read the power hook configuration into the module-level settings.

    PBS_HOME and PBS_EXEC are taken from pbs.get_pbs_conf(); if that fails
    the event is accepted.  The configuration file is the one named by
    PBS_HOOK_CONFIG_FILE, or else the first existing PBS_power.CF under
    <PBS_HOME>/server_priv/hooks or <PBS_HOME>/mom_priv/hooks.  It is parsed
    as JSON.

    Every setting is first reset to its default (both features disabled)
    and then overridden by the value found in the file, if any.  Integer
    settings that are zero or negative fall back to their default.

    Raises:
        Exception: when no configuration file is found or it cannot be
            read or parsed.  A value that int() cannot convert propagates
            the error raised by int().
    """
    global pbs_home
    global pbs_exec
    global power_ramp_rate_enable
    global power_on_off_enable
    global node_idle_limit
    global min_node_down_delay
    global max_jobs_analyze_limit
    global max_concurrent_nodes

    try:
        # This block will work for PBS Pro versions 13 and later
        pbs_conf = pbs.get_pbs_conf()
        pbs_home = pbs_conf['PBS_HOME']
        pbs_exec = pbs_conf['PBS_EXEC']
    except Exception:
        pbs.logmsg(pbs.EVENT_DEBUG,
                   "PBS_HOME needs to be defined in the config file")
        pbs.logmsg(pbs.EVENT_DEBUG, "Exiting the power hook")
        pbs.event().accept()

    # Identify the config file: the environment wins, then the server
    # location, then the mom location.
    config_file = os.environ.get("PBS_HOOK_CONFIG_FILE", '')
    if not config_file:
        for subdir in ('server_priv', 'mom_priv'):
            candidate = os.path.join(pbs_home, subdir, 'hooks',
                                     'PBS_power.CF')
            if os.path.isfile(candidate):
                config_file = candidate
                break
    if not config_file:
        raise Exception("Config file not found")
    pbs.logmsg(pbs.EVENT_DEBUG3, "Config file is %s" % config_file)

    try:
        # The context manager closes the file even when parsing fails.
        with open(config_file, 'r') as fd:
            config = json.load(fd)
    except IOError:
        raise Exception("I/O error reading config file")
    except Exception:
        raise Exception("Error reading config file")

    # Assign default values to attributes
    power_ramp_rate_enable = False
    power_on_off_enable = False
    node_idle_limit = 1800
    min_node_down_delay = 1800
    max_jobs_analyze_limit = 100
    max_concurrent_nodes = 10

    # Now assign values read from config file
    if 'power_on_off_enable' in config:
        power_on_off_enable = config['power_on_off_enable']
        pbs.logmsg(pbs.EVENT_DEBUG3, "power_on_off_enable is set to %s" %
                   str(power_on_off_enable))
    if 'power_ramp_rate_enable' in config:
        power_ramp_rate_enable = config['power_ramp_rate_enable']
        pbs.logmsg(pbs.EVENT_DEBUG3, "power_ramp_rate_enable is set to %s" %
                   str(power_ramp_rate_enable))

    node_idle_limit = _config_positive_int(
        config, 'node_idle_limit', node_idle_limit)
    min_node_down_delay = _config_positive_int(
        config, 'min_node_down_delay', min_node_down_delay)
    max_jobs_analyze_limit = _config_positive_int(
        config, 'max_jobs_analyze_limit', max_jobs_analyze_limit)
    max_concurrent_nodes = _config_positive_int(
        config, 'max_concurrent_nodes', max_concurrent_nodes)
Пример #52
0
 def _query(self, query_type):
     """
     Log the query request and return None.

     query_type is accepted but not used by this implementation.
     """
     pbs.logmsg(pbs.LOG_DEBUG, "Cray: query")
     return None
Пример #53
0
    # pbs_conf() will return PBS_HOME if it is not.
    mom_priv = os.path.abspath(
        os.path.join(pbs_conf()['PBS_MOM_HOME'], 'mom_priv'))

    # Get the hook alarm time from the .HK file if it exists.
    hk_file = os.path.join(mom_priv, 'hooks', '%s.HK' % hook_name)
    if os.path.exists(hk_file):
        hook_settings = dict(
            [l.strip().split('=') for l in open(hk_file, 'r').readlines()])
        if 'alarm' in hook_settings.keys():
            hook_alarm = int(hook_settings['alarm'])
        if 'debug' in hook_settings.keys():
            DEBUG = True if hook_settings['debug'] == 'true' else False

    if DEBUG:
        pbs.logmsg(pbs.LOG_DEBUG,
                   '%s;%s;[DEBUG] starting.' % (hook_name, job.id))

    if 'PBS_HOOK_CONFIG_FILE' in os.environ:
        config_file = os.environ["PBS_HOOK_CONFIG_FILE"]
        config = dict([
            l.split('#')[0].strip().split('=')
            for l in open(config_file, 'r').readlines() if '=' in l
        ])

        # Set the true/false configurations
        if 'ENABLE_PARALLEL' in config.keys():
            ENABLE_PARALLEL = config['ENABLE_PARALLEL'].lower()[0] in [
                't', '1'
            ]
        if 'VERBOSE_USER_OUTPUT' in config.keys():
            VEROSE_USER_OUTPUT = config['VERBOSE_USER_OUTPUT'].lower()[0] in [
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair’s trademarks, including but not limited to "PBS™",
# "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
# trademark licensing policies.
#

import pbs
import os

# The event this hook was invoked for, with the vnode and AOE it carries.
e = pbs.event()
vnode = e.vnode
aoe = e.aoe

# Log the hook's environment and the provisioning request.
pbs.logmsg(pbs.LOG_DEBUG, "PROVISIONING: Env = %s" % repr(os.environ))
pbs.logmsg(pbs.LOG_DEBUG, "PROVISIONING: PBS Node = %s" % vnode)
pbs.logmsg(pbs.LOG_DEBUG, "PROVISIONING: AOE = %s" % aoe)

# Provision hook will run on PBS Server but provisioning is started from Admin node, both may not run on same node.
# Check for admin node? Read from json config file.
if 'PBS_HOOK_CONFIG_FILE' in os.environ:
    import json
    config_file = os.environ["PBS_HOOK_CONFIG_FILE"]
    #pbs.logmsg(pbs.EVENT_DEBUG, "%s: Config file is %s" % (caller_name(), config_file))
    # decode_dict is passed as the JSON object hook; it is not defined in
    # this part of the file.
    config = json.load(open(config_file, 'r'), object_hook=decode_dict)

server = pbs.server().name
# NOTE(review): config is only bound inside the if above; unless it is also
# set elsewhere, this raises NameError when PBS_HOOK_CONFIG_FILE is unset.
admin = config['admin-node']

pbs.logmsg(pbs.LOG_DEBUG, "PROVISIONING: server name = %s" % server)
import pbs
# Reject the event unless the requestor is one of the listed admin users.
e = pbs.event()
j = e.job
who = e.requestor
pbs.logmsg(pbs.LOG_DEBUG, "requestor=%s" % (who,))
# Requestors that are allowed through.
admin_ulist = ["PBS_Server", "Scheduler", "pbs_mom", "root"]
if who not in admin_ulist:
	e.reject("Normal users are not allowed to modify their jobs")
Пример #56
0
def error(msg):
    """
    Log msg at pbs.EVENT_ERROR, prefixed with "LA error: ".
    """
    pbs.logmsg(pbs.EVENT_ERROR, 'LA error: %s' % msg)
Пример #57
0
                   max_jobs_analyze_limit)
    if 'max_concurrent_nodes' in config:
        max_concurrent_nodes = int(config['max_concurrent_nodes'])
        if not max_concurrent_nodes or max_concurrent_nodes < 0:
            max_concurrent_nodes = 10
        pbs.logmsg(pbs.EVENT_DEBUG3, "max_concurrent_nodes is set to %d" %
                   max_concurrent_nodes)


# Accept if event not serviceable.
this_event = pbs.event()
if this_event.type not in [pbs.EXECJOB_PROLOGUE, pbs.EXECJOB_EPILOGUE,
                           pbs.EXECJOB_BEGIN, pbs.EXECJOB_END,
                           pbs.EXECHOST_STARTUP, pbs.EXECHOST_PERIODIC,
                           pbs.PERIODIC]:
    pbs.logmsg(pbs.LOG_WARNING,
               "Event not serviceable for power provisioning.")
    this_event.accept()


if this_event.type == pbs.PERIODIC:
    vnlist = this_event.vnode_list
    resvlist = this_event.resv_list
    time_now = time.time()

    # Parse the config file for power attributes
    try:
        parse_config_file()
    except Exception as e:
        this_event.reject(str(e))

    if power_ramp_rate_enable == 0 and power_on_off_enable == 0:
'''
This hook records the current EC2 instance type in resources_used.instance_type_used so that it appears in the accounting logs

create hook soca_aws_infos event=execjob_begin
import hook soca_aws_infos application/x-python default /apps/soca/<CLUSTER_ID>/cluster_hooks/execjob_begin/soca_aws_infos.py
'''

import re
import socket

import pbs
import urllib2

pbs.logmsg(pbs.LOG_DEBUG, 'soca_aws_infos: start')
instance_type = urllib2.urlopen(
    "http://169.254.169.254/latest/meta-data/instance-type").read()
instance_type = instance_type.replace('.', '_')
pbs.logmsg(pbs.LOG_DEBUG,
           'soca_aws_infos: detected instance: ' + str(instance_type))
e = pbs.event()
j = e.job
host = (socket.gethostname()).split('.')[0]
regex_vnode = r'\(.*?\)'
exec_vnode = str(j.exec_vnode)
vnode_list = re.findall('\(.*?\)', exec_vnode)
if host in vnode_list[0]:
    pbs.logmsg(
        pbs.LOG_DEBUG,
        'soca_aws_infos: detected host, about to specify new resource used')
    try:
        j.resources_used["instance_type_used"] = str(instance_type)
Пример #59
0
def vnodes_enabled(job):
    """
    Tell whether power operations are allowed on every vnode of the job.

    Returns False, after logging the vnode name in the job log, at the
    first vnode whose power_provisioning attribute is false; True when no
    such vnode is found.
    """
    for vnode_name in _get_vnode_names(job):
        if _svr_vnode(vnode_name).power_provisioning:
            continue
        pbs.logjobmsg(job.id,
                      "power functionality is disabled on vnode %s" % vnode_name)
        return False
    return True


# Accept if event not serviceable.
this_event = pbs.event()
if this_event.type not in [pbs.EXECJOB_PROLOGUE, pbs.EXECJOB_EPILOGUE,
                           pbs.EXECJOB_BEGIN, pbs.EXECJOB_END,
                           pbs.EXECHOST_STARTUP, pbs.EXECHOST_PERIODIC]:
    pbs.logmsg(pbs.LOG_WARNING,
               "Event not serviceable for power provisioning.")
    this_event.accept()


# Set eoe values for my node
if this_event.type == pbs.EXECHOST_STARTUP:
    from pbs.v1._pmi_utils import _is_node_provisionable

    # Don't connect if the server or sched is running.
    if not _is_node_provisionable():
        pbs.logmsg(pbs.LOG_DEBUG,
                   "Provisioning cannot be enabled on this host")
        this_event.accept()
    power = init_power(this_event)
    profiles = power.query(pbs.Power.QUERY_PROFILE)
    if profiles is not None: