Exemplo n.º 1
0
def arc_get_output(jid, directory):
    """Retrieve the output files of an ARC CE job.

    The job's remote directory is listed with ``arcls``; every non-empty
    line of that listing is turned into a ``<jid>/<line>`` URI and the URIs
    are downloaded through a ``GridftpSandboxCache``.

    Args:
        jid: ARC job identifier, also used as the base URI of the files.
        directory: local destination directory handed to the cache.

    Returns:
        ``False`` when the UI check fails or ``arcls`` reports a non-zero
        return code; otherwise the result of the cache's ``download`` call.
    """

    # Return a plain False: the former ``(False, None)`` tuple is truthy, so
    # a caller testing the result would have read this failure as success,
    # and it disagreed with the ``return False`` used further down.
    if not __cream_ui_check__():
        return False

    # construct URI list from ID and output from arcls
    cmd = 'arcls %s %s' % (__arc_get_config_file_arg__(), jid)
    logger.debug('arcls command: %s' % cmd)
    rc, output, m = getShell().cmd1(
        '%s%s' % (__get_cmd_prefix_hack__(binary=True), cmd),
        allowed_exit=[0, 255],
        timeout=config['SubmissionTimeout'])
    if rc:
        logger.error(
            "Could not find directory associated with ARC job ID '%s'" % jid)
        return False

    # URI is JID + filename
    gfiles = []
    for fname in output.split("\n"):
        if not fname:
            continue
        gf = GridftpFileIndex()
        gf.id = jid + "/" + fname
        gfiles.append(gf)

    cache = GridftpSandboxCache()
    cache.vo = config['VirtualOrganisation']
    cache.uploaded_files = gfiles
    # hand over a real list: on Python 3 ``map`` yields a one-shot iterator
    return cache.download(files=[gf.id for gf in gfiles],
                          dest_dir=directory)
Exemplo n.º 2
0
def arc_get_output(jid, directory, cred_req):
    """Fetch the output files of an ARC CE job into a local directory.

    Runs ``arcls`` on the job ID, turns every non-empty output line into a
    ``<jid>/<line>`` URI and downloads those URIs through a
    ``GridftpSandboxCache`` using the given credential requirement.

    Returns ``False`` when ``arcls`` reports a non-zero return code,
    otherwise the result of the cache's ``download`` call.
    """

    # list the remote directory that belongs to the job
    listing_cmd = 'arcls %s %s' % (__arc_get_config_file_arg__(), jid)
    logger.debug('arcls command: %s' % listing_cmd)

    shell = getShell(cred_req)
    rc, output, _msg = shell.cmd1(listing_cmd,
                                  allowed_exit=[0, 255],
                                  timeout=config['SubmissionTimeout'])
    if rc:
        logger.error(
            "Could not find directory associated with ARC job ID '%s'" % jid)
        return False

    def _index_for(entry):
        # each remote file is addressed as job ID + "/" + listed name
        remote = jid + "/" + entry
        index = GridftpFileIndex()
        index.id = remote
        return index

    gfiles = [_index_for(line)
              for line in output.split("\n") if len(line) != 0]

    sandbox = GridftpSandboxCache()
    sandbox.uploaded_files = gfiles
    remote_ids = map(lambda item: item.id, gfiles)
    return sandbox.download(cred_req=cred_req,
                            files=remote_ids,
                            dest_dir=directory)
Exemplo n.º 3
0
def arc_get_output(jid, directory, cred_req):
    """Download everything ``arcls`` lists for an ARC CE job.

    Each non-empty line printed by ``arcls`` is treated as a file name
    below the job ID; the resulting URIs are passed to a
    ``GridftpSandboxCache`` for download into ``directory``.

    Returns ``False`` if the ``arcls`` call yields a non-zero return code,
    else whatever the cache's ``download`` returns.
    """

    # ask arcls for the content of the job directory
    arcls_call = 'arcls %s %s' % (__arc_get_config_file_arg__(), jid)
    logger.debug('arcls command: %s' % arcls_call)

    shell_kwargs = {'allowed_exit': [0, 255],
                    'timeout': config['SubmissionTimeout']}
    rc, output, _ignored = getShell(cred_req).cmd1(arcls_call, **shell_kwargs)
    if rc:
        logger.error(
            "Could not find directory associated with ARC job ID '%s'" % jid)
        return False

    # one file index per listed name; the URI is job ID + "/" + name
    gfiles = []
    for listed_name in output.split("\n"):
        if listed_name:
            file_uri = jid + "/" + listed_name
            file_index = GridftpFileIndex()
            file_index.id = file_uri
            gfiles.append(file_index)

    grid_cache = GridftpSandboxCache()
    grid_cache.uploaded_files = gfiles
    return grid_cache.download(
        cred_req=cred_req,
        files=map(lambda fi: fi.id, gfiles),
        dest_dir=directory)
Exemplo n.º 4
0
    def __init__(self):
        """Initialise the backend and load its configurable helper objects.

        The requirements object and the sandbox cache are both looked up by
        the dotted path stored in ``config``. If a lookup fails, the value
        that was in place before the attempt is kept.
        """
        super(ARC, self).__init__()

        # dynamic requirement object loading
        try:
            reqPath = config['Requirements']
            reqName = reqPath.split('.').pop()
            reqModule = __import__(reqPath, globals(), locals(), [reqPath])
            reqClass = vars(reqModule)[reqName]
            self.requirements = reqClass()

            logger.debug('load %s as LCGRequirements' % reqName)
        except Exception:
            # best effort only; ``Exception`` instead of a bare ``except``
            # lets KeyboardInterrupt and SystemExit propagate
            logger.debug('load default LCGRequirements')

        # dynamic sandbox cache object loading
        # force to use GridftpSandboxCache
        self.sandboxcache = GridftpSandboxCache()
        try:
            scPath = config['SandboxCache']
            scName = scPath.split('.').pop()
            scModule = __import__(scPath, globals(), locals(), [scPath])
            scClass = vars(scModule)[scName]
            self.sandboxcache = scClass()
            logger.debug('load %s as SandboxCache' % scName)
        except Exception:
            # keep the GridftpSandboxCache assigned above
            logger.debug('load default SandboxCache')
Exemplo n.º 5
0
def arc_get_output(jid, directory):
    """Retrieve the output files of an ARC CE job.

    The job's remote directory is listed with ``arcls``; every non-empty
    line of that listing is turned into a ``<jid>/<line>`` URI and the URIs
    are downloaded through a ``GridftpSandboxCache``.

    Args:
        jid: ARC job identifier, also used as the base URI of the files.
        directory: local destination directory handed to the cache.

    Returns:
        ``False`` when the UI check fails or ``arcls`` reports a non-zero
        return code; otherwise the result of the cache's ``download`` call.
    """

    # Return a plain False: the former ``(False, None)`` tuple is truthy, so
    # a caller testing the result would have read this failure as success,
    # and it disagreed with the ``return False`` used further down.
    if not __cream_ui_check__():
        return False

    # construct URI list from ID and output from arcls
    cmd = "arcls %s %s" % (__arc_get_config_file_arg__(), jid)
    logger.debug("arcls command: %s" % cmd)
    rc, output, m = getShell().cmd1(
        "%s%s" % (__get_cmd_prefix_hack__(binary=True), cmd),
        allowed_exit=[0, 255],
        timeout=config["SubmissionTimeout"],
    )
    if rc:
        logger.error("Could not find directory associated with ARC job ID '%s'" % jid)
        return False

    # URI is JID + filename
    gfiles = []
    for fname in output.split("\n"):
        if not fname:
            continue
        gf = GridftpFileIndex()
        gf.id = jid + "/" + fname
        gfiles.append(gf)

    cache = GridftpSandboxCache()
    cache.vo = config["VirtualOrganisation"]
    cache.uploaded_files = gfiles
    # hand over a real list: on Python 3 ``map`` yields a one-shot iterator
    return cache.download(files=[gf.id for gf in gfiles], dest_dir=directory)
Exemplo n.º 6
0
def cream_get_output(osb_uri_list, directory, cred_req):
    """Download the output sandbox files of a CREAM CE job.

    Every URI in ``osb_uri_list`` is wrapped in a ``GridftpFileIndex`` and
    the whole set is fetched into ``directory`` by a ``GridftpSandboxCache``.
    The return value is whatever the cache's ``download`` call returns.
    """

    def _wrap(remote_uri):
        # the file index only needs its id to point at the remote file
        index = GridftpFileIndex()
        index.id = remote_uri
        return index

    gfiles = [_wrap(remote_uri) for remote_uri in osb_uri_list]

    sandbox = GridftpSandboxCache()
    sandbox.uploaded_files = gfiles

    wanted = map(lambda index: index.id, gfiles)
    return sandbox.download(cred_req=cred_req, files=wanted, dest_dir=directory)
Exemplo n.º 7
0
def cream_get_output(osb_uri_list, directory, cred_req):
    """Fetch a CREAM CE job's output sandbox into a local directory.

    Builds one ``GridftpFileIndex`` per output sandbox URI, registers them
    on a fresh ``GridftpSandboxCache`` and returns the result of that
    cache's ``download`` call.
    """

    # one file index per output sandbox URI
    gfiles = []
    for osb_uri in osb_uri_list:
        file_index = GridftpFileIndex()
        file_index.id = osb_uri
        gfiles += [file_index]

    grid_cache = GridftpSandboxCache()
    grid_cache.uploaded_files = gfiles

    download_args = {
        'cred_req': cred_req,
        'files': map(lambda fi: fi.id, gfiles),
        'dest_dir': directory,
    }
    return grid_cache.download(**download_args)
Exemplo n.º 8
0
    def __init__(self):
        """Initialise the backend and load its configurable helper objects.

        The requirements object and the sandbox cache are both looked up by
        the dotted path stored in ``config``. If a lookup fails, the value
        that was in place before the attempt is kept.
        """
        super(CREAM, self).__init__()

        # dynamic requirement object loading
        try:
            reqPath = config['Requirements']
            reqName = reqPath.split('.').pop()
            reqModule = __import__(reqPath, globals(), locals(), [reqPath])
            reqClass = vars(reqModule)[reqName]
            self.requirements = reqClass()

            logger.debug('load %s as LCGRequirements' % reqName)
        except Exception:
            # best effort only; ``Exception`` instead of a bare ``except``
            # lets KeyboardInterrupt and SystemExit propagate
            logger.debug('load default LCGRequirements')

        # dynamic sandbox cache object loading
        # force to use GridftpSandboxCache
        self.sandboxcache = GridftpSandboxCache()
        try:
            scPath = config['SandboxCache']
            scName = scPath.split('.').pop()
            scModule = __import__(scPath, globals(), locals(), [scPath])
            scClass = vars(scModule)[scName]
            self.sandboxcache = scClass()
            logger.debug('load %s as SandboxCache' % scName)
        except Exception:
            # keep the GridftpSandboxCache assigned above
            logger.debug('load default SandboxCache')
Exemplo n.º 9
0
def cream_get_output(osbURIList, directory):
    """Retrieve the output sandbox files of a CREAM CE job.

    Args:
        osbURIList: iterable of output sandbox URIs to download.
        directory: local destination directory handed to the cache.

    Returns:
        ``(False, None)`` when the UI check fails; otherwise the result of
        ``GridftpSandboxCache.download``.
    """

    # NOTE(review): this failure value is a non-empty tuple and therefore
    # truthy -- a caller doing ``if cream_get_output(...)`` would treat it
    # as success. Kept unchanged here; confirm against the callers.
    if not __cream_ui_check__():
        return False, None

    gfiles = []
    for uri in osbURIList:
        gf = GridftpFileIndex()
        gf.id = uri
        gfiles.append(gf)

    cache = GridftpSandboxCache()
    cache.vo = config["VirtualOrganisation"]
    cache.uploaded_files = gfiles

    # hand over a real list: on Python 3 ``map`` yields a one-shot iterator
    return cache.download(files=[gf.id for gf in gfiles], dest_dir=directory)
Exemplo n.º 10
0
def cream_get_output(osbURIList, directory):
    """Retrieve the output sandbox files of a CREAM CE job.

    Args:
        osbURIList: iterable of output sandbox URIs to download.
        directory: local destination directory handed to the cache.

    Returns:
        ``(False, None)`` when the UI check fails; otherwise the result of
        ``GridftpSandboxCache.download``.
    """

    # NOTE(review): this failure value is a non-empty tuple and therefore
    # truthy -- a caller doing ``if cream_get_output(...)`` would treat it
    # as success. Kept unchanged here; confirm against the callers.
    if not __cream_ui_check__():
        return False, None

    gfiles = []
    for uri in osbURIList:
        gf = GridftpFileIndex()
        gf.id = uri
        gfiles.append(gf)

    cache = GridftpSandboxCache()
    cache.vo = config['VirtualOrganisation']
    cache.uploaded_files = gfiles

    # hand over a real list: on Python 3 ``map`` yields a one-shot iterator
    return cache.download(files=[gf.id for gf in gfiles],
                          dest_dir=directory)
Exemplo n.º 11
0
    def cream_get_output(self, osbURIList, directory):
        '''Retrieve the output sandbox files of a CREAM CE job.

        Args:
            osbURIList: iterable of output sandbox URIs to download.
            directory: local destination directory handed to the cache.

        Returns:
            ``(False, None)`` when the UI check fails; otherwise the result
            of ``GridftpSandboxCache.download``.
        '''

        # NOTE(review): this failure value is a non-empty tuple and
        # therefore truthy -- a caller doing ``if ...cream_get_output(...)``
        # would treat it as success. Kept unchanged; confirm with callers.
        if not self.__cream_ui_check__():
            return (False, None)

        gfiles = []
        for uri in osbURIList:
            gf = GridftpFileIndex()
            gf.id = uri
            gfiles.append(gf)

        cache = GridftpSandboxCache()
        cache.middleware = 'GLITE'
        cache.vo = self.config['VirtualOrganisation']
        cache.uploaded_files = gfiles

        # hand over a real list: on Python 3 ``map`` yields a one-shot iterator
        return cache.download(files=[gf.id for gf in gfiles], dest_dir=directory)
Exemplo n.º 12
0
class CREAM(IBackend):

    '''CREAM backend - direct job submission to gLite CREAM CE'''
    # Declaration of the backend's attributes (schema version 1.0).
    # 'CE', 'jobtype', 'requirements' and 'sandboxcache' carry no
    # ``protected`` flag; every item from 'id' downwards is declared with
    # protected=1, copyable=0 -- presumably these are filled in by the
    # backend/middleware rather than by the user (TODO confirm against the
    # Schema documentation).
    _schema = Schema(Version(1, 0), {
        'CE': SimpleItem(defvalue='', doc='CREAM CE endpoint'),
        'jobtype': SimpleItem(defvalue='Normal', doc='Job type: Normal, MPICH'),
        'requirements': ComponentItem('LCGRequirements', doc='Requirements for the resource selection'),
        'sandboxcache': ComponentItem('GridSandboxCache', copyable=1, doc='Interface for handling oversized input sandbox'),
        'id': SimpleItem(defvalue='', typelist=['str', 'list'], protected=1, copyable=0, doc='Middleware job identifier'),
        'status': SimpleItem(defvalue='', typelist=['str', 'dict'], protected=1, copyable=0, doc='Middleware job status'),
        'exitcode': SimpleItem(defvalue='', protected=1, copyable=0, doc='Application exit code'),
        'exitcode_cream': SimpleItem(defvalue='', protected=1, copyable=0, doc='Middleware exit code'),
        'actualCE': SimpleItem(defvalue='', protected=1, copyable=0, doc='The CREAM CE where the job actually runs.'),
        'reason': SimpleItem(defvalue='', protected=1, copyable=0, doc='Reason of causing the job status'),
        'workernode': SimpleItem(defvalue='', protected=1, copyable=0, doc='The worker node on which the job actually runs.'),
        'isbURI': SimpleItem(defvalue='', protected=1, copyable=0, doc='The input sandbox URI on CREAM CE'),
        'osbURI': SimpleItem(defvalue='', protected=1, copyable=0, doc='The output sandbox URI on CREAM CE')
    })

    # category string under which this class is declared
    _category = 'backends'

    # name of this backend class as declared to the framework
    _name = 'CREAM'

    def __init__(self):
        """Initialise the backend and load its configurable helper objects.

        The requirements object and the sandbox cache are both looked up by
        the dotted path stored in ``config``. If a lookup fails, the value
        that was in place before the attempt is kept.
        """
        super(CREAM, self).__init__()

        # dynamic requirement object loading
        try:
            reqPath = config['Requirements']
            reqName = reqPath.split('.').pop()
            reqModule = __import__(reqPath, globals(), locals(), [reqPath])
            reqClass = vars(reqModule)[reqName]
            self.requirements = reqClass()

            logger.debug('load %s as LCGRequirements' % reqName)
        except Exception:
            # best effort only; ``Exception`` instead of a bare ``except``
            # lets KeyboardInterrupt and SystemExit propagate
            logger.debug('load default LCGRequirements')

        # dynamic sandbox cache object loading
        # force to use GridftpSandboxCache
        self.sandboxcache = GridftpSandboxCache()
        try:
            scPath = config['SandboxCache']
            scName = scPath.split('.').pop()
            scModule = __import__(scPath, globals(), locals(), [scPath])
            scClass = vars(scModule)[scName]
            self.sandboxcache = scClass()
            logger.debug('load %s as SandboxCache' % scName)
        except Exception:
            # keep the GridftpSandboxCache assigned above
            logger.debug('load default SandboxCache')

    def __refresh_jobinfo__(self, job):
        '''Reset the middleware job information stored on the job's backend.

        Every listed backend attribute is set to the empty string. It will
        be called after resubmission.
        '''
        stale_fields = (
            'status',
            'reason',
            'actualCE',
            'exitcode',
            'exitcode_cream',
            'workernode',
            'isbURI',
            'osbURI',
        )
        for field_name in stale_fields:
            setattr(job.backend, field_name, '')

    def __setup_sandboxcache__(self, job):
        '''Configure the sandbox cache object from the runtime configuration.

        The VO, middleware and transfer timeout are always set. Further
        settings depend on the cache class name (``_name``):

        * LCGSandboxCache: fill in a missing LFC host, SE and SRM token.
        * DQ2SandboxCache: make sure a dataset name exists and copy it to
          the sandbox caches of all subjobs of ``job``.
        * GridftpSandboxCache: derive the base URI from the configuration or
          from the CE endpoint.

        Returns False when no base URI can be determined for a
        GridftpSandboxCache, True in every other case.
        '''

        cache = self.sandboxcache

        cache.vo = config['VirtualOrganisation']
        cache.middleware = 'GLITE'
        cache.timeout = config['SandboxTransferTimeout']

        cache_kind = cache._name

        if cache_kind == 'LCGSandboxCache':
            if not cache.lfc_host:
                cache.lfc_host = grids[cache.middleware].__get_lfc_host__()

            if not cache.se:
                # the default SE may be given as "token:<token>:<host>"
                se_host = config['DefaultSE']
                token = ''
                matched = re.match('^token:(.*):(.*)$', se_host)
                if matched:
                    token, se_host = matched.group(1), matched.group(2)

                cache.se = se_host

                if token:
                    cache.srm_token = token

            if cache.se_type in ['srmv2']:
                if not cache.srm_token:
                    cache.srm_token = config['DefaultSRMToken']

        elif cache_kind == 'DQ2SandboxCache':

            # generate a new dataset name if not given
            if not cache.dataset_name:
                from GangaAtlas.Lib.ATLASDataset.DQ2Dataset import dq2outputdatasetname
                new_name, _unused = dq2outputdatasetname(
                    "%s.input" % get_uuid(), 0, False, '')
                cache.dataset_name = new_name

            # subjobs inherit the dataset name from the master job
            for subjob in job.subjobs:
                subjob.backend.sandboxcache.dataset_name = cache.dataset_name

        elif cache_kind == 'GridftpSandboxCache':
            configured_uri = config['CreamInputSandboxBaseURI']
            if configured_uri:
                cache.baseURI = configured_uri
            elif self.CE:
                # drop the "/cream..." suffix and any ":<port>" from the CE
                endpoint = self.CE.split('/cream')[0]
                ce_host = re.sub(r'\:[0-9]+', '', endpoint)
                cache.baseURI = 'gsiftp://%s/opt/glite/var/cream_sandbox/%s' % (
                    ce_host, cache.vo)
            else:
                logger.error('baseURI not available for GridftpSandboxCache')
                return False

        return True

    def __check_and_prestage_inputfile__(self, file):
        '''Check the size of an input file and pre-stage it if it is too big.

        ``file`` is the path of a local file. When its size exceeds the
        configured "BoundSandboxLimit" the file is uploaded through the
        sandbox cache, unless a file with the same md5sum is already among
        the cached files of this job or of its master job; in that case the
        existing remote entry is reused.

        The returned dictionary describes how to refer to the file:

            idx = {'lfc_host': lfc_host,
                   'local': [the local file paths],
                   'remote': {'fname1': 'remote index1', 'fname2': 'remote index2', ... }
                  }

        None is returned if the upload failed.
        '''

        job = self.getJobObject()

        # files that were uploaded earlier: master job first, then this job
        known_uploads = []
        if job.master:
            known_uploads.extend(
                job.master.backend.sandboxcache.get_cached_files())

        # set and get the $LFC_HOST for uploading oversized sandbox
        self.__setup_sandboxcache__(job)

        known_uploads.extend(self.sandboxcache.get_cached_files())

        # for LCGSandboxCache, take the host specified in the sandboxcache
        # object; otherwise (or if it is empty) query it from the Grid object
        lfc_host = None
        if self.sandboxcache._name == 'LCGSandboxCache':
            lfc_host = self.sandboxcache.lfc_host
        if not lfc_host:
            grid = grids[self.sandboxcache.middleware.upper()]
            lfc_host = grid.__get_lfc_host__()

        idx = {'lfc_host': lfc_host, 'local': [], 'remote': {}}

        abspath = os.path.abspath(file)
        fsize = os.path.getsize(abspath)

        # small enough: the file travels with the normal input sandbox
        if not fsize > config['BoundSandboxLimit']:
            idx['local'].append(abspath)
            return idx

        md5sum = get_md5sum(abspath, ignoreGzipTimestamp=True)

        for cached in known_uploads:
            if cached.md5sum == md5sum:
                # the same file has been uploaded to the iocache
                idx['remote'][os.path.basename(file)] = cached.id
                break
        else:
            logger.warning(
                'The size of %s is larger than the sandbox limit (%d byte). Please wait while pre-staging ...' % (file, config['BoundSandboxLimit']))

            if not self.sandboxcache.upload([abspath]):
                logger.error(
                    'Oversized sandbox not successfully pre-staged')
                return None

            # the freshly uploaded file is the last cached entry
            remote_sandbox = self.sandboxcache.get_cached_files()[-1]
            idx['remote'][remote_sandbox.name] = remote_sandbox.id

        return idx

    def __mt_job_prepare__(self, rjobs, subjobconfigs, masterjobconfig):
        '''Prepare the given subjobs using multiple threads.

        The master job is prepared first and each file of its input sandbox
        is checked/pre-staged. Afterwards every (subjob config, subjob) pair
        is prepared by a pool of worker threads.

        Returns the runner results (subjob id -> JDL file path) when all
        subjobs were prepared, otherwise None.
        '''

        logger.warning(
            'preparing %d subjobs ... it may take a while' % len(rjobs))

        # prepare the master job (i.e. create shared inputsandbox, etc.)
        master_input_sandbox = IBackend.master_prepare(self, masterjobconfig)

        # uploading the master job if it's over the WMS sandbox limitation
        for sandbox_file in master_input_sandbox:
            if not self.__check_and_prestage_inputfile__(sandbox_file):
                logger.error(
                    'master input sandbox perparation failed: %s' % sandbox_file)
                return None

        # the algorithm for preparing a single bulk job
        class MyAlgorithm(Algorithm):

            def __init__(self):
                Algorithm.__init__(self)

            def process(self, sj_info):
                subjob_config = sj_info[0]
                subjob = sj_info[1]

                try:
                    logger.debug("preparing job %s" % subjob.getFQID('.'))
                    jdlpath = subjob.backend.preparejob(
                        subjob_config, master_input_sandbox)

                    if not (jdlpath and os.path.exists(jdlpath)):
                        raise GangaException(
                            'job %s not properly prepared' % subjob.getFQID('.'))

                    self.__appendResult__(subjob.id, jdlpath)
                    return True
                except Exception:
                    log_user_exception()
                    return False

        # one work item per (subjob config, subjob) pair
        mt_data = [[sc, sj] for sc, sj in zip(subjobconfigs, rjobs)]

        runner = MTRunner(
            name='lcg_jprepare',
            algorithm=MyAlgorithm(),
            data=Data(collection=mt_data),
            numThread=10)
        runner.start()
        runner.join(-1)

        if len(runner.getDoneList()) < len(mt_data):
            return None

        # a JDL file dictionary with subjob ids as keys, JDL file paths as
        # values
        return runner.getResults()

    def __mt_bulk_submit__(self, node_jdls):
        '''Submit the given subjob JDLs using multiple threads.

        ``node_jdls`` maps subjob ids to JDL files. Each JDL is submitted to
        the configured CE through the 'GLITE' grid object.

        Returns the runner results (subjob id -> middleware job id) when
        every submission succeeded. Otherwise the jobs submitted so far are
        cancelled and None is returned.
        '''

        job = self.getJobObject()

        logger.warning(
            'submitting %d subjobs ... it may take a while' % len(node_jdls))

        # the algorithm for submitting a single bulk job
        class MyAlgorithm(Algorithm):

            def __init__(self, gridObj, masterInputWorkspace, ce):
                Algorithm.__init__(self)
                self.inpw = masterInputWorkspace
                self.gridObj = gridObj
                self.ce = ce

            def process(self, jdl_info):
                subjob_id = jdl_info[0]
                subjob_jdl = jdl_info[1]

                grid_job_id = self.gridObj.cream_submit(subjob_jdl, self.ce)

                if grid_job_id:
                    self.__appendResult__(subjob_id, grid_job_id)
                    return True
                return False

        # one (subjob id, JDL) work item per entry
        mt_data = [(sj_id, sj_jdl) for sj_id, sj_jdl in node_jdls.items()]

        submit_alg = MyAlgorithm(
            gridObj=grids['GLITE'],
            masterInputWorkspace=job.getInputWorkspace(),
            ce=self.CE)
        work_items = Data(collection=mt_data)

        runner = MTRunner(name='cream_jsubmit', algorithm=submit_alg,
                          data=work_items, numThread=config['SubmissionThread'])
        runner.start()
        runner.join(timeout=-1)

        if len(runner.getDoneList()) >= len(mt_data):
            return runner.getResults()

        # not all bulk jobs are successfully submitted. canceling the
        # submitted jobs on WMS immediately
        logger.error(
            'some bulk jobs not successfully (re)submitted, canceling submitted jobs on WMS')
        grids['GLITE'].cancelMultiple(runner.getResults().values())
        return None

    def __jobWrapperTemplate__(self):
        '''Create job wrapper'''

        script = """#!/usr/bin/env python
#-----------------------------------------------------
# This job wrapper script is automatically created by
# GANGA LCG backend handler.
#
# It controls:
# 1. unpack input sandbox
# 2. invoke application executable
# 3. invoke monitoring client
#-----------------------------------------------------
import os,os.path,shutil,tempfile
import sys,popen2,time,traceback

#bugfix #36178: subprocess.py crashes if python 2.5 is used
#try to import subprocess from local python installation before an
#import from PYTHON_DIR is attempted some time later
try:
    import subprocess
except ImportError:
    pass

## Utility functions ##
def timeString():
    return time.strftime('%a %b %d %H:%M:%S %Y',time.gmtime(time.time()))

def printInfo(s):
    out.write(timeString() + '  [Info]' +  ' ' + str(s) + os.linesep)
    out.flush()

def printError(s):
    out.write(timeString() + ' [Error]' +  ' ' + str(s) + os.linesep)
    out.flush()

def lcg_file_download(vo,guid,localFilePath,timeout=60,maxRetry=3):
    cmd = 'lcg-cp -t %d --vo %s %s file://%s' % (timeout,vo,guid,localFilePath)

    printInfo('LFC_HOST set to %s' % os.environ['LFC_HOST'])
    printInfo('lcg-cp timeout: %d' % timeout)

    i         = 0
    rc        = 0
    isDone    = False
    try_again = True

    while try_again:
        i = i + 1
        try:
            ps = os.popen(cmd)
            status = ps.close()

            if not status:
                isDone = True
                printInfo('File %s download from iocache' % os.path.basename(localFilePath))
            else:
                raise IOError("Download file %s from iocache failed with error code: %d, trial %d." % (os.path.basename(localFilePath), status, i))

        except IOError as e:
            isDone = False
            printError(str(e))

        if isDone:
            try_again = False
        elif i == maxRetry:
            try_again = False
        else:
            try_again = True

    return isDone

## system command executor with subprocess
def execSyscmdSubprocess(cmd, wdir=os.getcwd()):

    import os, subprocess

    global exitcode

    outfile   = file('stdout','w')
    errorfile = file('stderr','w')

    try:
        child = subprocess.Popen(cmd, cwd=wdir, shell=True, stdout=outfile, stderr=errorfile)

        while 1:
            exitcode = child.poll()
            if exitcode is not None:
                break
            else:
                outfile.flush()
                errorfile.flush()
                time.sleep(0.3)
    finally:
        pass

    outfile.flush()
    errorfile.flush()
    outfile.close()
    errorfile.close()

    return True

## system command executor with multi-thread
## stderr/stdout handler
def execSyscmdEnhanced(cmd, wdir=os.getcwd()):

    import os, threading

    cwd = os.getcwd()

    isDone = False

    try:
        ## change to the working directory
        os.chdir(wdir)

        child = popen2.Popen3(cmd,1)
        child.tochild.close() # don't need stdin

        class PipeThread(threading.Thread):

            def __init__(self,infile,outfile,stopcb):
                self.outfile = outfile
                self.infile = infile
                self.stopcb = stopcb
                self.finished = 0
                threading.Thread.__init__(self)

            def run(self):
                stop = False
                while not stop:
                    buf = self.infile.read(10000)
                    self.outfile.write(buf)
                    self.outfile.flush()
                    time.sleep(0.01)
                    stop = self.stopcb()
                #FIXME: should we do here?: self.infile.read()
                #FIXME: this is to make sure that all the output is read (if more than buffer size of output was produced)
                self.finished = 1

        def stopcb(poll=False):
            global exitcode
            if poll:
                exitcode = child.poll()
            return exitcode != -1

        out_thread = PipeThread(child.fromchild, sys.stdout, stopcb)
        err_thread = PipeThread(child.childerr, sys.stderr, stopcb)

        out_thread.start()
        err_thread.start()
        while not out_thread.finished and not err_thread.finished:
            stopcb(True)
            time.sleep(0.3)

        sys.stdout.flush()
        sys.stderr.flush()

        isDone = True

    except(Exception,e):
        isDone = False

    ## return to the original directory
    os.chdir(cwd)

    return isDone

############################################################################################

###INLINEMODULES###

############################################################################################

## Main program ##

outputsandbox = ###OUTPUTSANDBOX###
input_sandbox = ###INPUTSANDBOX###
wrapperlog = ###WRAPPERLOG###
appexec = ###APPLICATIONEXEC###
appargs = ###APPLICATIONARGS###
appenvs = ###APPLICATIONENVS###
timeout = ###TRANSFERTIMEOUT###

exitcode=-1

import sys, stat, os, os.path, commands

# Change to scratch directory if provided
scratchdir = ''
tmpdir = ''

orig_wdir = os.getcwd()

# prepare log file for job wrapper
out = open(os.path.join(orig_wdir, wrapperlog),'w')

if os.getenv('EDG_WL_SCRATCH'):
    scratchdir = os.getenv('EDG_WL_SCRATCH')
elif os.getenv('TMPDIR'):
    scratchdir = os.getenv('TMPDIR')

if scratchdir:
    (status, tmpdir) = commands.getstatusoutput('mktemp -d %s/gangajob_XXXXXXXX' % (scratchdir))
    if status == 0:
        os.chdir(tmpdir)
    else:
        ## if status != 0, tmpdir should contains error message so print it to stderr
        printError('Error making ganga job scratch dir: %s' % tmpdir)
        printInfo('Unable to create ganga job scratch dir in %s. Run directly in: %s' % ( scratchdir, os.getcwd() ) )

        ## reset scratchdir and tmpdir to disable the usage of Ganga scratch dir
        scratchdir = ''
        tmpdir = ''

wdir = os.getcwd()

if scratchdir:
    printInfo('Changed working directory to scratch directory %s' % tmpdir)
    try:
        os.system("ln -s %s %s" % (os.path.join(orig_wdir, 'stdout'), os.path.join(wdir, 'stdout')))
        os.system("ln -s %s %s" % (os.path.join(orig_wdir, 'stderr'), os.path.join(wdir, 'stderr')))
    except Exception as e:
        printError(sys.exc_info()[0])
        printError(sys.exc_info()[1])
        str_traceback = traceback.format_tb(sys.exc_info()[2])
        for str_tb in str_traceback:
            printError(str_tb)
        printInfo('Linking stdout & stderr to original directory failed. Looking at stdout during job run may not be possible')

os.environ['PATH'] = '.:'+os.environ['PATH']

vo = os.environ['GANGA_LCG_VO']

try:
    printInfo('Job Wrapper start.')

#   download inputsandbox from remote cache
    for f,guid in input_sandbox['remote'].iteritems():
        if not lcg_file_download(vo, guid, os.path.join(wdir,f), timeout=int(timeout)):
            raise IOError('Download remote input %s:%s failed.' % (guid,f) )
        else:
            getPackedInputSandbox(f)

    printInfo('Download inputsandbox from iocache passed.')

#   unpack inputsandbox from wdir
    for f in input_sandbox['local']:
        getPackedInputSandbox(os.path.join(orig_wdir,f))

    printInfo('Unpack inputsandbox passed.')

    #get input files
    ###DOWNLOADINPUTFILES###

    printInfo('Loading Python modules ...')

    sys.path.insert(0,os.path.join(wdir,PYTHON_DIR))

    # check the python library path
    try:
        printInfo(' ** PYTHON_DIR: %s' % os.environ['PYTHON_DIR'])
    except KeyError:
        pass

    try:
        printInfo(' ** PYTHONPATH: %s' % os.environ['PYTHONPATH'])
    except KeyError:
        pass

    for lib_path in sys.path:
        printInfo(' ** sys.path: %s' % lib_path)

#   execute application

    ## convern appenvs into environment setup script to be 'sourced' before executing the user executable

    printInfo('Prepare environment variables for application executable')

    env_setup_script = os.path.join(os.getcwd(), '__ganga_lcg_env__.sh')

    f = open( env_setup_script, 'w')
    f.write('#!/bin/sh' + os.linesep )
    f.write('##user application environmet setup script generated by Ganga job wrapper' + os.linesep)
    for k,v in appenvs.items():

        str_env = 'export %s="%s"' % (k, v)

        printInfo(' ** ' + str_env)
        
        f.write(str_env + os.linesep)
    f.close()

    try: #try to make shipped executable executable
        os.chmod('%s/%s'% (wdir,appexec),stat.S_IXUSR|stat.S_IRUSR|stat.S_IWUSR)
    except:
        pass

    status = False
    try:
        # use subprocess to run the user's application if the module is available on the worker node
        import subprocess
        printInfo('Load application executable with subprocess module')
        status = execSyscmdSubprocess('source %s; %s %s' % (env_setup_script, appexec, appargs), wdir)
    except ImportError as err:
        # otherwise, use separate threads to control process IO pipes
        printInfo('Load application executable with separate threads')
        status = execSyscmdEnhanced('source %s; %s %s' % (env_setup_script, appexec, appargs), wdir)

    os.system("cp %s/stdout stdout.1" % orig_wdir)
    os.system("cp %s/stderr stderr.1" % orig_wdir)

    printInfo('GZipping stdout and stderr...')

    os.system("gzip stdout.1 stderr.1")

    # move them to the original wdir so they can be picked up
    os.system("mv stdout.1.gz %s/stdout.gz" % orig_wdir)
    os.system("mv stderr.1.gz %s/stderr.gz" % orig_wdir)

    if not status:
        raise OSError('Application execution failed.')
    printInfo('Application execution passed with exit code %d.' % exitcode)      

    ###OUTPUTUPLOADSPOSTPROCESSING###

    for f in os.listdir(os.getcwd()):
        command = "cp %s %s" % (os.path.join(os.getcwd(),f), os.path.join(orig_wdir,f))
        os.system(command)            

    createPackedOutputSandbox(outputsandbox,None,orig_wdir)

#   pack outputsandbox
#    printInfo('== check output ==')
#    for line in os.popen('pwd; ls -l').readlines():
#        printInfo(line)

    printInfo('Pack outputsandbox passed.')

    # Clean up after us - All log files and packed outputsandbox should be in "wdir"
    if scratchdir:
        os.chdir(orig_wdir)
        os.system("rm %s -rf" % wdir)
except Exception as e:
    printError(sys.exc_info()[0])
    printError(sys.exc_info()[1])
    str_traceback = traceback.format_tb(sys.exc_info()[2])
    for str_tb in str_traceback:
        printError(str_tb)

printInfo('Job Wrapper stop.')

out.close()

# always return exit code 0 so the in the case of application failure
# one can always get stdout and stderr back to the UI for debug.
sys.exit(0)
"""
        return script

    def preparejob(self, jobconfig, master_job_sandbox):
        '''Prepare the job wrapper script and the JDL file of this job.

        The wrapper template returned by self.__jobWrapperTemplate__() is
        specialised by substituting its ###...### placeholders, the input
        sandbox is packed (and each file is passed through
        self.__check_and_prestage_inputfile__), and finally a JDL
        description is written into the job's input workspace.

        Arguments:
            jobconfig          -- job configuration; this method reads its
                                  outputbox, env and requirements attributes
                                  and calls getExeString(), getArguments()
                                  and getSandboxFiles() on it.
            master_job_sandbox -- list of sandbox files of the master job;
                                  appended to this job's own sandbox files.

        Returns:
            The value returned by inpw.writefile() for '__jdlfile__' (the
            caller in submit() treats it as the JDL path), or None when
            the input sandbox preparation fails or self.jobtype is neither
            'Normal' nor 'MPICH' (compared case-insensitively).
        '''

        script = self.__jobWrapperTemplate__()

        job = self.getJobObject()
        inpw = job.getInputWorkspace()

        # name of the wrapper's own log file; also shipped back in the output sandbox
        wrapperlog = '__jobscript__.log'

        import Ganga.Core.Sandbox as Sandbox

        # FIXME: check what happens if 'stdout','stderr' are specified here
        script = script.replace(
            '###OUTPUTSANDBOX###', repr(jobconfig.outputbox))

        script = script.replace(
            '###APPLICATION_NAME###', getName(job.application))
        script = script.replace(
            '###APPLICATIONEXEC###', repr(jobconfig.getExeString()))
        script = script.replace(
            '###APPLICATIONARGS###', repr(jobconfig.getArguments()))

        from Ganga.GPIDev.Lib.File.OutputFileManager import getWNCodeForOutputPostprocessing, getWNCodeForDownloadingInputFiles

        # worker-node code snippets are generated with a 4-space indent prefix
        script = script.replace(
            '###OUTPUTUPLOADSPOSTPROCESSING###', getWNCodeForOutputPostprocessing(job, '    '))

        script = script.replace(
            '###DOWNLOADINPUTFILES###', getWNCodeForDownloadingInputFiles(job, '    '))

        # an empty/unset environment is written as an empty dict literal
        if jobconfig.env:
            script = script.replace(
                '###APPLICATIONENVS###', repr(jobconfig.env))
        else:
            script = script.replace('###APPLICATIONENVS###', repr({}))

        script = script.replace('###WRAPPERLOG###', repr(wrapperlog))
        import inspect
        # the source text of Sandbox.WNSandbox is pasted verbatim into the wrapper
        script = script.replace(
            '###INLINEMODULES###', inspect.getsource(Sandbox.WNSandbox))

        # NOTE(review): 'mon' is not used again in this method
        mon = job.getMonitoringService()

        self.monInfo = None

        # set the monitoring file by default to the stdout
        # NOTE(review): self.monInfo was set to None just above, so this
        # isinstance() check cannot succeed here
        if isinstance(self.monInfo, dict):
            self.monInfo['remotefile'] = 'stdout'

        # try to print out the monitoring service information in debug mode
        try:
            logger.debug('job info of monitoring service: %s' %
                         str(self.monInfo))
        except:
            pass

        # prepare input/output sandboxes
        packed_files = jobconfig.getSandboxFiles() + Sandbox.getGangaModulesAsSandboxFiles(Sandbox.getDefaultModules())
        sandbox_files = job.createPackedInputSandbox(packed_files)

        # sandbox of child jobs should include master's sandbox
        sandbox_files.extend(master_job_sandbox)

        # check the input file size and pre-upload larger inputs to the iocache
        lfc_host = ''

        # URIs go into the JDL InputSandbox; names go into the wrapper script
        input_sandbox_uris = []
        input_sandbox_names = []

        # cleared as soon as one file cannot be prepared
        ick = True

        max_prestaged_fsize = 0
        for f in sandbox_files:

            # idx is indexed below with the keys 'lfc_host', 'remote' and 'local'
            idx = self.__check_and_prestage_inputfile__(f)

            if not idx:
                logger.error('input sandbox preparation failed: %s' % f)
                ick = False
                break
            else:

                if idx['lfc_host']:
                    lfc_host = idx['lfc_host']

                if idx['remote']:
                    abspath = os.path.abspath(f)
                    fsize = os.path.getsize(abspath)

                    # remember the size of the largest prestaged file
                    if fsize > max_prestaged_fsize:
                        max_prestaged_fsize = fsize

                    input_sandbox_uris.append(
                        idx['remote'][os.path.basename(f)])

                    input_sandbox_names.append(
                        os.path.basename(urlparse(f)[2]))

                if idx['local']:
                    input_sandbox_uris += idx['local']
                    input_sandbox_names.append(os.path.basename(f))

        if not ick:
            logger.error('stop job submission')
            return None

        # determine the lcg-cp timeout according to the max_prestaged_fsize
        # - using the assumption of 1 MB/sec.
        # NOTE(review): the next two assignments discard the values gathered
        # in the loop above, so predict_timeout is always 0 and the JDL's
        # LFC_HOST is always '' - confirm that this is intentional.
        max_prestaged_fsize = 0
        lfc_host = ''
        transfer_timeout = config['SandboxTransferTimeout']
        predict_timeout = int(math.ceil(max_prestaged_fsize / 1000000.0))

        if predict_timeout > transfer_timeout:
            transfer_timeout = predict_timeout

        # never go below one minute
        if transfer_timeout < 60:
            transfer_timeout = 60

        script = script.replace(
            '###TRANSFERTIMEOUT###', '%d' % transfer_timeout)

        # update the job wrapper with the inputsandbox list
        # (the 'remote' mapping handed to the wrapper is always empty here)
        script = script.replace(
            '###INPUTSANDBOX###', repr({'remote': {}, 'local': input_sandbox_names}))

        # write out the job wrapper and put job wrapper into job's inputsandbox
        scriptPath = inpw.writefile(
            FileBuffer('__jobscript_%s__' % job.getFQID('.'), script), executable=1)
        input_sandbox = input_sandbox_uris + [scriptPath]

        for isb in input_sandbox:
            logger.debug('ISB URI: %s' % isb)

        # compose output sandbox to include by default the following files:
        # - gzipped stdout (transferred only when the JobLogHandler is WMS)
        # - gzipped stderr (transferred only when the JobLogHandler is WMS)
        # - __jobscript__.log (job wrapper's log)
        output_sandbox = [wrapperlog]

        from Ganga.GPIDev.Lib.File.OutputFileManager import getOutputSandboxPatterns
        for outputSandboxPattern in getOutputSandboxPatterns(job):
            output_sandbox.append(outputSandboxPattern)

        if config['JobLogHandler'] in ['WMS']:
            output_sandbox += ['stdout.gz', 'stderr.gz']

        # the packed output tarball is only requested when an outputbox is defined
        if len(jobconfig.outputbox):
            output_sandbox += [Sandbox.OUTPUT_TARBALL_NAME]

        # compose LCG JDL
        jdl = {
            'VirtualOrganisation': config['VirtualOrganisation'],
            'Executable': os.path.basename(scriptPath),
            'Environment': {'GANGA_LCG_VO': config['VirtualOrganisation'], 'GANGA_LOG_HANDLER': config['JobLogHandler'], 'LFC_HOST': lfc_host},
            'StdOutput': 'stdout',
            'StdError': 'stderr',
            'InputSandbox': input_sandbox,
            'OutputSandbox': output_sandbox,
            'OutputSandboxBaseDestURI': 'gsiftp://localhost'
        }

        jdl['Environment'].update({'GANGA_LCG_CE': self.CE})
        # backend requirements merged with those of the job configuration
        jdl['Requirements'] = self.requirements.merge(
            jobconfig.requirements).convert()

        if self.jobtype.upper() in ['NORMAL', 'MPICH']:
            jdl['JobType'] = self.jobtype.upper()
            if self.jobtype.upper() == 'MPICH':
                #jdl['Requirements'].append('(other.GlueCEInfoTotalCPUs >= NodeNumber)')
                jdl['Requirements'].append(
                    'Member("MPICH",other.GlueHostApplicationSoftwareRunTimeEnvironment)')
                jdl['NodeNumber'] = self.requirements.nodenumber
        else:
            # unsupported job type: nothing is written, the caller receives None
            logger.warning('JobType "%s" not supported' % self.jobtype)
            return

#       additional settings from the job
#        if jobconfig.env:
#            jdl['Environment'].update(jobconfig.env)

        jdlText = Grid.expandjdl(jdl)
        logger.debug('subjob JDL: %s' % jdlText)
        return inpw.writefile(FileBuffer('__jdlfile__', jdlText))

    def kill(self):
        '''Cancel this job through grids['GLITE'].cream_cancelMultiple.

        Returns False (after logging a warning) when the backend holds no
        middleware job id; otherwise returns the result of the
        cancellation call issued for this single id.
        '''
        owner = self.getJobObject()
        logger.info('Killing job %s' % owner.getFQID('.'))

        if self.id:
            return grids['GLITE'].cream_cancelMultiple([self.id])

        # without a middleware id there is nothing to cancel
        logger.warning('Job %s is not running.' % owner.getFQID('.'))
        return False

    def master_kill(self):
        '''Kill the master job, choosing between per-job and bulk cancellation.

        A subjob (job.master is set) and a job without subjobs are both
        handled by IBackend.master_kill; only a master job that owns
        subjobs is cancelled through self.master_bulk_kill().
        '''
        job = self.getJobObject()

        # subjob, or plain job without subjobs: default implementation
        if job.master or len(job.subjobs) == 0:
            return IBackend.master_kill(self)

        # master job with subjobs: cancel them in bulk
        return self.master_bulk_kill()

    def master_bulk_kill(self):
        '''Cancel all submitted or running subjobs with one bulk request.

        Returns True after marking every cancelled subjob as 'killed';
        returns False (after logging a warning) when the bulk cancellation
        call reports a failure.
        '''
        master = self.getJobObject()

        # killing the individually re-submitted subjobs
        logger.debug('cancelling running/submitted subjobs.')

        # 1. gather the middleware ids of the subjobs that are still active
        active_ids = [sub.backend.id
                      for sub in master.subjobs
                      if sub.status in ['submitted', 'running'] and sub.backend.id]

        # 2. one cancellation request for the whole batch
        if not grids['GLITE'].cream_cancelMultiple(active_ids):
            logger.warning('Job cancellation failed')
            return False

        # 3. reflect the cancellation in the Ganga status of each subjob
        for sub in master.subjobs:
            if sub.backend.id in active_ids:
                sub.updateStatus('killed')

        return True

    def master_bulk_submit(self, rjobs, subjobconfigs, masterjobconfig):
        '''Prepare and submit several subjobs in one go.

        Preparation is delegated to self.__mt_job_prepare__ and the
        submission itself to self.__mt_bulk_submit__.  Returns False when
        preparation fails or no job id comes back from the submission,
        True otherwise; subjobs missing from the returned ids only
        trigger a warning.
        '''
        assert(implies(rjobs, len(subjobconfigs) == len(rjobs)))

        # prepare the subjobs, jdl repository before bulk submission
        node_jdls = self.__mt_job_prepare__(
            rjobs, subjobconfigs, masterjobconfig)
        if not node_jdls:
            logger.error('Some jobs not successfully prepared')
            return False

        # flag every subjob as being submitted before the bulk call
        for subjob in rjobs:
            subjob.updateStatus('submitting')

        node_jids = self.__mt_bulk_submit__(node_jdls)
        if not node_jids:
            return False

        for subjob in rjobs:
            if subjob.id not in node_jids:
                logger.warning(
                    'subjob %s not successfully submitted' % subjob.getFQID('.'))
                continue

            # record the middleware id and the CE the subjob was sent to
            subjob.backend.id = node_jids[subjob.id]
            subjob.backend.CE = self.CE
            subjob.backend.actualCE = subjob.backend.CE
            subjob.updateStatus('submitted')
            subjob.info.submit_counter += 1

        return True

    def master_bulk_resubmit(self, rjobs):
        '''Resubmit several subjobs in bulk from their existing JDL files.

        Each subjob is resubmitted from the '__jdlfile__' found in its
        input directory.  Returns True when self.__mt_bulk_submit__ hands
        back any job ids, False otherwise; subjobs missing from the
        returned ids only trigger a warning.
        '''
        from Ganga.Utility.logging import log_user_exception

        # map each subjob id to the JDL file already stored in its input dir
        node_jdls = {}
        for subjob in rjobs:
            node_jdls[subjob.id] = os.path.join(subjob.inputdir, '__jdlfile__')

        # flag every subjob as being submitted before the bulk call
        for subjob in rjobs:
            subjob.updateStatus('submitting')

        node_jids = self.__mt_bulk_submit__(node_jdls)
        if not node_jids:
            return False

        for subjob in rjobs:
            if subjob.id not in node_jids:
                logger.warning(
                    'subjob %s not successfully submitted' % subjob.getFQID('.'))
                continue

            # refresh the subjob's job info before the new id is stored
            self.__refresh_jobinfo__(subjob)
            subjob.backend.id = node_jids[subjob.id]
            subjob.backend.CE = self.CE
            subjob.backend.actualCE = subjob.backend.CE
            subjob.updateStatus('submitted')
            subjob.info.submit_counter += 1

        # NOTE: an earlier workaround (kept disabled in the original code)
        #       forced every subjob to 'submitted' here to avoid the master
        #       job falling back from 'submitted' to 'submitting'; it was
        #       meant to be replaced by the lock mechanism of Ganga 5.

        return True

    def master_submit(self, rjobs, subjobconfigs, masterjobconfig):
        '''Submit the master job to the grid.

        Selects the CE endpoint (taking the first entry of
        self.requirements.getce() when self.CE is unset or not in that
        list), delegates the proxy to it, and then submits either through
        IBackend.master_submit (job without subjobs) or through
        self.master_bulk_submit (job with subjobs).

        Returns the result of the chosen submission call.
        Raises GangaException when no CE endpoint could be determined.
        '''

        profiler = ElapsedTimeProfiler(getLogger(name='Profile.LCG'))
        profiler.start()

        job = self.getJobObject()

        # finding CREAM CE endpoint for job submission
        allowed_celist = []
        try:
            allowed_celist = self.requirements.getce()
            if not self.CE and allowed_celist:
                self.CE = allowed_celist[0]
        except Exception as err:
            # best effort only: a failing lookup must not abort submission,
            # but KeyboardInterrupt/SystemExit are no longer swallowed here
            logger.warning(
                'CREAM CE assigment from AtlasCREAMRequirements failed.')
            logger.debug('CE lookup error: %s' % str(err))

        # a user-chosen CE outside the allowed list is overridden
        if self.CE and allowed_celist and self.CE not in allowed_celist:
            logger.warning('submission to CE not allowed: %s, use %s instead' % (
                self.CE, allowed_celist[0]))
            self.CE = allowed_celist[0]

        if not self.CE:
            raise GangaException('CREAM CE endpoint not set')

        # delegate proxy to CREAM CE (a failure is only reported, not fatal)
        if not grids['GLITE'].cream_proxy_delegation(self.CE):
            logger.warning('proxy delegation to %s failed' % self.CE)

        # doing massive job preparation
        if len(job.subjobs) == 0:
            ick = IBackend.master_submit(
                self, rjobs, subjobconfigs, masterjobconfig)
        else:
            ick = self.master_bulk_submit(
                rjobs, subjobconfigs, masterjobconfig)

        profiler.check('==> master_submit() elapsed time')

        return ick

    def submit(self, subjobconfig, master_job_sandbox):
        '''Prepare and submit a single job; return True on success.

        False is returned when self.preparejob() yields no JDL path or
        when the submission call returns no job id.
        '''
        jdlpath = self.preparejob(subjobconfig, master_job_sandbox)
        if not jdlpath:
            return False

        self.id = grids['GLITE'].cream_submit(jdlpath, self.CE)
        if not self.id:
            return False

        # remember the CE the job was actually sent to
        self.actualCE = self.CE
        return True

    def master_auto_resubmit(self, rjobs):
        """
        Resubmit the given subjobs one at a time, because a bulk
        resubmission would overwrite previous master job statuses.

        When the parent job itself has failed, the whole set is handed to
        self.master_resubmit() instead and its result is returned.
        Otherwise the result is False as soon as one subjob fails to
        resubmit (later subjobs are not tried) and True when all succeed.
        """
        # a failed master job is resubmitted as a whole
        if self._getParent().status == 'failed':
            return self.master_resubmit(rjobs)

        # all() stops at the first subjob whose resubmission fails
        return all(sub.backend.master_resubmit([sub]) for sub in rjobs)

    def master_resubmit(self, rjobs):
        '''Resubmit the master job to the grid.

        After delegating the proxy to self.CE, one of three paths is
        taken: a subjob or a job without subjobs goes through
        IBackend.master_resubmit, while a master job that owns subjobs is
        resubmitted in bulk.  Returns the result of the chosen call and
        raises GangaException when the bulk resubmission fails.
        '''
        profiler = ElapsedTimeProfiler(getLogger(name='Profile.LCG'))
        profiler.start()

        job = self.getJobObject()

        # delegate proxy to CREAM CE (a failure is only reported, not fatal)
        if not grids['GLITE'].cream_proxy_delegation(self.CE):
            logger.warning('proxy delegation to %s failed' % self.CE)

        if job.master:
            # individual subjob resubmission
            logger.debug('mode: individual subjob resubmission')
            result = IBackend.master_resubmit(self, rjobs)
        elif len(job.subjobs) == 0:
            # master job without subjobs: normal resubmission
            logger.debug('rjobs: %s' % str(rjobs))
            logger.debug('mode: master job normal resubmission')
            result = IBackend.master_resubmit(self, rjobs)
        else:
            # master job with subjobs: bulk resubmission
            logger.debug('mode: master job resubmission')
            result = self.master_bulk_resubmit(rjobs)
            if not result:
                raise GangaException('CREAM bulk submission failure')

        profiler.check('job re-submission elapsed time')

        return result

    def resubmit(self):
        '''Resubmit this job from its stored JDL file; return True on success.

        False is returned when the input workspace yields no path for
        "__jdlfile__" or when the submission call returns no job id.
        '''
        job = self.getJobObject()

        jdlpath = job.getInputWorkspace().getPath("__jdlfile__")
        if not jdlpath:
            return False

        self.id = grids['GLITE'].cream_submit(jdlpath, self.CE)
        if not self.id:
            return False

        # refresh the job information now that a new id has been assigned
        self.__refresh_jobinfo__(job)
        self.actualCE = self.CE
        return True

    @staticmethod
    def updateMonitoringInformation(jobs):
        '''Monitoring loop for normal jobs.

        The status of every job carrying a middleware id is requested in
        a single cream_status call.  For each job whose backend status
        changed (and whose 'ExitCode', if reported, is numeric) the
        worker node and sandbox URIs are recorded, the output sandbox is
        downloaded once 'DONE-OK' or 'DONE-FAILED' is reached, and the
        new status is mapped onto the Ganga job status.

        Ids of jobs whose output was fetched are handed to
        cream_purgeMultiple in one request after all jobs have been
        processed (previously the accumulated list was re-purged on
        every loop iteration).
        '''

        jobdict = {job.backend.id: job for job in jobs if job.backend.id}

        jobInfoDict = grids['GLITE'].cream_status(list(jobdict))

        jidListForPurge = []

        # update job information for those available in jobInfoDict
        for jid, info in jobInfoDict.items():

            if not info:
                logger.warning(
                    'fail to retrieve job informaton: %s' % jobdict[jid].getFQID('.'))
                continue

            job = jobdict[jid]
            new_status = info['Current Status']

            # nothing to do while the backend status is unchanged
            if job.backend.status == new_status:
                continue

            # a reported but non-numeric exit code postpones the update
            if 'ExitCode' in info and not info['ExitCode'].isdigit():
                continue

            if 'Worker Node' in info:
                job.backend.workernode = info['Worker Node']

            if 'CREAM ISB URI' in info:
                job.backend.isbURI = info['CREAM ISB URI']

            if 'CREAM OSB URI' in info:
                job.backend.osbURI = info['CREAM OSB URI']

            # download output sandboxes if final status is reached
            if new_status in ['DONE-OK', 'DONE-FAILED']:

                # resolve output sandbox URIs based on the JDL information
                osbURIList = __cream_resolveOSBList__(job, info['JDL'])

                logger.debug('OSB list:')
                for f in osbURIList:
                    logger.debug(f)

                if osbURIList:
                    outdir = job.getOutputWorkspace(create=True).getPath()

                    if grids['GLITE'].cream_get_output(osbURIList, outdir):
                        (_, app_exitcode) = grids['GLITE'].__get_app_exitcode__(outdir)
                        job.backend.exitcode = app_exitcode

                        # output is safely local: the job can be purged
                        jidListForPurge.append(job.backend.id)
                    else:
                        logger.error(
                            'fail to download job output: %s' % job.getFQID('.'))

            job.backend.status = new_status

            if 'ExitCode' in info and info['ExitCode'] != "W":
                try:
                    job.backend.exitcode_cream = int(info['ExitCode'])
                except Exception:
                    # an unusable exit code is recorded as a generic failure
                    job.backend.exitcode_cream = 1

            if 'FailureReason' in info:
                try:
                    job.backend.reason = info['FailureReason']
                except Exception:
                    # recording the reason is best effort only
                    logger.debug(
                        'could not record failure reason of job %s' % job.getFQID('.'))

            job.backend.updateGangaJobStatus()

        # purging the jobs the output has been fetched locally - done once,
        # after the loop, so that no id is purged more than one time
        if jidListForPurge:
            grids['GLITE'].cream_purgeMultiple(jidListForPurge)

    def updateGangaJobStatus(self):
        '''Translate the backend status in self.status into a Ganga job status.

        'DONE-OK' only becomes 'completed' when neither the application
        exit code nor the CREAM exit code is set to a non-zero value;
        otherwise the job is marked 'failed' and the reason is recorded.
        Statuses that are still pending leave the Ganga status untouched,
        and an unknown status is reported with a warning.
        '''
        job = self.getJobObject()
        state = self.status

        if state in ('RUNNING', 'REALLY-RUNNING'):
            job.updateStatus('running')
            return

        if state == 'DONE-OK':
            backend = job.backend
            if backend.exitcode and backend.exitcode != 0:
                # the application itself reported a failure
                backend.reason = 'non-zero app. exit code: %s' % repr(
                    backend.exitcode)
                job.updateStatus('failed')
            elif backend.exitcode_cream and backend.exitcode_cream != 0:
                # the middleware reported a failure
                backend.reason = 'non-zero CREAM job exit code: %s' % repr(
                    backend.exitcode_cream)
                job.updateStatus('failed')
            else:
                job.updateStatus('completed')
            return

        if state in ('DONE-FAILED', 'ABORTED', 'UNKNOWN'):
            job.updateStatus('failed')
            return

        if state in ('CANCELLED',):
            job.updateStatus('killed')
            return

        # not started or not finished yet: keep the current Ganga status
        if state in ('REGISTERED', 'PENDING', 'IDLE', 'HELD'):
            return

        logger.warning('Unexpected job status "%s"', self.status)
Exemplo n.º 13
0
class ARC(IBackend):
    '''ARC backend - direct job submission to an ARC CE.

    The schema below declares three kinds of attributes:

    * user-settable options (``CE``, ``jobtype``, ``requirements``,
      ``sandboxcache``, ``verbose``);
    * protected, non-copyable bookkeeping fields describing the job on the
      middleware side (``id``, ``status``, ``exitcode``, ``exitcode_arc``,
      ``actualCE``, ``reason``, ``workernode``, ``isbURI``, ``osbURI``).
    '''
    # Attribute schema of the backend, version 1.0.
    _schema = Schema(
        Version(1, 0), {
            # --- options ---
            'CE':
            SimpleItem(defvalue='', doc='ARC CE endpoint'),
            'jobtype':
            SimpleItem(defvalue='Normal', doc='Job type: Normal, MPICH'),
            'requirements':
            ComponentItem('LCGRequirements',
                          doc='Requirements for the resource selection'),
            'sandboxcache':
            ComponentItem(
                'GridSandboxCache',
                copyable=1,
                doc='Interface for handling oversized input sandbox'),
            # --- protected, non-copyable bookkeeping fields ---
            # 'id' accepts either a string or a list
            'id':
            SimpleItem(defvalue='',
                       typelist=[str, list],
                       protected=1,
                       copyable=0,
                       doc='Middleware job identifier'),
            # 'status' accepts either a string or a dict
            'status':
            SimpleItem(defvalue='',
                       typelist=[str, dict],
                       protected=1,
                       copyable=0,
                       doc='Middleware job status'),
            'exitcode':
            SimpleItem(defvalue='',
                       protected=1,
                       copyable=0,
                       doc='Application exit code'),
            'exitcode_arc':
            SimpleItem(defvalue='',
                       protected=1,
                       copyable=0,
                       doc='Middleware exit code'),
            'actualCE':
            SimpleItem(defvalue='',
                       protected=1,
                       copyable=0,
                       doc='The ARC CE where the job actually runs.'),
            'reason':
            SimpleItem(defvalue='',
                       protected=1,
                       copyable=0,
                       doc='Reason of causing the job status'),
            'workernode':
            SimpleItem(defvalue='',
                       protected=1,
                       copyable=0,
                       doc='The worker node on which the job actually runs.'),
            'isbURI':
            SimpleItem(defvalue='',
                       protected=1,
                       copyable=0,
                       doc='The input sandbox URI on ARC CE'),
            'osbURI':
            SimpleItem(defvalue='',
                       protected=1,
                       copyable=0,
                       doc='The output sandbox URI on ARC CE'),
            # --- option: passed as the verbose flag to Grid.arc_submit on bulk submission ---
            'verbose':
            SimpleItem(defvalue=False,
                       doc='Use verbose options for ARC commands')
        })

    # category under which this class is declared
    _category = 'backends'

    # name of this backend class
    _name = 'ARC'

    def __init__(self):
        '''Create the backend and load the configured helper classes.

        Two classes are looked up dynamically from dotted names held in the
        configuration:

        * ``config['Requirements']`` -> stored in ``self.requirements``
        * ``config['SandboxCache']`` -> stored in ``self.sandboxcache``

        The last component of the dotted name is taken as the class name and
        the whole name as the module to import.  Loading is best effort: if
        the option is missing, the import fails or the class cannot be
        instantiated, the attribute keeps its previous value (for the sandbox
        cache that is a fresh ``GridftpSandboxCache``) and only a debug
        message is logged.
        '''
        super(ARC, self).__init__()

        # dynamic requirement object loading
        try:
            req_path = config['Requirements']
            req_name = req_path.split('.').pop()
            req_module = __import__(req_path, globals(), locals(), [req_path])
            self.requirements = vars(req_module)[req_name]()

            logger.debug('load %s as LCGRequirements' % req_name)
        except Exception:
            # Deliberately best effort, but do not swallow KeyboardInterrupt
            # or SystemExit the way a bare "except:" would.
            logger.debug('load default LCGRequirements')

        # dynamic sandbox cache object loading
        # start from GridftpSandboxCache; it stays in place whenever the
        # configured class cannot be loaded
        self.sandboxcache = GridftpSandboxCache()
        try:
            sc_path = config['SandboxCache']
            sc_name = sc_path.split('.').pop()
            sc_module = __import__(sc_path, globals(), locals(), [sc_path])
            self.sandboxcache = vars(sc_module)[sc_name]()
            logger.debug('load %s as SandboxCache' % sc_name)
        except Exception:
            logger.debug('load default SandboxCache')

    def __refresh_jobinfo__(self, job):
        '''Blank out the middleware bookkeeping fields of ``job.backend``.

        Every field listed below is reset to the empty string; other
        attributes of the backend (such as its ``id``) are left alone.
        According to the original note this is meant to be called after a
        resubmission.
        '''
        stale_fields = ('status', 'reason', 'actualCE', 'exitcode',
                        'exitcode_arc', 'workernode', 'isbURI', 'osbURI')
        for field_name in stale_fields:
            setattr(job.backend, field_name, '')

    def __setup_sandboxcache__(self, job):
        '''Apply the runtime configuration to ``self.sandboxcache``.

        The VO and the transfer timeout are always copied from the
        configuration.  For a cache whose ``_name`` is 'LCGSandboxCache' the
        missing pieces are filled in as well:

        * ``lfc_host`` from ``Grid.__get_lfc_host__()``;
        * ``se`` from ``config['DefaultSE']``, which may have the form
          ``token:<srm token>:<se host>`` - in that case the token part is
          stored in ``srm_token``;
        * ``srm_token`` from ``config['DefaultSRMToken']`` when the SE type
          is 'srmv2' and no token has been set.

        The ``job`` argument is not used.  Always returns True.
        '''
        cache = self.sandboxcache

        cache.vo = config['VirtualOrganisation']
        cache.timeout = config['SandboxTransferTimeout']

        if cache._name != 'LCGSandboxCache':
            return True

        if not cache.lfc_host:
            cache.lfc_host = Grid.__get_lfc_host__()

        if not cache.se:
            default_se = config['DefaultSE']
            srm_token = ''

            # split an optional "token:<token>:<host>" specification
            token_match = re.match('^token:(.*):(.*)$', default_se)
            if token_match:
                srm_token = token_match.group(1)
                default_se = token_match.group(2)

            cache.se = default_se

            if srm_token:
                cache.srm_token = srm_token

        needs_default_token = (cache.se_type in ('srmv2',)
                               and not cache.srm_token)
        if needs_default_token:
            cache.srm_token = config['DefaultSRMToken']

        return True

    def __check_and_prestage_inputfile__(self, file):
        '''Decide whether a local input file ships directly or via the sandbox cache.

        ``file`` is the path of a local file.  If its size exceeds
        ``config['BoundSandboxLimit']`` it is pre-staged through
        ``self.sandboxcache``; otherwise it is referenced locally.

        Returns a dictionary describing how to refer to the file:

            idx = {'lfc_host': lfc_host,
                   'local': [absolute local paths],
                   'remote': {'fname1': 'remote index1', ...}
                  }

        or None when the upload of an oversized file failed.

        An oversized file whose md5sum matches a file already known to the
        sandbox cache (of the master job, if any, or of this backend) is not
        uploaded again; the index of the earlier upload is returned instead.
        '''
        job = self.getJobObject()

        # collect what has been uploaded before: the master job's files come
        # first, then (after configuring our own cache) the local ones
        known_uploads = []
        if job.master:
            known_uploads += job.master.backend.sandboxcache.get_cached_files()

        self.__setup_sandboxcache__(job)
        known_uploads += self.sandboxcache.get_cached_files()

        # an LCGSandboxCache carries its own LFC host; otherwise (or if it is
        # empty) ask the Grid module
        lfc_host = None
        if self.sandboxcache._name == 'LCGSandboxCache':
            lfc_host = self.sandboxcache.lfc_host
        if not lfc_host:
            lfc_host = Grid.__get_lfc_host__()

        idx = {'lfc_host': lfc_host, 'local': [], 'remote': {}}

        abspath = os.path.abspath(file)
        fsize = os.path.getsize(abspath)

        # small enough: ship it as a plain local file
        if not fsize > config['BoundSandboxLimit']:
            idx['local'].append(abspath)
            return idx

        md5sum = get_md5sum(abspath, ignoreGzipTimestamp=True)

        # the same content has been uploaded before: reuse that copy
        for earlier in known_uploads:
            if earlier.md5sum == md5sum:
                idx['remote'][os.path.basename(file)] = earlier.id
                return idx

        logger.warning(
            'The size of %s is larger than the sandbox limit (%d byte). Please wait while pre-staging ...'
            % (file, config['BoundSandboxLimit']))

        if not self.sandboxcache.upload([abspath]):
            logger.error('Oversized sandbox not successfully pre-staged')
            return None

        # the file just uploaded is taken to be the last cached entry
        remote_sandbox = self.sandboxcache.get_cached_files()[-1]
        idx['remote'][remote_sandbox.name] = remote_sandbox.id
        return idx

    def __mt_job_prepare__(self, rjobs, subjobconfigs, masterjobconfig):
        '''Prepare all subjobs with a pool of 10 worker threads.

        The master input sandbox is prepared first and each of its files is
        passed through ``__check_and_prestage_inputfile__``.  Then
        ``preparejob`` is called on the backend of every subjob in ``rjobs``
        with the matching entry of ``subjobconfigs``.

        Returns the runner results (per the original comment: a dictionary
        with subjob ids as keys and JDL file paths as values), or None if the
        master sandbox could not be prepared or fewer items finished than
        were queued.
        '''
        logger.warning('preparing %d subjobs ... it may take a while' %
                       len(rjobs))

        # prepare the master job (i.e. create shared inputsandbox, etc.)
        master_input_sandbox = IBackend.master_prepare(self, masterjobconfig)

        # pre-stage master sandbox files that exceed the sandbox limit
        for sandbox_file in master_input_sandbox:
            if not self.__check_and_prestage_inputfile__(sandbox_file):
                logger.error('master input sandbox perparation failed: %s' %
                             sandbox_file)
                return None

        # per-item work: prepare one subjob and record the produced file
        class MyAlgorithm(Algorithm):
            def process(self, sj_info):
                subjob_config = sj_info[0]
                subjob = sj_info[1]

                try:
                    logger.debug("preparing job %s" % subjob.getFQID('.'))
                    jdlpath = subjob.backend.preparejob(subjob_config,
                                                        master_input_sandbox)

                    if not (jdlpath and os.path.exists(jdlpath)):
                        raise GangaException('job %s not properly prepared' %
                                             subjob.getFQID('.'))

                    self.__appendResult__(subjob.id, jdlpath)
                except Exception:
                    log_user_exception()
                    return False
                return True

        mt_data = [[sc, sj] for sc, sj in zip(subjobconfigs, rjobs)]

        runner = MTRunner(name='lcg_jprepare',
                          algorithm=MyAlgorithm(),
                          data=Data(collection=mt_data),
                          numThread=10)
        runner.start()
        runner.join(-1)

        if len(runner.getDoneList()) < len(mt_data):
            return None

        return runner.getResults()

    def __mt_bulk_submit__(self, node_jdls):
        '''Submit the prepared subjobs with a pool of worker threads.

        ``node_jdls`` maps subjob ids to job description files.  Each file is
        passed to ``Grid.arc_submit`` together with ``self.CE`` and
        ``self.verbose``; the number of threads comes from
        ``config['SubmissionThread']``.

        Returns the runner results (subjob id -> value returned by
        ``Grid.arc_submit``).  If fewer items finished than were queued, the
        jobs submitted so far are cancelled and None is returned.
        '''
        job = self.getJobObject()

        logger.warning('submitting %d subjobs ... it may take a while' %
                       len(node_jdls))

        # per-item work: submit one job description and record the job id
        class MyAlgorithm(Algorithm):
            def __init__(self, masterInputWorkspace, ce, arcverbose):
                Algorithm.__init__(self)
                self.inpw = masterInputWorkspace
                self.ce = ce
                self.arcverbose = arcverbose

            def process(self, jdl_info):
                subjob_id = jdl_info[0]
                subjob_jdl = jdl_info[1]

                grid_jid = Grid.arc_submit(subjob_jdl, self.ce,
                                           self.arcverbose)

                if grid_jid:
                    self.__appendResult__(subjob_id, grid_jid)
                    return True
                return False

        mt_data = list(node_jdls.items())

        runner = MTRunner(name='arc_jsubmit',
                          algorithm=MyAlgorithm(
                              masterInputWorkspace=job.getInputWorkspace(),
                              ce=self.CE,
                              arcverbose=self.verbose),
                          data=Data(collection=mt_data),
                          numThread=config['SubmissionThread'])
        runner.start()
        runner.join(timeout=-1)

        if not len(runner.getDoneList()) < len(mt_data):
            return runner.getResults()

        # not all bulk jobs were submitted: cancel the ones that were
        logger.error(
            'some bulk jobs not successfully (re)submitted, canceling submitted jobs on WMS'
        )
        Grid.arc_cancelMultiple(runner.getResults().values())
        return None

    def __jobWrapperTemplate__(self):
        '''Create job wrapper'''

        script = """#!/usr/bin/env python
#-----------------------------------------------------
# This job wrapper script is automatically created by
# GANGA LCG backend handler.
#
# It controls:
# 1. unpack input sandbox
# 2. invoke application executable
# 3. invoke monitoring client
#-----------------------------------------------------
import os,os.path,shutil,tempfile
import sys,popen2,time,traceback

#bugfix #36178: subprocess.py crashes if python 2.5 is used
#try to import subprocess from local python installation before an
#import from PYTHON_DIR is attempted some time later
try:
    import subprocess
except ImportError:
    pass

## Utility functions ##
def timeString():
    return time.strftime('%a %b %d %H:%M:%S %Y',time.gmtime(time.time()))

def printInfo(s):
    out.write(timeString() + '  [Info]' +  ' ' + str(s) + os.linesep)
    out.flush()

def printError(s):
    out.write(timeString() + ' [Error]' +  ' ' + str(s) + os.linesep)
    out.flush()

def lcg_file_download(vo,guid,localFilePath,timeout=60,maxRetry=3):
    cmd = 'lcg-cp -t %d --vo %s %s file://%s' % (timeout,vo,guid,localFilePath)

    printInfo('LFC_HOST set to %s' % os.environ['LFC_HOST'])
    printInfo('lcg-cp timeout: %d' % timeout)

    i         = 0
    rc        = 0
    isDone    = False
    try_again = True

    while try_again:
        i = i + 1
        try:
            ps = os.popen(cmd)
            status = ps.close()

            if not status:
                isDone = True
                printInfo('File %s download from iocache' % os.path.basename(localFilePath))
            else:
                raise IOError("Download file %s from iocache failed with error code: %d, trial %d." % (os.path.basename(localFilePath), status, i))

        except IOError as e:
            isDone = False
            printError(str(e))

        if isDone:
            try_again = False
        elif i == maxRetry:
            try_again = False
        else:
            try_again = True

    return isDone

## system command executor with subprocess
def execSyscmdSubprocess(cmd, wdir=os.getcwd()):

    import os, subprocess

    global exitcode

    outfile   = file('stdout','w')
    errorfile = file('stderr','w')

    try:
        child = subprocess.Popen(cmd, cwd=wdir, shell=True, stdout=outfile, stderr=errorfile)

        while 1:
            exitcode = child.poll()
            if exitcode is not None:
                break
            else:
                outfile.flush()
                errorfile.flush()
                time.sleep(0.3)
    finally:
        pass

    outfile.flush()
    errorfile.flush()
    outfile.close()
    errorfile.close()

    return True

## system command executor with multi-thread
## stderr/stdout handler
def execSyscmdEnhanced(cmd, wdir=os.getcwd()):

    import os, threading

    cwd = os.getcwd()

    isDone = False

    try:
        ## change to the working directory
        os.chdir(wdir)

        child = popen2.Popen3(cmd,1)
        child.tochild.close() # don't need stdin

        class PipeThread(threading.Thread):

            def __init__(self,infile,outfile,stopcb):
                self.outfile = outfile
                self.infile = infile
                self.stopcb = stopcb
                self.finished = 0
                threading.Thread.__init__(self)

            def run(self):
                stop = False
                while not stop:
                    buf = self.infile.read(10000)
                    self.outfile.write(buf)
                    self.outfile.flush()
                    time.sleep(0.01)
                    stop = self.stopcb()
                #FIXME: should we do here?: self.infile.read()
                #FIXME: this is to make sure that all the output is read (if more than buffer size of output was produced)
                self.finished = 1

        def stopcb(poll=False):
            global exitcode
            if poll:
                exitcode = child.poll()
            return exitcode != -1

        out_thread = PipeThread(child.fromchild, sys.stdout, stopcb)
        err_thread = PipeThread(child.childerr, sys.stderr, stopcb)

        out_thread.start()
        err_thread.start()
        while not out_thread.finished and not err_thread.finished:
            stopcb(True)
            time.sleep(0.3)

        sys.stdout.flush()
        sys.stderr.flush()

        isDone = True

    except(Exception,e):
        isDone = False

    ## return to the original directory
    os.chdir(cwd)

    return isDone

############################################################################################

###INLINEMODULES###

############################################################################################

## Main program ##

outputsandbox = ###OUTPUTSANDBOX###
input_sandbox = ###INPUTSANDBOX###
wrapperlog = ###WRAPPERLOG###
appexec = ###APPLICATIONEXEC###
appargs = ###APPLICATIONARGS###
appenvs = ###APPLICATIONENVS###
timeout = ###TRANSFERTIMEOUT###

exitcode=-1

import sys, stat, os, os.path, commands

# Change to scratch directory if provided
scratchdir = ''
tmpdir = ''

orig_wdir = os.getcwd()

# prepare log file for job wrapper
out = open(os.path.join(orig_wdir, wrapperlog),'w')

if os.getenv('EDG_WL_SCRATCH'):
    scratchdir = os.getenv('EDG_WL_SCRATCH')
elif os.getenv('TMPDIR'):
    scratchdir = os.getenv('TMPDIR')

if scratchdir:
    (status, tmpdir) = commands.getstatusoutput('mktemp -d %s/gangajob_XXXXXXXX' % (scratchdir))
    if status == 0:
        os.chdir(tmpdir)
    else:
        ## if status != 0, tmpdir should contains error message so print it to stderr
        printError('Error making ganga job scratch dir: %s' % tmpdir)
        printInfo('Unable to create ganga job scratch dir in %s. Run directly in: %s' % ( scratchdir, os.getcwd() ) )

        ## reset scratchdir and tmpdir to disable the usage of Ganga scratch dir
        scratchdir = ''
        tmpdir = ''

wdir = os.getcwd()

if scratchdir:
    printInfo('Changed working directory to scratch directory %s' % tmpdir)
    try:
        os.system("ln -s %s %s" % (os.path.join(orig_wdir, 'stdout'), os.path.join(wdir, 'stdout')))
        os.system("ln -s %s %s" % (os.path.join(orig_wdir, 'stderr'), os.path.join(wdir, 'stderr')))
    except Exception as e:
        printError(sys.exc_info()[0])
        printError(sys.exc_info()[1])
        str_traceback = traceback.format_tb(sys.exc_info()[2])
        for str_tb in str_traceback:
            printError(str_tb)
        printInfo('Linking stdout & stderr to original directory failed. Looking at stdout during job run may not be possible')

os.environ['PATH'] = '.:'+os.environ['PATH']

vo = os.environ['GANGA_LCG_VO']

try:
    printInfo('Job Wrapper start.')

#   download inputsandbox from remote cache
    for f,guid in input_sandbox['remote'].iteritems():
        if not lcg_file_download(vo, guid, os.path.join(wdir,f), timeout=int(timeout)):
            raise IOError('Download remote input %s:%s failed.' % (guid,f) )
        else:
            if mimetypes.guess_type(f)[1] in ['gzip', 'bzip2']:
                getPackedInputSandbox(f)
            else:
                shutil.copy(f, os.path.join(os.getcwd(), os.path.basename(f)))

    printInfo('Download inputsandbox from iocache passed.')

#   unpack inputsandbox from wdir
    for f in input_sandbox['local']:
        if mimetypes.guess_type(f)[1] in ['gzip', 'bzip2']:
            getPackedInputSandbox(os.path.join(orig_wdir,f))

    printInfo('Unpack inputsandbox passed.')

    #get input files
    ###DOWNLOADINPUTFILES###

    printInfo('Loading Python modules ...')

    sys.path.insert(0,os.path.join(wdir,PYTHON_DIR))

    # check the python library path
    try:
        printInfo(' ** PYTHON_DIR: %s' % os.environ['PYTHON_DIR'])
    except KeyError:
        pass

    try:
        printInfo(' ** PYTHONPATH: %s' % os.environ['PYTHONPATH'])
    except KeyError:
        pass

    for lib_path in sys.path:
        printInfo(' ** sys.path: %s' % lib_path)

#   execute application

    ## convern appenvs into environment setup script to be 'sourced' before executing the user executable

    printInfo('Prepare environment variables for application executable')

    env_setup_script = os.path.join(os.getcwd(), '__ganga_lcg_env__.sh')

    f = open( env_setup_script, 'w')
    f.write('#!/bin/sh' + os.linesep )
    f.write('##user application environmet setup script generated by Ganga job wrapper' + os.linesep)
    for k,v in appenvs.items():

        str_env = 'export %s="%s"' % (k, v)

        printInfo(' ** ' + str_env)
        
        f.write(str_env + os.linesep)
    f.close()

    try: #try to make shipped executable executable
        os.chmod('%s/%s'% (wdir,appexec),stat.S_IXUSR|stat.S_IRUSR|stat.S_IWUSR)
    except:
        pass

    status = False
    try:
        # use subprocess to run the user's application if the module is available on the worker node
        import subprocess
        printInfo('Load application executable with subprocess module')
        status = execSyscmdSubprocess('source %s; %s %s' % (env_setup_script, appexec, appargs), wdir)
    except ImportError as err:
        # otherwise, use separate threads to control process IO pipes
        printInfo('Load application executable with separate threads')
        status = execSyscmdEnhanced('source %s; %s %s' % (env_setup_script, appexec, appargs), wdir)

    os.system("cp %s/stdout stdout.1" % orig_wdir)
    os.system("cp %s/stderr stderr.1" % orig_wdir)

    printInfo('GZipping stdout and stderr...')

    os.system("gzip stdout.1 stderr.1")

    # move them to the original wdir so they can be picked up
    os.system("mv stdout.1.gz %s/stdout.gz" % orig_wdir)
    os.system("mv stderr.1.gz %s/stderr.gz" % orig_wdir)

    if not status:
        raise OSError('Application execution failed.')
    printInfo('Application execution passed with exit code %d.' % exitcode)      

    ###OUTPUTUPLOADSPOSTPROCESSING###

    for f in os.listdir(os.getcwd()):
        command = "cp %s %s" % (os.path.join(os.getcwd(),f), os.path.join(orig_wdir,f))
        os.system(command)            

    createPackedOutputSandbox(outputsandbox,None,orig_wdir)

#   pack outputsandbox
#    printInfo('== check output ==')
#    for line in os.popen('pwd; ls -l').readlines():
#        printInfo(line)

    printInfo('Pack outputsandbox passed.')

    # Clean up after us - All log files and packed outputsandbox should be in "wdir"
    if scratchdir:
        os.chdir(orig_wdir)
        os.system("rm %s -rf" % wdir)
except Exception as e:
    printError(sys.exc_info()[0])
    printError(sys.exc_info()[1])
    str_traceback = traceback.format_tb(sys.exc_info()[2])
    for str_tb in str_traceback:
        printError(str_tb)

printInfo('Job Wrapper stop.')

out.close()

# always return exit code 0 so the in the case of application failure
# one can always get stdout and stderr back to the UI for debug.
sys.exit(0)
"""
        return script

    def preparejob(self, jobconfig, master_job_sandbox):
        '''Build the job wrapper script and the XRSL job description.

        Steps, in order:

        1. fill the placeholders of the wrapper template returned by
           ``__jobWrapperTemplate__`` from ``jobconfig`` and the job;
        2. pack the input sandbox, add the files of ``master_job_sandbox``
           and pass every file through ``__check_and_prestage_inputfile__``;
        3. write the wrapper script into the job's input workspace;
        4. compose the output sandbox list and the XRSL dictionary, expand it
           with ``Grid.expandxrsl`` and write it as '__xrslfile__' into the
           input workspace.

        Returns the value of ``inpw.writefile`` for the XRSL file, or None if
        the preparation of an input sandbox file failed.
        '''

        script = self.__jobWrapperTemplate__()

        job = self.getJobObject()
        inpw = job.getInputWorkspace()

        # name of the log file written by the job wrapper on the worker node
        wrapperlog = '__jobscript__.log'

        import Ganga.Core.Sandbox as Sandbox

        # FIXME: check what happens if 'stdout','stderr' are specified here
        script = script.replace('###OUTPUTSANDBOX###',
                                repr(jobconfig.outputbox))

        # NOTE(review): the template returned by __jobWrapperTemplate__ does
        # not appear to contain an ###APPLICATION_NAME### marker, so this
        # replacement looks like a no-op - confirm.
        script = script.replace('###APPLICATION_NAME###',
                                getName(job.application))
        script = script.replace('###APPLICATIONEXEC###',
                                repr(jobconfig.getExeString()))
        script = script.replace('###APPLICATIONARGS###',
                                repr(jobconfig.getArguments()))

        from Ganga.GPIDev.Lib.File.OutputFileManager import getWNCodeForOutputPostprocessing, getWNCodeForDownloadingInputFiles

        # code snippets are inserted with a four-space indent prefix
        script = script.replace('###OUTPUTUPLOADSPOSTPROCESSING###',
                                getWNCodeForOutputPostprocessing(job, '    '))

        script = script.replace('###DOWNLOADINPUTFILES###',
                                getWNCodeForDownloadingInputFiles(job, '    '))

        if jobconfig.env:
            script = script.replace('###APPLICATIONENVS###',
                                    repr(jobconfig.env))
        else:
            script = script.replace('###APPLICATIONENVS###', repr({}))

        script = script.replace('###WRAPPERLOG###', repr(wrapperlog))
        import inspect
        # the source of the WNSandbox module is pasted into the wrapper
        script = script.replace('###INLINEMODULES###',
                                inspect.getsource(Sandbox.WNSandbox))

        # NOTE(review): 'mon' is not used afterwards in this method.
        mon = job.getMonitoringService()

        self.monInfo = None

        # set the monitoring file by default to the stdout
        # NOTE(review): monInfo has just been set to None, so this branch
        # cannot be taken as written.
        if isinstance(self.monInfo, dict):
            self.monInfo['remotefile'] = 'stdout'

        # try to print out the monitoring service information in debug mode
        try:
            logger.debug('job info of monitoring service: %s' %
                         str(self.monInfo))
        except:
            pass

        # prepare input/output sandboxes
        import Ganga.Utility.files
        from Ganga.GPIDev.Lib.File import File
        from Ganga.Core.Sandbox.WNSandbox import PYTHON_DIR
        import inspect

        # ship the source file of Ganga.Utility.files in the PYTHON_DIR subdir
        fileutils = File(inspect.getsourcefile(Ganga.Utility.files),
                         subdir=PYTHON_DIR)
        packed_files = jobconfig.getSandboxFiles() + [fileutils]
        sandbox_files = job.createPackedInputSandbox(packed_files)

        # sandbox of child jobs should include master's sandbox
        sandbox_files.extend(master_job_sandbox)

        # check the input file size and pre-upload larger inputs to the iocache
        lfc_host = ''

        # URIs / local paths handed to the CE, and the matching file names
        input_sandbox_uris = []
        input_sandbox_names = []

        # becomes False as soon as one sandbox file cannot be prepared
        ick = True

        max_prestaged_fsize = 0
        for f in sandbox_files:

            idx = self.__check_and_prestage_inputfile__(f)

            if not idx:
                logger.error('input sandbox preparation failed: %s' % f)
                ick = False
                break
            else:

                if idx['lfc_host']:
                    lfc_host = idx['lfc_host']

                # file was pre-staged: refer to it by its remote index
                if idx['remote']:
                    abspath = os.path.abspath(f)
                    fsize = os.path.getsize(abspath)

                    if fsize > max_prestaged_fsize:
                        max_prestaged_fsize = fsize

                    input_sandbox_uris.append(
                        idx['remote'][os.path.basename(f)])

                    input_sandbox_names.append(os.path.basename(
                        urlparse(f)[2]))

                # file is small enough: refer to it by its local path
                if idx['local']:
                    input_sandbox_uris += idx['local']
                    input_sandbox_names.append(os.path.basename(f))

        if not ick:
            logger.error('stop job submission')
            return None

        # determine the lcg-cp timeout according to the max_prestaged_fsize
        # - using the assumption of 1 MB/sec.
        # NOTE(review): the two assignments below discard the values collected
        # in the loop above, so predict_timeout is always 0 and the LFC_HOST
        # put into the XRSL environment is always ''.  Possibly deliberate,
        # since the wrapper is given an empty 'remote' map below - confirm.
        max_prestaged_fsize = 0
        lfc_host = ''
        transfer_timeout = config['SandboxTransferTimeout']
        predict_timeout = int(math.ceil(max_prestaged_fsize / 1000000.0))

        if predict_timeout > transfer_timeout:
            transfer_timeout = predict_timeout

        # never go below one minute
        if transfer_timeout < 60:
            transfer_timeout = 60

        script = script.replace('###TRANSFERTIMEOUT###',
                                '%d' % transfer_timeout)

        # update the job wrapper with the inputsandbox list
        # (the wrapper is told about local names only; 'remote' stays empty)
        script = script.replace(
            '###INPUTSANDBOX###',
            repr({
                'remote': {},
                'local': input_sandbox_names
            }))

        # write out the job wrapper and put job wrapper into job's inputsandbox
        scriptPath = inpw.writefile(FileBuffer(
            '__jobscript_%s__' % job.getFQID('.'), script),
                                    executable=1)
        input_sandbox = input_sandbox_uris + [scriptPath]

        for isb in input_sandbox:
            logger.debug('ISB URI: %s' % isb)

        # compose output sandbox to include by default the following files:
        # - gzipped stdout (transferred only when the JobLogHandler is WMS)
        # - gzipped stderr (transferred only when the JobLogHandler is WMS)
        # - __jobscript__.log (job wrapper's log)
        output_sandbox = [wrapperlog]

        from Ganga.GPIDev.Lib.File.OutputFileManager import getOutputSandboxPatterns
        for outputSandboxPattern in getOutputSandboxPatterns(job):
            output_sandbox.append(outputSandboxPattern)

        if config['JobLogHandler'] in ['WMS']:
            output_sandbox += ['stdout.gz', 'stderr.gz']

        # the packed output tarball is only expected when an outputbox is set
        if len(jobconfig.outputbox):
            output_sandbox += [Sandbox.OUTPUT_TARBALL_NAME]

        # compose ARC XRSL
        xrsl = {
            #'VirtualOrganisation' : config['VirtualOrganisation'],
            'executable': os.path.basename(scriptPath),
            'environment': {
                'GANGA_LCG_VO': config['VirtualOrganisation'],
                'GANGA_LOG_HANDLER': config['JobLogHandler'],
                'LFC_HOST': lfc_host
            },
            #'stdout'                : 'stdout',
            #'stderr'                : 'stderr',
            'inputFiles': input_sandbox,
            'outputFiles': output_sandbox,
            #'OutputSandboxBaseDestURI': 'gsiftp://localhost'
        }

        xrsl['environment'].update({'GANGA_LCG_CE': self.CE})
        #xrsl['Requirements'] = self.requirements.merge(jobconfig.requirements).convert()

        # if self.jobtype.upper() in ['NORMAL','MPICH']:
        #xrsl['JobType'] = self.jobtype.upper()
        # if self.jobtype.upper() == 'MPICH':
        #xrsl['Requirements'].append('(other.GlueCEInfoTotalCPUs >= NodeNumber)')
        # xrsl['Requirements'].append('Member("MPICH",other.GlueHostApplicationSoftwareRunTimeEnvironment)')
        #xrsl['NodeNumber'] = self.requirements.nodenumber
        # else:
        #    logger.warning('JobType "%s" not supported' % self.jobtype)
        #    return

        # additional settings from the job
        # (entries of jobconfig.env override the defaults set above)
        if jobconfig.env:
            xrsl['environment'].update(jobconfig.env)

        xrslText = Grid.expandxrsl(xrsl)

        # append any additional requirements from the requirements object
        xrslText += '\n'.join(self.requirements.other)

        logger.debug('subjob XRSL: %s' % xrslText)
        return inpw.writefile(FileBuffer('__xrslfile__', xrslText))

    def kill(self):
        '''Cancel this job on the ARC CE.

        Returns False (after logging a warning) when the backend has no
        middleware job id; otherwise returns the result of
        ``Grid.arc_cancel`` for that id.
        '''
        job = self.getJobObject()

        logger.info('Killing job %s' % job.getFQID('.'))

        if self.id:
            return Grid.arc_cancel([self.id])

        logger.warning('Job %s is not running.' % job.getFQID('.'))
        return False

    def master_kill(self):
        '''Kill the job, choosing between the generic and the bulk path.

        A job that has no master but does have subjobs is killed through
        ``master_bulk_kill``.  Every other job (a subjob, or a job without
        subjobs) is handed to ``IBackend.master_kill``.
        '''
        job = self.getJobObject()

        is_split_master = (not job.master) and len(job.subjobs) != 0
        if is_split_master:
            return self.master_bulk_kill()

        return IBackend.master_kill(self)

    def master_bulk_kill(self):
        '''Cancel all submitted or running subjobs in one call.

        The middleware ids of the subjobs whose status is 'submitted' or
        'running' are collected and passed to ``Grid.arc_cancelMultiple``.
        On success every subjob whose id was in that list is set to 'killed'
        and True is returned; on failure a warning is logged and False is
        returned.
        '''
        job = self.getJobObject()

        # killing the individually re-submitted subjobs
        logger.debug('cancelling running/submitted subjobs.')

        # 1. collect job ids
        ids = [
            sj.backend.id for sj in job.subjobs
            if sj.status in ['submitted', 'running'] and sj.backend.id
        ]

        # 2. cancel the collected jobs
        if not Grid.arc_cancelMultiple(ids):
            logger.warning('Job cancellation failed')
            return False

        # 3. reflect the cancellation in the Ganga status
        for sj in job.subjobs:
            if sj.backend.id in ids:
                sj.updateStatus('killed')

        return True

    def master_bulk_submit(self, rjobs, subjobconfigs, masterjobconfig):
        '''Prepare and submit a collection of subjobs.

        rjobs           -- the subjobs to submit
        subjobconfigs   -- one job configuration per entry of rjobs
        masterjobconfig -- configuration shared through the master job

        Preparation is delegated to __mt_job_prepare__() and submission to
        __mt_bulk_submit__().  Returns False when preparation fails or when
        no job id comes back from the submission, True otherwise; subjobs
        missing from the submission result are only reported with a warning.
        '''
        from Ganga.Utility.logic import implies
        assert (implies(rjobs, len(subjobconfigs) == len(rjobs)))

        # job description files keyed by subjob id
        prepared = self.__mt_job_prepare__(rjobs, subjobconfigs,
                                           masterjobconfig)
        if not prepared:
            logger.error('Some jobs not successfully prepared')
            return False

        # flag every subjob as being submitted before going to the grid
        for sj in rjobs:
            sj.updateStatus('submitting')

        submitted = self.__mt_bulk_submit__(prepared)
        if not submitted:
            return False

        for sj in rjobs:
            if sj.id not in submitted:
                logger.warning('subjob %s not successfully submitted' %
                               sj.getFQID('.'))
                continue

            sj.backend.id = submitted[sj.id]
            sj.backend.CE = self.CE
            sj.backend.actualCE = sj.backend.CE
            sj.updateStatus('submitted')
            sj.info.submit_counter += 1

        return True

    def master_bulk_resubmit(self, rjobs):
        '''ARC bulk resubmission of already prepared subjobs.

        rjobs -- the subjobs to resubmit

        The job description written at preparation time is looked up in the
        input directory of every subjob and the whole collection is handed
        to __mt_bulk_submit__().  Returns True when the submission yields
        job ids, False otherwise; subjobs missing from the result are only
        reported with a warning.
        '''
        # Job preparation stores the description as '__xrslfile__'.  The
        # former '__jdlfile__' name is used only when such a file is the one
        # actually present in the input directory.
        node_xrsls = {}
        for sj in rjobs:
            xrslpath = os.path.join(sj.inputdir, '__xrslfile__')
            legacypath = os.path.join(sj.inputdir, '__jdlfile__')
            if not os.path.exists(xrslpath) and os.path.exists(legacypath):
                xrslpath = legacypath
            node_xrsls[sj.id] = xrslpath

        # set all subjobs to submitting status
        for sj in rjobs:
            sj.updateStatus('submitting')

        node_jids = self.__mt_bulk_submit__(node_xrsls)

        status = False

        if node_jids:
            for sj in rjobs:
                if sj.id in node_jids:
                    # wipe the information of the previous attempt before
                    # recording the new middleware id
                    self.__refresh_jobinfo__(sj)
                    sj.backend.id = node_jids[sj.id]
                    sj.backend.CE = self.CE
                    sj.backend.actualCE = sj.backend.CE
                    sj.updateStatus('submitted')
                    sj.info.submit_counter += 1
                else:
                    logger.warning('subjob %s not successfully submitted' %
                                   sj.getFQID('.'))

            status = True

        return status

    def master_submit(self, rjobs, subjobconfigs, masterjobconfig):
        '''Submit the master job to the grid.

        rjobs           -- the (sub)jobs to submit
        subjobconfigs   -- job configurations matching rjobs
        masterjobconfig -- configuration of the master job

        Raises GangaException when no CE is set on the backend and
        Grid.arc_info() returns a non-zero code.  A job without subjobs goes
        through the generic IBackend.master_submit(); otherwise
        master_bulk_submit() is used.  The result of the chosen submission
        call is returned.
        '''
        profiler = ElapsedTimeProfiler(getLogger(name='Profile.LCG'))
        profiler.start()

        job = self.getJobObject()

        # use arc info to check for any endpoints recorded in the config file
        rc, _output = Grid.arc_info()

        if self.CE:
            logger.info('ARC CE endpoint set to: ' + str(self.CE))
        elif rc != 0:
            raise GangaException(
                "ARC CE endpoint not set and no default settings in '%s'. " %
                config['ArcConfigFile'])
        else:
            logger.info("Using ARC CE endpoints defined in '%s'" %
                        config['ArcConfigFile'])

        # single job: generic path; job with subjobs: bulk path
        if len(job.subjobs) == 0:
            submit_call = IBackend.master_submit
        else:
            submit_call = self.master_bulk_submit.__func__

        ick = submit_call(self, rjobs, subjobconfigs, masterjobconfig)

        profiler.check('==> master_submit() elapsed time')

        return ick

    def submit(self, subjobconfig, master_job_sandbox):
        '''Submit the job to the grid.

        subjobconfig       -- configuration of the job to submit
        master_job_sandbox -- input sandbox shared through the master job

        Returns True when preparejob() produced a job description and
        Grid.arc_submit() returned a job id for it, False otherwise.
        '''
        xrslpath = self.preparejob(subjobconfig, master_job_sandbox)
        if not xrslpath:
            return False

        self.id = Grid.arc_submit(xrslpath, self.CE, self.verbose)
        if not self.id:
            return False

        # the requested CE is taken as the place where the job runs
        self.actualCE = self.CE
        return True

    def master_auto_resubmit(self, rjobs):
        """
        Resubmit each subjob on its own, because a bulk resubmission would
        overwrite the statuses of the previous master job.

        The exception is a master job in 'failed' state, which is resubmitted
        as a whole through master_resubmit().  Otherwise the subjobs are
        resubmitted one after the other, stopping at the first failure.
        Returns True only if every resubmission succeeded.
        """

        # check for master failure - in which case bulk resubmit
        if self._getParent().status == 'failed':
            return self.master_resubmit(rjobs)

        return all(sj.backend.master_resubmit([sj]) for sj in rjobs)

    def master_resubmit(self, rjobs):
        '''Resubmit the master job to the grid.

        rjobs -- the jobs to resubmit

        Subjobs and jobs without subjobs use the generic
        IBackend.master_resubmit(); a master job owning subjobs is
        resubmitted in bulk, and a failing bulk resubmission raises
        GangaException.  Returns the result of the resubmission call.
        '''
        profiler = ElapsedTimeProfiler(getLogger(name='Profile.LCG'))
        profiler.start()

        job = self.getJobObject()

        if job.master:
            # individual subjob resubmission
            logger.debug('mode: individual subjob resubmission')
            ick = IBackend.master_resubmit(self, rjobs)

        elif len(job.subjobs) == 0:
            # normal resubmission of a job that has no subjobs
            logger.debug('rjobs: %s' % str(rjobs))
            logger.debug('mode: master job normal resubmission')
            ick = IBackend.master_resubmit(self, rjobs)

        else:
            # bulk resubmission of a master job with subjobs
            logger.debug('mode: master job resubmission')

            ick = self.master_bulk_resubmit(rjobs)
            if not ick:
                raise GangaException('ARC bulk submission failure')

        profiler.check('job re-submission elapsed time')

        return ick

    def resubmit(self):
        '''Resubmit the job using the description written at preparation.

        Returns True when a job description path is available and
        Grid.arc_submit() returns a new job id for it, False otherwise.  On
        success the stale backend information is reset through
        __refresh_jobinfo__() before the actual CE is recorded.
        '''
        job = self.getJobObject()
        inpw = job.getInputWorkspace()

        # Job preparation stores the description as '__xrslfile__'.  The
        # former '__jdlfile__' name is used only when such a file is the one
        # actually present in the input workspace.
        xrslpath = inpw.getPath('__xrslfile__')
        if not (xrslpath and os.path.exists(xrslpath)):
            legacypath = inpw.getPath('__jdlfile__')
            if legacypath and os.path.exists(legacypath):
                xrslpath = legacypath

        if not xrslpath:
            return False

        self.id = Grid.arc_submit(xrslpath, self.CE, self.verbose)
        if not self.id:
            return False

        # refresh the job information left over from the previous attempt
        self.__refresh_jobinfo__(job)
        self.actualCE = self.CE
        return True

    @staticmethod
    def updateMonitoringInformation(jobs):
        '''Monitoring loop for normal jobs'''

        import datetime

        backenddict = {}
        jobdict = {}
        for j in jobs:
            if j.backend.id and (
                (datetime.datetime.utcnow() - j.time.timestamps["submitted"]
                 ).seconds > config["ArcWaitTimeBeforeStartingMonitoring"]):
                jobdict[j.backend.id] = j
                backenddict[j.backend.actualCE] = j

        if len(jobdict.keys()) == 0:
            return

        jobInfoDict = Grid.arc_status(jobdict.keys(), backenddict.keys())
        jidListForPurge = []

        # update job information for those available in jobInfoDict
        for id, info in jobInfoDict.items():

            if info:

                job = jobdict[id]

                if job.backend.actualCE != urlparse(id)[1].split(":")[0]:
                    job.backend.actualCE = urlparse(id)[1].split(":")[0]

                if job.backend.status != info['State']:

                    doStatusUpdate = True

                    # no need to update Ganga job status if backend status is
                    # not changed
                    if info['State'] == job.backend.status:
                        doStatusUpdate = False

                    # download output sandboxes if final status is reached
                    elif info['State'] in [
                            'Finished', '(FINISHED)', 'Finished (FINISHED)'
                    ]:

                        # grab output sandbox
                        if Grid.arc_get_output(
                                job.backend.id,
                                job.getOutputWorkspace(create=True).getPath()):
                            (ick, app_exitcode) = Grid.__get_app_exitcode__(
                                job.getOutputWorkspace(create=True).getPath())
                            job.backend.exitcode = app_exitcode

                            jidListForPurge.append(job.backend.id)

                        else:
                            logger.error('fail to download job output: %s' %
                                         jobdict[id].getFQID('.'))

                    if doStatusUpdate:
                        job.backend.status = info['State']
                        if 'Exit Code' in info:
                            try:
                                job.backend.exitcode_arc = int(
                                    info['Exit Code'])
                            except:
                                job.backend.exitcode_arc = 1

                        if 'Job Error' in info:
                            try:
                                job.backend.reason = info['Job Error']
                            except:
                                pass

                        job.backend.updateGangaJobStatus()
            else:
                logger.warning('fail to retrieve job informaton: %s' %
                               jobdict[id].getFQID('.'))

        # purging the jobs the output has been fetched locally
        if jidListForPurge:
            if not Grid.arc_purgeMultiple(jidListForPurge):
                logger.warning("Failed to purge all ARC jobs.")

    def updateGangaJobStatus(self):
        '''Map the backend job status onto the Ganga job status.

        'Running...' and 'Finishing...' become 'running'.  'Finished...'
        becomes 'failed' when the application or the ARC exit code is set
        and non-zero (the reason is recorded on the backend), 'completed'
        otherwise.  A few exact failure states become 'failed', 'CANCELLED'
        becomes 'killed', 'Queuing...' changes nothing, and anything else is
        reported with a warning.
        '''
        job = self.getJobObject()
        state = self.status

        if state.startswith(('Running', 'Finishing')):
            job.updateStatus('running')
            return

        if state.startswith('Finished'):
            backend = job.backend
            # the application exit code takes precedence over the ARC one
            if backend.exitcode and backend.exitcode != 0:
                backend.reason = 'non-zero app. exit code: %s' % repr(
                    backend.exitcode)
                job.updateStatus('failed')
            elif backend.exitcode_arc and backend.exitcode_arc != 0:
                backend.reason = 'non-zero ARC job exit code: %s' % repr(
                    backend.exitcode_arc)
                job.updateStatus('failed')
            else:
                job.updateStatus('completed')
            return

        if state in ('DONE-FAILED', 'ABORTED', 'UNKNOWN', 'Failed'):
            job.updateStatus('failed')
        elif state == 'CANCELLED':
            job.updateStatus('killed')
        elif not state.startswith('Queuing'):
            logger.warning('Unexpected job status "%s"', self.status)
Exemplo n.º 14
0
class CREAM(IBackend):
    '''CREAM backend - direct job submission to gLite CREAM CE'''
    # Attributes of the backend object (schema version 1.0).  'CE',
    # 'jobtype', 'requirements' and 'sandboxcache' carry no 'protected'
    # flag; the remaining items are declared protected=1, copyable=0 and
    # hold the middleware state of a submission (presumably filled in by
    # the backend itself - see __refresh_jobinfo__, which resets most of
    # them).
    _schema = Schema(Version(1,0), {
        'CE'                  : SimpleItem(defvalue='',doc='CREAM CE endpoint'),
        'jobtype'             : SimpleItem(defvalue='Normal',doc='Job type: Normal, MPICH'),
        'requirements'        : ComponentItem('LCGRequirements',doc='Requirements for the resource selection'),
        'sandboxcache'        : ComponentItem('GridSandboxCache',copyable=1,doc='Interface for handling oversized input sandbox'),
        'id'                  : SimpleItem(defvalue='',typelist=['str','list'],protected=1,copyable=0,doc='Middleware job identifier'),
        'status'              : SimpleItem(defvalue='',typelist=['str','dict'], protected=1,copyable=0,doc='Middleware job status'),
        'exitcode'            : SimpleItem(defvalue='',protected=1,copyable=0,doc='Application exit code'),
        'exitcode_cream'      : SimpleItem(defvalue='',protected=1,copyable=0,doc='Middleware exit code'),
        'actualCE'            : SimpleItem(defvalue='',protected=1,copyable=0,doc='The CREAM CE where the job actually runs.'),
        'reason'              : SimpleItem(defvalue='',protected=1,copyable=0,doc='Reason of causing the job status'),
        'workernode'          : SimpleItem(defvalue='',protected=1,copyable=0,doc='The worker node on which the job actually runs.'),
        'isbURI'              : SimpleItem(defvalue='',protected=1,copyable=0,doc='The input sandbox URI on CREAM CE'),
        'osbURI'              : SimpleItem(defvalue='',protected=1,copyable=0,doc='The output sandbox URI on CREAM CE')
    })

    # category under which this class is registered
    _category = 'backends'

    # name of this backend
    _name =  'CREAM'

    def __init__(self):
        '''Create the backend and attach its requirements and sandbox cache.

        The classes named by config['Requirements'] and
        config['SandboxCache'] are imported dynamically and instantiated.
        Loading is best effort: when it fails the requirements object is
        left as it is and the sandbox cache stays a GridftpSandboxCache.
        '''
        super(CREAM, self).__init__()

        # dynamic requirement object loading
        try:
            reqPath = config['Requirements']
            reqName = reqPath.split('.').pop()
            reqModule = __import__(reqPath, globals(), locals(), [reqPath])
            self.requirements = vars(reqModule)[reqName]()

            logger.debug('load %s as LCGRequirements' % reqName)
        except Exception:
            # only ordinary errors are tolerated here; KeyboardInterrupt
            # and SystemExit must still propagate
            logger.debug('load default LCGRequirements')

        # dynamic sandbox cache object loading
        ## GridftpSandboxCache is installed first so that it remains in
        ## place when the configured class cannot be loaded
        self.sandboxcache = GridftpSandboxCache()
        try:
            scPath = config['SandboxCache']
            scName = scPath.split('.').pop()
            scModule = __import__(scPath, globals(), locals(), [scPath])
            self.sandboxcache = vars(scModule)[scName]()
            logger.debug('load %s as SandboxCache' % scName)
        except Exception:
            logger.debug('load default SandboxCache')

    def __refresh_jobinfo__(self,job):
        '''Reset the middleware information kept on the backend of *job*.

        Called after a resubmission so that the values of the previous
        attempt do not linger; every listed attribute is set to ''.
        '''
        backend = job.backend
        for attr in ('status', 'reason', 'actualCE', 'exitcode',
                     'exitcode_cream', 'workernode', 'isbURI', 'osbURI'):
            setattr(backend, attr, '')

    def __setup_sandboxcache__(self, job):
        '''Configure the sandbox cache object from the runtime configuration.

        job -- the job owning this backend; its subjobs inherit the dataset
               name when a DQ2SandboxCache is used

        Returns False only when a GridftpSandboxCache is in use and no base
        URI can be derived (neither config['CreamInputSandboxBaseURI'] nor a
        CE is set); True in every other case.
        '''
        token_pattern = re.compile('^token:(.*):(.*)$')

        cache = self.sandboxcache
        cache.vo = config['VirtualOrganisation']
        cache.middleware = 'GLITE'
        cache.timeout = config['SandboxTransferTimeout']

        if cache._name == 'LCGSandboxCache':
            if not cache.lfc_host:
                cache.lfc_host = grids[cache.middleware].__get_lfc_host__()

            if not cache.se:
                # config['DefaultSE'] is either a plain host or
                # 'token:<srm token>:<host>'
                srm_token = ''
                se_host = config['DefaultSE']
                found = token_pattern.match(se_host)
                if found:
                    srm_token, se_host = found.group(1), found.group(2)

                cache.se = se_host

                if srm_token:
                    cache.srm_token = srm_token

            if cache.se_type in ['srmv2'] and not cache.srm_token:
                cache.srm_token = config['DefaultSRMToken']

        elif cache._name == 'DQ2SandboxCache':
            ## generate a new dataset name if not given
            if not cache.dataset_name:
                from GangaAtlas.Lib.ATLASDataset.DQ2Dataset import dq2outputdatasetname
                cache.dataset_name, _ignored = dq2outputdatasetname("%s.input"%get_uuid(), 0, False, '')

            ## subjobs inherit the dataset name from the master job
            for sj in job.subjobs:
                sj.backend.sandboxcache.dataset_name = cache.dataset_name

        elif cache._name == 'GridftpSandboxCache':
            configured_uri = config['CreamInputSandboxBaseURI']
            if configured_uri:
                cache.baseURI = configured_uri
            elif self.CE:
                # host of the CE endpoint with any port number removed
                ce_host = re.sub(r'\:[0-9]+','',self.CE.split('/cream')[0])
                cache.baseURI = 'gsiftp://%s/opt/glite/var/cream_sandbox/%s' % ( ce_host, cache.vo )
            else:
                logger.error('baseURI not available for GridftpSandboxCache')
                return False

        return True

    def __check_and_prestage_inputfile__(self, file):
        '''Check the size of an input file and pre-stage it when oversized.

        file -- path of the local file

        A file larger than config['BoundSandboxLimit'] is uploaded through
        the sandbox cache, unless a file with the same md5sum is already
        among the cached files of this backend or of the master job, in
        which case the index of that earlier upload is reused.  Smaller
        files stay local.

        Returns a dictionary describing how to refer to the file:

            idx = {'lfc_host': lfc_host,
                   'local': [local file paths],
                   'remote': {'fname1': 'remote index1', ... }
                  }

        or None when the upload failed.
        '''
        job = self.getJobObject()

        ## previously uploaded files: those of the master job first
        known_uploads = []
        if job.master:
            known_uploads.extend(job.master.backend.sandboxcache.get_cached_files())

        ## configure the cache (incl. its LFC host) before reading its files
        self.__setup_sandboxcache__(job)
        known_uploads.extend(self.sandboxcache.get_cached_files())

        ## an LCGSandboxCache carries its own LFC host; in any other case,
        ## or when that one is empty, the Grid object is queried
        lfc_host = None
        if self.sandboxcache._name == 'LCGSandboxCache':
            lfc_host = self.sandboxcache.lfc_host
        if not lfc_host:
            lfc_host = grids[self.sandboxcache.middleware.upper()].__get_lfc_host__()

        idx = {'lfc_host': lfc_host, 'local': [], 'remote': {}}

        abspath = os.path.abspath(file)
        size_limit = config['BoundSandboxLimit']

        ## small enough: the file travels with the regular sandbox
        if os.path.getsize(abspath) <= size_limit:
            idx['local'].append(abspath)
            return idx

        md5sum = get_md5sum(abspath, ignoreGzipTimestamp=True)

        ## the same content has been uploaded before: reuse that index
        for uf in known_uploads:
            if uf.md5sum == md5sum:
                idx['remote'][os.path.basename(file)] = uf.id
                return idx

        logger.warning('The size of %s is larger than the sandbox limit (%d byte). Please wait while pre-staging ...' % (file,config['BoundSandboxLimit']) )

        if not self.sandboxcache.upload( [abspath] ):
            logger.error('Oversized sandbox not successfully pre-staged')
            return None

        ## the newest cached file is the one just uploaded
        remote_sandbox = self.sandboxcache.get_cached_files()[-1]
        idx['remote'][remote_sandbox.name] = remote_sandbox.id

        return idx

    def __mt_job_prepare__(self, rjobs, subjobconfigs, masterjobconfig):
        '''Prepare the subjobs in multiple threads.

        rjobs           -- the subjobs to prepare
        subjobconfigs   -- one job configuration per entry of rjobs
        masterjobconfig -- configuration of the master job

        The master input sandbox is prepared (and pre-staged when oversized)
        first; afterwards preparejob() is run for every subjob by an
        MTRunner with 10 threads.

        Returns the runner results (job description paths keyed by subjob
        id), or None when the master sandbox could not be pre-staged or not
        every subjob was prepared.
        '''

        logger.warning('preparing %d subjobs ... it may take a while' % len(rjobs))

        # prepare the master job (i.e. create shared inputsandbox, etc.)
        master_input_sandbox = IBackend.master_prepare(self, masterjobconfig)

        ## uploading the master job if it's over the WMS sandbox limitation
        for f in master_input_sandbox:
            if not self.__check_and_prestage_inputfile__(f):
                logger.error('master input sandbox perparation failed: %s' % f)
                return None

        # the algorithm for preparing a single bulk job
        class MyAlgorithm(Algorithm):

            def __init__(self):
                Algorithm.__init__(self)

            def process(self, sj_info):
                # sj_info is a [subjob config, subjob] pair
                my_sc, my_sj = sj_info[0], sj_info[1]

                try:
                    logger.debug("preparing job %s" % my_sj.getFQID('.'))
                    jdlpath = my_sj.backend.preparejob(my_sc, master_input_sandbox)

                    if (not jdlpath) or (not os.path.exists(jdlpath)):
                        raise GangaException('job %s not properly prepared' % my_sj.getFQID('.'))

                    self.__appendResult__( my_sj.id, jdlpath )
                    return True
                except Exception:
                    # a failing subjob is logged and reported as not done;
                    # it must not take down the worker thread
                    log_user_exception()
                    return False

        mt_data = [[sc, sj] for sc, sj in zip(subjobconfigs, rjobs)]

        myAlg  = MyAlgorithm()
        myData = Data(collection=mt_data)

        runner = MTRunner(name='lcg_jprepare', algorithm=myAlg, data=myData, numThread=10)
        runner.start()
        runner.join(-1)

        # every item has to be processed successfully
        if len(runner.getDoneList()) < len(mt_data):
            return None

        # a JDL file dictionary with subjob ids as keys, JDL file paths as values
        return runner.getResults()