def submit_workers(self, workspec_list):
        tmpLog = self.make_logger(baseLogger, method_name='submit_workers')

        nWorkers = len(workspec_list)
        tmpLog.debug('start nWorkers={0}'.format(nWorkers))

        # whether to submit any worker
        to_submit_any = True

        # get log subdirectory name from timestamp
        timeNow = datetime.datetime.utcnow()
        log_subdir = timeNow.strftime('%y-%m-%d_%H')
        log_subdir_path = os.path.join(self.logDir, log_subdir)
        if self.condorSchedd is None or not self.useSpool:
            try:
                os.mkdir(log_subdir_path)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise

        # get info from harvester queue config
        _queueConfigMapper = QueueConfigMapper()
        harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

        is_grandly_unified_queue = False
        # get queue info from AGIS by cacher in db
        if self.useAtlasAGIS:
            panda_queues_dict = PandaQueuesDict()
            panda_queue_name = panda_queues_dict.get_panda_queue_name(
                self.queueName)
            this_panda_queue_dict = panda_queues_dict.get(
                self.queueName, dict())
            is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(
                self.queueName)
            # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
        else:
            panda_queues_dict = dict()
            panda_queue_name = self.queueName
            this_panda_queue_dict = dict()

        # get default information from queue info
        n_core_per_node_from_queue = this_panda_queue_dict.get('corecount') or 1
        is_unified_queue = this_panda_queue_dict.get('capability',
                                                     '') == 'ucore'
        pilot_version_orig = str(this_panda_queue_dict.get(
            'pilot_version', ''))
        pilot_version_suffix_str = '_pilot2' if pilot_version_orig == '2' else ''

        # get override requirements from queue configured
        try:
            n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
        except AttributeError:
            n_core_per_node = n_core_per_node_from_queue

        # deal with Condor schedd and central managers; make a randomized list to choose from
        n_bulks = _div_round_up(nWorkers, self.minBulkToRamdomizedSchedd)
        if isinstance(self.condorSchedd, list) and len(self.condorSchedd) > 0:
            if isinstance(self.condorPool, list) and len(self.condorPool) > 0:
                orig_list = list(zip(self.condorSchedd, self.condorPool))
            else:
                orig_list = [(_schedd, self.condorPool)
                             for _schedd in self.condorSchedd]
            if n_bulks < len(orig_list):
                schedd_pool_choice_list = random.sample(orig_list, n_bulks)
            else:
                schedd_pool_choice_list = orig_list
        else:
            schedd_pool_choice_list = [(self.condorSchedd, self.condorPool)]

        # deal with CE
        special_par = ''
        ce_weighting = None
        if self.useAtlasGridCE:
            # If ATLAS Grid CE mode used
            tmpLog.debug('Using ATLAS Grid CE mode...')
            queues_from_queue_list = this_panda_queue_dict.get('queues', [])
            special_par = this_panda_queue_dict.get('special_par', '')
            ce_auxilary_dict = {}
            for _queue_dict in queues_from_queue_list:
                if not (_queue_dict.get('ce_endpoint') and str(
                        _queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                        and str(_queue_dict.get('ce_flavour', '')).lower()
                        in set(['arc-ce', 'cream-ce', 'htcondor-ce'])):
                    continue
                ce_endpoint = _queue_dict.get('ce_endpoint')
                if (ce_endpoint in ce_auxilary_dict
                        and str(_queue_dict.get('ce_queue_name',
                                                '')).lower() == 'default'):
                    pass
                else:
                    ce_auxilary_dict[ce_endpoint] = _queue_dict
            # qualified CEs from AGIS info
            n_qualified_ce = len(ce_auxilary_dict)
            if n_qualified_ce > 0:
                # Get CE weighting
                tmpLog.debug('Get CE weighting')
                worker_ce_all_tuple = self.get_ce_statistics(
                    self.queueName, nWorkers)
                ce_weighting = _get_ce_weighting(
                    ce_endpoint_list=list(ce_auxilary_dict.keys()),
                    worker_ce_all_tuple=worker_ce_all_tuple)
                stats_weighting_display_str = _get_ce_stats_weighting_display(
                    ce_auxilary_dict.keys(), worker_ce_all_tuple, ce_weighting)
                tmpLog.debug('CE stats and weighting: {0}'.format(
                    stats_weighting_display_str))
            else:
                tmpLog.error('No valid CE endpoint found')
                to_submit_any = False

        def _handle_one_worker(workspec, to_submit=to_submit_any):
            # make logger
            tmpLog = core_utils.make_logger(baseLogger,
                                            'workerID={0}'.format(
                                                workspec.workerID),
                                            method_name='_handle_one_worker')
            ce_info_dict = dict()
            batch_log_dict = dict()
            data = {
                'workspec': workspec,
                'to_submit': to_submit,
            }
            if to_submit:
                if self.useAtlasGridCE:
                    # choose a CE
                    tmpLog.info('choose a CE...')
                    ce_chosen = _choose_ce(ce_weighting)
                    try:
                        ce_info_dict = ce_auxilary_dict[ce_chosen].copy()
                    except KeyError:
                        tmpLog.info(
                            'Problem choosing CE with weighting. Choose an arbitrary CE endpoint'
                        )
                        ce_info_dict = random.choice(
                            list(ce_auxilary_dict.values())).copy()
                    # gather info about the CE; ignore protocol prefix in ce_endpoint
                    ce_endpoint_from_queue = re.sub(
                        r'^\w+://', '', ce_info_dict.get('ce_endpoint', ''))
                    ce_flavour_str = str(ce_info_dict.get('ce_flavour',
                                                          '')).lower()
                    ce_version_str = str(ce_info_dict.get('ce_version',
                                                          '')).lower()
                    ce_info_dict['ce_hostname'] = re.sub(
                        r':\w*', '', ce_endpoint_from_queue)
                    if ce_info_dict['ce_hostname'] == ce_endpoint_from_queue:
                        # add default port to ce_endpoint if missing
                        default_port_map = {
                            'cream-ce': 8443,
                            'arc-ce': 2811,
                            'htcondor-ce': 9619,
                        }
                        if ce_flavour_str in default_port_map:
                            default_port = default_port_map[ce_flavour_str]
                            ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(
                                ce_endpoint_from_queue, default_port)
                    tmpLog.debug(
                        'For site {0} got pilot version: "{1}"; CE endpoint: "{2}", flavour: "{3}"'
                        .format(self.queueName, pilot_version_orig,
                                ce_endpoint_from_queue, ce_flavour_str))
                    if not self.templateFile and os.path.isdir(
                            self.CEtemplateDir) and ce_flavour_str:
                        sdf_template_filename = '{ce_flavour_str}{pilot_version_suffix_str}.sdf'.format(
                            ce_flavour_str=ce_flavour_str,
                            pilot_version_suffix_str=pilot_version_suffix_str)
                        self.templateFile = os.path.join(
                            self.CEtemplateDir, sdf_template_filename)
                else:
                    try:
                        # Manually define site condor schedd as ceHostname and central manager as ceEndpoint
                        if self.ceHostname and isinstance(
                                self.ceHostname,
                                list) and len(self.ceHostname) > 0:
                            if isinstance(self.ceEndpoint,
                                          list) and len(self.ceEndpoint) > 0:
                                ce_info_dict['ce_hostname'], ce_info_dict[
                                    'ce_endpoint'] = random.choice(
                                        list(
                                            zip(self.ceHostname,
                                                self.ceEndpoint)))
                            else:
                                ce_info_dict['ce_hostname'] = random.choice(
                                    self.ceHostname)
                                ce_info_dict['ce_endpoint'] = self.ceEndpoint
                        else:
                            ce_info_dict['ce_hostname'] = self.ceHostname
                            ce_info_dict['ce_endpoint'] = self.ceEndpoint
                    except AttributeError:
                        pass
                    try:
                        # Manually define ceQueueName
                        if self.ceQueueName:
                            ce_info_dict['ce_queue_name'] = self.ceQueueName
                    except AttributeError:
                        pass
                # template for batch script
                try:
                    with open(self.templateFile) as tmpFile:
                        sdf_template_raw = tmpFile.read()
                except AttributeError:
                    tmpLog.error(
                        'No valid templateFile found. Maybe templateFile, CEtemplateDir invalid, or no valid CE found'
                    )
                    to_submit = False
                    data['to_submit'] = to_submit
                    return data
                else:
                    # get batch_log, stdout, and stderr filenames, and remove commented lines
                    sdf_template_str_list = []
                    for _line in sdf_template_raw.split('\n'):
                        if _line.startswith('#'):
                            continue
                        sdf_template_str_list.append(_line)
                        _match_batch_log = re.match('log = (.+)', _line)
                        _match_stdout = re.match('output = (.+)', _line)
                        _match_stderr = re.match('error = (.+)', _line)
                        if _match_batch_log:
                            batch_log_value = _match_batch_log.group(1)
                            continue
                        if _match_stdout:
                            stdout_value = _match_stdout.group(1)
                            continue
                        if _match_stderr:
                            stderr_value = _match_stderr.group(1)
                            continue
                    sdf_template = '\n'.join(sdf_template_str_list)
                    # Choose from Condor schedd and central managers
                    condor_schedd, condor_pool = random.choice(
                        schedd_pool_choice_list)
                    # set submissionHost
                    if not condor_schedd and not condor_pool:
                        workspec.submissionHost = 'LOCAL'
                    else:
                        workspec.submissionHost = '{0},{1}'.format(
                            condor_schedd, condor_pool)
                    tmpLog.debug('set submissionHost={0}'.format(
                        workspec.submissionHost))
                    # Log Base URL
                    if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL:
                        schedd_hostname = re.sub(
                            r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?',
                            lambda matchobj: matchobj.group(1)
                            if matchobj.group(1) else '', condor_schedd)
                        log_base_url = re.sub(r'\[ScheddHostname\]',
                                              schedd_hostname, self.logBaseURL)
                    else:
                        log_base_url = self.logBaseURL
                    # URLs for log files
                    if log_base_url is not None:
                        if workspec.batchID:
                            batchID = workspec.batchID
                            guess = False
                        else:
                            batchID = ''
                            guess = True
                        batch_log_filename = parse_batch_job_filename(
                            value_str=batch_log_value,
                            file_dir=log_subdir_path,
                            batchID=batchID,
                            guess=guess)
                        stdout_path_file_name = parse_batch_job_filename(
                            value_str=stdout_value,
                            file_dir=log_subdir_path,
                            batchID=batchID,
                            guess=guess)
                        stderr_path_filename = parse_batch_job_filename(
                            value_str=stderr_value,
                            file_dir=log_subdir_path,
                            batchID=batchID,
                            guess=guess)
                        batch_log = '{0}/{1}/{2}'.format(
                            log_base_url, log_subdir, batch_log_filename)
                        batch_stdout = '{0}/{1}/{2}'.format(
                            log_base_url, log_subdir, stdout_path_file_name)
                        batch_stderr = '{0}/{1}/{2}'.format(
                            log_base_url, log_subdir, stderr_path_filename)
                        workspec.set_log_file('batch_log', batch_log)
                        workspec.set_log_file('stdout', batch_stdout)
                        workspec.set_log_file('stderr', batch_stderr)
                        batch_log_dict['batch_log'] = batch_log
                        batch_log_dict['batch_stdout'] = batch_stdout
                        batch_log_dict['batch_stderr'] = batch_stderr
                        batch_log_dict['gtag'] = workspec.workAttributes[
                            'stdOut']
                        tmpLog.debug('Done set_log_file before submission')
                    tmpLog.debug('Done jobspec attribute setting')

                # choose the x509 certificate based on the type of job (analysis or production)
                proxy = _choose_proxy(workspec)

                # set data dict
                data.update({
                    'workspec': workspec,
                    'to_submit': to_submit,
                    'template': sdf_template,
                    'executable_file': self.executableFile,
                    'log_dir': self.logDir,
                    'log_subdir': log_subdir,
                    'n_core_per_node': n_core_per_node,
                    'panda_queue_name': panda_queue_name,
                    'x509_user_proxy': proxy,
                    'ce_info_dict': ce_info_dict,
                    'batch_log_dict': batch_log_dict,
                    'special_par': special_par,
                    'harvester_queue_config': harvester_queue_config,
                    'is_unified_queue': is_unified_queue,
                    'condor_schedd': condor_schedd,
                    'condor_pool': condor_pool,
                    'use_spool': self.useSpool,
                    'pilot_version': pilot_version_orig,
                })
            return data

        def _choose_proxy(workspec):
            """
            Choose the proxy based on the job type
            """
            job_type = workspec.jobType
            proxy = self.x509UserProxy
            if is_grandly_unified_queue and job_type in (
                    'user', 'panda',
                    'analysis') and self.x509UserProxyAnalysis:
                tmpLog.debug('Taking analysis proxy')
                proxy = self.x509UserProxyAnalysis
            else:
                tmpLog.debug('Taking default proxy')

            return proxy

        def _propagate_attributes(workspec, tmpVal):
            # make logger
            tmpLog = core_utils.make_logger(
                baseLogger,
                'workerID={0}'.format(workspec.workerID),
                method_name='_propagate_attributes')
            (retVal, tmpDict) = tmpVal
            workspec.set_attributes_with_dict(tmpDict)
            tmpLog.debug('Done workspec attributes propagation')
            return retVal

        tmpLog.debug('finished preparing worker attributes')

        # map(_handle_one_worker, workspec_list)
        with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
            dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
        tmpLog.debug('{0} workers handled'.format(nWorkers))

        # submit
        retValList = submit_bag_of_workers(list(dataIterator))
        tmpLog.debug('{0} workers submitted'.format(nWorkers))

        # propagate changed attributes
        with ThreadPoolExecutor(self.nProcesses) as thread_pool:
            retIterator = thread_pool.map(
                lambda _wv_tuple: _propagate_attributes(*_wv_tuple),
                zip(workspec_list, retValList))

        retList = list(retIterator)
        tmpLog.debug('done')

        return retList
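The helpers _div_round_up and _choose_ce used above are defined elsewhere in the submitter module. A minimal sketch of plausible implementations, assuming ceiling division for the bulk count and a weight-proportional random pick for the CE endpoint (the real helpers may differ in detail):

import random

def _div_round_up(numerator, denominator):
    # ceiling integer division, e.g. 10 workers with minBulkToRamdomizedSchedd=3 -> 4 bulks
    return (numerator + denominator - 1) // denominator

def _choose_ce(ce_weighting):
    # assume ce_weighting maps CE endpoint -> non-negative weight;
    # pick one endpoint with probability proportional to its weight
    endpoints = list(ce_weighting.keys())
    weights = [ce_weighting[ce] for ce in endpoints]
    total = sum(weights)
    if total <= 0:
        return random.choice(endpoints)
    threshold = random.uniform(0, total)
    cumulative = 0
    for ce, weight in zip(endpoints, weights):
        cumulative += weight
        if threshold <= cumulative:
            return ce
    return endpoints[-1]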
Example #2
    def submit_k8s_worker(self, work_spec):
        tmp_log = self.make_logger(base_logger, method_name='submit_k8s_worker')

        # get info from harvester queue config
        _queueConfigMapper = QueueConfigMapper()
        harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

        # set the stdout log file
        log_file_name = '{0}_{1}.out'.format(harvester_config.master.harvester_id, work_spec.workerID)
        work_spec.set_log_file('stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name))
        # TODO: consider if we want to upload the yaml file to PanDA cache

        yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file)
        try:

            # read the job configuration (if available, only push model)
            job_fields, job_pars_parsed = self.read_job_configuration(work_spec)

            # decide container image and executable to run. In pull mode, defaults are provided
            container_image = self.decide_container_image(job_fields, job_pars_parsed)
            executable, args = self.build_executable(job_fields, job_pars_parsed)
            tmp_log.debug('container_image: "{0}"; executable: "{1}"; args: "{2}"'.format(container_image, executable,
                                                                                          args))

            # choose the appropriate proxy
            panda_queues_dict = PandaQueuesDict()
            this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())

            is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(self.queueName)
            cert = self._choose_proxy(work_spec, is_grandly_unified_queue)
            if not cert:
                err_str = 'No proxy specified in proxySecretPath. Not submitted'
                tmp_return_value = (False, err_str)
                return tmp_return_value

            # get the walltime limit
            try:
                max_time = this_panda_queue_dict['maxtime']
            except Exception as e:
                tmp_log.warning('Could not retrieve maxtime field for queue {0}'.format(self.queueName))
                max_time = None

            associated_params_dict = {}
            for key, val in panda_queues_dict.get_harvester_params(self.queueName).items():
                if key in self._allowed_agis_attrs:
                    associated_params_dict[key] = val

            pilot_url = associated_params_dict.get('pilot_url')
            pilot_version = str(this_panda_queue_dict.get('pilot_version', 'current'))
            python_version = str(this_panda_queue_dict.get('python_version', '2'))

            # prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)
            pilot_opt_dict = submitter_common.get_complicated_pilot_options(work_spec.pilotType)
            if pilot_opt_dict is None:
                prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)
                pilot_type = work_spec.pilotType
                pilot_url_str = '--piloturl {0}'.format(pilot_url) if pilot_url else ''
            else:
                prod_source_label = pilot_opt_dict['prod_source_label']
                pilot_type = pilot_opt_dict['pilot_type_opt']
                pilot_url_str = pilot_opt_dict['pilot_url_str']

            pilot_python_option = submitter_common.get_python_version_option(python_version, prod_source_label)

            # submit the worker
            rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(yaml_content, work_spec, prod_source_label,
                                                                           pilot_type, pilot_url_str,
                                                                           pilot_python_option,
                                                                           container_image, executable, args, cert,
                                                                           cpu_adjust_ratio=self.cpuAdjustRatio,
                                                                           memory_adjust_ratio=self.memoryAdjustRatio,
                                                                           max_time=max_time)
        except Exception as _e:
            tmp_log.error(traceback.format_exc())
            err_str = 'Failed to create a JOB; {0}'.format(_e)
            tmp_return_value = (False, err_str)
        else:
            work_spec.batchID = yaml_content['metadata']['name']
            tmp_log.debug('Created worker {0} with batchID={1}'.format(work_spec.workerID, work_spec.batchID))
            tmp_return_value = (True, '')

        return tmp_return_value
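The _choose_proxy call above returns a single credential; its implementation is not shown. A hypothetical sketch, mirroring the grandly-unified rule from the condor submitter above and using proxySecretPath (named in the error message) plus an assumed analysis-specific proxySecretPathAnalysis attribute:

def _choose_proxy(self, workspec, is_grandly_unified_queue):
    # hypothetical sketch of a submitter method; attribute names beyond
    # proxySecretPath are illustrative assumptions
    cert = getattr(self, 'proxySecretPath', None)
    if is_grandly_unified_queue and workspec.jobType in ('user', 'panda', 'analysis'):
        # prefer an analysis credential for analysis-type jobs, if configured
        cert = getattr(self, 'proxySecretPathAnalysis', None) or cert
    return cert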
Example #3
    def submit_k8s_worker(self, work_spec):
        tmp_log = self.make_logger(base_logger,
                                   method_name='submit_k8s_worker')

        # get info from harvester queue config
        _queueConfigMapper = QueueConfigMapper()
        harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)
        prod_source_label = harvester_queue_config.get_source_label(
            work_spec.jobType)

        # set the stdout log file
        log_file_name = '{0}_{1}.out'.format(
            harvester_config.master.harvester_id, work_spec.workerID)
        work_spec.set_log_file(
            'stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name))
        # TODO: consider if we want to upload the yaml file to PanDA cache

        yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file)
        try:

            # read the job configuration (if available, only push model)
            job_fields, job_pars_parsed = self.read_job_configuration(
                work_spec)

            # decide container image and executable to run. In pull mode, defaults are provided
            container_image = self.decide_container_image(
                job_fields, job_pars_parsed)
            executable, args = self.build_executable(job_fields,
                                                     job_pars_parsed)
            tmp_log.debug(
                'container_image: "{0}"; executable: "{1}"; args: "{2}"'.
                format(container_image, executable, args))

            # choose the appropriate proxy
            panda_queues_dict = PandaQueuesDict()
            is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(
                self.queueName)
            cert, use_secret = self._choose_proxy(work_spec,
                                                  is_grandly_unified_queue)
            if not cert:
                err_str = 'No proxy specified in proxySecretPath or x509UserProxy. Not submitted'
                tmp_return_value = (False, err_str)
                return tmp_return_value

            # get the walltime limit
            try:
                max_time = panda_queues_dict.get(self.queueName)['maxtime']
            except Exception as e:
                tmp_log.warning(
                    'Could not retrieve maxtime field for queue {0}'.format(
                        self.queueName))
                max_time = None

            # submit the worker
            rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(
                yaml_content,
                work_spec,
                prod_source_label,
                container_image,
                executable,
                args,
                cert,
                cert_in_secret=use_secret,
                cpu_adjust_ratio=self.cpuAdjustRatio,
                memory_adjust_ratio=self.memoryAdjustRatio,
                max_time=max_time)
        except Exception as _e:
            tmp_log.error(traceback.format_exc())
            err_str = 'Failed to create a JOB; {0}'.format(_e)
            tmp_return_value = (False, err_str)
        else:
            work_spec.batchID = yaml_content['metadata']['name']
            tmp_log.debug('Created worker {0} with batchID={1}'.format(
                work_spec.workerID, work_spec.batchID))
            tmp_return_value = (True, '')

        return tmp_return_value
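Both k8s examples submit one worker at a time; in harvester a submit_workers entry point typically fans out over the workspec list. A minimal sketch of such a driver, using a thread pool as the condor submitter above does (method placement and error handling are assumptions):

from concurrent.futures import ThreadPoolExecutor

def submit_workers(self, workspec_list):
    # sketch: one submit_k8s_worker call per worker, run in parallel
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        ret_list = list(thread_pool.map(self.submit_k8s_worker, workspec_list))
    return ret_list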
Example #4
    def run(self):
        while True:
            mainLog = self.make_logger(_logger,
                                       'id={0}'.format(self.get_pid()),
                                       method_name='run')
            mainLog.debug('getting number of jobs to be fetched')
            # get number of jobs to be fetched
            nJobsPerQueue = self.dbProxy.get_num_jobs_to_fetch(
                harvester_config.jobfetcher.nQueues,
                harvester_config.jobfetcher.lookupTime)
            mainLog.debug('got {0} queues'.format(len(nJobsPerQueue)))

            # get up to date queue configuration
            pandaQueueDict = PandaQueuesDict()

            # loop over all queues
            for queueName, nJobs in iteritems(nJobsPerQueue):
                # check queue
                if not self.queueConfigMapper.has_queue(queueName):
                    continue
                tmpLog = self.make_logger(_logger,
                                          'queueName={0}'.format(queueName),
                                          method_name='run')
                # get queue
                queueConfig = self.queueConfigMapper.get_queue(queueName)
                siteName = queueConfig.siteName
                # upper limit
                if nJobs > harvester_config.jobfetcher.maxJobs:
                    nJobs = harvester_config.jobfetcher.maxJobs

                # get jobs
                try:
                    is_grandly_unified_queue = pandaQueueDict.is_grandly_unified_queue(
                        siteName)
                except Exception:
                    is_grandly_unified_queue = False

                default_prodSourceLabel = queueConfig.get_source_label(
                    is_gu=is_grandly_unified_queue)

                pdpm = getattr(queueConfig,
                               'prodSourceLabelRandomWeightsPermille', {})
                choice_list = core_utils.make_choice_list(
                    pdpm=pdpm, default=default_prodSourceLabel)
                prodSourceLabel = random.choice(choice_list)
                tmpLog.debug('getting {0} jobs for prodSourceLabel {1}'.format(
                    nJobs, prodSourceLabel))
                sw = core_utils.get_stopwatch()
                jobs, errStr = self.communicator.get_jobs(
                    siteName, self.nodeName, prodSourceLabel, self.nodeName,
                    nJobs, queueConfig.getJobCriteria)
                tmpLog.info('got {0} jobs with {1} {2}'.format(
                    len(jobs), errStr, sw.get_elapsed_time()))
                # convert to JobSpec
                if len(jobs) > 0:
                    # get extractor plugin
                    if hasattr(queueConfig, 'extractor'):
                        extractorCore = self.pluginFactory.get_plugin(
                            queueConfig.extractor)
                    else:
                        extractorCore = None
                    jobSpecs = []
                    fileStatMap = dict()
                    sw_startconvert = core_utils.get_stopwatch()
                    for job in jobs:
                        timeNow = datetime.datetime.utcnow()
                        jobSpec = JobSpec()
                        jobSpec.convert_job_json(job)
                        jobSpec.computingSite = queueName
                        jobSpec.status = 'starting'
                        jobSpec.subStatus = 'fetched'
                        jobSpec.creationTime = timeNow
                        jobSpec.stateChangeTime = timeNow
                        jobSpec.configID = queueConfig.configID
                        jobSpec.set_one_attribute(
                            'schedulerID', 'harvester-{0}'.format(
                                harvester_config.master.harvester_id))
                        if queueConfig.zipPerMB is not None and jobSpec.zipPerMB is None:
                            jobSpec.zipPerMB = queueConfig.zipPerMB
                        fileGroupDictList = [
                            jobSpec.get_input_file_attributes()
                        ]
                        if extractorCore is not None:
                            fileGroupDictList.append(
                                extractorCore.get_aux_inputs(jobSpec))
                        for fileGroupDict in fileGroupDictList:
                            for tmpLFN, fileAttrs in iteritems(fileGroupDict):
                                # make file spec
                                fileSpec = FileSpec()
                                fileSpec.PandaID = jobSpec.PandaID
                                fileSpec.taskID = jobSpec.taskID
                                fileSpec.lfn = tmpLFN
                                fileSpec.endpoint = queueConfig.ddmEndpointIn
                                fileSpec.scope = fileAttrs['scope']
                                if 'INTERNAL_FileType' in fileAttrs:
                                    fileSpec.fileType = fileAttrs[
                                        'INTERNAL_FileType']
                                    jobSpec.auxInput = JobSpec.AUX_hasAuxInput
                                else:
                                    fileSpec.fileType = 'input'
                                # check file status
                                if tmpLFN not in fileStatMap:
                                    fileStatMap[
                                        tmpLFN] = self.dbProxy.get_file_status(
                                            tmpLFN, fileSpec.fileType,
                                            queueConfig.ddmEndpointIn,
                                            'starting')
                                # set preparing to skip stage-in if the file is (being) taken care of by another job
                                if any(x in fileStatMap[tmpLFN]
                                       for x in ('ready', 'preparing',
                                                 'to_prepare', 'triggered')):
                                    fileSpec.status = 'preparing'
                                else:
                                    fileSpec.status = 'to_prepare'
                                fileStatMap[tmpLFN].setdefault(
                                    fileSpec.status, None)
                                if 'INTERNAL_URL' in fileAttrs:
                                    fileSpec.url = fileAttrs['INTERNAL_URL']
                                jobSpec.add_in_file(fileSpec)
                        jobSpec.trigger_propagation()
                        jobSpecs.append(jobSpec)
                    # insert to DB
                    tmpLog.debug("Converting of {0} jobs {1}".format(
                        len(jobs), sw_startconvert.get_elapsed_time()))
                    sw_insertdb = core_utils.get_stopwatch()
                    self.dbProxy.insert_jobs(jobSpecs)
                    tmpLog.debug('Insert of {0} jobs {1}'.format(
                        len(jobSpecs), sw_insertdb.get_elapsed_time()))
            mainLog.debug('done')
            # check if being terminated
            if self.terminated(harvester_config.jobfetcher.sleepTime):
                mainLog.debug('terminated')
                return
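core_utils.make_choice_list is not shown here. Assuming prodSourceLabelRandomWeightsPermille maps prodSourceLabels to per-mille weights, a minimal sketch consistent with the usage above (build a 1000-entry list so random.choice picks each label with its configured probability, with the default label filling the remainder):

def make_choice_list(pdpm=None, default=None):
    # sketch under the stated assumption; the real helper may differ
    pdpm = pdpm or {}
    choice_list = []
    remainder = 1000
    for label, permille in pdpm.items():
        n_entries = min(int(permille), remainder)
        choice_list += [label] * n_entries
        remainder -= n_entries
    choice_list += [default] * remainder
    return choice_list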