Example #1
def check_logs(job_defs):
    stalled_jobs = set()
    for job_def in job_defs:
        try:
            log_lines = get_job_log(job_def, write_file=False)
            jid = job_def['jobId']
            now = datetime.now()
            if jid not in job_log_dict:
                logger.info("Adding job %s to the log tracker at %s." %
                            (jid, now))
                job_log_dict[jid] = {'log': log_lines, 'check_time': now}
            elif len(job_log_dict[jid]['log']) == len(log_lines):
                check_dt = now - job_log_dict[jid]['check_time']
                logger.warning(('Job \'%s\' has not produced output for '
                                '%d seconds.') %
                               (job_def['jobName'], check_dt.seconds))
                if check_dt.seconds > idle_log_timeout:
                    logger.warning("Job \'%s\' has stalled." %
                                   job_def['jobName'])
                    stalled_jobs.add(jid)
            else:
                old_log = job_log_dict[jid]['log']
                old_log += log_lines[len(old_log):]
                job_log_dict[jid]['check_time'] = now
        except Exception as e:
            logger.error("Failed to check log for: %s" % str(job_def))
            logger.exception(e)
    return stalled_jobs
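check_logs reads several names from its enclosing scope: job_log_dict, idle_log_timeout, logger, and get_job_log. A minimal driver sketch with those dependencies stubbed out; the timeout value and the log lines are placeholders, not the real ones:

from datetime import datetime  # used by check_logs itself
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('log_monitor')

# Module-level state the function expects to find.
job_log_dict = {}
idle_log_timeout = 600  # seconds; placeholder value

def get_job_log(job_def, write_file=False):
    # Stub standing in for the real fetcher of a job's log lines.
    return ['starting up\n']

stalled = check_logs([{'jobId': 'abc-123', 'jobName': 'demo_job'}])
print(stalled)  # set(): a job is only registered on its first check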
Example #2
def stash_logs(job_defs,
               success_jobs,
               failure_jobs,
               queue_name,
               method='local',
               job_name_prefix=None,
               tag='stash',
               ids_stashed=None):
    if ids_stashed is None:
        ids_stashed = set()

    success_ids = _get_job_ids_to_stash(success_jobs, ids_stashed)
    failure_ids = _get_job_ids_to_stash(failure_jobs, ids_stashed)
    if method == 's3':
        s3_client = boto3.client('s3')

        def stash_log(log_str, name_base):
            name = '%s_%s.log' % (name_base, tag)
            s3_client.put_object(Bucket=bucket_name,
                                 Key='reading_results/%s/logs/%s/%s' %
                                 (job_name_prefix, queue_name, name),
                                 Body=log_str)
    elif method == 'local':
        if job_name_prefix is None:
            job_name_prefix = 'batch_%s' % tag
        dirname = '%s_job_logs' % job_name_prefix
        os.mkdir(dirname)

        def stash_log(log_str, name_base):
            # name_base may contain '/' (e.g. 'jobname/SUCCESS'), so make
            # sure the parent directory exists before writing.
            path = os.path.join(dirname, name_base + '.log')
            os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, 'w') as f:
                f.write(log_str)
    else:
        raise ValueError('Invalid method: %s' % method)

    for jobId, job_def_tpl in job_defs.items():
        if jobId not in success_ids and jobId not in failure_ids:
            continue  # Logs aren't done and ready to be loaded.
        try:
            job_def = dict(job_def_tpl)
            lines = get_job_log(job_def, write_file=False)
            if lines is None:
                logger.warning("No logs found for %s." % job_def['jobName'])
                continue
            log_str = ''.join(lines)
            base_name = job_def['jobName']
            if job_def['jobId'] in success_ids:
                base_name += '/SUCCESS'
            elif job_def['jobId'] in failure_ids:
                base_name += '/FAILED'
            else:
                logger.error("Job cannot be logged unless completed.")
                continue
            logger.info('Stashing ' + base_name)
            stash_log(log_str, base_name)
        except Exception as e:
            logger.error("Failed to save logs for: %s" % str(job_def_tpl))
            logger.exception(e)
    ids_stashed |= {jid for jids in [success_ids, failure_ids] for jid in jids}
    return
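A usage sketch for the local method. The stubs are hedged guesses: get_job_log and _get_job_ids_to_stash are placeholders whose contracts are inferred from the call sites, and demo_queue/demo_job are hypothetical names:

import logging
import os  # used by stash_logs itself

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('stash_demo')

def get_job_log(job_def, write_file=False):
    # Stub for the real log fetcher; returns the job's log lines.
    return ['started\n', 'finished\n']

def _get_job_ids_to_stash(job_list, ids_stashed):
    # Stub inferred from the call sites: the ids of finished jobs
    # that have not been stashed yet.
    return {j['jobId'] for j in job_list} - ids_stashed

# job_defs maps a jobId to a hashable (tuple-of-pairs) job definition.
job_defs = {'abc-123': (('jobId', 'abc-123'), ('jobName', 'demo_job'))}
done = [{'jobId': 'abc-123', 'jobName': 'demo_job'}]
stash_logs(job_defs, done, [], 'demo_queue', method='local')
# Writes batch_stash_job_logs/demo_job/SUCCESS.log (a fresh directory is
# assumed, since the function calls os.mkdir).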
Example #3
def stash_logs(job_defs, success_jobs, failure_jobs, queue_name, method='local',
               job_name_prefix=None, tag='stash', ids_stashed=None):
    if ids_stashed is None:
        ids_stashed = set()

    success_ids = _get_job_ids_to_stash(success_jobs, ids_stashed)
    failure_ids = _get_job_ids_to_stash(failure_jobs, ids_stashed)
    if method == 's3':
        s3_client = boto3.client('s3')

        def stash_log(log_str, name_base):
            name = '%s_%s.log' % (name_base, tag)
            s3_client.put_object(
                Bucket=bucket_name,
                Key='reading_results/%s/logs/%s/%s' % (
                    job_name_prefix,
                    queue_name,
                    name),
                Body=log_str
                )
    elif method == 'local':
        if job_name_prefix is None:
            job_name_prefix = 'batch_%s' % tag
        dirname = '%s_job_logs' % job_name_prefix
        os.mkdir(dirname)

        def stash_log(log_str, name_base):
            # name_base may contain '/' (e.g. 'jobname/SUCCESS'), so make
            # sure the parent directory exists before writing.
            path = os.path.join(dirname, name_base + '.log')
            os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, 'w') as f:
                f.write(log_str)
    else:
        raise ValueError('Invalid method: %s' % method)

    for jobId, job_def_tpl in job_defs.items():
        if jobId not in success_ids and jobId not in failure_ids:
            continue  # Logs aren't done and ready to be loaded.
        try:
            job_def = dict(job_def_tpl)
            lines = get_job_log(job_def, write_file=False)
            if lines is None:
                logger.warning("No logs found for %s." % job_def['jobName'])
                continue
            log_str = ''.join(lines)
            base_name = job_def['jobName']
            if job_def['jobId'] in success_ids:
                base_name += '/SUCCESS'
            elif job_def['jobId'] in failure_ids:
                base_name += '/FAILED'
            else:
                logger.error("Job cannot be logged unless completed.")
                continue
            logger.info('Stashing ' + base_name)
            stash_log(log_str, base_name)
        except Exception as e:
            logger.error("Failed to save logs for: %s" % str(job_def_tpl))
            logger.exception(e)
    ids_stashed |= {jid for jids in [success_ids, failure_ids] for jid in jids}
    return
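For the 's3' method the writer closure captures bucket_name, job_name_prefix, queue_name, and tag from the enclosing scope, and the resulting object key follows directly from the two format strings. A worked example with placeholder values (run1, demo_queue, and demo_job are hypothetical):

# With job_name_prefix='run1', queue_name='demo_queue', tag='stash',
# a successful job named 'demo_job' gets name_base 'demo_job/SUCCESS',
# so name = 'demo_job/SUCCESS_stash.log' and the object lands at:
#   s3://<bucket_name>/reading_results/run1/logs/demo_queue/demo_job/SUCCESS_stash.log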
Example #4
    def check_logs(job_defs):
        """Updates teh job_log_dict."""
        stalled_jobs = set()

        # Check the status of all the jobs we're tracking.
        for job_def in job_defs:
            try:
                # Get the logs for this job.
                log_lines = get_job_log(job_def, write_file=False)

                # Get the job id.
                jid = job_def['jobId']
                now = datetime.now()
                if jid not in job_log_dict:
                    # If the job is new...
                    logger.info("Adding job %s to the log tracker at %s." %
                                (jid, now))
                    job_log_dict[jid] = {
                        'log': log_lines,
                        'last change time': now
                    }
                elif len(job_log_dict[jid]['log']) == len(log_lines):
                    # If the job log hasn't changed, announce as such, and
                    # check to see if it has been the same for longer than
                    # stall time.
                    check_dt = now - job_log_dict[jid]['last change time']
                    logger.warning(('Job \'%s\' has not produced output for '
                                    '%d seconds.') %
                                   (job_def['jobName'], check_dt.seconds))
                    if check_dt.seconds > idle_log_timeout:
                        logger.warning("Job \'%s\' has stalled." %
                                       job_def['jobName'])
                        stalled_jobs.add(jid)
                else:
                    # If the job is known, and the logs have changed, update
                    # the "last change time".
                    old_log = job_log_dict[jid]['log']
                    old_log += log_lines[len(old_log):]
                    job_log_dict[jid]['last change time'] = now
            except Exception as e:
                # Sometimes, due to sync and similar issues, a part of this
                # will fail. Such problems are usually transitory, so we
                # keep trying.
                logger.error("Failed to check log for: %s" % str(job_def))
                logger.exception(e)

        # Pass up the set of job ids for stalled jobs.
        return stalled_jobs
Example #5
def stash_logs(job_defs,
               success_ids,
               failure_ids,
               queue_name,
               method='local',
               job_name_prefix=None,
               tag='stash'):
    if method == 's3':
        s3_client = boto3.client('s3')

        def stash_log(log_str, name_base):
            name = '%s_%s.log' % (name_base, tag)
            s3_client.put_object(Bucket=bucket_name,
                                 Key='reading_results/%s/logs/%s/%s' %
                                 (job_name_prefix, queue_name, name),
                                 Body=log_str)

    elif method == 'local':
        if job_name_prefix is None:
            job_name_prefix = 'batch_%s' % tag
        dirname = '%s_job_logs' % job_name_prefix
        os.mkdir(dirname)

        def stash_log(log_str, name_base):
            with open(os.path.join(dirname, name_base + '.log'), 'w') as f:
                f.write(log_str)

    else:
        # Mirror the other versions: fail loudly on an unknown method
        # rather than hitting a NameError when stash_log is called below.
        raise ValueError('Invalid method: %s' % method)

    for job_def_tpl in job_defs:
        try:
            job_def = dict(job_def_tpl)
            lines = get_job_log(job_def, write_file=False)
            if lines is None:
                logger.warning("No logs found for %s." % job_def['jobName'])
                continue
            log_str = ''.join(lines)
            base_name = job_def['jobName']
            if job_def['jobId'] in success_ids:
                base_name += '_SUCCESS'
            elif job_def['jobId'] in failure_ids:
                base_name += '_FAILED'
            logger.info('Stashing ' + base_name)
            stash_log(log_str, base_name)
        except Exception as e:
            logger.error("Failed to save logs for: %s" % str(job_def_tpl))
            logger.exception(e)
    return
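This version takes the success and failure id sets directly and iterates job_defs as a sequence of hashable job definitions. A minimal local-mode sketch; get_job_log is stubbed, and demo_queue/demo_job are placeholder names:

import logging
import os  # used by stash_logs itself

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('stash_demo')

def get_job_log(job_def, write_file=False):
    # Stub for the real log fetcher; returns the job's log lines.
    return ['started\n', 'finished\n']

# Job definitions are tuples of key-value pairs so they stay hashable.
job_defs = [(('jobId', 'abc-123'), ('jobName', 'demo_job'))]
stash_logs(job_defs, {'abc-123'}, set(), 'demo_queue', method='local')
# Writes batch_stash_job_logs/demo_job_SUCCESS.log (assumes the log
# directory does not already exist, since the function calls os.mkdir).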
Example #6
    def check_logs(job_defs):
        """Updates teh job_log_dict."""
        stalled_jobs = set()

        # Check the status of all the jobs we're tracking.
        for job_def in job_defs:
            try:
                # Get the logs for this job.
                log_lines = get_job_log(job_def, write_file=False)

                # Get the job id.
                jid = job_def['jobId']
                now = datetime.now()
                if jid not in job_log_dict:
                    # If the job is new...
                    logger.info("Adding job %s to the log tracker at %s."
                                % (jid, now))
                    job_log_dict[jid] = {'log': log_lines,
                                         'last change time': now}
                elif len(job_log_dict[jid]['log']) == len(log_lines):
                    # If the job log hasn't changed, announce as such, and
                    # check to see if it has been the same for longer than
                    # stall time.
                    check_dt = now - job_log_dict[jid]['last change time']
                    logger.warning(('Job \'%s\' has not produced output for '
                                    '%d seconds.')
                                   % (job_def['jobName'], check_dt.seconds))
                    if check_dt.seconds > idle_log_timeout:
                        logger.warning("Job \'%s\' has stalled."
                                       % job_def['jobName'])
                        stalled_jobs.add(jid)
                else:
                    # If the job is known, and the logs have changed, update
                    # the "last change time".
                    old_log = job_log_dict[jid]['log']
                    old_log += log_lines[len(old_log):]
                    job_log_dict[jid]['last change time'] = now
            except Exception as e:
                # Sometimes, due to sync and similar issues, a part of this
                # will fail. Such problems are usually transitory, so we
                # keep trying.
                logger.error("Failed to check log for: %s" % str(job_def))
                logger.exception(e)

        # Pass up the set of job ids for stalled jobs.
        return stalled_jobs
Example #7
def check_logs(job_defs):
    stalled_jobs = set()
    for job_def in job_defs:
        log_lines = get_job_log(job_def, write_file=False)
        jid = job_def['jobId']
        now = datetime.now()
        if jid not in job_log_dict:
            job_log_dict[jid] = {'log': log_lines, 'check_time': now}
        elif len(job_log_dict[jid]['log']) == len(log_lines):
            check_dt = now - job_log_dict[jid]['check_time']
            if check_dt.seconds > idle_log_timeout:
                logger.warning(('Job \'%s\' has not produced output for '
                                '%d seconds.') %
                               (job_def['jobName'], check_dt.seconds))
                stalled_jobs.add(jid)
        else:
            old_log = job_log_dict[jid]['log']
            old_log += log_lines[len(old_log):]
            # Reset the idle timer when new output arrives (as in Example
            # #1); otherwise a job would be measured against its original
            # registration time and flagged as stalled spuriously.
            job_log_dict[jid]['check_time'] = now
    return stalled_jobs