Пример #1
0
def make(job_id, context, echo=False):  # @UnusedVariable
    """
        Makes a single job.

        :param job_id: id of the job to make.
        :param context: object providing get_compmake_db() and
            get_jobs_defined_in_this_session().
        :param echo: not used; the captured output is never echoed
            instantaneously (events are generated anyway).

        Returns a dictionary with fields:
             "user_object"
             "user_object_deps" = set of Promises
             "new_jobs" -> new jobs defined
             "deleted_jobs" -> jobs that were defined but not anymore

        Raises JobInterrupted if a KeyboardInterrupt arrives during the
        computation and JobFailed for any other exception (including
        SystemExit and MemoryError). In both cases the job is marked as
        failed, the jobs it had defined are deleted and the captured
        output is saved in the job's cache before raising.
    """
    db = context.get_compmake_db()

    int_make = IntervalTimer()

    host = 'hostname'  # XXX: hard-coded placeholder

    if get_compmake_config('set_proc_title'):
        setproctitle('cm-%s' % job_id)

    # TODO: should we make sure we are up to date? The job could appear
    # already done if another compmake process uses the same DB.

    job = get_job(job_id, db=db)
    cache = get_job_cache(job_id, db=db)

    # Only a job that was DONE has a meaningful list of defined jobs.
    if cache.state == Cache.DONE:
        prev_defined_jobs = set(cache.jobs_defined)
    else:
        prev_defined_jobs = None

    # NOTE: the cache is deliberately not marked as "in progress" here:
    # the current cache entry holds important information that writing a
    # new state would destroy.

    # TODO: delete previous user object

    def progress_callback(stack):
        publish(context, 'job-progress-plus', job_id=job_id, host=host,
                stack=stack)

    init_progress_tracking(progress_callback)

    from compmake.ui.coloredlog import colorize_loglevel

    FORMAT = "%(name)10s|%(filename)15s:%(lineno)-4s - %(funcName)-15s| %(message)s"

    formatter = Formatter(FORMAT)

    class Store(object):
        # number of log records that could not be printed
        nhidden = 0

    def my_emit(_, log_record):
        # Replacement for logging.StreamHandler.emit: prints the colorized
        # record, so that it is captured by OutputCapture like any output.
        # note that log_record.msg might be an exception
        try:
            try:
                s = str(log_record.msg)
            except UnicodeEncodeError:
                # six.text_type is `unicode` on Python 2 and `str` on
                # Python 3, where the name `unicode` does not exist.
                s = six.text_type(log_record.msg)
            except Exception:
                s = 'Could not print log_record %s' % id(log_record)
            log_record.msg = colorize_loglevel(log_record.levelno, s)
            res = formatter.format(log_record)
            print(res)
        except Exception:
            # Logging problems must not fail the job; they are only counted.
            # KeyboardInterrupt/SystemExit are not swallowed here.
            Store.nhidden += 1

    already = set(context.get_jobs_defined_in_this_session())

    def get_deleted_jobs():
        # Rolls back the job definitions after a failure and returns
        # what delete_jobs_recurse_definition() reports as deleted.
        generated = set(context.get_jobs_defined_in_this_session()) - already

        from compmake.ui.ui import delete_jobs_recurse_definition

        todelete_ = set()
        # delete the jobs that were previously defined
        if prev_defined_jobs:
            todelete_.update(prev_defined_jobs)
        # and also the ones that were generated
        todelete_.update(generated)

        return delete_jobs_recurse_definition(jobs=todelete_, db=db)

    def save_captured_output():
        # Stores what was captured so far in the job's cache entry
        # (re-read from the db because mark_as_failed() is called first).
        failed_cache = get_job_cache(job_id, db=db)
        if capture is not None:
            failed_cache.captured_stderr = capture.get_logged_stderr()
            failed_cache.captured_stdout = capture.get_logged_stdout()
        else:
            msg = '(Capture turned off.)'
            failed_cache.captured_stderr = msg
            failed_cache.captured_stdout = msg
        set_job_cache(job_id, failed_cache, db=db)

    # Everything that can raise has been done above, so that nothing can
    # fail between installing the capture / the emit patch and entering
    # the try/finally that undoes them.
    # TODO: add whether we should just capture and not echo
    old_emit = logging.StreamHandler.emit

    disable_capture = False
    if disable_capture:
        capture = None
    else:
        capture = OutputCapture(context=context, prefix=job_id,
                                # This is instantaneous echo and should be
                                # False. They will generate events anyway.
                                echo_stdout=False,
                                echo_stderr=False)

    logging.StreamHandler.emit = my_emit

    try:
        result = job_compute(job=job, context=context)

        assert isinstance(result, dict) and len(result) == 5
        user_object = result['user_object']
        new_jobs = result['new_jobs']
        int_load_results = result['int_load_results']
        int_compute = result['int_compute']
        int_gc = result['int_gc']
        int_gc.stop()

    except KeyboardInterrupt as e:
        bt = traceback.format_exc()
        deleted_jobs = get_deleted_jobs()
        mark_as_failed(job_id, 'KeyboardInterrupt: ' + str(e), backtrace=bt,
                       db=db)
        save_captured_output()

        raise JobInterrupted(job_id=job_id, deleted_jobs=deleted_jobs)

    except BaseException as e:
        # BaseException already includes Exception, SystemExit,
        # MemoryError and the other classes that used to be listed here.
        bt = traceback.format_exc()
        s = '%s: %s' % (type(e).__name__, e)
        mark_as_failed(job_id, s, backtrace=bt, db=db)
        deleted_jobs = get_deleted_jobs()
        save_captured_output()

        raise JobFailed(job_id=job_id, reason=s, bt=bt,
                        deleted_jobs=deleted_jobs)
    finally:
        if capture is not None:
            capture.deactivate()
        # even if we send an error, let's save the output of the process
        logging.StreamHandler.emit = old_emit
        if Store.nhidden > 0:
            msg = 'compmake: There were %d messages hidden due to bugs in logging.' % Store.nhidden
            print(msg)

    int_save_results = IntervalTimer()

    if prev_defined_jobs is not None:
        # did we define fewer jobs this time around?
        # then we need to delete them
        todelete = {x for x in prev_defined_jobs if x not in new_jobs}
        from compmake.ui.ui import delete_jobs_recurse_definition
        deleted_jobs = delete_jobs_recurse_definition(jobs=todelete, db=db)
    else:
        deleted_jobs = set()

    set_job_userobject(job_id, user_object, db=db)
    int_save_results.stop()

    int_make.stop()
    end_time = time()

    # A fresh cache entry replaces the previous one.
    cache = Cache(Cache.DONE)

    if int_gc.get_walltime_used() > 1.0:
        logger.warning(
            'Expensive garbage collection detected at the end of %s: %s',
            job_id, int_gc)

    cache.int_make = int_make
    cache.int_load_results = int_load_results
    cache.int_compute = int_compute
    cache.int_gc = int_gc
    cache.int_save_results = int_save_results

    cache.timestamp = end_time

    cache.walltime_used = int_make.get_walltime_used()
    cache.cputime_used = int_make.get_cputime_used()
    cache.host = host
    cache.jobs_defined = new_jobs
    set_job_cache(job_id, cache, db=db)

    return dict(user_object=user_object,
                user_object_deps=collect_dependencies(user_object),
                new_jobs=new_jobs,
                deleted_jobs=deleted_jobs)
Пример #2
0
def make(job_id, context, echo=False):  # @UnusedVariable
    """
        Makes a single job.

        :param job_id: id of the job to make.
        :param context: object providing get_compmake_db() and
            get_jobs_defined_in_this_session().
        :param echo: not used; the captured output is never echoed
            instantaneously (events are generated anyway).

        Returns a dictionary with fields:
             "user_object"
             "user_object_deps" = set of Promises
             "new_jobs" -> new jobs defined
             "deleted_jobs" -> jobs that were defined but not anymore

        Raises JobInterrupted if a KeyboardInterrupt arrives during the
        computation and JobFailed for any other exception (including
        SystemExit and MemoryError). In both cases the job is marked as
        failed, the jobs it had defined are deleted and the captured
        output is saved in the job's cache before raising.
    """
    db = context.get_compmake_db()

    int_make = IntervalTimer()

    host = 'hostname'  # XXX: hard-coded placeholder

    if get_compmake_config('set_proc_title'):
        setproctitle('cm-%s' % job_id)

    # TODO: should we make sure we are up to date? The job could appear
    # already done if another compmake process uses the same DB.

    job = get_job(job_id, db=db)
    cache = get_job_cache(job_id, db=db)

    # Only a job that was DONE has a meaningful list of defined jobs.
    if cache.state == Cache.DONE:
        prev_defined_jobs = set(cache.jobs_defined)
    else:
        prev_defined_jobs = None

    # NOTE: the cache is deliberately not marked as "in progress" here:
    # the current cache entry holds important information that writing a
    # new state would destroy.

    # TODO: delete previous user object

    def progress_callback(stack):
        publish(context,
                'job-progress-plus',
                job_id=job_id,
                host=host,
                stack=stack)

    init_progress_tracking(progress_callback)

    from compmake.ui.coloredlog import colorize_loglevel

    FORMAT = "%(name)10s|%(filename)15s:%(lineno)-4s - %(funcName)-15s| %(message)s"

    formatter = Formatter(FORMAT)

    class Store(object):
        # number of log records that could not be printed
        nhidden = 0

    def my_emit(_, log_record):
        # Replacement for logging.StreamHandler.emit: prints the colorized
        # record, so that it is captured by OutputCapture like any output.
        # note that log_record.msg might be an exception
        try:
            try:
                s = str(log_record.msg)
            except UnicodeEncodeError:
                # six.text_type is `unicode` on Python 2 and `str` on
                # Python 3, where the name `unicode` does not exist.
                s = six.text_type(log_record.msg)
            except Exception:
                s = 'Could not print log_record %s' % id(log_record)
            log_record.msg = colorize_loglevel(log_record.levelno, s)
            res = formatter.format(log_record)
            print(res)
        except Exception:
            # Logging problems must not fail the job; they are only counted.
            # KeyboardInterrupt/SystemExit are not swallowed here.
            Store.nhidden += 1

    already = set(context.get_jobs_defined_in_this_session())

    def get_deleted_jobs():
        # Rolls back the job definitions after a failure and returns
        # what delete_jobs_recurse_definition() reports as deleted.
        generated = set(context.get_jobs_defined_in_this_session()) - already

        from compmake.ui.ui import delete_jobs_recurse_definition

        todelete_ = set()
        # delete the jobs that were previously defined
        if prev_defined_jobs:
            todelete_.update(prev_defined_jobs)
        # and also the ones that were generated
        todelete_.update(generated)

        return delete_jobs_recurse_definition(jobs=todelete_, db=db)

    def save_captured_output():
        # Stores what was captured so far in the job's cache entry
        # (re-read from the db because mark_as_failed() is called first).
        failed_cache = get_job_cache(job_id, db=db)
        if capture is not None:
            failed_cache.captured_stderr = capture.get_logged_stderr()
            failed_cache.captured_stdout = capture.get_logged_stdout()
        else:
            msg = '(Capture turned off.)'
            failed_cache.captured_stderr = msg
            failed_cache.captured_stdout = msg
        set_job_cache(job_id, failed_cache, db=db)

    # Everything that can raise has been done above, so that nothing can
    # fail between installing the capture / the emit patch and entering
    # the try/finally that undoes them.
    # TODO: add whether we should just capture and not echo
    old_emit = logging.StreamHandler.emit

    disable_capture = False
    if disable_capture:
        capture = None
    else:
        capture = OutputCapture(
            context=context,
            prefix=job_id,
            # This is instantaneous echo and should be False
            # They will generate events anyway.
            echo_stdout=False,
            echo_stderr=False)

    logging.StreamHandler.emit = my_emit

    try:
        result = job_compute(job=job, context=context)

        assert isinstance(result, dict) and len(result) == 5
        user_object = result['user_object']
        new_jobs = result['new_jobs']
        int_load_results = result['int_load_results']
        int_compute = result['int_compute']
        int_gc = result['int_gc']
        int_gc.stop()

    except KeyboardInterrupt as e:
        bt = traceback.format_exc()
        deleted_jobs = get_deleted_jobs()
        mark_as_failed(job_id,
                       'KeyboardInterrupt: ' + str(e),
                       backtrace=bt,
                       db=db)
        save_captured_output()

        raise JobInterrupted(job_id=job_id, deleted_jobs=deleted_jobs)

    except BaseException as e:
        # BaseException already includes Exception, SystemExit,
        # MemoryError and the other classes that used to be listed here.
        bt = traceback.format_exc()
        s = '%s: %s' % (type(e).__name__, e)
        mark_as_failed(job_id, s, backtrace=bt, db=db)
        deleted_jobs = get_deleted_jobs()
        save_captured_output()

        raise JobFailed(job_id=job_id,
                        reason=s,
                        bt=bt,
                        deleted_jobs=deleted_jobs)
    finally:
        if capture is not None:
            capture.deactivate()
        # even if we send an error, let's save the output of the process
        logging.StreamHandler.emit = old_emit
        if Store.nhidden > 0:
            msg = 'compmake: There were %d messages hidden due to bugs in logging.' % Store.nhidden
            print(msg)

    int_save_results = IntervalTimer()

    if prev_defined_jobs is not None:
        # did we define fewer jobs this time around?
        # then we need to delete them
        todelete = {x for x in prev_defined_jobs if x not in new_jobs}
        from compmake.ui.ui import delete_jobs_recurse_definition
        deleted_jobs = delete_jobs_recurse_definition(jobs=todelete, db=db)
    else:
        deleted_jobs = set()

    set_job_userobject(job_id, user_object, db=db)
    int_save_results.stop()

    int_make.stop()
    end_time = time()

    # A fresh cache entry replaces the previous one.
    cache = Cache(Cache.DONE)

    if int_gc.get_walltime_used() > 1.0:
        logger.warning(
            'Expensive garbage collection detected at the end of %s: %s',
            job_id, int_gc)

    cache.int_make = int_make
    cache.int_load_results = int_load_results
    cache.int_compute = int_compute
    cache.int_gc = int_gc
    cache.int_save_results = int_save_results

    cache.timestamp = end_time

    cache.walltime_used = int_make.get_walltime_used()
    cache.cputime_used = int_make.get_cputime_used()
    cache.host = host
    cache.jobs_defined = new_jobs
    set_job_cache(job_id, cache, db=db)

    return dict(user_object=user_object,
                user_object_deps=collect_dependencies(user_object),
                new_jobs=new_jobs,
                deleted_jobs=deleted_jobs)
Пример #3
0
 def proctitle(event):
     """Set the process title to the progress reported by *event*."""
     fields = (event.progress, event.goal, event.job_id)
     setproctitle('[%s/%s %s] (compmake)' % fields)
Пример #4
0
def parmake_job2(args):
    """
    Makes one job by calling make() and returns its result.

    args = tuple job_id, context, event_queue_name, show_output

    The previously registered event handlers are removed. If show_output
    is true, every event is forwarded (without blocking) to the queue
    PmakeManager.queues[event_queue_name]; events that find the queue full
    are dropped and only counted. "job-progress" events update the
    process title.

    Returns the dictionary produced by make() (fields "user_object",
    "user_object_deps", "new_jobs", "deleted_jobs").
    "user_object" is set to None because we do not want to
    load in our thread if not necessary. Sometimes it is necessary
    because it might contain a Promise.

    Raises JobInterrupted or JobFailed as raised by make(); any other
    exception propagates unchanged. A KeyboardInterrupt reaching this
    function is reported as an AssertionError, because make() is expected
    to capture it.
    """
    job_id, context, event_queue_name, show_output = args  # @UnusedVariable
    check_isinstance(job_id, str)
    check_isinstance(event_queue_name, str)
    from .pmake_manager import PmakeManager

    event_queue = PmakeManager.queues[event_queue_name]

    db = context.get_compmake_db()

    setproctitle('compmake:%s' % job_id)

    class G():
        # number of events dropped because the queue was full
        nlostmessages = 0

    try:
        # We register a handler for the events to be passed back
        # to the main process
        def handler(event):
            try:
                if not CompmakeConstants.disable_interproc_queue:
                    event_queue.put(event, block=False)
            except Full:
                G.nlostmessages += 1
                # Do not write messages here, it might create a recursive
                # problem.

        remove_all_handlers()

        if show_output:
            register_handler("*", handler)

        def proctitle(event):
            stat = '[%s/%s %s] (compmake)' % (event.progress, event.goal,
                                              event.job_id)
            setproctitle(stat)

        register_handler("job-progress", proctitle)

        publish(context, 'worker-status', job_id=job_id, status='started')

        # Note that this function is called after the fork.
        # All data is conserved, but resources need to be reopened
        try:
            db.reopen_after_fork()  # @UndefinedVariable
        except Exception:
            # Best effort: the db might not provide this method.
            pass

        publish(context, 'worker-status', job_id=job_id, status='connected')

        res = make(job_id, context=context)

        publish(context, 'worker-status', job_id=job_id, status='ended')

        res['user_object'] = None
        result_dict_check(res)
        return res

    except KeyboardInterrupt:
        # Raised explicitly rather than with `assert False`: an assert is
        # removed under -O, which would silently swallow the interrupt.
        raise AssertionError('KeyboardInterrupt should be captured by make() ('
                             'inside Job.compute())')
    except JobInterrupted:
        publish(context, 'worker-status', job_id=job_id, status='interrupted')
        raise
    # JobFailed and every other exception simply propagate to the caller.
    finally:
        publish(context, 'worker-status', job_id=job_id, status='cleanup')
        setproctitle('compmake-worker-finished %s' % job_id)
Пример #5
0
 def proctitle(event):
     """Set the process title to the progress reported by *event*."""
     fields = (event.progress, event.goal, event.job_id)
     setproctitle('[%s/%s %s] (compmake)' % fields)
Пример #6
0
def parmake_job2(args):
    """
    Makes one job by calling make() and returns its result.

    args = tuple job_id, context, event_queue_name, show_output

    The previously registered event handlers are removed. If show_output
    is true, every event is forwarded (without blocking) to the queue
    PmakeManager.queues[event_queue_name]; events that find the queue full
    are dropped and only counted. "job-progress" events update the
    process title.

    Returns the dictionary produced by make() (fields "user_object",
    "user_object_deps", "new_jobs", "deleted_jobs").
    "user_object" is set to None because we do not want to
    load in our thread if not necessary. Sometimes it is necessary
    because it might contain a Promise.

    Raises JobInterrupted or JobFailed as raised by make(); any other
    exception propagates unchanged. A KeyboardInterrupt reaching this
    function is reported as an AssertionError, because make() is expected
    to capture it.
    """
    job_id, context, event_queue_name, show_output = args  # @UnusedVariable
    check_isinstance(job_id, str)
    check_isinstance(event_queue_name, str)
    from .pmake_manager import PmakeManager

    event_queue = PmakeManager.queues[event_queue_name]

    db = context.get_compmake_db()

    setproctitle('compmake:%s' % job_id)

    class G():
        # number of events dropped because the queue was full
        nlostmessages = 0

    try:
        # We register a handler for the events to be passed back
        # to the main process
        def handler(event):
            try:
                if not CompmakeConstants.disable_interproc_queue:
                    event_queue.put(event, block=False)
            except Full:
                G.nlostmessages += 1
                # Do not write messages here, it might create a recursive
                # problem.

        remove_all_handlers()

        if show_output:
            register_handler("*", handler)

        def proctitle(event):
            stat = '[%s/%s %s] (compmake)' % (event.progress,
                                              event.goal, event.job_id)
            setproctitle(stat)

        register_handler("job-progress", proctitle)

        publish(context, 'worker-status', job_id=job_id, status='started')

        # Note that this function is called after the fork.
        # All data is conserved, but resources need to be reopened
        try:
            db.reopen_after_fork()  # @UndefinedVariable
        except Exception:
            # Best effort: the db might not provide this method.
            pass

        publish(context, 'worker-status', job_id=job_id, status='connected')

        res = make(job_id, context=context)

        publish(context, 'worker-status', job_id=job_id, status='ended')

        res['user_object'] = None
        result_dict_check(res)
        return res

    except KeyboardInterrupt:
        # Raised explicitly rather than with `assert False`: an assert is
        # removed under -O, which would silently swallow the interrupt.
        raise AssertionError('KeyboardInterrupt should be captured by make() ('
                             'inside Job.compute())')
    except JobInterrupted:
        publish(context, 'worker-status', job_id=job_id, status='interrupted')
        raise
    # JobFailed and every other exception simply propagate to the caller.
    finally:
        publish(context, 'worker-status', job_id=job_id, status='cleanup')
        setproctitle('compmake-worker-finished')