예제 #1
0
 def __init__(self, path=None, verbose=10):
     """Record the queue directory and set up a logger named after the class.

     path: directory to store on the instance; when it is None the value
         returned by fs_tracker.get_queue_directory() is used instead.
     verbose: level handed to the logger's setLevel() (defaults to 10).
     """
     # Only an explicit None triggers the fallback; any other value,
     # including an empty string, is stored unchanged.
     self.path = fs_tracker.get_queue_directory() if path is None else path

     logger_name = self.__class__.__name__
     self.logger = logs.getLogger(logger_name)
     self.logger.setLevel(verbose)
예제 #2
0
 def __init__(self, path=None):
     """Record the queue directory on the instance.

     path: directory to store; when it is None the value returned by
         fs_tracker.get_queue_directory() is used instead.
     """
     # Only an explicit None triggers the fallback; any other value,
     # including an empty string, is stored unchanged.
     self.path = fs_tracker.get_queue_directory() if path is None else path
예제 #3
0
def worker_loop(queue,
                parsed_args,
                single_experiment=False,
                timeout=0,
                verbose=None):
    """Dequeue experiments from ``queue`` and run them one at a time.

    The loop ends when ``queue.dequeue`` returns a falsy value.  For every
    message the JSON payload is decoded, the experiment is looked up in the
    database obtained from ``model.get_db_provider(config)``, and then:

    * if ``config['experimentLifetime']`` is set and the experiment is older
      than that lifetime, the message is acknowledged and skipped;
    * if ``allocate_resources`` succeeds, the message is put on hold (and
      re-held periodically by a background scheduler), missing pip packages
      are installed unless the experiment has a ``_singularity`` artifact,
      artifacts are fetched, the experiment is run through ``LocalExecutor``
      and the message is acknowledged whether or not the run succeeded;
    * otherwise the worker sleeps for ``config['sleep_time']`` and moves on
      to the next dequeue without acknowledging the message.

    Args:
        queue: object providing ``dequeue(acknowledge=False, timeout=...)``,
            ``hold(ack_key, period)`` and ``acknowledge(ack_key)``.
        parsed_args: passed to ``LocalExecutor``; its ``config`` attribute is
            overwritten with the config of each dequeued message.
        single_experiment: when true, return right after the first
            experiment that was actually run.
        timeout: forwarded to ``queue.dequeue``.
        verbose: when truthy, overrides the verbosity of every message's
            config; otherwise each message's own ``config['verbose']`` is
            parsed with ``model.parse_verbosity``.

    Returns:
        The most recent non-zero return code of ``executor.run``, or 0 when
        every executed experiment returned 0 (or none was executed).
    """
    # With this set, the artifact loop below re-fetches every artifact,
    # even those that already carry a 'local' entry.
    fetch_artifacts = True

    logger = logs.getLogger('worker_loop')

    # Passed to queue.hold(); the refresh job below fires every
    # hold_period / 2 minutes (presumably hold_period is in minutes too --
    # TODO confirm against the queue implementation).
    hold_period = 4
    retval = 0
    while True:
        msg = queue.dequeue(acknowledge=False, timeout=timeout)
        if not msg:
            break

        first_exp, ack_key = msg

        data_dict = json.loads(sixdecode(first_exp))
        experiment_key = data_dict['experiment']['key']
        config = data_dict['config']

        parsed_args.config = config
        # Keep the per-message verbosity in its own local.  Reassigning the
        # ``verbose`` parameter here would make the first message's level
        # look like a caller-supplied override for every later message.
        if verbose:
            config['verbose'] = verbose
            exp_verbose = verbose
        else:
            exp_verbose = model.parse_verbosity(config.get('verbose'))

        logger.setLevel(exp_verbose)

        logger.debug('Received message: \n{}'.format(data_dict))

        executor = LocalExecutor(parsed_args)

        with model.get_db_provider(config) as db:

            def try_get_experiment():
                experiment = db.get_experiment(experiment_key)
                if experiment is None:
                    raise ValueError(
                        'experiment is not found - indicates storage failure')
                return experiment

            experiment = retry(try_get_experiment,
                               sleep_time=10,
                               logger=logger)

            # Drop experiments whose configured lifetime has already passed.
            if config.get('experimentLifetime') and \
                int(str2duration(config['experimentLifetime'])
                    .total_seconds()) + experiment.time_added < time.time():
                logger.info(
                    'Experiment expired (max lifetime of {} was exceeded)'.
                    format(config.get('experimentLifetime')))
                queue.acknowledge(ack_key)
                continue

            if allocate_resources(experiment, config, verbose=exp_verbose):

                def hold_job():
                    queue.hold(ack_key, hold_period)

                # Hold the message now and keep refreshing the hold in the
                # background for as long as the experiment is being handled.
                hold_job()
                sched = BackgroundScheduler()
                sched.add_job(hold_job, 'interval', minutes=hold_period / 2)
                sched.start()

                try:
                    python = 'python'
                    if experiment.pythonver == 3:
                        python = 'python3'
                    if '_singularity' not in experiment.artifacts.keys():
                        pip_diff = pip_needed_packages(experiment.pythonenv,
                                                       python)
                        if any(pip_diff):
                            logger.info(
                                'Setting up python packages for experiment')
                            if pip_install_packages(pip_diff, python,
                                                    logger) != 0:

                                logger.info(
                                    "Installation of all packages together " +
                                    " failed, "
                                    "trying one package at a time")

                                for pkg in pip_diff:
                                    pip_install_packages([pkg], python, logger)

                    for tag, art in six.iteritems(experiment.artifacts):
                        if fetch_artifacts or 'local' not in art.keys():
                            logger.info('Fetching artifact ' + tag)
                            if tag == 'workspace':
                                art['local'] = retry(lambda: db.get_artifact(
                                    art, only_newer=False),
                                                     sleep_time=10,
                                                     logger=logger)
                            else:
                                art['local'] = retry(
                                    lambda: db.get_artifact(art),
                                    sleep_time=10,
                                    logger=logger)

                    returncode = executor.run(experiment)
                    if returncode != 0:
                        retval = returncode
                finally:
                    # Stop refreshing the hold and acknowledge the message
                    # regardless of how the run ended.
                    sched.shutdown()
                    queue.acknowledge(ack_key)

                if single_experiment:
                    logger.info('single_experiment is True, quitting')
                    return retval
            else:
                logger.info('Cannot run experiment ' + experiment.key +
                            ' due lack of resources. Will retry')
                time.sleep(config['sleep_time'])

    logger.info("Queue in {} is empty, quitting".format(
        fs_tracker.get_queue_directory()))

    return retval
예제 #4
0
def worker_loop(queue,
                parsed_args,
                setup_pyenv=False,
                single_experiment=False,
                fetch_artifacts=False,
                timeout=0):
    """Run experiments from ``queue`` for as long as ``queue.has_next()``.

    For every dequeued message the JSON payload is parsed, the experiment is
    loaded through ``LocalExecutor(parsed_args).db`` and, if
    ``allocate_resources`` succeeds, the message is put on hold (and re-held
    periodically by a background scheduler) while packages are optionally
    installed, artifacts are fetched and the experiment is run.  The message
    is acknowledged once that work finishes, whether or not it succeeded.
    When resources cannot be allocated the worker sleeps for
    ``config['sleep_time']`` and leaves the message unacknowledged.  After
    each iteration ``wait_for_messages(queue, timeout, logger)`` is called.

    Args:
        queue: object providing ``has_next()``, ``dequeue(acknowledge=False)``,
            ``hold(ack_key, period)`` and ``acknowledge(ack_key)``.
        parsed_args: passed to ``LocalExecutor``; its ``config`` attribute is
            overwritten with the config of each dequeued message.
        setup_pyenv: when true, ``pip install`` the experiment's
            ``pythonenv`` package list before running it.
        single_experiment: when true, return right after the first
            experiment that was actually run.
        fetch_artifacts: when true, fetch every artifact; otherwise only
            those without a ``'local'`` entry.
        timeout: forwarded to ``wait_for_messages``.

    Returns:
        None.
    """
    logger = logging.getLogger('worker_loop')

    # Passed to queue.hold(); the refresh job below fires every
    # hold_period / 2 minutes (presumably hold_period is in minutes too --
    # TODO confirm against the queue implementation).
    hold_period = 4
    while queue.has_next():

        first_exp, ack_key = queue.dequeue(acknowledge=False)

        # Parse the payload once and read both fields from the same result.
        data_dict = json.loads(first_exp)
        experiment_key = data_dict['experiment']['key']
        config = data_dict['config']
        parsed_args.config = config
        verbose = model.parse_verbosity(config.get('verbose'))
        logger.setLevel(verbose)

        logger.debug(
            'Received experiment {} with config {} from the queue'.format(
                experiment_key, config))

        executor = LocalExecutor(parsed_args)
        experiment = executor.db.get_experiment(experiment_key)

        if allocate_resources(experiment, config, verbose=verbose):

            def hold_job():
                queue.hold(ack_key, hold_period)

            # Hold the message now and keep refreshing the hold in the
            # background for as long as the experiment is being handled.
            hold_job()
            sched = BackgroundScheduler()
            sched.add_job(hold_job, 'interval', minutes=hold_period / 2)
            sched.start()

            try:
                if setup_pyenv:
                    logger.info('Setting up python packages for experiment')
                    pipp = subprocess.Popen(['pip', 'install'] +
                                            experiment.pythonenv,
                                            stdout=subprocess.PIPE,
                                            stderr=subprocess.STDOUT)

                    pipout, _ = pipp.communicate()
                    # On Python 3 communicate() yields bytes, which cannot be
                    # concatenated with a str; on Python 2 the output is
                    # already a str and is left untouched.
                    if not isinstance(pipout, str):
                        pipout = pipout.decode('utf-8', 'replace')
                    logger.info("pip output: \n" + pipout)

                # items() is available on both Python 2 and Python 3 dicts,
                # unlike the Python-2-only iteritems().
                for tag, art in experiment.artifacts.items():
                    if fetch_artifacts or 'local' not in art.keys():
                        logger.info('Fetching artifact ' + tag)
                        if tag == 'workspace':
                            art['local'] = executor.db.store.get_artifact(
                                art, only_newer=False)
                        else:
                            art['local'] = executor.db.store.get_artifact(art)
                executor.run(experiment)
            finally:
                # Stop refreshing the hold and acknowledge the message
                # regardless of how the run ended.
                sched.shutdown()
                queue.acknowledge(ack_key)

            if single_experiment:
                logger.info('single_experiment is True, quitting')
                return
        else:
            logger.info('Cannot run experiment ' + experiment.key +
                        ' due lack of resources. Will retry')
            time.sleep(config['sleep_time'])

        wait_for_messages(queue, timeout, logger)

    logger.info("Queue in {} is empty, quitting".format(
        fs_tracker.get_queue_directory()))