Example #1
    def __init__(self, queue_name, sub_name=None, verbose=10):
        assert 'GOOGLE_APPLICATION_CREDENTIALS' in os.environ.keys()
        with open(os.environ['GOOGLE_APPLICATION_CREDENTIALS']) as f:
            credentials = json.loads(f.read())

        project_name = credentials['project_id']
        self.logger = logging.getLogger(self.__class__.__name__)
        if verbose is not None:
            self.logger.setLevel(parse_verbosity(verbose))

        self.pubclient = pubsub.PublisherClient()
        self.subclient = pubsub.SubscriberClient()

        self.project = project_name
        self.topic_name = self.pubclient.topic_path(project_name, queue_name)
        self.logger.info("Topic name = {}".format(self.topic_name))
        try:
            self.pubtopic = self.pubclient.get_topic(self.topic_name)
        except BaseException as e:
            self.pubtopic = self.pubclient.create_topic(self.topic_name)
            self.logger.info('topic {} created'.format(self.topic_name))

        sub_name = sub_name if sub_name else queue_name + "_sub"
        self.logger.info("Topic name = {}".format(queue_name))
        self.logger.info("Subscription name = {}".format(sub_name))

        self.sub_name = self.subclient.subscription_path(
            project_name, sub_name)
        try:
            self.subclient.get_subscription(self.sub_name)
        except BaseException as e:
            self.logger.warn(e)
            self.subclient.create_subscription(self.sub_name, self.topic_name)

        self.logger.info('subscription {} created'.format(sub_name))
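
A minimal usage sketch of this queue class. Hedged: the class name PubsubQueue and the enqueue/dequeue/acknowledge calls are taken from the other examples on this page, and the import path is a guess; none of them are defined in this snippet.

# Illustrative only -- PubsubQueue, enqueue(), dequeue() and acknowledge()
# are assumed from the other examples here; the import path is a guess.
import json

from studio.pubsub_queue import PubsubQueue

queue = PubsubQueue('demo_queue', verbose='info')

# Publish a JSON payload; a remote worker later pulls and acknowledges it.
queue.enqueue(json.dumps({'experiment': {'key': 'exp1'}, 'config': {}}))

msg, ack_key = queue.dequeue(acknowledge=False)
queue.acknowledge(ack_key)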
Example #2
def submit_experiments(
        experiments,
        config,
        logger,
        cloud=None,
        queue_name=None,
        python_pkg=[]):

    num_experiments = len(experiments)
    verbose = model.parse_verbosity(config['verbose'])

    start_time = time.time()
    n_workers = min(multiprocessing.cpu_count() * 2, num_experiments)
    with closing(multiprocessing.Pool(n_workers, maxtasksperchild=20)) as p:
        experiments = p.imap_unordered(add_experiment,
                                       zip([config] * num_experiments,
                                           [python_pkg] *
                                           num_experiments,
                                           experiments),
                                       chunksize=1)
        p.close()
        p.join()
    # for e in experiments:
    #     logger.info("Added experiment " + e.key)
    logger.info("Added %s experiments in %s seconds" %
                (num_experiments, int(time.time() - start_time)))

    queue = get_queue(queue_name, cloud, verbose)
    for e in experiments:
        queue.enqueue(json.dumps({
            'experiment': e.__dict__,
            'config': config}))

    return queue.get_name()
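
The pool fan-out above boils down to a reusable pattern: a bounded worker pool, imap_unordered for lazy, order-independent results, and contextlib.closing so the pool is shut down even on error. A self-contained sketch of just that pattern (nothing studio-specific):

import multiprocessing
from contextlib import closing


def _square(x):
    # Worker function must be picklable, i.e. defined at module level.
    return x * x


if __name__ == '__main__':
    items = list(range(10))
    n_workers = min(multiprocessing.cpu_count() * 2, len(items))
    with closing(multiprocessing.Pool(n_workers, maxtasksperchild=20)) as pool:
        results = list(pool.imap_unordered(_square, items, chunksize=1))
        pool.close()
        pool.join()
    print(sorted(results))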
Example #3
    def __init__(self, args):
        self.config = args.config

        if args.guest:
            self.config['database']['guest'] = True

        self.logger = logs.getLogger('LocalExecutor')
        self.logger.setLevel(model.parse_verbosity(self.config.get('verbose')))
        self.logger.debug("Config: ")
        self.logger.debug(self.config)
Example #4
def main(args=sys.argv):
    logger = logging.getLogger('studio-remote-worker')
    parser = argparse.ArgumentParser(description='Studio remote worker. \
                     Usage: studio-remote-worker \
                     ')
    parser.add_argument('--config', help='configuration file', default=None)

    parser.add_argument('--guest',
                        help='Guest mode (does not require db credentials)',
                        action='store_true')

    parser.add_argument(
        '--single-run',
        help='quit after a single run (regardless of the state of the queue)',
        action='store_true')

    parser.add_argument('--queue', help='queue name', required=True)
    parser.add_argument('--verbose',
                        '-v',
                        help='Verbosity level. Allowed values: ' +
                        'debug, info, warn, error, crit ' +
                        'or numerical value of logger levels.',
                        default=None)

    parser.add_argument(
        '--timeout',
        '-t',
        help='Timeout after which remote worker stops listening (in seconds)',
        type=int,
        default=-1)

    parsed_args, script_args = parser.parse_known_args(args)
    verbose = model.parse_verbosity(parsed_args.verbose)
    logger.setLevel(verbose)
    if parsed_args.queue.startswith('ec2_') or \
       parsed_args.queue.startswith('sqs_'):
        queue = SQSQueue(parsed_args.queue, verbose=verbose)
    else:
        queue = PubsubQueue(parsed_args.queue, verbose=verbose)
    logger.info('Waiting for the work in the queue...')

    timeout_before = parsed_args.timeout
    timeout_after = timeout_before if timeout_before > 0 else 0
    wait_for_messages(queue, timeout_before, logger)

    logger.info('Starting working')
    worker_loop(queue,
                parsed_args,
                setup_pyenv=True,
                single_experiment=parsed_args.single_run,
                fetch_artifacts=True,
                timeout=timeout_after)
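
The parse_known_args() call above is what lets the worker keep its own flags while forwarding everything it does not recognize; a standalone illustration (the argv contents are hypothetical):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--queue', required=True)
parser.add_argument('--verbose', default=None)

argv = ['--queue', 'sqs_demo', 'train.py', '--epochs', '10']
parsed, passthrough = parser.parse_known_args(argv)

print(parsed.queue)   # 'sqs_demo'
print(passthrough)    # ['train.py', '--epochs', '10']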
Example #5
def main(args=sys.argv[1:]):
    parser = argparse.ArgumentParser(description='Studio WebUI server. \
                     Usage: studio \
                     <arguments>')

    parser.add_argument('--config', help='configuration file', default=None)
    #    parser.add_argument('--guest',
    #                        help='Guest mode (does not require db credentials)',
    #                        action='store_true')

    parser.add_argument('--port',
                        help='port to run Flask server on',
                        type=int,
                        default=5000)

    parser.add_argument('--host', help='host name.', default='0.0.0.0')

    parser.add_argument('--verbose',
                        '-v',
                        help='Verbosity level. Allowed values: ' +
                        'debug, info, warn, error, crit ' +
                        'or numerical value of logger levels.',
                        default=None)

    args = parser.parse_args(args)
    config = model.get_config()
    if args.config:
        with open(args.config) as f:
            config = yaml.load(f)

    if args.verbose:
        config['verbose'] = args.verbose


#    if args.guest:
#        config['database']['guest'] = True
    global _config
    global _db_provider
    _config = config
    _db_provider = model.get_db_provider(_config, blocking_auth=False)

    getlogger().setLevel(model.parse_verbosity(config.get('verbose')))

    global _save_auth_cookie
    _save_auth_cookie = True

    print('Starting Studio UI on port {0}'.format(args.port))
    app.run(host=args.host, port=args.port)
Example #6
    def __init__(self, name, verbose=10, receive_timeout=300, retry_time=10):
        assert boto3 is not None
        self._client = boto3.client('sqs')

        create_q_response = self._client.create_queue(QueueName=name)

        self._queue_url = create_q_response['QueueUrl']
        self.logger = logs.getLogger('SQSQueue')
        if verbose is not None:
            self.logger.setLevel(parse_verbosity(verbose))
        self._name = name
        self.logger.info('Creating SQS queue with name ' + name)
        self.logger.info('Queue url = ' + self._queue_url)

        self._receive_timeout = receive_timeout
        self._retry_time = retry_time
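
A sketch of how enqueue/dequeue could be built on the queue URL created above using only standard boto3 SQS calls. This is an illustration, not the project's actual SQSQueue implementation, which is not shown in this snippet.

import boto3

client = boto3.client('sqs')
queue_url = client.create_queue(QueueName='demo_queue')['QueueUrl']

# enqueue: push a message body onto the queue
client.send_message(QueueUrl=queue_url, MessageBody='{"experiment": "exp1"}')

# dequeue: long-poll for up to 10 seconds, then delete (acknowledge) the message
resp = client.receive_message(QueueUrl=queue_url,
                              MaxNumberOfMessages=1,
                              WaitTimeSeconds=10)
for msg in resp.get('Messages', []):
    print(msg['Body'])
    client.delete_message(QueueUrl=queue_url,
                          ReceiptHandle=msg['ReceiptHandle'])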
Example #7
    def __init__(self, args):
        self.config = model.get_config()
        if args.config:
            if isinstance(args.config, basestring):
                with open(args.config) as f:
                    self.config.update(yaml.load(f))
            else:
                self.config.update(args.config)

        if args.guest:
            self.config['database']['guest'] = True

        self.db = model.get_db_provider(self.config)
        self.logger = logging.getLogger('LocalExecutor')
        self.logger.setLevel(model.parse_verbosity(self.config.get('verbose')))
        self.logger.debug("Config: ")
        self.logger.debug(self.config)
Example #8
def main(args=sys.argv):
    parser = argparse.ArgumentParser(description='Studio worker. \
                     Usage: studio-local-worker \
                     ')

    parser.add_argument('--config', help='configuration file', default=None)
    parser.add_argument('--guest',
                        help='Guest mode (does not require db credentials)',
                        action='store_true')
    parser.add_argument('--timeout', default=0, type=int)
    parser.add_argument('--verbose', default='error')

    parsed_args, script_args = parser.parse_known_args(args)
    verbose = parse_verbosity(parsed_args.verbose)

    queue = LocalQueue(verbose=verbose)
    # queue = glob.glob(fs_tracker.get_queue_directory() + "/*")
    # wait_for_messages(queue, parsed_args.timeout)
    returncode = worker_loop(queue, parsed_args, timeout=parsed_args.timeout)
    sys.exit(returncode)
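
This entry point can also be driven programmatically, which is how submit_experiments() launches it in Example #10; the first list element plays the role of argv[0] because main() receives the full sys.argv by default. A hypothetical direct call (the flag values are made up), keeping in mind that main() terminates via sys.exit() once the local queue drains:

worker_args = ['studio-local-worker', '--verbose=debug', '--timeout=60']
main(worker_args)  # runs worker_loop on a LocalQueue, then calls sys.exit()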
Example #9
def main(args=sys.argv[1:]):
    logger = logs.getLogger('studio-runner')
    parser = argparse.ArgumentParser(
        description='Studio runner. \
                     Usage: studio run <runner_arguments> \
                     script <script_arguments>')
    parser.add_argument('--config', help='configuration file', default=None)
    parser.add_argument('--project', help='name of the project', default=None)
    parser.add_argument(
        '--experiment', '-e',
        help='Name of the experiment. If none provided, ' +
             'random uuid will be generated',
        default=None)

    parser.add_argument(
        '--guest',
        help='Guest mode (does not require db credentials)',
        action='store_true')

    parser.add_argument(
        '--force-git',
        help='If run in a git directory, force running the experiment ' +
             'even if changes are not committed',
        action='store_true')

    parser.add_argument(
        '--gpus',
        help='Number of gpus needed to run the experiment',
        type=int,
        default=None)

    parser.add_argument(
        '--cpus',
        help='Number of cpus needed to run the experiment' +
             ' (used to configure cloud instance)',
        type=int,
        default=None)

    parser.add_argument(
        '--ram',
        help='Amount of RAM needed to run the experiment' +
             ' (used to configure cloud instance), ex: 10G, 10GB',
        default=None)

    parser.add_argument(
        '--gpuMem',
        help='Amount of GPU RAM needed to run the experiment',
        default=None)

    parser.add_argument(
        '--hdd',
        help='Amount of hard drive space needed to run the experiment' +
             ' (used to configure cloud instance), ex: 10G, 10GB',
        default=None)

    parser.add_argument(
        '--queue', '-q',
        help='Name of the remote execution queue',
        default=None)

    parser.add_argument(
        '--cloud',
        help='Cloud execution mode. Could be gcloud, gcspot, ec2 or ec2spot',
        default=None)

    parser.add_argument(
        '--bid',
        help='Spot instance price bid, specified in USD or in percentage ' +
             'of on-demand instance price. Default is %(default)s',
        default='100%')

    parser.add_argument(
        '--capture-once', '-co',
        help='Name of the immutable artifact to be captured. ' +
        'It will be captured once before the experiment is run',
        default=[], action='append')

    parser.add_argument(
        '--capture', '-c',
        help='Name of the mutable artifact to be captured continuously',
        default=[], action='append')

    parser.add_argument(
        '--reuse', '-r',
        help='Name of the artifact from another experiment to use',
        default=[], action='append')

    parser.add_argument(
        '--verbose', '-v',
        help='Verbosity level. Allowed values: ' +
             'debug, info, warn, error, crit ' +
             'or numerical value of logger levels.',
        default=None)

    parser.add_argument(
        '--metric',
        help='Metric to show in the summary of the experiment, ' +
             'and to base hyperparameter search on. ' +
             'Refers to a scalar value in the tensorboard log; ' +
             'example: --metric=val_loss[:final | :min | :max] to report ' +
             'validation loss in the end of the keras experiment ' +
             '(or smallest or largest throughout the experiment for :min ' +
             'and :max respectively)',
        default=None)

    parser.add_argument(
        '--hyperparam', '-hp',
        help='Try out multiple values of a certain parameter. ' +
             'For example, --hyperparam=learning_rate:0.01:0.1:l10 ' +
             'will instantiate 10 versions of the script, replace ' +
             'learning_rate with one of the 10 values for learning ' +
             'rate that lies on a log grid from 0.01 to 0.1, create '
             'experiments and place them in the queue.',
             default=[], action='append')

    parser.add_argument(
        '--num-workers',
        help='Number of local or cloud workers to spin up',
        type=int,
        default=None)

    parser.add_argument(
        '--python-pkg',
        help='Python package not present in the current environment ' +
             'that is needed for experiment. Only compatible with ' +
             'remote and cloud workers for now',
        default=[], action='append')

    parser.add_argument(
        '--ssh-keypair',
        help='Name of the SSH keypair used to access the EC2 ' +
             'instances directly',
        default=None)

    parser.add_argument(
        '--optimizer', '-opt',
        help='Name of optimizer to use, by default is grid search. ' +
        'The name of the optimizer must either be in ' +
        'studio/optimizer_plugins ' +
        'directory or the path to the optimizer source file ' +
        'must be supplied. ',
        default='grid')

    parser.add_argument(
        '--cloud-timeout',
        help="Time (in seconds) that cloud workers wait for messages. " +
             "If negative, " +
             "wait for the first message in the queue indefinitely " +
             "and shut down " +
             "as soon as no new messages are available. " +
             "If zero, don't wait at all." +
             "Default value is %(default)d",
        type=int,
        default=300)

    parser.add_argument(
        '--user-startup-script',
        help='Path of script to run immediately ' +
             'before running the remote worker',
        default=None)

    parser.add_argument(
        '--branch',
        help='Branch of studioml to use when running remote worker, useful ' +
             'for debugging pull requests. Default is current',
        default=None)

    parser.add_argument(
        '--max-duration',
        help='Max experiment runtime (i.e. time after which experiment ' +
             'should be killed no matter what.).  Examples of values ' +
             'might include 5h, 48h2m10s',
        default=None)

    parser.add_argument(
        '--lifetime',
        help='Max experiment lifetime (i.e. wait time after which ' +
             'experiment loses relevance and should not be started)' +
             '  Examples include 240h30m10s',
        default=None)

    parser.add_argument(
        '--container',
        help='Singularity container in which experiment should be run. ' +
             'Assumes that container has all dependencies installed',
        default=None
    )

    parser.add_argument(
        '--port',
        help='Ports to open on a cloud instance',
        default=[], action='append'
    )

    # detect which argument is the script filename
    # and attribute all arguments past that index as related to the script
    (runner_args, other_args) = parser.parse_known_args(args)
    py_suffix_args = [i for i, arg in enumerate(args) if arg.endswith('.py')
                      or '::' in arg]

    rerun = False
    if len(py_suffix_args) < 1:
        print('None of the arguments end with .py')
        if len(other_args) == 0:
            print("Trying to run a container job")
            assert runner_args.container is not None
            exec_filename = None
        elif len(other_args) == 1:
            print("Treating last argument as experiment key to rerun")
            rerun = True
            experiment_key = args[-1]
        else:
            print("Too many extra arguments - should be either none " +
                  "for container job or one for experiment re-run")
            sys.exit(1)
    else:
        script_index = py_suffix_args[0]
        exec_filename, other_args = args[script_index], args[script_index + 1:]
        runner_args = parser.parse_args(args[:script_index])

    # TODO: Queue the job based on arguments and only then execute.

    config = model.get_config(runner_args.config)

    if runner_args.verbose:
        config['verbose'] = runner_args.verbose

    if runner_args.guest:
        config['database']['guest'] = True

    if runner_args.container:
        runner_args.capture_once.append(
            runner_args.container + ':_singularity')

    verbose = model.parse_verbosity(config['verbose'])
    logger.setLevel(verbose)

    if git_util.is_git() and not git_util.is_clean() and not rerun:
        logger.warn('Running from dirty git repo')
        if not runner_args.force_git:
            logger.error(
                'Specify --force-git to run experiment from dirty git repo')
            sys.exit(1)

    resources_needed = parse_hardware(runner_args, config['resources_needed'])
    logger.debug('resources requested: ')
    logger.debug(str(resources_needed))

    artifacts = {}
    artifacts.update(parse_artifacts(runner_args.capture, mutable=True))
    artifacts.update(parse_artifacts(runner_args.capture_once, mutable=False))
    with model.get_db_provider(config) as db:
        artifacts.update(parse_external_artifacts(runner_args.reuse, db))

    if runner_args.branch:
        config['cloud']['branch'] = runner_args.branch

    if runner_args.user_startup_script:
        config['cloud']['user_startup_script'] = \
            runner_args.user_startup_script

    if runner_args.lifetime:
        config['experimentLifetime'] = runner_args.lifetime

    if any(runner_args.hyperparam):
        if runner_args.optimizer == "grid":
            experiments = add_hyperparam_experiments(
                exec_filename,
                other_args,
                runner_args,
                artifacts,
                resources_needed,
                logger)

            queue_name = submit_experiments(
                experiments,
                config=config,
                logger=logger,
                queue_name=runner_args.queue,
                cloud=runner_args.cloud)

            spin_up_workers(
                runner_args,
                config,
                resources_needed,
                queue_name=queue_name,
                verbose=verbose)
        else:
            opt_modulepath = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "optimizer_plugins",
                runner_args.optimizer + ".py")
            if not os.path.exists(opt_modulepath):
                opt_modulepath = os.path.abspath(
                    os.path.expanduser(runner_args.optimizer))
            logger.info('optimizer path: %s' % opt_modulepath)

            assert os.path.exists(opt_modulepath)
            sys.path.append(os.path.dirname(opt_modulepath))
            opt_module = importlib.import_module(
                os.path.basename(opt_modulepath.replace(".py", '')))

            h = HyperparameterParser(runner_args, logger)
            hyperparams = h.parse()
            optimizer = getattr(
                opt_module,
                "Optimizer")(
                hyperparams,
                config['optimizer'],
                logger)

            workers_started = False
            queue_name = runner_args.queue
            while not optimizer.stop():
                hyperparam_pop = optimizer.ask()
                hyperparam_tuples = h.convert_to_tuples(hyperparam_pop)

                experiments = add_hyperparam_experiments(
                    exec_filename,
                    other_args,
                    runner_args,
                    artifacts,
                    resources_needed,
                    logger,
                    optimizer=optimizer,
                    hyperparam_tuples=hyperparam_tuples)

                queue_name = submit_experiments(
                    experiments,
                    config=config,
                    logger=logger,
                    cloud=runner_args.cloud,
                    queue_name=queue_name)

                if not workers_started:
                    spin_up_workers(
                        runner_args,
                        config,
                        resources_needed,
                        queue_name=queue_name,
                        verbose=verbose)
                    workers_started = True

                fitnesses, behaviors = get_experiment_fitnesses(
                    experiments, optimizer, config, logger)

                # for i, hh in enumerate(hyperparam_pop):
                #     print fitnesses[i]
                #     for hhh in hh:
                #         print hhh
                try:
                    optimizer.tell(hyperparam_pop, fitnesses, behaviors)
                except BaseException:
                    optimizer.tell(hyperparam_pop, fitnesses)

                try:
                    optimizer.disp()
                except BaseException:
                    logger.warn('Optimizer has no disp() method')
    else:
        if rerun:
            with model.get_db_provider(config) as db:
                experiment = db.get_experiment(experiment_key)
                new_key = runner_args.experiment if runner_args.experiment \
                    else experiment_key + '_rerun' + str(uuid.uuid4())
                experiment.key = new_key
                for _, art in six.iteritems(experiment.artifacts):
                    art['mutable'] = False

                experiments = [experiment]

        else:
            experiments = [create_experiment(
                filename=exec_filename,
                args=other_args,
                experiment_name=runner_args.experiment,
                project=runner_args.project,
                artifacts=artifacts,
                resources_needed=resources_needed,
                metric=runner_args.metric,
                max_duration=runner_args.max_duration,
            )]

        queue_name = submit_experiments(
            experiments,
            config=config,
            logger=logger,
            cloud=runner_args.cloud,
            queue_name=runner_args.queue)

        spin_up_workers(
            runner_args,
            config,
            resources_needed,
            queue_name=queue_name,
            verbose=verbose)

    return
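
For intuition about the --hyperparam spec used above (for example learning_rate:0.01:0.1:l10, i.e. ten values on a log grid from 0.01 to 0.1), here is a back-of-the-envelope sketch; the real HyperparameterParser is not shown here and may differ in details:

import math


def log_grid(lo, hi, n):
    # n values evenly spaced in log10 space between lo and hi, inclusive.
    step = (math.log10(hi) - math.log10(lo)) / (n - 1)
    return [10 ** (math.log10(lo) + i * step) for i in range(n)]


print(['%.4f' % v for v in log_grid(0.01, 0.1, 10)])
# ten candidate learning rates from 0.0100 to 0.1000, evenly spaced in log space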
Example #10
File: runner.py  Project: Patechoc/studio
def submit_experiments(
        experiments,
        resources_needed,
        config,
        runner_args,
        logger,
        queue_name=None,
        launch_workers=True):

    num_experiments = len(experiments)
    verbose = model.parse_verbosity(config['verbose'])

    if runner_args.cloud is None:
        queue_name = 'local'
        if 'queue' in config.keys():
            queue_name = config['queue']
        if runner_args.queue:
            queue_name = runner_args.queue

    start_time = time.time()
    n_workers = min(multiprocessing.cpu_count() * 2, num_experiments)
    with closing(multiprocessing.Pool(n_workers, maxtasksperchild=20)) as p:
        experiments = p.imap_unordered(add_experiment,
                                       zip([config] * num_experiments,
                                           [runner_args.python_pkg] *
                                           num_experiments,
                                           experiments),
                                       chunksize=1)
        p.close()
        p.join()
    # for e in experiments:
    #     logger.info("Added experiment " + e.key)
    logger.info("Added %s experiments in %s seconds" %
                (num_experiments, int(time.time() - start_time)))

    if runner_args.cloud is not None:
        assert runner_args.cloud in ['gcloud', 'gcspot', 'ec2', 'ec2spot']

        assert runner_args.queue is None, \
            '--queue argument cannot be provided with --cloud argument'
        auth_cookie = None if config['database'].get('guest') \
            else os.path.join(
            auth.TOKEN_DIR,
            config['database']['apiKey']
        )

        if runner_args.cloud in ['gcloud', 'gcspot']:
            if queue_name is None:
                queue_name = 'pubsub_' + str(uuid.uuid4())
                worker_manager = GCloudWorkerManager(
                    runner_args=runner_args,
                    auth_cookie=auth_cookie,
                    zone=config['cloud']['zone']
                )

            queue = PubsubQueue(queue_name, verbose=verbose)

        if runner_args.cloud in ['ec2', 'ec2spot']:
            if queue_name is None:
                queue_name = 'sqs_' + str(uuid.uuid4())
                worker_manager = EC2WorkerManager(
                    runner_args=runner_args,
                    auth_cookie=auth_cookie
                )

            queue = SQSQueue(queue_name, verbose=verbose)

        if launch_workers:
            if runner_args.cloud == 'gcloud' or \
               runner_args.cloud == 'ec2':

                num_workers = int(
                    runner_args.num_workers) if runner_args.num_workers else 1
                for i in range(num_workers):
                    worker_manager.start_worker(
                        queue_name, resources_needed,
                        ssh_keypair=runner_args.ssh_keypair,
                        timeout=runner_args.cloud_timeout)
            else:
                assert runner_args.bid is not None
                if runner_args.num_workers:
                    start_workers = runner_args.num_workers
                    queue_upscaling = False
                else:
                    start_workers = 1
                    queue_upscaling = True

                worker_manager.start_spot_workers(
                    queue_name,
                    runner_args.bid,
                    resources_needed,
                    start_workers=start_workers,
                    queue_upscaling=queue_upscaling,
                    ssh_keypair=runner_args.ssh_keypair,
                    timeout=runner_args.cloud_timeout)
    else:
        if queue_name == 'local':
            queue = LocalQueue()
            queue.clean()
        elif queue_name.startswith('sqs_'):
            queue = SQSQueue(queue_name, verbose=verbose)
        else:
            queue = PubsubQueue(
                queue_name,
                config['database']['projectId'],
                verbose=verbose)

    for e in experiments:
        queue.enqueue(json.dumps({
            'experiment': e.__dict__,
            'config': config}))

    if queue_name == 'local':
        worker_args = ['studio-local-worker']

        if runner_args.config:
            worker_args += ['--config=' + runner_args.config]
        if runner_args.guest:
            worker_args += ['--guest']

        logger.info('worker args: {}'.format(worker_args))
        if not runner_args.num_workers or int(runner_args.num_workers) == 1:
            if 'STUDIOML_DUMMY_MODE' not in os.environ:
                local_worker.main(worker_args)
        else:
            raise NotImplementedError("Multiple local workers are not " +
                                      "implemented yet")
    return queue_name
Example #11
def submit_experiments(
        experiments,
        config,
        runner_args,
        logger,
        resources_needed):
    db = model.get_db_provider(config)
    verbose = model.parse_verbosity(config['verbose'])

    queue_name = 'local'
    if 'queue' in config.keys():
        queue_name = config['queue']
    if runner_args.queue:
        queue_name = runner_args.queue

    for e in experiments:
        e.pythonenv = add_packages(e.pythonenv, runner_args.python_pkg)
        db.add_experiment(e)
        logger.info("Added experiment " + e.key)

    if runner_args.cloud is not None:
        assert runner_args.cloud in ['gcloud', 'gcspot', 'ec2', 'ec2spot']

        assert runner_args.queue is None, \
            '--queue argument cannot be provided with --cloud argument'
        auth_cookie = None if config['database'].get('guest') \
            else os.path.join(
            auth.token_dir,
            config['database']['apiKey']
        )

        if runner_args.cloud in ['gcloud', 'gcspot']:

            queue_name = 'pubsub_' + str(uuid.uuid4())

            queue = PubsubQueue(queue_name, verbose=verbose)
            worker_manager = GCloudWorkerManager(
                auth_cookie=auth_cookie,
                zone=config['cloud']['zone']
            )

        if runner_args.cloud in ['ec2', 'ec2spot']:

            queue_name = 'sqs_' + str(uuid.uuid4())

            queue = SQSQueue(queue_name, verbose=verbose)
            worker_manager = EC2WorkerManager(
                auth_cookie=auth_cookie
            )

        if runner_args.cloud == 'gcloud' or \
           runner_args.cloud == 'ec2':

            num_workers = int(
                runner_args.num_workers) if runner_args.num_workers else 1
            for i in range(num_workers):
                worker_manager.start_worker(
                    queue_name, resources_needed,
                    ssh_keypair=runner_args.ssh_keypair,
                    timeout=runner_args.cloud_timeout)
        else:
            assert runner_args.bid is not None
            if runner_args.num_workers:
                start_workers = runner_args.num_workers
                queue_upscaling = False
            else:
                start_workers = 1
                queue_upscaling = True

            worker_manager.start_spot_workers(
                queue_name,
                runner_args.bid,
                resources_needed,
                start_workers=start_workers,
                queue_upscaling=queue_upscaling,
                ssh_keypair=runner_args.ssh_keypair,
                timeout=runner_args.cloud_timeout)

    else:
        if queue_name == 'local':
            queue = LocalQueue()
            queue.clean()
        elif queue_name.startswith('sqs_'):
            queue = SQSQueue(queue_name, verbose=verbose)
        else:
            queue = PubsubQueue(
                queue_name,
                config['database']['projectId'],
                verbose=verbose)

    for e in experiments:
        queue.enqueue(json.dumps({
            'experiment': e.__dict__,
            'config': config}))

    if queue_name == 'local':
        worker_args = ['studio-local-worker']

        if runner_args.config:
            worker_args += ['--config=' + runner_args.config]
        if runner_args.guest:
            worker_args += ['--guest']

        logger.info('worker args: {}'.format(worker_args))
        if not runner_args.num_workers or int(runner_args.num_workers) == 1:
            local_worker.main(worker_args)
        else:
            raise NotImplementedError("Multiple local workers are not " +
                                      "implemented yet")
    return
Example #12
def main(args=sys.argv):
    logger = logging.getLogger('studio-runner')
    parser = argparse.ArgumentParser(
        description='Studio runner. \
                     Usage: studio run <runner_arguments> \
                     script <script_arguments>')
    parser.add_argument('--config', help='configuration file', default=None)
    parser.add_argument('--project', help='name of the project', default=None)
    parser.add_argument(
        '--experiment', '-e',
        help='Name of the experiment. If none provided, ' +
             'random uuid will be generated',
        default=None)

    parser.add_argument(
        '--guest',
        help='Guest mode (does not require db credentials)',
        action='store_true')

    parser.add_argument(
        '--force-git',
        help='If run in a git directory, force running the experiment ' +
             'even if changes are not committed',
        action='store_true')

    parser.add_argument(
        '--gpus',
        help='Number of gpus needed to run the experiment',
        default=None)

    parser.add_argument(
        '--cpus',
        help='Number of cpus needed to run the experiment' +
             ' (used to configure cloud instance)',
        default=None)

    parser.add_argument(
        '--ram',
        help='Amount of RAM needed to run the experiment' +
             ' (used to configure cloud instance)',
        default=None)

    parser.add_argument(
        '--hdd',
        help='Amount of hard drive space needed to run the experiment' +
             ' (used to configure cloud instance)',
        default=None)

    parser.add_argument(
        '--queue', '-q',
        help='Name of the remote execution queue',
        default=None)

    parser.add_argument(
        '--cloud',
        help='Cloud execution mode. Could be gcloud, ec2 or ec2spot',
        default=None)

    parser.add_argument(
        '--bid',
        help='Spot instance price bid, specified in USD or in percentage ' +
             'of on-demand instance price. Default is %(default)s',
        default='100%')

    parser.add_argument(
        '--capture-once', '-co',
        help='Name of the immutable artifact to be captured. ' +
        'It will be captured once before the experiment is run',
        default=[], action='append')

    parser.add_argument(
        '--capture', '-c',
        help='Name of the mutable artifact to be captured continuously',
        default=[], action='append')

    parser.add_argument(
        '--reuse', '-r',
        help='Name of the artifact from another experiment to use',
        default=[], action='append')

    parser.add_argument(
        '--verbose', '-v',
        help='Verbosity level. Allowed values: ' +
             'debug, info, warn, error, crit ' +
             'or numerical value of logger levels.',
        default=None)

    parser.add_argument(
        '--metric', '-m',
        help='Metric to show in the summary of the experiment, ' +
             'and to base hyperparameter search on. ' +
             'Refers to a scalar value in the tensorboard log; ' +
             'example: --metric=val_loss[:final | :min | :max] to report ' +
             'validation loss in the end of the keras experiment ' +
             '(or smallest or largest throughout the experiment for :min ' +
             'and :max respectively)',
        default=None)

    parser.add_argument(
        '--hyperparam', '-hp',
        help='Try out multiple values of a certain parameter. ' +
             'For example, --hyperparam=learning_rate:0.01:0.1:l10 ' +
             'will instantiate 10 versions of the script, replace ' +
             'learning_rate with one of the 10 values for learning ' +
             'rate that lies on a log grid from 0.01 to 0.1, create '
             'experiments and place them in the queue.',
             default=[], action='append')

    parser.add_argument(
        '--num-workers',
        help='Number of local or cloud workers to spin up',
        default=None)

    parser.add_argument(
        '--python-pkg',
        help='Python package not present in the current environment ' +
             'that is needed for experiment. Only compatible with ' +
             'remote and cloud workers for now',
        default=[], action='append')

    parser.add_argument(
        '--ssh-keypair',
        help='Name of the SSH keypair used to access the EC2 ' +
             'instances directly',
        default=None)

    parser.add_argument(
        '--optimizer', '-opt',
        help='Name of optimizer to use, by default is grid search. ' +
        'The name of the optimizer must either be in ' +
        'studio/optimizer_plugins ' +
        'directory or the path to the optimizer source file ' +
        'must be supplied. ',
        default='grid')

    parser.add_argument(
        '--cloud-timeout',
        help="Time (in seconds) that cloud workers wait for messages. " +
             "If negative, " +
             "wait for the first message in the queue indefinitely " +
             "and shut down " +
             "as soon as no new messages are available. " +
             "If zero, don't wait at all." +
             "Default value is %(default)",
        type=int,
        default=300)

    # detect which argument is the script filename
    # and attribute all arguments past that index as related to the script
    py_suffix_args = [i for i, arg in enumerate(args) if arg.endswith('.py')]
    if len(py_suffix_args) < 1:
        print('At least one argument should be a python script ' +
              '(end with *.py)')
        parser.print_help()
        exit()

    script_index = py_suffix_args[0]
    runner_args = parser.parse_args(args[1:script_index])

    exec_filename, other_args = args[script_index], args[script_index + 1:]
    # TODO: Queue the job based on arguments and only then execute.

    config = model.get_config(runner_args.config)

    if runner_args.verbose:
        config['verbose'] = runner_args.verbose

    verbose = model.parse_verbosity(config['verbose'])
    logger.setLevel(verbose)

    db = model.get_db_provider(config)

    if git_util.is_git() and not git_util.is_clean():
        logger.warn('Running from dirty git repo')
        if not runner_args.force_git:
            logger.error(
                'Specify --force-git to run experiment from dirty git repo')
            sys.exit(1)

    resources_needed = parse_hardware(runner_args, config['cloud'])
    logger.debug('resources requested: ')
    logger.debug(str(resources_needed))

    artifacts = {}
    artifacts.update(parse_artifacts(runner_args.capture, mutable=True))
    artifacts.update(parse_artifacts(runner_args.capture_once, mutable=False))
    artifacts.update(parse_external_artifacts(runner_args.reuse, db))

    if any(runner_args.hyperparam):
        if runner_args.optimizer == "grid":
            experiments = add_hyperparam_experiments(
                exec_filename,
                other_args,
                runner_args,
                artifacts,
                resources_needed)
            submit_experiments(
                experiments,
                config,
                runner_args,
                logger,
                resources_needed)
        else:
            opt_modulepath = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "optimizer_plugins",
                runner_args.optimizer + ".py")
            # logger.info('optimizer path: %s' % opt_modulepath)
            if not os.path.exists(opt_modulepath):
                opt_modulepath = os.path.abspath(
                    os.path.expanduser(runner_args.optimizer))
            logger.info('optimizer path: %s' % opt_modulepath)
            assert os.path.exists(opt_modulepath)
            sys.path.append(os.path.dirname(opt_modulepath))
            opt_module = importlib.import_module(
                os.path.basename(opt_modulepath.replace(".py", '')))

            hyperparam_values, log_scale_dict = get_hyperparam_values(
                runner_args)

            optimizer = getattr(opt_module, "Optimizer")(hyperparam_values,
                                                         log_scale_dict)

            while not optimizer.stop():
                hyperparam_tuples = optimizer.ask()

                experiments = add_hyperparam_experiments(
                    exec_filename,
                    other_args,
                    runner_args,
                    artifacts,
                    resources_needed,
                    optimizer=optimizer,
                    hyperparam_tuples=hyperparam_tuples)
                submit_experiments(
                    experiments,
                    config,
                    runner_args,
                    logger,
                    resources_needed)

                fitnesses = get_experiment_fitnesses(experiments,
                                                     optimizer, config, logger)

                optimizer.tell(hyperparam_tuples, fitnesses)
                # if config['verbose'] == "info" or config['verbose'] ==
                # "debug":
                try:
                    optimizer.disp()
                except BaseException:
                    logger.warn('Optimizer has no disp() method')
    else:
        experiments = [model.create_experiment(
            filename=exec_filename,
            args=other_args,
            experiment_name=runner_args.experiment,
            project=runner_args.project,
            artifacts=artifacts,
            resources_needed=resources_needed,
            metric=runner_args.metric)]
        submit_experiments(
            experiments,
            config,
            runner_args,
            logger,
            resources_needed)

    db = None
    return
Example #13
def worker_loop(queue,
                parsed_args,
                single_experiment=False,
                timeout=0,
                verbose=None):

    fetch_artifacts = True

    logger = logs.getLogger('worker_loop')

    hold_period = 4
    retval = 0
    while True:
        msg = queue.dequeue(acknowledge=False, timeout=timeout)
        if not msg:
            break

        # first_exp, ack_key = queue.dequeue(acknowledge=False)
        first_exp, ack_key = msg

        data_dict = json.loads(sixdecode(first_exp))
        experiment_key = data_dict['experiment']['key']
        config = data_dict['config']

        parsed_args.config = config
        if verbose:
            config['verbose'] = verbose
        else:
            verbose = model.parse_verbosity(config.get('verbose'))

        logger.setLevel(verbose)

        logger.debug('Received message: \n{}'.format(data_dict))

        executor = LocalExecutor(parsed_args)

        with model.get_db_provider(config) as db:
            # experiment = experiment_from_dict(data_dict['experiment'])
            def try_get_experiment():
                experiment = db.get_experiment(experiment_key)
                if experiment is None:
                    raise ValueError(
                        'experiment is not found - indicates storage failure')
                return experiment

            experiment = retry(try_get_experiment,
                               sleep_time=10,
                               logger=logger)

            if config.get('experimentLifetime') and \
                int(str2duration(config['experimentLifetime'])
                    .total_seconds()) + experiment.time_added < time.time():
                logger.info(
                    'Experiment expired (max lifetime of {} was exceeded)'.
                    format(config.get('experimentLifetime')))
                queue.acknowledge(ack_key)
                continue

            if allocate_resources(experiment, config, verbose=verbose):

                def hold_job():
                    queue.hold(ack_key, hold_period)

                hold_job()
                sched = BackgroundScheduler()
                sched.add_job(hold_job, 'interval', minutes=hold_period / 2)
                sched.start()

                try:
                    python = 'python'
                    if experiment.pythonver == 3:
                        python = 'python3'
                    if '_singularity' not in experiment.artifacts.keys():
                        pip_diff = pip_needed_packages(experiment.pythonenv,
                                                       python)
                        if any(pip_diff):
                            logger.info(
                                'Setting up python packages for experiment')
                            if pip_install_packages(pip_diff, python,
                                                    logger) != 0:

                                logger.info(
                                    "Installation of all packages together " +
                                    " failed, "
                                    "trying one package at a time")

                                for pkg in pip_diff:
                                    pip_install_packages([pkg], python, logger)

                    for tag, art in six.iteritems(experiment.artifacts):
                        if fetch_artifacts or 'local' not in art.keys():
                            logger.info('Fetching artifact ' + tag)
                            if tag == 'workspace':
                                art['local'] = retry(lambda: db.get_artifact(
                                    art, only_newer=False),
                                                     sleep_time=10,
                                                     logger=logger)
                            else:
                                art['local'] = retry(
                                    lambda: db.get_artifact(art),
                                    sleep_time=10,
                                    logger=logger)

                    returncode = executor.run(experiment)
                    if returncode != 0:
                        retval = returncode
                finally:
                    sched.shutdown()
                    queue.acknowledge(ack_key)

                if single_experiment:
                    logger.info('single_experiment is True, quitting')
                    return retval
            else:
                logger.info('Cannot run experiment ' + experiment.key +
                            ' due to lack of resources. Will retry')
                time.sleep(config['sleep_time'])

        # wait_for_messages(queue, timeout, logger)

        # queue = glob.glob(fs_tracker.get_queue_directory() + "/*")

    logger.info("Queue in {} is empty, quitting".format(
        fs_tracker.get_queue_directory()))

    return retval
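
The retry() helper called above is not part of this snippet; a minimal sketch with the same call signature (the attempt count is an assumption) might look like this:

import time


def retry(fn, sleep_time=10, logger=None, attempts=5):
    # Call fn() until it succeeds, sleeping between failed attempts.
    last_exc = None
    for i in range(attempts):
        try:
            return fn()
        except Exception as exc:
            last_exc = exc
            if logger:
                logger.warning('attempt %d/%d failed: %s', i + 1, attempts, exc)
            time.sleep(sleep_time)
    raise last_exc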
Example #14
def worker_loop(queue,
                parsed_args,
                setup_pyenv=False,
                single_experiment=False,
                fetch_artifacts=False,
                timeout=0):

    logger = logging.getLogger('worker_loop')

    hold_period = 4
    while queue.has_next():

        first_exp, ack_key = queue.dequeue(acknowledge=False)

        experiment_key = json.loads(first_exp)['experiment']['key']
        config = json.loads(first_exp)['config']
        parsed_args.config = config
        verbose = model.parse_verbosity(config.get('verbose'))
        logger.setLevel(verbose)

        logger.debug(
            'Received experiment {} with config {} from the queue'.format(
                experiment_key, config))

        executor = LocalExecutor(parsed_args)
        experiment = executor.db.get_experiment(experiment_key)

        if allocate_resources(experiment, config, verbose=verbose):

            def hold_job():
                queue.hold(ack_key, hold_period)

            hold_job()
            sched = BackgroundScheduler()
            sched.add_job(hold_job, 'interval', minutes=hold_period / 2)
            sched.start()

            try:
                if setup_pyenv:
                    logger.info('Setting up python packages for experiment')
                    pipp = subprocess.Popen(['pip', 'install'] +
                                            experiment.pythonenv,
                                            stdout=subprocess.PIPE,
                                            stderr=subprocess.STDOUT)

                    pipout, _ = pipp.communicate()
                    logger.info("pip output: \n" + pipout)

                    # pip.main(['install'] + experiment.pythonenv)

                for tag, art in experiment.artifacts.iteritems():
                    if fetch_artifacts or 'local' not in art.keys():
                        logger.info('Fetching artifact ' + tag)
                        if tag == 'workspace':
                            # art['local'] = executor.db.store.get_artifact(
                            #    art, '.', only_newer=False)
                            art['local'] = executor.db.store.get_artifact(
                                art, only_newer=False)
                        else:
                            art['local'] = executor.db.store.get_artifact(art)
                executor.run(experiment)
            finally:
                sched.shutdown()
                queue.acknowledge(ack_key)

            if single_experiment:
                logger.info('single_experiment is True, quitting')
                return
        else:
            logger.info('Cannot run experiment ' + experiment.key +
                        ' due to lack of resources. Will retry')
            time.sleep(config['sleep_time'])

        wait_for_messages(queue, timeout, logger)

        # queue = glob.glob(fs_tracker.get_queue_directory() + "/*")

    logger.info("Queue in {} is empty, quitting".format(
        fs_tracker.get_queue_directory()))
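
Both worker loops rely on the same lease-extension idea: re-hold the in-flight message on an interval shorter than the hold period so the queue does not redeliver it while the experiment is still running. A stand-alone sketch of that pattern with APScheduler's BackgroundScheduler:

import time

from apscheduler.schedulers.background import BackgroundScheduler

hold_period = 4  # minutes


def hold_job():
    # In the real worker this would call queue.hold(ack_key, hold_period).
    print('extending message hold for another {} minutes'.format(hold_period))


hold_job()
sched = BackgroundScheduler()
sched.add_job(hold_job, 'interval', minutes=hold_period / 2)
sched.start()
try:
    time.sleep(5)      # stand-in for the long-running experiment
finally:
    sched.shutdown()   # stop extending the hold once the work is done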