Exemplo n.º 1
0
    def _test_serving(self, data_in, expected_data_out, wrapper=None):
        """Spin up a serving experiment, POST ``data_in`` to it and assert
        the JSON response equals ``expected_data_out``.

        The serving experiment is always stopped and deleted afterwards,
        even when the assertion fails.
        """
        self.port = randint(5000, 9000)
        experiment_key = 'test_serving_' + str(uuid.uuid4())

        with get_local_queue_lock():
            cmdline = ['studio', 'run', '--force-git', '--verbose=debug',
                       '--experiment=' + experiment_key,
                       '--config=' + self.get_config_path(),
                       'studio::serve_main',
                       '--port=' + str(self.port),
                       '--host=localhost']
            if wrapper:
                cmdline += ['--wrapper=' + wrapper]

            subprocess.Popen(cmdline, cwd=os.path.dirname(__file__))
            # give the server time to spin up before querying it
            time.sleep(60)

        try:
            response = requests.post(
                url='http://localhost:' + str(self.port), json=data_in)
            assert response.json() == expected_data_out
        finally:
            # always tear down the serving experiment
            config = model.get_config(self.get_config_path())
            with model.get_db_provider(config) as db:
                db.stop_experiment(experiment_key)
                time.sleep(20)
                db.delete_experiment(experiment_key)
    def test_experiment_lifetime(self):
        """Run an experiment whose lifetime has already expired (-10m)
        and verify 'studio run' completes; clean up the experiment key."""
        test_dir = os.path.dirname(os.path.realpath(__file__))

        logger = logs.getLogger('test_experiment_lifetime')
        logger.setLevel(10)

        config_path = os.path.join(test_dir, 'test_config_http_client.yaml')
        key = 'test_experiment_lifetime' + str(uuid.uuid4())

        with model.get_db_provider(model.get_config(config_path)) as db:
            # make sure no stale experiment with this key exists
            try:
                db.delete_experiment(key)
            except Exception:
                pass

            cmd = ['studio', 'run',
                   '--config=' + config_path,
                   '--experiment=' + key,
                   '--force-git',
                   '--verbose=debug',
                   '--lifetime=-10m',
                   'stop_experiment.py']
            proc = subprocess.Popen(cmd,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT,
                                    cwd=test_dir)

            output, _ = proc.communicate()
            if output:
                logger.debug("studio run output: \n" + output.decode())

            db.delete_experiment(key)
Exemplo n.º 3
0
    def getResultsWithTimeout(self, timeout=0):
        """Poll the database until a submitted experiment finishes and
        return ``(key, unpickled_retval)``; return None on timeout.

        timeout == 0 gives up after a single pass; timeout < 0 polls
        forever; timeout > 0 polls until total sleep exceeds it.
        """
        waited = 0
        poll_interval = 1

        while True:
            with model.get_db_provider(self.config) as db:
                if self.resumable:
                    keys = db.get_project_experiments(
                        self.project_name).keys()
                else:
                    keys = self.submitted

                for key in keys:
                    experiment = db.get_experiment(key)
                    if experiment is None or \
                            experiment.status != 'finished':
                        continue

                    self.logger.debug(
                        'Experiment {} finished, getting results' .format(
                            experiment.key))
                    retval_path = db.get_artifact(
                        experiment.artifacts['retval'])
                    with open(retval_path, 'rb') as f:
                        data = pickle.load(f)

                    # resumable runs are cleaned up in the db; otherwise
                    # just forget the key locally
                    if self.resumable:
                        db.delete_experiment(experiment.key)
                    else:
                        self.submitted.remove(experiment.key)

                    return (experiment.key, data)

            if timeout == 0 or (timeout > 0 and waited > timeout):
                return None

            time.sleep(poll_interval)
            waited += poll_interval
Exemplo n.º 4
0
    def get_store(self, config_name='test_config.yaml'):
        """Load the YAML config located next to this file and return the
        store of the resulting database provider."""
        here = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(here, config_name)) as f:
            config = yaml.load(f, Loader=yaml.SafeLoader)

        return model.get_db_provider(config).store
Exemplo n.º 5
0
    def __enter__(self):
        """Authenticate against the database, then start workers: cloud
        workers (spot or on-demand) via the worker manager if one is
        configured, otherwise a local worker subprocess."""
        # touch the db provider once so auth happens up front
        with model.get_db_provider(self.config):
            pass

        if self.wm:
            self.logger.debug('Spinning up cloud workers')
            if self.use_spot:
                self.wm.start_spot_workers(
                    self.queue_name,
                    self.bid,
                    self.resources_needed,
                    start_workers=self.num_workers,
                    queue_upscaling=self.queue_upscaling,
                    ssh_keypair=self.ssh_keypair,
                    timeout=self.cloud_timeout)
            else:
                for _ in range(self.num_workers):
                    self.wm.start_worker(
                        self.queue_name,
                        self.resources_needed,
                        ssh_keypair=self.ssh_keypair,
                        timeout=self.cloud_timeout)
            self.p = None
        else:
            self.logger.debug('Starting local worker')
            worker_cmd = ['studio-local-worker',
                          '--verbose=%s' % self.config['verbose'],
                          '--timeout=' + str(self.cloud_timeout)]
            self.p = subprocess.Popen(worker_cmd, close_fds=True)

        return self
Exemplo n.º 6
0
    def get_firebase_provider(self, config_name='test_config.yaml'):
        """Load the YAML config located next to this file and return a
        database provider built from it.

        Uses SafeLoader explicitly: calling yaml.load without a Loader
        is deprecated (PyYAML >= 5.1) and unsafe on untrusted input.
        """
        config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   config_name)
        with open(config_file) as f:
            config = yaml.load(f, Loader=yaml.SafeLoader)

        return model.get_db_provider(config)
Exemplo n.º 7
0
    def _create_artifacts(
            self,
            client_code_file,
            args_file,
            workspace_new,
            files):
        artifacts = {
            'retval': {
                'mutable': True,
                'unpack': True
            },
            'clientscript': {
                'mutable': False,
                'local': client_code_file,
                'unpack': True
            },
            'args': {
                'mutable': False,
                'local': args_file,
                'unpack': True
            },
            'workspace': {
                'mutable': False,
                'local': workspace_new,
                'unpack': True
            }
        }

        for tag, name in six.iteritems(files):
            artifacts[tag] = {}
            url_schema = re.compile('^https{0,1}://')
            s3_schema = re.compile('^s3://')
            gcs_schema = re.compile('^gs://')
            studio_schema = re.compile(
                'studio://(?P<experiment>.+)/(?P<artifact>.+)')

            if url_schema.match(name):
                artifacts[tag]['url'] = name
                artifacts[tag]['unpack'] = False
            elif s3_schema.match(name) or gcs_schema.match(name):
                artifacts[tag]['qualified'] = name
                artifacts[tag]['unpack'] = False
            elif studio_schema.match(name):
                ext_experiment_key = studio_schema.match(
                    name).group('experiment')
                ext_tag = studio_schema.match(name).group('artifact')
                with model.get_db_provider(self.config) as db:
                    ext_experiment = db.get_experiment(ext_experiment_key)

                artifacts[tag]['key'] = \
                    ext_experiment.artifacts[ext_tag]['key']
                artifacts[tag]['unpack'] = True
            else:
                artifacts[tag]['local'] = os.path.abspath(
                    os.path.expanduser(name))
                artifacts[tag]['unpack'] = True

            artifacts[tag]['mutable'] = False

        return artifacts
Exemplo n.º 8
0
    def get_provider(self, config_name=None):
        """Return a db provider from ``config_name`` (a YAML file next to
        this file), falling back to the default config name.

        Uses SafeLoader explicitly: calling yaml.load without a Loader
        is deprecated (PyYAML >= 5.1) and unsafe on untrusted input.
        """
        config_name = config_name if config_name else \
            self.get_default_config_name()

        config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   config_name)
        with open(config_file) as f:
            config = yaml.load(f, Loader=yaml.SafeLoader)

        return model.get_db_provider(config)
Exemplo n.º 9
0
    def getResultsWithTimeout(self, timeout=0):
        """Poll submitted experiments until one has produced a fresh
        'retval' artifact; return ``(key, unpickled_data)`` or None on
        timeout.

        An artifact counts as a result only if its mtime is newer than
        the recorded submission time. timeout == 0 gives up after one
        pass; timeout < 0 polls forever; timeout > 0 polls until total
        sleep exceeds it. Only non-resumable mode is supported.
        """
        total_sleep_time = 0
        sleep_time = self.sleep_time

        assert self.resumable is False

        while True:
            with model.get_db_provider(self.config) as db:

                for key, submitted_time in six.iteritems(self.submitted):
                    try:
                        e = db.get_experiment(key)
                        if e is not None:
                            retval_path = db.get_artifact(
                                e.artifacts['retval'])
                            # only accept artifacts written after the
                            # experiment was submitted
                            if os.path.exists(retval_path) and \
                               os.path.getmtime(retval_path) > submitted_time:
                                with open(retval_path, 'rb') as f:
                                    data = pickle.load(f)

                                del self.submitted[e.key]
                                return (e.key, data)
                    except Exception as exc:
                        # was: BaseException, which also swallowed
                        # KeyboardInterrupt / SystemExit
                        self.logger.debug(
                            "Getting result failed due to exception:")
                        self.logger.debug(exc)

            if timeout == 0 or \
               (timeout > 0 and total_sleep_time > timeout):
                return None

            if self.p is not None:
                assert self.p.poll() is None, \
                    "Executor process died, no point in waiting for results"

            time.sleep(sleep_time)
            total_sleep_time += sleep_time
Exemplo n.º 10
0
def get_db():
    """Return the cached database provider, recreating it when it is
    missing or older than DB_PROVIDER_EXPIRATION seconds."""
    global _config
    global _db_provider
    global _db_provider_timestamp

    stale = (
        not _db_provider or
        not _db_provider_timestamp or
        time.time() - _db_provider_timestamp > DB_PROVIDER_EXPIRATION)

    if stale:
        _db_provider = model.get_db_provider(_config, blocking_auth=False)
        _db_provider_timestamp = time.time()

    return _db_provider
Exemplo n.º 11
0
def main(args=sys.argv[1:]):
    """Entry point for the Studio WebUI server.

    Parses command-line arguments, loads the configuration (optionally
    replaced by --config and overridden by --verbose), initializes the
    module-level db provider, and starts the Flask app.
    """
    parser = argparse.ArgumentParser(
        description='Studio WebUI server. \
                     Usage: studio \
                     <arguments>')

    parser.add_argument('--config', help='configuration file', default=None)
#    parser.add_argument('--guest',
#                        help='Guest mode (does not require db credentials)',
#                        action='store_true')

    parser.add_argument('--port',
                        help='port to run Flask server on',
                        type=int,
                        default=5000)

    parser.add_argument('--host',
                        help='host name.',
                        default='localhost')

    parser.add_argument(
        '--verbose', '-v',
        help='Verbosity level. Allowed vaules: ' +
             'debug, info, warn, error, crit ' +
             'or numerical value of logger levels.',
        default=None)

    args = parser.parse_args(args)
    # start from the default config; replaced wholesale if --config is given
    config = model.get_config()
    if args.config:
        with open(args.config) as f:
            config = yaml.load(f, Loader=yaml.FullLoader)

    if args.verbose:
        config['verbose'] = args.verbose

#    if args.guest:
#        config['database']['guest'] = True
    # module-level state consumed by request handlers (e.g. get_db)
    global _config
    global _db_provider
    _config = config
    _db_provider = model.get_db_provider(_config)

    getlogger().setLevel(parse_verbosity(config.get('verbose')))

    global _save_auth_cookie
    _save_auth_cookie = True

    print('Starting Studio UI on port {0}'.format(args.port))
    app.run(host=args.host, port=args.port)
Exemplo n.º 12
0
def _list(args, cli_args):
    """Implement the 'list' CLI command: print experiments for the
    current user, a project, a specific user (by email), or everyone,
    or list all known user emails.

    args: subcommand tokens after 'list'
    cli_args: parsed CLI namespace (uses .config and .short)

    Fix: the original used Python 2 print statements, which are syntax
    errors under Python 3 and inconsistent with the rest of the file.
    """
    with model.get_db_provider(cli_args.config) as db:
        if len(args) == 0:
            experiments = db.get_user_experiments()
        elif args[0] == 'project':
            assert len(args) == 2
            experiments = db.get_project_experiments(args[1])
        elif args[0] == 'users':
            assert len(args) == 1
            users = db.get_users()
            for u in users.keys():
                print(users[u].get('email'))
            return
        elif args[0] == 'user':
            assert len(args) == 2
            users = db.get_users()
            user_ids = [u for u in users if users[u].get('email') == args[1]]
            assert len(user_ids) == 1, \
                'The user with email ' + args[1] + \
                'not found!'
            experiments = db.get_user_experiments(user_ids[0])
        elif args[0] == 'all':
            assert len(args) == 1
            users = db.get_users()
            experiments = []
            for u in users:
                experiments += db.get_user_experiments(u)
        else:
            get_logger().critical('Unknown command ' + args[0])
            return

        # short mode: print keys only, no experiment details fetched
        if cli_args.short:
            for e in experiments:
                print(e)
            return

        experiments = [db.get_experiment(e) for e in experiments]

    # newest first
    experiments.sort(key=lambda e: -e.time_added)
    table = [['Time added', 'Key', 'Project', 'Status']]

    for e in experiments:
        table.append([
            time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(e.time_added)),
            e.key,
            e.project,
            e.status])

    print(AsciiTable(table).table)
    def test_stop_experiment(self):
        """Start an experiment, stop it through the db provider once it
        leaves the 'waiting' state, then clean up its record."""
        test_dir = os.path.dirname(os.path.realpath(__file__))

        logger = logs.getLogger('test_stop_experiment')
        logger.setLevel(10)

        config_path = os.path.join(test_dir, 'test_config_http_client.yaml')
        key = 'test_stop_experiment' + str(uuid.uuid4())

        with model.get_db_provider(model.get_config(config_path)) as db:
            # clear any stale experiment with the same key
            try:
                db.delete_experiment(key)
            except Exception:
                pass

            proc = subprocess.Popen(['studio', 'run',
                                     '--config=' + config_path,
                                     '--experiment=' + key,
                                     '--force-git',
                                     '--verbose=debug',
                                     'stop_experiment.py'],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT,
                                    cwd=test_dir)

            # wait till experiment spins up
            experiment = None
            while experiment is None or experiment.status == 'waiting':
                time.sleep(1)
                try:
                    experiment = db.get_experiment(key)
                except BaseException:
                    pass

            logger.info('Stopping experiment')
            db.stop_experiment(key)

            output, _ = proc.communicate()
            if output:
                logger.debug("studio run output: \n" + output.decode())

            db.delete_experiment(key)
Exemplo n.º 14
0
    def __enter__(self):
        """Authenticate with the db, then launch workers: cloud workers
        through the worker manager when one is configured, a local
        worker subprocess when the queue is local, and nothing when a
        remote queue is serviced by separately managed servers."""
        # touch the db provider once so auth happens up front
        with model.get_db_provider(self.config):
            pass
        self.p = None
        if self.wm:
            self.logger.debug('Spinning up cloud workers')
            if self.use_spot:
                self.wm.start_spot_workers(
                    self.queue_name,
                    self.bid,
                    self.resources_needed,
                    start_workers=self.num_workers,
                    queue_upscaling=self.queue_upscaling,
                    ssh_keypair=self.ssh_keypair,
                    timeout=self.cloud_timeout)
            else:
                for _ in range(self.num_workers):
                    self.wm.start_worker(
                        self.queue_name,
                        self.resources_needed,
                        ssh_keypair=self.ssh_keypair,
                        timeout=self.cloud_timeout)
        elif self.queue_name is None or self.queue_name == 'local':
            self.logger.debug('Starting local worker')
            worker_cmd = ['studio-local-worker',
                          '--verbose=%s' % self.config['verbose'],
                          '--timeout=' + str(self.cloud_timeout)]
            self.p = subprocess.Popen(worker_cmd, close_fds=True)

        # yet another case is when queue name is specified, but
        # cloud is not - that means running on a separately
        # managed server that listens to the queue
        #
        # The contract is queue_name that starts with sqs or ec2
        # is an SQS queue, otherwise, it is a PubSub queue

        return self
Exemplo n.º 15
0
def stubtest_worker(testclass,
                    experiment_name,
                    runner_args,
                    config_name,
                    test_script,
                    expected_output,
                    script_args=None,
                    queue=None,
                    wait_for_experiment=True,
                    delete_when_done=True,
                    test_output=True):
    """Submit ``test_script`` through 'studio run' and verify the stored
    experiment: saved args, the final line of its output artifact, and
    its workspace. Returns the db provider so callers can check more.

    Fixes over the previous version:
    - mutable default arguments (``[]`` and ``LocalQueue()``) were
      created once at definition time and shared across calls; they are
      now created per call
    - subprocess output is bytes on Python 3 and must be decoded before
      being concatenated to a str for logging
    - ``assertEquals`` is a deprecated alias of ``assertEqual``
    """
    if script_args is None:
        script_args = []
    if queue is None:
        queue = LocalQueue()

    my_path = os.path.dirname(os.path.realpath(__file__))
    config_name = os.path.join(my_path, config_name)
    logger = logging.getLogger('stubtest_worker')
    logger.setLevel(10)

    queue.clean()

    # remove leftovers from previous runs of the same experiment
    with model.get_db_provider(model.get_config(config_name)) as db:
        try:
            db.delete_experiment(experiment_name)
        except Exception:
            pass

    p = subprocess.Popen(['studio', 'run'] + runner_args + [
        '--config=' + config_name, '--verbose=debug', '--force-git',
        '--experiment=' + experiment_name, test_script
    ] + script_args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT,
                         close_fds=True,
                         cwd=my_path)

    pout, _ = p.communicate()

    if pout:
        logger.debug("studio run output: \n" + pout.decode())

    db = model.get_db_provider(model.get_config(config_name))
    experiments = [
        e for e in db.get_user_experiments() if e.startswith(experiment_name)
    ]

    assert len(experiments) == 1

    experiment_name = experiments[0]

    try:
        # test saved arguments
        keybase = "/experiments/" + experiment_name
        saved_args = db._get(keybase + '/args')
        if saved_args is not None:
            testclass.assertTrue(len(saved_args) == len(script_args))
            for i in range(len(saved_args)):
                testclass.assertTrue(saved_args[i] == script_args[i])
            testclass.assertTrue(db._get(keybase + '/filename') == test_script)
        else:
            testclass.assertTrue(script_args is None or len(script_args) == 0)

        experiment = db.get_experiment(experiment_name)
        if wait_for_experiment:
            # poll until the experiment reports completion
            while not experiment.status == 'finished':
                time.sleep(1)
                experiment = db.get_experiment(experiment_name)

        if test_output:
            # the last line of the 'output' artifact is the script result
            with open(db.store.get_artifact(experiment.artifacts['output']),
                      'r') as f:
                data = f.read()
                split_data = data.strip().split('\n')
                testclass.assertEqual(split_data[-1], expected_output)

        check_workspace(testclass, db, experiment_name)

        if delete_when_done:
            db.delete_experiment(experiment_name)

        return db

    except Exception as e:
        print("Exception {} raised during test".format(e))
        print("worker output: \n {}".format(pout))
        print("Exception trace:")
        print(traceback.format_exc())
        raise e
Exemplo n.º 16
0
def _kill(args, cli_args):
    """Implement the 'kill' CLI command: delete every experiment key
    listed in ``args``."""
    with model.get_db_provider(cli_args.config) as db:
        for experiment_key in args:
            get_logger().info('Deleting experiment ' + experiment_key)
            db.delete_experiment(experiment_key)
Exemplo n.º 17
0
 def get_db_provider(self):
     """Return a db provider whose serverUrl points at the local test
     HTTP server (localhost on self.port)."""
     config = model.get_config('test_config_http_client.yaml')
     config['database']['serverUrl'] = 'http://localhost:' + str(self.port)
     return model.get_db_provider(config)
Exemplo n.º 18
0
def _get_provider():
    """Build a database provider from the current configuration."""
    return model.get_db_provider(_get_config())
Exemplo n.º 19
0
def main(args=sys.argv[1:]):
    logger = logs.get_logger('studio-runner')
    parser = argparse.ArgumentParser(description='Studio runner. \
                     Usage: studio run <runner_arguments> \
                     script <script_arguments>')
    parser.add_argument('--config', help='configuration file', default=None)
    parser.add_argument('--project', help='name of the project', default=None)
    parser.add_argument('--experiment',
                        '-e',
                        help='Name of the experiment. If none provided, ' +
                        'random uuid will be generated',
                        default=None)

    parser.add_argument('--guest',
                        help='Guest mode (does not require db credentials)',
                        action='store_true')

    parser.add_argument(
        '--force-git',
        help='If run in a git directory, force running the experiment ' +
        'even if changes are not commited',
        action='store_true')

    parser.add_argument('--gpus',
                        help='Number of gpus needed to run the experiment',
                        type=int,
                        default=None)

    parser.add_argument('--cpus',
                        help='Number of cpus needed to run the experiment' +
                        ' (used to configure cloud instance)',
                        type=int,
                        default=None)

    parser.add_argument('--ram',
                        help='Amount of RAM needed to run the experiment' +
                        ' (used to configure cloud instance), ex: 10G, 10GB',
                        default=None)

    parser.add_argument('--gpuMem',
                        help='Amount of GPU RAM needed to run the experiment',
                        default=None)

    parser.add_argument(
        '--hdd',
        help='Amount of hard drive space needed to run the experiment' +
        ' (used to configure cloud instance), ex: 10G, 10GB',
        default=None)

    parser.add_argument('--queue',
                        '-q',
                        help='Name of the remote execution queue',
                        default=None)

    parser.add_argument(
        '--cloud',
        help='Cloud execution mode. Could be gcloud, gcspot, ec2 or ec2spot',
        default=None)

    parser.add_argument(
        '--bid',
        help='Spot instance price bid, specified in USD or in percentage ' +
        'of on-demand instance price. Default is %(default)s',
        default='100%')

    parser.add_argument(
        '--capture-once',
        '-co',
        help='Name of the immutable artifact to be captured. ' +
        'It will be captured once before the experiment is run',
        default=[],
        action='append')

    parser.add_argument(
        '--capture',
        '-c',
        help='Name of the mutable artifact to be captured continuously',
        default=[],
        action='append')

    parser.add_argument(
        '--reuse',
        '-r',
        help='Name of the artifact from another experiment to use',
        default=[],
        action='append')

    parser.add_argument('--verbose',
                        '-v',
                        help='Verbosity level. Allowed values: ' +
                        'debug, info, warn, error, crit ' +
                        'or numerical value of logger levels.',
                        default=None)

    parser.add_argument(
        '--metric',
        help='Metric to show in the summary of the experiment, ' +
        'and to base hyperparameter search on. ' +
        'Refers a scalar value in tensorboard log ' +
        'example: --metric=val_loss[:final | :min | :max] to report ' +
        'validation loss in the end of the keras experiment ' +
        '(or smallest or largest throughout the experiment for :min ' +
        'and :max respectively)',
        default=None)

    parser.add_argument(
        '--hyperparam',
        '-hp',
        help='Try out multiple values of a certain parameter. ' +
        'For example, --hyperparam=learning_rate:0.01:0.1:l10 ' +
        'will instantiate 10 versions of the script, replace ' +
        'learning_rate with a one of the 10 values for learning ' +
        'rate that lies on a log grid from 0.01 to 0.1, create '
        'experiments and place them in the queue.',
        default=[],
        action='append')

    parser.add_argument('--num-workers',
                        help='Number of local or cloud workers to spin up',
                        type=int,
                        default=None)

    parser.add_argument(
        '--python-pkg',
        help='Python package not present in the current environment ' +
        'that is needed for experiment. Only compatible with ' +
        'remote and cloud workers for now',
        default=[],
        action='append')

    parser.add_argument(
        '--ssh-keypair',
        help='Name of the SSH keypair used to access the EC2 ' +
        'instances directly',
        default=None)

    parser.add_argument(
        '--optimizer',
        '-opt',
        help='Name of optimizer to use, by default is grid search. ' +
        'The name of the optimizer must either be in ' +
        'studio/optimizer_plugins ' +
        'directory or the path to the optimizer source file ' +
        'must be supplied. ',
        default='grid')

    parser.add_argument(
        '--cloud-timeout',
        help="Time (in seconds) that cloud workers wait for messages. " +
        "If negative, " +
        "wait for the first message in the queue indefinitely " +
        "and shut down " + "as soon as no new messages are available. " +
        "If zero, don't wait at all." + "Default value is %(default)d",
        type=int,
        default=300)

    parser.add_argument('--user-startup-script',
                        help='Path of script to run immediately ' +
                        'before running the remote worker',
                        default=None)

    parser.add_argument(
        '--branch',
        help='Branch of studioml to use when running remote worker, useful ' +
        'for debugging pull requests. Default is current',
        default=None)

    parser.add_argument(
        '--max-duration',
        help='Max experiment runtime (i.e. time after which experiment ' +
        'should be killed no matter what.).  Examples of values ' +
        'might include 5h, 48h2m10s',
        default=None)

    parser.add_argument(
        '--lifetime',
        help='Max experiment lifetime (i.e. wait time after which ' +
        'experiment loses relevance and should not be started)' +
        '  Examples include 240h30m10s',
        default=None)

    parser.add_argument(
        '--container',
        help='Singularity container in which experiment should be run. ' +
        'Assumes that container has all dependencies installed',
        default=None)

    parser.add_argument('--port',
                        help='Ports to open on a cloud instance',
                        default=[],
                        action='append')

    # detect which argument is the script filename
    # and attribute all arguments past that index as related to the script
    (runner_args, other_args) = parser.parse_known_args(args)
    py_suffix_args = [
        i for i, arg in enumerate(args) if arg.endswith('.py') or '::' in arg
    ]

    rerun = False
    if len(py_suffix_args) < 1:
        print('None of the arugments end with .py')
        if len(other_args) == 0:
            print("Trying to run a container job")
            assert runner_args.container is not None
            exec_filename = None
        elif len(other_args) == 1:
            print("Treating last argument as experiment key to rerun")
            rerun = True
            experiment_key = args[-1]
        else:
            print("Too many extra arguments - should be either none " +
                  "for container job or one for experiment re-run")
            sys.exit(1)
    else:
        script_index = py_suffix_args[0]
        exec_filename, other_args = args[script_index], args[script_index + 1:]
        runner_args = parser.parse_args(args[:script_index])

    # TODO: Queue the job based on arguments and only then execute.

    config = model.get_config(runner_args.config)

    if runner_args.verbose:
        config['verbose'] = runner_args.verbose

    if runner_args.guest:
        config['database']['guest'] = True

    if runner_args.container:
        runner_args.capture_once.append(runner_args.container +
                                        ':_singularity')

    verbose = model.parse_verbosity(config['verbose'])
    logger.setLevel(verbose)

    if git_util.is_git() and not git_util.is_clean() and not rerun:
        logger.warn('Running from dirty git repo')
        if not runner_args.force_git:
            logger.error(
                'Specify --force-git to run experiment from dirty git repo')
            sys.exit(1)

    resources_needed = _parse_hardware(runner_args, config['resources_needed'])
    logger.debug('resources requested: ')
    logger.debug(str(resources_needed))

    # Set up default artifacts:
    # note that their "local" paths will be updated
    # on Experiment creation,
    # but they must have "local" field defined
    # to have storage credentials set up properly.
    artifacts = {
        'workspace': {
            'mutable': False,
            'local': os.getcwd(),
            'unpack': True
        },
        'modeldir': {
            'mutable': True,
            'local': '',
            'unpack': True
        },
        'retval': {
            'mutable': True,
            'local': '',
            'unpack': True
        },
        'output': {
            'mutable': True,
            'local': '',
            'unpack': True
        },
        'tb': {
            'mutable': True,
            'local': '',
            'unpack': True
        },
        '_metrics': {
            'mutable': True,
            'local': '',
            'unpack': True
        },
        '_metadata': {
            'mutable': True,
            'local': '',
            'unpack': True
        }
    }

    artifacts.update(_parse_artifacts(runner_args.capture, mutable=True))
    artifacts.update(_parse_artifacts(runner_args.capture_once, mutable=False))
    with model.get_db_provider(config) as db:
        artifacts.update(_parse_external_artifacts(runner_args.reuse, db))

    logger.debug("Task artifacts: %s", repr(artifacts))
    storage_creds = config.get('storage', {}).get(KEY_CREDENTIALS, None)
    _setup_artifacts_creds(artifacts, storage_creds)

    if runner_args.branch:
        config['cloud']['branch'] = runner_args.branch

    if runner_args.user_startup_script:
        config['cloud']['user_startup_script'] = \
            runner_args.user_startup_script

    if runner_args.lifetime:
        config['experimentLifetime'] = runner_args.lifetime

    queueLifetime = None

    if any(runner_args.hyperparam):
        if runner_args.optimizer == "grid":
            experiments = _add_hyperparam_experiments(exec_filename,
                                                      other_args, runner_args,
                                                      artifacts,
                                                      resources_needed, logger)

            queue = model.get_queue(queue_name=runner_args.queue,
                                    cloud=runner_args.cloud,
                                    config=config,
                                    close_after=queueLifetime,
                                    logger=logger,
                                    verbose=verbose)

            queue_name = submit_experiments(experiments,
                                            config=config,
                                            logger=logger,
                                            queue=queue)

            spin_up_workers(runner_args,
                            config,
                            resources_needed,
                            queue_name=queue_name,
                            verbose=verbose)
        else:
            opt_modulepath = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "optimizer_plugins", runner_args.optimizer + ".py")
            if not os.path.exists(opt_modulepath):
                opt_modulepath = os.path.abspath(
                    os.path.expanduser(runner_args.optimizer))
            logger.info('optimizer path: %s' % opt_modulepath)

            assert os.path.exists(opt_modulepath)
            sys.path.append(os.path.dirname(opt_modulepath))
            opt_module = importlib.import_module(
                os.path.basename(opt_modulepath.replace(".py", '')))

            h = HyperparameterParser(runner_args, logger)
            hyperparams = h.parse()
            optimizer = getattr(opt_module,
                                "Optimizer")(hyperparams, config['optimizer'],
                                             logger)

            workers_started = False
            queue_name = runner_args.queue
            while not optimizer.stop():
                hyperparam_pop = optimizer.ask()
                hyperparam_tuples = h.convert_to_tuples(hyperparam_pop)

                experiments = _add_hyperparam_experiments(
                    exec_filename,
                    other_args,
                    runner_args,
                    artifacts,
                    resources_needed,
                    logger,
                    optimizer=optimizer,
                    hyperparam_tuples=hyperparam_tuples)

                queue = model.get_queue(queue_name=queue_name,
                                        cloud=runner_args.cloud,
                                        config=config,
                                        close_after=queueLifetime,
                                        logger=logger,
                                        verbose=verbose)

                queue_name = submit_experiments(experiments,
                                                config=config,
                                                logger=logger,
                                                queue=queue)

                if not workers_started:
                    spin_up_workers(runner_args,
                                    config,
                                    resources_needed,
                                    queue_name=queue_name,
                                    verbose=verbose)
                    workers_started = True

                fitnesses, behaviors = get_experiment_fitnesses(
                    experiments, optimizer, config, logger)

                try:
                    optimizer.tell(hyperparam_pop, fitnesses, behaviors)
                except BaseException:
                    util.check_for_kb_interrupt()
                    optimizer.tell(hyperparam_pop, fitnesses)

                try:
                    optimizer.disp()
                except BaseException:
                    util.check_for_kb_interrupt()
                    logger.warn('Optimizer has no disp() method')
    else:
        if rerun:
            with model.get_db_provider(config) as db:
                experiment = db.get_experiment(experiment_key)
                new_key = runner_args.experiment if runner_args.experiment \
                    else experiment_key + '_rerun' + str(uuid.uuid4())
                experiment.key = new_key
                for _, art in six.iteritems(experiment.artifacts):
                    art['mutable'] = False

                experiments = [experiment]

        else:
            experiments = [
                create_experiment(filename=exec_filename,
                                  args=other_args,
                                  experiment_name=runner_args.experiment,
                                  project=runner_args.project,
                                  artifacts=artifacts,
                                  resources_needed=resources_needed,
                                  metric=runner_args.metric,
                                  max_duration=runner_args.max_duration,
                                  dependency_policy=StudioDependencyPolicy())
            ]

        queue = model.get_queue(queue_name=runner_args.queue,
                                cloud=runner_args.cloud,
                                config=config,
                                close_after=queueLifetime,
                                logger=logger,
                                verbose=verbose)

        queue_name = submit_experiments(experiments,
                                        config=config,
                                        logger=logger,
                                        queue=queue)

        spin_up_workers(runner_args,
                        config,
                        resources_needed,
                        queue_name=queue_name,
                        verbose=verbose)

    return
Exemplo n.º 20
0
def get_experiment_fitnesses(experiments, optimizer, config, logger):
    """Block until every experiment in this generation reports a fitness.

    Polls the database, scanning each experiment's log tail for lines of
    the form ``fitness: <float>`` and ``behavior: <list-or-array>``.  The
    generation may be cut short once the fraction of evaluated solutions
    reaches ``skip_gen_thres`` and no new result has arrived within
    ``skip_gen_timeout`` seconds.

    :param experiments: experiments submitted for this generation
    :param optimizer: optimizer instance (unused here; kept for interface
        symmetry with callers)
    :param config: runner config; must provide
        ``optimizer.termination_criterion.skip_gen_thres`` /
        ``skip_gen_timeout`` and ``sleep_time``
    :param logger: logger for progress and warnings
    :return: tuple ``(fitnesses, behaviors)`` aligned with *experiments*;
        a ``behaviors`` entry stays ``None`` when none was parsed
    """
    with model.get_db_provider() as db:
        progbar = Progbar(len(experiments), interval=0.0)
        logger.info("Waiting for fitnesses from %s experiments" %
                    len(experiments))

        # Per-experiment record of log line numbers already reported as
        # bad, so each problem is only warned about once.
        bad_line_dicts = [dict() for x in range(len(experiments))]
        has_result = [False for i in range(len(experiments))]
        fitnesses = [0.0 for i in range(len(experiments))]
        behaviors = [None for i in range(len(experiments))]
        term_criterion = config['optimizer']['termination_criterion']
        skip_gen_thres = term_criterion['skip_gen_thres']
        skip_gen_timeout = term_criterion['skip_gen_timeout']
        result_timestamp = time.time()

        while sum(has_result) < len(experiments):
            for i, experiment in enumerate(experiments):
                # Early exit: enough of the generation is evaluated and no
                # fresh result has arrived within the timeout window.
                if float(sum(has_result)) / len(experiments) >= skip_gen_thres\
                        and time.time() - result_timestamp > skip_gen_timeout:
                    logger.warning(
                        "Skipping to next gen with %s of solutions evaled" %
                        (float(sum(has_result)) / len(experiments)))
                    has_result = [True] * len(experiments)
                    break
                if has_result[i]:
                    continue
                returned_experiment = db.get_experiment(experiment.key,
                                                        getinfo=True)
                output = db._get_experiment_logtail(returned_experiment)
                if output is None:
                    continue

                for j, line in enumerate(output):

                    if line.startswith(
                            "Traceback (most recent call last):") and \
                            j not in bad_line_dicts[i]:
                        logger.warning("Experiment %s: error"
                                       " discovered in output" %
                                       returned_experiment.key)
                        logger.warning("".join(output[j:]))
                        bad_line_dicts[i][j] = True

                    if line.startswith("Behavior") or \
                            line.startswith("behavior"):
                        try:
                            # SECURITY: eval() on experiment log output --
                            # logs are assumed trusted here.  Kept as-is
                            # because ast.literal_eval would reject numpy
                            # expressions; flagged for review.
                            behavior = eval(line.rstrip().split(':')[1])
                            if isinstance(behavior, np.ndarray):
                                pass
                            elif isinstance(behavior, list):
                                behavior = np.array(behavior)
                            else:
                                # Was a bare `raise` (no active exception);
                                # raise an explicit error instead -- the
                                # handler below treats it the same way.
                                raise ValueError(
                                    'unsupported behavior type')

                        except BaseException:
                            util.check_for_kb_interrupt()
                            if j not in bad_line_dicts[i]:
                                logger.warning(
                                    'Experiment %s: error parsing or invalid'
                                    ' behavior' % returned_experiment.key)
                                logger.warning(line)
                                bad_line_dicts[i][j] = True
                        else:
                            behaviors[i] = behavior

                    if line.startswith("Fitness") or \
                            line.startswith("fitness"):
                        try:
                            fitness = float(line.rstrip().split(':')[1])
                            # assert fitness >= 0.0
                        except BaseException:
                            util.check_for_kb_interrupt()
                            if j not in bad_line_dicts[i]:
                                logger.warning(
                                    'Experiment %s: error parsing or invalid'
                                    ' fitness' % returned_experiment.key)
                                logger.warning(line)
                                bad_line_dicts[i][j] = True
                        else:
                            # Negative fitnesses are clamped to zero.
                            if fitness < 0.0:
                                logger.warning('Experiment %s: returned'
                                               ' fitness is less than zero,'
                                               ' setting it to zero' %
                                               returned_experiment.key)
                                fitness = 0.0

                            fitnesses[i] = fitness
                            has_result[i] = True
                            progbar.add(1)
                            result_timestamp = time.time()
                            break

            time.sleep(config['sleep_time'])
        return fitnesses, behaviors
Exemplo n.º 21
0
def worker_loop(queue,
                parsed_args,
                single_experiment=False,
                timeout=0,
                verbose=None):
    """Dequeue experiment messages from *queue* and execute them locally.

    Runs until the queue yields no message.  For each message: loads the
    experiment from the DB (with retries), skips and acknowledges it if its
    configured lifetime has expired, and -- if resources can be allocated --
    installs missing pip packages, fetches immutable artifacts, and runs the
    experiment via LocalExecutor while a background job keeps the queue
    message on hold.

    :param queue: work queue providing dequeue/acknowledge/hold
    :param parsed_args: parsed CLI args; its ``.config`` is overwritten per
        message with the config embedded in that message
    :param single_experiment: if True, return after the first experiment
        that actually runs
    :param timeout: dequeue timeout, passed through to the queue
    :param verbose: log verbosity override; if None, taken from the
        message's config
    :return: return code of the last experiment that failed, else 0
    """

    # Always fetch artifacts for now (flag kept for future tuning).
    fetch_artifacts = True

    logger = logs.get_logger('worker_loop')

    # Hold period for queued messages; re-held at half this interval (in
    # minutes) below.  Units of queue.hold() depend on the queue
    # implementation -- presumably minutes as well; verify.
    hold_period = 4
    retval = 0
    while True:
        # Dequeue without acknowledging: the message is only acknowledged
        # after the experiment runs (or is found expired).
        msg = queue.dequeue(acknowledge=False, timeout=timeout)
        if not msg:
            break

        first_exp, ack_key = msg

        data_dict = json.loads(sixdecode(first_exp))
        experiment_key = data_dict['experiment']['key']
        config = data_dict['config']

        # The message carries its own config, replacing whatever the CLI
        # parsed.
        parsed_args.config = config
        if verbose:
            config['verbose'] = verbose
        else:
            verbose = model.parse_verbosity(config.get('verbose', None))

        logger.setLevel(verbose)

        logger.debug('Received message: \n{}'.format(data_dict))

        executor = LocalExecutor(queue, parsed_args)

        with model.get_db_provider(config) as db:
            # experiment = experiment_from_dict(data_dict['experiment'])
            def try_get_experiment():
                experiment = db.get_experiment(experiment_key)
                if experiment is None:
                    raise ValueError(
                        'experiment is not found - indicates storage failure')
                return experiment

            # Storage may lag behind the queue; retry the lookup.
            experiment = retry(try_get_experiment,
                               sleep_time=10,
                               logger=logger)

            # Drop (and acknowledge) experiments whose lifetime expired
            # before a worker picked them up.
            if config.get('experimentLifetime', None) and \
                int(str2duration(config['experimentLifetime'])
                    .total_seconds()) + experiment.time_added < time.time():
                logger.info(
                    'Experiment expired (max lifetime of {0} was exceeded)'.
                    format(config.get('experimentLifetime', None)))
                queue.acknowledge(ack_key)
                continue

            if allocate_resources(experiment, config, verbose=verbose):

                # Keep the un-acknowledged message on hold while the
                # experiment runs so the queue does not redeliver it.
                def hold_job():
                    queue.hold(ack_key, hold_period)

                hold_job()
                sched = BackgroundScheduler()
                sched.add_job(hold_job, 'interval', minutes=hold_period / 2)
                sched.start()

                try:
                    python = 'python'
                    if experiment.pythonver[0] == '3':
                        python = 'python3'
                    # Outside a singularity container, make sure the
                    # experiment's pip environment is satisfied; on a bulk
                    # install failure, fall back to one package at a time.
                    if '_singularity' not in experiment.artifacts.keys():
                        pip_diff = pip_needed_packages(experiment.pythonenv,
                                                       python)
                        if any(pip_diff):
                            logger.info(
                                'Setting up python packages for experiment')
                            if pip_install_packages(pip_diff, python,
                                                    logger) != 0:

                                logger.info(
                                    "Installation of all packages together " +
                                    " failed, "
                                    "trying one package at a time")

                                for pkg in pip_diff:
                                    pip_install_packages([pkg], python, logger)

                    # Fetch immutable artifacts (with retries); mutable ones
                    # are skipped here.  'workspace' is fetched even when a
                    # newer copy is not available (only_newer=False).
                    for tag, item in experiment.artifacts.items():
                        art: Artifact = item
                        if fetch_artifacts or art.local_path is None:
                            get_only_newer: bool = True
                            if tag == 'workspace':
                                get_only_newer = False

                            if not art.is_mutable:
                                logger.info('Fetching artifact ' + tag)
                                art.local_path = retry(lambda: db.get_artifact(
                                    art, only_newer=get_only_newer),
                                                       sleep_time=10,
                                                       logger=logger)
                            else:
                                logger.info('Skipping mutable artifact ' + tag)

                    returncode = executor.run(experiment)
                    if returncode != 0:
                        retval = returncode
                finally:
                    # Always stop the hold job and acknowledge the message,
                    # even if execution raised.
                    sched.shutdown()
                    queue.acknowledge(ack_key)

                if single_experiment:
                    logger.info('single_experiment is True, quitting')
                    return retval
            else:
                logger.info('Cannot run experiment ' + experiment.key +
                            ' due lack of resources. Will retry')
                # Debounce failed requests we cannot service yet
                time.sleep(config.get('sleep_time', 5))

    logger.info("Queue in {0} is empty, quitting".format(
        fs_tracker.get_queue_directory()))

    return retval
def stubtest_worker(
        testclass,
        experiment_name,
        runner_args,
        config_name,
        test_script,
        expected_output,
        script_args=None,
        queue=None,
        wait_for_experiment=True,
        delete_when_done=True,
        test_output=True,
        test_workspace=True):
    """Submit *test_script* via ``studio run`` and verify its results.

    Launches the runner as a subprocess, parses the submitted experiment
    key from its output, optionally waits for the experiment to finish,
    and checks the last line of its 'output' artifact against
    *expected_output* plus (optionally) the workspace contents.

    :param testclass: TestCase instance used for assertions
    :param script_args: extra args appended after *test_script*
        (default: none)
    :param queue: queue to clean before the run (default: a fresh
        ``LocalQueue``)
    :return: the DB provider used, for further inspection by the caller
    :raises Exception: re-raises any failure after printing diagnostics
    """
    # BUG FIX: the defaults were mutable/stateful (`script_args=[]`,
    # `queue=LocalQueue()` built once at import time and shared across
    # calls); create fresh instances per call instead.
    if script_args is None:
        script_args = []
    if queue is None:
        queue = LocalQueue()

    my_path = os.path.dirname(os.path.realpath(__file__))
    config_name = os.path.join(my_path, config_name)
    logger = logs.getLogger('stubtest_worker')
    logger.setLevel(10)

    queue.clean()

    # Best-effort cleanup of a leftover experiment with the same name.
    with model.get_db_provider(model.get_config(config_name)) as db:
        try:
            db.delete_experiment(experiment_name)
        except Exception:
            pass

    os.environ['PYTHONUNBUFFERED'] = 'True'
    p = subprocess.Popen(['studio', 'run'] + runner_args +
                         ['--config=' + config_name,
                          '--verbose=debug',
                          '--force-git',
                          '--experiment=' + experiment_name,
                          test_script] + script_args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT,
                         close_fds=True,
                         cwd=my_path)

    pout, _ = p.communicate()

    # BUG FIX: `experiments` was unbound (NameError) whenever the runner
    # produced no output; initialize it so the failure below is a clear
    # IndexError on an empty submission list instead.
    experiments = []
    if pout:
        logger.debug("studio run output: \n" + sixdecode(pout))
        splitpout = sixdecode(pout).split('\n')
        experiments = [line.split(' ')[-1] for line in splitpout
                       if line.startswith('studio run: submitted experiment')]
        logger.debug("added experiments: {}".format(experiments))

    db = model.get_db_provider(model.get_config(config_name))
    experiment_name = experiments[0]

    try:
        experiment = db.get_experiment(experiment_name)
        if wait_for_experiment:
            while not experiment or not experiment.status == 'finished':
                # Sleep between polls instead of hammering the provider.
                time.sleep(1)
                experiment = db.get_experiment(experiment_name)

        if test_output:
            with open(db.get_artifact(experiment.artifacts['output']),
                      'r') as f:
                data = f.read()
                split_data = data.strip().split('\n')
                print(data)
                # assertEquals is a deprecated alias of assertEqual.
                testclass.assertEqual(split_data[-1], expected_output)

        if test_workspace:
            check_workspace(testclass, db, experiment_name)

        if delete_when_done:
            retry(lambda: db.delete_experiment(experiment_name), sleep_time=10)

        return db

    except Exception as e:
        # Dump everything useful before re-raising so test logs show the
        # runner's own output.
        print("Exception {} raised during test".format(e))
        print("worker output: \n {}".format(pout))
        print("Exception trace:")
        print(traceback.format_exc())
        raise e
Exemplo n.º 23
0
    def run(self, experiment):
        """Execute *experiment* as a child process and track its lifecycle.

        Accepts an Experiment object or an experiment key.  Streams the
        child's stdout to the console and to the 'output' artifact log,
        periodically checkpoints the experiment and saves metrics, and
        kills the child if the experiment is stopped remotely, exceeds
        ``max_duration``, or the task queue goes inactive.

        :param experiment: Experiment instance or experiment key string
        :return: the child process's return code
        :raises ValueError: if *experiment* is neither a string nor an
            Experiment
        """
        if isinstance(experiment, six.string_types):
            experiment = self.db.get_experiment(experiment)
        elif not isinstance(experiment, Experiment):
            raise ValueError("Unknown type of experiment: " +
                             str(type(experiment)))

        self.logger.info("Experiment key: " + experiment.key)

        with model.get_db_provider(self.config) as db:
            db.start_experiment(experiment)

            # Override env variables with those inside the queued message.
            env = dict(os.environ)
            if 'env' in self.config.keys():
                for k, v in six.iteritems(self.config['env']):
                    if v is not None:
                        env[str(k)] = str(v)

            env['PYTHONUNBUFFERED'] = 'TRUE'

            fs_tracker.setup_experiment(env, experiment, clean=False)
            log_path = fs_tracker.get_artifact_cache('output', experiment.key)

            self.logger.debug('Child process environment:')
            self.logger.debug(str(env))

            sched = BackgroundScheduler()
            sched.start()

            with open(log_path, 'w') as output_file:
                python = 'python'
                if experiment.pythonver[0] == '3':
                    python = 'python3'

                python = which(python)

                cmd = [python, experiment.filename] + experiment.args
                cwd = experiment.artifacts['workspace'].local_path
                # NOTE(review): artifacts are accessed as objects above
                # (.local_path) but as dicts below (.get / []) -- confirm
                # which representation '_singularity' entries actually use.
                container_artifact = experiment.artifacts.get('_singularity')
                if container_artifact:
                    container = container_artifact.get('local')
                    if not container:
                        container = container_artifact.get('qualified')

                    cwd = fs_tracker.get_artifact_cache(
                        'workspace', experiment.key)

                    # Symlink immutable artifacts next to the workspace so
                    # the containerized run can see them.
                    for tag, art in six.iteritems(experiment.artifacts):
                        local_path = art.get('local')
                        if not art['mutable'] and os.path.exists(local_path):
                            os.symlink(art['local'],
                                       os.path.join(os.path.dirname(cwd), tag))

                    if experiment.filename is not None:
                        cmd = [
                            'singularity',
                            'exec',
                            container,
                        ] + cmd
                    else:
                        cmd = ['singularity', 'run', container]

                self.logger.info('Running cmd: {0} in {1}'.format(cmd, cwd))

                p = subprocess.Popen(cmd,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.STDOUT,
                                     env=env,
                                     cwd=cwd,
                                     text=True)

                def kill_subprocess():
                    p.kill()

                def get_duration(tag: str):
                    # Config duration string (e.g. '30m') -> whole minutes.
                    value = self.config.get(tag, '0m')
                    return int(str2duration(value).total_seconds() / 60)

                def checkpoint():
                    try:
                        db.checkpoint_experiment(experiment)
                    except BaseException as e:
                        self.logger.info(e)
                        check_for_kb_interrupt()

                minutes = get_duration('saveWorkspaceFrequency')
                sched.add_job(checkpoint, 'interval', minutes=minutes)

                metrics_path = fs_tracker.get_artifact_cache(
                    '_metrics', experiment.key)

                minutes = get_duration('saveMetricsFrequency')
                sched.add_job(lambda: save_metrics(metrics_path),
                              'interval',
                              minutes=minutes)

                def kill_if_stopped():
                    try:
                        db_expr = db.get_experiment(experiment.key,
                                                    getinfo=False)
                    # Narrowed from a bare `except:`; still broad because
                    # any provider failure must not kill the watchdog.
                    except BaseException:
                        check_for_kb_interrupt()
                        db_expr = None

                    # Transient issues with getting experiment data might
                    # result in a None value being returned, as result
                    # leave the experiment running because we wont be able to
                    # do anything else even if this experiment is stopped
                    # in any event if the experiment runs too long then it
                    # will exceed its allocated time and stop
                    if db_expr is not None:
                        if db_expr.status == 'stopped':
                            kill_subprocess()
                            return

                    if experiment.max_duration is not None and \
                            time.time() > experiment.time_started + \
                            int(str2duration(experiment.max_duration)
                                .total_seconds()):

                        kill_subprocess()
                        return

                    # If our tasks queue is signalled inactive
                    # during work process execution, that means we need to drop
                    # current execution and exit
                    if not self.task_queue.is_active():
                        kill_subprocess()

                sched.add_job(kill_if_stopped, 'interval', seconds=10)

                while True:
                    output = p.stdout.readline()
                    if output == '' and p.poll() is not None:
                        break
                    if output:
                        line_out = output.strip()
                        print(line_out)
                        # BUG FIX: strip() removed the trailing newline, so
                        # lines were previously concatenated in the log
                        # file; re-append it so the output artifact stays
                        # line-oriented (stubtest_worker splits it on '\n').
                        output_file.write(line_out + '\n')

                try:
                    p.wait()
                finally:
                    save_metrics(metrics_path)
                    sched.shutdown()
                    db.checkpoint_experiment(experiment)
                    db.finish_experiment(experiment)
                # BUG FIX: `return` previously lived inside the `finally`
                # block, silently swallowing any exception raised by
                # p.wait() or the cleanup calls above.
                return p.returncode
Exemplo n.º 24
0
 def get_db_provider(self):
     """Return a DB provider whose server URL points at the local port."""
     cfg = model.get_config(self.client_config_file)
     cfg['database']['serverUrl'] = 'http://localhost:{}'.format(self.port)
     return model.get_db_provider(cfg)
Exemplo n.º 25
0
 def get_db_provider(self, config_name):
     """Build a DB provider from a config file located next to this module."""
     here = os.path.dirname(os.path.realpath(__file__))
     config_file = os.path.join(here, config_name)
     return model.get_db_provider(model.get_config(config_file))
Exemplo n.º 26
0
def _stop(args, cli_args):
    """Stop every experiment named in *args* via the configured provider."""
    with model.get_db_provider(cli_args.config) as db:
        for experiment_key in args:
            get_logger().info('Stopping experiment ' + experiment_key)
            db.stop_experiment(experiment_key)
Exemplo n.º 27
0
    def studio_run(self, line, cell=None):
        """IPython cell magic: run the cell's code as a studio experiment.

        Serializes the picklable part of the notebook namespace, builds a
        script from the cell text via the run_magic stub, submits it with
        ``runner_main``, blocks until the experiment finishes, and merges
        the resulting namespace back into the notebook.

        :param line: magic-line arguments forwarded to the runner
        :param cell: cell source text to execute remotely
        """
        script_text = []
        pickleable_ns = {}

        # Split the notebook namespace: modules become import statements,
        # everything picklable is shipped as a serialized namespace.
        for varname, var in six.iteritems(self.shell.user_ns):
            if not varname.startswith('__'):
                if isinstance(var, ModuleType) and \
                   var.__name__ != 'studio.magics':
                    script_text.append(
                        'import {} as {}'.format(var.__name__, varname)
                    )

                else:
                    try:
                        pickle.dumps(var)
                        pickleable_ns[varname] = var
                    except BaseException:
                        # Unpicklable values are silently dropped from the
                        # shipped namespace.
                        pass

        script_text.append(cell)
        script_text = '\n'.join(script_text)
        stub_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            'run_magic.py.stub')

        with open(stub_path) as f:
            script_stub = f.read()

        script = script_stub.format(script=script_text)

        experiment_key = str(int(time.time())) + \
            "_jupyter_" + str(uuid.uuid4())

        print('Running studio with experiment key ' + experiment_key)
        config = model.get_config()
        if config['database']['type'] == 'http':
            print("Experiment progress can be viewed/shared at:")
            print("{}/experiment/{}".format(
                config['database']['serverUrl'],
                experiment_key))

        workspace_new = fs_tracker.get_artifact_cache(
            'workspace', experiment_key)

        rsync_cp('.', workspace_new)
        with open(os.path.join(workspace_new, '_script.py'), 'w') as f:
            f.write(script)

        ns_path = fs_tracker.get_artifact_cache('_ns', experiment_key)

        with gzip.open(ns_path, 'wb') as f:
            f.write(pickle.dumps(pickleable_ns))

        if any(line):
            runner_args = line.strip().split(' ')
        else:
            runner_args = []

        runner_args.append('--capture={}:_ns'.format(ns_path))
        runner_args.append('--capture-once=.:workspace')
        runner_args.append('--force-git')
        runner_args.append('--experiment=' + experiment_key)

        # The runner must be invoked from inside the new workspace.
        notebook_cwd = os.getcwd()
        os.chdir(workspace_new)
        print(runner_args + ['_script.py'])
        runner_main(runner_args + ['_script.py'])
        os.chdir(notebook_cwd)

        # Poll until the experiment finishes, then fetch its namespace.
        with model.get_db_provider() as db:
            while True:
                experiment = db.get_experiment(experiment_key)
                if experiment and experiment.status == 'finished':
                    break

                time.sleep(10)

            new_ns_path = db.get_artifact(experiment.artifacts['_ns'])

        # BUG FIX: pickle.loads() requires bytes; the file was previously
        # opened in text mode, which yields str and raises TypeError.
        # NOTE(review): the namespace was written gzip-compressed above --
        # if db.get_artifact does not decompress on fetch, this should be
        # gzip.open(new_ns_path, 'rb'); confirm against the artifact store.
        with open(new_ns_path, 'rb') as f:
            new_ns = pickle.loads(f.read())

        self.shell.user_ns.update(new_ns)