Example #1
    def submitTaskWithFiles(self, clientCodeFile, args,
                            files=None, job_id=None):
        # Avoid a mutable default argument; treat a missing mapping as empty.
        files = files if files is not None else {}
        old_cwd = os.getcwd()
        cwd = os.path.dirname(os.path.realpath(__file__))
        os.chdir(cwd)

        if job_id is not None:
            experiment_name = self.project_name + "_" + str(job_id)
        else:
            experiment_name = self.project_name + "_" + str(uuid.uuid4())

        tmpdir = tempfile.gettempdir()
        args_file = os.path.join(tmpdir, experiment_name + "_args.pkl")

        workspace_orig = os.getcwd()
        # An optional .studioml_ignore file acts as an rsync exclude list.
        ignore_arg = ''
        ignore_filepath = os.path.join(workspace_orig, ".studioml_ignore")
        if os.path.exists(ignore_filepath) and \
                not os.path.isdir(ignore_filepath):
            ignore_arg = "--exclude-from=%s" % ignore_filepath

        workspace_new = fs_tracker.get_artifact_cache('workspace',
                                                      experiment_name)
        rsync_cp(workspace_orig, workspace_new, ignore_arg, self.logger)
        distpath = os.path.join(old_cwd, 'dist')
        if os.path.exists(distpath):
            self.logger.info(
                'dist folder found at {}, copying into workspace'
                .format(distpath))
            rsync_cp(distpath, os.path.join(workspace_new, 'dist'))

        self.logger.info('Created workspace ' + workspace_new)

        artifacts = self._create_artifacts(clientCodeFile, args_file,
                                           workspace_new, files)

        # Protocol 2 keeps the pickled args readable by Python 2 workers.
        with open(args_file, 'wb') as f:
            f.write(pickle.dumps(args, protocol=2))

        experiment = create_experiment('completion_service_client.py',
                                       [self.config['verbose']],
                                       experiment_name=experiment_name,
                                       project=self.project_name,
                                       artifacts=artifacts,
                                       resources_needed=self.resources_needed)

        tic = time.time()
        runner.submit_experiments([experiment],
                                  config=self.config,
                                  logger=self.logger,
                                  cloud=self.cloud,
                                  queue_name=self.queue_name)

        self.submitted[experiment.key] = time.time()
        os.chdir(old_cwd)
        toc = time.time()
        self.logger.info('Submitted experiment ' + experiment.key + ' in ' +
                         str(toc - tic) + ' s')

        return experiment_name
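
The example above hands task arguments to the worker through a pickled temp file. A minimal, self-contained sketch of that handoff (names and values here are illustrative, not taken from the snippet):

    import os
    import pickle
    import tempfile
    import uuid

    # Pickle the task arguments with protocol=2, as above, so that both
    # Python 2 and Python 3 workers can read them back.
    experiment_name = 'demo_' + str(uuid.uuid4())
    args_file = os.path.join(tempfile.gettempdir(),
                             experiment_name + '_args.pkl')
    with open(args_file, 'wb') as f:
        f.write(pickle.dumps({'lr': 0.01, 'epochs': 10}, protocol=2))

    # What the worker side would do with the 'args' artifact.
    with open(args_file, 'rb') as f:
        print(pickle.load(f))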
Example #2
    def create_experiments(hyperparam_tuples):
        experiments = []
        # experiment_names = {}
        for hyperparam_tuple in hyperparam_tuples:
            experiment_name = experiment_name_base
            experiment_name += "__opt__%s__%s" % (util.rand_string(32),
                                                  int(time.time()))
            experiment_name = experiment_name.replace('.', '_')

            workspace_new = fs_tracker.get_artifact_cache(
                'workspace', experiment_name)

            current_artifacts = artifacts.copy()
            current_artifacts.update({
                'workspace': {
                    'local': workspace_new,
                    'mutable': True
                }
            })

            rsync_cp(workspace_orig, workspace_new, ignore_arg, logger)
            # shutil.copytree(workspace_orig, workspace_new)

            for param_name, param_value in six.iteritems(hyperparam_tuple):
                if isinstance(param_value, np.ndarray):
                    # Array-valued hyperparameters travel as .npy artifacts
                    # instead of being substituted into the script text.
                    array_filepath = '/tmp/%s.npy' % util.rand_string(32)
                    np.save(array_filepath, param_value)
                    assert param_name not in current_artifacts
                    current_artifacts[param_name] = {'local': array_filepath,
                                                     'mutable': False}
                else:
                    with open(os.path.join(workspace_new, exec_filename),
                              'r') as f:
                        script_text = f.read()

                    # Substitute whole-word uses of the parameter name with
                    # its literal value; the lookahead skips lines where an
                    # '=' follows the name (i.e. assignment sites).
                    script_text = re.sub(
                        r'\b' + param_name + r'\b(?=[^=]*\n)',
                        str(param_value),
                        script_text)

                    with open(os.path.join(workspace_new, exec_filename),
                              'w') as f:
                        f.write(script_text)

            experiments.append(create_experiment(
                filename=exec_filename,
                args=other_args,
                experiment_name=experiment_name,
                project=project,
                artifacts=current_artifacts,
                resources_needed=resources_needed,
                metric=runner_args.metric,
                max_duration=runner_args.max_duration,
                dependency_policy=StudioDependencyPolicy()
            ))
        return experiments
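
The scalar branch above rewrites the training script in place with a word-boundary regex. A runnable sketch of just that substitution, on made-up script text:

    import re

    # Replace whole-word uses of a hyperparameter with its literal value.
    # The lookahead skips lines where an '=' follows the name, so the
    # defining assignment 'lr = 0.01' is left untouched.
    script_text = "lr = 0.01\nloss = train(lr)\n"
    param_name, param_value = 'lr', 0.003

    script_text = re.sub(
        r'\b' + param_name + r'\b(?=[^=]*\n)',
        str(param_value),
        script_text)

    print(script_text)  # 'loss = train(lr)' becomes 'loss = train(0.003)'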
Example #3
    def submitTaskWithFiles(self, clientCodeFile, args, files=None):
        # Avoid a mutable default argument; treat a missing mapping as empty.
        files = files if files is not None else {}
        old_cwd = os.getcwd()
        cwd = os.path.dirname(os.path.realpath(__file__))
        os.chdir(cwd)

        experiment_name = self.project_name + "_" + str(uuid.uuid4())

        tmpdir = tempfile.gettempdir()
        args_file = os.path.join(tmpdir, experiment_name + "_args.pkl")

        workspace_orig = os.getcwd()
        ignore_arg = ''
        ignore_filepath = os.path.join(workspace_orig, ".studioml_ignore")
        if os.path.exists(ignore_filepath) and \
                not os.path.isdir(ignore_filepath):
            ignore_arg = "--exclude-from=%s" % ignore_filepath

        workspace_new = fs_tracker.get_artifact_cache(
            'workspace', experiment_name)
        rsync_cp(workspace_orig, workspace_new, ignore_arg, self.logger)
        distpath = os.path.join(old_cwd, 'dist')
        if os.path.exists(distpath):
            self.logger.info(
                'dist folder found at {}, copying into workspace'
                .format(distpath))
            rsync_cp(distpath, os.path.join(workspace_new, 'dist'))

        self.logger.info('Created workspace ' + workspace_new)

        artifacts = {
            'retval': {
                'mutable': True
            },
            'clientscript': {
                'mutable': False,
                'local': clientCodeFile
            },
            'args': {
                'mutable': False,
                'local': args_file
            },
            'workspace': {
                'mutable': False,
                'local': workspace_new
            }
        }

        # Classify each extra file by scheme: plain URLs are fetched over
        # http(s), s3:// and gs:// locations are passed through as
        # "qualified" storage paths, and anything else is a local path.
        url_schema = re.compile('^https?://')
        s3_schema = re.compile('^s3://')
        gcs_schema = re.compile('^gs://')

        for tag, name in six.iteritems(files):
            artifacts[tag] = {}
            if url_schema.match(name):
                artifacts[tag]['url'] = name
            elif s3_schema.match(name) or gcs_schema.match(name):
                artifacts[tag]['qualified'] = name
            else:
                artifacts[tag]['local'] = os.path.abspath(
                    os.path.expanduser(name))
            artifacts[tag]['mutable'] = False

        with open(args_file, 'wb') as f:
            f.write(pickle.dumps(args, protocol=2))

        experiment = create_experiment(
            'completion_service_client.py',
            [self.config['verbose']],
            experiment_name=experiment_name,
            project=self.project_name,
            artifacts=artifacts,
            resources_needed=self.resources_needed)

        tic = time.time()
        runner.submit_experiments(
            [experiment],
            config=self.config,
            logger=self.logger,
            cloud=self.cloud,
            queue_name=self.queue_name)

        self.submitted.add(experiment.key)
        os.chdir(old_cwd)
        toc = time.time()
        self.logger.info('Submitted experiment ' + experiment.key +
                         ' in ' + str(toc - tic) + ' s')

        return experiment_name
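
The files loop above routes each extra artifact by its scheme. A standalone sketch of the same classification, with hypothetical artifact locations:

    import re

    url_schema = re.compile('^https?://')
    s3_schema = re.compile('^s3://')
    gcs_schema = re.compile('^gs://')

    def classify(name):
        # Mirrors the branches above: http(s) -> 'url',
        # s3:// and gs:// -> 'qualified', everything else -> 'local'.
        if url_schema.match(name):
            return 'url'
        if s3_schema.match(name) or gcs_schema.match(name):
            return 'qualified'
        return 'local'

    for name in ('https://example.com/data.zip',
                 's3://bucket/model.h5',
                 'gs://bucket/vocab.txt',
                 '~/datasets/train.csv'):
        print(name, '->', classify(name))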
Example #4
    def studio_run(self, line, cell=None):
        script_text = []
        pickleable_ns = {}

        for varname, var in six.iteritems(self.shell.user_ns):
            if not varname.startswith('__'):
                if isinstance(var, ModuleType) and \
                   var.__name__ != 'studio.magics':
                    script_text.append(
                        'import {} as {}'.format(var.__name__, varname)
                    )

                else:
                    try:
                        pickle.dumps(var)
                        pickleable_ns[varname] = var
                    except BaseException:
                        pass

        script_text.append(cell)
        script_text = '\n'.join(script_text)
        stub_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            'run_magic.py.stub')

        with open(stub_path) as f:
            script_stub = f.read()

        script = script_stub.format(script=script_text)

        experiment_key = str(int(time.time())) + \
            "_jupyter_" + str(uuid.uuid4())

        print('Running studio with experiment key ' + experiment_key)
        config = model.get_config()
        if config['database']['type'] == 'http':
            print("Experiment progress can be viewed/shared at:")
            print("{}/experiment/{}".format(
                config['database']['serverUrl'],
                experiment_key))

        workspace_new = fs_tracker.get_artifact_cache(
            'workspace', experiment_key)

        rsync_cp('.', workspace_new)
        with open(os.path.join(workspace_new, '_script.py'), 'w') as f:
            f.write(script)

        ns_path = fs_tracker.get_artifact_cache('_ns', experiment_key)

        with gzip.open(ns_path, 'wb') as f:
            f.write(pickle.dumps(pickleable_ns))

        if line.strip():
            runner_args = line.strip().split(' ')
        else:
            runner_args = []

        runner_args.append('--capture={}:_ns'.format(ns_path))
        runner_args.append('--capture-once=.:workspace')
        runner_args.append('--force-git')
        runner_args.append('--experiment=' + experiment_key)

        notebook_cwd = os.getcwd()
        os.chdir(workspace_new)
        print(runner_args + ['_script.py'])
        runner_main(runner_args + ['_script.py'])
        os.chdir(notebook_cwd)

        with model.get_db_provider() as db:
            # Poll until the remote experiment reports 'finished'.
            while True:
                experiment = db.get_experiment(experiment_key)
                if experiment and experiment.status == 'finished':
                    break

                time.sleep(10)

            new_ns_path = db.get_artifact(experiment.artifacts['_ns'])

        with open(new_ns_path, 'rb') as f:
            new_ns = pickle.loads(f.read())

        self.shell.user_ns.update(new_ns)
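
The namespace capture above works by probing every variable for picklability. A self-contained sketch of that filter, using a toy dict in place of the IPython user namespace:

    import os  # stands in for any module imported in the notebook
    import pickle
    from types import ModuleType

    user_ns = {'os': os, 'x': 42, 'f': lambda v: v}  # lambdas don't pickle

    script_lines, pickleable_ns = [], {}
    for varname, var in user_ns.items():
        if isinstance(var, ModuleType):
            # Modules are re-imported by the generated script.
            script_lines.append(
                'import {} as {}'.format(var.__name__, varname))
        else:
            try:
                pickle.dumps(var)
                pickleable_ns[varname] = var
            except BaseException:
                pass  # drop anything that cannot be pickled

    print(script_lines)           # ['import os as os']
    print(sorted(pickleable_ns))  # ['x']; the lambda was dropped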
Example #5
    def run(self, experiment):
        if isinstance(experiment, six.string_types):
            experiment = self.db.get_experiment(experiment)
        elif not isinstance(experiment, Experiment):
            raise ValueError("Unknown type of experiment: " +
                             str(type(experiment)))

        self.logger.info("Experiment key: " + experiment.key)

        with model.get_db_provider(self.config) as db:
            db.start_experiment(experiment)
            """ Override env variables with those inside the queued message
            """
            env = dict(os.environ)
            if 'env' in self.config.keys():
                for k, v in six.iteritems(self.config['env']):
                    if v is not None:
                        env[str(k)] = str(v)

            env['PYTHONUNBUFFERED'] = 'TRUE'

            fs_tracker.setup_experiment(env, experiment, clean=False)
            log_path = fs_tracker.get_artifact_cache('output', experiment.key)

            self.logger.debug('Child process environment:')
            self.logger.debug(str(env))

            sched = BackgroundScheduler()
            sched.start()

            with open(log_path, 'w') as output_file:
                python = 'python'
                if experiment.pythonver[0] == '3':
                    python = 'python3'

                python = which(python)

                cmd = [python, experiment.filename] + experiment.args
                cwd = experiment.artifacts['workspace'].local_path
                container_artifact = experiment.artifacts.get('_singularity')
                if container_artifact:
                    container = container_artifact.get('local')
                    if not container:
                        container = container_artifact.get('qualified')

                    cwd = fs_tracker.get_artifact_cache(
                        'workspace', experiment.key)

                    for tag, art in six.iteritems(experiment.artifacts):
                        local_path = art.get('local')
                        # Expose immutable artifacts to the container via
                        # symlinks next to the workspace directory.
                        if not art['mutable'] and local_path and \
                                os.path.exists(local_path):
                            os.symlink(local_path,
                                       os.path.join(os.path.dirname(cwd), tag))

                    if experiment.filename is not None:
                        cmd = [
                            'singularity',
                            'exec',
                            container,
                        ] + cmd
                    else:
                        cmd = ['singularity', 'run', container]

                self.logger.info('Running cmd: {0} in {1}'.format(cmd, cwd))

                p = subprocess.Popen(cmd,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.STDOUT,
                                     env=env,
                                     cwd=cwd,
                                     text=True)

                def kill_subprocess():
                    p.kill()

                def get_duration(tag: str):
                    value = self.config.get(tag, '0m')
                    return int(str2duration(value).total_seconds() / 60)

                def checkpoint():
                    try:
                        db.checkpoint_experiment(experiment)
                    except BaseException as e:
                        self.logger.info(e)
                        check_for_kb_interrupt()

                minutes = get_duration('saveWorkspaceFrequency')
                sched.add_job(checkpoint, 'interval', minutes=minutes)

                metrics_path = fs_tracker.get_artifact_cache(
                    '_metrics', experiment.key)

                minutes = get_duration('saveMetricsFrequency')
                sched.add_job(lambda: save_metrics(metrics_path),
                              'interval',
                              minutes=minutes)

                def kill_if_stopped():
                    try:
                        db_expr = db.get_experiment(experiment.key,
                                                    getinfo=False)
                    except BaseException:
                        check_for_kb_interrupt()
                        db_expr = None

                    # Transient issues with fetching experiment data can
                    # yield None here. In that case leave the experiment
                    # running: we cannot do anything useful even if it has
                    # been stopped, and if it runs too long it will exceed
                    # its allocated time and be stopped anyway.
                    if db_expr is not None:
                        if db_expr.status == 'stopped':
                            kill_subprocess()
                            return

                    if experiment.max_duration is not None and \
                            time.time() > experiment.time_started + \
                            int(str2duration(experiment.max_duration)
                                .total_seconds()):

                        kill_subprocess()
                        return

                    # If our tasks queue is signalled inactive
                    # during work process execution, that means we need to drop
                    # current execution and exit
                    if not self.task_queue.is_active():
                        kill_subprocess()

                sched.add_job(kill_if_stopped, 'interval', seconds=10)

                while True:
                    output = p.stdout.readline()
                    if output == '' and p.poll() is not None:
                        break
                    if output:
                        line_out = output.strip()
                        print(line_out)
                        output_file.write(line_out + '\n')

                try:
                    p.wait()
                finally:
                    # Always flush metrics and mark the experiment done,
                    # even if the output loop raised.
                    save_metrics(metrics_path)
                    sched.shutdown()
                    db.checkpoint_experiment(experiment)
                    db.finish_experiment(experiment)

                return p.returncode
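
The read-echo-tee loop above is a general pattern for streaming a child process's output. A self-contained sketch with a trivial child command:

    import subprocess
    import sys

    # Launch a child with stderr folded into stdout, then stream its
    # output line by line, echoing and logging each line as it arrives.
    p = subprocess.Popen(
        [sys.executable, '-c', 'print("hello"); print("world")'],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True)

    with open('child.log', 'w') as output_file:
        while True:
            output = p.stdout.readline()
            if output == '' and p.poll() is not None:
                break
            if output:
                line_out = output.strip()
                print(line_out)
                output_file.write(line_out + '\n')

    print('exit code:', p.wait())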