示例#1
0
    def run(self, experiment):
        """Run an experiment in a child python process and wait for it.

        Args:
            experiment: an experiment key (string), which is resolved
                through ``self.db.get_experiment``, or a
                ``model.Experiment`` instance.

        Raises:
            ValueError: if ``experiment`` is neither of the above.

        The child's stdout and stderr are written to the experiment's
        'output' artifact cache path, which is also echoed to the console
        with ``tail -f``. While the child runs, a background scheduler
        checkpoints the experiment every
        ``config['saveWorkspaceFrequencyMinutes']`` minutes and, every 10
        seconds, kills the child if the stored experiment status has
        become 'stopped'.
        """
        if isinstance(experiment, basestring):
            experiment = self.db.get_experiment(experiment)
        elif not isinstance(experiment, model.Experiment):
            raise ValueError("Unknown type of experiment: " +
                             str(type(experiment)))

        self.logger.info("Experiment key: " + experiment.key)

        with model.get_db_provider(self.config) as db:
            db.start_experiment(experiment)

            # Start from the current process environment and override it
            # with the non-None entries of config['env'].
            env = dict(os.environ)
            if 'env' in self.config.keys():
                for k, v in self.config['env'].iteritems():
                    if v is not None:
                        env[str(k)] = str(v)

            fs_tracker.setup_experiment(env, experiment, clean=True)
            log_path = fs_tracker.get_artifact_cache('output', experiment.key)

            self.logger.debug('Child process environment:')
            self.logger.debug(str(env))

            sched = BackgroundScheduler()
            sched.start()
            # The scheduler must be shut down on every exit path
            # (including failures to open the log or spawn a process),
            # otherwise its background thread is leaked.
            try:
                with open(log_path, 'w') as output_file:
                    p = subprocess.Popen(
                        ["python", experiment.filename] + experiment.args,
                        stdout=output_file,
                        stderr=subprocess.STDOUT,
                        env=env,
                        cwd=experiment.artifacts['workspace']['local'])
                    # simple hack to show what's in the log file
                    ptail = subprocess.Popen(["tail", "-f", log_path])

                    sched.add_job(
                        lambda: db.checkpoint_experiment(experiment),
                        'interval',
                        minutes=self.config['saveWorkspaceFrequencyMinutes'])

                    def kill_if_stopped():
                        # Poll the stored status so that an externally
                        # requested stop terminates the child.
                        if db.get_experiment(
                                experiment.key,
                                getinfo=False).status == 'stopped':
                            p.kill()

                    sched.add_job(kill_if_stopped, 'interval', seconds=10)

                    try:
                        p.wait()
                    finally:
                        ptail.kill()
                        db.finish_experiment(experiment)
            finally:
                sched.shutdown()
示例#2
0
    def create_experiments(hyperparam_tuples):
        """Create one experiment per hyperparameter assignment.

        For every mapping in ``hyperparam_tuples`` the original workspace
        is copied (via ``rsync_cp``) into a fresh per-experiment workspace
        and each hyperparameter is applied to it:

        * ``np.ndarray`` values are saved to a ``.npy`` file under /tmp and
          attached as an immutable artifact named after the parameter;
        * any other value is substituted textually into the experiment
          script wherever the parameter name appears as a whole word and
          the rest of its (newline-terminated) line contains no ``=``.

        Returns:
            The list of experiments returned by ``create_experiment``, in
            the order of ``hyperparam_tuples``.
        """
        experiments = []
        for hyperparam_tuple in hyperparam_tuples:
            # Random suffix plus timestamp keeps names unique across runs.
            experiment_name = experiment_name_base
            experiment_name += "__opt__%s__%s" % (rand_string(32),
                                                  int(time.time()))
            experiment_name = experiment_name.replace('.', '_')

            workspace_new = fs_tracker.get_artifact_cache(
                'workspace', experiment_name)

            current_artifacts = artifacts.copy()
            current_artifacts.update({
                'workspace': {
                    'local': workspace_new,
                    'mutable': True
                }
            })

            rsync_cp(workspace_orig, workspace_new, ignore_arg, logger)

            script_path = os.path.join(workspace_new, exec_filename)
            # The script is read lazily (only when a textual substitution
            # is needed) and written back once after all substitutions,
            # instead of being re-read and re-written per parameter.
            script_text = None

            for param_name, param_value in six.iteritems(hyperparam_tuple):
                if isinstance(param_value, np.ndarray):
                    array_filepath = '/tmp/%s.npy' % rand_string(32)
                    np.save(array_filepath, param_value)
                    assert param_name not in current_artifacts
                    current_artifacts[param_name] = {'local': array_filepath,
                                                     'mutable': False}
                    continue

                if script_text is None:
                    with open(script_path, 'r') as f:
                        script_text = f.read()

                script_text = re.sub(
                    '\\b' +
                    param_name +
                    '\\b(?=[^=]*\\n)',
                    str(param_value),
                    script_text)

            if script_text is not None:
                with open(script_path, 'w') as f:
                    f.write(script_text)

            experiments.append(create_experiment(
                filename=exec_filename,
                args=other_args,
                experiment_name=experiment_name,
                project=project,
                artifacts=current_artifacts,
                resources_needed=resources_needed,
                metric=runner_args.metric,
                max_duration=runner_args.max_duration,
            ))
        return experiments
示例#3
0
    def __init__(self,
                 key,
                 filename,
                 args,
                 pythonenv,
                 project=None,
                 artifacts=None,
                 status='waiting',
                 resources_needed=None,
                 time_added=None,
                 time_started=None,
                 time_last_checkpoint=None,
                 time_finished=None,
                 info=None,
                 git=None,
                 metric=None):
        """Initialize an experiment record.

        Args:
            key: experiment identifier; also used to derive the default
                artifact locations through ``fs_tracker``.
            filename: script to execute.
            args: list of arguments for the script; a falsy value is
                stored as an empty list.
            pythonenv: stored as-is on the instance.
            project: stored as-is on the instance.
            artifacts: optional mapping merged over (and overriding) the
                default 'workspace', 'modeldir', 'output' and 'tb'
                artifacts.
            status: initial status, 'waiting' by default.
            resources_needed, time_added, time_started,
            time_last_checkpoint, time_finished, git, metric: stored
                as-is on the instance.
            info: optional info mapping; a fresh empty dict is created
                when omitted, so instances never share one default dict.
        """
        self.key = key
        self.filename = filename
        self.args = args if args else []
        self.pythonenv = pythonenv
        self.project = project

        # Default workspace is the current working directory at
        # construction time.
        workspace_path = os.path.abspath('.')
        model_dir = fs_tracker.get_model_directory(key)
        self.artifacts = {
            'workspace': {
                'local': workspace_path,
                'mutable': True
            },
            'modeldir': {
                'local': model_dir,
                'mutable': True
            },
            'output': {
                'local': fs_tracker.get_artifact_cache('output', key),
                'mutable': True
            },
            'tb': {
                'local': fs_tracker.get_tensorboard_dir(key),
                'mutable': True
            }
        }
        if artifacts is not None:
            self.artifacts.update(artifacts)

        self.resources_needed = resources_needed
        self.status = status
        self.time_added = time_added
        self.time_started = time_started
        self.time_last_checkpoint = time_last_checkpoint
        self.time_finished = time_finished
        # A `{}` default in the signature would be one dict shared by
        # every instance created without `info`.
        self.info = {} if info is None else info
        self.git = git
        self.metric = metric
示例#4
0
    def create_experiments(hyperparam_tuples):
        """Build one experiment per hyperparameter assignment.

        Each experiment gets a name derived from the base name and its
        parameter names/values (dots replaced by underscores; repeated
        names get a ``_v2``, ``_v3``, ... suffix), its own copy of the
        workspace, and a copy of the script in which every whole-word
        occurrence of a parameter name, on a newline-terminated line with
        no ``=`` after it, is replaced by the parameter value.

        Returns the list of experiments from ``model.create_experiment``.
        """
        created = []
        used_names = set()
        for params in hyperparam_tuples:
            name = experiment_name_base + ''.join(
                '__' + key + '__' + str(value)
                for key, value in params.iteritems())
            name = name.replace('.', '_')

            # A name that was already handed out gets the first free
            # versioned suffix, starting at _v2.
            if name in used_names:
                version = 2
                candidate = "%s_v%s" % (name, version)
                while candidate in used_names:
                    version += 1
                    candidate = "%s_v%s" % (name, version)
                name = candidate
            used_names.add(name)

            if 'workspace' in artifacts.keys():
                source_workspace = artifacts['workspace']['local']
            else:
                source_workspace = '.'
            target_workspace = fs_tracker.get_artifact_cache(
                'workspace', name)

            experiment_artifacts = artifacts.copy()
            experiment_artifacts['workspace'] = {
                'local': target_workspace,
                'mutable': True
            }

            shutil.copytree(source_workspace, target_workspace)

            # Patch the copied script in place: read, substitute every
            # parameter, write back.
            script_path = os.path.join(target_workspace, exec_filename)
            with open(script_path, 'r') as script:
                source = script.read()

            for key, value in params.iteritems():
                pattern = '\\b' + key + '\\b(?=[^=]*\\n)'
                source = re.sub(pattern, str(value), source)

            with open(script_path, 'w') as script:
                script.write(source)

            new_experiment = model.create_experiment(
                filename=exec_filename,
                args=other_args,
                experiment_name=name,
                project=project,
                artifacts=experiment_artifacts,
                resources_needed=resources_needed,
                metric=runner_args.metric)
            created.append(new_experiment)
        return created
示例#5
0
    def put_artifact(self,
                     artifact,
                     local_path=None,
                     cache=True,
                     background=False):
        """Tar an artifact's local path (gzip) and upload it to storage.

        Args:
            artifact: mapping with an optional 'key' and, when
                ``local_path`` is not given, a 'local' path.
            local_path: path to upload; defaults to ``artifact['local']``.
            cache: when true and the artifact has a key, the local path
                is first copied (via ``util.rsync_cp``) to the artifact
                cache location for that key.
            background: when true, return without waiting for the upload.

        Returns:
            The storage key, or ``(key, thread)`` when ``background`` is
            true. When the artifact has no key, one is derived from the
            SHA-256 checksum of the tarball. Returns ``None`` (and
            uploads nothing) when ``local_path`` does not exist.

        A ``.studioml_ignore`` file next to the archived content, if
        present, is passed to both rsync and tar as an exclude list.
        """
        if local_path is None:
            local_path = artifact['local']

        key = artifact.get('key')
        if not os.path.exists(local_path):
            self.logger.debug(("Local path {} does not exist. " +
                               "Not uploading anything.").format(local_path))
            return None

        tar_filename = os.path.join(tempfile.gettempdir(),
                                    str(uuid.uuid4()))

        if os.path.isdir(local_path):
            local_basepath = local_path
            local_nameonly = '.'
        else:
            local_nameonly = os.path.basename(local_path)
            # dirname() is '' for a bare file name; tar's -C needs a
            # real directory, so fall back to the current one.
            local_basepath = os.path.dirname(local_path) or '.'

        ignore_arg = ''
        ignore_filepath = os.path.join(local_basepath, ".studioml_ignore")
        if os.path.exists(ignore_filepath) and \
                not os.path.isdir(ignore_filepath):
            ignore_arg = "--exclude-from=%s" % ignore_filepath

        if cache and key:
            cache_dir = fs_tracker.get_artifact_cache(key)
            if cache_dir != local_path:
                debug_str = "Copying local path {} to cache {}" \
                    .format(local_path, cache_dir)
                if ignore_arg != '':
                    debug_str += ", excluding files in {}" \
                        .format(ignore_filepath)
                self.logger.debug(debug_str)

                util.rsync_cp(local_path, cache_dir, ignore_arg,
                              self.logger)

        debug_str = ("Tarring and uploading directrory. " +
                     "tar_filename = {}, " + "local_path = {}, " +
                     "key = {}").format(tar_filename, local_path, key)
        if ignore_arg != '':
            debug_str += ", exclude = {}".format(ignore_filepath)
        self.logger.debug(debug_str)

        # Pass an argument list instead of a shell string so that paths
        # containing spaces or shell metacharacters are handled safely.
        tar_args = ['tar']
        if ignore_arg:
            tar_args.append(ignore_arg)
        tar_args += ['-czf', tar_filename,
                     '-C', local_basepath, local_nameonly]
        self.logger.debug("Tar cmd = {}".format(' '.join(tar_args)))

        tarp = subprocess.Popen(tar_args,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                close_fds=True)

        tarout, _ = tarp.communicate()
        if tarp.returncode != 0:
            # communicate() yields bytes on Python 3; decode before
            # concatenating with the message.
            if not isinstance(tarout, str):
                tarout = tarout.decode('utf-8', 'replace')
            self.logger.info('tar had a non-zero return code!')
            self.logger.info('tar output: \n ' + tarout)

        if key is None:
            key = 'blobstore/' + util.sha256_checksum(tar_filename) \
                  + '.tgz'

        def finish_upload():
            # Always remove the temporary tarball, even if the upload
            # raises.
            try:
                self._upload_file(key, tar_filename)
            finally:
                if os.path.exists(tar_filename):
                    os.remove(tar_filename)

        t = Thread(target=finish_upload)
        t.start()

        if background:
            return (key, t)

        t.join()
        return key
示例#6
0
    def get_artifact(self,
                     artifact,
                     local_path=None,
                     only_newer=True,
                     background=False):
        """Download an artifact tarball from storage and unpack it.

        Args:
            artifact: mapping with a 'key', a 'mutable' flag and an
                optional 'local' path.
            local_path: destination; when omitted, ``artifact['local']``
                is used if it exists, otherwise the artifact cache
                (mutable) or blob cache (immutable) location for the key.
            only_newer: when true and the destination already exists,
                skip the download unless the stored copy is newer than
                the local one (allowing for ``self.timestamp_shift``),
                or if the storage timestamp is unavailable.
            background: when true, return without waiting for the
                download thread.

        Returns:
            ``local_path``, or ``(local_path, thread)`` when
            ``background`` is true and a download was started.
        """
        key = artifact['key']

        if local_path is None:
            if 'local' in artifact.keys() and \
                    os.path.exists(artifact['local']):
                local_path = artifact['local']
            elif artifact['mutable']:
                local_path = fs_tracker.get_artifact_cache(key)
            else:
                local_path = fs_tracker.get_blob_cache(key)

        # Drop one trailing slash so dirname() yields the parent dir.
        local_path = re.sub(r'/\Z', '', local_path)
        local_basepath = os.path.dirname(local_path)

        self.logger.info(
            "Downloading dir {} to local path {} from storage...".format(
                key, local_path))

        if only_newer and os.path.exists(local_path):
            self.logger.debug(
                'Comparing date of the artifact in storage with local')
            storage_time = self._get_file_timestamp(key)
            local_time = os.path.getmtime(local_path)
            if storage_time is None:
                self.logger.info(
                    "Unable to get storage timestamp, storage is either " +
                    "corrupted or has not finished uploading")
                return local_path

            if local_time > storage_time - self.timestamp_shift:
                self.logger.info(
                    "Local path is younger than stored, skipping the download")
                return local_path

        tar_filename = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
        self.logger.debug("tar_filename = {} ".format(tar_filename))

        def finish_download():
            self._download_file(key, tar_filename)
            if not os.path.exists(tar_filename):
                self.logger.warning(
                    'file {} download failed'.format(tar_filename))
                return

            try:
                # first, figure out if the tar file has a base path of .
                # or not
                self.logger.info("Untarring {}".format(tar_filename))
                listtar, _ = subprocess.Popen(
                    ['tar', '-tzf', tar_filename],
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE).communicate()
                # communicate() yields bytes on Python 3.
                if not isinstance(listtar, str):
                    listtar = listtar.decode('utf-8')
                listtar = listtar.strip().split('\n')
                self.logger.info('List of files in the tar: ' + str(listtar))
                if listtar[0].startswith('./'):
                    # Files are archived into tar from .; adjust path
                    # accordingly
                    basepath = local_path
                else:
                    basepath = local_basepath

                tarcmd = ('mkdir -p {} && ' +
                          'tar -xzf {} -C {} --keep-newer-files') \
                    .format(basepath, tar_filename, basepath)
                tarp = subprocess.Popen(['/bin/bash', '-c', tarcmd],
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.STDOUT)

                tarout, tarerr = tarp.communicate()
                if tarp.returncode != 0:
                    if not isinstance(tarout, str):
                        tarout = tarout.decode('utf-8', 'replace')
                    self.logger.info('tar had a non-zero return code!')
                    self.logger.info('tar cmd = ' + tarcmd)
                    self.logger.info('tar output: \n ' + tarout)

                # A single-entry archive is a lone file: move it to the
                # requested destination name.
                if len(listtar) == 1:
                    actual_path = os.path.join(basepath, listtar[0])
                    self.logger.info('Renaming {} into {}'.format(
                        actual_path, local_path))
                    os.rename(actual_path, local_path)
            finally:
                # Remove the temporary tarball even if unpacking failed.
                os.remove(tar_filename)

        t = Thread(target=finish_download)
        t.start()
        if background:
            return (local_path, t)

        t.join()
        return local_path
示例#7
0
    def put_artifact(self,
                     artifact,
                     local_path=None,
                     cache=True,
                     background=False):
        """Tar an artifact's local path (gzip) and upload it to storage.

        Args:
            artifact: mapping with an optional 'key' and, when
                ``local_path`` is not given, a 'local' path.
            local_path: path to upload; defaults to ``artifact['local']``.
            cache: when true and the artifact has a key, the artifact
                cache location for that key is replaced by a copy
                (``cp -pR``) of the local path.
            background: when true, return without waiting for the upload.

        Returns:
            The storage key, or ``(key, thread)`` when ``background`` is
            true. When the artifact has no key, one is derived from the
            SHA-256 checksum of the tarball. Returns ``None`` (and
            uploads nothing) when ``local_path`` does not exist.
        """
        if local_path is None:
            local_path = artifact['local']

        key = artifact.get('key')
        if not os.path.exists(local_path):
            self.logger.debug(("Local path {} does not exist. " +
                               "Not uploading anything.").format(local_path))
            return None

        tar_filename = os.path.join(tempfile.gettempdir(),
                                    str(uuid.uuid4()))

        if os.path.isdir(local_path):
            local_basepath = local_path
            local_nameonly = '.'
        else:
            local_nameonly = os.path.basename(local_path)
            # dirname() is '' for a bare file name; tar's -C needs a
            # real directory, so fall back to the current one.
            local_basepath = os.path.dirname(local_path) or '.'

        if cache and key:
            cache_dir = fs_tracker.get_artifact_cache(key)
            if cache_dir != local_path:
                self.logger.debug(
                    "Copying local path {} to cache {}".format(
                        local_path, cache_dir))

                # Start from an empty cache location so stale files do
                # not survive the copy.
                if os.path.exists(cache_dir) and os.path.isdir(cache_dir):
                    shutil.rmtree(cache_dir)

                pcp = subprocess.Popen(
                    ['cp', '-pR', local_path, cache_dir],
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT)
                cpout, _ = pcp.communicate()
                if pcp.returncode != 0:
                    self.logger.info(
                        'cp returned non-zero exit code. Output:')
                    self.logger.info(cpout)

        self.logger.debug(
            ("Tarring and uploading directrory. " + "tar_filename = {}, " +
             "local_path = {}, " + "key = {}").format(
                 tar_filename, local_path, key))

        # Pass an argument list instead of a shell string so that paths
        # containing spaces or shell metacharacters are handled safely.
        tar_args = ['tar', '-czf', tar_filename,
                    '-C', local_basepath, local_nameonly]

        self.logger.debug("Tar cmd = {}".format(' '.join(tar_args)))

        tarp = subprocess.Popen(tar_args,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)

        tarout, _ = tarp.communicate()
        if tarp.returncode != 0:
            # communicate() yields bytes on Python 3; decode before
            # concatenating with the message.
            if not isinstance(tarout, str):
                tarout = tarout.decode('utf-8', 'replace')
            self.logger.info('tar had a non-zero return code!')
            self.logger.info('tar output: \n ' + tarout)

        if key is None:
            key = 'blobstore/' + util.sha256_checksum(tar_filename) \
                  + '.tgz'

        def finish_upload():
            # Always remove the temporary tarball, even if the upload
            # raises.
            try:
                self._upload_file(key, tar_filename)
            finally:
                if os.path.exists(tar_filename):
                    os.remove(tar_filename)

        t = Thread(target=finish_upload)
        t.start()

        if background:
            return (key, t)

        t.join()
        return key
示例#8
0
    def _tartifact(self, local_path, key, cache=True):
        """Archive ``local_path`` into a temporary tar file.

        A directory is archived as ``.`` relative to itself; a single
        file is archived by name relative to its parent directory. A
        ``.studioml_ignore`` file in that base directory, if present, is
        used as an exclude list. When ``cache`` is true and ``key`` is
        truthy, the local path is also copied (via ``util.rsync_cp``) to
        the artifact cache location for ``key``, unless it already is
        that location.

        Returns the path of the tar file; a tar failure is only logged.
        """
        archive_path = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))

        source_is_dir = os.path.isdir(local_path)
        base_dir = local_path if source_is_dir \
            else os.path.dirname(local_path)
        member = '.' if source_is_dir else os.path.basename(local_path)

        ignore_file = os.path.join(base_dir, ".studioml_ignore")
        use_ignore = os.path.exists(ignore_file) and \
            not os.path.isdir(ignore_file)
        exclude_opt = ''
        if use_ignore:
            exclude_opt = "--exclude-from=%s" % ignore_file
            self.logger.debug('.studioml_ignore found: %s,'
                              ' files listed inside will'
                              ' not be tarred or uploaded' % ignore_file)

        if cache and key:
            cache_location = fs_tracker.get_artifact_cache(key)
            if cache_location != local_path:
                message = "Copying local path {} to cache {}".format(
                    local_path, cache_location)
                if use_ignore:
                    message += ", excluding files in {}".format(ignore_file)
                self.logger.debug(message)

                util.rsync_cp(local_path, cache_location, exclude_opt,
                              self.logger)

        message = ("Tarring artifact. " + "tar_filename = {}, " +
                   "local_path = {}, " + "key = {}").format(
                       archive_path, local_path, key)
        if use_ignore:
            message += ", exclude = {}".format(ignore_file)
        self.logger.debug(message)

        command = 'tar {} {} -cf {} -C {} {}'.format(
            exclude_opt,
            compression_to_taropt(self.compression),
            archive_path,
            base_dir,
            member)
        self.logger.debug("Tar cmd = {}".format(command))

        # Time the whole tar invocation, including process start-up.
        started = time.time()
        tar_process = subprocess.Popen(['/bin/bash', '-c', command],
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT,
                                       close_fds=True)
        output, _ = tar_process.communicate()
        finished = time.time()

        if tar_process.returncode != 0:
            self.logger.info('tar had a non-zero return code!')
            self.logger.info('tar output: \n ' + sixdecode(output))

        self.logger.info('tar finished in {}s'.format(finished - started))
        return archive_path
示例#9
0
    def get_artifact(self,
                     artifact,
                     local_path=None,
                     only_newer=True,
                     background=False):
        """Fetch an artifact to the local filesystem.

        Artifacts without a 'key' must be immutable and carry a 'url' or
        a 'qualified' location; they are downloaded into the blob cache
        under the SHA-256 of that location (or, for ``dockerhub://`` and
        ``shub://`` qualified locations, not downloaded at all).

        Artifacts with a 'key' are downloaded from storage as a tarball
        and unpacked.

        Args:
            artifact: artifact mapping ('key', 'bucket', 'mutable',
                'local', 'url', 'qualified' entries are read).
            local_path: destination for keyed artifacts; when omitted,
                ``artifact['local']`` is used if it exists, otherwise the
                artifact cache (mutable) or blob cache (immutable)
                location for the key.
            only_newer: when true and the destination exists, skip the
                download unless storage holds a newer copy (allowing for
                ``self.timestamp_shift``).
            background: when true, run the download in a thread and
                return immediately.

        Returns:
            The local path (or the remote location for dockerhub/shub
            artifacts), or ``(local_path, thread)`` when ``background``
            is true and a storage download was started.
        """
        key = artifact.get('key')
        bucket = artifact.get('bucket')

        if key is None:
            assert not artifact['mutable']
            assert artifact.get('url') is not None or \
                artifact.get('qualified') is not None

            remote_path = artifact.get('url')
            if remote_path is None:
                remote_path = artifact.get('qualified')

            # External artifacts are cached under a hash of their
            # location.
            key = hashlib.sha256(remote_path.encode()).hexdigest()
            local_path = fs_tracker.get_blob_cache(key)
            if os.path.exists(local_path):
                self.logger.info(
                    ('Immutable artifact exists at local_path {},' +
                     ' skipping the download').format(local_path))
                return local_path

            if artifact.get('url') is not None:
                download_file(remote_path, local_path, self.logger)
            else:
                if remote_path.startswith('dockerhub://') or \
                   remote_path.startswith('shub://'):
                    # The message used to be logged with a literal '{}'
                    # because it was never formatted.
                    self.logger.info(
                        ('Qualified {} points to a shub or dockerhub,' +
                         ' skipping the download').format(remote_path))
                    return remote_path

                download_file_from_qualified(remote_path, local_path,
                                             self.logger)

            self.logger.debug(
                'Downloaded file {} from external source {}'.format(
                    local_path, remote_path))
            return local_path

        if local_path is None:
            if 'local' in artifact.keys() and \
                    os.path.exists(artifact['local']):
                local_path = artifact['local']
            else:
                if artifact['mutable']:
                    local_path = fs_tracker.get_artifact_cache(key)
                else:
                    local_path = fs_tracker.get_blob_cache(key)
                    if os.path.exists(local_path):
                        self.logger.info(
                            ('Immutable artifact exists at local_path {},' +
                             ' skipping the download').format(local_path))
                        return local_path

        # Drop one trailing slash so dirname() yields the parent dir.
        local_path = re.sub(r'/\Z', '', local_path)
        local_basepath = os.path.dirname(local_path)

        self.logger.info(
            "Downloading dir {} to local path {} from storage...".format(
                key, local_path))

        if only_newer and os.path.exists(local_path):
            self.logger.debug(
                'Comparing date of the artifact in storage with local')
            storage_time = self._get_file_timestamp(key)
            local_time = os.path.getmtime(local_path)
            if storage_time is None:
                self.logger.info(
                    "Unable to get storage timestamp, storage is either " +
                    "corrupted or has not finished uploading")
                return local_path

            if local_time > storage_time - self.timestamp_shift:
                self.logger.info(
                    "Local path is younger than stored, skipping the download")
                return local_path

        tar_filename = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
        self.logger.debug("tar_filename = {} ".format(tar_filename))

        def finish_download():
            # NOTE(review): BaseException also swallows KeyboardInterrupt
            # and SystemExit here; kept as-is because a failed download
            # is detected below by the missing tar file.
            try:
                self._download_file(key, tar_filename)
            except BaseException as e:
                self.logger.debug(e)

            if os.path.exists(tar_filename):
                # first, figure out if the tar file has a base path of .
                # or not
                self.logger.info("Untarring {}".format(tar_filename))
                listtar, _ = subprocess.Popen(['tar', '-tf', tar_filename],
                                              stdout=subprocess.PIPE,
                                              stderr=subprocess.PIPE,
                                              close_fds=True).communicate()
                listtar = listtar.strip().split(b'\n')
                listtar = [s.decode('utf-8') for s in listtar]

                self.logger.info('List of files in the tar: ' + str(listtar))
                if listtar[0].startswith('./'):
                    # Files are archived into tar from .; adjust path
                    # accordingly
                    basepath = local_path
                else:
                    basepath = local_basepath

                tarcmd = ('mkdir -p {} && ' +
                          'tar -xf {} -C {} --keep-newer-files') \
                    .format(basepath, tar_filename, basepath)

                self.logger.debug('Tar cmd = {}'.format(tarcmd))

                tarp = subprocess.Popen(['/bin/bash', '-c', tarcmd],
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.STDOUT,
                                        close_fds=True)

                tarout, tarerr = tarp.communicate()
                if tarp.returncode != 0:
                    self.logger.info('tar had a non-zero return code!')
                    self.logger.info('tar cmd = ' + tarcmd)
                    self.logger.info('tar output: \n ' + str(tarout))

                # A single-entry archive is a lone file: move it to the
                # requested destination name.
                if len(listtar) == 1:
                    actual_path = os.path.join(basepath, listtar[0])
                    self.logger.info('Renaming {} into {}'.format(
                        actual_path, local_path))
                    retry(lambda: os.rename(actual_path, local_path),
                          no_retries=5,
                          sleep_time=1,
                          exception_class=OSError,
                          logger=self.logger)

                os.remove(tar_filename)
            else:
                self.logger.warning(
                    'file {} download failed'.format(tar_filename))

        if background:
            t = Thread(target=finish_download)
            t.start()
            return (local_path, t)
        else:
            finish_download()
            return local_path
示例#10
0
    def __init__(self,
                 key,
                 filename,
                 args,
                 pythonenv,
                 project=None,
                 artifacts=None,
                 status='waiting',
                 resources_needed=None,
                 time_added=None,
                 time_started=None,
                 time_last_checkpoint=None,
                 time_finished=None,
                 info=None,
                 git=None,
                 metric=None,
                 pythonver=None,
                 max_duration=None):
        """Initialize an experiment record.

        Args:
            key: experiment identifier; also used to derive the default
                artifact locations through ``fs_tracker``.
            filename: script to execute. A name containing ``::`` is
                treated as a module path: ``filename`` becomes ``-m`` and
                the dotted module name (``::`` replaced by ``.``, one
                leading dot stripped) becomes the first argument.
            args: optional list of arguments, appended after the module
                name (if any). Every argument is passed through
                ``shquote``.
            pythonenv: stored as-is on the instance.
            project: stored as-is on the instance.
            artifacts: optional mapping merged over (and overriding) the
                default 'workspace', 'modeldir', 'output', 'tb' and
                '_metrics' artifacts.
            status: initial status, 'waiting' by default.
            resources_needed, time_added, time_started,
            time_last_checkpoint, time_finished, git, metric,
            max_duration: stored as-is on the instance.
            info: optional info mapping; a fresh empty dict is created
                when omitted, so instances never share one default dict.
            pythonver: python major version; a falsy value means the
                major version of the running interpreter.
        """
        self.key = key
        self.args = []
        self.filename = filename

        if filename and '::' in filename:
            self.filename = '-m'
            module_name = filename.replace('::', '.')
            if module_name.startswith('.'):
                module_name = module_name[1:]

            self.args.append(module_name)

        if args:
            self.args += args

        self.args = [shquote(a) for a in self.args]

        self.pythonenv = pythonenv
        self.project = project
        self.pythonver = pythonver if pythonver else sys.version_info[0]

        # Default workspace is the current working directory at
        # construction time.
        workspace_path = os.path.abspath('.')
        # Best effort: an experiment can exist without a resolvable
        # model directory.
        try:
            model_dir = fs_tracker.get_model_directory(key)
        except BaseException:
            model_dir = None

        self.artifacts = {
            'workspace': {
                'local': workspace_path,
                'mutable': False,
                'unpack': True
            },
            'modeldir': {
                'local': model_dir,
                'mutable': True,
                'unpack': True
            },
            'output': {
                'local': fs_tracker.get_artifact_cache('output', key),
                'mutable': True,
                'unpack': True
            },
            'tb': {
                'local': fs_tracker.get_tensorboard_dir(key),
                'mutable': True,
                'unpack': True
            },
            '_metrics': {
                'local': fs_tracker.get_artifact_cache('_metrics', key),
                'mutable': True,
                'unpack': True
            }
        }
        if artifacts is not None:
            self.artifacts.update(artifacts)

        self.resources_needed = resources_needed
        self.status = status
        self.time_added = time_added
        self.time_started = time_started
        self.time_last_checkpoint = time_last_checkpoint
        self.time_finished = time_finished
        # A `{}` default in the signature would be one dict shared by
        # every instance created without `info`.
        self.info = {} if info is None else info
        self.git = git
        self.metric = metric
        self.max_duration = max_duration
示例#11
0
    def run(self, experiment):
        """Run an experiment as a local child process and wait for it.

        The experiment script is launched with ``python``/``python3`` (or
        through ``singularity`` when the experiment has a ``_singularity``
        artifact), with stdout and stderr redirected to the experiment's
        ``output`` artifact. While the child runs, the log is echoed to this
        process' stdout and background jobs periodically checkpoint the
        experiment, call ``save_metrics`` and kill the child when the
        experiment status becomes ``'stopped'`` or ``max_duration`` elapses.

        Args:
            experiment: an ``Experiment`` instance, or a string key that is
                resolved through ``self.db.get_experiment``.

        Returns:
            The child's return code (``p.returncode``).

        Raises:
            ValueError: if ``experiment`` is neither a string nor an
                ``Experiment``.
        """
        if isinstance(experiment, six.string_types):
            experiment = self.db.get_experiment(experiment)
        elif not isinstance(experiment, Experiment):
            raise ValueError("Unknown type of experiment: " +
                             str(type(experiment)))

        self.logger.info("Experiment key: " + experiment.key)

        with model.get_db_provider(self.config) as db:
            db.start_experiment(experiment)

            # Child environment: a copy of ours, overridden by the non-None
            # entries of self.config['env'].
            env = dict(os.environ)
            if 'env' in self.config.keys():
                for k, v in six.iteritems(self.config['env']):
                    if v is not None:
                        env[str(k)] = str(v)

            # Unbuffered child output so the log file can be tailed live.
            env['PYTHONUNBUFFERED'] = 'TRUE'

            fs_tracker.setup_experiment(env, experiment, clean=False)
            log_path = fs_tracker.get_artifact_cache('output', experiment.key)

            self.logger.debug('Child process environment:')
            self.logger.debug(str(env))

            sched = BackgroundScheduler()
            sched.start()

            def interval_minutes(config_key):
                # Whole minutes for a duration-string setting; 0 when unset.
                # NOTE(review): sub-minute durations truncate to 0 -- confirm
                # how the scheduler treats a zero-length interval.
                value = self.config.get(config_key)
                if not value:
                    return 0
                return int(str2duration(value).total_seconds() / 60)

            with open(log_path, 'w') as output_file:
                python = 'python'
                if experiment.pythonver == 3:
                    python = 'python3'

                cmd = [python, experiment.filename] + experiment.args
                cwd = experiment.artifacts['workspace']['local']
                container_artifact = experiment.artifacts.get('_singularity')
                if container_artifact:
                    # Prefer the local copy of the container image, falling
                    # back to its qualified location.
                    container = container_artifact.get('local')
                    if not container:
                        container = container_artifact.get('qualified')

                    cwd = fs_tracker.get_artifact_cache(
                        'workspace', experiment.key)

                    # Link immutable artifacts next to the workspace.
                    link_dir = os.path.dirname(cwd)
                    for tag, art in six.iteritems(experiment.artifacts):
                        local_path = art.get('local')
                        # Artifacts without a local path cannot be linked
                        # (and os.path.exists(None) would raise).
                        if art['mutable'] or not local_path or \
                                not os.path.exists(local_path):
                            continue

                        link_path = os.path.join(link_dir, tag)
                        # The experiment directory is not cleaned
                        # (clean=False above), so the entry may already exist
                        # from an earlier run; os.symlink would raise on it.
                        # lexists also detects dangling links.
                        if not os.path.lexists(link_path):
                            os.symlink(local_path, link_path)

                    if experiment.filename is not None:
                        cmd = [
                            'singularity',
                            'exec',
                            container,
                        ] + cmd
                    else:
                        cmd = ['singularity', 'run', container]

                self.logger.info('Running cmd: \n {} '.format(cmd))

                p = subprocess.Popen(cmd,
                                     stdout=output_file,
                                     stderr=subprocess.STDOUT,
                                     env=env,
                                     cwd=cwd)

                # Echo the child's log to our stdout from a helper thread.
                logtail = Pygtail(log_path)
                stop_tail = threading.Event()

                def tail_func():
                    while not stop_tail.is_set():
                        for line in logtail:
                            print(line)

                        # Sleeps up to 0.1s but wakes immediately on stop.
                        stop_tail.wait(0.1)

                    # Final drain: pick up whatever the child wrote between
                    # the last poll and its exit.
                    for line in logtail:
                        print(line)

                tail_thread = threading.Thread(target=tail_func)

                def checkpoint():
                    # Best effort: a failed periodic checkpoint must not
                    # disturb the run, but it should be visible in the log.
                    try:
                        db.checkpoint_experiment(experiment)
                    except Exception:
                        self.logger.exception(
                            'Periodic checkpoint of experiment failed')

                sched.add_job(
                    checkpoint,
                    'interval',
                    minutes=interval_minutes('saveWorkspaceFrequency'))

                metrics_path = fs_tracker.get_artifact_cache(
                    '_metrics', experiment.key)

                sched.add_job(
                    lambda: save_metrics(metrics_path),
                    'interval',
                    minutes=interval_minutes('saveMetricsFrequency'))

                def kill_if_stopped():
                    stopped = db.get_experiment(
                        experiment.key, getinfo=False).status == 'stopped'

                    timed_out = experiment.max_duration is not None and \
                        time.time() > experiment.time_started + \
                        int(str2duration(experiment.max_duration)
                            .total_seconds())

                    if stopped or timed_out:
                        p.kill()

                sched.add_job(kill_if_stopped, 'interval', seconds=10)

                # Start tailing only once setup can no longer fail, so an
                # error above cannot leave a non-daemon thread running.
                tail_thread.start()

                try:
                    p.wait()
                finally:
                    # Stop the tail thread first so it cannot outlive a
                    # failing cleanup step below.
                    stop_tail.set()
                    tail_thread.join()
                    save_metrics(metrics_path)
                    sched.shutdown()
                    db.checkpoint_experiment(experiment)
                    db.finish_experiment(experiment)
                    # NOTE(review): returning from ``finally`` discards any
                    # exception raised by p.wait() (e.g. KeyboardInterrupt).
                    # Kept as-is because callers may rely on always getting a
                    # return code -- confirm before changing.
                    return p.returncode