Пример #1
0
    def copy_paths_rel_cached(self, paths_to_dump, dst_dir):
        """Cached (content-addressed) tarball upload -- currently disabled.

        The very first statement raises ``NotImplementedError``, so every
        call fails immediately and none of the code below it is executed.
        The remainder is kept as a sketch of the intended flow:

        1. pack every entry's ``src`` into a randomly named tarball in /tmp,
        2. hash the tarball with ``self._sha256_file``,
        3. upload it to ``self.scratch_space`` under ``<digest>.tar.gz``
           unless a file with that name is already there,
        4. ``cp`` the cached tarball into ``dst_dir`` and unpack it there.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError

        # ------------------------------------------------------------------
        # Unreachable from here on (see docstring).
        # ------------------------------------------------------------------
        local_tar_name = '{id}.tar.gz'.format(id=id_generator(20))
        local_tar_path = os.path.join('/tmp/', local_tar_name)
        print('tmp_path', local_tar_path)
        archive = tarfile.open(local_tar_path, 'w:gz')

        for entry in paths_to_dump:
            dst_rel = entry['dst_rel']
            src = entry['src']
            # Only entries that go straight into dst_dir are supported.
            if dst_rel != '':
                raise NotImplementedError
            print('adding_path', src)
            archive.add(src, arcname=os.path.basename(src))
        archive.close()

        digest = self._sha256_file(local_tar_path)
        print(digest)

        print('dst_dir', dst_dir)

        # WARNING(maciek): this is not concurrent safe
        cached_name = digest + '.tar.gz'
        cached_path = osp.join(self.scratch_space, cached_name)
        if not exists(cached_path):
            self.copy_path(cached_path, local_tar_path)

        copy_cmd = 'cp {path_src} {path_dst}'.format(
            path_src=cached_path,
            path_dst=osp.join(dst_dir, cached_name))
        run(copy_cmd)

        with cd(dst_dir):
            run('ls *')
            run('tar xfz {remote_tar_filename}'.format(
                remote_tar_filename=cached_name))
Пример #2
0
    def create_pod(self, yaml_str):
        """Submit a pod specification with ``kubectl create``.

        The YAML text is written to ``/tmp/<generated id>.yaml`` and that
        file is passed to ``kubectl create -f``.  When ``self.kube_config``
        is not None it is forwarded through ``--kubeconfig``.  The command
        line is printed before it is executed.

        The exit status of ``kubectl`` is not checked, and the spec file is
        left in /tmp afterwards.

        Args:
            yaml_str: text of the pod specification, written verbatim.
        """
        spec_path = '/tmp/{}.yaml'.format(id_generator(10))
        with open(spec_path, 'w') as spec_file:
            spec_file.write(yaml_str)

        kubectl_cmd = ['kubectl', 'create']
        if self.kube_config is not None:
            kubectl_cmd.extend(['--kubeconfig', self.kube_config])
        kubectl_cmd.extend(['-f', spec_path])

        print(' '.join(kubectl_cmd))
        subprocess.call(kubectl_cmd)
Пример #3
0
    def copy_paths_rel(self, paths_to_dump, dst_dir):
        """Pack local paths into one tarball, upload it and unpack it in ``dst_dir``.

        Semantics:

        * Each entry's ``src`` is added to a gzip-compressed tar archive that
          is created under ``/tmp/`` with a randomly generated name.
        * Inside the archive every source is stored under its *basename*
          only, so after extraction all sources sit directly in ``dst_dir``.
          Two sources with the same basename therefore collide.
        * An entry's ``dst_rel`` value is not consulted.
        * The archive is handed to ``self.copy_path(dst_dir, <tarball>)`` and
          then extracted by running ``tar xfz`` with ``dst_dir`` as the
          working directory.  Neither the local nor the uploaded tarball is
          removed afterwards.

        Args:
            paths_to_dump: re-iterable collection (it is walked twice) of
                dicts carrying at least a ``'src'`` key with a local path.
            dst_dir: directory in which the archive is unpacked.
        """
        print('copy_paths_rel')
        for d in paths_to_dump:
            print(d)

        tar_filename = '{id}.tar.gz'.format(id=id_generator(20))
        tar_tmp_path = os.path.join('/tmp/', tar_filename)
        print('tmp_path', tar_tmp_path)

        # The context manager guarantees the archive's file handle is closed
        # even when adding one of the paths fails (e.g. a missing source).
        with tarfile.open(tar_tmp_path, 'w:gz') as tar:
            for d in paths_to_dump:
                local_path = d['src']
                print('adding_path', local_path)
                tar.add(local_path, arcname=os.path.basename(local_path))

        print('dst_dir', dst_dir)
        self.copy_path(dst_dir, tar_tmp_path)
        with cd(dst_dir):
            run('pwd')
            run('ls *')
            run('tar xfz {tar_filename}'.format(tar_filename=tar_filename))
Пример #4
0
    def main(self, argv):
        """Stage an experiment remotely and submit it through srun or sbatch.

        Flow:

        1. Split ``argv`` into mrunner's own options and the remaining
           arguments via ``self.parse_argv()``.
        2. Pick the experiment directory: a timestamped, randomly suffixed
           sub-directory of ``storage_url``, or ``'.'`` when no storage url
           was given.
        3. For the ``neptune`` mode and/or the ``with_yaml`` mode: create the
           ``src`` resource directory, upload the dumped paths and the
           generated ``config.yaml``, and build a ``PlgridTask``.
        4. Hand the task to ``srun`` or ``sbatch``.

        When both modes are enabled both staging steps run, and the task
        built by the ``with_yaml`` step (the later one) is the one submitted.

        Args:
            argv: command-line arguments; stored on ``self.argv``.

        Raises:
            RuntimeError: unless exactly one of srun / sbatch is selected;
                when neither the neptune nor the with_yaml mode is enabled
                (there would be no task to submit); or when the neptune mode
                is used without a config.
        """
        self.argv = argv
        mrunner_args, rest_argv = self.parse_argv()

        if mrunner_args.storage_url is not None:
            # INFO(maciek): random noise added is for purpose!
            exp_dir_path = os.path.join(
                mrunner_args.storage_url,
                datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '_' + id_generator(4))
            print('exp_dir_path', exp_dir_path)
        else:
            print('Warning! no exp_dir_path set')
            exp_dir_path = '.'
        if int(mrunner_args.srun) + int(mrunner_args.sbatch) != 1:
            raise RuntimeError('Please provide exactly one of --srun, --sbatch')
        if not (mrunner_args.neptune or mrunner_args.with_yaml):
            # Without one of the two modes no task is ever built, and the
            # submission step below would fail with an UnboundLocalError.
            # Fail early with an explicit message instead.
            raise RuntimeError('Nothing to run: enable the neptune or the with_yaml mode')

        resource_dir_path = os.path.join(exp_dir_path, 'src')

        paths_to_dump = self._parse_paths_to_dump(resource_dir_path,
                                                  mrunner_args.paths_to_dump_conf,
                                                  mrunner_args.paths_to_dump)
        print(paths_to_dump)

        remote_config_path = os.path.join(resource_dir_path, 'config.yaml')

        if mrunner_args.neptune:
            if mrunner_args.config is None:
                raise RuntimeError('Please supply --config!')
            self.prometheus_api.mkdir(resource_dir_path)
            self.prometheus_api.copy_paths_rel(paths_to_dump, resource_dir_path)
            new_local_config_path = self.mrunner_api.config_to_yaml(mrunner_args.config,
                                                                    mrunner_args.name,
                                                                    mrunner_args.project)

            self.prometheus_api.copy_path(remote_config_path, new_local_config_path)

            paths_to_dump_for_neptune = [os.path.join(p['dst'], os.path.basename(p['src'])) for p in paths_to_dump]
            print(paths_to_dump_for_neptune)
            local_task = self.mrunner_api.create_neptune_run_command(config_path=remote_config_path,
                                                                     paths_to_dump=paths_to_dump_for_neptune,
                                                                     storage_url=mrunner_args.storage_url,
                                                                     tags=mrunner_args.tags,
                                                                     neptune_conf_path=mrunner_args.neptune_conf,
                                                                     rest_argv=rest_argv)
            command_list = local_task.command

            # Every line of the neptune conf file is pushed to the front of
            # the command, so the lines end up in reverse file order and keep
            # their trailing newline.
            if mrunner_args.neptune_conf is not None:
                with open(mrunner_args.neptune_conf) as f:
                    for line in f.readlines():
                        command_list = [line] + command_list

            command = ' '.join(command_list)
            import random
            # Random delay of up to 5 seconds before the actual command;
            # presumably to spread out jobs that start simultaneously.
            sleep_command = "sleep {0:.4f}".format(random.random() * 5)
            command = sleep_command + " ; " + command
            print(command)

            env = local_task.env
            env['EXPERIMENT_ID'] = mrunner_args.experiment_id
            env['STORAGE_URL'] = mrunner_args.storage_url
            env['RESOURCE_DIR_PATH'] = resource_dir_path

            # NOTE: this mode sets PYTHONPATH verbatim, whereas the with_yaml
            # mode below appends ":$PYTHONPATH".
            if mrunner_args.pythonpath:
                env['PYTHONPATH'] = mrunner_args.pythonpath

            # sbatch stdout is discarded in this mode.
            log_path = '/dev/null'
            modules_to_load = []
            if mrunner_args.modules_to_load:
                modules_to_load = mrunner_args.modules_to_load.split(":")
                modules_to_load = [x for x in modules_to_load if x]  # remove empty strings
                print("Modules to load:{}".format(modules_to_load))

            task = PlgridTask(command=command, cwd=resource_dir_path, env=env, venv_path=mrunner_args.venv_path,
                              after_module_load_cmd=mrunner_args.after_module_load_cmd,
                              script_name=mrunner_args.script_name, modules_to_load=modules_to_load)

        if mrunner_args.with_yaml:
            self.prometheus_api.mkdir(resource_dir_path)
            self.prometheus_api.copy_paths_rel(paths_to_dump, resource_dir_path)
            paths_to_dump_for_neptune = [os.path.join(p['dst'], os.path.basename(p['src'])) for p in paths_to_dump]
            new_local_config_path = self.mrunner_api.config_to_yaml(mrunner_args.config,
                                                                    mrunner_args.name,
                                                                    mrunner_args.project)
            self.prometheus_api.copy_path(remote_config_path, new_local_config_path)
            local_task = self.mrunner_api.create_yaml_run_command(config_path=remote_config_path,
                                                                  paths_to_dump=paths_to_dump_for_neptune,
                                                                  storage_url=mrunner_args.storage_url,
                                                                  tags=mrunner_args.tags,
                                                                  exp_dir_path=exp_dir_path,
                                                                  rest_argv=rest_argv)

            command_list = local_task.command
            # Same prepending of the neptune conf lines as in the neptune
            # mode (reverse file order, newlines kept).
            if mrunner_args.neptune_conf is not None:
                with open(mrunner_args.neptune_conf) as f:
                    for line in f.readlines():
                        command_list = [line] + command_list

            command = ' '.join(command_list)
            env = local_task.env
            env['EXPERIMENT_ID'] = mrunner_args.experiment_id
            env['STORAGE_URL'] = mrunner_args.storage_url
            env['RESOURCE_DIR_PATH'] = resource_dir_path

            if mrunner_args.pythonpath:
                env['PYTHONPATH'] = "{}:$PYTHONPATH".format(mrunner_args.pythonpath)

            # sbatch stdout is kept next to the uploaded sources in this mode.
            log_path = os.path.join(resource_dir_path, "job_logs.txt")
            modules_to_load = []
            if mrunner_args.modules_to_load:
                modules_to_load = mrunner_args.modules_to_load.split(":")
                modules_to_load = [x for x in modules_to_load if x]  # remove empty strings
                print("Modules to load:{}".format(modules_to_load))

            task = PlgridTask(command=command, cwd=resource_dir_path, env=env, venv_path=mrunner_args.venv_path,
                              after_module_load_cmd=mrunner_args.after_module_load_cmd,
                              script_name=mrunner_args.script_name, modules_to_load=modules_to_load)

        # Exactly one of srun / sbatch is set (validated above).
        if mrunner_args.srun:
            self.prometheus_api.srun(task, partition=mrunner_args.partition,
                                     cores=mrunner_args.cores, ntasks=mrunner_args.ntasks,
                                     account=mrunner_args.A,
                                     gres=mrunner_args.gres)
        elif mrunner_args.sbatch:
            self.prometheus_api.sbatch(task, partition=mrunner_args.partition,
                                       cores=mrunner_args.cores,
                                       time=mrunner_args.time,
                                       stdout_path=log_path,
                                       ntasks=mrunner_args.ntasks,
                                       account=mrunner_args.A,
                                       gres=mrunner_args.gres)
Пример #5
0
 def _make_script_name(self, script_name):
     """Build a shell-script file name from ``script_name``.

     The result is ``script_name``, an underscore, the value returned by
     ``id_generator(20)`` and the ``.sh`` extension, concatenated.
     """
     prefix = script_name + '_'
     return prefix + id_generator(20) + '.sh'
Пример #6
0
    def main(self, argv):
        """Stage the experiment files and run the command in a Kubernetes pod.

        Steps:

        1. Parse ``argv`` through ``self.parse_argv()``.
        2. Create a timestamped, randomly suffixed directory under
           ``storage_url`` and copy the requested paths into its dump
           sub-directory.
        3. Build either a neptune run command (after placing the generated
           ``config.yaml`` in the dump directory) or a plain run command.
        4. Unless ``dry_run`` is ``True``, start the command in a new pod
           whose working directory is the dump directory.

        Note that a dry run still creates the directory and copies the
        files; only the pod launch is skipped.

        Args:
            argv: command-line arguments; stored on ``self.argv``.
        """
        self.argv = argv
        mrunner_args, rest_argv = self.parse_argv()

        # INFO(maciek): files are copied into this directory first, but a
        # subsequent 'neptune run' creates its own experiment directory, so
        # this one is only a temporary staging area.
        timestamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
        subdir = timestamp + '_' + id_generator(4)
        temp_exp_dir_path = os.path.join(mrunner_args.storage_url, subdir)
        print('temp_exp_dir_path', temp_exp_dir_path)

        dump_dir = os.path.join(temp_exp_dir_path, DUMP_SUBDIRECTORY_NAME)
        paths_to_dump = self._parse_paths_to_dump(dump_dir, None, mrunner_args.paths_to_dump)
        mkdir_p(dump_dir)
        self.copy_paths(paths_to_dump)

        if not mrunner_args.neptune:
            command_with_env = self.mrunner_helper.create_normal_run_command(
                rest_argv=rest_argv,
                exp_dir_path=temp_exp_dir_path)
        else:
            tmp_config_path = self.mrunner_helper.config_to_yaml(
                mrunner_args.neptune_exp_config,
                mrunner_args.name,
                mrunner_args.project)

            config_path = os.path.join(dump_dir, 'config.yaml')
            self.copy_path(config_path, tmp_config_path)
            print('config_path', config_path)
            dumped_destinations = [entry['dst'] for entry in paths_to_dump]

            # Assemble the 'neptune run' command line.
            command_with_env = self.mrunner_helper.create_neptune_run_command(
                config_path=config_path,
                paths_to_dump=dumped_destinations,
                storage_url=mrunner_args.storage_url,
                tags=mrunner_args.tags,
                neptune_host=mrunner_args.neptune_host,
                neptune_port=mrunner_args.neptune_port,
                neptune_username=mrunner_args.neptune_username,
                neptune_password=mrunner_args.neptune_password,
                rest_argv=rest_argv)

        print('command_with_env', command_with_env.command, command_with_env.env)
        pod_name = 'mrunner-pod-{id}'.format(id=id_generator(10))

        # TODO(maciek): temporary!!!  The storage location is hard-coded;
        # the intended replacement mounts mrunner_args.storage_url as both
        # the mount path and the host path.
        storage_path = '/mnt/ml-team/rl/kubernetes_storage'
        volume_mounts = [
            KubeVolumeMount(name='storage',
                            mountPath=storage_path,
                            hostPath={'path': storage_path}),
        ]

        if mrunner_args.dry_run is True:
            print('only dry-run, not executing!!!')
            return

        # INFO(maciek): the meaning of kube's 'args' and 'command' is not
        # obvious; see the table comparing Docker and Kubernetes at
        # https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#define-a-command-and-arguments-when-you-create-a-pod
        #
        # INFO(maciek): command=None together with args means the
        # container's default entrypoint is used.
        pod_env = dict(command_with_env.env)
        if mrunner_args.pythonpath is not None:
            pod_env['PYTHONPATH'] = mrunner_args.pythonpath

        print('Pod env will be {}'.format(pod_env))

        self.kubernetes_backend.run_command_in_pod(
            pod_name=pod_name,
            image=mrunner_args.docker_image,
            nr_gpus=mrunner_args.nr_gpus,
            args=command_with_env.command,
            command=None,
            volume_mounts=volume_mounts,
            interactive=mrunner_args.interactive,
            workingDir=dump_dir,
            node_selector_key=mrunner_args.node_selector_key,
            node_selector_value=mrunner_args.node_selector_value,
            env=pod_env,
        )