Example #1
def create_run_script(
        main_filepath,
        argname_lst,
        argvalue_lst,
        script_filepath,
        # entry_folderpath=None,
        output_filepath=None,
        profile_filepath=None):

    sc_lines = ['#!/bin/bash', 'set -e']
    # # change into the entry folder if provided.
    # if entry_folderpath is not None:
    #     sc_lines += ['cd %s' % entry_folderpath]
    # call the main function.
    sc_lines += generate_call_lines(
        **tb_ut.subset_dict_via_selection(locals(), [
            'main_filepath', 'argname_lst', 'argvalue_lst', 'output_filepath',
            'profile_filepath'
        ]))
    # change back to the previous folder if we changed into another one.
    # if entry_folderpath is not None:
    #     sc_lines += ['cd -']
    tb_io.write_textfile(script_filepath, sc_lines, with_newline=True)
    # add execute permissions to the script.
    st = os.stat(script_filepath)
    exec_bits = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
    os.chmod(script_filepath, st.st_mode | exec_bits)
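The examples in this section all route their arguments through tb_ut.subset_dict_via_selection(locals(), [...]), which forwards a named subset of the current local variables as keyword arguments. The helper itself is not shown here; a minimal sketch of what it presumably does, offered as an assumption rather than the library's actual code:

def subset_dict_via_selection(d, keys):
    # assumption: simply keep the selected keys from the source dict.
    return {k: d[k] for k in keys}

Under this reading, generate_call_lines(**tb_ut.subset_dict_via_selection(locals(), [...])) above is just a compact way of writing generate_call_lines(main_filepath=main_filepath, argname_lst=argname_lst, ...) explicitly.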
Example #2
    def register(self,
                 bash_command,
                 num_cpus=1,
                 num_gpus=0,
                 mem_budget=8.0,
                 time_budget=60.0,
                 mem_units='gigabytes',
                 time_units='minutes',
                 folderpath=None,
                 wait_for_output=True,
                 require_gpu_types=None,
                 require_nodes=None,
                 run_on_head_node=False):

        # NOTE: running on the head node is not implemented for now.
        assert not run_on_head_node
        # require_gpu_types and require_nodes should not both be specified.
        assert require_gpu_types is None or require_nodes is None

        self.jobs.append(
            tb_ut.subset_dict_via_selection(locals(), [
                'bash_command', 'num_cpus', 'num_gpus', 'mem_budget',
                'time_budget', 'mem_units', 'time_units', 'folderpath',
                'wait_for_output', 'require_gpu_types', 'require_nodes',
                'run_on_head_node'
            ]))
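A hypothetical call, assuming register belongs to a job-queue class that is not shown in this example; the dict appended to self.jobs simply mirrors the argument names:

# hypothetical usage; the enclosing scheduler object is not part of this example.
# queue.register('python train.py --lr 0.1', num_cpus=4, num_gpus=1,
#                mem_budget=16.0, time_budget=120.0)
# appends a dict of the form:
# {'bash_command': 'python train.py --lr 0.1', 'num_cpus': 4, 'num_gpus': 1,
#  'mem_budget': 16.0, 'time_budget': 120.0, 'mem_units': 'gigabytes',
#  'time_units': 'minutes', 'folderpath': None, 'wait_for_output': True,
#  'require_gpu_types': None, 'require_nodes': None, 'run_on_head_node': False}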
Example #3
def list_folders(folderpath,
                 ignore_hidden_folders=True,
                 recursive=False,
                 use_relative_paths=False):

    kwargs = tb_ut.subset_dict_via_selection(
        locals(), ['recursive', 'ignore_hidden_folders', 'use_relative_paths'])
    return list_paths(folderpath, ignore_files=True, **kwargs)
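With the defaults, the kwargs forwarding above means a call like the following is equivalent to the expanded list_paths call (the folder path is made up for illustration):

# list_folders('some/dir') is roughly:
# list_paths('some/dir', ignore_files=True, recursive=False,
#            ignore_hidden_folders=True, use_relative_paths=False)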
Example #4
def run_on_matrix(bash_command,
                  servername,
                  username,
                  password=None,
                  num_cpus=1,
                  num_gpus=0,
                  mem_budget=8.0,
                  time_budget=60.0,
                  mem_units='gigabytes',
                  time_units='minutes',
                  folderpath=None,
                  wait_for_output=True,
                  require_gpu_type=None,
                  run_on_head_node=False,
                  jobname=None):

    assert (not run_on_head_node) or num_gpus == 0
    assert require_gpu_type is None  ### NOT IMPLEMENTED YET.

    # prompt for the password if it has not been provided.
    if password is None:
        password = getpass.getpass()

    script_cmd = "\n".join(['#!/bin/bash', bash_command])
    script_name = "run_%s.sh" % uuid.uuid4()

    # either submit the job via srun/sbatch, or run it directly on the head node.
    if not run_on_head_node:
        cmd_parts = [
            'srun' if wait_for_output else 'sbatch',
            '--cpus-per-task=%d' % num_cpus,
            '--gres=gpu:%d' % num_gpus,
            '--mem=%d' % tb_rs.convert_between_byte_units(
                mem_budget, src_units=mem_units, dst_units='megabytes'),
            '--time=%d' % tb_lg.convert_between_time_units(
                time_budget, time_units, dst_units='minutes')
        ]
        if jobname is not None:
            cmd_parts += ['--job-name=%s' % jobname]
        cmd_parts += [script_name]

        run_script_cmd = ' '.join(cmd_parts)
    else:
        run_script_cmd = './' + script_name

    # actual command to run remotely
    remote_cmd = " && ".join([
        "echo \'%s\' > %s" % (script_cmd, script_name),
        "chmod +x %s" % script_name, run_script_cmd,
        "rm %s" % script_name
    ])

    return run_on_server(
        remote_cmd,
        **tb_ut.subset_dict_via_selection(locals(), [
            'servername', 'username', 'password', 'folderpath',
            'wait_for_output'
        ]))
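A worked example of the command string assembled when not running on the head node; the server, user, and job values are made up, and the memory figure assumes convert_between_byte_units converts gigabytes to megabytes with a factor of 1024:

# run_on_matrix('python train.py', 'matrix.example.org', 'user',
#               num_cpus=4, num_gpus=1, mem_budget=16.0, time_budget=120.0,
#               jobname='train')
# produces, roughly:
#   srun --cpus-per-task=4 --gres=gpu:1 --mem=16384 --time=120 --job-name=train run_<uuid>.sh
# with wait_for_output=False the launcher becomes sbatch instead of srun.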
Example #5
def copy_folder(src_folderpath,
                dst_folderpath,
                ignore_hidden_files=False,
                ignore_hidden_folders=False,
                ignore_file_exts=None,
                abort_if_dst_exists=True,
                create_parent_folders=False):
    assert folder_exists(src_folderpath)
    assert src_folderpath != dst_folderpath
    assert not (abort_if_dst_exists and folder_exists(dst_folderpath))

    if (not abort_if_dst_exists) and folder_exists(dst_folderpath):
        delete_folder(dst_folderpath, abort_if_nonempty=False)

    pref_dst_fo = path_prefix(dst_folderpath)
    assert create_parent_folders or folder_exists(pref_dst_fo)
    create_folder(dst_folderpath, create_parent_folders=create_parent_folders)

    # create all folders in the destination.
    fos = list_folders(src_folderpath,
                       use_relative_paths=True,
                       recursive=True,
                       ignore_hidden_folders=ignore_hidden_folders)

    for fo in fos:
        fo_path = join_paths([dst_folderpath, fo])
        create_folder(fo_path, create_parent_folders=True)

    # copy all files to the destination.
    kwargs = tb_ut.subset_dict_via_selection(
        locals(),
        ['ignore_hidden_folders', 'ignore_hidden_files', 'ignore_file_exts'])
    fis = list_files(src_folderpath,
                     use_relative_paths=True,
                     recursive=True,
                     **kwargs)
    for fi in fis:
        src_fip = join_paths([src_folderpath, fi])
        dst_fip = join_paths([dst_folderpath, fi])
        copy_file(src_fip, dst_fip)
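A hypothetical call in the same spirit as the experiment code further below: copy a source tree while skipping hidden files, hidden folders, and compiled Python files (the paths are illustrative only):

# copy_folder('project/src', 'experiments/exp0/src',
#             ignore_hidden_files=True,
#             ignore_hidden_folders=True,
#             ignore_file_exts=['.pyc'],
#             create_parent_folders=True)
# the destination folder tree is recreated first, then the remaining files are copied.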
Example #6
def run_on_lithium_node(bash_command,
                        node,
                        servername,
                        username,
                        password=None,
                        visible_gpu_ids=None,
                        folderpath=None,
                        wait_for_output=True,
                        run_on_head_node=False):
    # check that node exists.
    assert node in tb_ut.flatten(get_lithium_nodes())

    # prompt for the password if it was not provided (lithium requires one).
    if password is None:
        password = getpass.getpass()

    # if no visible GPUs are specified, default to an empty list.
    if visible_gpu_ids is None:
        visible_gpu_ids = []

    # creating the command to run remotely.
    gpu_cmd = 'export CUDA_VISIBLE_DEVICES=%s' % ",".join(
        map(str, visible_gpu_ids))
    if not run_on_head_node:
        cmd = "ssh -T %s \'%s && %s\'" % (node, gpu_cmd, bash_command)
    else:
        # NOTE: perhaps the repetition here could be reduced. also, the head
        # node probably does not have GPUs.
        cmd = "%s && %s" % (gpu_cmd, bash_command)

    return run_on_server(
        cmd,
        **tb_ut.subset_dict_via_selection(locals(), [
            'servername', 'username', 'password', 'folderpath',
            'wait_for_output'
        ]))
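For illustration, the remote command the function composes for a made-up node, server, and two visible GPUs:

# run_on_lithium_node('python train.py', 'gpu-node-1', 'lithium.example.org',
#                     'user', visible_gpu_ids=[0, 1])
# builds the string:
#   ssh -T gpu-node-1 'export CUDA_VISIBLE_DEVICES=0,1 && python train.py'
# which run_on_server then executes on the server.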
Example #7
def create_experiment_folder(
        main_filepath,
        argname_lst,
        argval_lst_lst,
        output_folderpath_argname,
        all_experiments_folderpath,
        readme,
        experiment_name=None,
        # entry_folderpath=None,
        code_folderpath=None,
        # data_folderpath=None,
        capture_output=False,
        profile_run=False):

    assert tb_fs.folder_exists(all_experiments_folderpath)
    assert experiment_name is None or (not tb_fs.path_exists(
        tb_fs.join_paths([all_experiments_folderpath, experiment_name])))
    # assert folder_exists(project_folderpath) and file_exists(tb_fs.join_paths([
    #     project_folderpath, main_relfilepath]))

    # create the main folder where the experiment artifacts will be stored.
    if experiment_name is None:
        experiment_name = get_available_filename(all_experiments_folderpath,
                                                 "exp")
    experiment_folderpath = tb_fs.join_paths(
        [all_experiments_folderpath, experiment_name])
    tb_fs.create_folder(experiment_folderpath)

    # copy the code to the experiment folder.
    if code_folderpath is not None:
        code_foldername = tb_fs.path_last_element(code_folderpath)
        dst_code_fo = tb_fs.join_paths(
            [experiment_folderpath, code_foldername])

        tb_fs.copy_folder(code_folderpath,
                          dst_code_fo,
                          ignore_hidden_files=True,
                          ignore_hidden_folders=True,
                          ignore_file_exts=['.pyc'])

        # change main_filepath to use that new code.
        main_filepath = tb_fs.join_paths(
            [experiment_folderpath, main_filepath])

    # NOTE: no data copying for now because it often does not make much sense.
    data_folderpath = None  ### TODO: remove later.
    # # copy the data to the experiment folder.
    # if data_folderpath is not None:
    #     data_foldername = path_last_element(data_folderpath)
    #     dst_data_fo = join_paths([experiment_folderpath, data_foldername])

    #     copy_folder(data_folderpath, dst_data_fo,
    #         ignore_hidden_files=True, ignore_hidden_folders=True)

    # write the config for the experiment.
    tb_io.write_jsonfile(
        tb_ut.subset_dict_via_selection(locals(), [
            'main_filepath', 'argname_lst', 'argval_lst_lst',
            'output_folderpath_argname', 'all_experiments_folderpath',
            'readme', 'experiment_name', 'code_folderpath', 'data_folderpath',
            'capture_output', 'profile_run'
        ]), tb_fs.join_paths([experiment_folderpath, 'config.json']))

    # generate the executables for each configuration.
    argname_lst = list(argname_lst)
    argname_lst.append(output_folderpath_argname)
    for (i, vs) in enumerate(argval_lst_lst):
        cfg_folderpath = tb_fs.join_paths([experiment_folderpath, "cfg%d" % i])
        tb_fs.create_folder(cfg_folderpath)

        # create the run script for this configuration.
        argvalue_lst = list(vs)
        argvalue_lst.append(cfg_folderpath)
        call_args = tb_ut.subset_dict_via_selection(
            locals(), ['argname_lst', 'argvalue_lst', 'main_filepath'])

        call_args['script_filepath'] = tb_fs.join_paths(
            [cfg_folderpath, 'run.sh'])
        if capture_output:
            call_args['output_filepath'] = tb_fs.join_paths(
                [cfg_folderpath, 'output.txt'])
        if profile_run:
            call_args['profile_filepath'] = tb_fs.join_paths(
                [cfg_folderpath, 'profile.txt'])
        create_run_script(**call_args)

        # write a config file for each configuration
        tb_io.write_jsonfile(tb_ut.create_dict(argname_lst, argvalue_lst),
                             tb_fs.join_paths([cfg_folderpath, 'config.json']))
    # create_runall_script(experiment_folderpath)
    create_runall_script_with_parallelization(experiment_folderpath)

    return experiment_folderpath
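Sketch of the folder layout this produces for a hypothetical experiment with two configurations and both capture_output and profile_run enabled (the 'exp0' name assumes get_available_filename chose it; the layout follows the code above):

exp0/
    config.json        # experiment-level config written by write_jsonfile
    cfg0/
        run.sh         # generated by create_run_script
        config.json    # per-configuration argument name/value pairs
        output.txt     # only when capture_output=True (written when run.sh executes)
        profile.txt    # only when profile_run=True (written when run.sh executes)
    cfg1/
        ...

plus whatever create_runall_script_with_parallelization writes at the top level, and a copy of the code folder when code_folderpath is given.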
Example #8
    def run(self, run_only_if_enough_resources_for_all=True):
        args = tb_ut.subset_dict_via_selection(
            vars(self), ['servername', 'username', 'password'])
        args['abort_if_any_node_unavailable'] = False

        # get the resource availability and filter out unavailable nodes.
        d = get_lithium_resource_availability(**args)
        d = {k: v for (k, v) in d.items() if v is not None}

        g = get_lithium_nodes()

        # assignments to each of the registered jobs
        run_cfgs = []
        for x in self.jobs:
            if x['require_nodes'] is not None:
                req_nodes = x['require_nodes']
            else:
                req_nodes = d.keys()

            # restrict to nodes with the required GPU types.
            if x['require_gpu_types'] is not None:
                req_gpu_nodes = tb_ut.flatten(
                    tb_ut.subset_dict_via_selection(g, x['require_gpu_types']))
            else:
                # NOTE: only consider the nodes that are available anyway.
                req_gpu_nodes = d.keys()

            # potentially available nodes to place this job.
            nodes = list(set(req_nodes).intersection(req_gpu_nodes))
            assert len(nodes) > 0

            # greedily assign the job to a node.
            assigned = False
            for n in nodes:
                r = d[n]
                # if there are enough resources on the node, assign it to the
                # job.
                if ((r['cpus_free'] >= x['num_cpus'])
                        and (r['gpus_free'] >= x['num_gpus']) and
                    (r['mem_mbs_free'] >= tb_rs.convert_between_byte_units(
                        x['mem_budget'],
                        src_units=x['mem_units'],
                        dst_units='megabytes'))):

                    # record information about where to run the job.
                    run_cfgs.append({
                        'node': n,
                        'visible_gpu_ids': r['free_gpu_ids'][:x['num_gpus']]
                    })

                    # deduct the allocated resources from the available resources
                    # for that node.
                    r['cpus_free'] -= x['num_cpus']
                    r['gpus_free'] -= x['num_gpus']
                    r['mem_mbs_free'] -= tb_rs.convert_between_byte_units(
                        x['mem_budget'],
                        src_units=x['mem_units'],
                        dst_units='megabytes')
                    r['free_gpu_ids'] = r['free_gpu_ids'][x['num_gpus']:]
                    assigned = True
                    break

            # if the job could not be assigned, record it and possibly abort.
            if not assigned:
                run_cfgs.append(None)
                if run_only_if_enough_resources_for_all:
                    print("Insufficient resources to satisfy"
                          " (cpus=%d, gpus=%d, mem=%0.3f%s)" %
                          (x['num_cpus'], x['num_gpus'], x['mem_budget'],
                           x['mem_units']))
                    return None

        # running the jobs that have a valid config.
        remaining_jobs = []
        outs = []
        for x, c in zip(self.jobs, run_cfgs):
            if c is None:
                remaining_jobs.append(x)
            else:
                out = run_on_lithium_node(**tb_ut.merge_dicts([
                    tb_ut.subset_dict_via_selection(vars(
                        self), ['servername', 'username', 'password']),
                    tb_ut.subset_dict_via_selection(x, [
                        'bash_command', 'folderpath', 'wait_for_output',
                        'run_on_head_node'
                    ]),
                    tb_ut.subset_dict_via_selection(
                        c, ['node', 'visible_gpu_ids'])
                ]))
                outs.append(out)

        self.jobs = remaining_jobs
        return outs
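Usage sketch, under the assumption that register (Example #2) and run are methods of the same scheduler class and that its constructor stores the server credentials; none of that class is shown here, and the class name below is hypothetical:

# s = LithiumScheduler(servername='lithium.example.org', username='user', password=None)  # hypothetical class
# s.register('python train.py --seed 0', num_cpus=2, num_gpus=1)
# s.register('python train.py --seed 1', num_cpus=2, num_gpus=1)
# outs = s.run(run_only_if_enough_resources_for_all=True)
# jobs that could not be placed greedily remain in s.jobs for a later run() call.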