Exemplo n.º 1
0
 def _get_memo_paths(self, foldername):
     """Return the (config json path, memo value folder path) pair for *foldername*."""
     base = self.folderpath
     cfg_filepath = tb_fs.join_paths(
         [base, "memo_config-%s.json" % foldername])
     memo_folderpath = tb_fs.join_paths(
         [base, "memo_value-%s" % foldername])
     return (cfg_filepath, memo_folderpath)
Exemplo n.º 2
0
 def _get_file_paths(self, filename):
     """Return the (config json path, pickled value path) pair for *filename*."""
     base = self.folderpath
     cfg_filepath = tb_fs.join_paths(
         [base, "file_config-%s.json" % filename])
     value_filepath = tb_fs.join_paths(
         [base, "file_value-%s.pkl" % filename])
     return (cfg_filepath, value_filepath)
Exemplo n.º 3
0
    def _fn(e_folderpath):
        """Read (config, results) for one experiment folder.

        Returns the parsed config.json plus the parsed results file, or
        None for the results when the file does not exist.
        """
        cfg = tb_io.read_jsonfile(
            tb_fs.join_paths([e_folderpath, 'config.json']))

        # `use_checkpoints` is a closure variable from the enclosing scope:
        # it selects checkpoint.json over results.json.
        res_name = 'checkpoint.json' if use_checkpoints else 'results.json'
        res_fpath = tb_fs.join_paths([e_folderpath, res_name])

        res = tb_io.read_jsonfile(res_fpath) if tb_fs.file_exists(
            res_fpath) else None
        return (cfg, res)
Exemplo n.º 4
0
def write_server_run_script():
    """Write an executable SLURM run script for the 'bridges' server.

    NOTE(review): relies on module-level globals (servertype, jobtype,
    remote_folderpath, main_relfilepath, local_folderpath) being set
    before the call — confirm against the module this was taken from.
    """
    assert servertype == 'bridges'
    # NOTE: to edit according to the configuration needed.
    jobname = jobtype
    time_budget_in_hours = 48  # max 48 hours
    mem_budget_in_gb = 16
    partition_name = 'GPU-shared'
    num_cpus = 1  # probably ask a CPU for each GPU (or more if you have data loaders)
    num_gpus = 1  # up to 4 if k80, up to 2 if p100
    gpu_type = 'k80'  # in ['k80', 'p100']

    # SBATCH wants memory in MB and time in minutes.
    mem_in_mb = tb_rs.convert_between_byte_units(
        mem_budget_in_gb, src_units='gigabytes', dst_units='megabytes')
    time_in_minutes = tb_lg.convert_between_time_units(
        time_budget_in_hours, src_units='hours', dst_units='minutes')

    script_header = [
        '#!/bin/bash',
        '#SBATCH --nodes=1',
        '#SBATCH --partition=%s' % partition_name,
        '#SBATCH --cpus-per-task=%d' % num_cpus,
        '#SBATCH --gres=gpu:%s:%d' % (gpu_type, num_gpus),
        '#SBATCH --mem=%dM' % mem_in_mb,
        '#SBATCH --time=%d' % time_in_minutes,
        '#SBATCH --job-name=%s' % jobname,
    ]
    # NOTE: changes to the environment can be put in the run script.
    script_body = [
        'module load tensorflow/1.5_gpu',
        'PYTHONPATH=%s:$PYTHONPATH' % remote_folderpath,
        'python -u %s > log_%s.txt' % (main_relfilepath, jobname)
    ]

    # header, blank separator line, body — then mark the script executable.
    script_filepath = tb_fs.join_paths([local_folderpath, "run.sh"])
    tb_io.write_textfile(script_filepath, script_header + [''] + script_body)
    subprocess.check_output(['chmod', '+x', script_filepath])
Exemplo n.º 5
0
 def _get_unique_filename(self):
     """Draw random UUIDs until one has no config-<uuid>.json file on disk."""
     while True:
         candidate = uuid.uuid4()
         cfg_path = tb_fs.join_paths(
             [self.folderpath, "config-%s.json" % candidate])
         if not tb_fs.file_exists(cfg_path):
             return candidate
Exemplo n.º 6
0
 def _get_unique_name(self, prefix):
     """Draw random UUIDs until '<prefix>-<uuid>.json' is free on disk."""
     while True:
         candidate = uuid.uuid4()
         candidate_path = tb_fs.join_paths(
             [self.folderpath, "%s-%s.json" % (prefix, candidate)])
         if not tb_fs.file_exists(candidate_path):
             return candidate
Exemplo n.º 7
0
def create_project_folder(folderpath, project_name, initialize_git_repo=False):
    """Create a standard research-project skeleton under folderpath/project_name.

    Creates the typical folder layout (code package, analyses, data,
    experiments, notes, temp), empty starter code and note files, readme
    placeholders, and a .gitignore; optionally initializes a git
    repository with an initial commit.
    """
    fn = lambda xs: tb_fs.join_paths([folderpath, project_name] + xs)

    tb_fs.create_folder(fn([]))
    # typical directories
    for foldername in [
            project_name, "analyses", "data", "experiments", "notes", "temp"
    ]:
        tb_fs.create_folder(fn([foldername]))

    # code files (in order): data, preprocessing, model definition, model training,
    # model evaluation, main to generate the results with different relevant
    # parameters, setting up different experiments, analyze the results and
    # generate plots and tables.
    for code_filename in [
            "__init__.py", "data.py", "preprocess.py", "model.py", "train.py",
            "evaluate.py", "main.py", "experiment.py", "analyze.py"
    ]:
        tb_fs.create_file(fn([project_name, code_filename]))

    # add an empty script that can be used to download data.
    tb_fs.create_file(fn(["data", "download_data.py"]))

    # common notes to keep around.
    for notes_filename in ["journal.txt", "reading_list.txt", "todos.txt"]:
        tb_fs.create_file(fn(["notes", notes_filename]))

    # placeholders
    tb_io.write_textfile(fn(["experiments", "readme.txt"]),
                         ["All experiments will be placed under this folder."])

    tb_io.write_textfile(fn(["temp", "readme.txt"]), [
        "Here lie temporary files that are relevant or useful for the project "
        "but that are not kept under version control."
    ])

    tb_io.write_textfile(fn(["analyses", "readme.txt"]), [
        "Here lie files containing information extracted from the "
        "results of the experiments. Tables and plots are typical examples."
    ])

    # typical git ignore file.
    tb_io.write_textfile(
        fn([".gitignore"]),
        ["data", "experiments", "temp", "*.pyc", "*.pdf", "*.aux"])

    if initialize_git_repo:
        subprocess.call("cd %s && git init && git add -f .gitignore * && "
                        "git commit -a -m \"Initial commit for %s.\" && cd -" %
                        (fn([]), project_name),
                        shell=True)
Exemplo n.º 8
0
def get_available_filename(folderpath, filename_prefix):
    """Return the first name '<prefix><idx>' (idx = 0, 1, ...) not yet on disk."""
    idx = 0
    while True:
        candidate = "%s%d" % (filename_prefix, idx)
        if not tb_fs.path_exists(tb_fs.join_paths([folderpath, candidate])):
            return candidate
        idx += 1
def create_runall_script(experiment_folderpath):
    """Write an executable run.sh that invokes each cfg<i>/run.sh in order.

    Counts the cfg* folders under experiment_folderpath and emits one
    script line per configuration's run script, then sets the executable
    bits on the written file.
    """
    fo_names = tb_fs.list_folders(
        experiment_folderpath, recursive=False, use_relative_paths=True)
    num_exps = len(
        [n for n in fo_names if tb_fs.path_last_element(n).startswith('cfg')])

    # creating the script.
    sc_lines = ['#!/bin/bash']
    # FIX: `range` instead of Python-2-only `xrange` (NameError on Python 3;
    # equivalent here since the sequence is only iterated once).
    sc_lines += [
        tb_fs.join_paths([experiment_folderpath,
                          "cfg%d" % i, 'run.sh']) for i in range(num_exps)
    ]

    # creating the run all script.
    out_filepath = tb_fs.join_paths([experiment_folderpath, 'run.sh'])
    tb_io.write_textfile(out_filepath, sc_lines, with_newline=True)
    # mark run.sh executable for user, group, and others.
    st = os.stat(out_filepath)
    exec_bits = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
    os.chmod(out_filepath, st.st_mode | exec_bits)
Exemplo n.º 10
0
def create_runall_script_with_parallelization(experiment_folderpath):
    """Write an executable run.sh running the cfg<i> experiments, sharded across workers.

    The generated bash script accepts optional [worker_id num_workers] and
    [--force-rerun] arguments; worker w runs the configurations i with
    i % num_workers == w, skipping any cfg<i> that already has a
    results.json unless --force-rerun is passed.
    """
    fo_names = tb_fs.list_folders(experiment_folderpath,
                                  recursive=False,
                                  use_relative_paths=True)
    # number of cfg* folders == number of configurations to run.
    num_exps = len(
        [n for n in fo_names if tb_fs.path_last_element(n).startswith('cfg')])

    results_filepath = tb_fs.join_paths(
        [experiment_folderpath, "cfg$i", 'results.json'])
    run_filepath = tb_fs.join_paths(
        [experiment_folderpath, "cfg$i", 'run.sh'])

    # creating the script (one generated bash line per list element).
    sc_lines = [
        '#!/bin/bash',
        # FIX: was `-lt 0 ] && [ "$#" -gt 3 ]`, which is always false
        # ($# cannot be both negative and greater than 3), so the usage
        # guard never fired; `||` correctly rejects calls with > 3 args.
        'if [ "$#" -lt 0 ] || [ "$#" -gt 3 ]; then',
        '    echo "Usage: run.sh [worker_id num_workers] [--force-rerun]"',
        '    exit 1',
        'fi',
        'force_rerun=0',
        'if [ $# -eq 0 ] || [ $# -eq 1 ]; then',
        '    worker_id=0',
        '    num_workers=1',
        '    if [ $# -eq 1 ]; then',
        '        if [ "$1" != "--force-rerun" ]; then',
        '            echo "Usage: run.sh [worker_id num_workers] [--force-rerun]"',
        '            exit 1',
        '        else',
        '            force_rerun=1',
        '        fi',
        '    fi',
        'else',
        '    worker_id=$1',
        '    num_workers=$2',
        '    if [ $# -eq 3 ]; then',
        '        if [ "$3" != "--force-rerun" ]; then',
        '            echo "Usage: run.sh [worker_id num_workers] [--force-rerun]"',
        '            exit 1',
        '        else',
        '            force_rerun=1',
        '        fi',
        '    fi',
        'fi',
        'if [ $num_workers -le $worker_id ] || [ $worker_id -lt 0 ]; then',
        '    echo "Invalid call: requires 0 <= worker_id < num_workers."',
        '    exit 1',
        # FIX: a missing comma after 'fi' made it concatenate with the
        # adjacent '' literal, silently dropping this blank separator line.
        'fi',
        '',
        'num_exps=%d' % num_exps,
        'i=0',
        'while [ $i -lt $num_exps ]; do',
        '    if [ $(($i % $num_workers)) -eq $worker_id ]; then',
        '        if [ ! -f %s ] || [ $force_rerun -eq 1 ]; then' %
        results_filepath,
        '            echo cfg$i',
        '            %s' % run_filepath,
        '        fi',
        '    fi',
        '    i=$(($i + 1))',
        'done',
    ]
    # creating the run all script.
    out_filepath = tb_fs.join_paths([experiment_folderpath, 'run.sh'])
    tb_io.write_textfile(out_filepath, sc_lines, with_newline=True)
    # mark run.sh executable for user, group, and others.
    st = os.stat(out_filepath)
    exec_bits = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
    os.chmod(out_filepath, st.st_mode | exec_bits)
Exemplo n.º 11
0
def download_file(urlpath,
                  folderpath,
                  filename=None,
                  abort_if_file_exists=True):
    """Download *urlpath* into *folderpath*.

    Args:
        urlpath: URL to fetch.
        folderpath: existing destination folder.
        filename: basename to save under; defaults to the last URL segment.
        abort_if_file_exists: when True, fail if the destination file
            already exists (otherwise it is overwritten).
    """
    if filename is None:
        filename = urlpath.split('/')[-1]
    filepath = tb_fs.join_paths([folderpath, filename])
    assert tb_fs.folder_exists(folderpath)
    # FIX: the original `(not file_exists) or abort_if_file_exists` was
    # inverted — with the default abort_if_file_exists=True it always passed
    # (never aborted), and with False it aborted on an existing file.
    # Abort exactly when the file exists and aborting was requested.
    assert not (abort_if_file_exists and tb_fs.file_exists(filepath))
    # NOTE(review): urllib.URLopener is Python 2 only (and deprecated there);
    # on Python 3 use urllib.request.urlretrieve instead.
    f = urllib.URLopener()
    f.retrieve(urlpath, filepath)
Exemplo n.º 12
0
def create_experimsent_from_fn(fn, overwrite=False):
    """Create an experiment folder from the list of config dicts returned by fn().

    The experiment is named after fn and placed under 'experiments'.
    NOTE(review): the function name contains a typo ("experimsent") — kept
    as-is for backward compatibility with existing callers.
    """
    experiment_name = fn.__name__
    experiment_folderpath = tb_fs.join_paths(['experiments', experiment_name])
    # optionally wipe a previous run of the same experiment.
    if overwrite and tb_fs.folder_exists(experiment_folderpath):
        tb_fs.delete_folder(experiment_folderpath, False)

    cfgs = fn()
    # column-major view of the configs: one list of argument names, plus
    # one row of values per config (all configs share the first one's keys).
    argnames = list(cfgs[0].keys())
    argvals = [[cfg[name] for name in argnames] for cfg in cfgs]
    tb_ex.create_experiment_folder('beam_learn/main.py', argnames,
                                   argvals, 'out_folder', 'experiments',
                                   '', experiment_name, 'beam_learn', True)
Exemplo n.º 13
0
    def _fn(cfg_path):
        """Read each JSON file named in json_filename_lst from cfg_path.

        `json_filename_lst` and `abort_if_notexists` are closure variables
        from the enclosing scope. When abort_if_notexists is falsy, a
        missing file yields None; otherwise the read itself will fail.
        """
        ds = []
        for json_name in json_filename_lst:
            fpath = tb_fs.join_paths([cfg_path, json_name])
            # equivalent to: None when (not abort) and (not exists).
            if abort_if_notexists or tb_fs.file_exists(fpath):
                ds.append(tb_io.read_jsonfile(fpath))
            else:
                ds.append(None)
        return ds
Exemplo n.º 14
0
def map_experiment_folder(experiment_folderpath, fn):
    """Apply fn to each cfg<i> subfolder path; return (paths, results).

    Counts the cfg* folders under experiment_folderpath and calls fn on
    'cfg0' .. 'cfg<num_exps-1>' in order.
    """
    fo_paths = tb_fs.list_folders(
        experiment_folderpath, recursive=False, use_relative_paths=False)
    num_exps = len(
        [p for p in fo_paths if tb_fs.path_last_element(p).startswith('cfg')])

    ps = []
    rs = []
    # FIX: `range` instead of Python-2-only `xrange` (NameError on Python 3;
    # equivalent here since the sequence is only iterated once).
    for i in range(num_exps):
        p = tb_fs.join_paths([experiment_folderpath, 'cfg%d' % i])
        rs.append(fn(p))
        ps.append(p)
    return (ps, rs)
Exemplo n.º 15
0
def create_table_from_experiment(experiment_name,
                                 rows,
                                 columns,
                                 values,
                                 abort_if_incomplete_configs=True,
                                 use_checkpoints=False,
                                 single_row_multitable=False,
                                 print_to_terminal=True,
                                 max_column_width=10**9,
                                 abort_if_different_keys=True):
    """Build pivot tables over an experiment's configs/results and save them.

    Reads 'experiments/<experiment_name>', summarizes each completed
    configuration, pivots the summaries on (rows, columns, values), then
    writes 'analyses/<experiment_name>/results.txt' and 'results.csv'
    (optionally printing the tables to the terminal).

    NOTE(review): this code relies on Python 2 behavior (indexing the
    result of `map(...)`, str output from subprocess) — confirm before
    running on Python 3.
    """
    _, xs = explore_experiment('experiments/%s' % experiment_name,
                               use_checkpoints)

    # keep only configs that have results; a missing result is an error
    # unless abort_if_incomplete_configs is False.
    cfgs = []
    res = []
    for (c, r) in xs:
        if r is not None:
            cfgs.append(c)
            res.append(r)
        else:
            assert not abort_if_incomplete_configs
    xs = tb_ut.zip_toggle([cfgs, res])

    # c keeps only the config entries that are constant across all configs.
    ks = keys_with_variation(cfgs)
    c = dict(cfgs[0])
    for k in ks:
        c.pop(k)

    # NOTE(review): presumably ks is dict-like here (list.pop would need an
    # integer index) — confirm against keys_with_variation.
    ks.pop('out_folder')
    print("***%s***" % experiment_name)
    pprint(ks)
    print()

    ds = [summarize_results(tb_ut.merge_dicts(x)) for x in xs]

    # if any precision/recall/F1 value was requested, recompute those
    # metrics from the prediction files via the conlleval script and add
    # them to each summary record.
    if any([
            v in values for v in [
                'dev_precision', 'dev_recall', 'dev_fb1', 'test_precision',
                'test_recall', 'test_fb1'
            ]
    ]):

        def _extract_fn(fpath):
            # run the CoNLL-2000 evaluation script on the prediction file.
            out = subprocess.check_output(
                ["cat %s | data/conll_2000/conlleval.txt" % fpath], shell=True)

            # the second output line holds the overall metrics.
            res_line = out.split('\n')[1]
            f1 = float(res_line.split(';')[-1].split(": ")[1])  # NOTE(review): unused

            # NOTE(review): indexing map(...) is Python 2 only (list result).
            p, r, fb1 = map(lambda x: 0.01 * float(x.split(': ')[1]),
                            res_line.split('%; '))[1:]

            return p, r, fb1

        # add the test and dev performances to the file.
        for d in ds:
            (d['dev_precision'], d['dev_recall'], d['dev_fb1']) = _extract_fn(
                tb_fs.join_paths([d['out_folder'], 'pred_dev.txt']))

            (d['test_precision'], d['test_recall'],
             d['test_fb1']) = _extract_fn(
                 tb_fs.join_paths([d['out_folder'], 'pred_test.txt']))

            # this is the final, last run for conll2000
            fpath = tb_fs.join_paths([d['out_folder'], 'final_pred_test.txt'])
            if tb_fs.file_exists(fpath):

                (d['final_test_precision'], d['final_test_recall'],
                 d['final_test_fb1']) = _extract_fn(fpath)

    df = tb_ut.create_dataframe(ds, abort_if_different_keys)

    # shorten the row-key names to at most max_column_width characters.
    df = df.rename(columns={k: k[:max_column_width] for k in rows})
    rows = [k[:max_column_width] for k in rows]

    # determine the table layout: one pivot table per value, or a single
    # multi-value table.
    if not single_row_multitable:

        ts = [
            df.pivot_table(index=rows, columns=columns, values=[v])
            for v in values
        ]

    else:
        ts = [
            df.pivot_table(
                index=rows, columns=columns,
                values=values)  #.sort_values('dev_accuracy', ascending=False)
        ]

    tb_fs.create_folder('analyses/%s' % experiment_name, abort_if_exists=False)
    s_c = pformat(c)
    ss_df = [
        t.to_string(float_format=get_float_formatter(2, 100.0)) for t in ts
    ]

    # output: the constant config followed by each rendered table,
    # separated by blank lines.
    lines = [s_c]
    for s in ss_df:
        lines.append('')
        lines.append(s)

    if print_to_terminal:
        # print to terminal
        for s in lines:
            print(s)

    # write to file
    tb_io.write_textfile('analyses/%s/results.txt' % experiment_name, lines)
    tb_io.write_csvfile(ds,
                        'analyses/%s/results.csv' % experiment_name,
                        sort_keys=True,
                        abort_if_different_keys=abort_if_different_keys)
Exemplo n.º 16
0
 def _get_folderpath(self, foldername):
     """Return the on-disk folder holding the memoized value for *foldername*."""
     foldername_on_disk = "memo_value-%s" % foldername
     return tb_fs.join_paths([self.folderpath, foldername_on_disk])
Exemplo n.º 17
0
 def _get_filepath(self, filetype, filename, fileext):
     """Build the path '<filetype>-<filename>.<fileext>' under self.folderpath."""
     basename = "%s-%s.%s" % (filetype, filename, fileext)
     return tb_fs.join_paths([self.folderpath, basename])
Exemplo n.º 18
0
def create_experiment_folder(
        main_filepath,
        argname_lst,
        argval_lst_lst,
        output_folderpath_argname,
        all_experiments_folderpath,
        readme,
        experiment_name=None,
        # entry_folderpath=None,
        code_folderpath=None,
        # data_folderpath=None,
        capture_output=False,
        profile_run=False):
    """Create an experiment folder with one runnable cfg<i> subfolder per config.

    For each row of argval_lst_lst, creates cfg<i> containing a run.sh that
    invokes main_filepath with the named arguments (plus the config's output
    folder bound to output_folderpath_argname) and a config.json recording
    them. Optionally snapshots code_folderpath into the experiment folder.
    Writes a top-level config.json and a parallelizable runall script, and
    returns the experiment folder path.

    NOTE(review): tb_ut.subset_dict_via_selection(locals(), ...) picks local
    variables BY NAME — the local names used below must not be renamed.
    """
    assert tb_fs.folder_exists(all_experiments_folderpath)
    # an explicit experiment name must not collide with an existing one.
    assert experiment_name is None or (not tb_fs.path_exists(
        tb_fs.join_paths([all_experiments_folderpath, experiment_name])))
    # assert folder_exists(project_folderpath) and file_exists(tb_fs.join_paths([
    #     project_folderpath, main_relfilepath]))

    # create the main folder where things for the experiment will be.
    if experiment_name is None:
        experiment_name = get_available_filename(all_experiments_folderpath,
                                                 "exp")
    experiment_folderpath = tb_fs.join_paths(
        [all_experiments_folderpath, experiment_name])
    tb_fs.create_folder(experiment_folderpath)

    # copy the code to the experiment folder.
    if code_folderpath is not None:
        code_foldername = tb_fs.path_last_element(code_folderpath)
        dst_code_fo = tb_fs.join_paths(
            [experiment_folderpath, code_foldername])

        tb_fs.copy_folder(code_folderpath,
                          dst_code_fo,
                          ignore_hidden_files=True,
                          ignore_hidden_folders=True,
                          ignore_file_exts=['.pyc'])

        # change main_filepath to use that new code.
        main_filepath = tb_fs.join_paths(
            [experiment_folderpath, main_filepath])

    # NOTE: no data copying for now because it often does not make much sense.
    data_folderpath = None  ### TODO: remove later.
    # # copy the code to the experiment folder.
    # if data_folderpath is not None:
    #     data_foldername = path_last_element(data_folderpath)
    #     dst_data_fo = join_paths([experiment_folderpath, data_foldername])

    #     copy_folder(data_folderpath, dst_data_fo,
    #         ignore_hidden_files=True, ignore_hidden_folders=True)

    # write the config for the experiment (selected from locals() by name).
    tb_io.write_jsonfile(
        tb_ut.subset_dict_via_selection(locals(), [
            'main_filepath', 'argname_lst', 'argval_lst_lst',
            'output_folderpath_argname', 'all_experiments_folderpath',
            'readme', 'experiment_name', 'code_folderpath', 'data_folderpath',
            'capture_output', 'profile_run'
        ]), tb_fs.join_paths([experiment_folderpath, 'config.json']))

    # generate the executables for each configuration.
    # the output folder is appended as one extra named argument.
    argname_lst = list(argname_lst)
    argname_lst.append(output_folderpath_argname)
    for (i, vs) in enumerate(argval_lst_lst):
        cfg_folderpath = tb_fs.join_paths([experiment_folderpath, "cfg%d" % i])
        tb_fs.create_folder(cfg_folderpath)

        # create the script
        argvalue_lst = list(vs)
        argvalue_lst.append(cfg_folderpath)
        # again selected from locals() by name — do not rename these locals.
        call_args = tb_ut.subset_dict_via_selection(
            locals(), ['argname_lst', 'argvalue_lst', 'main_filepath'])

        call_args['script_filepath'] = tb_fs.join_paths(
            [cfg_folderpath, 'run.sh'])
        if capture_output:
            call_args['output_filepath'] = tb_fs.join_paths(
                [cfg_folderpath, 'output.txt'])
        if profile_run:
            call_args['profile_filepath'] = tb_fs.join_paths(
                [cfg_folderpath, 'profile.txt'])
        create_run_script(**call_args)

        # write a config file for each configuration
        tb_io.write_jsonfile(tb_ut.create_dict(argname_lst, argvalue_lst),
                             tb_fs.join_paths([cfg_folderpath, 'config.json']))
    # create_runall_script(experiment_folderpath)
    create_runall_script_with_parallelization(experiment_folderpath)

    return experiment_folderpath
Exemplo n.º 19
0
 def _get_filepath(self, name, use_json):
     """Return the saver path for *name*, with a .json or .pkl extension."""
     ext = ".json" if use_json else '.pkl'
     return tb_fs.join_paths([self.saver_folderpath, name + ext])