예제 #1
0
    def _get_memo(self, config_lst, create_if_notexists=False):
        memo = self
        for cfg in config_lst:
            key = memo._key_from_config(cfg)
            if key not in memo.key_to_foldername:
                if create_if_notexists:
                    foldername = memo._get_unique_foldername()
                    value_folderpath = memo._get_folderpath(foldername)
                    cfg_filepath = memo._get_filepath("memo_config",
                                                      foldername, "json")
                    tb_fs.create_folder(value_folderpath)
                    tb_io.write_jsonfile(cfg, cfg_filepath)

                    next_memo = NestedMemoManager(value_folderpath)
                    memo.key_to_foldername[key] = foldername
                    memo.key_to_memo[key] = next_memo
                else:
                    return None
            else:
                if key not in memo.key_to_memo:
                    foldername = memo.key_to_foldername[key]
                    value_folderpath = memo._get_folderpath(foldername)
                    memo.key_to_memo[key] = NestedMemoManager(value_folderpath)
                next_memo = memo.key_to_memo[key]
            memo = next_memo
        return memo
예제 #2
0
    def _get_memo(self, config_lst, create_if_notexists=False):
        memo = self
        for cfg in config_lst:
            key = memo._key_from_config(cfg)
            if key not in memo.key_to_foldername:
                if create_if_notexists:
                    # get new unique name
                    foldername = memo._get_unique_foldername()
                    cfg_filepath, memo_folderpath = memo._get_memo_paths(
                        foldername)
                    tb_fs.create_folder(memo_folderpath)
                    tb_io.write_jsonfile(cfg, cfg_filepath)

                    next_memo = SimplifiedNestedMemoManager(memo_folderpath)
                    memo.key_to_foldername[key] = foldername
                    memo.key_to_memo[key] = next_memo
                else:
                    return None
            else:
                if key not in memo.key_to_memo:
                    # use existing name
                    foldername = memo.key_to_foldername[key]
                    memo_folderpath = memo._get_memo_paths(foldername)[1]
                    memo.key_to_memo[key] = SimplifiedNestedMemoManager(
                        memo_folderpath)
                next_memo = memo.key_to_memo[key]
            memo = next_memo
        return memo
예제 #3
0
    def __init__(self, folderpath, create_if_notexists=False):
        """Open the memo rooted at ``folderpath`` and index what it holds.

        The folder is passed to ``tb_fs.create_folder`` with
        ``abort_if_exists=False``; ``create_if_notexists`` is forwarded as
        ``create_parent_folders``. Every ``file_config-<name>.json`` found in
        the folder registers ``<name>`` in ``key_to_filename`` and every
        ``memo_config-<name>.json`` registers ``<name>`` in
        ``key_to_foldername``, keyed by ``_key_from_config`` of the JSON
        content. ``key_to_memo`` starts empty.
        """
        self.folderpath = folderpath
        self.key_to_filename = {}
        self.key_to_foldername = {}
        self.key_to_memo = {}

        tb_fs.create_folder(folderpath,
                            abort_if_exists=False,
                            create_parent_folders=create_if_notexists)

        suffix = '.json'
        # config-file prefix -> index that records the name found after it
        # (files first, then sub-memos).
        index_by_prefix = (
            ('file_config-', self.key_to_filename),
            ('memo_config-', self.key_to_foldername),
        )

        # rebuild the in-memory indices from the state of the memo folder.
        for path in tb_fs.list_files(folderpath):
            basename = tb_fs.path_last_element(path)
            if not basename.endswith(suffix):
                continue
            for prefix, index in index_by_prefix:
                if basename.startswith(prefix):
                    entry_name = basename[len(prefix):-len(suffix)]
                    config = tb_io.read_jsonfile(path)
                    index[self._key_from_config(config)] = entry_name
                    break
예제 #4
0
    def __init__(self, folderpath, create_if_notexists=False):
        """Open the memo stored in ``folderpath`` and index its config files.

        The folder is passed to ``tb_fs.create_folder`` with
        ``abort_if_exists=False``; ``create_if_notexists`` is forwarded as
        ``create_parent_folders``. Each ``config-<name>.json`` file found in
        the folder registers ``<name>`` in ``key_to_filename`` under the key
        that ``_key_from_config`` derives from the file's JSON content.
        """
        self.folderpath = folderpath
        self.key_to_filename = {}

        tb_fs.create_folder(folderpath,
                            abort_if_exists=False,
                            create_parent_folders=create_if_notexists)

        prefix, suffix = 'config-', '.json'
        # rebuild the index from whatever is already in the folder.
        for path in tb_fs.list_files(folderpath):
            basename = tb_fs.path_last_element(path)
            is_config = basename.startswith(prefix) and basename.endswith(
                suffix)
            if not is_config:
                continue
            stem = basename[len(prefix):-len(suffix)]
            key = self._key_from_config(tb_io.read_jsonfile(path))
            self.key_to_filename[key] = stem
예제 #5
0
def create_project_folder(folderpath, project_name, initialize_git_repo=False):
    """Lay out a skeleton project called ``project_name`` inside ``folderpath``.

    Creates the project root with a package folder named after the project
    and the ``analyses``, ``data``, ``experiments``, ``notes`` and ``temp``
    folders, then fills them with empty code files, note files, readme
    placeholders and a ``.gitignore``. When ``initialize_git_repo`` is true,
    a shell command runs ``git init``, adds the files and makes an initial
    commit from the project root.
    """

    def project_path(parts):
        # path of an entry located under <folderpath>/<project_name>.
        return tb_fs.join_paths([folderpath, project_name] + parts)

    tb_fs.create_folder(project_path([]))
    # typical directories
    for foldername in (project_name, "analyses", "data", "experiments",
                       "notes", "temp"):
        tb_fs.create_folder(project_path([foldername]))

    # code files (in order): data, preprocessing, model definition, model
    # training, model evaluation, main to generate the results with different
    # relevant parameters, setting up different experiments, analyze the
    # results and generate plots and tables.
    for code_filename in ("__init__.py", "data.py", "preprocess.py",
                          "model.py", "train.py", "evaluate.py", "main.py",
                          "experiment.py", "analyze.py"):
        tb_fs.create_file(project_path([project_name, code_filename]))

    # add an empty script that can be used to download data.
    tb_fs.create_file(project_path(["data", "download_data.py"]))

    # common notes to keep around.
    for note_filename in ("journal.txt", "reading_list.txt", "todos.txt"):
        tb_fs.create_file(project_path(["notes", note_filename]))

    # placeholders: (folder, single line written to its readme.txt).
    readme_line_by_folder = (
        ("experiments", "All experiments will be placed under this folder."),
        ("temp",
         "Here lie temporary files that are relevant or useful for the project "
         "but that are not kept under version control."),
        ("analyses",
         "Here lie files containing information extracted from the "
         "results of the experiments. Tables and plots are typical examples."),
    )
    for foldername, readme_line in readme_line_by_folder:
        tb_io.write_textfile(project_path([foldername, "readme.txt"]),
                             [readme_line])

    # typical git ignore file.
    ignored_patterns = ["data", "experiments", "temp", "*.pyc", "*.pdf", "*.aux"]
    tb_io.write_textfile(project_path([".gitignore"]), ignored_patterns)

    if not initialize_git_repo:
        return

    # NOTE(review): the project path and name are interpolated unquoted into
    # a shell command; names containing spaces or shell metacharacters would
    # presumably break it -- confirm before using with arbitrary names.
    command = ("cd %s && git init && git add -f .gitignore * && "
               "git commit -a -m \"Initial commit for %s.\" && cd -" %
               (project_path([]), project_name))
    subprocess.call(command, shell=True)
예제 #6
0
def create_table_from_experiment(experiment_name,
                                 rows,
                                 columns,
                                 values,
                                 abort_if_incomplete_configs=True,
                                 use_checkpoints=False,
                                 single_row_multitable=False,
                                 print_to_terminal=True,
                                 max_column_width=10**9,
                                 abort_if_different_keys=True):
    """Summarize the results of an experiment as pivot tables.

    Reads the (config, result) pairs that ``explore_experiment`` returns for
    ``experiments/<experiment_name>``, keeps the pairs whose result is not
    None, summarizes each pair into a record and pivots the records. The
    tables are optionally printed and are always written, together with a
    CSV of the records, under ``analyses/<experiment_name>/``.

    Args:
        experiment_name: name of the folder under ``experiments/``.
        rows: keys passed as ``index`` to ``pivot_table``.
        columns: keys passed as ``columns`` to ``pivot_table``.
        values: keys passed as ``values`` to ``pivot_table``. If any of the
            dev/test precision, recall or fb1 keys is requested, these
            metrics are computed by running the conlleval script on the
            prediction files in each record's ``out_folder``.
        abort_if_incomplete_configs: if true, a config without a result
            trips an assertion; otherwise it is skipped.
        use_checkpoints: forwarded to ``explore_experiment``.
        single_row_multitable: if true, build one table holding all values;
            otherwise build one table per value.
        print_to_terminal: if true, also print the generated lines.
        max_column_width: row key names are truncated to this many
            characters.
        abort_if_different_keys: forwarded to ``tb_ut.create_dataframe`` and
            ``tb_io.write_csvfile``.

    Raises:
        AssertionError: a config has no result and
            ``abort_if_incomplete_configs`` is true.
    """

    _, xs = explore_experiment('experiments/%s' % experiment_name,
                               use_checkpoints)

    # keep only the configurations that have a result.
    cfgs = []
    res = []
    for (cfg, result) in xs:
        if result is not None:
            cfgs.append(cfg)
            res.append(result)
        else:
            assert not abort_if_incomplete_configs
    xs = tb_ut.zip_toggle([cfgs, res])

    # c is the first config with the varying keys removed.
    ks = keys_with_variation(cfgs)
    c = dict(cfgs[0])
    for k in ks:
        c.pop(k)

    # out_folder is not worth printing; tolerate it being absent from the
    # varying keys instead of failing with a KeyError.
    ks.pop('out_folder', None)
    print("***%s***" % experiment_name)
    pprint(ks)
    print()

    ds = [summarize_results(tb_ut.merge_dicts(x)) for x in xs]

    # if any conlleval-based metric was requested, compute them all from the
    # prediction files.
    conlleval_keys = [
        'dev_precision', 'dev_recall', 'dev_fb1', 'test_precision',
        'test_recall', 'test_fb1'
    ]
    if any(v in values for v in conlleval_keys):

        def _extract_fn(fpath):
            """Return (precision, recall, fb1) as fractions for ``fpath``."""
            # universal_newlines=True makes check_output return text instead
            # of bytes on Python 3, so the str-based parsing below works on
            # both Python 2 and Python 3.
            out = subprocess.check_output(
                ["cat %s | data/conll_2000/conlleval.txt" % fpath],
                shell=True,
                universal_newlines=True)

            # the second output line holds the '%; '-separated overall
            # scores; the first field is skipped and the remaining three
            # percentages are converted to fractions.
            res_line = out.split('\n')[1]
            # build a list (not a map object) so that it can be sliced on
            # Python 3 as well.
            p, r, fb1 = [
                0.01 * float(x.split(': ')[1])
                for x in res_line.split('%; ')
            ][1:]

            return p, r, fb1

        # add the test and dev performances to each record.
        for d in ds:
            (d['dev_precision'], d['dev_recall'], d['dev_fb1']) = _extract_fn(
                tb_fs.join_paths([d['out_folder'], 'pred_dev.txt']))

            (d['test_precision'], d['test_recall'],
             d['test_fb1']) = _extract_fn(
                 tb_fs.join_paths([d['out_folder'], 'pred_test.txt']))

            # this is the final, last run for conll2000
            fpath = tb_fs.join_paths([d['out_folder'], 'final_pred_test.txt'])
            if tb_fs.file_exists(fpath):

                (d['final_test_precision'], d['final_test_recall'],
                 d['final_test_fb1']) = _extract_fn(fpath)

    df = tb_ut.create_dataframe(ds, abort_if_different_keys)

    # shorten the row key names appropriately.
    df = df.rename(columns={k: k[:max_column_width] for k in rows})
    rows = [k[:max_column_width] for k in rows]

    # determines the table layout.
    if not single_row_multitable:

        ts = [
            df.pivot_table(index=rows, columns=columns, values=[v])
            for v in values
        ]

    else:
        ts = [
            df.pivot_table(
                index=rows, columns=columns,
                values=values)  #.sort_values('dev_accuracy', ascending=False)
        ]

    tb_fs.create_folder('analyses/%s' % experiment_name, abort_if_exists=False)
    s_c = pformat(c)
    ss_df = [
        t.to_string(float_format=get_float_formatter(2, 100.0)) for t in ts
    ]

    # the formatted config comes first, then each table preceded by a blank
    # line.
    lines = [s_c]
    for s in ss_df:
        lines.append('')
        lines.append(s)

    if print_to_terminal:
        # print to terminal
        for s in lines:
            print(s)

    # write to file
    tb_io.write_textfile('analyses/%s/results.txt' % experiment_name, lines)
    tb_io.write_csvfile(ds,
                        'analyses/%s/results.csv' % experiment_name,
                        sort_keys=True,
                        abort_if_different_keys=abort_if_different_keys)
예제 #7
0
def create_experiment_folder(
        main_filepath,
        argname_lst,
        argval_lst_lst,
        output_folderpath_argname,
        all_experiments_folderpath,
        readme,
        experiment_name=None,
        # entry_folderpath=None,
        code_folderpath=None,
        # data_folderpath=None,
        capture_output=False,
        profile_run=False):
    """Create an experiment folder with one runnable sub-folder per config.

    The experiment folder is created under ``all_experiments_folderpath``.
    It receives a ``config.json`` describing the call, an optional copy of
    the code in ``code_folderpath``, and one ``cfg<i>`` folder per entry of
    ``argval_lst_lst``. Each ``cfg<i>`` folder gets a ``run.sh`` generated by
    ``create_run_script`` and a ``config.json`` built from the argument
    names and that entry's values. The ``cfg<i>`` folder path itself is
    appended as the value of ``output_folderpath_argname``.

    Args:
        main_filepath: path of the main file to run. When ``code_folderpath``
            is given, it is re-rooted by joining it onto the experiment
            folder path.
        argname_lst: names of the arguments passed to the main file.
        argval_lst_lst: one sequence of argument values per configuration.
        output_folderpath_argname: name of the argument that receives the
            configuration folder path.
        all_experiments_folderpath: existing folder that holds experiments.
        readme: stored in the experiment ``config.json``.
        experiment_name: name of the new experiment folder; when None, one
            is obtained from ``get_available_filename`` with prefix "exp".
        code_folderpath: folder copied into the experiment folder, skipping
            hidden files, hidden folders and ``.pyc`` files.
        capture_output: if true, ``create_run_script`` is given an
            ``output_filepath`` of ``output.txt`` in the config folder.
        profile_run: if true, ``create_run_script`` is given a
            ``profile_filepath`` of ``profile.txt`` in the config folder.

    Returns:
        The path of the created experiment folder.

    Raises:
        AssertionError: ``all_experiments_folderpath`` does not exist, or
            ``experiment_name`` is given and already exists under it.
    """

    assert tb_fs.folder_exists(all_experiments_folderpath)
    assert experiment_name is None or (not tb_fs.path_exists(
        tb_fs.join_paths([all_experiments_folderpath, experiment_name])))
    # assert folder_exists(project_folderpath) and file_exists(tb_fs.join_paths([
    #     project_folderpath, main_relfilepath]))

    # create the main folder where things for the experiment will be.
    if experiment_name is None:
        experiment_name = get_available_filename(all_experiments_folderpath,
                                                 "exp")
    experiment_folderpath = tb_fs.join_paths(
        [all_experiments_folderpath, experiment_name])
    tb_fs.create_folder(experiment_folderpath)

    # copy the code to the experiment folder.
    if code_folderpath is not None:
        code_foldername = tb_fs.path_last_element(code_folderpath)
        dst_code_fo = tb_fs.join_paths(
            [experiment_folderpath, code_foldername])

        tb_fs.copy_folder(code_folderpath,
                          dst_code_fo,
                          ignore_hidden_files=True,
                          ignore_hidden_folders=True,
                          ignore_file_exts=['.pyc'])

        # change main_filepath to use that new code.
        main_filepath = tb_fs.join_paths(
            [experiment_folderpath, main_filepath])

    # NOTE: no data copying for now because it often does not make much sense.
    # data_folderpath must stay defined as a local: it is one of the names
    # selected from locals() when writing the experiment config below.
    data_folderpath = None  ### TODO: remove later.
    # # copy the code to the experiment folder.
    # if data_folderpath is not None:
    #     data_foldername = path_last_element(data_folderpath)
    #     dst_data_fo = join_paths([experiment_folderpath, data_foldername])

    #     copy_folder(data_folderpath, dst_data_fo,
    #         ignore_hidden_files=True, ignore_hidden_folders=True)

    # write the config for the experiment.
    # NOTE: the values are picked by name from locals(), so the local names
    # listed here must not be renamed. This snapshot is taken before
    # argname_lst is extended below, so the saved argname_lst does not
    # include output_folderpath_argname, while main_filepath is the possibly
    # re-rooted one.
    tb_io.write_jsonfile(
        tb_ut.subset_dict_via_selection(locals(), [
            'main_filepath', 'argname_lst', 'argval_lst_lst',
            'output_folderpath_argname', 'all_experiments_folderpath',
            'readme', 'experiment_name', 'code_folderpath', 'data_folderpath',
            'capture_output', 'profile_run'
        ]), tb_fs.join_paths([experiment_folderpath, 'config.json']))

    # generate the executables for each configuration.
    # work on a copy so that the caller's argname_lst is not mutated.
    argname_lst = list(argname_lst)
    argname_lst.append(output_folderpath_argname)
    for (i, vs) in enumerate(argval_lst_lst):
        cfg_folderpath = tb_fs.join_paths([experiment_folderpath, "cfg%d" % i])
        tb_fs.create_folder(cfg_folderpath)

        # create the script
        # the configuration folder is the value of the output folder argument.
        argvalue_lst = list(vs)
        argvalue_lst.append(cfg_folderpath)
        # these three locals are selected by name and become keyword
        # arguments of create_run_script.
        call_args = tb_ut.subset_dict_via_selection(
            locals(), ['argname_lst', 'argvalue_lst', 'main_filepath'])

        call_args['script_filepath'] = tb_fs.join_paths(
            [cfg_folderpath, 'run.sh'])
        if capture_output:
            call_args['output_filepath'] = tb_fs.join_paths(
                [cfg_folderpath, 'output.txt'])
        if profile_run:
            call_args['profile_filepath'] = tb_fs.join_paths(
                [cfg_folderpath, 'profile.txt'])
        create_run_script(**call_args)

        # write a config file for each configuration
        tb_io.write_jsonfile(tb_ut.create_dict(argname_lst, argvalue_lst),
                             tb_fs.join_paths([cfg_folderpath, 'config.json']))
    # create_runall_script(experiment_folderpath)
    create_runall_script_with_parallelization(experiment_folderpath)

    return experiment_folderpath
예제 #8
0
#                     out_d[k] = v
#             else:
#                 raise ValueError
#     for k, v in out_d.iteritems():
#         print k
#         print v

########### INITIALIZATION ###########

# The config file path is the command-line argument that immediately follows
# '--config_filepath'. list.index raises ValueError when the flag is missing,
# and the lookup raises IndexError when the flag is the last argument.
idx = sys.argv.index('--config_filepath') + 1
cfg_filepath = sys.argv[idx]
cfg = tb_io.read_jsonfile_with_overlays(cfg_filepath)
pprint(cfg)

# Set up the output folder named in the config; abort_if_exists=False
# presumably makes an already existing folder acceptable -- TODO confirm.
tb_fs.create_folder(cfg["out_folder"],
                    abort_if_exists=False,
                    create_parent_folders=True)

# When run with '--train', store the config in the output folder.
if '--train' in sys.argv:
    tb_io.write_jsonfile(cfg, cfg["out_folder"] + "/cfg.json")

# The data is loaded from this folder after it has been processed by main_preprocess.py.
# NOTE(review): only the "supertagging" data type is handled here; for any
# other value this statement leaves tags_key, train_data, dev_data and
# test_data unassigned.
if cfg["data_type"] == "supertagging":
    tags_key = "supertags"
    train_data = tb_io.read_jsonlogfile('data/supertagging/train.jsonl')
    dev_data = tb_io.read_jsonlogfile('data/supertagging/dev.jsonl')
    test_data = tb_io.read_jsonlogfile('data/supertagging/test.jsonl')
# elif cfg["data_type"] == "conll2000":
#     tags_key = "chunk_tags"
#     test_data = tb_io.read_jsonlogfile('data/conll2000/test.jsonl')
#     train_data = tb_io.read_jsonlogfile('data/conll2000/train.jsonl')
예제 #9
0

if __name__ == "__main__":
    # Read every example found in the CCG Bank AUTO folder, split the
    # examples into train/dev/test by section number and write each split
    # as a JSON log file under data/supertagging.

    # Path to CCG Bank AUTO folder.
    folderpath = "data/ccgbank_1_1/data/AUTO/"
    filepath_lst = tb_fs.list_files(folderpath, recursive=True)
    examples = []
    for fpath in filepath_lst:
        examples.extend(read_supertagging_auto_file(fpath))

    train_examples = []
    dev_examples = []
    test_examples = []
    # The section number is read from the two characters that follow the
    # first len("wsj_") characters of the example id (assumes ids of the
    # form "wsj_<2-digit section>..." -- TODO confirm).
    idx = len("wsj_")
    for e in examples:
        section_id = int(e["example_id"][idx:idx + 2])
        # sections 2-21 -> train, 0 -> dev, 23 -> test; others are dropped.
        if 2 <= section_id <= 21:
            train_examples.append(e)
        elif section_id == 0:
            dev_examples.append(e)
        elif section_id == 23:
            test_examples.append(e)

    # A single pre-formatted string prints the same space-separated counts
    # on Python 2 and Python 3 (the print statement is Python 2 only).
    print("%d %d %d" %
          (len(train_examples), len(dev_examples), len(test_examples)))
    # Paths for the output files
    tb_fs.create_folder("data/supertagging", abort_if_exists=False)
    tb_io.write_jsonlogfile("data/supertagging/train.jsonl", train_examples)
    tb_io.write_jsonlogfile("data/supertagging/dev.jsonl", dev_examples)
    tb_io.write_jsonlogfile("data/supertagging/test.jsonl", test_examples)