def _get_memo(self, config_lst, create_if_notexists=False):
    memo = self
    for cfg in config_lst:
        key = memo._key_from_config(cfg)
        if key not in memo.key_to_foldername:
            if create_if_notexists:
                foldername = memo._get_unique_foldername()
                value_folderpath = memo._get_folderpath(foldername)
                cfg_filepath = memo._get_filepath("memo_config", foldername,
                                                  "json")

                tb_fs.create_folder(value_folderpath)
                tb_io.write_jsonfile(cfg, cfg_filepath)

                next_memo = NestedMemoManager(value_folderpath)
                memo.key_to_foldername[key] = foldername
                memo.key_to_memo[key] = next_memo
            else:
                return None
        else:
            if key not in memo.key_to_memo:
                foldername = memo.key_to_foldername[key]
                value_folderpath = memo._get_folderpath(foldername)
                memo.key_to_memo[key] = NestedMemoManager(value_folderpath)
            next_memo = memo.key_to_memo[key]
        memo = next_memo
    return memo
def _get_memo(self, config_lst, create_if_notexists=False):
    memo = self
    for cfg in config_lst:
        key = memo._key_from_config(cfg)
        if key not in memo.key_to_foldername:
            if create_if_notexists:
                # get new unique name
                foldername = memo._get_unique_foldername()
                cfg_filepath, memo_folderpath = memo._get_memo_paths(
                    foldername)

                tb_fs.create_folder(memo_folderpath)
                tb_io.write_jsonfile(cfg, cfg_filepath)

                next_memo = SimplifiedNestedMemoManager(memo_folderpath)
                memo.key_to_foldername[key] = foldername
                memo.key_to_memo[key] = next_memo
            else:
                return None
        else:
            if key not in memo.key_to_memo:
                # use existing name
                foldername = memo.key_to_foldername[key]
                memo_folderpath = memo._get_memo_paths(foldername)[1]
                memo.key_to_memo[key] = SimplifiedNestedMemoManager(
                    memo_folderpath)
            next_memo = memo.key_to_memo[key]
        memo = next_memo
    return memo
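# A minimal usage sketch for the traversal above (hypothetical configs; it
# assumes the manager is constructed directly from a folderpath, as in the
# __init__ below). Each config in config_lst addresses one nesting level, and
# the corresponding subfolder is created on the way down when requested:
#
# root = NestedMemoManager("temp/memo", create_if_notexists=True)
# leaf = root._get_memo([{"model": "lstm"}, {"lr": 0.1}],
#                       create_if_notexists=True)
# # a second lookup with the same configs returns the cached sub-memo.
# assert root._get_memo([{"model": "lstm"}, {"lr": 0.1}]) is leaf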
def __init__(self, folderpath, create_if_notexists=False):
    self.folderpath = folderpath
    self.key_to_filename = {}
    self.key_to_foldername = {}
    self.key_to_memo = {}
    tb_fs.create_folder(folderpath,
                        abort_if_exists=False,
                        create_parent_folders=create_if_notexists)

    # initialize the memo based on the state of the memo folder:
    for p in tb_fs.list_files(folderpath):
        name_with_ext = tb_fs.path_last_element(p)
        # for the files.
        if name_with_ext.startswith(
                'file_config-') and name_with_ext.endswith('.json'):
            name = name_with_ext[len('file_config-'):-len('.json')]
            config = tb_io.read_jsonfile(p)
            key = self._key_from_config(config)
            self.key_to_filename[key] = name
        # for the sub-memos.
        elif name_with_ext.startswith(
                'memo_config-') and name_with_ext.endswith('.json'):
            name = name_with_ext[len('memo_config-'):-len('.json')]
            config = tb_io.read_jsonfile(p)
            key = self._key_from_config(config)
            self.key_to_foldername[key] = name
def __init__(self, folderpath, create_if_notexists=False):
    self.folderpath = folderpath
    self.key_to_filename = {}
    tb_fs.create_folder(folderpath,
                        abort_if_exists=False,
                        create_parent_folders=create_if_notexists)

    # initialize the memo based on the state of the folder.
    for fpath in tb_fs.list_files(folderpath):
        fname_with_ext = tb_fs.path_last_element(fpath)
        if fname_with_ext.startswith(
                'config-') and fname_with_ext.endswith('.json'):
            fname = fname_with_ext[len('config-'):-len('.json')]
            config = tb_io.read_jsonfile(fpath)
            key = self._key_from_config(config)
            self.key_to_filename[key] = fname
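# One plausible sketch of the key function used by both managers (the actual
# _key_from_config is not shown in this excerpt, so treat this as an
# assumption): any deterministic serialization of the config dictionary works
# as a hashable key.
#
# import json
#
# def _key_from_config(self, config):
#     return json.dumps(config, sort_keys=True)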
def create_project_folder(folderpath, project_name, initialize_git_repo=False):
    fn = lambda xs: tb_fs.join_paths([folderpath, project_name] + xs)

    tb_fs.create_folder(fn([]))
    # typical directories
    tb_fs.create_folder(fn([project_name]))
    tb_fs.create_folder(fn(["analyses"]))
    tb_fs.create_folder(fn(["data"]))
    tb_fs.create_folder(fn(["experiments"]))
    tb_fs.create_folder(fn(["notes"]))
    tb_fs.create_folder(fn(["temp"]))

    # code files (in order): data, preprocessing, model definition, model
    # training, model evaluation, a main to generate the results with the
    # different relevant parameters, setting up the different experiments, and
    # analyzing the results to generate plots and tables.
    tb_fs.create_file(fn([project_name, "__init__.py"]))
    tb_fs.create_file(fn([project_name, "data.py"]))
    tb_fs.create_file(fn([project_name, "preprocess.py"]))
    tb_fs.create_file(fn([project_name, "model.py"]))
    tb_fs.create_file(fn([project_name, "train.py"]))
    tb_fs.create_file(fn([project_name, "evaluate.py"]))
    tb_fs.create_file(fn([project_name, "main.py"]))
    tb_fs.create_file(fn([project_name, "experiment.py"]))
    tb_fs.create_file(fn([project_name, "analyze.py"]))

    # add an empty script that can be used to download data.
    tb_fs.create_file(fn(["data", "download_data.py"]))

    # common notes to keep around.
    tb_fs.create_file(fn(["notes", "journal.txt"]))
    tb_fs.create_file(fn(["notes", "reading_list.txt"]))
    tb_fs.create_file(fn(["notes", "todos.txt"]))

    # placeholders
    tb_io.write_textfile(
        fn(["experiments", "readme.txt"]),
        ["All experiments will be placed under this folder."])
    tb_io.write_textfile(fn(["temp", "readme.txt"]), [
        "Here lie temporary files that are relevant or useful for the project "
        "but that are not kept under version control."
    ])
    tb_io.write_textfile(fn(["analyses", "readme.txt"]), [
        "Here lie files containing information extracted from the "
        "results of the experiments. Tables and plots are typical examples."
    ])

    # typical git ignore file.
    tb_io.write_textfile(
        fn([".gitignore"]),
        ["data", "experiments", "temp", "*.pyc", "*.pdf", "*.aux"])

    if initialize_git_repo:
        subprocess.call(
            "cd %s && git init && git add -f .gitignore * && "
            "git commit -a -m \"Initial commit for %s.\" && cd -" %
            (fn([]), project_name),
            shell=True)
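# Example invocation (hypothetical paths): this lays out projects/my_project
# with the package folder, the analyses/data/experiments/notes/temp
# directories, the placeholder readmes, and the .gitignore.
#
# create_project_folder("projects", "my_project", initialize_git_repo=True)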
def create_table_from_experiment(experiment_name,
                                 rows,
                                 columns,
                                 values,
                                 abort_if_incomplete_configs=True,
                                 use_checkpoints=False,
                                 single_row_multitable=False,
                                 print_to_terminal=True,
                                 max_column_width=10**9,
                                 abort_if_different_keys=True):

    _, xs = explore_experiment('experiments/%s' % experiment_name,
                               use_checkpoints)

    cfgs = []
    res = []
    for (c, r) in xs:
        if r is not None:
            cfgs.append(c)
            res.append(r)
        else:
            assert not abort_if_incomplete_configs
    xs = tb_ut.zip_toggle([cfgs, res])

    ks = keys_with_variation(cfgs)
    c = dict(cfgs[0])
    for k in ks:
        c.pop(k)
    ks.pop('out_folder')

    print("***%s***" % experiment_name)
    pprint(ks)
    print()

    ds = [summarize_results(tb_ut.merge_dicts(x)) for x in xs]

    # if the requested values include conll-style metrics, compute them from
    # the prediction files with the conlleval script.
    if any(v in values for v in [
            'dev_precision', 'dev_recall', 'dev_fb1', 'test_precision',
            'test_recall', 'test_fb1'
    ]):

        def _extract_fn(fpath):
            out = subprocess.check_output(
                ["cat %s | data/conll_2000/conlleval.txt" % fpath],
                shell=True).decode()
            res_line = out.split('\n')[1]
            f1 = float(res_line.split(';')[-1].split(": ")[1])
            p, r, fb1 = [
                0.01 * float(x.split(': ')[1])
                for x in res_line.split('%; ')
            ][1:]
            return p, r, fb1

        # add the test and dev performances to the file.
        for d in ds:
            (d['dev_precision'], d['dev_recall'], d['dev_fb1']) = _extract_fn(
                tb_fs.join_paths([d['out_folder'], 'pred_dev.txt']))
            (d['test_precision'], d['test_recall'],
             d['test_fb1']) = _extract_fn(
                 tb_fs.join_paths([d['out_folder'], 'pred_test.txt']))

            # this is the final, last run for conll2000.
            fpath = tb_fs.join_paths([d['out_folder'], 'final_pred_test.txt'])
            if tb_fs.file_exists(fpath):
                (d['final_test_precision'], d['final_test_recall'],
                 d['final_test_fb1']) = _extract_fn(fpath)

    df = tb_ut.create_dataframe(ds, abort_if_different_keys)

    # shorten the names appropriately.
    df = df.rename(columns={k: k[:max_column_width] for k in rows})
    rows = [k[:max_column_width] for k in rows]

    # determines the table layout.
    if not single_row_multitable:
        ts = [
            df.pivot_table(index=rows, columns=columns, values=[v])
            for v in values
        ]
    else:
        ts = [
            df.pivot_table(index=rows, columns=columns, values=values)
            # .sort_values('dev_accuracy', ascending=False)
        ]

    tb_fs.create_folder('analyses/%s' % experiment_name,
                        abort_if_exists=False)
    s_c = pformat(c)
    ss_df = [
        t.to_string(float_format=get_float_formatter(2, 100.0)) for t in ts
    ]
    lines = [s_c]
    for s in ss_df:
        lines.append('')
        lines.append(s)

    if print_to_terminal:
        # print to terminal.
        for s in lines:
            print(s)

    # write to file.
    tb_io.write_textfile('analyses/%s/results.txt' % experiment_name, lines)
    tb_io.write_csvfile(ds,
                        'analyses/%s/results.csv' % experiment_name,
                        sort_keys=True,
                        abort_if_different_keys=abort_if_different_keys)
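# Hypothetical call, assuming an existing folder experiments/exp0 whose
# configs vary "model" and "lr" and whose summarized results include
# "dev_fb1"; it writes analyses/exp0/results.txt and results.csv.
#
# create_table_from_experiment("exp0",
#                              rows=["model"],
#                              columns=["lr"],
#                              values=["dev_fb1"])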
def create_experiment_folder(
        main_filepath,
        argname_lst,
        argval_lst_lst,
        output_folderpath_argname,
        all_experiments_folderpath,
        readme,
        experiment_name=None,
        # entry_folderpath=None,
        code_folderpath=None,
        # data_folderpath=None,
        capture_output=False,
        profile_run=False):

    assert tb_fs.folder_exists(all_experiments_folderpath)
    assert experiment_name is None or (not tb_fs.path_exists(
        tb_fs.join_paths([all_experiments_folderpath, experiment_name])))
    # assert folder_exists(project_folderpath) and file_exists(tb_fs.join_paths([
    #     project_folderpath, main_relfilepath]))

    # create the main folder where things for the experiment will be.
    if experiment_name is None:
        experiment_name = get_available_filename(all_experiments_folderpath,
                                                 "exp")
    experiment_folderpath = tb_fs.join_paths(
        [all_experiments_folderpath, experiment_name])
    tb_fs.create_folder(experiment_folderpath)

    # copy the code to the experiment folder.
    if code_folderpath is not None:
        code_foldername = tb_fs.path_last_element(code_folderpath)
        dst_code_fo = tb_fs.join_paths(
            [experiment_folderpath, code_foldername])

        tb_fs.copy_folder(code_folderpath,
                          dst_code_fo,
                          ignore_hidden_files=True,
                          ignore_hidden_folders=True,
                          ignore_file_exts=['.pyc'])

        # change main_filepath to use that new code.
        main_filepath = tb_fs.join_paths(
            [experiment_folderpath, main_filepath])

    # NOTE: no data copying for now because it often does not make much sense.
    data_folderpath = None

    ### TODO: remove later.
    # # copy the data to the experiment folder.
    # if data_folderpath is not None:
    #     data_foldername = path_last_element(data_folderpath)
    #     dst_data_fo = join_paths([experiment_folderpath, data_foldername])
    #     copy_folder(data_folderpath, dst_data_fo,
    #                 ignore_hidden_files=True, ignore_hidden_folders=True)

    # write the config for the experiment.
    tb_io.write_jsonfile(
        tb_ut.subset_dict_via_selection(locals(), [
            'main_filepath', 'argname_lst', 'argval_lst_lst',
            'output_folderpath_argname', 'all_experiments_folderpath',
            'readme', 'experiment_name', 'code_folderpath', 'data_folderpath',
            'capture_output', 'profile_run'
        ]), tb_fs.join_paths([experiment_folderpath, 'config.json']))

    # generate the executables for each configuration.
    argname_lst = list(argname_lst)
    argname_lst.append(output_folderpath_argname)
    for (i, vs) in enumerate(argval_lst_lst):
        cfg_folderpath = tb_fs.join_paths(
            [experiment_folderpath, "cfg%d" % i])
        tb_fs.create_folder(cfg_folderpath)

        # create the script.
        argvalue_lst = list(vs)
        argvalue_lst.append(cfg_folderpath)
        call_args = tb_ut.subset_dict_via_selection(
            locals(), ['argname_lst', 'argvalue_lst', 'main_filepath'])

        call_args['script_filepath'] = tb_fs.join_paths(
            [cfg_folderpath, 'run.sh'])
        if capture_output:
            call_args['output_filepath'] = tb_fs.join_paths(
                [cfg_folderpath, 'output.txt'])
        if profile_run:
            call_args['profile_filepath'] = tb_fs.join_paths(
                [cfg_folderpath, 'profile.txt'])
        create_run_script(**call_args)

        # write a config file for each configuration.
        tb_io.write_jsonfile(tb_ut.create_dict(argname_lst, argvalue_lst),
                             tb_fs.join_paths([cfg_folderpath, 'config.json']))
    # create_runall_script(experiment_folderpath)
    create_runall_script_with_parallelization(experiment_folderpath)

    return experiment_folderpath
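# Hypothetical sweep over two learning rates; the argument names are
# placeholders for whatever main.py actually parses. Each configuration gets
# its own cfg%d folder containing run.sh and config.json.
#
# create_experiment_folder(main_filepath="main.py",
#                          argname_lst=["lr", "num_epochs"],
#                          argval_lst_lst=[[0.1, 10], [0.01, 10]],
#                          output_folderpath_argname="out_folder",
#                          all_experiments_folderpath="experiments",
#                          readme="Learning rate sweep.",
#                          capture_output=True)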
########### INITIALIZATION ###########

idx = sys.argv.index('--config_filepath') + 1
cfg_filepath = sys.argv[idx]
cfg = tb_io.read_jsonfile_with_overlays(cfg_filepath)
pprint(cfg)

tb_fs.create_folder(cfg["out_folder"],
                    abort_if_exists=False,
                    create_parent_folders=True)
if '--train' in sys.argv:
    tb_io.write_jsonfile(cfg, cfg["out_folder"] + "/cfg.json")

# The data is loaded from this folder after it has been processed by
# main_preprocess.py.
if cfg["data_type"] == "supertagging":
    tags_key = "supertags"
    train_data = tb_io.read_jsonlogfile('data/supertagging/train.jsonl')
    dev_data = tb_io.read_jsonlogfile('data/supertagging/dev.jsonl')
    test_data = tb_io.read_jsonlogfile('data/supertagging/test.jsonl')
# elif cfg["data_type"] == "conll2000":
#     tags_key = "chunk_tags"
#     test_data = tb_io.read_jsonlogfile('data/conll2000/test.jsonl')
#     train_data = tb_io.read_jsonlogfile('data/conll2000/train.jsonl')
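# Example command line and config for the block above (hypothetical script and
# file names; only the keys read above, "data_type" and "out_folder", are
# grounded in this script):
#
# python main.py --config_filepath cfgs/supertagging.json --train
#
# with cfgs/supertagging.json containing, e.g.,
# {"data_type": "supertagging", "out_folder": "temp/supertag_run0"}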
if __name__ == "__main__":
    # Path to CCG Bank AUTO folder.
    folderpath = "data/ccgbank_1_1/data/AUTO/"
    filepath_lst = tb_fs.list_files(folderpath, recursive=True)
    examples = []
    for fpath in filepath_lst:
        examples.extend(read_supertagging_auto_file(fpath))

    # standard split: sections 02-21 for training, 00 for dev, 23 for test.
    train_examples = []
    dev_examples = []
    test_examples = []
    idx = len("wsj_")
    for e in examples:
        section_id = int(e["example_id"][idx:idx + 2])
        if 2 <= section_id <= 21:
            train_examples.append(e)
        elif section_id == 0:
            dev_examples.append(e)
        elif section_id == 23:
            test_examples.append(e)
        else:
            continue
    print(len(train_examples), len(dev_examples), len(test_examples))

    # Paths for the output files.
    tb_fs.create_folder("data/supertagging", abort_if_exists=False)
    tb_io.write_jsonlogfile("data/supertagging/train.jsonl", train_examples)
    tb_io.write_jsonlogfile("data/supertagging/dev.jsonl", dev_examples)
    tb_io.write_jsonlogfile("data/supertagging/test.jsonl", test_examples)
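# For example (hypothetical ids), an example whose example_id starts with
# "wsj_02" yields section_id == 2 and goes to the training split; one starting
# with "wsj_00" (section 0) goes to dev, and "wsj_23" (section 23) to test.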