def _get_memo(self, config_lst, create_if_notexists=False):
    memo = self
    for cfg in config_lst:
        key = memo._key_from_config(cfg)
        if key not in memo.key_to_foldername:
            if create_if_notexists:
                foldername = memo._get_unique_foldername()
                value_folderpath = memo._get_folderpath(foldername)
                cfg_filepath = memo._get_filepath("memo_config", foldername,
                                                  "json")
                tb_fs.create_folder(value_folderpath)
                tb_io.write_jsonfile(cfg, cfg_filepath)
                next_memo = NestedMemoManager(value_folderpath)
                memo.key_to_foldername[key] = foldername
                memo.key_to_memo[key] = next_memo
            else:
                return None
        else:
            if key not in memo.key_to_memo:
                foldername = memo.key_to_foldername[key]
                value_folderpath = memo._get_folderpath(foldername)
                memo.key_to_memo[key] = NestedMemoManager(value_folderpath)
            next_memo = memo.key_to_memo[key]
        memo = next_memo
    return memo
def _get_memo(self, config_lst, create_if_notexists=False):
    memo = self
    for cfg in config_lst:
        key = memo._key_from_config(cfg)
        if key not in memo.key_to_foldername:
            if create_if_notexists:
                # get new unique name
                foldername = memo._get_unique_foldername()
                cfg_filepath, memo_folderpath = memo._get_memo_paths(
                    foldername)
                tb_fs.create_folder(memo_folderpath)
                tb_io.write_jsonfile(cfg, cfg_filepath)
                next_memo = SimplifiedNestedMemoManager(memo_folderpath)
                memo.key_to_foldername[key] = foldername
                memo.key_to_memo[key] = next_memo
            else:
                return None
        else:
            if key not in memo.key_to_memo:
                # use existing name
                foldername = memo.key_to_foldername[key]
                memo_folderpath = memo._get_memo_paths(foldername)[1]
                memo.key_to_memo[key] = SimplifiedNestedMemoManager(
                    memo_folderpath)
            next_memo = memo.key_to_memo[key]
        memo = next_memo
    return memo
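# A commented usage sketch of the lookup above (the folder path and config
# dicts are hypothetical). _get_memo walks one memo level per config in
# config_lst; with create_if_notexists=True it materializes missing levels
# on disk, and without it a miss returns None.
#
#   memo_root = NestedMemoManager("out/memo")
#   leaf = memo_root._get_memo(
#       [{"stage": "train"}, {"lr": 0.1}], create_if_notexists=True)
#   assert memo_root._get_memo([{"stage": "eval"}]) is None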
def save(self, name, x):
    cfg = self.name_to_cfg[name]
    out = cfg['save_fn'](x)
    filepath = self._get_filepath(name, cfg['use_json'])
    if cfg['use_json']:
        tb_io.write_jsonfile(out, filepath)
    else:
        tb_io.write_picklefile(out, filepath)
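# Hedged usage sketch: the registration side of name_to_cfg is not shown in
# this snippet, but `save` assumes each entry carries a 'save_fn' that turns
# the object into something serializable and a 'use_json' flag that selects
# JSON over pickle. The `logger` name below is hypothetical.
#
#   logger.name_to_cfg["dev_acc"] = {
#       "save_fn": lambda x: list(x),
#       "use_json": True}
#   logger.save("dev_acc", dev_accuracies)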
def write_file(self, config, value, abort_if_exists=True):
    key = self._key_from_config(config)
    assert not abort_if_exists or key not in self.key_to_filename
    # if it exists, get it from the dictionary; otherwise create a new
    # unique filename and register it.
    if key in self.key_to_filename:
        filename = self.key_to_filename[key]
    else:
        filename = self._get_unique_filename()
        self.key_to_filename[key] = filename
    cfg_filepath, value_filepath = self._get_file_paths(filename)
    tb_io.write_jsonfile(config, cfg_filepath)
    tb_io.write_picklefile(value, value_filepath)
def write(self, config, value, abort_if_exists=True):
    key = self._key_from_config(config)
    assert not abort_if_exists or key not in self.key_to_filename
    # if it exists, get it from the dictionary.
    if key in self.key_to_filename:
        filename = self.key_to_filename[key]
    else:
        filename = self._get_unique_filename()
    config_filepath = self._get_filepath('config', filename, 'json')
    tb_io.write_jsonfile(config, config_filepath)
    value_filepath = self._get_filepath('value', filename, 'pkl')
    tb_io.write_picklefile(value, value_filepath)
    self.key_to_filename[key] = filename
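# A commented usage sketch (the store object and configs are hypothetical):
# the config is hashed into a key, a JSON copy of the config and a pickled
# value are written side by side under a shared filename, and a repeated
# write with abort_if_exists=False reuses that filename, overwriting the
# value in place.
#
#   store.write({"lr": 0.1, "beam_size": 4}, results_dict,
#               abort_if_exists=False)
#   store.write({"lr": 0.1, "beam_size": 4}, updated_results_dict,
#               abort_if_exists=False)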
def write_file(self, config_lst, value, abort_if_exists=True):
    assert len(config_lst) > 0
    memo = self._get_memo(config_lst[:-1])
    assert memo is not None
    cfg = config_lst[-1]
    key = memo._key_from_config(cfg)
    assert not abort_if_exists or key not in memo.key_to_filename
    # if it exists, get it from the dictionary.
    if key in memo.key_to_filename:
        filename = memo.key_to_filename[key]
    else:
        filename = memo._get_unique_filename()
    config_filepath = memo._get_filepath('file_config', filename, 'json')
    tb_io.write_jsonfile(cfg, config_filepath)
    value_filepath = memo._get_filepath('file_value', filename, 'pkl')
    tb_io.write_picklefile(value, value_filepath)
    memo.key_to_filename[key] = filename
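# Commented usage sketch for the nested variant (names are hypothetical):
# all configs but the last select a memo folder via _get_memo, and the last
# config keys the file pair inside that folder. Note that _get_memo is
# called here without create_if_notexists=True, so the intermediate levels
# must already exist (the assert fires otherwise).
#
#   memo_root.write_file([{"stage": "train"}, {"lr": 0.1}],
#                        metrics,
#                        abort_if_exists=False)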
def write(self, filepath):
    tb_io.write_jsonfile(self.d, filepath)
def create_experiment_folder(
        main_filepath,
        argname_lst,
        argval_lst_lst,
        output_folderpath_argname,
        all_experiments_folderpath,
        readme,
        experiment_name=None,
        # entry_folderpath=None,
        code_folderpath=None,
        # data_folderpath=None,
        capture_output=False,
        profile_run=False):

    assert tb_fs.folder_exists(all_experiments_folderpath)
    assert experiment_name is None or (not tb_fs.path_exists(
        tb_fs.join_paths([all_experiments_folderpath, experiment_name])))
    # assert folder_exists(project_folderpath) and file_exists(tb_fs.join_paths([
    #     project_folderpath, main_relfilepath]))

    # create the main folder where things for the experiment will be.
    if experiment_name is None:
        experiment_name = get_available_filename(all_experiments_folderpath,
                                                 "exp")
    experiment_folderpath = tb_fs.join_paths(
        [all_experiments_folderpath, experiment_name])
    tb_fs.create_folder(experiment_folderpath)

    # copy the code to the experiment folder.
    if code_folderpath is not None:
        code_foldername = tb_fs.path_last_element(code_folderpath)
        dst_code_fo = tb_fs.join_paths(
            [experiment_folderpath, code_foldername])
        tb_fs.copy_folder(code_folderpath,
                          dst_code_fo,
                          ignore_hidden_files=True,
                          ignore_hidden_folders=True,
                          ignore_file_exts=['.pyc'])
        # change main_filepath to use that new code.
        main_filepath = tb_fs.join_paths(
            [experiment_folderpath, main_filepath])

    # NOTE: no data copying for now because it often does not make much sense.
    data_folderpath = None

    ### TODO: remove later.
    # # copy the data to the experiment folder.
    # if data_folderpath is not None:
    #     data_foldername = path_last_element(data_folderpath)
    #     dst_data_fo = join_paths([experiment_folderpath, data_foldername])
    #     copy_folder(data_folderpath, dst_data_fo,
    #                 ignore_hidden_files=True, ignore_hidden_folders=True)

    # write the config for the experiment.
    tb_io.write_jsonfile(
        tb_ut.subset_dict_via_selection(locals(), [
            'main_filepath', 'argname_lst', 'argval_lst_lst',
            'output_folderpath_argname', 'all_experiments_folderpath',
            'readme', 'experiment_name', 'code_folderpath', 'data_folderpath',
            'capture_output', 'profile_run'
        ]), tb_fs.join_paths([experiment_folderpath, 'config.json']))

    # generate the executables for each configuration.
    argname_lst = list(argname_lst)
    argname_lst.append(output_folderpath_argname)
    for (i, vs) in enumerate(argval_lst_lst):
        cfg_folderpath = tb_fs.join_paths(
            [experiment_folderpath, "cfg%d" % i])
        tb_fs.create_folder(cfg_folderpath)

        # create the script
        argvalue_lst = list(vs)
        argvalue_lst.append(cfg_folderpath)
        call_args = tb_ut.subset_dict_via_selection(
            locals(), ['argname_lst', 'argvalue_lst', 'main_filepath'])
        call_args['script_filepath'] = tb_fs.join_paths(
            [cfg_folderpath, 'run.sh'])
        if capture_output:
            call_args['output_filepath'] = tb_fs.join_paths(
                [cfg_folderpath, 'output.txt'])
        if profile_run:
            call_args['profile_filepath'] = tb_fs.join_paths(
                [cfg_folderpath, 'profile.txt'])
        create_run_script(**call_args)

        # write a config file for each configuration
        tb_io.write_jsonfile(
            tb_ut.create_dict(argname_lst, argvalue_lst),
            tb_fs.join_paths([cfg_folderpath, 'config.json']))
    # create_runall_script(experiment_folderpath)
    create_runall_script_with_parallelization(experiment_folderpath)
    return experiment_folderpath
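# A hedged usage sketch (paths and argument values are hypothetical). A call
# like the one below would create experiments/<name>/cfg0, cfg1, ... with a
# run.sh and config.json in each, plus a top-level config.json and a
# parallelized run-all script at the experiment root:
#
#   create_experiment_folder(
#       main_filepath="main_train.py",
#       argname_lst=["step_size_start", "beam_size"],
#       argval_lst_lst=[[0.1, 1], [0.1, 4], [0.01, 4]],
#       output_folderpath_argname="out_folder",
#       all_experiments_folderpath="experiments",
#       readme="beam size sweep",
#       capture_output=True)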
def train_model_with_config():
    import research_toolbox.tb_logging as tb_lg

    if cfg["optimizer_type"] == "sgd":
        trainer = dy.SimpleSGDTrainer(m, cfg["step_size_start"])
    elif cfg["optimizer_type"] == "adam":
        trainer = dy.AdamTrainer(m, cfg["step_size_start"])
    elif cfg["optimizer_type"] == "sgd_mom":
        trainer = dy.MomentumSGDTrainer(m, cfg["step_size_start"])
    else:
        raise ValueError
    trainer.set_sparse_updates(0)

    # restarting from a checkpoint if it exists.
    # optimizer state is not kept.
    ckpt_filepath = cfg["out_folder"] + "/checkpoint.json"
    if tb_fs.file_exists(ckpt_filepath):
        log_d = tb_io.read_jsonfile(ckpt_filepath)
        current_epoch = len(log_d["dev_acc"])
        best_dev_acc = np.max(log_d["dev_acc"])
        m.populate(cfg["out_folder"] + '/model.ckpt')
    else:
        current_epoch = 0
        best_dev_acc = 0.0
        log_d = {
            'dev_acc': [],
            'avg_loss': [],
            'train_tks/sec': [],
            'eval_tks/sec': [],
            'secs_per_epoch': [],
            "lr": []
        }
        if cfg["debug"] or cfg["compute_train_acc"]:
            log_d["train_acc"] = []

    if cfg["loss_type"] == "log_neighbors":
        loss_fn = loss_log_neighbors
    elif cfg["loss_type"] == "log_beam":
        loss_fn = loss_log_beam
    elif cfg["loss_type"] == "cost_sensitive_margin_last":
        loss_fn = loss_cost_sensitive_margin_last
    elif cfg["loss_type"] == "margin_last":
        loss_fn = loss_margin_last
    elif cfg["loss_type"] == "perceptron_first":
        loss_fn = loss_perceptron_first
    elif cfg["loss_type"] == "perceptron_last":
        loss_fn = loss_perceptron_last
    elif cfg["loss_type"] == "upper_bound":
        loss_fn = loss_upper_bound
    else:
        raise ValueError

    cfg_accuracy = lambda data: beam_accuracy(data, cfg["beam_size"])
    cfg_train_graph = lambda e: train_beam_graph(e, cfg["beam_size"],
                                                 cfg["traj_type"], loss_fn)

    for epoch in range(current_epoch, cfg["num_epochs"]):
        if cfg["step_size_schedule_type"] == 'fixed':
            lr = cfg["step_size_start"]
        elif cfg["step_size_schedule_type"] == 'cosine':
            lr = cosine_get_lr(cfg["step_size_start"], cfg["step_size_end"],
                               cfg["num_epochs"], epoch)
        else:
            raise ValueError
        log_d['lr'].append(lr)
        trainer.learning_rate = lr

        acc_loss = 0.0
        random.shuffle(train_data)
        epoch_timer = tb_lg.TimeTracker()
        train_timer = tb_lg.TimeTracker()
        for i, e in enumerate(train_data):
            if i % cfg["print_every_num_examples"] == 0 and i > 0:
                print "Epoch %d - Example %d/%d" % (epoch, i, len(train_data))
            loss = cfg_train_graph(e)
            acc_loss += loss.value()
            loss.backward()
            trainer.update()
        log_d["avg_loss"].append(acc_loss / len(train_data))
        log_d["train_tks/sec"].append(num_train_tokens /
                                      train_timer.time_since_start())

        eval_timer = tb_lg.TimeTracker()
        # log_d['train_acc'].append(accuracy(train_data))
        log_d['dev_acc'].append(cfg_accuracy(dev_data))
        # log_d['test_acc'].append(accuracy(test_data))
        log_d['eval_tks/sec'].append((
            # len(train_data) +
            num_dev_tokens
            # + num_test_tokens
        ) / eval_timer.time_since_start())
        log_d["secs_per_epoch"].append(epoch_timer.time_since_start())
        if cfg["debug"] or cfg["compute_train_acc"]:
            train_acc = cfg_accuracy(train_data)
            print "train_acc: ", train_acc
            log_d["train_acc"].append(train_acc)
        pprint({k: vs[-1] for k, vs in log_d.iteritems()})

        if best_dev_acc < log_d["dev_acc"][-1]:
            best_dev_acc = log_d["dev_acc"][-1]
            m.save(cfg["out_folder"] + '/best_model.ckpt')
        tb_io.write_jsonfile(log_d, cfg["out_folder"] + "/checkpoint.json")
        m.save(cfg["out_folder"] + '/model.ckpt')

    results_filepath = cfg["out_folder"] + "/results.json"
    if not tb_fs.file_exists(results_filepath):
        m.populate(cfg["out_folder"] + '/best_model.ckpt')
        log_d['test_acc'] = cfg_accuracy(test_data)
        tb_io.write_jsonfile(log_d, cfg["out_folder"] + "/results.json")
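# `cosine_get_lr` is referenced by the 'cosine' schedule above but not
# defined in this snippet. A minimal sketch, assuming the standard cosine
# annealing schedule that decays from step_size_start to step_size_end over
# num_epochs (the actual definition may differ):
import math

def cosine_get_lr(start_lr, end_lr, num_epochs, epoch):
    # interpolate between start_lr and end_lr along half a cosine period.
    return end_lr + 0.5 * (start_lr - end_lr) * (
        1.0 + math.cos(math.pi * float(epoch) / num_epochs))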
# print k
# print v

########### INITIALIZATION ###########
idx = sys.argv.index('--config_filepath') + 1
cfg_filepath = sys.argv[idx]
cfg = tb_io.read_jsonfile_with_overlays(cfg_filepath)
pprint(cfg)
tb_fs.create_folder(cfg["out_folder"],
                    abort_if_exists=False,
                    create_parent_folders=True)
if '--train' in sys.argv:
    tb_io.write_jsonfile(cfg, cfg["out_folder"] + "/cfg.json")

# The data is loaded from this folder after it has been processed by
# main_preprocess.py.
if cfg["data_type"] == "supertagging":
    tags_key = "supertags"
    train_data = tb_io.read_jsonlogfile('data/supertagging/train.jsonl')
    dev_data = tb_io.read_jsonlogfile('data/supertagging/dev.jsonl')
    test_data = tb_io.read_jsonlogfile('data/supertagging/test.jsonl')
# elif cfg["data_type"] == "conll2000":
#     tags_key = "chunk_tags"
#     test_data = tb_io.read_jsonlogfile('data/conll2000/test.jsonl')
#     train_data = tb_io.read_jsonlogfile('data/conll2000/train.jsonl')
#     n = len(train_data)
#     num_dev = int(0.2 * n)
#     dev_data = train_data[:num_dev]
#     train_data = train_data[num_dev:]
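# `read_jsonfile_with_overlays` above expands the "_overlays_" key used by
# the config files generated below. The helper here is a hypothetical
# reimplementation for illustration only (the actual semantics live in
# research_toolbox.tb_io); it assumes overlays are loaded recursively and
# that keys in the current file win over keys inherited from overlays.
def _read_with_overlays_sketch(filepath):
    d = tb_io.read_jsonfile(filepath)
    out = {}
    for overlay_filepath in d.get("_overlays_", []):
        out.update(_read_with_overlays_sketch(overlay_filepath))
    out.update({k: v for k, v in d.items() if k != "_overlays_"})
    return out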
import research_toolbox.tb_io as tb_io
import research_toolbox.tb_filesystem as tb_fs
import research_toolbox.tb_utils as tb_ut

# %% table 1: trained without beam aware models. solved with vanilla beam
# search.
i = 1000
for m in ["vaswani", "lm"]:
    for z in ["supertagging"]:
        for x in [1]:
            for y in ["continue", "reset", "stop"]:
                tb_io.write_jsonfile(
                    {
                        "_overlays_": ["configs/cfgref.json"],
                        "model_type": m,
                        "data_type": z,
                        "beam_size": x,
                        "traj_type": y,
                        "out_folder": "out/cfg%d" % i
                    }, "configs/cfg%d.json" % i)
                i += 1

# %% table 2: trained with beam aware models and multiple data collection
# strategies. ran the same way.
i = 2000
for m in ["vaswani", "lm"]:
    for z in ["supertagging"]:
        for x in [1, 2, 4, 8]:
            for y in ["oracle", "continue", "reset", "reset_multiple",
                      "stop"]:
                tb_io.write_jsonfile(
                    {
                        "_overlays_": ["configs/cfgref.json"],
                        "model_type": m,
                        "data_type": z,
                        "beam_size": x,
                        "traj_type": y,
                        "out_folder": "out/cfg%d" % i
                    }, "configs/cfg%d.json" % i)
                i += 1
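# Each generated config is then consumed by the training script through the
# --config_filepath flag parsed in the initialization block above, e.g.
# (the entry-point script name here is hypothetical):
#
#   python main_train.py --config_filepath configs/cfg1000.json --train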