Example #1
    def _get_memo(self, config_lst, create_if_notexists=False):
        memo = self
        for cfg in config_lst:
            key = memo._key_from_config(cfg)
            if key not in memo.key_to_foldername:
                if create_if_notexists:
                    foldername = memo._get_unique_foldername()
                    value_folderpath = memo._get_folderpath(foldername)
                    cfg_filepath = memo._get_filepath("memo_config",
                                                      foldername, "json")
                    tb_fs.create_folder(value_folderpath)
                    tb_io.write_jsonfile(cfg, cfg_filepath)

                    next_memo = NestedMemoManager(value_folderpath)
                    memo.key_to_foldername[key] = foldername
                    memo.key_to_memo[key] = next_memo
                else:
                    return None
            else:
                if key not in memo.key_to_memo:
                    foldername = memo.key_to_foldername[key]
                    value_folderpath = memo._get_folderpath(foldername)
                    memo.key_to_memo[key] = NestedMemoManager(value_folderpath)
                next_memo = memo.key_to_memo[key]
            memo = next_memo
        return memo
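A minimal usage sketch of the traversal above; the root construction and the config dictionaries are illustrative assumptions, not part of the source:

# Hypothetical usage: each config in the list selects (or creates) one
# level of the on-disk memo tree.
root = NestedMemoManager('out/memo_root')
memo = root._get_memo([{"stage": "train"}, {"lr": 0.1}],
                      create_if_notexists=True)
# memo now manages the nested folder two levels down, with a memo_config
# JSON recorded for each newly created level (the exact layout comes from
# _get_filepath); with create_if_notexists=False, a missing level makes
# the call return None instead.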
Example #2
    def _get_memo(self, config_lst, create_if_notexists=False):
        memo = self
        for cfg in config_lst:
            key = memo._key_from_config(cfg)
            if key not in memo.key_to_foldername:
                if create_if_notexists:
                    # get new unique name
                    foldername = memo._get_unique_foldername()
                    cfg_filepath, memo_folderpath = memo._get_memo_paths(
                        foldername)
                    tb_fs.create_folder(memo_folderpath)
                    tb_io.write_jsonfile(cfg, cfg_filepath)

                    next_memo = SimplifiedNestedMemoManager(memo_folderpath)
                    memo.key_to_foldername[key] = foldername
                    memo.key_to_memo[key] = next_memo
                else:
                    return None
            else:
                if key not in memo.key_to_memo:
                    # use existing name
                    foldername = memo.key_to_foldername[key]
                    memo_folderpath = memo._get_memo_paths(foldername)[1]
                    memo.key_to_memo[key] = SimplifiedNestedMemoManager(
                        memo_folderpath)
                next_memo = memo.key_to_memo[key]
            memo = next_memo
        return memo
Example #3
    def save(self, name, x):
        cfg = self.name_to_cfg[name]
        out = cfg['save_fn'](x)
        filepath = self._get_filepath(name, cfg['use_json'])
        if cfg['use_json']:
            tb_io.write_jsonfile(out, filepath)
        else:
            tb_io.write_picklefile(out, filepath)
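For reference, the method above assumes name_to_cfg entries that pair a serialization function with a format flag; a hedged sketch of what one entry might look like (the key and values below are guesses, not from the source):

# Hypothetical name_to_cfg entry backing the save method above.
name_to_cfg = {
    "history": {
        "save_fn": lambda x: list(x),  # made-up converter to a JSON-friendly form
        "use_json": True,  # True -> tb_io.write_jsonfile, False -> tb_io.write_picklefile
    }
}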
Example #4
    def write_file(self, config, value, abort_if_exists=True):
        key = self._key_from_config(config)
        assert not abort_if_exists or key not in self.key_to_filename

        # if it exists, get it from the dictionary; otherwise create and
        # register a new unique filename.
        if key in self.key_to_filename:
            filename = self.key_to_filename[key]
        else:
            filename = self._get_unique_filename()
            self.key_to_filename[key] = filename
        cfg_filepath, value_filepath = self._get_file_paths(filename)
        tb_io.write_jsonfile(config, cfg_filepath)
        tb_io.write_picklefile(value, value_filepath)
Example #5
    def write(self, config, value, abort_if_exists=True):
        key = self._key_from_config(config)
        assert not abort_if_exists or key not in self.key_to_filename

        # if it exists, get it from the dictionary.
        if key in self.key_to_filename:
            filename = self.key_to_filename[key]
        else:
            filename = self._get_unique_filename()

        config_filepath = self._get_filepath('config', filename, 'json')
        tb_io.write_jsonfile(config, config_filepath)
        value_filepath = self._get_filepath('value', filename, 'pkl')
        tb_io.write_picklefile(value, value_filepath)
        self.key_to_filename[key] = filename
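A minimal usage sketch for the method above; the manager class name and its construction are assumptions for illustration:

# Hypothetical usage: the class name and root folder are made up.
memo = FileMemoManager('out/memo')
memo.write({"model": "lm", "lr": 0.1}, value={"dev_acc": 0.91})
# The call leaves a config .json / value .pkl pair on disk under a unique
# filename, and key_to_filename maps the hashed config to that filename;
# a second write with the same config and abort_if_exists=False would
# overwrite the same pair of files.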
Example #6
    def write_file(self, config_lst, value, abort_if_exists=True):
        assert len(config_lst) > 0
        memo = self._get_memo(config_lst[:-1])
        assert memo is not None
        cfg = config_lst[-1]
        key = memo._key_from_config(cfg)
        assert not abort_if_exists or key not in memo.key_to_filename

        # if it exists, get it from the dictionary.
        if key in memo.key_to_filename:
            filename = memo.key_to_filename[key]
        else:
            filename = memo._get_unique_filename()

        config_filepath = memo._get_filepath('file_config', filename, 'json')
        tb_io.write_jsonfile(cfg, config_filepath)
        value_filepath = memo._get_filepath('file_value', filename, 'pkl')
        tb_io.write_picklefile(value, value_filepath)
        memo.key_to_filename[key] = filename
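In this nested variant, every config in config_lst except the last addresses one level of the memo tree, and the last one keys the file itself. A hedged call sketch, assuming manager is a NestedMemoManager-style object exposing this method:

# Hypothetical call: {"stage": "train"} selects an existing sub-memo
# (_get_memo is called without create_if_notexists, so intermediate levels
# must already exist or the assert fires), and {"epoch": 3} keys the
# pickled value inside it.
manager.write_file([{"stage": "train"}, {"epoch": 3}], value={"loss": 0.5})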
Example #7
    def write(self, filepath):
        tb_io.write_jsonfile(self.d, filepath)
Example #8
def create_experiment_folder(
        main_filepath,
        argname_lst,
        argval_lst_lst,
        output_folderpath_argname,
        all_experiments_folderpath,
        readme,
        experiment_name=None,
        # entry_folderpath=None,
        code_folderpath=None,
        # data_folderpath=None,
        capture_output=False,
        profile_run=False):

    assert tb_fs.folder_exists(all_experiments_folderpath)
    assert experiment_name is None or (not tb_fs.path_exists(
        tb_fs.join_paths([all_experiments_folderpath, experiment_name])))
    # assert folder_exists(project_folderpath) and file_exists(tb_fs.join_paths([
    #     project_folderpath, main_relfilepath]))

    # create the main folder where the experiment's artifacts will live.
    if experiment_name is None:
        experiment_name = get_available_filename(all_experiments_folderpath,
                                                 "exp")
    experiment_folderpath = tb_fs.join_paths(
        [all_experiments_folderpath, experiment_name])
    tb_fs.create_folder(experiment_folderpath)

    # copy the code to the experiment folder.
    if code_folderpath is not None:
        code_foldername = tb_fs.path_last_element(code_folderpath)
        dst_code_fo = tb_fs.join_paths(
            [experiment_folderpath, code_foldername])

        tb_fs.copy_folder(code_folderpath,
                          dst_code_fo,
                          ignore_hidden_files=True,
                          ignore_hidden_folders=True,
                          ignore_file_exts=['.pyc'])

        # change main_filepath to use that new code.
        main_filepath = tb_fs.join_paths(
            [experiment_folderpath, main_filepath])

    # NOTE: no data copying for now because it often does not make much sense.
    data_folderpath = None  ### TODO: remove later.
    # # copy the data to the experiment folder.
    # if data_folderpath is not None:
    #     data_foldername = path_last_element(data_folderpath)
    #     dst_data_fo = join_paths([experiment_folderpath, data_foldername])

    #     copy_folder(data_folderpath, dst_data_fo,
    #         ignore_hidden_files=True, ignore_hidden_folders=True)

    # write the config for the experiment.
    tb_io.write_jsonfile(
        tb_ut.subset_dict_via_selection(locals(), [
            'main_filepath', 'argname_lst', 'argval_lst_lst',
            'output_folderpath_argname', 'all_experiments_folderpath',
            'readme', 'experiment_name', 'code_folderpath', 'data_folderpath',
            'capture_output', 'profile_run'
        ]), tb_fs.join_paths([experiment_folderpath, 'config.json']))

    # generate the executables for each configuration.
    argname_lst = list(argname_lst)
    argname_lst.append(output_folderpath_argname)
    for (i, vs) in enumerate(argval_lst_lst):
        cfg_folderpath = tb_fs.join_paths([experiment_folderpath, "cfg%d" % i])
        tb_fs.create_folder(cfg_folderpath)

        # create the script
        argvalue_lst = list(vs)
        argvalue_lst.append(cfg_folderpath)
        call_args = tb_ut.subset_dict_via_selection(
            locals(), ['argname_lst', 'argvalue_lst', 'main_filepath'])

        call_args['script_filepath'] = tb_fs.join_paths(
            [cfg_folderpath, 'run.sh'])
        if capture_output:
            call_args['output_filepath'] = tb_fs.join_paths(
                [cfg_folderpath, 'output.txt'])
        if profile_run:
            call_args['profile_filepath'] = tb_fs.join_paths(
                [cfg_folderpath, 'profile.txt'])
        create_run_script(**call_args)

        # write a config file for each configuration
        tb_io.write_jsonfile(tb_ut.create_dict(argname_lst, argvalue_lst),
                             tb_fs.join_paths([cfg_folderpath, 'config.json']))
    # create_runall_script(experiment_folderpath)
    create_runall_script_with_parallelization(experiment_folderpath)

    return experiment_folderpath
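A hedged call sketch for the function above; every path and argument value below is a placeholder:

# Hypothetical invocation: two configurations of main.py. Each gets its
# own cfg<i>/ folder holding run.sh, config.json, and (with
# capture_output=True) an output.txt for the captured output.
exp_folderpath = create_experiment_folder(
    main_filepath='main.py',
    argname_lst=['lr', 'seed'],
    argval_lst_lst=[[0.1, 0], [0.01, 1]],
    output_folderpath_argname='out_folder',
    all_experiments_folderpath='experiments',
    readme='learning rate sweep',
    capture_output=True)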
Example #9
def train_model_with_config():
    import research_toolbox.tb_logging as tb_lg

    if cfg["optimizer_type"] == "sgd":
        trainer = dy.SimpleSGDTrainer(m, cfg["step_size_start"])
    elif cfg["optimizer_type"] == "adam":
        trainer = dy.AdamTrainer(m, cfg["step_size_start"])
    elif cfg["optimizer_type"] == "sgd_mom":
        trainer = dy.MomentumSGDTrainer(m, cfg["step_size_start"])
    else:
        raise ValueError
    trainer.set_sparse_updates(0)

    # restarting from a checkpoint if it exists.
    # optimizer state is not kept.
    ckpt_filepath = cfg["out_folder"] + "/checkpoint.json"
    if tb_fs.file_exists(ckpt_filepath):
        log_d = tb_io.read_jsonfile(ckpt_filepath)
        current_epoch = len(log_d["dev_acc"])
        best_dev_acc = np.max(log_d["dev_acc"])
        m.populate(cfg["out_folder"] + '/model.ckpt')
    else:
        current_epoch = 0
        best_dev_acc = 0.0

        log_d = {
            'dev_acc': [],
            'avg_loss': [],
            'train_tks/sec': [],
            'eval_tks/sec': [],
            'secs_per_epoch': [],
            "lr": []
        }
        if cfg["debug"] or cfg["compute_train_acc"]:
            log_d["train_acc"] = []

    if cfg["loss_type"] == "log_neighbors":
        loss_fn = loss_log_neighbors
    elif cfg["loss_type"] == "log_beam":
        loss_fn = loss_log_beam
    elif cfg["loss_type"] == "cost_sensitive_margin_last":
        loss_fn = loss_cost_sensitive_margin_last
    elif cfg["loss_type"] == "margin_last":
        loss_fn = loss_margin_last
    elif cfg["loss_type"] == "perceptron_first":
        loss_fn = loss_perceptron_first
    elif cfg["loss_type"] == "perceptron_last":
        loss_fn = loss_perceptron_last
    elif cfg["loss_type"] == "upper_bound":
        loss_fn = loss_upper_bound
    else:
        raise ValueError

    cfg_accuracy = lambda data: beam_accuracy(data, cfg["beam_size"])
    cfg_train_graph = lambda e: train_beam_graph(e, cfg["beam_size"],
                                                 cfg["traj_type"], loss_fn)

    for epoch in range(current_epoch, cfg["num_epochs"]):
        if cfg["step_size_schedule_type"] == 'fixed':
            lr = cfg["step_size_start"]
        elif cfg["step_size_schedule_type"] == 'cosine':
            lr = cosine_get_lr(cfg["step_size_start"], cfg["step_size_end"],
                               cfg["num_epochs"], epoch)
        else:
            raise ValueError
        log_d['lr'].append(lr)

        trainer.learning_rate = lr

        acc_loss = 0.0
        random.shuffle(train_data)
        epoch_timer = tb_lg.TimeTracker()
        train_timer = tb_lg.TimeTracker()
        for i, e in enumerate(train_data):
            if i % cfg["print_every_num_examples"] == 0 and i > 0:
                print "Epoch %d - Example %d/%d" % (epoch, i, len(train_data))
            loss = cfg_train_graph(e)
            acc_loss += loss.value()
            loss.backward()
            trainer.update()

        log_d["avg_loss"].append(acc_loss / len(train_data))
        log_d["train_tks/sec"].append(num_train_tokens /
                                      train_timer.time_since_start())
        eval_timer = tb_lg.TimeTracker()
        # log_d['train_acc'].append(accuracy(train_data))
        log_d['dev_acc'].append(cfg_accuracy(dev_data))
        # log_d['test_acc'].append(accuracy(test_data))
        log_d['eval_tks/sec'].append((  #len(train_data) +
            num_dev_tokens
            # + num_test_tokens
        ) / eval_timer.time_since_start())
        log_d["secs_per_epoch"].append(epoch_timer.time_since_start())
        if cfg["debug"] or cfg["compute_train_acc"]:
            train_acc = cfg_accuracy(train_data)
            print "train_acc: ", train_acc
            log_d["train_acc"].append(train_acc)
        pprint({k: vs[-1] for k, vs in log_d.items()})

        if best_dev_acc < log_d["dev_acc"][-1]:
            best_dev_acc = log_d["dev_acc"][-1]
            m.save(cfg["out_folder"] + '/best_model.ckpt')
        tb_io.write_jsonfile(log_d, cfg["out_folder"] + "/checkpoint.json")
        m.save(cfg["out_folder"] + '/model.ckpt')

    results_filepath = cfg["out_folder"] + "/results.json"
    if not tb_fs.file_exists(results_filepath):
        m.populate(cfg["out_folder"] + '/best_model.ckpt')
        log_d['test_acc'] = cfg_accuracy(test_data)
        tb_io.write_jsonfile(log_d, cfg["out_folder"] + "/results.json")
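cosine_get_lr is not shown in this excerpt; a standard cosine annealing schedule matching its call signature would look like the following (an assumption, not the source's definition):

import math

def cosine_get_lr(lr_start, lr_end, num_epochs, epoch):
    # Anneal from lr_start at epoch 0 down to lr_end at the last epoch.
    # This is a sketch of the helper used above, not the original code.
    t = float(epoch) / max(1, num_epochs - 1)
    return lr_end + 0.5 * (lr_start - lr_end) * (1.0 + math.cos(math.pi * t))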
Example #10

########### INITIALIZATION ###########

idx = sys.argv.index('--config_filepath') + 1
cfg_filepath = sys.argv[idx]
cfg = tb_io.read_jsonfile_with_overlays(cfg_filepath)
pprint(cfg)

tb_fs.create_folder(cfg["out_folder"],
                    abort_if_exists=False,
                    create_parent_folders=True)

if '--train' in sys.argv:
    tb_io.write_jsonfile(cfg, cfg["out_folder"] + "/cfg.json")

# The data is loaded from this folder after it has been processed by main_preprocess.py.
if cfg["data_type"] == "supertagging":
    tags_key = "supertags"
    train_data = tb_io.read_jsonlogfile('data/supertagging/train.jsonl')
    dev_data = tb_io.read_jsonlogfile('data/supertagging/dev.jsonl')
    test_data = tb_io.read_jsonlogfile('data/supertagging/test.jsonl')
# elif cfg["data_type"] == "conll2000":
#     tags_key = "chunk_tags"
#     test_data = tb_io.read_jsonlogfile('data/conll2000/test.jsonl')
#     train_data = tb_io.read_jsonlogfile('data/conll2000/train.jsonl')
#     n = len(train_data)
#     num_dev = int(0.2 * n)
#     dev_data = train_data[:num_dev]
#     train_data = train_data[num_dev:]
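Since the script pulls --config_filepath out of sys.argv, a typical invocation would look something like this (the script name is a placeholder):

python main_train.py --config_filepath configs/cfg1000.json --train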
Example #11
import research_toolbox.tb_io as tb_io
import research_toolbox.tb_filesystem as tb_fs
import research_toolbox.tb_utils as tb_ut

# %% table 1: models trained without beam awareness, decoded with vanilla beam search.
i = 1000
for m in ["vaswani", "lm"]:
    for z in ["supertagging"]:
        for x in [1]:
            for y in ["continue", "reset", "stop"]:
                tb_io.write_jsonfile(
                    {
                        "_overlays_": ["configs/cfgref.json"],
                        "model_type": m,
                        "data_type": z,
                        "beam_size": x,
                        "traj_type": y,
                        "out_folder": "out/cfg%d" % i
                    }, "configs/cfg%d.json" % i)
                i += 1

# %% table 2: models trained with beam awareness and multiple data-collection strategies; decoded the same way.

i = 2000
for m in ["vaswani", "lm"]:
    for z in ["supertagging"]:
        for x in [1, 2, 4, 8]:
            for y in ["oracle", "continue", "reset", "reset_multiple", "stop"]:
                tb_io.write_jsonfile(
                    {
                        "_overlays_": ["configs/cfgref.json"],
                        "model_type": m,
                        "data_type": z,
                        "beam_size": x,
                        "traj_type": y,
                        "out_folder": "out/cfg%d" % i
                    }, "configs/cfg%d.json" % i)
                i += 1
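Each iteration writes one small overlay config; for instance, the first file produced by the table-1 loop, configs/cfg1000.json, holds exactly the dictionary passed above (read_jsonfile_with_overlays in Example #10 presumably merges it over the base configs/cfgref.json listed under "_overlays_"):

{
    "_overlays_": ["configs/cfgref.json"],
    "model_type": "vaswani",
    "data_type": "supertagging",
    "beam_size": 1,
    "traj_type": "continue",
    "out_folder": "out/cfg1000"
}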