示例#1
0
    def test_get_plot(self):
        # save a score_list
        savedir_base = '.tmp'
        exp_dict = {'model':{'name':'mlp', 'n_layers':30}, 
                    'dataset':'mnist', 'batch_size':1}
        score_list = [{'epoch': 0, 'acc':0.5}, {'epoch': 1, 'acc':0.9}]

        hu.save_pkl(os.path.join(savedir_base, hu.hash_dict(exp_dict),
                     'score_list.pkl'), score_list)
        hu.save_json(os.path.join(savedir_base, hu.hash_dict(exp_dict),
                     'exp_dict.json'), exp_dict)
        # check if score_list can be loaded and viewed in pandas
        exp_list = hu.get_exp_list(savedir_base=savedir_base)
        
        fig, axis = hr.get_plot(exp_list,
             savedir_base=savedir_base,
             filterby_list=[({'model':{'name':'mlp'}},
                             {'style':{'color':'red'}})],
             x_metric='epoch',
             y_metric='acc')
        # fig, axis = hr.get_plot(exp_list,
        #      savedir_base=savedir_base,
        #      x_metric='epoch',
        #      y_metric='acc',
        #      mode='pretty_plot')
        fig, axis = hr.get_plot(exp_list,
             savedir_base=savedir_base,
             x_metric='epoch',
             y_metric='acc',
             mode='bar')
        fig.savefig(os.path.join('.tmp', 
                        'test.png'))

        shutil.rmtree('.tmp')
示例#2
0
    def launch_exp_dict(self, exp_dict, savedir, command, job=None):
        """Submit a job job and save job dict and exp_dict."""
        # Check for duplicates
        # if job is not None:
        # assert self._assert_no_duplicates(job)

        fname_exp_dict = os.path.join(savedir, "exp_dict.json")
        hu.save_json(fname_exp_dict, exp_dict)
        assert (hu.hash_dict(
            hu.load_json(fname_exp_dict)) == hu.hash_dict(exp_dict))

        # Define paths
        workdir_job = os.path.join(savedir, "code")

        # Copy the experiment code into the experiment folder
        hu.copy_code(self.workdir + "/", workdir_job, verbose=0)

        # Run  command
        job_id = self.submit_job(command, workdir_job, savedir_logs=savedir)

        # Verbose
        if self.verbose:
            print("Job_id: %s command: %s" % (job_id, command))

        job_dict = {"job_id": job_id, "command": command}

        hu.save_json(get_job_fname(savedir), job_dict)

        return job_dict
示例#3
0
文件: tests.py 项目: alzayats/haven
    def test_get_score_lists(self):
        # save a score_list
        savedir_base = '.tmp'
        exp_dict = {
            'model': {
                'name': 'mlp',
                'n_layers': 30
            },
            'dataset': 'mnist',
            'batch_size': 1
        }
        score_list = [{'epoch': 0, 'acc': 0.5}, {'epoch': 0, 'acc': 0.9}]

        hu.save_pkl(
            os.path.join(savedir_base, hu.hash_dict(exp_dict),
                         'score_list.pkl'), score_list)
        hu.save_json(
            os.path.join(savedir_base, hu.hash_dict(exp_dict),
                         'exp_dict.json'), exp_dict)
        # check if score_list can be loaded and viewed in pandas
        exp_list = hr.get_exp_list(savedir_base=savedir_base)

        score_lists = hr.get_score_lists(exp_list, savedir_base=savedir_base)
        assert (score_lists[0][0]['acc'] == 0.5)
        assert (score_lists[0][1]['acc'] == 0.9)

        shutil.rmtree(savedir_base)
示例#4
0
    def test_get_score_df(self):
        # save a score_list
        savedir_base = '.tmp'
        exp_dict = {'model':{'name':'mlp', 'n_layers':30}, 
                    'dataset':'mnist', 'batch_size':1}
        exp_dict2 = {'model':{'name':'mlp2', 'n_layers':30}, 
                    'dataset':'mnist', 'batch_size':1}

        score_list = [{'epoch': 0, 'acc':0.5}, {'epoch': 0, 'acc':0.9}]

        hu.save_pkl(os.path.join(savedir_base, hu.hash_dict(exp_dict),
                     'score_list.pkl'), score_list)
                     
        hu.save_json(os.path.join(savedir_base, hu.hash_dict(exp_dict),
                     'exp_dict.json'), exp_dict)

        hu.save_json(os.path.join(savedir_base, hu.hash_dict(exp_dict2),
                     'exp_dict.json'), exp_dict)
        # check if score_list can be loaded and viewed in pandas
        exp_list = hu.get_exp_list(savedir_base=savedir_base)
        score_df = hr.get_score_df(exp_list, savedir_base=savedir_base)
        
        assert(np.array(score_df['dataset'])[0].strip("'") == 'mnist')

        shutil.rmtree('.tmp')
示例#5
0
    def test_zipdir(self):
        # save a score_list
        savedir_base = ".tmp"
        exp_dict = {
            "model": {
                "name": "mlp",
                "n_layers": 30
            },
            "dataset": "mnist",
            "batch_size": 1
        }
        score_list = [{"epoch": 0, "acc": 0.5}, {"epoch": 0, "acc": 0.9}]

        hu.save_pkl(
            os.path.join(savedir_base, hu.hash_dict(exp_dict),
                         "score_list.pkl"), score_list)
        hu.save_json(
            os.path.join(savedir_base, hu.hash_dict(exp_dict),
                         "exp_dict.json"), exp_dict)
        # check if score_list can be loaded and viewed in pandas
        exp_list = hr.get_exp_list(savedir_base=savedir_base)

        score_lists = hr.get_score_lists(exp_list, savedir_base=savedir_base)
        assert score_lists[0][0]["acc"] == 0.5
        assert score_lists[0][1]["acc"] == 0.9
        from haven import haven_dropbox as hd

        hd.zipdir([hu.hash_dict(exp_dict) for exp_dict in exp_list],
                  savedir_base,
                  src_fname=".tmp/results.zip")
        shutil.rmtree(savedir_base)
示例#6
0
def test_get_score_lists():
    # save a score_list
    savedir_base = ".tmp"
    exp_dict = {
        "model": {
            "name": "mlp",
            "n_layers": 30
        },
        "dataset": "mnist",
        "batch_size": 1
    }
    score_list = [{"epoch": 0, "acc": 0.5}, {"epoch": 0, "acc": 0.9}]

    hu.save_pkl(
        os.path.join(savedir_base, hu.hash_dict(exp_dict), "score_list.pkl"),
        score_list)
    hu.save_json(
        os.path.join(savedir_base, hu.hash_dict(exp_dict), "exp_dict.json"),
        exp_dict)
    # check if score_list can be loaded and viewed in pandas
    exp_list = hu.get_exp_list(savedir_base=savedir_base)

    score_lists = hr.get_score_lists(exp_list, savedir_base=savedir_base)
    assert score_lists[0][0]["acc"] == 0.5
    assert score_lists[0][1]["acc"] == 0.9

    shutil.rmtree(savedir_base)
示例#7
0
    def launch_job(self, exp_dict, savedir, command, job=None):
        """Submit a job job and save job dict and exp_dict."""
        add_job_utils()
        import haven_jobs_utils as hju

        # Check for duplicates
        if job is not None:
            assert self._assert_no_duplicates(job)

        hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)

        # Define paths
        workdir_job = os.path.join(savedir, "code")

        # Copy the experiment code into the experiment folder
        hu.copy_code(self.workdir + "/", workdir_job)

        # Run  command
        job_command = hju.get_job_command(self.job_config, command, savedir, workdir=workdir_job)
        job_id = hu.subprocess_call(job_command).replace("\n", "")

        # Verbose
        if self.verbose:
            print("Job_id: %s command: %s" % (job_id, command))

        job_dict = {"job_id": job_id, 
                      "started at (Montreal)":hu.time_to_montreal(),
                      "command":command}

        hu.save_json(hju.get_job_fname(savedir), job_dict)

        return job_dict
示例#8
0
def test_get_plot():
    # save a score_list
    savedir_base = ".tmp"
    exp_dict = {
        "model": {
            "name": "mlp",
            "n_layers": 30
        },
        "dataset": "mnist",
        "batch_size": 1
    }
    score_list = [{"epoch": 0, "acc": 0.5}, {"epoch": 1, "acc": 0.9}]

    hu.save_pkl(
        os.path.join(savedir_base, hu.hash_dict(exp_dict), "score_list.pkl"),
        score_list)
    hu.save_json(
        os.path.join(savedir_base, hu.hash_dict(exp_dict), "exp_dict.json"),
        exp_dict)
    # check if score_list can be loaded and viewed in pandas
    exp_list = hu.get_exp_list(savedir_base=savedir_base)

    fig, axis = hr.get_plot(
        exp_list,
        savedir_base=savedir_base,
        filterby_list=[({
            "model": {
                "name": "mlp"
            }
        }, {
            "style": {
                "color": "red"
            }
        })],
        x_metric="epoch",
        y_metric="acc",
    )
    # fig, axis = hr.get_plot(exp_list,
    #      savedir_base=savedir_base,
    #      x_metric='epoch',
    #      y_metric='acc',
    #      mode='pretty_plot')
    fig, axis = hr.get_plot(exp_list,
                            savedir_base=savedir_base,
                            x_metric="epoch",
                            y_metric="acc",
                            mode="bar")
    fig.savefig(os.path.join(".tmp", "test.png"))

    shutil.rmtree(".tmp")
示例#9
0
    def test_checkpoint(self):
        savedir_base = '.results'
        # create exp folder
        exp_dict = {'model':{'name':'mlp', 'n_layers':30}, 'dataset':'mnist', 'batch_size':1}
        savedir = os.path.join(savedir_base, hu.hash_dict(exp_dict))
        hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
        hu.torch_save(os.path.join(savedir, "model.pth"), torch.zeros(10))
        hu.torch_load(os.path.join(savedir, "model.pth"))
        assert(os.path.exists(savedir))

        # delete exp folder
        hc.delete_experiment(savedir)
        assert(not os.path.exists(savedir))

        # check backup folder
        os.rmdir(savedir_base)
示例#10
0
def save_example_results(savedir_base="results"):
    import os
    import pandas
    import requests
    import io
    import matplotlib.pyplot as plt

    from .. import haven_results as hr
    from .. import haven_utils as hu
    from PIL import Image

    # create hyperparameters
    exp_list = [{
        "dataset": "mnist",
        "model": "mlp",
        "lr": lr
    } for lr in [1e-1, 1e-2, 1e-3]]

    for i, exp_dict in enumerate(exp_list):
        # get hash for experiment
        exp_id = hu.hash_dict(exp_dict)

        # add scores for loss, and accuracy
        score_list = []
        for e in range(1, 10):
            score_list += [{
                "epoch": e,
                "loss": 1 - e * exp_dict["lr"] * 0.9,
                "acc": e * exp_dict["lr"] * 0.1
            }]
        # save scores and images
        hu.save_json(os.path.join(savedir_base, exp_id, "exp_dict.json"),
                     exp_dict)
        hu.save_pkl(os.path.join(savedir_base, exp_id, "score_list.pkl"),
                    score_list)

        url = "https://raw.githubusercontent.com/haven-ai/haven-ai/master/haven/haven_examples/data/%d.png" % (
            i + 1)
        response = requests.get(url).content
        img = plt.imread(io.BytesIO(response), format="JPG")
        hu.save_image(os.path.join(savedir_base, exp_id, "images/1.png"),
                      img[:, :, :3])
示例#11
0
def save_example_results(savedir_base='results'):
    import os, pandas
    import requests, io
    import matplotlib.pyplot as plt

    from .. import haven_results as hr
    from .. import haven_utils as hu
    from PIL import Image

    # create hyperparameters
    exp_list = [{
        'dataset': 'mnist',
        'model': 'mlp',
        'lr': lr
    } for lr in [1e-1, 1e-2, 1e-3]]

    for i, exp_dict in enumerate(exp_list):
        # get hash for experiment
        exp_id = hu.hash_dict(exp_dict)

        # add scores for loss, and accuracy
        score_list = []
        for e in range(1, 10):
            score_list += [{
                'epoch': e,
                'loss': 1 - e * exp_dict['lr'] * 0.9,
                'acc': e * exp_dict['lr'] * 0.1
            }]
        # save scores and images
        hu.save_json(os.path.join(savedir_base, exp_id, 'exp_dict.json'),
                     exp_dict)
        hu.save_pkl(os.path.join(savedir_base, exp_id, 'score_list.pkl'),
                    score_list)

        url = 'https://raw.githubusercontent.com/haven-ai/haven-ai/master/haven/haven_examples/data/%d.png' % (
            i + 1)
        response = requests.get(url).content
        img = plt.imread(io.BytesIO(response), format='JPG')
        hu.save_image(os.path.join(savedir_base, exp_id, 'images/1.png'),
                      img[:, :, :3])
示例#12
0
def test_get_score_df():
    # save a score_list
    savedir_base = ".tmp"
    exp_dict = {
        "model": {
            "name": "mlp",
            "n_layers": 30
        },
        "dataset": "mnist",
        "batch_size": 1
    }
    exp_dict2 = {
        "model": {
            "name": "mlp2",
            "n_layers": 30
        },
        "dataset": "mnist",
        "batch_size": 1
    }

    score_list = [{"epoch": 0, "acc": 0.5}, {"epoch": 0, "acc": 0.9}]

    hu.save_pkl(
        os.path.join(savedir_base, hu.hash_dict(exp_dict), "score_list.pkl"),
        score_list)

    hu.save_json(
        os.path.join(savedir_base, hu.hash_dict(exp_dict), "exp_dict.json"),
        exp_dict)

    hu.save_json(
        os.path.join(savedir_base, hu.hash_dict(exp_dict2), "exp_dict.json"),
        exp_dict)
    # check if score_list can be loaded and viewed in pandas
    exp_list = hu.get_exp_list(savedir_base=savedir_base)
    score_df = hr.get_score_df(exp_list, savedir_base=savedir_base)

    assert np.array(score_df["dataset"])[0].strip("'") == "mnist"

    shutil.rmtree(".tmp")
示例#13
0
def trainval(exp_dict, savedir_base, datadir, reset=False, num_workers=0):
    # bookkeepting stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)
    if not os.path.join(savedir, "exp_dict.json"):
        hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
        print("Experiment saved in %s" % savedir)

    # BCD train
    # ==================
    # Ignore the following combinations
    if not ut.is_valid_exp(exp_dict):
        return

    score_list_fname = os.path.join(savedir, 'score_list.pkl')
    if os.path.exists(score_list_fname):
        score_list = hu.load_pkl(score_list_fname)

    else:
        score_list = train(dataset_name=exp_dict['dataset']['name'],
                           loss_name=exp_dict['dataset']['loss'],
                           block_size=exp_dict['block_size'],
                           partition_rule=exp_dict['partition'],
                           selection_rule=exp_dict['selection'],
                           update_rule=exp_dict['update'],
                           n_iters=exp_dict['max_iters'],
                           L1=exp_dict.get('l1', 0),
                           L2=0,
                           datasets_path=datadir)

        hu.save_pkl(score_list_fname, score_list)

    print('Experiment completed.')
    return score_list
示例#14
0
def submit_job(command, savedir):
    # read slurm setting
    lines = "#! /bin/bash \n"
    # if job_config is not None:
    #     lines += "#SBATCH --account=%s \n" % job_configs.ACCOUNT_ID
    #     for key in list(job_config.keys()):
    #         lines += "#SBATCH --%s=%s \n" % (key, job_config[key])
    lines += "#SBATCH --account=%s \n" % job_configs.ACCOUNT_ID
    for key in list(job_configs.JOB_CONFIG.keys()):
        lines += "#SBATCH --%s=%s \n" % (key, job_configs.JOB_CONFIG[key])
    path_log = os.path.join(savedir, "logs.txt")
    path_err = os.path.join(savedir, "err.txt")
    lines += "#SBATCH --output=%s \n" % path_log
    lines += "#SBATCH --error=%s \n" % path_err

    lines += command

    file_name = os.path.join(savedir, "bash.sh")
    hu.save_txt(file_name, lines)
    # launch the exp
    submit_command = "sbatch %s" % file_name
    while True:
        try:
            job_id = hu.subprocess_call(submit_command).split()[-1]
        except Exception:
            print("slurm time out and retry now")
            time.sleep(1)
            continue
        break

    # save the command and job id in job_dict.json
    job_dict = {"command": command, "job_id": job_id}
    hu.save_json(os.path.join(savedir, "job_dict.json"), job_dict)

    # delete the bash.sh
    os.remove(file_name)

    return job_id
示例#15
0
def test_checkpoint():
    savedir_base = ".results"
    # create exp folder
    exp_dict = {
        "model": {
            "name": "mlp",
            "n_layers": 30
        },
        "dataset": "mnist",
        "batch_size": 1
    }
    savedir = os.path.join(savedir_base, hu.hash_dict(exp_dict))
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    hu.torch_save(os.path.join(savedir, "model.pth"), torch.zeros(10))
    hu.torch_load(os.path.join(savedir, "model.pth"))
    hc.load_checkpoint(exp_dict, savedir_base, fname="model.pth")
    assert os.path.exists(savedir)

    # delete exp folder
    hc.delete_experiment(savedir)
    assert not os.path.exists(savedir)

    # check backup folder
    os.rmdir(savedir_base)
示例#16
0
def trainval(exp_dict,
             savedir_base,
             reset=False,
             num_workers=0,
             run_ssl=False):
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # load datasets
    # ==========================
    train_set = datasets.get_dataset(
        dataset_name=exp_dict["dataset_train"],
        data_root=exp_dict["dataset_train_root"],
        split="train",
        transform=exp_dict["transform_train"],
        classes=exp_dict["classes_train"],
        support_size=exp_dict["support_size_train"],
        query_size=exp_dict["query_size_train"],
        n_iters=exp_dict["train_iters"],
        unlabeled_size=exp_dict["unlabeled_size_train"])

    val_set = datasets.get_dataset(
        dataset_name=exp_dict["dataset_val"],
        data_root=exp_dict["dataset_val_root"],
        split="val",
        transform=exp_dict["transform_val"],
        classes=exp_dict["classes_val"],
        support_size=exp_dict["support_size_val"],
        query_size=exp_dict["query_size_val"],
        n_iters=exp_dict["val_iters"],
        unlabeled_size=exp_dict["unlabeled_size_val"])

    test_set = datasets.get_dataset(
        dataset_name=exp_dict["dataset_test"],
        data_root=exp_dict["dataset_test_root"],
        split="test",
        transform=exp_dict["transform_val"],
        classes=exp_dict["classes_test"],
        support_size=exp_dict["support_size_test"],
        query_size=exp_dict["query_size_test"],
        n_iters=exp_dict["test_iters"],
        unlabeled_size=exp_dict["unlabeled_size_test"])

    # get dataloaders
    # ==========================
    train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=exp_dict["batch_size"],
        shuffle=True,
        num_workers=num_workers,
        collate_fn=ut.get_collate(exp_dict["collate_fn"]),
        drop_last=True)
    val_loader = torch.utils.data.DataLoader(val_set,
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=num_workers,
                                             collate_fn=lambda x: x,
                                             drop_last=True)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=1,
                                              shuffle=False,
                                              num_workers=num_workers,
                                              collate_fn=lambda x: x,
                                              drop_last=True)

    # create model and trainer
    # ==========================

    # Create model, opt, wrapper
    backbone = backbones.get_backbone(
        backbone_name=exp_dict['model']["backbone"], exp_dict=exp_dict)
    model = models.get_model(model_name=exp_dict["model"]['name'],
                             backbone=backbone,
                             n_classes=exp_dict["n_classes"],
                             exp_dict=exp_dict)

    if run_ssl:
        # runs the SSL experiments
        score_list_path = os.path.join(savedir, 'score_list.pkl')
        if not os.path.exists(score_list_path):
            test_dict = model.test_on_loader(test_loader, max_iter=None)
            hu.save_pkl(score_list_path, [test_dict])
        return

    # Checkpoint
    # -----------
    checkpoint_path = os.path.join(savedir, 'checkpoint.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')

    if os.path.exists(score_list_path):
        # resume experiment
        model.load_state_dict(hu.torch_load(checkpoint_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Run training and validation
    for epoch in range(s_epoch, exp_dict["max_epoch"]):
        score_dict = {"epoch": epoch}
        score_dict.update(model.get_lr())

        # train
        score_dict.update(model.train_on_loader(train_loader))

        # validate
        score_dict.update(model.val_on_loader(val_loader))
        score_dict.update(model.test_on_loader(test_loader))

        # Add score_dict to score_list
        score_list += [score_dict]

        # Report
        score_df = pd.DataFrame(score_list)
        print(score_df.tail())

        # Save checkpoint
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(checkpoint_path, model.get_state_dict())
        print("Saved: %s" % savedir)

        if "accuracy" in exp_dict["target_loss"]:
            is_best = score_dict[exp_dict["target_loss"]] >= score_df[
                exp_dict["target_loss"]][:-1].max()
        else:
            is_best = score_dict[exp_dict["target_loss"]] <= score_df[
                exp_dict["target_loss"]][:-1].min()

        # Save best checkpoint
        if is_best:
            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"),
                        score_list)
            hu.torch_save(os.path.join(savedir, "checkpoint_best.pth"),
                          model.get_state_dict())
            print("Saved Best: %s" % savedir)

        # Check for end of training conditions
        if model.is_end_of_training():
            break
示例#17
0
def trainval(exp_dict, savedir_base, reset=False):
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # Dataset
    # -----------

    # train loader
    train_loader = datasets.get_loader(dataset_name=exp_dict['dataset'],
                                       datadir=savedir_base,
                                       split='train')

    # val loader
    val_loader = datasets.get_loader(dataset_name=exp_dict['dataset'],
                                     datadir=savedir_base,
                                     split='val')

    # Model
    # -----------
    model = models.get_model(model_name=exp_dict['model'])

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, 'model.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')

    if os.path.exists(score_list_path):
        # resume experiment
        model.set_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ------------
    print('Starting experiment at epoch %d' % (s_epoch))

    for e in range(s_epoch, 10):
        score_dict = {}

        # Train the model
        train_dict = model.train_on_loader(train_loader)

        # Validate the model
        val_dict = model.val_on_loader(val_loader)

        # Get metrics
        score_dict['train_loss'] = train_dict['train_loss']
        score_dict['val_acc'] = val_dict['val_acc']
        score_dict['epoch'] = e

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print(score_df.tail())
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print('Checkpoint Saved: %s' % savedir)

    print('experiment completed')
示例#18
0
def trainval(exp_dict, savedir_base, datadir_base, reset=False, 
            num_workers=0, pin_memory=False, ngpu=1, cuda_deterministic=False):
    # bookkeeping
    # ==================

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    if DEVICE.type == "cuda":
        if cuda_deterministic:
            cudnn.benchmark = False
            cudnn.deterministic = True
        else:
            cudnn.benchmark = True

    # Dataset
    # ==================
    trainset = get_dataset(exp_dict['dataset'], 'train',
                           exp_dict=exp_dict, datadir_base=datadir_base,
                           n_samples=exp_dict['dataset_size']['train'],
                           transform_lvl=exp_dict['dataset']['transform_lvl'],
                           colorjitter=exp_dict['dataset'].get('colorjitter')
                           )

    valset = get_dataset(exp_dict['dataset'], 'validation',
                         exp_dict=exp_dict, datadir_base=datadir_base,
                         n_samples=exp_dict['dataset_size']['train'],
                         transform_lvl=0,
                         val_transform=exp_dict['dataset']['val_transform'])

    testset = get_dataset(exp_dict['dataset'], 'test',
                          exp_dict=exp_dict, datadir_base=datadir_base,
                          n_samples=exp_dict['dataset_size']['test'],
                          transform_lvl=0,
                          val_transform=exp_dict['dataset']['val_transform'])
    print("Dataset defined.")

    # define dataloaders
    if exp_dict['dataset']['name'] == 'bach':
        testloader = torch.utils.data.DataLoader(testset, batch_size=1,
                                                 shuffle=False,
                                                 num_workers=num_workers,
                                                 pin_memory=pin_memory)
    else:
        testloader = torch.utils.data.DataLoader(testset, batch_size=exp_dict['batch']['size'],
                                                 shuffle=False,
                                                 num_workers=num_workers,
                                                 pin_memory=pin_memory)

    print("Testloader  defined.")

    # Model
    # ==================
    model = get_model(exp_dict, trainset, device=DEVICE)

    print("Model loaded")

    model_path = os.path.join(savedir, 'model.pth')
    model_best_path = os.path.join(savedir, 'model_best.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')

    # checkpoint management
    if os.path.exists(score_list_path):
        # resume experiment
        model.load_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = len(score_list)
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # define and log random seed for reproducibility
    assert('fixedSeed' in exp_dict)
    seed = exp_dict['fixedSeed']

    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    print("Seed defined.")

    # Train & Val
    # ==================
    print("Starting experiment at epoch %d/%d" % (s_epoch, exp_dict['niter']))

    for epoch in range(s_epoch, exp_dict['niter']):
        s_time = time.time()
        # Sample new train val
        trainloader, valloader = get_train_val_dataloader(exp_dict,
                                                          trainset, valset,
                                                          mixtrainval=exp_dict['mixTrainVal'],
                                                          num_workers=num_workers,
                                                          pin_memory=pin_memory)
        # Train & validate
        train_dict = model.train_on_loader(trainloader, valloader, epoch=epoch,
                                           exp_dict=exp_dict)

        # Test phase
        train_dict_2 = model.test_on_loader(trainloader)
        val_dict = model.test_on_loader(valloader)
        test_dict = model.test_on_loader(testloader)

        # Vis phase
        model.vis_on_loader('train', trainset, savedir_images=os.path.join(
            savedir, 'images'), epoch=epoch)

        score_dict = {}
        score_dict["epoch"] = epoch
        score_dict["test_acc"] = test_dict['acc']
        score_dict["val_acc"] = val_dict['acc']
        score_dict["train_acc"] = train_dict_2['acc']
        score_dict["train_loss"] = train_dict['loss']
        score_dict["time_taken"] = time.time() - s_time
        score_dict["netC_lr"] = train_dict['netC_lr']

        if exp_dict['model']['netA'] is not None:
            if 'transformations_mean' in train_dict:
                for i in range(len(train_dict['transformations_mean'])):
                    score_dict[str(
                        i) + "_mean"] = train_dict['transformations_mean'][i].item()
            if 'transformations_std' in train_dict:
                for i in range(len(train_dict['transformations_std'])):
                    score_dict[str(
                        i) + "_std"] = train_dict['transformations_std'][i].item()

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail(), "\n")
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        # Update best score
        if epoch == 0 or (score_dict["test_acc"] >= score_df["test_acc"][:-1].max()):
            hu.save_pkl(os.path.join(
                savedir, "score_list_best.pkl"), score_list)
            hu.torch_save(os.path.join(savedir, "model_best.pth"),
                          model.get_state_dict())

            print("Saved Best: %s" % savedir)

    print('experiment completed')
示例#19
0
    def test_get_result_manager(self):
        # save a score_list
        savedir_base = '.tmp_plots'
        if os.path.exists(savedir_base):
            shutil.rmtree(savedir_base)
        exp_dict = {'model':{'name':'mlp', 'n_layers':30}, 
                    'dataset':'mnist', 'batch_size':1}
        score_list = [{'epoch': 0, 'acc':0.5}, {'epoch': 1, 'acc':0.9}]

        hu.save_pkl(os.path.join(savedir_base, hu.hash_dict(exp_dict),
                     'score_list.pkl'), score_list)
        hu.save_json(os.path.join(savedir_base, hu.hash_dict(exp_dict),
                     'exp_dict.json'), exp_dict)

        exp_dict = {'model':{'name':'mlp', 'n_layers':30}, 
                    'dataset':'cifar10', 'batch_size':1}
        score_list = [{'epoch': 0, 'acc':0.25}, {'epoch': 1, 'acc':1.24}, {'epoch': 2, 'acc':1.5}]

        hu.save_pkl(os.path.join(savedir_base, hu.hash_dict(exp_dict),
                     'score_list.pkl'), score_list)
        hu.save_json(os.path.join(savedir_base, hu.hash_dict(exp_dict),
                     'exp_dict.json'), exp_dict)

        exp_dict = {'model':{'name':'lenet', 'n_layers':30}, 
                    'dataset':'cifar10', 'batch_size':1}
        score_list = [{'epoch': 0, 'acc':0.35}, {'epoch': 1, 'acc':1.2}, {'epoch': 2, 'acc':1.3}]

        hu.save_pkl(os.path.join(savedir_base, hu.hash_dict(exp_dict),
                     'score_list.pkl'), score_list)
        hu.save_json(os.path.join(savedir_base, hu.hash_dict(exp_dict),
                     'exp_dict.json'), exp_dict)
                     
        exp_dict = {'model':{'name':'lenet', 'n_layers':30}, 
                    'dataset':'cifar10', 'batch_size':5}
        score_list = [{'epoch': 0, 'acc':0.15}, {'epoch': 1, 'acc':1.21}, {'epoch': 2, 'acc':1.7}]

        hu.save_pkl(os.path.join(savedir_base, hu.hash_dict(exp_dict),
                     'score_list.pkl'), score_list)
        hu.save_json(os.path.join(savedir_base, hu.hash_dict(exp_dict),
                     'exp_dict.json'), exp_dict)

        rm = hr.ResultManager(savedir_base=savedir_base)
        
        # assert(len(rm.exp_groups) == 2)
        # for exp_list in rm.exp_groups:
        #     assert(exp_list[0]['dataset'] in ['mnist', 'cifar10'])
        rm.get_exp_list_df()
        rm.get_score_df(avg_across='dataset')
        rm.get_score_df(avg_across='dataset', add_prefix=True)
        rm.get_score_df()
        rm.get_score_lists()
        rm.get_images()
        table = rm.get_score_table()
        table = rm.get_exp_table()
        
        fig_list = rm.get_plot(x_metric='epoch', y_metric='acc', title_list=['dataset'], legend_list=['model'])
        for i, fig in enumerate(fig_list):
            fig.savefig(os.path.join(savedir_base, '%d.png' % i))
        

        order = 'groups_by_metrics'
        fig_list = rm.get_plot_all(order=order, x_metric='epoch', y_metric_list=['acc', 'epoch'], title_list=['dataset'], 
                              legend_list=['model'], 
                              groupby_list=['dataset'],
                              log_metric_list=['acc'],
                              map_title_list=[{'mnist':'MNIST'}, {'cifar10':'CIFAR-10'}],
                              map_xlabel_list=[{'epoch':'EPOCHS'}],
                              map_ylabel_list=[{'acc':'Score'}],
                              ylim_list=[[(0.5, 0.8),(0.5, 0.8)],
                                         [(0.5, 0.8),(0.5, 0.8)]])

        for i, fig in enumerate(fig_list):
            fig.savefig(os.path.join(savedir_base, '%s_%d.png' % (order, i)))
        
        order = 'metrics_by_groups'
        fig_list = rm.get_plot_all(order=order, x_metric='epoch', y_metric_list=['acc', 'epoch'], title_list=['dataset'], 
                              legend_list=['model'], avg_across='batch_size')
        for i, fig in enumerate(fig_list):
            fig.savefig(os.path.join(savedir_base, '%s_%d.png' % (order, i)))
示例#20
0
def test(exp_dict,
         savedir_base,
         datadir,
         num_workers=0,
         model_path=None,
         scan_id=None):
    # bookkeepting stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # Dataset
    # ==================
    # val set
    test_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                    split="val",
                                    datadir=datadir,
                                    exp_dict=exp_dict,
                                    dataset_size=exp_dict['dataset_size'])
    if str(scan_id) != 'None':
        test_set.active_data = test_set.get_scan(scan_id)
    test_sampler = torch.utils.data.SequentialSampler(test_set)
    test_loader = DataLoader(test_set,
                             sampler=test_sampler,
                             batch_size=1,
                             collate_fn=ut.collate_fn,
                             num_workers=num_workers)

    # Model
    # ==================
    # chk = torch.load('best_model.ckpt')
    model = models.get_model_for_onnx_export(model_dict=exp_dict['model'],
                                             exp_dict=exp_dict,
                                             train_set=test_set).cuda()
    epoch = -1

    if str(model_path) != 'None':
        model_path = model_path
        model.load_state_dict(hu.torch_load(model_path))
    else:
        try:
            exp_dict_train = copy.deepcopy(exp_dict)
            del exp_dict_train['test_mode']
            savedir_train = os.path.join(savedir_base,
                                         hu.hash_dict(exp_dict_train))
            model_path = os.path.join(savedir_train, "model_best.pth")
            score_list = hu.load_pkl(
                os.path.join(savedir_train, 'score_list_best.pkl'))
            epoch = score_list[-1]['epoch']
            print('Loaded model at epoch %d with score %.3f' % epoch)
            model.load_state_dict(hu.torch_load(model_path))
        except:
            pass

    s_time = time.time()
    savedir_images = os.path.join(savedir, 'images')

    # delete image folder if exists
    if os.path.exists(savedir_images):
        shutil.rmtree(savedir_images)

    os.makedirs(savedir_images, exist_ok=True)
    # for i in range(20):
    #     score_dict = model.train_on_loader(test_loader)
    score_dict = model.val_on_loader(test_loader,
                                     savedir_images=savedir_images,
                                     n_images=30000,
                                     save_preds=True)

    score_dict['epoch'] = epoch
    score_dict["time"] = time.time() - s_time
    score_dict["saved_at"] = hu.time_to_montreal()
    # save test_score_list
    test_path = os.path.join(savedir, "score_list.pkl")
    if os.path.exists(test_path):
        test_score_list = [
            sd for sd in hu.load_pkl(test_path) if sd['epoch'] != epoch
        ]
    else:
        test_score_list = []

    # append score_dict to last result
    test_score_list += [score_dict]
    hu.save_pkl(test_path, test_score_list)
    print('Final Score is ', str(score_dict["val_score"]) + "\n")
示例#21
0
def newminimum(exp_id,
               savedir_base,
               datadir,
               name,
               exp_dict,
               metrics_flag=True):
    # bookkeeping
    # ---------------

    # get experiment directory
    old_modeldir = os.path.join(savedir_base, exp_id)
    savedir = os.path.join(savedir_base, exp_id, name)

    old_exp_dict = hu.load_json(os.path.join(old_modeldir, 'exp_dict.json'))

    # TODO: compare exp dict for possible errors:
    # optimizer have to be the same
    # same network, dataset

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # set seed
    # ---------------
    seed = 42 + exp_dict['runs']
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Dataset
    # -----------

    # Load Train Dataset
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=datadir,
                                     exp_dict=exp_dict)

    train_loader = torch.utils.data.DataLoader(
        train_set,
        drop_last=True,
        shuffle=True,
        batch_size=exp_dict["batch_size"])

    # Load Val Dataset
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=datadir,
                                   exp_dict=exp_dict)

    # Model
    # -----------
    model = models.get_model(exp_dict["model"], train_set=train_set)

    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # Load Optimizer
    n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"])
    opt = optimizers.get_optimizer(opt=exp_dict["opt"],
                                   params=model.parameters(),
                                   n_batches_per_epoch=n_batches_per_epoch)

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, 'model.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')
    opt_path = os.path.join(savedir, 'opt_state_dict.pth')

    old_model_path = os.path.join(old_modeldir, 'model.pth')
    old_score_list_path = os.path.join(old_modeldir, 'score_list.pkl')
    old_opt_path = os.path.join(old_modeldir, 'opt_state_dict.pth')

    score_list = hu.load_pkl(old_score_list_path)
    model.load_state_dict(torch.load(old_model_path))
    opt.load_state_dict(torch.load(old_opt_path))
    s_epoch = score_list[-1]['epoch'] + 1

    # save current model state for comparison
    minimum = []

    for param in model.parameters():
        minimum.append(param.clone())

    # Train & Val
    # ------------
    print('Starting experiment at epoch %d/%d' %
          (s_epoch, exp_dict['max_epoch']))

    for epoch in range(s_epoch, exp_dict['max_epoch']):
        # Set seed
        np.random.seed(exp_dict['runs'] + epoch)
        torch.manual_seed(exp_dict['runs'] + epoch)
        # torch.cuda.manual_seed_all(exp_dict['runs']+epoch) not needed since no cuda available

        score_dict = {"epoch": epoch}

        if metrics_flag:
            # 1. Compute train loss over train set
            score_dict["train_loss"] = metrics.compute_metric_on_dataset(
                model, train_set, metric_name='softmax_loss')
            #                                    metric_name=exp_dict["loss_func"])
            # TODO: which loss should be used? (normal or with reguralizer?)

            # 2. Compute val acc over val set
            score_dict["val_acc"] = metrics.compute_metric_on_dataset(
                model, val_set, metric_name=exp_dict["acc_func"])

        # 3. Train over train loader
        model.train()
        print("%d - Training model with %s..." %
              (epoch, exp_dict["loss_func"]))

        s_time = time.time()
        for images, labels in tqdm.tqdm(train_loader):
            # images, labels = images.cuda(), labels.cuda() no cuda available

            opt.zero_grad()
            loss = loss_function(model, images, labels, minimum,
                                 0.1)  # just works for custom loss function
            loss.backward()
            opt.step()

        e_time = time.time()

        # Record metrics
        score_dict["step_size"] = opt.state["step_size"]
        score_dict["n_forwards"] = opt.state["n_forwards"]
        score_dict["n_backwards"] = opt.state["n_backwards"]
        score_dict["batch_size"] = train_loader.batch_size
        score_dict["train_epoch_time"] = e_time - s_time

        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(model_path, model.state_dict())
        hu.torch_save(opt_path, opt.state_dict())
        print("Saved: %s" % savedir)

        with torch.nograd():
            print('Current distance: %f',
                  metrics.computedistance(minimum, model))

    print('Experiment completed')
示例#22
0
        "model": {
            "name": "mlp",
            "n_layers": 30
        },
        "dataset": "mnist",
        "batch_size": 1
    }

    score_list = [{"epoch": 4, "acc": 0.5}, {"epoch": 6, "acc": 0.9}]

    hu.save_pkl(
        os.path.join(savedir_base, hu.hash_dict(exp_dict), "score_list.pkl"),
        score_list)

    hu.save_json(
        os.path.join(savedir_base, hu.hash_dict(exp_dict), "exp_dict.json"),
        exp_dict)

    exp_dict = {
        "model": {
            "name": "mlp2",
            "n_layers": 35
        },
        "dataset": "mnist",
        "batch_size": 1
    }
    score_list = [{"epoch": 2, "acc": 0.1}, {"epoch": 6, "acc": 0.3}]
    hu.save_pkl(
        os.path.join(savedir_base, hu.hash_dict(exp_dict), "score_list.pkl"),
        score_list)
    hu.save_json(
def trainval(exp_dict, savedir_base, datadir, reset=False, num_workers=0):
    # bookkeepting stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # set seed
    # ==================
    seed = 42
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Dataset
    # ==================
    # train set
    train_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                     split="train",
                                     datadir=datadir,
                                     exp_dict=exp_dict,
                                     dataset_size=exp_dict['dataset_size'])
    # val set
    val_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                   split="val",
                                   datadir=datadir,
                                   exp_dict=exp_dict,
                                   dataset_size=exp_dict['dataset_size'])

    # test set
    test_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                    split="test",
                                    datadir=datadir,
                                    exp_dict=exp_dict,
                                    dataset_size=exp_dict['dataset_size'])

    # val_sampler = torch.utils.data.SequentialSampler(val_set)
    val_loader = DataLoader(
        val_set,
        # sampler=val_sampler,
        batch_size=1,
        collate_fn=ut.collate_fn,
        num_workers=num_workers)
    test_loader = DataLoader(
        test_set,
        # sampler=val_sampler,
        batch_size=1,
        collate_fn=ut.collate_fn,
        num_workers=num_workers)

    # Model
    # ==================
    model = models.get_model(model_dict=exp_dict['model'],
                             exp_dict=exp_dict,
                             train_set=train_set).cuda()

    # model.opt = optimizers.get_optim(exp_dict['opt'], model)
    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.load_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ==================
    print("Starting experiment at epoch %d" % (s_epoch))
    model.waiting = 0
    model.val_score_best = -np.inf

    train_sampler = torch.utils.data.RandomSampler(train_set,
                                                   replacement=True,
                                                   num_samples=2 *
                                                   len(test_set))

    train_loader = DataLoader(train_set,
                              sampler=train_sampler,
                              collate_fn=ut.collate_fn,
                              batch_size=exp_dict["batch_size"],
                              drop_last=True,
                              num_workers=num_workers)

    for e in range(s_epoch, exp_dict['max_epoch']):
        # Validate only at the start of each cycle
        score_dict = {}
        test_dict = model.val_on_loader(test_loader,
                                        savedir_images=os.path.join(
                                            savedir, "images"),
                                        n_images=3)
        # Train the model
        train_dict = model.train_on_loader(train_loader)

        # Validate the model
        val_dict = model.val_on_loader(val_loader)
        score_dict["val_score"] = val_dict["val_score"]

        # Get new score_dict
        score_dict.update(train_dict)
        score_dict["epoch"] = e
        score_dict["waiting"] = model.waiting

        model.waiting += 1

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Save Best Checkpoint
        score_df = pd.DataFrame(score_list)
        if score_dict["val_score"] >= model.val_score_best:
            test_dict = model.val_on_loader(test_loader,
                                            savedir_images=os.path.join(
                                                savedir, "images"),
                                            n_images=3)
            score_dict.update(test_dict)
            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"),
                        score_list)
            # score_df.to_csv(os.path.join(savedir, "score_best_df.csv"))
            hu.torch_save(os.path.join(savedir, "model_best.pth"),
                          model.get_state_dict())
            model.waiting = 0
            model.val_score_best = score_dict["val_score"]
            print("Saved Best: %s" % savedir)

        # Report & Save
        score_df = pd.DataFrame(score_list)
        # score_df.to_csv(os.path.join(savedir, "score_df.csv"))
        print("\n", score_df.tail(), "\n")
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        if model.waiting > 100:
            break

    print('Experiment completed et epoch %d' % e)
示例#24
0
    def __init__(self,
                 split,
                 transform_lvl,
                 datadir_base,
                 n_samples=None,
                 val_transform='identity'):
        path = datadir_base or '/mnt/projects/bilvlda/dataset/tiny-imagenet-200'
        self.name = 'tinyimagenet'
        self.n_classes = 200
        self.image_size = 64
        self.nc = 3

        normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5],
                                         std=[0.5, 0.5, 0.5])

        self.mean = normalize.mean
        self.std = normalize.std

        if split == 'train':
            if transform_lvl == 0:
                transform = transforms.Compose([
                    transforms.Lambda(lambda x: x.convert("RGB")),
                    transforms.ToTensor(),
                    normalize,
                ])

            elif transform_lvl == 1:
                transform = transforms.Compose([
                    transforms.RandomCrop(self.image_size, padding=4),
                    transforms.ToTensor(),
                    normalize,
                ])

            elif transform_lvl == 1.5:
                transform = transforms.Compose([
                    transforms.Lambda(lambda x: x.convert("RGB")),
                    transforms.RandomHorizontalFlip(),
                    transforms.ToTensor(),
                    normalize,
                ])

            elif transform_lvl == 2:
                transform = transforms.Compose([
                    transforms.Lambda(lambda x: x.convert("RGB")),
                    transforms.RandomCrop(self.image_size, padding=4),
                    transforms.RandomHorizontalFlip(),
                    transforms.ToTensor(),
                    normalize,
                ])

            elif transform_lvl == 2.5:
                transform = transforms.Compose([
                    transforms.RandomCrop(self.image_size, padding=4),
                    transforms.RandomAffine(10, translate=None,
                                            scale=(0.5, 2)),
                    transforms.ToTensor(),
                    normalize,
                ])

            elif transform_lvl == 3:
                transform = transforms.Compose([
                    transforms.RandomCrop(self.image_size, padding=4),
                    transforms.RandomAffine(10, translate=None,
                                            scale=(0.5, 2)),
                    transforms.RandomHorizontalFlip(),
                    transforms.ToTensor(),
                    normalize,
                ])

            else:
                raise ValueError(
                    'only lvls 0, 1, 1.5, 2, 2.5 and 3 are supported')

        elif split in ['validation', 'test']:
            # identity transform
            if val_transform == 'identity':
                transform = transforms.Compose([
                    transforms.Lambda(lambda x: x.convert("RGB")),
                    transforms.ToTensor(), normalize
                ])
            elif val_transform == 'rotation':
                transform = transforms.Compose([
                    transforms.Lambda(lambda x: x.convert("RGB")),
                    transforms.RandomRotation((45, 45)),
                    transforms.ToTensor(), normalize
                ])
            elif val_transform == 'translation':
                transform = transforms.Compose([
                    transforms.Lambda(lambda x: x.convert("RGB")),
                    transforms.Pad((4, 4, 0, 0)),
                    transforms.CenterCrop(self.image_size),
                    transforms.ToTensor(), normalize
                ])
            elif val_transform == 'zoomin':
                transform = transforms.Compose([
                    transforms.Lambda(lambda x: x.convert("RGB")),
                    transforms.Resize(int(self.image_size * 1.5)),
                    transforms.CenterCrop(self.image_size),
                    transforms.ToTensor(), normalize
                ])
            elif val_transform == 'zoomout':
                transform = transforms.Compose([
                    transforms.Lambda(lambda x: x.convert("RGB")),
                    transforms.Resize(int(self.image_size * 0.75)),
                    transforms.Pad(4),
                    transforms.ToTensor(), normalize
                ])

        self.transform = transform

        if split in ['train', 'validation']:
            fname = '/mnt/projects/bilvlda/dataset/tiny-imagenet-200/tinyimagenet_train.json'

            if not os.path.exists(fname):
                dataset = dset.ImageFolder(root=os.path.join(path, 'train'))
                hu.save_json(fname, dataset.imgs)

            self.imgs = np.array(hu.load_json(fname))
            assert (len(self.imgs) == 100000)

        elif split == 'test':
            fname = '/mnt/projects/bilvlda/dataset/tiny-imagenet-200/tinyimagenet_validation.json'

            if not os.path.exists(fname):
                dataset = dset.ImageFolder(root=os.path.join(path, 'val'))
                hu.save_json(fname, dataset.imgs)
            self.imgs = np.array(hu.load_json(fname))
            assert (len(self.imgs) == 10000)

        if n_samples is not None:
            with hu.random_seed(10):
                imgs = np.array(self.imgs)
                ind = np.random.choice(imgs.shape[0], n_samples, replace=False)
                self.imgs = imgs[ind]
示例#25
0
文件: trainval.py 项目: blaxe05/haven
def trainval(exp_dict, savedir_base, datadir_base, reset=False):
    # bookkeeping stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # Dataset
    # ==================

    # load train and acrtive set
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     split="train",
                                     datadir_base=datadir_base,
                                     exp_dict=exp_dict)

    active_set = ActiveLearningDataset(train_set, random_state=42)

    # val set
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   split="val",
                                   datadir_base=datadir_base,
                                   exp_dict=exp_dict)
    val_loader = DataLoader(val_set, batch_size=exp_dict["batch_size"])

    # Model
    # ==================
    model = models.get_model(model_name=exp_dict['model']['name'],
                             exp_dict=exp_dict).cuda()

    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.set_state_dict(hu.torch_load(model_path))
        active_set.load_state_dict(
            hu.load_pkl(os.path.join(savedir, "active_set.pkl")))
        score_list = hu.load_pkl(score_list_path)
        inner_s_epoch = score_list[-1]['inner_epoch'] + 1
        s_cycle = score_list[-1]['cycle']
    else:
        # restart experiment
        score_list = []
        inner_s_epoch = 0
        s_cycle = 0

    # Train & Val
    # ==================
    print("Starting experiment at cycle %d epoch %d" %
          (s_cycle, inner_s_epoch))

    for c in range(s_cycle, exp_dict['max_cycle']):
        # Set seed
        np.random.seed(c)
        torch.manual_seed(c)
        torch.cuda.manual_seed_all(c)

        if inner_s_epoch == 0:
            active_set.label_next_batch(model)
            hu.save_pkl(os.path.join(savedir, "active_set.pkl"),
                        active_set.state_dict())

        train_loader = DataLoader(active_set,
                                  sampler=samplers.get_sampler(
                                      exp_dict['sampler']['train'],
                                      active_set),
                                  batch_size=exp_dict["batch_size"])
        # Visualize the model
        model.vis_on_loader(vis_loader,
                            savedir=os.path.join(savedir, "images"))

        for e in range(inner_s_epoch, exp_dict['max_epoch']):
            # Validate only at the start of each cycle
            score_dict = {}
            if e == 0:
                score_dict.update(model.val_on_loader(val_loader))

            # Train the model
            score_dict.update(model.train_on_loader(train_loader))

            # Validate the model
            score_dict["epoch"] = len(score_list)
            score_dict["inner_epoch"] = e
            score_dict["cycle"] = c
            score_dict['n_ratio'] = active_set.n_labelled_ratio
            score_dict["n_train"] = len(train_loader.dataset)
            score_dict["n_pool"] = len(train_loader.dataset.pool)

            # Add to score_list and save checkpoint
            score_list += [score_dict]

            # Report & Save
            score_df = pd.DataFrame(score_list)
            print("\n", score_df.tail(), "\n")
            hu.torch_save(model_path, model.get_state_dict())
            hu.save_pkl(score_list_path, score_list)
            print("Checkpoint Saved: %s" % savedir)

        inner_s_epoch = 0
示例#26
0
def trainval(exp_dict,
             savedir_base,
             reset,
             metrics_flag=True,
             datadir=None,
             cuda=False):
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    print(pprint.pprint(exp_dict))
    print('Experiment saved in %s' % savedir)

    # set seed
    # ==================
    seed = 42 + exp_dict['runs']
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        device = 'cuda'
        torch.cuda.manual_seed_all(seed)
    else:
        device = 'cpu'

    print('Running on device: %s' % device)

    # Dataset
    # ==================
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=datadir,
                                     exp_dict=exp_dict)

    train_loader = DataLoader(train_set,
                              drop_last=True,
                              shuffle=True,
                              sampler=None,
                              batch_size=exp_dict["batch_size"])

    # Load Val Dataset
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=datadir,
                                   exp_dict=exp_dict)

    # Model
    # ==================
    use_backpack = exp_dict['opt'].get("backpack", False)

    model = models.get_model(exp_dict["model"],
                             train_set=train_set,
                             backpack=use_backpack).to(device=device)
    if use_backpack:
        assert exp_dict['opt']['name'] in ['nus_wrapper', 'adaptive_second']
        from backpack import extend
        model = extend(model)

    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # Load Optimizer
    # ==============
    n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"])
    opt = optimizers.get_optimizer(opt=exp_dict["opt"],
                                   params=model.parameters(),
                                   n_batches_per_epoch=n_batches_per_epoch,
                                   n_train=len(train_set),
                                   train_loader=train_loader,
                                   model=model,
                                   loss_function=loss_function,
                                   exp_dict=exp_dict,
                                   batch_size=exp_dict["batch_size"])

    # Checkpointing
    # =============
    score_list_path = os.path.join(savedir, "score_list.pkl")
    model_path = os.path.join(savedir, "model_state_dict.pth")
    opt_path = os.path.join(savedir, "opt_state_dict.pth")

    if os.path.exists(score_list_path):
        # resume experiment
        score_list = ut.load_pkl(score_list_path)
        if use_backpack:
            model.load_state_dict(torch.load(model_path), strict=False)
        else:
            model.load_state_dict(torch.load(model_path))
        opt.load_state_dict(torch.load(opt_path))
        s_epoch = score_list[-1]["epoch"] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Start Training
    # ==============
    n_train = len(train_loader.dataset)
    n_batches = len(train_loader)
    batch_size = train_loader.batch_size

    for epoch in range(s_epoch, exp_dict["max_epoch"]):
        # Set seed
        seed = epoch + exp_dict['runs']
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        score_dict = {"epoch": epoch}

        # Validate
        # --------
        if metrics_flag:
            # 1. Compute train loss over train set
            score_dict["train_loss"] = metrics.compute_metric_on_dataset(
                model,
                train_set,
                metric_name=exp_dict["loss_func"],
                batch_size=exp_dict['batch_size'])

            # 2. Compute val acc over val set
            score_dict["val_acc"] = metrics.compute_metric_on_dataset(
                model,
                val_set,
                metric_name=exp_dict["acc_func"],
                batch_size=exp_dict['batch_size'])

        # Train
        # -----
        model.train()
        print("%d - Training model with %s..." %
              (epoch, exp_dict["loss_func"]))

        s_time = time.time()

        train_on_loader(model, train_set, train_loader, opt, loss_function,
                        epoch, use_backpack)

        e_time = time.time()

        # Record step size and batch size
        score_dict["step"] = opt.state.get("step",
                                           0) / int(n_batches_per_epoch)
        score_dict["step_size"] = opt.state.get("step_size", {})
        score_dict["step_size_avg"] = opt.state.get("step_size_avg", {})
        score_dict["n_forwards"] = opt.state.get("n_forwards", {})
        score_dict["n_backwards"] = opt.state.get("n_backwards", {})
        score_dict["grad_norm"] = opt.state.get("grad_norm", {})
        score_dict["batch_size"] = batch_size
        score_dict["train_epoch_time"] = e_time - s_time
        score_dict.update(opt.state["gv_stats"])

        # Add score_dict to score_list
        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        ut.save_pkl(score_list_path, score_list)
        ut.torch_save(model_path, model.state_dict())
        ut.torch_save(opt_path, opt.state_dict())
        print("Saved: %s" % savedir)

    return score_list
    def vis_on_batch(self, batch, savedir_image, save_preds=False):
        # os.makedirs(savedir_image, exist_ok=True)
        self.eval()

        # if self.just_one:
        #   onnx_output = torch.onnx._export(self.model_base, batch['images'].cuda(), 'onnx_model.onnx', verbose=False, opset_version=12)
        #   self.just_one = False

        # onnx_output = torch.onnx._export(self.model_base, torch_input.cuda(), 'onnx_model2.onnx', verbose=False, opset_version=11, operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK)

        # clf
        #pred_mask = self.predict_on_batch(batch).cpu()
        pred_mask = self.predict_on_batch_onnx(batch).cpu()

        # print(pred_mask.sum())
        img = hu.f2l(batch['images'])[0]
        img += abs(img.min())
        img /= img.max()
        img = img.repeat(1, 1, 3)

        mask_vis = batch["masks"].clone().float()[0][..., None]
        mask_vis[mask_vis == 255] = 0

        pred_mask_vis = pred_mask.clone().float()[0][..., None]
        vmax = 0.1

        fig, ax_list = plt.subplots(ncols=3, nrows=1)
        ax_list[0].imshow(
            img[:, :, 0],
            cmap='gray',
            #   interpolation='sinc', vmin=0, vmax=0.4
        )

        colors_all = np.array(['black', 'red', 'blue', 'green', 'purple'])
        colors = colors_all[np.unique(mask_vis).astype(int)]

        vis = label2rgb(mask_vis[:, :, 0].numpy(),
                        image=img.numpy(),
                        colors=colors,
                        bg_label=255,
                        bg_color=None,
                        alpha=0.6,
                        kind='overlay')
        vis = mark_boundaries(vis,
                              mask_vis[:, :, 0].numpy().astype('uint8'),
                              color=(1, 1, 1))

        ax_list[1].imshow(vis, cmap='gray')

        colors = colors_all[np.unique(pred_mask_vis).astype(int)]
        vis = label2rgb(pred_mask_vis[:, :, 0].numpy(),
                        image=img.numpy(),
                        colors=colors,
                        bg_label=255,
                        bg_color=None,
                        alpha=0.6,
                        kind='overlay')
        vis = mark_boundaries(vis,
                              pred_mask_vis[:, :, 0].numpy().astype('uint8'),
                              color=(1, 1, 1))

        ax_list[2].imshow(vis, cmap='gray')

        for i in range(1, self.n_classes):
            plt.plot([None], [None], label='group %d' % i, color=colors_all[i])
        # ax_list[1].axis('off')
        ax_list[0].grid()
        ax_list[1].grid()
        ax_list[2].grid()

        ax_list[0].tick_params(axis='x', labelsize=6)
        ax_list[0].tick_params(axis='y', labelsize=6)

        ax_list[1].tick_params(axis='x', labelsize=6)
        ax_list[1].tick_params(axis='y', labelsize=6)

        ax_list[2].tick_params(axis='x', labelsize=6)
        ax_list[2].tick_params(axis='y', labelsize=6)

        ax_list[0].set_title('Original image', fontsize=8)
        ax_list[1].set_title('Ground-truth', fontsize=8)
        ax_list[2].set_title('Prediction', fontsize=8)

        legend_kwargs = {
            "loc": 2,
            "bbox_to_anchor": (1.05, 1),
            'borderaxespad': 0.,
            "ncol": 1
        }
        ax_list[2].legend(fontsize=6, **legend_kwargs)
        plt.savefig(savedir_image.replace('.jpg', '.png'),
                    bbox_inches='tight',
                    dpi=300)
        plt.close()

        # save predictions
        if save_preds:
            from PIL import Image
            pred_dict = {}
            pred_numpy = pred_mask.cpu().numpy().squeeze().astype('uint8')

            uniques = np.unique(np.array(pred_numpy))
            # print(uniques)
            meta_dict = batch['meta'][0]

            for u in range(self.n_classes):
                meta_dict['gt_group%d_n_pixels' % u] = float(
                    (batch['masks'] == u).float().sum())
                meta_dict['pred_group%d_n_pixels' % u] = float(
                    (pred_mask == u).float().sum())

                if u == 0:
                    continue
                pred = Image.fromarray(pred_numpy == u)
                pred.save(savedir_image.replace('.jpg', '_group%d.png' % u))

            hu.save_json(savedir_image.replace('.jpg', '.json'), meta_dict)
def trainval(exp_dict, savedir_base, reset=False):
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print(exp_dict)
    print("Experiment saved in %s" % savedir)

    # Set Seed
    # -------
    seed = exp_dict.get('seed')
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Dataset
    # -----------
    train_dataset = get_dataset('train', exp_dict['dataset'])
    val_dataset = get_dataset('test', exp_dict['dataset'])

    # train and val loader
    train_loader = DataLoader(
        train_dataset,
        batch_size=exp_dict['batch_size'],
        shuffle=True,
        collate_fn=lambda x: x
        if exp_dict['batch_size'] == 1 else default_collate,
        # to handle episodes
        num_workers=args.num_workers)
    val_loader = DataLoader(
        val_dataset,
        batch_size=exp_dict['batch_size'],
        collate_fn=lambda x: x
        if exp_dict['batch_size'] == 1 else default_collate,
        shuffle=True,
        num_workers=args.num_workers)

    # Model
    # -----------
    model = get_model(exp_dict)

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.set_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ------------
    print("Starting experiment at epoch %d" % (s_epoch))

    for e in range(s_epoch, exp_dict['max_epoch']):
        score_dict = {}

        # Train the model
        score_dict.update(model.train_on_loader(train_loader))

        # Validate the model
        savepath = os.path.join(savedir_base, exp_dict['dataset']['name'])
        score_dict.update(model.val_on_loader(val_loader, savedir=savepath))
        model.on_train_end(savedir=savedir, epoch=e)
        score_dict["epoch"] = e

        # Visualize the model
        # model.vis_on_loader(vis_loader, savedir=savedir+"/images/")

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail())
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

    print('experiment completed')
示例#29
0
def trainval(exp_dict, savedir_base, reset=False):
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # set seed
    # ---------------
    seed = 42 + exp_dict['runs']
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Dataset
    # -----------

    # train loader
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=savedir_base,
                                     exp_dict=exp_dict)

    train_loader = torch.utils.data.DataLoader(
        train_set,
        drop_last=True,
        shuffle=True,
        batch_size=exp_dict["batch_size"])

    # val set
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=savedir_base,
                                   exp_dict=exp_dict)

    # Model
    # -----------
    model = models.get_model(exp_dict["model"], train_set=train_set).cuda()
    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # Compute fstar
    # -------------
    if exp_dict['opt'].get('fstar_flag'):
        ut.compute_fstar(train_set, loss_function, savedir_base, exp_dict)

    # Load Optimizer
    n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"])
    opt = optimizers.get_optimizer(opt_dict=exp_dict["opt"],
                                   params=model.parameters(),
                                   n_batches_per_epoch=n_batches_per_epoch)

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, 'model.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')
    opt_path = os.path.join(savedir, 'opt_state_dict.pth')

    if os.path.exists(score_list_path):
        # resume experiment
        score_list = hu.load_pkl(score_list_path)
        model.load_state_dict(torch.load(model_path))
        opt.load_state_dict(torch.load(opt_path))
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ------------
    print('Starting experiment at epoch %d/%d' %
          (s_epoch, exp_dict['max_epoch']))

    for e in range(s_epoch, exp_dict['max_epoch']):
        # Set seed
        seed = e + exp_dict['runs']
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        score_dict = {}

        # Compute train loss over train set
        score_dict["train_loss"] = metrics.compute_metric_on_dataset(
            model, train_set, metric_name=exp_dict["loss_func"])

        # Compute val acc over val set
        score_dict["val_acc"] = metrics.compute_metric_on_dataset(
            model, val_set, metric_name=exp_dict["acc_func"])

        # Train over train loader
        model.train()
        print("%d - Training model with %s..." % (e, exp_dict["loss_func"]))

        # train and validate
        s_time = time.time()
        for batch in tqdm.tqdm(train_loader):
            images, labels = batch["images"].cuda(), batch["labels"].cuda()

            opt.zero_grad()

            # closure
            def closure():
                return loss_function(model, images, labels, backwards=True)

            opt.step(closure)

        e_time = time.time()

        # Record metrics
        score_dict["epoch"] = e
        score_dict["step_size"] = opt.state["step_size"]
        score_dict["step_size_avg"] = opt.state["step_size_avg"]
        score_dict["n_forwards"] = opt.state["n_forwards"]
        score_dict["n_backwards"] = opt.state["n_backwards"]
        score_dict["grad_norm"] = opt.state["grad_norm"]
        score_dict["batch_size"] = train_loader.batch_size
        score_dict["train_epoch_time"] = e_time - s_time

        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(model_path, model.state_dict())
        hu.torch_save(opt_path, opt.state_dict())
        print("Saved: %s" % savedir)

    print('Experiment completed')
def trainval(exp_dict, savedir_base, data_root, reset=False, tensorboard=True):
    # bookkeeping
    # ---------------
    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    np.random.seed(exp_dict["seed"])
    torch.manual_seed(exp_dict["seed"])

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    writer = tensorboardX.SummaryWriter(savedir) \
        if tensorboard == 1 else None

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    pprint.pprint(exp_dict)
    print("Experiment saved in %s" % savedir)

    # Dataset
    # -----------
    train_dataset, val_dataset = get_dataset(['train', 'val'], data_root,
                                             exp_dict)
    # val_dataset = get_dataset('val', exp_dict)

    # train and val loader
    if exp_dict["episodic"] == False:
        train_loader = DataLoader(train_dataset,
                                  batch_size=exp_dict['batch_size'],
                                  shuffle=True,
                                  num_workers=args.num_workers)
        val_loader = DataLoader(val_dataset,
                                batch_size=exp_dict['batch_size'],
                                shuffle=True,
                                num_workers=args.num_workers)
    else:  # to support episodes TODO: move inside each model
        from datasets.episodic_dataset import EpisodicDataLoader
        train_loader = EpisodicDataLoader(train_dataset,
                                          batch_size=exp_dict['batch_size'],
                                          shuffle=True,
                                          collate_fn=lambda x: x,
                                          num_workers=args.num_workers)
        val_loader = EpisodicDataLoader(val_dataset,
                                        batch_size=exp_dict['batch_size'],
                                        shuffle=True,
                                        collate_fn=lambda x: x,
                                        num_workers=args.num_workers)

    # Model
    # -----------
    model = get_model(exp_dict,
                      labelset=train_dataset.raw_labelset,
                      writer=writer)
    print("Model with:",
          sum(p.numel() for p in model.parameters() if p.requires_grad),
          "parameters")

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.set_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ------------
    print("Starting experiment at epoch %d" % (s_epoch))

    for e in range(s_epoch, exp_dict['max_epoch']):
        score_dict = {}

        # Train the model
        score_dict.update(model.train_on_loader(e, train_loader))

        # Validate the model
        score_dict.update(model.val_on_loader(e, val_loader))
        score_dict["epoch"] = e

        if tensorboard:
            for key, value in score_dict.items():
                writer.add_scalar(key, value, e)
            writer.flush()
        # Visualize the model
        # model.vis_on_loader(vis_loader, savedir=savedir+"/images/")

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail())
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        # if model.is_end():
        #     print("Early stopping")
        #     break
    print('experiment completed')

    # Cleanup
    if tensorboard == 1:
        writer.close()