Example #1
    def _submit_job(self, exp_dict, command, reset, submit_dict=None):
        """Submit one job.

        It checks if the experiment exists and handles the special cases,
        e.g., new experiment, reset, failed, job already running, completed.
        """
        # Avoid a shared mutable default; results are written into this dict
        if submit_dict is None:
            submit_dict = {}
        add_job_utils()
        import haven_jobs_utils as hju

        # Define paths
        savedir = os.path.join(self.savedir_base, hu.hash_dict(exp_dict))
        fname = hju.get_job_fname(savedir)
        
        if not os.path.exists(fname):
            # The job does not exist yet, so launch it
            job_dict = self.launch_job(exp_dict, savedir, command, job=None)
            job_id = job_dict['job_id']
            message = "SUBMITTED: Launching"

        elif reset:
            # The job exists; kill it and relaunch from scratch
            job_id = hu.load_json(fname).get("job_id")
            hju.kill_job(self.api, job_id)
            hc.delete_and_backup_experiment(savedir)

            job_dict = self.launch_job(exp_dict, savedir, command, job=None)
            job_id = job_dict['job_id']
            message = "SUBMITTED: Resetting"

        else:
            job_id = hu.load_json(fname).get("job_id")
            job = hju.get_job(self.api, job_id)

            if job.alive or job.state == 'SUCCEEDED':
                # If the job is alive, do nothing
                message = 'IGNORED: Job %s' % job.state
                
            elif job.state in ["FAILED", "CANCELLED"]:
                message = "SUBMITTED: Retrying %s Job" % job.state
                job_dict = self.launch_job(exp_dict, savedir, command, job=job)
                job_id = job_dict['job_id']
            else:
                # This shouldn't happen
                raise ValueError('unexpected job state: %s' % job.state)
        
        submit_dict[job_id] = message
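
A minimal usage sketch for the method above, assuming a hypothetical JobManager class that wires up self.api, self.savedir_base, and launch_job; exp_list and the command string are illustrative, not from the original source:

# Hypothetical usage sketch -- JobManager, exp_list, and the command
# string are assumptions, not part of the original code.
jm = JobManager(api=api, savedir_base='/mnt/results')

submit_dict = {}
for exp_dict in exp_list:
    # Each experiment is launched, reset, retried, or ignored depending
    # on the state recorded in its job file.
    jm._submit_job(exp_dict, command='python trainval.py',
                   reset=False, submit_dict=submit_dict)

# submit_dict maps job_id -> status message, e.g. "SUBMITTED: Launching"
for job_id, message in submit_dict.items():
    print(job_id, message)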
Example #2
def trainval(exp_dict, savedir_base, datadir, reset=False, num_workers=0):
    # bookkeeping stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)
    # Save the experiment configuration only if it hasn't been saved yet
    if not os.path.exists(os.path.join(savedir, "exp_dict.json")):
        hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
        print("Experiment saved in %s" % savedir)

    # BCD train
    # ==================
    # Ignore the following combinations
    if not ut.is_valid_exp(exp_dict):
        return

    score_list_fname = os.path.join(savedir, 'score_list.pkl')
    if os.path.exists(score_list_fname):
        score_list = hu.load_pkl(score_list_fname)

    else:
        score_list = train(dataset_name=exp_dict['dataset']['name'],
                           loss_name=exp_dict['dataset']['loss'],
                           block_size=exp_dict['block_size'],
                           partition_rule=exp_dict['partition'],
                           selection_rule=exp_dict['selection'],
                           update_rule=exp_dict['update'],
                           n_iters=exp_dict['max_iters'],
                           L1=exp_dict.get('l1', 0),
                           L2=0,
                           datasets_path=datadir)

        hu.save_pkl(score_list_fname, score_list)

    print('Experiment completed.')
    return score_list
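
For reference, a hypothetical exp_dict covering the keys this function reads; the values are placeholders, not from the original source:

# Hypothetical configuration -- keys match what trainval reads above,
# values are placeholders only.
exp_dict = {
    'dataset': {'name': 'synthetic', 'loss': 'logistic'},
    'block_size': 5,
    'partition': 'sorted',
    'selection': 'cyclic',
    'update': 'Lb',
    'max_iters': 200,
    'l1': 0,  # optional; read via exp_dict.get('l1', 0)
}
score_list = trainval(exp_dict, savedir_base='/mnt/results',
                      datadir='/mnt/datasets', reset=False)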
Example #3
def trainval(exp_dict, savedir_base, datadir, reset=False, num_workers=0):
    # bookkeeping stuff
    # ==================

    savedir = os.path.join(savedir_base, hu.hash_dict(exp_dict))
    os.makedirs(savedir, exist_ok=True)

    if reset:
        hc.delete_and_backup_experiment(savedir)

    print("Experiment saved in %s" % savedir)

    # Dataset
    # ==================
    # train set

    data_transform = A.Compose(
        [
            A.Flip(p=0.3),
            A.IAAAffine(p=0.3),
            A.Rotate(p=0.3),
            A.HueSaturationValue(hue_shift_limit=10,
                                 sat_shift_limit=15,
                                 val_shift_limit=10,
                                 p=0.3),
            A.GaussianBlur(3, p=0.3),
            A.GaussNoise(30, p=0.3)
        ],
        keypoint_params=A.KeypointParams(format='xy'),
        additional_targets={
            'mask0': 'mask',
            'mask1': 'mask',
            'mask2': 'mask',
            'keypoints0': 'keypoints',
            'keypoints1': 'keypoints',
            'keypoints2': 'keypoints',
            'keypoints3': 'keypoints',
            'keypoints4': 'keypoints',
            'keypoints5': 'keypoints'
        })

    # random.seed(20201009)
    random_seed = random.randint(0, 20201009)
    train_set = HEDataset_Fast(data_dir=datadir,
                               n_classes=exp_dict["n_classes"],
                               transform=data_transform,
                               option="Train",
                               random_seed=random_seed,
                               obj_option=exp_dict["obj"],
                               patch_size=exp_dict["patch_size"],
                               bkg_option=exp_dict["bkg"])

    test_transform = A.Compose([A.Resize(1024, 1024)],
                               keypoint_params=A.KeypointParams(format='xy'),
                               additional_targets={
                                   'mask0': 'mask',
                                   'mask1': 'mask'
                               })
    # val set
    val_set = HEDataset(data_dir=datadir,
                        transform=test_transform,
                        option="Validation")

    val_loader = DataLoader(val_set, batch_size=1, num_workers=num_workers)

    # test set
    test_set = HEDataset(data_dir=datadir,
                         transform=test_transform,
                         option="Test")

    test_loader = DataLoader(test_set, batch_size=1, num_workers=num_workers)
    # Model
    # ==================

    # torch.manual_seed(20201009)
    model = models.get_model(exp_dict['model'],
                             exp_dict=exp_dict,
                             train_set=train_set).cuda()

    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.load_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ==================
    print("Starting experiment at epoch %d" % (s_epoch))

    #     train_sampler = torch.utils.data.RandomSampler(
    #         train_set, replacement=True, num_samples=2*len(val_set))

    train_loader = DataLoader(train_set,
                              batch_size=exp_dict["batch_size"],
                              shuffle=True,
                              num_workers=num_workers)

    for e in range(s_epoch, exp_dict['max_epoch']):
        # Collect train/val metrics for this epoch
        score_dict = {}

        # Train the model
        train_dict = model.train_on_loader(train_loader)

        # Validate and Visualize the model
        val_dict = model.val_on_loader(val_loader,
                                       savedir_images=os.path.join(
                                           savedir, "images"),
                                       n_images=7)
        score_dict.update(val_dict)

        # Get new score_dict
        score_dict.update(train_dict)
        score_dict["epoch"] = len(score_list)

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail(), "\n")
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        # Save Best Checkpoint
        if e == 0 or (score_dict.get("val_score", 0) >
                      score_df["val_score"][:-1].fillna(0).max()):
            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"),
                        score_list)
            hu.torch_save(os.path.join(savedir, "model_best.pth"),
                          model.get_state_dict())
            print("Saved Best: %s" % savedir)

    # if s_epoch==exp_dict['max_epoch']:
    #     e = s_epoch
    model.load_state_dict(
        hu.torch_load(os.path.join(savedir, "model_best.pth")))
    test_dict = model.test_on_loader(test_loader)
    hu.save_pkl(os.path.join(savedir, 'test_iou.pkl'), test_dict)
    print('Test IoU: {}'.format(test_dict["test_iou"]))
    print('Experiment completed at epoch %d' % e)
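
As above, a hypothetical exp_dict with the keys this variant reads; every value is a placeholder:

# Hypothetical configuration -- keys match what this trainval reads,
# values are placeholders only.
exp_dict = {
    'n_classes': 3,
    'obj': 'points',    # passed as obj_option to HEDataset_Fast
    'bkg': 'all',       # passed as bkg_option to HEDataset_Fast
    'patch_size': 256,
    'model': 'semseg',  # resolved by models.get_model
    'batch_size': 4,
    'max_epoch': 100,
}
trainval(exp_dict, savedir_base='/mnt/results',
         datadir='/mnt/datasets/he', reset=False)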
Example #4
    def launch_or_ignore_exp_dict(self,
                                  exp_dict,
                                  command,
                                  reset,
                                  savedir,
                                  submit_dict=None):
        """Launch or ignore a job.

        It checks if the experiment exists and handles the special cases,
        e.g., new experiment, reset, failed, job already running, completed.
        """
        # Avoid a shared mutable default; results are written into this dict
        if submit_dict is None:
            submit_dict = {}
        # Define paths
        fname = get_job_fname(savedir)

        if not os.path.exists(fname):
            # The job does not exist yet, so launch it
            job_dict = self.launch_exp_dict(exp_dict,
                                            savedir,
                                            command,
                                            job=None)
            job_id = job_dict['job_id']
            message = "SUBMITTED: Launching"

        elif reset:
            # The job exists; kill it and relaunch from scratch
            job_id = hu.load_json(fname).get("job_id")
            self.kill_job(job_id)
            hc.delete_and_backup_experiment(savedir)

            job_dict = self.launch_exp_dict(exp_dict,
                                            savedir,
                                            command,
                                            job=None)
            job_id = job_dict['job_id']
            message = "SUBMITTED: Resetting"

        else:
            job_id = hu.load_json(fname).get("job_id")
            job = self.get_job(job_id)

            if job['state'] in [
                    'SUCCEEDED', 'RUNNING', 'PENDING', 'COMPLETED',
                    'COMPLETING'
            ]:
                # If the job is alive, do nothing
                message = 'IGNORED: Job %s' % job['state']

            elif job['state'] in [
                    "FAILED", "CANCELLED", "INTERRUPTED", "TIMEOUT"
            ]:
                message = "SUBMITTED: Retrying %s Job" % job['state']
                job_dict = self.launch_exp_dict(exp_dict,
                                                savedir,
                                                command,
                                                job=job)
                job_id = job_dict['job_id']
            else:
                # This shouldn't happen
                raise ValueError('unexpected job state: %s' % job['state'])

        submit_dict[job_id] = {'exp_dict': exp_dict, 'message': message}
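
A hedged sketch of how the accumulated submit_dict might be consumed after submitting a batch of experiments; jm, exp_list, savedir_base, and the command string are assumptions:

# Hypothetical usage sketch -- jm, exp_list, savedir_base, and the
# command string are assumptions, not part of the original code.
submit_dict = {}
for exp_dict in exp_list:
    savedir = os.path.join(savedir_base, hu.hash_dict(exp_dict))
    jm.launch_or_ignore_exp_dict(exp_dict, command='python trainval.py',
                                 reset=False, savedir=savedir,
                                 submit_dict=submit_dict)

# Each entry maps job_id -> {'exp_dict': ..., 'message': ...}
for job_id, info in submit_dict.items():
    print(job_id, info['message'])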
Example #5
def trainval(exp_dict, savedir_base, datadir, reset=False, num_workers=0):
    # bookkeeping stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # set seed
    # ==================
    seed = 42
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Dataset
    # ==================
    # train set
    train_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                     split="train",
                                     datadir=datadir,
                                     exp_dict=exp_dict,
                                     dataset_size=exp_dict['dataset_size'])
    # val set
    val_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                   split="val",
                                   datadir=datadir,
                                   exp_dict=exp_dict,
                                   dataset_size=exp_dict['dataset_size'])

    # test set
    test_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                    split="test",
                                    datadir=datadir,
                                    exp_dict=exp_dict,
                                    dataset_size=exp_dict['dataset_size'])

    # val_sampler = torch.utils.data.SequentialSampler(val_set)
    val_loader = DataLoader(
        val_set,
        # sampler=val_sampler,
        batch_size=1,
        collate_fn=ut.collate_fn,
        num_workers=num_workers)
    test_loader = DataLoader(
        test_set,
        # sampler=val_sampler,
        batch_size=1,
        collate_fn=ut.collate_fn,
        num_workers=num_workers)

    # Model
    # ==================
    model = models.get_model(model_dict=exp_dict['model'],
                             exp_dict=exp_dict,
                             train_set=train_set).cuda()

    # model.opt = optimizers.get_optim(exp_dict['opt'], model)
    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.load_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ==================
    print("Starting experiment at epoch %d" % (s_epoch))
    model.waiting = 0
    model.val_score_best = -np.inf

    train_sampler = torch.utils.data.RandomSampler(train_set,
                                                   replacement=True,
                                                   num_samples=2 *
                                                   len(test_set))

    train_loader = DataLoader(train_set,
                              sampler=train_sampler,
                              collate_fn=ut.collate_fn,
                              batch_size=exp_dict["batch_size"],
                              drop_last=True,
                              num_workers=num_workers)

    for e in range(s_epoch, exp_dict['max_epoch']):
        # Collect metrics for this epoch
        score_dict = {}

        # Visualize predictions on the test set (images are saved to disk
        # as a side effect; test scores are recomputed below whenever a new
        # best val_score is reached)
        test_dict = model.val_on_loader(test_loader,
                                        savedir_images=os.path.join(
                                            savedir, "images"),
                                        n_images=3)
        # Train the model
        train_dict = model.train_on_loader(train_loader)

        # Validate the model
        val_dict = model.val_on_loader(val_loader)
        score_dict["val_score"] = val_dict["val_score"]

        # Get new score_dict
        score_dict.update(train_dict)
        score_dict["epoch"] = e
        score_dict["waiting"] = model.waiting

        model.waiting += 1

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Save Best Checkpoint
        score_df = pd.DataFrame(score_list)
        if score_dict["val_score"] >= model.val_score_best:
            test_dict = model.val_on_loader(test_loader,
                                            savedir_images=os.path.join(
                                                savedir, "images"),
                                            n_images=3)
            score_dict.update(test_dict)
            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"),
                        score_list)
            # score_df.to_csv(os.path.join(savedir, "score_best_df.csv"))
            hu.torch_save(os.path.join(savedir, "model_best.pth"),
                          model.get_state_dict())
            model.waiting = 0
            model.val_score_best = score_dict["val_score"]
            print("Saved Best: %s" % savedir)

        # Report & Save
        score_df = pd.DataFrame(score_list)
        # score_df.to_csv(os.path.join(savedir, "score_df.csv"))
        print("\n", score_df.tail(), "\n")
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        # Early stopping: halt if val_score hasn't improved for 100 epochs
        if model.waiting > 100:
            break

    print('Experiment completed at epoch %d' % e)
Example #6
def trainval(exp_dict, savedir_base, datadir_base, reset=False):
    # bookkeeping stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # Dataset
    # ==================

    # load train and active set
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     split="train",
                                     datadir_base=datadir_base,
                                     exp_dict=exp_dict)

    active_set = ActiveLearningDataset(train_set, random_state=42)

    # val set
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   split="val",
                                   datadir_base=datadir_base,
                                   exp_dict=exp_dict)
    val_loader = DataLoader(val_set, batch_size=exp_dict["batch_size"])

    # Model
    # ==================
    model = models.get_model(model_name=exp_dict['model']['name'],
                             exp_dict=exp_dict).cuda()

    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.set_state_dict(hu.torch_load(model_path))
        active_set.load_state_dict(
            hu.load_pkl(os.path.join(savedir, "active_set.pkl")))
        score_list = hu.load_pkl(score_list_path)
        inner_s_epoch = score_list[-1]['inner_epoch'] + 1
        s_cycle = score_list[-1]['cycle']
    else:
        # restart experiment
        score_list = []
        inner_s_epoch = 0
        s_cycle = 0

    # Train & Val
    # ==================
    print("Starting experiment at cycle %d epoch %d" %
          (s_cycle, inner_s_epoch))

    for c in range(s_cycle, exp_dict['max_cycle']):
        # Set seed
        np.random.seed(c)
        torch.manual_seed(c)
        torch.cuda.manual_seed_all(c)

        if inner_s_epoch == 0:
            active_set.label_next_batch(model)
            hu.save_pkl(os.path.join(savedir, "active_set.pkl"),
                        active_set.state_dict())

        train_loader = DataLoader(active_set,
                                  sampler=samplers.get_sampler(
                                      exp_dict['sampler']['train'],
                                      active_set),
                                  batch_size=exp_dict["batch_size"])
        # Visualize the model
        # NOTE: vis_loader is not defined in this snippet; it is assumed to
        # be built elsewhere (e.g., from a small visualization split)
        model.vis_on_loader(vis_loader,
                            savedir=os.path.join(savedir, "images"))

        for e in range(inner_s_epoch, exp_dict['max_epoch']):
            # Validate only at the start of each cycle
            score_dict = {}
            if e == 0:
                score_dict.update(model.val_on_loader(val_loader))

            # Train the model
            score_dict.update(model.train_on_loader(train_loader))

            # Validate the model
            score_dict["epoch"] = len(score_list)
            score_dict["inner_epoch"] = e
            score_dict["cycle"] = c
            score_dict['n_ratio'] = active_set.n_labelled_ratio
            score_dict["n_train"] = len(train_loader.dataset)
            score_dict["n_pool"] = len(train_loader.dataset.pool)

            # Add to score_list and save checkpoint
            score_list += [score_dict]

            # Report & Save
            score_df = pd.DataFrame(score_list)
            print("\n", score_df.tail(), "\n")
            hu.torch_save(model_path, model.get_state_dict())
            hu.save_pkl(score_list_path, score_list)
            print("Checkpoint Saved: %s" % savedir)

        inner_s_epoch = 0
Example #7
def trainval(exp_dict, savedir_base, datadir, reset=False, num_workers=0):
    # bookkeeping stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # Dataset
    # ==================
    # train set
    train_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                     split="train",
                                     datadir=datadir,
                                     exp_dict=exp_dict,
                                     dataset_size=exp_dict['dataset_size'])
    # val set
    val_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                   split="val",
                                   datadir=datadir,
                                   exp_dict=exp_dict,
                                   dataset_size=exp_dict['dataset_size'])

    val_sampler = torch.utils.data.SequentialSampler(val_set)
    val_loader = DataLoader(val_set,
                            sampler=val_sampler,
                            batch_size=1,
                            num_workers=num_workers)
    # Model
    # ==================
    model = models.get_model(model_dict=exp_dict['model'],
                             exp_dict=exp_dict,
                             train_set=train_set).cuda()

    # model.opt = optimizers.get_optim(exp_dict['opt'], model)
    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.load_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ==================
    print("Starting experiment at epoch %d" % (s_epoch))

    train_sampler = torch.utils.data.RandomSampler(train_set,
                                                   replacement=True,
                                                   num_samples=2 *
                                                   len(val_set))

    train_loader = DataLoader(train_set,
                              sampler=train_sampler,
                              batch_size=exp_dict["batch_size"],
                              drop_last=True,
                              num_workers=num_workers)

    for e in range(s_epoch, exp_dict['max_epoch']):
        # Collect train/val metrics for this epoch
        score_dict = {}

        # Train the model
        train_dict = model.train_on_loader(train_loader)

        # Validate and Visualize the model
        val_dict = model.val_on_loader(val_loader,
                                       savedir_images=os.path.join(
                                           savedir, "images"),
                                       n_images=3)
        score_dict.update(val_dict)
        # model.vis_on_loader(
        #     vis_loader, savedir=os.path.join(savedir, "images"))

        # Get new score_dict
        score_dict.update(train_dict)
        score_dict["epoch"] = len(score_list)

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail(), "\n")
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        # Save Best Checkpoint
        if e == 0 or (score_dict.get("val_score", 0) >
                      score_df["val_score"][:-1].fillna(0).max()):
            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"),
                        score_list)
            hu.torch_save(os.path.join(savedir, "model_best.pth"),
                          model.get_state_dict())
            print("Saved Best: %s" % savedir)

    print('Experiment completed at epoch %d' % e)
Example #8
def trainval(exp_dict, savedir_base, reset):
    # ==================
    # bookkeeping stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = savedir_base + "/%s/" % exp_id
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(savedir + "exp_dict.json", exp_dict)
    print("Experiment saved in %s" % savedir)

    # ==================
    # Dataset
    # ==================
    transform = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.5, ), (0.5, ))
    ])

    # train set
    train_set = torchvision.datasets.MNIST(savedir_base,
                                           train=True,
                                           download=True,
                                           transform=transform)
    train_loader = DataLoader(train_set,
                              shuffle=True,
                              batch_size=exp_dict["batch_size"])

    # val set
    val_set = torchvision.datasets.MNIST(savedir_base,
                                         train=False,
                                         download=True,
                                         transform=transform)
    val_loader = DataLoader(val_set,
                            shuffle=False,
                            batch_size=exp_dict["batch_size"])

    # ==================
    # Model
    # ==================
    model = MLP(n_classes=10).cuda()
    model.opt = torch.optim.Adam(model.parameters(), lr=exp_dict["lr"])

    model_path = savedir + "/model.pth"
    score_list_path = savedir + "/score_list.pkl"

    if os.path.exists(score_list_path):
        # resume experiment
        model.set_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = len(score_list)
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # ==================
    # Train & Val
    # ==================
    print("Starting experiment at epoch %d" % s_epoch)
    for e in range(s_epoch, 100):
        score_dict = {}

        # Train the model
        score_dict.update(model.train_on_loader(train_loader))

        # Validate the model
        score_dict.update(model.val_on_loader(val_loader))
        score_dict["epoch"] = e

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail()[["epoch", "train_loss", "val_acc"]], "\n")
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir_base)