Example #1
def main(datapath, clustering_model, encoding_model, batch_size, n_epochs, lr, flattened, device, train_split, valid_split, train_labeled_split,
         experiment, encode, cluster, train_subset=None, path_to_model=None):
    """
    :param datapath: path to the directory containing the samples
    :param clustering_model: which clustering model to use [kmeans, gmm].
    :param encoding_model: which encoding model to use: convolutional, variational or simple autoencoder.
    :param batch_size: batch size
    :param n_epochs: number of epochs
    :param lr: learning rate
    :param flattened: if True, return the images in flattened format.
    :param device: use CUDA device if available, else CPU.
    :param train_split: split used for the unlabeled training data.
    :param valid_split: split used for the validation data.
    :param train_labeled_split: split used for the labeled training data.
    :param experiment: experiment object used to track metrics.
    :param encode: boolean, if True, train and apply the encoding model.
    :param cluster: boolean, if True, train and apply the clustering model.
    :param train_subset: how many elements to use. Default: all.
    :param path_to_model: path to the directory containing saved models.

    """
    train = HoromaDataset(datapath, split=train_split, subset=train_subset,
                          flattened=flattened)
    labeled = HoromaDataset(datapath, split=train_labeled_split, subset=train_subset,
                            flattened=flattened)
    valid_data = HoromaDataset(
        datapath, split=valid_split, subset=train_subset, flattened=flattened)

    train_label_indices = labeled.targets
    valid_indices = valid_data.targets

    print("Shape of training set: ", train.data.shape)
    print("Shape of validation set: ", valid_data.data.shape)

    if encode:
        # Train and apply encoding model
        train_enc, encoding_model = encoding_model.fit(train, valid_data, batch_size=batch_size, n_epochs=n_epochs,
                                                       lr=lr, device=device, experiment=experiment)
    else:
        encoding_model.load_state_dict(torch.load(path_to_model)["model"])
        train_enc = encode_dataset(encoding_model, train, batch_size, device)
    if cluster:
        train_labeled_enc = encoding_model.encode(
            labeled[train_label_indices][0].to(device))
        valid_enc = encoding_model.encode(labeled[valid_indices][0].to(device))

        # Train and apply clustering model
        clustering_model.train(train_enc)
        cluster_labels = assign_labels_to_clusters(clustering_model, train_labeled_enc,
                                                   labeled.targets[train_label_indices])
        _, accuracy, f1 = eval_model_predictions(clustering_model, valid_enc, labeled.targets[valid_indices],
                                                 cluster_labels)
        experiment.log_metric('accuracy', accuracy)
        experiment.log_metric('f1-score', f1)

        # Save models
        model = {'cluster': clustering_model,
                 'embedding': encoding_model, 'cluster_labels': cluster_labels}
        torch.save(model, Constants.PATH_TO_MODEL +
                   str(experiment.get_key()) + '.pth')
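
# The helper `assign_labels_to_clusters` used above is not shown in this example.
# A minimal sketch of the usual idea, mapping each cluster to the most frequent
# true label among its labeled members, is given below. It assumes the clustering
# model exposes a scikit-learn style `predict`; treat it as an illustration, not
# the project's actual implementation.
import numpy as np


def assign_labels_to_clusters_sketch(clustering_model, labeled_embeddings, labels):
    labels = np.asarray(labels)
    cluster_ids = clustering_model.predict(labeled_embeddings)
    mapping = {}
    for cid in np.unique(cluster_ids):
        members = labels[cluster_ids == cid]
        values, counts = np.unique(members, return_counts=True)
        mapping[cid] = values[np.argmax(counts)]  # majority vote within the cluster
    return mapping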
Example #2
def main(datapath, configs, experiment):
    """
    :param datapath: path to the directory containing the samples
    :param configs: dictionary containing hyperparameters for training.
    :param experiment: comet ml experiment object for logging results
    """

    train_split = configs['train_split']
    valid_split = configs['valid_split']
    train_labeled_split = configs['train_labeled_split']

    train = HoromaDataset(datapath,
                          split=train_split,
                          subset=None,
                          flattened=False)
    labeled = HoromaDataset(datapath,
                            split=train_labeled_split,
                            subset=None,
                            flattened=False)
    valid_data = HoromaDataset(datapath,
                               split=valid_split,
                               subset=None,
                               flattened=False)

    train_loader = DataLoader(train,
                              batch_size=configs['batch_size'],
                              shuffle=True)
    labeled_loader = DataLoader(labeled,
                                batch_size=configs['labeled_batch_size'],
                                shuffle=True)
    eval_loader = DataLoader(valid_data,
                             batch_size=configs['labeled_batch_size'],
                             shuffle=True)

    print("Shape of training set: ", train.data.shape)
    print("Shape of validation set: ", valid_data.data.shape)

    n_iterations = labeled.data.shape[0] // configs['labeled_batch_size']
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    net = NRM('AllConv13',
              batch_size=configs['labeled_batch_size'],
              num_class=17,
              use_bias=configs['use_bias'],
              use_bn=configs['use_bn'],
              do_topdown=configs['do_topdown'],
              do_pn=configs['do_pn'],
              do_bnmm=configs['do_bnmm']).to(device)
    net.apply(weights_init)
    best_f1, best_acc, best_model = train_nrm(net, train_loader,
                                              labeled_loader, eval_loader,
                                              configs['n_epochs'], configs,
                                              n_iterations, experiment)

    experiment.log_metric('best_accuracy', best_acc)
    experiment.log_metric('best_f1-score', best_f1)
    experiment.log_metric('best_model_epoch', best_model)
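
# `weights_init` above is applied with `net.apply(...)` but is not shown in this
# example. A minimal sketch of a typical initializer (Kaiming for conv/linear
# weights, constants for batch norm) is given below as an assumption; the NRM code
# may well initialize its weights differently.
import torch.nn as nn


def weights_init_sketch(module):
    # `net.apply` calls this once per submodule.
    if isinstance(module, (nn.Conv2d, nn.Linear)):
        nn.init.kaiming_normal_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
    elif isinstance(module, nn.BatchNorm2d):
        nn.init.ones_(module.weight)
        nn.init.zeros_(module.bias)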
Example #3
def main(datapath, configs, experiment):
    """
    :param datapath: path to the directory containing the samples
    :param configs: dictionary containing hyperparameters for training.
    :param experiment: comet ml experiment object for logging results
    """

    train_split = configs['train_split']
    valid_split = configs['valid_split']
    train_labeled_split = configs['train_labeled_split']

    train = HoromaDataset(datapath, split=train_split, subset=None,
                          flattened=False)
    labeled = HoromaDataset(datapath, split=train_labeled_split, subset=None,
                            flattened=False)
    valid_data = HoromaDataset(
        datapath, split=valid_split, subset=None, flattened=False)

    print("Shape of training set: ", train.data.shape)
    print("Shape of validation set: ", valid_data.data.shape)

    if configs['encode']:
        if configs['enc_model'] == "hali":
            Gx1, Gx2, Gz1, Gz2, Disc, optim_g, optim_d, train_loader, cuda, configs = initialize_hali(
                configs, train)
            training_loop_hali(Gz1, Gz2, Gx1, Gx2, Disc, optim_d,
                               optim_g, train_loader, configs, experiment, cuda)
        else:
            Gx, Gz, Disc, optim_g, optim_d, train_loader, cuda, configs = initialize_ali(
                configs, train)
            training_loop_ali(Gz, Gx, Disc, optim_d, optim_g,
                              train_loader, configs, experiment, cuda)

    if configs['cluster']:
        if configs['enc_model'] == "hali":
            best_f1, best_acc, best_model = get_results_hali(
                configs, experiment, train, labeled, valid_data)
        else:
            best_f1, best_acc, best_model = get_results_ali(
                configs, experiment, train, labeled, valid_data)

        experiment.log_metric('best_accuracy', best_acc)
        experiment.log_metric('best_f1-score', best_f1)
        experiment.log_metric('best_model_epoch', best_model)
Example #4
def main(datapath,
         encoding_model,
         classifier_model,
         batch_size,
         n_epochs,
         lr_unsup,
         lr_sup,
         device,
         train_unlabeled_split,
         valid_split,
         train_labeled_split,
         patience,
         experiment,
         path_to_model=None):
    """
    :param datapath: path to the directory containing the samples
    :param classifier_model: which classifier model to use
    :param encoding_model: which encoding model to use: convolutional, variational or simple autoencoder.
    :param batch_size: batch size
    :param n_epochs: number of epochs
    :param lr_unsup: learning rate for unsupervised part
    :param lr_sup: learning rate for supervised part
    :param train_unlabeled_split: unlabeled split used for the unsupervised part
    :param valid_split: valid split for the MLP
    :param train_labeled_split: train split for the MLP
    :param patience: patience for early stopping
    :param device: use CUDA device if available, else CPU.
    :param experiment: experiment object used to track metrics.
    :param path_to_model: path to the directory containing saved models.
    """
    train_unlabeled = HoromaDataset(datapath, split=train_unlabeled_split)
    train_labeled = HoromaDataset(datapath, split=train_labeled_split)
    valid_data = HoromaDataset(datapath, split=valid_split)
    valid_loader = DataLoader(valid_data, batch_size=batch_size)

    n_labeled_batch = len(train_labeled) // batch_size
    n_unlabeled_batch = n_labeled_batch

    # Semisupervised Training
    train_semi_supervised_network(encoding_model, classifier_model,
                                  train_unlabeled, train_labeled, valid_loader,
                                  n_epochs, batch_size, lr_unsup, lr_sup,
                                  device, n_labeled_batch, n_unlabeled_batch,
                                  patience, experiment)
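
# `train_semi_supervised_network` is not shown in this example. The sketch below
# illustrates one pattern such a routine could follow: alternate an unsupervised
# reconstruction step on unlabeled batches with a supervised step on labeled
# batches. All names and loss choices are assumptions, not the project's code; it
# assumes the encoder returns reconstructions when called and exposes `encode`,
# and that the labeled dataset yields (image, target) pairs.
import torch.nn.functional as F
from torch.utils.data import DataLoader


def semi_supervised_epoch_sketch(encoder, classifier, unlabeled, labeled,
                                 batch_size, opt_unsup, opt_sup, device):
    unlabeled_loader = DataLoader(unlabeled, batch_size=batch_size, shuffle=True)
    labeled_loader = DataLoader(labeled, batch_size=batch_size, shuffle=True)
    for x_u, (x_l, y_l) in zip(unlabeled_loader, labeled_loader):
        # Unsupervised step: reconstruct unlabeled images.
        x_u = x_u.to(device)
        loss_unsup = F.mse_loss(encoder(x_u), x_u)
        opt_unsup.zero_grad()
        loss_unsup.backward()
        opt_unsup.step()

        # Supervised step: classify the latent codes of labeled images.
        x_l, y_l = x_l.to(device), y_l.to(device)
        loss_sup = F.cross_entropy(classifier(encoder.encode(x_l)), y_l)
        opt_sup.zero_grad()
        loss_sup.backward()
        opt_sup.step()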
Example #5
def load_datasets(datapath, train_subset, flattened=False, split="train_all"):
    """
    Load Horoma datasets from specified data directory.

    :type datapath: str
    :type train_subset: str
    :type flattened: bool
    :type split: str
    """

    print("Loading datasets from ({}) ...".format(datapath), end=' ')
    start_time = time()
    dataset = HoromaDataset(datapath,
                            split=split,
                            subset=train_subset,
                            flattened=flattened)

    print("Done in {:.2f} sec".format(time() - start_time))

    return dataset
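
# Minimal usage sketch for `load_datasets`, assuming the returned HoromaDataset
# works with a standard PyTorch DataLoader (as in the other examples); the
# "./data" path and batch size are placeholders, and `train_subset=None` is
# assumed to mean "use all samples".
from torch.utils.data import DataLoader

horoma_train = load_datasets("./data", train_subset=None, split="train_all")
horoma_loader = DataLoader(horoma_train, batch_size=32, shuffle=True)
print("Number of training examples:", len(horoma_train))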
Example #6
def main(config, resume, test_run=False, helios_run=None, horoma_test=False):
    """
    Execute a training for a model.

    :param config: the configuration of the optimizer, model and trainer.
    :param resume: path to the checkpoint of a model.
    :param test_run: whether this is a test run; a test run uses the custom
    MNIST dataset.
    :param helios_run: start datetime of a run on helios.
    :param horoma_test: whether to use the test horoma dataset or not.
    """
    np.random.seed(config["numpy_seed"])
    torch.manual_seed(config["torch_seed"])
    torch.cuda.manual_seed_all(config["torch_seed"])

    # setup data_loader instances
    if not test_run:
        unlabelled = HoromaDataset(**config["data"]["dataset"],
                                   split='train_overlapped',
                                   transforms=HoromaTransforms())

        labelled = HoromaDataset(
            data_dir=config["data"]["dataset"]['data_dir'],
            flattened=False,
            split='valid_overlapped',
            transforms=HoromaTransforms())
    elif horoma_test:

        unlabelled = HoromaDataset(**config["data"]["dataset"],
                                   split='train_overlapped',
                                   transforms=HoromaTransforms(),
                                   subset=5000)

        labelled = HoromaDataset(
            data_dir=config["data"]["dataset"]['data_dir'],
            flattened=False,
            split='valid_overlapped',
            transforms=HoromaTransforms(),
            subset=1000)
    else:
        unlabelled = CustomMNIST(**config["data"]["dataset"], subset=5000)
        labelled = CustomLabelledMNIST(**config["data"]["dataset"],
                                       subset=1000)

    model = ModelFactory.get(config)

    print(model)
    print()

    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = OptimizerFactory.get(config, trainable_params)

    trainer = TrainerFactory.get(config)(model,
                                         optimizer,
                                         resume=resume,
                                         config=config,
                                         unlabelled=unlabelled,
                                         labelled=labelled,
                                         helios_run=helios_run,
                                         **config['trainer']['options'])

    trainer.train()
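
# `ModelFactory`, `OptimizerFactory` and `TrainerFactory` are not shown in this
# example. The sketch below shows one common way such a factory is written, as a
# registry keyed by the "type" entry of the config; it is an assumption, not the
# project's actual code.
class ModelFactorySketch:
    registry = {}  # maps a config "type" string to a model class

    @classmethod
    def register(cls, name, model_cls):
        cls.registry[name] = model_cls

    @classmethod
    def get(cls, config):
        model_cls = cls.registry[config["model"]["type"]]
        return model_cls(**config["model"].get("args", {}))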
Example #7
def main(datapath,
         encoding_model,
         batch_size,
         n_epochs,
         lr,
         device,
         train_split,
         valid_split,
         train_labeled_split,
         experiment,
         path_to_model=None):
    """
    :param datapath: path to the directory containing the samples
    :param encoding_model: which encoding model to use: convolutional, variational or simple autoencoder.
    :param batch_size: batch size
    :param n_epochs: number of epochs
    :param lr: learning rate for the unsupervised part
    :param train_split: split used for the unsupervised part
    :param valid_split: valid split for the SVM
    :param train_labeled_split: train split for the SVM
    :param device: use CUDA device if available, else CPU.
    :param experiment: experiment object used to track metrics.
    :param path_to_model: path to the directory containing saved models.
    """
    # Assumption: keep the images in non-flattened (image) format for the encoder.
    flattened = False
    full_dataset = HoromaDataset(datapath,
                                 split=train_split,
                                 flattened=flattened)
    train_labeled = HoromaDataset(datapath,
                                  split=train_labeled_split,
                                  flattened=flattened)
    # Validation data (labeled) for the supervised task (classification)
    valid_data = HoromaDataset(datapath,
                               split=valid_split,
                               flattened=flattened)

    # Split the full_dataset (labeled and unlabeled train data) into train and valid sets for autoencoder pre-training
    n_train = int(0.90 * len(full_dataset))
    n_valid = len(full_dataset) - n_train
    train_dataset, valid_dataset = data.random_split(full_dataset,
                                                     [n_train, n_valid])

    # Train and apply encoding model
    train_enc, encoding_model = encoding_model.fit(train_dataset,
                                                   valid_dataset,
                                                   batch_size=batch_size,
                                                   n_epochs=n_epochs,
                                                   lr=lr,
                                                   device=device,
                                                   experiment=experiment)

    # extract latent representation of train_labeled data
    train_labeled_enc = encode_dataset(encoding_model,
                                       train_labeled,
                                       batch_size,
                                       device,
                                       is_unlabeled=False)
    print("Train labeled data encoding complete.\n")

    # extract latent representation of validation data
    valid_enc = encode_dataset(encoding_model,
                               valid_data,
                               batch_size,
                               device,
                               is_unlabeled=False)
    print("validation data encoding complete.\n")

    start_time = time()

    # Train SVM classifier
    svm_classifier = SVMClassifier()
    print("Traing SVM classifier...\n")
    pred_train_y = svm_classifier.train_classifier(train_labeled_enc,
                                                   train_labeled.targets)

    print("Computing metrics for train data\n")
    train_accuracy, train_f1, __train_f1 = __compute_metrics(
        train_labeled.targets, pred_train_y)

    print("Prediction for validation data. \n")
    pred_valid_y = svm_classifier.validate_classifier(valid_enc)

    print("Computing metrics for validation data\n")
    valid_accuracy, valid_f1, __valid_f1 = __compute_metrics(
        valid_data.targets, pred_valid_y)

    print("Done in {:.2f} sec.".format(time() - start_time))
    print("Train : Accuracy: {:.2f} | F1: {:.2f}".format(
        train_accuracy * 100, train_f1 * 100))
    print("Train : F1 score for each class: {}".format(__train_f1 * 100))
    print("Validation : Accuracy: {:.2f} | F1: {:.2f}".format(
        valid_accuracy * 100, valid_f1 * 100))
    print("Validation : F1 score for each class: {}".format(__valid_f1 * 100))

    experiment.log_metric('Train accuracy', train_accuracy)
    experiment.log_metric('Train f1-score', train_f1)
    experiment.log_metric('Validation accuracy', valid_accuracy)
    experiment.log_metric('Validation f1-score', valid_f1)
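
# `__compute_metrics` is used above but not shown in this example. A minimal
# sketch with scikit-learn, returning accuracy, weighted F1 and per-class F1 in
# the same order as they are unpacked above; treat its exact behaviour as an
# assumption.
from sklearn.metrics import accuracy_score, f1_score


def compute_metrics_sketch(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    per_class_f1 = f1_score(y_true, y_pred, average=None)  # one score per class
    return accuracy, weighted_f1, per_class_f1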
Example #8
def main(path, data, model_name):
    print('Path: {}'.format(path))
    print('Data: {}'.format(data))
    print('Model: {}'.format(model_name))

    print()

    print('>> Beginning...')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    unlabelled = HoromaDataset(
        data,
        split='train_overlapped',
        transforms=HoromaTransforms()
    )

    labelled = HoromaDataset(
        data_dir=data,
        split='valid_overlapped',
        transforms=HoromaTransforms()
    )
    labelled = FullDataset(labelled)

    print('>> Dataset lengths : {}, {}'.format(len(unlabelled), len(labelled)))

    labelled_loader = DataLoader(labelled, batch_size=100, shuffle=False)

    print('>> Getting the configuration...', end=' ')

    config = retrieve_config(path + 'config.json')

    print('Done.')

    try:
        model = get_model(config).to(device)
    except TypeError:
        config['model']['args']['dropout'] = .1
        model = get_model(config).to(device)

    print('>> Getting the checkpoint...', end=' ')

    checkpoint = torch.load(path + model_name, map_location=device)

    if 'state_dict' in checkpoint:
        state_dict = checkpoint['state_dict']
        kmeans = checkpoint['cluster_collection'].models['kmeans100'].model
        del checkpoint
    else:
        state_dict = checkpoint

    print('Done.')

    # torch.save(state_dict, path + 'bare_model.pth')

    model.load_state_dict(state_dict)

    model.to('cpu')
    torch.save(model, path + 'bare_model.pth')
    model.to(device)

    cluster_helper = ClusterHelper(
        model=model,
        device=device,
        unlabelled_loader=None,
        train_loader=labelled_loader,
        valid_loader=labelled_loader
    )

    best_kmeans = None
    best_score = 0.

    threshold = .4
    good_models = []

    for n_clusters in [30, 50, 80, 100, 150, 200]:

        print()
        print()

        kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=10000)
        # kmeans = KMeans(n_clusters=100)

        print(kmeans)

        print('>> Getting embeddings...', end=' ')
        # Use a distinct name so the `data` (data directory) argument is not shadowed.
        embeddings = cluster_helper.get_embeddings(
            DataLoader(unlabelled, batch_size=100, shuffle=True)
        )
        print('Done.')

        print('>> Fitting data...', end=' ')
        kmeans.fit(embeddings)
        print('Done.')

        # n = len(np.unique(labelled.region_ids))
        n = 40

        accuracies = np.empty(n)
        f1_scores = np.empty(n)

        for i in range(n):
            print()

            kfold = SplitDataset(split=.7)
            train, valid = kfold(labelled)

            print('Train: {}'.format(get_classes(train)))
            print('Valid: {}'.format(get_classes(valid)))

            cluster_helper = ClusterHelper(
                model=model,
                device=device,
                unlabelled_loader=None,
                train_loader=DataLoader(train, shuffle=False, batch_size=100),
                valid_loader=DataLoader(valid, shuffle=False, batch_size=100)
            )

            clustering_model = ClusterModel(
                kmeans,
                cluster_helper=cluster_helper
            )

            cluster_helper.build_valid_embeddings()
            cluster_helper.build_train_embeddings()

            clustering_model.create_mapping()

            labels = cluster_helper.valid_labels
            prediction = clustering_model.labelled_predict(
                cluster_helper.valid_embedding
            )

            unlabelled_examples = (prediction == -1).sum()

            print('Unlabelled examples : {}/{} ({:.2%})'.format(
                unlabelled_examples,
                len(prediction),
                unlabelled_examples / len(prediction)
            ))

            accuracy = metrics.accuracy_score(labels, prediction)
            f1_score = metrics.f1_score(labels, prediction, average='weighted')

            accuracies[i] = accuracy
            f1_scores[i] = f1_score

            if accuracy > threshold and f1_score > threshold:
                good_models.append(clustering_model)

            print('Accuracy: {:.3%}'.format(accuracy))
            print('F1 Score: {:.3%}'.format(f1_score))

        print()
        print('  > Average accuracy: {:.3%}'.format(accuracies.mean()))
        print('  > Average F1 Score: {:.3%}'.format(f1_scores.mean()))

        if f1_scores.mean() > best_score:
            best_score = f1_scores.mean()
            best_kmeans = kmeans

    majority_vote = MajorityVote(good_models)

    # Final fit
    cluster_helper = ClusterHelper(
        model=model,
        device=device,
        unlabelled_loader=None,
        train_loader=labelled_loader,
        valid_loader=labelled_loader
    )

    print()
    print('>> Training finished')
    print('>> Best score: {:.3%}\n{}'.format(best_score, best_kmeans))

    clustering_model = ClusterModel(
        best_kmeans,
        cluster_helper=cluster_helper
    )

    cluster_helper.build_valid_embeddings()
    cluster_helper.build_train_embeddings()

    # Test the majority vote
    print('>> Majority vote : {}'.format(
        metrics.f1_score(
            cluster_helper.valid_labels,
            majority_vote.labelled_predict(cluster_helper.valid_embedding),
            average='weighted'
        )
    ))

    clustering_model.create_mapping()
    del clustering_model.cluster_helper

    model.to('cpu')

    wrapper = ModelWrapper(model, clustering_model)
    wrapper_maj = ModelWrapper(model, majority_vote)

    with open(path + 'final_model_full.pkl', 'wb') as f:
        pickle.dump(wrapper, f)

    with open(path + 'final_model_maj_full.pkl', 'wb') as f:
        pickle.dump(wrapper_maj, f)
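
# `MajorityVote` aggregates the clustering models collected in `good_models`. A
# minimal sketch of the idea, where each model votes a label per sample and the
# most frequent label wins, is given below as an assumption rather than the
# project's implementation.
import numpy as np


class MajorityVoteSketch:
    def __init__(self, models):
        self.models = models

    def labelled_predict(self, embeddings):
        # One row of votes per model, one column per sample.
        votes = np.stack([m.labelled_predict(embeddings) for m in self.models])
        winners = []
        for column in votes.T:
            values, counts = np.unique(column, return_counts=True)
            winners.append(values[np.argmax(counts)])
        return np.array(winners)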
Example #9
def main(config, resume, test_run=False, helios_run=None, horoma_test=False):
    """
    Run an hyperparameter search with bayesian optimization.

    :param config: configuration of the optimizer, trainer and model to use.
    :param resume: path to a pickled hyperparameters optimizer.
    :param test_run: whether this is a test run; a test run uses the custom
    MNIST dataset.
    :param helios_run: start datetime of a run on helios.
    :param horoma_test: whether to use the test horoma dataset or not.
    """
    np.random.seed(config["numpy_seed"])
    torch.manual_seed(config["torch_seed"])
    torch.cuda.manual_seed_all(config["torch_seed"])

    # setup data_loader instances
    if not test_run:

        unlabelled = HoromaDataset(
            **config["data"]["dataset"],
            split='train_overlapped',
            transforms=HoromaTransforms()
        )

        labelled = HoromaDataset(
            data_dir=config["data"]["dataset"]['data_dir'],
            flattened=False,
            split='valid_overlapped',
            transforms=HoromaTransforms()
        )
    elif horoma_test:

        unlabelled = HoromaDataset(
            **config["data"]["dataset"],
            split='train_overlapped',
            transforms=HoromaTransforms(),
            subset=5000
        )

        labelled = HoromaDataset(
            data_dir=config["data"]["dataset"]['data_dir'],
            flattened=False,
            split='valid_overlapped',
            transforms=HoromaTransforms(),
            subset=1000
        )
    else:
        unlabelled = CustomMNIST(**config["data"]["dataset"], subset=5000)
        labelled = CustomLabelledMNIST(**config["data"]["dataset"],
                                       subset=1000)

    model_hyperparameters_space = ModelFactory.getclass(
        config["model"]["type"]
    ).model_hyperparameters_space

    for h in model_hyperparameters_space:
        h.name = "model.{}".format(h.name)

    hyperparameters_space.extend(
        model_hyperparameters_space
    )

    if not helios_run:
        experiment_datetime = datetime.datetime.now().strftime('%m%d_%H%M%S')
    else:
        experiment_datetime = helios_run

    checkpoint_path = os.path.join(config["trainer"]["log_dir"],
                                   config["name"],
                                   experiment_datetime, 'optimizer.pkl')

    if not resume:
        hp_optimizer = Optimizer(hyperparameters_space)
    else:
        hp_optimizer = load_optimizer(resume)

    for experiment_number in range(len(hp_optimizer.yi), 20):
        hyperparameters = hp_optimizer.ask()

        experiment_folder = os.path.join(experiment_datetime,
                                         str(experiment_number))

        optimizer_hp, model_hp, hp_markdown = \
            hyperparameters_parsing(hyperparameters, hyperparameters_space,
                                    config)

        model = ModelFactory.getclass(
            config["model"]["type"]
        )(**model_hp)

        trainable_params = filter(lambda p: p.requires_grad,
                                  model.parameters())
        optimizer = OptimizerFactory.getclass(
            config["optimizer"]["type"]
        )(trainable_params, **optimizer_hp)

        trainer = TrainerFactory.get(config)(
            model,
            optimizer,
            resume=None,
            config=config,
            unlabelled=unlabelled,
            labelled=labelled,
            helios_run=helios_run,
            experiment_folder=experiment_folder,
            **config['trainer']['options']
        )
        trainer.logger.info(hp_markdown)
        trainer.tb_writer.add_text("hyperparameters", hp_markdown)
        score = trainer.train()

        hp_optimizer.tell(hyperparameters, score)

        save_optimizer(hp_optimizer, checkpoint_path)
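
# The hyperparameter `Optimizer` above follows an ask/tell interface, which
# matches scikit-optimize. The self-contained loop below shows that pattern on a
# toy objective, assuming the project's Optimizer behaves like `skopt.Optimizer`;
# the search space and objective are placeholders.
from skopt import Optimizer
from skopt.space import Integer, Real

search_space = [Real(1e-5, 1e-1, prior='log-uniform', name='lr'),
                Integer(16, 256, name='batch_size')]
hp_opt = Optimizer(search_space)

for _ in range(10):
    lr, batch_size = hp_opt.ask()
    # Toy objective standing in for `trainer.train()`; lower is better here.
    score = (lr - 1e-3) ** 2 + abs(batch_size - 64) / 1000.0
    hp_opt.tell([lr, batch_size], score)

best_score, best_params = min(zip(hp_opt.yi, hp_opt.Xi), key=lambda pair: pair[0])
print("Best parameters:", best_params, "with score", best_score)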
Example #10
def eval_model(model_path, dataset_dir, split):
    '''
    # MODIFY HERE #
    This function is meant to be an example

    '''

    # # SETUP MODEL # #
    # Load your best model
    print("\nLoading model from ({}).".format(model_path))
    model = load(model_path)

    # # SETUP DATASET # #
    # Load requested dataset
    """ IMPORTANT # of example per splits.
    "train" = 150700
    "train_overlapped" = 544027
    "valid" = 499
    "valid_overlapped" = 1380
    "test" = 483

    Files available the test folder:
        test_regions_id.txt
        test_x.dat
        test_y.txt
        train_overlapped_regions_id.txt
        train_overlapped_x.dat
        train_overlapped_y.txt
        train_regions_id.txt
        train_x.dat
        train_y.txt
        valid_overlapped_regions_id.txt
        valid_overlapped_x.dat
        valid_overlapped_y.txt
        valid_regions_id.txt
        valid_x.dat
        valid_y.txt

    You need to load the right one according to the `split`.
    """
    dataset = HoromaDataset(
        dataset_dir,
        split,
        transforms=transforms.Compose([transforms.ToPILImage(), transforms.ToTensor()])
    )

    # # INFERENCE # #
    # Use model on dataset to predict the class
    prediction = model.predict(dataset)

    # Clusters with no label predict -1, which maps to the empty string appended below.
    map_labels = np.concatenate([dataset.map_labels, np.array([''])])

    pred = map_labels[prediction]

    # # PREDICTIONS # #
    # Return the predicted classes as a numpy array of shape (nb_examples, 1)
    """ Example:
    [['ES']
     ['EN']
     ['ES']]
    """
    return pred
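
# Minimal usage sketch for `eval_model`, assuming `targets` holds integer indices
# into `map_labels` as in the other examples; the model path and data directory
# are placeholders.
import numpy as np
from sklearn.metrics import f1_score

predictions = eval_model("results/final_model_full.pkl", "./data", "valid")
valid_set = HoromaDataset("./data", "valid")
true_labels = np.array(valid_set.map_labels)[valid_set.targets]
print("Weighted F1:", f1_score(true_labels, predictions.ravel(), average='weighted'))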