def main(datapath, clustering_model, encoding_model, batch_size, n_epochs, lr, flattened, device,
         train_split, valid_split, train_labeled_split, experiment, encode, cluster,
         train_subset=None, path_to_model=None):
    """
    :param datapath: path to the directory containing the samples
    :param clustering_model: which clustering model to use [kmeans, gmm].
    :param encoding_model: which encoding model to use, convolutional, variational or simple autoencoders.
    :param batch_size: batch size
    :param n_epochs: number of epochs
    :param lr: learning rate
    :param flattened: if True, return the images in a flattened format.
    :param device: use CUDA device if available, else CPU.
    :param train_split: split used for the unsupervised (encoding) part.
    :param valid_split: labeled split used for validation.
    :param train_labeled_split: labeled split used to assign labels to clusters.
    :param experiment: experiment object used to track the run.
    :param encode: boolean, if True, train and apply the encoding model.
    :param cluster: boolean, if True, train and apply the clustering model.
    :param train_subset: how many elements will be used. Default: all.
    :param path_to_model: path to the directory containing saved models.
    """
    train = HoromaDataset(datapath, split=train_split, subset=train_subset, flattened=flattened)
    labeled = HoromaDataset(datapath, split=train_labeled_split, subset=train_subset, flattened=flattened)
    valid_data = HoromaDataset(datapath, split=valid_split, subset=train_subset, flattened=flattened)

    train_label_indices = labeled.targets
    valid_indices = valid_data.targets

    print("Shape of training set: ", train.data.shape)
    print("Shape of validation set: ", valid_data.data.shape)

    if encode:
        # Train and apply encoding model
        train_enc, encoding_model = encoding_model.fit(train, valid_data, batch_size=batch_size,
                                                       n_epochs=n_epochs, lr=lr, device=device,
                                                       experiment=experiment)
    else:
        encoding_model.load_state_dict(torch.load(path_to_model)["model"])
        train_enc = encode_dataset(encoding_model, train, batch_size, device)

    # Initialize so the save step below does not fail when clustering is skipped.
    cluster_labels = None

    if cluster:
        train_labeled_enc = encoding_model.encode(labeled[train_label_indices][0].to(device))
        valid_enc = encoding_model.encode(labeled[valid_indices][0].to(device))

        # Train and apply clustering model
        clustering_model.train(train_enc)

        cluster_labels = assign_labels_to_clusters(clustering_model, train_labeled_enc,
                                                   labeled.targets[train_label_indices])
        _, accuracy, f1 = eval_model_predictions(clustering_model, valid_enc,
                                                 labeled.targets[valid_indices], cluster_labels)

        experiment.log_metric('accuracy', accuracy)
        experiment.log_metric('f1-score', f1)

    # Save models
    model = {'cluster': clustering_model, 'embedding': encoding_model, 'cluster_labels': cluster_labels}
    torch.save(model, Constants.PATH_TO_MODEL + str(experiment.get_key()) + '.pth')
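
# ---------------------------------------------------------------------------
# `assign_labels_to_clusters` is used above but defined elsewhere. A minimal
# sketch of one common approach is shown below: each cluster receives the
# majority label of the labeled embeddings assigned to it, and clusters with
# no labeled point fall back to -1. The `predict()` interface on the clustering
# model and the -1 fallback are assumptions, not necessarily this project's
# exact implementation.
import numpy as np


def assign_labels_to_clusters_sketch(clustering_model, labeled_embeddings, labels):
    """Map each cluster id to the most frequent label among its labeled points."""
    cluster_ids = clustering_model.predict(labeled_embeddings)
    n_clusters = int(cluster_ids.max()) + 1
    cluster_labels = np.full(n_clusters, -1)
    for c in range(n_clusters):
        members = labels[cluster_ids == c]
        if len(members) > 0:
            values, counts = np.unique(members, return_counts=True)
            cluster_labels[c] = values[np.argmax(counts)]
    return cluster_labels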
def main(datapath, configs, experiment):
    """
    :param datapath: path to the directory containing the samples
    :param configs: dictionary containing hyperparameters for training.
    :param experiment: comet ml experiment object for logging results
    """
    train_split = configs['train_split']
    valid_split = configs['valid_split']
    train_labeled_split = configs['train_labeled_split']

    train = HoromaDataset(datapath, split=train_split, subset=None, flattened=False)
    labeled = HoromaDataset(datapath, split=train_labeled_split, subset=None, flattened=False)
    valid_data = HoromaDataset(datapath, split=valid_split, subset=None, flattened=False)

    train_loader = DataLoader(train, batch_size=configs['batch_size'], shuffle=True)
    labeled_loader = DataLoader(labeled, batch_size=configs['labeled_batch_size'], shuffle=True)
    eval_loader = DataLoader(valid_data, batch_size=configs['labeled_batch_size'], shuffle=True)

    print("Shape of training set: ", train.data.shape)
    print("Shape of validation set: ", valid_data.data.shape)

    # Number of labeled batches per epoch (integer division instead of np.floor,
    # which would return a float).
    n_iterations = labeled.data.shape[0] // configs['labeled_batch_size']

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    net = NRM('AllConv13', batch_size=configs['labeled_batch_size'], num_class=17,
              use_bias=configs['use_bias'], use_bn=configs['use_bn'],
              do_topdown=configs['do_topdown'], do_pn=configs['do_pn'],
              do_bnmm=configs['do_bnmm']).to(device)
    net.apply(weights_init)

    best_f1, best_acc, best_model = train_nrm(net, train_loader, labeled_loader, eval_loader,
                                              configs['n_epochs'], configs, n_iterations, experiment)

    experiment.log_metric('best_accuracy', best_acc)
    experiment.log_metric('best_f1-score', best_f1)
    experiment.log_metric('best_model_epoch', best_model)
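
# ---------------------------------------------------------------------------
# `weights_init` (applied via net.apply above) is defined elsewhere in the repo.
# A typical sketch is shown below: Kaiming-normal initialization for conv and
# linear layers and unit-gamma / zero-beta for batch-norm layers. The actual
# scheme used by the project may differ; this is only an illustration.
import torch.nn as nn


def weights_init_sketch(module):
    """Initialize conv/linear weights with Kaiming-normal and batch norm with 1/0."""
    if isinstance(module, (nn.Conv2d, nn.Linear)):
        nn.init.kaiming_normal_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
    elif isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d)):
        if module.weight is not None:
            nn.init.ones_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)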
def main(datapath, configs, experiment):
    """
    :param datapath: path to the directory containing the samples
    :param configs: dictionary containing hyperparameters for training.
    :param experiment: comet ml experiment object for logging results
    """
    train_split = configs['train_split']
    valid_split = configs['valid_split']
    train_labeled_split = configs['train_labeled_split']

    train = HoromaDataset(datapath, split=train_split, subset=None, flattened=False)
    labeled = HoromaDataset(datapath, split=train_labeled_split, subset=None, flattened=False)
    valid_data = HoromaDataset(datapath, split=valid_split, subset=None, flattened=False)

    print("Shape of training set: ", train.data.shape)
    print("Shape of validation set: ", valid_data.data.shape)

    if configs['encode']:
        if configs['enc_model'] == "hali":
            Gx1, Gx2, Gz1, Gz2, Disc, optim_g, optim_d, train_loader, cuda, configs = initialize_hali(configs, train)
            training_loop_hali(Gz1, Gz2, Gx1, Gx2, Disc, optim_d, optim_g, train_loader, configs, experiment, cuda)
        else:
            Gx, Gz, Disc, optim_g, optim_d, train_loader, cuda, configs = initialize_ali(configs, train)
            training_loop_ali(Gz, Gx, Disc, optim_d, optim_g, train_loader, configs, experiment, cuda)

    if configs['cluster']:
        if configs['enc_model'] == "hali":
            best_f1, best_acc, best_model = get_results_hali(configs, experiment, train, labeled, valid_data)
        else:
            best_f1, best_acc, best_model = get_results_ali(configs, experiment, train, labeled, valid_data)

        experiment.log_metric('best_accuracy', best_acc)
        experiment.log_metric('best_f1-score', best_f1)
        experiment.log_metric('best_model_epoch', best_model)
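
# ---------------------------------------------------------------------------
# Illustrative sketch of the `configs` dictionary consumed above. Only the keys
# actually read in this function are listed; initialize_ali / initialize_hali
# read additional hyperparameters not shown here, and every value below is an
# assumption chosen for illustration.
example_ali_configs = {
    'train_split': 'train_overlapped',
    'valid_split': 'valid',
    'train_labeled_split': 'train_labeled',
    'enc_model': 'hali',   # "hali" selects HALI, anything else uses plain ALI
    'encode': True,        # train the (H)ALI encoder
    'cluster': True,       # run the clustering / evaluation step
}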
def main(datapath, encoding_model, classifier_model, batch_size, n_epochs, lr_unsup, lr_sup, device,
         train_unlabeled_split, valid_split, train_labeled_split, patience, experiment, path_to_model=None):
    """
    :param datapath: path to the directory containing the samples
    :param classifier_model: which classifier model to use
    :param encoding_model: which encoding model to use, convolutional, variational or simple autoencoders.
    :param batch_size: batch size
    :param n_epochs: number of epochs
    :param lr_unsup: learning rate for the unsupervised part
    :param lr_sup: learning rate for the supervised part
    :param train_unlabeled_split: unlabeled data used for the unsupervised part
    :param valid_split: validation split for the MLP
    :param train_labeled_split: train split for the MLP
    :param patience: patience for early stopping
    :param device: use CUDA device if available, else CPU.
    :param experiment: track experiment
    :param path_to_model: path to the directory containing saved models.
    """
    train_unlabeled = HoromaDataset(datapath, split=train_unlabeled_split)
    train_labeled = HoromaDataset(datapath, split=train_labeled_split)
    valid_data = HoromaDataset(datapath, split=valid_split)

    valid_loader = DataLoader(valid_data, batch_size=batch_size)

    n_labeled_batch = len(train_labeled) // batch_size
    n_unlabeled_batch = n_labeled_batch

    # Semi-supervised training
    train_semi_supervised_network(encoding_model, classifier_model, train_unlabeled, train_labeled,
                                  valid_loader, n_epochs, batch_size, lr_unsup, lr_sup, device,
                                  n_labeled_batch, n_unlabeled_batch, patience, experiment)
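
# ---------------------------------------------------------------------------
# Illustrative sketch (not the repo's train_semi_supervised_network): one way
# to alternate unsupervised reconstruction updates and supervised
# classification updates, matching the idea of using the same number of
# labeled and unlabeled batches per epoch. The tiny encoder/decoder/classifier
# and the 32x32x3 patch shape are hypothetical stand-ins; 17 classes follows
# the NRM setup above.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


def semi_supervised_epoch_sketch():
    encoder = nn.Sequential(nn.Flatten(), nn.Linear(32 * 32 * 3, 16))
    decoder = nn.Linear(16, 32 * 32 * 3)
    classifier = nn.Linear(16, 17)
    opt_unsup = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=1e-3)
    opt_sup = torch.optim.Adam(classifier.parameters(), lr=1e-3)

    unlabeled = DataLoader(TensorDataset(torch.randn(64, 3, 32, 32)), batch_size=8)
    labeled = DataLoader(TensorDataset(torch.randn(64, 3, 32, 32),
                                       torch.randint(0, 17, (64,))), batch_size=8)

    for (x_u,), (x_l, y_l) in zip(unlabeled, labeled):
        # Unsupervised step: reconstruction loss on an unlabeled batch.
        z = encoder(x_u)
        loss_u = nn.functional.mse_loss(decoder(z), x_u.flatten(1))
        opt_unsup.zero_grad()
        loss_u.backward()
        opt_unsup.step()

        # Supervised step: cross-entropy on the matching labeled batch.
        logits = classifier(encoder(x_l).detach())
        loss_s = nn.functional.cross_entropy(logits, y_l)
        opt_sup.zero_grad()
        loss_s.backward()
        opt_sup.step()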
def load_datasets(datapath, train_subset, flattened=False, split="train_all"):
    """
    Load a Horoma dataset from the specified data directory.

    :type datapath: str
    :type train_subset: str
    :type flattened: bool
    :type split: str
    """
    print("Loading datasets from ({}) ...".format(datapath), end=' ')
    start_time = time()
    dataset = HoromaDataset(datapath, split=split, subset=train_subset, flattened=flattened)
    print("Done in {:.2f} sec".format(time() - start_time))

    return dataset
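
# ---------------------------------------------------------------------------
# Example usage (the data path is a placeholder):
#     dataset = load_datasets("/path/to/horoma", train_subset=None, split="train")
#     print(len(dataset))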
def main(config, resume, test_run=False, helios_run=None, horoma_test=False):
    """
    Execute a training run for a model.

    :param config: the configuration of the optimizer, model and trainer.
    :param resume: path to the checkpoint of a model.
    :param test_run: whether it's a test run or not. In case of a test run, a custom MNIST dataset is used.
    :param helios_run: start datetime of a run on Helios.
    :param horoma_test: whether to use the test Horoma dataset or not.
    """
    np.random.seed(config["numpy_seed"])
    torch.manual_seed(config["torch_seed"])
    torch.cuda.manual_seed_all(config["torch_seed"])

    # Setup data_loader instances
    if not test_run:
        unlabelled = HoromaDataset(**config["data"]["dataset"], split='train_overlapped',
                                   transforms=HoromaTransforms())
        labelled = HoromaDataset(data_dir=config["data"]["dataset"]['data_dir'], flattened=False,
                                 split='valid_overlapped', transforms=HoromaTransforms())
    elif horoma_test:
        unlabelled = HoromaDataset(**config["data"]["dataset"], split='train_overlapped',
                                   transforms=HoromaTransforms(), subset=5000)
        labelled = HoromaDataset(data_dir=config["data"]["dataset"]['data_dir'], flattened=False,
                                 split='valid_overlapped', transforms=HoromaTransforms(), subset=1000)
    else:
        unlabelled = CustomMNIST(**config["data"]["dataset"], subset=5000)
        labelled = CustomLabelledMNIST(**config["data"]["dataset"], subset=1000)

    model = ModelFactory.get(config)

    print(model)
    print()

    trainable_params = filter(lambda p: p.requires_grad, model.parameters())

    optimizer = OptimizerFactory.get(config, trainable_params)

    trainer = TrainerFactory.get(config)(
        model,
        optimizer,
        resume=resume,
        config=config,
        unlabelled=unlabelled,
        labelled=labelled,
        helios_run=helios_run,
        **config['trainer']['options']
    )

    trainer.train()
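
# ---------------------------------------------------------------------------
# Illustrative sketch of a `config` dictionary for the training entry point
# above. The key layout mirrors what this function (and the hyperparameter
# search below) reads: seeds, data.dataset kwargs, model/optimizer sections and
# trainer options. The concrete type names and option values are assumptions
# for illustration only.
example_config = {
    "name": "horoma_experiment",
    "numpy_seed": 1234,
    "torch_seed": 1234,
    "data": {
        "dataset": {
            "data_dir": "/path/to/horoma",
            "flattened": False,
        }
    },
    "model": {"type": "SomeAutoEncoder", "args": {}},
    "optimizer": {"type": "Adam", "args": {"lr": 1e-3}},
    "trainer": {
        "log_dir": "./logs",
        "options": {"epochs": 50},
    },
}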
def main(datapath, encoding_model, batch_size, n_epochs, lr, device, train_split, valid_split,
         train_labeled_split, experiment, path_to_model=None):
    """
    :param datapath: path to the directory containing the samples
    :param encoding_model: which encoding model to use, convolutional, variational or simple autoencoders.
    :param batch_size: batch size
    :param n_epochs: number of epochs
    :param lr: learning rate for the unsupervised part
    :param train_split: dataset split used for the unsupervised part
    :param valid_split: validation split for the SVM
    :param train_labeled_split: train split for the SVM
    :param device: use CUDA device if available, else CPU.
    :param experiment: track experiment
    :param path_to_model: path to the directory containing saved models.
    """
    # NOTE: `flattened` is not a parameter of this function and is assumed to be
    # defined elsewhere in the module; it defaults to False here so the code runs.
    flattened = False

    full_dataset = HoromaDataset(datapath, split=train_split, flattened=flattened)
    train_labeled = HoromaDataset(datapath, split=train_labeled_split, flattened=flattened)
    # Validation data (labeled) for the supervised task (classification)
    valid_data = HoromaDataset(datapath, split=valid_split, flattened=flattened)

    # Split the full dataset (labeled and unlabeled train data) into train and
    # valid sets for autoencoder pre-training.
    n_train = int(0.90 * len(full_dataset))
    n_valid = len(full_dataset) - n_train
    train_dataset, valid_dataset = data.random_split(full_dataset, [n_train, n_valid])

    # Train and apply encoding model
    train_enc, encoding_model = encoding_model.fit(train_dataset, valid_dataset, batch_size=batch_size,
                                                   n_epochs=n_epochs, lr=lr, device=device,
                                                   experiment=experiment)

    # Extract the latent representation of the labeled training data
    train_labeled_enc = encode_dataset(encoding_model, train_labeled, batch_size, device, is_unlabeled=False)
    print("Train labeled data encoding complete.\n")

    # Extract the latent representation of the validation data
    valid_enc = encode_dataset(encoding_model, valid_data, batch_size, device, is_unlabeled=False)
    print("Validation data encoding complete.\n")

    start_time = time()

    # Train the SVM classifier
    svm_classifier = SVMClassifier()
    print("Training SVM classifier...\n")
    pred_train_y = svm_classifier.train_classifier(train_labeled_enc, train_labeled.targets)

    print("Computing metrics for train data\n")
    train_accuracy, train_f1, __train_f1 = __compute_metrics(train_labeled.targets, pred_train_y)

    print("Prediction for validation data.\n")
    pred_valid_y = svm_classifier.validate_classifier(valid_enc)

    print("Computing metrics for validation data\n")
    valid_accuracy, valid_f1, __valid_f1 = __compute_metrics(valid_data.targets, pred_valid_y)

    print("Done in {:.2f} sec.".format(time() - start_time))
    print("Train : Accuracy: {:.2f} | F1: {:.2f}".format(train_accuracy * 100, train_f1 * 100))
    print("Train : F1 score for each class: {}".format(__train_f1 * 100))
    print("Validation : Accuracy: {:.2f} | F1: {:.2f}".format(valid_accuracy * 100, valid_f1 * 100))
    print("Validation : F1 score for each class: {}".format(__valid_f1 * 100))

    experiment.log_metric('Train accuracy', train_accuracy)
    experiment.log_metric('Train f1-score', train_f1)
    experiment.log_metric('Validation accuracy', valid_accuracy)
    experiment.log_metric('Validation f1-score', valid_f1)
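
# ---------------------------------------------------------------------------
# `__compute_metrics` is used above but defined elsewhere. Judging from how its
# return values are printed (overall accuracy, a single F1 score, and one F1
# score per class), a minimal sketch could look like the following; the
# weighted averaging is an assumption.
from sklearn import metrics


def compute_metrics_sketch(true_y, pred_y):
    """Return overall accuracy, weighted F1 and the per-class F1 scores."""
    accuracy = metrics.accuracy_score(true_y, pred_y)
    f1 = metrics.f1_score(true_y, pred_y, average='weighted')
    f1_per_class = metrics.f1_score(true_y, pred_y, average=None)
    return accuracy, f1, f1_per_class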
def main(path, data, model_name):
    print('Path: {}'.format(path))
    print('Data: {}'.format(data))
    print('Model: {}'.format(model_name))
    print()
    print('>> Beginning...')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    unlabelled = HoromaDataset(
        data,
        split='train_overlapped',
        transforms=HoromaTransforms()
    )
    labelled = HoromaDataset(
        data_dir=data,
        split='valid_overlapped',
        transforms=HoromaTransforms()
    )
    labelled = FullDataset(labelled)

    print('>> Dataset lengths : {}, {}'.format(len(unlabelled), len(labelled)))

    labelled_loader = DataLoader(labelled, batch_size=100, shuffle=False)

    print('>> Getting the configuration...', end=' ')
    config = retrieve_config(path + 'config.json')
    print('Done.')

    try:
        model = get_model(config).to(device)
    except TypeError:
        config['model']['args']['dropout'] = .1
        model = get_model(config).to(device)

    print('>> Getting the checkpoint...', end=' ')
    checkpoint = torch.load(path + model_name, map_location=device)

    if 'state_dict' in checkpoint:
        state_dict = checkpoint['state_dict']
        kmeans = checkpoint['cluster_collection'].models['kmeans100'].model
        del checkpoint
    else:
        state_dict = checkpoint
    print('Done.')

    # torch.save(state_dict, path + 'bare_model.pth')
    model.load_state_dict(state_dict)
    model.to('cpu')
    torch.save(model, path + 'bare_model.pth')
    model.to(device)

    cluster_helper = ClusterHelper(
        model=model,
        device=device,
        unlabelled_loader=None,
        train_loader=labelled_loader,
        valid_loader=labelled_loader
    )

    best_kmeans = None
    best_score = 0.
    threshold = .4
    good_models = []

    for n_clusters in [30, 50, 80, 100, 150, 200]:
        print()
        print()
        kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=10000)
        # kmeans = KMeans(n_clusters=100)
        print(kmeans)

        print('>> Getting embeddings...', end=' ')
        data = cluster_helper.get_embeddings(
            DataLoader(unlabelled, batch_size=100, shuffle=True)
        )
        print('Done.')

        print('>> Fitting data...', end=' ')
        kmeans.fit(data)
        print('Done.')

        # n = len(np.unique(labelled.region_ids))
        n = 40
        accuracies = np.empty(n)
        f1_scores = np.empty(n)

        for i in range(n):
            print()
            kfold = SplitDataset(split=.7)
            train, valid = kfold(labelled)
            print('Train: {}'.format(get_classes(train)))
            print('Valid: {}'.format(get_classes(valid)))

            cluster_helper = ClusterHelper(
                model=model,
                device=device,
                unlabelled_loader=None,
                train_loader=DataLoader(train, shuffle=False, batch_size=100),
                valid_loader=DataLoader(valid, shuffle=False, batch_size=100)
            )

            clustering_model = ClusterModel(
                kmeans,
                cluster_helper=cluster_helper
            )

            cluster_helper.build_valid_embeddings()
            cluster_helper.build_train_embeddings()

            clustering_model.create_mapping()

            labels = cluster_helper.valid_labels
            prediction = clustering_model.labelled_predict(
                cluster_helper.valid_embedding
            )

            unlabelled_examples = (prediction == -1).sum()
            print('Unlabelled examples : {}/{} ({:.2%})'.format(
                unlabelled_examples, len(prediction),
                unlabelled_examples / len(prediction)
            ))

            accuracy = metrics.accuracy_score(labels, prediction)
            f1_score = metrics.f1_score(labels, prediction, average='weighted')

            accuracies[i] = accuracy
            f1_scores[i] = f1_score

            if accuracy > threshold and f1_score > threshold:
                good_models.append(clustering_model)

            print('Accuracy: {:.3%}'.format(accuracy))
            print('F1 Score: {:.3%}'.format(f1_score))

        print()
        print(' > Average accuracy: {:.3%}'.format(accuracies.mean()))
        print(' > Average F1 Score: {:.3%}'.format(f1_scores.mean()))

        if f1_scores.mean() > best_score:
            best_score = f1_scores.mean()
            best_kmeans = kmeans

    majority_vote = MajorityVote(good_models)

    # Final fit
    cluster_helper = ClusterHelper(
        model=model,
        device=device,
        unlabelled_loader=None,
        train_loader=labelled_loader,
        valid_loader=labelled_loader
    )

    print()
    print('>> Training finished')
    print('>> Best score: {:.3%}\n{}'.format(best_score, best_kmeans))

    clustering_model = ClusterModel(
        best_kmeans,
        cluster_helper=cluster_helper
    )

    cluster_helper.build_valid_embeddings()
    cluster_helper.build_train_embeddings()

    # Test the majority vote
    print('>> Majority vote : {}'.format(
        metrics.f1_score(
            cluster_helper.valid_labels,
            majority_vote.labelled_predict(cluster_helper.valid_embedding),
            average='weighted'
        )
    ))

    clustering_model.create_mapping()

    del clustering_model.cluster_helper
    model.to('cpu')

    wrapper = ModelWrapper(model, clustering_model)
    wrapper_maj = ModelWrapper(model, majority_vote)

    with open(path + 'final_model_full.pkl', 'wb') as f:
        pickle.dump(wrapper, f)

    with open(path + 'final_model_maj_full.pkl', 'wb') as f:
        pickle.dump(wrapper_maj, f)
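
# ---------------------------------------------------------------------------
# `MajorityVote` aggregates the "good" cluster models kept above, but its
# implementation is not shown here. A minimal sketch of per-example majority
# voting is given below; treating -1 (an unlabelled cluster) as an abstention
# and the `labelled_predict` interface are assumptions about the real class.
import numpy as np


class MajorityVoteSketch:
    def __init__(self, models):
        self.models = models

    def labelled_predict(self, embeddings):
        """Return the most frequent non-abstaining label across all models."""
        votes = np.stack([m.labelled_predict(embeddings) for m in self.models])
        predictions = np.full(votes.shape[1], -1)
        for i in range(votes.shape[1]):
            valid = votes[:, i][votes[:, i] != -1]
            if len(valid) > 0:
                values, counts = np.unique(valid, return_counts=True)
                predictions[i] = values[np.argmax(counts)]
        return predictions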
def main(config, resume, test_run=False, helios_run=None, horoma_test=False):
    """
    Run a hyperparameter search with Bayesian optimization.

    :param config: configuration of the model, optimizer and trainer to use.
    :param resume: path to a pickled hyperparameters optimizer.
    :param test_run: whether it's a test run or not. In case of a test run, a custom MNIST dataset is used.
    :param helios_run: start datetime of a run on Helios.
    :param horoma_test: whether to use the test Horoma dataset or not.
    """
    np.random.seed(config["numpy_seed"])
    torch.manual_seed(config["torch_seed"])
    torch.cuda.manual_seed_all(config["torch_seed"])

    # Setup data_loader instances
    if not test_run:
        unlabelled = HoromaDataset(
            **config["data"]["dataset"],
            split='train_overlapped',
            transforms=HoromaTransforms()
        )
        labelled = HoromaDataset(
            data_dir=config["data"]["dataset"]['data_dir'],
            flattened=False,
            split='valid_overlapped',
            transforms=HoromaTransforms()
        )
    elif horoma_test:
        unlabelled = HoromaDataset(
            **config["data"]["dataset"],
            split='train_overlapped',
            transforms=HoromaTransforms(),
            subset=5000
        )
        labelled = HoromaDataset(
            data_dir=config["data"]["dataset"]['data_dir'],
            flattened=False,
            split='valid_overlapped',
            transforms=HoromaTransforms(),
            subset=1000
        )
    else:
        unlabelled = CustomMNIST(**config["data"]["dataset"], subset=5000)
        labelled = CustomLabelledMNIST(**config["data"]["dataset"], subset=1000)

    model_hyperparameters_space = ModelFactory.getclass(
        config["model"]["type"]
    ).model_hyperparameters_space

    for h in model_hyperparameters_space:
        h.name = "model.{}".format(h.name)

    # `hyperparameters_space` is assumed to be defined at module level (the
    # optimizer/trainer search dimensions); the model's dimensions are appended to it.
    hyperparameters_space.extend(
        model_hyperparameters_space
    )

    if not helios_run:
        experiment_datetime = datetime.datetime.now().strftime('%m%d_%H%M%S')
    else:
        experiment_datetime = helios_run

    checkpoint_path = os.path.join(config["trainer"]["log_dir"], config["name"],
                                   experiment_datetime, 'optimizer.pkl')

    if not resume:
        hp_optimizer = Optimizer(hyperparameters_space)
    else:
        hp_optimizer = load_optimizer(resume)

    for experiment_number in range(len(hp_optimizer.yi), 20):
        hyperparameters = hp_optimizer.ask()

        experiment_folder = os.path.join(experiment_datetime, str(experiment_number))

        optimizer_hp, model_hp, hp_markdown = \
            hyperparameters_parsing(hyperparameters, hyperparameters_space, config)

        model = ModelFactory.getclass(
            config["model"]["type"]
        )(**model_hp)

        trainable_params = filter(lambda p: p.requires_grad, model.parameters())

        optimizer = OptimizerFactory.getclass(
            config["optimizer"]["type"]
        )(trainable_params, **optimizer_hp)

        trainer = TrainerFactory.get(config)(
            model,
            optimizer,
            resume=None,
            config=config,
            unlabelled=unlabelled,
            labelled=labelled,
            helios_run=helios_run,
            experiment_folder=experiment_folder,
            **config['trainer']['options']
        )

        trainer.logger.info(hp_markdown)
        trainer.tb_writer.add_text("hyperparameters", hp_markdown)

        score = trainer.train()

        hp_optimizer.tell(hyperparameters, score)

        save_optimizer(hp_optimizer, checkpoint_path)
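
# ---------------------------------------------------------------------------
# The ask/tell loop above matches the interface of scikit-optimize's Optimizer
# (a list of skopt dimensions, .ask(), .tell() and the .yi list of reported
# scores). Assuming that is indeed the library behind Optimizer/load_optimizer,
# here is a self-contained toy example of the same pattern on a dummy
# objective; note that skopt minimizes the value passed to tell().
from skopt import Optimizer as SkoptOptimizer
from skopt.space import Real, Integer


def toy_bayesian_search():
    space = [Real(1e-5, 1e-1, prior="log-uniform", name="lr"),
             Integer(16, 256, name="hidden_size")]
    opt = SkoptOptimizer(space)
    for _ in range(10):
        lr, hidden_size = opt.ask()
        # Dummy "validation score" standing in for trainer.train().
        score = (lr - 1e-3) ** 2 + (hidden_size - 64) ** 2 * 1e-6
        opt.tell([lr, hidden_size], score)
    return min(opt.yi)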
def eval_model(model_path, dataset_dir, split):
    '''
    # MODIFY HERE #
    This function is meant to be an example.
    '''
    # # SETUP MODEL # #
    # Load your best model
    print("\nLoading model from ({}).".format(model_path))
    model = load(model_path)

    # # SETUP DATASET # #
    # Load the requested dataset
    """
    IMPORTANT

    Number of examples per split:
        "train"            = 150700
        "train_overlapped" = 544027
        "valid"            = 499
        "valid_overlapped" = 1380
        "test"             = 483

    Files available in the test folder:
        test_regions_id.txt
        test_x.dat
        test_y.txt
        train_overlapped_regions_id.txt
        train_overlapped_x.dat
        train_overlapped_y.txt
        train_regions_id.txt
        train_x.dat
        train_y.txt
        valid_overlapped_regions_id.txt
        valid_overlapped_x.dat
        valid_overlapped_y.txt
        valid_regions_id.txt
        valid_x.dat
        valid_y.txt

    You need to load the right one according to the `split`.
    """
    dataset = HoromaDataset(
        dataset_dir,
        split,
        transforms=transforms.Compose([transforms.ToPILImage(), transforms.ToTensor()])
    )

    # # INFERENCE # #
    # Use the model on the dataset to predict the classes
    prediction = model.predict(dataset)

    # We return -1 if the cluster has no label, so map -1 to an empty string.
    map_labels = np.concatenate([dataset.map_labels, np.array([''])])
    pred = map_labels[prediction]

    # # PREDICTIONS # #
    # Return the predicted classes as a numpy array of shape (nb_examples, 1)
    """
    Example:
        [['ES']
         ['EN']
         ['ES']]
    """
    return pred
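
# ---------------------------------------------------------------------------
# Example usage (paths are placeholders): run the evaluator on the test split
# and save the predicted class names, one per line.
#     preds = eval_model("results/best_model.pkl", "/path/to/horoma", "test")
#     np.savetxt("eval_pred.txt", preds, fmt='%s')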