def Svm(xtrain_count, xvalid_count, train_y, valid_y, my_tags,
        xtrain_tfidf, xvalid_tfidf,
        xtrain_tfidf_ngram, xvalid_tfidf_ngram,
        xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars):
    """Train and evaluate a linear SVM on four text representations.

    For each representation (bag of words, word-level TF-IDF, char-level
    TF-IDF, n-gram TF-IDF) a fresh ``SVC(kernel='linear', C=1)`` is fit via
    ``md.train_model`` and the accuracy, confusion matrix and per-class
    report are printed.  The four duplicated train/report sections of the
    original were folded into one helper; the report ordering is now
    uniform (accuracy, confusion matrix, classification report) — the
    original printed them in a different order in some sections.
    """
    from sklearn.svm import SVC

    def _evaluate(fmt, xtrain, xvalid):
        # Fit a fresh linear SVM on this representation and print metrics.
        predictions = md.train_model(SVC(kernel='linear', C=1),
                                     xtrain, train_y, xvalid)
        print(fmt % accuracy_score(predictions, valid_y))
        print(confusion_matrix(valid_y, predictions))
        print(classification_report(valid_y, predictions, target_names=my_tags))

    # Section order matches the original: Bow, word, char, n-gram.
    _evaluate('SVM Bow accuracy %s', xtrain_count, xvalid_count)
    _evaluate('SVM, WordLevel TF-IDF: %s', xtrain_tfidf, xvalid_tfidf)
    _evaluate('SVM, CharLevel Vectors: %s',
              xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars)
    _evaluate('SVM, N-Gram Vectors: %s', xtrain_tfidf_ngram, xvalid_tfidf_ngram)
def dtree(train, train_y, valid_y, my_tags,
          xtrain_tfidf, xvalid_tfidf,
          xtrain_tfidf_ngram, xvalid_tfidf_ngram,
          xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars):
    """Train and evaluate a depth-2 decision tree on four text representations.

    ``train`` is a two-element sequence ``(xtrain_count, xvalid_count)`` —
    note the signature differs from the sibling classifiers, which take the
    count matrices as separate arguments; kept as-is for caller
    compatibility.  Output (confusion matrix, accuracy line, report, in
    that order) is identical to the original; the four duplicated sections
    were folded into one helper.
    """
    xtrain_count = train[0]
    xvalid_count = train[1]
    from sklearn.tree import DecisionTreeClassifier

    def _evaluate(fmt, xtrain, xvalid):
        # Fit a fresh shallow tree on this representation and print metrics.
        predictions = md.train_model(DecisionTreeClassifier(max_depth=2),
                                     xtrain, train_y, xvalid)
        print(confusion_matrix(valid_y, predictions))
        print(fmt % accuracy_score(predictions, valid_y))
        print(classification_report(valid_y, predictions, target_names=my_tags))

    _evaluate('Bow Dtree accuracy %s', xtrain_count, xvalid_count)
    _evaluate('Word level TF IDF Vectors Dtree accuracy %s',
              xtrain_tfidf, xvalid_tfidf)
    _evaluate('Ngram Level TF IDF Vectors Dtree accuracy %s',
              xtrain_tfidf_ngram, xvalid_tfidf_ngram)
    _evaluate('Character Level TF IDF Vectors Dtree accuracy %s',
              xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars)
def main(data_path=Path("/home/gonzalo_franco/workspaces/python/molecules/data"),
         resolution=0.7, min_xyz=-5, max_xyz=5):
    """Load the molecule dataset and train the model.

    Parameters were added with defaults matching the original hard-coded
    values (backward compatible) so the pipeline is usable outside the
    original author's machine.

    Args:
        data_path: directory containing the molecule data files.
        resolution: grid resolution used for discretization.
        min_xyz / max_xyz: coordinate bounds of the grid.
    """
    possible_elements = ['N', 'C', 'H', 'O', 'F']
    # Map each element symbol to a stable integer index.
    possible_elements_dict = {e: i for i, e in enumerate(possible_elements)}
    # Discretized coordinate axis; rounded to avoid float drift in labels.
    range_of_values = np.round(
        np.arange(2 * min_xyz, 2 * max_xyz + resolution, resolution), 3)
    print("Loading data")
    molecule_structure_dict, train_data = processing.load_data(
        data_path, min_xyz, max_xyz, resolution)
    print("Training model")
    modeling.train_model(train_data, molecule_structure_dict,
                         possible_elements_dict, range_of_values,
                         batch_size=32, tensorboard_dir="tensorboard/test_2")
def SGD_Svm(xtrain_count, xvalid_count, train_y, valid_y, my_tags,
            xtrain_tfidf, xvalid_tfidf,
            xtrain_tfidf_ngram, xvalid_tfidf_ngram,
            xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars):
    """Train and evaluate an SGD-trained linear SVM on four representations.

    Each representation gets a fresh hinge-loss ``SGDClassifier`` with the
    same hyperparameters.  The four duplicated train/report sections were
    folded into one helper and the report ordering made uniform (accuracy,
    confusion matrix, classification report) — the original printed them in
    a different order in the bag-of-words section.
    """
    from sklearn.linear_model import SGDClassifier

    def _evaluate(fmt, xtrain, xvalid):
        # Fresh classifier per representation; fixed seed for reproducibility.
        clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                            random_state=42, max_iter=5, tol=None)
        predictions = md.train_model(clf, xtrain, train_y, xvalid)
        print(fmt % accuracy_score(predictions, valid_y))
        print(confusion_matrix(valid_y, predictions))
        print(classification_report(valid_y, predictions, target_names=my_tags))

    _evaluate('SVM_SGD for Bow accuracy %s', xtrain_count, xvalid_count)
    _evaluate('SVM_sgd, WordLevel TF-IDF: %s', xtrain_tfidf, xvalid_tfidf)
    _evaluate('SVM_sgd, N-Gram Vectors: %s',
              xtrain_tfidf_ngram, xvalid_tfidf_ngram)
    _evaluate('SVM_sgd, CharLevel Vectors: %s',
              xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars)
def model_sensitivity_method(data, args, visualizer=None, title=None):
    """
    Given a dataset `data` and arguments `args`, run a full test of private
    prediction using the model sensitivity method: train a non-private
    L2-regularized linear classifier, then add calibrated noise directly to
    the parameter vector before inference.

    Returns a `dict` mapping each split name in `data` to a tensor of
    predicted class indices.

    Note: This algorithm only guarantees privacy for models with convex
    losses, hence the linear-model assertion below.
    """
    assert args.model == "linear", f"Model {args.model} not supported."
    # initialize model and criterion:
    # classes are assumed to be labeled 0..C-1, so C = max label + 1
    num_classes = int(data["train"]["targets"].max()) + 1
    num_samples, num_features = data["train"]["features"].size()
    model = modeling.initialize_model(num_features, num_classes,
                                      device=args.device)
    criterion = nn.CrossEntropyLoss()
    # L2 regularization (weight decay) is required by the sensitivity analysis.
    regularized_criterion = modeling.add_l2_regularization(
        criterion, model, args.weight_decay)
    # train classifier:
    logging.info("Training non-private classifier...")
    modeling.train_model(model, data["train"],
                         criterion=regularized_criterion,
                         optimizer=args.optimizer,
                         num_epochs=args.num_epochs,
                         learning_rate=args.learning_rate,
                         batch_size=args.batch_size,
                         visualizer=visualizer,
                         title=title)
    # perturb model parameters:
    logging.info("Applying model sensitivity method...")
    # noise scale calibrated to (epsilon, delta) and the loss sensitivity
    scale = sensitivity_scale(args.epsilon, args.delta, args.weight_decay,
                              criterion, num_samples, args.noise_dist)
    param = modeling.get_parameter_vector(model)
    mean = torch.zeros_like(param)
    # "advanced_gaussian" only changes the scale computation above; the
    # sampler used is still the plain gaussian one from `noise`.
    noise_dist = "gaussian" if args.noise_dist in ["gaussian", "advanced_gaussian"] \
        else args.noise_dist
    perturbation = getattr(noise, noise_dist)(mean, scale)
    # in-place update outside autograd so the perturbation is not tracked
    with torch.no_grad():
        param.add_(perturbation)
    modeling.set_parameter_vector(model, param)
    # perform inference on both training and test set:
    logging.info("Performing inference with perturbed predictor...")
    predictions = {
        split: modeling.test_model(model, data_split).argmax(dim=1)
        for split, data_split in data.items()
    }
    return predictions
def knn(xtrain_count, xvalid_count, train_y, valid_y, my_tags,
        xtrain_tfidf, xvalid_tfidf,
        xtrain_tfidf_ngram, xvalid_tfidf_ngram,
        xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars):
    """Train and evaluate a 7-nearest-neighbour classifier on four
    text representations.

    Output (confusion matrix, accuracy line, report) is identical to the
    original; the four duplicated sections were folded into one helper.
    The original's copy-pasted "Ngram Level" comment on the character-level
    section was also corrected.
    """
    from sklearn.neighbors import KNeighborsClassifier

    def _evaluate(fmt, xtrain, xvalid):
        # Fit a fresh KNN(7) on this representation and print metrics.
        predictions = md.train_model(KNeighborsClassifier(n_neighbors=7),
                                     xtrain, train_y, xvalid)
        print(confusion_matrix(valid_y, predictions))
        print(fmt % accuracy_score(predictions, valid_y))
        print(classification_report(valid_y, predictions, target_names=my_tags))

    _evaluate('For Bow KNN accuracy %s', xtrain_count, xvalid_count)
    _evaluate('For Word Level TF IDF Vectors KNN accuracy %s',
              xtrain_tfidf, xvalid_tfidf)
    _evaluate('For Ngram Level TF IDF Vectors KNN accuracy %s',
              xtrain_tfidf_ngram, xvalid_tfidf_ngram)
    # Character-level n-gram TF-IDF (comment fixed: was labeled "Ngram Level").
    _evaluate('For Character Level TF IDF Vectors KNN accuracy %s',
              xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars)
def main():
    """End-to-end driver: load the DTM, build training patches, and train
    the escarpment segmentation model.

    The blocks of commented-out histogram/imshow debugging code from the
    original were removed (dead code); the pipeline itself is unchanged.
    """
    dtm = readDtm()
    # Standardize elevations to zero mean / unit variance.
    dtm = (dtm - np.mean(dtm)) / np.std(dtm)
    dtm = np.flipud(dtm)  # input data set has wrong orientation
    mask = readEscarpmentMask()
    row_list, col_list = getTileExtents(dtm)
    data = makePatches(dtm, mask, row_list, col_list)
    # Binarize the (presumably interpolated) mask values at 0.5.
    data['mask'][data['mask'] >= 0.5] = 1
    data['mask'][data['mask'] < 0.5] = 0
    # Add a trailing channel axis; train_model is called with channels=1.
    data['dtm'] = np.expand_dims(data['dtm'], axis=3)
    data['mask'] = np.expand_dims(data['mask'], axis=3)
    train_model(data['dtm'], data['mask'], model_fname='model.h5',
                N=128, channels=1)
def Naive_Bayes(xtrain_count, xvalid_count, train_y, valid_y, my_tags,
                xtrain_tfidf, xvalid_tfidf,
                xtrain_tfidf_ngram, xvalid_tfidf_ngram,
                xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars):
    """Train and evaluate multinomial Naive Bayes on four text
    representations.

    Output (accuracy line, confusion matrix, report) is byte-identical to
    the original; the four duplicated sections were folded into one helper.
    """
    from sklearn import naive_bayes

    def _evaluate(label, xtrain, xvalid):
        # Fit a fresh MultinomialNB on this representation and print metrics.
        predictions = md.train_model(naive_bayes.MultinomialNB(),
                                     xtrain, train_y, xvalid)
        print(label, accuracy_score(predictions, valid_y))
        print(confusion_matrix(valid_y, predictions))
        print(classification_report(valid_y, predictions, target_names=my_tags))

    _evaluate("MultinomialNB, Bag of words: ", xtrain_count, xvalid_count)
    _evaluate("MultinomialNB, WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf)
    _evaluate("MultinomialNB, N-Gram TF-IDF: ",
              xtrain_tfidf_ngram, xvalid_tfidf_ngram)
    _evaluate("MultinomialNB, CharLevel TF-IDF: ",
              xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars)
def Gaussian_Naive_Bayes(xtrain_count, xvalid_count, train_y, valid_y, my_tags,
                         xtrain_tfidf, xvalid_tfidf,
                         xtrain_tfidf_ngram, xvalid_tfidf_ngram,
                         xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars):
    """Train and evaluate Gaussian Naive Bayes on four text representations.

    GaussianNB requires dense input, so each sparse matrix is densified
    with ``.toarray()`` just before use (the original did this with
    repeated boilerplate; folded into the helper).  Output (confusion
    matrix, accuracy line, report) matches the original byte-for-byte.
    """
    from sklearn.naive_bayes import GaussianNB

    def _evaluate(label, xtrain, xvalid):
        # Densify per call: GaussianNB cannot consume scipy sparse matrices.
        predictions = md.train_model(GaussianNB(), xtrain.toarray(),
                                     train_y, xvalid.toarray())
        print(confusion_matrix(valid_y, predictions))
        print('gnb accuracy for %s %s' % (label,
                                          accuracy_score(predictions, valid_y)))
        print(classification_report(valid_y, predictions, target_names=my_tags))

    _evaluate('Bow', xtrain_count, xvalid_count)
    _evaluate('word level TF-IDF', xtrain_tfidf, xvalid_tfidf)
    _evaluate('N-gram TF-IDF', xtrain_tfidf_ngram, xvalid_tfidf_ngram)
    _evaluate('character level TF-IDF',
              xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars)
def Bernoulli_Naive_Bayes(xtrain_count, xvalid_count, train_y, valid_y, my_tags,
                          xtrain_tfidf, xvalid_tfidf,
                          xtrain_tfidf_ngram, xvalid_tfidf_ngram,
                          xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars):
    """Train and evaluate Bernoulli Naive Bayes on four text representations.

    Output (confusion matrix, accuracy line, report) is byte-identical to
    the original, including the first section's shorter 'Bnb accuracy %s'
    message; the four duplicated sections were folded into one helper.
    """
    from sklearn.naive_bayes import BernoulliNB

    def _evaluate(fmt, xtrain, xvalid):
        # Fit a fresh BernoulliNB on this representation and print metrics.
        predictions = md.train_model(BernoulliNB(), xtrain, train_y, xvalid)
        print(confusion_matrix(valid_y, predictions))
        print(fmt % accuracy_score(predictions, valid_y))
        print(classification_report(valid_y, predictions, target_names=my_tags))

    _evaluate('Bnb accuracy %s', xtrain_count, xvalid_count)
    _evaluate('Bnb accuracy for word level TF-IDF %s',
              xtrain_tfidf, xvalid_tfidf)
    _evaluate('Bnb accuracy for N-gram TF-IDF %s',
              xtrain_tfidf_ngram, xvalid_tfidf_ngram)
    _evaluate('Bnb accuracy for character level TF-IDF %s',
              xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars)
def main():
    """Run the full preprocessing + training pipeline for the
    document-cleaning model, driven by the module-level `prep` flags.

    Steps: optional tensorboard-log cleanup, working-copy creation,
    optional rotation augmentation and background removal, train/val
    split, dataloader construction, training, and optional model saving.
    """
    set_seed(42)
    if prep['clear_tb']:
        print('Clearing tensorboard logs directory')
        clear_tb_logs()
    print('Creating working copy of data directory')
    create_working_copy(DATA_DIR, WORK_DIR)
    if prep['rotation']:
        print('Generating rotated images')
        generate_rotated_images(pjoin(WORK_DIR, 'train', 'dirty'))
        generate_rotated_images(pjoin(WORK_DIR, 'train', 'cleaned'))
    if prep['bg_removal']:
        print('Removing background from images')
        remove_bg(pjoin(WORK_DIR, 'train', 'dirty'))
        remove_bg(pjoin(WORK_DIR, 'train', 'cleaned'))
        remove_bg(pjoin(WORK_DIR, 'test'))
    train_val_split(WORK_DIR)
    prepare_test_dir(WORK_DIR)
    train_dl, valid_dl, test_dl = get_dataloaders(
        pjoin(WORK_DIR, 'train_split'),
        pjoin(WORK_DIR, 'valid_split'),
        pjoin(WORK_DIR, 'test'))
    net = NNet(backbone=models.resnet18)
    model, losses, accuracies = train_model(net, train_dl, valid_dl, nepoch=10)
    model.eval()
    if prep['save_model']:
        # exist_ok avoids the isdir-then-makedirs check/act race.
        os.makedirs('models', exist_ok=True)
        pth = pjoin('models', str(round(time.time())) + '.pt')
        print(f'Saving trained model to {pth}')
        net.save(pth)
def main_function():
    """Train the fire-size classification model on the 'Miscellaneous' split.

    Fixes relative to the original: the local `type` (which shadowed the
    builtin) was renamed to `data_type`, and the dead
    `train_y_array = np.array(train_y)` assignment — immediately
    overwritten by `to_categorical` — was removed.  Loaded paths and all
    printed messages are unchanged.
    """
    if tf.test.gpu_device_name():
        print('Default GPU Device:{}'.format(tf.test.gpu_device_name()))
    else:
        print("Please install GPU version of TF")
    data_type = 'Miscellaneous'
    train_x = np.load('Data/'+str(data_type)+'/train/'+str(data_type)+'_train.npy')
    train_x = (train_x/255.0)  # scale 8-bit pixel values into [0, 1]
    train_y = pd.read_csv('Data/'+str(data_type)+'/train/'+str(data_type)+'_2014_2015_train.csv')
    train_y = labels_to_ints(train_y['FIRE_SIZE_CLASS'])
    # One-hot encode integer class labels for categorical cross-entropy.
    train_y_array = tf.keras.utils.to_categorical(train_y)
    input_shape = (train_x.shape[1], train_x.shape[2], train_x.shape[3])
    output_shape = train_y_array.shape[1]
    # Import ML model (must assign learning rate first)
    learning_rate = 0.05
    my_model = modeling.create_model(learning_rate, input_shape, output_shape)
    # Model hyperparameters
    epochs = 10
    batch_size = 100
    validation_split = 0.3
    # Train model (train_model returns the epoch axis and history for plotting)
    epochs, hist = modeling.train_model(my_model, train_x, train_y_array,
                                        epochs, batch_size, validation_split)
    modeling.plot_curve(epochs, hist, 'accuracy')
    print('Test modeling done')
def loss_perturbation_method(data, args, visualizer=None, title=None):
    """
    Given a dataset `data` and arguments `args`, run a full test of the
    private prediction algorithms of Chaudhuri et al. (2011) / Kifer et al.
    (2012) generalized to the multi-class setting: a random linear term is
    added to the training objective so that the *trained model itself* is
    private, and inference needs no further noise.

    Returns a `dict` containing the `predictions` (argmax class indices)
    for every split in `data`.

    Note: This algorithm only guarantees privacy under the following
    assumptions:
    - The loss is strictly convex and has a continuous Hessian.
    - The model is linear.
    - The inputs have a 2-norm restricted to be less than or equal 1.
    - The Lipschitz constant of the loss function and the spectral norm of
      the Hessian must be bounded.
    """
    assert args.model == "linear", f"Model {args.model} not supported."
    assert args.noise_dist != "advanced_gaussian", \
        "Advanced Gaussian method not supported for loss perturbation."
    # get dataset properties:
    # classes are assumed labeled 0..C-1, so C = max label + 1
    num_classes = int(data["train"]["targets"].max()) + 1
    num_samples, num_features = data["train"]["features"].size()
    # initialize model and criterion:
    model = modeling.initialize_model(num_features, num_classes,
                                      device=args.device)
    criterion = nn.CrossEntropyLoss()
    # privacy analysis dictates both the noise precision and a minimum
    # amount of L2 regularization:
    precision, weight_decay = loss_perturbation_params(args.epsilon,
                                                       args.delta,
                                                       args.noise_dist,
                                                       criterion,
                                                       num_samples,
                                                       num_classes)
    # never use less regularization than the user asked for:
    weight_decay = max(weight_decay, args.weight_decay)
    # sample loss perturbation vector:
    # `param` is only used here as a template for the perturbation's shape.
    param = modeling.get_parameter_vector(model)
    mean = torch.zeros_like(param)
    perturbation = getattr(noise, args.noise_dist)(mean, precision)
    # reshape the flat perturbation vector into per-parameter tensors:
    perturbations = [torch.zeros_like(p) for p in model.parameters()]
    modeling.set_parameter_vector(perturbations, perturbation)

    # closure implementing the loss-perturbation criterion:
    # adds the (fixed) random linear term <param, perturbation> / n.
    def loss_perturbation_criterion(predictions, targets):
        loss = criterion(predictions, targets)
        for param, perturb in zip(model.parameters(), perturbations):
            loss += ((param * perturb).sum() / num_samples)
        return loss

    # add L2-regularizer to the loss:
    regularized_criterion = modeling.add_l2_regularization(
        loss_perturbation_criterion, model, weight_decay)
    # train classifier:
    logging.info("Training classifier with loss perturbation...")
    modeling.train_model(model, data["train"],
                         criterion=regularized_criterion,
                         optimizer=args.optimizer,
                         num_epochs=args.num_epochs,
                         learning_rate=args.learning_rate,
                         batch_size=args.batch_size,
                         visualizer=visualizer,
                         title=title)
    # perform inference on both training and test set:
    logging.info("Performing inference with loss-perturbed predictor...")
    predictions = {
        split: model(data_split["features"]).argmax(dim=1)
        for split, data_split in data.items()
    }
    return predictions
def subsagg_method(data, args, visualizer=None, title=None): """ Given a dataset `data` and arguments `args`, run a full test of the private prediction algorithm of Dwork & Feldman (2018). Returns a `dict` containing the `predictions` for the training and test data. """ # unspecified inference budgets means we are trying many values: if args.inference_budget == -1: inference_budgets = INFERENCE_BUDGETS else: inference_budgets = [args.inference_budget] # split training set into disjoint subsets: data["split_train"] = split_dataset(data["train"], args.num_models) # train all classifiers: logging.info(f"Training {args.num_models} disjoint classifiers...") models = [None] * args.num_models for idx in range(args.num_models): # initialize model: logging.info(f" => training model {idx + 1} of {args.num_models}:") num_classes = int(data["train"]["targets"].max()) + 1 num_features = data["split_train"][idx]["features"].size(1) models[idx] = modeling.initialize_model(num_features, num_classes, model=args.model, device=args.device) # train using L2-regularized loss: regularized_criterion = modeling.add_l2_regularization( nn.CrossEntropyLoss(), models[idx], args.weight_decay) augmentation = (args.model != "linear") modeling.train_model(models[idx], data["split_train"][idx], criterion=regularized_criterion, optimizer=args.optimizer, num_epochs=args.num_epochs, learning_rate=args.learning_rate, batch_size=args.batch_size, augmentation=augmentation, visualizer=visualizer, title=title) # clean up: del data["split_train"] # perform inference on both training and test set: logging.info("Performing inference with private predictor...") predictions = {} for split in data.keys(): # compute predictions of each model: batch_size = data[split]["targets"].size( 0) if args.model == "linear" else 128 preds = [ modeling.test_model( model, data[split], augmentation=augmentation, batch_size=batch_size, ) for model in models ] preds = [pred.argmax(dim=1) for pred in preds] preds = 
torch.stack(preds, dim=1) # compute private predictions: if split not in predictions: predictions[split] = {} for inference_budget in inference_budgets: # privacy parameter must be corrected for inference budget: epsilon = args.epsilon / float(inference_budget) if args.delta > 0: eps, _ = advanced_compose(args.epsilon, args.delta, inference_budget, args.delta) epsilon = max(eps, epsilon) # compute and store private predictions: predictions[split][inference_budget] = \ private_prediction(preds, epsilon=epsilon) # return predictions: return predictions
def main():
    """Train (or load) the price-prediction model and evaluate it.

    Pipeline: seed RNGs, resolve the model directory, optionally
    preprocess raw CSVs into pickles, build datasets/dataloaders, then
    either train + save a new model or load an existing one, and finally
    evaluate on both splits.

    Fixes relative to the original: the two bare ``try/except: pass``
    wrappers around ``os.makedirs`` were replaced with
    ``os.makedirs(..., exist_ok=True)`` (the bare except also swallowed
    unrelated errors such as permission failures); the list-comprehension
    variable that reused the name ``nn`` (the torch module alias) was
    renamed.  All runtime behavior is otherwise unchanged.
    """
    np.random.seed(RANDOM_SEED)
    torch.manual_seed(RANDOM_SEED)
    CURRENT_TIME = time.strftime("%Y-%m-%dT%H:%M", time.localtime())
    if MODEL_LOAD_NAME:
        model_save_path = os.path.join(MODEL_ROOT, MODEL_LOAD_NAME)
    else:
        model_save_path = os.path.join(MODEL_ROOT,
                                       CURRENT_TIME + "_" + MODEL_SAVE_NAME)
    os.makedirs(model_save_path, exist_ok=True)
    LOGGER = getLogger(name="main", model_save_path=model_save_path)
    if PREPROCESS:
        LOGGER.info("START preprocessing")
        scaler_dict = csv2pickle(raw_filepath=RAW_FILEPATH,
                                 score_filepath=SCORE_FILEPATH,
                                 image_root=IMAGE_ROOT,
                                 train_filepath=TRAIN_FILEPATH,
                                 val_filepath=VAL_FILEPATH,
                                 model_save_path=model_save_path,
                                 random_seed=RANDOM_SEED)
    LOGGER.info("START Initiating Datasets")
    LOGGER.info("Build Datasets")
    LOGGER.info("Build Dataloaders")
    datasets, dataloaders = init_datasets(train_filepath=TRAIN_FILEPATH,
                                          val_filepath=VAL_FILEPATH,
                                          image_root=IMAGE_ROOT,
                                          num_col_ids=NUM_COLUMN_IDS,
                                          array_col_id=ARRAY_COLUMN_ID,
                                          transform=TRANSFORM,
                                          batch_size=BATCH_SIZE)
    dataset_sizes = {x: len(datasets[x]) for x in ["train", "val"]}
    LOGGER.info("Sample size: Train %d" % dataset_sizes['train'])
    LOGGER.info("Sample size: Val %d" % dataset_sizes['val'])
    num_tabular_features = len(datasets["train"][0]["ftrs"])
    LOGGER.info("Numeric features count: %d" % len(NUM_COLUMN_IDS))
    LOGGER.info("Embedding features dimension: %d" %
                (num_tabular_features - len(NUM_COLUMN_IDS)))
    # comprehension variable renamed from `nn` (shadowed the torch.nn alias)
    LOGGER.info("Features name: %s" %
                [col for col in
                 datasets['train'].csv_file.columns[NUM_COLUMN_IDS + [ARRAY_COLUMN_ID]]])
    LOGGER.info("START Initiating Model")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    criterion = nn.MSELoss()
    if TRAINING:
        model = PriceModel(num_ftrs=num_tabular_features,
                           hidden_units=HIDDEN_UNITS,
                           fine_tune=FINE_TUNE).to(device)
        params = model.parameters()
        optimizer = optim.SGD(params, lr=LEARNING_RATE, momentum=MOMENTUM)
        if SCHEDULER_REDUCE_ON_PLATEAU:
            exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                                              factor=0.1,
                                                              verbose=True)
        else:
            exp_lr_scheduler = lr_scheduler.StepLR(
                optimizer, step_size=SCHEDULER_STEP_SIZE)
        LOGGER.info("Model parameters: Training epochs = %d" % NUM_EPOCHS)
        LOGGER.info("Model parameters: Learning rate = %.3f" % LEARNING_RATE)
        LOGGER.info("Model parameters: Momentum = %.2f" % MOMENTUM)
        LOGGER.info("Model parameters: Scheduler step size = %d" %
                    SCHEDULER_STEP_SIZE)
        LOGGER.info("Model parameters: FC layers = %s" %
                    ([2048 + num_tabular_features] + HIDDEN_UNITS + [1]))
        LOGGER.info("Model parameters: Fine tuning last %d layers" % FINE_TUNE)
        # train/save/read model
        # NOTE(review): `scaler_dict` is only defined when PREPROCESS is
        # True — running with TRAINING and not PREPROCESS raises
        # NameError here; confirm intended configuration coupling.
        LOGGER.info("START Training Model")
        model, all_records, best_records = train_model(
            dataloaders=dataloaders,
            dataset_sizes=dataset_sizes,
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=exp_lr_scheduler,
            num_epochs=NUM_EPOCHS,
            device=device,
            min_max_scaler=scaler_dict["winning_bid"],
            model_save_path=model_save_path)
        model_load_path = model_save_path
        LOGGER.info("Save model to %s" % model_save_path)
        save_model(model_save_path=model_save_path, model=model,
                   all_records=all_records, best_records=best_records,
                   scaler_dict=scaler_dict)
    else:
        model_load_path = os.path.join(MODEL_ROOT, MODEL_LOAD_NAME)
        LOGGER.info("Load model from %s" % model_load_path)
        model, all_records, best_records, scaler_dict = load_model(
            model_load_path=model_load_path)
    # eval model
    LOGGER.info("START Evaluation")
    model = model.to("cpu")
    evaluate_model_price(model, device, datasets, "train", scaler_dict,
                         TRAIN_FILEPATH, model_load_path)
    evaluate_model_price(model, device, datasets, "val", scaler_dict,
                         VAL_FILEPATH, model_load_path)
    visualization_save_path = os.path.join(model_load_path,
                                           "eval_visualizations")
    os.makedirs(visualization_save_path, exist_ok=True)
    evaluate_model(model, dataset=datasets["val"], idxs=EVALUATION_IDXS,
                   scaler=scaler_dict["winning_bid"],
                   save_path=visualization_save_path,
                   model_load_path=model_load_path)
    return None
def process(args):
    """Command dispatcher for the aircon-control pipeline.

    Handles the 'process', 'train', 'predict', 'evaluate' and 'reinforce'
    sub-commands.  Fix relative to the original: the Python 2 ``print``
    statements (a syntax error under Python 3) were converted to
    ``print()`` calls; all printed strings are byte-identical.

    Raises:
        EngineError: if ``args.command`` is not a known sub-command.
    """
    global log_to_file
    global SENSOR_CSV
    if args.log:
        log_to_file = True
        logging.basicConfig(filename=args.log, level=logging.INFO)
    else:
        logging.basicConfig(stream=sys.stderr, level=logging.INFO)
    if args.command == 'process':
        from modeling import process_data
        if not args.csv_file:
            args.csv_file = SENSOR_CSV
        # process the raw data
        df = process_data(args.csv_file)
        # save data to file
        save_data(df, args.data_file)
    elif args.command == 'train':
        from modeling import process_data
        from modeling import train_model
        if args.csv_file:
            # input is csv file: process the raw data then save it
            df = process_data(args.csv_file)
            save_data(df, args.data_file)
        else:
            # input is processed data file: load it
            df = load_data(args.data_file)
        # for aircon in ON status, build model to predict TURN OFF action
        on_model = train_model(df[0], df[1], args.classifier)
        # for aircon in OFF status, build model to predict TURN ON action
        off_model = train_model(df[2], df[3], args.classifier)
        # save models to file
        save_model([on_model, off_model], args.model_file)
    elif args.command == 'predict':
        from modeling import ACTION_TURN_OFF
        from modeling import ACTION_TURN_ON
        from modeling import ACTION_NOTHING
        # parse the input values from sensors
        status, inputs = parse_sensors(args.sensors)
        # load prediction models
        on_model, off_model = load_model(args.model_file)
        if status == 0:
            # aircon is OFF, predict TURN ON
            action = predict(off_model, inputs)
        else:
            # aircon is ON, predict TURN OFF
            action = predict(on_model, inputs)
        if action == ACTION_TURN_ON:
            print("TURN_ON")
        elif action == ACTION_TURN_OFF:
            print("TURN_OFF")
        else:
            print("DO_NOTHING")
    elif args.command == 'evaluate':
        import modeling
        if args.csv_file:
            # input is raw data: process it
            df = modeling.process_data(args.csv_file)
        else:
            # input is processed data: load it
            df = load_data(args.data_file)
        print("\n\nPerformance for TURN ON prediction")
        con_mats = modeling.evaluate_model(df[2], df[3], args.classifier)
        fold = 1
        for c1, c2 in con_mats:
            print("\nPrediction performance for fold " + str(fold))
            print("\n... on training data")
            print_confusion_matrix(c1, ("NOTHING", "TURN-ON"))
            print("\n... on testing data")
            print_confusion_matrix(c2, ("NOTHING", "TURN-ON"))
            fold += 1
        print("\n\nPerformance for TURN OFF prediction")
        # cross validation evaluation
        con_mats = modeling.evaluate_model(df[0], df[1], args.classifier)
        # report the performance
        fold = 1
        for c1, c2 in con_mats:
            print("\nPrediction performance for fold " + str(fold))
            print("\n... on training data")
            print_confusion_matrix(c1, ("DO-NOTHING", "TURN-OFF"))
            print("\n... on testing data")
            print_confusion_matrix(c2, ("DO-NOTHING", "TURN-OFF"))
            fold += 1
    elif args.command == 'reinforce':
        reinforce()
    else:
        raise EngineError("unknown command")
def logit_sensitivity_method(data, args, visualizer=None, title=None):
    """
    Given a dataset `data` and arguments `args`, run a full test of the
    logit sensitivity method: train a non-private L2-regularized linear
    classifier, then add calibrated noise to the *logits* at inference
    time, with the noise scale adjusted for the number of queries
    (inference budget).

    Returns a `dict` mapping split name -> {inference_budget: predictions}.

    Note: This algorithm only guarantees privacy for models with convex
    losses, hence the linear-model assertion below.
    """
    assert args.model == "linear", f"Model {args.model} not supported."
    # unspecified inference budgets means we are trying many values:
    if args.inference_budget == -1:
        inference_budgets = INFERENCE_BUDGETS
    else:
        inference_budgets = [args.inference_budget]
    # initialize model and criterion:
    # classes assumed labeled 0..C-1, so C = max label + 1
    num_classes = int(data["train"]["targets"].max()) + 1
    num_samples, num_features = data["train"]["features"].size()
    model = modeling.initialize_model(num_features, num_classes,
                                      device=args.device)
    criterion = nn.CrossEntropyLoss()
    # L2 regularization (weight decay) is required by the sensitivity analysis.
    regularized_criterion = modeling.add_l2_regularization(
        criterion, model, args.weight_decay)
    # train classifier:
    logging.info("Training non-private classifier...")
    modeling.train_model(model, data["train"],
                         criterion=regularized_criterion,
                         optimizer=args.optimizer,
                         num_epochs=args.num_epochs,
                         learning_rate=args.learning_rate,
                         batch_size=args.batch_size,
                         visualizer=visualizer,
                         title=title)
    # perform inference on both training and test set:
    logging.info("Performing inference with private predictor...")
    predictions = {}
    for split in data.keys():
        if split not in predictions:
            predictions[split] = {}
        for inference_budget in inference_budgets:
            # account for the budget in the noise scale:
            # naive composition splits (epsilon, delta) evenly over queries
            scale = sensitivity_scale(args.epsilon / float(inference_budget),
                                      args.delta / float(inference_budget),
                                      args.weight_decay, criterion,
                                      num_samples, args.noise_dist)
            if args.delta > 0:
                # linearly search for the optimal noise scale under advanced
                # composition:
                # endpoints dropped ([1:-1]) so delta' is strictly in (0, delta)
                del_primes = torch.linspace(0, args.delta, 1000)[1:-1]
                ind_eps_del = [
                    advanced_compose(args.epsilon, args.delta,
                                     inference_budget, dp)
                    for dp in del_primes
                ]
                scales = [
                    sensitivity_scale(epsilon, delta,
                                      args.weight_decay, criterion,
                                      num_samples, args.noise_dist)
                    for epsilon, delta in ind_eps_del
                ]
                # for small budgets the naive scale may be better:
                scale = max(max(scales), scale)
            # make private predictions:
            # "advanced_gaussian" only affects the scale; sampling still uses
            # the plain gaussian sampler from `noise`.
            noise_dist = "gaussian" if args.noise_dist in ["gaussian", "advanced_gaussian"] \
                else args.noise_dist
            preds = modeling.test_model(model, data[split])
            mean = torch.zeros_like(preds).T
            # perturb the logits in place, then take the noisy argmax
            preds += getattr(noise, noise_dist)(mean, scale).T
            # make private predictions:
            predictions[split][inference_budget] = preds.argmax(dim=1)
    # return predictions:
    return predictions
def process(args):
    """
    Dispatch the requested sub-command using parsed command-line `args`.

    Supported `args.command` values:
      - 'process':   parse a raw sensor CSV and save the processed dataset.
      - 'train':     build TURN-ON / TURN-OFF prediction models and save them.
      - 'predict':   print the predicted action for one sensor reading.
      - 'evaluate':  cross-validate both models and print confusion matrices.
      - 'reinforce': run the reinforcement routine.

    Raises:
        EngineError: if `args.command` is not one of the above.

    Note: the original body used Python 2 `print` statements; they have been
    converted to the `print()` function so the module is valid Python 3.
    """
    global log_to_file
    global SENSOR_CSV
    if args.log:
        log_to_file = True
        logging.basicConfig(filename=args.log, level=logging.INFO)
    else:
        logging.basicConfig(stream=sys.stderr, level=logging.INFO)

    if (args.command == 'process'):
        from modeling import process_data
        if not args.csv_file:
            args.csv_file = SENSOR_CSV
        # process the raw data
        df = process_data(args.csv_file)
        # save data to file
        save_data(df, args.data_file)

    elif (args.command == 'train'):
        from modeling import process_data
        from modeling import train_model
        if (args.csv_file):
            # input is csv file: process the raw data
            df = process_data(args.csv_file)
            # save data to file
            save_data(df, args.data_file)
        else:
            # input is processed data file: load the processed data
            df = load_data(args.data_file)
        # for aircon in ON status, build model to predict TURN OFF action
        on_model = train_model(df[0], df[1], args.classifier)
        # for aircon in OFF status, build model to predict TURN ON action
        off_model = train_model(df[2], df[3], args.classifier)
        # save models to file
        save_model([on_model, off_model], args.model_file)

    elif (args.command == 'predict'):
        from modeling import ACTION_TURN_OFF
        from modeling import ACTION_TURN_ON
        from modeling import ACTION_NOTHING
        # parse the input values from sensors
        status, inputs = parse_sensors(args.sensors)
        # load prediction models
        on_model, off_model = load_model(args.model_file)
        if status == 0:
            # aircon is OFF, predict TURN ON
            action = predict(off_model, inputs)
        else:
            # aircon is ON, predict TURN OFF
            action = predict(on_model, inputs)
        if action == ACTION_TURN_ON:
            print("TURN_ON")
        elif action == ACTION_TURN_OFF:
            print("TURN_OFF")
        else:
            print("DO_NOTHING")

    elif (args.command == 'evaluate'):
        import modeling
        if (args.csv_file):
            # input is raw data: process it
            df = modeling.process_data(args.csv_file)
        else:
            # input is processed data: load it
            df = load_data(args.data_file)

        print("\n\nPerformance for TURN ON prediction")
        con_mats = modeling.evaluate_model(df[2], df[3], args.classifier)
        fold = 1
        for c1, c2 in con_mats:
            print("\nPrediction performance for fold " + str(fold))
            print("\n... on training data")
            print_confusion_matrix(c1, ("NOTHING", "TURN-ON"))
            print("\n... on testing data")
            print_confusion_matrix(c2, ("NOTHING", "TURN-ON"))
            fold += 1

        print("\n\nPerformance for TURN OFF prediction")
        # cross validation evaluation
        con_mats = modeling.evaluate_model(df[0], df[1], args.classifier)
        # report the performance
        fold = 1
        for c1, c2 in con_mats:
            print("\nPrediction performance for fold " + str(fold))
            print("\n... on training data")
            print_confusion_matrix(c1, ("DO-NOTHING", "TURN-OFF"))
            print("\n... on testing data")
            print_confusion_matrix(c2, ("DO-NOTHING", "TURN-OFF"))
            fold += 1

    elif (args.command == 'reinforce'):
        reinforce()

    else:
        raise EngineError("unknown command")
def dpsgd_method(data, args, visualizer=None, title=None):
    """
    Given a dataset `data` and arguments `args`, run a full test of private
    prediction using the differentially private SGD training method of
    dpsgd et al. (2016).

    Returns a dict mapping each split name in `data` to the predicted class
    indices (argmax of the model outputs) for that split.

    Raises:
        ValueError: if `args.optimizer` is not "sgd" or `args.delta <= 0`.
    """
    # assertions: DP-SGD's privacy analysis assumes plain SGD and delta > 0.
    if args.optimizer != "sgd":
        raise ValueError(
            f"DP-SGD does not work with {args.optimizer} optimizer.")
    if args.delta <= 0.:
        raise ValueError(
            f"Specified delta must be positive (not {args.delta}).")

    # initialize model and criterion:
    num_classes = int(data["train"]["targets"].max()) + 1
    # NOTE: the original computed num_samples twice; it is read once here.
    num_samples, num_features = data["train"]["features"].size()
    model = modeling.initialize_model(num_features, num_classes,
                                      model=args.model,
                                      device=args.device)
    regularized_criterion = modeling.add_l2_regularization(
        nn.CrossEntropyLoss(), model, args.weight_decay)

    # compute standard deviation of noise to add to gradient so that the
    # (epsilon, delta) budget holds over all epochs/batches:
    std, eps = dpsgd_privacy.compute_noise_multiplier(args.epsilon,
                                                      args.delta,
                                                      num_samples,
                                                      args.batch_size,
                                                      args.num_epochs)
    logging.info(f"DP-SGD with noise multiplier (sigma) of {std}.")
    logging.info(f"Epsilon error is {abs(eps - args.epsilon):.5f}.")

    # convert model to make differentially private gradient updates
    # (per-sample gradient clipping at args.clip, then gaussian noise):
    model = modeling.privatize_model(model, args.clip, std)

    # train classifier:
    logging.info("Training classifier using private SGD...")
    # data augmentation is only used for non-linear models:
    augmentation = (args.model != "linear")
    modeling.train_model(model, data["train"],
                         optimizer=args.optimizer,
                         criterion=regularized_criterion,
                         num_epochs=args.num_epochs,
                         learning_rate=args.learning_rate,
                         batch_size=args.batch_size,
                         momentum=0.0,
                         use_lr_scheduler=args.use_lr_scheduler,
                         augmentation=augmentation,
                         visualizer=visualizer,
                         title=title)

    # convert model back to "regular" model:
    model = modeling.unprivatize_model(model)

    # perform inference on both training and test set:
    logging.info("Performing inference with DP-SGD predictor...")
    predictions = {
        split: modeling.test_model(model, data_split,
                                   augmentation=augmentation).argmax(dim=1)
        for split, data_split in data.items()
    }
    return predictions
nyu = { 'train': NyuV2(os.path.join(data_path, 'train'), transform=transformers['train']), 'val': NyuV2(os.path.join(data_path, 'val'), transform=transformers['val']) } dataloaders = { 'train': data.DataLoader(nyu['train'], num_workers=8, batch_size=batch_size, shuffle=True), 'val': data.DataLoader(nyu['val'], num_workers=8, batch_size=batch_size, shuffle=True) } resnet_wts = './models/pretrained_resnet/model.pt' model = DEN(resnet_wts) model = model.to(device) params_to_update = utils.params_to_update(model) optimizer = optim.Adam(model.parameters(), lr=16e-5) criterion = nn.MSELoss(reduction='sum') train_model(model, dataloaders, criterion, optimizer, n_epochs, device, exp_dir, early_stopping_th)