# train
# NOTE(review): this fragment was collapsed onto a single line in the reviewed
# view. The layout below assumes only the three prints are guarded by `cv`
# and that the feature importances are always saved -- confirm.

# Stack the per-particle feature groups column-wise; `log_mass` ends up as the
# last column.
training_features = np.column_stack(
    (den_training, eig_0_training, eig_1_training, eig_2_training, log_mass)
)
print(training_features.shape)

cv = True

# A third of the column count minus one (presumably discounting the trailing
# `log_mass` column -- confirm), offered as one `max_features` candidate.
third_features = int((training_features.shape[1] - 1) / 3)

# Hyper-parameter grid handed to MLAlgorithm.
param_grid = {
    "n_estimators": [1000, 1300, 1600],
    "max_features": [third_features, "sqrt", 25, 40],
    "min_samples_leaf": [5, 15],
    # "criterion": ["mse", "mae"],
}

clf = ml.MLAlgorithm(
    training_features,
    method="regression",
    cross_validation=cv,
    split_data_method=None,
    n_jobs=60,
    save=True,
    path=saving_path + "classifier/classifier.pkl",
    param_grid=param_grid,
)

# Report the search outcome only when cross-validation was requested.
if cv is True:
    print(clf.best_estimator)
    print(clf.algorithm.best_params_)
    print(clf.algorithm.best_score_)

np.save(saving_path + "f_imp.npy", clf.feature_importances)
def _duplicate_last_column(features):
    """Return ``features`` with a copy of its last column appended.

    The last column of the stored arrays is what this script later saves as
    the true labels.  Duplicating it means that slicing ``[:, :-1]`` still
    leaves one copy of that column among the model inputs.

    Parameters
    ----------
    features : 2-D array whose last column is the label.

    Returns
    -------
    2-D array with one extra column: ``(features[:, :-1], label, label)``.
    """
    return np.column_stack(
        (features[:, :-1], features[:, -1], features[:, -1]))


features_training = _duplicate_last_column(
    np.load("/home/lls/mlhalos_code/stored_files/50k_features.npy"))
features_test = _duplicate_last_column(
    np.load("/home/lls/mlhalos_code/stored_files/features_test.npy"))

# train algorithm
algo = ml.MLAlgorithm(features_training, split_data_method=None,
                      cross_validation=False, num_cv_folds=10, n_jobs=22)

# predict probabilities
# The output below is stored as "predicted_probabilities.npy", so the class
# probabilities are requested (the previous `predict` call returned hard
# labels instead).
pred = algo.classifier.predict_proba(features_test[:, :-1])
true = features_test[:, -1]

np.save(
    "/home/lls/mlhalos_code/stored_files/true_label_feature/predicted_probabilities.npy",
    pred)
np.save(
    "/home/lls/mlhalos_code/stored_files/true_label_feature/true_labels.npy",
    true)
min_halo = 0 max_halo = 400 min_mass = ic_all.halo[max_halo]['mass'].sum() max_mass = ic_all.halo[min_halo]['mass'].sum() ic = parameters.InitialConditionsParameters(min_halo_number=min_halo, max_halo_number=max_halo, min_mass_scale=min_mass, max_mass_scale=max_mass) feat_w_EPS = features.extract_labeled_features(initial_parameters=ic, add_EPS_label=True, n_samples=50000) # Train the algorithm algorithm_50k = ml.MLAlgorithm(features_training) np.save('/Users/lls/Documents/CODE/stored_files/all_out/train_set_x.npy', algorithm_50k.X_train) np.save('/Users/lls/Documents/CODE/stored_files/all_out/test_set_x.npy', algorithm_50k.X_test) np.save('/Users/lls/Documents/CODE/stored_files/all_out/train_set_y.npy', algorithm_50k.y_train) np.save('/Users/lls/Documents/CODE/stored_files/all_out/test_set_y.npy', algorithm_50k.y_true) # Make predictions on all particles other than the training set features_left = features_full_mass[~np.in1d(np.arange(len(features_full_mass) ), index_training)] predicted_probabilities = algorithm_50k.classifier.predict_proba(
testing_ind = np.load("/share/data1/lls/regression/50k_testing_ids.npy") except IOError: print("Generating training/testing indices and saving") training_ind = np.random.choice(len(traj), 50000) np.save("/share/data1/lls/regression/50k_training_ids.npy", training_ind) testing_ind = np.arange(len(traj))[~np.in1d(range(len(traj)), training_ind)] np.save("/share/data1/lls/regression/50k_testing_ids.npy", testing_ind) feat_training = np.column_stack((traj[training_ind], halo_mass[training_ind])) X_test = traj[testing_ind] y_test = halo_mass[testing_ind] del traj del halo_mass cv = True path_clf="/share/data1/lls/regression/CV/classifier/classifier.pkl" clf = ml.MLAlgorithm(feat_training, method="regression", split_data_method=None, n_jobs=60, save=True, cross_validation=cv, path=path_clf) if cv is True: print(clf.best_estimator) print(clf.algorithm.best_params_) print(clf.algorithm.best_score_) np.save("/share/data1/lls/regression/CV/f_imp.npy", clf.feature_importances) y_predicted = clf.algorithm.predict(X_test) np.save("/share/data1/lls/regression/CV/predicted_halo_mass.npy", y_predicted) np.save("/share/data1/lls/regression/CV/true_halo_mass.npy", y_test)
training_ind) testing_ind = np.arange( len(traj))[~np.in1d(range(len(traj)), training_ind)] np.save("/share/data1/lls/try_classifier/50k_testing_ids.npy", training_ind) feat_training = np.column_stack( (traj[training_ind], true_labels[training_ind])) # X_test = traj[testing_ind] # y_test = halo_mass[testing_ind] del traj del true_labels clf = ml.MLAlgorithm( feat_training, method="classification", split_data_method=None, n_jobs=60, save=True, path="/share/data1/lls/regression/try_classifier/classifier.pkl") print(clf.best_estimator) print(clf.algorithm.best_params_) print(clf.algorithm.best_score_) np.save("/share/data1/lls/regression/try_classifier/f_imp.npy", clf.feature_importances) # y_predicted = clf.algorithm.predict(X_test) # np.save("/share/data1/lls/regression/predicted_halo_mass.npy", y_predicted) # np.save("/share/data1/lls/regression/true_halo_mass.npy", y_test)
import sys

sys.path.append("/home/lls/mlhalos_code/scripts")

import numpy as np
from mlhalos import machinelearning as ml

density_shear_features = np.load(
    "/home/lls/stored_files/shear_and_density/density_shear_features.npy")

# One row per realisation: column 0 is the validation score reported by the
# search, column 1 the AUC measured on the held-out test split.
aucs = np.zeros((10, 2))

for i in range(10):
    # Draw the 100000 rows WITHOUT replacement. With the default
    # (replace=True) the same particle can be drawn more than once and then
    # end up on both sides of the train/test split below, leaking training
    # samples into the test AUC.
    index_training_i = np.random.choice(
        len(density_shear_features), 100000, replace=False)
    features = density_shear_features[index_training_i]

    trained_algo = ml.MLAlgorithm(features,
                                  split_data_method="train_test_split",
                                  train_size=50000,
                                  n_jobs=60)
    print(trained_algo.classifier.best_params_)

    auc_validation = trained_algo.classifier.best_score_
    auc_test = ml.get_auc_score(trained_algo.predicted_proba_test,
                                trained_algo.true_label_test)

    aucs[i, 0] = auc_validation
    aucs[i, 1] = auc_test

np.save("/home/lls/stored_files/shear_and_density/aucs_val_test.npy", aucs)
(den_features[training_ind], eig_0[training_ind], class_labels[training_ind])) # X_test = np.column_stack((den_features[testing_ids], eig_0[testing_ids])) cv = True param_grid = { "n_estimators": [800, 1000, 1300], "max_features": ["auto", 0.4], "min_samples_leaf": [15, 5], "criterion": ["gini", "entropy"] } clf = ml.MLAlgorithm(feat_training, method="classification", cross_validation=cv, split_data_method=None, n_jobs=60, save=True, param_grid=param_grid, path=saving_path + "classifier/classifier.pkl") if cv is True: print(clf.best_estimator) print(clf.algorithm.best_params_) print(clf.algorithm.best_score_) np.save(saving_path + "f_imp.npy", clf.feature_importances) # classify clf = joblib.load(saving_path + "classifier/classifier.pkl") testing_ids = np.load(saving_path + "testing_ids.npy") X_test = np.column_stack((den_features[testing_ids], eig_0[testing_ids]))
density_shear_features = np.load(
    "/home/lls/stored_files/shear_and_density/density_shear_features.npy")
index_training = np.load("/home/lls/stored_files/50k_features_index.npy")

# Rows of the full feature array that belong to the stored training selection.
training_den_shear_features = density_shear_features[index_training, :]
np.save(
    "/home/lls/stored_files/shear_and_density/training_density_shear_features.npy",
    training_den_shear_features)

# train only on density+prolateness
# Keep columns 0-49 and everything from column 100 onwards; columns 50-99 are
# dropped.
first_block = training_den_shear_features[:, :50]
last_block = training_den_shear_features[:, 100:]
training_features = np.column_stack((first_block, last_block))

trained_algo = ml.MLAlgorithm(
    training_features,
    split_data_method=None,
    n_jobs=60,
    save=True,
    path="/home/lls/stored_files/shear_and_density/den+prol/classifier/classifier.pkl",
)

# Report the outcome of the parameter search.
print(trained_algo.classifier.best_score_)
print(trained_algo.classifier.best_estimator_)
print(trained_algo.classifier.best_params_)

np.save(
    "/home/lls/stored_files/shear_and_density/den+prol/feature_importances.npy",
    trained_algo.feature_importances)
# Disabled diagnostic: per-feature histograms split by the label in the last
# column (kept for reference).
#
# for i in range(50):
#     plt.hist(training_den_shear_features[np.where(training_den_shear_features[:,-1]==1)[0], i], label="in",
#              normed=True, histtype="step", bins=30)
#     plt.hist(training_den_shear_features[np.where(training_den_shear_features[:, -1] == -1)[0], i], label="out",
#              normed=True, histtype="step", bins=30)
#     plt.xlabel("feature " + str(i))
#     plt.legend(loc="best")
#     plt.savefig("/Users/lls/Documents/CODE/stored_files/all_out/distributions_50k/feature_" + str(i) +".pdf")
#     plt.clf()

# training_den_shear_features = np.column_stack((training_den_shear_features[:,50:100], training_den_shear_features[:,
# -1]))

# Train on the full training array with cross-validation and store the result.
trained_algo = ml.MLAlgorithm(
    training_den_shear_features,
    cross_validation=True,
    split_data_method=None,
    n_jobs=60,
    save=True,
    path="/home/lls/stored_files/shear_and_density/full_eigenvalues/not_rescaled/"
         "classifier/classifier.pkl",
)

# Report the outcome of the parameter search.
print(trained_algo.classifier.best_score_)
print(trained_algo.classifier.best_estimator_)
print(trained_algo.classifier.best_params_)

np.save(
    "/home/lls/stored_files/shear_and_density/full_eigenvalues/not_rescaled/feature_importances.npy",
    trained_algo.feature_importances)
def train_algorithm(features):
    """Build an ``ml.MLAlgorithm`` from ``features`` with default settings.

    :param features: array passed unchanged to ``ml.MLAlgorithm``.
    :return: the constructed ``ml.MLAlgorithm`` instance.
    """
    return ml.MLAlgorithm(features)


""" This should be done on hypatia. """

import numpy as np
from mlhalos import machinelearning as ml

# load training and test features with EPS label as feature
features_training = np.load(
    "/home/lls/mlhalos_code/stored_files/with_EPS_label/50k_features_w_EPS_label.npy")
features_test = np.load(
    "/home/lls/mlhalos_code/stored_files/with_EPS_label/features_w_EPS_test.npy")

# train algorithm
algo = ml.MLAlgorithm(
    features_training,
    split_data_method=None,
    num_cv_folds=10,
    n_jobs=22,
)

# predict probabilities
# All columns but the last are model inputs; the last column is the label.
pred = algo.classifier.predict_proba(features_test[:, :-1])
true = features_test[:, -1]

np.save(
    "/home/lls/mlhalos_code/stored_files/with_EPS_label/predicted_probabilities.npy",
    pred)
np.save(
    "/home/lls/mlhalos_code/stored_files/with_EPS_label/true_labels.npy",
    true)

# save classifier details
f_imp = algo.classifier.best_estimator_.feature_importances_
np.save(
    "/home/lls/mlhalos_code/stored_files/with_EPS_label/feature_importances.npy",
    f_imp)
def find_predicted_and_true_labels(trained_algorithm, features):
    """Predict class probabilities for ``features`` and return them with the labels.

    :param trained_algorithm: object whose ``algorithm`` attribute exposes
        ``predict_proba``.
    :param features: 2-D array; every column except the last is a model input
        and the last column holds the true label.
    :return: tuple ``(predicted_probabilities, true_labels)``.
    """
    model = trained_algorithm.algorithm
    predicted_probabilities = model.predict_proba(features[:, :-1])
    return predicted_probabilities, features[:, -1]


######################## SCRIPT ########################

if __name__ == "__main__":

    # Train the algorithm
    features_training = load_features(features_type="training")
    trained_classifier = ml.MLAlgorithm(features_training)

    # Make predictions on all particles other than the training set
    test_features = load_features(features_type="test")
    predicted_probabilities, true_labels = find_predicted_and_true_labels(
        trained_classifier, test_features)

    # Store the predictions and the matching true labels.
    np.save(
        '/Users/lls/Documents/CODE/stored_files/all_out/predicted_probabilities.npy',
        predicted_probabilities)
    np.save(
        '/Users/lls/Documents/CODE/stored_files/all_out/true_labels.npy',
        true_labels)

    #
    # Plot feature importance
    #
traj = np.load(
    "/share/data1/lls/shear_quantities/quantities_id_ordered/density_trajectories.npy")

# Training array: trajectory columns followed by the halo mass as last column.
# `training_ind`, `testing_ind` and `halo_mass` come from outside this view.
feat_training = np.column_stack((traj[training_ind], halo_mass[training_ind]))
X_test = traj[testing_ind]

# Drop the full arrays once the subsets have been taken.
del traj
del halo_mass

cv = False
clf = ml.MLAlgorithm(
    feat_training,
    method="regression",
    cross_validation=cv,
    split_data_method=None,
    n_jobs=60,
    save=True,
    path="/share/data1/lls/regression/balanced_training_set/classifier/classifier.pkl",
)

# NOTE(review): indentation was lost in the view; assumed only the prints are
# conditional on `cv` (they are skipped here because cv is False).
if cv is True:
    print(clf.best_estimator)
    print(clf.algorithm.best_params_)
    print(clf.algorithm.best_score_)

np.save("/share/data1/lls/regression/balanced_training_set/f_imp.npy",
        clf.feature_importances)

# classify
y_predicted = clf.algorithm.predict(X_test)
features_all) #### TRAINING ##### # Select 50,000 particles to use as training set index_training = np.random.choice(len(features_all), size=50000, replace=False) features_training = features_all[index_training] np.save("/home/lls/stored_files/non_rescaled/features_training_index.npy", index_training) np.save("/home/lls/stored_files/non_rescaled/50k_features_training.npy", features_training) # Train the algorithm RF = ml.MLAlgorithm(features_training, split_data_method=None, n_jobs=24) #### PREDICT PROBABILITIES ON REMAINING PARTICLES IN THE BOX ###### features_left = features_all[~np.in1d(np.arange(len(features_all) ), index_training)] predicted_probabilities = RF.classifier.predict_proba(features_left[:, :-1]) true_labels = features_left[:, -1] np.save('/home/lls/stored_files/non_rescaled/pred_proba_features_left.npy', predicted_probabilities) np.save('/home/lls/stored_files/non_rescaled/rue_labels_features_left.npy', true_labels) #### EPS PREDICTIONS ON REMAINING PARTICLES IN THE BOX ######
# NOTE(review): this fragment uses `i` and `f_imp_all`, so it presumably sits
# inside a loop whose header is outside the reviewed view; the original
# indentation was lost, so which statements belong to the loop body (in
# particular the final save) needs confirming.

# Stack the per-particle feature groups column-wise; `log_mass` ends up as the
# last column.
training_features = np.column_stack((den_training, eig_0_training, log_mass))
print(training_features.shape)

cv = True

# A third of the column count minus one, offered as one `max_features`
# candidate.
third_features = int((training_features.shape[1] - 1) / 3)

# Hyper-parameter grid handed to MLAlgorithm.
param_grid = {
    "n_estimators": [1000, 1300],
    "max_features": [third_features, "sqrt", 5, 10],
    "min_samples_leaf": [5, 15],
    # "criterion": ["mse", "mae"],
}

clf = ml.MLAlgorithm(
    training_features,
    method="regression",
    cross_validation=cv,
    split_data_method=None,
    n_jobs=60,
    param_grid=param_grid,
)

# Assumed: only the three prints are conditional on `cv`.
if cv is True:
    print(clf.best_estimator)
    print(clf.algorithm.best_params_)
    print(clf.algorithm.best_score_)

# Store this iteration's importances on their own and in the shared array.
np.save(saving_path + "f_imp_" + str(i) + ".npy", clf.feature_importances)
f_imp_all[i] = clf.feature_importances

np.save(saving_path + "f_imp_all.npy", f_imp_all)
import sys

sys.path.append("/Users/lls/Documents/mlhalos_code/")

import numpy as np
from mlhalos import machinelearning as ml

traj = np.load(
    "/Users/lls/Documents/mlhalos_files/stored_files/shear/shear_quantities/density_trajectories.npy")
halo_mass = np.load(
    "/Users/lls/Documents/mlhalos_files/stored_files/halo_mass_particles.npy")

# Draw the training particles WITHOUT replacement. With the default
# (replace=True) the same particle can be drawn more than once, so the
# training set holds fewer than 50000 distinct samples and the duplicates can
# inflate any cross-validated score printed below.
training_ind = np.random.choice(len(traj), 50000, replace=False)

# Training array: trajectory columns followed by the halo mass as last column.
feat_training = np.column_stack((traj[training_ind], halo_mass[training_ind]))

clf = ml.MLAlgorithm(feat_training, method="regression",
                     split_data_method=None)

print(clf.best_estimator)
print(clf.algorithm.best_params_)
print(clf.algorithm.best_score_)