def svmClassifier(X, y, cv_size=5):
    """Tune and fit an SVM classifier with a randomized parameter search.

    @X feature matrix of type numpy
    @y labels of type numpy
    @cv_size the k-fold size (default 5)
    @return (fitted SVC object, best cross-validation accuracy score)

    RandomizedSearchCV draws up to 16 candidates from 5 pre-sampled
    log-uniform (reciprocal) values each for C and gamma in [1, 100].
    (The old docstring described a GridSearchCV kernel/degree grid that
    this function never used.)
    """
    temp_cls_ = SVC()
    # reciprocal gives a log-uniform spread, appropriate when the scale
    # of C / gamma is unknown a priori.
    parameters = {
        'C': reciprocal(1, 100).rvs(5),
        'gamma': reciprocal(1, 100).rvs(5),
        'random_state': [0],
    }
    param_tuner_ = RandomizedSearchCV(temp_cls_, parameters, cv=cv_size,
                                      n_iter=16)
    param_tuner_.fit(X, y)
    # best_estimator_ is already refit on the full data (refit=True is the
    # default), so the old extra .fit(X, y) call was redundant.
    cls = param_tuner_.best_estimator_
    return cls, param_tuner_.best_score_
def test_pdf(self):
    """pdf matches scipy's reciprocal and vanishes outside [a, b]."""
    try:
        from scipy.stats import reciprocal
        from numpy.random import randint, uniform
        lo = randint(1, 100)
        hi = lo + randint(1, 1000)
        d = dist(lo, hi)
        # Spot-check agreement with scipy at 100 random points.
        for _ in range(100):
            pt = uniform(lo, hi)
            self.assertAlmostEqual(d.pdf(pt), reciprocal(lo, hi).pdf(pt))
    except ImportError:
        pass  # ok, no luck checking things with scipy...
    d = dist(a=10, b=5000)
    # Zero density outside the support, strictly positive at both ends.
    self.assertEqual(d.pdf(0), 0.0)
    self.assertEqual(d.pdf(6000), 0.0)
    self.assertNotEqual(d.pdf(d.a), 0.0)
    self.assertGreater(d.pdf(d.a), 0.0)
    self.assertNotEqual(d.pdf(d.b), 0.0)
    self.assertGreater(d.pdf(d.b), 0.0)
def exercise9():
    """MNIST classification with an RBF-kernel SVC tuned by randomized search."""
    # fetch_mldata was removed from scikit-learn (0.20 deprecation, gone in
    # 0.22); fetch_openml is the supported replacement for the same 70k x 784
    # MNIST data.
    dataset = datasets.fetch_openml('mnist_784', version=1, as_frame=False)
    X = dataset['data']
    y = dataset['target']
    dv = 60000  # standard MNIST train/test split point
    X_train, X_test = X[:dv], X[dv:]
    y_train, y_test = y[:dv], y[dv:]
    # Shuffle the training set so cross-validation folds are i.i.d.
    rnd_idx = np.random.permutation(dv)
    X_train, y_train = X_train[rnd_idx], y_train[rnd_idx]
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    # svc = LinearSVC(multi_class='ovr')
    svc = SVC(decision_function_shape='ovr')
    # gamma on a log-uniform scale (its magnitude is unknown); C uniform.
    param_distr = {'gamma': reciprocal(0.001, 0.1), 'C': uniform(1, 10)}
    search_cv = RandomizedSearchCV(svc, param_distributions=param_distr,
                                   n_iter=10)
    # Search on a 1k subsample to keep the kernel-SVC fits tractable.
    search_cv.fit(X_train_scaled[:1000], y_train[:1000])
    y_test_pred = search_cv.predict(X_test_scaled)
    print(accuracy_score(y_test, y_test_pred))
def tuning():
    """Randomized hyperparameter search over a Keras regressor."""
    from scipy.stats import reciprocal
    from sklearn.model_selection import RandomizedSearchCV

    # Small integer ranges for the architecture; log-uniform learning rate.
    param_distribs = {
        "n_hidden": [0, 1, 2, 3],
        "n_neurons": np.arange(1, 100),
        "learning_rate": reciprocal(3e-4, 3e-2),
    }
    keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model2)
    rnd_search_cv = RandomizedSearchCV(keras_reg, param_distribs,
                                       n_iter=10, cv=3, verbose=2)
    # Extra fit kwargs are forwarded to the underlying Keras fit call.
    rnd_search_cv.fit(X_train, y_train, epochs=100,
                      validation_data=(X_valid, y_valid),
                      callbacks=[keras.callbacks.EarlyStopping(patience=10)])
    print(rnd_search_cv.best_params_)
    rnd_search_cv.score(X_test, y_test)
    best_model = rnd_search_cv.best_estimator_.model
    print(best_model.evaluate(X_test, y_test))

# tuning()
# model = keras.models.load_model("keras_seq_minst_model.h5")  # rollback to best model
# print(model.evaluate(X_test, y_test))
def load_params_nn(p, label, features):
    """Build a hyperparameter search space for the network type ``label``.

    p        -- dict of per-model maximum settings, keyed by label
    features -- '|'-separated feature string; only the count is used
    Returns a dict suitable for a randomized parameter sampler.
    """
    from scipy.stats import reciprocal

    cfg = p[label]
    n_features = features.count('|') + 1
    params = {
        'n_hidden': np.arange(1, cfg['n_hidden'] + 1),
        'n_neurons': np.arange(1, cfg['n_neurons'] + 1),
        'input_shape': [cfg['input_shape']],
        'dropout': np.linspace(0, cfg['dropout'], num=4),
        'features': [n_features],
    }
    if label != 'cnn':
        # Log-uniform learning rate spanning two decades below the maximum.
        lr_max = cfg['learning_rate']
        params['learning_rate'] = reciprocal(lr_max / 100, lr_max)
    if label in ('lstm', 'gru'):
        params['dropout_rec'] = np.linspace(0, cfg['dropout_rec'], num=4)
    if label == 'cnn':
        params['filters'] = cfg['filters']
        params['kernel_size'] = cfg['kernel_size']
        params['pool_size'] = [x for x in range(1, cfg['pool_size'])]
        # CNN dense widths are sampled in steps of 64 instead of 1.
        params['n_neurons'] = np.arange(64, cfg['n_neurons'] + 1, 64)
    return params
def _get_rand_space_vals(param_space): """ Get user defined single hyperparameter and modify it to fit sklearn ParameterSampler input. :param param_space: (dict) user defined hyperparams space :return: (list/np.array) transformed user space definition to Distribution (scipy) or a list for categorical space """ space = None if param_space['type'] == "static": space = [param_space['search_vals']] elif param_space['type'] == "categorical": space = param_space['search_vals'] elif param_space['type'] == "normal": space = stats.norm(param_space['search_vals'][0], param_space['search_vals'][1]) elif param_space['type'] == "exp": space = stats.expon(param_space['search_vals'][0]) elif param_space['type'] == "poisson": space = stats.poisson(param_space['search_vals'][0]) elif param_space['type'] == "log-uniform": space = stats.reciprocal(param_space['search_vals'][0], param_space['search_vals'][1]) elif param_space['type'] == "uniform": space = stats.uniform(param_space['search_vals'][0], param_space['search_vals'][1]) elif param_space['type'] == "int-uniform": space = stats.randint(param_space['search_vals'][0], param_space['search_vals'][1]) return space
def train_with_svr_random_search(data_prepared, labels):
    """
    exercise 2: randomized hyperparameter search for an SVR regressor.

    :param data_prepared: preprocessed feature matrix
    :param labels: regression targets
    :return: None (prints RMSE and the best parameters)
    """
    # C on a log-uniform scale, gamma exponential: both magnitudes unknown.
    param_distribs = {
        'kernel': ['linear', 'rbf'],
        'C': reciprocal(20, 200000),
        'gamma': expon(scale=1.0),
    }
    regressor = SVR()
    search = RandomizedSearchCV(regressor,
                                param_distributions=param_distribs,
                                n_iter=50, cv=5,
                                scoring='neg_mean_squared_error',
                                verbose=2, n_jobs=4, random_state=42)
    search.fit(data_prepared, labels)
    # best_score_ is negative MSE; flip the sign before taking the root.
    mse = search.best_score_
    print("rmse: ", np.sqrt(-mse))
    print("best params: ", search.best_params_)
def __init__(self, name, params_dict):
    """Initialize the component from a config dictionary.

    name        -- component name
    params_dict -- must contain a "values" mapping with one entry per
                   parameter name: [fixed?, init, prior_type, arg1, arg2];
                   optional "latex_names" and "units" lists (length npars).

    Exits the process on any configuration error.
    """
    # Get component name
    self.name = name
    # Check that config dictionary has the correct number of entries
    if self.npars != len(params_dict["values"].keys()):
        print(
            f"Component needs to have {self.npars} number of parameters.")
        sys.exit(1)
    if "latex_names" in params_dict:
        if (self.npars != len(params_dict["latex_names"])):
            print(f"Component latex names must have size {self.npars}")
            sys.exit(1)
        self.parameter_latex_names = params_dict["latex_names"]
    if "units" in params_dict:
        if (self.npars != len(params_dict["units"])):
            print(f"Component units must have size {self.npars}")
            sys.exit(1)
        self.parameter_units = params_dict["units"]
    # Setup the priors and init parameters
    self.prior = []
    params_values = params_dict["values"]
    for pname in self.parameter_names:
        if params_values[pname][0]:
            print("Fixing parameter values is not yet supported")
            sys.exit(1)
        if params_values[pname][2] == "uniform":
            # scipy's uniform takes (loc, scale) = (lower, upper - lower).
            self.prior.append(
                uniform(params_values[pname][3],
                        params_values[pname][4] - params_values[pname][3]))
        elif params_values[pname][2] == "normal":
            self.prior.append(
                norm(params_values[pname][3], params_values[pname][4]))
        elif params_values[pname][2] == "reciprocal":
            # BUG FIX: the reciprocal (log-uniform) distribution is
            # undefined at 0, so a *zero* lower bound must be rejected.
            # The old condition (!= 0) was inverted and rejected every
            # valid non-zero bound instead.
            if params_values[pname][3] == 0:
                print(
                    "Reciprocal prior cannot have a starting interval value of 0.0"
                )
                sys.exit(1)
            self.prior.append(
                reciprocal(params_values[pname][3], params_values[pname][4]))
        else:
            print(
                f"Parameter prior distribution chosen not recognized: {params_values[pname][2]}"
            )
            sys.exit(1)
    self.prior = np.asarray(self.prior)
def exercise5_10():
    """SVR on California housing: scale, tune via randomized search, report RMSE."""
    from sklearn.datasets import fetch_california_housing
    from sklearn.model_selection import train_test_split, RandomizedSearchCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVR
    from sklearn.metrics import mean_squared_error
    from scipy.stats import reciprocal, uniform

    housing = fetch_california_housing()
    features, targets = housing["data"], housing["target"]
    train_x, test_x, train_y, test_y = train_test_split(
        features, targets, test_size=0.2, random_state=42)

    # Standardize using training-set statistics only.
    scaler = StandardScaler()
    train_x_scaled = scaler.fit_transform(train_x)
    test_x_scaled = scaler.transform(test_x)

    # gamma log-uniform in [0.001, 0.1]; C uniform in [1, 11].
    rnd_param = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)}
    rnd_search = RandomizedSearchCV(SVR(), rnd_param, cv=3, n_iter=10,
                                    verbose=2, random_state=42, n_jobs=-1)
    rnd_search.fit(train_x_scaled, train_y)

    best_estimator = rnd_search.best_estimator_
    print("Best estimator : \n{}".format(best_estimator))

    def _rmse(x_scaled, y_true):
        # Helper: RMSE of the tuned model on one data split.
        return np.sqrt(mean_squared_error(y_true,
                                          best_estimator.predict(x_scaled)))

    print("Train RMSE : {}".format(_rmse(train_x_scaled, train_y)))
    print("Test RMSE : {}".format(_rmse(test_x_scaled, test_y)))
def random_search(model, cross_valid: int, iterations: int, random_state: int, X, y):
    """Run a randomized hyperparameter search and print per-candidate RMSE.

    Returns the best estimator refit on (X, y).
    NOTE(review): the distribution dict mixes parameter names from several
    model families (forests, SVMs, linear models); sklearn rejects names the
    estimator does not accept — confirm the intended model exposes all of
    these.
    """
    param_distribs = {
        'n_estimators': stats.randint(low=1, high=200),
        'max_features': stats.randint(low=1, high=8),
        'C': stats.reciprocal(20, 200000),
        'gamma': stats.expon(scale=1.0),
        'alpha': stats.uniform()
    }
    searcher = RandomizedSearchCV(model,
                                  param_distributions=param_distribs,
                                  n_iter=iterations, cv=cross_valid,
                                  scoring='neg_mean_squared_error',
                                  random_state=random_state)
    searcher.fit(X, y)
    cv_res = searcher.cv_results_
    # Scores are negative MSE; print RMSE next to each sampled candidate.
    for mean_score, candidate in zip(cv_res['mean_test_score'],
                                     cv_res['params']):
        print(np.sqrt(-mean_score), candidate)
    return searcher.best_estimator_
def test_continous_induced_measure_ppf(self):
    """Check that the induced-measure PPF inverts the induced-measure CDF
    for a shifted beta (Jacobi) variable and a log-uniform variable."""
    degree = 2
    alpha_stat, beta_stat = 3, 3
    ab = jacobi_recurrence(
        degree+1, alpha=beta_stat-1, beta=alpha_stat-1, probability=True)

    tol = 1e-15
    var = stats.beta(alpha_stat, beta_stat, -5, 10)
    can_lb, can_ub = -1, 1
    lb, ub = var.support()
    print(lb, ub)
    cx = np.linspace(can_lb, can_ub, 51)

    def can_pdf(xx):
        # Map the canonical domain [-1, 1] onto the variable's support.
        loc, scale = lb+(ub-lb)/2, (ub-lb)/2
        return var.pdf(xx*scale+loc)*scale

    cdf_vals = continuous_induced_measure_cdf(
        can_pdf, ab, degree, can_lb, can_ub, tol, cx)
    assert np.all(cdf_vals <= 1.0)
    ppf_vals = continuous_induced_measure_ppf(
        var, ab, degree, cdf_vals, 1e-10, 1e-8)
    assert np.allclose(cx, ppf_vals)

    # loguniform appeared in scipy 1.4; older versions only expose the
    # equivalent reciprocal distribution.
    try:
        var = stats.loguniform(1.e-5, 1.e-3)
    except AttributeError:
        # BUG FIX: this was a bare ``except:``, which would also have
        # silently swallowed unrelated errors (KeyboardInterrupt included).
        var = stats.reciprocal(1.e-5, 1.e-3)

    ab = get_recursion_coefficients_from_variable(var, degree+5, {})
    can_lb, can_ub = -1, 1
    cx = np.linspace(can_lb, can_ub, 51)
    lb, ub = var.support()

    def can_pdf(xx):
        loc, scale = lb+(ub-lb)/2, (ub-lb)/2
        return var.pdf(xx*scale+loc)*scale

    cdf_vals = continuous_induced_measure_cdf(
        can_pdf, ab, degree, can_lb, can_ub, tol, cx)
    # differences caused by root finding optimization tolerance
    assert np.all(cdf_vals <= 1.0)
    ppf_vals = continuous_induced_measure_ppf(
        var, ab, degree, cdf_vals, 1e-10, 1e-8)
    # import matplotlib.pyplot as plt
    # plt.plot(cx, cdf_vals)
    # plt.plot(ppf_vals, cdf_vals, 'r*', ms=2)
    # plt.show()
    assert np.allclose(cx, ppf_vals)
def exercise10():
    """California-housing SVR tuned with randomized search; prints test RMSE."""
    dataset = datasets.fetch_california_housing()
    features, targets = dataset['data'], dataset['target']
    X_train, X_test, y_train, y_test = train_test_split(features, targets,
                                                        test_size=0.2)
    # float32 keeps memory down; scale with training statistics only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.astype(np.float32))
    X_test_scaled = scaler.transform(X_test.astype(np.float32))

    # gamma log-uniform in [0.001, 0.1]; C uniform in [1, 11].
    params = {'gamma': reciprocal(0.001, 0.1), 'C': uniform(1, 10)}
    search_cv = RandomizedSearchCV(SVR(), param_distributions=params,
                                   n_iter=10)
    search_cv.fit(X_train_scaled, y_train)

    y_test_pred = search_cv.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_test_pred)
    print('rmse', np.sqrt(mse))
    print('best SVR', search_cv.best_estimator_)
class DBSCAN_Model(ModelRepresentationBase):
    """Model-registry entry describing the DBSCAN clustering wrapper."""

    klass = DBSCANWrapper
    category = StepCategories.Model
    type_of_variable = None  # TypeOfVariables.NUM
    # is_regression = False
    type_of_model = TypeOfProblem.CLUSTERING

    # Randomized-search space: eps on a log-uniform (reciprocal) scale since
    # its magnitude is data-dependent; integer ranges for the rest.
    custom_hyper = {
        "eps": reciprocal(1e-5, 1),
        "metric": ["minkowski"],
        "leaf_size": sp_randint(10, 100),
        "min_samples": sp_randint(1, 100),
        "p": sp_randint(1, 20),
        "scale_eps": [True],
    }

    # Clustering is unsupervised: y is not consumed.
    use_y = False
def reciprocal_dstr_example():
    '''
    Shows why reciprocal is good for a uniform distribution of scales (logs):
    the histogram of log(samples) is roughly flat, so every decade of scale
    is equally likely. Also prints the counts in two adjacent log-ranges.
    '''
    from scipy.stats import reciprocal
    import matplotlib.pyplot as plt
    distr = reciprocal(20, 200)
    samples = distr.rvs(10000, random_state=42)
    plt.figure(figsize=(10, 4))
    plt.subplot(121)
    # Fixed title: the old "(scale=1.0)" label was copied from an
    # exponential-distribution example and did not apply here.
    plt.title('Reciprocal distribution [20, 200]')
    plt.hist(samples, bins=50)
    plt.subplot(122)
    plt.title('Log of this distribution')
    plt.hist(np.log(samples), bins=50)
    plt.show()
    # rvs already returns an ndarray, so the old np.array() copy was
    # unnecessary. Counts in [e^3, e^4] and [e^4, e^5] should be similar.
    range3_4 = np.sum(np.logical_and(samples < np.exp(4), samples > np.exp(3)))
    range4_5 = np.sum(np.logical_and(samples > np.exp(4), samples < np.exp(5)))
    print(range3_4, range4_5)
# # ``` # $ tensorboard --logdir=tf_logs # ``` # Now you can play around with the hyperparameters (e.g. the `batch_size` or the `learning_rate`) and run training again and again, comparing the learning curves. You can even automate this process by implementing grid search or randomized search. Below is a simple implementation of a randomized search on both the batch size and the learning rate. For the sake of simplicity, the checkpoint mechanism was removed. # In[125]: from scipy.stats import reciprocal n_search_iterations = 10 for search_iteration in range(n_search_iterations): batch_size = np.random.randint(1, 100) learning_rate = reciprocal(0.0001, 0.1).rvs(random_state=search_iteration) n_inputs = 2 + 4 logdir = log_dir("logreg") print("Iteration", search_iteration) print(" logdir:", logdir) print(" batch size:", batch_size) print(" learning_rate:", learning_rate) print(" training: ", end="") reset_graph() X = tf.placeholder(tf.float32, shape=(None, n_inputs + 1), name="X") y = tf.placeholder(tf.float32, shape=(None, 1), name="y")
callbacks=[keras.callbacks.EarlyStopping(patience=5)])
mse_test = keras_reg.score(X_test, y_test)
y_pred = keras_reg.predict(X_new)

#%%
# Fix the RNG seeds so the randomized search below is reproducible.
np.random.seed(42)
tf.random.set_seed(42)

#%%
from scipy.stats import reciprocal
from sklearn.model_selection import RandomizedSearchCV

# Search space: small architectures; log-uniform learning rate.
param_distribs = {
    "n_hidden": [0, 1, 2, 3],
    "n_neurons": np.arange(1, 100),
    "learning_rate": reciprocal(3e-4, 3e-2),
}

rnd_search_cv = RandomizedSearchCV(keras_reg, param_distribs, n_iter=10,
                                   cv=3, verbose=2)
# Extra fit kwargs are forwarded to the underlying Keras fit call.
rnd_search_cv.fit(X_train, y_train, epochs=100,
                  validation_data=(X_valid, y_valid),
                  callbacks=[keras.callbacks.EarlyStopping(patience=10)])

#%%
rnd_search_cv.best_params_
y_pred = lin_svr.predict(X_train_scaled)
mse = mean_squared_error(y_train, y_pred)
mse
# RMSE
np.sqrt(mse)

#%% SVR with RBF kernel
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

# gamma log-uniform in [0.001, 0.1]; C uniform in [1, 11].
param_distributions = {'gamma': reciprocal(0.001, 0.1), 'C': uniform(1, 10)}
rnd_search_cv = RandomizedSearchCV(SVR(), param_distributions, n_iter=10,
                                   verbose=2, cv=3, random_state=42)
rnd_search_cv.fit(X_train_scaled, y_train)
rnd_search_cv.best_score_
rnd_search_cv.best_estimator_

# Training-set RMSE of the tuned model.
y_pred = rnd_search_cv.best_estimator_.predict(X_train_scaled)
mse = mean_squared_error(y_train, y_pred)
mse
np.sqrt(mse)

#%% Predict the test set
def run_random_cv_for_SVM(X_train, y_train, parameter_svm, pipe_run, scorers,
                          refit_scorer_name, number_of_samples=400, kfolds=5,
                          n_iter_search=2000, plot_best=20):
    ''' Execute random search cv

    :args:
        :X_train: feature dataframe X
        :y_train: ground truth dataframe y
        :parameter_svm: Variable parameter range for C and gamma
        :pipe_run: Pipe to run
        :scorers: Scorers
        :refit_scorer_name: Refit scorer name
        :number_of_samples: Number of samples to use from the training data. Default=400
        :kfolds: Number of folds for cross validation. Default=5
        :n_iter_search: Number of random search iterations. Default=2000
        :plot_best: Number of top results selected for narrowing the parameter range. Default=20

    :return: (narrowed parameter_svm range, results table, fitted search object)
    '''
    # Extract data subset to train on
    X_train_subset, y_train_subset = modelutil.extract_data_subset(
        X_train, y_train, number_of_samples)

    # Main set of parameters for the grid search run 2: Select solver parameter
    # Reciprocal for the logarithmic range
    params_run = {
        'model__C': reciprocal(parameter_svm.loc['param_model__C']['min'],
                               parameter_svm.loc['param_model__C']['max']),
        'model__gamma': reciprocal(parameter_svm.loc['param_model__gamma']['min'],
                                   parameter_svm.loc['param_model__gamma']['max'])
    }

    # K-Fold settings
    skf = StratifiedKFold(n_splits=kfolds)

    # run randomized search
    random_search_run = RandomizedSearchCV(pipe_run, param_distributions=params_run,
                                           n_jobs=-1, n_iter=n_iter_search, cv=skf,
                                           scoring=scorers, refit=refit_scorer_name,
                                           return_train_score=True,
                                           verbose=5).fit(X_train_subset,
                                                          y_train_subset)
    # random_search_run = RandomizedSearchCV(pipe_run, param_distributions=params_run, n_jobs=-1,
    #                                        n_iter=n_iter_search, cv=skf, scoring=scorers,
    #                                        refit=refit_scorer_name, return_train_score=True,
    #                                        iid=True, verbose=5).fit(X_train_subset, y_train_subset)

    print("Best parameters: ", random_search_run.best_params_)
    print("Best score: {:.3f}".format(random_search_run.best_score_))

    # Create the result table
    results = modelutil.generate_result_table(random_search_run,
                                              params_run, refit_scorer_name)
    # Get limits of the best values and focus in this area
    parameter_svm = generate_parameter_limits_for_SVM(results, plot_best)
    # display(parameter_svm)

    # Display results
    print(results.round(3).head(5))
    print(parameter_svm)

    return parameter_svm, results, random_search_run
def main(args=None):
    """Generate a sim_inspiral injection set and write it as LIGO-LW XML.

    Draws component masses and aligned spins from the population named by
    ``args.distribution`` (bns/nsbh/bbh x astro/broad), places sources
    approximately uniformly in comoving volume out to a detectability limit
    derived from the reference PSDs, and records the implied volumetric rate
    in the process comment.
    """
    from ligo.lw import lsctables
    from ligo.lw import utils as ligolw_utils
    from ligo.lw import ligolw
    import lal.series
    from scipy import stats

    p = parser()
    args = p.parse_args(args)

    xmldoc = ligolw.Document()
    xmlroot = xmldoc.appendChild(ligolw.LIGO_LW())
    process = register_to_xmldoc(xmldoc, p, args)

    gwcosmo = GWCosmo(
        cosmology.default_cosmology.get_cosmology_from_string(args.cosmology))

    # Mass/spin bounds and distributions for the canonical populations.
    ns_mass_min = 1.0
    ns_mass_max = 2.0
    bh_mass_min = 5.0
    bh_mass_max = 50.0

    ns_astro_spin_min = -0.05
    ns_astro_spin_max = +0.05
    ns_astro_mass_dist = stats.norm(1.33, 0.09)
    ns_astro_spin_dist = stats.uniform(ns_astro_spin_min,
                                       ns_astro_spin_max - ns_astro_spin_min)

    ns_broad_spin_min = -0.4
    ns_broad_spin_max = +0.4
    ns_broad_mass_dist = stats.uniform(ns_mass_min, ns_mass_max - ns_mass_min)
    ns_broad_spin_dist = stats.uniform(ns_broad_spin_min,
                                       ns_broad_spin_max - ns_broad_spin_min)

    bh_astro_spin_min = -0.99
    bh_astro_spin_max = +0.99
    bh_astro_mass_dist = stats.pareto(b=1.3)
    bh_astro_spin_dist = stats.uniform(bh_astro_spin_min,
                                       bh_astro_spin_max - bh_astro_spin_min)

    bh_broad_spin_min = -0.99
    bh_broad_spin_max = +0.99
    # reciprocal == log-uniform in mass for the broad BH population.
    bh_broad_mass_dist = stats.reciprocal(bh_mass_min, bh_mass_max)
    bh_broad_spin_dist = stats.uniform(bh_broad_spin_min,
                                       bh_broad_spin_max - bh_broad_spin_min)

    # Select per-component bounds and distributions from the requested name.
    if args.distribution.startswith('bns_'):
        m1_min = m2_min = ns_mass_min
        m1_max = m2_max = ns_mass_max
        if args.distribution.endswith('_astro'):
            x1_min = x2_min = ns_astro_spin_min
            x1_max = x2_max = ns_astro_spin_max
            m1_dist = m2_dist = ns_astro_mass_dist
            x1_dist = x2_dist = ns_astro_spin_dist
        elif args.distribution.endswith('_broad'):
            x1_min = x2_min = ns_broad_spin_min
            x1_max = x2_max = ns_broad_spin_max
            m1_dist = m2_dist = ns_broad_mass_dist
            x1_dist = x2_dist = ns_broad_spin_dist
        else:  # pragma: no cover
            assert_not_reached()
    elif args.distribution.startswith('nsbh_'):
        m1_min = bh_mass_min
        m1_max = bh_mass_max
        m2_min = ns_mass_min
        m2_max = ns_mass_max
        if args.distribution.endswith('_astro'):
            x1_min = bh_astro_spin_min
            x1_max = bh_astro_spin_max
            x2_min = ns_astro_spin_min
            x2_max = ns_astro_spin_max
            m1_dist = bh_astro_mass_dist
            m2_dist = ns_astro_mass_dist
            x1_dist = bh_astro_spin_dist
            x2_dist = ns_astro_spin_dist
        elif args.distribution.endswith('_broad'):
            x1_min = bh_broad_spin_min
            x1_max = bh_broad_spin_max
            x2_min = ns_broad_spin_min
            x2_max = ns_broad_spin_max
            m1_dist = bh_broad_mass_dist
            m2_dist = ns_broad_mass_dist
            x1_dist = bh_broad_spin_dist
            x2_dist = ns_broad_spin_dist
        else:  # pragma: no cover
            assert_not_reached()
    elif args.distribution.startswith('bbh_'):
        m1_min = m2_min = bh_mass_min
        m1_max = m2_max = bh_mass_max
        if args.distribution.endswith('_astro'):
            x1_min = x2_min = bh_astro_spin_min
            x1_max = x2_max = bh_astro_spin_max
            m1_dist = m2_dist = bh_astro_mass_dist
            x1_dist = x2_dist = bh_astro_spin_dist
        elif args.distribution.endswith('_broad'):
            x1_min = x2_min = bh_broad_spin_min
            x1_max = x2_max = bh_broad_spin_max
            m1_dist = m2_dist = bh_broad_mass_dist
            x1_dist = x2_dist = bh_broad_spin_dist
        else:  # pragma: no cover
            assert_not_reached()
    else:  # pragma: no cover
        assert_not_reached()

    dists = (m1_dist, m2_dist, x1_dist, x2_dist)

    # Read PSDs
    psds = list(
        lal.series.read_psd_xmldoc(
            ligolw_utils.load_fileobj(
                args.reference_psd,
                contenthandler=lal.series.PSDContentHandler)).values())

    # Construct mass1, mass2, spin1z, spin2z grid.
    m1 = np.geomspace(m1_min, m1_max, 10)
    m2 = np.geomspace(m2_min, m2_max, 10)
    x1 = np.linspace(x1_min, x1_max, 10)
    x2 = np.linspace(x2_min, x2_max, 10)
    params = m1, m2, x1, x2

    # Calculate the maximum distance on the grid.
    max_z = gwcosmo.get_max_z(
        psds, args.waveform, args.f_low, args.min_snr, m1, m2, x1, x2,
        jobs=args.jobs)
    if args.max_distance is not None:
        new_max_z = cosmology.z_at_value(gwcosmo.cosmo.luminosity_distance,
                                         args.max_distance * units.Mpc)
        max_z[max_z > new_max_z] = new_max_z
    max_distance = gwcosmo.sensitive_distance(max_z).to_value(units.Mpc)

    # Find piecewise constant approximate upper bound on distance.
    max_distance = cell_max(max_distance)

    # Calculate V * T in each grid cell
    cdfs = [dist.cdf(param) for param, dist in zip(params, dists)]
    cdf_los = [cdf[:-1] for cdf in cdfs]
    cdfs = [np.diff(cdf) for cdf in cdfs]
    probs = np.prod(np.meshgrid(*cdfs, indexing='ij'), axis=0)
    probs /= probs.sum()
    probs *= 4 / 3 * np.pi * max_distance**3
    volume = probs.sum()
    probs /= volume
    probs = probs.ravel()
    volumetric_rate = args.nsamples / volume * units.year**-1 * units.Mpc**-3

    # Draw random grid cells
    dist = stats.rv_discrete(values=(np.arange(len(probs)), probs))
    indices = np.unravel_index(dist.rvs(size=args.nsamples),
                               max_distance.shape)

    # Draw random intrinsic params from each cell
    cols = {}
    cols['mass1'], cols['mass2'], cols['spin1z'], cols['spin2z'] = [
        dist.ppf(stats.uniform(cdf_lo[i], cdf[i]).rvs(size=args.nsamples))
        for i, dist, cdf_lo, cdf in zip(indices, dists, cdf_los, cdfs)
    ]

    # Swap binary components as needed to ensure that mass1 >= mass2.
    # Note that the .copy() is important.
    # See https://github.com/numpy/numpy/issues/14428
    swap = cols['mass1'] < cols['mass2']
    cols['mass1'][swap], cols['mass2'][swap] = \
        cols['mass2'][swap].copy(), cols['mass1'][swap].copy()
    cols['spin1z'][swap], cols['spin2z'][swap] = \
        cols['spin2z'][swap].copy(), cols['spin1z'][swap].copy()

    # Draw random extrinsic parameters
    cols['distance'] = stats.powerlaw(
        a=3, scale=max_distance[indices]).rvs(size=args.nsamples)
    cols['longitude'] = stats.uniform(0, 2 * np.pi).rvs(size=args.nsamples)
    cols['latitude'] = np.arcsin(stats.uniform(-1, 2).rvs(size=args.nsamples))
    cols['inclination'] = np.arccos(
        stats.uniform(-1, 2).rvs(size=args.nsamples))
    cols['polarization'] = stats.uniform(0, 2 * np.pi).rvs(size=args.nsamples)
    cols['coa_phase'] = stats.uniform(-np.pi,
                                      2 * np.pi).rvs(size=args.nsamples)
    cols['time_geocent'] = stats.uniform(1e9, units.year.to(
        units.second)).rvs(size=args.nsamples)

    # Convert from sensitive distance to redshift and comoving distance.
    # FIXME: Replace this brute-force lookup table with a solver.
    z = np.linspace(0, max_z.max(), 10000)
    ds = gwcosmo.sensitive_distance(z).to_value(units.Mpc)
    dc = gwcosmo.cosmo.comoving_distance(z).to_value(units.Mpc)
    z_for_ds = interp1d(ds, z, kind='cubic', assume_sorted=True)
    dc_for_ds = interp1d(ds, dc, kind='cubic', assume_sorted=True)
    zp1 = 1 + z_for_ds(cols['distance'])
    cols['distance'] = dc_for_ds(cols['distance'])

    # Apply redshift factor to convert from comoving distance and source frame
    # masses to luminosity distance and observer frame masses.
    for key in ['distance', 'mass1', 'mass2']:
        cols[key] *= zp1

    # Populate sim_inspiral table
    sims = xmlroot.appendChild(lsctables.New(lsctables.SimInspiralTable))
    for row in zip(*cols.values()):
        sims.appendRow(
            **dict(dict.fromkeys(sims.validcolumns, None),
                   process_id=process.process_id,
                   simulation_id=sims.get_next_id(),
                   waveform=args.waveform,
                   f_lower=args.f_low,
                   **dict(zip(cols.keys(), row))))

    # Record process end time.
    process.comment = str(volumetric_rate)
    process.set_end_time_now()

    # Write output file.
    write_fileobj(xmldoc, args.output)
def get_percentile_distr():
    """Get a distribution of percentiles geometrically spaced between 50-100
    and 5-50. Less in the middle, more at the ends."""
    second_part = (np.geomspace(5, 50, 10)).astype(int)
    first_part = (101. - np.geomspace(1, 51, 20)).astype(int)
    return np.hstack([first_part, second_part])


# define all the tunable params for each of them
LOGISTIC_TUNABLE = [{
    'classify__penalty': ['l1', 'l2'],
    # l1 and l2 regularization, l1 introduces sparsity(lasso)
    # C log-uniform over ~5 decades.
    'classify__C': stats.reciprocal(a=1e-1, b=1e4)
}]

SVM_TUNABLE = [{
    'classify__penalty': ['l2'],
    'classify__C': stats.reciprocal(a=1e-1, b=1e4),
}]

DECISION_TREE_TUNABLE = [{'classify__max_features': ['sqrt', 'log2']}]
RANDOM_FOREST_TUNABLE = [{'classify__max_features': ['sqrt', 'log2']}]

DEEP_TUNABLE = [{
    'classify__optimizer': ['adagrad', 'adam', 'rmsprop'],
    'classify__activation': ['relu', 'selu', 'sigmoid', 'tanh'],
    # NOTE(review): this entry is truncated in this chunk; its value
    # continues beyond the visible source.
    'classify__dropout':
# Plot the pdf over the central 98% of the distribution's mass.
x = np.linspace(reciprocal.ppf(0.01, a, b),
                reciprocal.ppf(0.99, a, b), 100)
ax.plot(x, reciprocal.pdf(x, a, b),
        'r-', lw=5, alpha=0.6, label='reciprocal pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.

# Freeze the distribution and display the frozen ``pdf``:

rv = reciprocal(a, b)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``:

vals = reciprocal.ppf([0.001, 0.5, 0.999], a, b)
np.allclose([0.001, 0.5, 0.999], reciprocal.cdf(vals, a, b))
# True

# Generate random numbers:

r = reciprocal.rvs(a, b, size=1000)

# And compare the histogram:

# BUG FIX: the ``normed`` keyword was removed from Matplotlib (deprecated
# in 2.1, removed in 3.1); ``density`` is the equivalent replacement for a
# normalized histogram.
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
plt.subplot(121)
plt.title("Exponential distribution (scale=1.0)")
plt.hist(samples, bins=50)
plt.subplot(122)
plt.title("Log of this distribution")
plt.hist(np.log(samples), bins=50)
plt.show()

#%% reciprocal continuous random variable
# use reciprocal distribution when you have no idea what the scale of the hyperparameter should be
# log of the samples roughly constant as scale of the samples picked from a uniform distribution
reciprocal_distrib = reciprocal(20, 200000)
samples = reciprocal_distrib.rvs(10000, random_state=42)
plt.figure(figsize=(10, 4))
plt.subplot(121)
plt.title("Reciprocal distribution (scale=1.0)")
plt.hist(samples, bins=50)
plt.subplot(122)
# The log-histogram is roughly flat: each decade of scale is equally likely.
plt.title("Log of this distribution")
plt.hist(np.log(samples), bins=50)
plt.show()

#%% linear looking data
# data. # # Notes: some combinations of the hyper-parameters proposed above are invalid. # You can make the parameter search accept such failures by setting `error_score` # to `np.nan`. The warning messages give more details on which parameter # combinations but the computation will proceed. # # Once the computation has completed, print the best combination of parameters # stored in the `best_params_` attribute. # %% from sklearn.model_selection import RandomizedSearchCV from scipy.stats import reciprocal param_distributions = { "logisticregression__C": reciprocal(0.001, 10), "logisticregression__solver": ["liblinear", "lbfgs"], "logisticregression__penalty": ["l2", "l1"], "columntransformer__cat-preprocessor__drop": [None, "first"] } model_random_search = RandomizedSearchCV( model, param_distributions=param_distributions, n_iter=20, error_score=np.nan, n_jobs=2, verbose=1) model_random_search.fit(df_train, target_train) model_random_search.best_params_
    # (tail of a stemming tokenizer whose ``def`` line precedes this chunk)
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems


def nlkt_tokenize(text):
    """Tokenize ``text`` with NLTK's default word tokenizer (no stemming)."""
    return nltk.word_tokenize(text)


# TF-IDF -> optional LSA -> random forest text-classification pipeline.
pipe = Pipeline([('tfidf', TfidfVectorizer()),
                 ('lsa', OptionalTruncatedSVD()),
                 ('clf', RandomForestClassifier())])

# Randomized-search space for the pipeline above.
params = {
    "tfidf__ngram_range": [(1, 1), (1, 2), (2, 2)],
    "tfidf__min_df": stats.randint(1, 3),
    # NOTE(review): uniform(.95, .3) samples max_df up to 1.25 — confirm
    # values above 1.0 are intended here.
    "tfidf__max_df": stats.uniform(.95, .3),
    "tfidf__sublinear_tf": [True, False],
    "tfidf__tokenizer": [None, stemmer, lemmatizer, nlkt_tokenize],
    # Repeating True biases the sampler toward keeping the LSA step.
    "lsa__passthrough": [True, False, True, True, True, True, True],
    "lsa__n_components": stats.randint(100, 3000),
    'clf__n_estimators': stats.randint(100, 300),
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_features': ['auto', 'log2', None],
    'clf__max_depth': stats.randint(10, 150),
    'clf__class_weight': [None, 'balanced'],
    # Log-uniform split/leaf fractions spanning several decades.
    'clf__min_samples_split': stats.reciprocal(.0001, .2),
    'clf__min_samples_leaf': stats.reciprocal(.0001, .2)
}
                default="accuracy",
                help="Scoring metric to be used")
args = vars(ap.parse_args())

# TODO: extend the grid to other classifiers like GaussianNB and LogisticRegression
# Per-model randomized-search distributions, keyed by model name.
randomized_params = {
    "KNeighborsClassifier": {
        "n_neighbors": randint(low=1, high=30)
    },
    "RandomForest": {
        "n_estimators": randint(low=1, high=200),
        "max_features": randint(low=1, high=8),
    },
    "SVM": {
        "kernel": ["linear", "rbf"],
        # C log-uniform over several decades; gamma exponential.
        "C": reciprocal(0.1, 200000),
        "gamma": expon(scale=1.0),
    },
}

# Estimators keyed by the same names as ``randomized_params``.
models = {
    "KNeighborsClassifier": KNeighborsClassifier(),
    "RandomForest": RandomForestClassifier(),
    "SVM": SVC(),
}

if __name__ == "__main__":
    import tensorflow as tf

    model_name = args["model"]
    scoring = args["scoring"]
if not os.path.exists(logdir):
    os.mkdir(logdir)
output_model_file = os.path.join(logdir, "model.h5")

# Wrap the Keras builder so sklearn's search API can clone and score it.
sklearn_model = keras.wrappers.scikit_learn.KerasRegressor(
    build_fn = build_model)

from scipy.stats import reciprocal
# f(x) = 1/(x*log(b/a)) a <= x <= b  (reciprocal == log-uniform)
param_distribution = {
    "hidden_layers": [1, 2, 3, 4],
    "layer_size": np.arange(1, 100),
    "learning_rate": reciprocal(1e-4, 1e-2),
}

from sklearn.model_selection import RandomizedSearchCV

# Checkpoint the best model and stop early on plateau.
callbacks = [
    keras.callbacks.ModelCheckpoint(output_model_file, save_best_only = True),
    keras.callbacks.EarlyStopping(patience=5, min_delta=1e-2),
]
random_search_cv = RandomizedSearchCV(sklearn_model, param_distribution,
                                      n_iter = 10, cv = 3, n_jobs = 1)
history = random_search_cv.fit(x_train_scaled, y_train, epochs = 30,
                               validation_data = (x_valid_scaled, y_valid),
                               callbacks = callbacks)

def plot_learning_curves(history):
    """Plot the Keras training history as learning curves."""
    pd.DataFrame(history.history).plot(figsize=(8, 5))
logger.info(
    "Running SVM algorithm on best combination of features from FLAS subsets..."
)
features_flas = self.data.combinations_flas_top10[0][0]
logger.info(f"These features are : {features_flas}")
x_train = self.X_train[features_flas].values
y_train = self.y_train.values
x_test = self.X_test[features_flas].values
y_test = self.y_test.values
# Standardize using statistics from the training split only.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
svm_flas = svm.SVC(kernel=kernel_choice)
# Hyperparameter tuning: gamma log-uniform, C uniform in [1, 11].
param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)}
rnd_search_cv_flas = RandomizedSearchCV(svm_flas, param_distributions,
                                        n_iter=10, verbose=2, cv=3)
# BUG FIX: the search was fitted on the raw, unscaled ``X_train`` attribute
# instead of the scaled feature subset ``x_train`` prepared above.
rnd_search_cv_flas.fit(x_train, y_train)
best_svm_flas = rnd_search_cv_flas.best_estimator_
logger.info("Fitting train data on SVM model...")
best_svm_flas.fit(x_train, y_train)
y_pred_flas = best_svm_flas.predict(x_test)
# BUG FIX: train accuracy used ``svm_flas``, which is never fitted (the
# search fits clones); score the refit best estimator instead.
logger.info(
    f"Train accuracy : {accuracy_score(y_train, best_svm_flas.predict(x_train))}"
)
logger.info(f"Test accuracy : {accuracy_score(y_test, y_pred_flas)}")
logger.info(
    "Saving results for svm with best combination of features from FLAS subsets..."
)
y_pred = lin_clf.predict(X_train_scaled)
accuracy_score(y_train, y_pred)

#%% Train scaled data with SVC + RBF kernel
from sklearn.svm import SVC

svm_clf = SVC(gamma="scale")
# Fit on a 10k subsample to keep the kernel SVC tractable.
svm_clf.fit(X_train_scaled[:10000], y_train[:10000])

#%% Narrow down hyperparameters with randomized search + CV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

# gamma log-uniform in [0.001, 0.1]; C uniform in [1, 11].
param_distributions = {'gamma': reciprocal(0.001, 0.1), 'C': uniform(1, 10)}
rnd_search_cv = RandomizedSearchCV(svm_clf, param_distributions, n_iter=10,
                                   verbose=2, cv=3)
# Search on 1k samples, then refit the winner on the full training set.
rnd_search_cv.fit(X_train_scaled[:1000], y_train[:1000])
rnd_search_cv.best_score_
rnd_search_cv.best_estimator_.fit(X_train_scaled, y_train)
y_pred = rnd_search_cv.best_estimator_.predict(X_train_scaled)
accuracy_score(y_train, y_pred)

#%% And now the test set
svm_reg = SVR()
grid_search = GridSearchCV(svm_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           verbose=2, n_jobs=4)
grid_search.fit(housing_prepared, housing_labels)
# Scores are negative MSE; flip the sign before taking the root.
negative_mse = grid_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse
print(grid_search.best_params_)

# exercise 2
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal

# C log-uniform over [20, 200000]; gamma exponential with scale 1.
param_distribs = {
    'kernel': ['linear', 'rbf'],
    'C': reciprocal(20, 200000),
    'gamma': expon(scale=1.0),
}

svm_reg = SVR()
rnd_search = RandomizedSearchCV(svm_reg, param_distributions=param_distribs,
                                n_iter=50, cv=5,
                                scoring='neg_mean_squared_error',
                                verbose=2, n_jobs=4, random_state=42)
rnd_search.fit(housing_prepared, housing_labels)
negative_mse = rnd_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse
rnd_search.best_params_

# exercise 3
from sklearn.base import BaseEstimator, TransformerMixin
    # (tail of ``build_model`` whose ``def`` line precedes this chunk)
    for layer in range(n_hidden):
        model.add(keras.layers.Dense(n_neurons, activation=activation))
    model.add(keras.layers.Dense(1))
    optimizer = keras.optimizers.Adam(lr=lr)
    model.compile(loss='mse', optimizer=optimizer)
    return model


from scipy.stats import reciprocal
from sklearn.model_selection import RandomizedSearchCV

param_distribs = {
    'n_hidden': [0, 1, 2, 3],
    'n_neurons': np.arange(1, 100),
    'lr': reciprocal(3e-4, 3e-2)  # log-uniform ("reciprocal") learning rate
}

keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)
rnd__search_cv = RandomizedSearchCV(keras_reg, param_distribs, n_iter=10,
                                    cv=None)

# fit
rnd__search_cv.fit(x_train, y_train, epochs=2,
                   validation_data=(x_valid, y_valid))

# evaluate, predict