def best_rp_nba(self):
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_nba_data()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
    X_train_transformed = rp.fit_transform(X_train_scl)
    X_test_transformed = rp.transform(X_test_scl)

    ## keep the top 2 components by training-set kurtosis; the test set
    ## reuses the training-set ordering so both splits keep the same columns
    kurt = kurtosis(X_train_transformed)
    i = kurt.argsort()[::-1]
    X_train_transformed = X_train_transformed[:, i][:, 0:2]
    X_test_transformed = X_test_transformed[:, i][:, 0:2]

    # save
    filename = './' + self.save_dir + '/nba_rp_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_rp_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_rp_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_rp_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def rp_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)

    ks = []
    for i in range(1000):
        ##
        ## Random Projection
        ##
        rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
        rp.fit(X_train_scl)
        X_train_rp = rp.transform(X_train_scl)
        ks.append(kurtosis(X_train_rp))

    # average kurtosis per component over the 1000 random projections
    mean_k = np.mean(ks, 0)

    ##
    ## Plots
    ##
    ph = plot_helper()

    title = 'Kurtosis (Randomized Projection) for ' + data_set_name
    name = data_set_name.lower() + '_rp_kurt'
    filename = './' + self.out_dir + '/' + name + '.png'

    ph.plot_simple_bar(np.arange(1, len(mean_k)+1, 1),
                       mean_k,
                       np.arange(1, len(mean_k)+1, 1).astype('str'),
                       'Feature Index',
                       'Kurtosis',
                       title,
                       filename)
def ica_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    ##
    ## ICA
    ##
    ica = FastICA(n_components=X_train_scl.shape[1])
    X_ica = ica.fit_transform(X_train_scl)

    ##
    ## Plots
    ##
    ph = plot_helper()

    kurt = kurtosis(X_ica)
    print(kurt)

    title = 'Kurtosis (FastICA) for ' + data_set_name
    name = data_set_name.lower() + '_ica_kurt'
    filename = './' + self.out_dir + '/' + name + '.png'

    ph.plot_simple_bar(np.arange(1, len(kurt)+1, 1),
                       kurt,
                       np.arange(1, len(kurt)+1, 1).astype('str'),
                       'Feature Index',
                       'Kurtosis',
                       title,
                       filename)
def best_ica_wine(self):
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_wine_data()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    ica = FastICA(n_components=X_train_scl.shape[1])
    X_train_transformed = ica.fit_transform(X_train_scl)
    X_test_transformed = ica.transform(X_test_scl)

    ## keep the top 2 components by training-set kurtosis; the test set
    ## reuses the training-set ordering so both splits keep the same columns
    kurt = kurtosis(X_train_transformed)
    i = kurt.argsort()[::-1]
    X_train_transformed = X_train_transformed[:, i][:, 0:2]
    X_test_transformed = X_test_transformed[:, i][:, 0:2]

    # save
    filename = './' + self.save_dir + '/wine_ica_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_ica_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_ica_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_ica_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def scale_feature_matrix(feature_M, linear=False, outliers=False):
    from sklearn.preprocessing import StandardScaler, RobustScaler
    import numpy as np

    binary_fields = [col for col in feature_M.columns
                     if len(set(feature_M[col])) == 2]

    if outliers:
        # scale to zero median & unit interquartile range
        scaler_obj = RobustScaler()
        print('centering around median')
    else:
        # scale to zero mean & unit variance
        scaler_obj = StandardScaler()
        print('centering around mean')

    print('found these binaries')
    print('-' * 10)
    print('\n'.join(binary_fields))

    X_scaled = scaler_obj.fit_transform(feature_M.drop(binary_fields, axis=1))
    # .values replaces the deprecated DataFrame.as_matrix()
    X_scaled_w_cats = np.c_[X_scaled, feature_M[binary_fields].values]
    return X_scaled_w_cats, scaler_obj
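# A minimal usage sketch for scale_feature_matrix above, on a toy frame with
# one continuous and one binary column (the column names here are hypothetical).
import pandas as pd

feature_M_demo = pd.DataFrame({'income': [30_000, 42_000, 39_000, 1_000_000],
                               'is_member': [0, 1, 1, 0]})

# outliers=True routes to RobustScaler, so the 1,000,000 outlier does not
# dominate the centering of 'income'; the binary column passes through unscaled.
X_demo, fitted_scaler = scale_feature_matrix(feature_M_demo, outliers=True)
print(X_demo.shape)  # (4, 2): scaled 'income' plus the untouched binary column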
def nn_wine_orig(self):
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_wine_data()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    self.part4.nn_analysis(X_train_scl, X_test_scl, y_train, y_test,
                           'Wine', 'Neural Network Original')
def standardize_columns(data):
    """
    We decided to robust-scale the weather features due to outliers.
    """
    columns_to_standardize = ['temp', 'atemp', 'humidity', 'windspeed']
    robust_scaler = RobustScaler()
    for column in columns_to_standardize:
        # scalers expect a 2-D array, so pass the column as a one-column frame
        data[column] = robust_scaler.fit_transform(data[[column]]).ravel()
    return data
def lda_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    ##
    ## Plots
    ##
    ph = plot_helper()

    scores = []
    train_scores = []
    rng = range(1, X_train_scl.shape[1]+1)
    for i in rng:
        lda = LinearDiscriminantAnalysis(n_components=i)

        # 3-fold cross validation (modern KFold API takes n_splits
        # and yields indices from split())
        cv = KFold(n_splits=3, shuffle=True)
        cv_scores = []
        for train, test in cv.split(X_train_scl):
            lda.fit(X_train_scl[train], y_train[train])
            score = lda.score(X_train_scl[test], y_train[test])
            cv_scores.append(score)

        mean_score = np.mean(cv_scores)
        scores.append(mean_score)

        # train score
        lda = LinearDiscriminantAnalysis(n_components=i)
        lda.fit(X_train_scl, y_train)
        train_score = lda.score(X_train_scl, y_train)
        train_scores.append(train_score)

        print(i, mean_score)

    ##
    ## Score Plot
    ##
    title = 'Score Summary Plot (LDA) for ' + data_set_name
    name = data_set_name.lower() + '_lda_score'
    filename = './' + self.out_dir + '/' + name + '.png'

    ph.plot_series(rng,
                   [scores, train_scores],
                   [None, None],
                   ['cross validation score', 'training score'],
                   cm.viridis(np.linspace(0, 1, 2)),
                   ['o', '*'],
                   title,
                   'n_components',
                   'Score',
                   filename)
def dimensionReduction(numFeatures, cateFeatures):
    """
    Robust-scale the numeric features, reduce them to five principal
    components, and append the categorical features unchanged.

    :param numFeatures: numeric feature matrix
    :param cateFeatures: categorical feature matrix
    :return: combined feature matrix
    """
    scaler = RobustScaler()
    scaledFeatures = scaler.fit_transform(numFeatures)
    pca = PCA(n_components=5)
    reducedFeatures = pca.fit_transform(scaledFeatures)
    allFeatures = np.concatenate((reducedFeatures, cateFeatures), axis=1)
    return allFeatures
def best_lda_cluster_wine(self):
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_wine_data_lda_best()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    ##
    ## K-Means
    ##
    km = KMeans(n_clusters=4, algorithm='full')
    X_train_transformed = km.fit_transform(X_train_scl)
    X_test_transformed = km.transform(X_test_scl)

    # save
    filename = './' + self.save_dir + '/wine_kmeans_lda_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_kmeans_lda_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_kmeans_lda_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_kmeans_lda_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)

    ##
    ## GMM
    ##
    # GaussianMixture has no transform(); use the fitted mixture's posterior
    # cluster probabilities as the transformed features (the original code
    # mistakenly reused the k-means model here)
    gmm = GaussianMixture(n_components=4, covariance_type='full')
    gmm.fit(X_train_scl)
    X_train_transformed = gmm.predict_proba(X_train_scl)
    X_test_transformed = gmm.predict_proba(X_test_scl)

    # save
    filename = './' + self.save_dir + '/wine_gmm_lda_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_gmm_lda_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_gmm_lda_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_gmm_lda_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def transform_dataframe(dataframe):
    """
    Robust-scale every column of the dataframe, centering on the median
    and scaling by the interquartile range so the result is resistant
    to outliers.

    Parameters:
        dataframe : input pandas dataframe
    Input types: pd.DataFrame
    Output types: pd.DataFrame
    """
    cols = list(dataframe.columns)
    robust_scaler = RobustScaler()
    # write the scaled values back into the columns;
    # fit_transform returns a bare ndarray
    dataframe[cols] = robust_scaler.fit_transform(dataframe[cols])
    return dataframe
def scale(self, columns, categorical_cols, apply_list, target_column):
    from sklearn.preprocessing import RobustScaler
    scaler = RobustScaler()

    if apply_list:
        numerical_cols = columns
    else:
        numerical_cols = []
        for col in self.dataset.columns.values:
            if col not in categorical_cols:
                numerical_cols.append(col)

    # We don't want to scale the target variable, as it is already binary.
    # The target column uses the same value as target_value from the
    # Split Data section in the settings popup.
    numerical_cols.remove(target_column)

    # Fit and transform all the numerical columns
    scaled_data = scaler.fit_transform(self.dataset[numerical_cols])
    self.dataset[numerical_cols] = scaled_data
    return self.dataset
def detect_bad_channels(inst, pick_types=None, threshold=.2):
    from sklearn.preprocessing import RobustScaler
    from sklearn.covariance import EmpiricalCovariance
    from jr.stats import median_abs_deviation

    if pick_types is None:
        pick_types = dict(meg='mag')
    inst = inst.pick_types(copy=True, **pick_types)

    cov = EmpiricalCovariance()
    cov.fit(inst._data.T)
    cov = cov.covariance_

    # center
    scaler = RobustScaler()
    cov = scaler.fit_transform(cov).T
    cov /= median_abs_deviation(cov)
    cov -= np.median(cov)

    # compute robust summary metrics
    mu = np.median(cov, axis=0)
    sigma = median_abs_deviation(cov, axis=0)
    mu /= median_abs_deviation(mu)
    sigma /= median_abs_deviation(sigma)
    distance = np.sqrt(mu ** 2 + sigma ** 2)

    bad = np.where(distance < threshold)[0]
    bad = [inst.ch_names[ch] for ch in bad]
    return bad
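# A hedged usage sketch for detect_bad_channels above, assuming an MNE Raw
# recording is available on disk (the file path is hypothetical).
import mne

raw = mne.io.read_raw_fif('subject01_raw.fif', preload=True)

# Flags magnetometers whose covariance profile deviates from the rest;
# a larger threshold flags more channels.
bads = detect_bad_channels(raw, pick_types=dict(meg='mag'), threshold=.2)
raw.info['bads'].extend(bads)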
def best_pca_wine(self):
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_wine_data()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    # PCA is unsupervised, so y_train is not passed to fit_transform
    pca = PCA(n_components=3)
    X_train_transformed = pca.fit_transform(X_train_scl)
    X_test_transformed = pca.transform(X_test_scl)

    # save
    filename = './' + self.save_dir + '/wine_pca_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_pca_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_pca_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_pca_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def best_lda_nba(self):
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_nba_data()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    lda = LinearDiscriminantAnalysis(n_components=2)
    X_train_transformed = lda.fit_transform(X_train_scl, y_train)
    X_test_transformed = lda.transform(X_test_scl)

    # save
    filename = './' + self.save_dir + '/nba_lda_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_lda_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_lda_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_lda_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
if end_index > nr:
    end_index = nr
if start_index > nr:
    end_index = nr + 1

test_size = 0.20
if pf is True:
    test_size = 0.05

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
X_test = X[0:nr_test]
Y_test = Y[0:nr_test]
X_train = X[nr_test+1:len(X)]
Y_train = Y[nr_test+1:len(X)]

X_train = robust_scaler.fit_transform(X_train)
# save the fitted robust scaler alongside the model
joblib.dump(robust_scaler, base_path + 'data/rs-' + algorithm + '-' + str(ps[psi]) + '.pkl')
X_test = robust_scaler.transform(X_test)

if algorithm == 'kernel-approx':
    rbf_feature = RBFSampler(gamma=1, random_state=1)
    X_train = rbf_feature.fit_transform(X_train)
    # transform only: the test set must reuse the fit from the training set
    X_test = rbf_feature.transform(X_test)
elif algorithm == 'mlp':
    n_output = len(set(Y))
    # n_output = 2460
    n_input = len(X_train[0]) + 1
    n_neurons = int(round(sqrt(n_input * n_output)))
def gmm_analysis(self, X_train, X_test, y_train, y_test, data_set_name,
                 max_clusters, analysis_name='GMM'):
    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    em_bic = []
    em_aic = []
    em_completeness_score = []
    em_homogeneity_score = []
    em_measure_score = []
    em_adjusted_rand_score = []
    em_adjusted_mutual_info_score = []

    cluster_range = np.arange(2, max_clusters+1, 1)
    for k in cluster_range:
        print('K Clusters: ', k)

        ##
        ## Expectation Maximization
        ##
        em = GaussianMixture(n_components=k, covariance_type='full')
        em.fit(X_train_scl)
        em_pred = em.predict(X_train_scl)

        em_bic.append(em.bic(X_train_scl))
        em_aic.append(em.aic(X_train_scl))

        # metrics
        y_train_score = y_train.reshape(y_train.shape[0],)
        em_homogeneity_score.append(homogeneity_score(y_train_score, em_pred))
        em_completeness_score.append(completeness_score(y_train_score, em_pred))
        em_measure_score.append(v_measure_score(y_train_score, em_pred))
        em_adjusted_rand_score.append(adjusted_rand_score(y_train_score, em_pred))
        em_adjusted_mutual_info_score.append(adjusted_mutual_info_score(y_train_score, em_pred))

    ##
    ## Plots
    ##
    ph = plot_helper()

    ##
    ## BIC/AIC Plot
    ##
    title = 'Information Criterion Plot (' + analysis_name + ') for ' + data_set_name
    name = data_set_name.lower() + '_' + analysis_name.lower() + '_ic'
    filename = './' + self.out_dir + '/' + name + '.png'

    ph.plot_series(cluster_range,
                   [em_bic, em_aic],
                   [None, None],
                   ['bic', 'aic'],
                   cm.viridis(np.linspace(0, 1, 2)),
                   ['o', '*'],
                   title,
                   'Number of Clusters',
                   'Information Criterion',
                   filename)

    ##
    ## Score Plot
    ##
    title = 'Score Summary Plot (' + analysis_name + ') for ' + data_set_name
    name = data_set_name.lower() + '_' + analysis_name.lower() + '_score'
    filename = './' + self.out_dir + '/' + name + '.png'

    # one entry per plotted series (five series, five styles)
    ph.plot_series(cluster_range,
                   [em_homogeneity_score, em_completeness_score, em_measure_score,
                    em_adjusted_rand_score, em_adjusted_mutual_info_score],
                   [None, None, None, None, None],
                   ['homogeneity', 'completeness', 'measure', 'adjusted_rand',
                    'adjusted_mutual_info'],
                   cm.viridis(np.linspace(0, 1, 5)),
                   ['o', '^', 'v', '>', '<'],
                   title,
                   'Number of Clusters',
                   'Score',
                   filename)
print(df['Amount'].cummin())
original_df = df
# print(df.head(5))
df = df.sample(frac=1, random_state=rand_state)

# SCALING DATA
rob_scaler = RobustScaler()
df['scaled_amount'] = rob_scaler.fit_transform(df['Amount'].values.reshape(-1, 1))
df['scaled_time'] = rob_scaler.fit_transform(df['Time'].values.reshape(-1, 1))
df.drop(['Time', 'Amount'], axis=1, inplace=True)

# move the two scaled features to the front of the frame
scaled_amount = df['scaled_amount']
scaled_time = df['scaled_time']
df.drop(['scaled_amount', 'scaled_time'], axis=1, inplace=True)
df.insert(0, 'scaled_amount', scaled_amount)
df.insert(1, 'scaled_time', scaled_time)
# The two features Time and Amount are now scaled
train["SalePrice"] = np.log1p(train["SalePrice"]) numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index all_data = pandas.get_dummies(all_data) all_data = all_data.fillna(all_data.mean()) #log transform skewed numeric features: skewness = all_data[numeric_feats].apply(lambda x: skew(x.dropna())) left_skewed_feats = skewness[skewness > 0.5].index right_skewed_feats = skewness[skewness < -0.5].index all_data[left_skewed_feats] = np.log1p(all_data[left_skewed_feats]) #all_data[right_skewed_feats] = np.exp(all_data[right_skewed_feats]) scaler = RobustScaler() all_data[numeric_feats] = scaler.fit_transform(all_data[numeric_feats]) X_train = all_data[:train.shape[0]] X_test = all_data[train.shape[0]:] y = train['SalePrice'] linear_model = ElasticNet(alpha=0.001) linear_model.fit(X_train, y) svr_model = SVR(kernel='rbf', C=2, epsilon=0.05) svr_model.fit(X_train, y) test['SalePrice'] = np.expm1( (linear_model.predict(X_test) + svr_model.predict(X_test)) / 2.0) test.to_csv('kaggle-houses-submission.csv',
def pca_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    ##
    ## PCA
    ##
    pca = PCA(n_components=X_train_scl.shape[1], svd_solver='full')
    X_pca = pca.fit_transform(X_train_scl)

    ##
    ## Plots
    ##
    ph = plot_helper()

    ##
    ## Explained Variance Plot
    ##
    title = 'Explained Variance (PCA) for ' + data_set_name
    name = data_set_name.lower() + '_pca_evar_err'
    filename = './' + self.out_dir + '/' + name + '.png'
    self.plot_explained_variance(pca, title, filename)

    ##
    ## Reconstruction Error
    ##
    all_mses, rng = self.reconstruction_error(X_train_scl, PCA)

    title = 'Reconstruction Error (PCA) for ' + data_set_name
    name = data_set_name.lower() + '_pca_rec_err'
    filename = './' + self.out_dir + '/' + name + '.png'
    ph.plot_series(rng,
                   [all_mses.mean(0)],
                   [all_mses.std(0)],
                   ['mse'],
                   ['red'],
                   ['o'],
                   title,
                   'Number of Features',
                   'Mean Squared Error',
                   filename)

    ##
    ## Manually compute eigenvalues
    ##
    cov_mat = np.cov(X_train_scl.T)
    eigen_values, eigen_vectors = np.linalg.eig(cov_mat)
    print(eigen_values)

    sorted_eigen_values = sorted(eigen_values, reverse=True)

    title = 'Eigen Values (PCA) for ' + data_set_name
    name = data_set_name.lower() + '_pca_eigen'
    filename = './' + self.out_dir + '/' + name + '.png'
    ph.plot_simple_bar(np.arange(1, len(sorted_eigen_values)+1, 1),
                       sorted_eigen_values,
                       np.arange(1, len(sorted_eigen_values)+1, 1).astype('str'),
                       'Principal Components',
                       'Eigenvalue',
                       title,
                       filename)

    ## TODO: factor the scatter plot out to a new method
def robust_scaler(data):
    scaler = RobustScaler()
    data = scaler.fit_transform(data)
    return data
test['Date'] = test['Date'].astype('datetime64[D]')
add_datepart(test, 'Date')

# recombine tfidf values with other data
train_y = np.array(train['Label'].values)
test_y = np.array(test['Label'].values)
temp_train = train.drop(['Label', 'Text'], axis=1)
train_x = np.append(train_dense, temp_train.values, axis=1)
temp_test = test.drop(['Label', 'Text'], axis=1)
test_x = np.append(test_dense, temp_test.values, axis=1)
feature_names = list(temp_train.columns)
feature_names.extend(words)

# robust-scale features and labels (median/IQR scaling; this centers and
# rescales the data but does not make it Gaussian); separate scalers are
# used so the label scaler does not clobber the feature fit
x_scaler = RobustScaler()
train_x = x_scaler.fit_transform(train_x)
test_x = x_scaler.transform(test_x)

y_scaler = RobustScaler()
train_y = y_scaler.fit_transform(train_y.reshape(-1, 1))
test_y = y_scaler.transform(test_y.reshape(-1, 1))

final_train = pd.DataFrame(train_x, columns=feature_names)
final_train['##Label##'] = train_y
final_train = final_train.astype('float16')
final_train.to_hdf('Data/final_busfin_train_' + years + '.h5', key='train')

final_test = pd.DataFrame(test_x, columns=feature_names)
final_test['##Label##'] = test_y
final_test = final_test.astype('float16')
final_test.to_hdf('Data/final_busfin_test_' + years + '.h5', key='test')
    overfit = list(overfit)
    return overfit

overfitted_features = overfit_reducer(X)

X.drop(overfitted_features, axis=1, inplace=True)
test.drop(overfitted_features, axis=1, inplace=True)
print('X.shape', X.shape)
print('test.shape', test.shape)

std_scaler = StandardScaler()
rbst_scaler = RobustScaler()
power_transformer = PowerTransformer()

X_std = std_scaler.fit_transform(X)
X_rbst = rbst_scaler.fit_transform(X)
X_pwr = power_transformer.fit_transform(X)

test_std = std_scaler.transform(test)
test_rbst = rbst_scaler.transform(test)
test_pwr = power_transformer.transform(test)

X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.002,
                                                    random_state=52)
print('X_train Shape :', X_train.shape)
print('X_test Shape :', X_test.shape)
print('y_train Shape :', y_train.shape)
print('y_test Shape :', y_test.shape)
def kmeans_analysis(self, X_train, X_test, y_train, y_test, data_set_name,
                    max_clusters, analysis_name='K-Means'):
    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    km_inertias = []
    km_completeness_score = []
    km_homogeneity_score = []
    km_measure_score = []
    km_adjusted_rand_score = []
    km_adjusted_mutual_info_score = []

    cluster_range = np.arange(2, max_clusters+1, 1)
    for k in cluster_range:
        print('K Clusters: ', k)

        ##
        ## KMeans
        ##
        km = KMeans(n_clusters=k, algorithm='full', n_jobs=-1)
        km.fit(X_train_scl)

        # inertia is the sum of distances from each point to its center
        km_inertias.append(km.inertia_)

        # metrics
        y_train_score = y_train.reshape(y_train.shape[0],)
        km_homogeneity_score.append(homogeneity_score(y_train_score, km.labels_))
        km_completeness_score.append(completeness_score(y_train_score, km.labels_))
        km_measure_score.append(v_measure_score(y_train_score, km.labels_))
        km_adjusted_rand_score.append(adjusted_rand_score(y_train_score, km.labels_))
        km_adjusted_mutual_info_score.append(adjusted_mutual_info_score(y_train_score, km.labels_))

        ##
        ## Silhouette Plot
        ##
        title = 'Silhouette Plot (' + analysis_name + ', k=' + str(k) + ') for ' + data_set_name
        name = data_set_name.lower() + '_' + analysis_name.lower() + '_silhouette_' + str(k)
        filename = './' + self.out_dir + '/' + name + '.png'
        self.silhouette_plot(X_train_scl, km.labels_, title, filename)

    ##
    ## Plots
    ##
    ph = plot_helper()

    ##
    ## Elbow Plot
    ##
    title = 'Elbow Plot (' + analysis_name + ') for ' + data_set_name
    name = data_set_name.lower() + '_' + analysis_name.lower() + '_elbow'
    filename = './' + self.out_dir + '/' + name + '.png'

    # line to help visualize the elbow
    lin = ph.extended_line_from_first_two_points(km_inertias, 0, 2)

    ph.plot_series(cluster_range,
                   [km_inertias, lin],
                   [None, None],
                   ['inertia', 'projected'],
                   cm.viridis(np.linspace(0, 1, 2)),
                   ['o', ''],
                   title,
                   'Number of Clusters',
                   'Inertia',
                   filename)

    ##
    ## Score Plot
    ##
    title = 'Score Summary Plot (' + analysis_name + ') for ' + data_set_name
    name = data_set_name.lower() + '_' + analysis_name.lower() + '_score'
    filename = './' + self.out_dir + '/' + name + '.png'

    # one entry per plotted series (five series, five styles)
    ph.plot_series(cluster_range,
                   [km_homogeneity_score, km_completeness_score, km_measure_score,
                    km_adjusted_rand_score, km_adjusted_mutual_info_score],
                   [None, None, None, None, None],
                   ['homogeneity', 'completeness', 'measure', 'adjusted_rand',
                    'adjusted_mutual_info'],
                   cm.viridis(np.linspace(0, 1, 5)),
                   ['o', '^', 'v', '>', '<'],
                   title,
                   'Number of Clusters',
                   'Score',
                   filename)
def least_square_reference(inst, empty_room=None, max_times_samples=2000,
                           bad_channels=None, scaler=None, mrk=None, elp=None,
                           hsp=None):
    """
    Fits and applies a least-square projection of the reference channels
    (potentially from an empty room) and removes the corresponding
    component from the recordings of a subject.

    Parameters
    ----------
    inst : Raw | str
        Raw instance or path to raw data.
    empty_room : str | None
        Path to raw data acquired in empty room.
    max_times_samples : int
        Number of time samples to use for pinv. Defaults to 2000.
    bad_channels : list | array, shape (n_chans,) of strings
        Lists bad channels.
    scaler : function | None
        Scaler function to normalize data. Defaults to
        sklearn.preprocessing.RobustScaler.

    Returns
    -------
    inst : Raw

    adapted from Adeen Flinker 6/2013 (<*****@*****.**>) LSdenoise.m

    Main EHN
        - Automatically detects channel types.
        - Allows flexible scaler; Robust by default.
        - The data is projected back in Tesla.
        - Allows memory control.

    TODO:
        - Allow other kinds of MNE-Python inst
        - Allow baseline selection (pre-stim instead of empty room)
        - Clean up memory
        - Allow fancy solvers (l1, etc.)
    """
    from scipy.linalg import pinv
    from mne.io import read_raw_kit
    from mne.io import _BaseRaw

    # Least square can be fitted on empty room or on subject's data
    if empty_room is None:
        if not isinstance(inst, _BaseRaw):
            raw = read_raw_kit(inst, preload=True)
        else:
            raw = inst
    else:
        if not isinstance(empty_room, _BaseRaw):
            raw = read_raw_kit(empty_room, preload=True)
        else:
            raw = empty_room

    # Parameters
    n_chans, n_times = raw._data.shape
    chan_info = raw.info["chs"]

    # KIT: axial gradiometers (equiv to mag)
    ch_mag = np.where([ch["coil_type"] == 6001 for ch in chan_info])[0]
    # KIT: ref magnetometer
    ch_ref = np.where([ch["coil_type"] == 6002 for ch in chan_info])[0]
    # Other channels
    ch_misc = np.where([ch["coil_type"] not in [6001, 6002]
                        for ch in chan_info])[0]

    # Bad channels (the combined list is stored in ch_bad, which the
    # zeroing steps below actually use)
    if (bad_channels is not None) and len(bad_channels):
        if np.all([isinstance(ch, int) for ch in bad_channels]):
            bad_channels = np.array(bad_channels)
        elif np.all([isinstance(ch, str) for ch in bad_channels]):
            bad_channels = [ii for ii, ch in enumerate(raw.ch_names)
                            if ch in bad_channels]
        else:
            raise ValueError("bad_channels needs array of int or array of str")
    else:
        bad_channels = []
    default_bad_channels = [ii for ii, ch in enumerate(raw.ch_names)
                            if ch in raw.info["bads"]]
    ch_bad = np.array(default_bad_channels + list(bad_channels), int)
    print("bad channels:", [raw.ch_names[bad] for bad in ch_bad])

    # To avoid memory errors, subsample across time
    sel_times = slice(0, n_times, int(np.ceil(n_times / max_times_samples)))

    # Whiten data
    if scaler is None:
        from sklearn.preprocessing import RobustScaler
        scaler = RobustScaler()
    data_bsl = scaler.fit_transform(raw._data.T)

    # Fit least-square coefficients on baseline data
    empty_sensors = data_bsl[:, ch_mag]
    if len(ch_bad):
        empty_sensors[:, ch_bad] = 0  # remove bad channels
    coefs = np.dot(pinv(data_bsl[sel_times, ch_ref]),
                   empty_sensors[sel_times, :])
    empty_sensors, data_bsl = None, None  # clear memory

    # Apply correction on subject data
    if empty_room is not None:
        del raw
        raw = read_raw_kit(inst, preload=True)
    data_subject = scaler.transform(raw._data.T)
    subject_sensors = (data_subject[:, ch_mag] -
                       np.dot(data_subject[:, ch_ref], coefs))

    # Remove bad channels
    if len(ch_bad):
        subject_sensors[:, ch_bad] = 0

    # Reproject baseline
    new_ref = np.dot(subject_sensors, pinv(coefs))

    # Un-whiten data to get physical units back
    data = np.concatenate((subject_sensors, new_ref,
                           raw._data[ch_misc, :].T), axis=1)
    data = scaler.inverse_transform(data)

    # Output
    raw._data = data.T
    return raw
from sklearn.svm import SVR
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import RobustScaler

path = "/Users/xiaofeifei/I/Kaggle/Benz/"
train = pd.read_csv(path + 'train_start.csv')
# test = pd.read_csv(path+'test_start.csv')

y = train["y"]
train = train.drop(["y"], axis=1)

# poly
svm = SVR(kernel='rbf', C=1.0, epsilon=0.05)

scaler = RobustScaler()
train = scaler.fit_transform(train)

kr = GridSearchCV(SVR(kernel='rbf', C=1.0, epsilon=0.05), cv=5, n_jobs=6,
                  verbose=1, scoring='r2',
                  param_grid={"C": [20, 30],
                              "epsilon": [0.02, 0.03, 0.05, 0.07]})
kr.fit(train, y)

print(kr.best_params_)
print(kr.best_score_)
print(kr.best_estimator_)
# {'epsilon': 0.01, 'C': 30}
# 0.536811148843
X1 = np.random.multivariate_normal(mean=mu1, cov=Cov, size=n_datapoints)
X2 = np.random.multivariate_normal(mean=mu2, cov=Cov, size=n_datapoints)
Y_test = np.hstack([[-1]*n_datapoints, [1]*n_datapoints])
X_test = np.vstack([X1, X2])

X_train[0, 0] = -1000  # a fairly large outlier

# Scale data
standard_scaler = StandardScaler()
Xtr_s = standard_scaler.fit_transform(X_train)
Xte_s = standard_scaler.transform(X_test)

robust_scaler = RobustScaler()
Xtr_r = robust_scaler.fit_transform(X_train)
# transform (not fit_transform): the test set must reuse the training fit
Xte_r = robust_scaler.transform(X_test)

# Plot data
fig, ax = plt.subplots(1, 3, figsize=(12, 4))
ax[0].scatter(X_train[:, 0], X_train[:, 1],
              color=np.where(Y_train > 0, 'r', 'b'))
ax[1].scatter(Xtr_s[:, 0], Xtr_s[:, 1],
              color=np.where(Y_train > 0, 'r', 'b'))
ax[2].scatter(Xtr_r[:, 0], Xtr_r[:, 1],
              color=np.where(Y_train > 0, 'r', 'b'))
ax[0].set_title("Unscaled data")
ax[1].set_title("After standard scaling (zoomed in)")
ax[2].set_title("After robust scaling (zoomed in)")

# for the scaled data, we zoom in to the data center (outlier can't be seen!)
for a in ax[1:]:
    a.set_xlim(-3, 3)
def choose_geometries(list_of_molecules, features='fingerprint',
                      maximum_number_of_seeds=8):
    if len(list_of_molecules) < 2:
        cluster_logger.info(" Not enough data to cluster (only %d), returning original"
                            % len(list_of_molecules))
        return list_of_molecules
    if len(list_of_molecules) <= maximum_number_of_seeds:
        cluster_logger.info(' Not enough data for clustering. '
                            ' Removing similar geometries from the list')
        return remove_similar(list_of_molecules)

    cluster_logger.info('Clustering on {} geometries'.format(len(list_of_molecules)))

    if features == 'fingerprint':
        dt = [pyar.representations.fingerprint(i.atoms_list, i.coordinates)
              for i in list_of_molecules]
    elif features == 'scm':
        dt = [pyar.representations.sorted_coulomb_matrix(
                  pyar.representations.coulomb_matrix(i.atoms_list, i.coordinates))
              for i in list_of_molecules]
    elif features == 'moi':
        dt = [pyar.property.get_principal_axes(i.moments_of_inertia_tensor)
              for i in list_of_molecules]
    elif features == 'rsmd':
        dt = [pyar.representations.get_rsmd(i.moments_of_inertia_tensor)
              for i in list_of_molecules]
    else:
        cluster_logger.error('This feature is not implemented')
        return list_of_molecules

    dt = np.around(dt, decimals=5)
    df = pd.DataFrame(dt)
    df.to_csv("features.csv")

    scale_it = RobustScaler()
    dt = scale_it.fit_transform(dt)

    try:
        labels = generate_labels(dt)
    except Exception as e:
        cluster_logger.exception("All clustering algorithms failed")
        cluster_logger.exception(e)
        return list_of_molecules

    best_from_each_cluster = select_best_from_each_cluster(labels, list_of_molecules)

    if len(best_from_each_cluster) == 1:
        return best_from_each_cluster

    cluster_logger.info(" Removing similar molecules after clustering.")
    reduced_best_from_each_cluster = remove_similar(best_from_each_cluster)
    if len(reduced_best_from_each_cluster) > maximum_number_of_seeds:
        return choose_geometries(reduced_best_from_each_cluster,
                                 maximum_number_of_seeds=maximum_number_of_seeds)
    return reduced_best_from_each_cluster
devtest = './exp/ivectors_semeval_devtest_NGMM_2048_W_2_DIM_200/feats.txt'
dev = './exp/ivectors_semeval_dev_NGMM_2048_W_2_DIM_200/feats.txt'
train = './exp/ivectors_semeval_train_NGMM_2048_W_2_DIM_200/feats.txt'

trainy, trainx = imdb_bag_of_word_libs.loadFeatsText(train)
trainy = imdb_bag_of_word_libs.kaldiID_2_LB(trainy)
evaly, evalx = imdb_bag_of_word_libs.loadFeatsText(dev)
evaly = imdb_bag_of_word_libs.kaldiID_2_LB(evaly)
evaly2, evalx2 = imdb_bag_of_word_libs.loadFeatsText(devtest)
evaly2 = imdb_bag_of_word_libs.kaldiID_2_LB(evaly2)

robust_scaler = RobustScaler()
trainx = robust_scaler.fit_transform(trainx)
evalx = robust_scaler.transform(evalx)

clf = LinearDiscriminantAnalysis()
clf.fit(trainx, trainy)
predictValue = clf.predict(evalx)
print(semeval2016_libs.scoreSameOrder(predictValue, configure.SCORE_REF_DEV))

evalx2 = robust_scaler.transform(evalx2)
predictValue = clf.predict(evalx2)
print(semeval2016_libs.scoreSameOrder(predictValue, configure.SCORE_REF_DEVTEST))
plt.subplot(3, 2, 2)
scaler1 = MinMaxScaler()
X_new = scaler1.fit_transform(X)
plt.scatter(X_new[:, 0], X_new[:, 1])

plt.subplot(3, 2, 3)
scaler2 = MaxAbsScaler()
X_new2 = scaler2.fit_transform(X)
plt.scatter(X_new2[:, 0], X_new2[:, 1])

plt.subplot(3, 2, 4)
scaler3 = StandardScaler()
X_new3 = scaler3.fit_transform(X)
plt.scatter(X_new3[:, 0], X_new3[:, 1])
plt.xlim(-2, 2)
plt.ylim(-2, 2)

plt.subplot(3, 2, 5)
scaler4 = RobustScaler()
X_new4 = scaler4.fit_transform(X)
plt.scatter(X_new4[:, 0], X_new4[:, 1])
plt.xlim(-1, 1)
plt.ylim(-1, 1)

plt.subplot(3, 2, 6)
scaler5 = Normalizer()
X_new5 = scaler5.fit_transform(X)
plt.scatter(X_new5[:, 0], X_new5[:, 1])
plt.xlim(0, 1)
plt.ylim(0, 1)

plt.show()
with open('Metro_Interstate_Traffic_Volume.csv.gz', mode='wb') as file:
    file.write(req.content)

x_scaler = StandardScaler()
# quantile_range is given in percentages: (10, 90) keeps the 10th-90th
# percentile range, whereas (.1, .9) would mean the 0.1st-0.9th percentiles
y_scaler = RobustScaler(quantile_range=(10, 90))

df = pd.read_csv('Metro_Interstate_Traffic_Volume.csv.gz',
                 compression='gzip', parse_dates=['date_time'])
df = pd.concat((df, parse_date(df['date_time'])), axis=1)

x_train = x_scaler.fit_transform(df[[
    'year', 'month_x', 'month_y', 'weekday_x', 'weekday_y',
    'day_x', 'day_y', 'hour_x', 'hour_y'
]])
y_train = y_scaler.fit_transform(df[['traffic_volume']])

x_train, x_test, y_train, y_test = train_test_split(x_train, y_train,
                                                    test_size=.1,
                                                    shuffle=False)

x_train = torch.tensor(x_train).float().to(device=device)
x_test = torch.tensor(x_test).float().to(device=device)
y_train = torch.tensor(y_train).float().to(device=device)
y_train = y_train.view((-1, 1))

net = MDN(x_train.shape[1]).to(device=device)
net.zero_grad()

learning_rate = 1e-3
epochs = 5000
colors = ["red","green"] mapper = CategoricalColorMapper(factors = factors,palette = colors) p.circle('suicides_no', 'population', size=4, source=source, legend='sex', fill_alpha=0.2, color = {"field":"sex","transform":mapper}) show(p)''' #https://pythonspot.com/3d-scatterplot/ #https://matplotlib.org/gallery/mplot3d/scatter3d.html import numpy as np import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import axes3d from sklearn.preprocessing import RobustScaler #https://www.kaggle.com/janiobachmann/credit-fraud-dealing-with-imbalanced-datasets rob_scaler = RobustScaler() wh['scaled_population'] = rob_scaler.fit_transform( wh['population'].values.reshape(-1, 1)) wh['scaled_suicides_no'] = rob_scaler.fit_transform( wh['suicides_no'].values.reshape(-1, 1)) # Create plot fig = plt.figure() #ax = fig.add_subplot(1, 1, 1) #ax = fig.gca(projection='3d') ax = fig.add_subplot(111, projection='3d') #ax.set_xscale('log') wh = wh[wh.suicides_no > 0] ax.scatter(wh[wh.sex == 'female'].scaled_population, wh[wh.sex == 'female'].year, wh[wh.sex == 'female'].scaled_suicides_no, alpha=0.2, c="red",
plt.show()

# Scale data using a Robust Scaler
fig, ax = plt.subplots(2, 2, figsize=(8, 8))

ax[0, 0].scatter(data[:, 0], data[:, 1])
ax[0, 0].set_xlim([-10, 10])
ax[0, 0].set_ylim([-10, 10])
ax[0, 0].grid()
ax[0, 0].set_xlabel('X')
ax[0, 0].set_ylabel('Y')
ax[0, 0].set_title('Raw data')

rs = RobustScaler(quantile_range=(15, 85))
scaled_data = rs.fit_transform(data)
ax[0, 1].scatter(scaled_data[:, 0], scaled_data[:, 1])
ax[0, 1].set_xlim([-10, 10])
ax[0, 1].set_ylim([-10, 10])
ax[0, 1].grid()
ax[0, 1].set_xlabel('X')
ax[0, 1].set_ylabel('Y')
ax[0, 1].set_title('Scaled data (15% - 85%)')

rs1 = RobustScaler(quantile_range=(25, 75))
scaled_data1 = rs1.fit_transform(data)
ax[1, 0].scatter(scaled_data1[:, 0], scaled_data1[:, 1])
ax[1, 0].set_xlim([-10, 10])
ax[1, 0].set_ylim([-10, 10])
"hematocrit": row["hematocrit_apache"], "wbc": row["wbc_apache"], } return np.sum([calculate_single_scores(v,k) for k,v in cols.items()]) df["apacheScore"] = df.apply(getAPACHEScore , axis=1) df["apacheScore"].describe() from sklearn.preprocessing import RobustScaler from sklearn.preprocessing import PowerTransformer rs = RobustScaler() pt = PowerTransformer() df.loc[:,numeric_cols] = rs.fit_transform(df.loc[:,numeric_cols]) df.loc[:,numeric_cols] = pt.fit_transform(df.loc[:,numeric_cols]) ndf = df.copy() cat_cols_minus = [c for c in cat_cols if c not in ["clusterId","hospital_death", "encounter_id" , "hospital_id" , "patient_id"]] cat_cols_minus_useless = [c for c in cat_cols if c not in ["clusterId", "encounter_id" , "hospital_id" , "patient_id" , "icu_id" ]] #df #pcadf = pcadf.join(df[cat_cols_minus_useless]) #ndf = pcadf cols_to_dummy = [c for c in cat_cols_minus_useless if c != "hospital_death"] from sklearn.preprocessing import OneHotEncoder ohe = OneHotEncoder(sparse=False , handle_unknown='ignore') endcodedNdf = ohe.fit_transform(ndf.loc[:,cols_to_dummy])
# Feature scaling for a more accurate representation and better learning
# performance
'''
Most machine learning algorithms take into account only the magnitude of the
measurements, not their units. A feature with a very high magnitude may
therefore affect the prediction far more than an equally important feature,
e.g. the AGE (within a fixed range) and the PAY_AMTn (monetary) features have
very different ranges of values.

RobustScaler: the Robust Scaler uses statistics that are robust to outliers.
Its use of interquartile ranges means it focuses on the parts where the bulk
of the data is, which makes it well suited for data with outliers. Notice
that after robust scaling the distributions are brought onto the same scale
and overlap, but the outliers remain outside the bulk of the new
distributions.
'''
x = df.drop('default', axis=1)
robust_scaler = RobustScaler()
x = robust_scaler.fit_transform(x)  # rescale all the features to the same range
y = df['default']

# the stratify parameter makes the split preserve the class proportions of y
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25,
                                                    random_state=123,
                                                    stratify=y)

# In[ ]:

def c_matrix(CM, labels=['pay', 'default']):
    df = pd.DataFrame(data=CM, index=labels, columns=labels)
    df.index.name = 'TRUE'
    df.columns.name = 'PREDICTION'
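# The triple-quoted note above describes RobustScaler in words; this small
# sketch (toy data, nothing assumed beyond scikit-learn itself) verifies that
# it computes (x - median) / IQR per feature.
import numpy as np
from sklearn.preprocessing import RobustScaler

x_demo = np.array([[1.0], [2.0], [3.0], [4.0], [1000.0]])  # 1000 is an outlier

scaled = RobustScaler().fit_transform(x_demo)

# manual equivalent: center on the median, scale by the interquartile range
median = np.median(x_demo)
iqr = np.percentile(x_demo, 75) - np.percentile(x_demo, 25)
manual = (x_demo - median) / iqr

assert np.allclose(scaled, manual)
print(scaled.ravel())  # the outlier stays far out while the bulk sits near 0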
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

######################
######## KNN

# split the dataset into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Xnegativo, ynegativo,
                                                    test_size=0.30)

# feature scaling
from sklearn.preprocessing import RobustScaler
sc = RobustScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# accuracy
from sklearn.metrics import accuracy_score
def go(self, all_data, cols, colsP):
    train = all_data.loc[(all_data.SalePrice > 0), cols].reset_index(drop=True, inplace=False)
    y_train = all_data.SalePrice[all_data.SalePrice > 0].reset_index(drop=True, inplace=False)
    test = all_data.loc[(all_data.SalePrice == 0), cols].reset_index(drop=True, inplace=False)

    # Main script here
    scale = RobustScaler()
    df = pd.DataFrame(scale.fit_transform(train[cols]), columns=cols)

    # select features based on p-values
    ln_model = sm.OLS(y_train, df)
    result = ln_model.fit()
    print(result.summary2())

    pv_cols = cols.values
    SL = 0.051
    pv_cols, LR = self.backwardElimination(df, y_train, SL, pv_cols)

    pred = LR.predict(df[pv_cols])
    y_pred = pred.apply(lambda x: 1 if x > 0.5 else 0)

    print('Fvalue: {:.6f}'.format(LR.fvalue))
    print('MSE total on the train data: {:.4f}'.format(LR.mse_total))

    ls = Lasso(alpha=0.0005, max_iter=161, selection='cyclic', tol=0.002,
               random_state=101)
    rfecv = RFECV(estimator=ls, n_jobs=-1, step=1,
                  scoring='neg_mean_squared_error', cv=5)
    rfecv.fit(df, y_train)

    select_features_rfecv = rfecv.get_support()
    RFEcv = cols[select_features_rfecv]
    print('{:d} Features Select by RFEcv:\n{:}'.format(rfecv.n_features_, RFEcv.values))

    score = r2_score
    ls = Lasso(alpha=0.0005, max_iter=161, selection='cyclic', tol=0.002,
               random_state=101)
    sbs = SequentialFeatureSelection(ls, k_features=1, scoring=score)
    sbs.fit(df, y_train)

    print('Best Score: {:2.2%}\n'.format(max(sbs.scores_)))
    print('Best score with: {0:2d}.\n'.format(
        len(list(df.columns[sbs.subsets_[np.argmax(sbs.scores_)]]))))
    SBS = list(df.columns[list(sbs.subsets_[max(
        np.arange(0, len(sbs.scores_))[(sbs.scores_ == max(sbs.scores_))])])])
    print('\nBest score with {0:2d} features:\n{1:}'.format(len(SBS), SBS))

    skb = SelectKBest(score_func=f_regression, k=80)
    skb.fit(df, y_train)
    select_features_kbest = skb.get_support()
    kbest_FR = cols[select_features_kbest]
    scores = skb.scores_[select_features_kbest]

    skb = SelectKBest(score_func=mutual_info_regression, k=80)
    skb.fit(df, y_train)
    select_features_kbest = skb.get_support()
    kbest_MIR = cols[select_features_kbest]
    scores = skb.scores_[select_features_kbest]

    X_train, X_test, y, y_test = train_test_split(df, y_train, test_size=0.30,
                                                  random_state=101)

    # fit model on all training data
    # importance_type='gain'
    model = XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
                         gamma=0, max_delta_step=0, random_state=101,
                         min_child_weight=1, missing=None, n_jobs=4,
                         scale_pos_weight=1, seed=None, silent=True, subsample=1)
    model.fit(X_train, y)

    # Using each unique importance as a threshold
    thresholds = np.sort(np.unique(model.feature_importances_))
    best = 1e36
    colsbest = 31
    my_model = model
    threshold = 0

    for thresh in thresholds:
        # select features using threshold
        selection = SelectFromModel(model, threshold=thresh, prefit=True)
        select_X_train = selection.transform(X_train)
        # train model
        selection_model = XGBRegressor(base_score=0.5, colsample_bylevel=1,
                                       colsample_bytree=1, gamma=0,
                                       max_delta_step=0, random_state=101,
                                       min_child_weight=1, missing=None,
                                       n_jobs=4, scale_pos_weight=1, seed=None,
                                       silent=True, subsample=1)
        selection_model.fit(select_X_train, y)
        # eval model
        select_X_test = selection.transform(X_test)
        y_pred = selection_model.predict(select_X_test)
        predictions = [round(value) for value in y_pred]
        r2 = r2_score(y_test, predictions)
        mse = mean_squared_error(y_test, predictions)
        print("Thresh={:1.3f}, n={:d}, R2: {:2.2%} with MSE: {:.4f}".format(
            thresh, select_X_train.shape[1], r2, mse))
        if best >= mse:
            best = mse
            colsbest = select_X_train.shape[1]
            my_model = selection_model
            threshold = thresh

    feature_importances = [
        (score, feature)
        for score, feature in zip(model.feature_importances_, cols)
    ]
    XGBest = pd.DataFrame(sorted(sorted(feature_importances, reverse=True)[:colsbest]),
                          columns=['Score', 'Feature'])
    XGBestCols = XGBest.iloc[:, 1].tolist()

    bcols = set(pv_cols).union(set(RFEcv)).union(set(kbest_FR)).union(
        set(kbest_MIR)).union(set(XGBestCols)).union(set(SBS))
    intersection = set(SBS).intersection(set(kbest_MIR)).intersection(
        set(RFEcv)).intersection(set(pv_cols)).intersection(
        set(kbest_FR)).intersection(set(XGBestCols))
    print(intersection, '\n')
    print('_' * 75, '\nUnion All Features Selected:')
    print('Total number of features selected:', len(bcols))
    print('\n{0:2d} features removed if use the union of selections: {1:}'.format(
        len(cols.difference(bcols)), cols.difference(bcols)))

    totalCols = list(bcols.union(set(colsP)))
    # self.trainingData = self.trainingData.loc[list(totalCols)].reset_index(drop=True, inplace=False)
    # self.testingData = self.testingData.loc[list(totalCols)].reset_index(drop=True, inplace=False)
    # self.combinedData = [self.trainingData, self.testingData]
    return DataObject(self.trainingData, self.testingData, self.combinedData), totalCols, RFEcv, XGBestCols
y_feature = 'deferral_payments'
for point, poi in zip(features, labels):
    x = point[features_list.index(x_feature) - 1]
    y = point[features_list.index(y_feature) - 1]
    color = 'red' if poi else 'blue'
    matplotlib.pyplot.scatter(x, y, c=color)
matplotlib.pyplot.xlabel(x_feature)
matplotlib.pyplot.ylabel(y_feature)
matplotlib.pyplot.savefig(x_feature + '_' + y_feature + '.png')

from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
features = scaler.fit_transform(features)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
from sklearn.tree import DecisionTreeClassifier
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [5, 10, None],
    # min_samples_split must be >= 2 in scikit-learn
    'min_samples_split': [2, 5, 10]
}
test = test.join(one_hot2)
df.columns

features = list(num_col) + list(catOneHot_col)

# try xgboost with cross-validation
x_train = df[list(features)].values
y_train = df["SPEED_AVG"].values

# take the features and the target variable from the test dataset
x_test = test[list(features)].values
y_test = test["SPEED_AVG"].values

# scaling
scaler = RobustScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

gb = XGBRegressor(learning_rate=0.1,
                  n_estimators=2000,
                  max_depth=5,
                  min_child_weight=1,
                  gamma=0,
                  subsample=0.8,
                  colsample_bytree=0.8,
                  objective='reg:gamma',
                  nthread=8,
                  scale_pos_weight=1,
                  seed=27)

# In case you want to use cross-validation
df_non_clicks = df.loc[df['clicks'] == 0][:number_of_clicks]
df_balanced = pd.concat([df_clicks, df_non_clicks])

#%%
# Encoding categorical data using the "hashing trick"
vectorizer = FeatureHasher(n_features=2**25, input_type='string')
invent_src = vectorizer.fit_transform(df_balanced.inventory_source)
# geo_zip = vectorizer.fit_transform(df_balanced.geo_zip.apply(str))
screen_size = vectorizer.fit_transform(df_balanced.platform_device_screen_size)
carrier = vectorizer.fit_transform(df_balanced.platform_carrier)
bandwidth = vectorizer.fit_transform(df_balanced.platform_bandwidth)
maker = vectorizer.fit_transform(df_balanced.platform_device_make)
model = vectorizer.fit_transform(df_balanced.platform_device_model)
day_of_week = vectorizer.fit_transform(df_balanced.day_of_week)

scaler = RobustScaler()  # StandardScaler()
# scale the column as an (n_samples, 1) matrix; scaling a single row would
# make every value its own feature and zero everything out
bid_floor = csr_matrix(scaler.fit_transform(df_balanced.bid_floor.values.reshape(-1, 1)))
# spend = csr_matrix(scaler.fit_transform(df_balanced.spend.values.reshape(-1, 1)))

#%%
y = df_balanced['clicks']
X = hstack([invent_src, screen_size, carrier, bandwidth, maker, model,
            day_of_week, bid_floor])

#%%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)

model = LogisticRegression(solver='saga', n_jobs=8, penalty='l2', verbose=5,
                           C=0.01)
model.fit(X_train, y_train)
mm.model_report_card(model, X_train, y_train, X_test, y_test)
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import numpy as np

datadir = '~/Desktop/my package/machine_learning-and-deep_learning/Data/houseprice/'
X_train = pd.read_csv(datadir + 'X2.csv').drop('SalePrice', axis=1)
y_train = pd.read_csv(datadir + 'X2.csv')['SalePrice']
X_test = pd.read_csv(datadir + 'test_X2.csv')

from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X = scaler.fit_transform(X_train)
# reuse the training fit; refitting on the test set would leak a different
# median/IQR into the test features
test_X = scaler.transform(X_test)
Id = list(range(1461, 1461 + 1459))
print(X.shape, test_X.shape)

# Baseline cross-validation error
lr = LinearRegression()
lr_val_score = cross_val_score(lr, X, y_train,
                               scoring='neg_mean_absolute_error', cv=10,
                               n_jobs=-1)
print('Baseline log CV score:', np.log(np.mean(-lr_val_score)))
def pca_sets(sample_length=1000, num_samples=1000, random_state=69):
    real_set_1_ = load_n_samples(real=True, num_samples=num_samples,
                                 sample_length=sample_length,
                                 random_state=random_state)
    real_set_2_ = load_n_samples(real=True, num_samples=num_samples,
                                 sample_length=sample_length,
                                 random_state=2 * random_state)
    fake_set_1_ = load_n_samples(real=False, num_samples=num_samples,
                                 sample_length=sample_length,
                                 random_state=random_state)
    fake_set_2_ = load_n_samples(real=False, num_samples=num_samples,
                                 sample_length=sample_length,
                                 random_state=2 * random_state)
    real_set_3_ = load_n_samples(real=True, num_samples=num_samples,
                                 sample_length=sample_length,
                                 random_state=3 * random_state)
    real_set_4_ = load_n_samples(real=True, num_samples=num_samples,
                                 sample_length=sample_length,
                                 random_state=4 * random_state)

    # robust-scale each sample individually
    r_scaler = RobustScaler()
    for sample in range(num_samples):
        real_set_1_[sample] = r_scaler.fit_transform(real_set_1_[sample])
        real_set_2_[sample] = r_scaler.fit_transform(real_set_2_[sample])
        fake_set_1_[sample] = r_scaler.fit_transform(fake_set_1_[sample])
        fake_set_2_[sample] = r_scaler.fit_transform(fake_set_2_[sample])
        real_set_3_[sample] = r_scaler.fit_transform(real_set_3_[sample])
        real_set_4_[sample] = r_scaler.fit_transform(real_set_4_[sample])

    real_set_1 = np.zeros((num_samples, N_COLS, N_COLS))
    real_set_2 = np.zeros((num_samples, N_COLS, N_COLS))
    fake_set_1 = np.zeros((num_samples, N_COLS, N_COLS))
    fake_set_2 = np.zeros((num_samples, N_COLS, N_COLS))
    real_set_3 = np.zeros((num_samples, N_COLS, N_COLS))
    real_set_4 = np.zeros((num_samples, N_COLS, N_COLS))

    # store the PCA component matrix of every sample
    pca = PCA()
    for x in range(num_samples):
        pca.fit(real_set_1_[x])
        real_set_1[x] = pca.components_
        pca.fit(real_set_2_[x])
        real_set_2[x] = pca.components_
        pca.fit(fake_set_1_[x])
        fake_set_1[x] = pca.components_
        pca.fit(fake_set_2_[x])
        fake_set_2[x] = pca.components_
        pca.fit(real_set_3_[x])
        real_set_3[x] = pca.components_
        pca.fit(real_set_4_[x])
        real_set_4[x] = pca.components_

    return real_set_1, real_set_2, real_set_3, real_set_4, fake_set_1, fake_set_2
for i in range(np.size(portret, 0)):
    for j in range(np.size(portret, 1)):
        if portret[i, j] == -99.99:
            portret[i, j] = np.nan

#%% Price the cross-section
dates = pd.DataFrame({'Date': Date})
df2 = df.merge(dates, how='inner', on='Date')
df3 = df2.merge(ff3, how='inner', on='Date')

# Define feature
riskfac = df3.Close_vix.values - df3.Close_vix3m.values
rf = df3.RF.values
m = np.zeros(np.size(portret, 1))

X = np.vstack((robust_scaler.fit_transform(riskfac.reshape(-1, 1)).T,
               robust_scaler.fit_transform(df3.MKTRF.values.reshape(-1, 1)).T,
               robust_scaler.fit_transform(df3.SMB.values.reshape(-1, 1)).T,
               robust_scaler.fit_transform(df3.HML.values.reshape(-1, 1)).T)).T
numFac = np.size(X, 1)
b = np.zeros((np.size(portret, 1), numFac))
X = sm.add_constant(X)

# Obtain betas from first-pass time-series regressions
for i in range(np.size(portret, 1)):
    y = portret[:, i] - rf  # LHS variable is excess returns
    # store expected excess returns for the cross-sectional regressions
    m[i] = y.mean()
    model = sm.OLS(
        y,
        X,
        missing='drop'
predictions_df['predictions'] = np.nan
mae_cv = np.zeros((n_folds, 1))

# --------------------------------------------------------------------------
for i_fold, (train_idx, test_idx) in enumerate(kf.split(x, y)):
    x_train, x_test = x[train_idx], x[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    print('CV iteration: %d' % (i_fold + 1))

    # ----------------------------------------------------------------------
    # Normalization/Scaling/Standardization
    scaler = RobustScaler()
    x_train_norm = scaler.fit_transform(x_train)
    x_test_norm = scaler.transform(x_test)

    # ----------------------------------------------------------------------
    # Model
    gpr = GaussianProcessRegressor()

    # ----------------------------------------------------------------------
    # Model selection
    # Search space
    param_grid = [
        {
            'kernel': [RBF(), DotProduct()],
            'alpha': [1e0, 1e-1, 1.5e-1, 1e-2, 1.5e-2]
        },
    ]
def transform_data(X, y=None, test=False):
    """
    Prepare the final dataset with all features.

    Arguments
    ---
    X - dataframe with preprocessed features and target variable
    test - boolean; if False, X is the training set;
           if True, X is the test set
    """
    config = load_yaml("./config.yaml")
    columns = list(X.columns)

    log_cols = config["transform"]["log_cols"]
    log1p_cols = config["transform"]["log1p_cols"]
    boxcox1p_cols = config["transform"]["boxcox1p_cols"]
    onehot_cols = config["transform"]["onehot_cols"]
    targetencode_cols = config["transform"]["targetencode_cols"]
    log_target = config["transform"]["log_target"]

    # generate time features (only relevant for time series)
    # TODO: make datetime column identifiable from config file
    if "timestamp" in columns:
        # adjust the desirable format accordingly
        X.timestamp = pd.to_datetime(X.timestamp, format="%Y-%m-%d %H:%M:%S")
        X["hour"] = X.timestamp.dt.hour
        X["weekday"] = X.timestamp.dt.weekday
        if not test:
            X.sort_values("timestamp", inplace=True)
            X.reset_index(drop=True, inplace=True)

    # TODO: make cols identified from config file
    if log_cols:
        for col in log_cols:
            # this will replace the columns with their log values
            X[col] = np.log(X[col])

    if log1p_cols:
        for col in log1p_cols:
            # this will replace the columns with their log1p values
            X[col] = np.log1p(X[col])

    if boxcox1p_cols:
        for col in boxcox1p_cols:
            if col in columns:
                print("taking the log of " + str(col))
                # this will replace the columns with their boxcox1p values
                X[col] = boxcox1p(X[col], 0.15)

    # robust scaler -- the scaled values must be written back into X,
    # otherwise the transform has no effect
    numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
    if not test:
        global robust_scaler
        robust_scaler = RobustScaler()
        X[numeric_cols] = robust_scaler.fit_transform(X[numeric_cols])
    else:
        X[numeric_cols] = robust_scaler.transform(X[numeric_cols])

    # transforming target
    if log_target and not test:
        y = np.log1p(y)

    # target encoding
    if targetencode_cols:
        if not test:
            global target_encoder
            target_encoder = ce.TargetEncoder(cols=targetencode_cols)
            X = target_encoder.fit_transform(X, y)
        else:
            X = target_encoder.transform(X)

    if test:
        return X
    return X, y
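# A minimal usage sketch for transform_data above, assuming train_df/test_df
# dataframes and a target series are already loaded (those names are
# hypothetical). The training call must come first so the module-level
# robust_scaler and target_encoder get fitted before test=True reuses them.
X_train, y_train = transform_data(train_df, y=target, test=False)
X_test = transform_data(test_df, test=True)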
def robust_modified(df):
    robust_scaler = RobustScaler()
    robust_df = pd.DataFrame(robust_scaler.fit_transform(df[zone_columns]),
                             columns=zone_columns)
    new_column = [x + '_robust' for x in zone_columns]
    # assign by position (.values) so pandas does not try to align the new
    # '_robust' names or robust_df's reset index against df
    df[new_column] = robust_df.values
    return df, new_column
def main(args): out_file_name = "results.log" if args.classify: # Cast to list to keep it all in memory train = list(csv.reader(open(args.train_file, 'r'))) test = list(csv.reader(open(args.test_file, 'r'))) x_train = np.array(train[1:], dtype=float) x_test = np.array(test[1:], dtype=float) train_labels_file = open(args.train_labels) y_train = np.array([int(x.strip()) for x in train_labels_file.readlines()]) test_labels_file = open(args.test_labels) y_test = np.array([int(x.strip()) for x in test_labels_file.readlines()]) train_labels_file.close() test_labels_file.close() if args.sampling_technique: print "Attempting to use sampling technique: " + args.sampling_technique if args.sampling_ratio == float('NaN'): print "Unable to use sampling technique. Ratio is NaN." else: x_train, y_train = __get_sample_transformed_examples(args.sampling_technique, x_train, y_train, args.sampling_ratio) if args.scale: scaler = RobustScaler() x_train = scaler.fit_transform(x_train) x_test = scaler.fit_transform(x_test) for classifier in args.classifiers: model = __get_classifier_model(classifier, args) print "Using classifier " + classifier print "Fitting data to model" if args.grid_search: print "Applying parameter tuning to model" if classifier == LOG_REG: parameters = {'loss':('log','hinge'), 'penalty':('l2', 'l1'), 'shuffle':[True], 'n_iter':[5], 'n_jobs':[-1], 'random_state':[179]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == SVM: parameters = {'kernel':('rbf', 'poly'), 'cache_size':[8096], 'random_state':[17]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == ADA_BOOST: parameters = {'n_estimators':[300], 'random_state':[13]} model = grid_search.GridSearchCV(model, parameters, scoring=roc_auc_score, verbose=2) elif classifier == RF: parameters = {'criterion':('gini', 'entropy'), 'n_jobs':[-1], 'n_estimators':[300], 'random_state':[17]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == GRADIENT_BOOST: parameters = {'n_estimators':[300], 'random_state':[17]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == EXTRA_TREES: parameters = {'n_estimators':[300], 'random_state':[17], 'n_jobs':[-1], 'criterion':('gini', 'entropy'), 'max_features':('log2', 40, 0.4), 'max_features':[40, 0.4], 'bootstrap':[True, False], 'bootstrap_features':[True, False]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == BAGGING: parameters = {'n_estimators':[300], 'random_state':[17], 'max_samples': [.4, 30],'max_features':[40, 0.4], 'bootstrap':[True, False], 'bootstrap_features':[True, False], 'n_jobs':[-1]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) print "Best params: " + str(model.best_params_) clf = model.fit(x_train, y_train) print "Parameters used in model:" #print clf.get_params(deep=False) if args.select_best: # Unable to use BaggingClassifier with SelectFromModel if classifier != BAGGING: print "Selecting best features" sfm = SelectFromModel(clf, prefit=True) x_train = sfm.transform(x_train) x_test = sfm.transform(x_test) clf = model.fit(x_train, y_train) __print_and_log_results(clf, classifier, x_train, x_test, y_test, out_file_name, args) elif args.cross_validate: # Cast to list to keep it all in memory labels_file = open(args.labels) labels = np.array([int(x.strip()) for x in labels_file.readlines()]) labels_file.close() 
data_file = open(args.data_file, 'r') data = list(csv.reader(data_file)) data_file.close() examples = np.array(data[1:], dtype=float) X_train, X_test, y_train, y_test = cross_validation.train_test_split(examples, labels, test_size=0.1) if args.sampling_technique: print "Attempting to use sampling technique: " + args.sampling_technique if args.sampling_ratio == float('NaN'): print "Unable to use sampling technique. Ratio is NaN." else: X_train, y_train = __get_sample_transformed_examples(args.sampling_technique, X_train, y_train, args.sampling_ratio) if args.scale: scaler = StandardScaler().fit(X_train) X_train = scaler.transform(X_train) X_test = scaler.transform(X_test) for classifier in args.classifiers: print "Using classifier " + classifier model = __get_classifier_model(classifier, args) print "Fitting model" if args.grid_search: print "Applying parameter tuning to model" if classifier == LOG_REG: parameters = {'loss':('log','hinge'), 'penalty':('l2', 'l1'), 'shuffle':[True], 'n_iter':[5], 'n_jobs':[-1], 'random_state':[179]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == SVM: parameters = {'kernel':('rbf', 'poly'), 'cache_size':[8096], 'random_state':[17]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == ADA_BOOST: parameters = {'n_estimators':[300], 'random_state':[13]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == RF: parameters = {'criterion':('gini', 'entropy'), 'n_jobs':[-1], 'n_estimators':[300], 'random_state':[17]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == GRADIENT_BOOST: parameters = {'n_estimators':[300], 'random_state':[17]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == EXTRA_TREES: parameters = {'n_estimators':[300], 'random_state':[17], 'n_jobs':[-1], 'criterion':('gini', 'entropy'), 'max_features':('log2', 40, 0.4), 'max_features':[40, 0.4], 'bootstrap':[True, False], 'bootstrap_features':[True, False]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == BAGGING: #parameters = {'n_estimators' : [400], 'random_state' : [17], # 'max_samples' : np.arange(0.5, 0.9, 0.1), # 'max_features' : np.arange(0.5, 0.9, 0.1), # 'bootstrap':[False], 'bootstrap_features':[False], 'n_jobs':[-1]} parameters = {"base_estimator__criterion" : ["gini", "entropy"], "base_estimator__splitter" : ["best", "random"], "base_estimator__max_depth" : [10, 15, 20, 25], "base_estimator__class_weight" : ['balanced'], "base_estimator__max_features" : ['auto', 'log2'] } model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) clf = model.fit(X_train, y_train) if args.grid_search: print "Best params: " + str(model.best_params_) if args.select_best: if classifier != BAGGING: print "Selecting best features" sfm = SelectFromModel(clf, prefit = True) X_train = sfm.transform(X_train) X_test = sfm.transform(X_test) clf = model.fit(X_train, y_train) print "Evaluating results" __print_and_log_results(clf, classifier, X_train, X_test, y_test, out_file_name, args) elif args.kfold: # Cast to list to keep it all in memory data_file = open(args.data_file, 'r') data = list(csv.reader(data_file)) data_file.close() labels_file = open(args.labels) labels = np.array([int(x.strip()) for x in labels_file.readlines()]) labels_file.close() X = np.array(data[1:], dtype=float) kf = 
        for train, test in kf:
            print "kfold loop iterate"
            X_train, X_test, y_train, y_test = X[train], X[test], labels[train], labels[test]
            if args.sampling_technique:
                print "Attempting to use sampling technique: " + args.sampling_technique
                if math.isnan(args.sampling_ratio):
                    print "Unable to use sampling technique. Ratio is NaN."
                else:
                    X_train, y_train = __get_sample_transformed_examples(
                        args.sampling_technique, X_train, y_train, args.sampling_ratio)
            if args.scale:
                scaler = StandardScaler().fit(X_train)
                X_train = scaler.transform(X_train)
                X_test = scaler.transform(X_test)
            for classifier in args.classifiers:
                print "Using classifier " + classifier
                model = __get_classifier_model(classifier, args)
                print "Fitting model"
                clf = model.fit(X_train, y_train)
                if args.select_best:
                    if classifier != BAGGING:
                        sfm = SelectFromModel(clf, prefit=True)
                        X_train = sfm.transform(X_train)
                        X_test = sfm.transform(X_test)
                        clf = model.fit(X_train, y_train)
                print "Evaluating results"
                __print_and_log_results(clf, classifier, X_train, X_test, y_test,
                                        out_file_name, args)
        print "kfold loop done"
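# Note (added): sklearn.grid_search, sklearn.cross_validation, and the
# KFold(n, n_folds=...) constructor used above were removed in scikit-learn
# 0.20. A minimal sketch of the same tuning/splitting flow under the current
# sklearn.model_selection API; X and labels are the arrays defined above, and
# the RandomForestClassifier grid is just an illustrative choice.
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.1)
params = {'criterion': ['gini', 'entropy'], 'n_estimators': [300]}
model = GridSearchCV(RandomForestClassifier(random_state=17), params,
                     scoring='roc_auc', verbose=2, n_jobs=-1)
clf = model.fit(X_train, y_train)
print("Best params: " + str(model.best_params_))

# KFold is now constructed with n_splits, and split() yields index arrays.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
for train_idx, test_idx in kf.split(X):
    X_tr, X_te = X[train_idx], X[test_idx]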
"""### 데이터 스케일링 """ import pandas as pd dataframe = pd.DataFrame(train_dataset) dataframe.to_csv("meas_train_dataset.csv", header=False, index=False) print(train_dataset) from sklearn.preprocessing import RobustScaler scaler = RobustScaler() scaled_train_data = scaler.fit_transform(train_dataset) scaled_test_data = scaler.transform(test_dataset) print(scaled_train_data) """## 모델""" from kerastuner.tuners import RandomSearch def build_model(hp): model = keras.Sequential() for i in range(hp.Int('num_layers', 2, 20)): model.add(layers.Dense(units=hp.Int('units_' + str(i), min_value=32, #max_value=512, max_value=64,
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler
from sklearn import metrics
from sklearn import cross_validation

rscaler = RobustScaler()
air_frame = pd.read_csv('airfoil_self_noise.dat', sep='\t')
column_names = ['Frequency', 'Attack Angle', 'Chord Length',
                'Free Velocity', 'Suction Side', 'Scaled Sound']
# A DataFrame has no 'column' attribute; assigning to it silently creates a
# new attribute instead of renaming, so use 'columns'.
air_frame.columns = column_names

scaled_data = rscaler.fit_transform(air_frame.values)
X = scaled_data[:, :5]
Y = scaled_data[:, 5]
train_data, test_data, train_regressor, test_regressor = \
    cross_validation.train_test_split(X, Y, test_size=0.3)

rf = RandomForestRegressor()
grad = GradientBoostingRegressor()
bag = BaggingRegressor()
ada = AdaBoostRegressor()
bayes = BayesianRidge()
svr = SVR()
lin_reg = LinearRegression()
regressors_names = ['Random Forests', 'Gradient Boost', 'Bagging',
                    'Ada Boost', 'Bayesian Ridge', 'SVR', 'Linear Reg']
regressors = [rf, grad, bag, ada, bayes, svr, lin_reg]
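# The comparison loop itself is not part of this fragment; a minimal sketch of
# how the two parallel lists would typically be consumed. The choice of mean
# squared error and R^2 as metrics is an assumption.
for name, reg in zip(regressors_names, regressors):
    reg.fit(train_data, train_regressor)
    pred = reg.predict(test_data)
    mse = metrics.mean_squared_error(test_regressor, pred)
    r2 = metrics.r2_score(test_regressor, pred)
    print('{}: MSE={:.4f}, R2={:.4f}'.format(name, mse, r2))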
df_neg = pd.read_csv('NegativeYH.csv', header=None)
df_neg['Status'] = 0
# df_pos is assumed to have been loaded earlier in the script (not shown here).
df_pos['Status'] = 1
df_neg = df_neg.sample(n=len(df_pos))  # balance the two classes

df = pd.concat([df_pos, df_neg])
df = df.reset_index()
df = df.sample(frac=1)  # shuffle the rows
df = df.iloc[:, 1:]

X = df.iloc[:, 0:1986].values
y = df.iloc[:, 1986:].values

scaler = RobustScaler()
X = scaler.fit_transform(X)

kf = StratifiedKFold(n_splits=5)
accuracy = []
specificity = []
sensitivity = []
precision = []
recall = []
m_coef = []
auc_list = []
Rf_fpr_list = []
Rf_tpr_list = []
o = 0
max_accuracy = float("-inf")
Rf_fpr = None
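# The cross-validation loop that fills the lists above is not in this
# fragment. A sketch of its likely shape; the RandomForestClassifier is an
# assumption based on the Rf_* variable names, and the best-fold bookkeeping
# (o, max_accuracy, Rf_fpr) is omitted.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score,
                             recall_score, matthews_corrcoef, roc_curve, auc)

for train_idx, test_idx in kf.split(X, y.ravel()):
    clf = RandomForestClassifier()
    clf.fit(X[train_idx], y[train_idx].ravel())
    y_true = y[test_idx].ravel()
    y_pred = clf.predict(X[test_idx])

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    accuracy.append(accuracy_score(y_true, y_pred))
    specificity.append(tn / (tn + fp))
    sensitivity.append(tp / (tp + fn))
    precision.append(precision_score(y_true, y_pred))
    recall.append(recall_score(y_true, y_pred))
    m_coef.append(matthews_corrcoef(y_true, y_pred))

    probs = clf.predict_proba(X[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y_true, probs)
    auc_list.append(auc(fpr, tpr))
    Rf_fpr_list.append(fpr)
    Rf_tpr_list.append(tpr)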
             'y_SN_2', 'log_y_err_SN_2']
feat_SN_3 = ['g_SN_3', 'log_g_err_SN_3', 'r_SN_3', 'log_r_err_SN_3',
             'i_SN_3', 'log_i_err_SN_3', 'z_SN_3', 'log_z_err_SN_3',
             'y_SN_3', 'log_y_err_SN_3']
feat_SN_4 = ['g_SN_4', 'log_g_err_SN_4', 'r_SN_4', 'log_r_err_SN_4',
             'i_SN_4', 'log_i_err_SN_4', 'z_SN_4', 'log_z_err_SN_4',
             'y_SN_4', 'log_y_err_SN_4']
feat_SN_5 = ['g_SN_5', 'log_g_err_SN_5', 'r_SN_5', 'log_r_err_SN_5',
             'i_SN_5', 'log_i_err_SN_5', 'z_SN_5', 'log_z_err_SN_5',
             'y_SN_5', 'log_y_err_SN_5']

### training features with robust scaler ###
X_train = RS.fit_transform(df_train[feat_train])

### validation features at the different noise levels ###
X_valid_SN_1 = RS.transform(df_valid[feat_SN_1])
X_valid_SN_2 = RS.transform(df_valid[feat_SN_2])
X_valid_SN_3 = RS.transform(df_valid[feat_SN_3])
X_valid_SN_4 = RS.transform(df_valid[feat_SN_4])
X_valid_SN_5 = RS.transform(df_valid[feat_SN_5])

### The targets that we wish to learn ###
Y_train = df_train['redshift']
Y_valid = df_valid['redshift']

### Some scaling of the target between 0 and 1,  ###
### so we can model it with a beta function,     ###
### given that the Beta function is not defined  ###
### outside that interval                        ###
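# The scaling step the comment announces is not in this fragment. A sketch
# mapping the redshift targets into the open interval (0, 1); the small
# epsilon squeeze is an assumption, keeping every value strictly inside the
# interval so a Beta density stays finite at the endpoints.
z_min, z_max = Y_train.min(), Y_train.max()
eps = 1e-4
Y_train_scaled = (Y_train - z_min) / (z_max - z_min) * (1 - 2 * eps) + eps
Y_valid_scaled = (Y_valid - z_min) / (z_max - z_min) * (1 - 2 * eps) + eps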
elif ei > 4:
    exps = [expinds[ei - 4]]

it = 0
while it < itmax:
    df_all = pd.DataFrame()
    for exp in exps:
        print(exp)
        fname = fold + 'damage_' + exp + '_s25.txt'
        print('input file: ', fname)
        df_sm = pd.read_csv(fname, delim_whitespace=True)
        df = df_sm.dropna().copy()
        trans = RobustScaler()
        # Note: the scaler is fit per experiment file, before the train/test
        # split below, so test rows influence the scaling statistics.
        df[features] = trans.fit_transform(df[features].values)
        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
        df_all = pd.concat([df_all, df], ignore_index=True)
    # df_all = df.copy()

    # random split train/test
    inds = np.random.uniform(0, 1, len(df_all)) <= .80
    df_all['is_train'] = inds
    train = df_all[df_all['is_train'] == True]
    test = df_all[df_all['is_train'] == False]
    x_train = train[features]
    y_train = train[pred_str]
    x_test = test[features]
    y_test = test[pred_str]
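    # The fragment stops here, mid-loop. A sketch of the fit/evaluate step
    # that would typically follow; the model choice is an assumption (nothing
    # in this fragment names it), and the `it` counter presumably advances
    # later in the original loop body.
    from sklearn.ensemble import RandomForestRegressor
    model = RandomForestRegressor(n_estimators=200)
    model.fit(x_train, y_train)
    print('test R^2: ', model.score(x_test, y_test))
    it += 1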
# In[11]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                    random_state=42)

# In[12]:

from sklearn.preprocessing import RobustScaler

# In[13]:

scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
# Transform (not re-fit) the test set with the training-set statistics;
# re-fitting puts the two splits on different scales.
X_test = scaler.transform(X_test)

# # Training using various models

# In[14]:

from sklearn.linear_model import LogisticRegression

model = LogisticRegression(C=0.5)
log = model.fit(X_train, y_train)

# In[15]:

np.set_printoptions(precision=5)
pred = log.predict_proba(X_test)
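# In[16]:

# (Added cell) The notebook stops at the probability predictions; a minimal
# sketch of how they would typically be scored. The metric choice is an
# assumption.
from sklearn.metrics import roc_auc_score, accuracy_score

print('ROC AUC:', roc_auc_score(y_test, pred[:, 1]))
print('Accuracy:', accuracy_score(y_test, log.predict(X_test)))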
        # Print the elapsed time and celebrate that a trained network has been made!
        elapsed = time.time() - start
        print("Elapsed Time: {:.3f}".format(elapsed))
        print('Finished Training')

        # Save the trained network
        torch.save(self.net, self.runName + "_Future_Model.pth")


if __name__ == '__main__':
    # Command Line Argument Parser
    parser = argparse.ArgumentParser()
    # parser.add_argument('-td', '--trainData')
    parser.add_argument('-rn', '--runName')
    args = parser.parse_args()

    allData = jb.load('lstm_Data.joblib')
    scaler = RobustScaler()
    y = scaler.fit_transform(allData[1])

    # Initialize Runner obj and run training cycle
    # needs: trainingData - dataframe of all training data
    futureNet = Runner(allData[0], y, runName=args.runName)
    futureNet.train()

    # Save log, disabled temporarily until review is finished
    # saveLog(log_Path, iden, experiment['datapath'], net.arch_Name, finalEpoch,
    #         true, pred, net.seed, elapsed, str(args.runName), net)
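# Inference sketch (added): the targets were scaled with RobustScaler before
# training, so predictions from the reloaded network must be mapped back with
# scaler.inverse_transform. `sample` is a hypothetical input batch shaped the
# way Runner's network expects; that shape is not recoverable from this file.
#
#     net = torch.load(args.runName + "_Future_Model.pth")
#     net.eval()
#     with torch.no_grad():
#         scaled_pred = net(sample)
#     pred = scaler.inverse_transform(scaled_pred.numpy())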
# aa = X.groupby('VisitNumber').groups
# X_new = pd.DataFrame(columns=X.keys())
# for key in aa.keys():
#     X_new = X_new.append(X.iloc[aa[key], :].mean(), ignore_index=True)

#%%
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler, RobustScaler

standard_scaler = StandardScaler()
robust_scaler = RobustScaler()
X_train = robust_scaler.fit_transform(aa)
X_train1 = standard_scaler.fit_transform(aa)

#%% for the test data
X_test = testData
for col in colName:
    # Hash the column values into a bounded integer range (2**16 buckets)
    X_test[col] = abs((X_test[col].apply(hash)) % 2**16)

#%%
print("handle missing data")
X_test.fillna(X_test.mean(), inplace=True)
default.drop('education', axis=1, inplace=True)
default['male'] = (default['sex'] == 1).astype('int')
default.drop('sex', axis=1, inplace=True)
# default['married'] = (default['marraige'] == 1).astype('int')
# default.drop('marraige', axis=1, inplace=True)

# For the pay features, a value <= 0 means the payment was not delayed
pay_features = ['pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6']
for p in pay_features:
    default.loc[default[p] <= 0, p] = 0

default.rename(columns={'default payment next month': 'default'}, inplace=True)
target_name = 'default'
X = default.drop('default', axis=1)
robust_scaler = RobustScaler()
x = robust_scaler.fit_transform(X)
y = default[target_name]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15,
                                                    random_state=123, stratify=y)

def CMatrix(CM, labels=['pay', 'default']):
    df = pd.DataFrame(data=CM, index=labels, columns=labels)
    df.index.name = 'TRUE'
    df.columns.name = 'PREDICTION'
    df.loc['Total'] = df.sum()
    df['Total'] = df.sum(axis=1)
    return df

metrics = pd.DataFrame(index=['accuracy', 'precision', 'recall'],
                       columns=['NULL', 'LogisticReg', 'ClassTree', 'NaiveBayes'])

# NULL model: always predict the majority class from the training labels
y_pred_test = np.repeat(y_train.value_counts().idxmax(), y_test.size)
metrics.loc['accuracy', 'NULL'] = accuracy_score(y_pred=y_pred_test, y_true=y_test)
metrics.loc['precision', 'NULL'] = precision_score(y_pred=y_pred_test, y_true=y_test)
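# The remaining columns of the metrics table would be filled the same way; a
# sketch for the LogisticReg column (the default hyperparameters are an
# assumption).
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score

logreg = LogisticRegression()
logreg.fit(x_train, y_train)
y_pred_test = logreg.predict(x_test)
metrics.loc['accuracy', 'LogisticReg'] = accuracy_score(y_pred=y_pred_test, y_true=y_test)
metrics.loc['precision', 'LogisticReg'] = precision_score(y_pred=y_pred_test, y_true=y_test)
metrics.loc['recall', 'LogisticReg'] = recall_score(y_pred=y_pred_test, y_true=y_test)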
del result['unique_id']

#%% handle missing value
print("handle missing data")
result.fillna(result.mean(), inplace=True)

#%% data preprocessing
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler, RobustScaler

standard_scaler = StandardScaler()
robust_scaler = RobustScaler()
X_train = robust_scaler.fit_transform(result)
X_train1 = standard_scaler.fit_transform(result)

#%% performance
def performence(clf, train, label, clfName):
    re = cross_validation.ShuffleSplit(train.shape[0], n_iter=10,
                                       test_size=0.25, random_state=43)
    aucList = []
    accuracyList = []
    for train_index, test_index in re:
        # use the `label` argument (the original body reached for a global y)
        clf.fit(train.iloc[train_index, :], label.iloc[train_index])
        # probability predictions, to get the AUC
        pre_y = clf.predict_proba(train.iloc[test_index, :])
        aucList.append(roc_auc_score(label.iloc[test_index], pre_y[:, 1]))
        # hard predictions, to get the accuracy of the model
        y_pred = clf.predict(train.iloc[test_index, :])
        accuracyList.append(accuracy_score(label.iloc[test_index], y_pred))
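# The function body breaks off above (no summary or return survives in this
# fragment). A hedged usage sketch: the scaled ndarray is wrapped back into a
# DataFrame because performence() indexes with .iloc; the classifier choice
# and the label Series `y` are assumptions from the surrounding script.
performence(linear_model.LogisticRegression(), pd.DataFrame(X_train), y,
            'LogisticReg')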
                                                        test_size=0.1,
                                                        random_state=6)
ca_x, ca_x_test, ca_y, ca_y_test = train_test_split(ca_x, ca_y, test_size=0.1,
                                                    random_state=6)
na_x, na_x_test, na_y, na_y_test = train_test_split(na_x, na_y, test_size=0.1,
                                                    random_state=6)

# scaling: fit on each training split, then apply the same statistics to the
# matching test and prediction sets
scaler = RobustScaler()
# scaler = MinMaxScaler()
hhb_x = scaler.fit_transform(hhb_x)
hhb_x_test = scaler.transform(hhb_x_test)
x_pred_hhb = scaler.transform(x_pred_hhb)

hbo2_x = scaler.fit_transform(hbo2_x)
hbo2_x_test = scaler.transform(hbo2_x_test)
x_pred_hbo2 = scaler.transform(x_pred_hbo2)

ca_x = scaler.fit_transform(ca_x)
ca_x_test = scaler.transform(ca_x_test)
x_pred_ca = scaler.transform(x_pred_ca)

na_x = scaler.fit_transform(na_x)
na_x_test = scaler.transform(na_x_test)
x_pred_na = scaler.transform(x_pred_na)
# # Min-Max Scaler $\frac{x_i - \min(x)}{\max(x) - \min(x)}$

# In[5]:

mms = MinMaxScaler()
views['minmax'] = mms.fit_transform(views[['views']])
views

# In[6]:

(vw[0] - np.min(vw)) / (np.max(vw) - np.min(vw))

# # Robust Scaler $\frac{x_i - \operatorname{median}(x)}{\mathrm{IQR}_{(1,3)}(x)}$

# In[7]:

rs = RobustScaler()
views['robust'] = rs.fit_transform(views[['views']])
views

# In[8]:

quartiles = np.percentile(vw, (25., 75.))
iqr = quartiles[1] - quartiles[0]
(vw[0] - np.median(vw)) / iqr
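# In[9]:

# (Added cell) Sanity check: the hand-computed formulas should match the
# fitted scalers across the whole column, not just the first element. This
# assumes vw holds the raw views column as a 1-D array, as cells In[6]-In[8]
# suggest.
np.allclose(views['minmax'], (vw - np.min(vw)) / (np.max(vw) - np.min(vw)))
np.allclose(views['robust'], (vw - np.median(vw)) / iqr)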
class Learned(Model):
    def __init__(self, *args, scale=False, center=False, **kwargs):
        """
        A machine learned model.  Beyond :class:`revscoring.Model`, "Learned"
        models implement
        :func:`~revscoring.scoring.models.Learned.fit` and
        :func:`~revscoring.scoring.models.Learned.cross_validate`.
        """
        super().__init__(*args, **kwargs)
        self.trained = None
        if scale or center:
            self.scaler = RobustScaler(with_centering=center,
                                       with_scaling=scale)
        else:
            self.scaler = None

        self.params.update({
            'scale': scale,
            'center': center
        })

    def train(self, values_labels):
        """
        Fits the model using labeled data by learning its shape.

        :Parameters:
            values_labels : [( `<feature_values>`, `<label>` )]
                an iterable of labeled data where <feature_values> is an
                ordered collection of predictive values that correspond to
                the :class:`revscoring.Feature` s provided to the constructor
        """
        raise NotImplementedError()

    def fit_scaler_and_transform(self, fv_vectors):
        """
        Fits the internal scaler to the feature vectors and transforms them.

        :Parameters:
            fv_vectors : `iterable` (( `<feature_values>`, `<label>` ))
                an iterable of labeled data where <feature_values> is an
                ordered collection of predictive values that correspond to
                the `Feature` s provided to the constructor

        :Returns:
            The scaled feature vectors (unchanged if no scaler is configured).
        """
        if self.scaler is not None:
            return self.scaler.fit_transform(fv_vectors)
        else:
            return fv_vectors

    def apply_scaling(self, fv_vector):
        if self.scaler is not None:
            if not hasattr(self.scaler, "center_") and \
               not hasattr(self.scaler, "scale_"):
                raise RuntimeError("Cannot scale a vector before " +
                                   "training the scaler")
            fv_vector = self.scaler.transform([fv_vector])[0]

        return fv_vector

    def _clean_copy(self):
        raise NotImplementedError()

    def cross_validate(self, values_labels, folds=10, processes=1):
        """
        Trains and tests the model against folds of labeled data.

        :Parameters:
            values_labels : [( `<feature_values>`, `<label>` )]
                an iterable of labeled data where <feature_values> is an
                ordered collection of predictive values that correspond to
                the `Feature` s provided to the constructor
            folds : `int`
                the number of cross-validation folds
            processes : `int`
                When set to 1, cross-validation will run in the parent
                thread.  When set to 2 or greater, a
                :class:`multiprocessing.Pool` will be created.
        """
        folds_i = KFold(n_splits=folds, shuffle=True, random_state=0)
        if processes == 1:
            mapper = map
        else:
            pool = Pool(processes=processes or cpu_count())
            mapper = pool.map
        results = mapper(self._cross_score,
                         ((i, [values_labels[i] for i in train_i],
                           [values_labels[i] for i in test_i])
                          for i, (train_i, test_i) in enumerate(
                              folds_i.split(values_labels))))
        agg_score_labels = []
        for score_labels in results:
            agg_score_labels.extend(score_labels)

        self.info['statistics'].fit(agg_score_labels)

        return self.info['statistics']

    def _cross_score(self, i_train_test):
        i, train_set, test_set = i_train_test
        logger.info("Performing cross-validation {0}...".format(i + 1))
        model = self._clean_copy()
        logger.debug("Training cross-validation for {0}...".format(i + 1))
        model.train(train_set)
        logger.debug("Scoring cross-validation for {0}...".format(i + 1))
        feature_values, labels = map(list, zip(*test_set))
        docs = model.score_many(feature_values)
        return list(zip(docs, labels))
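# Learned.train() and _clean_copy() raise NotImplementedError, so the class is
# only usable through a concrete subclass. A hedged usage sketch; the subclass
# name is hypothetical (revscoring's real concrete models live under
# revscoring.scoring.models):
#
#     model = MyLearnedModel(features, scale=True, center=True)
#     statistics = model.cross_validate(values_labels, folds=10, processes=4)
#     print(statistics)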