def best_ica_wine(self):
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_wine_data()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    ica = FastICA(n_components=X_train_scl.shape[1])
    X_train_transformed = ica.fit_transform(X_train_scl)
    X_test_transformed = ica.transform(X_test_scl)

    ## top 2 components by kurtosis (ordering computed on the training set and
    ## reused for the test set so the two stay aligned)
    kurt = kurtosis(X_train_transformed)
    i = kurt.argsort()[::-1]
    X_train_transformed = X_train_transformed[:, i][:, 0:2]
    X_test_transformed = X_test_transformed[:, i][:, 0:2]

    # save
    filename = './' + self.save_dir + '/wine_ica_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_ica_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_ica_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/wine_ica_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def ica_analysis(self, X_train, X_test, y_train, y_test, data_set_name): scl = RobustScaler() X_train_scl = scl.fit_transform(X_train) X_test_scl = scl.transform(X_test) ## ## ICA ## ica = FastICA(n_components=X_train_scl.shape[1]) X_ica = ica.fit_transform(X_train_scl) ## ## Plots ## ph = plot_helper() kurt = kurtosis(X_ica) print(kurt) title = 'Kurtosis (FastICA) for ' + data_set_name name = data_set_name.lower() + '_ica_kurt' filename = './' + self.out_dir + '/' + name + '.png' ph.plot_simple_bar(np.arange(1, len(kurt)+1, 1), kurt, np.arange(1, len(kurt)+1, 1).astype('str'), 'Feature Index', 'Kurtosis', title, filename)
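The kurtosis bar plot above is easier to read with a reference point: scipy.stats.kurtosis returns excess kurtosis (about 0 for a Gaussian), so components with strongly positive values are the most non-Gaussian and usually the most informative ICA directions. A minimal sketch with synthetic data (the variable names here are illustrative only):

import numpy as np
from scipy.stats import kurtosis

rng = np.random.RandomState(0)
gaussian = rng.normal(size=100000)    # excess kurtosis ~ 0
laplacian = rng.laplace(size=100000)  # heavy tails, excess kurtosis ~ 3

print(kurtosis(gaussian))   # close to 0
print(kurtosis(laplacian))  # close to 3

# Rank the columns of a component matrix by kurtosis, most non-Gaussian first
X_components = np.column_stack([gaussian, laplacian])
order = kurtosis(X_components).argsort()[::-1]
print(order)  # [1 0]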
def best_rp_nba(self): dh = data_helper() X_train, X_test, y_train, y_test = dh.get_nba_data() scl = RobustScaler() X_train_scl = scl.fit_transform(X_train) X_test_scl = scl.transform(X_test) rp = GaussianRandomProjection(n_components=X_train_scl.shape[1]) X_train_transformed = rp.fit_transform(X_train_scl, y_train) X_test_transformed = rp.transform(X_test_scl) ## top 2 kurt = kurtosis(X_train_transformed) i = kurt.argsort()[::-1] X_train_transformed_sorted = X_train_transformed[:, i] X_train_transformed = X_train_transformed_sorted[:,0:2] kurt = kurtosis(X_test_transformed) i = kurt.argsort()[::-1] X_test_transformed_sorted = X_test_transformed[:, i] X_test_transformed = X_test_transformed_sorted[:,0:2] # save filename = './' + self.save_dir + '/nba_rp_x_train.txt' pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/nba_rp_x_test.txt' pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/nba_rp_y_train.txt' pd.DataFrame(y_train).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/nba_rp_y_test.txt' pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def processing(df): dummies_df = pd.get_dummies(df["City Group"]) def add_CG(name): return "CG_" + name dummies_df = dummies_df.rename(columns=add_CG) # print dummies_df.head() df = pd.concat([df, dummies_df.iloc[:, 0]], axis=1) dummies_df = pd.get_dummies(df["Type"]) def add_Type(name): return "Type_" + name dummies_df = dummies_df.rename(columns=add_Type) df = pd.concat([df, dummies_df.iloc[:, 0:3]], axis=1) # try to put in age as a column def add_Age(string): age = datetime.datetime.now() - datetime.datetime.strptime(string, "%m/%d/%Y") return age.days df["Age"] = df["Open Date"].map(add_Age) df = df.drop(["Id", "Open Date", "City", "City Group", "Type", "revenue"], axis=1) # scaler = StandardScaler().fit(df) scaler = RobustScaler().fit(df) df = scaler.transform(df) # print df.head() return df
def num_scaler(d_num, t_num):
    scl = RobustScaler()
    scl.fit(d_num)
    d_num = scl.transform(d_num)
    t_num = scl.transform(t_num)
    return d_num, t_num
def _robust_scaler(self, input_df): """Uses Scikit-learn's RobustScaler to scale the features using statistics that are robust to outliers Parameters ---------- input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']} Input DataFrame to scale Returns ------- scaled_df: pandas.DataFrame {n_samples, n_features + ['guess', 'group', 'class']} Returns a DataFrame containing the scaled features """ training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1) if len(training_features.columns.values) == 0: return input_df.copy() # The scaler must be fit on only the training data scaler = RobustScaler() scaler.fit(training_features.values.astype(np.float64)) scaled_features = scaler.transform(input_df.drop(['class', 'group', 'guess'], axis=1).values.astype(np.float64)) for col_num, column in enumerate(input_df.drop(['class', 'group', 'guess'], axis=1).columns.values): input_df.loc[:, column] = scaled_features[:, col_num] return input_df.copy()
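For intuition about why the fit is restricted to the training rows: RobustScaler subtracts each column's median and divides by its interquartile range, so a single extreme value barely moves the transformed inliers. A small illustrative sketch with toy values (not taken from the function above):

import numpy as np
from sklearn.preprocessing import RobustScaler

x = np.array([[1.0], [2.0], [3.0], [4.0], [1000.0]])  # one extreme outlier
scaler = RobustScaler().fit(x)
print(scaler.center_, scaler.scale_)  # median = 3.0, IQR = 2.0
print(scaler.transform(x).ravel())    # [-1.  -0.5  0.   0.5  498.5]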
def nn_wine_orig(self):
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_wine_data()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    self.part4.nn_analysis(X_train_scl, X_test_scl, y_train, y_test, 'Wine', 'Neural Network Original')
def lda_analysis(self, X_train, X_test, y_train, y_test, data_set_name): scl = RobustScaler() X_train_scl = scl.fit_transform(X_train) X_test_scl = scl.transform(X_test) ## ## Plots ## ph = plot_helper() scores = [] train_scores = [] rng = range(1, X_train_scl.shape[1]+1) for i in rng: lda = LinearDiscriminantAnalysis(n_components=i) cv = KFold(X_train_scl.shape[0], 3, shuffle=True) # cross validation cv_scores = [] for (train, test) in cv: lda.fit(X_train_scl[train], y_train[train]) score = lda.score(X_train_scl[test], y_train[test]) cv_scores.append(score) mean_score = np.mean(cv_scores) scores.append(mean_score) # train score lda = LinearDiscriminantAnalysis(n_components=i) lda.fit(X_train_scl, y_train) train_score = lda.score(X_train_scl, y_train) train_scores.append(train_score) print(i, mean_score) ## ## Score Plot ## title = 'Score Summary Plot (LDA) for ' + data_set_name name = data_set_name.lower() + '_lda_score' filename = './' + self.out_dir + '/' + name + '.png' ph.plot_series(rng, [scores, train_scores], [None, None], ['cross validation score', 'training score'], cm.viridis(np.linspace(0, 1, 2)), ['o', '*'], title, 'n_components', 'Score', filename)
def test_robustscaler_vs_sklearn():
    # Compare msmbuilder.preprocessing.RobustScaler
    # with sklearn.preprocessing.RobustScaler
    robustscalerr = RobustScalerR()
    robustscalerr.fit(np.concatenate(trajs))

    robustscaler = RobustScaler()
    robustscaler.fit(trajs)

    y_ref1 = robustscalerr.transform(trajs[0])
    y1 = robustscaler.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
def best_lda_cluster_wine(self): dh = data_helper() dh = data_helper() X_train, X_test, y_train, y_test = dh.get_wine_data_lda_best() scl = RobustScaler() X_train_scl = scl.fit_transform(X_train) X_test_scl = scl.transform(X_test) ## ## K-Means ## km = KMeans(n_clusters=4, algorithm='full') X_train_transformed = km.fit_transform(X_train_scl) X_test_transformed = km.transform(X_test_scl) # save filename = './' + self.save_dir + '/wine_kmeans_lda_x_train.txt' pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/wine_kmeans_lda_x_test.txt' pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/wine_kmeans_lda_y_train.txt' pd.DataFrame(y_train).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/wine_kmeans_lda_y_test.txt' pd.DataFrame(y_test).to_csv(filename, header=False, index=False) ## ## GMM ## gmm = GaussianMixture(n_components=4, covariance_type='full') X_train_transformed = km.fit_transform(X_train_scl) X_test_transformed = km.transform(X_test_scl) # save filename = './' + self.save_dir + '/wine_gmm_lda_x_train.txt' pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/wine_gmm_lda_x_test.txt' pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/wine_gmm_lda_y_train.txt' pd.DataFrame(y_train).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/wine_gmm_lda_y_test.txt' pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
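Note that sklearn's GaussianMixture exposes no transform method, so the GMM branch above cannot mirror the KMeans fit_transform/transform calls directly. One possible alternative (a sketch under that assumption, not the original author's code) is to export the posterior component probabilities as the transformed features:

from sklearn.mixture import GaussianMixture

# X_train_scl / X_test_scl are the robust-scaled arrays from the snippet above
gmm = GaussianMixture(n_components=4, covariance_type='full')
gmm.fit(X_train_scl)

# Posterior probability of each mixture component, shape (n_samples, 4)
X_train_transformed = gmm.predict_proba(X_train_scl)
X_test_transformed = gmm.predict_proba(X_test_scl)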
def best_lda_nba(self): dh = data_helper() X_train, X_test, y_train, y_test = dh.get_nba_data() scl = RobustScaler() X_train_scl = scl.fit_transform(X_train) X_test_scl = scl.transform(X_test) lda = LinearDiscriminantAnalysis(n_components=2) X_train_transformed = lda.fit_transform(X_train_scl, y_train) X_test_transformed = lda.transform(X_test_scl) # save filename = './' + self.save_dir + '/nba_lda_x_train.txt' pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/nba_lda_x_test.txt' pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/nba_lda_y_train.txt' pd.DataFrame(y_train).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/nba_lda_y_test.txt' pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def best_pca_wine(self): dh = data_helper() X_train, X_test, y_train, y_test = dh.get_wine_data() scl = RobustScaler() X_train_scl = scl.fit_transform(X_train) X_test_scl = scl.transform(X_test) pca = PCA(n_components=3) X_train_transformed = pca.fit_transform(X_train_scl, y_train) X_test_transformed = pca.transform(X_test_scl) # save filename = './' + self.save_dir + '/wine_pca_x_train.txt' pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/wine_pca_x_test.txt' pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/wine_pca_y_train.txt' pd.DataFrame(y_train).to_csv(filename, header=False, index=False) filename = './' + self.save_dir + '/wine_pca_y_test.txt' pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def pca_analysis(self, X_train, X_test, y_train, y_test, data_set_name): scl = RobustScaler() X_train_scl = scl.fit_transform(X_train) X_test_scl = scl.transform(X_test) ## ## PCA ## pca = PCA(n_components=X_train_scl.shape[1], svd_solver='full') X_pca = pca.fit_transform(X_train_scl) ## ## Plots ## ph = plot_helper() ## ## Explained Variance Plot ## title = 'Explained Variance (PCA) for ' + data_set_name name = data_set_name.lower() + '_pca_evar_err' filename = './' + self.out_dir + '/' + name + '.png' self.plot_explained_variance(pca, title, filename) ## ## Reconstruction Error ## all_mses, rng = self.reconstruction_error(X_train_scl, PCA) title = 'Reconstruction Error (PCA) for ' + data_set_name name = data_set_name.lower() + '_pca_rec_err' filename = './' + self.out_dir + '/' + name + '.png' ph.plot_series(rng, [all_mses.mean(0)], [all_mses.std(0)], ['mse'], ['red'], ['o'], title, 'Number of Features', 'Mean Squared Error', filename) ## ## Manually compute eigenvalues ## cov_mat = np.cov(X_train_scl.T) eigen_values, eigen_vectors = np.linalg.eig(cov_mat) print(eigen_values) sorted_eigen_values = sorted(eigen_values, reverse=True) title = 'Eigen Values (PCA) for ' + data_set_name name = data_set_name.lower() + '_pca_eigen' filename = './' + self.out_dir + '/' + name + '.png' ph.plot_simple_bar(np.arange(1, len(sorted_eigen_values)+1, 1), sorted_eigen_values, np.arange(1, len(sorted_eigen_values)+1, 1).astype('str'), 'Principal Components', 'Eigenvalue', title, filename) ## TODO Factor this out to new method ## ## Scatter ## '''
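As a sanity check on the manual eigen-decomposition above, the descending covariance eigenvalues should match pca.explained_variance_ up to numerical noise, since PCA is an eigen-decomposition of the sample covariance matrix. A minimal sketch, assuming X_train_scl is the scaled training matrix from above:

import numpy as np
from sklearn.decomposition import PCA

pca = PCA(svd_solver='full').fit(X_train_scl)

# eigvalsh returns ascending eigenvalues of the symmetric covariance matrix
eig = np.linalg.eigvalsh(np.cov(X_train_scl.T))[::-1]  # largest first
print(np.allclose(eig, pca.explained_variance_))        # True up to numerical noise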
def train(filename_train, filename_model, n_events_train=-1, simple=False, n_features=7, n_hidden=30, n_epochs=5, batch_size=64, step_size=0.01, decay=0.7, random_state=1): # Initialization gated = not simple logging.info("Calling with...") logging.info("\tfilename_train = %s" % filename_train) logging.info("\tfilename_model = %s" % filename_model) logging.info("\tn_events_train = %d" % n_events_train) logging.info("\tgated = %s" % gated) logging.info("\tn_features = %d" % n_features) logging.info("\tn_hidden = %d" % n_hidden) logging.info("\tn_epochs = %d" % n_epochs) logging.info("\tbatch_size = %d" % batch_size) logging.info("\tstep_size = %f" % step_size) logging.info("\tdecay = %f" % decay) logging.info("\trandom_state = %d" % random_state) rng = check_random_state(random_state) # Make data logging.info("Loading data...") fd = open(filename_train, "rb") X, y = pickle.load(fd, encoding='latin1') fd.close() y = np.array(y) if n_events_train > 0: indices = check_random_state(123).permutation(len(X))[:n_events_train] X = [X[i] for i in indices] y = y[indices] logging.info("\tfilename = %s" % filename_train) logging.info("\tX size = %d" % len(X)) logging.info("\ty size = %d" % len(y)) # Preprocessing logging.info("Preprocessing...") X = [extract(permute_by_pt(rewrite_content(jet))) for jet in X] tf = RobustScaler().fit(np.vstack([jet["content"] for jet in X])) for jet in X: jet["content"] = tf.transform(jet["content"]) # Split into train+validation logging.info("Splitting into train and validation...") X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=5000, random_state=rng) # Training logging.info("Training...") if gated: predict = grnn_predict_gated init = grnn_init_gated else: predict = grnn_predict_simple init = grnn_init_simple trained_params = init(n_features, n_hidden, random_state=rng) n_batches = int(np.ceil(len(X_train) / batch_size)) best_score = [-np.inf] # yuck, but works best_params = [trained_params] def loss(X, y, params): y_pred = predict(params, X) l = log_loss(y, y_pred).mean() return l def objective(params, iteration): rng = check_random_state(iteration % n_batches) start = rng.randint(len(X_train) - batch_size) idx = slice(start, start + batch_size) return loss(X_train[idx], y_train[idx], params) def callback(params, iteration, gradient): if iteration % 25 == 0: roc_auc = roc_auc_score(y_valid, predict(params, X_valid)) if roc_auc > best_score[0]: best_score[0] = roc_auc best_params[0] = copy.deepcopy(params) fd = open(filename_model, "wb") pickle.dump(best_params[0], fd) fd.close() logging.info( "%5d\t~loss(train)=%.4f\tloss(valid)=%.4f" "\troc_auc(valid)=%.4f\tbest_roc_auc(valid)=%.4f" % (iteration, loss(X_train[:5000], y_train[:5000], params), loss(X_valid, y_valid, params), roc_auc, best_score[0])) for i in range(n_epochs): logging.info("epoch = %d" % i) logging.info("step_size = %.4f" % step_size) trained_params = adam(ag.grad(objective), trained_params, step_size=step_size, num_iters=1 * n_batches, callback=callback) step_size = step_size * decay
y_total = df.iloc[:, -1:].values x_total = df.iloc[:, :-1].values y_test = y_total[-test_size:, :] x_test = x_total[-test_size:, :] y_train = y_total[:-val_size - test_size, :] x_train = x_total[:-val_size - test_size, :] y_val = y_total[-val_size - test_size - 1:-test_size, :] x_val = x_total[-val_size - test_size - 1:-test_size, :] n_samples = x_train.shape[0] m = len(y_train) scalerX = RobustScaler(quantile_range=(10, 90)) scalerY = RobustScaler(quantile_range=(10, 90)) x_train = scalerX.fit_transform(x_train) y_train = scalerY.fit_transform(y_train) x_val = scalerX.transform(x_val) y_val = scalerY.transform(y_val) x_test = scalerX.transform(x_test) y_test = scalerY.transform(y_test) tempo = time.time() epochs = 200 learning_rate = 0.01 batch_size = m # random seed os.environ['PYTHONHASHSEED'] = '0' seed = 123456 if seed is not None: np.random.seed(seed) rn.seed(seed) tf.set_random_seed(seed)
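Because the target is scaled with its own RobustScaler here, any model predictions live in the scaled space; to report errors in the original units they have to be mapped back with scalerY.inverse_transform. A short sketch, where y_pred is a hypothetical model output of shape (n_samples, 1):

import numpy as np

y_pred_orig = scalerY.inverse_transform(y_pred)
y_test_orig = scalerY.inverse_transform(y_test)

mae = np.mean(np.abs(y_pred_orig - y_test_orig))  # error in original target units
print(mae)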
import pandas as pd import matplotlib.pyplot as plt from sgmcmc_ssm.models.gauss_hmm import GaussHMMSampler from tqdm import tqdm np.random.seed(12345) # Load and Scale Data from scipy.io import loadmat ion_data = loadmat('data/alamethicin.mat') from sklearn.preprocessing import RobustScaler scaler = RobustScaler() observations = scaler.fit_transform(ion_data['originaldata'][1095:-3000]) filtered_observations = scaler.transform(ion_data['filtereddata']) T = len(observations) # Plot Data fig, ax = plt.subplots(1, 1) ax.plot(np.arange(T)[::50], observations[::50], '-', label='scaled data') ax.plot(np.arange(T)[::50], filtered_observations[::50], '-', label='scaled filtered data') ax.set_title('Scaled Ion Data') ax.set_xlabel('Time') ax.set_ylabel('Voltage (Scaled)') ax.legend() # Process all
# Drop the encrypted phone number (LineNumber) and the call category (as labeled by the data team)
athena = athena.drop(['LineNumber', 'CallCategory'], axis=1)

# Split into subgroups, as training on the entire dataset breaks my computer
group = np.array_split(athena, 4)

# Iterate through each group
for i in range(len(group)):
    print('======= GROUP {} ======'.format(i))
    subdata = group[i]

    ## Scale the data (RobustScaler centers on the median and scales by the IQR)
    print('Scaling Data')
    scaler = RobustScaler().fit(athena)
    subdata_scaled = scaler.transform(subdata)

    ## Reduce data for clustering
    print('Reducing dimensions')
    model = umap.UMAP(n_neighbors=20, min_dist=0.15, metric='braycurtis')
    data_2d = model.fit_transform(subdata_scaled)

    print('Clustering Data')
    cluster = DBSCAN(eps=3, min_samples=2).fit(subdata_scaled)

    print('Configuring data to clusters')
    subdata['PCA1'] = data_2d[:, 0]
    subdata['PCA2'] = data_2d[:, 1]
    cluster.labels_[cluster.labels_ > 0] = 1
    subdata['cluster'] = cluster.labels_
numsamples = 50000

# Set a random seed for reproducibility
randomseed = 5

# Separate data into training and testing data
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X_fSelect, y, range(len(y)), train_size=numsamples, random_state=randomseed)

# Create input data scaler based only on the training set
scaler_X = RobustScaler()
scaler_X = scaler_X.fit(X_train)
X_train_scaled = scaler_X.transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# Create the SVM model
clf = svm.SVC(kernel='rbf', C=0.1, gamma=0.01, class_weight={1: 50}, probability=True)
clf.fit(X_train_scaled, y_train)

## Step 5: SVM evaluation

We use a variety of evaluation metrics to gauge model performance, but emphasize the True Skill Statistic (TSS) here because it is insensitive to the class-imbalance ratio [[Bloomfield et al., 2012](http://iopscience.iop.org/article/10.1088/2041-8205/747/2/L41/meta "Bloomfield - TSS")]. All metrics are computed from the entries of the contingency (confusion) matrix. For the scintillation/no-scintillation classification problem the matrix is shown at https://github.com/rmcgranaghan/machine-learning-with-GNSS-data/blob/master/confusion_matrix_schematic.png.
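For reference, the TSS can be computed directly from the confusion-matrix entries as TSS = TP/(TP+FN) - FP/(FP+TN), i.e. recall minus the false-alarm rate, which is why it is insensitive to the class-imbalance ratio. A minimal sketch using the fitted SVM above:

from sklearn.metrics import confusion_matrix

y_pred = clf.predict(X_test_scaled)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
tss = tp / (tp + fn) - fp / (fp + tn)
print('TSS =', tss)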
else: break for ds_i, ds in enumerate(DS): if input(f'Run {DSNAMES[ds_i]}? >') == 'y': X, y = ds if scaler == 'rs': scale = RobustScaler().fit(X) elif scaler == 'ss': scale = StandardScaler().fit(X) elif scaler == 'qt': scale = QuantileTransformer(n_quantiles=np.min([1000, X.shape[0]]), output_distribution='uniform').fit(X) else: raise ValueError('Improper scaling method chosen') X = scale.transform(X) Path(os.path.join(OUTFILE, DSNAMES[ds_i])).mkdir(parents=True, exist_ok=True) outpath = os.path.join(OUTFILE, DSNAMES[ds_i]) print(f'Running {DSNAMES[ds_i]}\n') # Part 1 - Cluster data print('Running Clustering') if not (os.path.isfile(os.path.join(outpath, 'KM_est.pkl')) and os.path.isfile(os.path.join(outpath, 'EM_est.pkl'))): KM_est, EM_est = handle_clusters(X, outpath) handle_cluster_visualization(KM_est, X, y, outpath) handle_cluster_visualization(EM_est, X, y, outpath) with open(os.path.join(outpath, 'KM_est.pkl'), 'wb') as kmpk: pickle.dump(KM_est, kmpk, -1) with open(os.path.join(outpath, 'EM_est.pkl'), 'wb') as empk:
#values that prediction is based on x = csv[["AirTemp", "Press", "UMR"]] #values to be predicted y = csv[["NO", "NO2", "O3", "PM10"]] #return four marix: two for learn and two for test x_learn, x_test, y_learn, y_test = train_test_split(x, y, test_size=0.3, random_state=0) #transformer for transforming the values transformer = RobustScaler().fit(x_learn) #scalar type of the x_learn matrix x_learn_scalar = transformer.transform(x_learn) #scalar type of the x_test matrix x_test_scalar = transformer.transform(x_test) model = LinearRegression(fit_intercept=True, normalize=True).fit(x_learn_scalar, y_learn) #returns coefficient of determination determ_coef = model.score(x_test_scalar, y_test) #returns the intercept for each value intercept = model.intercept_ #returns the slope for each value slope = model.coef_ print("Coefficient of determination: ", determ_coef) print("Intercept: ", intercept) print("Slope: ", slope)
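Since the regression is fit on robust-scaled inputs, intercept_ and coef_ are expressed in the scaled feature space; to interpret the slopes per original unit of AirTemp, Press and UMR, they can be divided by the scaler's per-feature scale. A sketch of that back-transformation, assuming the transformer and model objects defined above:

import numpy as np

# x_scaled = (x - center_) / scale_, so per-original-unit slopes are coef_ / scale_
slope_orig = model.coef_ / transformer.scale_
intercept_orig = model.intercept_ - model.coef_ @ (transformer.center_ / transformer.scale_)

print("Slope (original units):", slope_orig)
print("Intercept (original units):", intercept_orig)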
labels = train_df['y'] train_df = train_df.drop('y',axis =1 ) #%% all_data = pd.concat([train_df,test_df],axis=0,ignore_index=True) #%% all_data["galaxy"] = all_data["galaxy"].astype('category') all_data["galaxy"] = all_data["galaxy"].cat.codes #%% all_data_without_year_name = all_data.drop(['galactic year','galaxy'],axis=1) #%% scaler = RobustScaler().fit(all_data_without_year_name) all_data_without_year_name_scaled = scaler.transform(all_data_without_year_name) #%% year_name = all_data[['galactic year','galaxy']] all_data_without_year_name_scaled_df = pd.DataFrame(all_data_without_year_name_scaled,columns=all_data_without_year_name.columns) #%% all_data_scaled = pd.concat([year_name,all_data_without_year_name_scaled_df],axis=1,sort=False) # all_data_scaled['galactic year'] =all_data_scaled['galactic year'] - all_data_scaled['galactic year'][0] #%% #all_data_scaled = all_data_scaled.fillna(0) #%% X_train = all_data_scaled[0:len(train_df)] X_test = all_data_scaled[len(train_df):] # %% Non_data_col = ['galaxy','y'] predictors = [x for x in all_data_scaled.columns if x not in Non_data_col]
#%% Prepare train and test sets for the model train_set = traintest_set[:train_len] test_set = traintest_set[train_len:] train_set = train_set.drop('Id', axis=1) test_set = test_set.drop('Id', axis=1) X = train_set.drop('SalePrice', axis=1) y = train_set['SalePrice'] test_set = test_set.drop('SalePrice', axis=1) sc = RobustScaler() X = sc.fit_transform(X) test_set = sc.transform(test_set) #%% Build the model model = Lasso(alpha=.001, random_state=1) model.fit(X, y) #%% Kaggle submission pred = model.predict(test_set) preds = np.exp(pred) print(model.score(X, y)) output = pd.DataFrame({'Id': test2.Id, 'SalePrice': preds}) output.to_csv('submission.csv', index=False) output.head()
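The np.exp(pred) step above only makes sense if SalePrice was log-transformed before fitting (y = np.log(SalePrice) somewhere upstream, which is not shown here); under that assumption the round trip looks roughly like this sketch:

import numpy as np
from sklearn.linear_model import Lasso

y_log = np.log(train_set['SalePrice'])            # fit on the log target
model = Lasso(alpha=.001, random_state=1).fit(X, y_log)

preds = np.exp(model.predict(test_set))           # invert the log to get prices back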
loan_default1 = train['loan_default']
df_training = pd.concat([training_data, loan_default1], axis=1)
print(df_training.columns)

# NOTE: df_training is used for training, test_data for testing
x_train = df_training.drop(['loan_default'], axis=1)
y_train = df_training['loan_default']
print(df_training.dtypes)
print(x_train.shape)

#y_prediction = pd.DataFrame(y_prediction, columns=["loan_default"])
#print(y_prediction.tail())

from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
scaler.fit(x_train)
xscale = scaler.transform(x_train)

# Reuse the scaler fitted on the training features for the test set;
# re-fitting on test_data would leak test-set statistics into preprocessing
test_scaled = scaler.transform(test_data)

#y_prediction.to_csv("C:/Users/hp/Desktop/lt/submitl22.csv")

#################### MAKING TEST SET THE SAME TYPE ####################
#test = pd.read_csv("C:/Users/hp/Desktop/lt/test_bqCt9Pv.csv")

#################### USING RANDOM FOREST ####################
import matplotlib.pyplot as plt
import tensorflow as tf
#rescales the data set such that all feature values are in the range [0, 1] #For large outliers, it compresses lower values to too small numbers. #Sensitive to outliers. scaler2 = MinMaxScaler() scaler2.fit(X) X2 = scaler2.transform(X) df2 = pd.DataFrame(data=X2, columns=column_names) print(df2.describe()) sns.jointplot(x='MedInc', y='AveOccup', data=df2, xlim=[0,1], ylim=[0,0.005]) #Data scaled but outliers still exist #3 RobustScaler # the centering and scaling statistics of this scaler are based on percentiles #and are therefore not influenced by a few number of very large marginal outliers. scaler3 = RobustScaler() scaler3.fit(X) X3 = scaler3.transform(X) df3 = pd.DataFrame(data=X3, columns=column_names) print(df3.describe()) sns.jointplot(x='MedInc', y='AveOccup', data=df3, xlim=[-2,3], ylim = [-2,3]) #Range -2 to 3 #4 PowerTransformer # applies a power transformation to each feature to make the data more Gaussian-like scaler4 = PowerTransformer() scaler4.fit(X) X4 = scaler4.transform(X) df4 = pd.DataFrame(data=X4, columns=column_names) print(df4.describe()) sns.jointplot(x='MedInc', y='AveOccup', data=df4) # #5 QuantileTransformer
def train(filename_train,filename_valid,filename_model,n_train=1200000,n_valid=400000,n_features=7, n_hidden=40,n_epochs=18,batch_size=128,step_size=0.005,decay=0.9): logging.info("Calling with...") logging.info("\tfilename_train = %s" % filename_train) logging.info("\tfilename_valid = %s" % filename_valid) logging.info("\tfilename_model = %s" % filename_model) logging.info("\tn_train = %d" % n_train) logging.info("\tn_valid = %d" % n_valid) logging.info("\tn_features = %d" % n_features) logging.info("\tn_hidden = %d" % n_hidden) logging.info("\tn_epochs = %d" % n_epochs) logging.info("\tbatch_size = %d" % batch_size) logging.info("\tstep_size = %f" % step_size) logging.info("\tdecay = %f" % decay) ####################### Reading the train data ################################# logging.info("Loading train data") fd = open(filename_train, "rb") X, y = pickle.load(fd,encoding='latin-1') fd.close() y = np.array(y) indices = torch.randperm(len(X)).numpy()[:n_train] X = [X[i] for i in indices] y = y[indices] print("\tfilename = %s" % filename_train) print("\tX size = %d" % len(X)) print("\ty size = %d" % len(y)) # Preprocessing # feature scaling logging.info("Preprocessing the train data") X = [extract(pt_order(rewrite_content(jet))) for jet in X] transfer_feature= RobustScaler().fit(np.vstack([jet["content"] for jet in X])) for jet in X: jet["content"] = transfer_feature.transform(jet["content"]) X_train=X y_train=y '''----------------------------------------------------------------------- ''' logging.info("Loading validation data") fd = open(filename_valid, "rb") X, y = pickle.load(fd,encoding='latin-1') fd.close() y = np.array(y) indices = torch.randperm(len(X)).numpy()[:n_valid] X = [X[i] for i in indices] y = y[indices] print("\tfilename = %s" % filename_valid) print("\tX size = %d" % len(X)) print("\ty size = %d" % len(y)) logging.info("Preprocessing the train data") X = [extract(pt_order(rewrite_content(jet))) for jet in X] for jet in X: jet["content"] = transfer_feature.transform(jet["content"]) X_valid=X y_valid=y ###########################################Define MODEL ############################## logging.info("Initializing model...") model = Predict(n_features,n_hidden) if torch.cuda.is_available(): logging.warning("Moving model to GPU") model.cuda() logging.warning("Moved model to GPU") ###########################OPTIMIZER AND LOSS ########################################## logging.info("Building optimizer...") optimizer = Adam(model.parameters(), lr=step_size) scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=decay) n_batches = int(len(X_train) // batch_size) best_score = [-np.inf] best_model_state_dict = copy.deepcopy(model.state_dict()) # intial parameters of model ###############################VALIDATION OF DATA ######################################## def callback(epoch, iteration, model): if iteration % n_batches == 0: model.eval() offset = 0; train_loss = []; valid_loss = [] yy, yy_pred, accuracy_train, accuracy_valid = [], [],[],[] for i in range(len(X_valid) // batch_size): idx = slice(offset, offset+batch_size) Xt, yt = X_train[idx], y_train[idx] X_var = wrap_X(Xt); y_var = wrap(yt) tl = unwrap(loss(model(X_var), y_var)); train_loss.append(tl) y_pred_train = model(X_var) y = unwrap(y_var); y_pred_train = unwrap(y_pred_train) X = unwrap_X(X_var) Xv, yv = X_valid[idx], y_valid[idx] X_var = wrap_X(Xv); y_var = wrap(yv) y_pred = model(X_var) vl = unwrap(loss(y_pred, y_var)); valid_loss.append(vl) Xv = unwrap_X(X_var); yv = unwrap(y_var); y_pred = unwrap(y_pred) 
yy.append(yv); yy_pred.append(y_pred) y_pred=np.column_stack(y_pred).ravel() accuracy_valid.append(np.sum(np.rint(y_pred)==yv)/float(len(yv))) offset+=batch_size train_loss = np.mean(np.array(train_loss)) valid_loss = np.mean(np.array(valid_loss)) accuracy_valid=np.mean(np.array(accuracy_valid)) print("accuracy_valid:",accuracy_valid) print("train_loss:",train_loss) roc_auc = roc_auc_score(np.column_stack(yy).ravel(), np.column_stack(yy_pred).ravel()) print("roc_auc:",roc_auc) if roc_auc > best_score[0]: best_score[0]=roc_auc best_model_state_dict[0] = copy.deepcopy(model.state_dict()) with open(filename_model, 'wb') as f: torch.save(best_model_state_dict[0], f) scheduler.step(valid_loss) model.train() ###############################TRAINING ######################################## logging.warning("Training the data") iteration=1 for i in range(n_epochs): print("epoch = %d" % i) print("step_size = %.4f" % step_size) t0 = time.time() for _ in range(n_batches): ## mini batch iteration += 1 model.train() optimizer.zero_grad() start = torch.round(torch.rand(1) * (len(X_train) - batch_size)).numpy()[0].astype(np.int32) idx = slice(start, start+batch_size) X, y = X_train[idx], y_train[idx] X_var = wrap_X(X); y_var = wrap(y) ## wrap_X, wrap moves to GPU l = loss(model(X_var), y_var) l.backward() optimizer.step() X = unwrap_X(X_var); y = unwrap(y_var) ## unwrap_X, unwrap moves to GPU callback(i, iteration, model) t1 = time.time() ### print(f'Epoch took {t1-t0} seconds') scheduler.step() step_size = step_size * decay
def least_square_reference( inst, empty_room=None, max_times_samples=2000, bad_channels=None, scaler=None, mrk=None, elp=None, hsp=None ): """ Fits and applies Least Square projection of the reference channels (potentially from an empty room) and removes the corresponding component from the recordings of a subject. Parameters ---------- inst : Raw | str Raw instance or path to raw data. empty_room : str | None Path to raw data acquired in empty room. max_times_samples : int Number of time sample to use for pinv. Defautls to 2000 bad_channels : list | array, shape (n_chans) of strings Lists bad channels scaler : function | None Scaler functions to normalize data. Defaults to sklearn.preprocessing.RobustScaler. Returns ------- inst : Raw adapted from Adeen Flinker 6/2013 (<*****@*****.**>) LSdenoise.m Main EHN - Automatically detects channel types. - Allows flexible scaler; Robust by default. - The data is projected back in Tesla. - Allows memory control. TODO: - Allow other kind of MNE-Python inst - Allow baseline selection (pre-stim instead of empty room) - Clean up memory - Allow fancy solver (l1, etc) """ from scipy.linalg import pinv from mne.io import read_raw_kit from mne.io import _BaseRaw # Least square can be fitted on empty room or on subject's data if empty_room is None: if not isinstance(inst, _BaseRaw): raw = read_raw_kit(inst, preload=True) else: raw = inst else: if not isinstance(empty_room, _BaseRaw): raw = read_raw_kit(empty_room, preload=True) else: raw = empty_room # Parameters n_chans, n_times = raw._data.shape chan_info = raw.info["chs"] # KIT: axial gradiometers (equiv to mag) ch_mag = np.where([ch["coil_type"] == 6001 for ch in chan_info])[0] # KIT: ref magnetometer ch_ref = np.where([ch["coil_type"] == 6002 for ch in chan_info])[0] # Other channels ch_misc = np.where([ch["coil_type"] not in [6001, 6002] for ch in chan_info])[0] # Bad channel ch_bad = np.empty(0) if (bad_channels is not None) and len(bad_channels): if np.all([isinstance(ch, int) for ch in bad_channels]): bad_channels = np.array(bad_channels) elif np.all([isinstance(ch, str) for ch in bad_channels]): bad_channels = [ii for ii, ch in enumerate(raw.ch_names) if ch in bad_channels] else: raise ValueError("bad_channels needs array of int or array of str") else: bad_channels = [] default_bad_channels = [ii for ii, ch in enumerate(raw.ch_names) if ch in raw.info["bads"]] bad_channels = np.array(default_bad_channels + bad_channels, int) print("bad channels:", [raw.ch_names[bad] for bad in bad_channels]) # To avoid memory error, let's subsample across time sel_times = slice(0, n_times, int(np.ceil(n_times // max_times_samples))) # Whiten data if scaler is None: from sklearn.preprocessing import RobustScaler scaler = RobustScaler() data_bsl = scaler.fit_transform(raw._data.T) # Fit Least Square coefficients on baseline data empty_sensors = data_bsl[:, ch_mag] if len(ch_bad): empty_sensors[:, ch_bad] = 0 # remove bad channels coefs = np.dot(pinv(data_bsl[sel_times, ch_ref]), empty_sensors[sel_times, :]) empty_sensors, data_bsl = None, None # clear memory # Apply correction on subject data if empty_room is not None: del raw raw = read_raw_kit(inst, preload=True) data_subject = scaler.transform(raw._data.T) subject_sensors = data_subject[:, ch_mag] - np.dot(data_subject[:, ch_ref], coefs) # Remove bad channels if len(ch_bad): subject_sensors[:, ch_bad] = 0 # Reproject baseline new_ref = np.dot(subject_sensors, pinv(coefs)) # Un-whiten data to get physical units back data = np.concatenate((subject_sensors, new_ref, 
raw._data[ch_misc, :].T), axis=1) data = scaler.inverse_transform(data) # Output raw._data = data.T return raw
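The core of the denoising above is an ordinary least-squares projection: regress every sensor channel on the reference channels and subtract the fitted part. A compact sketch of that step, where R and M are placeholder names for the (time x n_ref) reference data and (time x n_sensor) sensor data:

import numpy as np
from scipy.linalg import pinv

# coefs: least-squares fit of each sensor channel on the reference channels
coefs = pinv(R) @ M
# remove the part of the sensor data explained by the references
M_clean = M - R @ coefs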
# In[ ]:

scaler = RobustScaler()

# In[ ]:

n_train = train.shape[0]

X = data_pipe[:n_train]
test_X = data_pipe[n_train:]
y = train.SalePrice

X_scaled = scaler.fit(X).transform(X)
y_log = np.log(train.SalePrice)
test_X_scaled = scaler.transform(test_X)

# ## Feature Selection
# + __I have to confess the feature engineering above is not enough, so we need more.__
# + __Combining different features is usually a good way, but we have no idea which features to choose. Luckily, some models provide feature selection; here I use Lasso, but you are free to choose Ridge, RandomForest or GradientBoostingTree.__

# In[ ]:

lasso = Lasso(alpha=0.001)
lasso.fit(X_scaled, y_log)

# In[ ]:

FI_lasso = pd.DataFrame({"Feature Importance": lasso.coef_}, index=data_pipe.columns)
def scale_data_robust(self):
    scaler = RobustScaler().fit(self.X_train)
    self.X_train = scaler.transform(self.X_train)
    self.X_validation = scaler.transform(self.X_validation)
"Balanced Accuracy", "MSE", "r2", "spearmanr" ]) for split in np.arange(numsplits): print("Evaluating fold " + str(split)) train_index = kfolds["fold_" + str(split)]["train"] test_index = kfolds["fold_" + str(split)]["test"] X_train, X_test = features_nosurv.iloc[train_index], features_nosurv.iloc[ test_index] y_train, y_test = surv_days[train_index], surv_days[test_index] # scale target with a quantile transform qtfm = RobustScaler() y_train = np.squeeze(qtfm.fit_transform(y_train.values.reshape(-1, 1))) y_test = np.squeeze(qtfm.transform(y_test.values.reshape(-1, 1))) # y_train, y_test = surv_classes[train_index], surv_classes[test_index] # for every split, perform feature selection for sel_name, sel in zip(selectornames_short, selectors): print('#####') print(sel_name) print('#####') if sel_name is "CHSQ": # shift X values to be non-negative for chsq feature selection X_train_tmp = X_train + np.abs(X_train.min()) selscore = sel(X_train_tmp, y_train) selidx = np.argsort(selscore)[::-1] selidx = selidx[0:numfeat] selscore = selscore[selidx]
var_dums = pd.get_dummies(all_data["Variety"]) all_data = all_data.drop(columns="Variety") all_data = pd.concat([all_data, var_dums], axis=1) all_data = all_data.drop(columns="Site ID") all_data = all_data.dropna() all_data = all_data[all_data["Assessment Score"] != '*'] #split features and target Y = all_data["Assessment Score"] X = all_data.drop(columns="Assessment Score") #scale features from sklearn.preprocessing import RobustScaler transformer = RobustScaler().fit(X) X = transformer.transform(X) Y = np.array(Y) Y[Y == ''] = 0.0 Y = Y.astype(np.float) #make dense network model import neural_net NeuralNet = neural_net.NeuralNet #crop_score_model = NeuralNet(X, Y, 6, 256, "r", 20) #check accuracy from sklearn.metrics import mean_squared_error '''
# split x, y x = train.loc[:, 'rho':'990_dst'] test = test.loc[:, 'rho':'990_dst'] y = train.loc[:, 'hhb':'na'] # split train, test x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.9, random_state=0) # scalling scaler = RobustScaler() x_train = scaler.fit_transform(x_train) x_test = scaler.transform(x_test) test = scaler.transform(test) # search model parameters parameters = { 'n_estimators': [310, 350, 390], 'max_depth': [4, 5, 6], 'learning_rate': [0.06, 0.11], 'colsample_bytree': [0.6, 0.7, 0.8, 0.9], 'colsample_bylevel': [0.6, 0.7, 0.8] } # name_ls ( y columns == class 4 values) name_ls = ['hhb','hbo2','ca','na'] # final predict values (submit DataFrame) tmp_dic = dict() # xgb model feature importance
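One way to use the parameters dictionary above is to wrap an XGBoost regressor in GridSearchCV and fit it per target column; a rough sketch of that wiring (the cv folds and scoring choice are illustrative, and xgboost is assumed to be installed):

from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

for name in name_ls:
    search = GridSearchCV(XGBRegressor(objective='reg:squarederror'),
                          parameters, cv=3, scoring='neg_mean_absolute_error')
    search.fit(x_train, y_train[name])
    tmp_dic[name] = search.best_estimator_.predict(test)
    print(name, search.best_params_)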
# 使用Z-标准化 scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) regressor = KNeighborsRegressor() regressor.fit(X_train_scaled, Y_train) Y_est = regressor.predict(X_test_scaled) print("MAE=", mean_squared_error(Y_test, Y_est)) # In[13]: # 鲁棒性缩放 scaler2 = RobustScaler() X_train_scaled = scaler2.fit_transform(X_train) X_test_scaled = scaler2.transform(X_test) regressor = KNeighborsRegressor() regressor.fit(X_train_scaled, Y_train) Y_est = regressor.predict(X_test_scaled) print("MAE=", mean_squared_error(Y_test, Y_est)) # In[14]: # 对特定特征使用非线性修正 non_linear_feat = 5 X_train_new_feat = np.sqrt(X_train[:, non_linear_feat]) X_test_new_feat = np.sqrt(X_test[:, non_linear_feat]) X_train_new_feat.shape = (X_train_new_feat.shape[0], 1) X_train_extended = np.hstack([X_train, X_train_new_feat])
random_state=6) ca_x, ca_x_test, ca_y, ca_y_test = train_test_split(ca_x, ca_y, test_size=0.1, random_state=6) na_x, na_x_test, na_y, na_y_test = train_test_split(na_x, na_y, test_size=0.1, random_state=6) # scalling scaler = RobustScaler() # scaler = MinMaxScaler() hhb_x = scaler.fit_transform(hhb_x) hhb_x_test = scaler.transform(hhb_x_test) x_pred_hhb = scaler.transform(x_pred_hhb) hbo2_x = scaler.fit_transform(hbo2_x) hbo2_x_test = scaler.transform(hbo2_x_test) x_pred_hbo2 = scaler.transform(x_pred_hbo2) ca_x = scaler.fit_transform(ca_x) ca_x_test = scaler.transform(ca_x_test) x_pred_ca = scaler.transform(x_pred_ca) na_x = scaler.fit_transform(na_x) na_x_test = scaler.transform(na_x_test) x_pred_na = scaler.transform(x_pred_na) # modelling
df2_test = df2_test[df2_test['activity'] != 'r2.Dress'] #Separating the label from the data Y = df2['activity'] np.unique(Y) Y = label_encoder.fit_transform(Y) Y = Y.reshape(Y.shape[0], 1) np.unique(Y) df2.shape X = df2[df2.columns[:-1]] X.shape #Scaling the data transformer = RobustScaler().fit(X) X = transformer.transform(X) X ###validation split X, valX, Y, valY = train_test_split(X, Y, test_size=0.2, random_state=0) X.shape model = keras.Sequential([ keras.layers.Dense(1000, activation='relu', input_dim=32), keras.layers.Dense(800, activation='relu'), keras.layers.Dense(640, activation='relu'), keras.layers.Dense(580, activation='relu'), keras.layers.Dense(330, activation='relu'), keras.layers.Dense(250, activation='relu'), keras.layers.Dense(100, activation='relu'), keras.layers.Dense(42, activation='softmax') ])
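The labels were integer-encoded (not one-hot), so the model above would typically be compiled with sparse_categorical_crossentropy before training; a minimal sketch of that step (the epoch and batch settings are illustrative):

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit(X, Y, validation_data=(valX, valY),
                    epochs=20, batch_size=64)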
def main(args): out_file_name = "results.log" if args.classify: # Cast to list to keep it all in memory train = list(csv.reader(open(args.train_file, 'r'))) test = list(csv.reader(open(args.test_file, 'r'))) x_train = np.array(train[1:], dtype=float) x_test = np.array(test[1:], dtype=float) train_labels_file = open(args.train_labels) y_train = np.array([int(x.strip()) for x in train_labels_file.readlines()]) test_labels_file = open(args.test_labels) y_test = np.array([int(x.strip()) for x in test_labels_file.readlines()]) train_labels_file.close() test_labels_file.close() if args.sampling_technique: print "Attempting to use sampling technique: " + args.sampling_technique if args.sampling_ratio == float('NaN'): print "Unable to use sampling technique. Ratio is NaN." else: x_train, y_train = __get_sample_transformed_examples(args.sampling_technique, x_train, y_train, args.sampling_ratio) if args.scale: scaler = RobustScaler() x_train = scaler.fit_transform(x_train) x_test = scaler.fit_transform(x_test) for classifier in args.classifiers: model = __get_classifier_model(classifier, args) print "Using classifier " + classifier print "Fitting data to model" if args.grid_search: print "Applying parameter tuning to model" if classifier == LOG_REG: parameters = {'loss':('log','hinge'), 'penalty':('l2', 'l1'), 'shuffle':[True], 'n_iter':[5], 'n_jobs':[-1], 'random_state':[179]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == SVM: parameters = {'kernel':('rbf', 'poly'), 'cache_size':[8096], 'random_state':[17]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == ADA_BOOST: parameters = {'n_estimators':[300], 'random_state':[13]} model = grid_search.GridSearchCV(model, parameters, scoring=roc_auc_score, verbose=2) elif classifier == RF: parameters = {'criterion':('gini', 'entropy'), 'n_jobs':[-1], 'n_estimators':[300], 'random_state':[17]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == GRADIENT_BOOST: parameters = {'n_estimators':[300], 'random_state':[17]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == EXTRA_TREES: parameters = {'n_estimators':[300], 'random_state':[17], 'n_jobs':[-1], 'criterion':('gini', 'entropy'), 'max_features':('log2', 40, 0.4), 'max_features':[40, 0.4], 'bootstrap':[True, False], 'bootstrap_features':[True, False]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == BAGGING: parameters = {'n_estimators':[300], 'random_state':[17], 'max_samples': [.4, 30],'max_features':[40, 0.4], 'bootstrap':[True, False], 'bootstrap_features':[True, False], 'n_jobs':[-1]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) print "Best params: " + str(model.best_params_) clf = model.fit(x_train, y_train) print "Parameters used in model:" #print clf.get_params(deep=False) if args.select_best: # Unable to use BaggingClassifier with SelectFromModel if classifier != BAGGING: print "Selecting best features" sfm = SelectFromModel(clf, prefit=True) x_train = sfm.transform(x_train) x_test = sfm.transform(x_test) clf = model.fit(x_train, y_train) __print_and_log_results(clf, classifier, x_train, x_test, y_test, out_file_name, args) elif args.cross_validate: # Cast to list to keep it all in memory labels_file = open(args.labels) labels = np.array([int(x.strip()) for x in labels_file.readlines()]) labels_file.close() 
data_file = open(args.data_file, 'r') data = list(csv.reader(data_file)) data_file.close() examples = np.array(data[1:], dtype=float) X_train, X_test, y_train, y_test = cross_validation.train_test_split(examples, labels, test_size=0.1) if args.sampling_technique: print "Attempting to use sampling technique: " + args.sampling_technique if args.sampling_ratio == float('NaN'): print "Unable to use sampling technique. Ratio is NaN." else: X_train, y_train = __get_sample_transformed_examples(args.sampling_technique, X_train, y_train, args.sampling_ratio) if args.scale: scaler = StandardScaler().fit(X_train) X_train = scaler.transform(X_train) X_test = scaler.transform(X_test) for classifier in args.classifiers: print "Using classifier " + classifier model = __get_classifier_model(classifier, args) print "Fitting model" if args.grid_search: print "Applying parameter tuning to model" if classifier == LOG_REG: parameters = {'loss':('log','hinge'), 'penalty':('l2', 'l1'), 'shuffle':[True], 'n_iter':[5], 'n_jobs':[-1], 'random_state':[179]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == SVM: parameters = {'kernel':('rbf', 'poly'), 'cache_size':[8096], 'random_state':[17]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == ADA_BOOST: parameters = {'n_estimators':[300], 'random_state':[13]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == RF: parameters = {'criterion':('gini', 'entropy'), 'n_jobs':[-1], 'n_estimators':[300], 'random_state':[17]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == GRADIENT_BOOST: parameters = {'n_estimators':[300], 'random_state':[17]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == EXTRA_TREES: parameters = {'n_estimators':[300], 'random_state':[17], 'n_jobs':[-1], 'criterion':('gini', 'entropy'), 'max_features':('log2', 40, 0.4), 'max_features':[40, 0.4], 'bootstrap':[True, False], 'bootstrap_features':[True, False]} model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) elif classifier == BAGGING: #parameters = {'n_estimators' : [400], 'random_state' : [17], # 'max_samples' : np.arange(0.5, 0.9, 0.1), # 'max_features' : np.arange(0.5, 0.9, 0.1), # 'bootstrap':[False], 'bootstrap_features':[False], 'n_jobs':[-1]} parameters = {"base_estimator__criterion" : ["gini", "entropy"], "base_estimator__splitter" : ["best", "random"], "base_estimator__max_depth" : [10, 15, 20, 25], "base_estimator__class_weight" : ['balanced'], "base_estimator__max_features" : ['auto', 'log2'] } model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2) clf = model.fit(X_train, y_train) if args.grid_search: print "Best params: " + str(model.best_params_) if args.select_best: if classifier != BAGGING: print "Selecting best features" sfm = SelectFromModel(clf, prefit = True) X_train = sfm.transform(X_train) X_test = sfm.transform(X_test) clf = model.fit(X_train, y_train) print "Evaluating results" __print_and_log_results(clf, classifier, X_train, X_test, y_test, out_file_name, args) elif args.kfold: # Cast to list to keep it all in memory data_file = open(args.data_file, 'r') data = list(csv.reader(data_file)) data_file.close() labels_file = open(args.labels) labels = np.array([int(x.strip()) for x in labels_file.readlines()]) labels_file.close() X = np.array(data[1:], dtype=float) kf = 
KFold(len(X), n_folds=10, shuffle=True, random_state=42) for train, test in kf: print "kfold loop iterate" X_train, X_test, y_train, y_test = X[train], X[test], labels[train], labels[test] if args.sampling_technique: print "Attempting to use sampling technique: " + args.sampling_technique if args.sampling_ratio == float('NaN'): print "Unable to use sampling technique. Ratio is NaN." else: X_train, y_train = __get_sample_transformed_examples(args.sampling_technique, X_train, y_train, args.sampling_ratio) if args.scale: scaler = StandardScaler().fit(X_train) X_train = scaler.transform(X_train) X_test = scaler.transform(X_test) for classifier in args.classifiers: print "Using classifier " + classifier model = __get_classifier_model(classifier, args) print "Fitting model" clf = model.fit(X_train, y_train) if args.select_best: if classifier != BAGGING: sfm = SelectFromModel(clf, prefit = True) X_train = sfm.transform(X_train) X_test = sfm.transform(X_test) clf = model.fit(X_train, y_train) print "Evaluating results" __print_and_log_results(clf, classifier, X_train, X_test, y_test, out_file_name, args) print "kfold loop done"
def main(): protocol = 'modeller_fast' with open('descriptors-avg.json', 'r') as fp: alldata = json.load(fp) desc = set() for cpx, cpxdata in alldata.items(): desc |= set(cpxdata[protocol].keys()) desc = [ d for d in desc if not d.startswith('>') and d != 'NRES' and d != 'AGBNP' and d != 'GBMV_POL' and d != 'SOAP-Protein-OD' ] print('Number of descriptors:', len(desc)) cpxs = [c for c in alldata.keys() if c.startswith('FY')] data = np.zeros((len(cpxs), len(desc)), dtype=float) for i, d in enumerate(desc): for j, c in enumerate(cpxs): data[j][i] = alldata[c][protocol][d] scaler = RobustScaler().fit(data) X = scaler.transform(data) X = X.transpose() # Histograms numd = len(desc) ncols = 10 nrows = math.ceil(numd / ncols) plt.figure(figsize=(3 * ncols, 3 * nrows - 0.5)) plt.subplots_adjust(hspace=0.4, wspace=0.3) for n, d in enumerate(desc): plt.subplot(nrows, ncols, n + 1) plt.title(d) plt.hist(X[n], bins='auto') plt.savefig('fig/histograms.png', bbox_inches='tight', dpi=300) plt.clf() # Dendogram method = 'complete' # complete or average seem better Z = linkage(X, method=method, metric='correlation', optimal_ordering=True) fig = plt.figure(figsize=(6, 10)) dn = dendrogram(Z, orientation='right', labels=desc) plt.savefig('fig/dendogram-%s.png' % (method), bbox_inches='tight', dpi=300) plt.clf() # Reorder based on dendogram labels = list(reversed(dn['ivl'])) ndx = [desc.index(l) for l in labels] X = X[ndx, :] # Cross-correlation matrix size = len(desc) mtx = np.ones((size, size), dtype=float) for i in range(size): for j in range(i + 1, size): rp = stats.pearsonr(X[i], X[j])[0] rs = stats.spearmanr(X[i], X[j])[0] mtx[i][j] = rp mtx[j][i] = rs plot_crosscorr(mtx, labels, 'fig/crosscorr-%s.png' % (method)) # Explained_variance pca = sklearn.decomposition.PCA().fit(X.transpose()) nc = pca.n_components_ cumul = np.zeros(nc) for i in range(nc): cumul[i] = pca.explained_variance_ratio_[i] if (i > 0): cumul[i] += cumul[i - 1] cut = 25 plt.figure(figsize=(6.4, 4.8)) plt.plot(range(1, cut + 1), cumul[:cut], 'ro-', label='Cumulative explained variance') plt.bar(range(1, cut + 1), pca.explained_variance_ratio_[:cut], label='Explained variance ratio') plt.ylim(-0.05, 1.05) plt.xlabel('Number of components') plt.ylabel('Explained Variance') plt.xticks(range(1, cut + 1)) plt.legend() plt.savefig('fig/explained_variance.png', bbox_inches='tight', dpi=300) plt.clf()
X1 = np.random.multivariate_normal(mean=mu1, cov=Cov, size=n_datapoints) X2 = np.random.multivariate_normal(mean=mu2, cov=Cov, size=n_datapoints) Y_test = np.hstack([[-1]*n_datapoints, [1]*n_datapoints]) X_test = np.vstack([X1, X2]) X_train[0, 0] = -1000 # a fairly large outlier # Scale data standard_scaler = StandardScaler() Xtr_s = standard_scaler.fit_transform(X_train) Xte_s = standard_scaler.transform(X_test) robust_scaler = RobustScaler() Xtr_r = robust_scaler.fit_transform(X_train) Xte_r = robust_scaler.transform(X_test) # Plot data fig, ax = plt.subplots(1, 3, figsize=(12, 4)) ax[0].scatter(X_train[:, 0], X_train[:, 1], color=np.where(Y_train > 0, 'r', 'b')) ax[1].scatter(Xtr_s[:, 0], Xtr_s[:, 1], color=np.where(Y_train > 0, 'r', 'b')) ax[2].scatter(Xtr_r[:, 0], Xtr_r[:, 1], color=np.where(Y_train > 0, 'r', 'b')) ax[0].set_title("Unscaled data") ax[1].set_title("After standard scaling (zoomed in)") ax[2].set_title("After robust scaling (zoomed in)") # for the scaled data, we zoom in to the data center (outlier can't be seen!) for a in ax[1:]: a.set_xlim(-3, 3) a.set_ylim(-3, 3)
print('is_anomaly_test_count',total_rows) # Remove is_anomaly column from train and test data since semi-supervised learning del train['is_anomaly'] del test['is_anomaly'] print(train.shape, test.shape) #------------------------- # Step 5 Scaling of data #------------------------- from sklearn.preprocessing import RobustScaler scaler = RobustScaler() scaler = scaler.fit(train[['value']]) train['value'] = scaler.transform(train[['value']]) test['value'] = scaler.transform(test[['value']]) #----------------------------------- # Step 6 - Prepare Input for LSTM #----------------------------------- # We’ll split the data into sub-sequences - changing input to a shape as accepted by # lstm autoencoder def create_dataset(X, y, time_steps=1): Xs, ys = [], [] for i in range(len(X) - time_steps): v = X.iloc[i:(i + time_steps)].values Xs.append(v) ys.append(y.iloc[i + time_steps]) return np.array(Xs), np.array(ys)
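With the helper above, the scaled series is turned into overlapping windows of length time_steps, in the (samples, time_steps, features) shape an LSTM autoencoder expects; a short usage sketch (TIME_STEPS is an illustrative choice):

TIME_STEPS = 30

# X shape: (n_windows, TIME_STEPS, 1); y shape: (n_windows,)
X_train, y_train = create_dataset(train[['value']], train['value'], TIME_STEPS)
X_test, y_test = create_dataset(test[['value']], test['value'], TIME_STEPS)
print(X_train.shape, X_test.shape)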
devtest = './exp/ivectors_semeval_devtest_NGMM_2048_W_2_DIM_200/feats.txt'
dev = './exp/ivectors_semeval_dev_NGMM_2048_W_2_DIM_200/feats.txt'
train = './exp/ivectors_semeval_train_NGMM_2048_W_2_DIM_200/feats.txt'

trainy, trainx = imdb_bag_of_word_libs.loadFeatsText(train)
trainy = imdb_bag_of_word_libs.kaldiID_2_LB(trainy)

evaly, evalx = imdb_bag_of_word_libs.loadFeatsText(dev)
evaly = imdb_bag_of_word_libs.kaldiID_2_LB(evaly)

evaly2, evalx2 = imdb_bag_of_word_libs.loadFeatsText(devtest)
evaly2 = imdb_bag_of_word_libs.kaldiID_2_LB(evaly2)

robust_scaler = RobustScaler()
trainx = robust_scaler.fit_transform(trainx)
evalx = robust_scaler.transform(evalx)

clf = LinearDiscriminantAnalysis()
clf.fit(trainx, trainy)  # the classifier must be fitted before predicting
predictValue = clf.predict(evalx)
print semeval2016_libs.scoreSameOrder(predictValue, configure.SCORE_REF_DEV)

evalx2 = robust_scaler.transform(evalx2)
predictValue = clf.predict(evalx2)
print semeval2016_libs.scoreSameOrder(predictValue, configure.SCORE_REF_DEVTEST)
def gmm_analysis(self, X_train, X_test, y_train, y_test, data_set_name, max_clusters, analysis_name='GMM'): scl = RobustScaler() X_train_scl = scl.fit_transform(X_train) X_test_scl = scl.transform(X_test) em_bic = [] em_aic = [] em_completeness_score = [] em_homogeneity_score = [] em_measure_score = [] em_adjusted_rand_score = [] em_adjusted_mutual_info_score = [] cluster_range = np.arange(2, max_clusters+1, 1) for k in cluster_range: print('K Clusters: ', k) ## ## Expectation Maximization ## em = GaussianMixture(n_components=k, covariance_type='full') em.fit(X_train_scl) em_pred = em.predict(X_train_scl) em_bic.append(em.bic(X_train_scl)) em_aic.append(em.aic(X_train_scl)) # metrics y_train_score = y_train.reshape(y_train.shape[0],) em_homogeneity_score.append(homogeneity_score(y_train_score, em_pred)) em_completeness_score.append(completeness_score(y_train_score, em_pred)) em_measure_score.append(v_measure_score(y_train_score, em_pred)) em_adjusted_rand_score.append(adjusted_rand_score(y_train_score, em_pred)) em_adjusted_mutual_info_score.append(adjusted_mutual_info_score(y_train_score, em_pred)) ## ## Plots ## ph = plot_helper() ## ## BIC/AIC Plot ## title = 'Information Criterion Plot (' + analysis_name + ') for ' + data_set_name name = data_set_name.lower() + '_' + analysis_name.lower() + '_ic' filename = './' + self.out_dir + '/' + name + '.png' ph.plot_series(cluster_range, [em_bic, em_aic], [None, None], ['bic', 'aic'], cm.viridis(np.linspace(0, 1, 2)), ['o', '*'], title, 'Number of Clusters', 'Information Criterion', filename) ## ## Score Plot ## title = 'Score Summary Plot (' + analysis_name + ') for ' + data_set_name name = data_set_name.lower() + '_' + analysis_name.lower() + '_score' filename = './' + self.out_dir + '/' + name + '.png' ph.plot_series(cluster_range, [em_homogeneity_score, em_completeness_score, em_measure_score, em_adjusted_rand_score, em_adjusted_mutual_info_score], [None, None, None, None, None, None], ['homogeneity', 'completeness', 'measure', 'adjusted_rand', 'adjusted_mutual_info'], cm.viridis(np.linspace(0, 1, 5)), ['o', '^', 'v', '>', '<', '1'], title, 'Number of Clusters', 'Score', filename)
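A common way to read the information-criterion curves collected above is to pick the number of components that minimizes BIC (or AIC); a short sketch using the lists built in the loop:

best_k_bic = cluster_range[int(np.argmin(em_bic))]
best_k_aic = cluster_range[int(np.argmin(em_aic))]
print('Best k by BIC:', best_k_bic, '| by AIC:', best_k_aic)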
X_to_predict = X_to_predict.values Y_labels = Y_labels.values #Normalized #transformer = Normalizer().fit(X_train_test) #X_train_test = transformer.transform(X_train_test) #X_to_predict = transformer.transform(X_to_predict) #Scaling #transformer = PowerTransformer().fit(X_train_test) #X_train_test = transformer.transform(X_train_test) #X_to_predict = transformer.transform(X_to_predict) #Scaling transformer = RobustScaler().fit(X_train_test) X_train_test = transformer.transform(X_train_test) X_to_predict = transformer.transform(X_to_predict) #PCA #pca_model = PCA(n_components=10, svd_solver='full') #X_train_test = pca_model.fit_transform(X_train_test, Y_labels) #X_to_predict = pca_model.transform(X_to_predict) #Select best features #selecter = SelectKBest(chi2, k=6) #X_train_test = selecter.fit_transform(X_train_test, Y_labels) #X_to_predict = selecter.transform(X_to_predict) gettingDistributionOfDatas()
print(cv_results['test_score']) print("mean of CV scores:") print(mean(cv_results['test_score'])) print("cross_validation scores:", file=res) print(cv_results['test_score'], file=res) print("mean of CV scores:", file=res) print(mean(cv_results['test_score']), file=res) # TEST test_set = pd.read_csv("X_test.csv") x_test = test_set.drop('id', axis=1) # missing values x_test_filled = imputer.transform(x_test) x_test = pd.DataFrame(x_test_filled) # scaling x_test_scaled = scaler.transform(x_test) cols = list(x_test.columns.values) x_test = pd.DataFrame(data=x_test_scaled, columns=cols) # feature selection x_test = pd.DataFrame(data=x_test, columns=new_features) # prediction y_test = reg.predict(x_test) Id = test_set['id'] df = pd.DataFrame(Id) df.insert(1, "y", y_test) df.to_csv(('solution1.csv_' + str(n_estimators) + 'estimators_' + str(max_iter) + 'max_iter'), index=False)
test.drop('Id', axis=1, inplace=True)

x = train.drop('SalePrice', axis=1)  # drop the target feature from train
y = train['SalePrice']
test = test.drop('SalePrice', axis=1)

# Known outliers (some from the author's notes and some from notebook guides)
outliers = [30, 88, 462, 631, 1322]
x = x.drop(x.index[outliers])
y = y.drop(y.index[outliers])

x = x.drop('MSSubClass_150', axis=1)
test = test.drop('MSSubClass_150', axis=1)

# RobustScaler normalizes the data so it is more robust to outliers
sc = RobustScaler()
x = sc.fit_transform(x)
test = sc.transform(test)

# Train
model = Lasso(alpha=0.0005, random_state=1)  # other alphas were tried too
model.fit(x, y)

# Predict
pred = model.predict(test)
predFinal = np.exp(pred)  # revert the log transform on the target

# Data export (test2 holds the original test set, which still has its Id column)
output = pd.DataFrame({'Id': test2.Id, 'SalePrice': predFinal})
output.to_csv('submission.csv', index=False)
output.head()
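# Assumed earlier step (not shown in this excerpt): np.exp(pred) above only makes sense
# if the SalePrice column was log-transformed before this point, e.g. something like
#     train['SalePrice'] = np.log(train['SalePrice'])
# A plain np.log (rather than np.log1p) is what matches the plain np.exp inversion used here.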
def kmeans_analysis(self, X_train, X_test, y_train, y_test, data_set_name, max_clusters, analysis_name='K-Means'):
    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    km_inertias = []
    km_completeness_score = []
    km_homogeneity_score = []
    km_measure_score = []
    km_adjusted_rand_score = []
    km_adjusted_mutual_info_score = []

    cluster_range = np.arange(2, max_clusters+1, 1)
    for k in cluster_range:
        print('K Clusters: ', k)

        ##
        ## KMeans
        ##
        km = KMeans(n_clusters=k, algorithm='full', n_jobs=-1)
        km.fit(X_train_scl)

        # inertia is the sum of squared distances from each sample to its closest cluster center
        km_inertias.append(km.inertia_)

        # metrics against the known labels
        y_train_score = y_train.reshape(y_train.shape[0],)
        km_homogeneity_score.append(homogeneity_score(y_train_score, km.labels_))
        km_completeness_score.append(completeness_score(y_train_score, km.labels_))
        km_measure_score.append(v_measure_score(y_train_score, km.labels_))
        km_adjusted_rand_score.append(adjusted_rand_score(y_train_score, km.labels_))
        km_adjusted_mutual_info_score.append(adjusted_mutual_info_score(y_train_score, km.labels_))

        ##
        ## Silhouette Plot
        ##
        title = 'Silhouette Plot (' + analysis_name + ', k=' + str(k) + ') for ' + data_set_name
        name = data_set_name.lower() + '_' + analysis_name.lower() + '_silhouette_' + str(k)
        filename = './' + self.out_dir + '/' + name + '.png'

        self.silhouette_plot(X_train_scl, km.labels_, title, filename)

    ##
    ## Plots
    ##
    ph = plot_helper()

    ##
    ## Elbow Plot
    ##
    title = 'Elbow Plot (' + analysis_name + ') for ' + data_set_name
    name = data_set_name.lower() + '_' + analysis_name.lower() + '_elbow'
    filename = './' + self.out_dir + '/' + name + '.png'

    # line to help visualize the elbow
    lin = ph.extended_line_from_first_two_points(km_inertias, 0, 2)

    ph.plot_series(cluster_range,
                   [km_inertias, lin],
                   [None, None],
                   ['inertia', 'projected'],
                   cm.viridis(np.linspace(0, 1, 2)),
                   ['o', ''],
                   title,
                   'Number of Clusters',
                   'Inertia',
                   filename)

    ##
    ## Score Plot
    ##
    title = 'Score Summary Plot (' + analysis_name + ') for ' + data_set_name
    name = data_set_name.lower() + '_' + analysis_name.lower() + '_score'
    filename = './' + self.out_dir + '/' + name + '.png'

    ph.plot_series(cluster_range,
                   [km_homogeneity_score, km_completeness_score, km_measure_score,
                    km_adjusted_rand_score, km_adjusted_mutual_info_score],
                   [None, None, None, None, None],
                   ['homogeneity', 'completeness', 'measure', 'adjusted_rand', 'adjusted_mutual_info'],
                   cm.viridis(np.linspace(0, 1, 5)),
                   ['o', '^', 'v', '>', '<'],
                   title,
                   'Number of Clusters',
                   'Score',
                   filename)
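# Optional cross-check sketch (not part of the original method): the mean silhouette
# coefficient per k can also be computed directly with scikit-learn, which is a quick
# sanity check on the custom silhouette_plot helper. Variable names follow the method above.
from sklearn.metrics import silhouette_score

km = KMeans(n_clusters=3).fit(X_train_scl)
print('mean silhouette:', silhouette_score(X_train_scl, km.labels_))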
train.to_csv(path_or_buf=filepath + "/trainfinal.csv", index=False)
test.to_csv(path_or_buf=filepath + "/testfinal.csv", index=False)
print("Exported")

train = []
test = []

# Obtaining the columns required for training the model
train = pd.read_csv(filepath + "/trainfinal.csv")
test = pd.read_csv(filepath + "/testfinal.csv")
cols = [c for c in train.columns if c not in ['is_churn', 'msno']]

# Pre-processing the file with Robust Scaler
scaler = RobustScaler()
scaler.fit(train[cols])
train_x = scaler.transform(train[cols])
test_x = scaler.transform(test[cols])
train_y = train['is_churn']
print("Pre-processing completed")

# Training Random Forest Classifier
model = RandomForestClassifier(n_estimators=50)
model.fit(train_x, train_y)
print("Training Completed")

# Predicting the test data with the trained model
predictions = model.predict(test_x)

# Exporting the msno and predicted values to a csv file
submission = pd.DataFrame()
submission['msno'] = test['msno']
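# The excerpt stops after the msno column is attached; a likely continuation
# (the output filename is an assumption, not from the original) would be:
submission['is_churn'] = predictions
submission.to_csv(filepath + "/submission.csv", index=False)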
y = eeg_dataset[['class']].values.ravel()

# Split the data
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=.7, test_size=.3, random_state=25)

# Feature scaling
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
scaler.fit(x_train)  # fit on the training split produced above
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Model architecture
max_features = 512
model = Sequential()
model.add(Embedding(max_features, output_dim=64))
model.add(LSTM(64))
model.add(Dropout(0.8))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
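# Hypothetical training call; epochs and batch_size are assumptions, not from the
# original script. Note that an Embedding layer expects non-negative integer token
# indices, so the robust-scaled floats above would need to be discretised (or the
# Embedding layer removed and the LSTM fed 3-D sequences) before this trains meaningfully.
model.fit(x_train, y_train,
          epochs=10,
          batch_size=32,
          validation_data=(x_test, y_test))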
""" Method to generate box plot :param data: Pandas dataframe to be plotted """ assert data is not None data2 = pd.melt(data, id_vars='Label') sns.boxplot(x='variable', y='value', hue='Label', vert=False, data=data2, showfliers=False) plt.show() plt.savefig('Figures/Boxplot.png') if __name__ == "__main__": train_data, train_weights, train_labels, test_data, *ret = import_from_csv( path='Datasets', drop_labels=False) # subsample data to 10% frac_train_data = train_data.sample(frac=0.1) # Normalize data rs = RobustScaler() rs = rs.fit(train_data.iloc[:, :-1]) train_data.iloc[:, :-1] = rs.transform(train_data.iloc[:, :-1]) box_plot_data(data=train_data) print("plot complete")
             'i_SN_3', 'log_i_err_SN_3', 'z_SN_3', 'log_z_err_SN_3',
             'y_SN_3', 'log_y_err_SN_3']

feat_SN_4 = ['g_SN_4', 'log_g_err_SN_4', 'r_SN_4', 'log_r_err_SN_4',
             'i_SN_4', 'log_i_err_SN_4', 'z_SN_4', 'log_z_err_SN_4',
             'y_SN_4', 'log_y_err_SN_4']

feat_SN_5 = ['g_SN_5', 'log_g_err_SN_5', 'r_SN_5', 'log_r_err_SN_5',
             'i_SN_5', 'log_i_err_SN_5', 'z_SN_5', 'log_z_err_SN_5',
             'y_SN_5', 'log_y_err_SN_5']

### training features with robust scaler ###
X_train = RS.fit_transform(df_train[feat_train])

### validation features at different noise levels ###
X_valid_SN_1 = RS.transform(df_valid[feat_SN_1])
X_valid_SN_2 = RS.transform(df_valid[feat_SN_2])
X_valid_SN_3 = RS.transform(df_valid[feat_SN_3])
X_valid_SN_4 = RS.transform(df_valid[feat_SN_4])
X_valid_SN_5 = RS.transform(df_valid[feat_SN_5])

### The targets that we wish to learn ###
Y_train = df_train['redshift']
Y_valid = df_valid['redshift']

### Some scaling of the target between 0 and 1 so we can model it  ###
### with a Beta function; since the Beta function is not defined   ###
### at 0 or 1, I've come up with this ugly hack                    ###
max_train_Y = Y_train.max() + 0.00001
min_train_Y = Y_train.min() - 0.00001
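### Follow-up sketch (not in the original excerpt): with the padded min/max above, ###
### the scaled training target lands strictly inside (0, 1), which is what the    ###
### Beta model needs; validation targets outside the training range can still     ###
### fall outside that interval.                                                    ###
Y_train_scaled = (Y_train - min_train_Y) / (max_train_Y - min_train_Y)
Y_valid_scaled = (Y_valid - min_train_Y) / (max_train_Y - min_train_Y)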
# --------------
# Scaling features to lie between a given minimum and maximum value, often between 0 and 1
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

print("\nMinMaxScaler:"
      "\n============="
      "\nX_train:", X_train)
print('\nX_test:', X_test)

# --------------
# ROBUSTSCALER |
# --------------
# This removes the median and scales the data according to the quantile range
robust_scaler = RobustScaler()
X_train = robust_scaler.fit_transform(X_train)
X_test = robust_scaler.transform(X_test)

print("\nRobustScaler:"
      "\n============="
      "\nX_train:", X_train)
print('\nX_test:', X_test)

# --------------
# NORMALIZER   |
# --------------
# Normalize samples individually to unit norm.
# Each sample (each row of the data matrix) with at least one non-zero component is rescaled
# independently of the other samples so that its norm (l1 or l2) equals 1
normalizer_scaler = Normalizer()
X_train = normalizer_scaler.fit_transform(X_train)
X_test = normalizer_scaler.transform(X_test)

print("\nNormalizer:"
      "\n==========="
      "\nX_train:", X_train)
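# Small illustration (not part of the original demo) of what RobustScaler does:
# subtract the column median and divide by the IQR (75th minus 25th percentile).
import numpy as np
from sklearn.preprocessing import RobustScaler

toy = np.array([[1.0], [2.0], [3.0], [4.0], [100.0]])   # 100.0 is an outlier
print(RobustScaler().fit_transform(toy).ravel())
# The median (3.0) and IQR (2.0) are unaffected by the outlier, so the first four
# values map to [-1.0, -0.5, 0.0, 0.5] while the outlier maps to 48.5, instead of
# the outlier squeezing every other value toward 0 the way MinMaxScaler would.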
print('done in', time.time() - ts, len(x), len(y))
y = imdb_bag_of_word_libs.kaldiID_2_LB(y)
print(y[0], x[0])

x = np.array(x)
y = np.array(y)
trainx, trainy = x, y

robust_scaler = RobustScaler()
trainx = robust_scaler.fit_transform(trainx)
evalx = robust_scaler.transform(testx)

clf = LinearDiscriminantAnalysis()
clf.fit(trainx, trainy)
predictValue = clf.predict(evalx)

sdict = dict()
ptrue = list()
for id, score in zip(testy, predictValue):
    sdict[id] = score
    # print(id, score)
    truevalue = int(id.split('_')[2])
    if truevalue >= 5:
        ptrue.append('1')
    else:
        ptrue.append('0')
from sklearn.preprocessing import RobustScaler

f_columns = ['2_prev', '3_prev', '4_prev', '5_prev', '6_prev', '7_prev',
             '8_prev', '9_prev', '10_prev', '11_prev', '12_prev',
             'MONTH', 'HOUR', 'WEEKDAY', 'WEEKEND',
             'Demand Forecast', 'SPOT Market Volume', 'Wind Forecast',
             'RoR Forecast', 'Yuk Tahmin Planı (MWh)', 'Market Clearing Price']

f_transformer = RobustScaler()
cnt_transformer = RobustScaler()

f_transformer = f_transformer.fit(train[f_columns].to_numpy())
cnt_transformer = cnt_transformer.fit(train[['NetOrder']])

train.loc[:, f_columns] = f_transformer.transform(train[f_columns].to_numpy())
train['NetOrder'] = cnt_transformer.transform(train[['NetOrder']])

test.loc[:, f_columns] = f_transformer.transform(test[f_columns].to_numpy())
test['NetOrder'] = cnt_transformer.transform(test[['NetOrder']])


def create_dataset(X, y, time_steps=1):
    # Slide a window of length time_steps over X and predict the value of y
    # that immediately follows each window.
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        v = X.iloc[i:(i + time_steps)].values
        Xs.append(v)
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs).astype(np.float32), np.array(ys).astype(np.float32)
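# Hypothetical usage sketch of the windowing helper above; TIME_STEPS is an assumed
# choice, not taken from the original script.
TIME_STEPS = 24

X_train_seq, y_train_seq = create_dataset(train[f_columns], train['NetOrder'], time_steps=TIME_STEPS)
X_test_seq, y_test_seq = create_dataset(test[f_columns], test['NetOrder'], time_steps=TIME_STEPS)

# Shapes: (n_samples, TIME_STEPS, n_features) and (n_samples,)
print(X_train_seq.shape, y_train_seq.shape)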
def get_evoked_feats(f_list, stim_chan, sig_chan, pre_win=1., post_win=1.5, thresh=3, t_thresh=0.1):
    all_evoked_burst = None
    IBI = []
    all_evoked_onset = []
    all_prev_onset = []
    stim_lockout_s = 1.

    for f in f_list:
        dat = pyabf.ABF(f)
        stim_id = abf.get_channel_id_by_label(dat, stim_chan)
        sig_id = abf.get_channel_id_by_label(dat, sig_chan)
        sr = dat.dataRate

        # fit the scaler on the concatenated sweeps so every sweep shares the same scaling
        scl = RobustScaler()
        Y_cat = cat_sweeps(dat, sig_chan).T.ravel()
        scl.fit(Y_cat[:, np.newaxis])

        for ii in range(dat.sweepCount):
            dat.setSweep(ii, stim_id)
            stim_samp = rlab_signal.binary_onsets(dat.sweepY, 4.)[0]
            dat.setSweep(ii, sig_id)

            # if sr == 10000:
            #     print('Downsampling')
            #     y = dat.sweepY
            #     y = scipy.signal.decimate(y, 10)
            #     sr = sr / 10
            # else:
            #     y = dat.sweepY
            y = dat.sweepY

            stim_lockout = int(stim_lockout_s * sr)
            yscl = scl.transform(y[:, np.newaxis]).ravel()
            yscl_NN = yscl - np.min(yscl)

            onsets, offsets = burst.detect_burst(yscl, sr, thresh=thresh, t_thresh=t_thresh)
            # onsets, offsets = burst.rm_endpoint_bursts(yscl, onsets, offsets, pre_win * sr, post_win * sr)

            # Get the threshold crossing time of the bursts that happened within a time window
            # of the evoked; used to get the evoked burst shape.
            try:
                evoked_onset_idx = np.where(onsets > (stim_samp - int(pre_win / 9. * sr)))[0][0]
                next_onset_idx = evoked_onset_idx + 1
                prev_onset_idx = evoked_onset_idx - 1
                evoked_onset = onsets[evoked_onset_idx]
            except IndexError:
                # no burst onset found after the stimulus in this sweep
                IBI.append(np.nan)
                all_prev_onset.append(np.nan)
                all_evoked_onset.append(np.nan)
                evoked_burst = np.empty([int(pre_win * sr + post_win * sr), 1]) * np.nan
                if all_evoked_burst is None:
                    all_evoked_burst = evoked_burst
                else:
                    all_evoked_burst = np.concatenate([all_evoked_burst, evoked_burst], axis=1)
                continue

            # evoked_burst = np.empty([int(pre_win * sr + post_win * sr), 1]) * np.nan
            if next_onset_idx > len(onsets) - 1:
                next_onset = np.nan
            else:
                next_onset = onsets[next_onset_idx]

            if prev_onset_idx < 0:
                prev_onset = np.nan
            else:
                prev_onset = onsets[prev_onset_idx]

            # Get the threshold crossing of the second burst after stim (good for IBI)
            if evoked_onset < int(stim_samp + stim_lockout):
                evoked_burst = burst.get_aligned_bursts(yscl_NN, [evoked_onset],
                                                        int(pre_win * sr), int(post_win * sr))
                IBI.append(next_onset - evoked_onset)
                all_evoked_onset.append(evoked_onset)
                all_prev_onset.append(prev_onset)
            else:
                IBI.append(np.nan)
                all_prev_onset.append(np.nan)
                all_evoked_onset.append(np.nan)
                evoked_burst = np.empty([int(pre_win * sr + post_win * sr), 1]) * np.nan

            if all_evoked_burst is None:
                all_evoked_burst = evoked_burst
            else:
                all_evoked_burst = np.concatenate([all_evoked_burst, evoked_burst], axis=1)

    # convert sample indices to seconds
    evoked_onset = np.array(all_evoked_onset) / sr
    prev_onset = np.array(all_prev_onset) / sr
    IBI = np.array(IBI) / sr
    return (all_evoked_burst, evoked_onset, prev_onset, IBI)
class Learned(Model):

    def __init__(self, *args, scale=False, center=False, **kwargs):
        """
        A machine learned model.  Beyond :class:`revscoring.Model`, "Learned"
        models implement
        :func:`~revscoring.scoring.models.Learned.fit` and
        :func:`~revscoring.scoring.models.Learned.cross_validate`.
        """
        super().__init__(*args, **kwargs)
        self.trained = None

        if scale or center:
            self.scaler = RobustScaler(with_centering=center,
                                       with_scaling=scale)
        else:
            self.scaler = None

        self.params.update({
            'scale': scale,
            'center': center
        })

    def train(self, values_labels):
        """
        Fits the model using labeled data by learning its shape.

        :Parameters:
            values_labels : [( `<feature_values>`, `<label>` )]
                an iterable of labeled data where <feature_values> is an
                ordered collection of predictive values that correspond to the
                :class:`revscoring.Feature` s provided to the constructor
        """
        raise NotImplementedError()

    def fit_scaler_and_transform(self, fv_vectors):
        """
        Fits the internal scaler to the feature vectors and transforms them.

        :Parameters:
            fv_vectors : `iterable` ( `<feature_values>` )
                an iterable of feature value vectors that correspond to the
                `Feature` s provided to the constructor

        :Returns:
            The scaled feature vectors, or the input unchanged if no scaler
            is configured.
        """
        if self.scaler is not None:
            return self.scaler.fit_transform(fv_vectors)
        else:
            return fv_vectors

    def apply_scaling(self, fv_vector):
        if self.scaler is not None:
            if not hasattr(self.scaler, "center_") and \
               not hasattr(self.scaler, "scale_"):
                raise RuntimeError("Cannot scale a vector before " +
                                   "training the scaler")
            fv_vector = self.scaler.transform([fv_vector])[0]

        return fv_vector

    def _clean_copy(self):
        raise NotImplementedError()

    def cross_validate(self, values_labels, folds=10, processes=1):
        """
        Trains and tests the model against folds of labeled data.

        :Parameters:
            values_labels : [( `<feature_values>`, `<label>` )]
                an iterable of labeled data where <feature_values> is an
                ordered collection of predictive values that correspond to the
                `Feature` s provided to the constructor
            folds : `int`
                the number of folds to split the data into
            processes : `int`
                When set to 1, cross-validation will run in the parent thread.
                When set to 2 or greater, a :class:`multiprocessing.Pool` will
                be created.
        """
        folds_i = KFold(n_splits=folds, shuffle=True, random_state=0)
        if processes == 1:
            mapper = map
        else:
            pool = Pool(processes=processes or cpu_count())
            mapper = pool.map
        results = mapper(self._cross_score,
                         ((i, [values_labels[i] for i in train_i],
                           [values_labels[i] for i in test_i])
                          for i, (train_i, test_i) in enumerate(
                              folds_i.split(values_labels))))
        agg_score_labels = []
        for score_labels in results:
            agg_score_labels.extend(score_labels)

        self.info['statistics'].fit(agg_score_labels)

        return self.info['statistics']

    def _cross_score(self, i_train_test):
        i, train_set, test_set = i_train_test
        logger.info("Performing cross-validation {0}...".format(i + 1))
        model = self._clean_copy()
        logger.debug("Training cross-validation for {0}...".format(i + 1))
        model.train(train_set)
        logger.debug("Scoring cross-validation for {0}...".format(i + 1))
        feature_values, labels = map(list, zip(*test_set))
        docs = model.score_many(feature_values)
        return list(zip(docs, labels))
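# Hypothetical usage sketch of the scaling hooks above; SomeLearnedSubclass, features
# and feature_vectors are assumed names, not part of the revscoring source. With
# scale=True and center=True the internal RobustScaler subtracts the median and
# divides by the IQR; with only one flag set, the other operation is skipped.
model = SomeLearnedSubclass(features, scale=True, center=True)
scaled_vectors = model.fit_scaler_and_transform(feature_vectors)  # fit the scaler on training vectors
scaled_one = model.apply_scaling(feature_vectors[0])              # scale a single vector at score time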