def loadData(path="../data/", k=5, log='add', pca_n=0, SEED=34):
    from pandas import DataFrame, read_csv
    from numpy import log as ln
    from sklearn.model_selection import KFold  # sklearn.cross_validation was removed in modern scikit-learn
    from sklearn.preprocessing import LabelEncoder
    from sklearn.preprocessing import StandardScaler

    train = read_csv(path + "train.csv")
    test = read_csv(path + "test.csv")
    id = test.id
    target = train.target

    encoder = LabelEncoder()
    target_nnet = encoder.fit_transform(target).astype('int32')

    feat_names = [x for x in train.columns if x.startswith('feat')]
    train = train[feat_names].astype(float)
    test = test[feat_names]

    if log == 'add':
        for v in train.columns:
            train[v + '_log'] = ln(train[v] + 1)
            test[v + '_log'] = ln(test[v] + 1)
    elif log == 'replace':
        for v in train.columns:
            train[v] = ln(train[v] + 1)
            test[v] = ln(test[v] + 1)

    if pca_n > 0:
        from sklearn.decomposition import PCA
        pca = PCA(pca_n)
        train = pca.fit_transform(train)
        test = pca.transform(test)

    scaler = StandardScaler()
    scaler.fit(train)
    train = DataFrame(scaler.transform(train),
                      columns=['feat_' + str(x) for x in range(train.shape[1])])
    test = DataFrame(scaler.transform(test),
                     columns=['feat_' + str(x) for x in range(train.shape[1])])

    # The old KFold(n, n_folds=k) signature is gone; n_splits + split() yields the same folds
    cv = list(KFold(n_splits=k, shuffle=True, random_state=SEED).split(train))

    return train, test, target, target_nnet, id, cv, encoder
def lr_with_scale2():
    """
    Submission: lr_with_scale2_0704_03.csv
    E_val:
    E_in: 0.878996
    E_out: 0.8768131004917349
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    clf = LogisticRegressionCV(Cs=50, cv=5, scoring='roc_auc', n_jobs=-1,
                               class_weight='balanced')  # 'auto' is deprecated
    clf.fit(X_scaled, y)
    logger.debug('Best C: %f', clf.C_[0])
    logger.debug('Cs: %s', clf.Cs_)
    logger.debug('Grid scores: %s', clf.scores_)  # scores_ is a dict, not a float
    logger.debug('Ein: %f', Util.auc_score(clf, X_scaled, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler), ('lr', clf)]),
                       'lr_with_scale2_0704_03')
class FeaturePreProcesser():
    def __init__(self):
        pass

    def fit(self, X):
        # SimpleImputer (from sklearn.impute) replaces the removed sklearn.preprocessing.Imputer
        self.imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        self.imputer.fit(X)
        X = self.imputer.transform(X)

        self.std_scaler = StandardScaler()
        self.std_scaler.fit(X)

    def fit_transform(self, X):
        # fit() already fits both steps, so reuse it instead of duplicating the logic
        self.fit(X)
        return self.transform(X)

    def transform(self, X):
        X = self.imputer.transform(X)
        X = self.std_scaler.transform(X)
        return X
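# A minimal usage sketch for the preprocessor above (not part of the original source).
# The arrays below are hypothetical, and SimpleImputer/StandardScaler are assumed to be
# imported at module level as in the class definition.
import numpy as np

X_train_demo = np.array([[1.0, np.nan], [2.0, 3.0], [4.0, 5.0]])
X_test_demo = np.array([[np.nan, 2.0]])

fpp = FeaturePreProcesser()
X_train_ready = fpp.fit_transform(X_train_demo)   # impute + scale, fitted on the training data only
X_test_ready = fpp.transform(X_test_demo)         # reuse the train-fitted imputer and scaler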
class Regressor(BaseEstimator): def __init__(self): self.clf = Pipeline([ ("RF", RandomForestRegressor(n_estimators=200, max_depth=15, n_jobs=N_JOBS))]) self.scaler = StandardScaler() self.agglo = FeatureAgglomeration(n_clusters=500) def fit(self, X, y): y = y.ravel() n_samples, n_lags, n_lats, n_lons = X.shape self.scaler.fit(X[:, -1].reshape(n_samples, -1)) X = X.reshape(n_lags * n_samples, -1) connectivity = grid_to_graph(n_lats, n_lons) self.agglo.connectivity = connectivity X = self.scaler.transform(X) X = self.agglo.fit_transform(X) X = X.reshape(n_samples, -1) self.clf.fit(X, y) def predict(self, X): n_samples, n_lags, n_lats, n_lons = X.shape X = X.reshape(n_lags * n_samples, -1) X = self.scaler.transform(X) X = self.agglo.transform(X) X = X.reshape(n_samples, -1) return self.clf.predict(X)
def rf2():
    """
    Submission: rf2_0704_04.csv
    3000 trees
    E_val: 0.871431
    E_in: 0.999998
    E_out:
    30000 trees
    E_val:
    E_in:
    E_out:
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import RandomForestClassifier

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rf = RandomForestClassifier(n_estimators=30000, oob_score=True, n_jobs=-1,
                                class_weight='balanced',  # 'auto' is deprecated
                                max_features='log2')
    rf.fit(X_scaled, y)

    logger.debug('Eval(oob): %f', rf.oob_score_)
    logger.debug('Ein: %f', Util.auc_score(rf, X_scaled, y))

    IO.cache(rf, Path.of_cache('rf.RandomForestClassifier.log2.pkl'))
    IO.dump_submission(Pipeline([('scale_raw', raw_scaler), ('rf', rf)]),
                       'rf2_0704_04')
def knn(x_train, y_train, x_valid): x_train=np.log(x_train+1) x_valid=np.log(x_valid+1) where_are_nan = np.isnan(x_train) where_are_inf = np.isinf(x_train) x_train[where_are_nan] = 0 x_train[where_are_inf] = 0 where_are_nan = np.isnan(x_valid) where_are_inf = np.isinf(x_valid) x_valid[where_are_nan] = 0 x_valid[where_are_inf] = 0 scale=StandardScaler() scale.fit(x_train) x_train=scale.transform(x_train) x_valid=scale.transform(x_valid) #pca = PCA(n_components=10) #pca.fit(x_train) #x_train = pca.transform(x_train) #x_valid = pca.transform(x_valid) kneighbors=KNeighborsClassifier(n_neighbors=200,n_jobs=-1) knn_train, knn_test = stacking(kneighbors, x_train, y_train, x_valid, "knn") return knn_train, knn_test, "knn"
def test_scaler_1d():
    """Test scaling of dataset along single axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X_scaled = scale(X)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
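# Note: the test above relies on an older scikit-learn that accepted 1D input to
# StandardScaler.fit. Current releases require a 2D array, so the equivalent check
# today needs an explicit reshape. A minimal sketch:
import numpy as np
from sklearn.preprocessing import StandardScaler

x = np.random.RandomState(0).randn(5)
x_2d = x.reshape(-1, 1)                               # one feature, five samples
x_scaled = StandardScaler().fit_transform(x_2d).ravel()
assert abs(x_scaled.mean()) < 1e-12 and abs(x_scaled.std() - 1.0) < 1e-12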
def process(discrete, cont):
    # Create discrete and continuous data matrices
    discrete_X = np.array(discrete)
    cont_X = np.array(cont)

    # Impute discrete values (SimpleImputer replaces the removed Imputer class)
    imp = SimpleImputer(strategy='most_frequent')
    discrete_X = imp.fit_transform(discrete_X)

    # Impute continuous values
    imp_c = SimpleImputer(strategy='mean')
    cont_X = imp_c.fit_transform(cont_X)

    # Discrete basis representation
    enc = OneHotEncoder()
    enc.fit(discrete_X)
    discrete_X = enc.transform(discrete_X).toarray()

    # Continuous scaling
    scaler = StandardScaler()
    scaler.fit(cont_X)
    cont_X = scaler.transform(cont_X)

    # Merge to one array
    X = np.concatenate((discrete_X, cont_X), axis=1)
    return X
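# A minimal sketch of calling process() above on toy data (the values are hypothetical):
# discrete columns are mode-imputed and one-hot encoded, continuous columns are
# mean-imputed and standardized, then the two blocks are concatenated column-wise.
import numpy as np

discrete_demo = [[0, 1], [1, np.nan], [0, 2]]
cont_demo = [[1.5], [np.nan], [3.0]]
X_demo = process(discrete_demo, cont_demo)
print(X_demo.shape)   # (3, number of one-hot columns + 1 continuous column)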
def load_data_csv(datafile): """ Loads data from given CSV file. The first line in the given CSV file is expected to be the names of the columns. :param datafile: path of the file :return: a NumPy array containing a data point in each row """ # File format for CSV file. For example, setting _X_COLUMN to 'x' means that x coordinates of geographical location # will be at the column named 'x' in the CSV file. # This will be useful later when we start adding more features. _COLUMN_X = 'x' _COLUMN_Y = 'y' _COLUMN_W = 'color' data = pd.read_csv(datafile) # Normalize scaler = StandardScaler() scaler.fit(data[[_COLUMN_X, _COLUMN_Y]]) data[[_COLUMN_X, _COLUMN_Y]] = scaler.transform(data[[_COLUMN_X, _COLUMN_Y]]) data_coords = data[[_COLUMN_X, _COLUMN_Y]].values data_words = [[e] for e in data[[_COLUMN_W]].values.flatten().tolist()] data = {"coordinates": data_coords, "words": data_words} return sparsify_data(data, None, None), scaler # None for both params since SVD is not used
def prepare_data(): # prepare data from sklearn import datasets iris = datasets.load_iris() X = iris.data[:, [2, 3]] y = iris.target print('Class labels:', np.unique(y)) print(X.shape, y.shape) # split train and test from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y) print(X_train.shape, X_test.shape) print('Labels counts in y:', np.bincount(y)) print('Labels counts in y_train:', np.bincount(y_train)) print('Labels counts in y_test:', np.bincount(y_test)) # scaler from sklearn.preprocessing import StandardScaler sc = StandardScaler() sc.fit(X_train) # mean + sd of train data X_train_std = sc.transform(X_train) X_test_std = sc.transform(X_test) return X_train_std, X_test_std, y_train, y_test
def load_data_csv_advanced(datafile): """ Loads data from given CSV file. The first line in the given CSV file is expected to be the names of the columns. :param datafile: path of the file :return: a NumPy array containing a data point in each row """ # File format for CSV file. For example, setting _X_COLUMN to 'x' means that x coordinates of geographical location # will be at the column named 'x' in the CSV file. _COLUMN_X = 'x' _COLUMN_Y = 'y' data = pd.read_csv(datafile) # Normalize scaler = StandardScaler() scaler.fit(data[[_COLUMN_X, _COLUMN_Y]]) data[[_COLUMN_X, _COLUMN_Y]] = scaler.transform(data[[_COLUMN_X, _COLUMN_Y]]) # Get feature vector names by removing "x" and "y" feature_vector_names = data.columns.difference([_COLUMN_X, _COLUMN_Y]) data_coords = data[[_COLUMN_X, _COLUMN_Y]].values result = {"coordinates": data_coords} for feature in feature_vector_names: data_words = [[e.strip() for e in venue_data.split(",")] for venue_data in data[feature].values.flatten().tolist()] result[feature] = data_words return sparsify_data(result, None, None), scaler # None for both params since SVD is not used
class GPR(object):
    def __init__(self, X, y, kernel=None):
        self.X = X
        self.y = y
        self._noise_variance = 0.00001
        self._kernel = kernel
        # Center y only (no scaling). StandardScaler expects a 2D column vector,
        # so reshape in case y is passed in as a 1D array.
        self._scaler = StandardScaler(with_std=False)
        y_2d = np.asarray(self.y).reshape(-1, 1)
        self._scaler.fit(y_2d)
        self.y = self._scaler.transform(y_2d)
        assert self._kernel is not None

    @property
    def noise_variance(self):
        return self._noise_variance

    @noise_variance.setter
    def noise_variance(self, value):
        self._noise_variance = value

    def predict(self, X_test):
        assert isinstance(self._kernel, Kern)
        K = self._kernel.K(self.X)
        K_star = self._kernel.K(self.X, X_test)
        K_star_star = self._kernel.K(X_test)

        L = np.linalg.cholesky(K + self._noise_variance * np.eye(len(K)))
        Lk = np.linalg.solve(L, K_star)
        mu = np.dot(Lk.T, np.linalg.solve(L, self.y))
        s2 = np.diag(K_star_star) - np.sum(Lk ** 2, axis=0) + self._noise_variance
        # add the training mean back, since y was centered in __init__
        return mu + self._scaler.mean_, s2
def get_features_and_labels(frame):
    '''
    Transforms and scales the input data and returns numpy arrays for
    training and testing inputs and targets.
    '''
    # Convert values to floats (np.float was removed from NumPy; use np.float64)
    arr = np.array(frame, dtype=np.float64)

    # Normalize the entire data set
    from sklearn.preprocessing import StandardScaler, MinMaxScaler
    arr = MinMaxScaler().fit_transform(arr)

    # Use the last column as the target value
    X, y = arr[:, :-1], arr[:, -1]

    # Use 50% of the data for training, but we will test against the
    # entire set. (The original left this split as a placeholder; the two
    # lines below follow that comment.)
    from sklearn.model_selection import train_test_split
    X_train, _, y_train, _ = train_test_split(X, y, train_size=0.5)
    X_test, y_test = X, y

    # Normalize the attribute values to mean=0 and variance=1
    scaler = StandardScaler()

    # Fit the scaler based on the training data, then apply the same
    # scaling to both training and test sets.
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Return the training and test sets
    return X_train, X_test, y_train, y_test
def dbscan_outliers(df):
    """
    Find outliers (noise points) using DBSCAN.

    Parameters
    ----------
    df: A pandas.DataFrame

    Returns
    -------
    A tuple of (a sklearn.DBSCAN instance, a pandas.DataFrame)
    """
    scaler = StandardScaler()
    scaler.fit(df)
    scaled = scaler.transform(df)

    dbs = DBSCAN()
    db = dbs.fit(scaled)  # a single fit is enough; fit_predict would refit the model

    # DBSCAN labels noise points as -1; np.nonzero would also pick up ordinary
    # clusters with non-zero labels, which is not what the docstring promises.
    outliers = db.labels_ == -1
    df_o = df.loc[outliers]  # DataFrame.ix was removed; use .loc with a boolean mask

    return db, df_o
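# A minimal usage sketch for dbscan_outliers() above on a small synthetic DataFrame
# (the column names and sizes are hypothetical):
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
demo = pd.DataFrame({'x': rng.randn(100), 'y': rng.randn(100)})
db_model, noise_points = dbscan_outliers(demo)
print(len(noise_points), 'points labelled as noise')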
def sgd(X, y, weight, X_test=False):
    from sklearn.linear_model import SGDRegressor
    from sklearn.metrics import confusion_matrix
    from sklearn.preprocessing import StandardScaler

    #X_train, X_test, y_train, y_test, weight_train, weight_test = train_test_split(
    #    X, y, weight, test_size=0.2, random_state=0)

    clf = SGDRegressor(loss="huber", max_iter=100, penalty="l1")  # n_iter was renamed to max_iter
    #clf = LogisticRegression(max_iter=100)

    X_train = X
    y_train = y

    scaler = StandardScaler(with_mean=False)
    scaler.fit(X_train)  # Don't cheat - fit only on training data
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)  # apply same transformation to test data

    clf.fit(X_train, y_train, sample_weight=weight)
    print(clf.score(X_train, y_train, weight))
    y_pred = clf.predict(X_test)

    import joblib  # sklearn.externals.joblib was removed; use the standalone joblib package
    import scipy.io as sio
    joblib.dump(clf, 'models/sgd_.pkl')
    sio.savemat('predict_y_forward.mat', {'y': y_pred})
def sgc_test(X, y, weight):
    from sklearn.linear_model import SGDClassifier
    from sklearn.model_selection import train_test_split  # cross_validation module was removed
    from sklearn.metrics import confusion_matrix
    from sklearn.preprocessing import StandardScaler

    for i in range(0, 1):
        X_train, X_test, y_train, y_test, weight_train, weight_test = train_test_split(
            X, y, weight, test_size=0.2, random_state=0)

        clf = SGDClassifier(loss="hinge", max_iter=100, n_jobs=-1, penalty="l2")  # n_iter -> max_iter
        #clf = LogisticRegression(max_iter=100)

        scaler = StandardScaler(with_mean=False)
        scaler.fit(X_train)  # Don't cheat - fit only on training data
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)  # apply same transformation to test data

        clf.fit(X_train, y_train, sample_weight=weight_train)

        y_pred = clf.predict(X_train)
        #print(confusion_matrix(y_train, y_pred))
        print(clf.score(X_train, y_train, weight_train))

        y_pred = clf.predict(X_test)
        #print(confusion_matrix(y_test, y_pred))
        print(clf.score(X_test, y_test, weight_test))
def standardize(x_data):
    print('Started standardizing the data')
    sc = StandardScaler()
    sc.fit(x_data)
    x_std = sc.transform(x_data)
    print('Finished standardizing the data')
    return x_std
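# A hedged alternative sketch (not from the original source): the same standardization
# can be bundled into a Pipeline, so the scaler is always refit on the training fold
# only, for example inside cross-validation. The estimator choice below is arbitrary.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
# pipe.fit(X_train, y_train); pipe.predict(X_test)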
def standard_scaler(self, rates):
    import fx2.fx_config as fxconf

    config = fxconf.FxConfig()
    SAVE_FLAG = config.get_scale_save_flag()
    SAVE_FILENAME = config.get_scale_save_filename()

    if SAVE_FLAG:
        fin = open(SAVE_FILENAME, "w")

    for i in range(len(rates)):
        ratesNp = np.array(rates[i])
        reshaped = ratesNp.reshape(-1, 1)  # reshape to the 2D column format StandardScaler expects
        scaler = StandardScaler()
        scaler.fit(reshaped)
        if SAVE_FLAG:
            self.__save_scaler_value(fin, scaler, i)
        dataStd = scaler.transform(reshaped)
        for j in range(0, len(rates[i])):
            rates[i][j] = dataStd[j][0]
        del ratesNp
        del reshaped
        gc.collect()

    if SAVE_FLAG:
        fin.close()
def analyse_stock(X, Y):
    poly_degree = 3
    # print X.shape, Y.shape

    # Fit the scaler on the full 2D feature matrix; fitting on a single 1D column
    # and then transforming all of X would be inconsistent (and rejected by sklearn).
    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    # Scale the target with its own scaler; StandardScaler needs a 2D column vector.
    y_scaler = StandardScaler()
    Y = y_scaler.fit_transform(Y.reshape(-1, 1)).ravel()
    # print X[0]

    from sklearn.model_selection import train_test_split  # cross_validation module was removed
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, random_state=0)

    stock1_model_pipeline = Pipeline([('poly', PolynomialFeatures(degree=poly_degree)),
                                      ('linear', LinearRegression(fit_intercept=False))])
    stock1_model = stock1_model_pipeline.fit(X_train, y_train)
    print(stock1_model.score(X_test, y_test))
    # print stock1_model.predict(X_test[0])

    stock1_model_pipeline = Pipeline([('poly', PolynomialFeatures(degree=poly_degree)),
                                      ('linear', Ridge(alpha=.1))])
    stock1_model = stock1_model_pipeline.fit(X_train, y_train)  # fit the Ridge pipeline before scoring it
    print(stock1_model.score(X_test, y_test))
def plot_lr_regularization(): iris = datasets.load_iris() X = iris.data[:, [2, 3]] y = iris.target X_train, _, y_train, _ = train_test_split( X, y, test_size=0.3, random_state=0, ) sc = StandardScaler() sc.fit(X_train) X_train_std = sc.transform(X_train) weights = [] params = [] for c in np.logspace(-5, 4, num=10): lr = LogisticRegression(C=c, random_state=0) lr.fit(X_train_std, y_train) weights.append(lr.coef_[1]) params.append(c) weights = np.array(weights) plt.plot(params, weights[:, 0], label='petal length') plt.plot(params, weights[:, 1], linestyle='--', label='petal width') plt.ylabel('weight coefficient') plt.xlabel('C') plt.legend(loc='upper left') plt.xscale('log') plt.show()
def main(use_idf=False, random_state=None, std=False, n_jobs=-1, verbose=2): wc_idf_map = None if use_idf: # ingredients inverse document frequencies wc_components = build_tfidf_wc(verbose=(verbose > 0)) wc_idf = wc_components['model'].idf_ wc_idf_words = wc_components['model'].get_feature_names() wc_idf_map = dict(zip(wc_idf_words, wc_idf)) # word2vec recipe feature vectors wc_components = build_word2vec_wc(feature_vec_size=120, avg=True, idf=wc_idf_map, verbose=(verbose > 0)) y_train = wc_components['train']['df']['cuisine_code'].as_matrix() X_train = wc_components['train']['features_matrix'] # standardize features aka mean ~ 0, std ~ 1 if std: scaler = StandardScaler() scaler.fit(X_train) X_train = scaler.transform(X_train) # random forest supervised classifier time_0 = time.time() clf = RandomForestClassifier(n_estimators=100, max_depth=None, n_jobs=n_jobs, random_state=random_state, verbose=verbose) # perform cross validation cv_n_fold = 8 print 'cross validating %s ways...' % cv_n_fold scores_cv = cross_val_score(clf, X_train, y_train, cv=cv_n_fold, n_jobs=-1) print 'accuracy: %0.5f (+/- %0.5f)' % (scores_cv.mean(), scores_cv.std() * 2) time_1 = time.time() elapsed_time = time_1 - time_0 print 'cross validation took %.3f seconds' % elapsed_time
def kfolds_cv(estimator, X, y):
    num_folds = 10
    # The old KFold(n, n_folds=...) signature is gone; use n_splits and split(X)
    kf = KFold(n_splits=num_folds, shuffle=True)
    yhat_train = np.zeros(len(y), dtype=y.dtype)
    yhat_test = np.zeros(len(y), dtype=y.dtype)
    train_err = []
    test_err = []

    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Scale the data, fitting on the training fold only
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # fit the estimator
        estimator = estimator.fit(X_train_scaled, y_train)
        yhat_train = estimator.predict(X_train_scaled)
        yhat_test = estimator.predict(X_test_scaled)

        # store train and test error
        train_err.append(rmsle(y_train, yhat_train))
        test_err.append(rmsle(y_test, yhat_test))

    return {"Model Name": estimator.__class__.__name__,
            "Err Train": np.mean(train_err),
            "Err Test": np.mean(test_err)}
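# A minimal sketch of running kfolds_cv() above with a concrete estimator.
# RandomForestRegressor is an arbitrary choice, the data is synthetic, and rmsle
# is assumed to be defined elsewhere in the original source.
import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(42)
X_demo = rng.randn(200, 5)
y_demo = np.abs(rng.randn(200))   # non-negative targets, since rmsle uses logs
print(kfolds_cv(RandomForestRegressor(n_estimators=20, random_state=0), X_demo, y_demo))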
def data_processing(train, test, features):
    # train['StreetNo'] = train['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
    # test['StreetNo'] = test['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
    # train['Address'] = train['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
    # test['Address'] = test['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
    # train['hour'] = train['Dates'].apply(lambda x: x[11:13] if len(x) > 4 else 12)
    # test['hour'] = test['Dates'].apply(lambda x: x[11:13] if len(x) > 4 else 12)
    # train['dark'] = train['Dates'].apply(lambda x: 1 if (len(x) > 4 and int(x[11:13]) >= 18 and int(x[11:13]) < 6) else 0)
    # test['dark'] = test['Dates'].apply(lambda x: 1 if (len(x) > 4 and int(x[11:13]) >= 18 and int(x[11:13]) < 6) else 0)
    # features += ['hour', 'dark', 'StreetNo']

    print("Filling NAs")
    # print(train.mode())
    train = train.fillna(train.median().iloc[0])
    test = test.fillna(test.median().iloc[0])

    print("Label Encoder")
    le = LabelEncoder()
    for col in features:
        le.fit(list(train[col]) + list(test[col]))
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])
    le.fit(list(train[target]))
    train[target] = le.transform(train[target])

    print("Standard Scaler")
    scaler = StandardScaler()
    for col in features:
        # fit on the training column as a 2D frame (a plain list is 1D and would be rejected)
        scaler.fit(train[[col]])
        train[col] = scaler.transform(train[[col]]).ravel()
        test[col] = scaler.transform(test[[col]]).ravel()

    return train, test, features
def load_data(dataset, scale=False):
    '''
    Loads the dataset

    :type dataset: string
    :param dataset: The folder in ../data/ containing the training/testing numpy arrays
    '''
    print('... loading data')
    path = "../data/" + dataset + "/"

    # training set
    trainingData = numpy.load(path + "training.data.npy")
    trainingIndices = numpy.load(path + "training.indices.npy")
    trainingIndptr = numpy.load(path + "training.indptr.npy")
    training_y = numpy.load(path + "training.labels.npy")
    training_X = scipy.sparse.csr_matrix((trainingData, trainingIndices, trainingIndptr))

    # testing set
    testingData = numpy.load(path + "testing.data.npy")
    testingIndices = numpy.load(path + "testing.indices.npy")
    testingIndptr = numpy.load(path + "testing.indptr.npy")
    testing_y = numpy.load(path + "testing.labels.npy")
    testing_X = scipy.sparse.csr_matrix((testingData, testingIndices, testingIndptr))

    # scale the data (with_mean=False keeps the sparse matrices sparse)
    if scale:
        print("..training scaler")
        scaler = StandardScaler(with_mean=False)
        scaler.fit(training_X)
        print("..scaling features")
        training_X = scaler.transform(training_X)
        testing_X = scaler.transform(testing_X)

    return [(training_X, training_y), (testing_X, testing_y)]
def get_norm_nFoldData(trainXY, testXY):
    trainX = trainXY[:, :-1]
    trainY = trainXY[:, -1]
    testX = testXY[:, :-1]
    testY = testXY[:, -1]

    # standardise only the x values, not the labels; fit the scaler on the training
    # features only and reuse it for the test features (refitting on the test set
    # would leak test statistics and scale the two sets inconsistently)
    scaler = StandardScaler()
    scaler.fit(trainX)
    trainX = scaler.transform(trainX)
    testX = scaler.transform(testX)

    trainY = trainY.reshape((trainY.shape[0], 1))
    testY = testY.reshape((testY.shape[0], 1))

    train_X_Y = np.concatenate((trainX, trainY), axis=1)
    test_X_Y = np.concatenate((testX, testY), axis=1)

    folds_tr = []
    folds_te = []
    nfolds = 5
    for i in range(nfolds):
        xp = int(train_X_Y.shape[0] * .8)
        np.random.shuffle(train_X_Y)
        folds_tr.append(train_X_Y[:xp, :])
        folds_te.append(train_X_Y[xp:, :])

    return folds_tr, folds_te
def svc_appr():
    """
    Best params: {'C': 0.022139881953014046}

    Submission:
    E_val:
    E_in:
    E_out:
    """
    from sklearn.svm import LinearSVC
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV  # cross_validation/grid_search were removed
    from scipy.stats import expon

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    svc = LinearSVC(dual=False, class_weight='balanced')  # 'auto' is deprecated
    rs = RandomizedSearchCV(svc, n_iter=50, scoring='roc_auc', n_jobs=-1,
                            cv=StratifiedKFold(5),  # labels now go to fit(), not the splitter
                            verbose=2,
                            param_distributions={'C': expon()})
    rs.fit(X_scaled, y)

    logger.debug('Got best SVC.')
    logger.debug('Best params: %s', rs.best_params_)
    logger.debug('Grid scores:')
    # grid_scores_ was replaced by cv_results_
    for mean, params in zip(rs.cv_results_['mean_test_score'], rs.cv_results_['params']):
        print('\t%f %s' % (mean, params))
    logger.debug('Best score (E_val): %s', rs.best_score_)
    logger.debug('E_in: %f', Util.auc_score(rs, X_scaled, y))
def lr_with_scale3():
    """
    Check the performance of normalizing the TEST SET.

    Submission: lr_with_scale3_0707_04.csv
    E_val:
    E_in: 0.879233
    E_out: 0.8770121701777971

    Submission: lr_with_scale3_0712_01.csv
    E_val:
    E_in:
    E_out:
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import cross_val_score  # cross_validation module was removed
    from sklearn.pipeline import Pipeline
    import numpy as np

    X, y = dataset.load_train()

    # fit the scaler on train + test features together (that is what this experiment checks)
    raw_scaler = StandardScaler()
    raw_scaler.fit(np.r_[X, dataset.load_test()])
    X_scaled = raw_scaler.transform(X)

    clf = LogisticRegression(C=0.03, class_weight='balanced')  # 'auto' is deprecated
    clf.fit(X_scaled, y)
    logger.debug('E_in: %f', Util.auc_score(clf, X_scaled, y))
    IO.dump_submission(Pipeline([('scale_raw', raw_scaler), ('lr', clf)]),
                       'lr_with_scale3_0712_01')

    scores = cross_val_score(clf, X_scaled, y, scoring='roc_auc', n_jobs=-1)
    logger.debug('E_val: %f <- %s', np.average(scores), scores)
def bagging_lr():
    """
    Submission: bagging_lr_0707_02.csv
    E_val:
    E_in:
    E_out:
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import BaggingClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    bag = BaggingClassifier(LogisticRegression(class_weight='balanced'),
                            n_estimators=3000, oob_score=True, n_jobs=-1,
                            verbose=2)
    bag.fit(X_scaled, y)  # the ensemble must be fitted before oob_score_ is available

    logger.debug('E_val (oob): %f', bag.oob_score_)
    logger.debug('E_in: %f', Util.auc_score(bag, X_scaled, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler), ('bag', bag)]),
                       'bagging_lr_0707_02')
def ada_boost_dt():
    """
    Submission: ada_boost_dt_0707_03.csv
    E_val: 0.854350
    E_in: 0.889561
    E_out: 0.8832315976033993
    """
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import cross_val_score  # cross_validation module was removed
    from sklearn.pipeline import Pipeline

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    ab = AdaBoostClassifier(n_estimators=300)

    scores = cross_val_score(ab, X_scaled, y, cv=5, n_jobs=-1)
    logger.debug('CV: %s', scores)
    logger.debug('E_val: %f', sum(scores) / len(scores))

    ab.fit(X_scaled, y)
    logger.debug('E_in: %f', Util.auc_score(ab, X_scaled, y))
    IO.dump_submission(Pipeline([('scale_raw', raw_scaler), ('ab', ab)]),
                       'ada_boost_dt_0707_03')
def __init__(self): """ Constructs a SimulateData object. """ # Read the simulated data. simulated = pd.read_csv("simulated.csv", index_col=0) predictors = np.asarray(simulated)[:, 0:-1] responses = np.asarray(simulated)[:, -1] # Divide the simulated data into training and test sets. predictors_training, predictors_test,\ self.responses_training, self.responses_test =\ train_test_split(predictors, responses, test_size=0.33) # Standardize the predictors, both training and test. scaler = StandardScaler() scaler.fit(predictors_training) self.predictors_training = scaler.transform(predictors_training) self.predictors_test = scaler.transform(predictors_test) # Keep track of the number of samples in the training and test sets, # and also the number of features. self.training_sample_count = len(self.responses_training) self.test_sample_count = len(self.responses_test) self.feature_count = np.size(predictors, 1) return None
> From cc.T[0] we can see that PC1 appears to correlate the most with the arts feature, with a correlation value of 0.537

2. For the second component:

> From cc.T[1] we can see that PC2 appears to correlate the most with the Healthcare feature, with a correlation value of 0.1939
"""

"""<h2>4f. PCA with standardizing</h2>"""

data = pd.read_csv('places.txt', delim_whitespace=True, na_values='?')
table = data[['Climate', 'HousingCost', 'HlthCare', 'Crime', 'Transp', 'Educ',
              'Arts', 'Recreat', 'Econ']]

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(table)
mean = scaler.mean_
table = scaler.transform(table)

mean = np.mean(table, axis=0)
print(mean)
std = np.std(table, axis=0)
print(std)

from sklearn.decomposition import PCA
pca = PCA()
pca.fit(table)
print(pca.components_.shape)
pa1 = pca.components_[0]
return (X, Y) #train data data = np.genfromtxt('output/train/train.csv', delimiter=';') (X_train, Y_train) = clean(data, ncases) #test data data = np.genfromtxt('output/test/test.csv', delimiter=';') (X_test, Y_test) = clean(data, ncases) del data #preprocess from sklearn.preprocessing import StandardScaler scaler = StandardScaler() scaler.fit(X_train) X_train = scaler.transform(X_train) X_test = scaler.transform(X_test) #define the method from sklearn.neural_network import MLPClassifier layers = [9] activation = 'relu' alpha = 0.001 type_rate = 'adaptive' rate = 0.1 momentum = 0.09 max_iter = 2000 model = MLPClassifier( solver='sgd', hidden_layer_sizes=layers,
def train_NN(X,Y,target_names): print('Neural Network') #split the dataset into training set and testing set X[np.isnan(X)] = 0 X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.33, random_state=0) print('training set') print(X_train.shape) #preprocessing the data scaler = StandardScaler() scaler.fit(X_train) X_train = scaler.transform(X_train) X_test = scaler.transform(X_test) num_training_sample = len(X_train) best_hidden_layers_list,best_hidden_layers_tuple = grid_search(X_train, X_test, Y_train, Y_test,num_training_sample) nn_clf = MLPClassifier(alpha=1e-5, hidden_layer_sizes=best_hidden_layers_tuple, random_state=1) #fit the training data to the model nn_clf.fit(X_train,Y_train) Y_pred = nn_clf.predict(X_test) #common standard to compare across models print('f1') f1_clf = f1_score(Y_test, Y_pred, average='samples') print(f1_clf) print('classification report') print(classification_report(Y_test,Y_pred)) ##save model f_nn = open('nn_clf.pkl',"wb+") pickle.dump(nn_clf, f_nn) f_nn.close() f_nn_sc = open('nn_scaler.pkl',"wb+") pickle.dump(scaler, f_nn_sc) f_nn_sc.close() ''' precision recall f1-score support 0 0.00 0.00 0.00 28 1 1.00 0.12 0.22 16 2 0.00 0.00 0.00 39 3 0.00 0.00 0.00 31 4 0.00 0.00 0.00 27 5 0.60 0.52 0.56 29 6 0.00 0.00 0.00 23 7 0.00 0.00 0.00 19 micro avg 0.63 0.08 0.14 212 macro avg 0.20 0.08 0.10 212 weighted avg 0.16 0.08 0.09 212 samples avg 0.00 0.00 0.00 212 ['Case', 'Model', 'PowerDissipation', 'StorageTemperature', 'ThermalResistance', 'Type', 'Voltage', 'Weigth'] ''' return nn_clf, f1_clf
from income_data import X, y, X_train, X_test, y_train, y_test ###### # Run Grid Search to find optimal components ###### # import packages from sklearn.svm import SVC from sklearn.neural_network import MLPClassifier from sklearn.metrics import accuracy_score from sklearn.model_selection import GridSearchCV from sklearn.pipeline import Pipeline from plot_learning_curve import drawLearningCurve # Scale the data scaler = StandardScaler() scaler.fit(X) X_train_std = scaler.transform(X_train) X_test_std = scaler.transform(X_test) X_toTransform = X_train_std y_train = y_train y_test = y_test # Define the classifier # svm = SVC(random_state=1) # parameters = {'kernel':(['linear']) # ,'C':[10] # ,'gamma':([0.1]) # } # clf = GridSearchCV(svm, param_grid=parameters, cv=3) # N_FEATURES_OPTIONS = [2]
import numpy as np
import pandas as pd
import copy
from ann2 import Net
from replicate import replicate_data
from sklearn.preprocessing import StandardScaler
from train import train

# Load training and testing data as pd dataframe
training_data = pd.read_excel('Data3/reduced_training_data.xlsx')
testing_data = pd.read_excel('Data3/test_data.xlsx')

# Standardise training and testing data (each set gets its own scaler here;
# note the training data also has to be transformed, not just used for fitting)
scaler_train = StandardScaler()
scaler_test = StandardScaler()
scaler_train.fit(training_data)
scaler_test.fit(testing_data)
training_data = scaler_train.transform(training_data)
testing_data = scaler_test.transform(testing_data)

# Convert training data back to a pd dataframe
columns = "BC NC LP LI".split()
training_data = pd.DataFrame(data=training_data, index=None, columns=columns)

# Replicate the training data (DataFrame.append was removed; use pd.concat)
replicated_data1 = replicate_data(training_data, 10, 0.03)
replicated_data2 = replicate_data(training_data, 10, 0.05)
training_data = pd.concat([training_data, replicated_data1], ignore_index=True, sort=False)
def predict(data, learn_range): x, y = [], [] for start in range(len(data[0]) - learn_range): group_data = pd.DataFrame() for i in reversed(range(len(data))): add_data = pd.DataFrame(data[i])[start : start + learn_range] group_data = pd.concat((group_data, add_data), axis=0) group_data = group_data.as_matrix() if group_data[-1] < data[0][start + learn_range]: y.append(1) else: y.append(0) x.append([e for i in group_data for e in i]) x_train, x_test, y_train, y_test \ = train_test_split(x, y, test_size=0.3, random_state=0, shuffle=False) select = SelectFromModel(RandomForestClassifier(), threshold="median").fit(x_train, y_train) X_train_selected = select.transform(x_train) X_test_selected = select.transform(x_test) algorithm = RandomForestClassifier predict_result = [] predict_result2 = [] if algorithm == tree.DecisionTreeClassifier or algorithm == RandomForestClassifier: clf = algorithm(random_state=0) clf.fit(x_train, y_train) predict_result = clf.predict(x_test) clf2 = algorithm(random_state=0) clf2.fit(X_train_selected, y_train) predict_result2 = clf2.predict(X_test_selected) if algorithm == SVC or algorithm == xgb.XGBClassifier: sc = StandardScaler() sc.fit(x_train) x_train_std = sc.transform(x_train) x_test_std = sc.transform(x_test) clf = algorithm(random_state=0) clf.fit(x_train_std, y_train) predict_result = clf.predict(x_test_std) total = [] for i in range(len(y_test)): before = data[0][i + learn_range + len(y_train) - 1] after = data[0][i + learn_range + len(y_train)] if predict_result[i] == 1: total.append(after - before) else: total.append(before - after) count_0 = float(y.count(0)) count_1 = float(y.count(1)) high = max([count_0, count_1]) / (count_0 + count_1) print accuracy_score(predict_result, y_test), accuracy_score(predict_result2, y_test) #print clf.feature_importances_ #print clf2.feature_importances_ print select.get_support() return (count_0, count_1, high, accuracy_score(predict_result, y_test), sum(total), clf.feature_importances_ , accuracy_score(predict_result2, y_test))
def preprocess_data(self, prefix, normalize=True, load_adj_dir = None, use_random_walks = True, load_walks=False, num_walk = 50, walk_len = 5, supervised=True, train_all_edge=False): G = self.G if G == None: raise Exception("Data hasn't been load") print("Loaded data.. now preprocessing..") # Categorize train, val and test nodes # Using id_maps.keys to control the node index self.nodes_ids = np.array([n for n in G.node.keys()]) # if not train_all_edge and 0: # self.train_nodes_ids = np.array([n for n in self.nodes_ids if not G.node[n]['val'] and not G.node[n]['test']]) # self.val_nodes_ids = np.array([n for n in self.nodes_ids if G.node[n]['val']]) # self.test_nodes_ids = np.array([n for n in self.nodes_ids if G.node[n]['test']]) # else: self.train_nodes_ids = np.array([n for n in self.nodes_ids]) self.val_nodes_ids = np.array([n for n in self.nodes_ids]) self.test_nodes_ids = np.array([n for n in self.nodes_ids]) self.nodes = np.array([self.id_map[n] for n in self.nodes_ids]) self.train_nodes = np.array([self.id_map[n] for n in self.train_nodes_ids]) self.val_nodes = np.array([self.id_map[n] for n in self.val_nodes_ids]) self.test_nodes = np.array([self.id_map[n] for n in self.test_nodes_ids]) ## Make sure the graph has edge train_removed annotations ## (some datasets might already have this..) for edge in G.edges(): # if (G.node[edge[0]]['val'] or G.node[edge[1]]['val'] or # G.node[edge[0]]['test'] or G.node[edge[1]]['test']): # G[edge[0]][edge[1]]['train_removed'] = True # else: G[edge[0]][edge[1]]['train_removed'] = False #Remove isolated train nodes after remove "train_remove" edge from train graph # and val nodes and test nodes from original graph if not train_all_edge: self.remove_isolated_node() #Construct train_deg and deg, deg[i] is degree of node that have idx i, train_deg consider "train_remove" edge if not train_all_edge: self.construct_train_val_deg() else: self.construct_all_deg() #Construct train_adj and adj, adj is matrix of Uniformly samples neighbors of nodes if load_adj_dir is not None: self.train_adj = np.load(load_adj_dir + "train_adj.npy") self.adj = np.load(load_adj_dir + "adj.npy") else: if not train_all_edge: self.construct_train_val_adj() else: self.construct_all_adj() if normalize and not self.feats is None: from sklearn.preprocessing import StandardScaler # import pdb # pdb.set_trace() train_feats = self.feats[self.train_nodes] scaler = StandardScaler() scaler.fit(train_feats) self.feats = scaler.transform(self.feats) if not supervised: if use_random_walks: if load_walks and os.path.exists(prefix + "-walks.txt"): walks = [] with open(prefix + "-walks.txt") as fp: for line in fp: walks.append(map(self.conversion, line.split())) self.walks = walks if len(walks) == 0: raise Exception("Empty walks file at {0}".format(prefix + "-walks.txt")) else: if load_walks: print("Walks file not exist, run random walk with num_walk {0} and len_walk {1}".format(num_walk, walk_len)) else: print("Run random walk with num_walk {0} and len_walk {1}".format(num_walk, walk_len)) self.walks = self.run_random_walks(out_file = self.prefix + "-walks.txt", num_walks = num_walk, walk_len = walk_len) print("Total walks edge: {0}".format(len(self.walks))) if not train_all_edge: self.construct_train_val_edge() else: self.construct_train_all_edge() print("Preprocessing finished, graph info:") print(nx.info(G))
def validate(): """ run KFOLD method for regression """ #defining directories dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged" dir_out = "/lustre/fs0/home/mtadesse/merraLRValidation" surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef" #cd to the lagged predictors directory os.chdir(dir_in) x = 475 y = 476 #empty dataframe for model validation df = pd.DataFrame(columns = ['tg', 'lon', 'lat', 'num_year', \ 'num_95pcs','corrn', 'rmse']) #looping through for tg in range(x,y): os.chdir(dir_in) tg_name = os.listdir()[tg] print(tg, tg_name) ########################################## #check if this tg is already taken care of ########################################## os.chdir(dir_out) if os.path.isfile(tg_name): return "file already analyzed!" os.chdir(dir_in) #load predictor pred = pd.read_csv(tg_name) pred.drop('Unnamed: 0', axis = 1, inplace = True) #add squared and cubed wind terms (as in WPI model) pickTerms = lambda x: x.startswith('wnd') wndTerms = pred.columns[list(map(pickTerms, pred.columns))] wnd_sqr = pred[wndTerms]**2 wnd_cbd = pred[wndTerms]**3 pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis = 1) #standardize predictor data dat = pred.iloc[:,1:] scaler = StandardScaler() print(scaler.fit(dat)) dat_standardized = pd.DataFrame(scaler.transform(dat), \ columns = dat.columns) pred_standardized = pd.concat([pred['date'], dat_standardized], axis = 1) #load surge data os.chdir(surge_path) surge = pd.read_csv(tg_name) surge.drop('Unnamed: 0', axis = 1, inplace = True) #remove duplicated surge rows surge.drop(surge[surge['ymd'].duplicated()].index, axis = 0, inplace = True) surge.reset_index(inplace = True) surge.drop('index', axis = 1, inplace = True) #adjust surge time format to match that of pred time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d')) surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns = ['date']) time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis = 1) #merge predictors and surge to find common time frame pred_surge = pd.merge(pred_standardized, surge_new.iloc[:,:2], on='date', how='right') pred_surge.sort_values(by = 'date', inplace = True) #find rows that have nans and remove them row_nan = pred_surge[pred_surge.isna().any(axis =1)] pred_surge.drop(row_nan.index, axis = 0, inplace = True) pred_surge.reset_index(inplace = True) pred_surge.drop('index', axis = 1, inplace = True) #in case pred and surge don't overlap if pred_surge.shape[0] == 0: print('-'*80) print('Predictors and Surge don''t overlap') print('-'*80) continue pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \ pred_surge['date'])), \ columns = ['date']) #prepare data for training/testing X = pred_surge.iloc[:,1:-1] y = pd.DataFrame(pred_surge['surge']) y = y.reset_index() y.drop(['index'], axis = 1, inplace = True) #apply PCA pca = PCA(.95) pca.fit(X) X_pca = pca.transform(X) #apply 10 fold cross validation kf = KFold(n_splits=10, random_state=29) metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs']) for train_index, test_index in kf.split(X): X_train, X_test = X_pca[train_index], X_pca[test_index] y_train, y_test = y['surge'][train_index], y['surge'][test_index] #train regression model lm = LinearRegression() lm.fit(X_train, y_train) #predictions predictions = lm.predict(X_test) # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \ # pd.DataFrame(np.array(y_test))], \ # axis = 1) # pred_obs.columns = ['pred', 'obs'] # combo = 
pd.concat([combo, pred_obs], axis = 0) #evaluation matrix - check p value if stats.pearsonr(y_test, predictions)[1] >= 0.05: print("insignificant correlation!") continue else: print(stats.pearsonr(y_test, predictions)) metric_corr.append(stats.pearsonr(y_test, predictions)[0]) print(np.sqrt(metrics.mean_squared_error(y_test, predictions))) metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions))) #number of years used to train/test model num_years = (pred_surge['date'][pred_surge.shape[0]-1] -\ pred_surge['date'][0]).days/365 longitude = surge['lon'][0] latitude = surge['lat'][0] num_pc = X_pca.shape[1] #number of principal components corr = np.mean(metric_corr) rmse = np.mean(metric_rmse) print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',np.mean(metric_corr), ' - avg_rmse (m) = ', \ np.mean(metric_rmse), '\n') #original size and pca size of matrix added new_df = pd.DataFrame([tg_name, longitude, latitude, num_years, num_pc, corr, rmse]).T new_df.columns = ['tg', 'lon', 'lat', 'num_year', \ 'num_95pcs','corrn', 'rmse'] df = pd.concat([df, new_df], axis = 0) #save df as cs - in case of interruption os.chdir(dir_out) df.to_csv(tg_name) #cd to dir_in os.chdir(dir_in)
def main(): """ Main program """ local_device_protos = device_lib.list_local_devices() logging.info( [x.name for x in local_device_protos if x.device_type == 'GPU']) bq = _bq.BQHandler() io = _io.IO(gs_bucket=options.gs_bucket) viz = _viz.Viz() starttime, endtime = io.get_dates(options) #save_path = options.save_path+'/'+options.config_name logging.info('Using dataset {} and time range {} - {}'.format( options.feature_dataset, starttime.strftime('%Y-%m-%d'), endtime.strftime('%Y-%m-%d'))) all_param_names = options.label_params + options.feature_params + options.meta_params aggs = io.get_aggs_from_param_names(options.feature_params) logging.info('Reading data...') bq.set_params(starttime, endtime, batch_size=2500000, loc_col='trainstation', project=options.project, dataset=options.feature_dataset, table=options.feature_table, parameters=all_param_names, only_winters=options.only_winters) data = bq.get_rows() data = io.filter_train_type(labels_df=data, train_types=options.train_types, sum_types=True, train_type_column='train_type', location_column='trainstation', time_column='time', sum_columns=['train_count', 'delay'], aggs=aggs) if options.y_avg_hours is not None: data = io.calc_running_delay_avg(data, options.y_avg_hours) if options.y_avg: data = io.calc_delay_avg(data) data.sort_values(by=['time', 'trainstation'], inplace=True) if options.normalize: logging.info('Normalizing data...') xscaler = StandardScaler() yscaler = StandardScaler() non_scaled_data = data.loc[:, options.meta_params] labels = data.loc[:, options.label_params].astype( np.float32).values.reshape((-1, 1)) yscaler.fit(labels) scaled_labels = pd.DataFrame(yscaler.transform(labels), columns=['delay']) scaled_features = pd.DataFrame(xscaler.fit_transform( data.loc[:, options.feature_params].astype(np.float32)), columns=options.feature_params) data = pd.concat([non_scaled_data, scaled_features, scaled_labels], axis=1) if options.pca: logging.info('Doing PCA analyzis for the data...') ipca = IncrementalPCA(n_components=options.pca_components, whiten=options.whiten, copy=False) non_processed_data = data.loc[:, options.meta_params + options.label_params] processed_data = data.loc[:, options.feature_params] ipca.fit(processed_data) processed_features = pd.DataFrame(ipca.transform(processed_data)) data = pd.concat([non_processed_data, processed_data], axis=1) fname = options.output_path + '/ipca_explained_variance.png' viz.explained_variance(ipca, fname) io._upload_to_bucket(filename=fname, ext_filename=fname) data_train, data_test = train_test_split(data, test_size=0.33) X_test, y_test = io.extract_batch(data_test, options.time_steps, batch_size=None, pad_strategy=options.pad_strategy, quantile=options.quantile, label_params=options.label_params, feature_params=options.feature_params) # Define model batch_size = io.get_batch_size(data_train, options.pad_strategy, quantile=options.quantile) logging.info('Batch size: {}'.format(batch_size)) model = LSTM.LSTM(options.time_steps, len(options.feature_params), 1, options.n_hidden, options.lr, options.p_drop, batch_size=batch_size) # Initialization rmses, mses, maes, steps, train_mse = [], [], [], [], [] saver = tf.train.Saver() sess = tf.Session() init = tf.global_variables_initializer() sess.run(init) summary_writer = tf.summary.FileWriter(options.log_dir, graph=tf.get_default_graph()) #tf.summary.scalar('Training MSE', model.loss) tf.summary.scalar('Validation_MSE', model.mse) tf.summary.scalar('Validation_RMSE', model.rmse) tf.summary.scalar('Validation_MAE', model.mae) 
tf.summary.histogram('y_pred_hist', model.y_pred) merged_summary_op = tf.summary.merge_all() train_summary_op = tf.summary.scalar('Training_MSE', model.loss) train_step = 0 start = 0 while True: # If slow is set, go forward one time step at time, # else proceed whole batch size if options.slow: X_train, y_train = io.extract_batch( data_train, options.time_steps, start=start, pad_strategy=options.pad_strategy, quantile=options.quantile, label_params=options.label_params, feature_params=options.feature_params) else: X_train, y_train = io.extract_batch( data_train, options.time_steps, train_step, pad_strategy=options.pad_strategy, quantile=options.quantile, label_params=options.label_params, feature_params=options.feature_params) if (len(X_train) < options.time_steps): break if options.cv: logging.info('Doing random search for hyper parameters...') param_grid = { "C": [0.001, 0.01, 0.1, 1, 10], "epsilon": [0.01, 0.1, 0.5], "kernel": ['rbf', 'linear', 'poly', 'sigmoid', 'precomputed'], "degree": [2, 3, 4], "shrinking": [True, False], "gamma": [0.001, 0.01, 0.1], "coef0": [0, 0.1, 1] } random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=int( options.n_iter_search), n_jobs=-1) random_search.fit(X_train, y_train) logging.info("RandomizedSearchCV done.") fname = options.output_path + '/random_search_cv_results.txt' report_cv_results(random_search.cv_results_, fname) io._upload_to_bucket(filename=fname, ext_filename=fname) sys.exit() else: if train_step == 0: logging.info('Training...') feed_dict = {model.X: X_train, model.y: y_train} _, loss, train_summary = sess.run( [model.train_op, model.loss, train_summary_op], feed_dict=feed_dict) summary_writer.add_summary(train_summary, train_step * batch_size) # Metrics feed_dict = {model.X: X_test, model.y: y_test} #model.cell_init_state: state} val_loss, rmse, mse, mae, y_pred, summary = sess.run( [ model.loss, model.rmse, model.mse, model.mae, model.y_pred, merged_summary_op ], feed_dict=feed_dict) train_mse.append(loss) mses.append(mse) rmses.append(rmse) maes.append(mae) steps.append(train_step) summary_writer.add_summary(summary, train_step * batch_size) if train_step % 50 == 0: logging.info("Step {}:".format(train_step)) logging.info("Training loss: {:.4f}".format(loss)) logging.info("Validation MSE: {:.4f}".format(val_loss)) logging.info('Validation RMSE: {}'.format(rmse)) logging.info('Validation MAE: {}'.format(mae)) logging.info('................') saver.save(sess, options.save_file) train_step += 1 start += 1 # <-- while True: saver.save(sess, options.save_file) if options.normalize: fname = options.save_path + '/yscaler.pkl' io.save_scikit_model(yscaler, fname, fname) io._upload_dir_to_bucket(options.save_path, options.save_path) try: fname = options.output_path + '/learning_over_time.png' metrics = [{ 'metrics': [{ 'values': mses, 'label': 'Validation MSE' }, { 'values': train_mse, 'label': 'Train MSE' }], 'y_label': 'MSE' }, { 'metrics': [{ 'values': rmses, 'label': 'Validation RMSE' }], 'y_label': 'RMSE' }, { 'metrics': [{ 'values': maes, 'label': 'Validation MAE' }], 'y_label': 'MAE' }] viz.plot_learning(metrics, fname) io._upload_to_bucket(filename=fname, ext_filename=fname) except Exception as e: logging.error(e) error_data = { 'steps': steps, 'mse': mses, 'rmse': rmses, 'mae': maes, 'train_mse': train_mse } fname = '{}/training_time_validation_errors.csv'.format( options.output_path) io.write_csv(error_data, filename=fname, ext_filename=fname)
#done at top # ** Create a StandardScaler() object called scaler.** # In[13]: scaler = StandardScaler() # ** Fit scaler to the features.** # In[14]: data_features = data.drop('TARGET CLASS', axis=1) scaler.fit(data_features) # **Use the .transform() method to transform the features to a scaled version.** # In[17]: scaled_features = scaler.transform(data.drop('TARGET CLASS', axis=1)) # **Convert the scaled features to a dataframe and check the head of this dataframe to make sure the scaling worked.** # In[20]: data_feat = pd.DataFrame(scaled_features, columns=data.columns[0:-1]) data_feat.head()
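# A hedged continuation sketch (not from the original notebook): with the scaled
# features above, a KNN classifier would typically be fit as below. 'TARGET CLASS'
# is the label column already present in `data`; the split parameters are assumptions.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test = train_test_split(
    data_feat, data['TARGET CLASS'], test_size=0.3, random_state=101)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))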
class Model(object): def __init__(self): self.features = [] # self.model = KNeighborsRegressor(n_neighbors=3, p=2) # self.model = LinearRegression() self.model = RandomForestRegressor(n_estimators=300) #self.model = AdaBoostRegressor(n_estimators=200) self.imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0) self.scaler = StandardScaler() self.trained = False @staticmethod def get_labeled_logs( dataset: List[ProgramLog]) -> List[Tuple[ProgramLog, float]]: # Logic here is a bit messy -- basically just want MAX_LOG_GRANULARITY entries at most for one log items = [] for log in dataset: total_time = log.duration() calls_per_entry = max(len(log.calls) // MAX_LOG_GRANULARITY, 1) acc = [] for syscall in log.calls: acc.append(syscall) if len(acc) > 1 and len(acc) % calls_per_entry == 0: new_log = ProgramLog(log.cmd, acc) items.append( (new_log, (new_log.duration() / total_time) * 1)) print("labeled log count", len(items)) return items def update_features(self, logs: List[ProgramLog]) -> (): features = set(self.features) for log in logs: features.update(log.to_feature_map().keys()) self.features = list(sorted(features)) def extract_features(self, log: ProgramLog) -> List[Any]: vec = [] feature_map = log.to_feature_map() for feature in self.features: vec.append(feature_map.get(feature, nan)) return vec def train(self, cmd: List[str], dataset: List[ProgramLog]): print("Generating trimmed dataset...") # Create a trimmed dataset of commands that prefix match -- getting as specific as possible trimmed_dataset = dataset i = 0 while len(cmd) > i: candidate = list( filter(lambda log: i < len(log.cmd) and log.cmd[i] == cmd[i], dataset)) if len(candidate) == 0: break else: trimmed_dataset = candidate i += 1 print("Generating labeled logs...") trimmed_dataset = Model.get_labeled_logs(trimmed_dataset) self.update_features([log for log, label in trimmed_dataset]) if len(trimmed_dataset) > 0: x = [] y = [] print("Extracting features from labeled logs...") for log, label in trimmed_dataset: x.append(self.extract_features(log)) y.append(label) x = np.array(x) y = np.array(y) print("Preprocessing data...") self.imp.fit(x, y) x = self.imp.transform(x) self.scaler.fit(x, y) x = self.scaler.transform(x) print("Fitting model...") self.model.fit(x, y) print("model training accuracy = ", self.model.score(x, y) * 100, "%") self.trained = True def predict_completion(self, log: ProgramLog) -> float: if not self.trained: return 1. features = [self.extract_features(log)] features = self.imp.transform(features) features = self.scaler.transform(features) return self.model.predict(features) def check_accuracy(self, labled_logs: List[Tuple[ProgramLog, float]]): if not self.trained: return 1. x = [self.extract_features(log) for log, _ in labled_logs] x = self.imp.transform(x) x = self.scaler.transform(x) y = [label for _, label in labled_logs] return self.model.score(x, y)
df = pd.read_csv(filepath + os.sep + "iris.data", skiprows=0, header=None) # target variable y = df.iloc[:, 4].map(class_label).values # print(df.head()) # sys.exit() # feature matrix print('*' * 30) first_feature = int(input('Please enter first feature >> ')) second_feature = int(input('Please enter second feature >> ')) print('*' * 30) X = df.iloc[:, [first_feature, second_feature]] # standardization of the feature matrix std_sc = StandardScaler(copy=True, with_mean=True, with_std=True) # X_new = std_sc.fit_transform(X) # Compute the mean and std to be used for later scaling std_sc.fit(X) # Perform standardization by centering and scaling and return X X_std = std_sc.transform(X) # standardized feature matrix # random splitting of train and test data # splitting date for training and test X_train, X_test, y_train, y_test = train_test_split(X_std, y, train_size=0.8, random_state=1, shuffle=True, stratify=y) # support vector machine classification svm = SVC(C=1, kernel='rbf', degree=3,
unscaled_inputs.columns.values #columns_to_scale = ['Month of absence', 'Day of the week', 'Seasons', # 'Transportation expense', 'Distance from Residence to Work', # 'Service time', 'Age', 'Work load Average/day ', 'Hit target', # 'Disciplinary failure', 'Son', 'Social drinker', # 'Social smoker', 'Pet', 'Weight', 'Height', 'Body mass index'] columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education'] columns_to_scale = [ x for x in unscaled_inputs.columns.values if x not in columns_to_omit ] absent_scaler = CustomScaler(columns_to_scale) absent_scaler.fit(unscaled_inputs) scaled_input = absent_scaler.transform(unscaled_inputs) from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(scaled_input, targets, train_size=0.8, random_state=20) from sklearn.naive_bayes import GaussianNB classifier = GaussianNB() classifier.fit(X_train, y_train) classifier.score(X_train, y_train) y_pred = classifier.predict(X_test)
for errormean3 in errorall: var = var + ((errormean3 - errormean)**2) var = var / (78 - 1) deviation = math.sqrt(var) x0, x1, lx2, mdatax2 = dispose('wine_train.csv', 0, 1) trainingx2 = [] trainingx2.append(x0) trainingx2.append(x1) trainingx2 = np.array(trainingx2) trainingx2 = trainingx2.T mdatax2 = np.array(mdatax2) lx2 = np.array(lx2) std2 = StandardScaler() std2.fit(trainingx2) train_std2 = std2.fit_transform(trainingx2) y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, l2, mdata2 = dispose2( 'wine_train.csv', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) training2 = [] training2.append(y0) training2.append(y1) training2.append(y2) training2.append(y3) training2.append(y4) training2.append(y5) training2.append(y6) training2.append(y7) training2.append(y8) training2.append(y9)
label_encoded_df = all_df[label_cols].apply(label_encoder)  # apply the label_encoder function (converts values to numeric codes)

numerical_df = pd.DataFrame(scaler.fit_transform(all_df[numerical_cols]),
                            columns=numerical_cols)  # preprocess the numeric columns with StandardScaler

target_df = all_df[TARGET]

# aggregate by column (axis = 0), by row (axis = 1)
all_df = pd.concat([numerical_df, label_encoded_df, onehot_encoded_df, target_df], axis=1)

all_df_scaled = all_df.drop([TARGET], axis=1).copy()

scaler.fit(all_df.drop([TARGET], axis=1))
all_df_scaled = scaler.transform(all_df_scaled)
# When transforming test data with a scaler that was fit() on the training set,
# do NOT call fit() again on the test data; reuse the already-fitted scaler as-is
# and only call transform().
all_df_scaled = pd.DataFrame(all_df_scaled, columns=all_df.drop([TARGET], axis=1).columns)

X = all_df_scaled
y = all_df[TARGET]
print(f'X:{X.shape} y: {y.shape}')
# X:(200000, 21) y: (200000,)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_SEED)

test = all_df_scaled[len(train):]
def reconstructRF():
    """
    run KFOLD method for random forest regression
    """
    # import packages
    import os
    import numpy as np
    import pandas as pd
    # from sklearn import metrics
    # from scipy import stats
    # import seaborn as sns
    # import matplotlib.pyplot as plt
    # from sklearn.model_selection import KFold
    from datetime import datetime
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    # defining directories
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/rfReconstruction"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    # #load KFOLD result csv file
    # os.chdir('F:\\06_eraint_results\\sonstig')
    # kf_dat = pd.read_csv('eraint_randForest_kfold.csv')
    # #edit the tg names to be usable later on
    # editName = lambda x: x.split('.csv')[0]
    # kf_dat['tg'] = pd.DataFrame(list(map(editName, kf_dat['tg'])), columns=['tg'])

    # cd to the lagged predictors directory
    os.chdir(dir_in)

    x = 39
    y = 40

    # looping through tide gauges
    for tg in range(x, y):

        os.chdir(dir_in)
        tg_name = os.listdir()[tg]
        print(tg, tg_name)

        # load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        # add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        # standardize predictor data
        dat = pred.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat),
                                        columns=dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1)

        # load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        # remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index, axis=0, inplace=True)
        surge.reset_index(inplace=True)
        surge.drop('index', axis=1, inplace=True)

        # adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns=['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis=1)

        # merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized, surge_new.iloc[:, :2],
                              on='date', how='right')
        pred_surge.sort_values(by='date', inplace=True)

        # find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis=1)]
        pred_surge.drop(row_nan.index, axis=0, inplace=True)
        pred_surge.reset_index(inplace=True)
        pred_surge.drop('index', axis=1, inplace=True)

        # in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-' * 80)
            print("Predictors and Surge don't overlap")
            print('-' * 80)
            continue

        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, pred_surge['date'])),
                                          columns=['date'])

        # prepare data for training/testing
        X = pred_surge.iloc[:, 1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis=1, inplace=True)

        # apply PCA
        # #get the number of PCs used during validation
        # pc_num = kf_dat.loc[kf_dat['tg'] == tg_name]['num_95pcs']
        pca = PCA(0.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        # #apply 10 fold cross validation
        # kf = KFold(n_splits=10, random_state=29)
        # metric_corr = []; metric_rmse = []  # combo = pd.DataFrame(columns=['pred', 'obs'])
        # for train_index, test_index in kf.split(X):
        #     X_train, X_test = X_pca[train_index], X_pca[test_index]
        #     y_train, y_test = y['surge'][train_index], y['surge'][test_index]
        #     #train regression model
        #     rf = RandomForestRegressor(n_estimators=50, min_samples_leaf=1)
        #     rf.fit(X_train, y_train)
        #     #predictions
        #     predictions = rf.predict(X_test)
        #     # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)),
        #     #                       pd.DataFrame(np.array(y_test))], axis=1)
        #     # pred_obs.columns = ['pred', 'obs']
        #     # combo = pd.concat([combo, pred_obs], axis=0)
        #     #evaluation metric - check p value
        #     if stats.pearsonr(y_test, predictions)[1] >= 0.05:
        #         print("insignificant correlation!")
        #         continue
        #     else:
        #         #print(stats.pearsonr(y_test, predictions))
        #         metric_corr.append(stats.pearsonr(y_test, predictions)[0])
        #         #print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
        #         metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
        #
        # #number of years used to train/test model
        # num_years = np.ceil((pred_surge['date'][pred_surge.shape[0]-1] -
        #                      pred_surge['date'][0]).days / 365)

        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1]  # number of principal components
        # corr = np.mean(metric_corr)
        # rmse = np.mean(metric_rmse)
        # print('num_year = ', num_years, ' num_pc = ', num_pc, 'avg_corr = ',
        #       np.mean(metric_corr), ' - avg_rmse (m) = ', np.mean(metric_rmse), '\n')

        #%%
        # surge reconstruction
        pred_for_recon = pred[~pred.isna().any(axis=1)]
        pred_for_recon = pred_for_recon.reset_index().drop('index', axis=1)

        # standardize predictor data
        dat = pred_for_recon.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat),
                                        columns=dat.columns)
        pred_standardized = pd.concat([pred_for_recon['date'], dat_standardized], axis=1)

        X_recon = pred_standardized.iloc[:, 1:]

        # apply PCA
        pca = PCA(num_pc)  # use the same number of PCs used for training
        pca.fit(X_recon)
        X_pca_recon = pca.transform(X_recon)

        #%%
        # model preparation
        # defining the rf model with number of trees and minimum leaves
        rf = RandomForestRegressor(n_estimators=50, min_samples_leaf=1,
                                   random_state=29)
        rf.fit(X_pca, y)

        # get prediction interval
        def pred_ints(model, X_pca_recon, percentile=95):
            """
            function to construct prediction interval
            taking into account the result of each regression tree
            """
            err_down = []
            err_up = []
            preds = []
            for tree in model.estimators_:
                preds.append(tree.predict(X_pca_recon))
            preds = np.vstack(preds).T
            err_down = np.percentile(preds, (100 - percentile) / 2., axis=1,
                                     keepdims=True)
            err_up = np.percentile(preds, 100 - (100 - percentile) / 2., axis=1,
                                   keepdims=True)
            return err_down.reshape(-1), err_up.reshape(-1)

        # compute 95% prediction intervals
        err_down, err_up = pred_ints(rf, X_pca_recon, percentile=95)

        # reconstructed surge goes here
        truth = rf.predict(X_pca_recon)
        correct = 0.
        for i, val in enumerate(truth):
            if err_down[i] <= val <= err_up[i]:
                correct += 1
        print(correct * 100 / len(truth), '\n')

        # final dataframe
        final_dat = pd.concat([pred_standardized['date'],
                               pd.DataFrame([truth, err_down, err_up]).T], axis=1)
        final_dat['lon'] = longitude
        final_dat['lat'] = latitude
        final_dat.columns = ['date', 'surge_reconsturcted', 'pred_int_lower',
                             'pred_int_upper', 'lon', 'lat']

        # plot - optional
        # time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        # final_dat['date'] = pd.DataFrame(list(map(time_stamp, final_dat['date'])), columns=['date'])
        # surge['date'] = pd.DataFrame(list(map(time_stamp, surge['date'])), columns=['date'])
        # sns.set_context('notebook', font_scale=2)
        # plt.figure()
        # plt.plot(final_dat['date'], final_dat['mean'], color='green')
        # plt.scatter(surge['date'], surge['surge'], color='blue')
        # #prediction intervals
        # plt.plot(final_dat['date'], final_dat['obs_ci_lower'], color='red', linestyle="--", lw=0.8)
        # plt.plot(final_dat['date'], final_dat['obs_ci_upper'], color='red', linestyle="--", lw=0.8)
        # #confidence intervals
        # plt.plot(final_dat['date'], final_dat['mean_ci_upper'], color='black', linestyle="--", lw=0.8)
        # plt.plot(final_dat['date'], final_dat['mean_ci_lower'], color='black', linestyle="--", lw=0.8)

        # save df as csv - in case of interruption
        os.chdir(dir_out)
        final_dat.to_csv(tg_name)

        # cd to dir_in
        os.chdir(dir_in)
y = generate_random_points(n=100, p=10)
x = generate_random_points(n=100, p=10)

# suppress scikit-learn future warnings
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

from numpy import mean, std
from sklearn.preprocessing import StandardScaler, scale

scaler = StandardScaler()
scaler.fit(x)
x_transformed = scaler.transform(x)

x_scaled = scale(x)
y_scaled = scale(y)

# print(x_scaled)
# print(x_transformed)
# print(mean(x))
# print(mean(x_scaled))
# print(mean(x_transformed))
# print(std(x))
# print(std(x_scaled))
# print(std(x_transformed))
# print(mean(x_scaled))
# print(std(x_scaled))
print("LASSO OF unstandardized :") #print(clf.coef_) print("\n", pretty_print_linear(clf.coef_)) print("Training score of LASSO with alpha {} is {} \n".format( alpha, clf.score(X_all_train, y_all_train))) print("Testing score of LASSO with alpha {} is {} \n".format( alpha, clf.score(X_all_test, y_all_test))) print("clf != 0 : ", sum(clf.coef_ != 0)) # # # from sklearn.preprocessing import StandardScaler scaler = StandardScaler() scaler.fit(dataset) dataset = scaler.transform(dataset) dataset = pd.DataFrame( dataset, columns=[ 'Age', 'Number of sexual partners', 'First sexual intercourse', 'Num of pregnancies', 'Smokes', 'Smokes (years)', 'Smokes (packs/year)', 'Hormonal Contraceptives', 'Hormonal Contraceptives (years)', 'IUD', 'IUD (years)', 'STDs', 'STDs (number)', 'STDs:condylomatosis', 'STDs:vaginal condylomatosis', 'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis', 'STDs:pelvic inflammatory disease', 'STDs:genital herpes', 'STDs:molluscum contagiosum', 'STDs:HIV', 'STDs:Hepatitis B', 'STDs:HPV', 'STDs: Number of diagnosis', 'Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller', 'Citology', 'Biopsy' ])
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

# Check the label counts
print("Labels y:", np.bincount(y))
print("Labels y_train:", np.bincount(y_train))
print("Labels y_test:", np.bincount(y_test))

# Feature scaling
print("// Training started //")
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
print("// Training finished //")

# Perceptron
ppn = Perceptron(max_iter=40, eta0=0.01, random_state=1)
ppn.fit(X_train_std, y_train)
y_pred = ppn.predict(X_test_std)
print("Misclassified samples: %d" % (y_test != y_pred).sum())

# Accuracy
print("Accuracy: %.2f" % accuracy_score(y_test, y_pred))
from sklearn.preprocessing import StandardScaler
import matplotlib.pylab as plt
import seaborn as sns

train_val = ftrs_df['q'].unique()
t_ixs, v_ixs = train_test_split(train_val, test_size=0.2, random_state=SEED)

x_train = data.loc[ftrs_df['q'].isin(t_ixs)]
y_train = ftrs_df.loc[ftrs_df['q'].isin(t_ixs), 'rank']
x_val = data.loc[ftrs_df['q'].isin(v_ixs)]
y_val = ftrs_df.loc[ftrs_df['q'].isin(v_ixs), 'rank']

x_train, y_train = shuffle(x_train, y_train, random_state=SEED)
x_val, y_val = shuffle(x_val, y_val, random_state=SEED)

scaler = StandardScaler()
print(scaler.fit(x_train))
x_train = pd.DataFrame(scaler.transform(x_train), columns=x_train.columns)
x_val = pd.DataFrame(scaler.transform(x_val), columns=x_val.columns)

plt.scatter(x_val['bm25'], x_val['tfidf_gs'], c=y_val, alpha=0.1)
plt.show()

from sklearn import linear_model

model = linear_model.LogisticRegression(C=1)
model.fit(x_train, y_train)
probs = model.predict_proba(x_val)[:, 0]
ones = probs[np.where(y_val == 1)]
twoes = probs[np.where(y_val == 2)]
x = wine.drop("quality", axis=1)

# relabel y --- (*2)
newlist = []
for v in list(y):
    if v <= 4:
        newlist += [0]
    elif v <= 7:
        newlist += [1]
    else:
        newlist += [2]
y = newlist

from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = StandardScaler()
scaler.fit(x)
x = scaler.transform(x)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# training
model = RandomForestClassifier(n_estimators=800, max_features='log2', n_jobs=4)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)

# evaluation
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
print("Accuracy =", accuracy_score(y_test, y_pred))
print("Score :", score)
# Here model.score and accuracy_score give the same value: the model is a classifier,
# so score() is itself the classification accuracy of the predicted labels.
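The last comment is worth making concrete: for scikit-learn classifiers, score() is defined as mean accuracy, so it always matches accuracy_score on the same split. A minimal sketch on hypothetical toy data (not the wine dataset above):

# Minimal sketch: clf.score(X, y) == accuracy_score(y, clf.predict(X)) for classifiers.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_toy, y_toy = make_classification(n_samples=200, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_toy, y_toy, test_size=0.2, random_state=0)
clf = RandomForestClassifier(n_estimators=50, random_state=0).fit(X_tr, y_tr)
assert clf.score(X_te, y_te) == accuracy_score(y_te, clf.predict(X_te))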
# In[25]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=33)

# In[26]:

from sklearn.preprocessing import StandardScaler

num_values1 = data.select_dtypes(['float64', 'int64']).columns
scaler = StandardScaler()
scaler.fit(data[num_values1])
data[num_values1] = scaler.transform(data[num_values1])

# In[27]:

from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# In[28]:
def reconstruct():
    """
    run KFOLD method for regression
    """
    # import packages
    import os
    import pandas as pd
    import statsmodels.api as sm
    from datetime import datetime
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    # defining directories
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/mlrReconstruction"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    # cd to the lagged predictors directory
    os.chdir(dir_in)

    x = 173
    y = 174

    # looping through tide gauges
    for tg in range(x, y):

        os.chdir(dir_in)
        tg_name = os.listdir()[tg]
        print(tg, tg_name)

        # load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        # add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        # standardize predictor data
        dat = pred.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat),
                                        columns=dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1)

        # load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        # remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index, axis=0, inplace=True)
        surge.reset_index(inplace=True)
        surge.drop('index', axis=1, inplace=True)

        # adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns=['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis=1)

        # merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized, surge_new.iloc[:, :2],
                              on='date', how='right')
        pred_surge.sort_values(by='date', inplace=True)

        # find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis=1)]
        pred_surge.drop(row_nan.index, axis=0, inplace=True)
        pred_surge.reset_index(inplace=True)
        pred_surge.drop('index', axis=1, inplace=True)

        # in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-' * 80)
            print("Predictors and Surge don't overlap")
            print('-' * 80)
            continue

        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, pred_surge['date'])),
                                          columns=['date'])

        # prepare data for training/testing
        X = pred_surge.iloc[:, 1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis=1, inplace=True)

        # apply PCA; keep enough PCs to explain 95% of the variance
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        # #apply 10 fold cross validation
        # kf = KFold(n_splits=10, random_state=29)
        # metric_corr = []; metric_rmse = []  # combo = pd.DataFrame(columns=['pred', 'obs'])
        # for train_index, test_index in kf.split(X):
        #     X_train, X_test = X_pca[train_index], X_pca[test_index]
        #     y_train, y_test = y['surge'][train_index], y['surge'][test_index]
        #     #train regression model
        #     lm = LinearRegression()
        #     lm.fit(X_train, y_train)
        #     #predictions
        #     predictions = lm.predict(X_test)
        #     # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)),
        #     #                       pd.DataFrame(np.array(y_test))], axis=1)
        #     # pred_obs.columns = ['pred', 'obs']
        #     # combo = pd.concat([combo, pred_obs], axis=0)
        #     #evaluation metric - check p value
        #     if stats.pearsonr(y_test, predictions)[1] >= 0.05:
        #         print("insignificant correlation!")
        #         continue
        #     else:
        #         #print(stats.pearsonr(y_test, predictions))
        #         metric_corr.append(stats.pearsonr(y_test, predictions)[0])
        #         #print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
        #         metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
        #
        # #number of years used to train/test model
        # num_years = np.ceil((pred_surge['date'][pred_surge.shape[0]-1] -
        #                      pred_surge['date'][0]).days / 365)
        # longitude = surge['lon'][0]
        # latitude = surge['lat'][0]
        # num_pc = X_pca.shape[1]  # number of principal components
        # corr = np.mean(metric_corr)
        # rmse = np.mean(metric_rmse)
        # print('num_year = ', num_years, ' num_pc = ', num_pc, 'avg_corr = ',
        #       np.mean(metric_corr), ' - avg_rmse (m) = ', np.mean(metric_rmse), '\n')

        num_pc = X_pca.shape[1]  # number of principal components
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]

        # surge reconstruction
        pred_for_recon = pred[~pred.isna().any(axis=1)]
        pred_for_recon = pred_for_recon.reset_index().drop('index', axis=1)

        # standardize predictor data
        dat = pred_for_recon.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat),
                                        columns=dat.columns)
        pred_standardized = pd.concat([pred_for_recon['date'], dat_standardized], axis=1)

        X_recon = pred_standardized.iloc[:, 1:]

        # apply PCA
        pca = PCA(num_pc)  # use the same number of PCs used for training
        pca.fit(X_recon)
        X_pca_recon = pca.transform(X_recon)

        # model preparation
        # first train model using observed surge and corresponding predictors
        X_pca = sm.add_constant(X_pca)
        est = sm.OLS(y['surge'], X_pca).fit()

        # predict with X_recon and get 95% prediction interval
        X_pca_recon = sm.add_constant(X_pca_recon)
        predictions = est.get_prediction(X_pca_recon).summary_frame(alpha=0.05)

        # drop confidence interval and mean_se columns
        predictions.drop(['mean_se', 'mean_ci_lower', 'mean_ci_upper'],
                         axis=1, inplace=True)

        # final dataframe
        final_dat = pd.concat([pred_standardized['date'], predictions], axis=1)
        final_dat['lon'] = longitude
        final_dat['lat'] = latitude
        final_dat.columns = ['date', 'surge_reconsturcted', 'pred_int_lower',
                             'pred_int_upper', 'lon', 'lat']

        # plot - optional
        # time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        # final_dat['date'] = pd.DataFrame(list(map(time_stamp, final_dat['date'])), columns=['date'])
        # surge['date'] = pd.DataFrame(list(map(time_stamp, surge['date'])), columns=['date'])
        # sns.set_context('notebook', font_scale=2)
        # plt.figure()
        # plt.plot(final_dat['date'], final_dat['mean'], color='green')
        # plt.scatter(surge['date'], surge['surge'], color='blue')
        # #prediction intervals
        # plt.plot(final_dat['date'], final_dat['obs_ci_lower'], color='red', linestyle="--", lw=0.8)
        # plt.plot(final_dat['date'], final_dat['obs_ci_upper'], color='red', linestyle="--", lw=0.8)
        # #confidence intervals
        # plt.plot(final_dat['date'], final_dat['mean_ci_upper'], color='black', linestyle="--", lw=0.8)
        # plt.plot(final_dat['date'], final_dat['mean_ci_lower'], color='black', linestyle="--", lw=0.8)

        # save df as csv - in case of interruption
        os.chdir(dir_out)
        final_dat.to_csv(tg_name)
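The column drop above relies on the layout of statsmodels' summary_frame: 'mean_ci_*' bound the fitted mean (confidence interval), while 'obs_ci_*' bound new observations (the prediction interval kept here). A minimal sketch on synthetic toy data, only to show the returned columns:

# Minimal sketch (toy data, not the surge data) of get_prediction().summary_frame() output.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
x_toy = sm.add_constant(rng.normal(size=(50, 2)))
y_toy = x_toy @ np.array([1.0, 2.0, -1.0]) + rng.normal(size=50)
frame = sm.OLS(y_toy, x_toy).fit().get_prediction(x_toy).summary_frame(alpha=0.05)
print(frame.columns.tolist())
# ['mean', 'mean_se', 'mean_ci_lower', 'mean_ci_upper', 'obs_ci_lower', 'obs_ci_upper']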
def get_data_raw(scale, add_dummies, var_dummies, TrainTestSplit=True, sz_test=0.3,
                 impute_method='drop', convert_month2int=False, date_method='drop'):
    print('We are addressing your request.')

    listdir('./../data_meteo/')
    list_files = np.empty(36, dtype='|U12')
    i = 0
    for fichier in listdir('./../data_meteo/'):
        if 'train' in fichier:
            list_files[i] = fichier
            i = i + 1

    df = pd.DataFrame()
    for file in list_files:
        df = pd.concat([df, open_and_transform(file)])
    df = df.sort_values(by=['ech', 'date'], ascending=True)
    print('Data has been imported. Size:', df.shape)

    if convert_month2int:
        df = convert_month_to_int(df)
        print('Months converted to int.')

    if add_dummies:
        df_dummies = pd.get_dummies(df[var_dummies])
        df = pd.concat([df, df_dummies], axis=1)
        df = df.drop(var_dummies, axis=1)
        print('Dummies added.')

    if date_method == 'drop':
        df = df.drop(['date'], axis=1)
        print('Date dropped.')

    if impute_method == 'drop':
        N_before = df.shape[0]
        df = df.dropna(axis=0)
        N_after = df.shape[0]
        print("%d data points deleted. %0.2f %s" % (N_before - N_after,
                                                    (N_before - N_after) / N_before * 100, '%'))

    if TrainTestSplit:
        Y = df['tH2_obs']
        X = df
        X = X.drop(['tH2_obs'], axis=1)  ## !!! Date?
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=sz_test,
                                                            random_state=11)
        print('Train size: %d, Test size: %d' % (X_train.shape[0], X_test.shape[0]))

    if scale:
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        # apply the same transformation to the test set
        X_test = scaler.transform(X_test)
        print('Data scaled')

    return X_train, X_test, Y_train, Y_test, scaler
# #isolation forest
# clf = IsolationForest(random_state=0, contamination="auto").fit(X_Y)
# inliers = clf.predict(X_Y)

# #covariance
# cov = EllipticEnvelope(random_state=0, contamination=0.2).fit(X_Y)
# inliers = cov.predict(X_Y)

print('Number of inliers are ', str(len(inliers[inliers == 1])))

# keeping only the inliers (all variables)
X = country_data.loc[:, columns_to_consider].values.reshape(-1, len(columns_to_consider))
Y = country_data.loc[:, ['Price', 'Demand']].values.reshape(-1, 2)
X = X[inliers == 1, :]
Y = Y[inliers == 1, :]

# scaling X and Y
scaler_Y = StandardScaler()
scaler_Y.fit(Y)
Y_scaled = scaler_Y.transform(Y)

scaler_X = StandardScaler()
scaler_X.fit(X)
X_scaled = scaler_X.transform(X)

FR_RD_scaler = 2
X_scaled[:, 0] = FR_RD_scaler * X_scaled[:, 0]

# finding KNN design
neigh = KNeighborsRegressor()
param_grid = [{'n_neighbors': [5, 15, 52, 168], 'weights': ['uniform']}]
clf = GridSearchCV(neigh, param_grid, scoring='r2', cv=10, refit=True)  # scoring='neg_mean_squared_error'
clf.fit(X_scaled, Y_scaled)

# loading all data points (not just the inliers) -------------------------------
X = country_data.loc[:, columns_to_consider].values.reshape(
    -1, len(columns_to_consider))  # comment to ignore outliers in the predictions
print("running", data_dir) if data_dir == "feat": print("Using only features..") feats = np.load(dataset_dir + "/dolphins-feats.npy") ## Logistic gets thrown off by big counts, so log transform num comments and score feats[:, 0] = np.log(feats[:, 0] + 1.0) feats[:, 1] = np.log(feats[:, 1] - min(np.min(feats[:, 1]), -1)) feat_id_map = json.load(open(dataset_dir + "/dolphins-id_map.json")) feat_id_map = {int(id): val for id, val in feat_id_map.iteritems()} train_feats = feats[[feat_id_map[id] for id in train_ids]] test_feats = feats[[feat_id_map[id] for id in test_ids]] print("Running regression..") from sklearn.preprocessing import StandardScaler scaler = StandardScaler() scaler.fit(train_feats) train_feats = scaler.transform(train_feats) test_feats = scaler.transform(test_feats) run_regression(train_feats, train_labels, test_feats, test_labels) else: embeds = np.load(data_dir + "/val.npy") id_map = {} with open(data_dir + "/val.txt") as fp: for i, line in enumerate(fp): id_map[int(line.strip())] = i train_embeds = embeds[[id_map[id] for id in train_ids]] test_embeds = embeds[[id_map[id] for id in test_ids]] print("Running regression..") run_regression(train_embeds, train_labels, test_embeds, test_labels)
plt.scatter(x_test[:, 0], x_test[:, 1], c='', linewidth=1, marker='o', s=80, label='testSet')
plt.xlabel('x1')
plt.ylabel('x2')
plt.legend(loc=2)
plt.title(title)
plt.show()

if __name__ == '__main__':
    iris = datasets.load_iris()
    x = iris.data[:, [2, 3]]
    y = iris.target

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

    sc = StandardScaler()
    sc.fit(x_train)  # calculate the mean and standard deviation of x_train
    x_train_std = sc.transform(x_train)  # standardize x_train
    x_test_std = sc.transform(x_test)    # standardize x_test

    ml = SVC(kernel='linear', C=10.0, gamma=0.10, random_state=0)
    ml.fit(x_train_std, y_train)
    y_pred = ml.predict(x_test_std)
    print('total test set : %d, total error : %d' % (len(y_test), (y_test != y_pred).sum()))
    print('accuracy : %.2f' % accuracy_score(y_test, y_pred))

    x_total = np.vstack((x_train_std, x_test_std))  # stack vertically
    y_total = np.hstack((y_train, y_test))          # stack horizontally
    plot_decision_region(x=x_total, y=y_total, classifier=ml, title='scikit-learn SVM RBF')
                         na_values='?', engine="python").dropna()
X, Xt = train_data[columns[::-1]], test_data[columns[::-1]]
y = [-1 if s == '<=50K' else 1 for s in train_data["income"]]
yt = [-1 if s == '<=50K.' else 1 for s in test_data["income"]]

demographic_groups(X)
vq_demographic_groups(X)

# numerical columns: standardize
numcols = [
    'age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'
]
ss = StandardScaler()
ss.fit(X[numcols])
Xnum, Xtnum = ss.transform(X[numcols]), ss.transform(Xt[numcols])

# categorical columns: apply 1-hot-encoding
catcols = [
    'workClass', 'marital-status', 'occupation', 'relationship', 'race',
    'sex', 'native-country'
]
enc = OneHotEncoder()
enc.fit(X[catcols])
Xcat, Xtcat = enc.transform(X[catcols]).toarray(), enc.transform(Xt[catcols]).toarray()

X, Xt = np.concatenate((Xnum, Xcat), axis=1), np.concatenate((Xtnum, Xtcat), axis=1)

pca = PCA(n_components=10)
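For reference, the same standardize-numeric / one-hot-categorical split can be expressed with a single ColumnTransformer, which keeps both encoders fit on the training frame only. This is an alternative sketch, not the snippet's original code; it reuses the numcols and catcols lists above and assumes X and Xt still refer to the raw DataFrames:

# Alternative sketch using ColumnTransformer (assumes raw X/Xt DataFrames and numcols/catcols above).
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

prep = ColumnTransformer([
    ('num', StandardScaler(), numcols),                        # standardize numeric columns
    ('cat', OneHotEncoder(handle_unknown='ignore'), catcols),  # one-hot encode categoricals
])
X_mat = prep.fit_transform(X)   # encoders fit on the training frame only
Xt_mat = prep.transform(Xt)     # test frame transformed with the same statistics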
# Converting string columns to numeric codes
le = LabelEncoder()
for col in convt_columns:
    le.fit(X[col].astype(str))
    X[col] = le.transform(X[col].astype(str))

# Filling of empty values
X = X.fillna(round(X.mean(), 2))
Y = Y.fillna(round(Y.mean(), 2))

#####################################################################################################
# Splitting data into train and test
Train_X, Test_X, Train_Y, Test_Y = train_test_split(X, Y, test_size=0.20, random_state=30)

# Transforming data for the SVM model
scaler = StandardScaler()
scaler.fit(Train_X)
Train_X = scaler.transform(Train_X)
# do not re-fit the scaler on the test split; transform only
Test_X = scaler.transform(Test_X)

# Training the model
from sklearn.svm import SVR

model = SVR(kernel='rbf')
model.fit(Train_X, Train_Y)
pred = model.predict(Test_X)

print("Model: SVM \n")
print("Score :", round(model.score(Test_X, Test_Y), 4))
print("Mean absolute error :", round(metrics.mean_absolute_error(Test_Y, pred), 2))
print("_____________________________\n")

#####################################################################################################
# Uploading test dataset
df = pd.read_excel('Test_dataset.xlsx')
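One way to make the fit-on-train-only rule automatic is to wrap the scaler and estimator in a Pipeline, as the earlier submissions in this document do. A minimal sketch, reusing the Train_X/Train_Y/Test_X/Test_Y names from the split above:

# Minimal sketch: scaling inside a Pipeline, so StandardScaler is only ever fit on the data
# passed to fit() and the test split is transformed, never re-fit.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

svm_pipe = Pipeline([('scale', StandardScaler()), ('svr', SVR(kernel='rbf'))])
svm_pipe.fit(Train_X, Train_Y)                               # scaler statistics come from Train_X only
print("Score :", round(svm_pipe.score(Test_X, Test_Y), 4))   # Test_X is transformed internally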
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname, col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)


dataset = MultiColumnLabelEncoder(
    columns=['day', 'week', 'weather', 'holiday', 'Special', 'meal type']).fit_transform(dataset)
test_set_att = MultiColumnLabelEncoder(
    columns=['day', 'week', 'weather', 'holiday', 'Special', 'meal type']).fit_transform(test_set_att)

scaler = StandardScaler()
print(scaler.fit(dataset))
dataset = scaler.transform(dataset)
print(dataset)
test_set_att = scaler.transform(test_set_att)

lin_reg = LinearRegression()
lin_reg.fit(dataset, dataset_label)
dataset_prediction = lin_reg.predict(test_set_att)

lin_mse = mean_squared_error(test_set_label, dataset_prediction)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

# R^2 of the fitted model on the test set; score() expects features and targets,
# not targets and predictions
accuracy = lin_reg.score(test_set_att, test_set_label)
print(accuracy * 100, '%')