def test_min_max_scaler_iris():
    """Check MinMaxScaler on the iris data for several feature ranges.

    For every range the transformed columns must span the requested
    interval and ``inverse_transform`` must recover the original data.
    An inverted range must make ``fit`` raise ``ValueError``.
    """
    X = iris.data

    # (feature_range, expected column minimum, expected column maximum).
    # ``None`` means "construct the scaler with default parameters".
    cases = [
        (None, 0, 1),
        ((1, 2), 1, 2),
        ((-.5, .6), -.5, .6),
    ]
    for feature_range, low, high in cases:
        if feature_range is None:
            scaler = MinMaxScaler()
        else:
            scaler = MinMaxScaler(feature_range=feature_range)
        X_trans = scaler.fit_transform(X)
        assert_array_almost_equal(X_trans.min(axis=0), low)
        assert_array_almost_equal(X_trans.max(axis=0), high)
        X_trans_inv = scaler.inverse_transform(X_trans)
        assert_array_almost_equal(X, X_trans_inv)

    # raises on invalid range
    scaler = MinMaxScaler(feature_range=(2, 1))
    assert_raises(ValueError, scaler.fit, X)
def prescale_data(x_train, x_test, method):
    """
    Pre-scales training data and (optionally test data) using the specified method.

    :param x_train: The training data to be pre-scaled.
    :param x_test: The (optional) test data to be pre-scaled. Beware that the
        prescaler is only fit to the training data and not to the test data.
    :param method: The method to be used for prescaling. Allowed values are
        "minmaxscaler" and "standartscaler" (the correctly spelled
        "standardscaler" is accepted as well). ``None`` disables scaling.
    :return: A tuple of the pre-scaled training and test data or only the
        training data if the test data was set to None
    :raises ValueError: If ``method`` is neither ``None`` nor an allowed value.
    """
    if method is not None:
        # Imports stay local so that scikit-learn is only required when
        # scaling is actually requested.
        if method == "minmaxscaler":
            from sklearn.preprocessing import MinMaxScaler
            scaler = MinMaxScaler()
        elif method in ("standartscaler", "standardscaler"):
            from sklearn.preprocessing import StandardScaler
            scaler = StandardScaler()
        else:
            raise ValueError("Invalid pre-scaling method: {}".format(method))

        # Fit on the training data only and transform it in a single pass.
        x_train = scaler.fit_transform(x_train)
        if x_test is not None:
            x_test = scaler.transform(x_test)

    if x_test is not None:
        return x_train, x_test
    return x_train
class NMFReducer():
    """Min-max scale a dataset and reduce it with NMF.

    ``dataset`` must expose ``data`` and ``target`` attributes. The scaled
    data is kept in ``self.data``; ``reduce()`` stores its result in
    ``self.reduced``.
    """

    def __init__(self, dataset, dataset_name, num_components=10):
        """Scale ``dataset.data`` and prepare an NMF with ``num_components``."""
        self.dataset = dataset
        self.dataset_name = dataset_name
        self.labels = dataset.target
        self.scaler = MinMaxScaler()
        self.data = self.scaler.fit_transform(dataset.data)
        self.n_samples, self.n_features = self.data.shape
        self.reducer = NMF(n_components=num_components, max_iter=5000)

    def reduce(self):
        """Fit NMF on the scaled data and return the re-scaled reduction.

        Note that this refits ``self.scaler`` on the reduced data, so the
        scaler no longer describes the original feature space afterwards.
        """
        self.reducer.fit(self.data)
        self.reduced = self.scaler.fit_transform(
            self.reducer.transform(self.data))
        return self.reduced

    def benchmark(self, estimator, name, data):
        """Fit ``estimator`` on ``data`` and print timing and cluster metrics."""
        t0 = time()
        sample_size = 300
        labels = self.labels
        estimator.fit(data)
        print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f' %
              (name, (time() - t0), estimator.inertia_,
               metrics.homogeneity_score(labels, estimator.labels_),
               metrics.completeness_score(labels, estimator.labels_),
               metrics.v_measure_score(labels, estimator.labels_),
               metrics.adjusted_rand_score(labels, estimator.labels_),
               metrics.adjusted_mutual_info_score(labels, estimator.labels_),
               metrics.silhouette_score(data, estimator.labels_,
                                        metric='euclidean',
                                        sample_size=sample_size)))

    def _write_report(self, path):
        """Write the reduction report to ``path``.

        The text is identical to what the former ``print`` calls produced,
        but it is written straight to the file: ``sys.stdout`` is left
        untouched and the file is closed when done.
        """
        lines = [
            "NMF Reduction of %s:\n" % self.dataset_name,
            40 * '-',
            str(self.reduced),
            "\nLength of 1 input vector before reduction: %d \n" %
            len(self.data.tolist()[0]),
            "Length of 1 input vector after reduction: %d \n" %
            len(self.reduced.tolist()[0]),
            40 * '-',
            str(self.reducer.reconstruction_err_),
        ]
        with open(path, 'w') as report:
            # Each former print() call terminated its text with a newline.
            report.write("\n".join(lines) + "\n")

    def display_reduced_digits(self):
        """Write the report to ``out/NMFReduceDigitsOutput.txt``."""
        self._write_report('out/NMFReduceDigitsOutput.txt')

    def display_reduced_iris(self):
        """Write the report to ``out/NMFReduceIrisOutput.txt``."""
        self._write_report('out/NMFReduceIrisOutput.txt')

    def reduce_crossvalidation_set(self, X_train, X_test):
        """Fit the reducer on ``X_train`` and return scaled train/test data.

        NOTE(review): the fitted reducer is never applied here; both return
        values come from ``self.scaler.transform`` only. This looks
        unintended (``self.reducer.transform`` was probably meant), but the
        behaviour is kept until the intent is confirmed.
        """
        self.reducer.fit(X_train)
        reduced_X_train = self.scaler.transform(X_train)
        reduced_X_test = self.scaler.transform(X_test)
        return reduced_X_train, reduced_X_test
def _scale(self, y):
    """Min-max scale ``y`` with a freshly fitted ``MinMaxScaler``.

    ``y`` is first handed to the scaler unchanged and the scaler's result is
    returned as-is. If that attempt fails, ``y`` is converted to a NumPy
    array, scaled, and returned as a (nested) list.
    """
    z = MinMaxScaler()
    try:
        return z.fit_transform(y)
    except Exception:
        # Best-effort fallback for inputs the scaler rejects directly.
        # ``Exception`` (not a bare ``except``) keeps KeyboardInterrupt and
        # SystemExit propagating.
        y = np.array(y)
        y = z.fit_transform(y)
        return y.tolist()
def predict_simple_linear(df_train_clean, df_test_clean):
    """Fit a logistic regression on order-book features and evaluate it.

    Both frames must contain the eight feature columns listed below and a
    ``labels`` column. Features are min-max scaled with a scaler fitted on
    the training data only.

    :return: A flat list alternating names and values, in this order:
        confusion_matrix, classification_report, logloss, miss_err, Y_hat.
    """
    feature_cols = ['P_1_bid', 'V_1_bid', 'P_1_ask', 'V_1_ask',
                    'P_2_bid', 'V_2_bid', 'P_2_ask', 'V_2_ask']

    X_train = np.array(df_train_clean[feature_cols])
    Y_train = np.array(df_train_clean[['labels']])[:, 0]
    X_test = np.array(df_test_clean[feature_cols])
    Y_test = np.array(df_test_clean[['labels']])[:, 0]

    # Define the labels
    labels = np.unique(Y_train)

    # Scale Data: fit on the training set only, then apply the same
    # mapping to the test set so both live in the same feature space.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Set up the model and fit it
    logreg = linear_model.LogisticRegression(C=1e5)
    logreg.fit(X_train, Y_train)

    # Predict
    Y_hat = logreg.predict(X_test)
    Y_probs = logreg.predict_proba(X_test)

    # Misclassification error rate
    miss_err = 1 - accuracy_score(Y_test, Y_hat)

    # Log loss. ``10^(-15)`` is bitwise XOR in Python (it evaluates to -5);
    # the intended clipping constant is 1e-15.
    eps = 1e-15
    logloss = log_loss(Y_test, Y_probs, eps=eps)

    # Confusion matrix and classification report
    confusion_matrix1 = confusion_matrix(y_true=Y_test, y_pred=Y_hat,
                                         labels=labels)
    classification_report1 = classification_report(y_true=Y_test,
                                                   y_pred=Y_hat)

    # Output results in a list format
    result = [
        "confusion_matrix", confusion_matrix1,
        "classification_report", classification_report1,
        "logloss", logloss,
        "miss_err", miss_err,
        "Y_hat", Y_hat,
    ]
    return result
def feature_scale(self, X_train, X_val, X_test):
    """Min-max normalize all columns of the three data splits.

    The scaler is fitted on ``X_train`` only; the validation and test sets
    are transformed with that same fit so all three splits share one scale.

    :return: Tuple ``(X_train_std, X_val_std, X_test_std)``.
    """
    from sklearn.preprocessing import MinMaxScaler
    mms = MinMaxScaler()
    X_train_std = mms.fit_transform(X_train)
    X_val_std = mms.transform(X_val)
    X_test_std = mms.transform(X_test)
    return X_train_std, X_val_std, X_test_std
def rescaleSalAndStockValues():
    """Min-max rescale a fixed salary and stock value.

    Each value is scaled together with the maximum and minimum returned by
    ``findMaxMinValues()``, in the row order (max, min, value).

    :return: Tuple ``(scaledSal, scaledStock)`` of the two scaler outputs.
    """
    from sklearn.preprocessing import MinMaxScaler
    import numpy as np

    highStock, highSal, lowStock, lowSal = findMaxMinValues()

    # the given sal and stock values
    givenSal = 200000.0
    givenStock = 1000000.0

    def as_column(*values):
        # One single-feature row per value.
        return [[value] for value in values]

    minmax = MinMaxScaler()
    salScaled = minmax.fit_transform(as_column(highSal, lowSal, givenSal))
    stockScaled = minmax.fit_transform(
        as_column(highStock, lowStock, givenStock))
    return salScaled, stockScaled
def use(method):
    """Grid-search, fit and evaluate the classifier selected by ``method``.

    Reads the module-level ``features_train``, ``features_test``,
    ``labels_train``, ``labels_test`` and ``features_list``. Prints the
    search result, the selected features (for the SelectKBest pipelines),
    the accuracy, and calls ``calculate_precision_recall``.

    :param method: 'naive bayes', 'svm' or 'decision tree'.
    :raises ValueError: If ``method`` is not one of the names above
        (previously this fell through to an UnboundLocalError on ``pred``).
    """
    if method == 'naive bayes':
        estimators = [("skb", SelectKBest(score_func=f_classif)),
                      ('pca', PCA()),
                      ('bayes', GaussianNB())]
        parameters = {"skb__k": [8, 9, 10, 11, 12],
                      "pca__n_components": [2, 6, 4, 8]}
    elif method == 'svm':
        estimators = [('reduce_dim', PCA()), ('svc', SVC())]
        parameters = {'svc__C': [1, 10]}
    elif method == 'decision tree':
        estimators = [("skb", SelectKBest(score_func=f_classif)),
                      ('pca', PCA()),
                      ('tree', tree.DecisionTreeClassifier())]
        parameters = {"tree__min_samples_split": [2, 10],
                      "skb__k": [8, 9, 10, 11, 12],
                      "pca__n_components": [2, 4, 6, 8]}
    else:
        raise ValueError("Unknown method: %r" % (method,))

    clf = grid_search.GridSearchCV(Pipeline(estimators), parameters)

    # Scale with a fit on the training data only.
    scaler = MinMaxScaler()
    features_train_scaled = scaler.fit_transform(features_train)
    features_test_scaled = scaler.transform(features_test)

    clf.fit(features_train_scaled, labels_train)
    pred = clf.predict(features_test_scaled)

    # Single-argument print(...) is valid on both Python 2 and Python 3.
    if method == 'svm':
        print(clf.best_estimator_)
    else:
        print(clf.best_params_)
        features_k = clf.best_params_['skb__k']
        SKB_k = SelectKBest(f_classif, k=features_k)
        # Both SelectKBest branches now score the scaled features; the
        # decision-tree branch used the unscaled ones before. The ANOVA F
        # score is unchanged by a per-feature linear rescaling, so the same
        # features are selected.
        SKB_k.fit_transform(features_train_scaled, labels_train)
        if method == 'naive bayes':
            print("features score: ")
            print(SKB_k.scores_)
        # features_list[0] is skipped; presumably it is the label column.
        features_selected = [features_list[1:][i]
                             for i in SKB_k.get_support(indices=True)]
        print(features_selected)

    accuracy = accuracy_score(labels_test, pred)
    print("accuracy score:")
    print(accuracy)
    calculate_precision_recall(pred, labels_test)
def featureScale(df):
    """Min-max scale the four numeric loan columns of ``df`` in place.

    Each column is fitted and scaled on its own. Prints "Scaling Done" and
    returns the same (mutated) DataFrame.
    """
    minmax = MinMaxScaler()
    numeric_columns = ('ApplicantIncome', 'CoapplicantIncome',
                       'LoanAmount', 'Loan_Amount_Term')
    for column in numeric_columns:
        df[[column]] = minmax.fit_transform(df[[column]])
    print("Scaling Done")
    return df
def test_min_max_scaler():
    """Check the column minima/maxima produced by MinMaxScaler on iris.

    Covers the default parameters (expected range 0..1) and an explicit
    ``feature_range=(1, 2)``.
    """
    X = iris.data
    # (constructor keyword arguments, expected minimum, expected maximum)
    expectations = (
        ({}, 0, 1),                         # default params
        ({'feature_range': (1, 2)}, 1, 2),  # not default params
    )
    for params, lower, upper in expectations:
        scaled = MinMaxScaler(**params).fit_transform(X)
        assert_equal(scaled.min(axis=0), lower)
        assert_equal(scaled.max(axis=0), upper)
def getDNN(df, random_split=None):
    """Train a dense binary classifier and return it with its scaler.

    The frame is split into training and validation parts with ``split``.
    The network is trained on the scaled training part, AUC is printed for
    both parts, and the model is then trained further on the full data
    (``df`` without its "validation" column).

    :param df: Input DataFrame; must contain a "validation" column.
    :param random_split: Passed to ``split`` as ``rand_ratio``.
    :return: Tuple ``(model, scaler)``. The scaler returned is the one refit
        on the full data set in the final step.
    """
    df_tr, df_val = split(df, rand_ratio=random_split)
    X, Y = to_array(df.drop("validation", axis=1))
    Xtr, Ytr = to_array(df_tr)
    Xval, Yval = to_array(df_val)

    scaler = MinMaxScaler((0, 1))
    Xtr = scaler.fit_transform(Xtr)
    Xval = scaler.transform(Xval)

    # Start create model
    print("Create a DNN Classifier")
    model = Sequential()
    model.add(Dense(100, input_dim=Xtr.shape[1], activation='tanh'))
    model.add(PReLU())
    model.add(Dropout(0.2))
    model.add(Dense(80, activation='linear'))
    model.add(ELU(alpha=0.3))
    model.add(Dropout(0.2))
    model.add(Dense(60, activation='tanh'))
    model.add(PReLU())
    model.add(Dropout(0.2))
    model.add(Dense(40, activation='linear'))
    model.add(ELU(alpha=0.1))
    model.add(Dropout(0.2))
    model.add(Dense(15, activation='linear'))
    model.add(PReLU())
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))

    # trainer = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    # The decay parameter of Adadelta is ``rho``; it was misspelled ``tho``,
    # so the intended value 0.98 never reached the optimizer.
    trainer = Adadelta(lr=0.1, rho=0.98, epsilon=1e-7)
    model.compile(loss='binary_crossentropy', optimizer=trainer)

    print(Ytr, Yval)
    model.fit(Xtr, Ytr, nb_epoch=30, batch_size=32, verbose=1,
              validation_data=(Xval, Yval))

    pred_tr = model.predict_proba(Xtr)
    pred = model.predict_proba(Xval)
    print("auc on train: {}".format(roc_auc_score(Ytr, pred_tr)))
    print("auc on validation: {}".format(roc_auc_score(Yval, pred)))

    # Final pass: refit the scaler on all data and continue training the
    # same model on it.
    X = scaler.fit_transform(X)
    model.fit(X, Y, nb_epoch=30, batch_size=32)
    return model, scaler
def readTestData():
    """Load ``data/test.csv`` and return its scaled features and row ids.

    The header row is skipped. Column 0 is returned as a list of ids;
    columns 1..30 are scaled with a scaler fitted on this file alone.

    :return: ``[scaled_features, ids]``.
    """
    raw = np.loadtxt('data/test.csv', delimiter=',', skiprows=1)
    ids = list(raw[:, 0])
    scaled_features = MMS().fit_transform(raw[:, 1:31])
    return [scaled_features, ids]
] df = dataframe.dropna(axis=1, thresh=243500).iloc[:, 9:] df = df.dropna() df = df.reset_index(drop=True) # Randomly select a chunk of data from random import randint start = randint(0, len(df) - 6000) print(start) dataset = df.iloc[start:start + 6000, ] # # Normalize and add 5% noise # normalize data scaler = MinMaxScaler() train_scaled = scaler.fit_transform(dataset.iloc[:3600, ].values) test_scaled = scaler.transform(dataset.iloc[3600:, ].values) # add 5% noises as anomalies into train and test data in order to evaluate the method import math qty = math.floor(len(train_scaled) * 0.05) train_anomalies = np.random.choice(train_scaled.shape[0], size=qty, replace=False) print(train_anomalies) temp_data = train_scaled[train_anomalies, :] + np.random.normal( 0, 1, size=train_scaled.shape[1]) i = 0 for row in train_anomalies: train_scaled[row, :] = temp_data[i, :]
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Download the price history for symbol "SBIN" from 2015-01-01 until today.
start = date(2015,1,1)
end= date.today()
data = get_history(symbol="SBIN", start=start, end=end)

# One global min/max across all four price columns, so Open/High/Low/Close
# share a single scale.
max_=data[['Open','High','Low','Close']].max().max()
min_=data[['Open','High','Low','Close']].min().min()
scl=MinMaxScaler()
# Prices are scaled by hand with the shared min/max; Volume gets its own
# MinMaxScaler fit.
X1=(data[['Open','High','Low','Close']]-min_)/(max_-min_)
X2=scl.fit_transform(data[['Volume']].values.reshape(-1,1))
X1=np.array(X1)

# Write the scaled columns back into the frame.
data=data.assign(Open=X1[:,0])
data=data.assign(High=X1[:,1])
data=data.assign(Low=X1[:,2])
data=data.assign(Close=X1[:,3])
data=data.assign(Volume=X2[:,0])
data.tail()

# Features are the five scaled columns; the target is the next row's
# (unscaled) "Last" value, so the final row's target is NaN.
X=data[['Open','High','Low','Close','Volume']]
y=data.Last.shift(-1)
timestep=1
X_list=[]
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

if __name__ == '__main__':
    # Load the pre-processed frame; 'waiting_time' is the regression target.
    df = pd.read_pickle(
        '/Users/USER/Documents/Python/Data Analysis_Practice_GJ/ED_waiting_time/df_processed.pkl'
    )
    features = df.drop('waiting_time', axis=1)
    # Model log(1 + waiting_time) rather than the raw target.
    y = np.log1p(df['waiting_time'])

    # Only the first two feature columns are min-max scaled.
    scaler = MinMaxScaler()
    features.iloc[:, :2] = scaler.fit_transform(features.iloc[:, :2])
    x = features
    # NOTE(review): the same scaler object is refit on the target here, so
    # after this line ``scaler`` describes y, not the two feature columns.
    # Scaling also happens before the train/test split, so test rows
    # influence the fitted ranges.
    y_scaled = scaler.fit_transform(y.values.reshape(-1, 1))

    X_train, X_test, y_train, y_test = train_test_split(x,
                                                        y_scaled,
                                                        test_size=0.3,
                                                        random_state=42)
    xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror',
                                     n_estimators=100,
                                     learning_rate=0.05,
                                     reg_lambda=1.3)
    xgb_regressor.fit(X_train.values, y_train)
### 3. 2nd EDA # Explore the continuous variables/features using Seaborn's scatterplot matrix import seaborn as sns cont_features = list(cc_apps.loc[:, cc_apps.dtypes == float].columns) sns.pairplot(data=cc_apps, hue='ApprovalStatus' ) # By default, pairplot() will skip the object data types # Scale the continuous features from sklearn.preprocessing import StandardScaler, MinMaxScaler scaler = MinMaxScaler() cc_apps_scaled = cc_apps[['ApprovalStatus']].copy() for col in cont_features: cc_apps_scaled[col] = scaler.fit_transform(cc_apps[col].values.reshape( -1, 1)) sns.pairplot(data=cc_apps_scaled, hue='ApprovalStatus' ) # Build a scatterplot matrix with the logged features # The scaler does not change the shape of the distributions, because it changes the scale only. Let's explore a transformation that changes the distributions' shapes: the log # log the continuous features cc_apps_logged = cc_apps[['ApprovalStatus']].copy() for col in cont_features: cc_apps_logged['ln_{}'.format(col)] = np.log(cc_apps[col] + 1) sns.pairplot(data=cc_apps_logged, hue='ApprovalStatus') # This way is more clear that people with age, years of employment, credit score, income and (even) debt tend to be higher for those whose credits are approved. ### 4. Baseline model: a logistic classifier
#MinMaxScaler (区间缩放,基于最大最小值,将数据转换到-1,1区间上的) #提升模型收敛速度,提升模型精度 #常见用于神经网络 #Normalizer (基于矩阵的行,将样本向量转换为单位向量) #其目的在于样本向量在点乘运算或其他核函数计算相似性时,拥有统一的标准 #常见用于文本分类和聚类、logistic回归中也会使用,有效防止过拟合 ss = MinMaxScaler() #用标准化方法对数据进行处理并转换 ## scikit learn中模型API说明: ### fit: 模型训练;基于给定的训练集(X,Y)训练出一个模型;该API是没有返回值;eg: ss.fit(X_train, Y_train)执行后ss这个模型对象就训练好了 ### transform:数据转换;使用训练好的模型对给定的数据集(X)进行转换操作;一般如果训练集进行转换操作,那么测试集也需要转换操作;这个API只在特征工程过程中出现 ### predict: 数据转换/数据预测;功能和transform类似,都是对给定的数据集X进行转换操作,只是transform中返回的是一个新的X, 而predict返回的是预测值Y;这个API只在算法模型中出现 ### fit_transform: fit+transform两个API的合并,表示先根据给定的数据训练模型(fit),然后使用训练好的模型对给定的数据X进行转换操作 x_train = ss.fit_transform(x_train) x_test = ss.transform(x_test) print("原始数据各个特征属性的调整最小值:", ss.min_) print("原始数据各个特征属性的缩放数据值:", ss.scale_) #特征选择:从已有的特征中选择出影响目标值最大的特征属性 # 类比:l1正则 线性回归 稀疏解 #常用方法: # { 分类:F统计量、卡方系数,互信息mutual_info_classif #{ 连续 回归:皮尔逊相关系数 F统计量 互信息mutual_info_classif #SelectKBest(卡方系数) #在当前的案例中,使用SelectKBest这个方法从4个原始的特征属性,选择出来3个 ch2 = SelectKBest(chi2, k=3) #K默认为10
import numpy as np
import seaborn as sns

# Load the data. The bare expressions below only display a value when run
# interactively (notebook style); in a script they have no effect.
housing = pd.read_csv("house_pricing.csv")
housing
housing.describe().transpose()
summary = housing.describe()
summary = summary.transpose()
print(summary)

# Features are every column except the target 'medianHouseValue'.
X = housing.drop(['medianHouseValue'], axis=1)
y = housing['medianHouseValue']

# Min-max scale all features.
# NOTE(review): this fit uses the full data set, i.e. it happens before the
# train/test split below.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_scale = scaler.fit_transform(X)
x_scale

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_scale,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Second scaling step on top of the min-max scaled data: standardise with a
# fit on the training split only, then apply it to the test split.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_test
import keras
# Plotting the return rate per sample rets = close_px / close_px.shift(1) - 1 rets.plot(label='return') # plt.show() data_train = goog[goog['Date'] < '2019-01-01'].copy() # print(data_train) data_test = goog[goog['Date'] >= '2019-01-01'].copy() # print(data_test) training_data = data_train.drop(['Date'], axis=1) print(training_data) scaler = MinMaxScaler() training_data = scaler.fit_transform(training_data) print(training_data) x_train = [] y_train = [] for i in range(30, training_data.shape[0]): x_train.append(training_data[i - 30:i]) y_train.append(training_data[i, 0]) x_train, y_train = np.array(x_train), np.array(y_train) # Sequential based model regression = Sequential() # input shape for the first layer of LSTM model
Y_training = training_data_df[['total_earnings']].values # Load testing data set from CSV file test_data_df = pd.read_csv("sales_data_test.csv", dtype=float) # Pull out columns for X (data to train with) and Y (value to predict) X_testing = test_data_df.drop('total_earnings', axis=1).values Y_testing = test_data_df[['total_earnings']].values # All data needs to be scaled to a small range like 0 to 1 for the neural # network to work well. Create scalers for the inputs and outputs. X_scaler = MinMaxScaler(feature_range=(0, 1)) Y_scaler = MinMaxScaler(feature_range=(0, 1)) # Scale both the training inputs and outputs X_scaled_training = X_scaler.fit_transform(X_training) Y_scaled_training = Y_scaler.fit_transform(Y_training) # It's very important that the training and test data are scaled with the same scaler. X_scaled_testing = X_scaler.transform(X_testing) Y_scaled_testing = Y_scaler.transform(Y_testing) # Define model parameters learning_rate = 0.001 training_epochs = 100 display_step = 5 # Define how many inputs and outputs are in our neural network number_of_inputs = 9 number_of_outputs = 1
# Load the test-set "lead2" features and stack them under the training ones.
feattstld2 = pd.read_csv(path +
                         '../features/lead2_tst_ip_device_os_app%s.gz' % (add_),
                         compression='gzip')
featld2 = pd.concat([feattrnld2, feattstld2])
del feattrnld2, feattstld2
# Missing values are marked with -1 before the transformation.
featld2.fillna(-1, inplace=True)
featld2 = transform_lead(featld2)
featld2.head()

print('[{}] Load Entropy Features'.format(time.time() - start_time))
featentip = pd.read_csv(path + '../features/entropyip.gz', compression='gzip')
# Column 0 is cast to uint32 and every other column to float32.
featentip.iloc[:, 1:] = featentip.iloc[:, 1:].astype(np.float32)
featentip.iloc[:, 0] = featentip.iloc[:, 0].astype('uint32')
# Min-max scale every column except the 'ip' key, then shrink to float16.
scaler = MinMaxScaler()
cols_ = [c for c in featentip.columns if c != 'ip']
featentip[cols_] = scaler.fit_transform(featentip[cols_])
featentip[cols_] = featentip[cols_].astype(np.float16)

# Remember where the training rows end, then stack test under train so the
# feature frames can be attached to both at once.
len_train = len(train_df)
train_df = train_df.append(test_df)
del test_df
gc.collect()

print('[{}] Concat Features'.format(time.time() - start_time))
# Column-wise concat: relies on the feature frames being row-aligned with
# train_df (assumed here; TODO confirm against how they were built).
train_df = pd.concat([train_df, featapp, featspl, featctn, featcum, featld2],
                     axis=1)
print('[{}] Add entropy'.format(time.time() - start_time))
# Entropy features are attached via a left join on 'ip'.
train_df = train_df.merge(featentip, on=['ip'], how='left')
print('[{}] hour, day, wday....'.format(time.time() - start_time))
train_df['hour'] = pd.to_datetime(train_df.click_time).dt.hour.astype('uint8')
def main(infolder, outfolder):
    """Train a Random Forest on peptide descriptors and rank a library.

    Reads ``Pos.fasta``, ``Neg.fasta`` and ``Lib.fasta`` from ``infolder``,
    trains on the positive/negative sequences, predicts the library, and
    writes two CSV files plus two pickles into ``outfolder``.

    Uses the module-level names ``seed`` and ``globstart``.

    :param infolder: Directory containing the three FASTA files.
    :param outfolder: Directory receiving the result files.
    """
    import os

    descriptor = 'PPCALI'
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print("RF Peptide Learning Info\n========================\n")
    print(datetime.now().strftime("%Y-%m-%d_%H-%M") + "\n")
    print(
        "INPUT:\nInputfolder is\t%s\nOutputfolder is\t%s\nDescriptor is\t%s , auto-correlated (window 7)\n"
        % (infolder, outfolder, descriptor))

    # -------------------------------- TRAINING --------------------------------
    print("LOG:\nLoading data...")
    Pos = PeptideDescriptor(infolder + '/Pos.fasta', descriptor)
    Pos.filter_duplicates()
    Neg = PeptideDescriptor(infolder + '/Neg.fasta', descriptor)
    Neg.filter_duplicates()
    # target vector: 1 for every positive, 0 for every negative sequence
    targets = np.array(len(Pos.sequences) * [1] + len(Neg.sequences) * [0])

    # Descriptor calculation
    print("Calculating %s descriptor..." % descriptor)
    Data = PeptideDescriptor(Pos.sequences + Neg.sequences, descriptor)
    Data.calculate_autocorr(7)

    # Standard Scaling
    print("Standard scaling %s descriptor..." % descriptor)
    scaler = StandardScaler()
    Data = scaler.fit_transform(Data.descriptor)

    # Classifier
    clf = RandomForestClassifier(bootstrap=True,
                                 class_weight=None,
                                 criterion='gini',
                                 max_depth=None,
                                 max_features='sqrt',
                                 max_leaf_nodes=None,
                                 min_samples_leaf=1,
                                 min_samples_split=2,
                                 min_weight_fraction_leaf=0.0,
                                 n_estimators=500,
                                 n_jobs=-1,
                                 oob_score=True,
                                 random_state=seed,
                                 verbose=0,
                                 warm_start=False)

    # fitting classifier
    print("Fitting Random Forest classifier...")
    clf.fit(Data, targets)
    fit_leafs = clf.apply(Data)
    print("\tRF out-of-bag score: %.2f" % clf.oob_score_)

    # -------------------------------- LIBRARY --------------------------------
    # Loading library
    print("Loading sequence library...")
    Lib = PeptideDescriptor(infolder + '/Lib.fasta', descriptor)
    # extract class labels from sequence names (their first three characters)
    class_labels = [l[:3] for l in Lib.names]
    print("\tLibrary size: %i" % len(Lib.sequences))
    print("\tLibrary composition is:\n\t\thel: %i\n\t\tasy: %i\n\t\tnCM: %i" %
          (class_labels.count('hel'), class_labels.count('asy'),
           class_labels.count('nCM')))

    # Calculating descriptors for library members
    print("Calculating %s descriptor for library..." % descriptor)
    D = PeptideDescriptor(Lib.sequences, descriptor)
    D.calculate_autocorr(7)

    # scale the library descriptor with the scaler fitted on the training data
    print("Standard scaling %s descriptor for library..." % descriptor)
    X = scaler.transform(D.descriptor)

    # -------------------------------- PREDICTING --------------------------------
    # get single tree predictions and calculate their variance
    print(
        "Predicting single tree results, standard deviation and entropy for library..."
    )
    start = time.time()
    preds = get_tree_pred(clf, X)
    print("Predicting class probabilities for library...")
    probas = clf.predict_proba(X)
    probas = probas[:, 1].tolist()
    variance = np.var(preds, axis=1)
    print("\tPredictions took %.1f s" % (time.time() - start))

    # calculate similarity of library members to training data
    print("Calculating Random Forest similarity (cosine)...")
    start = time.time()
    # leaf indices where library samples end up in -> RF intrinsic
    # similarity measure
    lib_leafs = clf.apply(X)
    D_RF = pairwise_distances(lib_leafs, fit_leafs, metric='cosine')
    RF_dist = D_RF.mean(axis=1).tolist()
    print("\tDistance calculation took %.1f s" % (time.time() - start))

    # scaling all output features
    print("Min-Max scaling outputs...")
    sclr = MinMaxScaler()
    # column vector -> min-max scaled -> flat list
    variance = np.squeeze(sclr.fit_transform(variance.reshape(-1, 1))).tolist()
    RF_dist = np.squeeze(
        sclr.fit_transform(np.array(RF_dist).reshape(-1, 1))).tolist()

    # construct final list with all values (prediction, RF_dist, var, sum)
    print("Creating result dictionaries...")
    # weighed [1,0.5,0.5] sum of all values
    sums = [
        x + 0.5 * y + 0.5 * z for x, y, z in zip(probas, RF_dist, variance)
    ]

    # create data frame with all values
    d = pd.DataFrame(
        {
            'Class': class_labels,
            'Prediction': probas,
            'RFDistance': RF_dist,
            'TreeVariance': variance,
            'WeighedSum': sums
        },
        index=Lib.sequences)
    d.index.name = 'Sequence'
    d = d[['Class', 'Prediction', 'RFDistance', 'TreeVariance',
           'WeighedSum']].sort_values('Prediction', ascending=False)

    # get top and bottom two predictions for every class
    # (total 12 sequences = one synthesis)
    selections = []
    for label in ('hel', 'asy', 'nCM'):
        of_class = d.loc[d['Class'] == label]
        selections.append(
            of_class.sort_values('Prediction', ascending=False)[:2])
        selections.append(
            of_class.sort_values('Prediction', ascending=True)[:2])
    synth_sele = pd.concat(selections)

    # writing output
    print("Saving files to output directory...")
    # One timestamp for all files of this run, so their names always match
    # even if the clock ticks over to the next minute while writing.
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
    synth_sele.to_csv(outfolder + '/' + stamp + 'synthesis_selection.csv')
    d.to_csv(outfolder + '/library_pred.csv')

    # saving scaler and classifier to pickle file for later usage
    # NOTE(review): ``sclr`` is the MinMaxScaler last fitted on RF_dist, not
    # the StandardScaler applied to the descriptors; confirm which of the
    # two is needed later.
    # Pickles are binary, so the files are opened in 'wb' and closed via
    # ``with``. os.path.join puts them inside ``outfolder`` whether or not
    # it ends with a path separator.
    with open(os.path.join(outfolder, stamp + '-scaler.p'), 'wb') as fh:
        pickle.dump(sclr, fh)
    with open(os.path.join(outfolder, stamp + '-classifier.p'), 'wb') as fh:
        pickle.dump(clf, fh)

    print("Total runtime: %.1f s\n" % (time.time() - globstart))
    print("\nALL DONE SUCCESSFULLY")
    print(
        "Look for your results file in %s\nAnd maybe save this terminal output to a logfile ;-)"
        % outfolder)
def preprocess_data(data_params):
    """Build scaled train/test windows of daily mean prices.

    Reads the CSV at ``data_params['dataset_path']``, averages
    'Weighted_Price' per calendar day, turns the series into supervised
    windows via ``to_supervised``/``difference`` and scales them with a
    MinMaxScaler (range -1..1) fitted on the raw training prices.

    Side effect: writes 'training_set_size' and 'validation_set_size' into
    ``data_params``.

    :param data_params: dict with the keys 'look_back',
        'train_set_fraction', 'dataset_path', 'input_num_features' and
        'output_num_features'.
    :return: ``(raw_values, train_x, train_y, test_x, test_y, scaler)``.
    """
    # *************** params ******************
    look_back = data_params['look_back']
    train_set_fraction = data_params['train_set_fraction']  # 0.75
    dataset_path = data_params["dataset_path"]
    # Tuple: (number of input features, number of output features).
    num_features = data_params['input_num_features'], data_params[
        'output_num_features']

    data = pd.read_csv(dataset_path)
    # print(data.isnull().values.any())
    # print(data.head(10))
    # 'Timestamp' holds epoch seconds; reduce it to a calendar date and
    # average the weighted price per day.
    data['date'] = pd.to_datetime(data['Timestamp'], unit='s').dt.date
    group = data.groupby('date')
    daily_price = group['Weighted_Price'].mean()
    print(daily_price.head())
    # print(daily_price.tail())
    print(str(len(daily_price.index)))
    print(daily_price.index[0])

    # Chronological split: the first train_set_fraction of the days train.
    num_samples = len(daily_price.index)
    train_start_idx = 0
    train_end_idx = int(train_set_fraction * num_samples)
    data_params['training_set_size'] = train_end_idx - train_start_idx
    data_params[
        'validation_set_size'] = num_samples - data_params['training_set_size']

    # new logic
    raw_values = daily_price.values
    train_set = raw_values[train_start_idx:train_end_idx]
    daily_price_x, daily_price_y = to_supervised(raw_values, look_back,
                                                 num_features)
    print(daily_price_y.shape)
    daily_price_x = difference(daily_price_x, look_back)
    # leaving daily_price_y raw
    train_x = daily_price_x[train_start_idx:train_end_idx]
    train_y = daily_price_y[train_start_idx:train_end_idx]
    # NOTE(review): train_Y is computed but not used below.
    train_Y = difference(train_y, look_back)
    test_x = daily_price_x[train_end_idx:]
    test_y = daily_price_y[train_end_idx:]
    print(train_x.shape, test_x.shape)

    scaler = MinMaxScaler(feature_range=(-1, 1))  # feature_range=(-1,1)
    print(train_x.shape, train_y.shape)
    # Fit the scaler on the raw training prices as a single column. The
    # scaled copy (train_set_scaled) itself is not used afterwards.
    train_set = np.reshape(train_set, (max(train_set.shape), 1))
    train_set_scaled = scaler.fit_transform(train_set)
    # Each array is reshaped to one column before scaling; the target
    # length is the array's largest dimension.
    train_x = np.reshape(train_x, (max(train_x.shape), 1))
    train_x = scaler.transform(train_x)
    train_y = np.reshape(train_y, (max(train_y.shape), 1))
    train_y = scaler.transform(
        train_y)  # scale train's labels, required for loss calculations
    test_x = np.reshape(test_x, (max(test_x.shape), 1))
    # Test inputs are scaled; test labels (test_y) are deliberately left
    # unscaled and are only reshaped below.
    test_x = scaler.transform(test_x)

    # Final layout: (samples, look_back, features).
    train_x = train_x.reshape([max(train_x.shape), look_back, num_features[0]])
    test_x = test_x.reshape([max(test_x.shape), look_back, num_features[0]])
    train_y = train_y.reshape([max(train_y.shape), look_back, num_features[1]])
    test_y = test_y.reshape([max(test_y.shape), look_back, num_features[1]])
    return raw_values, train_x, train_y, test_x, test_y, scaler
class DataLoader():
    """A class for loading and transforming data for the LSTM model"""

    def __init__(self,
                 path,
                 split,
                 cols,
                 label_col,
                 MinMax,
                 start_from=None,
                 end=None,
                 returns=True):
        """Read the CSV at ``path`` and prepare train/test arrays.

        :param path: CSV file; must contain a 'Date' column (and
            'Adj Close' when ``returns`` is true).
        :param split: Fraction of rows used for training, or ``None`` to
            use every row for training (the test set is then empty).
        :param cols: Columns to keep as model data.
        :param label_col: Name of the label column (must be in ``cols``).
        :param MinMax: If true, min-max scale the data with a scaler fitted
            on the training rows.
        :param start_from: Optional; keep only rows with Date after this.
        :param end: Optional; keep only rows with Date before this.
        :param returns: If true, add a 'log_ret' column with the log return
            of 'Adj Close' and drop the first row.
        """
        dataframe = pd.read_csv(path)
        dataframe = dataframe.dropna(axis=0)
        print(dataframe.isnull().sum())
        self.dates = dataframe['Date']
        if start_from is not None:
            dataframe.Date = pd.to_datetime(dataframe.Date)
            start = pd.to_datetime(start_from)
            dataframe = dataframe.loc[dataframe.Date > start]
        if end is not None:
            dataframe.Date = pd.to_datetime(dataframe.Date)
            end = pd.to_datetime(end)
            dataframe = dataframe.loc[dataframe.Date < end]
        self.dates = dataframe['Date']
        if returns:
            dataframe['log_ret'] = np.log(dataframe['Adj Close'] /
                                          dataframe['Adj Close'].shift(1))
            # The first log return is undefined (no previous row).
            dataframe = dataframe.iloc[1:]
        dataframe = dataframe.get(cols)

        if split is not None:
            i_split = int(len(dataframe) * split)
            print(self.dates.values[i_split])
            self.data_train = dataframe.values[:i_split]
            self.data_test = dataframe.values[i_split:]
        else:
            # No split: everything trains and the test set is empty, so
            # data_test/len_test always exist for the methods below.
            self.data_train = dataframe.values
            self.data_test = dataframe.values[:0]
        self.len_train = len(self.data_train)
        self.len_test = len(self.data_test)
        # Get index of label column
        self.label_col_indx = dataframe.columns.get_loc(label_col)

        if MinMax:
            self.scaler = MinMaxScaler()
            self.data_train = self.scaler.fit_transform(self.data_train)
            if self.len_test:
                # An empty test set has nothing to transform.
                self.data_test = self.scaler.transform(self.data_test)

        # First (un-normalised) row of every window handed out so far.
        self.w_normalisation_p0_train = []
        self.w_normalisation_p0_test = []

    def _collect_windows(self, split, seq_len, normalise, num_forward,
                         first_rows):
        """Collect every window of one split into ``(x, y)`` arrays."""
        seq_plus_forward = seq_len + num_forward
        length = self.len_train if split == 'train' else self.len_test
        data_x = []
        data_y = []
        for i in range(length - seq_plus_forward):
            x, y, first_row = self._next_window(i, seq_plus_forward, split,
                                                normalise, num_forward)
            first_rows.append(first_row)
            data_x.append(x)
            data_y.append(y)
        return np.array(data_x), np.array(data_y)

    def get_train_data(self, seq_len, normalise, num_forward=1):
        '''
        Return ``(x, y)`` arrays of all training windows.

        Each window is ``seq_len`` input rows; the label is taken
        ``num_forward`` rows after the last input row.
        '''
        return self._collect_windows('train', seq_len, normalise, num_forward,
                                     self.w_normalisation_p0_train)

    def get_test_data(self, seq_len, normalise, num_forward=1):
        '''
        Return ``(x, y)`` arrays of all test windows.

        Each window is ``seq_len`` input rows; the label is taken
        ``num_forward`` rows after the last input row.
        '''
        return self._collect_windows('test', seq_len, normalise, num_forward,
                                     self.w_normalisation_p0_test)

    def _next_window(self, i, seq_len, split, normalise, num_forward):
        """Generates the next data window from the given index location i

        ``seq_len`` here is the total window length (inputs plus forward
        rows). Returns ``(x, y, first_row)`` where ``first_row`` is the
        window's first row before normalisation.

        :raises ValueError: If ``split`` is neither 'train' nor 'test'.
        """
        if split == 'train':
            source = self.data_train
        elif split == 'test':
            source = self.data_test
        else:
            raise ValueError("split must be 'train' or 'test', got %r" %
                             (split,))
        window = source[i:i + seq_len]
        first_row = window[0, :]
        if normalise:
            window = self.normalise_windows(window, single_window=True)[0]
        x = window[:seq_len - num_forward]
        y = window[-1, [self.label_col_indx]]
        return x, y, first_row

    def normalise_windows(self, window_data, single_window=False):
        '''Normalise window with a base value of zero

        Every column is expressed relative to its first row: ``p / p0 - 1``.
        '''
        normalised_data = []
        window_data = [window_data] if single_window else window_data
        for window in window_data:
            normalised_window = []
            for col_i in range(window.shape[1]):
                normalised_col = [((float(p) / float(window[0, col_i])) - 1)
                                  for p in window[:, col_i]]
                normalised_window.append(normalised_col)
            # reshape and transpose array back into original
            # multidimensional format
            normalised_window = np.array(normalised_window).T
            normalised_data.append(normalised_window)
        return np.array(normalised_data)
class LSTM():
    """Stacked-LSTM forecaster trained on ``Dados/new_dataset.csv``.

    Constructing an instance runs the whole pipeline: read the CSV, min-max
    scale it, slice it into sliding windows, build the network and fit it.
    """

    def __init__(self):
        self.df = pd.read_csv('Dados/new_dataset.csv')
        self.NormalizeData()
        self.timesteps = 15  # rows per input window
        self.nr_parametos = 2  # features per row (columns 1 and 2)
        self.PrepareData(self.timesteps)
        self.Build(self.timesteps, self.nr_parametos)
        self.Fit()

    def NormalizeData(self):
        """Fit a [-1, 1] min-max scaler on the full frame and keep the result."""
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.normalized = self.scaler.fit_transform(self.df)

    def Denormalize(self, dfNormalized):
        """Placeholder; not implemented."""
        pass

    def PrepareData(self, timesteps):
        """Slice the normalised data into (window, next-row label) pairs.

        Each entry of ``self.X`` holds ``timesteps`` consecutive rows of
        columns 1-2; the matching entry of ``self.Y`` is column 1 of the row
        that immediately follows the window.

        :param timesteps: number of rows in each input window.
        """
        total_rows = len(self.normalized)
        windows = []
        labels = []
        # A window starting at `start` needs row `start + timesteps` as its
        # label, so the last usable start is total_rows - timesteps - 1.
        for start in range(total_rows - timesteps):
            stop = start + timesteps
            windows.append(self.normalized[start:stop, 1:3])
            labels.append(self.normalized[stop:stop + 1, 1:2])
        self.X = np.array(windows)
        self.Y = np.array(labels)

    def Build(self, janela, nmr_parametros):
        """Assemble the network: three stacked LSTMs, dropout, two dense layers.

        :param janela: window length (timesteps) of each input sample.
        :param nmr_parametros: number of features per timestep.
        """
        self.model = keras.Sequential()
        self.model.add(
            keras.layers.LSTM(32,
                              input_shape=(janela, nmr_parametros),
                              return_sequences=True))
        self.model.add(keras.layers.LSTM(64, return_sequences=True))
        self.model.add(keras.layers.LSTM(128, return_sequences=False))
        self.model.add(keras.layers.Dropout(0.2))
        self.model.add(
            keras.layers.Dense(32,
                               activation="relu",
                               kernel_initializer="uniform"))
        self.model.add(keras.layers.Dense(1, activation="linear"))

    def RMSE(self, y_true, y_pred):
        """Root-mean-squared error expressed with Keras backend ops."""
        return keras.backend.sqrt(
            keras.backend.mean(keras.backend.square(y_pred - y_true)))

    def Fit(self):
        """Compile the model, load weights from ``model.h5`` and train 20 epochs.

        NOTE: the weights produced by this run are not written back to disk.
        """
        self.model.compile(loss=self.RMSE,
                           optimizer=keras.optimizers.Adam(),
                           metrics=['mae', self.RMSE])
        self.model.load_weights("model.h5")
        self.history = self.model.fit(x=self.X,
                                      y=self.Y,
                                      epochs=20,
                                      shuffle=False)

    def Predict(self, data):
        """Return the model's prediction for ``data``."""
        result = self.model.predict(data, verbose=True)
        return result

    def forecast(self):
        """Roll the model forward 50 steps and plot the de-normalised values.

        Every predicted value gets a small uniform random perturbation and is
        fed back as the newest row of the input window.
        """
        timesteps = self.timesteps
        multisteps = 50
        n_columns = self.normalized.shape[1]
        inp = np.asarray(self.normalized)[-timesteps:, 1:3].astype('float32')
        predictions = []
        for _ in range(multisteps):
            inp = inp.reshape(1, timesteps, 2)
            taxa_Erro = random.uniform(-0.005, 0.005)
            yhat = self.Predict(inp) + taxa_Erro
            # Extract the scalar explicitly: implicit conversion of a
            # non-0-d array to a scalar is deprecated in recent NumPy.
            yhat_value = float(np.ravel(yhat)[0])

            # The scaler was fitted on every column, so inverse_transform
            # needs a full-width row; only column 1 is read back afterwards.
            placeholder = np.full((1, n_columns), -1.0)
            placeholder[0, 1] = yhat_value
            placeholder[0, 2] = inp[0, timesteps - 1, 1]
            value = self.scaler.inverse_transform(placeholder)
            predictions.append(value[0][1])

            # NOTE(review): the second feature of the new row is taken from
            # 7 steps before the window end - presumably a weekly lag; confirm.
            newCase = np.array((yhat_value, inp[0, timesteps - 7, 1]))
            inp = np.append(inp, newCase).reshape(-1, 2)[-timesteps:, :]
        self.PredictionGraph(predictions)

    def PredictionGraph(self, prediction):
        """Plot observed ``Total_Cases`` followed by ``prediction`` and save it.

        :param prediction: sequence of forecast values, plotted after the
            observed data; the figure is written to ``static/lstm.png``.
        """
        fig, ax = plt.subplots(figsize=(20, 10))
        fig.suptitle('Previsao Casos ',
                     fontweight='bold',
                     fontsize=30,
                     color='#0c3c6e')
        plt.xlabel('Dia', fontsize=20)
        plt.ylabel('Previsão', fontsize=20)
        ax.plot(np.arange(len(self.df)), (self.df['Total_Cases']),
                label='Days Gone with real data')
        ax.plot(np.arange(len(self.df),
                          len(self.df) + len(prediction)), (prediction),
                label='50 days LSTM prediction')
        ax.legend(fancybox=True,
                  framealpha=1,
                  shadow=True,
                  borderpad=1,
                  fontsize='15')
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        ax.grid()
        fig.savefig('static/lstm.png')
similar_words.append( cosine_value(doc_lines[0], doc_lines[i], doc_lines)) sen_freq.append(calculate_sentence_freq(doc_lines[i], vocab)) keywords.append(calculate_top_words(top_words, doc_lines[i])) for i in range(len(doc_lines)): features.append([ lengths[i], positions[i], similar_words[i], degrees[i], sen_freq[i], keywords[i] ]) features = numpy.array(features) target_values = numpy.array(target_values) scalar = MinMaxScaler() features = scalar.fit_transform(features) # model =create_model() # model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) # model.fit(features,target_values, batch_size=10, epochs=1000) # model.save('model5.h5') # seed =7 kfold = model_selection.KFold(n_splits=3, shuffle=True, random_state=42) # cvscores = [] # evaluate the model # scores = model.evaluate(features[test], target_values[test], verbose=2) # print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) # cvscores.append(scores[1] * 100) # create model
min_exercise_stock_options = min(min_exercise_stock_options, feature[1]) if feature[0] != 0: max_salary = max(max_salary, feature[0]) min_salary = min(min_salary, feature[0]) print 'Max value of "exercised_stock_options": ', max_exercise_stock_options print 'Min value of "exercised_stock_options": ', min_exercise_stock_options print 'Max value of "salary": ', max_salary print 'Min value of "salary": ', min_salary ### apply feature scaling from sklearn.preprocessing import MinMaxScaler scaler = MinMaxScaler() scaled_features = scaler.fit_transform(finance_features) print 'Scaled salary of $200,000 & stock options of $1,000,000: ', \ scaler.transform([[200000., 1000000.]]) ### rename the "name" parameter when you change the number of features ### so that the figure gets saved to a different file try: Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2) except NameError: print "no predictions object named pred found, no clusters to plot" ### rename the "name" parameter when you change the number of features ### so that the figure gets saved to a different file try:
GRN_MTR = GRN_MTR.loc[GRN_MTR['TI_TAG_DESCRIPTION'] == "Meter flow rate"]
GRN_MTR = GRN_MTR[["TD_TAG_VALUE"]]
GRN_MTR.columns = ["GRN_FLOW"]

### Moving average
DON_MTR_12 = mov_avg(DON_MTR_12, "10s", "1min")
s450_MTR = mov_avg(s450_MTR, "10s", "1min")
GRN_MTR = mov_avg(GRN_MTR, "10s", "1min")

# Align the three meters on a shared index; gaps become 0.
both = DON_MTR_12.join(s450_MTR).fillna(0)
both = both.join(GRN_MTR).fillna(0)

values = both.values
values = values.astype('float32')

scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)
# Persist the scaler only AFTER fitting; dumping it first stores an unfitted
# object that cannot transform or inverse-transform anything when reloaded.
joblib.dump(scaler, "/data/scaler.save")
# NOTE(review): the scaler is fitted on all rows, including those that end up
# in the test split below.

reframed = series_to_supervised(scaled, 180, 1)
# Drop every lagged copy of the third variable (lags 1..180) in one call.
reframed = reframed.drop(['var3(t-' + str(i) + ')' for i in range(1, 181)],
                         axis=1)

values = reframed.values
n_train_hours = int(len(values) * 0.80)
train = values[:n_train_hours, :]
test = values[n_train_hours:, :]
# split into input and outputs
train_X, train_y = train[:, :-1], train[:, -1]
test_X, test_y = test[:, :-1], test[:, -1]
# reshape input to be 3D [samples, timesteps, features]
class WGAN(object):
    """Wasserstein GAN that generates KDD99-style attack records.

    Construction applies the defaults, overrides them from keyword
    arguments, loads and encodes the data (``setup``) and builds the
    critic, generator and combined model (``build``).
    """

    def __init__(self, **kwargs):
        """Configure, load data for and build the GAN.

        The recognised keyword arguments are listed in ``_args``; any other
        key is ignored.
        """
        self._defaults()
        self._args(kwargs)  # override defaults with args passed
        self.setup()
        self.build()

    def _defaults(self):
        """ Sets default variable values """
        self.attack_type = None
        self.critic = None
        self.generator = None
        self.gan = None
        self.evaluator = None

        # saved_states can be used to save states of a GAN, say
        # 5 of them so that the best can be saved when breaking out.
        self.saved_states = []
        self.confusion_matrix = None
        self.classification_report = None
        self.scaler = None

        # NOTE(review): optimizer_learning_rate is stored (and can be
        # overridden through kwargs) but the optimizer below is created with
        # its own fixed rate, so the attribute currently has no effect.
        self.optimizer_learning_rate = 0.001
        self.optimizer = RMSprop(lr=0.00005)
        self.max_epochs = 7000
        self.batch_size = 255
        self.sample_size = 500
        self.clip_value = 0.01

        self.valid = None
        self.fake = None
        self.X_train = None

        self.generator_alpha = 0.1
        self.generator_momentum = 0.0
        self.generator_layers = [8, 16, 32]

        self.save_file = None

    def _args(self, kwargs):
        """Copy the recognised entries of ``kwargs`` onto the instance.

        Keys outside the list below are ignored.
        """
        recognised = (
            'attack_type',
            'max_epochs',
            'batch_size',
            'sample_size',
            'optimizer_learning_rate',
            'critic',
            'generator_layers',
            'generator_alpha',
            'generator_momentum',
        )
        for key, value in kwargs.items():
            if key in recognised:
                setattr(self, key, value)

    def setup(self):
        """Load, encode and scale the training data; split the evaluator data.

        Sets ``self.scaler``, ``self.X_train``, ``self.valid``, ``self.fake``
        and the ``self.eval_*`` train/validation/test arrays.
        """
        # TODO new method called from init opt passed
        print("Attack type: " + self.attack_type)

        conn = SQLConnector()
        data = conn.pull_kdd99(attack=self.attack_type, num=5000)
        dataframe = pd.DataFrame.from_records(
            data=data, columns=conn.pull_kdd99_columns(allQ=True))

        # ==========
        # ENCODING
        # ==========
        # https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
        d = defaultdict(LabelEncoder)

        # Split the features from the labels. Labels must be consistent with
        # the evaluator's encoding, so util.attacks_to_num is used for them.
        features = dataframe.iloc[:, :41]
        attack_labels = dataframe.iloc[:, 41:]

        for i in range(0, attack_labels.size):
            attack_labels.at[i, 'attack_type'] = util.attacks_to_num(
                attack_labels.at[i, 'attack_type'])

        # One LabelEncoder per column, keyed by column name.
        features = features.apply(lambda x: d[x.name].fit_transform(x))

        # Feature scaling, recommended by a GitHub implementation.
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        scaled_features = self.scaler.fit_transform(features.astype(float))
        scaled_df = pd.DataFrame(data=scaled_features)

        # Join the separately encoded sections back into one dataframe
        dataframe = scaled_df.join(attack_labels)
        dataset = dataframe.values  # transform to ndarray
        print(dataset)

        # TODO: Feature scaling? May be necessary. Has to be on a per-feature basis?

        # Splitting up the evaluation dataset. Should maybe be moved?
        eval_dataset = pd.read_csv('PortsweepAndNonportsweep.csv', header=None)
        eval_dataset = eval_dataset.values
        self.eval_dataset_X = eval_dataset[:, 0:41].astype(int)
        self.eval_dataset_Y = eval_dataset[:, 41]

        # Carve the first 5% off as validation data ...
        validationToTrainRatio = 0.05
        validationSize = int(validationToTrainRatio * len(self.eval_dataset_X))
        self.eval_validation_data = self.eval_dataset_X[:validationSize]
        self.eval_validation_labels = self.eval_dataset_Y[:validationSize]
        self.eval_dataset_X = self.eval_dataset_X[validationSize:]
        self.eval_dataset_Y = self.eval_dataset_Y[validationSize:]

        # ... then the first 5% of what remains as test data.
        testToTrainRatio = 0.05
        testSize = int(testToTrainRatio * len(self.eval_dataset_X))
        self.eval_test_data = self.eval_dataset_X[:testSize]
        self.eval_test_labels = self.eval_dataset_Y[:testSize]
        self.eval_dataset_X = self.eval_dataset_X[testSize:]
        self.eval_dataset_Y = self.eval_dataset_Y[testSize:]

        # to visually judge encoded dataset
        print("Real encoded " + self.attack_type + " attacks:")
        print(dataset[:1])

        # X is the input data; column 41 holds the labels, which are not
        # needed for GAN training.
        self.X_train = dataset[:, 0:41].astype(float)

        # Targets for the batches: 1 for real attacks, 0 for generated ones.
        # NOTE(review): combined with wasserstein_loss (mean of
        # y_true * y_pred) a target of 0 makes the loss on generated batches
        # identically zero; reference WGAN code uses -1/+1 targets. Confirm
        # this is intended.
        self.valid = np.ones((self.batch_size, 1))
        self.fake = np.zeros((self.batch_size, 1))

    def build(self):
        """Build the evaluator, the critic, the generator and the combined GAN."""
        # build the discriminator portion
        eval_args = {
            'train_data': self.eval_dataset_X,
            'train_labels': self.eval_dataset_Y,
            'validation_data': self.eval_validation_data,
            'validation_labels': self.eval_validation_labels,
            'test_data': self.eval_test_data,
            'test_labels': self.eval_test_labels,
        }
        # Keep the Evaluator object so its metrics can be read after training
        evaluator_object = Evaluator(**eval_args)
        self.evaluator = evaluator_object.get_model()
        print("Evaluator metrics after training:")
        print(evaluator_object.performance)

        # The critic mirrors the generator's layer sizes in reverse order.
        critic_layers = self.generator_layers.copy()
        critic_layers.reverse()
        print(critic_layers)
        critic_args = {
            'layers': critic_layers,
            'alpha': self.generator_alpha,
            'optimizer': self.optimizer,
        }
        self.critic = Critic(**critic_args).get_model()
        self.critic.compile(loss=self.wasserstein_loss,
                            optimizer=self.optimizer,
                            metrics=['accuracy'])

        # build the generator portion
        gen_args = {
            'layers': self.generator_layers,
            'alpha': self.generator_alpha,
        }
        self.generator = Generator(**gen_args).get_model()

        # input and output of our combined model
        z = Input(shape=(41, ))
        attack = self.generator(z)
        validity = self.critic(attack)

        # build combined model from generator and discriminator
        self.gan = Model(z, validity)
        self.gan.compile(loss=self.wasserstein_loss, optimizer=self.optimizer)

    def train(self):
        """Train the critic and generator, then score and store the output.

        After the epoch loop, the last generated batch is mapped back to the
        original feature scale and classified by the evaluator. The batch is
        written to the database when more than half of it is classified as
        label 1, and the hyper-parameters are written afterwards.
        """
        # break condition for training (when diverging)
        # NOTE(review): neither counter is updated anywhere; they only feed
        # the progress message.
        loss_increase_count = 0
        prev_g_loss = 0

        conn = SQLConnector()

        # Every epoch trains on the same first batch_size rows (random
        # sampling was disabled by the original author):
        # idx = np.random.randint(0, X_train.shape[0], batch_size)
        idx = np.arange(self.batch_size)

        for epoch in range(self.max_epochs):
            attacks = self.X_train[idx]

            # generate a matrix of noise vectors
            noise = np.random.normal(0, 1, (self.batch_size, 41))

            # create an array of generated attacks
            gen_attacks = self.generator.predict(noise)

            # loss functions, based on what metrics we specify at model compile time
            c_loss_real = self.critic.train_on_batch(attacks, self.valid)
            c_loss_fake = self.critic.train_on_batch(gen_attacks, self.fake)
            d_loss = 0.5 * np.add(c_loss_real, c_loss_fake)

            # Clip every critic weight into [-clip_value, clip_value].
            for l in self.critic.layers:
                weights = l.get_weights()
                weights = [
                    np.clip(w, -self.clip_value, self.clip_value)
                    for w in weights
                ]
                l.set_weights(weights)

            # generator loss function
            g_loss = self.gan.train_on_batch(noise, self.valid)

            if epoch % 500 == 0:
                print(
                    "%d [D loss: %f, acc.: %.2f%%] [G loss: %f] [Loss change: %.3f, Loss increases: %.0f]"
                    % (epoch, d_loss[0], 100 * d_loss[1], g_loss,
                       g_loss - prev_g_loss, loss_increase_count))

        # Undo the [-1, 1] scaling before handing the batch to the evaluator.
        gen_attacks = self.scaler.inverse_transform(gen_attacks)
        predicted_gen_attack_labels = self.evaluator.predict(
            gen_attacks).transpose().astype(int)
        gen_attack_labels = np.full(predicted_gen_attack_labels.shape, 1)

        print("Generated attack labels: ")
        print(gen_attack_labels)
        print("Predicted labels of generated attacks: ")
        print(predicted_gen_attack_labels)

        right = (predicted_gen_attack_labels == 1).sum()
        wrong = (predicted_gen_attack_labels != 1).sum()

        accuracy = (right / float(right + wrong))

        print("5 generated attacks: ")
        print(gen_attacks[:5, :])
        print()
        print("Accuracy of evaluator on generated data: %.4f " % accuracy)
        if accuracy > .50:
            conn.write_gens(gen_attacks,
                            util.attacks_to_num(self.attack_type))

        layersstr = str(self.generator_layers[0]) + "," + str(
            self.generator_layers[1]) + "," + str(self.generator_layers[2])
        attack_num = util.attacks_to_num(self.attack_type)

        conn.write_hypers(layerstr=layersstr,
                          attack_encoded=attack_num,
                          accuracy=accuracy)

        # TODO: Add foreign key for attack type in hypers table

    def test(self):
        """
        A GAN should know how to test itself and save its results into a confusion matrix.
        """
        # TODO
        pass

    # This function should only be passed the FEATURES, we don't want to scale the labels
    def feature_scale(self, dataset):
        """Standardise every column of ``dataset`` in place.

        Each column has its mean subtracted and is divided by its standard
        deviation. A constant column (standard deviation 0) is only centred,
        so it becomes all zeros instead of NaN/inf.

        :param dataset: 2-D array of features (no label column).
        """
        col_avg = np.mean(dataset, axis=0)
        col_sd = np.std(dataset, axis=0)
        safe_sd = np.where(col_sd == 0, 1.0, col_sd)
        dataset[:, :] = (dataset - col_avg) / safe_sd

    ##########################################################################################
    # Uses Sklearn's confusion matrix maker
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
    ##########################################################################################
    def make_confusion_matrix(self, y_true, y_pred):
        """Store the confusion matrix and classification report on the instance."""
        self.confusion_matrix = confusion_matrix(y_true, y_pred)
        self.classification_report = classification_report(y_true, y_pred)

    def wasserstein_loss(self, y_true, y_pred):
        """Return the mean of ``y_true * y_pred``."""
        return K.mean(y_true * y_pred)

    ################################################################################
    # Use these to save instances of a trained network with some desirable settings
    # Suggestion to save and load from the object's __dict__ taken from:
    # https://stackoverflow.com/questions/2709800/how-to-pickle-yourself
    ################################################################################
    def save_this(self, filename):
        '''
        Pickle this object to ``filename + '.pickle'`` for recovery later.

        The resulting path is remembered in ``self.save_file`` so that
        ``load_state_from_file`` can be called without an argument.
        '''
        self.save_file = filename + '.pickle'
        with open(self.save_file, 'wb') as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)

    def load_state_from_file(self, filename=None):
        """Replace this object's state with the state pickled in a file.

        :param filename: path of the pickle file, used exactly as given (no
            suffix is appended). When omitted, the path remembered by
            ``save_this`` is used; if there is none, a message is printed
            and nothing is loaded.
        """
        if not filename:
            if self.save_file:
                filename = self.save_file
            else:
                print("Error: No savefile for this object. "
                      "\n Using save_this(filename) will set the save filename.")
                return
        # SECURITY: pickle.load executes code embedded in the file; only load
        # files that this application wrote itself.
        with open(filename, 'rb') as f:
            tmp_dict = pickle.load(f)
        self.__dict__.update(tmp_dict.__dict__)
df = downcast_dtypes(df) # nombres names = list(df.columns) # datos de training train_df = df.iloc[0:test_index] # datos de testing con timesteps hacia atrás test_df = df.iloc[test_index - timesteps:] print(train_df.info()) # reset de indices para eliminar las fechas df.reset_index(drop=True, inplace=True) # normalizar los datos sc = MinMaxScaler(feature_range=(0, 1)) # training train_df = sc.fit_transform(train_df) # testing test_df = sc.transform(test_df) # hacer reshape para las transformaciones de las celdas lstm x_train, y_train = lstm_preparation(train_df, timesteps=timesteps) x_test, y_test = lstm_preparation(test_df, timesteps=timesteps) print(x_train.shape, y_train.shape) print(x_test.shape, y_test.shape) # modelo lstm = tf.keras.Sequential() lstm.add( tf.keras.layers.LSTM(units=512, input_shape=(np.array(x_train).shape[1], np.array(x_train).shape[2])))
# print(total_lobster_df['Total Lobster'][5110]) print(Lobster_df['Total Lobster'][167]) train_set = Lobster_df.head(676) test_set = Lobster_df.tail(52) # train_set = total_lobster_df.head(4745) # test_set = total_lobster_df.tail(365) train_set.tail() from sklearn.preprocessing import MinMaxScaler scaler = MinMaxScaler() train_scaled = scaler.fit_transform(train_set) test_scaled = scaler.transform(test_set) print(train_scaled[:5]) # print(train_scaled) # for i in train_scaled: # print(i) # print(test_scaled) #train_scaled y_train = train_scaled[:,:-1] x_train = train_scaled[:, -1:] #test scaled y_test = test_scaled[:,:-1]
# Stock prices keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

dataset_train = pd.read_csv('NSE-TATAGLOBAL.csv')
# Column 5 only, kept 2-D (rows, 1) because the scaler requires 2-D input.
training_set = dataset_train.iloc[:, 5:6].values
trainig_see = pd.DataFrame(training_set)

sc = MinMaxScaler(feature_range=(0, 1))
training_set_scaled = sc.fit_transform(training_set)

# LSTMs expect our data to be in a specific format, usually a 3D array.
# We start by creating data in 60 timesteps and converting it into an array
# using NumPy. Next, we convert the data into a 3D dimension array with
# X_train samples, 60 timestamps, and one feature at each step.
n_timesteps = 60
X_train = []
y_train = []
# Iterate up to the real number of rows rather than a hard-coded count, so
# the script keeps working when the CSV grows or shrinks.
for i in range(n_timesteps, len(training_set_scaled)):
    X_train.append(training_set_scaled[i - n_timesteps:i, 0])
    y_train.append(training_set_scaled[i, 0])
X_train, y_train = np.array(X_train), np.array(y_train)

X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
'chg_per_svc', 'denied', 'psps_denied_services_cnt' ], inplace=True,axis = 1) X_train, X_test, y_train, y_test = train_test_split(results_df_sample, y, stratify=y,test_size=0.20, random_state=123) print(X_train.info()) print(X_test.info()) WOE_encoder = WOEEncoder() X_train_enc = WOE_encoder.fit_transform(X_train, y_train) X_test_enc = WOE_encoder.transform(X_test) scaler = MinMaxScaler() X_train_enc_scaled = pd.DataFrame(scaler.fit_transform(X_train_enc, y_train)) X_test_enc_scaled = pd.DataFrame(scaler.transform(X_test_enc)) param_grid = {'C': [6,7,8,9,10,11,12], 'gamma': [1],'kernel': ['rbf']} print(param_grid) SVC = SVC() # Random search of parameters, using 3 fold cross validation, # search across 10 different combinations, and use all available cores RFC_CV = GridSearchCV(estimator=SVC, param_grid=param_grid, cv= 2,verbose = 2) RFC_CV.fit(X_train_enc_scaled,y_train) print(RFC_CV.best_params_) print(RFC_CV.best_score_)
y_train = y_data[0:num_train] y_test = y_data[num_train:] # print(y_train) # num_x_signals = x_data.shape[0] print(num_x_signals) num_y_signals = y_data.shape[0] print(num_y_signals) # # # # print("Min:",(min(x_train))) # # print("Max:",(max(x_train))) # x_scaler = MinMaxScaler() x_train_scaled = x_scaler.fit_transform(x_train) # print(x_train_scaled) print("Min:", np.min(x_train_scaled)) print("Max:", np.max(x_train_scaled)) x_test_scaled = x_scaler.transform(x_test) # y_scaler = MinMaxScaler() y_train_scaled = y_scaler.fit_transform(y_train.reshape(-1, 1)) y_test_scaled = y_scaler.transform(y_test.reshape(-1, 1)) # print(x_train_scaled.shape) # print(y_train_scaled.shape) # # def batch_generator(batch_size, sequence_length):
"attack" ] df = df.filter(items=features) #Split into X (matrix) and y (array) dataset = df.values X = dataset[:, 0:8] #X = pd.DataFrame(X).fillna(0) y = dataset[:, 8] #Split training and test data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) #Normalize data scaler = MinMaxScaler() X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test) #Start algorithm n_neighbors = 7 knn = KNeighborsClassifier(n_neighbors, weights='distance') print("Starting algorithm") knn.fit(X_train, y_train) print('Accuracy of K-NN classifier on training set: {:.5f}'.format( knn.score(X_train, y_train))) print('Accuracy of K-NN classifier on test set: {:.5f}'.format( knn.score(X_test, y_test))) pickle.dump(knn, open('KNN_Model', 'wb'))
def scale(data):
    """Min-max scale the numeric columns of ``data`` in place.

    The columns ``age``, ``bmi`` and ``avg_glucose_level`` are each mapped
    independently onto [0, 1]; all other columns are left untouched.

    :param data: frame containing the three columns above.
    :return: the same ``data`` object, with the scaled columns written back.
    """
    numeric_columns = ['age', 'bmi', 'avg_glucose_level']
    # MinMaxScaler works feature by feature, so one fit over all three
    # columns produces the same numbers as fitting each column on its own.
    data[numeric_columns] = MinMaxScaler().fit_transform(data[numeric_columns])
    return data
warnings.filterwarnings("ignore", category=FutureWarning) ################################ # K-MEANS ################################ df = pd.read_csv("datasets/USArrests.csv", index_col=0) df.head() df.isnull().sum() df.info() df.describe().T sc = MinMaxScaler((0, 1)) df = sc.fit_transform(df) df[0:5] kmeans = KMeans(n_clusters=4) k_fit = kmeans.fit(df) k_fit k_fit.n_clusters k_fit.cluster_centers_ k_fit.labels_ df[0:5] ################################ # Kümelerin Görselleştirilmesi
plt.plot(NC_time, 'o', label='Nearest centroid') plt.plot(GNB_time, 'o', label='Gaussian Naive Bayes') plt.plot(DT_time, 'o', label='Decision tree') plt.legend() plt.plot(KNN_time_simple, 'o', label='K-nearest neighbors (simple)') plt.plot(KNN_time_with_improvement, 'o', label='K-nearest neighbors (with improvement)') plt.plot(Grid_KNN_time, 'o', label='Grid search for KNN') plt.plot(Grid_DT_time, 'o', label='Grid serach for Decision tree') plt.legend() tic = timeit.default_timer() mms = MinMaxScaler() # feature vektort normalizáljuk x_train_neural = mms.fit_transform(x_train) x_test_neural = mms.fit_transform(x_test) model = Sequential([ Dense(32, input_shape=(x_train_neural.shape[1], )), Activation('relu'), Dense(32), Activation('relu'), Dense(32), Activation('relu'), Dropout(0.25), # Regularizáció Dense(2), Activation('softmax'), ]) model.compile(loss='binary_crossentropy',
model.add(Dense(units=1)) model.compile(loss='mean_squared_error', optimizer='adam') return model if __name__ == '__main__': #set random seed np.random.seed(seed) #import data data = read_csv(filename, usecols=[1], engine='python', skipfooter=footer) dataset = data.values.astype('float32') #standarize data scaler = MinMaxScaler() dataset = scaler.fit_transform(dataset) train_size = int(len(dataset) * 0.67) validation_size = len(dataset) - train_size train, validation = dataset[0:train_size, :], dataset[ train_size:len(dataset), :] X_train, y_train = create_dataset(train) X_validation, y_validation = create_dataset(validation) # transform data as [sample, time step, feature] X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1])) X_validation = np.reshape( X_validation, (X_validation.shape[0], 1, X_validation.shape[1])) #train model model = create_model()
# plt.figure() # fig1 = sns.boxplot(df['close']) # fig1.set_title('Box plot of %s'%stock_code) # plt.show() # plt.figure() # fig2 = sns.lineplot(df['date'], df['close']) # fig2.set_title('Time series of %s'%stock_code) # plt.show() # print(np.array(df[['close','open']]).shape) # exit() # 数据正规化处理 scaler = MinMaxScaler(feature_range=(0, 1)) df['scaled_close'] = scaler.fit_transform(np.array(df['close']).reshape(-1, 1)) # print(df['scaled_close']) # 数据集划分处理 split_date = datetime(year=2019, month=7, day=1) df_train = df.loc[df['date'] < split_date] df_val = df.loc[df['date'] >= split_date] df_val.reset_index(drop=True, inplace=True) # print(df_train.shape, df_val.shape) # exit() def makeXy(df, time_steps): # 本函数用于生成训练模型的数组数据
x,y = build_timeseries(x, y_col_index) return trim_dataset(x), trim_dataset(y) log.info('Loading and preprocessing...') log.info('Loading data...') df_data = get_data(data_sym, dates, data_cols=train_cols, sing=True) df_train, df_test = train_test_split(df_data, train_size=train_size, test_size=test_size, shuffle=False) log.info('Data loaded.') print() log.info('Preprocessing data...') x = df_train.loc[:,train_cols].values min_max_scaler = MinMaxScaler() x_train = min_max_scaler.fit_transform(x) x_test = min_max_scaler.transform(df_test.loc[:,train_cols]) x_t, y_t = preprocess(x_train, 3) x_temp, y_temp = preprocess(x_test, 3) x_val, x_test_t = np.split(x_temp, 2) y_val, y_test_t = np.split(y_temp, 2) log.info('Done preprocessing.') print() model_path = model_dir+model_name if os.path.isfile(model_path): log.info('Loading model...')
trainImages = trainImages.reshape(4000,1*270*300) testImages = testImages.reshape(4000,1*270*300) #Verificamos que el dataset de imagenes sea de 2 dimensiones testImages.shape trainImages.shape #Verificamos que el dataset de labels sea de 2 dimensiones testLabels.shape trainLabels.shape #---------------------------------------------------------------------------------------------------------------------- #Transformo los datos con el método MinMaxScaler() a una escala particular scaler = MinMaxScaler() X_train = scaler.fit_transform(trainImages) X_test = scaler.transform(testImages) #---------------------------------------------------------------------------------------------------------------------- ''' Los autovectores son las direcciones en las que la varianza de los datos es mayor. Recordemos que, en teoría de probabilidad, la varianza de una variable aleatoria es una medida de dispersión (definida como la esperanza del cuadrado de la desviación de dicha variable respecto a su media). Por tanto, las direcciones en las que la varianza es mayor, representan la esencia principal de la información contenida en el dataset, por eso se les llama componentes principales. Al igual que un autovector es una dirección, el autovalor es un número, que representa el valor de la varianza sobre ese autovector. Por ello, para encontrar las componentes principales que condensen esa esencia de la información del dataset, calcularemos primero la matriz de covarianza, que nos da la medida de dispersión conjunta entre variables. Para ello, usaremos la función Covariance Matrix de la librería Numpy.
def test_min_max_scaler_zero_variance_features():
    """MinMaxScaler must cope with features whose value never changes."""
    # The first two columns are constant; only the third one varies.
    train_rows = [[0., 1., 0.5],
                  [0., 1., -0.1],
                  [0., 1., 1.1]]

    unseen_rows = [[+0., 2., 0.5],
                   [-1., 1., 0.0],
                   [+0., 1., 1.5]]

    # Default feature range (0, 1): constant columns collapse to 0.
    default_scaler = MinMaxScaler()
    scaled_train = default_scaler.fit_transform(train_rows)
    assert_array_almost_equal(scaled_train, [[0., 0., 0.5],
                                             [0., 0., 0.0],
                                             [0., 0., 1.0]])

    # Unseen data goes through the fitted scaling and may leave [0, 1].
    scaled_unseen = default_scaler.transform(unseen_rows)
    assert_array_almost_equal(scaled_unseen, [[+0., 1., 0.500],
                                              [-1., 0., 0.083],
                                              [+0., 0., 1.333]], decimal=2)

    # Custom feature range (1, 2): constant columns collapse to the lower bound.
    shifted_scaler = MinMaxScaler(feature_range=(1, 2))
    scaled_train = shifted_scaler.fit_transform(train_rows)
    assert_array_almost_equal(scaled_train, [[1., 1., 1.5],
                                             [1., 1., 1.0],
                                             [1., 1., 2.0]])
def loaddataset(self, path, module):
    """Load a Titanic CSV and store the prepared feature frame on ``self``.

    Missing ``Age`` and ``Fare`` values are replaced by the column mean and
    both columns are min-max scaled; ``Sex``, ``Embarked`` and ``Pclass`` are
    one-hot encoded.

    :param path: path of the CSV file to read.
    :param module: ``'train'`` stores ``self.trainset`` and
        ``self.trainlabel``; ``'test'`` stores ``self.testset`` (which also
        keeps ``PassengerId``). Any other value stores nothing.
    """
    df = pd.read_csv(path)
    subdf = df[['PassengerId', 'Pclass', 'Sex', 'Age', 'Embarked', 'Fare',
                'SibSp', 'Parch']]
    SibSp = subdf['SibSp']
    Parch = subdf['Parch']
    PassengerId = subdf['PassengerId']

    # Fill the gaps in Age and Fare with the respective column mean.
    Age = subdf['Age'].fillna(value=subdf.Age.mean())
    Fare = subdf['Fare'].fillna(value=subdf.Fare.mean())

    dummies_Sex = pd.get_dummies(subdf['Sex'], prefix='Sex')
    dummies_Embarked = pd.get_dummies(subdf['Embarked'], prefix='Embarked')
    dummies_Pclass = pd.get_dummies(subdf['Pclass'], prefix='Pclass')

    # Scale Age and Fare. MinMaxScaler only accepts 2-D input, so each
    # column is reshaped to (n_samples, 1) first.
    scaler = MinMaxScaler()
    age_scaled = scaler.fit_transform(Age.values.reshape(-1, 1))
    fare_scaled = scaler.fit_transform(Fare.values.reshape(-1, 1))
    Age_Scaled = pd.DataFrame(age_scaled, columns=['Age_Scaled'])
    Fare_Scaled = pd.DataFrame(fare_scaled, columns=['Fare_Scaled'])

    if module == 'train':
        self.trainlabel = df.Survived
        self.trainset = pd.concat([dummies_Pclass, dummies_Sex,
                                   dummies_Embarked, Age_Scaled, Fare_Scaled,
                                   SibSp, Parch], axis=1)
    elif module == 'test':
        self.testset = pd.concat([PassengerId, dummies_Pclass, dummies_Sex,
                                  dummies_Embarked, Age_Scaled, Fare_Scaled,
                                  SibSp, Parch], axis=1)
def normalize_data(tr_x, ts_x, normz=None, axis=0):
    """Normalise a training and a test array with the chosen method.

    :param tr_x: 2-D training array.
    :param ts_x: 2-D test array.
    :param normz: ``'scale'``, ``'minmax'``, ``'sigmoid'`` or ``None``; any
        other value leaves both arrays unchanged.
    :param axis: 0 to normalise each column, 1 to normalise each row.
    :return: ``(tr_x, ts_x)``. For ``'scale'`` these are new arrays; for
        ``'minmax'`` and ``'sigmoid'`` the inputs are modified in place and
        returned.
    """
    # Compare strings by value; `is` tests object identity and only matches
    # when the caller's string happens to be the same interned object.
    if normz == 'scale':
        tr_x = scale(tr_x, axis=axis)
        ts_x = scale(ts_x, axis=axis)
    elif normz == 'minmax':
        # MinMaxScaler scales every column of a 2-D array independently, so
        # one call replaces the per-column loop (and avoids passing 1-D
        # input, which the scaler rejects). Train and test are each fitted
        # on their own data, as before.
        minmax_scaler = MinMaxScaler()
        if axis == 0:
            tr_x[:, :] = minmax_scaler.fit_transform(tr_x)
            ts_x[:, :] = minmax_scaler.fit_transform(ts_x)
        elif axis == 1:
            # Row-wise scaling: transpose so that rows become columns. Each
            # array is handled separately, so they may differ in row count.
            tr_x[:, :] = minmax_scaler.fit_transform(tr_x.T).T
            ts_x[:, :] = minmax_scaler.fit_transform(ts_x.T).T
    elif normz == 'sigmoid':
        if axis == 0:
            # Only columns whose training maximum exceeds 1 are squashed.
            cols_non_norm = np.flatnonzero(np.max(tr_x, axis=0) > 1)
            tr_x[:, cols_non_norm] = -0.5 + (
                1 / (1 + np.exp(-tr_x[:, cols_non_norm])))
            # TODO: implement col_max col_non_norm for test set
            ts_x[:, cols_non_norm] = -0.5 + (
                1 / (1 + np.exp(-ts_x[:, cols_non_norm])))
        elif axis == 1:
            rows_non_norm = np.flatnonzero(np.max(tr_x, axis=1) > 1)
            tr_x[rows_non_norm, :] = -0.5 + (
                1 / (1 + np.exp(-tr_x[rows_non_norm, :])))
            # TODO: implement row_max row_non_norm for test set
            ts_x[rows_non_norm, :] = -0.5 + (
                1 / (1 + np.exp(-ts_x[rows_non_norm, :])))
    return tr_x, ts_x
def get_training_data_by_category(category, limit=0):
    """Build a scaled, sparse one-vs-rest training set for ``category``.

    Samples of ``category`` are labelled 1, samples of all other categories 0.

    :param category: the category treated as the positive class.
    :param limit: total sample budget, split 20% positive / 80% negative.
        If fewer positives exist than the positive share, all of them are
        used and the negative cap becomes five times their count. ``0``
        disables the caps.
    :return: ``(sparse_features, labels, fitted_scaler)``.
    """
    capped = limit != 0
    positive_cap = limit * 0.2
    negative_cap = limit * 0.8

    available_positives = DataDAO.count_training_data_by_category(category)
    if available_positives < positive_cap:
        positive_cap = available_positives
        negative_cap = available_positives * 5

    training_data = []
    training_target = []

    def _collect(samples, cap, label):
        # Append samples with the given label until the cap (if any) is hit.
        for position, sample in enumerate(samples):
            if capped and position >= cap:
                break
            training_data.append(sample)
            training_target.append(label)

    _collect(DataDAO.get_training_data_by_category(category), positive_cap, 1)
    _collect(DataDAO.get_training_data_by_other_categories(category),
             negative_cap, 0)

    scaler = MinMaxScaler()
    tr_data_sparse = csr_matrix(scaler.fit_transform(training_data))
    return tr_data_sparse, training_target, scaler
def test_stratified_shuffle_split(clf, dataset, feature_list, folds = 1000, scale_features = True):
    """Evaluate ``clf`` over stratified shuffle splits and print summary metrics.

    The confusion-matrix counts are accumulated over all folds, then accuracy,
    precision, recall, F1 and F2 are printed.

    :param clf: Classifier exposing ``fit`` and ``predict``.
    :param dataset: Data set passed to ``featureFormat``.
    :param feature_list: Feature names passed to ``featureFormat``.
    :param folds: Number of shuffle-split iterations.
    :param scale_features: When true, min-max scale all features before splitting.
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    # Scale features
    if scale_features:
        features = MinMaxScaler().fit_transform(features)

    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        # Fit the classifier on the training split and predict the test split.
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Stop scoring this fold; the counts gathered so far are kept.
                break

    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
    except ZeroDivisionError:
        # Only an undefined metric is reported here; any other error propagates.
        print("Got a divide by zero when trying out: {}".format(clf))
        print("Precision or recall may be undefined due to a lack of true positive predicitons.")
    else:
        print('Total predictions: '+str(total_predictions))
        print('Accuracy: '+str(accuracy))
        print('Precision: '+str(precision))
        print('Recall: '+str(recall))
        print('F1: '+str(f1))
        print('F2: '+str(f2))
        print("")
def cluster(final_data_dict, cluster_range, list_or_dict):
    """Cluster respondents for several cluster counts and attach the memberships.

    :param final_data_dict: Mapping of respondent id to a mapping of
        feature name to value.
    :param cluster_range: Iterable of cluster counts to try.
    :param list_or_dict: ``"dict"`` to return ``(converted dict, verdict)``,
        ``"list"`` to return ``(rows, feature_names)``. Any other value makes
        the function return ``None``.
    :return: See ``list_or_dict``.
    """
    final_data_list = clustering_module.convert_to_list(final_data_dict)
    # Explicit lists keep this working on Python 3, where ``map`` is lazy and
    # ``dict.values()`` cannot be indexed.
    respondent_IDs = np.array([int(key) for key in final_data_dict.keys()])
    feature_names = list(list(final_data_dict.values())[0].keys())

    final_data_list_imputed = clustering_module.preprocess(final_data_list)
    final_data_list_scaled = MinMaxScaler().fit_transform(final_data_list_imputed)

    # Transformed is distance of each respondent from each cluster center
    # Predicted is the cluster membership of each respondent
    merging_list = clustering_module.convert_to_list(final_data_dict, remove_NaN=0)
    data = list(merging_list)
    ignore_set_added = set(['ids'])

    for num_clusters in cluster_range:
        transformed, predicted, score = clustering_module.clustering(final_data_list_scaled, num_clusters)
        cluster_name = "%s_clusters" % num_clusters
        ignore_set_added.add(cluster_name)
        data, feature_names = clustering_module.add_new_data_to_rows(predicted, data, feature_names, [cluster_name])

    data, feature_names = clustering_module.add_new_data_to_rows(respondent_IDs, data, feature_names, ["ids"], "before")

    if list_or_dict == "dict":
        temp = dictionary_conversion.create_dictionary(data, feature_names)
        num_converted = dictionary_conversion.convert_values_to_int(temp)
        # Set of features that should be different due to being categorical
        ignore_set_changed = set(['busgrn', 'peopgrn', 'sex', 'race', 'topprob1', 'topprob2'])
        verdict = compare_respondent_dicts(respondent_IDs, num_converted, final_data_dict, ignore_set_changed, ignore_set_added)
        return num_converted, verdict
    elif list_or_dict == "list":
        return data, feature_names
def vary_border(pred_true,y,num_iter=101):
    """Grid-search three thresholds that turn scaled predictions into labels 1-4.

    Predictions are min-max scaled, then every threshold triple
    ``c1 < c2 < c3`` on an evenly spaced grid over ``[0, 1]`` with
    ``0.25 < c1 < 0.5`` and ``c3 < 0.9`` is scored with
    ``quadratic_weighted_kappa``.

    :param pred_true: Raw predictions; must provide ``.copy()`` and be
        accepted by ``MinMaxScaler.fit_transform``.
    :param y: True labels passed to ``quadratic_weighted_kappa``.
    :param num_iter: Number of grid points per threshold.
    :return: ``(best_pred, best_coef)`` for the first triple with the highest
        score above 0, or ``(None, None)`` when no triple scores above 0.
    :raises ZeroDivisionError: If ``num_iter`` is 1.
    """
    mms = MinMaxScaler()
    pred = mms.fit_transform(pred_true.copy())

    # Float denominator: on Python 2 integer division made every threshold 0 or 1.
    steps = float(num_iter - 1)
    best_score = 0
    best_pred = None
    best_coef = None
    for k1 in range(num_iter):
        c1 = k1 / steps
        if not (0.25 < c1 < 0.5):
            continue
        # Thresholds grow with the grid index, so starting past the previous
        # index enforces c1 < c2 < c3 without testing every combination.
        for k2 in range(k1 + 1, num_iter):
            c2 = k2 / steps
            for k3 in range(k2 + 1, num_iter):
                c3 = k3 / steps
                if c3 >= 0.9:
                    break
                tmp_pred = pred.copy()
                mask1 = tmp_pred < c1
                mask2 = (tmp_pred >= c1) & (tmp_pred < c2)
                mask3 = (tmp_pred >= c2) & (tmp_pred < c3)
                mask4 = tmp_pred >= c3
                tmp_pred[mask1] = 1
                tmp_pred[mask2] = 2
                tmp_pred[mask3] = 3
                tmp_pred[mask4] = 4
                score = quadratic_weighted_kappa(y, tmp_pred)
                if score > best_score:
                    best_score = score
                    best_coef = [c1, c2, c3]
                    best_pred = tmp_pred.copy()
    return best_pred, best_coef
def minmaxscaling(df):
    """Scale every column of ``df`` to the range [0, 1].

    Min-max scaling is sensitive to outliers, see
    https://stats.stackexchange.com/a/10298

    :param df: Data with samples in rows and features in columns.
    :return: ``(scaled data, fitted MinMaxScaler)``.
    """
    fitted_scaler = MinMaxScaler(feature_range=(0, 1))
    # The scaler expects features in columns and samples in rows, which is
    # how the data is already laid out.
    scaled = fitted_scaler.fit_transform(df)
    return scaled, fitted_scaler
def getips(conf, net, superpixels_num, layer='inner_product_target'):
    """Run ``net`` forward once per superpixel and collect scaled layer responses.

    :param conf: Configuration object; ``conf.model['number_of_negatives']`` is
        used when available, otherwise 1.
    :param net: Network exposing ``forward()`` and ``blobs[layer].data``.
    :param superpixels_num: Number of forward passes to run.
    :param layer: NOTE(review): this argument is overridden by the ``layer``
        command-line option below, so the passed value is never used -
        confirm whether that is intended.
    :return: Array with ``superpixels_num * number_of_negatives`` rows holding
        the min-max scaled responses.
    """
    options, _ = parser.parse_args()
    layer = options.layer
    data = net.blobs[layer].data
    feature_len = data.shape[1]
    try:
        negative_numbers = conf.model['number_of_negatives']
    except Exception:
        # Fall back to one negative when the setting is missing, without
        # swallowing KeyboardInterrupt/SystemExit like a bare except would.
        negative_numbers = 1

    reps = np.zeros((superpixels_num*negative_numbers, feature_len))
    for i in range(superpixels_num):
        if i%1000==1:
            print(i)
        net.forward()
        reps[i] = np.sum(net.blobs[layer].data, axis=1)

    reps_slice = reps[..., 0]
    from sklearn.preprocessing import MinMaxScaler
    clf = MinMaxScaler()
    reps_slice = clf.fit_transform(reps_slice)
    if negative_numbers > 1:
        reps_slice = np.square(reps_slice)
    # Write each scaled value back over its whole row of ``reps``.
    for i in range(reps_slice.shape[0]):
        reps[i] = reps_slice[i]
    return reps
def scale(self):
    """Scale ``self.vecs`` feature by feature, replacing it with the result.

    Dense data is min-max scaled with ``MinMaxScaler``. Sparse data is
    instead divided, column by column, by the column's sum, which keeps zero
    entries zero. The sparse result is stored in CSR format.
    """
    if not self.sparse:
        # Dense case: scikit-learn can do the work directly.
        self.vecs = MinMaxScaler(copy = False).fit_transform(self.vecs)
        return

    # Sparse case: transpose in LIL format so each feature (a column of the
    # original matrix) becomes a row that can be rescaled on its own.
    # TODO: Maybe look at profiling to ensure that this strategy really
    # is the least expensive one.
    by_feature = self.vecs.tolil().transpose()
    for feature_idx in range(by_feature.shape[0]):
        by_feature[feature_idx] /= by_feature[feature_idx].sum()
    self.vecs = by_feature.transpose().tocsr()
def runAlgorithm(data, categories, function, iterations = 5, num_partitions = 2):
    """Benchmark a feature-selection ``function`` with a 3-NN classifier.

    The data is min-max scaled, then for every iteration a fresh partition is
    drawn; each part is used once for training and the remaining parts for
    testing.

    :param data: Feature matrix.
    :param categories: Class label per sample.
    :param function: Callable ``(training_data, training_categ)`` returning
        ``(solution, train_rate)``, where ``solution`` selects columns.
    :param iterations: Number of random partitions to draw.
    :param num_partitions: Number of parts in each partition.
    :return: Array of shape ``(iterations * num_partitions, 4)`` with training
        rate (%), test rate (%), feature reduction (%) and elapsed seconds.
    """
    results_table = np.empty([iterations*num_partitions,4], dtype=float)
    data = MinMaxScaler().fit_transform(data)

    for i in range(iterations):
        # A random partition is drawn
        print("Iteration ", i)
        partition = makePartitions(data, categories, random_ppio)
        part_data, part_categ = partition[0], partition[1]

        for j in range(num_partitions):
            print("Sub iteration ", j)
            start = time.time()

            training_data = part_data[j]
            training_categ = part_categ[j]
            # Every part except the j-th one forms the test set.
            others = [k for k in range(num_partitions) if k != j]
            test_data = np.array(
                [part_data[k][l] for k in others for l in range(len(part_data[k]))], float)
            test_categ = np.array(
                [part_categ[k][l] for k in others for l in range(len(part_categ[k]))])

            solution, train_rate = function(training_data, training_categ)
            end = time.time()

            nbrs = neighbors.KNeighborsClassifier(3)
            nbrs.fit(training_data[:,solution],training_categ)
            rate = 100*nbrs.score(test_data[:,solution], test_categ)

            row = i*num_partitions+j
            results_table[row,0] = train_rate/len(training_data)*100
            results_table[row,1] = rate
            results_table[row,2] = (1 - sum(solution)/len(training_data[0]))*100
            results_table[row,3] = end-start
            print("Rate = " + str(rate) + "\nTime = " + str(end-start) + " s")

    return results_table
def analysis_7(df_Coredata):
    """Multivariate polynomial model.

    Standardizes the predictor columns, fits degree-2 polynomial features on
    them and prints the generated feature names.
    Reference: https://www.jeremyjordan.me/polynomial-regression/

    :param df_Coredata: DataFrame with columns 'd', 'e', 'f', 'g', 'i' and 'j'.
    """
    predictors = df_Coredata[['d','e','f','g','i']]
    target = df_Coredata['j']  # selected but not used further in this function

    # Specify the plot style
    sns.set(style = 'whitegrid', context = 'notebook')

    # Scale transformation of the data: standardization ...
    standardized = StandardScaler().fit_transform(predictors)
    # ... and min-max scaling (computed but not used further in this function)
    minmax_scaled = MinMaxScaler().fit_transform(predictors)

    poly = PolynomialFeatures(degree = 2)
    poly.fit(standardized)
    print(poly.get_feature_names())
def plot_prediction_relevance(results, EFA=True, classifier='ridge', rotate='oblimin', change=False, size=4.6, dpi=300, ext='png', plot_dir=None):
    """Box-plot each predictor's share of importance across all prediction targets.

    Absolute importances are min-max scaled within each target and then
    divided by the target's total, so every target's values sum to 1.

    :param results: Object providing ``load_prediction_object``.
    :param EFA, classifier, rotate, change: Forwarded to ``load_prediction_object``.
    :param size, ext: Accepted but not used by this function.
    :param dpi: Resolution used when the figure is saved.
    :param plot_dir: When given, the figure is saved there as
        ``prediction_relevance`` and closed.
    """
    loaded = results.load_prediction_object(EFA=EFA, change=change, classifier=classifier, rotate=rotate)
    predictions = loaded['data']
    targets = list(predictions.keys())
    predictors = predictions[targets[0]]['predvars']
    stacked = np.vstack([predictions[target]['importances'] for target in targets])
    importances = abs(stacked)

    # scale to 0-1 within each target (the scaler works on columns, hence the transposes)
    scaled_importances = MinMaxScaler().fit_transform(importances.T).T
    # make proportion
    row_totals = np.expand_dims(scaled_importances.sum(1), 1)
    scaled_importances = scaled_importances/row_totals

    # convert to a long-format dataframe for seaborn
    scaled_df = pd.DataFrame(scaled_importances, index=targets, columns=predictors)
    melted = scaled_df.melt(var_name='Factor', value_name='Importance')

    plt.figure(figsize=(8,12))
    f = sns.boxplot(y='Factor', x='Importance', data=melted, width=.5)
    if plot_dir is None:
        return
    filename = 'prediction_relevance'
    save_figure(f, path.join(plot_dir, filename), {'bbox_inches': 'tight', 'dpi': dpi})
    plt.close()
def uniform_to_normal(df, continuous_features):
    """Map uniformly distributed features of ``df`` onto a normal distribution.

    A feature counts as uniform when the Kolmogorov-Smirnov statistic of its
    min-max scaled values against the uniform CDF is below 0.05. Such features
    are rescaled to [0, 1] and passed through the normal quantile function.
    ``df`` is modified in place; rows that end up with NaN or infinite values
    are dropped.

    :param df: DataFrame to transform in place.
    :param continuous_features: Column names to examine.
    :return: Set of column names that were treated as uniform.
    """
    complete_rows = df[continuous_features].dropna()
    df_scaled = pd.DataFrame(MinMaxScaler().fit_transform(complete_rows), columns=continuous_features)

    alpha = 0.05
    uniform = set()
    for feature in continuous_features:
        statistic, _pvalue = kstest(df_scaled[feature], scipy.stats.uniform().cdf)
        if statistic < alpha:
            uniform.add(feature)

    # Features whose observed range nearly fills (0, 1), (0, 10) or (0, 100)
    # are assumed to really live on that interval.
    known_upper = {}
    for feature in uniform:
        lowest, highest = df[feature].min(), df[feature].max()
        if 0 < lowest < 0.001 and 0.999 < highest < 1:
            known_upper[feature] = 1
        elif 0 < lowest < 0.01 and 9.99 < highest < 10:
            known_upper[feature] = 10
        elif 0 < lowest < 0.1 and 99.9 < highest < 100:
            known_upper[feature] = 100

    for feature in uniform:
        if feature in known_upper:
            lower, upper = 0, known_upper[feature]
        else:
            lower, upper = df[feature].min(), df[feature].max()
        # df_scaled could be used instead, but the presumed true bounds should
        # give better results than the observed min and max.
        df[feature] = df[feature].map(lambda value: norm.ppf((value - lower) / (upper - lower)))

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    return uniform
def sdae_syn(X_s,P,h_layer,activations,noise,epoch,loss,batch_size):
    """Generate synthetic samples using a stacked de-noising autoencoder.

    Parameters
    ----------
    X_s: positive class samples (NumPy array); values must lie within 0 to 1
    P: over-sampling percentage
    h_layer: hidden layer sizes (list)
    activations: activation functions (list, same length as h_layer)
    noise: one of [None, Gaussian, mask]
    epoch: epochs for each layer (list, same length as h_layer)
    loss: 'rmse' or 'cross-entropy'
    batch_size: mini-batch size

    For more details on the input parameters see
    https://github.com/rajarsheem/libsdae

    Returns the output of the fitted model's ``transform`` applied to
    min-max scaled standard-normal noise.
    """
    sample_count = int(X_s.shape[0]*P/100)
    print("generating %d samples" % (sample_count))

    # Random seed points, squeezed into [0, 1] per feature.
    noise_shape = (sample_count, X_s.shape[1])
    X_init = MinMaxScaler().fit_transform(np.random.standard_normal(size=noise_shape))

    autoencoder = StackedAutoEncoder(dims=h_layer, activations=activations, noise=noise, epoch=epoch,loss=loss, batch_size=batch_size, lr=0.007, print_step=2000)
    autoencoder.fit(X_s)
    return autoencoder.transform(X_init)
def rank_to_dict(ranks, names, order=1, ratio=1):
    """Min-max scale ``ranks`` to [0, 1] and map each name to its rounded score.

    :param ranks: Sequence of raw scores.
    :param names: Names paired with ``ranks`` by position.
    :param order: Multiplier applied before scaling (``-1`` reverses the ranking).
    :param ratio: Accepted but not used by this function.
    :return: Dict of name to score rounded to two decimals.
    """
    # The scaler works on columns, so the scores are shaped as a single column.
    as_column = order * np.array([ranks]).T
    scaled = MinMaxScaler().fit_transform(as_column).T[0]
    if np.mean(scaled) == 0:
        scaled += 1
    rounded = [round(value, 2) for value in scaled]
    return dict(zip(names, rounded))
def train_model(feats_csv):
    """Train a 1-nearest-neighbour classifier and save it to ``../kNN_clfr/kNN.pkl``.

    The first CSV column is dropped, the last column is the label and the
    remaining columns are features. The 15 best features by ANOVA F-value
    are kept and min-max scaled before fitting.

    NOTE(review): only the classifier is persisted; the feature selector and
    the scaler are not, so callers must reproduce that preprocessing
    themselves before predicting.

    :param feats_csv: Path of the CSV file with features and labels.
    :return: None.
    """
    df = pd.read_csv(feats_csv).iloc[:,1:]
    y = np.ravel(df.iloc[:,-1:])
    X = np.array(df.iloc[:,:-1])

    # 15 best features using the ANOVA F-value score function (fitted once).
    X_new = SelectKBest(f_classif, k=15).fit_transform(X, y)

    # Preprocessing: scale every selected feature to [0, 1].
    X_new = MinMaxScaler().fit_transform(X_new)

    # kNN with Manhattan distance.
    model = KNeighborsClassifier(n_neighbors = 1,algorithm = 'brute',metric = 'manhattan',weights = 'uniform')
    model.fit(X_new,y)

    newdir = '../kNN_clfr'
    # Re-running training must not crash because the directory already exists.
    if not os.path.isdir(newdir):
        os.mkdir(newdir)
    joblib.dump(model, os.path.join(newdir,'kNN.pkl'))
    return
def readTrainingData():
    """Load ``data/training.csv`` and split it into train / cross-valid / test parts.

    Column 32 is the label ('s' becomes 1, anything else 0), column 31 the
    sample weight and columns 1-30 the features, which are scaled with
    ``MMS`` before splitting. With a fixed seed of 42, rows drawing
    ``r <= 0.4`` form the training set and rows drawing ``r > 0.7`` the
    validation pool, which is split in half again. Rows in between are
    not returned.

    :return: ``[xTrain, yTrain, wTrain, xCrossValid, yCrossValid, wCrossValid,
        xTestValid, yTestValid, wTestValid]``.
    """
    raw = np.loadtxt(
        'data/training.csv',
        delimiter=',',
        skiprows=1,
        converters={32: lambda x:int(x=='s'.encode('utf-8')) })

    allY = raw[:, 32]
    allW = raw[:, 31]
    allX = MMS().fit_transform(raw[:, 1:31])

    np.random.seed(42)
    r = np.random.rand(allY.shape[0])
    in_train = r <= 0.4
    in_valid = r > 0.7

    xTrain, yTrain, wTrain = allX[in_train], allY[in_train], allW[in_train]
    xValid, yValid, wValid = allX[in_valid], allY[in_valid], allW[in_valid]

    # Second draw: halve the validation pool into cross-validation and test parts.
    v = np.random.rand(yValid.shape[0])
    in_cross = v <= 0.5
    in_test = v > 0.5

    return [xTrain, yTrain, wTrain,
            xValid[in_cross], yValid[in_cross], wValid[in_cross],
            xValid[in_test], yValid[in_test], wValid[in_test]]