x_std[7] = (7-1)/(7-1)
'''
print('---------- Standard Scaling -----------')
# StandardScaler
standard = preprocessing.StandardScaler().fit(x)
print(standard.transform(x))
'''
x_std[0] = (1-4)/np.std(x[:, 0])
'''
print('--------------------------------------')

# Binarizer scaling
'''
Applies a threshold to each value, e.g. as in a neural net activation.
'''
print('------------ Binarizer --------------------')
print(preprocessing.Binarizer(threshold=3.0).fit(x).transform(x))
print('-------------------------------------------')

# Normalize
'''
x0 = 1
norm0 = math.sqrt(1 + 4 + 9)
x0 / norm0
'''
print('--------- Normalize ------------')
print(preprocessing.Normalizer().fit(x).transform(x))
print('--------------------------------')
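# A minimal, self-contained check of the formulas sketched in the docstrings
# above. The 3x3 matrix is an illustrative stand-in, since this fragment does
# not show how `x` is built.
import numpy as np
from sklearn import preprocessing

x_demo = np.array([[1., 2., 3.],
                   [4., 5., 6.],
                   [7., 8., 9.]])
# Binarizer: strictly greater than the threshold -> 1, otherwise 0.
assert np.array_equal(preprocessing.Binarizer(threshold=3.0).fit_transform(x_demo),
                      (x_demo > 3.0).astype(float))
# Normalizer (default L2): each row divided by its own norm, e.g. sqrt(1+4+9) for the first row.
assert np.allclose(preprocessing.Normalizer().fit_transform(x_demo),
                   x_demo / np.linalg.norm(x_demo, axis=1, keepdims=True))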
def _setBinarizer(self, a_df: pd.DataFrame = pd.DataFrame()) -> pd.DataFrame:
    return pd.DataFrame(preprocessing.Binarizer(threshold=1.4).transform(a_df),
                        columns=a_df.columns, index=a_df.index)
# Standardization (mean removal)
data_standardized = preprocessing.scale(data)
print("\nMean =", data_standardized.mean(axis=0))
print("Std deviation =", data_standardized.std(axis=0))

# Min-max scaling (rescale each feature to [0, 1])
data_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaled = data_scaler.fit_transform(data)
print("\nMin max scaled data =", data_scaled)

# Normalization (L1 norm per sample)
data_normalized = preprocessing.normalize(data, norm='l1')
print("\nL1 normalized data =", data_normalized)

# Binarization
data_binarized = preprocessing.Binarizer(threshold=1.4).transform(data)
print("\nBinarized data =", data_binarized)

# One-hot encoding
encoder = preprocessing.OneHotEncoder()
encoder.fit([[0, 2, 1, 12], [1, 3, 5, 3], [2, 3, 2, 12], [1, 2, 4, 3]])
encoded_vector = encoder.transform([[2, 3, 5, 3]]).toarray()
print("\nEncoded vector =", encoded_vector)
'''
Analysis: there are 4 features (columns).
The first feature (first column) is [0, 1, 2, 1] with three distinct values [0, 1, 2],
so one-hot encoding maps them to [100, 010, 001].
Likewise, the second feature's two values [2, 3] become [10, 01],
the third feature's four values [1, 2, 4, 5] become [1000, 0100, 0010, 0001],
and the fourth feature's two values [3, 12] become [10, 01].
Hence [2, 3, 5, 3] is encoded as [0,0,1,0,1,0,0,0,1,1,0].
'''
import pandas as pd
import sklearn.ensemble as se
import sklearn.metrics as sm
import sklearn.model_selection as ms
import sklearn.preprocessing as sp
import numpy as np
import sklearn.svm as svm

train_data = pd.read_csv('../train.csv')
test_data = pd.read_csv('../test.csv')
train_data_y = train_data['label'].values
print(type(train_data_y))
train_data_x = train_data.drop('label', axis=1)
print(type(train_data_x))

one_zero = sp.Binarizer(threshold=0)
test_data = one_zero.transform(test_data)
train_data_x = one_zero.transform(train_data_x)

train_x, test_x, train_y, test_y = ms.train_test_split(
    train_data_x, train_data_y, test_size=0.15, random_state=4)

# params = [{'max_depth': [35, 40], 'n_estimators': [1600, 1800]}]
# model = ms.GridSearchCV(se.RandomForestClassifier(random_state=4), params, cv=3)
# model.fit(train_x, train_y)
# for param, score in zip(model.cv_results_['params'], model.cv_results_['mean_test_score']):
#     print(param, score)
# print(model.best_params_)
# print(model.best_score_)
# print(model.best_estimator_)
# model = se.RandomForestClassifier(max_depth=35, n_estimators=1800, random_state=3)

model = svm.SVC(kernel='poly', degree=7)
def binarization():
    data_binarized = preprocessing.Binarizer(threshold=1.4).transform(input_data)
    print("\nBinarized data =", data_binarized)
if __name__ == '__main__':
    ls = lstm(7, 7, 10)
    train_data = reberGrammar.get_n_embedded_examples(1000)
    error = []
    t1 = time.clock()
    for i in xrange(60):
        print '\n', i, '/60'
        err = 0
        for x, y in train_data:
            tmp = ls.train(x, y)
            err += tmp
            print tmp, '\r',
        # print ls.predict(train_data[0][0])
        error.append(err)
    print 'time:', time.clock() - t1

    plt.plot(np.arange(60), error, 'b-')
    plt.xlabel('epochs')
    plt.ylabel('error')
    plt.show()
    print error

    test_data = reberGrammar.get_n_embedded_examples(100)
    binarizer = preprocessing.Binarizer(threshold=0.1)
    error = 0
    for x, y in test_data:
        y_pred = ls.predict(x)
        # print y_pred
        y_pred = binarizer.transform(y_pred)
        for a, b in zip(y, y_pred):
            error += np.mean(a - b)
    print error
def fit_transform(self, dataset):
    """
    Transform a dataframe <dataset> into a feature matrix.

    params:
        dataset : pandas DataFrame, the input dataset

    Returns a matrix of N samples x M features.
    """
    ### First step: select some fields we care about; all of these are numeric,
    ### so we can just pick them out.
    data = np.array(dataset[[
        'age', 'NumberOfDependents'
        # , 'NumberOfOpenCreditLinesAndLoans', 'RevolvingUtilizationOfUnsecuredLines'
    ]])

    # Can add in additional features here
    # dataset['age_to_linescredit'] = dataset['age'] / dataset['RevolvingUtilizationOfUnsecuredLines']
    # dataset['debt_to_monthlyincome'] = dataset['DebtRatio'] / dataset['MonthlyIncome']  # debt ratio to monthly income relationship
    ## You may want to perform some more interesting transformations of the data,
    ## for example, ratios:
    # dataset['dollar_per_year'] = dataset['MonthlyIncome'] / dataset['age']

    ## One preprocessing step we will need to perform is imputation: fill in missing values.
    imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    data = imputer.fit_transform(data)
    # return np.hstack([data])

    ## Scaling features may be important if you have very large outliers or need
    ## more interpretable coefficients.
    scaler = preprocessing.StandardScaler()
    scaled_income = scaler.fit_transform(data[:, [1]])  # Added by Will; column slice kept 2-D for sklearn
    # scaled_debtratio = scaler.fit_transform(data[:, [1]])
    data = np.column_stack([data, scaled_income])

    ## Turning features into discrete features is important if you are using a linear
    ## classifier but the underlying data does not have a linear relationship.
    ## NOTE: with its default threshold the Binarizer turns everything > 0 into 1 and
    ## everything <= 0 into 0, so use the StandardScaler first.
    binarizer = preprocessing.Binarizer()
    # binned_income = binarizer.fit_transform(scaled_income)
    binned_numberoftimes90dayslate = binarizer.fit_transform(dataset[['NumberOfTimes90DaysLate']])
    binned_numberoftime3059dayspastduenotworse = binarizer.fit_transform(dataset[['NumberOfTime30-59DaysPastDueNotWorse']])
    binned_numberoftime6089dayspastduenotworse = binarizer.fit_transform(dataset[['NumberOfTime60-89DaysPastDueNotWorse']])
    binned_numberofopencreditlinesandloans = binarizer.fit_transform(dataset[['NumberOfOpenCreditLinesAndLoans']])
    binned_revolvingutilizationofunsecuredlines = binarizer.fit_transform(dataset[['RevolvingUtilizationOfUnsecuredLines']])
    binned_debtratio = binarizer.fit_transform(.5 - dataset[['DebtRatio']])  # Default

    # column_stack is pretty much an append
    # data = np.column_stack([data, binned_income])
    data = np.column_stack([data, binned_numberoftimes90dayslate])
    data = np.column_stack([data, binned_numberoftime3059dayspastduenotworse])
    data = np.column_stack([data, binned_numberoftime6089dayspastduenotworse])
    data = np.column_stack([data, binned_numberofopencreditlinesandloans])
    data = np.column_stack([data, binned_revolvingutilizationofunsecuredlines])
    data = np.column_stack([data, binned_debtratio])
    return data
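# A short sketch of the NOTE above: standardizing first makes the Binarizer's
# default threshold of 0 equivalent to asking "is this value above the column
# mean?". The toy income column is illustrative, not part of the original dataset.
import numpy as np
from sklearn import preprocessing

income = np.array([[1200.], [3500.], [2100.], [9000.]])
scaled = preprocessing.StandardScaler().fit_transform(income)
binned = preprocessing.Binarizer().fit_transform(scaled)  # threshold defaults to 0.0
print(binned.ravel())                                     # [0. 0. 0. 1.]
print((income.ravel() > income.mean()).astype(float))     # the same split at the mean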
import pandas as pd
import numpy as np
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

train_data = pd.read_csv("/home/amitoj/Downloads/data.csv")
del train_data['id']

X = train_data[train_data.columns[1:30]].values
Y = train_data.loc[:, ['diagnosis']].values
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# Binarize the features with the same threshold for train and test so the
# Bernoulli model sees consistent 0/1 inputs.
binarizer = preprocessing.Binarizer(threshold=1.0)
X_Train = binarizer.fit_transform(X_train)
print(X_Train)
X_Test = binarizer.transform(X_test)
print(X_Test)

Y_train = np.ravel(np.array(y_train))
Y_test = np.ravel(np.array(y_test))
le = preprocessing.LabelEncoder()
Y_Train = le.fit_transform(Y_train)
Y_Test = le.transform(Y_test)
# print(Y_Train)
# print(len(Y_Test))

Nb_clf = BernoulliNB()
print(Nb_clf.fit(X_Train, Y_Train))
prediction = Nb_clf.predict(X_Test)
# -*- coding: utf-8 -*-
"""
demo04_bin.py  Binarization
"""
from __future__ import unicode_literals
import numpy as np
import sklearn.preprocessing as sp

raw_samples = np.array([[17., 100., 4000],
                        [20., 80., 5000],
                        [23., 75., 5500]])

# Binarizer: values strictly greater than the threshold become 1, the rest 0.
bin = sp.Binarizer(threshold=80)
r = bin.transform(raw_samples)
print(r)

# Equivalent manual masking.
raw_samples[raw_samples <= 80] = 0
raw_samples[raw_samples > 80] = 1
print(raw_samples)
import pandas as pd

df = pd.DataFrame(features, columns=['features_1', 'features_2'])
print(df.apply(add_ten))

# Handling outliers
houses = pd.DataFrame()
houses['Price'] = [534433, 392333, 293222, 4322032]
houses['Bathrooms'] = [2, 3.5, 2, 116]
houses['Square_Feet'] = [1500, 2500, 1500, 48000]
houses_re = houses[houses['Bathrooms'] < 20]                   # drop the outlier rows
houses["outlier"] = np.where(houses["Bathrooms"] < 20, 0, 1)   # or flag them instead
print(houses)

# Discretization: binarize age at a threshold of 18
age = np.array([[6], [12], [20], [36], [65]])
binarizer = preprocessing.Binarizer(threshold=18)
print(binarizer.fit_transform(age))

# Multiple thresholds
print(np.digitize(age, bins=[20, 30, 64]))

# K-means clustering
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

features, _ = make_blobs(n_samples=50, n_features=2, centers=3, random_state=1)
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])
clusterer = KMeans(3, random_state=0)
clusterer.fit(features)
dataframe['group'] = clusterer.predict(features)
print(dataframe.head(5))

# Dropping missing values
def main():
    dataframe = extractData()  # pd.DataFrame([[1, "v", 2], [1, 2, 3]], columns=["name1", "name2", "name3"])
    # 'Binarizer', preprocessing.Binarizer().fit_transform()
def create_binary_cols(df, cols=[], thresh=0.1):
    # For each selected column, add a companion "<col>_bin" column that is 1
    # where the value exceeds `thresh` and 0 otherwise.
    cols = cols or df.columns
    for c in cols:
        binzr = preprocessing.Binarizer(threshold=thresh).fit(df[[c]])  # 2-D input for sklearn
        df[c + "_bin"] = binzr.transform(df[[c]]).ravel()
    return df
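# A quick usage sketch for the helper above; the toy frame and column names are
# made up for illustration.
import pandas as pd
from sklearn import preprocessing

scores = pd.DataFrame({"a": [0.05, 0.2, 0.4], "b": [0.0, 0.15, 0.09]})
out = create_binary_cols(scores, cols=["a", "b"], thresh=0.1)
print(out[["a_bin", "b_bin"]])  # a_bin: 0, 1, 1   b_bin: 0, 1, 0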
np.set_printoptions(precision=3)
print('MinMaxScaler transformed data:\n{0}\n'.format(rescaledX[0:5, :]))

# ----------------------------------------
# multiple transformations
# ----------------------------------------
scalers = dict()

# transform so that the smallest value is 0 and the largest is 1
scaler = pp.MinMaxScaler(feature_range=(0, 1)).fit(X)
scalers['MinMaxScaler'] = (scaler, scaler.transform(X))

# transform so that the data is standard normal Gaussian; i.e., mean of 0 and
# standard deviation of 1
scaler = pp.StandardScaler().fit(X)
scalers['StandardScaler'] = (scaler, scaler.transform(X))

# transform so that each observation (row) has a length of 1 (unit vector)
scaler = pp.Normalizer().fit(X)
scalers['Normalizer'] = (scaler, scaler.transform(X))

# transform so that all values above the threshold are 1 and all values at or
# below the threshold are 0
scaler = pp.Binarizer(threshold=2).fit(X)
scalers['Binarizer'] = (scaler, scaler.transform(X))

# display results of the transformations
for name, (fitted, transformed) in scalers.items():
    print('{0} transformed data:\n{1}\n'.format(name, transformed[0:5, :]))
data = df['amount']  # the column to be clustered, named "amount"
data_reshape = data.values.reshape((data.shape[0], 1))  # reshape to a 2-D array
model_kmeans = KMeans(n_clusters=4, random_state=0)  # KMeans model with 4 clusters
kmeans_result = model_kmeans.fit_predict(data_reshape)  # fit and assign clusters
df['amount2'] = kmeans_result  # merge the discretized data back into the frame
print(df.head(5))  # print the first 5 rows

# Method 3: discretization by quartiles
df['amount3'] = pd.qcut(df['amount'], 4,
                        labels=['bad', 'medium', 'good', 'awesome'])  # split at the quartiles
df = df.drop('amount', axis=1)  # drop the column named "amount"
print(df.head(5))  # print the first 5 rows

# Binarization of continuous data
binarizer_scaler = preprocessing.Binarizer(
    threshold=df['income'].mean())  # Binarizer with the column mean as the threshold
income_tmp = binarizer_scaler.fit_transform(
    df['income'].values.reshape(-1, 1))  # binarize (sklearn expects 2-D input)
income_tmp.resize(df['income'].shape)  # reshape back to a 1-D column
df['income'] = income_tmp  # write the binarized values back
print(df.head(5))  # print the first 5 rows

####################################################################
# 3.12.1 Parsing web page data
# Imports
import requests  # for HTTP requests
from bs4 import BeautifulSoup  # for HTML parsing
import re  # regular expressions to go with the HTML lookups
import time  # used in saved file names

# Get the total number of pages
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import preprocessing

dataset = pandas.read_csv('mlExcel_2.csv', header=0)
originalHeaders = list(dataset.columns.values)
array = dataset.values

# convert the "winning" column to binary
Y = array[:, -1].astype(float)
Y = Y.reshape(-1, 1)
binarizer = preprocessing.Binarizer().fit(Y)
Y = binarizer.transform(Y)
Y = Y.reshape(len(Y))

dataset = dataset._get_numeric_data()
numericHeaders = list(dataset.columns.values)
array = dataset.values
X = array[:, 0:-1]

validationSize = 0.20
seed = 7
scoring = 'accuracy'
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validationSize, random_state=seed)
print("origin data") print(np.mean(features, axis=0)) print(np.std(features, axis=0)) features_new = preprocessing.StandardScaler().fit_transform(features) # print(features_new) print(np.mean(features_new, axis=0)) print(np.std(features_new, axis=0)) """1.1.2 区间缩放:将特征值缩放到[0, 1]区间的数据(对列向量处理)""" features_new = preprocessing.MinMaxScaler().fit_transform(features) print("max mean") print(np.mean(features_new, axis=0)) """1.1.3 归一化:将行向量转化为“单位向量”(对每个样本处理)""" features_new = preprocessing.Normalizer().fit_transform(features) print(features_new) """1.2 对定量特征二值化:设定一个阈值,大于阈值的赋值为1,小于等于阈值的赋值为0""" features_new = preprocessing.Binarizer(threshold=6).fit_transform(features) print(features_new) """ 1.3 对定性(分类)特征编码 1.3.1 one-hot (也可用pandas.get_dummies函数) 1.3.2 label-encoder (略) """ enc = preprocessing.OneHotEncoder() enc.fit(features) result = preprocessing.OneHotEncoder().fit_transform(features) print(features[0:5]) print(result[0:5].toarray()) print(enc.transform([[0, 1, 3, 1]]).toarray()) """# 1.4 缺失值计算(也可用pandas.fillna函数)""" imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0) null_data = vstack((array([nan, nan, nan, nan]), features))
def HSBF_main(mode, clf_index, runtimes):
    pwd = os.getcwd()
    print(pwd)
    father_path = os.path.abspath(os.path.dirname(pwd) + os.path.sep + ".")
    # print(father_path)
    datapath = father_path + '/dataset-inOne/'
    spath = father_path + '/results/'
    if not os.path.exists(spath):
        os.mkdir(spath)

    # datasets = [['ant-1.3.csv', 'arc-1.csv', 'camel-1.0.csv', 'ivy-1.4.csv', 'jedit-3.2.csv', 'log4j-1.0.csv', 'lucene-2.0.csv', 'poi-2.0.csv', 'redaktor-1.csv', 'synapse-1.0.csv', 'tomcat-6.0.389418.csv', 'velocity-1.6.csv', 'xalan-2.4.csv', 'xerces-init.csv'],
    #             ['ant-1.7.csv', 'arc-1.csv', 'camel-1.6.csv', 'ivy-2.0.csv', 'jedit-4.3.csv', 'log4j-1.1.csv', 'lucene-2.0.csv', 'poi-2.0.csv', 'redaktor-1.csv', 'synapse-1.2.csv', 'tomcat-6.0.389418.csv', 'velocity-1.6.csv', 'xalan-2.6.csv', 'xerces-1.3.csv'],
    #             ['EQ.csv', 'JDT.csv', 'LC.csv', 'ML.csv', 'PDE.csv'],
    #             ['Apache.csv', 'Safe.csv', 'Zxing.csv']]
    datasets = [['ant-1.3.csv', 'arc-1.csv', 'camel-1.0.csv'],
                ['Apache.csv', 'Safe.csv', 'Zxing.csv']]
    datanum = 0
    for i in range(len(datasets)):
        datanum = datanum + len(datasets[i])
    # print(datanum)

    # mode = [preprocess_mode, train_mode, save_file_name]
    preprocess_mode = mode[0]
    train_mode = mode[1]
    save_file_name = mode[2]
    df_file_measures = pd.DataFrame()  # the measures of all files over all runtimes
    classifiername = []
    # file_list = os.listdir(fpath)

    n = 0
    for i in range(len(datasets)):
        for file_te in datasets[i]:
            n = n + 1
            print('----------%s:%d/%d------' % ('Dataset', n, datanum))
            # print('testfile', file_te)
            start_time = time.time()

            Address_te = datapath + file_te
            Samples_te = NumericStringLabel2BinaryLabel(Address_te)  # DataFrame
            data = Samples_te.values  # DataFrame -> array
            X = data[:, :-1]  # test features
            y = data[:, -1]  # test labels
            Sample_tr0 = NumericStringLabel2BinaryLabel(Address_te)
            column_name = Sample_tr0.columns.values  # the column names of the data

            df_r_measures = pd.DataFrame()  # the measures of each file over the runtimes
            for r in range(runtimes):
                if train_mode == 'M2O_CPDP':  # the training data comes from files other than the test project
                    X_test = X
                    y_test = y
                    Samples_tr_all = pd.DataFrame()  # initialize the candidate training data of all CP data
                    trfilelist = []
                    for file_tr in datasets[i]:
                        if file_tr != file_te:
                            # print('train_file:', file_tr)
                            Address_tr = datapath + file_tr
                            trfilelist.append(Address_tr)
                            Samples_tr = NumericStringLabel2BinaryLabel(Address_tr)  # original train data, DataFrame
                            Samples_tr.columns = column_name.tolist()  # rename the columns in one go
                            Samples_tr_all = pd.concat([Samples_tr_all, Samples_tr],
                                                       ignore_index=False, axis=0, sort=False)
                    # Samples_tr_all.to_csv(f2, index=None, columns=None)  # save the binary-label data, keeping the column names and adding no row index

                    # random sample 90% negative samples and 90% positive samples
                    # string = 'bug'
                    # Sample_tr_pos, Sample_tr_neg, Sample_pos_index, Sample_neg_index \
                    #     = Random_Stratified_Sample_fraction(Samples_tr_all, string, r=r)
                    # Sample_tr = np.concatenate((Sample_tr_neg, Sample_tr_pos), axis=0)  # vertical array concatenation
                    # data_train_unique = Drop_Duplicate_Samples(pd.DataFrame(Sample_tr))  # drop duplicate samples
                    # source = data_train_unique.values
                    # target = np.c_[X_test, y_test]

                    # *******************HSBF*********************************
                    method_name = mode[1] + '_' + mode[2]  # scenario + filter method
                    print('----------%s:%d/%d------' % (method_name, r + 1, runtimes))
                    df_filter_time, X_train_new, y_train_new = HSBF(
                        trfilelist, Address_te, k1=10, k2=20, r=r)
                    y_train_new = preprocessing.Binarizer(
                        threshold=0).transform(y_train_new.reshape(-1, 1))

                    # Train model: the classifier / model requires the labels to belong to {0, 1}.
                    modelname_hsbf, model_hsbf = Selection_Classifications(clf_index, r)  # select classifier
                    classifiername.append(modelname_hsbf)
                    # print("modelname:", modelname)
                    measures_hsbf = Build_Evaluation_Classification_Model(
                        model_hsbf, X_train_new, y_train_new, X_test, y_test)  # build and evaluate models
                    end_time = time.time()
                    run_time = end_time - start_time
                    measures_hsbf.update({
                        'train_len_before': len(Samples_tr_all),
                        'train_len_after': len(X_train_new),
                        'test_len': len(X_test),
                        'runtime': run_time,
                        'clfindex': clf_index,
                        'clfname': modelname_hsbf,
                        'testfile': file_te,
                        'trainfile': 'More1',
                        'runtimes': r + 1
                    })
                    df_m2ocp_measures = pd.DataFrame(measures_hsbf, index=[r])
                    # print('df_m2ocp_measures:\n', df_m2ocp_measures)
                    df_r_measures = pd.concat([df_r_measures, df_m2ocp_measures],
                                              axis=0, sort=False, ignore_index=False)
                else:
                    pass

            # print('df_file_measures:\n', df_file_measures)
            # print('results of one run over all files:\n', df_file_measures)
            df_file_measures = pd.concat([df_file_measures, df_r_measures],
                                         axis=0, sort=False,
                                         ignore_index=False)  # the measures of all files over the runtimes
            # df_r_measures['testfile'] = file_list
            # print('df_r_measures1:\n', df_r_measures)

    modelname = np.unique(classifiername)
    # pathname = spath + '\\' + (save_file_name + '_clf' + str(clf_index) + '.csv')
    pathname = spath + '\\' + (save_file_name + '_' + modelname[0] + '.csv')
    df_file_measures.to_csv(pathname)
    # print('df_file_measures:\n', df_file_measures)
    return df_file_measures
# 2. Min max scaling X_mm
minmax_scaler = preprocessing.MinMaxScaler().fit(X)
X_mm = minmax_scaler.transform(X)

# let's compare the min and max of the first feature of X and X_mm
print("compare min and max of first features of X and X_mm")
f1_X = X[:, 0]
print('%.2f %.2f' % (np.min(f1_X), np.max(f1_X)))
f1_Xmm = X_mm[:, 0]
print('%.2f %.2f' % (np.min(f1_Xmm), np.max(f1_Xmm)))

# 3. Binarizing X_Binarize
X_Binarize = preprocessing.Binarizer(threshold=0.0).fit(X).transform(X)

# let's compare the first sample of X and X_Binarize
print("compare Binarizing of first features of X and X_Binarize")
f1_X = X[0]
print(f1_X)
f1_X_Binarize = X_Binarize[0]
print(f1_X_Binarize)

# 4. Normalizing
X_Normalize = preprocessing.Normalizer().fit(X).transform(X)

# let's compare the first sample of X and X_Normalize
print("compare Normalizing of first features of X and X_Normalize")
f1_X = X[0]
print(f1_X)
def preprocessing_module(Extracted_Features, Coma_Features, Corrected_Features, Norm, ontology):
    # Replace tab-separated csv with comma-separated csv and replace categorical variables with an integer index
    lvltrace.lvltrace("LVLEntree dans preprocessing_module data_preproc")
    onto = open(ontology, "w")
    writer = csv.writer(onto, lineterminator=',')
    class_number = 1
    onto.write("Iteration,Class,Class_number,Neuron_name\n")
    Iteration = 1
    for root, dirs, files in os.walk(Extracted_Features):
        for i in files:
            if not i.startswith('.'):
                # LVLprint i
                input_i = Extracted_Features + i
                output_i = Coma_Features + i
                file = open(output_i, "w")
                writer = csv.writer(file, lineterminator=',')
                lines = tools.file_lines(input_i) + 1
                ncol = tools.file_col(input_i) - 1
                for line in xrange(lines):
                    for col in xrange(ncol):
                        if line == 0:
                            if col == 1:
                                # Skipping neuron names
                                # print "skip neuron name"
                                # to be clean this condition should be inverted
                                laurent = 1
                            else:
                                file.write("%s," % tools.read_csv_tab(input_i, col, line))
                        else:
                            if col == 0:
                                # replace class names by an integer
                                file.write("%i," % class_number)
                            elif col == 1:
                                # print "skip neuron name"
                                onto.write("%i,%s,%i,%s\n" % (Iteration, i, class_number,
                                                              tools.read_csv_tab(input_i, col, line)))
                                Iteration = Iteration + 1
                            else:
                                file.write("%s," % tools.read_csv_tab(input_i, col, line))
                    file.write("\n")
                file.close()
                class_number = class_number + 1
                if lines > 3:
                    input_file = Coma_Features + i
                    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1),
                                      skiprows=1)  # ncol-1 because we skip the class names
                    X = data[:, :ncol]
                    y = data[:, 0].astype(np.int)  # Labels (class)
                    # Replace missing 'nan' values by the column mean
                    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
                    imp.fit(X)
                    Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
                    # Output with the "NaN" values replaced
                    Y = imp.transform(X)
                    # Data standardization
                    if Norm == 'normalize':
                        Z = preprocessing.normalize(Y, axis=0, norm='l2')  # Normalize
                    elif Norm == 'binarize':
                        binarizer = preprocessing.Binarizer().fit(Y)  # Binarize for Bernoulli
                        Z = binarizer.transform(Y)
                    elif Norm == 'standardize':
                        min_max_scaler = preprocessing.MinMaxScaler()  # Rescale the data to [0, 1]
                        Z = min_max_scaler.fit_transform(Y)
                    else:
                        Z = preprocessing.scale(Y)  # Scaling
                    # Create new files with the corrected and standardized data
                    output_file = Corrected_Features + i
                    file = open(output_file, "w")
                    writer = csv.writer(file, lineterminator=',')
                    for line_1 in xrange(lines - 1):
                        for col_1 in xrange(ncol - 1):
                            if col_1 == 0:
                                file.write("%s," % y[line_1])
                            else:
                                file.write("%f," % Z[line_1, col_1])
                        file.write("\n")
                    file.close()
                else:
                    # print "skip class"
                    # We skip classes with not enough data
                    # to be clean this condition should be inverted
                    laurent = 1
    onto.close()
    lvltrace.lvltrace("LVLSortie dans preprocessing_module data_preproc")
var = ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian',
       'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']
"""
for v in var:
    print('\nFrequency count for variable %s' % v)
    print(dataset[v].value_counts())
"""

# label encode
from sklearn.preprocessing import LabelEncoder

var_to_encode = ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian',
                 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']
for col in var_to_encode:
    dataset[col] = LabelEncoder().fit_transform(dataset[col])

# Binarize G3: G3 <= 11 -> 0, G3 > 11 -> 1
dataset[['G3']] = preprocessing.Binarizer(threshold=11).transform(dataset[['G3']])

x = dataset[dataset.columns.drop('G3')]
y = dataset['G3']

# divide the dataset into a train set and a test set; the test set is 0.33 of the data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=0)

# -------------------------------- END OF PREPROCESSING --------------------------
from sklearn import neighbors as nb
kmax = 50
[35]
# 5- Feature Creation
# A feature was created to combine the population size and the house age
dframe['FamilySize'] = dframe['Population'] + dframe['HouseAge']
dframe.head()

[38]
# 6- Discretization and Binarization
from sklearn import preprocessing
# Fit on the column values (not the column name string) and set the threshold up front.
binarizer = preprocessing.Binarizer(threshold=3.50).fit(dframe[['Population']])
print(binarizer)

[50]
# 7- Attribute Transformation
def draw_missing_data_table(dframe):
    total = dframe.isnull().sum().sort_values(ascending=False)
    percent = (dframe.isnull().sum() / dframe.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return missing_data
from sklearn import preprocessing

boston = datasets.load_boston()

# Similar to scaling, there is both a function and a class:
#   preprocessing.binarize
#   preprocessing.Binarizer
new_target = preprocessing.binarize(boston.target.reshape(-1, 1),
                                    threshold=boston.target.mean())
print("New Target after Binarization :")
print(new_target[:5])
print("To Verify :")
print((boston.target[:5] > boston.target.mean()).astype(int))

bin = preprocessing.Binarizer(threshold=boston.target.mean())
new_target = bin.fit_transform(boston.target.reshape(-1, 1))
print(new_target[:5])

# -----------------------------------------------
# special case for sparse matrices: the threshold cannot be less than zero
from scipy.sparse import coo
spar = coo.coo_matrix(np.random.binomial(1, 0.25, 100))
# preprocessing.binarize(spar, threshold=-1)

# -------------------------------------------------
# working with categorical variables
iris = datasets.load_iris()
X = iris.data
y = iris.target
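# A minimal sketch of the sparse-matrix caveat noted above: binarize() rejects a
# negative threshold on sparse input, since that would turn the implicit zeros
# into ones. The toy matrix is illustrative.
import numpy as np
from scipy import sparse
from sklearn import preprocessing

spar_demo = sparse.coo_matrix(np.random.binomial(1, 0.25, (10, 10)))
print(preprocessing.binarize(spar_demo, threshold=0.5).toarray())  # fine: zeros stay zero
try:
    preprocessing.binarize(spar_demo, threshold=-1)
except ValueError as e:
    print("negative threshold on sparse input is rejected:", e)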
import numpy as np
from sklearn import preprocessing

# Define the sample data
input_data = np.array([[3.1, 2.9, 3.3],
                       [-1.2, 7, 6.1],
                       [3.13, -3.13, 20],
                       [7.28, -9.9, -2.5]])

# Binarize the data
data_binarized = preprocessing.Binarizer(threshold=2.1).transform(input_data)
print("\nBinarized data : \n", data_binarized)

# Print the mean and standard deviation
print("\n BEFORE : ")
print("Mean = ", input_data.mean(axis=0))
print("Std deviation = ", input_data.std(axis=0))

# Mean removal
data_scaled = preprocessing.scale(input_data)
print("\n AFTER : ")
print("Mean = ", data_scaled.mean(axis=0))
print("Std deviation = ", data_scaled.std(axis=0))

# Scaling
# Min/max scaling
data_scaler_minmax = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaled_minmax = data_scaler_minmax.fit_transform(input_data)
print("\nMin max scaled data : \n", data_scaled_minmax)

# Normalization
# L1-normalize the data
data_normalized_l1 = preprocessing.normalize(input_data, norm='l1')
# Creating a new feature to flag outliers:
houses["Outlier"] = np.where(houses["Bathrooms"] < 20, 0, 1)

# Adding a new feature based on the square footage
houses["Log_Of_Square_Feet"] = [np.log(x) for x in houses["Square_Feet"]]
print(houses)

# Discretizing features (creating categories)
age = np.array([[6], [12], [20], [36], [65]])
binarizer = preprocessing.Binarizer(threshold=20)
print(binarizer.fit_transform(age))

# Using multiple thresholds
print(np.digitize(age, bins=[20, 30, 64], right=True))
# bin 0 --> LESS than 20; with right=True --> LESS THAN OR EQUAL TO 20

# Grouping observations using clustering
# Make a simulated feature matrix
features, _ = make_blobs(n_samples=50, n_features=2, centers=3, random_state=1)
dataframe = pandas.DataFrame(features, columns=["feature_1", "feature_2"])

# Make the k-means clusterer
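# A quick check of the multi-threshold comment above, using the same ages as the
# `age` array; the expected output is shown as a comment.
import numpy as np

ages = np.array([6, 12, 20, 36, 65])
print(np.digitize(ages, bins=[20, 30, 64], right=True))
# -> [0 0 0 2 3]: with right=True the bin edges are inclusive, so 20 stays in
#    bin 0 (<= 20), 36 lands in bin 2 (> 30 and <= 64), and 65 in bin 3 (> 64).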
# # pyplot.plot(voting, label='MLP')
# # pyplot.legend()
# pyplot.xlabel('Time')
# pyplot.ylabel('USD/TRY')
# pyplot.show()

x_train, y_train, x_test, y_test = load_data(All)
x_train, y_train, x_test, y_test = convert_to_numpy(x_train, y_train, x_test, y_test)

min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler2 = preprocessing.MinMaxScaler()
quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
quantile_transformer2 = preprocessing.QuantileTransformer(random_state=0)
binarizer = preprocessing.Binarizer()
binarizer2 = preprocessing.Binarizer()
max_abs_scaler = preprocessing.MaxAbsScaler()
max_abs_scaler2 = preprocessing.MaxAbsScaler()
yeo_johnson_power_transformer = preprocessing.PowerTransformer(standardize=False)
yeo_johnson_power_transformer2 = preprocessing.PowerTransformer(standardize=False)
yeo_johnson_power_transformer_standardized = preprocessing.PowerTransformer()
yeo_johnson_power_transformer_standardized2 = preprocessing.PowerTransformer()

normalized_train_x = x_train
normalized_test_x = x_test
print('With No Alteration')
run_regressors()

normalized_train_x = preprocessing.scale(x_train)
normalized_test_x = preprocessing.scale(x_test)
print('before scaling, max is %d and min is %d' % (np.max(x_train), np.min(x_train)))
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
print('after scaling, max is %d and min is %d' % (np.max(x_train), np.min(x_train)))
x_train

# In[12]:

from sklearn import preprocessing
X_scaled = preprocessing.scale(x_train)
X_scaled
X_scaled.mean(axis=0)  # mean is zero
X_scaled.std(axis=0)   # variance is 1

# # Scaling the data to a specific range

# In[13]:

min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))  # setting the range
X_train_minmax = min_max_scaler.fit_transform(x_train)
X_train_minmax

# # Binarization

# In[14]:

binarizer = preprocessing.Binarizer().fit(X_train_minmax)  # fit does nothing
binarizer.transform(X_train_minmax)
from sklearn.random_projection import GaussianRandomProjection as GRP
from sklearn.mixture import GaussianMixture
import scipy

dataset1 = pd.read_csv("./DATASET/student/student-por.csv")
var_to_encode = [
    'school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason',
    'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher',
    'internet', 'romantic'
]
for col in var_to_encode:
    dataset1[col] = LabelEncoder().fit_transform(dataset1[col])
y0 = list(dataset1['G3'])

# Binarize G3: G3 <= 12 -> 0, G3 > 12 -> 1
dataset1[['G3']] = preprocessing.Binarizer(threshold=12).transform(dataset1[['G3']])
x1 = dataset1[dataset1.columns.drop('G3')]
y1 = list(dataset1['G3'])
scaler = StandardScaler()
scaler.fit(x1)
x1_n = scaler.transform(x1)  # <----------------------- DATASET1

dataset2 = pd.read_csv("./DATASET/BANK/MT_Train.csv")
dataset2.drop('default', axis=1, inplace=True)
le = LabelEncoder()
var_to_encode = [
    'job', 'marital', 'education', 'day_of_week', 'month', 'housing', 'loan', 'poutcome'
]
for col in var_to_encode:
def binarizer(arr0, threshold):
    matrix = np.array(arr0)
    temp = preprocessing.Binarizer(threshold=threshold).fit_transform(matrix)
    # result = data_utility.retrieve_nan_index(temp.tolist(), index)
    result = temp.tolist()
    return result
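# A small usage sketch for the wrapper above; the input values are made up.
import numpy as np
from sklearn import preprocessing

print(binarizer([[0.2, 1.7], [3.0, 0.9]], threshold=1.0))
# -> [[0.0, 1.0], [1.0, 0.0]]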
import numpy as np
from sklearn import preprocessing

input_data = np.array([[1, 2], [3, 4], [5, 6]])
data_binarizer = preprocessing.Binarizer(threshold=4)
binarized = data_binarizer.transform(input_data)
print(binarized)  # [[0 0] [0 0] [1 1]]: only values strictly greater than 4 become 1
import numpy as np
import sklearn.preprocessing as sp
import scipy.misc as sm  # for imread; the original 'scrapy.middleware' import was a typo
import matplotlib.pyplot as mp

a = np.array([[10, 20, 5], [2, 4, 1], [10, 11, 15]])
bin = sp.Binarizer(threshold=10)
result = bin.transform(a)
print(result)

# Read the image as grayscale and binarize the pixel values at 127.
lily = sm.imread('../素材/da_data/lily.jpg', True)
bin = sp.Binarizer(threshold=127)
result = bin.transform(lily)
mp.imshow(result, cmap='gray')
mp.show()