def transform(categorical_columns, numerical_columns, data):
    cat = ('categorical', ohe(), categorical_columns)
    num = ('numeric', ss(), numerical_columns)
    col_trans = ColumnTransformer([cat, num])
    df_trans_scaled = col_trans.fit_transform(data)
    col_names = get_column_names_from_ColumnTransformer(col_trans)
    for vals in numerical_columns:
        col_names.append(vals)
    # Rebuild a DataFrame with one named column per transformed feature
    # (the original hard-coded 18 columns; the loop below is equivalent)
    df_trans_scaled = pd.DataFrame(
        {name: df_trans_scaled[:, i] for i, name in enumerate(col_names)})
    return df_trans_scaled, col_names
def fit(self, data):
    if not isinstance(data, pd.DataFrame):  # Needs to be a DataFrame
        data = pd.DataFrame(data)
    self.p = data.shape[1]
    self.cidx = np.where(data.dtypes == 'object')[0]
    self.nidx = np.where(~(data.dtypes == 'object'))[0]
    self.cenc = ohe(sparse=False, dtype=int,
                    handle_unknown='ignore', drop=self.drop)
    self.cenc.categories_ = [
        list(data.iloc[:, x].value_counts().index) for x in self.cidx
    ]
    self.cenc.drop_idx_ = np.repeat(0, len(self.cenc.categories_))
    # Total feature size: categories + numeric
    self.p2 = sum([len(x) - 1 for x in self.cenc.categories_]) + len(self.nidx)
    self.nenc = ss()
    self.nenc.mean_ = data.iloc[:, self.nidx].mean().values
    self.nenc.scale_ = data.iloc[:, self.nidx].std().values
    self.nenc.n_features_in_ = self.nidx.shape[0]
    self.cn = list(self.cenc.get_feature_names(data.columns[self.cidx].astype(str))) + \
        data.columns[self.nidx].to_list()
    self.lst_enc = [self.cenc, self.nenc]
    self.lst_cidx = [self.cidx, self.nidx]
    self.lst_iter = [len(z) > 0 for z in self.lst_cidx]
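# Aside (a minimal standalone sketch, not part of the snippet above, using made-up
# data): in recent scikit-learn versions a StandardScaler whose statistics are
# assigned by hand behaves like a fitted one, because transform() only reads
# mean_ and scale_ (plus n_features_in_ for the shape check).
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler as ss

demo = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 30.0]})
enc = ss()
enc.mean_ = demo.mean().values
enc.scale_ = demo.std().values   # note: pandas std() uses ddof=1, while fit() uses ddof=0
enc.n_features_in_ = demo.shape[1]
print(enc.transform(demo.values))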
def xg_eval(learning_rate, n_estimators, max_depth, n_components):
    # 12.1 Make pipeline. Pass parameters directly here
    pipe_xg1 = make_pipeline(
        ss(),  # Why repeat this here for each evaluation?
        PCA(n_components=int(round(n_components))),
        XGBClassifier(silent=False,
                      n_jobs=2,
                      learning_rate=learning_rate,
                      max_depth=int(round(max_depth)),
                      n_estimators=int(round(n_estimators)))
    )
    # 12.2 Now fit the pipeline and evaluate
    cv_result = cross_val_score(estimator=pipe_xg1,
                                X=X_train,
                                y=y_train,
                                cv=2,
                                n_jobs=2,
                                scoring='f1'
                                ).mean()  # take the average of all results
    # 12.3 Finally return the averaged result
    return cv_result
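# Hedged usage sketch (not shown in the original source): the signature of xg_eval
# (one float per hyper-parameter, returning a score to maximise) matches what the
# bayes_opt library expects. The bounds below are illustrative assumptions only.
from bayes_opt import BayesianOptimization

pbounds = {'learning_rate': (0.01, 0.3),
           'n_estimators': (50, 300),
           'max_depth': (2, 10),
           'n_components': (5, 25)}

optimizer = BayesianOptimization(f=xg_eval, pbounds=pbounds, random_state=42)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max)   # best f1 score and the parameters that produced it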
def get_data(tt_split=0.8):
    not_fraud = pd.read_csv('../data/data.csv', header='infer')
    not_fraud['is_fraud'] = 0
    not_fraud['is_not_fraud'] = 1
    fraud = pd.read_csv('../data/data_fraud.csv', header='infer')
    fraud['is_fraud'] = 1
    fraud['is_not_fraud'] = 0
    res = shuffle(pd.concat([fraud, not_fraud]))
    res['slum'] = 0
    res['middle'] = 0
    res['posh'] = 0
    res.loc[res.area == 'slum', 'slum'] = 1
    res.loc[res.area == 'middle', 'middle'] = 1
    res.loc[res.area == 'posh', 'posh'] = 1
    x = res[[
        'lower_education', 'higher_education', 'jewellery', 'car', 'bike',
        'tax', 'misc_credit', 'misc_debit', 'slum', 'middle', 'posh'
    ]]
    scaler = ss().fit(x)
    x = scaler.transform(x)
    y = res[['is_fraud', 'is_not_fraud']]
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, train_size=tt_split, random_state=42)
    return (x_train, x_test, y_train, y_test, scaler)
def preprocess(data, flag):
    cont_v = [n for n in range(np.shape(data)[1]) if n != 3]
    normalized_data = ss().fit_transform(data[:, cont_v])
    outlier_row, outlier_col = np.where(np.abs(normalized_data) > 3)
    if flag:
        for i in range(0, len(outlier_col)):
            normalized_data[outlier_row[i]][outlier_col[i]] = np.sign(
                normalized_data[outlier_row[i]][outlier_col[i]]) * 3
    return normalized_data
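# Hedged aside (illustrative data, not from the snippet above): the outlier loop
# clamps standardized values to the [-3, 3] band, which is equivalent to a single
# vectorised np.clip call.
import numpy as np

z = np.array([[0.5, -4.2], [3.7, 1.1]])
clipped = np.clip(z, -3, 3)   # same result as the sign()*3 loop
print(clipped)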
def preprocess(self, path):
    names = []
    data = {}  # must use {}: with [] it would be a list and the index could not be a string
    if 'car' in path:
        names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'cls']
        data = pd.read_csv(path, names=names, header=None, na_values='?')
    elif 'iris' in path:
        names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'cls']
        data = pd.read_csv(path, names=names, header=None, na_values='?')
    elif 'adult' in path:
        names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                 'marital-status', 'occupation', 'relationship', 'race', 'sex',
                 'capital-gain', 'capital-loss', 'hours-per-week',
                 'native-country', 'cls']
        data = pd.read_csv(path, names=names, header=None, sep=r',\s',
                           na_values='?', engine='python')

    # replace NaN with the most frequent attribute value in each column
    for attr in names[:len(names) - 1]:
        value = data[attr].value_counts().idxmax()
        data = data.fillna({attr: value})

    # convert categorical values into numerical values, except the 'cls' column
    import sklearn.preprocessing as pp
    enc = {}  # must use {}: with [] it would be a list and the index could not be a string
    for column in data.columns[:len(names) - 1]:
        if data.dtypes[column] == object:  # np.object is deprecated; plain object works
            enc[column] = pp.LabelEncoder()
            data[column] = enc[column].fit_transform(data[column])

    # get dummy variables for the target value (cls): categorical -> numerical
    data = pd.get_dummies(data, columns=['cls'], prefix=['cls'])

    # set new header names, since get_dummies adds more columns
    new_names = list(data)
    col_names = new_names[0:len(names) - 1]
    X = data[col_names]
    cls_name = new_names[len(names) - 1:]
    y = data[cls_name]

    # split data into training and test datasets
    from sklearn.model_selection import train_test_split as tt_split
    X_train, X_test, y_train, y_test = tt_split(X, y)

    # standardizing and scaling
    from sklearn.preprocessing import StandardScaler as ss
    scaler = ss()
    scaler.fit(X_train)  # only fit the training data
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # convert the y_test DataFrame to a numpy array and combine attributes and target values
    y_test = y_test.to_numpy()  # DataFrame.as_matrix() was removed from pandas
    test_data = np.concatenate((X_test, y_test), axis=1)
    # ','.join(new_names) converts the header-name list to a comma-delimited string
    self.__processed_data_to_csv(test_data, self.test, ','.join(new_names))
    return X_train, y_train
def model_stop(df):
    """
    Create a model from a dataframe.
    """
    # create a model from a dataframe
    # df = pd.get_dummies(df, columns=['day'])
    # features = ['day_'+str(i) for i in range(0, 7)]
    # for f in features:
    #     if f not in df.columns:
    #         df[f] = 0
    df = df[df['traveltime'] > 0]
    # keep travel times between the 5th and 95th percentiles
    # (the two filters are chained so that both bounds actually apply)
    df = df[df['traveltime'] < df['traveltime'].quantile(0.95)]
    df = df[df['traveltime'] > df['traveltime'].quantile(0.05)]
    features = ['rain', 'temp', 'hour', 'day']
    scaler_X = ss()
    X = scaler_X.fit_transform(df[features])
    scaler_Y = ss()
    Y_real = df['traveltime']
    Y = scaler_Y.fit_transform(df['traveltime'].values.reshape(-1, 1))
    model = mlp().fit(X, Y)
    return model, X, features, scaler_X, scaler_Y, Y_real
def predict_boston():
    print("Regression to predict Boston house prices")
    # Load the Boston dataset
    boston = lb()
    # Print the dataset description
    print(boston['DESCR'])
    # Fraction of samples held out for validation
    cratio = 0.4
    # Print the dataset keys
    print(boston.keys())
    # Split the data into training and validation sets
    X, Xtest, Y, Ytest = cv.train_test_split(boston['data'],
                                             boston['target'],
                                             test_size=cratio)
    # Standardize the features
    S = ss()
    X = S.fit_transform(X)      # fit the scaler on the training data
    Xtest = S.transform(Xtest)  # reuse the training statistics for the test data
    # Ridge regression
    lr = limd.Ridge()
    # Train the model
    model = lr.fit(X, Y)
    print("Model\n", model)
    print("Training fit score\n %.3f" % lr.score(X, Y))
    # Predict
    Ypred = model.predict(Xtest)
    print("Mean squared error of the predictions\n %.3f" % metrics.mean_squared_error(Ytest, Ypred))
    print("Coefficients\n %s " % lr.coef_)
    print("Intercept\n", lr.intercept_)
    # Plot
    fig = plt.figure()
    # Draw the line y = x with a set colour and width
    plt.plot([min(Ytest), max(Ytest)], [min(Ytest), max(Ytest)], 'r', lw=5)
    # Colour the points by relative error
    color = abs(Ypred - Ytest) / Ytest
    # Scatter plot of predicted vs. real prices
    p = plt.scatter(Ypred, Ytest, c=color, marker='.')
    # Colour bar
    plt.colorbar()
    plt.ylabel("Real Price")
    plt.xlabel("Predicted Price")
    # Show the figure
    plt.show()
    return
def timeseries_scaling(X, is_training_data=True, list_of_transformers=None):
    # X += epsilon
    # X.shape = (nsamples, timesteps, features)
    # is_training_data / list_of_transformers must be passed together as either
    # True / None (training) or False / an existing list (inference).
    X_new = np.zeros_like(X)
    for i in range(X.shape[0]):
        X_new[i] = mms().fit_transform(X[i])

    X_new2 = X_new.copy()
    if is_training_data:
        list_of_transformers = list()

    for i in range(X.shape[1]):
        if is_training_data:
            tr = ss()
            tr.fit(X_new[:, i])
            list_of_transformers.append(tr)
        else:
            tr = list_of_transformers[i]
        X_new2[:, i] = tr.transform(X_new[:, i])

    return X_new2, list_of_transformers
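# Hedged usage sketch (synthetic shapes, not from the original source): fit the
# per-timestep scalers on the training tensor, then reuse them on the test tensor.
import numpy as np

X_train_ts = np.random.rand(100, 24, 5)   # (nsamples, timesteps, features)
X_test_ts = np.random.rand(20, 24, 5)

X_train_scaled, transformers = timeseries_scaling(X_train_ts, is_training_data=True)
X_test_scaled, _ = timeseries_scaling(X_test_ts, is_training_data=False,
                                      list_of_transformers=transformers)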
def featureMat(data):
    # Kurtosis
    kurtosis_out = kurtosis(data)
    # print("kurtosis", kurtosis_out)
    # LAGE
    lage_out = lage(data)
    # print("LAGE: ", LAGE_out)
    # Entropy
    entropy_out = entropy(data)
    # print("Entropy:", entropy_out)
    # FFT top-5 features
    fft5_out = mfft(data)
    # print("FFT Top5", fft5_out)
    # Feature matrix: stack all features and standardize
    featureMAT = np.hstack((fft5_out, kurtosis_out, lage_out, entropy_out))
    featureMAT = ss().fit_transform(featureMAT.real)
    return featureMAT
def predict(request):
    data = UserReport.objects.filter(user=request.user).latest('date_time')
    print(data.age)
    reportData = np.array([
        data.age, data.sex, data.cp, data.restbps, data.chol, data.fbs,
        data.ecg, data.heart_rate, data.ex_in_angina,
        data.st_depression_in_exercise, data.peak_st_segment,
        data.vessels_by_flourosopy, data.thal
    ])
    reportData = reportData.reshape(1, -1)
    sc = ss()
    trained_data = sc.fit_transform(HdpsConfig.trained_data)
    reportData = sc.transform(reportData)
    result = HdpsConfig.classifier.predict(reportData)
    if data.sex == 1:
        sex = 'MALE'
    else:
        sex = 'FEMALE'
    if data.ecg == 0:
        ecg_val = 'Normal'
    elif data.ecg == 1:
        ecg_val = 'ST-T Wave Abnormality'
    else:
        ecg_val = 'Left Ventricular Hypertrophy'
    try:
        profile = Profile.objects.get(user=request.user)
    except Profile.DoesNotExist:
        profile = None
    return render(
        request, 'prediction_report.html', {
            'report_data': data,
            'result': result,
            'sex': sex,
            'profile': profile,
            'ecg_val': ecg_val
        })
col = np.genfromtxt(full_path, delimiter=',').astype(int)  # column indices must be ints
col = sorted(col)
# pairwise interaction terms between the selected columns
interact = np.zeros((np.size(X, axis=0),
                     (np.size(col) * (np.size(col) - 1)) // 2))
c = 0
for i in range(0, np.size(col) - 1):
    for j in range(i + 1, np.size(col)):
        interact[:, c] = X[:, col[i]] * X[:, col[j]]
        c += 1
X = np.append(X, np.square(X), axis=1)  # add the squares
X = np.append(X, interact, axis=1)
print('Completed Full Model')

if do_noise:
    from sklearn.preprocessing import MinMaxScaler as ss
    stan = ss(feature_range=(-1, 1))
    x_norm = np.random.randn(np.size(X, axis=0), 1)
    X = np.column_stack((X, x_norm))
    X = stan.fit_transform(X)

# build train / test / validate sets
# seed(41)
j = 0
coef = np.zeros((np.size(X, axis=1), num_runs))
print(result_title)
while j < num_runs:
    trn_x, trn_y, val_x, val_y, tst_x, tst_y = tvt(X, Y)
import plotly.tools as tls

tls.set_credentials_file(username='******', api_key='JHK7zCU6FCcYgg6LcjaO')

import matplotlib.pyplot as plt
from scipy.signal import correlate

d_c = read_clean_dataset(summary=True)      # clean dataset
c_d = read_corrupted_dataset(summary=True)  # corrupted dataset
# Initially import all necessary modules and functions. Please note that this
# api key for plotly is from an account I made specifically for this project,
# so you can access the generated plots at username: engleberry,
# password: Droice1212. There should be only 2 graphs in the account to view for PCA.

std = ss().fit_transform(d_c[0])  # Standardization

# Covariance matrix from standardized data.
cov_mat = np.cov(std.T)
eig_vals, eig_vecs = np.linalg.eig(cov_mat)

# Correlation matrix from raw data.
cor_mat = np.corrcoef(d_c[0].T)
eig_vals2, eig_vecs2 = np.linalg.eig(cor_mat)

# Correlation matrix from standardized data.
cor_mat2 = np.corrcoef(std.T)
eig_vals3, eig_vecs3 = np.linalg.eig(cor_mat2)

# SVD could also be used to find the eigenvectors.
# eig_vec4, s, v = np.linalg.svd(std.T)
X_test.shape    # (35014, 30)
y_train.shape   # (65025,)
y_test.shape    # (35014,)

################# CC. Create pipeline #################

#### Pipe using XGBoost
# 5 Pipeline steps
#   steps: List of (name, transform) tuples
#          (implementing fit/transform) that are
#          chained, in the order in which they
#          are chained, with the last object an
#          estimator.
steps_xg = [('sts', ss()),
            ('pca', PCA()),
            ('xg', XGBClassifier(silent=False,
                                 n_jobs=2)  # Specify other parameters here
             )
            ]

# 5.1 Instantiate Pipeline object
pipe_xg = Pipeline(steps_xg)

# 5.2 Another way to create a pipeline:
#     Not used below
pipe_xg1 = make_pipeline(ss(),
                         PCA(),
                         XGBClassifier(silent=False, n_jobs=2))
def fit(self, x):
    # Fit the encoder/scaler
    self.n = x.shape[0]
    self.p = x.shape[1]
    dt1 = pd.Series([type(x.iloc[0][kk]).__name__ for kk in range(self.p)])
    dt2 = x.dtypes.astype(str).reset_index(drop=True)
    self.dt = pd.Series(
        np.where(
            dt1.isin(['int64', 'float64']) & dt2.isin(['int64', 'float64']),
            'float', 'str'))
    if not all(self.dt.values == 'float'):
        self.dt[~(self.dt.values == 'float')] = \
            np.where(x.loc[:, ~(self.dt.values == 'float')].apply(
                lambda x: x.str.contains('\\|', na=False).any()),
                'lst', self.dt[~(self.dt.values == 'float')])
    self.cn = np.array(x.columns)
    stopifnot(all(self.dt.isin(['float', 'lst', 'str'])))
    self.cidx = np.where(self.dt == 'str')[0]
    self.nidx = np.where(self.dt == 'float')[0]
    self.tidx = np.where(self.dt == 'lst')[0]
    stopifnot(
        all(
            np.sort(reduce(np.union1d, [self.cidx, self.nidx, self.tidx])) ==
            np.arange(self.p)))
    self.iter = {'cenc': True, 'nenc': True, 'tenc': True}
    self.all_enc = {}

    #############################################################
    # --- Encoder (i): Categorical/ordinal integer features --- #
    if len(self.cidx) > 0:
        self.cenc = ohe(sparse=self.sparse,
                        dtype=self.dtype,
                        handle_unknown='ignore',
                        drop=None)
        self.cenc.categories_ = [
            np.unique(x.iloc[:, kk]) for kk in self.cidx
        ]
        self.cmode = [x.iloc[:, kk].mode()[0] for kk in self.cidx]
        cmode_idx = np.array([
            np.where(vec == mm)[0][0]
            for vec, mm in zip(self.cenc.categories_, self.cmode)
        ])
        cum_idx = np.append([0],
                            np.cumsum([len(z) for z in self.cenc.categories_]))
        self.cenc.drop_idx = []
        self.cenc.drop_idx_ = None
        self.cenc.p = cum_idx.max() - len(
            self.cenc.drop_idx)  # How many features after dropping most common
        self.cenc.cn = list(
            np.delete(self.cenc.get_feature_names(self.cn[self.cidx]),
                      self.cenc.drop_idx))
        self.all_enc['cenc'] = self.cenc
    else:
        self.iter['cenc'] = False

    ###############################################
    # --- Encoder (ii): Continuous numerical ---- #
    if len(self.nidx) > 0:
        if self.quantize:
            u_nidx = np.array(
                [len(x.iloc[:, kk].unique()) for kk in self.nidx])
            self.nidx1 = self.nidx[u_nidx > 31]   # quantize
            self.nidx2 = self.nidx[u_nidx <= 31]  # one-hot-encode
            self.nenc = {'enc': {}, 'cn': {}}
            if len(self.nidx1) > 0:
                self.nenc1 = KD(n_bins=self.nbins, strategy='quantile')
                if not self.sparse:
                    self.nenc1.encode = 'onehot-dense'
                self.nenc1.fit(x.iloc[:, self.nidx1])
                self.nenc1.cn = ljoin([
                    cn + '_q' + pd.Series(qq).astype(str)
                    for cn, qq in zip(self.cn[self.nidx1], [
                        np.arange(len(z) - 1) + 1
                        for z in self.nenc1.bin_edges_
                    ])
                ])
                self.nenc['enc']['nenc1'] = self.nenc1
                self.nenc['cn']['nenc1'] = self.nenc1.cn
            if len(self.nidx2) > 0:
                self.nenc2 = ohe(sparse=self.sparse,
                                 handle_unknown='ignore',
                                 drop=None)
                self.nenc2.fit(x.iloc[:, self.nidx2])
                self.nenc2.cn = self.nenc2.get_feature_names(
                    self.cn[self.nidx2])
                self.nenc['enc']['nenc2'] = self.nenc2
                self.nenc['cn']['nenc2'] = self.nenc2.cn
            self.nenc['cn'] = ljoin(list(self.nenc['cn'].values()))
            self.all_enc['nenc'] = self.nenc
        else:
            self.nenc = ss(copy=False)
            self.nenc.mean_ = x.iloc[:, self.nidx].mean(axis=0).values
            self.nenc.scale_ = x.iloc[:, self.nidx].std(axis=0).values
            self.nenc.n_features_in_ = self.nidx.shape[0]
            self.nenc.p = self.nidx.shape[0]
            self.nenc.cn = list(self.cn[self.nidx])
            self.all_enc['nenc'] = self.nenc
    else:
        self.iter['nenc'] = False

    ################################################
    # --- Encoder (iii): Tokenize text blocks ---- #
    if len(self.tidx) > 0:
        self.tenc = dict(
            zip(self.cn[self.tidx], [
                cv(tokenizer=lambda x: tok_fun(x),
                   lowercase=False,
                   token_pattern=None,
                   binary=True) for z in range(self.tidx.shape[0])
            ]))
        self.tenc = {'cv': self.tenc}
        for kk, jj in enumerate(self.cn[self.tidx]):
            self.tenc['cv'][jj].fit(x.loc[:, jj].astype('U'))
        self.tenc['p'] = sum(
            [len(z.vocabulary_) for z in self.tenc['cv'].values()])
        self.tenc['cn'] = ljoin([
            l + '_' + pd.Series(list(z.vocabulary_.keys()))
            for z, l in zip(self.tenc['cv'].values(), self.tenc['cv'].keys())
        ])
        self.all_enc['tenc'] = self.tenc
    else:
        self.iter['tenc'] = False

    # Store everything in a dictionary to iterate over self.iter
    self.enc_transform = {
        'cenc': self.cenc_transform,
        'nenc': self.nenc_transform,
        'tenc': self.tenc_transform
    }
    # Get the valid categories
    self.tt = np.array(list(self.iter.keys()))[np.where(
        list(self.iter.values()))[0]]
    # Get full feature names
    cn = []
    for ee in self.tt:
        if hasattr(self.all_enc[ee], 'cn'):
            cn.append(self.all_enc[ee].cn)
        else:
            cn.append(self.all_enc[ee]['cn'])
    cn = ljoin(cn)
    self.cn_transform = cn
from sklearn.preprocessing import LabelEncoder as lbe, OneHotEncoder as ode

# Input column
labencoder_country = lbe()
ind_features[:, 0] = labencoder_country.fit_transform(ind_features[:, 0])
hotencoder = ode(categorical_features=[0])
ind_features = hotencoder.fit_transform(ind_features).toarray()

# Output column
labencoder_output = lbe()
output_label = labencoder_output.fit_transform(output_label)

# Split the data into training set and test set =======================================
from sklearn.cross_validation import train_test_split as tts
ind_train_set, ind_test_set, out_train_set, out_test_set = tts(
    ind_features, output_label, test_size=0.3)

# Put all the real-valued features on the same scale.
# Whether to scale the dummy variables is a matter of debate and
# largely depends on the context. This time, they will be scaled.
"""
Normalization:    new_value = (old_value - min) / (max - min)
Standardization:  new_value = (old_value - mean) / (standard_deviation)
"""
from sklearn.preprocessing import StandardScaler as ss
datascaler = ss()
ind_train_set = datascaler.fit_transform(ind_train_set)
ind_test_set = datascaler.transform(ind_test_set)
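# A brief illustration of the two formulas quoted above, on made-up data:
# MinMaxScaler implements the normalization rule, StandardScaler the
# standardization rule. Sketch only, not part of the original snippet.
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

v = np.array([[1.0], [2.0], [3.0], [4.0]])
print(MinMaxScaler().fit_transform(v).ravel())    # [0. 0.333 0.667 1.]  -> (x - min) / (max - min)
print(StandardScaler().fit_transform(v).ravel())  # zero mean, unit variance -> (x - mean) / std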
df.loc[2] = [3500, 2, 2, 1, 0, 'Skinny']
df.loc[3] = [1400, 0, 1, 0, 3, 'Skinny']
df.loc[4] = [1600, 1, 0, 2, 0, 'Normal']
df.loc[5] = [3200, 1, 2, 1, 1, 'Fat']
df.loc[6] = [1750, 1, 0, 0, 1, 'Skinny']
df.loc[7] = [1600, 1, 0, 0, 0, 'Skinny']
print(df)

# Split feature vectors and labels
x = df[['Calory', 'Breakfast', 'Lunch', 'Dinner', 'Excercise']]
y = df[['Body Shape']]
print(y)

# The mean of 'Calory' is much larger than the means of the other 4 columns,
# so we have to standardise the features.
# Using StandardScaler (zero mean, unit variance)
from sklearn.preprocessing import StandardScaler as ss
x_std = ss().fit_transform(x)
print(x_std)

# Covariance matrix of features
# Features are the columns of x_std, so transpose before calling np.cov
import numpy as np
features = x_std.T
covariance_matrix = np.cov(features)
print(covariance_matrix)
# PCA treats directions with large covariance/correlation as carrying more of the
# shared structure; the eigenvectors of this matrix are the principal directions
# (the features they emphasise are kept rather than suppressed).
# .T means transpose
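# Hedged continuation sketch (not part of the original): the eigenvectors of the
# covariance matrix computed above match, up to sign and ordering, the components_
# that sklearn's PCA returns on the same standardized data.
from sklearn.decomposition import PCA

eig_vals, eig_vecs = np.linalg.eig(covariance_matrix)
pca_check = PCA().fit(x_std)
print(eig_vecs)               # columns are principal directions (unsorted)
print(pca_check.components_)  # rows are the same directions, sorted by explained variance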
# 9.1 Which columns are numerical and which categorical?
num_columns = X.select_dtypes(include=['float64', 'int64']).columns
num_columns
cat_columns = X.select_dtypes(include=['object']).columns
cat_columns

# 10. Start creating transformation objects
# 10.1 Tuple for categorical columns
cat = ("cattrans", ohe(), cat_columns)
# 10.2 Tuple for numeric columns
num = ("numtrans", ss(), num_columns)
# 10.3 Instantiate column transformer object
colTrans = ct([num, cat])
# 10.4 Fit and transform
X_trans = colTrans.fit_transform(X)
X_trans.shape        # 19100 X 19

## 11.0 Label encoding
# 11.1 Map labels to 1 and 0
y = y.map({"continue": 1, "drop": 0})
y.head()
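# Hedged aside (assumes scikit-learn >= 1.0, not shown in the original): the fitted
# ColumnTransformer can report its own output column names, which avoids building
# the name list by hand.
out_names = colTrans.get_feature_names_out()
print(out_names)
print(len(out_names))   # should match X_trans.shape[1] (19 here)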
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
print("total :\n", missing_data)

# take some values as training and predict the output of some test cases
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.2, random_state=0)
print("x:\n", x)
print("x_train before scaling:\n", x_train)
print("x_test before scaling:\n", x_test)

# scale x and y to make them meaningful in calculations
SS = ss()
x_train = SS.fit_transform(x_train)
x_test = SS.transform(x_test)
print("x_train after scaling:\n", x_train)
print("x_test after scaling:\n", x_test)
print("y:\n", y)
print("y_train:\n", y_train)
print("y_test:\n", y_test)

X_set, y_set = x_train, y_train
X1, X2 = np.meshgrid(
# 5.3
X.columns

# 5.4
y = bc.loc[:, 'diagnosis']   # OR y = bc.iloc[:, 0]

# 5.5 y has two unique values
y.unique()                   # Number of unique values in y

# 6 Center and scale
#   Initialize the centering/scaling object
scaler = ss()
# 6.1 Use the object to create a model
model = scaler.fit(X)
# 6.2 And now transform the data
data_trans = model.transform(X)
# 6.3 Check
data_trans.shape
data_trans.mean()
type(data_trans)

#### 7. PCA now
pca = PCA()                  # PCA object first. Instantiate the class
# 7.1 Get PCA model now
pca_model = pca.fit(data_trans)
# import the dataset
dataset = pd.read_csv('data\Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# replace missing data in X using the mean of the whole column
imputer = im(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# encode categorical data
labelencode_X = le()
X[:, 0] = labelencode_X.fit_transform(X[:, 0])

# dummy encoding the data
ohotencode = ohe(categorical_features=[0])
X = ohotencode.fit_transform(X).toarray()
labelencode_Y = le()
y = labelencode_Y.fit_transform(y)

# splitting the data into train and test sets
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=0)

# feature scaling
standardscale_X = ss()
X_train = standardscale_X.fit_transform(X_train)
X_test = standardscale_X.transform(X_test)
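# Hedged aside: the snippet above uses the pre-0.20 scikit-learn API (Imputer and
# OneHotEncoder's categorical_features argument were later removed). A minimal
# sketch of the same imputation and dummy-encoding steps with the current API,
# following the snippet's assumed column positions:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

X[:, 1:3] = SimpleImputer(missing_values=np.nan,
                          strategy='mean').fit_transform(X[:, 1:3])
ct_enc = ColumnTransformer([('country', OneHotEncoder(), [0])],
                           remainder='passthrough')
X = ct_enc.fit_transform(X)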
def scal(self):
    ss1 = ss()
    self.xtr = ss1.fit_transform(self.xtr)
    self.xte = ss1.transform(self.xte)
    return self.xtr, self.xte
def delt(beta, vect):
    # use optimization to find the minimum positive delta with constraints
    # (1-d)||b||_2^2 <= ||Xb||_2^2 <= (1+d)||b||_2^2
    return np.abs(np.sum(np.square(np.dot(vect, beta))) - 1)


# get the data
data = np.genfromtxt(csv_path, delimiter=",")
# remove rows that have NaN values (not ideal but IDGAF yet)
# from stack overflow: https://bit.ly/1QhfcmZ
data = data[~np.isnan(data).any(axis=1)]
Y = np.array([x[1] - 1 for x in data])  # y values in the second column
X = np.array([x[2:] for x in data])
del data

stan = ss()
x_norm = norm(stan.fit_transform(X.astype("float")), axis=0)
n_feat = np.size(X, axis=1)
n_row = np.size(X, axis=0)
del X
del Y

# test instances
if test_runs:
    x_norm = norm(stan.fit_transform(np.random.normal(size=(n_row, n_feat))),
                  axis=0)

results = np.zeros((num_runs, max_s))
d_quant = np.zeros((max_s, 4))
while run and b <= max_s:
    print(b)
#print(df2.head())
df2 = pd.get_dummies(df2)
#print(df2.columns)
df2 = df2.drop(columns=["pay_schedule_semi-monthly"])

response = df2["e_signed"]
users = df2["entry_id"]
df2 = df2.drop(columns=["e_signed", "entry_id"])

#splitting data
x_train, x_test, y_train, y_test = train_test_split(df2, response,
                                                    test_size=0.3,
                                                    random_state=0)

sc_x = ss()
x_train2 = pd.DataFrame(sc_x.fit_transform(x_train))
x_test2 = pd.DataFrame(sc_x.transform(x_test))  # reuse the training statistics
x_train2.columns = x_train.columns
x_test2.columns = x_test.columns
#print(x_train2)
x_train = x_train2
x_test = x_test2

#Model
#Logistic Regression
# the l1 penalty needs the liblinear (or saga) solver in current scikit-learn
LR = LogisticRegression(random_state=0, penalty="l1", solver="liblinear")
LR.fit(x_train, y_train)
y_pred = LR.predict(x_test)
if __name__ == '__main__':
    iris = datasets.load_iris()
    X = iris.data[:, [2, 3]]
    y = iris.target

    # splitting the data into test (30%) and training (70%) sets using tts
    X_train, X_test, y_train, y_test = \
        tts(X, y, test_size=0.3, random_state=0)

    # Standardising the features (feature scaling) using ss
    sc = ss()
    # fit() estimates the sample mean and standard deviation used for feature
    # scaling of each feature dimension from the training data
    sc.fit(X_train)
    # transform() standardizes the training data (TrDS) and the test data (TsDS)
    # Note: the same parameters are used for both feature scalings
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    # max_iter: number of epochs (passes over the TrDS); called n_iter in old scikit-learn
    # eta0/eta: learning rate
    # random_state: reproducibility of the initial shuffling of TrDS after each epoch
    ppn = Perceptron(max_iter=40, eta0=0.1, random_state=0)
    # training using fit
    ppn.fit(X_train_std, y_train)
def delt(beta, vect):
    # use optimization to find the minimum positive delta with constraints
    # (1-d)||b||_2^2 <= ||Xb||_2^2 <= (1+d)||b||_2^2
    return np.abs(np.sum(np.square(np.dot(vect, beta))) - 1)


# get the data
data = np.genfromtxt(csv_path, delimiter=',')
# remove rows that have NaN values (not ideal but IDGAF yet)
# from stack overflow: https://bit.ly/1QhfcmZ
data = data[~np.isnan(data).any(axis=1)]
Y = np.array([x[1] - 1 for x in data])  # y values in the second column
X = np.array([x[2:] for x in data])
del data

stan = ss()
x_norm = norm(stan.fit_transform(X.astype('float')), axis=0)
n_feat = np.size(X, axis=1)
n_row = np.size(X, axis=0)
del X
del Y

# test instances
if test_runs:
    x_norm = norm(stan.fit_transform(np.random.normal(size=(n_row, n_feat))),
                  axis=0)

results = np.zeros((num_runs, max_s))
d_quant = np.zeros((max_s, 4))
while run and b <= max_s:
    print(b)
col = np.genfromtxt(full_path, delimiter=',').astype(int)  # column indices must be ints
col = sorted(col)
# pairwise interaction terms between the selected columns
interact = np.zeros(
    (np.size(X, axis=0), (np.size(col) * (np.size(col) - 1)) // 2))
c = 0
for i in range(0, np.size(col) - 1):
    for j in range(i + 1, np.size(col)):
        interact[:, c] = X[:, col[i]] * X[:, col[j]]
        c += 1
X = np.append(X, np.square(X), axis=1)  # add the squares
X = np.append(X, interact, axis=1)
print('Completed Full Model')

if do_noise:
    from sklearn.preprocessing import MinMaxScaler as ss
    stan = ss(feature_range=(-1, 1))
    x_norm = np.random.randn(np.size(X, axis=0), 1)
    X = np.column_stack((X, x_norm))
    X = stan.fit_transform(X)

# build train / test / validate sets
# seed(41)
j = 0
coef = np.zeros((np.size(X, axis=1), num_runs))
print(result_title)
while j < num_runs:
    trn_x, trn_y, val_x, val_y, tst_x, tst_y = tvt(X, Y)
)

from sklearn.model_selection import train_test_split as tts

factors = csv[["EstimatedSalary", "Age"]]
print(max(csv[["EstimatedSalary"]].values))
purchased = csv[["Purchased"]]
factor_training, factor_testing, purchased_training, purchased_testing = tts(
    factors, purchased, test_size=0.25, random_state=0)

from sklearn.preprocessing import StandardScaler as ss
scaler = ss()
factor_training_score = scaler.fit_transform(factor_training)
factor_testing_score = scaler.transform(factor_testing)  # reuse the training statistics

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(factor_training_score, purchased_training)
predictedPurchase = classifier.predict(factor_testing_score)

from sklearn.metrics import accuracy_score
accuracy_score(purchased_testing, predictedPurchase)

import numpy as np
from sklearn.preprocessing import StandardScaler as ss

df = pd.read_csv('Social_Network_Ads.csv')
#sns.scatterplot(df['EstimatedSalary'], df['Purchased'])
df.drop('User ID', inplace=True, axis=1)
#print(df.info())

gen = pd.get_dummies(df['Gender'], drop_first=True)
df.drop('Gender', inplace=True, axis=1)
#print(gen.head())
dff = pd.concat([df, gen], axis=1)
#print(dff.info())

x = dff.drop('Purchased', axis=1)
y = dff['Purchased']
print(y.head())

sss = ss()
xx = sss.fit_transform(x)
xtrain, xtest, ytrain, ytest = train_test_split(xx, y, test_size=0.3,
                                                random_state=101)

cm = knc(n_neighbors=3)
cm.fit(xtrain, ytrain)
pdata = cm.predict(xtest)
creport = cr(ytest, pdata)
print(creport)
#Preprocessing
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

from sklearn.preprocessing import StandardScaler as ss
sc = ss()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

###############################################################################
print("GaussianNB")
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import Imputer, OneHotEncoder
from sklearn.preprocessing import LabelEncoder as le
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler as ss
from sklearn.linear_model import LinearRegression as lr
from sklearn.preprocessing import PolynomialFeatures as pf
from sklearn.svm import SVR

# matplotlib settings so CJK labels and minus signs render correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

dataset = pd.read_csv("Position_Salaries.csv")
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2:3].values

sc_X = ss()
sc_y = ss()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y)

regressor = SVR(kernel="rbf")
regressor.fit(X, y)

# scale the query point with sc_X before predicting, then undo the y scaling
y_pred = regressor.predict(sc_X.transform([[6.5]]))
y_pred = sc_y.inverse_transform(y_pred.reshape(-1, 1))

plt.scatter(X, y, color="blue")
plt.plot(X, regressor.predict(X), color="green")
plt.title("Support Vector Regression")
plt.xlabel("Position level")
plt.ylabel("Salary")
plt.show()

X_grid = np.arange(min(X), max(X), 0.01)
#Logistic Regression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

dataset = pd.read_csv('Social_Network_Ads.csv')
x = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

from sklearn.model_selection import train_test_split as tts
xTrain, xTest, yTrain, yTest = tts(x, y, test_size=0.25, random_state=0)

from sklearn.preprocessing import StandardScaler as ss
scale = ss()
xTrain = scale.fit_transform(xTrain)
xTest = scale.transform(xTest)

from sklearn.linear_model import LogisticRegression as lr
classifier = lr(random_state=0)
classifier.fit(xTrain, yTrain)
yPred = classifier.predict(xTest)

from sklearn.metrics import confusion_matrix as cm
cm = cm(yTest, yPred)

from matplotlib.colors import ListedColormap
X_set, y_set = xTrain, yTrain
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 1,
                               stop=X_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=X_set[:, 1].min() - 1,
                               stop=X_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2,
             classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c=ListedColormap(('red', 'green'))(i), label=j)
plt.title('Logistic Regression (Training set)')
fig3 = plt.figure(figsize=(12, 12))
ax3 = fig3.add_subplot(1, 4, 1)
sns.boxenplot(x='diagnosis', y='symmetry_mean', data=df)
ax3 = fig3.add_subplot(1, 4, 2)
sns.boxenplot(x='diagnosis', y='fractal_dimension_mean', data=df)

# Selecting the columns
X = df.loc[:, 'radius_mean':'fractal_dimension_worst']
X.isnull().sum()
X.head()
X.shape

# Scale the numeric data
scaleit = ss()
s = scaleit.fit_transform(X)

# PCA on the scaled data
pca = PCA()
principleComp = pca.fit_transform(s)
principleComp.shape
pca.explained_variance_ratio_
X = pca.explained_variance_ratio_.cumsum()
X

# Plotting the distplot graph
sns.distplot(X, bins=5)

X = principleComp[:, 0:11]