def test_accuracy(self, pb, lmbd=0.1, max_iter=100):
    # Draw a labelled batch: the first 6000 samples train the classifier,
    # the remaining 2000 are held out for scoring.
    test_x, test_y = pb.get_batch_with_label(8000)
    test_y = np.argmax(test_y, axis=1)

    # Sparse-code the training split and fit a logistic regression on the codes.
    train_data = self.optimize(X=test_x[:6000], lmbd=lmbd, max_iter=max_iter)
    lgrs = lgr()
    lgrs.fit(train_data, test_y[:6000])

    # Sparse-code the held-out split and score the classifier.
    lis_out = self.optimize(X=test_x[6000:], lmbd=lmbd, max_iter=max_iter)
    y_pre = lgrs.predict(lis_out)
    return accuracy_score(test_y[6000:], y_pre)
def test_accuracy(self, pb, lmbd=0.1):
    # Same evaluation as above, but the codes come from self.output() with an
    # explicit zero initialisation of Z.
    test_x, test_y = pb.get_batch_with_label(8000)
    test_y = np.argmax(test_y, axis=1)
    zs_test = np.zeros((8000, self.D.shape[0]))

    # Encode the training split and fit a logistic regression on the codes.
    feed_final = {"Z": zs_test[:6000], "X": test_x[:6000], "lmbd": lmbd}
    train_data = self.output(**feed_final)
    lgrs = lgr()
    lgrs.fit(train_data, test_y[:6000])

    # Encode the held-out split and score the classifier.
    feed_final = {"Z": zs_test[:2000], "X": test_x[6000:], "lmbd": lmbd}
    lis_out = self.output(**feed_final)
    y_pre = lgrs.predict(lis_out)
    return accuracy_score(test_y[6000:], y_pre)
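# A minimal self-contained sketch of the split-train-score pattern used by the
# two methods above, with synthetic data and an identity "encoding" standing in
# for the codes produced by self.optimize()/self.output() (which depend on the
# surrounding class and are not reproduced here).
import numpy as np
from sklearn.linear_model import LogisticRegression as lgr
from sklearn.metrics import accuracy_score

rng = np.random.default_rng(0)
X = rng.normal(size=(8000, 64))
y = (X[:, 0] > 0).astype(int)

codes = X                                   # stand-in for the sparse codes
clf = lgr().fit(codes[:6000], y[:6000])
print(accuracy_score(y[6000:], clf.predict(codes[6000:])))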
def SNP_loop(x_train, x_test, y_train, num_samples):
    # Convert the incoming tensors to NumPy arrays.
    x_train = x_train.numpy()
    x_test = x_test.numpy()
    y_train = y_train.numpy()
    a, b, c = x_train.shape

    output_shap = np.zeros((num_samples, 0, c))
    model_outputs = []
    # Fit one logistic regression per middle-axis slice and stack the
    # per-slice SHAP scores along axis 1.
    for i in range(b):
        x_train1 = np.squeeze(x_train[:num_samples, i, :])
        x_test1 = np.squeeze(x_test[:num_samples, i, :])
        y_train1 = y_train[:num_samples, ...]
        model = lgr().fit(x_train1, y_train1)
        scores = shap_scores(model, x_train1[:num_samples, ...],
                             x_test1[:num_samples, ...])
        scores3 = np.expand_dims(scores, axis=1)
        output_shap = np.append(output_shap, scores3, axis=1)
        model_outputs.append((model.coef_, model.intercept_))
    return [output_shap], model_outputs
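# shap_scores is not defined in this snippet. A minimal sketch of what it might
# compute, assuming the shap package and a binary target; this is an assumption,
# not the project's confirmed implementation.
import shap

def shap_scores(model, background, X):
    # Explain the fitted linear model against the training background and
    # return one SHAP value per test sample and feature.
    explainer = shap.LinearExplainer(model, background)
    return explainer.shap_values(X)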
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.savefig(name + "_confusion_matrix.jpg")
plt.close()

x_train, x_validation, x_test, x_train_SMOTE, x_train_undersample, \
    y_train, y_validation, y_test, y_train_SMOTE, y_train_undersample = get_data()

params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
          'penalty': ['l1', 'l2']}
# The 'liblinear' solver supports both penalties in the search grid.
grid_normal = gscv(lgr(solver='liblinear'), params, cv=10)
grid_undersample = gscv(lgr(solver='liblinear'), params, cv=10)
grid_SMOTE = gscv(lgr(solver='liblinear'), params, cv=10)

grid_normal.fit(x_train, y_train)
grid_undersample.fit(x_train_undersample, y_train_undersample)
grid_SMOTE.fit(x_train_SMOTE, y_train_SMOTE)

result_normal = pd.DataFrame(grid_normal.cv_results_)
result_undersample = pd.DataFrame(grid_undersample.cv_results_)
result_SMOTE = pd.DataFrame(grid_SMOTE.cv_results_)

name = ["normal", "undersample", "SMOTE"]
y_normal_predict = grid_normal.predict(x_test)
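# Follow-up sketch: report each grid's best hyper-parameters and score the
# three fits on the common test split. classification_report is standard
# sklearn; the other names come from the snippet above.
from sklearn.metrics import classification_report

for label, grid in zip(name, [grid_normal, grid_undersample, grid_SMOTE]):
    print(label, grid.best_params_, grid.best_score_)
    print(classification_report(y_test, grid.predict(x_test)))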
def create_dictionary_dl(lmbd):
    # N, K, and fname are assumed to be module-level constants defined above;
    # the call site below only passes lmbd.
    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
    im = mnist.train.next_batch(N)[0]
    im = im.reshape(N, 28, 28)
    X = np.array(im).reshape(N, -1)
    print(X.shape)
    # Learn a K-atom dictionary with coordinate descent.
    dl = DictionaryLearning(K, alpha=lmbd * N, fit_algorithm='cd',
                            n_jobs=-1, verbose=1)
    dl.fit(X)
    D = dl.components_.reshape(K, -1)
    np.save(fname, D)
    return D


if __name__ == '__main__':
    D = create_dictionary_dl(0.1)
    pb = MnistProblemGenerator(D, 0.1)
    test_x, test_y = pb.get_batch_with_label(8000)
    test_y = np.argmax(test_y, axis=1)

    from sklearn.linear_model import LogisticRegression as lgr
    from sklearn.metrics import accuracy_score

    # Baseline: logistic regression directly on the raw pixels.
    lgrs = lgr()
    lgrs.fit(test_x[:6000], test_y[:6000])
    y_pre = lgrs.predict(test_x[6000:])
    print(accuracy_score(test_y[6000:], y_pre))
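# A hedged alternative for encoding with the learned dictionary: sklearn's
# SparseCoder solves the same lasso problem that the class-based snippets above
# delegate to self.optimize()/self.output().
from sklearn.decomposition import SparseCoder

coder = SparseCoder(dictionary=D, transform_algorithm='lasso_cd',
                    transform_alpha=0.1)
codes = coder.transform(test_x)   # sparse codes, shape (n_samples, K)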
def classify(s):
    import pandas as pd
    import numpy
    import pandas_montecarlo  # registers the .montecarlo() accessor on Series
    from sklearn.ensemble import RandomForestClassifier as rfc
    from sklearn.neighbors import KNeighborsClassifier as knc
    from sklearn.svm import SVC as svc
    from sklearn.linear_model import LogisticRegression as lgr

    ## RandomForest classifier with Monte Carlo simulated training set
    numpy.random.seed(s)

    df = pd.read_csv("data.csv")
    # Random forest previously selected these columns as most predictive.
    df = df[['diagnosis', 'area_worst', 'concave points_mean',
             'concave points_worst', 'perimeter_worst', 'radius_worst']]
    df = df.replace({'diagnosis': "M"}, 1)
    df = df.replace({'diagnosis': "B"}, 0)

    # Split: the first 70% seeds the Monte Carlo simulation, the rest is
    # used for dev/test evaluation below.
    df_mc, df = numpy.split(df, [int(.7 * len(df))])

    # Split the simulation seed by class and drop the label column.
    df_1 = df_mc.loc[df_mc.diagnosis == 1].drop(["diagnosis"], axis=1)
    df_0 = df_mc.loc[df_mc.diagnosis == 0].drop(["diagnosis"], axis=1)

    # Simulate class-0 data column by column (the inner loop keeps the last
    # simulated path for each column).
    mc_sim_df_0 = pd.DataFrame()
    mc_sim_df_0['diagnosis'] = ['0'] * len(df_0.index)
    for col in df_0.columns:
        col_sim = df_0[col].montecarlo(sims=2, bust=0, goal=0).data
        col_sim = col_sim.drop(["original"], axis=1)
        for col2 in col_sim.columns:
            mc_sim_df_0[col] = col_sim[col2]

    # Simulate class-1 data the same way.
    mc_sim_df_1 = pd.DataFrame()
    mc_sim_df_1['diagnosis'] = ['1'] * len(df_1.index)
    for col in df_1.columns:
        col_sim = df_1[col].montecarlo(sims=2, bust=0, goal=0).data
        col_sim = col_sim.drop(["original"], axis=1)
        for col2 in col_sim.columns:
            mc_sim_df_1[col] = col_sim[col2]

    # The simulated frame is assembled here, but the active branch below
    # trains on the raw seed split (df_mc) rather than on mc_sim_df.
    mc_sim_df = pd.concat([mc_sim_df_1, mc_sim_df_0])

    # Format the held-out values into dev and test splits.
    labels = df["diagnosis"]
    df = df.drop("diagnosis", axis=1)
    dfDev, dfTes = numpy.split(df, [int(.7 * len(df))])
    DDev, DTes = numpy.split(labels, [int(.7 * len(labels))])
    DTrn = df_mc['diagnosis']
    dfTrn = df_mc.drop(['diagnosis'], axis=1)

    def dev_accuracy(model):
        # Fit on the training split and return dev-set accuracy.
        model.fit(dfTrn.values, DTrn)
        preds = model.predict(dfDev)
        hits = sum(int(p) == int(y) for p, y in zip(preds, DDev))
        return hits / len(preds)

    scores = [
        dev_accuracy(rfc()),                 # random forest
        dev_accuracy(knc()),                 # k-nearest neighbours
        dev_accuracy(svc(kernel="linear")),  # linear SVM
        dev_accuracy(svc(kernel="rbf")),     # RBF SVM
        dev_accuracy(lgr()),                 # logistic regression
    ]
    return scores
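# Usage sketch: average the five dev-set accuracies over a handful of seeds.
# The score order matches the list built inside classify().
import numpy

seeds = range(5)
all_scores = numpy.array([classify(s) for s in seeds])
for clf_name, mean_acc in zip(
        ["random forest", "knn", "linear SVM", "RBF SVM", "logistic regression"],
        all_scores.mean(axis=0)):
    print(clf_name, round(float(mean_acc), 3))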
MiddleDF75 = DiabetesMiddle.loc[predictarray < 0.75]
FinalTrain = pd.concat([DiabetesNoMiddle, MiddleDF75], axis=0)

# Fit the logistic regression after removing specific columns:
TrainLR = FinalTrain.drop(['diabfeat_neurologic', 'race_AfricanAmerican',
                           'A1Cresult_>7', 'primarydiag_injury',
                           'number_diagnoses', 'med_glimepiride', 'med_insulin',
                           'diag_infection', 'medical_specialty_Orthopedics',
                           'med_nateglinide', 'discharge_disposition_leftAMA',
                           'admission_source_id_3', 'change_Ch',
                           'diag_circulatory',
                           'medical_specialty_Gastroenterology',
                           'medical_specialty_Surgery', 'primarydiag_infection',
                           'primarydiag_mentaldis'], axis=1)
TrainLRX = TrainLR.drop('readmitted', axis=1)
TrainLRY = TrainLR['readmitted'].replace([2, 1], [1, 0])

from sklearn.linear_model import LogisticRegression as lgr
# Keep the fitted instance under its own name so the class alias stays usable.
lgr_model = lgr(C=0.1, class_weight={0: .2, 1: .8})
lgr_model.fit(TrainLRX, TrainLRY)

# Get the random forest fit object:
from sklearn.ensemble import RandomForestClassifier as rfc
rfc_model = rfc(n_estimators=1000, min_samples_split=5, min_samples_leaf=1,
                max_features='sqrt', max_depth=60, random_state=42,
                class_weight={0: .2, 1: .8})
FinalTrainX = FinalTrain.drop('readmitted', axis=1)
FinalTrainY = FinalTrain['readmitted'].replace([2, 1], [1, 0])
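# A sketch of the likely next step (an assumption, since the snippet ends
# before the forest is fit): train the weighted random forest and compare
# in-sample class-1 recall with the logistic fit.
from sklearn.metrics import recall_score

rfc_model.fit(FinalTrainX, FinalTrainY)
print("LR recall:", recall_score(TrainLRY, lgr_model.predict(TrainLRX)))
print("RF recall:", recall_score(FinalTrainY, rfc_model.predict(FinalTrainX)))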