def c():
    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
    cols = X.columns
    X = preprocessing.StandardScaler().fit_transform(X)
    Xdf = pd.DataFrame(X, columns=cols)
    k = 3
    kf = KFold(n_splits=k)
    LR = LogisticRegression(fit_intercept=True)
    acc = []
    ind = 1
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        LR.fit_non_vectorised(X_train, y_train, n_iter=1000)  # here you can use fit_non_vectorised / fit_autograd methods
        y_hat = LR.predict(X_test)
        acc.append(accuracy(y_hat, y_test))
        print("Fold {}, Accuracy: {}".format(ind, acc[-1]))
        ind += 1
    print("Overall Accuracy: ", np.mean(acc))
    LR.plot_surface(Xdf, y, 0, 1)

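# For reference, a minimal sketch (an assumption, not the sklearn source) of
# the index streams KFold(n_splits=k) without shuffling yields above:
# contiguous test blocks, with all remaining rows forming the train block.
def kfold_indices_sketch(n, k):
    idx = np.arange(n)
    for fold in np.array_split(idx, k):
        yield np.setdiff1d(idx, fold), fold  # (train_index, test_index)
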
def b():
    N = 30
    M = 5
    X = pd.DataFrame(np.random.randn(N, M))
    y = pd.Series(np.random.randint(0, 2, N))
    X = preprocessing.StandardScaler().fit_transform(X)
    print(X)
    print(y)
    for fit_intercept in [True, False]:
        LR = LogisticRegression(fit_intercept=fit_intercept)
        LR.fit_autograd(X, y, n_iter=1000)  # here you can use fit_non_vectorised / fit_autograd methods
        y_hat = LR.predict(X)
        # print(np.array(y))
        # print(np.array(y_hat))
        print('Accuracy: ', accuracy(y_hat, y))

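# A minimal sketch of what fit_autograd is assumed to do internally:
# differentiate the logistic (cross-entropy) loss with the autograd package
# and run plain batch gradient descent. The names logistic_loss and
# fit_autograd_sketch are illustrative, not part of the class above;
# X and y are taken to be NumPy arrays.
import autograd.numpy as anp
from autograd import grad

def logistic_loss(theta, X, y):
    p = 1.0 / (1.0 + anp.exp(-(X @ theta)))
    return -anp.mean(y * anp.log(p + 1e-12) + (1 - y) * anp.log(1 - p + 1e-12))

def fit_autograd_sketch(X, y, n_iter=1000, lr=0.1):
    theta = anp.zeros(X.shape[1])
    loss_grad = grad(logistic_loss)  # gradient of the loss w.r.t. theta
    for _ in range(n_iter):
        theta = theta - lr * loss_grad(theta, X, y)
    return theta
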
def optimize_lambda(X, y, folds=2, fit_intercept=True):
    assert (len(X) == len(y))
    assert (len(X) > 0)
    lambdas = []
    for i in range(1, 100):
        lambdas.append(0.1 * i)
    LRs = {}
    accuracies = {}
    ch = int(len(X) // folds)
    for fold in range(folds):
        i = range(fold * ch, (fold + 1) * ch)
        current_fold = pd.Series([False for _ in range(len(X))])
        current_fold.loc[i] = True
        X_train, y_train = X[~current_fold].reset_index(
            drop=True), y[~current_fold].reset_index(drop=True)
        X_test, y_test = X[current_fold].reset_index(
            drop=True), y[current_fold].reset_index(drop=True)
        LR = LogisticRegression(fit_intercept=fit_intercept)
        LRs[fold + 1] = LR
        for Lambda in lambdas:
            LR.samantha = Lambda  # set the L1 penalty coefficient read by fit_L1_autograd
            LR.fit_L1_autograd(X_train, y_train)
            y_hat = LR.predict(X_test)
            if fold + 1 in accuracies:
                accuracies[fold + 1][Lambda] = accuracy(y_hat, y_test)
            else:
                accuracies[fold + 1] = {Lambda: accuracy(y_hat, y_test)}
    accuracies = pd.DataFrame(accuracies).transpose()
    accuracies.index.name = "Fold Number"
    accuracies.loc["mean"] = accuracies.mean()
    print("Optimum lambda = {}".format(accuracies.loc["mean"].idxmax()))

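# By analogy with the sketch after b(), the L1-penalised loss that
# fit_L1_autograd is assumed to minimise (lam plays the role of LR.samantha
# above; whether the intercept is included in the penalty is a detail of the
# actual implementation):
def l1_logistic_loss(theta, X, y, lam):
    return logistic_loss(theta, X, y) + lam * anp.sum(anp.abs(theta))
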
def l2(load=False):
    if not load:
        X, y = load_breast_cancer(return_X_y=True, as_frame=True)
        cols = X.columns
        X = preprocessing.StandardScaler().fit_transform(X)
        Xdf = pd.DataFrame(X, columns=cols)
        in_split = 3
        out_split = 4
        cv_outer = KFold(n_splits=out_split, shuffle=True, random_state=1)
        outer_results = list()
        ind = 0
        models_st = []
        for train_ix, test_ix in cv_outer.split(X):
            ind += 1
            # split data
            X_train, X_test = X[train_ix, :], X[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]
            # configure the inner cross-validation procedure
            cv_inner = KFold(n_splits=in_split, shuffle=True, random_state=1)
            # define the search space of L2 penalties
            space = np.array([10**x for x in range(-3, 3)])
            res = {}
            for penalty in space:
                acc = []
                for train_ix2, test_ix2 in cv_inner.split(X_train):
                    # inner indices refer to rows of X_train, not of X
                    X_train2, X_test2 = X_train[train_ix2, :], X_train[test_ix2, :]
                    y_train2, y_test2 = y_train.iloc[train_ix2], y_train.iloc[test_ix2]
                    LR = LogisticRegression(fit_intercept=True, l2_coef=penalty)
                    LR.fit_autograd(X_train2, y_train2, n_iter=1000)  # here you can use fit_non_vectorised / fit_autograd methods
                    y_hat2 = LR.predict(X_test2)
                    acc.append(accuracy(y_hat2, y_test2))
                res[LR] = np.mean(acc)
            best_model = max(res, key=res.get)
            models_st.append(best_model)
            # evaluate the best model on the held-out outer fold
            yhat3 = best_model.predict(X_test)
            acc = accuracy(y_test, yhat3)
            outer_results.append(acc)
            # report progress
            print('Fold=%d, acc=%.8f, best_l2_penalty=%.6f' % (ind, acc, best_model.l2_coef))
        # summarize the estimated performance of the model
        print("Overall Estimated Model Performance")
        print('Accuracy: %.5f (%.5f)' % (np.mean(outer_results), np.std(outer_results)))

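# The L2 counterpart of the penalised loss sketched earlier: what fit_autograd
# with l2_coef is assumed to minimise (again, whether the intercept term is
# penalised is up to the actual implementation):
def l2_logistic_loss(theta, X, y, lam):
    return logistic_loss(theta, X, y) + lam * anp.sum(theta ** 2)
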
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from logisticRegression.logisticRegression import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
from metrics import *
import math

scaler = MinMaxScaler()
data = load_breast_cancer()
X = pd.DataFrame(data['data'])
y = pd.Series(data['target'])
scaler.fit(X)
X = scaler.transform(X)
X = pd.DataFrame(X)  # This scales the data to the range 0-1, which makes it easier to train

LR = LogisticRegression(reg="L1")
LR.fit_autograd(X, y, n_iter=400, lr=8e-3)
y_hat = LR.predict(X)
print('Accuracy: ', accuracy(y_hat, y))

LR = LogisticRegression(reg="L2")
LR.fit_autograd(X, y, n_iter=400, lr=8e-3)
y_hat = LR.predict(X)
print('Accuracy: ', accuracy(y_hat, y))

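# The accuracy helper comes from the project's metrics module; a minimal
# sketch of the behaviour these scripts assume (fraction of exact matches):
def accuracy_sketch(y_hat, y):
    return float(np.sum(np.asarray(y_hat) == np.asarray(y))) / len(y)
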
np.random.seed(42)
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X))
y = pd.Series(data.target)

folds = 3
fold_size = X.shape[0] // folds
datasets = [X.iloc[fold_size * i:fold_size * (i + 1)] for i in range(folds)]

print("\n----------Gradient Descent (Formula vs Autograd)----------")
for fit_intercept in [True]:
    LR = LogisticRegression(fit_intercept=fit_intercept)
    # LR = LogisticRegression(fit_intercept=fit_intercept, regularization='L1', reg_lambda=5)
    # LR = LogisticRegression(fit_intercept=fit_intercept, regularization='L2', reg_lambda=0.5)
    LR.fit_vectorised(X, y, X.shape[0], n_iter=100, lr=1)
    y_hat = LR.predict(X)
    print("Gradient Descent using Formula: ", accuracy(y_hat, y))
    LR.fit_autograd(X, y, X.shape[0], n_iter=100, lr=1)
    y_hat = LR.predict(X)
    print("Gradient Descent using Autograd: ", accuracy(y_hat, y))

# 3-fold cross-validation
print("\n----------3 Folds Accuracy----------")
fold_acc = []
for itr1 in range(folds):
def sigmoid(z):
    # the standard logistic function assumed by the helpers below
    return 1.0 / (1.0 + np.exp(-z))

def predictEstimates(X, theta):
    return sigmoid(X.dot(theta[1:]) + theta[0])

def predict(X, theta):
    return np.where(predictEstimates(X, theta) > 0.5, 1, 0)

X = load_breast_cancer().data
y = load_breast_cancer().target
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

print('\n----------Autograd Fit L1 regularised------------')
LR = LogisticRegression(learningRate=0.1, maxIterations=100, regularization='l1', invRegLambda=0.1)
LR.fit_autograd(X, y)
y_hat = LR.predict(X)
print(accuracy(y_hat, y))

print('\n----------Autograd Fit L2 regularised---------------')
LR = LogisticRegression(learningRate=0.1, maxIterations=100, regularization='l2', invRegLambda=0.1)
LR.fit_autograd(X, y)
y_hat = LR.predict(X)
print(accuracy(y_hat, y))

X = load_breast_cancer().data
y = load_breast_cancer().target
scaler = MinMaxScaler()
scaler.fit(X)

from sklearn.model_selection import KFold

x = load_digits(as_frame=True)
X = x.data
s = MinMaxScaler()
X[list(X)] = s.fit_transform(X)
y = x.target
X = np.array(X)
y = np.array(y)

kf = KFold(n_splits=4)
a = []
for train, test in kf.split(X, y):
    batch_size = 5
    fit_intercept = False
    LR = LogisticRegression(fit_intercept=fit_intercept)
    # fit on the training split and score on the held-out split
    LR.fit_k_class_regularised(pd.DataFrame(X[train]), pd.Series(y[train]), n_iter=500)
    y_hat = LR.k_class_predict(pd.DataFrame(X[test]), False)
    a.append(accuracy(y_hat, y[test]))
print(a)

X = np.array([[1.0, 2.0], [2.0, 1.0], [3.0, 1.0], [8.0, 9.0], [9.0, 8.0]])
y = np.array([0, 0, 0, 1, 1])
fit_intercept = False
LR = LogisticRegression(fit_intercept=fit_intercept)
LR.fit_k_class_regularised(pd.DataFrame(X), pd.Series(y), n_iter=500)
y_hat = LR.k_class_predict(pd.DataFrame(X), False)
# print(y_hat)
# print("y", y)
print(accuracy(y_hat, y))
print(LR.confusion(X, y))

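# LR.confusion is part of the project's class; a minimal sketch of the
# quantity it is assumed to report for k classes (rows = true class,
# columns = predicted class):
def confusion_sketch(y_true, y_pred, k):
    cm = np.zeros((k, k), dtype=int)
    for t, p in zip(y_true, y_pred):
        cm[t][p] += 1
    return cm
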
# manual 3-fold split over the 569-row dataset (folds of 190, capped at 570)
best_accuracy = 0
best_LR = None
acc_ov = 0
for i in range(3):
    xti = X.iloc[i * 190:min(570, (i + 1) * 190)]
    yti = y[i * 190:min(570, (i + 1) * 190)]
    xi1 = X.iloc[0:i * 190]
    xi2 = X.iloc[min(570, (i + 1) * 190):570]
    yi1 = y[0:i * 190]
    yi2 = y[min(570, (i + 1) * 190):570]
    xi = pd.concat([xi1, xi2])
    yi = pd.concat([yi1, yi2])
    xi.reset_index(drop=True, inplace=True)
    yi.reset_index(drop=True, inplace=True)
    xti.reset_index(drop=True, inplace=True)
    yti.reset_index(drop=True, inplace=True)
    LR = LogisticRegression()
    LR.fit(xi, yi, n_iter=600, lr=7e-03)
    y_hat = LR.predict(xti)
    acc_curr = accuracy(y_hat, yti)
    if acc_curr > best_accuracy:
        best_accuracy = acc_curr
        best_LR = copy.deepcopy(LR)
    print(f'Accuracy Fold {i+1}: ', acc_curr)
    acc_ov += acc_curr
print("Overall Accuracy:", acc_ov / 3)

print("Decision boundary for features 0 and 1:")
LR = LogisticRegression()
LR.fit(X[[0, 1]], y, n_iter=500, lr=5e-3)
LR.plot(np.array(X[[0, 1]]), np.array(y))

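# LR.plot above is the project's own plotting routine; a minimal sketch of a
# comparable two-feature decision-boundary plot (grid resolution and styling
# here are illustrative):
import matplotlib.pyplot as plt

def plot_boundary_sketch(model, X2, y2):
    xx, yy = np.meshgrid(np.linspace(X2[:, 0].min(), X2[:, 0].max(), 200),
                         np.linspace(X2[:, 1].min(), X2[:, 1].max(), 200))
    grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()])
    zz = np.asarray(model.predict(grid)).reshape(xx.shape)
    plt.contourf(xx, yy, zz, alpha=0.3)    # predicted class regions
    plt.scatter(X2[:, 0], X2[:, 1], c=y2)  # the actual points
    plt.show()
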
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from logisticRegression.logisticRegression import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
from metrics import *
import math

scaler = MinMaxScaler()
data = load_breast_cancer()
X = pd.DataFrame(data['data'])
y = pd.Series(data['target'])
scaler.fit(X)
X = scaler.transform(X)
X = pd.DataFrame(X)  # This scales the data to the range 0-1, which makes it easier to train

LR = LogisticRegression()
LR.fit(X, y, n_iter=500, lr=5e-3)
y_hat = LR.predict(X)
print('Accuracy: ', accuracy(y_hat, y))

LR = LogisticRegression()
LR.fit_autograd(X, y, n_iter=400, lr=8e-3)
y_hat = LR.predict(X)
print('Accuracy: ', accuracy(y_hat, y))

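# A minimal sketch of the batch gradient-descent update LR.fit is assumed to
# perform: theta <- theta - lr * X^T (sigmoid(X theta) - y) / N, with a bias
# column prepended for the intercept. fit_sketch is an illustrative name.
def fit_sketch(X, y, n_iter, lr):
    Xb = np.c_[np.ones(len(X)), np.asarray(X)]  # prepend the bias column
    theta = np.zeros(Xb.shape[1])
    for _ in range(n_iter):
        p = 1.0 / (1.0 + np.exp(-(Xb @ theta)))
        theta -= lr * Xb.T @ (p - np.asarray(y)) / len(y)
    return theta
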
from sklearn.model_selection import KFold

x = load_breast_cancer(as_frame=True)
X = x.data
s = MinMaxScaler()
X[list(X)] = s.fit_transform(X)
y = x.target
X = np.array(X)
y = np.array(y)

kf = KFold(n_splits=3)
a = []
b = []
for train, test in kf.split(X, y):
    batch_size = 5
    fit_intercept = True
    LR = LogisticRegression(fit_intercept=fit_intercept)
    LR.fit_regularised(pd.DataFrame(X[train]), pd.Series(y[train]), batch_size)
    y_hat = LR.predict(pd.DataFrame(X[test]))
    a.append(accuracy(y_hat, y[test]))
print(a)

for train, test in kf.split(X, y):
    batch_size = 5
    fit_intercept = True
    LR = LogisticRegression(fit_intercept=fit_intercept)
    LR.fit_regularised_autograd(pd.DataFrame(X[train]), pd.Series(y[train]), batch_size)
    y_hat = LR.predict(pd.DataFrame(X[test]))
    b.append(accuracy(y_hat, y[test]))
print(b)

X = np.array([[1.0, 2.0], [2.0, 1.0], [3.0, 1.0], [8.0, 9.0], [9.0, 8.0]])
# (tail of a multiclass predict helper: for each row of sigmoid scores,
# return the index of the highest-scoring class)
    myAnswers = []
    for i in sigmoidZ:
        myAnswers.append(np.argmax(i))
    return myAnswers

X = load_digits().data
y = load_digits().target
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

# oneHotEncoding is called below but not defined in this excerpt; it is
# assumed to map integer labels 0..k-1 to an (n_samples, k) indicator matrix:
def oneHotEncoding(labels, k, dtype="float"):
    encoded = np.zeros((len(labels), k), dtype=dtype)
    encoded[np.arange(len(labels)), labels] = 1
    return encoded

print('\n----------MultiClass Normal Fit------------')
yEncoded = oneHotEncoding(y, 10, dtype="float")
LR = LogisticRegression(learningRate=6e-2, maxIterations=60, regularization=None)
LR.fitMulticlass(X, yEncoded)
y_hat = LR.predictMulticlass(X)
print(multiaccuracy(y_hat, yEncoded))

print('\n----------MultiClass Autograd Fit------------')
LR = LogisticRegression(learningRate=6e-2, maxIterations=60, regularization=None)
LR.fitMulticlassAutograd(X, yEncoded)
y_hat = LR.predictMulticlass(X)
print(multiaccuracy(y_hat, yEncoded))

kfolding = KFold(n_splits=4, shuffle=True, random_state=1)
for train, test in kfolding.split(X):
    X_test.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)
    best_accuracy = 0
    for train_index_nested, test_index_nested in kf.split(X_train):
        X_train_nested = X_train.iloc[train_index_nested]
        y_train_nested = y_train[train_index_nested]
        X_test_nested = X_train.iloc[test_index_nested]
        y_test_nested = y_train[test_index_nested]
        X_train_nested.reset_index(drop=True, inplace=True)
        y_train_nested.reset_index(drop=True, inplace=True)
        X_test_nested.reset_index(drop=True, inplace=True)
        y_test_nested.reset_index(drop=True, inplace=True)
        LR = LogisticRegression(reg="L1", reg_coef=k)
        LR.fit_autograd(X_train_nested, y_train_nested, n_iter=75, lr=8e-3)
        y_hat = LR.predict(X_test_nested)
        acc = accuracy(y_hat, y_test_nested)
        if acc > best_accuracy:
            best_accuracy = acc
            best_LR = copy.deepcopy(LR)
    y_hat = best_LR.predict(X_test)
    acc = accuracy(y_hat, y_test)
    if acc > best_acc_fold:
        best_acc_fold = acc
        best_accuracies_fold.append([best_acc_fold, k])

best_accuracies_fold.sort(reverse=True)
print("L1 Regularisation")

import numpy as np
from metrics import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.datasets import load_breast_cancer

np.random.seed(42)

X = load_breast_cancer().data
y = load_breast_cancer().target
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

print('\n---------Unregularised Normal Fit----------')
LR = LogisticRegression(learningRate=0.1, maxIterations=1000)
LR.fit(X, y)
y_hat = LR.predict(X)
print(accuracy(y_hat, y))

print('\n---------Unregularised Autograd Fit----------')
LR = LogisticRegression(learningRate=0.1, maxIterations=1000)
LR.fit_autograd(X, y)
y_hat = LR.predict(X)
print(accuracy(y_hat, y))

print('\n---------3 KFold----------')
Kfolding = KFold(n_splits=3, shuffle=True, random_state=1)
avgAccuracy = 0
for trainData, testData in Kfolding.split(X):
    trainSetData, testSetData = X[trainData], X[testData]
def l1(load=False):
    if not load:
        X, y = load_breast_cancer(return_X_y=True, as_frame=True)
        cols = X.columns
        X = preprocessing.StandardScaler().fit_transform(X)
        Xdf = pd.DataFrame(X, columns=cols)
        in_split = 3
        out_split = 4
        cv_outer = KFold(n_splits=out_split, shuffle=True, random_state=1)
        outer_results = list()
        ind = 0
        models_st = []
        for train_ix, test_ix in cv_outer.split(X):
            ind += 1
            # split data
            X_train, X_test = X[train_ix, :], X[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]
            # configure the inner cross-validation procedure
            cv_inner = KFold(n_splits=in_split, shuffle=True, random_state=1)
            # define the search space of L1 penalties
            space = np.array([10**x for x in range(-3, 3)])
            res = {}
            for penalty in space:
                acc = []
                for train_ix2, test_ix2 in cv_inner.split(X_train):
                    # inner indices refer to rows of X_train, not of X
                    X_train2, X_test2 = X_train[train_ix2, :], X_train[test_ix2, :]
                    y_train2, y_test2 = y_train.iloc[train_ix2], y_train.iloc[test_ix2]
                    LR = LogisticRegression(fit_intercept=True, l1_coef=penalty)
                    LR.fit_autograd(X_train2, y_train2, n_iter=1000)  # here you can use fit_non_vectorised / fit_autograd methods
                    y_hat2 = LR.predict(X_test2)
                    acc.append(accuracy(y_hat2, y_test2))
                res[LR] = np.mean(acc)
            best_model = max(res, key=res.get)
            models_st.append(best_model)
            # evaluate the best model on the held-out outer fold
            yhat3 = best_model.predict(X_test)
            acc = accuracy(y_test, yhat3)
            outer_results.append(acc)
            # report progress
            print('Fold=%d, acc=%.8f, best_l1_penalty=%.6f' % (ind, acc, best_model.l1_coef))
        # summarize the estimated performance of the model
        print("Overall Estimated Model Performance")
        print('Accuracy: %.5f (%.5f)' % (np.mean(outer_results), np.std(outer_results)))
        a_file = open("q2_data.pkl", "wb")
        pickle.dump((cols, models_st), a_file)
        a_file.close()

    a_file = open("q2_data.pkl", "rb")
    cols, models_st = pickle.load(a_file)
    a_file.close()

    # Feature Importance
    for model in models_st[:1]:
        print("Top 5 important features")
        theta = np.squeeze(model.coef_)[1:]  # drop the intercept term
        res = dict()
        for i, val in enumerate(theta):
            res[cols[i]] = val
        res = OrderedDict(sorted(res.items(), key=lambda x: np.abs(x[1]), reverse=True))
        k = 5
        ind = 0
        for key, val in res.items():
            print("Feature: {}, coefficient: {}".format(key, val))
            ind += 1
            if ind == k:
                break

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from logisticRegression.logisticRegression import LogisticRegression
from sklearn.datasets import load_digits
from sklearn.preprocessing import MinMaxScaler
from metrics import *
import math

scaler = MinMaxScaler()
data = load_digits()
X = pd.DataFrame(data['data'])
y = pd.Series(data['target'])
scaler.fit(X)
X = scaler.transform(X)
X = pd.DataFrame(X)  # This scales the data to the range 0-1, which makes it easier to train

LR = LogisticRegression()
LR.fit_multiclass(X, y, n_iter=60, lr=6e-2)
y_hat = LR.predict_multiclass(X)
print('Accuracy: ', accuracy(y_hat, y))

LR = LogisticRegression()
LR.fit_multiclass_autograd(X, y, n_iter=60, lr=1e-3)
y_hat = LR.predict_multiclass(X)
print('Accuracy: ', accuracy(y_hat, y))

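# A minimal sketch of the softmax-style scoring predict_multiclass might use
# (the project's class could equally use one-vs-rest sigmoids; this is an
# assumption): per-class linear scores, normalised to probabilities, argmax
# over classes. W is an illustrative (n_features, n_classes) weight matrix.
def softmax_predict_sketch(X, W):
    scores = np.asarray(X) @ W
    scores -= scores.max(axis=1, keepdims=True)  # for numerical stability
    probs = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
    return probs.argmax(axis=1)
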
for lamda in lamda_range:
    # __________ Train for multiple lamdas and find the one with the best avg_val_accuracy __________ #
    print("\t lamda = {}".format(lamda))
    avg_validation_accuracy = 0
    for k in range(cross_val_folds):
        # __________ Further splitting into multiple folds __________ #
        print("\t \t Validation_Fold = {}".format(k + 1), end=" ")
        X_traindash, y_traindash = X_train_folds.copy(), y_train_folds.copy()
        # __________ Use one of them as the validation fold __________ #
        X_valid, y_valid = X_train_folds[k], y_train_folds[k]
        X_traindash.pop(k)
        y_traindash.pop(k)
        # __________ Concat the rest to create the nested train fold __________ #
        train_X, train_y = pd.concat(X_traindash), pd.concat(y_traindash)
        LR = LogisticRegression()
        LR.fit_autograd(train_X.reset_index(drop=True), train_y.reset_index(drop=True),
                        n_iter=200, batch_size=len(train_X), lr=0.5,
                        reg_type=reg_type, lamda=lamda)
        y_hat = LR.predict(X_valid.reset_index(drop=True))
        # LR.plot_desicion_boundary(X_valid, y_valid, None)
        valid_accuracy = accuracy(y_hat, y_valid.reset_index(drop=True))
        print("Accuracy: {}".format(valid_accuracy))
        avg_validation_accuracy += valid_accuracy
    avg_validation_accuracy = avg_validation_accuracy / cross_val_folds
    print("\t \t \t Avg_val_accuracy: {}".format(avg_validation_accuracy))
    val_accuracies.append([avg_validation_accuracy, lamda])

val_accuracies.sort(reverse=True)
opt_lamda = 0
opt_acc = 0
for i in range(len(val_accuracies)):
    if val_accuracies[i][0] >= opt_acc:
        opt_acc = val_accuracies[i][0]  # track the best accuracy, not just the lamda
        opt_lamda = val_accuracies[i][1]
i = 1
ov_ac = 0
best_acc = 0
best_LR = None
for train_index, test_index in skf.split(X, y):
    X_train = X.iloc[train_index]
    y_train = y[train_index]
    X_test = X.iloc[test_index]
    y_test = y[test_index]
    X_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)
    LR = LogisticRegression()
    LR.fit_multiclass(X_train, y_train, n_iter=50, lr=9e-3)  # 50, 9e-3
    y_hat = LR.predict_multiclass(X_test)
    acc = accuracy(y_hat, y_test)
    if acc > best_acc:
        best_acc = acc
        best_LR = copy.deepcopy(LR)
    print(f'Accuracy for Fold {i}:', acc)
    ov_ac += acc
    i += 1
print("Overall Accuracy:", ov_ac / 4)

y_hat = best_LR.predict_multiclass(X)
confusion_matrix = np.zeros((10, 10))
for k in range(len(y)):
    confusion_matrix[y[k]][y_hat[k]] += 1

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X))
# X = (X - X.min()) / (X.max() - X.min())  # This doesn't work, time to use sklearn... LOL!

data = pd.concat([X, y.rename("y")], axis=1, ignore_index=True)
data = data.sample(frac=1).reset_index(drop=True)  # RANDOMLY SHUFFLING THE DATASET

split = int(0.7 * len(data))  # TRAIN-TEST SPLIT
X_train, y_train = data.iloc[:split].iloc[:, :-1], data.iloc[:split].iloc[:, -1]
X_test, y_test = data.iloc[split:].iloc[:, :-1], data.iloc[split:].iloc[:, -1]

# a) --------- Multi-class Logistic Regression: implemented in LogisticRegression.py --------#
print("\n|--------- Multi-class Logistic Regression using self-update rules ----------|")
fit_intercept = True
LR = LogisticRegression(fit_intercept=fit_intercept)
LR.fit_multi(X_train, y_train, n_iter=100, batch_size=len(X_train), lr=3)  # here you can use fit_non_vectorised / fit_autograd methods
y_hat = LR.predict_multi(X_test)
print('Accuracy: ', accuracy(y_hat, y_test))

# b) --------- Multi-class Logistic Regression AUTOGRAD: implemented in LogisticRegression.py --------#
print("\n|--------- Multi-class Logistic Regression using Autograd ----------|")
fit_intercept = True
LR = LogisticRegression(fit_intercept=fit_intercept)
LR.fit_multi_autograd(X_train, y_train, n_iter=100, batch_size=len(X_train), lr=3)  # here you can use fit_non_vectorised / fit_autograd methods
y_hat = LR.predict_multi(X_test)
print('Accuracy: ', accuracy(y_hat, y_test))

# c) --------- K-Folds Multi-class Logistic Regression over DIGITS --------#
# accu = accuracy(y_hat, y)
# if accu >= maxaccu:
#     maxaccu = accu
#     maxlamda = alpha * i
# return maxlamda

np.random.seed(42)
N = 30
P = 2
X = pd.DataFrame(np.random.randn(N, P))
y = pd.Series([
    0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1,
    0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1
])
fit_intercept = True
LR = LogisticRegression(fit_intercept=fit_intercept)
LR.fit_L1_autograd(pd.DataFrame(X), pd.Series(y))
y_hat = LR.predict(pd.DataFrame(X))
print(accuracy(y_hat, y))

LR.fit_L2_autograd(pd.DataFrame(X), pd.Series(y))
y_hat = LR.predict(pd.DataFrame(X))
print(accuracy(y_hat, y))
print(LR.coef_)

optimize_lambda(pd.DataFrame(X), pd.Series(y))  # optimize_lambda expects y as a Series

# x = load_breast_cancer(as_frame=True)
# X = x.data
# s = MinMaxScaler()
# X[list(X)] = s.fit_transform(X)
# y = x.target
# X = np.array(X)