def fit(self, X, y):
    """Fit the boosted ensemble of depth-1 decision trees (AdaBoost).

    Each round fits a ``DecisionTree("information_gain", max_depth=1)`` on
    the current sample weights, computes its weighted error, derives the
    estimator weight ``alpha`` and re-weights the samples so that
    misclassified rows count more in the next round.

    Args:
        X: Feature table; rows are accessed positionally via ``X.iloc``.
        y: Labels, indexed as ``y[i]`` for ``i`` in ``range(len(X))``.

    Side effects:
        Sets ``self.data``, ``self.labels`` and ``self.classes``; appends one
        entry per round to ``self.clfs``, ``self.clfsy``,
        ``self.estimators_list`` and ``self.alphas``.
    """
    self.data = X
    self.labels = y
    self.classes = list(set(y))

    n = len(X)
    weights = [1 / n for _ in range(n)]
    # Keeps the error strictly inside (0, 1) so the log-odds below is finite
    # even when a stump classifies every row correctly (or incorrectly).
    eps = 1e-10

    for _ in range(self.n_estimators):
        self.clfs.append(X)
        self.clfsy.append(y)

        stump = DecisionTree("information_gain", max_depth=1)
        stump.fit(X, y, sample_weights=weights)
        self.estimators_list.append(stump)

        # Predict every row once and reuse the result for both the error
        # computation and the weight update.
        missed = [bool(stump.predict(X.iloc[[i]]) != y[i]) for i in range(n)]

        err = sum(w for w, wrong in zip(weights, missed) if wrong)
        err = min(max(err, eps), 1 - eps)

        # Natural log: the weight update uses exp(+/-alpha), so the two must
        # share the same base.
        alpha = 0.5 * math.log((1 - err) / err)
        self.alphas.append(alpha)

        grow, shrink = math.exp(alpha), math.exp(-alpha)
        weights = [w * (grow if wrong else shrink)
                   for w, wrong in zip(weights, missed)]

        # Normalise so the weights sum to 1 again.
        total = sum(weights)
        weights = [w / total for w in weights]
def find_Time(case):
    """Time ``DecisionTree`` fit and predict while varying N, then P.

    First sweep: the first argument of ``CreateFakeData`` runs over
    ``range(100, 500, 20)`` with the second fixed at 5.  Second sweep: the
    first argument is fixed at 100 and the second runs over
    ``range(2, 24, 2)``.

    Args:
        case: Forwarded unchanged as the third argument of ``CreateFakeData``.

    Returns:
        Tuple ``(axis_Nf, axis_Np, axis_Pf, axis_Pp)`` of lists holding the
        fit / predict durations for the N sweep and the fit / predict
        durations for the P sweep.  The N lists have 25 slots of which the
        first 20 are filled (the rest stay 0); the P lists have 11 slots.
    """
    axis_Nf = [0] * 25  # fit times for different N with P fixed
    axis_Np = [0] * 25  # predict times for different N with P fixed
    axis_Pf = [0] * 11  # fit times for different P with N fixed
    axis_Pp = [0] * 11  # predict times for different P with N fixed

    def timed_run(X, y):
        """Return (fit_seconds, predict_seconds) for a fresh DecisionTree."""
        mod = DecisionTree()
        st1 = time()
        mod.fit(X, y)
        ed1 = time()
        st2 = time()
        mod.predict(X)
        ed2 = time()
        return ed1 - st1, ed2 - st2

    print("Started 1")
    for i in range(100, 500, 20):
        X, y = CreateFakeData(i, 5, case)  # P is fixed at 5
        # Both timings of one run go in the same slot so the two lists stay
        # aligned with each other and with the N values.
        slot = (i - 100) // 20
        axis_Nf[slot], axis_Np[slot] = timed_run(X, y)

    print("Started 2")
    for i in range(2, 24, 2):
        X, y = CreateFakeData(100, i, case)  # N is fixed at 100
        slot = (i - 2) // 2
        axis_Pf[slot], axis_Pp[slot] = timed_run(X, y)

    return axis_Nf, axis_Np, axis_Pf, axis_Pp
def nested_cross_validation(dataset, y):
    """Run nested cross-validation and print one result line per outer fold.

    The outer loop holds out 5 consecutive blocks of 30 rows as test data.
    For each outer fold, every depth from 1 to 10 is scored by 4 inner folds
    of 30 rows taken from the remaining training rows (gini_index criterion).
    The depth with the highest mean validation accuracy is used to fit a
    tree on the whole outer training set, which is then scored on the
    held-out block.

    Args:
        dataset: Feature rows, sliceable by row position.
        y: Labels aligned with ``dataset``.
    """
    for outer in range(5):
        lo, hi = 30 * outer, 30 * (outer + 1)
        test = dataset[lo:hi]
        test_label = y[lo:hi]

        if outer == 0:
            # First block held out: the training rows are one plain slice.
            train = dataset[hi:]
            train_label = y[hi:]
        else:
            # Otherwise join the rows before and after the held-out block.
            train = np.append(dataset[0:lo], dataset[hi:], axis=0)
            train_label = np.append(y[0:lo], y[hi:], axis=0)

        accuracy_validation = {}
        for depth in range(1, 11):
            running = 0
            for inner in range(4):
                v_lo, v_hi = 30 * inner, 30 * (inner + 1)
                validation = train[v_lo:v_hi]
                validation_label = train_label[v_lo:v_hi]

                # Rows after the validation block come first, then the rows
                # before it.
                fit_rows = np.append(train[v_hi:], train[0:v_lo], axis=0)
                fit_labels = np.append(train_label[v_hi:],
                                       train_label[0:v_lo], axis=0)

                fit_frame = pd.DataFrame(fit_rows)
                fit_series = pd.Series(fit_labels, dtype="category")
                fit_frame.reset_index(drop=True, inplace=True)
                fit_series.reset_index(drop=True, inplace=True)

                candidate = DecisionTree(criterion="gini_index",
                                         max_depth=depth)
                candidate.fit(fit_frame, fit_series)
                running += accuracy(candidate.predict(validation),
                                    validation_label)
            accuracy_validation[depth] = running / 4

        # Depth with the best mean validation accuracy (first one on ties).
        value = max(accuracy_validation, key=accuracy_validation.get)

        final_tree = DecisionTree(criterion="gini_index", max_depth=value)
        final_tree.fit(pd.DataFrame(train),
                       pd.Series(train_label, dtype="category"))
        print("Accuracy is,", accuracy(final_tree.predict(test), test_label),
              " for iteration", outer + 1,
              ". The depth of the optimal tree is ", value)
def cross_validtion_5_fold(X, y, depth):
    """Print the accuracy of each of five folds on a 150-row dataset.

    The data is treated as five consecutive blocks of 30 rows.  For every
    fold a fresh ``DecisionTree(criterion="a", max_depth=depth)`` is fitted
    on the other four blocks and scored on the held-out block; the accuracy
    is printed.  Folds are evaluated in the order 120-150, 60-90, 90-120,
    0-30, 30-60.

    Args:
        X: Feature rows, sliceable by row position.
        y: Labels aligned with ``X``.
        depth: ``max_depth`` passed to every tree.
    """
    # (held-out rows, training slices in the order they are concatenated)
    folds = [
        (slice(120, None), [slice(0, 120)]),
        (slice(60, 90), [slice(90, None), slice(0, 60)]),
        (slice(90, 120), [slice(120, None), slice(0, 90)]),
        (slice(0, 30), [slice(30, None)]),
        (slice(30, 60), [slice(0, 30), slice(60, None)]),
    ]

    for test_rows, train_parts in folds:
        if len(train_parts) == 1:
            train_X = X[train_parts[0]]
            train_y = y[train_parts[0]]
        else:
            first, second = train_parts
            train_X = np.append(X[first], X[second], axis=0)
            train_y = np.append(y[first], y[second], axis=0)

        clf = DecisionTree(criterion="a", max_depth=depth)
        clf.fit(pd.DataFrame(train_X), pd.Series(train_y, dtype="category"))

        # Always hand predict() a DataFrame, the same container type the
        # tree was fitted on.
        y_hat = clf.predict(pd.DataFrame(X[test_rows]))
        print(accuracy(pd.Series(y_hat), pd.Series(y[test_rows])))
def analyseTime(case):
    """Measure fit and predict wall-clock time over a grid of N and P.

    For every N in 40..49 and P in 2..9 a dataset is generated with
    ``createFakeData(N, P, case)``, a depth-3 information-gain tree is
    fitted and used to predict on the same data, and both durations are
    recorded.  The two tables are then passed to ``plotTimings``.

    Args:
        case: Integer between 1 and 4 (asserted), forwarded to
            ``createFakeData``.
    """
    assert (1 <= case <= 4)

    fitTimes = dict(N=[], P=[], time=[])
    predictTimes = dict(N=[], P=[], time=[])

    def log_timing(table, n, p, seconds):
        # One row per (N, P) combination, kept as three parallel lists.
        table['N'].append(n)
        table['P'].append(p)
        table['time'].append(seconds)

    for N in range(40, 50):
        for P in range(2, 10):
            print("Running with N", N, "and P", P)
            X, y = createFakeData(N, P, case)
            tree = DecisionTree(criterion="information_gain", max_depth=3)

            began = time.time()
            tree.fit(X, y)
            log_timing(fitTimes, N, P, time.time() - began)

            began = time.time()
            tree.predict(X)
            log_timing(predictTimes, N, P, time.time() - began)

    plotTimings(fitTimes)
    plotTimings(predictTimes)
def five_fold_validation(X, y, depth=5):
    """Run five-fold cross-validation on a 150-row dataset and print results.

    The rows are split into five consecutive blocks of 30.  Each block is
    held out once (order: 120-150, 60-90, 90-120, 0-30, 30-60) while an
    information-gain tree of the given depth is fitted on the rest.  The
    five accuracies are printed on one line, followed by their mean.

    Args:
        X: Feature rows, sliceable by row position.
        y: Labels aligned with ``X``.
        depth: ``max_depth`` of every tree.
    """
    def gather(values, spans):
        # A single span is used as-is; two spans are joined in the listed order.
        pieces = [values[start:stop] for start, stop in spans]
        if len(pieces) == 1:
            return pieces[0]
        return np.append(pieces[0], pieces[1], axis=0)

    # ((test start, test stop), training spans in concatenation order)
    plan = (
        ((120, None), ((0, 120),)),
        ((60, 90), ((90, None), (0, 60))),
        ((90, 120), ((120, None), (0, 90))),
        ((0, 30), ((30, None),)),
        ((30, 60), ((0, 30), (60, None))),
    )

    accs = []
    for (lo, hi), spans in plan:
        clf = DecisionTree(criterion="information_gain", max_depth=depth)
        clf.fit(pd.DataFrame(gather(X, spans)),
                pd.Series(gather(y, spans), dtype="category"))
        y_hat = clf.predict(pd.DataFrame(X[lo:hi]))
        accs.append(accuracy(pd.Series(y_hat), pd.Series(y[lo:hi])))

    print("Individual Accuracies:")
    print(*accs)
    print("Average Accuracy:")
    avg = sum(accs) / 5
    print(avg)
def my_regr(X, y, max_depth=5, criterion="information_gain"):
    """Fit the project decision tree on the estate data and print its errors.

    Rows 0-329 are used for fitting; the remaining rows are predicted and
    the RMSE, then the MAE, of those predictions are printed.

    Args:
        X: Feature rows, sliceable by row position.
        y: Target values aligned with ``X``.
        max_depth: Depth limit passed to ``DecisionTree``.
        criterion: Split criterion passed to ``DecisionTree``.
    """
    split = 330

    model = DecisionTree(criterion=criterion, max_depth=max_depth)
    model.fit(pd.DataFrame(X[:split]), pd.Series(y[:split]))

    predicted = model.predict(pd.DataFrame(X[split:]))
    actual = pd.Series(y[split:])

    print(rmse(predicted, actual))
    print(mae(predicted, actual))
def train_and_predict(X, y, max_depth=15):
    """Fit the project decision tree on iris and print its test metrics.

    Rows 0-119 are used for fitting and the remaining rows for testing.
    Prints the overall accuracy, then precision and recall for every class
    that occurs in the test labels.

    Args:
        X: Feature rows, sliceable by row position.
        y: Labels aligned with ``X``.
        max_depth: Depth limit passed to ``DecisionTree``.
    """
    split = 120

    model = DecisionTree(criterion="information_gain", max_depth=max_depth)
    model.fit(pd.DataFrame(X[:split]),
              pd.Series(y[:split], dtype="category"))

    held_out = y[split:]
    y_hat = model.predict(pd.DataFrame(X[split:]))
    print("Accuracy", accuracy(pd.Series(y_hat), pd.Series(held_out)))

    truth = pd.Series(held_out)
    for label in truth.unique():
        print('Precision: ', label, " : ", precision(y_hat, truth, label))
        print('Recall: ', label, " : ", recall(y_hat, truth, label))
def nested_cross(data, y, k1=5, k2=4):
    """Select a tree depth per outer fold using inner cross-validation.

    The data is split into ``k1`` consecutive outer folds.  For each outer
    fold the remaining rows are split into ``k2`` inner folds, and every
    depth in 2..9 is scored by its mean inner validation accuracy.  The best
    mean accuracy and the depth that achieved it are printed.

    Args:
        data: Feature rows, sliceable by row position.
        y: Labels aligned with ``data``.
        k1: Number of outer folds.
        k2: Number of inner folds.
    """
    depths = range(2, 10)
    outer_size = len(data) // k1

    for i in range(k1):
        lo, hi = outer_size * i, outer_size * (i + 1)
        # Rows lo:hi are the held-out outer fold; they take no part in the
        # depth selection below.
        x_train = np.append(data[0:lo], data[hi:], axis=0)
        y_train = np.append(y[0:lo], y[hi:], axis=0)

        inner_size = len(x_train) // k2
        acc = []
        for depth in depths:
            s = 0
            for j in range(k2):
                v_lo, v_hi = inner_size * j, inner_size * (j + 1)
                x_val_test = pd.DataFrame(x_train[v_lo:v_hi])
                y_val_test = pd.DataFrame(y_train[v_lo:v_hi])
                x_val_train = pd.DataFrame(
                    np.append(x_train[0:v_lo], x_train[v_hi:], axis=0))
                y_val_train = pd.DataFrame(
                    np.append(y_train[0:v_lo], y_train[v_hi:], axis=0))

                # NOTE(review): these ad-hoc ``dtype`` attributes are
                # presumably read by the project's DecisionTree to tell
                # feature/label kinds apart - confirm against tree code.
                x_val_train.dtype = "sda"
                y_val_train.dtype = "category"
                x_val_test.dtype = "sda"
                y_val_test.dtype = "category"

                tree = DecisionTree("information_gain", max_depth=depth)
                tree.fit(x_val_train, y_val_train)
                s += accuracy(np.array(y_val_test),
                              np.array(tree.predict(x_val_test)))
            acc.append(s / k2)

        best_accuracy = max(acc)
        # Map the position of the best score back to the depth that
        # produced it (depths start at 2, not 1).
        best_depth = depths[acc.index(best_accuracy)]

        # Tree configured with the selected depth; it is not fitted or
        # evaluated here.
        tree = DecisionTree("information_gain", max_depth=best_depth)
        print("Best Accuracy is : - " + str(best_accuracy))
        print("At Depth : - " + str(best_depth))
# Defining Train Test Split train_test_split = int(0.7*len(iris_data)) X = X_data.iloc[:train_test_split, :] X_test = X_data.iloc[train_test_split:, :] y = y_data.iloc[:train_test_split] y_test = y_data.iloc[train_test_split:] # Training and Testing for criteria in ['information_gain', 'gini_index']: tree = DecisionTree(criterion=criteria, max_depth=3) # Build Decision Tree tree.fit(X, y) #Predict y_hat = tree.predict(X) y_test_hat = tree.predict(X_test) tree.plot() print('Criteria :', criteria) print('Train Accuracy: ', accuracy(y_hat, y)) print('Test Accuracy: ', accuracy(y_test_hat, y_test)) # Precesion and Recall for each class for cls in y.unique(): print("Class =",cls) print('Precision: ', precision(y_test_hat, y_test, cls)) print('Recall: ', recall(y_test_hat, y_test, cls)) #################################################################################### # 5 fold cross-validation
# Demo script: fit the project's DecisionTree on random categorical features
# and categorical labels, then print accuracy and per-class precision/recall.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tree.base import DecisionTree
from metrics import *

# Fixed seed so the random data below is reproducible.
np.random.seed(42)

N = 30  # number of samples
P = 5  # upper bound (exclusive) of the random integers drawn below

# Five categorical feature columns keyed 0..4, each with N random integers.
X = pd.DataFrame({
    i: pd.Series(np.random.randint(P, size=N), dtype="category")
    for i in range(5)
})
# Categorical labels drawn the same way as the features.
y = pd.Series(np.random.randint(P, size=N), dtype="category")

print('\n\n##Discrete Input and Discrete Output##')
for criteria in ['information_gain']:
    # Split based on Inf. Gain; np.inf is passed as the depth limit.
    tree = DecisionTree(criterion=criteria, max_depth=np.inf)
    tree.fit(X, y)
    # Predictions are made on the training data itself.
    y_hat = tree.predict(X)
    tree.plot()
    print('Criteria :', criteria)
    print('Accuracy: ', accuracy(y_hat, y))
    for cls in y.unique():
        print(cls)
        print('Precision: ', precision(y_hat, y, cls))
        print('Recall: ', recall(y_hat, y, cls))
np.random.seed(42) # Read IRIS data set # ... # tree = DecisionTree(criterion='information_gain',max_depth=10) #Split based on Inf. Gain tree.output="category" tree.input="real" df=pd.read_csv("iris.data",names=['sepal_length','sepal_width','petal_length','petal_width','label']) train_data,test_data=tree.train_test_split(df) sub_tree = tree.decision_tree_algorithm(train_data) tree.tree=sub_tree rows,colums=test_data.values.shape y_hat = tree.predict(test_data.iloc[:,0:colums-1]) y= test_data.iloc[:,-1] print('Accuracy: ', accuracy(y_hat, y)) for cls in y.unique(): print('Class Name: ',cls) print('Precision: ', precision(y_hat, y, cls)) print('Recall: ', recall(y_hat, y, cls)) print() index=df.index.tolist() len,_=df.values.shape #print(len) test_size=int(len*0.2) for i in range(5):
# Read IRIS data set # ... # dataset = load_iris() X, y = dataset.data, dataset.target #from sklearn.utils import shuffle #X, y = shuffle(X, y, random_state=0) print("fit model for iris dataset for 70-30 division") clf = DecisionTree(criterion="a", max_depth=5) clf.fit(pd.DataFrame(X[0:120]), pd.Series(y[0:120], dtype="category")) y = y[120:] y_hat = clf.predict(pd.DataFrame(X[120:])) print("Accuracy", accuracy(pd.Series(y_hat), pd.Series(y))) y = pd.Series(y) for cls in y.unique(): print('Precision: for class ', cls, " : ", precision(y_hat, y, cls)) print('Recall: ', cls, " : ", recall(y_hat, y, cls)) def cross_validtion_5_fold(X, y, depth): X_original = X y_original = y clf = DecisionTree(criterion="a", max_depth=depth) clf.fit(pd.DataFrame(X[0:120]), pd.Series(y[0:120], dtype="category")) y = y[120:]
# Read real-estate data set and compare the project's decision tree with
# scikit-learn's DecisionTreeRegressor on the same train/test split.
# NOTE(review): the CSV path below is a machine-specific absolute path.
data = pd.read_csv(r'C:\Users\Anshuman Yadav\Documents\Real.csv')

# Features are every column except the first and the last; the target is the
# last column.
X_train, X_test, Y_train, Y_test = train_test_split(
    data[data.columns[1:-1]], data[data.columns[-1]])
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

# NOTE(review): these ad-hoc ``dtype`` attributes are presumably read by the
# project's DecisionTree - confirm against the tree implementation.
X_train.dtype = "d"
X_test.dtype = "d"

tree = DecisionTree("ad", max_depth=25)
tree.fit(X_train, Y_train)
y_pred = tree.predict(X_test)
print("MAE my tree : -")
print(mae(np.array(Y_test), np.array(y_pred)))
# The value printed here comes from rmse(), so it is labelled RMSE.
print("RMSE my tree : -")
print(rmse(np.array(Y_test), np.array(y_pred)))

d_tree_sklearn = tree5.DecisionTreeRegressor()
d_tree_sklearn = d_tree_sklearn.fit(X_train, Y_train)
y_sklearn = d_tree_sklearn.predict(X_test)
print("MAE sklearn : -")
print(mae(np.array(Y_test), np.array(y_sklearn)))
print("RMSE sklearn : -")
print(rmse(np.array(Y_test), np.array(y_sklearn)))
np.random.seed(42)

# Read IRIS data set, shuffle the rows and renumber them from 0.
iris = pd.read_csv('iris.csv')
iris = iris.sample(frac=1).reset_index(drop=True)

# First 70% of the shuffled rows for training, the rest for testing; the
# last column is the label.
split_at = int(0.7 * (iris.shape[0]))
X_train = iris.iloc[:split_at, :-1]
y_train = iris.iloc[:split_at, -1]
X_test = iris.iloc[split_at:, :-1]
y_test = iris.iloc[split_at:, -1]

model = DecisionTree()
model.fit(X_train, y_train)
y_out = model.predict(X_test)
print("Accuracy is: ", accuracy(y_out, y_test))
for group in np.unique(y_test):
    print("Precision of {} is: {}".format(group, precision(y_out, y_test, group)))
    print("Recal of {} is: {}".format(group, recall(y_out, y_test, group)))

# Accuracy of all five models: five consecutive folds of 20% each.
fold = int(0.2 * (iris.shape[0]))
for i in range(5):
    n_split1 = i * fold
    n_split2 = n_split1 + fold
    X_test1 = iris.iloc[n_split1:n_split2, :-1].reset_index(drop=True)
    y_test1 = pd.Series(list(iris.iloc[n_split1:n_split2, -1]))
    # Training rows are everything before and after the held-out fold.
    # pd.concat is used because DataFrame.append no longer exists in
    # pandas 2.x.
    X_train1 = pd.concat(
        [iris.iloc[:n_split1, :-1], iris.iloc[n_split2:, :-1]]
    ).reset_index(drop=True)
# Preprocessing X = shuffled.iloc[:, :-1].squeeze() y = (shuffled.iloc[:, -1:]).T.squeeze() len_estate = len(y) # Splitting data X_train, y_train = X.loc[:split*len_estate], y.loc[:split*len_estate] X_test, y_test = X.loc[split*len_estate+1:].reset_index( drop=True), y.loc[split*len_estate+1:].reset_index(drop=True) # Learning tree print("Please wait for some time, it takes time, you can change max depth if it takes too long time.") tree = DecisionTree(criterion="information_gain", max_depth=max_depth) tree.fit(X_train, y_train) tree.plot() # Printing accuracies for different depths for depth in range(2, max_depth+1): y_hat = tree.predict(X_test, max_depth=depth) print("Depth: ", depth) print('\tRMSE: ', rmse(y_hat, y_test)) print('\tMAE: ', mae(y_hat, y_test)) # Decision Tree Regressor from Sci-kit learn dt = DecisionTreeRegressor(random_state=0) dt.fit(X_train, y_train) y_hat = pd.Series(dt.predict(X_test)) print('Sklearn RMSE: ', rmse(y_hat, y_test)) print('Sklearn MAE: ', mae(y_hat, y_test))
# Real-estate regression experiment: compare the project's DecisionTree with
# scikit-learn's DecisionTreeRegressor, both limited to depth 2.
# NOTE(review): `pd` is used below but no pandas import is visible in this
# part of the file - confirm it is imported above.
import numpy as np
import matplotlib.pyplot as plt
from tree.base import DecisionTree
from metrics import *
from sklearn.tree import DecisionTreeRegressor

np.random.seed(42)

# Read real-estate data set
# The 'No' column becomes the index and every value is read as float.
estate = pd.read_csv('Real_estate.csv', index_col='No', dtype=float)
# Shuffle the rows and renumber them from 0.
estate = estate.sample(frac=1).reset_index(drop=True)

# NOTE(review): the first 30% of the rows are used for training and the
# remaining 70% for testing - confirm this ratio is intended.
split_at = int(0.3 * (estate.shape[0]))
X_train = estate.iloc[:split_at, :-1]
y_train = estate.iloc[:split_at, -1]
X_test = estate.iloc[split_at:, :-1]
y_test = estate.iloc[split_at:, -1]

# Project decision tree.
model = DecisionTree(max_depth=2)
model.fit(X_train, y_train)
y_out = model.predict(X_test)
print("Rmse is: ", rmse(y_out, y_test))
print("Mae is: ", mae(y_out, y_test))

# scikit-learn reference model on the same split; y_out is reused for its
# predictions.
model2 = DecisionTreeRegressor(max_depth=2)
model2.fit(X_train, y_train)
y_out = model2.predict(X_test)
print("Rmse of Sklearn is: ", rmse(y_out, y_test))
print("Mae of Sklearn is: ", mae(y_out, y_test))
# Positional split of `data`: rows before train_test_split are used for
# training, the rest for testing; the last column is the target.
X = data.iloc[:train_test_split, :-1]
X_test = data.iloc[train_test_split:, :-1]
y = data.iloc[:train_test_split, -1]
y_test = data.iloc[train_test_split:, -1]

# Depth limit shared by the project tree and the sklearn tree below.
maxdepth = 4

# Building Decision Tree based on my model
criteria = 'information_gain'
mytree = DecisionTree(criterion=criteria, max_depth=maxdepth) #Split based on Inf. Gain
mytree.fit(X, y)
mytree.plot()
print("My Model")
# Errors on the training rows.
y_hat = mytree.predict(X)
print("Train Scores:")
print('\tRMSE: ', rmse(y_hat, y))
print('\tMAE: ', mae(y_hat, y))
# Errors on the held-out rows.
y_test_hat = mytree.predict(X_test)
print("Test Scores:")
print('\tRMSE: ', rmse(y_test_hat, y_test))
print('\tMAE: ', mae(y_test_hat, y_test))

###################################################################################
# Building Decision Tree based on sklearn
# `tree` here is a module-like object exposing DecisionTreeRegressor.
print("Sklearn Model")
clf = tree.DecisionTreeRegressor(max_depth=maxdepth)
clf = clf.fit(X,y)
# Part 1: a single weighted decision tree on random data.
print("-----------------------------------------------------------")
print("Decision stump on random data")
print("-----------------------------------------------------------")
N = 30  # number of samples
P = 2  # number of feature columns
NUM_OP_CLASSES = 2  # number of output classes
n_estimators = 3  # boosting rounds for the AdaBoost section below
# Features are absolute values of standard-normal draws.
X = pd.DataFrame(np.abs(np.random.randn(N, P)))
y = pd.Series(np.random.randint(NUM_OP_CLASSES, size=N), dtype="category")
criteria = 'information_gain'
# NOTE(review): no max_depth is passed here although the banner says
# "stump" - confirm the DecisionTree default depth.
tree = DecisionTree(criterion=criteria)
# Uniform sample weights: one weight of 1/rows per row.
re = X.shape[0]
img_weights = [1 / re] * re
tree.fit(X, y, img_weights)
yhat = pd.Series(tree.predict(X))
print('Criteria :', criteria)
print('Accuracy: ', accuracy(yhat, y))
for cls in y.unique():
    print("***Class :" + str(cls) + "***")
    print('Precision: ', precision(yhat, y, cls))
    print('Recall: ', recall(yhat, y, cls))

# Part 2: AdaBoost on the same data, using the tree above as base estimator.
print("-----------------------------------------------------------")
print("Adaboost on random data")
print("-----------------------------------------------------------")
Classifier_AB = AdaBoostClassifier(base_estimator=tree, n_estimators=n_estimators)
Classifier_AB.fit(X, y)
y_hat = Classifier_AB.predict(X)
# Turn the iris labels into a two-class problem: virginica vs. the rest.
y[y != 'Iris-virginica'] = 'not virginica'

# First 60% of the rows for training, the rest for testing.
N = len(y)
t = int(np.floor(0.6*N))
X_train = X.iloc[:t, :]
y_train = y[:t]
X_test = X.iloc[t:, :]
# Rebuild the test labels so their index starts at 0.
y_test = pd.Series(list(y[t:]))

# AdaBoost with depth-1 trees as base estimator.
criteria = 'information_gain'
tree = DecisionTree(criterion=criteria, max_depth=1)
Classifier_AB = AdaBoostClassifier(base_estimator=tree, n_estimators=n_estimators)
Classifier_AB.fit(X_train, y_train)
y_hat = Classifier_AB.predict(X_test)
# [fig1, fig2] = Classifier_AB.plot()
print('Criteria :', criteria)
print('Accuracy: ', accuracy(y_hat, y_test))
for cls in y.unique():
    print('Precision: ', precision(y_hat, y_test, cls))
    print('Recall: ', recall(y_hat, y_test, cls))

# Single decision stump on the same split, for comparison.
print("\nDECISION STUMP")
# Uniform weights sized to the training set: one weight per training row,
# summing to 1.
n_train = len(X_train)
tree.fit(X_train, y_train, np.ones(n_train) / n_train)
y_hat = tree.predict(X_test)
print('Criteria :', criteria)
print('Accuracy: ', accuracy(y_hat, y_test))
for cls in y.unique():
    print('Precision: ', precision(y_hat, y_test, cls))
    print('Recall: ', recall(y_hat, y_test, cls))