def testEntropyOneEl(self):
    """Entropy of a single-element label list is 0 (no uncertainty)."""
    dt = decisionTree.DecisionTree()
    # assertEquals is a deprecated alias, removed in Python 3.12.
    self.assertEqual(dt.entropy(['yes']), 0)
def testRecursiveBuild(self):
    """A 2-sample, 2-feature set with disagreeing labels must build a real
    subtree — it cannot collapse to any single-answer leaf."""
    dt = decisionTree.DecisionTree()
    tree = dt.makeTree([[1, 'a'], [2, 'a']], ['yes', 'no'], ['a1', 'a2'],
                       {'a1': [1, 2], 'a2': ['a', 'b']}, 'default')
    # assertNotEquals is a deprecated alias, removed in Python 3.12.
    self.assertNotEqual(repr(tree), 'yes')
    self.assertNotEqual(repr(tree), 'no')
    self.assertNotEqual(repr(tree), 'default')
def tree_fit_predict(xTrain, yTrain, xTest, index):
    """Fit a DecisionTree on the rows of (xTrain, yTrain) selected by *index*
    and return its predictions for xTest as a tensor.

    NOTE(review): `maxDepth` is read from an enclosing/global scope — confirm
    it is defined wherever this helper is called from.
    """
    model = decisionTree.DecisionTree(maxDepth=maxDepth, verbose=True)
    result = model.fit(xTrain[index], yTrain[index]).predict(xTest)
    return tf.convert_to_tensor(result)
def testTennis(self):
    """End-to-end: fit on the tennis data set and verify it reproduces its
    own training labels, both per-sample and in batch."""
    tennis = mu.extract_data('tennis.csv')
    tennis = mu.enhance_data(tennis)
    dt = decisionTree.DecisionTree(tennis['feature_dict'], tennis['feature_names'])
    dt.fit(tennis['data'], tennis['target'])
    # assertEquals is a deprecated alias, removed in Python 3.12.
    for x, y in zip(tennis['data'], tennis['target']):
        self.assertEqual(dt.predict([x]), [y])
    self.assertEqual(dt.predict(tennis['data']), tennis['target'])
def run(sc, numTrees=100):
    """Train a bagged ensemble of decision trees on the airline data with
    Spark and return the F1 score of the majority vote on the test split.

    sc       -- an active SparkContext
    numTrees -- number of bootstrap sub-samples / trees to train
    """

    def zero_matrix(n, m):
        # n x m matrix of int zeros, used as the fold() identity element.
        return np.zeros(n * m, dtype=int).reshape(n, m)

    def vote_increment(y_est):
        # test point x class matrix with 1s marking the estimator prediction
        increment = zero_matrix(y_est.size, n_ys)
        increment[np.arange(y_est.size), y_est] = 1
        return increment

    airlineData = pd.read_csv("AirlineReduced")
    # Drop columns derived from the actual arrival time (target leakage).
    airlineData = airlineData.loc[:, ~airlineData.columns.isin([
        'DepTime', 'ArrTime', 'CRSArrTime', 'CRSDepTime',
        'ActualElapsedTime', 'ArrTimeInMins', 'ArrDelay'
    ])]
    # NOTE(review): test_size=0.75 trains on only 25% of the rows — confirm
    # this split is intentional.
    X_train, X_test, y_train, y_test = train_test_split(
        airlineData.loc[:, ~airlineData.columns.isin(['IsDelayed'])],
        airlineData['IsDelayed'], test_size=0.75, random_state=42)
    X_train, X_test, y_train, y_test = (np.array(X_train), np.array(X_test),
                                        np.array(y_train), np.array(y_test))
    n_test = X_test.shape[0]
    n_ys = np.unique(y_train).size

    # Broadcast the arrays once so every executor shares a single copy.
    X_train_BC = sc.broadcast(X_train)
    X_test_BC = sc.broadcast(X_test)
    y_train_BC = sc.broadcast(y_train)
    y_test_BC = sc.broadcast(y_test)

    model = decisionTree.DecisionTree()
    # Partition the training data into random 2/3 sub-samples. (Note:
    # random.sample draws WITHOUT replacement — the original comment claiming
    # "with replacement" was wrong; also fixes the 'listOfRanomIndexes' typo.)
    random_index_lists = [
        random.sample(list(range(y_train.size)), int((2 / 3) * y_train.size))
        for _ in range(numTrees)
    ]
    samples = sc.parallelize(random_index_lists)

    # Train a model per sub-sample, predict the test set, and fold the
    # one-hot votes into a single tally matrix.
    vote_tally = samples.map(lambda index: model.fit(
        X_train_BC.value[index], y_train_BC.value[index]).predict(
            X_test_BC.value)).map(vote_increment).fold(
                zero_matrix(n_test, n_ys), np.add)

    # Take the learner majority vote.
    y_estimate_vote = np.argmax(vote_tally, axis=1)
    return f1_score(y_test_BC.value, y_estimate_vote)
def fit(self, dataset, label):
    """Fit the boosted ensemble: each round fits a regression tree to the
    current pseudo-target, accumulates its predictions, and re-derives the
    next pseudo-target from the scaled running prediction."""
    label = np.asarray(label)
    target = label
    accumulated = 0
    for round_idx in range(self.iteration):
        start = time.time()
        tree = decisionTree.DecisionTree(self.max_depth_tree, self.min_size, 'sse')
        tree.fit(dataset, target)
        accumulated = accumulated + tree.predict(dataset)
        # The next tree fits what the scaled running prediction still misses.
        target = label - self.learning_rate * accumulated
        self.models.append(tree)
        print("Iteration " + str(round_idx + 1) + ": " + str(time.time() - start) + " s")
# Evaluate both models on the validation split.

def _accuracy(predictions, truth):
    """Fraction of predictions that match the ground-truth labels."""
    return sum(p == t for p, t in zip(predictions, truth)) / len(predictions)

# random forest
rf = randomForest.RandomForest(10, 10, train_X.shape[0], train_X.shape[1])
rf.train(train_X, train_y)
print(_accuracy(rf.predict(validation_X), validation_y))

# decision tree
tree = decisionTree.DecisionTree(10, train_X.shape[1])
tree.train(train_X, train_y)
print(_accuracy(tree.predict(validation_X), validation_y))

# with open('spam_prediction.csv','wt') as f:
#     writer = csv.writer(f, delimiter=',')
def main():
    """Preprocess the diabetic data set, then train/evaluate a gradient
    boosting model and a decision tree on the cleaned feature matrix."""
    # Column filter: keep every column except 'payer_code'.
    temp = lambda col: col not in ['payer_code']
    diabet = pd.read_csv('dataset/diabetic_data.csv', na_values='?', usecols=temp)
    # print(diabet.isnull().sum())
    # print(diabet.dtypes)
    # Per-column fill values: mean for int columns, mode for everything else.
    # NOTE(review): `temp` is reused here and below for unrelated values
    # (lambda / Series / DataFrame) — worth renaming.
    fill = {}
    for column in diabet:
        if diabet[column].dtypes == 'int64':
            fill[column] = diabet[column].mean()
        else:
            temp = diabet[column].mode()
            fill[column] = temp[0]
    # diabet = diabet.fillna(value=fill)
    # print(diabet.isnull().sum())
    # Coerce object columns whose first value parses as a number to numeric
    # (unparseable entries become NaN via errors='coerce').
    for column in diabet:
        if diabet[column].dtypes != 'int64':
            try:
                float(diabet[column][0])
                # diabet[column] = diabet[column].apply(lambda x: x.isnumeric())
                diabet[column] = diabet[column].apply(pd.to_numeric, errors='coerce')
            except ValueError:
                continue
    # print(diabet['diag_1'])
    # print(diabet['diag_2'])
    # print(diabet.dtypes)
    # Outlier removal: keep int-column values within 2 standard deviations.
    # NOTE(review): each int column is filtered independently and re-indexed
    # from 0 by pd.Series, so rows no longer line up across columns after
    # this loop — confirm this is intended before relying on the output.
    temp = pd.DataFrame()
    for column in diabet:
        clean_df = diabet[column]
        elements = np.array(clean_df)
        if diabet[column].dtypes == 'int64':
            mean = np.mean(elements, axis=0)
            sd = np.std(elements, axis=0)
            final_list = [x for x in clean_df if (x > mean - 2 * sd)]
            final_list = [x for x in final_list if (x < mean + 2 * sd)]
            temp[column] = pd.Series(final_list)
        else:
            temp[column] = pd.Series(elements)
    temp = temp.fillna(value=fill)
    # print(temp.isnull().sum())
    # print(temp.head())
    # temp.sort_values(by=[''], ascending = False)
    # temp.to_csv('dataset/pre_processed.csv', encoding='UTF8', index=False)
    # Reload the previously saved preprocessed CSV (the write above is
    # commented out, so this reads whatever is already on disk).
    temp = lambda col: col not in ['payer_code']
    df = pd.read_csv('dataset/pre_processed.csv', usecols=temp)
    y = df['diabetesMed'].map({'No': 0, 'Yes': 1})
    # print(df)
    clean_df = featureSelection.feature_engineering(df)
    # print(clean_df)
    # Project the engineered features onto 15 principal components.
    pca = PCA(15)
    pca_arr = pca.fit_transform(clean_df)
    pca_df = pd.DataFrame(data=pca_arr)
    # print(pca_df)
    # clean_df['label'] = y
    # clean_df.to_csv("dataset/clean.csv", index_label=False)
    # pca_df['label'] = y
    # clean_df.to_csv("dataset/clean_pca.csv", index_label=False)
    # Train on the first 10k rows of the saved clean data set.
    df_clean = pd.read_csv("dataset/clean.csv")
    df_clean = df_clean[:10000]
    y = df_clean["label"]
    del df_clean["label"]
    # print(df_clean)
    # Gradient boosting: 10 rounds, learning rate 0.1, tree depth 4.
    model = gradientBoosting.GradientBoosting(10, 0.1, max_depth_tree=4)
    model.fit(df_clean, y)
    for dt in model.models:
        print(dt.root)
    a = model.predict(df_clean)
    print(a)
    # Baseline single decision tree (depth 5, min size 100).
    dt = decisionTree.DecisionTree(5, 100)
    dt.fit(df_clean, y)
    b = dt.predict(df_clean)
import preprocess as pp
import decisionTree as dt
import numpy as np
import random

random.seed()

# Load the data and split it into train/test sets.
file = 'data.csv'
rows = pp.readFile(file=file)
X_train, Y_train, X_test, Y_test = pp.splitData(rows)

# Train a depth-9 decision tree and report the test error.
tree = dt.DecisionTree(X_train, Y_train, 9)
tree.train()
Y_pred, error = tree.test(X_test, Y_test)
print("error in decision tree testing =", error)
for pred, actual in zip(Y_pred, Y_test):
    print(pred, actual)

# Single-sample sanity check: identical features except the gender field.
# (Rewrote the original output strings, which contained unprofessional /
# derogatory language, with neutral labels.)
print("vignesh's prediction")
Xmale = [['3', '9.6', 'M', '28.3', '7', '1']]
Xfemale = [['3', '9.6', 'F', '28.3', '7', '1']]
Y = [10]
yp, e = tree.test(Xmale, Y)
print("predicted grade (gender M)", yp[0])
yp, e = tree.test(Xfemale, Y)
print("predicted grade (gender F)", yp[0])
def testFitEmpty(self):
    """Fitting on no data must fall back to the supplied default label."""
    dt = decisionTree.DecisionTree(default_v='default')
    dt.fit([], [])
    # assertEquals is a deprecated alias, removed in Python 3.12.
    self.assertEqual(repr(dt.clf), 'default')
def testEntropyMax(self):
    """A 50/50 binary split has the maximal entropy of exactly 1 bit."""
    dt = decisionTree.DecisionTree()
    # assertEquals is a deprecated alias, removed in Python 3.12.
    self.assertEqual(dt.entropy(['yes', 'no']), 1.0)
def testEntropyEmpty(self):
    """Entropy of an empty label list is defined as 0."""
    dt = decisionTree.DecisionTree()
    # assertEquals is a deprecated alias, removed in Python 3.12.
    self.assertEqual(dt.entropy([]), 0)
def testbuildEmpty(self):
    """Building a tree from empty data yields a leaf with the default label."""
    dt = decisionTree.DecisionTree()
    tree = dt.makeTree([], [], [], {}, 'default')
    # assertEquals is a deprecated alias, removed in Python 3.12.
    self.assertEqual(repr(tree), 'default')
def testOneFeature(self):
    """With a single feature available, the root must split on that feature."""
    dt = decisionTree.DecisionTree()
    tree = dt.makeTree([[1], [1], [2]], ['no', 'yes', 'yes'], ['a1'],
                       {'a1': [1, 2], 'a2': ['a', 'b']}, 'default')
    # assertEquals is a deprecated alias, removed in Python 3.12.
    self.assertEqual(tree.attribute, 'a1')
def testSelAttRest(self):
    """On the restaurant data set the best split attribute is index 4."""
    dt = decisionTree.DecisionTree()
    rest = mu.extract_data('restaurant.csv')
    attrib = dt.selectAttribute(rest['data'], rest['target'])
    # assertEquals is a deprecated alias, removed in Python 3.12.
    self.assertEqual(attrib, 4)
def __init__(self, data):
    """Store the data set and create the DecisionTree used by this object.

    Fix: the constructor was named `_init_` (single underscores), which
    Python never invokes during object construction — instances were being
    created without these attributes. Renamed to the dunder `__init__`.
    """
    self.data = data
    self.decisionTree = dt.DecisionTree()
def testSelAttTenn(self):
    """On the tennis data set the best split attribute is index 0."""
    dt = decisionTree.DecisionTree()
    tennis = mu.extract_data('tennis.csv')
    attrib = dt.selectAttribute(tennis['data'], tennis['target'])
    # assertEquals is a deprecated alias, removed in Python 3.12.
    self.assertEqual(attrib, 0)
def testEntropyUnbal(self):
    """Entropy of a 2:1 binary split is about 0.9183 bits."""
    dt = decisionTree.DecisionTree()
    # assertAlmostEquals is a deprecated alias, removed in Python 3.12.
    self.assertAlmostEqual(dt.entropy(['yes', 'no', 'yes']), 0.918296, places=4)
def testEntropyThreeVal(self):
    """Entropy of three equally likely labels is log2(3) ~= 1.585 bits."""
    dt = decisionTree.DecisionTree()
    # assertAlmostEquals is a deprecated alias, removed in Python 3.12.
    self.assertAlmostEqual(dt.entropy(['yes', 'no', 'maybe']), 1.58496, places=4)
def k_fold_eval(traind, k):
    """k-fold cross-validation of the decision tree classifier.

    traind -- dict with 'data', 'target', 'feature_dict', 'feature_names'
    k      -- number of folds

    Returns a dict with precision, recall, and accuracy averaged over the
    k folds, for both the held-out (test_*) and training (train_*) slices.
    """
    test_size = int(len(traind["data"]) / k)
    result_dict = {}
    # test-set per-fold precisions, recalls, accuracies
    v_precisions = []
    v_recalls = []
    v_accuracys = []
    # train-set per-fold precisions, recalls, accuracies
    t_precisions = []
    t_recalls = []
    t_accuracys = []
    # Random permutation of the fold indices 0..k-1.
    # Fix: `xrange` is Python 2 only and raises NameError on Python 3.
    id_ls = random.sample(range(0, k), k)
    for i in id_ls:
        # The i-th slice of the data is held out as this fold's test set.
        test_set = traind["data"][i * test_size:(i + 1) * test_size]
        # Copy all samples/labels, then delete the held-out slice — what
        # remains is the training set for this fold.
        train_set = traind["data"][:]
        train_labels = traind["target"][:]
        del (train_labels[i * test_size:(i + 1) * test_size])
        del (train_set[i * test_size:(i + 1) * test_size])
        # Fit on the training slice, predict both slices.
        # Fix: the original read `data['feature_dict']` — `data` is not
        # defined here; the parameter is `traind`.
        clf = decisionTree.DecisionTree(attrib_d=traind['feature_dict'],
                                        attribs=traind['feature_names'],
                                        default_v="default")
        clf.fit(train_set, train_labels)
        train_tars = clf.predict(train_set)
        test_tars = clf.predict(test_set)
        # Actual targets for the held-out slice.
        test_act = traind["target"][i * test_size:(i + 1) * test_size]
        # Assume the most common overall value is the positive class.
        positive = decisionTree.zeroR(traind['target'])
        TP = 0.0
        TN = 0.0
        FP = 0.0
        FN = 0.0
        # Test-set confusion counts.
        # Fix: use a distinct loop variable instead of shadowing the fold
        # index `i`.
        for j in range(len(test_tars)):
            if test_tars[j] == test_act[j]:
                if test_tars[j] == positive:
                    TP = TP + 1
                else:
                    TN = TN + 1
            else:
                if test_tars[j] == positive:
                    FP = FP + 1
                else:
                    FN = FN + 1
        # Guard against zero denominators when a class never appears.
        if (TP + FP) == 0.0:
            v_precisions.append(0.0)
        else:
            v_precisions.append(float(TP / (TP + FP)))
        if (TP + FN) == 0.0:
            v_recalls.append(0.0)
        else:
            v_recalls.append((TP / (TP + FN)))
        v_accuracys.append(float((TP + TN) / len(test_act)))
        # Reset counts and repeat the tally for the training slice.
        TP = 0.0
        TN = 0.0
        FP = 0.0
        FN = 0.0
        for j in range(len(train_tars)):
            if train_tars[j] == train_labels[j]:
                if train_tars[j] == positive:
                    TP = TP + 1
                else:
                    TN = TN + 1
            else:
                if train_tars[j] == positive:
                    FP = FP + 1
                else:
                    FN = FN + 1
        if (TP + FP) == 0.0:
            t_precisions.append(0.0)
        else:
            t_precisions.append((TP / (TP + FP)))
        if (TP + FN) == 0.0:
            t_recalls.append(0.0)
        else:
            t_recalls.append((TP / (TP + FN)))
        t_accuracys.append(((TP + TN) / len(train_labels)))
    # Average each metric over the k folds.
    result_dict["test_precision"] = sum(v_precisions) / k
    result_dict["test_recall"] = sum(v_recalls) / k
    result_dict["test_accuracy"] = sum(v_accuracys) / k
    result_dict["train_precision"] = sum(t_precisions) / k
    result_dict["train_recall"] = sum(t_recalls) / k
    result_dict["train_accuracy"] = sum(t_accuracys) / k
    return result_dict
def testNoFeatures(self):
    """With no features left, the tree is a leaf with the majority label."""
    dt = decisionTree.DecisionTree()
    tree = dt.makeTree([[], [], []], ['no', 'yes', 'yes'], [],
                       {'a1': [1, 2], 'a2': ['a', 'b']}, 'default')
    # assertEquals is a deprecated alias, removed in Python 3.12.
    self.assertEqual(repr(tree), 'yes')
def testbuildZeroEnt(self):
    """Zero-entropy labels (all 'yes') must yield a single 'yes' leaf."""
    dt = decisionTree.DecisionTree()
    tree = dt.makeTree([[1, 'a'], [2, 'a']], ['yes', 'yes'], ['a1', 'a2'],
                       {'a1': [1, 2], 'a2': ['a', 'b']}, 'default')
    # assertEquals is a deprecated alias, removed in Python 3.12.
    self.assertEqual(repr(tree), 'yes')
def testEntropyAllSame(self):
    """Entropy of identical labels is 0 (no uncertainty)."""
    dt = decisionTree.DecisionTree()
    # assertEquals is a deprecated alias, removed in Python 3.12.
    self.assertEqual(dt.entropy(['yes', 'yes']), 0)