def print_tree(self, rows, head, spacing=""):
    """
    A tree printing function.

    PARAMETERS
    ==========

    rows: list
        A list of lists to store the dataset.

    head: list
        A list to store the headings of the columns of the dataset.

    spacing: String
        To store and update the spaces to print the tree in an
        organised manner.

    RETURNS
    =======

    None
    """
    # Try partitioning the dataset on each unique attribute, compute
    # the gini impurity, and keep the question that minimises it.
    best_gain, best_question = find_best_split(rows, head)

    # Leaf node: no split improves purity, so report the class
    # distribution of the remaining rows and stop recursing.
    if best_gain == 0:
        label_column = len(rows[0]) - 1
        print(spacing + "Predict", class_counts(rows, label_column))
        return

    # A useful feature/value to split on was found; show the question
    # asked at this node.
    print(spacing + str(best_question))
    matched_rows, unmatched_rows = partition(rows, best_question)

    # Recurse into both branches with one extra level of indentation.
    for branch_label, branch_rows in (
        ('--> True:', matched_rows),
        ('--> False:', unmatched_rows),
    ):
        print(spacing + branch_label)
        self.print_tree(branch_rows, head, spacing + " ")
def predict(self, A, head, n_estimators=100):
    """
    Determine the predictions of the subsets of the dataset through
    the DecisionTreeClassifier class and print the mode of the
    predicted values.

    PARAMETERS
    ==========

    A: ndarray(dtype=int,ndim=2)
        2-D Array of Dataset's Input

    head: list
        A list to store the headings of the columns of the dataset.

    n_estimators: int
        Number of Decision Trees to be iterated over for the
        classification.

    RETURNS
    =======

    None
    """
    prediction = {}
    print("Predictions of individual decision trees")
    # Number of rows is invariant across iterations, so hoist it.
    M = len(A)
    # Iterate to collect predictions of n_estimators Decision Trees,
    # each built from a random bootstrapped sample of the dataset.
    for _ in range(n_estimators):
        # BUG FIX: np.random.randint's upper bound is exclusive, so
        # the original randint(0, M-1, 6) could never sample the last
        # row of A.  Use M so every row index 0..M-1 is eligible.
        indexrow = np.random.randint(0, M, 6)
        rows = [A[j] for j in indexrow]

        # Build a vote dictionary keyed by the class labels present in
        # this sample, with all counts zeroed.
        label = len(rows[0]) - 1
        prediction_val = dict.fromkeys(class_counts(rows, label), 0)

        # Classify this bootstrapped sample with a fresh decision tree
        # and print its vote counts.
        RandomF = DecisionTreeClassifier()
        di = RandomF.classify(rows, head, prediction_val)
        print(di)

        # Accumulate this tree's strongest vote into the forest tally.
        # BUG FIX: the original left maxk unbound (or stale from the
        # previous iteration) when every count was 0; guard explicitly.
        if di:
            maxk = max(di, key=di.get)
            prediction[maxk] = prediction.get(maxk, 0) + di[maxk]

    # The label with the highest accumulated vote is the final
    # prediction of the Random Forest Algorithm.
    maxk = max(prediction, key=prediction.get)
    print("\n Predict = {", maxk, "}")
def classify(self, rows, head, prediction_val):
    """
    A function to make predictions of the subsets of the dataset.

    PARAMETERS
    ==========

    rows: list
        A list of lists to store the subsets of the dataset.

    head: list
        A list to store the headings of the columns of the subset of
        the dataset.

    prediction_val: dictionary
        A dictionary to update and return the predictions of the
        subsets of the dataset.

    RETURNS
    =======

    prediction_val
        Dictionary to return the predictions corresponding to the
        subsets of the dataset.
    """
    N = len(rows[0])
    # Draw up to 5 distinct random feature columns (the label lives in
    # column N-1, so features are columns 0..N-2).
    # BUG FIX: np.random.randint's upper bound is exclusive, so the
    # original randint(0, N-2) could never select the last feature
    # column (index N-2); use N-1 so all feature columns are eligible.
    indexcol = []
    for _ in range(5):
        r = np.random.randint(0, N - 1)
        if r not in indexcol:
            indexcol.append(r)

    # Project each row onto the chosen columns and re-append the label
    # column so the sample stays a labelled dataset.
    # NOTE(review): `head` is NOT projected onto `indexcol` here, so
    # find_best_split sees full headings with subset rows — verify
    # against find_best_split's expectations.
    subset = []
    for original_row in rows:
        subset.append([original_row[k] for k in indexcol]
                      + [original_row[N - 1]])
    rows = subset

    # Try partitioning the dataset on each unique attribute, compute
    # the gini impurity, and keep the question that minimises it.
    gain, question = find_best_split(rows, head)

    # Base case: we've reached a leaf.
    if gain == 0:
        # Merge this leaf's class counts into the running predictions
        # (only for labels already tracked in prediction_val).
        p = class_counts(rows, len(rows[0]) - 1)
        for d in prediction_val:
            if d in p:
                prediction_val[d] += p[d]
        return prediction_val

    # A useful feature/value to split on was found; recurse into both
    # branches, accumulating into the shared prediction_val.
    true_rows, false_rows = partition(rows, question)
    self.classify(true_rows, head, prediction_val)
    self.classify(false_rows, head, prediction_val)

    # Return the dictionary of predictions at the end of the recursion.
    return prediction_val