Example #1
0
    def print_tree(self, rows, head, spacing=""):
        """
        Recursively print the decision tree built from ``rows``.

        PARAMETERS
        ==========

        rows: list
            A list of lists holding the dataset.

        head: list
            Column headings of the dataset.

        spacing: String
            Indentation prefix, grown at each recursion
            level so the printed tree shows its depth.

        RETURNS
        =======

        None

        """

        # Find the split with the lowest gini impurity among all
        # candidate attribute/value questions.
        gain, question = find_best_split(rows, head)

        # Leaf node: no split improves purity, so report the class
        # counts as the prediction and stop recursing.
        if gain == 0:
            label_col = len(rows[0]) - 1
            print(spacing + "Predict", class_counts(rows, label_col))
            return

        # A useful split exists; divide the rows on the question.
        true_rows, false_rows = partition(rows, question)

        # Show the question asked at this node.
        print(spacing + str(question))

        # Descend into the branch where the question held...
        print(spacing + '--> True:')
        self.print_tree(true_rows, head, spacing + "  ")

        # ...and into the branch where it did not.
        print(spacing + '--> False:')
        self.print_tree(false_rows, head, spacing + "  ")
Example #2
0
    def predict(self, A, head, n_estimators=100):
        """
        Determine the predictions of the
        subsets of the dataset through the
        DecisionTreeClassifier class and
        print the mode of the predicted values.

        PARAMETERS
        ==========

        A: ndarray(dtype=int,ndim=2)
            2-D Array of Dataset's Input

        head: list
            A list to store the headings of the
            columns of the dataset.

        n_estimators: int
            Number of Decision Trees to be
            iterated over for the classification.

        RETURNS
        =======

        None
        """

        prediction = {}
        print("Predictions of individual decision trees")

        # Number of rows available for bootstrapping (loop invariant).
        M = len(A)

        # Collect the votes of ``n_estimators`` decision trees, each
        # fitted on a bootstrapped sample of the dataset.
        for i in range(n_estimators):
            # np.random.randint has an *exclusive* upper bound, so the
            # high value must be M (not M-1) for every row of A to be
            # eligible for the bootstrapped sample of 6 rows.
            indexrow = np.random.randint(0, M, 6)
            rows = [A[j] for j in indexrow]

            # Index of the label column in the sampled rows.
            label = len(rows[0]) - 1

            # Seed a dictionary with every class label present in the
            # sample, each count starting at zero.
            prediction_val = class_counts(rows, label)
            for d in prediction_val:
                prediction_val[d] = 0

            # Let one decision tree classify the bootstrapped sample.
            RandomF = DecisionTreeClassifier()

            # Dictionary of per-class counts predicted by this tree.
            di = RandomF.classify(rows, head, prediction_val)

            print(di)

            # Find this tree's strongest prediction.
            maximum = 0
            maxk = None
            for j in di:
                if di[j] > maximum:
                    maximum = di[j]
                    maxk = j

            # Accumulate the tree's vote.  Skip trees whose counts are
            # all zero: previously ``maxk`` was left unbound on the
            # first such iteration (NameError) or silently reused a
            # stale value from an earlier tree.
            if maxk is not None:
                prediction[maxk] = prediction.get(maxk, 0) + maximum

        # The final prediction of the Random Forest Algorithm is the
        # label with the highest accumulated vote across all trees.
        maximum = 0
        maxk = None
        for i in prediction:
            if prediction[i] > maximum:
                maximum = prediction[i]
                maxk = i

        # predicting the maximum occurence
        print("\n Predict = {", maxk, "}")
Example #3
0
    def classify(self, rows, head, prediction_val):
        """
        A function to make predictions of
        the subsets of the dataset.

        PARAMETERS
        ==========

        rows: list
            A list of lists to store the subsets
            of the dataset.

        head: list
            A list to store the headings of the
            columns of the subset of the dataset.

        prediction_val: dictionary
            A dictionary to update and return the
            predictions of the subsets of the
            dataset.

        RETURNS
        =======

        prediction_val
            Dictionary to return the predictions
            corresponding to the subsets of the
            dataset.

        """

        # Total number of columns; the label sits in column N-1.
        N = len(rows[0])

        # Choose up to 5 distinct random feature columns.  Feature
        # columns are 0 .. N-2, and np.random.randint has an
        # *exclusive* upper bound, so high must be N-1 (not N-2):
        # otherwise the last feature column can never be selected,
        # and the call raises ValueError when N == 2.
        indexcol = []
        for j in range(0, 5):
            r = np.random.randint(0, N-1)
            if r not in indexcol:
                indexcol.append(r)

        # Project every row onto the chosen feature columns.
        # NOTE(review): ``head`` still describes the *original* column
        # order while the projected rows do not — confirm that
        # find_best_split indexes into rows, not head, or the printed
        # attribute names may be wrong.
        row = []
        for j in rows:
            L = []
            for k in indexcol:
                L.append(j[k])
            row.append(L)

        # Re-attach the label column to every projected row.
        for j in range(0, len(row)):
            row[j].append(rows[j][N-1])

        rows = row

        # Try partitioning the dataset on each of the unique attributes,
        # calculate the gini impurity,
        # and return the question that produces the least gini impurity.
        gain, question = find_best_split(rows, head)

        # Base case: we've reached a leaf
        if gain == 0:
            # Get the predictions of the current set of rows and fold
            # them into the running per-class totals.
            p = class_counts(rows, len(rows[0])-1)
            for d in prediction_val:
                for j in p:
                    if d == j:
                        # update the predictions to be returned.
                        prediction_val[d] = prediction_val[d] + p[j]
            return prediction_val

        # If we reach here, we have found a useful feature / value
        # to partition on.
        true_rows, false_rows = partition(rows, question)

        # Recursively accumulate predictions down the true branch.
        self.classify(true_rows, head, prediction_val)

        # Recursively accumulate predictions down the false branch.
        self.classify(false_rows, head, prediction_val)

        # Return the dictionary of the predictions
        # at the end of the recursion.
        return prediction_val