Exemplo n.º 1
0
    def train_by_samples(self, criterion="entropy", splitter="best", max_depth=10, ccp_alpha=0,
                         *, validation_split=0.1, n_splits=15):
        """Plot accuracy as a function of the number of training samples.

        The leading ``validation_split`` fraction of ``self.data_features`` /
        ``self.data_tags`` is held out for validation (the data is not
        shuffled); the remaining samples are cut into ``n_splits`` contiguous
        chunks.  A fresh ``DecisionTreeClassifier`` is fitted on the first
        chunk, then on the first two chunks, and so on.  After every fit the
        accuracy on the samples used for training and on the held-out set is
        recorded, and the resulting series are passed to
        ``DecisionTreeFB.draw_training_samples``.

        Args:
            criterion: Forwarded to ``DecisionTreeClassifier``.
            splitter: Forwarded to ``DecisionTreeClassifier``.
            max_depth: Forwarded to ``DecisionTreeClassifier``.
            ccp_alpha: Forwarded to ``DecisionTreeClassifier``.
            validation_split: Fraction of the data set (taken from the start)
                reserved for validation.
            n_splits: Number of chunks the training data is divided into, and
                therefore the number of classifiers fitted and points plotted.

        Returns:
            None.  The results are only handed to the drawing helper.
        """
        data_features = np.array(self.data_features)
        data_tags = np.array(self.data_tags)

        # Hold out the leading fraction of the samples; everything after it
        # (up to the number of feature rows) is available for training.
        data_set_size = len(self.data_features)
        holdout = int(np.floor(validation_split * data_set_size))
        valid_rows = slice(0, holdout)
        train_rows = slice(holdout, data_set_size)

        train_data, train_tags = data_features[train_rows], data_tags[train_rows]
        validation_data, validation_tags = data_features[valid_rows], data_tags[valid_rows]

        train_split_acc, valid_split_acc, samples_train = [], [], []

        # np.array_split cuts along axis 0 into contiguous chunks, so "the
        # first k chunks joined together" is just a prefix of the training
        # arrays; only the running prefix length has to be tracked.
        prefix_len = 0
        for chunk in np.array_split(train_tags, n_splits):
            prefix_len += len(chunk)
            subset_data = train_data[:prefix_len]
            subset_tags = train_tags[:prefix_len]

            # A new, unfitted tree for every subset size so that the
            # measurements are independent of each other.
            clf_split = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, splitter=splitter,
                                               ccp_alpha=ccp_alpha)
            clf_split.fit(subset_data, subset_tags)

            train_split_acc.append(metrics.accuracy_score(subset_tags, clf_split.predict(subset_data)))
            valid_split_acc.append(metrics.accuracy_score(validation_tags, clf_split.predict(validation_data)))
            samples_train.append(prefix_len)

        DecisionTreeFB.draw_training_samples(samples_train, train_split_acc, valid_split_acc)