def train_by_samples(self, criterion="entropy", splitter="best", max_depth=10, ccp_alpha=0,
                     validation_split=0.1, n_splits=15):
    """Plot train/validation accuracy as a function of training-set size.

    Splits ``self.data_features`` / ``self.data_tags`` into a validation
    hold-out (first ``validation_split`` fraction) and a training pool,
    then trains a fresh :class:`DecisionTreeClassifier` on an increasing
    prefix of the training pool (``n_splits`` steps) and records the
    accuracy on both the seen training prefix and the fixed validation
    set. Results are handed to ``DecisionTreeFB.draw_training_samples``
    for plotting.

    Args:
        criterion: split-quality criterion passed to the tree ("entropy" by default).
        splitter: node-split strategy passed to the tree.
        max_depth: maximum tree depth.
        ccp_alpha: cost-complexity pruning parameter.
        validation_split: fraction of the data held out for validation.
        n_splits: number of learning-curve steps (prefix sizes).

    Returns:
        None; side effect is the learning-curve plot.
    """
    # NOTE(review): data is split positionally without shuffling, so the
    # hold-out is simply the first `validation_split` fraction of the
    # stored samples — confirm the data is not ordered by class upstream.
    data_features = np.array(self.data_features)
    data_tags = np.array(self.data_tags)
    cut = int(np.floor(validation_split * len(data_features)))

    # First `cut` samples -> validation, remainder -> training pool.
    validation_data, train_data = data_features[:cut], data_features[cut:]
    validation_tags, train_tags = data_tags[:cut], data_tags[cut:]

    # array_split already yields lists of chunks; no need to unpack into
    # 15 separate variables and re-collect them.
    data_chunks = np.array_split(train_data, n_splits)
    tag_chunks = np.array_split(train_tags, n_splits)

    train_split_acc, valid_split_acc, samples_train = [], [], []
    for step in range(n_splits):
        # Growing prefix of the training pool: chunks 0..step inclusive.
        prefix_data = np.concatenate(data_chunks[:step + 1], axis=0)
        prefix_tags = np.concatenate(tag_chunks[:step + 1], axis=0)

        # Fresh classifier per step; ccp_alpha goes in via the constructor
        # (the old post-hoc attribute assignment was redundant).
        clf = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth,
                                     splitter=splitter, ccp_alpha=ccp_alpha)
        clf.fit(prefix_data, prefix_tags)

        train_split_acc.append(
            metrics.accuracy_score(prefix_tags, clf.predict(prefix_data)))
        valid_split_acc.append(
            metrics.accuracy_score(validation_tags, clf.predict(validation_data)))
        samples_train.append(len(prefix_tags))

    DecisionTreeFB.draw_training_samples(samples_train, train_split_acc,
                                         valid_split_acc)