def new(self, **kwargs):
    """Create, store, and return a fresh HistGradientBoostingClassifier.

    Expected keyword arguments: ``graph``, ``lr``, ``max_iter``,
    ``l2_regularization`` and ``early_stop``.  The estimator is kept on
    ``self.net`` (unfitted) and the graph object on ``self.graph``.
    """
    self.graph = kwargs["graph"]
    # Pull hyper-parameters out of kwargs up front for readability.
    rate = kwargs["lr"]
    iterations = kwargs["max_iter"]
    l2 = kwargs["l2_regularization"]
    stop_early = kwargs["early_stop"]
    self.net = ensemble.HistGradientBoostingClassifier(
        verbose=1,
        learning_rate=rate,
        max_iter=iterations,
        l2_regularization=l2,
        early_stopping=stop_early,
        # 20/70 validation split — NOTE(review): confirm this ratio is intended.
        validation_fraction=20 / 70,
    )
    return self.net
def TrainBDT(featureNames, trainingData, classificationArray):
    """Fit a HistGradientBoostingClassifier on the selected feature columns.

    Parameters
    ----------
    featureNames : list of column names to keep from ``trainingData``.
    trainingData : pandas DataFrame containing (at least) those columns.
    classificationArray : target labels aligned with ``trainingData`` rows.

    Returns the fitted classifier.
    """
    clf = ensemble.HistGradientBoostingClassifier()
    trainingData = trainingData[featureNames]  # Remove all irrelevant columns
    if cfg.balanceClasses:
        # Impute missing values first — SMOTE cannot operate on NaNs.
        imp = IterativeImputer()
        imp.fit(trainingData)
        trainingData = imp.transform(trainingData)
        # Oversample the minority class up to a 1:1 ratio.
        sm = smt(sampling_strategy=1)
        # BUGFIX: `fit_sample` was renamed to `fit_resample` in
        # imbalanced-learn 0.4 and removed in 0.8 — the old name raises
        # AttributeError on current releases.
        trainingData, classificationArray = sm.fit_resample(
            trainingData, classificationArray)
    # `fit` returns the estimator itself, so returning it directly is
    # equivalent to the original `clfView = clf.fit(...)` alias.
    clf.fit(trainingData, classificationArray)
    return clf
def optimize_score(all_features, labels, current_score, trn_count, tst_count,
                   level, excluded_columns):
    """Recursively drop columns greedily while the test score does not drop.

    Parameters
    ----------
    all_features : DataFrame of candidate feature columns.
    labels : Series of targets, row-aligned with ``all_features``.
    current_score : score to beat (or match) for a drop to be kept.
    trn_count, tst_count : sizes of the train / test row splits.
    level : recursion depth, for logging only.
    excluded_columns : columns already tried; skipped at this level.
    """
    log_path = "w207_project_bruteforce.log"
    # BUGFIX: the original passed `file=open(log, "a")` to every print(),
    # leaking one unflushed file handle per call — and this function
    # recurses, so handles piled up.  Open/close explicitly instead.
    with open(log_path, "a") as log:
        print('Score for level', level, 'is', current_score * 100,
              'columns', all_features.columns, file=log)
    processed_columns = []
    processed_columns.extend(excluded_columns)
    for c in all_features.columns:
        if c in processed_columns:
            continue
        processed_columns.append(c)
        # Score the recordset with column `c` removed.
        df_features = all_features.drop(c, axis=1)
        train_features = df_features.values[:trn_count]
        test_features = df_features.values[trn_count:trn_count + tst_count]
        train_labels = labels.values[:trn_count]
        test_labels = labels.values[trn_count:trn_count + tst_count]
        clf = ske.HistGradientBoostingClassifier(random_state=123)
        clf.fit(train_features, train_labels)
        score = clf.score(test_features, test_labels)
        with open(log_path, "a") as log:
            print('Level', level, ': Dropping', c, train_features.shape,
                  test_features.shape, "HistGradientBoosting",
                  current_score * 100, score * 100,
                  score >= current_score, score > current_score, file=log)
        # Equal-or-better score with one fewer column is more optimal:
        # recurse to try removing further columns from the reduced set.
        if score >= current_score:
            optimize_score(df_features, labels, score, trn_count, tst_count,
                           level + 1, processed_columns)
def get_skl_estimator(self, **default_parameters):
    """Instantiate a scikit-learn HistGradientBoostingClassifier.

    All keyword arguments are forwarded verbatim to the estimator
    constructor; the new (unfitted) estimator is returned.
    """
    estimator = ensemble.HistGradientBoostingClassifier(**default_parameters)
    return estimator
print(full_features.shape, file=open("w207_project_v7.log", "a")) train_count = int(totalrows * 0.8) train_features = full_features.values[:train_count] test_features = full_features.values[train_count:] train_labels = full_labels.values[:train_count] test_labels = full_labels.values[train_count:] print(train_labels.shape, test_labels.shape, file=open("w207_project_v7.log", "a")) clf = ske.HistGradientBoostingClassifier(random_state=123) clf.fit(train_features, train_labels) all_columns_score = clf.score(test_features, test_labels) print("All columns (original)", train_features.shape, "HistGradientBoostingClassifier", all_columns_score * 100, file=open("w207_project_v7.log", "a")) def optimize_score(all_features, labels, current_score, trn_count, tst_count, level): print('Score for level', level, 'is',
def optimize_score(all_features, labels, current_score, trn_count, tst_count,
                   level, excluded_columns):
    """Main recursive driver: drop columns greedily while precision holds.

    Parameters
    ----------
    all_features : source recordset (DataFrame) of candidate columns.
    labels : the labels for the data, row-aligned with ``all_features``.
    current_score : the score that we need to improve (or at least match).
    trn_count : size of the training recordset.
    tst_count : size of the test data.
    level : nesting level, for informational logging only.
    excluded_columns : columns to skip for optimization purposes.
    """
    log_path = "w207_project_bruteforce.log"
    # BUGFIX: `file=open(..., "a")` inside print() leaked a file handle on
    # every log line; under recursion these accumulate.  Use `with` so each
    # handle is flushed and closed deterministically.
    with open(log_path, "a") as log:
        print('Score for level', level, 'is', current_score * 100,
              'columns', all_features.columns, file=log)
    # Processed column list contains the columns we have already checked;
    # excluded columns are marked as processed up front.
    processed_columns = []
    processed_columns.extend(excluded_columns)
    # Loop through the column names, trying to drop each in turn.
    for c in all_features.columns:
        if c in processed_columns:
            continue
        processed_columns.append(c)
        # Drop the column, producing a recordset without it.
        df_features = all_features.drop(c, axis=1)
        # Split the recordset into training and testing data.
        train_features = df_features.values[:trn_count]
        test_features = df_features.values[trn_count:trn_count + tst_count]
        # Split the labels into training and testing labels.
        train_labels = labels.values[:trn_count]
        test_labels = labels.values[trn_count:trn_count + tst_count]
        # Score the recordset with the dropped column.
        clf = ske.HistGradientBoostingClassifier(random_state=123)
        clf.fit(train_features, train_labels)
        score = clf.score(test_features, test_labels)
        # Log the new score.
        with open(log_path, "a") as log:
            print('Level', level, ': Dropping', c, train_features.shape,
                  test_features.shape, "HistGradientBoosting",
                  current_score * 100, score * 100,
                  score >= current_score, score > current_score, file=log)
        # If the score improved or stayed the same, the reduced recordset
        # either matches the score with fewer features or improves it —
        # either way it is more optimal, so keep it and recurse to try
        # optimizing it further.
        if score >= current_score:
            optimize_score(df_features, labels, score, trn_count, tst_count,
                           level + 1, processed_columns)