Example #1
File: models.py  Project: cmu-snap/unfair
 def new(self, **kwargs):
     # Requires: from sklearn import ensemble
     # (on scikit-learn < 1.0, also: from sklearn.experimental import enable_hist_gradient_boosting)
     self.graph = kwargs["graph"]
     # Build the classifier from the supplied hyperparameters;
     # validation_fraction=20/70 holds out 20 of every 70 samples for early stopping.
     self.net = ensemble.HistGradientBoostingClassifier(
         verbose=1,
         learning_rate=kwargs["lr"],
         max_iter=kwargs["max_iter"],
         l2_regularization=kwargs["l2_regularization"],
         early_stopping=kwargs["early_stop"],
         validation_fraction=20 / 70)
     return self.net
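A minimal sketch of a call to this factory method (the instance name and argument values are illustrative, not from the cmu-snap/unfair project; only the kwargs keys read above are assumed):

net = model.new(
    graph=None,            # stored on the wrapper as self.graph
    lr=0.1,
    max_iter=100,
    l2_regularization=0.0,
    early_stop=True)
net.fit(X_train, y_train)  # the returned object is a standard scikit-learn estimator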
Example #2
def TrainBDT(featureNames, trainingData, classificationArray):
    # Requires: from sklearn import ensemble
    #           from sklearn.experimental import enable_iterative_imputer
    #           from sklearn.impute import IterativeImputer
    #           from imblearn.over_sampling import SMOTE as smt  # assumed alias for smt
    clf = ensemble.HistGradientBoostingClassifier()
    trainingData = trainingData[featureNames]  # Remove all irrelevant columns
    if cfg.balanceClasses:
        # SMOTE cannot handle missing values, so impute them first
        imp = IterativeImputer()
        imp.fit(trainingData)
        trainingData = imp.transform(trainingData)
        # Oversample the minority class to a 1:1 ratio
        sm = smt(sampling_strategy=1)
        trainingData, classificationArray = sm.fit_resample(  # fit_sample in older imbalanced-learn
            trainingData, classificationArray)
    clfView = clf.fit(trainingData, classificationArray)
    return clfView
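A hedged usage sketch (the DataFrame df, its column names, and the module-level cfg object are hypothetical; cfg is assumed to expose a balanceClasses boolean, as above):

features = ["pt", "eta", "phi"]                   # hypothetical feature columns
clf = TrainBDT(features, df, df["label"].values)  # fit() returns the fitted classifier
print(clf.score(df[features], df["label"]))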
Example #3
def optimize_score(all_features, labels, current_score, trn_count, tst_count,
                   level, excluded_columns):

    print('Score for level',
          level,
          'is',
          current_score * 100,
          'columns',
          all_features.columns,
          file=open("w207_project_bruteforce.log", "a"))
    processed_columns = []
    processed_columns.extend(excluded_columns)

    for c in all_features.columns:
        if c in processed_columns:
            continue

        processed_columns.append(c)
        df_features = all_features.drop(c, axis=1)

        train_features = df_features.values[:trn_count]
        test_features = df_features.values[trn_count:trn_count + tst_count]

        train_labels = labels.values[:trn_count]
        test_labels = labels.values[trn_count:trn_count + tst_count]

        # ske is assumed to be: import sklearn.ensemble as ske
        clf = ske.HistGradientBoostingClassifier(random_state=123)
        clf.fit(train_features, train_labels)
        score = clf.score(test_features, test_labels)

        print('Level',
              level,
              ': Dropping',
              c,
              train_features.shape,
              test_features.shape,
              "HistGradientBoosting",
              current_score * 100,
              score * 100,
              score >= current_score,
              score > current_score,
              file=open("w207_project_bruteforce.log", "a"))

        if score >= current_score:
            optimize_score(df_features, labels, score, trn_count, tst_count,
                           level + 1, processed_columns)
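A minimal sketch of the initial call that seeds this recursive column elimination (the variable names follow the setup shown in Example #5; the counts are assumptions):

test_count = len(full_features) - train_count
optimize_score(full_features, full_labels, all_columns_score,
               train_count, test_count, 1, [])  # level 1, nothing excluded yet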
Example #4
 def get_skl_estimator(self, **default_parameters):
     # Thin factory: forward any keyword arguments straight to scikit-learn
     return ensemble.HistGradientBoostingClassifier(**default_parameters)
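Assuming the enclosing wrapper class exposes this factory (the instance name and parameter values are illustrative):

est = wrapper.get_skl_estimator(max_iter=200, learning_rate=0.05)
est.fit(X_train, y_train)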
Example #5
# full_features, full_labels and totalrows (the row count) are defined
# earlier in the script.
print(full_features.shape, file=open("w207_project_v7.log", "a"))

# Chronological 80/20 train/test split
train_count = int(totalrows * 0.8)

train_features = full_features.values[:train_count]
test_features = full_features.values[train_count:]

train_labels = full_labels.values[:train_count]
test_labels = full_labels.values[train_count:]

print(train_labels.shape,
      test_labels.shape,
      file=open("w207_project_v7.log", "a"))

# ske is assumed to be: import sklearn.ensemble as ske
clf = ske.HistGradientBoostingClassifier(random_state=123)
clf.fit(train_features, train_labels)
all_columns_score = clf.score(test_features, test_labels)  # baseline accuracy with every column
print("All columns (original)",
      train_features.shape,
      "HistGradientBoostingClassifier",
      all_columns_score * 100,
      file=open("w207_project_v7.log", "a"))


def optimize_score(all_features, labels, current_score, trn_count, tst_count,
                   level):

    print('Score for level',
          level,
          'is',
          current_score * 100)
    # ... (the rest of the function body matches Example #6 below)
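The manual slicing in this example implements a chronological 80/20 split; an equivalent using scikit-learn's train_test_split helper (shuffle=False preserves row order) would be:

from sklearn.model_selection import train_test_split

train_features, test_features, train_labels, test_labels = train_test_split(
    full_features.values, full_labels.values, train_size=0.8, shuffle=False)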
Example #6
def optimize_score(all_features, labels, current_score, trn_count, tst_count,
                   level, excluded_columns):
    # This is the main function; it calls itself recursively whenever the precision increases.
    # all_features is the source recordset
    # labels contains the labels for the data
    # current_score is the score that we need to improve
    # trn_count is the size of the training recordset
    # tst_count is the size of the test data
    # level is the nesting level, for informational purposes
    # excluded_columns is the list of columns that should be skipped for optimization purposes

    print('Score for level',
          level,
          'is',
          current_score * 100,
          'columns',
          all_features.columns,
          file=open("w207_project_bruteforce.log", "a"))

    # processed_columns tracks the columns that we have already checked
    processed_columns = []
    # Mark the columns that should be excluded as already processed
    processed_columns.extend(excluded_columns)

    # Looping through column names
    for c in all_features.columns:
        if c in processed_columns:
            continue

        processed_columns.append(c)

        # Dropping the column and generating a recordset without this column
        df_features = all_features.drop(c, axis=1)

        # Splitting the recordset into training and testing data
        train_features = df_features.values[:trn_count]
        test_features = df_features.values[trn_count:trn_count + tst_count]

        # Splitting the labels into training and testing labels
        train_labels = labels.values[:trn_count]
        test_labels = labels.values[trn_count:trn_count + tst_count]

        # Calculating the score for the recordset with the dropped column
        clf = ske.HistGradientBoostingClassifier(random_state=123)
        clf.fit(train_features, train_labels)
        score = clf.score(test_features, test_labels)

        # Logging the new score
        print('Level',
              level,
              ': Dropping',
              c,
              train_features.shape,
              test_features.shape,
              "HistGradientBoosting",
              current_score * 100,
              score * 100,
              score >= current_score,
              score > current_score,
              file=open("w207_project_bruteforce.log", "a"))

        # If the score improved or stayed the same, the recordset either keeps the
        # same score with fewer features or improves the precision score. Either way
        # it is more optimal, so we keep this structure and try to optimize it even
        # further by passing the reduced recordset back into the same function.
        if score >= current_score:
            optimize_score(df_features, labels, score, trn_count, tst_count,
                           level + 1, processed_columns)
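Every print in these examples opens the log file anew and never closes it, relying on garbage collection to release the handle. A minimal sketch of a hypothetical helper that releases the file deterministically:

def log(*args, path="w207_project_bruteforce.log"):
    # Append one log line and close the file handle immediately
    with open(path, "a") as fh:
        print(*args, file=fh)

A call such as print('Level', level, file=open("w207_project_bruteforce.log", "a")) would then become log('Level', level).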