# Example 1
from sklearn.calibration import CalibratedClassifierCV

# Silence library warnings to keep the console output readable.
warnings.filterwarnings('ignore')

data = pd.read_csv('data/RecentData.csv')

########################################################################################
# Split training and testing data and preprocessing
########################################################################################
# Time-based split: train on 2017-01-01 .. 2018-09-30, hold out Oct-Nov 2018 for test.
tx_dates = data['Tx date and time']
data_test = data[(tx_dates <= '2018-11-30') & (tx_dates > '2018-09-30')]
data_train = data[(tx_dates > '2016-12-31') & (tx_dates <= '2018-09-30')]

data_train = data_train.drop(['Tx date and time'], axis=1)

# Re-balance the class distribution of the training set (sampling factor 7)
# before fitting; assumes utils.preprocessunEqualDistribution returns (X, y).
X_train, y_train = utils.preprocessunEqualDistribution(data_train, 7)

X_test = data_test.drop(['Label', 'Tx date and time'], axis=1)
y_test = data_test['Label']

# Grid over forest size and tree depth, selected by macro-averaged recall
# (treats all classes equally regardless of support).
tuned_parameters = [{'n_estimators': list(range(2, 21, 2)),
                     'max_depth': [1, 2, 3, 5, 10, 20, 50]}]
rf_clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5, scoring='recall_macro')
rf_clf.fit(X_train, y_train)

best_params = rf_clf.best_params_
best_n = best_params['n_estimators']
best_depth = best_params['max_depth']
print(best_n, best_depth)

# Wrap a fresh forest with the tuned hyper-parameters in isotonic probability
# calibration (5-fold); note the calibrated model is not fitted here.
calibrated_classifier = CalibratedClassifierCV(RandomForestClassifier(n_estimators=best_n, max_depth=best_depth),
                                               method='isotonic', cv=5)
# Example 2
# Build the training frame from the selected model columns. The explicit
# .copy() fixes a pandas chained-assignment bug: train_file[model_col] can
# return a view, so adding the 'Label' column below would trigger
# SettingWithCopyWarning and might not behave as intended.
data_train = train_file[model_col].copy()
data_train['Label'] = train_file['Label']

test_file = pd.read_excel('data/Sept/Transformed-Aug19CCDC.xlsx')

print('Test data', test_file.shape)

# Only tree depth is tuned for the per-batch models.
tuned_parameters = [{'max_depth': [5, 6, 7, 8, 9, 10]}]
# Split the training data into 4 batches; one classifier is fitted per batch.
training_batches = utils.createBatches(data_train, 4)

ccdc_clf = []  # ensemble: one grid-searched forest per training batch
sampling = 7   # re-sampling factor for preprocessunEqualDistribution

for batch in training_batches:
    # Re-balance each batch, then tune and fit a forest on it.
    X, y = utils.preprocessunEqualDistribution(batch, sampling)

    rf_clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5, scoring='recall_macro')
    rf_clf.fit(X, y)

    ccdc_clf.append(rf_clf)

def averageTest(X_test, clfs):
    """
    :type X_test: dataframe
    """
    y_pred = [0] * X_test.shape[0]
    y_prob = [[0] * 2 for i in range(X_test.shape[0])]

    for clf in clfs:
# Example 3
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingClassifier

data = pd.read_csv('data/NewFeatures.csv')

# Time-based split: train on 2017-01-01 .. 2018-09-30, test on Oct-Nov 2018.
tx_dates = data['Tx date and time']
data_test = data[(tx_dates > '2018-09-30') & (tx_dates <= '2018-11-30')]
data_train = data[(tx_dates > '2016-12-31') & (tx_dates <= '2018-09-30')]

data_train = data_train.drop(['Tx date and time'], axis=1)

# Re-sampled (factor 5) training set used by the stacking ensemble;
# assumes utils.preprocessunEqualDistribution returns (X, y).
X_train_stacking, y_train_stacking = utils.preprocessunEqualDistribution(data_train, 5)

# Un-sampled split used by the plain random-forest baseline.
y_train_rf = data_train['Label']
X_train_rf = data_train.drop('Label', axis=1)

X_test_stacking = data_test.drop(['Label', 'Tx date and time'], axis=1)
y_test_stacking = data_test['Label']

# Base learners for the stacking ensemble.
clf1 = DecisionTreeClassifier()
clf2 = RandomForestClassifier(random_state=1)  # fixed seed for reproducibility
clf3 = GaussianNB()
clf4 = KNeighborsClassifier()