import warnings

import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

import utils

warnings.filterwarnings('ignore')

data = pd.read_csv('data/RecentData.csv')

########################################################################################
# Split training and testing data and preprocessing
########################################################################################
data_test = data[(data['Tx date and time'] > '2018-09-30') &
                 (data['Tx date and time'] <= '2018-11-30')]
data_train = data[(data['Tx date and time'] <= '2018-09-30') &
                  (data['Tx date and time'] > '2016-12-31')]
data_train = data_train.drop(['Tx date and time'], axis=1)

# X_train = data_train.drop(['Label', 'Tx date and time'], axis=1)
# y_train = data_train['Label']
X_train, y_train = utils.preprocessunEqualDistribution(data_train, 7)

X_test = data_test.drop(['Label', 'Tx date and time'], axis=1)
y_test = data_test['Label']

# Grid-search a random forest on macro recall, then wrap the best
# configuration in an isotonic probability calibrator.
tuned_parameters = [{'n_estimators': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
                     'max_depth': [1, 2, 3, 5, 10, 20, 50]}]
rf_clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5,
                      scoring='recall_macro')
rf_clf.fit(X_train, y_train)
best_n = rf_clf.best_params_['n_estimators']
best_depth = rf_clf.best_params_['max_depth']
print(best_n, best_depth)

calibrated_classifier = CalibratedClassifierCV(
    RandomForestClassifier(n_estimators=best_n, max_depth=best_depth),
    method='isotonic', cv=5)
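# The script stops after constructing calibrated_classifier; a minimal sketch
# of the remaining fit/evaluate step, assuming the X_train/y_train and
# X_test/y_test splits above (classification_report is an illustrative metric
# choice, not shown in the original):
from sklearn.metrics import classification_report

calibrated_classifier.fit(X_train, y_train)
y_pred = calibrated_classifier.predict(X_test)
print(classification_report(y_test, y_pred))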
# train_file and model_col are defined earlier in the original script and not
# shown in this excerpt; model_col is presumably the list of feature columns.
data_train = train_file[model_col].copy()  # .copy() avoids SettingWithCopyWarning
data_train['Label'] = train_file['Label']

test_file = pd.read_excel('data/Sept/Transformed-Aug19CCDC.xlsx')
print('Test data', test_file.shape)

tuned_parameters = [{'max_depth': [5, 6, 7, 8, 9, 10]}]

# Split the training data into batches and grid-search one random forest per
# batch; the fitted searches together form a small ensemble.
training_batches = utils.createBatches(data_train, 4)
ccdc_clf = []
sampling = 7
for batch in training_batches:
    X, y = utils.preprocessunEqualDistribution(batch, sampling)
    rf_clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5,
                          scoring='recall_macro')
    rf_clf.fit(X, y)
    ccdc_clf.append(rf_clf)


def averageTest(X_test, clfs):
    """
    Average the class probabilities of the batch classifiers.

    :type X_test: dataframe
    """
    y_pred = [0] * X_test.shape[0]
    y_prob = [[0] * 2 for i in range(X_test.shape[0])]
    for clf in clfs:
        # The original loop body is truncated; the accumulation below is an
        # assumed reconstruction: sum each classifier's class probabilities,
        # then average and take the more probable class.
        probs = clf.predict_proba(X_test)
        for i in range(X_test.shape[0]):
            y_prob[i][0] += probs[i][0]
            y_prob[i][1] += probs[i][1]
    y_prob = [[p / len(clfs) for p in row] for row in y_prob]
    y_pred = [int(row[1] > row[0]) for row in y_prob]
    return y_pred, y_prob
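# Hypothetical usage sketch for the batch ensemble above. The excerpt does not
# show how test features are derived from test_file; selecting model_col and
# 'Label' here mirrors the training split and is an assumption:
from sklearn.metrics import classification_report

X_test_ccdc = test_file[model_col]
y_test_ccdc = test_file['Label']
y_pred, y_prob = averageTest(X_test_ccdc, ccdc_clf)
print(classification_report(y_test_ccdc, y_pred))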
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingClassifier

import utils

data = pd.read_csv('data/NewFeatures.csv')

data_test = data[(data['Tx date and time'] > '2018-09-30') &
                 (data['Tx date and time'] <= '2018-11-30')]
data_train = data[(data['Tx date and time'] > '2016-12-31') &
                  (data['Tx date and time'] <= '2018-09-30')]
data_train = data_train.drop(['Tx date and time'], axis=1)

# For sampling
X_train_stacking, y_train_stacking = utils.preprocessunEqualDistribution(
    data_train, 5)
X_train_rf = data_train.drop('Label', axis=1)
y_train_rf = data_train['Label']

## Split without sampling
# X_train = data_train.drop(['Label'], axis=1)
# y_train = data_train['Label']

X_test_stacking = data_test.drop(['Label', 'Tx date and time'], axis=1)
y_test_stacking = data_test['Label']

# Base learners for the stacking ensemble.
clf1 = DecisionTreeClassifier()
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
clf4 = KNeighborsClassifier()
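# The excerpt ends after defining the base learners. A minimal sketch of the
# remaining wiring, assuming a LogisticRegression meta-classifier (it is
# imported above, but the original meta-learner choice is not shown):
meta_clf = LogisticRegression()
stacking_clf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4],
                                  meta_classifier=meta_clf)
stacking_clf.fit(X_train_stacking, y_train_stacking)
print(stacking_clf.score(X_test_stacking, y_test_stacking))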