def exercise1():
    """Decision-tree experiments on the breast-cancer dataset (exercise 1)."""
    df = pd.read_csv('breast_cancer.csv')
    # '?' marks missing values in this dataset — handled by the project helper.
    X, y = split_dataset_transformed(df, 'Class', ['?'])

    # 1.a -- baseline tree evaluated with cross-validation
    print_dict(cross_val(DecisionTreeClassifier(), X, y))

    # 1.b -- sweep the minimum number of samples required to split a node
    for min_split in np.arange(2, 11):
        print('Experimenting with {} samples'.format(min_split))
        tree = DecisionTreeClassifier(min_samples_split=min_split)
        print_dict(cross_val(tree, X, y))

    # 1.c -- export an unpruned and a depth-limited tree for visual comparison
    full_tree = DecisionTreeClassifier()
    full_tree.fit(X, y)
    export_graphviz(full_tree, out_file='unpruned.dot')

    shallow_tree = DecisionTreeClassifier(max_depth=4)
    shallow_tree.fit(X, y)
    export_graphviz(shallow_tree, out_file='pruned.dot')
def exercise2():
    """Random-forest experiments on the breast-cancer dataset (exercise 2)."""
    df = pd.read_csv('breast_cancer.csv')
    X, y = split_dataset_transformed(df, 'Class', ['?'])
    X_train, X_test, y_train, y_test = split_train_test(X, y)

    def evaluate(model):
        # Fit/score on the fixed split; report all stats except raw predictions.
        stats = classifier_statistics(model, X_train, X_test, y_train, y_test)
        print_dict(stats, ['predicted'])

    # 2.a -- default forest as a baseline
    evaluate(RandomForestClassifier())

    # 2.b -- sweep the ensemble size
    for trees in np.arange(10, 201, step=10):
        print('Experimenting with {} number of trees'.format(trees))
        evaluate(RandomForestClassifier(n_estimators=trees))

    # 2.c -- sweep the maximum tree depth
    for dep in np.arange(5, 20):
        print('Experimenting with {} depth'.format(dep))
        evaluate(RandomForestClassifier(max_depth=dep))
def exercise3():
    """Compare train vs. test performance on the credit dataset (exercise 3).

    Each parameter setting is scored twice: once against the training data
    itself (to expose overfitting) and once against the held-out test split.
    """
    credit = pd.read_csv('credit.csv')
    X, y = split_dataset_transformed(credit, 'class')
    X_train, X_test, y_train, y_test = split_train_test(X, y)

    # Decision trees: vary the minimum number of instances needed to split.
    for samples in np.arange(2, 11):
        print('Experimenting with {} number of instances to split'.format(samples))

        print('Train data')
        clf = DecisionTreeClassifier(min_samples_split=samples)
        print_dict(classifier_statistics(clf, X_train, X_train, y_train, y_train),
                   ['predicted'])

        print('Test data')
        clf = DecisionTreeClassifier(min_samples_split=samples)
        print_dict(classifier_statistics(clf, X_train, X_test, y_train, y_test),
                   ['predicted'])
        print()
        print()

    # Random forests: vary the number of trees in the ensemble.
    for trees in np.arange(10, 201, step=10):
        print('Experimenting with {} number of trees'.format(trees))

        print('Train data')
        clf = RandomForestClassifier(n_estimators=trees)
        print_dict(classifier_statistics(clf, X_train, X_train, y_train, y_train),
                   ['predicted'])

        print('Test data')
        clf = RandomForestClassifier(n_estimators=trees)
        print_dict(classifier_statistics(clf, X_train, X_test, y_train, y_test),
                   ['predicted'])
        print()
        print()
from sklearn.neighbors import KNeighborsClassifier

# get_data(False) presumably returns an already-split, unbalanced dataset —
# TODO confirm against its definition elsewhere in the project.
X, X_test, y, y_test = get_data(False)
#X_train, X_test, y_train, y_test = split_train_test(X, y)

"""
Balancing data only with training instances so we can check if it generalizes well on the unbalanced set with X_test and y_test
"""
# FIX: imbalanced-learn deprecated SMOTE's `ratio` argument and the
# `fit_sample` method in 0.4 and removed both in 0.6; `sampling_strategy`
# and `fit_resample` are the supported spellings of the same behavior
# (1.0 = resample the minority class up to parity with the majority).
sm = SMOTE(random_state=12, sampling_strategy=1.0)
X_train_res, y_train_res = sm.fit_resample(X, y)
print('Balanced data: {}'.format(collections.Counter(y_train_res)))

"""
clf = RandomForestClassifier(n_estimators=25, random_state=42)
results = classifier_statistics(clf, X_train_res, X_test, y_train_res, y_test)
print_dict(results, excluded_keys=['predicted'])
cost1 = 10
cost2 = 500
conf_matrix = results['confusion_matrix']
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]
total_cost = cost1*fp + cost2*fn
print('Total cost aachived: {}'.format(total_cost))
"""
# Tail of the preceding experiment: report its stats before moving on.
print_dict(clf_stats, ['predicted'])
print()
print()

diabetes = pd.read_csv('diabetes.csv')
X, y = split_dataset_transformed(diabetes, 'class')
X_train, X_test, y_train, y_test = split_train_test(X, y)

# 3.a -- does discretizing the features help a random forest?
binner = KBinsDiscretizer(n_bins=10, encode='onehot')
# fit_transform yields a sparse one-hot matrix; densify it for the split helper.
X_binned = binner.fit_transform(X).toarray()
Xb_train, Xb_test, yb_train, yb_test = split_train_test(X_binned, y)

print('With discretization')
binned_stats = classifier_statistics(RandomForestClassifier(),
                                     Xb_train, Xb_test, yb_train, yb_test)
print_dict(binned_stats, ['predicted'])

print('Without discretization')
raw_stats = classifier_statistics(RandomForestClassifier(),
                                  X_train, X_test, y_train, y_test)
print_dict(raw_stats, ['predicted'])