def run():
    """Evaluates a fixed set of classifiers on the extracted features."""
    print(' => Reading features dataset')
    df = pd.read_csv('results/processed_extracted_features.csv').drop('ids', axis=1)
    y = df['Diagnosis'].values
    x = df[df.columns[1:]].values
    x = StandardScaler().fit_transform(x)
    results = pd.DataFrame(
        columns=['Classifier', 'Params', 'BAS', 'BER', 'MCC', 'Accuracy'])

    print(' => Testing classifiers')

    print(' ==> Naive Bayes .. ', end='\r')
    evaluation = evaluate(GaussianNB, x, y)
    evaluation.update({'Classifier': 'Naive Bayes'})
    results = results.append(evaluation, ignore_index=True).round(4)
    print(f' ==> Naive Bayes .. {evaluation["BAS"]}')

    print(' ==> Random Forest .. ', end='\r')
    evaluation = evaluate(RandomForestClassifier, x, y,
                          params={'n_estimators': 50})
    evaluation.update({'Classifier': 'Random Forest', 'Params': '# Trees: 50'})
    results = results.append(evaluation, ignore_index=True).round(4)
    print(f' ==> Random Forest .. {evaluation["BAS"]}')

    print(' ==> Ada Boost .. ', end='\r')
    evaluation = evaluate(AdaBoostClassifier, x, y,
                          params={'learning_rate': 0.05})
    evaluation.update({
        'Classifier': 'Ada Boost',
        'Params': 'Learning rate: 0.05'  # label kept consistent with the param
    })
    results = results.append(evaluation, ignore_index=True).round(4)
    print(f' ==> Ada Boost .. {evaluation["BAS"]}')

    print(' ==> SVC .. ', end='\r')
    evaluation = evaluate(SVC, x, y, params={'kernel': 'linear', 'C': 0.1})
    evaluation.update({
        'Classifier': 'SVM',
        'Params': 'kernel: linear, C: 0.1'
    })
    results = results.append(evaluation, ignore_index=True).round(4)
    print(f' ==> SVC .. {evaluation["BAS"]}')

    print(' ==> KNN .. ', end='\r')
    evaluation = evaluate(KNeighborsClassifier, x, y,
                          params={'n_neighbors': 7})
    evaluation.update({'Classifier': 'KNN', 'Params': 'K: 7'})
    results = results.append(evaluation, ignore_index=True).round(4)
    print(f' ==> KNN .. {evaluation["BAS"]}')

    print(' => Done!')
    return results
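# --- Hedged sketch (not part of the source): every script here relies on a
# shared evaluate(Classifier, x, y, params=None) helper from lib.evaluator
# that returns a dict of cross-validated metrics. Its real implementation may
# differ; this is a minimal stand-in assuming binary labels and 10-fold CV.
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                             confusion_matrix, matthews_corrcoef)


def evaluate_sketch(classifier, x, y, params=None):
    """Cross-validate `classifier` and return the metrics used above."""
    preds = cross_val_predict(classifier(**(params or {})), x, y, cv=10)
    tn, fp, fn, tp = confusion_matrix(y, preds).ravel()  # binary labels only
    bas = balanced_accuracy_score(y, preds)
    return {
        'Accuracy': accuracy_score(y, preds),
        'BAS': bas,
        'BER': 1 - bas,  # balanced error rate
        'MCC': matthews_corrcoef(y, preds),
        'Sensibility': tp / (tp + fn),  # sensitivity/recall, keyed as above
        'Specificity': tn / (tn + fp),
    }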
def ffs():
    """Forward feature selection with a linear SVC, seeded with CDR."""
    stdout.write(' => Reading DF')
    df = pd.read_csv(TRAIN_URI)
    stdout.write('\r => Getting FDR ')
    fdr = FDR(df)
    stdout.write('\r => Initializing sets ')
    ir = set(['CDR'])
    not_ir = set(df.columns[1:])
    not_ir.remove('CDR')
    Y = df['Diagnosis']
    res = pd.DataFrame(columns=['feature', 'Accuracy'])
    evaluation = evaluate(SVC, df[['CDR']].values, Y,
                          params={'C': 0.1, 'kernel': 'linear'})
    evaluation['feature'] = 'CDR'
    res = res.append(evaluation, ignore_index=True)
    for i in range(len(not_ir)):
        features = list(ir)
        # Start below any reachable score so the first candidate always wins
        # the '>' branch (an initial 0 could hit the tie branch before any
        # champion feature exists and raise a KeyError).
        best_result = {SELECTION_CRITERIA: -1}
        for f in not_ir:
            values = df[features + [f]].values
            evaluation = evaluate(SVC, values, Y,
                                  params={'C': 0.1, 'kernel': 'linear'})
            if evaluation[SELECTION_CRITERIA] > best_result[SELECTION_CRITERIA]:
                best_result = evaluation
                best_result['feature'] = f
            elif evaluation[SELECTION_CRITERIA] == best_result[SELECTION_CRITERIA]:
                # Tie: prefer the feature with the higher Fisher ratio
                champion = best_result['feature']
                challenger = f
                if fdr[challenger] > fdr[champion]:
                    best_result = evaluation
                    best_result['feature'] = f
        res = res.append(best_result, ignore_index=True)
        not_ir.remove(best_result['feature'])
        ir.add(best_result['feature'])
        stdout.write('\r ==> %d features selected .. %0.04f sensitivity'
                     % (res.shape[0], best_result[SELECTION_CRITERIA]))
    stdout.write('\n')
    return res
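# --- Hedged sketch (not part of the source): FDR(df) above is assumed to
# score each feature with the Fisher Discriminant Ratio,
# (mu1 - mu2)^2 / (var1 + var2), used only to break ties between equally
# scoring candidates. The project's real helper may differ.
def fdr_sketch(df, label_col='Diagnosis'):
    """Return {feature: Fisher ratio}, assuming a binary label column."""
    a, b = (g for _, g in df.groupby(label_col))  # the two diagnosis groups
    scores = {}
    for col in df.columns.drop(label_col):
        den = a[col].var() + b[col].var()
        scores[col] = (a[col].mean() - b[col].mean()) ** 2 / den if den else 0.0
    return scores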
def grid_search(): """Main handler. This function looks for the best classifier & it's best combination of parameters. """ cols = ['Accuracy', 'BAS', 'BER', 'MCC', 'Sensibility', 'Specificity'] all_res = pd.DataFrame(columns=(['Classifier'] + cols)) print(' => Reading features dataset') df = pd.read_csv(TRAIN_URI) # .drop('ids', axis=1) y = df['Diagnosis'].values x = df[df.columns[1:]].values print('\nNaive Bayes') results = pd.DataFrame([evaluate(GaussianNB, x, y)], columns=cols) print(results) results['Classifier'] = 'Naive Bayes' all_res = all_res.append(results.iloc[0], ignore_index=True) print('\nRandom Forest') results = pd.DataFrame( [evaluate(RandomForestClassifier, x, y, params={'n_estimators': 50})], columns=cols) print(results) results['Classifier'] = 'Random Forest' all_res = all_res.append(results.iloc[0], ignore_index=True) print('\nAda Boost') results = ada_boost.grid_search(x, y).sort_values('Accuracy', ascending=False) print(results) results['Classifier'] = 'Ada Boost' all_res = all_res.append(results.iloc[0], ignore_index=True) print('\nSVC linear') results = linear_svm.grid_search(x, y).sort_values('Accuracy', ascending=False) print(results) results['Classifier'] = 'SVC' all_res = all_res.append(results.iloc[0], ignore_index=True) print('\nKNN') results = knn.grid_search(x, y).sort_values('Accuracy', ascending=False) print(results) results['Classifier'] = 'KNN' all_res = all_res.append(results.iloc[0], ignore_index=True) print('\n', all_res.sort_values('Accuracy', ascending=False))
def grid_search(values, target, verbose=True):
    """Looks for the best param combinations for SVC."""
    stdout.write(' => Best Cs for SVM\n')
    results = pd.DataFrame(columns=['C', 'Accuracy'])
    cs = [10**c for c in range(-3, 2)]  # 0.001 .. 10
    for i, c in enumerate(cs):
        if verbose:
            stdout.write(f'\r ==> SVC .... {i + 1}/{len(cs)}')
        evaluation = evaluate(SVC, values, target,
                              params={'C': c, 'kernel': 'linear'})
        evaluation.update({'C': c})
        results = results.append(evaluation, ignore_index=True).round(4)
    if verbose:
        # \x1b[2K is the ANSI erase-line sequence (capital K)
        stdout.write(
            f'\x1b[2K\r => Best SVM, {results.shape[0]} combs tested!\n')
    return results
def get_best_comb(images, target, msg=''):
    """Looks for the best co-occurrence distance for the Haralick features."""
    results = pd.DataFrame()
    for dist in range(1, 4):
        print(f' ==> ({msg}) Distance {dist}/3 (all degrees + mean)', end='\r')
        haralick = extractor.get_haralick(images, dist, HARALICK_NAMES)
        values = StandardScaler().fit_transform(haralick)
        res = evaluator.evaluate(GaussianNB, values, target)
        res.update({'Distance': dist})
        results = results.append(res, ignore_index=True).round(4)
    return results
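# --- Hedged sketch (not part of the source): extractor.get_haralick() is
# assumed to wrap mahotas' haralick() with a configurable co-occurrence
# distance, keeping the four directions plus their mean (matching the
# "(all degrees + mean)" message above). The real extractor may differ;
# `names` is presumably used there to label/select features and is unused here.
import numpy as np
import mahotas


def get_haralick_sketch(images, distance, names=None):
    rows = []
    for img in images:  # expects integer-typed grayscale images
        feats = mahotas.features.haralick(img, distance=distance)  # 4 x 13
        rows.append(np.r_[feats.ravel(), feats.mean(axis=0)])
    return np.array(rows)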
def grid_search(values, target, verbose=True):
    """Looks for the best param combinations for KNN."""
    stdout.write(' => Best Ks for KNN\n')
    results = pd.DataFrame(columns=['K', 'Accuracy'])
    ks = list(range(1, 20, 2))  # odd K avoids voting ties on binary labels
    for i, k in enumerate(ks):
        if verbose:
            stdout.write(f'\r ==> KNN .... {i + 1}/{len(ks)}')
        evaluation = evaluate(KNeighborsClassifier, values, target,
                              params={'n_neighbors': k})
        evaluation.update({'K': k})
        results = results.append(evaluation, ignore_index=True).round(4)
    if verbose:
        stdout.write(f'\x1b[2K\r => KNN, {results.shape[0]} combs tested!\n')
    return results
def grid_search(values, target, verbose=True):
    """Looks for the best param combinations for AdaBoost."""
    if verbose:
        stdout.write(' => Getting best LRates for AdaBoost\n')
    results = pd.DataFrame(columns=['Learning rate', 'Accuracy'])
    l_rates = [lr / 1000 for lr in range(80, 121, 10)]  # 0.08 .. 0.12
    for i, lr in enumerate(l_rates):
        if verbose:
            stdout.write(f'\r ==> AdaBoost LRates .... {i + 1}/{len(l_rates)}')
        evaluation = evaluate(AdaBoostClassifier, values, target,
                              params={'learning_rate': lr})
        evaluation.update({'Learning rate': lr})
        results = results.append(evaluation, ignore_index=True).round(4)
    if verbose:
        stdout.write(
            f'\x1b[2K\r => AdaBoost, {results.shape[0]} combs tested!\n')
    return results
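# --- Hedged usage note (not part of the source): each grid_search() above
# returns one row per parameter value, so the winning combination for any of
# them is presumably picked the same way:
def best_row(results, metric='Accuracy'):
    """Top-scoring row of a grid-search results DataFrame."""
    return results.sort_values(metric, ascending=False).iloc[0]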
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

from lib.evaluator import evaluate
from lib import rimone
# get_lbp is assumed to come from the project's feature extractor
# (a hedged sketch follows below).

POINTS = [6, 7, 8, 9, 10]
RADIUS = [1, 2, 3]

ds = rimone.dataset()
Y = ds.Y

res = pd.DataFrame()
for p in POINTS:
    for r in RADIUS:
        print(f'p: {p}, r: {r}')
        X = StandardScaler().fit_transform(
            get_lbp(ds.cups, radius=r, points=p))
        evaluation = evaluate(GaussianNB, X, Y)
        evaluation.update({'radius': r, 'point': p})
        res = res.append(evaluation, ignore_index=True).sort_values(
            'Score', ascending=False)

res.to_csv('results/lbp_cups.csv', index=False)
print(res.sort_values('Score', ascending=False))
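# --- Hedged sketch (not part of the source): get_lbp() is assumed to build a
# normalized uniform-LBP histogram per image, e.g. via scikit-image. The
# project's actual extractor may differ.
import numpy as np
from skimage.feature import local_binary_pattern


def get_lbp_sketch(images, radius=1, points=8):
    rows = []
    for img in images:
        codes = local_binary_pattern(img, points, radius, method='uniform')
        # 'uniform' coding yields points + 2 distinct codes
        hist, _ = np.histogram(codes, bins=np.arange(points + 3), density=True)
        rows.append(hist)
    return np.array(rows)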