def run_single_test(data_dir, output_dir):
    """Exercise training once, then classify the test images and save the result.

    Args:
        data_dir: Directory holding ``train/gt.csv``, ``train/images`` and
            ``test/images``.
        output_dir: Directory in which ``output.csv`` is written.

    The training call is made with ``fast_train=True`` and its return value is
    discarded; predictions come from ``birds_model.hdf5`` stored next to this
    source file. ``read_csv`` and ``save_csv`` are expected to be provided by
    the surrounding module.
    """
    from classification import train_classifier, classify
    from keras import backend as K
    from keras.models import load_model
    from os import environ
    from os.path import abspath, dirname, join

    train_root = join(data_dir, 'train')
    test_root = join(data_dir, 'test')

    # Run the training routine in fast mode; only its side effects matter here.
    ground_truth = read_csv(join(train_root, 'gt.csv'))
    train_classifier(ground_truth, join(train_root, 'images'), fast_train=True)

    # The stored model lives alongside this file, not in the data directory.
    here = dirname(abspath(__file__))
    print('loading model...')
    model = load_model(join(here, 'birds_model.hdf5'))
    print('loaded')

    predictions = classify(model, join(test_root, 'images'))
    save_csv(predictions, join(output_dir, 'output.csv'))

    # Release backend state when running on the TensorFlow backend.
    backend_name = environ.get('KERAS_BACKEND')
    if backend_name == 'tensorflow':
        K.clear_session()
def get_lung_sound_classifiers(feature_type='engineered', deep_model_num=None,
                               clf_type='rf', verbose=0):
    """Train one wheeze classifier and one crackle classifier on the same features.

    Args:
        feature_type: Forwarded to ``get_features``.
        deep_model_num: Forwarded to ``get_features``. ``None`` (the default)
            is treated as an empty list, matching the previous ``[]`` default
            without sharing one list object across calls.
        clf_type: Forwarded to ``classification.train_classifier``.
        verbose: Progress messages are printed when greater than zero.

    Returns:
        ``(wheeze_clf, crackle_clf, has_prob)``. ``has_prob`` is the value
        returned by the crackle training call (the wheeze one is overwritten).
    """
    if deep_model_num is None:
        deep_model_num = []
    chatty = verbose > 0

    if chatty:
        print('Loading labels')
    is_wheeze, is_crackle, is_br, sound_file_locs = load_labels_and_file_locs()

    if chatty:
        print('Generating features')
    x_frame = get_features(sound_file_locs,
                           feature_type=feature_type,
                           deep_model_num=deep_model_num)
    # ``DataFrame.get_values()`` was removed in pandas 1.0; ``.values`` is
    # available on old and new releases alike.
    x = x_frame.values.astype(float)

    # Create wheeze classifier
    if chatty:
        print('Training wheezing classifier')
    wheeze_clf, has_prob = classification.train_classifier(
        x, is_wheeze, clf_type=clf_type)

    # Create crackle classifier
    if chatty:
        print('Training crackle classifier')
    crackle_clf, has_prob = classification.train_classifier(
        x, is_crackle, clf_type=clf_type)

    return wheeze_clf, crackle_clf, has_prob
def demo():
    """Train, evaluate and characterise the fairness classifier on one dataset.

    Runs five train/evaluate rounds, decrementing the random seed before each
    round (559 down to 555), and prints the average accuracy and AUC. It then
    predicts on the whole data with the classifier from the last round, prints
    the misclassifications per fairness rating and per worker, and finally
    retrains with seed 555 and calls ``calculate_concensus`` on the data with
    a ``predicted_fairness`` column added.

    All output goes through single-argument ``print(...)`` calls so the
    function parses on Python 3 while emitting the same text on Python 2.
    """
    frac_train = 0.5
    random_seed = 560
    dataset_prefix = "AMT"  # choose between "AMT", "AMT_wo_neutral", "SSI", "SSI_wo_neutral"
    # NOTE(review): data_subset is never read; the evaluation loop below
    # hard-codes "test". Confirm whether it was meant to be passed through.
    data_subset = "test"  # choose between "all", "train", "test"
    features = ['volitionality', 'reliability', 'privacy', 'relevance',
                'causes_outcome', 'caused_by_sensitive_feature', 'causal_loop',
                'causes_disparity_in_outcomes']
    control_features = ["fairness", "worker"]

    # load preprocessed dataset
    preprocess_all_datasets()
    data = load_preprocessed_classification_data(dataset_prefix, show_preview=False)

    # CLASSIFICATION
    # train & evaluate classifiers
    accuracy_cv, auc_cv = [], []
    for _ in range(5):
        random_seed -= 1

        # train classifier
        clsfr = train_classifier(data, features, control_features,
                                 frac_train=frac_train, random_seed=random_seed)

        # make predictions
        (ground_truth, predicted, predicted_prob,
         fairness_control, worker_control) = make_predictions(
            clsfr, data, "test", features, control_features,
            frac_train=frac_train, random_seed=random_seed)

        ## accuracy & auc
        accuracy, auc = calculate_evaluation_metrics(ground_truth, predicted)
        accuracy_cv.append(accuracy)
        auc_cv.append(auc)

    # The doubled space reproduces what ``print "label: ", value`` emitted:
    # the label's trailing space plus the separator the statement inserted.
    print("Average accuracy:  " + str(np.average(np.array(accuracy_cv))))
    print("Average AUC:  " + str(np.average(np.array(auc_cv))))

    # characterize misclassifications
    ## evaluate on whole data (uses the classifier from the last round)
    (ground_truth, predicted, predicted_prob,
     fairness_control, worker_control) = make_predictions(
        clsfr, data, "all", features, control_features,
        frac_train=frac_train, random_seed=random_seed)

    ## misclassifications per fairness rating
    rating_mistakes = characterize_mistakes_per_rating(
        ground_truth, predicted, predicted_prob, fairness_control)
    print("\n\nCharacterize misclassifications per fairness rating\n")
    print_table(rating_mistakes)

    ## misclassifications per worker
    print("\n\nCharacterize misclassifications per worker (CDF)\n")
    worker_mistakes_cdf = characterize_mistakes_per_worker(
        ground_truth, predicted, worker_control)
    print_cdf(worker_mistakes_cdf)

    # CONSENSUS
    # random_seed has been decremented five times and is 555 here, so the
    # literal seed used for training matches the one used for prediction.
    clsfr = train_classifier(data, features, control_features,
                             frac_train=frac_train, random_seed=555)
    (ground_truth, predicted, predicted_prob,
     fairness_control, worker_control) = make_predictions(
        clsfr, data, "all", features, control_features,
        frac_train=frac_train, random_seed=random_seed)
    data["predicted_fairness"] = predicted
    concensus = calculate_concensus(data)
def create_and_analyze_classifier(outcome, lung_sound_algorithm_flag,
                                  exclude_na_behavior='rows'):
    """Fit a logistic-regression classifier for ``outcome`` and save two plots.

    The data is split 80/20 (stratified on the labels), a classifier of type
    ``'lr'`` is trained on the larger part, its test AUC is printed, and a
    feature-importance plot and a ROC plot are written under ``img/``.

    Args:
        outcome: Forwarded to ``load_data``; also embedded in the file names.
        lung_sound_algorithm_flag: Forwarded to ``load_data``; embedded in the
            file names as ``int(flag)``.
        exclude_na_behavior: Forwarded to ``load_data``.
    """
    x_frame, y = load_data(outcome,
                           lung_sound_algorithm_flag=lung_sound_algorithm_flag,
                           exclude_na_behavior=exclude_na_behavior)
    # ``DataFrame.get_values()`` was removed in pandas 1.0; ``.values`` works
    # on both older and current releases.
    x = x_frame.values.astype(float)

    # Split dataset
    percent_test = 0.2
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=percent_test, stratify=y)

    clf, has_prob = classification.train_classifier(
        x_train, y_train, clf_type='lr', verbose=1)
    auc = classification.evaluate_classifier(
        clf, x_test, y_test, has_prob=has_prob)
    print('Test set auc: {:0.03f}'.format(auc))

    # Coefficients are read from the pipeline step named 'clf'.
    coef = np.squeeze(clf.named_steps['clf'].coef_)
    classification.plot_feature_importance(x_frame.columns.values, coef)
    plt.tight_layout()
    plt.savefig('img/feature_importance_' + outcome + '_LSA'
                + str(int(lung_sound_algorithm_flag)) + '.png')
    plt.close()

    classification.plot_roc(clf, x_test, y_test, has_prob=has_prob)
    # NOTE(review): unlike the name above there is no underscore between
    # 'roc_curve' and the outcome; kept as-is so existing output paths do not
    # change. Confirm whether that is intended.
    plt.savefig('img/roc_curve' + outcome + '_LSA'
                + str(int(lung_sound_algorithm_flag)) + '.png')
    plt.close()
def train_ranker(embeddings_dict, training_data_file, field_names):
    """Build feature vectors from a training file and fit a classifier on them.

    Args:
        embeddings_dict: Passed to ``_get_feature_vectors`` together with the
            extracted training data.
        training_data_file: Passed to ``_extract_from_training_file``.
        field_names: Passed to ``_extract_from_training_file``.

    Returns:
        Whatever ``classification.train_classifier`` returns for the
        ``'feature_vectors'`` and ``'labels'`` entries of the vector dict.
    """
    examples = _extract_from_training_file(training_data_file, field_names)
    vectors = _get_feature_vectors(examples, embeddings_dict)
    return classification.train_classifier(
        vectors['feature_vectors'],
        vectors['labels'],
    )
def test_model(outcome='wheeze', clf_type='rf', feature_type='engineered',
               deep_model_num=None):
    """Train a classifier for one outcome and return its held-out AUC.

    Args:
        outcome: One of ``'wheeze'``, ``'crackle'`` or ``'br'``. Labels are
            loaded with ``exclude_br=True`` for the first two and
            ``exclude_br=False`` for ``'br'``.
        clf_type: Forwarded to ``classification.train_classifier``.
        feature_type: Forwarded to ``get_features``.
        deep_model_num: Forwarded to ``get_features``. ``None`` (the default)
            is treated as an empty list, matching the previous ``[]`` default
            without sharing one list object across calls.

    Returns:
        The value returned by ``classification.evaluate_classifier`` on a
        stratified 20% test split.

    Raises:
        ValueError: If ``outcome`` is not one of the supported names.
            Previously this fell through with the labels left unbound.
    """
    if outcome not in ('wheeze', 'crackle', 'br'):
        raise ValueError(
            "unknown outcome {!r}; expected 'wheeze', 'crackle' or 'br'".format(outcome))
    if deep_model_num is None:
        deep_model_num = []

    print('Load labels')
    is_wheeze, is_crackle, is_br, sound_file_locs = load_labels_and_file_locs(
        exclude_br=(outcome != 'br'))
    if outcome == 'wheeze':
        y = is_wheeze
    elif outcome == 'crackle':
        y = is_crackle
    else:
        y = is_br

    print('Generate features')
    x = get_features(sound_file_locs,
                     feature_type=feature_type,
                     deep_model_num=deep_model_num)

    # Hold out a stratified test split for evaluation.
    percent_test = 0.2
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=percent_test, stratify=y)

    print('Training classifier')
    clf, has_prob = classification.train_classifier(
        x_train, y_train, clf_type=clf_type, verbose=0)

    print('Evaluating classifier')
    auc = classification.evaluate_classifier(
        clf, x_test, y_test, has_prob=has_prob)

    return auc
def compare_feature_sets(outcome):
    """Evaluate every non-empty combination of the three feature groups.

    For each combination a classifier of type ``'lr'`` is trained, the values
    returned by ``classification.generate_average_roc`` are appended to
    ``results/diagnosis_feature_importance_results.csv`` (a header is written
    first if the file does not exist yet), and the current matplotlib figure
    is saved under ``img/feature_importance/``.

    Args:
        outcome: Forwarded to ``load_data``; also written to the CSV row and
            embedded in the figure file name.
    """
    feature_types = np.asarray(['lung_sound_doctor', 'pfm', 'questionnaire'])
    feature_combos = np.asarray(
        list(itertools.product([0, 1], repeat=3))).astype(bool)
    # The first product entry is (0, 0, 0), i.e. no features at all; skip it.
    feature_combos = feature_combos[1:, :]

    output_file = 'results/diagnosis_feature_importance_results.csv'
    if not os.path.isfile(output_file):
        with open(output_file, 'w') as f:
            f.write('outcome,lung_sounds,pfm,questionnaire,auc_median,auc_low,auc_high\n')

    for combo in feature_combos:
        x_frame, y = load_data(outcome,
                               features_to_use=feature_types[combo],
                               exclude_na_behavior='rows')
        # ``DataFrame.get_values()`` was removed in pandas 1.0; ``.values``
        # works on both older and current releases.
        x = x_frame.values.astype(float)

        clf, has_prob = classification.train_classifier(
            x, y, clf_type='lr', verbose=0)
        auc_median, auc_low, auc_high = classification.generate_average_roc(
            x, y, clf, has_prob)

        # Plain ints make the ``{:d}`` formatting independent of how the
        # NumPy boolean scalar type implements ``__format__``.
        use_ls, use_pf, use_qu = (int(flag) for flag in combo)

        # presumably generate_average_roc draws the figure saved here; verify.
        plt.savefig('img/feature_importance/roc_' + outcome
                    + '_LS{:d}_PF{:d}_QU{:d}.png'.format(use_ls, use_pf, use_qu))
        plt.close()

        with open(output_file, 'a') as f:
            f.write('{},{:d},{:d},{:d},{:0.03f},{:0.03f},{:0.03f}\n'.format(
                outcome, use_ls, use_pf, use_qu,
                auc_median, auc_low, auc_high))
from classification import train_classifier, test_classifier, cross_validation
from sklearn.tree import DecisionTreeClassifier

if __name__ == '__main__':
    # Train, test and cross-validate a decision tree without feature selection.
    classifier = DecisionTreeClassifier()
    train_classifier(classifier, select_features=False)
    test_classifier(classifier, select_features=False)
    # Single-argument call form: same output on Python 2, valid on Python 3.
    print('------------------')
    cross_validation(classifier, select_features=False)
parts = line.rstrip('\n').split(',') res[parts[0]] = int(parts[1]) return res def compute_accuracy(classified, gt): correct = 0 total = len(classified) for filename, class_id in classified.items(): if class_id == gt[filename]: correct += 1 print(correct, total) return correct / total train_gt = read_csv(join(train_dir, 'gt.csv')) train_img_dir = join(train_dir, 'images') train_classifier(train_gt, train_img_dir, fast_train=True) #model = train_classifier(train_gt, train_img_dir) #model.save('birds_model.hdf5') model = load_model('best/birds_model.hdf5') test_img_dir = join(test_dir, 'img_test') img_classes = classify(model, test_img_dir) test_gt = read_csv(join(test_dir, 'gt.csv')) acc = compute_accuracy(img_classes, test_gt) print('Accuracy: ', acc)
from classification import train_classifier, test_classifier, cross_validation
from sklearn.ensemble import AdaBoostClassifier

if __name__ == '__main__':
    # Train, test and cross-validate AdaBoost with feature selection enabled.
    classifier = AdaBoostClassifier()
    train_classifier(classifier, select_features=True)
    test_classifier(classifier, select_features=True)
    # Single-argument call form: same output on Python 2, valid on Python 3.
    print('------------------')
    cross_validation(classifier, select_features=True)