示例#1
0
def run_single_test(data_dir, output_dir):
    """Run one train/classify round and write the predictions to disk.

    Trains on ``<data_dir>/train`` in fast-train mode, loads
    ``birds_model.hdf5`` from the directory containing this file, classifies
    the images in ``<data_dir>/test/images`` and saves the predicted classes
    to ``<output_dir>/output.csv``.

    The return value of the training call is discarded; the model used for
    classification is always the one loaded from disk.

    Args:
        data_dir: Directory holding the ``train`` and ``test`` sub-directories.
        output_dir: Directory that receives ``output.csv``.
    """
    from classification import train_classifier, classify
    from keras import backend as K
    from keras.models import load_model
    from os import environ
    from os.path import abspath, dirname, join

    train_dir = join(data_dir, 'train')
    test_dir = join(data_dir, 'test')

    try:
        train_gt = read_csv(join(train_dir, 'gt.csv'))
        train_img_dir = join(train_dir, 'images')

        train_classifier(train_gt, train_img_dir, fast_train=True)

        code_dir = dirname(abspath(__file__))
        print('loading model...')
        model = load_model(join(code_dir, 'birds_model.hdf5'))
        print('loaded')
        test_img_dir = join(test_dir, 'images')
        img_classes = classify(model, test_img_dir)
        save_csv(img_classes, join(output_dir, 'output.csv'))
    finally:
        # Release the backend session even when a step above raises, so a
        # failed run does not leak graph state into the next one.
        if environ.get('KERAS_BACKEND') == 'tensorflow':
            K.clear_session()
示例#2
0
def get_lung_sound_classifiers(feature_type='engineered',
                               deep_model_num=None,
                               clf_type='rf',
                               verbose=0):
    """Train a wheeze classifier and a crackle classifier on the same features.

    Args:
        feature_type: Forwarded to ``get_features``.
        deep_model_num: Forwarded to ``get_features``; ``None`` (the default)
            is treated as an empty list.
        clf_type: Forwarded to ``classification.train_classifier``.
        verbose: Progress messages are printed when greater than zero.

    Returns:
        ``(wheeze_clf, crackle_clf, has_prob)``. ``has_prob`` is the flag
        returned by the crackle training call; the one from the wheeze call
        is overwritten.
    """
    # Avoid a mutable default argument shared between calls.
    if deep_model_num is None:
        deep_model_num = []

    if verbose > 0:
        print('Loading labels')
    is_wheeze, is_crackle, is_br, sound_file_locs = load_labels_and_file_locs()
    if verbose > 0:
        print('Generating features')

    x_frame = get_features(sound_file_locs,
                           feature_type=feature_type,
                           deep_model_num=deep_model_num)
    # ``get_values()`` was removed in pandas 1.0; ``.values`` works on both
    # old and new pandas versions.
    x = x_frame.values.astype(float)

    # Create wheeze classifier
    if verbose > 0:
        print('Training wheezing classifier')
    wheeze_clf, has_prob = classification.train_classifier(x,
                                                           is_wheeze,
                                                           clf_type=clf_type)

    # Create crackle classifier
    if verbose > 0:
        print('Training crackle classifier')
    crackle_clf, has_prob = classification.train_classifier(x,
                                                            is_crackle,
                                                            clf_type=clf_type)
    return wheeze_clf, crackle_clf, has_prob
示例#3
0
def demo():
    """Run the classification and consensus demo end to end.

    Preprocesses and loads the dataset, trains and evaluates a classifier
    five times with a different seed each time, prints the average accuracy
    and AUC, prints misclassification summaries per fairness rating and per
    worker, and finally retrains a classifier whose predictions on the whole
    data set are stored in ``data["predicted_fairness"]`` before the consensus
    is computed.
    """
    frac_train = 0.5
    random_seed = 560
    dataset_prefix = "AMT" # choose between "AMT", "AMT_wo_neutral", "SSI", "SSI_wo_neutral"
    data_subset = "test" # choose between "all", "train", "test"
    features = ['volitionality', 'reliability', 'privacy', 'relevance', 'causes_outcome', 'caused_by_sensitive_feature', 'causal_loop', 'causes_disparity_in_outcomes']
    control_features = ["fairness", "worker"]

    # load preprocessed dataset
    preprocess_all_datasets()
    data = load_preprocessed_classification_data(dataset_prefix, show_preview=False)

    # CLASSIFICATION
    # train & evaluate classifiers
    accuracy_cv, auc_cv = [], []
    for _ in range(5):
        # each run uses its own seed: 559, 558, ..., 555
        random_seed -= 1
        # train classifier
        clsfr = train_classifier(data, features, control_features, frac_train=frac_train, random_seed=random_seed)
        # make predictions on the configured subset
        ground_truth, predicted, predicted_prob, fairness_control, worker_control = make_predictions(clsfr, data, data_subset, features, control_features, frac_train=frac_train, random_seed=random_seed)
        ## accuracy & auc
        accuracy, auc = calculate_evaluation_metrics(ground_truth, predicted)
        accuracy_cv.append(accuracy)
        auc_cv.append(auc)

    # Single-argument print calls produce the same text on Python 2 and 3.
    print("Average accuracy:  %s" % (np.average(np.array(accuracy_cv)),))
    print("Average AUC:  %s" % (np.average(np.array(auc_cv)),))

    # characterize misclassifications
    ## evaluate on whole data (uses the classifier and seed of the last run)
    ground_truth, predicted, predicted_prob, fairness_control, worker_control = make_predictions(clsfr, data, "all", features, control_features, frac_train=frac_train, random_seed=random_seed)

    ## misclassifications per fairness rating
    rating_mistakes = characterize_mistakes_per_rating(ground_truth, predicted, predicted_prob, fairness_control)
    print("\n\nCharacterize misclassifications per fairness rating\n")
    print_table(rating_mistakes)

    ## misclassifications per worker
    print("\n\nCharacterize misclassifications per worker (CDF)\n")
    worker_mistakes_cdf = characterize_mistakes_per_worker(ground_truth, predicted, worker_control)
    print_cdf(worker_mistakes_cdf)

    # CONSENSUS
    # random_seed is 555 here; passing the variable keeps training and
    # prediction on the same seed if the number of runs above ever changes.
    clsfr = train_classifier(data, features, control_features, frac_train=frac_train, random_seed=random_seed)
    ground_truth, predicted, predicted_prob, fairness_control, worker_control = make_predictions(clsfr, data, "all", features, control_features, frac_train=frac_train, random_seed=random_seed)
    data["predicted_fairness"] = predicted
    concensus = calculate_concensus(data)
示例#4
0
def create_and_analyze_classifier(outcome,
                                  lung_sound_algorithm_flag,
                                  exclude_na_behavior='rows'):
    """Train an ``'lr'`` classifier for ``outcome`` and save diagnostic plots.

    Loads the data, holds out 20% as a stratified test set, trains the
    classifier, prints the test-set AUC and writes a feature-importance plot
    and a ROC plot under ``img/``.

    Args:
        outcome: Outcome name; forwarded to ``load_data`` and embedded in the
            output file names.
        lung_sound_algorithm_flag: Forwarded to ``load_data``; its ``int``
            value is embedded in the output file names.
        exclude_na_behavior: Forwarded to ``load_data``.
    """
    x_frame, y = load_data(outcome,
                           lung_sound_algorithm_flag=lung_sound_algorithm_flag,
                           exclude_na_behavior=exclude_na_behavior)
    # ``get_values()`` was removed in pandas 1.0; ``.values`` works on both
    # old and new pandas versions.
    x = x_frame.values.astype(float)

    # Split dataset
    percent_test = 0.2
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=percent_test,
                                                        stratify=y)

    clf, has_prob = classification.train_classifier(x_train, y_train, clf_type='lr', verbose=1)
    auc = classification.evaluate_classifier(clf, x_test, y_test, has_prob=has_prob)
    print('Test set auc: {:0.03f}'.format(auc))

    # The coefficients are read from the pipeline step named 'clf'.
    coef = np.squeeze(clf.named_steps['clf'].coef_)
    classification.plot_feature_importance(x_frame.columns.values, coef)
    plt.tight_layout()
    file_suffix = outcome + '_LSA' + str(int(lung_sound_algorithm_flag)) + '.png'
    plt.savefig('img/feature_importance_' + file_suffix)
    plt.close()

    classification.plot_roc(clf, x_test, y_test, has_prob=has_prob)
    # NOTE(review): unlike the feature-importance file there is no '_' between
    # 'roc_curve' and the outcome; kept as-is so existing file names still match.
    plt.savefig('img/roc_curve' + file_suffix)
    plt.close()
def train_ranker(embeddings_dict, training_data_file, field_names):
    """Build feature vectors from a training file and train a classifier on them.

    Args:
        embeddings_dict: Passed to ``_get_feature_vectors`` together with the
            extracted training data.
        training_data_file: Passed to ``_extract_from_training_file``.
        field_names: Passed to ``_extract_from_training_file``.

    Returns:
        Whatever ``classification.train_classifier`` returns for the
        ``'feature_vectors'`` and ``'labels'`` entries of the feature dict.
    """
    extracted = _extract_from_training_file(training_data_file, field_names)
    vectors = _get_feature_vectors(extracted, embeddings_dict)
    samples = vectors['feature_vectors']
    targets = vectors['labels']
    return classification.train_classifier(samples, targets)
示例#6
0
def test_model(outcome='wheeze',
               clf_type='rf',
               feature_type='engineered',
               deep_model_num=None):
    """Train a classifier for one outcome and return its held-out AUC.

    Args:
        outcome: One of ``'wheeze'``, ``'crackle'`` or ``'br'``; selects which
            label vector is used as the target.
        clf_type: Forwarded to ``classification.train_classifier``.
        feature_type: Forwarded to ``get_features``.
        deep_model_num: Forwarded to ``get_features``; ``None`` (the default)
            is treated as an empty list.

    Returns:
        The value returned by ``classification.evaluate_classifier`` on a
        stratified 20% test split.

    Raises:
        ValueError: If ``outcome`` is not one of the supported names.
    """
    # Fail early: an unknown outcome previously left the labels unbound and
    # surfaced later as a confusing NameError.
    if outcome not in ('wheeze', 'crackle', 'br'):
        raise ValueError(
            "outcome must be 'wheeze', 'crackle' or 'br', got {!r}".format(outcome))
    # Avoid a mutable default argument shared between calls.
    if deep_model_num is None:
        deep_model_num = []

    print('Load labels')
    # The 'br' labels are excluded unless 'br' itself is the outcome.
    is_wheeze, is_crackle, is_br, sound_file_locs = load_labels_and_file_locs(
        exclude_br=(outcome != 'br'))
    labels_by_outcome = {'wheeze': is_wheeze, 'crackle': is_crackle, 'br': is_br}
    y = labels_by_outcome[outcome]

    print('Generate features')
    x = get_features(sound_file_locs,
                     feature_type=feature_type,
                     deep_model_num=deep_model_num)

    percent_test = 0.2
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=percent_test,
                                                        stratify=y)
    print('Training classifier')
    clf, has_prob = classification.train_classifier(x_train,
                                                    y_train,
                                                    clf_type=clf_type,
                                                    verbose=0)
    print('Evaluating classifier')
    auc = classification.evaluate_classifier(clf,
                                             x_test,
                                             y_test,
                                             has_prob=has_prob)
    return auc
示例#7
0
def compare_feature_sets(outcome):
    """Evaluate an ``'lr'`` classifier on every non-empty feature-group subset.

    For each combination of the three feature groups (lung sounds, pfm,
    questionnaire) the data is loaded, a classifier is trained, an averaged
    ROC is generated and saved under ``img/feature_importance/``, and one
    result row is appended to
    ``results/diagnosis_feature_importance_results.csv``. The CSV header is
    written only when the file does not exist yet.

    Args:
        outcome: Outcome name; forwarded to ``load_data`` and written to the
            result rows and plot file names.
    """
    feature_types = np.asarray(['lung_sound_doctor', 'pfm', 'questionnaire'])
    feature_combos = np.asarray(list(itertools.product([0, 1], repeat=3))).astype(bool)
    # Drop the first row, which is the all-False (no features) combination.
    feature_combos = feature_combos[1:, :]

    output_file = 'results/diagnosis_feature_importance_results.csv'
    if not os.path.isfile(output_file):
        with open(output_file, 'w') as f:
            f.write('outcome,lung_sounds,pfm,questionnaire,auc_median,auc_low,auc_high\n')

    for combo in feature_combos:
        use_ls, use_pfm, use_qu = combo
        x_frame, y = load_data(outcome,
                               features_to_use=feature_types[combo],
                               exclude_na_behavior='rows')
        # ``get_values()`` was removed in pandas 1.0; ``.values`` works on
        # both old and new pandas versions.
        x = x_frame.values.astype(float)
        clf, has_prob = classification.train_classifier(x, y, clf_type='lr', verbose=0)
        auc_median, auc_low, auc_high = classification.generate_average_roc(x, y, clf, has_prob)
        plt.savefig('img/feature_importance/roc_' + outcome + '_LS{:d}_PF{:d}_QU{:d}.png'.format(use_ls, use_pfm, use_qu))
        plt.close()
        with open(output_file, 'a') as f:
            f.write('{},{:d},{:d},{:d},{:0.03f},{:0.03f},{:0.03f}\n'.format(outcome, use_ls, use_pfm, use_qu, auc_median, auc_low, auc_high))
from classification import train_classifier, test_classifier, cross_validation
from sklearn.tree import DecisionTreeClassifier

if __name__ == '__main__':
    # Train, test and cross-validate a decision tree with feature selection
    # switched off.
    classifier = DecisionTreeClassifier()
    train_classifier(classifier, select_features=False)
    test_classifier(classifier, select_features=False)
    # print() with one argument behaves the same on Python 2 and Python 3.
    print('------------------')
    cross_validation(classifier, select_features=False)
            parts = line.rstrip('\n').split(',')
            res[parts[0]] = int(parts[1])
    return res


def compute_accuracy(classified, gt):
    """Return the fraction of entries in ``classified`` that match ``gt``.

    Also prints the number of correct predictions and the total count.

    Args:
        classified: Mapping from filename to predicted class id.
        gt: Mapping from filename to ground-truth class id; must contain
            every filename present in ``classified``.

    Returns:
        The accuracy as a float in ``[0, 1]``; ``0.0`` when ``classified`` is
        empty.

    Raises:
        KeyError: If a classified filename is missing from ``gt``.
    """
    total = len(classified)
    correct = sum(
        1 for filename, class_id in classified.items()
        if class_id == gt[filename]
    )
    print(correct, total)
    # Nothing was classified: report 0.0 instead of dividing by zero.
    if total == 0:
        return 0.0
    # float() keeps this a true division under Python 2 as well.
    return correct / float(total)


# Script body: run a fast training pass, then evaluate a previously saved
# model on the test images and print its accuracy.
# train_dir and test_dir are not defined in this part of the file.
train_gt = read_csv(join(train_dir, 'gt.csv'))
train_img_dir = join(train_dir, 'images')

# The return value of this fast training run is not used below.
train_classifier(train_gt, train_img_dir, fast_train=True)

# Disabled: full training followed by saving the resulting model.
#model = train_classifier(train_gt, train_img_dir)
#model.save('birds_model.hdf5')
# The evaluated model is loaded from disk instead.
model = load_model('best/birds_model.hdf5')
test_img_dir = join(test_dir, 'img_test')
img_classes = classify(model, test_img_dir)

# Compare the predictions against the test ground truth.
test_gt = read_csv(join(test_dir, 'gt.csv'))
acc = compute_accuracy(img_classes, test_gt)
print('Accuracy: ', acc)
from classification import train_classifier, test_classifier, cross_validation
from sklearn.ensemble import AdaBoostClassifier

if __name__ == '__main__':
    # Train, test and cross-validate an AdaBoost classifier with feature
    # selection switched on.
    classifier = AdaBoostClassifier()
    train_classifier(classifier, select_features=True)
    test_classifier(classifier, select_features=True)
    # print() with one argument behaves the same on Python 2 and Python 3.
    print('------------------')
    cross_validation(classifier, select_features=True)