Example #1
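# Imports inferred from usage in this snippet; `data`, `util`, and
# `get_estimator` are project-local helpers assumed to live alongside it.
import numpy as np
import pandas as pd
import yaml
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler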
def fit(cnf, predict, per_patient, features_file, n_iter, blend_cnf, test_dir):
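    """Fit a blend estimator on precomputed features; report kappa on a
    held-out split, or write a submission CSV when `predict` is set."""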

    config = util.load_module(cnf).config
    image_files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(image_files)
    labels = data.get_labels(names).astype(np.float32)[:, np.newaxis]

    if features_file is not None:
        runs = {'run': [features_file]}
    else:
        with open(blend_cnf) as f:
            runs = data.parse_blend_config(yaml.safe_load(f))

    scalers = {run: StandardScaler() for run in runs}

    tr, te = data.split_indices(image_files, labels)

    y_preds = []
    for i in range(n_iter):
        print("iteration {} / {}".format(i + 1, n_iter))
        for run, files in runs.items():
            print("fitting features for run {}".format(run))
            X = data.load_features(files)
            X = scalers[run].fit_transform(X)
            X = data.per_patient_reshape(X) if per_patient else X
            est = get_estimator(X.shape[1], image_files, labels,
                                eval_size=0.0 if predict else 0.1)
            est.fit(X, labels)
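            # NOTE: `est` is fit on all rows, including the `te` indices used
            # for evaluation below, unless get_estimator holds them out internally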
            if not predict:
                y_pred = est.predict(X[te]).ravel()
                y_preds.append(y_pred)
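                # running mean of all predictions collected so far, rounded
                # and clipped to the valid label range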
                y_pred = np.mean(y_preds, axis=0)
                y_pred = np.clip(np.round(y_pred).astype(int),
                                 np.min(labels), np.max(labels))
                print("kappa after run {}, iter {}: {}".format(
                    run, i, util.kappa(labels[te], y_pred)))
                print("confusion matrix")
                print(confusion_matrix(labels[te], y_pred))
            else:
                X = data.load_features(files, test=True)
                X = scalers[run].transform(X)
                X = data.per_patient_reshape(X) if per_patient else X
                y_pred = est.predict(X).ravel()
                y_preds.append(y_pred)

    if predict:
        y_pred = np.mean(y_preds, axis=0)
        y_pred = np.clip(np.round(y_pred),
                         np.min(labels), np.max(labels)).astype(int)
        submission_filename = util.get_submission_filename()
        image_files = data.get_image_files(test_dir or config.get('test_dir'))
        names = data.get_names(image_files)
        image_column = pd.Series(names, name='image')
        level_column = pd.Series(y_pred, name='level')
        predictions = pd.concat([image_column, level_column], axis=1)

        print("tail of predictions file")
        print(predictions.tail())

        predictions.to_csv(submission_filename, index=False)
        print("saved predictions to {}".format(submission_filename))
Example #2
def fit(cnf, predict, per_patient, features_file, n_iter, blend_cnf, test_dir):
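    # Near-duplicate of Example #1; it differs only in wrapping runs.items()
    # in list() below.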

    config = util.load_module(cnf).config
    image_files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(image_files)
    labels = data.get_labels(names).astype(np.float32)[:, np.newaxis]

    if features_file is not None:
        runs = {'run': [features_file]}
    else:
        with open(blend_cnf) as f:
            runs = data.parse_blend_config(yaml.safe_load(f))

    scalers = {run: StandardScaler() for run in runs}

    tr, te = data.split_indices(image_files, labels)

    y_preds = []
    for i in range(n_iter):
        print("iteration {} / {}".format(i + 1, n_iter))
        for run, files in list(runs.items()):
            print("fitting features for run {}".format(run))
            X = data.load_features(files)
            X = scalers[run].fit_transform(X)
            X = data.per_patient_reshape(X) if per_patient else X
            est = get_estimator(X.shape[1], image_files, labels,
                                eval_size=0.0 if predict else 0.1)
            est.fit(X, labels)
            if not predict:
                y_pred = est.predict(X[te]).ravel()
                y_preds.append(y_pred)
                y_pred = np.mean(y_preds, axis=0)
                y_pred = np.clip(np.round(y_pred).astype(int),
                                 np.min(labels), np.max(labels))
                print("kappa after run {}, iter {}: {}".format(
                    run, i, util.kappa(labels[te], y_pred)))
                print("confusion matrix")
                print(confusion_matrix(labels[te], y_pred))
            else:
                X = data.load_features(files, test=True)
                X = scalers[run].transform(X)
                X = data.per_patient_reshape(X) if per_patient else X
                y_pred = est.predict(X).ravel()
                y_preds.append(y_pred)

    if predict:
        y_pred = np.mean(y_preds, axis=0)
        y_pred = np.clip(np.round(y_pred),
                         np.min(labels), np.max(labels)).astype(int)
        submission_filename = util.get_submission_filename()
        image_files = data.get_image_files(test_dir or config.get('test_dir'))
        names = data.get_names(image_files)
        image_column = pd.Series(names, name='image')
        level_column = pd.Series(y_pred, name='level')
        predictions = pd.concat([image_column, level_column], axis=1)

        print("tail of predictions file")
        print(predictions.tail())

        predictions.to_csv(submission_filename, index=False)
        print("saved predictions to {}".format(submission_filename))
Example #3
    def __init__(self,
                 val_split=0.1,
                 test_split=0.1,
                 label_col='20d_ret',
                 features_dir=None):
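        """Load the feature matrix and labels, then build train/val/test splits."""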
        features_dir = features_dir or FEATURES_DIR
        X, y, dates = load_features(features_dir, label_col=label_col)
        self.X = X
        self.y = y
        self.dates = pd.to_datetime(dates)

        self.val_split, self.test_split = val_split, test_split

        self.X_train, self.X_val, self.X_test = None, None, None
        self.y_train, self.y_val, self.y_test = None, None, None

        print("Processing data...")
        self._process_data()
        print("Done")
Example #4
import json
import numpy as np
import pylab as pl
from sklearn import neighbors
from data import load_features, load_mfccs


# Load and divide data

##songs, prop_dict = load_mfccs()
songs, prop_dict = load_features()
training_set = dict(list(prop_dict.items())[:30])
testing_set = dict(list(prop_dict.items())[30:])


# Train k-NN

with open('data/drums.genres.json', 'r') as fo:
    genres = json.load(fo)
cmap = {'pop': 0, 'rock': 1, 'reggae': 2, 'jazz': 3, 'classical': 4}
classes = [cmap[genres[k]] for k in training_set.keys()]

X = np.array([p for k, p in training_set.items()])
Y = np.array(classes)
n_neighbors = 9
clf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance').fit(X, Y)


# Test k-NN
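
# Hypothetical completion of the truncated test section (the original code is
# not shown here): score the held-out songs with the fitted classifier.
X_test = np.array([p for k, p in testing_set.items()])
y_test = np.array([cmap[genres[k]] for k in testing_set.keys()])
print("k-NN accuracy: {:.2f}".format(clf.score(X_test, y_test)))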
Example #5
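# Excerpt from the body of fit() in Examples #1 and #2; image_files, labels,
# predict, per_patient and n_iter are defined earlier in that function.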
if features_file is not None:
    runs = {'run': [features_file]}
else:
    with open(blend_cnf) as f:
        runs = data.parse_blend_config(yaml.safe_load(f))

scalers = {run: StandardScaler() for run in runs}

tr, te = data.split_indices(image_files, labels)

y_preds = []
for i in range(n_iter):
    print("iteration {} / {}".format(i + 1, n_iter))
    for run, files in runs.items():
        print("fitting features for run {}".format(run))
        X = data.load_features(files)
        X = scalers[run].fit_transform(X)
        X = data.per_patient_reshape(X) if per_patient else X
        est = get_estimator(X.shape[1], image_files, labels,
                            eval_size=0.0 if predict else 0.1)
        est.fit(X, labels)
        if not predict:
            y_pred = est.predict(X[te]).ravel()
            y_preds.append(y_pred)
            y_pred = np.mean(y_preds, axis=0)
            y_pred = np.clip(np.round(y_pred).astype(int),
                             np.min(labels), np.max(labels))
            print("kappa after run {}, iter {}: {}".format(
                run, i, util.kappa(labels[te], y_pred)))
            print("confusion matrix")
            print(confusion_matrix(labels[te], y_pred))
Example #6
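# Imports inferred from usage in this snippet; `data`, `util`, `estimator`,
# and `calc_auc` are project-local helpers assumed to live alongside it.
import os

import numpy as np
import pandas as pd
import yaml
from sklearn import preprocessing
from sklearn.metrics import average_precision_score, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler, label_binarize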
def fit(cnf, exp_run_folder, classifier, features_file, n_iter, blend_cnf,
        test_dir, fold):
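    """Fit a per-fold blend classifier, write a ranked prediction list for
    the fold, and report accuracy, AUC and average precision."""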

    config = util.load_module(cnf).config
    # `fold` selects the directories for weights_best, weights_epoch and weights_final
    config.cnf['fold'] = fold
    config.cnf['exp_run_folder'] = exp_run_folder

    # fold strings look like "1x2": f0 selects the fold group, f1 the split within it
    with open('folds/' + data.settings['protocol'] + '.yml') as f:
        folds = yaml.safe_load(f)
    f0, f1 = fold.split('x')
    train_list = folds['Fold_' + f0][int(f1) - 1]
    test_list = folds['Fold_' + f0][0 if f1 == '2' else 1]

    image_files = data.get_image_files(config.get('train_dir'), train_list)
    names = data.get_names(image_files)
    labels = data.get_labels(names,
                             label_file='folds/' + data.settings['protocol'] +
                             '.csv').astype(np.int32)[:, np.newaxis]

    if features_file is not None:
        runs = {'run': [features_file]}
    else:
        with open(blend_cnf) as cfg:
            runs = {
                run: [os.path.join(exp_run_folder + '/data/features', f)
                      for f in files]
                for run, files in yaml.safe_load(cfg).items()
            }

    scalers = {run: StandardScaler() for run in runs}

    y_preds = []
    y_preds_proba = []
    for i in range(n_iter):
        print("iteration {} / {}".format(i + 1, n_iter))
        for run, files in runs.items():
            files = [
                f.replace('f0xf1.npy', '{}.npy'.format(fold)) for f in files
            ]

            if classifier is None:
                # the precomputed features are used directly as class scores
                X_test = data.load_features(files, test=True)
                if data.settings['protocol'] != 'protocol3':
                    # score of the positive class
                    y_proba = [row[1] for row in X_test]
                    y_pred = np.clip(np.round(y_proba), 0, 1).astype(int)
                else:
                    # NOTE: `est` and `X` are undefined at this point in the
                    # original code, so this branch cannot run as written
                    y_pred_proba = est.predict_proba(X)
            else:
                print("fitting features for run {}".format(run))
                X_train = data.load_features(files)
                l2Norm = np.linalg.norm(X_train, axis=1)
                X_train = np.divide(X_train.T, l2Norm).T
                est = estimator(data.settings['protocol'],
                                classifier,
                                X_train.shape[1],
                                image_files,
                                X_train,
                                labels,
                                run,
                                fold,
                                eval_size=0.1)
                with open(exp_run_folder +
                          "/best_estimator_fold_{}.txt".format(fold), "w") as f:
                    f.write(str(est))
                X_test = data.load_features(files, test=True)
                l2Norm = np.linalg.norm(X_test, axis=1)
                X_test = np.divide(X_test.T, l2Norm).T
                if data.settings['protocol'] != 'protocol3':
                    y_pred = est.predict(X_test).ravel()
                    # probability of the positive class
                    y_proba = est.predict_proba(X_test)[:, 1]
                else:
                    # map the estimator's binarized output back to class ids 0..2
                    y_pred_binary = est.predict(X_test)
                    lb = preprocessing.LabelBinarizer().fit([0, 1, 2])
                    y_pred = lb.inverse_transform(y_pred_binary)
                    y_proba = est.predict_proba(X_test)

    image_files = data.get_image_files(test_dir or config.get('test_dir'),
                                       test_list)
    names = data.get_names(image_files)
    labels = data.get_labels(
        names, label_file='folds/' + data.settings['protocol'] +
        '.csv').astype(np.int32)[:, np.newaxis]  # , per_patient=per_patient

    image_column = pd.Series(names, name='image')
    labels_column = pd.Series(np.squeeze(labels), name='true')

    level_column = pd.Series(y_pred, name='pred')
    if data.settings['protocol'] != 'protocol3':
        proba_column = pd.Series(y_proba, name='proba')
        predictions = pd.concat(
            [image_column, labels_column, level_column, proba_column], axis=1)
    else:
        proba_label_0 = pd.Series(y_proba[:, 0], name='proba_label_0')
        proba_label_1 = pd.Series(y_proba[:, 1], name='proba_label_1')
        proba_label_2 = pd.Series(y_proba[:, 2], name='proba_label_2')
        predictions = pd.concat([image_column, labels_column, level_column,
                                 proba_label_0, proba_label_1, proba_label_2],
                                axis=1)

    predictions.to_csv(exp_run_folder +
                       "/ranked_list_fold_{}.csv".format(fold),
                       sep=';')

    print("tail of predictions")
    print(predictions.tail())
    acc = float(np.mean(np.squeeze(labels) == np.asarray(y_pred)))
    print("accuracy: {}".format(acc))
    print("confusion matrix")
    print(confusion_matrix(labels, y_pred))

    if data.settings['protocol'] != 'protocol3':
        auc = calc_auc(y_proba, labels, exp_run_folder, classifier, fold)
        print("AUC: {}".format(auc))
        average_precision = average_precision_score(labels, y_proba)
        print("average precision: {}".format(average_precision))
        c_matrix = confusion_matrix(labels, y_pred)
        print("sensitivity: {}".format(c_matrix[1][1] /
                                       (c_matrix[1][1] + c_matrix[0][1])))
        print("specificity: {}".format(c_matrix[0][0] /
                                       (c_matrix[0][0] + c_matrix[1][0])))
    else:
        y_test = label_binarize(labels, classes=[0, 1, 2])
        auc = roc_auc_score(y_test, y_proba, average='macro')
        print("AUC: {}".format(auc))
        average_precision = average_precision_score(y_test,
                                                    y_proba,
                                                    average="macro")
        print("mean average precision: {}".format(average_precision))

    results = pd.concat([
        pd.Series(exp_run_folder, name='folder'),
        pd.Series(fold, name='fold'),
        pd.Series(auc, name='auc'),
        pd.Series(average_precision, name='ap'),
        pd.Series(acc, name='acc'),
    ], axis=1)
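    # append this fold's metrics to results.csv (append mode, no header row)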
    with open('results.csv', 'a') as f:
        results.to_csv(f, header=False)