def fit(cnf, predict, per_patient, features_file, n_iter, blend_cnf, test_dir):
    """Fit an estimator on extracted features, then evaluate or predict.

    For every iteration and every feature run a fresh estimator is obtained
    from ``get_estimator`` and fitted on the scaled training features. The
    predictions of all runs and iterations are averaged.

    Args:
        cnf: Passed to ``util.load_module``; the loaded module's ``config``
            supplies ``train_dir`` and ``test_dir``.
        predict: If truthy, predict on the test features and write a
            submission CSV. Otherwise print kappa and a confusion matrix for
            the ``te`` indices returned by ``data.split_indices``.
        per_patient: If truthy, features go through
            ``data.per_patient_reshape`` before fitting / predicting.
        features_file: A single features file used as the only run. When
            ``None`` the runs are read from ``blend_cnf`` instead.
        n_iter: Number of times the fit over all runs is repeated.
        blend_cnf: Path of a YAML file handed to ``data.parse_blend_config``;
            only read when ``features_file`` is ``None``.
        test_dir: Directory of test images; falls back to
            ``config.get('test_dir')`` when falsy.

    Returns:
        None. Results are printed and, when ``predict`` is truthy, saved to
        the file named by ``util.get_submission_filename()``.
    """
    config = util.load_module(cnf).config
    image_files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(image_files)
    labels = data.get_labels(names).astype(np.float32)[:, np.newaxis]

    if features_file is not None:
        runs = {'run': [features_file]}
    else:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary objects and is rejected by recent PyYAML releases;
        # consider yaml.safe_load if the blend config is plain data.
        with open(blend_cnf) as blend_file:
            runs = data.parse_blend_config(yaml.load(blend_file))

    scalers = {run: StandardScaler() for run in runs}
    # Only the second set of indices is used below.
    _, te = data.split_indices(image_files, labels)

    # Predictions are clipped to the label range seen in the training data.
    label_min, label_max = np.min(labels), np.max(labels)

    y_preds = []
    for i in range(n_iter):
        print("iteration {} / {}".format(i + 1, n_iter))
        for run, files in runs.items():
            print("fitting features for run {}".format(run))
            X = data.load_features(files)
            X = scalers[run].fit_transform(X)
            X = data.per_patient_reshape(X) if per_patient else X
            est = get_estimator(X.shape[1], image_files, labels,
                                eval_size=0.0 if predict else 0.1)
            est.fit(X, labels)
            if not predict:
                y_pred = est.predict(X[te]).ravel()
                y_preds.append(y_pred)
                # Score the running average over everything fitted so far.
                y_pred = np.mean(y_preds, axis=0)
                y_pred = np.clip(np.round(y_pred).astype(int),
                                 label_min, label_max)
                print("kappa after run {}, iter {}: {}".format(
                    run, i, util.kappa(labels[te], y_pred)))
                print("confusion matrix")
                print(confusion_matrix(labels[te], y_pred))
            else:
                # Reuse the scaler fitted on the training features.
                X = data.load_features(files, test=True)
                X = scalers[run].transform(X)
                X = data.per_patient_reshape(X) if per_patient else X
                y_pred = est.predict(X).ravel()
                y_preds.append(y_pred)

    if predict:
        y_pred = np.mean(y_preds, axis=0)
        y_pred = np.clip(np.round(y_pred), label_min, label_max).astype(int)
        submission_filename = util.get_submission_filename()
        image_files = data.get_image_files(test_dir or config.get('test_dir'))
        names = data.get_names(image_files)
        image_column = pd.Series(names, name='image')
        level_column = pd.Series(y_pred, name='level')
        predictions = pd.concat([image_column, level_column], axis=1)
        print("tail of predictions file")
        print(predictions.tail())
        predictions.to_csv(submission_filename, index=False)
        print("saved predictions to {}".format(submission_filename))
def fit(cnf, predict, per_patient, features_file, n_iter, blend_cnf, test_dir):
    """Train on precomputed features and either score or write predictions.

    Each of the ``n_iter`` iterations fits one estimator (from
    ``get_estimator``) per feature run. All collected predictions are
    averaged, rounded and clipped to the range of the training labels.

    Args:
        cnf: Argument for ``util.load_module``; its ``config`` attribute
            provides the ``train_dir`` / ``test_dir`` settings.
        predict: Truthy to predict on test features and save a CSV; falsy to
            print kappa and a confusion matrix on the ``te`` indices from
            ``data.split_indices``.
        per_patient: Truthy to apply ``data.per_patient_reshape`` to the
            scaled features.
        features_file: Single features file forming the sole run, or ``None``
            to take the runs from ``blend_cnf``.
        n_iter: How many times every run is fitted.
        blend_cnf: Path to the YAML blend configuration parsed by
            ``data.parse_blend_config`` (used only without ``features_file``).
        test_dir: Test image directory; ``config.get('test_dir')`` is used
            when this is falsy.

    Returns:
        None. Output is printed; with ``predict`` a CSV is written to the
        path returned by ``util.get_submission_filename()``.
    """
    config = util.load_module(cnf).config
    image_files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(image_files)
    labels = data.get_labels(names).astype(np.float32)[:, np.newaxis]

    if features_file is None:
        # NOTE(review): yaml.load without an explicit Loader may build
        # arbitrary objects and newer PyYAML versions refuse the call;
        # yaml.safe_load is the safer choice if the file is plain data.
        with open(blend_cnf) as blend_file:
            runs = data.parse_blend_config(yaml.load(blend_file))
    else:
        runs = {'run': [features_file]}

    scalers = {run: StandardScaler() for run in runs}
    # The first element of the split is not needed here.
    _, te = data.split_indices(image_files, labels)

    # Bounds used to keep rounded predictions inside the known label range.
    lowest, highest = np.min(labels), np.max(labels)
    eval_size = 0.0 if predict else 0.1

    y_preds = []
    for i in range(n_iter):
        print("iteration {} / {}".format(i + 1, n_iter))
        # ``runs`` is not modified while iterating, so no copy is required.
        for run, files in runs.items():
            print("fitting features for run {}".format(run))
            X = data.load_features(files)
            X = scalers[run].fit_transform(X)
            X = data.per_patient_reshape(X) if per_patient else X
            est = get_estimator(X.shape[1], image_files, labels,
                                eval_size=eval_size)
            est.fit(X, labels)

            if predict:
                # Test features are scaled with the statistics of the
                # training features of the same run.
                X = data.load_features(files, test=True)
                X = scalers[run].transform(X)
                X = data.per_patient_reshape(X) if per_patient else X
                y_pred = est.predict(X).ravel()
                y_preds.append(y_pred)
            else:
                y_pred = est.predict(X[te]).ravel()
                y_preds.append(y_pred)
                # Report the blend of all predictions gathered so far.
                y_pred = np.mean(y_preds, axis=0)
                y_pred = np.clip(np.round(y_pred).astype(int),
                                 lowest, highest)
                print("kappa after run {}, iter {}: {}".format(
                    run, i, util.kappa(labels[te], y_pred)))
                print("confusion matrix")
                print(confusion_matrix(labels[te], y_pred))

    if predict:
        y_pred = np.mean(y_preds, axis=0)
        y_pred = np.clip(np.round(y_pred), lowest, highest).astype(int)
        submission_filename = util.get_submission_filename()
        image_files = data.get_image_files(test_dir or config.get('test_dir'))
        names = data.get_names(image_files)
        image_column = pd.Series(names, name='image')
        level_column = pd.Series(y_pred, name='level')
        predictions = pd.concat([image_column, level_column], axis=1)
        print("tail of predictions file")
        print(predictions.tail())
        predictions.to_csv(submission_filename, index=False)
        print("saved predictions to {}".format(submission_filename))
def __init__(self, val_split=0.1, test_split=0.1, label_col='20d_ret', features_dir=None):
    """Load the feature data and immediately run the processing step.

    Args:
        val_split: Stored unchanged as ``self.val_split``.
        test_split: Stored unchanged as ``self.test_split``.
        label_col: Forwarded to ``load_features`` as ``label_col``.
        features_dir: Directory given to ``load_features``; any falsy value
            selects the module-level ``FEATURES_DIR``.
    """
    source_dir = features_dir if features_dir else FEATURES_DIR
    features, targets, raw_dates = load_features(source_dir, label_col=label_col)

    self.X = features
    self.y = targets
    self.dates = pd.to_datetime(raw_dates)

    self.val_split = val_split
    self.test_split = test_split

    # Split containers start out empty; presumably _process_data fills
    # them -- verify against that method.
    self.X_train = self.X_val = self.X_test = None
    self.y_train = self.y_val = self.y_test = None

    print("Processing data...")
    self._process_data()
    print("Done")
import json

import numpy as np
import pylab as pl
from sklearn import neighbors

from data import load_features, load_mfccs

# Load and divide data
# Alternative feature source: songs, prop_dict = load_mfccs()
songs, prop_dict = load_features()

# Materialize the items once: dict views cannot be sliced on Python 3.
# The first 30 entries are used for training, the remainder for testing.
prop_items = list(prop_dict.items())
training_set = dict(prop_items[:30])
testing_set = dict(prop_items[30:])

# Train k-NN
cmap = {'pop': 0, 'rock': 1, 'reggae': 2, 'jazz': 3, 'classical': 4}
# The context manager closes the file even if parsing fails.
with open('data/drums.genres.json', 'r') as fo:
    genres = json.load(fo)
# Map each training key to the integer class of its genre.
classes = [cmap[genres[k]] for k in training_set]

# Keys and values of the same dict iterate in the same order, so the rows of
# X line up with the entries of Y.
X = np.array(list(training_set.values()))
Y = np.array(classes)

n_neighbors = 9
clf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance').fit(X, Y)

# Test k-NN
# Select the feature runs: a single explicit features file, or the runs
# described by the YAML blend configuration.
if features_file is not None:
    runs = {'run': [features_file]}
else:
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary objects and is rejected by recent PyYAML releases; consider
    # yaml.safe_load if the blend config is plain data.
    with open(blend_cnf) as blend_file:
        runs = data.parse_blend_config(yaml.load(blend_file))

scalers = {run: StandardScaler() for run in runs}
tr, te = data.split_indices(image_files, labels)

y_preds = []
for i in range(n_iter):
    print("iteration {} / {}".format(i + 1, n_iter))
    for run, files in runs.items():
        print("fitting features for run {}".format(run))
        X = data.load_features(files)
        X = scalers[run].fit_transform(X)
        X = data.per_patient_reshape(X) if per_patient else X
        est = get_estimator(X.shape[1], image_files, labels,
                            eval_size=0.0 if predict else 0.1)
        est.fit(X, labels)
        if not predict:
            y_pred = est.predict(X[te]).ravel()
            y_preds.append(y_pred)
            # Evaluate the average of all predictions collected so far,
            # rounded and clipped to the range of the labels.
            y_pred = np.mean(y_preds, axis=0)
            y_pred = np.clip(np.round(y_pred).astype(int),
                             np.min(labels), np.max(labels))
            print("kappa after run {}, iter {}: {}".format(
                run, i, util.kappa(labels[te], y_pred)))
            print("confusion matrix")
            print(confusion_matrix(labels[te], y_pred))
def _l2_normalize_rows(features):
    """Divide every row of a 2-D feature array by its Euclidean norm."""
    return np.divide(features.T, np.linalg.norm(features, axis=1)).T


def fit(cnf, exp_run_folder, classifier, features_file, n_iter, blend_cnf,
        test_dir, fold):
    """Train or score one fold, save its ranked list and append its metrics.

    Args:
        cnf: Passed to ``util.load_module``; the loaded ``config`` supplies
            ``train_dir`` / ``test_dir`` and receives the ``fold`` and
            ``exp_run_folder`` entries.
        exp_run_folder: Experiment directory. Features are read from its
            ``data/features`` sub-directory (when ``features_file`` is
            ``None``) and the output files are written into it.
        classifier: Forwarded to ``estimator``. When ``None`` no estimator is
            fitted and the loaded test features are used directly as scores
            (binary protocols only).
        features_file: Single features file used as the only run, or ``None``
            to read the runs from ``blend_cnf``.
        n_iter: Number of times the loop over all runs is repeated.
        blend_cnf: Path of a YAML mapping from run name to feature file names.
        test_dir: Directory of test images; ``config.get('test_dir')`` is used
            when falsy.
        fold: Fold identifier made of two parts joined by ``'x'``. The first
            part selects the ``Fold_...`` entry of the folds file, the second
            the training list within it; the other list is used for testing.

    Returns:
        None. Writes ``best_estimator_fold_{fold}.txt`` (when a classifier is
        fitted) and ``ranked_list_fold_{fold}.csv`` into ``exp_run_folder``
        and appends one row to ``results.csv``.

    Raises:
        ValueError: If ``classifier`` is ``None`` under ``protocol3``; that
            combination has no estimator to produce class probabilities.
    """
    config = util.load_module(cnf).config
    # Used to change the directories for weights_best, weights_epoch and
    # weights_final.
    config.cnf['fold'] = fold
    config.cnf['exp_run_folder'] = exp_run_folder

    protocol = data.settings['protocol']
    # Every protocol except 'protocol3' is handled as a two-class problem.
    is_binary = protocol != 'protocol3'
    label_file = 'folds/' + protocol + '.csv'

    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary objects and is rejected by recent PyYAML releases; consider
    # yaml.safe_load if these files are plain data.
    with open('folds/' + protocol + '.yml') as folds_file:
        folds = yaml.load(folds_file)
    f0, f1 = fold.split('x')
    train_list = folds['Fold_' + f0][int(f1) - 1]
    test_list = folds['Fold_' + f0][0 if f1 == '2' else 1]

    image_files = data.get_image_files(config.get('train_dir'), train_list)
    names = data.get_names(image_files)
    labels = data.get_labels(
        names, label_file=label_file).astype(np.int32)[:, np.newaxis]

    if features_file is not None:
        runs = {'run': [features_file]}
    else:
        with open(blend_cnf) as blend_file:
            blend = yaml.load(blend_file)
        feature_dir = exp_run_folder + '/data/features'
        runs = {
            run: [os.path.join(feature_dir, f) for f in files]
            for run, files in blend.items()
        }

    for iteration in range(n_iter):
        print("iteration {} / {}".format(iteration + 1, n_iter))
        for run, files in runs.items():
            # Feature file names carry a fold placeholder.
            files = [
                f.replace('f0xf1.npy', '{}.npy'.format(fold)) for f in files
            ]
            if classifier is None:
                if not is_binary:
                    # The original code referenced an estimator that is
                    # never created on this path and always crashed here.
                    raise ValueError(
                        "classifier must be given for protocol3: there is "
                        "no fitted estimator to compute class probabilities")
                X_test = data.load_features(files, test=True)
                # Using score from the positive class (second column).
                y_proba = [row[1] for row in X_test]
                y_pred = np.clip(np.round(y_proba), 0, 1).astype(int)
            else:
                print("fitting features for run {}".format(run))
                X_train = _l2_normalize_rows(data.load_features(files))
                est = estimator(protocol, classifier, X_train.shape[1],
                                image_files, X_train, labels, run, fold,
                                eval_size=0.1)
                estimator_path = (
                    exp_run_folder
                    + "/best_estimator_fold_{}.txt".format(fold))
                with open(estimator_path, "w") as estimator_file:
                    estimator_file.write(str(est))

                X_test = _l2_normalize_rows(
                    data.load_features(files, test=True))
                if is_binary:
                    y_pred = est.predict(X_test).ravel()
                    # Using score from the positive class (second column).
                    y_proba = list(est.predict_proba(X_test)[:, 1])
                else:
                    y_pred_binary = est.predict(X_test)
                    binarizer = preprocessing.LabelBinarizer().fit([0, 1, 2])
                    y_pred = binarizer.inverse_transform(y_pred_binary)
                    y_proba = est.predict_proba(X_test)

    # From here on the names and labels refer to the test images.
    image_files = data.get_image_files(test_dir or config.get('test_dir'),
                                       test_list)
    names = data.get_names(image_files)
    labels = data.get_labels(
        names, label_file=label_file).astype(np.int32)[:, np.newaxis]

    image_column = pd.Series(names, name='image')
    labels_column = pd.Series(np.squeeze(labels), name='true')
    level_column = pd.Series(y_pred, name='pred')
    if is_binary:
        proba_column = pd.Series(y_proba, name='proba')
        predictions = pd.concat(
            [image_column, labels_column, level_column, proba_column],
            axis=1)
    else:
        proba_label_0 = pd.Series(y_proba[:, 0], name='proba_label_0')
        proba_label_1 = pd.Series(y_proba[:, 1], name='proba_label_1')
        proba_label_2 = pd.Series(y_proba[:, 2], name='proba_label_2')
        predictions = pd.concat([
            image_column, labels_column, level_column, proba_label_0,
            proba_label_1, proba_label_2
        ], axis=1)

    predictions.to_csv(
        exp_run_folder + "/ranked_list_fold_{}.csv".format(fold), sep=';')
    print("tail of predictions")
    print(predictions.tail())

    # Written without tuple-parameter lambdas or len(filter(...)) so the
    # module also parses and runs on Python 3.
    n_correct = sum(1 for truth, pred in zip(labels, y_pred) if truth == pred)
    acc = n_correct / float(len(labels))
    print("accuracy: {}".format(acc))

    c_matrix = confusion_matrix(labels, y_pred)
    print("confusion matrix")
    print(c_matrix)

    if is_binary:
        auc = calc_auc(y_proba, labels, exp_run_folder, classifier, fold)
        print("AUC: {}".format(auc))
        average_precision = average_precision_score(labels, y_proba)
        print("average precision: {}".format(average_precision))
        # Rows of the confusion matrix are true classes, columns are
        # predicted classes.
        tn, fp = c_matrix[0][0], c_matrix[0][1]
        fn, tp = c_matrix[1][0], c_matrix[1][1]
        # float() forces true division on Python 2 as well.
        print("sensitivity: {}".format(tp / float(tp + fn)))
        print("specificity: {}".format(tn / float(tn + fp)))
    else:
        y_test = label_binarize(labels, classes=[0, 1, 2])
        auc = roc_auc_score(y_test, y_proba, average='macro')
        print("AUC: {}".format(auc))
        average_precision = average_precision_score(y_test, y_proba,
                                                    average="macro")
        print("mean average precision: {}".format(average_precision))

    results = pd.concat([
        pd.Series(exp_run_folder, name='folder'),
        pd.Series(fold, name='fold'),
        pd.Series(auc, name='auc'),
        pd.Series(average_precision, name='ap'),
        pd.Series(acc, name='acc')
    ], axis=1)
    with open('results.csv', 'a') as f:
        results.to_csv(f, header=False)