def fit(cnf, predict, per_patient, features_file, n_iter, blend_cnf, test_dir):
    """Fit a blend estimator on pre-extracted CNN features and either
    report validation metrics or write a submission CSV.

    Parameters
    ----------
    cnf : path of the config module loaded via ``util.load_module``.
    predict : bool; if True fit on everything (eval_size=0.0) and predict
        the test set, otherwise hold out a split and report kappa.
    per_patient : bool; reshape features so both eyes of a patient are
        presented together (via ``data.per_patient_reshape``).
    features_file : optional single feature file; overrides ``blend_cnf``.
    n_iter : number of fit/predict repetitions averaged into the ensemble.
    blend_cnf : YAML file describing feature runs (used when
        ``features_file`` is None).
    test_dir : optional override for the configured test directory.
    """
    config = util.load_module(cnf).config
    image_files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(image_files)
    # column vector of float labels, shape (n, 1)
    labels = data.get_labels(names).astype(np.float32)[:, np.newaxis]
    if features_file is not None:
        runs = {'run': [features_file]}
    else:
        # NOTE(review): yaml.load without an explicit Loader executes
        # arbitrary tags on untrusted input — confirm blend_cnf is trusted.
        runs = data.parse_blend_config(yaml.load(open(blend_cnf)))
    # one scaler per run so train statistics are reused at test time
    scalers = {run: StandardScaler() for run in runs}
    tr, te = data.split_indices(image_files, labels)  # tr is unused below
    y_preds = []
    for i in range(n_iter):
        print("iteration {} / {}".format(i + 1, n_iter))
        for run, files in runs.items():
            print("fitting features for run {}".format(run))
            X = data.load_features(files)
            X = scalers[run].fit_transform(X)
            X = data.per_patient_reshape(X) if per_patient else X
            # no eval holdout when producing final test predictions
            est = get_estimator(X.shape[1], image_files, labels,
                                eval_size=0.0 if predict else 0.1)
            est.fit(X, labels)
            if not predict:
                # running-ensemble validation score on the held-out indices
                y_pred = est.predict(X[te]).ravel()
                y_preds.append(y_pred)
                y_pred = np.mean(y_preds, axis=0)
                y_pred = np.clip(np.round(y_pred).astype(int),
                                 np.min(labels), np.max(labels))
                print("kappa after run {}, iter {}: {}".format(
                    run, i, util.kappa(labels[te], y_pred)))
                print("confusion matrix")
                print(confusion_matrix(labels[te], y_pred))
            else:
                # transform (not fit_transform): reuse train statistics
                X = data.load_features(files, test=True)
                X = scalers[run].transform(X)
                X = data.per_patient_reshape(X) if per_patient else X
                y_pred = est.predict(X).ravel()
                y_preds.append(y_pred)
    if predict:
        # average all collected predictions, round to the label range
        y_pred = np.mean(y_preds, axis=0)
        y_pred = np.clip(np.round(y_pred),
                         np.min(labels), np.max(labels)).astype(int)
        submission_filename = util.get_submission_filename()
        image_files = data.get_image_files(test_dir or config.get('test_dir'))
        names = data.get_names(image_files)
        image_column = pd.Series(names, name='image')
        level_column = pd.Series(y_pred, name='level')
        predictions = pd.concat([image_column, level_column], axis=1)
        print("tail of predictions file")
        print(predictions.tail())
        predictions.to_csv(submission_filename, index=False)
        print("saved predictions to {}".format(submission_filename))
def main(cnf, classes, weights_from, predict):
    """Train a network or predict the test set and write a submission CSV.

    Fix: the Python-2-only ``print x`` statements were converted to
    ``print(x)`` calls, matching the rest of this file and keeping the
    function valid under Python 3. No other behavior changed.

    Parameters
    ----------
    cnf : path of the config module loaded via ``util.load_module``.
    classes : number of classes, stored on ``data.classes``.
    weights_from : optional weights file; defaults to ``config.weights_file``.
    predict : bool; False -> fit, True -> load weights and predict.
    """
    config = util.load_module(cnf).config
    files = data.get_image_files(config.get('train_dir'))
    names = data.get_names(files)
    names = [int(x) for x in names]
    data.classes = int(classes)
    labels = data.get_labels(names)
    net = create_net(config)
    print(files.shape)
    print(labels.shape)
    if predict:
        if weights_from is None:
            weights_from = config.weights_file
        else:
            weights_from = str(weights_from)
        print(weights_from)
        try:
            net.load_params_from(weights_from)
            print("loaded weights from {}".format(weights_from))
        except IOError:
            # best-effort resume: missing weights fall back to scratch
            print("couldn't load weights starting from scratch")
    if not predict:
        print("fitting ...")
        net.fit(files, labels)
    else:
        print("predicting ...")
        test_files = data.get_image_files(config.get('test_dir'))
        y_pred = net.predict(test_files)
        y_pred = y_pred.transpose()
        print(y_pred)
        # round predictions into the observed label range
        y_pred = np.clip(np.round(y_pred),
                         np.min(labels), np.max(labels)).astype(int)
        submission_filename = util.get_submission_filename()
        image_files = data.get_image_files(config.get('test_dir'))
        names = data.get_names(image_files)
        image_column = pd.Series(names, name='photo_id')
        level_column = pd.DataFrame(y_pred)
        # format each prediction row for submission
        level_column = level_column.apply(lambda x: string_submit(x))
        predictions = pd.concat([image_column, level_column], axis=1)
        print("tail of predictions file")
        print(predictions.tail())
        predictions.columns = ['photo_id', 'labels']
        predictions.to_csv(submission_filename, index=False)
        print("saved predictions to {}".format(submission_filename))
def transform(cnf, n_iter, skip, test, train, weights_from, test_dir):
    """Extract quasirandom test-time-augmentation features for the train
    and/or test image sets, accumulating a running mean and standard
    deviation and checkpointing them periodically.
    """
    config = util.load_module(cnf).config

    runs = {}
    if train:
        runs["train"] = config.get("train_dir")
    if test or test_dir:
        runs["test"] = test_dir or config.get("test_dir")

    net = nn.create_net(config)

    # Resolve the weights file once; the load and message are the same
    # either way.
    weights_path = config.weights_file if weights_from is None else str(weights_from)
    net.load_params_from(weights_path)
    print("loaded weights from {}".format(weights_path))

    # A single iteration gets no augmentation at all.
    if n_iter > 1:
        sigma = config.cnf["sigma"]
        aug_params = config.cnf["aug_params"]
    else:
        sigma = 0.0
        aug_params = data.no_augmentation_params
    tfs, color_vecs = tta.build_quasirandom_transforms(
        n_iter, skip=skip, color_sigma=sigma, **aug_params)

    # Reverse-sorted keys process "train" before "test".
    for run, directory in sorted(runs.items(), reverse=True):
        print("extracting features for files in {}".format(directory))
        tic = time.time()
        files = data.get_image_files(directory)

        feat_sum = None
        feat_sq_sum = None
        for count, (tf, color_vec) in enumerate(zip(tfs, color_vecs), start=1):
            print("{} transform iter {}".format(run, count))
            feats = net.transform(files, transform=tf, color_vec=color_vec)
            if feat_sum is None:
                feat_sum = feats
                feat_sq_sum = feats ** 2
            else:
                feat_sum += feats
                feat_sq_sum += feats ** 2
            print("took {:6.1f} seconds".format(time.time() - tic))

            # Checkpoint every 5 iterations, or every iteration on short runs.
            if count % 5 == 0 or n_iter < 5:
                is_test = run == "test"
                # sample std from the running sums
                std = np.sqrt((feat_sq_sum - feat_sum ** 2 / count) / (count - 1))
                config.save_features(feat_sum / count, count, skip=skip, test=is_test)
                config.save_std(std, count, skip=skip, test=is_test)
                print("saved {} iterations".format(count))
def main(directory):
    """Estimate per-channel principal components and eigenvalues over the
    images in *directory*, averaged across batches.

    Fix: batch count now uses ceiling division. The previous
    ``int(len(filenames) / bs) + 1`` produced an empty trailing batch
    whenever the file count was an exact multiple of ``bs``; the empty
    ``np.array([])`` then crashed ``transpose(0, 2, 3, 1)``.

    Prints the mean rotation matrix ``U`` and the mean eigenvalues, as
    before.
    """
    filenames = data.get_image_files(directory)
    bs = 1000
    # ceiling division: includes a final partial batch, never an empty one
    n_batches = (len(filenames) + bs - 1) // bs
    batches = [filenames[i * bs:(i + 1) * bs] for i in range(n_batches)]
    Us, evs = [], []
    for batch in batches:
        images = np.array([data.load_augment(f, 256, 256) for f in batch])
        # assumes load_augment yields (channels, height, width) — TODO confirm
        X = images.transpose(0, 2, 3, 1).reshape(-1, 3)
        # 3x3 channel covariance (uncentered)
        cov = np.dot(X.T, X) / X.shape[0]
        U, S, V = np.linalg.svd(cov)
        ev = np.sqrt(S)
        Us.append(U)
        evs.append(ev)
    # NOTE(review): batches are weighted equally regardless of size — a
    # partial final batch skews the mean slightly; confirm acceptable.
    print('U')
    print(np.mean(Us, axis=0))
    print('eigenvalues')
    print(np.mean(evs, axis=0))
def main(cnf, weights_from):
    """Resume training (when weights exist) or train from scratch on the
    configured training directory.
    """
    config = util.load_module(cnf).config

    # Default to the configured weights file when none is given.
    weights_path = config.weights_file if weights_from is None else str(weights_from)

    files = data.get_image_files(config.get('train_dir'))
    labels = data.get_labels(data.get_names(files)).astype(np.float32)

    net = create_net(config)
    try:
        net.load_params_from(weights_path)
        print("loaded weights from {}".format(weights_path))
    except IOError:
        # Best effort: missing weights simply mean a fresh start.
        print("couldn't load weights starting from scratch")

    print("fitting ...")
    net.fit(files, labels)
def fit(cnf, exp_run_folder, classifier, features_file, n_iter, blend_cnf, test_dir, fold):
    """Fit (or directly apply) a blend classifier on pre-extracted features
    for one cross-validation fold; write a ranked prediction list and append
    summary metrics to ``results.csv``.

    Fixes over the previous revision:
    - the accuracy computation used Python-2-only syntax
      (``lambda (l, y): ...`` tuple unpacking plus list-returning ``filter``);
      replaced with a portable generator expression;
    - sensitivity/specificity are computed in float arithmetic (integer
      division of confusion-matrix counts always gave 0 or 1 under Python 2);
    - the best-estimator dump uses a ``with`` block instead of leaking the
      file handle;
    - inner loop indices no longer shadow the outer iteration counter ``i``.
    """
    config = util.load_module(cnf).config
    # Used to change the directories for weights_best, weights_epoch
    # and weights_final.
    config.cnf['fold'] = fold
    config.cnf['exp_run_folder'] = exp_run_folder

    # Fold spec "AxB": Fold_A, split B trains, the other split tests.
    folds = yaml.load(open('folds/' + data.settings['protocol'] + '.yml'))
    f0, f1 = fold.split('x')
    train_list = folds['Fold_' + f0][int(f1) - 1]
    test_list = folds['Fold_' + f0][0 if f1 == '2' else 1]

    image_files = data.get_image_files(config.get('train_dir'), train_list)
    names = data.get_names(image_files)
    labels = data.get_labels(
        names,
        label_file='folds/' + data.settings['protocol'] + '.csv'
    ).astype(np.int32)[:, np.newaxis]

    if features_file is not None:
        runs = {'run': [features_file]}
    else:
        runs = {run: [os.path.join(exp_run_folder + '/data/features', f)
                      for f in files]
                for run, files in yaml.load(open(blend_cnf)).items()}

    # NOTE(review): scalers, y_preds and y_preds_proba are created but never
    # used below — kept for now, confirm whether they can be removed.
    scalers = {run: StandardScaler() for run in runs}
    y_preds = []
    y_preds_proba = []

    for i in range(n_iter):
        print("iteration {} / {}".format(i + 1, n_iter))
        for run, files in runs.items():
            # Feature files carry a fold placeholder in their names.
            files = [f.replace('f0xf1.npy', '{}.npy'.format(fold))
                     for f in files]
            if classifier is None:
                # Features are already class scores; use them as-is.
                X_test = data.load_features(files, test=True)
                if data.settings['protocol'] != 'protocol3':
                    y_pred_proba = X_test
                    # using score from the positive class
                    y_proba = [y_pred_proba[j][1] for j in range(len(X_test))]
                    y_pred = np.clip(np.round(y_proba), 0, 1).astype(int)
                else:
                    # NOTE(review): `est` and `X` are undefined on this path
                    # (also in the original); taking this branch raises
                    # NameError. Left unchanged pending clarification.
                    y_pred_proba = est.predict_proba(X)
            else:
                print("fitting features for run {}".format(run))
                X_train = data.load_features(files)
                # L2-normalise each sample's feature vector.
                l2Norm = np.linalg.norm(X_train, axis=1)
                X_train = np.divide(X_train.T, l2Norm).T
                est = estimator(data.settings['protocol'], classifier,
                                X_train.shape[1], image_files, X_train,
                                labels, run, fold, eval_size=0.1)
                # Persist the chosen estimator without leaking the handle.
                with open(exp_run_folder +
                          "/best_estimator_fold_{}.txt".format(fold),
                          "w") as est_file:
                    est_file.write(str(est))
                X_test = data.load_features(files, test=True)
                l2Norm = np.linalg.norm(X_test, axis=1)
                X_test = np.divide(X_test.T, l2Norm).T
                if data.settings['protocol'] != 'protocol3':
                    y_pred = est.predict(X_test).ravel()
                    y_pred_proba = est.predict_proba(X_test).ravel()
                    # ravel() interleaves the two class columns; take every
                    # second entry — using score from the positive class.
                    y_proba = [y_pred_proba[k + 1]
                               for k in range(0, 2 * len(X_test), 2)]
                else:
                    y_pred_binary = est.predict(X_test)
                    y_pred = preprocessing.LabelBinarizer().fit([0, 1, 2])
                    y_pred = y_pred.inverse_transform(y_pred_binary)
                    y_proba = est.predict_proba(X_test)

    # Evaluation uses the predictions from the last run/iteration above.
    image_files = data.get_image_files(test_dir or config.get('test_dir'),
                                       test_list)
    names = data.get_names(image_files)
    labels = data.get_labels(
        names,
        label_file='folds/' + data.settings['protocol'] + '.csv'
    ).astype(np.int32)[:, np.newaxis]  # , per_patient=per_patient

    image_column = pd.Series(names, name='image')
    labels_column = pd.Series(np.squeeze(labels), name='true')
    level_column = pd.Series(y_pred, name='pred')
    if data.settings['protocol'] != 'protocol3':
        proba_column = pd.Series(y_proba, name='proba')
        predictions = pd.concat(
            [image_column, labels_column, level_column, proba_column], axis=1)
    else:
        proba_label_0 = pd.Series(y_proba[:, 0], name='proba_label_0')
        proba_label_1 = pd.Series(y_proba[:, 1], name='proba_label_1')
        proba_label_2 = pd.Series(y_proba[:, 2], name='proba_label_2')
        predictions = pd.concat(
            [image_column, labels_column, level_column,
             proba_label_0, proba_label_1, proba_label_2], axis=1)
    predictions.to_csv(
        exp_run_folder + "/ranked_list_fold_{}.csv".format(fold), sep=';')
    print("tail of predictions")
    print(predictions.tail())

    # Portable accuracy (replaces the Python-2-only filter/lambda form).
    acc = sum(1 for l, y in zip(labels, y_pred) if l == y) / float(len(labels))
    print("accuracy: {}".format(acc))
    print("confusion matrix")
    print(confusion_matrix(labels, y_pred))

    if data.settings['protocol'] != 'protocol3':
        auc = calc_auc(y_proba, labels, exp_run_folder, classifier, fold)
        print("AUC: {}".format(auc))
        average_precision = average_precision_score(labels, y_proba)
        print("average precision: {}".format(average_precision))
        c_matrix = confusion_matrix(labels, y_pred)
        # float() guards against integer division on py2-era runtimes.
        print("sensitivity: {}".format(
            float(c_matrix[1][1]) / (c_matrix[1][1] + c_matrix[0][1])))
        print("specificity: {}".format(
            float(c_matrix[0][0]) / (c_matrix[0][0] + c_matrix[1][0])))
    else:
        y_test = label_binarize(labels, classes=[0, 1, 2])
        auc = roc_auc_score(y_test, y_proba, average='macro')
        print("AUC: {}".format(auc))
        average_precision = average_precision_score(y_test, y_proba,
                                                    average="macro")
        print("mean average precision: {}".format(average_precision))

    results = pd.concat([pd.Series(exp_run_folder, name='folder'),
                         pd.Series(fold, name='fold'),
                         pd.Series(auc, name='auc'),
                         pd.Series(average_precision, name='ap'),
                         pd.Series(acc, name='acc')], axis=1)
    with open('results.csv', 'a') as f:
        results.to_csv(f, header=False)
def transform(cnf, n_iter, skip, test, train, weights_from, test_dir):
    """Extract quasirandom TTA features for the train and/or test sets with
    a fixed batch size of 128, accumulating running sums and periodically
    saving the mean features and their standard deviation.

    Parameters
    ----------
    cnf : path of the config module loaded via ``util.load_module``.
    n_iter : number of augmentation iterations (1 disables augmentation).
    skip : offset into the quasirandom sequence, forwarded to the builder
        and to ``save_features``/``save_std``.
    test, train : bools selecting which image sets to process.
    weights_from : optional weights file; defaults to ``config.weights_file``.
    test_dir : optional override for the configured test directory.
    """
    config = util.load_module(cnf).config
    # force identical batch sizes for feature extraction
    config.cnf['batch_size_train'] = 128
    config.cnf['batch_size_test'] = 128
    runs = {}
    if train:
        runs['train'] = config.get('train_dir')
    if test or test_dir:
        runs['test'] = test_dir or config.get('test_dir')
    net = nn.create_net(config)
    if weights_from is None:
        net.load_params_from(config.weights_file)
        print("loaded weights from {}".format(config.weights_file))
    else:
        weights_from = str(weights_from)
        net.load_params_from(weights_from)
        print("loaded weights from {}".format(weights_from))
    if n_iter > 1:
        tfs, color_vecs = tta.build_quasirandom_transforms(
            n_iter, skip=skip, color_sigma=config.cnf['sigma'],
            **config.cnf['aug_params'])
    else:
        # single pass: no color noise, no augmentation
        tfs, color_vecs = tta.build_quasirandom_transforms(
            n_iter, skip=skip, color_sigma=0.0,
            **data.no_augmentation_params)
    # reverse-sorted keys process "train" before "test"
    for run, directory in sorted(runs.items(), reverse=True):
        print("extracting features for files in {}".format(directory))
        tic = time.time()
        files = data.get_image_files(directory)
        Xs, Xs2 = None, None
        for i, (tf, color_vec) in enumerate(zip(tfs, color_vecs), start=1):
            print("{} transform iter {}".format(run, i))
            X = net.transform(files, transform=tf, color_vec=color_vec)
            if Xs is None:
                Xs = X
                Xs2 = X**2
            else:
                Xs += X
                Xs2 += X**2
            print('took {:6.1f} seconds'.format(time.time() - tic))
            if i % 10 == 0 or n_iter < 5:
                # sample standard deviation from the running sums
                std = np.sqrt((Xs2 - Xs**2 / i) / (i - 1))
                # NOTE(review): std computed every 10 iterations is discarded
                # unless i is also a multiple of 50 — confirm this nesting is
                # intentional (sibling transform() saves on every checkpoint).
                if i % 50 == 0:
                    config.save_features(Xs / i, i, skip=skip,
                                         test=True if run == 'test' else False)
                    config.save_std(std, i, skip=skip,
                                    test=True if run == 'test' else False)
                    print('saved {} iterations'.format(i))