def guppy(self, data):
    fwidth = []
    swidth = []
    rwb = []
    sband_min = []
    sband_max = []
    for index, row in data.iterrows():
        fast = [
            row['ema3'], row['ema5'], row['ema7'],
            row['ema10'], row['ema12'], row['ema15']
        ]
        slow = [
            row['ema30'], row['ema35'], row['ema40'],
            row['ema45'], row['ema50'], row['ema60']
        ]
        fmin, fmax = utils.minmax(fast)
        smin, smax = utils.minmax(slow)
        sband_min.append(smin)
        sband_max.append(smax)
        if row['ema3'] > row['ema15']:
            fwidth.append(fmax - fmin)
        else:
            fwidth.append(fmin - fmax)
        if row['ema30'] > row['ema60']:
            swidth.append(smax - smin)
        else:
            swidth.append(smin - smax)
        if fmin > smax:
            rwb.append(fmin - smax)
        elif smin > fmax:
            rwb.append(fmax - smin)
        else:
            rwb.append(0.0)
    data['fwidth'] = fwidth
    data['fwidth_pb'] = utils.positive_bars(data['fwidth'])
    data['fwidth_roc'] = utils.roc(fwidth)
    data['fwidth_roc_pb'] = utils.positive_bars(data['fwidth_roc'])
    data['fwidth_ranking'] = utils.relative_rank(data['fwidth'])
    data['swidth'] = swidth
    data['sband_min'] = sband_min
    data['sband_max'] = sband_max
    data['swidth_pb'] = utils.positive_bars(data['swidth'])
    data['swidth_roc'] = utils.roc(swidth)
    data['swidth_roc_pb'] = utils.positive_bars(data['swidth_roc'])
    data['swidth_ranking'] = utils.relative_rank(data['swidth'])
    data['rwb'] = rwb
    data['rwb_pb'] = utils.positive_bars(data['rwb'])
    data['rwb_roc'] = utils.roc(rwb)
    data['rwb_roc_pb'] = utils.positive_bars(data['rwb_roc'])
    data['rwb_ranking'] = utils.relative_rank(data['rwb'])
def macd(self, data):
    np_closes = np.array(data['close'], dtype=float)
    macd, macd_sig, macd_hist = talib.MACD(np_closes)
    data['macd'] = macd.tolist()
    data['macd_sig'] = macd_sig.tolist()
    data['macd_hist'] = macd_hist.tolist()
    data['macd_roc'] = utils.roc(data['macd'])
    data['macd_roc_pb'] = utils.positive_bars(data['macd_roc'])
    data['macd_sig_roc'] = utils.roc(data['macd_sig'])
    data['macd_sig_roc_pb'] = utils.positive_bars(data['macd_sig_roc'])
    data['macd_hist_roc'] = utils.roc(data['macd_hist'])
    data['macd_hist_pb'] = utils.positive_bars(data['macd_hist'])
    data['macd_hist_roc_pb'] = utils.positive_bars(data['macd_hist_roc'])
def stochastic(self, data):
    np_close = np.array(data['close'], dtype=float)
    np_high = np.array(data['high'], dtype=float)
    np_low = np.array(data['low'], dtype=float)
    slowk, slowd = talib.STOCH(np_high, np_low, np_close,
                               fastk_period=14, slowk_period=3)
    data['slowk'] = slowk
    data['slowd'] = slowd
    data['slowk_roc'] = utils.roc(data['slowk'])
    data['slowd_roc'] = utils.roc(data['slowd'])
    data['diff_slow_kd'] = slowk - slowd
    data['diff_slow_kd_pb'] = utils.positive_bars(data['diff_slow_kd'])
def ma(self, data):
    np_closes = np.array(data['close'], dtype=float)
    for period in ma_periods:
        data['ema' + str(period)] = talib.EMA(np_closes, timeperiod=period).tolist()
        data['ema' + str(period) + '_roc'] = utils.roc(data['ema' + str(period)])
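# The indicator methods above rely on a small project-local `utils` module that
# is not shown here (utils.minmax, utils.roc, utils.positive_bars,
# utils.relative_rank). The sketch below is only a plausible reading of those
# helpers based on how they are called; the exact implementations are assumptions.
# Note that in this context `roc` is a rate-of-change momentum measure, not an
# ROC curve.
import numpy as np

def minmax(values):
    """Return the (min, max) of a sequence of EMA values."""
    return min(values), max(values)

def roc(series, period=1):
    """Rate of change: difference from the value `period` bars earlier."""
    arr = np.asarray(series, dtype=float)
    out = np.full_like(arr, np.nan)
    out[period:] = arr[period:] - arr[:-period]
    return out.tolist()

def positive_bars(series):
    """Length of the run of consecutive positive values ending at each bar."""
    counts, run = [], 0
    for v in series:
        run = run + 1 if v is not None and not np.isnan(v) and v > 0 else 0
        counts.append(run)
    return counts

def relative_rank(series):
    """Percentile rank of each value relative to the history up to that bar."""
    arr = np.asarray(series, dtype=float)
    return [float(np.nanmean(arr[:i + 1] <= v)) for i, v in enumerate(arr)]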
def random_roc():
    # random data should give 0.5 eer
    # and a random (diagonal) roc curve
    pos = np.random.random(1000)
    neg = np.random.random(1000)
    # compare eer versions
    print(utils.eer(pos, neg))
    # plot curve
    fars, frrs = utils.roc(pos, neg)
    plt.plot(fars, frrs)
    plt.show()
def separable_roc():
    # separable data should give low
    # eer and a convex roc curve
    pos = np.random.normal(1, 0.5, 1000)
    neg = np.random.normal(0, 0.5, 1000)
    # compare eer versions
    print(utils.eer(pos, neg))
    # plot curve
    fars, frrs = utils.roc(pos, neg)
    plt.plot(fars, frrs)
    plt.show()
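# In these two sanity checks (and in the plotting script further below),
# `utils.roc` takes genuine/impostor score lists and returns FAR/FRR curves,
# and `utils.eer` returns the equal error rate. The originals are not shown,
# so the following is only a sketch under that assumption.
import numpy as np

def roc(pos, neg):
    """Return (FAR, FRR) arrays swept over all observed score thresholds."""
    pos = np.asarray(pos, dtype=float)
    neg = np.asarray(neg, dtype=float)
    thresholds = np.unique(np.concatenate([pos, neg]))
    # FAR: fraction of negatives scoring at or above the threshold (false accepts)
    fars = np.array([(neg >= t).mean() for t in thresholds])
    # FRR: fraction of positives scoring below the threshold (false rejects)
    frrs = np.array([(pos < t).mean() for t in thresholds])
    return fars, frrs

def eer(pos, neg):
    """Equal error rate: the operating point where FAR and FRR cross."""
    fars, frrs = roc(pos, neg)
    i = np.argmin(np.abs(fars - frrs))
    return (fars[i] + frrs[i]) / 2.0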
def evaluate_model(dataset, save_file, random_state, pipeline_components,
                   pipeline_parameters, n_combos, label):
    features, labels, feature_names = read_file(dataset, label)
    # pipelines = [dict(zip(pipeline_parameters.keys(), list(parameter_combination)))
    #              for parameter_combination in itertools.product(*pipeline_parameters.values())]

    # Create a temporary folder to store the transformers of the pipeline
    cachedir = mkdtemp()
    memory = Memory(cachedir=cachedir, verbose=0)
    # print(pipeline_components)
    # print(pipeline_parameters)

    with warnings.catch_warnings():
        # Squash warning messages. Turn this off when debugging!
        warnings.simplefilter('ignore')

        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

        hyperparameters = {}
        for k, v in pipeline_parameters.items():
            for param, pvals in v.items():
                hyperparameters.update({k + '__' + param: pvals})

        pipeline = Pipeline(pipeline_components, memory=memory)

        # run Randomized Search CV to tune the hyperparameter settings
        est = RandomizedSearchCV(estimator=pipeline,
                                 param_distributions=hyperparameters,
                                 n_iter=n_combos, cv=cv,
                                 random_state=random_state, refit=True,
                                 error_score=0.0)
        est.fit(features, labels)
        best_est = est.best_estimator_

        # generate cross-validated predictions for each data point using the best estimator
        cv_predictions = cross_val_predict(estimator=best_est, X=features, y=labels, cv=cv)

        # get cv probabilities
        skip = False
        if getattr(best_est, "predict_proba", None):
            method = "predict_proba"
        elif getattr(best_est, "decision_function", None):
            method = "decision_function"
        else:
            skip = True

        if not skip:
            cv_probabilities = cross_val_predict(estimator=best_est, X=features,
                                                 y=labels, method=method, cv=cv)
            if method == "predict_proba":
                cv_probabilities = cv_probabilities[:, 1]

    accuracy = accuracy_score(labels, cv_predictions)
    macro_f1 = f1_score(labels, cv_predictions, average='macro')
    balanced_accuracy = balanced_accuracy_score(labels, cv_predictions)
    if skip:
        # no probability or decision scores available for this estimator
        roc_auc = -1
    else:
        try:
            roc_auc = roc_auc_score(labels, cv_probabilities)
        except ValueError as ve:
            print("roc_auc_score: %s" % (str(ve)))
            roc_auc = -1

    preprocessor_classes = [p[0] for p in pipeline_components[:-1]]
    preprocessor_param_string = 'default'
    for preprocessor_class in preprocessor_classes:
        if preprocessor_class in pipeline_parameters.keys():
            preprocessor_param_string = ','.join([
                '{}={}'.format(parameter,
                               '|'.join([x.strip() for x in str(value).split(',')]))
                for parameter, value in pipeline_parameters[preprocessor_class].items()
            ])

    classifier_class = pipeline_components[-1][0]
    param_string = ','.join(
        ['{}={}'.format(p, v) for p, v in est.best_params_.items()])
    # for parameter, value in pipeline_parameters[classifier_class].items()])

    out_text = '\t'.join([
        dataset.split('/')[-1].split('.')[0],
        ','.join(preprocessor_classes),
        preprocessor_param_string,
        classifier_class,
        param_string,
        str(random_state),
        str(accuracy),
        str(macro_f1),
        str(balanced_accuracy),
        str(roc_auc)
    ])
    print(out_text)
    with open(save_file, 'a') as out:
        out.write(out_text + '\n')
    sys.stdout.flush()

    # write feature importances
    est_name = classifier_class
    feature_importance(save_file, best_est, est_name, feature_names, features,
                       labels, random_state, ','.join(preprocessor_classes),
                       preprocessor_param_string, classifier_class, param_string)

    # write roc curves
    if not skip:
        roc(save_file, best_est, labels, cv_probabilities, random_state,
            ','.join(preprocessor_classes), preprocessor_param_string,
            classifier_class, param_string)

    # Delete the temporary cache before exiting
    rmtree(cachedir)
def prediction_roc(self, groups, output_file_path, labels, test_size=0.2, c=.2,
                   verbose=False, save=True, iterations=100, plot=True,
                   kind='test', method='AUC', band=False, fig=None, ax=None,
                   color=None, test_function=get_training_testing, p='',
                   penalty='l1'):
    """
    :param groups: list containing groups of ROC features,
                   for example [[H3K4me3 columns], [H3K27ac columns]]
    :param output_file_path: output file path and prefix
    :param labels: labels of each group
    :param test_size:
    :param c:
    :param verbose:
    :param save:
    :param iterations:
    :param plot:
    :param kind:
    :param method:
    :param band:
    :param fig:
    :param ax:
    :param color:
    :param test_function:
    :param p:
    :param penalty:
    :return:
    """
    train_df = label_label(self.training_table, self.gene_meta_df)

    all_auc_train = {}
    all_tpr_train = {}
    all_auc_test = {}
    all_tpr_test = {}
    scores = defaultdict(float)
    mean_fpr = np.linspace(0, 1, 101)

    for r in range(iterations):
        x_train, x_test, y_train, y_test = test_function(train_df, random_state=r,
                                                         test_size=test_size)
        y_trues_train = None
        y_scores_train = None
        y_trues_test = None
        y_scores_test = None
        first = True
        for i in range(len(groups)):
            cur_column = groups[i]
            # .ix was removed in modern pandas; select the feature columns by label
            cur_x_train = x_train.loc[:, cur_column]
            cur_x_test = x_test.loc[:, cur_column]
            if len(cur_x_train.shape) == 1:
                cur_x_train = cur_x_train.to_frame()
                cur_x_test = cur_x_test.to_frame()
            cur_predictor = predict_logisticregression(cur_x_train, y_train,
                                                       penalty=penalty, c=c)
            if method == 'score':
                cur_score = score(cur_x_test, y_test, cur_predictor)
                scores[labels[i]] += cur_score
            cur_y_score_train = predict_decision(cur_predictor, cur_x_train, False)
            cur_y_score_test = predict_decision(cur_predictor, cur_x_test, False)
            if first:
                y_trues_train = y_train
                y_trues_test = y_test
                y_scores_train = cur_y_score_train.values
                y_scores_test = cur_y_score_test.values
                first = False
            else:
                y_trues_train = np.concatenate((y_trues_train, y_train), axis=1)
                y_trues_test = np.concatenate((y_trues_test, y_test), axis=1)
                y_scores_train = np.concatenate((y_scores_train,
                                                 cur_y_score_train.values), axis=1)
                y_scores_test = np.concatenate((y_scores_test,
                                                cur_y_score_test.values), axis=1)
        auc_train, fpr_train, tpr_train = roc(y_trues_train, y_scores_train, labels)
        auc_test, fpr_test, tpr_test = roc(y_trues_test, y_scores_test, labels)
        for label in labels:
            if kind == 'train' and band:
                plt.plot(fpr_train[label], tpr_train[label], lw=0.4, alpha=0.1,
                         color='grey')
            elif kind == 'test' and band:
                plt.plot(fpr_test[label], tpr_test[label], lw=0.4, alpha=0.1,
                         color='grey')
            tpr_train[label] = interp(mean_fpr, fpr_train[label], tpr_train[label])
            tpr_test[label] = interp(mean_fpr, fpr_test[label], tpr_test[label])
        for l in range(len(labels)):
            label = labels[l]
            if label not in all_auc_train:
                all_auc_train[label] = auc_train[label]
                all_auc_test[label] = auc_test[label]
                all_tpr_train[label] = [tpr_train[label]]
                all_tpr_test[label] = [tpr_test[label]]
            else:
                all_auc_train[label] += auc_train[label]
                all_auc_test[label] += auc_test[label]
                all_tpr_train[label].append(tpr_train[label])
                all_tpr_test[label].append(tpr_test[label])

    for label in labels:
        all_auc_train[label] /= iterations
        all_auc_test[label] /= iterations
        if method == 'score':
            scores[label] /= iterations

    if plot:
        if kind == 'train':
            roc_plot(all_auc_train, mean_fpr, all_tpr_train, len(labels),
                     output_file_path, labels, verbose=verbose, save=save,
                     band=band, fig=fig, ax=ax, color=color, p=p)
        elif kind == 'test':
            roc_plot(all_auc_test, mean_fpr, all_tpr_test, len(labels),
                     output_file_path, labels, verbose=verbose, save=save,
                     band=band, fig=fig, ax=ax, color=color, p=p)

    if method == 'AUC':
        results_train_df = pd.DataFrame.from_dict(all_auc_train, orient='index')
        results_train_df.columns = ['train']
        results_test_df = pd.DataFrame.from_dict(all_auc_test, orient='index')
        results_test_df.columns = ['test']
        result_df = results_train_df.join(results_test_df)
        return result_df
    elif method == 'score':
        result_df = pd.DataFrame.from_dict(scores, orient='index')
        result_df.columns = ['accuracy']
    else:
        result_df = None

    if kind == 'test':
        for key in all_tpr_test.keys():
            all_tpr_test[key] = np.mean(all_tpr_test[key], axis=0)
        # TPRs_test_df = pd.DataFrame.from_dict(all_tpr_test)
        # TPRs_test_df.to_csv(title+'TPR.csv')
    return result_df
image_batch = test_images[idx * config.BATCH_SIZE:(idx + 1) * config.BATCH_SIZE]
label_batch = test_labels[idx * config.BATCH_SIZE:(idx + 1) * config.BATCH_SIZE]
latent_loss, latent_gen_loss = sess.run([model.encoded_input, model.encoded_sample],
                                        feed_dict={model.image: image_batch})
latent_error = np.mean(abs(latent_loss - latent_gen_loss), axis=-1)
latent_error = np.reshape(latent_error, [-1])
scores_out = np.append(scores_out, latent_error)
labels_out = np.append(labels_out, label_batch)
# out_str = "---------->%d/%d" % (config.BATCH_SIZE*idx, config.BATCH_SIZE*test_num_iters)
# print(out_str, end='\r')

scores_out = np.array(scores_out)
labels_out = np.array(labels_out)
scores_out = (scores_out - scores_out.min()) / (scores_out.max() - scores_out.min())
auc_out = utils.roc(labels_out, scores_out)
print("AUC: %.4f BEST AUC: %.4f" % (auc_out, best_auc))

if auc_out > best_auc:
    best_auc = auc_out
    # Create directories if needed
    if not os.path.isdir("%s/%04d" % ("best_checkpoints", epoch)):
        os.makedirs("%s/%04d" % ("best_checkpoints", epoch))
    print('Saving model with global step %d ( = %d epochs) to disk' % (global_step, epoch))
    saver.save(sess, "%s/%04d/model.ckpt" % ("best_checkpoints", epoch))
    # Save latest checkpoint to same file name
    print('Saving model with %d epochs to disk' % (epoch))
    saver.save(sess, "best_checkpoints/model.ckpt")
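# In the snippet above, `utils.roc(labels_out, scores_out)` is treated as
# returning a single AUC value, unlike the FAR/FRR usage earlier. The helper
# itself is not shown; a plausible sketch, assuming it simply wraps
# scikit-learn, is:
from sklearn.metrics import roc_auc_score

def roc(labels, scores):
    """Return the area under the ROC curve for binary labels and anomaly scores."""
    return roc_auc_score(labels, scores)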
if num_batches * batch_size < n:
    # Computing rest
    rest = n - num_batches * batch_size
    idx = range(n - rest, n)
    x_batch = X[idx]
    out = predict(x_batch)
    preds.append(out)

# Making metadata
predictions = np.concatenate(preds, axis=0)
acc_eval = utils.accuracy(predictions, y)
all_accuracy.append(acc_eval)
auc_eval = utils.auc(predictions, y)
all_auc.append(auc_eval)
roc_eval_fpr, roc_eval_tpr, roc_eval_thresholds = utils.roc(predictions, y)
all_roc_fpr.append(roc_eval_fpr)
all_roc_tpr.append(roc_eval_tpr)
all_roc_thresholds.append(roc_eval_thresholds)

if Print:
    print "  validating: %s loss" % subset
    print "  average evaluation accuracy (%s): %.5f" % (subset, acc_eval)
    print "  average evaluation AUC (%s): %.5f" % (subset, auc_eval)
    print

print "Epoch %d of %d" % (epoch + 1, num_epochs)

if epoch in learning_rate_schedule:
    lr = np.float32(learning_rate_schedule[epoch])
    print "  setting learning rate to %.7f" % lr
    learning_rate.set_value(lr)

print "Shuffling data"
                    help='range to plot y axis')
flags = parser.parse_args()

for path in flags.files:
    if path.endswith('.txt'):
        # read comparisons file
        pos = []
        neg = []
        with open(path, 'r') as f:
            for line in f:
                t, score = line.split()
                score = float(score)
                if int(t) == 1:
                    pos.append(score)
                else:
                    neg.append(score)
        # compute roc
        fars, frrs = utils.roc(pos, neg)
        # plot roc
        plt.plot(fars, frrs, label=path)

plt.legend(loc='upper right')
plt.xlabel('FAR')
plt.ylabel('FRR')
plt.axis(flags.xrange + flags.yrange)
plt.grid()
plt.show()
    normalize,
    load_model,
    binary_cross_entropy,
    roc,
)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("dataset_test", type=open_datafile, help="dataset to use")
    parser.add_argument("model", help="model to use")
    parser.add_argument("-vi", "--visu", help="Display graphs", action="store_true")
    args = parser.parse_args()

    n = load_model(args.model)
    # test = args.dataset_test.drop(args.dataset_test.columns[0], axis=1)
    test = args.dataset_test[[
        1, 2, 3, 8, 11, 12, 17, 18, 19, 21, 26, 28, 30, 31
    ]]
    test = normalize(test)
    test = np.array(test)
    error, acc = binary_cross_entropy(test, n)
    print(f"Binary Cross-Entropy Error = {error:.5f}")
    print(f"Accuracy = {acc:.5f}")
    if args.visu is True:
        roc(test, n)
def evaluate_model(dataset, save_file, random_state, clf, clf_name, hyper_params,
                   longitudinal=False, rare=True):
    print('reading data...', end='')
    features, labels, pt_ids, feature_names, zfile = read_file(
        dataset, longitudinal, rare)
    print('done.', len(labels), 'samples,', np.sum(labels == 1), 'cases,',
          features.shape[1], 'features')
    if 'Feat' in clf_name:
        # set feature names
        clf.feature_names = ','.join(feature_names).encode()

    n_splits = 10
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scoring = make_scorer(balanced_accuracy)

    ###
    # controls matching on age and sex
    ###
    idx_age = np.argmax(feature_names == 'age')
    idx_sex = np.argmax(feature_names == 'SEX')
    # sampler = NearMiss(random_state=random_state, return_indices=True)
    sampler = QuartileExactMatch(quart_locs=[idx_age], exact_locs=[idx_sex],
                                 random_state=random_state)
    print('sampling data...', end='')
    X, y, sidx = sampler.fit_sample(features, labels)
    print('sampled data contains', np.sum(y == 1), 'cases', np.sum(y == 0), 'controls')

    ###
    # split into train/test
    ###
    X_train, X_test, y_train, y_test, sidx_train, sidx_test = (
        train_test_split(X, y, sidx, train_size=0.5, test_size=0.5,
                         random_state=random_state))
    # X,y,sidx = sampler.fit_sample(features[train_idx],labels[train_idx])

    if len(hyper_params) > 0:
        param_grid = list(ParameterGrid(hyper_params))
        # clone estimators
        Clfs = [clone(clf).set_params(**p) for p in param_grid]
        # fit with hyperparameter optimization
        cv_scores = np.zeros((len(param_grid), 10))            # cross validated scores
        cv_preds = np.zeros((len(param_grid), len(y_train)))   # cross validated predictions
        cv_probs = np.zeros((len(param_grid), len(y_train)))   # cross validated probabilities
        FI = np.zeros((len(param_grid), features.shape[1]))    # cross validated, permuted feature importance
        FI_internal = np.zeros((len(param_grid), features.shape[1]))  # cross validated feature importance

        ###########
        # this is a manual version of 10-fold cross validation with hyperparameter tuning
        t0 = time.process_time()
        for j, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
            print('fold', j)
            for i, est in enumerate(Clfs):
                print('training', type(est).__name__, i + 1, 'of', len(Clfs))
                if 'Feat' in clf_name:
                    est.logfile = (est.logfile.decode().split('.log')[0]
                                   + '.log.param' + str(i) + '.cv' + str(j)).encode()
                ##########
                # fit model
                ##########
                if longitudinal:
                    est.fit(X_train[train_idx], y_train[train_idx], zfile,
                            pt_ids[sidx_train[train_idx]])
                else:
                    est.fit(X_train[train_idx], y_train[train_idx])
                ##########
                # get predictions
                ##########
                print('getting validation predictions...')
                if longitudinal:
                    # cv_preds[i,val_idx] = est.predict(X_train[val_idx],
                    #                                   zfile,pt_ids[sidx_train[train_idx]])
                    if getattr(clf, "predict_proba", None):
                        cv_probs[i, val_idx] = est.predict_proba(
                            X_train[val_idx], zfile,
                            pt_ids[sidx_train[train_idx]])[:, 1]
                    elif getattr(clf, "decision_function", None):
                        cv_probs[i, val_idx] = est.decision_function(
                            X_train[val_idx], zfile,
                            pt_ids[sidx_train[train_idx]])
                else:
                    # cv_preds[i,val_idx] = est.predict(X_train[val_idx])
                    if getattr(clf, "predict_proba", None):
                        cv_probs[i, val_idx] = est.predict_proba(
                            X_train[val_idx])[:, 1]
                    elif getattr(clf, "decision_function", None):
                        cv_probs[i, val_idx] = est.decision_function(
                            X_train[val_idx])
                ##########
                # scores
                ##########
                cv_scores[i, j] = roc_auc_score(y_train[val_idx], cv_probs[i, val_idx])
        runtime = time.process_time() - t0
        ###########
        print('gridsearch finished in', runtime, 'seconds')

        ##########
        # get best model and its information
        mean_cv_scores = [np.mean(s) for s in cv_scores]
        best_clf = Clfs[np.argmax(mean_cv_scores)]
        ##########
    else:
        print('skipping hyperparameter tuning')
        best_clf = clf  # this option is for skipping model tuning

    t0 = time.process_time()
    print('fitting tuned model to all training data...')
    if longitudinal:
        best_clf.fit(X_train, y_train, zfile, pt_ids[sidx_train])
    else:
        best_clf.fit(X_train, y_train)
    if len(hyper_params) == 0:
        runtime = time.process_time() - t0

    # cv_predictions = cv_preds[np.argmax(mean_cv_scores)]
    # cv_probabilities = cv_probs[np.argmax(mean_cv_scores)]

    if not longitudinal:
        # internal feature importances
        cv_FI_int = compute_imp_score(best_clf, clf_name, X_train, y_train,
                                      random_state, perm=False)
        # cv_FI_int = FI_internal[np.argmax(mean_cv_scores)]
        # permutation importances
        FI = compute_imp_score(best_clf, clf_name, X_test, y_test,
                               random_state, perm=True)

    ##########
    # metrics: test the best classifier on the held-out test set
    print('getting test predictions...')
    if longitudinal:
        print('best_clf.predict(X_test, zfile, pt_ids[sidx_test])')
        test_predictions = best_clf.predict(X_test, zfile, pt_ids[sidx_test])
        if getattr(clf, "predict_proba", None):
            print('best_clf.predict_proba(X_test, zfile, pt_ids[sidx_test])')
            test_probabilities = best_clf.predict_proba(
                X_test, zfile, pt_ids[sidx_test])[:, 1]
        elif getattr(clf, "decision_function", None):
            test_probabilities = best_clf.decision_function(
                X_test, zfile, pt_ids[sidx_test])
    else:
        test_predictions = best_clf.predict(X_test)
        if getattr(clf, "predict_proba", None):
            test_probabilities = best_clf.predict_proba(X_test)[:, 1]
        elif getattr(clf, "decision_function", None):
            test_probabilities = best_clf.decision_function(X_test)

    # # write cv_pred and cv_prob to file
    # df = pd.DataFrame({'cv_prediction':cv_predictions,'cv_probability':cv_probabilities,
    #                    'pt_id':pt_ids})
    # df.to_csv(save_file.split('.csv')[0] + '_' + str(random_state) + '.cv_predictions',index=None)

    accuracy = accuracy_score(y_test, test_predictions)
    macro_f1 = f1_score(y_test, test_predictions, average='macro')
    bal_acc = balanced_accuracy(y_test, test_predictions)
    roc_auc = roc_auc_score(y_test, test_probabilities)

    ##########
    # save results to file
    print('saving results...')
    param_string = ','.join(['{}={}'.format(p, v)
                             for p, v in best_clf.get_params().items()
                             if p != 'feature_names']).replace('\n', '').replace(' ', '')
    out_text = '\t'.join([
        dataset.split('/')[-1], clf_name, param_string, str(random_state),
        str(accuracy), str(macro_f1), str(bal_acc), str(roc_auc), str(runtime)
    ])
    print(out_text)
    with open(save_file, 'a') as out:
        out.write(out_text + '\n')
    sys.stdout.flush()

    print('saving feature importance')
    # write feature importances
    if not longitudinal:
        feature_importance(save_file, best_clf, feature_names, X_test, y_test,
                           random_state, clf_name, param_string, cv_FI_int,
                           perm=False)
        feature_importance(save_file, best_clf, feature_names, X_test, y_test,
                           random_state, clf_name, param_string, FI, perm=True)

    # write roc curves
    print('saving roc')
    roc(save_file, y_test, test_probabilities, random_state, clf_name, param_string)

    return best_clf
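# Both evaluate_model variants above finish by calling a project-local
# `roc(save_file, ...)` writer that is not shown here. Judging from the call
# sites it records the tuned model's ROC curve in a results file; the sketch
# below is only an assumption about that behavior, using the argument order
# from the second variant (the file naming and column layout are guesses).
from sklearn.metrics import roc_curve

def roc(save_file, y_true, probabilities, random_state, clf_name, param_string):
    """Append the ROC curve (FPR/TPR pairs) for one run to a TSV alongside save_file."""
    fpr, tpr, _ = roc_curve(y_true, probabilities)
    roc_file = save_file.split('.csv')[0] + '.roc'
    with open(roc_file, 'a') as out:
        for f, t in zip(fpr, tpr):
            out.write('\t'.join([clf_name, param_string, str(random_state),
                                 str(f), str(t)]) + '\n')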