def test_small_cross():
    ds = DataSet('../datasets/', 'test', 'small-cross')
    print('DS: {}; iterations: {}'.format(ds.name, ds.set_count))
    for i in range(1, ds.set_count + 1):
        print("ITER #{}".format(i))
        trn, tst = ds.get_dataset(i)
        print('\tTRAIN: {}'.format(trn))
        print('\tTEST: {}'.format(tst))
        trns, tsts = utils.get_edges_set(trn), utils.get_edges_set(tst)
        scores = get_small_scores()
        auc_res_tot = mtr.auc(ds.vx_count, trns, tsts, scores)
        auc_res_010 = mtr.auc(ds.vx_count, trns, tsts, scores, 10)
        auc_res_100 = mtr.auc(ds.vx_count, trns, tsts, scores, 100)
        auc_res_01k = mtr.auc(ds.vx_count, trns, tsts, scores, 1000)
        # auc_res_10k = mtr.auc(ds.vx_count, trns, tsts, scores, 10000)
        # auc_res_1ck = mtr.auc(ds.vx_count, trns, tsts, scores, 100000)
        # auc_res_01m = mtr.auc(ds.vx_count, trns, tsts, scores, 1000000)
        prc_res_002 = mtr.precision(ds.vx_count, trns, tsts, scores, 2)
        print('\tMETRICS:')
        print('\t\t-> AUC___TOT: {:.04}'.format(auc_res_tot))  # expected: 0.67
        print('\t\t-> AUC____10: {:.04}'.format(auc_res_010))
        print('\t\t-> AUC___100: {:.04}'.format(auc_res_100))
        print('\t\t-> AUC____1K: {:.04}'.format(auc_res_01k))
        # print('\t\t-> AUC___10K: {:.04}'.format(auc_res_10k))
        # print('\t\t-> AUC__100K: {:.04}'.format(auc_res_1ck))
        # print('\t\t-> AUC____1M: {:.04}'.format(auc_res_01m))
        print('\t\t-> PREC____2: {:.04}'.format(prc_res_002))  # expected: 0.50
        print()
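# Hedged sketch of a sampling-based link-prediction AUC matching the call shape
# mtr.auc(vx_count, train_edges, test_edges, scores, n_samples) used above. The
# real mtr.auc is not shown here (the no-argument "TOT" variant apparently
# computes an exact value), so everything below is an assumption for illustration.
import random

def sampled_link_auc(vx_count, train_edges, test_edges, scores, n_samples=1000):
    """Estimate P(score(test edge) > score(nonexistent edge)) by sampling."""
    test_list = list(test_edges)
    hits = 0.0
    for _ in range(n_samples):
        u, v = random.choice(test_list)                # a held-out positive edge
        while True:                                    # draw a true non-edge
            a, b = random.randrange(vx_count), random.randrange(vx_count)
            if a != b and (a, b) not in train_edges and (a, b) not in test_edges:
                break
        if scores[u, v] > scores[a, b]:
            hits += 1.0
        elif scores[u, v] == scores[a, b]:
            hits += 0.5                                # ties count half
    return hits / n_samples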
def __experiment_02(data_set, set_no=1, aucn=2000, category='math.GN'):
    print('Category:', category)
    data = dataset.DataSet('../datasets/', category, data_set)
    matrix = sparse.csc_matrix(
        data.get_training_set(mode='adjacency_matrix_lil', ds_index=set_no),
        dtype='d')
    training = data.get_training_set()  # metrics.get_edges_set(data.get_training_set())
    test = data.get_test_edges()  # metrics.get_edges_set(data.get_test_edges())
    print('Graph size =', data.vx_count)
    print('Computing MERW and GRW...')
    Pgrw, sd = merw.compute_grw(matrix)
    Pmerw, vekt, evals, stat = merw.compute_merw_matrix(matrix)  # renamed eval -> evals to avoid shadowing the built-in
    for a in [.1, .5, .9]:
        print('alpha =', a)
        p_dist = merw.compute_P_distance(Pgrw, alpha=a)
        print('  PD performance (AUC {}):'.format(aucn),
              metrics.auc(data.vx_count, training, test, p_dist, aucn))
        p_dist = merw.compute_P_distance(Pmerw, alpha=a)
        print('  MEPD performance (AUC {}):'.format(aucn),
              metrics.auc(data.vx_count, training, test, p_dist, aucn))
        # NOTE: the PDM/MEPDM lines below repeat the same computations as PD/MEPD above
        p_dist = merw.compute_P_distance(Pgrw, alpha=a)
        print('  PDM performance (AUC {}):'.format(aucn),
              metrics.auc(data.vx_count, training, test, p_dist, aucn))
        p_dist = merw.compute_P_distance(Pmerw, alpha=a)
        print('MEPDM performance (AUC {}):'.format(aucn),
              metrics.auc(data.vx_count, training, test, p_dist, aucn))
def my_incremental_evaluate(sess, model, minibatch_iter, size, test=False):
    val_losses = []
    val_preds = []
    labels = []
    iter_num = 0
    finished = False
    while not finished:
        feed_dict_val, batch_labels, finished, _ = \
            minibatch_iter.incremental_node_val_feed_dict(
                size, iter_num, test=test)
        node_outs_val = sess.run([model.preds, model.loss],
                                 feed_dict=feed_dict_val)
        val_preds.append(node_outs_val[0])
        labels.append(batch_labels)
        val_losses.append(node_outs_val[1])
        iter_num += 1
    val_preds = np.vstack(val_preds)
    labels = np.vstack(labels)
    precision, recall, thresholds = precision_recall_curve(
        labels[:, 1], val_preds[:, 1])
    area = auc(recall, precision)
    return area
def supervised_eval(self, train_or_valid):
    data = self.dataset.get_labeled_data(train_or_valid)
    if data is None:
        raise ValueError('no labeled examples present in dataset')
    X_labeled, y_true, _ = data
    y_pred = self.model.predict(X_labeled)
    p, r, ac, g, auc = (metrics.precision(y_true, y_pred),
                        metrics.recall(y_true, y_pred),
                        metrics.accuracy(y_true, y_pred),
                        metrics.g_means(y_true, y_pred),
                        metrics.auc(y_true, y_pred))
    self.metrics[train_or_valid].append((p, r, ac, g, auc))
def test(test_out_filename, clf, test_df, y_true):
    y_prob = clf.predict_proba(test_df)
    y_score = y_prob[:, 1]
    uids = test_df['did'].values
    with open(test_out_filename, 'w') as e_out:
        auc = metrics.auc(y_true, y_score)
        e_out.write("auc: %s\n" % str(auc))
        logging.info("auc: %s", str(auc))
        gauc = metrics.gauc(y_true, y_score, uids)
        # e_out.write("ndcg: %s\n" % str(ndcg))
        e_out.write("gauc: %s\n" % str(gauc))
        logging.info("gauc: %s", str(gauc))
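# Hedged sketch of what a gauc (grouped AUC) metric usually computes: AUC per
# user id, weighted by that user's impression count. metrics.gauc itself is not
# shown above, so the exact weighting choice here is an assumption.
from collections import defaultdict
from sklearn.metrics import roc_auc_score

def gauc(y_true, y_score, uids):
    by_user = defaultdict(list)
    for label, score, uid in zip(y_true, y_score, uids):
        by_user[uid].append((label, score))
    num, den = 0.0, 0
    for uid, pairs in by_user.items():
        labels = [l for l, _ in pairs]
        if len(set(labels)) < 2:
            continue  # AUC is undefined for single-class users
        scores = [s for _, s in pairs]
        num += len(pairs) * roc_auc_score(labels, scores)
        den += len(pairs)
    return num / den if den else float('nan')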
def plot_cmc_curve(os_scores, oaa_scores, extra_name=None):
    """
    The CMC curve shows how often the biometric subject template appears in the
    top ranks (1, 5, 10, 100, etc.), based on the match rate. It is a method of
    showing the measured accuracy of a biometric system operating in the
    closed-set identification task. Templates are compared and ranked based on
    their similarity.
    """
    # Compute mean values
    os_mean = np.mean(os_scores, axis=0)
    oaa_mean = np.mean(oaa_scores, axis=0)
    x_axis = range(len(os_mean))
    os_auc = auc(x_axis, os_mean)
    oaa_auc = auc(x_axis, oaa_mean)  # fixed typo: was ooa_auc
    # Plot Cumulative Matching Characteristic curve
    plt.clf()
    plt.plot(x_axis, os_mean, color='blue', linestyle='--',
             label='Open-set HPLS (%0.3f)' % (os_auc / len(os_scores[0])))
    plt.plot(x_axis, oaa_mean, color='red', linestyle='-',
             label='Closed-set OAA-PLS (%0.3f)' % (oaa_auc / len(os_scores[0])))
    plt.xlim([0, len(os_scores[0])])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Rank')
    plt.ylabel('Accuracy Rate')
    plt.title('Cumulative Matching Characteristic')
    plt.legend(loc="lower right")
    plt.grid()
    if extra_name is None:
        plt.show()
    else:
        plt.savefig('./plots/CMC_' + extra_name + '.pdf')
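# Hedged sketch of how per-probe CMC rows like os_scores above are commonly
# produced: rank the gallery by similarity and mark, for each rank k, whether
# the true identity appears in the top k. All names here are illustrative.
import numpy as np

def cmc_row(similarities, true_index):
    """similarities: (n_gallery,) scores for one probe; returns a 0/1 curve."""
    order = np.argsort(similarities)[::-1]           # best match first
    rank = int(np.where(order == true_index)[0][0])  # 0-based rank of the truth
    curve = np.zeros(len(similarities))
    curve[rank:] = 1.0                               # a hit at this rank and beyond
    return curve
# Averaging cmc_row over probes (np.mean(rows, axis=0)) yields a curve like os_mean.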
def get_auc(item_score, user_pos_test):
    item_score = sorted(item_score.items(), key=lambda kv: kv[1], reverse=True)
    item_sort = [x[0] for x in item_score]
    posterior = [x[1] for x in item_score]
    r = []
    for i in item_sort:
        if i in user_pos_test:
            r.append(1)
        else:
            r.append(0)
    auc = metrics.auc(ground_truth=r, prediction=posterior)
    return auc
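# Illustrative usage of get_auc, assuming metrics.auc(ground_truth, prediction)
# accepts a binary relevance list plus the matching score list (data is hypothetical):
# item_score = {101: 0.9, 102: 0.3, 103: 0.7}   # item id -> predicted score
# user_pos_test = {101, 103}                    # held-out positives for this user
# get_auc(item_score, user_pos_test)            # ranks items, labels hits, scores AUC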
def active_simulation_eval(self):
    data = self.dataset.get_unlabeled_data()
    if data is None:
        # the original constructed a UserWarning without emitting it;
        # warnings.warn actually raises the warning (requires `import warnings`)
        warnings.warn(
            'all examples have been labeled; this eval mode works '
            'if there is an unlabeled pool of data in `simulate` mode'
        )
        return
    X_unlabeled, unlabeled_indexes = data
    # get unlabeled examples' labels in simulation with `y_ideal`
    y_true = self.dataset.y_ideal[unlabeled_indexes]
    y_pred = self.model.predict(X_unlabeled)
    p, r, ac, g, auc = (metrics.precision(y_true, y_pred),
                        metrics.recall(y_true, y_pred),
                        metrics.accuracy(y_true, y_pred),
                        metrics.g_means(y_true, y_pred),
                        metrics.auc(y_true, y_pred))
    self.metrics['simulate'].append((p, r, ac, g, auc))
def test(data_set, model, data_loader, show_auc=False, use_dummy_gcn=False, use_struc=None):
    with torch.no_grad():
        logging.info('----- start_test -----')
        model.eval()
        precision = []
        recall = []
        ndcg_score = []
        auc_score = []
        for user_ids, _, __ in data_loader:
            user_ids = user_ids.to(device)
            ratings = model.get_users_ratings(user_ids, use_dummy_gcn, use_struc)
            ground_truths = []
            for i, user_id_t in enumerate(user_ids):
                user_id = user_id_t.item()
                ground_truths.append(data_set.test_user_dict[user_id])
                train_pos = data_set.train_user_dict[user_id]
                for pos_item in train_pos:
                    ratings[i][pos_item] = -1  # mask training items out of the candidates
            # Precision, Recall, NDCG
            ___, index_k = torch.topk(ratings, k=TOPK)  # index_k.shape = (batch_size, TOPK)
            batch_predict_items = index_k.cpu().tolist()
            batch_precision, batch_recall = precision_and_recall(batch_predict_items, ground_truths)
            batch_ndcg = ndcg(batch_predict_items, ground_truths)
            # AUC
            if show_auc:
                ratings = ratings.cpu().numpy()
                batch_auc = auc(ratings, data_set.get_item_num(), ground_truths)
                auc_score.append(batch_auc)
            precision.append(batch_precision)
            recall.append(batch_recall)
            ndcg_score.append(batch_ndcg)
        precision = np.mean(precision)
        recall = np.mean(recall)
        ndcg_score = np.mean(ndcg_score)
        if show_auc:  # computing AUC over the full item set takes a long time
            auc_score = np.mean(auc_score)
            logging.info('test result: precision ' + str(precision) + '; recall ' + str(recall)
                         + '; ndcg ' + str(ndcg_score) + '; auc ' + str(auc_score))
        else:
            logging.info('test result: precision ' + str(precision) + '; recall ' + str(recall)
                         + '; ndcg ' + str(ndcg_score))
def pr_curve(y_true, scores, stage, show_var=False):
    precision, recall, thresholds = precision_recall_curve(y_true, scores)
    y_pred = (scores > 0).astype(int)  # np.int is deprecated; plain int works
    auc_val = auc(y_true, y_pred)
    plt.figure(figsize=(20, 10))
    plt.plot(recall, precision, color='r')
    if show_var:
        precision_std = np.std(precision)
        precision_upper = precision + precision_std
        precision_lower = precision - precision_std
        plt.fill_between(recall, precision_upper, precision_lower,
                         color='r', alpha=0.1)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('%s Precision-Recall curve: AP=%0.2f' % (stage.title(), auc_val))
    plt.grid("on")
    plt.show()
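# Note: the title above labels auc_val as "AP", but it is computed from
# predictions thresholded at zero. An average precision in the usual PR-curve
# sense would be (hedged alternative, assuming scikit-learn is available):
# from sklearn.metrics import average_precision_score
# ap = average_precision_score(y_true, scores)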
def __experiment_01(data_set, skipSimRank=False, set_no=1, a=0.5,
                    aucn=2000, simrank_iter=10, category='math.GN'):
    print('Category:', category)
    data = dataset.DataSet('../datasets/', category, data_set)
    matrix = sparse.csc_matrix(
        data.get_training_set(mode='adjacency_matrix_csc', ds_index=set_no),
        dtype='d')
    training = data.get_training_set()  # metrics.get_edges_set(data.get_training_set())
    test = data.get_test_edges()  # metrics.get_edges_set(data.get_test_edges())
    print('Set', set_no, ' N=', data.vx_count)
    # print('Computing: MERW transition matrix...', end=' ')
    # print(vekt)
    # print(Pmerw.get_shape()[0])
    # print('"distance" matrix...')
    # print('Computing: GRW transition matrix... ', end=' ')
    Pgrw, sd = merw.compute_grw(matrix)
    # print('"distance" matrix...')
    p_dist_grw = merw.compute_P_distance(Pgrw, alpha=a)
    print('  PD performance (AUC {}):'.format(aucn),
          metrics.auc(data.vx_count, training, test, p_dist_grw, aucn))
    Pmerw, vekt, evals, stat = merw.compute_merw_matrix(matrix)  # renamed eval -> evals to avoid shadowing the built-in
    p_dist_merw = merw.compute_P_distance(Pmerw, alpha=a)
    print('  MEPD performance (AUC {}):'.format(aucn),
          metrics.auc(data.vx_count, training, test, p_dist_merw, aucn))
    # NOTE: as in __experiment_02, PDM/MEPDM below repeat the PD/MEPD computations
    ep_dist_grw = merw.compute_P_distance(Pgrw, alpha=a)
    print('  PDM performance (AUC {}):'.format(aucn),
          metrics.auc(data.vx_count, training, test, ep_dist_grw, aucn))
    ep_dist_merw = merw.compute_P_distance(Pmerw, alpha=a)
    print('MEPDM performance (AUC {}):'.format(aucn),  # label fixed: was a second 'PDM'
          metrics.auc(data.vx_count, training, test, ep_dist_merw, aucn))
    if skipSimRank:
        return
    graph = merw.matrix_to_graph(matrix)
    # print(graph)
    print('SimRank...', end='')
    sr, eps = merw.compute_basic_simrank(graph, a, maxiter=simrank_iter)
    print(' Accuracy:', eps)
    print('  SR performance (AUC {}):'.format(aucn),
          metrics.auc(data.vx_count, training, test, sr, aucn))
    print('MERW SimRank...', end='')
    sr, eps = merw.compute_merw_simrank_ofmatrix(matrix, a, maxiter=simrank_iter)
    print(' Accuracy:', eps)
    print('  MESR performance (AUC {}):'.format(aucn),
          metrics.auc(data.vx_count, training, test, sr, aucn))
def generate_det_curve(y_label_list, y_score_list):
    """
    DET curves typically feature missed detection rate on the Y axis and false
    positive rate on the X axis. This means that the bottom left corner of the
    plot is the ideal point: a false positive rate of zero and a missed
    detection rate of zero. This is not very realistic, but it does mean that
    a smaller area under the curve (AUC) is usually better.
    """
    # Prepare input data
    label_list = []
    score_list = []
    for line in y_label_list:
        temp_list = [item[1] for item in line]
        label_list.append(temp_list)
    for line in y_score_list:
        temp_list = [item[1] for item in line]
        score_list.append(temp_list)
    label_array = np.array(label_list)
    score_array = np.array(score_list)
    # Compute micro-average DET curve and DET area
    det = dict()
    det['fpr'], det['fnr'], det['thresh'] = detection_error_tradeoff(
        label_array.ravel(), score_array.ravel())
    det['auc'] = auc(det['fpr'], det['fnr'])
    return det
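# detection_error_tradeoff is not defined in this snippet; a common minimal
# sketch derives it from the ROC curve, since the miss rate (FNR) is 1 - TPR.
# This is an assumption about the helper, not the project's own implementation.
from sklearn.metrics import roc_curve

def detection_error_tradeoff(y_true, y_score):
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    fnr = 1.0 - tpr  # missed detection rate
    return fpr, fnr, thresholds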
def generate_roc_curve(y_label_list, y_score_list):
    """
    ROC curves typically feature true positive rate on the Y axis and false
    positive rate on the X axis. This means that the top left corner of the
    plot is the ideal point: a false positive rate of zero and a true positive
    rate of one. This is not very realistic, but it does mean that a larger
    area under the curve (AUC) is usually better.
    """
    # Prepare input data
    label_list = []
    score_list = []
    for line in y_label_list:
        temp_list = [item[1] for item in line]
        label_list.append(temp_list)
    for line in y_score_list:
        temp_list = [item[1] for item in line]
        score_list.append(temp_list)
    label_array = np.array(label_list)
    score_array = np.array(score_list)
    # Compute micro-average ROC curve and ROC area
    roc = dict()
    roc['fpr'], roc['tpr'], roc['thresh'] = roc_curve(label_array.ravel(),
                                                      score_array.ravel())
    roc['auc'] = auc(roc['fpr'], roc['tpr'])
    return roc
def run():
    while True:
        trial = pull_pending()
        if trial is None:
            break
        params = eval(trial['Parameters'])
        logging.info(trial)
        dataset = load(trial['Dataset'])
        fold = int(trial['Fold']) - 1
        (X_train, y_train), (X_test, y_test) = dataset[fold][0], dataset[fold][1]
        n_minority = Counter(y_train).most_common()[1][1]
        n_majority = Counter(y_train).most_common()[0][1]
        # target minority/majority ratios interpolating between the original
        # imbalance (ratio 0) and a fully balanced set (ratio 1)
        imblearn_ratios = [
            ((n_majority - n_minority) * ratio + n_minority) / n_majority
            for ratio in [0.5, 0.75, 1.0]
        ]
        clf = {
            'NB': NB(),
            'KNN': KNN(),
            'SVM': SVM(gamma='scale'),
            'CART': CART()
        }[params['classifier']]
        if (trial['Algorithm'] is None) or (trial['Algorithm'] == 'None'):
            algorithm = None
        else:
            algorithms = {
                'AKNN': ResamplingCV(AKNN, clf, n_neighbors=[1, 3, 5, 7]),
                'Bord': ResamplingCV(SMOTE, clf, kind=['borderline1'],
                                     k_neighbors=[1, 3, 5, 7, 9],
                                     m_neighbors=[5, 10, 15],
                                     sampling_strategy=imblearn_ratios),
                'CC': ResamplingCV(CC, clf, sampling_strategy=imblearn_ratios),
                'CNN': ResamplingCV(CNN, clf, n_neighbors=[1, 3, 5, 7]),
                'ENN': ResamplingCV(ENN, clf, n_neighbors=[1, 3, 5, 7]),
                'IHT': ResamplingCV(IHT, clf, sampling_strategy=imblearn_ratios, cv=[2]),
                'NCL': ResamplingCV(NCL, clf, n_neighbors=[1, 3, 5, 7]),
                'NM': ResamplingCV(NM, clf, n_neighbors=[1, 3, 5, 7]),
                'OSS': ResamplingCV(OSS, clf, n_neighbors=[1, 3, 5, 7]),
                'RBO': ResamplingCV(RBO, clf, gamma=[0.01, 0.1, 1.0, 10.0],
                                    ratio=[0.5, 0.75, 1.0]),
                'RBU': ResamplingCV(RBU, clf, gamma=params.get('gamma'),
                                    ratio=params.get('ratio')),
                'RENN': ResamplingCV(RENN, clf, n_neighbors=[1, 3, 5, 7]),
                'ROS': ResamplingCV(ROS, clf, sampling_strategy=imblearn_ratios),
                'RUS': ResamplingCV(RUS, clf, sampling_strategy=imblearn_ratios),
                'SMOTE': ResamplingCV(SMOTE, clf, k_neighbors=[1, 3, 5, 7, 9],
                                      sampling_strategy=imblearn_ratios),
                'SMOTE+ENN': ResamplingCV(
                    SMOTEENN, clf,
                    smote=[SMOTE(k_neighbors=k) for k in [1, 3, 5, 7, 9]],
                    sampling_strategy=imblearn_ratios),
                'SMOTE+TL': ResamplingCV(
                    SMOTETomek, clf,
                    smote=[SMOTE(k_neighbors=k) for k in [1, 3, 5, 7, 9]],
                    sampling_strategy=imblearn_ratios),
                'TL': TL(),
            }
            algorithm = algorithms.get(trial['Algorithm'])
            if algorithm is None:
                raise NotImplementedError
        if algorithm is not None:
            X_train, y_train = algorithm.fit_sample(X_train, y_train)
        clf = clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        scores = {
            'Precision': metrics.precision(y_test, predictions),
            'Recall': metrics.recall(y_test, predictions),
            'F-measure': metrics.f_measure(y_test, predictions),
            'AUC': metrics.auc(y_test, predictions),
            'G-mean': metrics.g_mean(y_test, predictions)
        }
        submit_result(trial, scores)
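# Worked example of the imblearn_ratios formula above (illustrative numbers):
# with n_majority = 900, n_minority = 100 and ratio = 0.5, the minority count
# is raised halfway toward the majority: (800 * 0.5 + 100) / 900 = 500 / 900
# ~= 0.56, which is the minority/majority sampling_strategy imblearn expects.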
def test_u1234567():
    y_true = [0., 0., 1., 1.]
    y_score = [-20., 0.2, 0.1, 0.9]
    print('Expected AUC = 0.75')
    print('Calculated AUC = %f' % auc(y_true, y_score))
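# Why 0.75: AUC is the fraction of (positive, negative) pairs ranked correctly.
# Positives score {0.1, 0.9} and negatives {-20., 0.2}; of the four pairs, only
# (0.1 vs 0.2) is misordered, so AUC = 3/4. A cross-check (assuming scikit-learn
# is available) would be:
# from sklearn.metrics import roc_auc_score
# roc_auc_score([0., 0., 1., 1.], [-20., 0.2, 0.1, 0.9])  # -> 0.75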
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    if os.path.exists(config.DF_FILE):
        print("FD EXISTS")
        with open(config.DF_FILE, 'rb') as fd_f:
            fd = pickle.load(fd_f)
    else:
        print("FD DOES NOT EXIST")
        fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                               numeric_cols=config.NUMERIC_COLS,
                               ignore_cols=config.IGNORE_COLS)
        with open(config.DF_FILE, 'wb') as fd_f:
            pickle.dump(fd, fd_f)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, y_test = data_parser.parse(df=dfTest, has_label=True)  # the test set also has labels
    # print(y_test)
    # print(Xi_train)
    # print(Xv_train)
    # print(y_train)
    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])
    print(dfm_params)
    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    auc_results_cv = np.zeros(len(folds), dtype=float)
    test_auc_results_cv = np.zeros(len(folds), dtype=float)
    auc_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    auc_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    # best_test_res = 0.0
    for i, (train_idx, valid_idx) in enumerate(folds):
        print(f"Fold {i}:")
        Xi_train_, Xv_train_, y_train_ = (_get(Xi_train, train_idx),
                                          _get(Xv_train, train_idx),
                                          _get(y_train, train_idx))
        Xi_valid_, Xv_valid_, y_valid_ = (_get(Xi_train, valid_idx),
                                          _get(Xv_train, valid_idx),
                                          _get(y_train, valid_idx))
        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_, i)
        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        # accumulate test predictions; they are averaged over folds below
        # (the original assigned with `=`, which made the division by len(folds) wrong)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)
        auc_results_cv[i] = auc(y_valid_, y_train_meta[valid_idx])
        test_auc_results = auc(y_test, y_test_meta)
        # if test_auc_results > best_test_res:
        #     MODEL_PATH = config.MODEL_PATH % (i,)
        #     dfm.save_model(config.MODEL_PATH)  # a save path can be given here
        test_auc_results_cv[i] = test_auc_results
        auc_results_epoch_train[i] = dfm.train_result
        auc_results_epoch_valid[i] = dfm.valid_result
    y_test_meta /= float(len(folds))
    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    print("%s: %.5f (%.5f)" % (clf_str, auc_results_cv.mean(), auc_results_cv.std()))
    print("test auc: ", test_auc_results_cv)
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, auc_results_cv.mean(), auc_results_cv.std())
    # _make_submission(ids_test, y_test_meta, filename)
    # _plot_fig(auc_results_epoch_train, auc_results_epoch_valid, clf_str)
    return y_train_meta, y_test_meta
def _eval(self, models, rs, true_r, loss_list, metrics, plot,
          node_embs_list=None, graph_embs_mat=None, attentions=None,
          eps_dir=None):
    rtn = OrderedDict()
    for metric in metrics:
        if metric == 'mrr' or metric == 'mse' or metric == 'time' or \
                'acc' in metric or metric == 'kendalls_tau' or \
                metric == 'spearmans_rho':
            d = plot_single_number_metric(
                FLAGS.dataset, models, rs, true_r, metric, self.norms,
                sim_kernel=get_flags('sim_kernel'),
                yeta=get_flags('yeta'),
                scale=get_flags('scale'),
                thresh_poss=[get_flags('thresh_val_test_pos')],
                thresh_negs=[get_flags('thresh_val_test_neg')],
                thresh_poss_sim=[0.5], thresh_negs_sim=[0.5],
                plot_results=plot, eps_dir=eps_dir)
            rtn.update(d)
        elif metric == 'draw_gt_rk':
            comb_gt_rk(FLAGS.dataset, FLAGS.dist_algo, rs[FLAGS.model],
                       eps_dir + '/gt_rk')
        elif metric == 'groundtruth':
            pass
        elif metric == 'draw_heat_hist':
            if node_embs_list is not None:
                draw_emb_hist_heat(
                    FLAGS.dataset, node_embs_list, FLAGS.dist_norm,
                    max_nodes=FLAGS.max_nodes, apply_sigmoid=True,
                    eps_dir=eps_dir + '/mne')
        elif metric == 'emb_vis_gradual':
            if graph_embs_mat is not None:
                visualize_embeddings_gradual(
                    FLAGS.dataset, graph_embs_mat,
                    eps_dir=eps_dir + '/emb_vis_gradual')
        elif metric == 'ranking':
            pass
            # ranking(
            #     FLAGS.dataset, FLAGS.dist_algo, rs[FLAGS.model],
            #     eps_dir=eps_dir + '/ranking')
        elif metric == 'attention':
            if attentions is not None:
                draw_attention(
                    FLAGS.dataset, FLAGS.dist_algo, attentions,
                    eps_dir=eps_dir + '/attention')
        elif metric == 'auc':
            auc_score = auc(
                true_r, rs[FLAGS.model],
                thresh_pos=get_flags('thresh_val_test_pos'),
                thresh_neg=get_flags('thresh_val_test_neg'),
                norm=FLAGS.dist_norm)
            print('auc', auc_score)
            rtn.update({'auc': auc_score})
        elif 'prec@k' in metric:
            d = plot_preck(
                FLAGS.dataset, models, rs, true_r, metric, self.norms,
                plot, eps_dir=eps_dir)
            rtn.update(d)
        elif metric == 'loss':
            rtn.update({metric: np.mean(loss_list)})
        elif metric == 'emb_vis_binary':
            if graph_embs_mat is not None:
                visualize_embeddings_binary(
                    FLAGS.dataset, graph_embs_mat, self.true_test_result,
                    thresh_pos=get_flags('thresh_val_test_pos'),
                    thresh_neg=get_flags('thresh_val_test_neg'),
                    thresh_pos_sim=0.5, thresh_neg_sim=0.5,
                    norm=FLAGS.dist_norm,
                    eps_dir=eps_dir + '/emb_vis_binary')
        else:
            raise RuntimeError('Unknown metric {}'.format(metric))
    return rtn
                        X, y, cv=5, scoring=scoring)
plt_handle.show()

# train and report test results
clf_supervised = models.default_model()
clf_supervised.fit(X, y)
sup_y_test_preds = clf_supervised.predict(X_test)
supervised_results = {
    'accuracy': metrics.accuracy(y_test, sup_y_test_preds),
    'precision': metrics.precision(y_test, sup_y_test_preds),
    'recall': metrics.recall(y_test, sup_y_test_preds),
    'gmeans': metrics.g_means(y_test, sup_y_test_preds),
    'auc': metrics.auc(y_test, sup_y_test_preds),
    'cohen-kappa': metrics.user_machine_agreement(y_test, sup_y_test_preds)
}

#============================================================
# IV (a) - Active Learning
#============================================================
# Part IV of this demo is divided into two sub-parts:
#
# (a) - Here we demonstrate the active "learning phase" of
#       a typical predictive coding life cycle; since the
#       demo runs in simulation mode, we will not require
#       an interactive session to get user labels.
#
# (b) - In this part we will simulate the review phase.
#
        'Follow training by printing the loss and prediction for each image'
        # print("Batch ", i, "/", training_generator.__len__(),
        #       ", Loss: ", loss_values.numpy()[0])
        # print("Prediction: ", logits.numpy()[0,:], ", Label: ", y_train[0])
        temp_loss_list.append(loss_values.numpy().mean())
        label_list.append(y_train[0])
        pred_list.append(logits.numpy()[0, 1])

        grads = tape.gradient(loss_values, model.trainable_variables[-4:])
        optimizer.apply_gradients(zip(grads, model.trainable_variables[-4:]))

    'Compute metrics on training set'
    loss_train = np.mean(np.asarray(temp_loss_list))
    auc_train = auc(label_list, pred_list)
    print("Training loss: ", loss_train, ", AUC: ", auc_train)
    acc, sens, spec = conf(label_list, pred_list)
    print("Training accuracy: ", acc, ", sensitivity: ", sens,
          ", specificity: ", spec)

    'Evaluate on validation set'
    for i in range(validation_generator.__len__()):
        x_train, y_train = validation_generator.__getitem__(i)
        logits = model(x_train, training=False)
        loss_values = loss(y_train, logits)
        # print("Batch ", i, "/", validation_generator.__len__(),
        #       ", Loss: ", loss_values.numpy()[0])
        # print("Prediction: ", logits.numpy()[0,:], ", Label: ", y_train[0])
def dk_tests_1k():
    ds = DataSet('../datasets/', 'gr-qc', 'eg1k')
    trn, tst = ds.get_dataset()
    trns, tsts = utils.get_edges_set(trn), utils.get_edges_set(tst)
    # Remove edges appearing in both sets, alternating which side drops them
    rmtrns, rmtsts = set(), set()
    toTest = True
    for x in tsts:
        if x in trns:
            if toTest:
                rmtrns.add(x)
            else:
                rmtsts.add(x)
            toTest = not toTest
    for x in rmtrns:
        trns.remove(x)
    for x in rmtsts:
        tsts.remove(x)
    for x in tsts:
        if x in trns:
            print("NO!")  # sanity check: the train/test overlap should be gone
    A = lil_matrix((ds.vx_count, ds.vx_count))
    for v1, v2 in trns:
        A[v1, v2] = 1
        A[v2, v1] = 1
    A = csr_matrix(A, (ds.vx_count, ds.vx_count), 'd')
    ls, vs = sla.eigsh(A, 1, which='LA')
    l_max = ls[0]
    v_max = vs[:, 0]
    # print("Values of AUC (1000 samples) and precision (K=30) "
    #       "for heat diffusion kernel variants:")
    print("Values of AUC (10000 samples) for heat diffusion kernel variants:")
    auc_sampl = 10000
    prc_k = 30
    # DK
    DK = kern.heat_diffusion_kernel(kern.laplacian(A))
    auc = mtr.auc(ds.vx_count, trns, tsts, DK, auc_sampl)
    print("   DK - AUC: {:.4f}".format(auc))
    prc = mtr.precision(ds.vx_count, trns, tsts, DK, prc_k)
    print("   DK - PRC: {:.4f}".format(prc))
    # NDK
    warnings.filterwarnings("ignore")
    NDK = kern.heat_diffusion_kernel(kern.symmetric_normalized_laplacian(A))
    auc = mtr.auc(ds.vx_count, trns, tsts, NDK, auc_sampl)
    # prc = mtr.precision(ds.vx_count, trns, tsts, NDK, prc_k)
    print("  NDK - AUC: {:.4f}".format(auc))
    # print("  NDK - PREC: {:.4f}".format(prc))
    # MEDK
    MEDK = kern.heat_diffusion_kernel(kern.mecl(A, l_max, v_max))
    auc = mtr.auc(ds.vx_count, trns, tsts, MEDK, auc_sampl)
    # prc = mtr.precision(ds.vx_count, trns, tsts, MEDK, prc_k)
    print(" MEDK - AUC: {:.4f}".format(auc))
    # print(" MEDK - PREC: {:.4f}".format(prc))
    # NMEDK
    NMEDK = kern.heat_diffusion_kernel(kern.mecl(A, l_max, v_max, type='sym'))
    auc = mtr.auc(ds.vx_count, trns, tsts, NMEDK, auc_sampl)
    # prc = mtr.precision(ds.vx_count, trns, tsts, NMEDK, prc_k)
    print("NMEDK - AUC: {:.4f}".format(auc))
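# Hedged sketch of the heat diffusion kernel family compared above, assuming
# the textbook definition K = exp(-beta * L); the kern.* module is not shown,
# so its beta handling and the MERW variants (kern.mecl) are outside this sketch.
import numpy as np
from scipy.linalg import expm
from scipy.sparse.csgraph import laplacian

def heat_kernel(A, beta=1.0, normed=False):
    # combinatorial Laplacian (DK) or symmetric normalized Laplacian (NDK)
    L = laplacian(A, normed=normed)
    L = np.asarray(L.todense()) if hasattr(L, 'todense') else np.asarray(L)
    return expm(-beta * L)  # dense matrix exponential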
# 0.99, 0.4, 0.01
try:
    with open("cascade.pkl", "rb") as f:
        cascade_classifier = pickle.load(f)
except Exception:  # original used a bare except; any load failure triggers retraining
    cascade_classifier = cascade.train_cascade(train_f, train_y, 0.99, 0.4, 0.01)
    with open("cascade.pkl", "wb") as f:
        pickle.dump(cascade_classifier, f, protocol=pickle.HIGHEST_PROTOCOL)

test_f, i_f = feature.get_features(test_x)
f_pred = classifier(test_f)
y_pred = classifier.predict(test_f)
print(metrics.tpr_fpr(test_y, y_pred))
print(metrics.auc(test_y, f_pred))

# Top 10 features
shape = (19, 19)
for i, (base, alpha) in enumerate(classifier):
    print(base.index)
    print("Feature {}: theta {:.2f}, alpha {:.2f}".format(
        i, base.theta, alpha))
    visulization.visualize_feature(shape, i_f[base.index], base.parity,
                                   save="feature_{}.png".format(i), show=False)

plt.figure()
for i in [1, 3, 5, 10]:
# compare score to a sliding box up to some width, up to the last decile
max_box_width = np.sort(seq_lengths)[-len(seq_lengths) // 10]
for box_width in range(max_box_width):  # xrange in the Python 2 original
    if (box_width % 10) == 0:
        # select only unmasked and comparable datapoints:
        # uncensored, or censored beyond box_width of the boundary
        m = ~np.isnan(y[:, :, 1])
        m[m] = (y[:, :, 1][m] == 1) | (box_width < y[:, :, 1][m])
        actual = y[:, :, 0][m].flatten() <= box_width
        pred = weibull.cmf(a=predicted[:, :, 0], b=predicted[:, :, 1],
                           t=box_width)[m].flatten()
        fpr, tpr, thresholds = metrics.roc_curve(actual, pred)
        auc = metrics.auc(fpr, tpr)
        print('auc: ', auc, ' sliding box ', box_width)
        aucs.append(auc)

plt.plot(aucs)
plt.ylabel('AUC')
plt.xlabel('box width')

## Esoteric plots
# Animate predicted churn.
# Those with alpha higher than at their last step are red; the red stream
# going to the right corner are the predicted churners.

#### Walk through the timeline and look at the embedding.
# by day
padded = tr.right_pad_to_left_pad(predicted)
events_tmp = tr.right_pad_to_left_pad(events)
# by day since signup
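# Hedged sketch of weibull.cmf as called above: the continuous Weibull CDF is
# F(t) = 1 - exp(-(t / a)^b); WTTE-RNN-style discrete-time code often uses
# (t + 1) in place of t. The actual weibull module is not shown, so this is
# an assumption about its semantics.
import numpy as np

def weibull_cmf(a, b, t):
    return 1.0 - np.exp(-np.power(np.asarray(t, dtype=float) / a, b))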
def train(model, train_loader, test_loader, criterion, optimizer, n_epochs,
          batches_per_epoch, model_name, batch_size):
    print('Start train')
    loss_plot = np.empty(int(batches_per_epoch * n_epochs))
    auc_plot = np.empty(int(batches_per_epoch * n_epochs))
    vloss_plot = np.zeros(int(batches_per_epoch * n_epochs))
    vauc_plot = np.zeros(int(batches_per_epoch * n_epochs))
    yhat_tosave = np.zeros((int(n_epochs), int(batches_per_epoch), batch_size))
    y_tosave = np.zeros((int(n_epochs), int(batches_per_epoch), batch_size))
    yhat_test_tosave = np.array([[]])
    y_test_tosave = np.array([[]])
    t0 = time.time()
    for epoch in range(n_epochs):
        cost = 0
        batch = 0
        for batch_ind, (x, y) in enumerate(train_loader):
            # x, y = x.to(device), y.to(device)
            # Train on batch
            optimizer.zero_grad()   # clear gradients
            z = model(x)            # make prediction
            loss = criterion(z, y)  # calculate loss
            loss.backward()         # calculate gradients
            optimizer.step()        # update parameters
            cost += loss.item()
            # Save AUCs and loss
            plot_index = int(batch + batches_per_epoch * epoch)
            metrics_text = ''
            if show_auc:
                y_np = y.detach().numpy()
                z_np = z.detach().numpy()
                sz_yhat, inter_yhat = met.split_yhat(y_np, z_np)
                a, _, _ = met.auc(sz_yhat, inter_yhat)
                auc_plot[plot_index] = a
                metrics_text += 'AUC: %.2g ' % a
            if show_loss:
                loss_plot[plot_index] = loss.item()
                metrics_text += 'loss: %.3g ' % loss.item()
            if save_forecasts:
                y_np = y.detach().numpy()
                yhat_np = z.detach().numpy()
                yhat_tosave[epoch, batch_ind, :y_np.shape[0]] = yhat_np.flatten()
                y_tosave[epoch, batch_ind, :y_np.shape[0]] = y_np.flatten()
            # print progress
            batch += 1
            t = time.time() - t0
            percent_done = batch / (batches_per_epoch * n_epochs) + epoch / n_epochs
            print('Epoch %d of %d, Batch %d of %d, %0.1f done, %0.2f of %0.2f seconds. '
                  % (epoch + 1, n_epochs, batch, batches_per_epoch,
                     percent_done * 100, t, t / percent_done) + metrics_text)
            # sys.stdout.write('\rBatch %d of %d, %0.1f done, %0.2f of %0.2f seconds. ' % (
            #     batch, batches_per_epoch, percent_done * 100, t, t / percent_done) + metrics_text)
        if plot_loss:
            val_loss = 0
            print('Calculating Test Loss')
            z_np_ = np.array([])
            y_np_ = np.array([])
            for x_, y_ in test_loader:
                z_ = model(x_)
                l = criterion(z_, y_)
                vloss_ind = int(batches_per_epoch * (epoch + 1)) - 1
                sz_, inter_ = met.split_yhat(y_.detach().numpy(), z_.detach().numpy())
                a, _, _ = met.auc(sz_, inter_)
                # bug fix: the original appended to the undefined name z_np / y_np
                z_np_ = np.append(z_np_, z_.detach().numpy().flatten())
                y_np_ = np.append(y_np_, y_.detach().numpy().flatten())
                vloss_plot[vloss_ind] = l.item()
                vauc_plot[vloss_ind] = a
            vis.loss_and_auc(loss_plot, auc_plot, vloss_plot, vauc_plot,
                             model_name, batches_per_epoch, n_epochs)
            if save_forecasts:
                if epoch > 0:
                    yhat_test_tosave = np.append(yhat_test_tosave,
                                                 np.reshape(z_np_, (1, z_np_.size)), axis=0)
                    y_test_tosave = np.append(y_test_tosave,
                                              np.reshape(y_np_, (1, y_np_.size)), axis=0)
                else:
                    yhat_test_tosave = np.append(yhat_test_tosave,
                                                 np.reshape(z_np_, (1, z_np_.size)), axis=1)
                    y_test_tosave = np.append(y_test_tosave,
                                              np.reshape(y_np_, (1, y_np_.size)), axis=1)
        print('--')
    if save_forecasts:
        np.save('/media/projects/daniel_lstm/forecasts_training/' + model_name + '_yhat',
                yhat_tosave)
        np.save('/media/projects/daniel_lstm/forecasts_training/' + model_name + '_y',
                y_tosave)
        np.save('/media/projects/daniel_lstm/forecasts_training/' + model_name + '_yhat_t',
                yhat_test_tosave)
        np.save('/media/projects/daniel_lstm/forecasts_training/' + model_name + '_y_t',
                y_test_tosave)
    return model