def predict_proba_df(verif, want_edges):
    """
    Return task probabilities for the requested edges.

    Probabilities come from one of two sources:

    (1) an edge that was part of the training set gets the cross-validated
        probability already stored in the problem's combined results.
    (2) an edge that was not part of the training set is scored by
        ``verif.ensemble`` (the average prediction over all cross-validated
        classifiers).

    Args:
        want_edges: sequence of ``(u, v)`` pairs to look up / predict.

    Returns:
        The ``pd.concat`` of the stored rows for the already-known edges
        followed by the ensemble predictions for the unseen edges.
    """

    def _as_tuples(uv_array):
        # pandas lookups below need plain tuples rather than array rows
        return ut.emap(tuple, uv_array.tolist())

    clf_key = verif.clf_key
    task_key = verif.task_key
    data_key = verif.data_key
    pblm = verif.pblm

    # Pre-predicted probabilities for edges inside the training set
    res = pblm.task_combo_res[task_key][clf_key][data_key]

    # Sanity-check the edges that index the stored results: each pair must
    # satisfy u < v, be unique, and match the problem's sample pairs.
    train_uv = np.array(res.probs_df.index.tolist())
    assert np.all(train_uv.T[0] < train_uv.T[1]), (
        'edges must be in lower triangular form')
    assert len(vt.unique_row_indexes(train_uv)) == len(train_uv), (
        'edges must be unique')
    assert sorted(_as_tuples(train_uv)) == sorted(
        _as_tuples(pblm.samples.aid_pairs))

    want_uv = np.array(want_edges)

    # Split the wanted edges into those we already have and those we need.
    # The set operations run on the row-structured views of the arrays.
    want_rows, train_rows = vt.structure_rows(want_uv, train_uv)
    shared_rows = np.intersect1d(want_rows, train_rows)
    missing_rows = np.setdiff1d(want_rows, shared_rows)
    # Select the shared rows out of the training rows (rather than using
    # the unordered intersection) so their order agrees with train_uv.
    is_shared = vt.flag_intersection(train_rows, shared_rows)
    need_uv, have_uv = vt.unstructure_rows(missing_rows, train_rows[is_shared])

    have_edges = _as_tuples(have_uv)
    need_edges = _as_tuples(need_uv)
    want_edges = _as_tuples(want_uv)
    assert set(have_edges).isdisjoint(need_edges)
    assert set(have_edges).union(need_edges) == set(want_edges)

    # Unseen edges are scored by the ensemble of evaluation classifiers
    message = 'Predicting %s probabilities' % (task_key, )
    logger.info(message)
    eclf_probs = verif.ensemble.predict_proba_df(need_edges)

    # Stored probabilities first, then the freshly predicted ones
    have_probs = res.probs_df.loc[have_edges]
    assert have_probs.index.intersection(eclf_probs.index).size == 0, (
        'training (have) data was not disjoint from new (want) data ')
    return pd.concat([have_probs, eclf_probs])
def _print_previous_loop_statistics(infr, count):
    """
    Print statistics about what happened in the most recent loop.

    Summarizes the last ``count`` entries of ``infr.metrics_list`` and
    reports, via ``infr.print``:

    * how many times recovery mode was entered and how many decisions were
      made while recovering,
    * a histogram of the ``'test_action'`` column,
    * a histogram of the ``'action'`` column (only when
      ``infr.params['inference.enabled']`` is truthy),
    * histograms of the ``'pred_decision'`` and ``'user_id'`` columns.

    Args:
        count (int): number of trailing history entries to summarize.
            A count of zero summarizes nothing.
    """
    # NOTE: ``seq[-0:]`` is ``seq[0:]`` -- the *whole* sequence -- so a zero
    # count must be special-cased to produce an empty window.
    if count == 0:
        history = infr.metrics_list[:0]
    else:
        history = infr.metrics_list[-count:]

    def _column_hist(colname):
        # Histogram of the values found under ``colname`` in the window
        return ut.dict_hist(ut.take_column(history, colname))

    # Collapse the 'recovering' column into (flag, run-length) pairs; the
    # runs grouped under True are the blocks spent in recovery mode.
    runs = [
        (flag, len(list(group)))
        for flag, group in it.groupby(ut.take_column(history, 'recovering'))
    ]
    recover_blocks = ut.group_items(runs).get(True, [])
    infr.print(
        ('Recovery mode entered {} times, '
         'made {} recovery decisions.').format(
             len(recover_blocks), sum(recover_blocks)),
        color='green')

    testaction_hist = _column_hist('test_action')
    infr.print(
        'Test Action Histogram: {}'.format(
            ut.repr4(testaction_hist, si=True)),
        color='yellow')

    if infr.params['inference.enabled']:
        # Entries of the 'action' column are converted to frozensets so
        # they can serve as histogram keys.
        action_hist = ut.dict_hist(
            ut.emap(frozenset, ut.take_column(history, 'action')))
        infr.print(
            'Inference Action Histogram: {}'.format(
                ub.repr2(action_hist, si=True)),
            color='yellow')

    infr.print(
        'Decision Histogram: {}'.format(
            ut.repr2(_column_hist('pred_decision'), si=True)),
        color='yellow')
    infr.print(
        'User Histogram: {}'.format(
            ut.repr2(_column_hist('user_id'), si=True)),
        color='yellow')
def find_opt_ratio(pblm):
    """
    Script to help find the correct value for the ratio threshold.

    Loads the problem's samples, runs pairwise matching on every sample
    pair with ``ratio_thresh=1.0`` and ``sv_on=False``, then sweeps 100
    evenly spaced thresholds over [0, 1].  For each threshold every match
    is scored by the sum of its ``'ratio'`` local measures that fall below
    the threshold, and the ROC AUC of those scores against the ``POSTV``
    column of the ``'match_state'`` task is computed.  The AUC curve and
    its maximum are plotted.

    >>> from wbia.algo.verif.vsone import *  # NOQA
    >>> pblm = OneVsOneProblem.from_empty('PZ_PB_RF_TRAIN')
    >>> pblm = OneVsOneProblem.from_empty('GZ_Master1')
    """
    pblm.load_samples()
    infr = pblm.infr
    aid_pairs = ut.emap(tuple, pblm.samples.aid_pairs.tolist())
    task = pblm.samples['match_state']
    pos_idx = task.class_names.tolist().index(POSTV)

    match_config = {'ratio_thresh': 1.0, 'sv_on': False}
    matches = infr._exec_pairwise_match(aid_pairs, match_config)

    import wbia.plottool as pt
    import sklearn.metrics
    pt.qtensure()

    thresholds = np.linspace(0, 1.0, 100)
    pos_truth = task.y_bin.T[pos_idx]
    ratio_fs = [match.local_measures['ratio'] for match in matches]

    def _auc_at(thresh):
        # Score each match using only the correspondences under ``thresh``
        scores = np.array([fs[fs < thresh].sum() for fs in ratio_fs])
        return sklearn.metrics.roc_auc_score(pos_truth, scores)

    # Given the current correspondences: find the optimal
    # correspondence threshold.
    aucs = np.array([
        _auc_at(thresh)
        for thresh in ut.ProgIter(thresholds, 'computing thresh')
    ])
    best_idx = aucs.argmax()
    opt_auc = aucs.max()
    opt_thresh = thresholds[best_idx]

    # AUC as a function of the threshold, with the optimum marked
    pt.plt.plot(thresholds, aucs, 'r-', label='')
    pt.plt.plot(opt_thresh, opt_auc, 'ro',
                label='L opt=%r' % (opt_thresh, ))
    pt.set_ylabel('auc')
    pt.set_xlabel('ratio threshold')
    pt.legend()
for fig in figures: fig = figures[0] text = fig.summary_str(outline=True, numlines=float('inf')) fpaths = [info['fpath'] for info in fig.parse_includegraphics()] if fpaths: cmd_to_fpaths[cmd].extend(fpaths) for key in cmd_to_fpaths.keys(): cmd = key.lstrip('\\') if not root.find_descendant_type(cmd): print(key) from os.path import abspath, dirname used_fpaths = ut.flatten(cmd_to_fpaths.values()) used_fpaths = set(ut.emap(abspath, used_fpaths)) all_fpaths = set(ut.emap(abspath, ut.glob('.', ['*.png', '*.jpg'], recursive=True))) unused = list(all_fpaths - used_fpaths) unuse_dirs = ut.group_items(unused, ut.emap(dirname, unused)) semi_used = {} for dpath, fpaths in unuse_dirs.items(): used_in_dpath = set(ut.ls(dpath)) - set(fpaths) if len(used_in_dpath) == 0: # completely unused directories print(dpath) else: