Пример #1
0
    def predict_proba_df(verif, want_edges):
        """
        Return task probabilities for the requested edges.

        Every wanted edge is handled in one of two ways:
            (1) an edge that belongs to the training set gets the
                cross-validated probability that is already stored for it.
            (2) any other edge is scored by ``verif.ensemble``, the ensemble
                of cross-validated evaluation classifiers.

        Args:
            verif: verifier exposing ``clf_key``, ``task_key``, ``data_key``,
                ``pblm`` and ``ensemble``.
            want_edges: the edges (id pairs) that need probabilities.

        Returns:
            The stored rows for the training edges followed by the ensemble
            predictions for the remaining edges, joined with ``pd.concat``.
            The rows are not re-sorted to follow ``want_edges``.
        """
        clf_key, task_key, data_key = (
            verif.clf_key, verif.task_key, verif.data_key)
        pblm = verif.pblm

        # Stored (pre-predicted) probabilities for intra-training-set edges
        cv_res = pblm.task_combo_res[task_key][clf_key][data_key]
        cv_probs = cv_res.probs_df

        # Sanity-check the stored edges before aligning against them
        train_uv = np.array(cv_probs.index.tolist())
        assert np.all(
            train_uv.T[0] < train_uv.T[1]
        ), 'edges must be in lower triangular form'
        assert len(train_uv) == len(
            vt.unique_row_indexes(train_uv)), 'edges must be unique'
        assert sorted(
            ut.emap(tuple, pblm.samples.aid_pairs.tolist())
        ) == sorted(ut.emap(tuple, train_uv.tolist()))
        want_uv = np.array(want_edges)

        # Split the wanted edges into those already scored ("have") and
        # those that still require a prediction ("need").
        want_rows, train_rows = vt.structure_rows(want_uv, train_uv)
        shared_rows = np.intersect1d(want_rows, train_rows)
        need_rows = np.setdiff1d(want_rows, shared_rows)
        # Select the shared rows through a mask over the training rows so
        # the "have" edges keep the order of the training set.
        is_shared = vt.flag_intersection(train_rows, shared_rows)
        have_rows = train_rows[is_shared]
        need_uv, have_uv = vt.unstructure_rows(need_rows, have_rows)

        # pandas lookups want plain tuples rather than array rows
        have_edges = [tuple(uv) for uv in have_uv.tolist()]
        need_edges = [tuple(uv) for uv in need_uv.tolist()]
        want_tuples = [tuple(uv) for uv in want_uv.tolist()]
        assert set(have_edges).isdisjoint(need_edges)
        assert set(have_edges).union(need_edges) == set(want_tuples)

        # Unseen edges are scored by the ensemble of evaluation classifiers
        logger.info('Predicting %s probabilities' % (task_key, ))
        eclf_probs = verif.ensemble.predict_proba_df(need_edges)

        # Stack the stored rows on top of the freshly predicted ones
        have_probs = cv_probs.loc[have_edges]
        assert (
            have_probs.index.intersection(eclf_probs.index).size == 0
        ), 'training (have) data was not disjoint from new (want) data '

        return pd.concat([have_probs, eclf_probs])
Пример #2
0
 def _print_previous_loop_statistics(infr, count):
     """
     Print summary statistics for the last ``count`` entries of
     ``infr.metrics_list``.

     The following are reported through ``infr.print``:
         * how many runs of consecutive entries had ``'recovering'`` set,
           and how many entries those runs contain in total
         * a histogram of the ``'test_action'`` values
         * a histogram of the ``'action'`` values (only when the
           ``'inference.enabled'`` parameter is truthy)
         * a histogram of the ``'pred_decision'`` values
         * a histogram of the ``'user_id'`` values

     Args:
         infr: object providing ``metrics_list``, ``params`` and ``print``.
         count (int): number of trailing metric entries to summarize. A
             non-positive count summarizes nothing.
     """
     # Print stats about what happened in this loop.
     # NOTE: ``metrics_list[-0:]`` is the *entire* list, so a loop that made
     # no iterations must be special-cased to get an empty history.
     history = infr.metrics_list[-count:] if count > 0 else []
     # Run-length encode the 'recovering' column and keep only the lengths
     # of the runs where it was True.
     recover_blocks = ut.group_items([
         (k, sum(1 for _ in g))
         for k, g in it.groupby(ut.take_column(history, 'recovering'))
     ]).get(True, [])
     infr.print((
         'Recovery mode entered {} times, '
         'made {} recovery decisions.').format(
             len(recover_blocks), sum(recover_blocks)), color='green')
     testaction_hist = ut.dict_hist(ut.take_column(history, 'test_action'))
     infr.print(
         'Test Action Histogram: {}'.format(
             ut.repr4(testaction_hist, si=True)), color='yellow')
     if infr.params['inference.enabled']:
         # Each action is converted to a frozenset so it can be used as a
         # histogram key.
         action_hist = ut.dict_hist(
             ut.emap(frozenset, ut.take_column(history, 'action')))
         infr.print(
             'Inference Action Histogram: {}'.format(
                 ub.repr2(action_hist, si=True)), color='yellow')
     infr.print(
         'Decision Histogram: {}'.format(ut.repr2(ut.dict_hist(
             ut.take_column(history, 'pred_decision')
         ), si=True)), color='yellow')
     infr.print(
         'User Histogram: {}'.format(ut.repr2(ut.dict_hist(
             ut.take_column(history, 'user_id')
         ), si=True)), color='yellow')
Пример #3
0
    def find_opt_ratio(pblm):
        """
        Script that searches for a good value of the ratio threshold.

        Samples are loaded and every sample pair is matched with
        ``ratio_thresh=1.0`` and ``sv_on=False``. For each of 100 evenly
        spaced thresholds in [0, 1], a pair is scored by the sum of its
        ``'ratio'`` local measures that fall below the threshold, and the
        ROC AUC of those scores against the positive ``match_state`` labels
        is computed. The AUC curve is plotted and its maximum is marked.

            >>> from wbia.algo.verif.vsone import *  # NOQA
            >>> pblm = OneVsOneProblem.from_empty('PZ_PB_RF_TRAIN')
            >>> pblm = OneVsOneProblem.from_empty('GZ_Master1')
        """
        pblm.load_samples()
        infr = pblm.infr
        samples = pblm.samples
        edges = ut.emap(tuple, samples.aid_pairs.tolist())
        task = samples['match_state']
        pos_idx = task.class_names.tolist().index(POSTV)

        # Match with the loosest ratio threshold so that any stricter
        # threshold can be emulated below by filtering the ratio values.
        matches = infr._exec_pairwise_match(
            edges, {'ratio_thresh': 1.0, 'sv_on': False})

        import wbia.plottool as pt
        import sklearn.metrics

        pt.qtensure()
        thresholds = np.linspace(0, 1.0, 100)
        pos_truth = task.y_bin.T[pos_idx]
        ratio_fs = [match.local_measures['ratio'] for match in matches]

        def _auc_at(thresh):
            # Score each pair by the ratios that survive this threshold
            scores = np.array([fs[fs < thresh].sum() for fs in ratio_fs])
            return sklearn.metrics.roc_auc_score(pos_truth, scores)

        # Sweep the candidate thresholds over the fixed correspondences
        aucs = np.array([
            _auc_at(thresh)
            for thresh in ut.ProgIter(thresholds, 'computing thresh')
        ])
        opt_auc = aucs.max()
        opt_thresh = thresholds[aucs.argmax()]

        # Draw the AUC curve and highlight the best threshold
        pt.plt.plot(thresholds, aucs, 'r-', label='')
        pt.plt.plot(
            opt_thresh, opt_auc, 'ro', label='L opt=%r' % (opt_thresh, ))
        pt.set_ylabel('auc')
        pt.set_xlabel('ratio threshold')
        pt.legend()
Пример #4
0
    # Record the image paths that each figure includes under the current
    # command. The loop variable must not be rebound: rebinding it to
    # ``figures[0]`` made every iteration re-read the first figure, so the
    # images of all other figures were never recorded.
    for fig in figures:
        # NOTE(review): ``text`` is not used in the visible code; it is kept
        # in case later parts of the script inspect it -- confirm.
        text = fig.summary_str(outline=True, numlines=float('inf'))
        fpaths = [info['fpath'] for info in fig.parse_includegraphics()]
        if fpaths:
            cmd_to_fpaths[cmd].extend(fpaths)


# Print every recorded command whose name (leading backslashes stripped)
# has no matching descendant type under ``root``.
for key in cmd_to_fpaths:
    cmd = key.lstrip('\\')
    if root.find_descendant_type(cmd):
        continue
    print(key)

from os.path import abspath, dirname

# Absolute paths of all images referenced by some command
used_fpaths = ut.flatten(cmd_to_fpaths.values())
used_fpaths = {abspath(fpath) for fpath in used_fpaths}

# Absolute paths of all png / jpg images found below the current directory
all_fpaths = {
    abspath(fpath)
    for fpath in ut.glob('.', ['*.png', '*.jpg'], recursive=True)
}

# Images on disk that no command references
unused = list(all_fpaths - used_fpaths)

# Group the unused images by the directory that contains them
unuse_dirs = ut.group_items(unused, [dirname(fpath) for fpath in unused])


semi_used = {}
for dpath, fpaths in unuse_dirs.items():
    used_in_dpath = set(ut.ls(dpath)) - set(fpaths)
    if len(used_in_dpath) == 0:
        # completely unused directories
        print(dpath)
    else: