Example #1
 def _print_previous_loop_statistics(infr, count):
     # Print stats about what happened in this loop
     history = infr.metrics_list[-count:]
     recover_blocks = ub.group_items(*zip(*[
         (sum(1 for i in g), k)
         for k, g in it.groupby(util.take_column(history, 'recovering'))
     ])).get(True, [])
     infr.print(
         ('Recovery mode entered {} times, '
          'made {} recovery decisions.').format(len(recover_blocks),
                                                sum(recover_blocks)),
         color='green')
     testaction_hist = ub.dict_hist(util.take_column(
         history, 'test_action'))
     infr.print('Test Action Histogram: {}'.format(
         ub.repr2(testaction_hist, si=True)),
                color='yellow')
     if infr.params['inference.enabled']:
         action_hist = ub.dict_hist(
             util.emap(frozenset, util.take_column(history, 'action')))
         infr.print('Inference Action Histogram: {}'.format(
             ub.repr2(action_hist, si=True)),
                    color='yellow')
     infr.print('Decision Histogram: {}'.format(
         ub.repr2(ub.dict_hist(util.take_column(history, 'pred_decision')),
                  si=True)),
                color='yellow')
     infr.print('User Histogram: {}'.format(
         ub.repr2(ub.dict_hist(util.take_column(history, 'user_id')),
                  si=True)),
                color='yellow')
Example #2
def test_dict_hist_ordered():
    import random
    import string
    import ubelt as ub
    rng = random.Random(0)
    items = [rng.choice(string.ascii_letters) for _ in range(100)]
    # Ensure that the ordered=True bug is fixed
    a = sorted(ub.dict_hist(items, ordered=True).items())
    b = sorted(ub.dict_hist(items, ordered=False).items())
    assert a == b
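
For reference, a minimal sketch (assuming a recent ubelt) of what the ordered flag changes: the counts are identical either way, only the key order of the returned dict may differ.

import ubelt as ub
items = ['a', 'b', 'a', 'c', 'a', 'b']
hist = ub.dict_hist(items)                        # counts in first-seen order, e.g. {'a': 3, 'b': 2, 'c': 1}
hist_ordered = ub.dict_hist(items, ordered=True)  # same counts, keys ordered by frequency
assert dict(hist) == dict(hist_ordered)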
Example #3
def generate_phase1_data_tables():
    cfg = viame_wrangler.config.WrangleConfig({
        'annots':
        ub.truepath(
            '~/data/viame-challenge-2018/phase1-annotations/*/*coarse*bbox-keypoint*.json'
        )
    })

    all_stats = {}

    annots = cfg.annots
    fpaths = list(glob.glob(annots))
    print('fpaths = {}'.format(ub.repr2(fpaths)))
    for fpath in fpaths:
        dset_name = os.path.basename(fpath).split('-')[0]
        dset = CocoDataset(fpath, img_root=cfg.img_root, tag=dset_name)

        assert not dset.missing_images()
        assert not dset._find_bad_annotations()
        assert all([
            img['has_annots'] in [True, False, None]
            for img in dset.imgs.values()
        ])

        print(ub.dict_hist([g['has_annots'] for g in dset.imgs.values()]))

        stats = {}
        stats.update(ub.dict_subset(dset.basic_stats(), ['n_anns', 'n_imgs']))

        roi_shapes_hist = dict()
        populated_cats = dict()
        for name, item in dset.category_annotation_type_frequency().items():
            if item:
                populated_cats[name] = sum(item.values())
                for k, v in item.items():
                    roi_shapes_hist[k] = roi_shapes_hist.get(k, 0) + v

        stats['n_cats'] = populated_cats
        stats['n_roi_shapes'] = roi_shapes_hist
        stats['n_imgs_with_annots'] = ub.map_keys(
            {
                None: 'unsure',
                True: 'has_objects',
                False: 'no_objects'
            }, ub.dict_hist([g['has_annots'] for g in dset.imgs.values()]))
        all_stats[dset_name] = stats

    print(ub.repr2(all_stats, nl=3, sk=1))
Example #4
def regenerate_phase1_flavors():
    """
    Assumes original data is in a good format
    """
    cfg = viame_wrangler.config.WrangleConfig({
        'annots':
        ub.truepath(
            '~/data/viame-challenge-2018/phase1-annotations/*/original_*.json')
    })

    annots = cfg.annots
    fpaths = list(glob.glob(annots))
    print('Reading raw mscoco files')
    for fpath in fpaths:
        print('reading fpath = {!r}'.format(fpath))
        dset_name = os.path.basename(fpath).replace('original_',
                                                    '').split('.')[0]
        orig_dset = CocoDataset(fpath, img_root=cfg.img_root, tag=dset_name)
        dpath = os.path.dirname(fpath)

        assert not orig_dset.missing_images()
        assert not orig_dset._find_bad_annotations()
        assert all([
            img['has_annots'] in [True, False, None]
            for img in orig_dset.imgs.values()
        ])
        print(ub.dict_hist([g['has_annots'] for g in orig_dset.imgs.values()]))

        make_dataset_flavors(orig_dset, dpath, dset_name)
Example #5
 def verbose_dump(dset, fpath):
     print('Dumping {}'.format(fpath))
     if False:
         print(ub.repr2(dset.category_annotation_type_frequency(), nl=1, sk=1))
     print(ub.dict_hist([img['has_annots'] for img in dset.imgs.values()]))
     print(ub.repr2(dset.basic_stats()))
     dset.dump(fpath)
Example #6
    def from_data(xpu, item, **kwargs):
        """
        Creates an XPU to represent the processing device a Tensor or Variable
        is on

        Example:
            >>> xpu = XPU.from_data(torch.randn(3))
            >>> assert not xpu.is_gpu()
            >>> if torch.cuda.is_available():
            >>>     xpu = XPU.from_data(torch.randn(3).cuda())
            >>>     assert xpu.is_gpu()
            >>>     for i in range(torch.cuda.device_count()):
            >>>         xpu = XPU.from_data(torch.randn(3).cuda(i))
            >>>         assert xpu.is_gpu()
            >>>         assert xpu.main_device == i
        """
        if hasattr(item, 'is_cuda'):
            if item.is_cuda:
                return XPU(item.get_device())
            else:
                return XPU(None)
        elif hasattr(item, 'state_dict'):
            state_dict = item.state_dict()
            hist = ub.dict_hist(v.get_device() if v.is_cuda else None
                                for v in state_dict.values())
            device_num = ub.argsort(hist)[-1]
            return XPU(device_num)
        else:
            raise TypeError(type(item))
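
The state_dict branch above relies on a small idiom: histogram the per-parameter devices with dict_hist and take the key with the largest count via ub.argsort. A hedged, self-contained sketch of that idiom (the device ids below are hypothetical):

import ubelt as ub
device_ids = [None, 0, 0, 1, 0]     # hypothetical: one entry per state_dict tensor
hist = ub.dict_hist(device_ids)     # {None: 1, 0: 3, 1: 1}
most_common = ub.argsort(hist)[-1]  # keys sorted by count; the last one is the mode -> 0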
Example #7
        def _generate_abs(n):
            # Randomly choose images to generate boxes for
            chosen_gids = np.array(sorted(rng.choice(all_gids, size=n)))
            gid_to_nboxes = ub.dict_hist(chosen_gids)

            neg_gids = []
            neg_boxes = []
            for gid, nboxes in gid_to_nboxes.items():
                qtree = self.qtrees[gid]
                scale = (qtree.width, qtree.height)
                anchors_ = np.array([window_size]) / np.array(scale)
                if np.any(anchors_ > 1.0):
                    continue
                img_boxes = kwimage.Boxes.random(num=nboxes,
                                                 scale=1.0,
                                                 format='tlbr',
                                                 anchors=anchors_,
                                                 anchor_std=0,
                                                 rng=rng)
                img_boxes = img_boxes.scale(scale)
                for box in img_boxes:
                    # isect_aids, overlaps = self.ious(gid, box)
                    isect_aids, overlaps = self.iooas(gid, box)
                    if len(overlaps) == 0 or overlaps.max() < thresh:
                        neg_gids.append(gid)
                        neg_boxes.append(box.data)
            return neg_gids, neg_boxes
Example #8
File: buildapc.py Project: Erotemic/misc
def motherboard_info():
    """
    REQUIRES SUDO

    xdoctest -m ~/misc/notes/buildapc.py motherboard_info
    """
    import re
    info = ub.cmd('sudo dmidecode -t 9')
    pcie_slots = []
    chunks = info['out'].split('\n\n')
    for chunk in chunks:
        item = {}
        for line in chunk.split('\n'):
            # doesn't get all data correctly (e.g. characteristics)
            parts = re.split('\t*:', line, maxsplit=1)
            if len(parts) == 2:
                key, val = parts
                key = key.strip()
                val = val.strip()
                if key in item:
                    raise KeyError(f'key={key} already exists')
                item[key] = val
        if item:
            item = ub.map_keys(slugify_key, item)
            pcie_slots.append(item)

    pcie_usage = ub.dict_hist(item['current_usage'] for item in pcie_slots)

    _varied = varied_values(pcie_slots, min_variations=0)
    _varied = ub.map_keys(slugify_key, _varied)
    unvaried = {k: ub.peek(v) for k, v in _varied.items() if len(v) == 1}
    varied = {k: v for k, v in _varied.items() if len(v) > 1}

    print(info['out'])
Example #9
def graph_info(graph, ignore=None, stats=False, verbose=False):
    from graphid import util
    import pandas as pd

    node_dict = graph.nodes
    node_attrs = list(node_dict.values())
    edge_attrs = list(take_column(graph.edges(data=True), 2))

    if stats:
        node_df = pd.DataFrame(node_attrs)
        edge_df = pd.DataFrame(edge_attrs)
        if ignore is not None:
            util.delete_dict_keys(node_df, ignore)
            util.delete_dict_keys(edge_df, ignore)
        # Not really histograms anymore
        try:
            node_attr_hist = node_df.describe().to_dict()
        except ValueError:
            node_attr_hist = {}
        try:
            edge_attr_hist = edge_df.describe().to_dict()
        except ValueError:
            edge_attr_hist = {}
        key_order = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
        node_attr_hist = ub.map_dict_vals(lambda x: util.order_dict_by(x, key_order), node_attr_hist)
        edge_attr_hist = ub.map_dict_vals(lambda x: util.order_dict_by(x, key_order), edge_attr_hist)
    else:
        node_attr_hist = ub.dict_hist(ub.flatten([attr.keys() for attr in node_attrs]))
        edge_attr_hist = ub.dict_hist(ub.flatten([attr.keys() for attr in edge_attrs]))
        if ignore is not None:
            util.delete_dict_keys(edge_attr_hist, ignore)
            util.delete_dict_keys(node_attr_hist, ignore)
    node_type_hist = ub.dict_hist(list(map(type, graph.nodes())))
    info_dict = ub.odict([
        ('directed', graph.is_directed()),
        ('multi', graph.is_multigraph()),
        ('num_nodes', len(graph)),
        ('num_edges', len(list(graph.edges()))),
        ('edge_attr_hist', util.sort_dict(edge_attr_hist)),
        ('node_attr_hist', util.sort_dict(node_attr_hist)),
        ('node_type_hist', util.sort_dict(node_type_hist)),
        ('graph_attrs', graph.graph),
        ('graph_name', graph.name),
    ])
    if verbose:
        print(ub.repr2(info_dict))
    return info_dict
Example #10
 def _balance_report(self, limit=None):
     # Print the epoch / item label frequency per epoch
     label_sequence = []
     index_sequence = []
     if limit is None:
         limit = self.num_batches
     for item_indices, _ in zip(self, range(limit)):
         item_indices = np.array(item_indices)
         item_labels = list(ub.take(self.index_to_label, item_indices))
         index_sequence.extend(item_indices)
         label_sequence.extend(ub.unique(item_labels))
     label_hist = ub.dict_hist(label_sequence)
     index_hist = ub.dict_hist(index_sequence)
     label_hist = ub.sorted_vals(label_hist, reverse=True)
     index_hist = ub.sorted_vals(index_hist, reverse=True)
     index_hist = ub.dict_subset(index_hist, list(index_hist.keys())[0:5])
     print('label_hist = {}'.format(ub.repr2(label_hist, nl=1)))
     print('index_hist = {}'.format(ub.repr2(index_hist, nl=1)))
Example #11
 def sample(self, *shape):
     """
     Sampling from a mixture of k distributions with weights w_k is
     equivalent to picking a distribution with probability w_k, and then
     sampling from the picked distribution.
     """
     # Choose which distributions are picked for each sample
     idxs = self._idx_choice.sample(*shape)
     idx_to_nsamples = ub.dict_hist(idxs.ravel())
     out = np.zeros(*shape)
     for idx, n in idx_to_nsamples.items():
         # Sample the from the distribution we picked
         mask = (idx == idxs)
         subsample = self.pdfs[idx].sample(n)
         out[mask] = subsample
     return out
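
A minimal standalone sketch of the same two-stage mixture-sampling pattern described in the docstring, assuming two hypothetical numpy normal components with weights 0.3 and 0.7; dict_hist is used so each chosen component is sampled once, in bulk:

import numpy as np
import ubelt as ub

rng = np.random.RandomState(0)
weights = [0.3, 0.7]
components = [lambda n: rng.normal(0.0, 1.0, n),   # component 0
              lambda n: rng.normal(5.0, 2.0, n)]   # component 1

# Stage 1: pick a component for every sample with probability w_k
idxs = rng.choice(len(weights), size=1000, p=weights)
# Stage 2: count how many samples each component owes, then sample in bulk
out = np.empty(len(idxs))
for idx, n in ub.dict_hist(idxs).items():
    out[idxs == idx] = components[idx](n)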
Example #12
    def category_annotation_type_frequency(self):
        """
        Reports the number of annotations of each type for each category

        Example:
            >>> dataset = demo_coco_data()
            >>> self = CocoDataset(dataset, tag='demo')
            >>> hist = self.category_annotation_type_frequency()
            >>> print(ub.repr2(hist))
        """
        catname_to_nannot_types = {}
        for cid, aids in self.cid_to_aids.items():
            name = self.cats[cid]['name']
            hist = ub.dict_hist(map(annot_type, ub.take(self.anns, aids)))
            catname_to_nannot_types[name] = ub.map_keys(
                lambda k: k[0] if len(k) == 1 else k, hist)
        return catname_to_nannot_types
Example #13
    def __getitem__(self, index):
        # Choose a label for each item in the batch
        if not hasattr(self.rng, 'choices'):
            # python 3.5 support
            chosen_labels = [
                self.rng.choice(self.labels) for _ in range(self.batch_size)
            ]
        else:
            chosen_labels = self.rng.choices(self.labels, k=self.batch_size)
        # Count the number of items we need for each label
        label_freq = ub.dict_hist(chosen_labels)

        # Sample those indices
        batch_idxs = list(
            ub.flatten([
                self.label_to_subsampler[label].sample(num)
                for label, num in label_freq.items()
            ]))
        return batch_idxs
Example #14
    def make_optimizer(hyper, named_parameters):
        """
        Instantiate the optimizer defined by the hyperparams

        Contains special logic to create param groups

        Example:
            >>> import netharn as nh
            >>> config = {'optimizer': 'sgd', 'params': [
            >>>     {'lr': 3e-3, 'params': '.*\\.bias'},
            >>>     {'lr': 1e-3, 'params': '.*\\.weight'},
            >>>     #{'lr': 100, 'params': '.*\\.doesnotmatch'},
            >>> ]}
            >>> optim_ = nh.api.Optimizer.coerce(config)
            >>> hyper = nh.HyperParams(optimizer=optim_)
            >>> model = nh.models.ToyNet1d()
            >>> named_parameters = list(model.named_parameters())
            >>> optimizer = hyper.make_optimizer(named_parameters)
            >>> print('optimizer = {!r}'.format(optimizer))
        """
        if hyper._optimizer_info['instance'] is not None:
            return hyper._optimizer_info['instance']
        # What happens if we want to group parameters
        optim_kw = hyper.optimizer_params.copy()
        params = optim_kw.pop('params', None)
        if params is None:
            param_groups = [p for (name, p) in named_parameters]
        else:
            import re
            named_parameters = list(named_parameters)
            name_to_param = dict(named_parameters)
            param_groups = []
            if isinstance(params, dict):
                # remember the group key
                groups = [{'key': k, **g} for k, g in params.items()]
            if isinstance(params, list):
                groups = params

            PREVENT_DUPLICATES = 1

            seen_ = set()
            for group in groups:
                # Transform param grouping specifications into real params
                group = group.copy()
                spec = group.pop('params')
                if isinstance(spec, list):
                    if len(spec):
                        first = ub.peek(spec)
                        if isinstance(first, str):
                            real_params = [name_to_param[k] for k in spec]
                        elif isinstance(first, torch.nn.Parameter):
                            real_params = spec
                        else:
                            raise TypeError(type(first))
                    else:
                        real_params = []

                # Python 3.6 doesn't have re.Pattern
                elif isinstance(spec, str) or hasattr(spec, 'match'):
                    if hasattr(spec, 'match'):
                        pat = spec
                    else:
                        pat = re.compile(spec)
                    real_params = [
                        p for name, p in name_to_param.items()
                        if pat.match(name)
                    ]
                else:
                    raise TypeError(type(spec))

                if PREVENT_DUPLICATES:
                    # give priority to earlier params
                    # This is Python 3.6+ only
                    real_params = list(ub.oset(real_params) - seen_)
                    seen_.update(real_params)

                group['params'] = real_params
                param_groups.append(group)

            CHECK = 1
            if CHECK:
                # Determine if we are using the same param more than once
                # or if we are not using a param at all.
                # NOTE: torch does do a duplicate check.
                param_group_ids = []
                for group in param_groups:
                    ids = list(map(id, group['params']))
                    param_group_ids.append(ids)

                all_param_ids = [id(p) for n, p in named_parameters]
                flat_ids = list(ub.flatten(param_group_ids))
                freq = ub.dict_hist(flat_ids, labels=all_param_ids)
                num_unused = sum(v == 0 for v in freq.values())
                num_dups = sum(v > 1 for v in freq.values())
                if num_unused:
                    warnings.warn(
                        'There are {} unused params'.format(num_unused))
                if num_dups:
                    warnings.warn(
                        'There are {} duplicate params'.format(num_dups))

        optimizer = hyper.optimizer_cls(param_groups, **optim_kw)
        return optimizer
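
The CHECK block above uses the labels argument of dict_hist so that parameters that never appear in any group still show up with a count of zero. A hedged sketch of that check in isolation (the toy ids are hypothetical):

import ubelt as ub
all_ids  = [1, 2, 3, 4]   # every parameter id we expect to see
used_ids = [1, 2, 2]      # ids actually placed into param groups
freq = ub.dict_hist(used_ids, labels=all_ids)         # {1: 1, 2: 2, 3: 0, 4: 0}
unused     = [k for k, v in freq.items() if v == 0]   # [3, 4]
duplicated = [k for k, v in freq.items() if v > 1]    # [2]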
Example #15
def convert_camvid_raw_to_coco(camvid_raw_info):
    """
    Converts the raw camvid format to an MSCOCO based format (which lets us
    use kwcoco's COCO backend).

    Example:
        >>> # xdoctest: +REQUIRES(--download)
        >>> camvid_raw_info = grab_raw_camvid()
        >>> # test with a reduced set of data
        >>> del camvid_raw_info['img_paths'][2:]
        >>> del camvid_raw_info['mask_paths'][2:]
        >>> dset = convert_camvid_raw_to_coco(camvid_raw_info)
        >>> # xdoctest: +REQUIRES(--show)
        >>> import kwplot
        >>> plt = kwplot.autoplt()
        >>> kwplot.figure(fnum=1, pnum=(1, 2, 1))
        >>> dset.show_image(gid=1)
        >>> kwplot.figure(fnum=1, pnum=(1, 2, 2))
        >>> dset.show_image(gid=2)
    """
    import re
    import kwimage
    import kwcoco
    print('Converting CamVid to MS-COCO format')

    dset_root, img_paths, label_path, mask_paths = ub.take(
        camvid_raw_info,
        'dset_root, img_paths, label_path, mask_paths'.split(', '))

    img_infos = {
        'img_fname': img_paths,
        'mask_fname': mask_paths,
    }
    keys = list(img_infos.keys())
    next_vals = list(zip(*img_infos.values()))
    image_items = [{k: v for k, v in zip(keys, vals)} for vals in next_vals]

    dataset = {
        'img_root': dset_root,
        'images': [],
        'categories': [],
        'annotations': [],
    }

    lines = ub.readfrom(label_path).split('\n')
    lines = [line for line in lines if line]
    for line in lines:
        color_text, name = re.split('\t+', line)
        r, g, b = map(int, color_text.split(' '))
        color = (r, g, b)

        # Parse the special camvid format
        cid = (r << 16) + (g << 8) + (b << 0)
        cat = {
            'id': cid,
            'name': name,
            'color': color,
        }
        dataset['categories'].append(cat)

    for gid, img_item in enumerate(image_items, start=1):
        img = {
            'id': gid,
            'file_name': img_item['img_fname'],
            # nonstandard image field
            'segmentation': img_item['mask_fname'],
        }
        dataset['images'].append(img)

    dset = kwcoco.CocoDataset(dataset)
    dset.rename_categories({'Void': 'background'})

    assert dset.name_to_cat['background']['id'] == 0
    dset.name_to_cat['background'].setdefault('alias', []).append('Void')

    if False:
        _define_camvid_class_hierarcy(dset)

    if 1:
        # TODO: Binarize CCs (and efficiently encode if possible)
        import numpy as np

        bad_info = []
        once = False

        # Add images
        dset.remove_annotations(list(dset.index.anns.keys()))
        for gid, img in ub.ProgIter(dset.imgs.items(),
                                    desc='parse label masks'):
            mask_fpath = join(dset_root, img['segmentation'])

            rgb_mask = kwimage.imread(mask_fpath, space='rgb')
            r, g, b = rgb_mask.T.astype(np.int64)
            cid_mask = np.ascontiguousarray(rgb_to_cid(r, g, b).T)

            cids = set(np.unique(cid_mask)) - {0}

            for cid in cids:
                if cid not in dset.cats:
                    if gid == 618:
                        # Handle a known issue with image 618
                        c_mask = (cid == cid_mask).astype(np.uint8)
                        total_bad = c_mask.sum()
                        if total_bad < 32:
                            if not once:
                                print(
                                    'gid 618 has a few known bad pixels, ignoring them'
                                )
                                once = True
                            continue
                        else:
                            raise Exception('more bad pixels than expected')
                    else:
                        raise Exception(
                            'UNKNOWN cid = {!r} in gid={!r}'.format(cid, gid))

                    # bad_rgb = cid_to_rgb(cid)
                    # print('bad_rgb = {!r}'.format(bad_rgb))
                    # print('WARNING UNKNOWN cid = {!r} in gid={!r}'.format(cid, gid))
                    # bad_info.append({
                    #     'gid': gid,
                    #     'cid': cid,
                    # })
                else:
                    ann = {
                        'category_id': cid,
                        'image_id': gid
                        # 'segmentation': mask.to_coco()
                    }
                    assert cid in dset.cats
                    c_mask = (cid == cid_mask).astype(np.uint8)
                    mask = kwimage.Mask(c_mask, 'c_mask')

                    box = kwimage.Boxes([mask.get_xywh()], 'xywh')
                    # box = mask.to_boxes()

                    ann['bbox'] = ub.peek(box.to_coco())
                    ann['segmentation'] = mask.to_coco()
                    dset.add_annotation(**ann)

        if 0:
            bad_cids = [i['cid'] for i in bad_info]
            print(sorted([c['color'] for c in dataset['categories']]))
            print(sorted(set([cid_to_rgb(i['cid']) for i in bad_info])))

            gid = 618
            img = dset.imgs[gid]
            mask_fpath = join(dset_root, img['segmentation'])
            rgb_mask = kwimage.imread(mask_fpath, space='rgb')
            r, g, b = rgb_mask.T.astype(np.int64)
            cid_mask = np.ascontiguousarray(rgb_to_cid(r, g, b).T)
            cid_hist = ub.dict_hist(cid_mask.ravel())

            bad_cid_hist = {}
            for cid in bad_cids:
                bad_cid_hist[cid] = cid_hist.pop(cid)

            import kwplot
            kwplot.autompl()
            kwplot.imshow(rgb_mask)

    if 0:
        import kwplot
        plt = kwplot.autoplt()
        plt.clf()
        dset.show_image(1)

        import xdev
        gid_list = list(dset.imgs)
        for gid in xdev.InteractiveIter(gid_list):
            dset.show_image(gid)
            xdev.InteractiveIter.draw()

    dset._build_index()
    dset._build_hashid()
    return dset
Example #16
def fix_dataset_phase1_original():
    cfg = viame_wrangler.config.WrangleConfig({
        'annots': ub.truepath('~/data/viame-challenge-2018/phase1-annotations/*/original_*.json')
    })

    annots = cfg.annots
    fpaths = list(glob.glob(annots))
    print('Reading raw mscoco files')
    fpath_iter = iter(fpaths)

    for fpath in fpath_iter:
        print('reading fpath = {!r}'.format(fpath))
        dset_name = os.path.basename(fpath).replace('original_', '').split('.')[0]
        dset = CocoDataset(fpath, img_root=cfg.img_root)

        did_fix = False
        if dset.missing_images():
            did_fix = True
            print('Fixing missing images')
            for img in dset.dataset['images']:
                if img['file_name'].startswith(dset_name):
                    assert False
                img['file_name'] = join(dset_name, img['file_name'])
            assert not dset.missing_images()
        # dset.dataset.keys()
        # dset.dataset['categories']

        bad_annots = dset._find_bad_annotations()
        if bad_annots:
            print('Fixing bad annots')
            did_fix = True
            for ann in bad_annots:
                dset.remove_annotation(ann)
            dset._build_index()

        bad_hasannots_flags = not all([img.get('has_annots', ub.NoParam) in [True, False, None] for img in dset.imgs.values()])

        if bad_hasannots_flags:
            did_fix = True
            for gid, img in dset.imgs.items():
                aids = dset.gid_to_aids.get(gid, [])

                if True:
                    # SPECIAL CASES
                    if img['file_name'] == 'afsc_seq1/003496.jpg':
                        img['has_annots'] = True

                # if False:
                #     if img['has_annots'] is None:
                #         dset.show_annotation(gid=img['id'])
                #         break
                # If there is at least one annotation, always mark as has_annots
                if img.get('has_annots', None) not in [True, False, None]:
                    if str(img['has_annots']).lower() == 'false':
                        img['has_annots'] = False
                    else:
                        assert False, ub.repr2(img)
                if len(aids) > 0:
                    img['has_annots'] = True
                else:
                    # Otherwise set has_annots to null if it has not been
                    # explicitly labeled
                    if 'has_annots' not in img:
                        img['has_annots'] = None

            print(ub.dict_hist([g['has_annots'] for g in dset.imgs.values()]))

        if did_fix:
            print('manual check')
            break
            # ut.print_difftext(ut.get_textdiff(dset.dumps(), orig_dset.dumps()))
            dset.dump(fpath)
Example #17
def find_consistent_labeling(grouped_oldnames, extra_prefix='_extra_name',
                             verbose=False):
    """
    Solves a maximum bipartite matching problem to find a consistent name
    assignment that minimizes the number of annotations whose names change.
    Each new group of annotations must be assigned a single name, reusing an
    existing old name where possible.

    To reduce the running time, groups that do not share any old names are
    handled separately: trivial groups are resolved directly and the rest are
    partitioned into disjoint subproblems.

    Args:
        grouped_oldnames (list): A group of old names where the grouping is
            based on new names. For instance:

                Given:
                    aids      = [1, 2, 3, 4, 5]
                    old_names = [0, 1, 1, 1, 0]
                    new_names = [0, 0, 1, 1, 0]

                The grouping is
                    [[0, 1, 0], [1, 1]]

                This lets us keep the old names in a split case and
                    re-use existing names and make minimal changes to
                current annotation names while still being consistent
                with the new and improved grouping.

                The output will be:
                    [0, 1]

                Meaning that all annots in the first group are assigned the
                name 0 and all annots in the second group are assigned the name
                1.

    References:
        http://stackoverflow.com/questions/1398822/assignment-problem-numpy


    Example:
        >>> grouped_oldnames = demodata_oldnames(25, 15,  5, n_per_incon=5)
        >>> new_names = find_consistent_labeling(grouped_oldnames, verbose=1)
        >>> grouped_oldnames = demodata_oldnames(0, 15,  5, n_per_incon=1)
        >>> new_names = find_consistent_labeling(grouped_oldnames, verbose=1)
        >>> grouped_oldnames = demodata_oldnames(0, 0, 0, n_per_incon=1)
        >>> new_names = find_consistent_labeling(grouped_oldnames, verbose=1)

    Example:
        >>> ydata = []
        >>> xdata = list(range(10, 150, 50))
        >>> for x in xdata:
        >>>     print('x = %r' % (x,))
        >>>     grouped_oldnames = demodata_oldnames(x, 15,  5, n_per_incon=5)
        >>>     t = ub.Timerit(3, verbose=1)
        >>>     for timer in t:
        >>>         with timer:
        >>>             new_names = find_consistent_labeling(grouped_oldnames)
        >>>     ydata.append(t.min())
        >>> # xdoc: +REQUIRES(--show)
        >>> import plottool as pt
        >>> pt.qtensure()
        >>> pt.multi_plot(xdata, [ydata])
        >>> util.show_if_requested()

    Example:
        >>> grouped_oldnames = [['a', 'b', 'c'], ['b', 'c'], ['c', 'e', 'e']]
        >>> new_names = find_consistent_labeling(grouped_oldnames, verbose=1)
        >>> result = ub.repr2(new_names)
        >>> print(new_names)
        ['a', 'b', 'e']

    Example:
        >>> grouped_oldnames = [['a', 'b'], ['a', 'a', 'b'], ['a']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> result = ub.repr2(new_names)
        >>> print(new_names)
        ['b', 'a', '_extra_name0']

    Example:
        >>> grouped_oldnames = [['a', 'b'], ['e'], ['a', 'a', 'b'], [], ['a'], ['d']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> result = ub.repr2(new_names)
        >>> print(new_names)
        ['b', 'e', 'a', '_extra_name0', '_extra_name1', 'd']

    Example:
        >>> grouped_oldnames = [[], ['a', 'a'], [],
        >>>                     ['a', 'a', 'a', 'a', 'a', 'a', 'a', 'b'], ['a']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> result = ub.repr2(new_names)
        >>> print(new_names)
        ['_extra_name0', 'a', '_extra_name1', 'b', '_extra_name2']
    """
    unique_old_names = list(ub.unique(ub.flatten(grouped_oldnames)))
    n_old_names = len(unique_old_names)
    n_new_names = len(grouped_oldnames)

    # Initialize assignment to all Nones
    assignment = [None for _ in range(n_new_names)]

    if verbose:
        print('finding maximally consistent labeling')
        print('n_old_names = %r' % (n_old_names,))
        print('n_new_names = %r' % (n_new_names,))

    # For each old_name, determine now many new_names use it.
    oldname_sets = list(map(set, grouped_oldnames))
    oldname_usage = ub.dict_hist(ub.flatten(oldname_sets))

    # Any name used more than once is a conflict and must be resolved
    conflict_oldnames = {k for k, v in oldname_usage.items() if v > 1}

    # Partition into trivial and non-trivial cases
    nontrivial_oldnames = []
    nontrivial_new_idxs = []

    trivial_oldnames = []
    trivial_new_idxs = []
    for new_idx, group in enumerate(grouped_oldnames):
        if set(group).intersection(conflict_oldnames):
            nontrivial_oldnames.append(group)
            nontrivial_new_idxs.append(new_idx)
        else:
            trivial_oldnames.append(group)
            trivial_new_idxs.append(new_idx)

    # Rectify trivial cases
    # Any new-name that does not share any of its old-names with other
    # new-names can be resolved trivially
    n_trivial_unchanged = 0
    n_trivial_ignored = 0
    n_trivial_merges = 0
    for group, new_idx in zip(trivial_oldnames, trivial_new_idxs):
        if len(group) > 0:
            # new-names that use more than one old-name are simple merges
            h = ub.dict_hist(group)
            if len(h) > 1:
                n_trivial_merges += 1
            else:
                n_trivial_unchanged += 1
            hitems = list(h.items())
            hvals = [i[1] for i in hitems]
            maxval = max(hvals)
            g = min([k for k, v in hitems if v == maxval])
            assignment[new_idx] = g
        else:
            # new-names that use no old-names can be ignored
            n_trivial_ignored += 1

    if verbose:
        n_trivial = len(trivial_oldnames)
        n_nontrivial = len(nontrivial_oldnames)
        print('rectify %d trivial groups' % (n_trivial,))
        print('  * n_trivial_unchanged = %r' % (n_trivial_unchanged,))
        print('  * n_trivial_merges = %r' % (n_trivial_merges,))
        print('  * n_trivial_ignored = %r' % (n_trivial_ignored,))
        print('rectify %d non-trivial groups' % (n_nontrivial,))

    # Partition nontrivial_oldnames into smaller disjoint sets
    nontrivial_oldnames_sets = list(map(set, nontrivial_oldnames))
    import networkx as nx
    g = nx.Graph()
    g.add_nodes_from(range(len(nontrivial_oldnames_sets)))
    for u, group1 in enumerate(nontrivial_oldnames_sets):
        rest = nontrivial_oldnames_sets[u + 1:]
        for v, group2 in enumerate(rest, start=u + 1):
            if group1.intersection(group2):
                g.add_edge(u, v)
    nontrivial_partition = list(nx.connected_components(g))
    if verbose:
        print('  * partitioned non-trivial into %d subgroups' %
              (len(nontrivial_partition)))
        from graphid import util
        part_size_stats = util.stats_dict(map(len, nontrivial_partition))
        stats_str = ub.repr2(part_size_stats, precision=2, strkeys=True)
        print('  * partition size stats = %s'  % (stats_str,))

    # Rectify nontrivial cases
    for part_idxs in ub.ProgIter(nontrivial_partition, desc='rectify parts',
                                 enabled=verbose):
        part_oldnames = list(ub.take(nontrivial_oldnames, part_idxs))
        part_newidxs  = list(ub.take(nontrivial_new_idxs, part_idxs))
        # Rectify this part
        assignment_ = simple_munkres(part_oldnames)
        for new_idx, new_name in zip(part_newidxs, assignment_):
            assignment[new_idx] = new_name

    # Any unassigned name is now given a new unique label with a prefix
    if extra_prefix is not None:
        num_extra = 0
        for idx, val in enumerate(assignment):
            if val is None:
                assignment[idx] = '%s%d' % (extra_prefix, num_extra,)
                num_extra += 1
    return assignment
Example #18
    def _fix_keys(model_state_dict):
        """
        Hack around the DataParallel wrapper. If there is nothing in common
        between the two models, check to see if prepending 'module.' to the
        other keys fixes it.
        """
        other_keys = set(model_state_dict)
        self_keys = set(self_state)

        if 0:
            # Automatic way to reduce nodes in the trees?
            # If node b always follows node a, can we contract it?
            nodes1 = [n for p in other_keys for n in p.split('.')]
            nodes2 = [n for p in self_keys for n in p.split('.')]
            tups1 = list(tup for key in other_keys
                         for tup in ub.iter_window(key.split('.'), 2))
            tups2 = list(tup for key in self_keys
                         for tup in ub.iter_window(key.split('.'), 2))
            x = ub.ddict(list)
            for a, b in tups1:
                x[a].append(b)
            for a, b in tups2:
                x[a].append(b)

            nodehist = ub.dict_hist(nodes1 + nodes2)

            for k, v in x.items():
                print('----')
                print(k)
                print(nodehist[k])
                follow_hist = ub.dict_hist(v)
                print(follow_hist)
                total = sum(follow_hist.values())
                if ub.allsame(follow_hist.values()) and total == nodehist[k]:
                    print('CONTRACT')

            # pair_freq = ub.dict_hist(ub.flatten([tups1, tups2]))
            # print(forest_str(paths_to_otree(other_keys, '.')))

        # common_keys = other_keys.intersection(self_keys)
        # if not common_keys:
        if not other_keys.issubset(self_keys):
            if association == 'strict':
                pass
            elif association == 'module-hack':
                # If there are no common keys try a hack
                prefix = 'module.'

                def smap(f, ss):
                    return set(map(f, ss))

                def fix1(k):
                    return prefix + k

                def fix2(k):
                    if k.startswith(prefix):
                        return k[len(prefix):]

                if smap(fix1, other_keys).intersection(self_keys):
                    model_state_dict = ub.map_keys(fix1, model_state_dict)
                elif smap(fix2, other_keys).intersection(self_keys):
                    model_state_dict = ub.map_keys(fix2, model_state_dict)
            elif association == 'prefix-hack':
                import functools

                def add_prefix(k, prefix):
                    return prefix + k

                def remove_prefix(k, prefix):
                    if k.startswith(prefix):
                        return k[len(prefix):]

                # set1 = other_keys
                # target_set2 = self_keys
                found = _best_prefix_transform(other_keys, self_keys)
                if found is not None:
                    for action, prefix in found['transform']:
                        if action == 'add':
                            func = functools.partial(add_prefix, prefix=prefix)
                        elif action == 'remove':
                            func = functools.partial(remove_prefix,
                                                     prefix=prefix)
                        else:
                            raise AssertionError
                        model_state_dict = ub.map_keys(func, model_state_dict)
            elif association in {'embedding', 'isomorphism'}:
                if verbose > 1:
                    print('Using subpath {} association, may take some time'.
                          format(association))
                # I believe this is the correct way to solve the problem
                paths1 = sorted(other_keys)
                paths2 = sorted(self_state)

                if 1:
                    # hack to filter to reduce tree size in embedding problem
                    def shrink_paths(paths):
                        new_paths = []
                        for p in paths:
                            p = p.replace('.0', ':0')
                            p = p.replace('.1', ':1')
                            p = p.replace('.2', ':2')
                            p = p.replace('.3', ':3')
                            p = p.replace('.4', ':4')
                            p = p.replace('.5', ':5')
                            p = p.replace('.6', ':6')
                            p = p.replace('.7', ':7')
                            p = p.replace('.8', ':8')
                            p = p.replace('.9', ':9')
                            p = p.replace('.weight', ':weight')
                            p = p.replace('.bias', ':bias')
                            p = p.replace('.num_batches_tracked',
                                          ':num_batches_tracked')
                            p = p.replace('.running_mean', ':running_mean')
                            p = p.replace('.running_var', ':running_var')
                            # p = p.replace('.conv1', ':conv1')
                            # p = p.replace('.conv2', ':conv2')
                            # p = p.replace('.conv3', ':conv3')
                            # p = p.replace('.bn1', ':bn1')
                            # p = p.replace('.bn2', ':bn2')
                            # p = p.replace('.bn3', ':bn3')
                            new_paths.append(p)
                        return new_paths

                    # Reducing the depth saves a lot of time
                    paths1_ = shrink_paths(paths1)
                    paths2_ = shrink_paths(paths2)

                subpaths1, subpaths2 = maximum_common_ordered_subpaths(
                    paths1_, paths2_, sep='.', mode=association)
                subpaths1 = [p.replace(':', '.') for p in subpaths1]
                subpaths2 = [p.replace(':', '.') for p in subpaths2]
                mapping = ub.dzip(subpaths1, subpaths2)
                if verbose > 1:
                    other_unmapped = sorted(other_keys - set(mapping.keys()))
                    self_unmapped = sorted(self_keys - set(mapping.values()))
                    print('-- embed association (other -> self) --')
                    print('mapping = {}'.format(ub.repr2(mapping, nl=1)))
                    print('self_unmapped = {}'.format(
                        ub.repr2(self_unmapped, nl=1)))
                    print('other_unmapped = {}'.format(
                        ub.repr2(other_unmapped, nl=1)))
                    print('len(mapping) = {}'.format(
                        ub.repr2(len(mapping), nl=1)))
                    print('len(self_unmapped) = {}'.format(
                        ub.repr2(len(self_unmapped), nl=1)))
                    print('len(other_unmapped) = {}'.format(
                        ub.repr2(len(other_unmapped), nl=1)))
                    print('-- end embed association --')

                # HACK: something might be wrong, there was an instance with
                # HRNet_w32 where multiple keys mapped to the same key
                # bad keys were incre_modules.3.0.conv1.weight and conv1.weight
                #
                # This will not error, but may produce bad output
                try:
                    model_state_dict = ub.map_keys(lambda k: mapping.get(k, k),
                                                   model_state_dict)
                except Exception as ex:
                    HACK = 1
                    if HACK:
                        new_state_dict_ = {}
                        for k, v in model_state_dict.items():
                            new_state_dict_[mapping.get(k, k)] = v
                        model_state_dict = new_state_dict_
                        warnings.warn('ex = {!r}'.format(ex))
                    else:
                        raise
            else:
                raise KeyError(association)
        return model_state_dict
Example #19
    def _training_sample_weights(self):
        """
        Assigns a weight to each image to influence its sample probability.

        We want to see very frequent categories less often, but we also don't
        care about the rarest classes to the point where we should sample them
        more than uncommon classes. We also don't want to sample images without
        any annotations, or with too many annotations, very often.
        """
        index_to_gid = [img['id'] for img in self.dset.dataset['images']]
        index_to_aids = list(ub.take(self.dset.gid_to_aids, index_to_gid))
        index_to_cids = [[self.dset.anns[aid]['category_id'] for aid in aids]
                         for aids in index_to_aids]

        catname_to_cid = {
            cat['name']: cid
            for cid, cat in self.dset.cats.items()}

        # median frequency weighting with minimum threshold
        min_examples = 20
        cat_freq = pd.Series(self.dset.category_annotation_frequency())

        valid_freq = cat_freq[cat_freq > min_examples]
        normal_mfw = valid_freq.median() / valid_freq

        # Draw anything under the threshold with probability equal to the median
        too_few = cat_freq[(cat_freq <= min_examples) & (cat_freq > 0)]
        too_few[:] = 1.0
        category_mfw = pd.concat([normal_mfw, too_few])

        cid_to_mfw = category_mfw.rename(catname_to_cid)

        cid_to_mfw_dict = cid_to_mfw.to_dict()

        index_to_weights = [list(ub.take(cid_to_mfw_dict, cids)) for cids in index_to_cids]
        index_to_nannots = np.array(list(map(len, index_to_weights)))

        # Each image becomes represented by the category with maximum median
        # frequency weight. This allows us to assign each image a proxy class
        # We make another proxy class to represent images without anything in
        # them.
        EMPTY_PROXY_CID = -1
        index_to_proxyid = [
            # cid_to_mfw.loc[cids].idxmax()
            ub.argmax(ub.dict_subset(cid_to_mfw_dict, cids))
            if len(cids) else EMPTY_PROXY_CID
            for cids in index_to_cids
        ]

        proxy_freq = pd.Series(ub.dict_hist(index_to_proxyid))
        proxy_root_mfw = proxy_freq.median() / proxy_freq
        power = 0.878
        proxy_root_mfw = proxy_root_mfw ** power
        # We now have a weight for each item in our dataset
        index_to_weight = np.array(list(ub.take(proxy_root_mfw.to_dict(), index_to_proxyid)))

        if False:
            # Figure out how the likelihoods of each class change
            xy = {}
            for power in [0, .5, .878, 1]:
                proxy_root_mfw = proxy_freq.median() / proxy_freq
                # dont let weights get too high
                # proxy_root_mfw = np.sqrt(proxy_root_mfw)
                # power = .88
                proxy_root_mfw = proxy_root_mfw ** power
                # proxy_root_mfw = np.clip(proxy_root_mfw, a_min=None, a_max=3)

                index_to_weight = list(ub.take(proxy_root_mfw.to_dict(), index_to_proxyid))

                if 1:
                    # what is the probability we draw an empty image?
                    df = pd.DataFrame({
                        'nannots': index_to_nannots,
                        'weight': index_to_weight,
                    })
                    df['prob'] = df.weight / df.weight.sum()

                    prob_empty = df.prob[df.nannots == 0].sum()

                    probs = {'empty': prob_empty}
                    for cid in cid_to_mfw.index:
                        flags = [cid in cids for cids in index_to_cids]
                        catname = self.dset.cats[cid]['name']
                        p = df[flags].prob.sum()
                        probs[catname] = p
                    xy['p{}'.format(power)] = pd.Series(probs)
            xy['freq'] = {}
            for cid in cid_to_mfw.index:
                catname = self.dset.cats[cid]['name']
                xy['freq'][catname] = proxy_freq[cid]
            print(pd.DataFrame(xy))

        # index_to_prob = index_to_weight / index_to_weight.sum()
        return index_to_weight
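
The core median-frequency-weighting step can be illustrated in isolation. This is a hedged sketch on a toy class histogram (the counts are hypothetical): frequent classes get weights below 1 and rare classes get weights above 1.

import ubelt as ub
labels = ['cat'] * 50 + ['dog'] * 10 + ['eel'] * 2
freq = ub.dict_hist(labels)                          # {'cat': 50, 'dog': 10, 'eel': 2}
median = sorted(freq.values())[len(freq) // 2]       # 10
weights = {k: median / v for k, v in freq.items()}   # {'cat': 0.2, 'dog': 1.0, 'eel': 5.0}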
Example #20
    def __init__(self,
                 index_to_labels,
                 batch_size=1,
                 num_batches='auto',
                 label_to_weight=None,
                 shuffle=False,
                 rng=None):
        import kwarray

        rng = kwarray.ensure_rng(rng, api='python')
        label_to_indices = ub.ddict(set)

        flat_groups = []
        for index, item_labels in enumerate(index_to_labels):
            flat_groups.extend([index] * len(item_labels))
            for label in item_labels:
                label_to_indices[label].add(index)
        flat_labels = np.hstack(index_to_labels)
        label_to_freq = ub.dict_hist(flat_labels)

        # Use tf-idf based scheme to compute sample probabilities
        label_to_idf = {}
        label_to_tfidf = {}
        labels = sorted(set(flat_labels))
        for label in labels:
            # tf for each img is the number of times the label appears
            index_to_tf = np.zeros(len(index_to_labels))
            for index, item_labels in enumerate(index_to_labels):
                index_to_tf[index] = (label == item_labels).sum()
            # idf is the #imgs / #imgs-with-label
            idf = len(index_to_tf) / (index_to_tf > 0).sum()
            if label_to_weight:
                idf = idf * label_to_weight[label]
            label_to_idf[label] = idf
            label_to_tfidf[label] = np.maximum(index_to_tf * idf, 1)
        index_to_weight = sum(label_to_tfidf.values())
        index_to_prob = index_to_weight / index_to_weight.sum()

        if 0:
            index_to_unique_labels = list(map(set, index_to_labels))
            unique_freq = ub.dict_hist(ub.flatten(index_to_unique_labels))
            tot = sum(unique_freq.values())
            unweighted_odds = ub.map_vals(lambda x: x / tot, unique_freq)

            label_to_indices = ub.ddict(set)
            for index, item_labels in enumerate(index_to_labels):
                for label in item_labels:
                    label_to_indices[label].add(index)
            ub.map_vals(len, label_to_indices)

            label_to_odds = ub.ddict(lambda: 0)
            for label, indices in label_to_indices.items():
                for idx in indices:
                    label_to_odds[label] += index_to_prob[idx]

            coi = {x for x, w in label_to_weight.items() if w > 0}
            coi_weighted = ub.dict_subset(label_to_odds, coi)
            coi_unweighted = ub.dict_subset(unweighted_odds, coi)
            print('coi_weighted = {}'.format(ub.repr2(coi_weighted, nl=1)))
            print('coi_unweighted = {}'.format(ub.repr2(coi_unweighted, nl=1)))

        self.index_to_prob = index_to_prob
        self.indices = np.arange(len(index_to_prob))

        if num_batches == 'auto':
            self.num_batches = self._auto_num_batches()
        else:
            self.num_batches = num_batches

        self.label_to_freq = label_to_freq
        self.index_to_labels = index_to_labels
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.rng = kwarray.ensure_rng(rng, api='numpy')
Example #21
from mnist_matching import setup_harn

resnet_harn = setup_harn(arch='resnet').initialize()
simple_harn = setup_harn(arch='simple').initialize()

harn = simple_harn

batch_vali = harn._demo_batch(tag='vali')
batch_train = harn._demo_batch(tag='train')

batch = batch_train

inputs = batch['chip']
outputs = harn.model(inputs)
dvecs = outputs['dvecs']

output_shape = harn.model.module.output_shape_for(inputs.shape)
print(ub.repr2(output_shape.hidden, nl=-1))

labels = batch['cpu_nx']
print('labels = {}'.format(
    ub.repr2(ub.odict(sorted(ub.dict_hist(labels.numpy()).items())), nl=1)))

labels = labels[0:8]
dvecs = dvecs[0:8]

info1 = harn.criterion.mine_negatives(dvecs, labels, num=1, mode='hard')
info2 = harn.criterion.mine_negatives(dvecs, labels, num=1, mode='consistent')
info3 = harn.criterion.hard_triples(dvecs, labels)
info4 = harn.criterion.hard_triples2(dvecs, labels)
Example #22
def compare_results():
    print('Comparing results')
    import pandas as pd
    from tabulate import tabulate

    # Read in output of demo script
    measure_fpath = 'measurements_haul83.csv'
    py_df = pd.DataFrame.from_csv(measure_fpath, index_col=None)
    # Convert python length output from mm into cm for consistency
    py_df['fishlen'] = py_df['fishlen'] / 10
    py_df['current_frame'] = py_df['current_frame'].astype(np.int)

    # janky CSV parsing
    py_df['box_pts1'] = py_df['box_pts1'].map(
        lambda p: eval(p.replace(';', ','), np.__dict__))
    py_df['box_pts2'] = py_df['box_pts2'].map(
        lambda p: eval(p.replace(';', ','), np.__dict__))

    py_df['obox1'] = [
        ctalgo.OrientedBBox(*cv2.minAreaRect(pts[:, None, :].astype(np.int)))
        for pts in py_df['box_pts1']
    ]
    py_df['obox2'] = [
        ctalgo.OrientedBBox(*cv2.minAreaRect(pts[:, None, :].astype(np.int)))
        for pts in py_df['box_pts2']
    ]
    py_df.drop(['box_pts1', 'box_pts2'], axis=1, inplace=True)

    # Remap to matlab names
    py_df = py_df.rename(columns={
        'error': 'Err',
        'fishlen': 'fishLength',
        'range': 'fishRange',
    })

    # Load matlab results
    mat_df = _read_kresimir_results()

    FORCE_COMPARABLE_RANGE = True
    # FORCE_COMPARABLE_RANGE = False
    if FORCE_COMPARABLE_RANGE:
        # Be absolutely certain we are in comparable regions (may slightly bias
        # results, against python and in favor of matlab)
        min_frame = max(mat_df.current_frame.min(), py_df.current_frame.min())
        max_frame = min(mat_df.current_frame.max(), py_df.current_frame.max())
        print('min_frame = {!r}'.format(min_frame))
        print('max_frame = {!r}'.format(max_frame))

        mat_df = mat_df[(mat_df.current_frame >= min_frame)
                        & (mat_df.current_frame <= max_frame)]
        py_df = py_df[(py_df.current_frame >= min_frame)
                      & (py_df.current_frame <= max_frame)]

    intersect_frames = np.intersect1d(mat_df.current_frame,
                                      py_df.current_frame)
    print('intersecting frames = {} / {} (matlab)'.format(
        len(intersect_frames), len(set(mat_df.current_frame))))
    print('intersecting frames = {} / {} (python)'.format(
        len(intersect_frames), len(set(py_df.current_frame))))

    #  Reuse the hungarian algorithm implementation from ctalgo
    min_assign = ctalgo.StereoLengthMeasurments.minimum_weight_assignment

    correspond = []
    for f in intersect_frames:
        pidxs = np.where(py_df.current_frame == f)[0]
        midxs = np.where(mat_df.current_frame == f)[0]

        pdf = py_df.iloc[pidxs]
        mdf = mat_df.iloc[midxs]

        ppts1 = np.array([o.center for o in pdf['obox1']])
        mpts1 = np.array([o.center for o in mdf['obox1']])

        ppts2 = np.array([o.center for o in pdf['obox2']])
        mpts2 = np.array([o.center for o in mdf['obox2']])

        dists1 = sklearn.metrics.pairwise.pairwise_distances(ppts1, mpts1)
        dists2 = sklearn.metrics.pairwise.pairwise_distances(ppts2, mpts2)

        # arbitrarily chosen threshold
        thresh = 100
        for i, j in min_assign(dists1):
            d1 = dists1[i, j]
            d2 = dists2[i, j]
            if d1 < thresh and d2 < thresh and abs(d1 - d2) < thresh / 4:
                correspond.append((pidxs[i], midxs[j]))
    correspond = np.array(correspond)

    # pflags = np.array(ub.boolmask(correspond.T[0], len(py_df)))
    mflags = np.array(ub.boolmask(correspond.T[1], len(mat_df)))
    # print('there are {} detections that seem to be in common'.format(len(correspond)))
    # print('The QC flags of the common detections are:       {}'.format(
    #     ub.dict_hist(mat_df[mflags]['QC'].values)))
    # print('The QC flags of the other matlab detections are: {}'.format(
    #     ub.dict_hist(mat_df[~mflags]['QC'].values)))

    print('\n\n----\n## All stats\n')
    print(
        ub.codeblock('''
        Overall, the matlab script made {nmat} length measurements and the
        python script made {npy} length measurements.  Here is a table
        summarizing the average lengths / ranges / errors of each script:
        ''').format(npy=len(py_df), nmat=len(mat_df)))
    stats = pd.DataFrame(columns=['python', 'matlab'])
    for key in ['fishLength', 'fishRange', 'Err']:
        stats.loc[key, 'python'] = '{:6.2f} ± {:6.2f}'.format(
            py_df[key].mean(), py_df[key].std())
        stats.loc[key, 'matlab'] = '{:6.2f} ± {:6.2f}'.format(
            mat_df[key].mean(), mat_df[key].std())

    stats.loc['nTotal', 'python'] = '{}'.format(len(py_df))
    stats.loc['nTotal', 'matlab'] = '{}'.format(len(mat_df))
    print(tabulate(stats, headers='keys', tablefmt='psql', stralign='right'))

    print('\n\n----\n## Only COMMON detections\n')
    py_df_c = py_df.iloc[correspond.T[0]]
    mat_df_c = mat_df.iloc[correspond.T[1]]
    stats = pd.DataFrame(columns=['python', 'matlab'])
    for key in ['fishLength', 'fishRange', 'Err']:
        stats.loc[key, 'python'] = '{:6.2f} ± {:6.2f}'.format(
            py_df_c[key].mean(), py_df_c[key].std())
        stats.loc[key, 'matlab'] = '{:6.2f} ± {:6.2f}'.format(
            mat_df_c[key].mean(), mat_df_c[key].std())

    stats.loc['nTotal', 'python'] = '{}'.format(len(py_df_c))
    stats.loc['nTotal', 'matlab'] = '{}'.format(len(mat_df_c))

    print(
        ub.codeblock('''
        Now, we investigate how many detections matlab and python made in common.
        (Note, choosing which detections in one version correspond to which in
         another is done using a heuristic based on distances between bbox
         centers and a thresholded minimum assignment problem).

        Python made {npy_c}/{nmat} = {percent:.2f}% of the detections matlab made

        ''').format(npy_c=len(py_df_c),
                    nmat=len(mat_df),
                    percent=100 * len(py_df_c) / len(mat_df)))
    print(tabulate(stats, headers='keys', tablefmt='psql', stralign='right'))

    print('\n\n----\n## Evaluation using the QC code\n')
    hist_hit = ub.dict_hist(mat_df[mflags]['QC'].values)
    hist_miss = ub.dict_hist(mat_df[~mflags]['QC'].values)
    print(
        ub.codeblock('''
        However, not all of those matlab detections were good. Because we have
        detections in correspondence with each other, we can assign the python
        detections QC codes.

        Here is a histogram of the QC codes for these python detections:
        {}
        (Note: read histogram as <QC-code>: <frequency>)

        Here is a histogram of the other matlab detections that python did not
        find:
        {}

        To summarize:
            python correctly rejected {:.2f}% of the matlab QC=0 detections
            python correctly accepted {:.2f}% of the matlab QC=1 detections
            python correctly accepted {:.2f}% of the matlab QC=2 detections

            Note that, because python made detections that matlab did not make,
            the remaining {} detections may be right or wrong, but there is
            no way to tell from this analysis.

        Lastly, here are the statistics for the common detections that had a
        non-zero QC code.
        ''').format(ub.repr2(hist_hit, nl=1), ub.repr2(hist_miss, nl=1),
                    100 * hist_miss[0] / (hist_hit[0] + hist_miss[0]),
                    100 * hist_hit[1] / (hist_hit[1] + hist_miss[1]),
                    100 * hist_hit[2] / (hist_hit[2] + hist_miss[2]),
                    len(py_df) - len(py_df_c)))

    is_qc = (mat_df_c['QC'] > 0).values
    mat_df_c = mat_df_c[is_qc]
    py_df_c = py_df_c[is_qc]
    stats = pd.DataFrame(columns=['python', 'matlab'])
    for key in ['fishLength', 'fishRange', 'Err']:
        stats.loc[key, 'python'] = '{:6.2f} ± {:6.2f}'.format(
            py_df_c[key].mean(), py_df_c[key].std())
        stats.loc[key, 'matlab'] = '{:6.2f} ± {:6.2f}'.format(
            mat_df_c[key].mean(), mat_df_c[key].std())

    stats.loc['nTotal', 'python'] = '{}'.format(len(py_df_c))
    stats.loc['nTotal', 'matlab'] = '{}'.format(len(mat_df_c))
    print(tabulate(stats, headers='keys', tablefmt='psql', stralign='right'))
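
The per-frame correspondence step above relies on a simple heuristic: pair detections by minimizing the total distance between their box centers, then reject pairs whose distance exceeds a threshold. Below is a minimal, self-contained sketch of that idea using scipy.optimize.linear_sum_assignment in place of the ctalgo minimum-weight-assignment helper; the center coordinates and the threshold are made-up stand-ins, not values from the real data.

import numpy as np
import scipy.optimize
from sklearn.metrics.pairwise import pairwise_distances

# Hypothetical bbox centers for one frame; rows are detections.
py_centers = np.array([[10.0, 12.0], [200.0, 180.0]])
mat_centers = np.array([[11.0, 14.0], [420.0, 300.0], [205.0, 175.0]])

dists = pairwise_distances(py_centers, mat_centers)
rows, cols = scipy.optimize.linear_sum_assignment(dists)

thresh = 100  # same arbitrarily chosen cutoff as above
pairs = [(i, j) for i, j in zip(rows, cols) if dists[i, j] < thresh]
print(pairs)  # e.g. [(0, 0), (1, 2)]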
Example #23
0
def 字典_统计(数组, weights=None, ordered=False, labels=None):
    data = ub.dict_hist(数组, weights, ordered, labels)
    return data
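
字典_统计 is just a thin pass-through to ub.dict_hist, so its behavior matches the underlying ubelt call. A small usage sketch with made-up items:

import ubelt as ub

items = ['fish', 'crab', 'fish', 'fish', 'urchin']
print(ub.dict_hist(items))
# counts per unique item, e.g. {'fish': 3, 'crab': 1, 'urchin': 1}
print(ub.dict_hist(items, weights=[2, 1, 2, 2, 5]))
# same keys, but each item contributes its weight instead of 1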
Example #24
0
def main(bib_fpath=None):
    r"""
    intro point to fixbib script

    CommandLine:
        fixbib
        python -m fixtex bib
        python -m fixtex bib --dryrun
        python -m fixtex bib --dryrun --debug
    """

    if bib_fpath is None:
        bib_fpath = 'My Library.bib'

    # DEBUG = ub.argflag('--debug')
    # Read in text and ensure ascii format
    dirty_text = ut.readfrom(bib_fpath)

    from fixtex.fix_tex import find_used_citations, testdata_fpaths

    if exists('custom_extra.bib'):
        extra_parser = bparser.BibTexParser(ignore_nonstandard_types=False)
        parser = bparser.BibTexParser()
        ut.delete_keys(parser.alt_dict, ['url', 'urls'])
        print('Parsing extra bibtex file')
        extra_text = ut.readfrom('custom_extra.bib')
        extra_database = extra_parser.parse(extra_text, partial=False)
        print('Finished parsing extra')
        extra_dict = extra_database.get_entry_dict()
    else:
        extra_dict = None

    #udata = dirty_text.decode("utf-8")
    #dirty_text = udata.encode("ascii", "ignore")
    #dirty_text = udata

    # parser = bparser.BibTexParser()
    # bib_database = parser.parse(dirty_text)
    # d = bib_database.get_entry_dict()

    print('BIBTEXPARSER LOAD')
    parser = bparser.BibTexParser(ignore_nonstandard_types=False,
                                  common_strings=True)
    ut.delete_keys(parser.alt_dict, ['url', 'urls'])
    print('Parsing bibtex file')
    bib_database = parser.parse(dirty_text, partial=False)
    print('Finished parsing')

    bibtex_dict = bib_database.get_entry_dict()
    old_keys = list(bibtex_dict.keys())
    new_keys = []
    for key in ub.ProgIter(old_keys, label='fixing keys'):
        new_key = key
        new_key = new_key.replace(':', '')
        new_key = new_key.replace('-', '_')
        new_key = re.sub('__*', '_', new_key)
        new_keys.append(new_key)

    # assert len(ut.find_duplicate_items(new_keys)) == 0, 'new keys created conflict'
    assert len(ub.find_duplicates(new_keys)) == 0, 'new keys created conflict'

    for key, new_key in zip(old_keys, new_keys):
        if key != new_key:
            entry = bibtex_dict[key]
            entry['ID'] = new_key
            bibtex_dict[new_key] = entry
            del bibtex_dict[key]

    # The bibtex is now clean. Print it to stdout
    #print(clean_text)
    verbose = None
    if verbose is None:
        verbose = 1

    # Find citations from the tex documents
    key_list = None
    if key_list is None:
        cacher = ub.Cacher('texcite1', enabled=0)
        data = cacher.tryload()
        if data is None:
            fpaths = testdata_fpaths()
            key_list, inverse = find_used_citations(fpaths,
                                                    return_inverse=True)
            # ignore = ['JP', '?', 'hendrick']
            # for item in ignore:
            #     try:
            #         key_list.remove(item)
            #     except ValueError:
            #         pass
            if verbose:
                print('Found %d citations used in the document' %
                      (len(key_list), ))
            data = key_list, inverse
            cacher.save(data)
        key_list, inverse = data

    # else:
    #     key_list = None

    unknown_pubkeys = []
    debug_author = ub.argval('--debug-author', default=None)
    # ./fix_bib.py --debug_author=Kappes

    if verbose:
        print('Fixing %d/%d bibtex entries' %
              (len(key_list), len(bibtex_dict)))

    # debug = True
    debug = False
    if debug_author is not None:
        debug = False

    known_keys = list(bibtex_dict.keys())
    missing_keys = set(key_list) - set(known_keys)
    if extra_dict is not None:
        missing_keys.difference_update(set(extra_dict.keys()))

    if missing_keys:
        print('The library is missing keys found in tex files %s' %
              (ub.repr2(missing_keys), ))

    # Search for possible typos:
    candidate_typos = {}
    sedlines = []
    for key in missing_keys:
        candidates = ut.closet_words(key, known_keys, num=3, subset=True)
        if len(candidates) > 1:
            top = candidates[0]
            if ut.edit_distance(key, top) == 1:
                # "sed -i -e 's/{}/{}/g' *.tex".format(key, top)
                import os
                replpaths = ' '.join(
                    [relpath(p, os.getcwd()) for p in inverse[key]])
                sedlines.append("sed -i -e 's/{}/{}/g' {}".format(
                    key, top, replpaths))
        candidate_typos[key] = candidates
        print('Cannot find key = %r' % (key, ))
        print('Did you mean? %r' % (candidates, ))

    print('Quick fixes')
    print('\n'.join(sedlines))

    # group by file
    just = max([0] + list(map(len, missing_keys)))
    missing_fpaths = [inverse[key] for key in missing_keys]
    for fpath in sorted(set(ub.flatten(missing_fpaths))):
        # ut.fix_embed_globals()
        subkeys = [k for k in missing_keys if fpath in inverse[k]]
        print('')
        ut.cprint('--- Missing Keys ---', 'blue')
        ut.cprint('fpath = %r' % (fpath, ), 'blue')
        ut.cprint('{} | {}'.format('Missing'.ljust(just), 'Did you mean?'),
                  'blue')
        for key in subkeys:
            print('{} | {}'.format(ut.highlight_text(key.ljust(just), 'red'),
                                   ' '.join(candidate_typos[key])))

    # for key in list(bibtex_dict.keys()):

    if extra_dict is not None:
        # Extra database takes precedence over regular
        key_list = list(ut.unique(key_list + list(extra_dict.keys())))
        for k, v in extra_dict.items():
            bibtex_dict[k] = v

    full = ub.argflag('--full')

    for key in key_list:
        try:
            entry = bibtex_dict[key]
        except KeyError:
            continue
        self = BibTexCleaner(key, entry, full=full)

        if debug_author is not None:
            debug = debug_author in entry.get('author', '')

        if debug:
            ut.cprint(' --- ENTRY ---', 'yellow')
            print(ub.repr2(entry, nl=1))

        entry = self.fix()
        # self.clip_abstract()
        # self.shorten_keys()
        # self.fix_authors()
        # self.fix_year()
        # old_pubval = self.fix_pubkey()
        # if old_pubval:
        #     unknown_pubkeys.append(old_pubval)
        # self.fix_arxiv()
        # self.fix_general()
        # self.fix_paper_types()

        if debug:
            print(ub.repr2(entry, nl=1))
            ut.cprint(' --- END ENTRY ---', 'yellow')
        bibtex_dict[key] = entry

    unwanted_keys = set(bibtex_dict.keys()) - set(key_list)
    if verbose:
        print('Removing %d unwanted entries' % (len(unwanted_keys)))
    ut.delete_dict_keys(bibtex_dict, unwanted_keys)

    if 0:
        d1 = bibtex_dict.copy()
        full = True
        for key, entry in d1.items():
            self = BibTexCleaner(key, entry, full=full)
            pub = self.publication()
            if pub is None:
                print(self.entry['ENTRYTYPE'])

            old = self.fix_pubkey()
            x1 = self._pubval()
            x2 = self.standard_pubval(full=full)
            # if x2 is not None and len(x2) > 5:
            #     print(ub.repr2(self.entry))

            if x1 != x2:
                print('x2 = %r' % (x2, ))
                print('x1 = %r' % (x1, ))
                print(ub.repr2(self.entry))

            # if 'CVPR' in self.entry.get('booktitle', ''):
            #     if 'CVPR' != self.entry.get('booktitle', ''):
            #         break
            if old:
                print('old = %r' % (old, ))
            d1[key] = self.entry

    if full:
        d1 = bibtex_dict.copy()

        import numpy as np
        import pandas as pd
        df = pd.DataFrame.from_dict(d1, orient='index')

        paged_items = df[~pd.isnull(df['pub_accro'])]
        has_pages = ~pd.isnull(paged_items['pages'])
        print('have pages {} / {}'.format(has_pages.sum(), len(has_pages)))
        print(ub.repr2(paged_items[~has_pages]['title'].values.tolist()))

        entrytypes = dict(list(df.groupby('pub_type')))
        if False:
            # entrytypes['misc']
            g = entrytypes['online']
            g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]

            entrytypes['book']
            entrytypes['thesis']
            g = entrytypes['article']
            g = entrytypes['incollection']
            g = entrytypes['conference']

        def lookup_pub(e):
            if e == 'article':
                return 'journal', 'journal'
            elif e == 'incollection':
                return 'booksection', 'booktitle'
            elif e == 'conference':
                return 'conference', 'booktitle'
            return None, None

        for e, g in entrytypes.items():
            print('e = %r' % (e, ))
            g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
            if 'pub_full' in g.columns:
                place_title = g['pub_full'].tolist()
                print(ub.repr2(ub.dict_hist(place_title)))
            else:
                print('Unknown publications')

        if 'report' in entrytypes:
            g = entrytypes['report']
            missing = g[pd.isnull(g['title'])]
            if len(missing):
                print('Missing Title')
                print(ub.repr2(missing[['title', 'author']].values.tolist()))

        if 'journal' in entrytypes:
            g = entrytypes['journal']
            g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]

            missing = g[pd.isnull(g['journal'])]
            if len(missing):
                print('Missing Journal')
                print(ub.repr2(missing[['title', 'author']].values.tolist()))

        if 'conference' in entrytypes:
            g = entrytypes['conference']
            g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]

            missing = g[pd.isnull(g['booktitle'])]
            if len(missing):
                print('Missing Booktitle')
                print(ub.repr2(missing[['title', 'author']].values.tolist()))

        if 'incollection' in entrytypes:
            g = entrytypes['incollection']
            g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]

            missing = g[pd.isnull(g['booktitle'])]
            if len(missing):
                print('Missing Booktitle')
                print(ub.repr2(missing[['title', 'author']].values.tolist()))

        if 'thesis' in entrytypes:
            g = entrytypes['thesis']
            g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
            missing = g[pd.isnull(g['institution'])]
            if len(missing):
                print('Missing Institution')
                print(ub.repr2(missing[['title', 'author']].values.tolist()))

        # import utool
        # utool.embed()

    # Overwrite BibDatabase structure
    bib_database._entries_dict = bibtex_dict
    bib_database.entries = list(bibtex_dict.values())

    #conftitle_to_types_set_hist = {key: set(val) for key, val in conftitle_to_types_hist.items()}
    #print(ub.repr2(conftitle_to_types_set_hist))

    print('Unknown conference keys:')
    print(ub.repr2(sorted(unknown_pubkeys)))
    print('len(unknown_pubkeys) = %r' % (len(unknown_pubkeys), ))

    writer = BibTexWriter()
    writer.contents = ['comments', 'entries']
    writer.indent = '  '
    writer.order_entries_by = ('type', 'author', 'year')

    new_bibtex_str = bibtexparser.dumps(bib_database, writer)

    # Need to check
    #jegou_aggregating_2012

    # Fix the Journal Abbreviations
    # References:
    # https://www.ieee.org/documents/trans_journal_names.pdf

    # Write out clean bibfile in ascii format
    clean_bib_fpath = ub.augpath(bib_fpath.replace(' ', '_'), suffix='_clean')

    if not ub.argflag('--dryrun'):
        ut.writeto(clean_bib_fpath, new_bibtex_str)
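
The heart of the script above is a parse -> sanitize-keys -> dump round trip through bibtexparser. Here is a minimal sketch of just that round trip, using the same BibTexParser and BibTexWriter options as the script; the sample bibtex entry is made up for illustration.

import re
import bibtexparser
from bibtexparser import bparser
from bibtexparser.bwriter import BibTexWriter

text = '@article{Smith:2020-foo, title={A Title}, author={Smith, J.}, year={2020}}'
parser = bparser.BibTexParser(ignore_nonstandard_types=False, common_strings=True)
db = parser.parse(text, partial=False)

for entry in db.entries:
    # Same key sanitization as the script: drop ':', map '-' to '_', collapse '_'
    entry['ID'] = re.sub('__*', '_', entry['ID'].replace(':', '').replace('-', '_'))

writer = BibTexWriter()
writer.indent = '  '
print(bibtexparser.dumps(db, writer))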
Example #25
0
def simple_munkres(part_oldnames):
    """
    Defines a munkres problem to solve name rectification.

    Notes:
        We create a matrix where each row represents a group of annotations in
        the same PCC and each column represents an original name. If there are
        more PCCs than original names the columns are padded with extra values.
        The matrix is first initialized to be negative infinity representing
        impossible assignments. Then for each column representing a padded
        name, we set its value to $1$, indicating that each new name could be
        assigned to a padded name for some small profit.  Finally, let $f_{rc}$
        be the number of annotations in row $r$ with an original name of
        $c$. Each matrix value $(r, c)$ is set to $f_{rc} + 1$ if $f_{rc} > 0$,
        to represent how much each name ``wants'' to be labeled with a
        particular original name, and the extra one ensures that these original
        names are always preferred over padded names.

    Example:
        >>> part_oldnames = [['a', 'b'], ['b', 'c'], ['c', 'a', 'a']]
        >>> new_names = simple_munkres(part_oldnames)
        >>> result = ub.repr2(new_names)
        >>> print(new_names)
        ['b', 'c', 'a']

    Example:
        >>> part_oldnames = [[], ['a', 'a'], [],
        >>>                  ['a', 'a', 'a', 'a', 'a', 'a', 'a', 'b'], ['a']]
        >>> new_names = simple_munkres(part_oldnames)
        >>> result = ub.repr2(new_names)
        >>> print(new_names)
        [None, 'a', None, 'b', None]

    Example:
        >>> part_oldnames = [[], ['b'], ['a', 'b', 'c'], ['b', 'c'], ['c', 'e', 'e']]
        >>> new_names = find_consistent_labeling(part_oldnames)
        >>> result = ub.repr2(new_names)
        >>> print(new_names)
        ['_extra_name0', 'b', 'a', 'c', 'e']

        Profit Matrix
            b   a   c   e  _0
        0 -10 -10 -10 -10   1
        1   2 -10 -10 -10   1
        2   2   2   2 -10   1
        3   2 -10   2 -10   1
        4 -10 -10   2   3   1
    """
    unique_old_names = list(ub.unique(ub.flatten(part_oldnames)))
    num_new_names = len(part_oldnames)
    num_old_names = len(unique_old_names)

    # Create padded dummy values.  This accounts for the case where it is
    # impossible to uniquely map to the old db
    num_pad = max(num_new_names - num_old_names, 0)
    total = num_old_names + num_pad
    shape = (total, total)

    # Allocate assignment matrix.
    # rows are new-names and cols are old-names.
    # Initially the profit of any assignment is effectively -inf
    # This effectively marks all assignments as invalid
    profit_matrix = np.full(shape, -2 * total, dtype=int)
    # Overwrite valid assignments with positive profits
    from graphid import util
    oldname2_idx = util.make_index_lookup(unique_old_names)
    name_freq_list = [ub.dict_hist(names) for names in part_oldnames]
    # Initialize profit of a valid assignment as 1 + freq
    # This incentivizes using a previously used name
    for rowx, name_freq in enumerate(name_freq_list):
        for name, freq in name_freq.items():
            colx = oldname2_idx[name]
            profit_matrix[rowx, colx] = freq + 1
    # Set a much smaller profit for using an extra name
    # This allows the solution to always exist
    profit_matrix[:, num_old_names:total] = 1

    # Convert to minimization problem
    big_value = (profit_matrix.max()) - (profit_matrix.min())
    cost_matrix = big_value - profit_matrix

    # Use scipy implementation of munkres algorithm.
    rx2_cx = dict(zip(*scipy.optimize.linear_sum_assignment(cost_matrix)))

    # Each row (new-name) has now been assigned a column (old-name)
    # Map this back to the input-space (using None to indicate extras)
    cx2_name = dict(enumerate(unique_old_names))

    if False:
        import pandas as pd
        columns = unique_old_names + ['_%r' % x for x in range(num_pad)]
        print('Profit Matrix')
        print(pd.DataFrame(profit_matrix, columns=columns))

        print('Cost Matrix')
        print(pd.DataFrame(cost_matrix, columns=columns))

    assignment_ = [cx2_name.get(rx2_cx[rx], None)
                   for rx in range(num_new_names)]
    return assignment_
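
The profit-matrix construction in the docstring can feel abstract, so here is a tiny sketch of just the profit-to-cost flip and the assignment step on a hand-written 3x3 matrix; the values are illustrative, not derived from real annotations.

import numpy as np
import scipy.optimize

profit = np.array([
    [2, 2, 1],   # row = new name (PCC), col = old name (last col = padded)
    [3, -6, 1],
    [-6, 2, 1],
])
# linear_sum_assignment minimizes, so flip profit into a nonnegative cost
cost = (profit.max() - profit.min()) - profit
rows, cols = scipy.optimize.linear_sum_assignment(cost)
print(list(zip(rows, cols)))  # each row gets a column so total profit is maximal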
Example #26
0
def naive_password_strategy(required_len=14,
                            required_caps=1,
                            required_special=1,
                            required_digits=1):
    """
    Simulate a "bad" password that meets typical password requirements

    Get a naive version of the N char min special char password One
    common strategy for getting a 14 char pass is using 2 words or a word
    and a date with misspellings, shuffled case, and a special char,
    which is probably _, -, ., !, or @

    Example:
        scheme = naive_password_strategy()
        print(f'scheme={scheme}')
    """
    # When people are forced to include a special character, this is the
    # likelihood they choose one of the following:
    # https://www.reddit.com/r/dataisbeautiful/comments/2vfgvh/most_frequentlyused_special_characters_in_10/
    special_char_freq = {
        '_': 0.332,
        '.': 0.304,
        '-': 0.086,
        '!': 0.065,
        '@': 0.052,
        '*': 0.032,
        '$': 0.019,
        '&': 0.009,
        '%': 0.007,
    }
    _total = sum(special_char_freq.values())
    special_char_prob = ub.map_vals(lambda x: x / _total, special_char_freq)

    # Only search the most likely special chars
    naive_special_chars = {
        k: v
        for k, v in special_char_prob.items() if v > 0.05
    }

    if 0:
        import diceware
        wlpath = diceware.wordlist.get_wordlist_path('en')
        wlpath = diceware.wordlist.get_wordlist_path('en_securedrop')
        wordlist = list(diceware.wordlist.WordList(wlpath))
        word_lengths = sorted(map(len, wordlist))
        word_length_hist = ub.dict_hist(word_lengths)
    else:
        # Number of common password words with a specific length
        word_length_hist = {
            1: 10,
            2: 90,
            3: 582,
            4: 2279,
            5: 3350,
            6: 1313,
            7: 539,
            8: 22,
            9: 5,
            10: 2
        }

    # Also needs a number and special char
    required_word_len = required_len - 2

    # How many permutations of N words are there that get over the char limit?
    total_passwords = 0
    import itertools as it
    import functools
    import operator as op
    possible_num_word = [1, 2, 3]
    for num_words in possible_num_word:
        for ts in it.product(*[word_length_hist.items()] * num_words):
            ks = [k for k, v in ts]
            vs = [v for k, v in ts]
            # If the combined length clears the requirement, we can take any of
            # (with replacement)
            if sum(ks) > required_word_len:
                # Compute the number of phrases, then augment this with the
                # special properties.
                num_phrases = functools.reduce(op.mul, vs)

                # People might insert a special character at the start, middle,
                # or end, or predictably replace a letter.
                predictability_factor = 2
                num_special_locs = (num_words + 1) * predictability_factor
                special_factor = required_special * len(
                    naive_special_chars) * num_special_locs

                # People might insert a digit at start, middle, or end, or maybe
                # inside of a word replacing a common letter.
                num_digit_locs = num_words + 1
                num_digits = 10 + 100  # usually a 1 or 2 digit number
                digit_factor = required_digits * num_digits * num_digit_locs

                # People might only shuffle the case of 1 or 2 letters.
                # usually at the beginning of words
                caps_factor = required_caps * num_words

                total = (num_phrases * (1 + special_factor) *
                         (1 + caps_factor) * (1 + digit_factor))
                total_passwords += total

    name_parts = ['naive', str(required_len)]
    if required_caps:
        name_parts.append('caps')

    if required_digits:
        name_parts.append('digit')

    if required_special:
        name_parts.append('special')

    name = '-'.join(name_parts)

    scheme = {
        'name': name,
        'num': 1,
        'base': total_passwords,
    }
    return scheme
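
The returned scheme dict is presumably consumed elsewhere to compare strategies; assuming 'num' * 'base' is the total number of candidate passwords, the following sketch turns it into entropy bits and an average brute-force time. The guess rate is a made-up assumption, and the function above must be in scope.

import math

scheme = naive_password_strategy()  # the function defined above
total_passwords = scheme['num'] * scheme['base']
entropy_bits = math.log2(total_passwords)

guesses_per_second = 1e10   # made-up attacker speed, for illustration only
avg_seconds = (total_passwords / 2) / guesses_per_second
print('~{:.1f} bits, ~{:.1f} days to brute force on average'.format(
    entropy_bits, avg_seconds / 86400))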
Example #27
0
File: expt.py  Project: Kitware/VIAME
def compare_results():
    print('Comparing results')
    import pandas as pd
    from tabulate import tabulate

    # Read in output of demo script
    measure_fpath = 'measurements_haul83.csv'
    py_df = pd.read_csv(measure_fpath, index_col=None)
    # Convert python length output from mm into cm for consistency
    py_df['fishlen'] = py_df['fishlen'] / 10
    py_df['current_frame'] = py_df['current_frame'].astype(int)

    # janky CSV parsing
    py_df['box_pts1'] = py_df['box_pts1'].map(lambda p: eval(p.replace(';', ','), np.__dict__))
    py_df['box_pts2'] = py_df['box_pts2'].map(lambda p: eval(p.replace(';', ','), np.__dict__))

    py_df['obox1'] = [ctalgo.OrientedBBox(*cv2.minAreaRect(pts[:, None, :].astype(int)))
                      for pts in py_df['box_pts1']]
    py_df['obox2'] = [ctalgo.OrientedBBox(*cv2.minAreaRect(pts[:, None, :].astype(int)))
                      for pts in py_df['box_pts2']]
    py_df.drop(['box_pts1', 'box_pts2'], axis=1, inplace=True)

    # Remap to matlab names
    py_df = py_df.rename(columns={
        'error': 'Err',
        'fishlen': 'fishLength',
        'range': 'fishRange',
    })

    # Load matlab results
    mat_df = _read_kresimir_results()

    FORCE_COMPARABLE_RANGE = True
    # FORCE_COMPARABLE_RANGE = False
    if FORCE_COMPARABLE_RANGE:
        # Be absolutely certain we are in comparable regions (may slightly bias
        # results, against python and in favor of matlab)
        min_frame = max(mat_df.current_frame.min(), py_df.current_frame.min())
        max_frame = min(mat_df.current_frame.max(), py_df.current_frame.max())
        print('min_frame = {!r}'.format(min_frame))
        print('max_frame = {!r}'.format(max_frame))

        mat_df = mat_df[(mat_df.current_frame >= min_frame) &
                        (mat_df.current_frame <= max_frame)]
        py_df = py_df[(py_df.current_frame >= min_frame) &
                      (py_df.current_frame <= max_frame)]

    intersect_frames = np.intersect1d(mat_df.current_frame, py_df.current_frame)
    print('intersecting frames = {} / {} (matlab)'.format(
        len(intersect_frames), len(set(mat_df.current_frame))))
    print('intersecting frames = {} / {} (python)'.format(
        len(intersect_frames), len(set(py_df.current_frame))))

    #  Reuse the hungarian algorithm implementation from ctalgo
    min_assign = ctalgo.FishStereoMeasurments.minimum_weight_assignment

    correspond = []
    for f in intersect_frames:
        pidxs = np.where(py_df.current_frame == f)[0]
        midxs = np.where(mat_df.current_frame == f)[0]

        pdf = py_df.iloc[pidxs]
        mdf = mat_df.iloc[midxs]

        ppts1 = np.array([o.center for o in pdf['obox1']])
        mpts1 = np.array([o.center for o in mdf['obox1']])

        ppts2 = np.array([o.center for o in pdf['obox2']])
        mpts2 = np.array([o.center for o in mdf['obox2']])

        dists1 = sklearn.metrics.pairwise.pairwise_distances(ppts1, mpts1)
        dists2 = sklearn.metrics.pairwise.pairwise_distances(ppts2, mpts2)

        # arbitrarily chosen threshold
        thresh = 100
        for i, j in min_assign(dists1):
            d1 = dists1[i, j]
            d2 = dists2[i, j]
            if d1 < thresh and d2 < thresh and abs(d1 - d2) < thresh / 4:
                correspond.append((pidxs[i], midxs[j]))
    correspond = np.array(correspond)

    # pflags = np.array(ub.boolmask(correspond.T[0], len(py_df)))
    mflags = np.array(ub.boolmask(correspond.T[1], len(mat_df)))
    # print('there are {} detections that seem to be in common'.format(len(correspond)))
    # print('The QC flags of the common detections are:       {}'.format(
    #     ub.dict_hist(mat_df[mflags]['QC'].values)))
    # print('The QC flags of the other matlab detections are: {}'.format(
    #     ub.dict_hist(mat_df[~mflags]['QC'].values)))

    print('\n\n----\n## All stats\n')
    print(ub.codeblock(
        '''
        Overall, the matlab script made {nmat} length measurements and the
        python script made {npy} length measurements.  Here is a table
        summarizing the average lengths / ranges / errors of each script:
        ''').format(npy=len(py_df), nmat=len(mat_df)))
    stats = pd.DataFrame(columns=['python', 'matlab'])
    for key in ['fishLength', 'fishRange', 'Err']:
        stats.loc[key, 'python'] = '{:6.2f} ± {:6.2f}'.format(py_df[key].mean(), py_df[key].std())
        stats.loc[key, 'matlab'] = '{:6.2f} ± {:6.2f}'.format(mat_df[key].mean(), mat_df[key].std())

    stats.loc['nTotal', 'python'] = '{}'.format(len(py_df))
    stats.loc['nTotal', 'matlab'] = '{}'.format(len(mat_df))
    print(tabulate(stats, headers='keys', tablefmt='psql', stralign='right'))

    print('\n\n----\n## Only COMMON detections\n')
    py_df_c = py_df.iloc[correspond.T[0]]
    mat_df_c = mat_df.iloc[correspond.T[1]]
    stats = pd.DataFrame(columns=['python', 'matlab'])
    for key in ['fishLength', 'fishRange', 'Err']:
        stats.loc[key, 'python'] = '{:6.2f} ± {:6.2f}'.format(py_df_c[key].mean(), py_df_c[key].std())
        stats.loc[key, 'matlab'] = '{:6.2f} ± {:6.2f}'.format(mat_df_c[key].mean(), mat_df_c[key].std())

    stats.loc['nTotal', 'python'] = '{}'.format(len(py_df_c))
    stats.loc['nTotal', 'matlab'] = '{}'.format(len(mat_df_c))

    print(ub.codeblock(
        '''
        Now, we investigate how many detections matlab and python made in common.
        (Note, choosing which detections in one version correspond to which in
         another is done using a heuristic based on distances between bbox
         centers and a thresholded minimum assignment problem).

        Python made {npy_c}/{nmat} = {percent:.2f}% of the detections matlab made

        ''').format(npy_c=len(py_df_c), nmat=len(mat_df),
                    percent=100 * len(py_df_c) / len(mat_df)))
    print(tabulate(stats, headers='keys', tablefmt='psql', stralign='right'))

    print('\n\n----\n## Evaluation using the QC code\n')
    hist_hit = ub.dict_hist(mat_df[mflags]['QC'].values)
    hist_miss = ub.dict_hist(mat_df[~mflags]['QC'].values)
    print(ub.codeblock(
        '''
        However, not all of those matlab detections were good. Because we have
        detections in correspondence with each other, we can assign the python
        detections QC codes.

        Here is a histogram of the QC codes for these python detections:
        {}
        (Note: read histogram as <QC-code>: <frequency>)

        Here is a histogram of the other matlab detections that python did not
        find:
        {}

        To summarize:
            python correctly rejected {:.2f}% of the matlab QC=0 detections
            python correctly accepted {:.2f}% of the matlab QC=1 detections
            python correctly accepted {:.2f}% of the matlab QC=2 detections

            Note that, because python made detections that matlab did not make,
            the remaining {} detections may be right or wrong, but there is
            no way to tell from this analysis.

        Lastly, here are the statistics for the common detections that had a
        non-zero QC code.
        ''').format(
            ub.repr2(hist_hit, nl=1),
            ub.repr2(hist_miss, nl=1),
            100 * hist_miss[0] / (hist_hit[0] + hist_miss[0]),
            100 * hist_hit[1] / (hist_hit[1] + hist_miss[1]),
            100 * hist_hit[2] / (hist_hit[2] + hist_miss[2]),
            len(py_df) - len(py_df_c)
                   )
    )

    is_qc = (mat_df_c['QC'] > 0).values
    mat_df_c = mat_df_c[is_qc]
    py_df_c = py_df_c[is_qc]
    stats = pd.DataFrame(columns=['python', 'matlab'])
    for key in ['fishLength', 'fishRange', 'Err']:
        stats.loc[key, 'python'] = '{:6.2f} ± {:6.2f}'.format(py_df_c[key].mean(), py_df_c[key].std())
        stats.loc[key, 'matlab'] = '{:6.2f} ± {:6.2f}'.format(mat_df_c[key].mean(), mat_df_c[key].std())

    stats.loc['nTotal', 'python'] = '{}'.format(len(py_df_c))
    stats.loc['nTotal', 'matlab'] = '{}'.format(len(mat_df_c))
    print(tabulate(stats, headers='keys', tablefmt='psql', stralign='right'))