def _print_previous_loop_statistics(infr, count):
    """
    Print statistics about what happened during the most recent loop.

    Summarizes the last ``count`` entries of ``infr.metrics_list``: how often
    recovery mode was entered, and histograms of the test actions, inference
    actions (only when ``infr.params['inference.enabled']`` is truthy),
    predicted decisions and user ids. Everything is reported through
    ``infr.print``.

    Args:
        infr: object providing ``metrics_list``, ``params`` and ``print``.
        count (int): number of trailing history entries to summarize.
    """
    # ``lst[-0:]`` selects the whole list, so a zero count needs a guard to
    # really mean "no entries".
    history = infr.metrics_list[-count:] if count else []

    # Run-length encode the 'recovering' column: one (flag, length) per run.
    run_flags = []
    run_sizes = []
    for flag, run in it.groupby(util.take_column(history, 'recovering')):
        run_flags.append(flag)
        run_sizes.append(sum(1 for _ in run))
    # ub.group_items takes the items and their group ids as two separate
    # arguments; it does not accept a single list of (key, value) pairs.
    recover_blocks = ub.group_items(run_sizes, run_flags).get(True, [])

    infr.print(
        ('Recovery mode entered {} times, '
         'made {} recovery decisions.').format(
             len(recover_blocks), sum(recover_blocks)), color='green')

    testaction_hist = ub.dict_hist(util.take_column(history, 'test_action'))
    infr.print('Test Action Histogram: {}'.format(
        ub.repr2(testaction_hist, si=True)), color='yellow')

    if infr.params['inference.enabled']:
        # Actions are converted to frozensets so they can be dict keys.
        action_hist = ub.dict_hist(
            util.emap(frozenset, util.take_column(history, 'action')))
        infr.print('Inference Action Histogram: {}'.format(
            ub.repr2(action_hist, si=True)), color='yellow')

    infr.print('Decision Histogram: {}'.format(
        ub.repr2(ub.dict_hist(util.take_column(history, 'pred_decision')),
                 si=True)), color='yellow')
    infr.print('User Histogram: {}'.format(
        ub.repr2(ub.dict_hist(util.take_column(history, 'user_id')),
                 si=True)), color='yellow')
def test_dict_hist_ordered():
    """
    Check that ``ub.dict_hist`` counts the same items regardless of whether
    ``ordered`` is requested (regression test for the ordered=True bug).
    """
    import random
    import string
    import ubelt as ub

    rng = random.Random(0)
    letters = []
    for _ in range(100):
        letters.append(rng.choice(string.ascii_letters))

    # Compare the histograms independent of their iteration order
    hist_ordered = ub.dict_hist(letters, ordered=True)
    hist_unordered = ub.dict_hist(letters, ordered=False)
    assert sorted(hist_ordered.items()) == sorted(hist_unordered.items())
def generate_phase1_data_tables():
    """
    Print summary statistics for every phase1 coarse bbox-keypoint dataset.

    Each matching annotation file is loaded as a ``CocoDataset``, sanity
    checked, and summarized (annotation / image counts, annotations per
    populated category, annotations per roi shape, and the breakdown of the
    ``has_annots`` image flag). The combined table is printed at the end.
    """
    annot_pattern = ub.truepath(
        '~/data/viame-challenge-2018/phase1-annotations/*/*coarse*bbox-keypoint*.json'
    )
    cfg = viame_wrangler.config.WrangleConfig({'annots': annot_pattern})

    fpaths = list(glob.glob(cfg.annots))
    print('fpaths = {}'.format(ub.repr2(fpaths)))

    all_stats = {}
    for fpath in fpaths:
        # The dataset name is the part of the file name before the first dash
        dset_name = os.path.basename(fpath).split('-')[0]
        dset = CocoDataset(fpath, img_root=cfg.img_root, tag=dset_name)

        # The data is expected to already be clean
        assert not dset.missing_images()
        assert not dset._find_bad_annotations()
        valid_flags = [True, False, None]
        assert all([
            img['has_annots'] in valid_flags
            for img in dset.imgs.values()
        ])

        print(ub.dict_hist([g['has_annots'] for g in dset.imgs.values()]))

        stats = {}
        stats.update(ub.dict_subset(dset.basic_stats(), ['n_anns', 'n_imgs']))

        populated_cats = {}
        roi_shapes_hist = {}
        type_freq = dset.category_annotation_type_frequency()
        for catname, type_hist in type_freq.items():
            if not type_hist:
                # Skip categories without annotations
                continue
            populated_cats[catname] = sum(type_hist.values())
            for roi_shape, num in type_hist.items():
                roi_shapes_hist[roi_shape] = roi_shapes_hist.get(roi_shape, 0) + num

        stats['n_cats'] = populated_cats
        stats['n_roi_shapes'] = roi_shapes_hist

        flag_to_label = {
            None: 'unsure',
            True: 'has_objects',
            False: 'no_objects'
        }
        flag_hist = ub.dict_hist([g['has_annots'] for g in dset.imgs.values()])
        stats['n_imgs_with_annots'] = ub.map_keys(flag_to_label, flag_hist)

        all_stats[dset_name] = stats

    print(ub.repr2(all_stats, nl=3, sk=1))
def regenerate_phase1_flavors():
    """
    Rebuild the dataset flavors from every ``original_*.json`` phase1 file.

    Assumes original data is in a good format; this is asserted before
    ``make_dataset_flavors`` is called for each dataset.
    """
    annot_pattern = ub.truepath(
        '~/data/viame-challenge-2018/phase1-annotations/*/original_*.json')
    cfg = viame_wrangler.config.WrangleConfig({'annots': annot_pattern})

    fpaths = list(glob.glob(cfg.annots))

    print('Reading raw mscoco files')
    for fpath in fpaths:
        print('reading fpath = {!r}'.format(fpath))
        # Strip the "original_" prefix and everything after the first dot
        fname = os.path.basename(fpath)
        dset_name = fname.replace('original_', '').split('.')[0]
        orig_dset = CocoDataset(fpath, img_root=cfg.img_root, tag=dset_name)
        dpath = os.path.dirname(fpath)

        # Sanity checks on the input data
        assert not orig_dset.missing_images()
        assert not orig_dset._find_bad_annotations()
        allowed_flags = [True, False, None]
        assert all([
            img['has_annots'] in allowed_flags
            for img in orig_dset.imgs.values()
        ])

        flag_hist = ub.dict_hist(
            [g['has_annots'] for g in orig_dset.imgs.values()])
        print(flag_hist)

        make_dataset_flavors(orig_dset, dpath, dset_name)
def verbose_dump(dset, fpath):
    """
    Announce and write ``dset`` to ``fpath`` via ``dset.dump``.

    The extra statistics printout is currently disabled.
    """
    print('Dumping {}'.format(fpath))
    show_stats = False  # flip on to print dataset statistics before dumping
    if show_stats:
        type_freq = dset.category_annotation_type_frequency()
        print(ub.repr2(type_freq, nl=1, sk=1))
        flags = [img['has_annots'] for img in dset.imgs.values()]
        print(ub.dict_hist(flags))
        print(ub.repr2(dset.basic_stats()))
    dset.dump(fpath)
def from_data(xpu, item, **kwargs):
    """
    Creates an XPU to represent the processing device a Tensor or Variable
    is on

    Args:
        item: either an object exposing ``is_cuda`` / ``get_device`` (e.g. a
            tensor) or an object exposing ``state_dict()`` (e.g. a module).
        **kwargs: accepted for interface compatibility; not used here.

    Returns:
        XPU: for tensor-like items, the device the item is on. For items
            with a state dict, the device holding the largest number of
            state entries. Items with an empty state dict are treated like
            non-cuda data, i.e. ``XPU(None)``.

    Raises:
        TypeError: if ``item`` has neither ``is_cuda`` nor ``state_dict``.

    Example:
        >>> xpu = XPU.from_data(torch.randn(3))
        >>> assert not xpu.is_gpu()
        >>> if torch.cuda.is_available():
        >>>     xpu = XPU.from_data(torch.randn(3).cuda())
        >>>     assert xpu.is_gpu()
        >>>     for i in range(torch.cuda.device_count()):
        >>>         xpu = XPU.from_data(torch.randn(3).cuda(i))
        >>>         assert xpu.is_gpu()
        >>>         assert xpu.main_device == i
    """
    if hasattr(item, 'is_cuda'):
        if item.is_cuda:
            return XPU(item.get_device())
        else:
            return XPU(None)
    elif hasattr(item, 'state_dict'):
        state_dict = item.state_dict()
        # Count how many state entries live on each device (None = not cuda)
        hist = ub.dict_hist(v.get_device() if v.is_cuda else None
                            for v in state_dict.values())
        if not hist:
            # Nothing in the state dict: there is no device to vote for, and
            # indexing the empty argsort result would raise IndexError.
            return XPU(None)
        # argsort orders the devices by count; the last is the most common
        device_num = ub.argsort(hist)[-1]
        return XPU(device_num)
    else:
        raise TypeError(type(item))
def _generate_abs(n):
    """
    Randomly propose boxes and keep those that barely overlap annotations.

    Draws ``n`` image ids (with replacement semantics decided by
    ``rng.choice``), generates as many random window-sized boxes per image
    as the image was drawn, and keeps the boxes whose maximum overlap is
    below ``thresh`` (or that overlap nothing at all).

    Returns:
        Tuple[list, list]: the image id and raw box data of each kept box.
    """
    # Randomly choose images to generate boxes for
    sampled_gids = np.array(sorted(rng.choice(all_gids, size=n)))

    neg_gids, neg_boxes = [], []
    for gid, num in ub.dict_hist(sampled_gids).items():
        tree = self.qtrees[gid]
        img_dims = (tree.width, tree.height)
        # Window size relative to the image dimensions
        rel_anchors = np.array([window_size]) / np.array(img_dims)
        if np.any(rel_anchors > 1.0):
            # The relative window exceeds 1 along some dimension
            continue

        candidates = kwimage.Boxes.random(
            num=num, scale=1.0, format='tlbr', anchors=rel_anchors,
            anchor_std=0, rng=rng).scale(img_dims)

        for box in candidates:
            # isect_aids, overlaps = self.ious(gid, box)
            isect_aids, overlaps = self.iooas(gid, box)
            is_negative = len(overlaps) == 0 or overlaps.max() < thresh
            if is_negative:
                neg_gids.append(gid)
                neg_boxes.append(box.data)

    return neg_gids, neg_boxes
def motherboard_info():
    """
    Parse and print the ``dmidecode -t 9`` slot report.

    REQUIRES SUDO

    xdoctest -m ~/misc/notes/buildapc.py motherboard_info
    """
    import re
    info = ub.cmd('sudo dmidecode -t 9')

    pcie_slots = []
    for chunk in info['out'].split('\n\n'):
        # Each blank-line separated chunk is parsed into one key/value record
        record = {}
        for line in chunk.split('\n'):
            # doesn't get all data correctly (e.g. characteristics)
            parts = re.split('\t*:', line, maxsplit=1)
            if len(parts) != 2:
                continue
            key = parts[0].strip()
            val = parts[1].strip()
            if key in record:
                raise KeyError(f'key={key} already exists')
            record[key] = val
        if record:
            pcie_slots.append(ub.map_keys(slugify_key, record))

    # The summaries below are computed but not printed by this function
    pcie_usage = ub.dict_hist(slot['current_usage'] for slot in pcie_slots)

    _varied = ub.map_keys(
        slugify_key, varied_values(pcie_slots, min_variations=0))
    unvaried = {}
    varied = {}
    for key, vals in _varied.items():
        if len(vals) == 1:
            unvaried[key] = ub.peek(vals)
        if len(vals) > 1:
            varied[key] = vals

    print(info['out'])
def graph_info(graph, ignore=None, stats=False, verbose=False):
    """
    Summarize the structure and attributes of a graph.

    Args:
        graph: graph object providing ``nodes``, ``edges``, ``is_directed``,
            ``is_multigraph``, ``graph`` and ``name``.
        ignore (list | None): attribute keys removed from the attribute
            summaries via ``util.delete_dict_keys``.
        stats (bool): if True, the attribute summaries are the
            ``DataFrame.describe()`` statistics of each attribute;
            otherwise they count how many nodes / edges carry each
            attribute key.
        verbose (bool): if True, the summary is also printed.

    Returns:
        OrderedDict: summary with the keys ``directed``, ``multi``,
            ``num_nodes``, ``num_edges``, ``edge_attr_hist``,
            ``node_attr_hist``, ``node_type_hist``, ``graph_attrs`` and
            ``graph_name``.
    """
    from graphid import util
    import pandas as pd

    node_dict = graph.nodes
    node_attrs = list(node_dict.values())
    edge_attrs = list(take_column(graph.edges(data=True), 2))

    if stats:
        node_df = pd.DataFrame(node_attrs)
        edge_df = pd.DataFrame(edge_attrs)
        if ignore is not None:
            util.delete_dict_keys(node_df, ignore)
            util.delete_dict_keys(edge_df, ignore)
        # Not really histograms anymore
        try:
            node_attr_hist = node_df.describe().to_dict()
        except ValueError:
            # Nothing to describe: fall back to an empty summary. (This
            # handler previously evaluated the unbound name instead of
            # assigning to it, which raised UnboundLocalError.)
            node_attr_hist = {}
        try:
            edge_attr_hist = edge_df.describe().to_dict()
        except ValueError:
            edge_attr_hist = {}
        key_order = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
        node_attr_hist = ub.map_dict_vals(
            lambda x: util.order_dict_by(x, key_order), node_attr_hist)
        edge_attr_hist = ub.map_dict_vals(
            lambda x: util.order_dict_by(x, key_order), edge_attr_hist)
    else:
        # Count how many nodes / edges define each attribute key
        node_attr_hist = ub.dict_hist(
            ub.flatten([attr.keys() for attr in node_attrs]))
        edge_attr_hist = ub.dict_hist(
            ub.flatten([attr.keys() for attr in edge_attrs]))
        if ignore is not None:
            util.delete_dict_keys(edge_attr_hist, ignore)
            util.delete_dict_keys(node_attr_hist, ignore)

    node_type_hist = ub.dict_hist(list(map(type, graph.nodes())))
    info_dict = ub.odict([
        ('directed', graph.is_directed()),
        ('multi', graph.is_multigraph()),
        ('num_nodes', len(graph)),
        ('num_edges', len(list(graph.edges()))),
        ('edge_attr_hist', util.sort_dict(edge_attr_hist)),
        ('node_attr_hist', util.sort_dict(node_attr_hist)),
        ('node_type_hist', util.sort_dict(node_type_hist)),
        ('graph_attrs', graph.graph),
        ('graph_name', graph.name),
    ])
    if verbose:
        print(ub.repr2(info_dict))
    return info_dict
def _balance_report(self, limit=None):
    """
    Print how often labels and item indices occur over up to ``limit``
    batches (defaults to ``self.num_batches``).

    Labels are counted once per batch they appear in; for the indices only
    the five most frequent entries are printed.
    """
    if limit is None:
        limit = self.num_batches

    seen_labels = []
    seen_indices = []
    for batch_indices, _ in zip(self, range(limit)):
        batch_indices = np.array(batch_indices)
        batch_labels = list(ub.take(self.index_to_label, batch_indices))
        seen_indices.extend(batch_indices)
        # Each label contributes at most once per batch
        seen_labels.extend(ub.unique(batch_labels))

    label_hist = ub.sorted_vals(ub.dict_hist(seen_labels), reverse=True)

    index_hist = ub.sorted_vals(ub.dict_hist(seen_indices), reverse=True)
    top_indices = list(index_hist.keys())[0:5]
    index_hist = ub.dict_subset(index_hist, top_indices)

    print('label_hist = {}'.format(ub.repr2(label_hist, nl=1)))
    print('index_hist = {}'.format(ub.repr2(index_hist, nl=1)))
def sample(self, *shape):
    """
    Sampling from a mixture of k distributions with weights w_k is
    equivalent to picking a distribution with probability w_k, and then
    sampling from the picked distribution.

    Args:
        *shape: requested sample dimensions, forwarded unchanged to
            ``self._idx_choice.sample``.

    Returns:
        ndarray: float array shaped like the drawn component indices, where
            each entry is sampled from the component chosen for it.
    """
    # Choose which distributions are picked for each sample
    idxs = self._idx_choice.sample(*shape)
    idx_to_nsamples = ub.dict_hist(idxs.ravel())
    # Allocate the output from the shape of the drawn indices. The previous
    # ``np.zeros(*shape)`` passed the second dimension as the dtype argument
    # and therefore failed for any multi-dimensional request. The boolean
    # masks below require ``out`` and ``idxs`` to have the same shape anyway.
    out = np.zeros(np.shape(idxs))
    for idx, n in idx_to_nsamples.items():
        # Sample from the distribution we picked
        mask = (idx == idxs)
        subsample = self.pdfs[idx].sample(n)
        out[mask] = subsample
    return out
def category_annotation_type_frequency(self):
    """
    Reports the number of annotations of each type for each category

    Returns:
        dict: maps each category name to a histogram of the annotation
            types (as computed by ``annot_type``) of its annotations.
            Histogram keys of length one are replaced by their only element.

    Example:
        >>> dataset = demo_coco_data()
        >>> self = CocoDataset(dataset, tag='demo')
        >>> hist = self.category_annotation_type_frequency()
        >>> print(ub.repr2(hist))
    """
    catname_to_nannot_types = {}
    for cid, aids in self.cid_to_aids.items():
        name = self.cats[cid]['name']
        hist = ub.dict_hist(map(annot_type, ub.take(self.anns, aids)))
        # Unpack single-element type keys for readability
        catname_to_nannot_types[name] = ub.map_keys(
            lambda k: k[0] if len(k) == 1 else k, hist)
    return catname_to_nannot_types
def __getitem__(self, index):
    """
    Build one batch of item indices.

    A label is drawn for every slot of the batch and then each drawn label's
    subsampler provides as many indices as the label was drawn. ``index``
    is not used.

    Returns:
        list: the sampled item indices, grouped by label.
    """
    # Choose a label for each item in the batch
    if hasattr(self.rng, 'choices'):
        chosen_labels = self.rng.choices(self.labels, k=self.batch_size)
    else:
        # python 3.5 support
        chosen_labels = []
        for _ in range(self.batch_size):
            chosen_labels.append(self.rng.choice(self.labels))

    # Count the number of items we need for each label
    label_freq = ub.dict_hist(chosen_labels)

    # Sample those indices
    batch_idxs = []
    for label, num in label_freq.items():
        subsampler = self.label_to_subsampler[label]
        batch_idxs.extend(subsampler.sample(num))
    return batch_idxs
def make_optimizer(hyper, named_parameters):
    """
    Instantiate the optimizer defined by the hyperparams

    Contains special logic to create param groups

    Args:
        named_parameters (Iterable[Tuple[str, Parameter]]): the
            ``(name, parameter)`` pairs the optimizer should manage.

    Returns:
        the pre-built optimizer instance if one was given, otherwise a new
        ``hyper.optimizer_cls`` instance.

    Raises:
        TypeError: if the 'params' specification, or one of the group
            specifications inside it, has an unsupported type.

    Example:
        >>> import netharn as nh
        >>> config = {'optimizer': 'sgd', 'params': [
        >>>     {'lr': 3e-3, 'params': '.*\\.bias'},
        >>>     {'lr': 1e-3, 'params': '.*\\.weight'},
        >>>     #{'lr': 100, 'params': '.*\\.doesnotmatch'},
        >>> ]}
        >>> optim_ = nh.api.Optimizer.coerce(config)
        >>> hyper = nh.HyperParams(optimizer=optim_)
        >>> model = nh.models.ToyNet1d()
        >>> named_parameters = list(model.named_parameters())
        >>> optimizer = hyper.make_optimizer(named_parameters)
        >>> print('optimizer = {!r}'.format(optimizer))
    """
    if hyper._optimizer_info['instance'] is not None:
        return hyper._optimizer_info['instance']
    # What happens if we want to group parameters
    optim_kw = hyper.optimizer_params.copy()
    params = optim_kw.pop('params', None)
    if params is None:
        param_groups = [p for (name, p) in named_parameters]
    else:
        import re
        named_parameters = list(named_parameters)
        name_to_param = dict(named_parameters)
        param_groups = []
        if isinstance(params, dict):
            # remember the group key
            groups = [{'key': k, **g} for k, g in params.items()]
        elif isinstance(params, list):
            groups = params
        else:
            # Previously fell through and failed later with an unbound
            # ``groups`` variable.
            raise TypeError(type(params))

        PREVENT_DUPLICATES = 1
        seen_ = set()

        for group in groups:
            # Transform param grouping specifications into real params
            group = group.copy()
            spec = group.pop('params')
            if isinstance(spec, list):
                if len(spec):
                    first = ub.peek(spec)
                    if isinstance(first, str):
                        real_params = [name_to_param[k] for k in spec]
                    elif isinstance(first, torch.nn.Parameter):
                        real_params = spec
                    else:
                        raise TypeError(type(first))
                else:
                    real_params = []
            # Python 3.6 doesn't have re.Pattern
            elif isinstance(spec, str) or hasattr(spec, 'match'):
                if hasattr(spec, 'match'):
                    pat = spec
                else:
                    pat = re.compile(spec)
                real_params = [p for name, p in name_to_param.items()
                               if pat.match(name)]
            else:
                raise TypeError(type(spec))

            if PREVENT_DUPLICATES:
                # give priority to earlier params
                # This is Python 3.6+ only
                real_params = list(ub.oset(real_params) - seen_)
                seen_.update(real_params)

            group['params'] = real_params
            param_groups.append(group)

        CHECK = 1
        if CHECK:
            # Determine if we are using the same param more than once
            # or if we are not using a param at all.
            # NOTE: torch does do a duplicate check.
            param_group_ids = []
            for group in param_groups:
                ids = list(map(id, group['params']))
                param_group_ids.append(ids)

            all_param_ids = [id(p) for n, p in named_parameters]
            flat_ids = list(ub.flatten(param_group_ids))
            freq = ub.dict_hist(flat_ids, labels=all_param_ids)
            # Count the offending params so the warnings report real numbers
            # (``any`` only produced True / False here).
            num_unused = sum(v == 0 for v in freq.values())
            num_dups = sum(v > 1 for v in freq.values())
            if num_unused:
                warnings.warn(
                    'There are {} unused params'.format(num_unused))
            if num_dups:
                warnings.warn(
                    'There are {} duplicate params'.format(num_dups))

    optimizer = hyper.optimizer_cls(param_groups, **optim_kw)
    return optimizer
def convert_camvid_raw_to_coco(camvid_raw_info):
    """
    Converts the raw camvid format to an MSCOCO based format, (which lets us
    use kwcoco's COCO backend).

    Args:
        camvid_raw_info (dict): must contain the keys ``dset_root``,
            ``img_paths``, ``label_path`` and ``mask_paths``.

    Returns:
        kwcoco.CocoDataset: dataset with one category per label-file line,
            one image per image path, and one annotation per
            (image, category) pair found in the image's label mask.

    Example:
        >>> # xdoctest: +REQUIRES(--download)
        >>> camvid_raw_info = grab_raw_camvid()
        >>> # test with a reduced set of data
        >>> del camvid_raw_info['img_paths'][2:]
        >>> del camvid_raw_info['mask_paths'][2:]
        >>> dset = convert_camvid_raw_to_coco(camvid_raw_info)
        >>> # xdoctest: +REQUIRES(--show)
        >>> import kwplot
        >>> plt = kwplot.autoplt()
        >>> kwplot.figure(fnum=1, pnum=(1, 2, 1))
        >>> dset.show_image(gid=1)
        >>> kwplot.figure(fnum=1, pnum=(1, 2, 2))
        >>> dset.show_image(gid=2)
    """
    import re
    import kwimage
    import kwcoco
    print('Converting CamVid to MS-COCO format')

    dset_root, img_paths, label_path, mask_paths = ub.take(
        camvid_raw_info, 'dset_root, img_paths, label_path, mask_paths'.split(', '))

    # Pair each image path with its mask path (zip stops at the shorter list)
    img_infos = {
        'img_fname': img_paths,
        'mask_fname': mask_paths,
    }
    keys = list(img_infos.keys())
    next_vals = list(zip(*img_infos.values()))
    image_items = [{k: v for k, v in zip(keys, vals)} for vals in next_vals]

    dataset = {
        'img_root': dset_root,
        'images': [],
        'categories': [],
        'annotations': [],
    }

    # Each non-empty label line is "<r> <g> <b><tabs><name>"
    lines = ub.readfrom(label_path).split('\n')
    lines = [line for line in lines if line]
    for line in lines:
        color_text, name = re.split('\t+', line)
        r, g, b = map(int, color_text.split(' '))
        color = (r, g, b)

        # Parse the special camvid format
        # The category id packs the color into a single integer
        cid = (r << 16) + (g << 8) + (b << 0)
        cat = {
            'id': cid,
            'name': name,
            'color': color,
        }
        dataset['categories'].append(cat)

    for gid, img_item in enumerate(image_items, start=1):
        img = {
            'id': gid,
            'file_name': img_item['img_fname'],
            # nonstandard image field
            'segmentation': img_item['mask_fname'],
        }
        dataset['images'].append(img)

    dset = kwcoco.CocoDataset(dataset)
    dset.rename_categories({'Void': 'background'})

    # With the packing above, id 0 corresponds to the color (0, 0, 0)
    assert dset.name_to_cat['background']['id'] == 0
    dset.name_to_cat['background'].setdefault('alias', []).append('Void')

    if False:
        # Disabled
        _define_camvid_class_hierarcy(dset)

    if 1:
        # TODO: Binarize CCs (and efficiently encode if possible)
        import numpy as np

        bad_info = []  # only consumed by the disabled debug branch below
        once = False  # ensures the gid 618 notice is printed a single time

        # Add images
        # Start from a clean slate: drop any annotations currently indexed
        dset.remove_annotations(list(dset.index.anns.keys()))
        for gid, img in ub.ProgIter(dset.imgs.items(), desc='parse label masks'):
            mask_fpath = join(dset_root, img['segmentation'])
            rgb_mask = kwimage.imread(mask_fpath, space='rgb')
            r, g, b = rgb_mask.T.astype(np.int64)
            # NOTE(review): rgb_to_cid is defined elsewhere; presumably it
            # applies the same packing as used for the category ids - confirm
            cid_mask = np.ascontiguousarray(rgb_to_cid(r, g, b).T)

            # Category id 0 (background) never receives an annotation
            cids = set(np.unique(cid_mask)) - {0}

            for cid in cids:
                if cid not in dset.cats:
                    if gid == 618:
                        # Handle a known issue with image 618
                        c_mask = (cid == cid_mask).astype(np.uint8)
                        total_bad = c_mask.sum()
                        if total_bad < 32:
                            if not once:
                                print(
                                    'gid 618 has a few known bad pixels, ignoring them'
                                )
                                once = True
                            continue
                        else:
                            raise Exception('more bad pixels than expected')
                    else:
                        raise Exception(
                            'UNKNOWN cid = {!r} in gid={!r}'.format(cid, gid))
                    # bad_rgb = cid_to_rgb(cid)
                    # print('bad_rgb = {!r}'.format(bad_rgb))
                    # print('WARNING UNKNOWN cid = {!r} in gid={!r}'.format(cid, gid))
                    # bad_info.append({
                    #     'gid': gid,
                    #     'cid': cid,
                    # })
                else:
                    ann = {
                        'category_id': cid,
                        'image_id': gid
                        # 'segmentation': mask.to_coco()
                    }
                    assert cid in dset.cats
                    # Binary mask of the pixels that belong to this category
                    c_mask = (cid == cid_mask).astype(np.uint8)
                    mask = kwimage.Mask(c_mask, 'c_mask')

                    box = kwimage.Boxes([mask.get_xywh()], 'xywh')
                    # box = mask.to_boxes()

                    ann['bbox'] = ub.peek(box.to_coco())
                    ann['segmentation'] = mask.to_coco()
                    dset.add_annotation(**ann)

        if 0:
            # Disabled debugging aid for inspecting the bad pixels in gid 618
            bad_cids = [i['cid'] for i in bad_info]
            print(sorted([c['color'] for c in dataset['categories']]))
            print(sorted(set([cid_to_rgb(i['cid']) for i in bad_info])))

            gid = 618
            img = dset.imgs[gid]
            mask_fpath = join(dset_root, img['segmentation'])
            rgb_mask = kwimage.imread(mask_fpath, space='rgb')
            r, g, b = rgb_mask.T.astype(np.int64)
            cid_mask = np.ascontiguousarray(rgb_to_cid(r, g, b).T)
            cid_hist = ub.dict_hist(cid_mask.ravel())

            bad_cid_hist = {}
            for cid in bad_cids:
                bad_cid_hist[cid] = cid_hist.pop(cid)

            import kwplot
            kwplot.autompl()
            kwplot.imshow(rgb_mask)

    if 0:
        # Disabled interactive viewer
        import kwplot
        plt = kwplot.autoplt()
        plt.clf()
        dset.show_image(1)

        import xdev
        gid_list = list(dset.imgs)
        for gid in xdev.InteractiveIter(gid_list):
            dset.show_image(gid)
            xdev.InteractiveIter.draw()

    dset._build_index()
    dset._build_hashid()
    return dset
def fix_dataset_phase1_original():
    """
    Repair known problems in the ``original_*.json`` phase1 annotation files.

    For every matching file the dataset is loaded and three kinds of
    problems are repaired in memory: image file names that cannot be found
    (prefixed with the dataset name), bad annotations (removed), and
    ``has_annots`` image flags that are missing or not one of
    True / False / None. The loop stops at the first dataset that needed
    any repair so that it can be checked manually.
    """
    cfg = viame_wrangler.config.WrangleConfig({
        'annots': ub.truepath('~/data/viame-challenge-2018/phase1-annotations/*/original_*.json')
    })

    annots = cfg.annots
    fpaths = list(glob.glob(annots))

    print('Reading raw mscoco files')
    # NOTE(review): an explicit iterator presumably lets an interactive
    # session resume the loop after the break below - confirm.
    fpath_iter = iter(fpaths)

    for fpath in fpath_iter:
        print('reading fpath = {!r}'.format(fpath))
        # Strip the "original_" prefix and everything after the first dot
        dset_name = os.path.basename(fpath).replace('original_', '').split('.')[0]
        dset = CocoDataset(fpath, img_root=cfg.img_root)

        did_fix = False
        if dset.missing_images():
            did_fix = True
            print('Fixing missing images')
            for img in dset.dataset['images']:
                if img['file_name'].startswith(dset_name):
                    # The prefix is already present, so adding it again
                    # cannot be the right fix
                    assert False
                img['file_name'] = join(dset_name, img['file_name'])
            assert not dset.missing_images()

        # dset.dataset.keys()
        # dset.dataset['categories']

        bad_annots = dset._find_bad_annotations()
        if bad_annots:
            print('Fixing bad annots')
            did_fix = True
            for ann in bad_annots:
                dset.remove_annotation(ann)
            dset._build_index()

        # ub.NoParam is used as the default so that a missing flag counts as
        # bad
        bad_hasannots_flags = not all([img.get('has_annots', ub.NoParam) in [True, False, None] for img in dset.imgs.values()])

        if bad_hasannots_flags:
            did_fix = True
            for gid, img in dset.imgs.items():
                aids = dset.gid_to_aids.get(gid, [])

                if True:
                    # SPECIAL CASES
                    if img['file_name'] == 'afsc_seq1/003496.jpg':
                        img['has_annots'] = True

                # if False:
                #     if img['has_annots'] is None:
                #         dset.show_annotation(gid=img['id'])
                #         break

                # Normalize flags that are not True / False / None: anything
                # spelling "false" becomes False, other values are an error
                if img.get('has_annots', None) not in [True, False, None]:
                    if str(img['has_annots']).lower() == 'false':
                        img['has_annots'] = False
                    else:
                        assert False, ub.repr2(img)

                # If there is at least one annotation, always mark as has_annots
                if len(aids) > 0:
                    img['has_annots'] = True
                else:
                    # Otherwise set has_annots to null if it has not been
                    # explicitly labeled
                    if 'has_annots' not in img:
                        img['has_annots'] = None

        print(ub.dict_hist([g['has_annots'] for g in dset.imgs.values()]))

        if did_fix:
            print('manual check')
            break
            # NOTE(review): the statements below follow the break and are
            # therefore unreachable; presumably they are run by hand after
            # the manual check - confirm the intended nesting.
            # ut.print_difftext(ut.get_textdiff(dset.dumps(), orig_dset.dumps()))
            dset.dump(fpath)
def find_consistent_labeling(grouped_oldnames, extra_prefix='_extra_name',
                             verbose=False):
    """
    Solves a maximum bipartite matching problem to find a consistent name
    assignment that minimizes the number of annotations with different names.

    Every group of annotations must be assigned a single name. Groups that
    do not share any old name with another group are resolved directly (the
    most frequent old name wins, ties go to the smallest name). The
    remaining groups are partitioned into connected components of groups
    sharing old names, which reduces the running time, and each component
    is solved with ``simple_munkres``.

    Args:
        grouped_oldnames (list): A group of old names where the grouping is
            based on new names. For instance:

                Given:
                    aids      = [1, 2, 3, 4, 5]
                    old_names = [0, 1, 1, 1, 0]
                    new_names = [0, 0, 1, 1, 0]

                The grouping is
                    [[0, 1, 0], [1, 1]]

                This lets us keep the old names in a split case and
                re-use existing names and make minimal changes to
                current annotation names while still being consistent
                with the new and improved grouping.

                The output will be:
                    [0, 1]

                Meaning that all annots in the first group are assigned the
                name 0 and all annots in the second group are assigned the
                name 1.
        extra_prefix (str | None): prefix of the fresh names given to groups
            that received no old name. If None, such groups stay None.
        verbose (bool): if truthy, progress information is printed.

    Returns:
        list: the name assigned to each group, in group order.

    References:
        http://stackoverflow.com/questions/1398822/assignment-problem-numpy

    Example:
        >>> grouped_oldnames = demodata_oldnames(25, 15,  5, n_per_incon=5)
        >>> new_names = find_consistent_labeling(grouped_oldnames, verbose=1)
        >>> grouped_oldnames = demodata_oldnames(0, 15,  5, n_per_incon=1)
        >>> new_names = find_consistent_labeling(grouped_oldnames, verbose=1)
        >>> grouped_oldnames = demodata_oldnames(0, 0, 0, n_per_incon=1)
        >>> new_names = find_consistent_labeling(grouped_oldnames, verbose=1)

    Example:
        >>> ydata = []
        >>> xdata = list(range(10, 150, 50))
        >>> for x in xdata:
        >>>     print('x = %r' % (x,))
        >>>     grouped_oldnames = demodata_oldnames(x, 15,  5, n_per_incon=5)
        >>>     t = ub.Timerit(3, verbose=1)
        >>>     for timer in t:
        >>>         with timer:
        >>>             new_names = find_consistent_labeling(grouped_oldnames)
        >>>     ydata.append(t.min())
        >>> # xdoc: +REQUIRES(--show)
        >>> import plottool as pt
        >>> pt.qtensure()
        >>> pt.multi_plot(xdata, [ydata])
        >>> util.show_if_requested()

    Example:
        >>> grouped_oldnames = [['a', 'b', 'c'], ['b', 'c'], ['c', 'e', 'e']]
        >>> new_names = find_consistent_labeling(grouped_oldnames, verbose=1)
        >>> result = ub.repr2(new_names)
        >>> print(new_names)
        ['a', 'b', 'e']

    Example:
        >>> grouped_oldnames = [['a', 'b'], ['a', 'a', 'b'], ['a']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> result = ub.repr2(new_names)
        >>> print(new_names)
        ['b', 'a', '_extra_name0']

    Example:
        >>> grouped_oldnames = [['a', 'b'], ['e'], ['a', 'a', 'b'], [], ['a'], ['d']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> result = ub.repr2(new_names)
        >>> print(new_names)
        ['b', 'e', 'a', '_extra_name0', '_extra_name1', 'd']

    Example:
        >>> grouped_oldnames = [[], ['a', 'a'], [],
        >>>                     ['a', 'a', 'a', 'a', 'a', 'a', 'a', 'b'], ['a']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> result = ub.repr2(new_names)
        >>> print(new_names)
        ['_extra_name0', 'a', '_extra_name1', 'b', '_extra_name2']
    """
    unique_old_names = list(ub.unique(ub.flatten(grouped_oldnames)))
    n_old_names = len(unique_old_names)
    n_new_names = len(grouped_oldnames)

    # Nothing is assigned to begin with
    assignment = [None] * n_new_names

    if verbose:
        print('finding maximally consistent labeling')
        print('n_old_names = %r' % (n_old_names,))
        print('n_new_names = %r' % (n_new_names,))

    # Count how many new-names (groups) make use of each old-name
    oldname_sets = [set(group) for group in grouped_oldnames]
    oldname_usage = ub.dict_hist(ub.flatten(oldname_sets))

    # An old-name used by more than one group is a conflict to be resolved
    conflict_oldnames = {
        name for name, num in oldname_usage.items() if num > 1}

    # Split the groups into those touching a conflict and those that do not
    trivial_oldnames, trivial_new_idxs = [], []
    nontrivial_oldnames, nontrivial_new_idxs = [], []
    for new_idx, group in enumerate(grouped_oldnames):
        if set(group).intersection(conflict_oldnames):
            target_names, target_idxs = nontrivial_oldnames, nontrivial_new_idxs
        else:
            target_names, target_idxs = trivial_oldnames, trivial_new_idxs
        target_names.append(group)
        target_idxs.append(new_idx)

    # Rectify trivial cases
    # A group sharing none of its old-names with other groups is resolved
    # without any optimization
    n_trivial_unchanged = 0
    n_trivial_ignored = 0
    n_trivial_merges = 0
    for group, new_idx in zip(trivial_oldnames, trivial_new_idxs):
        if len(group) == 0:
            # Groups without old-names are handled at the very end
            n_trivial_ignored += 1
            continue
        name_hist = ub.dict_hist(group)
        if len(name_hist) > 1:
            # Several old-names in a single group is a simple merge
            n_trivial_merges += 1
        else:
            n_trivial_unchanged += 1
        # The most frequent old-name wins; ties go to the smallest name
        top_count = max(name_hist.values())
        assignment[new_idx] = min(
            [name for name, num in name_hist.items() if num == top_count])

    if verbose:
        n_trivial = len(trivial_oldnames)
        n_nontrivial = len(nontrivial_oldnames)
        print('rectify %d trivial groups' % (n_trivial,))
        print('  * n_trivial_unchanged = %r' % (n_trivial_unchanged,))
        print('  * n_trivial_merges = %r' % (n_trivial_merges,))
        print('  * n_trivial_ignored = %r' % (n_trivial_ignored,))
        print('rectify %d non-trivial groups' % (n_nontrivial,))

    # Partition the non-trivial groups into disjoint subproblems: groups are
    # connected whenever they have an old-name in common
    nontrivial_oldnames_sets = [set(group) for group in nontrivial_oldnames]
    import networkx as nx
    overlap_graph = nx.Graph()
    num_sets = len(nontrivial_oldnames_sets)
    overlap_graph.add_nodes_from(range(num_sets))
    for u in range(num_sets):
        names_u = nontrivial_oldnames_sets[u]
        for v in range(u + 1, num_sets):
            if names_u.intersection(nontrivial_oldnames_sets[v]):
                overlap_graph.add_edge(u, v)
    nontrivial_partition = list(nx.connected_components(overlap_graph))

    if verbose:
        print('  * partitioned non-trivial into %d subgroups' %
              (len(nontrivial_partition)))
        from graphid import util
        part_size_stats = util.stats_dict(map(len, nontrivial_partition))
        stats_str = ub.repr2(part_size_stats, precision=2, strkeys=True)
        print('  * partition size stats = %s' % (stats_str,))

    # Rectify nontrivial cases
    prog = ub.ProgIter(nontrivial_partition, desc='rectify parts',
                       enabled=verbose)
    for part_idxs in prog:
        part_oldnames = list(ub.take(nontrivial_oldnames, part_idxs))
        part_newidxs = list(ub.take(nontrivial_new_idxs, part_idxs))
        # Solve the assignment problem for this part
        part_assignment = simple_munkres(part_oldnames)
        for new_idx, new_name in zip(part_newidxs, part_assignment):
            assignment[new_idx] = new_name

    # Whatever is still unassigned gets a fresh unique name with a prefix
    if extra_prefix is not None:
        num_extra = 0
        for idx in range(len(assignment)):
            if assignment[idx] is not None:
                continue
            assignment[idx] = '%s%d' % (extra_prefix, num_extra,)
            num_extra += 1
    return assignment
def _fix_keys(model_state_dict):
    """
    Hack around DataParallel wrapper. If there is nothing in common
    between the two models check to see if prepending 'module.' to
    other keys fixes it.

    The keys of ``model_state_dict`` are only rewritten when they are not
    already a subset of the keys of ``self_state``; how they are rewritten
    is selected by the enclosing-scope variable ``association``.

    Args:
        model_state_dict (dict): state dict whose keys may need renaming.

    Returns:
        dict: the input itself, or a copy with remapped keys.

    Raises:
        KeyError: if ``association`` is not a recognized mode.
    """
    other_keys = set(model_state_dict)
    self_keys = set(self_state)

    if 0:
        # Disabled exploratory code
        # Automatic way to reduce nodes in the trees?
        # If node b always follows node a, can we contract it?
        nodes1 = [n for p in other_keys for n in p.split('.')]
        nodes2 = [n for p in self_keys for n in p.split('.')]
        tups1 = list(tup for key in other_keys
                     for tup in ub.iter_window(key.split('.'), 2))
        tups2 = list(tup for key in self_keys
                     for tup in ub.iter_window(key.split('.'), 2))
        x = ub.ddict(list)
        for a, b in tups1:
            x[a].append(b)
        for a, b in tups2:
            x[a].append(b)

        nodehist = ub.dict_hist(nodes1 + nodes2)

        for k, v in x.items():
            print('----')
            print(k)
            print(nodehist[k])
            follow_hist = ub.dict_hist(v)
            print(follow_hist)
            total = sum(follow_hist.values())
            if ub.allsame(follow_hist.values()) and total == nodehist[k]:
                print('CONTRACT')

        # pair_freq = ub.dict_hist(ub.flatten([tups1, tups2]))
        # print(forest_str(paths_to_otree(other_keys, '.')))

    # common_keys = other_keys.intersection(self_keys)
    # if not common_keys:
    if not other_keys.issubset(self_keys):
        if association == 'strict':
            # Keys are used exactly as given
            pass
        elif association == 'module-hack':
            # If there are no common keys try a hack
            prefix = 'module.'

            def smap(f, ss):
                return set(map(f, ss))

            def fix1(k):
                return prefix + k

            def fix2(k):
                # NOTE(review): implicitly returns None for keys that do not
                # start with the prefix, so ub.map_keys(fix2, ...) below
                # would map such keys to None - confirm this is intended.
                if k.startswith(prefix):
                    return k[len(prefix):]

            # Try adding the prefix first, then removing it
            if smap(fix1, other_keys).intersection(self_keys):
                model_state_dict = ub.map_keys(fix1, model_state_dict)
            elif smap(fix2, other_keys).intersection(self_keys):
                model_state_dict = ub.map_keys(fix2, model_state_dict)
        elif association == 'prefix-hack':
            import functools

            def add_prefix(k, prefix):
                return prefix + k

            def remove_prefix(k, prefix):
                # NOTE(review): like fix2 above, returns None when the key
                # does not start with the prefix - confirm.
                if k.startswith(prefix):
                    return k[len(prefix):]

            # set1 = other_keys
            # target_set2 = self_keys
            found = _best_prefix_transform(other_keys, self_keys)
            if found is not None:
                # Apply each (action, prefix) step in the given order
                for action, prefix in found['transform']:
                    if action == 'add':
                        func = functools.partial(add_prefix, prefix=prefix)
                    elif action == 'remove':
                        func = functools.partial(remove_prefix, prefix=prefix)
                    else:
                        raise AssertionError
                    model_state_dict = ub.map_keys(func, model_state_dict)
        elif association in {'embedding', 'isomorphism'}:
            if verbose > 1:
                print('Using subpath {} association, may take some time'.
                      format(association))
            # I believe this is the correct way to solve the problem
            paths1 = sorted(other_keys)
            paths2 = sorted(self_state)

            if 1:
                # hack to filter to reduce tree size in embedding problem
                def shrink_paths(paths):
                    # Replacing the '.' before a digit or a common leaf name
                    # with ':' merges that component into its parent for the
                    # '.'-separated matching below
                    new_paths = []
                    for p in paths:
                        p = p.replace('.0', ':0')
                        p = p.replace('.1', ':1')
                        p = p.replace('.2', ':2')
                        p = p.replace('.3', ':3')
                        p = p.replace('.4', ':4')
                        p = p.replace('.5', ':5')
                        p = p.replace('.6', ':6')
                        p = p.replace('.7', ':7')
                        p = p.replace('.8', ':8')
                        p = p.replace('.9', ':9')
                        p = p.replace('.weight', ':weight')
                        p = p.replace('.bias', ':bias')
                        p = p.replace('.num_batches_tracked', ':num_batches_tracked')
                        p = p.replace('.running_mean', ':running_mean')
                        p = p.replace('.running_var', ':running_var')
                        # p = p.replace('.conv1', ':conv1')
                        # p = p.replace('.conv2', ':conv2')
                        # p = p.replace('.conv3', ':conv3')
                        # p = p.replace('.bn1', ':bn1')
                        # p = p.replace('.bn2', ':bn2')
                        # p = p.replace('.bn3', ':bn3')
                        new_paths.append(p)
                    return new_paths

                # Reducing the depth saves a lot of time
                paths1_ = shrink_paths(paths1)
                paths2_ = shrink_paths(paths2)

            subpaths1, subpaths2 = maximum_common_ordered_subpaths(
                paths1_, paths2_, sep='.', mode=association)

            # Undo the ':' substitution made by shrink_paths
            subpaths1 = [p.replace(':', '.') for p in subpaths1]
            subpaths2 = [p.replace(':', '.') for p in subpaths2]
            mapping = ub.dzip(subpaths1, subpaths2)
            if verbose > 1:
                other_unmapped = sorted(other_keys - set(mapping.keys()))
                self_unmapped = sorted(self_keys - set(mapping.values()))
                print('-- embed association (other -> self) --')
                print('mapping = {}'.format(ub.repr2(mapping, nl=1)))
                print('self_unmapped = {}'.format(
                    ub.repr2(self_unmapped, nl=1)))
                print('other_unmapped = {}'.format(
                    ub.repr2(other_unmapped, nl=1)))
                print('len(mapping) = {}'.format(
                    ub.repr2(len(mapping), nl=1)))
                print('len(self_unmapped) = {}'.format(
                    ub.repr2(len(self_unmapped), nl=1)))
                print('len(other_unmapped) = {}'.format(
                    ub.repr2(len(other_unmapped), nl=1)))
                print('-- end embed association --')

            # HACK: something might be wrong, there was an instance with
            # HRNet_w32 where multiple keys mapped to the same key
            # bad keys were incre_modules.3.0.conv1.weight and conv1.weight
            #
            # This will not error, but may produce bad output
            try:
                model_state_dict = ub.map_keys(
                    lambda k: mapping.get(k, k), model_state_dict)
            except Exception as ex:
                HACK = 1
                if HACK:
                    # Fall back to a plain loop where later items overwrite
                    # earlier ones that map to the same key, and report the
                    # original problem as a warning
                    new_state_dict_ = {}
                    for k, v in model_state_dict.items():
                        new_state_dict_[mapping.get(k, k)] = v
                    model_state_dict = new_state_dict_
                    warnings.warn('ex = {!r}'.format(ex))
                else:
                    raise
        else:
            raise KeyError(association)
    return model_state_dict
def _training_sample_weights(self):
    """
    Assign a weight to each image to influence its sample probability.

    We want to see very frequent categories less often, but we also do not
    care about the rarest classes to the point where we should sample them
    more than uncommon classes. Images without any annotations are pooled
    into a proxy class of their own and weighted like any other class.

    Returns:
        np.ndarray: one unnormalized weight per image, ordered like
            ``self.dset.dataset['images']``.
    """
    index_to_gid = [img['id'] for img in self.dset.dataset['images']]
    index_to_aids = list(ub.take(self.dset.gid_to_aids, index_to_gid))
    index_to_cids = [[self.dset.anns[aid]['category_id'] for aid in aids]
                     for aids in index_to_aids]

    catname_to_cid = {
        cat['name']: cid
        for cid, cat in self.dset.cats.items()}

    # median frequency weighting with minimum threshold
    min_examples = 20
    cat_freq = pd.Series(self.dset.category_annotation_frequency())

    valid_freq = cat_freq[cat_freq > min_examples]
    normal_mfw = valid_freq.median() / valid_freq

    # Draw anything under the threshold with probability equal to the median
    too_few = cat_freq[(cat_freq <= min_examples) & (cat_freq > 0)]
    too_few[:] = 1.0
    category_mfw = pd.concat([normal_mfw, too_few])

    # The frequency table is keyed by category name; re-key it by id
    cid_to_mfw = category_mfw.rename(catname_to_cid)
    cid_to_mfw_dict = cid_to_mfw.to_dict()

    # Each image becomes represented by the category with maximum median
    # frequency weight. This allows us to assign each image a proxy class.
    # We make another proxy class to represent images without anything in
    # them.
    EMPTY_PROXY_CID = -1
    index_to_proxyid = [
        ub.argmax(ub.dict_subset(cid_to_mfw_dict, cids))
        if len(cids) else EMPTY_PROXY_CID
        for cids in index_to_cids
    ]

    proxy_freq = pd.Series(ub.dict_hist(index_to_proxyid))
    proxy_root_mfw = proxy_freq.median() / proxy_freq
    # An exponent below one pulls every weight towards 1 so that rare proxy
    # classes do not receive extreme weights.
    power = 0.878
    proxy_root_mfw = proxy_root_mfw ** power

    # We now have a weight for each item in our dataset
    index_to_weight = np.array(list(ub.take(proxy_root_mfw.to_dict(),
                                            index_to_proxyid)))
    return index_to_weight
def __init__(self, index_to_labels, batch_size=1, num_batches='auto',
             label_to_weight=None, shuffle=False, rng=None):
    """
    Compute a per-item sampling probability with a tf-idf based scheme.

    Args:
        index_to_labels (Sequence): for each item, the array of labels
            attached to that item.
        batch_size (int): stored on the instance.
        num_batches (int | str): number of batches, or ``'auto'`` to use
            the value returned by ``self._auto_num_batches()``.
        label_to_weight (dict | None): optional multiplier applied to the
            idf of each label.
        shuffle (bool): stored on the instance.
        rng: seed or random state, coerced with ``kwarray.ensure_rng``.
    """
    import kwarray
    rng = kwarray.ensure_rng(rng, api='python')

    flat_labels = np.hstack(index_to_labels)
    label_to_freq = ub.dict_hist(flat_labels)

    # Use tf-idf based scheme to compute sample probabilities
    num_items = len(index_to_labels)
    label_to_tfidf = {}
    for label in sorted(set(flat_labels)):
        # tf for each img, is the number of times the label appears
        index_to_tf = np.zeros(num_items)
        for index, item_labels in enumerate(index_to_labels):
            index_to_tf[index] = (label == item_labels).sum()
        # idf is the #imgs / #imgs-with-label
        idf = num_items / (index_to_tf > 0).sum()
        if label_to_weight:
            idf = idf * label_to_weight[label]
        # Every item contributes at least 1 per label, even without it
        label_to_tfidf[label] = np.maximum(index_to_tf * idf, 1)

    index_to_weight = sum(label_to_tfidf.values())
    index_to_prob = index_to_weight / index_to_weight.sum()

    # Plain attributes are assigned before the 'auto' branch so that
    # ``_auto_num_batches`` can rely on them.
    # NOTE(review): its body is not visible here - confirm which it reads.
    self.index_to_prob = index_to_prob
    self.indices = np.arange(len(index_to_prob))
    self.label_to_freq = label_to_freq
    self.index_to_labels = index_to_labels
    self.batch_size = batch_size
    self.shuffle = shuffle

    if num_batches == 'auto':
        self.num_batches = self._auto_num_batches()
    else:
        self.num_batches = num_batches

    self.rng = kwarray.ensure_rng(rng, api='numpy')
from mnist_matching import setup_harn

# Initialize one harness per architecture; only the simple one is used below.
resnet_harn = setup_harn(arch='resnet').initialize()
simple_harn = setup_harn(arch='simple').initialize()
harn = simple_harn

# Grab a demo batch from each split and continue with the training batch.
batch_vali = harn._demo_batch(tag='vali')
batch_train = harn._demo_batch(tag='train')
batch = batch_train

# Run the model on the chips and keep the descriptor vectors.
inputs = batch['chip']
outputs = harn.model(inputs)
dvecs = outputs['dvecs']

output_shape = harn.model.module.output_shape_for(inputs.shape)
print(ub.repr2(output_shape.hidden, nl=-1))

# Show how often each label occurs in the batch, ordered by label.
labels = batch['cpu_nx']
_label_hist = ub.dict_hist(labels.numpy())
_sorted_hist = ub.odict(sorted(_label_hist.items()))
print('labels = {}'.format(ub.repr2(_sorted_hist, nl=1)))

# Keep only the first eight items for the mining experiments.
labels = labels[:8]
dvecs = dvecs[:8]

info1 = harn.criterion.mine_negatives(dvecs, labels, num=1, mode='hard')
info2 = harn.criterion.mine_negatives(dvecs, labels, num=1,
                                      mode='consistent')
info3 = harn.criterion.hard_triples(dvecs, labels)
info4 = harn.criterion.hard_triples2(dvecs, labels)
def compare_results():
    """
    Compare the python length measurements against the matlab results.

    Reads the demo script output from ``measurements_haul83.csv``, loads the
    matlab results with ``_read_kresimir_results``, pairs up detections that
    fall in the same frame, and prints summary tables for (1) all
    detections, (2) the detections both scripts have in common and (3) the
    common detections whose matlab QC code is non-zero.
    """
    print('Comparing results')
    import pandas as pd
    from tabulate import tabulate

    def _parse_points(text):
        # janky CSV parsing
        # NOTE(security): eval executes whatever expression the cell holds;
        # only run this on measurement files you trust.
        return eval(text.replace(';', ','), np.__dict__)

    def _oriented_box(pts):
        # ``np.int`` was removed in NumPy 1.24; the builtin is equivalent.
        rect = cv2.minAreaRect(pts[:, None, :].astype(int))
        return ctalgo.OrientedBBox(*rect)

    def _summary_table(py_part, mat_part):
        # mean ± std of each measurement column, plus the row counts
        table = pd.DataFrame(columns=['python', 'matlab'])
        for key in ['fishLength', 'fishRange', 'Err']:
            table.loc[key, 'python'] = '{:6.2f} ± {:6.2f}'.format(
                py_part[key].mean(), py_part[key].std())
            table.loc[key, 'matlab'] = '{:6.2f} ± {:6.2f}'.format(
                mat_part[key].mean(), mat_part[key].std())
        table.loc['nTotal', 'python'] = '{}'.format(len(py_part))
        table.loc['nTotal', 'matlab'] = '{}'.format(len(mat_part))
        return table

    def _percent(numer, denom):
        # nan instead of ZeroDivisionError when there is nothing to compare
        return 100 * numer / denom if denom else float('nan')

    # Read in output of demo script
    measure_fpath = 'measurements_haul83.csv'
    # ``DataFrame.from_csv`` was removed in pandas 1.0
    py_df = pd.read_csv(measure_fpath, index_col=None)
    # Convert python length output from mm into cm for consistency
    py_df['fishlen'] = py_df['fishlen'] / 10
    py_df['current_frame'] = py_df['current_frame'].astype(int)

    py_df['box_pts1'] = py_df['box_pts1'].map(_parse_points)
    py_df['box_pts2'] = py_df['box_pts2'].map(_parse_points)

    py_df['obox1'] = [_oriented_box(pts) for pts in py_df['box_pts1']]
    py_df['obox2'] = [_oriented_box(pts) for pts in py_df['box_pts2']]
    py_df.drop(['box_pts1', 'box_pts2'], axis=1, inplace=True)

    # Remap to matlab names
    py_df = py_df.rename(columns={
        'error': 'Err',
        'fishlen': 'fishLength',
        'range': 'fishRange',
    })

    # Load matlab results
    mat_df = _read_kresimir_results()

    FORCE_COMPARABLE_RANGE = True
    # FORCE_COMPARABLE_RANGE = False
    if FORCE_COMPARABLE_RANGE:
        # Be absolutely certain we are in comparable regions (may slightly
        # bias results, against python and in favor of matlab)
        min_frame = max(mat_df.current_frame.min(),
                        py_df.current_frame.min())
        max_frame = min(mat_df.current_frame.max(),
                        py_df.current_frame.max())
        print('min_frame = {!r}'.format(min_frame))
        print('max_frame = {!r}'.format(max_frame))

        mat_df = mat_df[(mat_df.current_frame >= min_frame) &
                        (mat_df.current_frame <= max_frame)]
        py_df = py_df[(py_df.current_frame >= min_frame) &
                      (py_df.current_frame <= max_frame)]

    intersect_frames = np.intersect1d(mat_df.current_frame,
                                      py_df.current_frame)
    print('intersecting frames = {} / {} (matlab)'.format(
        len(intersect_frames), len(set(mat_df.current_frame))))
    print('intersecting frames = {} / {} (python)'.format(
        len(intersect_frames), len(set(py_df.current_frame))))

    # Reuse the hungarian algorithm implementation from ctalgo
    min_assign = ctalgo.StereoLengthMeasurments.minimum_weight_assignment

    # arbitrarilly chosen threshold
    thresh = 100

    correspond = []
    for f in intersect_frames:
        pidxs = np.where(py_df.current_frame == f)[0]
        midxs = np.where(mat_df.current_frame == f)[0]

        pdf = py_df.iloc[pidxs]
        mdf = mat_df.iloc[midxs]

        ppts1 = np.array([o.center for o in pdf['obox1']])
        mpts1 = np.array([o.center for o in mdf['obox1']])

        ppts2 = np.array([o.center for o in pdf['obox2']])
        mpts2 = np.array([o.center for o in mdf['obox2']])

        dists1 = sklearn.metrics.pairwise.pairwise_distances(ppts1, mpts1)
        dists2 = sklearn.metrics.pairwise.pairwise_distances(ppts2, mpts2)

        # The assignment is solved on the first camera only; the second
        # camera is used to veto pairs that disagree.
        for i, j in min_assign(dists1):
            d1 = dists1[i, j]
            d2 = dists2[i, j]
            if d1 < thresh and d2 < thresh and abs(d1 - d2) < thresh / 4:
                correspond.append((pidxs[i], midxs[j]))

    # The reshape keeps the (n, 2) layout even when nothing matched, so the
    # column lookups below do not fail on an empty result.
    correspond = np.array(correspond, dtype=int).reshape(-1, 2)

    # pflags = np.array(ub.boolmask(correspond.T[0], len(py_df)))
    mflags = np.array(ub.boolmask(correspond.T[1], len(mat_df)))

    print('\n\n----\n## All stats\n')
    print(ub.codeblock(
        '''
        Overall, the matlab script made {nmat} length measurements and the
        python script made {npy} length measurements.  Here is a table
        summarizing the average lengths / ranges / errors of each script:
        ''').format(npy=len(py_df), nmat=len(mat_df)))
    stats = _summary_table(py_df, mat_df)
    print(tabulate(stats, headers='keys', tablefmt='psql', stralign='right'))

    print('\n\n----\n## Only COMMON detections\n')
    py_df_c = py_df.iloc[correspond.T[0]]
    mat_df_c = mat_df.iloc[correspond.T[1]]
    stats = _summary_table(py_df_c, mat_df_c)

    print(ub.codeblock(
        '''
        Now, we investigate how many dections matlab and python made in
        common.  (Note, choosing which dections in one version correspond to
        which in another is done using a heuristic based on distances between
        bbox centers and a thresholded minimum assignment problem).

        Python made {npy_c}/{nmat} = {percent:.2f}% of the detections matlab made

        ''').format(npy_c=len(py_df_c), nmat=len(mat_df),
                    percent=_percent(len(py_df_c), len(mat_df))))
    print(tabulate(stats, headers='keys', tablefmt='psql', stralign='right'))

    print('\n\n----\n## Evaulation using the QC code\n')
    hist_hit = ub.dict_hist(mat_df[mflags]['QC'].values)
    hist_miss = ub.dict_hist(mat_df[~mflags]['QC'].values)

    def _qc_counts(code):
        # A QC code may be absent from either histogram
        return hist_hit.get(code, 0), hist_miss.get(code, 0)

    hit0, miss0 = _qc_counts(0)
    hit1, miss1 = _qc_counts(1)
    hit2, miss2 = _qc_counts(2)

    print(ub.codeblock(
        '''
        However, not all of those matlab detections were good.  Because we
        have detections in corrsepondences with each other we can assign the
        python detections QC codes.

        Here is a histogram of the QC codes for these python detections:
        {}
        (Note: read histogram as <QC-code>: <frequency>)

        Here is a histogram of the other matlab detections that python did
        not find:
        {}

        To summarize:
            python correctly rejected {:.2f}% of the matlab QC=0 detections
            python correctly accepted {:.2f}% of the matlab QC=1 detections
            python correctly accepted {:.2f}% of the matlab QC=2 detections

            Note, that because python made detections that matlab did not
            make, the remaining {} detections may be right or wrong, but
            there is no way to tell from this analysis.

        Lastly, here are the statistics for the common detections that had a
        non-zero QC code.
        ''').format(
            ub.repr2(hist_hit, nl=1),
            ub.repr2(hist_miss, nl=1),
            _percent(miss0, hit0 + miss0),
            _percent(hit1, hit1 + miss1),
            _percent(hit2, hit2 + miss2),
            len(py_df) - len(py_df_c)))

    # Restrict both sides to the pairs matlab marked with a non-zero QC code
    is_qc = (mat_df_c['QC'] > 0).values
    mat_df_c = mat_df_c[is_qc]
    py_df_c = py_df_c[is_qc]
    stats = _summary_table(py_df_c, mat_df_c)
    print(tabulate(stats, headers='keys', tablefmt='psql', stralign='right'))
def 字典_统计(数组, weights=None, ordered=False, labels=None):
    """
    Build a histogram dictionary for the given items.

    Thin wrapper: every argument is forwarded unchanged to ``ub.dict_hist``
    and its result is returned as-is.
    """
    return ub.dict_hist(数组, weights=weights, ordered=ordered, labels=labels)
# main: entry point of the fixbib script.
#
# Reads the bibtex library at ``bib_fpath`` (default 'My Library.bib') and,
# when 'custom_extra.bib' exists, parses it as an extra database whose
# entries take precedence over the regular ones.  Entry keys are normalized
# (':' removed, '-' turned into '_', repeated '_' collapsed), citation keys
# used by the tex documents but missing from the library are reported with
# typo suggestions, each cited entry is cleaned with ``BibTexCleaner.fix``,
# and entries that are not cited are removed.  Unless ``--dryrun`` is given
# the result is written next to the input, with spaces in the file name
# replaced by '_' and a '_clean' suffix.
#
# NOTE(review): this block is stored with its line structure collapsed, so
# the nesting of the trailing ``if 0:`` / ``if full:`` diagnostics cannot be
# determined from here - confirm against the upstream file before
# refactoring.
def main(bib_fpath=None): r""" intro point to fixbib script CommmandLine: fixbib python -m fixtex bib python -m fixtex bib --dryrun python -m fixtex bib --dryrun --debug """ if bib_fpath is None: bib_fpath = 'My Library.bib' # DEBUG = ub.argflag('--debug') # Read in text and ensure ascii format dirty_text = ut.readfrom(bib_fpath) from fixtex.fix_tex import find_used_citations, testdata_fpaths if exists('custom_extra.bib'): extra_parser = bparser.BibTexParser(ignore_nonstandard_types=False) parser = bparser.BibTexParser() ut.delete_keys(parser.alt_dict, ['url', 'urls']) print('Parsing extra bibtex file') extra_text = ut.readfrom('custom_extra.bib') extra_database = extra_parser.parse(extra_text, partial=False) print('Finished parsing extra') extra_dict = extra_database.get_entry_dict() else: extra_dict = None #udata = dirty_text.decode("utf-8") #dirty_text = udata.encode("ascii", "ignore") #dirty_text = udata # parser = bparser.BibTexParser() # bib_database = parser.parse(dirty_text) # d = bib_database.get_entry_dict() print('BIBTEXPARSER LOAD') parser = bparser.BibTexParser(ignore_nonstandard_types=False, common_strings=True) ut.delete_keys(parser.alt_dict, ['url', 'urls']) print('Parsing bibtex file') bib_database = parser.parse(dirty_text, partial=False) print('Finished parsing') bibtex_dict = bib_database.get_entry_dict() old_keys = list(bibtex_dict.keys()) new_keys = [] for key in ub.ProgIter(old_keys, label='fixing keys'): new_key = key new_key = new_key.replace(':', '') new_key = new_key.replace('-', '_') new_key = re.sub('__*', '_', new_key) new_keys.append(new_key) # assert len(ut.find_duplicate_items(new_keys)) == 0, 'new keys created conflict' assert len(ub.find_duplicates(new_keys)) == 0, 'new keys created conflict' for key, new_key in zip(old_keys, new_keys): if key != new_key: entry = bibtex_dict[key] entry['ID'] = new_key bibtex_dict[new_key] = entry del bibtex_dict[key] # The bibtext is now clean. 
Print it to stdout #print(clean_text) verbose = None if verbose is None: verbose = 1 # Find citations from the tex documents key_list = None if key_list is None: cacher = ub.Cacher('texcite1', enabled=0) data = cacher.tryload() if data is None: fpaths = testdata_fpaths() key_list, inverse = find_used_citations(fpaths, return_inverse=True) # ignore = ['JP', '?', 'hendrick'] # for item in ignore: # try: # key_list.remove(item) # except ValueError: # pass if verbose: print('Found %d citations used in the document' % (len(key_list), )) data = key_list, inverse cacher.save(data) key_list, inverse = data # else: # key_list = None unknown_pubkeys = [] debug_author = ub.argval('--debug-author', default=None) # ./fix_bib.py --debug_author=Kappes if verbose: print('Fixing %d/%d bibtex entries' % (len(key_list), len(bibtex_dict))) # debug = True debug = False if debug_author is not None: debug = False known_keys = list(bibtex_dict.keys()) missing_keys = set(key_list) - set(known_keys) if extra_dict is not None: missing_keys.difference_update(set(extra_dict.keys())) if missing_keys: print('The library is missing keys found in tex files %s' % (ub.repr2(missing_keys), )) # Search for possible typos: candidate_typos = {} sedlines = [] for key in missing_keys: candidates = ut.closet_words(key, known_keys, num=3, subset=True) if len(candidates) > 1: top = candidates[0] if ut.edit_distance(key, top) == 1: # "sed -i -e 's/{}/{}/g' *.tex".format(key, top) import os replpaths = ' '.join( [relpath(p, os.getcwd()) for p in inverse[key]]) sedlines.append("sed -i -e 's/{}/{}/g' {}".format( key, top, replpaths)) candidate_typos[key] = candidates print('Cannot find key = %r' % (key, )) print('Did you mean? 
%r' % (candidates, )) print('Quick fixes') print('\n'.join(sedlines)) # group by file just = max([0] + list(map(len, missing_keys))) missing_fpaths = [inverse[key] for key in missing_keys] for fpath in sorted(set(ub.flatten(missing_fpaths))): # ut.fix_embed_globals() subkeys = [k for k in missing_keys if fpath in inverse[k]] print('') ut.cprint('--- Missing Keys ---', 'blue') ut.cprint('fpath = %r' % (fpath, ), 'blue') ut.cprint('{} | {}'.format('Missing'.ljust(just), 'Did you mean?'), 'blue') for key in subkeys: print('{} | {}'.format(ut.highlight_text(key.ljust(just), 'red'), ' '.join(candidate_typos[key]))) # for key in list(bibtex_dict.keys()): if extra_dict is not None: # Extra database takes precidence over regular key_list = list(ut.unique(key_list + list(extra_dict.keys()))) for k, v in extra_dict.items(): bibtex_dict[k] = v full = ub.argflag('--full') for key in key_list: try: entry = bibtex_dict[key] except KeyError: continue self = BibTexCleaner(key, entry, full=full) if debug_author is not None: debug = debug_author in entry.get('author', '') if debug: ut.cprint(' --- ENTRY ---', 'yellow') print(ub.repr2(entry, nl=1)) entry = self.fix() # self.clip_abstract() # self.shorten_keys() # self.fix_authors() # self.fix_year() # old_pubval = self.fix_pubkey() # if old_pubval: # unknown_pubkeys.append(old_pubval) # self.fix_arxiv() # self.fix_general() # self.fix_paper_types() if debug: print(ub.repr2(entry, nl=1)) ut.cprint(' --- END ENTRY ---', 'yellow') bibtex_dict[key] = entry unwanted_keys = set(bibtex_dict.keys()) - set(key_list) if verbose: print('Removing unwanted %d entries' % (len(unwanted_keys))) ut.delete_dict_keys(bibtex_dict, unwanted_keys) if 0: d1 = bibtex_dict.copy() full = True for key, entry in d1.items(): self = BibTexCleaner(key, entry, full=full) pub = self.publication() if pub is None: print(self.entry['ENTRYTYPE']) old = self.fix_pubkey() x1 = self._pubval() x2 = self.standard_pubval(full=full) # if x2 is not None and len(x2) > 5: # 
print(ub.repr2(self.entry)) if x1 != x2: print('x2 = %r' % (x2, )) print('x1 = %r' % (x1, )) print(ub.repr2(self.entry)) # if 'CVPR' in self.entry.get('booktitle', ''): # if 'CVPR' != self.entry.get('booktitle', ''): # break if old: print('old = %r' % (old, )) d1[key] = self.entry if full: d1 = bibtex_dict.copy() import numpy as np import pandas as pd df = pd.DataFrame.from_dict(d1, orient='index') paged_items = df[~pd.isnull(df['pub_accro'])] has_pages = ~pd.isnull(paged_items['pages']) print('have pages {} / {}'.format(has_pages.sum(), len(has_pages))) print(ub.repr2(paged_items[~has_pages]['title'].values.tolist())) entrytypes = dict(list(df.groupby('pub_type'))) if False: # entrytypes['misc'] g = entrytypes['online'] g = g[g.columns[~np.all(pd.isnull(g), axis=0)]] entrytypes['book'] entrytypes['thesis'] g = entrytypes['article'] g = entrytypes['incollection'] g = entrytypes['conference'] def lookup_pub(e): if e == 'article': return 'journal', 'journal' elif e == 'incollection': return 'booksection', 'booktitle' elif e == 'conference': return 'conference', 'booktitle' return None, None for e, g in entrytypes.items(): print('e = %r' % (e, )) g = g[g.columns[~np.all(pd.isnull(g), axis=0)]] if 'pub_full' in g.columns: place_title = g['pub_full'].tolist() print(ub.repr2(ub.dict_hist(place_title))) else: print('Unknown publications') if 'report' in entrytypes: g = entrytypes['report'] missing = g[pd.isnull(g['title'])] if len(missing): print('Missing Title') print(ub.repr2(missing[['title', 'author']].values.tolist())) if 'journal' in entrytypes: g = entrytypes['journal'] g = g[g.columns[~np.all(pd.isnull(g), axis=0)]] missing = g[pd.isnull(g['journal'])] if len(missing): print('Missing Journal') print(ub.repr2(missing[['title', 'author']].values.tolist())) if 'conference' in entrytypes: g = entrytypes['conference'] g = g[g.columns[~np.all(pd.isnull(g), axis=0)]] missing = g[pd.isnull(g['booktitle'])] if len(missing): print('Missing Booktitle') 
print(ub.repr2(missing[['title', 'author']].values.tolist())) if 'incollection' in entrytypes: g = entrytypes['incollection'] g = g[g.columns[~np.all(pd.isnull(g), axis=0)]] missing = g[pd.isnull(g['booktitle'])] if len(missing): print('Missing Booktitle') print(ub.repr2(missing[['title', 'author']].values.tolist())) if 'thesis' in entrytypes: g = entrytypes['thesis'] g = g[g.columns[~np.all(pd.isnull(g), axis=0)]] missing = g[pd.isnull(g['institution'])] if len(missing): print('Missing Institution') print(ub.repr2(missing[['title', 'author']].values.tolist())) # import utool # utool.embed() # Overwrite BibDatabase structure bib_database._entries_dict = bibtex_dict bib_database.entries = list(bibtex_dict.values()) #conftitle_to_types_set_hist = {key: set(val) for key, val in conftitle_to_types_hist.items()} #print(ub.repr2(conftitle_to_types_set_hist)) print('Unknown conference keys:') print(ub.repr2(sorted(unknown_pubkeys))) print('len(unknown_pubkeys) = %r' % (len(unknown_pubkeys), )) writer = BibTexWriter() writer.contents = ['comments', 'entries'] writer.indent = ' ' writer.order_entries_by = ('type', 'author', 'year') new_bibtex_str = bibtexparser.dumps(bib_database, writer) # Need to check #jegou_aggregating_2012 # Fix the Journal Abreviations # References: # https://www.ieee.org/documents/trans_journal_names.pdf # Write out clean bibfile in ascii format clean_bib_fpath = ub.augpath(bib_fpath.replace(' ', '_'), suffix='_clean') if not ub.argflag('--dryrun'): ut.writeto(clean_bib_fpath, new_bibtex_str)
def simple_munkres(part_oldnames):
    """
    Defines a munkres problem to solve name rectification.

    Notes:
        We create a matrix where each row represents a group of annotations
        in the same PCC and each column represents an original name. If
        there are more PCCs than original names the columns are padded with
        extra values. The matrix is first initialized to a large negative
        value representing impossible assignments. Then for each column
        representing a padded name, we set its value to $1$ indicating that
        each new name could be assigned to a padded name for some small
        profit. Finally, let $f_{rc}$ be the number of annotations in row
        $r$ with an original name of $c$. Each matrix value $(r, c)$ is set
        to $f_{rc} + 1$ if $f_{rc} > 0$, to represent how much each name
        ``wants'' to be labeled with a particular original name, and the
        extra one ensures that these original names are always preferred
        over padded names.

    Args:
        part_oldnames (List[List]): for each new group, the original names
            of the annotations it contains (duplicates count as votes).

    Returns:
        list: for each group, the original name assigned to it, or None
            when the group was assigned to a padded (extra) name.

    Example:
        >>> part_oldnames = [['a', 'b'], ['b', 'c'], ['c', 'a', 'a']]
        >>> new_names = simple_munkres(part_oldnames)
        >>> print(new_names)
        ['b', 'c', 'a']

    Example:
        >>> part_oldnames = [[], ['a', 'a'], [],
        >>>                  ['a', 'a', 'a', 'a', 'a', 'a', 'a', 'b'], ['a']]
        >>> new_names = simple_munkres(part_oldnames)
        >>> print(new_names)
        [None, 'a', None, 'b', None]

    Example:
        >>> part_oldnames = [[], ['b'], ['a', 'b', 'c'], ['b', 'c'],
        >>>                  ['c', 'e', 'e']]
        >>> new_names = simple_munkres(part_oldnames)
        >>> print(new_names)
        [None, 'b', 'a', 'c', 'e']

        Profit Matrix
            b   a   c   e  _0
        0 -10 -10 -10 -10   1
        1   2 -10 -10 -10   1
        2   2   2   2 -10   1
        3   2 -10   2 -10   1
        4 -10 -10   2   3   1
    """
    import collections
    import itertools

    num_new_names = len(part_oldnames)
    if num_new_names == 0:
        # Nothing to assign; the matrix below would be empty and its
        # max/min undefined.
        return []

    # Distinct original names in order of first appearance
    unique_old_names = list(dict.fromkeys(
        itertools.chain.from_iterable(part_oldnames)))
    num_old_names = len(unique_old_names)

    # Create padded dummy values.  This accounts for the case where it is
    # impossible to uniquely map to the old db
    num_pad = max(num_new_names - num_old_names, 0)
    total = num_old_names + num_pad
    shape = (total, total)

    # Allocate assignment matrix.
    # rows are new-names and cols are old-names.
    # Initially the profit of any assignment is effectively -inf
    # This effectively marks all assignments as invalid
    # (``np.int`` was removed in NumPy 1.24; the builtin is equivalent.)
    profit_matrix = np.full(shape, -2 * total, dtype=int)

    # Overwrite valid assignments with positive profits
    oldname2_idx = {name: idx for idx, name in enumerate(unique_old_names)}
    name_freq_list = [collections.Counter(names) for names in part_oldnames]

    # Initialize profit of a valid assignment as 1 + freq
    # This incentivizes using a previously used name
    for rowx, name_freq in enumerate(name_freq_list):
        for name, freq in name_freq.items():
            colx = oldname2_idx[name]
            profit_matrix[rowx, colx] = freq + 1

    # Set a much smaller profit for using an extra name
    # This allows the solution to always exist
    profit_matrix[:, num_old_names:total] = 1

    # Convert to minimization problem
    big_value = (profit_matrix.max()) - (profit_matrix.min())
    cost_matrix = big_value - profit_matrix

    # Use scipy implementation of munkres algorithm.
    rx2_cx = dict(zip(*scipy.optimize.linear_sum_assignment(cost_matrix)))

    # Each row (new-name) has now been assigned a column (old-name)
    # Map this back to the input-space (using None to indicate extras)
    cx2_name = dict(enumerate(unique_old_names))

    assignment_ = [cx2_name.get(rx2_cx[rx], None)
                   for rx in range(num_new_names)]
    return assignment_
def naive_password_strategy(required_len=14, required_caps=1,
                            required_special=1, required_digits=1):
    """
    Simulate a "bad" password that meets typical password requirements

    Get a naive version of the N char min special char password

    One common strategy for getting a 14 char pass is using 2 words or a
    word and a date with misspellings, shuffled case, and a special char,
    which is probably _, -, ., !, or @

    Args:
        required_len (int): minimum password length. Two characters are
            reserved for a digit and a special char; the words must be
            longer than the rest.
        required_caps (int): multiplier of the capitalization factor;
            0 disables it.
        required_special (int): multiplier of the special character factor;
            0 disables it.
        required_digits (int): multiplier of the digit factor; 0 disables
            it.

    Returns:
        dict: with keys ``'name'`` (a label built from the requirements),
            ``'num'`` (always 1) and ``'base'`` (the estimated number of
            candidate passwords).

    Example:
        >>> scheme = naive_password_strategy(required_len=31)
        >>> print(scheme)
        {'name': 'naive-31-caps-digit-special', 'num': 1, 'base': 578592}
    """
    import functools
    import itertools as it
    import operator as op

    # When people are forced to include a special character, this is the
    # liklihood they choose one of the following:
    # https://www.reddit.com/r/dataisbeautiful/comments/2vfgvh/most_frequentlyused_special_characters_in_10/
    special_char_freq = {
        '_': 0.332,
        '.': 0.304,
        '-': 0.086,
        '!': 0.065,
        '@': 0.052,
        '*': 0.032,
        '$': 0.019,
        '&': 0.009,
        '%': 0.007,
    }
    _total = sum(special_char_freq.values())

    # Only search the most likely special chars (normalized prob > 5%)
    naive_special_chars = {
        char: freq / _total
        for char, freq in special_char_freq.items()
        if freq / _total > 0.05
    }

    # Number of common password words with a specific length
    word_length_hist = {
        1: 10,
        2: 90,
        3: 582,
        4: 2279,
        5: 3350,
        6: 1313,
        7: 539,
        8: 22,
        9: 5,
        10: 2,
    }

    # Also needs a number and special char
    required_word_len = required_len - 2

    # How many permutations of N words are there that get over the char
    # limit?
    total_passwords = 0
    possible_num_word = [1, 2, 3]
    for num_words in possible_num_word:
        # The augmentation factors depend only on the number of words, so
        # they are computed once per phrase size.

        # People might insert a special character at the start, middle,
        # or end, or predictably replace a letter.
        predictability_factor = 2
        num_special_locs = (num_words + 1) * predictability_factor
        special_factor = (required_special * len(naive_special_chars) *
                          num_special_locs)

        # People might insert a digit at start, middle, or end, or maybe
        # inside of a word replacing a common letter.
        num_digit_locs = num_words + 1
        num_digits = 10 + 100  # usually a 1 or 2 digit number
        digit_factor = required_digits * num_digits * num_digit_locs

        # People might only shuffle the case of 1 or 2 letters.
        # usually at the beginning of words
        caps_factor = required_caps * num_words

        augment_factor = ((1 + special_factor) * (1 + caps_factor) *
                          (1 + digit_factor))

        for combo in it.product(word_length_hist.items(), repeat=num_words):
            lengths = [length for length, _ in combo]
            # If the lengths are above, we can take any of these
            # permutations (with replacement)
            if sum(lengths) > required_word_len:
                counts = [count for _, count in combo]
                # Compute the number of phrases, then augment this with
                # the special properties.
                num_phrases = functools.reduce(op.mul, counts)
                total_passwords += num_phrases * augment_factor

    name_parts = ['naive', str(required_len)]
    if required_caps:
        name_parts.append('caps')
    if required_digits:
        name_parts.append('digit')
    if required_special:
        name_parts.append('special')
    name = '-'.join(name_parts)

    scheme = {
        'name': name,
        'num': 1,
        'base': total_passwords,
    }
    return scheme
def _percent(numer, denom):
    """Return ``100 * numer / denom``, or NaN when ``denom`` is zero."""
    if not denom:
        return float('nan')
    return 100 * numer / denom


def _measurement_stats_table(py_df, mat_df):
    """
    Build the python-vs-matlab summary table of measurement statistics.

    Args:
        py_df (pandas.DataFrame): python measurements; must contain the
            columns ``fishLength``, ``fishRange`` and ``Err``.
        mat_df (pandas.DataFrame): matlab measurements with the same columns.

    Returns:
        pandas.DataFrame: one ``mean ± std`` row per measured column plus an
            ``nTotal`` row, with the columns ``python`` and ``matlab``.
    """
    import pandas as pd
    stats = pd.DataFrame(columns=['python', 'matlab'])
    for key in ['fishLength', 'fishRange', 'Err']:
        stats.loc[key, 'python'] = '{:6.2f} ± {:6.2f}'.format(
            py_df[key].mean(), py_df[key].std())
        stats.loc[key, 'matlab'] = '{:6.2f} ± {:6.2f}'.format(
            mat_df[key].mean(), mat_df[key].std())
    stats.loc['nTotal', 'python'] = '{}'.format(len(py_df))
    stats.loc['nTotal', 'matlab'] = '{}'.format(len(mat_df))
    return stats


def compare_results():
    """
    Print a comparison of the python and matlab fish measurements.

    The python measurements are read from ``measurements_haul83.csv`` in the
    current working directory and the matlab measurements come from
    ``_read_kresimir_results()``. Both tables are restricted to their
    overlapping frame range. Within each frame present in both tables,
    detections are paired by a minimum weight assignment on the distances
    between bounding box centers, and a pair is kept only when the distances
    in both views are below a threshold and agree with each other.

    Three summary tables are printed: all detections, the detections in
    common, and the common detections with a non-zero matlab QC code.
    Nothing is returned.
    """
    print('Comparing results')
    import pandas as pd
    from tabulate import tabulate

    def show_table(stats):
        print(tabulate(stats, headers='keys', tablefmt='psql',
                       stralign='right'))

    # Read in output of demo script
    measure_fpath = 'measurements_haul83.csv'
    # ``DataFrame.from_csv`` was removed in pandas 1.0; ``read_csv`` is its
    # documented replacement.
    py_df = pd.read_csv(measure_fpath, index_col=None)
    # Convert python length output from mm into cm for consistency
    py_df['fishlen'] = py_df['fishlen'] / 10
    # ``np.int`` was an alias of the builtin ``int`` and has been removed
    # from numpy, so the builtin is used directly.
    py_df['current_frame'] = py_df['current_frame'].astype(int)

    # janky CSV parsing
    # SECURITY NOTE(review): this evaluates text read from the CSV file as
    # python code (with the numpy namespace). Only run this on measurement
    # files from a trusted source.
    def parse_pts(text):
        return eval(text.replace(';', ','), np.__dict__)

    def to_obox(pts):
        rect = cv2.minAreaRect(pts[:, None, :].astype(int))
        return ctalgo.OrientedBBox(*rect)

    py_df['box_pts1'] = py_df['box_pts1'].map(parse_pts)
    py_df['box_pts2'] = py_df['box_pts2'].map(parse_pts)
    py_df['obox1'] = [to_obox(pts) for pts in py_df['box_pts1']]
    py_df['obox2'] = [to_obox(pts) for pts in py_df['box_pts2']]
    py_df.drop(['box_pts1', 'box_pts2'], axis=1, inplace=True)

    # Remap to matlab names
    py_df = py_df.rename(columns={
        'error': 'Err',
        'fishlen': 'fishLength',
        'range': 'fishRange',
    })

    # Load matlab results
    mat_df = _read_kresimir_results()

    FORCE_COMPARABLE_RANGE = True
    # FORCE_COMPARABLE_RANGE = False
    if FORCE_COMPARABLE_RANGE:
        # Be absolutely certain we are in comparable regions (may slightly
        # bias results, against python and in favor of matlab)
        min_frame = max(mat_df.current_frame.min(),
                        py_df.current_frame.min())
        max_frame = min(mat_df.current_frame.max(),
                        py_df.current_frame.max())
        print('min_frame = {!r}'.format(min_frame))
        print('max_frame = {!r}'.format(max_frame))

        mat_df = mat_df[(mat_df.current_frame >= min_frame) &
                        (mat_df.current_frame <= max_frame)]
        py_df = py_df[(py_df.current_frame >= min_frame) &
                      (py_df.current_frame <= max_frame)]

    intersect_frames = np.intersect1d(mat_df.current_frame,
                                      py_df.current_frame)
    print('intersecting frames = {} / {} (matlab)'.format(
        len(intersect_frames), len(set(mat_df.current_frame))))
    print('intersecting frames = {} / {} (python)'.format(
        len(intersect_frames), len(set(py_df.current_frame))))

    # Reuse the hungarian algorithm implementation from ctalgo
    min_assign = ctalgo.FishStereoMeasurments.minimum_weight_assignment

    # arbitrarily chosen threshold
    thresh = 100

    correspond = []
    for frame in intersect_frames:
        # Positional row indices of this frame's detections in each table
        pidxs = np.where(py_df.current_frame == frame)[0]
        midxs = np.where(mat_df.current_frame == frame)[0]

        pdf = py_df.iloc[pidxs]
        mdf = mat_df.iloc[midxs]

        ppts1 = np.array([o.center for o in pdf['obox1']])
        mpts1 = np.array([o.center for o in mdf['obox1']])

        ppts2 = np.array([o.center for o in pdf['obox2']])
        mpts2 = np.array([o.center for o in mdf['obox2']])

        dists1 = sklearn.metrics.pairwise.pairwise_distances(ppts1, mpts1)
        dists2 = sklearn.metrics.pairwise.pairwise_distances(ppts2, mpts2)

        # The assignment is solved on the first view only; the second view
        # is used to reject pairs that are inconsistent.
        for i, j in min_assign(dists1):
            d1 = dists1[i, j]
            d2 = dists2[i, j]
            if d1 < thresh and d2 < thresh and abs(d1 - d2) < thresh / 4:
                correspond.append((pidxs[i], midxs[j]))
    # Force an (N, 2) shape so the column lookups below also work when no
    # correspondences were found.
    correspond = np.array(correspond, dtype=int).reshape(-1, 2)

    # Flags the matlab rows that were matched to a python detection
    mflags = np.array(ub.boolmask(correspond.T[1], len(mat_df)))

    print('\n\n----\n## All stats\n')
    print(ub.codeblock(
        '''
        Overall, the matlab script made {nmat} length measurements and the
        python script made {npy} length measurements.  Here is a table
        summarizing the average lengths / ranges / errors of each script:
        ''').format(npy=len(py_df), nmat=len(mat_df)))
    show_table(_measurement_stats_table(py_df, mat_df))

    print('\n\n----\n## Only COMMON detections\n')
    py_df_c = py_df.iloc[correspond.T[0]]
    mat_df_c = mat_df.iloc[correspond.T[1]]
    stats = _measurement_stats_table(py_df_c, mat_df_c)

    print(ub.codeblock(
        '''
        Now, we investigate how many dections matlab and python made in common.
        (Note, choosing which dections in one version correspond to which in
        another is done using a heuristic based on distances between bbox
        centers and a thresholded minimum assignment problem).

        Python made {npy_c}/{nmat} = {percent:.2f}% of the detections matlab made
        ''').format(npy_c=len(py_df_c), nmat=len(mat_df),
                    percent=_percent(len(py_df_c), len(mat_df))))
    show_table(stats)

    print('\n\n----\n## Evaulation using the QC code\n')
    hist_hit = ub.dict_hist(mat_df[mflags]['QC'].values)
    hist_miss = ub.dict_hist(mat_df[~mflags]['QC'].values)

    def qc_percent(part_hist, code):
        # A QC code may be absent from either histogram, so count it as 0
        # instead of raising a KeyError.
        total = hist_hit.get(code, 0) + hist_miss.get(code, 0)
        return _percent(part_hist.get(code, 0), total)

    print(ub.codeblock(
        '''
        However, not all of those matlab detections were good.  Because we have
        detections in corrsepondences with each other we can assign the python
        detections QC codes.

        Here is a histogram of the QC codes for these python detections:

        {}
        (Note: read histogram as <QC-code>: <frequency>)

        Here is a histogram of the other matlab detections that python did not
        find:

        {}

        To summarize:

            python correctly rejected {:.2f}% of the matlab QC=0 detections
            python correctly accepted {:.2f}% of the matlab QC=1 detections
            python correctly accepted {:.2f}% of the matlab QC=2 detections

            Note, that because python made detections that matlab did not make,
            the remaining {} detections may be right or wrong, but there is no
            way to tell from this analysis.

        Lastly, here are the statistics for the common detections that had a
        non-zero QC code.
        ''').format(
            ub.repr2(hist_hit, nl=1),
            ub.repr2(hist_miss, nl=1),
            qc_percent(hist_miss, 0),
            qc_percent(hist_hit, 1),
            qc_percent(hist_hit, 2),
            len(py_df) - len(py_df_c)
        )
    )

    # Keep only the common detections that matlab gave a non-zero QC code
    is_qc = (mat_df_c['QC'] > 0).values
    mat_df_c = mat_df_c[is_qc]
    py_df_c = py_df_c[is_qc]
    show_table(_measurement_stats_table(py_df_c, mat_df_c))