def group_pfiles(cls, pfiles, step_idx=None):
    """
    Bucket progressive files into groups that *might* hold identical files.

    Args:
        pfiles (List[ProgressiveFile]): the files to group.
        step_idx (int | None): refinement step whose id is used as the
            grouping key.  When None, the files are first split by size and
            a compatible step index is chosen separately for each size
            group.

    Returns:
        dict-like: maps a step id to the list of files that share it.

    Example:
        >>> fpaths = _demodata_files()
        >>> pfiles = [ProgressiveFile(f) for f in fpaths]
        >>> groups1 = ProgressiveFile.group_pfiles(pfiles)
        >>> for pfile in pfiles:
        >>>     pfile.refine()
        >>> groups2 = ProgressiveFile.group_pfiles(pfiles)
        >>> for pfile in pfiles[0::2]:
        >>>     pfile.refine()
        >>> groups3 = ProgressiveFile.group_pfiles(pfiles)
        >>> for pfile in pfiles[1::2]:
        >>>     pfile.refine()
        >>> groups4 = ProgressiveFile.group_pfiles(pfiles)
    """
    if step_idx is not None:
        # The caller pinned the step to use, so group on it directly.
        return ub.group_items(pfiles, key=lambda pf: pf.step_id(step_idx))

    # No step given: only files of equal size can possibly be duplicates.
    merged = ub.ddict(list)
    by_size = ub.group_items(pfiles, key=lambda pf: pf.size)
    for same_size in by_size.values():
        # Use the minimum refine step available for any unfinished file in
        # this size group so that all of its step ids are comparable.
        shared_idx = ProgressiveFile.compatible_step_idx(same_size)
        by_step = ub.group_items(
            same_size, key=lambda pf: pf.step_id(shared_idx))
        for step_id, members in by_step.items():
            merged[step_id].extend(members)
    return merged
def init_test_mode(infr):
    """
    Switch the inference object into test mode and reset its bookkeeping.

    Sets ``test_mode``, clears ``metrics_list``, zeroes the counters in
    ``test_state``, builds a dynamic connectivity graph over ``infr.aids``
    for tracking ground-truth positives, and derives the ground-truth
    lookup tables from ``infr.aids`` / ``infr.orig_name_labels``.

    Args:
        infr: inference object; must provide ``aids``, ``orig_name_labels``
            and a ``print`` method.
    """
    from graphid.core import nx_dynamic_graph
    infr.print('init_test_mode')
    infr.test_mode = True
    infr.metrics_list = []
    infr.test_state = {
        'n_decision': 0,
        'n_algo': 0,
        'n_manual': 0,
        'n_true_merges': 0,
        'n_error_edges': 0,
        'confusion': None,
    }
    infr.test_gt_pos_graph = nx_dynamic_graph.DynConnGraph()
    infr.test_gt_pos_graph.add_nodes_from(infr.aids)

    # Ground truth: name label -> annotations, and annotation -> name label.
    # (These were previously computed twice with identical inputs.)
    infr.nid_to_gt_cc = ub.group_items(infr.aids, infr.orig_name_labels)
    infr.node_truth = ub.dzip(infr.aids, infr.orig_name_labels)

    # A spanning tree over a component with n nodes has n - 1 edges.
    infr.real_n_pcc_mst_edges = sum(
        len(cc) - 1 for cc in infr.nid_to_gt_cc.values())
    infr.print('real_n_pcc_mst_edges = %r' % (
        infr.real_n_pcc_mst_edges, ), color='red')
def test_group_items_sorted():
    """
    The ``sorted_`` flag must not change which items end up in which group.
    """
    item_list = ('ham', 'jam', 'spam', 'eggs', 'cheese', 'banana')
    groupid_list = (
        'protein', 'fruit', 'protein', 'protein', 'dairy', 'fruit')

    unsorted_groups = ub.group_items(item_list, groupid_list, sorted_=False)
    sorted_groups = ub.group_items(item_list, groupid_list, sorted_=True)

    # Compare as sets so the order inside each group is irrelevant.
    unsorted_groups = ub.map_vals(set, unsorted_groups)
    sorted_groups = ub.map_vals(set, sorted_groups)
    assert unsorted_groups == sorted_groups
def fix_conference_places(bibman):
    """
    Fill in the ``address`` field of conference entries.

    For every cleaned entry whose ``pub_type`` is ``'conference'`` the venue
    is looked up by acronym; when a place is registered for the entry's
    year it is written to ``entry['address']``.  Every (acronym, year) pair
    without a known place is collected and reported at the end.

    Args:
        bibman: object exposing a ``cleaned`` mapping of bib entries.
    """
    pubman = constants_tex_fixes.PubManager()
    missing = set()
    for entry in bibman.cleaned.values():
        if entry['pub_type'] != 'conference':
            continue
        accro = entry['pub_accro']
        year = entry['year']
        pub = pubman.find(accro)
        if pub.places is None or int(year) not in pub.places:
            missing.add((accro, year))
            continue
        place = pub.places[int(year)]
        print('place = {!r}'.format(place))
        entry['address'] = place

    if not missing:
        return

    # Report the missing locations grouped by conference acronym.
    missing = list(missing)
    used_years = ub.group_items(missing, ut.take_column(missing, 0))
    for accro in list(used_years.keys()):
        used_years[accro] = sorted(used_years[accro])
    # Order conferences by (number of missing years, latest missing year).
    sortby = ub.map_vals(
        lambda vs: (len(vs), max(e[1] for e in vs)), used_years)
    used_years = ut.order_dict_by(used_years, ub.argsort(sortby))
    print('NEED CONFERENCE LOCATIONS')
    print(ub.repr2(used_years, nl=2))
def _print_previous_loop_statistics(infr, count):
    """
    Print summary statistics about the most recent ``count`` decisions.

    Reports how often recovery mode was entered and how many decisions were
    made in it, followed by histograms of the test actions, (optionally)
    the inference actions, the predicted decisions, and the user ids.

    Args:
        infr: inference object providing ``metrics_list``, ``params`` and a
            ``print`` method.
        count (int): number of trailing entries of ``infr.metrics_list`` to
            summarize.
    """
    # Print stats about what happened in this loop
    history = infr.metrics_list[-count:]

    # Run-length encode the 'recovering' flag: each run of consecutive equal
    # flags becomes one (flag, length) block.  ``ub.group_items`` needs the
    # items and their group ids as two separate arguments; passing a single
    # list of pairs raises a TypeError.
    run_flags = []
    run_sizes = []
    for flag, run in it.groupby(util.take_column(history, 'recovering')):
        run_flags.append(flag)
        run_sizes.append(sum(1 for _ in run))
    recover_blocks = ub.group_items(run_sizes, run_flags).get(True, [])

    infr.print(
        ('Recovery mode entered {} times, '
         'made {} recovery decisions.').format(
             len(recover_blocks), sum(recover_blocks)), color='green')

    testaction_hist = ub.dict_hist(util.take_column(
        history, 'test_action'))
    infr.print('Test Action Histogram: {}'.format(
        ub.repr2(testaction_hist, si=True)), color='yellow')

    if infr.params['inference.enabled']:
        # Actions are made hashable via frozenset so they can be counted.
        action_hist = ub.dict_hist(
            util.emap(frozenset, util.take_column(history, 'action')))
        infr.print('Inference Action Histogram: {}'.format(
            ub.repr2(action_hist, si=True)), color='yellow')

    infr.print('Decision Histogram: {}'.format(
        ub.repr2(ub.dict_hist(util.take_column(history, 'pred_decision')),
                 si=True)), color='yellow')
    infr.print('User Histogram: {}'.format(
        ub.repr2(ub.dict_hist(util.take_column(history, 'user_id')),
                 si=True)), color='yellow')
def find_connecting_edges(infr):
    """
    Searches for a small set of edges, which if reviewed as positive would
    ensure that each PCC is k-connected.  Note that in some cases this is
    not possible.

    Returns:
        list: the proposed augmenting edges over all name labels.
    """
    node_to_label = infr.get_node_attrs('name_label')
    label_to_nodes = ub.group_items(node_to_label.keys(),
                                    node_to_label.values())

    # k = infr.params['redun.pos']
    k = 1
    new_edges = []
    prog = ub.ProgIter(list(label_to_nodes.keys()),
                       desc='finding connecting edges',
                       enabled=infr.verbose > 0)
    for nid in prog:
        nodes = set(label_to_nodes[nid])
        G = infr.pos_graph.subgraph(nodes, dynamic=False)

        # Edges already reviewed as negative or incomparable cannot be used.
        impossible = nxu.edges_inside(infr.neg_graph, nodes)
        impossible |= nxu.edges_inside(infr.incomp_graph, nodes)

        # Every edge not yet in the positive subgraph is a candidate.
        candidates = set(nx.complement(G).edges()) - impossible

        new_edges.extend(
            nxu.k_edge_augmentation(G, k=k, avail=candidates))
    prog.ensure_newline()
    return new_edges
def master():
    """
    Download the game master file and print every move whose unique id
    contains ``MOONBLAST``.

    The download is cached by ``ub.grabdata`` and expires after one day.
    """
    master_fpath = ub.grabdata(
        'https://raw.githubusercontent.com/pokemongo-dev-contrib/pokemongo-game-master/master/versions/latest/V2_GAME_MASTER.json',
        expires=24 * 60 * 60)

    with open(master_fpath) as file:
        master = json.load(file)
    master.keys()

    def item_type(item):
        # Classify a template item; 'move' takes priority over 'pokemon'.
        data = item['data']
        for kind in ('move', 'pokemon'):
            if kind in data:
                return kind
        return None

    type_to_items = ub.group_items(master['template'], key=item_type)
    pokemon_items = type_to_items['pokemon']  # NOQA
    move_items = type_to_items['move']

    for item in move_items:
        if 'MOONBLAST' in item['data']['move']['uniqueId']:
            print('item = {}'.format(ub.repr2(item, nl=3)))
def test_group_items_callable():
    """
    Grouping by a callable key must match grouping by an explicit id list.
    """
    lut = dict([
        ('ham', 'protein'),
        ('jam', 'fruit'),
        ('spam', 'protein'),
        ('eggs', 'protein'),
        ('cheese', 'dairy'),
        ('banana', 'fruit'),
    ])
    # All item names are unique, so the dict preserves the pair order.
    items = tuple(lut.keys())
    groupids = tuple(lut.values())

    by_list = ub.map_vals(set, ub.group_items(items, groupids))
    by_func = ub.map_vals(set, ub.group_items(items, lut.__getitem__))
    assert by_list == by_func
def draw(self, color='blue', ax=None, alpha=None, coord_axes=[1, 0],
         radius=1):
    """
    Draw each coordinate as a filled circle on a matplotlib axes.

    Note:
        unlike other methods, the defaults assume x/y internal data

    Args:
        color: anything accepted by ``kwimage.Color``.
        ax: matplotlib axes to draw on; defaults to ``plt.gca()``.
        alpha (float | Iterable[float] | None): one opacity for all points
            or one per point.  None means fully opaque.
        coord_axes (Tuple): specify which image axes each coordinate dim
            corresponds to.  For 2D images, if you are storing r/c data,
            set to [0,1], if you are storing x/y data, set to [1,0].
            (The default list is only read, never mutated.)
        radius (float): radius of each drawn circle.

    Returns:
        list: the patch collections that were added to ``ax``, one per
        distinct point color.

    Raises:
        NotImplementedError: if the coordinates are not 2-dimensional.

    Example:
        >>> # xdoc: +REQUIRES(module:kwplot)
        >>> from kwimage.structs.coords import *  # NOQA
        >>> self = Coords.random(10)
        >>> # xdoc: +REQUIRES(--show)
        >>> self.draw(radius=3.0)
        >>> import kwplot
        >>> kwplot.autompl()
        >>> self.draw(radius=3.0)
    """
    import matplotlib as mpl
    import kwimage
    from matplotlib import pyplot as plt
    if ax is None:
        ax = plt.gca()
    data = self.data

    if self.dim != 2:
        raise NotImplementedError('need 2d for mpl')

    # Broadcast alpha so there is exactly one value per point.
    num_points = len(data)
    if alpha is None:
        alpha = [1.0] * num_points
    elif not ub.iterable(alpha):
        alpha = [alpha] * num_points

    # More grouped patches == more efficient runtime
    ptcolors = [kwimage.Color(color, alpha=a).as01('rgba') for a in alpha]
    color_groups = ub.group_items(range(len(ptcolors)), ptcolors)

    circle_kw = {'radius': radius, 'fill': True}

    collections = []
    for pcolor, idxs in color_groups.items():
        patches = []
        for row in data[idxs]:
            y, x = row[coord_axes]
            patches.append(
                mpl.patches.Circle((x, y), ec=None, fc=pcolor, **circle_kw))
        col = mpl.collections.PatchCollection(patches, match_original=True)
        collections.append(col)
        ax.add_collection(col)
    return collections
def test_group_items_sorted_mixed_types():
    """
    Group ids of mixed types (ints and strs) must group the same way with
    and without ``sorted_``, and must stay distinct keys.
    """
    import random
    groupid_list = [1, 2, 3] * 4 + ['1', '2', '3'] * 4
    item_list = list(range(len(groupid_list)))

    # Randomize the order; the shared seed keeps items and ids aligned.
    seed = 947043
    random.Random(seed).shuffle(groupid_list)
    random.Random(seed).shuffle(item_list)

    sorted_groups = ub.map_vals(
        set, ub.group_items(item_list, groupid_list, sorted_=True))
    unsorted_groups = ub.map_vals(
        set, ub.group_items(item_list, groupid_list, sorted_=False))

    assert sorted_groups == unsorted_groups
    assert '1' in sorted_groups
    assert 1 in sorted_groups
def _devcheck_manage_monitor(workdir, dry=True):
    """
    Thin out the monitor images stored under each training session.

    For every monitor directory and image extension, a fixed-seed shuffle
    selects at most ``max_keep`` files to keep; the rest are marked for
    deletion.  A per-action summary is printed, and the marked files are
    deleted unless ``dry`` is True.

    Args:
        workdir: directory passed to ``collect_sessions``.
        dry (bool): if True only report what would be done.
    """
    all_sessions = collect_sessions(workdir)

    # Get all the images in the monitor directories
    # (this is a convention and not something netharn does by default)
    all_files = []
    max_keep = 300

    def _choose_action(file_infos):
        import kwarray
        shuffled = kwarray.shuffle(file_infos, rng=0)
        for position, info in enumerate(shuffled):
            info['action'] = 'keep' if position < max_keep else 'delete'

    monitor_subdirs = [
        ('monitor', 'train', 'batch'),
        ('monitor', 'vali', 'batch'),
        ('monitor', 'train'),
        ('monitor', 'vali'),
    ]
    exts = ['*.jpg', '*.png']

    for session in ub.ProgIter(all_sessions, desc='checking monitor files'):
        for parts in monitor_subdirs:
            dpath = join(session.dpath, *parts)
            for ext in exts:
                file_infos = []
                for fpath in glob.glob(join(dpath, ext)):
                    file_infos.append({
                        'size': os.stat(fpath).st_size,
                        'fpath': fpath,
                    })
                _choose_action(file_infos)
                all_files.extend(file_infos)

    grouped_actions = ub.group_items(all_files, lambda x: x['action'])

    for key, group in grouped_actions.items():
        size = byte_str(sum([s['size'] for s in group]))
        print('{:>4} images:  {:>4}, size={}'.format(
            key.capitalize(), len(group), size))

    if dry:
        print('Dry run')
        return

    to_delete = grouped_actions.get('delete', [])
    delete_fpaths = [item['fpath'] for item in to_delete]
    for p in ub.ProgIter(delete_fpaths, desc='deleting'):
        ub.delete(p)
def test_class_torch():
    """
    Run class-aware NMS three ways on CUDA tensors and print how the
    results compare: (1) ``torch_nms`` applied per class group, (2)
    ``torch_nms`` with the ``classes`` argument, (3) the netharn
    ``non_max_supression`` wrapper with ``impl='gpu'``.
    """
    import numpy as np
    import torch
    import netharn as nh
    import ubelt as ub

    thresh = .5
    num = 500
    rng = nh.util.ensure_rng(0)
    cpu_boxes = nh.util.Boxes.random(num, scale=400.0, rng=rng,
                                     format='tlbr', tensor=True)
    cpu_tlbr = cpu_boxes.to_tlbr().data
    num_boxes = len(cpu_tlbr)
    # make all scores unique to ensure comparability
    cpu_scores = torch.Tensor(np.linspace(0, 1, num_boxes))
    cpu_cls = torch.LongTensor(rng.randint(0, 10, num_boxes))

    tlbr = cpu_boxes.to_tlbr().data.to('cuda')
    scores = cpu_scores.to('cuda')
    classes = cpu_cls.to('cuda')

    # Variant 1: suppress inside each class independently.
    class_to_idxs = ub.group_items(range(len(classes)),
                                   classes.cpu().numpy())
    keep1 = []
    for idxs in class_to_idxs.values():
        cls_keep = torch_nms(tlbr[idxs], scores[idxs], thresh=thresh, bias=0)
        keep1.extend(list(ub.compress(idxs, cls_keep.cpu().numpy())))
    keep1 = sorted(keep1)

    # Variant 2: let torch_nms handle the classes itself.
    keep_ = torch_nms(tlbr, scores, classes=classes, thresh=thresh, bias=0)
    keep2 = np.where(keep_.cpu().numpy())[0].tolist()

    # Variant 3: the high-level wrapper on numpy inputs.
    keep3 = nh.util.non_max_supression(tlbr.cpu().numpy(),
                                       scores.cpu().numpy(),
                                       classes=classes.cpu().numpy(),
                                       thresh=thresh, bias=0, impl='gpu')

    for kept in (keep1, keep2, keep3):
        print(len(kept))

    print(set(keep1) - set(keep2))
    print(set(keep2) - set(keep1))
def test_class_torch():
    """
    Run class-aware NMS three ways on CUDA tensors and print how the
    results compare: (1) ``torch_nms`` applied per class group, (2)
    ``torch_nms`` with the ``classes`` argument, (3)
    ``kwimage.non_max_supression`` with ``impl='gpu'``.
    """
    import numpy as np
    import torch
    import ubelt as ub
    import kwarray
    import kwimage

    thresh = .5
    num = 500
    rng = kwarray.ensure_rng(0)
    cpu_boxes = kwimage.Boxes.random(num, scale=400.0, rng=rng,
                                     format='ltrb', tensor=True)
    cpu_ltrb = cpu_boxes.to_ltrb().data
    num_boxes = len(cpu_ltrb)
    # make all scores unique to ensure comparability
    cpu_scores = torch.Tensor(np.linspace(0, 1, num_boxes))
    cpu_cls = torch.LongTensor(rng.randint(0, 10, num_boxes))

    ltrb = cpu_boxes.to_ltrb().data.to('cuda')
    scores = cpu_scores.to('cuda')
    classes = cpu_cls.to('cuda')

    # Variant 1: suppress inside each class independently.
    class_to_idxs = ub.group_items(range(len(classes)),
                                   classes.cpu().numpy())
    keep1 = []
    for idxs in class_to_idxs.values():
        cls_keep = torch_nms(ltrb[idxs], scores[idxs], thresh=thresh, bias=0)
        keep1.extend(list(ub.compress(idxs, cls_keep.cpu().numpy())))
    keep1 = sorted(keep1)

    # Variant 2: let torch_nms handle the classes itself.
    keep_ = torch_nms(ltrb, scores, classes=classes, thresh=thresh, bias=0)
    keep2 = np.where(keep_.cpu().numpy())[0].tolist()

    # Variant 3: the high-level wrapper on numpy inputs.
    keep3 = kwimage.non_max_supression(ltrb.cpu().numpy(),
                                       scores.cpu().numpy(),
                                       classes=classes.cpu().numpy(),
                                       thresh=thresh, bias=0, impl='gpu')

    for kept in (keep1, keep2, keep3):
        print(len(kept))

    print(set(keep1) - set(keep2))
    print(set(keep2) - set(keep1))
def randomized_ibeis_dset(dbname, dim=224):
    """
    Build randomized train / test datasets from an IBEIS database.

    The positive connected components (PCCs) are grouped by size and each
    size group is shuffled with a fixed seed and split into disjoint test,
    validation and training parts.  With the current fractions the
    validation part is always empty.

    Args:
        dbname (str): name of the database to load.
        dim (int): forwarded to ``RandomBalancedIBEISSample``.

    Returns:
        dict: ``{'train': ..., 'test': ...}`` datasets; the train dataset
        has augmentation enabled.

    Ignore:
        >>> from clab.live.siam_train import *
        >>> datasets = randomized_ibeis_dset('PZ_MTEST')
        >>> ut.qtensure()
        >>> self = datasets['train']
        >>> self.augment = True
        >>> self.show_sample()
    """
    import math
    import utool as ut
    from ibeis.algo.verif import vsone
    pblm = vsone.OneVsOneProblem.from_empty(dbname)

    pccs = list(pblm.infr.positive_components())
    pcc_freq = list(map(len, pccs))
    freq_grouped = ub.group_items(pccs, pcc_freq)

    # Simpler very randomized sample strategy
    train_pccs = []
    vali_pccs = []
    test_pccs = []

    # vali_frac = .1
    test_frac = .1
    vali_frac = 0

    for i, group in freq_grouped.items():
        group = ut.shuffle(group, rng=432232 + i)
        # A group with a single PCC goes entirely to training.
        n_test = 0 if len(group) == 1 else math.ceil(len(group) * test_frac)
        test, learn = group[:n_test], group[n_test:]
        n_vali = 0 if len(group) == 1 else math.ceil(len(learn) * vali_frac)
        # Split ``learn`` (not ``group``) so train, vali and test stay
        # disjoint.  Slicing ``group[-n_vali:]`` with n_vali == 0 returned
        # the whole group and leaked every test PCC into training.
        vali, train = learn[:n_vali], learn[n_vali:]
        train_pccs.extend(train)
        test_pccs.extend(test)
        vali_pccs.extend(vali)

    test_dataset = RandomBalancedIBEISSample(pblm, test_pccs, dim=dim)
    train_dataset = RandomBalancedIBEISSample(pblm, train_pccs, dim=dim)
    # Built but intentionally not returned while validation is disabled.
    vali_dataset = RandomBalancedIBEISSample(pblm, vali_pccs, dim=dim)  # NOQA
    train_dataset.augment = True

    datasets = {
        'train': train_dataset,
        # 'vali': vali_dataset,
        'test': test_dataset,
    }
    return datasets
def randomized_ibeis_dset(dbname, dim=224):
    """
    Build randomized train / vali datasets from an IBEIS database.

    The positive connected components (PCCs) are grouped by size and each
    size group is shuffled with a fixed seed and split into disjoint test,
    validation and training parts.  The test dataset is built but removed
    from the result.

    Args:
        dbname (str): name of the database to load.
        dim (int): forwarded to ``RandomBalancedIBEISSample``.

    Returns:
        dict: ``{'train': ..., 'vali': ...}`` datasets.

    CommandLine:
        xdoctest ~/code/netharn/netharn/examples/siam_ibeis.py randomized_ibeis_dset --show

    Example:
        >>> datasets = randomized_ibeis_dset('PZ_MTEST')
        >>> # xdoctest: +REQUIRES(--show)
        >>> nh.util.qtensure()
        >>> self = datasets['train']
        >>> self.show_sample()
        >>> nh.util.show_if_requested()
    """
    import math
    from ibeis.algo.verif import vsone
    pblm = vsone.OneVsOneProblem.from_empty(dbname)

    pccs = list(pblm.infr.positive_components())
    pcc_freq = list(map(len, pccs))
    freq_grouped = ub.group_items(pccs, pcc_freq)

    # Simpler very randomized sample strategy
    train_pccs = []
    vali_pccs = []
    test_pccs = []

    vali_frac = .1
    test_frac = .1

    for i, group in freq_grouped.items():
        group = nh.util.shuffle(group, rng=432232 + i)
        # A group with a single PCC goes entirely to training.
        n_test = 0 if len(group) == 1 else math.ceil(len(group) * test_frac)
        test, learn = group[:n_test], group[n_test:]
        n_vali = 0 if len(group) == 1 else math.ceil(len(learn) * vali_frac)
        # Split ``learn`` (not ``group``) so the three parts are disjoint.
        # The previous ``group[:n_vali], group[-n_vali:]`` made vali overlap
        # test and reduced train to the last ``n_vali`` items (or to the
        # whole group, test included, when n_vali == 0).
        vali, train = learn[:n_vali], learn[n_vali:]
        train_pccs.extend(train)
        test_pccs.extend(test)
        vali_pccs.extend(vali)

    test_dataset = RandomBalancedIBEISSample(pblm, test_pccs, dim=dim)
    train_dataset = RandomBalancedIBEISSample(pblm, train_pccs, dim=dim,
                                              augment=False)
    vali_dataset = RandomBalancedIBEISSample(pblm, vali_pccs, dim=dim,
                                             augment=False)

    datasets = {
        'train': train_dataset,
        'vali': vali_dataset,
        'test': test_dataset,
    }
    datasets.pop('test', None)  # dont test for now (speed consideration)
    return datasets
def main():
    """
    Pick the most expensive option of every type and print the result.

    Reads the module-level ``options`` collection; each option is indexed
    by ``'type'`` and ``'price'``.
    """
    build = {}
    by_type = ub.group_items(options, lambda x: x['type'])
    for key, candidates in by_type.items():
        print('key = {!r}'.format(key))
        # Sort ascending by price and take the last (priciest) candidate.
        ranked = sorted(candidates, key=lambda x: x['price'])
        chosen = ranked[-1]
        print('chosen = {!r}'.format(chosen))
        build[key] = chosen

    print('build = {}'.format(ub.repr2(build, nl=2)))
def rank_inventory(inventory):
    """
    Print league rankings for every evolution-family member in an inventory.

    Each inventory entry is expanded to its family, the candidates are
    grouped by name, and for every league the candidates within the CP cap
    are ranked.  Finally the sorted ranks achieved by each IV triple across
    all forms and leagues are printed.

    Args:
        inventory: iterable of objects providing ``family(...)``; the
            resulting candidates provide ``name``, ``cp`` and
            ``leage_rankings_for(...)``.
    """
    candidates = list(ub.flatten(
        list(pkmn.family(ancestors=False, node=True)) for pkmn in inventory))

    groups = ub.group_items(candidates, key=lambda p: p.name)

    leages = {
        'master': {'max_cp': float('inf')},
        'ultra': {'max_cp': 2500},
        'great': {'max_cp': 1500},
        'little': {'max_cp': 500},
    }

    max_level = 45  # for XL candy
    # max_level = 40  # normal

    all_dfs = []

    for name, group in groups.items():
        print('\n\n------------\n\n')
        print('name = {!r}'.format(name))
        for leage_name, leage_filters in leages.items():
            max_cp = leage_filters['max_cp']
            print('')
            print(' ========== ')
            print(' --- {} in {} --- '.format(name, leage_name))
            # Candidates without a known CP are always considered eligible.
            not_eligible = [
                p for p in group if p.cp is not None and p.cp > max_cp]
            eligible = [p for p in group if p.cp is None or p.cp <= max_cp]
            print('not_eligible = {!r}'.format(not_eligible))
            if not eligible:
                print('none eligable')
                continue
            representative = eligible[0]
            all_dfs.append(representative.leage_rankings_for(
                eligible, max_cp=max_cp, max_level=max_level))

    # Print out the best ranks for each set of IVS over all possible forms
    # (lets you know which ones can be transfered safely)
    iv_to_rank = ub.ddict(list)
    for df in all_dfs:
        if df is None:
            continue
        indexed = df.set_index(['iva', 'ivd', 'ivs'])
        for iv, rank in zip(indexed.index, indexed['rank']):
            iv_to_rank[iv].append(rank)

    iv_to_best_rank = ub.sorted_vals(ub.map_vals(sorted, iv_to_rank))
    print('iv_to_best_rank = {}'.format(
        ub.repr2(iv_to_best_rank, nl=1, align=':')))
def finalize_dets(ready_dets, ready_gids):
    """
    Merge the detections that belong to each image id and yield one
    ``(gid, dets)`` pair per image.

    Args:
        ready_dets: sequence of detection objects.
        ready_gids: group id for each entry of ``ready_dets``
            (same length and order).

    Yields:
        Tuple: ``(gid, dets)`` with the combined detections for ``gid``.

    NOTE(review): ``self`` is not a parameter here; it must come from an
    enclosing scope - presumably this is a helper nested in a method.
    """
    # Bucket the detections by the id at the matching position.
    gid_to_ready_dets = ub.group_items(ready_dets, ready_gids)
    for gid, dets_list in gid_to_ready_dets.items():
        if len(dets_list) == 0:
            # Defensive: a group produced by group_items is not expected to
            # be empty.
            dets = kwimage.Detections.concatenate([])
        elif len(dets_list) == 1:
            # A single detection set is passed through as-is.
            dets = dets_list[0]
        elif len(dets_list) > 1:
            # Several detection sets for one image: merge them, then
            # suppress the overlaps with the configured threshold.
            dets = kwimage.Detections.concatenate(dets_list)
            keep = dets.non_max_supression(
                thresh=self.config['nms_thresh'],
            )
            dets = dets.take(keep)
        yield (gid, dets)
def progressive_duplicates(pfiles, idx=1):
    """
    Recursively group files by progressively refined step ids.

    Files are grouped by the id returned from ``refined_to(idx)``.  A group
    is final when it has at most one member or when none of its members
    can be refined further; any other group is split again at the next
    step.

    Args:
        pfiles (list): files exposing ``refined_to`` and ``can_refine``.
        idx (int): the refinement step to group on.

    Returns:
        dict: maps a step id to the list of files that share it.
    """
    step_ids = [pfile.refined_to(idx) for pfile in ub.ProgIter(pfiles)]
    final_groups = {}
    for key, group in ub.group_items(pfiles, step_ids).items():
        is_final = len(group) <= 1 or not any(g.can_refine for g in group)
        if is_final:
            # Either unique, or ~100% a real duplicate group.
            final_groups[key] = group
        else:
            final_groups.update(progressive_duplicates(group, idx=idx + 1))
    return final_groups
def find_clique_edges(infr, label='name_label'):
    """
    Augmenting edges that would complete each of the specified cliques.
    (based on the group inferred from `label`)

    Args:
        label (str): node attribute to use as the group id to form the
            cliques.

    Returns:
        list: every node pair inside a group whose edge decision is still
        ``UNREV``.
    """
    node_to_label = infr.get_node_attrs(label)
    label_to_nodes = ub.group_items(node_to_label.keys(),
                                    node_to_label.values())
    return [
        edge
        for nodes in label_to_nodes.values()
        for edge in it.combinations(nodes, 2)
        if infr.edge_decision(edge) == UNREV
    ]
def internal_deduplicate(self):
    """
    Group ``self.all_fpaths`` by ``self.all_hashes`` and inspect the groups
    that contain more than one path.

    NOTE(review): this reads like interactive scratch code - nothing is
    returned or stored, and several expressions are evaluated only to be
    discarded.
    """
    # hash -> list of file paths sharing that hash
    hash_groups = ub.group_items(self.all_fpaths, self.all_hashes)
    # Keep only the hashes that are shared by more than one path.
    hash_groups_dup = {
        k: v for k, v in hash_groups.items() if len(v) > 1
    }
    from os.path import dirname
    # NOTE(review): hard-coded key; this raises KeyError whenever that hash
    # is not among the duplicates.
    hash_groups_dup['ef46db3751d8e999']
    for key, values in hash_groups_dup.items():
        # Stops at the first '.avi' path; the loop has no other effect.
        for v in values:
            if v.endswith('.avi'):
                break
        # NOTE(review): results are discarded, and ``basename`` is not
        # imported in this function - confirm it exists at module level.
        [basename(v) for v in values]
        [dirname(v) for v in values]
def print_graph_connections(infr, label='orig_name_label'):
    """
    Print the edges inside each labelled group and between each pair of
    groups.

    Args:
        label (str): node attribute used to group the nodes
            (default ``'orig_name_label'``).
    """
    node_to_label = infr.get_node_attrs(label)
    label_to_nodes = ub.group_items(node_to_label.keys(),
                                    node_to_label.values())

    print('CC info')
    for name, cc in label_to_nodes.items():
        print('\nname = %r' % (name, ))
        inner_edges = list(nxu.edges_between(infr.graph, cc))
        print(infr.get_edge_df_text(inner_edges))

    print('CC pair info')
    for group1, group2 in it.combinations(label_to_nodes.items(), 2):
        n1, cc1 = group1
        n2, cc2 = group2
        if n1 != n2:
            print('\nname_pair = {}-vs-{}'.format(n1, n2))
            cross_edges = list(nxu.edges_between(infr.graph, cc1, cc2))
            print(infr.get_edge_df_text(cross_edges))
def predict_proba_df(verif, edges):
    """
    Return dummy match-state probabilities for the given edges.

    Probabilities are cached in ``infr.task_probs['match_state']``; only
    edges missing from the cache get new random values, drawn with
    parameters that depend on the edge's true state.

    Args:
        edges (Iterable[Tuple]): node pairs to score.

    Returns:
        pd.DataFrame: one row per edge, indexed by ``(aid1, aid2)``, with
        one column per state.

    CommandLine:
        python -m graphid.demo DummyVerif.predict_edges

    Example:
        >>> from graphid import demo
        >>> kwargs = dict(num_pccs=40, size=2)
        >>> infr = demo.demodata_infr(**kwargs)
        >>> verif = infr.dummy_verif
        >>> edges = list(infr.graph.edges())
        >>> probs = verif.predict_proba_df(edges)
    """
    infr = verif.infr
    # Normalize every edge into the canonical form used by the cache.
    edges = [verif.infr.e_(*edge) for edge in edges]
    prob_cache = infr.task_probs['match_state']

    miss_edges = [e for e in edges if e not in prob_cache]
    if miss_edges:
        miss_truths = [verif._get_truth(edge) for edge in miss_edges]
        grouped_edges = ub.group_items(miss_edges, miss_truths)
        states = [POSTV, NEGTV, INCMP]
        # Iterate the truth groups in sorted order so the random draws are
        # deterministic.
        for key in sorted(grouped_edges.keys()):
            group = grouped_edges[key]
            num = len(group)
            probs0 = util.randn(shape=[num], rng=verif.rng, a_max=1,
                                a_min=0, **verif.dummy_params[key])
            # Just randomly assign other probs
            probs1 = verif.rng.rand(num) * (1 - probs0)
            probs2 = 1 - (probs0 + probs1)
            for edge, probs in zip(group, zip(probs0, probs1, probs2)):
                prob_cache[edge] = ub.dzip(states, probs)

    rows = list(ub.take(prob_cache, edges))
    index = util.ensure_multi_index(edges, ('aid1', 'aid2'))
    return pd.DataFrame(rows, index=index)
def _dump_measures(tb_data, out_dpath, mode=None, smoothing=0.0,
                   ignore_outliers=True):
    """
    Plot the curves stored in ``tb_data`` and save them as PNG files.

    This is its own function in case we need to modify formatting

    Three kinds of figures are written to ``out_dpath``: one multi-curve
    plot per tag (train / vali / test) containing all losses, one plot per
    tag and "special grouper" name, and one plot for every remaining key.

    Args:
        tb_data (dict): maps a measure key to a dict with ``'xdata'`` and
            ``'ydata'``.  An optional ``'meta'`` entry may provide
            ``'nice'`` (used in titles) and ``'special_groupers'``.
        out_dpath (str): directory the figures are saved into.
        mode (str | List[str] | None): ``'epoch'``, ``'iter'`` or a list of
            those.  None means both; a list re-invokes this function once
            per entry.
        smoothing (float): the curves are smoothed with an exponential
            moving average using ``alpha = 1 - smoothing``; when that
            alpha is not positive the data is used unchanged.
        ignore_outliers (bool): if True the upper y-limit is derived from
            the 5%-95% quantile range instead of the full data range.

    CommandLine:
        xdoctest -m netharn.mixins _dump_measures --out_dpath=.

    Example:
        >>> # SCRIPT
        >>> # Reread a dumped pickle file
        >>> from netharn.mixins import *  # NOQA
        >>> from netharn.mixins import _dump_monitor_tensorboard, _dump_measures
        >>> import json
        >>> from os.path import join
        >>> import ubelt as ub
        >>> try:
        >>>     import seaborn as sns
        >>>     sns.set()
        >>> except ImportError:
        >>>     pass
        >>> out_dpath = ub.expandpath('~/work/project/fit/nice/nicename/monitor/tensorboard/')
        >>> out_dpath = ub.argval('--out_dpath', default=out_dpath)
        >>> mode = ['epoch', 'iter']
        >>> fpath = join(out_dpath, 'tb_data.json')
        >>> tb_data = json.load(open(fpath, 'r'))
        >>> import kwplot
        >>> kwplot.autompl()
        >>> _dump_measures(tb_data, out_dpath,  smoothing=0)
    """
    import ubelt as ub
    from os.path import join
    import numpy as np
    import kwplot
    import matplotlib as mpl
    from kwplot.auto_backends import BackendContext

    with BackendContext('agg'):
        # TODO: Is it possible to get htop to show this process with some
        # name that distinguishes it from the dataloader workers?

        # NOTE: calling seaborn's ``set()`` here causes warnings when
        # executed as a daemon process, so it is intentionally not done.

        valid_modes = ['epoch', 'iter']
        if mode is None:
            mode = valid_modes
        if ub.iterable(mode):
            # Hack: Call with all modes
            for mode_ in mode:
                _dump_measures(tb_data, out_dpath, mode=mode_,
                               smoothing=smoothing,
                               ignore_outliers=ignore_outliers)
            return
        else:
            assert mode in valid_modes

        meta = tb_data.get('meta', {})
        nice = meta.get('nice', '?nice?')
        special_groupers = meta.get('special_groupers', ['loss'])

        fig = kwplot.figure(fnum=1)

        plot_keys = [
            key for key in tb_data
            if ('train_' + mode in key or
                'vali_' + mode in key or
                'test_' + mode in key or
                mode + '_' in key)
        ]
        # Measures whose y-axis is fixed to [0, 1]
        y01_measures = [
            '_acc', '_ap', '_mAP', '_auc', '_mcc', '_brier', '_mauc',
        ]
        # Measures whose y-axis starts at (or below) zero
        y0_measures = ['error', 'loss']

        keys = set(tb_data.keys()).intersection(set(plot_keys))

        def smooth_curve(ydata, beta):
            """
            Curve smoothing algorithm used by tensorboard
            """
            import pandas as pd
            alpha = 1.0 - beta
            if alpha <= 0:
                return ydata
            ydata_smooth = pd.Series(ydata).ewm(alpha=alpha).mean().values
            return ydata_smooth

        def inlier_ylim(ydatas):
            """
            outlier removal used by tensorboard
            """
            low, high = None, None
            for ydata in ydatas:
                q1 = 0.05
                q2 = 0.95
                low_, high_ = np.quantile(ydata, [q1, q2])

                # Extrapolate how big the entire span should be based on
                # inliers
                inner_q = q2 - q1
                inner_extent = high_ - low_
                extrap_total_extent = inner_extent / inner_q

                # amount of padding to add to either side
                missing_p1 = q1
                missing_p2 = 1 - q2
                frac1 = missing_p1 / (missing_p2 + missing_p1)
                frac2 = missing_p2 / (missing_p2 + missing_p1)
                missing_extent = extrap_total_extent - inner_extent

                pad1 = missing_extent * frac1
                pad2 = missing_extent * frac2

                low_ = low_ - pad1
                high_ = high_ + pad2

                low = low_ if low is None else min(low_, low)
                high = high_ if high is None else max(high_, high)
            return (low, high)

        # Hack values that we don't apply smoothing to
        HACK_NO_SMOOTH = ['lr', 'momentum']

        def should_smooth(key):
            """
            False when any '_'-separated part of the key is in
            HACK_NO_SMOOTH.  (Testing ``HACK_NO_SMOOTH not in parts``
            compares the whole list against each string and is always
            True.)
            """
            return not any(
                part in HACK_NO_SMOOTH for part in key.split('_'))

        def tag_grouper(k):
            # Map a key to 'train', 'vali' or 'test' for the current mode
            parts = [p + mode for p in ['train_', 'vali_', 'test_']]
            for p in parts:
                if p in k:
                    return p.split('_')[0]
            return 'unknown'

        GROUP_LOSSES = True
        GROUP_AND_INDIVIDUAL = False
        INDIVIDUAL_PLOTS = True
        GROUP_SPECIAL = True

        if GROUP_LOSSES:
            # Group all losses in one plot for comparison
            loss_keys = [k for k in keys if 'loss' in k]
            tagged_losses = ub.group_items(loss_keys, tag_grouper)
            tagged_losses.pop('unknown', None)
            kw = {}
            kw['ymin'] = 0.0
            for tag, losses in tagged_losses.items():

                min_abs_y = .01
                min_y = 0
                xydata = ub.odict()
                for key in sorted(losses):
                    ydata = tb_data[key]['ydata']

                    if should_smooth(key):
                        ydata = smooth_curve(ydata, smoothing)

                    try:
                        min_y = min(min_y, ydata.min())
                        pos_ys = ydata[ydata > 0]
                        min_abs_y = min(min_abs_y, pos_ys.min())
                    except Exception:
                        # Best effort: ydata may be a plain list or have
                        # no positive values; keep the defaults then.
                        pass

                    xydata[key] = (tb_data[key]['xdata'], ydata)

                kw['ymin'] = min_y

                if ignore_outliers:
                    _, kw['ymax'] = inlier_ylim(
                        [t[1] for t in xydata.values()])

                yscales = ['symlog', 'linear']
                for yscale in yscales:
                    fig.clf()
                    ax = fig.gca()
                    title = nice + '\n' + tag + '_' + mode + ' losses'
                    kwplot.multi_plot(xydata=xydata, ylabel='loss',
                                      xlabel=mode, yscale=yscale,
                                      title=title, fnum=1, ax=ax, **kw)
                    if yscale == 'symlog':
                        # The keyword for the linear threshold was renamed
                        # in matplotlib 3.3.
                        if LooseVersion(
                                mpl.__version__) >= LooseVersion('3.3'):
                            ax.set_yscale('symlog', linthresh=min_abs_y)
                        else:
                            ax.set_yscale('symlog', linthreshy=min_abs_y)
                    fname = '_'.join(
                        [tag, mode, 'multiloss', yscale]) + '.png'
                    fpath = join(out_dpath, fname)
                    ax.figure.savefig(fpath)

            # don't dump losses individually if we dump them in a group
            if not GROUP_AND_INDIVIDUAL:
                keys.difference_update(set(loss_keys))

        if GROUP_SPECIAL:
            tag_groups = ub.group_items(keys, tag_grouper)
            tag_groups.pop('unknown', None)
            # Group items matching these strings
            kw = {}
            for tag, tag_keys in tag_groups.items():
                for groupname in special_groupers:
                    group_keys = [
                        k for k in tag_keys if groupname in k.split('_')
                    ]
                    if len(group_keys) > 1:
                        # Gather data for this group
                        xydata = ub.odict()
                        for key in sorted(group_keys):
                            ydata = tb_data[key]['ydata']
                            if should_smooth(key):
                                ydata = smooth_curve(ydata, smoothing)
                            xydata[key] = (tb_data[key]['xdata'], ydata)

                        if ignore_outliers:
                            _, kw['ymax'] = inlier_ylim(
                                [t[1] for t in xydata.values()])

                        # Only a linear scale is produced for these groups
                        yscales = ['linear']
                        for yscale in yscales:
                            fig.clf()
                            ax = fig.gca()
                            title = (nice + '\n' + tag + '_' + mode +
                                     ' ' + groupname)
                            kwplot.multi_plot(xydata=xydata,
                                              ylabel=groupname,
                                              xlabel=mode, yscale=yscale,
                                              title=title, fnum=1, ax=ax,
                                              **kw)
                            fname = '_'.join([
                                tag, mode, 'group-' + groupname, yscale
                            ]) + '.png'
                            fpath = join(out_dpath, fname)
                            ax.figure.savefig(fpath)

                        if not GROUP_AND_INDIVIDUAL:
                            keys.difference_update(set(group_keys))

        if INDIVIDUAL_PLOTS:
            for key in keys:
                d = tb_data[key]

                ydata = d['ydata']
                ydata = smooth_curve(ydata, smoothing)

                kw = {}
                if any(m.lower() in key.lower() for m in y01_measures):
                    kw['ymin'] = 0.0
                    kw['ymax'] = 1.0
                elif any(m.lower() in key.lower() for m in y0_measures):
                    # np.min also works when ydata is still a plain list
                    # (smooth_curve returns its input for alpha <= 0).
                    kw['ymin'] = min(0.0, np.min(ydata))
                    if ignore_outliers:
                        _, kw['ymax'] = inlier_ylim([ydata])

                # NOTE: this is actually pretty slow
                fig.clf()
                ax = fig.gca()
                title = nice + '\n' + key
                kwplot.multi_plot(d['xdata'], ydata, ylabel=key,
                                  xlabel=mode, title=title, fnum=1, ax=ax,
                                  **kw)

                # png is slightly smaller than jpg for this kind of plot
                fpath = join(out_dpath, key + '.png')
                ax.figure.savefig(fpath)
def detection_confusions(true_boxes, true_cxs, true_weights, pred_boxes,
                         pred_scores, pred_cxs, bg_weight=1.0, ovthresh=0.5,
                         bg_cls=-1):
    """
    Given predictions and truth for an image return (y_pred, y_true,
    y_score), which is suitable for sklearn classification metrics

    Predictions are visited in order of decreasing score.  A prediction is
    a true positive when its best-overlapping truth box of the same class
    exceeds ``ovthresh`` and has not been claimed yet; otherwise it is a
    false positive against the background class.  Unclaimed truth boxes
    with non-zero weight become false negatives.

    Args:
        true_boxes (ndarray): boxes in tlbr format
        true_cxs (ndarray): classes of each box
        true_weights (ndarray | None): weight of each groundtruth item;
            None treats every truth box as weight 1.  Matches against
            truth with weight 0 are ignored.
        pred_boxes (ndarray): predicted boxes in tlbr format
        pred_scores (ndarray): scores for each prediction
        pred_cxs (ndarray): class predictions
        bg_weight (float): weight of predictions whose class has no truth
            boxes (default=1)
        ovthresh (float): overlap threshold
        bg_cls (int): class index used for the background, both as the
            "true" class of false positives and as the "predicted" class
            of false negatives (default=-1)

    Returns:
        dict: column name -> list, with keys ``'pred'``, ``'true'``,
        ``'score'``, ``'weight'`` and ``'cx'``; suitable for
        ``pd.DataFrame``.

    Example:
        >>> true_boxes = np.array([[ 0,  0, 10, 10],
        >>>                        [10,  0, 20, 10],
        >>>                        [10,  0, 20, 10],
        >>>                        [20,  0, 30, 10]])
        >>> true_weights = np.array([1, 0, .9, 1])
        >>> bg_weight = 1.0
        >>> true_cxs = np.array([0, 0, 1, 1])
        >>> pred_boxes = np.array([[6, 2, 20, 10],
        >>>                        [3,  2, 9, 7],
        >>>                        [20,  0, 30, 10]])
        >>> pred_scores = np.array([.5, .5, .5])
        >>> pred_cxs = np.array([0, 0, 1])
        >>> y = detection_confusions(true_boxes, true_cxs, true_weights,
        >>>                          pred_boxes, pred_scores, pred_cxs,
        >>>                          bg_weight=bg_weight, ovthresh=.5)
        >>> y = pd.DataFrame(y)
        >>> print(y)  # xdoc: +IGNORE_WANT
           cx  pred  score  true  weight
        0   1     1 0.5000     1     1.0
        1   0     0 0.5000    -1     1.0
        2   0    -1 0.0000     0     1.0
        3   1    -1 0.0000     1     0.9
    """
    y_pred = []
    y_true = []
    y_score = []
    y_weight = []
    cxs = []

    if bg_weight is None:
        bg_weight = 1.0

    # Group true boxes by class
    # Keep track which true boxes are unused / not assigned
    cx_to_idxs = ub.group_items(range(len(true_cxs)), true_cxs)
    cx_to_unused = {cx: [True] * len(idxs)
                    for cx, idxs in cx_to_idxs.items()}

    # sort predictions by score
    sortx = pred_scores.argsort()[::-1]
    pred_boxes = pred_boxes.take(sortx, axis=0)
    pred_cxs = pred_cxs.take(sortx, axis=0)
    pred_scores = pred_scores.take(sortx, axis=0)

    for cx, box, score in zip(pred_cxs, pred_boxes, pred_scores):
        cls_true_idxs = cx_to_idxs.get(cx, [])

        ovmax = -np.inf
        ovidx = None
        weight = bg_weight

        if len(cls_true_idxs):
            # Find the truth box of this class that overlaps the most; a
            # prediction inherits the weight of that truth box.
            cls_true_boxes = true_boxes.take(cls_true_idxs, axis=0)
            ovmax, ovidx = iou_overlap(cls_true_boxes, box)
            if true_weights is None:
                weight = 1.0
            else:
                true_idx = cls_true_idxs[ovidx]
                weight = true_weights[true_idx]
            unused = cx_to_unused[cx]

        # ``unused`` is only read when ovmax was updated above, because the
        # initial -inf can never exceed the threshold.
        if ovmax > ovthresh and unused[ovidx]:
            # Mark this prediction as a true positive
            if weight > 0:
                # Ignore matches to truth with weight 0 (difficult cases)
                y_pred.append(cx)
                y_true.append(cx)
                y_score.append(score)
                y_weight.append(weight)
                cxs.append(cx)
                unused[ovidx] = False
        else:
            # Mark this prediction as a false positive
            y_pred.append(cx)
            y_true.append(bg_cls)  # the background / ignore class
            y_score.append(score)
            y_weight.append(weight)
            cxs.append(cx)

    # Mark true boxes we failed to predict as false negatives
    for cx, unused in cx_to_unused.items():
        for ovidx, flag in enumerate(unused):
            if flag:
                if true_weights is None:
                    weight = 1.0
                else:
                    cls_true_idxs = cx_to_idxs.get(cx, [])
                    true_idx = cls_true_idxs[ovidx]
                    weight = true_weights[true_idx]
                # if it has a nonzero weight
                if weight > 0:
                    # Mark this truth as a false negative.  Use ``bg_cls``
                    # (not a hard-coded -1) so the background label agrees
                    # with the one used for false positives.
                    y_pred.append(bg_cls)
                    y_true.append(cx)
                    y_score.append(0.0)
                    y_weight.append(weight)
                    cxs.append(cx)

    y = {
        'pred': y_pred,
        'true': y_true,
        'score': y_score,
        'weight': y_weight,
        'cx': cxs,
    }
    return y
def autogen_imports(fpath_or_text):
    """
    Generate import statements for python code

    Args:
        fpath_or_text (str): either a path to an existing file or the source
            text itself. If the value names an existing path it is passed to
            xinspect as ``fpath``, otherwise as ``source``.

    Returns:
        str: newline separated import lines. Lines that do not start with
            ``from `` come first (sorted), followed by the sorted
            ``from ...`` lines.

    Example:
        >>> import vimtk
        >>> source = ub.codeblock(
            '''
            math
            it
            ''')
        >>> text = vimtk.autogen_imports(source)
        >>> print(text)
        import itertools as it
        import math
    """
    try:
        import xinspect
    except Exception:
        # Report which python environment is active before re-raising
        print('UNABLE TO IMPORT XINSPECT')
        print('sys.prefix = {!r}'.format(sys.prefix))
        raise
    from os.path import exists
    from xinspect.autogen import Importables

    importable = Importables()
    importable._use_recommended_defaults()

    # Aliases that are always recognized
    importable.known.update({
        'it': 'import itertools as it',
        'nh': 'import netharn as nh',
        'np': 'import numpy as np',
        'pd': 'import pandas as pd',
        'ub': 'import ubelt as ub',
        'nx': 'import networkx as nx',
        'Image': 'from PIL import Image',
        'mpl': 'import matplotlib as mpl',
        'nn': 'from torch import nn',
        'torch_data': 'import torch.utils.data as torch_data',
        'F': 'import torch.nn.functional as F',
        'math': 'import math',
    })

    # Layer the user configured aliases on top; failures are only logged
    user_importable = None
    try:
        user_importable = CONFIG.get('vimtk_auto_importable_modules')
        importable.known.update(user_importable)
    except Exception as ex:
        logger.info('ex = {!r}'.format(ex))
        logger.info('ERROR user_importable = {!r}'.format(user_importable))

    autogen_kw = {'importable': importable}
    source_key = 'fpath' if exists(fpath_or_text) else 'source'
    autogen_kw[source_key] = fpath_or_text

    lines = xinspect.autogen_imports(**autogen_kw)

    # Partition into plain ``import`` lines and ``from`` lines
    is_from_flags = [line.startswith('from ') for line in lines]
    flag_to_lines = ub.group_items(lines, is_from_flags)

    plain_lines = sorted(flag_to_lines.get(False, []))
    from_lines = sorted(flag_to_lines.get(True, []))
    import_block = '\n'.join(plain_lines + from_lines)
    return import_block
def main():
    """
    Run password security analysis

    Builds a table with one row per (device, password scheme, adversary
    scale, hashmode benchmark) combination containing the time, energy and
    dollar cost of exhausting the scheme's state space. The table is then
    restricted to a single chosen device and printed as several pivot tables.
    If ``--show`` is given on the command line the same data is rendered as
    heatmaps (``--latex`` enables latex text rendering and ``--save`` writes
    the figures to png files in the current directory).

    Example:
        >>> import sys, ubelt
        >>> sys.path.append(ubelt.expandpath('~/misc/notes'))
        >>> from password_model import *  # NOQA
        >>> main()
    """
    import itertools as it
    from fractions import Fraction
    import pandas as pd

    # Build our adversary and our strategies
    devices, scales = build_threat_models()
    password_schemes = build_password_strategy()

    # Other estimates or assumptions
    estimates = {
        # estimated cost of using a kilowatt for an hour
        # http://www.wrecc.com/what-uses-watts-in-your-home/
        # https://www.coinwarz.com/mining/ethereum/calculator
        'dollars_per_kwh': 0.10,
    }

    # Exact rational arithmetic is used for every derived quantity
    rows = []
    for device, scheme, scale in it.product(devices, password_schemes, scales):
        for benchmark in device['benchmarks']:
            states = Fraction(scheme['states'])
            num_devices = Fraction(scale['num_devices'])
            dollars_per_kwh = Fraction(estimates['dollars_per_kwh'])

            hashmode_attempts_per_second = benchmark['attempts_per_second']
            attempts_per_second = num_devices * Fraction(
                int(hashmode_attempts_per_second))

            seconds = states / Fraction(attempts_per_second)
            hours = seconds / Fraction(3600)
            device_kilowatts = Fraction(device['watts']) / Fraction(1000)
            device_dollars_per_hour = device_kilowatts * dollars_per_kwh
            dollars_per_device = device_dollars_per_hour * hours
            dollars = dollars_per_device * num_devices

            total_kilowatts = device_kilowatts * num_devices * hours

            row = {
                'scheme': scheme['name'],
                'entropy': scheme['entropy'],
                'hashmode': benchmark['hashmode'],
                'hashmode_attempts_per_second': int(hashmode_attempts_per_second),
                'device': device['name'],
                'scale': scale['name'],
                'num_devices': scale['num_devices'],
                'seconds': seconds,
                'dollars': dollars,
                'kilowatts': total_kilowatts,
                'hours': hours,
                'dollars_per_kwh': estimates['dollars_per_kwh'],
            }
            rows.append(row)

    df = pd.DataFrame(rows)
    df = df.sort_values('entropy')

    chosen_device = 'RTX_3090'
    df = df[df['device'] == chosen_device]

    df['time'] = df['seconds'].apply(humanize_seconds)
    df['cost'] = df['dollars'].apply(partial(humanize_dollars, colored=1))
    df['entropy'] = df['entropy'].round(2)
    df['num_devices'] = df['num_devices'].apply(int)

    # Take the hashmodes from the device the table was filtered to, not from
    # whichever device the row-building loop happened to visit last.
    device_info = ub.group_items(devices, lambda x: x['name'])[chosen_device][0]
    hashmodes = sorted([d['hashmode'] for d in device_info['benchmarks']])

    # https://github.com/pandas-dev/pandas/issues/18066
    monkeypatch_pandas_colored_stdout()

    # Output our assumptions
    print('\n---')
    print('Assumptions:')
    print('estimates = {!r}'.format(estimates))
    print('device_info = {}'.format(ub.repr2(device_info, nl=2)))

    # For each hashmode, print the scheme-vs-num_devices-vs-time matrix
    # NOTE: pivot arguments are passed by keyword because pandas >= 2.0 no
    # longer accepts them positionally.
    hashmode_to_pivots = {}
    for hashmode in hashmodes:
        subdf = df
        subdf = subdf[subdf['hashmode'] == hashmode]
        subdf = subdf.sort_values(['entropy', 'num_devices'])
        piv = subdf.pivot(index=['entropy', 'cost', 'scheme'],
                          columns=['num_devices', 'scale'],
                          values='time')
        hashmode_to_pivots[hashmode] = piv

    for hashmode in hashmodes:
        print('\n---')
        print('hashmode = {!r}'.format(hashmode))
        piv = hashmode_to_pivots[hashmode]
        print(piv)

    # Print the scheme-vs-hashmode-vs-cost matrix
    print('\n---')
    print('Cost Matrix:')
    # Cost does not depend on the scale, so any single scale suffices
    subdf = df[df['scale'] == df['scale'].iloc[0]]
    piv = subdf.pivot(index=['entropy', 'scheme'],
                      columns=['hashmode_attempts_per_second', 'hashmode'],
                      values='cost')
    piv = piv.sort_index(axis=1, ascending=False)
    # The attempts-per-second level was only needed to order the columns
    piv.columns = piv.columns.droplevel(0)
    print(piv)

    # Make the visualizations
    if ub.argflag('--show'):
        import kwplot
        from matplotlib.colors import LogNorm
        import matplotlib as mpl
        plt = kwplot.autoplt()
        sns = kwplot.autosns()

        use_latex = ub.argflag('--latex')
        if use_latex:
            mpl.rcParams['text.usetex'] = True

        def time_labelize(x):
            # Two line label: the leading token on top, the rest below
            text = humanize_seconds(x, colored=False, named=True, precision=2)
            parts = text.split(' ')
            if use_latex:
                text = r'{\huge ' + parts[0] + '}' + '\n' + ' '.join(parts[1:])
            else:
                text = parts[0] + '\n' + ' '.join(parts[1:])
            return text

        def dollar_labelize(dollars):
            cost = humanize_dollars(dollars, named=(dollars > 1))
            if use_latex:
                # A bare dollar sign would start latex math mode
                cost = cost.replace('$', r'\$')
            return cost

        hashmode_to_notes = {}
        for dev in devices[0]['benchmarks']:
            hashmode_to_notes[dev['hashmode']] = dev['notes']

        if 1:
            # Independent of the adversary scale we can plot cost versus scheme
            # cost vs hashmod?
            subdf = df[df['scale'] == df['scale'].iloc[0]]
            piv = subdf.pivot(
                index=['entropy', 'scheme'],
                columns=['hashmode_attempts_per_second', 'hashmode'],
                values='dollars')
            piv = piv.sort_index(axis=1, ascending=False)

            # https://stackoverflow.com/questions/64234474/cust-y-lbls-seaborn
            ax: mpl.axes.Axes = plt.subplots(figsize=(15, 10))[1]

            annot = piv.applymap(dollar_labelize)
            # Convert the Fraction values to floats for the heatmap
            piv = piv.applymap(float)

            sns.heatmap(piv, annot=annot, ax=ax, fmt='s',
                        norm=LogNorm(vmin=1, vmax=100_000_000_000_000_000),
                        annot_kws={'size': 16},
                        cmap='cividis',
                        cbar_kws={
                            'label': 'dollars',
                            'pad': 0.001
                        })

            # Find colorbar
            for subax in ax.figure.axes:
                if subax.get_label() == '<colorbar>':
                    subax.set_ylabel('dollars', labelpad=0)
                    break

            new_ytick_labels = []
            for ent, scheme in piv.index.to_list():
                if use_latex:
                    scheme = r'{\LARGE ' + scheme + '}'
                _ = '{scheme}\nEntropy={ent}bits'.format(scheme=scheme, ent=ent)
                new_ytick_labels.append(_)

            new_xtick_labels = []
            for _, hashmode in piv.columns.to_list():
                notes = ''
                if hashmode in hashmode_to_notes:
                    notes = '\n(' + hashmode_to_notes[hashmode] + ')'
                new_xtick_labels.append(hashmode + notes)

            ax.set_xticklabels(new_xtick_labels, rotation=0)
            ax.set_yticklabels(new_ytick_labels, rotation=0)

            ax.set_ylabel('Password Scheme, Entropy', labelpad=24)
            ax.set_xlabel('Hashmode', labelpad=16)

            if use_latex:
                title = '{{\\Huge Password Cost Security}}'
                ax.set_title(title)
            else:
                ax.set_title('Password Cost Security')

            ax.figure.subplots_adjust(bottom=0.1, left=0.20, right=1.0,
                                      top=0.90, wspace=0.001)

            if ub.argflag('--save'):
                fname = 'passwd_cost_security.png'
                ax.figure.savefig(fname)

        if 1:
            # For each hashmode plot (scheme versus adversary scale)
            for hashmode in ub.ProgIter(hashmodes, desc='plotting'):
                subdf = df
                subdf = subdf[subdf['hashmode'] == hashmode]
                subdf = subdf.sort_values(['entropy', 'num_devices'])

                piv = subdf.pivot(index=['entropy', 'dollars', 'scheme'],
                                  columns=['num_devices', 'scale'],
                                  values='seconds')
                piv = piv.applymap(float)

                # https://stackoverflow.com/questions/64234474/cust-y-lbls-seaborn
                ax: mpl.axes.Axes = plt.subplots(figsize=(15, 10))[1]

                annot = piv.applymap(time_labelize)
                sns.heatmap(piv, annot=annot, ax=ax, fmt='s',
                            norm=LogNorm(vmin=1, vmax=8640000000),
                            annot_kws={'size': 10},
                            cbar_kws={
                                'label': 'seconds',
                                'pad': 0.001
                            })

                # Find colorbar
                for subax in ax.figure.axes:
                    if subax.get_label() == '<colorbar>':
                        subax.set_ylabel('seconds', labelpad=0)
                        break

                new_ytick_labels = []
                for ent, dollars, scheme in piv.index.to_list():
                    cost = dollar_labelize(dollars)
                    if use_latex:
                        scheme = r'{\LARGE ' + scheme + '}'
                    _ = '{scheme}\nEntropy={ent}bits\nCost={cost}'.format(
                        scheme=scheme, cost=cost, ent=ent)
                    new_ytick_labels.append(_)

                new_xtick_labels = []
                for n, name in piv.columns.to_list():
                    if use_latex:
                        name = r'{\LARGE ' + name + '}'
                    _ = name + '\n' + named_large_number(n, precision=0) + ' GPUs'
                    new_xtick_labels.append(_)

                ax.set_xticklabels(new_xtick_labels, rotation=0)
                ax.set_yticklabels(new_ytick_labels)

                ax.set_ylabel('Password Scheme, Entropy, and Cost to Crack',
                              labelpad=24)
                ax.set_xlabel('Adversary Resources', labelpad=16)

                notes = ''
                if hashmode in hashmode_to_notes:
                    notes = ' (' + hashmode_to_notes[hashmode] + ')'

                if use_latex:
                    title = '{{\\Huge Password Time Security}}\nhashmode={}{}'.format(
                        hashmode, notes)
                    ax.set_title(title)
                else:
                    ax.set_title(
                        'Password Time Security\n(hashmode={}{})'.format(
                            hashmode, notes))

                ax.figure.subplots_adjust(bottom=0.1, left=0.20, right=1.0,
                                          top=0.90, wspace=0.001)

                if ub.argflag('--save'):
                    fname = 'passwd_robustness_{}.png'.format(hashmode)
                    ax.figure.savefig(fname)

        plt.show()
def draw_points(xy, color='blue', class_idxs=None, classes=None, ax=None,
                alpha=None, radius=1, **kwargs):
    """
    Draw each point as a filled circle patch on a matplotlib axes.

    Points that share the same rgba color are batched into one
    ``PatchCollection``.

    Args:
        xy (ndarray): of points. Reshaped to (-1, 2) before drawing.
        color (str | object): ``'distinct'`` gives every point its own color,
            ``'classes'`` colors each point by ``class_idxs``, and any other
            value is passed to ``kwimage.Color`` and used for all points.
        class_idxs (Sequence[int] | None): per-point index into ``classes``.
            Required when ``color='classes'``.
        classes (Sequence | None): class list; only its length is used here.
            Required when ``color='classes'``.
        ax (matplotlib.axes.Axes | None): axes to draw on. Defaults to the
            current axes.
        alpha (float | Sequence[float] | None): a single alpha for all points
            or one alpha per point. Defaults to 1.0 for every point.
        radius (float): radius of each circle.
        **kwargs: extra keyword arguments forwarded to
            ``matplotlib.patches.Circle``. If ``fc`` is given it overrides
            the face color derived from ``color`` (and a warning is issued).

    Returns:
        List[matplotlib.collections.PatchCollection]: the collections that
            were added to the axes, one per distinct color.

    Raises:
        Exception: if ``color='classes'`` but ``class_idxs`` or ``classes``
            is not given.

    Example:
        >>> from kwplot.mpl_draw import *  # NOQA
        >>> import kwimage
        >>> xy = kwimage.Points.random(10).xy
        >>> draw_points(xy, radius=0.01)
        >>> draw_points(xy, class_idxs=np.random.randint(0, 3, 10),
        >>>         radius=0.01, classes=['a', 'b', 'c'], color='classes')

    Ignore:
        >>> import kwplot
        >>> kwplot.autompl()
    """
    import kwimage
    import matplotlib as mpl
    from matplotlib import pyplot as plt
    if ax is None:
        ax = plt.gca()

    xy = xy.reshape(-1, 2)

    # More grouped patches == more efficient runtime
    if alpha is None:
        alpha = [1.0] * len(xy)
    elif not ub.iterable(alpha):
        alpha = [alpha] * len(xy)

    if color == 'distinct':
        colors = kwimage.Color.distinct(len(alpha))
    elif color == 'classes':
        # TODO: read colors from categories if they exist
        if class_idxs is None or classes is None:
            raise Exception(
                'cannot draw class colors without class_idxs and classes')
        try:
            cls_colors = kwimage.Color.distinct(len(classes))
        except KeyError:
            raise Exception(
                'cannot draw class colors without class_idxs and classes')
        colors = list(ub.take(cls_colors, class_idxs))
    else:
        colors = [color] * len(alpha)

    ptcolors = [
        kwimage.Color(c, alpha=a).as01('rgba')
        for c, a in zip(colors, alpha)
    ]
    # Map each distinct rgba tuple to the indices of the points using it
    color_groups = ub.group_items(range(len(ptcolors)), ptcolors)

    circlekw = {
        'radius': radius,
        'fill': True,
        'ec': None,
    }
    if 'fc' in kwargs:
        import warnings
        warnings.warn('Warning: specifying fc to Points.draw overrides '
                      'the color argument. Use color instead')
    circlekw.update(kwargs)
    # fc is passed to Circle explicitly below, so it must not also be in
    # circlekw
    fc = circlekw.pop('fc', None)  # hack

    collections = []
    for pcolor, idxs in color_groups.items():
        # hack for fc
        if fc is not None:
            pcolor = fc

        patches = [
            mpl.patches.Circle((x, y), fc=pcolor, **circlekw)
            for x, y in xy[idxs]
        ]
        col = mpl.collections.PatchCollection(patches, match_original=True)
        collections.append(col)
        ax.add_collection(col)
    return collections
def draw_boxes(boxes, alpha=None, color='blue', labels=None, centers=False,
               fill=False, ax=None, lw=2):
    """
    Draw boxes as rectangle patches on a matplotlib axes.

    Boxes that end up with the same rgba edge color are batched into a single
    ``PatchCollection``. Nothing is returned; the axes is modified in place.

    Args:
        boxes (kwimage.Boxes): boxes to draw
        alpha (List[float] | float | None): alpha for each box, or a single
            alpha applied to every box. Defaults to 1.0.
        color (str | object): color passed to ``kwplot.Color`` for the edges
        labels (List[str]): of labels, drawn at the top-left corner of the
            corresponding box
        centers (bool | dict): draw centers or not. A dict is treated as
            extra keyword arguments for the center circles.
        fill (bool): if True the face color equals the edge color, otherwise
            the face is fully transparent
        ax (matplotlib.axes.Axes | None): axes to draw on. Defaults to the
            current axes.
        lw (float): linewidth

    Example:
        >>> import kwimage
        >>> bboxes = kwimage.Boxes([[.1, .1, .6, .3], [.3, .5, .5, .6]], 'xywh')
        >>> draw_boxes(bboxes)
        >>> #kwplot.autompl()
    """
    import kwplot
    import matplotlib as mpl
    from matplotlib import pyplot as plt

    if ax is None:
        ax = plt.gca()

    xywh = boxes.to_xywh().data
    transparent = kwplot.Color((0, 0, 0, 0)).as01('rgba')

    # Broadcast alpha so there is one value per box
    num_boxes = len(xywh)
    if alpha is None:
        alpha = [1.0] * num_boxes
    elif not ub.iterable(alpha):
        alpha = [alpha] * num_boxes

    # Fewer, larger patch collections render more efficiently
    edgecolors = [kwplot.Color(color, alpha=a).as01('rgba') for a in alpha]
    color_groups = ub.group_items(range(len(edgecolors)), edgecolors)

    for edgecolor, idxs in color_groups.items():
        facecolor = edgecolor if fill else transparent
        rectkw = {
            'ec': edgecolor,
            'fc': facecolor,
            'lw': lw,
            'linestyle': 'solid',
        }
        rects = []
        for x, y, w, h in xywh[idxs]:
            rects.append(mpl.patches.Rectangle((x, y), w, h, **rectkw))
        rect_col = mpl.collections.PatchCollection(rects,
                                                   match_original=True)
        ax.add_collection(rect_col)

    if centers not in [None, False]:
        centerkw = {
            # 'radius': 1,
            'fill': True
        }
        if isinstance(centers, dict):
            centerkw.update(centers)
        xy_centers = boxes.xy_center
        for fcolor, idxs in color_groups.items():
            # TODO: radius based on size of bbox
            circles = []
            for x, y in xy_centers[idxs]:
                circles.append(
                    mpl.patches.Circle((x, y), ec=None, fc=fcolor,
                                       **centerkw))
            circle_col = mpl.collections.PatchCollection(
                circles, match_original=True)
            ax.add_collection(circle_col)

    if labels:
        font = mpl.font_manager.FontProperties(size=6, family='monospace')
        textkw = {
            'horizontalalignment': 'left',
            'verticalalignment': 'top',
            'backgroundcolor': (0, 0, 0, .8),
            'color': 'white',
            'fontproperties': font,
        }
        # Collect every placement first, then draw
        placements = [
            (x1, y1, label)
            for (x1, y1, _w, _h), label in zip(xywh, labels)
        ]
        for x1, y1, catname in placements:
            ax.text(x1, y1, catname, **textkw)
def main():
    """
    Exploratory driver that compares two file inventories rooted at
    hard-coded paths.

    The body is a sequence of loosely related experiments: it finds likely
    overlaps between the two inventories, builds a graph of directories that
    share likely-duplicate files, and groups files by hash and by basename.
    Several expressions are evaluated and their result discarded (e.g.
    ``len(maybe_dups)``).

    NOTE(review): this reads like interactive scratch code rather than
    something meant to run top to bottom -- confirm before relying on it.
    """
    # TODO: progressive hashing data structure
    inv1 = Inventory('/media/joncrall/raid/', blocklist)
    inv2 = Inventory('/media/joncrall/media', blocklist)

    # inv1 = Inventory('/media/joncrall/raid/Applications/NotGames', blocklist)
    # inv2 = Inventory('/media/joncrall/media/Applications/NotGames', blocklist)

    # inv1 = Inventory('/media/joncrall/raid/Applications', blocklist)
    # inv2 = Inventory('/media/joncrall/media/Applications', blocklist)

    self = inv1  # NOQA

    inv1.build()
    inv2.build()

    thresh = {
        'frac': 0.5,
        'byte': 100 * int(2**20)  # only use the first few mb to determine overlap
    }
    verbose = 1
    pfiles1 = inv1.pfiles
    pfiles2 = inv2.pfiles
    overlap, only1, only2 = ProgressiveFile.likely_overlaps(pfiles1, pfiles2, thresh=thresh, verbose=verbose)

    # Summarize how many groups fall in each partition
    stats = {
        'overlap': len(overlap),
        'only1': len(only1),
        'only2': len(only2),
    }
    print('stats = {}'.format(ub.repr2(stats, nl=1)))

    # Sorted paths of every file that was only found in the second inventory
    only2_list = sorted([p.fpath for group in only2.values() for p in group])
    print('only2_list = {}'.format(ub.repr2(only2_list, nl=1)))
    print('stats = {}'.format(ub.repr2(stats, nl=1)))

    # for pfile in inv1.pfiles:
    #     pfile._check_integrity()

    import numpy as np
    # Second item of the last ``_parts`` entry of each file, divided by 2**20
    # presumably the number of bytes read so far -- TODO confirm
    mb_read = np.array([
        pfile._parts[-1][1] / int(2**20)
        for pfile in ub.ProgIter(inv2.pfiles)
    ])
    mb_read.max()
    mb_read.min()

    # Build all hashes up to a reasonable degree
    inv1.build_hashes(max_workers=0)

    maybe_dups = inv1.likely_duplicates(thresh=0.2)
    len(maybe_dups)

    # Order the groups by the third component of their key
    maybe_dups = ub.sorted_keys(maybe_dups, key=lambda x: x[2])

    import networkx as nx
    import itertools as it
    # Check which directories are most likely to be duplicates
    graph = nx.Graph()

    for key, group in ub.ProgIter(maybe_dups.items(), total=len(maybe_dups), desc='build dup dir graph'):
        # Skip groups whose key starts with an empty string
        if key[0] == '':
            continue
        dpaths = [dirname(pfile.fpath) for pfile in group]
        # Each pair of directories in the group gets an edge whose 'dups'
        # attribute counts how many times the pair was seen
        for d1, d2 in it.combinations(dpaths, 2):
            graph.add_edge(d1, d2)
            edge = graph.edges[(d1, d2)]
            if 'dups' not in edge:
                edge['dups'] = 0
            edge['dups'] += 1

    edge_data = list(graph.edges(data=True))

    # Record how many entries each directory contains on disk
    for dpath in ub.ProgIter(graph.nodes, desc='find lens'):
        num_children = len(os.listdir(dpath))
        graph.nodes[dpath]['num_children'] = num_children

    for d1, d2, dat in edge_data:
        nc1 = graph.nodes[d1]['num_children']
        nc2 = graph.nodes[d2]['num_children']
        ndups = dat['dups']
        # Shared-duplicate count relative to the smaller of the directories
        dup_score = (dat['dups'] / min(nc1, nc2))
        dat['dup_score'] = dup_score
        if dup_score > 0.9:
            print('dup_score = {!r}'.format(dup_score))
            print('d1 = {!r}'.format(d1))
            print('d2 = {!r}'.format(d2))
            print('nc1 = {!r}'.format(nc1))
            print('nc2 = {!r}'.format(nc2))
            print('ndups = {!r}'.format(ndups))

    print('edge_data = {}'.format(ub.repr2(edge_data, nl=2)))

    print('maybe_dups = {}'.format(ub.repr2(maybe_dups.keys(), nl=3)))
    for key, group in maybe_dups.items():
        if key[0] == '':
            continue
        print('key = {!r}'.format(key))
        print('group = {}'.format(ub.repr2(group, nl=1)))
        # Refine every file in the group with no upper limit
        for pfile in group:
            pfile.refined_to(float('inf'))
        print('key = {!r}'.format(key))

    inv2.build_hashes(max_workers=6, mode='thread')

    # Drop entries whose path no longer exists on disk
    inv1.pfiles = [
        p for p in ub.ProgIter(inv1.pfiles, desc='exist check')
        if exists(p.fpath)
    ]

    inv2.pfiles = [
        p for p in ub.ProgIter(inv2.pfiles, desc='exist check')
        if exists(p.fpath)
    ]

    pfiles1 = inv1.pfiles
    pfiles2 = inv2.pfiles

    def compute_likely_overlaps(pfiles1, pfiles2):
        # Group both sets at the same step index so their keys can be
        # compared directly
        step_idx1 = ProgressiveFile.compatible_step_idx(pfiles1)
        step_idx2 = ProgressiveFile.compatible_step_idx(pfiles2)
        step_idx = min(step_idx1, step_idx2)
        grouped1 = ProgressiveFile.group_pfiles(pfiles1, step_idx=step_idx)
        grouped2 = ProgressiveFile.group_pfiles(pfiles2, step_idx=step_idx)

        thresh = 0.2
        verbose = 1

        # TODO: it would be nice if we didn't have to care about internal
        # deduplication when we attempt to find cross-set overlaps
        dups1 = ProgressiveFile.likely_duplicates(inv1.pfiles, thresh=thresh, verbose=verbose)
        dups2 = ProgressiveFile.likely_duplicates(inv2.pfiles, thresh=thresh, verbose=verbose)

        pfiles = inv1.pfiles + inv2.pfiles
        dups3 = ProgressiveFile.likely_duplicates(pfiles, thresh=thresh, verbose=verbose)

        # Keep the groups in which no member path starts with inv1's root
        only_on_inv2 = {}
        for key, group in dups3.items():
            if not any(
                    item.fpath.startswith(inv1.root_fpath) for item in group):
                only_on_inv2[key] = group

        # Stop on one specific file in each inventory; after the loops p1 and
        # p2 refer to it (or to the last file if it was never found)
        for p1 in inv1.pfiles:
            if 'Chase HQ 2 (JUE) [!].zip' in p1.fpath:
                break

        for p2 in inv2.pfiles:
            if 'Chase HQ 2 (JUE) [!].zip' in p2.fpath:
                break

        look = list(ub.flatten(only_on_inv2.values()))
        takealook = sorted([p.fpath for p in look])
        print('takealook = {}'.format(ub.repr2(takealook, nl=1)))

        # Group keys present in the second set but absent from the first
        keys1 = set(grouped1)
        keys2 = set(grouped2)

        missing_keys2 = keys2 - keys1
        missing_groups2 = ub.dict_subset(grouped2, missing_keys2)

        missing_fpaths2 = []
        for key, values in missing_groups2.items():
            print('key = {!r}'.format(key))
            print('values = {}'.format(ub.repr2(values, nl=1)))
            missing_fpaths2.extend(values)

        missing_fpaths2 = sorted([p.fpath for p in missing_fpaths2])
        print('missing_fpaths2 = {}'.format(ub.repr2(missing_fpaths2, nl=1)))
        # pass

        import xdev
        set_overlaps = xdev.set_overlaps(keys1, keys2)
        print('set_overlaps = {}'.format(ub.repr2(set_overlaps, nl=1)))
        # We want to know what files in set2 do not exist in set1

    # Disabled experiment comparing two specific files
    if 0:
        fpath = inv1.all_fpaths[0]
        pfile = ProgressiveFile(fpath)

        fpath1 = '/media/joncrall/raid/unsorted/yet-another-backup/card-usb-drive/Transfer/Zebras/DownloadedLibraries/lightspeed/solve_triu.m'
        fpath2 = '/media/joncrall/raid/unsorted/yet-another-backup/card-usb-drive/Zebras/downloaded_libraries/lightspeed/solve_triu.m'

        fpath1 = '/media/joncrall/raid/Applications/Wii/WiiHacksAndStuff/CurrentHacks/Falco/DarkFalco02.pcs'
        fpath2 = '/media/joncrall/raid/Applications/Wii/WiiHacksAndStuff/CurrentHacks/Ivysaur/Kraid-v2-Ivy.pcs'

        pfile = pfile1 = ProgressiveFile(fpath1)
        pfile2 = ProgressiveFile(fpath2)

        pfile.maybe_equal(pfile2, thresh=0.1)

    # Take an evenly strided sample of the first inventory's paths
    fpath_demodata = inv1.all_fpaths[::len(inv1.all_fpaths) // 500]
    # fpaths = hash_groups1_dup['ef46db3751d8e999']
    pfiles_demodata = [ProgressiveFile(f) for f in fpath_demodata]

    def progressive_duplicates(pfiles, idx=1):
        # Refine every file to step ``idx`` and group by the resulting id.
        # Groups that can still be refined are recursively split with
        # ``idx + 1``; the rest are kept as they are.
        step_ids = [pfile.refined_to(idx) for pfile in ub.ProgIter(pfiles)]
        final_groups = {}
        grouped = ub.group_items(pfiles, step_ids)
        for key, group in grouped.items():
            if len(group) > 1:
                if all(not g.can_refine for g in group):
                    # Group is ~100% a real duplicate
                    final_groups[key] = group
                else:
                    pfiles = group
                    deduped = progressive_duplicates(pfiles, idx=idx + 1)
                    final_groups.update(deduped)
            else:
                final_groups[key] = group
        return final_groups

    pfiles = pfiles_demodata
    final_groups = progressive_duplicates(pfiles)
    # Only print groups that contain more than one file
    for key, group in final_groups.items():
        if len(group) > 1:
            print('key = {!r}'.format(key))
            print('group = {}'.format(ub.repr2(group, nl=1)))

    inv1.build_hashes()
    inv2.build_hashes()

    hash_groups1 = ub.group_items(inv1.all_fpaths, inv1.all_hashes)
    hash_groups2 = ub.group_items(inv2.all_fpaths, inv2.all_hashes)

    # Hashes shared by more than one path within a single inventory
    hash_groups1_dup = {
        k: v for k, v in hash_groups1.items() if len(v) > 1
    }
    hash_groups2_dup = {
        k: v for k, v in hash_groups2.items() if len(v) > 1
    }
    len(hash_groups1_dup)
    len(hash_groups2_dup)

    # common = set(hash_groups1) & set(hash_groups2)
    # xdev.set_overlaps(hash_groups1, hash_groups2)

    # Basenames that occur in the second inventory but not in the first
    fnames1 = ub.group_items(inv1.all_fpaths, key=basename)
    fnames2 = ub.group_items(inv2.all_fpaths, key=basename)

    missing = ub.dict_diff(fnames2, fnames1)
    sorted(ub.flatten(missing.values()))
    len(missing)

    fpath_demodata = inv1.all_fpaths[::len(inv1.all_fpaths) // 500]

    def internal_deduplicate(self):
        # Group this inventory's paths by hash and keep the groups with more
        # than one path. Nothing is returned.
        hash_groups = ub.group_items(self.all_fpaths, self.all_hashes)
        hash_groups_dup = {
            k: v for k, v in hash_groups.items() if len(v) > 1
        }

        from os.path import dirname

        hash_groups_dup['ef46db3751d8e999']

        # NOTE(review): the break only leaves the inner loop, so ``values``
        # ends up as the last group regardless -- confirm the intent
        for key, values in hash_groups_dup.items():
            for v in values:
                if v.endswith('.avi'):
                    break

        [basename(v) for v in values]
        [dirname(v) for v in values]