def check_relationships(branches): ancestors = {b: set() for b in branches} length = len(branches) * (len(branches) - 1) for b1, b2 in ub.ProgIter(it.combinations(branches, 2), length=length): ret = ub.cmd('git merge-base --is-ancestor {} {}'.format(b1, b2))['ret'] if ret == 0: ancestors[b1].add(b2) ret = ub.cmd('git merge-base --is-ancestor {} {}'.format(b2, b1))['ret'] if ret == 0: ancestors[b2].add(b1) print('<key> is an ancestor of <value>') print(ub.repr2(ancestors)) descendants = {b: set() for b in branches} for key, others in ancestors.items(): for o in others: descendants[o].add(key) print('<key> descends from <value>') print(ub.repr2(descendants)) import plottool as pt import networkx as nx G = nx.DiGraph() G.add_nodes_from(branches) for key, others in ancestors.items(): for o in others: # G.add_edge(key, o) G.add_edge(o, key) from networkx.algorithms.connectivity.edge_augmentation import collapse flag = True G2 = G while flag: flag = False for u, v in list(G2.edges()): if G2.has_edge(v, u): G2 = collapse(G2, [[u, v]]) node_relabel = ub.ddict(list) for old, new in G2.graph['mapping'].items(): node_relabel[new].append(old) G2 = nx.relabel_nodes(G2, {k: '\n'.join(v) for k, v in node_relabel.items()}) flag = True break G3 = nx.transitive_reduction(G2) pt.show_nx(G3, arrow_width=1.5, prog='dot', layoutkw=dict(prog='dot')) pt.zoom_factory() pt.pan_factory() pt.plt.show()
def _configure(self): logger.debug(' ----- ' + self.__class__.__name__ + ' configure') config = tmp_smart_cast_config(self) logger.info('triangulator config = {}'.format(ub.repr2(config, nl=2))) output_fpath = config.pop('output_fpath') cal_fpath = config.pop('cal_fpath') self.triangulator = ctalgo.FishStereoMeasurments(**config) # Camera loading process is not working correctly. # Load camera calibration data here for now. # if not os.path.exists(cal_fpath): raise KeyError('must specify a valid camera calibration path') self.cal = ctalgo.StereoCalibration.from_file(cal_fpath) logger.info('self.cal = {!r}'.format(self.cal)) self.headers = ['current_frame', 'fishlen', 'range', 'error', 'dz', 'box_pts1', 'box_pts2'] self.output_file = open(output_fpath, 'w') self.output_file.write(','.join(self.headers) + '\n') self.output_file.close() self.output_file = open(output_fpath, 'a') self._base_configure() self.prog = ub.ProgIter(verbose=3) self.prog.begin()
def _recurse(d): import torch import numpy as np if isinstance(d, dict): return ub.odict(sorted([(k, _recurse(v)) for k, v in d.items()])) clsname = type(d).__name__ if 'Container' in clsname: meta = ub.odict(sorted([ ('stack', d.stack), # ('padding_value', d.padding_value), # ('pad_dims', d.pad_dims), # ('datatype', d.datatype), ('cpu_only', d.cpu_only), ])) meta = ub.repr2(meta, nl=0) return {type(d).__name__ + meta: _recurse(d.data)} elif isinstance(d, list): return [_recurse(v) for v in d] elif isinstance(d, tuple): return tuple([_recurse(v) for v in d]) elif isinstance(d, torch.Tensor): return d.shape elif isinstance(d, np.ndarray): return d.shape elif isinstance(d, (str, bytes)): return d elif isinstance(d, (int, float)): return d elif isinstance(d, slice): return d elif 'PolygonMasks' == clsname: # hack for mmdet return repr(d) elif 'BitmapMasks' == clsname: # hack for mmdet return repr(d) elif hasattr(d, 'shape'): return d.shape elif hasattr(d, 'items'): # hack for dict-like objects return ub.odict(sorted([(k, _recurse(v)) for k, v in d.items()])) else: raise TypeError(type(d))
def _update_hashes(): """ for dev use to update hashes of the demo images CommandLine: xdoctest -m kwimage.im_demodata _update_hashes xdoctest -m kwimage.im_demodata _update_hashes --require-hashes """ TEST_IMAGES = _TEST_IMAGES.copy() for key in TEST_IMAGES.keys(): item = TEST_IMAGES[key] grabkw = { 'appname': 'kwimage/demodata', } # item['sha512'] = 'not correct' # Wait until ubelt 9.1 is released to change hasher due to # issue in ub.grabdata # hasher_priority = ['sha512', 'sha1'] hasher_priority = ['sha1'] REQUIRE_EXISTING_HASH = ub.argflag('--require-hashes') if REQUIRE_EXISTING_HASH: for hasher in hasher_priority: if hasher in item: grabkw.update({ 'hash_prefix': item[hasher], 'hasher': hasher, }) break if 'fname' in item: grabkw['fname'] = item['fname'] item.pop('sha512', None) fpath = ub.grabdata(item['url'], **grabkw) if 'hasher' not in item: hasher = hasher_priority[0] prefix = ub.hash_file(fpath, hasher=hasher) item[hasher] = prefix[0:64] print('_TEST_IMAGES = ' + ub.repr2(TEST_IMAGES, nl=2))
def sed(regexpr, repl, dpath=None, include=None, exclude=None, recursive=True, dry=False, verbose=1): r""" Execute a sed on multiple files. Example: >>> from xdev.search_replace import * # NOQA >>> from xdev.search_replace import _create_test_filesystem >>> dpath = _create_test_filesystem()['root'] >>> sed('a', 'x', dpath=dpath, dry=True) """ num_changed = 0 num_files_checked = 0 num_skipped = 0 fpaths_changed = [] fpath_generator = find(dpath=dpath, type='f', include=include, exclude=exclude, recursive=recursive) for fpath in fpath_generator: try: changed_lines = sedfile(fpath, regexpr, repl, dry=dry) except UnicodeDecodeError: num_skipped += 1 else: num_files_checked += 1 if len(changed_lines) > 0: fpaths_changed.append(fpath) num_changed += len(changed_lines) if verbose: print('num_files_checked = {}'.format(num_files_checked)) print('num probable binary files skipped = {}'.format(num_skipped)) print('fpaths_changed = {}'.format(ub.repr2(sorted(fpaths_changed)))) print('total lines changed = {!r}'.format(num_changed))
def setup_dpath(self, train_dpath, short=True, hashed=True): train_info = self.train_info(train_dpath, short, hashed) train_dpath = ub.ensuredir(train_info['train_dpath']) # backwards compatability code, # can eventually remove after a major version change if True: # backwards compatability code if os.path.exists( train_info['old_train_dpath']) and not os.path.islink( train_info['old_train_dpath']): ub.delete(train_info['train_dpath']) ub.symlink(train_info['old_train_dpath'], train_info['train_dpath'], overwrite=True, verbose=3) # setup symlinks # ub.ensuredir(dirname(train_info['link_dpath'])) # ub.symlink(train_info['train_dpath'], train_info['link_dpath'], # overwrite=True, verbose=3) if train_info['nice_dpath']: ub.ensuredir(dirname(train_info['nice_dpath'])) ub.symlink(train_info['train_dpath'], train_info['nice_dpath'], overwrite=True, verbose=3) verbose = 0 if verbose: print('+=========') # print('hyper_strid = {!r}'.format(params.hyper_id())) # print('train_init_id = {!r}'.format(train_info['input_id'])) # print('arch = {!r}'.format(train_info['arch_id'])) # print('train_hyper_hashid = {!r}'.format(train_info['train_hyper_hashid'])) print('hyper = {}'.format(ub.repr2(train_info['hyper'], nl=3))) print('train_hyper_id_brief = {!r}'.format( train_info['train_hyper_id_brief'])) print('train_id = {!r}'.format(train_info['train_id'])) print('+=========') return train_info
def text_between_lines(lnum1, lnum2, col1=0, col2=sys.maxsize - 1): import vim # lines = vim.eval('getline({}, {})'.format(lnum1, lnum2)) lines = vim.current.buffer[lnum1 - 1:lnum2] lines = [ub.ensure_unicode(line) for line in lines] try: if len(lines) == 0: pass elif len(lines) == 1: lines[0] = lines[0][col1:col2 + 1] else: # lines[0] = lines[0][col1:] # lines[-1] = lines[-1][:col2 + 1] for i in range(len(lines)): lines[i] = lines[i][col1:col2 + 1] text = '\n'.join(lines) except Exception: print(ub.repr2(lines)) raise return text
def _load_sized_image(self, index, inp_size): # load the raw data from VOC cacher = ub.Cacher('voc_img', cfgstr=ub.repr2([index, inp_size]), appname='clab') data = cacher.tryload() if data is None: image = self._load_image(index) orig_size = np.array(image.shape[0:2][::-1]) factor = inp_size / orig_size # squish the image into network input coordinates interpolation = (cv2.INTER_AREA if factor.sum() <= 2 else cv2.INTER_CUBIC) hwc255 = cv2.resize(image, tuple(inp_size), interpolation=interpolation) data = hwc255, orig_size, factor cacher.save(data) hwc255, orig_size, factor = data return hwc255, orig_size, factor
def main(): coarse = Coarse() fine1 = Fine_V1() fine2 = Fine_V2() print('coarse = {!r}'.format(coarse)) print('fine1 = {!r}'.format(fine1)) print('fine2 = {!r}'.format(fine2)) cls_list = [Coarse, Fine_V1, Fine_V2] for data_cls in cls_list: data = data_cls() print('data_cls = {!r}'.format(data_cls)) for coerce_cls in cls_list: res = coerce_cls.coerce(data) print(' child_cls = {}, {}'.format(ub.repr2(coerce_cls, nl=1), res)) Coarse.coerce(fine1) Coarse.coerce(fine2)
def run_checks(): cfg = viame_wrangler.config.WrangleConfig({ 'annots': ub.truepath('~/data/viame-challenge-2018/phase1-annotations/*/*.json') }) fpaths = list(glob.glob(cfg.annots)) print('fpaths = {}'.format(ub.repr2(fpaths))) for fpath in fpaths: dset_name = os.path.basename(fpath).split('-')[0].split('.')[0] dset = CocoDataset(fpath, img_root=cfg.img_root, tag=dset_name) assert not dset.missing_images() assert not dset._find_bad_annotations() assert all([ img['has_annots'] in [True, False, None] for img in dset.imgs.values() ]) if 'original' not in dset_name: assert len(dset.cats) in [106, 21]
def _cmd(repo, command, cwd=ub.NoParam, verbose=ub.NoParam): if verbose is ub.NoParam: verbose = repo.verbose if cwd is ub.NoParam: cwd = repo.dpath repo._logged_cmds.append((command, cwd)) repo.debug('Run {!r} in {!r}'.format(command, cwd)) info = ub.cmd(command, cwd=cwd, verbose=verbose) if verbose: if info['out'].strip(): repo.info(info['out']) if info['err'].strip(): repo.debug(info['err']) if info['ret'] != 0: raise ShellException(ub.repr2(info)) return info
def format_quotes_in_file(fpath, diff=True, write=False, verbose=3): """ Autoformat quotation marks in Python files Args: fpath (str): The file to format diff (bool): if True write the diff between old and new to stdout write (bool): if True write the modifications to disk verbose (int): verbosity level """ if verbose > 1: print('reading fpath = {!r}'.format(fpath)) with open(fpath, 'r') as file: text = file.read() new_text = format_quotes_in_text(text) difftext = xdev.difftext(text, new_text, context_lines=3, colored=True) did_anything = bool(difftext.strip()) if verbose > 1: if not did_anything: print('No difference!') if diff: print(difftext) if write: # Write the file if did_anything: if verbose > 1: print('writing to fpath = {}'.format(ub.repr2(fpath, nl=1))) with open(fpath, 'w') as file: file.write(new_text) else: if not diff: if verbose > 1: print('dump formatted text to stdout') print(new_text)
def _print_previous_loop_statistics(infr, count): # Print stats about what happend in the this loop history = infr.metrics_list[-count:] recover_blocks = ut.group_items([ (k, sum(1 for i in g)) for k, g in it.groupby(ut.take_column(history, 'recovering')) ]).get(True, []) infr.print( ('Recovery mode entered {} times, ' 'made {} recovery decisions.').format(len(recover_blocks), sum(recover_blocks)), color='green', ) testaction_hist = ut.dict_hist(ut.take_column(history, 'test_action')) infr.print( 'Test Action Histogram: {}'.format( ut.repr4(testaction_hist, si=True)), color='yellow', ) if infr.params['inference.enabled']: action_hist = ut.dict_hist( ut.emap(frozenset, ut.take_column(history, 'action'))) infr.print( 'Inference Action Histogram: {}'.format( ub.repr2(action_hist, si=True)), color='yellow', ) infr.print( 'Decision Histogram: {}'.format( ut.repr2(ut.dict_hist(ut.take_column(history, 'pred_decision')), si=True)), color='yellow', ) infr.print( 'User Histogram: {}'.format( ut.repr2(ut.dict_hist(ut.take_column(history, 'user_id')), si=True)), color='yellow', )
def test_incomp_inference(): infr = demo.demodata_infr(num_pccs=0) # Make 2 consistent and 2 inconsistent CCs infr.add_feedback((1, 2), POSTV) infr.add_feedback((2, 3), POSTV) infr.add_feedback((3, 4), POSTV) infr.add_feedback((4, 1), POSTV) # ----- infr.add_feedback((11, 12), POSTV) infr.add_feedback((12, 13), POSTV) infr.add_feedback((13, 14), POSTV) infr.add_feedback((14, 11), POSTV) infr.add_feedback((12, 14), NEGTV) # ----- infr.add_feedback((21, 22), POSTV) infr.add_feedback((22, 23), POSTV) infr.add_feedback((23, 21), NEGTV) # ----- infr.add_feedback((31, 32), POSTV) infr.add_feedback((32, 33), POSTV) infr.add_feedback((33, 31), POSTV) infr.add_feedback((2, 32), NEGTV) infr.add_feedback((3, 33), NEGTV) infr.add_feedback((12, 21), NEGTV) # ----- # Incomparable within CCs print('==========================') infr.add_feedback((1, 3), INCMP) infr.add_feedback((1, 4), INCMP) infr.add_feedback((1, 2), INCMP) infr.add_feedback((11, 13), INCMP) infr.add_feedback((11, 14), INCMP) infr.add_feedback((11, 12), INCMP) infr.add_feedback((1, 31), INCMP) infr.add_feedback((2, 32), INCMP) infr.add_feedback((12, 21), INCMP) infr.add_feedback((23, 21), INCMP) infr.add_feedback((12, 14), INCMP) print('Final state:') print(ub.repr2(sorted(infr.gen_edge_attrs('decision'))))
def fit2(harn, prevstate_fpath=None, dry=False): from pysseg.backend.find_segnet_caffe import import_segnet_caffe from pysseg.backend import iface_caffe as iface caffe = import_segnet_caffe(gpu_num=harn.gpu_num) harn.prepare_solver() solver_info = iface.parse_solver_info(harn.solver_fpath) snapshot_iters = solver_info['snapshot'] # Assuming that the solver .prototxt has already been configured including # the corresponding training and testing network definitions (as .prototxt). solver = caffe.SGDSolver(harn.solver_fpath) pretrained = harn.init_pretrained_fpath prev_iter = 0 if prevstate_fpath is not None: print('Restoring State from {}'.format(prevstate_fpath)) solver.restore(prevstate_fpath) prev_iter = iface.snapshot_iterno(prevstate_fpath) elif pretrained is not None: # https://github.com/BVLC/caffe/issues/3336 print( 'Loading pretrained model weights from {}'.format(pretrained)) solver.net.copy_from(pretrained) # net = self.solver.net # Do iterations over batches # prev = None n_steps = solver_info['display'] bx = prev_iter while bx < solver_info['max_iter']: # Run until we can produce a snapshot info = solver.step(n_steps) print('bx = {!r}'.format(bx)) print('step info = {}'.format(ub.repr2(info))) bx += n_steps yield bx
def main(cmdline=True, **kw): config = ConvertConfig(default=kw, cmdline=cmdline) print('config = {}'.format(ub.repr2(dict(config), nl=1))) # TODO: ability to map image ids to agree with another coco file csv_fpaths = config['src'] new_root = config['new_root'] old_root = config['old_root'] images = config['images'] dst_fpath = config['dst'] dst_root = dirname(dst_fpath) dset = coco_from_viame_csv(csv_fpaths, images) dset.fpath = dst_fpath dset.img_root = dst_root try: dset.reroot(new_root=new_root, old_root=old_root, check=1) except Exception as ex: print('Reroot failed') print('ex = {!r}'.format(ex)) print('dset.fpath = {!r}'.format(dset.fpath)) dset.dump(dset.fpath, newlines=True)
def extract_ggr_pccs(coco_dset): import graphid graph = graphid.api.GraphID() graph.add_annots_from(coco_dset.annots().aids) infr = graph.infr infr.params['inference.enabled'] = False all_aids = list(coco_dset.annots().aids) aids_set = set(all_aids) for aid1 in ub.ProgIter(all_aids, desc='construct graph'): annot = coco_dset.anns[aid1] # resolve duplicate reviews (take the last one) aid2_to_decision = {} for aid2, decision in annot['review_ids']: aid2_to_decision[aid2] = decision for aid2, decision in aid2_to_decision.items(): if aid2 not in aids_set: # hack because data is setup wrong continue edge = (aid1, aid2) if decision == 'positive': infr.add_feedback(edge, evidence_decision=graphid.core.POSTV) elif decision == 'negative': infr.add_feedback(edge, evidence_decision=graphid.core.NEGTV) elif decision == 'incomparable': infr.add_feedback(edge, evidence_decision=graphid.core.INCMP) else: raise KeyError(decision) infr.params['inference.enabled'] = True infr.apply_nondynamic_update() print('status = {}' + ub.repr2(infr.status(True))) pccs = list(map(frozenset, infr.positive_components())) for pcc in pccs: for aid in pcc: print('aid = {!r}'.format(aid)) assert aid in coco_dset.anns return pccs
def test_negative_newlines(): import ubelt as ub dict_ = { 'k1': [[1, 2, 3], [4, 5, 6]], 'k2': [[[1, 2, [1, 2, 3]], [1, 2, 3], 3], [4, 5, 6]], 'k3': [1, 2, 3], 'k4': [[[1, 2, 3], 2, 3], [4, 5, 6]], } text = ub.repr2(dict_, nl=-1) print(text) assert text == ub.codeblock(''' { 'k1': [ [1, 2, 3], [4, 5, 6] ], 'k2': [ [ [ 1, 2, [1, 2, 3] ], [1, 2, 3], 3 ], [4, 5, 6] ], 'k3': [1, 2, 3], 'k4': [ [ [1, 2, 3], 2, 3 ], [4, 5, 6] ] } ''')
def refresh_candidate_edges(infr): """ Search for candidate edges. Assign each edge a priority and add to queue. """ infr.print('refresh_candidate_edges', 1) infr.assert_consistency_invariant() if infr.ibs is not None: candidate_edges = infr.find_lnbnn_candidate_edges() elif hasattr(infr, 'dummy_verif'): infr.print('Searching for dummy candidates') infr.print('dummy vsone params =' + ub.repr2(infr.dummy_verif.dummy_params, nl=1, si=True)) ranks_top = infr.params['ranking.ntop'] candidate_edges = infr.dummy_verif.find_candidate_edges( K=ranks_top) else: raise Exception( 'No method available to search for candidate edges') infr.add_candidate_edges(candidate_edges) infr.assert_consistency_invariant()
def main(cls, cmdline=True, **kw): """ Example: >>> kw = {'src': ['special:shapes8', 'special:shapes1']} >>> cmdline = False >>> cls = CocoUnionCLI >>> cls.main(cmdline, **kw) """ import kwcoco config = cls.CLIConfig(kw, cmdline=cmdline) print('config = {}'.format(ub.repr2(dict(config), nl=1))) if config['src'] is None: raise Exception('must specify sources: {}'.format(config['src'])) if len(config['src']) == 0: raise ValueError('Must provide at least one input dataset') datasets = [] for fpath in ub.ProgIter(config['src'], desc='reading datasets', verbose=1): print('reading fpath = {!r}'.format(fpath)) dset = kwcoco.CocoDataset.coerce(fpath) if config['absolute']: dset.reroot(absolute=True) datasets.append(dset) combo = kwcoco.CocoDataset.union(*datasets) out_fpath = config['dst'] out_dpath = dirname(out_fpath) if out_dpath: ub.ensuredir(out_dpath) print('Writing to out_fpath = {!r}'.format(out_fpath)) combo.fpath = out_fpath combo.dump(combo.fpath, newlines=True)
def fit(self, prevstate_fpath): from pysseg.backend.find_segnet_caffe import import_segnet_caffe from pysseg.backend import iface_caffe as iface harn = self.harn caffe = import_segnet_caffe(gpu_num=harn.gpu_num) harn.prepare_solver() solver_info = iface.parse_solver_info(harn.solver_fpath) model_fpath = solver_info['train_model_path'] model_info = iface.parse_model_info(model_fpath) self.solver = caffe.SGDSolver(harn.solver_fpath) pretrained = harn.init_pretrained_fpath if prevstate_fpath is not None: print('Restoring State from {}'.format(prevstate_fpath)) self.solver.restore(prevstate_fpath) elif pretrained is not None: print( 'Loading pretrained model weights from {}'.format(pretrained)) self.solver.net.copy_from(pretrained) layers = model_info['layer'] start_layer = layers[1]['name'] # Do iterations over batches for bx in range(solver_info['max_iter']): self.load_batch_data(bx) outputs = self.solver.net.forward(start=start_layer) import ubelt as ub print(ub.repr2(outputs)) self.solver.net.backwards() # need to manually update weights. bleh... self.update(bx)
def read_raw_categories(): cfg = viame_wrangler.config.WrangleConfig() img_root = cfg.img_root annot_dir = cfg.annot_dir fpaths = list(glob.glob(join(annot_dir, '*.json'))) print('Reading') dsets = [ coco_wrangler.CocoDataset(fpath, autobuild=True) for fpath in fpaths ] if 0: for dset in dsets: print(dset.img_root) # print(ub.repr2([d['name'] for d in dset.cats.values()])) # print(ub.repr2(dset.basic_stats())) print(ub.repr2(dset.category_annotation_frequency())) print('Merging') merged = coco_wrangler.CocoDataset.union(*dsets) merged.img_root = img_root # merged._run_fixes() # print(ub.repr2(merged.category_annotation_frequency())) tree0 = viame_wrangler.lifetree.LifeCatalog(autoparse=True) mapper = viame_wrangler.cats_2018.make_raw_category_mapping(merged, tree0) merged.rename_categories(mapper) print('Building') node_to_freq = merged.category_annotation_frequency() for node in tree0.G.nodes(): tree0.G.node[node]['freq'] = node_to_freq.get(node, 0) tree0.accumulate_frequencies() tree0.remove_unsupported_nodes() if DRAW: tree0.draw('c0-fine-classes-raw.png') return tree0, merged, mapper
def description(): import bs4 import requests resp = requests.get( 'https://gwg.nga.mil/ntb/baseline/software/testfile/Nitfv2_1/scen_2_1.html', verify=False) soup = bs4.BeautifulSoup(resp.text, 'html.parser') tables = soup.findAll('table') names_noext = [n.split('.')[0] for n in NITF_TEST_NAMES] name = None name_to_desc = {} for tab in tables: for td in tab.findAll('td'): if name is not None: desc = td.text.strip() name_to_desc[name] = desc.replace('\r', '').replace( '\n', '').replace('\t', '').replace('\xa0', '') name = None elif td.text.strip() in names_noext: name = td.text.strip() print(ub.repr2(name_to_desc, nl=1))
def main(): import timerit import ubelt as ub import random import string # expected = "58178059833426840615453390153965" length = 20 expected = ''.join(random.choices(string.printable, k=length)) def flip_char(text, pos): old = text[pos] new = random.choice(string.printable) while new == old: pass before = text[:pos - 1] after = text[pos:] return before + new + after variants = dict( ne_first=flip_char(expected, 0), ne_mid=flip_char(expected, length // 2), ne_last=flip_char(expected, length - 1), too_long='F' * len(expected) * 10, too_short='F', correct=expected, ) ti = timerit.Timerit(10000000, bestof=10, verbose=2) for key, value in variants.items(): for _ in ti.reset(key): value == expected print('ti.rankings = {}'.format( ub.repr2(ti.rankings['min'], nl=2, align=':')))
def closure_(obj, name): # TODO: handle assignments if name in visitor.import_lines: # Check and see if the name was imported from elsewhere return 'import', visitor.import_lines[name] elif name in visitor.assignments: type_, value = visitor.assignments[name] if type_ == 'node': # TODO, need to handle non-simple expressions return type_, '{} = {}'.format(name, value.value.id) else: # when value is a dict we need to be sure it is # extracted in the same order as we see it return type_, '{} = {}'.format(name, ub.repr2(value)) elif isinstance(obj, types.FunctionType): if obj.__module__ == module_name: sourcecode = inspect.getsource(obj) return 'code', sourcecode elif isinstance(obj, type): if obj.__module__ == module_name: sourcecode = inspect.getsource(obj) return 'code', sourcecode raise NotImplementedError(str(obj) + ' ' + str(name))
def test_hash_data(): counter = [0] failed = [] def check_hash(want, input_): count = counter[0] = counter[0] + 1 got = ub.hash_data(input_) # assert got.startswith(want), 'want={}, got={}'.format(want, got) print('check_hash({!r}, {!r})'.format(got, input_)) if want is not None and not got.startswith(want): item = (got, input_, count, want) failed.append(item) check_hash('egexcbwgdtmjrzafljtjwqpgfhmfetjs', '1') check_hash('hjvebphzylxgtxncyphclsjglvmstsbq', ['1']) check_hash('hjvebphzylxgtxncyphclsjglvmstsbq', tuple(['1'])) check_hash('ftzqivzayzivmobwymodjnnzzxzrvvjz', b'12') check_hash('jiwjkgkffldfoysfqblsemzkailyridf', [b'1', b'2']) check_hash('foevisahdffoxfasicvyklrmuuwqnfcc', ['1', '2', '3']) check_hash('rkcnfxkjwkrfejhbpcpopmyubhbvonkt', ['1', np.array([1, 2, 3], dtype=np.int64), '3']) check_hash('lxssoxdkstvccsyqaybaokehclyctgmn', '123') check_hash('fpvptydigvgjimbzadztgpvjpqrevwcq', zip([1, 2, 3], [4, 5, 6])) print(ub.repr2(failed, nl=1)) assert len(failed) == 0
def benchmark_hash_data(): """ CommandLine: python ~/code/ubelt/dev/bench_hash.py --convert=True --show python ~/code/ubelt/dev/bench_hash.py --convert=False --show """ import ubelt as ub #ITEM = 'JUST A STRING' * 100 ITEM = [0, 1, 'a', 'b', ['JUST A STRING'] * 4] HASHERS = ['sha1', 'sha512', 'xxh32', 'xxh64', 'blake3'] scales = list(range(5, 13)) results = ub.AutoDict() # Use json is faster or at least as fast it most cases # xxhash is also significantly faster than sha512 convert = ub.argval('--convert', default='True').lower() == 'True' print('convert = {!r}'.format(convert)) ti = ub.Timerit(9, bestof=3, verbose=1, unit='ms') for s in ub.ProgIter(scales, desc='benchmark', verbose=3): N = 2**s print(' --- s={s}, N={N} --- '.format(s=s, N=N)) data = [ITEM] * N for hasher in HASHERS: for timer in ti.reset(hasher): ub.hash_data(data, hasher=hasher, convert=convert) results[hasher].update({N: ti.mean()}) col = {h: results[h][N] for h in HASHERS} sortx = ub.argsort(col) ranking = ub.dict_subset(col, sortx) print('walltime: ' + ub.repr2(ranking, precision=9, nl=0)) best = next(iter(ranking)) #pairs = list(ub.iter_window( 2)) pairs = [(k, best) for k in ranking] ratios = [ranking[k1] / ranking[k2] for k1, k2 in pairs] nicekeys = ['{}/{}'.format(k1, k2) for k1, k2 in pairs] relratios = ub.odict(zip(nicekeys, ratios)) print('speedup: ' + ub.repr2(relratios, precision=4, nl=0)) # xdoc +REQUIRES(--show) # import pytest # pytest.skip() import pandas as pd df = pd.DataFrame.from_dict(results) df.columns.name = 'hasher' df.index.name = 'N' ratios = df.copy().drop(columns=df.columns) for k1, k2 in [('sha512', 'xxh32'), ('sha1', 'xxh32'), ('xxh64', 'xxh32')]: ratios['{}/{}'.format(k1, k2)] = df[k1] / df[k2] print() print('Seconds per iteration') print(df.to_string(float_format='%.9f')) print() print('Ratios of seconds') print(ratios.to_string(float_format='%.2f')) print() print('Average Ratio (over all N)') print('convert = {!r}'.format(convert)) print(ratios.mean().sort_values()) if ub.argflag('--show'): import kwplot kwplot.autompl() xdata = sorted(ub.peek(results.values()).keys()) ydata = ub.map_vals(lambda d: [d[x] for x in xdata], results) kwplot.multi_plot(xdata, ydata, xlabel='N', ylabel='seconds', title='convert = {}'.format(convert)) kwplot.show_if_requested()
def compute_likely_overlaps(pfiles1, pfiles2): step_idx1 = ProgressiveFile.compatible_step_idx(pfiles1) step_idx2 = ProgressiveFile.compatible_step_idx(pfiles2) step_idx = min(step_idx1, step_idx2) grouped1 = ProgressiveFile.group_pfiles(pfiles1, step_idx=step_idx) grouped2 = ProgressiveFile.group_pfiles(pfiles2, step_idx=step_idx) thresh = 0.2 verbose = 1 # TODO: it would be nice if we didn't have to care about internal # deduplication when we attempt to find cross-set overlaps dups1 = ProgressiveFile.likely_duplicates(inv1.pfiles, thresh=thresh, verbose=verbose) dups2 = ProgressiveFile.likely_duplicates(inv2.pfiles, thresh=thresh, verbose=verbose) pfiles = inv1.pfiles + inv2.pfiles dups3 = ProgressiveFile.likely_duplicates(pfiles, thresh=thresh, verbose=verbose) only_on_inv2 = {} for key, group in dups3.items(): if not any( item.fpath.startswith(inv1.root_fpath) for item in group): only_on_inv2[key] = group for p1 in inv1.pfiles: if 'Chase HQ 2 (JUE) [!].zip' in p1.fpath: break for p2 in inv2.pfiles: if 'Chase HQ 2 (JUE) [!].zip' in p2.fpath: break look = list(ub.flatten(only_on_inv2.values())) takealook = sorted([p.fpath for p in look]) print('takealook = {}'.format(ub.repr2(takealook, nl=1))) keys1 = set(grouped1) keys2 = set(grouped2) missing_keys2 = keys2 - keys1 missing_groups2 = ub.dict_subset(grouped2, missing_keys2) missing_fpaths2 = [] for key, values in missing_groups2.items(): print('key = {!r}'.format(key)) print('values = {}'.format(ub.repr2(values, nl=1))) missing_fpaths2.extend(values) missing_fpaths2 = sorted([p.fpath for p in missing_fpaths2]) print('missing_fpaths2 = {}'.format(ub.repr2(missing_fpaths2, nl=1))) # pass import xdev set_overlaps = xdev.set_overlaps(keys1, keys2) print('set_overlaps = {}'.format(ub.repr2(set_overlaps, nl=1)))
def main(): from sqlalchemy.orm import sessionmaker import ubelt as ub from sqlalchemy import create_engine engine = create_engine('sqlite:///:memory:') Base.metadata.create_all(engine) DBSession = sessionmaker(bind=engine) session = DBSession() session.add(Annotation(id=1, image_id=1, bbox=[13, 13, 28, 15])) session.add(Annotation(id=2, image_id=2, bbox=[13, 13, 28, 15])) session.add(Annotation(id=3, image_id=2, bbox=[18, 10, 25, 17])) session.add(Annotation(id=4, image_id=4, bbox=[13, 10, 25, 17])) session.add(Image(id=1, file_name='img1.jpg')) session.add(Image(id=2, file_name='img2.jpg')) session.add(Image(id=3, file_name='img3.jpg')) session.add(Image(id=4, file_name='img4.jpg')) session.add(Image(id=5, file_name='img5.jpg')) session.commit() import pandas as pd print(pd.read_sql_table('annotations', con=engine)) print(pd.read_sql_table('images', con=engine)) # Args: parent_keyattr = Image.id keyattr = Annotation.image_id valattr = Annotation.id """ ----------- ## A Correct Solution With Raw SQL ## """ ### # Raw SQLite: Does exactly what I want ### parent_table = parent_keyattr.class_.__tablename__ table = keyattr.class_.__tablename__ parent_keycol = parent_table + '.' + parent_keyattr.name keycol = table + '.' + keyattr.name valcol = table + '.' + valattr.name expr = ('SELECT {parent_keycol}, json_group_array({valcol}) ' 'FROM {parent_table} ' 'LEFT OUTER JOIN {table} ON {keycol} = {parent_keycol} ' 'GROUP BY {parent_keycol} ORDER BY {parent_keycol}').format( parent_table=parent_table, table=table, parent_keycol=parent_keycol, keycol=keycol, valcol=valcol, ) print(expr) import json result = session.execute(expr) final = [] for row in result.fetchall(): key = row[0] group = json.loads(row[1]) if group[0] is None: group = set() else: group = set(group) tup = (key, group) final.append(tup) print('final = {}'.format(ub.repr2(final, nl=1))) """ This expands out to: SELECT images.id, json_group_array(annotations.id) FROM images LEFT OUTER JOIN annotations ON annotations.image_id = images.id GROUP BY images.id ORDER BY images.id and with some post-processing on row[1] returns: ``` final = [ (1, {1}), (2, {2, 3}), (3, {}), (4, {4}), (5, {}), ] ``` The images 3 and 5 without annotations are correctly accounted for. But I'm having a very hard time figuring out how to do the equivalent behavior with SQLAlchemy. I've tried several variation: ----------- ## An Almost Correct Solution With SQLAlchemy ## """ # SQLite Alchemy ### # VERSION 1: Does not correctly return null for images without annotations ### grouped_vals = sqlalchemy.func.json_group_array(valattr, type_=JSON) parent_table = parent_keyattr.class_.__table__ table = keyattr.class_.__table__ # TODO: This might have to be different for PostgreSQL grouped_vals = sqlalchemy.func.json_group_array(valattr, type_=JSON) query = (session.query(keyattr, grouped_vals).outerjoin( parent_table, parent_keyattr == keyattr).group_by( parent_keyattr).order_by(parent_keyattr)) print(query.statement) final = [] for row in query.all(): key = row[0] group = row[1] if group[0] is None: group = set() else: group = set(group) tup = (key, group) final.append(tup) print('final = {}'.format(ub.repr2(final, nl=1))) """ This expands to: SELECT annotations.image_id, json_group_array(annotations.id) AS json_group_array_1 FROM annotations LEFT OUTER JOIN images ON images.id = annotations.image_id GROUP BY images.id ORDER BY images.id And returns: ``` final = [ (1, {1}), (2, {2, 3}), (4, {4}), ] ``` which is missing the values for images 3 and 5. 
This is because I queried on `keyattr` (annotations.image_id) instead of `parent_keyattr` (images.id). ----------- ## An Attempt To Fix The Issue ## But if I try to use parent_keyattr I get an error when I try the outer join """ query = (session.query(parent_keyattr, grouped_vals).outerjoin(parent_table, parent_keyattr == keyattr)) """ Looking at: `print(session.query(parent_keyattr, grouped_vals))` this makes sense because I get: ``` SELECT images.id AS images_id, json_group_array(annotations.id) AS json_group_array_1 FROM images, annotations ``` The issue is the both images and annotations are in the FROM statement. I'm not sure if there is a way to force `grouped_vals` to think its FROM statement targets the annotations table. I've tried several variants but have had little luck sofar. ----------- ## A Better But Not Perfect Fix ## The best luck I've had was by wrapping `grouped_vals` in a `str`. Which does let me get exactly what I want, but I lose the nice `type_=JSON` that automatically took care of converting the result to json for me. """ query = (session.query(parent_keyattr, str(grouped_vals)).outerjoin( table, parent_keyattr == keyattr).group_by(parent_keyattr).order_by( parent_keyattr)) print(query.statement) final = [] for row in query.all(): key = row[0] group = json.loads(row[1]) if group[0] is None: group = set() else: group = set(group) tup = (key, group) final.append(tup) print('final = {}'.format(ub.repr2(final, nl=1))) """ I would like to know if there is a way to force `grouped_vals` to target the "images" table instead of "annotations", so I don't have to wrap it in a string, and I don't have to manually convert to JSON. """ print( session.query(parent_keyattr.expression, grouped_vals).select_from(parent_table)) subq = session.query(parent_keyattr.expression, grouped_vals).subquery() y = subq.outerjoin(table, parent_keyattr == keyattr).select() z = y.group_by(parent_keyattr).order_by(parent_keyattr) print(z) z.all() print(subq) print(subq.outerjoin(table, parent_keyattr == keyattr)) x = session.query( parent_keyattr.expression, grouped_vals.select().select_from(parent_table)).subquery() x.outerjoin(table, parent_keyattr == keyattr) # .group_by(parent_keyattr).order_by(parent_keyattr) print(x) z = session.query(parent_keyattr).outerjoin(table, parent_keyattr == keyattr) z = session.query(parent_keyattr).outerjoin(table, parent_keyattr == keyattr) z.all() query = (session.query(parent_keyattr, str(grouped_vals)).outerjoin( table, parent_keyattr == keyattr).group_by(parent_keyattr).order_by( parent_keyattr)) print(query.statement) ojoin = parent_table.outerjoin(table, parent_keyattr == keyattr) z = ojoin.select() sub = session.query(z).subquery() print(sub) print(session.query(z)) # .all() z = ojoin.select() session.execute(z).fetchall() sel = sqlalchemy.select([parent_keyattr, grouped_vals]).select_from() print(sel) session.execute(sel) """
jon/viame/master jon/viame/next master dev/tracking-framework viame/master viame/query-wip viame/tracking-work viame/master-no-pybind viame/master-w-pytorch " """ # branches = [x.strip() for x in ''' # jon/viame/master # jon/viame/next # master # dev/tracking-framework # viame/master # viame/query-wip # viame/tracking-work # viame/master-no-pybind # viame/master-w-pytorch # '''.splitlines() if x.strip()] import sys argv = sys.argv[1:] branches = [] for item in argv: for sub in item.split(): sub = sub.strip() if sub: branches.append(sub) print('branches = {}'.format(ub.repr2(branches))) check_relationships(branches)
def main(): # TODO: progressive hashing data structure inv1 = Inventory('/media/joncrall/raid/', blocklist) inv2 = Inventory('/media/joncrall/media', blocklist) # inv1 = Inventory('/media/joncrall/raid/Applications/NotGames', blocklist) # inv2 = Inventory('/media/joncrall/media/Applications/NotGames', blocklist) # inv1 = Inventory('/media/joncrall/raid/Applications', blocklist) # inv2 = Inventory('/media/joncrall/media/Applications', blocklist) self = inv1 # NOQA inv1.build() inv2.build() thresh = { 'frac': 0.5, 'byte': 100 * int(2**20) # only use the first few mb to determine overlap } verbose = 1 pfiles1 = inv1.pfiles pfiles2 = inv2.pfiles overlap, only1, only2 = ProgressiveFile.likely_overlaps(pfiles1, pfiles2, thresh=thresh, verbose=verbose) stats = { 'overlap': len(overlap), 'only1': len(only1), 'only2': len(only2), } print('stats = {}'.format(ub.repr2(stats, nl=1))) only2_list = sorted([p.fpath for group in only2.values() for p in group]) print('only2_list = {}'.format(ub.repr2(only2_list, nl=1))) print('stats = {}'.format(ub.repr2(stats, nl=1))) # for pfile in inv1.pfiles: # pfile._check_integrity() import numpy as np mb_read = np.array([ pfile._parts[-1][1] / int(2**20) for pfile in ub.ProgIter(inv2.pfiles) ]) mb_read.max() mb_read.min() # Build all hashes up to a reasonable degree inv1.build_hashes(max_workers=0) maybe_dups = inv1.likely_duplicates(thresh=0.2) len(maybe_dups) maybe_dups = ub.sorted_keys(maybe_dups, key=lambda x: x[2]) import networkx as nx import itertools as it # Check which directories are most likely to be duplicates graph = nx.Graph() for key, group in ub.ProgIter(maybe_dups.items(), total=len(maybe_dups), desc='build dup dir graph'): if key[0] == '': continue dpaths = [dirname(pfile.fpath) for pfile in group] for d1, d2 in it.combinations(dpaths, 2): graph.add_edge(d1, d2) edge = graph.edges[(d1, d2)] if 'dups' not in edge: edge['dups'] = 0 edge['dups'] += 1 edge_data = list(graph.edges(data=True)) for dpath in ub.ProgIter(graph.nodes, desc='find lens'): num_children = len(os.listdir(dpath)) graph.nodes[dpath]['num_children'] = num_children for d1, d2, dat in edge_data: nc1 = graph.nodes[d1]['num_children'] nc2 = graph.nodes[d2]['num_children'] ndups = dat['dups'] dup_score = (dat['dups'] / min(nc1, nc2)) dat['dup_score'] = dup_score if dup_score > 0.9: print('dup_score = {!r}'.format(dup_score)) print('d1 = {!r}'.format(d1)) print('d2 = {!r}'.format(d2)) print('nc1 = {!r}'.format(nc1)) print('nc2 = {!r}'.format(nc2)) print('ndups = {!r}'.format(ndups)) print('edge_data = {}'.format(ub.repr2(edge_data, nl=2))) print('maybe_dups = {}'.format(ub.repr2(maybe_dups.keys(), nl=3))) for key, group in maybe_dups.items(): if key[0] == '': continue print('key = {!r}'.format(key)) print('group = {}'.format(ub.repr2(group, nl=1))) for pfile in group: pfile.refined_to(float('inf')) print('key = {!r}'.format(key)) inv2.build_hashes(max_workers=6, mode='thread') inv1.pfiles = [ p for p in ub.ProgIter(inv1.pfiles, desc='exist check') if exists(p.fpath) ] inv2.pfiles = [ p for p in ub.ProgIter(inv2.pfiles, desc='exist check') if exists(p.fpath) ] pfiles1 = inv1.pfiles pfiles2 = inv2.pfiles def compute_likely_overlaps(pfiles1, pfiles2): step_idx1 = ProgressiveFile.compatible_step_idx(pfiles1) step_idx2 = ProgressiveFile.compatible_step_idx(pfiles2) step_idx = min(step_idx1, step_idx2) grouped1 = ProgressiveFile.group_pfiles(pfiles1, step_idx=step_idx) grouped2 = ProgressiveFile.group_pfiles(pfiles2, step_idx=step_idx) thresh = 0.2 verbose = 1 # TODO: it would be nice if we 
didn't have to care about internal # deduplication when we attempt to find cross-set overlaps dups1 = ProgressiveFile.likely_duplicates(inv1.pfiles, thresh=thresh, verbose=verbose) dups2 = ProgressiveFile.likely_duplicates(inv2.pfiles, thresh=thresh, verbose=verbose) pfiles = inv1.pfiles + inv2.pfiles dups3 = ProgressiveFile.likely_duplicates(pfiles, thresh=thresh, verbose=verbose) only_on_inv2 = {} for key, group in dups3.items(): if not any( item.fpath.startswith(inv1.root_fpath) for item in group): only_on_inv2[key] = group for p1 in inv1.pfiles: if 'Chase HQ 2 (JUE) [!].zip' in p1.fpath: break for p2 in inv2.pfiles: if 'Chase HQ 2 (JUE) [!].zip' in p2.fpath: break look = list(ub.flatten(only_on_inv2.values())) takealook = sorted([p.fpath for p in look]) print('takealook = {}'.format(ub.repr2(takealook, nl=1))) keys1 = set(grouped1) keys2 = set(grouped2) missing_keys2 = keys2 - keys1 missing_groups2 = ub.dict_subset(grouped2, missing_keys2) missing_fpaths2 = [] for key, values in missing_groups2.items(): print('key = {!r}'.format(key)) print('values = {}'.format(ub.repr2(values, nl=1))) missing_fpaths2.extend(values) missing_fpaths2 = sorted([p.fpath for p in missing_fpaths2]) print('missing_fpaths2 = {}'.format(ub.repr2(missing_fpaths2, nl=1))) # pass import xdev set_overlaps = xdev.set_overlaps(keys1, keys2) print('set_overlaps = {}'.format(ub.repr2(set_overlaps, nl=1))) # We want to know what files in set2 do not exist in set1 if 0: fpath = inv1.all_fpaths[0] pfile = ProgressiveFile(fpath) fpath1 = '/media/joncrall/raid/unsorted/yet-another-backup/card-usb-drive/Transfer/Zebras/DownloadedLibraries/lightspeed/solve_triu.m' fpath2 = '/media/joncrall/raid/unsorted/yet-another-backup/card-usb-drive/Zebras/downloaded_libraries/lightspeed/solve_triu.m' fpath1 = '/media/joncrall/raid/Applications/Wii/WiiHacksAndStuff/CurrentHacks/Falco/DarkFalco02.pcs' fpath2 = '/media/joncrall/raid/Applications/Wii/WiiHacksAndStuff/CurrentHacks/Ivysaur/Kraid-v2-Ivy.pcs' pfile = pfile1 = ProgressiveFile(fpath1) pfile2 = ProgressiveFile(fpath2) pfile.maybe_equal(pfile2, thresh=0.1) fpath_demodata = inv1.all_fpaths[::len(inv1.all_fpaths) // 500] # fpaths = hash_groups1_dup['ef46db3751d8e999'] pfiles_demodata = [ProgressiveFile(f) for f in fpath_demodata] def progressive_duplicates(pfiles, idx=1): step_ids = [pfile.refined_to(idx) for pfile in ub.ProgIter(pfiles)] final_groups = {} grouped = ub.group_items(pfiles, step_ids) for key, group in grouped.items(): if len(group) > 1: if all(not g.can_refine for g in group): # Group is ~100% a real duplicate final_groups[key] = group else: pfiles = group deduped = progressive_duplicates(pfiles, idx=idx + 1) final_groups.update(deduped) else: final_groups[key] = group return final_groups pfiles = pfiles_demodata final_groups = progressive_duplicates(pfiles) for key, group in final_groups.items(): if len(group) > 1: print('key = {!r}'.format(key)) print('group = {}'.format(ub.repr2(group, nl=1))) inv1.build_hashes() inv2.build_hashes() hash_groups1 = ub.group_items(inv1.all_fpaths, inv1.all_hashes) hash_groups2 = ub.group_items(inv2.all_fpaths, inv2.all_hashes) hash_groups1_dup = { k: v for k, v in hash_groups1.items() if len(v) > 1 } hash_groups2_dup = { k: v for k, v in hash_groups2.items() if len(v) > 1 } len(hash_groups1_dup) len(hash_groups2_dup) # common = set(hash_groups1) & set(hash_groups2) # xdev.set_overlaps(hash_groups1, hash_groups2) fnames1 = ub.group_items(inv1.all_fpaths, key=basename) fnames2 = ub.group_items(inv2.all_fpaths, key=basename) missing = 
ub.dict_diff(fnames2, fnames1) sorted(ub.flatten(missing.values())) len(missing) fpath_demodata = inv1.all_fpaths[::len(inv1.all_fpaths) // 500] def internal_deduplicate(self): hash_groups = ub.group_items(self.all_fpaths, self.all_hashes) hash_groups_dup = { k: v for k, v in hash_groups.items() if len(v) > 1 } from os.path import dirname hash_groups_dup['ef46db3751d8e999'] for key, values in hash_groups_dup.items(): for v in values: if v.endswith('.avi'): break [basename(v) for v in values] [dirname(v) for v in values]
def git_squash_streaks(): """ git-squash-streaks Usage: See argparse """ import argparse try: import argcomplete except ImportError: argcomplete = None raise description, help_dict = _autoparse_desc(squash_streaks) parser = argparse.ArgumentParser(description=description) parser.add_argument(*('--timedelta',), type=str, help=help_dict['timedelta']) parser.add_argument(*('--custom_streak',), nargs=2, help='hack to specify one custom streak') parser.add_argument(*('--pattern',), type=str, help=help_dict['pattern']) parser.add_argument(*('--tags',), action='store_true', help='experimental') parser.add_argument(*('--no-preserve-tags',), dest='preserve_tags', action='store_false', help=help_dict['preserve_tags']) parser.add_argument(*('--oldest-commit',), dest='oldest_commit', help=help_dict['oldest_commit']) parser.add_argument(*('--inplace',), action='store_true', help=help_dict['inplace']) parser.add_argument(*('--auto-rollback',), action='store_true', dest='auto_rollback', help=help_dict['auto_rollback']) parser.add_argument('--authors', type=str, help=(help_dict['authors'] + ' Defaults to your git config user.name')) group = parser.add_mutually_exclusive_group() group.add_argument(*('-n', '--dry'), dest='dry', action='store_true', help=help_dict['dry']) group.add_argument(*('-f', '--force'), dest='dry', action='store_false', help='opposite of --dry') group = parser.add_mutually_exclusive_group() group.add_argument(*('-v', '--verbose'), dest='verbose', action='store_const', const=1, help='verbosity flag flag') group.add_argument(*('-q', '--quiet'), dest='verbose', action='store_const', const=0, help='suppress output') parser.set_defaults( tags=False, inplace=False, preserve_tags=True, auto_rollback=False, authors=None, pattern=None, timedelta='sameday', dry=True, verbose=True, ) if argcomplete: argcomplete.autocomplete(parser) args = parser.parse_args() # Postprocess args ns = args.__dict__.copy() if ns.pop('tags'): do_tags() return try: ns['timedelta'] = float(ns['timedelta']) except ValueError: valid_timedelta_categories = ['sameday', 'alltime'] if ns['timedelta'] not in valid_timedelta_categories: raise ValueError('timedelta = {}'.format(ns['timedelta'])) if ns['authors'] is None: ns['authors'] = {git.Git().config('user.name')} # HACK: for me. todo user alias # SEE: .mailmap file to auto extract? # https://git-scm.com/docs/git-shortlog#_mapping_authors """ # .mailmap # Proper Name <*****@*****.**> Commit Name <*****@*****.**> Jon Crall <*****@*****.**> joncrall <*****@*****.**> Jon Crall <*****@*****.**> jon.crall <*****@*****.**> Jon Crall <*****@*****.**> Jon Crall <*****@*****.**> Jon Crall <*****@*****.**> joncrall <*****@*****.**> Jon Crall <*****@*****.**> joncrall <*****@*****.**> Jon Crall <*****@*****.**> Jon Crall <*****@*****.**> """ if {'joncrall', 'Jon Crall', 'jon.crall'}.intersection(ns['authors']): ns['authors'].update({'joncrall', 'Jon Crall'}) else: ns['authors'] = {a.strip() for a in ns['authors'].split(',')} print(ub.repr2(ns, nl=1)) squash_streaks(**ns) if ns['dry']: if ns['verbose']: print('Finished the dry run. Use -f to force')
def compare_results(): print('Comparing results') import pandas as pd from tabulate import tabulate # Read in output of demo script measure_fpath = 'measurements_haul83.csv' py_df = pd.DataFrame.from_csv(measure_fpath, index_col=None) # Convert python length output from mm into cm for consistency py_df['fishlen'] = py_df['fishlen'] / 10 py_df['current_frame'] = py_df['current_frame'].astype(np.int) # janky CSV parsing py_df['box_pts1'] = py_df['box_pts1'].map(lambda p: eval(p.replace(';', ','), np.__dict__)) py_df['box_pts2'] = py_df['box_pts2'].map(lambda p: eval(p.replace(';', ','), np.__dict__)) py_df['obox1'] = [ctalgo.OrientedBBox(*cv2.minAreaRect(pts[:, None, :].astype(np.int))) for pts in py_df['box_pts1']] py_df['obox2'] = [ctalgo.OrientedBBox(*cv2.minAreaRect(pts[:, None, :].astype(np.int))) for pts in py_df['box_pts2']] py_df.drop(['box_pts1', 'box_pts2'], axis=1, inplace=True) # Remap to matlab names py_df = py_df.rename(columns={ 'error': 'Err', 'fishlen': 'fishLength', 'range': 'fishRange', }) # Load matlab results mat_df = _read_kresimir_results() FORCE_COMPARABLE_RANGE = True # FORCE_COMPARABLE_RANGE = False if FORCE_COMPARABLE_RANGE: # Be absolutely certain we are in comparable regions (may slightly bias # results, against python and in favor of matlab) min_frame = max(mat_df.current_frame.min(), py_df.current_frame.min()) max_frame = min(mat_df.current_frame.max(), py_df.current_frame.max()) print('min_frame = {!r}'.format(min_frame)) print('max_frame = {!r}'.format(max_frame)) mat_df = mat_df[(mat_df.current_frame >= min_frame) & (mat_df.current_frame <= max_frame)] py_df = py_df[(py_df.current_frame >= min_frame) & (py_df.current_frame <= max_frame)] intersect_frames = np.intersect1d(mat_df.current_frame, py_df.current_frame) print('intersecting frames = {} / {} (matlab)'.format( len(intersect_frames), len(set(mat_df.current_frame)))) print('intersecting frames = {} / {} (python)'.format( len(intersect_frames), len(set(py_df.current_frame)))) # Reuse the hungarian algorithm implementation from ctalgo min_assign = ctalgo.FishStereoMeasurments.minimum_weight_assignment correspond = [] for f in intersect_frames: pidxs = np.where(py_df.current_frame == f)[0] midxs = np.where(mat_df.current_frame == f)[0] pdf = py_df.iloc[pidxs] mdf = mat_df.iloc[midxs] ppts1 = np.array([o.center for o in pdf['obox1']]) mpts1 = np.array([o.center for o in mdf['obox1']]) ppts2 = np.array([o.center for o in pdf['obox2']]) mpts2 = np.array([o.center for o in mdf['obox2']]) dists1 = sklearn.metrics.pairwise.pairwise_distances(ppts1, mpts1) dists2 = sklearn.metrics.pairwise.pairwise_distances(ppts2, mpts2) # arbitrarilly chosen threshold thresh = 100 for i, j in min_assign(dists1): d1 = dists1[i, j] d2 = dists2[i, j] if d1 < thresh and d2 < thresh and abs(d1 - d2) < thresh / 4: correspond.append((pidxs[i], midxs[j])) correspond = np.array(correspond) # pflags = np.array(ub.boolmask(correspond.T[0], len(py_df))) mflags = np.array(ub.boolmask(correspond.T[1], len(mat_df))) # print('there are {} detections that seem to be in common'.format(len(correspond))) # print('The QC flags of the common detections are: {}'.format( # ub.dict_hist(mat_df[mflags]['QC'].values))) # print('The QC flags of the other matlab detections are: {}'.format( # ub.dict_hist(mat_df[~mflags]['QC'].values))) print('\n\n----\n## All stats\n') print(ub.codeblock( ''' Overall, the matlab script made {nmat} length measurements and the python script made {npy} length measurements. 
Here is a table summarizing the average lengths / ranges / errors of each script: ''').format(npy=len(py_df), nmat=len(mat_df))) stats = pd.DataFrame(columns=['python', 'matlab']) for key in ['fishLength', 'fishRange', 'Err']: stats.loc[key, 'python'] = '{:6.2f} ± {:6.2f}'.format(py_df[key].mean(), py_df[key].std()) stats.loc[key, 'matlab'] = '{:6.2f} ± {:6.2f}'.format(mat_df[key].mean(), mat_df[key].std()) stats.loc['nTotal', 'python'] = '{}'.format(len(py_df)) stats.loc['nTotal', 'matlab'] = '{}'.format(len(mat_df)) print(tabulate(stats, headers='keys', tablefmt='psql', stralign='right')) print('\n\n----\n## Only COMMON detections\n') py_df_c = py_df.iloc[correspond.T[0]] mat_df_c = mat_df.iloc[correspond.T[1]] stats = pd.DataFrame(columns=['python', 'matlab']) for key in ['fishLength', 'fishRange', 'Err']: stats.loc[key, 'python'] = '{:6.2f} ± {:6.2f}'.format(py_df_c[key].mean(), py_df_c[key].std()) stats.loc[key, 'matlab'] = '{:6.2f} ± {:6.2f}'.format(mat_df_c[key].mean(), mat_df_c[key].std()) stats.loc['nTotal', 'python'] = '{}'.format(len(py_df_c)) stats.loc['nTotal', 'matlab'] = '{}'.format(len(mat_df_c)) print(ub.codeblock( ''' Now, we investigate how many dections matlab and python made in common. (Note, choosing which dections in one version correspond to which in another is done using a heuristic based on distances between bbox centers and a thresholded minimum assignment problem). Python made {npy_c}/{nmat} = {percent:.2f}% of the detections matlab made ''').format(npy_c=len(py_df_c), nmat=len(mat_df), percent=100 * len(py_df_c) / len(mat_df))) print(tabulate(stats, headers='keys', tablefmt='psql', stralign='right')) print('\n\n----\n## Evaulation using the QC code\n') hist_hit = ub.dict_hist(mat_df[mflags]['QC'].values) hist_miss = ub.dict_hist(mat_df[~mflags]['QC'].values) print(ub.codeblock( ''' However, not all of those matlab detections were good. Because we have detections in corrsepondences with each other we can assign the python detections QC codes. Here is a histogram of the QC codes for these python detections: {} (Note: read histogram as <QC-code>: <frequency>) Here is a histogram of the other matlab detections that python did not find: {} To summarize: python correctly rejected {:.2f}% of the matlab QC=0 detections python correctly accepted {:.2f}% of the matlab QC=1 detections python correctly accepted {:.2f}% of the matlab QC=2 detections Note, that because python made detections that matlab did not make, the remaining {} detections may be right or wrong, but there is no way to tell from this analysis. Lastly, here are the statistics for the common detections that had a non-zero QC code. ''').format( ub.repr2(hist_hit, nl=1), ub.repr2(hist_miss, nl=1), 100 * hist_miss[0] / (hist_hit[0] + hist_miss[0]), 100 * hist_hit[1] / (hist_hit[1] + hist_miss[1]), 100 * hist_hit[2] / (hist_hit[2] + hist_miss[2]), len(py_df) - len(py_df_c) ) ) is_qc = (mat_df_c['QC'] > 0).values mat_df_c = mat_df_c[is_qc] py_df_c = py_df_c[is_qc] stats = pd.DataFrame(columns=['python', 'matlab']) for key in ['fishLength', 'fishRange', 'Err']: stats.loc[key, 'python'] = '{:6.2f} ± {:6.2f}'.format(py_df_c[key].mean(), py_df_c[key].std()) stats.loc[key, 'matlab'] = '{:6.2f} ± {:6.2f}'.format(mat_df_c[key].mean(), mat_df_c[key].std()) stats.loc['nTotal', 'python'] = '{}'.format(len(py_df_c)) stats.loc['nTotal', 'matlab'] = '{}'.format(len(mat_df_c)) print(tabulate(stats, headers='keys', tablefmt='psql', stralign='right'))
def _configure(self): logger.debug(' ----- configure ' + self.__class__.__name__) config = tmp_smart_cast_config(self) print('detector config = {}'.format(ub.repr2(config, nl=2))) self.detector = ctalgo.GMMForegroundObjectDetector(**config) self._base_configure()
def demo(config=None): """ Runs the algorithm end-to-end. """ # dataset = 'test' # dataset = 'haul83' if config is None: import argparse parser = argparse.ArgumentParser(description='Standalone camtrawl demo') parser.add_argument('--cal', help='path to matlab or numpy stereo calibration file', default='cal.npz') parser.add_argument('--left', help='path to directory containing left images', default='left') parser.add_argument('--right', help='path to directory containing right images', default='right') parser.add_argument('--out', help='output directory', default='./out') parser.add_argument('-f', '--overwrite', action='store_true', help='will delete any existing output') parser.add_argument('--draw', action='store_true', help='draw visualization of algorithm steps') parser.add_argument('--dataset', default=None, help='Developer convenience assumes you have demo ' ' data downloaded and available. If you dont ' ' specify the other args.') args = parser.parse_args() config = args.__dict__.copy() config = FrozenKeyDict(config) if config['dataset'] is not None: img_path1, img_path2, cal_fpath = demodata_input(dataset=config['dataset']) config['left'] = img_path1 config['right'] = img_path2 config['cal'] = cal_fpath img_path1, img_path2, cal_fpath = ub.take(config, [ 'left', 'right', 'cal']) out_dpath = config['out'] logging.info('Demo Config = {!r}'.format(config)) ub.ensuredir(out_dpath) # ---- # Choose parameter configurations # ---- # Use GMM based model gmm_params = { } triangulate_params = { } DRAWING = config['draw'] # ---- # Initialize algorithms # ---- detector1 = ctalgo.GMMForegroundObjectDetector(**gmm_params) detector2 = ctalgo.GMMForegroundObjectDetector(**gmm_params) triangulator = ctalgo.FishStereoMeasurments(**triangulate_params) try: import pyfiglet print(pyfiglet.figlet_format('CAMTRAWL', font='cybermedium')) except ImportError: logging.debug('pyfiglet is not installed') print('========') print('CAMTRAWL') print('========') logging.info('Detector1 Config: ' + ub.repr2(detector1.config, nl=1)) logging.info('Detector2 Config: ' + ub.repr2(detector2.config, nl=1)) logging.info('Triangulate Config: ' + ub.repr2(triangulator.config, nl=1)) logging.info('DRAWING = {!r}'.format(DRAWING)) cal = ctalgo.StereoCalibration.from_file(cal_fpath) stream = StereoFrameStream(img_path1, img_path2) stream.preload() # HACK IN A BEGIN FRAME if len(stream) > 2200: stream.seek(2200) # ---- # Run the algorithm # ---- # n_frames = 2000 # stream.aligned_frameids = stream.aligned_frameids[:stream.index] measure_fpath = join(out_dpath, 'measurements.csv') if exists(measure_fpath): if config['overwrite']: ub.delete(measure_fpath) else: raise IOError('Measurement path already exists') output_file = open(measure_fpath, 'a') if DRAWING: drawing_dpath = join(out_dpath, 'visual') if exists(drawing_dpath): if config['overwrite']: ub.delete(drawing_dpath) else: raise IOError('Output path already exists') ub.ensuredir(drawing_dpath) headers = ['current_frame', 'fishlen', 'range', 'error', 'dz', 'box_pts1', 'box_pts2'] output_file.write(','.join(headers) + '\n') output_file.flush() measurements = [] logger.info('begin camtrawl iteration') import tqdm # prog = ub.ProgIter(iter(stream), total=len(stream), desc='camtrawl demo', # clearline=False, freq=1, adjust=False) prog = tqdm.tqdm(iter(stream), total=len(stream), desc='camtrawl demo', leave=True) def csv_repr(d): if isinstance(d, np.ndarray): d = d.tolist() s = repr(d) return s.replace('\n', '').replace(',', ';').replace(' ', '') for frame_num, (frame_id, img1, 
img2) in enumerate(prog): logger.debug('frame_num = {!r}'.format(frame_num)) detections1 = list(detector1.detect(img1)) detections2 = list(detector2.detect(img2)) masks1 = detector1._masks masks2 = detector2._masks any_detected = len(detections1) > 0 or len(detections2) > 0 if any_detected: assignment, assign_data, cand_errors = triangulator.find_matches( cal, detections1, detections2) # Append assignments to the measurements for data in assign_data: data['current_frame'] = int(frame_id) measurements.append(data) line = ','.join([csv_repr(d) for d in ub.take(data, headers)]) output_file.write(line + '\n') output_file.flush() else: cand_errors = None assignment, assign_data = None, None if DRAWING >= 2 or (DRAWING and any_detected): DRAWING = 3 stacked = DrawHelper.draw_stereo_detections(img1, detections1, masks1, img2, detections2, masks2, assignment, assign_data, cand_errors) if cv2.__version__.startswith('2'): cv2.putText(stacked, text='frame #{}, id={}'.format(frame_num, frame_id), org=(10, 50), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(255, 0, 0), thickness=2, lineType=cv2.cv.CV_AA) else: stacked = cv2.putText(stacked, text='frame #{}, id={}'.format(frame_num, frame_id), org=(10, 50), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(255, 0, 0), thickness=2, lineType=cv2.LINE_AA) cv2.imwrite(drawing_dpath + '/mask{}_draw.png'.format(frame_id), stacked) output_file.close() n_total = len(measurements) logger.info('n_total = {!r}'.format(n_total)) if n_total: all_errors = np.array([d['error'] for d in measurements]) all_lengths = np.array([d['fishlen'] for d in measurements]) logger.info('ave_error = {:.2f} +- {:.2f}'.format(all_errors.mean(), all_errors.std())) logger.info('ave_lengths = {:.2f} +- {:.2f} '.format(all_lengths.mean(), all_lengths.std())) return measurements
def _coerce_datasets(config):
    import netharn as nh
    import ndsampler
    import numpy as np
    import torch  # used below for Subset / DataLoader
    from torchvision import transforms
    coco_datasets = nh.api.Datasets.coerce(config)
    print('coco_datasets = {}'.format(ub.repr2(coco_datasets, nl=1)))
    for tag, dset in coco_datasets.items():
        dset._build_hashid(hash_pixels=False)

    workdir = ub.ensuredir(ub.expandpath(config['workdir']))
    samplers = {
        tag: ndsampler.CocoSampler(dset, workdir=workdir,
                                   backend=config['sampler_backend'])
        for tag, dset in coco_datasets.items()
    }

    for tag, sampler in ub.ProgIter(list(samplers.items()), desc='prepare frames'):
        sampler.frames.prepare(workers=config['workers'])

    # TODO: basic ndsampler torch dataset, likely has to support the transforms
    # API, bleh.
    transform = transforms.Compose([
        transforms.Resize(config['input_dims']),
        transforms.CenterCrop(config['input_dims']),
        transforms.ToTensor(),
        transforms.Lambda(lambda x: x.mul(255))
    ])

    torch_datasets = {
        key: SamplerDataset(
            sampler, transform=transform,
            # input_dims=config['input_dims'],
            # augmenter=config['augmenter'] if key == 'train' else None,
        )
        for key, sampler in samplers.items()
    }
    # self = torch_dset = torch_datasets['train']

    if config['normalize_inputs']:
        # Get stats on the dataset (todo: turn off augmentation for this)
        import kwarray
        _dset = torch_datasets['train']
        stats_idxs = kwarray.shuffle(np.arange(len(_dset)), rng=0)[0:min(1000, len(_dset))]
        stats_subset = torch.utils.data.Subset(_dset, stats_idxs)

        cacher = ub.Cacher('dset_mean', cfgstr=_dset.input_id + 'v3')
        input_stats = cacher.tryload()

        from netharn.data.channel_spec import ChannelSpec
        channels = ChannelSpec.coerce(config['channels'])

        if input_stats is None:
            # Use parallel workers to load data faster
            from netharn.data.data_containers import container_collate
            from functools import partial
            collate_fn = partial(container_collate, num_devices=1)

            loader = torch.utils.data.DataLoader(
                stats_subset,
                collate_fn=collate_fn,
                num_workers=config['workers'],
                shuffle=True,
                batch_size=config['batch_size'])

            # Track moving average of each fused channel stream
            channel_stats = {key: nh.util.RunningStats()
                             for key in channels.keys()}
            assert len(channel_stats) == 1, (
                'only support one fused stream for now')
            for batch in ub.ProgIter(loader, desc='estimate mean/std'):
                if isinstance(batch, (tuple, list)):
                    inputs = {'rgb': batch[0]}  # make assumption
                else:
                    inputs = batch['inputs']

                for key, val in inputs.items():
                    try:
                        for part in val.numpy():
                            channel_stats[key].update(part)
                    except ValueError:  # final batch broadcast error
                        pass

            perchan_input_stats = {}
            for key, running in channel_stats.items():
                perchan_stats = running.simple(axis=(1, 2))
                perchan_input_stats[key] = {
                    'mean': perchan_stats['mean'].round(3),
                    'std': perchan_stats['std'].round(3),
                }

            input_stats = ub.peek(perchan_input_stats.values())
            cacher.save(input_stats)
    else:
        input_stats = {}

    torch_loaders = {
        tag: dset.make_loader(
            batch_size=config['batch_size'],
            num_batches=config['num_batches'],
            num_workers=config['workers'],
            shuffle=(tag == 'train'),
            balance=(config['balance'] if tag == 'train' else None),
            pin_memory=True)
        for tag, dset in torch_datasets.items()
    }

    dataset_info = {
        'torch_datasets': torch_datasets,
        'torch_loaders': torch_loaders,
        'input_stats': input_stats
    }
    return dataset_info
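# --- illustrative sketch (not part of the original module) ---
# The RunningStats loop above estimates a per-channel mean/std over a sample
# of training images; that is what lands in ``input_stats`` and is later used
# to normalize network inputs. A minimal numpy-only equivalent of the same
# statistic (names here are made up for illustration):
import numpy as np

def _estimate_channel_stats(images):
    """ images: iterable of (C, H, W) float arrays """
    total = None      # running per-channel sum of pixel values
    total_sq = None   # running per-channel sum of squared pixel values
    count = 0         # number of pixels seen per channel
    for img in images:
        chan_sum = img.sum(axis=(1, 2))
        chan_sq = (img ** 2).sum(axis=(1, 2))
        count += img.shape[1] * img.shape[2]
        total = chan_sum if total is None else total + chan_sum
        total_sq = chan_sq if total_sq is None else total_sq + chan_sq
    mean = total / count
    std = np.sqrt(total_sq / count - mean ** 2)
    return {'mean': mean.round(3), 'std': std.round(3)}

# e.g. on random data (a (10, 3, 32, 32) array iterates as ten (3, 32, 32) images):
#     _estimate_channel_stats(np.random.rand(10, 3, 32, 32))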
def detect_cli(config={}):
    """
    CommandLine:
        python -m bioharn.detect_predict --help

    CommandLine:
        python -m bioharn.detect_predict \
            --dataset=~/data/noaa/Habcam_2015_g027250_a00102917_c0001_v2_test.mscoco.json \
            --deployed=/home/joncrall/work/bioharn/fit/runs/bioharn-det-v11-test-cascade/myovdqvi/deploy_MM_CascadeRCNN_myovdqvi_035_MVKVVR.zip \
            --out_dpath=~/work/bioharn/habcam_test_out \
            --draw=100 \
            --input_dims=512,512 \
            --xpu=0 --batch_size=1

    Ignore:
        >>> config = {}
        >>> config['dataset'] = '~/data/noaa/Habcam_2015_g027250_a00102917_c0001_v2_vali.mscoco.json'
        >>> config['deployed'] = '/home/joncrall/work/bioharn/fit/runs/bioharn-det-v11-test-cascade/myovdqvi/deploy_MM_CascadeRCNN_myovdqvi_035_MVKVVR.zip'
        >>> config['out_dpath'] = 'out'
    """
    import kwarray
    import kwimage
    import ndsampler
    from os.path import basename, join, exists, isfile, isdir  # NOQA

    config = DetectPredictCLIConfig(config, cmdline=True)
    print('config = {}'.format(ub.repr2(config.asdict())))

    out_dpath = ub.expandpath(config.get('out_dpath'))

    import six
    if isinstance(config['dataset'], six.string_types):
        if config['dataset'].endswith('.json'):
            dataset_fpath = ub.expandpath(config['dataset'])
            coco_dset = ndsampler.CocoDataset(dataset_fpath)
            # Running prediction is much faster if you can build a sampler.
            sampler_backend = {
                'type': 'cog',
                'config': {
                    'compress': 'JPEG',
                },
                '_hack_old_names': False,  # flip to true to use legacy caches
            }
            sampler_backend = None
            print('coco hashid = {}'.format(coco_dset._build_hashid()))
        else:
            sampler_backend = None
            if exists(config['dataset']) and isfile(config['dataset']):
                # Single image case
                image_fpath = ub.expandpath(config['dataset'])
                coco_dset = ndsampler.CocoDataset()
                coco_dset.add_image(image_fpath)
    elif isinstance(config['dataset'], list):
        # Multiple image case
        gpaths = config['dataset']
        gpaths = [ub.expandpath(g) for g in gpaths]
        coco_dset = ndsampler.CocoDataset()
        for gpath in gpaths:
            coco_dset.add_image(gpath)
    else:
        raise TypeError(config['dataset'])

    draw = config.get('draw')
    workdir = ub.expandpath(config.get('workdir'))

    det_outdir = ub.ensuredir((out_dpath, 'pred'))

    pred_config = ub.dict_subset(config, DetectPredictConfig.default)

    print('Create sampler')
    sampler = ndsampler.CocoSampler(coco_dset, workdir=workdir,
                                    backend=sampler_backend)
    print('prepare frames')
    sampler.frames.prepare(workers=config['workers'])

    print('Create predictor')
    predictor = DetectPredictor(pred_config)
    print('Ensure model')
    predictor._ensure_model()

    pred_dataset = coco_dset.dataset.copy()
    pred_dataset['annotations'] = []
    pred_dset = ndsampler.CocoDataset(pred_dataset)

    # self = predictor
    predictor.config['verbose'] = 1
    pred_gen = predictor.predict_sampler(sampler)
    buffered_gen = AsyncBufferedGenerator(pred_gen, size=coco_dset.n_images)

    gid_to_pred = {}
    prog = ub.ProgIter(buffered_gen, total=coco_dset.n_images,
                       desc='buffered detect')
    for img_idx, (gid, dets) in enumerate(prog):
        gid_to_pred[gid] = dets

        for ann in dets.to_coco():
            ann['image_id'] = gid
            try:
                catname = ann['category_name']
                ann['category_id'] = pred_dset._resolve_to_cid(catname)
            except KeyError:
                if 'category_id' not in ann:
                    cid = pred_dset.add_category(catname)
                    ann['category_id'] = cid
            pred_dset.add_annotation(**ann)

        single_img_coco = pred_dset.subset([gid])
        single_pred_dpath = ub.ensuredir((det_outdir, 'single_image'))
        single_pred_fpath = join(single_pred_dpath,
                                 'detections_gid_{:08d}.mscoco.json'.format(gid))
        single_img_coco.dump(single_pred_fpath, newlines=True)

        if draw is True or (draw and img_idx < draw):
            draw_outdir = ub.ensuredir((out_dpath, 'draw'))
            img_fpath = coco_dset.load_image_fpath(gid)
            gname = basename(img_fpath)
            viz_fname = ub.augpath(gname, prefix='detect_', ext='.jpg')
            viz_fpath = join(draw_outdir, viz_fname)

            image = kwimage.imread(img_fpath)

            flags = dets.scores > .2
            flags[kwarray.argmaxima(dets.scores, num=10)] = True
            top_dets = dets.compress(flags)
            toshow = top_dets.draw_on(image, alpha=None)
            # kwplot.imshow(toshow)
            kwimage.imwrite(viz_fpath, toshow, space='rgb')

    pred_fpath = join(det_outdir, 'detections.mscoco.json')
    print('Dump detections to pred_fpath = {!r}'.format(pred_fpath))
    pred_dset.dump(pred_fpath, newlines=True)
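# --- usage sketch (illustrative only, not from the original source) ---
# detect_cli can also be called programmatically with a dict, mirroring the
# "Ignore" block in the docstring above. The paths are the ones the author
# used and would need to be swapped for a local coco json and deployed model
# zip; since cmdline=True is passed, real CLI flags appear to still be able to
# override these values.
if __name__ == '__main__':
    detect_cli({
        'dataset': '~/data/noaa/Habcam_2015_g027250_a00102917_c0001_v2_vali.mscoco.json',
        'deployed': '/home/joncrall/work/bioharn/fit/runs/bioharn-det-v11-test-cascade/myovdqvi/deploy_MM_CascadeRCNN_myovdqvi_035_MVKVVR.zip',
        'out_dpath': 'out',
        'draw': 10,   # draw visualizations for the first 10 images
    })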
        dpath = expandvars(expanduser(dpath))
        # try:
        #     os.makedirs(dpath, exist_ok=True)
        # except Exception:
        #     ub.ensuredir(dpath)
        if not os.path.exists(dpath):
            os.makedirs(dpath)
        dpath_to_url[dpath].append(url)
    return dpath_to_url


def update_urls():
    global PROJECT_URLS
    global PROJECT_REPOS
    for dpath, urls in _parse_custom_urls().items():
        print('urls = {!r}'.format(urls))
        repos_urls, repos = repo_list(urls, dpath)
        PROJECT_URLS += repos_urls
        PROJECT_REPOS += repos


update_urls()
# print('PROJECT_URLS = {!r}'.format(PROJECT_URLS))
try:
    print('PROJECT_REPOS = {}'.format(ub.repr2(PROJECT_REPOS)))
except NameError:
    pass
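# --- illustrative sketch (assumption, not from the original source) ---
# Judging from how update_urls() consumes it, _parse_custom_urls() appears to
# return a mapping from a local checkout directory to the repo URLs that
# should be cloned there; the entries below are made-up placeholders.
_example_dpath_to_url = {
    '/home/user/code': [
        'https://github.com/example/repo-one.git',
        'https://github.com/example/repo-two.git',
    ],
}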