def report_partitioning_statistics(new_reduced_joint):
    # compute partitioning statistics
    import vtool as vt
    vals, idxs = vt.group_indices(new_reduced_joint.values.ravel())
    #groupsize = list(map(len, idxs))
    #groupassigns = ut.unflat_vecmap(new_reduced_joint.assignment, idxs)
    all_states = new_reduced_joint._row_labels(asindex=True)
    clusterstats = [tuple(sorted(list(ut.dict_hist(a).values())))
                    for a in all_states]
    grouped_vals = ut.group_items(new_reduced_joint.values.ravel(),
                                  clusterstats)
    #probs_assigned_to_clustertype = [(
    #    sorted(np.unique(np.array(b).round(decimals=5)).tolist())[::-1], a)
    #    for a, b in grouped_vals.items()]
    probs_assigned_to_clustertype = [(
        ut.dict_hist(np.array(b).round(decimals=5)), a)
        for a, b in grouped_vals.items()]
    sortx = ut.argsort([max(c[0].keys())
                        for c in probs_assigned_to_clustertype])
    probs_assigned_to_clustertype = ut.take(probs_assigned_to_clustertype, sortx)

    # This is a list of 2-tuples. The first item holds the unique
    # probabilities assigned to a cluster type along with the number of
    # times each was assigned. The cluster type is the second item. Every
    # number represents how many annotations were assigned to a specific
    # label. The length of that list is the total number of labels. For
    # all low scores you will see [[{somenum: 1}, {0: 800}], [1, 1, 1, ... 1]]
    # indicating that the assignment of everyone to a different label happened
    # once with probability somenum and 800 times with probability 0.
    #print(sorted([(b, a) for a, b in ut.map_dict_vals(sum, x)]).items())
    #z = sorted([(b, a) for a, b in ut.map_dict_vals(sum, grouped_vals).items()])
    print(ut.repr2(probs_assigned_to_clustertype, nl=2, precision=2, sorted_=True))
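A minimal sketch of the grouping idea above, using collections.Counter as a stand-in
for ut.dict_hist: each assignment state is summarized by the sorted sizes of its label
groups (its "cluster type"), and the joint probabilities are then histogrammed per
type. The toy states and probabilities here are invented for illustration.

    from collections import Counter, defaultdict

    # toy assignment states (one label per annotation) and their joint probabilities
    states = [(0, 0, 1), (0, 1, 0), (0, 1, 2), (1, 1, 1)]
    probs = [0.4, 0.4, 0.1, 0.1]

    grouped = defaultdict(list)
    for state, p in zip(states, probs):
        clustertype = tuple(sorted(Counter(state).values()))  # e.g. (1, 2)
        grouped[clustertype].append(p)

    for clustertype, ps in grouped.items():
        print(clustertype, Counter(ps))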
def find_needsmove_to_other(self, other):
    hash1 = self.get_prop('md5_stride')
    hash2 = other.get_prop('md5_stride')
    idxs1 = list(range(len(hash1)))
    hash_to_idxs = ut.group_items(idxs1, hash1)
    # Find what we have that other doesn't have and move it there
    other_missing = set(hash1).difference(hash2)
    missing_idxs1 = ut.flatten(ut.take(hash_to_idxs, other_missing))
    data = ut.ColumnLists({
        'idx': missing_idxs1,
        'fname': self.get_prop('fname', missing_idxs1),
        'dname': self.get_prop('dname', missing_idxs1),
        'full_path': self.get_prop('full_path', missing_idxs1),
        'nbytes': self.get_prop('nbytes', missing_idxs1),
    })
    data = data.compress([f != 'Thumbs.db' for f in data['fname']])
    data['ext'] = self.get_prop('ext', data['idx'])
    ut.dict_hist(data['ext'])
    data.print(ignore=['full_path', 'dname'])
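A small sketch of the hash-diff pattern used above, with plain dicts standing in for
ut.group_items / ut.take; the file names and hashes are invented for illustration.

    from collections import defaultdict

    local_hashes = ['aaa', 'bbb', 'bbb', 'ccc']
    local_names = ['a.jpg', 'b1.jpg', 'b2.jpg', 'c.jpg']
    remote_hashes = {'aaa', 'ccc'}

    # group local indices by hash, then keep the ones the remote side lacks
    hash_to_idxs = defaultdict(list)
    for idx, h in enumerate(local_hashes):
        hash_to_idxs[h].append(idx)

    missing = set(local_hashes) - remote_hashes
    missing_idxs = [idx for h in missing for idx in hash_to_idxs[h]]
    print([local_names[idx] for idx in missing_idxs])  # files to move over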
def inference_stats(infr_list_):
    relabel_stats = []
    for infr in infr_list_:
        num_ccs, num_inconsistent = infr.relabel_using_reviews()
        state_hist = ut.dict_hist(
            nx.get_edge_attributes(infr.graph, 'decision').values())
        if POSTV not in state_hist:
            state_hist[POSTV] = 0
        hist = ut.dict_hist(
            nx.get_edge_attributes(infr.graph, '_speed_split').values())

        subgraphs = infr.positive_connected_compoments()
        subgraph_sizes = [len(g) for g in subgraphs]

        info = ut.odict([
            ('num_nonmatch_edges', state_hist[NEGTV]),
            ('num_match_edges', state_hist[POSTV]),
            ('frac_nonmatch_edges',
             state_hist[NEGTV] / (state_hist[POSTV] + state_hist[NEGTV])),
            ('num_inconsistent', num_inconsistent),
            ('num_ccs', num_ccs),
            ('edges_flipped', hist.get('flip', 0)),
            ('edges_unchanged', hist.get('orig', 0)),
            ('bad_unreviewed_edges', hist.get('new', 0)),
            ('orig_size', len(infr.graph)),
            ('new_sizes', subgraph_sizes),
        ])
        relabel_stats.append(info)
    return relabel_stats
def _print_previous_loop_statistics(infr, count):
    # Print stats about what happened in this loop
    history = infr.metrics_list[-count:]
    recover_blocks = ut.group_items([
        (k, sum(1 for i in g))
        for k, g in it.groupby(ut.take_column(history, 'recovering'))
    ]).get(True, [])
    infr.print((
        'Recovery mode entered {} times, '
        'made {} recovery decisions.').format(
            len(recover_blocks), sum(recover_blocks)), color='green')
    testaction_hist = ut.dict_hist(ut.take_column(history, 'test_action'))
    infr.print(
        'Test Action Histogram: {}'.format(
            ut.repr4(testaction_hist, si=True)), color='yellow')
    if infr.params['inference.enabled']:
        action_hist = ut.dict_hist(
            ut.emap(frozenset, ut.take_column(history, 'action')))
        infr.print(
            'Inference Action Histogram: {}'.format(
                ub.repr2(action_hist, si=True)), color='yellow')
    infr.print(
        'Decision Histogram: {}'.format(ut.repr2(ut.dict_hist(
            ut.take_column(history, 'pred_decision')
        ), si=True)), color='yellow')
    infr.print(
        'User Histogram: {}'.format(ut.repr2(ut.dict_hist(
            ut.take_column(history, 'user_id')
        ), si=True)), color='yellow')
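The recover_blocks computation above relies on itertools.groupby collapsing consecutive
equal values into runs. A minimal sketch of that pattern on made-up data, without the
utool wrappers:

    import itertools as it

    recovering = [False, True, True, False, True, True, True, False]
    # one (value, run_length) pair per consecutive run
    runs = [(k, sum(1 for _ in g)) for k, g in it.groupby(recovering)]
    recover_blocks = [n for k, n in runs if k]
    print(len(recover_blocks), sum(recover_blocks))  # entered twice, 5 decisions total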
def sub(self, other):
    """
    CommandLine:
        python -m mtgmonte.mtgobjs --exec-ManaSet.sub:0
        python -m mtgmonte.mtgobjs --exec-ManaSet.sub:1

    Example:
        >>> # ENABLE_DOCTEST
        >>> from mtgmonte.mtgobjs import *
        >>> from mtgmonte import mtgobjs
        >>> self = mtgobjs.ManaSet('RRRUC')
        >>> other = mtgobjs.ManaSet('RRU')
        >>> mana = self - other
        >>> result = ('mana = %s' % (mana,))
        >>> print(result)
        mana = {RC}

    Example:
        >>> # ENABLE_DOCTEST
        >>> from mtgmonte.mtgobjs import *  # NOQA
        >>> self = ManaSet(['WWURC'])
        >>> other = ManaCost([('W', 'colored'), ('W', 'colored'), ('U', 'colored'), ('1', 'uncolored')])
        >>> mana = self - other
        >>> result = ('mana = %s' % (mana,))
        >>> print(result)
        mana = {R}
    """
    if isinstance(other, ManaCost):
        colored_cost = other.colored.to_manaset()
        remainder1 = self.sub(colored_cost)
        color2_remain = remainder1.get_colordict()
        uncolored_need = other.num_uncolored
        # TODO: value different colors differently for payment
        if uncolored_need > 0:
            for color in list(color2_remain.keys()):
                using = min(uncolored_need, color2_remain[color])
                color2_remain[color] -= using
                uncolored_need -= using
            if uncolored_need > 0:
                raise NotEnoughManaError('Cannot subtract more mana from less')
        # TODO: hybrid / phyrexian
    else:
        color2_need = ut.dict_hist(other._manas)
        color2_remain = ut.ddict(lambda: 0, ut.dict_hist(self._manas))
        for color, num_need in color2_need.items():
            num_have = color2_remain[color]
            if num_have < num_need:
                raise NotEnoughManaError('Cannot subtract more mana from less')
            color2_remain[color] -= num_need
    color2_remain = delete_dict_zeros(color2_remain)
    remainder = ManaSet(color2_remain)
    return remainder
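A simplified sketch of the colored-mana branch above, using collections.Counter
instead of the ManaSet/ManaCost classes; the symbols and the error type are
placeholders and the uncolored/hybrid handling is left out.

    from collections import Counter

    def sub_mana(have, need):
        have, need = Counter(have), Counter(need)
        for color, num_need in need.items():
            if have[color] < num_need:
                raise ValueError('Cannot subtract more mana from less')
            have[color] -= num_need
        # drop zero entries, mirroring delete_dict_zeros
        return Counter({c: n for c, n in have.items() if n > 0})

    print(sub_mana('RRRUC', 'RRU'))  # Counter({'R': 1, 'C': 1})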
def author_hist():
    #print(all_authors)
    hist_ = ut.dict_hist(all_authors, ordered=True)
    hist_[''] = None
    del hist_['']
    print('Author histogram')
    print(ut.dict_str(hist_)[-1000:])
def cheetah_stats(ibs):
    filters = [
        dict(view=['right', 'frontright', 'backright'], minqual='good'),
        dict(view=['right', 'frontright', 'backright']),
    ]
    for filtkw in filters:
        annots = ibs.annots(ibs.filter_annots_general(**filtkw))
        unique_nids, grouped_annots = annots.group(annots.nids)
        annots_per_name = ut.lmap(len, grouped_annots)
        annots_per_name_freq = ut.dict_hist(annots_per_name)

        def bin_mapper(num):
            if num < 5:
                return (num, num + 1)
            else:
                for bin, mod in [(20, 5), (50, 10)]:
                    if num < bin:
                        low = (num // mod) * mod
                        high = low + mod
                        return (low, high)
                if num >= bin:
                    return (bin, None)
                else:
                    assert False, str(num)

        hist = ut.ddict(lambda: 0)
        for num in annots_per_name:
            hist[bin_mapper(num)] += 1
        hist = ut.sort_dict(hist)

        print('------------')
        print('filters = %s' % ut.repr4(filtkw))
        print('num_annots = %r' % (len(annots)))
        print('num_names = %r' % (len(unique_nids)))
        print('annots_per_name_freq = %s' % (ut.repr4(annots_per_name_freq)))
        print('annots_per_name_freq (ranges) = %s' % (ut.repr4(hist)))
        assert sum(hist.values()) == len(unique_nids)
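The bin_mapper above buckets per-name annotation counts: exact bins below 5, 5-wide
bins up to 20, 10-wide bins up to 50, and a single bin for everything beyond. A small
demonstration of that bucketing on made-up counts (Counter stands in for the ddict
accumulation):

    from collections import Counter

    def bin_of(num):
        if num < 5:
            return (num, num + 1)
        for upper, width in [(20, 5), (50, 10)]:
            if num < upper:
                low = (num // width) * width
                return (low, low + width)
        return (50, None)  # everything at or above the last boundary

    annots_per_name = [1, 1, 3, 7, 7, 12, 23, 61]
    print(Counter(bin_of(n) for n in annots_per_name))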
def inspect_deck(deck):
    def get_card_tags(card, deck):
        tags = []
        stats = card.mana_source_stats(deck)
        if stats is not None:
            tags.append("land")
            if len(stats[1]) > 0:
                tags.append("tapland")
            else:
                tags.append("untapland")
        return tags

    # ------------
    print("len(deck) = %r" % (len(deck),))
    tags_list = [get_card_tags(card, deck) for card in deck.card_list]
    print("Deck Counts:")
    print(ut.repr2(ut.dict_hist(ut.flatten(tags_list)), nl=True))

    hand = deck.sample_hand()
    manastats_list = [card.mana_source_stats(deck) for card in hand]
    print(ut.list_str([card.name + ": " + text_type(stats)
                       for card, stats in zip(hand, manastats_list)]))
    tags_list = [get_card_tags(card, deck) for card in hand]
    print("Hand Counts")
    print(ut.repr2(ut.dict_hist(ut.flatten(tags_list)), nl=True))

    valid_tags = ["land", "tapland", "untapland"]
    x = {tag: [] for tag in valid_tags}
    for _ in range(500):
        hand = deck.sample_hand()
        tags_list = [get_card_tags(card, deck) for card in hand]
        taghist = ut.dict_hist(ut.flatten(tags_list))
        for key, val in x.items():
            val.append(taghist.get(key, 0))

    print("Monte Stats:")
    for key, val in list(x.items()):
        print("%15s: %s" % (key, ut.repr2(ut.get_stats(val), precision=2)))

    def hand_stats():
        # [card.types for card in hand]
        # [card.rrr() for card in hand]
        [card.mana_source_stats(deck) for card in hand]
        card.types
def tag_coocurrence(tags_list):
    import utool as ut
    co_occur_list = []
    for tags in tags_list:
        for combo in ut.combinations(tags, 2):
            key = tuple(sorted(combo))
            co_occur_list.append(key)
    co_occur = ut.dict_hist(co_occur_list, ordered=True)
    # co_occur[key] += 1
    #co_occur = ut.odict(co_occur)
    return co_occur
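The same co-occurrence count can be written with the standard library; this sketch
uses itertools.combinations and collections.Counter in place of the utool helpers,
on invented tag lists:

    import itertools
    from collections import Counter

    tags_list = [['left', 'cropped'], ['left', 'blurry', 'cropped'], ['right']]
    pairs = (tuple(sorted(combo))
             for tags in tags_list
             for combo in itertools.combinations(tags, 2))
    print(Counter(pairs))  # ('cropped', 'left') appears twice, the rest once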
def glossterms():
    re_glossterm = ut.named_field('glossterm', '.' + ut.REGEX_NONGREEDY)
    pat = r'\\glossterm{' + re_glossterm + '}'
    tup = ut.grep(pat, fpath_list=testdata_fpaths(), verbose=True)
    found_fpath_list, found_lines_list, found_lxs_list = tup
    glossterm_list = []
    for line in ut.flatten(found_lines_list):
        match = re.search(pat, line)
        glossterm = match.groupdict()['glossterm']
        glossterm_list.append(glossterm)
    print('Glossary Terms: ')
    print(ut.repr2(ut.dict_hist(glossterm_list), nl=True, strvals=True))
def _sync_filter_only_multiple_sightings(ibs, aid_list):
    r"""
    Returns:
        filtered_aids (list): the subset of aid_list such that every annot
            has a name and each name appears at least 2x.
    """
    name_list = ibs._sync_get_names(aid_list)
    name_hist = ut.dict_hist(name_list)
    # materialize the pairs so they can be iterated twice (zip is one-shot in py3)
    aid_names = list(zip(aid_list, name_list))
    filtered_aids = [aid for (aid, name) in aid_names if name_hist[name] > 1]
    filtered_aid_names = [
        name for (aid, name) in aid_names if name_hist[name] > 1
    ]
    return filtered_aids, filtered_aid_names
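A tiny usage sketch of the filter above, with a plain Counter standing in for
ut.dict_hist and made-up aids/names:

    from collections import Counter

    aid_list = [1, 2, 3, 4, 5]
    name_list = ['zebra_a', 'zebra_b', 'zebra_a', 'zebra_c', 'zebra_a']
    name_hist = Counter(name_list)
    keep = [(aid, name) for aid, name in zip(aid_list, name_list)
            if name_hist[name] > 1]
    print(keep)  # only zebra_a sightings survive: [(1, 'zebra_a'), (3, 'zebra_a'), (5, 'zebra_a')]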
def graph_info(graph, verbose=False):
    import utool as ut
    node_attrs = list(graph.node.values())
    edge_attrs = list(ut.take_column(graph.edges(data=True), 2))
    node_attr_hist = ut.dict_hist(ut.flatten([attr.keys() for attr in node_attrs]))
    edge_attr_hist = ut.dict_hist(ut.flatten([attr.keys() for attr in edge_attrs]))
    node_type_hist = ut.dict_hist(list(map(type, graph.nodes())))
    info_dict = ut.odict([
        ('directed', graph.is_directed()),
        ('multi', graph.is_multigraph()),
        ('num_nodes', len(graph)),
        ('num_edges', len(list(graph.edges()))),
        ('edge_attr_hist', ut.sort_dict(edge_attr_hist)),
        ('node_attr_hist', ut.sort_dict(node_attr_hist)),
        ('node_type_hist', ut.sort_dict(node_type_hist)),
        ('graph_attrs', graph.graph),
        ('graph_name', graph.name),
    ])
    #unique_attrs = ut.map_dict_vals(ut.unique, ut.dict_accum(*node_attrs))
    #ut.dict_isect_combine(*node_attrs))
    #[list(attrs.keys())]
    if verbose:
        print(ut.repr3(info_dict))
    return info_dict
def check_baseline_results(sim):
    import networkx as nx
    infr = sim.infr
    n_names_possible = 0
    real_groups = ut.group_pairs(infr.gen_node_attrs('orig_name_label'))
    possible_clusters = []
    for nid, nodes in real_groups.items():
        if len(nodes) == 1:
            possible_clusters.append(nodes)
            n_names_possible += 1
            continue
        cc_cand_edges = list(ut.nx_edges_between(infr.graph, nodes))
        cc = ut.nx_from_node_edge(nodes, cc_cand_edges)
        mst = nx.minimum_spanning_tree(cc)
        ccs = list(nx.connected_components(mst))
        possible_clusters.extend(ccs)
        n_names_possible += (len(ccs))

    sumafter = 3

    best_possible_compare_results = compare_groups(
        list(real_groups.values()), list(possible_clusters))
    possible_per_num = ut.map_vals(
        len, ut.group_items(best_possible_compare_results['common'],
                            map(len, best_possible_compare_results['common'])))
    greater = [i for i in possible_per_num.keys() if i > sumafter]
    possible_per_num['>%s' % sumafter] = sum(
        ut.take(possible_per_num, greater))
    ut.delete_keys(possible_per_num, greater)
    for k, v in possible_per_num.items():
        sim.results['possible@' + str(k)] = v
    sim.results['possible'] = len(best_possible_compare_results['common'])

    # Measure the number of real names in the test (per number of annots)
    real_per_num = ut.dict_hist(map(len, real_groups.values()))
    greater = [i for i in real_per_num.keys() if i > sumafter]
    real_per_num['>%s' % sumafter] = sum(ut.take(real_per_num, greater))
    ut.delete_keys(real_per_num, greater)
    for k, v in real_per_num.items():
        sim.results['real@' + str(k)] = v

    sim.results['n_names_possible'] = n_names_possible
    sim.results['n_names_real'] = len(real_groups)
    sim.results['real'] = len(real_groups)
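The loop above measures how finely each real name must split given the candidate
edges actually present: a spanning forest over the candidate edges within a
ground-truth group has one connected component per unavoidable piece. A minimal
networkx sketch of that idea on toy data:

    import networkx as nx

    graph = nx.Graph([(1, 2), (3, 4)])        # candidate edges that were scored
    nodes = [1, 2, 3, 4, 5]                   # one ground-truth name

    cc = nx.Graph()
    cc.add_nodes_from(nodes)
    cc.add_edges_from((u, v) for u, v in graph.edges() if u in nodes and v in nodes)
    forest = nx.minimum_spanning_tree(cc)
    pieces = list(nx.connected_components(forest))
    print(len(pieces), pieces)  # 3 pieces: {1, 2}, {3, 4}, {5}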
def random_case_set(): r""" Returns: tuple: (labels, pairwise_feats) CommandLine: python -m ibeis.algo.hots.testem random_case_set --show Example: >>> # DISABLE_DOCTEST >>> from ibeis.algo.hots.testem import * # NOQA >>> (labels, pairwise_feats) = random_case_set() >>> result = ('(labels, pairwise_feats) = %s' % (ut.repr2((labels, pairwise_feats)),)) >>> print(result) """ rng = np.random.RandomState(0) case_params = dict(num_names=5, rng=rng) num_annots = 600 test_cases = [ random_test_annot(**case_params) for _ in ut.ProgIter(range(num_annots), bs=1) ] pairxs = list(ut.product_nonsame(range(num_annots), range(num_annots))) import utool utool.embed() test_pairs = list(ut.unflat_take(test_cases, pairxs)) cases1 = ut.instancelist(ut.take_column(test_pairs, 0), check=False) cases2 = ut.instancelist(ut.take_column(test_pairs, 1), check=False) # FIXME labels = labels1 = make_test_pairwise_labels2(cases1, cases2) # NOQA #labels = np.array([make_test_pairwise_labels(case1, case2) # for case1, case2 in ut.ProgIter(test_pairs, bs=1)]) pairwise_feats_ = [ make_test_pairwise_fetaures(case1, case2, label, rng) for label, (case1, case2) in ut.ProgIter(list(zip(labels, test_pairs)), bs=1) ] pairwise_feats = np.vstack(pairwise_feats_) print(ut.dict_hist(labels)) return labels, pairwise_feats
def random_case_set(): r""" Returns: tuple: (labels, pairwise_feats) CommandLine: python -m ibeis.algo.hots.testem random_case_set --show Example: >>> # DISABLE_DOCTEST >>> from ibeis.algo.hots.testem import * # NOQA >>> (labels, pairwise_feats) = random_case_set() >>> result = ('(labels, pairwise_feats) = %s' % (ut.repr2((labels, pairwise_feats)),)) >>> print(result) """ rng = np.random.RandomState(0) case_params = dict(num_names=5, rng=rng) num_annots = 600 test_cases = [random_test_annot(**case_params) for _ in ut.ProgIter(range(num_annots), bs=1)] pairxs = list(ut.product_nonsame(range(num_annots), range(num_annots))) import utool utool.embed() test_pairs = list(ut.unflat_take(test_cases, pairxs)) cases1 = ut.make_instancelist(ut.take_column(test_pairs, 0), check=False) cases2 = ut.make_instancelist(ut.take_column(test_pairs, 1), check=False) # FIXME labels = labels1 = make_test_pairwise_labels2(cases1, cases2) # NOQA #labels = np.array([make_test_pairwise_labels(case1, case2) # for case1, case2 in ut.ProgIter(test_pairs, bs=1)]) pairwise_feats_ = [make_test_pairwise_fetaures(case1, case2, label, rng) for label, (case1, case2) in ut.ProgIter(list(zip(labels, test_pairs)), bs=1)] pairwise_feats = np.vstack(pairwise_feats_) print(ut.dict_hist(labels)) return labels, pairwise_feats
def ext_hist(self):
    return ut.dict_hist(self.attrs['ext'])
def find_consistent_labeling(grouped_oldnames):
    """
    Solves a maximum bipartite matching problem to find a consistent
    name assignment.

    Notes:
        # Install module containing the Hungarian algorithm for matching
        pip install munkres

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> grouped_oldnames = [['a', 'b'], ['b', 'c'], ['c', 'a', 'a']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> print(new_names)
        [u'b', u'c', u'a']

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> grouped_oldnames = [['a', 'b', 'c'], ['b', 'c'], ['c', 'e', 'e']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> print(new_names)
        [u'a', u'b', u'e']

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> grouped_oldnames = [['a', 'b'], ['a', 'a', 'b'], ['a']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> print(new_names)
        [u'a', u'b', u'e']
    """
    import numpy as np
    try:
        import munkres
    except ImportError:
        print('Need to install Hungarian algorithm bipartite matching solver.')
        print('Run:')
        print('pip install munkres')
        raise
    unique_old_names = ut.unique(ut.flatten(grouped_oldnames))
    num_new_names = len(grouped_oldnames)
    num_old_names = len(unique_old_names)
    extra_oldnames = []

    # Create padded dummy values.  This accounts for the case where it is
    # impossible to uniquely map to the old db
    num_extra = num_new_names - num_old_names
    if num_extra > 0:
        extra_oldnames = [
            '_extra_name%d' % (count, ) for count in range(num_extra)
        ]
    elif num_extra < 0:
        pass
    else:
        extra_oldnames = []
    assignable_names = unique_old_names + extra_oldnames

    total = len(assignable_names)

    # Allocate assignment matrix
    profit_matrix = np.zeros((total, total), dtype=np.int)
    # Populate assignment profit matrix
    oldname2_idx = ut.make_index_lookup(assignable_names)
    name_freq_list = [ut.dict_hist(names) for names in grouped_oldnames]
    for rowx, name_freq in enumerate(name_freq_list):
        for name, freq in name_freq.items():
            colx = oldname2_idx[name]
            profit_matrix[rowx, colx] += freq
    # Add extra profit for using a previously used name
    profit_matrix[profit_matrix > 0] += 2
    # Add small profit for using an extra name
    extra_colxs = ut.take(oldname2_idx, extra_oldnames)
    profit_matrix[:, extra_colxs] += 1

    # Convert to minimization problem
    big_value = (profit_matrix.max())
    cost_matrix = big_value - profit_matrix
    m = munkres.Munkres()
    indexes = m.compute(cost_matrix)

    # Map output to be aligned with input
    rx2_cx = dict(indexes)
    assignment = [assignable_names[rx2_cx[rx]] for rx in range(num_new_names)]
    return assignment
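A small worked example of the same profit-matrix idea, using
scipy.optimize.linear_sum_assignment in place of the munkres package (the later
variants in this file make exactly that swap); the input is the first doctest case:

    import numpy as np
    import scipy.optimize

    grouped_oldnames = [['a', 'b'], ['b', 'c'], ['c', 'a', 'a']]
    names = sorted(set(n for group in grouped_oldnames for n in group))
    col = {n: i for i, n in enumerate(names)}

    profit = np.zeros((len(grouped_oldnames), len(names)))
    for rowx, group in enumerate(grouped_oldnames):
        for n in group:
            profit[rowx, col[n]] += 1
    profit[profit > 0] += 2  # bonus for reusing an old name

    rows, cols = scipy.optimize.linear_sum_assignment(-profit)  # maximize profit
    print([names[c] for c in cols])  # ['b', 'c', 'a']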
def ingest_serengeti_mamal_cameratrap(species): """ Downloads data from Serengeti dryad server References: http://datadryad.org/resource/doi:10.5061/dryad.5pt92 Swanson AB, Kosmala M, Lintott CJ, Simpson RJ, Smith A, Packer C (2015) Snapshot Serengeti, high-frequency annotated camera trap images of 40 mammalian species in an African savanna. Scientific Data 2: 150026. http://dx.doi.org/10.1038/sdata.2015.26 Swanson AB, Kosmala M, Lintott CJ, Simpson RJ, Smith A, Packer C (2015) Data from: Snapshot Serengeti, high-frequency annotated camera trap images of 40 mammalian species in an African savanna. Dryad Digital Repository. http://dx.doi.org/10.5061/dryad.5pt92 Args: species (?): CommandLine: python -m ibeis.dbio.ingest_database --test-ingest_serengeti_mamal_cameratrap --species zebra_plains python -m ibeis.dbio.ingest_database --test-ingest_serengeti_mamal_cameratrap --species cheetah Example: >>> # SCRIPT >>> from ibeis.dbio.ingest_database import * # NOQA >>> import ibeis >>> species = ut.get_argval('--species', type_=str, default=ibeis.const.TEST_SPECIES.ZEB_PLAIN) >>> # species = ut.get_argval('--species', type_=str, default='cheetah') >>> result = ingest_serengeti_mamal_cameratrap(species) >>> print(result) """ 'https://snapshotserengeti.s3.msi.umn.edu/' import ibeis if species is None: code = 'ALL' elif species == 'zebra_plains': code = 'PZ' elif species == 'cheetah': code = 'CHTH' else: raise NotImplementedError() if species == 'zebra_plains': serengeti_sepcies = 'zebra' else: serengeti_sepcies = species print('species = %r' % (species,)) print('serengeti_sepcies = %r' % (serengeti_sepcies,)) dbname = code + '_Serengeti' print('dbname = %r' % (dbname,)) dbdir = ut.ensuredir(join(ibeis.sysres.get_workdir(), dbname)) print('dbdir = %r' % (dbdir,)) image_dir = ut.ensuredir(join(dbdir, 'images')) base_url = 'http://datadryad.org/bitstream/handle/10255' all_images_url = base_url + '/dryad.86392/all_images.csv' consensus_metadata_url = base_url + '/dryad.86348/consensus_data.csv' search_effort_url = base_url + '/dryad.86347/search_effort.csv' gold_standard_url = base_url + '/dryad.76010/gold_standard_data.csv' all_images_fpath = ut.grab_file_url(all_images_url, download_dir=dbdir) consensus_metadata_fpath = ut.grab_file_url(consensus_metadata_url, download_dir=dbdir) search_effort_fpath = ut.grab_file_url(search_effort_url, download_dir=dbdir) gold_standard_fpath = ut.grab_file_url(gold_standard_url, download_dir=dbdir) print('all_images_fpath = %r' % (all_images_fpath,)) print('consensus_metadata_fpath = %r' % (consensus_metadata_fpath,)) print('search_effort_fpath = %r' % (search_effort_fpath,)) print('gold_standard_fpath = %r' % (gold_standard_fpath,)) def read_csv(csv_fpath): import utool as ut csv_text = ut.read_from(csv_fpath) csv_lines = csv_text.split('\n') print(ut.list_str(csv_lines[0:2])) csv_data = [[field.strip('"').strip('\r') for field in line.split(',')] for line in csv_lines if len(line) > 0] csv_header = csv_data[0] csv_data = csv_data[1:] return csv_data, csv_header def download_image_urls(image_url_info_list): # Find ones that we already have print('Requested %d downloaded images' % (len(image_url_info_list))) full_gpath_list = [join(image_dir, basename(gpath)) for gpath in image_url_info_list] exists_list = [ut.checkpath(gpath) for gpath in full_gpath_list] image_url_info_list_ = ut.compress(image_url_info_list, ut.not_list(exists_list)) print('Already have %d/%d downloaded images' % ( len(image_url_info_list) - len(image_url_info_list_), len(image_url_info_list))) 
print('Need to download %d images' % (len(image_url_info_list_))) #import sys #sys.exit(0) # Download the rest imgurl_prefix = 'https://snapshotserengeti.s3.msi.umn.edu/' image_url_list = [imgurl_prefix + suffix for suffix in image_url_info_list_] for img_url in ut.ProgressIter(image_url_list, lbl='Downloading image'): ut.grab_file_url(img_url, download_dir=image_dir) return full_gpath_list # Data contains information about which events have which animals if False: species_class_csv_data, species_class_header = read_csv(gold_standard_fpath) species_class_eventid_list = ut.get_list_column(species_class_csv_data, 0) #gold_num_species_annots_list = ut.get_list_column(gold_standard_csv_data, 2) species_class_species_list = ut.get_list_column(species_class_csv_data, 2) #gold_count_list = ut.get_list_column(gold_standard_csv_data, 3) else: species_class_csv_data, species_class_header = read_csv(consensus_metadata_fpath) species_class_eventid_list = ut.get_list_column(species_class_csv_data, 0) species_class_species_list = ut.get_list_column(species_class_csv_data, 7) # Find the zebra events serengeti_sepcies_set = sorted(list(set(species_class_species_list))) print('serengeti_sepcies_hist = %s' % ut.dict_str(ut.dict_hist(species_class_species_list), key_order_metric='val')) #print('serengeti_sepcies_set = %s' % (ut.list_str(serengeti_sepcies_set),)) assert serengeti_sepcies in serengeti_sepcies_set, 'not a known seregeti species' species_class_chosen_idx_list = ut.list_where( [serengeti_sepcies == species_ for species_ in species_class_species_list]) chosen_eventid_list = ut.take(species_class_eventid_list, species_class_chosen_idx_list) print('Number of chosen species:') print(' * len(species_class_chosen_idx_list) = %r' % (len(species_class_chosen_idx_list),)) print(' * len(chosen_eventid_list) = %r' % (len(chosen_eventid_list),)) # Read info about which events have which images images_csv_data, image_csv_header = read_csv(all_images_fpath) capture_event_id_list = ut.get_list_column(images_csv_data, 0) image_url_info_list = ut.get_list_column(images_csv_data, 1) # Group photos by eventid eventid_to_photos = ut.group_items(image_url_info_list, capture_event_id_list) # Filter to only chosens unflat_chosen_url_infos = ut.dict_take(eventid_to_photos, chosen_eventid_list) chosen_url_infos = ut.flatten(unflat_chosen_url_infos) image_url_info_list = chosen_url_infos chosen_path_list = download_image_urls(chosen_url_infos) ibs = ibeis.opendb(dbdir=dbdir, allow_newdir=True) gid_list_ = ibs.add_images(chosen_path_list, auto_localize=False) # NOQA # Attempt to automatically detect the annotations #aids_list = ibs.detect_random_forest(gid_list_, species) #aids_list #if False: # # remove non-zebra photos # from os.path import basename # base_gname_list = list(map(basename, zebra_url_infos)) # all_gname_list = ut.list_images(image_dir) # nonzebra_gname_list = ut.setdiff_ordered(all_gname_list, base_gname_list) # nonzebra_gpath_list = ut.fnames_to_fpaths(nonzebra_gname_list, image_dir) # ut.remove_fpaths(nonzebra_gpath_list) return ibs
    # (continuation of a loop over entrytypes.items())
    print('\n --- TYPE = %r' % (e.upper(), ))
    g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
    missing_cols = g.columns[np.any(pd.isnull(g), axis=0)]
    if e in ignore:
        missing_cols = missing_cols.difference(ignore[e])
    print('missing_cols = {!r}'.format(missing_cols.tolist()))
    for col in missing_cols:
        print('col = {!r}'.format(col))
        print(g[pd.isnull(g[col])].index.tolist())

for e, g in entrytypes.items():
    print('e = %r' % (e, ))
    g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
    if 'pub_full' in g.columns:
        place_title = g['pub_full'].tolist()
        print(ut.repr4(ut.dict_hist(place_title)))
    else:
        print(g)
        print('Unknown publications')

if 'report' in entrytypes:
    g = entrytypes['report']
    missing = g[pd.isnull(g['title'])]
    if len(missing):
        print('Missing Title')
        print(ut.repr4(missing[['title', 'author']].values.tolist()))

if 'journal' in entrytypes:
    g = entrytypes['journal']
    g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
def find_consistent_labeling_old(grouped_oldnames, extra_prefix='_extra_name', verbose=False): import numpy as np import scipy.optimize unique_old_names = ut.unique(ut.flatten(grouped_oldnames)) # TODO: find names that are only used once, and just ignore those for # optimization. # unique_set = set(unique_old_names) oldname_sets = list(map(set, grouped_oldnames)) usage_hist = ut.dict_hist(ut.flatten(oldname_sets)) conflicts = {k for k, v in usage_hist.items() if v > 1} # nonconflicts = {k for k, v in usage_hist.items() if v == 1} conflict_groups = [] orig_idxs = [] assignment = [None] * len(grouped_oldnames) ntrivial = 0 for idx, group in enumerate(grouped_oldnames): if set(group).intersection(conflicts): orig_idxs.append(idx) conflict_groups.append(group) else: ntrivial += 1 if len(group) > 0: h = ut.dict_hist(group) hitems = list(h.items()) hvals = [i[1] for i in hitems] maxval = max(hvals) g = min([k for k, v in hitems if v == maxval]) assignment[idx] = g else: assignment[idx] = None if verbose: print('rectify %d non-trivial groups' % (len(conflict_groups), )) print('rectify %d trivial groups' % (ntrivial, )) num_extra = 0 if len(conflict_groups) > 0: grouped_oldnames_ = conflict_groups unique_old_names = ut.unique(ut.flatten(grouped_oldnames_)) num_new_names = len(grouped_oldnames_) num_old_names = len(unique_old_names) extra_oldnames = [] # Create padded dummy values. This accounts for the case where it is # impossible to uniquely map to the old db num_extra = num_new_names - num_old_names if num_extra > 0: extra_oldnames = [ '%s%d' % ( extra_prefix, count, ) for count in range(num_extra) ] elif num_extra < 0: pass else: extra_oldnames = [] assignable_names = unique_old_names + extra_oldnames total = len(assignable_names) # Allocate assignment matrix # Start with a large negative value indicating # that you must select from your assignments only profit_matrix = -np.ones((total, total), dtype=np.int) * (2 * total) # Populate assignment profit matrix oldname2_idx = ut.make_index_lookup(assignable_names) name_freq_list = [ut.dict_hist(names) for names in grouped_oldnames_] # Initialize base profit for using a previously used name for rowx, name_freq in enumerate(name_freq_list): for name, freq in name_freq.items(): colx = oldname2_idx[name] profit_matrix[rowx, colx] = 1 # Now add in the real profit for rowx, name_freq in enumerate(name_freq_list): for name, freq in name_freq.items(): colx = oldname2_idx[name] profit_matrix[rowx, colx] += freq # Set a small profit for using an extra name extra_colxs = ut.take(oldname2_idx, extra_oldnames) profit_matrix[:, extra_colxs] = 1 # Convert to minimization problem big_value = (profit_matrix.max()) - (profit_matrix.min()) cost_matrix = big_value - profit_matrix # Don't use munkres, it is pure python and very slow. Use scipy instead indexes = list(zip(*scipy.optimize.linear_sum_assignment(cost_matrix))) # Map output to be aligned with input rx2_cx = dict(indexes) assignment_ = [ assignable_names[rx2_cx[rx]] for rx in range(num_new_names) ] # Reintegrate trivial values for idx, g in zip(orig_idxs, assignment_): assignment[idx] = g for idx, val in enumerate(assignment): if val is None: assignment[idx] = '%s%d' % ( extra_prefix, num_extra, ) num_extra += 1 return assignment
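The key optimization in find_consistent_labeling_old above is splitting groups into
"trivial" ones (their names conflict with no other group, so a per-group vote
suffices) and "conflict" ones that still need the assignment solver. A small sketch
of that split on made-up groups:

    from collections import Counter

    grouped_oldnames = [['a', 'a', 'b'], ['b', 'c'], ['d'], ['e', 'e']]
    usage = Counter(n for group in grouped_oldnames for n in set(group))
    conflicts = {n for n, cnt in usage.items() if cnt > 1}

    trivial, needs_solver = [], []
    for idx, group in enumerate(grouped_oldnames):
        if set(group) & conflicts:
            needs_solver.append(idx)
        else:
            # no contention: take the most common name in the group
            trivial.append((idx, Counter(group).most_common(1)[0][0]))
    print(trivial)        # [(2, 'd'), (3, 'e')]
    print(needs_solver)   # [0, 1] still need the Hungarian step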
def simple_munkres(part_oldnames):
    """
    Defines a munkres problem to solve name rectification.

    Notes:
        We create a matrix where each row represents a group of annotations in
        the same PCC and each column represents an original name. If there are
        more PCCs than original names the columns are padded with extra values.
        The matrix is first initialized to be negative infinity representing
        impossible assignments. Then for each column representing a padded
        name, we set its value to $1$ indicating that each new name could be
        assigned to a padded name for some small profit. Finally, let $f_{rc}$
        be the number of annotations in row $r$ with an original name of $c$.
        Each matrix value $(r, c)$ is set to $f_{rc} + 1$ if $f_{rc} > 0$, to
        represent how much each name ``wants'' to be labeled with a particular
        original name, and the extra one ensures that these original names are
        always preferred over padded names.

    CommandLine:
        python -m ibeis.scripts.name_recitifer simple_munkres

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> part_oldnames = [['a', 'b'], ['b', 'c'], ['c', 'a', 'a']]
        >>> new_names = simple_munkres(part_oldnames)
        >>> result = ut.repr2(new_names)
        >>> print(new_names)
        ['b', 'c', 'a']

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> part_oldnames = [[], ['a', 'a'], [],
        >>>                  ['a', 'a', 'a', 'a', 'a', 'a', 'a', 'b'], ['a']]
        >>> new_names = simple_munkres(part_oldnames)
        >>> result = ut.repr2(new_names)
        >>> print(new_names)
        [None, 'a', None, 'b', None]

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> part_oldnames = [[], ['b'], ['a', 'b', 'c'], ['b', 'c'], ['c', 'e', 'e']]
        >>> new_names = find_consistent_labeling(part_oldnames)
        >>> result = ut.repr2(new_names)
        >>> print(new_names)
        ['_extra_name0', 'b', 'a', 'c', 'e']

        Profit Matrix
              b    a    c    e   _0
        0   -10  -10  -10  -10    1
        1     2  -10  -10  -10    1
        2     2    2    2  -10    1
        3     2  -10    2  -10    1
        4   -10  -10    2    3    1
    """
    import numpy as np
    import scipy.optimize
    unique_old_names = ut.unique(ut.flatten(part_oldnames))
    num_new_names = len(part_oldnames)
    num_old_names = len(unique_old_names)

    # Create padded dummy values.  This accounts for the case where it is
    # impossible to uniquely map to the old db
    num_pad = max(num_new_names - num_old_names, 0)
    total = num_old_names + num_pad
    shape = (total, total)

    # Allocate assignment matrix.
    # rows are new-names and cols are old-names.
    # Initially the profit of any assignment is effectively -inf
    # This effectively marks all assignments as invalid
    profit_matrix = np.full(shape, -2 * total, dtype=np.int)
    # Overwrite valid assignments with positive profits
    oldname2_idx = ut.make_index_lookup(unique_old_names)
    name_freq_list = [ut.dict_hist(names) for names in part_oldnames]
    # Initialize profit of a valid assignment as 1 + freq
    # This incentivizes using a previously used name
    for rowx, name_freq in enumerate(name_freq_list):
        for name, freq in name_freq.items():
            colx = oldname2_idx[name]
            profit_matrix[rowx, colx] = freq + 1
    # Set a much smaller profit for using an extra name
    # This allows the solution to always exist
    profit_matrix[:, num_old_names:total] = 1

    # Convert to minimization problem
    big_value = (profit_matrix.max()) - (profit_matrix.min())
    cost_matrix = big_value - profit_matrix

    # Use scipy implementation of munkres algorithm.
    rx2_cx = dict(zip(*scipy.optimize.linear_sum_assignment(cost_matrix)))

    # Each row (new-name) has now been assigned a column (old-name)
    # Map this back to the input-space (using None to indicate extras)
    cx2_name = dict(enumerate(unique_old_names))

    if False:
        import pandas as pd
        columns = unique_old_names + ['_%r' % x for x in range(num_pad)]
        print('Profit Matrix')
        print(pd.DataFrame(profit_matrix, columns=columns))

        print('Cost Matrix')
        print(pd.DataFrame(cost_matrix, columns=columns))

    assignment_ = [
        cx2_name.get(rx2_cx[rx], None) for rx in range(num_new_names)
    ]
    return assignment_
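To see the matrix the docstring describes, this sketch rebuilds the profit matrix
for the last example input with plain numpy (Counter stands in for ut.dict_hist):

    import numpy as np
    from collections import Counter

    part_oldnames = [[], ['b'], ['a', 'b', 'c'], ['b', 'c'], ['c', 'e', 'e']]
    old_names = list(dict.fromkeys(n for g in part_oldnames for n in g))  # ['b', 'a', 'c', 'e']
    num_pad = max(len(part_oldnames) - len(old_names), 0)
    total = len(old_names) + num_pad

    profit = np.full((total, total), -2 * total, dtype=int)
    for rowx, group in enumerate(part_oldnames):
        for name, freq in Counter(group).items():
            profit[rowx, old_names.index(name)] = freq + 1
    profit[:, len(old_names):] = 1  # padded name columns
    print(profit)  # matches the "Profit Matrix" table in the docstring above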
def download_sharks(XMLdata, number):
    """
    cd ~/work/WS_ALL
    python -m ibeis.scripts.getshark

    >>> from ibeis.scripts.getshark import *  # NOQA
    >>> url = 'www.whaleshark.org/listImages.jsp'
    >>> XMLdata = ut.url_read(url)
    >>> number = None
    """
    # Prepare the output directory for writing, if it doesn't exist
    output_dir = 'sharkimages'
    ut.ensuredir(output_dir)

    dom = parseString(XMLdata)

    # Download files
    if number:
        maxCount = min(number, len(dom.getElementsByTagName('img')))
    else:
        maxCount = len(dom.getElementsByTagName('img'))

    parsed_info = dict(
        img_url_list=[],
        localid_list=[],
        nameid_list=[],
        orig_fname_list=[],
        new_fname_list=[],
    )

    print('Preparing to fetch %i files...' % maxCount)

    for shark in dom.getElementsByTagName('shark'):
        localCount = 0
        for imageset in shark.getElementsByTagName('imageset'):
            for img in imageset.getElementsByTagName('img'):
                localCount += 1

                img_url = img.getAttribute('href')
                orig_fname = split(img_url)[1]
                ext = splitext(orig_fname)[1].lower()
                nameid = shark.getAttribute('number')

                new_fname = '%s-%i%s' % (nameid, localCount, ext)

                parsed_info['img_url_list'].append(img_url)
                parsed_info['nameid_list'].append(nameid)
                parsed_info['localid_list'].append(localCount)
                parsed_info['orig_fname_list'].append(orig_fname)
                parsed_info['new_fname_list'].append(new_fname)

                print('Parsed %i / %i files.' % (len(parsed_info['orig_fname_list']), maxCount))
                if number is not None and len(parsed_info['orig_fname_list']) == number:
                    break
    parsed_info['new_fpath_list'] = [join(output_dir, _fname)
                                     for _fname in parsed_info['new_fname_list']]

    print('Filtering parsed images')

    # Filter based on image type (keep only jpgs)
    ext_flags = [_fname.endswith('.jpg')
                 for _fname in parsed_info['new_fname_list']]
    parsed_info = {key: ut.compress(list_, ext_flags)
                   for key, list_ in parsed_info.items()}

    # Filter to only images matching the appropriate tags
    from ibeis import tag_funcs
    parsed_info['tags_list'] = parse_shark_tags(parsed_info['orig_fname_list'])
    tag_flags = tag_funcs.filterflags_general_tags(
        parsed_info['tags_list'],
        has_any=['view-left'],
        none_match=['qual.*', 'view-top', 'part-.*', 'cropped'],
    )
    parsed_info = {key: ut.compress(list_, tag_flags)
                   for key, list_ in parsed_info.items()}
    print('Tags in chosen images:')
    print(ut.dict_hist(ut.flatten(parsed_info['tags_list'])))

    # Download selected subset
    print('Downloading selected subset')
    _iter = list(zip(parsed_info['img_url_list'],
                     parsed_info['new_fpath_list']))
    _iter = ut.ProgressIter(_iter, lbl='downloading sharks')
    for img_url, new_fpath in _iter:
        if not exists(new_fpath):
            ut.download_url(img_url, new_fpath)

    # Remove corrupted or ill-formatted images
    print('Checking for corrupted images')
    import vtool as vt
    noncorrupt_flags = vt.filterflags_valid_images(parsed_info['new_fpath_list'])
    parsed_info = {
        key: ut.compress(list_, noncorrupt_flags)
        for key, list_ in parsed_info.items()
    }

    print('Removing small images')
    import numpy as np
    imgsize_list = np.array([vt.open_image_size(gpath)
                             for gpath in parsed_info['new_fpath_list']])
    sqrt_area_list = np.sqrt(np.prod(imgsize_list, axis=1))
    areq_flags_list = sqrt_area_list >= 750
    parsed_info = {key: ut.compress(list_, areq_flags_list)
                   for key, list_ in parsed_info.items()}

    grouped_idxs = ut.group_items(list(range(len(parsed_info['nameid_list']))),
                                  parsed_info['nameid_list'])
    keep_idxs = sorted(ut.flatten([idxs for key, idxs in grouped_idxs.items()
                                   if len(idxs) >= 2]))
    parsed_info = {key: ut.take(list_, keep_idxs)
                   for key, list_ in parsed_info.items()}

    print('Moving images to secondary directory')
    named_outputdir = 'named-left-sharkimages'
    # Build names
    parsed_info['namedir_fpath_list'] = [
        join(named_outputdir, _nameid, _fname)
        for _fname, _nameid in zip(parsed_info['new_fname_list'],
                                   parsed_info['nameid_list'])]
    # Create directories
    ut.ensuredir(named_outputdir)
    named_dirs = ut.unique_ordered(list(map(dirname, parsed_info['namedir_fpath_list'])))
    for dir_ in named_dirs:
        ut.ensuredir(dir_)
    # Copy
    ut.copy_files_to(src_fpath_list=parsed_info['new_fpath_list'],
                     dst_fpath_list=parsed_info['namedir_fpath_list'])
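The repeated filtering step above keeps parallel lists aligned by compressing every
column with the same boolean mask. A minimal sketch of that pattern, with
itertools.compress standing in for ut.compress and invented data:

    from itertools import compress

    parsed_info = {
        'img_url_list': ['u1', 'u2', 'u3'],
        'new_fname_list': ['a.jpg', 'b.png', 'c.jpg'],
    }
    flags = [fname.endswith('.jpg') for fname in parsed_info['new_fname_list']]
    parsed_info = {key: list(compress(vals, flags)) for key, vals in parsed_info.items()}
    print(parsed_info)  # both columns keep only rows 0 and 2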
def get_toy_data_1vM(num_annots, num_names=None, **kwargs): r""" Args: num_annots (int): num_names (int): (default = None) Kwargs: initial_aids, initial_nids, nid_sequence, seed Returns: tuple: (pair_list, feat_list) CommandLine: python -m ibeis.algo.hots.demobayes --exec-get_toy_data_1vM --show Example: >>> # DISABLE_DOCTEST >>> from ibeis.algo.hots.demobayes import * # NOQA >>> num_annots = 1000 >>> num_names = 40 >>> get_toy_data_1vM(num_annots, num_names) >>> ut.quit_if_noshow() >>> import plottool as pt >>> ut.show_if_requested() """ import vtool as vt tup_ = get_toy_annots(num_annots, num_names, **kwargs) aids, nids, aids1, nids1, all_aids, all_nids = tup_ rng = vt.ensure_rng(None) # Test a simple SVM classifier nid2_nexemp = ut.dict_hist(nids1) aid2_nid = dict(zip(aids, nids)) ut.fix_embed_globals() #def add_to_globals(globals_, subdict): # globals_.update(subdict) unique_nids = list(nid2_nexemp.keys()) def annot_to_class_feats2(aid, aid2_nid, top=None): pair_list = [] score_list = [] nexemplar_list = [] for nid in unique_nids: label = (aid2_nid[aid] == nid) num_exemplars = nid2_nexemp.get(nid, 0) if num_exemplars == 0: continue params = toy_params[label] mu, sigma = ut.dict_take(params, ['mu', 'sigma']) score_ = rng.normal(mu, sigma, size=num_exemplars).max() score = np.clip(score_, 0, np.inf) pair_list.append((aid, nid)) score_list.append(score) nexemplar_list.append(num_exemplars) rank_list = ut.argsort(score_list, reverse=True) feat_list = np.array([score_list, rank_list, nexemplar_list]).T sortx = np.argsort(rank_list) feat_list = feat_list.take(sortx, axis=0) pair_list = np.array(pair_list).take(sortx, axis=0) if top is not None: feat_list = feat_list[:top] pair_list = pair_list[0:top] return pair_list, feat_list toclass_features = [ annot_to_class_feats2(aid, aid2_nid, top=5) for aid in aids ] aidnid_pairs = np.vstack(ut.get_list_column(toclass_features, 0)) feat_list = np.vstack(ut.get_list_column(toclass_features, 1)) score_list = feat_list.T[0:1].T lbl_list = [aid2_nid[aid] == nid for aid, nid in aidnid_pairs] from sklearn import svm #clf1 = svm.LinearSVC() print('Learning classifiers') clf3 = svm.SVC(probability=True) clf3.fit(feat_list, lbl_list) #prob_true, prob_false = clf3.predict_proba(feat_list).T clf1 = svm.LinearSVC() clf1.fit(score_list, lbl_list) # Score new annots against the training database tup_ = get_toy_annots(num_annots * 2, num_names, initial_aids=all_aids, initial_nids=all_nids) aids, nids, aids1, nids1, all_aids, all_nids = tup_ aid2_nid = dict(zip(aids, nids)) toclass_features = [annot_to_class_feats2(aid, aid2_nid) for aid in aids] aidnid_pairs = np.vstack(ut.get_list_column(toclass_features, 0)) feat_list = np.vstack(ut.get_list_column(toclass_features, 1)) lbl_list = np.array([aid2_nid[aid] == nid for aid, nid in aidnid_pairs]) print('Running tests') score_list = feat_list.T[0:1].T tp_feat_list = feat_list[lbl_list] tn_feat_list = feat_list[~lbl_list] tp_lbls = lbl_list[lbl_list] tn_lbls = lbl_list[~lbl_list] print('num tp: %d' % len(tp_lbls)) print('num fp: %d' % len(tn_lbls)) tp_score_list = score_list[lbl_list] tn_score_list = score_list[~lbl_list] print('tp_feat' + ut.repr3(ut.get_stats(tp_feat_list, axis=0), precision=2)) print('tp_feat' + ut.repr3(ut.get_stats(tn_feat_list, axis=0), precision=2)) print('tp_score' + ut.repr2(ut.get_stats(tp_score_list), precision=2)) print('tp_score' + ut.repr2(ut.get_stats(tn_score_list), precision=2)) tp_pred3 = clf3.predict(tp_feat_list) tn_pred3 = clf3.predict(tn_feat_list) print((tp_pred3.sum(), 
tp_pred3.shape)) print((tn_pred3.sum(), tn_pred3.shape)) tp_score3 = clf3.score(tp_feat_list, tp_lbls) tn_score3 = clf3.score(tn_feat_list, tn_lbls) tp_pred1 = clf1.predict(tp_score_list) tn_pred1 = clf1.predict(tn_score_list) print((tp_pred1.sum(), tp_pred1.shape)) print((tn_pred1.sum(), tn_pred1.shape)) tp_score1 = clf1.score(tp_score_list, tp_lbls) tn_score1 = clf1.score(tn_score_list, tn_lbls) print('tp score with rank = %r' % (tp_score3, )) print('tn score with rank = %r' % (tn_score3, )) print('tp score without rank = %r' % (tp_score1, )) print('tn score without rank = %r' % (tn_score1, )) toy_data = {} return toy_data
def check_doublewords(): """ ./texfix.py --fpaths chapter4-application.tex --check-doublewords ./texfix.py --check-doublewords ./texfix.py --fpaths main.tex --outline --asmarkdown --numlines=999 -w --ignoreinputstartswith=def,Crall,header,colordef,figdef text = ut.readfrom('outline_main.md') >>> from texfix import * # NOQA """ # TODO: Do this on a per section basis to remove math considerations automagically root = testdata_main( ignoreinputstartswith=['def', 'Crall', 'header', 'colordef', 'figdef']) #root = latex_parser.LatexDocPart.parse_fpath('chapter4-application.tex') root._config['asmarkdown'] = True root._config['numlines'] = 999 #text = root.summary_str(outline=True) #document = root.find_descendant_type('document') import re #text = ut.readfrom('outline_main.md') #lines = text.split('\n') found_duplicates = [] found_lines = [] found_linenos = [] def check_palendrome(sequence_norm): half1 = sequence_norm[0:len(sequence_norm) // 2] half2 = sequence_norm[len(sequence_norm) // 2:] return all([a == b for a, b in zip(half1, half2)]) #for num, line in enumerate(lines): num = 0 for x, node in enumerate( root.iter_nodes(invalid_types=['comment', 'equation'])): block = node.summary_str(outline=True, highlight=False, depth=1) for line in block.split('\n'): num += 1 line_ = re.sub('\\$.*?\\$', 'mathpart' + str(num) + 'math', line) words = line_.split(' ') #if len(words) > 10: # break for size in [2, 4, 6, 8, 10]: for sequence in ut.iter_window(words, size=size): sequence_norm = [ re.sub('[^a-zA-Z0-9]', '', s.lower()) for s in sequence ] if sequence_norm[0] == '' or 'mathpart' in sequence_norm[0]: continue #if ut.allsame(sequence_norm): if check_palendrome(sequence_norm): print('sequence_norm = %r' % (sequence_norm, )) print(('Potential repeat of %r ' % (sequence_norm, )) + node.parsed_location_span()) found_duplicates.append(sequence_norm) found_lines.append(line_) found_linenos.append(num) print('found_linenos = ' + '\n'.join(ut.lmap(str, found_linenos))) print('found_lines = ' + '\n'.join(found_lines)) print('found_duplicates = ' + ut.repr3(found_duplicates, nl=1)) constants_tex_fixes.CAPITAL_LIST proper_words = [ 'Identification', 'Park.', 'Discussion', 'Hamming', 'Grevy', 'Affine', 'Equation', 'Sweetwaters', 'National', 'Nairobi', 'The', 'Hessian', 'Fisher', 'Gaussian', 'Section', "Grevy's", 'Masai', 'Figure', 'Jason', 'March', 'Parham', 'Euclidean', 'Bayes', 'Chapter', 'Subsection', 'Lowe', 'Luigi', 'Dryad', 'Jablons', 'Wildbook', 'Apache', 'Hadoop', 'Zack', 'Lincoln', 'Peterson', 'Alessandro', 'Oddone', 'Earth', 'Darwin', 'Markov', 'Bayesian', 'Table', 'Boxer', 'Beagle', 'Platt', 'K' ] flagged_words = [] #for num, line in enumerate(lines): for x, node in enumerate( root.iter_nodes(invalid_types=['comment', 'equation'])): block = node.summary_str(outline=True, highlight=False, depth=1) for line in block.split('\n'): #print('node.type_ = %r' % (node.type_,)) #print('line = %r' % (line[0:20],)) #if x > 30: # break #if node.type_ in ['equation', 'comment']: # continue line_ = re.sub('\\$.*?\\$', 'mathpart' + str(num) + 'math', line) line_ = re.sub('[0-9]+', '', line_) line_ = re.sub('\'s\\b', '', line_) line_ = re.sub('\\\\[A-Za-z]+\\b', '', line_) line_ = line_.replace('#', '') line_ = line_.replace('\\', '') line_ = line_.replace('(', '') line_ = line_.replace('Nairobi National Park', '') line_ = line_.replace('Plains zebras', '') line_ = line_.replace('Ol Pejeta', '') line_ = line_.replace('Darwin Core', '') line_ = line_.replace(')', '') line_ = line_.replace('*', '') line_ = 
line_.lstrip(' ') words = line_.split(' ') flag = False for w in words[1:]: matches = re.findall('[A-Z]', w) if w in proper_words: continue if len(matches) == 1: #print(w) print(('Bad caps word %r ' % (w, )) + node.fpath_root() + ' at line ' + str(node.line_num)) flagged_words.append(w) flag = True if flag: pass print(line_) print('Found caps problems') hist = ut.dict_hist(flagged_words, ordered=True) print(ut.repr3(hist, nl=1))
def find_consistent_labeling(grouped_oldnames): """ Solves a a maximum bipirtite matching problem to find a consistent name assignment. Notes: # Install module containing the Hungarian algorithm for matching pip install munkres Example: >>> # DISABLE_DOCTEST >>> from ibeis.scripts.name_recitifer import * # NOQA >>> grouped_oldnames = [['a', 'b'], ['b', 'c'], ['c', 'a', 'a']] >>> new_names = find_consistent_labeling(grouped_oldnames) >>> print(new_names) [u'b', u'c', u'a'] Example: >>> # DISABLE_DOCTEST >>> from ibeis.scripts.name_recitifer import * # NOQA >>> grouped_oldnames = [['a', 'b', 'c'], ['b', 'c'], ['c', 'e', 'e']] >>> new_names = find_consistent_labeling(grouped_oldnames) >>> print(new_names) [u'a', u'b', u'e'] Example: >>> # DISABLE_DOCTEST >>> from ibeis.scripts.name_recitifer import * # NOQA >>> grouped_oldnames = [['a', 'b'], ['a', 'a', 'b'], ['a']] >>> new_names = find_consistent_labeling(grouped_oldnames) >>> print(new_names) [u'a', u'b', u'e'] """ import numpy as np try: import munkres except ImportError: print('Need to install Hungrian algorithm bipartite matching solver.') print('Run:') print('pip install munkres') raise unique_old_names = ut.unique(ut.flatten(grouped_oldnames)) num_new_names = len(grouped_oldnames) num_old_names = len(unique_old_names) extra_oldnames = [] # Create padded dummy values. This accounts for the case where it is # impossible to uniquely map to the old db num_extra = num_new_names - num_old_names if num_extra > 0: extra_oldnames = ['_extra_name%d' % (count,) for count in range(num_extra)] elif num_extra < 0: pass else: extra_oldnames = [] assignable_names = unique_old_names + extra_oldnames total = len(assignable_names) # Allocate assignment matrix profit_matrix = np.zeros((total, total), dtype=np.int) # Populate assignment profit matrix oldname2_idx = ut.make_index_lookup(assignable_names) name_freq_list = [ut.dict_hist(names) for names in grouped_oldnames] for rowx, name_freq in enumerate(name_freq_list): for name, freq in name_freq.items(): colx = oldname2_idx[name] profit_matrix[rowx, colx] += freq # Add extra profit for using a previously used name profit_matrix[profit_matrix > 0] += 2 # Add small profit for using an extra name extra_colxs = ut.take(oldname2_idx, extra_oldnames) profit_matrix[:, extra_colxs] += 1 # Convert to minimization problem big_value = (profit_matrix.max()) cost_matrix = big_value - profit_matrix m = munkres.Munkres() indexes = m.compute(cost_matrix) # Map output to be aligned with input rx2_cx = dict(indexes) assignment = [assignable_names[rx2_cx[rx]] for rx in range(num_new_names)] return assignment
def get_colordict(self):
    color2_num = ut.dict_hist([m.color for m in self._manas
                               for _ in range(m.num)])
    return color2_num
def download_sharks(XMLdata, number): """ cd ~/work/WS_ALL python -m ibeis.scripts.getshark >>> from ibeis.scripts.getshark import * # NOQA >>> url = 'www.whaleshark.org/listImages.jsp' >>> XMLdata = ut.url_read(url) >>> number = None """ # Prepare the output directory for writing, if it doesn't exist output_dir = 'sharkimages' ut.ensuredir(output_dir) dom = parseString(XMLdata) # Download files if number: maxCount = min(number, len(dom.getElementsByTagName('img'))) else: maxCount = len(dom.getElementsByTagName('img')) parsed_info = dict( img_url_list=[], localid_list=[], nameid_list=[], orig_fname_list=[], new_fname_list=[], ) print('Preparing to fetch %i files...' % maxCount) for shark in dom.getElementsByTagName('shark'): localCount = 0 for imageset in shark.getElementsByTagName('imageset'): for img in imageset.getElementsByTagName('img'): localCount += 1 img_url = img.getAttribute('href') orig_fname = split(img_url)[1] ext = splitext(orig_fname)[1].lower() nameid = shark.getAttribute('number') new_fname = '%s-%i%s' % (nameid, localCount, ext) parsed_info['img_url_list'].append(img_url) parsed_info['nameid_list'].append(nameid) parsed_info['localid_list'].append(localCount) parsed_info['orig_fname_list'].append(orig_fname) parsed_info['new_fname_list'].append(new_fname) print('Parsed %i / %i files.' % (len(parsed_info['orig_fname_list']), maxCount)) if number is not None and len( parsed_info['orig_fname_list']) == number: break parsed_info['new_fpath_list'] = [ join(output_dir, _fname) for _fname in parsed_info['new_fname_list'] ] print('Filtering parsed images') # Filter based on image type (keep only jpgs) ext_flags = [ _fname.endswith('.jpg') or _fname.endswith('.jpg') for _fname in parsed_info['new_fname_list'] ] parsed_info = { key: ut.compress(list_, ext_flags) for key, list_ in parsed_info.items() } # Filter to only images matching the appropriate tags from ibeis import tag_funcs parsed_info['tags_list'] = parse_shark_tags(parsed_info['orig_fname_list']) tag_flags = tag_funcs.filterflags_general_tags( parsed_info['tags_list'], has_any=['view-left'], none_match=['qual.*', 'view-top', 'part-.*', 'cropped'], ) parsed_info = { key: ut.compress(list_, tag_flags) for key, list_ in parsed_info.items() } print('Tags in chosen images:') print(ut.dict_hist(ut.flatten(parsed_info['tags_list']))) # Download selected subset print('Downloading selected subset') _iter = list( zip(parsed_info['img_url_list'], parsed_info['new_fpath_list'])) _iter = ut.ProgressIter(_iter, lbl='downloading sharks') for img_url, new_fpath in _iter: if not exists(new_fpath): ut.download_url(img_url, new_fpath) # Remove corrupted or ill-formatted images print('Checking for corrupted images') import vtool as vt noncorrupt_flags = vt.filterflags_valid_images( parsed_info['new_fpath_list']) parsed_info = { key: ut.compress(list_, noncorrupt_flags) for key, list_ in parsed_info.items() } print('Removing small images') import numpy as np imgsize_list = np.array( [vt.open_image_size(gpath) for gpath in parsed_info['new_fpath_list']]) sqrt_area_list = np.sqrt(np.prod(imgsize_list, axis=1)) areq_flags_list = sqrt_area_list >= 750 parsed_info = { key: ut.compress(list_, areq_flags_list) for key, list_ in parsed_info.items() } grouped_idxs = ut.group_items(list(range(len(parsed_info['nameid_list']))), parsed_info['nameid_list']) keep_idxs = sorted( ut.flatten( [idxs for key, idxs in grouped_idxs.items() if len(idxs) >= 2])) parsed_info = { key: ut.take(list_, keep_idxs) for key, list_ in parsed_info.items() } print('Moving 
imagse to secondary directory') named_outputdir = 'named-left-sharkimages' # Build names parsed_info['namedir_fpath_list'] = [ join(named_outputdir, _nameid, _fname) for _fname, _nameid in zip( parsed_info['new_fname_list'], parsed_info['nameid_list']) ] # Create directories ut.ensuredir(named_outputdir) named_dirs = ut.unique_ordered( list(map(dirname, parsed_info['namedir_fpath_list']))) for dir_ in named_dirs: ut.ensuredir(dir_) # Copy ut.copy_files_to(src_fpath_list=parsed_info['new_fpath_list'], dst_fpath_list=parsed_info['namedir_fpath_list'])
def get_cnn_labeler_training_images_pytorch( ibs, dest_path=None, image_size=224, category_list=None, min_examples=10, category_mapping=None, viewpoint_mapping=None, purge=True, strict=True, skip_rate=0.0, valid_rate=0.2, use_axis_aligned_chips=False, train_gid_set=None, ): from os.path import join, expanduser, exists import random import cv2 if dest_path is None: dest_path = expanduser(join('~', 'Desktop', 'extracted')) name = 'labeler-pytorch' dbname = ibs.dbname name_path = join(dest_path, name) train_path = join(name_path, 'train') valid_path = join(name_path, 'val') if purge: ut.delete(name_path) ut.ensuredir(name_path) ut.ensuredir(train_path) ut.ensuredir(valid_path) logger.info('category mapping = %s' % (ut.repr3(category_mapping), )) logger.info('viewpoint mapping = %s' % (ut.repr3(viewpoint_mapping), )) # train_gid_set = ibs.get_valid_gids() if train_gid_set is None: train_gid_set = set( ibs.get_imageset_gids( ibs.get_imageset_imgsetids_from_text('TRAIN_SET'))) aids_list = ibs.get_image_aids(train_gid_set) # bboxes_list = [ ibs.get_annot_bboxes(aid_list) for aid_list in aids_list ] # aid_list = ibs.get_valid_aids() aid_list = ut.flatten(aids_list) # import random # random.shuffle(aid_list) # aid_list = sorted(aid_list[:100]) species_list = ibs.get_annot_species_texts(aid_list) if category_mapping is not None: species_list = [ category_mapping.get(species, species) for species in species_list ] species_set = set(species_list) yaw_list = ibs.get_annot_viewpoints(aid_list) if category_list is None: category_list = sorted(list(species_set)) undesired_list = [ 'unspecified_animal', ibs.get_species_nice(ibs.const.UNKNOWN_SPECIES_ROWID), ] for undesired_species in undesired_list: if undesired_species in category_list: category_list.remove(undesired_species) category_set = set(category_list) # Filter the tup_list based on the requested categories tup_list = list(zip(aid_list, species_list, yaw_list)) old_len = len(tup_list) tup_list = [(aid, species, viewpoint_mapping.get(species, {}).get(yaw, yaw)) for aid, species, yaw in tup_list if species in category_set] new_len = len(tup_list) logger.info('Filtered annotations: keep %d / original %d' % (new_len, old_len)) # Skip any annotations that are of the wanted category and don't have a specified viewpoint counter = 0 seen_dict = {} yaw_dict = {} for tup in tup_list: aid, species, yaw = tup # Keep track of the number of overall instances if species not in seen_dict: seen_dict[species] = 0 seen_dict[species] += 1 # Keep track of yaws that aren't None if yaw is not None: if species not in yaw_dict: yaw_dict[species] = {} if yaw not in yaw_dict[species]: yaw_dict[species][yaw] = 0 yaw_dict[species][yaw] += 1 else: counter += 1 # Get the list of species that do not have enough viewpoint examples for training invalid_seen_set = set([]) invalid_yaw_set = set([]) for species in seen_dict: # Check that the number of instances is above the min_examples if seen_dict[species] < min_examples: invalid_seen_set.add(species) continue # If the species has viewpoints, check them as well if strict: if species in yaw_dict: # Check that all viewpoints exist # if len(yaw_dict[species]) < 8: # invalid_yaw_set.add(species) # continue # Check that all viewpoints have a minimum number of instances for yaw in yaw_dict[species]: # assert yaw in ibs.const.VIEWTEXT_TO_YAW_RADIANS if yaw_dict[species][yaw] < min_examples: invalid_yaw_set.add(species) continue else: invalid_yaw_set.add(species) continue logger.info('Null yaws: %d' % (counter, )) valid_seen_set = 
category_set - invalid_seen_set valid_yaw_set = valid_seen_set - invalid_yaw_set logger.info('Requested categories:') category_set = sorted(category_set) ut.print_list(category_set) # logger.info('Invalid yaw categories:') # ut.print_list(sorted(invalid_yaw_set)) # logger.info('Valid seen categories:') # ut.print_list(sorted(valid_seen_set)) logger.info('Valid yaw categories:') valid_yaw_set = sorted(valid_yaw_set) ut.print_list(valid_yaw_set) logger.info('Invalid seen categories (could not fulfill request):') invalid_seen_set = sorted(invalid_seen_set) ut.print_list(invalid_seen_set) skipped_yaw = 0 skipped_seen = 0 aid_list_ = [] category_list_ = [] for tup in tup_list: aid, species, yaw = tup if species in valid_yaw_set: # If the species is valid, but this specific annotation has no yaw, skip it if yaw is None: skipped_yaw += 1 continue category = '%s:%s' % (species, yaw) elif species in valid_seen_set: category = '%s' % (species, ) else: skipped_seen += 1 continue aid_list_.append(aid) category_list_.append(category) logger.info('Skipped Yaw: skipped %d / total %d' % (skipped_yaw, len(tup_list))) logger.info('Skipped Seen: skipped %d / total %d' % (skipped_seen, len(tup_list))) for category in sorted(set(category_list_)): logger.info('Making folder for %r' % (category, )) ut.ensuredir(join(train_path, category)) ut.ensuredir(join(valid_path, category)) config = { 'dim_size': (image_size, image_size), 'resize_dim': 'wh', 'axis_aligned': use_axis_aligned_chips, } chip_list_ = ibs.depc_annot.get_property('chips', aid_list_, 'img', config=config) # Get training data label_list = [] for aid, chip, category in zip(aid_list_, chip_list_, category_list_): args = (aid, ) logger.info('Processing AID: %r' % args) if skip_rate > 0.0 and random.uniform(0.0, 1.0) <= skip_rate: logger.info('\t Skipping') continue is_valid = random.uniform(0.0, 1.0) < valid_rate dest_path = valid_path if is_valid else train_path raw_path = join(dest_path, category) assert exists(dest_path) # Compute data values = ( dbname, aid, ) patch_filename = '%s_annot_aid_%s.png' % values patch_filepath = join(raw_path, patch_filename) cv2.imwrite(patch_filepath, chip) # Compute label label = '%s,%s' % (patch_filename, category) label_list.append(label) logger.info('Using labels for labeler training:') logger.info(ut.repr3(ut.dict_hist(category_list_))) return name_path
def fix_annotmatch_pzmaster1(): """ PZ_Master1 had annotmatch rowids that did not agree with the current name labeling. Looking at the inconsistencies in the graph interface was too cumbersome, because over 3000 annots were incorrectly grouped together. This function deletes any annotmatch rowid that is not consistent with the current labeling so we can go forward with using the new AnnotInference object """ import wbia ibs = wbia.opendb('PZ_Master1') infr = wbia.AnnotInference(ibs=ibs, aids=ibs.get_valid_aids(), verbose=5) infr.initialize_graph() annots = ibs.annots() aid_to_nid = ut.dzip(annots.aids, annots.nids) if False: infr.reset_feedback() infr.ensure_mst() infr.apply_feedback_edges() infr.relabel_using_reviews() infr.start_qt_interface() # Get annotmatch rowids that agree with current labeling if False: annotmatch = ibs.db.get_table_as_pandas('annotmatch') import pandas as pd flags1 = pd.isnull(annotmatch['annotmatch_evidence_decision']) flags2 = annotmatch['annotmatch_tag_text'] == '' bad_part = annotmatch[flags1 & flags2] rowids = bad_part.index.tolist() ibs.delete_annotmatch(rowids) if False: # Delete bidirectional annotmatches annotmatch = ibs.db.get_table_as_pandas('annotmatch') df = annotmatch.set_index(['annot_rowid1', 'annot_rowid2']) # Find entires that have both directions pairs1 = annotmatch[['annot_rowid1', 'annot_rowid2']].values f_edges = {tuple(p) for p in pairs1} b_edges = {tuple(p[::-1]) for p in pairs1} isect_edges = {tuple(sorted(p)) for p in b_edges.intersection(f_edges)} isect_edges1 = list(isect_edges) isect_edges2 = [p[::-1] for p in isect_edges] # cols = ['annotmatch_evidence_decision', 'annotmatch_tag_text'] import pandas as pd custom_ = { (559, 4909): (False, ['photobomb']), (7918, 8041): (False, ['photobomb']), (6634, 6754): (False, ['photobomb']), (3707, 3727): (False, ['photobomb']), (86, 103): (False, ['photobomb']), } extra_ = {} fixme_edges = [] d1 = df.loc[isect_edges1].reset_index(drop=False) d2 = df.loc[isect_edges2].reset_index(drop=False) flags = d1['annotmatch_evidence_decision'] != d2[ 'annotmatch_evidence_decision'] from wbia.tag_funcs import _parse_tags for f, r1, r2 in zip(flags, d1.iterrows(), d2.iterrows()): v1, v2 = r1[1], r2[1] aid1 = v1['annot_rowid1'] aid2 = v1['annot_rowid2'] truth_real = (ibs.const.EVIDENCE_DECISION.POSITIVE if aid_to_nid[aid1] == aid_to_nid[aid2] else ibs.const.EVIDENCE_DECISION.NEGATIVE) truth1 = v1['annotmatch_evidence_decision'] truth2 = v2['annotmatch_evidence_decision'] t1 = _parse_tags(v1['annotmatch_tag_text']) t2 = _parse_tags(v2['annotmatch_tag_text']) newtag = ut.union_ordered(t1, t2) if (aid1, aid2) in custom_: continue fixme_flag = False if not pd.isnull(truth1): if truth_real != truth1: fixme_flag = True if not pd.isnull(truth2): if truth_real != truth2: fixme_flag = True if fixme_flag: logger.info('newtag = %r' % (newtag, )) logger.info('truth_real = %r' % (truth_real, )) logger.info('truth1 = %r' % (truth1, )) logger.info('truth2 = %r' % (truth2, )) logger.info('aid1 = %r' % (aid1, )) logger.info('aid2 = %r' % (aid2, )) fixme_edges.append((aid1, aid2)) else: extra_[(aid1, aid2)] = (truth_real, newtag) extra_.update(custom_) new_pairs = extra_.keys() new_truths = ut.take_column(ut.dict_take(extra_, new_pairs), 0) new_tags = ut.take_column(ut.dict_take(extra_, new_pairs), 1) new_tag_texts = [';'.join(t) for t in new_tags] aids1, aids2 = ut.listT(new_pairs) # Delete the old ibs.delete_annotmatch((d1['annotmatch_rowid'].values.tolist() + d2['annotmatch_rowid'].values.tolist())) # Add the new ams = 
ibs.add_annotmatch_undirected(aids1, aids2) ibs.set_annotmatch_evidence_decision(ams, new_truths) ibs.set_annotmatch_tag_text(ams, new_tag_texts) if False: import wbia.guitool as gt gt.ensure_qapp() ut.qtensure() from wbia.gui import inspect_gui inspect_gui.show_vsone_tuner(ibs, aid1, aid2) # pairs2 = pairs1.T[::-1].T # idx1, idx2 = ut.isect_indices(list(map(tuple, pairs1)), # list(map(tuple, pairs2))) # r_edges = list(set(map(tuple, map(sorted, pairs1[idx1])))) # unique_pairs = list(set(map(tuple, map(sorted, pairs1[idx1])))) # df = annotmatch.set_index(['annot_rowid1', 'annot_rowid2']) x = ut.ddict(list) annotmatch = ibs.db.get_table_as_pandas('annotmatch') import ubelt as ub _iter = annotmatch.iterrows() prog = ub.ProgIter(_iter, length=len(annotmatch)) for k, m in prog: aid1 = m['annot_rowid1'] aid2 = m['annot_rowid2'] if m['annotmatch_evidence_decision'] == ibs.const.EVIDENCE_DECISION.POSITIVE: if aid_to_nid[aid1] == aid_to_nid[aid2]: x['agree1'].append(k) else: x['disagree1'].append(k) elif m['annotmatch_evidence_decision'] == ibs.const.EVIDENCE_DECISION.NEGATIVE: if aid_to_nid[aid1] == aid_to_nid[aid2]: x['disagree2'].append(k) else: x['agree2'].append(k) ub.map_vals(len, x) ut.dict_hist(annotmatch.loc[x['disagree1']]['annotmatch_tag_text']) disagree1 = annotmatch.loc[x['disagree1']] pb_disagree1 = disagree1[disagree1['annotmatch_tag_text'] == 'photobomb'] aids1 = pb_disagree1['annot_rowid1'].values.tolist() aids2 = pb_disagree1['annot_rowid2'].values.tolist() aid_pairs = list(zip(aids1, aids2)) infr = wbia.AnnotInference.from_pairs(aid_pairs, ibs=ibs, verbose=5) if False: feedback = infr.read_wbia_annotmatch_feedback(edges=infr.edges()) infr.external_feedback = feedback infr.apply_feedback_edges() infr.start_qt_interface(loop=False) # Delete these values if False: nonpb_disagree1 = disagree1[ disagree1['annotmatch_tag_text'] != 'photobomb'] disagree2 = annotmatch.loc[x['disagree2']] ibs.delete_annotmatch(nonpb_disagree1['annotmatch_rowid']) ibs.delete_annotmatch(disagree2['annotmatch_rowid']) # ut.dict_hist(disagree1['annotmatch_tag_text']) import networkx as nx graph = nx.Graph() graph.add_edges_from( zip(pb_disagree1['annot_rowid1'], pb_disagree1['annot_rowid2'])) list(nx.connected_components(graph)) set(annotmatch.loc[x['disagree2']]['annotmatch_tag_text'])
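# The bidirectional-edge cleanup above hinges on finding (aid1, aid2) pairs
# recorded in both directions. A self-contained pandas illustration with
# hypothetical rowids (the real table comes from ibs.db.get_table_as_pandas):
import pandas as pd

annotmatch = pd.DataFrame({
    'annot_rowid1': [1, 2, 3, 4],
    'annot_rowid2': [2, 1, 4, 5],
    'annotmatch_evidence_decision': [1, 0, 1, 1],
})
pairs = annotmatch[['annot_rowid1', 'annot_rowid2']].values.tolist()
f_edges = {tuple(p) for p in pairs}
b_edges = {tuple(p[::-1]) for p in pairs}
# Pairs present in both directions must be reconciled into one undirected edge
isect_edges = {tuple(sorted(p)) for p in f_edges & b_edges}
print(isect_edges)  # {(1, 2)}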
def find_consistent_labeling(grouped_oldnames, extra_prefix='_extra_name',
                             verbose=False):
    r"""
    Solves a maximum bipartite matching problem to find a consistent name
    assignment that minimizes the number of annotations with different names.

    Each group of annotations (corresponding to one new name) must be assigned
    a single name, either one of the old names used within that group or a
    newly generated name. To reduce the running time the problem is split into
    independent subproblems that are rectified separately.

    Args:
        grouped_oldnames (list): A group of old names where the grouping is
            based on new names. For instance:

            Given:
                aids      = [1, 2, 3, 4, 5]
                old_names = [0, 1, 1, 1, 0]
                new_names = [0, 0, 1, 1, 0]

            The grouping is [[0, 1, 0], [1, 1]]

            This lets us keep the old names in a split case and re-use
            existing names and make minimal changes to current annotation
            names while still being consistent with the new and improved
            grouping.

            The output will be: [0, 1]

            Meaning that all annots in the first group are assigned the name 0
            and all annots in the second group are assigned the name 1.

    References:
        http://stackoverflow.com/questions/1398822/assignment-problem-numpy

    CommandLine:
        python -m ibeis.scripts.name_recitifer find_consistent_labeling

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> grouped_oldnames = testdata_oldnames(25, 15, 5, n_per_incon=5)
        >>> new_names = find_consistent_labeling(grouped_oldnames, verbose=1)
        >>> grouped_oldnames = testdata_oldnames(0, 15, 5, n_per_incon=1)
        >>> new_names = find_consistent_labeling(grouped_oldnames, verbose=1)
        >>> grouped_oldnames = testdata_oldnames(0, 0, 0, n_per_incon=1)
        >>> new_names = find_consistent_labeling(grouped_oldnames, verbose=1)

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> ydata = []
        >>> xdata = list(range(10, 150, 50))
        >>> for x in xdata:
        >>>     print('x = %r' % (x,))
        >>>     grouped_oldnames = testdata_oldnames(x, 15, 5, n_per_incon=5)
        >>>     t = ut.Timerit(3, verbose=1)
        >>>     for timer in t:
        >>>         with timer:
        >>>             new_names = find_consistent_labeling(grouped_oldnames)
        >>>     ydata.append(t.ave_secs)
        >>> ut.quit_if_noshow()
        >>> import plottool_ibeis as pt
        >>> pt.qtensure()
        >>> pt.multi_plot(xdata, [ydata])
        >>> ut.show_if_requested()

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> grouped_oldnames = [['a', 'b', 'c'], ['b', 'c'], ['c', 'e', 'e']]
        >>> new_names = find_consistent_labeling(grouped_oldnames, verbose=1)
        >>> result = ut.repr2(new_names)
        >>> print(new_names)
        ['a', 'b', 'e']

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> grouped_oldnames = [['a', 'b'], ['a', 'a', 'b'], ['a']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> result = ut.repr2(new_names)
        >>> print(new_names)
        ['b', 'a', '_extra_name0']

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> grouped_oldnames = [['a', 'b'], ['e'], ['a', 'a', 'b'], [], ['a'], ['d']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> result = ut.repr2(new_names)
        >>> print(new_names)
        ['b', 'e', 'a', '_extra_name0', '_extra_name1', 'd']

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> grouped_oldnames = [[], ['a', 'a'], [],
        >>>                     ['a', 'a', 'a', 'a', 'a', 'a', 'a', 'b'], ['a']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> result = ut.repr2(new_names)
        >>> print(new_names)
        ['_extra_name0', 'a', '_extra_name1', 'b', '_extra_name2']
    """
    unique_old_names = ut.unique(ut.flatten(grouped_oldnames))
    n_old_names = len(unique_old_names)
    n_new_names =
len(grouped_oldnames) # Initialize assignment to all Nones assignment = [None for _ in range(n_new_names)] if verbose: print('finding maximally consistent labeling') print('n_old_names = %r' % (n_old_names, )) print('n_new_names = %r' % (n_new_names, )) # For each old_name, determine now many new_names use it. oldname_sets = list(map(set, grouped_oldnames)) oldname_usage = ut.dict_hist(ut.flatten(oldname_sets)) # Any name used more than once is a conflict and must be resolved conflict_oldnames = {k for k, v in oldname_usage.items() if v > 1} # Partition into trivial and non-trivial cases nontrivial_oldnames = [] nontrivial_new_idxs = [] trivial_oldnames = [] trivial_new_idxs = [] for new_idx, group in enumerate(grouped_oldnames): if set(group).intersection(conflict_oldnames): nontrivial_oldnames.append(group) nontrivial_new_idxs.append(new_idx) else: trivial_oldnames.append(group) trivial_new_idxs.append(new_idx) # Rectify trivial cases # Any new-name that does not share any of its old-names with other # new-names can be resolved trivially n_trivial_unchanged = 0 n_trivial_ignored = 0 n_trivial_merges = 0 for group, new_idx in zip(trivial_oldnames, trivial_new_idxs): if len(group) > 0: # new-names that use more than one old-name are simple merges h = ut.dict_hist(group) if len(h) > 1: n_trivial_merges += 1 else: n_trivial_unchanged += 1 hitems = list(h.items()) hvals = [i[1] for i in hitems] maxval = max(hvals) g = min([k for k, v in hitems if v == maxval]) assignment[new_idx] = g else: # new-names that use no old-names can be ignored n_trivial_ignored += 1 if verbose: n_trivial = len(trivial_oldnames) n_nontrivial = len(nontrivial_oldnames) print('rectify %d trivial groups' % (n_trivial, )) print(' * n_trivial_unchanged = %r' % (n_trivial_unchanged, )) print(' * n_trivial_merges = %r' % (n_trivial_merges, )) print(' * n_trivial_ignored = %r' % (n_trivial_ignored, )) print('rectify %d non-trivial groups' % (n_nontrivial, )) # Partition nontrivial_oldnames into smaller disjoint sets nontrivial_oldnames_sets = list(map(set, nontrivial_oldnames)) import networkx as nx g = nx.Graph() g.add_nodes_from(range(len(nontrivial_oldnames_sets))) for u, group1 in enumerate(nontrivial_oldnames_sets): rest = nontrivial_oldnames_sets[u + 1:] for v, group2 in enumerate(rest, start=u + 1): if group1.intersection(group2): g.add_edge(u, v) nontrivial_partition = list(nx.connected_components(g)) if verbose: print(' * partitioned non-trivial into %d subgroups' % (len(nontrivial_partition))) part_size_stats = ut.get_stats(map(len, nontrivial_partition)) stats_str = ut.repr2(part_size_stats, precision=2, strkeys=True) print(' * partition size stats = %s' % (stats_str, )) # Rectify nontrivial cases for part_idxs in ut.ProgIter(nontrivial_partition, labels='rectify parts', enabled=verbose): part_oldnames = ut.take(nontrivial_oldnames, part_idxs) part_newidxs = ut.take(nontrivial_new_idxs, part_idxs) # Rectify this part assignment_ = simple_munkres(part_oldnames) for new_idx, new_name in zip(part_newidxs, assignment_): assignment[new_idx] = new_name # Any unassigned name is now given a new unique label with a prefix if extra_prefix is not None: num_extra = 0 for idx, val in enumerate(assignment): if val is None: assignment[idx] = '%s%d' % ( extra_prefix, num_extra, ) num_extra += 1 return assignment
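# The assignment described in the docstring above is a maximum-weight bipartite
# matching between new groups and old names; find_consistent_labeling delegates
# the non-trivial cases to simple_munkres. The sketch below only illustrates
# the same idea with scipy's linear_sum_assignment on the doctest example
# (scipy is an assumption here, not a dependency of the code above).
import numpy as np
from scipy.optimize import linear_sum_assignment

grouped_oldnames = [['a', 'b', 'c'], ['b', 'c'], ['c', 'e', 'e']]
unique_old = sorted(set(n for group in grouped_oldnames for n in group))

# profit[i, j] counts how many annots in group i already carry old name j
profit = np.zeros((len(grouped_oldnames), len(unique_old)))
for i, group in enumerate(grouped_oldnames):
    for name in group:
        profit[i, unique_old.index(name)] += 1

# linear_sum_assignment minimizes cost, so negate the profit matrix
rows, cols = linear_sum_assignment(-profit)
assignment = [unique_old[j] for j in cols]
print(assignment)  # one maximum-weight assignment, e.g. ['a', 'b', 'e']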
def fix_duplicates(drive): r""" for every duplicate file passing a (eg avi) filter, remove the file that is in the smallest directory. On a tie use the smallest dpath. This will filter all duplicate files in a folder into a single folder. but... need to look at non-duplicates in that folder and decide if they should be moved as well. So, should trigger on folders that have at least 50% duplicate. Might not want to move curated folders. Example: cd ~/local/scripts >>> from register_files import * # NOQA >>> dpaths = ut.get_argval('--drives', type_=list, default=['E:/'])#'D:/', 'E:/', 'F:/']) >>> drives = [Drive(root_dpath) for root_dpath in dpaths] >>> E = drive = drives[0] >>> #D, E, F = drives """ print('Fixing Duplicates in %r' % (drive,)) list_ = drive.fpath_hashX_list multiindex_dict_ = build_multindex(list_) duplicate_hashes = [ key for key, val in six.iteritems(multiindex_dict_) if len(val) > 1 ] duplicate_idxs = ut.dict_take(multiindex_dict_, duplicate_hashes) unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs) # Check if any dups have been removed still_exists = ut.unflat_map(exists, unflat_fpaths) unflat_idxs2 = ut.zipcompress(duplicate_idxs, still_exists) duplicate_idxs = [idxs for idxs in unflat_idxs2 if len(idxs) > 1] # Look at duplicate files unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs) unflat_sizes = ut.list_unflat_take(drive.fpath_bytes_list, duplicate_idxs) # Find highly coupled directories if True: coupled_dirs = [] for fpaths in unflat_fpaths: #basedir = ut.longest_existing_path(commonprefix(fpaths)) dirs = sorted(list(map(dirname, fpaths))) _list = list(range(len(dirs))) idxs = ut.upper_diag_self_prodx(_list) coupled_dirs.extend(list(map(tuple, ut.list_unflat_take(dirs, idxs)))) hist_ = ut.dict_hist(coupled_dirs) coupled_idxs = ut.list_argsort(hist_.values())[::-1] most_coupled = ut.take(list(hist_.keys()), coupled_idxs[0:100]) print('Coupled fpaths: ' + ut.list_str(most_coupled, nl=True)) print('%d unique files are duplicated' % (len(unflat_sizes),)) #print('Duplicate sizes: ' + ut.list_str(unflat_sizes[0:10], nl=True)) #print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths[0:10], nl=True)) #print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths[0::5], nl=True)) print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths, nl=True)) # Find duplicate directories dpath_list = list(drive.dpath_to_fidx.keys()) fidxs_list = ut.dict_take(drive.dpath_to_fidx, drive.dpath_list) #exists_list = list(map(exists, drive.fpath_list)) #unflat_exists = ut.list_unflat_take(exists_list, fidxs_list) fname_registry = [basename(fpath) for fpath in drive.fpath_list] unflat_fnames = ut.list_unflat_take(fname_registry, fidxs_list) def unsorted_list_hash(list_): return ut.hashstr27(str(sorted(list_))) unflat_fname_sets = list(map(unsorted_list_hash, ut.ProgIter(unflat_fnames, freq=10000))) fname_based_duplicate_dpaths = [] multiindex_dict2_ = build_multindex(unflat_fname_sets) fname_based_duplicate_hashes = [key for key, val in multiindex_dict2_.items() if len(val) > 1] print('#fname_based_duplicate_dpaths = %r' % (len(fname_based_duplicate_hashes),)) fname_based_duplicate_didxs = ut.dict_take(multiindex_dict2_, fname_based_duplicate_hashes) fname_based_duplicate_dpaths = ut.list_unflat_take(dpath_list, fname_based_duplicate_didxs) print(ut.repr3(fname_based_duplicate_dpaths[0:10]))
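# A tiny, self-contained illustration (hypothetical paths and hashes) of the
# grouping step used above: bucket file indices by content hash and keep only
# the buckets with more than one member.
from collections import defaultdict

fpath_list = ['E:/a/clip.avi', 'E:/a/b/clip.avi', 'E:/c/other.avi']
hash_list = ['h1', 'h1', 'h2']

hash_to_idxs = defaultdict(list)
for idx, digest in enumerate(hash_list):
    hash_to_idxs[digest].append(idx)

duplicate_idxs = [idxs for idxs in hash_to_idxs.values() if len(idxs) > 1]
unflat_fpaths = [[fpath_list[i] for i in idxs] for idxs in duplicate_idxs]
print(unflat_fpaths)  # [['E:/a/clip.avi', 'E:/a/b/clip.avi']]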
def parse_shark_tags(orig_fname_list): import re invalid_tag_patterns = [ re.escape('-'), re.escape('(') + '?\\d*' + re.escape(')') + '?', '\\d+-\\d+-\\d+', '\\d+,', '\\d+', 'vi*', 'i*v', 'i+', '\\d+th', '\\d+nd', '\\d+rd', 'remant', 'timnfe', 't', 'e', 'sjl', 'disc', 'dec', 'road', 'easter', 'western', 'west', 'tn', '\\d*ap', 'whaleshark\\d*', 'shark\\d*', 'whale\\d*', 'whalesharking', 'sharking', 'whalesharks', 'whales', 'picture', 'australien', 'australia', 'nick', 'tim\\d*', 'imageset', 'holiday', 'visit', 'tour', 'trip', 'pec', 'sv', 'a', 'b', 'gender', 'sex', 'img', 'image', 'pic', 'pics', 'leith', 'trips', 'kings', 'photo', 'video', 'media', 'fix', 'feeding', 'nrd', 'nd', 'gen', 'wa', 'nmp', 'bo', 'kd', 'ow', 'ne', 'dsc', 'nwd', 'mg', 'w', 'mai', 'blue', 'stumpy', 'oea', 'cbe', 'edc', 'knrt', 'tiws2', 'ando', 'adv', 'str', 'adventure', 'camera', 'tag', 'id', 'of', 'and', 'tagged', 'from', 'day', '\\d*april', '\\d*may', '\\d*july', '\\d*june', 'ningaloo', 'ningblue\\d*', 'kooling', ] valid_tag_level_set = [ ['view-left', 'left', 'lhs', 'l', 'leftside'], ['view-right', 'right', 'rhs', 'r', 'rightside'], ['view-back', 'back'], ['view-top', 'top'], ['sex-male', 'male', 'm', 'sexm'], ['sex-female', 'female', 'f'], ['sex-unknown', 'unknown', 'u'], ['part-tail', 'tail'], ['part-flank', 'side', 'flank'], ['part-head', 'head'], ['part-pectoral', 'pectoral', 'pec'], ['part-dorsal', 'dorsal', 'dorsals'], ['part-claspers', 'claspers', 'clasper'], ['part-fin', 'fin'], ['cropped', 'crop'], ['scar', 'scar2'], ['notch'], ['small'], ['bite'], ['cam-slr2', 'slr2'], #['cam-5m', '5m'] ['5m'], ['7m'], ['4m'], ['copy'], ['qual-resize'], ['qual-stretched'], ] def apply_enum_regex(pat_list): enum_endings = [ '[a-g]', '\\d*', 'i*', ] expanded_pats = ut.flatten([[pat + end for end in enum_endings] for pat in pat_list]) return expanded_pats def apply_regex_endings(pat_list): return [p + '$' for p in pat_list] tag_alias_map = {} for level_set in valid_tag_level_set: main_key = level_set[0] for key in level_set: tag_alias_map[key] = main_key inverse_alias_map = {} for level_set in valid_tag_level_set: inverse_alias_map[level_set[0]] = level_set regex_alias_map = { 'view-left': apply_regex_endings(apply_enum_regex(inverse_alias_map['view-left'])), 'view-right': apply_regex_endings(apply_enum_regex(inverse_alias_map['view-right'])), } valid_tags = list(inverse_alias_map.keys()) invalid_tag_patterns = apply_regex_endings(invalid_tag_patterns) def parse_all_fname_tags(fname): _tags = [splitext(fname)[0]] _tags = ut.flatten([t.split('_') for t in _tags]) _tags = ut.flatten([t.split('.') for t in _tags]) _tags = [t.lower() for t in _tags] _tags = [tag_alias_map.get(t, t) for t in _tags] for key, vals in regex_alias_map.items(): pat = ut.regex_or(vals) _tags = [key if re.match(pat, t) else t for t in _tags] pat = ut.regex_or(invalid_tag_patterns) _tags = [t for t in _tags if not re.match(pat, t)] _tags = ut.unique_ordered(_tags) return _tags all_img_tag_list = list(map(parse_all_fname_tags, orig_fname_list)) known_img_tag_list = [ list(set(tags).intersection(set(valid_tags))) for tags in all_img_tag_list ] if False: # Help figure out which tags are important _parsed_tags = ut.flatten(all_img_tag_list) taghist = ut.dict_hist(_parsed_tags) taghist = {key: val for key, val in taghist.items() if val > 1} unknown_taghist = sorted([(val, key) for key, val in taghist.items() if key not in valid_tags])[::-1] known_taghist = sorted([(val, key) for key, val in taghist.items() if key in valid_tags])[::-1] print('Known') 
        print(ut.list_str(known_taghist[0:100]))
        print('Unknown')
        print(ut.list_str(unknown_taghist[0:100]))

    print(ut.dict_str(
        ut.dict_hist(ut.flatten(known_img_tag_list)),
        key_order_metric='val'))

    return known_img_tag_list
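# A toy, self-contained version of the filename-to-tag normalization above.
# The alias map and invalid patterns here are abbreviated stand-ins for the
# full valid_tag_level_set / invalid_tag_patterns tables.
import re
from os.path import splitext

alias = {'left': 'view-left', 'l': 'view-left', 'male': 'sex-male', 'tail': 'part-tail'}
invalid = [r'\d+$', r'img$', r'photo$']

def fname_to_tags(fname):
    # split on '_' and '.', lowercase, map aliases, then drop invalid tokens
    tokens = re.split(r'[._]', splitext(fname)[0].lower())
    tokens = [alias.get(t, t) for t in tokens]
    pat = '|'.join(invalid)
    return [t for t in tokens if not re.match(pat, t)]

print(fname_to_tags('IMG_1234_left_tail_male.jpg'))
# ['view-left', 'part-tail', 'sex-male']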
def ggr_random_name_splits(): """ CommandLine: python -m wbia.viz.viz_graph2 ggr_random_name_splits --show Ignore: sshfs -o idmap=user lev:/ ~/lev Example: >>> # DISABLE_DOCTEST >>> from wbia.viz.viz_graph2 import * # NOQA >>> ggr_random_name_splits() """ import wbia.guitool as gt gt.ensure_qtapp() # nid_list = ibs.get_valid_nids(filter_empty=True) import wbia dbdir = '/media/danger/GGR/GGR-IBEIS' dbdir = (dbdir if ut.checkpath(dbdir) else ut.truepath('~/lev/media/danger/GGR/GGR-IBEIS')) ibs = wbia.opendb(dbdir=dbdir, allow_newdir=False) import datetime day1 = datetime.date(2016, 1, 30) day2 = datetime.date(2016, 1, 31) orig_filter_kw = { 'multiple': None, # 'view': ['right'], # 'minqual': 'good', 'is_known': True, 'min_pername': 2, } orig_aids = ibs.filter_annots_general(filter_kw=ut.dict_union( orig_filter_kw, { 'min_unixtime': ut.datetime_to_posixtime(ut.date_to_datetime(day1, 0.0)), 'max_unixtime': ut.datetime_to_posixtime(ut.date_to_datetime(day2, 1.0)), }, )) orig_all_annots = ibs.annots(orig_aids) orig_unique_nids, orig_grouped_annots_ = orig_all_annots.group( orig_all_annots.nids) # Ensure we get everything orig_grouped_annots = [ ibs.annots(aids_) for aids_ in ibs.get_name_aids(orig_unique_nids) ] # pip install quantumrandom if False: import quantumrandom data = quantumrandom.uint16() seed = data.sum() print('seed = %r' % (seed, )) # import Crypto.Random # from Crypto import Random # quantumrandom.get_data() # StrongRandom = Crypto.Random.random.StrongRandom # aes.reseed(3340258) # chars = [str(chr(x)) for x in data.view(np.uint8)] # aes_seed = str('').join(chars) # aes = Crypto.Random.Fortuna.FortunaGenerator.AESGenerator() # aes.reseed(aes_seed) # aes.pseudo_random_data(10) orig_rand_idxs = ut.random_indexes(len(orig_grouped_annots), seed=3340258) orig_sample_size = 75 random_annot_groups = ut.take(orig_grouped_annots, orig_rand_idxs) orig_annot_sample = random_annot_groups[:orig_sample_size] # OOOPS MADE ERROR REDO ---- filter_kw = { 'multiple': None, 'view': ['right'], 'minqual': 'good', 'is_known': True, 'min_pername': 2, } filter_kw_ = ut.dict_union( filter_kw, { 'min_unixtime': ut.datetime_to_posixtime(ut.date_to_datetime(day1, 0.0)), 'max_unixtime': ut.datetime_to_posixtime(ut.date_to_datetime(day2, 1.0)), }, ) refiltered_sample = [ ibs.filter_annots_general(annot.aids, filter_kw=filter_kw_) for annot in orig_annot_sample ] is_ok = np.array(ut.lmap(len, refiltered_sample)) >= 2 ok_part_orig_sample = ut.compress(orig_annot_sample, is_ok) ok_part_orig_nids = [x.nids[0] for x in ok_part_orig_sample] # Now compute real sample aids = ibs.filter_annots_general(filter_kw=filter_kw_) all_annots = ibs.annots(aids) unique_nids, grouped_annots_ = all_annots.group(all_annots.nids) grouped_annots = grouped_annots_ # Ensure we get everything # grouped_annots = [ibs.annots(aids_) for aids_ in ibs.get_name_aids(unique_nids)] pop = len(grouped_annots) pername_list = ut.lmap(len, grouped_annots) groups = wbia.annots.AnnotGroups(grouped_annots, ibs) match_tags = [ut.unique(ut.flatten(t)) for t in groups.match_tags] tag_case_hist = ut.dict_hist(ut.flatten(match_tags)) print('name_pop = %r' % (pop, )) print('Annots per Multiton Name' + ut.repr3(ut.get_stats(pername_list, use_median=True))) print('Name Tag Hist ' + ut.repr3(tag_case_hist)) print('Percent Photobomb: %.2f%%' % (tag_case_hist['photobomb'] / pop * 100)) print('Percent Split: %.2f%%' % (tag_case_hist['splitcase'] / pop * 100)) # Remove the ok part from this sample remain_unique_nids = ut.setdiff(unique_nids, ok_part_orig_nids) 
remain_grouped_annots = [ ibs.annots(aids_) for aids_ in ibs.get_name_aids(remain_unique_nids) ] sample_size = 75 import vtool as vt vt.calc_sample_from_error_bars(0.05, pop, conf_level=0.95, prior=0.05) remain_rand_idxs = ut.random_indexes(len(remain_grouped_annots), seed=3340258) remain_sample_size = sample_size - len(ok_part_orig_nids) remain_random_annot_groups = ut.take(remain_grouped_annots, remain_rand_idxs) remain_annot_sample = remain_random_annot_groups[:remain_sample_size] annot_sample_nofilter = ok_part_orig_sample + remain_annot_sample # Filter out all bad parts annot_sample_filter = [ ibs.annots(ibs.filter_annots_general(annot.aids, filter_kw=filter_kw_)) for annot in annot_sample_nofilter ] annot_sample = annot_sample_filter win = None from wbia.viz import viz_graph2 for annots in ut.InteractiveIter(annot_sample): if win is not None: win.close() win = viz_graph2.make_qt_graph_interface(ibs, aids=annots.aids, init_mode='rereview') print(win) sample_groups = wbia.annots.AnnotGroups(annot_sample, ibs) flat_tags = [ut.unique(ut.flatten(t)) for t in sample_groups.match_tags] print('Using Split and Photobomb') is_positive = ['photobomb' in t or 'splitcase' in t for t in flat_tags] num_positive = sum(is_positive) vt.calc_error_bars_from_sample(sample_size, num_positive, pop, conf_level=0.95) print('Only Photobomb') is_positive = ['photobomb' in t for t in flat_tags] num_positive = sum(is_positive) vt.calc_error_bars_from_sample(sample_size, num_positive, pop, conf_level=0.95) print('Only SplitCase') is_positive = ['splitcase' in t for t in flat_tags] num_positive = sum(is_positive) vt.calc_error_bars_from_sample(sample_size, num_positive, pop, conf_level=0.95)
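# The sampling above sizes the audit with vt.calc_sample_from_error_bars(0.05,
# pop, conf_level=0.95, prior=0.05). The sketch below is a standard
# normal-approximation sample-size formula with a finite population correction;
# it plays the same role but may not match that function's exact computation.
import math
from scipy.stats import norm

def sample_size_for_proportion(margin, population, conf_level=0.95, prior=0.05):
    # Two-sided z-score for the requested confidence level
    z = norm.ppf(1 - (1 - conf_level) / 2)
    n0 = z ** 2 * prior * (1 - prior) / margin ** 2
    # Finite population correction
    return math.ceil(n0 / (1 + (n0 - 1) / population))

print(sample_size_for_proportion(0.05, population=1000))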
if len(global_aid_list_) > MAX_AIDS: global_aid_list_ = global_aid_list_[:MAX_AIDS] if len(global_aid_list_) > 1: valid_aid = global_aid_list_[0] valid_aid_set.add(valid_aid) # test_aid = global_aid_list_[1] # test_aid_set.add(test_aid) aid_list += global_aid_list_ count = len(global_aid_list_) count_list.append(count) if len(count_list) >= MAX_NAMES: break print(ut.repr3(ut.dict_hist(count_list))) print(ut.repr3(len(count_list))) tips_list = depc.get('Notch_Tips', aid_list) size_list = depc.get('chips', aid_list, ('width', 'height')) config = { 'dim_size': 1000, 'resize_dim': 'width', 'ext': '.jpg', } chip_list = depc.get('chips', aid_list, 'img', config=config, ensure=True) color_list = [ (255, 0, 0), (0, 0, 255), (0, 255, 0),
def tag_hist(tags_list):
    import utool as ut
    return ut.dict_hist(ut.flatten(tags_list), ordered=True)
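# Equivalent histogram using only the standard library (ut.dict_hist with
# ordered=True additionally returns an explicitly ordered result, which
# Counter does not attempt):
from collections import Counter

tags_list = [['scar', 'bite'], ['scar'], ['notch', 'scar']]
print(Counter(tag for tags in tags_list for tag in tags))
# Counter({'scar': 3, 'bite': 1, 'notch': 1})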
def get_toy_data_1vM(num_annots, num_names=None, **kwargs): r""" Args: num_annots (int): num_names (int): (default = None) Kwargs: initial_aids, initial_nids, nid_sequence, seed Returns: tuple: (pair_list, feat_list) CommandLine: python -m ibeis.algo.hots.demobayes --exec-get_toy_data_1vM --show Example: >>> # DISABLE_DOCTEST >>> from ibeis.algo.hots.demobayes import * # NOQA >>> num_annots = 1000 >>> num_names = 40 >>> get_toy_data_1vM(num_annots, num_names) >>> ut.quit_if_noshow() >>> import plottool as pt >>> ut.show_if_requested() """ import vtool as vt tup_ = get_toy_annots(num_annots, num_names, **kwargs) aids, nids, aids1, nids1, all_aids, all_nids = tup_ rng = vt.ensure_rng(None) # Test a simple SVM classifier nid2_nexemp = ut.dict_hist(nids1) aid2_nid = dict(zip(aids, nids)) ut.fix_embed_globals() #def add_to_globals(globals_, subdict): # globals_.update(subdict) unique_nids = list(nid2_nexemp.keys()) def annot_to_class_feats2(aid, aid2_nid, top=None): pair_list = [] score_list = [] nexemplar_list = [] for nid in unique_nids: label = (aid2_nid[aid] == nid) num_exemplars = nid2_nexemp.get(nid, 0) if num_exemplars == 0: continue params = toy_params[label] mu, sigma = ut.dict_take(params, ['mu', 'sigma']) score_ = rng.normal(mu, sigma, size=num_exemplars).max() score = np.clip(score_, 0, np.inf) pair_list.append((aid, nid)) score_list.append(score) nexemplar_list.append(num_exemplars) rank_list = ut.argsort(score_list, reverse=True) feat_list = np.array([score_list, rank_list, nexemplar_list]).T sortx = np.argsort(rank_list) feat_list = feat_list.take(sortx, axis=0) pair_list = np.array(pair_list).take(sortx, axis=0) if top is not None: feat_list = feat_list[:top] pair_list = pair_list[0:top] return pair_list, feat_list toclass_features = [annot_to_class_feats2(aid, aid2_nid, top=5) for aid in aids] aidnid_pairs = np.vstack(ut.get_list_column(toclass_features, 0)) feat_list = np.vstack(ut.get_list_column(toclass_features, 1)) score_list = feat_list.T[0:1].T lbl_list = [aid2_nid[aid] == nid for aid, nid in aidnid_pairs] from sklearn import svm #clf1 = svm.LinearSVC() print('Learning classifiers') clf3 = svm.SVC() clf3.fit(feat_list, lbl_list) clf1 = svm.LinearSVC() clf1.fit(score_list, lbl_list) # Score new annots against the training database tup_ = get_toy_annots(num_annots * 2, num_names, initial_aids=all_aids, initial_nids=all_nids) aids, nids, aids1, nids1, all_aids, all_nids = tup_ aid2_nid = dict(zip(aids, nids)) toclass_features = [annot_to_class_feats2(aid, aid2_nid) for aid in aids] aidnid_pairs = np.vstack(ut.get_list_column(toclass_features, 0)) feat_list = np.vstack(ut.get_list_column(toclass_features, 1)) lbl_list = np.array([aid2_nid[aid] == nid for aid, nid in aidnid_pairs]) print('Running tests') score_list = feat_list.T[0:1].T tp_feat_list = feat_list[lbl_list] tn_feat_list = feat_list[~lbl_list] tp_lbls = lbl_list[lbl_list] tn_lbls = lbl_list[~lbl_list] print('num tp: %d' % len(tp_lbls)) print('num fp: %d' % len(tn_lbls)) tp_score_list = score_list[lbl_list] tn_score_list = score_list[~lbl_list] print('tp_feat' + ut.repr3(ut.get_stats(tp_feat_list, axis=0), precision=2)) print('tp_feat' + ut.repr3(ut.get_stats(tn_feat_list, axis=0), precision=2)) print('tp_score' + ut.repr2(ut.get_stats(tp_score_list), precision=2)) print('tp_score' + ut.repr2(ut.get_stats(tn_score_list), precision=2)) tp_pred3 = clf3.predict(tp_feat_list) tn_pred3 = clf3.predict(tn_feat_list) print((tp_pred3.sum(), tp_pred3.shape)) print((tn_pred3.sum(), tn_pred3.shape)) tp_score3 = 
clf3.score(tp_feat_list, tp_lbls) tn_score3 = clf3.score(tn_feat_list, tn_lbls) tp_pred1 = clf1.predict(tp_score_list) tn_pred1 = clf1.predict(tn_score_list) print((tp_pred1.sum(), tp_pred1.shape)) print((tn_pred1.sum(), tn_pred1.shape)) tp_score1 = clf1.score(tp_score_list, tp_lbls) tn_score1 = clf1.score(tn_score_list, tn_lbls) print('tp score with rank = %r' % (tp_score3,)) print('tn score with rank = %r' % (tn_score3,)) print('tp score without rank = %r' % (tp_score1,)) print('tn score without rank = %r' % (tn_score1,)) toy_data = {} return toy_data
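# The score-only classifier (clf1) above fits a linear SVM on a single feature.
# A minimal, self-contained sketch of that pattern on made-up match scores
# (the means, sizes, and seed are assumptions for illustration only):
import numpy as np
from sklearn import svm

rng = np.random.RandomState(0)
tp_scores = rng.normal(6.0, 1.0, size=(50, 1))  # hypothetical true-match scores
tn_scores = rng.normal(2.0, 1.0, size=(50, 1))  # hypothetical non-match scores
X = np.vstack([tp_scores, tn_scores])
y = np.array([True] * 50 + [False] * 50)

clf = svm.LinearSVC()
clf.fit(X, y)
print('train accuracy = %.2f' % clf.score(X, y))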
def update_asset_symlinks(self, verbose=True): """ Traverse the files in the _submission/ folder and add/update symlinks for any relevant files we identify Ref: https://pypi.org/project/python-magic/ https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types http://www.iana.org/assignments/media-types/media-types.xhtml """ from app.modules.assets.models import Asset import utool as ut import magic submission_abspath = self.get_absolute_path() submission_path = os.path.join(submission_abspath, '_submission') assets_path = os.path.join(submission_abspath, '_assets') current_app.sub.ensure_initialed() # Walk the submission path, looking for white-listed MIME type files files = [] skipped = [] errors = [] walk_list = sorted(list(os.walk(submission_path))) print('Walking submission...') for root, directories, filenames in tqdm.tqdm(walk_list): filenames = sorted(filenames) for filename in filenames: filepath = os.path.join(root, filename) # Normalize path (sanity check) filepath = os.path.normpath(filepath) # Sanity check, ensure that the path is formatted well assert os.path.exists(filepath) assert os.path.isabs(filepath) try: basename = os.path.basename(filepath) _, extension = os.path.splitext(basename) extension = extension.lower() extension = extension.strip('.') if basename.startswith('.'): # Skip hidden files if basename not in ['.touch']: skipped.append((filepath, basename)) continue if os.path.isdir(filepath): # Skip any directories (sanity check) skipped.append((filepath, extension)) continue if os.path.islink(filepath): # Skip any symbolic links (sanity check) skipped.append((filepath, extension)) continue mime_type = magic.from_file(filepath, mime=True) if mime_type not in current_app.sub.mime_type_whitelist: # Skip any unsupported MIME types skipped.append((filepath, extension)) continue magic_signature = magic.from_file(filepath) size_bytes = os.path.getsize(filepath) file_data = { 'filepath': filepath, 'path': basename, 'extension': extension, 'mime_type': mime_type, 'magic_signature': magic_signature, 'size_bytes': size_bytes, 'submission_guid': self.guid, } files.append(file_data) except Exception: logging.exception('Got exception in update_asset_symlinks') errors.append(filepath) if verbose: print('Processed asset files from submission: %r' % (self, )) print('\tFiles : %d' % (len(files), )) print('\tSkipped : %d' % (len(skipped), )) if len(skipped) > 0: skipped_ext_list = [skip[1] for skip in skipped] skipped_ext_str = ut.repr3(ut.dict_hist(skipped_ext_list)) skipped_ext_str = skipped_ext_str.replace('\n', '\n\t\t') print('\t\t%s' % (skipped_ext_str, )) print('\tErrors : %d' % (len(errors), )) # Compute the xxHash64 for all found files filepath_list = [file_data['filepath'] for file_data in files] arguments_list = list(zip(filepath_list)) print('Computing filesystem xxHash64...') filesystem_xxhash64_list = parallel(compute_xxhash64_digest_filepath, arguments_list) filesystem_guid_list = list( map(ut.hashable_to_uuid, filesystem_xxhash64_list)) # Update file_data with the filesystem and semantic hash information zipped = zip(files, filesystem_xxhash64_list, filesystem_guid_list) for file_data, filesystem_xxhash64, filesystem_guid in zipped: file_data['filesystem_xxhash64'] = filesystem_xxhash64 file_data['filesystem_guid'] = filesystem_guid semantic_guid_data = ( file_data['submission_guid'], file_data['filesystem_guid'], ) file_data['semantic_guid'] = ut.hashable_to_uuid( semantic_guid_data) # Delete all existing symlinks 
existing_filepath_guid_mapping = {} existing_asset_symlinks = ut.glob(os.path.join(assets_path, '*')) for existing_asset_symlink in existing_asset_symlinks: basename = os.path.basename(existing_asset_symlink) if basename in ['.touch', 'derived']: continue existing_asset_target = os.readlink(existing_asset_symlink) existing_asset_target_ = os.path.abspath( os.path.join(assets_path, existing_asset_target)) if os.path.exists(existing_asset_target_): uuid_str, _ = os.path.splitext(basename) uuid_str = uuid_str.strip().strip('.') try: existing_filepath_guid_mapping[ existing_asset_target_] = uuid.UUID(uuid_str) except Exception: pass os.remove(existing_asset_symlink) # Add new or update any existing Assets found in the Submission asset_submission_filepath_list = [ file_data.pop('filepath', None) for file_data in files ] assets = [] with db.session.begin(): for file_data, asset_submission_filepath in zip( files, asset_submission_filepath_list): semantic_guid = file_data.get('semantic_guid', None) asset = Asset.query.filter( Asset.semantic_guid == semantic_guid).first() if asset is None: # Check if we can recycle existing GUID from symlink recycle_guid = existing_filepath_guid_mapping.get( asset_submission_filepath, None) if recycle_guid is not None: file_data['guid'] = recycle_guid # Create record if asset is new asset = Asset(**file_data) db.session.add(asset) else: # Update record if Asset exists for key in file_data: if key in [ 'submission_guid', 'filesystem_guid', 'semantic_guid' ]: continue value = file_data[key] setattr(asset, key, value) db.session.merge(asset) assets.append(asset) # Update all symlinks for each Asset for asset, asset_submission_filepath in zip( assets, asset_submission_filepath_list): db.session.refresh(asset) asset.update_symlink(asset_submission_filepath) if verbose: print(filepath) print('\tAsset : %s' % (asset, )) print('\tSemantic GUID : %s' % (asset.semantic_guid, )) print('\tExtension : %s' % (asset.extension, )) print('\tMIME type : %s' % (asset.mime_type, )) print('\tSignature : %s' % (asset.magic_signature, )) print('\tSize bytes : %s' % (asset.size_bytes, )) print('\tFS xxHash64 : %s' % (asset.filesystem_xxhash64, )) print('\tFS GUID : %s' % (asset.filesystem_guid, )) # Get all historical and current Assets for this Submission db.session.refresh(self) # Delete any historical Assets that have been deleted from this commit deleted_assets = list(set(self.assets) - set(assets)) if verbose: print('Deleting %d orphaned Assets' % (len(deleted_assets), )) with db.session.begin(): for deleted_asset in deleted_assets: deleted_asset.delete() db.session.refresh(self)
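# The filesystem hashing step above goes through a parallel helper
# (compute_xxhash64_digest_filepath). A minimal single-file sketch, assuming
# the third-party xxhash package is available; uuid.uuid5 stands in for
# ut.hashable_to_uuid here and will not produce identical GUIDs:
import os
import tempfile
import uuid
import xxhash

def file_xxhash64(filepath, chunk_size=1 << 20):
    # Stream the file in chunks so large assets do not need to fit in memory
    h = xxhash.xxh64()
    with open(filepath, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()

with tempfile.NamedTemporaryFile(delete=False) as tmp:
    tmp.write(b'example asset bytes')
digest = file_xxhash64(tmp.name)
guid = uuid.uuid5(uuid.NAMESPACE_OID, digest)
print(digest, guid)
os.unlink(tmp.name)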
def detect_sharks(ibs, gids): # import wbia # ibs = wbia.opendb('WS_ALL') config = { 'algo': 'yolo', 'sensitivity': 0.2, 'config_filepath': ut.truepath('~/work/WS_ALL/localizer_backup/detect.yolo.2.cfg'), 'weight_filepath': ut.truepath( '~/work/WS_ALL/localizer_backup/detect.yolo.2.39000.weights'), 'class_filepath': ut.truepath( '~/work/WS_ALL/localizer_backup/detect.yolo.2.cfg.classes'), } depc = ibs.depc_image # imgsets = ibs.imagesets(text='Injured Sharks') # images = ibs.images(imgsets.gids[0]) images = ibs.images(gids) images = images.compress([ext not in ['.gif'] for ext in images.exts]) gid_list = images.gids # result is a tuple: # (score, bbox_list, theta_list, conf_list, class_list) results_list = depc.get_property('localizations', gid_list, None, config=config) results_list2 = [] multi_gids = [] failed_gids = [] # ibs.set_image_imagesettext(failed_gids, ['Fixme'] * len(failed_gids)) ibs.set_image_imagesettext(multi_gids, ['Fixme2'] * len(multi_gids)) failed_gids for gid, res in zip(gid_list, results_list): score, bbox_list, theta_list, conf_list, class_list = res if len(bbox_list) == 0: failed_gids.append(gid) elif len(bbox_list) == 1: results_list2.append((gid, bbox_list, theta_list)) elif len(bbox_list) > 1: multi_gids.append(gid) idx = conf_list.argmax() res2 = (gid, bbox_list[idx:idx + 1], theta_list[idx:idx + 1]) results_list2.append(res2) ut.dict_hist(([t[1].shape[0] for t in results_list])) localized_imgs = ibs.images(ut.take_column(results_list2, 0)) assert all([len(a) == 1 for a in localized_imgs.aids]) old_annots = ibs.annots(ut.flatten(localized_imgs.aids)) # old_tags = old_annots.case_tags # Override old bboxes import numpy as np bboxes = np.array(ut.take_column(results_list2, 1))[:, 0, :] ibs.set_annot_bboxes(old_annots.aids, bboxes) if False: import wbia.plottool as pt pt.qt4ensure() inter = pt.MultiImageInteraction( ibs.get_image_paths(ut.take_column(results_list2, 0)), bboxes_list=ut.take_column(results_list2, 1), ) inter.dump_to_disk('shark_loc', num=50, prefix='shark_loc') inter.start() inter = pt.MultiImageInteraction(ibs.get_image_paths(failed_gids)) inter.start() inter = pt.MultiImageInteraction(ibs.get_image_paths(multi_gids)) inter.start()
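# When the localizer returns multiple boxes for an image, the loop above keeps
# only the most confident one. The core of that selection, with made-up boxes
# and confidences:
import numpy as np

bbox_list = np.array([[10, 10, 50, 40], [200, 80, 60, 60], [5, 5, 300, 200]])
conf_list = np.array([0.35, 0.90, 0.55])

idx = conf_list.argmax()
best_bbox = bbox_list[idx:idx + 1]  # slicing keeps the 2D shape, as above
print(idx, best_bbox)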
def ensure_tf(X):
    termfreq = ut.dict_hist(X.wx_list)
    # Normalize counts into term frequencies, as in the Video Google approach
    termfreq = ut.map_dict_vals(lambda x: x / len(X.wx_list), termfreq)
    X.termfreq = termfreq
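# A standard-library sketch of the same term-frequency normalization, using a
# hypothetical list of visual-word assignments:
from collections import Counter

wx_list = [3, 7, 3, 3, 9, 7]
counts = Counter(wx_list)
termfreq = {wx: n / len(wx_list) for wx, n in counts.items()}
print(termfreq)  # {3: 0.5, 7: 0.333..., 9: 0.166...}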
def limited_power_toughness_histogram(): r""" CommandLine: python -m mtgmonte.stats --exec-limited_power_toughness_histogram --show Example: >>> # DISABLE_DOCTEST >>> from mtgmonte.stats import * # NOQA >>> result = limited_power_toughness_histogram() >>> print(result) >>> ut.show_if_requested() """ from mtgmonte import mtgobjs from mtglib.gatherer_request import SearchRequest from mtglib.card_extractor import CardExtractor # from mtglib.card_renderer import CardList request = SearchRequest({"set": "Oath of the Gatewatch"}) def add_page(url, page): parts = url.split("/") part1 = "/".join(parts[:-1]) part2 = "/Default.aspx?page=%d&" % (page,) part3 = parts[-1].replace("Default.aspx?", "") url2 = part1 + part2 + part3 return url2 card_list = [] for page in range(0, 10): url = request.url url2 = add_page(url, page) extract = CardExtractor(url2) card_list0 = extract.cards for card in card_list0: card2 = mtgobjs.Card2() card2.__dict__.update(card.__dict__) card_list.append(card2) if len(card_list0) != 100: break for c in card_list: c.nice_attrs += ["rarity"] creats = [_card2 for _card2 in card_list if "Creature" in card2.types] creats = [_card2 for _card2 in creats if _card2.rarity in ["Common", "Uncommon"]] powtough = [] for c in creats: try: powtough.append((int(c.power), int(c.toughness))) except ValueError: pass import plottool as pt pt.ensure_pylab_qt4() import numpy as np scores_list = np.array(list(zip(*powtough))) xdata = np.arange(0, np.max(scores_list) + 1) powhist = np.histogram(scores_list[0], bins=xdata)[0] toughist = np.histogram(scores_list[1], bins=xdata)[0] pt.multi_plot(xdata, [powhist, toughist], label_list=["power", "toughness"], kind="bar") bothhist = ut.dict_hist(powtough) xdata = np.arange(len(bothhist)) dat = sorted(bothhist.items()) xticklabels = ut.take_column(dat, 0) ydata = ut.take_column(dat, 1) pt.multi_plot(xdata, [ydata], xticklabels=xticklabels, kind="bar")
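# The joint histogram above relies on ut.dict_hist over (power, toughness)
# tuples; a standard-library equivalent with made-up creature stats:
from collections import Counter

powtough = [(1, 1), (2, 2), (2, 3), (1, 1), (3, 3), (2, 2)]
hist = Counter(powtough)
for (pow_, tough), count in sorted(hist.items()):
    print('%d/%d: %d' % (pow_, tough, count))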