Example #1
def report_partitioning_statistics(new_reduced_joint):
    # compute partitioning statistics
    import vtool as vt
    vals, idxs = vt.group_indices(new_reduced_joint.values.ravel())
    #groupsize = list(map(len, idxs))
    #groupassigns = ut.unflat_vecmap(new_reduced_joint.assignment, idxs)
    all_states = new_reduced_joint._row_labels(asindex=True)
    clusterstats = [tuple(sorted(list(ut.dict_hist(a).values())))
                    for a in all_states]
    grouped_vals = ut.group_items(new_reduced_joint.values.ravel(),
                                  clusterstats)

    #probs_assigned_to_clustertype = [(
    #    sorted(np.unique(np.array(b).round(decimals=5)).tolist())[::-1], a)
    #    for a, b in grouped_vals.items()]
    probs_assigned_to_clustertype = [(
        ut.dict_hist(np.array(b).round(decimals=5)), a)
        for a, b in grouped_vals.items()]
    sortx = ut.argsort([max(c[0].keys())
                        for c in probs_assigned_to_clustertype])
    probs_assigned_to_clustertype = ut.take(probs_assigned_to_clustertype, sortx)

    # This is a list of 2-tuples. The first item maps each unique probability
    # assigned to a cluster type to the number of times it was assigned; the
    # cluster type itself is the second item. Every number in a cluster type
    # is how many annotations were assigned to a specific label, so its length
    # is the total number of labels. For all-low scores you will see
    # [[{somenum: 1}, {0: 800}], [1, 1, 1, ... 1]], indicating that the
    # assignment of everyone to a different label happened once with
    # probability somenum and 800 times with probability 0.

    #print(sorted([(b, a) for a, b in ut.map_dict_vals(sum, x)]).items())
    #z = sorted([(b, a) for a, b in ut.map_dict_vals(sum, grouped_vals).items()])
    print(ut.repr2(probs_assigned_to_clustertype, nl=2, precision=2, sorted_=True))
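A quick note on the primitives these examples lean on: ut.dict_hist builds a value-to-count mapping and ut.group_items buckets one list by a parallel list of keys. A minimal stdlib sketch of those assumed behaviors (Counter/defaultdict stand-ins; the real utool functions may differ in details such as ordering):

from collections import Counter, defaultdict

def dict_hist(items):
    # value -> number of occurrences (assumed ut.dict_hist behavior)
    return dict(Counter(items))

def group_items(values, keys):
    # bucket each value under its corresponding key (assumed ut.group_items behavior)
    groups = defaultdict(list)
    for key, val in zip(keys, values):
        groups[key].append(val)
    return dict(groups)

probs = [0.0, 0.0, 0.5, 0.25, 0.25, 0.0]
labels = ['a', 'a', 'b', 'c', 'c', 'a']
print(dict_hist(labels))           # {'a': 3, 'b': 1, 'c': 2}
print(group_items(probs, labels))  # {'a': [0.0, 0.0, 0.0], 'b': [0.5], 'c': [0.25, 0.25]}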
Example #2
    def find_needsmove_to_other(self, other):
        hash1 = self.get_prop('md5_stride')
        hash2 = other.get_prop('md5_stride')
        idxs1 = list(range(len(hash1)))

        hash_to_idxs = ut.group_items(idxs1, hash1)
        # Find what we have that other doesn't have and move it there
        other_missing = set(hash1).difference(hash2)
        missing_idxs1 = ut.flatten(ut.take(hash_to_idxs, other_missing))

        data = ut.ColumnLists({
            'idx': missing_idxs1,
            'fname': self.get_prop('fname', missing_idxs1),
            'dname': self.get_prop('dname', missing_idxs1),
            'full_path': self.get_prop('full_path', missing_idxs1),
            'nbytes': self.get_prop('nbytes', missing_idxs1),
        })
        data = data.compress([f != 'Thumbs.db' for f in data['fname']])
        data['ext'] = self.get_prop('ext', data['idx'])
        ut.dict_hist(data['ext'])
        data.print(ignore=['full_path', 'dname'])
Example #3
    def inference_stats(infr_list_):
        relabel_stats = []
        for infr in infr_list_:
            num_ccs, num_inconsistent = infr.relabel_using_reviews()
            state_hist = ut.dict_hist(nx.get_edge_attributes(infr.graph, 'decision').values())
            if POSTV not in state_hist:
                state_hist[POSTV] = 0
            hist = ut.dict_hist(nx.get_edge_attributes(infr.graph, '_speed_split').values())

            subgraphs = infr.positive_connected_compoments()
            subgraph_sizes = [len(g) for g in subgraphs]

            info = ut.odict([
                ('num_nonmatch_edges', state_hist[NEGTV]),
                ('num_match_edges', state_hist[POSTV]),
                ('frac_nonmatch_edges',  state_hist[NEGTV] / (state_hist[POSTV] + state_hist[NEGTV])),
                ('num_inconsistent', num_inconsistent),
                ('num_ccs', num_ccs),
                ('edges_flipped', hist.get('flip', 0)),
                ('edges_unchanged', hist.get('orig', 0)),
                ('bad_unreviewed_edges', hist.get('new', 0)),
                ('orig_size', len(infr.graph)),
                ('new_sizes', subgraph_sizes),
            ])
            relabel_stats.append(info)
        return relabel_stats
Example #4
 def _print_previous_loop_statistics(infr, count):
     # Print stats about what happened in this loop
     history = infr.metrics_list[-count:]
     recover_blocks = ut.group_items([
         (k, sum(1 for i in g))
         for k, g in it.groupby(ut.take_column(history, 'recovering'))
     ]).get(True, [])
     infr.print((
         'Recovery mode entered {} times, '
         'made {} recovery decisions.').format(
             len(recover_blocks), sum(recover_blocks)), color='green')
     testaction_hist = ut.dict_hist(ut.take_column(history, 'test_action'))
     infr.print(
         'Test Action Histogram: {}'.format(
             ut.repr4(testaction_hist, si=True)), color='yellow')
     if infr.params['inference.enabled']:
         action_hist = ut.dict_hist(
             ut.emap(frozenset, ut.take_column(history, 'action')))
         infr.print(
             'Inference Action Histogram: {}'.format(
                 ub.repr2(action_hist, si=True)), color='yellow')
     infr.print(
         'Decision Histogram: {}'.format(ut.repr2(ut.dict_hist(
             ut.take_column(history, 'pred_decision')
         ), si=True)), color='yellow')
     infr.print(
         'User Histogram: {}'.format(ut.repr2(ut.dict_hist(
             ut.take_column(history, 'user_id')
         ), si=True)), color='yellow')
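The recover_blocks computation above run-length-encodes a boolean column with it.groupby and then keeps only the True runs. A standalone sketch of that run-length pattern, with the ut.take_column / ut.group_items behavior assumed and written out with the stdlib:

import itertools as it
from collections import defaultdict

history = [{'recovering': flag} for flag in [False, True, True, False, True, False]]
recovering_col = [row['recovering'] for row in history]            # ut.take_column
runs = [(key, sum(1 for _ in grp)) for key, grp in it.groupby(recovering_col)]
blocks = defaultdict(list)                                         # ut.group_items over (key, length) pairs
for key, length in runs:
    blocks[key].append(length)
recover_blocks = blocks.get(True, [])
print(len(recover_blocks), sum(recover_blocks))                    # entered twice, 3 recovery decisions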
Example #6
    def sub(self, other):
        """
        CommandLine:
            python -m mtgmonte.mtgobjs --exec-ManaSet.sub:0
            python -m mtgmonte.mtgobjs --exec-ManaSet.sub:1

        Example:
            >>> # ENABLE_DOCTEST
            >>> from mtgmonte.mtgobjs import *
            >>> from mtgmonte import mtgobjs
            >>> self = mtgobjs.ManaSet('RRRUC')
            >>> other = mtgobjs.ManaSet('RRU')
            >>> mana = self - other
            >>> result = ('mana = %s' % (mana,))
            >>> print(result)
            mana = {RC}

        Example:
            >>> # ENABLE_DOCTEST
            >>> from mtgmonte.mtgobjs import *  # NOQA
            >>> self = ManaSet(['WWURC'])
            >>> other = ManaCost([('W', 'colored'), ('W', 'colored'), ('U', 'colored'), ('1', 'uncolored')])
            >>> mana = self - other
            >>> result = ('mana = %s' % (mana,))
            >>> print(result)
            mana = {R}
        """
        if isinstance(other, ManaCost):
            colored_cost = other.colored.to_manaset()
            remainder1 = self.sub(colored_cost)
            color2_remain = remainder1.get_colordict()
            uncolored_need = other.num_uncolored
            # TODO: value different colors differently for payment
            if uncolored_need > 0:
                for color in list(color2_remain.keys()):
                    using = min(uncolored_need, color2_remain[color])
                    color2_remain[color] -= using
                    uncolored_need -= using
            if uncolored_need > 0:
                raise NotEnoughManaError('Cannot subtract more mana from less')
            # Todo hybrid / phyrexian
        else:
            color2_need = ut.dict_hist(other._manas)
            color2_remain = ut.ddict(lambda: 0, ut.dict_hist(self._manas))
            for color, num_need in color2_need.items():
                num_have = color2_remain[color]
                if num_have < num_need:
                    raise NotEnoughManaError('Cannot subtract more mana from less')
                color2_remain[color] -= num_need
        color2_remain = delete_dict_zeros(color2_remain)
        remainder = ManaSet(color2_remain)
        return remainder
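The else-branch of ManaSet.sub is a multiset subtraction over color histograms. A compact sketch of the same idea with collections.Counter; the ManaSet/NotEnoughManaError names below are stand-ins for the real mtgobjs classes:

from collections import Counter

class NotEnoughManaError(ValueError):
    pass

def subtract_mana(pool, cost):
    # pool and cost are strings of color symbols, e.g. 'RRRUC' minus 'RRU'
    have, need = Counter(pool), Counter(cost)
    for color, num_need in need.items():
        if have[color] < num_need:
            raise NotEnoughManaError('Cannot subtract more mana from less')
        have[color] -= num_need
    return ''.join(color * n for color, n in have.items() if n > 0)

print(subtract_mana('RRRUC', 'RRU'))   # RC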
Example #7
 def author_hist():
     #print(all_authors)
     hist_ = ut.dict_hist(all_authors, ordered=True)
     # drop the empty-author bucket (assigning first means `del` never raises)
     hist_[''] = None
     del hist_['']
     print('Author histogram')
     print(ut.dict_str(hist_)[-1000:])
Example #8
def cheetah_stats(ibs):
    filters = [
        dict(view=['right', 'frontright', 'backright'], minqual='good'),
        dict(view=['right', 'frontright', 'backright']),
    ]
    for filtkw in filters:
        annots = ibs.annots(ibs.filter_annots_general(**filtkw))
        unique_nids, grouped_annots = annots.group(annots.nids)
        annots_per_name = ut.lmap(len, grouped_annots)
        annots_per_name_freq = ut.dict_hist(annots_per_name)
        def bin_mapper(num):
            if num < 5:
                return (num, num + 1)
            else:
                for bin, mod in [(20, 5), (50, 10)]:
                    if num < bin:
                        low = (num // mod) * mod
                        high = low + mod
                        return (low, high)
                if num >= bin:
                    return (bin, None)
                else:
                    assert False, str(num)
        hist = ut.ddict(lambda: 0)
        for num in annots_per_name:
            hist[bin_mapper(num)] += 1
        hist = ut.sort_dict(hist)

        print('------------')
        print('filters = %s' % ut.repr4(filtkw))
        print('num_annots = %r' % (len(annots)))
        print('num_names = %r' % (len(unique_nids)))
        print('annots_per_name_freq = %s' % (ut.repr4(annots_per_name_freq)))
        print('annots_per_name_freq (ranges) = %s' % (ut.repr4(hist)))
        assert sum(hist.values()) == len(unique_nids)
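The bin_mapper helper collapses an annots-per-name count into a coarse range key before histogramming. A self-contained version of that binning with the same cutoffs, using only the stdlib:

from collections import Counter

def bin_mapper(num):
    # exact bins below 5, then width-5 bins up to 20, width-10 bins up to 50, then open-ended
    if num < 5:
        return (num, num + 1)
    for upper, mod in [(20, 5), (50, 10)]:
        if num < upper:
            low = (num // mod) * mod
            return (low, low + mod)
    return (50, None)

annots_per_name = [1, 1, 2, 7, 13, 22, 61]
hist = Counter(bin_mapper(n) for n in annots_per_name)
print(dict(sorted(hist.items())))
# {(1, 2): 2, (2, 3): 1, (5, 10): 1, (10, 15): 1, (20, 30): 1, (50, None): 1}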
Example #9
def inspect_deck(deck):
    def get_card_tags(card, deck):
        tags = []
        stats = card.mana_source_stats(deck)
        if stats is not None:
            tags.append("land")
            if len(stats[1]) > 0:
                tags.append("tapland")
            else:
                tags.append("untapland")
        return tags

    # ------------
    print("len(deck) = %r" % (len(deck),))
    tags_list = [get_card_tags(card, deck) for card in deck.card_list]
    print("Deck Counts:")
    print(ut.repr2(ut.dict_hist(ut.flatten(tags_list)), nl=True))

    hand = deck.sample_hand()
    manastats_list = [card.mana_source_stats(deck) for card in hand]
    print(ut.list_str([card.name + ": " + text_type(stats) for card, stats in zip(hand, manastats_list)]))
    tags_list = [get_card_tags(card, deck) for card in hand]
    print("Hand Counts")
    print(ut.repr2(ut.dict_hist(ut.flatten(tags_list)), nl=True))

    valid_tags = ["land", "tapland", "untapland"]
    x = {tag: [] for tag in valid_tags}

    for _ in range(500):
        hand = deck.sample_hand()
        tags_list = [get_card_tags(card, deck) for card in hand]
        taghist = ut.dict_hist(ut.flatten(tags_list))
        for key, val in x.items():
            val.append(taghist.get(key, 0))

    print("Monte Stats:")
    for key, val in list(x.items()):
        print("%15s: %s" % (key, ut.repr2(ut.get_stats(val), precision=2)))

    def hand_stats():
        # [card.types for card in hand]
        # [card.rrr() for card in hand]
        [card.mana_source_stats(deck) for card in hand]
        card.types
Example #10
def tag_coocurrence(tags_list):
    import utool as ut
    co_occur_list = []
    for tags in tags_list:
        for combo in ut.combinations(tags, 2):
            key = tuple(sorted(combo))
            co_occur_list.append(key)
    co_occur = ut.dict_hist(co_occur_list, ordered=True)
    #        co_occur[key] += 1
    #co_occur = ut.odict(co_occur)
    return co_occur
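The co-occurrence histogram is just pair counting: every unordered tag pair within a record contributes one count. The same thing with itertools.combinations and collections.Counter (assuming ut.combinations and ut.dict_hist behave like these stdlib counterparts):

import itertools
from collections import Counter

def tag_cooccurrence(tags_list):
    pairs = (tuple(sorted(combo))
             for tags in tags_list
             for combo in itertools.combinations(tags, 2))
    return Counter(pairs)

tags_list = [['left', 'good'], ['left', 'good', 'cropped'], ['right']]
print(tag_cooccurrence(tags_list).most_common())
# [(('good', 'left'), 2), (('cropped', 'left'), 1), (('cropped', 'good'), 1)]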
Example #11
 def glossterms():
     re_glossterm = ut.named_field('glossterm', '.' + ut.REGEX_NONGREEDY)
     pat = r'\\glossterm{' + re_glossterm + '}'
     tup = ut.grep(pat, fpath_list=testdata_fpaths(), verbose=True)
     found_fpath_list, found_lines_list, found_lxs_list = tup
     glossterm_list = []
     for line in ut.flatten(found_lines_list):
         match = re.search(pat, line)
         glossterm = match.groupdict()['glossterm']
         glossterm_list.append(glossterm)
     print('Glossary Terms: ')
     print(ut.repr2(ut.dict_hist(glossterm_list), nl=True, strvals=True))
Example #12
    def find_needsmove_to_other(self, other):
        hash1 = self.get_prop('md5_stride')
        hash2 = other.get_prop('md5_stride')
        idxs1 = list(range(len(hash1)))

        hash_to_idxs = ut.group_items(idxs1, hash1)
        # Find what we have that other doesn't have and move it there
        other_missing = set(hash1).difference(hash2)
        missing_idxs1 = ut.flatten(ut.take(hash_to_idxs, other_missing))

        data = ut.ColumnLists({
            'idx': missing_idxs1,
            'fname': self.get_prop('fname', missing_idxs1),
            'dname': self.get_prop('dname', missing_idxs1),
            'full_path': self.get_prop('full_path', missing_idxs1),
            'nbytes': self.get_prop('nbytes', missing_idxs1),
        })
        data = data.compress([f != 'Thumbs.db' for f in data['fname']])
        data['ext'] = self.get_prop('ext', data['idx'])
        ut.dict_hist(data['ext'])
        data.print(ignore=['full_path', 'dname'])
Example #13
def _sync_filter_only_multiple_sightings(ibs, aid_list):
    r"""
    Returns:
        filtered_aids (list): the subset of aid_list such that every annot
        has a name and each name appears at least 2x.
    """
    name_list = ibs._sync_get_names(aid_list)
    name_hist = ut.dict_hist(name_list)
    aid_names = list(zip(aid_list, name_list))  # list() so it can be iterated twice below
    filtered_aids = [aid for (aid, name) in aid_names if name_hist[name] > 1]
    filtered_aid_names = [
        name for (aid, name) in aid_names if name_hist[name] > 1
    ]
    return filtered_aids, filtered_aid_names
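The at-least-two-sightings filter is a common histogram idiom: count the labels once, then keep the rows whose label count clears a threshold. A standalone sketch with collections.Counter, with the ibs accessors replaced by plain lists:

from collections import Counter

def filter_multiple_sightings(aid_list, name_list, min_count=2):
    name_hist = Counter(name_list)
    kept = [(aid, name) for aid, name in zip(aid_list, name_list)
            if name_hist[name] >= min_count]
    return [aid for aid, _ in kept], [name for _, name in kept]

aids = [1, 2, 3, 4, 5]
names = ['zeb_a', 'zeb_b', 'zeb_a', 'zeb_c', 'zeb_a']
print(filter_multiple_sightings(aids, names))   # ([1, 3, 5], ['zeb_a', 'zeb_a', 'zeb_a'])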
Example #14
def graph_info(graph, verbose=False):
    import utool as ut
    node_attrs = list(graph.node.values())
    edge_attrs = list(ut.take_column(graph.edges(data=True), 2))
    node_attr_hist = ut.dict_hist(ut.flatten([attr.keys() for attr in node_attrs]))
    edge_attr_hist = ut.dict_hist(ut.flatten([attr.keys() for attr in edge_attrs]))
    node_type_hist = ut.dict_hist(list(map(type, graph.nodes())))
    info_dict = ut.odict([
        ('directed', graph.is_directed()),
        ('multi', graph.is_multigraph()),
        ('num_nodes', len(graph)),
        ('num_edges', len(list(graph.edges()))),
        ('edge_attr_hist', ut.sort_dict(edge_attr_hist)),
        ('node_attr_hist', ut.sort_dict(node_attr_hist)),
        ('node_type_hist', ut.sort_dict(node_type_hist)),
        ('graph_attrs', graph.graph),
        ('graph_name', graph.name),
    ])
    #unique_attrs = ut.map_dict_vals(ut.unique, ut.dict_accum(*node_attrs))
    #ut.dict_isect_combine(*node_attrs))
    #[list(attrs.keys())]
    if verbose:
        print(ut.repr3(info_dict))
    return info_dict
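The attribute histograms in graph_info count how often each attribute key appears across nodes and edges. A small sketch of that pattern on a current networkx graph (graph.node in the example is the old NetworkX 1.x accessor; recent releases expose graph.nodes(data=True) instead):

import networkx as nx
from collections import Counter

g = nx.Graph(name='demo')
g.add_node(1, color='red')
g.add_node(2, color='blue', size=3)
g.add_edge(1, 2, weight=0.5)

node_attrs = [attrs for _, attrs in g.nodes(data=True)]
edge_attrs = [attrs for _, _, attrs in g.edges(data=True)]
node_attr_hist = Counter(key for attrs in node_attrs for key in attrs)
edge_attr_hist = Counter(key for attrs in edge_attrs for key in attrs)
print(dict(node_attr_hist))   # {'color': 2, 'size': 1}
print(dict(edge_attr_hist))   # {'weight': 1}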
Example #15
    def check_baseline_results(sim):
        import networkx as nx
        infr = sim.infr
        n_names_possible = 0
        real_groups = ut.group_pairs(infr.gen_node_attrs('orig_name_label'))
        possible_clusters = []
        for nid, nodes in real_groups.items():
            if len(nodes) == 1:
                possible_clusters.append(nodes)
                n_names_possible += 1
                continue
            cc_cand_edges = list(ut.nx_edges_between(infr.graph, nodes))
            cc = ut.nx_from_node_edge(nodes, cc_cand_edges)
            mst = nx.minimum_spanning_tree(cc)
            ccs = list(nx.connected_components(mst))
            possible_clusters.extend(ccs)
            n_names_possible += (len(ccs))

        sumafter = 3

        best_possible_compare_results = compare_groups(
            list(real_groups.values()), list(possible_clusters))
        possible_per_num = ut.map_vals(
            len,
            ut.group_items(best_possible_compare_results['common'],
                           map(len, best_possible_compare_results['common'])))
        greater = [i for i in possible_per_num.keys() if i > sumafter]
        possible_per_num['>%s' % sumafter] = sum(
            ut.take(possible_per_num, greater))
        ut.delete_keys(possible_per_num, greater)
        for k, v in possible_per_num.items():
            sim.results['possible@' + str(k)] = v
        sim.results['possible'] = len(best_possible_compare_results['common'])

        # Measure the number of real names in the test (per number of annots)
        real_per_num = ut.dict_hist(map(len, real_groups.values()))
        greater = [i for i in real_per_num.keys() if i > sumafter]
        real_per_num['>%s' % sumafter] = sum(ut.take(real_per_num, greater))
        ut.delete_keys(real_per_num, greater)
        for k, v in real_per_num.items():
            sim.results['real@' + str(k)] = v

        sim.results['n_names_possible'] = n_names_possible
        sim.results['n_names_real'] = len(real_groups)
        sim.results['real'] = len(real_groups)
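The real_per_num bookkeeping is a histogram of group sizes with the long tail collapsed into a single '>3' bucket. A stripped-down version of that aggregation (stdlib only, sumafter kept at 3 as in the example):

from collections import Counter

def size_hist_with_tail(groups, sumafter=3):
    per_num = Counter(len(g) for g in groups)
    greater = [k for k in per_num if k > sumafter]
    per_num['>%s' % sumafter] = sum(per_num[k] for k in greater)
    for k in greater:
        del per_num[k]
    return dict(per_num)

real_groups = [[1], [2, 3], [4, 5], [6, 7, 8, 9], [10, 11, 12, 13, 14]]
print(size_hist_with_tail(real_groups))   # {1: 1, 2: 2, '>3': 2}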
Example #16
def random_case_set():
    r"""
    Returns:
        tuple: (labels, pairwise_feats)

    CommandLine:
        python -m ibeis.algo.hots.testem random_case_set --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.algo.hots.testem import *  # NOQA
        >>> (labels, pairwise_feats) = random_case_set()
        >>> result = ('(labels, pairwise_feats) = %s' % (ut.repr2((labels, pairwise_feats)),))
        >>> print(result)
    """
    rng = np.random.RandomState(0)
    case_params = dict(num_names=5, rng=rng)
    num_annots = 600
    test_cases = [
        random_test_annot(**case_params)
        for _ in ut.ProgIter(range(num_annots), bs=1)
    ]
    pairxs = list(ut.product_nonsame(range(num_annots), range(num_annots)))
    import utool
    utool.embed()

    test_pairs = list(ut.unflat_take(test_cases, pairxs))
    cases1 = ut.instancelist(ut.take_column(test_pairs, 0), check=False)
    cases2 = ut.instancelist(ut.take_column(test_pairs, 1), check=False)
    # FIXME
    labels = labels1 = make_test_pairwise_labels2(cases1, cases2)  # NOQA

    #labels = np.array([make_test_pairwise_labels(case1, case2)
    #                   for case1, case2 in ut.ProgIter(test_pairs, bs=1)])
    pairwise_feats_ = [
        make_test_pairwise_fetaures(case1, case2, label, rng)
        for label, (case1,
                    case2) in ut.ProgIter(list(zip(labels, test_pairs)), bs=1)
    ]
    pairwise_feats = np.vstack(pairwise_feats_)
    print(ut.dict_hist(labels))
    return labels, pairwise_feats
Example #17
def random_case_set():
    r"""
    Returns:
        tuple: (labels, pairwise_feats)

    CommandLine:
        python -m ibeis.algo.hots.testem random_case_set --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.algo.hots.testem import *  # NOQA
        >>> (labels, pairwise_feats) = random_case_set()
        >>> result = ('(labels, pairwise_feats) = %s' % (ut.repr2((labels, pairwise_feats)),))
        >>> print(result)
    """
    rng = np.random.RandomState(0)
    case_params = dict(num_names=5, rng=rng)
    num_annots = 600
    test_cases = [random_test_annot(**case_params) for _ in ut.ProgIter(range(num_annots), bs=1)]
    pairxs = list(ut.product_nonsame(range(num_annots), range(num_annots)))
    import utool
    utool.embed()

    test_pairs = list(ut.unflat_take(test_cases, pairxs))
    cases1 = ut.make_instancelist(ut.take_column(test_pairs, 0), check=False)
    cases2 = ut.make_instancelist(ut.take_column(test_pairs, 1), check=False)
    # FIXME
    labels = labels1 = make_test_pairwise_labels2(cases1, cases2)  # NOQA

    #labels = np.array([make_test_pairwise_labels(case1, case2)
    #                   for case1, case2 in ut.ProgIter(test_pairs, bs=1)])
    pairwise_feats_ = [make_test_pairwise_fetaures(case1, case2, label, rng)
                       for label, (case1, case2) in ut.ProgIter(list(zip(labels, test_pairs)), bs=1)]
    pairwise_feats = np.vstack(pairwise_feats_)
    print(ut.dict_hist(labels))
    return labels, pairwise_feats
Example #18
 def ext_hist(self):
     return ut.dict_hist(self.attrs['ext'])
Example #19
def find_consistent_labeling(grouped_oldnames):
    """
    Solves a maximum bipartite matching problem to find a consistent
    name assignment.

    Notes:
        # Install module containing the Hungarian algorithm for matching
        pip install munkres

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> grouped_oldnames = [['a', 'b'], ['b', 'c'], ['c', 'a', 'a']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> print(new_names)
        [u'b', u'c', u'a']

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> grouped_oldnames = [['a', 'b', 'c'], ['b', 'c'], ['c', 'e', 'e']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> print(new_names)
        [u'a', u'b', u'e']

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> grouped_oldnames = [['a', 'b'], ['a', 'a', 'b'], ['a']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> print(new_names)
        [u'a', u'b', u'e']
    """
    import numpy as np
    try:
        import munkres
    except ImportError:
        print('Need to install the Hungarian algorithm bipartite matching solver.')
        print('Run:')
        print('pip install munkres')
        raise
    unique_old_names = ut.unique(ut.flatten(grouped_oldnames))
    num_new_names = len(grouped_oldnames)
    num_old_names = len(unique_old_names)
    extra_oldnames = []

    # Create padded dummy values.  This accounts for the case where it is
    # impossible to uniquely map to the old db
    num_extra = num_new_names - num_old_names
    if num_extra > 0:
        extra_oldnames = [
            '_extra_name%d' % (count, ) for count in range(num_extra)
        ]
    elif num_extra < 0:
        pass
    else:
        extra_oldnames = []
    assignable_names = unique_old_names + extra_oldnames

    total = len(assignable_names)

    # Allocate assignment matrix
    profit_matrix = np.zeros((total, total), dtype=np.int)
    # Populate assignment profit matrix
    oldname2_idx = ut.make_index_lookup(assignable_names)
    name_freq_list = [ut.dict_hist(names) for names in grouped_oldnames]
    for rowx, name_freq in enumerate(name_freq_list):
        for name, freq in name_freq.items():
            colx = oldname2_idx[name]
            profit_matrix[rowx, colx] += freq
    # Add extra profit for using a previously used name
    profit_matrix[profit_matrix > 0] += 2
    # Add small profit for using an extra name
    extra_colxs = ut.take(oldname2_idx, extra_oldnames)
    profit_matrix[:, extra_colxs] += 1

    # Convert to minimization problem
    big_value = (profit_matrix.max())
    cost_matrix = big_value - profit_matrix
    m = munkres.Munkres()
    indexes = m.compute(cost_matrix)

    # Map output to be aligned with input
    rx2_cx = dict(indexes)
    assignment = [assignable_names[rx2_cx[rx]] for rx in range(num_new_names)]
    return assignment
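The heart of the matching step is a square profit matrix fed to an assignment solver. A compact sketch of the same construction on the first doctest input, using scipy.optimize.linear_sum_assignment (the SciPy solver that find_consistent_labeling_old and simple_munkres further down also use) instead of the munkres package:

import numpy as np
from collections import Counter
from scipy.optimize import linear_sum_assignment

grouped_oldnames = [['a', 'b'], ['b', 'c'], ['c', 'a', 'a']]
names = sorted(set(n for group in grouped_oldnames for n in group))
col_of = {name: j for j, name in enumerate(names)}

total = max(len(grouped_oldnames), len(names))
profit = np.zeros((total, total), dtype=int)
for row, group in enumerate(grouped_oldnames):
    for name, freq in Counter(group).items():
        profit[row, col_of[name]] += freq
profit[profit > 0] += 2                      # extra profit for reusing an old name

# linear_sum_assignment minimizes cost, so flip the profit matrix
rows, cols = linear_sum_assignment(profit.max() - profit)
assignment = [names[c] for c in cols]        # row indices come back as 0..total-1 in order
print(assignment)                            # ['b', 'c', 'a']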
Example #20
def ingest_serengeti_mamal_cameratrap(species):
    """
    Downloads data from Serengeti dryad server

    References:
        http://datadryad.org/resource/doi:10.5061/dryad.5pt92
        Swanson AB, Kosmala M, Lintott CJ, Simpson RJ, Smith A, Packer C (2015)
        Snapshot Serengeti, high-frequency annotated camera trap images of 40
        mammalian species in an African savanna. Scientific Data 2: 150026.
        http://dx.doi.org/10.1038/sdata.2015.26
        Swanson AB, Kosmala M, Lintott CJ, Simpson RJ, Smith A, Packer C (2015)
        Data from: Snapshot Serengeti, high-frequency annotated camera trap
        images of 40 mammalian species in an African savanna. Dryad Digital
        Repository. http://dx.doi.org/10.5061/dryad.5pt92

    Args:
        species (?):

    CommandLine:
        python -m ibeis.dbio.ingest_database --test-ingest_serengeti_mamal_cameratrap --species zebra_plains
        python -m ibeis.dbio.ingest_database --test-ingest_serengeti_mamal_cameratrap --species cheetah

    Example:
        >>> # SCRIPT
        >>> from ibeis.dbio.ingest_database import *  # NOQA
        >>> import ibeis
        >>> species = ut.get_argval('--species', type_=str, default=ibeis.const.TEST_SPECIES.ZEB_PLAIN)
        >>> # species = ut.get_argval('--species', type_=str, default='cheetah')
        >>> result = ingest_serengeti_mamal_cameratrap(species)
        >>> print(result)
    """
    'https://snapshotserengeti.s3.msi.umn.edu/'
    import ibeis

    if species is None:
        code = 'ALL'
    elif species == 'zebra_plains':
        code = 'PZ'
    elif species == 'cheetah':
        code = 'CHTH'
    else:
        raise NotImplementedError()

    if species == 'zebra_plains':
        serengeti_sepcies = 'zebra'
    else:
        serengeti_sepcies = species

    print('species = %r' % (species,))
    print('serengeti_sepcies = %r' % (serengeti_sepcies,))

    dbname = code + '_Serengeti'
    print('dbname = %r' % (dbname,))
    dbdir = ut.ensuredir(join(ibeis.sysres.get_workdir(), dbname))
    print('dbdir = %r' % (dbdir,))
    image_dir = ut.ensuredir(join(dbdir, 'images'))

    base_url = 'http://datadryad.org/bitstream/handle/10255'
    all_images_url         = base_url + '/dryad.86392/all_images.csv'
    consensus_metadata_url = base_url + '/dryad.86348/consensus_data.csv'
    search_effort_url      = base_url + '/dryad.86347/search_effort.csv'
    gold_standard_url      = base_url + '/dryad.76010/gold_standard_data.csv'

    all_images_fpath         = ut.grab_file_url(all_images_url, download_dir=dbdir)
    consensus_metadata_fpath = ut.grab_file_url(consensus_metadata_url, download_dir=dbdir)
    search_effort_fpath      = ut.grab_file_url(search_effort_url, download_dir=dbdir)
    gold_standard_fpath      = ut.grab_file_url(gold_standard_url, download_dir=dbdir)

    print('all_images_fpath         = %r' % (all_images_fpath,))
    print('consensus_metadata_fpath = %r' % (consensus_metadata_fpath,))
    print('search_effort_fpath      = %r' % (search_effort_fpath,))
    print('gold_standard_fpath      = %r' % (gold_standard_fpath,))

    def read_csv(csv_fpath):
        import utool as ut
        csv_text = ut.read_from(csv_fpath)
        csv_lines = csv_text.split('\n')
        print(ut.list_str(csv_lines[0:2]))
        csv_data = [[field.strip('"').strip('\r') for field in line.split(',')]
                    for line in csv_lines if len(line) > 0]
        csv_header = csv_data[0]
        csv_data = csv_data[1:]
        return csv_data, csv_header

    def download_image_urls(image_url_info_list):
        # Find ones that we already have
        print('Requested %d downloaded images' % (len(image_url_info_list)))
        full_gpath_list = [join(image_dir, basename(gpath)) for gpath in image_url_info_list]
        exists_list = [ut.checkpath(gpath) for gpath in full_gpath_list]
        image_url_info_list_ = ut.compress(image_url_info_list, ut.not_list(exists_list))
        print('Already have %d/%d downloaded images' % (
            len(image_url_info_list) - len(image_url_info_list_), len(image_url_info_list)))
        print('Need to download %d images' % (len(image_url_info_list_)))
        #import sys
        #sys.exit(0)
        # Download the rest
        imgurl_prefix = 'https://snapshotserengeti.s3.msi.umn.edu/'
        image_url_list = [imgurl_prefix + suffix for suffix in image_url_info_list_]
        for img_url in ut.ProgressIter(image_url_list, lbl='Downloading image'):
            ut.grab_file_url(img_url, download_dir=image_dir)
        return full_gpath_list

    # Data contains information about which events have which animals
    if False:
        species_class_csv_data, species_class_header = read_csv(gold_standard_fpath)
        species_class_eventid_list    = ut.get_list_column(species_class_csv_data, 0)
        #gold_num_species_annots_list = ut.get_list_column(gold_standard_csv_data, 2)
        species_class_species_list    = ut.get_list_column(species_class_csv_data, 2)
        #gold_count_list              = ut.get_list_column(gold_standard_csv_data, 3)
    else:
        species_class_csv_data, species_class_header = read_csv(consensus_metadata_fpath)
        species_class_eventid_list    = ut.get_list_column(species_class_csv_data, 0)
        species_class_species_list    = ut.get_list_column(species_class_csv_data, 7)

    # Find the zebra events
    serengeti_sepcies_set = sorted(list(set(species_class_species_list)))
    print('serengeti_sepcies_hist = %s' %
          ut.dict_str(ut.dict_hist(species_class_species_list), key_order_metric='val'))
    #print('serengeti_sepcies_set = %s' % (ut.list_str(serengeti_sepcies_set),))

    assert serengeti_sepcies in serengeti_sepcies_set, 'not a known Serengeti species'
    species_class_chosen_idx_list = ut.list_where(
        [serengeti_sepcies == species_ for species_ in species_class_species_list])
    chosen_eventid_list = ut.take(species_class_eventid_list, species_class_chosen_idx_list)

    print('Number of chosen species:')
    print(' * len(species_class_chosen_idx_list) = %r' % (len(species_class_chosen_idx_list),))
    print(' * len(chosen_eventid_list) = %r' % (len(chosen_eventid_list),))

    # Read info about which events have which images
    images_csv_data, image_csv_header = read_csv(all_images_fpath)
    capture_event_id_list = ut.get_list_column(images_csv_data, 0)
    image_url_info_list = ut.get_list_column(images_csv_data, 1)
    # Group photos by eventid
    eventid_to_photos = ut.group_items(image_url_info_list, capture_event_id_list)

    # Filter to only chosens
    unflat_chosen_url_infos = ut.dict_take(eventid_to_photos, chosen_eventid_list)
    chosen_url_infos = ut.flatten(unflat_chosen_url_infos)
    image_url_info_list = chosen_url_infos
    chosen_path_list = download_image_urls(chosen_url_infos)

    ibs = ibeis.opendb(dbdir=dbdir, allow_newdir=True)
    gid_list_ = ibs.add_images(chosen_path_list, auto_localize=False)  # NOQA

    # Attempt to automatically detect the annotations
    #aids_list = ibs.detect_random_forest(gid_list_, species)
    #aids_list

    #if False:
    #    # remove non-zebra photos
    #    from os.path import basename
    #    base_gname_list = list(map(basename, zebra_url_infos))
    #    all_gname_list = ut.list_images(image_dir)
    #    nonzebra_gname_list = ut.setdiff_ordered(all_gname_list, base_gname_list)
    #    nonzebra_gpath_list = ut.fnames_to_fpaths(nonzebra_gname_list, image_dir)
    #    ut.remove_fpaths(nonzebra_gpath_list)
    return ibs
Example #21
    print('\n --- TYPE = %r' % (e.upper(), ))
    g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
    missing_cols = g.columns[np.any(pd.isnull(g), axis=0)]
    if e in ignore:
        missing_cols = missing_cols.difference(ignore[e])
    print('missing_cols = {!r}'.format(missing_cols.tolist()))
    for col in missing_cols:
        print('col = {!r}'.format(col))
        print(g[pd.isnull(g[col])].index.tolist())

for e, g in entrytypes.items():
    print('e = %r' % (e, ))
    g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
    if 'pub_full' in g.columns:
        place_title = g['pub_full'].tolist()
        print(ut.repr4(ut.dict_hist(place_title)))
    else:
        print(g)
        print('Unknown publications')

if 'report' in entrytypes:
    g = entrytypes['report']
    missing = g[pd.isnull(g['title'])]
    if len(missing):
        print('Missing Title')
        print(ut.repr4(missing[['title', 'author']].values.tolist()))

if 'journal' in entrytypes:
    g = entrytypes['journal']
    g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
Example #22
def find_consistent_labeling_old(grouped_oldnames,
                                 extra_prefix='_extra_name',
                                 verbose=False):
    import numpy as np
    import scipy.optimize

    unique_old_names = ut.unique(ut.flatten(grouped_oldnames))

    # TODO: find names that are only used once, and just ignore those for
    # optimization.
    # unique_set = set(unique_old_names)
    oldname_sets = list(map(set, grouped_oldnames))
    usage_hist = ut.dict_hist(ut.flatten(oldname_sets))
    conflicts = {k for k, v in usage_hist.items() if v > 1}
    # nonconflicts = {k for k, v in usage_hist.items() if v == 1}

    conflict_groups = []
    orig_idxs = []
    assignment = [None] * len(grouped_oldnames)
    ntrivial = 0
    for idx, group in enumerate(grouped_oldnames):
        if set(group).intersection(conflicts):
            orig_idxs.append(idx)
            conflict_groups.append(group)
        else:
            ntrivial += 1
            if len(group) > 0:
                h = ut.dict_hist(group)
                hitems = list(h.items())
                hvals = [i[1] for i in hitems]
                maxval = max(hvals)
                g = min([k for k, v in hitems if v == maxval])
                assignment[idx] = g
            else:
                assignment[idx] = None

    if verbose:
        print('rectify %d non-trivial groups' % (len(conflict_groups), ))
        print('rectify %d trivial groups' % (ntrivial, ))

    num_extra = 0

    if len(conflict_groups) > 0:
        grouped_oldnames_ = conflict_groups
        unique_old_names = ut.unique(ut.flatten(grouped_oldnames_))
        num_new_names = len(grouped_oldnames_)
        num_old_names = len(unique_old_names)
        extra_oldnames = []

        # Create padded dummy values.  This accounts for the case where it is
        # impossible to uniquely map to the old db
        num_extra = num_new_names - num_old_names
        if num_extra > 0:
            extra_oldnames = [
                '%s%d' % (
                    extra_prefix,
                    count,
                ) for count in range(num_extra)
            ]
        elif num_extra < 0:
            pass
        else:
            extra_oldnames = []
        assignable_names = unique_old_names + extra_oldnames

        total = len(assignable_names)

        # Allocate assignment matrix
        # Start with a large negative value indicating
        # that you must select from your assignments only
        profit_matrix = -np.ones((total, total), dtype=np.int) * (2 * total)
        # Populate assignment profit matrix
        oldname2_idx = ut.make_index_lookup(assignable_names)
        name_freq_list = [ut.dict_hist(names) for names in grouped_oldnames_]
        # Initialize base profit for using a previously used name
        for rowx, name_freq in enumerate(name_freq_list):
            for name, freq in name_freq.items():
                colx = oldname2_idx[name]
                profit_matrix[rowx, colx] = 1
        # Now add in the real profit
        for rowx, name_freq in enumerate(name_freq_list):
            for name, freq in name_freq.items():
                colx = oldname2_idx[name]
                profit_matrix[rowx, colx] += freq
        # Set a small profit for using an extra name
        extra_colxs = ut.take(oldname2_idx, extra_oldnames)
        profit_matrix[:, extra_colxs] = 1

        # Convert to minimization problem
        big_value = (profit_matrix.max()) - (profit_matrix.min())
        cost_matrix = big_value - profit_matrix

        # Don't use munkres, it is pure python and very slow. Use scipy instead
        indexes = list(zip(*scipy.optimize.linear_sum_assignment(cost_matrix)))

        # Map output to be aligned with input
        rx2_cx = dict(indexes)
        assignment_ = [
            assignable_names[rx2_cx[rx]] for rx in range(num_new_names)
        ]

        # Reintegrate trivial values
        for idx, g in zip(orig_idxs, assignment_):
            assignment[idx] = g

    for idx, val in enumerate(assignment):
        if val is None:
            assignment[idx] = '%s%d' % (
                extra_prefix,
                num_extra,
            )
            num_extra += 1
    return assignment
Example #23
def simple_munkres(part_oldnames):
    """
    Defines a munkres problem to solve name rectification.

    Notes:
        We create a matrix where each row represents a group of annotations in
        the same PCC and each column represents an original name. If there are
        more PCCs than original names the columns are padded with extra values.
        The matrix is first initialized to negative infinity, representing
        impossible assignments. Then, for each column representing a padded
        name, we set its value to $1$, indicating that each new name could be
        assigned to a padded name for some small profit. Finally, let $f_{rc}$
        be the number of annotations in row $r$ with an original name of
        $c$. Each matrix value $(r, c)$ is set to $f_{rc} + 1$ if $f_{rc} > 0$,
        to represent how much each name ``wants'' to be labeled with a
        particular original name; the extra one ensures that these original
        names are always preferred over padded names.

    CommandLine:
        python -m ibeis.scripts.name_recitifer simple_munkres

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> part_oldnames = [['a', 'b'], ['b', 'c'], ['c', 'a', 'a']]
        >>> new_names = simple_munkres(part_oldnames)
        >>> result = ut.repr2(new_names)
        >>> print(new_names)
        ['b', 'c', 'a']

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> part_oldnames = [[], ['a', 'a'], [],
        >>>                  ['a', 'a', 'a', 'a', 'a', 'a', 'a', 'b'], ['a']]
        >>> new_names = simple_munkres(part_oldnames)
        >>> result = ut.repr2(new_names)
        >>> print(new_names)
        [None, 'a', None, 'b', None]

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> part_oldnames = [[], ['b'], ['a', 'b', 'c'], ['b', 'c'], ['c', 'e', 'e']]
        >>> new_names = find_consistent_labeling(part_oldnames)
        >>> result = ut.repr2(new_names)
        >>> print(new_names)
        ['_extra_name0', 'b', 'a', 'c', 'e']

        Profit Matrix
            b   a   c   e  _0
        0 -10 -10 -10 -10   1
        1   2 -10 -10 -10   1
        2   2   2   2 -10   1
        3   2 -10   2 -10   1
        4 -10 -10   2   3   1
    """
    import numpy as np
    import scipy.optimize
    unique_old_names = ut.unique(ut.flatten(part_oldnames))
    num_new_names = len(part_oldnames)
    num_old_names = len(unique_old_names)

    # Create padded dummy values.  This accounts for the case where it is
    # impossible to uniquely map to the old db
    num_pad = max(num_new_names - num_old_names, 0)
    total = num_old_names + num_pad
    shape = (total, total)

    # Allocate assignment matrix.
    # rows are new-names and cols are old-names.
    # Initially the profit of any assignment is effectively -inf
    # This effectively marks all assignments as invalid
    profit_matrix = np.full(shape, -2 * total, dtype=np.int)
    # Overwrite valid assignments with positive profits
    oldname2_idx = ut.make_index_lookup(unique_old_names)
    name_freq_list = [ut.dict_hist(names) for names in part_oldnames]
    # Initialize profit of a valid assignment as 1 + freq
    # This incentivizes using a previously used name
    for rowx, name_freq in enumerate(name_freq_list):
        for name, freq in name_freq.items():
            colx = oldname2_idx[name]
            profit_matrix[rowx, colx] = freq + 1
    # Set a much smaller profit for using an extra name
    # This allows the solution to always exist
    profit_matrix[:, num_old_names:total] = 1

    # Convert to minimization problem
    big_value = (profit_matrix.max()) - (profit_matrix.min())
    cost_matrix = big_value - profit_matrix

    # Use scipy implementation of munkres algorithm.
    rx2_cx = dict(zip(*scipy.optimize.linear_sum_assignment(cost_matrix)))

    # Each row (new-name) has now been assigned a column (old-name)
    # Map this back to the input-space (using None to indicate extras)
    cx2_name = dict(enumerate(unique_old_names))

    if False:
        import pandas as pd
        columns = unique_old_names + ['_%r' % x for x in range(num_pad)]
        print('Profit Matrix')
        print(pd.DataFrame(profit_matrix, columns=columns))

        print('Cost Matrix')
        print(pd.DataFrame(cost_matrix, columns=columns))

    assignment_ = [
        cx2_name.get(rx2_cx[rx], None) for rx in range(num_new_names)
    ]
    return assignment_
Example #24
def download_sharks(XMLdata, number):
    """
    cd ~/work/WS_ALL
    python -m ibeis.scripts.getshark

    >>> from ibeis.scripts.getshark import *  # NOQA
    >>> url = 'www.whaleshark.org/listImages.jsp'
    >>> XMLdata = ut.url_read(url)
    >>> number = None
    """
    # Prepare the output directory for writing, if it doesn't exist
    output_dir = 'sharkimages'
    ut.ensuredir(output_dir)

    dom = parseString(XMLdata)

    # Download files
    if number:
        maxCount = min(number, len(dom.getElementsByTagName('img')))
    else:
        maxCount = len(dom.getElementsByTagName('img'))

    parsed_info = dict(
        img_url_list=[],
        localid_list=[],
        nameid_list=[],
        orig_fname_list=[],
        new_fname_list=[],
    )

    print('Preparing to fetch %i files...' % maxCount)

    for shark in dom.getElementsByTagName('shark'):
        localCount = 0
        for imageset in shark.getElementsByTagName('imageset'):
            for img in imageset.getElementsByTagName('img'):
                localCount += 1

                img_url = img.getAttribute('href')
                orig_fname = split(img_url)[1]
                ext = splitext(orig_fname)[1].lower()
                nameid = shark.getAttribute('number')

                new_fname = '%s-%i%s' % (
                    nameid, localCount, ext)

                parsed_info['img_url_list'].append(img_url)
                parsed_info['nameid_list'].append(nameid)
                parsed_info['localid_list'].append(localCount)
                parsed_info['orig_fname_list'].append(orig_fname)
                parsed_info['new_fname_list'].append(new_fname)

                print('Parsed %i / %i files.' % (len(parsed_info['orig_fname_list']), maxCount))

                if number is not None and len(parsed_info['orig_fname_list']) == number:
                    break
    parsed_info['new_fpath_list'] = [join(output_dir, _fname)
                                     for _fname in parsed_info['new_fname_list']]

    print('Filtering parsed images')

    # Filter based on image type (keep only jpgs)
    ext_flags = [_fname.endswith('.jpg') or _fname.endswith('.jpeg')
                  for _fname in parsed_info['new_fname_list']]
    parsed_info = {key: ut.compress(list_, ext_flags) for key, list_ in parsed_info.items()}

    # Filter to only images matching the appropriate tags
    from ibeis import tag_funcs
    parsed_info['tags_list'] = parse_shark_tags(parsed_info['orig_fname_list'])
    tag_flags = tag_funcs.filterflags_general_tags(
        parsed_info['tags_list'],
        has_any=['view-left'],
        none_match=['qual.*', 'view-top', 'part-.*', 'cropped'],
    )
    parsed_info = {key: ut.compress(list_, tag_flags) for key, list_ in parsed_info.items()}
    print('Tags in chosen images:')
    print(ut.dict_hist(ut.flatten(parsed_info['tags_list'] )))

    # Download selected subset
    print('Downloading selected subset')
    _iter = list(zip(parsed_info['img_url_list'],
                     parsed_info['new_fpath_list']))
    _iter = ut.ProgressIter(_iter, lbl='downloading sharks')
    for img_url, new_fpath in _iter:
        if not exists(new_fpath):
            ut.download_url(img_url, new_fpath)

    # Remove corrupted or ill-formatted images
    print('Checking for corrupted images')
    import vtool as vt
    noncorrupt_flags = vt.filterflags_valid_images(parsed_info['new_fpath_list'])
    parsed_info = {
        key: ut.compress(list_, noncorrupt_flags)
        for key, list_ in parsed_info.items()
    }

    print('Removing small images')
    import numpy as np
    imgsize_list = np.array([vt.open_image_size(gpath) for gpath in parsed_info['new_fpath_list']])
    sqrt_area_list = np.sqrt(np.prod(imgsize_list, axis=1))
    areq_flags_list = sqrt_area_list >= 750
    parsed_info = {key: ut.compress(list_, areq_flags_list)
                   for key, list_ in parsed_info.items()}

    grouped_idxs = ut.group_items(list(range(len(parsed_info['nameid_list']))),
                                  parsed_info['nameid_list'])
    keep_idxs = sorted(ut.flatten([idxs for key, idxs in grouped_idxs.items() if len(idxs) >= 2]))
    parsed_info = {key: ut.take(list_, keep_idxs) for key, list_ in parsed_info.items()}

    print('Moving images to secondary directory')
    named_outputdir = 'named-left-sharkimages'
    # Build names
    parsed_info['namedir_fpath_list'] = [
        join(named_outputdir, _nameid, _fname)
        for _fname, _nameid in zip(parsed_info['new_fname_list'],
                                   parsed_info['nameid_list'])]
    # Create directories
    ut.ensuredir(named_outputdir)
    named_dirs = ut.unique_ordered(list(map(dirname, parsed_info['namedir_fpath_list'])))
    for dir_ in named_dirs:
        ut.ensuredir(dir_)
    # Copy
    ut.copy_files_to(src_fpath_list=parsed_info['new_fpath_list'],
                     dst_fpath_list=parsed_info['namedir_fpath_list'])
Example #25
def get_toy_data_1vM(num_annots, num_names=None, **kwargs):
    r"""
    Args:
        num_annots (int):
        num_names (int): (default = None)

    Kwargs:
        initial_aids, initial_nids, nid_sequence, seed

    Returns:
        tuple: (pair_list, feat_list)

    CommandLine:
        python -m ibeis.algo.hots.demobayes --exec-get_toy_data_1vM --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.algo.hots.demobayes import *  # NOQA
        >>> num_annots = 1000
        >>> num_names = 40
        >>> get_toy_data_1vM(num_annots, num_names)
        >>> ut.quit_if_noshow()
        >>> import plottool as pt
        >>> ut.show_if_requested()
    """
    import vtool as vt
    tup_ = get_toy_annots(num_annots, num_names, **kwargs)
    aids, nids, aids1, nids1, all_aids, all_nids = tup_
    rng = vt.ensure_rng(None)

    # Test a simple SVM classifier
    nid2_nexemp = ut.dict_hist(nids1)
    aid2_nid = dict(zip(aids, nids))

    ut.fix_embed_globals()

    #def add_to_globals(globals_, subdict):
    #    globals_.update(subdict)

    unique_nids = list(nid2_nexemp.keys())

    def annot_to_class_feats2(aid, aid2_nid, top=None):
        pair_list = []
        score_list = []
        nexemplar_list = []
        for nid in unique_nids:
            label = (aid2_nid[aid] == nid)
            num_exemplars = nid2_nexemp.get(nid, 0)
            if num_exemplars == 0:
                continue
            params = toy_params[label]
            mu, sigma = ut.dict_take(params, ['mu', 'sigma'])
            score_ = rng.normal(mu, sigma, size=num_exemplars).max()
            score = np.clip(score_, 0, np.inf)
            pair_list.append((aid, nid))
            score_list.append(score)
            nexemplar_list.append(num_exemplars)
        rank_list = ut.argsort(score_list, reverse=True)
        feat_list = np.array([score_list, rank_list, nexemplar_list]).T
        sortx = np.argsort(rank_list)
        feat_list = feat_list.take(sortx, axis=0)
        pair_list = np.array(pair_list).take(sortx, axis=0)
        if top is not None:
            feat_list = feat_list[:top]
            pair_list = pair_list[0:top]
        return pair_list, feat_list

    toclass_features = [
        annot_to_class_feats2(aid, aid2_nid, top=5) for aid in aids
    ]
    aidnid_pairs = np.vstack(ut.get_list_column(toclass_features, 0))
    feat_list = np.vstack(ut.get_list_column(toclass_features, 1))
    score_list = feat_list.T[0:1].T
    lbl_list = [aid2_nid[aid] == nid for aid, nid in aidnid_pairs]

    from sklearn import svm
    #clf1 = svm.LinearSVC()
    print('Learning classifiers')

    clf3 = svm.SVC(probability=True)
    clf3.fit(feat_list, lbl_list)
    #prob_true, prob_false = clf3.predict_proba(feat_list).T

    clf1 = svm.LinearSVC()
    clf1.fit(score_list, lbl_list)

    # Score new annots against the training database
    tup_ = get_toy_annots(num_annots * 2,
                          num_names,
                          initial_aids=all_aids,
                          initial_nids=all_nids)
    aids, nids, aids1, nids1, all_aids, all_nids = tup_
    aid2_nid = dict(zip(aids, nids))
    toclass_features = [annot_to_class_feats2(aid, aid2_nid) for aid in aids]
    aidnid_pairs = np.vstack(ut.get_list_column(toclass_features, 0))
    feat_list = np.vstack(ut.get_list_column(toclass_features, 1))
    lbl_list = np.array([aid2_nid[aid] == nid for aid, nid in aidnid_pairs])

    print('Running tests')

    score_list = feat_list.T[0:1].T

    tp_feat_list = feat_list[lbl_list]
    tn_feat_list = feat_list[~lbl_list]
    tp_lbls = lbl_list[lbl_list]
    tn_lbls = lbl_list[~lbl_list]
    print('num tp: %d' % len(tp_lbls))
    print('num tn: %d' % len(tn_lbls))

    tp_score_list = score_list[lbl_list]
    tn_score_list = score_list[~lbl_list]

    print('tp_feat' +
          ut.repr3(ut.get_stats(tp_feat_list, axis=0), precision=2))
    print('tn_feat' +
          ut.repr3(ut.get_stats(tn_feat_list, axis=0), precision=2))

    print('tp_score' + ut.repr2(ut.get_stats(tp_score_list), precision=2))
    print('tn_score' + ut.repr2(ut.get_stats(tn_score_list), precision=2))

    tp_pred3 = clf3.predict(tp_feat_list)
    tn_pred3 = clf3.predict(tn_feat_list)
    print((tp_pred3.sum(), tp_pred3.shape))
    print((tn_pred3.sum(), tn_pred3.shape))

    tp_score3 = clf3.score(tp_feat_list, tp_lbls)
    tn_score3 = clf3.score(tn_feat_list, tn_lbls)

    tp_pred1 = clf1.predict(tp_score_list)
    tn_pred1 = clf1.predict(tn_score_list)
    print((tp_pred1.sum(), tp_pred1.shape))
    print((tn_pred1.sum(), tn_pred1.shape))

    tp_score1 = clf1.score(tp_score_list, tp_lbls)
    tn_score1 = clf1.score(tn_score_list, tn_lbls)
    print('tp score with rank    = %r' % (tp_score3, ))
    print('tn score with rank    = %r' % (tn_score3, ))

    print('tp score without rank = %r' % (tp_score1, ))
    print('tn score without rank = %r' % (tn_score1, ))
    toy_data = {}

    return toy_data
Example #26
def check_doublewords():
    """
    ./texfix.py --fpaths chapter4-application.tex --check-doublewords
    ./texfix.py --check-doublewords
    ./texfix.py --fpaths main.tex --outline --asmarkdown --numlines=999 -w --ignoreinputstartswith=def,Crall,header,colordef,figdef
    text = ut.readfrom('outline_main.md')
    >>> from texfix import *  # NOQA
    """
    # TODO: Do this on a per section basis to remove math considerations automagically
    root = testdata_main(
        ignoreinputstartswith=['def', 'Crall', 'header', 'colordef', 'figdef'])
    #root = latex_parser.LatexDocPart.parse_fpath('chapter4-application.tex')
    root._config['asmarkdown'] = True
    root._config['numlines'] = 999
    #text = root.summary_str(outline=True)

    #document = root.find_descendant_type('document')

    import re
    #text = ut.readfrom('outline_main.md')
    #lines = text.split('\n')
    found_duplicates = []
    found_lines = []
    found_linenos = []

    def check_palendrome(sequence_norm):
        half1 = sequence_norm[0:len(sequence_norm) // 2]
        half2 = sequence_norm[len(sequence_norm) // 2:]
        return all([a == b for a, b in zip(half1, half2)])

    #for num, line in enumerate(lines):
    num = 0
    for x, node in enumerate(
            root.iter_nodes(invalid_types=['comment', 'equation'])):
        block = node.summary_str(outline=True, highlight=False, depth=1)
        for line in block.split('\n'):
            num += 1
            line_ = re.sub('\\$.*?\\$', 'mathpart' + str(num) + 'math', line)
            words = line_.split(' ')
            #if len(words) > 10:
            #    break
            for size in [2, 4, 6, 8, 10]:
                for sequence in ut.iter_window(words, size=size):
                    sequence_norm = [
                        re.sub('[^a-zA-Z0-9]', '', s.lower()) for s in sequence
                    ]
                    if sequence_norm[0] == '' or 'mathpart' in sequence_norm[0]:
                        continue
                    #if ut.allsame(sequence_norm):
                    if check_palendrome(sequence_norm):
                        print('sequence_norm = %r' % (sequence_norm, ))
                        print(('Potential repeat of %r ' % (sequence_norm, )) +
                              node.parsed_location_span())
                        found_duplicates.append(sequence_norm)
                        found_lines.append(line_)
                        found_linenos.append(num)

    print('found_linenos = ' + '\n'.join(ut.lmap(str, found_linenos)))
    print('found_lines = ' + '\n'.join(found_lines))
    print('found_duplicates = ' + ut.repr3(found_duplicates, nl=1))

    constants_tex_fixes.CAPITAL_LIST

    proper_words = [
        'Identification', 'Park.', 'Discussion', 'Hamming', 'Grevy', 'Affine',
        'Equation', 'Sweetwaters', 'National', 'Nairobi', 'The', 'Hessian',
        'Fisher', 'Gaussian', 'Section', "Grevy's", 'Masai', 'Figure', 'Jason',
        'March', 'Parham', 'Euclidean', 'Bayes', 'Chapter', 'Subsection',
        'Lowe', 'Luigi', 'Dryad', 'Jablons', 'Wildbook', 'Apache', 'Hadoop',
        'Zack', 'Lincoln', 'Peterson', 'Alessandro', 'Oddone', 'Earth',
        'Darwin', 'Markov', 'Bayesian', 'Table', 'Boxer', 'Beagle', 'Platt',
        'K'
    ]

    flagged_words = []

    #for num, line in enumerate(lines):
    for x, node in enumerate(
            root.iter_nodes(invalid_types=['comment', 'equation'])):
        block = node.summary_str(outline=True, highlight=False, depth=1)
        for line in block.split('\n'):
            #print('node.type_ = %r' % (node.type_,))
            #print('line = %r' % (line[0:20],))
            #if x > 30:
            #    break
            #if node.type_ in ['equation', 'comment']:
            #    continue
            line_ = re.sub('\\$.*?\\$', 'mathpart' + str(num) + 'math', line)
            line_ = re.sub('[0-9]+', '', line_)
            line_ = re.sub('\'s\\b', '', line_)
            line_ = re.sub('\\\\[A-Za-z]+\\b', '', line_)
            line_ = line_.replace('#', '')
            line_ = line_.replace('\\', '')
            line_ = line_.replace('(', '')
            line_ = line_.replace('Nairobi National Park', '')
            line_ = line_.replace('Plains zebras', '')
            line_ = line_.replace('Ol Pejeta', '')
            line_ = line_.replace('Darwin Core', '')
            line_ = line_.replace(')', '')
            line_ = line_.replace('*', '')
            line_ = line_.lstrip(' ')
            words = line_.split(' ')
            flag = False
            for w in words[1:]:
                matches = re.findall('[A-Z]', w)
                if w in proper_words:
                    continue
                if len(matches) == 1:
                    #print(w)
                    print(('Bad caps word %r ' % (w, )) + node.fpath_root() +
                          ' at line ' + str(node.line_num))
                    flagged_words.append(w)
                    flag = True
            if flag:
                pass
                print(line_)

    print('Found caps problems')
    hist = ut.dict_hist(flagged_words, ordered=True)
    print(ut.repr3(hist, nl=1))
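
A minimal, dependency-free sketch of the repeated-phrase check used above: slide a window over the words of a line and flag it when the first half of the window equals the second half. The helper names and the sample sentence are invented for illustration and stand in for ut.iter_window and the LaTeX parsing.

import re

def iter_window(words, size):
    # Yield consecutive windows of `size` words (stand-in for ut.iter_window).
    for i in range(len(words) - size + 1):
        yield words[i:i + size]

def is_repeated_half(seq):
    # True when the first half of the window equals the second half,
    # i.e. the phrase was typed twice in a row.
    half = len(seq) // 2
    return seq[:half] == seq[half:]

line = 'the quick brown fox the quick brown fox jumps'
words = [re.sub('[^a-zA-Z0-9]', '', w.lower()) for w in line.split(' ')]
for size in (2, 4, 6, 8):
    for window in iter_window(words, size):
        if window[0] and is_repeated_half(window):
            print('potential repeat: %r' % (window,))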
Example #27
0
def find_consistent_labeling(grouped_oldnames):
    """
    Solves a maximum bipartite matching problem to find a consistent
    name assignment.

    Notes:
        # Install module containing the Hungarian algorithm for matching
        pip install munkres

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> grouped_oldnames = [['a', 'b'], ['b', 'c'], ['c', 'a', 'a']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> print(new_names)
        [u'b', u'c', u'a']

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> grouped_oldnames = [['a', 'b', 'c'], ['b', 'c'], ['c', 'e', 'e']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> print(new_names)
        [u'a', u'b', u'e']

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> grouped_oldnames = [['a', 'b'], ['a', 'a', 'b'], ['a']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> print(new_names)
        [u'b', u'a', u'_extra_name0']
    """
    import numpy as np
    try:
        import munkres
    except ImportError:
        print('Need to install the Hungarian algorithm bipartite matching solver.')
        print('Run:')
        print('pip install munkres')
        raise
    unique_old_names = ut.unique(ut.flatten(grouped_oldnames))
    num_new_names = len(grouped_oldnames)
    num_old_names = len(unique_old_names)
    extra_oldnames = []

    # Create padded dummy values.  This accounts for the case where it is
    # impossible to uniquely map to the old db
    num_extra = num_new_names - num_old_names
    if num_extra > 0:
        extra_oldnames = ['_extra_name%d' % (count,) for count in
                          range(num_extra)]
    elif num_extra < 0:
        pass
    else:
        extra_oldnames = []
    assignable_names = unique_old_names + extra_oldnames

    total = len(assignable_names)

    # Allocate assignment matrix
    profit_matrix = np.zeros((total, total), dtype=int)
    # Populate assignment profit matrix
    oldname2_idx = ut.make_index_lookup(assignable_names)
    name_freq_list = [ut.dict_hist(names) for names in grouped_oldnames]
    for rowx, name_freq in enumerate(name_freq_list):
        for name, freq in name_freq.items():
            colx = oldname2_idx[name]
            profit_matrix[rowx, colx] += freq
    # Add extra profit for using a previously used name
    profit_matrix[profit_matrix > 0] += 2
    # Add small profit for using an extra name
    extra_colxs = ut.take(oldname2_idx, extra_oldnames)
    profit_matrix[:, extra_colxs] += 1

    # Convert to minimization problem
    big_value = (profit_matrix.max())
    cost_matrix = big_value - profit_matrix
    m = munkres.Munkres()
    indexes = m.compute(cost_matrix)

    # Map output to be aligned with input
    rx2_cx = dict(indexes)
    assignment = [assignable_names[rx2_cx[rx]]
                  for rx in range(num_new_names)]
    return assignment
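
As a hedged illustration of the assignment step above, here is a toy run of the munkres solver on the profit matrix that the third doctest (grouped_oldnames = [['a', 'b'], ['a', 'a', 'b'], ['a']]) would produce, using the same freq+2 bonus for reused names and +1 for the padded extra name. The munkres package is assumed to be installed.

import numpy as np
import munkres  # pip install munkres

# Rows are new name-groups, columns are candidate names ['a', 'b', '_extra_name0'].
profit_matrix = np.array([
    [3, 3, 1],   # group 0 used 'a' once and 'b' once
    [4, 3, 1],   # group 1 used 'a' twice and 'b' once
    [3, 0, 1],   # group 2 used only 'a'
])
# munkres minimizes cost, so convert profit to cost.
cost_matrix = (profit_matrix.max() - profit_matrix).tolist()
indexes = munkres.Munkres().compute(cost_matrix)
names = ['a', 'b', '_extra_name0']
assignment = [names[col] for _, col in sorted(indexes)]
print(assignment)  # ['b', 'a', '_extra_name0']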
Example #28
0
    def get_colordict(self):
        color2_num = ut.dict_hist([m.color for m in self._manas for _ in range(m.num)])
        return color2_num
Example #29
0
def download_sharks(XMLdata, number):
    """
    cd ~/work/WS_ALL
    python -m ibeis.scripts.getshark

    >>> from ibeis.scripts.getshark import *  # NOQA
    >>> url = 'www.whaleshark.org/listImages.jsp'
    >>> XMLdata = ut.url_read(url)
    >>> number = None
    """
    # Prepare the output directory for writing, if it doesn't exist
    output_dir = 'sharkimages'
    ut.ensuredir(output_dir)

    dom = parseString(XMLdata)

    # Download files
    if number:
        maxCount = min(number, len(dom.getElementsByTagName('img')))
    else:
        maxCount = len(dom.getElementsByTagName('img'))

    parsed_info = dict(
        img_url_list=[],
        localid_list=[],
        nameid_list=[],
        orig_fname_list=[],
        new_fname_list=[],
    )

    print('Preparing to fetch %i files...' % maxCount)

    for shark in dom.getElementsByTagName('shark'):
        localCount = 0
        for imageset in shark.getElementsByTagName('imageset'):
            for img in imageset.getElementsByTagName('img'):
                localCount += 1

                img_url = img.getAttribute('href')
                orig_fname = split(img_url)[1]
                ext = splitext(orig_fname)[1].lower()
                nameid = shark.getAttribute('number')

                new_fname = '%s-%i%s' % (nameid, localCount, ext)

                parsed_info['img_url_list'].append(img_url)
                parsed_info['nameid_list'].append(nameid)
                parsed_info['localid_list'].append(localCount)
                parsed_info['orig_fname_list'].append(orig_fname)
                parsed_info['new_fname_list'].append(new_fname)

                print('Parsed %i / %i files.' %
                      (len(parsed_info['orig_fname_list']), maxCount))

                if number is not None and len(
                        parsed_info['orig_fname_list']) == number:
                    break
    parsed_info['new_fpath_list'] = [
        join(output_dir, _fname) for _fname in parsed_info['new_fname_list']
    ]

    print('Filtering parsed images')

    # Filter based on image type (keep only jpgs)
    ext_flags = [
        _fname.endswith('.jpg') or _fname.endswith('.jpeg')
        for _fname in parsed_info['new_fname_list']
    ]
    parsed_info = {
        key: ut.compress(list_, ext_flags)
        for key, list_ in parsed_info.items()
    }

    # Filter to only images matching the appropriate tags
    from ibeis import tag_funcs
    parsed_info['tags_list'] = parse_shark_tags(parsed_info['orig_fname_list'])
    tag_flags = tag_funcs.filterflags_general_tags(
        parsed_info['tags_list'],
        has_any=['view-left'],
        none_match=['qual.*', 'view-top', 'part-.*', 'cropped'],
    )
    parsed_info = {
        key: ut.compress(list_, tag_flags)
        for key, list_ in parsed_info.items()
    }
    print('Tags in chosen images:')
    print(ut.dict_hist(ut.flatten(parsed_info['tags_list'])))

    # Download selected subset
    print('Downloading selected subset')
    _iter = list(
        zip(parsed_info['img_url_list'], parsed_info['new_fpath_list']))
    _iter = ut.ProgressIter(_iter, lbl='downloading sharks')
    for img_url, new_fpath in _iter:
        if not exists(new_fpath):
            ut.download_url(img_url, new_fpath)

    # Remove corrupted or ill-formatted images
    print('Checking for corrupted images')
    import vtool as vt
    noncorrupt_flags = vt.filterflags_valid_images(
        parsed_info['new_fpath_list'])
    parsed_info = {
        key: ut.compress(list_, noncorrupt_flags)
        for key, list_ in parsed_info.items()
    }

    print('Removing small images')
    import numpy as np
    imgsize_list = np.array(
        [vt.open_image_size(gpath) for gpath in parsed_info['new_fpath_list']])
    sqrt_area_list = np.sqrt(np.prod(imgsize_list, axis=1))
    areq_flags_list = sqrt_area_list >= 750
    parsed_info = {
        key: ut.compress(list_, areq_flags_list)
        for key, list_ in parsed_info.items()
    }

    grouped_idxs = ut.group_items(list(range(len(parsed_info['nameid_list']))),
                                  parsed_info['nameid_list'])
    keep_idxs = sorted(
        ut.flatten(
            [idxs for key, idxs in grouped_idxs.items() if len(idxs) >= 2]))
    parsed_info = {
        key: ut.take(list_, keep_idxs)
        for key, list_ in parsed_info.items()
    }

    print('Moving images to secondary directory')
    named_outputdir = 'named-left-sharkimages'
    # Build names
    parsed_info['namedir_fpath_list'] = [
        join(named_outputdir, _nameid, _fname) for _fname, _nameid in zip(
            parsed_info['new_fname_list'], parsed_info['nameid_list'])
    ]
    # Create directories
    ut.ensuredir(named_outputdir)
    named_dirs = ut.unique_ordered(
        list(map(dirname, parsed_info['namedir_fpath_list'])))
    for dir_ in named_dirs:
        ut.ensuredir(dir_)
    # Copy
    ut.copy_files_to(src_fpath_list=parsed_info['new_fpath_list'],
                     dst_fpath_list=parsed_info['namedir_fpath_list'])
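
The function above repeatedly filters every parallel list in parsed_info with a boolean mask via ut.compress. A small dependency-free sketch of that pattern, with hypothetical data:

def compress_columns(table, flags):
    # Keep only the rows where flags is True, applied to every
    # parallel list in the table (stand-in for ut.compress per column).
    return {
        key: [item for item, keep in zip(column, flags) if keep]
        for key, column in table.items()
    }

# Hypothetical parsed_info-style table with two parallel columns.
parsed_info = {
    'new_fname_list': ['a-1.jpg', 'b-1.png', 'c-1.jpg'],
    'nameid_list': ['a', 'b', 'c'],
}
ext_flags = [fname.endswith(('.jpg', '.jpeg'))
             for fname in parsed_info['new_fname_list']]
parsed_info = compress_columns(parsed_info, ext_flags)
print(parsed_info['new_fname_list'])  # ['a-1.jpg', 'c-1.jpg']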
Example #30
0
def get_cnn_labeler_training_images_pytorch(
    ibs,
    dest_path=None,
    image_size=224,
    category_list=None,
    min_examples=10,
    category_mapping=None,
    viewpoint_mapping=None,
    purge=True,
    strict=True,
    skip_rate=0.0,
    valid_rate=0.2,
    use_axis_aligned_chips=False,
    train_gid_set=None,
):
    from os.path import join, expanduser, exists
    import random
    import cv2

    if dest_path is None:
        dest_path = expanduser(join('~', 'Desktop', 'extracted'))

    name = 'labeler-pytorch'
    dbname = ibs.dbname
    name_path = join(dest_path, name)
    train_path = join(name_path, 'train')
    valid_path = join(name_path, 'val')

    if purge:
        ut.delete(name_path)

    ut.ensuredir(name_path)
    ut.ensuredir(train_path)
    ut.ensuredir(valid_path)

    logger.info('category mapping = %s' % (ut.repr3(category_mapping), ))
    logger.info('viewpoint mapping = %s' % (ut.repr3(viewpoint_mapping), ))

    # train_gid_set = ibs.get_valid_gids()
    if train_gid_set is None:
        train_gid_set = set(
            ibs.get_imageset_gids(
                ibs.get_imageset_imgsetids_from_text('TRAIN_SET')))

    aids_list = ibs.get_image_aids(train_gid_set)
    # bboxes_list = [ ibs.get_annot_bboxes(aid_list) for aid_list in aids_list ]
    # aid_list = ibs.get_valid_aids()
    aid_list = ut.flatten(aids_list)
    # import random
    # random.shuffle(aid_list)
    # aid_list = sorted(aid_list[:100])
    species_list = ibs.get_annot_species_texts(aid_list)
    if category_mapping is not None:
        species_list = [
            category_mapping.get(species, species) for species in species_list
        ]
    species_set = set(species_list)
    yaw_list = ibs.get_annot_viewpoints(aid_list)

    if category_list is None:
        category_list = sorted(list(species_set))
        undesired_list = [
            'unspecified_animal',
            ibs.get_species_nice(ibs.const.UNKNOWN_SPECIES_ROWID),
        ]
        for undesired_species in undesired_list:
            if undesired_species in category_list:
                category_list.remove(undesired_species)
    category_set = set(category_list)

    # Filter the tup_list based on the requested categories
    tup_list = list(zip(aid_list, species_list, yaw_list))
    old_len = len(tup_list)
    tup_list = [(aid, species, viewpoint_mapping.get(species,
                                                     {}).get(yaw, yaw))
                for aid, species, yaw in tup_list if species in category_set]
    new_len = len(tup_list)
    logger.info('Filtered annotations: keep %d / original %d' %
                (new_len, old_len))

    # Skip any annotations that are of the wanted category and don't have a specified viewpoint
    counter = 0
    seen_dict = {}
    yaw_dict = {}
    for tup in tup_list:
        aid, species, yaw = tup
        # Keep track of the number of overall instances
        if species not in seen_dict:
            seen_dict[species] = 0
        seen_dict[species] += 1
        # Keep track of yaws that aren't None
        if yaw is not None:
            if species not in yaw_dict:
                yaw_dict[species] = {}
            if yaw not in yaw_dict[species]:
                yaw_dict[species][yaw] = 0
            yaw_dict[species][yaw] += 1
        else:
            counter += 1

    # Get the list of species that do not have enough viewpoint examples for training
    invalid_seen_set = set([])
    invalid_yaw_set = set([])
    for species in seen_dict:
        # Check that the number of instances is above the min_examples
        if seen_dict[species] < min_examples:
            invalid_seen_set.add(species)
            continue
        # If the species has viewpoints, check them as well
        if strict:
            if species in yaw_dict:
                # Check that all viewpoints exist
                # if len(yaw_dict[species]) < 8:
                #     invalid_yaw_set.add(species)
                #     continue
                # Check that all viewpoints have a minimum number of instances
                for yaw in yaw_dict[species]:
                    # assert yaw in ibs.const.VIEWTEXT_TO_YAW_RADIANS
                    if yaw_dict[species][yaw] < min_examples:
                        invalid_yaw_set.add(species)
                        continue
            else:
                invalid_yaw_set.add(species)
                continue

    logger.info('Null yaws: %d' % (counter, ))
    valid_seen_set = category_set - invalid_seen_set
    valid_yaw_set = valid_seen_set - invalid_yaw_set
    logger.info('Requested categories:')
    category_set = sorted(category_set)
    ut.print_list(category_set)
    # logger.info('Invalid yaw categories:')
    # ut.print_list(sorted(invalid_yaw_set))
    # logger.info('Valid seen categories:')
    # ut.print_list(sorted(valid_seen_set))
    logger.info('Valid yaw categories:')
    valid_yaw_set = sorted(valid_yaw_set)
    ut.print_list(valid_yaw_set)
    logger.info('Invalid seen categories (could not fulfill request):')
    invalid_seen_set = sorted(invalid_seen_set)
    ut.print_list(invalid_seen_set)

    skipped_yaw = 0
    skipped_seen = 0
    aid_list_ = []
    category_list_ = []
    for tup in tup_list:
        aid, species, yaw = tup
        if species in valid_yaw_set:
            # If the species is valid, but this specific annotation has no yaw, skip it
            if yaw is None:
                skipped_yaw += 1
                continue
            category = '%s:%s' % (species, yaw)
        elif species in valid_seen_set:
            category = '%s' % (species, )
        else:
            skipped_seen += 1
            continue
        aid_list_.append(aid)
        category_list_.append(category)
    logger.info('Skipped Yaw:  skipped %d / total %d' %
                (skipped_yaw, len(tup_list)))
    logger.info('Skipped Seen: skipped %d / total %d' %
                (skipped_seen, len(tup_list)))

    for category in sorted(set(category_list_)):
        logger.info('Making folder for %r' % (category, ))
        ut.ensuredir(join(train_path, category))
        ut.ensuredir(join(valid_path, category))

    config = {
        'dim_size': (image_size, image_size),
        'resize_dim': 'wh',
        'axis_aligned': use_axis_aligned_chips,
    }
    chip_list_ = ibs.depc_annot.get_property('chips',
                                             aid_list_,
                                             'img',
                                             config=config)

    # Get training data
    label_list = []
    for aid, chip, category in zip(aid_list_, chip_list_, category_list_):

        args = (aid, )
        logger.info('Processing AID: %r' % args)

        if skip_rate > 0.0 and random.uniform(0.0, 1.0) <= skip_rate:
            logger.info('\t Skipping')
            continue

        is_valid = random.uniform(0.0, 1.0) < valid_rate
        dest_path = valid_path if is_valid else train_path
        raw_path = join(dest_path, category)
        assert exists(dest_path)

        # Compute data
        values = (
            dbname,
            aid,
        )
        patch_filename = '%s_annot_aid_%s.png' % values
        patch_filepath = join(raw_path, patch_filename)
        cv2.imwrite(patch_filepath, chip)

        # Compute label
        label = '%s,%s' % (patch_filename, category)
        label_list.append(label)

    logger.info('Using labels for labeler training:')
    logger.info(ut.repr3(ut.dict_hist(category_list_)))

    return name_path
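
A compact, ibs-free sketch of the category construction above: species:viewpoint labels are kept only when the species has enough examples overall and every observed viewpoint has at least min_examples instances. The annotation tuples are invented for illustration.

# Made-up (aid, species, yaw) tuples standing in for the ibs queries above.
tup_list = [
    (1, 'zebra', 'left'), (2, 'zebra', 'left'),
    (3, 'zebra', 'right'), (4, 'zebra', 'right'),
    (5, 'zebra', None), (6, 'giraffe', 'left'),
]
min_examples = 2

seen, yaws = {}, {}
for aid, species, yaw in tup_list:
    seen[species] = seen.get(species, 0) + 1
    if yaw is not None:
        yaws.setdefault(species, {})
        yaws[species][yaw] = yaws[species].get(yaw, 0) + 1

valid_yaw = {
    s for s in seen
    if seen[s] >= min_examples and s in yaws
    and all(n >= min_examples for n in yaws[s].values())
}
categories = sorted({
    '%s:%s' % (species, yaw)
    for _, species, yaw in tup_list
    if species in valid_yaw and yaw is not None
})
# ['zebra:left', 'zebra:right']; the None-yaw zebra and the rare giraffe are skipped
print(categories)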
Example #31
0
def fix_annotmatch_pzmaster1():
    """
    PZ_Master1 had annotmatch rowids that did not agree with the current name
    labeling. Looking at the inconsistencies in the graph interface was too
    cumbersome, because over 3000 annots were incorrectly grouped together.

    This function deletes any annotmatch rowid that is not consistent with the
    current labeling so we can go forward with using the new AnnotInference
    object
    """
    import wbia

    ibs = wbia.opendb('PZ_Master1')
    infr = wbia.AnnotInference(ibs=ibs, aids=ibs.get_valid_aids(), verbose=5)
    infr.initialize_graph()
    annots = ibs.annots()
    aid_to_nid = ut.dzip(annots.aids, annots.nids)

    if False:
        infr.reset_feedback()
        infr.ensure_mst()
        infr.apply_feedback_edges()
        infr.relabel_using_reviews()
        infr.start_qt_interface()

    # Get annotmatch rowids that agree with current labeling
    if False:
        annotmatch = ibs.db.get_table_as_pandas('annotmatch')
        import pandas as pd

        flags1 = pd.isnull(annotmatch['annotmatch_evidence_decision'])
        flags2 = annotmatch['annotmatch_tag_text'] == ''
        bad_part = annotmatch[flags1 & flags2]
        rowids = bad_part.index.tolist()
        ibs.delete_annotmatch(rowids)

    if False:
        # Delete bidirectional annotmatches
        annotmatch = ibs.db.get_table_as_pandas('annotmatch')
        df = annotmatch.set_index(['annot_rowid1', 'annot_rowid2'])

        # Find entires that have both directions
        pairs1 = annotmatch[['annot_rowid1', 'annot_rowid2']].values
        f_edges = {tuple(p) for p in pairs1}
        b_edges = {tuple(p[::-1]) for p in pairs1}
        isect_edges = {tuple(sorted(p)) for p in b_edges.intersection(f_edges)}
        isect_edges1 = list(isect_edges)
        isect_edges2 = [p[::-1] for p in isect_edges]

        # cols = ['annotmatch_evidence_decision', 'annotmatch_tag_text']
        import pandas as pd

        custom_ = {
            (559, 4909): (False, ['photobomb']),
            (7918, 8041): (False, ['photobomb']),
            (6634, 6754): (False, ['photobomb']),
            (3707, 3727): (False, ['photobomb']),
            (86, 103): (False, ['photobomb']),
        }
        extra_ = {}

        fixme_edges = []

        d1 = df.loc[isect_edges1].reset_index(drop=False)
        d2 = df.loc[isect_edges2].reset_index(drop=False)
        flags = d1['annotmatch_evidence_decision'] != d2[
            'annotmatch_evidence_decision']
        from wbia.tag_funcs import _parse_tags

        for f, r1, r2 in zip(flags, d1.iterrows(), d2.iterrows()):
            v1, v2 = r1[1], r2[1]
            aid1 = v1['annot_rowid1']
            aid2 = v1['annot_rowid2']
            truth_real = (ibs.const.EVIDENCE_DECISION.POSITIVE
                          if aid_to_nid[aid1] == aid_to_nid[aid2] else
                          ibs.const.EVIDENCE_DECISION.NEGATIVE)
            truth1 = v1['annotmatch_evidence_decision']
            truth2 = v2['annotmatch_evidence_decision']
            t1 = _parse_tags(v1['annotmatch_tag_text'])
            t2 = _parse_tags(v2['annotmatch_tag_text'])
            newtag = ut.union_ordered(t1, t2)
            if (aid1, aid2) in custom_:
                continue
            fixme_flag = False
            if not pd.isnull(truth1):
                if truth_real != truth1:
                    fixme_flag = True
            if not pd.isnull(truth2):
                if truth_real != truth2:
                    fixme_flag = True
            if fixme_flag:
                logger.info('newtag = %r' % (newtag, ))
                logger.info('truth_real = %r' % (truth_real, ))
                logger.info('truth1 = %r' % (truth1, ))
                logger.info('truth2 = %r' % (truth2, ))
                logger.info('aid1 = %r' % (aid1, ))
                logger.info('aid2 = %r' % (aid2, ))
                fixme_edges.append((aid1, aid2))
            else:
                extra_[(aid1, aid2)] = (truth_real, newtag)

        extra_.update(custom_)
        new_pairs = extra_.keys()
        new_truths = ut.take_column(ut.dict_take(extra_, new_pairs), 0)
        new_tags = ut.take_column(ut.dict_take(extra_, new_pairs), 1)
        new_tag_texts = [';'.join(t) for t in new_tags]
        aids1, aids2 = ut.listT(new_pairs)

        # Delete the old
        ibs.delete_annotmatch((d1['annotmatch_rowid'].values.tolist() +
                               d2['annotmatch_rowid'].values.tolist()))

        # Add the new
        ams = ibs.add_annotmatch_undirected(aids1, aids2)
        ibs.set_annotmatch_evidence_decision(ams, new_truths)
        ibs.set_annotmatch_tag_text(ams, new_tag_texts)

        if False:
            import wbia.guitool as gt

            gt.ensure_qapp()
            ut.qtensure()
            from wbia.gui import inspect_gui

            inspect_gui.show_vsone_tuner(ibs, aid1, aid2)

        # pairs2 = pairs1.T[::-1].T
        # idx1, idx2 = ut.isect_indices(list(map(tuple, pairs1)),
        #                               list(map(tuple, pairs2)))
        # r_edges = list(set(map(tuple, map(sorted, pairs1[idx1]))))
        # unique_pairs = list(set(map(tuple, map(sorted, pairs1[idx1]))))
        # df = annotmatch.set_index(['annot_rowid1', 'annot_rowid2'])

    x = ut.ddict(list)
    annotmatch = ibs.db.get_table_as_pandas('annotmatch')
    import ubelt as ub

    _iter = annotmatch.iterrows()
    prog = ub.ProgIter(_iter, length=len(annotmatch))
    for k, m in prog:
        aid1 = m['annot_rowid1']
        aid2 = m['annot_rowid2']
        if m['annotmatch_evidence_decision'] == ibs.const.EVIDENCE_DECISION.POSITIVE:
            if aid_to_nid[aid1] == aid_to_nid[aid2]:
                x['agree1'].append(k)
            else:
                x['disagree1'].append(k)
        elif m['annotmatch_evidence_decision'] == ibs.const.EVIDENCE_DECISION.NEGATIVE:
            if aid_to_nid[aid1] == aid_to_nid[aid2]:
                x['disagree2'].append(k)
            else:
                x['agree2'].append(k)

    ub.map_vals(len, x)
    ut.dict_hist(annotmatch.loc[x['disagree1']]['annotmatch_tag_text'])

    disagree1 = annotmatch.loc[x['disagree1']]
    pb_disagree1 = disagree1[disagree1['annotmatch_tag_text'] == 'photobomb']
    aids1 = pb_disagree1['annot_rowid1'].values.tolist()
    aids2 = pb_disagree1['annot_rowid2'].values.tolist()
    aid_pairs = list(zip(aids1, aids2))
    infr = wbia.AnnotInference.from_pairs(aid_pairs, ibs=ibs, verbose=5)
    if False:
        feedback = infr.read_wbia_annotmatch_feedback(edges=infr.edges())
        infr.external_feedback = feedback
        infr.apply_feedback_edges()
        infr.start_qt_interface(loop=False)

    # Delete these values
    if False:
        nonpb_disagree1 = disagree1[
            disagree1['annotmatch_tag_text'] != 'photobomb']
        disagree2 = annotmatch.loc[x['disagree2']]
        ibs.delete_annotmatch(nonpb_disagree1['annotmatch_rowid'])
        ibs.delete_annotmatch(disagree2['annotmatch_rowid'])

    # ut.dict_hist(disagree1['annotmatch_tag_text'])
    import networkx as nx

    graph = nx.Graph()
    graph.add_edges_from(
        zip(pb_disagree1['annot_rowid1'], pb_disagree1['annot_rowid2']))
    list(nx.connected_components(graph))

    set(annotmatch.loc[x['disagree2']]['annotmatch_tag_text'])
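
The core consistency check in the function above compares each pairwise decision against the current name labeling. A simplified sketch with plain dicts instead of the pandas annotmatch table (the keys and decision values are placeholders, not the real schema):

from collections import defaultdict

POSITIVE, NEGATIVE = 'positive', 'negative'

# Hypothetical data: aid -> name id, and reviewed pairs with a decision.
aid_to_nid = {1: 'n1', 2: 'n1', 3: 'n2'}
matches = [
    {'aid1': 1, 'aid2': 2, 'decision': POSITIVE},   # agrees with the names
    {'aid1': 1, 'aid2': 3, 'decision': POSITIVE},   # disagrees (different names)
    {'aid1': 2, 'aid2': 3, 'decision': NEGATIVE},   # agrees (different names)
]

buckets = defaultdict(list)
for k, m in enumerate(matches):
    same_name = aid_to_nid[m['aid1']] == aid_to_nid[m['aid2']]
    if m['decision'] == POSITIVE:
        buckets['agree1' if same_name else 'disagree1'].append(k)
    elif m['decision'] == NEGATIVE:
        buckets['disagree2' if same_name else 'agree2'].append(k)

# The disagree buckets are the candidates for deletion or manual review.
print({key: len(val) for key, val in buckets.items()})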
Example #32
0
def find_consistent_labeling(grouped_oldnames,
                             extra_prefix='_extra_name',
                             verbose=False):
    r"""
    Solves a maximum bipartite matching problem to find a consistent
    name assignment that minimizes the number of annotations with different
    names.

    For each group of annotations we must assign them all the same name,
    preferably one chosen from the old names already used within the group.

    To reduce the running time, trivial groups (those sharing no old name
    with any other group) are resolved directly, and the remaining groups
    are partitioned into disjoint subproblems that are solved independently.

    Args:
        grouped_oldnames (list): A group of old names where the grouping is
            based on new names. For instance:

                Given:
                    aids      = [1, 2, 3, 4, 5]
                    old_names = [0, 1, 1, 1, 0]
                    new_names = [0, 0, 1, 1, 0]

                The grouping is
                    [[0, 1, 0], [1, 1]]

                This lets us keep the old names in a split case and
                re-use existing names and make minimal changes to
                current annotation names while still being consistent
                with the new and improved grouping.

                The output will be:
                    [0, 1]

                Meaning that all annots in the first group are assigned the
                name 0 and all annots in the second group are assigned the name
                1.

    References:
        http://stackoverflow.com/questions/1398822/assignment-problem-numpy

    CommandLine:
        python -m ibeis.scripts.name_recitifer find_consistent_labeling


    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> grouped_oldnames = testdata_oldnames(25, 15,  5, n_per_incon=5)
        >>> new_names = find_consistent_labeling(grouped_oldnames, verbose=1)
        >>> grouped_oldnames = testdata_oldnames(0, 15,  5, n_per_incon=1)
        >>> new_names = find_consistent_labeling(grouped_oldnames, verbose=1)
        >>> grouped_oldnames = testdata_oldnames(0, 0, 0, n_per_incon=1)
        >>> new_names = find_consistent_labeling(grouped_oldnames, verbose=1)

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> ydata = []
        >>> xdata = list(range(10, 150, 50))
        >>> for x in xdata:
        >>>     print('x = %r' % (x,))
        >>>     grouped_oldnames = testdata_oldnames(x, 15,  5, n_per_incon=5)
        >>>     t = ut.Timerit(3, verbose=1)
        >>>     for timer in t:
        >>>         with timer:
        >>>             new_names = find_consistent_labeling(grouped_oldnames)
        >>>     ydata.append(t.ave_secs)
        >>> ut.quit_if_noshow()
        >>> import plottool_ibeis as pt
        >>> pt.qtensure()
        >>> pt.multi_plot(xdata, [ydata])
        >>> ut.show_if_requested()

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> grouped_oldnames = [['a', 'b', 'c'], ['b', 'c'], ['c', 'e', 'e']]
        >>> new_names = find_consistent_labeling(grouped_oldnames, verbose=1)
        >>> result = ut.repr2(new_names)
        >>> print(new_names)
        ['a', 'b', 'e']

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> grouped_oldnames = [['a', 'b'], ['a', 'a', 'b'], ['a']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> result = ut.repr2(new_names)
        >>> print(new_names)
        ['b', 'a', '_extra_name0']

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> grouped_oldnames = [['a', 'b'], ['e'], ['a', 'a', 'b'], [], ['a'], ['d']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> result = ut.repr2(new_names)
        >>> print(new_names)
        ['b', 'e', 'a', '_extra_name0', '_extra_name1', 'd']

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.scripts.name_recitifer import *  # NOQA
        >>> grouped_oldnames = [[], ['a', 'a'], [],
        >>>                     ['a', 'a', 'a', 'a', 'a', 'a', 'a', 'b'], ['a']]
        >>> new_names = find_consistent_labeling(grouped_oldnames)
        >>> result = ut.repr2(new_names)
        >>> print(new_names)
        ['_extra_name0', 'a', '_extra_name1', 'b', '_extra_name2']
    """
    unique_old_names = ut.unique(ut.flatten(grouped_oldnames))
    n_old_names = len(unique_old_names)
    n_new_names = len(grouped_oldnames)

    # Initialize assignment to all Nones
    assignment = [None for _ in range(n_new_names)]

    if verbose:
        print('finding maximally consistent labeling')
        print('n_old_names = %r' % (n_old_names, ))
        print('n_new_names = %r' % (n_new_names, ))

    # For each old_name, determine now many new_names use it.
    oldname_sets = list(map(set, grouped_oldnames))
    oldname_usage = ut.dict_hist(ut.flatten(oldname_sets))

    # Any name used more than once is a conflict and must be resolved
    conflict_oldnames = {k for k, v in oldname_usage.items() if v > 1}

    # Partition into trivial and non-trivial cases
    nontrivial_oldnames = []
    nontrivial_new_idxs = []

    trivial_oldnames = []
    trivial_new_idxs = []
    for new_idx, group in enumerate(grouped_oldnames):
        if set(group).intersection(conflict_oldnames):
            nontrivial_oldnames.append(group)
            nontrivial_new_idxs.append(new_idx)
        else:
            trivial_oldnames.append(group)
            trivial_new_idxs.append(new_idx)

    # Rectify trivial cases
    # Any new-name that does not share any of its old-names with other
    # new-names can be resolved trivially
    n_trivial_unchanged = 0
    n_trivial_ignored = 0
    n_trivial_merges = 0
    for group, new_idx in zip(trivial_oldnames, trivial_new_idxs):
        if len(group) > 0:
            # new-names that use more than one old-name are simple merges
            h = ut.dict_hist(group)
            if len(h) > 1:
                n_trivial_merges += 1
            else:
                n_trivial_unchanged += 1
            hitems = list(h.items())
            hvals = [i[1] for i in hitems]
            maxval = max(hvals)
            g = min([k for k, v in hitems if v == maxval])
            assignment[new_idx] = g
        else:
            # new-names that use no old-names can be ignored
            n_trivial_ignored += 1

    if verbose:
        n_trivial = len(trivial_oldnames)
        n_nontrivial = len(nontrivial_oldnames)
        print('rectify %d trivial groups' % (n_trivial, ))
        print('  * n_trivial_unchanged = %r' % (n_trivial_unchanged, ))
        print('  * n_trivial_merges = %r' % (n_trivial_merges, ))
        print('  * n_trivial_ignored = %r' % (n_trivial_ignored, ))
        print('rectify %d non-trivial groups' % (n_nontrivial, ))

    # Partition nontrivial_oldnames into smaller disjoint sets
    nontrivial_oldnames_sets = list(map(set, nontrivial_oldnames))
    import networkx as nx
    g = nx.Graph()
    g.add_nodes_from(range(len(nontrivial_oldnames_sets)))
    for u, group1 in enumerate(nontrivial_oldnames_sets):
        rest = nontrivial_oldnames_sets[u + 1:]
        for v, group2 in enumerate(rest, start=u + 1):
            if group1.intersection(group2):
                g.add_edge(u, v)
    nontrivial_partition = list(nx.connected_components(g))
    if verbose:
        print('  * partitioned non-trivial into %d subgroups' %
              (len(nontrivial_partition)))
        part_size_stats = ut.get_stats(map(len, nontrivial_partition))
        stats_str = ut.repr2(part_size_stats, precision=2, strkeys=True)
        print('  * partition size stats = %s' % (stats_str, ))

    # Rectify nontrivial cases
    for part_idxs in ut.ProgIter(nontrivial_partition,
                                 lbl='rectify parts',
                                 enabled=verbose):
        part_oldnames = ut.take(nontrivial_oldnames, part_idxs)
        part_newidxs = ut.take(nontrivial_new_idxs, part_idxs)
        # Rectify this part
        assignment_ = simple_munkres(part_oldnames)
        for new_idx, new_name in zip(part_newidxs, assignment_):
            assignment[new_idx] = new_name

    # Any unassigned name is now given a new unique label with a prefix
    if extra_prefix is not None:
        num_extra = 0
        for idx, val in enumerate(assignment):
            if val is None:
                assignment[idx] = '%s%d' % (
                    extra_prefix,
                    num_extra,
                )
                num_extra += 1
    return assignment
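
For the non-trivial partitions, simple_munkres solves a small assignment problem per connected component. A hedged sketch of the same idea using scipy.optimize.linear_sum_assignment instead (scipy is assumed to be available; rectify_part is a hypothetical helper, not the project's simple_munkres):

import numpy as np
from scipy.optimize import linear_sum_assignment

def rectify_part(part_oldnames):
    # Assign each group a distinct old name, maximizing how often groups
    # keep a name they already used; None means the group gets no old name.
    names = sorted({n for group in part_oldnames for n in group})
    n_rows, n_cols = len(part_oldnames), len(names)
    size = max(n_rows, n_cols)
    profit = np.zeros((size, size))
    for row, group in enumerate(part_oldnames):
        for name in group:
            profit[row, names.index(name)] += 1
    rows, cols = linear_sum_assignment(-profit)  # negate to maximize profit
    lookup = dict(zip(rows, cols))
    return [names[lookup[r]]
            if lookup[r] < n_cols and profit[r, lookup[r]] > 0 else None
            for r in range(n_rows)]

print(rectify_part([['a', 'b'], ['a', 'a', 'b'], ['a']]))
# ['b', 'a', None]; the None would later become '_extra_name0'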
Example #33
0
    def fix_duplicates(drive):
        r"""
        For every duplicate file passing a filter (e.g. avi), remove the
        copy that is in the smallest directory. On a tie, use the smallest
        dpath. This collects the duplicate files scattered across folders
        into a single folder.

        But we also need to look at the non-duplicates in that folder and
        decide if they should be moved as well. So this should trigger on
        folders that are at least 50% duplicates. We might not want to move
        curated folders.

        Example:
            cd ~/local/scripts
            >>> from register_files import *  # NOQA
            >>> dpaths = ut.get_argval('--drives', type_=list, default=['E:/'])#'D:/', 'E:/', 'F:/'])
            >>> drives = [Drive(root_dpath) for root_dpath in dpaths]
            >>> E = drive = drives[0]
            >>> #D, E, F = drives
        """
        print('Fixing Duplicates in %r' % (drive,))
        list_ = drive.fpath_hashX_list
        multiindex_dict_ = build_multindex(list_)
        duplicate_hashes = [
            key for key, val in six.iteritems(multiindex_dict_)
            if len(val) > 1
        ]
        duplicate_idxs = ut.dict_take(multiindex_dict_, duplicate_hashes)
        unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
        # Check if any dups have been removed
        still_exists = ut.unflat_map(exists, unflat_fpaths)
        unflat_idxs2 = ut.zipcompress(duplicate_idxs, still_exists)
        duplicate_idxs = [idxs for idxs in unflat_idxs2 if len(idxs) > 1]
        # Look at duplicate files
        unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
        unflat_sizes = ut.list_unflat_take(drive.fpath_bytes_list, duplicate_idxs)
        # Find highly coupled directories
        if True:
            coupled_dirs = []
            for fpaths in unflat_fpaths:
                #basedir = ut.longest_existing_path(commonprefix(fpaths))
                dirs = sorted(list(map(dirname, fpaths)))
                _list = list(range(len(dirs)))
                idxs = ut.upper_diag_self_prodx(_list)
                coupled_dirs.extend(list(map(tuple, ut.list_unflat_take(dirs, idxs))))
            hist_ = ut.dict_hist(coupled_dirs)
            coupled_idxs = ut.list_argsort(hist_.values())[::-1]
            most_coupled = ut.take(list(hist_.keys()), coupled_idxs[0:100])
            print('Coupled fpaths: ' + ut.list_str(most_coupled, nl=True))
        print('%d unique files are duplicated' % (len(unflat_sizes),))
        #print('Duplicate sizes: ' + ut.list_str(unflat_sizes[0:10], nl=True))
        #print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths[0:10], nl=True))
        #print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths[0::5], nl=True))
        print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths, nl=True))
        # Find duplicate directories
        dpath_list = list(drive.dpath_to_fidx.keys())
        fidxs_list = ut.dict_take(drive.dpath_to_fidx, drive.dpath_list)
        #exists_list = list(map(exists, drive.fpath_list))
        #unflat_exists = ut.list_unflat_take(exists_list, fidxs_list)
        fname_registry = [basename(fpath) for fpath in drive.fpath_list]
        unflat_fnames = ut.list_unflat_take(fname_registry, fidxs_list)
        def unsorted_list_hash(list_):
            return ut.hashstr27(str(sorted(list_)))
        unflat_fname_sets = list(map(unsorted_list_hash, ut.ProgIter(unflat_fnames, freq=10000)))
        fname_based_duplicate_dpaths = []
        multiindex_dict2_ = build_multindex(unflat_fname_sets)
        fname_based_duplicate_hashes = [key for key, val in multiindex_dict2_.items() if len(val) > 1]
        print('#fname_based_duplicate_dpaths = %r' % (len(fname_based_duplicate_hashes),))
        fname_based_duplicate_didxs = ut.dict_take(multiindex_dict2_, fname_based_duplicate_hashes)
        fname_based_duplicate_dpaths = ut.list_unflat_take(dpath_list, fname_based_duplicate_didxs)
        print(ut.repr3(fname_based_duplicate_dpaths[0:10]))
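
The duplicate detection above relies on grouping files by a content hash (drive.fpath_hashX_list). A self-contained sketch of that grouping with hashlib, using invented paths:

import hashlib
from collections import defaultdict

def file_md5(fpath, blocksize=2 ** 20):
    # Hash the file contents in chunks so large files are not read into memory at once.
    hasher = hashlib.md5()
    with open(fpath, 'rb') as file_:
        for chunk in iter(lambda: file_.read(blocksize), b''):
            hasher.update(chunk)
    return hasher.hexdigest()

def find_duplicate_groups(fpath_list):
    # Group paths by content hash; any group with more than one path is a duplicate set.
    groups = defaultdict(list)
    for fpath in fpath_list:
        groups[file_md5(fpath)].append(fpath)
    return [fpaths for fpaths in groups.values() if len(fpaths) > 1]

# duplicate_groups = find_duplicate_groups(['E:/a/x.avi', 'E:/b/x.avi'])
# Each group could then be resolved with the smallest-directory heuristic above.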
Example #34
0
def parse_shark_tags(orig_fname_list):
    import re

    invalid_tag_patterns = [
        re.escape('-'),
        re.escape('(') + '?\\d*' + re.escape(')') + '?',
        '\\d+-\\d+-\\d+',
        '\\d+,',
        '\\d+',
        'vi*',
        'i*v',
        'i+',
        '\\d+th',
        '\\d+nd',
        '\\d+rd',
        'remant',
        'timnfe',
        't',
        'e',
        'sjl',
        'disc',
        'dec',
        'road',
        'easter',
        'western',
        'west',
        'tn',
        '\\d*ap',
        'whaleshark\\d*',
        'shark\\d*',
        'whale\\d*',
        'whalesharking',
        'sharking',
        'whalesharks',
        'whales',
        'picture',
        'australien',
        'australia',
        'nick',
        'tim\\d*',
        'imageset',
        'holiday',
        'visit',
        'tour',
        'trip',
        'pec',
        'sv',
        'a',
        'b',
        'gender',
        'sex',
        'img',
        'image',
        'pic',
        'pics',
        'leith',
        'trips',
        'kings',
        'photo',
        'video',
        'media',
        'fix',
        'feeding',
        'nrd',
        'nd',
        'gen',
        'wa',
        'nmp',
        'bo',
        'kd',
        'ow',
        'ne',
        'dsc',
        'nwd',
        'mg',
        'w',
        'mai',
        'blue',
        'stumpy',
        'oea',
        'cbe',
        'edc',
        'knrt',
        'tiws2',
        'ando',
        'adv',
        'str',
        'adventure',
        'camera',
        'tag',
        'id',
        'of',
        'and',
        'tagged',
        'from',
        'day',
        '\\d*april',
        '\\d*may',
        '\\d*july',
        '\\d*june',
        'ningaloo',
        'ningblue\\d*',
        'kooling',
    ]

    valid_tag_level_set = [
        ['view-left', 'left', 'lhs', 'l', 'leftside'],
        ['view-right', 'right', 'rhs', 'r', 'rightside'],
        ['view-back', 'back'],
        ['view-top', 'top'],
        ['sex-male', 'male', 'm', 'sexm'],
        ['sex-female', 'female', 'f'],
        ['sex-unknown', 'unknown', 'u'],
        ['part-tail', 'tail'],
        ['part-flank', 'side', 'flank'],
        ['part-head', 'head'],
        ['part-pectoral', 'pectoral', 'pec'],
        ['part-dorsal', 'dorsal', 'dorsals'],
        ['part-claspers', 'claspers', 'clasper'],
        ['part-fin', 'fin'],
        ['cropped', 'crop'],
        ['scar', 'scar2'],
        ['notch'],
        ['small'],
        ['bite'],
        ['cam-slr2', 'slr2'],
        #['cam-5m', '5m']
        ['5m'],
        ['7m'],
        ['4m'],
        ['copy'],
        ['qual-resize'],
        ['qual-stretched'],
    ]

    def apply_enum_regex(pat_list):
        enum_endings = [
            '[a-g]',
            '\\d*',
            'i*',
        ]
        expanded_pats = ut.flatten([[pat + end for end in enum_endings]
                                    for pat in pat_list])
        return expanded_pats

    def apply_regex_endings(pat_list):
        return [p + '$' for p in pat_list]

    tag_alias_map = {}
    for level_set in valid_tag_level_set:
        main_key = level_set[0]
        for key in level_set:
            tag_alias_map[key] = main_key

    inverse_alias_map = {}
    for level_set in valid_tag_level_set:
        inverse_alias_map[level_set[0]] = level_set

    regex_alias_map = {
        'view-left':
        apply_regex_endings(apply_enum_regex(inverse_alias_map['view-left'])),
        'view-right':
        apply_regex_endings(apply_enum_regex(inverse_alias_map['view-right'])),
    }

    valid_tags = list(inverse_alias_map.keys())

    invalid_tag_patterns = apply_regex_endings(invalid_tag_patterns)

    def parse_all_fname_tags(fname):
        _tags = [splitext(fname)[0]]
        _tags = ut.flatten([t.split('_') for t in _tags])
        _tags = ut.flatten([t.split('.') for t in _tags])
        _tags = [t.lower() for t in _tags]
        _tags = [tag_alias_map.get(t, t) for t in _tags]
        for key, vals in regex_alias_map.items():
            pat = ut.regex_or(vals)
            _tags = [key if re.match(pat, t) else t for t in _tags]
        pat = ut.regex_or(invalid_tag_patterns)
        _tags = [t for t in _tags if not re.match(pat, t)]
        _tags = ut.unique_ordered(_tags)
        return _tags

    all_img_tag_list = list(map(parse_all_fname_tags, orig_fname_list))

    known_img_tag_list = [
        list(set(tags).intersection(set(valid_tags)))
        for tags in all_img_tag_list
    ]

    if False:
        # Help figure out which tags are important
        _parsed_tags = ut.flatten(all_img_tag_list)

        taghist = ut.dict_hist(_parsed_tags)
        taghist = {key: val for key, val in taghist.items() if val > 1}

        unknown_taghist = sorted([(val, key) for key, val in taghist.items()
                                  if key not in valid_tags])[::-1]
        known_taghist = sorted([(val, key) for key, val in taghist.items()
                                if key in valid_tags])[::-1]

        print('Known')
        print(ut.list_str(known_taghist[0:100]))

        print('Unknown')
        print(ut.list_str(unknown_taghist[0:100]))

        print(
            ut.dict_str(ut.dict_hist(ut.flatten(known_img_tag_list)),
                        key_order_metric='val'))

    return known_img_tag_list
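
A condensed sketch of the filename-to-tag normalization above: split the stem on separators, map aliases to canonical tags, and drop noise patterns. The alias and noise tables here are tiny invented stand-ins for the full lists.

import re

tag_alias_map = {'l': 'view-left', 'left': 'view-left', 'f': 'sex-female'}
invalid_tag_patterns = ['\\d+', 'img', 'dsc']
noise_pat = '|'.join('(?:%s)$' % p for p in invalid_tag_patterns)

def parse_fname_tags(fname):
    # Split the stem on '_' and '.', lowercase, canonicalize, then drop noise.
    stem = fname.rsplit('.', 1)[0]
    tags = re.split('[_.]', stem.lower())
    tags = [tag_alias_map.get(t, t) for t in tags]
    return [t for t in tags if t and not re.match(noise_pat, t)]

print(parse_fname_tags('IMG_1234_left_f.jpg'))
# ['view-left', 'sex-female']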
Example #35
0
def ggr_random_name_splits():
    """
    CommandLine:
        python -m wbia.viz.viz_graph2 ggr_random_name_splits --show

    Ignore:
        sshfs -o idmap=user lev:/ ~/lev

    Example:
        >>> # DISABLE_DOCTEST
        >>> from wbia.viz.viz_graph2 import *  # NOQA
        >>> ggr_random_name_splits()
    """
    import wbia.guitool as gt

    gt.ensure_qtapp()
    # nid_list = ibs.get_valid_nids(filter_empty=True)
    import wbia

    dbdir = '/media/danger/GGR/GGR-IBEIS'
    dbdir = (dbdir if ut.checkpath(dbdir) else
             ut.truepath('~/lev/media/danger/GGR/GGR-IBEIS'))
    ibs = wbia.opendb(dbdir=dbdir, allow_newdir=False)

    import datetime

    day1 = datetime.date(2016, 1, 30)
    day2 = datetime.date(2016, 1, 31)

    orig_filter_kw = {
        'multiple': None,
        # 'view': ['right'],
        # 'minqual': 'good',
        'is_known': True,
        'min_pername': 2,
    }
    orig_aids = ibs.filter_annots_general(filter_kw=ut.dict_union(
        orig_filter_kw,
        {
            'min_unixtime':
            ut.datetime_to_posixtime(ut.date_to_datetime(day1, 0.0)),
            'max_unixtime':
            ut.datetime_to_posixtime(ut.date_to_datetime(day2, 1.0)),
        },
    ))
    orig_all_annots = ibs.annots(orig_aids)
    orig_unique_nids, orig_grouped_annots_ = orig_all_annots.group(
        orig_all_annots.nids)
    # Ensure we get everything
    orig_grouped_annots = [
        ibs.annots(aids_) for aids_ in ibs.get_name_aids(orig_unique_nids)
    ]

    # pip install quantumrandom
    if False:
        import quantumrandom

        data = quantumrandom.uint16()
        seed = data.sum()
        print('seed = %r' % (seed, ))
        # import Crypto.Random
        # from Crypto import Random
        # quantumrandom.get_data()
        # StrongRandom = Crypto.Random.random.StrongRandom
        # aes.reseed(3340258)
        # chars = [str(chr(x)) for x in data.view(np.uint8)]
        # aes_seed = str('').join(chars)
        # aes = Crypto.Random.Fortuna.FortunaGenerator.AESGenerator()
        # aes.reseed(aes_seed)
        # aes.pseudo_random_data(10)

    orig_rand_idxs = ut.random_indexes(len(orig_grouped_annots), seed=3340258)
    orig_sample_size = 75
    random_annot_groups = ut.take(orig_grouped_annots, orig_rand_idxs)
    orig_annot_sample = random_annot_groups[:orig_sample_size]

    # OOOPS MADE ERROR REDO ----

    filter_kw = {
        'multiple': None,
        'view': ['right'],
        'minqual': 'good',
        'is_known': True,
        'min_pername': 2,
    }
    filter_kw_ = ut.dict_union(
        filter_kw,
        {
            'min_unixtime':
            ut.datetime_to_posixtime(ut.date_to_datetime(day1, 0.0)),
            'max_unixtime':
            ut.datetime_to_posixtime(ut.date_to_datetime(day2, 1.0)),
        },
    )
    refiltered_sample = [
        ibs.filter_annots_general(annot.aids, filter_kw=filter_kw_)
        for annot in orig_annot_sample
    ]
    is_ok = np.array(ut.lmap(len, refiltered_sample)) >= 2
    ok_part_orig_sample = ut.compress(orig_annot_sample, is_ok)
    ok_part_orig_nids = [x.nids[0] for x in ok_part_orig_sample]

    # Now compute real sample
    aids = ibs.filter_annots_general(filter_kw=filter_kw_)
    all_annots = ibs.annots(aids)
    unique_nids, grouped_annots_ = all_annots.group(all_annots.nids)
    grouped_annots = grouped_annots_
    # Ensure we get everything
    # grouped_annots = [ibs.annots(aids_) for aids_ in ibs.get_name_aids(unique_nids)]

    pop = len(grouped_annots)
    pername_list = ut.lmap(len, grouped_annots)
    groups = wbia.annots.AnnotGroups(grouped_annots, ibs)
    match_tags = [ut.unique(ut.flatten(t)) for t in groups.match_tags]
    tag_case_hist = ut.dict_hist(ut.flatten(match_tags))
    print('name_pop = %r' % (pop, ))
    print('Annots per Multiton Name' +
          ut.repr3(ut.get_stats(pername_list, use_median=True)))
    print('Name Tag Hist ' + ut.repr3(tag_case_hist))
    print('Percent Photobomb: %.2f%%' %
          (tag_case_hist['photobomb'] / pop * 100))
    print('Percent Split: %.2f%%' % (tag_case_hist['splitcase'] / pop * 100))

    # Remove the ok part from this sample
    remain_unique_nids = ut.setdiff(unique_nids, ok_part_orig_nids)
    remain_grouped_annots = [
        ibs.annots(aids_) for aids_ in ibs.get_name_aids(remain_unique_nids)
    ]

    sample_size = 75
    import vtool as vt

    vt.calc_sample_from_error_bars(0.05, pop, conf_level=0.95, prior=0.05)

    remain_rand_idxs = ut.random_indexes(len(remain_grouped_annots),
                                         seed=3340258)
    remain_sample_size = sample_size - len(ok_part_orig_nids)
    remain_random_annot_groups = ut.take(remain_grouped_annots,
                                         remain_rand_idxs)
    remain_annot_sample = remain_random_annot_groups[:remain_sample_size]

    annot_sample_nofilter = ok_part_orig_sample + remain_annot_sample
    # Filter out all bad parts
    annot_sample_filter = [
        ibs.annots(ibs.filter_annots_general(annot.aids, filter_kw=filter_kw_))
        for annot in annot_sample_nofilter
    ]
    annot_sample = annot_sample_filter

    win = None
    from wbia.viz import viz_graph2

    for annots in ut.InteractiveIter(annot_sample):
        if win is not None:
            win.close()
        win = viz_graph2.make_qt_graph_interface(ibs,
                                                 aids=annots.aids,
                                                 init_mode='rereview')
        print(win)

    sample_groups = wbia.annots.AnnotGroups(annot_sample, ibs)

    flat_tags = [ut.unique(ut.flatten(t)) for t in sample_groups.match_tags]

    print('Using Split and Photobomb')
    is_positive = ['photobomb' in t or 'splitcase' in t for t in flat_tags]
    num_positive = sum(is_positive)
    vt.calc_error_bars_from_sample(sample_size,
                                   num_positive,
                                   pop,
                                   conf_level=0.95)

    print('Only Photobomb')
    is_positive = ['photobomb' in t for t in flat_tags]
    num_positive = sum(is_positive)
    vt.calc_error_bars_from_sample(sample_size,
                                   num_positive,
                                   pop,
                                   conf_level=0.95)

    print('Only SplitCase')
    is_positive = ['splitcase' in t for t in flat_tags]
    num_positive = sum(is_positive)
    vt.calc_error_bars_from_sample(sample_size,
                                   num_positive,
                                   pop,
                                   conf_level=0.95)
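
The sampling analysis above leans on vt.calc_error_bars_from_sample, whose internals are not shown here. As a rough stand-in only, a normal-approximation binomial interval with a finite-population correction might look like this (a guess at the computation, not vtool's actual implementation):

import math

def approx_error_bars(sample_size, num_positive, population, conf_level=0.95):
    # Normal-approximation confidence interval for a proportion,
    # with a finite-population correction.
    z = 1.96 if conf_level == 0.95 else 2.576  # rough z-scores
    p_hat = num_positive / sample_size
    se = math.sqrt(p_hat * (1 - p_hat) / sample_size)
    fpc = math.sqrt((population - sample_size) / (population - 1))
    margin = z * se * fpc
    return p_hat - margin, p_hat + margin

low, high = approx_error_bars(sample_size=75, num_positive=6, population=1000)
print('%.3f - %.3f' % (low, high))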
Пример #36
0
    def ext_hist(self):
        return ut.dict_hist(self.attrs['ext'])
Example #37
0
    if len(global_aid_list_) > MAX_AIDS:
        global_aid_list_ = global_aid_list_[:MAX_AIDS]

    if len(global_aid_list_) > 1:
        valid_aid = global_aid_list_[0]
        valid_aid_set.add(valid_aid)

    # test_aid = global_aid_list_[1]
    # test_aid_set.add(test_aid)

    aid_list += global_aid_list_
    count = len(global_aid_list_)
    count_list.append(count)
    if len(count_list) >= MAX_NAMES:
        break
print(ut.repr3(ut.dict_hist(count_list)))
print(ut.repr3(len(count_list)))

tips_list = depc.get('Notch_Tips', aid_list)
size_list = depc.get('chips', aid_list, ('width', 'height'))
config = {
    'dim_size': 1000,
    'resize_dim': 'width',
    'ext': '.jpg',
}
chip_list = depc.get('chips', aid_list, 'img', config=config, ensure=True)

color_list = [
    (255, 0, 0),
    (0, 0, 255),
    (0, 255, 0),
Example #38
0
def tag_hist(tags_list):
    import utool as ut
    return ut.dict_hist(ut.flatten(tags_list), ordered=True)
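
A dependency-free equivalent of tag_hist using collections.Counter; sorting by ascending count is an assumption about what ordered=True does in ut.dict_hist:

from collections import Counter, OrderedDict

def tag_hist_plain(tags_list):
    # Flatten the per-image tag lists and count occurrences.
    counts = Counter(tag for tags in tags_list for tag in tags)
    return OrderedDict(sorted(counts.items(), key=lambda kv: kv[1]))

print(tag_hist_plain([['view-left', 'scar'], ['view-left'], ['notch']]))
# e.g. OrderedDict([('scar', 1), ('notch', 1), ('view-left', 2)])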
Example #39
0
def get_toy_data_1vM(num_annots, num_names=None, **kwargs):
    r"""
    Args:
        num_annots (int):
        num_names (int): (default = None)

    Kwargs:
        initial_aids, initial_nids, nid_sequence, seed

    Returns:
        tuple: (pair_list, feat_list)

    CommandLine:
        python -m ibeis.algo.hots.demobayes --exec-get_toy_data_1vM --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.algo.hots.demobayes import *  # NOQA
        >>> num_annots = 1000
        >>> num_names = 40
        >>> get_toy_data_1vM(num_annots, num_names)
        >>> ut.quit_if_noshow()
        >>> import plottool as pt
        >>> ut.show_if_requested()
    """
    import vtool as vt
    tup_ = get_toy_annots(num_annots, num_names, **kwargs)
    aids, nids, aids1, nids1, all_aids, all_nids = tup_
    rng = vt.ensure_rng(None)

    # Test a simple SVM classifier
    nid2_nexemp = ut.dict_hist(nids1)
    aid2_nid = dict(zip(aids, nids))

    ut.fix_embed_globals()

    #def add_to_globals(globals_, subdict):
    #    globals_.update(subdict)

    unique_nids = list(nid2_nexemp.keys())

    def annot_to_class_feats2(aid, aid2_nid, top=None):
        pair_list = []
        score_list = []
        nexemplar_list = []
        for nid in unique_nids:
            label = (aid2_nid[aid] == nid)
            num_exemplars = nid2_nexemp.get(nid, 0)
            if num_exemplars == 0:
                continue
            params = toy_params[label]
            mu, sigma = ut.dict_take(params, ['mu', 'sigma'])
            score_ = rng.normal(mu, sigma, size=num_exemplars).max()
            score = np.clip(score_, 0, np.inf)
            pair_list.append((aid, nid))
            score_list.append(score)
            nexemplar_list.append(num_exemplars)
        rank_list = ut.argsort(score_list, reverse=True)
        feat_list = np.array([score_list, rank_list, nexemplar_list]).T
        sortx = np.argsort(rank_list)
        feat_list = feat_list.take(sortx, axis=0)
        pair_list = np.array(pair_list).take(sortx, axis=0)
        if top is not None:
            feat_list = feat_list[:top]
            pair_list = pair_list[0:top]
        return pair_list, feat_list

    toclass_features = [annot_to_class_feats2(aid, aid2_nid, top=5) for aid in aids]
    aidnid_pairs = np.vstack(ut.get_list_column(toclass_features, 0))
    feat_list = np.vstack(ut.get_list_column(toclass_features, 1))
    score_list = feat_list.T[0:1].T
    lbl_list = [aid2_nid[aid] == nid for aid, nid in aidnid_pairs]

    from sklearn import svm
    #clf1 = svm.LinearSVC()
    print('Learning classifiers')

    clf3 = svm.SVC()
    clf3.fit(feat_list, lbl_list)

    clf1 = svm.LinearSVC()
    clf1.fit(score_list, lbl_list)

    # Score new annots against the training database
    tup_ = get_toy_annots(num_annots * 2, num_names, initial_aids=all_aids, initial_nids=all_nids)
    aids, nids, aids1, nids1, all_aids, all_nids = tup_
    aid2_nid = dict(zip(aids, nids))
    toclass_features = [annot_to_class_feats2(aid, aid2_nid) for aid in aids]
    aidnid_pairs = np.vstack(ut.get_list_column(toclass_features, 0))
    feat_list = np.vstack(ut.get_list_column(toclass_features, 1))
    lbl_list = np.array([aid2_nid[aid] == nid for aid, nid in aidnid_pairs])

    print('Running tests')

    score_list = feat_list.T[0:1].T

    tp_feat_list = feat_list[lbl_list]
    tn_feat_list = feat_list[~lbl_list]
    tp_lbls = lbl_list[lbl_list]
    tn_lbls = lbl_list[~lbl_list]
    print('num tp: %d' % len(tp_lbls))
    print('num tn: %d' % len(tn_lbls))

    tp_score_list = score_list[lbl_list]
    tn_score_list = score_list[~lbl_list]

    print('tp_feat' + ut.repr3(ut.get_stats(tp_feat_list, axis=0), precision=2))
    print('tn_feat' + ut.repr3(ut.get_stats(tn_feat_list, axis=0), precision=2))

    print('tp_score' + ut.repr2(ut.get_stats(tp_score_list), precision=2))
    print('tn_score' + ut.repr2(ut.get_stats(tn_score_list), precision=2))

    tp_pred3 = clf3.predict(tp_feat_list)
    tn_pred3 = clf3.predict(tn_feat_list)
    print((tp_pred3.sum(), tp_pred3.shape))
    print((tn_pred3.sum(), tn_pred3.shape))

    tp_score3 = clf3.score(tp_feat_list, tp_lbls)
    tn_score3 = clf3.score(tn_feat_list, tn_lbls)

    tp_pred1 = clf1.predict(tp_score_list)
    tn_pred1 = clf1.predict(tn_score_list)
    print((tp_pred1.sum(), tp_pred1.shape))
    print((tn_pred1.sum(), tn_pred1.shape))

    tp_score1 = clf1.score(tp_score_list, tp_lbls)
    tn_score1 = clf1.score(tn_score_list, tn_lbls)
    print('tp score with rank    = %r' % (tp_score3,))
    print('tn score with rank    = %r' % (tn_score3,))

    print('tp score without rank = %r' % (tp_score1,))
    print('tn score without rank = %r' % (tn_score1,))
    toy_data = {}

    return toy_data
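
A short, self-contained sketch of the score-only versus score+rank comparison above, on synthetic random data rather than the toy annotations (scikit-learn and numpy are assumed to be installed):

import numpy as np
from sklearn import svm

rng = np.random.RandomState(0)
n = 200
# Synthetic (score, rank, num_exemplars) features; positives get higher scores.
labels = rng.rand(n) > 0.5
scores = rng.normal(loc=labels.astype(float), scale=0.7)
ranks = rng.randint(0, 5, size=n)
nexemp = rng.randint(1, 10, size=n)
feats = np.column_stack([scores, ranks, nexemp])

clf_full = svm.SVC().fit(feats, labels)
clf_score = svm.LinearSVC().fit(scores[:, None], labels)

print('score with rank    = %.3f' % clf_full.score(feats, labels))
print('score without rank = %.3f' % clf_score.score(scores[:, None], labels))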
Example #40
0
    def fix_duplicates(drive):
        r"""
        For every duplicate file passing a filter (e.g. avi), remove the
        copy that is in the smallest directory. On a tie, use the smallest
        dpath. This collects the duplicate files scattered across folders
        into a single folder.

        But we also need to look at the non-duplicates in that folder and
        decide if they should be moved as well. So this should trigger on
        folders that are at least 50% duplicates. We might not want to move
        curated folders.

        Example:
            cd ~/local/scripts
            >>> from register_files import *  # NOQA
            >>> dpaths = ut.get_argval('--drives', type_=list, default=['E:/'])#'D:/', 'E:/', 'F:/'])
            >>> drives = [Drive(root_dpath) for root_dpath in dpaths]
            >>> E = drive = drives[0]
            >>> #D, E, F = drives
        """
        print('Fixing Duplicates in %r' % (drive, ))
        list_ = drive.fpath_hashX_list
        multiindex_dict_ = build_multindex(list_)
        duplicate_hashes = [
            key for key, val in six.iteritems(multiindex_dict_) if len(val) > 1
        ]
        duplicate_idxs = ut.dict_take(multiindex_dict_, duplicate_hashes)
        unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
        # Check if any dups have been removed
        still_exists = ut.unflat_map(exists, unflat_fpaths)
        unflat_idxs2 = ut.zipcompress(duplicate_idxs, still_exists)
        duplicate_idxs = [idxs for idxs in unflat_idxs2 if len(idxs) > 1]
        # Look at duplicate files
        unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
        unflat_sizes = ut.list_unflat_take(drive.fpath_bytes_list,
                                           duplicate_idxs)
        # Find highly coupled directories
        if True:
            coupled_dirs = []
            for fpaths in unflat_fpaths:
                #basedir = ut.longest_existing_path(commonprefix(fpaths))
                dirs = sorted(list(map(dirname, fpaths)))
                _list = list(range(len(dirs)))
                idxs = ut.upper_diag_self_prodx(_list)
                coupled_dirs.extend(
                    list(map(tuple, ut.list_unflat_take(dirs, idxs))))
            hist_ = ut.dict_hist(coupled_dirs)
            coupled_idxs = ut.list_argsort(hist_.values())[::-1]
            most_coupled = ut.take(list(hist_.keys()), coupled_idxs[0:100])
            print('Coupled fpaths: ' + ut.repr2(most_coupled, nl=True))
        print('%d unique files are duplicated' % (len(unflat_sizes), ))
        #print('Duplicate sizes: ' + ut.repr2(unflat_sizes[0:10], nl=True))
        #print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths[0:10], nl=True))
        #print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths[0::5], nl=True))
        print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths, nl=True))
        # Find duplicate directories
        dpath_list = list(drive.dpath_to_fidx.keys())
        fidxs_list = ut.dict_take(drive.dpath_to_fidx, dpath_list)
        #exists_list = list(map(exists, drive.fpath_list))
        #unflat_exists = ut.list_unflat_take(exists_list, fidxs_list)
        fname_registry = [basename(fpath) for fpath in drive.fpath_list]
        unflat_fnames = ut.list_unflat_take(fname_registry, fidxs_list)

        def unsorted_list_hash(list_):
            return ut.hashstr27(str(sorted(list_)))

        unflat_fname_sets = list(
            map(unsorted_list_hash, ut.ProgIter(unflat_fnames, freq=10000)))
        fname_based_duplicate_dpaths = []
        multiindex_dict2_ = build_multindex(unflat_fname_sets)
        fname_based_duplicate_hashes = [
            key for key, val in multiindex_dict2_.items() if len(val) > 1
        ]
        print('#fname_based_duplicate_dpaths = %r' %
              (len(fname_based_duplicate_hashes), ))
        fname_based_duplicate_didxs = ut.dict_take(
            multiindex_dict2_, fname_based_duplicate_hashes)
        fname_based_duplicate_dpaths = ut.list_unflat_take(
            dpath_list, fname_based_duplicate_didxs)
        print(ut.repr3(fname_based_duplicate_dpaths[0:10]))
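One reading of the rule in the docstring above (drop the duplicate copies that live in the smallest directory, breaking ties on the smaller dpath) can be sketched in isolation. This is a hedged illustration, not the exact logic of fix_duplicates; dir_filecount is a hypothetical {dpath: file count} mapping:

from os.path import dirname

def pick_duplicate_to_keep(dup_fpaths, dir_filecount):
    # Keep the copy whose directory holds the most files; remove the rest.
    # Ties are broken by the lexicographically smaller dpath.
    def sortkey(fpath):
        dpath = dirname(fpath)
        return (-dir_filecount.get(dpath, 0), dpath)
    ranked = sorted(dup_fpaths, key=sortkey)
    return ranked[0], ranked[1:]

keep, remove = pick_duplicate_to_keep(
    ['E:/a/x.avi', 'E:/b/x.avi'],
    {'E:/a': 120, 'E:/b': 3},
)
# keep == 'E:/a/x.avi'; remove == ['E:/b/x.avi']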
Example #41
    def update_asset_symlinks(self, verbose=True):
        """
        Traverse the files in the _submission/ folder and add/update symlinks
        for any relevant files we identify

        Ref:
            https://pypi.org/project/python-magic/
            https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
            http://www.iana.org/assignments/media-types/media-types.xhtml
        """
        from app.modules.assets.models import Asset
        import utool as ut
        import magic

        submission_abspath = self.get_absolute_path()
        submission_path = os.path.join(submission_abspath, '_submission')
        assets_path = os.path.join(submission_abspath, '_assets')

        current_app.sub.ensure_initialed()

        # Walk the submission path, looking for white-listed MIME type files
        files = []
        skipped = []
        errors = []
        walk_list = sorted(list(os.walk(submission_path)))
        print('Walking submission...')
        for root, directories, filenames in tqdm.tqdm(walk_list):
            filenames = sorted(filenames)
            for filename in filenames:
                filepath = os.path.join(root, filename)

                # Normalize path (sanity check)
                filepath = os.path.normpath(filepath)

                # Sanity check, ensure that the path is formatted well
                assert os.path.exists(filepath)
                assert os.path.isabs(filepath)
                try:
                    basename = os.path.basename(filepath)
                    _, extension = os.path.splitext(basename)
                    extension = extension.lower()
                    extension = extension.strip('.')

                    if basename.startswith('.'):
                        # Skip hidden files
                        if basename not in ['.touch']:
                            skipped.append((filepath, basename))
                        continue

                    if os.path.isdir(filepath):
                        # Skip any directories (sanity check)
                        skipped.append((filepath, extension))
                        continue

                    if os.path.islink(filepath):
                        # Skip any symbolic links (sanity check)
                        skipped.append((filepath, extension))
                        continue

                    mime_type = magic.from_file(filepath, mime=True)
                    if mime_type not in current_app.sub.mime_type_whitelist:
                        # Skip any unsupported MIME types
                        skipped.append((filepath, extension))
                        continue

                    magic_signature = magic.from_file(filepath)
                    size_bytes = os.path.getsize(filepath)

                    file_data = {
                        'filepath': filepath,
                        'path': basename,
                        'extension': extension,
                        'mime_type': mime_type,
                        'magic_signature': magic_signature,
                        'size_bytes': size_bytes,
                        'submission_guid': self.guid,
                    }
                    files.append(file_data)
                except Exception:
                    logging.exception('Got exception in update_asset_symlinks')
                    errors.append(filepath)

        if verbose:
            print('Processed asset files from submission: %r' % (self, ))
            print('\tFiles   : %d' % (len(files), ))
            print('\tSkipped : %d' % (len(skipped), ))
            if len(skipped) > 0:
                skipped_ext_list = [skip[1] for skip in skipped]
                skipped_ext_str = ut.repr3(ut.dict_hist(skipped_ext_list))
                skipped_ext_str = skipped_ext_str.replace('\n', '\n\t\t')
                print('\t\t%s' % (skipped_ext_str, ))
            print('\tErrors  : %d' % (len(errors), ))

        # Compute the xxHash64 for all found files
        filepath_list = [file_data['filepath'] for file_data in files]
        arguments_list = list(zip(filepath_list))
        print('Computing filesystem xxHash64...')
        filesystem_xxhash64_list = parallel(compute_xxhash64_digest_filepath,
                                            arguments_list)
        filesystem_guid_list = list(
            map(ut.hashable_to_uuid, filesystem_xxhash64_list))

        # Update file_data with the filesystem and semantic hash information
        zipped = zip(files, filesystem_xxhash64_list, filesystem_guid_list)
        for file_data, filesystem_xxhash64, filesystem_guid in zipped:
            file_data['filesystem_xxhash64'] = filesystem_xxhash64
            file_data['filesystem_guid'] = filesystem_guid
            semantic_guid_data = (
                file_data['submission_guid'],
                file_data['filesystem_guid'],
            )
            file_data['semantic_guid'] = ut.hashable_to_uuid(
                semantic_guid_data)

        # Delete all existing symlinks
        existing_filepath_guid_mapping = {}
        existing_asset_symlinks = ut.glob(os.path.join(assets_path, '*'))
        for existing_asset_symlink in existing_asset_symlinks:
            basename = os.path.basename(existing_asset_symlink)
            if basename in ['.touch', 'derived']:
                continue
            existing_asset_target = os.readlink(existing_asset_symlink)
            existing_asset_target_ = os.path.abspath(
                os.path.join(assets_path, existing_asset_target))
            if os.path.exists(existing_asset_target_):
                uuid_str, _ = os.path.splitext(basename)
                uuid_str = uuid_str.strip().strip('.')
                try:
                    existing_filepath_guid_mapping[
                        existing_asset_target_] = uuid.UUID(uuid_str)
                except Exception:
                    pass
            os.remove(existing_asset_symlink)

        # Add new or update any existing Assets found in the Submission
        asset_submission_filepath_list = [
            file_data.pop('filepath', None) for file_data in files
        ]
        assets = []
        with db.session.begin():
            for file_data, asset_submission_filepath in zip(
                    files, asset_submission_filepath_list):
                semantic_guid = file_data.get('semantic_guid', None)
                asset = Asset.query.filter(
                    Asset.semantic_guid == semantic_guid).first()
                if asset is None:
                    # Check if we can recycle existing GUID from symlink
                    recycle_guid = existing_filepath_guid_mapping.get(
                        asset_submission_filepath, None)
                    if recycle_guid is not None:
                        file_data['guid'] = recycle_guid
                    # Create record if asset is new
                    asset = Asset(**file_data)
                    db.session.add(asset)
                else:
                    # Update record if Asset exists
                    for key in file_data:
                        if key in [
                                'submission_guid', 'filesystem_guid',
                                'semantic_guid'
                        ]:
                            continue
                        value = file_data[key]
                        setattr(asset, key, value)
                    db.session.merge(asset)
                assets.append(asset)

        # Update all symlinks for each Asset
        for asset, asset_submission_filepath in zip(
                assets, asset_submission_filepath_list):
            db.session.refresh(asset)
            asset.update_symlink(asset_submission_filepath)
            if verbose:
                print(asset_submission_filepath)
                print('\tAsset         : %s' % (asset, ))
                print('\tSemantic GUID : %s' % (asset.semantic_guid, ))
                print('\tExtension     : %s' % (asset.extension, ))
                print('\tMIME type     : %s' % (asset.mime_type, ))
                print('\tSignature     : %s' % (asset.magic_signature, ))
                print('\tSize bytes    : %s' % (asset.size_bytes, ))
                print('\tFS xxHash64   : %s' % (asset.filesystem_xxhash64, ))
                print('\tFS GUID       : %s' % (asset.filesystem_guid, ))

        # Get all historical and current Assets for this Submission
        db.session.refresh(self)

        # Delete any historical Assets that have been deleted from this commit
        deleted_assets = list(set(self.assets) - set(assets))
        if verbose:
            print('Deleting %d orphaned Assets' % (len(deleted_assets), ))
        with db.session.begin():
            for deleted_asset in deleted_assets:
                deleted_asset.delete()
        db.session.refresh(self)
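The MIME filtering step above relies on python-magic, whose magic.from_file(path, mime=True) call returns the detected MIME type. A minimal sketch of that whitelist check in isolation; the whitelist below is illustrative, not the application's actual mime_type_whitelist:

import os
import magic

MIME_TYPE_WHITELIST = {'image/jpeg', 'image/png', 'video/mp4'}  # illustrative only

def is_whitelisted_asset(filepath):
    # Mirror the sanity checks above: skip directories and symlinks,
    # then filter on the detected MIME type.
    if os.path.isdir(filepath) or os.path.islink(filepath):
        return False
    mime_type = magic.from_file(filepath, mime=True)
    return mime_type in MIME_TYPE_WHITELIST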
Example #42
def parse_shark_tags(orig_fname_list):
    import re

    invalid_tag_patterns = [
        re.escape('-'),
        re.escape('(') + '?\\d*' + re.escape(')') + '?',
        '\\d+-\\d+-\\d+', '\\d+,',
        '\\d+', 'vi*', 'i*v', 'i+',
        '\\d+th', '\\d+nd', '\\d+rd',
        'remant', 'timnfe', 't', 'e', 'sjl', 'disc', 'dec', 'road', 'easter',
        'western', 'west', 'tn',
        '\\d*ap',
        'whaleshark\\d*', 'shark\\d*', 'whale\\d*',
        'whalesharking', 'sharking', 'whalesharks', 'whales',
        'picture',
        'australien',
        'australia',
        'nick', 'tim\\d*',
        'imageset',
        'holiday', 'visit', 'tour', 'trip', 'pec', 'sv',
        'a', 'b',
        'gender', 'sex',
        'img', 'image', 'pic', 'pics', 'leith', 'trips', 'kings', 'photo', 'video', 'media',
        'fix', 'feeding',
        'nrd', 'nd', 'gen', 'wa', 'nmp', 'bo', 'kd', 'ow', 'ne', 'dsc', 'nwd',
        'mg', 'w', 'mai', 'blue', 'stumpy',
        'oea', 'cbe', 'edc', 'knrt',
        'tiws2',
        'ando', 'adv', 'str', 'adventure',
        'camera', 'tag', 'id',
        'of', 'and',
        'tagged', 'from',
        'day', '\\d*april', '\\d*may', '\\d*july', '\\d*june',
        'ningaloo', 'ningblue\\d*', 'kooling',
    ]

    valid_tag_level_set = [
        ['view-left', 'left', 'lhs', 'l', 'leftside'],
        ['view-right', 'right', 'rhs', 'r', 'rightside'],
        ['view-back', 'back'],
        ['view-top', 'top'],
        ['sex-male', 'male', 'm', 'sexm'],
        ['sex-female', 'female', 'f'],
        ['sex-unknown', 'unknown', 'u'],
        ['part-tail', 'tail'],
        ['part-flank', 'side', 'flank'],
        ['part-head', 'head'],
        ['part-pectoral', 'pectoral', 'pec'],
        ['part-dorsal', 'dorsal', 'dorsals'],
        ['part-claspers', 'claspers', 'clasper'],
        ['part-fin', 'fin'],
        ['cropped', 'crop'],
        ['scar', 'scar2'],
        ['notch'],
        ['small'],
        ['bite'],
        ['cam-slr2', 'slr2'],
        #['cam-5m', '5m']
        ['5m'],
        ['7m'],
        ['4m'],
        ['copy'],
        ['qual-resize'],
        ['qual-stretched'],
    ]

    def apply_enum_regex(pat_list):
        enum_endings = [
            '[a-g]',
            '\\d*',
            'i*',
        ]
        expanded_pats = ut.flatten([
            [pat + end for end in enum_endings]
            for pat  in pat_list
        ])
        return expanded_pats

    def apply_regex_endings(pat_list):
        return [p + '$' for p in pat_list]

    tag_alias_map = {}
    for level_set in valid_tag_level_set:
        main_key = level_set[0]
        for key in level_set:
            tag_alias_map[key] = main_key

    inverse_alias_map = {}
    for level_set in valid_tag_level_set:
        inverse_alias_map[level_set[0]] = level_set

    regex_alias_map = {
        'view-left': apply_regex_endings(apply_enum_regex(inverse_alias_map['view-left'])),
        'view-right': apply_regex_endings(apply_enum_regex(inverse_alias_map['view-right'])),
    }

    valid_tags = list(inverse_alias_map.keys())

    invalid_tag_patterns = apply_regex_endings(invalid_tag_patterns)

    def parse_all_fname_tags(fname):
        _tags = [splitext(fname)[0]]
        _tags = ut.flatten([t.split('_') for t in _tags])
        _tags = ut.flatten([t.split('.') for t in _tags])
        _tags = [t.lower() for t in _tags]
        _tags = [tag_alias_map.get(t, t) for t in _tags]
        for key, vals in regex_alias_map.items():
            pat = ut.regex_or(vals)
            _tags = [key if re.match(pat, t) else t for t in _tags]
        pat = ut.regex_or(invalid_tag_patterns)
        _tags = [t for t in _tags if not re.match(pat, t)]
        _tags = ut.unique_ordered(_tags)
        return _tags

    all_img_tag_list = list(map(parse_all_fname_tags, orig_fname_list))

    known_img_tag_list = [list(set(tags).intersection(set(valid_tags)))
                          for tags in all_img_tag_list]

    if False:
        # Help figure out which tags are important
        _parsed_tags = ut.flatten(all_img_tag_list)

        taghist = ut.dict_hist(_parsed_tags)
        taghist = {key: val for key, val in taghist.items() if val > 1}

        unknown_taghist = sorted([
            (val, key) for key, val in taghist.items()
            if key not in valid_tags
        ])[::-1]
        known_taghist = sorted([
            (val, key) for key, val in taghist.items()
            if key in valid_tags
        ])[::-1]

        print('Known')
        print(ut.list_str(known_taghist[0:100]))

        print('Unknown')
        print(ut.list_str(unknown_taghist[0:100]))

        print(ut.dict_str(
            ut.dict_hist(ut.flatten(known_img_tag_list)),
            key_order_metric='val'
        ))

    return known_img_tag_list
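A hedged usage sketch of the parser above on a made-up filename; it assumes the surrounding module already provides utool as ut and os.path.splitext, as the snippet does:

fnames = ['whaleshark_2009_left_tail_male.jpg']   # hypothetical filename
print(parse_shark_tags(fnames))
# Roughly [['view-left', 'part-tail', 'sex-male']]: 'left', 'tail', and 'male'
# are mapped through the alias table, while 'whaleshark' and '2009' are dropped
# by the invalid-tag patterns. Order may vary because a set intersection is used.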
Example #43
def detect_sharks(ibs, gids):
    # import wbia
    # ibs = wbia.opendb('WS_ALL')
    config = {
        'algo':
        'yolo',
        'sensitivity':
        0.2,
        'config_filepath':
        ut.truepath('~/work/WS_ALL/localizer_backup/detect.yolo.2.cfg'),
        'weight_filepath':
        ut.truepath(
            '~/work/WS_ALL/localizer_backup/detect.yolo.2.39000.weights'),
        'class_filepath':
        ut.truepath(
            '~/work/WS_ALL/localizer_backup/detect.yolo.2.cfg.classes'),
    }
    depc = ibs.depc_image

    # imgsets = ibs.imagesets(text='Injured Sharks')
    # images = ibs.images(imgsets.gids[0])
    images = ibs.images(gids)
    images = images.compress([ext not in ['.gif'] for ext in images.exts])
    gid_list = images.gids

    # result is a tuple:
    # (score, bbox_list, theta_list, conf_list, class_list)
    results_list = depc.get_property('localizations',
                                     gid_list,
                                     None,
                                     config=config)

    results_list2 = []
    multi_gids = []
    failed_gids = []

    for gid, res in zip(gid_list, results_list):
        score, bbox_list, theta_list, conf_list, class_list = res
        if len(bbox_list) == 0:
            failed_gids.append(gid)
        elif len(bbox_list) == 1:
            results_list2.append((gid, bbox_list, theta_list))
        elif len(bbox_list) > 1:
            multi_gids.append(gid)
            idx = conf_list.argmax()
            res2 = (gid, bbox_list[idx:idx + 1], theta_list[idx:idx + 1])
            results_list2.append(res2)

    # Now that the loop has populated them, tag images that failed or produced
    # multiple detections so they can be reviewed later.
    # ibs.set_image_imagesettext(failed_gids, ['Fixme'] * len(failed_gids))
    ibs.set_image_imagesettext(multi_gids, ['Fixme2'] * len(multi_gids))

    # Histogram of the number of detections per image (for inspection)
    ut.dict_hist([t[1].shape[0] for t in results_list])

    localized_imgs = ibs.images(ut.take_column(results_list2, 0))
    assert all([len(a) == 1 for a in localized_imgs.aids])
    old_annots = ibs.annots(ut.flatten(localized_imgs.aids))
    # old_tags = old_annots.case_tags

    # Override old bboxes
    import numpy as np

    bboxes = np.array(ut.take_column(results_list2, 1))[:, 0, :]
    ibs.set_annot_bboxes(old_annots.aids, bboxes)

    if False:
        import wbia.plottool as pt

        pt.qt4ensure()

        inter = pt.MultiImageInteraction(
            ibs.get_image_paths(ut.take_column(results_list2, 0)),
            bboxes_list=ut.take_column(results_list2, 1),
        )
        inter.dump_to_disk('shark_loc', num=50, prefix='shark_loc')
        inter.start()

        inter = pt.MultiImageInteraction(ibs.get_image_paths(failed_gids))
        inter.start()

        inter = pt.MultiImageInteraction(ibs.get_image_paths(multi_gids))
        inter.start()
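The multi-detection handling above keeps only the highest-confidence box when the localizer returns several for one image. A small self-contained sketch of that selection rule (made-up arrays, not real localizer output):

import numpy as np

def keep_best_detection(bbox_list, theta_list, conf_list):
    # None signals a failed detection; otherwise keep only the best box.
    if len(bbox_list) == 0:
        return None
    idx = int(np.asarray(conf_list).argmax())
    return bbox_list[idx:idx + 1], theta_list[idx:idx + 1]

bboxes = np.array([[10, 10, 50, 40], [12, 8, 48, 44]])
thetas = np.array([0.0, 0.0])
confs = np.array([0.35, 0.80])
print(keep_best_detection(bboxes, thetas, confs))  # keeps the 0.80 box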
Example #44
def ensure_tf(X):
    termfreq = ut.dict_hist(X.wx_list)
    # do what video google does
    termfreq = ut.map_dict_vals(lambda x: x / len(X.wx_list), termfreq)
    X.termfreq = termfreq
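The normalization in ensure_tf is plain term frequency: the histogram of word assignments divided by the number of assignments, so the values sum to 1. A tiny sketch using the same utool helpers (wx_list is hypothetical):

import utool as ut

wx_list = [3, 3, 7, 9, 9, 9]                      # hypothetical word assignments
termfreq = ut.dict_hist(wx_list)                  # {3: 2, 7: 1, 9: 3}
termfreq = ut.map_dict_vals(lambda x: x / len(wx_list), termfreq)
print(termfreq)                                   # {3: 0.33.., 7: 0.17.., 9: 0.5}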
Example #45
def limited_power_toughness_histogram():
    r"""
    CommandLine:
        python -m mtgmonte.stats --exec-limited_power_toughness_histogram --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from mtgmonte.stats import *  # NOQA
        >>> result = limited_power_toughness_histogram()
        >>> print(result)
        >>> ut.show_if_requested()
    """
    from mtgmonte import mtgobjs
    from mtglib.gatherer_request import SearchRequest
    from mtglib.card_extractor import CardExtractor

    # from mtglib.card_renderer import CardList
    request = SearchRequest({"set": "Oath of the Gatewatch"})

    def add_page(url, page):
        parts = url.split("/")
        part1 = "/".join(parts[:-1])
        part2 = "/Default.aspx?page=%d&" % (page,)
        part3 = parts[-1].replace("Default.aspx?", "")
        url2 = part1 + part2 + part3
        return url2

    card_list = []
    for page in range(0, 10):
        url = request.url
        url2 = add_page(url, page)
        extract = CardExtractor(url2)
        card_list0 = extract.cards

        for card in card_list0:
            card2 = mtgobjs.Card2()
            card2.__dict__.update(card.__dict__)
            card_list.append(card2)

        if len(card_list0) != 100:
            break

    for c in card_list:
        c.nice_attrs += ["rarity"]

    creats = [_card2 for _card2 in card_list if "Creature" in _card2.types]
    creats = [_card2 for _card2 in creats if _card2.rarity in ["Common", "Uncommon"]]

    powtough = []

    for c in creats:
        try:
            powtough.append((int(c.power), int(c.toughness)))
        except ValueError:
            pass

    import plottool as pt

    pt.ensure_pylab_qt4()
    import numpy as np

    scores_list = np.array(list(zip(*powtough)))
    xdata = np.arange(0, np.max(scores_list) + 1)
    powhist = np.histogram(scores_list[0], bins=xdata)[0]
    toughist = np.histogram(scores_list[1], bins=xdata)[0]
    pt.multi_plot(xdata, [powhist, toughist], label_list=["power", "toughness"], kind="bar")

    bothhist = ut.dict_hist(powtough)
    xdata = np.arange(len(bothhist))
    dat = sorted(bothhist.items())
    xticklabels = ut.take_column(dat, 0)
    ydata = ut.take_column(dat, 1)

    pt.multi_plot(xdata, [ydata], xticklabels=xticklabels, kind="bar")