示例#1
0
    def oracle_review(sim):
        queue_params = {
            'pos_diameter': None,
            'neg_diameter': None,
        }
        infr = sim.infr
        prev = infr.verbose
        infr.verbose = 0
        # rng = np.random.RandomState(0)
        infr = sim.infr
        primary_truth = sim.primary_truth
        review_edges = infr.generate_reviews(**queue_params)
        max_reviews = 1000
        for count, (aid1, aid2) in enumerate(ut.ProgIter(review_edges)):
            state = primary_truth.loc[(aid1, aid2)].idxmax()
            tags = []
            infr.add_feedback(aid1,
                              aid2,
                              state,
                              tags,
                              apply=True,
                              rectify=False,
                              user_id='oracle',
                              confidence='absolutely_sure')
            if count > max_reviews:
                break
        infr.verbose = prev

        sim.results['max_reviews'] = max_reviews

        n_clusters, n_inconsistent = infr.relabel_using_reviews(rectify=False)
        assert n_inconsistent == 0, 'should not create any inconsistencies'

        sim.results['n_user_clusters'] = n_clusters
        # infr.apply_review_inference()

        curr_decisions = infr.edge_attr_df('decision')
        curr_truth = primary_truth.loc[curr_decisions.index].idxmax(axis=1)
        n_user_mistakes = curr_decisions != curr_truth
        sim.results['n_user_mistakes'] = sum(n_user_mistakes)

        gt_clusters = ut.group_pairs(infr.gen_node_attrs('orig_name_label'))
        curr_clusters = ut.group_pairs(infr.gen_node_attrs('name_label'))

        compare_results = compare_groups(list(gt_clusters.values()),
                                         list(curr_clusters.values()))
        sim.results.update(ut.map_vals(len, compare_results))

        common_per_num = ut.group_items(compare_results['common'],
                                        map(len, compare_results['common']))
        sumafter = 3
        greater = [i for i in common_per_num.keys() if i > sumafter]
        common_per_num['>%s' % sumafter] = ut.flatten(
            ut.take(common_per_num, greater))
        ut.delete_keys(common_per_num, greater)
        for k, v in common_per_num.items():
            sim.results['common@' + str(k)] = len(v)

        sim.results['n_names_common'] = len(compare_results['common'])
示例#2
0
def apply_dummy_viewpoints(infr):
    transition_rate = 0.5
    transition_rate = 0
    valid_views = ['L', 'F', 'R', 'B']
    rng = np.random.RandomState(42)

    class MarkovView(object):
        def __init__(self):
            self.dir_ = +1
            self.state = 0

        def __call__(self):
            return self.next_state()

        def next_state(self):
            if self.dir_ == -1 and self.state <= 0:
                self.dir_ = +1
            if self.dir_ == +1 and self.state >= len(valid_views) - 1:
                self.dir_ = -1
            if rng.rand() < transition_rate:
                self.state += self.dir_
            return valid_views[self.state]

    mkv = MarkovView()
    nid_to_aids = ut.group_pairs([(n, d['name_label'])
                                  for n, d in infr.graph.nodes(data=True)])
    grouped_nodes = list(nid_to_aids.values())
    node_to_view = {node: mkv() for nodes in grouped_nodes for node in nodes}
    infr.set_node_attrs('viewpoint', node_to_view)
示例#3
0
    def check_baseline_results(sim):
        import networkx as nx
        infr = sim.infr
        n_names_possible = 0
        real_groups = ut.group_pairs(infr.gen_node_attrs('orig_name_label'))
        possible_clusters = []
        for nid, nodes in real_groups.items():
            if len(nodes) == 1:
                possible_clusters.append(nodes)
                n_names_possible += 1
                continue
            cc_cand_edges = list(ut.nx_edges_between(infr.graph, nodes))
            cc = ut.nx_from_node_edge(nodes, cc_cand_edges)
            mst = nx.minimum_spanning_tree(cc)
            ccs = list(nx.connected_components(mst))
            possible_clusters.extend(ccs)
            n_names_possible += (len(ccs))

        sumafter = 3

        best_possible_compare_results = compare_groups(
            list(real_groups.values()), list(possible_clusters))
        possible_per_num = ut.map_vals(
            len,
            ut.group_items(best_possible_compare_results['common'],
                           map(len, best_possible_compare_results['common'])))
        greater = [i for i in possible_per_num.keys() if i > sumafter]
        possible_per_num['>%s' % sumafter] = sum(
            ut.take(possible_per_num, greater))
        ut.delete_keys(possible_per_num, greater)
        for k, v in possible_per_num.items():
            sim.results['possible@' + str(k)] = v
        sim.results['possible'] = len(best_possible_compare_results['common'])

        # Measure the number of real names in the test (per number of annots)
        real_per_num = ut.dict_hist(map(len, real_groups.values()))
        greater = [i for i in real_per_num.keys() if i > sumafter]
        real_per_num['>%s' % sumafter] = sum(ut.take(real_per_num, greater))
        ut.delete_keys(real_per_num, greater)
        for k, v in real_per_num.items():
            sim.results['real@' + str(k)] = v

        sim.results['n_names_possible'] = n_names_possible
        sim.results['n_names_real'] = len(real_groups)
        sim.results['real'] = len(real_groups)
示例#4
0
    def update_visual_attrs(infr,
                            graph=None,
                            show_reviewed_edges=True,
                            show_unreviewed_edges=False,
                            show_inferred_diff=True,
                            show_inferred_same=True,
                            show_recent_review=False,
                            highlight_reviews=True,
                            show_inconsistency=True,
                            wavy=False,
                            simple_labels=False,
                            show_labels=True,
                            reposition=True,
                            use_image=False,
                            edge_overrides=None,
                            node_overrides=None,
                            colorby='name_label',
                            **kwargs
                            # hide_unreviewed_inferred=True
                            ):
        import wbia.plottool as pt

        infr.print('update_visual_attrs', 3)
        if graph is None:
            graph = infr.graph
        # if hide_cuts is not None:
        #     # show_unreviewed_cuts = not hide_cuts
        #     show_reviewed_cuts = not hide_cuts

        if not getattr(infr, '_viz_init_nodes', False):
            infr._viz_init_nodes = True
            nx.set_node_attributes(graph, name='shape', values='circle')
            # infr.set_node_attrs('shape', 'circle')

        if getattr(infr, '_viz_image_config_dirty', True):
            infr.update_node_image_attribute(graph=graph, use_image=use_image)

        def get_any(dict_, keys, default=None):
            for key in keys:
                if key in dict_:
                    return dict_[key]
            return default

        show_cand = get_any(
            kwargs, ['show_candidate_edges', 'show_candidates', 'show_cand'])
        if show_cand is not None:
            show_cand = True
            show_reviewed_edges = True
            show_unreviewed_edges = True
            show_inferred_diff = True
            show_inferred_same = True

        if kwargs.get('show_all'):
            show_cand = True

        # alpha_low = .5
        alpha_med = 0.9
        alpha_high = 1.0

        dark_background = graph.graph.get('dark_background', None)

        # Ensure we are starting from a clean slate
        # if reposition:
        ut.nx_delete_edge_attr(graph, infr.visual_edge_attrs_appearance)

        # Set annotation node labels
        node_to_nid = None
        if not show_labels:
            nx.set_node_attributes(graph,
                                   name='label',
                                   values=ut.dzip(graph.nodes(), ['']))
        else:
            if simple_labels:
                nx.set_node_attributes(
                    graph,
                    name='label',
                    values={n: str(n)
                            for n in graph.nodes()})
            else:
                if node_to_nid is None:
                    node_to_nid = nx.get_node_attributes(graph, 'name_label')
                node_to_view = nx.get_node_attributes(graph, 'viewpoint')
                if node_to_view:
                    annotnode_to_label = {
                        aid: 'aid=%r%s\nnid=%r' %
                        (aid, node_to_view[aid], node_to_nid[aid])
                        for aid in graph.nodes()
                    }
                else:
                    annotnode_to_label = {
                        aid: 'aid=%r\nnid=%r' % (aid, node_to_nid[aid])
                        for aid in graph.nodes()
                    }
                nx.set_node_attributes(graph,
                                       name='label',
                                       values=annotnode_to_label)

        # NODE_COLOR: based on name_label
        ut.color_nodes(graph,
                       labelattr=colorby,
                       outof=kwargs.get('outof', None),
                       sat_adjust=-0.4)

        # EDGES:
        # Grab different types of edges
        edges, edge_colors = infr.get_colored_edge_weights(
            graph, highlight_reviews)

        # reviewed_states = nx.get_edge_attributes(graph, 'evidence_decision')
        reviewed_states = {
            e: infr.edge_decision(e)
            for e in infr.graph.edges()
        }
        edge_to_inferred_state = nx.get_edge_attributes(
            graph, 'inferred_state')
        # dummy_edges = [edge for edge, flag in
        #                nx.get_edge_attributes(graph, '_dummy_edge').items()
        #                if flag]
        edge_to_reviewid = nx.get_edge_attributes(graph, 'review_id')
        recheck_edges = [
            edge for edge, split in nx.get_edge_attributes(
                graph, 'maybe_error').items() if split
        ]
        decision_to_edge = ut.group_pairs(reviewed_states.items())
        neg_edges = decision_to_edge[NEGTV]
        pos_edges = decision_to_edge[POSTV]
        incomp_edges = decision_to_edge[INCMP]
        unreviewed_edges = decision_to_edge[UNREV]

        inferred_same = [
            edge for edge, state in edge_to_inferred_state.items()
            if state == 'same'
        ]
        inferred_diff = [
            edge for edge, state in edge_to_inferred_state.items()
            if state == 'diff'
        ]
        inconsistent_external = [
            edge for edge, state in edge_to_inferred_state.items()
            if state == 'inconsistent_external'
        ]
        inferred_notcomp = [
            edge for edge, state in edge_to_inferred_state.items()
            if state == 'notcomp'
        ]

        reviewed_edges = incomp_edges + pos_edges + neg_edges
        compared_edges = pos_edges + neg_edges
        uncompared_edges = ut.setdiff(edges, compared_edges)
        nontrivial_inferred_same = ut.setdiff(
            inferred_same, pos_edges + neg_edges + incomp_edges)
        nontrivial_inferred_diff = ut.setdiff(
            inferred_diff, pos_edges + neg_edges + incomp_edges)
        nontrivial_inferred_edges = nontrivial_inferred_same + nontrivial_inferred_diff

        # EDGE_COLOR: based on edge_weight
        nx.set_edge_attributes(graph,
                               name='color',
                               values=ut.dzip(edges, edge_colors))

        # LINE_WIDTH: based on review_state
        # unreviewed_width = 2.0
        # reviewed_width = 5.0
        unreviewed_width = 1.0
        reviewed_width = 2.0
        if highlight_reviews:
            nx.set_edge_attributes(
                graph,
                name='linewidth',
                values=ut.dzip(reviewed_edges, [reviewed_width]),
            )
            nx.set_edge_attributes(
                graph,
                name='linewidth',
                values=ut.dzip(unreviewed_edges, [unreviewed_width]),
            )
        else:
            nx.set_edge_attributes(graph,
                                   name='linewidth',
                                   values=ut.dzip(edges, [unreviewed_width]))

        # EDGE_STROKE: based on decision and maybe_error
        # fg = pt.WHITE if dark_background else pt.BLACK
        # nx.set_edge_attributes(graph, name='stroke', values=ut.dzip(reviewed_edges, [{'linewidth': 3, 'foreground': fg}]))
        if show_inconsistency:
            nx.set_edge_attributes(
                graph,
                name='stroke',
                values=ut.dzip(recheck_edges, [{
                    'linewidth': 5,
                    'foreground': infr._error_color
                }]),
            )

        # Set linestyles to emphasize PCCs
        # Dash lines between PCCs inferred to be different
        nx.set_edge_attributes(graph,
                               name='linestyle',
                               values=ut.dzip(inferred_diff, ['dashed']))

        # Treat incomparable/incon-external inference as different
        nx.set_edge_attributes(graph,
                               name='linestyle',
                               values=ut.dzip(inferred_notcomp, ['dashed']))
        nx.set_edge_attributes(graph,
                               name='linestyle',
                               values=ut.dzip(inconsistent_external,
                                              ['dashed']))

        # Dot lines that we are unsure of
        nx.set_edge_attributes(graph,
                               name='linestyle',
                               values=ut.dzip(unreviewed_edges, ['dotted']))

        # Cut edges are implicit and dashed
        # nx.set_edge_attributes(graph, name='implicit', values=ut.dzip(cut_edges, [True]))
        # nx.set_edge_attributes(graph, name='linestyle', values=ut.dzip(cut_edges, ['dashed']))
        # nx.set_edge_attributes(graph, name='alpha', values=ut.dzip(cut_edges, [alpha_med]))

        nx.set_edge_attributes(graph,
                               name='implicit',
                               values=ut.dzip(uncompared_edges, [True]))

        # Only matching edges should impose constraints on the graph layout
        nx.set_edge_attributes(graph,
                               name='implicit',
                               values=ut.dzip(neg_edges, [True]))
        nx.set_edge_attributes(graph,
                               name='alpha',
                               values=ut.dzip(neg_edges, [alpha_med]))
        nx.set_edge_attributes(graph,
                               name='implicit',
                               values=ut.dzip(incomp_edges, [True]))
        nx.set_edge_attributes(graph,
                               name='alpha',
                               values=ut.dzip(incomp_edges, [alpha_med]))

        # Ensure reviewed edges are visible
        nx.set_edge_attributes(graph,
                               name='implicit',
                               values=ut.dzip(reviewed_edges, [False]))
        nx.set_edge_attributes(graph,
                               name='alpha',
                               values=ut.dzip(reviewed_edges, [alpha_high]))

        if True:
            # Infered same edges can be allowed to constrain in order
            # to make things look nice sometimes
            nx.set_edge_attributes(graph,
                                   name='implicit',
                                   values=ut.dzip(inferred_same, [False]))
            nx.set_edge_attributes(graph,
                                   name='alpha',
                                   values=ut.dzip(inferred_same, [alpha_high]))

        if not kwargs.get('show_same', True):
            nx.set_edge_attributes(graph,
                                   name='alpha',
                                   values=ut.dzip(inferred_same, [0]))

        if not kwargs.get('show_diff', True):
            nx.set_edge_attributes(graph,
                                   name='alpha',
                                   values=ut.dzip(inferred_diff, [0]))

        if not kwargs.get('show_positive_edges', True):
            nx.set_edge_attributes(graph,
                                   name='alpha',
                                   values=ut.dzip(pos_edges, [0]))

        if not kwargs.get('show_negative_edges', True):
            nx.set_edge_attributes(graph,
                                   name='alpha',
                                   values=ut.dzip(neg_edges, [0]))

        if not kwargs.get('show_incomparable_edges', True):
            nx.set_edge_attributes(graph,
                                   name='alpha',
                                   values=ut.dzip(incomp_edges, [0]))

        if not kwargs.get('show_between', True):
            if node_to_nid is None:
                node_to_nid = nx.get_node_attributes(graph, 'name_label')
            between_edges = [(u, v) for u, v in edges
                             if node_to_nid[u] != node_to_nid[v]]
            nx.set_edge_attributes(graph,
                                   name='alpha',
                                   values=ut.dzip(between_edges, [0]))

        # SKETCH: based on inferred_edges
        # Make inferred edges wavy
        if wavy:
            # dict(scale=3.0, length=18.0, randomness=None)]
            nx.set_edge_attributes(
                graph,
                name='sketch',
                values=ut.dzip(
                    nontrivial_inferred_edges,
                    [dict(scale=10.0, length=64.0, randomness=None)],
                ),
            )

        # Make dummy edges more transparent
        # nx.set_edge_attributes(graph, name='alpha', values=ut.dzip(dummy_edges, [alpha_low]))
        selected_edges = kwargs.pop('selected_edges', None)

        # SHADOW: based on most recent
        # Increase visibility of nodes with the most recently changed timestamp
        if show_recent_review and edge_to_reviewid and selected_edges is None:
            review_ids = list(edge_to_reviewid.values())
            recent_idxs = ut.argmax(review_ids, multi=True)
            recent_edges = ut.take(list(edge_to_reviewid.keys()), recent_idxs)
            selected_edges = recent_edges

        if selected_edges is not None:
            # TODO: add photoshop-like parameters like
            # spread and size. offset is the same as angle and distance.
            nx.set_edge_attributes(
                graph,
                name='shadow',
                values=ut.dzip(
                    selected_edges,
                    [{
                        'rho': 0.3,
                        'alpha': 0.6,
                        'shadow_color': 'w' if dark_background else 'k',
                        'offset': (0, 0),
                        'scale': 3.0,
                    }],
                ),
            )

        # Z_ORDER: make sure nodes are on top
        nodes = list(graph.nodes())
        nx.set_node_attributes(graph,
                               name='zorder',
                               values=ut.dzip(nodes, [10]))
        nx.set_edge_attributes(graph,
                               name='zorder',
                               values=ut.dzip(edges, [0]))
        nx.set_edge_attributes(graph,
                               name='picker',
                               values=ut.dzip(edges, [10]))

        # VISIBILITY: Set visibility of edges based on arguments
        if not show_reviewed_edges:
            infr.print('Making reviewed edges invisible', 10)
            nx.set_edge_attributes(graph,
                                   name='style',
                                   values=ut.dzip(reviewed_edges, ['invis']))

        if not show_unreviewed_edges:
            infr.print('Making un-reviewed edges invisible', 10)
            nx.set_edge_attributes(graph,
                                   name='style',
                                   values=ut.dzip(unreviewed_edges, ['invis']))

        if not show_inferred_same:
            infr.print('Making nontrivial_same edges invisible', 10)
            nx.set_edge_attributes(graph,
                                   name='style',
                                   values=ut.dzip(nontrivial_inferred_same,
                                                  ['invis']))

        if not show_inferred_diff:
            infr.print('Making nontrivial_diff edges invisible', 10)
            nx.set_edge_attributes(graph,
                                   name='style',
                                   values=ut.dzip(nontrivial_inferred_diff,
                                                  ['invis']))

        if selected_edges is not None:
            # Always show the most recent review (remove setting of invis)
            # infr.print('recent_edges = %r' % (recent_edges,))
            nx.set_edge_attributes(graph,
                                   name='style',
                                   values=ut.dzip(selected_edges, ['']))

        if reposition:
            # LAYOUT: update the positioning layout
            def get_layoutkw(key, default):
                return kwargs.get(key, graph.graph.get(key, default))

            layoutkw = dict(
                prog='neato',
                splines=get_layoutkw('splines', 'line'),
                fontsize=get_layoutkw('fontsize', None),
                fontname=get_layoutkw('fontname', None),
                sep=10 / 72,
                esep=1 / 72,
                nodesep=0.1,
            )
            layoutkw.update(kwargs)
            # logger.info(ut.repr3(graph.edges))
            pt.nx_agraph_layout(graph, inplace=True, **layoutkw)

        if edge_overrides:
            for key, edge_to_attr in edge_overrides.items():
                nx.set_edge_attributes(graph, name=key, values=edge_to_attr)
        if node_overrides:
            for key, node_to_attr in node_overrides.items():
                nx.set_node_attributes(graph, name=key, values=node_to_attr)
def clean_tags():
    zotero = get_libzotero()
    # dict of all zotero items
    # items = zotero.index
    # get sql cursor
    cur = zotero.cur
    if False:
        sorted(ut.util_sqlite.get_tablenames(cur))
        ut.print_database_structure(cur)
        # Debug info about tags table in sql

        # The `tags` table stores all tags
        # The itemTags table stores the association between items and tags
        ut.get_table_columninfo_list(cur, 'fields')
        # ut.get_table_columninfo_list(cur, 'relations')
        ut.get_table_columninfo_list(cur, 'fieldsCombined')

        ut.get_table_columninfo_list(cur, 'itemData')
        ut.get_table_columninfo_list(cur, 'itemDataValues')

        ut.get_table_columninfo_list(cur, 'tags')
        ut.get_table_columninfo_list(cur, 'itemTags')

    import pandas as pd
    pd.options.display.max_colwidth = 40
    pd.options.display.max_rows = 20
    def pandas_sql(table, columns):
        return pd.DataFrame(ut.get_table_rows(cur, table, columns),
                            columns=columns)

    item_df = pandas_sql('items', ('itemID', 'itemTypeID', 'libraryID', 'key')).set_index('itemID', drop=False)
    tags_df = pandas_sql('tags', ('tagID', 'name', 'type', 'libraryID', 'key')).set_index('tagID', drop=False)
    itemData_df = pandas_sql('itemData', ('itemID', 'fieldID', 'valueID'))

    itemTag_df = pandas_sql('itemTags', ('itemID', 'tagID'))

    itemDataValues_df = pandas_sql('itemDataValues', ('valueID', 'value')).set_index('valueID')
    field_df = pandas_sql('fields', ('fieldID', 'fieldName', 'fieldFormatID')).set_index('fieldID')

    itemData_df['value'] = itemDataValues_df['value'].loc[itemData_df['valueID'].values].values
    itemData_df['fieldName'] = field_df['fieldName'].loc[itemData_df['fieldID'].values].values

    titles = itemData_df[itemData_df['fieldName'] == 'title']
    assert len(ut.unique(ut.map_vals(len, titles.groupby('itemID').indices).values())) == 1

    # itemTag_df.groupby('itemID').count()
    # Find how often each tag is used
    tagid_to_count = itemTag_df.groupby('tagID').count()
    tagid_to_count = tagid_to_count.rename(columns={'itemID': 'nItems'})
    tagid_to_count['name'] = tags_df.loc[tagid_to_count.index]['name']
    tagid_to_count = tagid_to_count.sort_values('nItems')

    bad_tags = tagid_to_count[tagid_to_count['nItems'] == 1]

    tagid_to_count['tag_ncharsize'] = tagid_to_count['name'].apply(len)
    tagid_to_count = tagid_to_count.sort_values('tag_ncharsize')
    bad_tags = tagid_to_count[tagid_to_count['tag_ncharsize'] > 25]['name'].values.tolist()

    def clean_tags2():
        api_key = 'fBDBqRPwW9O3mYyNLiksBKZy'
        base_url = 'https://api.zotero.org'
        library_id = '1279414'
        library_type = 'user'
        from pyzotero import zotero
        zot = zotero.Zotero(library_id, library_type, api_key)

        for chunk in ut.ProgChunks(bad_tags, 50):
            zot.delete_tags(*chunk)

    if False:
        api_key = 'fBDBqRPwW9O3mYyNLiksBKZy'
        base_url = 'https://api.zotero.org'
        user_id = '1279414'
        userOrGroupPrefix = '/users/' + user_id
        params = {'v': 3, 'key': api_key}

        items_resp = requests.get(base_url + userOrGroupPrefix + '/items', params=params)
        print(items_resp.content)
        print(items_resp)

        json_tags = []
        get_url = base_url + userOrGroupPrefix + '/tags'
        while True:
            print('get_url = %r' % (get_url,))
            tag_resp = requests.get(get_url, params=params)
            if tag_resp.status_code != 200:
                break
            json_tags.extend(tag_resp.json())
            if 'next' in tag_resp.links:
                get_url = tag_resp.links['next']['url']
            else:
                break

        version_to_tags = ut.ddict(list)
        bad_tags = []
        for tag in ut.ProgIter(json_tags, label='parsing tags'):
            # x = requests.get(tag['links']['self']['href'], params=params)
            if tag['meta']['numItems'] == 1:
                import urllib2
                try:
                    bad_tags.append(urllib2.quote(tag['tag']))
                except Exception as ex:
                    print('cant encode tag=%r' % (tag,))
                    pass

        for chunk in ut.ProgIter(ut.ichunks(bad_tags, 50), length=len(bad_tags) / 50):
            search_url = base_url + userOrGroupPrefix + '/items?tag=' + ' || '.join(chunk)
            r = requests.get(search_url, params=params)
            matching_items = r.json()
            # assert len(matching_items) == 1
            for item in matching_items:
                version = item['version']
            version_to_tags[item['version']].append(tag['tag'])

        # DELETE MULTIPLE TAGS
        import requests
        for chunk in ut.ichunks(bad_tags['name'], 50):
            import urllib2
            encoded_chunk = []
            for t in chunk:
                try:
                    encoded_chunk.append(urllib2.quote(t))
                except Exception:
                    print(t)
            suffix = ' || '.join(encoded_chunk)
            delete_url = base_url + userOrGroupPrefix + '/tags?' + suffix
            print('delete_url = %r' % (delete_url,))
            resp = requests.delete(delete_url, params=params)

        bad_tags = tagid_to_count[tagid_to_count['nItems'] == 1]
        bad_tags['tagID'] = bad_tags.index
        for tagid in bad_tags:
            delete from itemTags where tagID in (select tagID from tags where type=1);
        pass
        for name in k['name'].values.tolist()
    item_df['title'] = titles.set_index('itemID')['value']
    for idx, item in zotero.index.items():
        sql_title = item_df.loc[item.id]['title']
        if item.title != sql_title:
            if pd.isnull(sql_title) and item.title is not None:
                print(item.__dict__)
                print(item_df.loc[item.id])
                print('item.title = %r' % (item.title,))
                print('sql_title = %r' % (sql_title,))
                assert False

    duplicate_tags = [
        (name, idxs) for name, idxs in tags_df.groupby('name', sort=True).indices.items() if len(idxs) > 2
    ]
    tagname_to_tagid = tags_df.groupby('name', sort=True).first()
    new_to_oldtags = {}
    # Determine which tagi to use for each name
    for tagname, idxs in duplicate_tags:
        tags_subdf = tags_df.iloc[idxs]
        mapping = itemTag_df[itemTag_df['tagID'].isin(tags_subdf['tagID'])]
        tag_hist = mapping.groupby('tagID').count()
        best_tagid = tag_hist['itemID'].idxmax()

        new_to_oldtags[best_tagid] = set(tag_hist['itemID'].values) - {best_tagid}

        tagname_to_tagid.loc[tagname] = tags_df.loc[best_tagid]
        # for col in tagname_to_tagid.columns:
        #     tagname_to_tagid.loc[tagname][col] = tags_df.loc[best_tagid][col]
        # tags_df.loc[best_tagid]

    if False:
        # Update tagIds
        for newid, oldids in new_to_oldtags.items():
            for oldid in oldids:
                # cur.execute('SELECT itemID, tagID FROM itemTags WHERE tagID=?', (oldid,))
                import sqlite3
                try:
                    cmd = 'UPDATE itemTags SET tagID=? WHERE tagID=?'
                    args = (newid, oldid)
                    print('(%s) args = %r' % (cmd, args,))
                    cur.execute(cmd, args)
                    print(cur.fetchall())
                except sqlite3.IntegrityError:
                    print('error')
                    pass

    # tags_df.groupby('name', sort=True)

    # itemTag_df.groupby('itemID')
    # duptags = tags_df.iloc[tags_df.groupby('name', sort=True).indices['animals']]
    # duptags['tagID']
    # flags = itemTag_df['tagID'].isin(duptags['tagID'])
    # dup_rel = itemTag_df[flags]
    # item_df['title'].loc[dup_rel['itemID']].values
    # tags_df.iloc[tags_df.groupby('name', sort=True).indices['animals']]

    # tags_df[tags_df['type'] == 1]
    # tags_df[tags_df['type'] == 0]
    # tags_df['libraryID'].unique()
    # tags_df['type'].unique()

    '''
    SELECT
    SELECT FROM itemTags WHERE name in (animals)
    '''

    item_tag_pairs = ut.get_table_rows(cur, 'itemTags', ('itemID', 'tagID'))
    # Group tags by item
    itemid_to_tagids = ut.group_pairs(item_tag_pairs)
    # Group items by tags
    tagid_to_itemids = ut.group_pairs(map(tuple, map(reversed, item_tag_pairs)))

    # mapping from tagid to name
    tagid_to_name = dict(ut.get_table_rows(cur, 'tags', ('tagID', 'name')))

    tagid_freq = list(ut.sort_dict(ut.map_vals(len, tagid_to_itemids), 'vals').items())
    ut.sort_dict(ut.map_vals(sum, ut.group_pairs([(freq, tagid_to_name.get(tagid, tagid)) for tagid, freq in tagid_freq])), 'vals')
    tagname_freq = ut.map_keys(lambda k: tagid_to_name.get(k, k), tagid_freq)