def isect_info(self, other):
    set1 = set(self.rel_fpath_list)
    set2 = set(other.rel_fpath_list)
    set_comparisons = ut.odict([
        ('s1', set1),
        ('s2', set2),
        ('union', set1.union(set2)),
        ('isect', set1.intersection(set2)),
        ('s1 - s2', set1.difference(set2)),
        ('s2 - s1', set2.difference(set1)),
    ])
    stat_stats = ut.map_vals(len, set_comparisons)
    print(ut.repr4(stat_stats))
    return set_comparisons

    if False:
        # Unreachable debug code: compare uuids across the union of paths
        idx_lookup1 = ut.make_index_lookup(self.rel_fpath_list)
        idx_lookup2 = ut.make_index_lookup(other.rel_fpath_list)
        uuids1 = ut.take(self.uuids, ut.take(idx_lookup1, set_comparisons['union']))
        uuids2 = ut.take(other.uuids, ut.take(idx_lookup2, set_comparisons['union']))
        uuids1 == uuids2
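# Illustrative sketch (not part of the original module): the same set-comparison
# report on plain path lists using only builtin sets; the utool helpers above
# (ut.odict, ut.map_vals, ut.repr4) are assumed to behave like an OrderedDict,
# a dict-value map, and a pretty-printer respectively.
def _isect_info_sketch(paths1, paths2):
    s1, s2 = set(paths1), set(paths2)
    comparisons = {
        's1': s1, 's2': s2,
        'union': s1 | s2, 'isect': s1 & s2,
        's1 - s2': s1 - s2, 's2 - s1': s2 - s1,
    }
    # print only the sizes, mirroring the stat_stats report above
    print({key: len(val) for key, val in comparisons.items()})
    return comparisons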
def get_annot_viewpoint_stats(ibs, aid_list):
    annots = ibs.annots(aid_list)
    viewcode2_nAnnots = ut.order_dict_by(
        ut.map_vals(len, annots.group_items(annots.viewpoint_code)),
        list(ibs.const.VIEW.CODE_TO_INT.keys()) + [None],
    )
    return viewcode2_nAnnots
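# Illustrative sketch (not part of the original module): the same
# count-then-order pattern using only the standard library, assuming a plain
# list of viewpoint codes instead of an ibs/annots object.
def _viewpoint_hist_sketch(viewpoint_codes, canonical_order):
    from collections import Counter
    hist = Counter(viewpoint_codes)
    # canonical codes first (with zero counts), then any unexpected codes
    ordered = {code: hist.get(code, 0) for code in canonical_order}
    ordered.update({c: n for c, n in hist.items() if c not in ordered})
    return ordered

# _viewpoint_hist_sketch(['left', 'left', None], ['left', 'right', None])
# -> {'left': 2, 'right': 0, None: 1}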
def get_annot_qual_stats(ibs, aid_list):
    annots = ibs.annots(aid_list)
    qualtext2_nAnnots = ut.order_dict_by(
        ut.map_vals(len, annots.group_items(annots.quality_texts)),
        list(ibs.const.QUALITY_TEXT_TO_INT.keys()),
    )
    return qualtext2_nAnnots
def oracle_review(sim):
    queue_params = {
        'pos_diameter': None,
        'neg_diameter': None,
    }
    infr = sim.infr
    prev = infr.verbose
    infr.verbose = 0
    # rng = np.random.RandomState(0)
    primary_truth = sim.primary_truth
    review_edges = infr.generate_reviews(**queue_params)
    max_reviews = 1000
    for count, (aid1, aid2) in enumerate(ut.ProgIter(review_edges)):
        # The oracle reviews each edge with its dominant ground-truth state
        state = primary_truth.loc[(aid1, aid2)].idxmax()
        tags = []
        infr.add_feedback(aid1, aid2, state, tags, apply=True,
                          rectify=False, user_id='oracle',
                          confidence='absolutely_sure')
        if count > max_reviews:
            break
    infr.verbose = prev

    sim.results['max_reviews'] = max_reviews

    n_clusters, n_inconsistent = infr.relabel_using_reviews(rectify=False)
    assert n_inconsistent == 0, 'should not create any inconsistencies'
    sim.results['n_user_clusters'] = n_clusters
    # infr.apply_review_inference()

    curr_decisions = infr.edge_attr_df('decision')
    curr_truth = primary_truth.loc[curr_decisions.index].idxmax(axis=1)
    n_user_mistakes = curr_decisions != curr_truth
    sim.results['n_user_mistakes'] = sum(n_user_mistakes)

    gt_clusters = ut.group_pairs(infr.gen_node_attrs('orig_name_label'))
    curr_clusters = ut.group_pairs(infr.gen_node_attrs('name_label'))

    compare_results = compare_groups(list(gt_clusters.values()),
                                     list(curr_clusters.values()))
    sim.results.update(ut.map_vals(len, compare_results))

    common_per_num = ut.group_items(compare_results['common'],
                                    map(len, compare_results['common']))
    sumafter = 3
    greater = [i for i in common_per_num.keys() if i > sumafter]
    common_per_num['>%s' % sumafter] = ut.flatten(
        ut.take(common_per_num, greater))
    ut.delete_keys(common_per_num, greater)
    for k, v in common_per_num.items():
        sim.results['common@' + str(k)] = len(v)

    sim.results['n_names_common'] = len(compare_results['common'])
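# Illustrative sketch (not part of the original module): how the oracle above
# picks a ground-truth state, i.e. by taking the argmax column of a truth
# table; the column names and pair index here are made-up placeholders.
import pandas as pd

_truth_sketch = pd.DataFrame(
    {'match': [0.9, 0.1], 'nomatch': [0.1, 0.8], 'notcomp': [0.0, 0.1]},
    index=pd.MultiIndex.from_tuples([(1, 2), (1, 3)]),
)
# _truth_sketch.loc[(1, 2)].idxmax() -> 'match'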
def check_baseline_results(sim):
    import networkx as nx
    infr = sim.infr
    n_names_possible = 0
    real_groups = ut.group_pairs(infr.gen_node_attrs('orig_name_label'))
    possible_clusters = []
    for nid, nodes in real_groups.items():
        if len(nodes) == 1:
            possible_clusters.append(nodes)
            n_names_possible += 1
            continue
        cc_cand_edges = list(ut.nx_edges_between(infr.graph, nodes))
        cc = ut.nx_from_node_edge(nodes, cc_cand_edges)
        mst = nx.minimum_spanning_tree(cc)
        ccs = list(nx.connected_components(mst))
        possible_clusters.extend(ccs)
        n_names_possible += len(ccs)

    sumafter = 3

    best_possible_compare_results = compare_groups(
        list(real_groups.values()), list(possible_clusters))
    possible_per_num = ut.map_vals(
        len, ut.group_items(best_possible_compare_results['common'],
                            map(len, best_possible_compare_results['common'])))
    greater = [i for i in possible_per_num.keys() if i > sumafter]
    possible_per_num['>%s' % sumafter] = sum(
        ut.take(possible_per_num, greater))
    ut.delete_keys(possible_per_num, greater)
    for k, v in possible_per_num.items():
        sim.results['possible@' + str(k)] = v
    sim.results['possible'] = len(best_possible_compare_results['common'])

    # Measure the number of real names in the test (per number of annots)
    real_per_num = ut.dict_hist(map(len, real_groups.values()))
    greater = [i for i in real_per_num.keys() if i > sumafter]
    real_per_num['>%s' % sumafter] = sum(ut.take(real_per_num, greater))
    ut.delete_keys(real_per_num, greater)
    for k, v in real_per_num.items():
        sim.results['real@' + str(k)] = v

    sim.results['n_names_possible'] = n_names_possible
    sim.results['n_names_real'] = len(real_groups)
    sim.results['real'] = len(real_groups)
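# Illustrative sketch (not part of the original module): the baseline idea in
# check_baseline_results, namely restricting a ground-truth group to the
# candidate edges present in the graph, taking a minimum spanning forest, and
# listing its connected components (networkx only; the toy edges are made up).
import networkx as nx

def _best_possible_pieces_sketch(graph, nodes):
    subgraph = graph.subgraph(nodes)
    forest = nx.minimum_spanning_tree(subgraph)
    return list(nx.connected_components(forest))

# A group split across two disconnected candidate components yields two pieces:
# g = nx.Graph([(1, 2), (3, 4)])
# _best_possible_pieces_sketch(g, [1, 2, 3, 4]) -> [{1, 2}, {3, 4}]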
def __init__(verif, infr):
    verif.rng = np.random.RandomState(4033913)
    verif.dummy_params = {
        NEGTV: {'mean': 0.2, 'std': 0.25},
        POSTV: {'mean': 0.85, 'std': 0.2},
        INCMP: {'mean': 0.15, 'std': 0.1},
    }
    verif.score_dist = randn
    verif.infr = infr
    verif.orig_nodes = set(infr.aids)
    verif.orig_labels = infr.get_node_attrs('orig_name_label')
    verif.orig_groups = ut.invert_dict(verif.orig_labels, False)
    verif.orig_groups = ut.map_vals(set, verif.orig_groups)
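# Illustrative sketch (not part of the original module): drawing a dummy
# verifier score from per-state Gaussian parameters like dummy_params above;
# the state names and numbers are placeholders, and scores are clipped to
# [0, 1] so they behave like probabilities.
import numpy as np

def _dummy_score_sketch(state, rng, params=None):
    if params is None:
        params = {'positive': {'mean': 0.85, 'std': 0.2},
                  'negative': {'mean': 0.2, 'std': 0.25}}
    p = params[state]
    return float(np.clip(rng.randn() * p['std'] + p['mean'], 0, 1))

# _dummy_score_sketch('positive', np.random.RandomState(0)) -> a score near 0.85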
def nbytes_info(X):
    size_info = ut.map_vals(ut.get_object_nbytes, X.__dict__)
    return size_info
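# Illustrative sketch (not part of the original module): a rough per-attribute
# size report with sys.getsizeof standing in for ut.get_object_nbytes (which is
# assumed to recurse into containers; getsizeof does not).
import sys

def _nbytes_info_sketch(obj):
    return {name: sys.getsizeof(value) for name, value in vars(obj).items()}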
def clean_tags():
    zotero = get_libzotero()
    # dict of all zotero items
    # items = zotero.index
    # get sql cursor
    cur = zotero.cur

    if False:
        sorted(ut.util_sqlite.get_tablenames(cur))
        ut.print_database_structure(cur)
        # Debug info about tags table in sql
        # The `tags` table stores all tags
        # The itemTags table stores the association between items and tags
        ut.get_table_columninfo_list(cur, 'fields')
        # ut.get_table_columninfo_list(cur, 'relations')
        ut.get_table_columninfo_list(cur, 'fieldsCombined')
        ut.get_table_columninfo_list(cur, 'itemData')
        ut.get_table_columninfo_list(cur, 'itemDataValues')
        ut.get_table_columninfo_list(cur, 'tags')
        ut.get_table_columninfo_list(cur, 'itemTags')

    import pandas as pd
    pd.options.display.max_colwidth = 40
    pd.options.display.max_rows = 20

    def pandas_sql(table, columns):
        return pd.DataFrame(ut.get_table_rows(cur, table, columns),
                            columns=columns)

    item_df = pandas_sql('items', ('itemID', 'itemTypeID', 'libraryID', 'key')).set_index('itemID', drop=False)
    tags_df = pandas_sql('tags', ('tagID', 'name', 'type', 'libraryID', 'key')).set_index('tagID', drop=False)

    itemData_df = pandas_sql('itemData', ('itemID', 'fieldID', 'valueID'))
    itemTag_df = pandas_sql('itemTags', ('itemID', 'tagID'))

    itemDataValues_df = pandas_sql('itemDataValues', ('valueID', 'value')).set_index('valueID')
    field_df = pandas_sql('fields', ('fieldID', 'fieldName', 'fieldFormatID')).set_index('fieldID')

    itemData_df['value'] = itemDataValues_df['value'].loc[itemData_df['valueID'].values].values
    itemData_df['fieldName'] = field_df['fieldName'].loc[itemData_df['fieldID'].values].values

    titles = itemData_df[itemData_df['fieldName'] == 'title']
    assert len(ut.unique(ut.map_vals(len, titles.groupby('itemID').indices).values())) == 1

    # itemTag_df.groupby('itemID').count()

    # Find how often each tag is used
    tagid_to_count = itemTag_df.groupby('tagID').count()
    tagid_to_count = tagid_to_count.rename(columns={'itemID': 'nItems'})
    tagid_to_count['name'] = tags_df.loc[tagid_to_count.index]['name']
    tagid_to_count = tagid_to_count.sort_values('nItems')
    bad_tags = tagid_to_count[tagid_to_count['nItems'] == 1]

    tagid_to_count['tag_ncharsize'] = tagid_to_count['name'].apply(len)
    tagid_to_count = tagid_to_count.sort_values('tag_ncharsize')
    bad_tags = tagid_to_count[tagid_to_count['tag_ncharsize'] > 25]['name'].values.tolist()

    def clean_tags2():
        api_key = 'fBDBqRPwW9O3mYyNLiksBKZy'
        base_url = 'https://api.zotero.org'
        library_id = '1279414'
        library_type = 'user'
        from pyzotero import zotero
        zot = zotero.Zotero(library_id, library_type, api_key)
        for chunk in ut.ProgChunks(bad_tags, 50):
            zot.delete_tags(*chunk)

    if False:
        # Raw Web-API variant of the cleanup
        import requests
        api_key = 'fBDBqRPwW9O3mYyNLiksBKZy'
        base_url = 'https://api.zotero.org'
        user_id = '1279414'
        userOrGroupPrefix = '/users/' + user_id
        params = {'v': 3, 'key': api_key}
        items_resp = requests.get(base_url + userOrGroupPrefix + '/items', params=params)
        print(items_resp.content)
        print(items_resp)

        json_tags = []
        get_url = base_url + userOrGroupPrefix + '/tags'
        while True:
            print('get_url = %r' % (get_url,))
            tag_resp = requests.get(get_url, params=params)
            if tag_resp.status_code != 200:
                break
            json_tags.extend(tag_resp.json())
            if 'next' in tag_resp.links:
                get_url = tag_resp.links['next']['url']
            else:
                break

        version_to_tags = ut.ddict(list)
        bad_tags = []
        for tag in ut.ProgIter(json_tags, label='parsing tags'):
            # x = requests.get(tag['links']['self']['href'], params=params)
            if tag['meta']['numItems'] == 1:
                import urllib2
                try:
                    bad_tags.append(urllib2.quote(tag['tag']))
                except Exception as ex:
                    print('cant encode tag=%r' % (tag,))

        for chunk in ut.ProgIter(ut.ichunks(bad_tags, 50),
                                 length=len(bad_tags) / 50):
            search_url = (base_url + userOrGroupPrefix + '/items?tag=' +
                          ' || '.join(chunk))
            r = requests.get(search_url, params=params)
            matching_items = r.json()
            # assert len(matching_items) == 1
            for item in matching_items:
                version = item['version']
                version_to_tags[item['version']].append(tag['tag'])

        # DELETE MULTIPLE TAGS
        for chunk in ut.ichunks(bad_tags['name'], 50):
            import urllib2
            encoded_chunk = []
            for t in chunk:
                try:
                    encoded_chunk.append(urllib2.quote(t))
                except Exception:
                    print(t)
            suffix = ' || '.join(encoded_chunk)
            delete_url = base_url + userOrGroupPrefix + '/tags?' + suffix
            print('delete_url = %r' % (delete_url,))
            resp = requests.delete(delete_url, params=params)

        bad_tags = tagid_to_count[tagid_to_count['nItems'] == 1]
        bad_tags['tagID'] = bad_tags.index
        for tagid in bad_tags:
            # Was an inline SQL note in the original, roughly:
            # DELETE FROM itemTags WHERE tagID in
            #     (SELECT tagID FROM tags WHERE type=1);
            pass
        # for name in k['name'].values.tolist()  # incomplete fragment in original

    item_df['title'] = titles.set_index('itemID')['value']
    for idx, item in zotero.index.items():
        sql_title = item_df.loc[item.id]['title']
        if item.title != sql_title:
            if pd.isnull(sql_title) and item.title is not None:
                print(item.__dict__)
                print(item_df.loc[item.id])
                print('item.title = %r' % (item.title,))
                print('sql_title = %r' % (sql_title,))
                assert False

    duplicate_tags = [
        (name, idxs)
        for name, idxs in tags_df.groupby('name', sort=True).indices.items()
        if len(idxs) > 2
    ]
    tagname_to_tagid = tags_df.groupby('name', sort=True).first()
    new_to_oldtags = {}
    # Determine which tagid to use for each name
    for tagname, idxs in duplicate_tags:
        tags_subdf = tags_df.iloc[idxs]
        mapping = itemTag_df[itemTag_df['tagID'].isin(tags_subdf['tagID'])]
        tag_hist = mapping.groupby('tagID').count()
        best_tagid = tag_hist['itemID'].idxmax()
        new_to_oldtags[best_tagid] = set(tag_hist['itemID'].values) - {best_tagid}
        tagname_to_tagid.loc[tagname] = tags_df.loc[best_tagid]
        # for col in tagname_to_tagid.columns:
        #     tagname_to_tagid.loc[tagname][col] = tags_df.loc[best_tagid][col]
        # tags_df.loc[best_tagid]

    if False:
        # Update tagIds
        import sqlite3
        for newid, oldids in new_to_oldtags.items():
            for oldid in oldids:
                # cur.execute('SELECT itemID, tagID FROM itemTags WHERE tagID=?', (oldid,))
                try:
                    cmd = 'UPDATE itemTags SET tagID=? WHERE tagID=?'
                    args = (newid, oldid)
                    print('(%s) args = %r' % (cmd, args,))
                    cur.execute(cmd, args)
                    print(cur.fetchall())
                except sqlite3.IntegrityError:
                    print('error')

    # tags_df.groupby('name', sort=True)
    # itemTag_df.groupby('itemID')
    # duptags = tags_df.iloc[tags_df.groupby('name', sort=True).indices['animals']]
    # duptags['tagID']
    # flags = itemTag_df['tagID'].isin(duptags['tagID'])
    # dup_rel = itemTag_df[flags]
    # item_df['title'].loc[dup_rel['itemID']].values
    # tags_df.iloc[tags_df.groupby('name', sort=True).indices['animals']]
    # tags_df[tags_df['type'] == 1]
    # tags_df[tags_df['type'] == 0]
    # tags_df['libraryID'].unique()
    # tags_df['type'].unique()

    '''
    SELECT SELECT FROM itemTags WHERE name in (animals)
    '''

    item_tag_pairs = ut.get_table_rows(cur, 'itemTags', ('itemID', 'tagID'))
    # Group tags by item
    itemid_to_tagids = ut.group_pairs(item_tag_pairs)
    # Group items by tags
    tagid_to_itemids = ut.group_pairs(map(tuple, map(reversed, item_tag_pairs)))
    # mapping from tagid to name
    tagid_to_name = dict(ut.get_table_rows(cur, 'tags', ('tagID', 'name')))
    tagid_freq = list(ut.sort_dict(ut.map_vals(len, tagid_to_itemids), 'vals').items())
    ut.sort_dict(ut.map_vals(sum, ut.group_pairs([
        (freq, tagid_to_name.get(tagid, tagid))
        for tagid, freq in tagid_freq
    ])), 'vals')
    tagname_freq = ut.map_keys(lambda k: tagid_to_name.get(k, k), tagid_freq)
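# Illustrative sketch (not part of the original module): building a
# tag-frequency table from (itemID, tagID) pairs and sorting it by count with
# the standard library; ut.map_vals and ut.sort_dict(..., 'vals') are assumed
# to behave like the dict comprehension and sorted() call below.
from collections import defaultdict

def _tag_freq_sketch(item_tag_pairs):
    tagid_to_itemids = defaultdict(list)
    for item_id, tag_id in item_tag_pairs:
        tagid_to_itemids[tag_id].append(item_id)
    tag_freq = {tag_id: len(items) for tag_id, items in tagid_to_itemids.items()}
    return dict(sorted(tag_freq.items(), key=lambda kv: kv[1]))

# _tag_freq_sketch([(1, 'a'), (1, 'b'), (2, 'a')]) -> {'b': 1, 'a': 2}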
ignore = {
    'conference': ['eventtitle', 'doi', 'urldate', 'location', 'volume'],
    'journal': ['doi', 'urldate', 'issue', 'number', 'volume'],
    'book': ['urldate'],
    'thesis': ['urldate'],
    'online': ['type'],
    'report': ['urldate'],
}
for v in ignore.values():
    v.append('eprinttype')
    v.append('eprint')

print('Entry type freq:')
print(ut.map_vals(len, entrytypes))

for e, g in entrytypes.items():
    print('\n --- TYPE = %r' % (e.upper(),))
    # Drop columns that are entirely null for this entry type
    g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
    missing_cols = g.columns[np.any(pd.isnull(g), axis=0)]
    if e in ignore:
        missing_cols = missing_cols.difference(ignore[e])
    print('missing_cols = {!r}'.format(missing_cols.tolist()))
    for col in missing_cols:
        print('col = {!r}'.format(col))
        print(g[pd.isnull(g[col])].index.tolist())

for e, g in entrytypes.items():
    print('e = %r' % (e,))
    g = g[g.columns[~np.all(pd.isnull(g), axis=0)]]
    print('g = {!r}'.format(g))
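# Illustrative sketch (not part of the original module): the per-entry-type
# missing-column report above on a toy DataFrame; the column and type names
# here are made up.
import pandas as pd

_toy = pd.DataFrame({'type': ['journal', 'journal', 'book'],
                     'doi': ['10.1/x', None, None],
                     'title': ['a', 'b', 'c']})
for _e, _g in _toy.groupby('type'):
    _missing = _g.columns[_g.isnull().any(axis=0)]
    print(_e, _missing.tolist())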