def owner_cluster(con, cur, nitem=None, reverse=True, nshingle=2, store=True, **kwargs):
    """Cluster owner names and collect the matched owner-id pairs.

    Reads ``(ownerid, name)`` rows from the ``owner`` table, adds every name
    to an ``sh.Cluster`` and takes the cluster's ``unions`` as the matched
    id pairs.

    Args:
        con: Database connection; only ``commit()`` is called on it, and
            only when ``store`` is truthy.
        cur: Cursor used for the select and, when storing, for the
            drop/create/insert statements.
        nitem: When truthy, appended to the query as a ``limit`` clause.
        reverse: When truthy, rows are read ``order by rowid desc``.
        nshingle: Second argument passed to ``sh.shingle``.
        store: When truthy, the ``pair`` table is dropped, recreated and
            filled, the connection is committed and ``None`` is returned.
        **kwargs: Forwarded unchanged to ``sh.Cluster``.

    Returns:
        ``None`` when ``store`` is truthy; otherwise ``(ipairs, npairs)``,
        where ``ipairs`` is the cluster's ``unions`` and ``npairs`` is a
        list of the corresponding ``(name1, name2)`` tuples.
    """
    print('generating hashes and pairs')
    cluster = sh.Cluster(**kwargs)

    # Assemble the select statement from its optional clauses.
    clauses = ['select ownerid,name from owner']
    if reverse:
        clauses.append(' order by rowid desc')
    if nitem:
        clauses.append(' limit %i' % nitem)
    query = ''.join(clauses)

    owner_names = {}
    for count, (owner_id, owner_name) in enumerate(cur.execute(query)):
        tokens = owner_name.split()
        # presumably character n-grams of the name -- confirm against sh.shingle
        shingles = list(sh.shingle(owner_name, nshingle))
        # Each feature group gets its own linear ramp from 1.0 down to 0.0,
        # so earlier shingles/words weigh more than later ones.
        ramp = [
            *np.linspace(1.0, 0.0, len(shingles)),
            *np.linspace(1.0, 0.0, len(tokens)),
        ]
        cluster.add(shingles + tokens, weights=ramp, label=owner_id)
        owner_names[owner_id] = owner_name
        if count % 10000 == 0:
            print(count)

    ipairs = cluster.unions
    npairs = [(owner_names[left], owner_names[right]) for left, right in ipairs]
    print('Found %i pairs' % len(ipairs))

    if not store:
        return (ipairs, npairs)

    # Rebuild the pair table from scratch on every stored run.
    cur.execute('drop table if exists pair')
    cur.execute(
        'create table pair (ownerid1 int, ownerid2 int, name1 text, name2 text)'
    )
    rows = [
        (id1, id2, name1, name2)
        for (id1, id2), (name1, name2) in zip(ipairs, npairs)
    ]
    cur.executemany('insert into pair values (?,?,?,?)', rows)
    con.commit()
def filter_pairs(con, nshingle=2, k=8, thresh=4):
    """Feed every row of the ``name`` table into an ``sh.Cluster``.

    Loads ``id`` and ``name`` from the ``name`` table with pandas, then adds
    each name to the cluster using its shingles plus whitespace-split words
    as features.

    Args:
        con: Database connection handed to ``pd.read_sql``.
        nshingle: Second argument passed to ``sh.shingle``.
        k: Passed to ``sh.Cluster`` as ``k``.
        thresh: Passed to ``sh.Cluster`` as ``thresh``.

    NOTE(review): as visible here the function returns nothing and never
    reads the id-to-name map after the loop; the remaining logic may live
    outside this view -- confirm before relying on this block alone.
    """
    print('filtering pairs')
    cluster = sh.Cluster(k=k, thresh=thresh)
    names_by_id = {}

    frame = pd.read_sql('select id,name from name', con)
    for row_idx, name_id, text in frame.itertuples():
        tokens = text.split()
        # presumably character n-grams of the name -- confirm against sh.shingle
        shingles = list(sh.shingle(text, nshingle))
        # Each feature group gets its own linear ramp from 1.0 down to 0.0.
        ramp = [
            *np.linspace(1.0, 0.0, len(shingles)),
            *np.linspace(1.0, 0.0, len(tokens)),
        ]
        cluster.add(shingles + tokens, weights=ramp, label=name_id)
        names_by_id[name_id] = text

        # Progress report every 100,000 rows, skipping the very first row.
        if row_idx <= 0 or row_idx % 100_000 != 0:
            continue
        print(f'{row_idx}: {len(cluster.unions)}')