Exemplo n.º 1
0
def owner_cluster(con,
                  cur,
                  nitem=None,
                  reverse=True,
                  nshingle=2,
                  store=True,
                  **kwargs):
    """Cluster the names in the ``owner`` table and collect the linked pairs.

    Every ``(ownerid, name)`` row is added to an ``sh.Cluster`` as a weighted
    feature list: the output of ``sh.shingle(name, nshingle)`` followed by the
    whitespace-separated words of the name. Inside each of those two groups
    the weights fall linearly from 1.0 to 0.0.

    Args:
        con: Database connection; only used to commit when ``store`` is true.
        cur: Cursor used to read ``owner`` and, when storing, to write ``pair``.
        nitem: Maximum number of rows to read. Any falsy value (``None`` or
            ``0``) means no limit is applied.
        reverse: When true, rows are read in descending ``rowid`` order.
        nshingle: Second argument handed to ``sh.shingle``.
        store: When true, the ``pair`` table is dropped, recreated and filled,
            and the function returns ``None``. When false, nothing is written.
        **kwargs: Forwarded unchanged to ``sh.Cluster``.

    Returns:
        ``None`` when ``store`` is true. Otherwise ``(ipairs, npairs)``, where
        ``ipairs`` is the cluster's ``unions`` attribute (iterated here as
        2-tuples of owner ids) and ``npairs`` holds the corresponding names.
    """
    print('generating hashes and pairs')

    cluster = sh.Cluster(**kwargs)

    # Assemble the SELECT statement from its optional clauses.
    query_parts = ['select ownerid,name from owner']
    if reverse:
        query_parts.append(' order by rowid desc')
    if nitem:
        query_parts.append(' limit %i' % nitem)
    query = ''.join(query_parts)

    owner_names = {}
    for row_index, (ownerid, name) in enumerate(cur.execute(query)):
        tokens = name.split()
        shingles = list(sh.shingle(name, nshingle))

        # Shingles and words each get their own 1.0 -> 0.0 ramp, so the first
        # element of either group carries the largest weight.
        weights = [
            *np.linspace(1.0, 0.0, len(shingles)),
            *np.linspace(1.0, 0.0, len(tokens)),
        ]
        cluster.add(shingles + tokens, weights=weights, label=ownerid)
        owner_names[ownerid] = name

        # Progress indicator every 10000 rows (including row 0).
        if row_index % 10000 == 0:
            print(row_index)

    ipairs = cluster.unions
    npairs = [(owner_names[first], owner_names[second])
              for first, second in ipairs]
    print('Found %i pairs' % len(ipairs))

    if not store:
        return (ipairs, npairs)

    # Rebuild the pair table from scratch with ids and names side by side.
    cur.execute('drop table if exists pair')
    cur.execute(
        'create table pair (ownerid1 int, ownerid2 int, name1 text, name2 text)'
    )
    rows = [(id1, id2, name1, name2)
            for (id1, id2), (name1, name2) in zip(ipairs, npairs)]
    cur.executemany('insert into pair values (?,?,?,?)', rows)
    con.commit()
Exemplo n.º 2
0
def filter_pairs(con, nshingle=2, k=8, thresh=4):
    """Load every row of the ``name`` table into an ``sh.Cluster``.

    Each name is added as a weighted feature list: the output of
    ``sh.shingle(name, nshingle)`` followed by the whitespace-separated words
    of the name, with weights falling linearly from 1.0 to 0.0 inside each of
    the two groups.

    Args:
        con: Database connection handed to ``pd.read_sql``.
        nshingle: Second argument passed to ``sh.shingle``.
        k: Forwarded to ``sh.Cluster`` as ``k``.
        thresh: Forwarded to ``sh.Cluster`` as ``thresh``.

    NOTE(review): the code visible here only builds the cluster and
    ``name_dict``; no filtering step or return value is shown. The function
    may continue beyond this excerpt -- confirm against the full file.
    """
    print('filtering pairs')

    c = sh.Cluster(k=k, thresh=thresh)
    # Maps each id to its name; not read anywhere in the visible code.
    name_dict = {}

    names = pd.read_sql('select id,name from name', con)
    # itertuples() yields (index, id, name), so ``i`` is the DataFrame index.
    # Note that the loop variable ``id`` shadows the builtin of the same name.
    for i, id, name in names.itertuples():
        words = name.split()
        shings = list(sh.shingle(name, nshingle))

        # Shingles first, then whole words; each group gets its own
        # 1.0 -> 0.0 weight ramp.
        features = shings + words
        weights = list(np.linspace(1.0, 0.0, len(shings))) + list(
            np.linspace(1.0, 0.0, len(words)))

        c.add(features, weights=weights, label=id)
        name_dict[id] = name

        # Progress report every 100,000 rows (skipping row 0), showing how
        # many unions the cluster holds so far.
        if i > 0 and i % 100_000 == 0:
            print(f'{i}: {len(c.unions)}')