Пример #1
0
def find_components(con, cur, thresh=0.85, store=True):
    """Group owners into connected components of near-identical names.

    Every row of the ``pair`` table is read as ``(owner1, owner2, name1,
    name2)``. Both names are standardized with ``name_standardize_strong``
    (once per owner id), scored with a normalized Levenshtein similarity,
    and the two owners are linked when the score is strictly greater than
    ``thresh``. Connected components of the resulting graph are ordered
    largest first. Owners that are never linked to anyone do not appear in
    any component, because the graph is built from the linked pairs only.

    Args:
        con: Database connection; only ``commit()`` is called on it.
        cur: Cursor used to read ``pair`` and, when storing, to write
            ``component``.
        thresh: Similarity in [0, 1] that a pair must exceed to be linked.
        store: If true, drop and recreate the ``component`` table
            ``(compid, ownerid)``, fill it and commit; nothing is returned.
            If false, the database is left untouched.

    Returns:
        ``None`` when ``store`` is true; otherwise a list of components,
        each a list of the standardized names of its owners.
    """
    print('finding firm components')

    cmd = 'select * from pair'

    def dmetr(name1, name2):
        """Return the similarity of two names: 1.0 identical, 0.0 no match."""
        maxlen = max(len(name1), len(name2))
        if maxlen == 0:
            # Two empty names carry no evidence of a match.
            return 0.0
        # The edit-distance search is capped at the largest distance that
        # could still reach the threshold.
        ldist = levenshtein(name1,
                            name2,
                            max_dist=int(ceil(maxlen * (1.0 - thresh))))
        if ldist == -1:
            # -1 is treated as "no match" (presumably levenshtein's signal
            # that max_dist was exceeded -- confirm against its definition).
            return 0.0
        return 1.0 - float(ldist) / maxlen

    # Standardized name per owner id, so each owner is processed only once.
    name_std = {}

    def standardized(owner, raw_name):
        if owner not in name_std:
            name_std[owner] = name_standardize_strong(raw_name)
        return name_std[owner]

    # Only the linked pairs are kept; per-pair scores are not needed later.
    close = []
    for (o1, o2, n1, n2) in cur.execute(cmd):
        n1s = standardized(o1, n1)
        n2s = standardized(o2, n2)
        if dmetr(n1s, n2s) > thresh:
            close.append((o1, o2))

    G = nx.Graph()
    G.add_edges_from(close)
    comps = sorted(nx.connected_components(G), key=len, reverse=True)

    if store:
        cur.execute('drop table if exists component')
        cur.execute('create table component (compid int, ownerid int)')
        # One (component index, owner id) row per member, streamed lazily.
        cur.executemany(
            'insert into component values (?,?)',
            chain.from_iterable(
                zip(repeat(cid), comp) for (cid, comp) in enumerate(comps)))
        con.commit()
    else:
        return [[name_std[owner] for owner in comp] for comp in comps]
Пример #2
0
def find_components(con,cur,thresh=0.85,store=False):
    """Cluster owners whose standardized names are nearly identical.

    Rows of the ``pair`` table are consumed as ``(owner1, owner2, name1,
    name2)``. Names are standardized with ``name_standardize_strong`` (the
    result is cached per owner id) and compared with a normalized
    Levenshtein similarity; owners whose similarity is strictly above
    ``thresh`` are joined by an edge. The connected components of that graph
    are sorted by size, largest first. An owner with no edge is not part of
    any component.

    Args:
        con: Database connection; used only for ``commit()``.
        cur: Cursor for reading ``pair`` and, when storing, writing
            ``component``.
        thresh: Similarity in [0, 1] a pair has to exceed to be joined.
        store: If true, recreate and fill the ``component`` table
            ``(compid, ownerid)`` and commit, returning nothing. If false,
            return the components without touching the database.

    Returns:
        ``None`` when ``store`` is true, else a list of lists holding the
        standardized names of each component's owners.
    """
    cmd = 'select * from pair'

    def dmetr(name1,name2):
        """Normalized similarity: 1.0 for equal names, 0.0 for no match."""
        maxlen = max(len(name1),len(name2))
        if maxlen == 0:
            # Nothing to compare, so never report a match.
            return 0.0
        # Cap the search at the largest distance that can still pass thresh.
        ldist = levenshtein(name1,name2,max_dist=int(ceil(maxlen*(1.0-thresh))))
        # -1 counts as "no match" (presumably means the cap was exceeded --
        # confirm against levenshtein's definition).
        return 0.0 if ldist == -1 else 1.0 - float(ldist)/maxlen

    # owner id -> standardized name; also tells us which owners were seen.
    name_std = {}
    close = []

    for (o1,o2,n1,n2) in cur.execute(cmd):
        if o1 not in name_std:
            name_std[o1] = name_standardize_strong(n1)
        if o2 not in name_std:
            name_std[o2] = name_standardize_strong(n2)

        # Only pairs above the threshold are kept; scores are not reused.
        if dmetr(name_std[o1],name_std[o2]) > thresh:
            close.append((o1,o2))

    G = nx.Graph()
    G.add_edges_from(close)
    comps = sorted(nx.connected_components(G),key=len,reverse=True)

    if store:
        cur.execute('drop table if exists component')
        cur.execute('create table component (compid int, ownerid int)')
        # Lazily yields one (component index, owner id) row per member.
        rows = chain.from_iterable(zip(repeat(cid),comp) for (cid,comp) in enumerate(comps))
        cur.executemany('insert into component values (?,?)',rows)
        con.commit()
    else:
        comp_names = [[name_std[owner] for owner in owners] for owners in comps]
        return comp_names
Пример #3
0
# Open the database given on the command line.
con = sqlite3.connect(args.db)
cur = con.cursor()

# Rebuild the output table from scratch on every run.
cur.execute('drop table if exists assign_use')
cur.execute(
    'create table assign_use (assignid integer primary key, patnum int, execdate text, recdate text, conveyance text, assignor text, assignee text, assignee_state text, assignee_country text)'
)
chunker = ChunkInserter(con, table='assign_use')

match_num = 0  # rows whose two party names overlap by more than half
rnum = 0  # rows processed
for row in cur.execute('select * from assign'):
    # NOTE(review): assign_use lists assignor before assignee; confirm that
    # columns 5 and 6 of `assign` are bound to the right names here.
    assignee = row[5]
    assignor = row[6]

    assignor_toks = name_standardize_strong(assignor)
    assignee_toks = name_standardize_strong(assignee)

    # Share of assignor tokens also found in the assignee, relative to the
    # mean length of the two names (floored at 1 to avoid dividing by zero).
    shared = sum(1 for tok in assignor_toks if tok in assignee_toks)
    mean_len = 0.5 * (len(assignor_toks) + len(assignee_toks))
    word_match = shared / max(1.0, mean_len)
    match = word_match > 0.5

    # NOTE(review): every row is inserted regardless of `match`; the flag
    # only feeds the counter below -- confirm that no filtering is intended.
    chunker.insert(*row)

    rnum += 1
    if match:
        match_num += 1
Пример #4
0
    # create table
    cur_ins.execute('create table assignment_use (assignid integer primary key, patnum int, execdate text, recdate text, conveyance text, assignor text, assignee text, assignee_state text, assignee_country text)')
    cmd_ins = 'insert into assignment_use values (?,?,?,?,?,?,?,?,?)'

# batch insertion
# Rows are accumulated in `assignments`. batch_size is presumably the flush
# threshold; the flush itself is not in the visible part of this script.
batch_size = 10000
assignments = []

# rlim looks like a row cap defaulting to "no limit"; it is not referenced in
# the visible code -- TODO confirm how it is used further down.
rlim = sys.maxsize
match_num = 0
rnum = 0
# rowid is selected first, so every column of `assignment` is shifted by one.
for row in cur.execute('select rowid,* from assignment'):
    # NOTE(review): if `assignment` has the columns of assignment_use minus
    # assignid, row[5] is the assignor and row[6] the assignee, i.e. these
    # two names are swapped -- confirm against the table definition.
    (assignee,assignor) = (row[5],row[6])

    assignor_toks = name_standardize_strong(assignor)
    assignee_toks = name_standardize_strong(assignee)

    # Count the assignor tokens that also occur in the assignee.
    word_match = 0
    for tok in assignor_toks:
        if tok in assignee_toks:
            word_match += 1

    # Normalize by the mean length of the two names; the floor of 1.0 avoids
    # dividing by zero when both are empty.
    word_match /= max(1.0,0.5*(len(assignor_toks)+len(assignee_toks)))
    match = word_match > 0.5

    # Disabled debug output (rowid and patnum are not defined in this loop).
    # if match:
    #   print('{:7}-{:7}, {:4.2}-{}: {:40.40} -> {:40.40}'.format(rowid,patnum,word_match,int(match),assignor,assignee))

    # `store` is defined outside the visible code; when set, the raw row is
    # queued for insertion whether or not `match` is true.
    if store:
        assignments.append(row)