예제 #1
0
파일: dedupe.py 프로젝트: c-trl/Notebooks
def canopy(n):
    """Print index bucket sizes for the shingles of one sample name.

    Builds an n-gram index over the ``'nname'`` field of the table returned
    by ``fl_data()``, then takes the value at label ``0`` of the ``nname``
    attribute of the table returned by ``oge_data()``, shingles it, and
    prints one line per shingle: the size of that shingle's entry in the
    index.

    Args:
        n: Shingle size; passed to ``ngram_index`` as ``n`` and to
            ``shingle`` as ``k``.

    Returns:
        None. Output goes to stdout only.
    """
    # NOTE(review): both loaders presumably return pandas DataFrames that
    # carry an ``nname`` column -- confirm against their definitions.
    fl_frame = fl_data()
    oge_frame = oge_data()

    fl_index = ngram_index(fl_frame, 'nname', n=n)

    probe_name = oge_frame.nname[0]
    for gram in shingle(probe_name, k=n):
        matches = fl_index[gram]
        print(len(matches))
예제 #2
0
def ngram_index(df, name_field, n=4):
    """Build an inverted index from shingles to the rows that contain them.

    Args:
        df: Table to index. It is accessed through ``df.index`` and
            ``df[name_field]``, i.e. a pandas DataFrame is expected.
        name_field: Label of the column whose values are shingled.
        n: Shingle size, forwarded to ``shingle`` as ``k``.

    Returns:
        A ``defaultdict(set)`` mapping each shingle to the set of index
        labels of the rows whose ``name_field`` value produced it. Because
        it is a ``defaultdict``, looking up an unseen shingle returns (and
        stores) an empty set instead of raising ``KeyError``.
    """
    index = defaultdict(set)
    # Iterate only the column being indexed: ``DataFrame.iterrows`` would
    # materialise a full Series per row just to read a single field.
    for idx, name in zip(df.index, df[name_field]):
        for s in shingle(name, k=n):
            index[s].add(idx)
    return index