Example #1
def get_matches(school, source):
    with open('../../teams/' + source + '/results_' + source +
              '.csv') as csv_file:
        csv_reader = csv.reader(csv_file)
        is_head_row = True
        for row in csv_reader:
            if is_head_row:  # skip the one head row
                is_head_row = False
                continue
            my_match = match.match(source, row)

            # IMPORTANT LINE - CALCULATES SIMILARITY SCORE
            # rltk.jaro_winkler_similarity(s1, s2, <threshold for invoking prefix score>, <scaling factor for prefix score>, <length of prefix score>)
            if (source == "rivals" or reference_source
                    == "rivals"):  # then we have to use school name
                # distance_score = rltk.levenshtein_similarity(school.school_name, my_match.school_name)
                distance_score = rltk.jaro_winkler_similarity(
                    school.school_name, my_match.school_name, 0.6, 0.25, 1)
            else:
                # distance_score = rltk.levenshtein_similarity(school.representation, my_match.representation)
                distance_score = rltk.jaro_winkler_similarity(
                    school.representation, my_match.representation, 0.6, 0.25,
                    1)

            eval("school.matches_" + source +
                 ".append((my_match, distance_score))")

    school.sort_lists(source)  # csv_file is already closed here by the with block
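
# Illustrative sketch (not from the original source): per the comment in get_matches,
# rltk.jaro_winkler_similarity takes (s1, s2, threshold, scaling_factor, prefix_len).
# The call above lowers the threshold to 0.6, raises the per-character prefix bonus to
# 0.25 (the usual upper bound for the Winkler scaling factor), and rewards at most one
# prefix character. The school names below are made-up placeholders.
def _demo_prefix_parameters(name_a="st. thomas aquinas", name_b="st thomas aquinas hs"):
    default_score = rltk.jaro_winkler_similarity(name_a, name_b)
    tuned_score = rltk.jaro_winkler_similarity(name_a, name_b, 0.6, 0.25, 1)
    return default_score, tuned_score
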
def confirmed_match(name_1, name_2, school_1, school_2, hometown_1, hometown_2,
                    pos_1, pos_2, source_1, source_2):

    if (not (name_1 and name_2)):
        return False

    # COMPARE HOMETOWNS - if hometowns are present and more than jaro_threshold different, return false
    if (hometown_1 and hometown_2):
        string1 = hometown_1.strip(" '()").lower()
        string2 = hometown_2.strip(" '()").lower()
        distance_score = rltk.jaro_winkler_similarity(string1, string2)
        if (distance_score < jaro_threshold_hometown):
            return False

    # COMPARE NAMES
    string1 = name_1.strip(" '()").lower()
    string2 = name_2.strip(" '()").lower()
    distance_score = rltk.jaro_winkler_similarity(string1, string2)
    if (distance_score > jaro_threshold_players):

        # COMPARE POSITIONS IF NO SCHOOLS
        if (not (school_1 and school_2)):  # if not school, use position
            if (pos_1 and pos_2):
                # return True if pos_1 and pos_2 share a substring of 3+ chars
                match_len = SequenceMatcher(None, pos_1,
                                            pos_2).find_longest_match(
                                                0, len(pos_1), 0,
                                                len(pos_2))[2]
                if (match_len >= position_substring_min):
                    return True

            return False  # no school and no position -> no match

        # COMPARE SCHOOLS
        if school_match(school_1, school_2, source_1, source_2):
            return True

        if ('/' in school_1):

            school_new = school_1.split('/')[0]
            if school_match(school_new, school_2, source_1, source_2):
                return True

            school_new = school_1.split('/')[1]
            if school_match(school_new, school_2, source_1, source_2):
                return True

        elif ('/' in school_2):
            school_new = school_2.split('/')[0]
            if school_match(school_1, school_new, source_1, source_2):
                return True

            school_new = school_2.split('/')[1]
            if school_match(school_1, school_new, source_1, source_2):
                return True

    return False
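
# Minimal sketch (assumption, not from the original source) of the position check used
# above: SequenceMatcher.find_longest_match returns a Match(a, b, size) triple, and the
# code above reads index [2], i.e. the length of the longest shared substring.
from difflib import SequenceMatcher as _SequenceMatcher

def _demo_position_overlap(pos_1="OLB", pos_2="ILB/OLB", min_len=3):
    longest = _SequenceMatcher(None, pos_1, pos_2).find_longest_match(0, len(pos_1), 0, len(pos_2))
    return longest.size >= min_len  # "OLB" vs "ILB/OLB" share the 3-character substring "OLB"
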
Example #3
def featurize_record_pair(r1, r2, freq, doc_size):
    """
    Featurize a record pair and return a Series of the feature vectors

    Params:
        r1: (rltk.Record) record 1
        r2: (rltk.Record) record 2
        freq: (Dict) corpus frequency
        doc_size: (int) total size of dataset
    """
    fv = pd.Series()
    fv['id1'] = r1.id
    fv['id2'] = r2.id

    if gt.is_member(r1.id, r2.id):
        fv['label'] = 1
    else:
        fv['label'] = 0

    if not r1.manufacturer or not r2.manufacturer:  # empty string or None means the field is missing
        fv['manufacturer_jaro_winkler'] = None
        fv['manufacturer_levenshtein'] = None
        fv['manufacturer_jaccard'] = None
    else:
        fv['manufacturer_jaro_winkler'] = rltk.jaro_winkler_similarity(r1.manufacturer, r2.manufacturer)
        fv['manufacturer_levenshtein'] = rltk.levenshtein_similarity(r1.manufacturer, r2.manufacturer)
        fv['manufacturer_jaccard'] = rltk.jaccard_index_similarity(set(tokenize(r1.manufacturer)), 
                                set(tokenize(r2.manufacturer)))

    if r1.price is None or r2.price is None:
        fv['price_difference'] = None
    else:
        fv['price_difference'] = abs(r1.price - r2.price)/max(r1.price, r2.price)

    fv['name_jaccard'] = rltk.jaccard_index_similarity(set(r1.name_tokenized), set(r2.name_tokenized))
    fv['name_jaro_winkler'] = rltk.jaro_winkler_similarity(" ".join(r1.name_tokenized), " ".join(r2.name_tokenized))
    fv['name_trigram'] = rltk.ngram_distance(r1.name, r2.name,3)
    
    if r1.description_tokenized is None or r2.description_tokenized is None:
        fv['desc_tf_idf'] = None
        fv['desc_trigram'] = None
        fv['desc_jaccard'] = None
    else:
        fv['desc_tf_idf'] = rltk.tf_idf_similarity(r1.description_tokenized,
                                                r2.description_tokenized,freq,doc_size)
        fv['desc_trigram'] = rltk.ngram_distance(" ".join(r1.description_tokenized), " ".join(r2.description_tokenized),3)
        fv['desc_jaccard'] = rltk.jaccard_index_similarity(set(r1.description_tokenized), set(r2.description_tokenized))

    return fv
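
# Hedged usage sketch (assumption, not from the original source): ds1/ds2 are taken to be
# rltk.Dataset objects holding the two product catalogs, and freq/doc_size are the corpus
# statistics described in the docstring above.
def build_feature_table(ds1, ds2, freq, doc_size):
    rows = [featurize_record_pair(r1, r2, freq, doc_size)
            for r1, r2 in rltk.get_record_pairs(ds1, ds2)]
    # each row is a pd.Series, so stacking them yields one feature vector per candidate pair
    return pd.DataFrame(rows)
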
def school_match(school_1, school_2, source_1, source_2):
    # First just try it without getting the canonical source
    distance_score = rltk.jaro_winkler_similarity(school_1.lower(),
                                                  school_2.lower())
    if (distance_score > jaro_threshold_schools):
        return True

    school_1 = get_canonical_source(school_1, source_1)
    school_2 = get_canonical_source(school_2, source_2)

    distance_score = rltk.jaro_winkler_similarity(school_1.lower(),
                                                  school_2.lower())
    if (distance_score > jaro_threshold_schools):
        return True
    return False
def name_string_similarity_1(r_imdb, r_afi):
    s1 = r_imdb.name_string.lower()[:8]  # compare only the first 8 characters of each name
    s2 = cached_names_1.get(r_afi)
    if s2 is None:
        s2 = r_afi.name_string.lower()[:8]
        cached_names_1[r_afi] = s2

    return rltk.jaro_winkler_similarity(s1, s2)
def name_string_similarity_3(r_imdb, r_afi):
    s1 = ''.join(sorted(re.split(r'[-,\s]+', r_imdb.name_string.lower())))  # sort name tokens so word order doesn't matter
    s2 = cached_names_3.get(r_afi)
    if s2 is None:
        s2 = ''.join(sorted(re.split(r'[-,\s]+', r_afi.name_string.lower())))
        cached_names_3[r_afi] = s2

    return rltk.jaro_winkler_similarity(s1, s2)
def school_match(school_1, school_2, source_1, source_2):
    school_1 = get_canonical_source(school_1, source_1)
    school_2 = get_canonical_source(school_2, source_2)

    distance_score = rltk.jaro_winkler_similarity(school_1.lower(),
                                                  school_2.lower())
    if (distance_score > jaro_threshold_schools):
        return True
    return False
def confirmed_match(name_1, name_2, school_1, school_2, hometown_1, hometown_2,
                    source_1, source_2):
    if (not (name_1 and name_2 and school_1 and school_2)):
        return False

    # COMPARE HOMETOWNS
    if (hometown_1 and hometown_2):
        string1 = hometown_1.strip(" '()").lower()
        string2 = hometown_2.strip(" '()").lower()
        distance_score = rltk.jaro_winkler_similarity(string1, string2)
        if (distance_score < jaro_threshold_hometown):
            return False

    # COMPARE NAMES
    string1 = name_1.strip(" '()").lower()
    string2 = name_2.strip(" '()").lower()
    distance_score = rltk.jaro_winkler_similarity(string1, string2)

    # COMPARE SCHOOLS
    if (distance_score > jaro_threshold_players):
        if school_match(school_1, school_2, source_1, source_2):
            return True

        if ('/' in school_1):

            school_new = school_1.split('/')[0]
            if school_match(school_new, school_2, source_1, source_2):
                return True

            school_new = school_1.split('/')[1]
            if school_match(school_new, school_2, source_1, source_2):
                return True

        elif ('/' in school_2):
            school_new = school_2.split('/')[0]
            if school_match(school_1, school_new, source_1, source_2):
                return True

            school_new = school_2.split('/')[1]
            if school_match(school_1, school_new, source_1, source_2):
                return True

    return False
def name_similarity(r_imdb, r_afi):
    # imdb_name = r_imdb['name_string']
    # afi_name = r_afi['name_string']
    imdb_name = r_imdb.lower()
    afi_name = r_afi.lower()

    # imdb_name = r_imdb.lower()
    # afi_name = r_afi.lower()

    return rltk.jaro_winkler_similarity(imdb_name, afi_name)
def entire_school_similarity(r1, r2):
    s1 = r1.school_string
    s2 = r2.school_string

    similarity_score = rltk.jaro_winkler_similarity(s1, s2)
    if s1 == s2:
        return 1
    elif similarity_score > 0:
        return similarity_score
    else:
        return 0
def school_similarity(r1, r2):
    s1 = r1.school_string[:int(len(r1.school_string) / 2)]
    s2 = r2.school_string[:int(len(r2.school_string) / 2)]

    similarity_score = rltk.jaro_winkler_similarity(s1, s2)
    if s1 == s2:
        return 1
    elif similarity_score > 0:
        return similarity_score
    else:
        return 0
def similarity_match_by_name(record1, record2):    
    full_name_m = record1.full_name_string.lower()
    full_name_w = record2.full_name_string.lower()

    # full name score
    if full_name_m == full_name_w:
        return True, 1
    # Jaccard name score for whole set of name tokens (dirty)
    jaccard_name_score = rltk.jaccard_index_similarity(record1.name_tokens, record2.name_tokens)
    # Jaro-Winkler name score for the re-assembled full name (clean)
    jw_name_score = rltk.jaro_winkler_similarity(full_name_m, full_name_w)
    total = jaccard_name_score * 0.65 + jw_name_score * 0.35

    return total > 0.7, total
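
# Illustrative usage sketch (not from the original source): similarity_match_by_name
# returns a (matched, score) pair, where score = 0.65 * token-set Jaccard + 0.35 *
# Jaro-Winkler on the full names and matched means the blend exceeds 0.7.
# matched, score = similarity_match_by_name(record1, record2)
# if matched:
#     accept_pair(record1, record2, score)  # accept_pair() is a hypothetical downstream step
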
Example #13
def er_task(block):
    st = time.time()
    similar = defaultdict(lambda: [])

    for i, (key, val) in enumerate(block.items()):
        if (i + 1) % 1 == 0:  # modulus of 1 means progress is printed on every iteration
            print("time taken for {} is {}".format(i, time.time() - st))
        for igdb_obj in val[0]:
            for igdb_game_key, igdb_game_name in igdb_obj.items():
                max_score = -1
                matching_key = ''
                matching_name = ''
                max_lev_score = -1
                max_jw_score = -1

                if len(val[1]) != 0:
                    for g2a_obj in val[1]:
                        for g2a_game_key, g2a_game_name in g2a_obj.items():
                            lev_score = rltk.levenshtein_similarity(
                                igdb_game_name, g2a_game_name)
                            jw_score = rltk.jaro_winkler_similarity(
                                igdb_game_name, g2a_game_name)
                            score = lev_score + jw_score
                            if score > max_score:
                                max_score = score
                                max_lev_score = lev_score
                                max_jw_score = jw_score
                                matching_key = g2a_game_key
                                matching_name = g2a_game_name
                    if max_score > 1.2:
                        similar[key].append({
                            (igdb_game_key, igdb_game_name):
                            (matching_key, matching_name, max_score)
                        })
                    else:
                        similar[key].append({
                            (igdb_game_key, igdb_game_name): ('', '', -1)
                        })
                else:
                    similar[key].append({
                        (igdb_game_key, igdb_game_name): ('', '', -1)
                    })
    print("total time taken: ", time.time() - st)

    return similar
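
# Minimal sketch of the input er_task expects (an assumption inferred from the loop above,
# not from the original source): each blocking key maps to a pair of lists, where val[0]
# holds {igdb_key: igdb_name} dicts and val[1] holds {g2a_key: g2a_name} dicts. All keys
# and names below are made-up placeholders.
_example_block = {
    "shooter": [
        [{"igdb_1020": "doom"}, {"igdb_1877": "overwatch"}],    # val[0]: IGDB candidates
        [{"g2a_55": "doom 2016"}, {"g2a_91": "overwatch pc"}],  # val[1]: G2A candidates
    ],
}
# similar = er_task(_example_block)
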
def address_similarity(r_usn, r_cc):
    if r_usn.address is not None and r_cc.address is not None:
        address1 = r_usn.address
        address2 = r_cc.address
        if address1 and address2:
            # split both addresses on commas; drop everything after the first token of the USN address's last segment
            address1 = list(map(lambda s: s.strip(), r_usn.address.split(',')))
            address1 = address1[:-1] + [address1[-1].split()[0]]
            address2 = list(map(lambda s: s.strip(), r_cc.address.split(',')))
            # keep only the first half of each leading segment so abbreviations can still match
            address1 = [address1[0][:int(len(address1[0]) / 2)]] + address1[1:]
            address2 = [address2[0][:int(len(address2[0]) / 2)]] + address2[1:]
            address1 = ' '.join(address1)
            address2 = ' '.join(address2)
            similarity = rltk.jaro_winkler_similarity(address1, address2)

            if address1 == address2:
                return 1
            elif similarity > 0:
                return similarity
            else:
                return 0
           
    return 0
def title_similarity(title1: str, title2: str) -> float:
    return rltk.jaro_winkler_similarity(title1, title2)
def compare_cpu(g2a_min_cpu_1,
                techpowerup_cpu,
                g2a_min_cpu_2=None,
                max_score_1=-1,
                max_score_2=-1):
    # techpowerup_cpu_reader = jsonlines.open(techpowerup_cpu, 'r')
    similar_id_1 = ''
    similar_id_2 = ''
    most_similar_cpu_1 = ''
    most_similar_cpu_2 = ''
    score_1 = -9999
    score_2 = -9999
    with jsonlines.open(techpowerup_cpu, 'r') as techpowerup_cpu_reader:
        if g2a_min_cpu_1 is not None and g2a_min_cpu_2 is not None:

            for cpu in techpowerup_cpu_reader:
                cpu_key, cpu_value = list(cpu.items())[0][0], list(
                    cpu.items())[0][1]
                if len(g2a_min_cpu_1) != 0:
                    score_1 = rltk.levenshtein_similarity(
                        g2a_min_cpu_1, cpu_value['name'].lower(
                        )) + rltk.jaro_winkler_similarity(
                            g2a_min_cpu_1, cpu_value['name'].lower())
                '''if "Intel" in g2a_min_cpu_1:
                    if cpu_value["Company"] == "Intel":
                        score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1,cpu_value['name'])
                elif "AMD" in g2a_min_cpu_1:
                    if cpu_value["Company"] == 'AMD':
                        score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1,cpu_value['name'])
                else:
                    score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1,cpu_value['name'])'''

                if len(g2a_min_cpu_2) != 0:
                    score_2 = rltk.levenshtein_similarity(
                        g2a_min_cpu_2, cpu_value['name'].lower(
                        )) + rltk.jaro_winkler_similarity(
                            g2a_min_cpu_2, cpu_value['name'].lower())
                '''if "Intel" in g2a_min_cpu_2:
                    if cpu_value["Company"] == "Intel":
                        score_2 = rltk.levenshtein_similarity(g2a_min_cpu_2,cpu_value['name'])
                elif "AMD" in g2a_min_cpu_2:
                    if cpu_value["Company"] == 'AMD':
                        score_2 = rltk.levenshtein_similarity(g2a_min_cpu_2,cpu_value['name'])
                else:
                    score_2 = rltk.levenshtein_similarity(g2a_min_cpu_2,cpu_value['name'])'''
                # score_2 = rltk.levenshtein_similarity(g2a_min_cpu_2,cpu_value['name'])

                if score_1 > max_score_1:
                    max_score_1 = score_1
                    similar_id_1 = cpu_key
                    most_similar_cpu_1 = cpu_value['name']

                if score_2 > max_score_2:
                    max_score_2 = score_2
                    similar_id_2 = cpu_key
                    most_similar_cpu_2 = cpu_value['name']

            if max_score_1 >= 1.2 and max_score_2 >= 1.2:
                return {
                    'most_sim_cpu_1': {
                        'name': most_similar_cpu_1,
                        'sim_id': similar_id_1,
                        'sim_score': max_score_1
                    },
                    'most_sim_cpu_2': {
                        'name': most_similar_cpu_2,
                        'sim_id': similar_id_2,
                        'sim_score': max_score_2
                    }
                }

            elif max_score_1 >= 1.2 and max_score_2 <= 1.2:
                return {
                    'most_sim_cpu_1': {
                        'name': most_similar_cpu_1,
                        'sim_id': similar_id_1,
                        'sim_score': max_score_1
                    },
                    'most_sim_cpu_2': {
                        'name': '',
                        'sim_id': '',
                        'sim_score': max_score_2
                    }
                }

            elif max_score_1 <= 1.2 and max_score_2 >= 1.2:
                return {
                    'most_sim_cpu_1': {
                        'name': '',
                        'sim_id': '',
                        'sim_score': max_score_1
                    },
                    'most_sim_cpu_2': {
                        'name': most_similar_cpu_2,
                        'sim_id': similar_id_2,
                        'sim_score': max_score_2
                    }
                }

            else:
                return {
                    'most_sim_cpu_1': {
                        'name': '',
                        'sim_id': '',
                        'sim_score': max_score_1
                    },
                    'most_sim_cpu_2': {
                        'name': '',
                        'sim_id': '',
                        'sim_score': max_score_2
                    }
                }

        if g2a_min_cpu_1 is not None and g2a_min_cpu_2 is None:
            for cpu in techpowerup_cpu_reader:
                cpu_key, cpu_value = list(cpu.items())[0][0], list(
                    cpu.items())[0][1]

                if len(g2a_min_cpu_1) != 0:
                    score_1 = rltk.levenshtein_similarity(
                        g2a_min_cpu_1, cpu_value['name'].lower()
                    ) + rltk.jaro_winkler_similarity(
                        g2a_min_cpu_1, cpu_value['name'].lower())
                '''if "Intel" in g2a_min_cpu_1:
                    if cpu_value["Company"] == "Intel":
                        score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1,cpu_value['name'])
                elif "AMD" in g2a_min_cpu_1:
                    if cpu_value["Company"] == 'AMD':
                        score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1,cpu_value['name'])
                else:
                    score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1,cpu_value['name'])'''
                # score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1, cpu_value['name'])
                if score_1 > max_score_1:
                    max_score_1 = score_1
                    similar_id_1 = cpu_key
                    most_similar_cpu_1 = cpu_value['name']
            if max_score_1 >= 1.2:
                return {
                    'most_sim_cpu': {
                        'name': most_similar_cpu_1,
                        'sim_id': similar_id_1,
                        'sim_score': max_score_1
                    }
                }
            else:
                return {
                    'most_sim_cpu': {
                        'name': '',
                        'sim_id': '',
                        'sim_score': max_score_1
                    }
                }
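
# Hedged usage sketch (assumption, not from the original source): the second argument is a
# path to a JSON Lines file whose rows look like {"cpu-id": {"name": ..., "Company": ...}},
# matching the fields read above; the file name and CPU strings below are placeholders.
# result = compare_cpu("intel core i5-4460", "techpowerup_cpu.jsonl",
#                      g2a_min_cpu_2="amd fx-6300")
# result holds the best-scoring TechPowerUp entries, kept only when the summed
# levenshtein + jaro_winkler similarity exceeds 1.2.
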
Example #17
def name_string_similarity(prod1, prod2):
    s1 = prod1.name_string
    s2 = prod2.name_string
    return rltk.jaro_winkler_similarity(s1, s2)
Example #18
def name_similarity(r_imdb, r_afi):
    s1 = r_imdb.name_string
    s2 = r_afi.name_string
    return rltk.jaro_winkler_similarity(s1, s2)
Example #19
    @property
    def phone(self):
        return self.raw_object['Phone']

    @property
    def cuisine(self):
        return self.raw_object['Cuisine']


ds1 = rltk.Dataset(reader=rltk.DataFrameReader(ds1),
                   record_class=Record1,
                   adapter=rltk.MemoryKeyValueAdapter())
ds2 = rltk.Dataset(reader=rltk.DataFrameReader(ds2),
                   record_class=Record2,
                   adapter=rltk.MemoryKeyValueAdapter())
'''bg = rltk.HashBlockGenerator()
blocks = bg.generate(bg.block(ds1, property_='cuisine'), bg.block(ds2, property_='cuisine'))
pairs = rltk.get_record_pairs(ds1, ds2, block=blocks)'''

pairs = rltk.get_record_pairs(ds1, ds2)

f = open('similarities.txt', 'w+')

for r1, r2 in pairs:

    a_d = rltk.levenshtein_similarity(r1.address, r2.address)
    p_d = rltk.jaro_winkler_similarity(r1.phone, r2.phone)
    c_d = rltk.jaro_winkler_similarity(r1.cuisine, r2.cuisine)
    f.write(r1.id + "," + r2.id + "," + str(a_d) + "," + str(p_d) + "," +
            str(c_d) + "\n")

f.close()
def main():
    with open("dblp_final_JSON.json", "r") as f:
        dblp_dict = json.load(f)

    professors = set()
    for key in dblp_dict:
        professors.add(key['person'])

    #print(professors)
    #print(len(professors))

    coauthor_dict = defaultdict(list)
    for key in dblp_dict:
        author = key['person']
        for items in key['papers']:
            co_authors = items['co_authors']
            if author in co_authors:
                co_authors.remove(author)
            if co_authors:
                coauthor_dict[author].extend(co_authors)

    list_of_coauthors = []
    for key in coauthor_dict:
        list_of_coauthors.extend(coauthor_dict[key])
    #print(len(list_of_coauthors))

    ### String / Data Matching for Entity linking using RLTK

    ### Remove duplicates in the coauthor_dict using String Matching
    ### Compare with professors and do entity linking / remove duplicates

    df1 = pd.DataFrame(list(professors), columns=['name'])
    #print(df1)
    df2 = pd.DataFrame(list_of_coauthors, columns=['name'])
    #print(len(df2))
    df1['first_name'] = df1.apply(lambda x: x['name'].split()[0], axis=1)
    df1['last_name'] = df1.apply(lambda x: ' '.join(x['name'].split()[1:]),
                                 axis=1)
    df1['id'] = (df1.index + 1).astype(str)

    #print(df1)
    df2['first_name'] = df2.apply(lambda x: x['name'].split()[0], axis=1)
    df2['last_name'] = df2.apply(lambda x: ' '.join(x['name'].split()[1:]),
                                 axis=1)
    df2['id'] = (df2.index + 1).astype(str)

    ds1 = rltk.Dataset(reader=rltk.DataFrameReader(df1),
                       record_class=Record1,
                       adapter=rltk.MemoryKeyValueAdapter())
    ds2 = rltk.Dataset(reader=rltk.DataFrameReader(df2),
                       record_class=Record2,
                       adapter=rltk.MemoryKeyValueAdapter())
    bg = rltk.HashBlockGenerator()
    block = bg.generate(bg.block(ds1, property_='fname'),
                        bg.block(ds2, property_='fname'))
    pairs = rltk.get_record_pairs(ds1, ds2, block=block)
    num_pairs = 0
    sim_pairs = []
    sim_dict = {}
    for r1, r2 in pairs:
        num_pairs += 1
        sim = rltk.jaro_winkler_similarity(r1.lname, r2.lname)
        if 0.9 < sim < 1:
            sim_pairs.append(
                (r1.fname + ' ' + r1.lname, r2.fname + ' ' + r2.lname))
            sim_dict[r1.fname + ' ' + r1.lname] = r2.fname + ' ' + r2.lname
            #print(r1.lname,r2.lname,sim)
    #print(sim_pairs)
    #print("Blocking using Cuisine - Number of pairs:",num_pairs)
    for key in coauthor_dict:
        lis = coauthor_dict[key]
        for ind in range(len(lis)):
            if lis[ind] in sim_dict:
                lis[ind] = sim_dict[lis[ind]]

    with open("co_authors.json", "w") as jf:
        json.dump(coauthor_dict, jf, indent=2)
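
# Hedged sketch (assumption, not from the original source; the real Record1/Record2 are
# defined elsewhere): the record classes used in main() presumably expose the DataFrame
# columns as the id/fname/lname properties that the blocking and comparison steps rely on.
class _ExampleAuthorRecord(rltk.Record):
    @property
    def id(self):
        return self.raw_object['id']

    @property
    def fname(self):
        return self.raw_object['first_name']

    @property
    def lname(self):
        return self.raw_object['last_name']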