Example #1
def featurize_record_pair(r1, r2, freq, doc_size):
    """
    Featurize a record pair and return the feature vector as a pandas Series

    Params:
        r1: (rltk.Record) record 1
        r2: (rltk.Record) record 2
        freq: (Dict) corpus document frequencies (passed to rltk.tf_idf_similarity)
        doc_size: (int) number of documents in the corpus
    """
    fv = pd.Series(dtype=object)  # explicit dtype: features are mixed types
    fv['id1'] = r1.id
    fv['id2'] = r2.id

    if gt.is_member(r1.id, r2.id):
        fv['label'] = 1
    else:
        fv['label'] = 0

    # treat both empty strings and None as missing manufacturer values
    if not r1.manufacturer or not r2.manufacturer:
        fv['manufacturer_jaro_winkler'] = None
        fv['manufacturer_levenshtein'] = None
        fv['manufacturer_jaccard'] = None
    else:
        fv['manufacturer_jaro_winkler'] = rltk.jaro_winkler_similarity(r1.manufacturer, r2.manufacturer)
        fv['manufacturer_levenshtein'] = rltk.levenshtein_similarity(r1.manufacturer, r2.manufacturer)
        fv['manufacturer_jaccard'] = rltk.jaccard_index_similarity(set(tokenize(r1.manufacturer)),
                                                                   set(tokenize(r2.manufacturer)))

    if r1.price is None or r2.price is None:
        fv['price_difference'] = None
    else:
        fv['price_difference'] = abs(r1.price - r2.price)/max(r1.price, r2.price)

    fv['name_jaccard'] = rltk.jaccard_index_similarity(set(r1.name_tokenized), set(r2.name_tokenized))
    fv['name_jaro_winkler'] = rltk.jaro_winkler_similarity(" ".join(r1.name_tokenized), " ".join(r2.name_tokenized))
    fv['name_trigram'] = rltk.ngram_distance(r1.name, r2.name, 3)
    
    if r1.description_tokenized is None or r2.description_tokenized is None:
        fv['desc_tf_idf'] = None
        fv['desc_trigram'] = None
        fv['desc_jaccard'] = None
    else:
        fv['desc_tf_idf'] = rltk.tf_idf_similarity(r1.description_tokenized,
                                                   r2.description_tokenized, freq, doc_size)
        fv['desc_trigram'] = rltk.ngram_distance(" ".join(r1.description_tokenized),
                                                 " ".join(r2.description_tokenized), 3)
        fv['desc_jaccard'] = rltk.jaccard_index_similarity(set(r1.description_tokenized), set(r2.description_tokenized))

    return fv
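
A minimal usage sketch for the featurizer above; candidate_pairs, freq, and doc_size are hypothetical names standing in for the surrounding blocking and corpus-statistics pipeline, not part of the original code:

import pandas as pd

# stack the per-pair Series into a feature matrix for a downstream classifier;
# `candidate_pairs` is assumed to be an iterable of (rltk.Record, rltk.Record)
feature_df = pd.DataFrame(
    [featurize_record_pair(r1, r2, freq, doc_size) for r1, r2 in candidate_pairs]
)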
Example #2
def match_records_using_string_similarity(record_1, field_1, record_2,
                                          field_2):
    value_1 = getattr(record_1, field_1).lower()
    value_2 = getattr(record_2, field_2).lower()

    ngram_tokenizer = rltk.NGramTokenizer()
    return rltk.jaccard_index_similarity(ngram_tokenizer.basic(value_1, 3),
                                         ngram_tokenizer.basic(value_2, 3)) > 0.8
Example #3
def similarity_match_by_name(record1, record2):
    full_name_m = record1.full_name_string.lower()
    full_name_w = record2.full_name_string.lower()

    # full name score
    if full_name_m == full_name_w:
        return True, 1
    # Jaccard name score for whole set of name tokens (dirty)
    jaccard_name_score = rltk.jaccard_index_similarity(record1.name_tokens, record2.name_tokens)
    # Jaro-Winkler name score for re-assembled full name (clean)
    jw_name_score = rltk.jaro_winkler_similarity(full_name_m, full_name_w)
    total = jaccard_name_score * 0.65 + jw_name_score * 0.35

    return total > 0.7, total
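
A hedged usage sketch: the function returns both the match decision and the blended score, so callers can keep the score for ranking; record1 and record2 are placeholder rltk.Record objects:

# keep the blended score for later ranking or tie-breaking
matched, score = similarity_match_by_name(record1, record2)
if matched:
    print('match with confidence {:.2f}'.format(score))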
Example #4
def SimilarityScore(record1, record2):
    names = rltk.jaccard_index_similarity(record1.name, record2.name)
    address = rltk.levenshtein_similarity(record1.address, record2.address)
    cuisine = rltk.levenshtein_similarity(record1.cuisine, record2.cuisine)
    #     phone = rltk.levenshtein_similarity(record1.phone, record2.phone)

    if record1.phone != record2.phone:
        phone = 0.
    else:
        phone = 1.
    # tuning notes: weights > threshold, result
    # 0.7 0.2 0.1 > 0.8   104
    # 0.4 0.4 0.2 > 0.59  106
    # 0.4 0.4 0.2 > 0.53  113
    return 0.4 * phone + 0.4 * names + 0.2 * address
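
A small sketch of how the score might be thresholded; the 0.59 cutoff echoes the second tuning note above, and is_match is a hypothetical wrapper, not part of the original code:

def is_match(record1, record2, threshold=0.59):
    # cutoff from the 0.4/0.4/0.2 tuning note above
    return SimilarityScore(record1, record2) > threshold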
Example #5
    def similarity(self, id1, id2):
        if not self._gm.in_graph(id1) or not self._gm.in_graph(id2):
            raise ValueError('Invalid id1 or id2')

        if id1 == id2:
            return 1.0

        # node type (item or property)
        cat1 = self._gm.get_node_type(id1)
        cat2 = self._gm.get_node_type(id2)
        if cat1 != cat2:
            return 0.0

        # attributes
        attr1 = self._gm.get_node_attributes(id1)
        attr2 = self._gm.get_node_attributes(id2)
        attr1_keys = set(attr1)
        attr2_keys = set(attr2)

        # comparing on same attributes
        attr_score = 0
        all_keys = attr1_keys & attr2_keys
        for key in all_keys:
            sim_func = Comparator.get(key)
            attr_score += sim_func(attr1[key], attr2[key]) / len(
                all_keys
            )  # weights of different attributes should be configurable

        # weight of overlapped keys (penalty of non-shared keys)
        attr_score *= rltk.jaccard_index_similarity(attr1_keys, attr2_keys)

        # class
        # AIDA's entity and event types are special;
        # a more general way to compare class similarity is needed
        cls1 = self._gm.get_node_class(id1)
        cls2 = self._gm.get_node_class(id2)
        type1 = set(normalize_ontology_type(cls1).split())
        type2 = set(normalize_ontology_type(cls2).split())
        # Jaccard index over type tokens; max(..., 1) guards against empty sets
        cls_score = len(type1 & type2) / max(len(type1 | type2), 1)

        return (attr_score + cls_score) / 2
Example #6
    def name_sim(a, b):
        def _decode(s):
            # expected format: en:"label1",ru:"label2"
            r = defaultdict(list)
            for l in s.split(','):
                lang, label = l[:2], l[4:-1]
                r[lang].append(normalize_text(label))
            return r

        if a == b:
            return 1
        if not a or not b:
            return None  # missing labels are incomparable

        # multi-lingual labels
        ml_l_a, ml_l_b = _decode(a), _decode(b)
        shared_langs = set(ml_l_a.keys()) & set(ml_l_b.keys())
        score = 0
        for lang in shared_langs:
            l_a, l_b = ml_l_a[lang], ml_l_b[lang]
            score += rltk.jaccard_index_similarity(
                set(l_a), set(l_b)) / len(shared_langs)
        return score
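
An illustrative call, assuming the serialized-label format shown in the _decode comment and a normalize_text helper from the surrounding module; the labels below are made up:

# only the shared language (en) is compared; identical English labels
# give a Jaccard term of 1.0, so score == 1.0
score = name_sim('en:"Douglas Adams",ru:"Дуглас Адамс"', 'en:"Douglas Adams"')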
Example #7
def genre_similarity(r_imdb, r_afi):
    s1 = r_imdb.genre_set
    s2 = r_afi.genre_set
    return rltk.jaccard_index_similarity(s1, s2)
Example #8
            # excerpt: fall back to an empty path set when a key is missing
            try:
                gt_path = set(gt_sp[cur_key])
            except KeyError:
                gt_path = set()

            try:
                base_gen_path = set(base_gen_sp[cur_key])
            except KeyError:
                base_gen_path = set()

            try:
                comp_gen_path = set(comp_gen_sp[cur_key])
            except KeyError:
                comp_gen_path = set()

            base_jaccard_sim = rltk.jaccard_index_similarity(
                gt_path, base_gen_path)
            baseline_jaccard_sims.append(base_jaccard_sim)

            comp_jaccard_sim = rltk.jaccard_index_similarity(
                gt_path, comp_gen_path)
            comparison_jaccard_sims.append(comp_jaccard_sim)

    baseline_JACCARD = np.mean(baseline_jaccard_sims) * 100
    comparison_JACCARD = np.mean(comparison_jaccard_sims) * 100

    print("\nBaseline Method Jaccard Similarity metric = ", baseline_JACCARD)
    print("Comparison Method Jaccard Similarity metric = ", comparison_JACCARD)
    print("MAP significance test = ",
          stats.ttest_rel(baseline_jaccard_sims, comparison_jaccard_sims),
          "\n")
Example #9
def ingredient_set(prod1, prod2):
    set1 = prod1.ingredients
    set2 = prod2.ingredients
    return rltk.jaccard_index_similarity(set1, set2)
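
A worked example with made-up ingredient sets: two of four distinct ingredients are shared, so the Jaccard index is 2 / 4 = 0.5:

import rltk

# {'sugar', 'salt'} shared out of 4 distinct ingredients -> 0.5
print(rltk.jaccard_index_similarity({'sugar', 'salt', 'cocoa'},
                                    {'sugar', 'salt', 'whey'}))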
Example #10
def record_score(r1, r2):
    score = rltk.jaccard_index_similarity(set(r1.concatenated_labels),
                                          set(r2.concatenated_labels))
    return score