Example #1
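    # Method snippet (class context not shown): writes the bigram similarity
    # (1 - NGram distance) of source/target into one cell of a numpy matrix.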
    def matcher(self, source, target, matrix, i, j):
        from similarity.ngram import NGram
        twogram = NGram(2)

        sim_score = 1 - twogram.distance(source, target)
        matrix[i, j] = sim_score

        return matrix
def test_similarity():
    from similarity.ngram import NGram
    twogram = NGram(2)
    print(twogram.distance('ABCD', 'ABTUIO'))

    s1 = 'Adobe CreativeSuite 5 Master Collection from cheap 4zp'
    s2 = 'Adobe CreativeSuite 5 Master Collection from cheap d1x'
    fourgram = NGram(4)
    print(fourgram.distance(s1, s2))
    # print(twogram.distance(s1, s2))

    # s2 = 'Adobe CreativeSuite 5 Master Collection from cheap 4zp'
    # print(fourgram.distance(s1, s2))
    #
    # print(fourgram.distance('ABCD', 'ABTUIO'))

    print(1 - fourgram.distance(s1, s2))
Example #3
def compute_similarity_ngram(word1, word2, n):
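    # NOTE: NGram.distance() returns a normalized distance in [0, 1]; despite
    # this function's name, callers get a distance (use 1 - sim for similarity).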
    ngram = NGram(n)
    sim = ngram.distance(word1, word2)
    # print(sim)
    return sim
Example #4
DEBUG_MODE = False

from similarity.ngram import NGram
twogram = NGram(2)


def matcher_name(src, tar, function):
    sim_score = 1 - function.distance(src, tar)
    return sim_score


import pandas as pd
import numpy as np


def matcher_name_matrix(srcs, tars, function=twogram):
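    # Build a pairwise similarity matrix: one row per source string, one column per target string.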
    sim_matrix = np.zeros((len(srcs), len(tars)))

    for i, s in enumerate(srcs):
        for j, t in enumerate(tars):
            sim_score = 1 - function.distance(s, t)
            sim_matrix[i, j] += sim_score

    sim_scores = pd.DataFrame(data=sim_matrix, columns=tars, index=srcs)
    return sim_scores


import math


def sigmoid(x):
    return 1 / (1 + math.exp(-x))
Example #5
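    # Snippet from a question-answer matching class. Assumes `import jieba`,
    # the strsim measure classes used below, a module-level `folder_path`,
    # and a project-defined CharSub substitution-cost class.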
    def similarity(self, question, answer):

        stopword = self.read_from(folder_path + '上证专用停用词.txt')  # SSE-specific stopword list
        stopwords = []
        for sw in stopword:
            stopwords.append(sw.strip('\n').strip(' '))
        # print(stopwords)

        meaningful_words1 = []
        meaningful_words2 = []

        words2 = jieba.cut(str(question))
        words3 = jieba.cut(str(answer))
        for word in words2:
            if word not in stopwords:
                meaningful_words1.append(word)
        for word in words3:
            if word not in stopwords:
                meaningful_words2.append(word)
        s2 = ''.join(meaningful_words1)
        # print(s2)
        s3 = ''.join(meaningful_words2)
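        # One instance of each string distance/similarity measure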
        a1 = Cosine(1)
        b1 = Damerau()
        c1 = Jaccard(1)
        d1 = JaroWinkler()
        e1 = Levenshtein()
        f1 = LongestCommonSubsequence()
        g1 = MetricLCS()
        h1 = NGram(2)
        i1 = NormalizedLevenshtein()
        j1 = OptimalStringAlignment()
        k1 = QGram(1)
        l1 = SorensenDice(2)
        m1 = WeightedLevenshtein(character_substitution=CharSub())

        line_sim = []

        cos_s = a1.similarity(s2, s3)
        line_sim.append(cos_s)
        cos_d = a1.distance(s2, s3)
        line_sim.append(cos_d)
        dam = b1.distance(s2, s3)
        line_sim.append(dam)
        jac_d = c1.distance(s2, s3)
        line_sim.append(jac_d)
        jac_s = c1.similarity(s2, s3)
        line_sim.append(jac_s)
        jar_d = d1.distance(s2, s3)
        line_sim.append(jar_d)
        jar_s = d1.similarity(s2, s3)
        line_sim.append(jar_s)
        lev = e1.distance(s2, s3)
        line_sim.append(lev)
        lon = f1.distance(s2, s3)
        line_sim.append(lon)
        met = g1.distance(s2, s3)
        line_sim.append(met)
        ngr = h1.distance(s2, s3)
        line_sim.append(ngr)
        nor_d = i1.distance(s2, s3)
        line_sim.append(nor_d)
        nor_s = i1.similarity(s2, s3)
        line_sim.append(nor_s)
        opt = j1.distance(s2, s3)
        line_sim.append(opt)
        qgr = k1.distance(s2, s3)
        line_sim.append(qgr)
        sor_d = l1.distance(s2, s3)
        line_sim.append(sor_d)
        sor_s = l1.similarity(s2, s3)
        line_sim.append(sor_s)
        wei = m1.distance(s2, s3)
        line_sim.append(wei)

        return line_sim
Example #6
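# Assumed imports, not shown in the original snippet: the measures below come
# from the textdistance package, except the final "strsim function" block, e.g.
# from textdistance import (Hamming, MLIPNS, JaroWinkler, Jaro, Jaccard,
#     Sorensen, Tversky, Overlap, Cosine, RatcliffObershelp, EntropyNCD,
#     BZ2NCD, LZMANCD, ZLIBNCD, Prefix, Postfix)
# from similarity.normalized_levenshtein import NormalizedLevenshtein
# from similarity.metric_lcs import MetricLCS
# from similarity.ngram import NGram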
def main():

    v1 = 'text'
    v2 = 'text'

    # -----------------------------------------------Edit based ------------------------------------------------------
    print(
        "-------------------------------- Edit based ----------------------------------"
    )
    print("------- HAMMING ---------")
    ed = Hamming()
    # The return value is a float between 0 and 1, where 0 means totally different and 1 means equal.
    print("Hamming similarity: ", ed.normalized_similarity(v1, v2))

    print("\n-------- MLIPNS --------")
    ed = MLIPNS()
    print("MLIPNS similarity: ", ed.similarity(v1, v2))

    print("\n-------- JaroWinkler --------")
    ed = JaroWinkler()
    print("JaroWinkler similarity: ", ed.similarity(v1, v2))

    print("\n-------- Jaro --------")
    ed = Jaro()
    print("Jaro similarity: ", ed.similarity(v1, v2))

    # ----------------------------------------------- Token based ------------------------------------------------------

    print(
        "-------------------------------- Token based ----------------------------------"
    )
    print("\n-------- JACCARD --------")
    ed = Jaccard()
    print("JACCARD similarity: ", ed.similarity(v1, v2))
    # considers the number of letters

    print("\n-------- Sorensen --------")
    ed = Sorensen()
    print("Sorensen similarity: ", ed.similarity(v1, v2))

    print("\n-------- Tversky --------")
    ed = Tversky()
    print("Tversky similarity: ", ed.similarity(v1, v2))

    print("\n-------- Overlap --------")
    ed = Overlap()
    print("Overlap similarity: ", ed.similarity(v1, v2))

    print("\n-------- Cosine --------")
    ed = Cosine()
    print("Cosine similarity: ", ed.similarity(v1, v2))

    # ----------------------------------------------- Sequence based ------------------------------------------------------
    print(
        "-------------------------------- Sequence based ----------------------------------"
    )

    print("\n-------- RatcliffObershelp --------")
    ed = RatcliffObershelp()
    print("RatcliffObershelp similarity: ", ed.similarity(v1, v2))

    # ----------------------------------------------- Compression based ------------------------------------------------------
    print(
        "-------------------------------- Compression based ----------------------------------"
    )

    print("\n-------- EntropyNCD --------")
    ed = EntropyNCD()
    print("EntropyNCD similarity: ", ed.similarity(v1, v2))

    print("\n-------- BZ2NCD --------")
    ed = BZ2NCD()
    print("BZ2NCD similarity: ", ed.similarity(v1, v2))

    print("\n-------- LZMANCD --------")
    ed = LZMANCD()
    print("LZMANCD similarity: ", ed.similarity(v1, v2))

    print("\n-------- ZLIBNCD --------")
    ed = ZLIBNCD()
    print("ZLIBNCD similarity: ", ed.similarity(v1, v2))

    # ----------------------------------------------- Simple based ------------------------------------------------------
    print(
        "-------------------------------- Simple based ----------------------------------"
    )

    print("\n-------- Prefix --------")
    ed = Prefix()
    print("Prefix similarity: ", ed.similarity(v1, v2))

    print("\n-------- Postfix --------")
    ed = Postfix()
    print("Postfix similarity: ", ed.similarity(v1, v2))

    # ----------------------------------------------- strsim function ------------------------------------------------------
    print(
        "-------------------------------- strsim function ----------------------------------"
    )

    print("\n-------- Normalized Levenshtein --------")
    ed = NormalizedLevenshtein()
    print("Normalized Levenshtein similarity: ", ed.similarity(v1, v2))

    print("\n-------- MetricLCS --------")
    ed = MetricLCS()
    print("MetricLCS similarity: ", ed.distance(v1, v2))

    print("\n-------- NGram --------")
    ed = NGram()
    print("NGram similarity: ", ed.distance(v1, v2))

    print("\n-------- Sorensen --------")
    ed = Sorensen()
    print("Sorensen similarity: ", ed.similarity(v1, v2))
Example #7
def similaridade(function_name, string_1, string_2):

    if function_name == 'Hamming':
        ed = Hamming()
        return ed.normalized_similarity(string_1, string_2)

    elif function_name == 'MLIPNS':
        ed = MLIPNS()
        return ed.similarity(string_1, string_2)

    elif function_name == 'JaroWinkler':
        ed = JaroWinkler()
        return ed.similarity(string_1, string_2)

    elif function_name == 'Jaro':
        ed = Jaro()
        return ed.similarity(string_1, string_2)

    elif function_name == 'Jaccard':
        ed = Jaccard()
        return ed.similarity(string_1, string_2)

    elif function_name == 'Sorensen':
        ed = Sorensen()
        return ed.similarity(string_1, string_2)

    elif function_name == 'Tversky':
        ed = Tversky()
        return ed.similarity(string_1, string_2)

    elif function_name == 'Overlap':
        ed = Overlap()
        return ed.similarity(string_1, string_2)

    elif function_name == 'Cosine':
        ed = Cosine()
        return ed.similarity(string_1, string_2)

    elif function_name == 'RatcliffObershelp':
        ed = RatcliffObershelp()
        return ed.similarity(string_1, string_2)

    elif function_name == 'EntropyNCD':
        ed = EntropyNCD()
        return ed.similarity(string_1, string_2)

    elif function_name == 'BZ2NCD':
        ed = BZ2NCD()
        return ed.similarity(string_1, string_2)

    elif function_name == 'LZMANCD':
        ed = LZMANCD()
        return ed.similarity(string_1, string_2)

    elif function_name == 'ZLIBNCD':
        ed = ZLIBNCD()
        return ed.similarity(string_1, string_2)

    elif function_name == 'Prefix':
        ed = Prefix()
        return ed.similarity(string_1, string_2)

    elif function_name == 'Postfix':
        ed = Postfix()
        return ed.similarity(string_1, string_2)

    elif function_name == 'NormalizedLevenshtein':
        ed = NormalizedLevenshtein()
        return ed.similarity(string_1, string_2)

    elif function_name == 'MetricLCS':
        ed = MetricLCS()
        # MetricLCS exposes only a distance; convert so this branch also returns a similarity
        return 1 - ed.distance(string_1, string_2)

    elif function_name == 'NGram':
        ed = NGram()
        # NGram exposes only a distance; convert so this branch also returns a similarity
        return 1 - ed.distance(string_1, string_2)

    elif function_name == 'StrCmp95':
        ed = StrCmp95()
        # use similarity() so the return value matches the other branches
        return ed.similarity(string_1, string_2)
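A more compact alternative to the if/elif chain above (a sketch, not part of the original example; it assumes the same measure classes are importable): build the name-to-scorer table once and dispatch through it.

SCORERS = {
    'Hamming': Hamming().normalized_similarity,
    'JaroWinkler': JaroWinkler().similarity,
    'Cosine': Cosine().similarity,
    # distance-only measures are wrapped so every entry yields a similarity
    'MetricLCS': lambda a, b: 1 - MetricLCS().distance(a, b),
    'NGram': lambda a, b: 1 - NGram().distance(a, b),
}

def similaridade(function_name, string_1, string_2):
    return SCORERS[function_name](string_1, string_2)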
Example #8
from difflib import SequenceMatcher
from similarity.ngram import NGram
from similarity.metric_lcs import MetricLCS
import json
import logging
import os
import numpy as np
import pandas as pd

twogram = NGram(2)
fourgram = NGram(4)
metric_lcs = MetricLCS()
def build_local_similarity_matrix(source_schema, target_schema):
    matrix = np.zeros((len(source_schema), len(target_schema)))

    for i in range(len(source_schema)):
        for j in range(len(target_schema)):
            # bigram similarity between attribute names; SequenceMatcher is a
            # commented-out alternative
            sim_score = 1 - twogram.distance(source_schema[i], target_schema[j])
            # matrix[i, j] = np.int(100 * SequenceMatcher(None, source_schema[i], target_schema[j]).ratio())
            matrix[i, j] = sim_score

    return matrix
def pre_clustering(stats_path, source, target, instance_matching_output):
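    # Relies on the imports above plus project helpers defined elsewhere:
    # find_datatype(), matcher_name(), and dh.compute_sets()/dh.cdf().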
    twogram = NGram(2)
    threshold = 0.7
    weights = [0.5, 0.5]

    for src_table in source:
        src_path = stats_path + src_table + '.json'
        with open(src_path) as f:
            src_data = json.load(f)
        src_attrs = list(src_data.keys())

        for tar_table in target:
            print('-----')
            print(src_table, tar_table)

            tar_path = stats_path + tar_table + '.json'
            with open(tar_path) as f:
                tar_data = json.load(f)
            tar_attrs = list(tar_data.keys())

            sim_matrix = np.zeros((len(src_data), len(tar_data)))

            for i in range(len(src_attrs)):
                src_vals = src_data[src_attrs[i]]

                src_datatype = find_datatype(src_vals)

                for j in range(len(tar_attrs)):
                    tar_vals = tar_data[tar_attrs[j]]

                    tar_datatype = find_datatype(tar_vals)
                    print(src_attrs[i], tar_attrs[j])

                    if src_datatype == 'str' and tar_datatype == 'str':

                        n_a, n_b, D, n_D, t, n_t = dh.compute_sets(
                            src_vals, tar_vals, threshold, matcher_name,
                            twogram)
                        U_set = dh.cdf(n_t, n_a, n_b, n_D)
                    else:
                        U_set = 0.0

                    name_sim = matcher_name(src_attrs[i], tar_attrs[j],
                                            twogram)

                    print(U_set, name_sim)

                    if U_set > 1.0:
                        U_set = 1.0
                    sim_matrix[i, j] = U_set * weights[0] + name_sim * weights[1]

                    df_sim_matrix = pd.DataFrame(data=sim_matrix,
                                                 columns=tar_attrs,
                                                 index=src_attrs)

            filename = instance_matching_output + src_table + '/'

            if not os.path.exists(filename):
                os.makedirs(filename)

            filename += '%s||%s.csv' % (src_table, tar_table)
            df_sim_matrix.to_csv(filename, sep=',', encoding='utf-8')
            msg = 'Matrix saved for src=%s tar=%s to %s' % (
                src_table, tar_table, filename)
            logging.info(msg)

    return
def select_datasources():
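    # Match dataset directory names against cleaned metadata file names by
    # bigram similarity, then collect the matching .csv/.json files per source.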
    import os
    ls = os.listdir('./metadata')
    ls_dict = {}
    for st in ls:
        st = st.replace('-', ' ')
        st = st.replace('.', ' ')
        st = st.split(' ')
        st = st[1:-1]
        st = ' '.join(st)
        st = st.lower()
        ls_dict[st] = {'csv': [], 'json': []}

    from similarity.ngram import NGram
    twogram = NGram(2)
    metadata_sources = ls_dict.keys()

    for root, dirs, files in os.walk("../thesis_project_dataset"):
        curr_dir_path = root.split("/")
        curr_dir_name = curr_dir_path[-1]
        for file in files:
            filename, file_extension = os.path.splitext(file)
            dataset = root.split('/')
            dataset = dataset[2:3]
            if len(dataset) != 0 and dataset[0] != '.git':
                dataset = dataset[0]
                dataset = dataset.replace('-', ' ')

                found = False
                found_val = None
                curr_score = 0
                found_datasource = None

                if dataset in ls_dict:
                    found = True
                    found_val = ls_dict[dataset]
                    found_datasource = dataset
                    curr_score = 1

                if not found:
                    curr_score = 0
                    for metadata_source in metadata_sources:
                        dist = 1 - twogram.distance(dataset, metadata_source)
                        if dist < 0.85:
                            print('skip', root + '/' + file)
                            continue
                        if dist > curr_score:
                            found = True
                            found_val = ls_dict[metadata_source]
                            curr_score = dist
                            found_datasource = metadata_source

                            print('found', found, found_datasource, curr_score,
                                  file_extension, root + '/' + file)

                if not found:
                    continue

                if file_extension == '.json':
                    found_val['json'].append((root + '/' + file, curr_score))

                if file_extension == '.csv':
                    found_val['csv'].append((root + '/' + file, curr_score))

    print(ls_dict)
    for key in ls_dict:
        val = ls_dict[key]
        val['csv'] = sorted(val['csv'], key=lambda x: x[1])
        val['json'] = sorted(val['json'], key=lambda x: x[1])

    import json
    with open('datasource_and_metadata.json', 'w') as fp:
        json.dump(ls_dict, fp, sort_keys=True, indent=2)
Example #11
from pathlib import Path
import csv
import pandas as pd
from similarity.ngram import NGram
from sklearn import metrics

cpath = open(
    "/Users/shwetha/Desktop/Desktop/2019S2-COMP90049_proj1-data/candidates.txt",
    "r")
dpath = open(
    "/Users/shwetha/Desktop/Desktop/2019S2-COMP90049_proj1-data/dict.txt", "r")
bpath = open(
    "/Users/shwetha/Desktop/Desktop/2019S2-COMP90049_proj1-data/blends_org.txt",
    "r")

wordscheck = []
twogram = NGram(2)

candidates = csv.reader(cpath, dialect="excel")
dictionary = csv.reader(dpath, dialect="excel")

blends = pd.read_table(bpath, names=("blends", "w1", "w2"))
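# Notebook-style inspection calls follow; outside a notebook/REPL they compute
# and discard their results.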

blends.head()

blends.tail()

blends.head(20)

dictwords = list(dictionary)

blendwords = list(blends)
def test_instance_matching():
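    # Instance-level matching: compare every source value against every target
    # value and accumulate bigram similarities above the threshold per
    # attribute pair.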
    import numpy as np
    import pandas as pd
    tar = [['attr1', 'attr2', 'attr3'], ['aaaa', 'bbb', 'ccc'],
           ['xxx', 'yyyy', 'zzz']]
    # y = [['attr4', 'attr5', 'attr6'], ['xxx', 'yyy', 'zzz'], ['aaa', 'bbb', 'ccc']]
    src = [['attr4'], ['xxx'], ['aaa'], ['mmm']]

    data_tar = np.array([np.array(xi) for xi in tar])
    df_tar = pd.DataFrame(data=data_tar[1:, 0:], columns=data_tar[0, 0:])

    data_src = np.array([np.array(xi) for xi in src])
    df_src = pd.DataFrame(data=data_src[1:, 0:], columns=data_src[0, 0:])

    print(df_tar.to_string())
    print(df_src.to_string())

    schema_tar = list(df_tar.columns.values)
    schema_src = list(df_src.columns.values)

    print(schema_tar)
    print(schema_src)

    src_values = []
    tar_values = []
    src_val_len = 0
    tar_val_len = 0
    for attr in schema_src:
        src_values.extend(list(df_src[attr]))
        src_val_len = len(list(df_src[attr]))

    for attr in schema_tar:
        tar_values.extend(list(df_tar[attr]))
        tar_val_len = len(list(df_tar[attr]))

    from similarity.ngram import NGram
    twogram = NGram(2)

    match_threshold = 0.6
    sim_matrix = np.zeros((len(schema_src), len(schema_tar)))

    for i in range(len(src_values)):
        src_value = src_values[i]
        src_ind = i // src_val_len
        src_attr = schema_src[src_ind]

        for j in range(len(tar_values)):
            tar_value = tar_values[j]
            tar_ind = j // tar_val_len
            tar_attr = schema_tar[tar_ind]

            sim_score = 1 - twogram.distance(str(src_value), str(tar_value))

            if str(src_value) == 'None' or str(tar_value) == 'None':
                sim_score = 0

            if sim_score > match_threshold:
                sim_matrix[src_ind, tar_ind] += sim_score
                print('sim_score >= ', match_threshold, ': ', src_attr,
                      tar_attr, src_value, tar_value, sim_score)

    df_sim_matrix = pd.DataFrame(data=sim_matrix,
                                 columns=schema_tar,
                                 index=schema_src)
    print(df_sim_matrix.to_string())
Example #13
 def fourgram(self, s0, s1):
     # NOTE: NGram(3) actually computes a trigram distance; use NGram(4) for true 4-grams
     fourgram = NGram(3)
     # print('Fourgram similarity "%s" vs "%s"' % (s0, s1))
     return 1 - fourgram.distance(s0, s1)
Example #14
from similarity.levenshtein import Levenshtein
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.damerau import Damerau
from similarity.optimal_string_alignment import OptimalStringAlignment
from similarity.jarowinkler import JaroWinkler
from similarity.longest_common_subsequence import LongestCommonSubsequence
from similarity.metric_lcs import MetricLCS
from similarity.ngram import NGram
from similarity.qgram import QGram
from similarity.cosine import Cosine
from similarity.jaccard import Jaccard
from similarity.sorensen_dice import SorensenDice
from scipy.spatial.distance import euclidean, cosine, cityblock
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import numpy as np

# Initialized once at import time
levenshtein = Levenshtein()
norm_levenshtein = NormalizedLevenshtein()
damerau = Damerau()
optimal_string_alignment = OptimalStringAlignment()
jarowinkler = JaroWinkler()
lcs = LongestCommonSubsequence()
metric_lcs = MetricLCS()
ngram = NGram()
qgram = QGram()
dice = SorensenDice()
cos = Cosine(5)
jaccard = Jaccard(5)

similarity_functions = [
    norm_levenshtein.similarity, lambda a, b: 1 - metric_lcs.distance(a, b),
    lambda a, b: 1 - ngram.distance(a, b), cos.similarity, dice.similarity
]
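# Each entry maps two strings to a similarity in [0, 1]; distance-only measures
# (MetricLCS, NGram) are wrapped as 1 - distance.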


def mono_vector0(tup1, tup2):

    str1 = ' '.join(tup1).lower()
    str2 = ' '.join(tup2).lower()