예제 #1
0
    def _get_distance(self, a, b, metric='ed', max_len=None):
        """
        Compute the edit distance between two token lists a and b.
        """
        rt = np.nan
        if metric == 'ed':
            lev = Levenshtein()
            rt = lev.distance(a, b)
        elif metric == 'dtw':

            def dist_func(x, y):
                return 0 if x == y else 1

            d, mat_cost, mat_acc_cost, path = dtw_func(a, b, dist=dist_func)
            rt = d
        elif metric == 'lcs':
            lcs = LongestCommonSubsequence()
            rt = lcs.distance(a, b)
        elif metric == 'mylcs':
            lcs = LongestCommonSubsequence()
            rt = lcs.distance(a, b) / max_len
        elif metric == 'mlcs':  # metric LCS
            metric_lcs = MetricLCS()
            rt = metric_lcs.distance(a, b)
        else:
            raise NotImplementedError(
                "Metric not implemented: {}".format(metric))

        return rt
예제 #2
0
def find_best_candidate(ciphertext):
    """Score every candidate plaintext against ``ciphertext``.

    Computes the Levenshtein distance between each entry of the
    module-level ``message_candidates`` and ``ciphertext``.

    Args:
        ciphertext: the sequence each candidate is compared with.

    Returns:
        A list of ``[index, distance]`` pairs, one per candidate, in the
        order of ``message_candidates``. Despite the function name no
        candidate is selected here; that is left to the caller.
    """
    # One scorer is enough: build it once instead of once per candidate.
    levenshtein = Levenshtein()
    return [
        [index, levenshtein.distance(plaintext_str, ciphertext)]
        for index, plaintext_str in enumerate(message_candidates)
    ]
예제 #3
0
def levenshtein(keyword, domain):
    """Return the Levenshtein distance between two values.

    Args:
        keyword: first value handed to ``Levenshtein.distance``.
        domain: second value handed to ``Levenshtein.distance``.

    Returns:
        Whatever ``Levenshtein().distance(keyword, domain)`` returns
        (documented by the original author as an int).

    """
    return Levenshtein().distance(keyword, domain)
예제 #4
0
def count_matches(t, L, c, freq_replacements):
    """Score a frequency-analysis decryption attempt against every candidate.

    The first ``L`` items of ``c`` are split into ``t`` interleaved
    substrings (items ``i, i + t, i + 2t, ...``). Each substring is passed
    through ``frequency_analysis`` and the results are interleaved back
    into one sequence of length ``L``. That sequence is then compared with
    the numeric form of every entry in the module-level
    ``message_candidates``.

    Args:
        t: number of interleaved substrings (the slicing stride).
        L: length of the text to analyse and to reassemble.
        c: the ciphertext sequence.
        freq_replacements: passed unchanged to ``frequency_analysis``.

    Returns:
        A list of Levenshtein distances, one per entry of
        ``message_candidates`` and in the same order.
    """
    # Analyse each interleaved substring and write it straight back into
    # its own positions of the reassembled text.
    new_text = [None] * L
    for offset in range(t):
        substring = c[offset:L:t]
        new_text[offset::t] = frequency_analysis(substring,
                                                 freq_replacements)

    # Compare with every candidate rather than a hard-coded first five, so
    # shorter or longer candidate lists are handled as well.
    levenshtein = Levenshtein()
    return [
        levenshtein.distance(new_text, convert_to_numbers(message))
        for message in message_candidates
    ]
    def test4_custommetric(self):
        """calc_string_similarity accepts a caller-supplied metric callable."""
        uri_a = "https://test.me/A"
        uri_b = "https://test.me/B"
        labels = {
            uri_a: "Hello this is a test string.",
            uri_b: "Hello this is another test string.",
        }
        # The second label differs from the first by the six inserted
        # characters "nother".
        expected = 6

        actual = calc_string_similarity(uri_1=uri_a,
                                        uri_2=uri_b,
                                        label_dict=labels,
                                        metric=Levenshtein().distance)

        assert actual == expected
예제 #6
0
import re
import nltk
import gensim
from nltk.stem.porter import *
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from strsimpy.levenshtein import Levenshtein
from collections import Counter
'''Global Parameters'''
# Module-level Levenshtein instance, created once when the module is imported.
levenshtein = Levenshtein()


class TopicModel():
    '''---------Func: To preprocess and stem the language data-----------'''
    def preprocess(text):
        """Tokenise ``text`` and return the stemmed, lemmatised tokens.

        Tokens come from ``gensim.utils.simple_preprocess``. A token is
        kept only when it is not in gensim's ``STOPWORDS`` and is longer
        than three characters. Each kept token is lemmatised as a verb
        (``pos='v'``) and then stemmed with the English Snowball stemmer.

        Args:
            text: the raw text to process.

        Returns:
            A list of processed tokens in their original order.
        """
        # NOTE(review): there is no ``self`` parameter and no
        # ``@staticmethod`` decorator visible, so this only works when
        # called through the class - confirm against the callers.
        stemmer = SnowballStemmer("english")
        # Build the lemmatizer once instead of once per token.
        lemmatizer = WordNetLemmatizer()
        return [
            stemmer.stem(lemmatizer.lemmatize(token, pos='v'))
            for token in gensim.utils.simple_preprocess(text)
            if token not in gensim.parsing.preprocessing.STOPWORDS
            and len(token) > 3
        ]

    '''---------Func: To convert string into vectors-----------'''
    '''- Count the characters in word
       - Precomputes a set of the different characters
       - Precomputes the "length" of the word vector
예제 #7
0
class kb2():
    """Similarity tables for port texts read from Excel workbooks.

    ``dfDict`` holds one DataFrame per name in ``sheetNameList``. Columns
    are keys learned from the '已配置' sheet of a workbook; rows are keys
    read from other sheets. Each cell holds the Levenshtein distance
    between its row key and its column key.

    Rough English glosses of the sheet/column names (for readers only):
    '开出' output, '开入' input, '匹配' match, '已配置' configured,
    '所有发送' all sent, '所有接收' all received, '端子描述' terminal
    description, '端子引用' terminal reference.
    """
    sheetNameList = ['开出', '开入', '匹配']
    levenshtein = Levenshtein()
    normalized_levenshtein = NormalizedLevenshtein()
    # Not referenced by any method of this class.
    funcList = [levenshtein, normalized_levenshtein]

    def __init__(self, matchList=None, outPortList=None, inPortList=None):
        """Create an empty knowledge base.

        Args:
            matchList: list that learn_excel() appends Match objects to.
                When omitted a new list is created for this instance.
            outPortList: accepted for compatibility; not used.
            inPortList: accepted for compatibility; not used.
        """
        # The previous default was one shared ``[Match]`` list literal, so
        # every instance appended to the same list. Build it per instance
        # and keep its initial content unchanged.
        self.matchList = [Match] if matchList is None else matchList
        # NOTE(review): both dicts are seeded with the *type* ``str`` as a
        # key, which looks like a misplaced type hint - confirm before
        # changing it.
        self.portListDict = {str: []}
        self.portDict = {str: Port}
        self.dfDict = {}
        for sheetName in self.sheetNameList:
            self.dfDict[sheetName] = DataFrame()  # in,out,match

    def learn_folder(
            self,
            path2folder='..\\excel\\learn/220-母线&线路-第一套合并单元&第一套合并单元'):
        """Call learn_excel() on every file matched under ``path2folder``.

        Args:
            path2folder: prefix that '**/*.xls' is appended to in order to
                build the glob pattern.
        """
        for filename in glob.iglob(path2folder + '**/*.xls', recursive=True):
            if filename.endswith((".xls", ".csv")):
                self.learn_excel(filename)

    def learn_excel(self, path2excel):
        """Learn ports and matches from the '已配置' sheet of a workbook.

        Loads the sheet once for the '开出' columns and once for the
        '开入' columns via load_excel(). Then, for every row, it appends a
        Match to ``matchList`` and makes sure the '匹配' frame has a column
        for that row's combined key.

        Args:
            path2excel: path of the workbook to read.
        """
        # NOTE(review): ``df`` is written as a module-level global here,
        # as in the original code; kept in case other code reads it.
        global df
        self.load_excel(path2excel, sheetName='已配置', inOut='开出')
        self.load_excel(path2excel, sheetName='已配置', inOut='开入')
        # Close the workbook handle as soon as the sheet is parsed.
        with pd.ExcelFile(path2excel) as workbook:
            sheet = workbook.parse('已配置')
        try:
            for _, record in sheet.iterrows():
                outPort = Port(record['开出端子描述'], record['开出端子引用'])
                inPort = Port(record['开入端子描述'], record['开入端子引用'])
                self.matchList.append(Match(outPort, inPort))
                df = self.dfDict.get('匹配', DataFrame())
                key2 = (record['开出端子描述'] + record['开出端子引用'] + '匹配' +
                        record['开入端子描述'] + record['开入端子引用'])
                # Keeps the column if present, otherwise adds an empty one.
                df[key2] = df.get(key2)
                self.dfDict['匹配'] = df
        except RuntimeError:
            print(record)

    def load_excel(
            self,
            path2excel='..\\excel\\learn/220-母线&线路-第一套合并单元&第一套合并单元/赤厝.xls',
            sheetName='所有发送',
            inOut='开出'):
        """Read one sheet and register every port found in it.

        For sheet '已配置' each port key becomes a column of the ``inOut``
        frame. For any other sheet each new port key becomes a row holding
        its Levenshtein distance to every existing column key.

        Args:
            path2excel: path of the workbook to read.
            sheetName: name of the sheet to parse.
            inOut: '开出' or '开入'; selects the column prefix in the
                sheet and the frame in ``dfDict``.
        """
        # NOTE(review): module-level global kept from the original code.
        global df
        with pd.ExcelFile(path2excel) as workbook:
            sheet = workbook.parse(sheetName)
        key: str = path2excel + sheetName + inOut
        portList = self.portListDict.get(key, [])
        try:
            for _, record in sheet.iterrows():
                description = record[inOut + '端子描述']
                reference = record[inOut + '端子引用']
                port = Port(description, reference)
                portList.append(port)
                key2 = description + inOut + reference
                self.portDict[key2] = port
                df = self.dfDict.get(inOut, DataFrame())
                if sheetName == '已配置':
                    df[key2] = df.get(key2)
                    self.dfDict[inOut] = df
                else:  # new
                    df = self.distance(key2, df, inOut)
            self.portListDict[key] = portList
        except RuntimeError:
            print(record)

    def load_test(
            self,
            path2excel='..\\excel\\learn/220-母线&线路-第一套合并单元&第一套合并单元/赤厝.xls'):
        """Score every output/input port pair of a workbook.

        Loads the '所有发送' and '所有接收' sheets, then adds one row to
        the '匹配' frame for every (output port, input port) combination.

        Args:
            path2excel: path of the workbook to read.
        """
        # NOTE(review): module-level global kept from the original code.
        global df
        self.load_excel(path2excel, sheetName='所有发送', inOut='开出')
        self.load_excel(path2excel, sheetName='所有接收', inOut='开入')
        df = self.dfDict.get('匹配', DataFrame())
        for outPort in self.portListDict[path2excel + '所有发送' + '开出']:
            for inPort in self.portListDict[path2excel + '所有接收' + '开入']:
                key2 = (outPort.description + outPort.reference + '匹配' +
                        inPort.description + inPort.reference)
                # Keep the grown frame for the next pair. Previously each
                # call started again from the initial frame, so only the
                # last pair's row survived.
                df = self.distance(key2, df, '匹配')

    def distance(self, key2, df, inOut):
        """Add ``key2`` as a row of ``df`` and fill in its distances.

        If ``key2`` is not yet in the index, a row is appended and each
        cell is set to the Levenshtein distance between the column label
        and ``key2``. The frame is stored under ``dfDict[inOut]``.

        Args:
            key2: row label to add.
            df: frame to extend; the caller's object is not modified when
                a row is added, because reindex() returns a new frame.
            inOut: key of ``dfDict`` to store the result under.

        Returns:
            The frame that was stored.
        """
        if key2 not in df.index:
            df = df.reindex(df.index.tolist() + [key2])
            for done in df:
                # .loc writes into the frame itself; the chained form
                # ``df[done][key2] = ...`` may write to a temporary copy.
                df.loc[key2, done] = self.levenshtein.distance(done, key2)
        self.dfDict[inOut] = df
        return df

    def main(
            self,
            path2excel='..\\excel\\learn/220-母线&线路-第一套合并单元&第一套合并单元/kb2.xlsx'):
        """Learn from one fixed workbook, score it, and save every frame.

        Args:
            path2excel: path of the workbook the frames are written to,
                one sheet per key of ``dfDict``.
        """
        start_time = time.time()

        self.learn_excel(
            '..\\excel\\learn/220-母线&线路-第一套合并单元&第一套合并单元/赤厝.xls')
        self.load_test()
        with pd.ExcelWriter(path2excel) as writer:
            for key, df in self.dfDict.items():
                print(key, df)
                df.to_excel(writer, sheet_name=key)
        print("--- %s m ---" % ((time.time() - start_time) / 60))

    def transform(self, multilevelDict):
        """Return a copy of a nested dict with newline-free string keys.

        Every key is converted with ``str`` and has "\\n" removed; values
        that are dicts are processed recursively, other values are kept.

        Args:
            multilevelDict: the (possibly nested) dict to convert.

        Returns:
            A new dict with the cleaned keys.
        """
        return {
            str(key).replace("\n", ""):
            (self.transform(value) if isinstance(value, dict) else value)
            for key, value in multilevelDict.items()
        }
예제 #8
0
def query_boosting(search_str):
    """
    Query boosting algorithm
    """

    #Initializing weights
    weights = {
        "title_si": 0,
        "title_en": 0,
        "artist_si": 0,
        "artist_en": 0,
        "music_si": 0,
        "melody_si": 0,
        "lyricist_si": 0,
        "lyrics": 0
    }

    sinhala = isSinhala(search_str)
    num_words = len(search_str.split(" "))

    ####### The algorithm ########
    if (not sinhala):

        weights["artist_en"] = 1
        weights["title_en"] = 1

    elif (num_words < 3):

        weights["artist_si"] = 1

        comp_str = search_str

        artist, music, lyricist, melody = get_all_lists()

        lev = Levenshtein()

        for a in artist:
            dist = lev.distance(a, comp_str)
            if (dist <= 4):
                weights["artist_si"] = 5
                break

        for m in music:
            dist = lev.distance(m, comp_str)
            if (dist <= 4):
                if (weights["artist_si"] == 5):
                    weights["music_si"] = 0
                else:
                    weights["music_si"] = 5

                break

        for l in lyricist:
            dist = lev.distance(l, comp_str)
            if (dist <= 4):
                if (weights["artist_si"] == 5):
                    weights["lyricist_si"] = 0
                else:
                    weights["lyricist_si"] = 5

                break

        for m in melody:
            dist = lev.distance(m, comp_str)
            if (dist <= 4):
                if (weights["artist_si"] == 5):
                    weights["melody_si"] = 0
                else:
                    weights["melody_si"] = 5

                break

    elif (5 > num_words >= 3):

        weights["lyrics"] = 2
        weights["title_si"] = 5

    elif (num_words >= 5):
        weights["lyrics"] = 3

    # Query attributes building based on weights of each field
    title_si = "title_si^{}".format(weights["title_si"])
    title_en = "title_en^{}".format(weights["title_en"])
    artist_si = "artist_si^{}".format(weights["artist_si"])
    artist_en = "artist_en^{}".format(weights["artist_en"])
    music_si = "music_si^{}".format(weights["music_si"])
    melody_si = "melody_si^{}".format(weights["melody_si"])
    lyricist_si = "lyricist_si^{}".format(weights["lyricist_si"])
    lyrics = "lyrics^{}".format(weights["lyrics"])

    return [
        title_si, title_en, artist_si, artist_en, music_si, melody_si,
        lyricist_si, lyrics
    ]
예제 #9
0
class KnowledgeBase():
    """Similarity tables for port texts, kept separately per text type.

    ``dfDict`` holds one DataFrame for every combination of a name in
    ``sheetNameList`` and a type in ``typeList``. Columns are keys learned
    from the '已配置' sheet of a workbook; rows are keys read from other
    sheets. Each cell holds the ``get2txt_similarity`` distance between
    its row key and its column key.

    Rough English glosses of the sheet/column names (for readers only):
    '开出' output, '开入' input, '匹配' match, '描述' description,
    '引用' reference, '已配置' configured, '所有发送' all sent,
    '所有接收' all received, '端子' terminal.
    """
    sheetNameList = ['开出', '开入', '匹配']
    typeList = ['描述', '引用']
    # Distance object used by every comparison in this class.
    get2txt_similarity = QGram(2)
    # Not referenced by any method of this class.
    get2txt_similarityList = {'Levenshtein': Levenshtein(), 'QGram': QGram(2)}

    def __init__(self, matchList=None, outPortList=None, inPortList=None):
        """Create an empty knowledge base.

        Args:
            matchList: list that learn_excel() appends Match objects to.
                When omitted a new list is created for this instance.
            outPortList: accepted for compatibility; not used.
            inPortList: accepted for compatibility; not used.
        """
        # The previous default was one shared ``[Match]`` list literal, so
        # every instance appended to the same list. Build it per instance
        # and keep its initial content unchanged.
        self.matchList = [Match] if matchList is None else matchList
        # NOTE(review): both dicts are seeded with the *type* ``str`` as a
        # key, which looks like a misplaced type hint - confirm before
        # changing it.
        self.portListDict = {str: []}
        self.portDict = {str: Port}
        self.dfDict = {}
        for sheetName in self.sheetNameList:
            for kind in self.typeList:
                self.dfDict[sheetName + kind] = DataFrame()  # in,out,match

    def learn_folder(
            self,
            path2folder='..\\excel\\learn/220-母线&线路-第一套合并单元&第一套合并单元'):
        """Call learn_excel() on every file matched under ``path2folder``.

        Args:
            path2folder: prefix that '**/*.xls' is appended to in order to
                build the glob pattern.
        """
        for filename in glob.iglob(path2folder + '**/*.xls', recursive=True):
            if filename.endswith((".xls", ".csv")):
                self.learn_excel(filename)

    def learn_excel(self, path2excel):
        """Learn ports and matches from the '已配置' sheet of a workbook.

        Loads the sheet through load_excel_sheet() for every type and
        direction. Then, for every row, it appends a Match to
        ``matchList`` and makes sure each '匹配' + type frame has a column
        for that row's combined key.

        Args:
            path2excel: path of the workbook to read.
        """
        # NOTE(review): load_excel_sheet() keys its port list without the
        # type, so these repeated loads append each port once per type.
        for kind in self.typeList:
            for inOut in ['开出', '开入']:
                self.load_excel_sheet(path2excel,
                                      sheetName='已配置',
                                      inOut=inOut,
                                      type=kind)

        # Close the workbook handle as soon as the sheet is parsed.
        with pd.ExcelFile(path2excel) as workbook:
            sheet = workbook.parse('已配置')
        try:
            for _, record in sheet.iterrows():
                outPort = Port(record['开出端子描述'], record['开出端子引用'])
                inPort = Port(record['开入端子描述'], record['开入端子引用'])
                self.matchList.append(Match(outPort, inPort))
                for kind in self.typeList:
                    df2 = self.dfDict.get('匹配' + kind, DataFrame())
                    dfKey = (record['开出端子' + kind] + '匹配' +
                             record['开入端子' + kind])
                    if dfKey not in df2:
                        # get() returns None for the missing key, which
                        # adds an empty column.
                        df2[dfKey] = df2.get(dfKey)
                    self.dfDict['匹配' + kind] = df2
        except RuntimeError:
            print(record)

    def load_excel_sheet(
            self,
            path2excel='..\\excel\\learn/220-母线&线路-第一套合并单元&第一套合并单元/赤厝.xls',
            sheetName='所有发送',
            inOut='开出',
            type='描述'):
        """Read one sheet and register every port found in it.

        For sheet '已配置' each port text becomes a column of the
        ``inOut + type`` frame. For any other sheet each new port text
        becomes a row holding its distance to every existing column key.

        Args:
            path2excel: path of the workbook to read.
            sheetName: name of the sheet to parse.
            inOut: '开出' or '开入'; selects the column prefix in the sheet.
            type: '描述' or '引用'; selects which port text is used as key.
        """
        # NOTE(review): ``df2`` is written as a module-level global here,
        # as in the original code; kept in case other code reads it.
        global df2
        with pd.ExcelFile(path2excel) as workbook:
            sheet = workbook.parse(sheetName)
        key: str = path2excel + sheetName + inOut
        portList = self.portListDict.get(key, [])
        dfName = inOut + type
        try:
            for _, record in sheet.iterrows():
                port = Port(record[inOut + '端子描述'],
                            record[inOut + '端子引用'])
                portList.append(port)
                key2 = record[inOut + '端子' + type]
                self.portDict[key2] = port
                df2 = self.dfDict.get(dfName, DataFrame())
                if sheetName == '已配置':
                    df2[key2] = df2.get(key2)
                elif key2 not in df2.index:  # new
                    df2 = df2.reindex(df2.index.tolist() + [key2])
                    for done in df2:
                        # .loc writes into the frame itself; the chained
                        # form ``df2[done][key2] = ...`` may write to a
                        # temporary copy.
                        df2.loc[key2, done] = (
                            self.get2txt_similarity.distance(done, key2))
                self.dfDict[dfName] = df2
            self.portListDict[key] = portList
        except RuntimeError:
            print(record)

    def load_test(
            self,
            path2excel='..\\excel\\learn/220-母线&线路-第一套合并单元&第一套合并单元/new/赤厝.xls'):
        """Score every output/input port pair of a workbook.

        For each type, loads the '所有发送' and '所有接收' sheets and adds
        one row to the '匹配' + type frame for every (output port, input
        port) combination. Pairs whose distance to a learned key is below
        the type's threshold (2 for '描述', 4 otherwise) are printed and
        written to column 0 of the '匹配recomend' frame.

        Args:
            path2excel: path of the workbook to read.
        """
        # Object dtype so that recommendation strings can be stored in
        # the frame without a dtype conflict.
        recDF = self.dfDict.get(
            '匹配recomend',
            DataFrame(0, index=range(99), columns=range(99), dtype=object))
        i = 0
        for kind in self.typeList:
            self.load_excel_sheet(path2excel,
                                  sheetName='所有发送',
                                  inOut='开出',
                                  type=kind)
            self.load_excel_sheet(path2excel,
                                  sheetName='所有接收',
                                  inOut='开入',
                                  type=kind)
            key = '匹配' + kind
            df = self.dfDict.get(key, DataFrame())
            threshold = 2 if kind == '描述' else 4
            for outPort in self.portListDict[path2excel + '所有发送' + '开出']:
                for inPort in self.portListDict[path2excel + '所有接收' + '开入']:
                    if kind == '描述':
                        key2 = outPort.description + '匹配' + inPort.description
                    else:
                        key2 = outPort.reference + '匹配' + inPort.reference
                    if key2 not in df.index:
                        df = df.reindex(df.index.tolist() + [key2])
                        for done in df:
                            similarity = self.get2txt_similarity.distance(
                                done, key2)
                            df.loc[key2, done] = similarity
                            if similarity < threshold:
                                recommendation = key2 + " ~ " + done
                                print(recommendation)
                                # With .loc a row past the initial 99 is
                                # appended to the frame.
                                recDF.loc[i, 0] = recommendation
                                i = i + 1
                    self.dfDict[key] = df
                    self.dfDict['匹配recomend'] = recDF

    def main(
        self,
        path2excel='..\\excel\\learn/220-母线&线路-第一套合并单元&第一套合并单元/KnowledgeBase.xlsx'
    ):
        """Learn from the default folder, score the test file, save frames.

        Args:
            path2excel: path of the workbook the frames are written to,
                one sheet per key of ``dfDict``.
        """
        self.learn_folder()
        # Only the scoring step is timed.
        start_time = time.time()
        self.load_test()
        print("--- %s s ---" % ((time.time() - start_time)))
        with pd.ExcelWriter(path2excel) as writer:
            for key, df in self.dfDict.items():
                print(key, df)
                df.to_excel(writer, sheet_name=key)

    def transform(self, multilevelDict):
        """Return a copy of a nested dict with newline-free string keys.

        Every key is converted with ``str`` and has "\\n" removed; values
        that are dicts are processed recursively, other values are kept.

        Args:
            multilevelDict: the (possibly nested) dict to convert.

        Returns:
            A new dict with the cleaned keys.
        """
        return {
            str(key).replace("\n", ""):
            (self.transform(value) if isinstance(value, dict) else value)
            for key, value in multilevelDict.items()
        }