def _get_distance(self, a, b, metric='ed', max_len=None):
    """Compute the distance between two token lists a and b under the given metric."""
    rt = np.nan
    if metric == 'ed':
        lev = Levenshtein()
        rt = lev.distance(a, b)
    elif metric == 'dtw':
        # Dynamic time warping with a 0/1 match cost per token pair
        def dist_func(x, y):
            return 0 if x == y else 1

        d, mat_cost, mat_acc_cost, path = dtw_func(a, b, dist=dist_func)
        rt = d
    elif metric == 'lcs':
        lcs = LongestCommonSubsequence()
        rt = lcs.distance(a, b)
    elif metric == 'mylcs':
        # LCS distance normalized by the caller-supplied maximum length
        lcs = LongestCommonSubsequence()
        rt = lcs.distance(a, b) / max_len
    elif metric == 'mlcs':  # metric LCS
        metric_lcs = MetricLCS()
        rt = metric_lcs.distance(a, b)
    else:
        raise NotImplementedError("Metric not implemented: {}".format(metric))
    return rt
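
# For reference, a minimal sketch (assuming the strsimpy package provides the
# classes used above) of how each metric scores a classic string pair; the
# 'dtw' branch depends on the external dtw_func and is omitted here.
from strsimpy.levenshtein import Levenshtein
from strsimpy.longest_common_subsequence import LongestCommonSubsequence
from strsimpy.metric_lcs import MetricLCS

# "kitten" vs "sitting": the LCS is "ittn" (length 4)
print(Levenshtein().distance("kitten", "sitting"))               # 3 edits
print(LongestCommonSubsequence().distance("kitten", "sitting"))  # 5 = 6 + 7 - 2*4
print(MetricLCS().distance("kitten", "sitting"))                 # ~0.43 = 1 - 4/7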
def find_best_candidate(ciphertext):
    candidates = []
    levenshtein = Levenshtein()
    L = 500  # messages are all length 500
    for i, plaintext_str in enumerate(message_candidates):
        distance = levenshtein.distance(plaintext_str, ciphertext)
        # accuracy = (1 - distance / L) * 100
        candidates.append([i, distance])
    return candidates
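
# Hedged usage sketch: rank every candidate and keep the closest one.
# Assumes `message_candidates` (a list of strings) and `ciphertext` exist at
# module level, as the function above already does.
candidates = find_best_candidate(ciphertext)
best_index, best_distance = min(candidates, key=lambda c: c[1])
best_plaintext = message_candidates[best_index]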
def levenshtein(keyword, domain):
    """Compute the Levenshtein distance between a keyword and a domain name.

    Args:
        keyword: Keyword string to compare.
        domain: Domain name string to compare against.

    Returns:
        The Levenshtein distance (number of single-character edits).
    """
    leven = Levenshtein()
    return leven.distance(keyword, domain)
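
# Hedged usage sketch: a simple typosquat check built on the wrapper above.
# The threshold of 2 edits is an illustrative assumption, not from the source.
suspicious = [d for d in ["paypa1.com", "paypal.com", "examp1e.com"]
              if levenshtein("paypal", d.split(".")[0]) <= 2]
# -> ['paypa1.com', 'paypal.com']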
def count_matches(t, L, c, freq_replacements):
    # Divide the ciphertext into t interleaved substrings
    substrings = [0] * t
    for i in range(t):
        substrings[i] = c[slice(i, L, t)]

    # Do frequency analysis on each substring
    updated_substrings = [0] * t
    for i in range(t):
        updated_substrings[i] = frequency_analysis(substrings[i], freq_replacements)

    # Now reassemble the substrings into a single message
    new_text = [None] * L
    for i in range(t):
        new_text[i::t] = updated_substrings[i]

    # Convert candidate messages to numbers so they can be compared
    m = []
    for message in message_candidates:
        m.append(convert_to_numbers(message))

    # Measure the Levenshtein distance from the recovered text to each candidate
    message_distances = []
    levenshtein = Levenshtein()
    for i in range(len(m)):  # was hard-coded to range(5)
        distance = levenshtein.distance(new_text, m[i])
        # accuracy = (1 - distance / L) * 100
        message_distances.append(distance)
        # Earlier exact-match counter, kept for reference:
        # matches_ctr = 0
        # for j in range(500):  # messages are all length 500
        #     if m[i][j] == new_text[j]:
        #         matches_ctr += 1
        # message_matches[i] = matches_ctr

    # Return the per-candidate distances
    return message_distances
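
# Self-contained illustration of the interleave/reassemble trick used above:
# slicing with step t splits the text into t substrings, and extended-slice
# assignment puts them back in their original positions.
c = list("ABCDEFGH")
t = 2
subs = [c[i::t] for i in range(t)]  # ['A','C','E','G'] and ['B','D','F','H']
out = [None] * len(c)
for i in range(t):
    out[i::t] = subs[i]
assert out == c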
def test4_custommetric(self):
    metric = Levenshtein().distance
    uriA = "https://test.me/A"
    uriB = "https://test.me/B"
    str_dict = {
        "https://test.me/A": "Hello this is a test string.",
        "https://test.me/B": "Hello this is another test string."
    }
    result_exp = 6  # "a" -> "another" requires 6 insertions
    result = calc_string_similarity(uri_1=uriA, uri_2=uriB,
                                    label_dict=str_dict, metric=metric)
    assert result == result_exp
import re
import nltk
import gensim
from nltk.stem.porter import *
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from strsimpy.levenshtein import Levenshtein
from collections import Counter

'''Global Parameters'''
levenshtein = Levenshtein()


class TopicModel():

    '''---------Func: To preprocess and stem the language data-----------'''
    @staticmethod  # no instance state is used; the original was missing `self`
    def preprocess(text):
        stemmer = SnowballStemmer("english")
        result = []
        for token in gensim.utils.simple_preprocess(text):
            if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
                result.append(stemmer.stem(WordNetLemmatizer().lemmatize(token, pos='v')))
        return result

    '''---------Func: To convert string into vectors-----------'''
    '''- Count the characters in word
       - Precomputes a set of the different characters
       - Precomputes the "length" of the word vector
    '''
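
# Hedged usage sketch (requires the NLTK 'wordnet' data; exact stems can vary
# with gensim/NLTK versions):
# nltk.download('wordnet')
tokens = TopicModel.preprocess("The cats were running quickly through the gardens")
# e.g. ['cat', 'run', 'quick', 'garden']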
class kb2():
    # Excel sheet names: '开出' = outgoing, '开入' = incoming, '匹配' = match
    sheetNameList = ['开出', '开入', '匹配']
    levenshtein = Levenshtein()
    normalized_levenshtein = NormalizedLevenshtein()
    funcList = [levenshtein, normalized_levenshtein]

    def __init__(self, matchList=None, outPortList=None, inPortList=None):
        # The original defaults ([Match], []) were shared mutable defaults;
        # use None and create fresh containers per instance instead.
        self.matchList = matchList if matchList is not None else []
        self.portListDict = {}
        self.portDict = {}
        self.dfDict = {}
        for sheetName in self.sheetNameList:  # in, out, match
            self.dfDict[sheetName] = DataFrame()

    def learn_folder(self, path2folder=r'..\excel\learn/220-母线&线路-第一套合并单元&第一套合并单元'):
        # '/' added before '**' so the recursive glob starts inside the folder
        for filename in glob.iglob(path2folder + '/**/*.xls', recursive=True):
            if filename.endswith(".xls") or filename.endswith(".csv"):
                self.learn_excel(filename)

    def learn_excel(self, path2excel):
        self.load_excel(path2excel, sheetName='已配置', inOut='开出')
        self.load_excel(path2excel, sheetName='已配置', inOut='开入')
        sheet = pd.ExcelFile(path2excel).parse('已配置')  # '已配置' = already configured
        try:
            for row in sheet.iterrows():
                outPort = Port(row[1]['开出端子描述'], row[1]['开出端子引用'])
                inPort = Port(row[1]['开入端子描述'], row[1]['开入端子引用'])
                match = Match(outPort, inPort)
                self.matchList.append(match)
                df = self.dfDict.get('匹配', DataFrame())
                key2 = (row[1]['开出端子描述'] + row[1]['开出端子引用'] + '匹配'
                        + row[1]['开入端子描述'] + row[1]['开入端子引用'])
                df[key2] = df.get(key2)
                self.dfDict['匹配'] = df
        except RuntimeError:
            print(row[1])

    def load_excel(self,
                   path2excel=r'..\excel\learn/220-母线&线路-第一套合并单元&第一套合并单元/赤厝.xls',
                   sheetName='所有发送', inOut='开出'):
        sheet = pd.ExcelFile(path2excel).parse(sheetName)
        key: str = path2excel + sheetName + inOut
        portList = self.portListDict.get(key, [])
        try:
            for row in sheet.iterrows():
                port = Port(row[1][inOut + '端子描述'], row[1][inOut + '端子引用'])
                portList.append(port)
                key2 = row[1][inOut + '端子描述'] + inOut + row[1][inOut + '端子引用']
                self.portDict[key2] = port
                df = self.dfDict.get(inOut, DataFrame())
                if sheetName == '已配置':
                    df[key2] = df.get(key2)
                else:  # new port: add a row and score it against every known column
                    if key2 not in df.index:
                        df = df.reindex(df.index.tolist() + [key2])
                        for done in df:
                            # for function in strsimpy.functions:
                            df.loc[key2, done] = self.levenshtein.distance(done, key2)
                self.dfDict[inOut] = df
            self.portListDict[key] = portList
        except RuntimeError:
            print(row[1])

    def load_test(self,
                  path2excel=r'..\excel\learn/220-母线&线路-第一套合并单元&第一套合并单元/赤厝.xls'):
        self.load_excel(path2excel, sheetName='所有发送', inOut='开出')  # '所有发送' = all sent
        self.load_excel(path2excel, sheetName='所有接收', inOut='开入')  # '所有接收' = all received
        df = self.dfDict.get('匹配', DataFrame())
        for outPort in self.portListDict[path2excel + '所有发送' + '开出']:
            for inPort in self.portListDict[path2excel + '所有接收' + '开入']:
                key2 = (outPort.description + outPort.reference + '匹配'
                        + inPort.description + inPort.reference)
                # Reuse the updated frame so rows added for earlier pairs survive
                df = self.distance(key2, df, '匹配')
        self.dfDict['匹配'] = df

    def distance(self, key2, df, inOut):
        if key2 not in df.index:
            df = df.reindex(df.index.tolist() + [key2])
            for done in df:
                similarity = self.levenshtein.distance(done, key2)
                df.loc[key2, done] = similarity
                # if similarity < 0.03:
                #     print(done + " like " + key2)
        self.dfDict[inOut] = df
        return df

    def main(self, path2excel=r'..\excel\learn/220-母线&线路-第一套合并单元&第一套合并单元/kb2.xlsx'):
        # for sheetName in self.sheetNameList:
        #     self.dfDict[sheetName] = pd.ExcelFile(path2excel).parse(sheetName)
        # load history
        start_time = time.time()
        self.learn_excel(r'..\excel\learn/220-母线&线路-第一套合并单元&第一套合并单元/赤厝.xls')
        self.load_test()
        with pd.ExcelWriter(path2excel) as writer:
            for key, df in self.dfDict.items():
                print(key, df)
                df.to_excel(writer, sheet_name=key)
        print("--- %s m ---" % ((time.time() - start_time) / 60))

    def transform(self, multilevelDict):
        # Strip newlines from every key, recursing into nested dicts
        return {
            str(key).replace("\n", ""): (self.transform(value)
                                         if isinstance(value, dict) else value)
            for key, value in multilevelDict.items()
        }
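
# Minimal sketch of the two scorers kept in funcList: raw edit distance vs
# distance normalized to [0, 1]. The sample strings are the '开出/开入端子描述'
# (outgoing/incoming terminal description) headers used above.
from strsimpy.levenshtein import Levenshtein
from strsimpy.normalized_levenshtein import NormalizedLevenshtein
print(Levenshtein().distance('开出端子描述', '开入端子描述'))            # 1 (one char differs)
print(NormalizedLevenshtein().distance('开出端子描述', '开入端子描述'))  # ~0.167 = 1/6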
def query_boosting(search_str):
    """Query boosting algorithm."""
    # Initialize weights
    weights = {
        "title_si": 0, "title_en": 0,
        "artist_si": 0, "artist_en": 0,
        "music_si": 0, "melody_si": 0,
        "lyricist_si": 0, "lyrics": 0
    }
    sinhala = isSinhala(search_str)
    num_words = len(search_str.split(" "))

    ####### The algorithm ########
    if not sinhala:
        weights["artist_en"] = 1
        weights["title_en"] = 1
    elif num_words < 3:
        weights["artist_si"] = 1
        comp_str = search_str
        artist, music, lyricist, melody = get_all_lists()
        lev = Levenshtein()
        for a in artist:
            if lev.distance(a, comp_str) <= 4:
                weights["artist_si"] = 5
                break
        # An artist match takes precedence over the remaining fields
        for m in music:
            if lev.distance(m, comp_str) <= 4:
                weights["music_si"] = 0 if weights["artist_si"] == 5 else 5
                break
        for l in lyricist:
            if lev.distance(l, comp_str) <= 4:
                weights["lyricist_si"] = 0 if weights["artist_si"] == 5 else 5
                break
        for m in melody:
            if lev.distance(m, comp_str) <= 4:
                weights["melody_si"] = 0 if weights["artist_si"] == 5 else 5
                break
    elif 3 <= num_words < 5:
        weights["lyrics"] = 2
        weights["title_si"] = 5
    elif num_words >= 5:
        weights["lyrics"] = 3

    # Build the query attribute for each field from its weight
    fields = ["title_si", "title_en", "artist_si", "artist_en",
              "music_si", "melody_si", "lyricist_si", "lyrics"]
    return ["{}^{}".format(f, weights[f]) for f in fields]
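
# Hedged usage sketch: an English query boosts only the English fields
# (isSinhala and get_all_lists are assumed module-level helpers).
fields = query_boosting("dancing queen")
# e.g. ['title_si^0', 'title_en^1', 'artist_si^0', 'artist_en^1',
#       'music_si^0', 'melody_si^0', 'lyricist_si^0', 'lyrics^0']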
class KnowledgeBase():
    # Excel sheet names: '开出' = outgoing, '开入' = incoming, '匹配' = match;
    # column types: '描述' = description, '引用' = reference
    sheetNameList = ['开出', '开入', '匹配']
    typeList = ['描述', '引用']
    get2txt_similarity = QGram(2)
    get2txt_similarityList = {'Levenshtein': Levenshtein(), 'QGram': QGram(2)}

    def __init__(self, matchList=None, outPortList=None, inPortList=None):
        # The original defaults ([Match], []) were shared mutable defaults;
        # use None and create fresh containers per instance instead.
        self.matchList = matchList if matchList is not None else []
        self.portListDict = {}
        self.portDict = {}
        self.dfDict = {}
        for sheetName in self.sheetNameList:  # in, out, match
            for type in self.typeList:
                self.dfDict[sheetName + type] = DataFrame()

    def learn_folder(self, path2folder=r'..\excel\learn/220-母线&线路-第一套合并单元&第一套合并单元'):
        # '/' added before '**' so the recursive glob starts inside the folder
        for filename in glob.iglob(path2folder + '/**/*.xls', recursive=True):
            if filename.endswith(".xls") or filename.endswith(".csv"):
                self.learn_excel(filename)

    def learn_excel(self, path2excel):
        for type in self.typeList:
            for inOut in ['开出', '开入']:
                self.load_excel_sheet(path2excel, sheetName='已配置', inOut=inOut, type=type)
        sheet = pd.ExcelFile(path2excel).parse('已配置')  # '已配置' = already configured
        try:
            for row in sheet.iterrows():
                outPort = Port(row[1]['开出端子描述'], row[1]['开出端子引用'])
                inPort = Port(row[1]['开入端子描述'], row[1]['开入端子引用'])
                match = Match(outPort, inPort)
                self.matchList.append(match)
                for type in self.typeList:
                    df2 = self.dfDict.get('匹配' + type, DataFrame())
                    dfKey = row[1]['开出端子' + type] + '匹配' + row[1]['开入端子' + type]
                    if dfKey not in df2:
                        df2[dfKey] = df2.get(dfKey)
                    self.dfDict['匹配' + type] = df2
        except RuntimeError:
            print(row[1])

    def load_excel_sheet(self,
                         path2excel=r'..\excel\learn/220-母线&线路-第一套合并单元&第一套合并单元/赤厝.xls',
                         sheetName='所有发送', inOut='开出', type='描述'):
        sheet = pd.ExcelFile(path2excel).parse(sheetName)
        key: str = path2excel + sheetName + inOut
        portList = self.portListDict.get(key, [])
        try:
            for row in sheet.iterrows():
                port = Port(row[1][inOut + '端子描述'], row[1][inOut + '端子引用'])
                portList.append(port)
                key2 = row[1][inOut + '端子' + type]
                self.portDict[key2] = port
                dfName = inOut + type
                df2 = self.dfDict.get(dfName, DataFrame())
                if sheetName == '已配置':
                    df2[key2] = df2.get(key2)
                else:  # new port: add a row and score it against every known column
                    if key2 not in df2.index:
                        df2 = df2.reindex(df2.index.tolist() + [key2])
                        for done in df2:
                            similarity = self.get2txt_similarity.distance(done, key2)
                            df2.loc[key2, done] = similarity
                            # if similarity < 3:
                            #     print(key2 + " ~ " + done)
                self.dfDict[dfName] = df2
            self.portListDict[key] = portList
        except RuntimeError:
            print(row[1])

    def load_test(self,
                  path2excel=r'..\excel\learn/220-母线&线路-第一套合并单元&第一套合并单元/new/赤厝.xls'):
        # Pre-sized frame used to collect recommendation strings
        recDF = self.dfDict.get('匹配recomend',
                                DataFrame(0, index=range(99), columns=range(99)))
        i = 0
        for type in self.typeList:
            self.load_excel_sheet(path2excel, sheetName='所有发送', inOut='开出', type=type)
            self.load_excel_sheet(path2excel, sheetName='所有接收', inOut='开入', type=type)
            key = '匹配' + type
            df = self.dfDict.get(key, DataFrame())
            for outPort in self.portListDict[path2excel + '所有发送' + '开出']:
                for inPort in self.portListDict[path2excel + '所有接收' + '开入']:
                    if type == '描述':
                        threshold = 2  # tighter threshold for descriptions
                        key2 = outPort.description + '匹配' + inPort.description
                    else:
                        threshold = 4  # looser threshold for references
                        key2 = outPort.reference + '匹配' + inPort.reference
                    if key2 not in df.index:
                        df = df.reindex(df.index.tolist() + [key2])
                        for done in df:
                            similarity = self.get2txt_similarity.distance(done, key2)
                            df.loc[key2, done] = similarity
                            if similarity < threshold:
                                print(key2 + " ~ " + done)
                                recDF.loc[i, 0] = str(key2 + " ~ " + done)
                                i = i + 1
            self.dfDict[key] = df
        self.dfDict['匹配recomend'] = recDF

    def main(self,
             path2excel=r'..\excel\learn/220-母线&线路-第一套合并单元&第一套合并单元/KnowledgeBase.xlsx'):
        # for sheetName in self.sheetNameList:
        #     self.dfDict[sheetName] = pd.ExcelFile(path2excel).parse(sheetName)
        # load history
        self.learn_folder()
        # self.learn_excel(r'..\excel\learn/220-母线&线路-第一套合并单元&第一套合并单元/赤厝.xls')
        start_time = time.time()
        self.load_test()
        print("--- %s s ---" % (time.time() - start_time))
        with pd.ExcelWriter(path2excel) as writer:
            for key, df in self.dfDict.items():
                print(key, df)
                df.to_excel(writer, sheet_name=key)

    def transform(self, multilevelDict):
        # Strip newlines from every key, recursing into nested dicts
        return {
            str(key).replace("\n", ""): (self.transform(value)
                                         if isinstance(value, dict) else value)
            for key, value in multilevelDict.items()
        }
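
# QGram(2) distance is the L1 difference between bigram profiles; a quick
# check of the thresholds used above (2 for descriptions, 4 for references):
from strsimpy.qgram import QGram
qgram = QGram(2)
print(qgram.distance("ABCD", "ABCE"))  # 2: profiles {AB,BC,CD} vs {AB,BC,CE}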