def _first_intent_by_description(train_frame, desc_col):
    """Map every training description to the 'Intent' of its first occurrence."""
    first_intent = {}
    for description, intent in zip(train_frame[desc_col], train_frame['Intent']):
        # setdefault keeps the first row seen, like `.values[0]` on a filtered frame.
        first_intent.setdefault(description, intent)
    return first_intent


def similaritypolymain(pTrainData, pTestData, pLevel1, pLevel2, pDesc, pFromDir, pToDir, Nbest):
    """Label each test description with the intent of its closest training descriptions.

    Rows with a missing ``pDesc`` value are dropped from both frames, the
    training frame is passed through ``traindata``, and the test descriptions
    are matched against the unique training descriptions with a TF-IDF
    PolyFuzz model.  For the best match the training 'Intent' and the match
    similarity are stored in 'Intent' / 'Confidence_Level'; for every extra
    rank ``i`` they are stored in 'Intent__<i>' / 'Confidence_Level__<i>'.

    Args:
        pTrainData: Training DataFrame containing ``pDesc``; after
            ``traindata`` it must also contain an 'Intent' column.
        pTestData: Test DataFrame containing ``pDesc``.  It is not modified;
            a filtered copy is returned.
        pLevel1, pLevel2: Forwarded unchanged to ``traindata``.
        pDesc: Name of the description column used for matching.
        pFromDir, pToDir: Forwarded to ``traindata``; on failure they are also
            passed to ``utils.movefile``.
        Nbest: Number of matches requested from PolyFuzz (converted with ``int``).

    Returns:
        ``(0, DataFrame)`` on success.  On any exception the error and
        traceback are printed, ``utils.movefile(pFromDir, pToDir)`` is called
        and ``-1`` is returned.
    """
    try:
        pTrainData = pTrainData[pTrainData[pDesc].notna()]
        # Copy so the new columns land on our own frame instead of a slice of
        # the caller's data (chained writes to a slice are not guaranteed to stick).
        pTestData = pTestData[pTestData[pDesc].notna()].copy()
        pTestData['Intent'], pTestData['Confidence_Level'] = 'Nan', 'Nan'
        pTrainData, __ = traindata(pTrainData, pDesc, pLevel1, pLevel2, pFromDir, pToDir)

        model = PolyFuzz("TF-IDF")
        # NOTE(review): the `nbest=` keyword and the `BestMatch__<i>` /
        # `Similarity__<i>` result columns are taken as-is from the original
        # code; confirm the installed PolyFuzz build provides them.
        model.match(
            list(pTestData[pDesc].values),
            pTrainData[pDesc].unique().tolist(),
            nbest=int(Nbest),
        )
        pMatchesDf = model.get_matches()

        # Results are written back by position, so there must be exactly one
        # match row per test row.
        if len(pMatchesDf) != len(pTestData):
            raise ValueError(
                'expected one match row per test row, got '
                + str(len(pMatchesDf)) + ' for ' + str(len(pTestData))
            )

        # NOTE(review): the rank range 1 .. Nbest-2 is kept from the original;
        # verify it against the number of columns PolyFuzz actually returns.
        extra_ranks = list(range(1, int(Nbest) - 1))

        # (match column, output column) pairs.  Each extra rank writes to the
        # column carrying the SAME suffix; the original used suffix i-1, which
        # addressed a column that was never created.
        intent_pairs = [('To', 'Intent')]
        for rank in extra_ranks:
            suffix = '__' + str(rank)
            pTestData['Intent' + suffix] = 'NaN'
            intent_pairs.append(('BestMatch' + suffix, 'Intent' + suffix))

        confidence_pairs = [('Similarity', 'Confidence_Level')]
        for rank in extra_ranks:
            suffix = '__' + str(rank)
            pTestData['Confidence_Level' + suffix] = 'NaN'
            confidence_pairs.append(('Similarity' + suffix, 'Confidence_Level' + suffix))

        intent_by_desc = _first_intent_by_description(pTrainData, pDesc)

        for match_col, intent_col in intent_pairs:
            # Rows without a match keep their placeholder; a match that is not
            # a training description raises KeyError and takes the error path.
            pTestData[intent_col] = [
                current if pd.isna(best) else intent_by_desc[best]
                for best, current in zip(pMatchesDf[match_col], pTestData[intent_col])
            ]

        for sim_col, confidence_col in confidence_pairs:
            pTestData[confidence_col] = [
                current if pd.isna(score) else score
                for score, current in zip(pMatchesDf[sim_col], pTestData[confidence_col])
            ]
    except Exception as e:
        # Top-level boundary: callers rely on the -1 status instead of an exception.
        print('*** ERROR[004]: Error in similarity poly main function: ', sys.exc_info()[0],str(e))
        print(traceback.format_exc())
        utils.movefile(pFromDir, pToDir)
        return -1
    return 0, pTestData
def _first_assignee_by_description(train_group, desc_col, asg_col):
    """Map every training description to the ``asg_col`` value of its first occurrence."""
    first_assignee = {}
    for description, assignee in zip(train_group[desc_col], train_group[asg_col]):
        # setdefault keeps the first row seen, like `.values[0]` on a filtered frame.
        first_assignee.setdefault(description, assignee)
    return first_assignee


def similaritypolymain(pTrainData, pTestData, pAsg, pDesc, pDate, Nbest):
    """Predict an assignee group for each test row from training rows with the same 'Features'.

    Rows with a missing ``pDesc`` value are dropped from both frames.  For
    every distinct 'Features' value that has both training and test rows, the
    test descriptions are matched against that group's unique training
    descriptions with a TF-IDF PolyFuzz model.  The ``pAsg`` value of the first
    training row whose description equals the best match ('To') is stored in
    'Assignee_Group_Pred', and the match 'Similarity' in 'Confidence_Level'.
    Test rows whose 'Features' value has no training rows keep the defaults
    ('Nan' and 0.0).

    Only the best match is propagated.  The original attempt to fill
    'Assignee_Group_Pred__<i>' columns indexed a frame that never received
    those columns and never merged its results into the returned frame.

    Args:
        pTrainData: Training DataFrame with ``pDesc``, ``pAsg`` and 'Features' columns.
        pTestData: Test DataFrame with ``pDesc`` and 'Features' columns.  It is
            not modified; a regrouped copy is returned.
        pAsg: Name of the assignee-group column in the training data.
        pDesc: Name of the description column used for matching.
        pDate: Unused; kept for interface compatibility.
        Nbest: Number of matches requested from PolyFuzz (converted with ``int``).

    Returns:
        ``(0, DataFrame)`` on success, with test rows concatenated feature
        group by feature group and a fresh 0..n-1 index.  On any exception the
        error and traceback are printed and ``-1`` is returned.
    """
    try:
        pTrainData = pTrainData[pTrainData[pDesc].notna()]
        # Copy so the new columns land on our own frame instead of a slice of
        # the caller's data.
        pTestData = pTestData[pTestData[pDesc].notna()].copy()
        pTestData['Assignee_Group_Pred'], pTestData['Confidence_Level'] = 'Nan', float(0.0)

        # Same expression as the original, so group order in the output is unchanged.
        pFeaUnqList = list(set(pTrainData['Features'].tolist() + pTestData['Features'].tolist()))

        test_groups = []
        matched_any = False
        for feature in pFeaUnqList:
            train_group = pTrainData.loc[pTrainData['Features'] == feature]
            test_group = pTestData.loc[pTestData['Features'] == feature].copy()

            if len(test_group) and len(train_group):
                model = PolyFuzz("TF-IDF")
                # NOTE(review): the `nbest=` keyword is taken as-is from the
                # original; confirm the installed PolyFuzz build accepts it.
                model.match(
                    list(test_group[pDesc].values),
                    train_group[pDesc].unique().tolist(),
                    nbest=int(Nbest),
                )
                matches = model.get_matches()

                # Results are written back by position, so there must be
                # exactly one match row per test row of this group.
                if len(matches) != len(test_group):
                    raise ValueError(
                        'expected one match row per test row, got '
                        + str(len(matches)) + ' for ' + str(len(test_group))
                    )

                assignee_by_desc = _first_assignee_by_description(train_group, pDesc, pAsg)
                # NOTE(review): the flattened original does not show which `if`
                # its `else: ... = None` belonged to.  Reconstructed as: no
                # match -> keep the 'Nan' placeholder; a match that is absent
                # from this training group -> None.
                test_group['Assignee_Group_Pred'] = [
                    current if pd.isna(best) else assignee_by_desc.get(best)
                    for best, current in zip(matches['To'], test_group['Assignee_Group_Pred'])
                ]
                test_group['Confidence_Level'] = matches['Similarity'].to_numpy()
                matched_any = True

            test_groups.append(test_group)

        # The original reached the error path here through pd.concat([]);
        # keep that outcome but make the reason explicit.
        if not matched_any:
            raise ValueError('no feature group has both training and test descriptions')

        # drop=True avoids the `reset_index(); del df['index']` dance, which
        # breaks on a named index or an existing 'index' column.
        pTestDf = pd.concat(test_groups).reset_index(drop=True)
    except Exception as e:
        # Top-level boundary: callers rely on the -1 status instead of an exception.
        print('*** ERROR[004]: Error in similarity poly main function: ', sys.exc_info()[0],str(e))
        print(traceback.format_exc())
        return -1
    return 0, pTestDf
class Matcher:
    """Extract keyword values from a text using per-intent PolyFuzz filters."""

    def __init__(self):
        # Load PolyFuzz model for matching. Default: TF-IDF
        self.model = PolyFuzz(config.MODEL_MATCHING)
        # Load the filters
        self.filters: Dict[str, List[Filter]] = self.__load_filters()

    @staticmethod
    def __load_filters() -> dict:
        """
        Load the filters from filters.toml (by default), create Filter objects,
        and return a dictionary of these objects classified by intent.
        """
        # Load the raw filters
        toml_file = toml.load(config.FILTERS_TOML, _dict=dict)

        # One list of Filter objects per intent
        return {
            intent: [
                Filter(
                    name=name,
                    words=content['words'],
                    regex=content['regex'],
                    threshold=content['threshold'],
                )
                for name, content in raw_filters.items()
            ]
            for intent, raw_filters in toml_file.items()
        }

    def get_keywords(self, text: str, intent: str) -> dict:
        """
        Return ``{filter name: extracted value or None}`` for every filter of *intent*.

        For each filter, the space-separated tokens of *text* are matched
        against the filter's words.  The best match at or above the filter's
        threshold is interpolated into the filter's regex with ``%``.  The
        regex is then searched in *text*, and the named group
        ``filter_.name`` is stored.  A filter with no qualifying match, or
        whose regex does not match, maps to ``None``.

        An intent without filters yields an empty dictionary.
        """
        keywords = {}

        if intent in self.filters:
            # Split the text into a list of words
            entries = text.split(" ")

            for filter_ in self.filters[intent]:
                # Match similarities between the filter and the given text
                self.model.match(entries, filter_.words)
                matches: pd.DataFrame = self.model.get_matches()

                try:
                    # Get the word with the maximum similarity
                    thresholds = matches[matches['Similarity'] >= filter_.threshold]
                    # First column of the best row (presumably the matched
                    # token from `entries` -- confirm against PolyFuzz output).
                    keyword = thresholds[thresholds['Similarity'] == thresholds['Similarity'].max()].iloc[0, 0]
                except Exception:
                    # Best effort: if there's no match, set the filter as None
                    keywords[filter_.name] = None
                else:
                    # Use the keyword to retrieve and save its chained-data
                    if result := re.search(filter_.regex % keyword, text):
                        keywords[filter_.name] = result.group(filter_.name)
                    else:
                        keywords[filter_.name] = None

        # The method is annotated `-> dict`; without this return callers got None.
        return keywords
# gif_runner = st.image(gif_path) c1, c2, c3 = st.beta_columns([5, 5, 5]) with c2: # gif_runner = st.image("mouse.gif") # gif_runner = st.image(gif_path) gif_runner = st.image("mouse.gif") import time time.sleep(2) model.match(linesList, col_one_list) # model.match(linesDeduped2, col_one_list) # Auto map by Similarity scores Polyfuzz = model.get_matches() # Polyfuzz gif_runner.empty() # st.stop() # model.match(linesList, col_one_list) ## Auto map by Similarity scores # Polyfuzz = model.get_matches() # Polyfuzz # st.stop() if (RadioMapTo == "To crawled titles") and (RadioMapWhat == "Map Broken URLs"):
# Excerpt of a script: within each topic, compare tweets pairwise with a
# FastText-embedding PolyFuzz model and drop tweets whose mean word-level
# similarity to another tweet exceeds 0.8; then drop exact duplicate texts.
# `relevant_tweetir`, `WordEmbeddings`, `Embeddings`, `PolyFuzz`, `time` and
# `remove_irrelevant_topics` are defined outside this excerpt.
# NOTE(review): the source was flattened onto one line; the nesting below is a
# reconstruction -- confirm against the original.
print("\nRemove repeated retweets... >80% fast text similarity")
fasttext_embeddings = WordEmbeddings('en-crawl')
fasttext = Embeddings(fasttext_embeddings, min_similarity=0, model_id="FastText")
model = PolyFuzz(fasttext)
start = time.time()
indexes_to_remove = []
for topic in relevant_tweetir["topic"].unique():
    # Series of tweet texts for this topic, keyed by the frame's index.
    topic_tweets = relevant_tweetir.loc[relevant_tweetir["topic"] == topic, "tweets.full_text"]
    for index, tweet in topic_tweets.items():
        # Every other tweet of the same topic.
        indexes = topic_tweets.index[topic_tweets.index != index]
        for ind in indexes:
            # Match the two tweets word by word and average the similarities.
            model.match(tweet.split(), topic_tweets.loc[ind].split())
            mean_sim = round(model.get_matches()["Similarity"].mean(), 2)
            if mean_sim > 0.8:
                # NOTE(review): with this nesting, a similar pair (A, B) marks
                # B while iterating A and A while iterating B, so both copies
                # are removed -- confirm whether one was meant to survive.
                indexes_to_remove.append(ind)
                break
relevant_tweetir = relevant_tweetir[~relevant_tweetir.index.
                                    isin(indexes_to_remove)]
end = time.time()
print(indexes_to_remove)
print(f"Computation time - {round(end - start, 2)} seconds")
# Repeated texts lose their value here (index alignment leaves them missing),
# and the dropna below removes those rows.
relevant_tweetir["tweets.full_text"] = relevant_tweetir[
    "tweets.full_text"].drop_duplicates()
relevant_tweetir.dropna(subset=["tweets.full_text"], inplace=True)
# The remaining arguments of this call lie beyond the excerpt.
relevant_tweetir = remove_irrelevant_topics(relevant_tweetir,
# Excerpt of a Streamlit script: pick the frame to map against, run an
# EditDistance PolyFuzz match of `linesDeduped2` against `col_one_list`, and
# style the result.  `st`, `sns`, `c30`, `PolyFuzz`, `RadioMapAgainst`,
# `dfIndexable`, `linesDeduped2` and `col_one_list` are defined outside this
# excerpt.
# NOTE(review): the source was flattened onto one line; the nesting below is a
# reconstruction -- confirm against the original.
if RadioMapAgainst == "all crawled URLs":
    GSCDf = dfIndexable
else:
    pass
##########################################################
model = PolyFuzz("EditDistance")
start_execution = c30.button(" Run model! ✨ ")
# The match runs on every script run, not only after the button is pressed.
model.match(linesDeduped2, col_one_list)
# Auto map by Similarity scores
Polyfuzz = model.get_matches()
# Bare expression: presumably rendered by Streamlit "magic" -- confirm.
Polyfuzz
# NOTE(review): st.stop() halts the script here, so the `if start_execution:`
# branch below looks unreachable -- possibly a debugging leftover; confirm.
st.stop()
#start_execution = c30.button(" 🚀✨Run model! ")
if start_execution:
    # Reversed red palette used as the background gradient of the matches table.
    cm = sns.light_palette("red", as_cmap=True, reverse=True)
    FuzzStyled = Polyfuzz.style.background_gradient(cmap=cm)
"""Minimal PolyFuzz demo: match one list of strings against another by edit distance."""
from polyfuzz import PolyFuzz

# URL-shaped sample data.  Both names are rebound to the word lists right
# below, so these URL lists never reach the matcher.
from_list = [
    "https://www.tatielou.co.uk/apples/sadasda",
    "https://www.tatielou.co.uk/oranges/sadasda",
]
to_list = [
    "https://www.tatielou.co.uk/apples/",
    "https://www.tatielou.co.uk/oranges/",
    "https://www.tatielou.co.uk/pears/",
]

# Word-shaped sample data: this is what actually gets matched.
from_list = [
    "apple",
    "apples",
    "appl",
    "recal",
    "house",
    "similarity",
]
to_list = [
    "apple",
    "apples",
    "mouse",
]

model = PolyFuzz("EditDistance")
model.match(from_list, to_list)

# Fetch the matches table; as a bare expression the result is only displayed
# by an interactive session or notebook.
model.get_matches()