def test_custom_model():
    custom_matcher = MyModel()
    model = PolyFuzz(custom_matcher).match(from_list, to_list)
    matches = model.get_matches()

    assert isinstance(matches, pd.DataFrame)
    assert matches.Similarity.mean() > 0.0
    assert len(matches) == 6
    assert list(matches.columns) == ['From', 'To', 'Similarity']
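# MyModel is not defined in this snippet. A minimal sketch of what it could
# look like, following the custom-matcher pattern from the PolyFuzz docs:
# subclass polyfuzz.models.BaseMatcher and implement match() so that it
# returns a From/To/Similarity DataFrame. The scoring function here
# (rapidfuzz's fuzz.ratio) is an illustrative choice, not the test's actual model:
import numpy as np
import pandas as pd
from rapidfuzz import fuzz
from polyfuzz.models import BaseMatcher

class MyModel(BaseMatcher):
    def match(self, from_list, to_list):
        # Score every from/to pair with a normalized edit-distance ratio
        matches = [[fuzz.ratio(from_string, to_string) / 100
                    for to_string in to_list]
                   for from_string in from_list]

        # Keep the best match (and its score) for each "from" string
        mappings = [to_list[index] for index in np.argmax(matches, axis=1)]
        scores = np.max(matches, axis=1)

        return pd.DataFrame({'From': from_list, 'To': mappings,
                             'Similarity': scores})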
def test_base_model(method):
    model = PolyFuzz(method).match(from_list, to_list)
    matches = model.get_matches()

    assert isinstance(matches, pd.DataFrame)
    assert matches.Similarity.mean() > 0.3
    assert len(matches) == 6
    assert list(matches.columns) == ['From', 'To', 'Similarity']
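# The from_list/to_list fixtures and the "method" parameter are defined
# outside these tests. A minimal sketch of the assumed setup, using the same
# word lists that appear in the EditDistance example further down (six "from"
# strings, matching the len(matches) == 6 assertions):
import pandas as pd
import pytest
from polyfuzz import PolyFuzz

from_list = ["apple", "apples", "appl", "recal", "house", "similarity"]
to_list = ["apple", "apples", "mouse"]

# e.g. @pytest.mark.parametrize("method", ["TF-IDF", "EditDistance"])
# applied to each test that takes a `method` argument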
def similaritypolymain(pTrainData, pTestData, pLevel1, pLevel2, pDesc, pFromDir, pToDir, Nbest):
    try:
        pTrainData = pTrainData[pTrainData[pDesc].notna()]
        pTestData = pTestData[pTestData[pDesc].notna()]
        pTestData['Intent'], pTestData['Confidence_Level'] = 'NaN', 'NaN'
        pTrainData, __ = traindata(pTrainData, pDesc, pLevel1, pLevel2, pFromDir, pToDir)

        pTrainDataDesc = pd.DataFrame(pTrainData[pDesc])
        pTrainDataDescUnq = pTrainDataDesc[pDesc].unique().tolist()
        pTestDataDescList = list(pTestData[pDesc].values)  # need to convert back to a list

        model = PolyFuzz("TF-IDF")
        model.match(pTestDataDescList, pTrainDataDescUnq, nbest=int(Nbest))
        pMatchesDf = model.get_matches()

        # Column names for the n-best matches and their similarity scores
        IntCol = ["To"]
        for i in range(1, int(Nbest) - 1):
            IntCol.append("BestMatch" + "__" + str(i))
            pTestData['Intent' + '__' + str(i)] = 'NaN'
        SimCol = ['Similarity']
        for k in range(1, int(Nbest) - 1):
            SimCol.append("Similarity" + "__" + str(k))
            pTestData['Confidence_Level' + '__' + str(k)] = 'NaN'

        # Map every n-best match back to the intent of the matching training row
        for i in range(len(IntCol)):
            col = str(IntCol[i])
            if col != "To":
                for j in range(len(pTestData)):
                    if pMatchesDf[col][j] is not None:
                        pTestData.loc[j, 'Intent' + '__' + str(i - 1)] = pTrainData[
                            np.where(pTrainData[pDesc] == pMatchesDf[col][j], True, False)
                        ]['Intent'].values[0]
            else:
                for j in range(len(pTestData)):
                    if pMatchesDf[col][j] is not None:
                        pTestData.loc[j, 'Intent'] = pTrainData[
                            np.where(pTrainData[pDesc] == pMatchesDf[IntCol[i]][j], True, False)
                        ]['Intent'].values[0]

        # Copy the similarity scores over as confidence levels
        for l in range(len(SimCol)):
            col = str(SimCol[l])
            if col != "Similarity":
                for m in range(len(pTestData)):
                    if pMatchesDf[col][m] is not None:
                        pTestData.loc[m, 'Confidence_Level' + '__' + str(l - 1)] = pMatchesDf[col][m]
            else:
                for m in range(len(pTestData)):
                    if pMatchesDf[col][m] is not None:
                        pTestData.loc[m, 'Confidence_Level'] = pMatchesDf[col][m]
    except Exception as e:
        print('*** ERROR[004]: Error in similarity poly main function: ',
              sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        utils.movefile(pFromDir, pToDir)
        return -1
    return 0, pTestData
def test_fit_model(method):
    model = PolyFuzz(method).fit(from_list, to_list)
    matches = model.get_matches()

    assert isinstance(matches, pd.DataFrame)
    assert matches.Similarity.mean() > 0.3
    assert len(matches) == 6
    assert list(matches.columns) == ['From', 'To', 'Similarity']

    results = model.transform(to_list)

    # Fitted results are keyed by model id
    if method == "TF-IDF":
        key = "TF-IDF"
    elif method == "EditDistance":
        key = "EditDistance"
    else:
        key = list(results.keys())[0]

    assert isinstance(results[key], pd.DataFrame)
    assert results[key].Similarity.sum() > 0
def test_grouper_same_list():
    model = PolyFuzz("TF-IDF").match(from_list, from_list)
    model.group(link_min_similarity=0.75, group_all_strings=True)
    matches = model.get_matches()

    assert isinstance(matches, pd.DataFrame)
    assert matches.Similarity.mean() > 0.3
    assert len(matches) == 6
    assert list(matches.columns) == ['From', 'To', 'Similarity', 'Group']
    assert model.get_clusters() == {1: ['apples', 'apple', 'appl']}
    assert model.get_cluster_mappings() == {'apples': 1, 'apple': 1, 'appl': 1}
def test_grouper(method):
    model = PolyFuzz(method).match(from_list, to_list)
    model.group(link_min_similarity=0.75)
    matches = model.get_matches()

    assert isinstance(matches, pd.DataFrame)
    assert matches.Similarity.mean() > 0.3
    assert len(matches) == 6
    assert list(matches.columns) == ['From', 'To', 'Similarity', 'Group']
    assert model.get_clusters() == {1: ['apples', 'apple']}
    assert model.get_cluster_mappings() == {'apples': 1, 'apple': 1}
from polyfuzz import PolyFuzz

# URL example lists (immediately overridden by the word lists below)
from_list = ["https://www.tatielou.co.uk/apples/sadasda",
             "https://www.tatielou.co.uk/oranges/sadasda"]
to_list = ["https://www.tatielou.co.uk/apples/",
           "https://www.tatielou.co.uk/oranges/",
           "https://www.tatielou.co.uk/pears/"]

from_list = ["apple", "apples", "appl", "recal", "house", "similarity"]
to_list = ["apple", "apples", "mouse"]

model = PolyFuzz("EditDistance")
model.match(from_list, to_list)  # auto-map by similarity scores
model.get_matches()
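# For the word lists above, get_matches() returns a From/To/Similarity
# DataFrame along these lines (the scores shown are illustrative, not exact):
#
#          From      To  Similarity
# 0       apple   apple        1.00
# 1      apples  apples        1.00
# 2        appl   apple        0.90
# 3       recal   apple        0.40
# 4       house   mouse        0.80
# 5  similarity   mouse        0.18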
# Keep tweets with more than 3 words
relevant_tweetir = relevant_tweetir[relevant_tweetir[
    "tweets.full_text"].str.split().apply(lambda x: len(x) > 3)]

# Add a period to the end of sentences
relevant_tweetir["tweets.full_text"] = relevant_tweetir[
    "tweets.full_text"].apply(punctuate_sent)
relevant_tweetir["tweets.full_text"] = relevant_tweetir[
    "tweets.full_text"].str.lower()

print("\nRemove repeated retweets... >80% fast text similarity")
fasttext_embeddings = WordEmbeddings('en-crawl')
fasttext = Embeddings(fasttext_embeddings, min_similarity=0, model_id="FastText")
model = PolyFuzz(fasttext)

start = time.time()
indexes_to_remove = []
for topic in relevant_tweetir["topic"].unique():
    topic_tweets = relevant_tweetir.loc[relevant_tweetir["topic"] == topic,
                                        "tweets.full_text"]
    for index, tweet in topic_tweets.items():
        indexes = topic_tweets.index[topic_tweets.index != index]
        for ind in indexes:
            model.match(tweet.split(), topic_tweets.loc[ind].split())
            mean_sim = round(model.get_matches()["Similarity"].mean(), 2)
            if mean_sim > 0.8:
                indexes_to_remove.append(ind)
                break

# Drop the near-duplicate tweets. The original snippet is cut off here; the
# filter below is a completion based on the indexes_to_remove list built above.
relevant_tweetir = relevant_tweetir[~relevant_tweetir.index.isin(indexes_to_remove)]
else:
    c50.warning(
        f"""
        👹 **Oh! What the Fuzz!**

        It seems that the crawl you uploaded was not the one I was looking for!
        Currently, I only accept Screaming Frog's internal_all.csv file, yet I'm
        planning to add more crawlers in the future - namely OnCrawl, DeepCrawl
        and SiteBulb!

        Check out here [where to find it](https://i.imgur.com/HavO4d6.png)
        """
    )
    st.stop()

dfIndexable = GSCDf.loc[GSCDf["Indexability"] == "Indexable"]
col_one_list = GSCDf["Address"].tolist()

model = PolyFuzz("EditDistance")
SCHEMES = ("http://", "https://")

if start_execution and (uploaded_file is None):
    c50.warning("Upload a file first!")
    st.stop()
else:
    model.match(linesList, col_one_list)
    Polyfuzz = model.get_matches()  # auto-map by similarity scores
    Polyfuzz.columns = ["URL to map", "URL match", "Similarity"]
    Polyfuzz.index = Polyfuzz.index + 1
    # cmapRed = sns.diverging_palette(10, 133, as_cmap=True)
    # cmapRedBlue = sns.color_palette("vlag", as_cmap=True)
def test_multiple_models():
    tfidf_matcher = TFIDF(n_gram_range=(3, 3), min_similarity=0, model_id="TF-IDF")
    tfidf_large_matcher = TFIDF(n_gram_range=(3, 6), min_similarity=0)
    base_edit_matcher = EditDistance(n_jobs=1)
    ratio_matcher = EditDistance(n_jobs=1, scorer=fuzz.ratio)
    rapidfuzz_matcher = RapidFuzz(n_jobs=1)
    matchers = [tfidf_matcher, tfidf_large_matcher, base_edit_matcher,
                ratio_matcher, rapidfuzz_matcher]
    model = PolyFuzz(matchers).match(from_list, to_list)

    # Test if correct matches are found
    for model_id in model.get_ids():
        assert model_id in model.get_matches().keys()
        assert isinstance(model.get_matches(model_id), pd.DataFrame)
    assert len(model.get_matches()) == len(matchers)

    # Test if an error is raised when accessing clusters before creating them
    with pytest.raises(ValueError):
        model.get_clusters()
    with pytest.raises(ValueError):
        model.get_cluster_mappings()

    # Test if groupings are found
    model.group()
    for model_id in model.get_ids():
        assert model_id in model.get_cluster_mappings().keys()
    assert len(model.get_cluster_mappings()) == len(matchers)
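# With a list of matchers, get_matches() returns a dict keyed by model id,
# which is what the assertions above rely on; matchers created without an
# explicit model_id get an auto-generated one (e.g. "Model 1"):
# all_matches = model.get_matches()            # {"TF-IDF": df, "Model 1": df, ...}
# tfidf_matches = model.get_matches("TF-IDF")  # a single DataFrame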
sleep(crawl_delay)

df_final['Status Code'] = my_list

# Drop URLs that already redirect
df_final = df_final[~df_final["Status Code"].isin(["301"])]
df_final = df_final[~df_final["Status Code"].isin(["302"])]

print("Automatically Mapping URLs ..")

# Create lists from the dataframes
df_final_list = list(df_final["Address"])
df_sf_list = list(df_sf["Address"])

# Instantiate a PolyFuzz model, choose TF-IDF as the similarity measure,
# and match the two lists
model = PolyFuzz("TF-IDF").match(df_final_list, df_sf_list)

# Build the PolyFuzz matches dataframe
df_matches = model.get_matches()

count_row = df_final.shape[0]
print("Total Opportunity:", count_row, "URLs")

df_stats = pd.merge(df_matches, df_final, left_on="From",
                    right_on="Address", how="inner")

# Sort on similarity
df_stats = df_stats.sort_values(by="Similarity", ascending=False)
def test_wrongbase_model(method):
    with pytest.raises(ValueError):
        model = PolyFuzz(method).match(from_list, to_list)
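# The invalid inputs for this test are not shown; a plausible (hypothetical)
# parametrization is anything PolyFuzz cannot resolve to a matcher:
# @pytest.mark.parametrize("method", ["Unknown Model", 42, [1, 2, 3]])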
def similaritypolymain(pTrainData, pTestData, pAsg, pDesc, pDate, Nbest):
    try:
        pTrainData = pTrainData[pTrainData[pDesc].notna()]
        pTestData = pTestData[pTestData[pDesc].notna()]
        pTestData['Assignee_Group_Pred'], pTestData['Confidence_Level'] = 'NaN', float(0.0)

        pTrainDataDesc = pd.DataFrame(pTrainData[pDesc])
        pFeaList = pTrainData['Features'].tolist() + pTestData['Features'].tolist()
        pFeaUnqList = list(set(pFeaList))

        pMatchData, pData, pTestAppendDf = [], [], []
        pMatchesDf, pTestMatchData, pTestDf = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

        # Match the test descriptions against the training descriptions,
        # one feature group at a time
        for i in range(len(pFeaUnqList)):
            FromData = pTrainData.loc[pTrainData['Features'] == pFeaUnqList[i]]
            ToData = pTestData.loc[pTestData['Features'] == pFeaUnqList[i]]
            model = PolyFuzz("TF-IDF")
            pTestAppendDf.append(ToData)
            if len(ToData[pDesc].tolist()) >= 1 and len(FromData[pDesc].tolist()) >= 1:
                model.match(list(ToData[pDesc].values),
                            FromData[pDesc].unique().tolist(),
                            nbest=int(Nbest))
                Matches = model.get_matches()
                pMatchData.append(Matches)
                pData.append(ToData)

        pMatchesDf = pd.concat(pMatchData).reset_index(drop=True)
        pTestMatchData = pd.concat(pData).reset_index(drop=True)
        pTestDf = pd.concat(pTestAppendDf).reset_index(drop=True)
        pTestConcatData = pd.concat([pTestMatchData, pMatchesDf], axis=1)

        # Column names for the n-best matches and their similarity scores
        IntCol = ["To"]
        for i in range(1, int(Nbest) - 1):
            IntCol.append("BestMatch" + "__" + str(i))
            pTestMatchData['Assignee_Group_Pred' + '__' + str(i)] = 'NaN'
        SimCol = ['Similarity']
        for k in range(1, int(Nbest) - 1):
            SimCol.append("Similarity" + "__" + str(k))
            pTestMatchData['Confidence_Level' + '__' + str(k)] = 'NaN'

        # Map every n-best match back to the assignee group of the matching
        # training row, again one feature group at a time
        for i in range(len(IntCol)):
            col = str(IntCol[i])
            pTestAppendFea = []
            for p in range(len(pFeaUnqList)):
                pTrainFeaData = pTrainData.loc[pTrainData['Features'] == pFeaUnqList[p]]
                pTestFeaData = pTestConcatData.loc[pTestConcatData['Features'] == pFeaUnqList[p]]
                pTestFeaData.reset_index(drop=True, inplace=True)
                if len(pTestFeaData) > 0 and len(pTrainFeaData) > 0:
                    for j in range(len(pTestFeaData)):
                        if pTestFeaData[col][j] is not None:
                            matched = pTrainFeaData[np.where(
                                pTrainFeaData[pDesc] == pTestFeaData[IntCol[i]][j],
                                True, False)][pAsg].values
                            if col != "To":
                                if len(matched) != 0:
                                    pTestFeaData.loc[j, 'Assignee_Group_Pred' + '__' + str(i - 1)] = matched[0]
                            else:
                                pTestFeaData.loc[j, 'Assignee_Group_Pred'] = \
                                    matched[0] if len(matched) != 0 else None
                pTestAppendFea.append(pTestFeaData)

            pTestFeaDf = pd.concat(pTestAppendFea).reset_index(drop=True)
            pTestDf.loc[pTestDf['Number'].isin(pTestFeaDf['Number']),
                        ['Confidence_Level', 'Assignee_Group_Pred']] = \
                pTestFeaDf[['Similarity', 'Assignee_Group_Pred']].values
    except Exception as e:
        print('*** ERROR[004]: Error in similarity poly main function: ',
              sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        return -1
    return 0, pTestDf
if RadioMapTo == "To crawled URL":
    col_one_list = GSCDf["Address"].tolist()
else:
    col_one_list = GSCDf["Title 1"].tolist()
    # col_one_listURL = GSCDf["Address"].tolist()
    # col_one_list
    # st.stop()

if RadioMapAgainst == "all crawled URLs":
    GSCDf = dfIndexable
else:
    pass

model = PolyFuzz("EditDistance")

gif_path = "mouse.gif"
if start_execution:
    # GIF NOT WORKING (UNBOUND)
    # gif_runner = st.image(gif_path)
    c1, c2, c3 = st.beta_columns([5, 5, 5])
    with c2:
        # gif_runner = st.image("mouse.gif")
        pass  # placeholder so the truncated block parses
class Matcher:
    def __init__(self):
        # Load PolyFuzz model for matching. Default: TF-IDF
        self.model = PolyFuzz(config.MODEL_MATCHING)
        # Load the filters
        self.filters: Dict[str, List[Filter]] = self.__load_filters()

    @staticmethod
    def __load_filters() -> dict:
        """
        Load the filters from filters.toml (by default), create Filter objects,
        and return a dictionary of these objects, classified by intent.
        """
        filters = {}
        # Load the raw filters
        toml_file = toml.load(config.FILTERS_TOML, _dict=dict)
        # Loop over each intent
        for intent, raw_filters in toml_file.items():
            filter_list = []
            # Loop over each filter in this intent
            for name, content in raw_filters.items():
                # Create and append a Filter object
                filter_list.append(
                    Filter(
                        name=name,
                        words=content['words'],
                        regex=content['regex'],
                        threshold=content['threshold']
                    )
                )
            # Save the filters to the main dictionary
            filters[intent] = filter_list
        return filters

    def get_keywords(self, text: str, intent: str) -> dict:
        keywords = {}
        if intent in self.filters:
            # Split the text into a list of words
            entries = text.split(" ")
            for filter_ in self.filters[intent]:
                # Match similarities between the filter and the given text
                self.model.match(entries, filter_.words)
                matches: pd.DataFrame = self.model.get_matches()
                try:
                    # Get the word with the maximum similarity
                    thresholds = matches[matches['Similarity'] >= filter_.threshold]
                    keyword = thresholds[thresholds['Similarity'] ==
                                         thresholds['Similarity'].max()].iloc[0, 0]
                except Exception:
                    # If there's no match, set the filter to None
                    keywords[filter_.name] = None
                else:
                    # Use the keyword to retrieve and save its chained data
                    if result := re.search(filter_.regex % keyword, text):
                        keywords[filter_.name] = result.group(filter_.name)
                    else:
                        keywords[filter_.name] = None
        return keywords  # completes the truncated snippet: return the collected keywords
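# A usage sketch for the Matcher class above, with a hypothetical filters.toml
# entry inferred from the fields __load_filters() reads (words, regex, and
# threshold per filter, grouped by intent); all names and values here are
# illustrative only:
#
# [greeting.name]
# words = ["name", "called"]
# regex = "%s is (?P<name>\\w+)"
# threshold = 0.8
#
# matcher = Matcher()
# keywords = matcher.get_keywords("my name is Alice", "greeting")
# # -> {'name': 'Alice'} if a word in the text clears the similarity threshold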
def main(n_actors, news_kg, t_actors, tweets_kg, verbose=True):
    print("\n1. REMOVING STOPWORDS...")
    n_actors, news_kg = remove_stopwords(n_actors, news_kg)
    t_actors, tweets_kg = remove_stopwords(t_actors, tweets_kg)

    print("\n2. CLUSTER SIMILARITY...")
    start = time.time()
    embeddings = TransformerWordEmbeddings("bert-base-multilingual-cased")
    bert = Embeddings(embeddings, min_similarity=0, model_id="BERT")
    model_bert = PolyFuzz(bert)
    paired_clusters, non_paired_clusters = cluster_similarity(
        model_bert, n_actors, t_actors)
    end = time.time()
    print(f"Computation time - {round(end - start, 2)} seconds\n")

    if verbose:
        print("\nSimilar clusters (news vs. tweets):")
        print(paired_clusters)
        print("\nClusters with no similar pair (only news):")
        print(non_paired_clusters)

    print("\n3. MAPPING ACTORS TO CLUSTER VALUES...")
    news_kg = map_news_in_kg(news_kg, paired_clusters, non_paired_clusters)
    tweets_kg = map_tweets_in_kg(tweets_kg, paired_clusters, t_actors)

    for col in ["Redge1", "Redge2", "node1"]:
        news_kg[col] = news_kg[col].str.lower()
        tweets_kg[col] = tweets_kg[col].str.lower()

    if verbose:
        print("\nExisting triples...")
        print("NEWS:")
        # print(news_kg[["Redge1", "node1", "Redge2"]])
        print(news_kg)
        print("\nTWEETS:")
        # print(tweets_kg[["Redge1", "node1", "Redge2"]])
        print(tweets_kg)

    print("\n4. COMPARE TRIPLES...")
    fasttext_embeddings = WordEmbeddings('en-crawl')
    fasttext = Embeddings(fasttext_embeddings, min_similarity=0, model_id="FastText")
    leven_dist = RapidFuzz(n_jobs=1, model_id="leven")
    model_names = ["BERT", "FastText", "leven"]
    models = [bert, fasttext, leven_dist]
    model = PolyFuzz(models)

    start = time.time()
    paired_KG_evaluation = pd.DataFrame()
    non_paired_KG = pd.DataFrame()
    for cluster in paired_clusters.itertuples():
        news_triples = news_kg[news_kg["edge1"] == cluster.news_key]
        tweets_triples = tweets_kg[tweets_kg["edge1"] == cluster.tweets_key]
        if tweets_triples.shape[0] == 0:
            tweets_triples = tweets_kg[tweets_kg["edge2"] == cluster.tweets_key]
        if tweets_triples.shape[0] == 0:
            non_paired_KG = non_paired_KG.append(tweets_triples, ignore_index=True)
            continue
        paired_KG_evaluation = paired_KG_evaluation.append(triples_evaluation(
            model, news_triples, tweets_triples, model_names), ignore_index=True)

    if verbose:
        print("\nPAIRED CLUSTERS TRIPLES EVALUATION...")
        print(paired_KG_evaluation)
    end = time.time()
    print(f"\nComputation time - {round(end - start, 2)} seconds")

    # Non-paired KG evaluation (news triples for which no similar
    # tweet triples were found)
    print("\nNON PAIRED CLUSTERS TRIPLES EVALUATION...")
    non_paired_KG = non_paired_KG.append(
        news_kg[~news_kg["edge1"].isin(paired_clusters["news_key"])],
        ignore_index=True)
    non_paired_KG_evaluation = triples_evaluation(model, non_paired_KG,
                                                  tweets_kg, model_names)
    print(non_paired_KG_evaluation)

    print("\n5. FINAL KG EVALUATION...")
    final_evaluation = paired_KG_evaluation.append(non_paired_KG_evaluation,
                                                   ignore_index=True)
    print(final_evaluation)

    final_leven = round(final_evaluation['leven'].mean(), 3)
    final_fasttext = round(final_evaluation['FastText'].mean(), 3)
    final_bert = round(final_evaluation['BERT'].mean(), 3)
    final_rouge1 = round(final_evaluation['rouge1'].mean(), 3)
    print(f"\nMEAN LEVEN SIMILARITY BETWEEN NEWS AND TWEETS TRIPLES - {final_leven}")
    print(f"\nMEAN FAST TEXT SIMILARITY BETWEEN NEWS AND TWEETS TRIPLES - {final_fasttext}")
    print(f"\nMEAN BERT SIMILARITY BETWEEN NEWS AND TWEETS TRIPLES - {final_bert}")
    print(f"\nMEAN ROUGE1 F1-SCORE BETWEEN NEWS AND TWEETS TRIPLES - {final_rouge1}")

    return final_evaluation, {"leven": final_leven, "FastText": final_fasttext,
                              "BERT": final_bert, "ROUGE1": final_rouge1}