Example #1
    # __init__ fragment of the Matcher class (shown in full in Example #16)
    def __init__(self):

        # Load PolyFuzz model for matching. Default: TF-IDF
        self.model = PolyFuzz(config.MODEL_MATCHING)

        # Load the filters
        self.filters: Dict[str, List[Filter]] = self.__load_filters()
Example #2
def test_custom_model():
    custom_matcher = MyModel()
    model = PolyFuzz(custom_matcher).match(from_list, to_list)
    matches = model.get_matches()
    assert isinstance(matches, pd.DataFrame)
    assert matches.Similarity.mean() > 0.0
    assert len(matches) == 6
    assert list(matches.columns) == ['From', 'To', 'Similarity']
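The custom matcher MyModel is not shown in this snippet. A minimal sketch of what such a class could look like, based on PolyFuzz's BaseMatcher interface (the scoring logic here is only an illustrative assumption):

import numpy as np
import pandas as pd
from rapidfuzz import fuzz
from polyfuzz.models import BaseMatcher

class MyModel(BaseMatcher):
    def match(self, from_list, to_list):
        # Score every pair (illustrative choice: normalized fuzz.ratio)
        scores = [[fuzz.ratio(f, t) / 100 for t in to_list] for f in from_list]

        # Keep the best match for each "from" string
        mappings = [to_list[i] for i in np.argmax(scores, axis=1)]
        best = np.max(scores, axis=1)

        # PolyFuzz expects a DataFrame with exactly these three columns
        return pd.DataFrame({'From': from_list, 'To': mappings, 'Similarity': best})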
Example #3
def test_base_model(method):
    model = PolyFuzz(method).match(from_list, to_list)
    matches = model.get_matches()

    assert isinstance(matches, pd.DataFrame)
    assert matches.Similarity.mean() > 0.3
    assert len(matches) == 6
    assert list(matches.columns) == ['From', 'To', 'Similarity']
Example #4
def similaritypolymain(pTrainData, pTestData, pLevel1, pLevel2, pDesc, pFromDir, pToDir, Nbest):
    try:
        pTrainData = pTrainData[pTrainData[pDesc].notna()]
        pTestData = pTestData[pTestData[pDesc].notna()]
        pTestData['Intent'], pTestData['Confidence_Level'] = 'NaN', 'NaN'
        pTrainData, __ = traindata(pTrainData, pDesc, pLevel1, pLevel2, pFromDir, pToDir)
        pTrainDataDesc = pd.DataFrame(pTrainData[pDesc])
        pTrainDataDescUnq = pTrainDataDesc[pDesc].unique().tolist()
        pTestDataDescList = list(pTestData[pDesc].values) #need to convert back to a list
        model = PolyFuzz("TF-IDF")
        model.match(pTestDataDescList, pTrainDataDescUnq, nbest=int(Nbest))
        pMatchesDf = model.get_matches()

        IntCol = ["To"]
        for i in range(1, int(Nbest)-1):
            IntCol.append("BestMatch" + "__" + str(i))
            pTestData['Intent' + '__' + str(i)] = 'NaN'

        SimCol = ['Similarity']
        for k in range(1, int(Nbest) - 1):
            SimCol.append("Similarity" + "__" + str(k))
            pTestData['Confidence_Level'+ '__' + str(k)] = 'NaN'
            
        for i in range(len(IntCol)):
            col = str(IntCol[i])
            target = 'Intent' if col == "To" else 'Intent' + '__' + str(i - 1)
            for j in range(len(pTestData)):
                if pMatchesDf[col][j] is not None:
                    # Use .loc to avoid chained-assignment pitfalls
                    pTestData.loc[j, target] = pTrainData.loc[pTrainData[pDesc] == pMatchesDf[col][j], 'Intent'].values[0]

        for l in range(len(SimCol)):
            col = str(SimCol[l])
            # str(l - 1): the original concatenated an int into the column name, a TypeError
            target = 'Confidence_Level' if col == "Similarity" else 'Confidence_Level' + '__' + str(l - 1)
            for m in range(len(pTestData)):
                if pMatchesDf[col][m] is not None:
                    pTestData.loc[m, target] = pMatchesDf[col][m]
            
    except Exception as e:
        print('*** ERROR[004]: Error in similarity poly main function: ', sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        utils.movefile(pFromDir, pToDir)
        return -1
    return 0, pTestData
Example #5
def test_fit_model(method):
    model = PolyFuzz(method).fit(from_list, to_list)
    matches = model.get_matches()

    assert isinstance(matches, pd.DataFrame)
    assert matches.Similarity.mean() > 0.3
    assert len(matches) == 6
    assert list(matches.columns) == ['From', 'To', 'Similarity']

    results = model.transform(to_list)

    if method in ("TF-IDF", "EditDistance"):
        key = method
    else:
        key = list(results.keys())[0]

    assert isinstance(results[key], pd.DataFrame)
    assert results[key].Similarity.sum() > 0
Example #6
def test_grouper_same_list():
    model = PolyFuzz("TF-IDF").match(from_list, from_list)
    model.group(link_min_similarity=0.75, group_all_strings=True)
    matches = model.get_matches()

    assert isinstance(matches, pd.DataFrame)
    assert matches.Similarity.mean() > 0.3
    assert len(matches) == 6
    assert list(matches.columns) == ['From', 'To', 'Similarity', 'Group']

    assert model.get_clusters() == {1: ['apples', 'apple', 'appl']}
    assert model.get_cluster_mappings() == {'apples': 1, 'apple': 1, 'appl': 1}
Example #7
def test_grouper(method):
    model = PolyFuzz(method).match(from_list, to_list)
    model.group(link_min_similarity=0.75)
    matches = model.get_matches()

    assert isinstance(matches, pd.DataFrame)
    assert matches.Similarity.mean() > 0.3
    assert len(matches) == 6
    assert list(matches.columns) == ['From', 'To', 'Similarity', 'Group']

    assert model.get_clusters() == {1: ['apples', 'apple']}
    assert model.get_cluster_mappings() == {'apples': 1, 'apple': 1}
Example #8
from polyfuzz import PolyFuzz

# Alternative input: URL lists (shadowed by the fruit lists below in the original snippet)
# from_list = ["https://www.tatielou.co.uk/apples/sadasda", "https://www.tatielou.co.uk/oranges/sadasda"]
# to_list = ["https://www.tatielou.co.uk/apples/", "https://www.tatielou.co.uk/oranges/", "https://www.tatielou.co.uk/pears/"]

from_list = ["apple", "apples", "appl", "recal", "house", "similarity"]
to_list = ["apple", "apples", "mouse"]
model = PolyFuzz("EditDistance")
model.match(from_list, to_list)
# Auto map by Similarity scores
model.get_matches()
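get_matches() returns a pandas DataFrame with one row per entry in from_list and the columns ['From', 'To', 'Similarity'] (the tests above assert exactly this shape), so the result can be sorted or filtered directly:

matches = model.get_matches()
print(matches.sort_values("Similarity", ascending=False))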
Example #9
    # Keep tweets with more than 3 words
    relevant_tweetir = relevant_tweetir[relevant_tweetir[
        "tweets.full_text"].str.split().apply(lambda x: len(x) > 3)]
    # Add period to the end of sentences
    relevant_tweetir["tweets.full_text"] = relevant_tweetir[
        "tweets.full_text"].apply(punctuate_sent)
    relevant_tweetir["tweets.full_text"] = relevant_tweetir[
        "tweets.full_text"].str.lower()

    print("\nRemove repeated retweets... >80% fast text similarity")
    fasttext_embeddings = WordEmbeddings('en-crawl')
    fasttext = Embeddings(fasttext_embeddings,
                          min_similarity=0,
                          model_id="FastText")
    model = PolyFuzz(fasttext)

    start = time.time()
    indexes_to_remove = []
    for topic in relevant_tweetir["topic"].unique():
        topic_tweets = relevant_tweetir.loc[relevant_tweetir["topic"] == topic,
                                            "tweets.full_text"]
        for index, tweet in topic_tweets.items():
            indexes = topic_tweets.index[topic_tweets.index != index]
            for ind in indexes:
                model.match(tweet.split(), topic_tweets.loc[ind].split())
                mean_sim = round(model.get_matches()["Similarity"].mean(), 2)
                if mean_sim > 0.8:
                    indexes_to_remove.append(ind)
                    break
    relevant_tweetir = relevant_tweetir[~relevant_tweetir.index.isin(indexes_to_remove)]
Example #10
else:
    c50.warning(
        f"""
                👹 **Oh! What the Fuzz!** It seems that the crawl you uploaded was not the one I was looking for!
                Currently, I only accept Screaming Frog's internal_all.csv file, but I'm planning to add more crawlers in the future, namely OnCrawl, DeepCrawl and SiteBulb!
                Check out [where to find it](https://i.imgur.com/HavO4d6.png)
                """
    )

    st.stop()

dfIndexable = GSCDf.loc[GSCDf["Indexability"] == "Indexable"]
col_one_list = GSCDf["Address"].tolist()


model = PolyFuzz("EditDistance")
SCHEMES = ("http://", "https://")

if start_execution and (uploaded_file is None):
    c50.warning("Please upload a file first!")
    st.stop()

else:

    model.match(linesList, col_one_list)
    Polyfuzz = model.get_matches()  # Auto map by Similarity scores
    Polyfuzz.columns = ["URL to map", "URL match", "Similarity"]
    Polyfuzz.index = Polyfuzz.index + 1

Example #11
def test_multiple_models():
    tfidf_matcher = TFIDF(n_gram_range=(3, 3),
                          min_similarity=0,
                          model_id="TF-IDF")
    tfidf_large_matcher = TFIDF(n_gram_range=(3, 6), min_similarity=0)
    base_edit_matcher = EditDistance(n_jobs=1)
    ratio_matcher = EditDistance(n_jobs=1, scorer=fuzz.ratio)
    rapidfuzz_matcher = RapidFuzz(n_jobs=1)
    matchers = [
        tfidf_matcher, tfidf_large_matcher, base_edit_matcher, ratio_matcher,
        rapidfuzz_matcher
    ]

    model = PolyFuzz(matchers).match(from_list, to_list)

    # Test if correct matches are found
    for model_id in model.get_ids():
        assert model_id in model.get_matches().keys()
        assert isinstance(model.get_matches(model_id), pd.DataFrame)
    assert len(model.get_matches()) == len(matchers)

    # Test if error is raised when accessing clusters before creating them
    with pytest.raises(ValueError):
        model.get_clusters()

    with pytest.raises(ValueError):
        model.get_cluster_mappings()

    # Test if groupings are found
    model.group()
    for model_id in model.get_ids():
        assert model_id in model.get_cluster_mappings().keys()
    assert len(model.get_cluster_mappings()) == len(matchers)
Example #12
    sleep(crawl_delay)

df_final['Status Code'] = my_list

# drop URLs that are already redirected
df_final = df_final[~df_final["Status Code"].isin(["301", "302"])]

print("Automatically Mapping URLs ..")

# create lists from dfs
df_final_list = list(df_final["Address"])
df_sf_list = list(df_sf["Address"])

# instantiate PolyFuzz model, choose TF-IDF as the similarity measure and match the two lists.
model = PolyFuzz("TF-IDF").match(df_final_list, df_sf_list)

# make the polyfuzz dataframe
df_matches = model.get_matches()

count_row = df_final.shape[0]
print("Total Opportunity:", count_row, "URLs")

df_stats = pd.merge(df_matches,
                    df_final,
                    left_on="From",
                    right_on="Address",
                    how="inner")

# sort on similarity
df_stats = df_stats.sort_values(by="Similarity", ascending=False)
Example #13
def test_wrongbase_model(method):
    with pytest.raises(ValueError):
        model = PolyFuzz(method).match(from_list, to_list)
Example #14
def similaritypolymain(pTrainData, pTestData, pAsg, pDesc, pDate, Nbest):
    try:
        pTrainData = pTrainData[pTrainData[pDesc].notna()]
        pTestData = pTestData[pTestData[pDesc].notna()]
        pTestData['Assignee_Group_Pred'], pTestData['Confidence_Level'] = 'NaN', 0.0
        pTrainDataDesc = pd.DataFrame(pTrainData[pDesc])
        pFeaList = []
        pFeaList = pTrainData['Features'].tolist() + pTestData['Features'].tolist()
        pFeaUnqList = list(set(pFeaList))   
        pMatchData, pData, pTestAppendDf = [], [], []
        pMatchesDf, pTestMatchData, pTestDf = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
        for i in range(len(pFeaUnqList)):
            ToData, FromData = pd.DataFrame(), pd.DataFrame()
            FromData = pTrainData.loc[pTrainData['Features'] == pFeaUnqList[i]]
            ToData = pTestData.loc[pTestData['Features'] == pFeaUnqList[i]]
            model = PolyFuzz("TF-IDF")
            pTestAppendDf.append(ToData)
            if len(ToData[pDesc]) >= 1 and len(FromData[pDesc]) >= 1:
                model.match(list(ToData[pDesc].values), FromData[pDesc].unique().tolist(), nbest=int(Nbest))
                Matches = model.get_matches()
                pMatchData.append(Matches)
                pData.append(ToData)              
            
        pMatchesDf = pd.concat(pMatchData)
        pTestMatchData = pd.concat(pData) 
        pTestDf = pd.concat(pTestAppendDf)
        pMatchesDf.reset_index(drop=True, inplace=True)
        pTestMatchData.reset_index(drop=True, inplace=True)
        pTestDf.reset_index(drop=True, inplace=True)

        pTestConcatData = pd.concat([pTestMatchData, pMatchesDf], axis=1)
        
        IntCol = ["To"]
        for i in range(1, int(Nbest)-1):
            IntCol.append("BestMatch" + "__" + str(i))
            pTestMatchData['Assignee_Group_Pred' + '__' + str(i)] = 'NaN'

        SimCol = ['Similarity']
        for k in range(1, int(Nbest) - 1):
            SimCol.append("Similarity" + "__" + str(k))
            pTestMatchData['Confidence_Level'+ '__' + str(k)] = 'NaN'
            
        # Hoisted out of the loop: the original re-initialised this list on every
        # branch, discarding frames appended on earlier iterations
        pTestAppendFea = []
        for i in range(len(IntCol)):
            col = str(IntCol[i])
            if col != "To":
                for p in range(len(pFeaUnqList)):
                    pTrainFeaData = pTrainData.loc[pTrainData['Features'] == pFeaUnqList[p]]
                    pTestFeaData = pTestConcatData.loc[pTestConcatData['Features'] == pFeaUnqList[p]].copy()
                    pTestFeaData.reset_index(drop=True, inplace=True)
                    if len(pTestFeaData) > 0 and len(pTrainFeaData) > 0:
                        for j in range(len(pTestFeaData)):
                            # Index the per-feature slice, not the global pMatchesDf,
                            # so j refers to the same row in test data and matches
                            if pTestFeaData[col][j] is not None:
                                matched = pTrainFeaData.loc[pTrainFeaData[pDesc] == pTestFeaData[col][j], pAsg].values
                                if len(matched) != 0:
                                    pTestFeaData.loc[j, 'Assignee_Group_Pred' + '__' + str(i - 1)] = matched[0]
            else:
                for p in range(len(pFeaUnqList)):
                    pTrainFeaData = pTrainData.loc[pTrainData['Features'] == pFeaUnqList[p]]
                    pTestFeaData = pTestConcatData.loc[pTestConcatData['Features'] == pFeaUnqList[p]].copy()
                    pTestFeaData.reset_index(drop=True, inplace=True)
                    if len(pTestFeaData) > 0 and len(pTrainFeaData) > 0:
                        for j in range(len(pTestFeaData)):
                            if pTestFeaData[col][j] is not None:
                                matched = pTrainFeaData.loc[pTrainFeaData[pDesc] == pTestFeaData[col][j], pAsg].values
                                pTestFeaData.loc[j, 'Assignee_Group_Pred'] = matched[0] if len(matched) != 0 else None
                        pTestAppendFea.append(pTestFeaData)
        pTestFeaDf = pd.concat(pTestAppendFea)
        pTestFeaDf.reset_index(drop=True, inplace=True)
        
        pTestDf.loc[pTestDf['Number'].isin(pTestFeaDf['Number']), ['Confidence_Level', 'Assignee_Group_Pred']] = pTestFeaDf[['Similarity', 'Assignee_Group_Pred']].values
        
    except Exception as e:
        print('*** ERROR[004]: Error in similarity poly main function: ', sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        return -1
    return 0, pTestDf
Example #15
if RadioMapTo == "To crawled URL":
    col_one_list = GSCDf["Address"].tolist()
else:
    col_one_list = GSCDf["Title 1"].tolist()
    # col_one_listURL = GSCDf["Address"].tolist()


if RadioMapAgainst == "all crawled URLs":
    GSCDf = dfIndexable

model = PolyFuzz("EditDistance")


gif_path = "mouse.gif"

if start_execution:

    ########## GIF NOT WORKING (UNBOUND)

    # gif_runner = st.image(gif_path)

    c1, c2, c3 = st.beta_columns([5, 5, 5])

    with c2:

        # gif_runner = st.image("mouse.gif")
Example #16
class Matcher:

    def __init__(self):

        # Load PolyFuzz model for matching. Default: TF-IDF
        self.model = PolyFuzz(config.MODEL_MATCHING)

        # Load the filters
        self.filters: Dict[str, List[Filter]] = self.__load_filters()

    @staticmethod
    def __load_filters() -> dict:
        """
        Load the filters from filters.toml (by default), create Filter
        objects, and return a dictionary of these objects classified by
        intent.
        """
        filters = {}

        # Load the raw filter
        toml_file = toml.load(config.FILTERS_TOML, _dict=dict)

        # Loop over each intent
        for intent, raw_filters in toml_file.items():
            filter_list = []

            # Loop over each filter in this intent
            for name, content in raw_filters.items():

                # Create and append a Filter object
                filter_list.append(
                    Filter(
                        name=name,
                        words=content['words'],
                        regex=content['regex'],
                        threshold=content['threshold']
                    )
                )

            # Save the filters to the main dictionary
            filters[intent] = filter_list

        return filters

    def get_keywords(self, text: str, intent: str) -> dict:

        keywords = {}
        if intent in self.filters:

            # Split the text into a list of words
            entries = text.split(" ")

            for filter_ in self.filters[intent]:

                # Match similarities between the filter words and the given text
                self.model.match(entries, filter_.words)
                matches: pd.DataFrame = self.model.get_matches()

                try:
                    # Get the word with the maximum similarity
                    thresholds = matches[matches['Similarity'] >= filter_.threshold]
                    keyword = thresholds[thresholds['Similarity'] == thresholds['Similarity'].max()].iloc[0, 0]

                except Exception:
                    # If there's no match, set the filter as None
                    keywords[filter_.name] = None

                else:
                    # Use the keyword to retrieve and save its chained-data
                    if result := re.search(filter_.regex % keyword, text):
                        keywords[filter_.name] = result.group(filter_.name)

                    else:
                        keywords[filter_.name] = None

        return keywords
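__load_filters above assumes filters.toml groups named filters under intents, each carrying words, regex, and threshold keys. A hypothetical illustration of the parsed structure (the intent, filter name, and values here are all invented):

# What toml.load(config.FILTERS_TOML, _dict=dict) could return for a
# hypothetical filters.toml with one "weather" intent and one "city" filter
toml_file = {
    "weather": {
        "city": {
            "words": ["city", "town"],
            "regex": r"%s (?P<city>\w+)",
            "threshold": 0.75,
        },
    },
}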
Example #17
def main(n_actors, news_kg, t_actors, tweets_kg, verbose=True):
    print("\n1. REMOVING STOPWORDS...")
    n_actors, news_kg = remove_stopwords(n_actors, news_kg)
    t_actors, tweets_kg = remove_stopwords(t_actors, tweets_kg)

    print("\n2. CLUSTER SIMILARITY...")
    start = time.time()

    embeddings = TransformerWordEmbeddings("bert-base-multilingual-cased")
    bert = Embeddings(embeddings, min_similarity=0, model_id="BERT")
    model_bert = PolyFuzz(bert)

    paired_clusters, non_paired_clusters = cluster_similarity(
        model_bert, n_actors, t_actors)

    end = time.time()
    print(f"Computation time - {round(end - start, 2)} seconds\n")

    if verbose:
        print("\nSimilar clusters (news vs. tweets):")
        print(paired_clusters)
        print("\nClusters with no similar pair (only news):")
        print(non_paired_clusters)

    print("\n3. MAPPING ACTORS TO CLUSTER VALUES...")
    news_kg = map_news_in_kg(news_kg, paired_clusters, non_paired_clusters)
    tweets_kg = map_tweets_in_kg(tweets_kg, paired_clusters, t_actors)

    news_kg["Redge1"] = news_kg["Redge1"].str.lower()
    news_kg["Redge2"] = news_kg["Redge2"].str.lower()
    news_kg["node1"] = news_kg["node1"].str.lower()

    tweets_kg["Redge1"] = tweets_kg["Redge1"].str.lower()
    tweets_kg["Redge2"] = tweets_kg["Redge2"].str.lower()
    tweets_kg["node1"] = tweets_kg["node1"].str.lower()

    if verbose:
        print("\nExisting triples...")
        print("NEWS:")
        # print(news_kg[["Redge1", "node1", "Redge2"]])
        print(news_kg)
        print("\nTWEETS:")
        # print(tweets_kg[["Redge1", "node1", "Redge2"]])
        print(tweets_kg)

    print("\n4. COMPARE TRIPLES...")
    fasttext_embeddings = WordEmbeddings('en-crawl')
    fasttext = Embeddings(fasttext_embeddings,
                          min_similarity=0,
                          model_id="FastText")
    leven_dist = RapidFuzz(n_jobs=1, model_id="leven")

    model_names = ["BERT", "FastText", "leven"]
    models = [bert, fasttext, leven_dist]
    model = PolyFuzz(models)

    start = time.time()
    paired_KG_evaluation = pd.DataFrame()
    non_paired_KG = pd.DataFrame()
    for cluster in paired_clusters.itertuples():
        news_triples = news_kg[news_kg["edge1"] == cluster.news_key]
        tweets_triples = tweets_kg[tweets_kg["edge1"] == cluster.tweets_key]
        if tweets_triples.shape[0] == 0:
            tweets_triples = tweets_kg[tweets_kg["edge2"] ==
                                       cluster.tweets_key]
        if tweets_triples.shape[0] == 0:
            non_paired_KG = non_paired_KG.append(tweets_triples,
                                                 ignore_index=True)
            continue
        paired_KG_evaluation = paired_KG_evaluation.append(triples_evaluation(
            model, news_triples, tweets_triples, model_names),
                                                           ignore_index=True)

    if verbose:
        print("\nPAIRED CLUSTERS TRIPLES EVALUATION...")
        print(paired_KG_evaluation)

    end = time.time()
    print(f"\nComputation time - {round(end - start, 2)} seconds")

    # non-paired KG evaluation (news triples for which no similar ones are found in the tweets)
    print("\nNON PAIRED CLUSTERS TRIPLES EVALUATION...")
    non_paired_KG = non_paired_KG.append(
        news_kg[~news_kg["edge1"].isin(paired_clusters["news_key"])],
        ignore_index=True)
    non_paired_KG_evaluation = triples_evaluation(model, non_paired_KG,
                                                  tweets_kg, model_names)
    print(non_paired_KG_evaluation)

    print("\n5. FINAL KG EVALUATION...")
    final_evaluation = paired_KG_evaluation.append(non_paired_KG_evaluation,
                                                   ignore_index=True)
    print(final_evaluation)
    final_leven = round(final_evaluation['leven'].mean(), 3)
    final_fasttext = round(final_evaluation['FastText'].mean(), 3)
    final_bert = round(final_evaluation['BERT'].mean(), 3)
    final_rouge1 = round(final_evaluation['rouge1'].mean(), 3)
    print(
        f"\nMEAN LEVEN SIMILARITY BETWEEN NEWS AND TWEETS TRIPLES - {final_leven}"
    )
    print(
        f"\nMEAN FAST TEXT SIMILARITY BETWEEN NEWS AND TWEETS TRIPLES - {final_fasttext}"
    )
    print(
        f"\nMEAN BERT SIMILARITY BETWEEN NEWS AND TWEETS TRIPLES - {final_bert}"
    )
    print(
        f"\nMEAN ROUGE1 F1-SCORE BETWEEN NEWS AND TWEETS TRIPLES - {final_rouge1}"
    )

    return final_evaluation, {
        "leven": final_leven,
        "FastText": final_fasttext,
        "BERT": final_bert,
        "ROUGE1": final_rouge1
    }