Пример #1
0
def test_multiple_models():
    """Run five matchers through one PolyFuzz instance.

    Checks that every model id exposes a DataFrame of matches, that cluster
    accessors raise before ``group()`` is called, and that grouping yields
    one cluster mapping per matcher.
    """
    matchers = [
        TFIDF(n_gram_range=(3, 3), min_similarity=0, model_id="TF-IDF"),
        TFIDF(n_gram_range=(3, 6), min_similarity=0),
        EditDistance(n_jobs=1),
        EditDistance(n_jobs=1, scorer=fuzz.ratio),
        RapidFuzz(n_jobs=1),
    ]
    expected_count = len(matchers)

    model = PolyFuzz(matchers).match(from_list, to_list)

    # Each registered id must map to its own frame of matches
    for model_id in model.get_ids():
        assert model_id in model.get_matches()
        assert isinstance(model.get_matches(model_id), pd.DataFrame)
    assert len(model.get_matches()) == expected_count

    # Cluster accessors must refuse to answer before group() has run
    for accessor in (model.get_clusters, model.get_cluster_mappings):
        with pytest.raises(ValueError):
            accessor()

    # After grouping, every id owns a cluster mapping
    model.group()
    for model_id in model.get_ids():
        assert model_id in model.get_cluster_mappings()
    assert len(model.get_cluster_mappings()) == expected_count
Пример #2
0
def test_base_model(method):
    """Match with one built-in method and validate the resulting frame."""
    result = PolyFuzz(method).match(from_list, to_list).get_matches()

    # One row per entry of from_list, with the three standard columns
    assert isinstance(result, pd.DataFrame)
    assert result.Similarity.mean() > 0.3
    assert len(result) == 6
    assert list(result.columns) == ['From', 'To', 'Similarity']
Пример #3
0
def test_custom_model():
    """A user-defined matcher (``MyModel``) must plug into PolyFuzz."""
    result = PolyFuzz(MyModel()).match(from_list, to_list).get_matches()

    assert isinstance(result, pd.DataFrame)
    # A custom scorer only has to produce some positive similarity
    assert result.Similarity.mean() > 0.0
    assert len(result) == 6
    assert list(result.columns) == ['From', 'To', 'Similarity']
Пример #4
0
def test_grouper_same_list():
    """Match a list against itself, group all strings, check the cluster."""
    model = PolyFuzz("TF-IDF").match(from_list, from_list)
    model.group(link_min_similarity=0.75, group_all_strings=True)
    frame = model.get_matches()

    # Grouping appends a 'Group' column to the matches
    assert isinstance(frame, pd.DataFrame)
    assert frame.Similarity.mean() > 0.3
    assert len(frame) == 6
    assert list(frame.columns) == ['From', 'To', 'Similarity', 'Group']

    members = ['apples', 'apple', 'appl']
    assert model.get_clusters() == {1: members}
    assert model.get_cluster_mappings() == {word: 1 for word in members}
Пример #5
0
def test_grouper(method):
    """Group matches at a 0.75 link threshold and verify the one cluster."""
    model = PolyFuzz(method).match(from_list, to_list)
    model.group(link_min_similarity=0.75)
    frame = model.get_matches()

    # Grouping appends a 'Group' column to the matches
    assert isinstance(frame, pd.DataFrame)
    assert frame.Similarity.mean() > 0.3
    assert len(frame) == 6
    assert list(frame.columns) == ['From', 'To', 'Similarity', 'Group']

    members = ['apples', 'apple']
    assert model.get_clusters() == {1: members}
    assert model.get_cluster_mappings() == {word: 1 for word in members}
def similaritypolymain(pTrainData, pTestData, pLevel1, pLevel2, pDesc, pFromDir, pToDir, Nbest):
    """Predict an intent and a confidence level for every test description.

    Test descriptions are matched against the unique training descriptions
    with a TF-IDF PolyFuzz model. For every matched description, the intent
    of the first training row carrying that description is copied onto the
    test row, together with the match similarity.

    Args:
        pTrainData: training DataFrame; must contain ``pDesc`` and, once
            ``traindata`` has processed it, an ``'Intent'`` column.
        pTestData: test DataFrame; must contain ``pDesc``.
        pLevel1, pLevel2: forwarded unchanged to ``traindata``.
        pDesc: name of the description column in both frames.
        pFromDir, pToDir: forwarded to ``traindata`` and, on failure, to
            ``utils.movefile``.
        Nbest: converted with ``int()`` and passed to ``model.match`` as
            ``nbest``; also controls how many extra ``BestMatch__i`` /
            ``Similarity__i`` columns are read.
            NOTE(review): ``nbest`` does not appear to be part of the stock
            PolyFuzz ``match`` signature - presumably a patched build.

    Returns:
        ``(0, frame)`` on success, where ``frame`` holds the rows of
        ``pTestData`` with a non-null description (original index kept) plus
        ``'Intent'``, ``'Confidence_Level'`` and, for each extra rank ``i`` in
        ``range(1, int(Nbest) - 1)``, ``'Intent__i'`` and
        ``'Confidence_Level__i'``. Rows without a match keep the string
        placeholders ``'Nan'`` / ``'NaN'``.
        ``-1`` on any error (the error is printed and ``utils.movefile`` is
        called first).
    """
    try:
        pTrainData = pTrainData[pTrainData[pDesc].notna()]
        # Work on an explicit copy: columns are added below and must not be
        # written through a filtered slice (avoids SettingWithCopyWarning).
        pTestData = pTestData[pTestData[pDesc].notna()].copy()
        pTestData['Intent'], pTestData['Confidence_Level'] = 'Nan', 'Nan'
        pTrainData, __ = traindata(pTrainData, pDesc, pLevel1, pLevel2, pFromDir, pToDir)
        pTrainDataDescUnq = pTrainData[pDesc].unique().tolist()
        pTestDataDescList = list(pTestData[pDesc].values)  # PolyFuzz expects plain lists
        model = PolyFuzz("TF-IDF")
        model.match(pTestDataDescList, pTrainDataDescUnq, nbest=int(Nbest))
        pMatchesDf = model.get_matches()

        # NOTE(review): the upper bound int(Nbest) - 1 is kept from the
        # original; confirm it matches the columns the matcher really emits.
        pExtraRanks = range(1, int(Nbest) - 1)
        IntCol = ["To"] + ["BestMatch" + "__" + str(i) for i in pExtraRanks]
        SimCol = ["Similarity"] + ["Similarity" + "__" + str(k) for k in pExtraRanks]

        # Create the placeholder columns in the same order as before so the
        # column layout of the returned frame does not change.
        for i in pExtraRanks:
            pTestData['Intent' + '__' + str(i)] = 'NaN'
        for k in pExtraRanks:
            pTestData['Confidence_Level' + '__' + str(k)] = 'NaN'

        # description -> intent of the FIRST training row with that
        # description (same result as the former per-row boolean scan).
        pIntentByDesc = {}
        for desc, intent in zip(pTrainData[pDesc], pTrainData['Intent']):
            pIntentByDesc.setdefault(desc, intent)

        for rank, (pMatchCol, pScoreCol) in enumerate(zip(IntCol, SimCol)):
            # Rank 0 feeds the un-suffixed columns; rank i feeds the "__i"
            # columns created above. The original wrote to "__(i-1)", which
            # was never created, and built one name with str + int.
            pSuffix = '' if rank == 0 else '__' + str(rank)
            pPlaceholder = 'Nan' if rank == 0 else 'NaN'

            # Values are assigned by POSITION (match row n belongs to test
            # row n), so gaps in pTestData's index after the notna filter
            # can no longer misalign or enlarge the frame.
            # A matched description missing from the training data raises
            # KeyError here and is reported through the handler below.
            pIntents = [
                pPlaceholder if matched is None else pIntentByDesc[matched]
                for matched in pMatchesDf[pMatchCol].tolist()
            ]
            pScores = [
                pPlaceholder if score is None else score
                for score in pMatchesDf[pScoreCol].tolist()
            ]
            pTestData['Intent' + pSuffix] = pd.Series(
                pIntents, index=pTestData.index, dtype=object)
            pTestData['Confidence_Level' + pSuffix] = pd.Series(
                pScores, index=pTestData.index, dtype=object)

    except Exception as e:
        print('*** ERROR[004]: Error in similarity poly main function: ', sys.exc_info()[0],str(e))
        print(traceback.format_exc())
        utils.movefile(pFromDir, pToDir)
        return(-1)
    return(0, pTestData)
Пример #7
0
def test_fit_model(method):
    """Fit on two lists, then transform unseen strings with the fitted model."""
    model = PolyFuzz(method).fit(from_list, to_list)
    fitted = model.get_matches()

    assert isinstance(fitted, pd.DataFrame)
    assert fitted.Similarity.mean() > 0.3
    assert len(fitted) == 6
    assert list(fitted.columns) == ['From', 'To', 'Similarity']

    transformed = model.transform(to_list)

    # The two built-in string methods are looked up under their own name;
    # any other matcher falls back to the first key of the result mapping.
    key = next(
        (name for name in ("TF-IDF", "EditDistance") if method == name),
        None,
    )
    if key is None:
        key = list(transformed.keys())[0]

    assert isinstance(transformed[key], pd.DataFrame)
    assert transformed[key].Similarity.sum() > 0
Пример #8
0
def similaritypolymain(pTrainData, pTestData, pAsg, pDesc, pDate, Nbest):
    """Predict an assignee group for test rows by TF-IDF matching per feature.

    For every distinct value of the ``'Features'`` column, the test
    descriptions with that value are matched (PolyFuzz, TF-IDF) against the
    unique training descriptions with the same value. The ``pAsg`` value of
    the first training row whose description equals the match is written to
    ``'Assignee_Group_Pred'`` and the match similarity to
    ``'Confidence_Level'``.

    Args:
        pTrainData: training DataFrame; the code reads the columns ``pDesc``,
            ``pAsg`` and ``'Features'``.
        pTestData: test DataFrame; the code reads ``pDesc``, ``'Features'``
            and ``'Number'``.
        pAsg: name of the assignee-group column in ``pTrainData``.
        pDesc: name of the description column in both frames.
        pDate: not referenced anywhere in this function.
        Nbest: converted with ``int()`` and passed to ``model.match`` as
            ``nbest``.
            NOTE(review): ``nbest`` does not appear to be part of the stock
            PolyFuzz ``match`` signature - presumably a patched build.

    Returns:
        ``(0, pTestDf)`` on success, ``-1`` after printing the error on any
        exception.
    """
    try:
        # Keep only rows that actually have a description
        pTrainData = pTrainData[pTrainData[pDesc].notna()]
        pTestData = pTestData[pTestData[pDesc].notna()]
        # Default prediction / confidence for rows that never get a match
        pTestData['Assignee_Group_Pred'], pTestData['Confidence_Level'] = 'Nan', float(0.0)
        # NOTE(review): pTrainDataDesc is never used after this line
        pTrainDataDesc = pd.DataFrame(pTrainData[pDesc])
        pFeaList = []
        # Every feature value seen in either frame, de-duplicated
        pFeaList = pTrainData['Features'].tolist() + pTestData['Features'].tolist()
        pFeaUnqList = list(set(pFeaList))   
        pMatchData, pData, pTestAppendDf,  = [], [], []
        pMatchesDf, pTestMatchData, pTestDf = pd.DataFrame(),pd.DataFrame(), pd.DataFrame()
        for i in range(len(pFeaUnqList)):
            ToData, FromData = pd.DataFrame(), pd.DataFrame()
            FromData = pTrainData.loc[pTrainData['Features'] == pFeaUnqList[i]]
            ToData = pTestData.loc[pTestData['Features'] == pFeaUnqList[i]]
            model = PolyFuzz("TF-IDF")
            # pTestAppendDf collects EVERY test subset, matched or not
            pTestAppendDf.append(ToData)
            # Parsed as: len(test) and (len(train) >= 1), i.e. both subsets non-empty
            if len(ToData[pDesc].tolist()) and len(FromData[pDesc].tolist()) >= 1:
                model.match(list(ToData[pDesc].values), FromData[pDesc].unique().tolist(), nbest = int(Nbest))
                Matches = model.get_matches()
                # pMatchData / pData grow in lockstep: matches and the test rows they belong to
                pMatchData.append(Matches)
                pData.append(ToData)              
            
        pMatchesDf = pd.concat(pMatchData)
        pTestMatchData = pd.concat(pData) 
        pTestDf = pd.concat(pTestAppendDf)
        # Replace the concatenated indexes with a fresh 0..n-1 range
        pMatchesDf.reset_index(inplace=True)
        del pMatchesDf['index']
        pTestMatchData.reset_index(inplace=True)
        del pTestMatchData['index']    
        pTestDf.reset_index(inplace=True)
        del pTestDf['index']        
        
        # Side-by-side join; relies on both frames having the same row order
        # after the resets above
        pTestConcatData = pd.concat([pTestMatchData,pMatchesDf], axis = 1)
        
        # NOTE(review): the extra columns below are added to pTestMatchData
        # AFTER pTestConcatData was built from it, so pTestConcatData does
        # not contain them.
        IntCol = ["To"]
        for i in range(1, int(Nbest)-1):
            IntCol.append("BestMatch" + "__" + str(i))
            pTestMatchData['Assignee_Group_Pred' + '__' + str(i)] = 'NaN'

        SimCol = ['Similarity']
        for k in range(1, int(Nbest) - 1):
            SimCol.append("Similarity" + "__" + str(k))
            pTestMatchData['Confidence_Level'+ '__' + str(k)] = 'NaN'
            
        for i in range(len(IntCol)):
            col = str(IntCol[i])
            if col != "To":
                # NOTE(review): pTestAppendFea is reset here and never appended
                # to in this branch; when IntCol has more than one entry the
                # last loop pass leaves it empty and the pd.concat after the
                # loop presumably fails - confirm.
                pTestAppendFea = []
                for p in range(len(pFeaUnqList)):
                    pTrainFeaData, pTestFeaData = pd.DataFrame(), pd.DataFrame()
                    pTrainFeaData = pTrainData.loc[pTrainData['Features'] == pFeaUnqList[p]]
                    pTestFeaData = pTestConcatData.loc[pTestConcatData['Features'] == pFeaUnqList[p]]
                    pTestFeaData.reset_index(inplace=True)
                    del pTestFeaData['index'] 
                    if len(pTestFeaData) and len(pTrainFeaData)> 0: 
                        for j in range(len(pTestFeaData)):
                            # NOTE(review): j indexes the per-feature frame, but this
                            # check reads the global pMatchesDf - the "To" branch below
                            # reads pTestFeaData instead.
                            if pMatchesDf[col][j] != None:
                                if len(pTrainFeaData[np.where(pTrainFeaData[pDesc] == pTestFeaData[IntCol[i]][j], True , False)][pAsg].values) != 0:
                                    # NOTE(review): target suffix is i-1, whereas the
                                    # placeholder columns above use suffix i.
                                    pTestFeaData['Assignee_Group_Pred' + '__' + str(i-1)][j] = pTrainFeaData[np.where(pTrainFeaData[pDesc] == pTestFeaData[col][j], True , False)][pAsg].values[0]
            else:
                pTestAppendFea = []
                for p in range(len(pFeaUnqList)):
                    pTrainFeaData, pTestFeaData = pd.DataFrame(), pd.DataFrame()
                    pTrainFeaData = pTrainData.loc[pTrainData['Features'] == pFeaUnqList[p]]
                    pTestFeaData = pTestConcatData.loc[pTestConcatData['Features'] == pFeaUnqList[p]]
                    pTestFeaData.reset_index(inplace=True)
                    del pTestFeaData['index'] 
                    if len(pTestFeaData) and len(pTrainFeaData)> 0: 
                        for j in range(len(pTestFeaData)):
                            if pTestFeaData[col][j] != None:
                                # Take pAsg from the first training row whose description
                                # equals the matched one; None when no such row exists.
                                # NOTE(review): chained assignment (frame[col][j] = ...)
                                # on a .loc slice - may trigger SettingWithCopyWarning.
                                if len(pTrainFeaData[np.where(pTrainFeaData[pDesc] == pTestFeaData[IntCol[i]][j], True , False)][pAsg].values) != 0:
                                    pTestFeaData['Assignee_Group_Pred'][j] = pTrainFeaData[np.where(pTrainFeaData[pDesc] == pTestFeaData[IntCol[i]][j], True , False)][pAsg].values[0]  
                                else:
                                    pTestFeaData['Assignee_Group_Pred'][j] =  None
                        pTestAppendFea.append(pTestFeaData)
        pTestFeaDf = pd.concat(pTestAppendFea)  
        pTestFeaDf.reset_index(inplace=True)
        del pTestFeaDf['index'] 
        
        # Copy similarity / prediction onto the rows of pTestDf whose 'Number'
        # occurs in pTestFeaDf. The right-hand side is a bare array, so values
        # are assigned by position - assumes both sides share the same row
        # order (TODO confirm).
        pTestDf.loc[pTestDf['Number'].isin(pTestFeaDf['Number']), ['Confidence_Level', 'Assignee_Group_Pred']] = pTestFeaDf[['Similarity', 'Assignee_Group_Pred']].values
        
    except Exception as e:
        print('*** ERROR[004]: Error in similarity poly main function: ', sys.exc_info()[0],str(e))
        print(traceback.format_exc())
        return(-1)
        # Unreachable: the return above always exits first
        sys.exit(-1)
    return(0, pTestDf)    
Пример #9
0
class Matcher:
    """Extract per-intent keywords from free text using PolyFuzz filters.

    Filters are loaded once from the TOML file named by
    ``config.FILTERS_TOML`` and grouped by intent. Each filter lists the
    words to look for, a similarity threshold, and a regex template that
    pulls the final value out of the text.
    """

    def __init__(self):

        # PolyFuzz model used for matching, selected by config.MODEL_MATCHING
        self.model = PolyFuzz(config.MODEL_MATCHING)

        # intent name -> list of Filter objects
        self.filters: Dict[str, List[Filter]] = self.__load_filters()

    @staticmethod
    def __load_filters() -> dict:
        """
        Load the filters from the TOML file named by ``config.FILTERS_TOML``.

        The file is read as ``{intent: {filter_name: {words, regex,
        threshold}}}``; every inner table becomes a ``Filter`` object.

        Returns:
            A dictionary mapping each intent to its list of ``Filter``
            objects.
        """
        toml_file = toml.load(config.FILTERS_TOML, _dict=dict)

        return {
            intent: [
                Filter(
                    name=name,
                    words=content['words'],
                    regex=content['regex'],
                    threshold=content['threshold']
                )
                for name, content in raw_filters.items()
            ]
            for intent, raw_filters in toml_file.items()
        }

    def get_keywords(self, text: str, intent: str) -> dict:
        """
        Find, for every filter of ``intent``, the value it captures in ``text``.

        The text is split on single spaces and matched against the filter's
        words. The first entry with the highest similarity at or above the
        filter's threshold is substituted into the filter's regex template
        (via ``%``), and the named group equal to the filter's name is
        captured from ``text``.

        Args:
            text: the sentence to analyse.
            intent: key into ``self.filters``.

        Returns:
            ``{filter_name: captured_value_or_None}`` for every filter of the
            intent; an empty dict when the intent has no filters.
        """
        keywords = {}
        if intent not in self.filters:
            return keywords

        # Split the text into a list of words
        entries = text.split(" ")

        for filter_ in self.filters[intent]:

            # Match similarities between the filter words and the given text
            self.model.match(entries, filter_.words)
            matches: pd.DataFrame = self.model.get_matches()

            # Keep only entries that reach the filter's threshold
            candidates = matches[matches['Similarity'] >= filter_.threshold]
            if candidates.empty:
                # No word is similar enough: the filter yields nothing
                keywords[filter_.name] = None
                continue

            # First column of the first best-scoring row is the matched word
            best = candidates[candidates['Similarity'] == candidates['Similarity'].max()]
            keyword = best.iloc[0, 0]

            # The keyword comes straight from the input text, so escape it:
            # tokens such as "c++" or "(" would otherwise be read as regex
            # syntax and either fail to compile or match the wrong span.
            result = re.search(filter_.regex % re.escape(keyword), text)
            keywords[filter_.name] = result.group(filter_.name) if result else None

        # The original built this mapping but never returned it
        return keywords
Пример #10
0
    # Three equal-width columns; only the middle one (c2) is used in the
    # lines visible here, which centres the placeholder image.
    c1, c2, c3 = st.beta_columns([5, 5, 5])

    with c2:

        # gif_runner = st.image("mouse.gif")
        # gif_runner = st.image(gif_path)
        # Show an image while matching runs; it is cleared again below
        gif_runner = st.image("mouse.gif")

        import time

        # Fixed two-second pause before the matching starts
        time.sleep(2)

        # Match every input line against the candidate list
        model.match(linesList, col_one_list)
        # model.match(linesDeduped2, col_one_list)
        # Auto map by Similarity scores
        Polyfuzz = model.get_matches()
        # Polyfuzz
        # Remove the image now that the matches are available
        gif_runner.empty()
    # st.stop()

    # model.match(linesList, col_one_list)
    ## Auto map by Similarity scores
    # Polyfuzz = model.get_matches()

    # Polyfuzz
    # st.stop()

    # Rename the three match columns for this radio-button combination.
    # NOTE(review): "Tite tag match" looks like a typo for "Title tag match";
    # it is a runtime label, so check downstream uses before changing it.
    if (RadioMapTo == "To crawled titles") and (RadioMapWhat == "Map Broken URLs"):
        # Polyfuzz = pd.concat([Polyfuzz.assign(name=i) for i in col_one_listURL], ignore_index=True)
        # Polyfuzz = Polyfuzz.assign(key=1).merge(pd.DataFrame({'Name':col_one_listURL,'key':1})).drop('key',1)
        Polyfuzz.columns = ["URL to map", "Tite tag match", "Similarity"]
Пример #11
0
# Discard URLs that already redirect (status "301" or "302")
df_final = df_final[~df_final["Status Code"].isin(["301", "302"])]

print("Automatically Mapping URLs ..")

# PolyFuzz works on plain lists, so pull the address columns out of the frames
df_final_list = list(df_final["Address"])
df_sf_list = list(df_sf["Address"])

# TF-IDF similarity between the URLs to map and the candidate URLs
model = PolyFuzz("TF-IDF").match(df_final_list, df_sf_list)

# One row per source URL with its best match and score
df_matches = model.get_matches()

count_row = len(df_final)
print(f"Total Opportunity: {count_row} URLs")

# Attach the original crawl data to every match, best matches first
df_stats = df_matches.merge(
    df_final,
    left_on="From",
    right_on="Address",
    how="inner",
)
df_stats = df_stats.sort_values(by="Similarity", ascending=False)

df_stats["Status Code"] = df_stats["Status Code"].astype(str)
Пример #12
0
    # Word-embedding matcher ('en-crawl' embeddings) with no similarity
    # floor, registered in PolyFuzz under the id "FastText".
    fasttext_embeddings = WordEmbeddings('en-crawl')
    fasttext = Embeddings(fasttext_embeddings,
                          min_similarity=0,
                          model_id="FastText")
    model = PolyFuzz(fasttext)

    start = time.time()
    indexes_to_remove = []
    # Within each topic, compare every tweet with every other tweet
    for topic in relevant_tweetir["topic"].unique():
        topic_tweets = relevant_tweetir.loc[relevant_tweetir["topic"] == topic,
                                            "tweets.full_text"]
        for index, tweet in topic_tweets.items():
            # All other tweets of the same topic
            indexes = topic_tweets.index[topic_tweets.index != index]
            for ind in indexes:
                # Word-by-word match between the two tweets; the mean of the
                # per-word similarities (rounded to 2 decimals) is the score
                model.match(tweet.split(), topic_tweets.loc[ind].split())
                mean_sim = round(model.get_matches()["Similarity"].mean(), 2)
                if mean_sim > 0.8:
                    # Mark the OTHER tweet for removal and stop comparing
                    # the current one.
                    # NOTE(review): every ordered pair is visited, so both
                    # members of a near-duplicate pair can end up in
                    # indexes_to_remove - confirm that is intended.
                    indexes_to_remove.append(ind)
                    break
    relevant_tweetir = relevant_tweetir[~relevant_tweetir.index.
                                        isin(indexes_to_remove)]
    end = time.time()

    print(indexes_to_remove)
    print(f"Computation time - {round(end - start, 2)} seconds")

    # Exact duplicates: drop_duplicates() leaves only first occurrences, the
    # assignment aligns on the index (repeated rows become NaN), and the
    # dropna below removes those rows.
    relevant_tweetir["tweets.full_text"] = relevant_tweetir[
        "tweets.full_text"].drop_duplicates()
    relevant_tweetir.dropna(subset=["tweets.full_text"], inplace=True)
    relevant_tweetir = remove_irrelevant_topics(relevant_tweetir,
                                                tweets_threshold=3)
Пример #13
0
# Rows flagged as "Indexable".
# NOTE(review): dfIndexable is not used in the lines visible here; the
# candidate list below is built from the unfiltered GSCDf - confirm.
dfIndexable = GSCDf.loc[GSCDf["Indexability"] == "Indexable"]
col_one_list = GSCDf["Address"].tolist()


# Edit-distance matcher for URL-to-URL mapping
model = PolyFuzz("EditDistance")
# Not referenced in the lines visible here
SCHEMES = ("http://", "https://")

# Stop when execution was requested without an uploaded file.
# NOTE(review): `st.wc50` is not a Streamlit attribute I can confirm -
# verify this call before relying on it.
if start_execution and (uploaded_file is None):
    st.wc50.warning("file 1st!")
    st.stop()

else:

    # Match every input line against the crawled addresses
    model.match(linesList, col_one_list)
    Polyfuzz = model.get_matches()  # Auto map by Similarity scores
    Polyfuzz.columns = ["URL to map", "URL match", "Similarity"]
    # Shift the index by one so the displayed rows start at 1
    Polyfuzz.index = Polyfuzz.index + 1

    #cmapRed = sns.diverging_palette(10, 133, as_cmap=True)
    #cmapRedBlue = sns.color_palette("vlag", as_cmap=True)
    # Green background gradient for the styled table
    cmGreen = sns.light_palette("green", as_cmap=True)
    FuzzStyled = Polyfuzz.style.background_gradient(cmap=cmGreen)

    # Render similarity as a percentage with one decimal
    format_dictionary = {
        "Similarity": "{:.1%}",
    }

    FuzzStyled = FuzzStyled.format(format_dictionary)

    c2 = st.beta_container()
Пример #14
0
from polyfuzz import PolyFuzz

#import polyfuzz


# URL sample data; superseded by the word lists assigned right after it
from_list, to_list = (
    [
        "https://www.tatielou.co.uk/apples/sadasda",
        "https://www.tatielou.co.uk/oranges/sadasda",
    ],
    [
        "https://www.tatielou.co.uk/apples/",
        "https://www.tatielou.co.uk/oranges/",
        "https://www.tatielou.co.uk/pears/",
    ],
)
# Word lists that are actually matched below
from_list, to_list = (
    ["apple", "apples", "appl", "recal", "house", "similarity"],
    ["apple", "apples", "mouse"],
)

# Edit-distance matching of every from_list entry against to_list
model = PolyFuzz("EditDistance")
model.match(from_list, to_list)

# Best match and similarity score per from_list entry
model.get_matches()