def test_multiple_models():
    """Run five matchers through one PolyFuzz instance and verify the
    per-model matches, the pre-grouping errors and the per-model cluster
    mappings."""
    matchers = [
        TFIDF(n_gram_range=(3, 3), min_similarity=0, model_id="TF-IDF"),
        TFIDF(n_gram_range=(3, 6), min_similarity=0),
        EditDistance(n_jobs=1),
        EditDistance(n_jobs=1, scorer=fuzz.ratio),
        RapidFuzz(n_jobs=1),
    ]
    expected_count = len(matchers)
    model = PolyFuzz(matchers).match(from_list, to_list)

    # Each model id must expose its own DataFrame of matches
    for model_id in model.get_ids():
        assert model_id in model.get_matches().keys()
        assert isinstance(model.get_matches(model_id), pd.DataFrame)
    assert len(model.get_matches()) == expected_count

    # Cluster accessors must raise until group() has been called
    for accessor in (model.get_clusters, model.get_cluster_mappings):
        with pytest.raises(ValueError):
            accessor()

    # After grouping, every model id must have a cluster mapping
    model.group()
    for model_id in model.get_ids():
        assert model_id in model.get_cluster_mappings().keys()
    assert len(model.get_cluster_mappings()) == expected_count
def test_ngrams(n_gram_low, n_gram_high):
    """Check that TFIDF with the given n-gram bounds returns a six-row
    DataFrame with the From/To/Similarity columns."""
    ngram_bounds = (n_gram_low, n_gram_high)
    result = TFIDF(n_gram_range=ngram_bounds).match(from_list, to_list)

    assert isinstance(result, pd.DataFrame)
    assert len(result) == 6
    assert list(result.columns) == ['From', 'To', 'Similarity']
def test_distance(method):
    """Check that TFIDF with the given cosine method returns a six-row
    DataFrame with a positive mean similarity and the expected columns."""
    result = TFIDF(cosine_method=method).match(from_list, to_list)

    assert isinstance(result, pd.DataFrame)
    mean_similarity = result.Similarity.mean()
    assert mean_similarity > 0.0
    assert len(result) == 6
    assert list(result.columns) == ['From', 'To', 'Similarity']
def group(self,
          model: Union[str, BaseMatcher] = None,
          link_min_similarity: float = 0.75):
    """ From the matches, group the `To` matches together using single linkage

    Arguments:
        model: the grouper to use. Either a matcher instance, one of the
               shortcut names ("TF-IDF"/"TFIDF", "EditDistance"/"Edit Distance",
               "Embeddings"/"Embedding"), or None to use TF-IDF.
        link_min_similarity: the minimum similarity between strings before they
                             are grouped in a single linkage fashion

    Updates:
        self.clusters: reset, then filled with the clusters per model name
        self.cluster_mappings: reset, then filled with the cluster id map per model name
        self.matches: passed through `self._map_groups`, which per the original
                      documentation adds a `Group` column that is the grouped
                      version of the `To` column

    Raises:
        ValueError: if `model` is a string that is not one of the shortcut names
    """
    check_matches(self)
    self.clusters = {}
    self.cluster_mappings = {}

    # Standard models - quick access.
    # The shortcut is resolved from the `model` argument itself, not from
    # `self.method`, so the grouper can differ from the matcher that was used.
    if isinstance(model, str):
        if model in ["TF-IDF", "TFIDF"]:
            model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity)
        elif model in ["EditDistance", "Edit Distance"]:
            model = RapidFuzz()
        elif model in ["Embeddings", "Embedding"]:
            model = Embeddings(min_similarity=link_min_similarity)
        else:
            raise ValueError(
                "Please instantiate the model with one of the following methods: \n"
                "* 'TF-IDF'\n"
                "* 'EditDistance'\n"
                "* 'Embeddings'\n"
                "* Or None if you want to automatically use TF-IDF")

    # Use TF-IDF if no model is specified
    elif not model:
        model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity)

    for name, match in self.matches.items():
        # Compare the unique, non-missing `To` strings against themselves
        strings = list(match.To.dropna().unique())
        matches = model.match(strings, strings)
        clusters, cluster_id_map, cluster_name_map = single_linkage(
            matches, link_min_similarity)
        self._map_groups(name, cluster_name_map)
        self.clusters[name] = clusters
        self.cluster_mappings[name] = cluster_id_map
def group(self,
          model: Union[str, BaseMatcher] = None,
          link_min_similarity: float = 0.75,
          group_all_strings: bool = False):
    """ From the matches, group the `To` matches together using single linkage

    Arguments:
        model: the grouper to use. Either a matcher instance, one of the
               shortcut names ("TF-IDF"/"TFIDF", "EditDistance"/"Edit Distance",
               "Embeddings"/"Embedding"), or None to use TF-IDF.
        link_min_similarity: the minimum similarity between strings before they
                             are grouped in a single linkage fashion
        group_all_strings: if you want to compare a list of strings with itself
                           and then cluster those strings, set this to True.
                           Otherwise, only the strings that were mapped To are
                           clustered.

    Updates:
        self.clusters: reset to an empty dictionary before grouping
        self.cluster_mappings: reset to an empty dictionary before grouping
        self.matches: per the original documentation, gains a column `Group`
                      that is the grouped version of the `To` column; the work
                      is delegated to `self._create_groups` for each model

    Raises:
        ValueError: if `model` is a string that is not one of the shortcut names
    """
    check_matches(self)
    self.clusters = {}
    self.cluster_mappings = {}

    # Standard models - quick access.
    # The shortcut is resolved from the `model` argument itself, not from
    # `self.method`, so the grouper can differ from the matcher that was used.
    if isinstance(model, str):
        if model in ["TF-IDF", "TFIDF"]:
            model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity)
        elif model in ["EditDistance", "Edit Distance"]:
            model = RapidFuzz()
        elif model in ["Embeddings", "Embedding"]:
            model = Embeddings(min_similarity=link_min_similarity)
        else:
            raise ValueError(
                "Please instantiate the model with one of the following methods: \n"
                "* 'TF-IDF'\n"
                "* 'EditDistance'\n"
                "* 'Embeddings'\n"
                "* Or None if you want to automatically use TF-IDF")

    # Use TF-IDF if no model is specified
    elif not model:
        model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity)

    # Group per model
    for name in self.matches:
        self._create_groups(name, model, link_min_similarity, group_all_strings)
def match(self,
          from_list: List[str],
          to_list: List[str]):
    """ Match the from_list of strings to the to_list of strings with whatever models
    you have initialized

    Arguments:
        from_list: The list from which you want mappings
        to_list: The list where you want to map to

    Updates:
        self.matches: A dictionary with the matches from all models, keyed by
                      model id ("TF-IDF", "EditDistance" or "Embeddings" for the
                      string shortcuts, otherwise each matcher's `model_id`)

    Returns:
        self, so that calls can be chained

    Raises:
        ValueError: if `self.method` is a string that is not one of the
                    shortcut names

    Usage:

    After having initialized your models, you can pass through lists of strings:

    ```python
    import polyfuzz as pf
    model = pf.PolyFuzz("TF-IDF")
    model.match(from_list = ["string_one", "string_two"],
                to_list = ["string_three", "string_four"])
    ```

    The results are stored on `model.matches`, keyed by model id.
    """
    # Standard models - quick access
    if isinstance(self.method, str):
        if self.method in ["TF-IDF", "TFIDF"]:
            self.matches = {"TF-IDF": TFIDF(min_similarity=0).match(from_list, to_list)}
        elif self.method in ["EditDistance", "Edit Distance"]:
            self.matches = {"EditDistance": RapidFuzz().match(from_list, to_list)}
        elif self.method in ["Embeddings", "Embedding"]:
            self.matches = {"Embeddings": Embeddings(min_similarity=0).match(from_list, to_list)}
        else:
            raise ValueError("Please instantiate the model with one of the following methods: \n"
                             "* 'TF-IDF'\n"
                             "* 'EditDistance'\n"
                             "* 'Embeddings'\n")
        logger.info(f"Ran model with model id = {self.method}")

    # Custom models
    elif isinstance(self.method, BaseMatcher):
        self.matches = {self.method.model_id: self.method.match(from_list, to_list)}
        # Same module logger as the shortcut branch, not the root logger
        logger.info(f"Ran model with model id = {self.method.model_id}")

    # Multiple custom models
    elif isinstance(self.method, Iterable):
        self._update_model_ids()
        self.matches = {}
        for model in self.method:
            self.matches[model.model_id] = model.match(from_list, to_list)
            logger.info(f"Ran model with model id = {model.model_id}")

    return self
import pytest
from polyfuzz.linkage import single_linkage
from polyfuzz.models import TFIDF
from tests.utils import get_test_strings

# Shared fixture data: the matches are computed once and reused by every
# parametrized case below.
from_list, to_list = get_test_strings()
model = TFIDF(cosine_method="sparse")
matches = model.match(from_list, to_list)


@pytest.mark.parametrize(
    "min_similarity",
    [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
)
def test_linkage(min_similarity):
    """Check the three dictionaries returned by single_linkage at each
    similarity threshold."""
    clusters, cluster_mapping, cluster_name_map = single_linkage(
        matches, min_similarity)

    for part in (clusters, cluster_mapping, cluster_name_map):
        assert isinstance(part, dict)

    # A threshold of exactly 1 is expected to link nothing
    if min_similarity == 1.0:
        assert clusters == {}
        assert cluster_mapping == {}
        assert cluster_name_map == {}
        return

    highest_cluster_id = max(cluster_mapping.values())

    # High thresholds are expected to yield a single cluster of two names
    if min_similarity >= 0.8:
        assert highest_cluster_id == 1
        assert len(cluster_name_map) == 2
        return

    assert highest_cluster_id > 1
def match(self,
          from_list: List[str],
          to_list: List[str] = None,
          top_n: int = 1):
    """ Match the from_list of strings to the to_list of strings with whatever models
    you have initialized

    Arguments:
        from_list: The list from which you want mappings.
                   If you want to map items within a list, and not map the
                   items to themselves, you can supply only the `from_list` and
                   ignore the `to_list`.
        to_list: The list where you want to map to
        top_n: The number of matches you want returned. This is currently only implemented
               for `polyfuzz.models.TFIDF` and `polyfuzz.models.Embeddings` as they
               can computationally handle more comparisons. It is only applied
               when the model is created here from a string shortcut; matcher
               instances supplied by the caller are used as they are.

    Updates:
        self.method: when it was a string shortcut, it is replaced by the
                     matcher instance created for it
        self.matches: A dictionary with the matches from all models, keyed by
                      model id ("TF-IDF", "EditDistance" or "Embeddings" for the
                      string shortcuts, otherwise each matcher's `model_id`)

    Returns:
        self, so that calls can be chained

    Raises:
        ValueError: if `self.method` is a string that is not one of the
                    shortcut names

    Usage:

    After having initialized your models, you can pass through lists of strings:

    ```python
    import polyfuzz as pf
    model = pf.PolyFuzz("TF-IDF")
    model.match(from_list = ["string_one", "string_two"],
                to_list = ["string_three", "string_four"])
    ```

    The results are stored on `model.matches`, keyed by model id.
    """
    # Standard models - quick access
    if isinstance(self.method, str):
        if self.method in ["TF-IDF", "TFIDF"]:
            model_id = "TF-IDF"
            self.method = TFIDF(min_similarity=0, top_n=top_n)
        elif self.method in ["EditDistance", "Edit Distance"]:
            model_id = "EditDistance"
            self.method = RapidFuzz()
        elif self.method in ["Embeddings", "Embedding"]:
            model_id = "Embeddings"
            self.method = Embeddings(min_similarity=0, top_n=top_n)
        else:
            raise ValueError(
                "Please instantiate the model with one of the following methods: \n"
                "* 'TF-IDF'\n"
                "* 'EditDistance'\n"
                "* 'Embeddings'\n")

        self.matches = {model_id: self.method.match(from_list, to_list)}
        # self.method now holds the matcher object, so log the id string
        # rather than formatting the object itself.
        logger.info(f"Ran model with model id = {model_id}")

    # Custom models
    elif isinstance(self.method, BaseMatcher):
        self.matches = {
            self.method.model_id: self.method.match(from_list, to_list)
        }
        # Same module logger as the shortcut branch, not the root logger
        logger.info(f"Ran model with model id = {self.method.model_id}")

    # Multiple custom models
    elif isinstance(self.method, Iterable):
        self._update_model_ids()
        self.matches = {}
        for model in self.method:
            self.matches[model.model_id] = model.match(from_list, to_list)
            logger.info(f"Ran model with model id = {model.model_id}")

    return self