def get_ids(self) -> Union[str, List[str], None]:
    """ Return the identifier(s) of the configured method

    Returns:
        `self.method` itself when it is a string, a list holding the
        `model_id` of every entry when `self.method` is another kind
        of iterable, and None in all other cases
    """
    check_matches(self)

    method = self.method

    # Strings are iterable too, so they have to be handled first
    if isinstance(method, str):
        return method
    if not isinstance(method, Iterable):
        return None
    return [model.model_id for model in method]
def get_cluster_mappings(self, name: str = None) -> Mapping[str, int]:
    """ Return the cluster mappings stored in `self.cluster_mappings`

    Arguments:
        name: key of the entry to return; only consulted when more
              than one set of matches is stored

    Returns:
        The single stored mapping when there is exactly one set of matches,
        the mapping stored under `name` when there are several and `name`
        is given, and otherwise the complete `self.cluster_mappings`
    """
    check_matches(self)
    check_grouped(self)

    n_matches = len(self.matches)

    # A single entry is returned directly; `name` is ignored in that case
    if n_matches == 1:
        return list(self.cluster_mappings.values())[0]

    if n_matches > 1 and name:
        return self.cluster_mappings[name]

    return self.cluster_mappings
def get_matches(self, model_id: str = None) -> Union[pd.DataFrame,
                                                     Mapping[str, pd.DataFrame]]:
    """ Return the matches of one or more models

    Arguments:
        model_id: key of the matches to return; only consulted when
                  more than one set of matches is stored

    Returns:
        The single stored entry when there is exactly one set of matches,
        the entry stored under `model_id` when there are several and
        `model_id` is given, and otherwise the complete `self.matches`
    """
    check_matches(self)

    n_matches = len(self.matches)

    # A single entry is returned directly; `model_id` is ignored in that case
    if n_matches == 1:
        return list(self.matches.values())[0]

    if n_matches > 1 and model_id:
        return self.matches[model_id]

    return self.matches
def get_clusters(self, model_id: str = None) -> Mapping[str, List[str]]:
    """ Return the groupings/clusters of one or more models

    Arguments:
        model_id: the model id of the model if you have specified
                  multiple models

    Returns:
        The single stored entry when there is exactly one set of matches,
        the entry stored under `model_id` when there are several and
        `model_id` is given, and otherwise the complete `self.clusters`
    """
    check_matches(self)
    check_grouped(self)

    n_matches = len(self.matches)

    # A single entry is returned directly; `model_id` is ignored in that case
    if n_matches == 1:
        return list(self.clusters.values())[0]

    if n_matches > 1 and model_id:
        return self.clusters[model_id]

    return self.clusters
def group(self,
          model: Union[str, BaseMatcher] = None,
          link_min_similarity: float = 0.75):
    """ From the matches, group the `To` matches together using single linkage

    Arguments:
        model: the grouper to use. Either a model instance exposing
               `match`, one of the shortcut strings 'TF-IDF'/'TFIDF',
               'EditDistance'/'Edit Distance', 'Embeddings'/'Embedding',
               or None to automatically use TF-IDF
        link_min_similarity: the minimum similarity between strings before
                             they are grouped in a single linkage fashion

    Raises:
        ValueError: if `model` is a string that is not one of the
                    supported shortcuts

    Updates:
        self.matches: Adds a column `Group` that is the grouped version
                      of the `To` column
        self.clusters: reset, then filled with the clusters per set of matches
        self.cluster_mappings: reset, then filled with the cluster id
                               mapping per set of matches
    """
    check_matches(self)
    self.clusters = {}
    self.cluster_mappings = {}

    # Standard models - quick access.
    # The shortcut is resolved from the `model` argument itself; the method
    # this instance was created with must not influence the chosen grouper.
    if isinstance(model, str):
        if model in ["TF-IDF", "TFIDF"]:
            model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity)
        elif model in ["EditDistance", "Edit Distance"]:
            model = RapidFuzz()
        elif model in ["Embeddings", "Embedding"]:
            model = Embeddings(min_similarity=link_min_similarity)
        else:
            raise ValueError(
                "Please instantiate the model with one of the following methods: \n"
                "* 'TF-IDF'\n"
                "* 'EditDistance'\n"
                "* 'Embeddings'\n"
                "* Or None if you want to automatically use TF-IDF")

    # Use TF-IDF if no model is specified
    elif not model:
        model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity)

    for name, match in self.matches.items():
        # Compare the unique, non-missing `To` strings with themselves
        strings = list(match.To.dropna().unique())
        self_matches = model.match(strings, strings)

        clusters, cluster_id_map, cluster_name_map = single_linkage(
            self_matches, link_min_similarity)

        self._map_groups(name, cluster_name_map)
        self.clusters[name] = clusters
        self.cluster_mappings[name] = cluster_id_map
def group(self,
          model: Union[str, BaseMatcher] = None,
          link_min_similarity: float = 0.75,
          group_all_strings: bool = False):
    """ From the matches, group the `To` matches together using single linkage

    Arguments:
        model: the grouper to use. Either a model instance, one of the
               shortcut strings 'TF-IDF'/'TFIDF',
               'EditDistance'/'Edit Distance', 'Embeddings'/'Embedding',
               or None to automatically use TF-IDF
        link_min_similarity: the minimum similarity between strings before
                             they are grouped in a single linkage fashion
        group_all_strings: if you want to compare a list of strings with
                           itself and then cluster those strings, set this
                           to True. Otherwise, only the strings that were
                           mapped To are clustered.

    Raises:
        ValueError: if `model` is a string that is not one of the
                    supported shortcuts

    Updates:
        self.matches: Adds a column `Group` that is the grouped version
                      of the `To` column
    """
    check_matches(self)
    self.clusters = {}
    self.cluster_mappings = {}

    # Standard models - quick access.
    # The shortcut is resolved from the `model` argument itself; the method
    # this instance was created with must not influence the chosen grouper.
    if isinstance(model, str):
        if model in ["TF-IDF", "TFIDF"]:
            model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity)
        elif model in ["EditDistance", "Edit Distance"]:
            model = RapidFuzz()
        elif model in ["Embeddings", "Embedding"]:
            model = Embeddings(min_similarity=link_min_similarity)
        else:
            raise ValueError(
                "Please instantiate the model with one of the following methods: \n"
                "* 'TF-IDF'\n"
                "* 'EditDistance'\n"
                "* 'Embeddings'\n"
                "* Or None if you want to automatically use TF-IDF")

    # Use TF-IDF if no model is specified
    elif not model:
        model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity)

    # Group per model
    for name in self.matches:
        self._create_groups(name, model, link_min_similarity, group_all_strings)
def visualize_precision_recall(self,
                               kde: bool = False,
                               save_path: str = None):
    """ Calculate and visualize precision-recall curves

    A minimum similarity score might be used to identify
    when a match could be considered to be correct. For example,
    we can assume that if a similarity score passes 0.95 we are
    quite confident that the matches are correct. This minimum
    similarity score can be defined as **precision** since it shows
    you how precise we believe the matches are at a minimum.

    **Recall** can then be defined as the percentage of matches
    found at a certain minimum similarity score. A high recall means
    that for a certain minimum precision score, we find many matches.

    Arguments:
        kde: whether to also visualize the kde plot
        save_path: the path to save the resulting image to

    Updates:
        self.min_precisions, self.recalls, self.average_precisions:
            reset and then filled with one entry per set of matches

    Usage:

    ```python
    import polyfuzz as pf
    model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
    model.match(from_list = ["string_one", "string_two"],
                to_list = ["string_three", "string_four"])
    model.visualize_precision_recall(save_path="results.png")
    ```
    """
    check_matches(self)

    self.min_precisions = {}
    self.recalls = {}
    self.average_precisions = {}

    # One curve per set of matches, stored under the same key
    for name, match in self.matches.items():
        (self.min_precisions[name],
         self.recalls[name],
         self.average_precisions[name]) = precision_recall_curve(match)

    # Inside the method body this name resolves to the module-level
    # plotting function, not to this method
    visualize_precision_recall(self.matches,
                               self.min_precisions,
                               self.recalls,
                               kde,
                               save_path)