예제 #1
0
def test_multiple_models():
    tfidf_matcher = TFIDF(n_gram_range=(3, 3),
                          min_similarity=0,
                          model_id="TF-IDF")
    tfidf_large_matcher = TFIDF(n_gram_range=(3, 6), min_similarity=0)
    base_edit_matcher = EditDistance(n_jobs=1)
    ratio_matcher = EditDistance(n_jobs=1, scorer=fuzz.ratio)
    rapidfuzz_matcher = RapidFuzz(n_jobs=1)
    matchers = [
        tfidf_matcher, tfidf_large_matcher, base_edit_matcher, ratio_matcher,
        rapidfuzz_matcher
    ]

    model = PolyFuzz(matchers).match(from_list, to_list)

    # Test if correct matches are found
    for model_id in model.get_ids():
        assert model_id in model.get_matches().keys()
        assert isinstance(model.get_matches(model_id), pd.DataFrame)
    assert len(model.get_matches()) == len(matchers)

    # Test if error is raised when accessing clusters before creating them
    with pytest.raises(ValueError):
        model.get_clusters()

    with pytest.raises(ValueError):
        model.get_cluster_mappings()

    # Test if groupings are found
    model.group()
    for model_id in model.get_ids():
        assert model_id in model.get_cluster_mappings().keys()
    assert len(model.get_cluster_mappings()) == len(matchers)
예제 #2
0
def test_ngrams(n_gram_low, n_gram_high):
    model = TFIDF(n_gram_range=(n_gram_low, n_gram_high))
    matches = model.match(from_list, to_list)

    assert isinstance(matches, pd.DataFrame)
    assert len(matches) == 6
    assert list(matches.columns) == ['From', 'To', 'Similarity']
예제 #3
0
def test_distance(method):
    model = TFIDF(cosine_method=method)
    matches = model.match(from_list, to_list)

    assert isinstance(matches, pd.DataFrame)
    assert matches.Similarity.mean() > 0.0
    assert len(matches) == 6
    assert list(matches.columns) == ['From', 'To', 'Similarity']
예제 #4
0
    def group(self,
              model: Union[str, BaseMatcher] = None,
              link_min_similarity: float = 0.75):
        """ From the matches, group the `To` matches together using single linkage

         Arguments:
             model: you can choose one of the models in `polyfuzz.models` to be used as a grouper
             link_min_similarity: the minimum similarity between strings before they are grouped
                                  in a single linkage fashion

         Updates:
            self.matches: Adds a column `Group` that is the grouped version of the `To` column
         """
        check_matches(self)
        self.clusters = {}
        self.cluster_mappings = {}

        # Standard models - quick access
        if isinstance(model, str):
            if model in ["TF-IDF", "TFIDF"]:
                model = TFIDF(n_gram_range=(3, 3),
                              min_similarity=link_min_similarity)
            elif self.method in ["EditDistance", "Edit Distance"]:
                model = RapidFuzz()
            elif self.method in ["Embeddings", "Embedding"]:
                model = Embeddings(min_similarity=link_min_similarity)
            else:
                raise ValueError(
                    "Please instantiate the model with one of the following methods: \n"
                    "* 'TF-IDF'\n"
                    "* 'EditDistance'\n"
                    "* 'Embeddings'\n"
                    "* Or None if you want to automatically use TF-IDF")

        # Use TF-IDF if no model is specified
        elif not model:
            model = TFIDF(n_gram_range=(3, 3),
                          min_similarity=link_min_similarity)

        for name, match in self.matches.items():
            strings = list(self.matches[name].To.dropna().unique())
            matches = model.match(strings, strings)
            clusters, cluster_id_map, cluster_name_map = single_linkage(
                matches, link_min_similarity)
            self._map_groups(name, cluster_name_map)
            self.clusters[name] = clusters
            self.cluster_mappings[name] = cluster_id_map
예제 #5
0
    def group(self,
              model: Union[str, BaseMatcher] = None,
              link_min_similarity: float = 0.75,
              group_all_strings: bool = False):
        """ From the matches, group the `To` matches together using single linkage

         Arguments:
             model: you can choose one of the models in `polyfuzz.models` to be used as a grouper
             link_min_similarity: the minimum similarity between strings before they are grouped
                                  in a single linkage fashion
             group_all_strings: if you want to compare a list of strings with itself and then cluster
                                those strings, set this to True. Otherwise, only the strings that
                                were mapped To are clustered.

         Updates:
            self.matches: Adds a column `Group` that is the grouped version of the `To` column
         """
        check_matches(self)
        self.clusters = {}
        self.cluster_mappings = {}

        # Standard models - quick access
        if isinstance(model, str):
            if model in ["TF-IDF", "TFIDF"]:
                model = TFIDF(n_gram_range=(3, 3),
                              min_similarity=link_min_similarity)
            elif self.method in ["EditDistance", "Edit Distance"]:
                model = RapidFuzz()
            elif self.method in ["Embeddings", "Embedding"]:
                model = Embeddings(min_similarity=link_min_similarity)
            else:
                raise ValueError(
                    "Please instantiate the model with one of the following methods: \n"
                    "* 'TF-IDF'\n"
                    "* 'EditDistance'\n"
                    "* 'Embeddings'\n"
                    "* Or None if you want to automatically use TF-IDF")

        # Use TF-IDF if no model is specified
        elif not model:
            model = TFIDF(n_gram_range=(3, 3),
                          min_similarity=link_min_similarity)

        # Group per model
        for name, match in self.matches.items():
            self._create_groups(name, model, link_min_similarity,
                                group_all_strings)
예제 #6
0
    def match(self,
              from_list: List[str],
              to_list: List[str]):
        """ Match the from_list of strings to the to_list of strings with whatever models
        you have initialized

        Arguments:
            from_list: The list from which you want mappings
            to_list: The list where you want to map to

        Updates:
            self.matches: A dictionary with the matches from all models, can
                          be accessed with `model.get_all_matches` or
                          `model.get_match("TF-IDF")`

        Usage:

        After having initialized your models, you can pass through lists of strings:

        ```python
        import polyfuzz as pf
        model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
        model.match(from_list = ["string_one", "string_two"],
                    to_list = ["string_three", "string_four"])
        ```

        You can access the results matches with `model.get_all_matches` or a specific
        model with `model.get_match("TF-IDF")` based on their model_id.
        """
        # Standard models - quick access
        if isinstance(self.method, str):
            if self.method in ["TF-IDF", "TFIDF"]:
                self.matches = {"TF-IDF": TFIDF(min_similarity=0).match(from_list, to_list)}
            elif self.method in ["EditDistance", "Edit Distance"]:
                self.matches = {"EditDistance": RapidFuzz().match(from_list, to_list)}
            elif self.method in ["Embeddings", "Embedding"]:
                self.matches = {"Embeddings": Embeddings(min_similarity=0).match(from_list, to_list)}
            else:
                raise ValueError("Please instantiate the model with one of the following methods: \n"
                                 "* 'TF-IDF'\n"
                                 "* 'EditDistance'\n"
                                 "* 'Embeddings'\n")
            logger.info(f"Ran model with model id = {self.method}")

        # Custom models
        elif isinstance(self.method, BaseMatcher):
            self.matches = {self.method.model_id: self.method.match(from_list, to_list)}
            logging.info(f"Ran model with model id = {self.method.model_id}")

        # Multiple custom models
        elif isinstance(self.method, Iterable):
            self._update_model_ids()
            self.matches = {}
            for model in self.method:
                self.matches[model.model_id] = model.match(from_list, to_list)
                logging.info(f"Ran model with model id = {model.model_id}")

        return self
예제 #7
0
import pytest
from polyfuzz.linkage import single_linkage
from polyfuzz.models import TFIDF
from tests.utils import get_test_strings

from_list, to_list = get_test_strings()
model = TFIDF(cosine_method="sparse")
matches = model.match(from_list, to_list)


@pytest.mark.parametrize("min_similarity",
                         [0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1.])
def test_linkage(min_similarity):
    clusters, cluster_mapping, cluster_name_map = single_linkage(
        matches, min_similarity)

    assert isinstance(clusters, dict)
    assert isinstance(cluster_mapping, dict)
    assert isinstance(cluster_name_map, dict)

    if min_similarity == 1.:
        assert clusters == {}
        assert cluster_mapping == {}
        assert cluster_name_map == {}

    elif min_similarity >= 0.8:
        assert max(cluster_mapping.values()) == 1
        assert len(cluster_name_map) == 2

    else:
        assert max(cluster_mapping.values()) > 1
예제 #8
0
    def match(self,
              from_list: List[str],
              to_list: List[str] = None,
              top_n: int = 1):
        """ Match the from_list of strings to the to_list of strings with whatever models
        you have initialized

        Arguments:
            from_list: The list from which you want mappings.
                       If you want to map items within a list, and not map the 
                       items to themselves, you can supply only the `from_list` and 
                       ignore the `to_list`. 
            to_list: The list where you want to map to
            top_n: The number of matches you want returned. This is currently only implemented
                   for `polyfuzz.models.TFIDF` and `polyfuzz.models.Embeddings` as they
                   can computationally handle more comparisons.

        Updates:
            self.matches: A dictionary with the matches from all models, can
                          be accessed with `model.get_all_matches` or
                          `model.get_match("TF-IDF")`

        Usage:

        After having initialized your models, you can pass through lists of strings:

        ```python
        import polyfuzz as pf
        model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
        model.match(from_list = ["string_one", "string_two"],
                    to_list = ["string_three", "string_four"])
        ```

        You can access the results matches with `model.get_all_matches` or a specific
        model with `model.get_match("TF-IDF")` based on their model_id.
        """
        # Standard models - quick access
        if isinstance(self.method, str):
            if self.method in ["TF-IDF", "TFIDF"]:
                self.method = TFIDF(min_similarity=0, top_n=top_n)
                self.matches = {
                    "TF-IDF": self.method.match(from_list, to_list)
                }
            elif self.method in ["EditDistance", "Edit Distance"]:
                self.method = RapidFuzz()
                self.matches = {
                    "EditDistance": self.method.match(from_list, to_list)
                }
            elif self.method in ["Embeddings", "Embedding"]:
                self.method = Embeddings(min_similarity=0, top_n=top_n)
                self.matches = {
                    "Embeddings": self.method.match(from_list, to_list)
                }
            else:
                raise ValueError(
                    "Please instantiate the model with one of the following methods: \n"
                    "* 'TF-IDF'\n"
                    "* 'EditDistance'\n"
                    "* 'Embeddings'\n")
            logger.info(f"Ran model with model id = {self.method}")

        # Custom models
        elif isinstance(self.method, BaseMatcher):
            self.matches = {
                self.method.model_id: self.method.match(from_list, to_list)
            }
            logging.info(f"Ran model with model id = {self.method.model_id}")

        # Multiple custom models
        elif isinstance(self.method, Iterable):
            self._update_model_ids()
            self.matches = {}
            for model in self.method:
                self.matches[model.model_id] = model.match(from_list, to_list)
                logging.info(f"Ran model with model id = {model.model_id}")

        return self