示例#1
0
    def test_roman_numbers_in_text(self):
        """
        Roman/arabic numeral forms in the input should be handled by
        string normalization.
        """
        cases = [
            ("dans le XVe arrondissement", "Dans le 15e arrondissement"),
            ("paris XVe, 75005", "Paris 15e, 75005"),
            ("paris xve, 75005", "Paris XVe, 75005"),
        ]
        for expected, raw in cases:
            self.assertEqual(expected, tools.normalize_string(raw))
示例#2
0
def _preprocess_laposte():
    """
    Build SQLAlchemy objects from the postal codes data.

    :return: A list of ``PostalCode`` objects to be inserted in database.
        Returns an empty list if the opendata file is missing or invalid.
    """
    data_file = "laposte.json"
    LOGGER.info("Building from %s data.", data_file)

    # Load opendata file
    try:
        with io.open(os.path.join(MODULE_DIR, data_file), "r", encoding="utf-8") as fh:
            raw_laposte_data = json.load(fh)
    except (IOError, ValueError):
        LOGGER.error("Invalid raw LaPoste opendata file.")
        return []

    # Build postal codes to other infos file
    postal_codes_data = []
    # Keep track of seen (postal_code, name) pairs to avoid inserting useless
    # duplicates (already in the OpenData file). A set gives O(1) membership
    # tests, instead of the O(n) scan a list would need on every item.
    seen_postal_codes = set()
    for item in raw_laposte_data:
        fields = item["fields"]
        try:
            area = french_postal_codes_to_quarter(fields["code_postal"])
            if area is None:
                LOGGER.debug(
                    "No matching area found for postal code %s, skipping it.",
                    fields["code_postal"],
                )
                continue

            name = normalize_string(
                titlecase.titlecase(fields["nom_de_la_commune"]), lowercase=False
            )

            if (fields["code_postal"], name) in seen_postal_codes:
                continue

            seen_postal_codes.add((fields["code_postal"], name))
            postal_codes_data.append(
                PostalCode(
                    area=area,
                    postal_code=fields["code_postal"],
                    insee_code=fields["code_commune_insee"],
                    name=name,
                    lat=fields["coordonnees_gps"][0],
                    lng=fields["coordonnees_gps"][1],
                )
            )
        except KeyError:
            # ``code_postal`` itself may be the missing key, so use ``.get``
            # here: indexing would raise a second KeyError inside the handler.
            LOGGER.debug(
                "Missing data for postal code %s, skipping it.",
                fields.get("code_postal"),
            )

    return postal_codes_data
示例#3
0
 def test_whitespace_trim(self):
     """
     Leading and trailing whitespace must be stripped by normalization.
     """
     normalized = tools.normalize_string("  Rennes 35000 ")
     self.assertEqual("rennes 35000", normalized)
示例#4
0
 def test_multiple_whitespaces(self):
     """
     Runs of consecutive whitespace must be collapsed to a single space.
     """
     normalized = tools.normalize_string("avec   ascenseur")
     self.assertEqual("avec ascenseur", normalized)
示例#5
0
 def test_accents(self):
     """
     Accented characters must be replaced by their unaccented equivalents.
     """
     normalized = tools.normalize_string(u"éèêàüï")
     self.assertEqual("eeeaui", normalized)
示例#6
0
def fuzzy_match(query, choices, limit=3, threshold=75):
    """
    Custom search for the best element in choices matching the query.

    :param query: The string to match.
    :param choices: The list of strings to match with.
    :param limit: The maximum number of items to return. Set to ``None`` to
        return all values above threshold.
    :param threshold: The score threshold to use.

    :return: Tuples of matching items and associated confidence.

    .. note ::

        This function works by removing any fancy character from the ``query``
        and ``choices`` strings (replacing any non alphabetic and non numeric
        characters by space), converting to lower case and normalizing them
        (collapsing multiple spaces etc). It also converts any roman numerals
        to decimal system. It then compares the string and look for the longest
        string in ``choices`` which is a substring of ``query``. The longest
        one gets a confidence of 100. The shorter ones get a confidence
        proportional to their length.

    .. seealso :: flatisfy.tools.normalize_string

    Example::

        >>> fuzzy_match("Paris 14ème", ["Ris", "ris", "Paris 14"], limit=1)
        [("Paris 14", 100)]

        >>> fuzzy_match( \
                "Saint-Jacques, Denfert-Rochereau (Colonel Rol-Tanguy), " \
                "Mouton-Duvernet", \
                ["saint-jacques", "denfert rochereau", "duvernet", "toto"], \
                limit=4 \
            )
        [('denfert rochereau', 100), ('saint-jacques', 76)]
    """
    # TODO: Is there a better confidence measure?
    normalized_query = tools.normalize_string(query)
    normalized_choices = [tools.normalize_string(choice) for choice in choices]

    # Remove duplicates in the choices list
    unique_normalized_choices = tools.uniqify(normalized_choices)

    # Get the matches (normalized strings), longest first. The list was
    # already deduplicated above, no need to uniqify it a second time.
    matches = sorted(
        [
            (choice, len(choice))
            for choice in unique_normalized_choices
            if choice in normalized_query
        ],
        key=lambda x: x[1],
        reverse=True,
    )
    # Keep only ``limit`` matches.
    if limit:
        matches = matches[:limit]

    # Update confidence: the longest kept match scores 100, shorter ones a
    # value proportional to their length.
    if matches:
        max_confidence = max(match[1] for match in matches)
        matches = [(x[0], int(x[1] / max_confidence * 100)) for x in matches]

    # Convert back matches to original strings
    # Also filter out matches below threshold
    matches = [
        (choices[normalized_choices.index(x[0])], x[1])
        for x in matches
        if x[1] >= threshold
    ]

    return matches
示例#7
0
def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold):
    """
    Compute the duplicate score between two flats. The higher the score, the
    more likely the two flats to be duplicates.

    Implementation note: each hard criterion below is written as an
    ``assert``. The first failing assertion jumps to the ``except`` clause,
    which resets the score to 0 — i.e. the flats are declared distinct as
    soon as a mandatory criterion (area, cost, ...) disagrees.

    .. warning:: Because control flow relies on ``assert``, running Python
        with ``-O`` (which strips assertions) would silently change this
        function's behavior.

    :param flat1: First flat dict.
    :param flat2: Second flat dict.
    :param photo_cache: An instance of ``ImageCache`` to use to cache images.
    :param hash_threshold: The hash threshold between two images.
    :return: The duplicate score as ``int``.
    """
    n_common_items = 0
    try:
        # They should have the same area, up to one unit
        assert abs(flat1["area"] - flat2["area"]) < 1
        n_common_items += 1

        # They should be at the same price, up to one unit
        assert abs(flat1["cost"] - flat2["cost"]) < 1
        n_common_items += 1

        # They should have the same number of bedrooms if this was
        # fetched for both
        # NOTE(review): a falsy value (0, None, "") is treated as "not
        # fetched", so flats with 0 bedrooms are never compared on this.
        if flat1["bedrooms"] and flat2["bedrooms"]:
            assert flat1["bedrooms"] == flat2["bedrooms"]
            n_common_items += 1

        # They should have the same utilities (included or excluded for
        # both of them), if this was fetched for both
        if flat1["utilities"] and flat2["utilities"]:
            assert flat1["utilities"] == flat2["utilities"]
            n_common_items += 1

        # They should have the same number of rooms if it was fetched
        # for both of them
        if flat1["rooms"] and flat2["rooms"]:
            assert flat1["rooms"] == flat2["rooms"]
            n_common_items += 1

        # They should have the same postal code, if available
        if ("flatisfy" in flat1 and "flatisfy" in flat2
                and flat1["flatisfy"].get("postal_code", None)
                and flat2["flatisfy"].get("postal_code", None)):
            assert flat1["flatisfy"]["postal_code"] == flat2["flatisfy"][
                "postal_code"]
            n_common_items += 1

        # TODO: Better text comparison (one included in the other, fuzzymatch)
        # Normalized descriptions must match exactly to count.
        flat1_text = tools.normalize_string(flat1.get("text", ""))
        flat2_text = tools.normalize_string(flat2.get("text", ""))
        if flat1_text and flat2_text and flat1_text == flat2_text:
            n_common_items += 1

        # They should have the same phone number if it was fetched for
        # both
        flat1_phone = homogeneize_phone_number(flat1["phone"])
        flat2_phone = homogeneize_phone_number(flat2["phone"])
        if flat1_phone and flat2_phone:
            # Use an "in" test as there could be multiple phone numbers
            # returned by a Woob module
            if flat1_phone in flat2_phone or flat2_phone in flat1_phone:
                n_common_items += 4  # Counts much more than the rest

        # If the two flats are from the same website and have a
        # different float part, consider they cannot be duplicates. See
        # https://framagit.org/phyks/Flatisfy/issues/100.
        # The backend name is the part after the last "@" in the flat id.
        both_are_from_same_backend = flat1["id"].split(
            "@")[-1] == flat2["id"].split("@")[-1]
        both_have_float_part = (flat1["area"] % 1) > 0 and (flat2["area"] %
                                                            1) > 0
        both_have_equal_float_part = (flat1["area"] % 1) == (flat2["area"] % 1)
        if both_have_float_part and both_are_from_same_backend:
            assert both_have_equal_float_part

        if flat1.get("photos", []) and flat2.get("photos", []):
            n_common_photos = find_number_common_photos(
                flat1["photos"], flat2["photos"], photo_cache, hash_threshold)

            min_number_photos = min(len(flat1["photos"]), len(flat2["photos"]))

            # Either all the photos are the same, or there are at least
            # three common photos.
            if n_common_photos == min_number_photos:
                n_common_items += 15
            else:
                n_common_items += 5 * min(n_common_photos, 3)
    except (AssertionError, TypeError):
        # Skip and consider as not duplicates whenever the conditions
        # are not met
        # TypeError occurs when an area or a cost is None, which should
        # not be considered as duplicates
        n_common_items = 0

    return n_common_items