예제 #1
0
    def street_dupe_status(cls, street1, street2, languages=None, fuzzy=False):
        """Compare two street names and return a ``Dupe(status, sim)``.

        :param street1: first street name.
        :param street2: second street name.
        :param languages: languages forwarded to the matchers. When ``None``
            they are looked up per street with ``place_languages`` and
            combined, falling back to ``DEFAULT_LANGUAGES``.
        :param fuzzy: when true and the exact matcher did not report an
            exact/likely duplicate, retry with ``is_street_duplicate_fuzzy``
            using uniformly weighted tokens.
        :return: ``Dupe`` carrying the resulting status and similarity.
        """
        if languages is None:
            langs_a = place_languages(['road'], [street1])
            langs_b = place_languages(['road'], [street2])
            if langs_a is None or langs_b is None:
                # At most one side produced languages: take whichever exists.
                languages = langs_a or langs_b or DEFAULT_LANGUAGES
            else:
                languages = combined_languages(langs_a, langs_b)

        status = is_street_duplicate(street1, street2, languages=languages)

        # Fixed similarity assigned to each status of the exact matcher;
        # anything not listed scores 0.0.
        sim = 0.0
        for known_status, known_sim in (
            (duplicate_status.EXACT_DUPLICATE, 1.0),
            (duplicate_status.NEEDS_REVIEW, 0.5),
            (duplicate_status.LIKELY_DUPLICATE, 0.9),
        ):
            if status == known_status:
                sim = known_sim
                break

        if status in (duplicate_status.EXACT_DUPLICATE, duplicate_status.LIKELY_DUPLICATE):
            return Dupe(status=status, sim=sim)

        if not fuzzy:
            return Dupe(status=duplicate_status.NON_DUPLICATE, sim=sim)

        # Fuzzy fallback: every content token gets the same weight.
        tokens_a = Name.content_tokens(street1, languages=languages)
        weights_a = WordIndex.normalized_vector([1] * len(tokens_a))
        tokens_b = Name.content_tokens(street2, languages=languages)
        weights_b = WordIndex.normalized_vector([1] * len(tokens_b))

        if not (tokens_a and tokens_b and weights_a and weights_b):
            return Dupe(status=duplicate_status.NON_DUPLICATE, sim=sim)

        fuzzy_status, fuzzy_sim = is_street_duplicate_fuzzy(tokens_a, weights_a, tokens_b, weights_b, languages=languages)
        return Dupe(status=fuzzy_status, sim=fuzzy_sim)
예제 #2
0
    def dupe_class_and_sim(cls, a1, a2, word_index=None, likely_dupe_threshold=DedupeResponse.default_name_dupe_threshold,
                           needs_review_threshold=DedupeResponse.default_name_review_threshold, with_address=True, with_unit=False, with_phone_number=True, fuzzy_street_name=False):
        """Classify two named records as duplicates and score the name match.

        :param a1: first record; read with ``.get(AddressComponents.NAME)``.
        :param a2: second record, same shape as ``a1``.
        :param word_index: optional index; when truthy and the names are not
            an exact duplicate, ``cls.name_dupe_similarity`` is consulted and
            its result is used if its class is at least as strong.
        :param likely_dupe_threshold: similarity reported for a
            ``LIKELY_DUPLICATE`` name match from the exact matcher.
        :param needs_review_threshold: similarity reported for a
            ``NEEDS_REVIEW`` name match from the exact matcher.
        :param with_address: require ``cls.is_address_dupe`` to pass.
        :param with_unit: require ``cls.is_sub_building_dupe`` to pass.
        :param with_phone_number: let ``PhoneNumberDeduper.revised_dupe_class``
            revise the final class.
        :param fuzzy_street_name: forwarded to ``cls.is_address_dupe``.
        :return: ``Dupe(class, sim)``, or ``NULL_DUPE`` when a name is
            missing, a required address/unit check fails, or the names are
            not at least ``NEEDS_REVIEW``.
        """
        name_a = a1.get(AddressComponents.NAME)
        name_b = a2.get(AddressComponents.NAME)
        if not (name_a and name_b):
            return NULL_DUPE

        langs_a = cls.address_languages(a1)
        langs_b = cls.address_languages(a2)
        languages = cls.combined_languages(langs_a, langs_b)

        if with_address and not cls.is_address_dupe(a1, a2, languages=languages, fuzzy_street_name=fuzzy_street_name):
            return NULL_DUPE

        if with_unit and not cls.is_sub_building_dupe(a1, a2, languages=languages):
            return NULL_DUPE

        dupe_class = cls.name_dupe_status(name_a, name_b, languages=languages)

        # Map the exact matcher's class to a similarity; any other class
        # means the names do not match at all.
        for known_class, known_sim in (
            (duplicate_status.EXACT_DUPLICATE, 1.0),
            (duplicate_status.LIKELY_DUPLICATE, likely_dupe_threshold),
            (duplicate_status.NEEDS_REVIEW, needs_review_threshold),
        ):
            if dupe_class == known_class:
                sim = known_sim
                break
        else:
            return NULL_DUPE

        if word_index and dupe_class != duplicate_status.EXACT_DUPLICATE:
            fuzzy_class, fuzzy_sim = cls.name_dupe_similarity(name_a, name_b, word_index=word_index, languages=languages)
            # Only adopt the fuzzy result when it does not weaken the class.
            if fuzzy_class >= dupe_class:
                dupe_class, sim = fuzzy_class, fuzzy_sim

        if with_phone_number:
            # The second element of the result is not used here.
            dupe_class, _phone_dupe = PhoneNumberDeduper.revised_dupe_class(dupe_class, a1, a2)

        return Dupe(dupe_class, sim)
예제 #3
0
    def address_dupe_status(cls, a1, a2, languages=None, fuzzy_street_name=False):
        """Compare the street and house number of two address records.

        :param a1: first record; read with ``.get(AddressComponents.*)``.
        :param a2: second record, same shape as ``a1``.
        :param languages: forwarded to the street and house-number matchers.
        :param fuzzy_street_name: forwarded to
            ``StreetDeduper.street_dupe_status`` as ``fuzzy``.
        :return: ``NULL_DUPE`` when a component is present on only one side
            or neither street nor house number is available;
            ``Dupe(NON_DUPLICATE, ...)`` when a compared component differs;
            otherwise a ``Dupe`` holding the weaker of the street and
            house-number results (or the only one available).
        """
        a1_street = a1.get(AddressComponents.STREET)
        a2_street = a2.get(AddressComponents.STREET)

        if a1_street:
            a1_street = a1_street.strip()
        if a2_street:
            a2_street = a2_street.strip()

        a1_house_number = a1.get(AddressComponents.HOUSE_NUMBER)
        a2_house_number = a2.get(AddressComponents.HOUSE_NUMBER)

        if a1_house_number:
            a1_house_number = safe_decode(a1_house_number).strip()
        if a2_house_number:
            a2_house_number = safe_decode(a2_house_number).strip()

        a1_base_house_number = a1.get(AddressComponents.HOUSE_NUMBER_BASE)
        a2_base_house_number = a2.get(AddressComponents.HOUSE_NUMBER_BASE)

        if a1_base_house_number:
            a1_base_house_number = safe_decode(a1_base_house_number).strip()
        if a2_base_house_number:
            a2_base_house_number = safe_decode(a2_base_house_number).strip()

        # A component present on only one side cannot be compared.
        if (a1_street and not a2_street) or (a2_street and not a1_street):
            return NULL_DUPE

        if (a1_house_number and not a2_house_number) or (a2_house_number and not a1_house_number):
            return NULL_DUPE

        have_street = a1_street and a2_street
        same_street = False
        street_dupe = None

        if have_street:
            street_dupe = StreetDeduper.street_dupe_status(a1_street, a2_street, languages=languages, fuzzy=fuzzy_street_name)
            same_street = street_dupe.status in (duplicate_status.EXACT_DUPLICATE, duplicate_status.LIKELY_DUPLICATE)

            if not same_street:
                return Dupe(status=duplicate_status.NON_DUPLICATE, sim=0.0)

        have_house_number = a1_house_number and a2_house_number
        have_base_house_number = a1_base_house_number or a2_base_house_number
        same_house_number = False
        house_number_status = duplicate_status.NON_DUPLICATE
        house_number_sim = 0.0

        if have_house_number:
            house_number_status = is_house_number_duplicate(a1_house_number, a2_house_number, languages=languages)
            same_house_number = house_number_status == duplicate_status.EXACT_DUPLICATE
            if same_house_number:
                house_number_sim = 1.0

            if have_base_house_number and not same_house_number:
                # Retry on the base house numbers, substituting the full
                # house number on a side that has no base value. A match
                # here is only rated "likely".
                a1h = a1_base_house_number or a1_house_number
                a2h = a2_base_house_number or a2_house_number

                base_house_number_status = is_house_number_duplicate(a1h, a2h, languages=languages)
                same_house_number = base_house_number_status == duplicate_status.EXACT_DUPLICATE
                if same_house_number:
                    house_number_status = duplicate_status.LIKELY_DUPLICATE
                    house_number_sim = 0.9

            if not same_house_number:
                return Dupe(status=duplicate_status.NON_DUPLICATE, sim=house_number_sim)

        if not have_house_number and not have_street:
            return NULL_DUPE

        if have_street and same_street and have_house_number and same_house_number:
            # Report the weaker of the two component results.
            min_status, min_sim = min((street_dupe.status, street_dupe.sim), (house_number_status, house_number_sim))
            return Dupe(status=min_status, sim=min_sim)
        elif have_house_number and same_house_number and not have_street:
            return Dupe(status=house_number_status, sim=house_number_sim)
        elif have_street and same_street and not have_house_number:
            # Street-only match: report the street comparison's own result.
            # Previously this returned placeholder locals that were never
            # updated, so a matching street always came back NON_DUPLICATE.
            return Dupe(status=street_dupe.status, sim=street_dupe.sim)

        return NULL_DUPE