def street_dupe_status(cls, street1, street2, languages=None, fuzzy=False):
    """Compare two street names and report their duplicate status.

    When ``languages`` is None, languages are looked up for each street
    separately with ``place_languages(['road'], ...)``. If both lookups
    return a value (not None) they are merged with ``combined_languages``;
    otherwise the first truthy lookup result is used, falling back to
    ``DEFAULT_LANGUAGES``.

    The streets are compared with ``is_street_duplicate`` and the resulting
    status is mapped to a similarity: EXACT_DUPLICATE -> 1.0,
    LIKELY_DUPLICATE -> 0.9, NEEDS_REVIEW -> 0.5, anything else -> 0.0.

    Args:
        cls: Not used by this function.
        street1: First street name.
        street2: Second street name.
        languages: Languages forwarded to the comparison helpers, or None
            to derive them from the two streets as described above.
        fuzzy: When true, fall back to ``is_street_duplicate_fuzzy`` if the
            first comparison is neither an exact nor a likely duplicate.

    Returns:
        A ``Dupe`` built with ``status`` and ``sim`` keywords:

        * the first comparison's status/similarity when it is an exact or
          likely duplicate;
        * otherwise, when ``fuzzy`` is set and both streets yield non-empty
          content tokens and normalized weights, whatever
          ``is_street_duplicate_fuzzy`` returns;
        * otherwise NON_DUPLICATE together with the similarity from the
          first comparison.
    """
    if languages is None:
        langs_first = place_languages(['road'], [street1])
        langs_second = place_languages(['road'], [street2])
        if langs_first is None or langs_second is None:
            languages = langs_first or langs_second or DEFAULT_LANGUAGES
        else:
            languages = combined_languages(langs_first, langs_second)

    status = is_street_duplicate(street1, street2, languages=languages)

    # Translate the status into a similarity; unknown statuses score 0.0.
    similarity = 0.0
    status_scores = (
        (duplicate_status.EXACT_DUPLICATE, 1.0),
        (duplicate_status.NEEDS_REVIEW, 0.5),
        (duplicate_status.LIKELY_DUPLICATE, 0.9),
    )
    for known_status, score in status_scores:
        if status == known_status:
            similarity = score
            break

    if status in (duplicate_status.EXACT_DUPLICATE,
                  duplicate_status.LIKELY_DUPLICATE):
        return Dupe(status=status, sim=similarity)

    if fuzzy:
        # Every content token gets the same weight (1) before normalization.
        tokens_first = Name.content_tokens(street1, languages=languages)
        weights_first = WordIndex.normalized_vector([1] * len(tokens_first))
        tokens_second = Name.content_tokens(street2, languages=languages)
        weights_second = WordIndex.normalized_vector([1] * len(tokens_second))

        if tokens_first and tokens_second and weights_first and weights_second:
            status, similarity = is_street_duplicate_fuzzy(
                tokens_first, weights_first,
                tokens_second, weights_second,
                languages=languages)
            return Dupe(status=status, sim=similarity)

    # NOTE(review): a NEEDS_REVIEW result from the first comparison ends up
    # here as NON_DUPLICATE with sim 0.5 -- confirm that is intended.
    return Dupe(status=duplicate_status.NON_DUPLICATE, sim=similarity)
def address_dupe_status(cls, a1, a2, languages=None, fuzzy_street_name=False):
    """Decide whether two address records are duplicates of one another.

    The street, house number and base house number are read from each
    record with ``.get(AddressComponents.<NAME>)`` and stripped of
    surrounding whitespace when truthy.

    Args:
        cls: Not used by this function.
        a1: First record; must provide ``.get(key)``.
        a2: Second record; must provide ``.get(key)``.
        languages: Forwarded unchanged to the comparison helpers.
        fuzzy_street_name: When true, streets that are neither exact nor
            likely duplicates get a second chance through
            ``is_street_duplicate_fuzzy``.

    Returns:
        A ``(status, similarity)`` tuple.

        * ``(NON_DUPLICATE, 0.0)`` when only one record has a street, when
          only one has a house number, when neither component is present
          on both records, or when the streets do not match.
        * ``(NON_DUPLICATE, <house number similarity>)`` when the house
          numbers do not match.
        * When both streets are present and match: ``min`` of the
          ``(status, similarity)`` tuples for street and house number.
        * Otherwise (only house numbers present, and they match): the
          house number ``(status, similarity)``.
    """

    def _component(record, key):
        # Strip truthy values; missing/empty values pass through untouched.
        text = record.get(key)
        return text.strip() if text else text

    street_a = _component(a1, AddressComponents.STREET)
    street_b = _component(a2, AddressComponents.STREET)
    number_a = _component(a1, AddressComponents.HOUSE_NUMBER)
    number_b = _component(a2, AddressComponents.HOUSE_NUMBER)
    base_a = _component(a1, AddressComponents.HOUSE_NUMBER_BASE)
    base_b = _component(a2, AddressComponents.HOUSE_NUMBER_BASE)

    matching_statuses = (duplicate_status.EXACT_DUPLICATE,
                         duplicate_status.LIKELY_DUPLICATE)

    # A component present on exactly one side rules out a duplicate.
    if bool(street_a) != bool(street_b):
        return (duplicate_status.NON_DUPLICATE, 0.0)
    if bool(number_a) != bool(number_b):
        return (duplicate_status.NON_DUPLICATE, 0.0)

    # ---- street -----------------------------------------------------------
    streets_present = bool(street_a and street_b)
    street_status = duplicate_status.NON_DUPLICATE
    street_sim = 0.0

    if streets_present:
        street_status = is_street_duplicate(street_a, street_b,
                                            languages=languages)
        streets_match = street_status in matching_statuses
        status_scores = (
            (duplicate_status.EXACT_DUPLICATE, 1.0),
            (duplicate_status.NEEDS_REVIEW, 0.5),
            (duplicate_status.LIKELY_DUPLICATE, 0.9),
        )
        street_sim = next(
            (score for known, score in status_scores if street_status == known),
            0.0,
        )

        if not streets_match and fuzzy_street_name:
            # Every content token gets the same weight (1) before
            # normalization.
            tokens_a = Name.content_tokens(street_a, languages=languages)
            weights_a = WordIndex.normalized_vector([1] * len(tokens_a))
            tokens_b = Name.content_tokens(street_b, languages=languages)
            weights_b = WordIndex.normalized_vector([1] * len(tokens_b))

            if tokens_a and tokens_b and weights_a and weights_b:
                street_status, street_sim = is_street_duplicate_fuzzy(
                    tokens_a, weights_a, tokens_b, weights_b,
                    languages=languages)
                streets_match = street_status in matching_statuses

        if not streets_match:
            return (duplicate_status.NON_DUPLICATE, 0.0)

    # ---- house number -----------------------------------------------------
    numbers_present = bool(number_a and number_b)
    number_status = duplicate_status.NON_DUPLICATE
    number_sim = 0.0

    if numbers_present:
        numbers_match = False
        number_status = is_house_number_duplicate(number_a, number_b,
                                                  languages=languages)
        if number_status == duplicate_status.EXACT_DUPLICATE:
            numbers_match = True
            number_sim = 1.0
        elif base_a or base_b:
            # Retry using the base house number where one is available,
            # falling back to the full house number on the other side.
            # A match here only counts as a likely duplicate.
            base_status = is_house_number_duplicate(base_a or number_a,
                                                    base_b or number_b,
                                                    languages=languages)
            if base_status == duplicate_status.EXACT_DUPLICATE:
                numbers_match = True
                number_status = duplicate_status.LIKELY_DUPLICATE
                number_sim = 0.9

        if not numbers_match:
            return (duplicate_status.NON_DUPLICATE, number_sim)

    # ---- combine ----------------------------------------------------------
    # Past the guards above, any component present on both records matched.
    if streets_present:
        # NOTE(review): without house numbers the second tuple is still
        # (NON_DUPLICATE, 0.0), so the outcome depends on how
        # duplicate_status values order -- confirm this is intended.
        return min((street_status, street_sim), (number_status, number_sim))
    if numbers_present:
        return (number_status, number_sim)
    # Neither street nor house number was available to compare.
    return (duplicate_status.NON_DUPLICATE, 0.0)