def test_location_is_not_matched_if_it_is_not_flat_address(self):
    """Locations listed after a "nearby" introducer must not be reported.

    The text names the flat's own street ("Karmelickiej") and then, after
    the introducer 'w sąsiedztwie', a street and a place joined by the
    conjunction 'i'. The test expects "Karmelicka" among the street matches
    and exactly one address overall.
    """
    provider = MockedAddressProvider(
        streets=[
            {"official": "Szeroka", "colloquial": []},
            {"official": "Karmelicka", "colloquial": []},
        ],
        places=[
            {"official": "Ikea", "colloquial": []},
        ])
    nearby_context = NearbyLocationContext(
        introducers={'w sąsiedztwie'},
        conjunctions={'i'},
        address_provider=provider)
    extract = AddressExtractor(provider, excluded_contexts=[nearby_context])

    result = extract(
        "Mieszkanie znajduje się na ulicy Karmelickiej. W sąsiedztwie ul. Szeroka i Ikea"
    )

    street_names = [str(match.location) for match in result.street]
    self.assertIn("Karmelicka", street_names)
    # NOTE(review): sibling tests read `.all`; confirm `.all_addresses`
    # is the intended attribute here.
    self.assertEqual(1, len(result.all_addresses))
def test_address_extractor_correctly_recognize_location_type(self):
    """Each name must be reported under the location type it was registered as.

    One street, one estate and one district are registered; the text
    mentions all three, and each result bucket is expected to hold exactly
    one match carrying the corresponding name.
    """
    provider = MockedAddressProvider(
        streets=[{"official": "Stanisława", "colloquial": []}],
        estates=[{"official": "Grzegorza", "colloquial": []}],
        districts=[{"official": "Piotra", "colloquial": []}])
    extract = AddressExtractor(provider)

    result = extract("blah blah Piotra blah Grzegorza blah Stanisława")

    expected_by_bucket = (
        ("street", "Stanisława"),
        ("estate", "Grzegorza"),
        ("district", "Piotra"),
    )
    for bucket, expected_name in expected_by_bucket:
        matches = getattr(result, bucket)
        self.assertEqual(len(matches), 1)
        self.assertIn(expected_name,
                      [str(match.location) for match in matches])
def test_only_longest_location_from_overlapping_matches_is_returned(self):
    """When candidate names overlap in the text, only the longest is kept.

    Two scenarios are checked, each in its own sub-test: two street names
    competing for the same words, and a place name that contains a street
    name.
    """

    def matched_names(provider, text):
        # Run a fresh extractor and collect the textual form of every match.
        result = AddressExtractor(provider)(text)
        return [str(match.location) for match in result.all]

    with self.subTest():
        names = matched_names(
            MockedAddressProvider(streets=[
                {"official": "Zygmunta Starego", "colloquial": []},
                {"official": "Stare Podgórze", "colloquial": []},
            ]),
            "\nDo wynajęcia 1-pokojowe funkcjonalne mieszkanie w spokojnej, dobrze skomunikowanej"
            " okolicy - Stare Podgórze przy ulicy Zamoyskiego, bardzo dobry dojazd do każdej części miasta."
        )
        self.assertIn("Stare Podgórze", names)
        self.assertNotIn("Zygmunta Starego", names)

    with self.subTest():
        names = matched_names(
            MockedAddressProvider(
                streets=[{"official": "Bronowicka", "colloquial": []}],
                places=[{"official": "Galeria Bronowicka", "colloquial": []}]),
            "Galeria Bronowicka")
        self.assertIn("Galeria Bronowicka", names)
        self.assertNotIn("Bronowicka", names)
def test_address_extractor_performs_morphological_comparison(self):
    """An inflected form in the text must match the registered street name.

    The text contains "Stanisławowi"; the street "Stanisława" is expected
    among the street matches.
    """
    provider = MockedAddressProvider(
        streets=[{"official": "Stanisława", "colloquial": []}])
    extract = AddressExtractor(provider)

    result = extract("Stanisławowi")

    street_names = [str(match.location) for match in result.street]
    self.assertIn("Stanisława", street_names)
def test_address_extractor_correctly_compares_names(self):
    """Check which parts of a two-word street name are enough to match it.

    For the street "Tadeusza Kościuszki": the second word alone and the full
    name are expected to match, the first word alone is expected not to.
    A new extractor and provider are built for every case.
    """
    streets = [{"official": "Tadeusza Kościuszki", "colloquial": []}]
    cases = (
        ("Kościuszki", self.assertIn),
        ("Tadeusza Kościuszki", self.assertIn),
        ("Tadeusza", self.assertNotIn),
    )

    for text, check in cases:
        extract = AddressExtractor(MockedAddressProvider(streets=streets))
        result = extract(text)
        street_names = [str(match.location) for match in result.street]
        check("Tadeusza Kościuszki", street_names)
def test_extract_address_with_unit_number(self):
    """A number following the street name is expected in the match's text form.

    "Zamoyskiego 15" should yield a street match whose string form is the
    official name followed by the number: "Jana Zamoyskiego 15".
    """
    provider = MockedAddressProvider(
        streets=[{"official": "Jana Zamoyskiego", "colloquial": []}])
    extract = AddressExtractor(provider)

    result = extract("Zamoyskiego 15")

    street_names = [str(match.location) for match in result.street]
    self.assertIn("Jana Zamoyskiego 15", street_names)
def test_case_does_not_matter_phrase_in_text_is_all_upper_case(self):
    """An all-upper-case (and inflected) street name in the text still matches.

    "ŚLICZNEJ" in the text is expected to produce the street "Śliczna".
    """
    provider = MockedAddressProvider(
        streets=[{"official": "Śliczna", "colloquial": []}])
    extract = AddressExtractor(provider)

    result = extract("mieszkanie przy ulicy ŚLICZNEJ")

    street_names = [str(match.location) for match in result.street]
    self.assertIn("Śliczna", street_names)
def test_case_matters(self):
    """A lower-case common word must not be taken for a street name.

    The adjective "śliczne" in the text is expected to produce no matches
    even though the street "Śliczna" is registered.
    """
    provider = MockedAddressProvider(
        streets=[{"official": "Śliczna", "colloquial": []}])
    extract = AddressExtractor(provider)

    result = extract("Oferuję do wynajęcia śliczne mieszkanie 4-pokojowe")

    self.assertEqual(0, len(result.all))
def test_address_extractor_returns_official_name_if_colloquial_name_matched(
        self):
    """A colloquial name in the text must be reported under the official name.

    The estate is registered as "Osiedle Na Kozłówce" with the colloquial
    alias "Kozłówek"; extracting from the alias is expected to yield the
    official name among the estate matches.
    """
    provider = MockedAddressProvider(
        estates=[{
            "official": "Osiedle Na Kozłówce",
            "colloquial": ["Kozłówek"],
        }])
    extract = AddressExtractor(provider)

    result = extract("Kozłówek")

    # Compare textual forms, like the sibling tests do, so the check does
    # not depend on how a location object compares with a plain string.
    estate_names = [str(match.location) for match in result.estate]
    self.assertIn("Osiedle Na Kozłówce", estate_names)
def test_osiedle_street_is_not_matched_to_osiedle_location_prefix(self):
    """The common noun "osiedle" must not match a street named "Osiedle".

    Extracting from "Duże osiedle." is expected to report no location whose
    textual form is "Osiedle".
    """
    provider = MockedAddressProvider(
        streets=[{"official": "Osiedle", "colloquial": []}])
    extract = AddressExtractor(provider)

    result = extract("Duże osiedle.")

    # Compare textual forms: a negative membership check against raw
    # location objects could pass vacuously if they never equal a str.
    matched_names = [str(match.location) for match in result.all]
    self.assertNotIn("Osiedle", matched_names)
def test_duplications_are_merged(self):
    """Two mentions of the same district are expected to yield one match.

    The text names "Nowa Huta" twice, once in an inflected form.
    """
    provider = MockedAddressProvider(
        districts=[{"official": "Nowa Huta", "colloquial": []}])
    extract = AddressExtractor(provider)

    result = extract('Dzielnica Nowa Huta. Mieszkanie się na Nowej Hucie')

    self.assertEqual(1, len(result.all))
def test_actual_all_uppercase_bug(self):
    """Regression check: an all-upper-case name yields exactly one match.

    'CZYŻYNY' is expected to produce the single location "Czyżyny".
    """
    provider = MockedAddressProvider(
        streets=[{"official": "Czyżyny", "colloquial": []}])
    extract = AddressExtractor(provider)

    result = extract('CZYŻYNY')

    matched_names = [str(match.location) for match in result.all]
    self.assertIn("Czyżyny", matched_names)
    self.assertEqual(1, len(result.all))
def test_zl_is_not_matched_to_zlota_street(self):
    """The currency abbreviation "zł"/"ZŁ" must not match the street "Złota".

    The extractor is configured with ``PriceContext`` as an excluded
    context; no location whose textual form is "Złota" is expected.
    """
    provider = MockedAddressProvider(
        streets=[{"official": "Złota", "colloquial": []}])
    extract = AddressExtractor(provider, excluded_contexts=[PriceContext()])

    result = extract(
        'czynsz najmu : 1600 zł + 553 ZŁ czynsz administracyjny + media .')

    # Compare textual forms: a negative membership check against raw
    # location objects could pass vacuously if they never equal a str.
    matched_names = [str(match.location) for match in result.all]
    self.assertNotIn("Złota", matched_names)
def test_Krakow_city_is_not_recognized_as_Kraka_street(self):
    """Forms of the city name "Kraków" must not match the street "Kraka".

    Both the nominative ("miasto Kraków") and an inflected form
    ("w Krakowie") are expected to produce no matches. The same extractor
    instance is reused for both texts.
    """
    provider = MockedAddressProvider(
        streets=[{"official": "Kraka", "colloquial": []}])
    extract = AddressExtractor(provider)

    for text in ("miasto Kraków", "w Krakowie"):
        result = extract(text)
        self.assertEqual(0, len(result.all))
def test_temp(self):  # TODO remove
    """Temporary debugging aid: run a single regression case verbosely.

    Loads the regression cases, picks the one at index 46, forces
    ``ignore_extra_matches`` to ``False`` and compares the extractor output
    with the expected result. The root logger is switched to ``NOTSET`` for
    the duration of the test only.
    """
    import logging

    # Remember the current root level so the verbosity change made here
    # does not leak into tests that run afterwards.
    previous_level = logging.root.level
    logging.root.setLevel(logging.NOTSET)
    try:
        all_test_cases = self._load_regression_cases()
        flat = all_test_cases[46]
        flat['ignore_extra_matches'] = False

        extractor = AddressExtractor(
            address_provider,
            excluded_contexts=[
                FirstWordOfSentenceContext(),
                NearbyLocationContext(address_provider=address_provider)
            ])
        found_address = extractor(flat['title'] + '.\n' + flat['description'])

        self._compare_address_results(flat, found_address)
    finally:
        logging.root.setLevel(previous_level)
def runner(flat):
    """Extract the address of one offer without ever raising ``Exception``.

    The text passed to the extractor is the offer's ``'title'``, then
    ``'.\\n'``, then its ``'description'``.

    Returns:
        ``(flat, found_address)`` on success, or ``(None, error)`` when any
        ``Exception`` was raised, where ``error`` is a new ``Exception``
        whose message is the original message, a newline, and the formatted
        traceback.
    """
    try:
        extract = AddressExtractor(
            address_provider,
            excluded_contexts=[
                FirstWordOfSentenceContext(),
                NearbyLocationContext(address_provider=address_provider),
                PriceContext(),
            ])
        offer_text = flat['title'] + '.\n' + flat['description']
        return flat, extract(offer_text)
    except Exception as error:
        # The failure is returned as a value, with the traceback folded
        # into the message text.
        formatted_trace = traceback.format_exc()
        message = '\n'.join([str(error), formatted_trace])
        return None, Exception(message)
def test_word_is_not_interpreted_as_location_if_it_is_first_word_of_a_sentence(
        self):
    """A capitalised sentence opener must not be taken for a street name.

    With ``FirstWordOfSentenceContext`` excluded, "Piękna" opening a
    sentence is expected to give no matches, while "Piękna 13" in the middle
    of a sentence is expected to give at least one.
    """
    provider = MockedAddressProvider(
        streets=[{"official": "Piękna", "colloquial": []}])
    extract = AddressExtractor(
        provider, excluded_contexts=[FirstWordOfSentenceContext()])

    sentence_opener = extract("Jakieś zdanie. Piękna okolica.")
    self.assertEqual(0, len(sentence_opener.all))

    mid_sentence = extract("Jakieś zdanie. Lokalizacja - Piękna 13")
    self.assertNotEqual(0, len(mid_sentence.all))
def does_introducer_refer_to_tested_location(self, introducer_subject):
    """Tell whether every token of ``introducer_subject`` is location-related.

    The tokens are joined with single spaces and run through a fresh
    ``AddressExtractor``. A token counts as location-related when it

    * lies inside the slice of a street/estate/district/place match,
    * is one of ``self.conjunctions``,
    * is, lower-cased, one of ``self.location_type_prefixes`` (a ``'.'``
      token directly after such a prefix counts as well), or
    * is a newline token.

    Returns:
        ``True`` when all tokens count, ``False`` otherwise.
    """
    extractor = AddressExtractor(self.address_provider)
    found = extractor(' '.join(introducer_subject))
    locations = chain(found.street, found.estate, found.district,
                      found.place)

    covered = [False] * len(introducer_subject)

    # locations
    # NOTE(review): assumes match_slice_position is a (begin, end) pair of
    # token indexes into introducer_subject -- confirm against the extractor.
    for location in locations:
        start, stop = location.match_slice_position
        for position in range(start, stop):
            covered[position] = True

    # conjunctions
    for position, token in enumerate(introducer_subject):
        if not covered[position] and token in self.conjunctions:
            covered[position] = True

    # location types
    for position, token in enumerate(introducer_subject):
        if covered[position]:
            continue
        if token.lower() not in self.location_type_prefixes:
            continue
        covered[position] = True
        # A prefix may be followed by an abbreviation dot; the prefix can
        # also be the last token, in which case there is nothing to mark.
        try:
            following = introducer_subject[position + 1]
        except IndexError:
            continue
        if following == '.':
            covered[position + 1] = True

    # newline character
    for position, token in enumerate(introducer_subject):
        covered[position] = covered[position] or token == '\n'

    return all(covered)
def test_street_duplications_are_merged(self):
    """Repeated mentions of one street are expected to collapse into one match.

    When one mention carries a number ("Mogilska 66") and the other does
    not, the single remaining match is expected to be the numbered one. A
    lone mention without a number yields just the street name. The same
    extractor instance serves both texts.
    """
    provider = MockedAddressProvider(
        streets=[{"official": "Mogilska", "colloquial": []}])
    extract = AddressExtractor(provider)

    result = extract('Mieszkanie przy ulicy Mogilskiej. Adres Mogilska 66')
    matched_names = [str(match.location) for match in result.all]
    self.assertIn("Mogilska 66", matched_names)
    self.assertEqual(1, len(result.all))

    result = extract('Mieszkanie przy ulicy Mogilskiej')
    matched_names = [str(match.location) for match in result.all]
    self.assertIn("Mogilska", matched_names)
    self.assertEqual(1, len(result.all))
"Mistrzejowice", "Swoszowice", "Ruczaj", 'Eliasza Radzikowskiego', 'Aleja 29 Listopada', 'Dobrego Pasterza', 'Żabiniec', 'Jana Sobieskiego', 'Białoprądnicka', ]) extractors = [ AddressExtractor( address_provider, excluded_contexts=[ FirstWordOfSentenceContext(), NearbyLocationContext(address_provider=address_provider), PriceContext() ]), InterconnectingRoomExtractor(), KitchenetteExtractor(), BachelorPadExtractor() ] two_room_cfg = {'price_low': 800, 'price_high': 1500, 'room': 2} three_room_cfg = {'price_low': 1000, 'price_high': 2200, 'room': 3} mgr = ScrappingManager(check_interval_in_seconds=1 * 60 * 60, filters=[ without_kitchenette,