def test_location_is_not_matched_if_it_is_not_flat_address(self):
        # Locations listed after the "w sąsiedztwie" introducer are handed to
        # NearbyLocationContext as an excluded context; only one address is
        # expected in the result and "Karmelicka" must be among the streets.
        known_streets = [
            {"official": "Szeroka", "colloquial": []},
            {"official": "Karmelicka", "colloquial": []},
        ]
        known_places = [{"official": "Ikea", "colloquial": []}]
        provider = MockedAddressProvider(streets=known_streets,
                                         places=known_places)

        nearby_context = NearbyLocationContext(
            introducers={'w sąsiedztwie'},
            conjunctions={'i'},
            address_provider=provider)
        extract = AddressExtractor(provider,
                                   excluded_contexts=[nearby_context])

        result = extract(
            "Mieszkanie znajduje się na ulicy Karmelickiej. W sąsiedztwie ul. Szeroka i Ikea"
        )

        street_names = [str(match.location) for match in result.street]
        self.assertIn("Karmelicka", street_names)
        self.assertEqual(1, len(result.all_addresses))
    def test_address_extractor_correctly_recognize_location_type(self):
        # The provider knows exactly one street, one estate and one district;
        # each name must be reported once, under its own location kind.
        provider = MockedAddressProvider(
            streets=[{"official": "Stanisława", "colloquial": []}],
            estates=[{"official": "Grzegorza", "colloquial": []}],
            districts=[{"official": "Piotra", "colloquial": []}])

        extract = AddressExtractor(provider)
        result = extract("blah blah Piotra blah Grzegorza blah Stanisława")

        # Checked in the same order as before: street, estate, district.
        expected_by_kind = (
            ("street", "Stanisława"),
            ("estate", "Grzegorza"),
            ("district", "Piotra"),
        )
        for kind, expected_name in expected_by_kind:
            self.assertEqual(len(getattr(result, kind)), 1)
            self.assertIn(
                expected_name,
                [str(match.location) for match in getattr(result, kind)])
    def test_only_longest_location_from_overlapping_matches_is_returned(self):
        # Scenario 1: two known streets, only "Stare Podgórze" appears in the
        # text and only it may be reported.
        with self.subTest():
            street_entries = [
                {"official": "Zygmunta Starego", "colloquial": []},
                {"official": "Stare Podgórze", "colloquial": []},
            ]
            provider = MockedAddressProvider(streets=street_entries)

            extract = AddressExtractor(provider)
            result = extract(
                "\nDo wynajęcia 1-pokojowe funkcjonalne mieszkanie w spokojnej, dobrze skomunikowanej"
                " okolicy - Stare Podgórze przy ulicy Zamoyskiego, bardzo dobry dojazd do każdej części miasta."
            )
            matched_names = [str(match.location) for match in result.all]

            self.assertIn("Stare Podgórze", matched_names)
            self.assertNotIn("Zygmunta Starego", matched_names)

        # Scenario 2: the place name contains the street name; only the place
        # may be reported.
        with self.subTest():
            provider = MockedAddressProvider(
                streets=[{"official": "Bronowicka", "colloquial": []}],
                places=[{"official": "Galeria Bronowicka", "colloquial": []}])

            extract = AddressExtractor(provider)
            result = extract("Galeria Bronowicka")
            matched_names = [str(match.location) for match in result.all]

            self.assertIn("Galeria Bronowicka", matched_names)
            self.assertNotIn("Bronowicka", matched_names)
    def test_address_extractor_performs_morphological_comparison(self):
        # The text contains the inflected form "Stanisławowi"; the street
        # registered as "Stanisława" must still be reported.
        street_entries = [{"official": "Stanisława", "colloquial": []}]
        extract = AddressExtractor(
            MockedAddressProvider(streets=street_entries))

        result = extract("Stanisławowi")

        street_names = [str(match.location) for match in result.street]
        self.assertIn("Stanisława", street_names)
    def test_address_extractor_correctly_compares_names(self):
        # "Kościuszki" alone and the full name must both resolve to the
        # official street name; "Tadeusza" alone must not.
        streets = [{
            "official": "Tadeusza Kościuszki",
            "colloquial": [],
        }]
        official_name = "Tadeusza Kościuszki"

        # A fresh provider and extractor are built for every phrase, exactly
        # as in the straight-line version of this test.
        for phrase in ("Kościuszki", "Tadeusza Kościuszki"):
            extract = AddressExtractor(MockedAddressProvider(streets=streets))
            result = extract(phrase)
            self.assertIn(
                official_name,
                [str(match.location) for match in result.street])

        extract = AddressExtractor(MockedAddressProvider(streets=streets))
        result = extract("Tadeusza")
        self.assertNotIn(
            official_name,
            [str(match.location) for match in result.street])
    def test_extract_address_with_unit_number(self):
        # "Zamoyskiego 15" must come back as the official street name with
        # the number appended.
        street_entries = [{"official": "Jana Zamoyskiego", "colloquial": []}]
        extract = AddressExtractor(
            MockedAddressProvider(streets=street_entries))

        result = extract("Zamoyskiego 15")

        street_names = [str(match.location) for match in result.street]
        self.assertIn("Jana Zamoyskiego 15", street_names)
    def test_case_does_not_matter_phrase_in_text_is_all_upper_case(self):
        # The street name is written entirely in upper case (and inflected)
        # in the text; it must still be reported as "Śliczna".
        street_entries = [{"official": "Śliczna", "colloquial": []}]
        extract = AddressExtractor(
            MockedAddressProvider(streets=street_entries))

        result = extract("mieszkanie przy ulicy ŚLICZNEJ")

        street_names = [str(match.location) for match in result.street]
        self.assertIn("Śliczna", street_names)
    def test_case_matters(self):
        # The lower-case adjective "śliczne" in running text must not be
        # reported as the street "Śliczna": no matches at all are expected.
        street_entries = [{"official": "Śliczna", "colloquial": []}]
        extract = AddressExtractor(
            MockedAddressProvider(streets=street_entries))

        result = extract("Oferuję do wynajęcia śliczne mieszkanie 4-pokojowe")

        self.assertEqual(0, len(result.all))
    def test_address_extractor_returns_official_name_if_colloquial_name_matched(
            self):
        # The text uses the colloquial name "Kozłówek"; the reported estate
        # must be the official "Osiedle Na Kozłówce".
        estate_entries = [{
            "official": "Osiedle Na Kozłówce",
            "colloquial": ["Kozłówek"],
        }]
        extract = AddressExtractor(
            MockedAddressProvider(estates=estate_entries))

        result = extract("Kozłówek")

        # The location objects themselves (not their str form) are compared
        # against the expected name here.
        reported_estates = [match.location for match in result.estate]
        self.assertIn("Osiedle Na Kozłówce", reported_estates)
    def test_osiedle_street_is_not_matched_to_osiedle_location_prefix(self):
        # The provider knows a street literally named "Osiedle"; the word
        # "osiedle" in the text "Duże osiedle." must not be reported as it.
        mocked_address_provider = MockedAddressProvider(streets=[
            {
                "official": "Osiedle",
                "colloquial": [],
            },
        ], )

        extractor = AddressExtractor(mocked_address_provider)

        found_address = extractor("Duże osiedle.")

        # Compare string forms, as the other tests of this extractor do.
        # Membership is decided by ==, so checking a str against raw location
        # objects could never fail if the location type does not define
        # equality with str, and the assertion would pass vacuously.
        self.assertNotIn(
            "Osiedle",
            [str(match.location) for match in found_address.all])
    def test_duplications_are_merged(self):
        # The same district is mentioned twice (once inflected); the result
        # must contain a single match.
        district_entries = [{"official": "Nowa Huta", "colloquial": []}]
        extract = AddressExtractor(
            MockedAddressProvider(districts=district_entries))

        result = extract('Dzielnica Nowa Huta. Mieszkanie się na Nowej Hucie')

        self.assertEqual(1, len(result.all))
    def test_actual_all_uppercase_bug(self):
        # An all-upper-case text consisting only of the street name must
        # yield exactly one match, reported as "Czyżyny".
        street_entries = [{"official": "Czyżyny", "colloquial": []}]
        extract = AddressExtractor(
            MockedAddressProvider(streets=street_entries))

        result = extract('CZYŻYNY')

        matched_names = [str(match.location) for match in result.all]
        self.assertIn("Czyżyny", matched_names)
        self.assertEqual(1, len(result.all))
    def test_zl_is_not_matched_to_zlota_street(self):
        # The text contains the tokens "zł" and "ZŁ" next to amounts; with
        # PriceContext excluded they must not be reported as the street
        # "Złota".
        mocked_address_provider = MockedAddressProvider(streets=[
            {
                "official": "Złota",
                "colloquial": [],
            },
        ], )

        extractor = AddressExtractor(mocked_address_provider,
                                     excluded_contexts=[PriceContext()])

        found_address = extractor(
            'czynsz najmu : 1600 zł + 553 ZŁ czynsz administracyjny + media .')

        # Compare string forms, as the other tests of this extractor do.
        # Membership is decided by ==, so checking a str against raw location
        # objects could never fail if the location type does not define
        # equality with str, and the assertion would pass vacuously.
        self.assertNotIn(
            "Złota",
            [str(match.location) for match in found_address.all])
    def test_Krakow_city_is_not_recognized_as_Kraka_street(self):
        # Neither form of the city name used below may be reported as the
        # street "Kraka": both texts must yield no matches.
        street_entries = [{"official": "Kraka", "colloquial": []}]
        extract = AddressExtractor(
            MockedAddressProvider(streets=street_entries))

        # The same extractor instance handles both texts, in this order.
        for text in ("miasto Kraków", "w Krakowie"):
            result = extract(text)
            self.assertEqual(0, len(result.all))
    def test_temp(self):  # TODO remove
        # Debugging aid: runs the extractor on one hard-coded regression case
        # (index 46) with root logging fully enabled and with extra matches
        # not ignored, then compares the result with the expected addresses.
        import logging

        # Remember the root logger level so that enabling everything here
        # does not leak into tests that run afterwards.
        root_logger = logging.root
        previous_level = root_logger.level
        root_logger.setLevel(logging.NOTSET)
        try:
            all_test_cases = self._load_regression_cases()
            flat = all_test_cases[46]
            flat['ignore_extra_matches'] = False

            extractor = AddressExtractor(
                address_provider,
                excluded_contexts=[
                    FirstWordOfSentenceContext(),
                    NearbyLocationContext(address_provider=address_provider)
                ])

            found_address = extractor(flat['title'] + '.\n' +
                                      flat['description'])
            self._compare_address_results(flat, found_address)
        finally:
            # Restore the level even when the comparison above fails.
            root_logger.setLevel(previous_level)
        def runner(flat):
            """Extract addresses from one flat offer without raising.

            The offer's ``title`` and ``description`` are joined with
            ``'.\\n'`` and passed to a freshly built ``AddressExtractor``.

            Returns ``(flat, found_address)`` on success. On any
            ``Exception`` returns ``(None, exc)``, where ``exc`` is a new
            ``Exception`` whose message is the original message followed by a
            newline and the formatted traceback.
            """
            try:
                excluded = [
                    FirstWordOfSentenceContext(),
                    NearbyLocationContext(address_provider=address_provider),
                    PriceContext()
                ]
                extract = AddressExtractor(address_provider,
                                           excluded_contexts=excluded)

                offer_text = flat['title'] + '.\n' + flat['description']
                return flat, extract(offer_text)
            except Exception as error:
                formatted_trace = traceback.format_exc()
                message = '\n'.join((str(error), formatted_trace))
                return None, Exception(message)
    def test_word_is_not_interpreted_as_location_if_it_is_first_word_of_a_sentence(
            self):
        # With FirstWordOfSentenceContext excluded, "Piękna" opening a
        # sentence yields no match, while "Piękna 13" later in a sentence
        # yields at least one.
        street_entries = [{"official": "Piękna", "colloquial": []}]
        extract = AddressExtractor(
            MockedAddressProvider(streets=street_entries),
            excluded_contexts=[FirstWordOfSentenceContext()])

        result = extract("Jakieś zdanie. Piękna okolica.")
        self.assertEqual(0, len(result.all))

        result = extract("Jakieś zdanie. Lokalizacja - Piękna 13")
        self.assertNotEqual(0, len(result.all))
示例#18
0
    def does_introducer_refer_to_tested_location(self, introducer_subject):
        """Tell whether every token of ``introducer_subject`` is accounted for.

        The tokens are joined with single spaces and run through a fresh
        ``AddressExtractor`` built on ``self.address_provider``. A token
        counts as accounted for when it is covered by a match slice of a
        found street, estate, district or place, is a member of
        ``self.conjunctions``, is (lower-cased) a member of
        ``self.location_type_prefixes``, is a ``'.'`` directly following such
        a prefix, or is a newline character.

        Returns ``True`` only if all tokens are accounted for (which includes
        the case of no tokens at all).
        """
        extract = AddressExtractor(self.address_provider)
        extracted = extract(' '.join(introducer_subject))

        token_count = len(introducer_subject)
        accounted_for = [False] * token_count

        # Pass 1: tokens covered by any extracted location.
        locations = chain(extracted.street, extracted.estate,
                          extracted.district, extracted.place)
        for location in locations:
            start, stop = location.match_slice_position
            for position in range(start, stop):
                accounted_for[position] = True

        # Pass 2: conjunctions.
        for position, token in enumerate(introducer_subject):
            if accounted_for[position]:
                continue
            if token in self.conjunctions:
                accounted_for[position] = True

        # Pass 3: location type prefixes, plus the dot that may follow one.
        for position, token in enumerate(introducer_subject):
            if accounted_for[position]:
                continue
            if token.lower() not in self.location_type_prefixes:
                continue
            accounted_for[position] = True
            following = position + 1
            # A prefix in the last position has nothing after it to inspect.
            if following < token_count and introducer_subject[following] == '.':
                accounted_for[following] = True

        # Pass 4: newline characters.
        for position, token in enumerate(introducer_subject):
            if accounted_for[position]:
                continue
            if token == '\n':
                accounted_for[position] = True

        return all(accounted_for)
    def test_street_duplications_are_merged(self):
        # Two mentions of the same street, one with a number, collapse into a
        # single "Mogilska 66" match; a lone mention yields a single
        # "Mogilska" match.
        street_entries = [{"official": "Mogilska", "colloquial": []}]
        extract = AddressExtractor(
            MockedAddressProvider(streets=street_entries))

        result = extract(
            'Mieszkanie przy ulicy Mogilskiej. Adres Mogilska 66')
        matched_names = [str(match.location) for match in result.all]
        self.assertIn("Mogilska 66", matched_names)
        self.assertEqual(1, len(result.all))

        # The same extractor instance is reused for the second text.
        result = extract('Mieszkanie przy ulicy Mogilskiej')
        matched_names = [str(match.location) for match in result.all]
        self.assertIn("Mogilska", matched_names)
        self.assertEqual(1, len(result.all))
示例#20
0
        "Mistrzejowice",
        "Swoszowice",
        "Ruczaj",
        'Eliasza Radzikowskiego',
        'Aleja 29 Listopada',
        'Dobrego Pasterza',
        'Żabiniec',
        'Jana Sobieskiego',
        'Białoprądnicka',
    ])

    extractors = [
        AddressExtractor(
            address_provider,
            excluded_contexts=[
                FirstWordOfSentenceContext(),
                NearbyLocationContext(address_provider=address_provider),
                PriceContext()
            ]),
        InterconnectingRoomExtractor(),
        KitchenetteExtractor(),
        BachelorPadExtractor()
    ]

    two_room_cfg = {'price_low': 800, 'price_high': 1500, 'room': 2}

    three_room_cfg = {'price_low': 1000, 'price_high': 2200, 'room': 3}

    mgr = ScrappingManager(check_interval_in_seconds=1 * 60 * 60,
                           filters=[
                               without_kitchenette,