def test_exposes_are_returned_with_limit(id_watch):
    """Check that ``get_recent_exposes(10)`` hands back exactly ten exposes.

    A ``Hunter`` wired to a ``DummyCrawler`` is run once against the
    ``id_watch`` object supplied by the caller; the first returned expose
    must carry a non-``None`` ``title`` entry.
    """
    hunt_config = Config(string=CONFIG_WITH_FILTERS)
    hunt_config.set_searchers([DummyCrawler()])
    flat_hunter = Hunter(hunt_config, id_watch)
    flat_hunter.hunt_flats()

    recent = id_watch.get_recent_exposes(10)

    assert len(recent) == 10
    first_expose = recent[0]
    assert first_expose['title'] is not None
def test_exposes_are_returned_with_limit():
    """Check that ``get_recent_exposes(10)`` hands back exactly ten exposes.

    The test builds its own ``IdMaintainer(":memory:")``, runs a ``Hunter``
    with a single ``DummyCrawler`` once, and then requires the first
    returned expose to carry a non-``None`` ``title`` entry.
    """
    hunt_config = Config(string=IdMaintainerTest.CONFIG_WITH_FILTERS)
    id_watch = IdMaintainer(":memory:")
    crawlers = [DummyCrawler()]
    flat_hunter = Hunter(hunt_config, crawlers, id_watch)
    flat_hunter.hunt_flats()

    recent = id_watch.get_recent_exposes(10)

    assert len(recent) == 10
    first_expose = recent[0]
    assert first_expose['title'] is not None
def launch_flat_hunt(config):
    """Run one hunt, then keep hunting while the ``loop.active`` setting is truthy.

    The id database lives at ``<config.database_location()>/processed_ids.db``.
    Both ``loop`` settings are re-read from ``config`` on every iteration;
    ``sleeping_time`` falls back to ``60 * 10`` and is passed to
    ``time.sleep``.
    """
    db_directory = config.database_location()
    id_watch = IdMaintainer('%s/processed_ids.db' % db_directory)
    hunter = Hunter(config, id_watch)

    # The first hunt happens unconditionally, even when looping is disabled.
    hunter.hunt_flats()
    while True:
        if not config.get('loop', dict()).get('active', False):
            break
        pause = config.get('loop', dict()).get('sleeping_time', 60 * 10)
        time.sleep(pause)
        hunter.hunt_flats()
def launch_flat_hunt(config):
    """Run one hunt, then keep hunting while the ``loop.active`` setting is truthy.

    The id database ``processed_ids.db`` is placed in the directory that
    contains this module. Config, crawlers and id maintainer are handed to
    ``hunt_flats`` on every call. ``sleeping_time`` falls back to
    ``60 * 10`` and is passed to ``time.sleep``.
    """
    searchers = [
        CrawlImmobilienscout(),
        CrawlWgGesucht(),
        CrawlEbayKleinanzeigen(),
    ]
    module_dir = os.path.dirname(os.path.abspath(__file__))
    id_watch = IdMaintainer('%s/processed_ids.db' % module_dir)
    hunter = Hunter()

    # The first hunt happens unconditionally, even when looping is disabled.
    hunter.hunt_flats(config, searchers, id_watch)
    while True:
        if not config.get('loop', dict()).get('active', False):
            break
        pause = config.get('loop', dict()).get('sleeping_time', 60 * 10)
        time.sleep(pause)
        hunter.hunt_flats(config, searchers, id_watch)
def test_exposes_are_returned_as_dictionaries(id_watch):
    """Check that exposes read back from ``id_watch`` support key lookup.

    After one ``hunt_flats()`` run, ``get_exposes_since`` is called with a
    timestamp ten seconds in the past. The result must be non-empty and its
    first entry must have non-``None`` ``title`` and ``created_at`` values.
    """
    hunt_config = Config(string=CONFIG_WITH_FILTERS)
    hunt_config.set_searchers([DummyCrawler()])
    flat_hunter = Hunter(hunt_config, id_watch)
    flat_hunter.hunt_flats()

    ten_seconds = datetime.timedelta(seconds=10)
    stored = id_watch.get_exposes_since(datetime.datetime.now() - ten_seconds)

    assert len(stored) > 0
    first_expose = stored[0]
    assert first_expose['title'] is not None
    assert first_expose['created_at'] is not None
def test_exposes_are_returned_as_dictionaries():
    """Check that exposes read back from an ``IdMaintainer`` support key lookup.

    After one ``hunt_flats()`` run against ``IdMaintainer(":memory:")``,
    ``get_exposes_since`` is called with a timestamp ten seconds in the
    past. The result must be non-empty and its first entry must have
    non-``None`` ``title`` and ``created_at`` values.
    """
    hunt_config = Config(string=IdMaintainerTest.CONFIG_WITH_FILTERS)
    id_watch = IdMaintainer(":memory:")
    crawlers = [DummyCrawler()]
    flat_hunter = Hunter(hunt_config, crawlers, id_watch)
    flat_hunter.hunt_flats()

    ten_seconds = datetime.timedelta(seconds=10)
    stored = id_watch.get_exposes_since(datetime.datetime.now() - ten_seconds)

    assert len(stored) > 0
    first_expose = stored[0]
    assert first_expose['title'] is not None
    assert first_expose['created_at'] is not None
def launch_flat_hunt(config):
    """Run one hunt, then keep hunting while the ``loop.active`` setting is truthy.

    The id database ``processed_ids.db`` is placed in the directory that
    contains this module. Both ``loop`` settings are re-read from ``config``
    on every iteration; ``sleeping_time`` falls back to ``60 * 10`` and is
    passed to ``time.sleep``.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    id_watch = IdMaintainer('%s/processed_ids.db' % module_dir)
    hunter = Hunter(config, id_watch)

    # The first hunt happens unconditionally, even when looping is disabled.
    hunter.hunt_flats()
    while True:
        if not config.get('loop', dict()).get('active', False):
            break
        pause = config.get('loop', dict()).get('sleeping_time', 60 * 10)
        time.sleep(pause)
        hunter.hunt_flats()
def test_exposes_are_returned_filtered():
    """Check that ``get_recent_exposes`` applies the filter passed as ``filter_set``.

    Two hunts are run before querying. The ten exposes returned with a
    ``max_size_filter(70)`` filter must each have a ``size`` whose leading
    digits are at most 70.
    """
    hunt_config = Config(string=IdMaintainerTest.CONFIG_WITH_FILTERS)
    id_watch = IdMaintainer(":memory:")
    crawlers = [DummyCrawler()]
    flat_hunter = Hunter(hunt_config, crawlers, id_watch)
    flat_hunter.hunt_flats()
    flat_hunter.hunt_flats()

    size_filter = Filter.builder().max_size_filter(70).build()
    recent = id_watch.get_recent_exposes(10, filter_set=size_filter)

    assert len(recent) == 10
    for entry in recent:
        leading_digits = re.match(r'\d+', entry['size'])[0]
        assert int(leading_digits) <= 70
def test_exposes_are_returned_filtered(id_watch):
    """Check that ``get_recent_exposes`` applies the filter passed as ``filter``.

    Two hunts are run before querying. The ten exposes returned with a
    ``max_size_filter(70)`` filter must each have a ``size`` whose leading
    digits are at most 70.
    """
    hunt_config = Config(string=CONFIG_WITH_FILTERS)
    hunt_config.set_searchers([DummyCrawler()])
    flat_hunter = Hunter(hunt_config, id_watch)
    flat_hunter.hunt_flats()
    flat_hunter.hunt_flats()

    size_filter = Filter.builder().max_size_filter(70).build()
    recent = id_watch.get_recent_exposes(10, filter=size_filter)

    assert len(recent) == 10
    for entry in recent:
        leading_digits = re.match(r'\d+', entry['size'])[0]
        assert int(leading_digits) <= 70
def launch_flat_hunt(config):
    """Start the crawler loop.

    One hunt is always performed; further hunts follow for as long as the
    ``loop.active`` setting is truthy. The id database lives at
    ``<config.database_location()>/processed_ids.db``. ``sleeping_time``
    falls back to ``60 * 10`` and is passed to ``time.sleep``.
    """
    db_directory = config.database_location()
    id_watch = IdMaintainer('%s/processed_ids.db' % db_directory)
    searchers = all_searchers(config)
    pubsub = RedisPubsub(config)
    hunter = Hunter(config, searchers, id_watch, pubsub)

    # The first hunt happens unconditionally, even when looping is disabled.
    hunter.hunt_flats()
    while True:
        if not config.get('loop', dict()).get('active', False):
            break
        pause = config.get('loop', dict()).get('sleeping_time', 60 * 10)
        time.sleep(pause)
        hunter.hunt_flats()
def launch_flat_hunt(config, heartbeat=None):
    """Start the crawler / notification loop.

    One hunt is always performed; further hunts follow for as long as the
    ``loop.active`` setting is truthy.

    Args:
        config: Configuration object providing ``database_location()`` and
            ``get()``. The ``loop`` settings are re-read on every iteration.
        heartbeat: Optional object with a ``send_heartbeat(counter)`` method
            whose return value replaces the iteration counter. When omitted
            (``None``), no heartbeat is sent and the counter just increments.
    """
    id_watch = IdMaintainer('%s/processed_ids.db' % config.database_location())

    hunter = Hunter(config, id_watch)
    hunter.hunt_flats()
    counter = 0

    while config.get('loop', dict()).get('active', False):
        counter += 1
        # ``heartbeat`` defaults to None; calling a method on it without
        # this guard raised AttributeError on the first loop iteration.
        if heartbeat is not None:
            counter = heartbeat.send_heartbeat(counter)
        time.sleep(config.get('loop', dict()).get('sleeping_time', 60 * 10))
        hunter.hunt_flats()
def test_hunt_flats(self):
    """Check that a hunt with a ``CrawlImmowelt`` crawler yields at least one expose."""
    hunt_config = Config(string=self.DUMMY_CONFIG)
    crawler = CrawlImmowelt(Config(string=self.DUMMY_CONFIG))
    id_watch = IdMaintainer(":memory:")
    flat_hunter = Hunter(hunt_config, [crawler], id_watch)

    found = flat_hunter.hunt_flats()

    self.assertTrue(count(found) > 0, "Expected to find exposes")
def test_addresses_are_processed_by_hunter(self):
    """Check that no expose returned by the hunter has an ``address`` starting with ``http``.

    The crawler is created with ``addresses_as_links=True``.
    """
    hunt_config = Config(string=self.DUMMY_CONFIG)
    crawler = DummyCrawler(addresses_as_links=True)
    id_watch = IdMaintainer(":memory:")
    flat_hunter = Hunter(hunt_config, [crawler], id_watch)

    found = flat_hunter.hunt_flats()

    self.assertTrue(count(found) > 4, "Expected to find exposes")
    for entry in found:
        looks_like_link = entry['address'].startswith('http')
        self.assertFalse(looks_like_link, "Expected addresses to be processed by default")
def test_ids_are_added_to_maintainer(mocker):
    """Check that one hunt calls ``IdMaintainer.mark_processed`` 24 times.

    The call count is observed through ``mocker.spy``.
    """
    hunt_config = Config(string=IdMaintainerTest.DUMMY_CONFIG)
    id_watch = IdMaintainer(":memory:")
    mark_spy = mocker.spy(id_watch, "mark_processed")
    crawlers = [DummyCrawler()]
    flat_hunter = Hunter(hunt_config, crawlers, id_watch)

    found = flat_hunter.hunt_flats()

    assert count(found) > 4
    assert mark_spy.call_count == 24
def test_is_processed_works(id_watch):
    """Check that ``id_watch.is_processed`` is truthy for every expose id from a hunt."""
    hunt_config = Config(string=CONFIG_WITH_FILTERS)
    hunt_config.set_searchers([DummyCrawler()])
    flat_hunter = Hunter(hunt_config, id_watch)

    found = flat_hunter.hunt_flats()

    assert count(found) > 4
    for entry in found:
        expose_id = entry['id']
        assert id_watch.is_processed(expose_id)
def test_is_processed_works(mocker):
    """Check that ``IdMaintainer.is_processed`` is truthy for every expose id from a hunt.

    The ``mocker`` argument is accepted but not used by the test body.
    """
    hunt_config = Config(string=IdMaintainerTest.DUMMY_CONFIG)
    id_watch = IdMaintainer(":memory:")
    crawlers = [DummyCrawler()]
    flat_hunter = Hunter(hunt_config, crawlers, id_watch)

    found = flat_hunter.hunt_flats()

    assert count(found) > 4
    for entry in found:
        expose_id = entry['id']
        assert id_watch.is_processed(expose_id)
def test_exposes_are_saved_to_maintainer(id_watch):
    """Check that ``id_watch`` holds more exposes than the hunt returned.

    ``get_exposes_since`` is called with a timestamp ten seconds in the
    past; its result must be non-empty and longer than ``count(exposes)``.
    """
    hunt_config = Config(string=CONFIG_WITH_FILTERS)
    hunt_config.set_searchers([DummyCrawler()])
    flat_hunter = Hunter(hunt_config, id_watch)

    exposes = flat_hunter.hunt_flats()
    assert count(exposes) > 4

    ten_seconds = datetime.timedelta(seconds=10)
    stored = id_watch.get_exposes_since(datetime.datetime.now() - ten_seconds)

    assert len(stored) > 0
    # ``count`` is deliberately evaluated again here, as in the first check.
    assert count(exposes) < len(stored)
def test_exposes_are_saved_to_maintainer():
    """Check that the ``IdMaintainer`` holds more exposes than the hunt returned.

    ``get_exposes_since`` is called with a timestamp ten seconds in the
    past; its result must be non-empty and longer than ``count(exposes)``.
    """
    hunt_config = Config(string=IdMaintainerTest.CONFIG_WITH_FILTERS)
    id_watch = IdMaintainer(":memory:")
    crawlers = [DummyCrawler()]
    flat_hunter = Hunter(hunt_config, crawlers, id_watch)

    exposes = flat_hunter.hunt_flats()
    assert count(exposes) > 4

    ten_seconds = datetime.timedelta(seconds=10)
    stored = id_watch.get_exposes_since(datetime.datetime.now() - ten_seconds)

    assert len(stored) > 0
    # ``count`` is deliberately evaluated again here, as in the first check.
    assert count(exposes) < len(stored)
def test_filter_min_rooms(self):
    """Check that no expose from the hunt has fewer than ``min_rooms`` rooms.

    The room count is the first number found in each expose's ``rooms``
    text. Offending exposes are printed before the assertion fails.
    """
    min_rooms = 2

    def room_count(expose):
        """Return the first number in ``expose['rooms']`` as a float."""
        number = re.search(r'\d+([\.,]\d+)?', expose['rooms'])[0]
        # The pattern also accepts a comma as decimal separator ("2,5"),
        # which float() rejects with ValueError; normalise it to a dot.
        return float(number.replace(',', '.'))

    config = Config(string=self.FILTER_MIN_ROOMS_CONFIG)
    config.set_searchers([DummyCrawler()])
    hunter = Hunter(config, IdMaintainer(":memory:"))
    exposes = hunter.hunt_flats()
    self.assertTrue(count(exposes) > 4, "Expected to find exposes")

    unfiltered = [expose for expose in exposes if room_count(expose) < min_rooms]
    for expose in unfiltered:
        print("Got unfiltered expose: ", expose)
    self.assertTrue(len(unfiltered) == 0, "Expected flats with too few rooms to be filtered")
def test_filter_max_size(self):
    """Check that no expose from the hunt is larger than ``max_size``.

    The size is the first number found in each expose's ``size`` text.
    Offending exposes are printed before the assertion fails.
    """
    max_size = 80

    def flat_size(expose):
        """Return the first number in ``expose['size']`` as a float."""
        number = re.search(r'\d+([\.,]\d+)?', expose['size'])[0]
        # The pattern also accepts a comma as decimal separator ("80,5"),
        # which float() rejects with ValueError; normalise it to a dot.
        return float(number.replace(',', '.'))

    config = Config(string=self.FILTER_MAX_SIZE_CONFIG)
    config.set_searchers([DummyCrawler()])
    hunter = Hunter(config, IdMaintainer(":memory:"))
    exposes = hunter.hunt_flats()
    self.assertTrue(count(exposes) > 4, "Expected to find exposes")

    unfiltered = [expose for expose in exposes if flat_size(expose) > max_size]
    for expose in unfiltered:
        print("Got unfiltered expose: ", expose)
    self.assertTrue(len(unfiltered) == 0, "Expected big flats to be filtered")
def test_filter_titles(self):
    """Check that no expose title from the hunt contains one of the filtered words.

    The crawler is created with ``titlewords``. Offending exposes are
    printed before the assertion fails.
    """
    titlewords = ["wg", "tausch", "flat", "ruhig", "gruen"]
    filteredwords = ["wg", "tausch", "wochenendheimfahrer", "pendler", "zwischenmiete"]

    def has_filtered_word(expose):
        """Tell whether any filtered word occurs in the expose title."""
        return any(word in expose['title'] for word in filteredwords)

    hunt_config = Config(string=self.FILTER_TITLES_CONFIG)
    hunt_config.set_searchers([DummyCrawler(titlewords)])
    flat_hunter = Hunter(hunt_config, IdMaintainer(":memory:"))
    exposes = flat_hunter.hunt_flats()
    self.assertTrue(count(exposes) > 4, "Expected to find exposes")

    unfiltered = [expose for expose in exposes if has_filtered_word(expose)]
    # Iterating an empty list prints nothing, so no length guard is needed.
    for offender in unfiltered:
        print("Got unfiltered expose: ", offender)
    self.assertTrue(len(unfiltered) == 0, "Expected words to be filtered")
def test_filter_min_price(self):
    """Check that no expose from the hunt is cheaper than ``min_price``.

    The price is the first number found in each expose's ``price`` text.
    Offending exposes are printed before the assertion fails.
    """
    min_price = 700

    def flat_price(expose):
        """Return the first number in ``expose['price']`` as a float."""
        number = re.search(r'\d+([\.,]\d+)?', expose['price'])[0]
        # The pattern also accepts a comma as decimal separator ("699,50"),
        # which float() rejects with ValueError; normalise it to a dot.
        return float(number.replace(',', '.'))

    config = Config(string=self.FILTER_MIN_PRICE_CONFIG)
    hunter = Hunter(config, [DummyCrawler()], IdMaintainer(":memory:"))
    exposes = hunter.hunt_flats()
    self.assertTrue(count(exposes) > 4, "Expected to find exposes")

    unfiltered = [expose for expose in exposes if flat_price(expose) < min_price]
    for expose in unfiltered:
        print("Got unfiltered expose: ", expose)
    self.assertTrue(
        len(unfiltered) == 0, "Expected cheap flats to be filtered")
def test_resolve_durations(self, m):
    """Check that every expose from the hunt carries a ``durations`` key.

    ``m`` is used to register a canned JSON reply for GET requests whose
    URL matches the distance-matrix pattern (presumably a requests-mock
    mocker injected by a decorator; confirm at the decoration site).
    Exposes lacking ``durations`` are printed before the assertion fails.
    """
    hunt_config = Config(string=self.DUMMY_CONFIG)
    crawlers = [DummyCrawler()]
    flat_hunter = Hunter(hunt_config, crawlers, IdMaintainer(":memory:"))

    url_pattern = re.compile('maps.googleapis.com/maps/api/distancematrix/json')
    canned_reply = '{"status": "OK", "rows": [ { "elements": [ { "distance": { "text": "far", "value": 123 }, "duration": { "text": "days", "value": 123 } } ] } ]}'
    m.get(url_pattern, text=canned_reply)

    exposes = flat_hunter.hunt_flats()
    self.assertTrue(count(exposes) > 4, "Expected to find exposes")

    without_durations = [entry for entry in exposes if 'durations' not in entry]
    # Iterating an empty list prints nothing, so no length guard is needed.
    for entry in without_durations:
        print("Got expose: ", entry)
    self.assertTrue(
        len(without_durations) == 0, "Expected durations to be calculated")
class HunterTest(unittest.TestCase):
    """Test case that runs a ``Hunter`` built around a ``CrawlImmowelt`` crawler."""

    # Configuration text handed to ``Config(string=...)``.
    # NOTE(review): the block structure (one entry per line, the list item
    # under ``urls`` and the three keys under ``google_maps_api``) was
    # restored from a flattened single-line form - confirm the nesting,
    # in particular that ``enable`` belongs to ``google_maps_api``.
    DUMMY_CONFIG = """
urls:
  - https://www.immowelt.de/liste/berlin/wohnungen/mieten?roomi=2&prima=1500&wflmi=70&sort=createdate%2Bdesc
google_maps_api:
  key: SOME_KEY
  url: https://maps.googleapis.com/maps/api/distancematrix/json?origins={origin}&destinations={dest}&mode={mode}&sensor=true&key={key}&arrival_time={arrival}
  enable: true
"""

    def setUp(self):
        """Create a fresh ``Hunter`` backed by an in-memory ``IdMaintainer`` for each test."""
        self.hunter = Hunter(Config(string=self.DUMMY_CONFIG), [CrawlImmowelt()], IdMaintainer(":memory:"))

    def test_hunt_flats(self):
        """Check that ``hunt_flats()`` returns at least one expose."""
        exposes = self.hunter.hunt_flats()
        self.assertTrue(len(exposes) > 0, "Expected to find exposes")