def test_is_processed_works(mocker):
    """Every expose returned by a hunt is marked as processed in the maintainer."""
    hunt_config = Config(string=IdMaintainerTest.DUMMY_CONFIG)
    maintainer = IdMaintainer(":memory:")
    hunter = Hunter(hunt_config, [DummyCrawler()], maintainer)
    exposes = hunter.hunt_flats()
    assert count(exposes) > 4
    for expose in exposes:
        assert maintainer.is_processed(expose['id'])
def test_exposes_are_saved_to_maintainer():
    """Hunted exposes are persisted and retrievable via get_exposes_since."""
    maintainer = IdMaintainer(":memory:")
    hunter = Hunter(Config(string=IdMaintainerTest.CONFIG_WITH_FILTERS),
                    [DummyCrawler()], maintainer)
    exposes = hunter.hunt_flats()
    assert count(exposes) > 4
    cutoff = datetime.datetime.now() - datetime.timedelta(seconds=10)
    saved = maintainer.get_exposes_since(cutoff)
    assert len(saved) > 0
    assert count(exposes) < len(saved)
def test_exposes_are_returned_with_limit():
    """get_recent_exposes honours its limit argument and returns usable rows."""
    maintainer = IdMaintainer(":memory:")
    hunter = Hunter(Config(string=IdMaintainerTest.CONFIG_WITH_FILTERS),
                    [DummyCrawler()], maintainer)
    hunter.hunt_flats()
    recent = maintainer.get_recent_exposes(10)
    assert len(recent) == 10
    first = recent[0]
    assert first['title'] is not None
def test_exposes_are_returned_as_dictionaries():
    """Saved exposes come back as dict-like rows with title and created_at set."""
    maintainer = IdMaintainer(":memory:")
    hunter = Hunter(Config(string=IdMaintainerTest.CONFIG_WITH_FILTERS),
                    [DummyCrawler()], maintainer)
    hunter.hunt_flats()
    cutoff = datetime.datetime.now() - datetime.timedelta(seconds=10)
    saved = maintainer.get_exposes_since(cutoff)
    assert len(saved) > 0
    first = saved[0]
    assert first['title'] is not None
    assert first['created_at'] is not None
def test_exposes_are_returned_filtered():
    """Recent exposes can be fetched with a filter applied (max size 70 here).

    Fix: the local variable was named ``filter``, shadowing the builtin;
    renamed to ``size_filter``.
    """
    config = Config(string=IdMaintainerTest.CONFIG_WITH_FILTERS)
    id_watch = IdMaintainer(":memory:")
    hunter = Hunter(config, [DummyCrawler()], id_watch)
    # Hunt twice so enough exposes are stored to fill the limit of 10
    hunter.hunt_flats()
    hunter.hunt_flats()
    size_filter = Filter.builder().max_size_filter(70).build()
    saved = id_watch.get_recent_exposes(10, filter_set=size_filter)
    assert len(saved) == 10
    for expose in saved:
        # 'size' is a string like "65 m²"; compare its leading number
        assert int(re.match(r'\d+', expose['size'])[0]) <= 70
def test_all_filters_can_be_loaded():
    """Per-user filter settings can be read back for all users at once.

    Fix: the local variable was named ``filter``, shadowing the builtin;
    renamed to ``user_filter``.
    """
    config = Config(string=IdMaintainerTest.CONFIG_WITH_FILTERS)
    id_watch = IdMaintainer(":memory:")
    user_filter = {'fish': 'cat'}
    hunter = WebHunter(config, id_watch)
    hunter.set_filters_for_user(123, user_filter)
    hunter.set_filters_for_user(124, user_filter)
    assert id_watch.get_user_settings() == [(123, {'filters': user_filter}),
                                            (124, {'filters': user_filter})]
def test_filters_for_user_are_saved():
    """A filter saved for a user is returned unchanged by get_filters_for_user.

    Fix: the local variable was named ``filter``, shadowing the builtin;
    renamed to ``user_filter``.
    """
    config = Config(string=IdMaintainerTest.CONFIG_WITH_FILTERS)
    id_watch = IdMaintainer(":memory:")
    user_filter = {'fish': 'cat'}
    hunter = WebHunter(config, id_watch)
    hunter.set_filters_for_user(123, user_filter)
    assert hunter.get_filters_for_user(123) == user_filter
def test_addresses_are_processed_by_hunter(self):
    """Link-style addresses are resolved to real addresses by default."""
    dummy_config = Config(string=self.DUMMY_CONFIG)
    crawler = DummyCrawler(addresses_as_links=True)
    hunter = Hunter(dummy_config, [crawler], IdMaintainer(":memory:"))
    exposes = hunter.hunt_flats()
    self.assertTrue(count(exposes) > 4, "Expected to find exposes")
    for expose in exposes:
        self.assertFalse(expose['address'].startswith('http'),
                         "Expected addresses to be processed by default")
def test_hunt_flats(self):
    """A hunt with the Immowelt crawler yields at least one expose."""
    dummy_config = Config(string=self.DUMMY_CONFIG)
    crawler = CrawlImmowelt(Config(string=self.DUMMY_CONFIG))
    hunter = Hunter(dummy_config, [crawler], IdMaintainer(":memory:"))
    exposes = hunter.hunt_flats()
    self.assertTrue(count(exposes) > 0, "Expected to find exposes")
def test_ids_are_added_to_maintainer(mocker):
    """mark_processed is invoked once per crawled expose (24 expected)."""
    hunt_config = Config(string=IdMaintainerTest.DUMMY_CONFIG)
    maintainer = IdMaintainer(":memory:")
    mark_processed_spy = mocker.spy(maintainer, "mark_processed")
    hunter = Hunter(hunt_config, [DummyCrawler()], maintainer)
    exposes = hunter.hunt_flats()
    assert count(exposes) > 4
    assert mark_processed_spy.call_count == 24
class IdMaintainerTest(unittest.TestCase):
    """Unit tests for the SQLite-backed IdMaintainer (in-memory DB)."""

    TEST_URL = 'https://www.immowelt.de/liste/berlin/wohnungen/mieten?roomi=2&prima=1500&wflmi=70&sort=createdate%2Bdesc'

    def setUp(self):
        # Fresh in-memory maintainer for every test case
        self.maintainer = IdMaintainer(":memory:")

    def test_read_from_empty_db(self):
        self.assertEqual(0, len(self.maintainer.get()),
                         "Expected empty db to return empty array")

    def test_read_after_write(self):
        self.maintainer.add(12345)
        self.assertEqual(12345, self.maintainer.get()[0], "Expected ID to be saved")

    def test_get_last_run_time_none_by_default(self):
        self.assertIsNone(self.maintainer.get_last_run_time(),
                          "Expected last run time to be none")

    def test_get_list_run_time_is_updated(self):
        # 'run_time' avoids shadowing the stdlib 'time' module name
        run_time = self.maintainer.update_last_run_time()
        self.assertIsNotNone(run_time, "Expected time not to be none")
        self.assertEqual(run_time, self.maintainer.get_last_run_time(),
                         "Expected last run time to be updated")
def launch_flat_hunt(config):
    """Hunt once immediately, then keep hunting while the loop stays active."""
    id_watch = IdMaintainer('%s/processed_ids.db' % config.database_location())
    hunter = Hunter(config, id_watch)
    hunter.hunt_flats()
    while config.get('loop', dict()).get('active', False):
        # Default pause between hunts: ten minutes
        interval = config.get('loop', dict()).get('sleeping_time', 60 * 10)
        time.sleep(interval)
        hunter.hunt_flats()
def client():
    """Flask test-client fixture backed by a Hunter with a temp-file DB."""
    app.config['TESTING'] = True
    with tempfile.NamedTemporaryFile(mode='w+') as temp_db:
        hunter = Hunter(Config(string=DUMMY_CONFIG), [CrawlImmowelt()],
                        IdMaintainer(temp_db.name))
        app.config['HUNTER'] = hunter
        with app.test_client() as test_client:
            yield test_client
def launch_flat_hunt(config):
    """Hunt once, then loop while configured; DB lives next to this file."""
    db_path = '%s/processed_ids.db' % os.path.dirname(os.path.abspath(__file__))
    hunter = Hunter(config, IdMaintainer(db_path))
    hunter.hunt_flats()
    while config.get('loop', dict()).get('active', False):
        interval = config.get('loop', dict()).get('sleeping_time', 60 * 10)
        time.sleep(interval)
        hunter.hunt_flats()
def launch_flat_hunt(config):
    """Run the flat hunt with the default crawlers, looping while active."""
    searchers = [CrawlImmobilienscout(), CrawlWgGesucht(), CrawlEbayKleinanzeigen()]
    db_path = '%s/processed_ids.db' % os.path.dirname(os.path.abspath(__file__))
    id_watch = IdMaintainer(db_path)
    hunter = Hunter()
    hunter.hunt_flats(config, searchers, id_watch)
    while config.get('loop', dict()).get('active', False):
        interval = config.get('loop', dict()).get('sleeping_time', 60 * 10)
        time.sleep(interval)
        hunter.hunt_flats(config, searchers, id_watch)
def launch_flat_hunt(config):
    """Start the crawler loop"""
    id_watch = IdMaintainer('%s/processed_ids.db' % config.database_location())
    hunter = Hunter(config, all_searchers(config), id_watch, RedisPubsub(config))
    hunter.hunt_flats()
    while config.get('loop', dict()).get('active', False):
        # Sleep between iterations; default is ten minutes
        interval = config.get('loop', dict()).get('sleeping_time', 60 * 10)
        time.sleep(interval)
        hunter.hunt_flats()
def hunt_client():
    """Flask test-client fixture wired to a WebHunter with a dummy crawler."""
    app.config['TESTING'] = True
    with tempfile.NamedTemporaryFile(mode='w+') as temp_db:
        hunt_config = Config(string=DUMMY_CONFIG)
        hunt_config.set_searchers([DummyCrawler()])
        app.config['HUNTER'] = WebHunter(hunt_config, IdMaintainer(temp_db.name))
        app.config['BOT_TOKEN'] = "1234xxx.12345"
        app.secret_key = b'test_session_key'
        with app.test_client() as test_client:
            yield test_client
def test_filter_min_rooms(self):
    """Exposes with fewer rooms than the configured minimum are filtered out."""
    min_rooms = 2
    config = Config(string=self.FILTER_MIN_ROOMS_CONFIG)
    config.set_searchers([DummyCrawler()])
    hunter = Hunter(config, IdMaintainer(":memory:"))
    exposes = hunter.hunt_flats()
    self.assertTrue(count(exposes) > 4, "Expected to find exposes")
    # Collect anything that slipped past the min-rooms filter
    unfiltered = [expose for expose in exposes
                  if float(re.search(r'\d+([\.,]\d+)?', expose['rooms'])[0]) < min_rooms]
    for expose in unfiltered:
        print("Got unfiltered expose: ", expose)
    self.assertTrue(len(unfiltered) == 0,
                    "Expected flats with too few rooms to be filtered")
def test_filter_max_size(self):
    """Exposes larger than the configured maximum size are filtered out."""
    max_size = 80
    config = Config(string=self.FILTER_MAX_SIZE_CONFIG)
    config.set_searchers([DummyCrawler()])
    hunter = Hunter(config, IdMaintainer(":memory:"))
    exposes = hunter.hunt_flats()
    self.assertTrue(count(exposes) > 4, "Expected to find exposes")
    # Collect anything that slipped past the max-size filter
    unfiltered = [expose for expose in exposes
                  if float(re.search(r'\d+([\.,]\d+)?', expose['size'])[0]) > max_size]
    for expose in unfiltered:
        print("Got unfiltered expose: ", expose)
    self.assertTrue(len(unfiltered) == 0, "Expected big flats to be filtered")
class IdMaintainerTest(unittest.TestCase):
    """Unit tests for the SQLite-backed IdMaintainer (in-memory DB)."""

    TEST_URL = 'https://www.immowelt.de/liste/berlin/wohnungen/mieten?roomi=2&prima=1500&wflmi=70&sort=createdate%2Bdesc'

    DUMMY_CONFIG = """
urls:
  - https://www.example.com/liste/berlin/wohnungen/mieten?roomi=2&prima=1500&wflmi=70&sort=createdate%2Bdesc
    """

    CONFIG_WITH_FILTERS = """
urls:
  - https://www.example.com/liste/berlin/wohnungen/mieten?roomi=2&prima=1500&wflmi=70&sort=createdate%2Bdesc
filters:
  max_price: 1000
    """

    def setUp(self):
        # Fresh in-memory maintainer for every test case
        self.maintainer = IdMaintainer(":memory:")

    def test_read_after_write(self):
        self.maintainer.mark_processed(12345)
        self.assertTrue(self.maintainer.is_processed(12345), "Expected ID to be saved")

    def test_get_last_run_time_none_by_default(self):
        self.assertIsNone(self.maintainer.get_last_run_time(),
                          "Expected last run time to be none")

    def test_get_list_run_time_is_updated(self):
        # 'run_time' avoids shadowing the stdlib 'time' module name
        run_time = self.maintainer.update_last_run_time()
        self.assertIsNotNone(run_time, "Expected time not to be none")
        self.assertEqual(run_time, self.maintainer.get_last_run_time(),
                         "Expected last run time to be updated")
def test_filter_titles(self):
    """Exposes whose titles contain excluded words are filtered out."""
    titlewords = ["wg", "tausch", "flat", "ruhig", "gruen"]
    filteredwords = ["wg", "tausch", "wochenendheimfahrer", "pendler", "zwischenmiete"]
    config = Config(string=self.FILTER_TITLES_CONFIG)
    config.set_searchers([DummyCrawler(titlewords)])
    hunter = Hunter(config, IdMaintainer(":memory:"))
    exposes = hunter.hunt_flats()
    self.assertTrue(count(exposes) > 4, "Expected to find exposes")
    # Collect any expose whose title still contains a filtered word
    unfiltered = [expose for expose in exposes
                  if any(word in expose['title'] for word in filteredwords)]
    for expose in unfiltered:
        print("Got unfiltered expose: ", expose)
    self.assertTrue(len(unfiltered) == 0, "Expected words to be filtered")
def launch_flat_hunt(config, heartbeat=None):
    """Starts the crawler / notification loop.

    :param config: hunting configuration (loop settings, database location)
    :param heartbeat: optional heartbeat notifier; skipped when None
    """
    id_watch = IdMaintainer('%s/processed_ids.db' % config.database_location())
    hunter = Hunter(config, id_watch)
    hunter.hunt_flats()
    counter = 0
    while config.get('loop', dict()).get('active', False):
        counter += 1
        # Fix: the default heartbeat=None was dereferenced unconditionally,
        # raising AttributeError on the first loop iteration. Only send a
        # heartbeat when one was actually supplied.
        if heartbeat is not None:
            counter = heartbeat.send_heartbeat(counter)
        time.sleep(config.get('loop', dict()).get('sleeping_time', 60 * 10))
        hunter.hunt_flats()
def test_filter_min_price(self):
    """Exposes cheaper than the configured minimum price are filtered out."""
    min_price = 700
    config = Config(string=self.FILTER_MIN_PRICE_CONFIG)
    hunter = Hunter(config, [DummyCrawler()], IdMaintainer(":memory:"))
    exposes = hunter.hunt_flats()
    self.assertTrue(count(exposes) > 4, "Expected to find exposes")
    # Collect anything that slipped past the min-price filter
    unfiltered = [expose for expose in exposes
                  if float(re.search(r'\d+([\.,]\d+)?', expose['price'])[0]) < min_price]
    for expose in unfiltered:
        print("Got unfiltered expose: ", expose)
    self.assertTrue(len(unfiltered) == 0, "Expected cheap flats to be filtered")
def test_resolve_durations(self, m):
    """Every hunted expose gets travel durations from the distance-matrix API.

    The Google distance-matrix endpoint is mocked via requests_mock ('m').
    """
    config = Config(string=self.DUMMY_CONFIG)
    hunter = Hunter(config, [DummyCrawler()], IdMaintainer(":memory:"))
    matcher = re.compile('maps.googleapis.com/maps/api/distancematrix/json')
    m.get(
        matcher,
        text=
        '{"status": "OK", "rows": [ { "elements": [ { "distance": { "text": "far", "value": 123 }, "duration": { "text": "days", "value": 123 } } ] } ]}'
    )
    exposes = hunter.hunt_flats()
    self.assertTrue(count(exposes) > 4, "Expected to find exposes")
    # Anything missing a 'durations' key means the lookup was skipped
    without_durations = [expose for expose in exposes if 'durations' not in expose]
    for expose in without_durations:
        print("Got expose: ", expose)
    self.assertTrue(len(without_durations) == 0,
                    "Expected durations to be calculated")
def setUp(self):
    # Give every test a fresh in-memory id maintainer
    self.maintainer = IdMaintainer(":memory:")
def test_invalid_config(self):
    """Constructing a Hunter with a plain dict raises an 'Invalid config' error."""
    with self.assertRaises(Exception) as context:
        Hunter({}, IdMaintainer(":memory:"))
    message = str(context.exception)
    self.assertTrue('Invalid config' in message)
def setUp(self):
    # Hunter wired with the Immowelt crawler and an in-memory id store
    dummy_config = Config(string=self.DUMMY_CONFIG)
    self.hunter = Hunter(dummy_config, [CrawlImmowelt()], IdMaintainer(":memory:"))
"""Startup module for the flathunter web interface.

Fix: the script used ``os`` and ``CrawlEbayKleinanzeigen`` without importing
either; both imports are added below.
"""
import os

from flathunter.crawl_immobilienscout import CrawlImmobilienscout
from flathunter.crawl_wggesucht import CrawlWgGesucht
from flathunter.crawl_ebay_kleinanzeigen import CrawlEbayKleinanzeigen
from flathunter.crawl_immowelt import CrawlImmowelt
from flathunter.idmaintainer import IdMaintainer
from flathunter.googlecloud_idmaintainer import GoogleCloudIdMaintainer
from flathunter.hunter import Hunter
from flathunter.config import Config
from flathunter.web import app

searchers = [CrawlImmobilienscout(),
             CrawlWgGesucht(),
             CrawlEbayKleinanzeigen(),
             CrawlImmowelt()]

if __name__ == '__main__':
    # Use the SQLite DB file if we are running locally
    id_watch = IdMaintainer('%s/processed_ids.db' % os.path.dirname(os.path.abspath(__file__)))
else:
    # Use Google Cloud DB if we run on the cloud
    id_watch = GoogleCloudIdMaintainer()

hunter = Hunter(Config(), searchers, id_watch)
app.config["HUNTER"] = hunter

if __name__ == '__main__':
    app.run(host='127.0.0.1', port=8080, debug=True)
# Startup file for Google Cloud deployment
##
import os

from flathunter.idmaintainer import IdMaintainer
from flathunter.googlecloud_idmaintainer import GoogleCloudIdMaintainer
from flathunter.web_hunter import WebHunter
from flathunter.config import Config
from flathunter.web import app

config = Config()

if __name__ == '__main__':
    # Use the SQLite DB file if we are running locally
    id_watch = IdMaintainer('%s/processed_ids.db' % config.database_location())
else:
    # Use Google Cloud DB if we run on the cloud
    id_watch = GoogleCloudIdMaintainer()

hunter = WebHunter(config, id_watch)
app.config["HUNTER"] = hunter

if 'website' in config:
    app.secret_key = config['website']['session_key']
    app.config["DOMAIN"] = config['website']['domain']
    app.config["BOT_NAME"] = config['website']['bot_name']
else:
    app.secret_key = b'Not a secret'

# NOTE(review): BOT_TOKEN is set unconditionally here — assumed to be
# outside the else branch in the original; confirm against history.
app.config["BOT_TOKEN"] = config['telegram']['bot_token']