def test_01_insert_init_data(self):
    """Check that every sample domain can be saved and listed back.

    A ``Domain`` is saved for each ``name -> url`` entry of ``domains``.
    Every record returned by ``Domain.get_by_filters()`` then removes its
    name from a working copy of the mapping; the test passes when that
    copy ends up empty, i.e. every inserted name was returned.
    """
    # Work on a copy so the shared ``domains`` mapping is left untouched.
    outstanding = dict(domains)

    for name, url in outstanding.items():
        Domain(domain=name, url=url).save()

    # Names that were not part of the sample set are ignored (default None).
    for stored in Domain.get_by_filters():
        outstanding.pop(stored.domain, None)

    self.assertEqual(outstanding, {})
def test_03_regexp(self):
    """Check that ``run_regexp`` records the regexp match for a snapshot.

    Saves a dummy ``Domain``, a ``Snapshot`` of it whose html contains the
    text to match, and a ``DomainCheck`` holding the pattern. After
    ``run_regexp`` is called for the snapshot, the first
    ``SnapshotCheckData`` found for the check must carry a ``check_value``
    that JSON-decodes to ``["me"]``.
    """
    domain = Domain(domain='dummy', url='dummy')
    domain.save()

    snapshot = Snapshot(
        domain_id=domain.id,
        pulled_at=datetime.utcnow(),
        html="find me with a regexp!!",
    )
    snapshot.save()

    check = DomainCheck(
        domain_id=domain.id,
        name="dummy_check",
        regexp="find (me|you)",
    )
    check.save()

    run_regexp(snapshot_id=snapshot.id)

    # Only the first stored result for this check is inspected.
    results = SnapshotCheckData.get_by_filters(check_id=check.id)
    decoded = json.loads(results[0].check_value)
    self.assertEqual(decoded, ["me"])
if __name__ == "__main__":
    # Sample domains to register: domain name -> URL.
    # NOTE(review): the "fail!" entry looks like a deliberately invalid URL
    # used to exercise the failure path - confirm.
    domains = {
        "helsinkitimes": "https://www.helsinkitimes.fi/",
        "berlin": "https://www.berlin.de/en/news/",
        "9news": "https://www.9news.com.au/sydney",
        "fail!": "fail://fail.com",
    }
    # Regexp checks to attach per domain name; domains without an entry
    # get no checks.
    domain_checks = {
        "helsinkitimes": ["covid(\\d+) ", "govern(\\w+)"],
    }

    # Presumably drops and recreates the tables so each run starts clean -
    # verify against the helpers' definitions.
    delete_tables()
    create_tables()

    # Save every domain together with its checks, remembering the ids.
    domain_ids = []
    for name, url in domains.items():
        domain = Domain(domain=name, url=url)
        domain.save()
        domain_ids.append(domain.id)

        # Checks are named "<domain name>-<index within the domain's list>".
        for position, pattern in enumerate(domain_checks.get(name, [])):
            check = DomainCheck(
                domain_id=domain.id,
                name=f"{name}-{position}",
                regexp=pattern,
            )
            check.save()

    # Send one collector task per domain, repeated for n_pulls rounds.
    for _ in range(n_pulls):
        for domain_id in domain_ids:
            print(f"sending collector task for {domain_id}")
            send_task(topic=collector_topic, domain_id=domain_id)
        # NOTE(review): pause once per round, after all domains have been
        # dispatched - confirm this matches the intended nesting.
        time.sleep(sleep_between_pulls)