def test_01_insert_init_data(self):
    """sample domain insertion test"""
    _domains = dict(domains)
    for name, url in _domains.items():
        d = Domain(domain=name, url=url)
        d.save()
    for d in Domain.get_by_filters():
        _domains.pop(d.domain, None)
    self.assertEqual(_domains, {})
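
# The tests in this section assume a thin active-record layer on top of
# SQLAlchemy: save() persists an instance and get_by_filters() queries by
# keyword arguments. The real models live elsewhere; this is a minimal sketch
# of that convention, with the in-memory engine and column layout as
# assumptions for illustration only.
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

engine = create_engine("sqlite://")  # assumed in-memory DB for the sketch
Session = sessionmaker(bind=engine, expire_on_commit=False)
Base = declarative_base()

class Domain(Base):
    __tablename__ = "domain"
    id = Column(Integer, primary_key=True)
    domain = Column(String, unique=True)
    url = Column(String)

    def save(self):
        # Persist this instance in a short-lived session.
        with Session() as session:
            session.add(self)
            session.commit()

    @classmethod
    def get_by_filters(cls, **filters):
        # Return every row matching the given column=value filters.
        with Session() as session:
            return session.query(cls).filter_by(**filters).all()

Base.metadata.create_all(engine)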
def run(self):
    """Main function for the thread: parse the buffered file and store its data in the DB."""
    # It might be better to create a process rather than a thread per file, and to
    # use a pool to process the list rather than a single thread for all of it
    # (test under a big-file environment).
    # One JSON document per line.
    my_list = self.buf_file.splitlines()
    for line in my_list:
        # Skip empty entries in the list.
        if line:
            json_line = json.loads(line)
            # Look for creative_size; if it does not exist, build it from
            # ad_width and ad_height instead.
            creative_size = find_key("creative_size", json_line)
            if not creative_size:
                value_width = find_key("ad_width", json_line)
                value_height = find_key("ad_height", json_line)
                if value_width and value_height:
                    creative_size = [value_width[0] + "x" + value_height[0]]
            # Look for the keys page_url and Referer.
            referer = find_key("Referer", json_line)
            url = find_key("page_url", json_line)
            # If all three elements were found, insert them into the DB.
            if creative_size and referer and url:
                with transaction() as session:
                    added = False
                    # Check for an existing row before inserting a duplicate.
                    if not session.query(Domain).filter(Domain.url == url[0]).first():
                        session.add(Domain(url[0]))
                        added = True
                    if not session.query(Referer).filter(Referer.url == referer[0]).first():
                        session.add(Referer(referer[0]))
                        added = True
                    session.flush()
                    # If one of the previous tables got a new row, the Information
                    # row is necessarily new too, so no existence check is needed.
                    if added:
                        session.add(Information(domain_url=url[0],
                                                referer_url=referer[0],
                                                creative_size=creative_size[0]))
                    elif not session.query(Information)\
                            .filter(Information.domain_url == url[0])\
                            .filter(Information.referer_url == referer[0])\
                            .filter(Information.creative_size == creative_size[0]).first():
                        session.add(Information(domain_url=url[0],
                                                referer_url=referer[0],
                                                creative_size=creative_size[0]))
    print("Database updated with information from file: %s" % self.f_name)
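
# run() relies on a find_key() helper that is not defined in this section.
# Based on how it is used (truthiness check, then indexing with [0]), it
# plausibly walks the parsed JSON recursively and returns a list of every
# value stored under the given key. A minimal sketch under that assumption:
def find_key(key, obj):
    """Recursively collect all values stored under `key` in nested dicts/lists."""
    found = []
    if isinstance(obj, dict):
        for k, v in obj.items():
            if k == key:
                found.append(v)
            found.extend(find_key(key, v))
    elif isinstance(obj, list):
        for item in obj:
            found.extend(find_key(key, item))
    return found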
def test_03_regexp(self):
    """regexp process test"""
    d = Domain(domain='dummy', url='dummy')
    d.save()
    s = Snapshot(
        domain_id=d.id,
        pulled_at=datetime.utcnow(),
        html="find me with a regexp!!",
    )
    s.save()
    dc = DomainCheck(domain_id=d.id, name="dummy_check", regexp="find (me|you)")
    dc.save()
    run_regexp(snapshot_id=s.id)
    scd = SnapshotCheckData.get_by_filters(check_id=dc.id)[0]
    self.assertEqual(json.loads(scd.check_value), ["me"])
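
# run_regexp() is exercised above but not defined in this section. The
# assertion implies it loads the snapshot, runs every DomainCheck regexp
# registered for that domain against the snapshot HTML, and stores the
# re.findall() result as JSON in SnapshotCheckData.check_value. A sketch under
# those assumptions (the snapshot_id column on SnapshotCheckData is assumed):
import json
import re

def run_regexp(snapshot_id):
    snapshot = Snapshot.get_by_filters(id=snapshot_id)[0]
    for check in DomainCheck.get_by_filters(domain_id=snapshot.domain_id):
        # re.findall() with one capture group yields the matched groups,
        # e.g. "find (me|you)" against the test HTML yields ["me"].
        matches = re.findall(check.regexp, snapshot.html)
        SnapshotCheckData(
            snapshot_id=snapshot.id,
            check_id=check.id,
            check_value=json.dumps(matches),
        ).save()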
if __name__ == "__main__":
    domains = {
        "helsinkitimes": "https://www.helsinkitimes.fi/",
        "berlin": "https://www.berlin.de/en/news/",
        "9news": "https://www.9news.com.au/sydney",
        "fail!": "fail://fail.com",
    }
    domain_checks = {
        "helsinkitimes": ["covid(\\d+) ", "govern(\\w+)"],
    }
    delete_tables()
    create_tables()
    domain_ids = []
    for name, url in domains.items():
        d = Domain(domain=name, url=url)
        d.save()
        domain_ids.append(d.id)
        for idx, regexp in enumerate(domain_checks.get(name, [])):
            DomainCheck(
                domain_id=d.id,
                name=f"{name}-{idx}",
                regexp=regexp,
            ).save()
    for x in range(n_pulls):
        for d_id in domain_ids:
            print(f"sending collector task for {d_id}")
            send_task(topic=collector_topic, domain_id=d_id)
        time.sleep(sleep_between_pulls)
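
# send_task() and the config values used above (collector_topic, n_pulls,
# sleep_between_pulls) are defined elsewhere in the project. As an assumption,
# send_task() serializes the task and publishes it on a topic; a broker-
# agnostic sketch using an in-process queue per topic as a stand-in for the
# real message broker:
import json
import queue
from collections import defaultdict

# Hypothetical example config values, e.g.:
#   collector_topic = "collector"
#   n_pulls = 3
#   sleep_between_pulls = 60
_topics = defaultdict(queue.Queue)

def send_task(topic, **payload):
    """Serialize the task to JSON and enqueue it on the given topic."""
    _topics[topic].put(json.dumps(payload))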