示例#1
0
文件: tests.py 项目: xecgr/kafka
    def test_01_insert_init_data(self):
        """Save every sample domain, then check each one can be read back."""
        pending = dict(domains)

        # Store one Domain record per sample entry.
        for domain_name, domain_url in pending.items():
            Domain(domain=domain_name, url=domain_url).save()

        # Tick off every stored domain; whatever is left was never read back.
        for stored in Domain.get_by_filters():
            if stored.domain in pending:
                del pending[stored.domain]

        self.assertEqual(pending, {})
示例#2
0
    def run(self):
        """
        Thread entry point: load the records of the buffered file into the DB.

        ``self.buf_file`` is split into lines and every non-empty line is
        parsed as one JSON document. A document is stored only when all three
        of these lookups yield a result:

        * a creative size (``creative_size``, or built as ``<width>x<height>``
          from ``ad_width`` / ``ad_height`` when that key is missing),
        * a ``Referer`` value,
        * a ``page_url`` value.

        Only the first match of each lookup is used. ``Domain``, ``Referer``
        and ``Information`` entries are added only when an equal entry is not
        already present. A line that is not valid JSON makes ``json.loads``
        raise, and the error is not caught here.
        """

        # It might be good to create a process rather than a thread per file, and to create a multithreading environment
        # to process the list rather than a single thread for all of it (test under big file environment).

        # One json per line
        for line in self.buf_file.splitlines():
            # Ignore empty entries in the list
            if not line:
                continue

            json_line = json.loads(line)

            # Prefer an explicit creative_size; otherwise derive it from ad_width and ad_height.
            # NOTE(review): find_key results are indexed with [0] below, so it presumably
            # returns a (possibly empty) sequence of matches - confirm against its definition.
            creative_size = find_key("creative_size", json_line)
            if not creative_size:
                value_width = find_key("ad_width", json_line)
                value_height = find_key("ad_height", json_line)
                if value_width and value_height:
                    # %-formatting instead of "+": JSON numbers (e.g. 300) would make
                    # string concatenation raise TypeError.
                    creative_size = ["%sx%s" % (value_width[0], value_height[0])]

            referer = find_key("Referer", json_line)
            url = find_key("page_url", json_line)

            # All three elements are required before touching the DB.
            if not (creative_size and referer and url):
                continue

            with transaction() as session:
                added = False
                # Check for an existing entry before adding a duplicate one
                if not session.query(Domain).filter(Domain.url==url[0]).first():
                    session.add(Domain(url[0]))
                    added = True
                if not session.query(Referer).filter(Referer.url==referer[0]).first():
                    session.add(Referer(referer[0]))
                    added = True

                session.flush()

                # When a Domain or Referer was just added the existence query is skipped
                # (short-circuit); otherwise the Information entry is added only if no
                # identical one is found.
                if added or not session.query(Information).filter(Information.domain_url==url[0])\
                        .filter(Information.referer_url==referer[0])\
                        .filter(Information.creative_size==creative_size[0]).first():
                    session.add(Information(domain_url=url[0], referer_url=referer[0], creative_size=creative_size[0]))

        # Single-argument parenthesised form prints the same text on Python 2 and Python 3.
        print("Database updated with information from file: %s" % self.f_name)
示例#3
0
文件: tests.py 项目: xecgr/kafka
 def test_03_regexp(self):
     """Run the regexp task on a stored snapshot and check the captured value."""
     # A throwaway domain that owns both the snapshot and the check.
     domain = Domain(domain='dummy', url='dummy')
     domain.save()

     # Snapshot whose HTML contains text the check's pattern matches.
     snapshot = Snapshot(
         domain_id=domain.id,
         pulled_at=datetime.utcnow(),
         html="find me with a regexp!!",
     )
     snapshot.save()

     # Check with a single capture group.
     check = DomainCheck(
         domain_id=domain.id,
         name="dummy_check",
         regexp="find (me|you)",
     )
     check.save()

     run_regexp(snapshot_id=snapshot.id)

     # The first stored result for this check must hold the captured group as JSON.
     results = SnapshotCheckData.get_by_filters(check_id=check.id)
     captured = json.loads(results[0].check_value)
     self.assertEqual(captured, ["me"])
示例#4
0
if __name__ == "__main__":
    # Sample sites to pull, keyed by a short name.
    domains = {
        "helsinkitimes": "https://www.helsinkitimes.fi/",
        "berlin": "https://www.berlin.de/en/news/",
        "9news": "https://www.9news.com.au/sydney",
        "fail!": "fail://fail.com",
    }
    # Regular expressions to register per domain name; domains not listed get no checks.
    domain_checks = {
        "helsinkitimes": ["covid(\\d+) ", "govern(\\w+)"],
    }

    # Start from empty tables.
    delete_tables()
    create_tables()

    # Store every domain together with its checks, remembering the ids for the pulls below.
    domain_ids = []
    for name, url in domains.items():
        d = Domain(domain=name, url=url)
        d.save()
        domain_ids.append(d.id)

        patterns = domain_checks.get(name, [])
        for idx, regexp in enumerate(patterns):
            check = DomainCheck(domain_id=d.id, name=f"{name}-{idx}", regexp=regexp)
            check.save()

    # Send one collector task per domain on every pull, sleeping after each pull.
    for x in range(n_pulls):
        for d_id in domain_ids:
            print(f"sending collector task for {d_id}")
            send_task(topic=collector_topic, domain_id=d_id)
        time.sleep(sleep_between_pulls)