Пример #1
0
 def test_02_collector(self):
     """Check that each truthy collector run accounts for one new snapshot.

     Records the snapshot count, runs the collector once for every domain
     returned by ``Domain.get_by_filters()``, and expects the final snapshot
     count to equal the initial count plus the number of runs that returned
     a truthy value.
     """
     baseline = Snapshot.count_by_filters()
     succeeded = sum(
         1 for entry in Domain.get_by_filters()
         if run_collector(domain_id=entry.id)
     )
     self.assertEqual(baseline + succeeded, Snapshot.count_by_filters())
Пример #2
0
    def test_01_insert_init_data(self):
        """Insert the sample domains and verify each one can be read back.

        Every ``name -> url`` pair taken from ``domains`` is saved as a
        ``Domain``; the test passes when no sample name is absent from the
        rows returned by ``Domain.get_by_filters()``.
        """
        expected = dict(domains)
        for name, url in expected.items():
            Domain(domain=name, url=url).save()

        stored_names = {row.domain for row in Domain.get_by_filters()}
        # Whatever is left here was inserted but could not be read back.
        missing = {
            name: url
            for name, url in expected.items()
            if name not in stored_names
        }
        self.assertEqual(missing, {})
Пример #3
0
def run(*args, domain_id='', **kwargs):
    """Fetch a domain's HTML with an HTTP GET and store it as a snapshot.

    Keyword arguments:
    domain_id  -- id of the domain whose URL should be retrieved

    Extra positional and keyword arguments are accepted and ignored.

    Returns the id of the saved Snapshot, or None when no domain matches
    ``domain_id``, the request raises, or the response object is falsy.
    On success a Snapshot and a SnapshotMetadata row are saved and the
    domain's ``last_snapshot_at`` is updated.
    """
    logging.info("start collector run")
    matches = Domain.get_by_filters(id=domain_id)
    if not matches:
        return None
    domain = matches[0]

    headers = {
        #custom headers
    }
    proxies = {
        #custom proxies
    }
    params = {
        #custom query string
    }

    try:
        res = requests.get(
            domain.url,
            headers=headers,
            proxies=proxies,
            # query-string values go in ``params``; ``data`` would send them
            # as the request body instead
            params=params,
            # avoid hangs! -> https://requests.readthedocs.io/en/master/user/advanced/#timeouts
            timeout=10,
        )
    except Exception:
        # bad things can happen with users' input and i/o operations;
        # ``Exception`` (not a bare except) keeps the run best-effort while
        # letting KeyboardInterrupt/SystemExit propagate.
        # here a good error callback function will be super!
        logging.exception("Error fething %s", domain.url)
        return None

    # NOTE(review): a requests Response is truthy only when ``res.ok``
    # (status below 400), so error responses are not stored -- confirm that
    # this is the intended behaviour.
    if not res:
        return None

    # One timestamp for both records so they describe the same fetch.
    pulled_at = datetime.utcnow()
    s = Snapshot(domain_id=domain_id,
                 pulled_at=pulled_at,
                 html=res.text)
    s.save()

    domain.last_snapshot_at = pulled_at
    domain.save()

    snapshot_id = s.id
    SnapshotMetadata(
        snapshot_id=snapshot_id,
        domain_id=domain_id,
        request_time=res.elapsed.total_seconds(),
        request_status=res.status_code,
    ).save()
    return snapshot_id