Example #1
0
 async def parse_site(self, session, site: SiteData, resp: SiteResponse):
     """Run the parser registered for ``site.key`` on *resp*.

     In test mode the parsed result is shown instead of persisted.
     Parse failures are logged and swallowed so that one bad page
     does not abort the rest of the crawl.
     """
     parser = self._parsers.get(site.key)
     if parser is None:
         # No parser registered for this site — nothing to do.
         return
     try:
         parsed = parser(resp)
         if self._test_model:
             await self.show_result(session, site, parsed, resp=resp)
         else:
             await self.save_parse_result(session, site, parsed)
     except Exception as e:
         Logger.error('[get] Parse error, message: %s' % str(e))
Example #2
0
async def main():
    """Entry point: load IP lists from a URL, a named file, or the cwd.

    With a URL argument the list is fetched from that URL.  With a
    file-name argument only that file is processed.  With no argument
    every ``*.ip.txt`` file in the current directory is processed.
    """
    argv = sys.argv[1] if len(sys.argv) > 1 else None
    # A URL argument short-circuits the local-file discovery below.
    if argv and '://' in argv:
        return await load_from_url(argv)
    # endswith() only matches real '.ip.txt' files; the previous
    # find() > 0 test also accepted names that merely *contain* the
    # suffix, e.g. 'x.ip.txt.bak'.
    ip_file_lists = [name for name in os.listdir('.')
                     if name.endswith('.ip.txt')]
    if argv:
        if argv not in ip_file_lists:
            Logger.error("file %s doesn't exist" % argv)
            return
        ip_file_lists = [argv]
    for fn in ip_file_lists:
        await load_file(fn)
Example #3
0
 async def crawl_single_page(self, session, site, request: SiteRequestData):
     """Fetch the page described by *request* and hand the body to the parser.

     A random proxy is attached when the request asks for one.  Any
     failure (network error, empty body, parse-stage error) is logged
     and re-raised as ``RetryException`` so the caller can reschedule.
     """
     proxy = None
     if request.use_proxy is True:
         # Ask for an HTTPS-capable proxy when the target URL is HTTPS.
         candidate = await IPFactory.get_random_ip(
             request.url.startswith('https'))
         if candidate:
             proxy = candidate.to_http()
     try:
         async with session.get(request.url, proxy=proxy) as resp:
             body = await resp.text()
             if not body:
                 raise EmptyResponseException('empty text')
             page = SiteResponse(body, url=request.url, site=site)
         await self.parse_site(session, site, page)
     except Exception as e:
         Logger.error('[get] Get page %s error, message: %s' %
                      (request.url, str(e)))
         raise RetryException() from e
Example #4
0
 async def handle_task_exception(self, e):
     """Log a failed task's exception and back off briefly."""
     Logger.error('[error] ' + str(e))
     # Pause so a tight failure loop does not spin at full speed.
     await asyncio.sleep(5)