Example #1
def config():
    site = SiteData()
    site.name = '快代理'
    site.enabled = True
    site.pages = [
        'https://www.kuaidaili.com/free/{}/{}'.format(i, ii)
        for i in ['inha', 'intr'] for ii in range(1, 10)
    ]
    return site
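All of these config() snippets build and return a SiteData object whose definition is not shown in this listing. A minimal sketch of the container they appear to assume, with field names taken from the examples themselves; the types and defaults here are guesses, not the project's actual definition:

from dataclasses import dataclass, field

@dataclass
class SiteData:
    # Field names inferred from the config() examples; defaults are assumptions.
    name: str = ''                                # display name of the proxy source
    enabled: bool = False                         # whether the crawler visits this site
    use_proxy: bool = False                       # fetch this source through a proxy itself
    pages: list = field(default_factory=list)     # URLs to crawl
    headers: dict = field(default_factory=dict)   # extra request headers (see Example #8)
    base_url: str = ''                            # entry URL for paginated sites
    page_limit: int = 0                           # max pages for paginated sites
    current_page: int = 1                         # pagination cursor
    page_interval: float = 0                      # seconds to sleep between pages

Example #8 below also calls site.to_request(page), so the real class presumably carries at least one helper method beyond these fields.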
Example #2
def config():
    site = SiteData()
    site.name = '免费代理IP库'
    site.pages = [
        'http://ip.jiangxianli.com/?page=%d' % i for i in range(1, 5)
    ]
    return site
Example #3
def config():
    site = SiteData()
    site.name = '齐乐分享'
    site.pages = [
        'https://bbs.76fx.com/ip/pt.php?sxb=&tqsl=1000&port=&export=&ktip=&sxa=&Api=2'
    ]
    return site
Example #4
def config():
    site = SiteData()
    site.name = 'Github proxy list'
    site.pages = [
        'https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt'
    ]
    return site
Example #5
def config():
    site = SiteData()
    site.name = '云代理 ip3366'
    site.enabled = True
    site.pages = [
        'http://www.ip3366.net/free/?stype=%s&page=%s' % (i, ii)
        for i in range(1, 3) for ii in range(1, 5)
    ]
    return site
Example #6
def config():
    site = SiteData()
    site.name = '小幻HTTP代理'
    site.use_proxy = True
    site.pages = ['https://ip.ihuan.me/']
    site.base_url = 'https://ip.ihuan.me/'
    site.page_limit = 20
    site.current_page = 1
    return site
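Unlike the other configs, this one lists only the entry URL; the base_url / page_limit / current_page fields suggest the crawler follows pagination at runtime. A hedged sketch of what such a cursor-advancing helper could look like; the next_page name and the ?page= query format are assumptions, not taken from the source:

def next_page(site: SiteData):
    # Hypothetical helper: advance the pagination cursor until page_limit is hit.
    if site.current_page >= site.page_limit:
        return None
    site.current_page += 1
    return '%s?page=%d' % (site.base_url, site.current_page)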
Example #7
def config():
    site = SiteData()
    site.name = '西刺代理'
    site.enabled = True
    site.use_proxy = True
    site.pages = [
        'http://www.xicidaili.com/{}/{}'.format(i, ii)
        for i in ['nn', 'nt', 'wn', 'wt'] for ii in range(1, 5)
    ]
    return site
Example #8
import asyncio

import aiohttp

# Config, Logger, MaxRetryException and SiteData are project-level helpers
# assumed to be in scope; this coroutine is a method of the crawler class,
# shown here out of its class context.
async def crawl_site(self, site: SiteData, page_limit: int = 0):
    # Merge a rotating User-Agent with any site-specific headers.
    headers = {'User-Agent': self.get_user_agent()}
    headers.update(site.headers)
    async with aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=Config.DEFAULT_REQUEST_TIME_OUT),
            headers=headers) as session:
        # page_limit == 0 means "crawl every configured page".
        pages = site.pages if page_limit == 0 else site.pages[0:page_limit]
        for page in pages:
            try:
                await self.crawl_single_page(session, site,
                                             site.to_request(page))
            except MaxRetryException as e:
                Logger.warn('[get] Max retry skip, message: %s' % str(e))
                continue
            finally:
                # Throttle between pages whether the crawl succeeded or not.
                if site.page_interval:
                    await asyncio.sleep(site.page_interval)
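For context, a sketch of how crawl_site might be driven from the configs above; the Crawler class, the per-site module layout, and the asyncio.gather fan-out are assumptions rather than code from the source:

import asyncio

from crawler import Crawler              # hypothetical home of crawl_site
from sites import kuaidaili, xicidaili   # hypothetical per-site config modules

async def main():
    crawler = Crawler()
    sites = [m.config() for m in (kuaidaili, xicidaili)]
    # Crawl the enabled sites concurrently, capping each at five pages.
    await asyncio.gather(*(crawler.crawl_site(site, page_limit=5)
                           for site in sites if site.enabled))

asyncio.run(main())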
Example #9
def config():
    site = SiteData()
    site.name = 'Spys.me'
    site.pages = ['http://spys.me/proxy.txt']
    return site
Example #10
def config():
    site = SiteData()
    site.name = 'Proxy daily'
    site.pages = ['https://proxy-daily.com/']
    return site
Example #11
def config():
    site = SiteData()
    site.name = '全网代理IP'
    site.pages = ['http://www.goubanjia.com/']
    return site