async def yield_proxy(self, *args, **kwargs):
    # https://www.nyloner.cn/proxy
    ev_loop = kwargs.get('ev_loop')
    async with webutils.WebSpider(ev_loop) as spider:
        spider.header.update({'Host': 'www.nyloner.cn',
                              'Referer': 'https://www.nyloner.cn/proxy'})
        proxies = []
        num = 15
        # Fetch the landing page first; bail out if the site is unreachable.
        status, _ = await spider.get('https://www.nyloner.cn/proxy')
        if status != 200:
            mylog.error('%s failed to access the site', __name__)
            return proxies
        for page in range(1, 50):
            t = int(datetime.datetime.now().timestamp())
            status, resp_html = await spider.get(
                'https://www.nyloner.cn/proxy',
                params={'page': page, 'num': num, 't': t,
                        'token': self.gen_token(page, num, t)})
            if status != 200:
                continue
            try:
                # json.loads() no longer accepts an `encoding` argument
                # (removed in Python 3.9); resp_html is already a str here.
                js_result = json.loads(resp_html)
                if js_result['status'].lower() == 'true':
                    for pd in json.loads(self.decode_str(js_result['list'])):
                        proxies.append(models.ProxyTbl(host=pd['ip'],
                                                       port=int(pd['port']),
                                                       scheme='http',
                                                       country='未知'))  # "unknown"
            except json.JSONDecodeError as er:
                mylog.warning('%s failed to parse response <%s>: %s',
                              __name__, resp_html, er)
                return proxies
        return proxies
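# A minimal, self-contained sketch of the payload handling above, assuming a
# hypothetical response shape and an identity decode_str; the site's real
# encoding of the "list" field and the gen_token algorithm are not shown here.
import json

def _parse_nyloner_payload(resp_text, decode_str=lambda s: s):
    """Return (ip, port) pairs from a nyloner-style JSON envelope."""
    js_result = json.loads(resp_text)
    if js_result['status'].lower() != 'true':
        return []
    return [(pd['ip'], int(pd['port']))
            for pd in json.loads(decode_str(js_result['list']))]

# Hypothetical, already-decoded "list" field:
# _parse_nyloner_payload('{"status": "True", "list": "[{\\"ip\\": \\"1.2.3.4\\", \\"port\\": \\"8080\\"}]"}')
# -> [('1.2.3.4', 8080)]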
async def yield_proxy(self, *args, **kwargs):
    ev_loop = kwargs.get('ev_loop')
    async with webutils.WebSpider(ev_loop) as spider:
        spider.header.update({'Host': 'www.kewangst.com',
                              'Referer': 'https://www.kewangst.com/ProxyList'})
        proxies = []
        status, resp_text = await spider.get('https://www.kewangst.com/ProxyList')
        if status != 200:
            mylog.error('%s failed to access the site', __name__)
            return proxies
        # Each useful line is expected to be a full proxy URL such as
        # "http://1.2.3.4:8080"; everything else is skipped.
        with io.StringIO(resp_text) as fp:
            for line in fp:
                line = line.strip()
                if not line.startswith('http'):
                    continue
                try:
                    parse_result = urllib.parse.urlparse(line)
                    proxies.append(models.ProxyTbl(host=parse_result.hostname,
                                                   port=parse_result.port,
                                                   scheme=parse_result.scheme,
                                                   country='未知'))  # "unknown"
                except ValueError as e:
                    mylog.warning(e)
        return proxies
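# A small usage sketch of the urlparse-based extraction above: a line such as
# "http://1.2.3.4:8080" splits cleanly into the scheme/host/port fields that
# ProxyTbl expects (the example values are made up).
from urllib.parse import urlparse

result = urlparse('http://1.2.3.4:8080')
assert (result.scheme, result.hostname, result.port) == ('http', '1.2.3.4', 8080)
# A malformed port raises ValueError when .port is accessed, which is why the
# loop above wraps the conversion in try/except:
# urlparse('http://1.2.3.4:abc').port  -> ValueError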
async def yield_proxy(self, *args, **kwargs):
    ev_loop = kwargs.get('ev_loop')
    async with webutils.WebSpider(ev_loop) as spider:
        spider.header.update({'Host': 'www.66ip.cn'})
        area = 33
        page = 1
        proxies = []
        for area_index in range(1, area + 1):
            # The original sleep was never awaited, so it never paused; await it
            # (the `loop` argument was removed in Python 3.10).
            await asyncio.sleep(1)
            for i in range(1, page + 1):
                url = "http://www.66ip.cn/areaindex_{}/{}.html".format(area_index, i)
                status, resp_html = await spider.get(url)
                if status != 200:
                    continue
                html_tree = etree.HTML(resp_html)
                tr_list = html_tree.xpath(
                    "//*[@id='footer']/div/table/tr[position()>1]")
                if len(tr_list) == 0:
                    continue
                for tr in tr_list:
                    proxies.append(models.ProxyTbl(host=tr.xpath("./td[1]/text()")[0],
                                                   port=int(tr.xpath("./td[2]/text()")[0]),
                                                   country='中国',  # "China"
                                                   area=tr.xpath("./td[3]/text()")[0],
                                                   scheme='http'))
        return proxies
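# A minimal, self-contained illustration of the lxml extraction used above, run
# against a tiny inline snippet instead of the live 66ip.cn page; the markup
# below only mimics the expected table layout.
from lxml import etree

_SAMPLE = """
<div id="footer"><div><table>
  <tr><th>ip</th><th>port</th><th>area</th></tr>
  <tr><td>1.2.3.4</td><td>8080</td><td>Example Area</td></tr>
</table></div></div>
"""
tree = etree.HTML(_SAMPLE)
rows = tree.xpath("//*[@id='footer']/div/table/tr[position()>1]")  # skip header row
for row in rows:
    host = row.xpath("./td[1]/text()")[0]       # '1.2.3.4'
    port = int(row.xpath("./td[2]/text()")[0])  # 8080
    area = row.xpath("./td[3]/text()")[0]       # 'Example Area'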
async def yield_proxy(self, *args, **kwargs):
    ev_loop = kwargs.get('ev_loop')
    async with webutils.WebSpider(ev_loop) as spider:
        spider.header.update({'Host': 'www.xicidaili.com'})
        proxies = []
        url_list = [
            'http://www.xicidaili.com/nn/',  # high anonymity
            'http://www.xicidaili.com/nt/',  # transparent
            'http://www.xicidaili.com/wn/',  # domestic HTTPS
            'http://www.xicidaili.com/wt/',  # domestic plain HTTP
        ]
        page = 2
        for url in url_list:
            for i in range(1, page + 1):
                # Await the sleep (the original call was never awaited) and build
                # the page URL in a new variable so the base URL is not rebound
                # between iterations (the original appended page numbers onto
                # the previous request URL).
                await asyncio.sleep(1)
                page_url = url + str(i)
                status, resp_html = await spider.get(page_url)
                if status != 200:
                    continue
                html_tree = etree.HTML(resp_html)
                ip_list = html_tree.xpath(
                    '//table[@id="ip_list"]//tr[position()>1]')
                for tr in ip_list:
                    tds = tr.xpath("td")
                    if len(tds) < 6:  # tds[5] (scheme column) is read below
                        continue
                    location = tds[3].xpath('a')
                    if len(location) >= 1:
                        location = location[0].text
                    else:
                        location = tds[3].text
                    proxies.append(models.ProxyTbl(host=str(tds[1].text),
                                                   port=int(tds[2].text),
                                                   country='中国',  # "China"
                                                   area=str(location),
                                                   scheme=str(tds[5].text).lower()))
        return proxies
async def consume(self, req: web.Request, **kwargs) -> web.Response:
    self._user_arg = req.query
    while True:
        try:
            all_proxy = await ProxyTblManager.get_proxy(req.app['db'])
        except queue.Empty:
            # No usable proxy right now; ask the caller to retry later.
            return web.Response(text='目前没有可用代理,请稍候再试', charset='utf-8')
        proxy = random.choice(all_proxy)
        async with webutils.WebSpider(ev_loop=None, proxy=proxy) as client:
            try:
                stock = random.choice(await self.get_stock(client))
                # Raw string so \d is not treated as an invalid escape sequence.
                stock = re.search(r'c:"(\d+)"', stock).group(1)
                spldid = await self.open_page(client)
                await self.page_loading(client, spldid)
                msg = await self.set_pageoperinfo(client, spldid, stock)
                await self.page_close(client, spldid)
            except ConsumerError as err:
                return web.Response(text=err.expression)
            except AttributeError as err:
                # re.search() returned None: the stock payload did not match.
                raise web.HTTPInternalServerError() from err
            except (ClientError, asyncio.TimeoutError):
                pass  # this proxy timed out or failed; pick another and retry
            else:
                return web.Response(text=msg, charset='utf-8')
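# A hedged illustration of the stock-code extraction above; the payload string
# is invented, and only its 'c:"<digits>"' fragment mirrors what re.search()
# looks for in the real response.
import re

sample = 'var item = {n:"demo", c:"600000", p:"10.00"};'  # hypothetical payload
match = re.search(r'c:"(\d+)"', sample)
code = match.group(1) if match else None  # '600000'
# In consume() above there is no None check: a non-matching payload raises
# AttributeError on .group(1), which the AttributeError handler converts into
# an HTTP 500 response.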