def crawl_data5u(self):
    start_url = 'http://www.data5u.com/free/gngn/index.shtml'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
        'Host': 'www.data5u.com',
        'Referer': 'http://www.data5u.com/free/index.shtml',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
    }
    html = get_page(start_url, options=headers)
    if html:
        # IP and port sit in adjacent <li> elements; use a raw string so
        # backslash escapes reach the regex engine unchanged
        ip_address = re.compile(
            r'<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class="port.*?>(\d+)</li>', re.S)
        re_ip_address = ip_address.findall(html)
        for address, port in re_ip_address:
            result = address + ':' + port
            yield result.replace(' ', '')
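# Every crawl_* generator in this collection assumes a module-level `import re`
# (or lxml / pyquery where used) plus a shared get_page helper that fetches a
# URL and returns its HTML text, or None on failure. A minimal sketch of that
# helper, assuming the requests library (implementation hypothetical):
import requests

def get_page(url, options=None):
    # merge per-site headers passed via `options` into the request
    headers = options or {}
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        if resp.status_code == 200:
            return resp.text
    except requests.RequestException:
        pass
    return None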
async def decorator(handle, *args, **kwargs):
    # `func` is the wrapped handler, captured from the enclosing
    # decorator factory (see the sketch below)
    request = handle.request
    query = dict(request.query)
    page, pagesize = get_page(query)
    request.pagination = {
        'page': page,
        'pagesize': pagesize,
    }
    return await func(handle, *args, **kwargs)
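# `func` above is a free variable, so this fragment only runs as the inner
# function of a decorator factory. A minimal sketch of the assumed enclosing
# factory (the name `paginate` is hypothetical; the body is the fragment above):
def paginate(func):
    async def decorator(handle, *args, **kwargs):
        ...  # pagination body as in the fragment above
        return await func(handle, *args, **kwargs)
    return decorator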
def crawl_ip3366(self):
    for page in range(1, 4):
        start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
        html = get_page(start_url)
        if html:
            # \s* absorbs the whitespace (including newlines) between tags
            ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
            re_ip_address = ip_address.findall(html)
            for address, port in re_ip_address:
                result = address + ':' + port
                yield result.replace(' ', '')
def crawl_kuaidaili(self):
    for i in range(1, 4):
        start_url = 'http://www.kuaidaili.com/free/inha/{}/'.format(i)
        html = get_page(start_url)
        if html:
            ip_address = re.compile('<td data-title="IP">(.*?)</td>')
            re_ip_address = ip_address.findall(html)
            port_pattern = re.compile('<td data-title="PORT">(.*?)</td>')
            re_port = port_pattern.findall(html)
            for address, port in zip(re_ip_address, re_port):
                address_port = address + ':' + port
                yield address_port.replace(' ', '')
def home(request):
    '''Home page'''
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        page = 1
    return render(request, 'index.html', {
        'index': True,
        'settings': settings,
        'keywords': settings.SITE_KEYWORD,
        'posts': utils.get_page(Post.objects.all(), page),
    })
async def get(self, *args, **kwargs):
    '''
    count: total number of matching records
    page: total number of pages
    rest: number of records on the last page
    current_page: current page number
    current_base_url: base URL of this API
    '''
    res = {}
    MAX_PER_PAGE = settings['MAX_PER_PAGE']
    PARAM = settings['PAGINATE_PARAM']
    current_page = self.get_argument(PARAM, default="1")
    try:
        current_page = int(current_page)
    except ValueError:
        self.set_status(404)
        res['detail'] = 'invalid parameter'
        self.finish(res)
        return
    query = Category.select().order_by(Category.add_time.desc())
    count = await self.application.objects.count(query)
    page = utils.get_page(count)
    previous, next, err = utils.get_next_pre_page('/categories/', current_page, page)
    # the requested page is invalid or out of range
    if err:
        self.set_status(404)
        res['detail'] = err
        self.finish(res)
        return
    # query the database with pagination applied
    query = Category.select().order_by(Category.add_time.desc()).paginate(current_page, MAX_PER_PAGE)
    categories = await self.application.objects.execute(query)
    results = []
    for category in categories:
        results.append(model_to_dict(category))
    res = {
        "count": str(count),
        "next": next,
        "previous": previous,
        "results": results
    }
    self.finish(json.dumps(res, default=utils.json_serial))
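# Both paginated handlers lean on two utils helpers whose implementations are
# not shown; note this utils.get_page is unrelated to the HTML-fetching
# get_page used by the crawlers. A hedged sketch inferred from the call sites
# (signatures from the snippets, bodies hypothetical):
import math

def get_page(count, per_page=10):
    # total number of pages needed for `count` records; per_page would
    # come from settings['MAX_PER_PAGE'] in the real project
    return math.ceil(count / per_page) if count else 1

def get_next_pre_page(base_url, current_page, page):
    # returns (previous_url, next_url, err); err carries a message when
    # current_page falls outside [1, page]
    if current_page < 1 or current_page > page:
        return None, None, 'page out of range'
    previous = '%s?page=%d' % (base_url, current_page - 1) if current_page > 1 else None
    next = '%s?page=%d' % (base_url, current_page + 1) if current_page < page else None
    return previous, next, None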
def crawl_ip3366(self):
    for i in range(1, 4):
        start_url = 'http://www.ip3366.net/?stype=1&page={}'.format(i)
        html = get_page(start_url)
        if html:
            find_tr = re.compile('<tr>(.*?)</tr>', re.S)
            trs = find_tr.findall(html)
            # trs[0] is the table header row, so start from 1
            for s in range(1, len(trs)):
                find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
                re_ip_address = find_ip.findall(trs[s])
                find_port = re.compile(r'<td>(\d+)</td>')
                re_port = find_port.findall(trs[s])
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')
def crawl_iphai(self):
    start_url = 'http://www.iphai.com/'
    html = get_page(start_url)
    if html:
        find_tr = re.compile('<tr>(.*?)</tr>', re.S)
        trs = find_tr.findall(html)
        # skip the header row at trs[0]
        for s in range(1, len(trs)):
            find_ip = re.compile(r'<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
            re_ip_address = find_ip.findall(trs[s])
            find_port = re.compile(r'<td>\s+(\d+)\s+</td>', re.S)
            re_port = find_port.findall(trs[s])
            for address, port in zip(re_ip_address, re_port):
                address_port = address + ':' + port
                yield address_port.replace(' ', '')
def crawl_iphai(self):
    url = self.cfg.get('iphai', 'url')
    response_text = get_page(url)
    if response_text:
        html = etree.HTML(response_text)
        trs = html.xpath('//table//tr')[1:]  # drop the header row
        for tr in trs:
            ip = tr.xpath('./td[1]/text()')[0].strip().replace(' ', '')
            port = tr.xpath('./td[2]/text()')[0].strip().replace(' ', '')
            scheme = tr.xpath('./td[4]/text()')[0].lower().strip().replace(' ', '')
            # assemble the proxy string
            proxy = '%s://%s:%s' % (scheme, ip, port)
            yield proxy, ip, port, scheme
def crawl_daili66(self, page_count=4):
    """
    Crawl proxies from 66ip.
    :param page_count: number of pages to fetch
    :return: proxies as 'ip:port' strings
    """
    start_url = 'http://www.66ip.cn/{}.html'
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
        print('Crawling', url)
        html = get_page(url)
        if html:
            doc = pq(html)
            trs = doc('.containerbox table tr:gt(0)').items()
            for tr in trs:
                ip = tr.find('td:nth-child(1)').text()
                port = tr.find('td:nth-child(2)').text()
                yield ':'.join([ip, port])
def crawl_ip3366(self):
    types = self.cfg.get('ip-3366', 'types').lstrip('[').rstrip(']').split(',')
    page_num = int(self.cfg.get('ip-3366', 'page_num'))
    url = self.cfg.get('ip-3366', 'url')
    for type in types:
        for page in range(1, page_num):
            proxy_list_url = url.format(type, page)
            response_text = get_page(url=proxy_list_url)
            if response_text:
                html = etree.HTML(response_text)
                trs = html.xpath('//div[@id="list"]/table/tbody/tr')
                for tr in trs:
                    ip = tr.xpath('./td[1]/text()')[0]
                    port = tr.xpath('./td[2]/text()')[0]
                    scheme = tr.xpath('./td[4]/text()')[0].lower()
                    # assemble the proxy string
                    proxy = '%s://%s:%s' % (scheme, ip, port)
                    yield proxy, ip, port, scheme
def crawl_daili66(self, page_count=4):
    url = self.cfg.get('daili-66', 'url')
    page_num = int(self.cfg.get('daili-66', 'page_num'))
    for page in range(1, page_num):
        # build the page URL to crawl
        proxy_list_url = url.format(page)
        # the page is rendered dynamically, so pass selenium=True and let
        # get_page drive a browser instead of fetching raw HTML
        browser = get_page(url=proxy_list_url, selenium=True)
        if browser:
            # tr[0] is the header row, so skip it
            trs = browser.find_elements_by_xpath(
                '//div[@align="center"]/table/tbody/tr')[1:]
            for tr in trs:
                res = tr.text.split(' ')
                ip = res[0]
                port = res[1]
                scheme = 'https'
                # assemble the proxy string
                proxy = '%s://%s:%s' % (scheme, ip, port)
                yield proxy, ip, port, scheme
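# crawl_daili66 above expects get_page(url=..., selenium=True) to hand back a
# live browser rather than HTML text. A hedged sketch of that mode, assuming
# Selenium's Chrome driver (the standalone name get_page_selenium is
# hypothetical; the caller is responsible for quitting the browser):
from selenium import webdriver

def get_page_selenium(url):
    opts = webdriver.ChromeOptions()
    opts.add_argument('--headless')
    browser = webdriver.Chrome(options=opts)
    browser.get(url)
    return browser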
def crawl_xicidaili(self):
    types = self.cfg.get('xici', 'types').lstrip('[').rstrip(']').split(',')
    page_num = int(self.cfg.get('xici', 'page_num'))
    url = self.cfg.get('xici', 'url')
    for type in types:
        for page in range(1, page_num):
            proxy_list_url = url.format(type, page)
            response_text = get_page(proxy_list_url)
            time.sleep(5)  # throttle requests to avoid getting banned
            if response_text:
                html = etree.HTML(response_text)
                trs = html.xpath('//table[@id="ip_list"]//tr')[1:]
                for tr in trs:
                    ip = tr.xpath('./td[2]/text()')[0]
                    port = tr.xpath('./td[3]/text()')[0]
                    scheme = tr.xpath('./td[6]/text()')[0].lower()
                    # assemble the proxy string
                    proxy = '%s://%s:%s' % (scheme, ip, port)
                    yield proxy, ip, port, scheme
def crawl_xiladaili(self):
    url = self.cfg.get('xila', 'url')
    types = self.cfg.get('xila', 'types').lstrip('[').rstrip(']').split(',')
    page_num = int(self.cfg.get('xila', 'page_num'))
    for type in types:
        for page in range(1, page_num):
            proxy_list_url = url.format(type, page)
            response_text = get_page(proxy_list_url)
            if response_text:
                html = etree.HTML(response_text)
                trs = html.xpath('//table/tbody/tr')
                for tr in trs:
                    ip_port = tr.xpath('./td[1]/text()')[0]
                    ip, port = ip_port.split(':')
                    scheme_str = tr.xpath('./td[2]/text()')[0].lower()
                    scheme = 'https' if 'https' in scheme_str else 'http'
                    # assemble the proxy string
                    proxy = '%s://%s' % (scheme, ip_port)
                    yield proxy, ip, port, scheme
def crawl_xicidaili(self):
    for i in range(1, 3):
        start_url = 'http://www.xicidaili.com/nn/{}'.format(i)
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
            'Host': 'www.xicidaili.com',
            'Referer': 'http://www.xicidaili.com/nn/3',
            'Upgrade-Insecure-Requests': '1',
        }
        html = get_page(start_url, options=headers)
        if html:
            find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
            trs = find_trs.findall(html)
            for tr in trs:
                find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
                re_ip_address = find_ip.findall(tr)
                find_port = re.compile(r'<td>(\d+)</td>')
                re_port = find_port.findall(tr)
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')
async def get(self, *args, **kwargs):
    '''
    count: total number of matching records
    page: total number of pages
    rest: number of records on the last page
    current_page: current page number
    current_base_url: base URL of this API
    '''
    res = {}
    MAX_PER_PAGE = settings['MAX_PER_PAGE']
    PARAM = settings['PAGINATE_PARAM']
    current_page = self.get_argument(PARAM, default="1")
    try:
        current_page = int(current_page)
    except ValueError:
        self.set_status(404)
        res['detail'] = 'invalid parameter'
        self.finish(res)
        return
    query = Post.select().order_by(Post.add_time.desc())
    count = await self.application.objects.count(query)
    page = utils.get_page(count)
    previous, next, err = utils.get_next_pre_page('/posts/', current_page, page)
    # the requested page is invalid or out of range
    if err:
        self.set_status(404)
        res['detail'] = err
        self.finish(res)
        return
    # query the database with pagination applied; Post.extend() is presumably
    # a custom select that joins the related author and category rows
    query = Post.extend().order_by(Post.add_time.desc()).paginate(current_page, MAX_PER_PAGE)
    posts = await self.application.objects.execute(query)
    results = []
    for post in posts:
        item = {
            "id": post.id,
            "title": post.title,
            "content": post.content,
            "category": {
                "id": post.category.id,
                "name": post.category.name,
                "desc": post.category.desc,
            },
            "author": {
                "id": post.author.id,
                "username": post.author.username,
            },
            "like_nums": post.like_nums,
            "read_nums": post.read_nums,
            "comment_nums": post.comment_nums,
            "is_excellent": post.is_excellent,
            "is_hot": post.is_hot,
            "is_top": post.is_top,
            "add_time": post.add_time,
            "update_time": post.update_time,
        }
        results.append(item)
    res = {
        "count": str(count),
        "next": next,
        "previous": previous,
        "results": results
    }
    self.finish(json.dumps(res, default=utils.json_serial))
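# Both handlers serialize with json.dumps(..., default=utils.json_serial) so
# that datetime fields such as add_time survive serialization. A minimal
# sketch of such a helper (implementation hypothetical):
from datetime import date, datetime

def json_serial(obj):
    # fallback used by json.dumps for types it cannot encode natively
    if isinstance(obj, (datetime, date)):
        return obj.isoformat()
    raise TypeError('Type %s not serializable' % type(obj))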