def get_ip():
    """Yield ``(ip, port)`` pairs scraped from the kuaidaili proxy list.

    Pages 1 through 10 under ``http://www.kuaidaili.com/proxylist/`` are
    fetched one at a time through ``getHtml`` and parsed with BeautifulSoup
    using the ``lxml`` parser. For every ``<tr>`` element, the ``.string``
    of its second child node is taken as the IP and the ``.string`` of its
    fourth child node as the port.

    Because this is a generator, a page is only requested once the consumer
    has exhausted the rows of the previous one.

    Yields:
        tuple: ``(ip, port)`` exactly as returned by ``.string`` for the two
        child nodes. Rows whose IP value equals the literal ``'IP'`` are
        skipped (presumably the table header -- confirm against the site).
    """
    base_url = 'http://www.kuaidaili.com/proxylist/'
    for page in range(1, 11):
        markup = getHtml(base_url + str(page))
        soup = BeautifulSoup(markup, 'lxml')
        for row in soup.find_all('tr'):
            # Both cells are read before the header check, so a row that
            # lacks a fourth child raises even if it would have been skipped.
            ip = row.contents[1].string
            port = row.contents[3].string
            if 'IP' == ip:
                continue
            yield ip, port
        # Pause after each page (presumably to throttle requests).
        sleep(1)
def get_ip():
    """Generate proxy ``(ip, port)`` tuples from the first ten listing pages.

    Each page URL is ``http://www.kuaidaili.com/proxylist/`` followed by the
    page number (1 to 10). The page is obtained with ``getHtml`` and handed
    to BeautifulSoup with the ``lxml`` parser; every ``<tr>`` found is then
    inspected.

    Yields:
        tuple: ``(ip, port)``, where ``ip`` is the ``.string`` of the row's
        second child node and ``port`` the ``.string`` of its fourth. A row
        is dropped when ``ip`` equals ``'IP'`` (looks like the header row --
        TODO confirm).
    """
    base_url = 'http://www.kuaidaili.com/proxylist/'
    page = 1
    while page <= 10:
        soup = BeautifulSoup(getHtml(base_url + str(page)), 'lxml')
        for tr in soup.find_all('tr'):
            cells = tr.contents
            # Index 1 holds the IP cell and index 3 the port cell.
            ip, port = cells[1].string, cells[3].string
            if 'IP' != ip:
                yield ip, port
        page += 1
        # sleep(1) runs once per page, after its rows have been yielded.
        sleep(1)
__author__ = 'wangqi'
import re
from commen import getHtml
# -*- coding:utf-8 -*-
# NOTE: PEP 263 only honours an encoding declaration on the first or second
# line of a file, so the comment above has no effect in this position.

# Script: fetch page 1 of the "hot" listing and print every matched entry
# whose fourth captured segment does not contain the text "img".
page = 1
initail_url = 'http://www.qiushibaike.com/hot/page/' + str(page)
html = getHtml(initail_url)

# Five capture groups, in order:
#   0: text after the <img> inside the "author" block's <a> tag
#   1: text following 'content">' up to the HTML comment
#   2: the body of the HTML comment (<!-- ... -->)
#   3: everything between the closing </div> and the "stats" div
#   4: text inside the element with class="number"
# re.S lets "." span newlines so a single match can cover a multi-line entry.
pattern = re.compile(
    '<div.*?author">.*?<a.*?<img.*?>(.*?)</a>.*?'
    '<div.*?content">(.*?)<!--(.*?)-->.*?</div>'
    '(.*?)<div class="stats.*?class="number">(.*?)</i>',
    re.S,
)
items = pattern.findall(html)

for item in items:
    haveImg = re.search("img", item[3])
    if haveImg:
        # Entries whose fourth group mentions "img" are not printed.
        continue
    print(item[0], item[1], item[2], item[4])
__author__ = 'wangqi'
import re
from commen import getHtml
# -*- coding:utf-8 -*-

# Download the first "hot" page and print the captured fields of each entry
# matched by the pattern below, leaving out entries that reference "img".
page = 1
initail_url = 'http://www.qiushibaike.com/hot/page/' + str(page)
html = getHtml(initail_url)

# The pattern walks one entry from its "author" block through its "content"
# block to the "stats" block. It captures, in order: the text after the
# author's <img>, the content text, the body of the HTML comment after it,
# the markup between the content's </div> and the stats div, and the text of
# the class="number" element. re.S makes "." match newlines as well.
pattern = re.compile(
    '<div.*?author">.*?<a.*?<img.*?>(.*?)</a>.*?<div.*?content">(.*?)'
    '<!--(.*?)-->.*?</div>(.*?)<div class="stats.*?class="number">(.*?)</i>',
    re.S)
items = re.findall(pattern, html)

for item in items:
    # item[3] is the markup between the content and the stats block; it is
    # searched for "img" and never printed.
    haveImg = re.search("img", item[3])
    if haveImg is None:
        print(item[0], item[1], item[2], item[4])