def validate(host, port):
    proxy_url = 'http://{}:{}'.format(host, port)
    proxies = {'http': proxy_url, 'https': proxy_url}
    url = 'http://www.sina.com.cn/'  # visit Sina; a successful response means the proxy IP works
    try:
        start_time = time.time()
        page_raw = _crawl('get', url, proxies=proxies, timeout=5)
        if not (page_raw and 'sina' in page_raw):
            raise Exception('response is invalid')
        end_time = time.time()
        time_cost = end_time - start_time
    except Exception as e:
        print(e)
        time_cost = 25
    if time_cost < 15:
        ProxyPool.objects(id='{}:{}:{}'.format('sina', host, port)).update(
            set__host=host,
            set__port=port,
            set__updated_at=datetime.datetime.now(),
            upsert=True,
        )
        valid_ips.add((host, port))
    return time_cost
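The upsert above relies on a ProxyPool MongoEngine document defined elsewhere in the project. A minimal sketch of what such a model could look like, with field names guessed from the set__ keywords used in validate() (database name and collection are placeholders):

import mongoengine

# hypothetical model for illustration; adjust names to the real project
mongoengine.connect('proxy_db')

class ProxyPool(mongoengine.Document):
    id = mongoengine.StringField(primary_key=True)  # e.g. 'sina:host:port'
    host = mongoengine.StringField()
    port = mongoengine.IntField()
    updated_at = mongoengine.DateTimeField()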
def validate(host, port):
    proxy_url = 'http://{}:{}'.format(host, port)
    proxies = {
        'http': proxy_url,
        'https': proxy_url
    }
    url = 'http://www.sina.com.cn/'  # visit Sina; a successful response means the proxy IP works
    try:
        start_time = time.time()
        page_raw = _crawl('get', url, proxies=proxies, timeout=5)
        if not (page_raw and 'sina' in page_raw):
            raise Exception('response is invalid')
        end_time = time.time()
        time_cost = end_time - start_time
    except Exception as e:
        print(e)
        time_cost = 25
    if time_cost < 15:
        valid_ips.add((host, port))
    return time_cost
def get_free_ips():
    ips = set()
    for url in [
        'http://www.xicidaili.com/nn/',
        'http://www.xicidaili.com/nt/'
    ]:
        headers = {
            'Host': 'www.xicidaili.com',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        }
        page_raw = _crawl('get', url, headers)
        doc = parse_doc(page_raw)
        for each in doc.xpath("//table[@id='ip_list']//tr")[1:]:
            ip = each.xpath("./td[2]/text()")[0]
            port = int(each.xpath("./td[3]/text()")[0])
            ips.add((ip, port))
    return ips
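One possible way to wire get_free_ips() and validate() together is a small refresh loop. This is only a usage sketch: it assumes the _crawl helper and the module-level valid_ips set from the code above, and the thread-pool size is arbitrary.

from concurrent.futures import ThreadPoolExecutor

valid_ips = set()  # validate() adds working proxies here

def refresh_proxy_pool():
    candidates = list(get_free_ips())
    # check every candidate concurrently; validate() returns the response time
    with ThreadPoolExecutor(max_workers=10) as pool:
        costs = list(pool.map(lambda hp: validate(hp[0], hp[1]), candidates))
    for (host, port), cost in zip(candidates, costs):
        print(host, port, round(cost, 2))
    print('usable proxies:', valid_ips)

if __name__ == '__main__':
    refresh_proxy_pool()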
href_list = node.xpath("./@href")  # xpath relative to this node; the leading ./ marks a relative path
print(href_list)
# ['http://www.4399.com/flash/32979.htm']

# parse the 36kr article detail page with xpath
from article1.crawl import _crawl  # the crawl helper introduced in the earlier article

url = 'http://36kr.com/p/5130007.html'
headers = {
    'Host': '36kr.com',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
}
html = _crawl('get', url, headers)  # fetch the article source
doc = parse_doc(html)  # parse into a tree object with lxml.html
title_xpath = "//div[@class='mobile_article']/h1/text()"
print(doc.xpath(title_xpath))
# output: []

data = re.findall(r'props=(.+),locationnal', html)[0]
data = json.loads(data)['detailArticle|post']
print(data['id'], data['title'])
# output: 5130007 最前线 | 李大霄:中兴或可通过谈判获得一个解决方案

# a quick look at regular expressions
article_id_list = re.findall(r'p/(\d+)', 'http://36kr.com/p/5130007.html')
print(article_id_list)
# output: ['5130007']
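For completeness, the same article id can also be pulled out with re.search, which returns a match object instead of a list of strings; this is purely illustrative, using the same URL as above.

import re

m = re.search(r'p/(\d+)', 'http://36kr.com/p/5130007.html')
if m:
    print(m.group(0))  # 'p/5130007' -- the whole match
    print(m.group(1))  # '5130007'   -- the first capture group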
            article_id=article_id,
            article_title=article_title,
            publish_time=article_publish_time,
            author_name=article_author_name,
        ))
    return articles


if __name__ == '__main__':
    # crawl the home page
    url = 'https://www.huxiu.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    }
    page_raw = _crawl('get', url, headers)
    channels = parse_channel_urls_page(page_raw)
    print(channels)

    # crawl each channel list page
    for channel in channels:
        url = 'https://www.huxiu.com/channel/{}.html'.format(
            channel['channel_id'])
        page_raw = _crawl('get', url, headers)
        articles = parse_channel_list_page(page_raw)
        print(articles)

# Note: only the first page of each channel list is crawled here, not page 2, page 3, and so on.
# Readers who are interested can look into crawling the remaining pages; see the sketch below.
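A sketch of how pagination could be handled. The '/channel/{id}/{page}.html' URL pattern is an assumption made purely for illustration; check the real pagination links on the site first. It reuses _crawl, headers and parse_channel_list_page from the code above.

def crawl_channel_all_pages(channel_id, max_pages=5):
    # the page-number URL pattern below is a guess, not the verified huxiu scheme
    all_articles = []
    for page in range(1, max_pages + 1):
        url = 'https://www.huxiu.com/channel/{}/{}.html'.format(channel_id, page)
        page_raw = _crawl('get', url, headers)
        articles = parse_channel_list_page(page_raw)
        if not articles:  # stop when a page returns nothing
            break
        all_articles.extend(articles)
    return all_articles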
# # output: []
#
# # with Chrome instead
# with ChromeDownloader() as browser:
#     browser.get(url)
#     doc = parse_doc(browser.page_source)
#     print(doc.xpath("//span[contains(text(), '月销量')]/following-sibling::span/text()"))
#     # output: ['95']

cookie = login_weibo('user_name', 'password')  # your username and password; turn off login protection in the Weibo mobile client first
print(cookie)

# with the cookie added to the headers, a plain requests call completes the login and fetches the page data
headers = {'Cookie': cookie}
page_raw = _crawl('get', 'https://weibo.com/2092770375/fans', headers=headers)
print('脆弱的负离子饭' in page_raw, '=============')
# output: True

# from selenium import webdriver
#
# # use a visible (non-headless) Chrome so the effect is easy to see
# browser = webdriver.Chrome()
# browser.get('https://www.baidu.com')
#
# # run JavaScript in the page
# browser.execute_script('''
#     document.getElementById("su").setAttribute("value","搜狗二下")
# ''')
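execute_script can also hand values back to Python, which is handy for pulling data out of a JS-rendered page. A small stand-alone example (baidu.com is reused from above; any page works):

from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')

# a 'return' inside the script sends the value back to Python
title = browser.execute_script('return document.title')
print(title)

browser.quit()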