import requests
from lxml import etree
from gooseeker import GsExtractor
# chromes, IpAgency, RandomUserAgent and Mylog are helpers from this project's own modules.


class webCon:
    # Chrome is slow to start, so start it as rarely as possible; a class-level (static) instance fits best
    chrome = chromes()  # instantiated only once
    mylog = Mylog("WebCon")

    def __init__(self):
        # extractor backed by the API provided by GooSeeker
        self.extra = GsExtractor()
        # service provided by the IP proxy vendor
        self.Ipagency = IpAgency()

    def con_ThenGetContent(self, url, rule):
        try:
            # first attempt: collect the page through the Chrome browser
            data = webCon.chrome.get_html(url=url)
            doc = etree.HTML(data)
            self.extra.setXsltFromAPI(theme=rule)  # fetch the rule file and load it
            content = self.extra.extract(doc)  # content parsed by the xslt, as XML bytes
            return content
        except Exception as e:
            # log the Chrome failure, then fall back to requests
            webCon.mylog.debug("chrome has some problem: " + str(e))
            rd = RandomUserAgent()
            headers = {"User-Agent": rd.get_RanDomAgent()}
            try:
                # second attempt: collect the page with requests through a proxy
                data = requests.get(url=url, headers=headers, timeout=500,
                                    proxies=self.Ipagency.getIpProxy()).text
                # data = requests.get(url=url, headers=headers, timeout=500).text  # without a proxy
                doc = etree.HTML(data)
                self.extra.setXsltFromAPI(theme=rule)  # fetch the rule file and load it
                content = self.extra.extract(doc)  # content parsed by the xslt, as XML bytes
                return content
            except Exception as e1:
                # log the proxy failure, then fall back to the local IP
                print("requests via proxy has some problem: " + str(e1))
                try:
                    # third attempt: the Abuyun proxy IP failed, so collect with requests from the local IP
                    data = requests.get(url=url, headers=headers, timeout=500).text  # without a proxy
                    doc = etree.HTML(data)
                    self.extra.setXsltFromAPI(theme=rule)  # fetch the rule file and load it
                    content = self.extra.extract(doc)  # content parsed by the xslt, as XML bytes
                    return content
                except Exception as e2:
                    # all three attempts failed: log it as an error
                    webCon.mylog.error(str.encode(str(url) + ":" + str(e2)))
                    print(str.encode(str(url) + ":" + str(e2)))
                    return None

    def getextra(self):
        return self.extra
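A minimal usage sketch for the class above; the rule name below is an illustrative placeholder, not a value from the original.

if __name__ == "__main__":
    crawler = webCon()
    # "some_theme" stands for a rule (theme) name defined in the GooSeeker member center
    xml_bytes = crawler.con_ThenGetContent(url="http://www.gooseeker.com/cn/forum/7",
                                           rule="some_theme")
    if xml_bytes is not None:
        with open("result.xml", "wb") as f:
            f.write(xml_bytes)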
def getContent(self, url):
    # render the page with PhantomJS, wait for it to finish loading, then apply the local xslt rule
    browser = webdriver.PhantomJS(executable_path=self.phantomjsPath)
    browser.get(url)
    time.sleep(self.waittime)
    html = browser.execute_script("return document.documentElement.outerHTML")
    doc = etree.HTML(html)
    jdlistExtra = GsExtractor()
    jdlistExtra.setXsltFromFile("jd_list.xml")  # xslt rule read from a local file
    output = jdlistExtra.extract(doc)
    browser.quit()  # release the PhantomJS process
    return output
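The method above assumes it lives on a class that carries phantomjsPath and waittime. A minimal sketch of such a wrapper, with an illustrative class name and default wait time (both are assumptions, not from the original):

import time
from lxml import etree
from selenium import webdriver
from gooseeker import GsExtractor

class JdListSpider:  # illustrative name; not from the original
    def __init__(self, phantomjsPath, waittime=3):
        self.phantomjsPath = phantomjsPath  # path to the PhantomJS executable
        self.waittime = waittime            # seconds to wait for the page to render

    # getContent(self, url) from the snippet above goes here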
def parse(self, response):
    print("start...")
    # drive the already-started browser to the target page
    self.browser.get(response.url)
    # give the page time to load
    time.sleep(3)
    # fetch the xslt extraction rule
    extra = GsExtractor()
    extra.setXsltFromAPI("API KEY", "淘宝天猫_商品详情30474")
    # get the rendered document
    html = self.browser.execute_script("return document.documentElement.outerHTML")
    doc = etree.HTML(html)
    result = extra.extract(doc)
    # write the result to a file
    file_name = 'E:/淘宝天猫_商品详情30474_' + self.getTime() + '.xml'
    with open(file_name, "wb") as f:
        f.write(result)
    self.browser.close()
    print("end")
def parse(self, response):
    print("start...")
    # drive the already-started browser to the target page
    self.browser.get(response.url)
    # give the page time to load
    time.sleep(3)
    # fetch the xslt extraction rule
    extra = GsExtractor()
    extra.setXsltFromAPI("0a3898683f265e7b28991e0615228baa", "淘宝天猫_商品详情30474")
    # get the rendered document
    html = self.browser.execute_script("return document.documentElement.outerHTML")
    doc = etree.HTML(html)
    result = extra.extract(doc)
    # write the result to a file
    file_name = 'F:/temp/淘宝天猫_商品详情30474_' + self.getTime() + '.xml'
    with open(file_name, "wb") as f:
        f.write(result)
    self.browser.close()
    print("end")
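Both parse() variants above rely on self.browser and self.getTime(), which are defined elsewhere in the spider. A minimal sketch of the surrounding skeleton, assuming a Scrapy spider driving Firefox (the class name, start URL, and driver choice are assumptions, not from the original):

import datetime
import time
import scrapy
from lxml import etree
from selenium import webdriver
from gooseeker import GsExtractor

class TmallDetailSpider(scrapy.Spider):  # illustrative name; not from the original
    name = "tmall_detail"
    # placeholder start URL; replace with the actual Tmall product page
    start_urls = ["https://detail.tmall.com/item.htm"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.browser = webdriver.Firefox()  # parse() expects a ready WebDriver on self.browser

    def getTime(self):
        # timestamp used to build the output file name
        return datetime.datetime.now().strftime("%Y%m%d%H%M%S")

    # parse(self, response) from the snippets above goes here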
# _*_coding:utf8_*_
# crawler_gooseeker_bbs.py
# Version: V1.0
from urllib import request
from lxml import etree
from gooseeker import GsExtractor

# fetch and read the page content
url = "http://www.gooseeker.com/cn/forum/7"
conn = request.urlopen(url)
doc = etree.HTML(conn.read())

bbsExtra = GsExtractor()
# set the xslt extraction rule; the first argument is the app key, applied for in the member center
bbsExtra.setXsltFromAPI("98adf83ksdf0slrwaerwersdkfjsa", "gooseeker_bbs_xslt")
result = bbsExtra.extract(doc)  # call extract to pull out the wanted content
print(str(result))
from urllib import request
from lxml import etree
from gooseeker import GsExtractor


class Spider:
    def getContent(self, url):
        # fetch and parse one result page
        conn = request.urlopen(url)
        output = etree.HTML(conn.read())
        return output

    def saveContent(self, filepath, content):
        file_obj = open(filepath, 'w', encoding='UTF-8')
        file_obj.write(content)
        file_obj.close()


bbsExtra = GsExtractor()
# The next call uses GooSeeker's API to set the xslt extraction rule.
# The first argument is the app key, applied for in the GooSeeker member center.
# The second argument is the rule name, generated with GooSeeker's visual tool MS (谋数台).
bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e", "安居客房产经纪人")

url = "http://shenzhen.anjuke.com/tycoon/nanshan/p"
totalpages = 50
anjukeSpider = Spider()
print("Crawl started")
for pagenumber in range(1, totalpages):
    currenturl = url + str(pagenumber)
    print("Crawling", currenturl)
    content = anjukeSpider.getContent(currenturl)
    outputxml = bbsExtra.extract(content)
    outputfile = "result" + str(pagenumber) + ".xml"
    anjukeSpider.saveContent(outputfile, str(outputxml))
print("Crawl finished")
# _*_coding:utf8_*_
# crawler_gooseeker_bbs.py
# Version: V1.0
from urllib import request
from lxml import etree
from gooseeker import GsExtractor

# fetch and read the page content
url = "http://www.gooseeker.com/cn/forum/7"
conn = request.urlopen(url)
doc = etree.HTML(conn.read())

bbsExtra = GsExtractor()
# set the xslt extraction rule; the first argument is the app key, applied for in the member center
bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e", "gooseeker_bbs_xslt")
result = bbsExtra.extract(doc)  # call extract to pull out the wanted content
print(str(result))
import os
import time
from lxml import etree
from selenium import webdriver
from gooseeker import GsExtractor

page = 1  # result page number; matches the page= parameter in the URL below

bbsExtra = GsExtractor()
bbsExtra.setXsltFromAPI("a0056d16ff3003ae9d5b48bcfa54f4af", "newsCrawler")  # set the xslt extraction rule

# create the directory that stores the results
current_path = os.getcwd()
res_path = current_path + "/third-result"
if not os.path.exists(res_path):
    os.mkdir(res_path)

# drive Firefox
driver = webdriver.Firefox()
url = "https://www.amazon.cn/s/ref=sr_pg_1?rh=n%3A658390051%2Cn%3A!658391051%2Cn%3A658414051%2Cn%3A658810051&page=1&ie=UTF8&qid=1476258544"
driver.get(url)
time.sleep(2)

# get the rendered page content
content = driver.execute_script("return document.documentElement.outerHTML")  # page_source.encode('utf-8')
# build the document
doc = etree.HTML(content)
# call extract to pull out the wanted content
result = bbsExtra.extract(doc)

# save the result
file_path = res_path + "/page-" + str(page) + ".xml"
with open(file_path, "wb") as f:
    f.write(result)
print('Page ' + str(page) + ' collected, file: ' + file_path)

driver.quit()
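If more than one Amazon result page is wanted, the same fetch-extract-save steps can be repeated while the page parameter in the URL changes. A minimal sketch reusing bbsExtra and res_path from above; the page count and URL template are illustrative assumptions.

base_url = ("https://www.amazon.cn/s/ref=sr_pg_1?rh=n%3A658390051%2Cn%3A!658391051"
            "%2Cn%3A658414051%2Cn%3A658810051&page={}&ie=UTF8&qid=1476258544")
driver = webdriver.Firefox()
for page in range(1, 4):  # crawl pages 1..3
    driver.get(base_url.format(page))
    time.sleep(2)
    content = driver.execute_script("return document.documentElement.outerHTML")
    doc = etree.HTML(content)
    result = bbsExtra.extract(doc)
    file_path = res_path + "/page-" + str(page) + ".xml"
    with open(file_path, "wb") as f:
        f.write(result)
    print('Page ' + str(page) + ' collected, file: ' + file_path)
driver.quit()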
# -*- coding: utf8 -*-
# Example program using the GsExtractor class:
# visit the GooSeeker forum and extract the forum content with an xslt template
# (the xslt is stored in xslt_bbs.xml)
from urllib import request
from lxml import etree
from gooseeker import GsExtractor

# fetch and read the page content
url = "http://www.gooseeker.com/cn/forum/7"
conn = request.urlopen(url)
doc = etree.HTML(conn.read())

# create the extractor object
bbsExtra = GsExtractor()
# load the xslt content from the local file
bbsExtra.setXsltFromFile("xslt_bbs.xml")
# call extract to pull out the wanted content
result = bbsExtra.extract(doc)
# show the result
print(str(result))
from urllib import request
from lxml import etree
from gooseeker import GsExtractor


class Spider:
    def getContent(self, url):
        # fetch and parse one result page
        conn = request.urlopen(url)
        output = etree.HTML(conn.read())
        return output

    def saveContent(self, filepath, content):
        file_obj = open(filepath, 'w', encoding='UTF-8')
        file_obj.write(content)
        file_obj.close()


bbsExtra = GsExtractor()
# set the xslt extraction rule; the first argument is the app key, applied for in the member center
bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e", "安居客房产经纪人")

url = "http://shenzhen.anjuke.com/tycoon/nanshan/p"
totalpages = 50  # number of list pages to crawl
anjukeSpider = Spider()
print("Crawl started")
for pagenumber in range(1, totalpages):
    currenturl = url + str(pagenumber)
    print("Crawling", currenturl)
    content = anjukeSpider.getContent(currenturl)
    outputxml = bbsExtra.extract(content)
    outputfile = "result" + str(pagenumber) + ".xml"
    anjukeSpider.saveContent(outputfile, str(outputxml))
print("Crawl finished")
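The next excerpt starts inside PhantomSpider.getContent. A minimal sketch of the assumed opening of that class and method, so the excerpt below reads in context (the PhantomJS executable path is a placeholder, not a value from the original):

import time
from lxml import etree
from selenium import webdriver
from gooseeker import GsExtractor

class PhantomSpider:
    def getContent(self, url):
        # the executable_path below is a placeholder; point it at the local PhantomJS binary
        browser = webdriver.PhantomJS(executable_path="/path/to/phantomjs")
        browser.get(url)
        # the rest of getContent continues in the excerpt below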
        time.sleep(3)
        html = browser.execute_script("return document.documentElement.outerHTML")
        output = etree.HTML(html)
        return output

    def saveContent(self, filepath, content):
        file_obj = open(filepath, 'w', encoding='UTF-8')
        file_obj.write(content)
        file_obj.close()


doubanExtra = GsExtractor()
# The next call uses GooSeeker's API to set the xslt extraction rule.
# The first argument is the app key, applied for in the GooSeeker member center.
# The second argument is the rule name, generated with GooSeeker's visual tool MS (谋数台).
doubanExtra.setXsltFromAPI("ffd5273e213036d812ea298922e2627b", "豆瓣小组讨论话题")

url = "https://www.douban.com/group/haixiuzu/discussion?start="
totalpages = 5
doubanSpider = PhantomSpider()
print("Crawl started")
for pagenumber in range(1, totalpages):
    currenturl = url + str((pagenumber - 1) * 25)  # each Douban list page shows 25 topics
    print("Crawling", currenturl)
    content = doubanSpider.getContent(currenturl)
    outputxml = doubanExtra.extract(content)
    outputfile = "result" + str(pagenumber) + ".xml"
    doubanSpider.saveContent(outputfile, str(outputxml))
print("Crawl finished")
"return document.documentElement.outerHTML") output = etree.HTML(html) return output def saveContent(self, filepath, content): file_obj = open(filepath, 'w', encoding='UTF-8') file_obj.write(content) file_obj.close() doubanExtra = GsExtractor() # 下面这句调用gooseeker的api来设置xslt抓取规则 # 第一个参数是app key,请到GooSeeker会员中心申请 # 第二个参数是规则名,是通过GooSeeker的图形化工具: 谋数台MS 来生成的 doubanExtra.setXsltFromAPI("ffd5273e213036d812ea298922e2627b", "豆瓣小组讨论话题") url = "https://www.douban.com/group/haixiuzu/discussion?start=" totalpages = 5 doubanSpider = PhantomSpider() print("爬取开始") for pagenumber in range(1, totalpages): currenturl = url + str((pagenumber - 1) * 25) print("正在爬取", currenturl) content = doubanSpider.getContent(currenturl) outputxml = doubanExtra.extract(content) outputfile = "result" + str(pagenumber) + ".xml" doubanSpider.saveContent(outputfile, str(outputxml)) print("爬取结束")