def getContent(self, url):
    # Launch a headless PhantomJS browser to render the page
    browser = webdriver.PhantomJS(executable_path=self.phantomjsPath)
    browser.get(url)
    # Wait for dynamically loaded content to finish rendering
    time.sleep(self.waittime)
    html = browser.execute_script(
        "return document.documentElement.outerHTML")
    browser.quit()
    doc = etree.HTML(html)
    # Load the xslt extraction rules from a local file
    jdlistExtra = GsExtractor()
    jdlistExtra.setXsltFromFile("jd_list.xml")
    output = jdlistExtra.extract(doc)
    return output
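# Context note: getContent above reads self.phantomjsPath and self.waittime, so it
# presumably lives inside a spider class. A minimal hypothetical sketch of that
# enclosing class (the class name and attribute values below are illustrative
# assumptions, not taken from the original):
import time
from lxml import etree
from selenium import webdriver
from gooseeker import GsExtractor

class JdListSpider:
    def __init__(self):
        self.phantomjsPath = 'C:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe'  # assumed path
        self.waittime = 3  # assumed page-load wait in seconds

    # ... getContent(self, url) as defined above would go here ...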
def parse(self, response): print("-" * 10) # 引入Gooseeker最新规则提取器 bbsExtra = GsExtractor() # 设置xslt抓取规则 bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e", "安居客_房源") # 调用extract方法提取所需内容 result = bbsExtra.extractHTML(response.body) # 打印采集结果 res = str(result).encode('gbk', 'ignore').decode('gbk') print(res) # 保存采集结果 file_path = os.getcwd() + "/anjuke-result.xml" open(file_path, "wb").write(result) # 打印结果存放路径 print("采集结果文件:" + file_path)
def parse(self, response): print("start...") # start browser self.browser.get(response.url) # loading time interval time.sleep(3) # get xslt extra = GsExtractor() extra.setXsltFromAPI("API KEY", "淘宝天猫_商品详情30474") # get doc html = self.browser.execute_script("return document.documentElement.outerHTML"); doc = etree.HTML(html) result = extra.extract(doc) # out file file_name = 'E:/淘宝天猫_商品详情30474_' + self.getTime() + '.xml' open(file_name, "wb").write(result) self.browser.close() print("end")
# -*- coding: utf-8 -*-
# crawler_gooseeker_bbs.py
# Version: V1.0

from urllib import request
from lxml import etree
from gooseeker import GsExtractor

# Fetch and read the page content
url = "http://www.gooseeker.com/cn/forum/7"
conn = request.urlopen(url)
doc = etree.HTML(conn.read())

bbsExtra = GsExtractor()
# Set the xslt extraction rules; the first argument is the app key,
# which you can apply for in the GooSeeker member center
bbsExtra.setXsltFromAPI("98adf83ksdf0slrwaerwersdkfjsa", "gooseeker_bbs_xslt")
# Call extract to pull out the target content
result = bbsExtra.extract(doc)
print(str(result))
def __init__(self):
    # Extractor backed by the GooSeeker API
    self.extra = GsExtractor()
    # Service provided by the IP proxy vendor
    self.Ipagency = IpAgency()
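# A hypothetical sketch of how the two members initialized above might cooperate
# in a fetch method sitting next to __init__ in the same class. The original does
# not show IpAgency's interface; getProxy() returning a "host:port" string is an
# assumed method name:
from urllib import request
from lxml import etree

def fetchAndExtract(self, url):
    proxy = self.Ipagency.getProxy()  # assumed API of the proxy vendor wrapper
    req = request.Request(url)
    req.set_proxy(proxy, 'http')      # route this request through the proxy
    doc = etree.HTML(request.urlopen(req).read())
    return self.extra.extract(doc)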
# -*- coding: utf-8 -*-
# Example program using the GsExtractor class:
# fetch a page and extract its content with xslt rules obtained from the API

from urllib import request
from lxml import etree
from gooseeker import GsExtractor

# Fetch and read the page content
url = "http://im.nju.edu.cn/teachers.do?type=1&mid=4"
conn = request.urlopen(url)
doc = etree.HTML(conn.read())

bbsExtra = GsExtractor()  # Create the extractor object
bbsExtra.setXsltFromAPI("e346796c93c6ba7441636666e401e5cc", "im.nju.edu.cn")
xs = bbsExtra.getXslt()  # The downloaded xslt, kept for inspection
result = bbsExtra.extract(doc)  # Call extract to pull out the target content

# Write the result to an output file
file_name = 'E:/parse_detail_' + '.xml'
with open(file_name, "w", encoding="utf-8") as f:
    f.write(str(result))
print(result)
import time
from lxml import etree
from selenium import webdriver
from gooseeker import GsExtractor

class PhantomSpider:
    def getContent(self, url):
        browser = webdriver.PhantomJS(
            executable_path='C:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe')
        browser.get(url)
        time.sleep(3)
        html = browser.execute_script(
            "return document.documentElement.outerHTML")
        browser.quit()
        output = etree.HTML(html)
        return output

    def saveContent(self, filepath, content):
        file_obj = open(filepath, 'w', encoding='UTF-8')
        file_obj.write(content)
        file_obj.close()

doubanExtra = GsExtractor()
# The next call uses the GooSeeker API to set the xslt extraction rules.
# The first argument is the app key, which you can apply for in the GooSeeker member center.
# The second argument is the rule name, generated with GooSeeker's visual tool, 谋数台 (MS).
doubanExtra.setXsltFromAPI("ffd5273e213036d812ea298922e2627b", "豆瓣小组讨论话题")

url = "https://www.douban.com/group/haixiuzu/discussion?start="
totalpages = 5
doubanSpider = PhantomSpider()
print("Crawl started")
for pagenumber in range(1, totalpages + 1):
    # Each page lists 25 topics, so the start offset advances by 25
    currenturl = url + str((pagenumber - 1) * 25)
    print("Crawling", currenturl)
    content = doubanSpider.getContent(currenturl)
    outputxml = doubanExtra.extract(content)
    # Save each page's extraction result to its own file
    doubanSpider.saveContent("result" + str(pagenumber) + ".xml", str(outputxml))
print("Crawl finished")
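# Note: PhantomJS support was deprecated in later Selenium releases. If this script
# is run with a current Selenium, an equivalent headless setup might look like the
# sketch below (assumes chromedriver is installed and on PATH):
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)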