Example #1
 def getContent(self, url):
     # Render the page in PhantomJS so JavaScript-generated content is present
     browser = webdriver.PhantomJS(executable_path=self.phantomjsPath)
     browser.get(url)
     time.sleep(self.waittime)  # wait for the page to finish rendering
     html = browser.execute_script(
         "return document.documentElement.outerHTML")
     browser.quit()  # shut down the PhantomJS process to avoid leaking it
     doc = etree.HTML(html)
     # Extract the product list with a local XSLT rule file
     jdlistExtra = GsExtractor()
     jdlistExtra.setXsltFromFile("jd_list.xml")
     output = jdlistExtra.extract(doc)
     return output
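
PhantomJS is no longer maintained, and recent Selenium releases have dropped the PhantomJS driver entirely. A minimal sketch of the same render-then-extract flow on headless Chrome (Selenium 4 style; the rule file name is reused from the method above):

import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from gooseeker import GsExtractor


def get_content(url, waittime=3):
    options = Options()
    options.add_argument("--headless=new")  # run Chrome without a visible window
    browser = webdriver.Chrome(options=options)
    try:
        browser.get(url)
        time.sleep(waittime)  # crude wait for JavaScript rendering, as in the original
        html = browser.execute_script(
            "return document.documentElement.outerHTML")
    finally:
        browser.quit()
    doc = etree.HTML(html)
    extractor = GsExtractor()
    extractor.setXsltFromFile("jd_list.xml")  # same local rule file as above
    return extractor.extract(doc)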
Example #2
 def parse(self, response):
     print("-" * 10)
     # Instantiate GooSeeker's rule-based extractor
     bbsExtra = GsExtractor()
     # Set the XSLT extraction rule (the first argument is the API key)
     bbsExtra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e", "安居客_房源")
     # Call extractHTML to pull out the target content
     result = bbsExtra.extractHTML(response.body)
     # Print the result, re-encoded so a GBK console won't raise UnicodeEncodeError
     res = str(result).encode('gbk', 'ignore').decode('gbk')
     print(res)
     # Save the result
     file_path = os.getcwd() + "/anjuke-result.xml"
     with open(file_path, "wb") as f:
         f.write(str(result).encode('utf-8'))  # serialize the lxml XSLT result
     # Print where the result was stored
     print("Result file: " + file_path)
Example #3
 def parse(self, response):
     print("start...")
     # open the page in the browser
     self.browser.get(response.url)
     # give the page time to load
     time.sleep(3)
     # fetch the XSLT extraction rule
     extra = GsExtractor()
     extra.setXsltFromAPI("API KEY", "淘宝天猫_商品详情30474")
     # grab the rendered document
     html = self.browser.execute_script("return document.documentElement.outerHTML")
     doc = etree.HTML(html)
     result = extra.extract(doc)
     # write the result to a file
     file_name = 'E:/淘宝天猫_商品详情30474_' + self.getTime() + '.xml'
     with open(file_name, "wb") as f:
         f.write(str(result).encode('utf-8'))  # serialize the lxml XSLT result
     self.browser.close()
     print("end")
Example #4
# -*- coding: utf-8 -*-
# crawler_gooseeker_bbs.py
# Version: V1.0

from urllib import request
from lxml import etree
from gooseeker import GsExtractor

# fetch and read the page content
url = "http://www.gooseeker.com/cn/forum/7"
conn = request.urlopen(url)
doc = etree.HTML(conn.read())

bbsExtra = GsExtractor()
bbsExtra.setXsltFromAPI(
    "98adf83ksdf0slrwaerwersdkfjsa",
    "gooseeker_bbs_xslt")  # set the XSLT rule; the first argument is the app key (apply at the member center)
result = bbsExtra.extract(doc)  # call extract to pull out the target content

print(str(result))
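
The same script also works without the API round-trip: as Example #1 shows, a rule can be loaded from a local file instead. A drop-in offline variant of the extraction step (the file name xslt_bbs.xml is borrowed from Example #6's comment):

bbsExtra = GsExtractor()
bbsExtra.setXsltFromFile("xslt_bbs.xml")  # local XSLT rule instead of an API fetch
result = bbsExtra.extract(doc)
print(str(result))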
Example #5
File: webCon.py  Project: zhaox1nkai/spider
    def __init__(self):
        # extractor backed by the GooSeeker (Jisouke) API
        self.extra = GsExtractor()

        # service provided by an IP proxy vendor
        self.Ipagency = IpAgency()
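
The fragment shows only construction. Judging from the other examples, a method on the same class might feed pages through the shared extractor like this (the class and method names and the rule file are assumptions; IpAgency's interface is not shown in the source, so it is left out):

from urllib import request
from lxml import etree
from gooseeker import GsExtractor


class WebCon:  # hypothetical name, after the file webCon.py
    def __init__(self):
        self.extra = GsExtractor()

    def getContent(self, url):
        # hypothetical: fetch a page and run it through the shared extractor
        conn = request.urlopen(url)
        doc = etree.HTML(conn.read())
        self.extra.setXsltFromFile("rule.xml")  # placeholder rule file name
        return self.extra.extract(doc)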
Example #6
# -*- coding: utf-8 -*-
# Sample program using the GsExtractor class
# Visits the GooSeeker forum and extracts its content with an XSLT template
# The XSLT is stored in xslt_bbs.xml
from urllib import request
from lxml import etree
from gooseeker import GsExtractor

# fetch and read the page content
url = "http://im.nju.edu.cn/teachers.do?type=1&mid=4"
conn = request.urlopen(url)
doc = etree.HTML(conn.read())

bbsExtra = GsExtractor()  # create the XSLT extractor object
bbsExtra.setXsltFromAPI("e346796c93c6ba7441636666e401e5cc", "im.nju.edu.cn")
xs = bbsExtra.getXslt()
result = bbsExtra.extract(doc)  # call extract to pull out the target content
# write the result to a file
file_name = 'E:/parse_detail_' + '.xml'
with open(file_name, "w", encoding="utf-8") as f:
    f.write(str(result))
print(result)
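
Example #6 fetches the rule text with getXslt but never uses the returned value. One plausible use, assuming getXslt returns the XSLT source as text, is caching it so later runs can load it via setXsltFromFile (as in Example #1) without hitting the API; appended to the script above:

# cache the downloaded rule locally for offline reuse
with open("xslt_bbs.xml", "w", encoding="utf-8") as f:
    f.write(str(xs))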
Example #7
# Douban group spider: PhantomJS renders each listing page, then GooSeeker's
# XSLT rule extracts the discussion topics.
import time
from lxml import etree
from selenium import webdriver
from gooseeker import GsExtractor


class PhantomSpider:
    def getContent(self, url):
        browser = webdriver.PhantomJS(
            executable_path='C:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe')
        browser.get(url)
        time.sleep(3)  # wait for JavaScript rendering
        html = browser.execute_script(
            "return document.documentElement.outerHTML")
        output = etree.HTML(html)
        return output

    def saveContent(self, filepath, content):
        file_obj = open(filepath, 'w', encoding='UTF-8')
        file_obj.write(content)
        file_obj.close()

doubanExtra = GsExtractor()
# The call below uses the GooSeeker API to set the XSLT extraction rule.
# The first argument is the app key; apply for one at the GooSeeker member center.
# The second argument is the rule name, generated with GooSeeker's graphical
# tool MS MoShuTai (谋数台).
doubanExtra.setXsltFromAPI("ffd5273e213036d812ea298922e2627b", "豆瓣小组讨论话题")

url = "https://www.douban.com/group/haixiuzu/discussion?start="
totalpages = 5
doubanSpider = PhantomSpider()
print("Crawl started")

for pagenumber in range(1, totalpages + 1):  # +1 so all totalpages pages are visited
    currenturl = url + str((pagenumber - 1) * 25)  # the list advances 25 topics per page
    print("Crawling", currenturl)
    content = doubanSpider.getContent(currenturl)
    outputxml = doubanExtra.extract(content)
    # persist each page's result via the class's saveContent helper
    # (the file name pattern here is our choice, not from the source)
    doubanSpider.saveContent("douban_" + str(pagenumber) + ".xml", str(outputxml))
print("Crawl finished")