def parse_page(self, response):
     prototype = response.meta['prototype']
     item = MI.FirmcrawlerItem(prototype)
     tables = response.xpath(
         '//div[@class="list"]/table/tr[4]//a/@href').extract()
     absurl = urlparse.urljoin(response.url, tables[0])
     filename = tables[0].split('/')[-1].replace(".online", "")
     softname = response.xpath('//div[@class="list"]/table/tr[1]/td/text()'
                               ).extract().pop().strip()
     match = re.search(r'[Vv]?\d\.\d\.\d\.*\d*', softname)
     if match:
         version = match.group()
         model = softname.split(" ")[-1].split(version)[0]
     else:
         version = ""
         model = softname
     model = model.encode('utf8').replace("-", "").replace("_", "")
     item["productVersion"] = version
     item["publishTime"] = ""
     item["productClass"] = ""
     item["productModel"] = model.replace("-", "").replace("_", "")
     item["description"] = ""
     item["url"] = absurl
     item["firmwareName"] = filename
     item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
     yield item
     print "firmwarename:", item["firmwareName"]
Example #2
    def parse_list(self, response):
        prototype = response.meta['prototype']
        lines = response.xpath(
            '//div[@id="c_companyFile_list-15086754191809347"]/div/div[1]/div')
        for line in lines:
            filename = line.xpath(
                './div/div[2]/div[1]/a/h3/div/text()').extract().pop()
            # print filename
            productModel = filename.split("升级")[0]  # "升级" = upgrade
            # print productModel
            publishTime = line.xpath(
                './div/div[2]/div[4]/div/div/text()').extract().pop()
            # print publishTime

            ### http://www.netcoretec.com/comp/companyFile/download.do?fid=104&appId=24&id=98
            ### http://www.netcoretec.com/comp/companyFile/download.do?fid=103&appId=24&id=97#
            ### These two parameters are hard to find on the page (they are filled
            ### in by JavaScript, but careful inspection does turn them up)
            cid = line.xpath('./div/a/@cid').extract().pop()
            data = line.xpath('./div/a/@data').extract().pop()
            # print cid,data
            absurl = "http://www.netcoretec.com/comp/companyFile/download.do?fid=" + str(
                cid) + "&appId=24&id=" + str(data)

            item = MI.FirmcrawlerItem(prototype)
            item["firmwareName"] = filename
            item["url"] = absurl
            item["productVersion"] = ""
            item["publishTime"] = publishTime
            item["productClass"] = ""
            item["productModel"] = productModel
            item["description"] = ""
            item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
            yield item
            print "firmwarename:", item["firmwareName"]
Example #3
    def parse(self, response):
        table_list = response.xpath("//html/body/div[3]/div/div[2]/table")
        for table in table_list:
            #html/body/div[3]/div/div[2]/table[1]/tbody/tr[1]/td[1]/strong
            filename = table.xpath(
                "./tbody/tr[1]/td[1]/strong/text()").extract().pop()

            description = table.xpath("./tbody/tr[2]/td/p/text()").extract()
            desc = ""
            for d in description:
                desc = desc + d.strip()
            # print desc

            absurl = table.xpath(
                "./tbody/tr[1]/td[2]/strong/a/@href").extract().pop()
            # print absurl
            #http://www.tiandy.com/wp-content/files/Easy7SmartClientProfessionalV7.14T.zip

            item = MI.FirmcrawlerItem()
            item["firmwareName"] = filename
            item["publishTime"] = ""
            item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
            item["url"] = absurl
            item["description"] = desc
            item["productClass"] = ""
            item["productVersion"] = ""
            item["productModel"] = ""
            item["manufacturer"] = "tiandy"

            yield item
            print "firmwarename:", item["firmwareName"]
    def parse_page(self, response):
        prototype = response.meta['prototype']
        version = response.xpath(
            '//div[@class="down1-ccont"]/div[2]/p[1]/text()').extract()
        if version:
            self.productVersion = version[0]
        else:
            self.productVersion = ""

        description = response.xpath(
            '//div[@class="down1-ccont"]/div[2]/p[position()>1]/text()'
        ).extract()
        if description:
            self.desc = " ".join(description)
        else:
            self.desc = ""

        urls = response.xpath(
            '//div[@class="down1-ccont"]/div[2]/p[1]/a/@href').extract()
        if urls:
            url = urls[0]
            request = scrapy.Request(url, callback=self.parse_next)
            request.meta["prototype"] = MI.FirmcrawlerItem()
            request.meta["prototype"]["manufacturer"] = "xiongmai"
            yield request
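
Note that parse_page above stashes productVersion and desc on self; Scrapy handles responses concurrently, so spider attributes are shared across pages. A sketch of the same flow passing the fields through request.meta instead (MI.FirmcrawlerItem as in the surrounding snippets):

# Sketch: carry per-page fields in request.meta rather than on the spider,
# since concurrent responses would otherwise overwrite self.productVersion.
import scrapy

def parse_page(self, response):
    version = response.xpath(
        '//div[@class="down1-ccont"]/div[2]/p[1]/text()').extract()
    description = response.xpath(
        '//div[@class="down1-ccont"]/div[2]/p[position()>1]/text()').extract()
    urls = response.xpath(
        '//div[@class="down1-ccont"]/div[2]/p[1]/a/@href').extract()
    if urls:
        prototype = MI.FirmcrawlerItem()  # MI is the project's items module
        prototype["manufacturer"] = "xiongmai"
        prototype["productVersion"] = version[0] if version else ""
        prototype["description"] = " ".join(description) if description else ""
        request = scrapy.Request(urls[0], callback=self.parse_next)
        request.meta["prototype"] = prototype
        yield request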
 def parse_page(self, response):
     prototype = response.meta['prototype']
     item = MI.FirmcrawlerItem(prototype)
     tables = response.xpath('//div[@class="sofewear"]/table[1]/tbody/tr')
     for t in tables:
         softname = t.xpath('./td[2]/a/text()').extract().pop().strip()
         if "驱动" not in unicode.encode(softname, encoding='utf-8'):
             url = t.xpath('./td[2]/a/@href').extract()
             absurl = urlparse.urljoin(response.url, url[0]).replace(
                 " ", "%20").replace("(", "%28").replace(")", "%29")
             model = t.xpath('./td[1]/text()').extract().pop().strip()
             publishtime = t.xpath('./td[4]/text()').extract().pop()
             version = re.search("[V,v]?\d\.\d", softname)
             if version:
                 version = version.group()
             else:
                 version = ""
             item["productVersion"] = version
             item["publishTime"] = publishtime
             item["productClass"] = ""
             item["productModel"] = model
             item["description"] = softname
             item["url"] = absurl
             item["firmwareName"] = item["url"].split('/')[-1]
             item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
             yield item
             print "firmwarename:", item["firmwareName"]
         else:
             print "qudong :", softname
    def parse_page(self, response):
        urls = response.xpath(
            '//div[@class="download_rig fr"]/a/@href').extract()[0]
        absurl = urlparse.urljoin(response.url, urls)
        info = response.xpath(
            '//div[@class="download_rig fr"]//text()').extract()
        info = "".join(info).strip()
        info = info.encode('utf8')
        modelt = info.split("路由器型号:")[-1]  # split on the "router model" label
        if "固件版本" in modelt:  # "firmware version" label present
            model = modelt.split("固件版本:")[0].strip()
            version = modelt.split("固件版本:")[-1].split("固件大小:")[0].strip()
        else:
            model = modelt.split("固件大小:")[0].strip()  # "firmware size" label
            version = ""
        # "上传日期" = upload date; "软件简介" = software description
        publishtime = modelt.split("上传日期:")[-1].split("软件简介:")[0].strip()
        desc = modelt.split("软件简介:")[-1].split("。")[0].strip()

        item = MI.FirmcrawlerItem()
        item["url"] = absurl
        try:
            res = urllib2.urlopen(urllib2.Request(item["url"], None),
                                  timeout=lblinkSpider.timeout)
            contentType = res.headers["content-type"]
            filename = contentType.split('\"')[1]
            item["firmwareName"] = filename
        except Exception, e:
            print "no firmware name"
            print e
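
The snippet above reads the filename out of the content-type header by splitting on quotes; on most servers that quoted filename actually lives in Content-Disposition, so a more defensive sketch (guess_filename is a hypothetical helper, not project code) would be:

# Sketch: prefer Content-Disposition for the filename, falling back to the
# URL path (Python 2).
import posixpath
import urllib2
import urlparse

def guess_filename(url, timeout=30):
    res = urllib2.urlopen(urllib2.Request(url, None), timeout=timeout)
    disposition = res.headers.get("content-disposition", "")
    if '"' in disposition:
        return disposition.split('"')[1]
    return posixpath.basename(urlparse.urlparse(url).path)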
 def parse(self, response):
     for i in xrange(1, 3 + 1):  # pages 1..3
         url = "http://www.jcgcn.com/list-42-22-%s/" % i
         request = scrapy.Request(url, callback=self.parse_list)
         request.meta["prototype"] = MI.FirmcrawlerItem()
         request.meta["prototype"]["manufacturer"] = "jcg"
         yield request
Example #8
    def parse(self, response):
        ul_list = response.xpath("/html/body/div[2]/div/div/ul")
        # print ul_list
        for ul in ul_list:
            li_list = ul.xpath("./li")
            for li in li_list:
                version = li.xpath("./a/text()").extract().pop()
                # print version
                absurl = li.xpath("./a/@href").extract().pop()
                # print absurl
                filename = absurl.split("/")[-1]
                # print filename

                item = MI.FirmcrawlerItem()
                item["firmwareName"] = filename
                item["productVersion"] = version
                item["productModel"] = ""
                item["productClass"] = ""
                item["publishTime"] = ""
                item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
                item["url"] = absurl
                item["description"] = ""
                item["manufacturer"] = "micropython"

                yield item
                print "firmwarename:", item["firmwareName"]
Example #9
 def parse_page(self, response):
     prototype = response.meta['prototype']
     item = MI.FirmcrawlerItem(prototype)
     tables = response.xpath(
         '//div[@class="table-wrap"]/table/tbody/tr[4]/td[2]/a/@href'
     ).extract().pop()
     absurl = urlparse.urljoin(response.url, tables.replace(' ', '%20'))
     filename = tables.split('/')[-1]
     softname = response.xpath(
         '//div[@class="table-wrap"]/table/tbody/tr[1]/td[2]//text()'
     ).extract().pop().strip()
     desc = response.xpath(
         '//div[@class="table-wrap"]/table/tbody/tr[5]/td[2]//text()'
     ).extract()
     publishtime = response.xpath(
         '//div[@class="table-wrap"]/table/tbody/tr[3]/td[2]//text()'
     ).extract().pop()
     model = softname.split(' ')[0]
     version = softname.split(' ')[-1].split('_')[0]
     item["productVersion"] = version
     item["publishTime"] = publishtime.strip()
     item["productClass"] = ""
     item["productModel"] = model
     item["description"] = str().join(desc).strip()
     item["url"] = absurl
     item["firmwareName"] = filename
     item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
     yield item
     print "firmwarename:", item["firmwareName"]
    def parse_page(self, response):
        li_list = response.xpath('//html/body/div[3]/div/div[2]/ul/li')
        for li in li_list:
            did_value = li.xpath('./@did').extract().pop()
            absurl = "http://www.cn.onkyo.com/2018newsite/Download/" + str(
                did_value) + ".html"
            # print absurl
            file_name = li.xpath('./a/text()').extract().pop()
            filename = file_name.split("固件更新")[0]  # "固件更新" = firmware update
            publishTime = file_name.split("固件更新")[-1]
            # print filename
            # print publishTime

            item = MI.FirmcrawlerItem()
            item["firmwareName"] = filename
            item["productVersion"] = ""
            item["productModel"] = ""
            item["productClass"] = ""
            item["publishTime"] = publishTime
            item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
            item["url"] = absurl
            item["description"] = ""
            item["manufacturer"] = "onkyo"

            yield item
            print "firmwarename:", item["firmwareName"]
Example #11
 def parse(self, response):
     for i in xrange(1, 5 + 1):  #5+1
         url = "http://www.tomaxcom.com/shengjiruanjian/list_30_%s.html" % i
         request = scrapy.Request(url, callback=self.parse_list)
         request.meta["prototype"] = MI.FirmcrawlerItem()
         request.meta["prototype"]["manufacturer"] = "tomax"
         yield request
 def parse_list(self, response):
     tables = response.xpath('//div[@class="list"]/dl//@href').extract()
     for t in tables:
         url = urlparse.urljoin(response.url, t)
         request = scrapy.Request(url, callback=self.parse_page)
         request.meta["prototype"] = MI.FirmcrawlerItem()
         request.meta["prototype"]["manufacturer"] = "tg-net"
         yield request
 def parse(self, response):
     for i in xrange(1, 12 + 1):  #12+1
         url = "http://www.tg-net.cn/download_106_%s.html" % i
         # print "url:",url
         request = scrapy.Request(url, callback=self.parse_list)
         request.meta["prototype"] = MI.FirmcrawlerItem()
         request.meta["prototype"]["manufacturer"] = "TG-NET"
         yield request
 def parse(self, response):
     for i in xrange(1, 25):  # pages 1..24
         url = "http://service.mercurycom.com.cn/download-list.html?p=%s" % i
         # print "url:",url
         request = scrapy.Request(url, callback=self.parse_list)
         request.meta["prototype"] = MI.FirmcrawlerItem()
         request.meta["prototype"]["manufacturer"] = "mercury"
         yield request
Example #15
 def parse(self, response):
     for i in xrange(1, 18 + 1):
         url = "http://service.fastcom.com.cn/download-list.html?classTip=software&p=%s&o=1&ajax=True&paging=False" % i
         # print "url:",url
         request = scrapy.Request(url, callback=self.parse_list)
         request.meta["prototype"] = MI.FirmcrawlerItem()
         request.meta["prototype"]["manufacturer"] = "fast"
         yield request
    def parse_page(self, response):
        # print response.url
        productModel = response.xpath(
            ".//section[@id='page-title']/div/h1/text()"
        ).extract().pop().split("固件")[0]  # strip the "固件" (firmware) suffix
        # print productModel
        div1 = response.xpath(
            ".//div[@id='posts']/div[@class='entry clearfix']")
        # print len(div1)
        div2 = response.xpath(
            ".//div[@id='posts']/div[@class='entry clearfix alt']")
        # print len(div2)
        div = div1 + div2
        # print len(div)

        for d in div:
            absurl = d.xpath("./div[2]/div/div/a[1]/@href").extract().pop()
            # print absurl
            filename = d.xpath("./div[2]/div/div/a[1]/text()").extract().pop()
            desc_info = d.xpath("./div[2]/div/div")
            desc_ = desc_info.xpath('string(.)').extract()
            desc = ""
            for description in desc_:
                description_ = description.strip()
                desc = desc + description_

            # print desc
            # print filename
            version = re.search("v\d.+", filename)
            if version:
                productVersion = version.group()
            else:
                productVersion = ""
            # print productVersion
            publish_Time = d.xpath(
                "./div[2]/div/div/div/ul/li[1]/text()").extract().pop()
            publishTime = ""
            try:
                array = time.strptime(publish_Time, u"%Y年%m月%d日")
                publishTime = time.strftime("%Y-%m-%d", array)
            except Exception, e:
                print e

            # print publishTime

            item = MI.FirmcrawlerItem()
            item["firmwareName"] = filename
            item["publishTime"] = publishTime
            item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
            item["url"] = absurl
            item["description"] = desc
            item["productClass"] = ""
            item["productVersion"] = ""
            item["productModel"] = productModel
            item["manufacturer"] = "egreat"

            yield item
            print "firmwarename:", item["firmwareName"]
    def parse_page(self, response):
        file_stype = response.meta['file_stype']
        productModel = response.meta['productModel']
        tr_list_1 = response.xpath(
            ".//*[@id='page-content']/div[2]/div[3]/table/tbody/tr[position()>1]"
        )
        tr_list_2 = response.xpath(
            ".//*[@id='page-content']/div[2]/div[2]/table/tbody/tr[position()>1]"
        )
        tr_list = tr_list_1 + tr_list_2

        if tr_list_1:
            for tr in tr_list:
                try:

                    href = tr.xpath("./td[1]/a/@href").extract().pop()

                    if href.endswith('/'):
                        request = scrapy.Request(href,
                                                 meta={
                                                     'file_stype': file_stype,
                                                     'productModel':
                                                     productModel
                                                 },
                                                 callback=self.parse_page)
                        yield request

                    else:
                        filename = tr.xpath("./td[1]/a/text()").extract().pop()

                        publish_Time = response.xpath(
                            ".//*[@id='page-content']/div[2]/div[3]/table/tbody/tr[1]/td[3]/text()"
                        ).extract()
                        if publish_Time:
                            publishTime = publish_Time.pop().strip().split()[0]
                        else:
                            publishTime = ""

                        absurl = href

                        item = MI.FirmcrawlerItem()
                        item["firmwareName"] = filename
                        item["publishTime"] = publishTime
                        item["crawlerTime"] = time.strftime(
                            "%Y-%m-%d %H:%M:%S")
                        item["url"] = absurl
                        item["description"] = ""
                        item["productClass"] = ""
                        item["productVersion"] = ""
                        item["productModel"] = productModel
                        item["manufacturer"] = "koolshare"

                        yield item
                        print "firmwarename:", item["firmwareName"]

                except Exception, e:
                    print e.message
Example #18
 def parse(self, response):
     for i in xrange(1, 7 + 1):  # pages 1..7
         url = "http://www.sundray.com.cn/data/32_page_%s.html" % i
         if i == 1:
             url = "http://www.sundray.com.cn/data/32.html"
         request = scrapy.Request(url, callback=self.parse_list)
         request.meta["prototype"] = MI.FirmcrawlerItem()
         request.meta["prototype"]["manufacturer"] = "sundray"
         yield request
Example #19
 def parse_list(self, response):
     tables = response.xpath('//table[@id="con_two_1"]/tr[position()>1]')
     for t in tables:
         urls = t.xpath('./td[1]/a/@href').extract()
         absurl = urlparse.urljoin(response.url, urls[0])
         request = scrapy.Request(absurl, callback=self.parse_page)
         request.meta["prototype"] = MI.FirmcrawlerItem()
         request.meta["prototype"]["manufacturer"] = "tomax"
         yield request
    def parse(self, response):
        for page in xrange(1, 5):
            url_router = "http://www.wayos.com/download/luyougujian/" + str(
                page) + ".html"
            request = scrapy.Request(url_router, callback=self.parse_list)

            request.meta["prototype"] = MI.FirmcrawlerItem()
            request.meta["prototype"]["manufacturer"] = "Wayos"
            yield request

        for page in xrange(1, 3):
            url_app = "http://www.wayos.com/download/APgujian/" + str(
                page) + ".html"
            request = scrapy.Request(url_app, callback=self.parse_list)

            request.meta["prototype"] = MI.FirmcrawlerItem()
            request.meta["prototype"]["manufacturer"] = "Wayos"
            yield request
 def parse_list(self, response):
     lists = response.selector.xpath(
         '//body/table[1]//table//tr[position()>3]')
     for l in lists:
         url = l.xpath('./td[3]//a/@href').extract()
         absurl = urlparse.urljoin(response.url, url[0])
         request = scrapy.Request(absurl, callback=self.parse_page)
         request.meta["prototype"] = MI.FirmcrawlerItem()
         request.meta["prototype"]["manufacturer"] = "draytek"
         yield request
 def parse(self, response):
     div_list = response.xpath(
         '//html/body/div[1]/div[2]/div/div[2]/ul/li[2]/div[position()<3]')
     for div_in in div_list:
         href_list = div_in.xpath('./div/a/@href').extract()
         for href in href_list:
             request = scrapy.Request(href, callback=self.parse_list)
             request.meta["prototype"] = MI.FirmcrawlerItem()
             request.meta["prototype"]["manufacturer"] = "dahuatech"
             yield request
 def parse_page(self, response):
     href_list = response.xpath(
         '//div[@id="right"]/div/div[2]/ul/li/table/tr[position()>1]/td[1]/a/@href'
     ).extract()
     for href in href_list:
         url = urlparse.urljoin(adslrSpider.headurl, href)
         request = scrapy.Request(url, callback=self.parse_list)
         request.meta["prototype"] = MI.FirmcrawlerItem()
         request.meta["prototype"]["manufacturer"] = "adslr"
         yield request
Example #24
    def parse(self, response):
        # print "use ip"
        # iprand = random_proxy_ip()
        # print "random proxy:", iprand
        # request = scrapy.Request(response.url, callback=self.parse_page, meta={'proxy':'http://'+iprand})

        request = scrapy.Request(response.url, callback=self.parse_page)
        request.meta["prototype"] = MI.FirmcrawlerItem()
        request.meta["prototype"]["manufacturer"] = "dd-wrt"
        yield request
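
The commented-out lines above hint at per-request proxying; this is how meta['proxy'] would plug in (random_proxy_ip is a hypothetical helper returning "host:port", not defined in these snippets):

# Sketch: attach a per-request proxy via meta['proxy'].
# random_proxy_ip() is hypothetical -- swap in your own proxy source.
import scrapy

def parse(self, response):
    iprand = random_proxy_ip()
    request = scrapy.Request(response.url,
                             callback=self.parse_page,
                             meta={'proxy': 'http://' + iprand})
    request.meta["prototype"] = MI.FirmcrawlerItem()
    request.meta["prototype"]["manufacturer"] = "dd-wrt"
    yield request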
Example #25
    def parse(self, response):
        for i in range(1, 14):  #13+1
            url_list = "http://www.netcoretec.com/companyfile/2/%23c_companyFile_list-15086754191809347-" + str(
                i)
            # print url_list
            request = scrapy.Request(url_list, callback=self.parse_list)

            request.meta["prototype"] = MI.FirmcrawlerItem()
            request.meta["prototype"]["manufacturer"] = "netcore"
            yield request
    def parse_page(self, response):
        r = response.selector.xpath("//pre").re(
            "<a[ ]*href=\"(.*)\".*>.*</a>[ ]*(.*:.*)\r\n")  # [0-9]{2}
        i = 0
        prototype = response.meta['prototype']
        while i < len(r):
            if r[i][-1] == "/":
                request = scrapy.Request(response.url + r[i],
                                         callback=self.parse_page)
                request.meta["prototype"] = response.meta["prototype"]
                yield request
            elif r[i].rsplit(".").pop().lower() in OpenwrtSpider.suffix:
                item = MI.FirmcrawlerItem(prototype)
                item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
                item["firmwareName"] = r[i]
                item["url"] = response.url + r[i]
                item["productModel"] = ""
                # derive the product model from the firmware name
                firmWareName = item["firmwareName"]

                divName1 = firmWareName.split("-")
                try:
                    # e.g. PandoraBox-realtek-rtl8198c-alpha-fw.bin
                    if divName1[0] == "PandoraBox":
                        likeModel = divName1[1] + "-" + divName1[2]
                    elif divName1[0] == "openwrt":
                        likeModel = divName1[1]
                    elif divName1[1] == "openwrt":
                        likeModel = divName1[2]
                    else:
                        likeModel = ""
                    item["productModel"] = likeModel
                except:
                    pass
                # full OpenWrt firmware images are router firmware
                item["productClass"] = "Router"

                try:
                    p_s = r[i + 1].split(" ")
                    item["publishTime"] = p_s[0]
                    a = item["publishTime"]
                    a = a.strip()
                    try:
                        array = time.strptime(a, u"%d-%b-%Y")
                        item["publishTime"] = time.strftime("%Y-%m-%d", array)
                    except Exception, e:
                        print e
                except Exception, e:
                    print e

                yield item
                print "firmwareName:", item["firmwareName"]
            else:
                OpenwrtSpider.allsuffix.add(r[i].rsplit(".").pop().lower())
            i += 2
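
The while-loop above steps through the .re() results two at a time because the pattern has two capture groups, so matches come back flattened as [href, stamp, href, stamp, ...]; a small self-contained illustration:

# Sketch: .re() with a two-group pattern yields a flat alternating list;
# zip() pairs hrefs with their date stamps again. Sample data is made up.
r = ["backfire/", "10-Mar-2011 12:00",
     "openwrt-ar71xx-generic.bin", "21-Feb-2013 18:02"]
for href, stamp in zip(r[0::2], r[1::2]):
    print href, "->", stamp.split(" ")[0]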
    def parse(self, response):
        div_list = response.xpath('//div[@class ="view-content"]/div')
        for div_info in div_list:
            href = div_info.xpath('./div/span/div/div/div[1]/div[2]/h2/a/@href').extract()
            if href:
                absurl = href[0]
                filename = absurl.split("/")[-1]
                desc = div_info.xpath('./div/span/div/div/div[1]/div[2]/h2/a/text()').extract()[0]
                productModel = desc.split(" ")[0]

                publish_Time = div_info.xpath('./div/span/div/div/div[1]/div[3]/p/text()').extract()
                if publish_Time:
                    publishTime = publish_Time[0].strip()
                else:
                    publishTime = ""

            elif div_info.xpath('./div/span/div/div/div[1]/div[1]/h2/a/@href').extract():
                href = div_info.xpath('./div/span/div/div/div[1]/div[1]/h2/a/@href').extract()
                absurl = href[0]
                filename = absurl.split("/")[-1]
                desc = div_info.xpath('./div/span/div/div/div[1]/div[1]/h2/a/text()').extract()[0]
                productModel = desc.split(" ")[0]
                publish_Time = div_info.xpath('./div/span/div/div/div[1]/div[2]/p/text()').extract()
                if publish_Time:
                    publishTime = publish_Time[0].strip()
                else:
                    publishTime = ""

            else:
                productModel= ""
                absurl = ""
                desc = ""
                publishTime = ""
                filename = ""

            # print absurl
            # print filename
            # print publishTime
            # print desc
            # print productModel

            item = MI.FirmcrawlerItem()
            item["productVersion"] = ""
            item["productClass"] = ""
            item["productModel"] = productModel
            item["description"] = desc
            item["url"] = absurl
            item["firmwareName"] = filename
            item["publishTime"] = publishTime
            item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
            item["manufacturer"] = "u-blox"
            yield item
            print "firmwarename:", item["firmwareName"]
Example #28
    def parse_list(self, response):
        prototype = response.meta['prototype']
        href_list = response.xpath(
            '//div[@id="main-content"]/div[2]/div[2]/div[2]/ul/li/div/section/a/@href'
        ).extract()
        for href in href_list:
            url = urlparse.urljoin(schnerderSpider.headurl, href)

            request = scrapy.Request(url, callback=self.parse_page)
            request.meta["prototype"] = MI.FirmcrawlerItem()
            request.meta["prototype"]["manufacturer"] = "schneider"
            yield request
Example #29
    def parse_page(self, response):
        prototype = response.meta['prototype']
        item = MI.FirmcrawlerItem(prototype)
        filename = response.xpath(
            '//div[@class="technical_support_box_z"]/div/div/text()').extract(
            )
        if filename:
            filename = filename[0]
        else:
            filename = ""

        publishTime = response.xpath(
            '//div[@class="technical_support_box_z"]/div/ul/li[2]/text()'
        ).extract()[0]
        publishTime = publishTime.strip().split(" ")[0]

        absurl = response.xpath(
            '//div[@class="technical_support_box_z_info_box"]/div[5]/ul/li/a/@href'
        ).extract()
        if absurl:
            absurl = absurl[0]
            # print absurl
        else:
            absurl = ""

        desc_li = response.xpath(
            '//div[@class="technical_support_box_z_info_box"]/div[3]/ul/li')
        desc = []
        for desc_info in desc_li:
            desc_ = desc_info.xpath('./font/text()').extract()
            if desc_:
                description = desc_[0]
                # print description
                desc.append(description)
            else:
                description = ""
        if desc:
            desc = " ".join(desc)
        else:
            desc = ""

        item["productVersion"] = ""
        item["publishTime"] = publishTime
        item["productClass"] = ""
        item["productModel"] = ""
        item["description"] = desc
        item["url"] = absurl
        item["firmwareName"] = filename
        item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
        item["manufacturer"] = "jcg"
        yield item
        print "firmwarename:", item["firmwareName"]
Example #30
    def parse_page(self, response):
        li_list = response.xpath(
            "//html/body/div[1]/div[3]/div[1]/div[2]/div/ul/li")
        for li in li_list:
            filename = li.xpath("./h3/a/text()").extract().pop().strip()
            href = li.xpath("./h3/a/@href").extract().pop()
            absurl = urlparse.urljoin(self.headurl, href)

            description = li.xpath("./p[2]/text()").extract()
            if description:
                desc = description.pop()
            else:
                desc = ""

            productModel = desc.split(" ")[0]

            publish_Time = desc.split(" ")[-1]
            publishTime = ""
            if publish_Time:
                publish_Time_ = re.search(r"\d.+.\d", publish_Time)
                if publish_Time_:
                    publishTime = publish_Time_.group()
            # print publishTime

            version_info = re.search("V.*", desc)
            if version_info:
                version = version_info.group()
                productVersion = version.split(" ")[0]
            else:
                productVersion = ""

            item = MI.FirmcrawlerItem()
            item["firmwareName"] = filename
            item["publishTime"] = publishTime
            item["crawlerTime"] = time.strftime("%Y-%m-%d %H:%M:%S")
            item["url"] = absurl
            item["description"] = ""
            item["productClass"] = ""
            item["productVersion"] = productVersion
            item["productModel"] = productModel
            item["manufacturer"] = "newgreennet"

            yield item
            print "firmwarename:", item["firmwareName"]