def parse(self, response):
		item = BaseItem()
		speci_list = []
		pack_list = []
		intro_list = []
		details_list = []
		item['productUrl'] = ''
		item['productName'] = ''
		item['productBrand'] = ''
		item['productModel'] = ''
		item['productClassification'] = ''
		item['productPrice'] = ''
		item['productImagePath'] = ''
		item['productAddres'] = ""
		item['productCompany'] = ''
		item['fileName'] = ''
		item['productDetails'] = ""
		item['productPack'] = ""
		item['productIntro'] = ""
		item['productSpeci'] = ""
		classification_one = ''
		classification_two = ''
		classification_three = ''
		try:
			item['productUrl'] = response.url
		except:
			pass
		try:
			item['productName'] = response.xpath("//div[@class='productDetail product-detail-repair']/h1/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
Example #2
	def parse(self, response):
		item = BaseItem()
		details_list = []
		pack_list = []
		intro_list = []
		speci_list = []
		item['productUrl'] = ''
		item['productName'] = ''
		item['productBrand'] = ''
		item['productModel'] = ''
		item['productClassification'] = ''
		item['productPrice'] = ''
		item['productImagePath'] = ''
		item['productAddres'] = ""
		item['productCompany'] = ''
		item['fileName'] = ''
		item['productDetails'] = ""
		item['productPack'] = ""
		item['productIntro'] = ""
		item['productSpeci'] = ""
		classification_one = ''
		classification_two = ''
		classification_three = ''
		try:
			classification_one = response.xpath("//div[@class='breadcrumbs']/a[3]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
			classification_two = response.xpath("//div[@class='breadcrumbs']/a[4]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
			classification_three = response.xpath("//div[@class='breadcrumbs']/a[5]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		classification = classification_one + '|||' + classification_two + '|||' +classification_three
		try:
			item['productUrl'] = response.url
		except:
			pass
		try:
			item['productName'] = response.xpath('//div[@class="tl-wrap-g"]/div[@class = "title"]/h3/b/text()').extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
	def parse(self, response):
		#filename = response.url.split("/")[-2]
		#open(filename, 'wb').write(response.body)
		#start browser
		#self.driver.get(response.url)
		#loading time interval
		#time.sleep(5)
		item = BaseItem()
		speci_list = []
		pack_list = []
		intro_list = []
		details_list = []
		item['productUrl'] = ''
		item['productName'] = ''
		item['productBrand'] = ''
		item['productModel'] = ''
		item['productClassification'] = ''
		item['productPrice'] = ''
		item['productImagePath'] = ''
		item['productAddres'] = ""
		item['productCompany'] = ''
		item['fileName'] = ''
		item['productDetails'] = ""
		item['productPack'] = ""
		item['productIntro'] = ""
		item['productSpeci'] = ""
		classification_one = ''
		classification_two = ''
		classification_three = ''
		try:
			item['productUrl'] = response.url
		except:
			pass
		try:
			item['productName'] = response.xpath("//div[@class='textInfo']/form/div/h1/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
 def parse(self, response):
     item = BaseItem()
     item['productUrl'] = ''
     item['productName'] = ''
     item['productBrand'] = ''
     item['productModel'] = ''
     item['productClassification'] = ''
     item['productPrice'] = ''
     item['productImagePath'] = ''
     item['productAddres'] = ""
     item['productCompany'] = ''
     item['fileName'] = ''
     classification_one = ''
     classification_two = ''
     classification_three = ''
     try:
         classification_two = response.xpath(
             "//dd[@class='crumb_item'][1]/a/text()").extract()[0].encode(
                 'utf-8').replace("\"", "\'").strip()
         classification_three = response.xpath(
             "//div[@class='crumb']/dl/dd[@class='crumb_item'][2]/a/text()"
         ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
         classification_one = response.xpath(
             "//dd[@class='crumb_item'][13]/a/text()").extract()[0].encode(
                 'utf-8').replace("\"", "\'").strip()
     except:
         pass
     classification = classification_one + '|||' + classification_two + '|||' + classification_three
     try:
         item['productUrl'] = response.url
     except:
         pass
     try:
         item['productName'] = response.xpath(
             "//form[@id='form1']/ul/li[@class='tit']/text()").extract(
             )[0].encode('utf-8').replace("\"", "\'").strip()
     except:
         pass
     list_brand = ''
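     # scan the spec <li> labels to find the brand row, then read that row's
     # value text into item['productBrand']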
     for j in range(1, 20):
         try:
             list_brand = response.xpath(
                 "//form[@id='form1']/ul/li[%i]/label/text()" %
                 j).extract()[0].encode('utf-8').replace("\"",
                                                         "\'").strip()
             if '品    牌:' == list_brand:
                 item['productBrand'] = response.xpath(
                     "//form[@id='form1']/ul/li[%i]/text()" %
                     j).extract()[0].encode('utf-8').replace("\"",
                                                             "\'").strip()
                 break
         except:
             pass
     try:
         item['productModel'] = response.xpath(
             '//div[@class="m m1"]/div/ul/dt/li/text()').extract(
             )[0].encode('utf-8').replace("\"", "\'").strip()
     except:
         pass
     try:
         item['productClassification'] = classification
     except:
         pass
     try:
         # strip whitespace, convert yuan to cents, drop the RMB symbol
         item['productPrice'] = response.xpath(
             "//strong[@class='orange price_tit']/text()").extract(
             )[0].encode('utf-8').replace("\"", "\'").strip()
     except:
         pass
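     # keep only digits, '.' and '~' (Python 2 filter() on a str returns a
     # str), then convert the yuan amount to cents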
     try:
         item['productPrice'] = str(
             float(
                 filter(lambda ch: ch in '0123456789.~',
                        item['productPrice'])) * 100)
     except:
         pass
     # image URL
     try:
         item['productImagePath'] = response.xpath(
             "//div[@class='bd']/div/div/p/span/img/@src").extract(
             )[0].encode('utf-8').replace("\"", "\'").strip()
     except:
         pass
     #print item['image_urls'],"777777"
     try:
         item['productAddres'] = response.xpath(
             "//form[@id='form1']/ul/li[4]/text()").extract()[0]
     except:
         pass
     try:
         item['productCompany'] = ""
     except:
         pass
     names = self.name + '.json'
     try:
         item['fileName'] = names
     except:
         pass
     yield item
Example #5
    def parse(self, response):
        item = BaseItem()
        speci_list = []
        pack_list = []
        intro_list = []
        details_list = []
        item['productUrl'] = ''
        item['productName'] = ''
        item['productBrand'] = ''
        item['productModel'] = ''
        item['productClassification'] = ''
        item['productPrice'] = ''
        item['productImagePath'] = ''
        item['productAddres'] = ""
        item['productCompany'] = ''
        item['fileName'] = ''
        item['productDetails'] = ""
        item['productPack'] = ""
        item['productIntro'] = ""
        item['productSpeci'] = ""
        classification_one = ''
        classification_two = ''
        classification_three = ''
        try:
            classification_one = response.xpath(
                "//div[@class='Navigation']/span[3]/a/text()").extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
            classification_two = response.xpath(
                "//div[@class='Navigation']/span[5]/a/text()").extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
            classification_three = response.xpath(
                "//div[@class='Navigation']/span[7]/a/text()").extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        classification = classification_one + '|||' + classification_two + '|||' + classification_three
        try:
            item['productUrl'] = response.url
        except:
            pass
        try:
            item['productName'] = response.xpath(
                "//form[@class='goods-action']/h1[@class='goodsname']/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        list_brand = ''
        for j in range(1, 20):
            try:
                list_brand = response.xpath(
                    "//ul[@class='goodsprops clearfix']/li[%i]/span/text()" %
                    j).extract()[0].encode('utf-8').replace("\"",
                                                            "\'").strip()
                if '品  牌:' in list_brand:
                    item['productBrand'] = response.xpath(
                        "//ul[@class='goodsprops clearfix']/li[%i]/a/text()" %
                        j).extract()[0].encode('utf-8').replace("\"",
                                                                "\'").strip()
                    break
            except:
                pass
        try:
            item['productModel'] = response.xpath(
                "//ul[@class='goodsprops clearfix']/li/text()").extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productClassification'] = classification
        except:
            pass
        try:
            # strip whitespace, convert yuan to cents, drop the RMB symbol
            item['productPrice'] = response.xpath(
                "//ul[@class='goods-price list']/li/span[@class='price1']/text()"
            ).extract()[0].encode('utf-8').replace("¥", "").strip()
        except:
            pass
        try:
            item['productPrice'] = str(
                float(
                    filter(lambda ch: ch in '0123456789.~',
                           item['productPrice'])) * 100)
        except:
            pass
        # image URL
        try:
            item['productImagePath'] = response.xpath(
                "//div[@class='goodspic']/div[@class='goods-detail-pic']/a/img/@src"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        #print item['image_urls'],"777777"
        try:
            item['productAddres'] = response.xpath(
                "//form[@id='form1']/ul/li[4]/text()").extract()[0]
        except:
            pass
        try:
            item['productCompany'] = ""
        except:
            pass
        names = self.name + '.json'
        try:
            item['fileName'] = names
        except:
            pass
        list_details = response.xpath(
            "//div[@id='para']/table[1]/tbody[2]/tr/td/text()").extract()
        logging.info("-------list_details_len=%i" % len(list_details))
        list_pack = response.xpath(
            "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
        ).extract()
        intro = response.xpath("//span[@id='PDescriptiion']/text()").extract()
        logging.info("-------intr_len=%i" % len(intro))
        speci = response.xpath("//span[@id='techParam']/text()").extract()
        logging.info("-------intr_len=%i" % len(speci))
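        # list_details alternates key, value, key, value, ...; pair the
        # entries up as {'attrkey': key, 'keyname': value}, skipping the
        # brand/model rows that were already captured above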
        num_one = 1
        for value_details in list_details:
            value_details = value_details.encode('utf-8').replace(
                ":", "\/").replace("\n", "").replace("\"", "").strip()
            if num_one % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '品牌' in value_details:
                    num_one = 0
                    continue
                if '型号' in value_details:
                    num_one = 0
                    continue
                data2['attrkey'] = value_details
            else:
                if num_one == 0:
                    num_one = 1
                    continue
                data2['keyname'] = value_details
                details_list.append(data2)
            num_one += 1

        num_two = 1
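        # intro and speci entries are "key:value" strings; split each on ':'
        # and pair the pieces the same way, bailing out of an entry whose key
        # is the product name or brand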
        for list_intro in intro:
            list_intro = list_intro.encode('utf-8').replace("\n", "").replace(
                "\"", "").strip()
            list_intro = list_intro.split(':')
            for value_intro in list_intro:
                if num_two % 2 == 1:
                    data2 = {}
                    data2['attrkey'] = ''
                    data2['keyname'] = ''
                    if '商品名称' in value_intro:
                        break
                    if '品牌' in value_intro:
                        break
                    data2['attrkey'] = value_intro
                else:
                    data2['keyname'] = value_intro
                    intro_list.append(data2)
                num_two += 1

        num_three = 1
        for list_speci in speci:
            list_speci = list_speci.encode('utf-8').replace("\n", "").replace(
                "\"", "").strip()
            list_speci = list_speci.split(':')
            for value_speci in list_speci:
                if num_three % 2 == 1:
                    data2 = {}
                    data2['attrkey'] = ''
                    data2['keyname'] = ''
                    if '商品名称' in value_speci:
                        break
                    if '品牌' in value_speci:
                        break
                    data2['attrkey'] = value_speci
                else:
                    data2['keyname'] = value_speci
                    speci_list.append(data2)
                num_three += 1

        product_intro = response.xpath(
            "//span[@id='PDescription']/text()").extract()
        product_pack = response.xpath(
            "//td[@id='imgDiv']/div[@id='div3']/font/b/text()").extract()
        product_details = response.xpath(
            "//div[@id='goods-intro']/p/text()").extract()
        filename = self.name + ".txt"
        file = open("data/" + filename, 'a+')
        file.write("\n" + "productUrl:" + response.url + "\n")
        file.write("productDetails:" + "\n")
        for details in product_details:
            details = details.encode('utf-8').replace("\b", "").replace(
                "<br/>", "").replace("<br>", "").strip()
            file.write(details + "\n")
        file.close()

        item['productSpeci'] = speci_list
        item['productPack'] = pack_list
        item['productIntro'] = intro_list
        item['productDetails'] = details_list
        yield item
    def parse(self, response):
        item = BaseItem()
        speci_list = []
        pack_list = []
        intro_list = []
        details_list = []
        item['productUrl'] = ''
        item['productName'] = ''
        item['productBrand'] = ''
        item['productModel'] = ''
        item['productClassification'] = ''
        item['productPrice'] = ''
        item['productImagePath'] = ''
        item['productAddres'] = ""
        item['productCompany'] = ''
        item['fileName'] = ''
        item['productDetails'] = ""
        item['productPack'] = ""
        item['productIntro'] = ""
        item['productSpeci'] = ""
        classification_one = ''
        classification_two = ''
        classification_three = ''
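        # render the page with PhantomJS (presumably because parts of it are
        # built by JavaScript) and wrap the rendered source in an HtmlResponse;
        # the price below is read from that rendered response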
        print "PhantomJS is starting1..."
        driver = webdriver.PhantomJS()
        driver.get(response.url)
        #time.sleep(3)
        body = driver.page_source
        #driver.close()
        HtmlResponses = HtmlResponse(driver.current_url,
                                     body=body,
                                     encoding='utf-8',
                                     request=response)
        try:
            item['productUrl'] = response.url
        except:
            pass
        try:
            item['productName'] = response.xpath(
                "//div[@id='product_information']/div[@class='product-titles']/h2/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productBrand'] = response.xpath(
                "//div[@class='product-concerns']/ul/li[@class='item'][3]/span[@class='detail']/i[@class='minor']/em[@class='action-mktprice']/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productModel'] = response.xpath(
                "//div[@class='product-concerns']/ul/li[@class='item'][4]/span[@class='detail']/i[@class='minor']/em[@class='action-mktprice']/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            classification_one = response.xpath(
                "//div[@id='p_navbar']/a[2]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            classification_two = response.xpath(
                "//div[@id='p_navbar']/a[3]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            classification_three = response.xpath(
                "//div[@id='p_navbar']/a[4]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            item[
                'productClassification'] = classification_one + '|||' + classification_two + '|||' + classification_three
        except:
            pass
        try:
            item['productPrice'] = HtmlResponses.xpath(
                "//ins[@class='action-price']/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productPrice'] = str(
                float(
                    filter(lambda ch: ch in '0123456789.~',
                           item['productPrice'])) * 100)
        except:
            pass
        try:
            ImagePath = response.xpath(
                '//div[@class="product-album-pic"]/a/@href').extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
            ImagePath = ImagePath.split('?')[0]  # take the URL before any query string
            item['productImagePath'] = ImagePath
        except:
            pass
        try:
            item['productAddres'] = response.xpath(
                "//form[@id='form1']/ul/li[4]/text()").extract()[0]
        except:
            pass
        try:
            item['productCompany'] = ""
        except:
            pass
        names = self.name + '.json'
        try:
            item['fileName'] = names
        except:
            pass
        list_details = response.xpath(
            "//div[@id='product_detail']/div[@class='product-attributes']/ul[@class='clearfix']/li/text()"
        ).extract()
        details = response.xpath(
            "//ul[@class='inLeft_attributes']/li/span/text()").extract()
        logging.info("-------list_details_len=%i" % len(list_details))
        logging.info("-------details_len=%i" % len(details))
        list_pack = response.xpath(
            "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
        ).extract()
        intro = response.xpath("//span[@id='PDescription']/text()").extract()
        logging.info("-------intr_len=%i" % len(intro))
        speci = response.xpath("//span[@id='techParam']/text()").extract()
        logging.info("-------intr_len=%i" % len(speci))
        num_one = 1
        for value_details in list_details:
            value_details = value_details.encode('utf-8').replace(
                "\"", "\'").strip()
            if '品牌' in value_details:
                continue
            else:
                details = value_details.split(':')
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                data2['attrkey'] = details[0]
                data2['keyname'] = details[1]
                details_list.append(data2)
            num_one += 1

        num_two = 1
        for list_intro in intro:
            list_intro = list_intro.encode('utf-8').replace("\n", "").replace(
                "\"", "").strip()
            list_intro = list_intro.split(':')
            for value_intro in list_intro:
                if num_two % 2 == 1:
                    data2 = {}
                    data2['attrkey'] = ''
                    data2['keyname'] = ''
                    if '商品名称' in value_intro:
                        break
                    if '品牌' in value_intro:
                        break
                    data2['attrkey'] = value_intro
                else:
                    data2['keyname'] = value_intro
                    intro_list.append(data2)
                num_two += 1

        num_three = 1
        for list_speci in speci:
            list_speci = list_speci.encode('utf-8').replace("\n", "").replace(
                "\"", "").strip()
            list_speci = list_speci.split(':')
            for value_speci in list_speci:
                if num_three % 2 == 1:
                    data2 = {}
                    data2['attrkey'] = ''
                    data2['keyname'] = ''
                    if '商品名称' in value_speci:
                        break
                    if '品牌' in value_speci:
                        break
                    data2['attrkey'] = value_speci
                else:
                    data2['keyname'] = value_speci
                    speci_list.append(data2)
                num_three += 1

        item['productSpeci'] = speci_list
        item['productPack'] = pack_list
        item['productIntro'] = intro_list
        item['productDetails'] = details_list
        yield item
Example #7
    def parse(self, response):
        item = BaseItem()
        speci_list = []
        pack_list = []
        intro_list = []
        details_list = []
        item['productUrl'] = ''
        item['productName'] = ''
        item['productBrand'] = ''
        item['productModel'] = ''
        item['productClassification'] = ''
        item['productPrice'] = ''
        item['productImagePath'] = ''
        item['productAddres'] = ""
        item['productCompany'] = ''
        item['fileName'] = ''
        item['productDetails'] = ""
        item['productPack'] = ""
        item['productIntro'] = ""
        item['productSpeci'] = ""
        classification_one = ''
        classification_two = ''
        classification_three = ''
        try:
            classification_one = response.xpath(
                "//div[@class='layout']/div[@class='path']/a[2]/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
            classification_two = response.xpath(
                "//div[@class='layout']/div[@class='path']/a[3]/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
            classification_three = response.xpath(
                "//div[@class='layout']/div[@class='path']/a[4]/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        classification = classification_one + '|||' + classification_two + '|||' + classification_three
        item['productUrl'] = response.url
        try:
            item['productName'] = response.xpath(
                "//div[@class='prodetails']/h1[@class='protitle']/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        list_brand = []
        try:
            list_brand = response.xpath(
                "//ul[@class='list2 clf']/li[@class='itm']/span[@class='dt']/text()"
            ).extract()
            for j in range(1, len(list_brand)):
                brand = response.xpath(
                    "//ul[@class='list2 clf']/li[@class='itm'][%i]/span[@class='dt']/text()"
                    % j).extract()[0].encode('utf-8')
                model = response.xpath(
                    "//ul[@class='list2 clf']/li[@class='itm'][%i]/span[@class='dt']/text()"
                    % j).extract()[0].encode('utf-8')
                if '品牌' in brand:
                    item['productBrand'] = response.xpath(
                        "//ul[@class='list1 clf']/li[@class='itm'][%i]/span[@class='dd']/text()"
                        % j).extract()[0].encode('utf-8').replace(
                            "\"", "\'").strip()
                if '型号' in model:
                    item['productModel'] = response.xpath(
                        "//ul[@class='list1 clf']/li[@class='itm'][%i]/span[@class='dd']/text()"
                        % j).extract()[0].encode('utf-8').replace(
                            "\"", "\'").strip()
        except:
            pass
        try:
            item['productClassification'] = classification
        except:
            pass
        try:
            price = response.xpath("//em[@class='prc']/b/text()").extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
            if price == '':
                price = '0.0'
        except:
            pass
        try:
            item['productPrice'] = str(
                float(filter(lambda ch: ch in '0123456789.~', price)) * 100)
        except:
            pass
        try:
            item['productImagePath'] = "http:" + response.xpath(
                "//li[@class='img-itm active']/div[@class='img-box']/img/@src"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        item['productAddres'] = ""
        item['productCompany'] = ""
        names = self.name + '.json'
        item['fileName'] = names
        list_details = response.xpath(
            "//ul[@class='list1 clf']/li[@class='itm']/span/text()").extract()
        logging.info("-------list_details_len=%i" % len(list_details))
        list_pack = response.xpath(
            "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
        ).extract()
        intro = response.xpath("//span[@id='PDescriptiion']/text()").extract()
        logging.info("-------intr_len=%i" % len(intro))
        speci = response.xpath("//span[@id='techParam']/text()").extract()
        logging.info("-------intr_len=%i" % len(speci))
        num_one = 1
        for value_details in list_details:
            value_details = value_details.encode('utf-8').replace(
                ":", "").replace("\n", "").replace("\"", "").strip()
            if num_one % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '品牌' in value_details:
                    num_one = 0
                    continue
                if '型号' in value_details:
                    num_one = 0
                    continue
                data2['attrkey'] = value_details
            else:
                if num_one == 0:
                    num_one = 1
                    continue
                data2['keyname'] = value_details
                details_list.append(data2)
            num_one += 1

        num_two = 1
        for list_intro in intro:
            list_intro = list_intro.encode('utf-8').replace("\n", "").replace(
                "\"", "").strip()
            list_intro = list_intro.split(':')
            for value_intro in list_intro:
                if num_two % 2 == 1:
                    data2 = {}
                    data2['attrkey'] = ''
                    data2['keyname'] = ''
                    if '商品名称' in value_intro:
                        break
                    if '品牌' in value_intro:
                        break
                    data2['attrkey'] = value_intro
                else:
                    data2['keyname'] = value_intro
                    intro_list.append(data2)
                num_two += 1

        num_three = 1
        for list_speci in speci:
            list_speci = list_speci.encode('utf-8').replace("\n", "").replace(
                "\"", "").strip()
            list_speci = list_speci.split(':')
            for value_speci in list_speci:
                if num_three % 2 == 1:
                    data2 = {}
                    data2['attrkey'] = ''
                    data2['keyname'] = ''
                    if '商品名称' in value_speci:
                        break
                    if '品牌' in value_speci:
                        break
                    data2['attrkey'] = value_speci
                else:
                    data2['keyname'] = value_speci
                    speci_list.append(data2)
                num_three += 1

        product_details = response.xpath(
            "//div[@class='pro-main']/div[@class='con'][1]/text()").extract()
        filename = self.name + ".txt"
        file = open("data/" + filename, 'a+')
        file.write("\n" + "productUrl:" + response.url + "\n")
        file.write("productDetails:" + "\n")
        for details in product_details:
            details = details.encode('utf-8').replace("\b", "").replace(
                "<br/>", "").replace("<br>", "").strip()
            file.write(details + "\n")
        file.close()

        item['productSpeci'] = speci_list
        item['productPack'] = pack_list
        item['productIntro'] = intro_list
        item['productDetails'] = details_list
        yield item
	def parse(self, response):
		item = BaseItem()
		details_list = []
		pack_list = []
		intro_list = []
		speci_list = []
		item['productUrl'] = ''
		item['productName'] = ''
		item['productBrand'] = ''
		item['productModel'] = ''
		item['productClassification'] = ''
		item['productPrice'] = ''
		item['productImagePath'] = ''
		item['productAddres'] = ""
		item['productCompany'] = ''
		item['fileName'] = ''
		item['productDetails'] = ""
		item['productPack'] = ""
		item['productIntro'] = ""
		item['productSpeci'] = ""
		classification_one = ''
		classification_two = ''
		classification_three = ''
		try:
			item['productUrl'] = response.url
		except:
			pass
		try:
			item['productName'] = response.xpath("//h1[@class='prodbaseinfo_title']/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		try:
			item['productBrand'] = response.xpath("//ul[@class='ul_list']/li[@class='fg14'][1]/div[1]/a/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		try:
			item['productModel'] = response.xpath("//ul[@class='ul_list']/li[@class='fg14'][2]/p/font/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		try:
			classification_one = response.xpath("//div[@class='location']/a[2]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
			classification_two = response.xpath("//div[@class='location']/a[3]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
			classification_three = response.xpath("//div[@class='location']/a[4]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
			item['productClassification'] = classification_one + '|||' + classification_two + '|||' +classification_three
		except:
			pass
		try:
			item['productPrice'] = response.xpath("//ul[@class='ul_list']/li[@class='fg14'][3]/p/span[@id='attr_price']/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		try:
			item['productPrice'] = str(float(filter(lambda ch: ch in '0123456789.~', item['productPrice']))*100)
		except:
			pass
		try:
			item['productImagePath'] = response.xpath('//div[@id="wrap"]/a/@href').extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		try:
			item['productAddres'] = ""
		except:
			pass
		try:
			item['productCompany'] = ""
		except:
			pass
		names = self.name+'.json'
		try:
			item['fileName'] = names
		except:
			pass
		list_details = response.xpath("//div[@id='para']/table[1]/tbody[2]/tr/td/text()").extract()
		logging.info("-------list_details_len=%i" %len(list_details))
		list_pack = response.xpath("//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()").extract()
		intro = []
		try:
			intro_p = response.xpath("//li[@class='pb10p']/blockquote/table[2]/tbody/tr/td/p/text()").extract()
			logging.info("-------intro_p_len=%i" %len(intro_p))
			intro = response.xpath("//li[@class='pb10pppp']/blockquote/table[2]/tbody/tr/td/text()").extract()
			logging.info("-------intro_len=%i" %len(intro))
		except:
			pass
		speci = []
		try:
			speci_p = response.xpath("//li[@class='pb10p']/blockquote/table[2]/td/p/text()").extract()
			logging.info("-------speci_p_len=%i" %len(speci_p))
			speci = response.xpath("//li[@class='pb10pppp']/blockquote/table[2]/tbody/text()").extract()
			logging.info("-------speci_len=%i" %len(speci))
		except:
			pass
		num_one=1
		for value_details in list_details :
			value_details = value_details.encode('utf-8').replace(":","\/").replace("\n","").replace("\"","").strip()
			if num_one%2==1 :
				data2 = {}
				data2['attrkey'] = ''
				data2['keyname'] = ''
				if '品牌' in value_details:
					num_one=0
					continue
				if '型号' in value_details:
					num_one=0
					continue
				data2['attrkey']=value_details
			else:
				if num_one ==0:
					num_one = 1
					continue
				data2['keyname']=value_details
				details_list.append(data2)
			num_one+=1

		num_two=1
		for list_intro in intro:
			list_intro = list_intro.encode('utf-8').replace("\n","").replace(":","").strip()
			if num_two%2==1:
				data2 = {}
				data2['attrkey'] = ''
				data2['keyname'] = ''
				if '型号' in list_intro:
					num_two +=1
					continue
				data2['attrkey']=list_intro
			else:
				if num_two == 3:
					num_two +=1
					continue
				data2['keyname']=list_intro
				if num_two == 4:
					num_two =6
					continue
				intro_list.append(data2)
			num_two +=1


		num_three=1
		for list_speci in speci:
			list_speci = list_speci.encode('utf-8').replace("\n","").replace("\"","").strip()
			if num_three%2==1 :
				data2 = {}
				data2['attrkey'] = ''
				data2['keyname'] = ''
				if '商品名称' in list_speci:
					break
				if '品牌' in list_speci:
					break
				data2['attrkey']=list_speci
			else:
				data2['keyname']=list_speci
				speci_list.append(data2)
			num_three+=1

		intro_file = response.xpath("//li[@class='pb10p']/blockquote/table[2]/tbody/tr/td/text()").extract()
		filename = self.name+".txt"
		file = open("data/"+filename, 'a+')
		file.write("\n"+"productUrl:"+response.url+"\n")
		file.write("\n"+"productIntro:"+"\n")
		for list_intro in intro_file:
			list_intro = list_intro.encode('utf-8').replace("\n","").replace(":","").strip()
			file.write(list_intro+"\n")
		file.close()

		item['productSpeci'] = speci_list
		item['productPack'] = pack_list
		item['productIntro'] = intro_list
		item['productDetails'] = details_list
		yield item
Example #9
    def parse(self, response):
        item = BaseItem()
        details_list = []
        pack_list = []
        intro_list = []
        speci_list = []
        item['productUrl'] = ''
        item['productName'] = ''
        item['productBrand'] = ''
        item['productModel'] = ''
        item['productClassification'] = ''
        item['productPrice'] = ''
        item['productImagePath'] = ''
        item['productAddres'] = ""
        item['productCompany'] = ''
        item['fileName'] = ''
        item['productDetails'] = ""
        item['productPack'] = ""
        item['productIntro'] = ""
        item['productSpeci'] = ""
        classification_one = ''
        classification_two = ''
        classification_three = ''
        try:
            item['productUrl'] = response.url
        except:
            pass
        try:
            item['productName'] = response.xpath(
                '//div[@class="goods_info"]/div/h1/text()').extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        brand = ''
        try:
            brand = response.xpath(
                "//div[@id='con_goods_1']/ul[@class='detail-list']/li[3]/a/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        item['productBrand'] = brand
        try:
            item['productModel'] = response.xpath(
                '//div[@class="m m1"]/div/ul/dt/li/text()').extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            classification_one = response.xpath(
                "//div[@class='breadcrumb']/a[2]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            classification_two = response.xpath(
                "//div[@class='breadcrumb']/a[3]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            classification_three = response.xpath(
                "//div[@class='breadcrumb']/a[4]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        classification = classification_one + '|||' + classification_two + '|||' + classification_three
        item['productClassification'] = classification
        try:
            # strip whitespace, convert yuan to cents, drop the RMB symbol
            item['productPrice'] = response.xpath(
                '//strong[@class="p-price"]/font/text()').extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productPrice'] = str(
                float(
                    filter(lambda ch: ch in '0123456789.~',
                           item['productPrice'])) * 100)
        except:
            pass
        # image URL
        try:
            item[
                'productImagePath'] = "http://www.91yilong.com" + response.xpath(
                    '//div[@class="goods_img"]/a/@href').extract()[0].encode(
                        'utf-8').replace("\"", "\'").strip()
        except:
            pass
        #print item['image_urls'],"777777"
        try:
            item['productAddres'] = ""
        except:
            pass
        try:
            item['productCompany'] = ""
        except:
            pass
        names = self.name + '.json'
        try:
            item['fileName'] = names
        except:
            pass
        list_details = response.xpath(
            "//div[@id='para']/table[1]/tbody[2]/tr/td/text()").extract()
        logging.info("-------list_details_len=%i" % len(list_details))
        list_pack = response.xpath(
            "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
        ).extract()
        intro = response.xpath(
            "//div[@id='con_goods_1']/ul[@class='detail-list']/li/text()"
        ).extract()
        logging.info("-------intr_len=%i" % len(intro))
        speci = response.xpath(
            "//div[@id='con_goods_2']/ul/li/text()").extract()
        logging.info("-------intr_len=%i" % len(speci))
        num_one = 1
        for value_details in list_details:
            value_details = value_details.encode('utf-8').replace(
                ":", "\/").replace("\n", "").replace("\"", "").strip()
            if num_one % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '品牌' in value_details:
                    num_one = 0
                    continue
                if '型号' in value_details:
                    num_one = 0
                    continue
                data2['attrkey'] = value_details
            else:
                if num_one == 0:
                    num_one = 1
                    continue
                data2['keyname'] = value_details
                details_list.append(data2)
            num_one += 1

        num_two = 1
        for list_intro in intro:
            list_intro = list_intro.encode('utf-8').replace("\n", "").replace(
                "\"", "").strip()
            list_intro = list_intro.split(':')
            for value_intro in list_intro:
                if num_two % 2 == 1:
                    data2 = {}
                    data2['attrkey'] = ''
                    data2['keyname'] = ''
                    if '商品名称' in value_intro:
                        break
                    if '品牌' in value_intro:
                        break
                    data2['attrkey'] = value_intro
                else:
                    data2['keyname'] = value_intro
                    intro_list.append(data2)
                num_two += 1

        num_three = 1
        for list_speci in speci:
            list_speci = list_speci.encode('utf-8').replace("\n", "").replace(
                "\"", "").strip()
            list_speci = list_speci.split(':')
            for value_speci in list_speci:
                if num_three % 2 == 1:
                    data2 = {}
                    data2['attrkey'] = ''
                    data2['keyname'] = ''
                    if '商品名称' in value_speci:
                        break
                    if '品牌' in value_speci:
                        break
                    data2['attrkey'] = value_speci
                else:
                    data2['keyname'] = value_speci
                    speci_list.append(data2)
                num_three += 1

        item['productSpeci'] = speci_list
        item['productPack'] = pack_list
        item['productIntro'] = intro_list
        item['productDetails'] = details_list
        yield item
	def parse(self, response):
		item = BaseItem()
		speci_list = []
		pack_list = []
		intro_list = []
		details_list = []
		item['productUrl'] = ''
		item['productName'] = ''
		item['productBrand'] = ''
		item['productModel'] = ''
		item['productClassification'] = ''
		item['productPrice'] = ''
		item['productImagePath'] = ''
		item['productAddres'] = ""
		item['productCompany'] = ''
		item['fileName'] = ''
		item['productDetails'] = ""
		item['productPack'] = ""
		item['productIntro'] = ""
		item['productSpeci'] = ""
		classification_one = ''
		classification_two = ''
		classification_three = ''
		print "PhantomJS is starting..."
		driver = webdriver.PhantomJS()
		# driver = webdriver.Chrome()
		driver.get(response.url)
		time.sleep(3)
		body = driver.page_source
		#driver.close()
		HtmlResponses = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=response)
		try:
			item['productUrl'] = response.url
		except:
			pass
		try:
			item['productName'] = HtmlResponses.xpath("//div[@class='detail-goods-right']/h1[@class='detail-goods-right-head ft18 J_title']/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		try:
			item['productBrand'] = HtmlResponses.xpath("//div[@class='main-width bread-top-main J_bread']/a[2]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
			if '京东' in item['productBrand']:
				item['productBrand'] = ''
		except:
			pass
		try:
			item['productModel'] = HtmlResponses.xpath("//div[@class='detail-goods-right']/div[@class='detail-goods-right-list m-top15 J_goods']/span[2]/label/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		try:
			classification_one = response.xpath("//div[@id='ur_here']/a[2]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
			classification_two = response.xpath("//div[@id='ur_here']/a[3]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
			classification_three = response.xpath("//div[@id='ur_here']/a[4]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
			item['productClassification'] = classification_one + '|||' + classification_two + '|||' +classification_three
		except:
			pass
		try:
		# strip whitespace, convert yuan to cents, drop the RMB symbol
			item['productPrice'] = HtmlResponses.xpath("//div[@class='detail-goods-right']/div[@class='detail-goods-price m-top15']/ul/li[1]/label[@class='ft24 a weight J_salePrice']/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		try:
			item['productPrice'] = str(float(filter(lambda ch: ch in '0123456789.~', item['productPrice']))*100)
		except:
			pass
		# image URL
		try:
			item['productImagePath'] = HtmlResponses.xpath('//div[@class="detail-goods-left"]/div[1]/img/@src').extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		#print item['image_urls'],"777777"
		try:
			item['productAddres'] = response.xpath("//form[@id='form1']/ul/li[4]/text()").extract()[0]
		except:
			pass
		try:
			item['productCompany'] = ""
		except:
			pass
		names = self.name+'.json'
		try:
			item['fileName'] = names
		except:
			pass
		list_details = HtmlResponses.xpath("//div[@class='J_shows']/table/tbody/tr/td/text()").extract()
		logging.info("-------list_details_len=%i" %len(list_details))
		list_pack = response.xpath("//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()").extract()
		intro = response.xpath("//span[@id='PDescriptiion']/text()").extract()
		logging.info("-------intr_len=%i" %len(intro))
		speci = response.xpath("//span[@id='techParam']/text()").extract()
		logging.info("-------intr_len=%i" %len(speci))
		num_one=1
		for value_details in list_details :
			value_details = value_details.encode('utf-8').replace(":","\/").replace("\n","").replace("\"","").strip()
			if num_one%2==1 :
				data2 = {}
				data2['attrkey'] = ''
				data2['keyname'] = ''
				if '品牌' in value_details:
					num_one=0
					continue
				if '型号' in value_details:
					num_one=0
					continue
				data2['attrkey']=value_details
			else:
				if num_one ==0:
					num_one = 1
					continue
				data2['keyname']=value_details
				details_list.append(data2)
			num_one+=1

		num_two=1
		for list_intro in intro:
			list_intro = list_intro.encode('utf-8').replace("\n","").replace("\"","").strip()
			list_intro = list_intro.split(':')
			for value_intro in list_intro :
				if num_two%2==1 :
					data2 = {}
					data2['attrkey'] = ''
					data2['keyname'] = ''
					if '商品名称' in value_intro:
						break
					if '品牌' in value_intro:
						break
					data2['attrkey']=value_intro
				else:
					data2['keyname']=value_intro
					intro_list.append(data2)
				num_two+=1

		num_three=1
		for list_speci in speci:
			list_speci = list_speci.encode('utf-8').replace("\n","").replace("\"","").strip()
			list_speci = list_speci.split(':')
			for value_speci in list_speci :
				if num_three%2==1 :
					data2 = {}
					data2['attrkey'] = ''
					data2['keyname'] = ''
					if '商品名称' in value_speci:
						break
					if '品牌' in value_speci:
						break
					data2['attrkey']=value_speci
				else:
					data2['keyname']=value_speci
					speci_list.append(data2)
				num_three+=1


		item['productSpeci'] = speci_list
		item['productPack'] = pack_list
		item['productIntro'] = intro_list
		item['productDetails'] = details_list
		yield item
	def parse(self, response):
		item = BaseItem()
		speci_list = []
		pack_list = []
		intro_list = []
		details_list = []
		item['productUrl'] = ''
		item['productName'] = ''
		item['productBrand'] = ''
		item['productModel'] = ''
		item['productClassification'] = ''
		item['productPrice'] = ''
		item['productImagePath'] = ''
		item['productAddres'] = ""
		item['productCompany'] = ''
		item['fileName'] = ''
		item['productDetails'] = ""
		item['productPack'] = ""
		item['productIntro'] = ""
		item['productSpeci'] = ""
		classification_one = ''
		classification_two = ''
		classification_three = ''
		try:
			item['productUrl'] = response.url
		except:
			pass
		try:
			item['productName'] = response.xpath("//div[@class='goods_content_a_l_r f_l']/div[@class='title']/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		try:
			item['productBrand'] = response.xpath("//div[@class='goods_content_a_l_r f_l']/div[4]/span[@class='brand']/a/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		try:
			item['productModel'] = response.xpath('//div[@class="m m1"]/div/ul/dt/li/text()').extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		try:
			classification_one = response.xpath("//div[@class='position w1000']/a[2]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
			classification_two = response.xpath("//div[@class='position w1000']/a[3]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
			classification_three = response.xpath("//div[@class='position w1000']/a[4]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		classification = classification_one + '|||' + classification_two + '|||' +classification_three
		item['productClassification'] = classification
		try:
		# strip whitespace, convert yuan to cents, drop the RMB symbol
			item['productPrice'] = response.xpath("//span[@class='goods_price weiruanyahei']/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		try:
			item['productPrice'] = str(float(filter(lambda ch: ch in '0123456789.~', item['productPrice']))*100)
		except:
			pass
		# image URL
		try:
			item['productImagePath'] = "http://www.deppre.cn/" + response.xpath('//div[@class="img_center_div"]/div/a/img/@src').extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		#print item['image_urls'],"777777"
		try:
			item['productAddres'] = response.xpath("//div[@class='goods_content_a_l_r f_l']/div[@class='score'][1]/span[@class='brand']/text()").extract()[0]
		except:
			pass
		try:
			item['productCompany'] = ""
		except:
			pass
		names = self.name+'.json'
		try:
			item['fileName'] = names
		except:
			pass
		list_details = response.xpath("//div[@id='para']/table[1]/tbody[2]/tr/td/text()").extract()
		logging.info("-------list_details_len=%i" %len(list_details))
		list_pack = response.xpath("//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()").extract()
		intro = response.xpath("//span[@id='PDescriptiion']/text()").extract()
		logging.info("-------intr_len=%i" %len(intro))
		speci = response.xpath("//span[@id='techParam']/text()").extract()
		logging.info("-------intr_len=%i" %len(speci))
		num_one=1
		for value_details in list_details :
			value_details = value_details.encode('utf-8').replace(":","\/").replace("\n","").replace("\"","").strip()
			if num_one%2==1 :
				data2 = {}
				data2['attrkey'] = ''
				data2['keyname'] = ''
				if '品牌' in value_details:
					num_one=0
					continue
				if '型号' in value_details:
					num_one=0
					continue
				data2['attrkey']=value_details
			else:
				if num_one ==0:
					num_one = 1
					continue
				data2['keyname']=value_details
				details_list.append(data2)
			num_one+=1

		num_two=1
		for list_intro in intro:
			list_intro = list_intro.encode('utf-8').replace("\n","").replace("\"","").strip()
			list_intro = list_intro.split(':')
			for value_intro in list_intro :
				if num_two%2==1 :
					data2 = {}
					data2['attrkey'] = ''
					data2['keyname'] = ''
					if '商品名称' in value_intro:
						break
					if '品牌' in value_intro:
						break
					data2['attrkey']=value_intro
				else:
					data2['keyname']=value_intro
					intro_list.append(data2)
				num_two+=1

		num_three=1
		for list_speci in speci:
			list_speci = list_speci.encode('utf-8').replace("\n","").replace("\"","").strip()
			list_speci = list_speci.split(':')
			for value_speci in list_speci :
				if num_three%2==1 :
					data2 = {}
					data2['attrkey'] = ''
					data2['keyname'] = ''
					if '商品名称' in value_speci:
						break
					if '品牌' in value_speci:
						break
					data2['attrkey']=value_speci
				else:
					data2['keyname']=value_speci
					speci_list.append(data2)
				num_three+=1

		product_intro = response.xpath("//div[@class='goods_content_c_r_b_b clearfix']/span/text()").extract()
		product_pack = response.xpath("//td[@id='imgDiv']/div[@id='div3']/font/b/text()").extract()
		filename = self.name+".txt"
		file = open("data/"+filename, 'a+')
		file.write("\n"+"productUrl:"+response.url+"\n")
		file.write("productIntro:"+"\n")
		for intro in product_intro:
			intro = intro.encode('utf-8').replace("\b","").replace("<br/>","").replace("<br>","").strip()
			file.write(intro+"\n")
		file.close()

		item['productSpeci'] = speci_list
		item['productPack'] = pack_list
		item['productIntro'] = intro_list
		item['productDetails'] = details_list
		yield item
Example #12
    def parse(self, response):
        p = open('aa.html', 'a+')
        p.write(response.body)
        p.close()
        item = BaseItem()
        details_list = []
        pack_list = []
        intro_list = []
        speci_list = []
        item['productUrl'] = ''
        item['productName'] = ''
        item['productBrand'] = ''
        item['productModel'] = ''
        item['productClassification'] = ''
        item['productPrice'] = ''
        item['productImagePath'] = ''
        item['productAddres'] = ""
        item['productCompany'] = ''
        item['fileName'] = ''
        item['productDetails'] = ""
        item['productPack'] = ""
        item['productIntro'] = ""
        item['productSpeci'] = ""
        classification_one = ''
        classification_two = ''
        classification_three = ''
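        # render the page with PhantomJS; most of the fields below (name,
        # brand, model, classification, details, intro) are read from the
        # rendered HtmlResponse rather than the original response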
        print "PhantomJS is starting1..."
        driver = webdriver.PhantomJS()
        driver.get(response.url)
        #time.sleep(3)
        body = driver.page_source
        #driver.close()
        HtmlResponses = HtmlResponse(driver.current_url,
                                     body=body,
                                     encoding='utf-8',
                                     request=response)
        try:
            item['productUrl'] = response.url
        except:
            pass
        try:
            item['productName'] = HtmlResponses.xpath(
                "//h1[@class='ware_title']/a/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        brand = ''
        try:
            brand = HtmlResponses.xpath("//div[@class='ware_text']/div/text()"
                                        )[3].extract().encode('utf-8').replace(
                                            "\r\n", "\'").strip()
        except:
            pass
        item['productBrand'] = brand
        model = ''
        try:
            model = HtmlResponses.xpath("//div[@class='ware_text']/div/text()"
                                        ).extract()[6].encode('utf-8').strip()
            if '型号' in model:
                item['productModel'] = HtmlResponses.xpath(
                    "//div[@class='ware_text']/div/text()").extract(
                    )[6].encode('utf-8').replace("型号:", "").strip()
            else:
                item['productModel'] = HtmlResponses.xpath(
                    "//div[@class='ware_text']/div/text()").extract(
                    )[5].encode('utf-8').replace("型号:", "").strip()
        except:
            pass
        try:
            classification_one = HtmlResponses.xpath(
                "//div[@id='head']/div[@id='path']/a[2]/text()").extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
            classification_two = HtmlResponses.xpath(
                "//div[@id='head']/div[@id='path']/a[3]/text()").extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
            classification_three = HtmlResponses.xpath(
                "//div[@id='head']/div[@id='path']/a[4]/text()").extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        classification = classification_one + '|||' + classification_two + '|||' + classification_three
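        # self.data1 appears to track breadcrumb paths already seen, so the
        # classification is only recorded the first time a given path appears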
        if classification not in self.data1:
            self.data1.append(classification)
            item['productClassification'] = classification

        try:
            item['productPrice'] = response.xpath(
                "//div[@class='rate']/span[@class='fontColor3'][2]/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productPrice'] = str(
                float(
                    filter(lambda ch: ch in '0123456789.~',
                           item['productPrice'])) * 100)
        except:
            pass
        try:
            item['productImagePath'] = response.xpath(
                '//span[@class="jqzoom"]/img/@src').extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        #print item['image_urls'],"777777"
        try:
            item['productAddres'] = ""
        except:
            pass
        try:
            item['productCompany'] = ""
        except:
            pass
        names = self.name + '.json'
        try:
            item['fileName'] = names
        except:
            pass
        list_details = HtmlResponses.xpath(
            "//div[@id='para']/table[1]/tbody[2]/tr/td/text()").extract()
        logging.info("-------list_details_len=%i" % len(list_details))
        list_pack = HtmlResponses.xpath(
            "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
        ).extract()
        intro = HtmlResponses.xpath(
            "//div[@id='para']/p[2]/font/text()").extract()
        logging.info("-------intr_len=%i" % len(intro))
        driver.close()
        num_one = 1
        for value_details in list_details:
            value_details = value_details.encode('utf-8').replace(
                ":", "\/").replace("\n", "").replace("\"", "").strip()
            # note: the branch below is unreachable (num_one % 2 is never 2)
            if num_one % 2 == 2:
                num_one = 1
                continue
            if num_one % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '品牌' in value_details:
                    num_one = 0
                    continue
                if '型号' in value_details:
                    num_one = 0
                    continue
                data2['attrkey'] = value_details
            else:
                if num_one == 0:
                    num_one = 1
                    continue
                data2['keyname'] = value_details
                details_list.append(data2)
            num_one += 1
        num_two = 1
        for list_intro in intro:
            list_intro = list_intro.encode('utf-8').replace(":", "\/").replace(
                "\n", "").replace("\"", "").strip()
            list_intro = list_intro.split(':')
            for value_intro in list_intro:
                if num_two % 2 == 1:
                    data2 = {}
                    data2['attrkey'] = ''
                    data2['keyname'] = ''
                    data2['attrkey'] = value_intro
                else:
                    data2['keyname'] = value_intro
                    intro_list.append(data2)
                num_two += 1

        item['productSpeci'] = speci_list
        item['productPack'] = pack_list
        item['productIntro'] = intro_list
        item['productDetails'] = details_list
        yield item
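The loop above pairs alternating label/value text nodes with a modulo counter and a sentinel reset. A minimal sketch of the same pairing using zip over the even and odd slices, keeping the brand/model skip rule; pair_key_values is a hypothetical helper name, not part of the original spiders:

def pair_key_values(values, skip_keys=('品牌', '型号')):
    # Hypothetical helper: pair an alternating [label, value, label, value, ...]
    # list into [{'attrkey': label, 'keyname': value}, ...].  Rows whose label
    # contains one of skip_keys (brand / model live in their own fields) are dropped.
    cleaned = [v.encode('utf-8').replace('\n', '').replace('"', '').strip()
               for v in values]
    pairs = []
    for label, value in zip(cleaned[0::2], cleaned[1::2]):
        if any(key in label for key in skip_keys):
            continue
        pairs.append({'attrkey': label, 'keyname': value})
    return pairs

# e.g. details_list = pair_key_values(list_details)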
    def parse(self, response):
        item = BaseItem()
        speci_list = []
        pack_list = []
        intro_list = []
        details_list = []
        item['productUrl'] = ''
        item['productName'] = ''
        item['productBrand'] = ''
        item['productModel'] = ''
        item['productClassification'] = ''
        item['productPrice'] = ''
        item['productImagePath'] = ''
        item['productAddres'] = ""
        item['productCompany'] = ''
        item['fileName'] = ''
        item['productDetails'] = ""
        item['productPack'] = ""
        item['productIntro'] = ""
        item['productSpeci'] = ""
        classification_one = ''
        classification_two = ''
        classification_three = ''
        try:
            item['productUrl'] = response.url
        except:
            pass
        try:
            item['productName'] = response.xpath(
                "//form[@id='ECS_FORMBUY']/ul/li[1]/dd/div/text()").extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productBrand'] = response.xpath(
                "//ul[@class='ul1']/li[@class='clearfix'][2]/dd/div[@class='f_r goos_news']/a/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productModel'] = response.xpath(
                "//ul[@class='ul1']/li[@class='clearfix'][3]/dd/div[@class='f_l']/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            classification_one = response.xpath(
                "//div[@id='ur_here']/a[2]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            classification_two = response.xpath(
                "//div[@id='ur_here']/a[3]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            classification_three = response.xpath(
                "//div[@id='ur_here']/a[4]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            item[
                'productClassification'] = classification_one + '|||' + classification_two + '|||' + classification_three
        except:
            pass
        try:
            item['productPrice'] = response.xpath(
                "//font[@id='ECS_SHOPPRICE']/text()").extract()[1].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productPrice'] = str(
                float(
                    filter(lambda ch: ch in '0123456789.~',
                           item['productPrice'])) * 100)
        except:
            pass
        #image link
        try:
            item[
                'productImagePath'] = "http://www.sssmro.com/" + response.xpath(
                    '//div[@id="preview"]/div[@class="jqzoom"]/img/@src'
                ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        #print item['image_urls'],"777777"
        try:
            item['productAddres'] = ""
        except:
            pass
        try:
            item['productCompany'] = ""
        except:
            pass
        names = self.name + '.json'
        try:
            item['fileName'] = names
        except:
            pass
        test_specis = response.xpath(
            "//div[@id='main1']/blockquote[2]/div[@class='qyjstxt']/text()"
        ).extract()
        logging.info("-------specis_len=%i" % len(test_specis))
        test_details = response.xpath(
            "//blockquote[@class='block']/div[@class='qyjstxt']/text()"
        ).extract()
        logging.info("-------details_len=%i" % len(test_details))
        specis = ''
        try:
            specis = response.xpath(
                "//div[@id='main1']/blockquote[2]/div[@class='qyjst']/text()"
            ).extract()[0].encode('utf-8').replace("\n",
                                                   "").replace("\"",
                                                               "").strip()
        except:
            pass
        list_speci = specis.split('：')
        list_pack = response.xpath(
            "//div[@class='packageParameter tabContent']/div[@class='specisParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
        ).extract()
        num_one = 1
        for speci in list_speci:
            if num_one % 2 == 0:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                data2['attrkey'] = speci
            else:
                if num_one == 1:
                    num_one += 1
                    continue
                data2['keyname'] = speci
                speci_list.append(data2)
            num_one += 1
        num_two = 1
        for value_pack in list_pack:
            value_pack = value_pack.encode('utf-8').replace(":", "\/").replace(
                "\n", "").replace("\"", "").strip()
            if num_two % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                data2['attrkey'] = value_pack
            else:
                data2['keyname'] = value_pack
                pack_list.append(data2)
            num_two += 1

        product_intro = response.xpath(
            "//div[@class='formwork_bt'][1]/p/text()").extract()
        filename = self.name + ".txt"
        file = open("data/" + filename, 'a+')
        file.write("\n" + "productUrl:" + response.url + "\n")
        file.write("productIntro:" + "\n")
        for intro in product_intro:
            intro = intro.encode('utf-8').replace("\"", "").strip()
            file.write(intro + "\n")
        file.close()

        item['productSpeci'] = speci_list
        item['productPack'] = pack_list
        item['productIntro'] = intro_list
        item['productDetails'] = details_list
        yield item
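Every example repeats the same price normalisation: keep only digits, '.' and '~', multiply by 100 and stringify. A small sketch of that step as a standalone function; price_to_cents is a hypothetical name and its behaviour simply mirrors the inline filter/float chain above:

def price_to_cents(raw_price):
    # Hypothetical helper mirroring the inline conversion above: drop the
    # currency sign and whitespace by keeping only digits, '.' and '~',
    # then convert yuan to cents.  Inputs that still fail float() (e.g. a
    # '10~20' range or an empty string) are returned unchanged.
    digits = filter(lambda ch: ch in '0123456789.~', raw_price)
    try:
        return str(float(digits) * 100)
    except ValueError:
        return raw_price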
Example #14
    def parse(self, response):
        item = BaseItem()
        speci_list = []
        pack_list = []
        intro_list = []
        details_list = []
        item['productUrl'] = ''
        item['productName'] = ''
        item['productBrand'] = ''
        item['productModel'] = ''
        item['productClassification'] = ''
        item['productPrice'] = ''
        item['productImagePath'] = ''
        item['productAddres'] = ""
        item['productCompany'] = ''
        item['fileName'] = ''
        item['productDetails'] = ""
        item['productPack'] = ""
        item['productIntro'] = ""
        item['productSpeci'] = ""
        classification_one = ''
        classification_two = ''
        classification_three = ''
        try:
            item['productUrl'] = response.url
        except:
            pass
        try:
            item['productName'] = response.xpath(
                "//div[@id='name']/h1/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productBrand'] = response.xpath(
                "//li[@id='summary-brand']/div[@class='dd']/a/em[@class='hl_red bold']/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productModel'] = response.xpath(
                '//div[@class="m m1"]/div/ul/dt/li/text()').extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            classification_one = response.xpath(
                "//div[@id='part_content']/div[@class='node_path']/a[3]/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
            classification_two = response.xpath(
                "//div[@id='part_content']/div[@class='node_path']/a[4]/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
            classification_three = response.xpath(
                "//div[@id='part_content']/div[@class='node_path']/a[5]/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        classification = classification_one + '|||' + classification_two + '|||' + classification_three
        item['productClassification'] = classification
        try:
            #strip spaces, convert to cents, drop the RMB sign
            item['productPrice'] = response.xpath(
                "//li[@id='summary-price']/div[@class='dd']/strong/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productPrice'] = str(
                float(
                    filter(lambda ch: ch in '0123456789.~',
                           item['productPrice'])) * 100)
        except:
            pass
        #image link
        try:
            item['productImagePath'] = response.xpath(
                '//div[@id="spec-n1"]/a/@href').extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        #print item['image_urls'],"777777"
        try:
            item['productAddres'] = response.xpath(
                "//div[@class='goods_content_a_l_r f_l']/div[@class='score'][1]/span[@class='brand']/text()"
            ).extract()[0]
        except:
            pass
        try:
            item['productCompany'] = ""
        except:
            pass
        names = self.name + '.json'
        try:
            item['fileName'] = names
        except:
            pass
        test_specis = response.xpath(
            "//div[@id='main1']/blockquote[2]/div[@class='qyjstxt']/text()"
        ).extract()
        logging.info("-------specis_len=%i" % len(test_specis))
        test_details = response.xpath(
            "//blockquote[@class='block']/div[@class='qyjstxt']/text()"
        ).extract()
        logging.info("-------details_len=%i" % len(test_details))
        specis = ''
        try:
            specis = response.xpath(
                "//div[@id='main1']/blockquote[2]/div[@class='qyjst']/text()"
            ).extract()[0].encode('utf-8').replace("\n",
                                                   "").replace("\"",
                                                               "").strip()
        except:
            pass
        list_speci = specis.split(':')
        list_pack = response.xpath(
            "//div[@class='packageParameter tabContent']/div[@class='specisParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
        ).extract()
        num_one = 1
        for speci in list_speci:
            if num_one % 2 == 0:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                data2['attrkey'] = speci
            else:
                if num_one == 1:
                    num_one += 1
                    continue
                data2['keyname'] = speci
                speci_list.append(data2)
            num_one += 1
        num_two = 1
        for value_pack in list_pack:
            value_pack = value_pack.encode('utf-8').replace(":", "\/").replace(
                "\n", "").replace("\"", "").strip()
            if num_two % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                data2['attrkey'] = value_pack
            else:
                data2['keyname'] = value_pack
                pack_list.append(data2)
            num_two += 1

        product_intro = response.xpath(
            "//div[@id='content_product']/div[@class='property']/text()"
        ).extract()
        filename = self.name + ".txt"
        file = open("data/" + filename, 'a+')
        file.write("\n" + "productUrl:" + response.url + "\n")
        file.write("productIntro:" + "\n")
        for intro in product_intro:
            intro = intro.encode('utf-8').replace("\"", "").strip()
            file.write(intro + "\n")
        file.close()

        item['productSpeci'] = speci_list
        item['productPack'] = pack_list
        item['productIntro'] = intro_list
        item['productDetails'] = details_list
        yield item
    def parse(self, response):
        item = BaseItem()
        speci_list = []
        pack_list = []
        intro_list = []
        details_list = []
        item['productUrl'] = ''
        item['productName'] = ''
        item['productBrand'] = ''
        item['productModel'] = ''
        item['productClassification'] = ''
        item['productPrice'] = ''
        item['productImagePath'] = ''
        item['productAddres'] = ""
        item['productCompany'] = ''
        item['fileName'] = ''
        item['productDetails'] = ""
        item['productPack'] = ""
        item['productIntro'] = ""
        item['productSpeci'] = ""
        classification_one = ''
        classification_two = ''
        classification_three = ''
        print "PhantomJS is starting1..."
        #driver = webdriver.PhantomJS()
        driver = webdriver.PhantomJS()
        driver.get(response.url)
        time.sleep(3)
        body = driver.page_source
        #driver.close()
        HtmlResponses = HtmlResponse(driver.current_url,
                                     body=body,
                                     encoding='utf-8',
                                     request=response)
        try:
            item['productUrl'] = response.url
        except:
            pass
        try:
            item['productName'] = HtmlResponses.xpath(
                "//h1[@id='comTitle']/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        list_brand = ''
        for j in range(1, 20):
            try:
                list_brand = response.xpath(
                    "//div[@id='pdetail']/div/table/tr[%i]/th/h4/text()" %
                    j).extract()[0].encode('utf-8').replace("\"",
                                                            "\'").strip()
                if '品牌:' in list_brand:
                    item['productBrand'] = response.xpath(
                        "//div[@id='pdetail']/div/table/tr[%i]/td/text()" %
                        j).extract()[0].encode('utf-8').replace("\"",
                                                                "\'").strip()
                    break
            except:
                pass
        list_model = ''
        for j in range(1, 20):
            try:
                list_model = response.xpath(
                    "//div[@id='pdetail']/div/table/tr[%i]/th/h4/text()" %
                    j).extract()[0].encode('utf-8').replace("\"",
                                                            "\'").strip()
                if '型号:' in list_model:
                    item['productModel'] = response.xpath(
                        "//div[@id='pdetail']/div/table/tr[%i]/td/text()" %
                        j).extract()[0].encode('utf-8').replace("\"",
                                                                "\'").strip()
                    break
            except:
                pass
        try:
            classification_one = response.xpath(
                "//div[@id='head']/div[@id='path']/a[2]/text()").extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
            classification_two = response.xpath(
                "//div[@id='head']/div[@id='path']/a[3]/text()").extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
            classification_three = response.xpath(
                "//div[@id='head']/div[@id='path']/a[4]/text()").extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        classification = classification_one + '|||' + classification_two + '|||' + classification_three
        item['productClassification'] = classification
        try:
            #strip spaces, convert to cents, drop the RMB sign
            item['productPrice'] = response.xpath(
                "//div[@id='oriPriceTop']/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productPrice'] = str(
                float(
                    filter(lambda ch: ch in '0123456789.~',
                           item['productPrice'])) * 100)
        except:
            pass
        #image link
        try:
            item['productImagePath'] = HtmlResponses.xpath(
                "//a[@id='imgContainer']/@hrefs").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        #print item['image_urls'],"777777"
        try:
            item['productAddres'] = response.xpath(
                "//div[@id='pdetail']/div[@class='d-vopy']/table/tbody/tr[4]/td/text()"
            ).extract()[0]
        except:
            pass
        try:
            item['productCompany'] = ""
        except:
            pass
        names = self.name + '.json'
        try:
            item['fileName'] = names
        except:
            pass
        item['productAddres'] = ""
        item['productCompany'] = ""
        names = self.name + '.json'
        item['fileName'] = names

        list_details = response.xpath(
            "//div[@class='d-vopy']/table/tr/th/h4/text()").extract()
        logging.info("-------list_details_len=%i" % len(list_details))
        details = response.xpath(
            "//div[@class='d-vopy']/table/tr/td/text()").extract()
        logging.info("-------details_len=%i" % len(details))
        list_pack = response.xpath(
            "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
        ).extract()
        list_intro = response.xpath(
            "//table[@class='goods-items']/tr[1]/th/text()").extract()
        logging.info("-------list_intro_len=%i" % len(list_intro))
        intro = response.xpath(
            "//div[@class='goods']/table[@class='goods-items']/tr[2]/td/text()"
        ).extract()
        logging.info("-------intr_len=%i" % len(intro))
        speci = response.xpath("//span[@id='techParam']/text()").extract()
        logging.info("-------intr_len=%i" % len(speci))
        num_one = 0
        for list_details_value in list_details:
            list_details_value = list_details_value.encode('utf-8').replace(
                "\n", "").replace("\"", "").strip()
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            if '品牌' in list_details_value:
                num_one += 1
                continue
            if '价格' in list_details_value:
                num_one += 1
                continue
            if '供应商' in list_details_value:
                num_one = 0
                continue
            if '保修期' in list_details_value:
                break
            data2['attrkey'] = list_details_value
            data2['keyname'] = details[num_one]
            details_list.append(data2)
            num_one += 1

        num_two = 0
        for list_intro_value in list_intro:
            list_intro_value = list_intro_value.encode('utf-8').replace(
                "\n", "").replace("\"", "").strip()
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            if '品牌' in list_intro_value:
                num_two += 1
                continue
            if '价格' in list_intro_value:
                num_two += 1
                continue
            if '供应商' in list_intro_value:
                num_two = 0
                continue
            if '保修期' in list_intro_value:
                break
            data2['attrkey'] = list_intro_value
            data2['keyname'] = intro[num_two]
            intro_list.append(data2)
            num_two += 1

        num_three = 1
        for list_speci in speci:
            list_speci = list_speci.encode('utf-8').replace("\n", "").replace(
                "\"", "").strip()
            list_speci = list_speci.split(':')
            for value_speci in list_speci:
                if num_three % 2 == 1:
                    data2 = {}
                    data2['attrkey'] = ''
                    data2['keyname'] = ''
                    if '商品名称' in value_speci:
                        break
                    if '品牌' in value_speci:
                        break
                    data2['attrkey'] = value_speci
                else:
                    data2['keyname'] = value_speci
                    speci_list.append(data2)
                num_three += 1

        item['productSpeci'] = speci_list
        item['productPack'] = pack_list
        item['productIntro'] = intro_list
        item['productDetails'] = details_list
        yield item
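The PhantomJS-based examples create a driver inline and only reach driver.close() on the happy path, so a failed page load leaks the browser process. A minimal sketch of the rendering step factored into one helper, assuming the same webdriver, time and HtmlResponse imports the spiders already rely on; render_with_phantomjs is a hypothetical name:

def render_with_phantomjs(response, wait=3):
    # Hypothetical helper: load the page in PhantomJS so JavaScript-built
    # markup is present, wrap the rendered source in an HtmlResponse and
    # always release the driver, even if the page load raises.
    driver = webdriver.PhantomJS()
    try:
        driver.get(response.url)
        time.sleep(wait)
        return HtmlResponse(driver.current_url,
                            body=driver.page_source,
                            encoding='utf-8',
                            request=response.request)
    finally:
        driver.quit()

# e.g. HtmlResponses = render_with_phantomjs(response)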
	def parse(self, response):
		item = BaseItem()
		speci_list = []
		pack_list = []
		intro_list = []
		details_list = []
		item['productUrl'] = ''
		item['productName'] = ''
		item['productBrand'] = ''
		item['productModel'] = ''
		item['productClassification'] = ''
		item['productPrice'] = ''
		item['productImagePath'] = ''
		item['productAddres'] = ""
		item['productCompany'] = ''
		item['fileName'] = ''
		item['productDetails'] = ""
		item['productPack'] = ""
		item['productIntro'] = ""
		item['productSpeci'] = ""
		classification_one = ''
		classification_two = ''
		classification_three = ''
		try:
			classification_one = response.xpath("//div[@class='g_position']/a[2]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
			classification_two = response.xpath("//div[@class='g_position']/a[3]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
			classification_three = response.xpath("//div[@class='g_position']/a[4]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		classification = classification_one + '|||' + classification_two + '|||' +classification_three
		try:
			item['productUrl'] = response.url
		except:
			pass
		try:
			item['productName'] = response.xpath('//div[@class="pd_l_cont clearfix"]/div[1]/a/h1/text()').extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		brand = ''
		list_brand = []
		for j in range(1,20):
			try:
				list_brand = response.xpath("//div[@class='pd_param clearfix']/ul/li[%i]/text()" %j).extract()[0].encode('utf-8').replace("\"","\'").strip()
			except:
				pass
			if "品牌" in list_brand:
				brand = response.xpath("//div[@class='pd_param clearfix']/ul/li[%i]/span/text()" %j).extract()[0].encode('utf-8').replace("\"","\'").strip()
				break
		item['productBrand'] = brand
		try:
			item['productModel'] = response.xpath("//div[@class='pd_param clearfix']/ul/li[2]/span/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		try:
			item['productClassification'] = classification
		except:
			pass
		try:
		#strip spaces, convert to cents, drop the RMB sign
			item['productPrice'] = response.xpath("//div[@class='pd_l clearfix']/div[@class='pd_l_cont clearfix'][1]/div[@class='pd_info']/ul[@class='pd_info_supplier']/li[4]//span[@class='f_red']/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
			if item['productPrice'] == '面议':
				item['productPrice'] = 0.0
		except:
			pass
		try:
			item['productPrice'] = str(float(filter(lambda ch: ch in '0123456789.~', item['productPrice']))*100)
		except:
			pass
		#image link
		try:
			item['productImagePath'] = response.xpath("//img[@id='show_big']/@src").extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		#print item['image_urls'],"777777"
		try:
			item['productAddres'] = response.xpath("//span[@class='address_over']/a/text()").extract()[0]
		except:
			pass
		try:
			item['productCompany'] = ""
		except:
			pass
		names = self.name+'.json'
		try:
			item['fileName'] = names
		except:
			pass
		list_details = response.xpath("//div[@class='pd_param clearfix']/ul/li/text()").extract()
		details = response.xpath("//div[@class='pd_param clearfix']/ul/li/span/text()").extract()
		logging.info("-------list_details_len=%i" %len(list_details))
		list_pack = response.xpath("//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()").extract()
		intro = response.xpath("//span[@id='PDescription']/text()").extract()
		logging.info("-------intr_len=%i" %len(intro))
		speci = response.xpath("//span[@id='techParam']/text()").extract()
		logging.info("-------intr_len=%i" %len(speci))
		num_one=0
		for value_details in list_details :
			value_details = value_details.encode('utf-8').replace(":","\/").replace("\n","").replace("\"","").strip()
			if '品牌' in value_details:
				num_one+=1
				continue
			if '型号' in value_details:
				num_one+=1
				continue
			else:
				data2 = {}
				data2['attrkey'] = ''
				data2['keyname'] = ''
				data2['attrkey']=value_details
				data2['keyname']=details[num_one]
				details_list.append(data2)
			num_one+=1

		num_two=1
		for list_intro in intro:
			list_intro = list_intro.encode('utf-8').replace("\n","").replace("\"","").strip()
			list_intro = list_intro.split(':')
			for value_intro in list_intro :
				if num_two%2==1 :
					data2 = {}
					data2['attrkey'] = ''
					data2['keyname'] = ''
					if '商品名称' in value_intro:
						break
					if '品牌' in value_intro:
						break
					data2['attrkey']=value_intro
				else:
					data2['keyname']=value_intro
					intro_list.append(data2)
				num_two+=1

		num_three=1
		for list_speci in speci:
			list_speci = list_speci.encode('utf-8').replace("\n","").replace("\"","").strip()
			list_speci = list_speci.split(':')
			for value_speci in list_speci :
				if num_three%2==1 :
					data2 = {}
					data2['attrkey'] = ''
					data2['keyname'] = ''
					if '商品名称' in value_speci:
						break
					if '品牌' in value_speci:
						break
					data2['attrkey']=value_speci
				else:
					data2['keyname']=value_speci
					speci_list.append(data2)
				num_three+=1

		item['productSpeci'] = speci_list
		item['productPack'] = pack_list
		item['productIntro'] = intro_list
		item['productDetails'] = details_list
		yield item
Example #17
    def parse(self, response):
        item = BaseItem()
        speci_list = []
        pack_list = []
        intro_list = []
        details_list = []
        item['productUrl'] = ''
        item['productName'] = ''
        item['productBrand'] = ''
        item['productModel'] = ''
        item['productClassification'] = ''
        item['productPrice'] = ''
        item['productImagePath'] = ''
        item['productAddres'] = ""
        item['productCompany'] = ''
        item['fileName'] = ''
        item['productDetails'] = ""
        item['productPack'] = ""
        item['productIntro'] = ""
        item['productSpeci'] = ""
        classification_one = ''
        classification_two = ''
        classification_three = ''
        try:
            item['productUrl'] = response.url
        except:
            pass
        try:
            item['productName'] = response.xpath(
                "//h1[@class='lh40 col59 f18']/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        list_brand = ''
        try:
            list_brand = response.xpath(
                "//tr[@class='keyValue'][1]/td[1]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            if '涂层手套' not in list_brand:
                brand = response.xpath(
                    "//div[@class='detailAndBuy']/div[@class='detail'][1]/span[@class='typeValue']/text()"
                ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
                item['productBrand'] = brand.split(' ')[0]
            else:
                item['productBrand'] = response.xpath(
                    "//tr[@class='keyValue'][1]/td[2]/text()").extract(
                    )[0].encode('utf-8').replace("\"", "\'").strip()
                # keep only alphanumeric characters in the brand string
                item['productBrand'] = filter(str.isalnum, item['productBrand'])
        except:
            pass
        try:
            item['productModel'] = response.xpath(
                "//div[@class='cpzstm']/b/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            classification_one = response.xpath(
                "//div[@class='crumbs']/span[2]/a/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            classification_two = response.xpath(
                "//div[@class='crumbs']/span[3]/a/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            classification_three = response.xpath(
                "//div[@class='crumbs']/span[4]/a/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        classification = classification_one + '|||' + classification_two + '|||' + classification_three
        item['productClassification'] = classification
        try:
            item['productPrice'] = response.xpath(
                "//span[@id='show-price']/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productPrice'] = str(
                float(
                    filter(lambda ch: ch in '0123456789.~',
                           item['productPrice'])) * 100)
        except:
            pass
        try:
            item[
                'productImagePath'] = 'http://www.zhaogongye.cn' + response.xpath(
                    "//span[@class='jqzoom']/img/@src").extract()[0].encode(
                        'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productAddres'] = ""
        except:
            pass
        try:
            item['productCompany'] = ""
        except:
            pass
        names = self.name + '.json'
        try:
            item['fileName'] = names
        except:
            pass
        test_specis = response.xpath(
            "//div[@id='main1']/blockquote[2]/div[@class='qyjstxt']/text()"
        ).extract()
        logging.info("-------specis_len=%i" % len(test_specis))
        test_details = response.xpath(
            "//blockquote[@class='block']/div[@class='qyjstxt']/text()"
        ).extract()
        logging.info("-------details_len=%i" % len(test_details))
        specis = ''
        try:
            specis = response.xpath(
                "//div[@id='main1']/blockquote[2]/div[@class='qyjst']/text()"
            ).extract()[0].encode('utf-8').replace("\n",
                                                   "").replace("\"",
                                                               "").strip()
        except:
            pass
        list_speci = specis.split(':')
        list_pack = response.xpath(
            "//div[@class='packageParameter tabContent']/div[@class='specisParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
        ).extract()
        num_one = 1
        for speci in list_speci:
            if num_one % 2 == 0:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                data2['attrkey'] = speci
            else:
                if num_one == 1:
                    num_one += 1
                    continue
                data2['keyname'] = speci
                speci_list.append(data2)
            num_one += 1
        num_two = 1
        for value_pack in list_pack:
            value_pack = value_pack.encode('utf-8').replace(":", "\/").replace(
                "\n", "").replace("\"", "").strip()
            if num_two % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                data2['attrkey'] = value_pack
            else:
                data2['keyname'] = value_pack
                pack_list.append(data2)
            num_two += 1

        product_details = response.xpath(
            "//blockquote[@class='block']/div[@class='qyjstxt']/text()"
        ).extract()
        product_speci = response.xpath(
            "//div[@id='main1']/blockquote[2]/div[@class='qyjstxt']/text()"
        ).extract()
        filename = self.name + ".txt"
        file = open("data/" + filename, 'a+')
        file.write("\n" + "productUrl:" + response.url + "\n")
        file.write("productDetails:" + "\n")
        for details in product_details:
            details = details.encode('utf-8').replace("\b", "").replace(
                "<br/>", "").replace("<br>", "").strip()
            file.write(details + "\n")
        file.write("productSpeci:" + "\n")
        for speci in product_speci:
            speci = speci.encode('utf-8').replace("\"", "").strip()
            file.write(speci + "\n")
        file.close()

        item['productSpeci'] = speci_list
        item['productPack'] = pack_list
        item['productIntro'] = intro_list
        item['productDetails'] = details_list
        yield item
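Each parser rebuilds the three-level breadcrumb with three copies of the same XPath inside a single try, so one missing anchor blanks all three levels. A sketch of a more forgiving version, under the assumption that the breadcrumb anchors are sibling elements addressable by index; extract_classification is a hypothetical name:

def extract_classification(response, anchors_xpath, start=2, levels=3):
    # Hypothetical helper: read `levels` breadcrumb anchors beginning at
    # position `start` and join them with '|||'.  A missing level is left
    # empty instead of discarding the levels that were found.
    parts = []
    for i in range(start, start + levels):
        nodes = response.xpath('%s[%i]/text()' % (anchors_xpath, i)).extract()
        if nodes:
            parts.append(nodes[0].encode('utf-8').replace('"', "'").strip())
        else:
            parts.append('')
    return '|||'.join(parts)

# e.g. item['productClassification'] = extract_classification(response, "//div[@id='ur_here']/a")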
Example #18
    def parse(self, response):
        item = BaseItem()
        speci_list = []
        pack_list = []
        intro_list = []
        details_list = []
        item['productUrl'] = ''
        item['productName'] = ''
        item['productBrand'] = ''
        item['productModel'] = ''
        item['productClassification'] = ''
        item['productPrice'] = ''
        item['productImagePath'] = ''
        item['productAddres'] = ""
        item['productCompany'] = ''
        item['fileName'] = ''
        item['productDetails'] = ""
        item['productPack'] = ""
        item['productIntro'] = ""
        item['productSpeci'] = ""
        classification_one = ''
        classification_two = ''
        classification_three = ''
        try:
            classification_one = response.xpath(
                "//div[@class='breadth']/div/a[3]/text()").extract()[0].encode(
                    'utf-8').replace(".", "").strip()
            classification_two = response.xpath(
                "//div[@class='position w1000']/a[3]/text()").extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
            classification_three = response.xpath(
                "//div[@class='position w1000']/a[4]/text()").extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        classification = classification_one + '|||' + classification_two + '|||' + classification_three
        #print "PhantomJS is starting1..."
        #driver = webdriver.PhantomJS()
        #driver.get(response.url)
        #time.sleep(3)
        #body = driver.page_source
        #driver.close()
        #HtmlResponses = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=response)
        try:
            item['productUrl'] = response.url
        except:
            pass
        try:
            item['productName'] = response.xpath(
                "//div[@class='corpus_left']/div/h3/text()").extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productBrand'] = response.xpath(
                "//div[@class='intro']/table/tr[4]/td[2]/text()").extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productModel'] = response.xpath(
                "//div[@class='intro']/table/tr[1]/td[2]/text()").extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productClassification'] = classification
        except:
            pass
        try:
            #strip spaces, convert to cents, drop the RMB sign
            item['productPrice'] = response.xpath(
                "//div[@class='intro']/table/tr[3]/td[2]/b/text()").extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productPrice'] = str(
                float(
                    filter(lambda ch: ch in '0123456789.~',
                           item['productPrice'])) * 100)
        except:
            pass
        #image link
        try:
            item[
                'productImagePath'] = "http://www.rolymro.com/" + response.xpath(
                    "//div[@class='sphoto']/a[@class='sphoto']/img/@src"
                ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        #print item['image_urls'],"777777"
        try:
            item['productAddres'] = response.xpath(
                "//div[@id='pdetail']/div[@class='d-vopy']/table/tbody/tr[4]/td/text()"
            ).extract()[0]
        except:
            pass
        try:
            item['productCompany'] = ""
        except:
            pass
        names = self.name + '.json'
        try:
            item['fileName'] = names
        except:
            pass
        list_details = response.xpath(
            "//table[@class='table_pro_info']/tr/td[1]/text()").extract()
        details = response.xpath(
            "//table[@class='table_pro_info']/tr/td[2]/text()").extract()
        logging.info("-------list_details_len=%i" % len(list_details))
        logging.info("-------details_len=%i" % len(details))
        list_pack = response.xpath(
            "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
        ).extract()
        intro = response.xpath("//span[@id='PDescription']/text()").extract()
        logging.info("-------intr_len=%i" % len(intro))
        speci = response.xpath("//span[@id='techParam']/text()").extract()
        logging.info("-------intr_len=%i" % len(speci))
        num_one = 0
        for value_details in list_details:
            value_details = value_details.encode('utf-8').replace(
                ":", "\/").replace("\n", "").replace("\t",
                                                     "").replace("\b",
                                                                 "").strip()
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            if '产品单价' in value_details:
                num_one += 2
                continue
            if '品牌' in value_details:
                num_one += 1
                continue
            if '型号' in value_details:
                num_one += 1
                continue
            data2['attrkey'] = value_details
            data2['keyname'] = details[num_one].encode('utf-8').replace(
                ":", "\/").replace("\n", "").replace("\t",
                                                     "").replace("\b",
                                                                 "").strip()
            details_list.append(data2)
            num_one += 1

        num_two = 1
        for list_intro in intro:
            list_intro = list_intro.encode('utf-8').replace(
                "\r\n\t", "").replace("\"", "").strip()
            list_intro = list_intro.split(':')
            for value_intro in list_intro:
                if num_two % 2 == 1:
                    data2 = {}
                    data2['attrkey'] = ''
                    data2['keyname'] = ''
                    if '商品名称' in value_intro:
                        break
                    if '品牌' in value_intro:
                        break
                    data2['attrkey'] = value_intro
                else:
                    data2['keyname'] = value_intro
                    intro_list.append(data2)
                num_two += 1

        num_three = 1
        for list_speci in speci:
            list_speci = list_speci.encode('utf-8').replace("\n", "").replace(
                "\"", "").strip()
            list_speci = list_speci.split(':')
            for value_speci in list_speci:
                if num_three % 2 == 1:
                    data2 = {}
                    data2['attrkey'] = ''
                    data2['keyname'] = ''
                    if '商品名称' in value_speci:
                        break
                    if '品牌' in value_speci:
                        break
                    data2['attrkey'] = value_speci
                else:
                    data2['keyname'] = value_speci
                    speci_list.append(data2)
                num_three += 1

        product_intro = response.xpath(
            "//span[@id='PDescription']/text()").extract()
        product_pack = response.xpath(
            "//td[@id='imgDiv']/div[@id='div3']/font/b/text()").extract()
        filename = self.name + ".txt"
        file = open("data/" + filename, 'a+')
        file.write("\n" + "productUrl:" + response.url + "\n")
        file.write("productIntro:" + "\n")
        for intro in product_intro:
            intro = intro.encode('utf-8').replace("\b", "").replace(
                "<br/>", "").replace("<br>", "").strip()
            file.write(intro + "\n")
        file.write("productPack:" + "\n")
        for pack in product_pack:
            pack = pack.encode('utf-8').replace("\"", "").strip()
            file.write(pack + "\n")
        file.close()

        item['productSpeci'] = speci_list
        item['productPack'] = pack_list
        item['productIntro'] = intro_list
        item['productDetails'] = details_list
        yield item
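Several spiders dump overflow text to data/<spider>.txt with a bare open/close pair, which leaves the handle open if a write fails and breaks when data/ is missing. A short sketch of the same dump with a context manager and a directory check; append_to_dump is a hypothetical name and it assumes os is imported at module level:

def append_to_dump(spider_name, url, section, lines):
    # Hypothetical helper: append one labelled section (e.g. 'productIntro')
    # for one product URL to data/<spider_name>.txt, creating data/ if needed
    # and closing the file even when a write fails.
    if not os.path.isdir('data'):
        os.makedirs('data')
    with open('data/' + spider_name + '.txt', 'a+') as dump:
        dump.write('\n' + 'productUrl:' + url + '\n')
        dump.write(section + ':' + '\n')
        for line in lines:
            dump.write(line.encode('utf-8').replace('"', '').strip() + '\n')

# e.g. append_to_dump(self.name, response.url, 'productIntro', product_intro)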
	def parse(self, response):
		#Analytical framework
		sel = Selector(response)
		#Loop body
		speci_list = []
		pack_list = []
		intro_list = []
		details_list = []
		productClassification=""
		productName=""
		productBrand=""
		productModel=""
		productPrice=""
		productImagePath=""
		productAddres=""
		productId=""
		productCompany=""
		productPack=""
		price=""
		fileName="www_ispek_cn_data_info.json"
		#Instantiation CrawlertoolsItem object
		item=BaseItem()
		#Parse text
		productImagePath=sel.xpath(".//*[@id='picshower']/img/@src").extract()[0]
		if  len(sel.xpath(".//div[@id='sample-table-2_wrapper']")) == 0:
			try:
				tempSel=sel.xpath(".//*[@id='main-container']/div/div[1]/div[1]/a")
				i=1
				classificationStr=""
				while i < len(tempSel):
					classificationStr=classificationStr+tempSel[i].xpath("text()").extract()[0].strip()+"|||"
					i+=1
				productClassification=classificationStr.rstrip("|||")
				tempSel=sel.xpath(".//*[@id='main-container']/div/div[1]/div[2]/div[2]")
				if len(tempSel.xpath("form")):
					tempSel=tempSel.xpath("form")
					temps=tempSel.xpath("div[1]/div[1]/span/text()")[0].extract()
					productPrice=temps.split("/")[0].strip()
					productPrice=filter(lambda ch: ch in '0123456789.', productPrice)
					productCompany=temps.split("/")[1]
				else:
					temps=tempSel.xpath("div[1]/div[1]/text()")[0].extract().strip()
					productPrice=filter(lambda ch: ch in '0123456789.~', temps).split("~")
					productCompany=""

				productName=tempSel.xpath("h2/text()").extract()[0]
				productBrand=tempSel.xpath("div[2]/div/div[1]/a/text()").extract()[0]
				tempSel=tempSel.xpath("div[2]/div/div")
				i=2
				while i<len(tempSel):
					tempStr=""
					tempStr=tempSel[i].xpath("text()").extract()[0].strip().split(u":")
					if tempStr[0]==u"型号":
						productModel=tempStr[1]
					if tempStr[0]==u"产地":
						productAddres=tempStr[1]
					if tempStr[0]==u"包装":
						productPack=tempStr[1]
					i+=1
				prices=productPrice.encode('utf-8').replace("\"","\'").strip().split("~")
				if len(prices) >1:
					price=str(float(prices[0])*100)+"~"+str(float(prices[1])*100)
				else:
					price=str(float(prices[0])*100)

			except Exception,e:
				print "-----------------yichang--------------->",e
			#Formatted data
			item['productUrl']=response.url
			item['productImagePath'] = "http://www.ispek.cn"+productImagePath.encode('utf-8').replace("\"","\'").strip() 
			item['productClassification'] =productClassification.encode('utf-8').replace("\"","\'").strip()
			item['productName'] =productName.encode('utf-8').replace("\"","\'").strip()
			item['productBrand'] =productBrand.encode('utf-8').replace("\"","\'").strip()
			item['productModel'] =productModel.encode('utf-8').replace("\"","\'").strip()
			item['productAddres'] =productAddres.encode('utf-8').replace("\"","\'").strip()
			item['productCompany'] =productCompany.encode('utf-8').replace("\"","\'").strip()
			item['productPrice'] =price
			item['fileName']=fileName
			item['productDetails']=""
			item['productPack']=productPack.encode('utf-8').replace("\"","\'").strip()
			item['productIntro']=""
			item['productSpeci']=""
			yield item
		productName=""
		productBrand=""
		productModel=""
		productPrice=""
		productImagePath=""
		productId=""
		productCompany=""
		temps=""
		productDetails=""
		productPack=""
		fileName="zc_mrobay_com_data_info.json"
		#Instantiation CrawlertoolsItem object
		item=BaseItem()
		#Parse text
		productImagePath=sel.xpath(".//*[@id='showPic']/@src").extract()[0]
		try:
			tempSel=sel.xpath(".//div[@class='Xh_xq']")
			productName=tempSel.xpath("h1/text()").extract()[0]
			productBrand=tempSel.xpath("div[2]/div[2]/ul/li[3]/text()").extract()[0]
			temps=tempSel.xpath("div[2]/div[2]/ul/li[1]/span/b/text()")[0].extract()
			productPrice=str(float(filter(lambda ch: ch in '0123456789.~', temps))*100)
			productCompany=u"套"				
			productModel=tempSel.xpath("div[2]/div[2]/ul/li[4]/p[1]/text()")[0].extract()
		except Exception,e:
			print "-----------------yichang--------------->",e
		
		try:
			productPack=sel.xpath(".//div[5]/div/div[2]/div[2]/ul/li[9]/text()").extract()[0]
Example #21
    def parse(self, response):
        item = BaseItem()
        speci_list = []
        pack_list = []
        intro_list = []
        details_list = []
        item['productUrl'] = ''
        item['productName'] = ''
        item['productBrand'] = ''
        item['productModel'] = ''
        item['productClassification'] = ''
        item['productPrice'] = ''
        item['productImagePath'] = ''
        item['productAddres'] = ""
        item['productCompany'] = ''
        item['fileName'] = ''
        item['productDetails'] = ""
        item['productPack'] = ""
        item['productIntro'] = ""
        item['productSpeci'] = ""
        classification_one = ''
        classification_two = ''
        classification_three = ''
        #print "PhantomJS is starting1..."
        #driver = webdriver.PhantomJS()
        #driver.get(response.url)
        #time.sleep(3)
        #body = driver.page_source
        #HtmlResponses = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=response)
        try:
            item['productUrl'] = response.url
        except:
            pass
        try:
            item['productName'] = response.xpath(
                "//h1[@id='title']/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            classification_one = response.xpath(
                "//div[@class='pos']/a[3]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            classification_two = response.xpath(
                "//div[@class='pos']/a[4]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            classification_three = response.xpath(
                "//div[@class='pos']/a[5]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        classification = classification_one + '|||' + classification_two + '|||' + classification_three
        item['productClassification'] = classification
        try:
            item['productPrice'] = response.xpath(
                "//span[@class='f_price px16']/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productPrice'] = str(
                float(
                    filter(lambda ch: ch in '0123456789.~',
                           item['productPrice'])) * 100)
        except:
            pass
        try:
            imagePath = response.xpath("//div[@id='mid_div']/img/@src"
                                       ).extract()[0].encode('utf-8').replace(
                                           "\"", "\'").strip()
            item['productImagePath'] = imagePath
        except:
            pass
        try:
            item['productAddres'] = ""
        except:
            pass
        try:
            item['productCompany'] = ""
        except:
            pass
        names = self.name + '.json'
        try:
            item['fileName'] = names
        except:
            pass
        print "PhantomJS is starting1..."
        driver = webdriver.PhantomJS()
        driver.get(response.url)
        time.sleep(3)
        body = driver.page_source
        HtmlResponses = HtmlResponse(driver.current_url,
                                     body=body,
                                     encoding='utf-8',
                                     request=response)
        list_brand = HtmlResponses.xpath(
            "//table/tbody/tr/td[2]/table/tbody/tr/td/table/tbody/tr/td[1]"
        ).extract()
        list_speci_th = response.xpath(
            "//div[@class='col-xs-12 nopadding border-ccc attrDiv']/div/p/text()"
        ).extract()
        list_detail = response.xpath("//div[@id='content']").extract()
        logging.info("----------list_detail_len=%i" % len(list_detail))
        list_intro = response.xpath(
            "//div[@id='tbc_13']/div[@class='intro_box']").extract()
        logging.info("----------list_intro_len=%i" % len(list_intro))
        for j in range(1, len(list_brand)):
            brand = HtmlResponses.xpath(
                "//table/tbody/tr/td[2]/table/tbody/tr/td/table/tbody/tr[%i]/td[1]/text()"
                % j).extract()[0].encode('utf-8').replace(":", "\/").replace(
                    "\n", "").replace("\"", "").strip()
            if "型号" in brand:
                item['productModel'] = HtmlResponses.xpath(
                    "//table/tbody/tr[2]/td[3]/table/tbody/tr[%i]/td[2]/a[@class='b']/text()"
                    % j).extract()[0].encode('utf-8').replace(
                        ":", "\/").replace("\n", "").replace("\"", "").strip()
                continue
            if "品牌" in brand:
                item['productBrand'] = HtmlResponses.xpath(
                    "//table/tbody/tr[2]/td[3]/table/tbody/tr[%i]/td[2]/a[@class='b']/text()"
                    % j).extract()[0].encode('utf-8').replace(
                        ":", "\/").replace("\n", "").replace("\"", "").strip()
                break
        driver.close()
        #11.28
        brand = ''
        try:
            brand = response.xpath(
                "//dl[@class='pro-info-prop pro-info-brand']/dd[@class='pro-info-cons']/text()"
            ).extract()[0].encode('utf-8').replace("\t", "").replace(
                "\n", "").replace("\b", "").replace("\r", "").strip()
            item['productBrand'] = brand
        except:
            pass
        model = ''
        try:
            model = response.xpath(
                "//dl[@class='pro-info-prop pro-info-model']/dd[@class='pro-info-cons']/text()"
            ).extract()[0].encode('utf-8').replace("\t", "").replace(
                "\n", "").replace("\b", "").strip()
            item['productModel'] = model
        except:
            pass
        #11.28
        for value_detail in list_detail:
            value_detail = value_detail.encode('utf-8').replace(
                "\t", "").replace("\n", "").replace("\b", "").replace(
                    "<br>", "").replace("</br>", "").replace("\r", "").strip()
            dr = re.compile(r'<[^>]+>', re.S)
            dd_value_detail = dr.sub('', value_detail)
            details_list.append(dd_value_detail)
        cancel = ''
        try:
            cancel_l = response.xpath(
                "//p[@class='link-detail']/text()").extract()
            for cancel_s in cancel_l:
                cancel_s = cancel_s.encode('utf-8').replace("\t", "").replace(
                    "\n", "").replace("\b", "").replace("<br>", "").replace(
                        "</br>", "").replace("\r", "").strip()
                cancel += cancel_s
        except:
            pass
        for value_intro in list_intro:
            value_intro = value_intro.encode('utf-8').replace(
                "\t", "").replace("\n", "").replace("\b", "").replace(
                    "<br>", "").replace("</br>", "").replace("\r", "").strip()
            value_intro = value_intro.replace(cancel, '')
            dr = re.compile(r'<[^>]+>', re.S)
            dd_value_intro = dr.sub('', value_intro)
            intro_list.append(dd_value_intro)

        item['productSpeci'] = speci_list
        item['productPack'] = pack_list
        item['productIntro'] = intro_list
        item['productDetails'] = details_list
        yield item
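The example above strips markup from the //div[@id='content'] fragments with a hand-rolled regex and chained replace calls. A sketch of the same cleanup using remove_tags from w3lib (already a Scrapy dependency); strip_markup is a hypothetical name:

from w3lib.html import remove_tags

def strip_markup(fragments):
    # Hypothetical helper: flatten extracted HTML fragments into plain text,
    # dropping tags and the control characters the inline loop removes by hand.
    cleaned = []
    for fragment in fragments:
        text = remove_tags(fragment)
        for ch in ('\t', '\n', '\r', '\b'):
            text = text.replace(ch, '')
        cleaned.append(text.encode('utf-8').strip())
    return cleaned

# e.g. details_list = strip_markup(list_detail)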
	def parse(self, response):
		item = BaseItem()
		speci_list = []
		pack_list = []
		intro_list = []
		details_list = []
		item['productUrl'] = ''
		item['productName'] = ''
		item['productBrand'] = ''
		item['productModel'] = ''
		item['productClassification'] = ''
		item['productPrice'] = ''
		item['productImagePath'] = ''
		item['productAddres'] = ""
		item['productCompany'] = ''
		item['fileName'] = ''
		item['productDetails'] = ""
		item['productPack'] = ""
		item['productIntro'] = ""
		item['productSpeci'] = ""
		classification_one = ''
		classification_two = ''
		classification_three = ''
		try:
			classification_one = response.xpath("//div[@class='w-fly-cnt']/div[@class='position']/a[2]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
			classification_two = response.xpath("//div[@class='w-fly-cnt']/div[@class='position']/a[3]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
			classification_three = response.xpath("//div[@class='w-fly-cnt']/div[@class='position']/a[4]/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		classification = classification_one + '|||' + classification_two + '|||' +classification_three
		item['productUrl'] = response.url
		try:
			item['productName'] = response.xpath("//div[@class='prodetails']/h1[@class='protitle']/text()").extract()[0].encode('utf-8').replace("\"","\'").strip() 
		except:
			pass
		try:
			item['productBrand'] = response.xpath('//ul[@class="list_pic"]/li/dl/dt/a/text()').extract()[0].encode('utf-8').replace("\"","\'").strip() 
		except:
			pass
		try:
			item['productModel'] = response.xpath("//div[@class='fn-fr']/div[@class='add-to-basket']/dl[@class='fn-clearfix atb-dl-01']/dd/text()").extract()[0].encode('utf-8').replace("\"","\'").strip() 
		except:
			pass
		try:
			item['productClassification'] = classification
		except:
			pass
		try:
			price=response.xpath("//font[@id='ECS_GOODS_AMOUNT']/text()").extract()[0].encode('utf-8').replace("\"","\'").strip()
		except:
			pass
		try:
			item['productPrice'] = str(float(filter(lambda ch: ch in '0123456789.~', price))*100)
		except:
			pass
		try:
			item['productImagePath'] = "http://www.isweek.cn/"+response.xpath('//a[@class="jqzoom"]/img/@src').extract()[0].encode('utf-8').replace("\"","\'").strip() 
		except:
			pass
		item['productAddres'] = ""
		item['productCompany'] = ""
		names = self.name+'.json'
		item['fileName'] = names
		list_details = response.xpath("//div[@id='para']/table[1]/tbody[2]/tr/td/text()").extract()
		logging.info("-------list_details_len=%i" %len(list_details))
		list_pack = response.xpath("//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()").extract()
		intro = response.xpath("//span[@id='PDescriptiion']/text()").extract()
		logging.info("-------intr_len=%i" %len(intro))
		speci = response.xpath("//span[@id='techParam']/text()").extract()
		logging.info("-------intr_len=%i" %len(speci))
		num_one=1
		for value_details in list_details :
			value_details = value_details.encode('utf-8').replace(":","\/").replace("\n","").replace("\"","").strip()
			if num_one%2==1 :
				data2 = {}
				data2['attrkey'] = ''
				data2['keyname'] = ''
				if '品牌' in value_details:
					num_one=0
					continue
				if '型号' in value_details:
					num_one=0
					continue
				data2['attrkey']=value_details
			else:
				if num_one ==0:
					num_one = 1
					continue
				data2['keyname']=value_details
				details_list.append(data2)
			num_one+=1

		num_two=1
		for list_intro in intro:
			list_intro = list_intro.encode('utf-8').replace("\n","").replace("\"","").strip()
			list_intro = list_intro.split(':')
			for value_intro in list_intro :
				if num_two%2==1 :
					data2 = {}
					data2['attrkey'] = ''
					data2['keyname'] = ''
					if '商品名称' in value_intro:
						break
					if '品牌' in value_intro:
						break
					data2['attrkey']=value_intro
				else:
					data2['keyname']=value_intro
					intro_list.append(data2)
				num_two+=1

		num_three=1
		for list_speci in speci:
			list_speci = list_speci.encode('utf-8').replace("\n","").replace("\"","").strip()
			list_speci = list_speci.split(':')
			for value_speci in list_speci :
				if num_three%2==1 :
					data2 = {}
					data2['attrkey'] = ''
					data2['keyname'] = ''
					if '商品名称' in value_speci:
						break
					if '品牌' in value_speci:
						break
					data2['attrkey']=value_speci
				else:
					data2['keyname']=value_speci
					speci_list.append(data2)
				num_three+=1

		product_intro = response.xpath("//div[@class='prodesc']/p/span/text()").extract()
		product_pack = response.xpath("//td[@id='imgDiv']/div[@id='div3']/font/b/text()").extract()
		product_speci = response.xpath("//div[@class='fn-fr']/div[@class='info']/p/text()").extract()
		filename = self.name+".txt"
		file = open("data/"+filename, 'a+')
		file.write("\n"+"productUrl:"+response.url+"\n")
		file.write("productIntro:"+"\n")
		for intro in product_intro:
			intro = intro.encode('utf-8').replace("\b","").replace("<br/>","").replace("<br>","").strip()
			file.write(intro+"\n")
		file.write("productSpeci:"+"\n")
		for speci in product_speci:
			speci = speci.encode('utf-8').replace("\"","").strip()
			file.write(speci+"\n")
		file.close()

		item['productSpeci'] = speci_list
		item['productPack'] = pack_list
		item['productIntro'] = intro_list
		item['productDetails'] = details_list
		yield item
    def parse(self, response):
        item = BaseItem()
        speci_list = []
        pack_list = []
        intro_list = []
        details_list = []
        item['productUrl'] = ''
        item['productName'] = ''
        item['productBrand'] = ''
        item['productModel'] = ''
        item['productClassification'] = ''
        item['productPrice'] = ''
        item['productImagePath'] = ''
        item['productAddres'] = ""
        item['productCompany'] = ''
        item['fileName'] = ''
        item['productDetails'] = ""
        item['productPack'] = ""
        item['productIntro'] = ""
        item['productSpeci'] = ""
        classification_one = ''
        classification_two = ''
        classification_three = ''
        try:
            classification_one = response.xpath(
                "//div[@id='Public_breadCrumb01-110']/a[@class='arrowbg1']/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
            classification_two = response.xpath(
                "//div[@id='Public_breadCrumb01-110']/a[@class='arrowbg2']/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
            classification_three = response.xpath(
                "//div[@id='Public_breadCrumb01-110']/a[@class='arrowbg3']/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        classification = classification_one + '|||' + classification_two + '|||' + classification_three
        try:
            item['productUrl'] = response.url
        except:
            pass
        try:
            item['productName'] = response.xpath(
                "//div[@id='ProductDetail_basic01-101']/h1[@class='htmlinline']/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productBrand'] = response.xpath(
                '//ul[@class="list_pic"]/li/dl/dt/a/text()').extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productModel'] = response.xpath(
                "//li[@class='number']/em/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productClassification'] = classification
        except:
            pass
        try:
            # strip whitespace, convert yuan to fen, drop the RMB symbol
            item['productPrice'] = response.xpath(
                "//li[@class='retailprice']/em/strong").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productPrice'] = str(
                float(
                    filter(lambda ch: ch in '0123456789.~',
                           item['productPrice'])) * 100)
        except:
            pass
        # image URL
        try:
            item[
                'productImagePath'] = "http://www.ieou.com.cn" + response.xpath(
                    '//div[@class="jqzoom"]/img/@src').extract()[0].encode(
                        'utf-8').replace("\"", "\'").strip()
        except:
            pass
        #print item['image_urls'],"777777"
        try:
            item['productAddres'] = response.xpath(
                "//div[@id='pdetail']/div[@class='d-vopy']/table/tbody/tr[4]/td/text()"
            ).extract()[0]
        except:
            pass
        try:
            item['productCompany'] = ""
        except:
            pass
        names = self.name + '.json'
        try:
            item['fileName'] = names
        except:
            pass
        list_details = response.xpath(
            "//div[@class='extend']/ul/li/label/text()").extract()
        logging.info("-------list_details_len=%i" % len(list_details))
        details = response.xpath(
            "//div[@class='extend']/ul/li/span/text()").extract()
        logging.info("-------details_len=%i" % len(details))
        list_pack = response.xpath(
            "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
        ).extract()
        intro = response.xpath(
            "//div[@class='describe htmledit']/ul/li/span/span/text()"
        ).extract()
        logging.info("-------intro_len=%i" % len(intro))
        speci = response.xpath(
            "//div[@class='describe htmledit']/ul/li/span/span/span/text()"
        ).extract()
        logging.info("-------speci_len=%i" % len(speci))
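        # Pair each label from the 'extend' list with the value at the same
        # index in `details`; rows mentioning 品牌 (brand) or 型号 (model) are
        # skipped here, presumably because those values are handled by the
        # dedicated productBrand/productModel fields above.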
        num_one = 0
        for value_details in list_details:
            value_details = value_details.encode('utf-8').replace(
                ":", "").replace("\n", "").replace("\"", "").strip()
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            if '品牌' in value_details:
                num_one += 1
                continue
            if '型号' in value_details:
                num_one += 1
                continue
            data2['attrkey'] = value_details
            data2['keyname'] = details[num_one].encode('utf-8').replace(
                ":", "").replace("\n", "").replace("\"", "").strip()
            details_list.append(data2)
            num_one += 1

        num_two = 1
        for list_intro in intro:
            list_intro = list_intro.encode('utf-8').replace("\n", "").replace(
                "\"", "").strip()
            list_intro = list_intro.split(':')
            for value_intro in list_intro:
                if num_two % 2 == 1:
                    data2 = {}
                    data2['attrkey'] = ''
                    data2['keyname'] = ''
                    if '商品名称' in value_intro:
                        break
                    if '品牌' in value_intro:
                        break
                    data2['attrkey'] = value_intro
                else:
                    data2['keyname'] = value_intro
                    intro_list.append(data2)
                num_two += 1

        num_three = 1
        for list_speci in speci:
            list_speci = list_speci.encode('utf-8').replace("\n", "").replace(
                "\"", "").strip()
            list_speci = list_speci.split(':')
            for value_speci in list_speci:
                if num_three % 2 == 1:
                    data2 = {}
                    data2['attrkey'] = ''
                    data2['keyname'] = ''
                    if '商品名称' in value_speci:
                        break
                    if '品牌' in value_speci:
                        break
                    data2['attrkey'] = value_speci
                else:
                    data2['keyname'] = value_speci
                    speci_list.append(data2)
                num_three += 1

        product_intro = response.xpath(
            "//div[@class='describe htmledit']/ul/li/span/span/text()"
        ).extract()
        product_details = response.xpath(
            "//div[@class='describe htmledit']/p/span/text()").extract()
        product_speci = response.xpath(
            "//div[@class='describe htmledit']/p/span/strong/text()").extract(
            )
        if len(product_speci) == 0:
            product_speci = response.xpath(
                "//div[@class='describe htmledit']/p/strong/span/text()"
            ).extract()
        product_pack = response.xpath(
            "//td[@id='imgDiv']/div[@id='div3']/font/b/text()").extract()
        filename = self.name + ".txt"
        file = open("data/" + filename, 'a+')
        file.write("\n" + "productUrl:" + response.url + "\n")
        file.write("productIntro:" + "\n")
        for intro in product_intro:
            intro = intro.encode('utf-8').replace("\b", "").replace(
                "<br/>", "").replace("<br>", "").strip()
            file.write(intro + "\n")
        file.write("productDetails:" + "\n")
        for details in product_details:
            details = details.encode('utf-8').replace("\"", "").strip()
            file.write(details + "\n")
        file.write("productSpeci:" + "\n")
        for speci in product_speci:
            speci = speci.encode('utf-8').replace("\"", "").strip()
            file.write(speci + "\n")
        file.close()

        item['productSpeci'] = speci_list
        item['productPack'] = pack_list
        item['productIntro'] = intro_list
        item['productDetails'] = details_list
        yield item
Example #24
    def parse(self, response):
        item = BaseItem()
        speci_list = []
        pack_list = []
        intro_list = []
        details_list = []
        item['productUrl'] = ''
        item['productName'] = ''
        item['productBrand'] = ''
        item['productModel'] = ''
        item['productClassification'] = ''
        item['productPrice'] = ''
        item['productImagePath'] = ''
        item['productAddres'] = ""
        item['productCompany'] = ''
        item['fileName'] = ''
        item['productDetails'] = ""
        item['productPack'] = ""
        item['productIntro'] = ""
        item['productSpeci'] = ""
        classification_one = ''
        classification_two = ''
        classification_three = ''
        print "PhantomJS is starting..."
        driver = webdriver.PhantomJS()
        driver.get(response.url)
        time.sleep(3)
        body = driver.page_source
        HtmlResponses = HtmlResponse(driver.current_url,
                                     body=body,
                                     encoding='utf-8',
                                     request=response)
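        # The page is first rendered with PhantomJS and the rendered HTML is
        # wrapped in an HtmlResponse, so the usual Scrapy XPath selectors can
        # be applied to JavaScript-generated content (name, price, image, specs).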
        try:
            item['productUrl'] = response.url
        except:
            pass
        try:
            item['productName'] = HtmlResponses.xpath(
                "//div[@class='panel-heading panel-heading-div']/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            classification_one = response.xpath(
                "//div[@class='crumbs']/span[2]/a/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            classification_two = response.xpath(
                "//div[@class='crumbs']/span[3]/a/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            classification_three = response.xpath(
                "//div[@class='crumbs']/span[4]/a/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        classification = classification_one + '|||' + classification_two + '|||' + classification_three
        item['productClassification'] = classification
        try:
            item['productPrice'] = HtmlResponses.xpath(
                "//font[@class='price-font'][2]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productPrice'] = str(
                float(
                    filter(lambda ch: ch in '0123456789.~',
                           item['productPrice'])) * 100)
        except:
            pass
        try:
            item['productImagePath'] = HtmlResponses.xpath(
                "//img[@id='zoomimg']/@src").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productAddres'] = ""
        except:
            pass
        try:
            item['productCompany'] = ""
        except:
            pass
        names = self.name + '.json'
        try:
            item['fileName'] = names
        except:
            pass
        list_speci_tr = HtmlResponses.xpath(
            "//div[@class='col-xs-12 nopadding border-ccc attrDiv']/div/b/text()"
        ).extract()
        list_speci_th = HtmlResponses.xpath(
            "//div[@class='col-xs-12 nopadding border-ccc attrDiv']/div/p/text()"
        ).extract()
        list_detail = HtmlResponses.xpath(
            "//div[@id='prd-desc-mdeditor']/p/text()").extract()
        driver.close()
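        # Walk the <b> labels and <p> values of the attribute table in
        # parallel: rows labelled 型号 (model) or 品牌 (brand) fill the
        # dedicated item fields, every other row becomes an attrkey/keyname
        # entry in speci_list.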
        num_one = 0
        for value_speci in list_speci_tr:
            data2 = {}
            value_speci = value_speci.encode('utf-8').replace(
                ":", "\/").replace("\n", "").replace("\"", "").strip()
            if "型号" in value_speci:
                item['productModel'] = list_speci_th[num_one]
                num_one += 1
                continue
            if "品牌" in value_speci:
                item['productBrand'] = list_speci_th[num_one]
                num_one += 1
                continue
            data2['attrkey'] = value_speci
            data2['keyname'] = list_speci_th[num_one]
            speci_list.append(data2)
            num_one += 1
        #11.28
        brand = ''
        try:
            brand = HtmlResponses.xpath(
                "//div[@class='form-group margin-left_53 margin-bottom-0'][1]/div/p/font/text()"
            ).extract()[0].encode('utf-8').replace("\t", "").replace(
                "\n", "").replace("\b", "").strip()
            if '/' in brand and item['productBrand'] == '':
                item['productBrand'] = brand.split('/')[0]
            else:
                item['productBrand'] = brand
        except:
            pass
        model = ''
        try:
            model = HtmlResponses.xpath(
                "//div[@class='form-group margin-left_53 margin-bottom-0'][1]/div/p/font/text()"
            ).extract()[0].encode('utf-8').replace("\t", "").replace(
                "\n", "").replace("\b", "").strip()
            if '/' in model and item['productModel'] == '':
                item['productModel'] = model.split('/')[1]
        except:
            pass
        #11.28
        detail = ' '
        for value_detail in list_detail:
            value_detail = value_detail.encode('utf-8').replace(
                "\t",
                "").replace("\n",
                            "").replace("\b",
                                        "").replace("<br>",
                                                    "").replace("</br>",
                                                                "").strip()
            detail += value_detail
        details_list.append(detail)

        item['productSpeci'] = speci_list
        item['productPack'] = pack_list
        item['productIntro'] = intro_list
        item['productDetails'] = details_list
        yield item
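
# Editor's note: the price handling above keeps only digits, '.' and '~' and
# multiplies by 100 to store the amount in fen. A hypothetical helper with the
# same behaviour (Python 2 `filter` returning a string), shown only for
# illustration:
def price_to_fen(raw_price):
    """Convert a scraped yuan price string such as '¥12.50' to fen, or '' on failure."""
    try:
        digits = filter(lambda ch: ch in '0123456789.~', raw_price)
        return str(float(digits) * 100)
    except ValueError:
        # empty string or a range like '10~20' cannot be converted
        return ''
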
Example #25
    def parse(self, response):
        item = BaseItem()
        speci_list = []
        pack_list = []
        intro_list = []
        details_list = []
        item['productUrl'] = ''
        item['productName'] = ''
        item['productBrand'] = ''
        item['productModel'] = ''
        item['productClassification'] = ''
        item['productPrice'] = ''
        item['productImagePath'] = ''
        item['productAddres'] = ""
        item['productCompany'] = ''
        item['fileName'] = ''
        item['productDetails'] = ""
        item['productPack'] = ""
        item['productIntro'] = ""
        item['productSpeci'] = ""
        classification_one = ''
        classification_two = ''
        classification_three = ''
        #print "PhantomJS is starting1..."
        #driver = webdriver.PhantomJS()
        #driver.get(response.url)
        #time.sleep(3)
        #body = driver.page_source
        #HtmlResponses = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=response)
        try:
            item['productUrl'] = response.url
        except:
            pass
        try:
            item['productName'] = response.xpath(
                "//span[@id='productMainName']/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            classification_one = response.xpath(
                "//div[@class='bread-crumb']/a[2]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            classification_two = response.xpath(
                "//div[@class='bread-crumb']/a[3]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            classification_three = response.xpath(
                "//div[@class='bread-crumb']/a[4]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        classification = classification_one + '|||' + classification_two + '|||' + classification_three
        item['productClassification'] = classification
        try:
            item['productPrice'] = response.xpath(
                "//strong[@class='prodet']/b/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productPrice'] = str(
                float(
                    filter(lambda ch: ch in '0123456789.~',
                           item['productPrice'])) * 100)
        except:
            pass
        try:
            imagePath = response.xpath("//img[@id='productImg']/@src").extract(
            )[0].encode('utf-8').replace("\"", "\'").strip()
            item['productImagePath'] = 'http://mro.abiz.com' + imagePath
        except:
            pass
        try:
            item['productAddres'] = ""
        except:
            pass
        try:
            item['productCompany'] = ""
        except:
            pass
        names = self.name + '.json'
        try:
            item['fileName'] = names
        except:
            pass
        list_speci_tr = response.xpath(
            "//div[@class='col-xs-12 nopadding border-ccc attrDiv']/div/b/text()"
        ).extract()
        list_speci_th = response.xpath(
            "//div[@class='col-xs-12 nopadding border-ccc attrDiv']/div/p/text()"
        ).extract()
        list_detail = response.xpath(
            "//div[@id='tbc_11']/div[@class='intro_box']").extract()
        logging.info("----------list_detail_len=%i" % len(list_detail))
        list_intro = response.xpath(
            "//div[@id='tbc_13']/div[@class='intro_box']").extract()
        logging.info("----------list_intro_len=%i" % len(list_intro))
        num_one = 0
        for value_speci in list_speci_tr:
            data2 = {}
            value_speci = value_speci.encode('utf-8').replace(
                ":", "\/").replace("\n", "").replace("\"", "").strip()
            if "型号" in value_speci:
                item['productModel'] = list_speci_th[num_one]
                num_one += 1
                continue
            if "品牌" in value_speci:
                item['productBrand'] = list_speci_th[num_one]
                num_one += 1
                continue
            data2['attrkey'] = value_speci
            data2['keyname'] = list_speci_th[num_one]
            speci_list.append(data2)
            num_one += 1
        #11.28
        brand = ''
        try:
            brand = response.xpath(
                "//dl[@class='pro-info-prop pro-info-brand']/dd[@class='pro-info-cons']/text()"
            ).extract()[0].encode('utf-8').replace("\t", "").replace(
                "\n", "").replace("\b", "").replace("\r", "").strip()
            item['productBrand'] = brand
        except:
            pass
        model = ''
        try:
            model = response.xpath(
                "//dl[@class='pro-info-prop pro-info-model']/dd[@class='pro-info-cons']/text()"
            ).extract()[0].encode('utf-8').replace("\t", "").replace(
                "\n", "").replace("\b", "").strip()
            item['productModel'] = model
        except:
            pass
        #11.28
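        # list_detail holds raw HTML fragments of the description tab; the
        # regex below strips the remaining tags so only the visible text is
        # kept in details_list.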
        for value_detail in list_detail:
            value_detail = value_detail.encode('utf-8').replace(
                "\t", "").replace("\n", "").replace("\b", "").replace(
                    "<br>", "").replace("</br>", "").replace("\r", "").strip()
            dr = re.compile(r'<[^>]+>', re.S)
            dd_value_detail = dr.sub('', value_detail)
            details_list.append(dd_value_detail)
        cancel = ''
        try:
            cancel_l = response.xpath(
                "//p[@class='link-detail']/text()").extract()
            for cancel_s in cancel_l:
                cancel_s = cancel_s.encode('utf-8').replace("\t", "").replace(
                    "\n", "").replace("\b", "").replace("<br>", "").replace(
                        "</br>", "").replace("\r", "").strip()
                cancel += cancel_s
        except:
            pass
        for value_intro in list_intro:
            value_intro = value_intro.encode('utf-8').replace(
                "\t", "").replace("\n", "").replace("\b", "").replace(
                    "<br>", "").replace("</br>", "").replace("\r", "").strip()
            value_intro = value_intro.replace(cancel, '')
            dr = re.compile(r'<[^>]+>', re.S)
            dd_value_intro = dr.sub('', value_intro)
            intro_list.append(dd_value_intro)

        item['productSpeci'] = speci_list
        item['productPack'] = pack_list
        item['productIntro'] = intro_list
        item['productDetails'] = details_list
        yield item
Example #26
    def parse(self, response):
        item = BaseItem()
        speci_list = []
        pack_list = []
        intro_list = []
        details_list = []
        item['productUrl'] = ''
        item['productName'] = ''
        item['productBrand'] = ''
        item['productModel'] = ''
        item['productClassification'] = ''
        item['productPrice'] = ''
        item['productImagePath'] = ''
        item['productAddres'] = ""
        item['productCompany'] = ''
        item['fileName'] = ''
        item['productDetails'] = ""
        item['productPack'] = ""
        item['productIntro'] = ""
        item['productSpeci'] = ""
        classification_one = ''
        classification_two = ''
        classification_three = ''
        try:
            classification_one = response.xpath(
                "//div[@class='subNav']/a[3]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            classification_two = response.xpath(
                "//div[@class='subNav']/a[4]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            classification_three = response.xpath(
                "//div[@class='subNav']/a[5]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        classification = classification_one + '|||' + classification_two + '|||' + classification_three
        try:
            item['productUrl'] = response.url
        except:
            pass
        try:
            item['productName'] = response.xpath(
                "//div[@class='proDiv']/dl[@class='proDl']/dt/b/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productBrand'] = response.xpath(
                "//div[@class='proDiv']/dl[@class='proDl']/dd[3]/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
            item['productBrand'] = item['productBrand'].replace('产品品牌:', '')
        except:
            pass
        try:
            item['productModel'] = response.xpath(
                "//div[@class='proDiv']/dl[@class='proDl']/dd[2]/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
            item['productModel'] = item['productModel'].replace('原始型号:', '')
        except:
            pass
        try:
            item['productClassification'] = classification
        except:
            pass
        try:
            # strip whitespace, convert yuan to fen, drop the RMB symbol
            item['productPrice'] = response.xpath(
                "//div[@class='proDiv']/dl[@class='proDl']/dd[4]/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
            item['productPrice'] = item['productPrice'].replace('价格:', '')
            if item['productPrice'] == '询价':
                item['productPrice'] = 0.0
        except:
            pass
        try:
            item['productPrice'] = str(
                float(
                    filter(lambda ch: ch in '0123456789.~',
                           item['productPrice'])) * 100)
        except:
            pass
        # image URL
        try:
            item[
                'productImagePath'] = "http://www.btone-mro.com" + response.xpath(
                    "//img[@id='ctl00_ContentPlaceHolder1_imgMain']/@src"
                ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        #print item['image_urls'],"777777"
        try:
            item['productAddres'] = response.xpath(
                "//form[@id='form1']/ul/li[4]/text()").extract()[0]
        except:
            pass
        try:
            item['productCompany'] = ""
        except:
            pass
        names = self.name + '.json'
        try:
            item['fileName'] = names
        except:
            pass
        list_details = response.xpath(
            "//div[@id='para']/table[1]/tbody[2]/tr/td/text()").extract()
        logging.info("-------list_details_len=%i" % len(list_details))
        list_pack = response.xpath(
            "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
        ).extract()
        intro = response.xpath("//span[@id='PDescription']/text()").extract()
        logging.info("-------intro_len=%i" % len(intro))
        speci = response.xpath("//span[@id='techParam']/text()").extract()
        logging.info("-------speci_len=%i" % len(speci))
        num_one = 1
        for value_details in list_details:
            value_details = value_details.encode('utf-8').replace(
                ":", "\/").replace("\n", "").replace("\"", "").strip()
            if num_one % 2 == 1:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                if '品牌' in value_details:
                    num_one = 0
                    continue
                if '型号' in value_details:
                    num_one = 0
                    continue
                data2['attrkey'] = value_details
            else:
                if num_one == 0:
                    num_one = 1
                    continue
                data2['keyname'] = value_details
                details_list.append(data2)
            num_one += 1

        num_two = 1
        for list_intro in intro:
            list_intro = list_intro.encode('utf-8').replace("\n", "").replace(
                "\"", "").strip()
            list_intro = list_intro.split(':')
            for value_intro in list_intro:
                if num_two % 2 == 1:
                    data2 = {}
                    data2['attrkey'] = ''
                    data2['keyname'] = ''
                    if '商品名称' in value_intro:
                        break
                    if '品牌' in value_intro:
                        break
                    data2['attrkey'] = value_intro
                else:
                    data2['keyname'] = value_intro
                    intro_list.append(data2)
                num_two += 1

        num_three = 1
        for list_speci in speci:
            list_speci = list_speci.encode('utf-8').replace("\n", "").replace(
                "\"", "").strip()
            list_speci = list_speci.split(':')
            for value_speci in list_speci:
                if num_three % 2 == 1:
                    data2 = {}
                    data2['attrkey'] = ''
                    data2['keyname'] = ''
                    if '商品名称' in value_speci:
                        break
                    if '品牌' in value_speci:
                        break
                    data2['attrkey'] = value_speci
                else:
                    data2['keyname'] = value_speci
                    speci_list.append(data2)
                num_three += 1

        product_intro = response.xpath(
            "//div[@class='proNavInfo proNav1']/p[2]/text()").extract()
        product_pack = response.xpath(
            "//div[@class='proNavInfo proNav3']/p/text()").extract()
        product_speci = response.xpath(
            "//div[@class='proNavInfo proNav2']/p/text()").extract()
        filename = self.name + ".txt"
        file = open("data/" + filename, 'a+')
        file.write("\n" + "productUrl:" + response.url + "\n")
        file.write("productIntro:" + "\n")
        for intro in product_intro:
            intro = intro.encode('utf-8').replace("\b", "").replace(
                "<br/>", "").replace("<br>", "").strip()
            file.write(intro + "\n")
        file.write("productPack:" + "\n")
        for pack in product_pack:
            pack = pack.encode('utf-8').replace("\"", "").strip()
            file.write(pack + "\n")
        file.write("productSpeci:" + "\n")
        for speci in product_speci:
            speci = speci.encode('utf-8').replace("\"", "").strip()
            file.write(speci + "\n")
        file.close()

        item['productSpeci'] = speci_list
        item['productPack'] = pack_list
        item['productIntro'] = intro_list
        item['productDetails'] = details_list
        yield item
Example #27
    def parse(self, response):
        item = BaseItem()
        speci_list = []
        pack_list = []
        intro_list = []
        details_list = []
        item['productUrl'] = ''
        item['productName'] = ''
        item['productBrand'] = ''
        item['productModel'] = ''
        item['productClassification'] = ''
        item['productPrice'] = ''
        item['productImagePath'] = ''
        item['productAddres'] = ""
        item['productCompany'] = ''
        item['fileName'] = ''
        item['productDetails'] = ""
        item['productPack'] = ""
        item['productIntro'] = ""
        item['productSpeci'] = ""
        classification_one = ''
        classification_two = ''
        classification_three = ''
        try:
            item['productUrl'] = response.url
        except:
            pass
        try:
            item['productName'] = response.xpath(
                "//div[@class='center_title']/a/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productBrand'] = response.xpath(
                "//form[@id='ECS_FORMBUY_P']/div[@class='detail_center']/div[@class='center_txt']/div[@class='center_text']/p[1]/a/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productModel'] = response.xpath(
                '//div[@class="m m1"]/div/ul/dt/li/text()').extract(
                )[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            classification_one = response.xpath(
                "//div[@class='location']/a[2]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            classification_two = response.xpath(
                "//div[@class='location']/a[3]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            classification_three = response.xpath(
                "//div[@class='location']/a[4]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        classification = classification_one + '|||' + classification_two + '|||' + classification_three
        item['productClassification'] = classification
        try:
            item['productPrice'] = response.xpath(
                "//div[@class='center_text']/ul[@class='tm-fcs-panel']/li[1]/span[@id='ECS_GOODS_AMOUNT']/span/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productPrice'] = str(
                float(
                    filter(lambda ch: ch in '0123456789.~',
                           item['productPrice'])) * 100)
        except:
            pass
        try:
            item[
                'productImagePath'] = "http://www.1ez.com.cn/" + response.xpath(
                    '//img[@id="J_prodImg"]/@src').extract()[0].encode(
                        'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productAddres'] = ""
        except:
            pass
        try:
            item['productCompany'] = ""
        except:
            pass
        names = self.name + '.json'
        try:
            item['fileName'] = names
        except:
            pass
        list_details = response.xpath(
            "//ul[@class='inLeft_attributes']/li/text()").extract()
        details = response.xpath(
            "//ul[@class='inLeft_attributes']/li/span/text()").extract()
        logging.info("-------list_details_len=%i" % len(list_details))
        logging.info("-------details_len=%i" % len(details))
        list_pack = response.xpath(
            "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
        ).extract()
        intro = response.xpath("//span[@id='PDescription']/text()").extract()
        logging.info("-------intro_len=%i" % len(intro))
        speci = response.xpath("//span[@id='techParam']/text()").extract()
        logging.info("-------speci_len=%i" % len(speci))
        num_one = 0
        value_details = ''
        for j in range(1, len(list_details) + 1):
            try:
                value_details = response.xpath(
                    "//ul[@class='inLeft_attributes']/li[%i]/text()" %
                    j).extract()[0].encode('utf-8').replace(":", "").replace(
                        "\"", "").replace(":", "").strip()
            except:
                break
            if '品牌' in value_details:
                num_one += 1
                continue
            if '商品名称' in value_details:
                num_one += 1
                continue
            if '型号' in value_details:
                num_one += 1
                continue
            else:
                data2 = {}
                data2['attrkey'] = ''
                data2['keyname'] = ''
                data2['attrkey'] = value_details
                data2['keyname'] = details[num_one]
                details_list.append(data2)
            num_one += 1

        num_two = 1
        for list_intro in intro:
            list_intro = list_intro.encode('utf-8').replace("\n", "").replace(
                "\"", "").strip()
            list_intro = list_intro.split(':')
            for value_intro in list_intro:
                if num_two % 2 == 1:
                    data2 = {}
                    data2['attrkey'] = ''
                    data2['keyname'] = ''
                    if '商品名称' in value_intro:
                        break
                    if '品牌' in value_intro:
                        break
                    data2['attrkey'] = value_intro
                else:
                    data2['keyname'] = value_intro
                    intro_list.append(data2)
                num_two += 1

        num_three = 1
        for list_speci in speci:
            list_speci = list_speci.encode('utf-8').replace("\n", "").replace(
                "\"", "").strip()
            list_speci = list_speci.split(':')
            for value_speci in list_speci:
                if num_three % 2 == 1:
                    data2 = {}
                    data2['attrkey'] = ''
                    data2['keyname'] = ''
                    if '商品名称' in value_speci:
                        break
                    if '品牌' in value_speci:
                        break
                    data2['attrkey'] = value_speci
                else:
                    data2['keyname'] = value_speci
                    speci_list.append(data2)
                num_three += 1

        item['productSpeci'] = speci_list
        item['productPack'] = pack_list
        item['productIntro'] = intro_list
        item['productDetails'] = details_list
        yield item
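
# Editor's note: the num_two/num_three loops above split each extracted text on
# ':' and alternate between 'attrkey' and 'keyname'. A hypothetical, roughly
# equivalent helper (Python 2), skipping name/brand rows the way the originals
# do; it assumes each entry holds a single "key:value" pair:
def split_key_value_pairs(texts, skip_keys=('商品名称', '品牌')):
    """Turn ['k1:v1', 'k2:v2', ...] into [{'attrkey': k, 'keyname': v}, ...]."""
    pairs = []
    for text in texts:
        text = text.encode('utf-8').replace("\n", "").replace("\"", "").strip()
        parts = text.split(':')
        if len(parts) < 2:
            # no key/value separator on this line; skip it
            continue
        key, value = parts[0], ':'.join(parts[1:])
        if any(skip in key for skip in skip_keys):
            continue
        pairs.append({'attrkey': key, 'keyname': value})
    return pairs
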
    def parse(self, response):
        item = BaseItem()
        speci_list = []
        pack_list = []
        intro_list = []
        details_list = []
        item['productUrl'] = ''
        item['productName'] = ''
        item['productBrand'] = ''
        item['productModel'] = ''
        item['productClassification'] = ''
        item['productPrice'] = ''
        item['productImagePath'] = ''
        item['productAddres'] = ""
        item['productCompany'] = ''
        item['fileName'] = ''
        item['productDetails'] = ""
        item['productPack'] = ""
        item['productIntro'] = ""
        item['productSpeci'] = ""
        classification_one = ''
        classification_two = ''
        classification_three = ''
        try:
            classification_one = response.xpath(
                "//div[@class='siteUrl']/a[2]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            classification_two = response.xpath(
                "//div[@class='siteUrl']/a[3]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
            classification_three = response.xpath(
                "//div[@class='siteUrl']/a[4]/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        classification = classification_one + '|||' + classification_two + '|||' + classification_three
        print "PhantomJS is starting..."
        driver = webdriver.PhantomJS()
        driver.get(response.url)
        #time.sleep(3)
        body = driver.page_source
        HtmlResponses = HtmlResponse(driver.current_url,
                                     body=body,
                                     encoding='utf-8',
                                     request=response)
        driver.close()
        try:
            item['productUrl'] = response.url
        except:
            pass
        try:
            item['productName'] = response.xpath(
                "//div[@class='hd']/div/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productBrand'] = response.xpath(
                "//div[@class='dd']/em/a/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productModel'] = response.xpath(
                "//form[@id='ECS_FORMBUY']/div[@class='proInfo f_R']/div[@class='bd']/ul/li[2]/div[@class='dd']/em/text()"
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productClassification'] = classification
        except:
            pass
        try:
            # strip whitespace, convert yuan to fen, drop the RMB symbol
            item['productPrice'] = HtmlResponses.xpath(
                "//b[@id='ECS_GOODS_AMOUNT']/text()").extract()[0].encode(
                    'utf-8').replace("\"", "\'").strip()
        except:
            pass
        try:
            item['productPrice'] = str(
                float(
                    filter(lambda ch: ch in '0123456789.~',
                           item['productPrice'])) * 100)
        except:
            pass
        # image URL
        try:
            item['productImagePath'] = "http://www.huaaomro.com/" + HtmlResponses.xpath(
                '//div[@class="proSide f_L"]/div[@class="bd"]/img[@id="idImage2"]/@src'
            ).extract()[0].encode('utf-8').replace("\"", "\'").strip()
        except:
            pass
        #print item['image_urls'],"777777"
        try:
            item['productAddres'] = response.xpath(
                "//div[@id='pdetail']/div[@class='d-vopy']/table/tbody/tr[4]/td/text()"
            ).extract()[0]
        except:
            pass
        try:
            item['productCompany'] = ""
        except:
            pass
        names = self.name + '.json'
        try:
            item['fileName'] = names
        except:
            pass

        list_details = response.xpath(
            "//div[@class='d-vopy']/table/tr/th/h4/text()").extract()
        logging.info("-------list_details_len=%i" % len(list_details))
        details = response.xpath(
            "//div[@class='d-vopy']/table/tr/td/text()").extract()
        logging.info("-------details_len=%i" % len(details))
        list_pack = response.xpath(
            "//div[@class='packageParameter tabContent']/div[@class='specsParameter-wrap']/table[@class='standardTable']/tbody/tr[@class='keyValue']/td/text()"
        ).extract()
        list_intro = response.xpath(
            "//ul[@class='detail-list clearfix']/li/text()").extract()
        logging.info("-------list_intro_len=%i" % len(list_intro))
        intro = response.xpath(
            "//div[@class='goods']/table[@class='goods-items']/tr[2]/td/text()"
        ).extract()
        logging.info("-------intro_len=%i" % len(intro))
        speci = response.xpath("//span[@id='techParam']/text()").extract()
        logging.info("-------speci_len=%i" % len(speci))
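        # Pair each <h4> label with the <td> value at the same index; brand,
        # price and supplier rows are not appended, and the loop stops once
        # the warranty (保修期) row is reached.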
        num_one = 0
        for list_details_value in list_details:
            list_details_value = list_details_value.encode('utf-8').replace(
                "\n", "").replace("\"", "").strip()
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            if '品牌' in list_details_value:
                num_one += 1
                continue
            if '价格' in list_details_value:
                num_one += 1
                continue
            if '供应商' in list_details_value:
                num_one = 0
                continue
            if '保修期' in list_details_value:
                break
            data2['attrkey'] = list_details_value
            data2['keyname'] = details[num_one]
            details_list.append(data2)
            num_one += 1

        for list_intro_value in list_intro:
            list_intro_value = list_intro_value.encode('utf-8').replace(
                "\n", "").strip()
            intro = list_intro_value.split(':')
            data2 = {}
            data2['attrkey'] = ''
            data2['keyname'] = ''
            if '商品品牌' in intro[0]:
                continue
            if '商品型号' in intro[0]:
                continue
            if '商品名称' in intro[0]:
                continue
            data2['attrkey'] = intro[0]
            data2['keyname'] = intro[1]
            intro_list.append(data2)

        num_three = 1
        for list_speci in speci:
            list_speci = list_speci.encode('utf-8').replace("\n", "").replace(
                "\"", "").strip()
            list_speci = list_speci.split(':')
            for value_speci in list_speci:
                if num_three % 2 == 1:
                    data2 = {}
                    data2['attrkey'] = ''
                    data2['keyname'] = ''
                    if '商品名称' in value_speci:
                        break
                    if '品牌' in value_speci:
                        break
                    data2['attrkey'] = value_speci
                else:
                    data2['keyname'] = value_speci
                    speci_list.append(data2)
                num_three += 1

        item['productSpeci'] = speci_list
        item['productPack'] = pack_list
        item['productIntro'] = intro_list
        item['productDetails'] = details_list
        yield item