예제 #1
0
    def parse_js_item(self, response):
        """Parse the "js item" response and yield the completed item.

        The response body is decoded as UTF-8 and parsed with html5lib.
        The item carried in ``response.meta['item']`` gets its
        ``'other_parameter'`` value replaced by a merged dict built from:

        * the existing ``item['other_parameter']`` dict, extended with a
          ``'price'`` entry (text of the ``<span>`` directly under
          ``<p class="proPrice">``);
        * whatever ``Parse_Util.make_up_dic`` returns for
          ``<div id="skuInfo">``;
        * whatever ``Parse_Util.structure_parameter_dic`` returns for
          ``<p class="proItem">`` with ``u':'`` as its second argument
          (presumably a key/value separator -- confirm in Parse_Util).

        On key collisions later sources win, in the order listed above.

        Yields:
            The updated item.
        """
        markup = response.body.decode('UTF-8')
        soup = BeautifulSoup(markup, "html5lib")
        item = response.meta['item']
        parameter_dic = item['other_parameter']

        # Only a <span> that is a direct child of the price paragraph counts.
        price_paragraph = soup.find('p', class_='proPrice')
        parameter_dic['price'] = price_paragraph.find(
            'span', recursive=False).text

        item_no_dic = Parse_Util.structure_parameter_dic(
            [soup.find('p', class_='proItem')], u':')
        sku_dic = Parse_Util.make_up_dic(soup.find('div', id='skuInfo'))

        # Merge in two steps; each step lets the right-hand dict override.
        merged_dic = dict(parameter_dic, **sku_dic)
        merged_dic = dict(merged_dic, **item_no_dic)
        item['other_parameter'] = merged_dic

        yield item
예제 #2
0
    def parse_jd_item(self, response):
        """Parse a regular JD product page and request its comments.

        Builds a ``JDMMItem`` (title, product name, detail dict, brand),
        stores it in ``item['other_parameter']`` and yields a ``Request``
        for the comment URL, passing the item along in ``meta``.

        Args:
            response: page response; ``response.meta`` must contain
                ``'item'`` (the item being filled) and ``'id'`` (a value
                convertible with ``int()``).

        Yields:
            A ``Request`` to ``comment_origin_url % int(id)`` whose
            callback is ``self.parse_comment_detail``.
        """
        data = response.body
        soup = BeautifulSoup(data, "html5lib")
        item = response.meta['item']
        item_id = response.meta['id']

        # The title is the first <h1> directly under <div id="name">.
        # 'error' stays as the placeholder when the div or the <h1> is
        # missing instead of crashing on a None tag.
        title = 'error'
        title_tag = soup.find('div', id="name")
        if title_tag is not None:
            for child in title_tag.children:
                if child is not None and child.name == u"h1":
                    # .string is None when the <h1> contains nested
                    # markup; fall back to its concatenated text.
                    title = child.string
                    if title is None:
                        title = child.get_text()
                    break
        jd_item = JDMMItem()
        jd_item['title'] = title.encode('utf-8')

        good_tag = soup.find('ul', attrs={'id': 'parameter2'})
        jd_item['product_name'] = Parse_Util.get_parse_value(
            good_tag, u'商品名称:')
        jd_item['good_detail'] = Parse_Util.make_up_dic(good_tag)

        jd_item['brand'] = 'None'
        ul_tag = soup.find('ul', id="parameter-brand")
        # The first <li> of the brand list carries both the "title"
        # attribute and the "<label>:<value>" text used below.
        li_tag = ul_tag.find('li') if ul_tag is not None else None
        if li_tag is not None:
            jd_item['brand'] = li_tag.get("title")

            brand_str = re.sub(r'\s+', '', li_tag.text)
            # Drop everything from the first heart symbol onwards; split()
            # returns the whole string when the symbol is absent.
            brand_str = brand_str.split(u'♥')[0]
            brand_str_list = brand_str.split(u':')
            # Only record the pair when a separator was actually present.
            if len(brand_str_list) > 1:
                jd_item['good_detail'][brand_str_list[0]] = brand_str_list[1]

        item['other_parameter'] = jd_item

        item_comment_link = comment_origin_url % (int(item_id))
        yield Request(item_comment_link,
                      callback=self.parse_comment_detail,
                      meta={'item': item})
    def parse_kaola_item(self, reponse):
        """Parse a Kaola product page and yield the populated item.

        A ``KaoLaMMItem`` is filled with the text of
        ``<dt class="product-title">`` as ``'title'`` and with values
        obtained from ``<ul class="goods_parameter">`` through
        ``Parse_Util`` (``'brand'``, ``'product_name'``,
        ``'good_detail'``). It is then stored under
        ``item['other_parameter']`` on the item taken from
        ``reponse.meta['item']``.

        Yields:
            The updated item.
        """
        soup = BeautifulSoup(reponse.body, "html5lib")
        item = reponse.meta['item']

        product = KaoLaMMItem()
        product['title'] = soup.find('dt', class_="product-title").text

        # All remaining fields are read from the same parameter list.
        parameter_list = soup.find('ul', class_='goods_parameter')
        product['brand'] = Parse_Util.get_parse_value(
            parameter_list, u'商品品牌:')
        product['product_name'] = Parse_Util.get_parse_value(
            parameter_list, u'品名:')
        product['good_detail'] = Parse_Util.make_up_dic(parameter_list)

        item['other_parameter'] = product
        yield item
예제 #4
0
    def parse_word_wide_item(self, response):
        """Parse a "global purchase" product page and request its comments.

        Fills a ``JDMMItem`` with:

        * ``'title'``: result of ``self.delete_node_content`` applied to
          ``<div id="name">`` with ``'span'`` (presumably the div's text
          with ``<span>`` content removed -- confirm in that helper);
        * ``'product_name'``, ``'brand'``, ``'good_detail'``: values that
          ``Parse_Util`` derives from ``<ul id="parameter2">``.

        The result is stored in ``item['other_parameter']`` and a
        ``Request`` for ``comment_origin_url % int(id)`` is yielded with
        the item passed along in ``meta``.

        Yields:
            A ``Request`` whose callback is ``self.parse_comment_detail``.
        """
        soup = BeautifulSoup(response.body, "html5lib")
        item = response.meta['item']
        item_id = response.meta['id']

        name_div = soup.find('div', id="name")

        product = JDMMItem()
        product['title'] = self.delete_node_content(name_div, 'span')

        # Product name, brand and the detail dict share one source list.
        parameter_list = soup.find('ul', id="parameter2")
        product['product_name'] = Parse_Util.get_parse_value(
            parameter_list, u'商品名称:')
        product['brand'] = Parse_Util.get_parse_value(
            parameter_list, u'品牌:')
        product['good_detail'] = Parse_Util.make_up_dic(parameter_list)
        item['other_parameter'] = product

        comment_url = comment_origin_url % int(item_id)
        yield Request(
            comment_url,
            callback=self.parse_comment_detail,
            meta={'item': item},
        )