def parse_js_item(self, response):
    """Parse the JS item response and merge its parameters into the item.

    Reads the price from the ``p.proPrice`` element, the ``p.proItem``
    paragraph and the ``div#skuInfo`` block, and merges the resulting
    dictionaries with the parameters already stored under
    ``response.meta['item']['other_parameter']``.

    Yields:
        The item taken from ``response.meta`` with ``other_parameter``
        replaced by the merged dictionary.

    Raises:
        AttributeError: if the price paragraph or its direct ``span``
            child is not present in the page.
    """
    page_text = response.body.decode('UTF-8')
    soup = BeautifulSoup(page_text, "html5lib")

    item = response.meta['item']
    parameter_dic = item['other_parameter']

    # Only a direct <span> child of the price paragraph holds the price.
    pro_price_tag = soup.find('p', class_='proPrice')
    price_span_tag = pro_price_tag.find('span', recursive=False)
    parameter_dic['price'] = price_span_tag.text

    pro_num_tag = soup.find('p', class_='proItem')
    item_no_dic = Parse_Util.structure_parameter_dic([pro_num_tag], u':')

    skuinfo_tag = soup.find('div', id='skuInfo')
    other_parameter_dic = Parse_Util.make_up_dic(skuinfo_tag)

    # Merge with explicit updates instead of dict(a, **b): keyword
    # expansion rejects non-string keys, while update() accepts any
    # hashable key.  Precedence is unchanged: the item-number entries
    # override the sku entries, which override the existing parameters.
    pro_all_parameter_dic = dict(parameter_dic)
    pro_all_parameter_dic.update(other_parameter_dic)
    pro_all_parameter_dic.update(item_no_dic)

    item['other_parameter'] = pro_all_parameter_dic
    yield item
def parse_jd_item(self, response):
    """Parse a regular JD item page and request its comment details.

    Builds a ``JDMMItem`` from the page (title, product name, parameter
    details and brand), stores it under ``item['other_parameter']`` and
    yields a ``Request`` for the comment URL built from
    ``response.meta['id']``.

    The title falls back to ``'error'`` when ``div#name`` has no direct
    ``h1`` child with a single string, and the brand falls back to the
    string ``'None'`` when ``ul#parameter-brand`` is absent.

    Yields:
        A ``Request`` whose callback is ``self.parse_comment_detail`` and
        whose ``meta`` carries the populated item.
    """
    soup = BeautifulSoup(response.body, "html5lib")
    item = response.meta['item']
    item_id = response.meta['id']

    # The title is the text of the first <h1> directly under div#name.
    title = 'error'
    title_tag = soup.find('div', id="name")
    if title_tag is not None:
        heading = title_tag.find('h1', recursive=False)
        # .string is None when the <h1> wraps more than one node.
        if heading is not None and heading.string is not None:
            title = heading.string

    jd_item = JDMMItem()
    jd_item['title'] = title.encode('utf-8')

    good_tag = soup.find('ul', attrs={'id': 'parameter2'})
    jd_item['product_name'] = Parse_Util.get_parse_value(
        good_tag, u'商品名称:')
    jd_item['good_detail'] = Parse_Util.make_up_dic(good_tag)

    jd_item['brand'] = 'None'
    ul_tag = soup.find('ul', id="parameter-brand")
    if ul_tag is not None:
        first_li = ul_tag.find('li')
        if first_li is not None:
            jd_item['brand'] = first_li.get("title")

            # Strip all whitespace, then drop anything from the first
            # heart marker onward.
            brand_str = re.sub(r'\s+', '', first_li.text)
            brand_str = brand_str.split(u'♥')[0]

            # Record "<label>:<value>" in the detail dict; skip the entry
            # when the text carries no colon.
            brand_str_list = brand_str.split(u':')
            if len(brand_str_list) > 1:
                jd_item['good_detail'][brand_str_list[0]] = brand_str_list[1]

    item['other_parameter'] = jd_item

    item_comment_link = comment_origin_url % (int(item_id))
    yield Request(item_comment_link,
                  callback=self.parse_comment_detail,
                  meta={'item': item})
def parse_kaola_item(self, reponse):
    """Parse a Kaola item page and attach the extracted data to the item.

    Fills a ``KaoLaMMItem`` with the text of ``dt.product-title`` and with
    the brand, product name and parameter details taken from
    ``ul.goods_parameter``, then stores it under ``item['other_parameter']``.

    Yields:
        The item from ``reponse.meta`` with ``other_parameter`` set to the
        populated ``KaoLaMMItem``.
    """
    page = BeautifulSoup(reponse.body, "html5lib")
    item = reponse.meta['item']

    details = KaoLaMMItem()
    details['title'] = page.find('dt', class_="product-title").text

    parameter_list = page.find('ul', class_='goods_parameter')
    details['brand'] = Parse_Util.get_parse_value(parameter_list, u'商品品牌:')
    details['product_name'] = Parse_Util.get_parse_value(
        parameter_list, u'品名:')
    details['good_detail'] = Parse_Util.make_up_dic(parameter_list)

    item['other_parameter'] = details
    yield item
def parse_word_wide_item(self, response):
    """Parse a worldwide-shopping item page and request its comment details.

    Fills a ``JDMMItem`` with the title (``div#name`` passed through
    ``self.delete_node_content`` with ``'span'``) and with the product
    name, brand and parameter details taken from ``ul#parameter2``, then
    stores it under ``item['other_parameter']``.

    Yields:
        A ``Request`` for the comment URL built from ``response.meta['id']``,
        with ``self.parse_comment_detail`` as callback and the populated
        item in ``meta``.
    """
    page = BeautifulSoup(response.body, "html5lib")
    item = response.meta['item']
    product_id = response.meta['id']

    name_block = page.find('div', id="name")
    details = JDMMItem()
    details['title'] = self.delete_node_content(name_block, 'span')

    parameter_list = page.find('ul', id="parameter2")
    details['product_name'] = Parse_Util.get_parse_value(
        parameter_list, u'商品名称:')
    details['brand'] = Parse_Util.get_parse_value(parameter_list, u'品牌:')
    details['good_detail'] = Parse_Util.make_up_dic(parameter_list)

    item['other_parameter'] = details

    comment_url = comment_origin_url % (int(product_id))
    yield Request(comment_url,
                  callback=self.parse_comment_detail,
                  meta={'item': item})