def parse_lefeng_item(self, response):
    """Parse a Lefeng product detail page.

    Fills the item's 'other_parameter' dict with the cleaned title, the
    spec table rows, the market price and the promotion price, then
    yields the enriched item.

    Fixes: parameter renamed from misspelled 'reponse' (Scrapy passes
    the response positionally, so the rename is caller-safe); leftover
    debug print removed; 'is not None' instead of '!= None'; local
    'origin_pirce_tag' spelling corrected.
    """
    soup = BeautifulSoup(response.body, "html5lib")
    item = response.meta['item']
    pro_parameter_dic = {}
    title_div_tag = soup.find('div', class_="bigProduct-c")
    title_tag = title_div_tag.find('h1')
    # Remove the direct-child <i> badge so it does not pollute the title text.
    title_i_tag = title_tag.find('i', recursive=False)
    if title_i_tag is not None:
        title_i_tag.extract()
    pro_parameter_dic['title'] = Parse_Util.get_no_space_string(
        title_tag.text)
    detail_info_tag = soup.find('table', class_='detail-info-table')
    detail_tbody_tag = detail_info_tag.find('tbody', recursive=False)
    detail_tags = detail_tbody_tag.find_all('tr')
    # Each spec row is "key：value"; the helper splits on the wide colon.
    pro_detail_parameter_dic = Parse_Util.structure_parameter_dic(
        detail_tags, u'：')
    pro_parameter_dic = dict(pro_parameter_dic, **pro_detail_parameter_dic)
    # NOTE: the trailing space in 'dity-price-c ' matches the site's markup.
    price_c_tag = soup.find('div', class_='dity-price-c ')
    price_tag = price_c_tag.find('strong')
    origin_price_tag = price_c_tag.find('b', class_='marketPrice-s')
    pro_parameter_dic['price'] = origin_price_tag.text.strip(u'¥ ')
    pro_parameter_dic['promotion_price'] = price_tag.text
    item['other_parameter'] = pro_parameter_dic
    yield item
def parse_js_item(self, response):
    """Parse the sku JS fragment and merge its data into the item.

    Reads price, item number and sku attributes from the fragment and
    stores the merged dict back into item['other_parameter'].
    """
    soup = BeautifulSoup(response.body.decode('UTF-8'), "html5lib")
    item = response.meta['item']
    parameter_dic = item['other_parameter']
    # Price lives in the direct-child <span> of the proPrice paragraph.
    price_holder = soup.find('p', class_='proPrice')
    parameter_dic['price'] = price_holder.find('span', recursive=False).text
    # Item number row is "key：value"; helper splits on the wide colon.
    item_no_tag = soup.find('p', class_='proItem')
    item_no_dic = Parse_Util.structure_parameter_dic([item_no_tag], u'：')
    sku_info_tag = soup.find('div', id='skuInfo')
    sku_attr_dic = Parse_Util.make_up_dic(sku_info_tag)
    # Merge order matches the original: sku attributes, then item number.
    merged_dic = dict(parameter_dic, **sku_attr_dic)
    merged_dic = dict(merged_dic, **item_no_dic)
    item['other_parameter'] = merged_dic
    yield item
def parse_sephora_item(self, response):
    """Parse a Sephora product page and chain to the JS price request.

    Collects name, English name, title, brand and sku id, stores them in
    item['other_parameter'] and yields a Request for the sku JS data.
    """
    soup = BeautifulSoup(response.body, "html5lib")
    item = response.meta['item']
    parameter_dic = {}
    parameter_dic['product_name'] = soup.find('input', id='productNa')['value']
    en_name_tag = soup.find('p', id='enName')
    parameter_dic['ename'] = Parse_Util.get_no_space_string(en_name_tag.text)
    info_tag = soup.find('div', class_='popProDet proDetInfo floatR')
    title_tag = info_tag.find('h1', recursive=False)
    parameter_dic['title'] = Parse_Util.get_no_space_string(title_tag.text)
    brand_link_tag = soup.find('a', class_='proBrandImg')
    parameter_dic['brand'] = brand_link_tag.find('img')['alt']
    parameter_dic['sku_id'] = soup.find('input', id='mySelCurrentSKUID')['value']
    item['other_parameter'] = parameter_dic
    # The JS endpoint wants numeric sku id and item id.
    js_url = js_origin_url % (int(parameter_dic['sku_id']), int(item['id']))
    yield Request(js_url, callback=self.parse_js_item, meta={'item': item})
def parse_detail_item(self, response):
    """Parse a Tmall sku detail page: title, attribute list, and the
    price/stock for this item's sku from the inline TShop.Setup JSON,
    then chain to the comment request."""
    data = response.body
    soup = BeautifulSoup(data, "html5lib")
    item = response.meta['item']
    sku_id = item['id']
    pro_parameter_dic = item['other_parameter']
    pro_title_tag = soup.find('div', class_='tb-detail-hd').find('h1',
                                                                 recursive=False)
    pro_parameter_dic['title'] = Parse_Util.get_no_space_string(
        pro_title_tag.string)
    pro_detail_tags = soup.find('ul', id='J_AttrUL').find_all('li')
    for pro_detail_tag in pro_detail_tags:
        detail_no_space_text = Parse_Util.get_no_space_string(
            pro_detail_tag.text)
        # Attributes come as "key:value" with either of two colon
        # variants; the key is taken from the split, the full value
        # from the <li> 'title' attribute.
        if string.find(detail_no_space_text, u":") != -1:
            pro_dic_array = detail_no_space_text.split(u":")
        if string.find(detail_no_space_text, u':') != -1:
            pro_dic_array = detail_no_space_text.split(u':')
        # NOTE(review): if a <li> contains neither colon variant,
        # pro_dic_array is stale (or unbound on the first iteration) —
        # confirm every attribute row carries a colon.
        pro_parameter_dic[pro_dic_array[0]] = pro_detail_tag['title']
    J_DetailMeta_tag = soup.find('div', id='J_DetailMeta')
    tm_clear_tag = J_DetailMeta_tag.find('div', class_='tm-clear',
                                         recursive=False)
    # The sku JSON is embedded in the last <script> of this container.
    sku_script_tag = tm_clear_tag.find_all('script')[-1]
    m = re.search(r'[\s\S]*TShop.Setup\(([\s\S]*)( \); }\)\(\);)',
                  Parse_Util.get_no_space_string(sku_script_tag.text))
    sku_dic = json.loads(m.group(1))
    if sku_dic.has_key('valItemInfo'):
        item_info = sku_dic['valItemInfo']
        sku_map = item_info['skuMap']
        sku_list = item_info['skuList']
        print 'sku-list ---------- %s' % sku_list
        # NOTE(review): the loop variable shadows the outer sku_dic
        # parsed above; harmless here only because the outer value is
        # not used again.
        for sku_dic in sku_list:
            if sku_dic['skuId'] == sku_id:
                pro_parameter_dic['specification'] = sku_dic['names']
                pro_parameter_dic['pvs'] = sku_dic['pvs']
                # skuMap keys have the form ";pvs;".
                pvs_key = ';%s;' % sku_dic['pvs']
                pro_parameter_dic['price'] = sku_map[pvs_key]['price']
                pro_parameter_dic['stock'] = sku_map[pvs_key]['stock']
    else:
        print 'item stock specification null'
    comment_url = comment_origin_url % pro_parameter_dic['item_id']
    yield FormRequest(comment_url, meta={'item': item}, headers=headers,
                      cookies=cookies, callback=self.parse_comment_item)
def parse_sasa_item(self, response):
    """Parse a SaSa product page (flash-sale or regular layout) and
    chain to the price request."""
    data = response.body
    soup = BeautifulSoup(data, "html5lib")
    item = response.meta['item']
    pro_parameter_dic = {}
    # Presence of the countdown span marks a flash-sale page, which
    # uses a different title container.
    start_end_tag = soup.find('span', class_='now_start_end_time_msg')
    if start_end_tag is not None:
        pro_title_tag = soup.find('div', class_='product-line')
        b_tag = pro_title_tag.find('b', class_='yew bonded_words_show')
        pro_info_tag = pro_title_tag.find('div', id='product_information')
        # Strip badge and info sub-nodes before reading the title text.
        if b_tag is not None:
            b_tag.extract()
        if pro_info_tag is not None:
            pro_info_tag.extract()
        pro_title_text = Parse_Util.get_no_space_string(pro_title_tag.text)
    else:
        pro_title_tag = soup.find('div', class_='product-titles')
        b_title_tag = pro_title_tag.find('b', recursive=False)
        if b_title_tag is not None:
            b_title_tag.extract()
        pro_title_text = Parse_Util.get_no_space_string(pro_title_tag.text)
    pro_parameter_dic['title'] = pro_title_text
    # Attribute list: <span> holds the key (trailing wide colon
    # removed), the attributes-cont div holds the value.
    pro_attributes_tag = soup.find('div', class_='product-attributes mod')
    pro_clearfix_tag = pro_attributes_tag.find('ul', class_='clearfix',
                                               recursive=False)
    pro_li_tags = pro_clearfix_tag.find_all('li', recursive=False)
    for pro_li_tag in pro_li_tags:
        pro_detail_key = Parse_Util.get_no_space_string(
            pro_li_tag.find('span').string.replace(u'：', ''))
        pro_detail_value = Parse_Util.get_no_space_string(
            pro_li_tag.find('div', class_='attributes-cont').string)
        pro_parameter_dic[pro_detail_key] = pro_detail_value
    good_id = soup.find('input', attrs={'name': 'goods[goods_id]'})['value']
    pro_parameter_dic['good_id'] = good_id
    item['other_parameter'] = pro_parameter_dic
    # Flash-sale items use the seckill price endpoint; regular items
    # use the normal price endpoint keyed by product_id.
    seckill_price_tag = soup.find('input',
                                  attrs={'name': 'goods[seckill_id]'})
    if seckill_price_tag is not None:
        pro_price_link = seckill_price_origin_url % seckill_price_tag[
            'value']
    else:
        pro_id = soup.find('input',
                           attrs={'name': 'goods[product_id]'})['value']
        pro_price_link = price_origin_url % pro_id
    yield Request(pro_price_link, callback=self.parse_price_item,
                  meta={'item': item})
def parse_lizi_item(self, response):
    """Parse a Lizi product page: title, spec table, and the price id
    scraped from an inline script, then chain to the price request."""
    data = response.body
    soup = BeautifulSoup(data, "html5lib")
    item = response.meta['item']
    pro_parameter_dic = {}
    title_dt_tag = soup.find('dt', class_="product_name")
    title_tag = title_dt_tag.find('h1', recursive=False)
    pro_parameter_dic['title'] = Parse_Util.get_no_space_string(
        title_tag.text)
    detail_info_tag = soup.find('td', class_='op')
    detail_tbody_tag = detail_info_tag.find('tbody')
    detail_tags = detail_tbody_tag.find_all('tr')
    for tr_tag in detail_tags:
        # Collapse all whitespace runs in the row text to single spaces.
        no_space_string = tr_tag.text.replace("\t", " ").replace(
            "\n", " ").replace("\r", " ").strip()
        no_space_string = " ".join(no_space_string.split())
        # Keep only "key：value" rows whose value part is non-empty.
        if string.find(no_space_string,
                       u'：') != -1 and no_space_string[-1] != u'：':
            parameterList = no_space_string.split(u'：')
            pro_parameter_dic[parameterList[0]] = parameterList[1]
    item['other_parameter'] = pro_parameter_dic
    # The numeric price id sits in the 4th <script> of the page, in an
    # "id: '...'," fragment — NOTE(review): index 3 is layout-dependent;
    # confirm against the live page if parsing breaks.
    script_tags = soup.find_all('script')
    script_id_tag = script_tags[3]
    origin_data = Parse_Util.get_no_space_string(script_id_tag.text)
    p = re.match(r'([\s\S]*id:)([\s\S]*)(, cover[\s\S]*)', origin_data)
    # Strip the surrounding quotes and whitespace from the captured id.
    price_id_str = p.group(2).replace('\'', " ")
    price_id_str = " ".join(price_id_str.split())
    price_link = price_origin_url + str(price_id_str)
    yield Request(price_link, callback=self.parse_price_item,
                  meta={'item': item})
def parse_yhd_item(self, reponse):
    """Parse a YHD (1号店) product page.

    Distinguishes self-operated listings (source=1, brand/product name
    from the spec list with several label fallbacks) from marketplace
    listings (source=0, brand from the description <dl>).
    """
    data = reponse.body
    soup = BeautifulSoup(data, "html5lib")
    item = reponse.meta['item']
    title_tag = soup.find('h1', id="productMainName")
    item['title'] = title_tag.text
    is_proprietary_trading = False
    source_tag = soup.find('p', attrs={'class': 'add_02'})
    # Self-operated pages carry this fixed notice in the source line.
    if source_tag.text.strip().startswith(u'本商品由1号店自营提供'):
        is_proprietary_trading = True
    else:
        pass
    if is_proprietary_trading:
        ul_tag = soup.find('ul', attrs={'class': 'ull'})
        # Brand label varies between pages; try each known label, then
        # fall back to the description block.
        brand_name = Parse_Util.get_parse_value(ul_tag, u'【产品品牌】：')
        if brand_name == 'None':
            brand_name = Parse_Util.get_parse_value(ul_tag, u'【品牌名称】：')
        if brand_name == 'None':
            dl_tag = soup.find('dl', attrs={'class': 'des_info clearfix'})
            brand_name = self.get_brand(dl_tag)
        # Product-name label also varies; same fallback chain.
        product_name = Parse_Util.get_parse_value(ul_tag, u'【产品名称】：')
        if product_name == 'None':
            product_name = Parse_Util.get_parse_value(ul_tag, u'【商品名称】：')
        if product_name == 'None':
            product_name = Parse_Util.get_parse_value(ul_tag, u'【名称】：')
        item['brand'] = brand_name
        item['product_name'] = product_name
        item['source'] = 1
    else:
        good_tag = soup.find('dl', class_="des_info clearfix")
        item['brand'] = self.get_brand(good_tag)
        item['source'] = 0
    yield item
def parse_letian_item(self, response):
    """Parse a LeTian (Lotte) product page: brand from an inline script,
    title from a meta tag, the spec table, comment count and the two
    rating counters, then chain to the price request."""
    data = response.body
    soup = BeautifulSoup(data, "html5lib")
    item = response.meta['item']
    pro_parameter_dic = {}
    wrap_tag = soup.find('div', id='wrap')
    script_tags = wrap_tag.find_all('script', type='text/javascript')
    # NOTE(review): hard-coded script index 21 is layout-dependent —
    # confirm against the live page if brand extraction breaks.
    pro_script_text = Parse_Util.get_no_space_string(script_tags[21].text)
    re_brand_object = re.search(
        r'brandNmTemp = \'([\s\S]*)\'; brandNmTemp', pro_script_text)
    pro_parameter_dic['brand'] = re_brand_object.group(1)
    pro_title_tag = soup.find('meta', property='rb:itemName')
    pro_parameter_dic['title'] = pro_title_tag['content']
    pro_table_tag = soup.find('table', summary=u'产品详细信息')
    pro_tbody_tag = pro_table_tag.find('tbody')
    pro_info_tags = pro_tbody_tag.find_all('tr', recursive=False)
    for pro_info_tag in pro_info_tags:
        dic_key = pro_info_tag.find('th').string
        dic_value = Parse_Util.get_no_space_string(
            pro_info_tag.find('td').text)
        pro_parameter_dic[dic_key] = dic_value
    # The tab label text is "<count>条"; strip the counter suffix.
    pro_parameter_dic['comment_count'] = soup.find(
        'div', id='tabmenuT').string.replace(u'条', '')
    # Two rating counters: key from the icon's alt text, value from the
    # matching <dd>.
    help_tag = soup.find('div', class_='help')
    dl_tag = help_tag.find('dl', recursive=False)
    t01_num_key = dl_tag.find('dt', class_='t01').find('img')['alt']
    t01_num_value = dl_tag.find('dd', class_='r01').string
    pro_parameter_dic[t01_num_key] = t01_num_value
    t02_num_key = dl_tag.find('dt', class_='t02').find('img')['alt']
    t02_num_value = dl_tag.find('dd', class_='r02').string
    pro_parameter_dic[t02_num_key] = t02_num_value
    item['other_parameter'] = pro_parameter_dic
    pro_price_link = price_origin_url % item['id']
    yield Request(pro_price_link, callback=self.parse_price_item,
                  meta={'item': item})
def parse_jd_item(self, response):
    """Parse a regular JD product page into a JDMMItem and chain to the
    comment request."""
    data = response.body
    soup = BeautifulSoup(data, "html5lib")
    item = response.meta['item']
    item_id = response.meta['id']
    title_tag = soup.find('div', id="name")
    title = 'error'
    # The title is the first real <h1> child; skip text nodes (no .name).
    for child in title_tag.children:
        if child is None:
            continue
        if child.name is None:
            continue
        if child.name == u"h1":
            title = child.string
            break
    jd_item = JDMMItem()
    jd_item['title'] = title.encode('utf-8')
    good_tag = soup.find('ul', attrs={'id': 'parameter2'})
    jd_item['product_name'] = Parse_Util.get_parse_value(
        good_tag, u'商品名称：')
    jd_item['good_detail'] = Parse_Util.make_up_dic(good_tag)
    ul_tag = soup.find('ul', id="parameter-brand")
    jd_item['brand'] = 'None'
    if ul_tag != None:
        jd_item['brand'] = ul_tag.find('li').get("title")
        li_tags = ul_tag.find_all('li')
        li_tag = li_tags[0]
        # Collapse all whitespace in the brand row.
        p = re.compile('\s+')
        brand_str = re.sub(p, '', li_tag.text)
        # Drop the "follow" suffix that JD appends after a ♥ marker.
        if string.find(brand_str, u'♥') != -1:
            list_str = brand_str.split(u'♥')
            brand_str = list_str[0]
        # Row is "label：brand"; record it in the detail dict as well.
        brand_str_list = brand_str.split(u'：')
        jd_item['good_detail'][brand_str_list[0]] = brand_str_list[1]
    item['other_parameter'] = jd_item
    item_comment_link = comment_origin_url % (int(item_id))
    yield Request(item_comment_link, callback=self.parse_comment_detail,
                  meta={'item': item})
def parse_memebox_item(self, response):
    """Parse a MeMeBox product page: title, review count, spec table and
    product id, then chain to the price request."""
    data = response.body
    soup = BeautifulSoup(data, "html5lib")
    item = response.meta['item']
    pro_parameter_dic = {}
    pro_name_tag = soup.find('div', class_='product-name')
    pro_title_tag = pro_name_tag.find('span')
    pro_parameter_dic['title'] = pro_title_tag.text
    review_tag = soup.find('a', id='goto-reviews')
    if review_tag != None:
        # Python 2 idiom: filter on a byte string keeps only the digit
        # characters of the review-link text (e.g. "12 reviews" -> "12").
        pro_parameter_dic['comment_count'] = filter(
            str.isdigit, review_tag.text.encode("utf-8"))
    detail_table_tag = soup.find('table',
                                 id='product-attribute-specs-table')
    detail_tbody_tag = detail_table_tag.find('tbody', recursive=False)
    detail_tr_tags = detail_tbody_tag.find_all('tr', recursive=False)
    for detail_tr_tag in detail_tr_tags:
        dic_key = Parse_Util.get_no_space_string(
            detail_tr_tag.find('th', class_='label').string)
        dic_value = Parse_Util.get_no_space_string(
            detail_tr_tag.find('td', class_='data').string)
        pro_parameter_dic[dic_key] = dic_value
    item['other_parameter'] = pro_parameter_dic
    # The numeric product id lives in a hidden input block.
    pro_nodisplay_tag = soup.find('div', class_='no-display')
    pro_id_tag = pro_nodisplay_tag.find('input',
                                        attrs={'name': 'productId'})
    pro_id = pro_id_tag['value']
    pro_price_link = price_origin_url % str(pro_id)
    yield Request(pro_price_link, callback=self.parse_price_item,
                  meta={'item': item})
def parse_kaola_item(self, response):
    """Parse a Kaola product page into a KaoLaMMItem.

    Extracts title, brand, product name and the full spec dict, stores
    the KaoLaMMItem under item['other_parameter'] and yields the item.

    Fix: parameter renamed from misspelled 'reponse' for consistency
    with the other callbacks; Scrapy invokes callbacks with the
    response positionally, so the rename is caller-safe.
    """
    soup = BeautifulSoup(response.body, "html5lib")
    item = response.meta['item']
    kaola_item = KaoLaMMItem()
    title_tag = soup.find('dt', class_="product-title")
    kaola_item['title'] = title_tag.text
    goods_tag = soup.find('ul', class_='goods_parameter')
    kaola_item['brand'] = Parse_Util.get_parse_value(goods_tag, u'商品品牌：')
    kaola_item['product_name'] = Parse_Util.get_parse_value(
        goods_tag, u'品名：')
    kaola_item['good_detail'] = Parse_Util.make_up_dic(goods_tag)
    item['other_parameter'] = kaola_item
    yield item
def parser_price_detail(self, response):
    """Parse the JD price JSONP payload and attach the price to the item.

    The first element of the returned array carries the price under 'p'.
    """
    item = response.meta['item']
    detail_dic = item['other_parameter']
    raw_body = response.body.decode('UTF-8')
    payload = json.loads(Parse_Util.get_json_str(raw_body), 'UTF-8')
    detail_dic['price'] = payload[0]['p']
    yield item
def parse_word_wide_item(self, response):
    """Parse a JD worldwide (全球购) product page into a JDMMItem and
    chain to the comment request."""
    soup = BeautifulSoup(response.body, "html5lib")
    item = response.meta['item']
    item_id = response.meta['id']
    jd_item = JDMMItem()
    # Title needs the embedded <span> stripped first.
    name_tag = soup.find('div', id="name")
    jd_item['title'] = self.delete_node_content(name_tag, 'span')
    spec_tag = soup.find('ul', id="parameter2")
    jd_item['product_name'] = Parse_Util.get_parse_value(
        spec_tag, u'商品名称：')
    jd_item['brand'] = Parse_Util.get_parse_value(spec_tag, u'品牌：')
    jd_item['good_detail'] = Parse_Util.make_up_dic(spec_tag)
    item['other_parameter'] = jd_item
    comment_link = comment_origin_url % (int(item_id))
    yield Request(comment_link, callback=self.parse_comment_detail,
                  meta={'item': item})
def parse_suning_item(self, response):
    """Parse a Suning product page: title and the parameter table (with
    a fallback table id), then build the price URL from the compound
    item id and chain to the price request."""
    data = response.body
    soup = BeautifulSoup(data, "html5lib")
    item = response.meta['item']
    pro_parameter_dic = {}
    # Remove the self-operated badge so it is not part of the title.
    zy_tag = soup.html.find('span', id='itemNameZy')
    if zy_tag != None:
        zy_tag.extract()
    title_tag = soup.find('h1', id="itemDisplayName")
    pro_parameter_dic['title'] = Parse_Util.get_no_space_string(
        title_tag.text)
    # Drop the packing-list table so its rows are not swept up below.
    bgqd_tag = soup.find('table', id='bzqd_tag')
    if bgqd_tag != None:
        bgqd_tag.extract()
    table_tag = soup.find('table', id='itemParameter')
    if table_tag == None:
        print 'sssssssss -------- tabletag is None'
        # Fallback layout uses a different table id.
        table_tag = soup.html.find('table', id='pro-para-tbl')
        print 'pro-para-tbl ----- %s' % table_tag
    body_tag = table_tag.find('tbody')
    tr_tags = body_tag.find_all('tr')
    # Destructively remove every <th> in the document so each remaining
    # row yields exactly key (index 0) and value (index 1) strings.
    for th_tag in soup.html.find_all('th'):
        th_tag.extract()
    for tr_tag in tr_tags:
        item_dic_key = 'key'
        item_dic_value = 'value'
        for i, tr_tag_str in enumerate(tr_tag.stripped_strings):
            if i == 0:
                item_dic_key = tr_tag_str
            if i == 1:
                item_dic_value = tr_tag_str
        pro_parameter_dic[item_dic_key] = item_dic_value
    # NOTE(review): the sentinel entry only exists if some row had no
    # strings at all; otherwise this del raises KeyError — confirm.
    del pro_parameter_dic['key']
    item['other_parameter'] = pro_parameter_dic
    # item['id'] has the form "vendorCode/partNumber".
    p = re.match(r'(\d+)(/)(\d+)', str(item['id']))
    vendorCode = p.group(1)
    partNumber = p.group(3)
    price_url = price_origin_url % (partNumber, partNumber, vendorCode)
    yield Request(price_url, callback=self.parse_price_item,
                  meta={'item': item})
def parse_price_item(self, response):
    """Parse the Tmall price JSONP ("setMdskip(...)") and fan out one
    CommonItem (with price, promotion price and monthly sales) per sku,
    chaining each to the sku detail page."""
    item_id = response.meta['item_id']
    origin_data = response.body
    decode_data = Parse_Util.get_no_space_string(origin_data.decode('GBK'))
    # Strip the JSONP wrapper; group(3) is the inner JSON object.
    m = re.match(r'((setMdskip \()([\s\S]*)\))', decode_data)
    py_obj = json.loads(m.group(3))
    price_info_dic = py_obj['defaultModel']['itemPriceResultDO'][
        'priceInfo']
    price_sell_count_dic = py_obj['defaultModel']['sellCountDO']
    # priceInfo is keyed by sku id; one item is emitted per key.
    sku_ids = price_info_dic.keys()
    for sku_id in sku_ids:
        item = CommonItem()
        pro_parameter_dic = {}
        pro_parameter_dic['item_id'] = item_id
        # Monthly sales figure is shared by all skus of the item.
        pro_parameter_dic[u'月销量'] = price_sell_count_dic['sellCount']
        item['source'] = 'tmall.com'
        item['domain'] = 'cosmetics'
        item['classify'] = 'mask'
        item['subclass'] = 'mask'
        item['id'] = sku_id
        sku_dic = price_info_dic[sku_id]
        pro_parameter_dic['price'] = sku_dic['price']
        if sku_dic.has_key('promotionList'):
            promotion_price_dic = sku_dic['promotionList'][0]
            pro_parameter_dic['promotion_price'] = promotion_price_dic[
                'price']
        # 'def' is Tmall's default/fallback sku; reuse the item id for it.
        if sku_id == 'def':
            item['id'] = item_id
        sku_parameter = "&skuId=%s" % sku_id
        sku_detail_url = detail_origin_url % item_id
        sku_link = sku_detail_url + sku_parameter
        item['url'] = sku_link
        item['other_parameter'] = pro_parameter_dic
        yield FormRequest(sku_link, meta={'item': item}, headers=headers,
                          cookies=cookies,
                          callback=self.parse_detail_item)
def parse_comment_detail(self, response):
    """Parse the JD comment-summary JSONP, copy its counters into the
    item's detail dict, then schedule the price request."""
    item = response.meta['item']
    detail_dic = item['other_parameter']
    body_text = response.body.decode('GBK')
    data = json.loads(Parse_Util.get_json_str(body_text), 'UTF-8')
    summary = data['productCommentSummary']
    # Map our field names onto JD's summary keys.
    for dest_key, src_key in (('comment_count', 'commentCount'),
                              ('good_count', 'goodCount'),
                              ('general_count', 'generalCount'),
                              ('bad_count', 'score1Count'),
                              ('good_rate', 'goodRate')):
        detail_dic[dest_key] = summary[src_key]
    price_link = price_origin_url % (int(item['id']))
    yield Request(price_link, callback=self.parser_price_detail,
                  meta={'item': item})