def getBaseJson(self):
    """Download the base JSON datasets (heroes, items, summoner skills,
    inscriptions) for 王者荣耀, attach the local topic id to every hero,
    and persist each dataset to its own json file via ``_saveJson``.

    On a failed download the error is logged and the method returns
    without touching ``self.heros`` or writing any file.
    """
    try:
        heros = self.req.get_html(self.hero_js, is_json=True)
        items = self.req.get_html(self.item_js, is_json=True)
        summoners = self.req.get_html(self.summoner_js, is_json=True)
        mings = self.req.get_html(self.ming_js, is_json=True)
    except Exception as e:
        # BUG FIX: '…' + e concatenated str with an Exception (TypeError);
        # also bail out here — the locals above are unbound on failure and
        # the code below would die with NameError.
        logger.error('王者荣耀 Base Json Fetche error:{0}'.format(e))
        return

    self.heros = []
    # Map each hero's ename to the local topic id (0 when unmapped).
    for hero in heros:
        hero_id = 0
        ms_hero = self.session.query(Hero).filter(
            Hero.ename == hero['ename']).first()
        if ms_hero is not None:
            hero_id = ms_hero.topicid
        hero['id'] = hero_id
        self.heros.append(hero)

    # Persist every dataset as a json file.
    self._saveJson(json.dumps(self.heros), 'hero.json')
    self._saveJson(json.dumps(items), 'item.json')
    self._saveJson(json.dumps(summoners), 'summoner.json')
    self._saveJson(json.dumps(mings), 'ming.json')
def qiniu_fetch_file(self, url, key):
    """Ask qiniu to fetch the remote *url* into ``self.bucket_name``
    under *key*.

    Returns True when qiniu echoes back the requested key (success),
    False on mismatch or any error (logged, never raised).
    """
    try:
        ret, info = self.__bucket.fetch(url, self.bucket_name, key)
        # A successful fetch returns a dict whose 'key' matches ours;
        # return the condition directly instead of if/return True/False.
        return bool(ret) and str(ret['key']) == key
    except Exception as e:
        logger.error("qiniu远程fetch文件出错,info:{0}".format(e))
        return False
def fetchSingleGoodInfo(self, good_url):
    """Fetch one JD product page and scrape its metadata.

    Returns a 5-tuple ``(good_id, good_desc, good_pics, seo_keyword,
    seo_desc)``: the numeric id parsed from the url, the parameter-table
    HTML, the list of high-resolution carousel image urls, and the two
    SEO meta values.  All values fall back to empty defaults when the
    page is empty or parsing fails.
    """
    self.__sleep()
    good_id = ''
    good_desc = ''
    good_pics = []   # BUG FIX: was {}, but the parser builds a list
    seo_keyword = '' # BUG FIX: previously unbound (NameError at return)
    seo_desc = ''    # when html was empty or parsing failed early
    html = self.req.get_html(good_url, is_json=False)
    if html.strip() != '':
        try:
            soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
            # SEO keywords meta tag
            meta_keyword = soup.find('meta', attrs={'name': 'keywords'})
            seo_keyword = meta_keyword['content']
            # SEO description meta tag
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            # BUG FIX: was reading good_keyword['content'] (copy/paste),
            # so seo_desc always duplicated the keywords.
            seo_desc = meta_desc['content']
            # carousel image list
            lis = soup.find(id='spec-list').find_all('li')
            good_pics = []
            for li in lis:
                img_src = li.img['src']
                # swap the thumbnails for their high-resolution variants
                img_src = img_src.replace('s54x54_jfs', 's450x450_jfs')
                img_src = img_src.replace('s75x75_jfs', 's450x450_jfs')
                img_src = img_src.replace('n5/jfs', 'n1/jfs')
                good_pics.append('http:{0}'.format(img_src))
            # product description: the parameter table's raw HTML
            good_desc = str(
                soup.find(class_='parameter2 p-parameter-list'))
            # product id: the trailing digits of .../<id>.html
            match_obj = re.search(r'http:(.*)\/(\d+)\.html',
                                  str(good_url), re.M | re.I)
            if match_obj:
                good_id = match_obj.group(2)
            # price is handled separately upstream
            # good_price = self.fetchGoodPrice(good_url)
        except Exception as e:
            logger.error('parse Html error: {0}'.format(e))
    return good_id, good_desc, good_pics, seo_keyword, seo_desc
def fetchJdGoods(self):
    """Crawl the JD goods listing pages, scrape every product's detail
    page, persist each good to mysql, and push its carousel images plus
    db id onto a redis list for the later download/update step.

    Stops early after more than 5 consecutive empty listing responses.
    """
    start_page = 1
    max_page = 2
    good_count = 0
    error_time = 0
    for page in range(start_page, max_page):
        goods_list = []
        # be polite: pause one second between listing pages
        time.sleep(1)
        list_url = self.start_url + '&page={0}'.format(page)
        logger.info(
            "###########list Url: {0} ###############".format(list_url))
        html = self.req.get_html(list_url, is_json=False)
        if html.strip() != '':
            try:
                soup = BeautifulSoup(html, 'html.parser',
                                     from_encoding='utf-8')
                items = soup.find_all(class_='gl-item')
                for good in items:
                    # BUG FIX: dropped the dead good_thumb assignment here —
                    # it was unconditionally overwritten below.
                    good_info = {}
                    good_info['name'] = good.find(
                        class_='p-name').a.em.string
                    good_info['link'] = 'http:{0}'.format(
                        good.find(class_='p-img').a['href'])
                    goods_list.append(good_info)
            except Exception as e:
                logger.error('parse Html error: {0}'.format(e))
        else:
            error_time += 1
            logger.error(
                'Failed Get Return Message, error_time:{0}, res:{1}'.
                format(error_time, html))
            if error_time > 5:
                break
        # process every good collected from this listing page
        for good in goods_list:
            good_count += 1
            good_link = good['link']
            good_name = good['name']
            logger.info('good_count: {0}, good_link: {1}'.format(
                good_count, good_link))
            good_id, good_desc, good_pics, seo_keywords, seo_desc = \
                self.fetchSingleGoodInfo(good_link)
            # the first carousel image doubles as the thumbnail
            good_thumb = good_pics[0] if good_pics else ''
            # persist the good itself to mysql
            now = int(time.time())
            shopping_good = Shopping_Goods(catid=self.goods_cat,
                                           pid=self.brand,
                                           title=good_name,
                                           content=good_desc,
                                           desc=good_link,
                                           price=0,
                                           seo_title=good_name,
                                           seo_keywords=seo_keywords,
                                           seo_desc=seo_desc,
                                           thumb=good_thumb,
                                           inputtime=now)
            self.session.add(shopping_good)
            self.session.commit()
            # queue carousel pics and price info in redis for the
            # download-and-update step
            self.redis.lpush(
                self.redis_list,
                json.dumps({
                    good_id: {
                        'thumb': good_thumb,
                        'pics': good_pics,
                        'id': shopping_good.id
                    }
                }))
for pic_url in data: new_key = self.generate_pic_key() self.qiniu_fetch_file(pic_url, new_key) logger.info('new_pic_url: {0}{1}'.format( 'http://static.shenyou.tv/', new_key)) new_pics.append('{0}{1}'.format('http://static.shenyou.tv/', new_key)) return new_pics def fetchJD(brand, category, url, process_step='fetch'): good_fetch = GoodsFetcher(brand, category, url) if process_step == 'fetch': good_fetch.fetchJdGoods() else: good_fetch.updateGoodsExtraInfo() # 四个参数 # 1 方法类型,fetch | process # 2 商品列表地址, url # 3 商品分类,category # 4 品牌名称, if __name__ == '__main__': if len(sys.argv) != 5: logger.error('参数输入有误') process_step = sys.argv[1] brand = sys.argv[2] category = sys.argv[3] url = sys.argv[4] fetchJD(brand, category, url, process_step)
def _processSingleHero(self, url, hero_id):
    """Scrape one hero detail page (gbk-encoded) and persist the parsed
    data as ``<hero_id>.json``: skills, related heroes, recommended
    equipment, inscriptions, ability bars, suggested skill combo,
    summoner skills and skins.  Errors are logged, never raised.
    """
    hero_res = {}
    html = self.req.get_html(url, is_json=False)
    if html.strip() != '':
        try:
            soup = BeautifulSoup(html, 'html.parser', from_encoding='gbk')
            # background story paragraph (currently unused — kept as in
            # the original; the desc line below was already disabled)
            beijing = soup.find(class_='story-info info').find(
                class_='nr').p
            hero_res['id'] = hero_id
            # hero_res['desc'] = beijing.text
            skills = soup.find(class_='skill-show').find_all(
                class_='show-list')
            skill_imgs = soup.find(class_='skill-u1').find_all('li')
            # assemble hero skills, pairing each entry with its icon
            skill_arr = []
            for idx, skill in enumerate(skills):
                tmp = {}
                tmp['name'] = skill.a.text
                tmp['p1'] = skill.find(class_='skill-p1').text
                tmp['p2'] = skill.find(class_='skill-p2').text
                tmp['p3'] = skill.find(class_='skill-p3').text
                tmp['img'] = skill_imgs[idx].img['src']
                skill_arr.append(tmp)
            hero_res['skills'] = skill_arr
            # assemble related heroes: for each relate-list append the raw
            # '|'-separated ename ids, then the locally mapped topic ids
            rel_hero_arr = []
            rel_hreos = soup.find_all(
                class_='hero-list hero-relate-list fl')
            for hero in rel_hreos:
                str_ids = hero['data-relatename']
                str_new_ids = ''
                rel_hero_arr.append(str_ids)
                # map every ename to its local topic id
                for id in str_ids.split('|'):
                    ms_hero = self.session.query(Hero).filter(
                        Hero.ename == id).first()
                    if ms_hero is not None:
                        str_new_ids += str(ms_hero.topicid)
                        str_new_ids += '|'
                # drop the trailing '|'
                str_new_ids = str_new_ids[:-1]
                rel_hero_arr.append(str_new_ids)
            pprint(rel_hero_arr)
            hero_res['rel_heros'] = rel_hero_arr
            # assemble recommended equipment
            equipment_arr = []
            equipments = soup.find_all(class_='equip-list fl')
            for equipment in equipments:
                equipment_arr.append(equipment['data-item'])
            hero_res['equipments'] = equipment_arr
            # assemble inscriptions (铭文)
            mings = soup.find(id='sugg-u1') if False else \
                soup.find(class_='sugg-u1')['data-ming']
            hero_res['mings'] = mings
            # assemble ability bars; [6:] strips the leading style prefix
            # of the inline style attribute — presumably 'width:'; the
            # values are joined with '|'
            abilities = soup.find_all(class_='cover-list-bar')
            hero_ability = ''
            for ability in abilities:
                hero_ability += str(ability.i['style'])[6:] + '|'
            hero_res['ability'] = hero_ability[:-1]
            # assemble suggested skill icons
            sug_skills = soup.find_all('img', class_='jn-pic1')
            sug_skills_arr = []
            for sug_skill in sug_skills:
                sug_skills_arr.append(sug_skill['src'])
            hero_res['sug_skills'] = sug_skills_arr
            # assemble summoner skills
            hero_zhs_skills = soup.find(id='skill3')['data-skill']
            hero_res['zhs_skills'] = hero_zhs_skills
            # assemble skins
            skins = soup.find(
                class_='pic-pf-list pic-pf-list3')['data-imgname']
            hero_res['skins'] = skins
            hero_json = json.dumps(hero_res)
            self._saveJson(hero_json, '{0}.json'.format(hero_id))
        except Exception as e:
            # BUG FIX: '…' + e concatenated str with an Exception
            # (TypeError) and masked the real parse error.
            logger.error('parse Html error:{0}'.format(e))