def get_total_sales(self, session, agentipjj, page_num, shop_id):
    try:
        count = 0
        while count < 20:
            print("agentipjj:" + agentipjj)
            proxies = {"http": agentipjj, "https": agentipjj}
            parms_pager = "{{\"shopId\":\"{shop_id}\",\"currentPage\":{page_num},\"pageSize\":\"30\",\"sort\":\"hotsell\",\"q\":\"\"}}"
            parms_url = "https://unzbmix25g.api.m.taobao.com/h5/com.taobao.search.api.getshopitemlist/2.0/?appKey=12574478&t={stmp}&sign={sign}&api=com.taobao.search.api.getShopItemList&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp12&data={pager}"
            stmp = "%s739" % (long(time.time()))
            referer = "https://shop{shop_id}.m.taobao.com/?shop_id={shop_id}&sort=d".format(shop_id=shop_id)
            pager = parms_pager.format(shop_id=shop_id, page_num=page_num)
            # The mtop sign is derived from the _m_h5_tk token; fall back to a fixed value
            # when the cookie has not been set yet (the first request only fetches cookies).
            taobao_cookies = session.cookies.get_dict('.taobao.com')
            if taobao_cookies and '_m_h5_tk' in taobao_cookies:
                h5_tk = taobao_cookies['_m_h5_tk']
                token = re.compile('(.*)(?=_)').findall(h5_tk)[0]
                value = '%s&%s&12574478&%s' % (token, stmp, pager)
                sign = self.execute_javascript(value)
            else:
                sign = "a013c868718eddb116eac3da0aa7974a"
            url = parms_url.format(pager=pager, stmp=stmp, sign=sign)
            requests_parms = {}
            headers = {
                'Referer': referer,
                'Host': 'api.m.taobao.com',
                'Cache-Control': 'no-cache',
                'Pragma': 'no-cache',
                'timeout': '5000',
                'User-Agent': Html_Downloader.GetUserAgent()
            }
            if agentipjj:
                requests_parms['proxies'] = proxies
                requests_parms['verify'] = False
            try:
                result = session.get(url, headers=headers, **requests_parms)
            except Exception:
                # the proxy failed: switch to a new one and count the attempt so the loop stays bounded
                count += 1
                agentipjj = Utils.GetMyAgent()
                continue
            count += 1
            if result.status_code != 200:
                logging.info("proxy ip returned status {log_code}".format(log_code=result.status_code))
                agentipjj = Utils.GetMyAgent()
                sleep(2)
            else:
                print(result.status_code)
                if result.ok:
                    sleep(2)
                    return result.content
                break
    except Exception as e:
        logging.info("error while crawling totalSoldQuantity: {m}".format(m=e.message))
        print("error while crawling totalSoldQuantity: {e}".format(e=e.message))
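# The sign for the h5 mtop call above is produced by self.execute_javascript(value).
# If that script implements the usual md5-based h5 signing, a pure-Python stand-in
# might look like the sketch below. This is an assumption for illustration only;
# the authoritative logic is whatever JavaScript execute_javascript actually runs.
import hashlib

def mtop_h5_sign(token, stmp, pager, app_key="12574478"):
    # assumed: the sign is the hex MD5 of "token&timestamp&appKey&data" (Python 2 str input)
    value = "%s&%s&%s&%s" % (token, stmp, app_key, pager)
    return hashlib.md5(value).hexdigest()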
def run(self):
    agentip = Utils.GetMyAgent()
    agentipjj = Utils.GetMyAgent()
    day = datetime.now().strftime("%Y%m%d")
    search_url = "https://s.taobao.com/search?q={q}&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_{day}&ie=utf8&bcoffset=0&ntoffset=1&p4ppushleft=%2C44&sort=sale-desc&s={s}"
    page_Url = search_url.format(q=self.key_word, day=day, s=0)
    header = {'ip': agentip}
    total = 3  # crawl at most 3 result pages
    totalpage = self.crawlTotalpage(page_Url, header)
    if totalpage:
        total = totalpage if totalpage < total else total
    total = total + 1
    for i in range(1, total):
        # each result page holds 44 items, so the offset is (page - 1) * 44
        t_url = search_url.format(q=self.key_word, day=day, s=(i - 1) * 44)
        try:
            ok, response = Html_Downloader.Download_Html(t_url, {}, header)
            if not ok:
                count = 0
                while count < 4:
                    sleep(2)
                    agentip = Utils.GetMyAgent()
                    header = {'ip': agentip}
                    ok, response = Html_Downloader.Download_Html(t_url, {}, header)
                    if ok:
                        break
                    count += 1
                    if count == 3:
                        header = {}
            if ok:
                html = etree.HTML(response.text)
                matchs = html.xpath("//script[contains(.,'g_page_config')]")
                if len(matchs) > 0:
                    data = re.compile("g_page_config=(.*)?;g_srp_loadCss").match(
                        matchs[0].text.replace("\n\n", "\n").replace("\n", "").replace(" ", ""))
                    if data.lastindex > 0:
                        data = json.loads(data.group(1).encode('utf-8'))
                        if 'mods' in data:
                            self.crawlNid(data, i, agentip, agentipjj)
                        else:
                            print("no valid json matched")
                else:
                    print("item list not found in page")
            else:
                logging.info("keyword {p} page {i}: download failed".format(p=self.key_word, i=i))
        except Exception as e:
            logging.info("keyword {p} page {i}: crawl error {m}".format(p=self.key_word, i=i, m=e.message))
def crawlMonthSales(self, nid, agentip):
    try:
        month_Sales = ""
        nid_url = "https://mdskip.taobao.com/core/initItemDetail.htm?itemId={nid}"
        refer_url = "https://detail.taobao.com/item.htm?id={nid}"
        nid_Url = nid_url.format(nid=nid)
        nid_refer = refer_url.format(nid=nid)
        cookies = "ab=12; UM_distinctid=15e7a46caf311b-0188d989dac01e-5c153d17-144000-15e7a46caf4552; thw=cn; ali_apache_id=11.131.226.119.1505353641652.239211.1; miid=2033888855982094870; l=AllZcEkSLTy0io2vJcWc-ksY6U4zk02Y; _cc_=WqG3DMC9EA%3D%3D; tg=0; _uab_collina=150780747345339957932139; cna=dNU/EvGIRjsCAQ4XY4PdDkHN; _tb_token_=3d73497b6b4b1; ali_ab=14.23.99.131.1510570522194.8; hng=CN%7Czh-CN%7CCNY%7C156; mt=ci=0_0; _m_h5_tk=c690a92415e1684e37a0d852f95c4237_1511139636041; _m_h5_tk_enc=03e0735d1910593631f521e6615c4e4b; x=124660357; uc3=sg2=AVMH%2FcTVYAeWJwo98UZ6Ld9wxpMCVcQb0e1XXZrd%2BhE%3D&nk2=&id2=&lg2=; uss=VAmowkFljKPmUhfhc%2B1GBuXNJWn9cLMEX%2FtIkJ5j0tQgoNppvUlaKrn3; tracknick=; sn=%E4%BE%9D%E4%BF%8A%E6%9C%8D%E9%A5%B0%3A%E8%BF%90%E8%90%A5; skt=53a079a2a620057d; v=0; cookie2=17f5415096176ca88c03d1fed693a1d4; unb=2077259956; t=1630b104e4d32df897451d6c96642469; uc1=cookie14=UoTdev2%2BYyNASg%3D%3D&lng=zh_CN; _umdata=85957DF9A4B3B3E8F872A3094256432F0F1549AE1C92C6CCF1E68B982581686F23BFC13A60CCABD1CD43AD3E795C914C5B383FEA6B5C410F78EAF10A11987746; isg=Au_vsoMX6XTuPe7jEO7aMMjafgM5PEijMRuJ0QF8i95lUA9SCWTTBu2ApHYV"
        # cookies="_tb_token_=f3fe5d65a6591;cookie2=171e5eb92d66332b1d52d9e2730fed33;t=bf64b0d40d912c08dd434661471b2c98;v=0"
        # split only on the first '=' so cookie values that contain '=' stay intact
        cookie_dict = {item.split('=', 1)[0].strip(): item.split('=', 1)[1]
                       for item in cookies.split(';')}
        header = {'ip': agentip, 'Referer': nid_refer, "cookies": cookie_dict,
                  'User-Agent': Html_Downloader.GetUserAgent()}
        ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
        if not ok:
            count = 0
            while count < 5:
                sleep(2)
                agentip = Utils.GetMyAgent()
                header = {'ip': agentip, 'Referer': nid_refer, 'timeout': '5000',
                          "cookies": cookie_dict, 'User-Agent': Html_Downloader.GetUserAgent()}
                ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
                if ok:
                    break
                count += 1
                print "retry {count} while fetching monthly sales".format(count=count)
        if ok and "sellCount\":" not in response.text:
            count = 0
            while count < 5:
                sleep(2)
                agentip = Utils.GetMyAgent()
                header = {'ip': agentip, 'Referer': nid_refer, 'timeout': '5000',
                          "cookies": cookie_dict, 'User-Agent': Html_Downloader.GetUserAgent()}
                if count == 4:
                    header = {}
                ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
                if ok and "sellCount\":" in response.text:
                    break
                count += 1
                print "sellCount missing from response, retry {count} for monthly sales".format(count=count)
        if ok and "sellCount\":" in response.text:
            month_Sales = str(re.compile("sellCount\":(.*?)(?=\"success\")").findall(
                response.text)[0]).replace(",", "").strip()
            print "monthly sales: {month_Sales}".format(month_Sales=month_Sales)
        return month_Sales
    except Exception as e:
        logging.info("error while crawling monthly sales: {m}".format(m=e.message))
def crawl_shop_all_item(self):
    agentIp = Utils.GetMyAgent()
    shop_id = self.shop_id
    shop_name = self.shop_name
    session = Session()
    # the first request usually fails; it is only made to obtain the session cookie
    self.get_shop_item_list(session, agentIp, 1, shop_id, shop_name)
    total_page = 1
    for i in range(100):
        print i
        # stop early once the last page has been reached
        if total_page and i >= total_page:
            break
        result = self.get_shop_item_list(session, agentIp, (i + 1), shop_id, shop_name)
        if not result:
            result = self.get_shop_item_list(session, agentIp, (i + 1), shop_id, shop_name)
        if not result:
            print("failed to fetch data")
            break
        print(result)
        # strip the JSONP callback wrapper and parse the payload
        jobj = json.loads(result.replace("mtopjsonp12(", "").replace("})", "}"))
        jsonArray = jobj['data']['itemsArray']
        self.parse_items(jsonArray, shop_id, agentIp)
        if jobj and "SUCCESS" in jobj['ret'][0]:
            total = int(jobj['data']['totalResults'])
            total_page = total / 30  # at most 30 items per page
            if total % 30:
                total_page += 1
        else:
            print("failed to fetch data")
            break
        sleep(2)
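# The JSONP payload above is unwrapped with hard-coded string replaces
# ("mtopjsonp12(" and "})"), which breaks if the callback name ever changes.
# A regex-based helper (hypothetical, not part of the original code) is a more
# tolerant way to strip the wrapper:
import json
import re

def unwrap_jsonp(text):
    # strip an arbitrary "callbackName( ... )" wrapper; assumes a single JSON object inside
    match = re.search(r"^\s*\w+\((.*)\)\s*;?\s*$", text, re.S)
    return json.loads(match.group(1) if match else text)
# e.g. jobj = unwrap_jsonp(result) instead of the chained replace() calls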
def crawlMonthSales(self, nid, agentip):
    try:
        month_Sales = ""
        nid_url = "https://mdskip.taobao.com/core/initItemDetail.htm?itemId={nid}"
        refer_url = "https://detail.taobao.com/item.htm?id={nid}"
        nid_Url = nid_url.format(nid=nid)
        nid_refer = refer_url.format(nid=nid)
        cookies = "ab=56; UM_distinctid=15e7a46caf311b-0188d989dac01e-5c153d17-144000-15e7a46caf4552; thw=cn; ali_apache_id=11.131.226.119.1505353641652.239211.1; miid=2033888855982094870; l=AllZcEkSLTy0io2vJcWc-ksY6U4zk02Y; uc2=wuf=https%3A%2F%2Ftrade.tmall.com%2Fdetail%2ForderDetail.htm%3Fbiz_order_id%3D70514222507416230%26forward_action%3D; _cc_=WqG3DMC9EA%3D%3D; tg=0; _uab_collina=150780747345339957932139; hng=CN%7Czh-CN%7CCNY%7C156; mt=ci=0_0; _tb_token_=3e0501668eb3b; x=124660357; uc3=sg2=AVMH%2FcTVYAeWJwo98UZ6Ld9wxpMCVcQb0e1XXZrd%2BhE%3D&nk2=&id2=&lg2=; uss=VW9L9wvPPdgBBh%2BJHeH%2BVW8D%2FgmRg%2B6YCnShUPaOH0CFHrL4%2FVpP4v7d; tracknick=; sn=%E4%BE%9D%E4%BF%8A%E6%9C%8D%E9%A5%B0%3A%E8%BF%90%E8%90%A5; skt=efe1ec1051eec814; v=0; cookie2=1ce9fff7464537de3d45fe012006d49d; unb=2077259956; t=1630b104e4d32df897451d6c96642469; _m_h5_tk=37be146862abddcfc955f9ec15ebb25d_1508307778971; _m_h5_tk_enc=7ab9ef3ea063dd2c4cd6d33cf84ea2a4; cna=dNU/EvGIRjsCAQ4XY4PdDkHN; uc1=cookie14=UoTcBzysjIcUbw%3D%3D&lng=zh_CN; isg=Amxsuy9SGdk0Xg26l9-JufebPUpejRva_jrq6MateJe60Qzb7jXgX2Ljh68S; _umdata=85957DF9A4B3B3E8F872A3094256432F0F1549AE1C92C6CCF1E68B982581686F23BFC13A60CCABD1CD43AD3E795C914C9A2685321202E656A2C4B44241C24328"
        # cookies="_tb_token_=f3fe5d65a6591;cookie2=171e5eb92d66332b1d52d9e2730fed33;t=bf64b0d40d912c08dd434661471b2c98;v=0"
        # split only on the first '=' so cookie values that contain '=' stay intact
        cookie_dict = {item.split('=', 1)[0].strip(): item.split('=', 1)[1]
                       for item in cookies.split(';')}
        header = {'ip': agentip, 'Referer': nid_refer, "cookies": cookie_dict,
                  'User-Agent': Html_Downloader.GetUserAgent()}
        ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
        if not ok:
            count = 0
            while count < 11:
                sleep(2)
                agentip = Utils.GetMyAgent()
                header = {'ip': agentip}
                ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
                if ok:
                    break
                count += 1
                print "retry {count} while fetching monthly sales".format(count=count)
        if ok:
            matchs = re.compile("sellCount\":(.*?)(?=\"success\")").findall(response.text)
            if not matchs:
                # extraction failed: retry with a fresh proxy (mirrors the original intent;
                # may recurse until a usable response is returned)
                return self.crawlMonthSales(nid, Utils.GetMyAgent())
            month_Sales = str(matchs[0]).replace(",", "").strip()
            print "monthly sales: {month_Sales}".format(month_Sales=month_Sales)
        return month_Sales
    except Exception as e:
        logging.info("error while crawling monthly sales: {m}".format(m=e.message))
def get_csrfId():
    url = "http://dmp.taobao.com/api/login/loginuserinfo"
    # requests matches proxies by lower-case scheme, so the key must be 'http', not 'HTTP'
    proxy = {'http': Utils.GetMyAgent()}
    response = requests.get(url=url, proxies=proxy, verify=False,
                            headers=HEADER, cookies=cookie_dict).text
    csrfid = json.loads(response)['data']['csrfId']
    return str(csrfid)
def get_response(cate, i):
    # requests matches proxies by lower-case scheme ('https'), not 'HTTPS'
    agentIp = {'https': Utils.GetMyAgent()}
    url = "https://sycm.taobao.com/mq/rank/listItems.json?cateId={cate}&categoryId={cate}&dateRange=2017-11-06%7C2017-11-12&dateRangePre=2017-11-06|2017-11-12&dateType=recent7&dateTypePre=recent7&device=0&devicePre=0&itemDetailType=1&keyword=&orderDirection=desc&orderField=payOrdCnt&page={page}&pageSize=100&rankTabIndex=0&rankType=1&seller=-1&token=67868107c&view=rank&_=1510572223929".format(
        page=i, cate=cate)
    response = requests.get(url=url, headers=HEADERS, proxies=agentIp,
                            verify=False, cookies=cookie_dict).text
    return response
def get_response(url):
    try:
        # requests matches proxies by lower-case scheme, so use 'http' as the key
        proxy = {'http': Utils.GetMyAgent()}
        # proxy['http'] = '182.34.50.90:808'
        response = requests.get(url=url, cookies=cookie_dict, headers=HEADERS,
                                verify=False, proxies=proxy).text
        html = etree.HTML(response)
        time.sleep(8)
        return html
    except Exception as e:
        logging.error("error while fetching response: {e}".format(e=e.message))
        return get_response(url)
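# The except branch above retries by calling get_response(url) recursively with no
# upper bound, and the same pattern appears in get_total_items and get_data below;
# a persistently failing URL would recurse until the interpreter's stack limit.
# A bounded loop such as this hypothetical helper is a safer retry pattern
# (it assumes Utils.GetMyAgent is importable in this module):
import logging
import time
import requests

def get_with_retry(url, max_tries=5, **kwargs):
    # try up to max_tries times with a fresh proxy, then give up and return None
    for attempt in range(1, max_tries + 1):
        try:
            proxy = {'http': Utils.GetMyAgent(), 'https': Utils.GetMyAgent()}
            return requests.get(url, proxies=proxy, verify=False, **kwargs).text
        except Exception as e:
            logging.error("attempt %s/%s for %s failed: %s", attempt, max_tries, url, e)
            time.sleep(2)
    return None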
def get_data(shop_id, item_ids, total_items, shopname):
    url = "https://tui.taobao.com/recommend?seller_id=78550821&shop_id={shop_id}&item_ids={item_id}&floorId=42296&" \
          "pSize=500&callback=detail_pine&appid=2144&count=200&pNum=0".format(shop_id=shop_id, item_id=item_ids)
    # requests matches proxies by lower-case scheme, so use 'https' as the key
    agentIp = {'https': Utils.GetMyAgent()}
    try:
        response = requests.get(url, headers=HEADERS, verify=False, proxies=agentIp)
        if response.status_code == 200 and "result" in response.text:
            # strip the JSONP wrapper before parsing
            total_json = response.text.replace("\r", "").replace("\n", "").replace("detail_pine(", "").replace("});", "}")
            total_json = json.loads(total_json)
            need_json = total_json["result"]
            for need in need_json:
                getNeedJson = dict()
                getNeedJson['month_Sales'] = str(need['monthSellCount'])
                getNeedJson['title'] = need['itemName']
                getNeedJson['item_id'] = str(need['itemId']).strip()
                getNeedJson['totalSoldQuantity'] = str(need['sellCount'])
                getNeedJson['quantity'] = str(need['quantity'])
                # price not needed: getNeedJson['promotionPrice'] = need['promotionPrice']
                getNeedJson['category_id'] = str(need['categoryId'])
                getNeedJson['picUrl'] = "https:" + need['pic']
                getNeedJson['crawl_url'] = "https:" + need['url']
                getNeedJson['crawl_time'] = TIME
                getNeedJson['shop_id'] = str(shop_id)
                getNeedJson['shop_name'] = shopname
                # write the record to redis; serialize it, since redis only stores strings/bytes
                r.set(need['itemId'], json.dumps(getNeedJson))
                # SHARE_Q.put(getNeedJson)
                '''
                dedupe by item_id inside a list; abandoned because it was too slow
                if len(DATA) == 0:
                    DATA.append(getNeedJson)
                else:
                    m = list()
                    for data in DATA:
                        m.append(data['itemId'])
                    if getNeedJson['itemId'] not in m:
                        DATA.append(getNeedJson)
                    else:
                        continue
                # DATA = list(set(DATA))
                '''
                print "item saved: {item_id}".format(item_id=need['itemId'])
    except Exception as e:
        logging.error("error while crawling {shopname}: {e}".format(shopname=shopname, e=e.message))
        return get_data(shop_id, item_ids, total_items, shopname)
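# get_data() originally passed a plain dict to r.set(), which redis-py rejects;
# serializing to JSON (as done above) also makes the record easy to read back.
# A minimal sketch of the round trip, assuming a redis.StrictRedis client with
# default connection settings (the real connection is configured elsewhere):
import json
import redis

r_example = redis.StrictRedis(host="localhost", port=6379, db=0)  # assumed settings

def save_item(item_id, record):
    # store the crawled record as a JSON string keyed by item_id
    r_example.set(str(item_id), json.dumps(record))

def load_item(item_id):
    raw = r_example.get(str(item_id))
    return json.loads(raw) if raw else None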
def crawl_yxl(self, auctionId, agentIp):
    yxl = -1
    count = 0
    while count < 20:
        agentIp = Utils.GetMyAgent()
        userAgent = Html_Downloader.GetUserAgent()
        header = {'ip': agentIp, 'user-agent': userAgent}
        text_detail_url = "https://detail.m.tmall.com/item.htm?spm=a320p.7692363.0.0&id={auctionId}".format(auctionId=auctionId)
        ok, response = Html_Downloader.Download_Html(text_detail_url, {}, header)
        if ok:
            matchs = re.compile("sellCount\":(.*?)(?=showShopActivitySize)").findall(response.text)
            if len(matchs) > 0:
                yxl = matchs[0].encode('utf-8').replace(",\"", "")
                break
        sleep(3)
        count += 1
    return yxl
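# crawl_yxl() slices the monthly-sales value out of the page with a lookahead on the
# next known key (showShopActivitySize). If sellCount is always emitted as a bare JSON
# number - an assumption, not something the original code guarantees - a tighter
# pattern can capture just the digits:
import re

def extract_sell_count(page_text):
    match = re.search(r'"sellCount"\s*:\s*(\d+)', page_text)
    return match.group(1) if match else None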
def get_tag_details(tags, csrfid): global TAG_DATA if type(tags) != list: tags = [tags] for tag in tags: url = "http://dmp.taobao.com/api/tag/{tag_id}?csrfId={csrfid}&t={time}977" \ .format(tag_id=tag, csrfid=csrfid, time=int(time.time())) proxy = dict() proxy['HTTP'] = Utils.GetMyAgent() # allow_redirects 是用来解决URI重定向问题 response = requests.get(url=url, proxies=proxy, verify=False, headers=HEADER, cookies=cookie_dict, allow_redirects=False).text data = json.loads(response) tag_data = dict() tag_data['dmp_msg'] = str( data["data"]["tag"]["tagDesc"].split(",")[0])[9:-1] # 标签信息 tag_data['dmp_msg'] = str( data["data"]["tag"]["tagDesc"].split(",")[0])[9:-1] # 获取标签数 tag_data['qualityScore'] = str(data["data"]["tag"]["qualityScore"]) # 标签标题 tag_data['tag_name'] = str(data["data"]["tag"]["tagName"]) # 获取选项信息 tag_data['options'] = data["data"]["tag"]["options"] # 获取该标签的GroupId # tag_data['GroupIds'] = data["data"]["tag"]["optionGroups"] # 获取标签的type option_groups = data["data"]["tag"]["optionGroups"] group_id_type = dict() tag_data['group_id_type'] = group_id_type for group in option_groups: tag_data['group_id'] = group['id'] group_type = group['type'] group_id_type[group_type] = group['id'] TAG_DATA[tag] = tag_data
def get_total_items(time_now, shop_url): ip = Utils.GetMyAgent() proxies = { "http": "http://{ip}".format(ip=ip), "https": "http://{ip}".format(ip=ip) } # shop_url = "https://shop67361531.taobao.com/" parms_url = "{shop_url}i/asynSearch.htm?_ksTS={now}569_240&callback=jsonp&mid=w-14766145001-0&wid=14766145001&path=/search.htm&search=y&orderType=hotsell_desc&scene=taobao_shop&pageNo={page_num}" url_test = parms_url.format(shop_url=shop_url, now=time_now, page_num=1) cookie2 = "UM_distinctid=15e7a46caf311b-0188d989dac01e-5c153d17-144000-15e7a46caf4552; thw=cn; ali_apache_id=11.131.226.119.1505353641652.239211.1; _uab_collina=150538207117146260512386; miid=2033888855982094870; l=AllZcEkSLTy0io2vJcWc-ksY6U4zk02Y; _cc_=WqG3DMC9EA%3D%3D; tg=0; _umdata=85957DF9A4B3B3E8F872A3094256432F0F1549AE1C92C6CCF1E68B982581686F23BFC13A60CCABD1CD43AD3E795C914CAF73DEDAA30E5DF4E27D6F4EB50F8E1F; cna=dNU/EvGIRjsCAQ4XY4PdDkHN; _tb_token_=36f53e7b339f5; _m_h5_tk=4570646d13ae7111fef3e2b7a043022c_1509837913393; _m_h5_tk_enc=20b6e25356a739887ddf33c092c83ece; hng=CN%7Czh-CN%7CCNY%7C156; mt=ci=0_0; x=124660357; uc3=sg2=AVMH%2FcTVYAeWJwo98UZ6Ld9wxpMCVcQb0e1XXZrd%2BhE%3D&nk2=&id2=&lg2=; uss=VAXSokZb41x3LrOdSf%2FkOXi5mZhwOKGqxWNIJ%2BcsBdECv1yvzxoYTiml; tracknick=; sn=%E4%BE%9D%E4%BF%8A%E6%9C%8D%E9%A5%B0%3A%E8%BF%90%E8%90%A5; skt=dfb70b3150f31cce; v=0; cookie2=113bd831048eba21f8cb9e18be45b345; unb=2077259956; t=1630b104e4d32df897451d6c96642469; uc1=cookie14=UoTcBrCpbGSbPg%3D%3D&lng=zh_CN; isg=AoWF8Nl9cPq6fVThFtgQUqaUlMF_6jLBbwXTU4fqQrzLHqWQT5JJpBP-XnQT" cookie_dict = {item.split('=')[0]: item.split('=')[1] for item in cookie2.split(';')} try: response = requests.get(url=url_test, proxies=proxies, headers=HEADERS, verify=False, allow_redirects=False, cookies= cookie_dict) html = etree.HTML(response.text) if response.status_code == 200 and "search-result" in response.text: total_items = html.xpath("//div[contains(@class,'search-result')]/span/text()")[0] logging.info("获取店铺{shop_url},宝贝总数:{total_items}".format(total_items=total_items, shop_url=shop_url)) return total_items else: return get_total_items(time_now, shop_url) except Exception, e: logging.error("爬取全店宝贝时{shop_url}时出错:{e}".format(shop_url=shop_url, e=e.message)) return get_total_items(time_now, shop_url)
def crawlTotalpage(self, search_url, header):
    totalpage = 0
    try:
        ok, response = Html_Downloader.Download_Html(search_url, {}, header)
        if not ok:
            count = 0
            while count < 4:
                sleep(2)
                agentip = Utils.GetMyAgent()
                header = {'ip': agentip}
                ok, response = Html_Downloader.Download_Html(search_url, {}, header)
                if ok:
                    break
                count += 1
                if count == 3:
                    header = {}
        if ok:
            html = etree.HTML(response.text)
            matchs = html.xpath("//script[contains(.,'g_page_config')]")
            if len(matchs) > 0:
                data = re.compile("g_page_config=(.*)?;g_srp_loadCss").match(
                    matchs[0].text.replace("\n\n", "\n").replace("\n", "").replace(" ", ""))
                if data.lastindex > 0:
                    data = json.loads(data.group(1).encode('utf-8'))
                    if 'mods' in data:
                        totalpage = data['mods']['pager']['data']['totalPage']
                    else:
                        print("no valid json matched")
            else:
                print("item list not found in page")
    except Exception as e:
        logging.info("error while crawling total page count: {m}".format(m=e.message))
    return totalpage
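# crawlTotalpage() strips every space and newline from the script before matching
# g_page_config, which can also mangle values that legitimately contain spaces.
# A sketch of the same extraction done on the raw script text (hypothetical helper,
# assuming the page still ends the assignment with ";g_srp_loadCss"):
import json
import re

def parse_g_page_config(script_text):
    match = re.search(r"g_page_config\s*=\s*(\{.*?\});\s*g_srp_loadCss", script_text, re.S)
    return json.loads(match.group(1)) if match else None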
def parse_items(self,jsonArray, shop_id,agentIp): shop_items=[] # agentIp=None header = {'ip': agentIp} for item in jsonArray: shop_item = {} shop_item['shop_id']=shop_id auctionId=item.get('auctionId') shop_item['item_id']=auctionId shop_item['title']= item.get('title') shop_item['picUrl']="http:"+item.get('picUrl') # shop_item['picUrl']=re.compile("/([^/]*)(?=_)").findall(item.get('picUrl'))[0] shop_item['salePrice']= item.get('salePrice') shop_item['reservePrice']= item.get('reservePrice') shop_item['quantity']= item.get('quantity') shop_item['totalSoldQuantity']= item.get('totalSoldQuantity') #获取链接里面的详情 t_detail_url="https://item.taobao.com/item.htm?spm=a1z10.3-c-s.w4002-14766145001.18.6584ae82X93XhC&id={auctionId}".format(auctionId=auctionId) shop_item['crawl_url']=t_detail_url print(t_detail_url) shop_item['crawl_time'] = long(time.time()) sold=item.get('sold') if "tmall" in self.shop_url: sold="" sold=self.crawl_yxl(auctionId,agentIp) #天猫的月销量另外获取 shop_item['sold']=sold try: ok, response = Html_Downloader.Download_Html(t_detail_url,{}, header) if not ok: count =0 while(count<4): sleep(2) agentip = Utils.GetMyAgent() header = {'ip': agentip} ok, response = Html_Downloader.Download_Html(t_detail_url,{},header) if ok and "category=item" in response.text: break count+=1 if count==3: header={} if ok and "category=item" not in response.text: count =0 while(count<4): sleep(2) agentip = Utils.GetMyAgent() header = {'ip': agentip} ok, response = Html_Downloader.Download_Html(t_detail_url,{},header) if ok and "category=item" in response.text: break count+=1 if count==3: header={} if ok and "category=item" in response.text: html = etree.HTML(response.text) # shop_id = "" category_id= re.compile("item%5f(.*?)(?=&)").findall(response.text)[0] shop_item['category_id']=category_id if html.xpath("//dl[contains(@class,'tb-prop')]"): for prop in html.xpath("//dl[contains(@class,'tb-prop')]"): if not prop in html.xpath("//dl[contains(@class,'tb-hidden')]"): prop_value_id=[] prop_name = prop.xpath(".//dt/text()")[0].encode('utf-8') for value in prop.xpath(".//dd/ul/li"): sub_value_id= [] sku_id = value.get('data-value') sub_value_id.append(sku_id) if value.xpath('./a/span/text()'): sku_name = value.xpath('./a/span/text()')[0].encode('utf-8') sub_value_id.append(sku_name) # prop_value_id.append(";".join(sub_value_id)) if value.xpath('./a')[0].get('style') and re.compile("/([^/]*)(?=_!!|_M2)").findall( value.xpath('./a')[0].get('style')): sku_img_url=re.compile("/([^/]*)(?=_!!|_M2)").findall(value.xpath('./a')[0].get('style'))[0] sub_value_id.append(sku_img_url) prop_value_id.append(";".join(sub_value_id)) # shop_item[prop_name] ="&&||".join(prop_value_id) shop_item[prop_name] =prop_value_id if html.xpath("//ul[@id='J_UlThumb']"): stype_img_id=[] if html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li/div"): for value1 in html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li/div"): if value1.xpath('./a')[0].xpath('./img')[0].get('data-src') and re.compile("/([^/]*)(?=_!!|_M2)").findall( value1.xpath('./a')[0].xpath('./img')[0].get('data-src')): sku_img_id=re.compile("/([^/]*)(?=_!!|_M2)").findall(value1.xpath('./a')[0].xpath('./img')[0].get('data-src'))[0] stype_img_id.append(sku_img_id) elif html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li"): for value1 in html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li"): if value1.xpath('./a')[0].xpath('./img')[0].get('src') and re.compile("/([^/]*)(?=_!!|_M2)").findall( value1.xpath('./a')[0].xpath('./img')[0].get('src')): 
sku_img_id=re.compile("/([^/]*)(?=_!!|_M2)").findall(value1.xpath('./a')[0].xpath('./img')[0].get('src'))[0] stype_img_id.append(sku_img_id) shop_item["img_attr"]="&&||".join(stype_img_id) if html.xpath("//ul[@id='J_AttrUL']"): styleliList=[] for styleli in html.xpath("//ul[@id='J_AttrUL']")[0].xpath(".//li"): if styleli.xpath('./text()'): styleliText= styleli.xpath('./text()')[0].encode('utf-8').strip() styleliList.append(styleliText) elif html.xpath("//div[@id='attributes']"): styleliList=[] for styleli in html.xpath("//div[@id='attributes']")[0].xpath(".//ul/li"): if styleli.xpath('./text()'): styleliText= styleli.xpath('./text()')[0].encode('utf-8').strip() styleliList.append(styleliText) shop_item["attribute"]="&&||".join(styleliList) except Exception, e: logging.info("----详情抓取错误----".format(e=e.message)) shop_items.append(shop_item)
logUUZS = logZS + getData
ok, result = Html_Downloader.Download_Html(logUU, {}, {})
if ok:
    result_json = json.loads(result.content)
    # result_ok = bool(result_json['status'])
ok, result = Html_Downloader.Download_Html(logUUZS, {}, {})
if ok:
    result_json = json.loads(result.content)
    # result_ok = bool(result_json['status'])


if __name__ == "__main__":
    starttime = datetime.now()
    start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    agentIp = Utils.GetMyAgent()
    '''
    urlLog = "http://192.168.10.198:8080/pdd/CrawlerLogController/getAllNick"
    log = "http://192.168.10.198:8080/pdd/CrawlerLogController/SaveCrawlerLog?"
    ok, result = Html_Downloader.Download_Html(urlLog, {}, {})
    if ok:
        result_json1 = json.loads(result.content)
        jsonArrayShop = result_json1['data']
        for itemShop in jsonArrayShop:
            shop_url = itemShop.get('shop_url')
            shop_id = itemShop.get('shop_id')
            shop_name = itemShop.get('shop_name')
    '''
    shop_url = "https://newbalancekids.tmall.com/"
    shop_id = "101815493"
    shop_name = "newbalance童鞋旗舰店"
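# The crawler-log URLs above are built by plain string concatenation (logZS + getData).
# A sketch of building the query string with urllib.urlencode instead, so values are
# escaped properly; the parameter names here are hypothetical, since the real getData
# string is assembled elsewhere in the script (Python 2 urllib):
import urllib

def build_log_url(base_url, crawl_content, message, shop_id, shop_name, start_time):
    params = {
        'crawl_content': crawl_content,
        'message': message,
        'shop_id': shop_id,
        'shop_name': shop_name,
        'start_time': start_time,
    }
    return base_url + urllib.urlencode(params)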
def crawl_shop_all_item(self): agentIp = Utils.GetMyAgent() shop_id = self.shop_id shop_name = self.shop_name userAgent = Html_Downloader.GetUserAgent() header = {'ip': agentIp, 'user-agent': userAgent} test_detail_url = "{shop_url}shop/shop_auction_search.do?ajson=1&_tm_source=tmallsearch&spm=a320p.7692171.0.0&sort" \ "=d&p={page}&page_size={page_size}&from=h5".format(shop_url=self.shop_url, page_size=1, page=1) test_detail_url = test_detail_url.replace(".tmall.com", ".m.tmall.com") try: ok, response = Html_Downloader.Download_Html( test_detail_url, {}, header) if not ok: count = 0 while (count < 4): sleep(2) agentip = Utils.GetMyAgent() header = {'ip': agentip} ok, response = Html_Downloader.Download_Html( test_detail_url, {}, header) if ok: break count += 1 if count == 3: header = {} if ok: jsonArray = json.loads(response.content) # 获取解析的json total_page = jsonArray.get("total_page") total_results = jsonArray.get("total_results") page_size = jsonArray.get("page_size") logging.info("shopname:" + shop_name + " total_page:" + total_page + " total_results:" + total_results + " page_size:" + page_size) print "total_page:" + total_page + "total_results:" + total_results + "page_size:" + page_size for i in range(int(total_page)): print i + 1 test_detail_url = "{shop_url}shop/shop_auction_search.do?ajson=1&_tm_source=tmallsearch&spm=a320p.7692171.0.0&sort=d&p={page}&page_size={page_size}&from=h5".format( shop_url=self.shop_url, page_size=page_size, page=i + 1) test_detail_url = test_detail_url.replace( ".tmall.com", ".m.tmall.com") ''' if int(total_page)==(i+1): lastCount=int(total_results)-i*int(page_size) ok, response = Html_Downloader.Download_Html(test_detail_url,{}, header) if not ok: count =0 while(count<11): sleep(2) agentip = Utils.GetMyAgent() header = {'ip': agentip} ok, response = Html_Downloader.Download_Html(test_detail_url,{},header) if ok and "price" in response.text and lastCount-response.text.count("price")<2: break count+=1 if count==10: header={} print response.text.count('price') if ok and "price" not in response.text: print "111" count =0 while(count<11): sleep(2) agentip = Utils.GetMyAgent() header = {'ip': agentip} ok, response = Html_Downloader.Download_Html(test_detail_url,{},header) if ok and "price" in response.text and lastCount-response.text.count("price")<2: break count+=1 if count==10: header={} if ok and lastCount-response.text.count("price")>2: while(count<11): sleep(2) agentip = Utils.GetMyAgent() header = {'ip': agentip} ok, response = Html_Downloader.Download_Html(test_detail_url,{},header) if ok and "price" in response.text and lastCount-response.text.count("price")<2: break count+=1 if count==10: header={} if ok and lastCount-response.text.count("price")<2: logging.info("成功获取price字符串并开始解析") self.parse_items(response.content,shop_id,agentIp,shop_name,userAgent) else: ''' ok, response = Html_Downloader.Download_Html( test_detail_url, {}, header) if not ok: count = 0 while (count < 11): sleep(2) agentip = Utils.GetMyAgent() header = {'ip': agentip} ok, response = Html_Downloader.Download_Html( test_detail_url, {}, header) if ok: break count += 1 if count == 10: header = {} if ok: # logging.info("成功获取price字符串并开始解析") self.parse_items(response.content, shop_id, agentIp, shop_name, userAgent) except Exception, e: logging.error("抓取店铺:{shop_name}失败,店铺id:{shop_id},错误内容{m}".format( shop_name=shop_name, shop_id=shop_id, m=e.message, )) crawl_content = "抓取列表页有错" message = e.message start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) 
insertLog(crawl_content, message, shop_id, agentIp, test_detail_url, start_time, shop_name)
def get_coverage(total_combins): csrfid = get_csrfId() for combine in total_combins: DATA = dict() # tag的tagName 通过tagid获得并存储 DATA['tag_name'] = str() # option内容的标签,通过optionName直接获得 DATA['option_name'] = str() # tagid集合标签 DATA['tagids'] = str() proxy = dict() proxy['HTTP'] = Utils.GetMyAgent() url_data = "http://dmp.taobao.com/api/analysis/coverage" # 要对post数据进行循环重组好发送数据 post_data = dict() post_data['csrfId'] = csrfid post_data['user'] = 1 # 这里的i 最好也为字典 # @@这里有个很大的问题就是当这个标签有需要输入的信息时就需要单独添加标签@@ same_group_list = list() same_group_dict = dict() for same_group in combine: if 'SHOP' not in str(same_group): if same_group['GroupId'] not in same_group_dict.keys(): some_group_list2 = list() some_group_list2.append(same_group) same_group_dict[same_group['GroupId']] = some_group_list2 elif same_group['GroupId'] in same_group_dict.keys(): same_group_dict[same_group['GroupId']].append(same_group) if 'SHOP' in str(same_group): same_group_list.append(same_group) same_group_list.append(same_group_dict) # num_post = len(same_group_list) # if "SHOP" in str(combine):num_post += 1 # same_group_list = list(set(same_group_list)) num_post = 0 for single_combine in same_group_list: # 开始对单个请求信息进行解析,目标为字典 if 'SHOP' in single_combine: shop_list = single_combine['SHOP'] tagId = single_combine['tagId'] DATA['tagids'] += str(tagId) + "_" for shop in shop_list: if 'shop_id' in str(shop): post_data["options{a}.operatorType".format( a=[num_post])] = 1 post_data["options{a}.optionGroupId".format( a=[num_post])] = shop['GroupId'] post_data["options{a}.source".format( a=[num_post])] = "all" post_data["options{a}.tagId".format( a=[num_post])] = tagId post_data["options{a}.value".format( a=[num_post])] = shop['shop_id'] post_data["options{a}.optionNameMapStr".format( a=[num_post])] = str( {str(shop['shop_id']): shop['shop_name']}) num_post += 1 # DATA['tag_name'].append(shop['shop_name']+"_") DATA['option_name'] += str(shop['shop_name']) + "_" else: post_data["options{a}.operatorType".format( a=[num_post])] = 1 post_data["options{a}.optionGroupId".format( a=[num_post])] = shop['GroupId'] post_data["options{a}.source".format( a=[num_post])] = "all" post_data["options{a}.tagId".format( a=[num_post])] = tagId # post_data["options{a}.value".format(a=[num_post])] = shop['shop_id'] # 这里input的value写死了,因为产品要求输入为1 post_data["options{a}.value".format( a=[num_post])] = "1~999999999" num_post += 1 DATA['tag_name'] += "最近180天店内购买频次" + "_" DATA['option_name'] += "1~999999999_" else: # @@@@@@这里要注意啊!要按照groupid进行分组操作,现在没有。。。。。。所以数据有误 for key in single_combine.keys(): # key = single_combine.keys()[keys] optionNameMapStr = dict() value = list() for same_groop_tag in single_combine[key]: optionValue = int(same_groop_tag['optionValue']) optionNameMapStr[str( optionValue)] = same_groop_tag['optionName'] tagId = same_groop_tag['tagId'] option_name_forDATA = list() option_name_forDATA.append( str(same_groop_tag['optionName'])) value.append(optionValue) for data in option_name_forDATA: DATA['option_name'] += data + "_" DATA['tagids'] += str(tagId) + "_" post_data["options{a}.operatorType".format( a=[num_post])] = 1 post_data["options{a}.optionGroupId".format( a=[num_post])] = key post_data["options{a}.source".format(a=[num_post])] = "all" post_data["options{a}.tagId".format(a=[num_post])] = tagId post_data["options{a}.value".format(a=[num_post])] = value post_data["options{a}.optionNameMapStr".format( a=[num_post])] = str(optionNameMapStr) DATA['tag_name'] += str(TAG_DATA[tagId]['tag_name']) + "_" num_post += 1 logging.info(post_data) try: response = 
requests.post(url_data, proxies=proxy, verify=False, headers=HEADER, cookies=cookie_dict, allow_redirects=False, data=post_data).text DATA['count'] = str(json.loads(response)['data']['coverage']) except Exception, e: logging.error("解析得到响应数据时发生错误{e}".format(e=e)) response = requests.post(url_data, proxies=proxy, verify=False, headers=HEADER, cookies=cookie_dict, allow_redirects=False, data=post_data).text DATA['tag_name'] = str(DATA['tag_name'])[:-1] DATA['option_name'] = str(DATA['option_name'])[:-1] DATA['tagids'] = str(DATA['tagids'])[:-1] DATA['crawl_time'] = int(time.time()) local_set.save(DATA) print response
def crawlMonthSales(self, nid, agentip): try: month_Sales = "" nid_url = "https://mdskip.taobao.com/core/initItemDetail.htm?itemId={nid}" refer_url = "https://detail.taobao.com/item.htm?id={nid}" nid_Url = nid_url.format(nid=nid) nid_refer = refer_url.format(nid=nid) cookies = "x=__ll%3D-1%26_ato%3D0; l=AhERSU92PmRba9QUgSCkQMF6oRaqOoXt; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; cna=dNU/EvGIRjsCAQ4XY4PdDkHN; _m_h5_tk=7d8d6e65e5c676a6d0a69c26f7436ea1_1510363282671; _m_h5_tk_enc=e32129060738b7ce01e9114c9bec037f; sm4=440100; hng=CN%7Czh-CN%7CCNY%7C156; uc1=cookie14=UoTde95xncLyFQ%3D%3D&lng=zh_CN; uc3=sg2=Vq0THzNyGHIH22DuvMx9ZEwXL5qc2kn7REWHdois6v0%3D&nk2=&id2=&lg2=; uss=AQDPJiEXAu47o41b5k%2BKpKRT3Ckpz9nqnJX2F%2F7kZG6ttuI82ZnQa7ZL; t=1630b104e4d32df897451d6c96642469; unb=2607292494; sn=sitiselected%E6%97%97%E8%88%B0%E5%BA%97%3A%E5%A4%A7%E9%BA%A6; _tb_token_=eef7bd7b7abd6; cookie2=23bb087c638814ce8a8e329ead5332d4; isg=ApqaMZmelJirXxuDoGSRqtW160B8YxWwfLxcMqQTRi34FzpRjFtutWDlkdVw" # cookies="_tb_token_=f3fe5d65a6591;cookie2=171e5eb92d66332b1d52d9e2730fed33;t=bf64b0d40d912c08dd434661471b2c98;v=0" cookie_dict = { item.split('=')[0]: item.split('=')[1] for item in cookies.split(';') } header = { 'ip': agentip, 'Referer': nid_refer, "cookies": cookie_dict, 'User-Agent': Html_Downloader.GetUserAgent() } ok, response = Html_Downloader.Download_Html(nid_Url, {}, header) if not ok: count = 0 while count < 5: sleep(2) agentip = Utils.GetMyAgent() header = { 'ip': agentip, 'Referer': nid_refer, 'timeout': '5000', "cookies": cookie_dict, 'User-Agent': Html_Downloader.GetUserAgent() } ok, response = Html_Downloader.Download_Html( nid_Url, {}, header) if ok: break count += 1 print "获取月销量第{count}试错".format(count=count) if ok and "sellCount\":" not in response.text: count = 0 while count < 10: sleep(2) agentip = Utils.GetMyAgent() header = { 'ip': agentip, 'Referer': nid_refer, 'timeout': '5000', "cookies": cookie_dict, 'User-Agent': Html_Downloader.GetUserAgent() } if count == 9: header = {} ok, response = Html_Downloader.Download_Html( nid_Url, {}, header) if ok and "sellCount\":" in response.text: break count += 1 print "sellCount不在反馈中,获取月销量第{count}试错".format(count=count) if ok and "sellCount\":" in response.text: month_Sales = str( re.compile("sellCount\":(.*?)(?=\"success\")").findall( response.text)[0]).replace(",", "").replace(",", "").strip() print "获得月销量为:{month_Sales}".format(month_Sales=month_Sales) return month_Sales except Exception, e: logging.info("月销量爬取错误{m}".format(m=e.message))
def parse_items(self, content, shop_id, agentIp, shop_name, userAgent): try: # start_time2=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) jsonArray = json.loads(content) jsonResult = jsonArray.get("items") shop_items = [] header = {'ip': agentIp, 'user-agent': userAgent} print "开始解析列表json数据" for item in jsonResult: shop_item = {} shop_item['shop_id'] = str(shop_id) shop_item['shop_name'] = shop_name item_id = str(item.get("item_id")).strip() shop_item['item_id'] = item_id shop_item['title'] = item.get('title').encode('utf-8') shop_item['picUrl'] = "https:" + item.get('img') # print item.get('price') # 现在的销售价 # shop_item['salePrice'] = item.get('price') shop_item['totalSoldQuantity'] = str( item.get('totalSoldQuantity')) crawl_url = "https:" + item.get('url') shop_item['crawl_url'] = crawl_url.replace( ".m.tmall.com", ".tmall.com") shop_item['crawl_time'] = long(time.time()) # 获取quantity接口url # 获取Items接口url category_id = "" category_id_Url = "http://d.da-mai.com/index.php?r=itemApi/getItemInfoByItemId&item_id=" + item_id ok, response = Html_Downloader.Download_Html( category_id_Url, {}, header) if not ok: count = 0 while count < 4: sleep(1) agentip = Utils.GetMyAgent() header = {'ip': agentip} if count == 3: header = {} ok, response = Html_Downloader.Download_Html( category_id_Url, {}, header) if ok: break count += 1 if ok: jsonItems = json.loads(response.content) category_id = jsonItems['data']['data']['cid'] total_quantity = 0 quantity_Url = "http://yj.da-mai.com/index.php?r=itemskus/getSkus&fields=*&num_iids={item_id}".format( item_id=item_id) ok, response = Html_Downloader.Download_Html( quantity_Url, {}, header) if not ok: count = 0 while (count < 4): sleep(2) agentip = Utils.GetMyAgent() header = {'ip': agentip} ok, response = Html_Downloader.Download_Html( quantity_Url, {}, header) if ok and "quantity" in response.text: break count += 1 if count == 3: header = {} if ok and "quantity" not in response.text: count = 0 while (count < 4): sleep(2) agentip = Utils.GetMyAgent() header = {'ip': agentip} ok, response = Html_Downloader.Download_Html( quantity_Url, {}, header) if ok and "quantity" in response.text: break count += 1 if count == 3: header = {} if ok and "quantity" in response.text: print "成功获取sku的json字符串并开始解析" jsonItems = json.loads(response.content) # 获取解析的json total_data = jsonItems.get("data") for date in total_data: quantity = date.get("quantity") total_quantity = total_quantity + quantity shop_item['category_id'] = str(category_id) shop_item['quantity'] = str(total_quantity) agentip = Utils.GetMyAgent() shop_item['month_Sales'] = self.crawlMonthSales( item_id, agentip) shop_items.append(shop_item) post_data = {'data': json.dumps(shop_items)} if not self.process_request(SAVE__INSERT_API, post_data): sleep(3) self.process_request(SAVE__INSERT_API, post_data) # if not self.process_request(SAVE__INSERT_API_ZS, post_data): # sleep(3) # self.process_request(SAVE__INSERT_API_ZS, post_data) except Exception, e: logging.info( "抓取店铺:{shop_name}失败,抓取店铺链接:{shop_url},店铺id:{shop_id},错误内容{m}". format(shop_name=shop_name, shop_id=shop_id, m=e.message)) crawl_content = "解析接口数据有误" message = e.message end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) insertLog(crawl_content, message, shop_id, agentIp, "", start_time, shop_name)
def get_coverage(options_list, csrfid): proxy = dict() proxy['HTTP'] = Utils.GetMyAgent() url_data = "http://dmp.taobao.com/api/analysis/coverage" for option in options_list: num = 2 post_data = dict() post_data['csrfId'] = csrfid post_data['user'] = 1 # 开始组合post数据,先组成主标签即店铺标签 post_data["options[{a}].operatorType".format(a=0)] = 1 post_data["options[{a}].optionGroupId".format(a=0)] = 304 post_data["options[{a}].source".format(a=0)] = "all" post_data["options[{a}].tagId".format(a=0)] = 110063 post_data["options[{a}].value".format(a=0)] = 145841584 # 填shopId # post_data["options[{a}].value".format(a=0)] = 36236493 # 填shopId # post_data["options[{a}].optionNameMapStr".format(a=0)] = '{"36236493":"依俊服饰"}' # 填店铺信息 post_data["options[{a}].optionNameMapStr".format( a=0)] = '{"145841584":"麦斯威尔旗舰店"}' # 填店铺信息 post_data["options[{a}].operatorType".format(a=1)] = 1 post_data["options[{a}].optionGroupId".format(a=1)] = 111 post_data["options[{a}].source".format(a=1)] = "all" post_data["options[{a}].tagId".format(a=1)] = 110063 post_data["options[{a}].value".format(a=1)] = "1~999999999" key_list = option.keys() for key in key_list: # 每个key代表一个tagId option_id = option[key] options = TAG_DATA[key]['options'] # DATA['tag_name'] += TAG_DATA[key]['tag_name'] + '_' for get_right_option in options: DATA = dict() DATA['id'] = str() # DATA['tag_name'] = str() DATA['option_value'] = str() DATA['option_group_id'] = str() DATA['option_name'] = "本店圈定人数" DATA['id'] += '110063_110063_' DATA['option_value'] += '145841584_1~9999999_' DATA['option_group_id'] += '304_111_' DATA['id'] += str(key) + '_' DATA['option_group_id'] += str(TAG_DATA[key]['group_id']) + '_' optionGroupId = get_right_option['optionGroupId'] optionValue = get_right_option['optionValue'] optionName = get_right_option['optionName'] if option_id == get_right_option['id']: post_data['options[{num}].operatorType'.format( num=num)] = 1 post_data['options[{num}].optionGroupId'.format( num=num)] = optionGroupId post_data['options[{num}].source'.format(num=num)] = 'all' post_data['options[{num}].tagId'.format(num=num)] = key post_data['options[{num}].value'.format( num=num)] = optionValue optionNameMapStr = dict() optionNameMapStr[optionValue] = str(optionName) DATA['count_name'] = get_right_option['optionName'] post_data['options[{num}].optionNameMapStr'.format( num=num)] = str(optionNameMapStr) DATA['option_value'] += get_right_option[ 'optionValue'] + '_' # DATA['option_group_id'] += get_right_option num += 1 time.sleep(2) response = requests.post(url_data, proxies=proxy, verify=False, headers=HEADER, cookies=cookie_dict, allow_redirects=False, data=post_data).text DATA['count'] = str( json.loads(response)['data']['coverage']) DATA['crawl_time'] = int(time.time()) # DATA['tag_name'] = str(DATA['tag_name'])[:-1] DATA['id'] = str(DATA['id'])[:-1] DATA['option_group_id'] = str(DATA['option_group_id'])[:-1] DATA['option_value'] = str(DATA['option_value'])[:-1] print DATA local_set.save(DATA) if key == 8: for value in [ '0~4', '5~9', '10~20', '21~31', '32~42', '43~53', '53~99999' ]: DATA = dict() DATA['id'] = str() # DATA['tag_name'] = str() DATA['option_value'] = str() DATA['option_group_id'] = str() DATA['option_name'] = "本店圈定人数" DATA['id'] += '110063_110063_' DATA['option_value'] += '145841584_1~9999999_' DATA['option_group_id'] += '304_111_' DATA['id'] += str(key) + '_' DATA['option_group_id'] += str( TAG_DATA[key]['group_id']) + '_' post_data['options[{num}].operatorType'.format( num=num)] = 1 post_data['options[{num}].optionGroupId'.format( num=num)] = 8 
post_data['options[{num}].source'.format(num=num)] = 'all' post_data['options[{num}].tagId'.format(num=num)] = key post_data['options[{num}].value'.format(num=num)] = value DATA['count_name'] = value DATA['option_value'] += value response = requests.post(url_data, proxies=proxy, verify=False, headers=HEADER, cookies=cookie_dict, allow_redirects=False, data=post_data).text DATA['count'] = str( json.loads(response)['data']['coverage']) DATA['crawl_time'] = int(time.time()) # DATA['option_group_id'] += "8" # DATA['id'] += "8" # DATA['tag_name'] = str(DATA['tag_name'])[:-1] DATA['id'] = str(DATA['id'])[:-1] DATA['option_group_id'] = str(DATA['option_group_id'])[:-1] DATA['option_value'] = str(DATA['option_value']) print DATA local_set.save(DATA) DATA['option_value'] = '145841584_1~9999999_' DATA['option_group_id'] = '304_111_' DATA['id'] = '110063_110063_' break if key == 7: for value in [ '0~500', '500~1000', '1000~2000', '2000~3000', '3000~99999' ]: DATA = dict() DATA['id'] = str() # DATA['tag_name'] = str() DATA['option_value'] = str() DATA['option_group_id'] = str() DATA['option_name'] = "本店圈定人数" DATA['id'] += '110063_110063_' DATA['option_value'] += '145841584_1~9999999_' DATA['option_group_id'] += '304_111_' DATA['id'] += str(key) + '_' DATA['option_group_id'] += str( TAG_DATA[key]['group_id']) + '_' post_data['options[{num}].operatorType'.format( num=num)] = 1 post_data['options[{num}].optionGroupId'.format( num=num)] = key post_data['options[{num}].source'.format(num=num)] = 'all' post_data['options[{num}].tagId'.format(num=num)] = key post_data['options[{num}].value'.format(num=num)] = value DATA['count_name'] = value DATA['option_value'] += value response = requests.post(url_data, proxies=proxy, verify=False, headers=HEADER, cookies=cookie_dict, allow_redirects=False, data=post_data).text DATA['count'] = str( json.loads(response)['data']['coverage']) DATA['crawl_time'] = int(time.time()) # DATA['tag_name'] = str(DATA['tag_name'])[:-1] DATA['option_group_id'] += "7" DATA['id'] += "7" DATA['id'] = str(DATA['id'])[:-2] DATA['option_group_id'] = str(DATA['option_group_id'])[:-1] DATA['option_value'] = str(DATA['option_value']) print DATA local_set.save(DATA) DATA['option_value'] = '145841584_1~9999999_' DATA['option_group_id'] = '304_111_' DATA['id'] = '110063_110063_' break
def get_detail(): rabbits = local_set.find({"所属店铺": "六只兔子 高端内裤 内衣店"}) rabbits2 = local_set.find({"所属店铺": "莎琪儿私藏内衣店"}) for rabbit in rabbits2: # 引流关键词 coming_url_coming = "https://sycm.taobao.com/mq/rank/listItemSeKeyword.json?cateId={cate}&categoryId={cate}&dateRange=2017-11-07%7C2017-11-13&dateRangePre=2017-11-06|2017-11-12&dateType=recent7&dateTypePre=recent7&device=0&devicePre=0&itemDetailType=1&itemId={itemid}&latitude=undefined&rankTabIndex=0&rankType=1&seller=-1&token=67868107c&view=detail&_=1510630823012".format( cate=rabbit['cate'], itemid=rabbit['item_id']) # Top10成交关键词 coming_url_deal = "https://sycm.taobao.com/mq/rank/listKeywordOrder.json?cateId={cate}&categoryId={cate}&dateRange=2017-11-07%7C2017-11-13&dateRangePre=2017-11-06|2017-11-12&dateType=recent7&dateTypePre=recent7&device=0&devicePre=0&itemDetailType=1&itemId={itemid}&latitude=undefined&rankTabIndex=0&rankType=1&seller=-1&token=67868107c&view=detail&_=1510630823016".format( cate=rabbit['cate'], itemid=rabbit['item_id']) # 无线端来源(流量来源) coming_url_wlSeList = "https://sycm.taobao.com/mq/rank/listItemSrcFlow.json?cateId={cate}&categoryId={cate}&dateRange=2017-11-13%7C2017-11-13&dateRangePre=2017-11-07|2017-11-13&dateType=recent1&dateTypePre=recent7&device=2&devicePre=0&itemDetailType=1&itemId={itemid}&rankTabIndex=0&rankType=1&seller=-1&token=67868107c&view=detail&_=1510632184267".format( cate=rabbit['cate'], itemid=rabbit['item_id']) # PC端来源(流量来源) coming_url_PC = "https://sycm.taobao.com/mq/rank/listItemSrcFlow.json?cateId={cate}&categoryId={cate}&dateRange=2017-11-13%7C2017-11-13&dateRangePre=2017-11-07|2017-11-13&dateType=recent1&dateTypePre=recent7&device=1&devicePre=0&itemDetailType=1&itemId={itemid}&rankTabIndex=0&rankType=1&seller=-1&token=67868107c&view=detail&_=1510632184266".format( cate=rabbit['cate'], itemid=rabbit['item_id']) agentIp = dict() agentIp['HTTPS'] = Utils.GetMyAgent() try: ''' if coming_url_coming: time.sleep(4) response = requests.get(url=coming_url_coming, headers=HEADERS, proxies=agentIp, verify=False, cookies=cookie_dict).text if "操作成功" in response: need_datas = json.loads(response)['content']['data'] for need_date in need_datas: for need in need_datas[str(need_date)]: data = dict() data['name'] = "Top10引流关键词" data['coming_from'] = str(need_date) # data_keyword = dict() data['shop_name'] = "六只兔子 高端内裤 内衣店" data['keyword'] = need['keyword'] data['uv'] = need['uv'] data['itemid'] = rabbit['item_id'] # data_keyword_list.append(data_keyword) # data['流量明细'] = data_keyword_list local_set2.save(data) # data = dict() else: time.sleep(30) response = requests.get(url=coming_url_coming, headers=HEADERS, proxies=agentIp, verify=False, cookies=cookie_dict).text need_datas = json.loads(response)['content']['data'] for need_date in need_datas: for need in need_datas[str(need_date)]: data = dict() data['name'] = "Top10引流关键词" data['coming_from'] = str(need_date) # data_keyword = dict() data['shop_name'] = "六只兔子 高端内裤 内衣店" data['keyword'] = need['keyword'] data['uv'] = need['uv'] data['itemid'] = rabbit['item_id'] # data_keyword_list.append(data_keyword) # data['流量明细'] = data_keyword_list local_set2.save(data) # data = dict() ''' if coming_url_deal: time.sleep(4) response = requests.get(url=coming_url_PC, headers=HEADERS, proxies=agentIp, verify=False, cookies=cookie_dict).text if "操作成功" in response: need_datas = json.loads(response)['content']['data'] for need_date in need_datas: for need in need_datas[str(need_date)]: data = dict() data['name'] = "Top10成交关键词" data['coming_from'] = str(need_date) # data_keyword = 
dict() data['shop_name'] = "六只兔子 高端内裤 内衣店" data['keyword'] = need['keyword'] data['value'] = need['value'] data['itemid'] = rabbit['item_id'] # data_keyword_list.append(data_keyword) # data['流量明细'] = data_keyword_list local_set2.save(data) # data = dict() else: time.sleep(30) response = requests.get(url=coming_url_deal, headers=HEADERS, proxies=agentIp, verify=False, cookies=cookie_dict).text need_datas = json.loads(response)['content']['data'] for need_date in need_datas: for need in need_datas[str(need_date)]: data = dict() data['name'] = "Top10成交关键词" data['coming_from'] = str(need_date) # data_keyword = dict() data['shop_name'] = "六只兔子 高端内裤 内衣店" data['keyword'] = need['keyword'] data['value'] = need['value'] data['itemid'] = rabbit['item_id'] local_set2.save(data) ''' if coming_url_wlSeList: time.sleep(4) response = requests.get(url=coming_url_PC, headers=HEADERS, proxies=agentIp, verify=False, cookies=cookie_dict).text if "操作成功" in response: need_datas = json.loads(response)['content']['data'] for need_date in need_datas: for need in need_datas[str(need_date)]: data = dict() data['name'] = "无线端来源" data['coming_from'] = str(need_date) # data_keyword = dict() data['shop_name'] = "六只兔子 高端内裤 内衣店" data['pageName'] = need['pageName'] data['uv'] = need['uv'] data['pv'] = need['pv'] data['uvRate'] = need['uvRate'] data['pvRate'] = need['pvRate'] data['itemid'] = rabbit['item_id'] local_set2.save(data) else: time.sleep(30) response = requests.get(url=coming_url_PC, headers=HEADERS, proxies=agentIp, verify=False, cookies=cookie_dict).text need_datas = json.loads(response)['content']['data'] for need_date in need_datas: for need in need_datas[str(need_date)]: data = dict() data['name'] = "Top10成交关键词" data['coming_from'] = str(need_date) data['shop_name'] = "六只兔子 高端内裤 内衣店" data['pageName'] = need['pageName'] data['uv'] = need['uv'] data['pv'] = need['pv'] data['uvRate'] = need['uvRate'] data['pvRate'] = need['pvRate'] data['itemid'] = rabbit['item_id'] local_set2.save(data) ''' ''' if coming_url_wlSeList: time.sleep(4) response = requests.get(url=coming_url_wlSeList, headers=HEADERS, proxies=agentIp, verify=False, cookies=cookie_dict).text if "操作成功" in response: need_datas = json.loads(response)['content']['data'] for need_date in need_datas: #for need in need_datas[str(need_date)]: data = dict() data['name'] = "无线端来源" # data['coming_from'] = str(need_date) # data_keyword = dict() data['shop_name'] = "六只兔子 高端内裤 内衣店" data['pageName'] = need_date['pageName'] data['uv'] = need_date['uv'] data['pv'] = need_date['pv'] data['uvRate'] = need_date['uvRate'] data['pvRate'] = need_date['pvRate'] data['itemid'] = rabbit['item_id'] local_set2.save(data) else: time.sleep(30) response = requests.get(url=coming_url_wlSeList, headers=HEADERS, proxies=agentIp, verify=False, cookies=cookie_dict).text need_datas = json.loads(response)['content']['data'] for need_date in need_datas: # for need in need_datas[str(need_date)]: data = dict() data['name'] = "无线端来源" # data['coming_from'] = str(need_date) data['shop_name'] = "六只兔子 高端内裤 内衣店" data['pageName'] = need_date['pageName'] data['uv'] = need_date['uv'] data['pv'] = need_date['pv'] data['uvRate'] = need_date['uvRate'] data['pvRate'] = need_date['pvRate'] data['itemid'] = rabbit['item_id'] local_set2.save(data) ''' except Exception, e: print e
def get_coverage(options_list, csrfid): proxy = dict() proxy['HTTP'] = Utils.GetMyAgent() url_data = "http://dmp.taobao.com/api/analysis/coverage" for option in options_list: num = 2 post_data = dict() post_data['csrfId'] = csrfid post_data['user'] = 1 # 开始组合post数据,先组成主标签即店铺标签 post_data["options[{a}].operatorType".format(a=0)] = 1 post_data["options[{a}].optionGroupId".format(a=0)] = 304 post_data["options[{a}].source".format(a=0)] = "all" post_data["options[{a}].tagId".format(a=0)] = 110063 post_data["options[{a}].value".format(a=0)] = 145841584 # 填shopId post_data["options[{a}].optionNameMapStr".format( a=0)] = '{"145841584":"麦斯威尔旗舰店"}' # 填店铺信息 post_data["options[{a}].operatorType".format(a=1)] = 1 post_data["options[{a}].optionGroupId".format(a=1)] = 111 post_data["options[{a}].source".format(a=1)] = "all" post_data["options[{a}].tagId".format(a=1)] = 110063 post_data["options[{a}].value".format(a=1)] = "1~999999999" DATA = dict() DATA['tag_id'] = str() DATA['tag_name'] = str() DATA['option_name'] = str() key_list = option.keys() for key in key_list: # 每个key代表一个tagId option_id = option[key] options = TAG_DATA[key]['options'] DATA['tag_name'] += TAG_DATA[key]['tag_name'] + '_' DATA['tag_id'] += str(key) + '_' for get_right_option in options: if option_id == get_right_option['id']: optionGroupId = get_right_option['optionGroupId'] optionValue = get_right_option['optionValue'] optionName = get_right_option['optionName'] post_data['options[{num}].operatorType'.format( num=num)] = 1 post_data['options[{num}].optionGroupId'.format( num=num)] = optionGroupId post_data['options[{num}].source'.format(num=num)] = 'all' post_data['options[{num}].tagId'.format(num=num)] = key post_data['options[{num}].value'.format( num=num)] = optionValue optionNameMapStr = dict() optionNameMapStr[optionValue] = str(optionName) post_data['options[{num}].optionNameMapStr'.format( num=num)] = str(optionNameMapStr) DATA['option_name'] += get_right_option['optionName'] + '_' num += 1 response = requests.post(url_data, proxies=proxy, verify=False, headers=HEADER, cookies=cookie_dict, allow_redirects=False, data=post_data).text DATA['tag_name'] += "制定店铺用户_" DATA['tag_id'] += '110063_' DATA['option_name'] += '1~9999999_麦斯威尔旗舰店_' DATA['count'] = str(json.loads(response)['data']['coverage']) DATA['crawl_time'] = int(time.time()) DATA['tag_name'] = str(DATA['tag_name'])[:-1] DATA['tag_id'] = str(DATA['tag_id'])[:-1] DATA['option_name'] = str(DATA['option_name'])[:-1] print DATA local_set.save(DATA)
def crawlNid(self, data, i, agentip, agentipjj): items = data['mods']['itemlist']['data']['auctions'] x = (i - 1) * 44 + 1 agentip = Utils.GetMyAgent() for item in items: shop_items = [] shop_item = {} shop_item['keyword'] = self.key_word title = item['title'] isTmall = item['shopcard']['isTmall'] shop_item['isTmall'] = isTmall title = title.replace("<spanclass=H>", "").replace("</span>", "").strip() shop_item['title'] = title nid = item['nid'].strip() shop_item['item_id'] = nid view_sales = item['view_sales'].strip() view_sales = view_sales.replace("人收货,", "").replace("人收货", "").strip() shop_item['view_sales'] = view_sales shop_item['view_price'] = item['view_price'].strip() shop_item['picUrl'] = "http:" + item['pic_url'].strip() shop_item['idnick'] = item['nick'].strip() shop_item['crawl_time'] = long(time.time()) shop_item['rank'] = x print(x) if x == 101: break x += 1 #if(x<=5): # continue detail_url = "https://detail.tmall.com/item.htm?spm=a230r.1.14.1.ebb2eb2PXquhm&id={nid}&ns=1&abbucket=20" t_detail_url = detail_url.format(nid=nid) header = {'ip': agentip} try: sleep(2) ok, response = Html_Downloader.Download_Html( t_detail_url, {}, header) if not ok: count = 0 while (count < 4): sleep(2) agentip = Utils.GetMyAgent() header = {'ip': agentip} ok, response = Html_Downloader.Download_Html( t_detail_url, {}, header) if ok: break count += 1 if count == 3: header = {} if ok: html = etree.HTML(response.text) if not "shopid" in response.text: count = 0 while (count < 4): sleep(2) agentip = Utils.GetMyAgent() header = {'ip': agentip} ok, response = Html_Downloader.Download_Html( t_detail_url, {}, header) if ok: html = etree.HTML(response.text) if "shopid" in response.text: break count += 1 if count == 3: header = {} shop_id = "" # user_id="" category_id = re.compile("item%5f(.*?)(?=&)").findall( response.text)[0] shop_item['category_id'] = category_id if html.xpath("//meta[@name='microscope-data']"): for meta in html.xpath( "//meta[@name='microscope-data']")[0].get( 'content').split(';'): if 'shopid' in meta.lower(): shop_id = meta.split("=")[1] # if 'userid=' in meta.lower(): # user_id= meta.split("=")[1] if html.xpath("//dl[contains(@class,'tb-prop')]"): for prop in html.xpath( "//dl[contains(@class,'tb-prop')]"): if not prop in html.xpath( "//dl[contains(@class,'tb-hidden')]"): prop_value_id = [] prop_name = prop.xpath( ".//dt/text()")[0].encode('utf-8') for value in prop.xpath(".//dd/ul/li"): sub_value_id = [] sku_id = value.get('data-value') sub_value_id.append(sku_id) if value.xpath('./a/span/text()'): sku_name = value.xpath( './a/span/text()')[0].encode( 'utf-8') sub_value_id.append(sku_name) if value.xpath('./a')[0].get( 'style') and re.compile( "/([^/]*)(?=_!!|_M2)").findall( value.xpath('./a')[0].get( 'style')): sku_img_url = re.compile( "/([^/]*)(?=_!!|_M2)").findall( value.xpath('./a')[0].get( 'style'))[0] sub_value_id.append(sku_img_url) prop_value_id.append( ";".join(sub_value_id)) shop_item[prop_name] = prop_value_id if html.xpath("//ul[@id='J_UlThumb']"): stype_img_id = [] if html.xpath("//ul[@id='J_UlThumb']")[0].xpath( ".//li/div"): for value1 in html.xpath("//ul[@id='J_UlThumb']" )[0].xpath(".//li/div"): if value1.xpath('./a')[0].xpath( './img' )[0].get('data-src') and re.compile( "/([^/]*)(?=_!!|_M2)").findall( value1.xpath('./a')[0].xpath( './img')[0].get('data-src')): sku_img_id = re.compile( "/([^/]*)(?=_!!|_M2)").findall( value1.xpath('./a')[0].xpath( './img')[0].get('data-src'))[0] stype_img_id.append(sku_img_id) elif html.xpath("//ul[@id='J_UlThumb']")[0].xpath( 
".//li"): for value1 in html.xpath( "//ul[@id='J_UlThumb']")[0].xpath(".//li"): if value1.xpath('./a')[0].xpath( './img')[0].get('src') and re.compile( "/([^/]*)(?=_!!|_M2)").findall( value1.xpath('./a')[0].xpath( './img')[0].get('src')): sku_img_id = re.compile( "/([^/]*)(?=_!!|_M2)").findall( value1.xpath('./a')[0].xpath( './img')[0].get('src'))[0] stype_img_id.append(sku_img_id) shop_item["attr_img"] = "&&||".join(stype_img_id) if html.xpath("//ul[@id='J_AttrUL']"): styleliList = [] # dict={} for styleli in html.xpath( "//ul[@id='J_AttrUL']")[0].xpath(".//li"): if styleli.xpath('./text()'): styleliText = styleli.xpath( './text()')[0].encode('utf-8').strip() # styleliText=styleliText.replace(":",":") # str1=styleliText.split(":")[0].encode('utf-8').strip() # str2=styleliText.split(":")[1].encode('utf-8').strip().replace("\xc2\xa0"," ").lstrip() # dict[str1] =str2 styleliList.append(styleliText) elif html.xpath("//div[@id='attributes']"): styleliList = [] # dict={} for styleli in html.xpath("//div[@id='attributes']" )[0].xpath(".//ul/li"): if styleli.xpath('./text()'): styleliText = styleli.xpath( './text()')[0].encode('utf-8').strip() # styleliText=styleliText.replace(":",":") # str1=styleliText.split(":")[0].encode('utf-8').strip() # str2=styleliText.split(":")[1].encode('utf-8').strip().replace("\xc2\xa0"," ").lstrip() # dict[str1] =str2 styleliList.append(styleliText) shop_item["attribute"] = "&&||".join(styleliList) # shop_item["attribute"]=dict except Exception, e: logging.info("关键词{p}抓取失败,nid={nid},{m}".format(p=self.key_word, nid=nid, m=e.message)) shop_item['crawl_url'] = t_detail_url shop_item['shop_id'] = shop_id session = Session() self.get_total_sales(session, agentip, 1, shop_id) #首次获取会失败只为获取cookie total_page = 1 for i in range(50): # 到最后一页就提前终止 if total_page and i >= total_page: break result = self.get_total_sales(session, agentip, (i + 1), shop_id) if not result: result = self.get_total_sales(session, agentip, (i + 1), shop_id) if (result != None): jobj = json.loads( result.replace("mtopjsonp12(", "").replace("})", "}")) # 获取解析的json jsonArray = jobj['data']['itemsArray'] total_sales = self.parse_total_sales(jsonArray, nid) if total_sales != -1: break if jobj and "SUCCESS" in jobj['ret'][0]: total = int(jobj['data']['totalResults']) total_page = total / 30 # 每页最多30个不能再多 if total % 30: total_page += 1 else: print("获取数据失败") break sleep(2) else: total_sales = "" shop_item['totalSoldQuantity'] = total_sales shop_items.append(shop_item) post_data = {'data': json.dumps(shop_items)} if not self.process_request(SAVE_INSERT_KEYWORD_API, post_data): self.process_request(SAVE_INSERT_KEYWORD_API, post_data)
def crawl_shop_all_item(self):
    # NOTE: this zero-argument variant is an unfinished stub; it is shadowed by
    # the crawl_shop_all_item(self, url) definition that follows.
    agentIp = Utils.GetMyAgent()
    shop_id = self.shop_id
    shop_name = self.shop_name
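
# --- Illustrative sketch (not part of the original code) ---
# crawlNid above unwraps the mtop JSONP response with two plain str.replace
# calls, which assumes the callback is always "mtopjsonp12" and that "})" only
# appears at the very end of the payload. The hypothetical helper below shows
# a more tolerant way to do the same unwrapping with a regex; the function
# name and its use are assumptions for illustration, not the author's method.
def strip_jsonp(raw):
    """Return the JSON body inside a JSONP wrapper such as callback({...});."""
    match = re.compile(r'^\s*[\w$.]+\s*\((.*)\)\s*;?\s*$', re.S).match(raw)
    return match.group(1) if match else raw

# Usage sketch: jobj = json.loads(strip_jsonp(result))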
def crawl_shop_all_item(self, url):
    # Fetch one page of the shop's item-list JSON, enrich every item with SKU
    # stock and detail-page attributes, then fan out over the remaining pages.
    agentIp = Utils.GetMyAgent()
    shop_id = self.shop_id
    shop_name = self.shop_name
    userAgent = Html_Downloader.GetUserAgent()
    header = {'ip': agentIp, 'user-agent': userAgent}
    text_detail_url = url
    ok, response = Html_Downloader.Download_Html(text_detail_url, {}, header)
    if ok:
        jsonArray = json.loads(response.content)  # parse the JSON response
        total_page = jsonArray.get("total_page")
        total_results = jsonArray.get("total_results")
        page_size = jsonArray.get("page_size")
        jsonResult = jsonArray.get("items")
        for item in jsonResult:
            shop_item = {}
            item_id = str(item.get("item_id")).strip()
            shop_item['item_id'] = item_id
            shop_item['title'] = item.get('title').encode('utf-8')
            shop_item['picUrl'] = "http:" + item.get('img')
            # current sale price
            shop_item['salePrice'] = item.get('price')
            shop_item['totalSoldQuantity'] = item.get('totalSoldQuantity')
            shop_item['crawl_url'] = item.get('url')
            shop_item['crawl_time'] = long(time.time())
            # API url for the item variants (colour options); no longer needed,
            # the SKU API below already returns the colour options etc.
            '''
            test_Url = "http://d.da-mai.com/index.php?r=itemApi/getItemInfoByItemId&item_id=" + item_id
            ok, response = Html_Downloader.Download_Html(test_Url, {}, header)
            if ok:
                jsonItems = json.loads(response.content)  # parse the JSON response
            '''
            # API url for detailed SKU information
            shop_item['quantity'] = 0
            getSKU_Url = "http://yj.da-mai.com/index.php?r=itemskus/getSkus&fields=*&num_iids={item_id}".format(
                item_id=item_id)
            ok, response = Html_Downloader.Download_Html(getSKU_Url, {}, header)
            if ok:
                jsonItems = json.loads(response.content)
                total_data = jsonItems.get("data")
                for date in total_data:
                    quantity = date.get("quantity")
                    shop_item['quantity'] = shop_item['quantity'] + quantity
            # fetch the item detail page info (second-screen information)
            getDetail_Url = "http://d.da-mai.com/index.php?r=itemApi/getItemInfoByItemId&item_id={item_id}".format(
                item_id=item_id)
            ok, response_detail = Html_Downloader.Download_Html(getDetail_Url, {}, header)
            if ok:
                shop_item['attribute'] = []
                # jsonDetails = response_detail['data']['data']
                jsonDetails = json.loads(response_detail.content)
                properties = jsonDetails['data']['data']['properties']
                stringName = ""
                for attri in properties:
                    # string = "{name}:{value}&&||".format(name=attri.get('name'), value=attri.get('value'))
                    name = attri.get('name')
                    value = attri.get('value')
                    if name in stringName:
                        # shop_item['attribute'].append(name)
                        string = "{value} ".format(value=value)
                        shop_item['attribute'].append(string)
                    if name not in stringName:
                        string = "{name}:{value}&&||".format(name=name, value=value)
                        shop_item['attribute'].append(string)
                    stringName = name + stringName
        # total_page is a count, not an iterable, so iterate over the page numbers
        for page in range(1, int(total_page) + 1):
            # rebuild the per-page JSON URL and crawl it
            # NOTE: the shop URL is hard coded here ("yiqianny"); it should be passed in as a parameter
            getlist_url = "https://yiqianny.m.tmall.com/shop/shop_auction_search.do?ajson=1&_tm_source=tmallsearch&" \
                          "spm=a320p.7692171.0.0&sort=d&p={page}&page_size=24&from=h5".format(page=page)
            # pass the bound method and its argument instead of calling it here,
            # otherwise the crawl runs synchronously and the Process gets None as its target
            p = multiprocessing.Process(target=self.crawl_shop_all_item, args=(getlist_url,))
            p.start()
            logging.info("开始多进程爬虫,爬取的json列表为:{url}".format(url=getlist_url))
            # also crawls the page synchronously, in addition to the subprocess above
            self.crawl_shop_all_item(getlist_url)
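
# --- Illustrative sketch (not part of the original code) ---
# The note inside crawl_shop_all_item points out that the shop URL is hard
# coded ("yiqianny"). The hypothetical helper below shows one way the per-page
# JSON URL could be built from a shop domain passed in as a parameter; the
# function name and the shop_domain argument are assumptions for illustration.
def build_shop_list_url(shop_domain, page, page_size=24):
    """Build the shop_auction_search.do URL for one page of a shop's item list."""
    return ("https://{domain}.m.tmall.com/shop/shop_auction_search.do"
            "?ajson=1&_tm_source=tmallsearch&spm=a320p.7692171.0.0"
            "&sort=d&p={page}&page_size={page_size}&from=h5").format(
                domain=shop_domain, page=page, page_size=page_size)

# Usage sketch: getlist_url = build_shop_list_url("yiqianny", page)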
def get_data(tag_id):
    # Fetch the tag's metadata from the DMP API, then request the audience
    # coverage for every combination of the tag's options and save each record.
    csrfid = str(get_csrfId())
    tag_data = dict()
    url = "http://dmp.taobao.com/api/tag/{tag_id}?csrfId={csrfid}&t={time}977" \
        .format(tag_id=tag_id, csrfid=csrfid, time=int(time.time()))
    proxy = dict()
    proxy['http'] = Utils.GetMyAgent()  # requests expects lowercase scheme keys in the proxies dict
    # allow_redirects is used to deal with URI redirection
    response = requests.get(url=url,
                            proxies=proxy,
                            verify=False,
                            headers=HEADER,
                            cookies=cookie_dict,
                            allow_redirects=False).text
    data = json.loads(response)
    # tag description
    tag_data['dmp_msg'] = str(data["data"]["tag"]["tagDesc"].split(",")[0])[9:-1]
    # tag quality score
    tag_data['qualityScore'] = str(data["data"]["tag"]["qualityScore"])
    # tag name
    tag_data['tag_name'] = str(data["data"]["tag"]["tagName"])
    # tag options
    options = dict()
    optionValue = data["data"]["tag"]["options"]
    GroupId = data["data"]["tag"]["options"][0]['optionGroupId']
    optionValue_id = dict()
    global DATA
    # first store each option's value and name as a pair in a list
    options = list()
    for value in optionValue:
        options_list = list()
        options_list.append(value["optionValue"])
        options_list.append(value["optionName"])
        options.append(options_list)
        # map each option value to its id for later use
        optionValue_id[value['optionValue']] = value['id']
    # request coverage data for every combination of the options
    for combin in range(1, len(options) + 1):
        for option in combinations(options, combin):
            optionNameMapStr = dict()
            keys_list = list()
            for key in option:
                tag_key = key[0]
                keys_list.append(tag_key)
                optionNameMapStr[str(tag_key)] = key[1]
            url_data = "http://dmp.taobao.com/api/analysis/coverage"
            post_data = {
                "csrfId": csrfid,
                "user": 1,
                "options[0].operatorType": 1,
                "options[0].optionGroupId": GroupId,
                "options[0].source": "detail",
                "options[0].tagId": tag_id,
                "options[0].value": keys_list,
                "options[0].optionNameMapStr": optionNameMapStr,
            }
            # post_data2 and post_data3 are captured sample payloads kept for reference; they are not sent
            post_data2 = {
                'options[0].tagId': 110063,
                'options[0].optionGroupId': 111,
                'options[2].optionNameMapStr': "{2: '本科生'}",
                'options[2].value': [2],
                'options[1].source': 'all',
                'options[2].source': 'all',
                'options[0].value': '1~999999999',
                'options[1].optionNameMapStr': "{'36236493': '依俊服饰",
                'options[1].value': 36236493,
                'options[0].source': 'all',
                'options[1].optionGroupId': 304,
                'options[1].tagId': 110063,
                'csrfId': '3aed9369a6af50b4d2b202',
                'user': 1,
                'options[2].optionGroupId': 12164,
                'options[0].operatorType': 1,
                'options[2].operatorType': 1,
                'options[1].operatorType': 1,
                'options[2].tagId': 113736
            }
            post_data3 = {
                'options[3].optionGroupId': 12164,
                'options[0].value': '1~999999999',
                'options[2].optionNameMapStr': "{7: '小学'}",
                'options[3].optionNameMapStr': "{6: u'初中'}",
                'options[0].tagId': 110063,
                'options[4].source': 'all',
                'options[2].value': [7],
                'csrfId': '3aed9369a6af50b4d2b202',
                'options[4].optionNameMapStr': "{2:'本科生'}",
                'options[4].value': [2],
                'options[3].operatorType': 1,
                'options[1].tagId': 110063,
                'options[3].tagId': 113736,
                'options[0].operatorType': 1,
                'options[1].operatorType': 1,
                'options[4].operatorType': 1,
                'options[2].operatorType': 1,
                'options[1].optionGroupId': 304,
                'options[2].source': 'all',
                'options[1].value': 36236493,
                'options[4].tagId': 113736,
                'user': 1,
                'options[2].optionGroupId': 12164,
                'options[3].source': 'all',
                'options[1].source': 'all',
                'options[0].optionGroupId': 111,
                'options[0].source': 'all',
                'options[4].optionGroupId': 12164,
                'options[1].optionNameMapStr': "{'36236493': '\\xe4\\xbe\\x9d\\xe4\\xbf\\x8a\\xe6\\x9c\\x8d\\xe9\\xa5\\xb0'}",
                'options[2].tagId': 113736,
                'options[3].value': [6]
            }
            response = requests.post(url_data,
                                     proxies=proxy,
                                     verify=False,
                                     headers=HEADER,
                                     cookies=cookie_dict,
                                     allow_redirects=False,
                                     data=post_data).text
            DATA['count'] = str(json.loads(response)['data']['coverage'])
            # tag description (this value is overwritten by the tag name on the
            # next line; the first assignment was probably meant to use a different key)
            DATA['option_name'] = str(tag_data['dmp_msg'])
            # tag name
            DATA['option_name'] = str(tag_data['tag_name'])
            # option names:
            count_name = str()
            for name in optionNameMapStr.values():
                count_name = count_name + name + "_"
            DATA['count_name'] = count_name[:-1]
            print "爬取{count_name}成功".format(count_name=count_name[:-1])
            # tag id:
            a = str()
            for key in keys_list:
                a = a + "_" + str(optionValue_id[key])
            DATA['id'] = str(tag_id) + a
            DATA['crawl_time'] = int(time.time())
            # save the record to local MongoDB to inspect the results
            my_set.save(DATA)
            # json_data = json.dumps(DATA, ensure_ascii=False)
            # logging.info(json_data)
            TATAL_DATA.append(DATA)
            DATA = {}
    # tag_data['options'] = options
    ############# re-send requests to fetch the data ###################
    '''