def crawlMonthSales(self, nid, agentip):
    # Fetch the monthly sales count for one item from the mdskip initItemDetail endpoint.
    try:
        month_Sales = ""
        nid_url = "https://mdskip.taobao.com/core/initItemDetail.htm?itemId={nid}"
        refer_url = "https://detail.taobao.com/item.htm?id={nid}"
        nid_Url = nid_url.format(nid=nid)
        nid_refer = refer_url.format(nid=nid)
        cookies = "ab=12; UM_distinctid=15e7a46caf311b-0188d989dac01e-5c153d17-144000-15e7a46caf4552; thw=cn; ali_apache_id=11.131.226.119.1505353641652.239211.1; miid=2033888855982094870; l=AllZcEkSLTy0io2vJcWc-ksY6U4zk02Y; _cc_=WqG3DMC9EA%3D%3D; tg=0; _uab_collina=150780747345339957932139; cna=dNU/EvGIRjsCAQ4XY4PdDkHN; _tb_token_=3d73497b6b4b1; ali_ab=14.23.99.131.1510570522194.8; hng=CN%7Czh-CN%7CCNY%7C156; mt=ci=0_0; _m_h5_tk=c690a92415e1684e37a0d852f95c4237_1511139636041; _m_h5_tk_enc=03e0735d1910593631f521e6615c4e4b; x=124660357; uc3=sg2=AVMH%2FcTVYAeWJwo98UZ6Ld9wxpMCVcQb0e1XXZrd%2BhE%3D&nk2=&id2=&lg2=; uss=VAmowkFljKPmUhfhc%2B1GBuXNJWn9cLMEX%2FtIkJ5j0tQgoNppvUlaKrn3; tracknick=; sn=%E4%BE%9D%E4%BF%8A%E6%9C%8D%E9%A5%B0%3A%E8%BF%90%E8%90%A5; skt=53a079a2a620057d; v=0; cookie2=17f5415096176ca88c03d1fed693a1d4; unb=2077259956; t=1630b104e4d32df897451d6c96642469; uc1=cookie14=UoTdev2%2BYyNASg%3D%3D&lng=zh_CN; _umdata=85957DF9A4B3B3E8F872A3094256432F0F1549AE1C92C6CCF1E68B982581686F23BFC13A60CCABD1CD43AD3E795C914C5B383FEA6B5C410F78EAF10A11987746; isg=Au_vsoMX6XTuPe7jEO7aMMjafgM5PEijMRuJ0QF8i95lUA9SCWTTBu2ApHYV"
        # cookies = "_tb_token_=f3fe5d65a6591;cookie2=171e5eb92d66332b1d52d9e2730fed33;t=bf64b0d40d912c08dd434661471b2c98;v=0"
        # Split on the first '=' only, so cookie values that themselves contain
        # '=' (e.g. mt=ci=0_0) are not truncated.
        cookie_dict = {item.strip().split('=', 1)[0]: item.strip().split('=', 1)[1]
                       for item in cookies.split(';')}
        header = {'ip': agentip, 'Referer': nid_refer, "cookies": cookie_dict,
                  'User-Agent': Html_Downloader.GetUserAgent()}
        ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
        if not ok:
            # The download failed outright: retry up to 5 times with a fresh proxy.
            count = 0
            while count < 5:
                sleep(2)
                agentip = Utils.GetMyAgent()
                header = {'ip': agentip, 'Referer': nid_refer, 'timeout': '5000',
                          "cookies": cookie_dict,
                          'User-Agent': Html_Downloader.GetUserAgent()}
                ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
                if ok:
                    break
                count += 1
                print "Monthly-sales fetch, retry #{count}".format(count=count)
        if ok and "sellCount\":" not in response.text:
            # The page downloaded but carries no sales figure: retry, dropping
            # the custom headers entirely on the final attempt.
            count = 0
            while count < 5:
                sleep(2)
                agentip = Utils.GetMyAgent()
                header = {'ip': agentip, 'Referer': nid_refer, 'timeout': '5000',
                          "cookies": cookie_dict,
                          'User-Agent': Html_Downloader.GetUserAgent()}
                if count == 4:
                    header = {}
                ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
                if ok and "sellCount\":" in response.text:
                    break
                count += 1
                print "sellCount missing from response; monthly-sales fetch retry #{count}".format(count=count)
        if ok and "sellCount\":" in response.text:
            month_Sales = str(re.compile("sellCount\":(.*?)(?=\"success\")")
                              .findall(response.text)[0]).replace(",", "").strip()
            print "Monthly sales: {month_Sales}".format(month_Sales=month_Sales)
        return month_Sales
    except Exception as e:
        logging.info("Monthly-sales crawl error: {m}".format(m=e))
def get_shop_item_list(session, proxy_ip, page_num, shop_id):
    # 'shop_id' was read from an enclosing scope in the original; it is now an
    # explicit parameter so the function is self-contained.
    proxies = {"http": proxy_ip, "https": proxy_ip}
    parms_pager = "{{\"shopId\":\"{shop_id}\",\"currentPage\":{page_num},\"pageSize\":\"30\",\"sort\":\"hotsell\",\"q\":\"\"}}"
    parms_url = "https://api.m.taobao.com/h5/com.taobao.search.api.getshopitemlist/2.0/?appKey=12574478&t={stmp}&sign={sign}&api=com.taobao.search.api.getShopItemList&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp12&data={pager}"
    referer = "https://shop{shop_id}.m.taobao.com/?shop_id={shop_id}&sort=d".format(shop_id=shop_id)
    # print(referer)
    stmp = "%s739" % (long(time.time()))
    pager = parms_pager.format(shop_id=shop_id, page_num=page_num)
    cookie_dict = session.cookies.get_dict('.taobao.com')
    if cookie_dict and '_m_h5_tk' in cookie_dict:
        # Sign the request with the token half of the _m_h5_tk cookie.
        h5_tk = cookie_dict['_m_h5_tk']
        token = re.compile('(.*)(?=_)').findall(h5_tk)[0]
        value = '%s&%s&12574478&%s' % (token, stmp, pager)
        sign = execute_javascript(value)
    else:
        sign = "a013c868718eddb116eac3da0aa7974a"
    url = parms_url.format(pager=pager, stmp=stmp, sign=sign)
    # print(url)
    requests_parms = {}
    headers = {'Referer': referer, 'Host': 'api.m.taobao.com',
               'Cache-Control': 'no-cache', 'Pragma': 'no-cache',
               'User-Agent': Html_Downloader.GetUserAgent()}
    if proxy_ip:
        requests_parms['proxies'] = proxies
        requests_parms['verify'] = False
    result = session.get(url, headers=headers, **requests_parms)
    if result.ok:
        return result.content
    else:
        return None
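# The sign for this h5 mtop endpoint is computed by execute_javascript() from
# the string "token&timestamp&appKey&data". Historically that JavaScript just
# returns an MD5 hex digest of the joined string; below is a minimal
# pure-Python sketch under that assumption (mtop_sign is our name, and the
# equivalence should be verified against the site's actual h5 JS before
# relying on it):
import hashlib

def mtop_sign(token, stmp, app_key, data):
    # Join the four fields exactly as the caller does, then MD5 the result.
    value = '%s&%s&%s&%s' % (token, stmp, app_key, data)
    return hashlib.md5(value).hexdigest()

# Usage sketch: sign = mtop_sign(token, stmp, "12574478", pager)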
def crawlMonthSales(self, nid, agentip):
    try:
        month_Sales = ""
        nid_url = "https://mdskip.taobao.com/core/initItemDetail.htm?itemId={nid}"
        refer_url = "https://detail.taobao.com/item.htm?id={nid}"
        nid_Url = nid_url.format(nid=nid)
        nid_refer = refer_url.format(nid=nid)
        cookies = "ab=56; UM_distinctid=15e7a46caf311b-0188d989dac01e-5c153d17-144000-15e7a46caf4552; thw=cn; ali_apache_id=11.131.226.119.1505353641652.239211.1; miid=2033888855982094870; l=AllZcEkSLTy0io2vJcWc-ksY6U4zk02Y; uc2=wuf=https%3A%2F%2Ftrade.tmall.com%2Fdetail%2ForderDetail.htm%3Fbiz_order_id%3D70514222507416230%26forward_action%3D; _cc_=WqG3DMC9EA%3D%3D; tg=0; _uab_collina=150780747345339957932139; hng=CN%7Czh-CN%7CCNY%7C156; mt=ci=0_0; _tb_token_=3e0501668eb3b; x=124660357; uc3=sg2=AVMH%2FcTVYAeWJwo98UZ6Ld9wxpMCVcQb0e1XXZrd%2BhE%3D&nk2=&id2=&lg2=; uss=VW9L9wvPPdgBBh%2BJHeH%2BVW8D%2FgmRg%2B6YCnShUPaOH0CFHrL4%2FVpP4v7d; tracknick=; sn=%E4%BE%9D%E4%BF%8A%E6%9C%8D%E9%A5%B0%3A%E8%BF%90%E8%90%A5; skt=efe1ec1051eec814; v=0; cookie2=1ce9fff7464537de3d45fe012006d49d; unb=2077259956; t=1630b104e4d32df897451d6c96642469; _m_h5_tk=37be146862abddcfc955f9ec15ebb25d_1508307778971; _m_h5_tk_enc=7ab9ef3ea063dd2c4cd6d33cf84ea2a4; cna=dNU/EvGIRjsCAQ4XY4PdDkHN; uc1=cookie14=UoTcBzysjIcUbw%3D%3D&lng=zh_CN; isg=Amxsuy9SGdk0Xg26l9-JufebPUpejRva_jrq6MateJe60Qzb7jXgX2Ljh68S; _umdata=85957DF9A4B3B3E8F872A3094256432F0F1549AE1C92C6CCF1E68B982581686F23BFC13A60CCABD1CD43AD3E795C914C9A2685321202E656A2C4B44241C24328"
        # cookies = "_tb_token_=f3fe5d65a6591;cookie2=171e5eb92d66332b1d52d9e2730fed33;t=bf64b0d40d912c08dd434661471b2c98;v=0"
        # Split on the first '=' only so cookie values containing '=' survive.
        cookie_dict = {item.strip().split('=', 1)[0]: item.strip().split('=', 1)[1]
                       for item in cookies.split(';')}
        header = {'ip': agentip, 'Referer': nid_refer, "cookies": cookie_dict,
                  'User-Agent': Html_Downloader.GetUserAgent()}
        ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
        if not ok:
            count = 0
            while count < 11:
                sleep(2)
                agentip = Utils.GetMyAgent()
                header = {'ip': agentip}
                ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
                if ok:
                    break
                count += 1
                print "Monthly-sales fetch, retry #{count}".format(count=count)
        if ok:
            matches = re.compile("sellCount\":(.*?)(?=\"success\")").findall(response.text)
            if matches:
                month_Sales = str(matches[0]).replace(",", "").strip()
                print "Monthly sales: {month_Sales}".format(month_Sales=month_Sales)
            else:
                # No sales figure in this response: retry recursively and
                # return that result.
                return self.crawlMonthSales(nid, agentip)
        return month_Sales
    except Exception as e:
        logging.info("Monthly-sales crawl error: {m}".format(m=e))
def get_total_sales(self, session, agentipjj, page_num, shop_id):
    try:
        count = 0
        while count < 20:
            print("agentipjj:" + agentipjj)
            proxies = {"http": agentipjj, "https": agentipjj}
            parms_pager = "{{\"shopId\":\"{shop_id}\",\"currentPage\":{page_num},\"pageSize\":\"30\",\"sort\":\"hotsell\",\"q\":\"\"}}"
            parms_url = "https://unzbmix25g.api.m.taobao.com/h5/com.taobao.search.api.getshopitemlist/2.0/?appKey=12574478&t={stmp}&sign={sign}&api=com.taobao.search.api.getShopItemList&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp12&data={pager}"
            referer = "https://shop{shop_id}.m.taobao.com/?shop_id={shop_id}&sort=d".format(shop_id=shop_id)
            stmp = "%s739" % (long(time.time()))
            pager = parms_pager.format(shop_id=shop_id, page_num=page_num)
            cookie_dict = session.cookies.get_dict('.taobao.com')
            if cookie_dict and '_m_h5_tk' in cookie_dict:
                h5_tk = cookie_dict['_m_h5_tk']
                token = re.compile('(.*)(?=_)').findall(h5_tk)[0]
                value = '%s&%s&12574478&%s' % (token, stmp, pager)
                sign = self.execute_javascript(value)
            else:
                sign = "a013c868718eddb116eac3da0aa7974a"
            url = parms_url.format(pager=pager, stmp=stmp, sign=sign)
            requests_parms = {}
            headers = {'Referer': referer, 'Host': 'api.m.taobao.com',
                       'Cache-Control': 'no-cache', 'Pragma': 'no-cache',
                       'timeout': '5000',
                       'User-Agent': Html_Downloader.GetUserAgent()}
            if agentipjj:
                requests_parms['proxies'] = proxies
                requests_parms['verify'] = False
            try:
                result = session.get(url, headers=headers, **requests_parms)
            except Exception:
                # The proxy died mid-request: swap it out and count the attempt,
                # so a permanently bad proxy cannot loop forever.
                agentipjj = Utils.GetMyAgent()
                count += 1
                continue
            count += 1
            if result.status_code != 200:
                logging.info("Proxy returned status {log_code}".format(log_code=result.status_code))
                agentipjj = Utils.GetMyAgent()
                sleep(2)
            else:
                print(result.status_code)
                if result.ok:
                    sleep(2)
                    return result.content
        # All 20 attempts exhausted; the caller receives None.
    except Exception as e:
        logging.info("Error crawling totalSoldQuantity: {m}".format(m=e))
        print("Error crawling totalSoldQuantity: {e}".format(e=e))
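# The endpoint above returns JSONP wrapped as mtopjsonp12({...}), so callers
# still have to strip the callback before parsing. A small hedged helper
# (unwrap_jsonp is our name, not part of the original module):
import json
import re

def unwrap_jsonp(body):
    # Keep only the payload between the callback's '(' and the final ')'.
    match = re.search(r'^\s*\w+\((.*)\)\s*;?\s*$', body, re.S)
    return json.loads(match.group(1)) if match else None

# Usage sketch:
# content = self.get_total_sales(session, agentipjj, 1, shop_id)
# data = unwrap_jsonp(content) if content else None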
def crawl_yxl(self, auctionId, agentIp):
    # Fetch the monthly sales figure from the mobile Tmall detail page;
    # returns -1 if it cannot be extracted within 20 attempts.
    yxl = -1
    count = 0
    pattern = re.compile("sellCount\":(.*?)(?=showShopActivitySize)")
    while count < 20:
        agentIp = Utils.GetMyAgent()
        userAgent = Html_Downloader.GetUserAgent()
        header = {'ip': agentIp, 'user-agent': userAgent}
        text_detail_url = "https://detail.m.tmall.com/item.htm?spm=a320p.7692363.0.0&id={auctionId}".format(auctionId=auctionId)
        ok, response = Html_Downloader.Download_Html(text_detail_url, {}, header)
        if ok:
            matchs = pattern.findall(response.text)
            if len(matchs) > 0:
                yxl = matchs[0].encode('utf-8').replace(",\"", "")
                break
        sleep(3)
        count += 1
    return yxl
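# The retry-with-fresh-proxy loop above reappears in nearly every crawler in
# this module. A hedged module-level sketch of a shared helper that could
# replace those loops (download_with_retries is our name; it assumes
# Html_Downloader and Utils behave as they do elsewhere in this file):
from time import sleep

def download_with_retries(url, build_header, attempts=5, wait=2, predicate=None):
    # build_header(agent_ip) -> header dict for one attempt;
    # predicate(response) -> bool decides whether a downloaded page is usable.
    ok, response = False, None
    for _ in range(attempts):
        agent_ip = Utils.GetMyAgent()
        ok, response = Html_Downloader.Download_Html(url, {}, build_header(agent_ip))
        if ok and (predicate is None or predicate(response)):
            return ok, response
        sleep(wait)
    return ok, response

# Usage sketch:
# ok, resp = download_with_retries(
#     nid_Url,
#     lambda ip: {'ip': ip, 'User-Agent': Html_Downloader.GetUserAgent()},
#     predicate=lambda r: 'sellCount"' in r.text)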
def crawl_shop_all_item(self, url):
    agentIp = Utils.GetMyAgent()
    shop_id = self.shop_id
    shop_name = self.shop_name
    userAgent = Html_Downloader.GetUserAgent()
    header = {'ip': agentIp, 'user-agent': userAgent}
    text_detail_url = url
    ok, response = Html_Downloader.Download_Html(text_detail_url, {}, header)
    if ok:
        jsonArray = json.loads(response.content)  # parse the JSON response
        total_page = jsonArray.get("total_page")
        total_results = jsonArray.get("total_results")
        page_size = jsonArray.get("page_size")
        jsonResult = jsonArray.get("items")
        for item in jsonResult:
            shop_item = {}
            item_id = str(item.get("item_id")).strip()
            shop_item['item_id'] = item_id
            shop_item['title'] = item.get('title').encode('utf-8')
            shop_item['picUrl'] = "http:" + item.get('img')
            # current sale price
            shop_item['salePrice'] = item.get('price')
            shop_item['totalSoldQuantity'] = item.get('totalSoldQuantity')
            shop_item['crawl_url'] = item.get('url')
            shop_item['crawl_time'] = long(time.time())
            # API for item variants (color categories); no longer needed, since
            # the SKU API below already returns color categories and the rest.
            '''
            test_Url = "http://d.da-mai.com/index.php?r=itemApi/getItemInfoByItemId&item_id=" + item_id
            ok, response = Html_Downloader.Download_Html(test_Url, {}, header)
            if ok:
                jsonItems = json.loads(response.content)  # parse the JSON response
            '''
            # API for detailed SKU information
            shop_item['quantity'] = 0
            getSKU_Url = "http://yj.da-mai.com/index.php?r=itemskus/getSkus&fields=*&num_iids={item_id}".format(item_id=item_id)
            ok, response = Html_Downloader.Download_Html(getSKU_Url, {}, header)
            if ok:
                jsonItems = json.loads(response.content)
                total_data = jsonItems.get("data")
                for sku in total_data:
                    quantity = sku.get("quantity")
                    shop_item['quantity'] += quantity
            # fetch the item detail page (second-screen) information
            getDetail_Url = "http://d.da-mai.com/index.php?r=itemApi/getItemInfoByItemId&item_id={item_id}".format(item_id=item_id)
            ok, response_detail = Html_Downloader.Download_Html(getDetail_Url, {}, header)
            if ok:
                shop_item['attribute'] = []
                jsonDetails = json.loads(response_detail.content)
                properties = jsonDetails['data']['data']['properties']
                stringName = ""
                for attri in properties:
                    name = attri.get('name')
                    value = attri.get('value')
                    if name in stringName:
                        # repeated property name: append only the value
                        shop_item['attribute'].append("{value} ".format(value=value))
                    else:
                        shop_item['attribute'].append("{name}:{value}&&||".format(name=name, value=value))
                        stringName = name + stringName
        # rebuild the paged JSON URL for every page and crawl each one
        ### NOTE: the shop URL is hardcoded here; it should be passed in as a parameter!
        for page in range(1, int(total_page) + 1):
            getlist_url = "https://yiqianny.m.tmall.com/shop/shop_auction_search.do?ajson=1&_tm_source=tmallsearch&" \
                          "spm=a320p.7692171.0.0&sort=d&p={page}&page_size=24&from=h5".format(page=page)
            # Pass the callable and its argument separately so each page is
            # crawled in its own worker process.
            p = multiprocessing.Process(target=self.crawl_shop_all_item,
                                        args=(getlist_url,))
            p.start()
            logging.info("Starting a worker process for JSON list page: {url}".format(url=getlist_url))
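# The NOTE above flags the hardcoded shop host. A hedged sketch of deriving
# the paged list URL from a shop's own URL instead, mirroring the pattern the
# next variant of crawl_shop_all_item uses (build_page_url is our name):
def build_page_url(shop_url, page, page_size=24):
    # The mobile JSON endpoint lives on the .m.tmall.com host.
    base = shop_url.replace(".tmall.com", ".m.tmall.com")
    return ("{base}shop/shop_auction_search.do?ajson=1&_tm_source=tmallsearch"
            "&spm=a320p.7692171.0.0&sort=d&p={page}&page_size={page_size}&from=h5"
            ).format(base=base, page=page, page_size=page_size)

# Usage sketch: getlist_url = build_page_url(self.shop_url, page)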
def crawl_shop_all_item(self):
    agentIp = Utils.GetMyAgent()
    shop_id = self.shop_id
    shop_name = self.shop_name
    userAgent = Html_Downloader.GetUserAgent()
    header = {'ip': agentIp, 'user-agent': userAgent}
    # Probe with page_size=1 first, just to learn the paging totals.
    test_detail_url = "{shop_url}shop/shop_auction_search.do?ajson=1&_tm_source=tmallsearch&spm=a320p.7692171.0.0&sort" \
                      "=d&p={page}&page_size={page_size}&from=h5".format(shop_url=self.shop_url, page_size=1, page=1)
    test_detail_url = test_detail_url.replace(".tmall.com", ".m.tmall.com")
    try:
        ok, response = Html_Downloader.Download_Html(test_detail_url, {}, header)
        if not ok:
            count = 0
            while count < 4:
                sleep(2)
                agentip = Utils.GetMyAgent()
                header = {'ip': agentip}
                ok, response = Html_Downloader.Download_Html(test_detail_url, {}, header)
                if ok:
                    break
                count += 1
                if count == 3:
                    # last attempt: drop the custom headers entirely
                    header = {}
        if ok:
            jsonArray = json.loads(response.content)  # parse the JSON response
            total_page = jsonArray.get("total_page")
            total_results = jsonArray.get("total_results")
            page_size = jsonArray.get("page_size")
            logging.info("shopname:" + shop_name + " total_page:" + total_page +
                         " total_results:" + total_results + " page_size:" + page_size)
            print "total_page:" + total_page + " total_results:" + total_results + " page_size:" + page_size
            for i in range(int(total_page)):
                print i + 1
                test_detail_url = "{shop_url}shop/shop_auction_search.do?ajson=1&_tm_source=tmallsearch&spm=a320p.7692171.0.0&sort=d&p={page}&page_size={page_size}&from=h5".format(
                    shop_url=self.shop_url, page_size=page_size, page=i + 1)
                test_detail_url = test_detail_url.replace(".tmall.com", ".m.tmall.com")
                '''
                Disabled: an earlier last-page strategy that retried until the
                number of "price" occurrences matched the expected remaining
                item count.
                if int(total_page) == (i + 1):
                    lastCount = int(total_results) - i * int(page_size)
                    ok, response = Html_Downloader.Download_Html(test_detail_url, {}, header)
                    if not ok:
                        count = 0
                        while count < 11:
                            sleep(2)
                            agentip = Utils.GetMyAgent()
                            header = {'ip': agentip}
                            ok, response = Html_Downloader.Download_Html(test_detail_url, {}, header)
                            if ok and "price" in response.text and lastCount - response.text.count("price") < 2:
                                break
                            count += 1
                            if count == 10:
                                header = {}
                    print response.text.count('price')
                    if ok and "price" not in response.text:
                        print "111"
                        count = 0
                        while count < 11:
                            sleep(2)
                            agentip = Utils.GetMyAgent()
                            header = {'ip': agentip}
                            ok, response = Html_Downloader.Download_Html(test_detail_url, {}, header)
                            if ok and "price" in response.text and lastCount - response.text.count("price") < 2:
                                break
                            count += 1
                            if count == 10:
                                header = {}
                    if ok and lastCount - response.text.count("price") > 2:
                        while count < 11:
                            sleep(2)
                            agentip = Utils.GetMyAgent()
                            header = {'ip': agentip}
                            ok, response = Html_Downloader.Download_Html(test_detail_url, {}, header)
                            if ok and "price" in response.text and lastCount - response.text.count("price") < 2:
                                break
                            count += 1
                            if count == 10:
                                header = {}
                    if ok and lastCount - response.text.count("price") < 2:
                        logging.info("Got the price payload; parsing")
                        self.parse_items(response.content, shop_id, agentIp, shop_name, userAgent)
                else:
                '''
                ok, response = Html_Downloader.Download_Html(test_detail_url, {}, header)
                if not ok:
                    count = 0
                    while count < 11:
                        sleep(2)
                        agentip = Utils.GetMyAgent()
                        header = {'ip': agentip}
                        ok, response = Html_Downloader.Download_Html(test_detail_url, {}, header)
                        if ok:
                            break
                        count += 1
                        if count == 10:
                            header = {}
                if ok:
                    # logging.info("Got the price payload; parsing")
                    self.parse_items(response.content, shop_id, agentIp, shop_name, userAgent)
    except Exception as e:
        logging.error("Failed to crawl shop {shop_name} (shop_id: {shop_id}): {m}".format(
            shop_name=shop_name, shop_id=shop_id, m=e))
        crawl_content = "Error while crawling the listing page"
        message = str(e)
        start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        insertLog(crawl_content, message, shop_id, agentIp, test_detail_url,
                  start_time, shop_name)
def crawlMonthSales(self, nid, agentip):
    try:
        month_Sales = ""
        nid_url = "https://mdskip.taobao.com/core/initItemDetail.htm?itemId={nid}"
        refer_url = "https://detail.taobao.com/item.htm?id={nid}"
        nid_Url = nid_url.format(nid=nid)
        nid_refer = refer_url.format(nid=nid)
        cookies = "x=__ll%3D-1%26_ato%3D0; l=AhERSU92PmRba9QUgSCkQMF6oRaqOoXt; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; cna=dNU/EvGIRjsCAQ4XY4PdDkHN; _m_h5_tk=7d8d6e65e5c676a6d0a69c26f7436ea1_1510363282671; _m_h5_tk_enc=e32129060738b7ce01e9114c9bec037f; sm4=440100; hng=CN%7Czh-CN%7CCNY%7C156; uc1=cookie14=UoTde95xncLyFQ%3D%3D&lng=zh_CN; uc3=sg2=Vq0THzNyGHIH22DuvMx9ZEwXL5qc2kn7REWHdois6v0%3D&nk2=&id2=&lg2=; uss=AQDPJiEXAu47o41b5k%2BKpKRT3Ckpz9nqnJX2F%2F7kZG6ttuI82ZnQa7ZL; t=1630b104e4d32df897451d6c96642469; unb=2607292494; sn=sitiselected%E6%97%97%E8%88%B0%E5%BA%97%3A%E5%A4%A7%E9%BA%A6; _tb_token_=eef7bd7b7abd6; cookie2=23bb087c638814ce8a8e329ead5332d4; isg=ApqaMZmelJirXxuDoGSRqtW160B8YxWwfLxcMqQTRi34FzpRjFtutWDlkdVw"
        # cookies = "_tb_token_=f3fe5d65a6591;cookie2=171e5eb92d66332b1d52d9e2730fed33;t=bf64b0d40d912c08dd434661471b2c98;v=0"
        # Split on the first '=' only so cookie values containing '=' survive.
        cookie_dict = {item.strip().split('=', 1)[0]: item.strip().split('=', 1)[1]
                       for item in cookies.split(';')}
        header = {'ip': agentip, 'Referer': nid_refer, "cookies": cookie_dict,
                  'User-Agent': Html_Downloader.GetUserAgent()}
        ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
        if not ok:
            count = 0
            while count < 5:
                sleep(2)
                agentip = Utils.GetMyAgent()
                header = {'ip': agentip, 'Referer': nid_refer, 'timeout': '5000',
                          "cookies": cookie_dict,
                          'User-Agent': Html_Downloader.GetUserAgent()}
                ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
                if ok:
                    break
                count += 1
                print "Monthly-sales fetch, retry #{count}".format(count=count)
        if ok and "sellCount\":" not in response.text:
            # Downloaded but no sales figure: retry, dropping the custom
            # headers entirely on the final attempt.
            count = 0
            while count < 10:
                sleep(2)
                agentip = Utils.GetMyAgent()
                header = {'ip': agentip, 'Referer': nid_refer, 'timeout': '5000',
                          "cookies": cookie_dict,
                          'User-Agent': Html_Downloader.GetUserAgent()}
                if count == 9:
                    header = {}
                ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
                if ok and "sellCount\":" in response.text:
                    break
                count += 1
                print "sellCount missing from response; monthly-sales fetch retry #{count}".format(count=count)
        if ok and "sellCount\":" in response.text:
            month_Sales = str(re.compile("sellCount\":(.*?)(?=\"success\")")
                              .findall(response.text)[0]).replace(",", "").strip()
            print "Monthly sales: {month_Sales}".format(month_Sales=month_Sales)
        return month_Sales
    except Exception as e:
        logging.info("Monthly-sales crawl error: {m}".format(m=e))
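# A quick self-contained illustration of the sellCount extraction used above.
# The payload below is a made-up stand-in for the initItemDetail response,
# not captured real data; it only mimics the 'sellCount ... "success"' shape
# the regex expects.
if __name__ == '__main__':
    sample = 'quantity":30,"sellCount":1024,"success":true'
    matches = re.compile("sellCount\":(.*?)(?=\"success\")").findall(sample)
    print matches[0].replace(",", "").strip()  # prints: 1024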