def crawlMonthSales(self, nid, agentip):
    try:
        month_Sales = ""
        nid_url = "https://mdskip.taobao.com/core/initItemDetail.htm?itemId={nid}"
        refer_url = "https://detail.taobao.com/item.htm?id={nid}"
        nid_Url = nid_url.format(nid=nid)
        nid_refer = refer_url.format(nid=nid)
        cookies = "ab=56; UM_distinctid=15e7a46caf311b-0188d989dac01e-5c153d17-144000-15e7a46caf4552; thw=cn; ali_apache_id=11.131.226.119.1505353641652.239211.1; miid=2033888855982094870; l=AllZcEkSLTy0io2vJcWc-ksY6U4zk02Y; uc2=wuf=https%3A%2F%2Ftrade.tmall.com%2Fdetail%2ForderDetail.htm%3Fbiz_order_id%3D70514222507416230%26forward_action%3D; _cc_=WqG3DMC9EA%3D%3D; tg=0; _uab_collina=150780747345339957932139; hng=CN%7Czh-CN%7CCNY%7C156; mt=ci=0_0; _tb_token_=3e0501668eb3b; x=124660357; uc3=sg2=AVMH%2FcTVYAeWJwo98UZ6Ld9wxpMCVcQb0e1XXZrd%2BhE%3D&nk2=&id2=&lg2=; uss=VW9L9wvPPdgBBh%2BJHeH%2BVW8D%2FgmRg%2B6YCnShUPaOH0CFHrL4%2FVpP4v7d; tracknick=; sn=%E4%BE%9D%E4%BF%8A%E6%9C%8D%E9%A5%B0%3A%E8%BF%90%E8%90%A5; skt=efe1ec1051eec814; v=0; cookie2=1ce9fff7464537de3d45fe012006d49d; unb=2077259956; t=1630b104e4d32df897451d6c96642469; _m_h5_tk=37be146862abddcfc955f9ec15ebb25d_1508307778971; _m_h5_tk_enc=7ab9ef3ea063dd2c4cd6d33cf84ea2a4; cna=dNU/EvGIRjsCAQ4XY4PdDkHN; uc1=cookie14=UoTcBzysjIcUbw%3D%3D&lng=zh_CN; isg=Amxsuy9SGdk0Xg26l9-JufebPUpejRva_jrq6MateJe60Qzb7jXgX2Ljh68S; _umdata=85957DF9A4B3B3E8F872A3094256432F0F1549AE1C92C6CCF1E68B982581686F23BFC13A60CCABD1CD43AD3E795C914C9A2685321202E656A2C4B44241C24328"
        # cookies = "_tb_token_=f3fe5d65a6591;cookie2=171e5eb92d66332b1d52d9e2730fed33;t=bf64b0d40d912c08dd434661471b2c98;v=0"
        # split on the first '=' only: cookie values such as "mt=ci=0_0" contain '='
        cookie_dict = {item.split('=', 1)[0].strip(): item.split('=', 1)[1]
                       for item in cookies.split(';')}
        header = {'ip': agentip,
                  'Referer': nid_refer,
                  "cookies": cookie_dict,
                  'User-Agent': Html_Downloader.GetUserAgent()}
        ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
        if not ok:
            count = 0
            while count < 11:
                sleep(2)
                agentip = Utils.GetMyAgent()
                header = {'ip': agentip}
                ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
                if ok:
                    break
                count += 1
                print "Monthly-sales fetch retry #{count}".format(count=count)
        if ok:
            matches = re.compile("sellCount\":(.*?)(?=\"success\")").findall(response.text)
            if not matches:
                # no sellCount in the response: retry the whole fetch
                return self.crawlMonthSales(nid, agentip)
            month_Sales = str(matches[0]).replace(",", "").strip()
            print "Monthly sales: {month_Sales}".format(month_Sales=month_Sales)
        return month_Sales
    except Exception, e:
        logging.info("Error crawling monthly sales: {m}".format(m=e.message))
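# The download-retry-with-proxy-rotation pattern above is repeated in almost
# every crawl method in this file. A minimal sketch of a shared helper,
# assuming the same Html_Downloader / Utils interfaces already used here
# (the helper itself is hypothetical, not part of the original code):
def download_with_retry(url, header, max_retries=10, wait=2):
    # try once with the caller's header, then rotate proxies on failure
    ok, response = Html_Downloader.Download_Html(url, {}, header)
    count = 0
    while not ok and count < max_retries:
        sleep(wait)
        header['ip'] = Utils.GetMyAgent()  # swap in a fresh proxy ip
        ok, response = Html_Downloader.Download_Html(url, {}, header)
        count += 1
    return ok, response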
def run(self):
    agentip = Utils.GetMyAgent()
    agentipjj = Utils.GetMyAgent()
    day = datetime.now().strftime("%Y%m%d")
    search_url = "https://s.taobao.com/search?q={q}&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_{day}&ie=utf8&bcoffset=0&ntoffset=1&p4ppushleft=%2C44&sort=sale-desc&s={s}"
    page_Url = search_url.format(q=self.key_word, day=day, s=0)
    header = {'ip': agentip}
    total = 3  # crawl at most 3 result pages
    totalpage = self.crawlTotalpage(page_Url, header)
    total = totalpage if totalpage < total else total
    total = total + 1
    for i in range(1, total):
        t_url = search_url.format(q=self.key_word, day=day, s=(i - 1) * 44)
        try:
            ok, response = Html_Downloader.Download_Html(t_url, {}, header)
            if not ok:
                count = 0
                while count < 4:
                    sleep(2)
                    agentip = Utils.GetMyAgent()
                    header = {'ip': agentip}
                    ok, response = Html_Downloader.Download_Html(t_url, {}, header)
                    if ok:
                        break
                    count += 1
                if count == 3:
                    header = {}
            if ok:
                html = etree.HTML(response.text)
                matchs = html.xpath("//script[contains(.,'g_page_config')]")
                if len(matchs) > 0:
                    data = re.compile("g_page_config=(.*)?;g_srp_loadCss").match(
                        matchs[0].text.replace("\n\n", "\n").replace("\n", "").replace(" ", ""))
                    if data.lastindex > 0:
                        data = json.loads(data.group(1).encode('utf-8'))
                        if data.has_key('mods'):
                            self.crawlNid(data, i, agentip, agentipjj)
                    else:
                        print("No valid json matched")
                else:
                    print("Item list not found in the page")
            else:
                logging.info("Failed to fetch page {i} for keyword {p}".format(
                    p=self.key_word, i=i))
        except Exception, e:
            logging.info("Error on page {i} for keyword {p}: {m}".format(
                p=self.key_word, i=i, m=e.message))
def testurl(self, url, agentIp):
    header = {'ip': agentIp}
    for i in range(1, 200):
        sleep(1)
        ok, response = Html_Downloader.Download_Html(url, {}, header)
        print(ok)
        if not ok:
            # retry once without the proxy header
            ok, response = Html_Downloader.Download_Html(url, {}, {})
            print(url)
        if ok:
            html = etree.HTML(response.text)
def insertLog(crawl_content, message, shop_id, agentIp, shop_url, start_time, shop_name):
    end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    getData = ("&shop_id=" + shop_id + "&ip_addr=" + agentIp + "&shop_name=" + shop_name +
               "&crawl_url=" + shop_url + "&start_time=" + start_time + "&end_time=" + end_time +
               "&crawl_content=" + crawl_content + "&error_info=" + message)
    log = "http://192.168.10.198:8080/pdd/CrawlerLogController/SaveCrawlerLog?"
    logZS = "http://syjcapi.da-mai.com/CrawlerLogController/SaveCrawlerLog?"
    logUU = log + getData
    logUUZS = logZS + getData
    ok, result = Html_Downloader.Download_Html(logUU, {}, {})
    if ok:
        result_json = json.loads(result.content)
        # result_ok = bool(result_json['status'])
    ok, result = Html_Downloader.Download_Html(logUUZS, {}, {})
    if ok:
        result_json = json.loads(result.content)
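# insertLog builds its query string by raw concatenation, so any '&', '=',
# or non-ASCII character in shop_name or the error message will corrupt the
# request. A minimal sketch of a safer variant using Python 2's urllib; the
# helper name and usage are hypothetical, the endpoint is the one above:
import urllib

def build_log_url(base_url, **params):
    # urlencode percent-escapes every value, including Chinese text
    return base_url + urllib.urlencode(
        {k: v.encode('utf-8') if isinstance(v, unicode) else v
         for k, v in params.items()})

# usage (hypothetical values):
# logUU = build_log_url("http://192.168.10.198:8080/pdd/CrawlerLogController/SaveCrawlerLog?",
#                       shop_id=shop_id, ip_addr=agentIp, error_info=message)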
def crawl_shop_all_item(self):
    agentIp = Utils.GetAgentIp()
    header = {'ip': agentIp}
    shop_id = -1
    # agentIp = None
    # agentIp = '120.24.171.107:16816'
    url = "{shop_url}/search.htm?&search=y&orderType=hotsell_desc&scene=taobao_shop".format(
        shop_url=self.shop_url)
    url = self.shop_url
    print url
    # data = urllib2.urlopen(url).readlines()
    # soup = BeautifulSoup(''.join(data), fromEncoding='utf8')
    # primary_consumer = soup.find(id="bd")
    ok, response = Html_Downloader.Download_Html(url, {}, header)
    if ok:
        # use a dedicated variable so the request header dict is not clobbered
        soup = BeautifulSoup(response.text, fromEncoding='utf8')
        global_nav = soup.find(id="J_GlobalNav")
        html = etree.HTML(response.text.encode('utf-8'))
        if html is not None and html.xpath("//header[@id='mp-header']"):
            if "shopId" in html.xpath("//header[@id='mp-header']")[0].get("mdv-cfg").split(':')[0]:
                shop_id = html.xpath("//header[@id='mp-header']")[0].get("mdv-cfg").split(':')[1]
                shop_id = shop_id.replace("\'}", "").replace("\'", "")
        url = ("{shop_url}/shop/shop_auction_search.do?sort=d&p=1&page_size=90&from=h5"
               "&shop_id={shop_id}&ajson=1&_tm_source=tmallsearch&orderType=hotsell_desc").format(
                   shop_url=self.shop_url, shop_id=shop_id)
        print(url)
        # (a PhantomDriver-based variant that rendered self.shop_url and the
        #  list url in a headless browser was left commented out here)
        ok, response = Html_Downloader.Download_Html(url, {}, header)
        print(ok)
        if not ok:
            ok, response = Html_Downloader.Download_Html(url, {}, {})
            print(url)
        if ok:
            # the ajson=1 endpoint returns JSON, not HTML
            data = json.loads(response.content)
def get_shop_item_list(session, proxy_ip, shop_id, page_num):
    proxies = {"http": proxy_ip, "https": proxy_ip}
    parms_pager = "{{\"shopId\":\"{shop_id}\",\"currentPage\":{page_num},\"pageSize\":\"30\",\"sort\":\"hotsell\",\"q\":\"\"}}"
    parms_url = "https://api.m.taobao.com/h5/com.taobao.search.api.getshopitemlist/2.0/?appKey=12574478&t={stmp}&sign={sign}&api=com.taobao.search.api.getShopItemList&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp12&data={pager}"
    params_referer = "https://shop{shop_id}.m.taobao.com/?shop_id={shop_id}&sort=d"
    stmp = "%s739" % (long(time.time()))
    referer = params_referer.format(shop_id=shop_id)
    pager = parms_pager.format(shop_id=shop_id, page_num=page_num)
    if session.cookies.get_dict('.taobao.com') and session.cookies.get_dict('.taobao.com').has_key('_m_h5_tk'):
        h5_tk = session.cookies.get_dict('.taobao.com')['_m_h5_tk']
        token = re.compile('(.*)(?=_)').findall(h5_tk)[0]
        value = '%s&%s&12574478&%s' % (token, stmp, pager)
        sign = execute_javascript(value)
    else:
        # first request: no _m_h5_tk cookie yet, this sign only primes it
        sign = "a013c868718eddb116eac3da0aa7974a"
    url = parms_url.format(pager=pager, stmp=stmp, sign=sign)
    requests_parms = {}
    headers = {'Referer': referer,
               'Host': 'api.m.taobao.com',
               'Cache-Control': 'no-cache',
               'Pragma': 'no-cache',
               'User-Agent': Html_Downloader.GetUserAgent()}
    if proxy_ip:
        requests_parms['proxies'] = proxies
        requests_parms['verify'] = False
    result = session.get(url, headers=headers, **requests_parms)
    if result.ok:
        return result.content
    return None
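# execute_javascript() above runs Taobao's own JS signing routine. For the h5
# mtop API that routine is widely understood to be a plain MD5 over
# "token&timestamp&appKey&data"; if that holds, a pure-Python sketch could
# replace the JS call. This is an assumption about the endpoint's contract,
# not a verified spec:
import hashlib

def mtop_h5_sign(token, stmp, app_key, data):
    # assumed: sign = md5("<token>&<t>&<appKey>&<data>") as lowercase hex
    value = '%s&%s&%s&%s' % (token, stmp, app_key, data)
    return hashlib.md5(value).hexdigest()

# usage: sign = mtop_h5_sign(token, stmp, "12574478", pager)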
def get_total_page(self, url_params, agent_ip, cookie_dict, start, end):
    total_page = 0
    try:
        ok, result = Html_Downloader.Download_Html(
            self.url,
            {item.split('=')[0]: item.split('=')[1] for item in url_params.split('&')},
            {"cookies": cookie_dict,
             "ip": agent_ip,
             "Referer": "https://trade.taobao.com/trade/itemlist/list_sold_items.htm?action=itemlist/SoldQueryAction&event_submit_do_query=1&auctionStatus=SUCCESS&tabCode=success"},
            post=True)
        if ok:
            order_json = json.loads(result.text.replace("\n", "").replace("\r", ""))
            if order_json.has_key('page'):
                total_page = int(order_json['page']['totalNumber'])
            else:
                print(result)
    except:
        pass
    return total_page
def process_request(self, url, data):
    result_ok = False
    ok, result = Html_Downloader.Download_Html(url, data, {'timeout': 60}, post=True)
    if ok:
        result_json = json.loads(result.content)
        result_ok = bool(result_json['flag'])
    return result_ok
def process_request(url, data):
    result_ok = False
    ok, result = Html_Downloader.Download_Html(url, data, {'timeout': 10}, post=True)
    if ok:
        result_json = json.loads(result.content)
        result_ok = bool(result_json['flag'])
        logging.info("Data saved, result: {result_ok}".format(result_ok=result_ok))
    return result_ok
def get_total_sales(self, session, agentipjj, page_num, shop_id):
    try:
        count = 0
        while count < 20:
            print("agentipjj:" + agentipjj)
            proxies = {"http": agentipjj, "https": agentipjj}
            parms_pager = "{{\"shopId\":\"{shop_id}\",\"currentPage\":{page_num},\"pageSize\":\"30\",\"sort\":\"hotsell\",\"q\":\"\"}}"
            parms_url = "https://unzbmix25g.api.m.taobao.com/h5/com.taobao.search.api.getshopitemlist/2.0/?appKey=12574478&t={stmp}&sign={sign}&api=com.taobao.search.api.getShopItemList&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp12&data={pager}"
            params_referer = "https://shop{shop_id}.m.taobao.com/?shop_id={shop_id}&sort=d".format(
                shop_id=shop_id)
            stmp = "%s739" % (long(time.time()))
            referer = params_referer
            pager = parms_pager.format(shop_id=shop_id, page_num=page_num)
            if session.cookies.get_dict('.taobao.com') and session.cookies.get_dict(
                    '.taobao.com').has_key('_m_h5_tk'):
                h5_tk = session.cookies.get_dict('.taobao.com')['_m_h5_tk']
                token = re.compile('(.*)(?=_)').findall(h5_tk)[0]
                value = '%s&%s&12574478&%s' % (token, stmp, pager)
                sign = self.execute_javascript(value)
            else:
                sign = "a013c868718eddb116eac3da0aa7974a"
            url = parms_url.format(pager=pager, stmp=stmp, sign=sign)
            requests_parms = {}
            headers = {
                'Referer': referer,
                'Host': 'api.m.taobao.com',
                'Cache-Control': 'no-cache',
                'Pragma': 'no-cache',
                'timeout': '5000',
                'User-Agent': Html_Downloader.GetUserAgent()
            }
            if agentipjj:
                requests_parms['proxies'] = proxies
                requests_parms['verify'] = False
            try:
                result = session.get(url, headers=headers, **requests_parms)
            except Exception, e:
                agentipjj = Utils.GetMyAgent()
                count += 1  # count failed attempts too, or the loop never ends
                continue
            count = count + 1
            if result.status_code != 200:
                logging.info("Proxy returned status {log_code}".format(
                    log_code=result.status_code))
                agentipjj = Utils.GetMyAgent()
                sleep(2)
            else:
                print(result.status_code)
                if result.ok:
                    sleep(2)
                    return result.content
    except Exception, e:
        logging.info("Error fetching totalSoldQuantity: {m}".format(m=e.message))
        print("Error fetching totalSoldQuantity: {e}".format(e=e.message))
def crawl_yxl(self, auctionId, agentIp):
    yxl = -1
    count = 0
    while count < 20:
        agentIp = Utils.GetMyAgent()
        userAgent = Html_Downloader.GetUserAgent()
        header = {'ip': agentIp, 'user-agent': userAgent}
        text_detail_url = "https://detail.m.tmall.com/item.htm?spm=a320p.7692363.0.0&id={auctionId}".format(
            auctionId=auctionId)
        ok, response = Html_Downloader.Download_Html(text_detail_url, {}, header)
        if ok:
            matchs = re.compile("sellCount\":(.*?)(?=showShopActivitySize)").findall(response.text)
            if len(matchs) > 0:
                yxl = matchs[0].encode('utf-8').replace(",\"", "")
                break
        sleep(3)
        count += 1
    return yxl
def crawl_keyWord(shop_id_url):
    keyword = ""
    try:
        ok, result = Html_Downloader.Download_Html(shop_id_url, {}, {})
        if ok:
            result_json = json.loads(result.content)
            result_ok = bool(result_json['status'])
            keyword = result_json['data'][0]['keyword'].encode('utf-8').strip()
        else:
            print('Failed to fetch the keyword from the API')
    except Exception, e:
        logging.info("Failed to fetch the keyword: {e}".format(e=e.message))
    return keyword
def process_request(self, url, data):
    result_ok = False
    ok, result = Html_Downloader.Download_Html(
        url,
        {item.split('=')[0]: item.split('=')[1] for item in data.split('&')},
        {},
        post=True)
    if ok:
        result_json = json.loads(result.content)
        result_ok = bool(result_json['flag'])
        self.log_and_print(result_json['message'])
    else:
        self.log_and_print(result)
    return result_ok
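# The "split('=')" comprehension above silently truncates any value that
# itself contains '=' (base64 padding, nested query strings). A sketch of a
# safer parse using only the Python 2 stdlib; the helper name is hypothetical
# and it assumes single-valued parameters:
import urlparse

def parse_post_params(data):
    # parse_qs returns {key: [values]}; keep the first value of each key
    return {k: v[0] for k, v in urlparse.parse_qs(data).items()}

# usage: Html_Downloader.Download_Html(url, parse_post_params(data), {}, post=True)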
def crawlTotalpage(self, search_url, header):
    totalpage = 0
    try:
        ok, response = Html_Downloader.Download_Html(search_url, {}, header)
        if not ok:
            count = 0
            while count < 4:
                sleep(2)
                agentip = Utils.GetMyAgent()
                header = {'ip': agentip}
                ok, response = Html_Downloader.Download_Html(search_url, {}, header)
                if ok:
                    break
                count += 1
            if count == 3:
                header = {}
        if ok:
            html = etree.HTML(response.text)
            matchs = html.xpath("//script[contains(.,'g_page_config')]")
            if len(matchs) > 0:
                data = re.compile("g_page_config=(.*)?;g_srp_loadCss").match(
                    matchs[0].text.replace("\n\n", "\n").replace("\n", "").replace(" ", ""))
                if data.lastindex > 0:
                    data = json.loads(data.group(1).encode('utf-8'))
                    if data.has_key('mods'):
                        totalpage = data['mods']['pager']['data']['totalPage']
                    else:
                        print("No valid json matched")
                else:
                    print("Item list not found in the page")
    except Exception, e:
        logging.info("Error crawling total page count for keyword {p}: {m}".format(
            p=self.key_word, m=e.message))
    return totalpage
def crawlMonthSales(self, nid, agentip):
    try:
        month_Sales = ""
        nid_url = "https://mdskip.taobao.com/core/initItemDetail.htm?itemId={nid}"
        refer_url = "https://detail.taobao.com/item.htm?id={nid}"
        nid_Url = nid_url.format(nid=nid)
        nid_refer = refer_url.format(nid=nid)
        cookies = "ab=12; UM_distinctid=15e7a46caf311b-0188d989dac01e-5c153d17-144000-15e7a46caf4552; thw=cn; ali_apache_id=11.131.226.119.1505353641652.239211.1; miid=2033888855982094870; l=AllZcEkSLTy0io2vJcWc-ksY6U4zk02Y; _cc_=WqG3DMC9EA%3D%3D; tg=0; _uab_collina=150780747345339957932139; cna=dNU/EvGIRjsCAQ4XY4PdDkHN; _tb_token_=3d73497b6b4b1; ali_ab=14.23.99.131.1510570522194.8; hng=CN%7Czh-CN%7CCNY%7C156; mt=ci=0_0; _m_h5_tk=c690a92415e1684e37a0d852f95c4237_1511139636041; _m_h5_tk_enc=03e0735d1910593631f521e6615c4e4b; x=124660357; uc3=sg2=AVMH%2FcTVYAeWJwo98UZ6Ld9wxpMCVcQb0e1XXZrd%2BhE%3D&nk2=&id2=&lg2=; uss=VAmowkFljKPmUhfhc%2B1GBuXNJWn9cLMEX%2FtIkJ5j0tQgoNppvUlaKrn3; tracknick=; sn=%E4%BE%9D%E4%BF%8A%E6%9C%8D%E9%A5%B0%3A%E8%BF%90%E8%90%A5; skt=53a079a2a620057d; v=0; cookie2=17f5415096176ca88c03d1fed693a1d4; unb=2077259956; t=1630b104e4d32df897451d6c96642469; uc1=cookie14=UoTdev2%2BYyNASg%3D%3D&lng=zh_CN; _umdata=85957DF9A4B3B3E8F872A3094256432F0F1549AE1C92C6CCF1E68B982581686F23BFC13A60CCABD1CD43AD3E795C914C5B383FEA6B5C410F78EAF10A11987746; isg=Au_vsoMX6XTuPe7jEO7aMMjafgM5PEijMRuJ0QF8i95lUA9SCWTTBu2ApHYV"
        # cookies = "_tb_token_=f3fe5d65a6591;cookie2=171e5eb92d66332b1d52d9e2730fed33;t=bf64b0d40d912c08dd434661471b2c98;v=0"
        # split on the first '=' only: cookie values such as "mt=ci=0_0" contain '='
        cookie_dict = {item.split('=', 1)[0].strip(): item.split('=', 1)[1]
                       for item in cookies.split(';')}
        header = {'ip': agentip, 'Referer': nid_refer, "cookies": cookie_dict,
                  'User-Agent': Html_Downloader.GetUserAgent()}
        ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
        if not ok:
            count = 0
            while count < 5:
                sleep(2)
                agentip = Utils.GetMyAgent()
                header = {'ip': agentip, 'Referer': nid_refer, 'timeout': '5000',
                          "cookies": cookie_dict,
                          'User-Agent': Html_Downloader.GetUserAgent()}
                ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
                if ok:
                    break
                count += 1
                print "Monthly-sales fetch retry #{count}".format(count=count)
        if ok and "sellCount\":" not in response.text:
            count = 0
            while count < 5:
                sleep(2)
                agentip = Utils.GetMyAgent()
                header = {'ip': agentip, 'Referer': nid_refer, 'timeout': '5000',
                          "cookies": cookie_dict,
                          'User-Agent': Html_Downloader.GetUserAgent()}
                if count == 4:
                    header = {}
                ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
                if ok and "sellCount\":" in response.text:
                    break
                count += 1
                print "sellCount missing from response, retry #{count}".format(count=count)
        if ok and "sellCount\":" in response.text:
            month_Sales = str(re.compile("sellCount\":(.*?)(?=\"success\")").findall(
                response.text)[0]).replace(",", "").strip()
            print "Monthly sales: {month_Sales}".format(month_Sales=month_Sales)
        return month_Sales
    except Exception, e:
        logging.info("Error crawling monthly sales: {m}".format(m=e.message))
def get_sales_cnt(self, start, end):
    url = GET_SALES_ORDER_CNT_API
    trade_id = []
    # note the separators: the original string was missing the '&' after account_name
    post_data = "account_name=%s&start=%s&end=%s" % (self.account_name, start, end)
    ok, result = Html_Downloader.Download_Html(
        url,
        {item.split('=')[0]: item.split('=')[1] for item in post_data.split('&')},
        {},
        post=True)
    if ok:
        result_json = json.loads(result.content)
        if bool(result_json['flag']):
            trade_id = set(result_json['data'])
        self.log_and_print(result_json['message'])
    else:
        self.log_and_print(result)
        self.log_and_print("Failed to fetch the order-id list")
    return trade_id
def crawl_shop_all_item(self, url):
    agentIp = Utils.GetMyAgent()
    shop_id = self.shop_id
    shop_name = self.shop_name
    userAgent = Html_Downloader.GetUserAgent()
    header = {'ip': agentIp, 'user-agent': userAgent}
    text_detail_url = url
    ok, response = Html_Downloader.Download_Html(text_detail_url, {}, header)
    if ok:
        jsonArray = json.loads(response.content)  # parse the list json
        total_page = jsonArray.get("total_page")
        total_results = jsonArray.get("total_results")
        page_size = jsonArray.get("page_size")
        jsonResult = jsonArray.get("items")
        for item in jsonResult:
            shop_item = {}
            item_id = str(item.get("item_id")).strip()
            shop_item['item_id'] = item_id
            shop_item['title'] = item.get('title').encode('utf-8')
            shop_item['picUrl'] = "http:" + item.get('img')
            # current sale price
            shop_item['salePrice'] = item.get('price')
            shop_item['totalSoldQuantity'] = item.get('totalSoldQuantity')
            shop_item['crawl_url'] = item.get('url')
            shop_item['crawl_time'] = long(time.time())
            # The item-info API below was once used to fetch the color/variant
            # categories; it is no longer needed since the SKU API returns them:
            # test_Url = "http://d.da-mai.com/index.php?r=itemApi/getItemInfoByItemId&item_id=" + item_id
            # ok, response = Html_Downloader.Download_Html(test_Url, {}, header)
            # if ok:
            #     jsonItems = json.loads(response.content)
            # SKU API: sum the stock quantity over all SKUs
            shop_item['quantity'] = 0
            getSKU_Url = "http://yj.da-mai.com/index.php?r=itemskus/getSkus&fields=*&num_iids={item_id}".format(
                item_id=item_id)
            ok, response = Html_Downloader.Download_Html(getSKU_Url, {}, header)
            if ok:
                jsonItems = json.loads(response.content)
                total_data = jsonItems.get("data")
                for sku in total_data:
                    quantity = sku.get("quantity")
                    shop_item['quantity'] = shop_item['quantity'] + quantity
            # item detail API (second-screen information)
            getDetail_Url = "http://d.da-mai.com/index.php?r=itemApi/getItemInfoByItemId&item_id={item_id}".format(
                item_id=item_id)
            ok, response_detail = Html_Downloader.Download_Html(getDetail_Url, {}, header)
            if ok:
                shop_item['attribute'] = []
                jsonDetails = json.loads(response_detail.content)
                properties = jsonDetails['data']['data']['properties']
                stringName = ""
                for attri in properties:
                    name = attri.get('name')
                    value = attri.get('value')
                    if name in stringName:
                        # repeated property name: append the value only
                        string = "{value} ".format(value=value)
                        shop_item['attribute'].append(string)
                    if name not in stringName:
                        string = "{name}:{value}&&||".format(name=name, value=value)
                        shop_item['attribute'].append(string)
                        stringName = name + stringName
        for page in range(1, int(total_page) + 1):
            # rebuild the per-page list url and crawl it in a child process
            # NOTE: the shop url is hard-coded here; it should be passed in as a parameter
            getlist_url = ("https://yiqianny.m.tmall.com/shop/shop_auction_search.do?ajson=1&_tm_source=tmallsearch&"
                           "spm=a320p.7692171.0.0&sort=d&p={page}&page_size=24&from=h5").format(page=page)
            # pass the callable and its argument; invoking the function in the
            # target= expression would run it synchronously in the parent process
            p = multiprocessing.Process(target=self.crawl_shop_all_item, args=(getlist_url,))
            p.start()
            logging.info("Started crawl process for list url: {url}".format(url=getlist_url))
def parse_items(self, jsonArray, shop_id, agentIp):
    shop_items = []
    # agentIp = None
    header = {'ip': agentIp}
    for item in jsonArray:
        shop_item = {}
        shop_item['shop_id'] = shop_id
        auctionId = item.get('auctionId')
        shop_item['item_id'] = auctionId
        shop_item['title'] = item.get('title')
        shop_item['picUrl'] = "http:" + item.get('picUrl')
        # shop_item['picUrl'] = re.compile("/([^/]*)(?=_)").findall(item.get('picUrl'))[0]
        shop_item['salePrice'] = item.get('salePrice')
        shop_item['reservePrice'] = item.get('reservePrice')
        shop_item['quantity'] = item.get('quantity')
        shop_item['totalSoldQuantity'] = item.get('totalSoldQuantity')
        # fetch the details behind the item link
        t_detail_url = "https://item.taobao.com/item.htm?spm=a1z10.3-c-s.w4002-14766145001.18.6584ae82X93XhC&id={auctionId}".format(
            auctionId=auctionId)
        shop_item['crawl_url'] = t_detail_url
        print(t_detail_url)
        shop_item['crawl_time'] = long(time.time())
        sold = item.get('sold')
        if "tmall" in self.shop_url:
            sold = ""
            sold = self.crawl_yxl(auctionId, agentIp)  # Tmall monthly sales come from a separate page
        shop_item['sold'] = sold
        try:
            ok, response = Html_Downloader.Download_Html(t_detail_url, {}, header)
            if not ok:
                count = 0
                while count < 4:
                    sleep(2)
                    agentip = Utils.GetMyAgent()
                    header = {'ip': agentip}
                    ok, response = Html_Downloader.Download_Html(t_detail_url, {}, header)
                    if ok and "category=item" in response.text:
                        break
                    count += 1
                if count == 3:
                    header = {}
            if ok and "category=item" not in response.text:
                count = 0
                while count < 4:
                    sleep(2)
                    agentip = Utils.GetMyAgent()
                    header = {'ip': agentip}
                    ok, response = Html_Downloader.Download_Html(t_detail_url, {}, header)
                    if ok and "category=item" in response.text:
                        break
                    count += 1
                if count == 3:
                    header = {}
            if ok and "category=item" in response.text:
                html = etree.HTML(response.text)
                category_id = re.compile("item%5f(.*?)(?=&)").findall(response.text)[0]
                shop_item['category_id'] = category_id
                if html.xpath("//dl[contains(@class,'tb-prop')]"):
                    for prop in html.xpath("//dl[contains(@class,'tb-prop')]"):
                        if prop not in html.xpath("//dl[contains(@class,'tb-hidden')]"):
                            prop_value_id = []
                            prop_name = prop.xpath(".//dt/text()")[0].encode('utf-8')
                            for value in prop.xpath(".//dd/ul/li"):
                                sub_value_id = []
                                sku_id = value.get('data-value')
                                sub_value_id.append(sku_id)
                                if value.xpath('./a/span/text()'):
                                    sku_name = value.xpath('./a/span/text()')[0].encode('utf-8')
                                    sub_value_id.append(sku_name)
                                if value.xpath('./a')[0].get('style') and re.compile("/([^/]*)(?=_!!|_M2)").findall(
                                        value.xpath('./a')[0].get('style')):
                                    sku_img_url = re.compile("/([^/]*)(?=_!!|_M2)").findall(
                                        value.xpath('./a')[0].get('style'))[0]
                                    sub_value_id.append(sku_img_url)
                                prop_value_id.append(";".join(sub_value_id))
                            shop_item[prop_name] = prop_value_id
                if html.xpath("//ul[@id='J_UlThumb']"):
                    stype_img_id = []
                    if html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li/div"):
                        for value1 in html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li/div"):
                            if value1.xpath('./a')[0].xpath('./img')[0].get('data-src') and re.compile(
                                    "/([^/]*)(?=_!!|_M2)").findall(
                                        value1.xpath('./a')[0].xpath('./img')[0].get('data-src')):
                                sku_img_id = re.compile("/([^/]*)(?=_!!|_M2)").findall(
                                    value1.xpath('./a')[0].xpath('./img')[0].get('data-src'))[0]
                                stype_img_id.append(sku_img_id)
                    elif html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li"):
                        for value1 in html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li"):
                            if value1.xpath('./a')[0].xpath('./img')[0].get('src') and re.compile(
                                    "/([^/]*)(?=_!!|_M2)").findall(
                                        value1.xpath('./a')[0].xpath('./img')[0].get('src')):
                                sku_img_id = re.compile("/([^/]*)(?=_!!|_M2)").findall(
                                    value1.xpath('./a')[0].xpath('./img')[0].get('src'))[0]
                                stype_img_id.append(sku_img_id)
                    shop_item["img_attr"] = "&&||".join(stype_img_id)
                styleliList = []
                if html.xpath("//ul[@id='J_AttrUL']"):
                    for styleli in html.xpath("//ul[@id='J_AttrUL']")[0].xpath(".//li"):
                        if styleli.xpath('./text()'):
                            styleliText = styleli.xpath('./text()')[0].encode('utf-8').strip()
                            styleliList.append(styleliText)
                elif html.xpath("//div[@id='attributes']"):
                    for styleli in html.xpath("//div[@id='attributes']")[0].xpath(".//ul/li"):
                        if styleli.xpath('./text()'):
                            styleliText = styleli.xpath('./text()')[0].encode('utf-8').strip()
                            styleliList.append(styleliText)
                shop_item["attribute"] = "&&||".join(styleliList)
        except Exception, e:
            logging.info("Error crawling item detail: {e}".format(e=e.message))
        shop_items.append(shop_item)
def crawl_shop_all_item(self):
    agentIp = Utils.GetMyAgent()
    shop_id = self.shop_id
    shop_name = self.shop_name
    userAgent = Html_Downloader.GetUserAgent()
    header = {'ip': agentIp, 'user-agent': userAgent}
    test_detail_url = ("{shop_url}shop/shop_auction_search.do?ajson=1&_tm_source=tmallsearch"
                       "&spm=a320p.7692171.0.0&sort=d&p={page}&page_size={page_size}&from=h5").format(
                           shop_url=self.shop_url, page_size=1, page=1)
    test_detail_url = test_detail_url.replace(".tmall.com", ".m.tmall.com")
    try:
        ok, response = Html_Downloader.Download_Html(test_detail_url, {}, header)
        if not ok:
            count = 0
            while count < 4:
                sleep(2)
                agentip = Utils.GetMyAgent()
                header = {'ip': agentip}
                ok, response = Html_Downloader.Download_Html(test_detail_url, {}, header)
                if ok:
                    break
                count += 1
            if count == 3:
                header = {}
        if ok:
            jsonArray = json.loads(response.content)  # parse the list json
            total_page = jsonArray.get("total_page")
            total_results = jsonArray.get("total_results")
            page_size = jsonArray.get("page_size")
            logging.info("shopname:" + shop_name + " total_page:" + str(total_page) +
                         " total_results:" + str(total_results) + " page_size:" + str(page_size))
            print "total_page:" + str(total_page) + " total_results:" + str(total_results) + \
                  " page_size:" + str(page_size)
            for i in range(int(total_page)):
                print i + 1
                test_detail_url = ("{shop_url}shop/shop_auction_search.do?ajson=1&_tm_source=tmallsearch"
                                   "&spm=a320p.7692171.0.0&sort=d&p={page}&page_size={page_size}&from=h5").format(
                                       shop_url=self.shop_url, page_size=page_size, page=i + 1)
                test_detail_url = test_detail_url.replace(".tmall.com", ".m.tmall.com")
                # (a longer variant that re-downloaded the last page until its
                #  "price" occurrence count matched total_results was left
                #  commented out here)
                ok, response = Html_Downloader.Download_Html(test_detail_url, {}, header)
                if not ok:
                    count = 0
                    while count < 11:
                        sleep(2)
                        agentip = Utils.GetMyAgent()
                        header = {'ip': agentip}
                        ok, response = Html_Downloader.Download_Html(test_detail_url, {}, header)
                        if ok:
                            break
                        count += 1
                    if count == 10:
                        header = {}
                if ok:
                    # logging.info("Fetched the list json, starting to parse")
                    self.parse_items(response.content, shop_id, agentIp, shop_name, userAgent)
    except Exception, e:
        logging.error("Failed to crawl shop {shop_name} (shop_id: {shop_id}): {m}".format(
            shop_name=shop_name, shop_id=shop_id, m=e.message))
        crawl_content = "error while crawling the list page"
        message = e.message
        start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        insertLog(crawl_content, message, shop_id, agentIp, test_detail_url, start_time, shop_name)
def parse_items(self, content, shop_id, agentIp, shop_name, userAgent):
    # record the start time up front so the except branch can log it
    start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    try:
        jsonArray = json.loads(content)
        jsonResult = jsonArray.get("items")
        shop_items = []
        header = {'ip': agentIp, 'user-agent': userAgent}
        print "Parsing the list json data"
        for item in jsonResult:
            shop_item = {}
            shop_item['shop_id'] = str(shop_id)
            shop_item['shop_name'] = shop_name
            item_id = str(item.get("item_id")).strip()
            shop_item['item_id'] = item_id
            shop_item['title'] = item.get('title').encode('utf-8')
            shop_item['picUrl'] = "https:" + item.get('img')
            # print item.get('price')
            # current sale price
            # shop_item['salePrice'] = item.get('price')
            shop_item['totalSoldQuantity'] = str(item.get('totalSoldQuantity'))
            crawl_url = "https:" + item.get('url')
            shop_item['crawl_url'] = crawl_url.replace(".m.tmall.com", ".tmall.com")
            shop_item['crawl_time'] = long(time.time())
            # item-info API: fetch the category id
            category_id = ""
            category_id_Url = "http://d.da-mai.com/index.php?r=itemApi/getItemInfoByItemId&item_id=" + item_id
            ok, response = Html_Downloader.Download_Html(category_id_Url, {}, header)
            if not ok:
                count = 0
                while count < 4:
                    sleep(1)
                    agentip = Utils.GetMyAgent()
                    header = {'ip': agentip}
                    if count == 3:
                        header = {}
                    ok, response = Html_Downloader.Download_Html(category_id_Url, {}, header)
                    if ok:
                        break
                    count += 1
            if ok:
                jsonItems = json.loads(response.content)
                category_id = jsonItems['data']['data']['cid']
            # SKU API: sum the stock quantity over all SKUs
            total_quantity = 0
            quantity_Url = "http://yj.da-mai.com/index.php?r=itemskus/getSkus&fields=*&num_iids={item_id}".format(
                item_id=item_id)
            ok, response = Html_Downloader.Download_Html(quantity_Url, {}, header)
            if not ok:
                count = 0
                while count < 4:
                    sleep(2)
                    agentip = Utils.GetMyAgent()
                    header = {'ip': agentip}
                    ok, response = Html_Downloader.Download_Html(quantity_Url, {}, header)
                    if ok and "quantity" in response.text:
                        break
                    count += 1
                if count == 3:
                    header = {}
            if ok and "quantity" not in response.text:
                count = 0
                while count < 4:
                    sleep(2)
                    agentip = Utils.GetMyAgent()
                    header = {'ip': agentip}
                    ok, response = Html_Downloader.Download_Html(quantity_Url, {}, header)
                    if ok and "quantity" in response.text:
                        break
                    count += 1
                if count == 3:
                    header = {}
            if ok and "quantity" in response.text:
                print "Fetched the sku json, parsing it"
                jsonItems = json.loads(response.content)
                total_data = jsonItems.get("data")
                for sku in total_data:
                    quantity = sku.get("quantity")
                    total_quantity = total_quantity + quantity
            shop_item['category_id'] = str(category_id)
            shop_item['quantity'] = str(total_quantity)
            agentip = Utils.GetMyAgent()
            shop_item['month_Sales'] = self.crawlMonthSales(item_id, agentip)
            shop_items.append(shop_item)
        post_data = {'data': json.dumps(shop_items)}
        if not self.process_request(SAVE__INSERT_API, post_data):
            sleep(3)
            self.process_request(SAVE__INSERT_API, post_data)
        # if not self.process_request(SAVE__INSERT_API_ZS, post_data):
        #     sleep(3)
        #     self.process_request(SAVE__INSERT_API_ZS, post_data)
    except Exception, e:
        logging.info("Failed to crawl shop {shop_name} (shop_id: {shop_id}): {m}".format(
            shop_name=shop_name, shop_id=shop_id, m=e.message))
        crawl_content = "error while parsing the API data"
        message = e.message
        insertLog(crawl_content, message, shop_id, agentIp, "", start_time, shop_name)
def crawlMonthSales(self, nid, agentip):
    try:
        month_Sales = ""
        nid_url = "https://mdskip.taobao.com/core/initItemDetail.htm?itemId={nid}"
        refer_url = "https://detail.taobao.com/item.htm?id={nid}"
        nid_Url = nid_url.format(nid=nid)
        nid_refer = refer_url.format(nid=nid)
        cookies = "x=__ll%3D-1%26_ato%3D0; l=AhERSU92PmRba9QUgSCkQMF6oRaqOoXt; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; cna=dNU/EvGIRjsCAQ4XY4PdDkHN; _m_h5_tk=7d8d6e65e5c676a6d0a69c26f7436ea1_1510363282671; _m_h5_tk_enc=e32129060738b7ce01e9114c9bec037f; sm4=440100; hng=CN%7Czh-CN%7CCNY%7C156; uc1=cookie14=UoTde95xncLyFQ%3D%3D&lng=zh_CN; uc3=sg2=Vq0THzNyGHIH22DuvMx9ZEwXL5qc2kn7REWHdois6v0%3D&nk2=&id2=&lg2=; uss=AQDPJiEXAu47o41b5k%2BKpKRT3Ckpz9nqnJX2F%2F7kZG6ttuI82ZnQa7ZL; t=1630b104e4d32df897451d6c96642469; unb=2607292494; sn=sitiselected%E6%97%97%E8%88%B0%E5%BA%97%3A%E5%A4%A7%E9%BA%A6; _tb_token_=eef7bd7b7abd6; cookie2=23bb087c638814ce8a8e329ead5332d4; isg=ApqaMZmelJirXxuDoGSRqtW160B8YxWwfLxcMqQTRi34FzpRjFtutWDlkdVw"
        # cookies = "_tb_token_=f3fe5d65a6591;cookie2=171e5eb92d66332b1d52d9e2730fed33;t=bf64b0d40d912c08dd434661471b2c98;v=0"
        # split on the first '=' only: cookie values may themselves contain '='
        cookie_dict = {item.split('=', 1)[0].strip(): item.split('=', 1)[1]
                       for item in cookies.split(';')}
        header = {'ip': agentip,
                  'Referer': nid_refer,
                  "cookies": cookie_dict,
                  'User-Agent': Html_Downloader.GetUserAgent()}
        ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
        if not ok:
            count = 0
            while count < 5:
                sleep(2)
                agentip = Utils.GetMyAgent()
                header = {'ip': agentip, 'Referer': nid_refer, 'timeout': '5000',
                          "cookies": cookie_dict,
                          'User-Agent': Html_Downloader.GetUserAgent()}
                ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
                if ok:
                    break
                count += 1
                print "Monthly-sales fetch retry #{count}".format(count=count)
        if ok and "sellCount\":" not in response.text:
            count = 0
            while count < 10:
                sleep(2)
                agentip = Utils.GetMyAgent()
                header = {'ip': agentip, 'Referer': nid_refer, 'timeout': '5000',
                          "cookies": cookie_dict,
                          'User-Agent': Html_Downloader.GetUserAgent()}
                if count == 9:
                    header = {}
                ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
                if ok and "sellCount\":" in response.text:
                    break
                count += 1
                print "sellCount missing from response, retry #{count}".format(count=count)
        if ok and "sellCount\":" in response.text:
            month_Sales = str(re.compile("sellCount\":(.*?)(?=\"success\")").findall(
                response.text)[0]).replace(",", "").strip()
            print "Monthly sales: {month_Sales}".format(month_Sales=month_Sales)
        return month_Sales
    except Exception, e:
        logging.info("Error crawling monthly sales: {m}".format(m=e.message))
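# The cookie strings above are parsed by splitting on '=' by hand. Python 2's
# stdlib Cookie module can do the same parse more robustly; a sketch, assuming
# the same semicolon-delimited cookie strings used throughout this file (the
# helper name is hypothetical):
from Cookie import SimpleCookie

def cookies_to_dict(cookie_str):
    c = SimpleCookie()
    c.load(cookie_str)
    # each morsel's value has any quoting already stripped
    return {name: morsel.value for name, morsel in c.items()}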
# -*- coding: utf-8 -*-
# Crawl all items of a given shop, including item specs and attributes.
# If the interval since the last crawl is under one day, deduplicate on
# item_id; if it is over one day, insert directly.
from config import PROJECT_PATH, SEPARATOR
from lxml import etree
from utils.utils import Utils
from utils.html_downloader import Html_Downloader
import json

fq = open("d:\\1.txt", 'r')
json_str = fq.read()
fq.close()

post_data = {'data': json_str}
url = 'http://192.168.12.91:8080/pdd/competeShopRearController/saveCompeteShopInfo'
ok, result = Html_Downloader.Download_Html(url, post_data, {'timeout': 60}, post=True)
print result
def parse_items(self, html, shop_id, agentIp):
    shop_items = []
    if html.xpath("//div[contains(@class,'shop-hesper-bd')]"):
        for item in html.xpath("//div[contains(@class,'shop-hesper-bd')]")[0].xpath(".//dl"):
            shop_item = {}
            shop_item['shop_id'] = shop_id
            item_id = item.get('data-id').replace("\"", "").replace("\\", "")
            shop_item['item_id'] = item_id
            shop_item['item_title'] = item.xpath('./dd[1]/a/text()')[0].encode('utf-8').strip()
            shop_item['item_pic'] = item.xpath('./dt/a/img/@src')[0].strip().replace(
                "\\", "").replace("//", 'https://')
            shop_item['item_sales'] = int(
                item.xpath(".//*[contains(@class,'sale-num')]/text()")[0].encode('utf-8'))
            shop_item['item_old_price'] = float(
                item.xpath(".//*[contains(@class,'s-price')]/text()")[0].encode('utf-8')
            ) if item.xpath(".//*[contains(@class,'s-price')]/text()") else None
            shop_item['item_new_price'] = float(
                item.xpath(".//*[contains(@class,'c-price')]/text()")[0].encode('utf-8'))
            shop_item['item_comment'] = int(
                item.xpath(".//*[contains(@class,'rates')]")[0].xpath(".//span/text()")[0].encode('utf-8')
            ) if item.xpath(".//*[contains(@class,'rates')]") else None
            shop_item['crawl_time'] = long(time.time())
            # fetch the details behind the item link
            detail_url = "https://item.taobao.com/item.htm?spm=a1z10.3-c-s.w4002-14766145001.18.6584ae82X93XhC&id={item_id}"
            t_detail_url = detail_url.format(item_id=item_id)
            shop_item['crawl_url'] = t_detail_url
            header = {'ip': agentIp}
            try:
                ok, response = Html_Downloader.Download_Html(t_detail_url, {}, header)
                print(ok)
                if not ok:
                    # retry once without the proxy header
                    ok, response = Html_Downloader.Download_Html(t_detail_url, {}, {})
                    print(t_detail_url)
                if ok:
                    html = etree.HTML(response.text)
                    category_id = re.compile("item%5f(.*?)(?=&)").findall(response.text)[0]
                    shop_item['category_id'] = category_id
                    # if html.xpath("//meta[@name='microscope-data']"):
                    #     for meta in html.xpath("//meta[@name='microscope-data']")[0].get('content').split(';'):
                    #         if 'shopid' in meta.lower():
                    #             shop_id = meta.split("=")[1]
                    if html.xpath("//dl[contains(@class,'tb-prop')]"):
                        for prop in html.xpath("//dl[contains(@class,'tb-prop')]"):
                            if prop not in html.xpath("//dl[contains(@class,'tb-hidden')]"):
                                prop_value_id = []
                                prop_name = prop.xpath(".//dt/text()")[0].encode('utf-8')
                                for value in prop.xpath(".//dd/ul/li"):
                                    sub_value_id = []
                                    sku_id = value.get('data-value')
                                    sub_value_id.append(sku_id)
                                    if value.xpath('./a/span/text()'):
                                        sku_name = value.xpath('./a/span/text()')[0].encode('utf-8')
                                        sub_value_id.append(sku_name)
                                    prop_value_id.append(";".join(sub_value_id))
                                shop_item[prop_name] = "&&||".join(prop_value_id)
                    if html.xpath("//ul[@id='J_UlThumb']"):
                        stype_img_id = []
                        if html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li/div"):
                            for value1 in html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li/div"):
                                if value1.xpath('./a')[0].xpath('./img')[0].get('data-src') and re.compile(
                                        "/([^/]*)(?=!!)").findall(
                                            value1.xpath('./a')[0].xpath('./img')[0].get('data-src')):
                                    sku_img_id = re.compile("/([^/]*)(?=!!)").findall(
                                        value1.xpath('./a')[0].xpath('./img')[0].get('data-src'))[0]
                                    stype_img_id.append(sku_img_id)
                        elif html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li"):
                            for value1 in html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li"):
                                if value1.xpath('./a')[0].xpath('./img')[0].get('src') and re.compile(
                                        "/([^/]*)(?=!!)").findall(
                                            value1.xpath('./a')[0].xpath('./img')[0].get('src')):
                                    sku_img_id = re.compile("/([^/]*)(?=!!)").findall(
                                        value1.xpath('./a')[0].xpath('./img')[0].get('src'))[0]
                                    stype_img_id.append(sku_img_id)
                        shop_item["图片属性"] = "&&||".join(stype_img_id)
                    styleliList = []
                    if html.xpath("//ul[@id='J_AttrUL']"):
                        for styleli in html.xpath("//ul[@id='J_AttrUL']")[0].xpath(".//li"):
                            if styleli.xpath('./text()'):
                                styleliText = styleli.xpath('./text()')[0].encode('utf-8').strip()
                                styleliList.append(styleliText)
                    elif html.xpath("//div[@id='attributes']"):
                        for styleli in html.xpath("//div[@id='attributes']")[0].xpath(".//ul/li"):
                            if styleli.xpath('./text()'):
                                styleliText = styleli.xpath('./text()')[0].encode('utf-8').strip()
                                styleliList.append(styleliText)
                    shop_item["属性"] = "&&||".join(styleliList)
            except Exception, e:
                print("----item detail crawl error----")
            shop_items.append(shop_item)
    self.shopall.insert_or_update(shop_items)
def crawl_sales(self, finish_handler):
    driver = ChromeDriver()
    cookie_dic, cookies = driver.login_an_get(self.account_name, self.account_pwd)
    sleep(2)
    driver.quite()
    # split on the first '=' only: cookie values may themselves contain '='
    cookie_dict = {item.split('=', 1)[0].strip(): item.split('=', 1)[1]
                   for item in cookies.split(';')}
    # Split the crawl range into fixed windows (hour granularity). Before
    # crawling each window, compare the database count with the server count;
    # skip windows where the difference is within 5 records.
    hour_interval = 24  # check one 24-hour window at a time
    # start_date = datetime.datetime.strptime("%s 00:00:00" % "2017-06-01", "%Y-%m-%d 00:00:00")
    # end_date = datetime.datetime.strptime("%s 00:00:00" % "2017-06-03", "%Y-%m-%d 00:00:00")
    # end_date + datetime.timedelta(days=1)
    start = long(time.mktime(self.start_date.timetuple()))
    final_end = long(time.mktime(self.end_date.timetuple()))
    end = start + (hour_interval * 3600)
    agent_ip = None
    while end <= final_end:
        url_params = self.params.format(page_num=1, page_size=1, start=start, end=end,
                                        pre_page=1, query_more='true')
        page_size = 100
        total = self.get_total_page(url_params, agent_ip, cookie_dict, start, end)
        sleep(random.randint(1, 2))
        totalpage = 0
        exist_count = self.db.get_order_count_start_end(start, end)
        if total and total > exist_count and (total - exist_count) > 5:
            totalpage = total / page_size
            if total % page_size:
                totalpage += 1
            self.log_and_print("Crawling orders from %s to %s: %s records in %s pages" %
                               (datetime.datetime.fromtimestamp(start),
                                datetime.datetime.fromtimestamp(end), total, totalpage))
        elif total:
            self.log_and_print("Orders from %s to %s already crawled (server: %s, database: %s)" %
                               (datetime.datetime.fromtimestamp(start),
                                datetime.datetime.fromtimestamp(end), total, exist_count))
        for i in range(totalpage):
            page_num = i + 1
            self.log_and_print("Crawling page %s..." % page_num)
            url_params = self.params.format(page_num=page_num, page_size=page_size,
                                            start=start, end=end, pre_page=1, query_more='true')
            ok, result = Html_Downloader.Download_Html(
                self.url,
                {item.split('=')[0]: item.split('=')[1] for item in url_params.split('&')},
                {"cookies": cookie_dict,
                 "ip": agent_ip,
                 "Referer": "https://trade.taobao.com/trade/itemlist/list_sold_items.htm?action=itemlist/SoldQueryAction&event_submit_do_query=1&auctionStatus=SUCCESS&tabCode=success"},
                post=True)
            if ok and "mainOrders" in result.text:
                try:
                    order_json = json.loads(result.text.replace("\r", "").replace("\n", ""))
                except Exception, e:
                    self.log_and_print("Error:%s,result:%s" % (e.message, result.text))
                else:
                    # only parse when the json actually decoded
                    self.parse_sales_json(order_json, start, end)
            else:
                self.log_and_print("Failed to crawl orders from %s to %s, page %s: %s" %
                                   (datetime.datetime.fromtimestamp(start),
                                    datetime.datetime.fromtimestamp(end), page_num, result))
            sec = random.randint(15, 40)
            self.log_and_print("Waiting %s seconds..." % sec)
            sleep(sec)
        self.log_and_print("Finished crawling orders from %s to %s, %s records" %
                           (datetime.datetime.fromtimestamp(start),
                            datetime.datetime.fromtimestamp(end), total))
        start = end
        end = start + (hour_interval * 3600)
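# crawl_sales walks [start_date, end_date] in fixed hour_interval windows.
# The slicing arithmetic, isolated as a generator for clarity; a sketch built
# only from values already present in the method above (the generator itself
# is not part of the original code):
def iter_time_windows(start_date, end_date, hour_interval=24):
    start = long(time.mktime(start_date.timetuple()))
    final_end = long(time.mktime(end_date.timetuple()))
    end = start + hour_interval * 3600
    while end <= final_end:
        yield start, end  # unix-second window [start, end)
        start = end
        end = start + hour_interval * 3600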
for i in range(totalpage):
    page_num = i + 1
    logandprint("Crawling page %s..." % page_num)
    url_params = params.format(page_num=page_num, page_size=page_size, start=start,
                               end=end, pre_page=1, query_more='true')
    ok, result = Html_Downloader.Download_Html(
        url,
        {item.split('=')[0]: item.split('=')[1] for item in url_params.split('&')},
        {"cookies": cookie_dict,
         "ip": agent_ip,
         "Referer": "https://trade.taobao.com/trade/itemlist/list_sold_items.htm?action=itemlist/SoldQueryAction&event_submit_do_query=1&auctionStatus=SUCCESS&tabCode=success"},
        post=True)
    if ok and "mainOrders" in result.text:
        try:
            order_json = json.loads(result.text.replace("\r", "").replace("\n", ""))
        except Exception, e:
            logandprint("Error:%s,result:%s" % (e.message, result.text))
        else:
            parseSalesJson(order_json, start, end)
    else:
        logandprint("Failed to crawl orders from %s to %s, page %s: %s" %
                    (datetime.datetime.fromtimestamp(start),
                     datetime.datetime.fromtimestamp(end), page_num, result))
def crawlNid(self, data, i, agentip, agentipjj):
    items = data['mods']['itemlist']['data']['auctions']
    x = (i - 1) * 44 + 1
    agentip = Utils.GetMyAgent()
    for item in items:
        shop_items = []
        shop_item = {}
        shop_item['keyword'] = self.key_word
        title = item['title']
        isTmall = item['shopcard']['isTmall']
        shop_item['isTmall'] = isTmall
        title = title.replace("<spanclass=H>", "").replace("</span>", "").strip()
        shop_item['title'] = title
        nid = item['nid'].strip()
        shop_item['item_id'] = nid
        view_sales = item['view_sales'].strip()
        # strip the "人收货" ("buyers") suffix from the sales figure
        view_sales = view_sales.replace("人收货,", "").replace("人收货", "").strip()
        shop_item['view_sales'] = view_sales
        shop_item['view_price'] = item['view_price'].strip()
        shop_item['picUrl'] = "http:" + item['pic_url'].strip()
        shop_item['idnick'] = item['nick'].strip()
        shop_item['crawl_time'] = long(time.time())
        shop_item['rank'] = x
        print(x)
        if x == 101:
            break
        x += 1
        # if x <= 5:
        #     continue
        detail_url = "https://detail.tmall.com/item.htm?spm=a230r.1.14.1.ebb2eb2PXquhm&id={nid}&ns=1&abbucket=20"
        t_detail_url = detail_url.format(nid=nid)
        header = {'ip': agentip}
        shop_id = ""  # initialized here so it exists even if the crawl below fails
        try:
            sleep(2)
            ok, response = Html_Downloader.Download_Html(t_detail_url, {}, header)
            if not ok:
                count = 0
                while count < 4:
                    sleep(2)
                    agentip = Utils.GetMyAgent()
                    header = {'ip': agentip}
                    ok, response = Html_Downloader.Download_Html(t_detail_url, {}, header)
                    if ok:
                        break
                    count += 1
                if count == 3:
                    header = {}
            if ok:
                html = etree.HTML(response.text)
                if "shopid" not in response.text:
                    count = 0
                    while count < 4:
                        sleep(2)
                        agentip = Utils.GetMyAgent()
                        header = {'ip': agentip}
                        ok, response = Html_Downloader.Download_Html(t_detail_url, {}, header)
                        if ok:
                            html = etree.HTML(response.text)
                            if "shopid" in response.text:
                                break
                        count += 1
                    if count == 3:
                        header = {}
                # user_id = ""
                category_id = re.compile("item%5f(.*?)(?=&)").findall(response.text)[0]
                shop_item['category_id'] = category_id
                if html.xpath("//meta[@name='microscope-data']"):
                    for meta in html.xpath("//meta[@name='microscope-data']")[0].get('content').split(';'):
                        if 'shopid' in meta.lower():
                            shop_id = meta.split("=")[1]
                        # if 'userid=' in meta.lower():
                        #     user_id = meta.split("=")[1]
                if html.xpath("//dl[contains(@class,'tb-prop')]"):
                    for prop in html.xpath("//dl[contains(@class,'tb-prop')]"):
                        if prop not in html.xpath("//dl[contains(@class,'tb-hidden')]"):
                            prop_value_id = []
                            prop_name = prop.xpath(".//dt/text()")[0].encode('utf-8')
                            for value in prop.xpath(".//dd/ul/li"):
                                sub_value_id = []
                                sku_id = value.get('data-value')
                                sub_value_id.append(sku_id)
                                if value.xpath('./a/span/text()'):
                                    sku_name = value.xpath('./a/span/text()')[0].encode('utf-8')
                                    sub_value_id.append(sku_name)
                                if value.xpath('./a')[0].get('style') and re.compile(
                                        "/([^/]*)(?=_!!|_M2)").findall(value.xpath('./a')[0].get('style')):
                                    sku_img_url = re.compile("/([^/]*)(?=_!!|_M2)").findall(
                                        value.xpath('./a')[0].get('style'))[0]
                                    sub_value_id.append(sku_img_url)
                                prop_value_id.append(";".join(sub_value_id))
                            shop_item[prop_name] = prop_value_id
                if html.xpath("//ul[@id='J_UlThumb']"):
                    stype_img_id = []
                    if html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li/div"):
                        for value1 in html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li/div"):
                            if value1.xpath('./a')[0].xpath('./img')[0].get('data-src') and re.compile(
                                    "/([^/]*)(?=_!!|_M2)").findall(
                                        value1.xpath('./a')[0].xpath('./img')[0].get('data-src')):
                                sku_img_id = re.compile("/([^/]*)(?=_!!|_M2)").findall(
                                    value1.xpath('./a')[0].xpath('./img')[0].get('data-src'))[0]
                                stype_img_id.append(sku_img_id)
                    elif html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li"):
                        for value1 in html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li"):
                            if value1.xpath('./a')[0].xpath('./img')[0].get('src') and re.compile(
                                    "/([^/]*)(?=_!!|_M2)").findall(
                                        value1.xpath('./a')[0].xpath('./img')[0].get('src')):
                                sku_img_id = re.compile("/([^/]*)(?=_!!|_M2)").findall(
                                    value1.xpath('./a')[0].xpath('./img')[0].get('src'))[0]
                                stype_img_id.append(sku_img_id)
                    shop_item["attr_img"] = "&&||".join(stype_img_id)
                styleliList = []
                # (an alternative that split each "name:value" attribute pair
                #  into a dict was left commented out here)
                if html.xpath("//ul[@id='J_AttrUL']"):
                    for styleli in html.xpath("//ul[@id='J_AttrUL']")[0].xpath(".//li"):
                        if styleli.xpath('./text()'):
                            styleliText = styleli.xpath('./text()')[0].encode('utf-8').strip()
                            styleliList.append(styleliText)
                elif html.xpath("//div[@id='attributes']"):
                    for styleli in html.xpath("//div[@id='attributes']")[0].xpath(".//ul/li"):
                        if styleli.xpath('./text()'):
                            styleliText = styleli.xpath('./text()')[0].encode('utf-8').strip()
                            styleliList.append(styleliText)
                shop_item["attribute"] = "&&||".join(styleliList)
        except Exception, e:
            logging.info("Keyword {p} crawl failed, nid={nid}: {m}".format(
                p=self.key_word, nid=nid, m=e.message))
        shop_item['crawl_url'] = t_detail_url
        shop_item['shop_id'] = shop_id
        session = Session()
        # the first call is expected to fail; it only primes the session cookies
        self.get_total_sales(session, agentip, 1, shop_id)
        total_page = 1
        total_sales = ""
        for page_i in range(50):
            # stop early once past the last page
            if total_page and page_i >= total_page:
                break
            result = self.get_total_sales(session, agentip, page_i + 1, shop_id)
            if not result:
                result = self.get_total_sales(session, agentip, page_i + 1, shop_id)
            if result is not None:
                # unwrap the jsonp payload before decoding
                jobj = json.loads(result.replace("mtopjsonp12(", "").replace("})", "}"))
                jsonArray = jobj['data']['itemsArray']
                total_sales = self.parse_total_sales(jsonArray, nid)
                if total_sales != -1:
                    break
                if jobj and "SUCCESS" in jobj['ret'][0]:
                    total = int(jobj['data']['totalResults'])
                    total_page = total / 30  # at most 30 items per page
                    if total % 30:
                        total_page += 1
                else:
                    print("Failed to fetch data")
                    break
                sleep(2)
            else:
                total_sales = ""
        shop_item['totalSoldQuantity'] = total_sales
        shop_items.append(shop_item)
        post_data = {'data': json.dumps(shop_items)}
        if not self.process_request(SAVE_INSERT_KEYWORD_API, post_data):
            self.process_request(SAVE_INSERT_KEYWORD_API, post_data)