Example No. 1
 def get_total_sales(self, session, agentipjj, page_num, shop_id):
     try:
         count = 0
         while (count < 20):
             print("agentipjj:" + agentipjj)
             proxies = {"http": agentipjj, "https": agentipjj}
             parms_pager = "{{\"shopId\":\"{shop_id}\",\"currentPage\":{page_num},\"pageSize\":\"30\",\"sort\":\"hotsell\",\"q\":\"\"}}"
             parms_url = "https://unzbmix25g.api.m.taobao.com/h5/com.taobao.search.api.getshopitemlist/2.0/?appKey=12574478&t={stmp}&sign={sign}&api=com.taobao.search.api.getShopItemList&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp12&data={pager}"
             params_referer = "https://shop{shop_id}.m.taobao.com/?shop_id={shop_id}&sort=d".format(
                 shop_id=shop_id)
             stmp = "%s739" % (long(time.time()))
             referer = params_referer.format(shop_id=shop_id)
             pager = parms_pager.format(shop_id=shop_id, page_num=page_num)
             if session.cookies.get_dict(
                     '.taobao.com') and session.cookies.get_dict(
                         '.taobao.com').has_key('_m_h5_tk'):
                 h5_tk = session.cookies.get_dict('.taobao.com')['_m_h5_tk']
                 token = re.compile('(.*)(?=_)').findall(h5_tk)[0]
                 value = '%s&%s&12574478&%s' % (token, stmp, pager)
                 sign = self.execute_javascript(value)
             else:
                 sign = "a013c868718eddb116eac3da0aa7974a"
             url = parms_url.format(pager=pager, stmp=stmp, sign=sign)
             requests_parms = {}
             headers = {
                 'Referer': referer,
                 'Host': 'api.m.taobao.com',
                 'Cache-Control': 'no-cache',
                 'Pragma': 'no-cache',
                 'timeout': '5000',
                 'User-Agent': Html_Downloader.GetUserAgent()
             }
             if agentipjj:
                 requests_parms['proxies'] = proxies
                 requests_parms['verify'] = False
             try:
                 result = session.get(url,
                                      headers=headers,
                                      **requests_parms)
             except Exception, e:
                 agentipjj = Utils.GetMyAgent()
                 continue
             count = count + 1
             if result.status_code != 200:
                 logging.info("代理ip返回结果{log_code}".format(
                     log_code=result.status_code))
                 agentipjj = Utils.GetMyAgent()
                 sleep(2)
             else:
                 print(result.status_code)
             if result.ok:
                 sleep(2)
                 return result.content
     except Exception, e:
         logging.info("抓取totalSoldQuantity有错{m}".format(m=e.message))
         print("抓取totalSoldQuantity有错{e}".format(e=e.message))
Example No. 2
 def run(self):
     agentip = Utils.GetMyAgent()
     agentipjj = Utils.GetMyAgent()
     day = datetime.now().strftime("%Y%m%d")
     search_url = "https://s.taobao.com/search?q={q}&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_{day}&ie=utf8&bcoffset=0&ntoffset=1&p4ppushleft=%2C44&sort=sale-desc&s={s}"
     page_Url = search_url.format(q=self.key_word, day=day, s=0)
     header = {'ip': agentip}
     total = 3
     totalpage = self.crawlTotalpage(page_Url, header)
     total = totalpage if totalpage < total else total
     total = total + 1
     for i in range(1, total):
         t_url = search_url.format(q=self.key_word, day=day, s=(i - 1) * 44)
         try:
             ok, response = Html_Downloader.Download_Html(t_url, {}, header)
             if not ok:
                 count = 0
                 while (count < 4):
                     sleep(2)
                     agentip = Utils.GetMyAgent()
                     header = {'ip': agentip}
                     ok, response = Html_Downloader.Download_Html(
                         t_url, {}, header)
                     if ok:
                         break
                     count += 1
                     if count == 3:
                         header = {}
             if ok:
                 html = etree.HTML(response.text)
                 matchs = html.xpath(
                     "//script[contains(.,'g_page_config')]")
                 if len(matchs) > 0:
                     data = re.compile(
                         "g_page_config=(.*)?;g_srp_loadCss").match(
                             matchs[0].text.replace("\n\n", "\n").replace(
                                 "\n", "").replace(" ", ""))
                     if data.lastindex > 0:
                         data = json.loads(data.group(1).encode('utf-8'))
                         if data.has_key('mods'):
                             self.crawlNid(data, i, agentip, agentipjj)
                     else:
                         print("无法匹配有效的json")
                 else:
                     print("无法匹配到宝贝列表")
             else:
                 logging.info("关键词{p}第{i}页抓取失败{m}".format(p=self.key_word,
                                                          i=i,
                                                          m=e.message))
         except Exception, e:
             logging.info("关键词{p}第{i}页抓取错误{m}".format(p=self.key_word,
                                                      i=i,
                                                      m=e.message))
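
The inner retry loop above (sleep, rotate the proxy, drop the 'ip' header on the last attempt) reappears in most of the examples below. A sketch of a shared helper, assuming Html_Downloader.Download_Html returns an (ok, response) pair and Utils.GetMyAgent() hands back a fresh proxy address, as in these snippets:

def download_with_retries(url, header, tries=4, success_marker=None):
    # retry with a fresh proxy each time; drop the 'ip' header on the last try
    ok, response = False, None
    for attempt in range(tries):
        ok, response = Html_Downloader.Download_Html(url, {}, header)
        if ok and (success_marker is None or success_marker in response.text):
            break
        sleep(2)
        header = {} if attempt == tries - 2 else {'ip': Utils.GetMyAgent()}
    return ok, response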
Example No. 3
 def crawlMonthSales(self, nid, agentip):
     try:
         month_Sales = ""
         nid_url = "https://mdskip.taobao.com/core/initItemDetail.htm?itemId={nid}"
         refer_url = "https://detail.taobao.com/item.htm?id={nid}"
         nid_Url = nid_url.format(nid=nid)
         nid_refer = refer_url.format(nid=nid)
         cookies = "ab=12; UM_distinctid=15e7a46caf311b-0188d989dac01e-5c153d17-144000-15e7a46caf4552; thw=cn; ali_apache_id=11.131.226.119.1505353641652.239211.1; miid=2033888855982094870; l=AllZcEkSLTy0io2vJcWc-ksY6U4zk02Y; _cc_=WqG3DMC9EA%3D%3D; tg=0; _uab_collina=150780747345339957932139; cna=dNU/EvGIRjsCAQ4XY4PdDkHN; _tb_token_=3d73497b6b4b1; ali_ab=14.23.99.131.1510570522194.8; hng=CN%7Czh-CN%7CCNY%7C156; mt=ci=0_0; _m_h5_tk=c690a92415e1684e37a0d852f95c4237_1511139636041; _m_h5_tk_enc=03e0735d1910593631f521e6615c4e4b; x=124660357; uc3=sg2=AVMH%2FcTVYAeWJwo98UZ6Ld9wxpMCVcQb0e1XXZrd%2BhE%3D&nk2=&id2=&lg2=; uss=VAmowkFljKPmUhfhc%2B1GBuXNJWn9cLMEX%2FtIkJ5j0tQgoNppvUlaKrn3; tracknick=; sn=%E4%BE%9D%E4%BF%8A%E6%9C%8D%E9%A5%B0%3A%E8%BF%90%E8%90%A5; skt=53a079a2a620057d; v=0; cookie2=17f5415096176ca88c03d1fed693a1d4; unb=2077259956; t=1630b104e4d32df897451d6c96642469; uc1=cookie14=UoTdev2%2BYyNASg%3D%3D&lng=zh_CN; _umdata=85957DF9A4B3B3E8F872A3094256432F0F1549AE1C92C6CCF1E68B982581686F23BFC13A60CCABD1CD43AD3E795C914C5B383FEA6B5C410F78EAF10A11987746; isg=Au_vsoMX6XTuPe7jEO7aMMjafgM5PEijMRuJ0QF8i95lUA9SCWTTBu2ApHYV"
         # cookies="_tb_token_=f3fe5d65a6591;cookie2=171e5eb92d66332b1d52d9e2730fed33;t=bf64b0d40d912c08dd434661471b2c98;v=0"
         cookie_dict = {item.split('=')[0]: item.split('=')[1] for item in cookies.split(';')}
         header = {'ip': agentip, 'Referer': nid_refer,
                   "cookies": cookie_dict,
                   'User-Agent': Html_Downloader.GetUserAgent()}
         ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
         if not ok:
             count = 0
             while count < 5:
                 sleep(2)
                 agentip = Utils.GetMyAgent()
                 header = {'ip': agentip, 'Referer': nid_refer,
                           'timeout': '5000',
                           "cookies": cookie_dict,
                           'User-Agent': Html_Downloader.GetUserAgent()}
                 ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
                 if ok:
                     break
                 count += 1
                 print "获取月销量第{count}试错".format(count=count)
         if ok and "sellCount\":" not in response.text:
             count = 0
             while count <5:
                 sleep(2)
                 agentip = Utils.GetMyAgent()
                 header = {'ip': agentip, 'Referer': nid_refer,
                           'timeout': '5000',
                           "cookies": cookie_dict,
                           'User-Agent': Html_Downloader.GetUserAgent()}
                 if count ==4:
                     header = {}
                 ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
                 if ok and "sellCount\":" in response.text:
                     break
                 count += 1
                 print "sellCount不在反馈中,获取月销量第{count}试错".format(count=count)
         if ok and "sellCount\":" in response.text:
             month_Sales = str(re.compile("sellCount\":(.*?)(?=\"success\")").findall(response.text)[0]).replace(",",
                                                                                                                 "").replace(
                     ",", "").strip()
             print "获得月销量为:{month_Sales}".format(month_Sales=month_Sales)
             return month_Sales
     except Exception, e:
         logging.info("月销量爬取错误{m}".format(m=e.message))
Example No. 4
 def crawl_shop_all_item(self):
     agentIp = Utils.GetMyAgent()
     shop_id =self.shop_id
     shop_name=self.shop_name
     session =Session()
     self.get_shop_item_list(session, agentIp, 1, shop_id, shop_name)  # the first request is expected to fail; it is only made to obtain cookies
     total_page = 1
     for i in range(100):
         print i
         # stop early once the last page has been reached
         if total_page and i >= total_page:
             break
         result =self.get_shop_item_list(session, agentIp, (i+1),shop_id,shop_name)
         if not result:
             result =self.get_shop_item_list(session, agentIp, (i+1),shop_id,shop_name)
         print(result)
         jobj = json.loads(result.replace("mtopjsonp12(", "").replace("})", "}"))  # strip the JSONP wrapper and parse the JSON
         #if(i>=7):
         jsonArray=jobj['data']['itemsArray']
         self.parse_items(jsonArray,shop_id,agentIp)
         if jobj and "SUCCESS" in jobj['ret'][0]:
             total = int(jobj['data']['totalResults'])
             total_page = total / 30  # at most 30 items per page
             if total % 30:
                 total_page += 1
         else:
             print("获取数据失败")
             break
         sleep(2)
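
Stripping the JSONP wrapper with replace("mtopjsonp12(", "") only works while the callback name stays mtopjsonp12, and replace("})", "}") can mangle payloads that contain "})" inside a string value. A more general unwrapping sketch:

import json
import re

def parse_jsonp(text):
    # drop any "callbackName( ... )" wrapper and parse what is left as JSON
    m = re.search(r"^\s*[\w$.]+\((.*)\)\s*;?\s*$", text, re.S)
    return json.loads(m.group(1) if m else text)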
Example No. 5
 def crawlMonthSales(self, nid, agentip):
     try:
         month_Sales = ""
         nid_url = "https://mdskip.taobao.com/core/initItemDetail.htm?itemId={nid}"
         refer_url = "https://detail.taobao.com/item.htm?id={nid}"
         nid_Url = nid_url.format(nid=nid)
         nid_refer = refer_url.format(nid=nid)
         cookies = "ab=56; UM_distinctid=15e7a46caf311b-0188d989dac01e-5c153d17-144000-15e7a46caf4552; thw=cn; ali_apache_id=11.131.226.119.1505353641652.239211.1; miid=2033888855982094870; l=AllZcEkSLTy0io2vJcWc-ksY6U4zk02Y; uc2=wuf=https%3A%2F%2Ftrade.tmall.com%2Fdetail%2ForderDetail.htm%3Fbiz_order_id%3D70514222507416230%26forward_action%3D; _cc_=WqG3DMC9EA%3D%3D; tg=0; _uab_collina=150780747345339957932139; hng=CN%7Czh-CN%7CCNY%7C156; mt=ci=0_0; _tb_token_=3e0501668eb3b; x=124660357; uc3=sg2=AVMH%2FcTVYAeWJwo98UZ6Ld9wxpMCVcQb0e1XXZrd%2BhE%3D&nk2=&id2=&lg2=; uss=VW9L9wvPPdgBBh%2BJHeH%2BVW8D%2FgmRg%2B6YCnShUPaOH0CFHrL4%2FVpP4v7d; tracknick=; sn=%E4%BE%9D%E4%BF%8A%E6%9C%8D%E9%A5%B0%3A%E8%BF%90%E8%90%A5; skt=efe1ec1051eec814; v=0; cookie2=1ce9fff7464537de3d45fe012006d49d; unb=2077259956; t=1630b104e4d32df897451d6c96642469; _m_h5_tk=37be146862abddcfc955f9ec15ebb25d_1508307778971; _m_h5_tk_enc=7ab9ef3ea063dd2c4cd6d33cf84ea2a4; cna=dNU/EvGIRjsCAQ4XY4PdDkHN; uc1=cookie14=UoTcBzysjIcUbw%3D%3D&lng=zh_CN; isg=Amxsuy9SGdk0Xg26l9-JufebPUpejRva_jrq6MateJe60Qzb7jXgX2Ljh68S; _umdata=85957DF9A4B3B3E8F872A3094256432F0F1549AE1C92C6CCF1E68B982581686F23BFC13A60CCABD1CD43AD3E795C914C9A2685321202E656A2C4B44241C24328"
         # cookies="_tb_token_=f3fe5d65a6591;cookie2=171e5eb92d66332b1d52d9e2730fed33;t=bf64b0d40d912c08dd434661471b2c98;v=0"
         cookie_dict = {item.split('=')[0]: item.split('=')[1] for item in cookies.split(';')}
         header = {'ip': agentip, 'Referer': nid_refer,
                   "cookies": cookie_dict,
                   'User-Agent': Html_Downloader.GetUserAgent()}
         ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
         if not ok:
             count = 0
             while (count < 11):
                 sleep(2)
                 agentip = Utils.GetMyAgent()
                 header = {'ip': agentip}
                 ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
                 if ok:
                     break
                 count += 1
                 print "获取月销量第{conut}试错".format(count=count)
         if ok:
             month_Sales = str(re.compile("sellCount\":(.*?)(?=\"success\")").findall(response.text)[0]).replace(",",
                                                                                                                 "").replace(
                 ",", "").strip()
             if not month_Sales:
                 return self.crawlMonthSales(nid, agentip)
             print "获得月销量为:{month_Sales}".format(month_Sales=month_Sales)
             return month_Sales
     except Exception, e:
         logging.info("月销量爬取错误{m}".format(m=e.message))
Example No. 6
def get_csrfId():
    url = "http://dmp.taobao.com/api/login/loginuserinfo"
    proxy = dict()
    proxy['HTTP'] = Utils.GetMyAgent()
    response = requests.get(url=url,
                            proxies=proxy,
                            verify=False,
                            headers=HEADER,
                            cookies=cookie_dict).text
    csrfid = json.loads(response)['data']['csrfId']
    return str(csrfid)
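
One caveat with this snippet and with Example No. 7 and No. 8: requests picks a proxy by matching the request URL's scheme against lowercase keys ('http', 'https'), so uppercase 'HTTP'/'HTTPS' keys are silently ignored and the request goes out without any proxy. A sketch of the intended setup, assuming Utils.GetMyAgent() returns an "ip:port" string as it appears to in Example No. 12:

ip = Utils.GetMyAgent()
proxies = {
    "http": "http://{ip}".format(ip=ip),
    "https": "http://{ip}".format(ip=ip),
}
response = requests.get(url, proxies=proxies, verify=False,
                        headers=HEADER, cookies=cookie_dict, timeout=10)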
Example No. 7
def get_response(cate, i):
    agentIp = dict()
    agentIp['HTTPS'] = Utils.GetMyAgent()
    url = "https://sycm.taobao.com/mq/rank/listItems.json?cateId={cate}&categoryId={cate}&dateRange=2017-11-06%7C2017-11-12&dateRangePre=2017-11-06|2017-11-12&dateType=recent7&dateTypePre=recent7&device=0&devicePre=0&itemDetailType=1&keyword=&orderDirection=desc&orderField=payOrdCnt&page={page}&pageSize=100&rankTabIndex=0&rankType=1&seller=-1&token=67868107c&view=rank&_=1510572223929".format(
        page=i, cate=cate)
    response = requests.get(url=url,
                            headers=HEADERS,
                            proxies=agentIp,
                            verify=False,
                            cookies=cookie_dict).text
    return response
Example No. 8
def get_response(url):
    try:
        proxy = dict()
        proxy['HTTP'] = Utils.GetMyAgent()
        #proxy['HTTP'] ={'HTTP':'182.34.50.90:808'}
        response = requests.get(url=url, cookies=cookie_dict, headers=HEADERS, verify=False, proxies=proxy).text
        html = etree.HTML(response)
        time.sleep(8)
        return html
    except Exception, e:
        logging.error("获取响应时出现错误{e}".format(e=e.message))
        return get_response(url)
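
Even with the missing returns added, an unlucky run can still recurse on every exception with no depth limit. A bounded-retry sketch that uses the same module-level HEADERS, cookie_dict and Utils helpers:

def get_response_with_retries(url, max_tries=3):
    # bounded retries instead of unbounded recursion
    for attempt in range(max_tries):
        try:
            proxy = {"http": Utils.GetMyAgent()}
            text = requests.get(url=url, cookies=cookie_dict, headers=HEADERS,
                                verify=False, proxies=proxy, timeout=10).text
            return etree.HTML(text)
        except Exception as e:
            logging.error("attempt %s failed: %s", attempt + 1, e)
            time.sleep(2)
    return None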
Example No. 9
def get_data(shop_id, item_ids, total_items, shopname):
    url = "https://tui.taobao.com/recommend?seller_id=78550821&shop_id={shop_id}&item_ids={item_id}&floorId=42296&" \
          "pSize=500&callback=detail_pine&appid=2144&count=200&pNum=0".format(shop_id=shop_id, item_id=item_ids)
    agentIp = dict()
    agentIp['HTTPS'] = Utils.GetMyAgent()
    try:
        response = requests.get(url, headers=HEADERS, verify=False, proxies=agentIp)
        if response.status_code == 200 and "result" in response.text:
            total_json = response.text.replace("\r", "").replace("\n", "").replace("detail_pine(", "").replace("});",
                                                                                                               "}")
            total_json = json.loads(total_json)
            need_json = total_json["result"]
            for need in need_json:
                getNeedJson = dict()
                getNeedJson['month_Sales'] = str(need['monthSellCount'])
                getNeedJson['title'] = need['itemName']
                getNeedJson['item_id'] = str(need['itemId']).strip()
                getNeedJson['totalSoldQuantity'] = str(need['sellCount'])
                getNeedJson['quantity'] = str(need['quantity'])
                # price not needed: getNeedJson['promotionPrice'] = need['promotionPrice']
                getNeedJson['category_id'] = str(need['categoryId'])
                getNeedJson['picUrl'] = "https:" + need['pic']
                getNeedJson['crawl_url'] = "https:" + need['url']
                getNeedJson['crawl_time'] = TIME
                getNeedJson['shop_id'] = str(shop_id)
                getNeedJson['shop_name'] = shopname
                # write the finished record to redis
                r.set(need['itemId'], getNeedJson)
                # SHARE_Q.put(getNeedJson)
                '''
                Deduplicate by item_id with a list; abandoned because it was too slow
                if len(DATA) == 0:
                    DATA.append(getNeedJson)
                else:
                    m = list()
                    for data in DATA:
                        m.append(data['itemId'])
                    if getNeedJson['itemId'] not in m:
                        DATA.append(getNeedJson)
                    else:
                        continue
                #DATA = list(set(DATA))
                '''
                print "添加商品成功{item_id}".format(item_id=need['itemId'])
    except Exception, e:
        logging.error("爬取{shopname}是出错:{e}".format(shopname=shopname, e=e.message))
        return get_data(shop_id, item_ids, total_items, shopname)
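
r.set(need['itemId'], getNeedJson) hands a dict straight to redis; recent redis-py versions reject non-string values, and older ones store the dict's repr, which is awkward to read back. Serialising to JSON first keeps the record recoverable:

import json

# store the record as a JSON string so it can be json.loads()-ed later
r.set(need['itemId'], json.dumps(getNeedJson))
record = json.loads(r.get(need['itemId']))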
Example No. 10
 def crawl_yxl(self,auctionId,agentIp):
     yxl=-1
     count =0
     while(count<20):
         agentIp=Utils.GetMyAgent()
         userAgent=Html_Downloader.GetUserAgent()
         header = {'ip': agentIp,'user-agent':userAgent}
         text_detail_url="https://detail.m.tmall.com/item.htm?spm=a320p.7692363.0.0&id={auctionId}".format(auctionId=auctionId)
         ok, response = Html_Downloader.Download_Html(text_detail_url,{}, header)
         if ok:
             matchs=re.compile("sellCount\":(.*?)(?=showShopActivitySize)").findall(response.text)
              if len(matchs) > 0 and "sellCount" in response.text:
                  yxl = matchs[0].encode('utf-8')
                  yxl = yxl.replace(",\"", "")
                  break
         sleep(3)
         count+=1
     return  yxl
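
The pattern sellCount\":(.*?)(?=showShopActivitySize) captures everything between the two markers and then trims the tail with replace(",\"", ""). If only the number after "sellCount": is wanted, a tighter sketch:

import re

SELL_COUNT_RE = re.compile(r'"sellCount"\s*:\s*(\d+)')

def extract_sell_count(page_text):
    # capture only the digits following "sellCount":
    m = SELL_COUNT_RE.search(page_text)
    return int(m.group(1)) if m else -1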
Example No. 11
def get_tag_details(tags, csrfid):
    global TAG_DATA
    if type(tags) != list:
        tags = [tags]
    for tag in tags:
        url = "http://dmp.taobao.com/api/tag/{tag_id}?csrfId={csrfid}&t={time}977" \
            .format(tag_id=tag, csrfid=csrfid, time=int(time.time()))
        proxy = dict()
        proxy['HTTP'] = Utils.GetMyAgent()
        # allow_redirects is used to handle the URI redirect issue
        response = requests.get(url=url,
                                proxies=proxy,
                                verify=False,
                                headers=HEADER,
                                cookies=cookie_dict,
                                allow_redirects=False).text
        data = json.loads(response)
        tag_data = dict()
        # tag description
        tag_data['dmp_msg'] = str(
            data["data"]["tag"]["tagDesc"].split(",")[0])[9:-1]
        # get the tag count
        tag_data['qualityScore'] = str(data["data"]["tag"]["qualityScore"])
        # tag title
        tag_data['tag_name'] = str(data["data"]["tag"]["tagName"])
        # get the option info
        tag_data['options'] = data["data"]["tag"]["options"]
        # get this tag's GroupId
        # tag_data['GroupIds'] = data["data"]["tag"]["optionGroups"]
        # get the tag's type
        option_groups = data["data"]["tag"]["optionGroups"]
        group_id_type = dict()
        tag_data['group_id_type'] = group_id_type
        for group in option_groups:
            tag_data['group_id'] = group['id']
            group_type = group['type']
            group_id_type[group_type] = group['id']
        TAG_DATA[tag] = tag_data
Example No. 12
def get_total_items(time_now, shop_url):
    ip = Utils.GetMyAgent()
    proxies = {
        "http": "http://{ip}".format(ip=ip),
        "https": "http://{ip}".format(ip=ip)
    }
    # shop_url = "https://shop67361531.taobao.com/"
    parms_url = "{shop_url}i/asynSearch.htm?_ksTS={now}569_240&callback=jsonp&mid=w-14766145001-0&wid=14766145001&path=/search.htm&search=y&orderType=hotsell_desc&scene=taobao_shop&pageNo={page_num}"
    url_test = parms_url.format(shop_url=shop_url, now=time_now, page_num=1)
    cookie2 = "UM_distinctid=15e7a46caf311b-0188d989dac01e-5c153d17-144000-15e7a46caf4552; thw=cn; ali_apache_id=11.131.226.119.1505353641652.239211.1; _uab_collina=150538207117146260512386; miid=2033888855982094870; l=AllZcEkSLTy0io2vJcWc-ksY6U4zk02Y; _cc_=WqG3DMC9EA%3D%3D; tg=0; _umdata=85957DF9A4B3B3E8F872A3094256432F0F1549AE1C92C6CCF1E68B982581686F23BFC13A60CCABD1CD43AD3E795C914CAF73DEDAA30E5DF4E27D6F4EB50F8E1F; cna=dNU/EvGIRjsCAQ4XY4PdDkHN; _tb_token_=36f53e7b339f5; _m_h5_tk=4570646d13ae7111fef3e2b7a043022c_1509837913393; _m_h5_tk_enc=20b6e25356a739887ddf33c092c83ece; hng=CN%7Czh-CN%7CCNY%7C156; mt=ci=0_0; x=124660357; uc3=sg2=AVMH%2FcTVYAeWJwo98UZ6Ld9wxpMCVcQb0e1XXZrd%2BhE%3D&nk2=&id2=&lg2=; uss=VAXSokZb41x3LrOdSf%2FkOXi5mZhwOKGqxWNIJ%2BcsBdECv1yvzxoYTiml; tracknick=; sn=%E4%BE%9D%E4%BF%8A%E6%9C%8D%E9%A5%B0%3A%E8%BF%90%E8%90%A5; skt=dfb70b3150f31cce; v=0; cookie2=113bd831048eba21f8cb9e18be45b345; unb=2077259956; t=1630b104e4d32df897451d6c96642469; uc1=cookie14=UoTcBrCpbGSbPg%3D%3D&lng=zh_CN; isg=AoWF8Nl9cPq6fVThFtgQUqaUlMF_6jLBbwXTU4fqQrzLHqWQT5JJpBP-XnQT"
    cookie_dict = {item.split('=')[0]: item.split('=')[1] for item in cookie2.split(';')}
    try:
        response = requests.get(url=url_test, proxies=proxies, headers=HEADERS, verify=False, allow_redirects=False, cookies= cookie_dict)
        html = etree.HTML(response.text)
        if response.status_code == 200 and "search-result" in response.text:
            total_items = html.xpath("//div[contains(@class,'search-result')]/span/text()")[0]
            logging.info("获取店铺{shop_url},宝贝总数:{total_items}".format(total_items=total_items, shop_url=shop_url))
            return total_items
        else:
            return get_total_items(time_now, shop_url)
    except Exception, e:
        logging.error("爬取全店宝贝时{shop_url}时出错:{e}".format(shop_url=shop_url, e=e.message))
        return get_total_items(time_now, shop_url)
Example No. 13
 def crawlTotalpage(self, search_url, header):
     try:
         ok, response = Html_Downloader.Download_Html(
             search_url, {}, header)
         if not ok:
             count = 0
             while (count < 4):
                 sleep(2)
                 agentip = Utils.GetMyAgent()
                 header = {'ip': agentip}
                 ok, response = Html_Downloader.Download_Html(
                     search_url, {}, header)
                 if ok:
                     break
                 count += 1
                 if count == 3:
                     header = {}
         if ok:
             html = etree.HTML(response.text)
             matchs = html.xpath("//script[contains(.,'g_page_config')]")
             if len(matchs) > 0:
                 data = re.compile(
                     "g_page_config=(.*)?;g_srp_loadCss").match(
                         matchs[0].text.replace("\n\n", "\n").replace(
                             "\n", "").replace(" ", ""))
                 if data.lastindex > 0:
                     data = json.loads(data.group(1).encode('utf-8'))
                     if data.has_key('mods'):
                          totalpage = data['mods']['pager']['data'][
                              'totalPage']
                          return totalpage
             else:
                 print("无法匹配有效的json")
         else:
             print("无法匹配到宝贝列表")
     except Exception, e:
         logging.info("关键词{p}第{i}页抓取错误{m}".format(m=e.message))
Example No. 14
 def parse_items(self,jsonArray, shop_id,agentIp):
     shop_items=[]
     # agentIp=None
     header = {'ip': agentIp}
     for item in jsonArray:
          shop_item = {}
          shop_item['shop_id']=shop_id
          auctionId=item.get('auctionId')
          shop_item['item_id']=auctionId
          shop_item['title']= item.get('title')
          shop_item['picUrl']="http:"+item.get('picUrl')
          # shop_item['picUrl']=re.compile("/([^/]*)(?=_)").findall(item.get('picUrl'))[0]
          shop_item['salePrice']= item.get('salePrice')
          shop_item['reservePrice']= item.get('reservePrice')
          shop_item['quantity']= item.get('quantity')
          shop_item['totalSoldQuantity']= item.get('totalSoldQuantity')
           # fetch the details from the item's link
          t_detail_url="https://item.taobao.com/item.htm?spm=a1z10.3-c-s.w4002-14766145001.18.6584ae82X93XhC&id={auctionId}".format(auctionId=auctionId)
          shop_item['crawl_url']=t_detail_url
          print(t_detail_url)
          shop_item['crawl_time'] = long(time.time())
          sold=item.get('sold')
          if "tmall" in self.shop_url:
              sold=""
              sold=self.crawl_yxl(auctionId,agentIp)
           # Tmall monthly sales are fetched separately
          shop_item['sold']=sold
          try:
                ok, response = Html_Downloader.Download_Html(t_detail_url,{}, header)
                if not ok:
                    count =0
                    while(count<4):
                         sleep(2)
                         agentip = Utils.GetMyAgent()
                         header = {'ip': agentip}
                         ok, response = Html_Downloader.Download_Html(t_detail_url,{},header)
                         if ok and "category=item" in response.text:
                             break
                         count+=1
                         if count==3:
                             header={}
                if ok and  "category=item" not in response.text:
                    count =0
                    while(count<4):
                         sleep(2)
                         agentip = Utils.GetMyAgent()
                         header = {'ip': agentip}
                         ok, response = Html_Downloader.Download_Html(t_detail_url,{},header)
                         if ok and "category=item" in response.text:
                             break
                         count+=1
                         if count==3:
                             header={}
                if ok and "category=item" in response.text:
                   html = etree.HTML(response.text)
                   # shop_id = ""
                   category_id= re.compile("item%5f(.*?)(?=&)").findall(response.text)[0]
                   shop_item['category_id']=category_id
                   if html.xpath("//dl[contains(@class,'tb-prop')]"):
                       for prop in html.xpath("//dl[contains(@class,'tb-prop')]"):
                             if not prop in html.xpath("//dl[contains(@class,'tb-hidden')]"):
                                 prop_value_id=[]
                                 prop_name = prop.xpath(".//dt/text()")[0].encode('utf-8')
                                 for value in prop.xpath(".//dd/ul/li"):
                                        sub_value_id= []
                                        sku_id = value.get('data-value')
                                        sub_value_id.append(sku_id)
                                        if value.xpath('./a/span/text()'):
                                            sku_name = value.xpath('./a/span/text()')[0].encode('utf-8')
                                            sub_value_id.append(sku_name)
                                            # prop_value_id.append(";".join(sub_value_id))
                                        if value.xpath('./a')[0].get('style') and re.compile("/([^/]*)(?=_!!|_M2)").findall(
                                            value.xpath('./a')[0].get('style')):
                                            sku_img_url=re.compile("/([^/]*)(?=_!!|_M2)").findall(value.xpath('./a')[0].get('style'))[0]
                                            sub_value_id.append(sku_img_url)
                                        prop_value_id.append(";".join(sub_value_id))
                             # shop_item[prop_name] ="&&||".join(prop_value_id)
                                 shop_item[prop_name] =prop_value_id
                   if html.xpath("//ul[@id='J_UlThumb']"):
                         stype_img_id=[]
                         if html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li/div"):
                             for value1 in html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li/div"):
                                      if value1.xpath('./a')[0].xpath('./img')[0].get('data-src') and re.compile("/([^/]*)(?=_!!|_M2)").findall(
                                            value1.xpath('./a')[0].xpath('./img')[0].get('data-src')):
                                            sku_img_id=re.compile("/([^/]*)(?=_!!|_M2)").findall(value1.xpath('./a')[0].xpath('./img')[0].get('data-src'))[0]
                                            stype_img_id.append(sku_img_id)
                         elif html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li"):
                             for value1 in html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li"):
                                      if value1.xpath('./a')[0].xpath('./img')[0].get('src') and re.compile("/([^/]*)(?=_!!|_M2)").findall(
                                            value1.xpath('./a')[0].xpath('./img')[0].get('src')):
                                            sku_img_id=re.compile("/([^/]*)(?=_!!|_M2)").findall(value1.xpath('./a')[0].xpath('./img')[0].get('src'))[0]
                                            stype_img_id.append(sku_img_id)
                         shop_item["img_attr"]="&&||".join(stype_img_id)
                   if html.xpath("//ul[@id='J_AttrUL']"):
                       styleliList=[]
                       for styleli in html.xpath("//ul[@id='J_AttrUL']")[0].xpath(".//li"):
                            if styleli.xpath('./text()'):
                                 styleliText= styleli.xpath('./text()')[0].encode('utf-8').strip()
                                 styleliList.append(styleliText)
                   elif html.xpath("//div[@id='attributes']"):
                       styleliList=[]
                       for styleli in html.xpath("//div[@id='attributes']")[0].xpath(".//ul/li"):
                            if styleli.xpath('./text()'):
                                 styleliText= styleli.xpath('./text()')[0].encode('utf-8').strip()
                                 styleliList.append(styleliText)
                   shop_item["attribute"]="&&||".join(styleliList)
          except Exception, e:
              logging.info("----详情抓取错误----".format(e=e.message))
          shop_items.append(shop_item)
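
Both the SKU branch and the thumbnail branch above pull an image id out of a CDN url (or inline style) with /([^/]*)(?=_!!|_M2). A small named helper for the same idea:

import re

IMG_ID_RE = re.compile(r"/([^/]*)(?=_!!|_M2)")

def image_id(url_or_style):
    # return the path segment that precedes "_!!" or "_M2" in a CDN image
    # url or inline style, e.g. ".../TB1AbCdE_!!123.jpg" -> "TB1AbCdE"
    m = IMG_ID_RE.search(url_or_style or "")
    return m.group(1) if m else None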
Example No. 15
    logUUZS = logZS + getData
    ok, result = Html_Downloader.Download_Html(logUU, {}, {})
    if ok:
        result_json = json.loads(result.content)
        #result_ok = bool(result_json['status'])
    ok, result = Html_Downloader.Download_Html(logUUZS, {}, {})
    if ok:
        result_json = json.loads(result.content)
        #result_ok = bool(result_json['status'])


if __name__ == "__main__":
    starttime = datetime.now()
    start_time = time.strftime('%Y-%m-%d %H:%M:%S',
                               time.localtime(time.time()))
    agentIp = Utils.GetMyAgent()
    '''
         urlLog="http://192.168.10.198:8080/pdd/CrawlerLogController/getAllNick"
         log="http://192.168.10.198:8080/pdd/CrawlerLogController/SaveCrawlerLog?"
         ok, result = Html_Downloader.Download_Html(urlLog,{},{})
         if ok:
             result_json1 = json.loads(result.content)
              jsonArrayShop=result_json1['data']
             for itemShop in jsonArrayShop:
                shop_url=itemShop.get('shop_url')
                shop_id=itemShop.get('shop_id')
                shop_name=itemShop.get('shop_name')
         '''
    shop_url = "https://newbalancekids.tmall.com/"
    shop_id = "101815493"
    shop_name = "newbalance童鞋旗舰店"
Example No. 16
 def crawl_shop_all_item(self):
     agentIp = Utils.GetMyAgent()
     shop_id = self.shop_id
     shop_name = self.shop_name
     userAgent = Html_Downloader.GetUserAgent()
     header = {'ip': agentIp, 'user-agent': userAgent}
     test_detail_url = "{shop_url}shop/shop_auction_search.do?ajson=1&_tm_source=tmallsearch&spm=a320p.7692171.0.0&sort" \
                       "=d&p={page}&page_size={page_size}&from=h5".format(shop_url=self.shop_url, page_size=1,
                                                                          page=1)
     test_detail_url = test_detail_url.replace(".tmall.com", ".m.tmall.com")
     try:
         ok, response = Html_Downloader.Download_Html(
             test_detail_url, {}, header)
         if not ok:
             count = 0
             while (count < 4):
                 sleep(2)
                 agentip = Utils.GetMyAgent()
                 header = {'ip': agentip}
                 ok, response = Html_Downloader.Download_Html(
                     test_detail_url, {}, header)
                 if ok:
                     break
                 count += 1
                 if count == 3:
                     header = {}
         if ok:
             jsonArray = json.loads(response.content)  # 获取解析的json
             total_page = jsonArray.get("total_page")
             total_results = jsonArray.get("total_results")
             page_size = jsonArray.get("page_size")
             logging.info("shopname:" + shop_name + " total_page:" +
                          total_page + " total_results:" + total_results +
                          " page_size:" + page_size)
             print "total_page:" + total_page + "total_results:" + total_results + "page_size:" + page_size
             for i in range(int(total_page)):
                 print i + 1
                 test_detail_url = "{shop_url}shop/shop_auction_search.do?ajson=1&_tm_source=tmallsearch&spm=a320p.7692171.0.0&sort=d&p={page}&page_size={page_size}&from=h5".format(
                     shop_url=self.shop_url,
                     page_size=page_size,
                     page=i + 1)
                 test_detail_url = test_detail_url.replace(
                     ".tmall.com", ".m.tmall.com")
                 '''
                 if int(total_page)==(i+1):
                     lastCount=int(total_results)-i*int(page_size)
                     ok, response = Html_Downloader.Download_Html(test_detail_url,{}, header)
                     if not ok:
                         count =0
                         while(count<11):
                             sleep(2)
                             agentip = Utils.GetMyAgent()
                             header = {'ip': agentip}
                             ok, response = Html_Downloader.Download_Html(test_detail_url,{},header)
                             if ok  and "price" in response.text and lastCount-response.text.count("price")<2:
                                 break
                             count+=1
                             if count==10:
                                 header={}
                     print  response.text.count('price')
                     if ok and  "price" not in response.text:
                        print "111"
                        count =0
                        while(count<11):
                             sleep(2)
                             agentip = Utils.GetMyAgent()
                             header = {'ip': agentip}
                             ok, response = Html_Downloader.Download_Html(test_detail_url,{},header)
                             if ok and "price" in response.text and lastCount-response.text.count("price")<2:
                                 break
                             count+=1
                             if count==10:
                                 header={}
                     if ok and  lastCount-response.text.count("price")>2:
                        while(count<11):
                             sleep(2)
                             agentip = Utils.GetMyAgent()
                             header = {'ip': agentip}
                             ok, response = Html_Downloader.Download_Html(test_detail_url,{},header)
                             if ok and "price" in response.text and lastCount-response.text.count("price")<2:
                                 break
                             count+=1
                             if count==10:
                                 header={}
                     if ok  and lastCount-response.text.count("price")<2:
                         logging.info("成功获取price字符串并开始解析")
                         self.parse_items(response.content,shop_id,agentIp,shop_name,userAgent)
                 else:
                     '''
                 ok, response = Html_Downloader.Download_Html(
                     test_detail_url, {}, header)
                 if not ok:
                     count = 0
                     while (count < 11):
                         sleep(2)
                         agentip = Utils.GetMyAgent()
                         header = {'ip': agentip}
                         ok, response = Html_Downloader.Download_Html(
                             test_detail_url, {}, header)
                         if ok:
                             break
                         count += 1
                         if count == 10:
                             header = {}
                 if ok:
                     # logging.info("成功获取price字符串并开始解析")
                     self.parse_items(response.content, shop_id, agentIp,
                                      shop_name, userAgent)
     except Exception, e:
         logging.error("抓取店铺:{shop_name}失败,店铺id:{shop_id},错误内容{m}".format(
             shop_name=shop_name,
             shop_id=shop_id,
             m=e.message,
         ))
         crawl_content = "抓取列表页有错"
         message = e.message
         start_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                    time.localtime(time.time()))
         insertLog(crawl_content, message, shop_id, agentIp,
                   test_detail_url, start_time, shop_name)
Example No. 17
def get_coverage(total_combins):
    csrfid = get_csrfId()
    for combine in total_combins:
        DATA = dict()
        # the tag's tagName, obtained via its tagid and stored here
        DATA['tag_name'] = str()
        # label of the option content, taken directly from optionName
        DATA['option_name'] = str()
        # concatenated set of tagids
        DATA['tagids'] = str()
        proxy = dict()
        proxy['HTTP'] = Utils.GetMyAgent()
        url_data = "http://dmp.taobao.com/api/analysis/coverage"
        # the post data has to be rebuilt in a loop before it can be sent
        post_data = dict()
        post_data['csrfId'] = csrfid
        post_data['user'] = 1
        # ideally i here would also be a dict
        # @@ big caveat: when a tag requires user-supplied input it has to be added as a separate tag @@
        same_group_list = list()
        same_group_dict = dict()
        for same_group in combine:
            if 'SHOP' not in str(same_group):
                if same_group['GroupId'] not in same_group_dict.keys():
                    some_group_list2 = list()
                    some_group_list2.append(same_group)
                    same_group_dict[same_group['GroupId']] = some_group_list2
                elif same_group['GroupId'] in same_group_dict.keys():
                    same_group_dict[same_group['GroupId']].append(same_group)
            if 'SHOP' in str(same_group):
                same_group_list.append(same_group)
        same_group_list.append(same_group_dict)
        # num_post = len(same_group_list)
        # if "SHOP" in str(combine):num_post += 1
        # same_group_list = list(set(same_group_list))
        num_post = 0
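        # note: format(a=[num_post]) renders the one-element list as "[0]",
        # "[1]", ..., which is what produces the "options[N].field" keys below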
        for single_combine in same_group_list:
            # start parsing a single request entry; the target shape is a dict
            if 'SHOP' in single_combine:
                shop_list = single_combine['SHOP']
                tagId = single_combine['tagId']
                DATA['tagids'] += str(tagId) + "_"
                for shop in shop_list:
                    if 'shop_id' in str(shop):
                        post_data["options{a}.operatorType".format(
                            a=[num_post])] = 1
                        post_data["options{a}.optionGroupId".format(
                            a=[num_post])] = shop['GroupId']
                        post_data["options{a}.source".format(
                            a=[num_post])] = "all"
                        post_data["options{a}.tagId".format(
                            a=[num_post])] = tagId
                        post_data["options{a}.value".format(
                            a=[num_post])] = shop['shop_id']
                        post_data["options{a}.optionNameMapStr".format(
                            a=[num_post])] = str(
                                {str(shop['shop_id']): shop['shop_name']})
                        num_post += 1
                        # DATA['tag_name'].append(shop['shop_name']+"_")
                        DATA['option_name'] += str(shop['shop_name']) + "_"
                    else:
                        post_data["options{a}.operatorType".format(
                            a=[num_post])] = 1
                        post_data["options{a}.optionGroupId".format(
                            a=[num_post])] = shop['GroupId']
                        post_data["options{a}.source".format(
                            a=[num_post])] = "all"
                        post_data["options{a}.tagId".format(
                            a=[num_post])] = tagId
                        # post_data["options{a}.value".format(a=[num_post])] = shop['shop_id']
                        # the input value is hard-coded because the product spec requires it to be 1
                        post_data["options{a}.value".format(
                            a=[num_post])] = "1~999999999"
                        num_post += 1
                        DATA['tag_name'] += "最近180天店内购买频次" + "_"
                        DATA['option_name'] += "1~999999999_"
            else:
                # @@ NOTE: this should be grouped by groupid; it currently is not, so the data can be wrong @@
                for key in single_combine.keys():
                    # key = single_combine.keys()[keys]
                    optionNameMapStr = dict()
                    value = list()
                    for same_groop_tag in single_combine[key]:
                        optionValue = int(same_groop_tag['optionValue'])
                        optionNameMapStr[str(
                            optionValue)] = same_groop_tag['optionName']
                        tagId = same_groop_tag['tagId']
                        option_name_forDATA = list()
                        option_name_forDATA.append(
                            str(same_groop_tag['optionName']))
                        value.append(optionValue)
                        for data in option_name_forDATA:
                            DATA['option_name'] += data + "_"
                    DATA['tagids'] += str(tagId) + "_"
                    post_data["options{a}.operatorType".format(
                        a=[num_post])] = 1
                    post_data["options{a}.optionGroupId".format(
                        a=[num_post])] = key
                    post_data["options{a}.source".format(a=[num_post])] = "all"
                    post_data["options{a}.tagId".format(a=[num_post])] = tagId
                    post_data["options{a}.value".format(a=[num_post])] = value
                    post_data["options{a}.optionNameMapStr".format(
                        a=[num_post])] = str(optionNameMapStr)
                    DATA['tag_name'] += str(TAG_DATA[tagId]['tag_name']) + "_"
                    num_post += 1
        logging.info(post_data)
        try:
            response = requests.post(url_data,
                                     proxies=proxy,
                                     verify=False,
                                     headers=HEADER,
                                     cookies=cookie_dict,
                                     allow_redirects=False,
                                     data=post_data).text
            DATA['count'] = str(json.loads(response)['data']['coverage'])
        except Exception, e:
            logging.error("解析得到响应数据时发生错误{e}".format(e=e))
            response = requests.post(url_data,
                                     proxies=proxy,
                                     verify=False,
                                     headers=HEADER,
                                     cookies=cookie_dict,
                                     allow_redirects=False,
                                     data=post_data).text
        DATA['tag_name'] = str(DATA['tag_name'])[:-1]
        DATA['option_name'] = str(DATA['option_name'])[:-1]
        DATA['tagids'] = str(DATA['tagids'])[:-1]
        DATA['crawl_time'] = int(time.time())
        local_set.save(DATA)
        print response
Example No. 18
 def crawlMonthSales(self, nid, agentip):
     try:
         month_Sales = ""
         nid_url = "https://mdskip.taobao.com/core/initItemDetail.htm?itemId={nid}"
         refer_url = "https://detail.taobao.com/item.htm?id={nid}"
         nid_Url = nid_url.format(nid=nid)
         nid_refer = refer_url.format(nid=nid)
         cookies = "x=__ll%3D-1%26_ato%3D0; l=AhERSU92PmRba9QUgSCkQMF6oRaqOoXt; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; cna=dNU/EvGIRjsCAQ4XY4PdDkHN; _m_h5_tk=7d8d6e65e5c676a6d0a69c26f7436ea1_1510363282671; _m_h5_tk_enc=e32129060738b7ce01e9114c9bec037f; sm4=440100; hng=CN%7Czh-CN%7CCNY%7C156; uc1=cookie14=UoTde95xncLyFQ%3D%3D&lng=zh_CN; uc3=sg2=Vq0THzNyGHIH22DuvMx9ZEwXL5qc2kn7REWHdois6v0%3D&nk2=&id2=&lg2=; uss=AQDPJiEXAu47o41b5k%2BKpKRT3Ckpz9nqnJX2F%2F7kZG6ttuI82ZnQa7ZL; t=1630b104e4d32df897451d6c96642469; unb=2607292494; sn=sitiselected%E6%97%97%E8%88%B0%E5%BA%97%3A%E5%A4%A7%E9%BA%A6; _tb_token_=eef7bd7b7abd6; cookie2=23bb087c638814ce8a8e329ead5332d4; isg=ApqaMZmelJirXxuDoGSRqtW160B8YxWwfLxcMqQTRi34FzpRjFtutWDlkdVw"
         # cookies="_tb_token_=f3fe5d65a6591;cookie2=171e5eb92d66332b1d52d9e2730fed33;t=bf64b0d40d912c08dd434661471b2c98;v=0"
         cookie_dict = {
             item.split('=')[0]: item.split('=')[1]
             for item in cookies.split(';')
         }
         header = {
             'ip': agentip,
             'Referer': nid_refer,
             "cookies": cookie_dict,
             'User-Agent': Html_Downloader.GetUserAgent()
         }
         ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
         if not ok:
             count = 0
             while count < 5:
                 sleep(2)
                 agentip = Utils.GetMyAgent()
                 header = {
                     'ip': agentip,
                     'Referer': nid_refer,
                     'timeout': '5000',
                     "cookies": cookie_dict,
                     'User-Agent': Html_Downloader.GetUserAgent()
                 }
                 ok, response = Html_Downloader.Download_Html(
                     nid_Url, {}, header)
                 if ok:
                     break
                 count += 1
                 print "获取月销量第{count}试错".format(count=count)
         if ok and "sellCount\":" not in response.text:
             count = 0
             while count < 10:
                 sleep(2)
                 agentip = Utils.GetMyAgent()
                 header = {
                     'ip': agentip,
                     'Referer': nid_refer,
                     'timeout': '5000',
                     "cookies": cookie_dict,
                     'User-Agent': Html_Downloader.GetUserAgent()
                 }
                 if count == 9:
                     header = {}
                 ok, response = Html_Downloader.Download_Html(
                     nid_Url, {}, header)
                 if ok and "sellCount\":" in response.text:
                     break
                 count += 1
                 print "sellCount不在反馈中,获取月销量第{count}试错".format(count=count)
         if ok and "sellCount\":" in response.text:
             month_Sales = str(
                 re.compile("sellCount\":(.*?)(?=\"success\")").findall(
                     response.text)[0]).replace(",",
                                                "").replace(",",
                                                            "").strip()
             print "获得月销量为:{month_Sales}".format(month_Sales=month_Sales)
             return month_Sales
     except Exception, e:
         logging.info("月销量爬取错误{m}".format(m=e.message))
Example No. 19
    def parse_items(self, content, shop_id, agentIp, shop_name, userAgent):
        try:
            # start_time2=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            jsonArray = json.loads(content)
            jsonResult = jsonArray.get("items")
            shop_items = []
            header = {'ip': agentIp, 'user-agent': userAgent}
            print "开始解析列表json数据"
            for item in jsonResult:
                shop_item = {}
                shop_item['shop_id'] = str(shop_id)
                shop_item['shop_name'] = shop_name
                item_id = str(item.get("item_id")).strip()
                shop_item['item_id'] = item_id
                shop_item['title'] = item.get('title').encode('utf-8')
                shop_item['picUrl'] = "https:" + item.get('img')
                # print  item.get('price')
                # current sale price
                # shop_item['salePrice'] = item.get('price')
                shop_item['totalSoldQuantity'] = str(
                    item.get('totalSoldQuantity'))
                crawl_url = "https:" + item.get('url')
                shop_item['crawl_url'] = crawl_url.replace(
                    ".m.tmall.com", ".tmall.com")
                shop_item['crawl_time'] = long(time.time())
                # URL of the quantity API
                # URL of the items API
                category_id = ""
                category_id_Url = "http://d.da-mai.com/index.php?r=itemApi/getItemInfoByItemId&item_id=" + item_id
                ok, response = Html_Downloader.Download_Html(
                    category_id_Url, {}, header)
                if not ok:
                    count = 0
                    while count < 4:
                        sleep(1)
                        agentip = Utils.GetMyAgent()
                        header = {'ip': agentip}
                        if count == 3:
                            header = {}
                        ok, response = Html_Downloader.Download_Html(
                            category_id_Url, {}, header)
                        if ok:
                            break
                        count += 1

                if ok:
                    jsonItems = json.loads(response.content)
                    category_id = jsonItems['data']['data']['cid']
                total_quantity = 0
                quantity_Url = "http://yj.da-mai.com/index.php?r=itemskus/getSkus&fields=*&num_iids={item_id}".format(
                    item_id=item_id)
                ok, response = Html_Downloader.Download_Html(
                    quantity_Url, {}, header)
                if not ok:
                    count = 0
                    while (count < 4):
                        sleep(2)
                        agentip = Utils.GetMyAgent()
                        header = {'ip': agentip}
                        ok, response = Html_Downloader.Download_Html(
                            quantity_Url, {}, header)
                        if ok and "quantity" in response.text:
                            break
                        count += 1
                        if count == 3:
                            header = {}
                if ok and "quantity" not in response.text:
                    count = 0
                    while (count < 4):
                        sleep(2)
                        agentip = Utils.GetMyAgent()
                        header = {'ip': agentip}
                        ok, response = Html_Downloader.Download_Html(
                            quantity_Url, {}, header)
                        if ok and "quantity" in response.text:
                            break
                        count += 1
                        if count == 3:
                            header = {}
                if ok and "quantity" in response.text:
                    print "成功获取sku的json字符串并开始解析"
                    jsonItems = json.loads(response.content)  # 获取解析的json
                    total_data = jsonItems.get("data")
                    for date in total_data:
                        quantity = date.get("quantity")
                        total_quantity = total_quantity + quantity
                shop_item['category_id'] = str(category_id)
                shop_item['quantity'] = str(total_quantity)
                agentip = Utils.GetMyAgent()
                shop_item['month_Sales'] = self.crawlMonthSales(
                    item_id, agentip)
                shop_items.append(shop_item)
            post_data = {'data': json.dumps(shop_items)}
            if not self.process_request(SAVE__INSERT_API, post_data):
                sleep(3)
                self.process_request(SAVE__INSERT_API, post_data)
            # if not self.process_request(SAVE__INSERT_API_ZS, post_data):
            #     sleep(3)
            #     self.process_request(SAVE__INSERT_API_ZS, post_data)
        except Exception, e:
            logging.info(
                "抓取店铺:{shop_name}失败,店铺id:{shop_id},错误内容{m}".
                format(shop_name=shop_name, shop_id=shop_id, m=e.message))
            crawl_content = "解析接口数据有误"
            message = e.message
            end_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                     time.localtime(time.time()))
            insertLog(crawl_content, message, shop_id, agentIp, "", end_time,
                      shop_name)
Example No. 20
def get_coverage(options_list, csrfid):
    proxy = dict()
    proxy['HTTP'] = Utils.GetMyAgent()
    url_data = "http://dmp.taobao.com/api/analysis/coverage"
    for option in options_list:
        num = 2
        post_data = dict()
        post_data['csrfId'] = csrfid
        post_data['user'] = 1
        # start assembling the post data; the main tag (the shop tag) goes first
        post_data["options[{a}].operatorType".format(a=0)] = 1
        post_data["options[{a}].optionGroupId".format(a=0)] = 304
        post_data["options[{a}].source".format(a=0)] = "all"
        post_data["options[{a}].tagId".format(a=0)] = 110063
        post_data["options[{a}].value".format(a=0)] = 145841584  # 填shopId
        # post_data["options[{a}].value".format(a=0)] = 36236493  # 填shopId
        # post_data["options[{a}].optionNameMapStr".format(a=0)] = '{"36236493":"依俊服饰"}'  # 填店铺信息
        post_data["options[{a}].optionNameMapStr".format(
            a=0)] = '{"145841584":"麦斯威尔旗舰店"}'  # 填店铺信息
        post_data["options[{a}].operatorType".format(a=1)] = 1
        post_data["options[{a}].optionGroupId".format(a=1)] = 111
        post_data["options[{a}].source".format(a=1)] = "all"
        post_data["options[{a}].tagId".format(a=1)] = 110063
        post_data["options[{a}].value".format(a=1)] = "1~999999999"
        key_list = option.keys()
        for key in key_list:
            # each key represents one tagId
            option_id = option[key]
            options = TAG_DATA[key]['options']
            # DATA['tag_name'] += TAG_DATA[key]['tag_name'] + '_'
            for get_right_option in options:
                DATA = dict()
                DATA['id'] = str()
                # DATA['tag_name'] = str()
                DATA['option_value'] = str()
                DATA['option_group_id'] = str()
                DATA['option_name'] = "本店圈定人数"
                DATA['id'] += '110063_110063_'
                DATA['option_value'] += '145841584_1~9999999_'
                DATA['option_group_id'] += '304_111_'
                DATA['id'] += str(key) + '_'
                DATA['option_group_id'] += str(TAG_DATA[key]['group_id']) + '_'
                optionGroupId = get_right_option['optionGroupId']
                optionValue = get_right_option['optionValue']
                optionName = get_right_option['optionName']
                if option_id == get_right_option['id']:
                    post_data['options[{num}].operatorType'.format(
                        num=num)] = 1
                    post_data['options[{num}].optionGroupId'.format(
                        num=num)] = optionGroupId
                    post_data['options[{num}].source'.format(num=num)] = 'all'
                    post_data['options[{num}].tagId'.format(num=num)] = key
                    post_data['options[{num}].value'.format(
                        num=num)] = optionValue
                    optionNameMapStr = dict()
                    optionNameMapStr[optionValue] = str(optionName)
                    DATA['count_name'] = get_right_option['optionName']
                    post_data['options[{num}].optionNameMapStr'.format(
                        num=num)] = str(optionNameMapStr)
                    DATA['option_value'] += get_right_option[
                        'optionValue'] + '_'
                    # DATA['option_group_id'] += get_right_option
                    num += 1
                    time.sleep(2)
                    response = requests.post(url_data,
                                             proxies=proxy,
                                             verify=False,
                                             headers=HEADER,
                                             cookies=cookie_dict,
                                             allow_redirects=False,
                                             data=post_data).text
                    DATA['count'] = str(
                        json.loads(response)['data']['coverage'])
                    DATA['crawl_time'] = int(time.time())
                    # DATA['tag_name'] = str(DATA['tag_name'])[:-1]
                    DATA['id'] = str(DATA['id'])[:-1]
                    DATA['option_group_id'] = str(DATA['option_group_id'])[:-1]
                    DATA['option_value'] = str(DATA['option_value'])[:-1]
                    print DATA
                    local_set.save(DATA)
            if key == 8:
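                # tag 8 (like tag 7 below) appears to take numeric ranges, so each
                # bucket in the list below is queried one by one instead of using
                # the options defined in TAG_DATA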
                for value in [
                        '0~4', '5~9', '10~20', '21~31', '32~42', '43~53',
                        '53~99999'
                ]:
                    DATA = dict()
                    DATA['id'] = str()
                    # DATA['tag_name'] = str()
                    DATA['option_value'] = str()
                    DATA['option_group_id'] = str()
                    DATA['option_name'] = "本店圈定人数"
                    DATA['id'] += '110063_110063_'
                    DATA['option_value'] += '145841584_1~9999999_'
                    DATA['option_group_id'] += '304_111_'
                    DATA['id'] += str(key) + '_'
                    DATA['option_group_id'] += str(
                        TAG_DATA[key]['group_id']) + '_'
                    post_data['options[{num}].operatorType'.format(
                        num=num)] = 1
                    post_data['options[{num}].optionGroupId'.format(
                        num=num)] = 8
                    post_data['options[{num}].source'.format(num=num)] = 'all'
                    post_data['options[{num}].tagId'.format(num=num)] = key
                    post_data['options[{num}].value'.format(num=num)] = value
                    DATA['count_name'] = value
                    DATA['option_value'] += value
                    response = requests.post(url_data,
                                             proxies=proxy,
                                             verify=False,
                                             headers=HEADER,
                                             cookies=cookie_dict,
                                             allow_redirects=False,
                                             data=post_data).text
                    DATA['count'] = str(
                        json.loads(response)['data']['coverage'])
                    DATA['crawl_time'] = int(time.time())
                    # DATA['option_group_id'] += "8"
                    # DATA['id'] += "8"
                    # DATA['tag_name'] = str(DATA['tag_name'])[:-1]
                    DATA['id'] = str(DATA['id'])[:-1]
                    DATA['option_group_id'] = str(DATA['option_group_id'])[:-1]
                    DATA['option_value'] = str(DATA['option_value'])
                    print DATA
                    local_set.save(DATA)
                    DATA['option_value'] = '145841584_1~9999999_'
                    DATA['option_group_id'] = '304_111_'
                    DATA['id'] = '110063_110063_'
                break
            if key == 7:
                for value in [
                        '0~500', '500~1000', '1000~2000', '2000~3000',
                        '3000~99999'
                ]:
                    DATA = dict()
                    DATA['id'] = str()
                    # DATA['tag_name'] = str()
                    DATA['option_value'] = str()
                    DATA['option_group_id'] = str()
                    DATA['option_name'] = "本店圈定人数"
                    DATA['id'] += '110063_110063_'
                    DATA['option_value'] += '145841584_1~9999999_'
                    DATA['option_group_id'] += '304_111_'
                    DATA['id'] += str(key) + '_'
                    DATA['option_group_id'] += str(
                        TAG_DATA[key]['group_id']) + '_'
                    post_data['options[{num}].operatorType'.format(
                        num=num)] = 1
                    post_data['options[{num}].optionGroupId'.format(
                        num=num)] = key
                    post_data['options[{num}].source'.format(num=num)] = 'all'
                    post_data['options[{num}].tagId'.format(num=num)] = key
                    post_data['options[{num}].value'.format(num=num)] = value
                    DATA['count_name'] = value
                    DATA['option_value'] += value
                    response = requests.post(url_data,
                                             proxies=proxy,
                                             verify=False,
                                             headers=HEADER,
                                             cookies=cookie_dict,
                                             allow_redirects=False,
                                             data=post_data).text
                    DATA['count'] = str(
                        json.loads(response)['data']['coverage'])
                    DATA['crawl_time'] = int(time.time())
                    # DATA['tag_name'] = str(DATA['tag_name'])[:-1]
                    DATA['option_group_id'] += "7"
                    DATA['id'] += "7"
                    DATA['id'] = str(DATA['id'])[:-2]
                    DATA['option_group_id'] = str(DATA['option_group_id'])[:-1]
                    DATA['option_value'] = str(DATA['option_value'])
                    print DATA
                    local_set.save(DATA)
                    DATA['option_value'] = '145841584_1~9999999_'
                    DATA['option_group_id'] = '304_111_'
                    DATA['id'] = '110063_110063_'
                break
Exemplo n.º 21
0
def get_detail():
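    # Iterates over items previously saved in local_set and, for each one, calls the
    # sycm.taobao.com rank APIs; with the other blocks commented out, only the Top-10
    # deal-keyword data is fetched and written to local_set2.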
    rabbits = local_set.find({"所属店铺": "六只兔子 高端内裤 内衣店"})
    rabbits2 = local_set.find({"所属店铺": "莎琪儿私藏内衣店"})
    for rabbit in rabbits2:
        # traffic-driving keywords
        coming_url_coming = "https://sycm.taobao.com/mq/rank/listItemSeKeyword.json?cateId={cate}&categoryId={cate}&dateRange=2017-11-07%7C2017-11-13&dateRangePre=2017-11-06|2017-11-12&dateType=recent7&dateTypePre=recent7&device=0&devicePre=0&itemDetailType=1&itemId={itemid}&latitude=undefined&rankTabIndex=0&rankType=1&seller=-1&token=67868107c&view=detail&_=1510630823012".format(
            cate=rabbit['cate'], itemid=rabbit['item_id'])
        # Top-10 deal (transaction) keywords
        coming_url_deal = "https://sycm.taobao.com/mq/rank/listKeywordOrder.json?cateId={cate}&categoryId={cate}&dateRange=2017-11-07%7C2017-11-13&dateRangePre=2017-11-06|2017-11-12&dateType=recent7&dateTypePre=recent7&device=0&devicePre=0&itemDetailType=1&itemId={itemid}&latitude=undefined&rankTabIndex=0&rankType=1&seller=-1&token=67868107c&view=detail&_=1510630823016".format(
            cate=rabbit['cate'], itemid=rabbit['item_id'])
        # wireless (mobile) traffic sources
        coming_url_wlSeList = "https://sycm.taobao.com/mq/rank/listItemSrcFlow.json?cateId={cate}&categoryId={cate}&dateRange=2017-11-13%7C2017-11-13&dateRangePre=2017-11-07|2017-11-13&dateType=recent1&dateTypePre=recent7&device=2&devicePre=0&itemDetailType=1&itemId={itemid}&rankTabIndex=0&rankType=1&seller=-1&token=67868107c&view=detail&_=1510632184267".format(
            cate=rabbit['cate'], itemid=rabbit['item_id'])
        # PC traffic sources
        coming_url_PC = "https://sycm.taobao.com/mq/rank/listItemSrcFlow.json?cateId={cate}&categoryId={cate}&dateRange=2017-11-13%7C2017-11-13&dateRangePre=2017-11-07|2017-11-13&dateType=recent1&dateTypePre=recent7&device=1&devicePre=0&itemDetailType=1&itemId={itemid}&rankTabIndex=0&rankType=1&seller=-1&token=67868107c&view=detail&_=1510632184266".format(
            cate=rabbit['cate'], itemid=rabbit['item_id'])
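        # note: the date ranges and the sycm token embedded in these URLs are
        # hard-coded to a fixed week (2017-11-07 ~ 2017-11-13)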
        agentIp = dict()
        agentIp['https'] = Utils.GetMyAgent()  # lowercase scheme key so requests applies the proxy
        try:
            '''
            if coming_url_coming:
                time.sleep(4)
                response = requests.get(url=coming_url_coming, headers=HEADERS, proxies=agentIp, verify=False,
                                        cookies=cookie_dict).text
                if "操作成功" in response:
                    need_datas = json.loads(response)['content']['data']
                    for need_date in need_datas:
                        for need in need_datas[str(need_date)]:
                            data = dict()
                            data['name'] = "Top10引流关键词"
                            data['coming_from'] = str(need_date)
                            # data_keyword = dict()
                            data['shop_name'] = "六只兔子 高端内裤 内衣店"
                            data['keyword'] = need['keyword']
                            data['uv'] = need['uv']
                            data['itemid'] = rabbit['item_id']
                            # data_keyword_list.append(data_keyword)
                            # data['流量明细'] = data_keyword_list
                            local_set2.save(data)
                            # data = dict()
                else:
                    time.sleep(30)
                    response = requests.get(url=coming_url_coming, headers=HEADERS, proxies=agentIp, verify=False,
                                            cookies=cookie_dict).text
                    need_datas = json.loads(response)['content']['data']
                    for need_date in need_datas:
                        for need in need_datas[str(need_date)]:
                            data = dict()
                            data['name'] = "Top10引流关键词"
                            data['coming_from'] = str(need_date)
                            # data_keyword = dict()
                            data['shop_name'] = "六只兔子 高端内裤 内衣店"
                            data['keyword'] = need['keyword']
                            data['uv'] = need['uv']
                            data['itemid'] = rabbit['item_id']
                            # data_keyword_list.append(data_keyword)
                            # data['流量明细'] = data_keyword_list
                            local_set2.save(data)
                            # data = dict()
            '''
            if coming_url_deal:
                time.sleep(4)
                response = requests.get(url=coming_url_deal,
                                        headers=HEADERS,
                                        proxies=agentIp,
                                        verify=False,
                                        cookies=cookie_dict).text
                if "操作成功" in response:
                    need_datas = json.loads(response)['content']['data']
                    for need_date in need_datas:
                        for need in need_datas[str(need_date)]:
                            data = dict()
                            data['name'] = "Top10成交关键词"
                            data['coming_from'] = str(need_date)
                            # data_keyword = dict()
                            data['shop_name'] = "六只兔子 高端内裤 内衣店"
                            data['keyword'] = need['keyword']
                            data['value'] = need['value']
                            data['itemid'] = rabbit['item_id']
                            # data_keyword_list.append(data_keyword)
                            # data['流量明细'] = data_keyword_list
                            local_set2.save(data)
                            # data = dict()
                else:
                    time.sleep(30)
                    response = requests.get(url=coming_url_deal,
                                            headers=HEADERS,
                                            proxies=agentIp,
                                            verify=False,
                                            cookies=cookie_dict).text
                    need_datas = json.loads(response)['content']['data']
                    for need_date in need_datas:
                        for need in need_datas[str(need_date)]:
                            data = dict()
                            data['name'] = "Top10成交关键词"
                            data['coming_from'] = str(need_date)
                            # data_keyword = dict()
                            data['shop_name'] = "六只兔子 高端内裤 内衣店"
                            data['keyword'] = need['keyword']
                            data['value'] = need['value']
                            data['itemid'] = rabbit['item_id']
                            local_set2.save(data)
            '''
            if coming_url_wlSeList:
                time.sleep(4)
                response = requests.get(url=coming_url_PC, headers=HEADERS, proxies=agentIp, verify=False,
                                        cookies=cookie_dict).text
                if "操作成功" in response:
                    need_datas = json.loads(response)['content']['data']
                    for need_date in need_datas:
                        for need in need_datas[str(need_date)]:
                            data = dict()
                            data['name'] = "无线端来源"
                            data['coming_from'] = str(need_date)
                            # data_keyword = dict()
                            data['shop_name'] = "六只兔子 高端内裤 内衣店"
                            data['pageName'] = need['pageName']
                            data['uv'] = need['uv']
                            data['pv'] = need['pv']
                            data['uvRate'] = need['uvRate']
                            data['pvRate'] = need['pvRate']
                            data['itemid'] = rabbit['item_id']
                            local_set2.save(data)
                else:
                    time.sleep(30)
                    response = requests.get(url=coming_url_PC, headers=HEADERS, proxies=agentIp, verify=False,
                                            cookies=cookie_dict).text
                    need_datas = json.loads(response)['content']['data']
                    for need_date in need_datas:
                        for need in need_datas[str(need_date)]:
                            data = dict()
                            data['name'] = "Top10成交关键词"
                            data['coming_from'] = str(need_date)
                            data['shop_name'] = "六只兔子 高端内裤 内衣店"
                            data['pageName'] = need['pageName']
                            data['uv'] = need['uv']
                            data['pv'] = need['pv']
                            data['uvRate'] = need['uvRate']
                            data['pvRate'] = need['pvRate']
                            data['itemid'] = rabbit['item_id']
                            local_set2.save(data)
            '''
            '''
            if coming_url_wlSeList:
                time.sleep(4)
                response = requests.get(url=coming_url_wlSeList, headers=HEADERS, proxies=agentIp, verify=False,
                                        cookies=cookie_dict).text
                if "操作成功" in response:
                    need_datas = json.loads(response)['content']['data']
                    for need_date in need_datas:
                        #for need in need_datas[str(need_date)]:
                        data = dict()
                        data['name'] = "无线端来源"
                        # data['coming_from'] = str(need_date)
                        # data_keyword = dict()
                        data['shop_name'] = "六只兔子 高端内裤 内衣店"
                        data['pageName'] = need_date['pageName']
                        data['uv'] = need_date['uv']
                        data['pv'] = need_date['pv']
                        data['uvRate'] = need_date['uvRate']
                        data['pvRate'] = need_date['pvRate']
                        data['itemid'] = rabbit['item_id']
                        local_set2.save(data)
                else:
                    time.sleep(30)
                    response = requests.get(url=coming_url_wlSeList, headers=HEADERS, proxies=agentIp, verify=False,
                                            cookies=cookie_dict).text
                    need_datas = json.loads(response)['content']['data']
                    for need_date in need_datas:
                        # for need in need_datas[str(need_date)]:
                        data = dict()
                        data['name'] = "无线端来源"
                        # data['coming_from'] = str(need_date)
                        data['shop_name'] = "六只兔子 高端内裤 内衣店"
                        data['pageName'] = need_date['pageName']
                        data['uv'] = need_date['uv']
                        data['pv'] = need_date['pv']
                        data['uvRate'] = need_date['uvRate']
                        data['pvRate'] = need_date['pvRate']
                        data['itemid'] = rabbit['item_id']
                        local_set2.save(data)
            '''
        except Exception, e:
            print e
Exemplo n.º 22
0
def get_coverage(options_list, csrfid):
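    # Variant of the get_coverage above: all selected tag options go into a single
    # POST, and one DATA document records the joined tag ids/names, option names and
    # the returned coverage count.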
    proxy = dict()
    proxy['http'] = Utils.GetMyAgent()  # requests matches proxies by lowercase scheme
    url_data = "http://dmp.taobao.com/api/analysis/coverage"
    for option in options_list:
        num = 2
        post_data = dict()
        post_data['csrfId'] = csrfid
        post_data['user'] = 1
        # Assemble the POST data: build the main (shop) tag options first
        post_data["options[{a}].operatorType".format(a=0)] = 1
        post_data["options[{a}].optionGroupId".format(a=0)] = 304
        post_data["options[{a}].source".format(a=0)] = "all"
        post_data["options[{a}].tagId".format(a=0)] = 110063
        post_data["options[{a}].value".format(a=0)] = 145841584  # shopId goes here
        post_data["options[{a}].optionNameMapStr".format(
            a=0)] = '{"145841584":"麦斯威尔旗舰店"}'  # shop info: {shopId: shop name}
        post_data["options[{a}].operatorType".format(a=1)] = 1
        post_data["options[{a}].optionGroupId".format(a=1)] = 111
        post_data["options[{a}].source".format(a=1)] = "all"
        post_data["options[{a}].tagId".format(a=1)] = 110063
        post_data["options[{a}].value".format(a=1)] = "1~999999999"
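        # DATA accumulates underscore-joined tag ids, tag names and option names for
        # this combination; count and crawl_time are filled in after the request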
        DATA = dict()
        DATA['tag_id'] = str()
        DATA['tag_name'] = str()
        DATA['option_name'] = str()
        key_list = option.keys()
        for key in key_list:
            # each key is a tagId
            option_id = option[key]
            options = TAG_DATA[key]['options']
            DATA['tag_name'] += TAG_DATA[key]['tag_name'] + '_'
            DATA['tag_id'] += str(key) + '_'
            for get_right_option in options:
                if option_id == get_right_option['id']:
                    optionGroupId = get_right_option['optionGroupId']
                    optionValue = get_right_option['optionValue']
                    optionName = get_right_option['optionName']
                    post_data['options[{num}].operatorType'.format(
                        num=num)] = 1
                    post_data['options[{num}].optionGroupId'.format(
                        num=num)] = optionGroupId
                    post_data['options[{num}].source'.format(num=num)] = 'all'
                    post_data['options[{num}].tagId'.format(num=num)] = key
                    post_data['options[{num}].value'.format(
                        num=num)] = optionValue
                    optionNameMapStr = dict()
                    optionNameMapStr[optionValue] = str(optionName)
                    post_data['options[{num}].optionNameMapStr'.format(
                        num=num)] = str(optionNameMapStr)
                    DATA['option_name'] += get_right_option['optionName'] + '_'
                    num += 1
        response = requests.post(url_data,
                                 proxies=proxy,
                                 verify=False,
                                 headers=HEADER,
                                 cookies=cookie_dict,
                                 allow_redirects=False,
                                 data=post_data).text
        DATA['tag_name'] += "制定店铺用户_"
        DATA['tag_id'] += '110063_'
        DATA['option_name'] += '1~9999999_麦斯威尔旗舰店_'
        DATA['count'] = str(json.loads(response)['data']['coverage'])
        DATA['crawl_time'] = int(time.time())
        DATA['tag_name'] = str(DATA['tag_name'])[:-1]
        DATA['tag_id'] = str(DATA['tag_id'])[:-1]
        DATA['option_name'] = str(DATA['option_name'])[:-1]
        print DATA
        local_set.save(DATA)
Exemplo n.º 23
0
 def crawlNid(self, data, i, agentip, agentipjj):
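      # Parses the "auctions" array from a Taobao search-result page: records title,
      # price, sales and rank for each item, fetches the detail page for category id,
      # shop id, SKU properties and attributes, then pages the shop item list via
      # get_total_sales to resolve totalSoldQuantity before posting the record to
      # SAVE_INSERT_KEYWORD_API.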
     items = data['mods']['itemlist']['data']['auctions']
     x = (i - 1) * 44 + 1
     agentip = Utils.GetMyAgent()
     for item in items:
         shop_items = []
         shop_item = {}
         shop_item['keyword'] = self.key_word
         title = item['title']
         isTmall = item['shopcard']['isTmall']
         shop_item['isTmall'] = isTmall
          title = title.replace("<span class=H>",
                                "").replace("</span>", "").strip()
         shop_item['title'] = title
         nid = item['nid'].strip()
         shop_item['item_id'] = nid
         view_sales = item['view_sales'].strip()
         view_sales = view_sales.replace("人收货,", "").replace("人收货",
                                                             "").strip()
         shop_item['view_sales'] = view_sales
         shop_item['view_price'] = item['view_price'].strip()
         shop_item['picUrl'] = "http:" + item['pic_url'].strip()
         shop_item['idnick'] = item['nick'].strip()
         shop_item['crawl_time'] = long(time.time())
         shop_item['rank'] = x
         print(x)
         if x == 101:
             break
         x += 1
         #if(x<=5):
         # continue
         detail_url = "https://detail.tmall.com/item.htm?spm=a230r.1.14.1.ebb2eb2PXquhm&id={nid}&ns=1&abbucket=20"
         t_detail_url = detail_url.format(nid=nid)
         header = {'ip': agentip}
          shop_id = ""  # default so the assignment after the try block cannot fail with NameError
          try:
             sleep(2)
             ok, response = Html_Downloader.Download_Html(
                 t_detail_url, {}, header)
             if not ok:
                 count = 0
                 while (count < 4):
                     sleep(2)
                     agentip = Utils.GetMyAgent()
                     header = {'ip': agentip}
                     ok, response = Html_Downloader.Download_Html(
                         t_detail_url, {}, header)
                     if ok:
                         break
                     count += 1
                     if count == 3:
                         header = {}
             if ok:
                 html = etree.HTML(response.text)
                 if not "shopid" in response.text:
                     count = 0
                     while (count < 4):
                         sleep(2)
                         agentip = Utils.GetMyAgent()
                         header = {'ip': agentip}
                         ok, response = Html_Downloader.Download_Html(
                             t_detail_url, {}, header)
                         if ok:
                             html = etree.HTML(response.text)
                             if "shopid" in response.text:
                                 break
                         count += 1
                         if count == 3:
                             header = {}
                 shop_id = ""
                 # user_id=""
                 category_id = re.compile("item%5f(.*?)(?=&)").findall(
                     response.text)[0]
                 shop_item['category_id'] = category_id
                 if html.xpath("//meta[@name='microscope-data']"):
                     for meta in html.xpath(
                             "//meta[@name='microscope-data']")[0].get(
                                 'content').split(';'):
                         if 'shopid' in meta.lower():
                             shop_id = meta.split("=")[1]
                         # if 'userid=' in meta.lower():
                         #   user_id= meta.split("=")[1]
                 if html.xpath("//dl[contains(@class,'tb-prop')]"):
                     for prop in html.xpath(
                             "//dl[contains(@class,'tb-prop')]"):
                         if not prop in html.xpath(
                                 "//dl[contains(@class,'tb-hidden')]"):
                             prop_value_id = []
                             prop_name = prop.xpath(
                                 ".//dt/text()")[0].encode('utf-8')
                             for value in prop.xpath(".//dd/ul/li"):
                                 sub_value_id = []
                                 sku_id = value.get('data-value')
                                 sub_value_id.append(sku_id)
                                 if value.xpath('./a/span/text()'):
                                     sku_name = value.xpath(
                                         './a/span/text()')[0].encode(
                                             'utf-8')
                                     sub_value_id.append(sku_name)
                                 if value.xpath('./a')[0].get(
                                         'style') and re.compile(
                                             "/([^/]*)(?=_!!|_M2)").findall(
                                                 value.xpath('./a')[0].get(
                                                     'style')):
                                     sku_img_url = re.compile(
                                         "/([^/]*)(?=_!!|_M2)").findall(
                                             value.xpath('./a')[0].get(
                                                 'style'))[0]
                                     sub_value_id.append(sku_img_url)
                                 prop_value_id.append(
                                     ";".join(sub_value_id))
                             shop_item[prop_name] = prop_value_id
                 if html.xpath("//ul[@id='J_UlThumb']"):
                     stype_img_id = []
                     if html.xpath("//ul[@id='J_UlThumb']")[0].xpath(
                             ".//li/div"):
                         for value1 in html.xpath("//ul[@id='J_UlThumb']"
                                                  )[0].xpath(".//li/div"):
                             if value1.xpath('./a')[0].xpath(
                                     './img'
                             )[0].get('data-src') and re.compile(
                                     "/([^/]*)(?=_!!|_M2)").findall(
                                         value1.xpath('./a')[0].xpath(
                                             './img')[0].get('data-src')):
                                 sku_img_id = re.compile(
                                     "/([^/]*)(?=_!!|_M2)").findall(
                                         value1.xpath('./a')[0].xpath(
                                             './img')[0].get('data-src'))[0]
                                 stype_img_id.append(sku_img_id)
                     elif html.xpath("//ul[@id='J_UlThumb']")[0].xpath(
                             ".//li"):
                         for value1 in html.xpath(
                                 "//ul[@id='J_UlThumb']")[0].xpath(".//li"):
                             if value1.xpath('./a')[0].xpath(
                                     './img')[0].get('src') and re.compile(
                                         "/([^/]*)(?=_!!|_M2)").findall(
                                             value1.xpath('./a')[0].xpath(
                                                 './img')[0].get('src')):
                                 sku_img_id = re.compile(
                                     "/([^/]*)(?=_!!|_M2)").findall(
                                         value1.xpath('./a')[0].xpath(
                                             './img')[0].get('src'))[0]
                                 stype_img_id.append(sku_img_id)
                     shop_item["attr_img"] = "&&||".join(stype_img_id)
                  styleliList = []  # default in case neither attribute block is found on the page
                  if html.xpath("//ul[@id='J_AttrUL']"):
                     styleliList = []
                     # dict={}
                     for styleli in html.xpath(
                             "//ul[@id='J_AttrUL']")[0].xpath(".//li"):
                         if styleli.xpath('./text()'):
                             styleliText = styleli.xpath(
                                 './text()')[0].encode('utf-8').strip()
                             # styleliText=styleliText.replace(":",":")
                             # str1=styleliText.split(":")[0].encode('utf-8').strip()
                             # str2=styleliText.split(":")[1].encode('utf-8').strip().replace("\xc2\xa0"," ").lstrip()
                             # dict[str1] =str2
                             styleliList.append(styleliText)
                 elif html.xpath("//div[@id='attributes']"):
                     styleliList = []
                     # dict={}
                     for styleli in html.xpath("//div[@id='attributes']"
                                               )[0].xpath(".//ul/li"):
                         if styleli.xpath('./text()'):
                             styleliText = styleli.xpath(
                                 './text()')[0].encode('utf-8').strip()
                             # styleliText=styleliText.replace(":",":")
                             # str1=styleliText.split(":")[0].encode('utf-8').strip()
                             # str2=styleliText.split(":")[1].encode('utf-8').strip().replace("\xc2\xa0"," ").lstrip()
                             # dict[str1] =str2
                             styleliList.append(styleliText)
                 shop_item["attribute"] = "&&||".join(styleliList)
                 # shop_item["attribute"]=dict
         except Exception, e:
             logging.info("关键词{p}抓取失败,nid={nid},{m}".format(p=self.key_word,
                                                            nid=nid,
                                                            m=e.message))
         shop_item['crawl_url'] = t_detail_url
         shop_item['shop_id'] = shop_id
         session = Session()
          self.get_total_sales(session, agentip, 1,
                               shop_id)  # the first call usually fails; it only primes the session cookie
         total_page = 1
         for i in range(50):
              # stop early once we are past the last page
             if total_page and i >= total_page:
                 break
             result = self.get_total_sales(session, agentip, (i + 1),
                                           shop_id)
             if not result:
                 result = self.get_total_sales(session, agentip, (i + 1),
                                               shop_id)
             if (result != None):
                  jobj = json.loads(
                      result.replace("mtopjsonp12(",
                                     "").replace("})", "}"))  # strip the JSONP wrapper and parse
                 jsonArray = jobj['data']['itemsArray']
                 total_sales = self.parse_total_sales(jsonArray, nid)
                 if total_sales != -1:
                     break
                 if jobj and "SUCCESS" in jobj['ret'][0]:
                     total = int(jobj['data']['totalResults'])
                      total_page = total / 30  # at most 30 items per page
                     if total % 30:
                         total_page += 1
                 else:
                     print("获取数据失败")
                     break
                 sleep(2)
             else:
                 total_sales = ""
         shop_item['totalSoldQuantity'] = total_sales
         shop_items.append(shop_item)
         post_data = {'data': json.dumps(shop_items)}
         if not self.process_request(SAVE_INSERT_KEYWORD_API, post_data):
             self.process_request(SAVE_INSERT_KEYWORD_API, post_data)
Exemplo n.º 24
0
 def crawl_shop_all_item(self):
     agentIp = Utils.GetMyAgent()
     shop_id = self.shop_id
     shop_name = self.shop_name
Exemplo n.º 25
0
    def crawl_shop_all_item(self, url):
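        # Downloads one page of the shop's item-list JSON, enriches each item with its
        # total SKU stock (yj.da-mai.com) and detail-page properties (d.da-mai.com),
        # then fans out over the remaining pages.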
        agentIp = Utils.GetMyAgent()
        shop_id = self.shop_id
        shop_name = self.shop_name
        userAgent = Html_Downloader.GetUserAgent()
        header = {'ip': agentIp, 'user-agent': userAgent}
        text_detail_url = url
        ok, response = Html_Downloader.Download_Html(text_detail_url, {},
                                                     header)
        if ok:
            jsonArray = json.loads(response.content)  # parse the JSON response
            total_page = jsonArray.get("total_page")
            total_results = jsonArray.get("total_results")
            page_size = jsonArray.get("page_size")
            jsonResult = jsonArray.get("items")
            for item in jsonResult:
                shop_item = {}
                item_id = str(item.get("item_id")).strip()
                shop_item['item_id'] = item_id
                shop_item['title'] = item.get('title').encode('utf-8')
                shop_item['picUrl'] = "http:" + item.get('img')
                # current sale price
                shop_item['salePrice'] = item.get('price')
                shop_item['totalSoldQuantity'] = item.get('totalSoldQuantity')
                shop_item['crawl_url'] = item.get('url')
                shop_item['crawl_time'] = long(time.time())
                # API url for item variants (color options); no longer needed, the next API already returns this
                '''
                test_Url="http://d.da-mai.com/index.php?r=itemApi/getItemInfoByItemId&item_id="+item_id
                ok, response = Html_Downloader.Download_Html(test_Url,{}, header)
                if ok:
                   jsonItems=json.loads(response.content)  # 获取解析的json
                '''
                # API url for detailed SKU info
                shop_item['quantity'] = 0
                getSKU_Url = "http://yj.da-mai.com/index.php?r=itemskus/getSkus&fields=*&num_iids={item_id}".format(
                    item_id=item_id)
                ok, response = Html_Downloader.Download_Html(
                    getSKU_Url, {}, header)
                if ok:
                    jsonItems = json.loads(response.content)
                    total_data = jsonItems.get("data")
                    for date in total_data:
                        quantity = date.get("quantity")
                        shop_item[
                            'quantity'] = shop_item['quantity'] + quantity
                # fetch the item detail-page info (the "second screen")
                getDetail_Url = "http://d.da-mai.com/index.php?r=itemApi/getItemInfoByItemId&item_id={item_id}".format(
                    item_id=item_id)
                ok, response_detail = Html_Downloader.Download_Html(
                    getDetail_Url, {}, header)
                if ok:
                    shop_item['attribute'] = []
                    #jsonDetails = response_detail['data']['data']
                    jsonDetails = json.loads(response_detail.content)
                    properties = jsonDetails['data']['data']['properties']
                    stringName = ""
                    for attri in properties:
                        #string = "{name}:{value}&&||".format(name=attri.get('name'),value=attri.get('value'))
                        name = attri.get('name')
                        value = attri.get('value')
                        if name in stringName:
                            #shop_item['attribute'].append(name)
                            string = "{value} ".format(value=value)
                            shop_item['attribute'].append(string)
                        if name not in stringName:
                            string = "{name}:{value}&&||".format(name=name,
                                                                 value=value)
                            shop_item['attribute'].append(string)
                            stringName = name + stringName

        for page in range(1, int(total_page) + 1):
            # rebuild the paginated json URL and crawl it in a child process
            ###!!!! NOTE: the shop url is hard-coded here and should be passed in as a parameter !!!!
            getlist_url = "https://yiqianny.m.tmall.com/shop/shop_auction_search.do?ajson=1&_tm_source=tmallsearch&" \
                          "spm=a320p.7692171.0.0&sort=d&p={page}&page_size=24&from=h5".format(page=page)
            p = multiprocessing.Process(
                target=self.crawl_shop_all_item, args=(getlist_url, ))
            p.start()
            logging.info("开始多进程爬虫,爬取的json列表为:{url}".format(url=getlist_url))
Exemplo n.º 26
0
def get_data(tag_id):
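    # Fetches the metadata of one DMP tag, then posts every combination of its option
    # values to the coverage endpoint, saving each count (with option names and ids)
    # into my_set and appending it to TATAL_DATA.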
    csrfid = str(get_csrfId())
    tag_data = dict()
    url = "http://dmp.taobao.com/api/tag/{tag_id}?csrfId={csrfid}&t={time}977" \
        .format(tag_id=tag_id, csrfid=csrfid, time=int(time.time()))
    proxy = dict()
    proxy['http'] = Utils.GetMyAgent()  # requests matches proxies by lowercase scheme
    # allow_redirects is set to handle the URI redirect issue
    response = requests.get(url=url,
                            proxies=proxy,
                            verify=False,
                            headers=HEADER,
                            cookies=cookie_dict,
                            allow_redirects=False).text
    data = json.loads(response)
    # tag description
    tag_data['dmp_msg'] = str(
        data["data"]["tag"]["tagDesc"].split(",")[0])[9:-1]
    # tag quality score
    tag_data['qualityScore'] = str(data["data"]["tag"]["qualityScore"])
    # tag title
    tag_data['tag_name'] = str(data["data"]["tag"]["tagName"])
    # tag options
    options = dict()
    optionValue = data["data"]["tag"]["options"]
    GroupId = data["data"]["tag"]["options"][0]['optionGroupId']
    optionValue_id = dict()
    global DATA
    # first store each option's value and name as a pair in the options list
    options = list()
    for value in optionValue:
        options_list = list()
        options_list.append(value["optionValue"])
        options_list.append(value["optionName"])
        options.append(options_list)
        # map each option value to its id for later use
        optionValue_id[value['optionValue']] = value['id']
    # send a request for every combination of options to collect the data
    for combin in range(1, len(options) + 1):
        for option in combinations(options, combin):
            optionNameMapStr = dict()
            keys_list = list()
            for key in option:
                tag_key = key[0]
                keys_list.append(tag_key)
                optionNameMapStr[str(tag_key)] = key[1]
            url_data = "http://dmp.taobao.com/api/analysis/coverage"
            post_data = {
                "csrfId": csrfid,
                "user": 1,
                "options[0].operatorType": 1,
                "options[0].optionGroupId": GroupId,
                "options[0].source": "detail",
                "options[0].tagId": tag_id,
                "options[0].value": keys_list,
                "options[0].optionNameMapStr": optionNameMapStr,
            }
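            # post_data2 / post_data3 below look like captured sample payloads kept for
            # reference; only post_data above is actually sent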
            post_data2 = {
                'options[0].tagId': 110063,
                'options[0].optionGroupId': 111,
                'options[2].optionNameMapStr': "{2: '本科生'}",
                'options[2].value': [2],
                'options[1].source': 'all',
                'options[2].source': 'all',
                'options[0].value': '1~999999999',
                'options[1].optionNameMapStr': "{'36236493': '依俊服饰",
                'options[1].value': 36236493,
                'options[0].source': 'all',
                'options[1].optionGroupId': 304,
                'options[1].tagId': 110063,
                'csrfId': '3aed9369a6af50b4d2b202',
                'user': 1,
                'options[2].optionGroupId': 12164,
                'options[0].operatorType': 1,
                'options[2].operatorType': 1,
                'options[1].operatorType': 1,
                'options[2].tagId': 113736
            }
            post_data3 = {
                'options[3].optionGroupId': 12164,
                'options[0].value': '1~999999999',
                'options[2].optionNameMapStr': "{7: '小学'}",
                'options[3].optionNameMapStr': "{6: u'初中'}",
                'options[0].tagId': 110063,
                'options[4].source': 'all',
                'options[2].value': [7],
                'csrfId': '3aed9369a6af50b4d2b202',
                'options[4].optionNameMapStr': "{2:'本科生'}",
                'options[4].value': [2],
                'options[3].operatorType': 1,
                'options[1].tagId': 110063,
                'options[3].tagId': 113736,
                'options[0].operatorType': 1,
                'options[1].operatorType': 1,
                'options[4].operatorType': 1,
                'options[2].operatorType': 1,
                'options[1].optionGroupId': 304,
                'options[2].source': 'all',
                'options[1].value': 36236493,
                'options[4].tagId': 113736,
                'user': 1,
                'options[2].optionGroupId': 12164,
                'options[3].source': 'all',
                'options[1].source': 'all',
                'options[0].optionGroupId': 111,
                'options[0].source': 'all',
                'options[4].optionGroupId': 12164,
                'options[1].optionNameMapStr':
                "{'36236493': '\\xe4\\xbe\\x9d\\xe4\\xbf\\x8a\\xe6\\x9c\\x8d\\xe9\\xa5\\xb0'}",
                'options[2].tagId': 113736,
                'options[3].value': [6]
            }
            response = requests.post(url_data,
                                     proxies=proxy,
                                     verify=False,
                                     headers=HEADER,
                                     cookies=cookie_dict,
                                     allow_redirects=False,
                                     data=post_data).text
            DATA['count'] = str(json.loads(response)['data']['coverage'])
            # tag description....
            DATA['option_name'] = str(tag_data['dmp_msg'])
            # tag name (overwrites the assignment above)
            DATA['option_name'] = str(tag_data['tag_name'])
            # option names:
            count_name = str()
            for name in optionNameMapStr.values():
                count_name = count_name + name + "_"
            DATA['count_name'] = count_name[:-1]
            print "爬取{count_name}成功".format(count_name=count_name[:-1])
            # tag id:
            a = str()
            for key in keys_list:
                a = a + "_" + str(optionValue_id[key])
            DATA['id'] = str(tag_id) + a
            DATA['crawl_time'] = int(time.time())
            # save the data in the local mongodb so the result can be inspected
            my_set.save(DATA)
            # json_data = json.dumps(DATA, ensure_ascii=False)
            # logging.info(json_data)
            TATAL_DATA.append(DATA)
            DATA = {}
            # tag_data['options'] = options
    ############# start re-sending requests to fetch the data ###################
    '''