def crawl_shop_all_item(self):
    agentIp = Utils.GetAgentIp()
    header = {'ip': agentIp}
    shop_id = -1
    url = self.shop_url
    print url
    ok, response = Html_Downloader.Download_Html(url, {}, header)
    if ok:
        html = etree.HTML(response.text.encode('utf-8'))
        # The Tmall H5 shop page carries the shop id in the mdv-cfg attribute
        # of <header id="mp-header">, as a string like "{shopId:'247506881'}".
        if html is not None and html.xpath("//header[@id='mp-header']"):
            mdv_cfg = html.xpath("//header[@id='mp-header']")[0].get("mdv-cfg")
            if "shopId" in mdv_cfg.split(':')[0]:
                shop_id = mdv_cfg.split(':')[1].replace("\'}", "").replace("\'", "")
                # Query the shop's item-search API: 90 items per page, sorted by sales.
                url = ("{shop_url}/shop/shop_auction_search.do?sort=d&p=1&page_size=90"
                       "&from=h5&shop_id={shop_id}&ajson=1&_tm_source=tmallsearch"
                       "&orderType=hotsell_desc").format(
                           shop_url=self.shop_url, shop_id=shop_id)
                print(url)
                ok, response = Html_Downloader.Download_Html(url, {}, header)
                print(ok)
                if not ok:
                    # Retry once without the proxy header.
                    ok, response = Html_Downloader.Download_Html(url, {}, {})
                    print(url)
                if ok:
                    # With ajson=1 the endpoint returns JSON, so parse the body directly.
                    data = json.loads(response.text.encode('utf-8'))
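# --- Standalone sketch (illustrative, not part of the crawler) ---
# The shop-id extraction above relies on the Tmall H5 page embedding the id
# in the mdv-cfg attribute of <header id="mp-header"> as "{shopId:'...'}".
# The sample markup below is hypothetical; the parsing steps mirror the method.
from lxml import etree

_sample = "<html><body><header id='mp-header' mdv-cfg=\"{shopId:'247506881'}\"></header></body></html>"
_html = etree.HTML(_sample)
_cfg = _html.xpath("//header[@id='mp-header']")[0].get("mdv-cfg")
if "shopId" in _cfg.split(':')[0]:
    print(_cfg.split(':')[1].replace("'}", "").replace("'", ""))  # -> 247506881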
def crawl_shop_all_item(self):
    agentIp = Utils.GetAgentIp()
    shop_id = -1
    driver = PhantomDriver(2, agentIp, 60)
    # Async endpoint that serves pages 2..N of the shop's item search.
    parms_url = ("{shop_url}/i/asynSearch.htm?_ksTS={now}569_240&callback=jsonp241"
                 "&mid=w-14766145001-0&wid=14766145001&path=/search.htm&search=y"
                 "&orderType=hotsell_desc&scene=taobao_shop&pageNo={page_num}")
    url = "{shop_url}/search.htm?&search=y&orderType=hotsell_desc&scene=taobao_shop".format(
        shop_url=self.shop_url)
    print(url)
    result = driver.download_no_quit(url)
    source = result['page_source']
    if result['ok']:
        html = etree.HTML(source)
        shop_items = []
        # Tmall layout: total page count sits in <span class="page-info">, e.g. "1/12".
        if html is not None and 'page-info' in source and html.xpath(
                "//span[contains(@class,'page-info')]/text()"):
            total = int(html.xpath(
                "//span[contains(@class,'page-info')]/text()")[0].split('/')[1])
            total = 3  # hard cap: only crawl the first three pages
            # The shop id is embedded in the microscope-data meta tag as "shopid=...".
            if html.xpath("//meta[@name='microscope-data']"):
                for meta in html.xpath("//meta[@name='microscope-data']")[0].get(
                        'content').split(';'):
                    if 'shopid' in meta.lower():
                        shop_id = meta.split("=")[1]
            shop_items.extend(self.parse_items(html, shop_id, agentIp))
            for i in range(1, total):
                page_num = i + 1
                print("page%s" % page_num)
                url = parms_url.format(shop_url=self.shop_url,
                                       now=long(time.time()),
                                       page_num=page_num)
                result = driver.download_no_quit(url)
                if result['ok']:
                    html = etree.HTML(result['page_source'])
                    if 'page-info' in result['page_source'] and html.xpath(
                            "//span[contains(@class,'page-info')]/text()"):
                        shop_items.extend(self.parse_items(html, shop_id, agentIp))
                sleep(15)  # throttle between pages to avoid the anti-spider block
            self.shopall.insert_or_update(shop_items)
        # Taobao layout: total page count sits in <b class="ui-page-s-len">, e.g. "1/12".
        elif html is not None and 'ui-page-s-len' in source and html.xpath(
                "//b[contains(@class,'ui-page-s-len')]/text()"):
            total = int(html.xpath(
                "//b[contains(@class,'ui-page-s-len')]/text()")[0].split('/')[1])
            total = 3  # hard cap: only crawl the first three pages
            if html.xpath("//meta[@name='microscope-data']"):
                for meta in html.xpath("//meta[@name='microscope-data']")[0].get(
                        'content').split(';'):
                    if 'shopid' in meta.lower():
                        shop_id = meta.split("=")[1]
            shop_items.extend(self.parse_items1(html, shop_id, agentIp))
            for i in range(1, total):
                page_num = i + 1
                print("page%s" % page_num)
                url = parms_url.format(shop_url=self.shop_url,
                                       now=long(time.time()),
                                       page_num=page_num)
                result = driver.download_no_quit(url)
                if result['ok']:
                    html = etree.HTML(result['page_source'])
                    if 'ui-page-s-len' in result['page_source'] and html.xpath(
                            "//b[contains(@class,'ui-page-s-len')]/text()"):
                        shop_items.extend(self.parse_items1(html, shop_id, agentIp))
                sleep(15)
            self.shopall.insert_or_update(shop_items)
        else:
            # Unrecognized page layout (likely blocked): close the webdriver and bail out.
            driver.return_driver().quit()
            print("cannot fetch via %s" % agentIp)
            return -1
    driver.return_driver().quit()
    return shop_id
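# --- Standalone sketch (illustrative, not part of the crawler) ---
# The shop id above comes from the "microscope-data" meta tag, whose content
# is a ';'-separated key=value list. The sample markup is hypothetical; the
# loop mirrors the parsing in crawl_shop_all_item.
from lxml import etree

_sample = "<html><head><meta name='microscope-data' content='pid=tmall;shopid=247506881;userid=857889334'/></head></html>"
_html = etree.HTML(_sample)
_shop_id = -1
for _meta in _html.xpath("//meta[@name='microscope-data']")[0].get('content').split(';'):
    if 'shopid' in _meta.lower():
        _shop_id = _meta.split("=")[1]
print(_shop_id)  # -> 247506881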
# -*- coding: utf-8 -*-
# Fetch the item ID that corresponds to each tradeID in the crawled orders.
import json
from utils.driver_utils import ChromeDriver
from db.DataStore import *
from utils.utils import Utils
from lxml import etree
import time
import datetime
from time import sleep
import re
import random

agent_ip = Utils.GetAgentIp()
result = get_item_trade_ids()
# Session cookies captured from a logged-in browser session.
cookies = "mt=ci%3D-1_0; thw=cn; _m_user_unitinfo_=unit|unsz; _m_unitapi_v_=1498717160426; _m_h5_tk=5497d68b5bcf376f3f03c2bfe29d5c3e_1499745724771; _m_h5_tk_enc=bd37ed1f8dad5844fa8737aa499399d3; mt=ci%3D-1_0; _tb_token_=e17e846a1e737; x=78550821; uc3=sg2=AVAJ%2F%2FuFgrZrwbvpPwMpeUNJWGnNVTEcpZhNLKPoZwE%3D&nk2=&id2=&lg2=; uss=WvmGFLDaRLuLKHzx3Jt6R6Zh8SbBg8epTAb4OU0jo4jMr30BF8ACG4yF; tracknick=; sn=%E8%8B%B1%E8%AF%AD%E4%BA%8C%E6%B2%B9%E6%9D%A1%3A%E6%8E%A8%E5%B9%BF; skt=753a73a2763c5d75; v=0; cookie2=3c92dea4c50d0cf31281f889a3a999ec; unb=857889334; t=efd1f635969594e9ad33c0ec391d9883; uc1=cookie14=UoW%2BsWPGhqNu%2Fw%3D%3D&lng=zh_CN; cna=0SPrEVg+OkQCAQ4XY4MHK7uX; isg=Avv7jk0BJmcLoRtlqwnHCbYyit-l-AMTeQ3uMe245_oRTBsudSCfohnMENr5; apush5dceacf8bcd04ef16398a2906680ab9b=%7B%22ts%22%3A1499853369995%2C%22parentId%22%3A1499850283869%7D"
# Split on ';' and on the first '=' only: cookie values (uc1, uc3, ...) may
# themselves contain '=', and each key carries a leading space after ';'.
cookie_dict = {
    item.split('=', 1)[0].strip(): item.split('=', 1)[1]
    for item in cookies.split(';')
}
driver = ChromeDriver()
# Log in with the crawl account and pick up a fresh cookie set.
cookies = driver.login_an_get('英语二油条:推广', 'tuiguang654321')
sleep(5)
for item in result:
    # item_url is protocol-relative (starts with //), so prefix the scheme.
    url = "https:%s" % item['item_url']
    mydriver = driver.get_driver()
    mydriver.get(url)
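# --- Standalone sketch (illustrative, not part of the script) ---
# cookie_dict above is built but never injected into the browser. A hedged
# sketch of how it could be replayed with a plain Selenium webdriver (the
# project's ChromeDriver wrapper may handle this internally; the URLs are
# illustrative). Selenium only accepts add_cookie() for the domain currently
# loaded, hence the initial get().
from selenium import webdriver

_drv = webdriver.Chrome()
_drv.get("https://www.tmall.com")  # load the domain before setting cookies
for _name, _value in cookie_dict.items():
    _drv.add_cookie({'name': _name, 'value': _value})
_drv.get("https://detail.tmall.com/item.htm?id=12345")  # hypothetical item page
_drv.quit()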