Example #1
 def crawl_shop_all_item(self):
     agentIp = Utils.GetAgentIp()
     header = {'ip': agentIp}
     shop_id = -1
     # Fetch the shop homepage first to discover the shop id
     url = self.shop_url
     print(url)
     ok, response = Html_Downloader.Download_Html(url, {}, header)
     if ok:
         html = etree.HTML(response.text.encode('utf-8'))
         if html is not None and html.xpath("//header[@id='mp-header']"):
             # The shop id is embedded in the header's mdv-cfg attribute,
             # e.g. mdv-cfg="{'shopId':'247506881'}"
             mdv_cfg = html.xpath("//header[@id='mp-header']")[0].get("mdv-cfg")
             if "shopId" in mdv_cfg.split(':')[0]:
                 shop_id = mdv_cfg.split(':')[1]
                 shop_id = shop_id.replace("'}", "").replace("'", "")
     url = "{shop_url}/shop/shop_auction_search.do?sort=d&p=1&page_size=90&from=h5&shop_id={shop_id}&ajson=1&_tm_source=tmallsearch&orderType=hotsell_desc".format(
         shop_url=self.shop_url, shop_id=shop_id)
     print(url)
     ok, response = Html_Downloader.Download_Html(url, {}, header)
     if not ok:
         # Retry once without the proxy header
         ok, response = Html_Downloader.Download_Html(url, {}, {})
     if ok:
         # ajson=1 makes the endpoint return JSON, so parse the body directly
         data = json.loads(response.text.encode('utf-8'))
         print(data)
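For reference, here is a minimal standalone sketch of the same two steps (fetch the shop homepage to discover the shop id, then query the ajson endpoint) using the standard requests library in place of the project's Html_Downloader. The regex and the query parameters simply mirror the example above; treat them as assumptions rather than a documented API.

    import json
    import re
    import requests

    def get_shop_id(shop_url):
        # Assumption: the mobile page embeds the id in an attribute shaped
        # like mdv-cfg="{'shopId':'247506881'}", as in the example above.
        page = requests.get(shop_url, timeout=30).text
        match = re.search(r"shopId'?\s*:\s*'?(\d+)", page)
        return match.group(1) if match else None

    def get_hot_items(shop_url, shop_id, page=1, page_size=90):
        # ajson=1 makes the endpoint answer with JSON, so the item list
        # needs no HTML parsing.
        url = ("%s/shop/shop_auction_search.do?sort=d&p=%d&page_size=%d"
               "&from=h5&shop_id=%s&ajson=1&_tm_source=tmallsearch"
               "&orderType=hotsell_desc") % (shop_url, page, page_size, shop_id)
        return json.loads(requests.get(url, timeout=30).text)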
Example #2
 def crawl_shop_all_item(self):
     agentIp = Utils.GetAgentIp()
     shop_id = -1
     driver = PhantomDriver(2, agentIp, 60)
     parms_url = "{shop_url}/i/asynSearch.htm?_ksTS={now}569_240&callback=jsonp241&mid=w-14766145001-0&wid=14766145001&path=/search.htm&search=y&orderType=hotsell_desc&scene=taobao_shop&pageNo={page_num}"
     url = "{shop_url}/search.htm?&search=y&orderType=hotsell_desc&scene=taobao_shop".format(
         shop_url=self.shop_url)
     # url="https://nanshanweng.m.tmall.com/shop/shop_auction_search.do?sort=d&p=1&page_size=12&from=h5&shop_id=247506881&ajson=1&_tm_source=tmallsearch"
     # self.testurl(url,agentIp)
     print(url)
     result = driver.download_no_quit(url)
     source = result['page_source']
     html = None
     if result['ok']:
         html = etree.HTML(source)
     shop_items = []
     if html is not None and 'page-info' in source and html.xpath(
             "//span[contains(@class,'page-info')]/text()"):
         total = int(
             html.xpath("//span[contains(@class,'page-info')]/text()")
             [0].split('/')[1])
         total = 3  # hard-coded cap on pages to crawl (overrides the parsed total)
         if html.xpath("//meta[@name='microscope-data']"):
             for meta in html.xpath("//meta[@name='microscope-data']"
                                    )[0].get('content').split(';'):
                 if 'shopid' in meta.lower():
                     shop_id = meta.split("=")[1]
                     # self.shopall.format_data(shop_id, False)
                     shop_items.extend(
                         self.parse_items(html, shop_id, agentIp))
         for i in range(1, total):
             page_num = i + 1
             print("page%s" % page_num)
             url = parms_url.format(shop_url=self.shop_url,
                                    now=long(time.time()),
                                    page_num=page_num)
             result = driver.download_no_quit(url)
             if result['ok']:
                 source = result['page_source']
                 html = etree.HTML(source)
             if result['ok'] and 'page-info' in source and html.xpath(
                     "//span[contains(@class,'page-info')]/text()"):
                 results = self.parse_items(html, shop_id, agentIp)
                 shop_items.extend(results)
             sleep(15)
         self.shopall.insert_or_update(shop_items)
     elif html is not None and 'ui-page-s-len' in source and html.xpath(
             "//b[contains(@class,'ui-page-s-len')]/text()"):
         total = int(
             html.xpath("//b[contains(@class,'ui-page-s-len')]/text()")
             [0].split('/')[1])
         total = 3  # hard-coded cap on pages to crawl (overrides the parsed total)
         if html.xpath("//meta[@name='microscope-data']"):
             for meta in html.xpath("//meta[@name='microscope-data']"
                                    )[0].get('content').split(';'):
                 if 'shopid' in meta.lower():
                     shop_id = meta.split("=")[1]
                     shop_items.extend(
                         self.parse_items1(html, shop_id, agentIp))
         for i in range(1, total):
             page_num = i + 1
             print("page%s" % page_num)
             url = parms_url.format(shop_url=self.shop_url,
                                    now=long(time.time()),
                                    page_num=page_num)
             result = driver.download_no_quit(url)
             if result['ok']:
                 source = result['page_source']
                 html = etree.HTML(source)
             if result['ok'] and 'ui-page-s-len' in source and html.xpath(
                     "//b[contains(@class,'ui-page-s-len')]/text()"):
                 results = self.parse_items1(html, shop_id, agentIp)
                 shop_items.extend(results)
             sleep(15)
         self.shopall.insert_or_update(shop_items)
     else:
         # On failure, quit and close the webdriver
         driver.return_driver().quit()
         print("Failed to fetch via %s" % agentIp)
         return -1
     # Success: release the webdriver and return the discovered shop id
     driver.return_driver().quit()
     return shop_id
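Both branches above repeat the same scan of the microscope-data meta tag, so a small helper could factor it out. The sketch below assumes the content format "pid=...;shopid=12345;..." seen in the example, not a documented contract.

    from lxml import etree

    def shop_id_from_meta(html):
        # Return the shopid value from
        # <meta name="microscope-data" content="pid=...;shopid=12345;...">,
        # or None when the tag is absent.
        metas = html.xpath("//meta[@name='microscope-data']")
        if not metas:
            return None
        for field in metas[0].get('content', '').split(';'):
            key, _, value = field.partition('=')
            if key.strip().lower() == 'shopid':
                return value.strip()
        return None

    # Usage with an already-parsed page:
    # html = etree.HTML(page_source)
    # shop_id = shop_id_from_meta(html)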
Example #3
# -*- coding: utf-8 -*-

#Used to crawl the item IDs corresponding to each order's tradeID

import json
from utils.driver_utils import ChromeDriver
from db.DataStore import *
from utils.utils import Utils
from lxml import etree
import time
import datetime
from time import sleep
import re
import random

agent_ip = Utils.GetAgentIp()
result = get_item_trade_ids()
cookies = "mt=ci%3D-1_0; thw=cn; _m_user_unitinfo_=unit|unsz; _m_unitapi_v_=1498717160426; _m_h5_tk=5497d68b5bcf376f3f03c2bfe29d5c3e_1499745724771; _m_h5_tk_enc=bd37ed1f8dad5844fa8737aa499399d3; mt=ci%3D-1_0; _tb_token_=e17e846a1e737; x=78550821; uc3=sg2=AVAJ%2F%2FuFgrZrwbvpPwMpeUNJWGnNVTEcpZhNLKPoZwE%3D&nk2=&id2=&lg2=; uss=WvmGFLDaRLuLKHzx3Jt6R6Zh8SbBg8epTAb4OU0jo4jMr30BF8ACG4yF; tracknick=; sn=%E8%8B%B1%E8%AF%AD%E4%BA%8C%E6%B2%B9%E6%9D%A1%3A%E6%8E%A8%E5%B9%BF; skt=753a73a2763c5d75; v=0; cookie2=3c92dea4c50d0cf31281f889a3a999ec; unb=857889334; t=efd1f635969594e9ad33c0ec391d9883; uc1=cookie14=UoW%2BsWPGhqNu%2Fw%3D%3D&lng=zh_CN; cna=0SPrEVg+OkQCAQ4XY4MHK7uX; isg=Avv7jk0BJmcLoRtlqwnHCbYyit-l-AMTeQ3uMe245_oRTBsudSCfohnMENr5; apush5dceacf8bcd04ef16398a2906680ab9b=%7B%22ts%22%3A1499853369995%2C%22parentId%22%3A1499850283869%7D"
cookie_dict = {
    item.split('=')[0]: item.split('=')[1]
    for item in cookies.split(';')
}

driver = ChromeDriver()
# Log in through the browser and capture fresh session cookies
cookies = driver.login_an_get('英语二油条:推广', 'tuiguang654321')
sleep(5)

# Open each trade's item page in the logged-in browser session
for item in result:
    url = "https:%s" % item['item_url']
    mydriver = driver.get_driver()
    mydriver.get(url)
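Raw Cookie headers like the one above routinely contain '=' inside values, so a safer parser splits each pair only on the first '='. Below is a sketch of that, plus installing the pairs on an open Selenium session; the domain value is an assumption, and Selenium requires the browser to already be on a page of that domain before add_cookie() accepts entries.

    def parse_cookie_header(cookie_string):
        # Split on ';' and only on the first '=' so values that themselves
        # contain '=' (e.g. base64-like tokens) stay intact.
        pairs = {}
        for item in cookie_string.split(';'):
            name, _, value = item.strip().partition('=')
            if name:
                pairs[name] = value
        return pairs

    def install_cookies(webdriver_instance, cookie_string, domain='.tmall.com'):
        # Assumption: the caller has already navigated to a page on
        # `domain`, which Selenium requires before add_cookie() works.
        for name, value in parse_cookie_header(cookie_string).items():
            webdriver_instance.add_cookie(
                {'name': name, 'value': value, 'domain': domain})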