示例#1
0
def tmallBrandNews():

    # TODO:XDF Chrome欲歌浏览器
    options = webdriver.ChromeOptions()
    # options.add_extension('AdBlock_v3.15.0.crx') # TODO:XDF Chrome欲歌广告过滤插件
    # 设置中文
    options.add_argument('lang=zh_CN.UTF-8')
    # prefs = {"profile.managed_default_content_settings.images": 2}
    # options.add_experimental_option("prefs", prefs)  # TODO:XDF 禁止加载图片
    # 更换头部
    options.add_argument(
        'user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"'
    )
    # driver = webdriver.Chrome(executable_path=r'/Users/zhuoqin/Desktop/Python/SeleniumDemo/chromedriver')#chrome_options=options,
    driver = webdriver.Chrome(
        executable_path=r'/Users/zhuoqin/Downloads/123456/chromedriver'
    )  # chrome_options=options,
    wait = WebDriverWait(driver, 200, 0.5)  # 表示给browser浏览器一个10秒的加载时间
    print '窗口最大化----1'
    tmallLogin(driver, UnexpectedAlertPresentException, ActionChains)
    print '窗口最大化----2'

    # TODO:XDF PhantomJS无头浏览器
    # dcap = dict(DesiredCapabilities.PHANTOMJS)
    # dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36")  # 设置user-agent请求头
    # dcap["phantomjs.page.settings.loadImages"] = False  # 禁止加载图片
    #
    # print ('即将开始。。。')
    # service_args = []
    # service_args.append('--load-images=no')  ##关闭图片加载
    # service_args.append('--disk-cache=yes')  ##开启缓存
    # service_args.append('--ignore-ssl-errors=true')  ##忽略https错误
    # # #TODO:XDF 针对本地调试
    # # driver = webdriver.PhantomJS(executable_path=r'/Users/zhuoqin/Desktop/Python/SeleniumDemo/phantomjs',service_args=service_args,desired_capabilities=dcap)
    #
    # driver = webdriver.PhantomJS(executable_path=r'/usr/bin/phantomjs', service_args=service_args,desired_capabilities=dcap)  # TODO:XDF 针对Linux
    # # # TODO:XDF 针对Linux服务器
    # # wait = WebDriverWait(driver, 60, 0.5)  # 表示给browser浏览器一个10秒的加载时间
    # #
    # driver.implicitly_wait(30)
    # driver.set_page_load_timeout(30)
    # wait = WebDriverWait(driver, 200, 0.5)  # 表示给browser浏览器一个10秒的加载时间

    try:
        print '进来窗口了'
        driver.maximize_window()
        driver.get("https://list.tmall.com/search_product.htm?q=123&type=p")

        print '窗口最大化----'
    except Exception as e:
        print '即将出错啦----%s' % e
        driver.quit()
        driver.close()
        return

    time.sleep(2)
    if 'login.tmall' in str(driver.current_url):
        print '即将登录'
        tmallLogin(driver, UnexpectedAlertPresentException, ActionChains)
    SelectShopTempletes()
    # BaseInfoList = SelectT_Treasures_BaseInfo()
    for i in range(0, len(allShopName)):
        InsertSqlList = []
        print('进来了---%s' % i)
        driver.find_element_by_xpath('//*[@id="mq"]').clear()
        time.sleep(random.uniform(2, 4))
        driver.find_element_by_xpath('//*[@id="mq"]').send_keys(
            allShopName['shopName'][i])
        print('地址********%s' % (allShopName['shopName'][i]))
        time.sleep(random.uniform(4, 6))

        if 'login.tmall' in str(driver.current_url):
            print '当前为登录页面'
            tmallLogin(driver, UnexpectedAlertPresentException, ActionChains)

        # driver.find_element_by_xpath('//*[@id="mallSearch"]/form/fieldset/div/button').click()
        driver.find_element_by_xpath(
            '//*[@id="mallSearch"]/form/fieldset/div/button').click()
        tmallLogin(driver, UnexpectedAlertPresentException, ActionChains)
        wait.until(EC.presence_of_element_located((By.ID, 'J_ItemList')))
        time.sleep(random.uniform(2, 4))
        print '赶紧睡吧'
        JudgeLoginSuccess(driver, UnexpectedAlertPresentException,
                          ActionChains)

        time.sleep(random.randint(1, 3))

        key = allShopName['shopName'][i]
        html = driver.page_source
        docs = pq(html)

        for j in range(0, int(getProjectPage(docs))):
            time.sleep(random.randint(4, 8))
            urls = 'https://list.tmall.com/search_shopitem.htm?s=' + str(
                j *
                60) + '&oq=' + str(key) + '&style=sg&sort=s&user_id=' + str(
                    allShopName['user_id'][i]) + '&stype=search'
            print urls
            driver.get(urls)
            time.sleep(random.randint(1, 3))
            if tmallCode(driver, wait, EC) == '访问过快':
                continue
            tmallLogin(driver, UnexpectedAlertPresentException, ActionChains)
            wait.until(EC.presence_of_element_located((By.ID, 'J_ItemList')))
            time.sleep(random.uniform(3, 5))
            html = driver.page_source

            doc = pq(html)
            list = doc('#J_ItemList .product   .product-iWrap').items()

            resultData = search(list)
            lists = doc('#J_ItemList .product   .product-iWrap').items()
            YuShouPriceData = YuShouPrice(lists)

            time.sleep(random.uniform(3, 5))

            print '所有店铺ID----%s-----%s' % (len(resultData), resultData)
            print '所有预售价---%s' % YuShouPriceData
            getDetailFilterData(driver, wait, resultData, YuShouPriceData,
                                allShopName['shopName'][i])
示例#2
0
def getDetailFilterData(driver, wait, resultData, YuShouPriceData, ShopName):
    for i in range(0, len(resultData)):
        TreasureID = str(resultData[i])
        presellPrice = clearToReplaceData(str(YuShouPriceData[i]), 1)
        print '------你大爷------'
        time.sleep(random.uniform(5, 8))
        try:
            driver.get('https://detail.tmall.com/item.htm?id=' +
                       str(TreasureID))
            time.sleep(random.randint(1, 3))
            print '地址----%s' % str(TreasureID)
        except Exception as e:
            print e
            continue

        if judgeProduct(driver) == True:
            print '商品不存在'
            continue

        if tmallCode(driver, wait, EC) == '访问过快':
            continue
        tmallLogin(driver, UnexpectedAlertPresentException, ActionChains)
        try:
            wait.until(
                EC.presence_of_element_located(
                    (By.CLASS_NAME, 'tb-detail-hd')))  # 显性等待
        except Exception as e:
            print '显性未加载成功---%s' % e
            continue
        time.sleep(random.randint(2, 5))  # 这里得让他睡眠一下,否则第二页开始会报错(加载数据)
        htmlDetail = driver.page_source  # 这是一面的页面内容
        try:
            doc = pq(htmlDetail)

            if WhetherYuShou(doc) == False:
                print '-------不是预售,不需要保存--------'
                time.sleep(random.uniform(2, 4))
                continue
            detailURL = 'https://detail.tmall.com/item.htm?id=' + str(
                TreasureID)

            # TODO:XDF 这里需要注意一下,src图片链接可以不丰在https,需要自己手动拼接
            mainPics = doc.find('#J_ImgBooth').attr('src')
            if 'https:' in mainPics:
                mainPic = mainPics
            else:
                mainPic = 'https:' + mainPics
            # 收藏人数
            popularity = clearToReplaceData(
                doc.find('#J_CollectCount').text(), 0)

            # TODO:XDF 这里要注意,源码中可能存在xmlns,用pq是爬取不到的,要用lxml的tree抓取(非常坑爹)
            # presellPrice = clearToReplaceData(doc.find('#J_PromoBox').text(), 1)
            address = doc.find('#J_deliveryAdd').text()
            paymentDate = doc.find('.J_step2Time').text().split('~')
            paymentBeginDate = paymentDate[0]
            paymentFinishDate = paymentDate[1]
            reserveCount = clearToReplaceData(
                doc.find('.tb-wrt-guc').text(), 3)
            detailPrice = clearToReplaceData(
                doc.find('#J_StrPriceModBox .tm-price').text(), 5)
            categoryIdContent = clearToReplaceData(
                str(doc.find('#J_ZebraPriceDesc').attr('mdv-cfg')), 2)
            print(categoryIdContent)

            spuIds = 'TShop.Setup\((.*?)\);'
            apiData = re.findall(spuIds, htmlDetail, re.S)[0]
            datas = json.loads(apiData)
            brandId = datas['itemDO']['brandId']
            categoryId = datas['itemDO']['categoryId']
            rootCatId = datas['itemDO']['rootCatId']
            spuId = datas['itemDO']['spuId']
            title = datas['itemDO']['title']
            sellerId = datas['rateConfig']['sellerId']
            # URL_NO = datas['rstShopId']
            shopID = sellerId

            shopName = doc.find('.slogo-shopname').text()
            categoryName = categoryNamesQly(str(TreasureID))
            styleData = doc.find('#J_AttrUL').children().items()
            # 风格
            StyleName = styleNames(styleData)
            # 因为styleData是一个迭代器,被循环完的就会被释放掉(品牌有可能在查找风格的时候循环过去了,已经被释放掉了),所以这里得重新赋值数据源
            brandData = doc.find('#J_AttrUL').children().items()
            # 品牌
            brand = brandName(brandData)
            # 评价描述评分
            EvaluationScores = evaluationScoreURL(str(TreasureID), str(spuId),
                                                  str(sellerId))
            URL_NO = doc.find('#LineZing').attr('shopid')

            if ExistenceShopName(ShopName) == False:  #这里证明这家店铺是没有被爬过的
                InsertShopTempletes(ShopName, URL_NO)

            try:
                ShopURL = str(doc.find('.shopLink').attr('href'))
                if len(ShopURL):
                    ShopURL = clearToReplaceData(ShopURL, 4)
                else:
                    ShopURL = '-'
            except Exception as e:
                print e

            print str(
                TreasureID
            ), shopName, categoryName, datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S'
            ), detailPrice, address, title, mainPic, presellPrice, popularity  # , paymentBeginDate, paymentFinishDate, reserveCount

            StartTime = datetime.datetime.strptime(
                str(allShopName['YuShouStartTime'][0]), '%Y-%m-%d %H:%M:%S')

            EndTime = datetime.datetime.strptime(
                str(allShopName['YuShouEndTime'][0]), '%Y-%m-%d %H:%M:%S')

            # print nowTime, type(nowTime), type(endTime), now(), endTime

            # if now() > str(allShopName['YuShouEndTime'][0]):
            #     print '过期'
            # else:
            #     print '没过期'

            product = {
                'title':
                title,
                'TreasureID':
                str(TreasureID),
                'addRess':
                address,
                'shopName':
                shopName,
                'mainPic':
                mainPic,
                'detailPrice':
                detailPrice,
                'popularity':
                popularity,
                'reserveCount':
                int(reserveCount),
                'paymentBeginDate':
                paymentBeginDate,
                'paymentFinishDate':
                paymentFinishDate,
                'presellPrice':
                float(presellPrice),
                'categoryId':
                int(categoryId),
                'categoryName':
                categoryName,
                'spiderTime':
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'ShopID':
                shopID,
                'brandId':
                brandId,
                'brand':
                brand,
                'spuId':
                spuId,
                'rootCatId':
                int(rootCatId),
                'StyleName':
                StyleName,
                'EffectiveTime':
                '',
                'ReservationStatus':
                0,
                'CollectionNum':
                int(popularity),
                'ItemName':
                '',
                'NCategory_Name':
                '',
                'Is_Search':
                1,
                'NStyleName':
                ' ',
                'NewstPrice':
                0,
                'EvaluationScores':
                float(EvaluationScores),
                'URL_NO':
                URL_NO,
                'ShopURL':
                ShopURL,
                'sellerId':
                sellerId,
                'productState':
                '1',
                'JHSmodifyTime':
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'modifyTime':
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'StartTime':
                StartTime,
                'EndTime':
                EndTime,
                'detailURL':
                detailURL
            }
            currentTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            rateData = getCommentBestNewTime(product)
            if rateData:
                # commentNewTime = getCommentBestNewTime(product)
                EvaluationNewTime = strToDateTime(str(rateData),
                                                  'sixLineTypes')
            else:
                EvaluationNewTime = ''
            if str(dbChoice['TmallYuShouEnemyShopSql']
                   [0]) == 'Mongodb':  #保存到mongodb
                saveYuShouOrRemove(product, str(TreasureID))
            else:
                print '保存到sqlserver数据库...'

                if judgeHaveTreasureID(product, '') == True:
                    InsertOrUpdateBaseInfo(product, 'Update')
                else:
                    InsertOrUpdateBaseInfo(product, 'Insert')

                InsertPreSaleNew(product)

        except Exception as e:
            print('error---%s' % e)
def tmallGivenIDAndShopName():
    # TODO:XDF Chrome欲歌浏览器
    options = webdriver.ChromeOptions()
    # options.add_extension('AdBlock_v3.15.0.crx') # TODO:XDF Chrome欲歌广告过滤插件
    # 设置中文
    options.add_argument('lang=zh_CN.UTF-8')
    prefs = {"profile.managed_default_content_settings.images": 2}
    options.add_experimental_option("prefs", prefs)  # TODO:XDF 禁止加载图片
    # 更换头部
    options.add_argument(
        'user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"'
    )
    driver = webdriver.Chrome(
        executable_path=
        r'/Users/zhuoqin/Desktop/Python/SeleniumDemo/chromedriver')

    wait = WebDriverWait(driver, 200, 0.5)  # 表示给browser浏览器一个10秒的加载时间

    # TODO:XDF PhantomJS无头浏览器
    # dcap = dict(DesiredCapabilities.PHANTOMJS)
    # dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36")  # 设置user-agent请求头
    # dcap["phantomjs.page.settings.loadImages"] = False  # 禁止加载图片
    #
    # print ('即将开始。。。')
    # service_args = []
    # service_args.append('--load-images=no')  ##关闭图片加载
    # service_args.append('--disk-cache=yes')  ##开启缓存
    # service_args.append('--ignore-ssl-errors=true')  ##忽略https错误
    # #TODO:XDF 针对本地调试
    # driver = webdriver.PhantomJS(executable_path=r'/Users/zhuoqin/Desktop/Python/SeleniumDemo/phantomjs',service_args=service_args,desired_capabilities=dcap)
    # #
    # # # driver = webdriver.PhantomJS(executable_path=r'/usr/bin/phantomjs', service_args=service_args,desired_capabilities=dcap)  # TODO:XDF 针对Linux
    # # # TODO:XDF 针对Linux服务器
    # wait = WebDriverWait(driver, 60, 0.5)  # 表示给browser浏览器一个10秒的加载时间
    # #
    # driver.implicitly_wait(30)
    # driver.set_page_load_timeout(30)
    print('等待中。。。')

    while True:

        # for k in range(0, len(ownShopID)):
        for datas in selectAllProductID():

            driver.get('https://detail.tmall.com/item.htm?id=%s' %
                       str(datas[0]))
            print('https://detail.tmall.com/item.htm?id=%s' % str(datas[0]))
            TreasureID = str(datas[0])
            JudgeLoginSuccess(driver)

            tmallLogin(driver)

            time.sleep(random.uniform(3, 5))
            if judgeProduct(driver) == True:
                print '商品不存在'
                continue
            tmallCode(driver, wait)

            try:
                wait.until(
                    EC.presence_of_element_located(
                        (By.CLASS_NAME, 'tb-detail-hd')))  # 显性等待
            except Exception as e:
                print '显性未加载成功---%s' % e

            time.sleep(random.randint(4, 5))  # 这里得让他睡眠一下,否则第二页开始会报错(加载数据)

            # driver.implicitly_wait(30) #隐性等待30秒,如果30之内页面加载完毕,往下执行,否则超时会报错,需要处理
            html = driver.page_source  # 这是一面的页面内容
            # print '源码内容----%s'%html
            # print ('等待中。。。%s' % k)
            if 'tmall' in driver.current_url:
                detailURL = 'https://detail.tmall.com/item.htm?id=' + str(
                    datas[0])
            else:
                detailURL = 'https://item.taobao.com/item.htm?id=' + str(
                    datas[0])
            try:
                doc = pq(html)

                # TODO:XDF 这里需要注意一下,src图片链接可以不丰在https,需要自己手动拼接
                mainPics = doc.find('#J_ImgBooth').attr('src')
                if 'https:' in mainPics:
                    mainPic = mainPics
                else:
                    mainPic = 'https:' + mainPics

                # TODO:XDF 这里要注意,源码中可能存在xmlns,用pq是爬取不到的,要用lxml的tree抓取(非常坑爹)

                presellPrice = clearToReplaceData(
                    doc.find('#J_PromoBox').text(), 1)
                address = doc.find('#J_deliveryAdd').text()
                # 收藏人数
                popularity = clearToReplaceData(
                    doc.find('#J_CollectCount').text(), 0)

                reserveCount = clearToReplaceData(
                    doc.find('.tb-wrt-guc').text(), 3)
                detailPrice = clearToReplaceData(
                    doc.find('#J_StrPriceModBox .tm-price').text(), 5)
                categoryIdContent = clearToReplaceData(
                    str(doc.find('#J_ZebraPriceDesc').attr('mdv-cfg')), 2)
                print(categoryIdContent)

                spuIds = 'TShop.Setup\((.*?)\);'
                apiData = re.findall(spuIds, html, re.S)[0]
                datas = json.loads(apiData)
                brandId = datas['itemDO']['brandId']
                categoryId = datas['itemDO']['categoryId']
                rootCatId = datas['itemDO']['rootCatId']
                spuId = datas['itemDO']['spuId']
                title = datas['itemDO']['title']
                sellerId = datas['rateConfig']['sellerId']
                shopID = sellerId

                shopName = doc.find('.slogo-shopname').text()

                categoryName = categoryNamesQly(TreasureID)
                print '类目名称------%s' % categoryName

                styleData = doc.find('#J_AttrUL').children().items()
                # 风格
                StyleName = styleNames(styleData)
                # 因为styleData是一个迭代器,被循环完的就会被释放掉(品牌有可能在查找风格的时候循环过去了,已经被释放掉了),所以这里得重新赋值数据源
                brandData = doc.find('#J_AttrUL').children().items()

                # 品牌
                brand = brandName(brandData)
                # 评价描述评分
                EvaluationScores = evaluationScoreURL(str(TreasureID),
                                                      str(spuId),
                                                      str(sellerId))
                print '你大爷的------000'
                URL_NO = doc.find('#LineZing').attr('shopid')

                try:
                    ShopURL = str(doc.find('.shopLink').attr('href'))
                    if len(ShopURL):
                        ShopURL = clearToReplaceData(ShopURL, 4)
                    else:
                        ShopURL = '-'
                except Exception as e:
                    print e
                print '你大爷的------123'
                print TreasureID, shopName, categoryName, address, detailURL, title, mainPic, presellPrice, popularity  # , paymentBeginDate, paymentFinishDate, reserveCount
                print '你大爷的------456'
                product = {
                    'title':
                    title,
                    'TreasureID':
                    TreasureID,
                    'addRess':
                    address,
                    'shopName':
                    shopName,
                    'mainPic':
                    mainPic,
                    'detailURL':
                    detailURL,
                    'detailPrice':
                    detailPrice,
                    'popularity':
                    popularity,
                    'reserveCount':
                    reserveCount,
                    'presellPrice':
                    presellPrice,
                    'categoryId':
                    int(categoryId),
                    'categoryName':
                    categoryName,
                    'spiderTime':
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'ShopID':
                    shopID,
                    'brandId':
                    brandId,
                    'brand':
                    brand,
                    'spuId':
                    spuId,
                    'rootCatId':
                    int(rootCatId),
                    'StyleName':
                    StyleName,
                    'EffectiveTime':
                    '',
                    'ReservationStatus':
                    0,
                    'ReNewPreSaleTime':
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'JHSReNewTime':
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'CollectionNum':
                    0,
                    'JHSmodifyTime':
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'ItemName':
                    '',
                    'EvaluationTime':
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'NCategory_Name':
                    '',
                    'Is_Search':
                    1,
                    'NStyleName':
                    '',
                    'NewstPrice':
                    0,
                    'SkuModifyDate':
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'TempleteTime':
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'EvaluationScores':
                    float(EvaluationScores),
                    'URL_NO':
                    URL_NO,
                    'ShopURL':
                    ShopURL,
                    'sellerId':
                    sellerId,
                    'NumTimes':
                    '第' + str(productNumberTimes(TreasureID) + 1) + '次爬取'
                }

                saveTmallGivenIDTB(product)

                # if judgeHaveTreasureID(product) == True:
                #     print '存在------'
                #     InsertOrUpdateBaseInfo(product,'Update')
                #     print '更新成功------'
                # else:
                #     print '不存在吧------'
                #     InsertOrUpdateBaseInfo(product, 'Insert')
                #     print '存入成功------'
                # # InsertPreSaleNew(product)

            except Exception as e:
                print('error---%s' % e)

        break

    print('---------------名字----1')

    time.sleep(random.randint(4, 7))
    driver.close()
    driver.quit()
示例#4
0
def getTaoBaoCommentData():
    # TODO:XDF Chrome欲歌浏览器
    options = webdriver.ChromeOptions()
    # options.add_extension('AdBlock_v3.15.0.crx') # TODO:XDF Chrome欲歌广告过滤插件
    # 设置中文
    options.add_argument('lang=zh_CN.UTF-8')
    prefs = {"profile.managed_default_content_settings.images": 2}
    options.add_experimental_option("prefs", prefs)  # TODO:XDF 禁止加载图片
    # 更换头部
    options.add_argument(
        'user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"'
    )

    driver = webdriver.Chrome(
        executable_path=r'/Users/zhuoqin/Downloads/123456/chromedriver'
    )  # chrome_options=options,
    wait = WebDriverWait(driver, 200, 0.5)  # 表示给browser浏览器一个10秒的加载时间

    # TODO:XDF PhantomJS无头浏览器
    # dcap = dict(DesiredCapabilities.PHANTOMJS)
    # dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36")  # 设置user-agent请求头
    # dcap["phantomjs.page.settings.loadImages"] = False  # 禁止加载图片
    #
    # print ('即将开始。。。')
    # service_args = []
    # service_args.append('--load-images=no')  ##关闭图片加载
    # service_args.append('--disk-cache=yes')  ##开启缓存
    # service_args.append('--ignore-ssl-errors=true')  ##忽略https错误
    # #TODO:XDF 针对本地调试
    # driver = webdriver.PhantomJS(executable_path=r'/Users/zhuoqin/Desktop/Python/SeleniumDemo/phantomjs',service_args=service_args,desired_capabilities=dcap)
    # #
    # # # driver = webdriver.PhantomJS(executable_path=r'/usr/bin/phantomjs', service_args=service_args,desired_capabilities=dcap)  # TODO:XDF 针对Linux
    # # # TODO:XDF 针对Linux服务器
    # wait = WebDriverWait(driver, 60, 0.5)  # 表示给browser浏览器一个10秒的加载时间
    # #
    # driver.implicitly_wait(30)
    # driver.set_page_load_timeout(30)
    print('等待中。。。')
    driver.maximize_window()
    driver.get('https://item.taobao.com/item.htm?id=541901945837')
    BounceCommentLogin(driver)
    tmallLogin(driver, UnexpectedAlertPresentException, ActionChains)

    time.sleep(random.uniform(3, 5))

    tmallCode(driver, wait, EC)

    try:
        wait.until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, 'attributes-list')))  # 显性等待
    except Exception as e:
        print '显性未加载成功---%s' % e

    time.sleep(random.randint(5, 6))  # 这里得让他睡眠一下,否则第二页开始会报错(加载数据)
    doc = pq(driver.page_source)
    shopName = doc.find('.shop-name-link').text()
    Title = doc.find('#J_Title .tb-main-title').attr('data-title')

    # print '类型---%s'%categoryId
    styleData = doc.find('.attributes-list').children().items()
    # 风格
    StyleName = styleNames(styleData)
    # 因为styleData是一个迭代器,被循环完的就会被释放掉(品牌有可能在查找风格的时候循环过去了,已经被释放掉了),所以这里得重新赋值数据源
    brandData = doc.find('.attributes-list').children().items()
    # 品牌
    brand = brandName(brandData)

    categoryName = categoryNamesQly('541901945837')

    spuIds = "shopId           : '(.*?)',"
    shopId = re.findall(spuIds, driver.page_source, re.S)[0]

    sellerIds = "sellerId         : '(.*?)',"
    sellerId = re.findall(sellerIds, driver.page_source, re.S)[0]
    categoryIds = " cid           : '(.*?)',"
    categoryId = re.findall(categoryIds, driver.page_source, re.S)[0]
    print '店铺名-----%s' % shopName, Title, StyleName, brand, categoryName, shopId, sellerId, categoryId

    time.sleep(random.randint(15, 20))  # 这里得让他睡眠一下,否则第二页开始会报错(加载数据)

    time.sleep(random.randint(4, 5))  # 这里得让他睡眠一下,否则第二页开始会报错(加载数据)

    js = "var q = document.documentElement.scrollTop = 1000"
    driver.execute_script(js)

    time.sleep(random.uniform(3, 5))

    driver.find_element_by_xpath('//*[@id="J_TabBar"]/li[2]/a').click()
    time.sleep(random.uniform(3, 5))

    BounceCommentLogin(driver)

    dragger = driver.find_element_by_class_name("sorting")
    action = ActionChains(driver)
    action.move_to_element(dragger)

    time.sleep(random.uniform(3, 5))

    action.click_and_hold(on_element=dragger).perform()

    driver.find_element_by_xpath(
        '//*[@id="reviews"]/div/div/div/div/div/div[1]/div[2]/div/ul/li[2]'
    ).click()

    # print '源码-----%s'%driver.page_source

    action.release()
    time.sleep(random.uniform(5, 8))

    driver.delete_all_cookies()

    print '测试一下吧-------1'
    time.sleep(random.uniform(5, 8))

    while True:
        BounceCommentLogin(driver)
        LoginCodeVerificatin(driver)
        time.sleep(random.uniform(5, 8))
        doc = pq(driver.page_source)

        print '源码吧----%s' % doc.find(
            '.J_KgRate_ReviewItem.kg-rate-ct-review-item').text()

        for data in doc.find(
                '.J_KgRate_ReviewItem.kg-rate-ct-review-item').items():

            RateDate = strToDateTime(
                str(data.find('.tb-r-act-bar .tb-r-info .tb-r-date').text()),
                'fiveAllWordTypes')
            TaoBaoComment = data.find(
                '.tb-rev-item .J_KgRate_ReviewContent.tb-tbcr-content ').text(
                )
            PhotoItems = data.find(
                '.tb-rev-item-media .kg-photo-viewer-thumb-bar.tb-tbcr-mt .photo-item'
            ).items()
            auctionSku = data.find('.tb-r-act-bar .tb-r-info').text()

            #追加内容
            appendContent = data.find(
                '.tb-rev-item tb-rev-item-append .tb-rev-item.tb-rev-item-append .J_KgRate_ReviewContent.tb-tbcr-content '
            ).text()
            if '颜色分类:' in auctionSku:
                auctionSku = '颜色分类:' + auctionSku.split('颜色分类:')[-1]
            else:
                auctionSku = '-'

            Phostos = []
            for photo in PhotoItems:
                # TODO:XDF 这里要注意,源码中可能存在xmlns,用pq是爬取不到的,要用lxml的tree抓取(非常坑爹)
                if 'xmlns' in photo.html():
                    selector = etree.HTML(photo.html())
                    Img = str(selector.xpath('//img/@src')[0])
                    if 'https:' not in Img:
                        Image = 'https:' + Img.replace('40x40', '400x400')
                    else:
                        Image = Img
                    Phostos.append(Image)

                else:
                    print '**************不存在xmlns啦*****************'

                # print '相片----%s'%Image#photo.html()
            print '测试一下吧------', data.find('.from-whom').text(
            ), RateDate, TaoBaoComment, Phostos, auctionSku, appendContent
        # print '源码-----%s' %doc.find('.tb-revbd').text()
        time.sleep(random.uniform(5, 8))
        driver.find_element_by_xpath('//*[@class="pg-next"]').click()
        time.sleep(random.uniform(5, 8))
示例#5
0
def tmallGivenIDAndShopName():
    # TODO:XDF Chrome欲歌浏览器
    # options = webdriver.ChromeOptions()
    # # options.add_extension('AdBlock_v3.15.0.crx') # TODO:XDF Chrome欲歌广告过滤插件
    # # 设置中文
    # options.add_argument('lang=zh_CN.UTF-8')
    # prefs = {"profile.managed_default_content_settings.images": 2}
    # options.add_experimental_option("prefs", prefs)  # TODO:XDF 禁止加载图片
    # # 更换头部
    # options.add_argument(
    #     'user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"')
    # # driver = webdriver.Chrome(chrome_options=options,
    # #                           executable_path=r'/Users/zhuoqin/Desktop/Python/SeleniumDemo/chromedriver')
    # driver = webdriver.Chrome(executable_path=r'/Users/zhuoqin/Downloads/123456/chromedriver')  # chrome_options=options,
    # wait = WebDriverWait(driver, 200, 0.5)  # 表示给browser浏览器一个10秒的加载时间

    # TODO:XDF PhantomJS无头浏览器
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"
    )  # 设置user-agent请求头
    dcap["phantomjs.page.settings.loadImages"] = False  # 禁止加载图片

    print('即将开始。。。')
    service_args = []
    service_args.append('--load-images=no')  ##关闭图片加载
    service_args.append('--disk-cache=yes')  ##开启缓存
    service_args.append('--ignore-ssl-errors=true')  ##忽略https错误
    #TODO:XDF 针对本地调试
    # driver = webdriver.PhantomJS(executable_path=r'/Users/zhuoqin/Desktop/Python/SeleniumDemo/phantomjs',service_args=service_args,desired_capabilities=dcap)
    #
    driver = webdriver.PhantomJS(executable_path=r'/usr/bin/phantomjs',
                                 service_args=service_args,
                                 desired_capabilities=dcap)  # TODO:XDF 针对Linux
    # # TODO:XDF 针对Linux服务器
    wait = WebDriverWait(driver, 60, 0.5)  # 表示给browser浏览器一个10秒的加载时间
    #
    driver.implicitly_wait(30)
    driver.set_page_load_timeout(30)
    print('等待中。。。')

    while True:
        BaseInfo = TmallYuShouBaseInfoData()
        for k in range(0, len(BaseInfo)):
            time.sleep(random.uniform(5, 8))
            TreasureID = str(BaseInfo[k]).replace(' ', '')
            driver.get('https://detail.tmall.com/item.htm?id=%s' % TreasureID)
            print('https://detail.tmall.com/item.htm?id=%s' % TreasureID)
            # ID = str(ownShopID['shopID'][k])
            # JudgeLoginSuccess(driver,UnexpectedAlertPresentException,ActionChains)

            tmallLogin(driver, UnexpectedAlertPresentException, ActionChains)

            time.sleep(random.uniform(3, 5))
            if judgeProduct(driver) == True:
                print '商品不存在'
                continue
            tmallCode(driver, wait, EC)

            try:
                wait.until(
                    EC.presence_of_element_located(
                        (By.CLASS_NAME, 'tb-detail-hd')))  # 显性等待
            except Exception as e:
                print '显性未加载成功---%s' % e

            time.sleep(random.randint(3, 5))  #这里得让他睡眠一下,否则第二页开始会报错(加载数据)

            # driver.implicitly_wait(30) #隐性等待30秒,如果30之内页面加载完毕,往下执行,否则超时会报错,需要处理
            html = driver.page_source  #这是一面的页面内容
            # print '源码内容----%s'%html
            print('等待中。。。%s' % k)
            if 'tmall' in driver.current_url:
                detailURL = 'https://detail.tmall.com/item.htm?id=' + TreasureID
            else:
                detailURL = 'https://item.taobao.com/item.htm?id=' + TreasureID
            try:
                doc = pq(html)

                if WhetherYuShou(doc) == False:
                    print '-----不是预售-----跳过'
                    continue

                # TODO:XDF 这里需要注意一下,src图片链接可以不丰在https,需要自己手动拼接
                mainPics = doc.find('#J_ImgBooth').attr('src')
                if 'https:' in mainPics:
                    mainPic = mainPics
                else:
                    mainPic = 'https:' + mainPics

                # TODO:XDF 这里要注意,源码中可能存在xmlns,用pq是爬取不到的,要用lxml的tree抓取(非常坑爹)
                # if 'xmlns' in doc.find('.tb-detail-hd').html():
                #     print ('存在xmlns--%s'%doc.find('.tb-detail-hd').html())
                #     titles = doc.find('.tb-detail-hd').html()
                #     selector = etree.HTML(titles)
                #     title = str(selector.xpath('//h1/text()')[0]).replace('\r\n','').replace(' ','').replace('\n','').replace('\t','')
                #     print (title)
                # else:
                #     title = doc.find('#detail .tb-detail-hd h1').text().replace('\r\n', '').replace(' ', '').replace('\n', '').replace('\t','')

                presellPrice = clearToReplaceData(
                    doc.find('#J_PromoBox').text(), 1)
                address = doc.find('#J_deliveryAdd').text()
                #收藏人数

                popularity = clearToReplaceData(
                    doc.find('#J_CollectCount').text(), 0)

                reserveCount = clearToReplaceData(
                    doc.find('.tb-wrt-guc').text(), 3)

                paymentDate = doc.find('.J_step2Time').text().split('~')
                # driver.save_screenshot('RecordProcess/ceShiprocess%s.png' % k)
                try:
                    paymentBeginDate = paymentDate[0]
                    paymentFinishDate = paymentDate[1]
                except Exception as e:
                    print('miss2---%s' % e)

                detailPrice = clearToReplaceData(
                    doc.find('#J_StrPriceModBox .tm-price').text(), 5)
                categoryIdContent = clearToReplaceData(
                    str(doc.find('#J_ZebraPriceDesc').attr('mdv-cfg')), 2)
                print(categoryIdContent)

                spuIds = 'TShop.Setup\((.*?)\);'
                apiData = re.findall(spuIds, html, re.S)[0]
                datas = json.loads(apiData)
                brandId = datas['itemDO']['brandId']
                categoryId = datas['itemDO']['categoryId']
                rootCatId = datas['itemDO']['rootCatId']
                spuId = datas['itemDO']['spuId']
                title = datas['itemDO']['title']
                sellerId = datas['rateConfig']['sellerId']
                shopID = sellerId
                shopName = doc.find('.slogo-shopname').text()
                categoryName = categoryNamesQly(TreasureID)

                # print '类型---%s'%categoryId
                styleData = doc.find('#J_AttrUL').children().items()
                # 风格
                StyleName = styleNames(styleData)
                # 因为styleData是一个迭代器,被循环完的就会被释放掉(品牌有可能在查找风格的时候循环过去了,已经被释放掉了),所以这里得重新赋值数据源
                brandData = doc.find('#J_AttrUL').children().items()
                # 品牌
                brand = brandName(brandData)
                # 评价描述评分
                EvaluationScores = evaluationScoreURL(str(TreasureID),
                                                      str(spuId),
                                                      str(sellerId))
                URL_NO = doc.find('#LineZing').attr('shopid')

                try:
                    ShopURL = str(doc.find('.shopLink').attr('href'))
                    if len(ShopURL):
                        ShopURL = clearToReplaceData(ShopURL, 4)
                    else:
                        ShopURL = '-'
                except Exception as e:
                    print e

                print TreasureID, shopName, categoryName, datetime.datetime.now(
                ).strftime(
                    '%Y-%m-%d %H:%M:%S'
                ), detailPrice, address, detailURL, title, mainPic, presellPrice, popularity  #, paymentBeginDate, paymentFinishDate, reserveCount

                # product = {
                #     'title': title,
                #     'ID': ID,
                #     'addRess': address,
                #     'shopName': shopName,
                #     'mainPic': mainPic,
                #     'detailURL': detailURL,
                #     'detailPrice': detailPrice,
                #     'popularity': popularity,
                #     'reserveCount': reserveCount,
                #     'paymentBeginDate': paymentBeginDate,
                #     'paymentFinishDate': paymentFinishDate,
                #     'presellPrice': presellPrice,
                #     'categoryId': int(categoryId),
                #     'categoryName': categoryName,
                #     'spiderTime': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                #     'ShopID':shopID,
                #     'brandId':brandId,
                #     'brand':brand,
                #     'spuId':spuId,
                #     'rootCatId':int(rootCatId),
                #     'StyleName':StyleName,
                #     'EffectiveTime':'',
                #     'ReservationStatus':0,
                #     'ReNewPreSaleTime':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                #     'JHSReNewTime':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                #     'CollectionNum':0,
                #     'JHSmodifyTime':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                #     'ItemName':'',
                #     'EvaluationTime':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                #     'NCategory_Name':'',
                #     'Is_Search':1,
                #     'NStyleName':'',
                #     'NewstPrice':0,
                #     'SkuModifyDate':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                #     'TempleteTime': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                #     'EvaluationScores':float(EvaluationScores),
                #     'URL_NO':URL_NO,
                #     'ShopURL':ShopURL,
                #     'sellerId':sellerId
                # }
                StartTime = datetime.datetime.strptime(
                    str(allShopName['YuShouStartTime'][0]),
                    '%Y-%m-%d %H:%M:%S')

                EndTime = datetime.datetime.strptime(
                    str(allShopName['YuShouEndTime'][0]), '%Y-%m-%d %H:%M:%S')
                product = {
                    'title':
                    title,
                    'TreasureID':
                    TreasureID,
                    'addRess':
                    address,
                    'shopName':
                    shopName,
                    'mainPic':
                    mainPic,
                    'detailPrice':
                    detailPrice,
                    'popularity':
                    popularity,
                    'reserveCount':
                    int(reserveCount),
                    'paymentBeginDate':
                    strToDateTime(str(paymentBeginDate), 'fiveColonTypes'),
                    'paymentFinishDate':
                    strToDateTime(str(paymentFinishDate), 'fiveColonTypes'),
                    'presellPrice':
                    presellPrice,
                    'categoryId':
                    int(categoryId),
                    'categoryName':
                    categoryName,
                    'spiderTime':
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'ShopID':
                    shopID,
                    'brandId':
                    brandId,
                    'brand':
                    brand,
                    'spuId':
                    spuId,
                    'rootCatId':
                    int(rootCatId),
                    'StyleName':
                    StyleName,
                    'EffectiveTime':
                    '',
                    'ReservationStatus':
                    0,
                    'CollectionNum':
                    int(popularity),
                    'ItemName':
                    '',
                    'NCategory_Name':
                    '',
                    'Is_Search':
                    1,
                    'NStyleName':
                    ' ',
                    'NewstPrice':
                    0,
                    'EvaluationScores':
                    float(EvaluationScores),
                    'URL_NO':
                    URL_NO,
                    'ShopURL':
                    ShopURL,
                    'sellerId':
                    sellerId,
                    'productState':
                    '1',
                    'JHSmodifyTime':
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'modifyTime':
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'StartTime':
                    StartTime,
                    'EndTime':
                    EndTime,
                    'detailURL':
                    detailURL
                }

                if str(dbChoice['tmallYuShouSql'][0]) == 'Mongodb':

                    if TreasureID in BaseInfo:
                        UpdateTmallBaseInfoTB(product)
                        UpdateTmallYuShouTB(product)

                    else:
                        saveTmallGivenIDToYuShouTB(product)
                        saveTmallBaseInfoTBToMongodb(product)
                else:
                    if judgeHaveTreasureID(product) == True:
                        print '存在------'
                        InsertOrUpdateBaseInfo(product, 'Update')
                        print '更新成功------'
                    else:
                        print '不存在吧------'
                        InsertOrUpdateBaseInfo(product, 'Insert')
                        print '存入成功------'
                    # InsertPreSaleNew(product)

            except Exception as e:
                print('error---%s' % e)

        time.sleep(7200)

    print('---------------名字----1')

    time.sleep(random.randint(4, 7))
    driver.close()
    driver.quit()