def __init__(self):
    """Initialize the file logger.

    Creates today's log directory, builds a timestamped log file name,
    and attaches a DEBUG-level FileHandler to the root logger.
    """
    # Create the log folder for today.
    nowTime = common.getNowTime()
    self.nowDate = common.getNowTime()
    self.logPath = common.getDirPath(nowTime, "log")
    common.mkdir(self.logPath)
    # Build the log file name.
    # BUGFIX: the original format '%Y%m%d_%H_%M_%S_%M' repeated the
    # minute field; the trailing duplicate '%M' is dropped.
    self.logName = 'jsTag_' + time.strftime(
        '%Y%m%d_%H_%M_%S', time.localtime(time.time())) + '.log'
    self.logFile = self.logPath + self.logName
    # 1. Formatter: timestamp - file[line] - level: message
    self.logFormat = '%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s'
    self.formatter = logging.Formatter(self.logFormat)
    # 2. Handler controlling how records are written to the file.
    #    Levels: CRITICAL > ERROR > WARNING > INFO > DEBUG (default WARNING).
    self.handler = logging.FileHandler(self.logFile, mode='a')
    self.handler.setLevel(logging.DEBUG)
    self.handler.setFormatter(self.formatter)
    # 3. Configure the (root) logger.
    self.logger = logging.getLogger()
    # BUGFIX: without lowering the logger's own threshold, the root
    # logger's default WARNING level discards DEBUG/INFO records before
    # they ever reach the DEBUG-level handler attached above.
    self.logger.setLevel(logging.DEBUG)
    self.logger.addHandler(self.handler)
def test_AOL():
    """Scrape the ad-tag report from AOL OneMobile with headless Chrome
    and save it as an Excel workbook.

    Retries up to 5 times; every failure is logged via comm_logging and
    the browser is always quit in the ``finally`` block.
    """
    print("............AOL................")

    # Return the report date (day before yesterday) as "YYYY-MM-DD".
    # Before ~4am local time, shift one extra day back so the upstream
    # report has had time to be generated.
    def getDate():
        d = datetime.now() + timedelta(days=-2)
        d1 = d + timedelta(days=-1)
        if (int(d.strftime('%Y-%m-%d %H:%M:%S')[11:13]) <= 3):
            str_d = d1.strftime('%Y-%m-%d %H:%M:%S')
        else:
            str_d = d.strftime('%Y-%m-%d %H:%M:%S')
        yyyy1 = str_d[0:4]
        mmmm1 = str_d[5:7]
        dddd1 = str_d[8:10]
        logInURL1 = yyyy1 + "-" + mmmm1 + "-" + dddd1
        return logInURL1

    flag = False  # set True only after the workbook is saved successfully
    for try_num in range(5):
        print("AOL第" + str(try_num + 1) + "次尝试-----------")
        try:
            chrome_options = webdriver.ChromeOptions()
            # Headless (no-UI) browser mode.
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--disable-gpu')
            # NOTE(review): the UA string is passed without a 'user-agent='
            # prefix, so Chrome likely ignores it — confirm intent.
            chrome_options.add_argument(
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36')
            browser = webdriver.Chrome(chrome_options=chrome_options)
            loginURL = "https://onemobile.aol.com/"
            dataURL = "https://onemobile.aol.com/#/seller/39625/reports"
            browser.get(loginURL)
            print("点击登录---------------------------------")
            browser.find_element_by_id("native-login").click()
            time.sleep(15)
            # Credentials live in the DB, keyed by platform name.
            username, password = common_mysql.selectFromTb("AOL")
            print("输入用户名---------------------------------")
            browser.find_element_by_xpath("//input[@placeholder='Username']").send_keys(username)
            time.sleep(2)
            print("确认用户名,点击下一步---------------------------------")
            browser.find_element_by_xpath("//input[@name='callback_2']").click()
            time.sleep(10)
            print("输入密码---------------------------------")
            browser.find_element_by_xpath("//input[@placeholder='Password']").send_keys(password)
            time.sleep(2)
            print("点击登录按钮---------------------------------")
            browser.find_element_by_xpath("//input[@name='callback_2']").click()
            time.sleep(10)
            browser.get(dataURL)
            # Wait for the page to render.  15s is conservative; ~5s is
            # usually enough on a good connection.
            t = 15
            while t > 0:
                print("网页加载中……倒计时" + str(t) + "秒后开始操作……")
                time.sleep(1)
                t = t - 1
            # A modal popup appears intermittently; close it if present.
            try:
                print("点击X")
                browser.find_element_by_xpath("//a[@data-dismiss='modal']").click()
                print("关闭弹窗成功")
            except Exception as e:
                errorInfo = traceback.format_exc()
                print(errorInfo)
                comm_logging.myLogger.write_logger(errorInfo)
            print("选择开始日期%s---------------------------------" % getDate())
            browser.find_element_by_xpath("//input[@placeholder='YYYY-MM-DD'][1]").clear()
            time.sleep(3)
            browser.find_element_by_xpath("//input[@placeholder='YYYY-MM-DD'][1]").send_keys(getDate())
            browser.find_element_by_xpath("//input[@placeholder='YYYY-MM-DD'][2]").clear()
            time.sleep(3)
            print("选择结束日期---------------------------------")
            browser.find_element_by_xpath("//input[@placeholder='YYYY-MM-DD'][2]").send_keys(getDate())
            time.sleep(3)
            print("点击确认日期---------------------------------")
            # There are 3 <a> elements with class 'datepicker-apply-button
            # pendo-id-datepicker-apply-button'; the button panel's first
            # <a> is the apply button, so no index is needed here.
            browser.find_element_by_xpath("//div[@class='datepicker-button-panel']/a").click()
            print("点击查询---------------------------------")
            browser.find_element_by_xpath("//button[@class='e-btn button-primary pendo-id-generate-report']").click()
            print("点击第一行数据---------------------------------")
            time.sleep(5)
            browser.find_element_by_xpath("//tbody[@aria-live='polite']/tr[1]").click()
            print("点击维度选择---------------------------------")
            time.sleep(2)
            browser.find_elements_by_xpath("//span[@class='title']")[2].click()
            time.sleep(1)
            print("点击tag维度---------------------------------")
            browser.find_element_by_xpath("//li[@data-sid='report-dimension-adTagId']").click()
            print("等待加载数据---------------------------------")
            time.sleep(10)
            dimensionsOfdata = ["Ad Tag", "Requests", "Served", "Delivered", "Fill Rate", "Clicks", "CTR", "Revenue", "eCPM", "RPM"]
            workbook = xlwt.Workbook(encoding='utf-8')
            worksheet = workbook.add_sheet('AOL')
            # Header row.
            col = 0
            for di in dimensionsOfdata:
                worksheet.write(0, col, dimensionsOfdata[col])
                col = col + 1
            print("开始爬数据")
            soup = BeautifulSoup(browser.page_source, "lxml")
            # find() returns the first match; findAll() returns a list of
            # every matching tag.
            table = soup.find("table", {"class": "table table-body table-nexage tablesorter tablesorter-default hasResizable"})
            tbody = table.find("tbody")
            trSum = tbody.findAll("tr")  # all rows of the report table
            row = 1
            for tr in trSum:
                # col must be reset per row, otherwise xlwt raises:
                # ValueError: column index (256) not an int in range(256)
                col = 0
                # tdSum: all cells of this row.  Each row ends with an
                # empty <td></td>, which is skipped by the strip() check.
                tdSum = tr.findAll("td")
                for td in tdSum:
                    if td.text.strip() != '':
                        if col == 0:
                            worksheet.write(row, col, td.text)
                        else:
                            worksheet.write(row, col, float(td.text.replace(",", "").replace("%", "").replace("$", "")))
                        col = col + 1
                row = row + 1
            nowTime = common.getNowTime()
            # The directory path uses only the date portion of nowTime.
            path = common.getDirPath(nowTime)
            excelName = common.getExcelName(nowTime, "AOL")
            common.mkdir(path)
            workbook.save(path + excelName)
            print(" excel保存成功,路径:" + path + "-----------")
            print("||||||||||||AOL抓取完毕||||||||||||||")
            flag = True
        except Exception as e:
            flag = False
            errorInfo = traceback.format_exc()
            print(errorInfo)
            comm_logging.myLogger.write_logger(errorInfo)
            continue
        finally:
            browser.quit()
        if (flag or try_num >= 4):
            break
def test_Mobfox():
    """Fetch yesterday's Mobfox exchange report via the JSON reporting
    API (rendered in the browser) and save it as an Excel workbook.

    Retries up to 3 times; failures are logged via comm_logging and the
    browser is always quit.
    """
    print("............Mobfox................")
    flag = False
    for try_num in range(3):
        print("Mobfox第" + str(try_num + 1) + "次尝试-----------")
        yesterday = common.getNowTime(delta=-1, type="-")[0:10]
        try:
            chrome_options = webdriver.ChromeOptions()
            # Headless (no-UI) browser mode.
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument(
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            )
            # Start the browser used to fetch the pages.
            browser = webdriver.Chrome(chrome_options=chrome_options)
            loginURL = "https://account.mobfox.com/www/cp/login.php"
            dataURL = "https://account.mobfox.com/www/cp/exchange_reporting.php"
            dataInnerURL = "https://account.mobfox.com/react/exchange-reporting?apikey=065f325c0728e09132ebf4cedfe10ed3&accountid=72511&hash=a305f8299d4f99e855a6a8aa26d3a221&env=prod&siteRoot=https://account.mobfox.com:443/&apiRoot=https://api-v3.mobfox.com"
            dataLoadURL = "https://api-v3.mobfox.com/publisher/report?apikey=065f325c0728e09132ebf4cedfe10ed3&from=" + yesterday + "&to=" + yesterday + "&tz=Asia%2FHong_Kong&group=inventory_id&timegroup=day&totals=total_impressions%2Ctotal_served%2Ctotal_ad_source_opportunities%2Ctotal_clicks%2Ctotal_earnings&f%3Aad_source=exchange&o%3Ainclude_entities=true"
            browser.get(loginURL)
            time.sleep(2)
            print("输入用户名密码---------------------------------")
            username, password = common_mysql.selectFromTb("Mobfox")
            browser.find_element_by_id("email").send_keys(username)
            browser.find_element_by_id("password").send_keys(password)
            # A cookie-consent popup appears intermittently; accept it if present.
            try:
                print("点击accept")
                browser.find_element_by_xpath(
                    '//a[@class="optanon-allow-all"]').click()
                print("关闭弹窗成功")
            except Exception as e:
                errorInfo = traceback.format_exc()
                comm_logging.myLogger.write_logger(errorInfo)
                print(traceback.format_exc())
                print("无弹窗 或 弹窗关闭失败")
            time.sleep(5)
            browser.find_element_by_xpath(".//*[@type='submit']").click()
            print("跳转到数据页面---------------------------------")
            browser.get(dataURL)
            time.sleep(2)
            browser.get(dataInnerURL)
            print("自动选择数据维度内容---------------------------------")
            browser.get(dataLoadURL)
            time.sleep(2)
            workbook = xlwt.Workbook(encoding='utf-8')
            worksheet = workbook.add_sheet('Mobfox')
            print("开始爬数据")
            soup = BeautifulSoup(browser.page_source, "lxml")
            # The API response is JSON rendered inside a <pre> element.
            dataSoup = soup.find("pre").text
            dataJson = json.loads(dataSoup)
            dataCol = dataJson["columns"]
            # dataResults is rowNum rows of 7 raw values each; the output
            # sheet remaps/derives 10 columns from them below.
            dataResults = dataJson["results"]
            rowNum = dataResults.__len__()
            dimensions = [
                "day", "inventory", "source_opportunities", "total_served",
                "total_impressions", "total_clicks", "ctr", "fillrate",
                "total_earnings", "ecpm"
            ]
            for col in range(10):
                worksheet.write(0, col, dimensions[col])
            try:
                for row in range(0, rowNum):
                    for col in range(10):
                        if col == 0 or col == 3 or col == 5:
                            # day / total_served / total_clicks map 1:1.
                            worksheet.write(row + 1, col, dataResults[row][col])
                        elif col == 1:
                            # Resolve inventory id -> "Name (id)" via the
                            # entities map in the same response.
                            InventoryID = str(dataResults[row][col])
                            InventoryName = dataJson["entities"][
                                "inventory_id"][InventoryID]["name"]
                            Inventory = InventoryName + " (" + InventoryID + ")"
                            worksheet.write(row + 1, col, Inventory)
                        elif col == 2:
                            worksheet.write(row + 1, col, dataResults[row][4])
                        elif col == 4:
                            worksheet.write(row + 1, col, dataResults[row][2])
                        elif col == 6:
                            # CTR = clicks / impressions * 100, guarding
                            # against division by zero.
                            if dataResults[row][4] != 0:
                                ctr = dataResults[row][5] / dataResults[row][
                                    4] * 100
                            else:
                                ctr = "#DIV/!"
                            worksheet.write(row + 1, col, ctr)
                        elif col == 7:
                            # Fill rate = impressions / opportunities * 100.
                            if dataResults[row][2] != 0:
                                fillRate = dataResults[row][4] / dataResults[
                                    row][2] * 100
                            else:
                                fillRate = "#DIV/!"
                            worksheet.write(row + 1, col, fillRate)
                        elif col == 8:
                            worksheet.write(row + 1, col, dataResults[row][6])
                        elif col == 9:
                            # NOTE(review): eCPM here is earnings/impressions
                            # * 100, not * 1000 — confirm this is intended.
                            if dataResults[row][4] != 0:
                                ecpm = dataResults[row][6] / dataResults[row][
                                    4] * 100
                            else:
                                ecpm = "#DIV/!"
                            worksheet.write(row + 1, col, ecpm)
            except Exception:
                print(traceback.format_exc())
                errorInfo = traceback.format_exc()
                comm_logging.myLogger.write_logger(errorInfo)
                # Abandon this attempt and move to the next retry.
                continue
            nowTime = common.getNowTime()
            # The directory path uses only the date portion of nowTime.
            path = common.getDirPath(nowTime)
            excelName = common.getExcelName(nowTime, "Mobfox")
            common.mkdir(path)
            workbook.save(path + excelName)
            print(" excel保存成功,路径:" + path + "-----------")
            print("||||||||||||Mobfox抓取完毕||||||||||||||")
            flag = True
        except Exception as e:
            flag = False
            print(traceback.format_exc())
            errorInfo = traceback.format_exc()
            comm_logging.myLogger.write_logger(errorInfo)
            continue
        finally:
            browser.quit()
        # NOTE(review): try_num == 4 can never be true with range(3); the
        # loop only ends early on success (flag) or by exhausting retries.
        if (flag or try_num == 4):
            break
def test_Mobfox():
    """Scrape the Mobfox exchange report from the dashboard UI (DataTables
    grid) and save it as an Excel workbook.

    NOTE(review): this redefines ``test_Mobfox`` from earlier in this
    module; being defined later, it silently shadows the API-based
    variant.  One of the two should be renamed.

    Retries up to 3 times; failures are logged via comm_logging and the
    browser is always quit.
    """
    print("............Mobfox................")
    flag = False
    for try_num in range(3):
        print("Mobfox第" + str(try_num + 1) + "次尝试-----------")
        try:
            chrome_options = webdriver.ChromeOptions()
            # Headless mode is deliberately disabled for this variant.
            # chrome_options.add_argument('--headless')
            # chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument(
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            )
            # Start the browser used to fetch the pages.
            browser = webdriver.Chrome(chrome_options=chrome_options)
            loginURL = "https://account.mobfox.com/www/cp/login.php"
            dataURL = "https://account.mobfox.com/www/cp/exchange_reporting.php"
            dataInnerURL = "https://account.mobfox.com/react/exchange-reporting?apikey=065f325c0728e09132ebf4cedfe10ed3&accountid=72511&hash=a305f8299d4f99e855a6a8aa26d3a221&env=prod&siteRoot=https://account.mobfox.com:443/&apiRoot=https://api-v3.mobfox.com"
            browser.get(loginURL)
            time.sleep(2)
            print("输入用户名密码---------------------------------")
            # SECURITY FIX: credentials were hard-coded in plaintext here.
            # Load them from the credential store, consistent with every
            # other scraper in this module.
            username, password = common_mysql.selectFromTb("Mobfox")
            browser.find_element_by_id("email").send_keys(username)
            browser.find_element_by_id("password").send_keys(password)
            # A cookie-consent popup appears intermittently; accept it if present.
            try:
                print("点击accept")
                browser.find_element_by_xpath(
                    '//a[@class="optanon-allow-all"]').click()
                print("关闭弹窗成功")
            except Exception as e:
                errorInfo = traceback.format_exc()
                comm_logging.myLogger.write_logger(errorInfo)
                print(traceback.format_exc())
                print("无弹窗 或 弹窗关闭失败")
            time.sleep(5)
            browser.find_element_by_xpath(".//*[@type='submit']").click()
            print("跳转到数据页面---------------------------------")
            browser.get(dataURL)
            time.sleep(2)
            browser.get(dataInnerURL)
            print("自动选择数据维度内容---------------------------------")
            time.sleep(10)
            s = browser.find_element_by_xpath('//div[@id="content"]')
            html = browser.find_element_by_xpath(
                '//html[@class=" supports cssfilters"]')
            # Pick the "Yesterday" preset and trigger the Excel export so
            # the grid is populated with yesterday's data.
            browser.find_element_by_xpath('//li[text()="Yesterday"]').click()
            browser.find_element_by_xpath(
                '//button[@title="Download Excel"]').click()
            time.sleep(2)
            browser.find_element_by_xpath('//button[@type="submit"]').click()
            # Header labels mirror the grid's column ids.
            dimensionsOfdata = [
                "col-day sorting_1", "col-inventory_id",
                "col-total_ad_source_opportunities", "col-total_served",
                "col-total_impressions", "col-total_clicks", "col-ctr",
                "col-fillrate", "col-total_earnings", "col-ecpm"
            ]
            workbook = xlwt.Workbook(encoding='utf-8')
            worksheet = workbook.add_sheet('Mobfox')
            print("开始爬数据")
            soup = BeautifulSoup(browser.page_source, "lxml")
            tbody = soup.find("div", {
                "class": "dataTables_scrollBody"
            }).find("tbody")
            trSum = tbody.findAll("tr")
            # Header row.
            col = 0
            for di in dimensionsOfdata:
                worksheet.write(0, col, dimensionsOfdata[col])
                col = col + 1
            row = 1
            for tr in trSum:
                # col must be reset per row, otherwise xlwt raises:
                # ValueError: column index (256) not an int in range(256)
                col = 0
                tdSum = tr.findAll("td")
                for td in tdSum:
                    # Write numeric cells as floats (stripping thousands
                    # separators, % and $); fall back to raw text.
                    try:
                        worksheet.write(
                            row, col,
                            float(td.text.strip().replace(",", "").replace(
                                "%", "").replace("$", "")))
                    except ValueError:
                        worksheet.write(row, col, td.text.strip())
                    col = col + 1
                row = row + 1
            nowTime = common.getNowTime()
            # The directory path uses only the date portion of nowTime.
            path = common.getDirPath(nowTime)
            excelName = common.getExcelName(nowTime, "Mobfox")
            common.mkdir(path)
            workbook.save(path + excelName)
            print(" excel保存成功,路径:" + path + "-----------")
            print("||||||||||||Mobfox抓取完毕||||||||||||||")
            flag = True
        except Exception as e:
            flag = False
            print(traceback.format_exc())
            errorInfo = traceback.format_exc()
            comm_logging.myLogger.write_logger(errorInfo)
            continue
        finally:
            browser.quit()
        # NOTE(review): try_num == 4 can never be true with range(3); the
        # loop only ends early on success (flag) or by exhausting retries.
        if (flag or try_num == 4):
            break
def test_Mopub():
    """Scrape Mopub revenue data (Banner and Native) from the Metamarkets
    dashboard with headless Chrome and save both as sheets of one Excel
    workbook.

    Retries up to 3 times; failures are logged via comm_logging and the
    browser is always quit.
    """
    print("............Mopub................")
    flag = False
    for try_num in range(3):
        print("Mopub第" + str(try_num + 1) + "次尝试-----------")
        try:
            chrome_options = webdriver.ChromeOptions()
            # Headless (no-UI) browser mode.
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument(
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            )
            # Start the browser used to fetch the pages.
            browser = webdriver.Chrome(chrome_options=chrome_options)

            # Build the dashboard URL for the requested resource.
            # NOTE(review): the parameter shadows the builtin ``str``; any
            # value other than the two known names leaves logInURL unbound
            # (UnboundLocalError on return) — worth renaming/hardening.
            def getURL(str):
                d = datetime.now() + timedelta(days=-1)
                d1 = d + timedelta(days=-1)
                d2 = d + timedelta(days=-2)
                # Before ~4am local time, shift the window one day back.
                if (int(d.strftime('%Y-%m-%d %H:%M:%S')[11:13]) <= 3):
                    str_d1 = d1.strftime('%Y-%m-%d %H:%M:%S')
                    str_d2 = d2.strftime('%Y-%m-%d %H:%M:%S')
                else:
                    str_d1 = d.strftime('%Y-%m-%d %H:%M:%S')
                    str_d2 = d1.strftime('%Y-%m-%d %H:%M:%S')
                yyyy1 = str_d1[0:4]
                mmmm1 = str_d1[5:7]
                dddd1 = str_d1[8:10]
                yyyy2 = str_d2[0:4]
                mmmm2 = str_d2[5:7]
                dddd2 = str_d2[8:10]
                # Mopub Banner dashboard link.
                logInURL1 = "https://dash.metamarkets.com/mopub-360_mobile_security/explore#ed=app_name&fs.0.k=ad_size&fs.0.v.0=300x250&fs.0.v.1=320x50&fs.1.k=timestamp&fs.1.t.0.tr.end=" + yyyy1 + "-" + mmmm1 + "-" + dddd1 + "T16&fs.1.t.0.tr.start=" + yyyy2 + "-" + mmmm2 + "-" + dddd2 + "T16&gm.0=rev_adj&gm.1=auctions&gm.2=win_rate_v2&gm.3=cleared_done&gm.4=ctr&gm.5=ecpm&gm.6=uniques&od.0=ad_size&od.1=adgroup_priority&od.2=adunit_name&od.3=app_version&od.4=app_name&od.5=country&od.6=creative_id&od.7=pub_id&sbd=0&sortBy=rev_adj&sortDim=0&sortDir=descending&td=time_day&tm.0=rev_adj&tm.1=auctions&tm.2=win_rate_v2&tm.3=cleared_done&tm.4=ctr&tm.5=ecpm&tm.6=uniques&tz=Asia~2fShanghai&zz=4"
                # Mopub Native dashboard link.
                logInURL2 = "https://dash.metamarkets.com/mopub-360_mobile_security/explore#ed=app_name&fs.0.k=ad_size&fs.0.v.0=0x0&fs.0.v.1=320x480&fs.1.k=timestamp&fs.1.t.0.tr.end=" + yyyy1 + "-" + mmmm1 + "-" + dddd1 + "T16&fs.1.t.0.tr.start=" + yyyy2 + "-" + mmmm2 + "-" + dddd2 + "T16&gm.0=rev_adj&gm.1=auctions&gm.2=win_rate_v2&gm.3=cleared_done&gm.4=ctr&gm.5=ecpm&gm.6=uniques&od.0=ad_size&od.1=adgroup_priority&od.2=adunit_name&od.3=app_version&od.4=app_name&od.5=country&od.6=creative_id&od.7=pub_id&sbd=0&sortBy=rev_adj&sortDim=0&sortDir=descending&td=time_day&tm.0=rev_adj&tm.1=auctions&tm.2=win_rate_v2&tm.3=cleared_done&tm.4=ctr&tm.5=ecpm&tm.6=uniques&tz=Asia~2fShanghai&zz=4"
                if str == "Mopub_Banner":
                    logInURL = logInURL1
                elif str == "Mopub_Native":
                    logInURL = logInURL2
                # Return the link matching the requested resource name.
                return logInURL

            # Elements of dimensionsOfdata equal the colid values of the
            # data divs on the dashboard page.
            dimensionsOfdata = [
                "auctions", "cleared_done", "uniques", "rev_adj",
                "win_rate_v2", "ctr", "ecpm"
            ]
            dataRes = ["Mopub_Banner", "Mopub_Native"]
            workbook = xlwt.Workbook(encoding='utf-8')
            for resouce in dataRes:
                worksheet1 = workbook.add_sheet(resouce)
                browser.get(getURL(resouce))
                print("现在开始抓取" + resouce + "的数据。" + "\n链接:" + getURL(resouce))
                # Mopub_Banner runs first, so logging in once on the first
                # pass is sufficient for the whole session.
                if resouce == "Mopub_Banner":
                    username, password = common_mysql.selectFromTb("Mopub")
                    browser.find_element_by_id("form-id1").send_keys(username)
                    browser.find_element_by_id("form-id2").send_keys(password)
                    browser.find_element_by_xpath(
                        "//button[@class='primary login']").click()
                time.sleep(10)
                soup = BeautifulSoup(browser.page_source, "lxml")
                appNameList = soup.findAll("div", colid="app_name")
                # First column: app names.  The header div also carries
                # colid="app_name" but has no nested span like the data
                # divs do, so the first app_name cell ends up blank.
                m = 0
                for appName in appNameList:
                    worksheet1.write(m, 0, appName.span.text)
                    m = m + 1
                # Write the metric columns one dimension at a time.
                j = 1
                for col in dimensionsOfdata:
                    worksheet1.write(0, j, col)
                    colData = soup.findAll("div", colid=col)
                    i = 1
                    for row in colData[1:]:
                        try:
                            worksheet1.write(i, j, float(row.span["title"]))
                            i = i + 1
                        except BaseException as err:
                            print("Exception:", err)
                    j = j + 1
                print(resouce + " 抓取完成----------------")
            nowTime = common.getNowTime()
            # The directory path uses only the date portion of nowTime.
            path = common.getDirPath(nowTime)
            excelName = common.getExcelName(nowTime, "Mopub")
            common.mkdir(path)
            workbook.save(path + excelName)
            print(" excel保存成功,路径:" + path + "-----------")
            browser.quit()
            print("||||||||||||Mopub抓取完毕||||||||||||||")
            flag = True
        except:
            flag = False
            print(traceback.format_exc())
            errorInfo = traceback.format_exc()
            comm_logging.myLogger.write_logger(errorInfo)
            continue
        finally:
            browser.quit()
        # NOTE(review): try_num == 3 can never be true with range(3); the
        # loop only ends early on success (flag) or by exhausting retries.
        if (flag or try_num == 3):
            break
def test_Adview():
    """Download the Adview detail CSV (day before yesterday), keep only
    the matching date's rows via pandas, and save the result as Excel.

    Requires a human to type a captcha within 15 seconds (Windows-only
    MessageBox prompt via ctypes).  Retries up to 5 times.
    """
    print("............Adview................")
    flag = False
    for try_num in range(5):
        print("Adview第" + str(try_num + 1) + "次尝试-----------")
        try:
            # Adview reports for the day before yesterday.
            yesterday = common.getNowTime(-2)
            yyyy = yesterday[0:4]
            mm = yesterday[5:7]
            dd = yesterday[8:10]
            nowTime = common.getNowTime()
            yearNow = nowTime[0:4]
            monthNow = nowTime[5:7]
            dayNow = nowTime[8:10]
            # match_Str is the date whose rows are kept; pandas formats
            # dates as yyyy-mm-dd.
            match_Str = yyyy + "-" + mm + "-" + dd
            # Filename prefix of the CSV that Adview downloads.
            prefix = "AdView_" + yearNow + monthNow + dayNow
            # The directory path uses only the date portion of nowTime.
            path = common.getDirPath(nowTime)
            excelName = common.getExcelName(nowTime, "Adview")
            chrome_options = webdriver.ChromeOptions()
            # Headless mode disabled: the captcha must be typed by hand.
            # chrome_options.add_argument('--headless')
            # chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument(
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36')
            # Redirect Chrome's default download directory to `path`.
            prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory': path}
            chrome_options.add_experimental_option('prefs', prefs)
            # Start the browser used to fetch the pages.
            browser = webdriver.Chrome(chrome_options=chrome_options)
            loginURL = "http://www.adview.cn/web/overseas/login"
            dataURL = "http://www.adview.cn/user/bid/income"
            browser.get(loginURL)
            time.sleep(5)
            print("输入用户名密码---------------------------------")
            browser.find_element_by_xpath('//span[text()="Publishers"]').click()
            username,password = common_mysql.selectFromTb("Adview")
            browser.find_element_by_id("email").send_keys(username)
            browser.find_element_by_id("pwd").send_keys(password)
            # Windows-only: pop a MessageBox so the operator can type the
            # captcha; gb2312 encoding for the Chinese prompt text.
            ctypes.windll.user32.MessageBoxA(0, u"点击确定后,请在15秒内输入验证码,不要点击登录!!!".encode('gb2312'), u' 信息'.encode('gb2312'), 0)
            print("请输入验证码,等待 15秒")
            for i in range(1,15):
                print("倒计时:",15-i)
                time.sleep(1)
            print("点击登录按钮------")
            browser.find_element_by_xpath('//button[@class="form-control btn btn-blue blue submitBtn"]').click()
            time.sleep(5)
            print("跳转到数据页面---------------------------------")
            browser.get(dataURL)
            time.sleep(5)
            # Switch the page to English if the link is present.
            try:
                browser.find_element_by_xpath('//a[text()="English"]').click()
            except:
                flag = False
                print(traceback.format_exc())
                errorInfo = traceback.format_exc()
                comm_logging.myLogger.write_logger(errorInfo)
            time.sleep(2)
            print("点击下载")
            browser.find_element_by_xpath('//input[@value="Export detail CSV"]').click()
            time.sleep(2)
            # Scan the download directory for Adview's CSV files.
            print("正在扫描下载的 csv 文件---------")
            fileList = common.scan_File(path, prefix)
            common.mkdir(path)
            # Read the newest download, drop rows for other dates, and
            # re-save as Excel; then delete the raw CSVs.
            print("正在剔除其他日期的数据 并另存为 excel ---------")
            common.turnToXls_ByPandas(path + fileList[-1], path + excelName, 'Adview', match_Str)
            common.remove_File(path, prefix)
            print(" excel保存成功,路径:" + path + "-----------")
            print("||||||||||||Adview抓取完毕||||||||||||||")
            flag = True
        except:
            flag = False
            print(traceback.format_exc())
            errorInfo = traceback.format_exc()
            comm_logging.myLogger.write_logger(errorInfo)
            continue
        finally:
            browser.quit()
        # NOTE(review): try_num == 5 can never be true with range(5); the
        # loop only ends early on success (flag) or by exhausting retries.
        if (flag or try_num == 5):
            break
def test_Pubnative():
    """Download yesterday's Publisher App report CSV from the Pubnative
    dashboard and convert it to an Excel workbook via pandas.

    Retries up to 3 times; failures are logged via comm_logging and the
    browser is always quit.
    """
    print("............Pubnative................")
    flag = False
    for try_num in range(3):
        print("Pubnative第" + str(try_num + 1) + "次尝试-----------")
        try:
            yesterday = common.getNowTime(-1)
            yyyy = yesterday[0:4]
            mm = yesterday[5:7]
            dd = yesterday[8:10]
            # Name of the CSV the dashboard downloads (dd.mm.yyyy range).
            downloadFile = "Publisher App_" + dd + "." + mm + "." + yyyy + "-" + dd + "." + mm + "." + yyyy + ".csv"
            # Match string used when deleting the CSV afterwards.
            prefix = "Publisher App_" + dd + "." + mm + "." + yyyy + "-" + dd + "." + mm + "." + yyyy
            nowTime = common.getNowTime()
            # The directory path uses only the date portion of nowTime.
            path = common.getDirPath(nowTime)
            excelName = common.getExcelName(nowTime, "Pubnative")
            chrome_options = webdriver.ChromeOptions()
            # Headless mode is deliberately disabled for this scraper.
            # chrome_options.add_argument('--headless')
            # chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument(
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            )
            # Redirect Chrome's default download directory to `path`.
            prefs = {
                'profile.default_content_settings.popups': 0,
                'download.default_directory': path
            }
            chrome_options.add_experimental_option('prefs', prefs)
            # Start the browser used to fetch the pages.
            browser = webdriver.Chrome(chrome_options=chrome_options)
            loginURL = "https://dashboard.pubnative.net"
            dataURL = "https://dashboard.pubnative.net/partner/#!/api"
            browser.get(loginURL)
            time.sleep(5)
            print("输入用户名密码---------------------------------")
            from public import common_mysql
            username, password = common_mysql.selectFromTb("Pubnative")
            browser.find_element_by_id("email").send_keys(username)
            browser.find_element_by_id("password").send_keys(password)
            time.sleep(2)
            browser.find_element_by_xpath('//input[@value="LOGIN"]').click()
            print("跳转到数据页面---------------------------------")
            browser.get(dataURL)
            time.sleep(15)
            print("自动选择数据维度内容---------------------------------")
            browser.find_element_by_xpath('//div[@class="col-md-7"]').click()
            # Pick "yesterday" from the date-range list.
            browser.find_element_by_xpath(
                '//div[@class="daterange"]/ul/li[2]').click()
            s = browser.find_element_by_tag_name("select")
            Select(s).select_by_index(12)
            browser.find_element_by_xpath(
                '//button[@class="btn btn-secondary ng-star-inserted"]').click(
                )
            time.sleep(2)
            browser.find_element_by_xpath(
                '//div[@class="reports-filters row"]/div[3]').click()
            print("准备下载...")
            time.sleep(2)
            # NOTE(review): currently unused — it documented the expected
            # report columns for a removed page-scraping code path.
            dimensionsOfdata = [
                "Publisher App", "Impressions", "Requests", "Fill Rate",
                "eCPM", "Clicks", "CTR", "Conversions", "Payout"
            ]
            print("点击下载")
            browser.find_element_by_xpath(
                '//div[@class="col-md order-disabled optional-buttons"]/a[2]'
            ).click()
            time.sleep(2)
            common.mkdir(path)
            # Convert the downloaded CSV to Excel, then remove the CSV.
            print("读取 csv 另存为 excel")
            common.turnToXls_ByPandas(path + downloadFile, path + excelName,
                                      'Pubnative')
            print("删除多余的 csv")
            common.remove_File(path, prefix)
            print(" excel保存成功,路径:" + path + "-----------")
            print("||||||||||||Pubnative抓取完毕||||||||||||||")
            flag = True
        except:
            flag = False
            print(traceback.format_exc())
            errorInfo = traceback.format_exc()
            comm_logging.myLogger.write_logger(errorInfo)
            continue
        finally:
            browser.quit()
        # NOTE(review): try_num == 3 can never be true with range(3); the
        # loop only ends early on success (flag) or by exhausting retries.
        if (flag or try_num == 3):
            break
def test_OpenX():
    """Download the OpenX pie report (day before yesterday) as xlsx from
    the mobimagic dashboard and re-save it via pandas.

    Login first tries the captcha-free path; if that fails, it falls
    back to a manual-captcha flow (Windows MessageBox, 15s countdown).
    Retries up to 2 times.
    """
    print("............OpenX................")
    nowTime = common.getNowTime()
    yearNow = nowTime[0:4]
    monthNow = nowTime[5:7]
    dayNow = nowTime[8:10]
    # The directory path uses only the date portion of nowTime.
    path = common.getDirPath(nowTime)
    # Match string used to find/delete the downloaded workspace file.
    prefix = "Workspace 1-" + monthNow + "-" + dayNow + "-" + yearNow
    excelName = common.getExcelName(nowTime, "openx")
    flag = False
    for try_num in range(2):
        print("OpenX第" + str(try_num + 1) + "次尝试-----------")
        try:
            print("打开浏览器")
            chrome_options = webdriver.ChromeOptions()
            # Headless mode disabled: a captcha may need manual input.
            # chrome_options.add_argument('--headless')
            # chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument(
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            )
            # Redirect Chrome's default download directory to `path`.
            prefs = {
                'profile.default_content_settings.popups': 0,
                'download.default_directory': path
            }
            chrome_options.add_experimental_option('prefs', prefs)
            # Start the browser used to fetch the pages.
            browser = webdriver.Chrome(chrome_options=chrome_options)
            # Must maximize — some buttons are unreachable otherwise and
            # clicking them raises.
            browser.maximize_window()
            time.sleep(2)
            logInURL = "https://sso.openx.com/login/login"
            dataURL = "http://mobimagic-ui.openx.net/app.html#/reports/pie"
            # Report date: the day before yesterday.
            yesterday = common.getNowTime(-2)
            yyyy = yesterday[0:4]
            mm = yesterday[5:7]
            dd = yesterday[8:10]
            print("打开登录网址")
            browser.get(logInURL)
            time.sleep(5)
            username, password = common_mysql.selectFromTb("OpenX")
            # First attempt the normal (no-captcha) login; if any step
            # fails we land in the except branch and log in with a
            # manually-entered captcha instead.
            try:
                browser.find_element_by_id("email").send_keys(username)
                time.sleep(3)
                browser.find_element_by_id('password').send_keys(password)
                time.sleep(2)
                browser.find_element_by_id("submit").click()
                time.sleep(5)
                browser.find_element_by_xpath(
                    '//a[text()="http:// mobimagic-ui.openx.net/"]').click()
            except:
                print(traceback.format_exc())
                errorInfo = traceback.format_exc()
                comm_logging.myLogger.write_logger(errorInfo)
                # Do NOT reload the page here: after a reload the captcha
                # disappears and the login keeps failing.
                time.sleep(2)
                browser.find_element_by_id("email").clear()
                browser.find_element_by_id("email").send_keys(username)
                time.sleep(3)
                browser.find_element_by_id('password').clear()
                browser.find_element_by_id('password').send_keys(password)
                # Windows-only MessageBox prompting the operator to type
                # the captcha within 15 seconds (gb2312-encoded text).
                ctypes.windll.user32.MessageBoxA(
                    0,
                    u"点击确定后,请在15秒内输入验证码,不要点击登录!!!".encode('gb2312'),
                    u' 信息'.encode('gb2312'), 0)
                print("请输入验证码,等待 15秒")
                for i in range(1, 15):
                    print("倒计时:", 15 - i)
                    time.sleep(1)
                browser.find_element_by_id("submit").click()
            print("模拟登录成功")
            print("打开数据网址")
            browser.get(dataURL)
            time.sleep(10)
            # Open the custom date-range picker.
            browser.find_element_by_xpath(
                '//div[@class="date-range-filter__input"]').click()
            browser.find_element_by_xpath(
                '//li[@class="date-range-filter__filter date-range-filter__filter--custom date-range-filter__filter__label"]'
            ).click()
            time.sleep(5)
            # Direct send_keys on the single date input did not work; the
            # css selector returns both inputs: [0] = start, [1] = end.
            print("开始选择日期和维度")
            inputElements = browser.find_elements_by_css_selector(
                '[placeholder="Enter Date"]')
            inputElements[0].clear()
            inputElements[0].send_keys(mm + "/" + dd + "/" + yyyy)
            inputElements[1].clear()
            inputElements[1].send_keys(mm + "/" + dd + "/" + yyyy)
            browser.find_element_by_xpath(
                '//div[@class="date-range-filter__custom-date-range-menu__buttons"]/div/button[2]'
            ).click()
            time.sleep(5)
            # Collapse the date bar and the line chart; the download
            # button cannot be clicked while they are expanded.
            browser.find_element_by_xpath(
                '//div[@class="reports-pie-data-collection-container__collapse-trigger"]'
            ).click()
            time.sleep(1)
            browser.find_element_by_xpath(
                '//div[@class="reports-pie-collapse-toggle"]').click()
            time.sleep(1)
            # Click download and pick the xlsx export.
            browser.find_element_by_xpath(
                '//div[@class="reports-pie-chart-toolkit__export"]/ox-dropdown'
            ).click()
            time.sleep(1)
            browser.find_element_by_xpath('//span[text()="Excel "]').click()
            time.sleep(10)
            common.mkdir(path)
            # Scan the download directory for the exported xlsx file.
            print("正在扫描下载的 xlsx 文件---------")
            fileList = common.scan_File(path, prefix)
            common.turnToXls_ByPandas(path + fileList[-1], path + excelName,
                                      'openx')
            common.remove_File(path, prefix)
            print(" excel保存成功,路径:" + path + "-----------")
            print("||||||||||||OpenX抓取完毕||||||||||||||")
            flag = True
        except Exception as e:
            flag = False
            print(traceback.format_exc())
            errorInfo = traceback.format_exc()
            comm_logging.myLogger.write_logger(errorInfo)
            continue
        finally:
            browser.quit()
        # NOTE(review): try_num == 4 can never be true with range(2); the
        # loop only ends early on success (flag) or by exhausting retries.
        if (flag or try_num == 4):
            break
def test_cm():
    """Scrape yesterday's per-placement report from console.cmcm.com into xls.

    Logs in with credentials from the "cm" row of the credential table, pages
    through the placement report table, and saves one workbook under the dated
    output directory.  Retries up to 5 times; each failed attempt is logged
    and the next one starts with a fresh browser.
    """
    print("............cm................")
    flag = False  # True once a run completed and the workbook was saved
    for try_num in range(5):
        print("cmcm第" + str(try_num + 1) + "次尝试-----------")
        browser = None  # so the finally clause is safe if Chrome fails to start
        try:
            print("打开浏览器")
            chrome_options = webdriver.ChromeOptions()
            # Headless mode: no visible browser window needed for this site.
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--disable-gpu')
            # BUG FIX: the bare UA string was not a valid Chrome switch and
            # was silently ignored; it must be passed via --user-agent=.
            chrome_options.add_argument(
                '--user-agent=Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            )
            browser = webdriver.Chrome(chrome_options=chrome_options)
            logInURL = "http://console.cmcm.com"
            # Before ~4 a.m. yesterday's data may not be ready yet, so fall
            # back one more day in that window.
            d = datetime.now() + timedelta(days=-1)
            d1 = d + timedelta(days=-1)
            if int(d.strftime('%Y-%m-%d %H:%M:%S')[11:13]) <= 3:
                str_d1 = d1.strftime('%Y-%m-%d %H:%M:%S')
            else:
                str_d1 = d.strftime('%Y-%m-%d %H:%M:%S')
            yyyy1 = str_d1[0:4]
            mmmm1 = str_d1[5:7]
            dddd1 = str_d1[8:10]
            reportDate = yyyy1 + "-" + mmmm1 + "-" + dddd1
            dataURL = ("http://console.cmcm.com/report/placement?by=day&from="
                       + reportDate + "&to=" + reportDate)
            print("打开登录网址")
            browser.get(logInURL)
            time.sleep(2)
            print("模拟输入用户名密码中")
            username, password = common_mysql.selectFromTb("cm")
            browser.find_element_by_name('email').send_keys(username)
            browser.find_element_by_name('password').send_keys(password)
            browser.find_element_by_xpath(
                '//*[@id="login-form"]/div[3]/div/button').click()
            print("模拟登录成功")
            print("打开数据网址")
            browser.get(dataURL)
            time.sleep(20)  # report page renders slowly; wait for the table
            # Header labels; list index == report column position.
            dimensionsOfdata = [
                "datekey", "item_placement", "backfill", "wins", "impressions",
                "requests", "fillrate", "winrate", "clicks", "ctr", "ecpm",
                "money"
            ]
            workbook = xlwt.Workbook(encoding='utf-8')
            worksheet = workbook.add_sheet('cmcm')
            # Number of pager entries == number of result pages.
            pageNum = len(browser.find_elements_by_xpath('//li[@data-page]'))
            # Without this guard an empty report would yield an empty sheet.
            if pageNum >= 1:
                row = 1  # row 0 is reserved for the header
                for page in range(pageNum):
                    print("-------读取第" + str(page + 1) + "页----------")
                    browser.find_element_by_xpath('//li[@data-page=' +
                                                  str(page + 1) + ']').click()
                    soup = BeautifulSoup(browser.page_source, "lxml")
                    tbody = soup.find(
                        "table",
                        {"class": "bordered highlighted scrolling-table"}
                    ).find("tbody")
                    for tr in tbody.findAll("tr"):
                        for col, td in enumerate(tr.findAll("td")):
                            if col == 0 or col == 1:
                                # date key and placement name stay as text
                                worksheet.write(row, col, td.text)
                            else:
                                # numeric metrics: drop thousands separators
                                worksheet.write(
                                    row, col, float(td.text.replace(",", "")))
                        row = row + 1
                    time.sleep(2)
            else:
                continue  # no data yet -> count this as a failed attempt
            # Fill in the header row.
            for col, header in enumerate(dimensionsOfdata):
                worksheet.write(0, col, header)
            nowTime = common.getNowTime()
            path = common.getDirPath(nowTime)  # directory keyed by date part
            excelName = common.getExcelName(nowTime, "cmcm")
            common.mkdir(path)
            workbook.save(path + excelName)
            print(" excel保存成功,路径:" + path + "-----------")
            print("||||||||||||cmcm抓取完毕||||||||||||||")
            flag = True
        except Exception:
            flag = False
            errorInfo = traceback.format_exc()
            comm_logging.myLogger.write_logger(errorInfo)
            continue
        finally:
            # BUG FIX: browser may be None when webdriver.Chrome() itself
            # failed; the old unconditional quit() raised NameError here.
            if browser is not None:
                browser.quit()
        # Break on success; range(5) ends the loop by itself otherwise.
        if flag:
            break
def test_Smaato():
    """Scrape yesterday's per-adspace report from the Smaato publisher portal.

    Logs in with credentials from the "Smaato" credential row, switches the
    report to Yesterday / Adspace granularity, parses the summary table and
    saves it as an xls workbook under the dated output directory.  Retries
    up to 5 times on any failure.
    """
    print("............Smaato................")
    flag = False  # True once the workbook has been saved successfully
    for try_num in range(5):
        print("Smaato第" + str(try_num + 1) + "次尝试-----------")
        browser = None  # so the finally clause can't hit an unbound name
        try:
            chrome_options = webdriver.ChromeOptions()
            # Headless mode: no visible window needed for this site.
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--disable-gpu')
            # BUG FIX: a bare UA string is not a valid Chrome switch and was
            # ignored; it has to be passed via --user-agent=.
            chrome_options.add_argument(
                '--user-agent=Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            )
            browser = webdriver.Chrome(chrome_options=chrome_options)
            loginURL = "https://spx.smaato.com/publisherportal/pages/login.xhtml"
            username, password = common_mysql.selectFromTb("Smaato")
            dataURL = "https://spx.smaato.com/publisherportal/pages/reporting/reporting.xhtml"
            print("进入登陆页---------------------------------")
            browser.get(loginURL)
            print(" 输入用户名密码-------------------------")
            browser.find_element_by_id("j_username").send_keys(username)
            browser.find_element_by_id("j_password").send_keys(password)
            browser.find_element_by_xpath("//button[@type='submit']").click()
            time.sleep(5)
            print("跳转到数据页面---------------------------------")
            browser.get(dataURL)
            time.sleep(10)
            print(" 点击日历表------------------")
            browser.find_element_by_xpath(
                "//span[@id='reporting:popup']").click()
            print(" 选择yesterday----------------")
            browser.find_element_by_xpath(
                "//div[@class='drp_shortcuts-block1']/span[2]").click()
            print(" 点击update----------------")
            browser.find_element_by_xpath("//input[@class='apply-btn']").click()
            time.sleep(2)
            print(" 点击display by:----------------")
            browser.find_element_by_xpath(
                "//label[@id='reporting:displayByMenu_label']").click()
            print(" 点击Adspace----------------")
            # data-label carries the option's value on the dropdown <li>
            browser.find_element_by_xpath("//li[@data-label='Adspace']").click()
            time.sleep(5)  # the table re-renders after changing the dimension
            # Header labels; list index == report column position.
            dimensionsOfdata = [
                "Adspace", "Adspace ID", "Net Revenue", "Gross Revenue",
                "Ad Requests", "Served Ads", "Fillrate", "Impressions",
                "Viewrate", "Net eCPM", "Gross eCPM", "Clicks"
            ]
            workbook = xlwt.Workbook(encoding='utf-8')
            worksheet = workbook.add_sheet('Smaato')
            soup = BeautifulSoup(browser.page_source, "lxml")
            print("开始爬数据------------------")
            table = soup.find(
                "tbody", {"id": "reporting:reportingSummaryTable_data"})
            trSum = table.findAll("tr")
            time.sleep(2)
            row = 1  # row 0 is reserved for the header
            for tr in trSum:
                for col, td in enumerate(tr.findAll("td")):
                    try:
                        if col == 2 or col == 3 or col == 9 or col == 10:
                            # currency columns: "$1,234.56" -> 1234.56
                            worksheet.write(
                                row, col,
                                float(td.text.replace("$", "").replace(",", "")))
                        elif col == 4 or col == 5 or col == 7 or col == 11:
                            # plain counts with thousands separators
                            worksheet.write(
                                row, col, float(td.text.replace(",", "")))
                        elif col == 6 or col == 8:
                            # percentage columns: "12.3%" -> 12.3
                            worksheet.write(
                                row, col, float(td.text.replace("%", "")))
                        elif col == 1:
                            # Adspace ID is numeric
                            worksheet.write(row, col, float(td.text))
                        else:
                            worksheet.write(row, col, td.text)
                    except Exception:
                        # A malformed cell shouldn't abort the whole report.
                        print(traceback.format_exc())
                row = row + 1
            # Fill in the header row.
            for col, header in enumerate(dimensionsOfdata):
                worksheet.write(0, col, header)
            nowTime = common.getNowTime()
            path = common.getDirPath(nowTime)  # directory keyed by date part
            excelName = common.getExcelName(nowTime, "Smaato")
            common.mkdir(path)
            workbook.save(path + excelName)
            print(" excel保存成功,路径:" + path + "-----------")
            print("||||||||||||Smaato抓取完毕||||||||||||||")
            flag = True
        except Exception:
            flag = False
            print(traceback.format_exc())
            errorInfo = traceback.format_exc()
            comm_logging.myLogger.write_logger(errorInfo)
            continue
        finally:
            # BUG FIX: guard against webdriver.Chrome() having failed, which
            # previously raised NameError on the unconditional quit() here.
            if browser is not None:
                browser.quit()
        # Break on success; range(5) ends the loop by itself otherwise.
        if flag:
            break
def test_NewCM():
    """Download yesterday's Ad Unit report from peg.cmcm.com as an excel file.

    Login requires a captcha, so a visible (non-headless) browser is started
    and the operator is prompted via a message box to type the captcha.  The
    exported csv is converted to xls by common.turnToXls_ByPandas.  Retries
    up to 5 times on any failure.
    """
    print("............NewCM................")
    flag = False  # set True once the excel file has been produced
    for try_num in range(5):
        print("NewCM第" + str(try_num + 1) + "次尝试-----------")
        browser = None  # keeps the finally clause safe if Chrome fails to start
        try:
            nowTime = common.getNowTime()
            yearNow = nowTime[0:4]
            monthNow = nowTime[5:7]
            dayNow = nowTime[8:10]
            # Prefix of the file the site downloads (named after today's date).
            prefix = "Brand+Details_" + yearNow + monthNow + dayNow
            path = common.getDirPath(nowTime)  # dated output directory
            excelName = common.getExcelName(nowTime, "NewCM")
            chrome_options = webdriver.ChromeOptions()
            # Headless stays disabled on purpose: the operator must see the
            # window to read and type the captcha.
            # chrome_options.add_argument('--headless')
            # chrome_options.add_argument('--disable-gpu')
            # BUG FIX: the bare UA string was an invalid switch and was
            # silently ignored; Chrome expects --user-agent=.
            chrome_options.add_argument(
                '--user-agent=Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            )
            # Route downloads into the dated output directory.
            prefs = {
                'profile.default_content_settings.popups': 0,
                'download.default_directory': path
            }
            chrome_options.add_experimental_option('prefs', prefs)
            browser = webdriver.Chrome(chrome_options=chrome_options)
            loginURL = "https://peg.cmcm.com/login"
            dataURL = "https://peg.cmcm.com/frontReport/default"
            browser.get(loginURL)
            time.sleep(5)
            print("输入用户名密码---------------------------------")
            username, password = common_mysql.selectFromTb("NewCM")
            browser.find_element_by_name("username").send_keys(username)
            browser.find_element_by_name("password").send_keys(password)
            # Blocking message box: the 15-second countdown starts after OK.
            ctypes.windll.user32.MessageBoxA(
                0, u"点击确定后,请在15秒内输入验证码,不要点击登录!!!".encode('gb2312'),
                u' 信息'.encode('gb2312'), 0)
            print("请输入验证码,等待 15秒")
            # BUG FIX: range(1, 15) only waited 14 seconds although the
            # prompt promises 15.
            for remaining in range(15, 0, -1):
                print("倒计时:", remaining)
                time.sleep(1)
            print("点击登录按钮------")
            browser.find_element_by_xpath('//button[@type="submit"]').click()
            time.sleep(5)
            print("跳转到数据页面---------------------------------")
            browser.get(dataURL)
            time.sleep(5)
            # Switch the report dimension to Ad Unit.
            print("选择 AD unit")
            browser.find_element_by_xpath(
                '//button[text()="Dimension & Metric"]').click()
            browser.find_element_by_xpath('//span[text()="Ad Unit"]').click()
            time.sleep(2)
            browser.find_element_by_xpath('//button[text()="Submit"]').click()
            time.sleep(2)
            # Restrict the date range to yesterday.
            print("选择日期")
            browser.find_element_by_xpath(
                '//button[@class="btn btn-default ng-binding"]').click()
            browser.find_element_by_xpath('//span[text()="Yesterday"]').click()
            time.sleep(2)
            browser.find_element_by_xpath('//button[text()="Submit"]').click()
            time.sleep(2)
            print("# 点击search")
            browser.find_element_by_xpath('//input[@value="Search"]').click()
            time.sleep(5)
            print("点击下载")
            browser.find_element_by_xpath(
                '//a[@class="btn btn-outline"]').click()
            time.sleep(5)
            print("正在扫描下载的 csv 文件---------")
            fileList = common.scan_File(path, prefix)
            common.mkdir(path)
            print("正在剔除其他日期的数据 并另存为 excel ---------")
            # Most recent download is the last entry of the scan result.
            common.turnToXls_ByPandas(path + fileList[-1], path + excelName,
                                      'NewCM')
            common.remove_File(path, prefix)  # clean up the raw download
            print(" excel保存成功,路径:" + path + "-----------")
            print("||||||||||||NewCM抓取完毕||||||||||||||")
            flag = True
        except Exception:  # BUG FIX: was a bare except (caught SystemExit too)
            flag = False
            print(traceback.format_exc())
            errorInfo = traceback.format_exc()
            comm_logging.myLogger.write_logger(errorInfo)
            continue
        finally:
            # BUG FIX: browser is None when Chrome start-up failed; the old
            # unconditional quit() raised NameError here.
            if browser is not None:
                browser.quit()
        # BUG FIX: the old exit test compared try_num against 5, which
        # range(5) never yields; breaking on success is all that's needed.
        if flag:
            break
def test_Tappx():
    """Scrape yesterday's day/app/format report from the Tappx admin console.

    Builds the report by clicking through the dimension selectors on the
    Monetize page, parses the raw data table and saves it as an xls workbook
    under the dated output directory.  Retries up to 3 times on any failure.
    """
    print("............Tappx................")
    flag = False  # True once the workbook has been saved
    for try_num in range(3):
        print("Tappx第" + str(try_num + 1) + "次尝试-----------")
        browser = None  # keeps the finally clause safe if Chrome fails to start
        try:
            chrome_options = webdriver.ChromeOptions()
            # Headless mode: no visible window needed for this site.
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--disable-gpu')
            # BUG FIX: pass the UA through --user-agent=; the bare string was
            # an invalid switch and had no effect.
            chrome_options.add_argument(
                '--user-agent=Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            )
            browser = webdriver.Chrome(chrome_options=chrome_options)
            loginURL = "https://www.tappx.com/en/admin/login/"
            dataURL = "https://www.tappx.com/en/admin/monetize/"
            browser.get(loginURL)
            print("输入用户名密码---------------------------------")
            username, password = common_mysql.selectFromTb("Tappx")
            browser.find_element_by_id("username").send_keys(username)
            browser.find_element_by_id("password").send_keys(password)
            browser.find_element_by_xpath("//button[@type='submit']").click()
            print("跳转到数据页面---------------------------------")
            browser.get(dataURL)
            print("自动选择数据维度内容---------------------------------")
            time.sleep(5)
            print("选择货币形式为美元---------------------------------")
            browser.find_element_by_xpath(
                "//div[@id='currency-selector']/a[2]").click()
            time.sleep(1)
            print("取消APP维度---------------------------------")
            browser.find_element_by_xpath(
                "//i[@class='tappxicon tappxicon-close']").click()
            time.sleep(1)
            print("展开维度---------------------------------")
            browser.find_element_by_xpath(
                "//button[@class='btn-block btn btn-default dropdown-toggle']"
            ).click()
            time.sleep(1)
            print("点击显示下拉维度---------------------------------")
            browser.find_element_by_xpath(
                "//a[@class='dropdown-opener']").click()
            time.sleep(1)
            print("点击day维度---------------------------------")
            browser.find_element_by_xpath(
                "//li[@class='model-g_time_day no-image']").click()
            time.sleep(1)
            print("展开维度---------------------------------")
            browser.find_element_by_xpath(
                "//button[@class='btn-block btn btn-default dropdown-toggle']"
            ).click()
            time.sleep(1)
            print("点击app维度---------------------------------")
            browser.find_element_by_xpath(
                "//ul[@class='list-unstyled']/li[5]").click()
            time.sleep(1)
            print("展开维度---------------------------------")
            browser.find_element_by_xpath(
                "//button[@class='btn-block btn btn-default dropdown-toggle']"
            ).click()
            time.sleep(1)
            print("点击format维度---------------------------------")
            browser.find_element_by_xpath(
                "//ul[@class='list-unstyled']/li[2]").click()
            time.sleep(1)
            print("展开维度---------------------------------")
            browser.find_element_by_xpath(
                "//button[@class='btn btn-default btn-plus dropdown-toggle']"
            ).click()
            time.sleep(1)
            print("点击时间下拉维度---------------------------------")
            browser.find_element_by_xpath(
                "//ul[@id='default-options']/li[4]").click()
            time.sleep(1)
            print("点击昨天维度---------------------------------")
            browser.find_element_by_xpath(
                "//ul[@id='default-options']/li[4]/ul/li[2]").click()
            time.sleep(3)
            # Header labels; list index == raw-table column position.
            dimensionsOfdata = [
                "Date", "App", "Format", "Requests", "Deliveries",
                "Impressions", "Clicks", "CPM", "CTR", "Fill Rate",
                "Render Rate", "Benefits"
            ]
            workbook = xlwt.Workbook(encoding='utf-8')
            worksheet = workbook.add_sheet('Tappx')
            soup = BeautifulSoup(browser.page_source, "lxml")
            print("开始爬数据")
            tbody = soup.find("tbody", {"id": "data-raw-table"})
            row = 1  # row 0 is reserved for the header
            for tr in tbody.findAll("tr"):
                for col, td in enumerate(tr.findAll("td")):
                    try:
                        if col >= 3:
                            # metric columns: strip separators and units
                            worksheet.write(
                                row, col,
                                float(td.text.replace(",", "").replace(
                                    "%", "").replace("$", "")))
                        elif col == 1:
                            # app name: collapse the whitespace padding
                            worksheet.write(row, col,
                                            td.text.replace("\n", "").strip())
                        else:
                            worksheet.write(row, col, float(td.text))
                    except ValueError:
                        # non-numeric cells (e.g. the Date / Format columns)
                        # are stored verbatim
                        worksheet.write(row, col, td.text)
                row = row + 1
            # Fill in the header row.
            for col, header in enumerate(dimensionsOfdata):
                worksheet.write(0, col, header)
            nowTime = common.getNowTime()
            path = common.getDirPath(nowTime)  # directory keyed by date part
            excelName = common.getExcelName(nowTime, "Tappx")
            common.mkdir(path)
            workbook.save(path + excelName)
            print(" excel保存成功,路径:" + path + "-----------")
            # BUG FIX: browser.quit() used to be called here AND in finally,
            # quitting the driver twice on success; finally alone suffices.
            print("||||||||||||Tappx抓取完毕||||||||||||||")
            flag = True
        except Exception:  # BUG FIX: was a bare except (caught SystemExit too)
            flag = False
            print(traceback.format_exc())
            errorInfo = traceback.format_exc()
            comm_logging.myLogger.write_logger(errorInfo)
            continue
        finally:
            # BUG FIX: browser is None when Chrome start-up failed; the old
            # unconditional quit() raised NameError here.
            if browser is not None:
                browser.quit()
        # BUG FIX: range(3) never yields try_num == 3; break on success only.
        if flag:
            break
def test_Solo():
    """Download the day-before-yesterday slot report from portal.newborntown.com.

    Login requires a captcha, so the browser stays visible and the operator
    is prompted via a message box to type it.  The exported csv is converted
    to xls by common.turnToXls_ByPandas.  Retries up to 3 times on failure.
    """
    print("............Solo................")
    flag = False  # True once the excel file has been produced
    for try_num in range(3):
        print("Solo第" + str(try_num + 1) + "次尝试-----------")
        browser = None  # keeps the finally clause safe if Chrome fails to start
        try:
            # The portal's data lags, so the report targets two days ago.
            dayBeforeYester = common.getNowTime(-2)
            yyyy = dayBeforeYester[0:4]
            mm = dayBeforeYester[5:7]
            dd = dayBeforeYester[8:10]
            nowTime = common.getNowTime()
            prefix = "ReportTable"  # filename prefix of the site's export
            path = common.getDirPath(nowTime)  # dated output directory
            excelName = common.getExcelName(nowTime, "Solo")
            chrome_options = webdriver.ChromeOptions()
            # Headless stays disabled on purpose: the captcha must be visible.
            # chrome_options.add_argument('--headless')
            # chrome_options.add_argument('--disable-gpu')
            # BUG FIX: the UA must be passed as --user-agent=; the bare
            # string was an invalid switch and had no effect.
            chrome_options.add_argument(
                '--user-agent=Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            )
            # Route downloads into the dated output directory.
            prefs = {
                'profile.default_content_settings.popups': 0,
                'download.default_directory': path
            }
            chrome_options.add_experimental_option('prefs', prefs)
            browser = webdriver.Chrome(chrome_options=chrome_options)
            loginURL = "https://portal.newborntown.com/logout"
            dataURL = "https://portal.newborntown.com/report"
            browser.get(loginURL)
            time.sleep(5)
            print("输入用户名密码---------------------------------")
            username, password = common_mysql.selectFromTb("Solo")
            browser.find_element_by_name("email").send_keys(username)
            browser.find_element_by_name("password").send_keys(password)
            # Blocking message box: the 15-second countdown starts after OK.
            ctypes.windll.user32.MessageBoxA(
                0, u"点击确定后,请在15秒内输入验证码,不要点击登录!!!".encode('gb2312'),
                u' 信息'.encode('gb2312'), 0)
            print("请输入验证码,等待 15秒")
            # BUG FIX: range(1, 15) only waited 14 of the promised 15 seconds.
            for remaining in range(15, 0, -1):
                print("倒计时:", remaining)
                time.sleep(1)
            print("点击登录按钮------")
            browser.find_element_by_xpath('//button[@id="btn_login"]').click()
            time.sleep(5)
            print("跳转到数据页面---------------------------------")
            browser.get(dataURL)
            time.sleep(5)
            print("选择stats页")
            browser.find_element_by_xpath('//h4[text()="Stats"]').click()
            time.sleep(5)
            # Pick a custom range covering exactly the target day.
            print("开始选择日期")
            browser.find_element_by_id("reportrange").click()
            time.sleep(2)
            browser.find_element_by_xpath(
                '//li[text()="Custom Range"]').click()
            time.sleep(2)
            browser.find_element_by_name("daterangepicker_start").clear()
            browser.find_element_by_name("daterangepicker_start").send_keys(
                mm + "/" + dd + "/" + yyyy)
            time.sleep(2)
            browser.find_element_by_name("daterangepicker_end").clear()
            browser.find_element_by_name("daterangepicker_end").send_keys(
                mm + "/" + dd + "/" + yyyy)
            time.sleep(2)
            browser.find_element_by_xpath('//button[text()="Apply"]').click()
            time.sleep(5)
            print("点击选择 slot 页")
            browser.find_element_by_id("slot").click()
            time.sleep(5)
            print("点击下载")
            browser.find_element_by_id("export").click()
            time.sleep(2)
            print("正在扫描下载的 csv 文件---------")
            fileList = common.scan_File(path, prefix)
            common.mkdir(path)
            print("正在剔除其他日期的数据 并另存为 excel ---------")
            # NOTE(review): this uses fileList[0] while the other scrapers
            # use fileList[-1]; confirm scan_File's ordering if downloads
            # ever accumulate in the directory.
            common.turnToXls_ByPandas(path + fileList[0], path + excelName,
                                      'Solo')
            common.remove_File(path, prefix)  # clean up the raw download
            print(" excel保存成功,路径:" + path + "-----------")
            print("||||||||||||Solo抓取完毕||||||||||||||")
            flag = True
        except Exception:  # BUG FIX: was a bare except (caught SystemExit too)
            flag = False
            print(traceback.format_exc())
            errorInfo = traceback.format_exc()
            comm_logging.myLogger.write_logger(errorInfo)
            continue
        finally:
            # BUG FIX: browser is None when Chrome start-up failed; the old
            # unconditional quit() raised NameError here.
            if browser is not None:
                browser.quit()
        # BUG FIX: range(3) never yields try_num == 3; break on success only.
        if flag:
            break