def read_cookies():
    """Open headless Chrome on mobile Twitter with previously saved cookies.

    Cookies are read from ``qrsncookies.txt`` (a JSON list of cookie dicts),
    added to the browser session, and the home page is loaded.

    Returns:
        tuple: ``(driver, c_service)``. The caller owns both objects and is
        responsible for ``driver.quit()`` and ``c_service.stop()``.

    Raises:
        Any exception raised by Selenium, ``open`` or ``json.load``. The
        browser and the chromedriver service are shut down before the
        exception propagates so no process is left behind.
    """
    c_service = Service('/opt/google/chrome/chromedriver')
    c_service.command_line_args()
    c_service.start()

    driver = None
    try:
        option = webdriver.ChromeOptions()
        option.add_argument("--headless")
        option.add_argument("--no-sandbox")
        # NOTE(review): the driver is created without a reference to
        # c_service, so it presumably spawns its own chromedriver - confirm.
        driver = webdriver.Chrome(chrome_options=option)

        # Load the site first, then attach the saved cookies to the session.
        driver.get("https://mobile.twitter.com")
        with open("qrsncookies.txt", "r") as fp:
            cookies = json.load(fp)
        for cookie in cookies:
            # The stored 'expiry' field is stripped before the cookie is
            # handed to the driver.
            cookie.pop('expiry', None)
            driver.add_cookie(cookie)

        driver.get("https://mobile.twitter.com/home")
        # print() call form works on both Python 2 and Python 3.
        print(driver.title)
        time.sleep(20)
    except BaseException:
        # Do not leak the browser / service when start-up fails half way.
        if driver is not None:
            driver.quit()
        c_service.stop()
        raise
    return driver, c_service
def get_movie_url(url):
    """Collect the player address of every episode linked from a page.

    The page is fetched with ``ask_url`` and parsed with lxml to obtain the
    episode links and titles. Each episode page is then opened in headless
    Chrome to read the ``src`` attribute of the ``fed-play-iframe`` element.

    Args:
        url: Address of the page that lists the episodes.

    Returns:
        dict: Maps episode title to the iframe ``src`` of that episode.
        Empty when the page lists no episodes.
    """
    html = ask_url(url)
    doc = etree.HTML(html)

    # Anchors (href and text) of every episode in the visible play list.
    all_url = doc.xpath('//div[@class="fed-play-item fed-drop-item fed-visible"]//ul[@class="fed-part-rows"]/li/a/@href')
    all_title = doc.xpath('//div[@class="fed-play-item fed-drop-item fed-visible"]'
                          '//ul[@class="fed-part-rows"]/li/a/text()')

    data_dict = {}
    if not all_url:
        # Nothing to visit: avoid launching a browser at all.
        return data_dict

    # Selenium is needed because the iframe src is only read from the
    # rendered episode page.
    c_service = Service('/usr/bin/chromedriver')
    c_service.command_line_args()
    c_service.start()
    browser = None
    try:
        option = webdriver.ChromeOptions()
        option.add_argument('--headless')
        option.add_argument('--no-sandbox')
        option.add_argument('--disable-dev-shm-usage')
        browser = webdriver.Chrome('/usr/bin/chromedriver', options=option)

        # zip pairs titles with links without a manual counter and without
        # shadowing the ``url`` parameter.
        for title, href in zip(all_title, all_url):
            browser.get('https://kuyun.tv' + href)
            movie_url = browser.find_element_by_id('fed-play-iframe').get_attribute('src')
            data_dict[title] = movie_url
    finally:
        # Always release the browser and the driver service, even on errors.
        if browser is not None:
            browser.quit()
        c_service.stop()
    return data_dict
def punch(StudentId, Name):
    """Fill in and submit the check-in form on xsc.sicau.edu.cn.

    The verification code is read as plain text from the ``code-box``
    element and typed back into the ``codeInput`` field.

    Args:
        StudentId: Value typed into the ``StudentId`` field.
        Name: Value typed into the ``Name`` field.

    Raises:
        Any Selenium exception raised while driving the form. The browser and
        the chromedriver service are always shut down first.
    """
    c_service = Service('/Users/wq//Downloads/chromedriver')
    c_service.command_line_args()
    c_service.start()
    driver = None

    def fill(xpath, value):
        # Click the field, then look it up again before typing, exactly as
        # the original sequence of calls did.
        driver.find_element_by_xpath(xpath).click()
        driver.find_element_by_xpath(xpath).send_keys(value)

    try:
        driver = webdriver.Chrome('/Users/wq/Downloads/chromedriver')
        driver.get('http://xsc.sicau.edu.cn/SPCP')

        # Elements are located by XPath.
        text = driver.find_element_by_xpath('//*[@id="code-box"]').text
        fill('//*[@id="StudentId"]', StudentId)
        fill('//*[@id="Name"]', Name)
        fill('//*[@id="codeInput"]', text)
        driver.find_element_by_xpath('//*[@id="Submit"]').click()
        driver.find_element_by_xpath('//*[@id="platfrom2"]').click()
        try:
            driver.find_element_by_xpath('//*[@id="ckCLS"]').click()
            driver.find_element_by_xpath('//*[@id="SaveBtnDiv"]/button').click()
        except Exception:
            # Fallback when the form controls cannot be used: click the link
            # inside the layer dialog instead. ``except Exception`` (not a
            # bare except) lets Ctrl-C and SystemExit propagate.
            driver.find_element_by_xpath(
                '//*[@id="layui-layer1"]/div[3]/a').click()
    finally:
        if driver is not None:
            driver.quit()
        c_service.stop()
def get_driver():
    """Return the shared Chrome driver, creating it on first use.

    Configuration is read from ``webium.settings``:

    * ``chromedriverpath`` - path of the chromedriver executable.
    * optional booleans ``chrome_nosandbox``, ``chrome_disable_shmusage``,
      ``chrome_ignore_certificate_errors``, ``chrome_disable_gpu``,
      ``chrome_disable_plugins``, ``chrome_handless`` - each adds the
      matching Chrome command-line flag when present and truthy.
    * optional booleans ``service_load_images``, ``service_disk_cache``,
      ``service_ignore_ssl_errors`` - select the yes/no (true/false) form of
      the corresponding service argument.
    * ``implicit_timeout`` - implicit wait applied on every call.

    Returns:
        The module-level ``_driver_instance``.
    """
    global _driver_instance
    settings = webium.settings

    # Only build the driver (and start a chromedriver service) when no driver
    # exists yet; previously a new service process was started on every call
    # even though the cached driver was returned.
    if not _driver_instance:
        chromedriver_path = settings.chromedriverpath
        # NOTE(review): this service is started but never referenced again,
        # so it cannot be stopped later - confirm whether it is needed.
        c_service = Service(chromedriver_path)
        c_service.command_line_args()
        c_service.start()

        chrome_flags = (
            ('chrome_nosandbox', '--no-sandbox'),
            ('chrome_disable_shmusage', '--disable-dev-shm-usage'),
            ('chrome_ignore_certificate_errors', '--ignore-certificate-errors'),
            ('chrome_disable_gpu', '--disable-gpu'),
            ('chrome_disable_plugins', '--disable-plugins'),
            ('chrome_handless', '--headless'),
        )
        chrome_options = Options()
        for setting_name, flag in chrome_flags:
            # A missing setting counts as disabled.
            if getattr(settings, setting_name, False):
                chrome_options.add_argument(flag)

        service_switches = (
            ('service_load_images', '--load-images=yes', '--load-images=no'),
            ('service_disk_cache', '--disk-cache=yes', '--disk-cache=no'),
            ('service_ignore_ssl_errors', '--ignore-ssl-errors=true',
             '--ignore-ssl-errors=false'),
        )
        service_args = [
            enabled if getattr(settings, setting_name, False) else disabled
            for setting_name, enabled, disabled in service_switches
        ]

        _driver_instance = webdriver.Chrome(chrome_options=chrome_options,
                                            service_args=service_args,
                                            executable_path=chromedriver_path)

    _driver_instance.implicitly_wait(settings.implicit_timeout)
    return _driver_instance
def initializeDriver(browser_name='google', driver='/opt/google/chrome/chromedriver'):
    """Start a chromedriver service and a headless Chrome browser.

    Args:
        browser_name: Browser to start. Only ``'google'`` is supported.
        driver: Path of the chromedriver executable.

    Returns:
        tuple: ``(browser, c_service)``; the caller must call
        ``browser.quit()`` and ``c_service.stop()`` when done.

    Raises:
        ValueError: If ``browser_name`` is not ``'google'``. Previously this
            case failed with ``UnboundLocalError`` at the return statement.
    """
    print('initializeDriver: create a ' + browser_name + ' driver')
    if browser_name != 'google':
        raise ValueError(
            'initializeDriver: unsupported browser_name %r' % (browser_name,))

    c_service = Service(driver)
    c_service.command_line_args()
    c_service.start()
    try:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        browser = webdriver.Chrome(executable_path=driver, chrome_options=options)
    except Exception:
        # Do not leave the service process running if Chrome fails to start.
        c_service.stop()
        raise
    return browser, c_service
def getcook():
    """Log in through headless Chrome and return the session cookies.

    Returns:
        list: The value of ``driver.get_cookies()`` taken right after the
        login form is submitted, so other requests can reuse the session
        without logging in again.

    Raises:
        Any Selenium exception raised while driving the login page. The
        browser and the chromedriver service are always shut down first.
    """
    loginurl = 'http://113.57.169.227:8088/ccps/login.jsp'  # login page
    # chromedriver used to drive the login page.
    path = r'd:\chromedriver.exe'

    c_service = Service(path)
    c_service.command_line_args()
    c_service.start()
    driver = None
    try:
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(executable_path=path, options=chrome_options)
        driver.get(loginurl)

        # Clear and fill the user name, then the password.
        driver.find_element_by_id('wcode').clear()
        driver.find_element_by_id('wcode').send_keys(u'WHBK100')
        driver.find_element_by_id('password').clear()
        driver.find_element_by_id('password').send_keys(u'')
        submit = driver.find_element_by_link_text("登录")

        # Captcha handling is a placeholder: ``captcha`` is always empty, so
        # the prompt below is never reached as the code stands.
        captcha = []
        if captcha:
            captcha_field = driver.find_element_by_id('captcha_field')
            text = input("请输入验证码:")  # typed on the console
            captcha_field.send_keys(text)  # forwarded to the browser
        submit.click()

        # NOTE(review): cookies are read immediately after the click and
        # before the one-second pause - confirm the session cookie is
        # already set at this point.
        cookies = driver.get_cookies()
        time.sleep(1)
    finally:
        if driver is not None:
            driver.quit()
        c_service.stop()
    return cookies
def get_urls(xingqi, A):
    """Collect odds-page URLs for matches whose third column matches ``xingqi``.

    The score page is rendered in headless Chrome, a tab is clicked, the page
    is scrolled to the bottom twice so all rows load, and the resulting HTML
    is parsed with lxml.

    Args:
        xingqi: Substring looked for in the text of each row's third cell.
        A: Dict updated in place; maps each collected URL to that cell text.

    Returns:
        list: The collected URLs, in page order.
    """
    # The browser runs in the background, so the driver service is started
    # and stopped explicitly to make sure the process really goes away.
    c_service = Service(
        r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
    c_service.command_line_args()
    c_service.start()
    browser = None
    try:
        opt = Options()
        opt.add_argument('--headless')  # Chrome's built-in headless mode
        browser = webdriver.Chrome(chrome_options=opt)
        wait = WebDriverWait(browser, 10)  # wait up to 10 s for elements
        browser.get('http://www.qqshidao.com/index.php?c=home&a=bifen')
        time.sleep(3)
        submit = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="app"]/div[6]/div/span[6]')))
        submit.click()
        time.sleep(2)

        # Scroll far past the end of the page; with many matches the first
        # scroll only loads more rows, so scroll a second time.
        js = 'var q=document.documentElement.scrollTop=100000'
        browser.execute_script(js)
        time.sleep(1)
        browser.execute_script(js)
        time.sleep(2)
        yuanma = browser.page_source
    finally:
        if browser is not None:
            browser.quit()
        c_service.stop()

    s = etree.HTML(yuanma)
    urls = []
    for row in s.xpath('//*[@id="app"]/div[7]/div/table/tbody/tr[@data-fid]'):
        cell_text = row.xpath('./td[3]/text()')
        if not cell_text:
            # Row without text in the third cell: nothing to match against.
            continue
        xingqiji = cell_text[0]
        if xingqi in xingqiji:
            url = 'http://www.qqshidao.com/index.php?c=odds&a=betfair&fid={}'.format(
                row.attrib['data-fid'])
            A[url] = xingqiji
            urls.append(url)
    return urls
def download_by_webdriver(url, charset='utf-8', proxy=None, user_agent=None):
    """Render ``url`` in headless Chrome and return the page source.

    Args:
        url: Address to load.
        charset: Characters that cannot be encoded in this charset are
            dropped from the returned text.
        proxy: Optional value for Chrome's ``proxy-server`` argument.
        user_agent: Optional user-agent string; a desktop Chrome string is
            used when omitted.

    Returns:
        tuple: ``(page_source, current_url)``, or ``(None, None)`` when
        anything fails. Errors are printed, never raised.
    """
    print("[download_by_webdriver]: begin download the link %s" % url)
    driver = None
    c_service = None
    try:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('lang=zh_CN.UTF-8')  # Chinese locale
        if proxy:
            print("[download_by_webdriver]: use proxy %s" % proxy)
            options.add_argument('proxy-server=' + proxy)
        if user_agent:
            options.add_argument('user-agent=' + user_agent)
        else:
            options.add_argument(
                'user-agent=' +
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/71.0.3578.98 Safari/537.36')

        c_service = Service('/usr/local/bin/chromedriver')
        c_service.command_line_args()
        c_service.start()
        driver = webdriver.Chrome(chrome_options=options)

        # Timeouts must be configured before the page is requested; set
        # after ``get`` they had no effect on the load.
        driver.implicitly_wait(10)
        driver.set_page_load_timeout(15)
        driver.get(url)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        p_content = driver.page_source.encode(charset, "ignore").decode(charset, 'ignore')
        current_url = driver.current_url
    except Exception as e:
        print("[download_by_webdriver]:", e)
        p_content, current_url = None, None
    finally:
        # Release the browser and the service even when the download failed;
        # clean-up problems are reported but never raised.
        for resource, closer in ((driver, 'quit'), (c_service, 'stop')):
            if resource is None:
                continue
            try:
                getattr(resource, closer)()
            except Exception as e:
                print("[download_by_webdriver]:", e)
    return p_content, current_url
def download_by_webdriver(url, charset='utf-8'):
    """Render ``url`` in headless Chrome and return the page source.

    Args:
        url: Address to load.
        charset: Characters that cannot be encoded in this charset are
            dropped from the returned text.

    Returns:
        tuple: ``(page_source, current_url)``, or ``(None, None)`` when
        anything fails. Errors are printed, never raised.
    """
    print("[download_by_webdriver]: begin download the link %s" % url)
    driver = None
    c_service = None
    try:
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        c_service = Service('/usr/local/bin/chromedriver')
        c_service.command_line_args()
        c_service.start()
        driver = webdriver.Chrome(chrome_options=chrome_options)

        # The implicit wait is configured before the page is requested.
        driver.implicitly_wait(10)
        driver.get(url)
        content = driver.page_source.encode(charset, "ignore").decode(
            charset, 'ignore')
        current_url = driver.current_url
    except Exception as e:
        print("[download_by_webdriver]:", e)
        content, current_url = None, None
    finally:
        # Release the browser and the service even when the download failed;
        # clean-up problems are reported but never raised.
        for resource, closer in ((driver, 'quit'), (c_service, 'stop')):
            if resource is None:
                continue
            try:
                getattr(resource, closer)()
            except Exception as e:
                print("[download_by_webdriver]:", e)
    return content, current_url
class Login(object):
    """Selenium-driven password login on Zhihu, including the slider captcha.

    The account and password come from ``sys.argv[1]`` / ``sys.argv[2]`` when
    both are given, otherwise placeholder values are used.
    """

    def __init__(self):
        if len(sys.argv) < 3:
            self.account = '1778973****'
            self.password = '******'
        else:
            self.account = sys.argv[1]
            self.password = sys.argv[2]

        self.url = "https://www.zhihu.com/signin?next=%2Fsettings%2Faccount"
        self.browser = 'chrome'
        # Defined for every browser so after_quit() can test it safely.
        self.c_service = None

        if self.browser == 'chrome':
            # Raw string: the path contains backslashes.
            driver_path = r"D:\Develop\GeckoDriver\chromedriver_win32\chromedriver.exe"
            self.c_service = Service(driver_path)
            self.c_service.command_line_args()
            self.c_service.start()

            chrome_options = Options()
            chrome_options.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"')
            chrome_options.add_argument('--no-sandbox')  # disable the sandbox
            chrome_options.add_argument('--disable-gpu')  # no GPU acceleration
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--start-maximized')  # maximised window
            chrome_options.add_argument("--incognito")  # incognito mode
            # Hides the "Chrome is being controlled by automated test
            # software" banner. The original author notes this has no effect
            # from Chrome 79 on.
            chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
            self.driver = webdriver.Chrome(executable_path=driver_path, options=chrome_options)
            # Anti-bot workaround: a script injected via CDP into every new
            # document makes ``navigator.webdriver`` evaluate to undefined.
            self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
                "source": """
                    Object.defineProperty(navigator, 'webdriver', {
                        get: () => undefined
                    })
                """
            })
        elif self.browser == 'firefox':
            driver_path = "/usr/local/bin/geckodriver"
            firefox_options = FirefoxOptions()
            firefox_options.add_argument('--headless')
            firefox_options.add_argument('--no-sandbox')
            firefox_options.add_argument('--disable-gpu')
            self.driver = webdriver.Firefox(executable_path=driver_path, firefox_options=firefox_options)

        self.driver.delete_all_cookies()

    def after_quit(self):
        """Close the browser and stop the chromedriver service, if one was started."""
        try:
            self.driver.quit()
        finally:
            # The separately started service would otherwise keep running
            # after the browser is gone.
            service = getattr(self, 'c_service', None)
            if service is not None:
                service.stop()

    def save_cookie(self, cookies, token=None):
        """Store the login cookies for ``self.account`` in MySQL.

        An existing cookie row for the account is updated; otherwise a new
        row is inserted when a matching spider account exists.

        Args:
            cookies: Serialised cookies to store.
            token: Token stored next to the cookies. Optional because
                ``login_main`` calls this method without one.
                NOTE(review): ``None`` is written as NULL - confirm the
                ``token`` column allows that.

        Returns:
            int: Row count reported by the UPDATE/INSERT, or 0 when no
            matching account was found and nothing was written.
        """
        db = pymysql.Connect(host='127.0.0.1', port=3306, user='******', passwd='password', db='db_name', charset='utf8mb4')
        result = 0
        try:
            cursor = db.cursor()
            sql = ('SELECT cookie.id FROM pigeon_spider_account_cookie cookie '
                   'LEFT JOIN pigeon_spider_account_center spider '
                   'ON spider.id = cookie.pigeon_spider_account_center_id '
                   'LEFT JOIN pigeon_platform_account_center account '
                   'ON account.id = spider.platform_account_center_id '
                   'WHERE account.platform= %s AND account.account = %s')
            count = cursor.execute(sql, ('1688', self.account))
            now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            if count > 0:
                # A cookie row already exists: refresh it.
                row_id = cursor.fetchone()[0]
                sql = ('UPDATE pigeon_spider_account_cookie SET cookies = %s, token = %s,'
                       'update_time = %s WHERE id = %s')
                result = cursor.execute(sql, (cookies, token, now, row_id))
            else:
                sql = ('SELECT spider.id FROM pigeon_spider_account_center spider '
                       'LEFT JOIN pigeon_platform_account_center account '
                       'ON account.id = spider.platform_account_center_id '
                       'WHERE account.platform= %s AND account.account = %s')
                count = cursor.execute(sql, ('1688', self.account))
                if count > 0:
                    spider_id = cursor.fetchone()[0]
                    sql = ('INSERT INTO pigeon_spider_account_cookie (pigeon_spider_account_center_id, '
                           'cookies, token, expires_time, create_time, update_time) '
                           'VALUES (%s, %s, %s, %s, %s, %s)')
                    # New rows expire one day (86400 s) from now.
                    expires_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time() + 86400))
                    result = cursor.execute(sql, (spider_id, cookies, token, expires_time, now, now))
            print(result)
            db.commit()
        finally:
            # Close the connection even when a query fails.
            db.close()
        return result

    def urllib_download(self, imgurl, imgsavepath):
        """Download an image.

        :param imgurl: URL of the image to download
        :param imgsavepath: path the image is written to
        """
        from urllib.request import urlretrieve
        urlretrieve(imgurl, imgsavepath)

    # Background on the gradient-based image processing used below:
    # https://blog.csdn.net/sinat_36458870/article/details/78825571?utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromMachineLearnPai2%7Edefault-1.control&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromMachineLearnPai2%7Edefault-1.control
    def get_position_senior(self, chunk, canves):
        """Locate the puzzle piece in the background via gradient matching.

        Both images are converted to grayscale, blurred, reduced to Sobel
        gradient images, and matched with ``cv2.TM_CCOEFF_NORMED``. A copy of
        the background with the match outlined is written to the ``image``
        directory next to this file.

        :param chunk: path of the background image (the large image)
        :param canves: path of the puzzle-piece image
        :return: x coordinate of the best match
        """
        base_dir = os.path.dirname(os.path.abspath(__file__))
        chunk = cv2.imread(chunk)  # background, read in colour
        canves = cv2.imread(canves)  # puzzle piece, read in colour

        def gradient(image):
            # grayscale -> Gaussian blur -> Sobel X minus Sobel Y -> 8-bit
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            blurred = cv2.GaussianBlur(gray, (9, 9), 0)
            grad_x = cv2.Sobel(blurred, ddepth=cv2.CV_32F, dx=1, dy=0)
            grad_y = cv2.Sobel(blurred, ddepth=cv2.CV_32F, dx=0, dy=1)
            return cv2.convertScaleAbs(cv2.subtract(grad_x, grad_y))

        chunk_gradient = gradient(chunk)
        canves_gradient = gradient(canves)

        md = cv2.TM_CCOEFF_NORMED
        th, tw = canves_gradient.shape[:2]
        result = cv2.matchTemplate(chunk_gradient, canves_gradient, md)
        min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result)
        tl = max_loc  # best match is the maximum for TM_CCOEFF_NORMED
        br = (tl[0] + tw, tl[1] + th)
        cv2.rectangle(chunk, tl, br, [0, 0, 0])
        # str() replaces np.str, which NumPy removed in 1.24.
        cv2.imwrite(base_dir + "/image/" + "pipei" + str(md) + ".jpg", chunk)

        y, x = np.unravel_index(result.argmax(), result.shape)
        print(x)
        return x

    def get_position(self, chunk, canves):
        """Locate the gap by matching the piece against the inverted background.

        NOTE(review): shows two OpenCV windows and blocks in
        ``cv2.waitKey(0)`` until a key is pressed.

        :param chunk: path of the background image (the large image)
        :param canves: path of the puzzle-piece image
        :return: x coordinate of the best match
        """
        base_dir = os.path.dirname(os.path.abspath(__file__))
        # Second imread argument 0 = read as grayscale.
        chunk = cv2.imread(chunk, 0)
        canves = cv2.imread(canves, 0)
        h, w = canves.shape[:2]

        # Intermediate images are written to disk and read back.
        slide_puzzle = base_dir + "/image/slide_puzzle.jpg"
        slide_bg = base_dir + "/image/slide_bg.jpg"
        cv2.imwrite(slide_bg, chunk)
        cv2.imwrite(slide_puzzle, canves)

        chunk = cv2.imread(slide_bg)  # read back as BGR
        chunk = cv2.cvtColor(chunk, cv2.COLOR_BGR2GRAY)
        chunk = abs(255 - chunk)  # invert the background
        cv2.imwrite(slide_bg, chunk)

        chunk = cv2.imread(slide_bg)
        canves = cv2.imread(slide_puzzle)
        result = cv2.matchTemplate(chunk, canves, cv2.TM_CCOEFF_NORMED)
        min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result)
        # For TM_CCOEFF_NORMED the best match is the maximum, so outline
        # max_loc (min_loc marked the worst match).
        top_left = max_loc
        bottom_right = (top_left[0] + w, top_left[1] + h)
        cv2.rectangle(chunk, top_left, bottom_right, (0, 0, 255), 2)
        cv2.imshow("img_template", chunk)
        cv2.imshow("processed", canves)
        cv2.waitKey(0)

        y, x = np.unravel_index(result.argmax(), result.shape)
        return x

    def get_track(self, distance):
        """Build a human-looking list of per-step drag offsets.

        The simulated slider accelerates until 7/8 of ``distance`` and then
        decelerates; the loop ends once ``distance + 10`` has been covered.

        :param distance: distance to drag, in pixels
        :return: list of rounded displacements, one per 0.2 s step
        """
        v = 0  # current speed
        t = 0.2  # duration of one step
        tracks = []
        current = 0  # distance covered so far
        mid = distance * 7 / 8  # start decelerating from here
        distance += 10  # deliberately slide a little too far
        while current < distance:
            if current < mid:
                # Smaller acceleration -> smaller steps -> finer track.
                a = random.randint(2, 4)
            else:
                a = -random.randint(3, 5)
            v0 = v
            s = v0 * t + 0.5 * a * (t ** 2)  # displacement of this step
            current += s
            tracks.append(round(s))
            v = v0 + a * t  # start speed of the next step
        return tracks

    def login_main(self):
        """Run the login flow, solve the slider captcha and store the cookies.

        :return: True when no error notification is shown and the cookies
            were saved; False on an error notification or any exception (the
            browser is closed in the exception case).
        """
        try:
            driver = self.driver
            driver.get(self.url)
            driver.implicitly_wait(10)  # maximum wait for element lookups

            # Switch to the password-login tab.
            driver.find_element_by_xpath("//*[@class='SignFlow-tabs']/*[2]").click()
            # Random pauses throughout to look less like a bot.
            time.sleep(random.uniform(1, 3))
            driver.find_element_by_xpath("//input[@name='username']").send_keys(self.account)
            driver.find_element_by_xpath("//input[@name='password']").send_keys(self.password)
            time.sleep(random.uniform(1, 3))
            driver.find_element_by_xpath("//button[@type='submit']").click()
            time.sleep(random.uniform(1, 3))

            # Captcha background: rendered width, x position and image URL.
            bk_element = driver.find_element_by_xpath('//img[@class="yidun_bg-img"]')
            web_image_width = bk_element.size['width']
            bk_block_x = bk_element.location['x']
            # Puzzle piece: x position and image URL.
            slide_element = driver.find_element_by_xpath('//img[@class="yidun_jigsaw"]')
            slide_block_x = slide_element.location['x']
            bk_block = driver.find_element_by_xpath('//img[@class="yidun_bg-img"]').get_attribute('src')
            slide_block = driver.find_element_by_xpath('//img[@class="yidun_jigsaw"]').get_attribute('src')
            # The slider handle that is dragged.
            slid_ing = driver.find_element_by_xpath('//div[@class="yidun_slider"]')

            base_dir = os.path.dirname(os.path.abspath(__file__))
            os.makedirs(base_dir + '/image/', exist_ok=True)
            self.urllib_download(bk_block, base_dir + '/image/bkBlock.png')
            self.urllib_download(slide_block, base_dir + '/image/slideBlock.png')
            time.sleep(random.uniform(1, 3))

            # Ratio between the downloaded image and its on-page rendering.
            img_bkblock = Image.open(base_dir + '/image/bkBlock.png')
            real_width = img_bkblock.size[0]
            width_scale = float(real_width) / float(web_image_width)
            position_x = self.get_position_senior(base_dir + '/image/bkBlock.png',
                                                  base_dir + '/image/slideBlock.png')
            # Convert to on-page pixels, then subtract the piece's left
            # margin inside the background to get the drag distance.
            real_position = position_x / width_scale
            real_position = real_position - (slide_block_x - bk_block_x)
            track_list = self.get_track(real_position)

            # Press and hold the slider, replay the track, then release.
            ActionChains(driver).click_and_hold(on_element=slid_ing).perform()
            time.sleep(0.02)
            for track in track_list:
                ActionChains(driver).move_by_offset(xoffset=track, yoffset=0).perform()
                time.sleep(0.01)
            time.sleep(0.5)
            print("验证滑块结束")
            ActionChains(driver).release(on_element=slid_ing).perform()

            # find_elements returns an empty list when no notification is
            # shown; find_element raised in that case and turned every
            # successful login into a failure.
            notices = driver.find_elements_by_xpath('//div[@class="Notification-textSection Notification-textSection--withoutButton"]')
            error_message = notices[0].text if notices else ''
            # The original author notes that after the slider passes, Zhihu
            # may still report a request-parameter error of unknown cause.
            if error_message:
                print(error_message)
                return False
            cookies_all = json.dumps(self.driver.get_cookies())
            self.save_cookie(cookies_all)
            return True
        except Exception as e:
            print('str(Exception):\t', str(Exception))
            print('str(e):\t\t', str(e))
            print('repr(e):\t', repr(e))
            self.after_quit()
            return False
class ICloud(object):
    """Browser session that logs in to iCloud Find My Friends and records locations.

    A Chrome instance is driven through Selenium while a pychrome DevTools
    connection listens to network responses; location payloads found in
    ``refreshClient`` responses are stored through ``save_model``.
    """

    # Timeout passed to set_page_load_timeout and WebDriverWait.
    TIMEOUT = 100
    # Class-level defaults; most are replaced per instance in __init__,
    # start_browser, run_login and run_codes.
    id = None
    account = None
    browser = None
    chrome = None
    wait = None
    tab = None
    # Set in __del__; checked by the timer callbacks so they stop rescheduling.
    deleted = False
    # NOTE(review): defined on the class, so this set of already-checked
    # uids is shared by all instances - confirm that is intended.
    mapping = set()
    c_service = None

    def __init__(self):
        """Start the chromedriver service and a browser."""
        gc.collect()
        # The id names this instance's log, cookie and screenshot files.
        # NOTE(review): drawn from 0..100, so two instances can collide.
        self.id = random.randint(0, 100)
        self.c_service = Service('chromedriver')
        self.c_service.command_line_args()
        self.c_service.start()
        self.start_browser()
        logger.info('Start a browser.')

    def __wait_for_visible(self, xpath):
        """Wait until the element at ``xpath`` is visible and return the wait's result."""
        return self.wait.until(
            expected_conditions.visibility_of_element_located(
                (By.XPATH, xpath)))

    def run_login(self, account, password):
        """Submit account name and password on the iCloud login page.

        Returns True once the first verification-code input (``char0``)
        becomes visible, False when waiting for it times out.
        """
        logger.info('"{account}" is logging in.'.format(account=account))
        self.account = account
        # Get the login page.
        self.browser.get('https://www.icloud.com/#fmf')
        auth_frame = self.__wait_for_visible('//*[@id="auth-frame"]')
        self.browser.switch_to.frame(auth_frame)
        logger.info('Login page is loaded.')
        # Process input: account name and password.
        remember_me_input = self.browser.find_element_by_xpath(
            '//*[@id="remember-me"]')
        remember_me_input.click()
        account_name_text_field = self.browser.find_element_by_xpath(
            '//*[@id="account_name_text_field"]')
        account_name_text_field.send_keys(account)
        account_name_text_field.send_keys(Keys.RETURN)
        password_text_field = self.__wait_for_visible(
            '//*[@id="password_text_field"]')
        password_text_field.send_keys(password)
        password_text_field.send_keys(Keys.RETURN)
        try:
            # Wait until the code controls are visible.
            self.__wait_for_visible('//*[@id="char0"]')
            return True
        except TimeoutException:
            # Login failed.
            return False

    def run_codes(self, codes):
        """Enter the six verification characters and start monitoring.

        ``codes`` is indexed 0..5. Returns False when the trust-browser
        button does not appear in time; otherwise saves cookies, attaches
        the network listener, schedules ``auto_refresh`` and returns True.
        """
        # Write codes to each controls.
        for i in range(6):
            char = self.browser.find_element_by_xpath(
                '//*[@id="char{i}"]'.format(i=i))
            char.send_keys(codes[i])
        try:
            # Click trust button
            trust_browser = self.__wait_for_visible(
                '//*[starts-with(@id, "trust-browser-")]')
            trust_browser.click()
        except TimeoutException:
            # Codes were incorrect.
            return False
        self.save_cookies()
        logger.info('Start network listening...')
        # Attach to the last tab reported by DevTools and receive every
        # Network.responseReceived event in response_received.
        self.tab = self.chrome.list_tab()[-1]
        self.tab.Network.responseReceived = self.response_received
        self.tab.start()
        self.tab.Network.enable()
        # Start auto refresh.
        Timer(60, self.auto_refresh).start()
        return True

    def save_cookies(self):
        """Write the browser's cookies as JSON to ``./logs/<id>.cookies``."""
        cookies = self.browser.get_cookies()
        jsonCookies = json.dumps(cookies)
        with open('./logs/{id}.cookies'.format(id=self.id), 'w') as f:
            f.write(jsonCookies)

    def load_cookies(self):
        """Replace the browser's cookies with those in ``./logs/<id>.cookies``."""
        self.browser.delete_all_cookies()
        with open('./logs/{id}.cookies'.format(id=self.id), 'r') as f:
            listCookies = json.loads(f.read())
        for cookie in listCookies:
            # Only domain, name and value are taken from the file; path and
            # expiry are fixed.
            self.browser.add_cookie({
                'domain': cookie['domain'],
                'name': cookie['name'],
                'value': cookie['value'],
                'path': '/',
                'expires': None
            })
        logger.info('Browser cookies loaded.')

    def start_browser(self):
        """Launch Chrome and connect pychrome to its DevTools endpoint.

        Raises ``Exception('Invalid protocol url.')`` when no DevTools URL
        is found in the chromedriver log.
        """
        # Start chromedriver
        options = webdriver.ChromeOptions()
        options.add_argument('--disable-background-networking=false')
        options.add_argument('--no-sandbox')
        retry_count = 0
        # Retry browser creation on ConnectionResetError, up to 10 attempts.
        while True:
            try:
                self.browser = webdriver.Chrome(
                    chrome_options=options,
                    service_args=[
                        '--verbose',
                        '--log-path=./logs/{id}.log'.format(id=self.id)
                    ])
                break
            except ConnectionResetError as e:
                retry_count += 1
                if retry_count >= 10:
                    raise e
        self.browser.set_page_load_timeout(self.TIMEOUT)
        self.wait = WebDriverWait(self.browser, self.TIMEOUT)
        # Get debug url
        url = None
        # The verbose chromedriver log is scanned for the first
        # "DevTools request" line; its URL without '/json/version' is used.
        with open('./logs/{id}.log'.format(id=self.id), 'r') as log:
            for line in log:
                if 'DevTools request: http://localhost' in line:
                    url = line[line.index('http'):].replace(
                        '/json/version', '').strip()
                    break
        if not url:
            raise Exception('Invalid protocol url.')
        # Start pychrome
        self.chrome = pychrome.Browser(url=url)

    def restart_browser(self):
        """Replace the browser with a fresh one and restore the saved session.

        Re-raises the last error when loading the page with the saved
        cookies fails five times.
        """
        try:
            if self.browser:
                self.browser.close()
                self.browser.quit()
        except Exception:
            # Best effort: errors while closing the old browser are ignored.
            pass
        self.start_browser()
        logger.info('Browser restarted.')
        retry = 0
        while True:
            try:
                self.browser.get('https://www.icloud.com/#fmf')
                self.load_cookies()
                self.browser.refresh()
                break
            except Exception as e:
                retry += 1
                if retry >= 5:
                    raise e
        logger.info('Start network listening...')
        self.tab = self.chrome.list_tab()[-1]
        self.tab.Network.responseReceived = self.response_received
        self.tab.start()
        self.tab.Network.enable()
        # Start auto refresh.
        Timer(60, self.auto_refresh).start()

    def auto_refresh(self):
        """Timer callback: click through the friend list, then reschedule itself.

        On a WebDriverException the error is logged and ``refresh_page``
        takes over.
        """
        if self.deleted:
            return
        try:
            self.browser.switch_to.default_content()
            frame = self.__wait_for_visible('//*[@id="fmf"]')
            self.browser.switch_to.frame(frame)
            nearby = self.__wait_for_visible(
                '/html/body/div[2]/div/div/div[2]/div[1]/div/div[3]/div[1]/div[1]'
            )
            friends = self.browser.find_elements_by_xpath(
                '/html/body/div[2]/div/div/div[2]/div[1]/div/div[3]/div[1]/div[not(contains(@class, "nearby"))]'
            )
            # presumably each click makes the page request that friend's
            # location - TODO confirm
            for friend in friends:
                friend.click()
            nearby.click()
            self.save_cookies()
            Timer(60, self.auto_refresh).start()
        except WebDriverException as e:
            logger.error(e.args)
            self.refresh_page()

    def refresh_page(self, retry=1):
        """Reload the page; after repeated failures restart the browser.

        ``retry`` counts attempts. A failed reload schedules another call
        with ``retry + 1`` after 10 seconds. From the fifth attempt on the
        browser is restarted instead; if that fails too, the account is
        removed from ``ICLOUD_DICT`` and a notification mail is sent.
        """
        if self.deleted:
            return
        if retry >= 5:
            logger.error('SERVICE DOWN! restarting...')
            try:
                self.restart_browser()
            except Exception as e:
                ICLOUD_DICT.pop(self.account)
                app.mail.send(
                    'FMF: SERVICE DOWN',
                    '<p>{account} unavailable, login again.</p><p>{e}</p>'.
                    format(account=self.account, e=e.args),
                    img='logs/{id}.png'.format(id=self.id))
            return
        logger.info('REFRESHING...')
        try:
            # The screenshot is the image attached to the failure mail above.
            self.browser.save_screenshot('logs/{id}.png'.format(id=self.id))
            self.browser.refresh()
            Timer(60, self.auto_refresh).start()
        except Exception as e:
            logger.error(e.args)
            retry += 1
            Timer(10, self.refresh_page, [retry]).start()

    def response_received(self, **kwargs):
        """Network.responseReceived handler: store locations from refreshClient responses.

        Reads ``response`` and ``requestId`` from ``kwargs``. Responses
        whose URL does not contain ``refreshClient`` are ignored.
        """
        response = kwargs.get('response')
        request_id = kwargs.get('requestId')
        if 'refreshClient' in response.get('url'):
            try:
                content = self.tab.Network.getResponseBody(
                    requestId=request_id)['body']
            except pychrome.CallMethodException:
                # The body could not be fetched; skip this response.
                return
            logger.info('{request_id}: {content}'.format(request_id=request_id,
                                                         content=content))
            obj = json.loads(content)
            if 'locations' in obj:
                # Map contact id -> display name built from the name parts.
                contacts = {}
                for contact in obj['contactDetails']:
                    id = contact['id']
                    name = '{first} {middle} {last}'.format(
                        first=contact['firstName'],
                        middle=contact['middleName'],
                        last=contact['lastName']).strip()
                    contacts[id] = name
                for loc in obj['locations']:
                    if loc['location'] is None:
                        continue
                    id = loc['id']
                    locid = loc['location']['locationId']
                    # Address: 'UNKNOWN', the joined formatted lines, or
                    # street / locality / administrative area.
                    if loc['location']['address'] is None:
                        address = 'UNKNOWN'
                    elif 'formattedAddressLines' in loc['location']['address']:
                        address = ' '.join(loc['location']['address']
                                           ['formattedAddressLines'])
                    else:
                        address = '{streetAddress} {locality} {administrativeArea}'.format(
                            streetAddress=loc['location']['address']
                            ['streetAddress'],
                            locality=loc['location']['address']['locality'],
                            administrativeArea=loc['location']['address']
                            ['administrativeArea'])
                    # presumably the timestamp is in milliseconds, converted
                    # to seconds here - TODO confirm
                    time = loc['location']['timestamp'] / 1000.0
                    accuracy = loc['location']['horizontalAccuracy']
                    latitude = loc['location']['latitude']
                    longitude = loc['location']['longitude']
                    # NOTE(review): contacts[id] raises KeyError when a
                    # location's id is missing from contactDetails.
                    self.save_model({
                        'locid': locid,
                        'account': self.account,
                        'uid': id,
                        'name': contacts[id],
                        'time': time,
                        'accuracy': accuracy,
                        'latitude': latitude,
                        'longitude': longitude,
                        'address': address
                    })

    def save_model(self, obj):
        """Persist one location and forward it to the Baidu YingYan track API.

        ``obj`` is the dict built in ``response_received``. Nothing is
        stored or posted when a ``Location`` with the same ``locid`` exists.
        """
        if obj['uid'] not in self.mapping:
            # First time this uid is seen: query the entity list and add the
            # entity when the query reports a non-zero status.
            res = requests.get('http://yingyan.baidu.com/api/v3/entity/list',
                               params={
                                   'ak': BMAP_AK,
                                   'service_id': YINGYAN_ID,
                                   'filter':
                                   'entity_names:{uid}'.format(uid=obj['uid'])
                               })
            jo = json.loads(res.text)
            if jo['status'] != 0:
                res = requests.post(
                    'http://yingyan.baidu.com/api/v3/entity/add',
                    data={
                        'ak': BMAP_AK,
                        'service_id': YINGYAN_ID,
                        'entity_name': obj['uid'],
                        'entity_desc': obj['name']
                    })
                logger.info('YingYan ADD entity: {res}'.format(res=res.text))
            self.mapping.add(obj['uid'])
        if Location.objects.filter(locid=obj['locid']):
            return
        try:
            Location.objects.create(locid=obj['locid'],
                                    account=obj['account'],
                                    uid=obj['uid'],
                                    name=obj['name'],
                                    time=datetime.datetime.fromtimestamp(
                                        obj['time']),
                                    accuracy=obj['accuracy'],
                                    latitude=obj['latitude'],
                                    longitude=obj['longitude'],
                                    address=obj['address'])
        except IntegrityError:
            # An IntegrityError on create is ignored; the point is still
            # posted below.
            pass
        res = requests.post('http://yingyan.baidu.com/api/v3/track/addpoint',
                            data={
                                'ak': BMAP_AK,
                                'service_id': YINGYAN_ID,
                                'entity_name': obj['uid'],
                                'latitude': obj['latitude'],
                                'longitude': obj['longitude'],
                                'loc_time': int(obj['time']),
                                'radius': obj['accuracy'],
                                'coord_type_input': 'wgs84',
                                'address': obj['address']
                            })
        logger.info('YingYan ADD point: {res}'.format(res=res.text))

    def __del__(self):
        """Mark the instance deleted, quit the browser and stop the driver service."""
        # Timer callbacks check this flag and stop rescheduling themselves.
        self.deleted = True
        if self.browser:
            self.browser.quit()
        self.c_service.stop()
        logger.info('A browser is Closed.')
#!/usr/bin/python3 from selenium import webdriver from selenium.webdriver.common.keys import Keys from selenium.webdriver.chrome.service import Service from selenium.webdriver.chrome.options import Options import time import random import sys import string import random import base64 import csv #开启服务为了后续关闭服务,不然进程一直杀不掉 driver_service = Service('./webdriver/chromedriver.exe') #括号内填写 驱动路径 driver_service.command_line_args() driver_service.start() chrome_options = Options() chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--disable-dev-shm-usage') #chrome_options.add_argument('--headless') # par_dir = os.path.dirname(os.path.abspath(__file__)) # os.chdir(par_dir) def decode(s): return base64.b64decode(s).decode('ascii') localtime = time.asctime(time.localtime(time.time()))
def _split_price_range(price_range):
    """Return ``(min_price, max_price)`` parsed from a model's price text."""
    if '-' in price_range:
        # "low-high万": drop the trailing unit character, split on the dash,
        # then re-attach the unit to both bounds.
        bounds = price_range[:-1].split('-')
        return bounds[0] + '万', bounds[1] + '万'
    if '暂无' in price_range:
        return '暂无', '暂无'
    return price_range, price_range


def AutoInfoSpider(url):
    """Scrape brand, model, price range and picture link into ``auto_info.csv``.

    Opens ``url`` in headless Chrome, walks the 24 letter groups of the brand
    index, enters every brand, pages through its model list and records one
    row per model. The rows are written to ``auto_info.csv`` (UTF-8 with BOM).

    The browser and the chromedriver service are always shut down, even when
    scraping fails part-way; in that case the exception propagates and no
    file is written.

    :param url: address of the page that shows the brand index.
    """
    brand_xpath = '/html/body/div[8]/div[1]/div[2]/div/div[{0}]/div[{1}]/a/div'
    page_xpath = '/html/body/div[8]/div[4]/div[5]/div/div/div/a'
    model_xpath = '/html/body/div[8]/div[4]/div[3]/div'

    # Start the chromedriver service.
    c_service = Service(
        'C:/Users/gusisong/AppData/Local/Continuum/anaconda3/chromedriver.exe')
    c_service.command_line_args()
    c_service.start()

    driver = None
    output = []
    try:
        # Browser configuration.
        option = webdriver.ChromeOptions()
        option.add_argument('--headless')
        option.add_argument('--disable-gpu')
        option.add_argument('--disable-images')
        option.add_argument('--disable-javascript')
        option.add_argument('--disable-plugins')
        option.add_argument('--no-sandbox')
        driver = webdriver.Chrome(options=option)
        driver.get(url)

        for letter_index in range(1, 25):
            # Scrape brand by brand within the current letter group.
            brand_list = driver.find_elements_by_xpath(
                '/html/body/div[8]/div[1]/div[2]/div/div[{}]//a/div'.format(
                    letter_index))
            for brand_index in range(2, len(brand_list) + 2):
                # Record the brand name, then enter its model catalogue.
                brand = driver.find_element_by_xpath(
                    brand_xpath.format(letter_index, brand_index))
                brand_name = brand.text
                brand.click()

                # Scrape page by page.
                page_list = driver.find_elements_by_xpath(page_xpath)
                for page_index in range(1, len(page_list) + 1):
                    if page_index > 1:
                        driver.find_element_by_xpath(
                            '{}[{}]'.format(page_xpath, page_index)).click()
                    model_list = driver.find_elements_by_xpath(model_xpath)
                    for model_index in range(1, len(model_list) + 1):
                        item_xpath = '{}[{}]'.format(model_xpath, model_index)
                        # Model name.
                        model_name = driver.find_element_by_xpath(
                            item_xpath + '/a/p[1]').text
                        # Model price range.
                        price_range = driver.find_element_by_xpath(
                            item_xpath + '/a/p[2]').text
                        min_price, max_price = _split_price_range(price_range)
                        # Picture link.
                        pic_link = driver.find_element_by_xpath(
                            item_xpath + '/a/img').get_attribute("src")
                        print(brand_name, model_name, min_price, max_price,
                              pic_link)
                        output.append([
                            brand_name, model_name, min_price, max_price,
                            pic_link
                        ])
    finally:
        # Shut everything down completely, whatever happened above.
        if driver is not None:
            driver.quit()
        c_service.stop()

    # Write the file. Passing ``columns`` up front keeps this valid even when
    # no rows were scraped (assigning ``df.columns`` afterwards would raise).
    df = DataFrame(output, columns=['品牌名', '车型', '最低价', '最高价', '图片链接'])
    df.to_csv('auto_info.csv', encoding='utf_8_sig')
class CaesarReaderWindow(QMainWindow, Ui_myMainWindow):
    """Main window that "reads" articles in an emulated mobile Chrome.

    Between articles the PPPoE broadband connection is hung up and redialled.
    Reading runs in ``DriverThread`` and dialling in ``PPPOETask``; the two
    threads hand control to each other through their ``trigger`` signals.
    """

    def __init__(self, parent=None):
        super(CaesarReaderWindow, self).__init__(parent)
        self.chromeDriverPath = os.path.abspath('chromedriver.exe')
        self.driver = None
        self.c_service = None
        self.startReadFlag = False
        self.stopReadFlag = False
        self.stopPPPOEFlag = False
        # Number of articles already read.
        self.readNum = 0
        # Number of articles still to read.
        self.unReadNum = 0
        self.driverThread = DriverThread(self)
        self.pppoeThread = PPPOETask(self)
        self.setupUi(self)
        self.connectPppoeBtn.clicked.connect(self.connect_pppoe_click)
        self.disconnectPppoeBtn.clicked.connect(self.disconnect_pppoe_click)
        self.saveConfigBtn.clicked.connect(self.save_config_click)
        self.startReadBtn.clicked.connect(self.start_read_click)
        self.stopReadBtn.clicked.connect(self.end_read_click)
        self.init_config()

    def init_config(self):
        """Load the global configuration on the shared Qt thread pool."""
        try:
            initConfigTask = InitConfigTask(self)
            pool = QThreadPool.globalInstance()
            pool.start(initConfigTask)
        except Exception as e:
            self.print_log(str(e))

    def popen(self, cmd):
        """Run ``cmd`` and return its stdout lines decoded as GBK.

        :return: list of decoded lines, or ``-1`` when the command fails.
        """
        try:
            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
            # communicate() drains the pipe while waiting. Calling wait()
            # first can deadlock once the child fills the pipe buffer.
            stdout, _ = proc.communicate()
            return [line.decode('gbk') for line in stdout.splitlines(True)]
        except Exception:
            self.print_log("获取ip异常")
            return -1

    def save_config_click(self):
        """Save the global configuration shown in the form to ``config.ini``.

        Nothing is written unless every field is filled in.
        """
        configObj = {
            'startTime': self.startTimeEdit.text(),
            'endTime': self.endTimeEdit.text(),
            'pauseTimeFrom': self.pauseTimeFromEdit.text(),
            'pauseTimeTo': self.pauseTimeToEdit.text(),
            'slipTimesFrom': self.slipTimesFromEdit.text(),
            'slipTimesTo': self.slipTimesToEdit.text(),
            'pxFrom': self.pxFromEdit.text(),
            'pxTo': self.pxToEdit.text(),
            'chromeLocation': self.chromeLocationEdit.text()
        }
        if not all(configObj.values()):
            return
        with open(os.path.abspath('config.ini'), 'w', encoding='utf-8') as f:
            f.write(json.dumps(configObj))

    def start_read_click(self):
        """Handle the "start reading" button: start the reader thread."""
        self.startReadFlag = True
        self.stopReadFlag = False
        try:
            if not self.driverThread.isRunning():
                # NOTE(review): this connects the slot again on every start;
                # confirm whether duplicate connections are intended.
                self.driverThread.trigger.connect(self.reconnect_pppoe)
                self.driverThread.start()
        except Exception as e:
            self.print_log(str(e))

    def _shutdown_driver(self):
        """Best-effort shutdown of the browser and the chromedriver service."""
        try:
            if self.driver is not None:
                self.driver.quit()
                self.driver = None
        except Exception:
            # Deliberately ignored: the browser may already be gone.
            pass
        try:
            if self.c_service is not None:
                self.c_service.stop()
                self.c_service = None
        except Exception:
            pass

    def end_read_click(self):
        """Handle the "stop reading" button: flag the stop and close Chrome."""
        self.startReadFlag = False
        self.stopReadFlag = True
        self._shutdown_driver()

    def connect_pppoe_click(self):
        """Handle the "start dialling" button: start the PPPoE thread."""
        self.stopPPPOEFlag = False
        try:
            if not self.pppoeThread.isRunning():
                self.pppoeThread.trigger.connect(self.read_next)
                self.pppoeThread.start()
        except Exception as e:
            self.print_log(str(e))

    def disconnect_pppoe_click(self):
        """Handle the "stop dialling" button: hang up every connection."""
        self.stopPPPOEFlag = True
        data = self.check_for_broadband()
        if data is not None:
            for p in data:
                self.show_ip_address()
                if self.disconnect_pppoe(p[0]) == "success":
                    self.print_log("宽带%s已经断开" % p[1])
            sleep(5)

    def read_next(self):
        """Read the next article unless reading or dialling was stopped."""
        if not self.startReadFlag:
            return
        if self.stopReadFlag or self.stopPPPOEFlag:
            return
        self.start_read_click()

    def reconnect_pppoe(self):
        """Redial PPPoE unless reading or dialling was stopped."""
        if self.stopReadFlag or self.stopPPPOEFlag:
            return
        self.connect_pppoe_click()

    def build_driver(self, url):
        """Open ``url`` in an emulated mobile Chrome and scroll through it.

        The number of scrolls, the pause before each scroll and the scroll
        distance are drawn at random from the ranges entered in the form.
        The browser and service are always shut down, even on error. When
        the article was read completely, its line is moved from
        ``unread.txt`` to ``read.txt``.

        :param url: article address; a falsy value means nothing is left.
        """
        if not url:
            self.stopReadFlag = True
            self.print_log("已全部阅读完成\n")
            return
        self.readNum = self.readNum + 1
        self.print_log("开始阅读第 %d 篇,剩余 %d 篇" %
                       (self.readNum, self.unReadNum))
        self.print_log("当前:%s" % url)

        hasNum = 0
        try:
            self.c_service = Service(self.chromeDriverPath)
            self.c_service.command_line_args()
            self.c_service.start()
            mobileEmulation = {
                "deviceMetrics": {
                    "width": 320,
                    "height": 640,
                    "pixelRatio": 3.0
                },
                "userAgent":
                'Mozilla/5.0 (Linux; Android 4.1.1; GT-N7100 Build/JRO03C) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/35.0.1916.138 Mobile Safari/537.36 T7/6.3'
            }
            # mobileEmulation = {'deviceName': 'Apple iPhone 5'}
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--window-size=250,640')
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument('--hide-scrollbars')
            chrome_options.add_argument('--disable-javascript')
            chrome_options.add_argument('--log-level=3')
            chrome_options.binary_location = self.chromeLocationEdit.text()
            chrome_options.add_experimental_option('mobileEmulation',
                                                   mobileEmulation)
            chrome_options.add_experimental_option("excludeSwitches",
                                                   ['enable-automation'])
            chrome_options.add_experimental_option('w3c', False)
            self.driver = webdriver.Chrome(options=chrome_options)
            self.driver.get(url)

            # Number of scrolls for this article.
            num = random.randint(int(self.slipTimesFromEdit.text()),
                                 int(self.slipTimesToEdit.text()))
            for n in range(num):
                if self.stopReadFlag:
                    break
                holdTime = random.randint(int(self.pauseTimeFromEdit.text()),
                                          int(self.pauseTimeToEdit.text()))
                px = random.randint(int(self.pxFromEdit.text()),
                                    int(self.pxToEdit.text()))
                self.print_log("第 %d 次下滑,等待 %d 秒, 下滑 %d 像素" %
                               (n + 1, holdTime, px))
                # Pause before each scroll.
                sleep(holdTime)
                action = TouchActions(self.driver)
                # Scroll by the distance that was just drawn and logged.
                action.scroll(0, px).perform()
                hasNum = hasNum + 1
        finally:
            self._shutdown_driver()

        if self.stopReadFlag:
            self.print_log("第 %d 篇阅未完成,共下滑 %d 次\n" %
                           (self.readNum, hasNum))
            return
        self.print_log("第 %d 篇阅读完成,共下滑 %d 次\n" % (self.readNum, hasNum))
        try:
            # Drop the first line of the unread list ...
            unread_path = os.path.abspath('unread.txt')
            with open(unread_path, 'r', encoding='utf-8') as f:
                content = f.readlines()
            with open(unread_path, 'w+', encoding='utf-8') as f1:
                f1.writelines(content[1:])
            # ... and append the article to the read list.
            with open(os.path.abspath('read.txt'), 'a', encoding='utf-8') as f:
                f.write(url)
        except Exception as e:
            print(e)

    def build_pppoe(self):
        """Hang up any live broadband connection, otherwise dial a new one."""
        data = self.check_for_broadband()
        if data is not None:
            for p in data:
                self.show_ip_address()
                if self.disconnect_pppoe(p[0]) == "success":
                    self.print_log("宽带%s已经断开" % p[1])
            sleep(5)
        else:
            try:
                pid, res = self.dial_broadband()
                if res == 0:
                    self.show_ip_address()
                    sleep(5)
            except Exception:
                pass

    def print_log(self, message):
        """Log ``message`` asynchronously through a ``LogTask``.

        :param message: text to log.
        """
        try:
            logTask = LogTask(self, message)
            pool = QThreadPool.globalInstance()
            pool.start(logTask)
        except Exception as msg:
            print(msg)

    def setUnReadNum(self, num):
        """Set the number of articles that are still unread."""
        self.unReadNum = num

    def connect_pppoe(self, dialname, account, passwd):
        """Dial ``dialname`` via ``win32ras.Dial`` and return its result."""
        dial_params = (dialname, '', '', account, passwd, '')
        return win32ras.Dial(None, None, dial_params, None)

    def dial_broadband(self):
        """Dial the broadband connection, retrying every 3 seconds.

        Retries continue until dialling succeeds or ``stopPPPOEFlag`` is set.

        :return: ``(handle, 0)`` on success, ``(-1, -1)`` on failure.
        """
        dialname = '宽带连接'  # just a name
        # A loop instead of self-recursion: retries may go on indefinitely.
        while True:
            self.pppoeStatusLbl.setText("正在拨号...")
            account = self.accountEdit.text()
            passwd = self.passwordEdit.text()
            self.print_log("正在拨号")
            try:
                # handle identifies the connection for hang-up or IP lookup;
                # result is 0 when the connection succeeded.
                handle, result = self.connect_pppoe(dialname, account, passwd)
                if result == 0:
                    self.print_log("拨号成功")
                    self.pppoeStatusLbl.setText("拨号成功")
                    return handle, result
                if self.stopPPPOEFlag:
                    self.print_log("拨号失败")
                    self.pppoeStatusLbl.setText("拨号失败")
                    return -1, -1
                self.print_log("拨号失败,3秒后重试")
                self.pppoeStatusLbl.setText("正在重试")
                sleep(3)
            except Exception as e:
                self.print_log("拨号异常" + str(e))
                return -1, -1

    def disconnect_pppoe(self, handle):
        """Hang up the connection ``handle``, retrying every 3 seconds.

        :return: ``"success"``, or ``"fail"`` when ``handle`` is ``None``.
        """
        # A loop instead of self-recursion: retries may go on indefinitely.
        while True:
            self.print_log("正在断开宽带!")
            self.pppoeStatusLbl.setText("正在断开")
            if handle is None:
                self.print_log("宽带断开异常")
                self.pppoeStatusLbl.setText("断开失败")
                return "fail"
            try:
                win32ras.HangUp(handle)
                self.print_log("宽带断开成功!")
                self.pppoeStatusLbl.setText("断开成功")
                return "success"
            except Exception:
                self.print_log("宽带断开失败,3秒后重试")
                self.pppoeStatusLbl.setText("断开失败")
                sleep(3)

    def check_for_broadband(self):
        """Return the active RAS connections, or ``None`` when there are none."""
        connections = win32ras.EnumConnections()
        if len(connections) == 0:
            self.print_log("系统未运行任何宽带连接")
            return
        self.print_log("系统正在运行%d个宽带连接" % len(connections))
        return connections

    def show_ip_address(self):
        """Show the broadband adapter's IPv4 address parsed from ``ipconfig``."""
        self.print_log("正在查询IP")
        self.pppoeIpLbl.setText("")
        ipconfig_result_list = self.popen('ipconfig')
        if ipconfig_result_list == -1:
            return
        ip_str = None
        have_ppp = 0
        for line in ipconfig_result_list:
            # The IPv4 line that follows the adapter's header is the one
            # that belongs to the broadband connection.
            if line.find("宽带连接") >= 0:
                have_ppp = 1
            if have_ppp == 1:
                if line.strip().startswith("IPv4 地址"):
                    ip_str = line.split(":")[1].strip()
                    have_ppp = 0
        if ip_str is not None:
            self.print_log("IP地址为: " + ip_str)
            self.pppoeIpLbl.setText(ip_str)
class Spider:
    """Scrape the NetEase Cloud Music hot-song chart and each song's comments."""

    def __init__(self):
        # Raw string: the Windows path contains backslashes that would
        # otherwise be read as (invalid) escape sequences.
        self.c_service = Service(
            r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
        )
        self.c_service.command_line_args()
        self.c_service.start()
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # run without a visible window
        #chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(chrome_options=chrome_options)
        self.url = "https://music.163.com/#/discover/toplist?id=3778678"

    def find_allSong(self):
        """Find the URL and title of every hot song and scrape its comments."""
        WebDriverWait(self.driver, 3, 0.5).until(
            lambda driver: self.driver.find_element_by_id("g_iframe"))
        self.driver.switch_to.frame(self.driver.find_element_by_id("g_iframe"))
        soup = BeautifulSoup(self.driver.page_source, "lxml")
        song_list = soup.find('tbody')
        # Every table row is one song.
        all_song = song_list.find_all('tr')
        for each_song in all_song:
            each_info = each_song.find('span', class_="txt")
            long_time = each_song.find('span', class_="u-dur").text
            author = each_song.find('span',
                                    class_="icn icn-share")['data-res-author']
            link = "https://music.163.com/" + each_info.find('a')['href']
            name = each_info.find('b')['title']
            print('歌曲连接 : {}, 歌曲名 : {},作者 : {},时长 : {}'.format(
                link, name, author, long_time))
            self.comment(link, name)

    def comment(self, link, name):
        """Scrape all comment pages of one song into ``<name>-评论信息.csv``.

        The song is opened in a new tab. When the last page is reached (no
        enabled "next page" button), or on any error, that tab is closed and
        the driver returns to the main window so the next song can be opened.

        :param link: URL of the song page.
        :param name: song title, used for the output file name.
        """
        import csv
        from selenium.common.exceptions import NoSuchElementException

        self.driver.execute_script("window.open('%s')" % link)
        self.driver.switch_to.window(window_name=self.driver.window_handles[1])
        try:
            WebDriverWait(self.driver, 3, 0.5).until(
                lambda driver: self.driver.find_element_by_id("g_iframe"))
            self.driver.switch_to.frame(
                self.driver.find_element_by_id("g_iframe"))
            page = 0
            # One file handle for the whole song. csv.writer quotes fields
            # that contain commas or newlines, so rows stay well-formed.
            with open(name + "-评论信息.csv", 'w', encoding="utf-8",
                      newline='') as fp:
                writer = csv.writer(fp, lineterminator='\n')
                writer.writerow(['评论者', '评论内容', '评论日期'])
                while True:
                    page += 1
                    soup = BeautifulSoup(self.driver.page_source, "lxml")
                    all_comment = soup.find_all('div', class_='itm')
                    for each_comment in all_comment:
                        comment_name = each_comment.find('a',
                                                         class_='s-fc7').text
                        tmp = each_comment.find('div', class_='cnt f-brk').text
                        date = each_comment.find('div',
                                                 class_='time s-fc4').text
                        # Strip the "<author>:" prefix from the comment body.
                        comment = tmp.replace(comment_name + ':', '')
                        writer.writerow([comment_name, comment, date])
                    # Next page.
                    print("{}--第{}页.".format(name, page))
                    try:
                        next_button = self.driver.find_element_by_xpath(
                            '//*[starts-with(@class,"zbtn znxt") and not(contains(@class,"js-disabled"))]'
                        )
                    except NoSuchElementException:
                        # No enabled "next" button: this was the last page.
                        return
                    next_button.send_keys(Keys.ENTER)
                    time.sleep(0.5)
        finally:
            # Close the comment tab first, then return to the main window.
            # Switching first would close the main window instead.
            self.driver.close()
            self.driver.switch_to.window(
                window_name=self.driver.window_handles[0])

    def run(self):
        """Open the chart page, scrape everything, then quit the browser."""
        try:
            self.driver.get(self.url)
            self.find_allSong()
        finally:
            self.driver.quit()  # end the browser process

    def __del__(self):
        # __init__ may have failed before the service was created.
        c_service = getattr(self, 'c_service', None)
        if c_service is not None:
            c_service.stop()
def _parse_flight_summary(text):
    """Split one flight summary block into its whitespace-separated fields."""
    # Example of the raw fields:
    # ['东方航空', '|', 'MU5104|直达|', '09:00', '首都国际机场', 'T2', '直达',
    #  '11:15', '虹桥国际机场', 'T2', '02小时15分钟']
    fields = text.strip().split()
    fields.remove('|')
    fields[1] = fields[1].split('|')[0]  # keep only the flight number
    return fields


def _parse_economy_price(text):
    """Return the economy fare from a price cell, or '售完' when it is empty."""
    return text.split()[1] if text else '售完'


def donghangcrawler(city1, city2, date_in):
    """Scrape China Eastern (ceair.com) flights for one route and date.

    :param city1: departure city, typed into the search form.
    :param city2: arrival city, typed into the search form.
    :param date_in: departure date as ``YYYYMMDD``.
    :return: list of dicts with the keys ``Airline``, ``FlightNumber``,
        ``dTime``, ``dAirport``, ``aTime``, ``aAirport`` and ``LowestPrice``;
        empty when the result page shows no flights.

    The browser and the chromedriver service are always shut down, even when
    the page layout differs from what is expected and parsing raises.
    """
    print("东方航空爬虫开始运行")
    resultlist = []
    date_in = date_in[0:4] + '-' + date_in[4:6] + '-' + date_in[6:8]
    c_service = Service('./webdriver/chromedriver.exe')
    c_service.command_line_args()
    c_service.start()
    browser = None
    try:
        browser = webdriver.Chrome(
            executable_path='./webdriver/chromedriver.exe')
        browser.get('http://www.ceair.com/')
        ad = browser.find_element_by_id("appd_wrap_close")
        citya = browser.find_element_by_id("label_ID_0")  # departure city
        cityb = browser.find_element_by_id("label_ID_1")  # arrival city
        date = browser.find_element_by_id("depDt")
        datex = browser.find_element_by_id("deptDtRt")
        search = browser.find_element_by_id("btn_flight_search")  # search button
        # Wait for the page to finish loading before acting, otherwise the
        # site treats the session as a robot.
        time.sleep(0.5)
        ad.click()  # close the advert that covers the search form
        time.sleep(0.5)
        citya.clear()  # remove the default input
        time.sleep(0.5)
        citya.send_keys(city1)
        time.sleep(0.5)
        citya.send_keys(Keys.TAB)
        cityb.send_keys(city2)
        time.sleep(0.5)
        cityb.send_keys(Keys.TAB)
        time.sleep(0.5)
        # clear() does not work on this field: erase the 10-character default
        # date with backspaces, then type the new one.
        date.send_keys(*([Keys.BACKSPACE] * 10), date_in)
        time.sleep(0.5)
        date.send_keys(Keys.TAB)  # dismiss the pop-up menu
        time.sleep(0.5)
        datex.send_keys(Keys.TAB)  # dismiss the calendar menu
        time.sleep(0.5)
        search.click()
        time.sleep(3)
        try:
            # Switch to the result page that opens in a new window.
            browser.switch_to.window(browser.window_handles[1])
        except Exception:
            # The new window has not opened yet: click search once more.
            time.sleep(3)
            search.click()
            time.sleep(3)
            browser.switch_to.window(browser.window_handles[1])
        time.sleep(5)
        info = browser.find_elements_by_xpath(
            "//section[@class='summary']")  # flight summary blocks
        price = browser.find_elements_by_xpath(
            "//dd[@data-type='economy']")  # economy price cells
        # find_elements returns an empty list, never None, on no match.
        if not info or not price:
            print("----东航--nodata----")
            return resultlist

        flights = [_parse_flight_summary(str(i.text)) for i in info]
        prices = [_parse_economy_price(str(j.text)) for j in price]
        for info_, price_ in zip(flights, prices):
            resultlist.append({
                'Airline': info_[0],
                'FlightNumber': info_[1],
                'dTime': info_[2],
                'dAirport': info_[3],
                'aTime': info_[6],
                'aAirport': info_[7],
                'LowestPrice': price_,
            })
    finally:
        if browser is not None:
            browser.quit()
        c_service.stop()
    # Uncomment when running the whole project:
    # with open('./data/donghang.csv', 'w', encoding='utf-8') as csvfile:
    #     writer = csv.writer(csvfile, delimiter=',')
    #     for i in resultlist:
    #         writer.writerow([city1, city2, i.get("Airline"), i.get('FlightNumber'), i.get('dAirport'),
    #                          i.get('aAirport'), i.get('dTime'), i.get('aTime'), i.get('LowestPrice'), '东方航空'])
    print("东方航空爬虫运行结束")
    return resultlist
options.add_argument('--window-size=1920,1080') #这个命令禁止沙箱模式,否则肯能会报错遇到chrome异常。 options.add_argument('--no-sandbox') options.add_argument('--disable-gpu') options.add_argument('--disable-dev-shm-usage') #建议加上user-agent,因为liunx下有时候会被当成手机版的,所以你会发现代码会报错 num = str(float(random.randint(500, 600))) #此参数最好建议最好带上,不然有些网站会识别liunx系统进行拦截,这里把它伪装成windows下的 options.add_argument( "user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/{} (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/{}" .format(num, num)) sys_str = platform.system() service = Service('./chromedriver') ### linux service.command_line_args() service.start() if sys_str == "Linux": driver = webdriver.Chrome(executable_path='./chromedriver', options=options) else: driver = webdriver.Chrome( 'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe', options=options) today_md = datetime.datetime.now().strftime('%m%d') today_ymd = datetime.datetime.now().strftime('%Y%m%d') first_day = datetime.datetime.strptime('2021-02-01', '%Y-%m-%d')
def register(country_code, email, password):
    """Log in to ais.usvisa-info.com and collect the available visa dates.

    A cached session for ``email`` is reused when one exists; otherwise the
    sign-in form is filled in. The function then walks Continue -> Pay Visa
    Fee and reads the date table on the "MRV Fee Details" page.

    :param country_code: site locale segment used in the URLs.
    :param email: account e-mail, also the key into the global ``cache``.
    :param password: account password.
    :return: ``(result, session, schedule_id)`` where ``result`` is a list of
        ``[place, (year, month, day)]``; ``(None, None, None)`` on any error.

    The remote driver and the local chromedriver service are always released.
    The cache entry for ``email`` is refreshed on a non-empty result and
    dropped otherwise.
    """
    global cache
    # Pre-bind so the cleanup below is safe even if start-up itself fails.
    driver = None
    c_service = None
    try:
        c_service = Service("/usr/bin/chromedriver")
        c_service.command_line_args()
        c_service.start()
        # Pick one selenium node at random.
        with open("node.txt", "r") as node_file:
            selenium_list = [x.strip() for x in node_file]
        entry = random.choice(selenium_list)
        driver = webdriver.Remote(
            command_executor='http://%s:4444/wd/hub' % entry,
            desired_capabilities=chrome_options.to_capabilities())
        print("Choose Node:", entry)

        # Login
        if email in cache:
            session, schedule_id, group_id = cache[email]
            new_session = change_region(country_code, session, group_id)
            driver.get("https://ais.usvisa-info.com")
            driver.add_cookie({
                'name': '_yatri_session',
                'value': new_session,
                'path': '/',
                'domain': 'ais.usvisa-info.com',
                'secure': True
            })
            driver.get("https://ais.usvisa-info.com/%s/niv/groups/%s" %
                       (country_code, group_id))
        else:
            driver.get("https://ais.usvisa-info.com/%s/niv/users/sign_in" %
                       country_code)
            email_box = driver.find_element_by_id("user_email")
            email_box.clear()
            email_box.send_keys(email)
            password_box = driver.find_element_by_id("user_password")
            password_box.clear()
            password_box.send_keys(password)
            driver.execute_script(
                "document.getElementById('policy_confirmed').click()")
            signin_button = driver.find_element_by_name("commit")
            signin_button.click()

        def wait_loading(xpath, option="locate"):
            """Wait until ``xpath`` is present (or clickable); log on timeout."""
            if option == "clickable":
                condition = EC.element_to_be_clickable
            else:
                condition = EC.presence_of_element_located
            try:
                WebDriverWait(driver, wait_timeout).until(
                    condition((By.XPATH, xpath)))
            except TimeoutException:
                print("Timed out waiting for page to load")
                driver.execute_script("window.scrollTo(0, 1080)")
                driver.save_screenshot("test.png")

        # Continue
        continue_button_xpath = "//a[contains(text(), 'Continue')]"
        wait_loading(continue_button_xpath)
        current_url = driver.current_url
        group_id = current_url.split("/")[-1]
        continue_button = driver.find_element_by_xpath(continue_button_xpath)
        continue_button.click()

        # Choose action
        pay_button_xpath = "//a[contains(text(), 'Pay Visa Fee')]"
        wait_loading(pay_button_xpath)
        banner = driver.find_element_by_tag_name('h5')
        banner.click()
        wait_loading(pay_button_xpath, option="clickable")
        pay_button = driver.find_element_by_xpath(pay_button_xpath)
        pay_button.click()

        # Collect result
        title_xpath = "//h2[contains(text(), 'MRV Fee Details')]"
        wait_loading(title_xpath)
        time_table = driver.find_element_by_class_name('for-layout')
        result = []
        if time_table:
            for tr in time_table.find_elements_by_tag_name('tr'):
                tds = tr.find_elements_by_tag_name('td')
                if len(tds) != 2:
                    continue
                place = tds[0].text
                s = tds[1].text.split()
                year, month, day = 0, 0, 0
                # A date cell ends with "<day> <month>, <year>"; cells that
                # start with "No" carry no date and keep (0, 0, 0).
                if len(s) >= 3 and s[0] != "No":
                    day_str, month_str, year_str = (s[-3],
                                                    s[-2].replace(",", ""),
                                                    s[-1])
                    year, month, day = (int(year_str), g.MONTH[month_str],
                                        int(day_str))
                result.append([place, (year, month, day)])

        current_url = driver.current_url
        schedule_id = current_url.split("/")[-2]
        session = driver.get_cookie("_yatri_session")["value"]
        if result:
            cache[email] = [session, schedule_id, group_id]
        else:
            # pop() rather than del: a first-time email has no entry yet.
            cache.pop(email, None)
        return result, session, schedule_id
    except Exception as e:
        cache.pop(email, None)
        print(str(e))
        return None, None, None
    finally:
        if driver is not None:
            driver.quit()
        if c_service is not None:
            c_service.stop()
# -*- coding:UTF-8 -*-
"""List the product links in the navigation menu of linkedsee.com."""
import os
import sys

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(base_dir + '/bin')
driver_path = base_dir + '/bin/chromedriver'

# Initialize the chrome service.
chrome_service = Service(executable_path=driver_path)
chrome_service.command_line_args()
chrome_service.start()

driver = None
try:
    # Initialize the driver.
    driver = webdriver.Chrome(driver_path)
    driver.implicitly_wait(30)
    driver.maximize_window()

    driver.get("http://www.linkedsee.com/")
    products = driver.find_elements_by_xpath("//ul[@class='drop box-1']/li/a")
    print(products)
    print("found " + str(len(products)) + " products")
    for product in products:
        print(product.get_attribute('textContent'))
finally:
    # Always release the browser and the driver process; otherwise both
    # keep running after the script ends.
    if driver is not None:
        driver.quit()
    chrome_service.stop()
class Login(object):
    """Log in to 1688.com through Selenium, including the slider captcha."""

    def __init__(self):
        """Read credentials from ``sys.argv`` and start the browser."""
        if len(sys.argv) < 3:
            self.account = '1778973****'
            self.password = '******'
        else:
            self.account = sys.argv[1]
            self.password = sys.argv[2]
        self.url = "https://login.1688.com/member/signin.htm"
        # self.url = "https://detail.1688.com/offer/622477660714.html"
        self.browser = 'chrome'
        # Only the chrome branch starts a driver service.
        self.c_service = None
        if self.browser == 'chrome':
            # Raw string: the backslashes are path separators, not escapes.
            driver_path = r"D:\phpStudy\PHPTutorial\WWW\zwwl2016\python\chromedriver.exe"
            self.c_service = Service(driver_path)
            self.c_service.command_line_args()
            self.c_service.start()
            chrome_options = Options()
            chrome_options.add_argument(
                'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"'
            )
            # chrome_options.add_argument('--headless')  # no visible browser window
            chrome_options.add_argument('--no-sandbox')  # disable sandbox mode
            chrome_options.add_argument('--disable-gpu')  # disable GPU acceleration
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--start-maximized')  # start maximized
            chrome_options.add_argument("--incognito")  # start in incognito mode
            # Hide the "Chrome is being controlled by automated test
            # software" banner. The original author notes this no longer
            # works from Chrome 79 on.
            chrome_options.add_experimental_option('excludeSwitches',
                                                   ['enable-automation'])
            self.driver = webdriver.Chrome(executable_path=driver_path,
                                           options=chrome_options)
            # Use CDP to redefine navigator.webdriver on every new document,
            # so anti-bot checks that inspect the flag see ``undefined``.
            # NOTE(review): the original comment stated which value marks an
            # automated browser the other way round; confirm before relying
            # on it.
            self.driver.execute_cdp_cmd(
                "Page.addScriptToEvaluateOnNewDocument", {
                    "source":
                    """
                    Object.defineProperty(navigator, 'webdriver', {
                      get: () => undefined
                    })
                    """
                })
        elif self.browser == 'firefox':
            driver_path = "/usr/local/bin/geckodriver"
            firefox_options = FirefoxOptions()
            firefox_options.add_argument('--headless')
            firefox_options.add_argument('--no-sandbox')
            firefox_options.add_argument('--disable-gpu')
            # firefox_options.add_argument('--disable-dev-shm-usage')
            self.driver = webdriver.Firefox(executable_path=driver_path,
                                            firefox_options=firefox_options)
        self.driver.delete_all_cookies()
        # self.index = 'https://open.yuewen.com/pub/index/index.html'
        # self.agent = ''

    def after_quit(self):
        """Close the browser and stop the driver service if one was started."""
        try:
            self.driver.quit()
        finally:
            if self.c_service is not None:
                self.c_service.stop()
                self.c_service = None

    def save_cookie(self, cookies, token):
        """Store ``cookies`` and ``token`` for this account in MySQL.

        Updates the existing cookie row for the account, or inserts a new
        one (valid for 24 hours) when the account has a spider record but no
        cookie row yet.

        :return: affected-row count from the UPDATE/INSERT, or ``None`` when
            the account is unknown and nothing was written.
        """
        db = pymysql.Connect(host='127.0.0.1',
                             port=int(3306),
                             user='******',
                             passwd='password',
                             db='db_name',
                             charset='utf8mb4')
        # Stays None when neither lookup finds the account.
        result = None
        try:
            cursor = db.cursor()
            sql = ('SELECT cookie.id FROM pigeon_spider_account_cookie cookie '
                   'LEFT JOIN pigeon_spider_account_center spider '
                   'ON spider.id = cookie.pigeon_spider_account_center_id '
                   'LEFT JOIN pigeon_platform_account_center account '
                   'ON account.id = spider.platform_account_center_id '
                   'WHERE account.platform= %s AND account.account = %s')
            count = cursor.execute(sql, ('1688', self.account))
            if count > 0:
                row_id = cursor.fetchone()[0]
                update_at = time.strftime("%Y-%m-%d %H:%M:%S",
                                          time.localtime())
                sql = ('UPDATE pigeon_spider_account_cookie SET cookies = %s, token = %s,'
                       'update_time = %s WHERE id = %s')
                result = cursor.execute(sql,
                                        (cookies, token, update_at, row_id))
            else:
                sql = ('SELECT spider.id FROM pigeon_spider_account_center spider '
                       'LEFT JOIN pigeon_platform_account_center account '
                       'ON account.id = spider.platform_account_center_id '
                       'WHERE account.platform= %s AND account.account = %s')
                count = cursor.execute(sql, ('1688', self.account))
                if count > 0:
                    row_id = cursor.fetchone()[0]
                    create_at = update_at = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.localtime())
                    sql = ('INSERT INTO pigeon_spider_account_cookie (pigeon_spider_account_center_id, '
                           'cookies, token, expires_time, create_time, update_time) '
                           'VALUES (%s, %s, %s, %s, %s, %s)')
                    expires_time = time.strftime(
                        "%Y-%m-%d %H:%M:%S",
                        time.localtime(time.time() + 86400))
                    result = cursor.execute(sql,
                                            (row_id, cookies, token,
                                             expires_time, create_at,
                                             update_at))
            print(result)
            db.commit()
        finally:
            db.close()
        return result

    def get_track(self, distance):
        """Simulate a human-like drag as a list of per-step displacements.

        The motion accelerates until 7/8 of ``distance`` and decelerates
        afterwards; every element is the rounded displacement of one
        0.2-second step.

        :param distance: total distance to cover, in pixels.
        :return: list of integer displacements; empty when ``distance <= 0``.
        """
        # Current velocity.
        v = 0
        # Length of one step in seconds.
        t = 0.2
        # Displacement of every step.
        tracks = []
        # Distance covered so far.
        current = 0
        # Start decelerating once this point has been reached.
        mid = distance * 7 / 8
        while current < distance:
            if current < mid:
                # A smaller acceleration means smaller steps, and so a
                # longer, more detailed track.
                a = random.randint(2, 4)  # accelerate
            else:
                a = -random.randint(3, 5)  # decelerate
            v0 = v
            # Displacement within this step.
            s = v0 * t + 5 * a * (t**2)
            current += s
            tracks.append(round(s))
            # The final velocity of this step starts the next one.
            v = v0 + a * t
        return tracks

    def login_main(self):
        """Fill in the login form and drag the slider captcha.

        :return: ``False`` after closing the browser when anything raises;
            ``None`` otherwise (the browser is then left open).
        """
        try:
            driver = self.driver
            driver.get(self.url)
            driver.implicitly_wait(10)
            driver.switch_to.frame(driver.find_element_by_tag_name("iframe"))
            driver.find_element_by_xpath("//*[@id='fm-login-id']").send_keys(
                self.account)
            driver.find_element_by_xpath(
                "//*[@id='fm-login-password']").send_keys(self.password)
            # Random wait, to avoid anti-crawler detection.
            time.sleep(random.uniform(10, 15))
            driver.switch_to.frame(
                driver.find_element_by_id("baxia-dialog-content"))
            # Background bar of the slider captcha.
            slider_square_bg = driver.find_element_by_xpath(
                "//*[@id='nc_2__scale_text']")
            print(slider_square_bg.size)
            slider_square_bg_width = slider_square_bg.size['width']
            # The slider handle and its width.
            slider_square = driver.find_element_by_xpath("//*[@id='nc_2_n1z']")
            slider_square_width = slider_square.size['width']
            print(slider_square.size)
            print(slider_square.location)
            location_x = slider_square.location['x']
            # Track for the real drag distance. It is currently unused: the
            # drag below is performed in a single move.
            track_list = self.get_track(slider_square_bg_width -
                                        slider_square_width)
            # Only drag when the handle is actually displayed.
            if slider_square.is_displayed():
                # Press and hold the left mouse button on the handle.
                ActionChains(driver).click_and_hold(slider_square).perform()
                # Drag along the x axis.
                ActionChains(driver).move_by_offset(
                    xoffset=location_x +
                    (slider_square_bg_width - slider_square_width),
                    yoffset=0).perform()
                time.sleep(0.5)
                # Release the mouse button.
                ActionChains(driver).release(
                    on_element=slider_square).perform()
            # Back to the outermost document.
            driver.switch_to.default_content()
            # ``text`` is a property of WebElement, not a method.
            print(
                driver.find_element_by_xpath(
                    "//button[@class='fm-button fm-submit password-login']").
                text)
            # NOTE(review): the serialized cookies are not used further
            # here; presumably meant for save_cookie() — confirm.
            cookies_all = json.dumps(self.driver.get_cookies())
        except Exception as e:
            print('str(Exception):\t', str(Exception))
            print('str(e):\t\t', str(e))
            print('repr(e):\t', repr(e))
            self.after_quit()
            return False