def crack_sougou(self, url):
    """Try to get past the captcha that blocked a request to ``url``.

    Two hosts are recognised:

    * ``weixin.sogou.com``: the page is opened in ``self.driver``, the
      captcha image is cropped out of a screenshot, recognised through
      ``captch_upload_image`` and the answer is typed in and submitted.
    * ``mp.weixin.qq.com``: the captcha image is downloaded with
      ``self.s``, recognised the same way and the answer is posted back.

    Any other URL only produces the opening log line.

    Args:
        url: The URL whose request did not succeed.

    Returns:
        None. Progress is reported through ``log`` only.
    """
    log('------开始处理未成功的URL:{}'.format(url))

    if re.search(r'weixin\.sogou\.com', url):
        log('------开始处理搜狗验证码------')
        self.driver.get(url)
        time.sleep(2)

        if '搜公众号' in self.driver.page_source:
            # The normal search page came back. Reload (at most 30 times)
            # until the normal-page marker is no longer present.
            for _ in range(30):
                self.driver.get(url)
                log('浏览器页面正常')
                if '搜公众号' not in self.driver.page_source:
                    break

        try:
            img = self.wait.until(
                EC.presence_of_element_located((By.ID, 'seccodeImage')))
            log('------出现验证码页面------')

            # Crop the captcha element out of a full-page screenshot.
            location = img.location
            size = img.size
            left = location['x']
            top = location['y']
            right = location['x'] + size['width']
            bottom = location['y'] + size['height']
            screenshot = Image.open(
                BytesIO(self.driver.get_screenshot_as_png()))
            captcha = screenshot.crop((left, top, right, bottom))
            captcha_path = os.path.join(IMAGE_DIR, CAPTCHA_NAME)
            captcha.save(captcha_path)

            with open(captcha_path, "rb") as f:
                filebytes = f.read()
            captch_input = captch_upload_image(filebytes)
            log('------验证码:{}------'.format(captch_input))

            if captch_input:
                input_text = self.wait.until(
                    EC.presence_of_element_located((By.ID, 'seccodeInput')))
                input_text.clear()
                input_text.send_keys(captch_input)
                submit = self.wait.until(
                    EC.element_to_be_clickable((By.ID, 'submit')))
                submit.click()
                time.sleep(2)
                try:
                    if '搜公众号' not in self.driver.page_source:
                        log('验证失败')
                        return
                    log('------验证码正确------')
                except Exception:
                    # Was a bare ``except:``; narrowed so that
                    # KeyboardInterrupt / SystemExit still propagate.
                    log('--22222222----验证码输入错误------')
        except Exception as e:
            # Best effort: a missing captcha element is not fatal, but the
            # reason is now included in the log instead of being discarded.
            log('------未跳转到验证码页面,跳转到首页,忽略------ {}'.format(e))

    elif re.search(r'mp\.weixin\.qq\.com', url):
        log('------开始处理微信验证码------')
        cert = random.random()
        image_url = 'https://mp.weixin.qq.com/mp/verifycode?cert={}'.format(cert)
        respones = self.s.get(image_url, cookies=self.cookies)
        captch_input = captch_upload_image(respones.content)
        log('------验证码:{}------'.format(captch_input))
        data = {
            'cert': cert,
            'input': captch_input,
        }
        self.s.post(image_url, cookies=self.cookies, data=data)
        log('------cookies已更新------')
def crack_sougou(self, url):
    """Solve the Sogou captcha for ``url`` in a throw-away Chrome session.

    A new Chrome instance is started, ``url`` is opened, the captcha image
    is cropped from a screenshot and recognised through
    ``captch_upload_image``; the answer is then typed in and submitted.
    When the page afterwards shows the ``login-info`` element, the
    browser's cookies are stored on ``self.cookies`` and returned.

    The browser is always shut down before this method returns, so
    repeated calls do not leave Chrome processes behind.

    Args:
        url: The URL whose request did not succeed.

    Returns:
        dict mapping cookie name to value on success, otherwise ``None``.
    """
    print('------开始处理未成功的URL:{}'.format(url))
    print('------开始处理搜狗验证码------')

    chrome_options = webdriver.ChromeOptions()
    # NOTE: headless mode is not enabled; add '--headless' to
    # chrome_options if a visible window is not wanted.
    browser = webdriver.Chrome(chrome_options=chrome_options)
    try:
        browser.get(url)
        time.sleep(2)
        try:
            wait = WebDriverWait(browser, 10)
            img = wait.until(
                EC.presence_of_element_located((By.ID, 'seccodeImage')))
            print('------出现验证码页面------')

            # Crop the captcha element out of a full-page screenshot.
            location = img.location
            size = img.size
            left = location['x']
            top = location['y']
            right = location['x'] + size['width']
            bottom = location['y'] + size['height']
            screenshot = Image.open(BytesIO(browser.get_screenshot_as_png()))
            captcha = screenshot.crop((left, top, right, bottom))
            captcha_path = os.path.join(IMAGE_DIR, CAPTCHA_NAME)
            captcha.save(captcha_path)

            with open(captcha_path, "rb") as f:
                filebytes = f.read()
            captch_input = captch_upload_image(filebytes)
            print('------验证码:{}------'.format(captch_input))

            if captch_input:
                input_text = wait.until(
                    EC.presence_of_element_located((By.ID, 'seccodeInput')))
                input_text.clear()
                input_text.send_keys(captch_input)
                submit = wait.until(
                    EC.element_to_be_clickable((By.ID, 'submit')))
                submit.click()
                try:
                    print('------输入验证码------')
                    error_tips = wait.until(
                        EC.presence_of_element_located(
                            (By.ID, 'error-tips'))).text
                    if error_tips:
                        print('------验证码输入错误------')
                        return
                    wait.until(
                        EC.presence_of_element_located(
                            (By.CLASS_NAME, 'login-info')))
                    print('------验证码正确------')
                    new_cookie = {
                        item.get('name'): item.get('value')
                        for item in browser.get_cookies()
                    }
                    self.cookies = new_cookie
                    print('------cookies已更新------')
                    return new_cookie
                except Exception:
                    # Was a bare ``except:``; narrowed so that
                    # KeyboardInterrupt / SystemExit still propagate.
                    print('------验证码输入错误------')
        except Exception:
            print('------未跳转到验证码页面,跳转到首页,忽略------')
    finally:
        # Runs on every exit path, including the early returns above.
        browser.quit()
def account_homepage(self):
    """Search for the account named ``self.name`` and fetch its homepage.

    The search URL is built from ``self.url`` and requested through
    ``self.s``. If the first ``.tit`` entry of the result page equals
    ``self.name``, the linked homepage is downloaded; when that page asks
    for a captcha, one solve attempt is made and the page is requested
    again.

    Returns:
        ``(homepage_html, account_id)`` on success, where the id comes from
        the homepage and falls back to the one found on the search page.
        ``None`` when the account is not found, the first result does not
        match, or the search page had to go through captcha handling.
    """
    # Search for the official account and enter its homepage.
    search_url = self.url.format(self.name)
    search_resp = self.s.get(
        search_url, headers=self.headers, cookies=self.cookies)
    if 'class="b404-box" id="noresult_part1_container"' in search_resp.text:
        log("找不到该公众号: {}".format(self.name))
        return

    doc = pq(search_resp.text)
    first_title = doc(".tit").eq(0).text()
    if first_title != self.name:
        if len(first_title) > 1:
            log("不能匹配正确的公众号: {}".format(self.name))
        else:
            # No usable title at all: handle the captcha.
            self.crack_sougou(search_url)
            print("验证完毕")
            # Open question from the original author: should accounts
            # skipped here be crawled again? (roughly 4 times)
        return
    account_link = doc(".tit").find('a').attr('href')

    found = re.search(r'微信号:\w*', doc.text())
    account_search = ''
    if found:
        account_search = found.group().replace('微信号:', '')

    homepage = self.s.get(account_link, cookies=self.cookies)
    if '<title>请输入验证码 </title>' in homepage.text:
        print("出现验码")
        from verification_code import captch_upload_image
        print('------开始处理微信验证码------')
        cert = random.random()
        image_url = 'https://mp.weixin.qq.com/mp/verifycode?cert={}'.format(cert)
        image_resp = self.s.get(image_url)
        captch_input = captch_upload_image(image_resp.content)
        print('------验证码:{}------'.format(captch_input))
        payload = {'cert': cert, 'input': captch_input}
        verify_resp = self.s.post(
            image_url, data=payload, cookies=self.cookies)
        cookies = requests.utils.dict_from_cookiejar(verify_resp.cookies)
        print('adffa', cookies)
        homepage = self.s.get(account_link, cookies=self.cookies)
        print('破解验证码之后')

    profile_text = pq(homepage.text)('.profile_account').text()
    account = profile_text.replace('微信号: ', '')
    # Both the search page and the homepage carry the account id; prefer
    # the homepage value and fall back to the search-page one.
    return homepage.text, account or account_search
def crack_sougou(self, url):
    """Try to get past the captcha that blocked a request to ``url``.

    Two hosts are recognised:

    * ``weixin.sogou.com``: the page is opened in ``self.driver``. If the
      normal search page is shown, the method returns immediately.
      Otherwise the captcha image is cropped out of a screenshot and sent
      to the service at ``GETCAPTCHA_URL``; if that request raises,
      ``captch_upload_image`` is used as a fallback. The answer is then
      typed in and submitted.
    * ``mp.weixin.qq.com``: the captcha image is downloaded with
      ``self.s``, recognised with ``captch_upload_image`` and posted back.

    Any other URL only produces the opening log line.

    Args:
        url: The URL whose request did not succeed.

    Returns:
        None. Progress is reported through ``log.info`` only.
    """
    log.info('------开始处理未成功的URL:{}'.format(url))

    if re.search(r'weixin\.sogou\.com', url):
        log.info('------开始处理搜狗验证码------')
        self.driver.get(url)
        time.sleep(2)
        if '搜公众号' in self.driver.page_source:
            log.info('浏览器页面正常' + '直接返回')
            return

        try:
            img = self.wait.until(
                EC.presence_of_element_located((By.ID, 'seccodeImage')))
            log.info('------出现验证码页面------')

            # Crop the captcha element out of a full-page screenshot.
            location = img.location
            size = img.size
            left = location['x']
            top = location['y']
            right = location['x'] + size['width']
            bottom = location['y'] + size['height']
            screenshot = Image.open(
                BytesIO(self.driver.get_screenshot_as_png()))
            captcha = screenshot.crop((left, top, right, bottom))
            captcha_path = get_captcha_path()
            captcha.save(captcha_path)
            captcha_name = os.path.basename(captcha_path)

            captch_input = ''
            try:
                # The file handle is closed as soon as the upload finishes;
                # previously it was opened inline and never closed.
                with open(captcha_path, 'rb') as captcha_file:
                    files = {
                        'img': (captcha_name, captcha_file, 'image/png', {})
                    }
                    res = requests.post(url=GETCAPTCHA_URL, files=files)
                res = res.json()
                if res.get('Success'):
                    captch_input = res.get('Captcha')
            except Exception as e:
                # The message previously had no placeholder, so the
                # exception text never reached the log.
                log.info('搜狗验证码获取失败 {}'.format(e))
                # Fall back to the second recognition service.
                with open(captcha_path, "rb") as f:
                    filebytes = f.read()
                captch_input = captch_upload_image(filebytes)
            log.info('------验证码:{}------'.format(captch_input))

            if captch_input:
                input_text = self.wait.until(
                    EC.presence_of_element_located((By.ID, 'seccodeInput')))
                input_text.clear()
                input_text.send_keys(captch_input)
                submit = self.wait.until(
                    EC.element_to_be_clickable((By.ID, 'submit')))
                submit.click()
                time.sleep(2)
                try:
                    if '搜公众号' not in self.driver.page_source:
                        log.info('验证失败')
                        return
                    log.info('------验证码正确------')
                except Exception as e:
                    log.info(
                        '--22222222----验证码输入错误------ {}'.format(e))
        except Exception as e:
            log.info(
                '------未跳转到验证码页面,跳转到首页,忽略------ {}'.format(e))

    elif re.search(r'mp\.weixin\.qq\.com', url):
        log.info('------开始处理微信验证码------')
        cert = random.random()
        image_url = 'https://mp.weixin.qq.com/mp/verifycode?cert={}'.format(
            cert)
        respones = self.s.get(image_url, cookies=self.cookies)
        captch_input = captch_upload_image(respones.content)
        log.info('------验证码:{}------'.format(captch_input))
        data = {'cert': cert, 'input': captch_input}
        r = self.s.post(image_url, cookies=self.cookies, data=data)
        log.info('------cookies已更新------{}'.format(r.status_code))
def account_homepage(self):
    """Search for the account named ``self.name`` and fetch its homepage.

    The search is repeated in a loop. When the result page has no usable
    first ``.tit`` entry, ``self.crack_sougou`` is called up to seven
    times; once ``self.driver`` shows the normal search page its cookies
    are copied to ``self.cookies``. The search is then retried after a
    short pause. There is no upper bound on the number of search retries.

    Returns:
        ``(homepage_html, account_id)`` on success, where the id comes from
        the homepage and falls back to the one found on the search page.
        ``None`` when the account is not found or the first result does not
        contain ``self.name``.
    """
    # Search for the official account and enter its homepage.
    while True:
        search_url = self.url.format(self.name)
        search_resp = self.s.get(
            search_url, headers=self.headers, cookies=self.cookies)
        if '相关的官方认证订阅号' in search_resp.text:
            log("找不到该公众号: {}".format(self.name))
            return

        doc = pq(search_resp.text)
        first_title = doc(".tit").eq(0).text()
        if self.name not in first_title:
            if len(first_title) > 1:
                log("不能匹配正确的公众号: {}".format(self.name))
                return

            log(search_url)
            log('验证之前的cookie', self.cookies)
            # Attempts 1..7: stop at the first one after which the browser
            # shows the normal search page.
            for _attempt in range(1, 8):
                self.crack_sougou(search_url)
                if '搜公众号' in self.driver.page_source:
                    log('------cookies更新------')
                    self.cookies = {
                        item.get('name'): item.get('value')
                        for item in self.driver.get_cookies()
                    }
                    log('------cookies已更新------', self.cookies)
                    break
            else:
                log("浏览器验证失败")
            log("验证完毕")
            time.sleep(2)
            # Open question from the original author: should accounts
            # skipped here be crawled again? (roughly 4 times)
            continue

        account_link = doc(".tit").find('a').attr('href')

        found = re.search(r'微信号:\w*', doc.text())
        account_search = ''
        if found:
            account_search = found.group().replace('微信号:', '')

        homepage = self.s.get(account_link, cookies=self.cookies)
        if '<title>请输入验证码 </title>' in homepage.text:
            print("出现验码")
            print('------开始处理微信验证码------')
            cert = random.random()
            image_url = 'https://mp.weixin.qq.com/mp/verifycode?cert={}'.format(cert)
            image_resp = self.s.get(image_url)
            captch_input = captch_upload_image(image_resp.content)
            print('------验证码:{}------'.format(captch_input))
            payload = {'cert': cert, 'input': captch_input}
            verify_resp = self.s.post(
                image_url, data=payload, cookies=self.cookies)
            cookies = requests.utils.dict_from_cookiejar(verify_resp.cookies)
            print('adffa', cookies)
            homepage = self.s.get(account_link, cookies=self.cookies)
            print('破解验证码之后')

        profile_text = pq(homepage.text)('.profile_account').text()
        account = profile_text.replace('微信号: ', '')
        # Both the search page and the homepage carry the account id;
        # prefer the homepage value and fall back to the search-page one.
        return homepage.text, account or account_search
bottom = location['y'] + size['height'] screenshot = browser.get_screenshot_as_png() screenshot = Image.open(BytesIO(screenshot)) captcha = screenshot.crop((left, top, right, bottom)) BASE_DIR = os.path.abspath(os.path.dirname(__file__)) IMAGE_DIR = os.path.join(BASE_DIR, 'images') CAPTCHA_NAME = 'captcha.png' captcha_path = os.path.join(IMAGE_DIR, CAPTCHA_NAME) captcha.save(captcha_path) with open(captcha_path, "rb") as f: filebytes = f.read() from verification_code import captch_upload_image captch_input = captch_upload_image(filebytes) print('------验证码:{}------'.format(captch_input)) if captch_input: input_text = wait.until( EC.presence_of_element_located((By.ID, 'seccodeInput'))) input_text.clear() input_text.send_keys(captch_input) submit = wait.until(EC.element_to_be_clickable((By.ID, 'submit'))) submit.click() try: print('------输入验证码------') error_tips = wait.until( EC.presence_of_element_located((By.ID, 'error-tips'))).text if len(error_tips): print('------验证码输入错误------')