예제 #1
0
파일: collect.py 프로젝트: whoiskx/com_code
    def crack_sougou(self, url):
        """Try to get past the captcha that blocked a request to ``url``.

        Two kinds of URL are handled; anything else is only logged:

        * ``weixin.sogou.com``: the page is opened in ``self.driver``. If the
          captcha image (element id ``seccodeImage``) appears, it is cropped
          out of a screenshot, solved by ``captch_upload_image`` and the
          answer is submitted through the page's form.
        * ``mp.weixin.qq.com``: a captcha image is downloaded with ``self.s``,
          solved by ``captch_upload_image`` and the answer is posted back.

        :param url: the URL whose request was not successful.
        :return: always ``None``; callers inspect ``self.driver`` /
            ``self.cookies`` afterwards.
        """
        log('------开始处理未成功的URL:{}'.format(url))
        # Raw strings: '\.' in a normal string literal is an invalid escape.
        if re.search(r'weixin\.sogou\.com', url):
            log('------开始处理搜狗验证码------')
            self.driver.get(url)
            time.sleep(2)
            if '搜公众号' in self.driver.page_source:
                # The regular search page is showing. Reload (at most 30
                # times) until its marker text disappears.
                # NOTE(review): presumably this is meant to provoke the
                # captcha page -- confirm with the author.
                for _ in range(30):
                    self.driver.get(url)
                    log('浏览器页面正常')
                    if '搜公众号' not in self.driver.page_source:
                        break
            try:
                img = self.wait.until(EC.presence_of_element_located((By.ID, 'seccodeImage')))
                log('------出现验证码页面------')
                # Bounding box of the captcha element inside the screenshot.
                location = img.location
                size = img.size
                left = location['x']
                top = location['y']
                right = left + size['width']
                bottom = top + size['height']
                screenshot = Image.open(BytesIO(self.driver.get_screenshot_as_png()))
                captcha = screenshot.crop((left, top, right, bottom))
                # The cropped image is written to disk and read back as bytes
                # for the solver.
                captcha_path = os.path.join(IMAGE_DIR, CAPTCHA_NAME)
                captcha.save(captcha_path)
                with open(captcha_path, "rb") as f:
                    filebytes = f.read()
                captch_input = captch_upload_image(filebytes)
                log('------验证码:{}------'.format(captch_input))
                if captch_input:
                    input_text = self.wait.until(EC.presence_of_element_located((By.ID, 'seccodeInput')))
                    input_text.clear()
                    input_text.send_keys(captch_input)
                    submit = self.wait.until(EC.element_to_be_clickable((By.ID, 'submit')))
                    submit.click()
                    time.sleep(2)
                    try:
                        # The search page marker text is used as the sign
                        # that the captcha was accepted.
                        if '搜公众号' not in self.driver.page_source:
                            log('验证失败')
                            return
                        log('------验证码正确------')
                    except Exception as e:
                        # Was a bare ``except:``, which also swallowed
                        # KeyboardInterrupt/SystemExit and hid the cause.
                        log('--22222222----验证码输入错误------ {}'.format(e))
            except Exception as e:
                # Best effort: typically the wait for the captcha image timed
                # out; keep going but record why.
                log('------未跳转到验证码页面,跳转到首页,忽略------ {}'.format(e))

        elif re.search(r'mp\.weixin\.qq\.com', url):
            log('------开始处理微信验证码------')
            # The same random ``cert`` value is used to fetch the image and to
            # submit the answer.
            cert = random.random()
            image_url = 'https://mp.weixin.qq.com/mp/verifycode?cert={}'.format(cert)
            response = self.s.get(image_url, cookies=self.cookies)
            captch_input = captch_upload_image(response.content)
            log('------验证码:{}------'.format(captch_input))
            data = {
                'cert': cert,
                'input': captch_input,
            }
            self.s.post(image_url, cookies=self.cookies, data=data)
            log('------cookies已更新------')
예제 #2
0
 def crack_sougou(self, url):
     """Solve the Sogou captcha shown for ``url`` in a throw-away Chrome.

     A new Chrome instance is started, ``url`` is opened and, if the captcha
     image (element id ``seccodeImage``) appears, it is cropped out of a
     screenshot, solved by ``captch_upload_image`` and submitted. On success
     the browser cookies are stored in ``self.cookies``.

     :param url: the URL whose request was not successful.
     :return: a ``{name: value}`` dict of the browser cookies when the
         captcha was accepted, otherwise ``None``.
     """
     print('------开始处理未成功的URL:{}'.format(url))
     print('------开始处理搜狗验证码------')
     chrome_options = webdriver.ChromeOptions()
     # Headless mode is disabled; enable it with
     # chrome_options.add_argument('--headless')
     browser = webdriver.Chrome(chrome_options=chrome_options)
     try:
         browser.get(url)
         time.sleep(2)
         try:
             wait = WebDriverWait(browser, 10)
             img = wait.until(EC.presence_of_element_located((By.ID, 'seccodeImage')))
             print('------出现验证码页面------')
             # Bounding box of the captcha element inside the screenshot.
             location = img.location
             size = img.size
             left = location['x']
             top = location['y']
             right = left + size['width']
             bottom = top + size['height']
             screenshot = Image.open(BytesIO(browser.get_screenshot_as_png()))
             captcha = screenshot.crop((left, top, right, bottom))
             # The cropped image is written to disk and read back as bytes
             # for the solver.
             captcha_path = os.path.join(IMAGE_DIR, CAPTCHA_NAME)
             captcha.save(captcha_path)
             with open(captcha_path, "rb") as f:
                 filebytes = f.read()
             captch_input = captch_upload_image(filebytes)
             print('------验证码:{}------'.format(captch_input))
             if captch_input:
                 input_text = wait.until(EC.presence_of_element_located((By.ID, 'seccodeInput')))
                 input_text.clear()
                 input_text.send_keys(captch_input)
                 submit = wait.until(EC.element_to_be_clickable((By.ID, 'submit')))
                 submit.click()
                 try:
                     print('------输入验证码------')
                     error_tips = wait.until(EC.presence_of_element_located((By.ID, 'error-tips'))).text
                     if error_tips:
                         print('------验证码输入错误------')
                         return
                     wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'login-info')))
                     print('------验证码正确------')
                     # Flatten Selenium's list of cookie dicts to name -> value.
                     new_cookie = {
                         item.get('name'): item.get('value')
                         for item in browser.get_cookies()
                     }
                     self.cookies = new_cookie
                     print('------cookies已更新------')
                     return new_cookie
                 except Exception:
                     # Was a bare ``except:``; still best effort, but no
                     # longer traps KeyboardInterrupt/SystemExit.
                     print('------验证码输入错误------')
         except Exception:
             print('------未跳转到验证码页面,跳转到首页,忽略------')
     finally:
         # The browser is local to this call; without quit() every call
         # left a Chrome process running.
         browser.quit()
예제 #3
0
    def account_homepage(self):
        """Search for ``self.name`` and fetch the matching account homepage.

        Returns ``None`` when the search page reports no result, when the
        first hit has a different name, or after ``crack_sougou`` was invoked
        because the page had no usable first hit. Otherwise returns a tuple
        ``(homepage_html, account)`` where ``account`` is read from the
        homepage and falls back to the value found on the search page.
        """
        search_url = self.url.format(self.name)
        search_resp = self.s.get(search_url, headers=self.headers, cookies=self.cookies)
        search_html = search_resp.text

        if 'class="b404-box" id="noresult_part1_container"' in search_html:
            log("找不到该公众号: {}".format(self.name))
            return

        doc = pq(search_html)
        first_title = doc(".tit").eq(0).text()
        if first_title != self.name:
            if len(first_title) > 1:
                log("不能匹配正确的公众号: {}".format(self.name))
                return
            # No usable first hit: treat the page as a captcha page.
            self.crack_sougou(search_url)
            print("验证完毕")
            # Open question from the author: should the skipped account be
            # crawled again? (roughly 4 times)
            return

        account_link = doc(".tit").find('a').attr('href')
        account_match = re.search(r'微信号:\w*', doc.text())
        if account_match:
            account_search = account_match.group().replace('微信号:', '')
        else:
            account_search = ''

        homepage = self.s.get(account_link, cookies=self.cookies)
        if '<title>请输入验证码 </title>' in homepage.text:
            print("出现验码")
            from verification_code import captch_upload_image
            print('------开始处理微信验证码------')
            # The same random ``cert`` value is used to fetch the captcha
            # image and to submit the answer.
            cert = random.random()
            image_url = 'https://mp.weixin.qq.com/mp/verifycode?cert={}'.format(cert)
            image_resp = self.s.get(image_url)
            captch_input = captch_upload_image(image_resp.content)
            print('------验证码:{}------'.format(captch_input))
            payload = {'cert': cert, 'input': captch_input}
            verify_resp = self.s.post(image_url, data=payload, cookies=self.cookies)
            # Only printed; ``self.cookies`` is not changed here.
            print('adffa', requests.utils.dict_from_cookiejar(verify_resp.cookies))
            homepage = self.s.get(account_link, cookies=self.cookies)
            print('破解验证码之后')

        profile_text = pq(homepage.text)('.profile_account').text()
        account = profile_text.replace('微信号: ', '')
        # The account id can appear on both the search page and the
        # homepage; prefer the homepage value and fall back to the other.
        return homepage.text, account or account_search
예제 #4
0
    def crack_sougou(self, url):
        """Try to get past the captcha that blocked a request to ``url``.

        Two kinds of URL are handled; anything else is only logged:

        * ``weixin.sogou.com``: the page is opened in ``self.driver``. If the
          regular search page is already showing, nothing is done. Otherwise
          the captcha image (element id ``seccodeImage``) is cropped out of a
          screenshot and solved, first by posting it to ``GETCAPTCHA_URL``
          and, if that raises, by ``captch_upload_image``. The answer is
          then submitted through the page's form.
        * ``mp.weixin.qq.com``: a captcha image is downloaded with ``self.s``,
          solved by ``captch_upload_image`` and the answer is posted back.

        :param url: the URL whose request was not successful.
        :return: always ``None``.
        """
        log.info('------开始处理未成功的URL:{}'.format(url))
        # Raw strings: '\.' in a normal string literal is an invalid escape.
        if re.search(r'weixin\.sogou\.com', url):
            log.info('------开始处理搜狗验证码------')
            self.driver.get(url)
            time.sleep(2)
            if '搜公众号' in self.driver.page_source:
                log.info('浏览器页面正常' + '直接返回')
                return
            try:
                img = self.wait.until(
                    EC.presence_of_element_located((By.ID, 'seccodeImage')))
                log.info('------出现验证码页面------')
                # Bounding box of the captcha element inside the screenshot.
                location = img.location
                size = img.size
                left = location['x']
                top = location['y']
                right = left + size['width']
                bottom = top + size['height']
                screenshot = Image.open(
                    BytesIO(self.driver.get_screenshot_as_png()))
                captcha = screenshot.crop((left, top, right, bottom))
                captcha_path = get_captcha_path()
                captcha.save(captcha_path)
                captcha_name = os.path.basename(captcha_path)
                captch_input = ''
                try:
                    # Keep the upload inside ``with`` so the file handle is
                    # closed even when the request fails.
                    with open(captcha_path, 'rb') as captcha_file:
                        files = {
                            'img': (captcha_name, captcha_file,
                                    'image/png', {})
                        }
                        res = requests.post(url=GETCAPTCHA_URL, files=files)
                    res = res.json()
                    if res.get('Success'):
                        captch_input = res.get('Captcha')
                except Exception as e:
                    # The original message had no placeholder, so the
                    # exception never reached the log.
                    log.info('搜狗验证码获取失败 {}'.format(e))
                    # Fall back to the other solver.
                    with open(captcha_path, "rb") as f:
                        filebytes = f.read()
                    captch_input = captch_upload_image(filebytes)
                log.info('------验证码:{}------'.format(captch_input))
                if captch_input:
                    input_text = self.wait.until(
                        EC.presence_of_element_located(
                            (By.ID, 'seccodeInput')))
                    input_text.clear()
                    input_text.send_keys(captch_input)
                    submit = self.wait.until(
                        EC.element_to_be_clickable((By.ID, 'submit')))
                    submit.click()
                    time.sleep(2)
                    try:
                        # The search page marker text is used as the sign
                        # that the captcha was accepted.
                        if '搜公众号' not in self.driver.page_source:
                            log.info('验证失败')
                            return
                        log.info('------验证码正确------')
                    except Exception as e:
                        log.info('--22222222----验证码输入错误------ {}'.format(e))
            except Exception as e:
                # Best effort: keep going, but record why this attempt ended.
                log.info('------未跳转到验证码页面,跳转到首页,忽略------ {}'.format(e))

        elif re.search(r'mp\.weixin\.qq\.com', url):
            log.info('------开始处理微信验证码------')
            # The same random ``cert`` value is used to fetch the image and to
            # submit the answer.
            cert = random.random()
            image_url = 'https://mp.weixin.qq.com/mp/verifycode?cert={}'.format(
                cert)
            response = self.s.get(image_url, cookies=self.cookies)
            captch_input = captch_upload_image(response.content)
            log.info('------验证码:{}------'.format(captch_input))
            data = {'cert': cert, 'input': captch_input}
            r = self.s.post(image_url, cookies=self.cookies, data=data)
            log.info('------cookies已更新------{}'.format(r.status_code))
예제 #5
0
파일: collect.py 프로젝트: whoiskx/com_code
    def account_homepage(self):
        """Search for ``self.name`` and fetch the matching account homepage.

        The search is repeated in a loop: whenever the result page has no
        usable first hit, ``crack_sougou`` is run (up to 7 times) and the
        search is retried.

        Returns ``None`` when the search page reports no official account or
        when the first hit does not contain ``self.name``. Otherwise returns
        a tuple ``(homepage_html, account)`` where ``account`` is read from
        the homepage and falls back to the value found on the search page.
        """
        # NOTE(review): the outer loop has no upper bound; if verification
        # keeps failing the search is retried forever -- confirm intended.
        while True:
            search_url = self.url.format(self.name)
            search_resp = self.s.get(search_url, headers=self.headers, cookies=self.cookies)
            search_html = search_resp.text

            if '相关的官方认证订阅号' in search_html:
                log("找不到该公众号: {}".format(self.name))
                return

            doc = pq(search_html)
            first_title = doc(".tit").eq(0).text()
            if self.name not in first_title:
                if len(first_title) > 1:
                    log("不能匹配正确的公众号: {}".format(self.name))
                    return

                # No usable first hit: treat the page as a captcha page.
                log(search_url)
                log('验证之前的cookie', self.cookies)
                for _attempt in range(7):
                    self.crack_sougou(search_url)
                    if '搜公众号' in self.driver.page_source:
                        log('------cookies更新------')
                        # Flatten Selenium's cookie list to name -> value.
                        self.cookies = {
                            item.get('name'): item.get('value')
                            for item in self.driver.get_cookies()
                        }
                        log('------cookies已更新------', self.cookies)
                        break
                else:
                    # All 7 attempts ended without the search page showing.
                    log("浏览器验证失败")

                log("验证完毕")
                time.sleep(2)
                # Open question from the author: should the skipped account
                # be crawled again? (roughly 4 times)
                continue

            account_link = doc(".tit").find('a').attr('href')
            account_match = re.search(r'微信号:\w*', doc.text())
            if account_match:
                account_search = account_match.group().replace('微信号:', '')
            else:
                account_search = ''

            homepage = self.s.get(account_link, cookies=self.cookies)
            if '<title>请输入验证码 </title>' in homepage.text:
                print("出现验码")
                print('------开始处理微信验证码------')
                # The same random ``cert`` value is used to fetch the captcha
                # image and to submit the answer.
                cert = random.random()
                image_url = 'https://mp.weixin.qq.com/mp/verifycode?cert={}'.format(cert)
                image_resp = self.s.get(image_url)
                captch_input = captch_upload_image(image_resp.content)
                print('------验证码:{}------'.format(captch_input))
                payload = {'cert': cert, 'input': captch_input}
                verify_resp = self.s.post(image_url, data=payload, cookies=self.cookies)
                # Only printed; ``self.cookies`` is not changed here.
                print('adffa', requests.utils.dict_from_cookiejar(verify_resp.cookies))
                homepage = self.s.get(account_link, cookies=self.cookies)
                print('破解验证码之后')

            profile_text = pq(homepage.text)('.profile_account').text()
            account = profile_text.replace('微信号: ', '')
            # The account id can appear on both the search page and the
            # homepage; prefer the homepage value and fall back to the other.
            return homepage.text, account or account_search
예제 #6
0
파일: tests.py 프로젝트: whoiskx/com_code
    bottom = location['y'] + size['height']
    screenshot = browser.get_screenshot_as_png()
    screenshot = Image.open(BytesIO(screenshot))
    captcha = screenshot.crop((left, top, right, bottom))

    BASE_DIR = os.path.abspath(os.path.dirname(__file__))
    IMAGE_DIR = os.path.join(BASE_DIR, 'images')
    CAPTCHA_NAME = 'captcha.png'

    captcha_path = os.path.join(IMAGE_DIR, CAPTCHA_NAME)
    captcha.save(captcha_path)
    with open(captcha_path, "rb") as f:
        filebytes = f.read()

    from verification_code import captch_upload_image
    captch_input = captch_upload_image(filebytes)
    print('------验证码:{}------'.format(captch_input))
    if captch_input:
        input_text = wait.until(
            EC.presence_of_element_located((By.ID, 'seccodeInput')))
        input_text.clear()
        input_text.send_keys(captch_input)
        submit = wait.until(EC.element_to_be_clickable((By.ID, 'submit')))
        submit.click()
        try:
            print('------输入验证码------')
            error_tips = wait.until(
                EC.presence_of_element_located((By.ID, 'error-tips'))).text
            if len(error_tips):
                print('------验证码输入错误------')