def __init__(self):
    """Initialise crawler state, the HTTP session and a headless Chrome driver.

    Sets the Sogou WeChat search URL template, empty account fields, a
    ``requests`` session with connection retries, default request headers,
    an empty cookie jar, a dedicated headless Chrome WebDriver and the
    proxy settings returned by ``abuyun_proxy()``.
    """
    # Search URL template; the ``{}`` placeholder receives the query term.
    self.url = 'https://weixin.sogou.com/weixin?type=1&s_from=input&query={}&ie=utf8&_sug_=n&_sug_type_='
    self.account = ''
    self.name = ''
    self.search_name = ''
    self.tags = ''

    self.s = requests.Session()
    # Intended to close surplus connections.
    # NOTE(review): requests does not appear to read a ``keep_alive``
    # attribute on Session, so this is presumably a no-op - confirm.
    self.s.keep_alive = False
    # Increase the retry count. The previous assignment of DEFAULT_RETRIES on
    # ``self.s.adapters`` only set an attribute on the adapter mapping and was
    # never consulted; mounting adapters with ``max_retries`` takes effect.
    for scheme in ('http://', 'https://'):
        self.s.mount(scheme, requests.adapters.HTTPAdapter(max_retries=5))

    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/68.0.3440.106 Safari/537.36',
    }
    self.cookies = {}

    # Use a single driver owned by this instance.
    # NOTE(review): newer Selenium releases expect ``options=`` instead of
    # ``chrome_options=`` - confirm against the installed version.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    self.driver = webdriver.Chrome(chrome_options=chrome_options)
    self.driver.set_page_load_timeout(20)
    self.driver.set_script_timeout(20)
    # Explicit-wait helper bound to the driver with a 5 second timeout.
    self.wait = WebDriverWait(self.driver, 5)
    self.proxies = abuyun_proxy()
def __init__(self):
    """Initialise crawler state, the HTTP session and the shared driver.

    Sets the Sogou WeChat search URL template, empty account fields, a
    ``requests`` session with connection retries, default request headers,
    a set of hard-coded Sogou cookies, the module-level ``driver`` and the
    proxy settings returned by ``abuyun_proxy()``.
    """
    # Search URL template; the ``{}`` placeholder receives the query term.
    self.url = 'https://weixin.sogou.com/weixin?type=1&s_from=input&query={}&ie=utf8&_sug_=n&_sug_type_='
    self.account = ''
    self.name = ''
    self.search_name = ''
    self.tags = ''

    self.s = requests.Session()
    # Intended to close surplus connections.
    # NOTE(review): requests does not appear to read a ``keep_alive``
    # attribute on Session, so this is presumably a no-op - confirm.
    self.s.keep_alive = False
    # Increase the retry count. The previous assignment of DEFAULT_RETRIES on
    # ``self.s.adapters`` only set an attribute on the adapter mapping and was
    # never consulted; mounting adapters with ``max_retries`` takes effect.
    for scheme in ('http://', 'https://'):
        self.s.mount(scheme, requests.adapters.HTTPAdapter(max_retries=5))

    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/68.0.3440.106 Safari/537.36',
    }
    # Hard-coded cookie values sent with Sogou requests.
    # NOTE(review): these look like a captured browser session and may be
    # stale - confirm they are still needed.
    self.cookies = {
        'SUID': '4A72170E2613910A000000005BAC759D',
        'ABTEST': '3|1538028956|v1',
        'SUIR': '1538028956',
        'IPLOC': 'CN4401',
        'SNUID': '5960051C121665C656C04D9E13C88607',
        'PHPSESSID': '80l6acdo9sq3uj357t00heqpg1',
        'seccodeRight': 'success',
        'SUV': '00F347B50E17724A5BAC759DBEFB6849',
        'successCount': '1|Thu, 27 Sep 2018 06:20:59 GMT',
        'refresh': '1',
        'JSESSIONID': 'aaa73Xexaf2BmgEL80Bvw'
    }

    # ``driver`` is resolved from the enclosing module scope, not created here.
    self.driver = driver
    self.driver.set_page_load_timeout(15)
    self.driver.set_script_timeout(15)
    # Explicit-wait helper bound to the driver with a 5 second timeout.
    self.wait = WebDriverWait(self.driver, 5)
    self.proxies = abuyun_proxy()
def __init__(self):
    """Initialise crawler state, the HTTP session and a headless Chrome driver.

    Sets the Sogou WeChat search URL template, empty account fields, a
    ``requests`` session with connection retries, browser-like request
    headers targeting ``mp.weixin.qq.com``, an empty cookie jar, a dedicated
    headless Chrome WebDriver, the proxy settings returned by
    ``abuyun_proxy()`` and a ``timeout`` value of 23.
    """
    # Search URL template; the ``{}`` placeholder receives the query term.
    self.url = 'https://weixin.sogou.com/weixin?type=1&s_from=input&query={}&ie=utf8&_sug_=n&_sug_type_='
    self.account = ''
    self.name = ''
    self.search_name = ''
    self.tags = ''

    self.s = requests.Session()
    # Intended to close surplus connections.
    # NOTE(review): requests does not appear to read a ``keep_alive``
    # attribute on Session, so this is presumably a no-op - confirm.
    self.s.keep_alive = False
    # Increase the retry count. The previous assignment of DEFAULT_RETRIES on
    # ``self.s.adapters`` only set an attribute on the adapter mapping and was
    # never consulted; mounting adapters with ``max_retries`` takes effect.
    for scheme in ('http://', 'https://'):
        self.s.mount(scheme, requests.adapters.HTTPAdapter(max_retries=5))

    self.headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'mp.weixin.qq.com',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    }
    self.cookies = {}

    # Use a single driver owned by this instance.
    # NOTE(review): newer Selenium releases expect ``options=`` instead of
    # ``chrome_options=`` - confirm against the installed version.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    self.driver = webdriver.Chrome(chrome_options=chrome_options)
    self.driver.set_page_load_timeout(20)
    self.driver.set_script_timeout(20)
    # Explicit-wait helper bound to the driver with a 5 second timeout.
    self.wait = WebDriverWait(self.driver, 5)
    self.proxies = abuyun_proxy()
    # Presumably a request timeout in seconds; not used in this method.
    self.timeout = 23
def account_homepage(self):
    """Search Sogou for ``self.search_name`` and return the account homepage HTML.

    Up to three search attempts are made. When the first search result's
    account id equals ``self.search_name``, the account link is fetched
    through ``self.proxies`` (up to ten tries, requesting a new proxy
    whenever a captcha page or a request error is encountered). When the
    search page itself is not a Sogou result page, the browser-based
    ``self.crack_sougou`` routine is run and ``self.cookies`` is refreshed
    from the driver before the search is retried.

    Returns:
        The homepage HTML text on success, otherwise ``None`` (no match,
        account not found, empty result, no usable proxy, or too many
        failed attempts).
    """
    count = 0
    while True:
        count += 1
        if count > 3:
            # The placeholder was missing before, so the name was never logged.
            log.info('多次账号异常,跳过账号:{}'.format(self.name))
            return
        log.info('start account {}'.format(self.search_name))
        search_url = self.url.format(self.search_name)
        resp_search = self.s.get(search_url, headers=self.headers, cookies=self.cookies)
        e = pq(resp_search.text)
        page_title = e('title').text()
        log.info('当前搜狗标题:{}'.format(page_title))
        if '搜狗' not in page_title:
            # Not a Sogou page: start over with a fresh session.
            log.info('初始化session')
            self.s = requests.session()

        if self.search_name == e(".info").eq(0).text().replace('微信号:', ''):
            account_link = e(".tit").find('a').attr('href')
            self.name = e(".tit").eq(0).text()
            count_proxy = 0
            # This loop only exits through ``return``.
            while True:
                count_proxy += 1
                if count_proxy > 10:
                    log.error('未能获取有效代理:{}'.format(self.search_name))
                    return
                try:
                    log.info(self.proxies)
                    homepage = self.s.get(account_link, cookies=self.cookies, proxies=self.proxies)
                    if '<title>请输入验证码 </title>' in homepage.text:
                        # Captcha page: switch to a new proxy and try again.
                        log.info('需要输入验证码,重新获取代理')
                        self.proxies = abuyun_proxy()
                        continue
                    return homepage.text
                except Exception as _e:
                    # Best-effort: any request failure triggers a proxy change.
                    log.info('重新获取代理:{}'.format(_e))
                    self.proxies = abuyun_proxy()
        elif len(e(".tit").eq(0).text()) > 1:
            log.info("不能匹配正确的公众号: {}".format(self.search_name))
            return

        if '相关的官方认证订阅号' in resp_search.text:
            log.info("找不到该公众号: {}".format(self.search_name))
            return
        if '搜狗' in page_title:
            log.info('{} :搜索结果无文字'.format(self.search_name))
            return

        # Handle the captcha page via the browser driver.
        log.info(search_url)
        log.info('验证之前的cookie{}'.format(self.cookies))
        try_count = 0
        while True:
            try_count += 1
            self.crack_sougou(search_url)
            if '搜公众号' in self.driver.page_source:
                log.info('------开始更新cookies------')
                # Copy the browser cookies so later requests reuse them.
                self.cookies = {
                    item.get('name'): item.get('value')
                    for item in self.driver.get_cookies()
                }
                log.info('------cookies已更新------{}'.format(self.cookies))
                break
            elif try_count > 4:
                log.info("浏览器验证失败")
                break
        log.info("验证完毕")
        time.sleep(2)
def create(self, url, account_model, proxies):
    """Fetch an article page and populate this object's fields from it.

    Args:
        url: Article URL; stored on ``self.url``.
        account_model: Object providing ``account`` and ``name`` attributes,
            copied to ``self.account`` and ``self.author``.
        proxies: ``False`` to request directly, otherwise a proxies mapping
            passed to ``requests.get``.

    Raises:
        RuntimeError: If no response could be obtained through any proxy.

    The method returns ``None``. For pages without ``var ct=`` it sets only
    ``title`` (removed/infringing content) or the share fields
    (``is_share``, ``title``, ``content``, ``time``) and returns early.
    Otherwise it sets ``account``, ``title``, ``content``, ``author`` and
    ``image_url`` and delegates the timestamp to ``self.set_time``.
    """
    blocked_marker = '访问过于频繁,请用微信扫描二维码进行访问'
    self.url = url
    resp = None

    if proxies is False:
        resp = requests.get(self.url)
        for _ in range(30):
            if blocked_marker not in resp.text:
                break
            time.sleep(600)
            # Re-request after waiting; previously the stale response was
            # re-checked, so the wait could never clear the rate limit.
            resp = requests.get(self.url)
    else:
        count_loop = 0
        while True:
            count_loop += 1
            if count_loop >= 10:
                break
            try:
                resp = requests.get(self.url, proxies=proxies, timeout=21)
                proxy_count = 0
                while True:
                    proxy_count += 1
                    if proxy_count > 10:
                        # Only logged; the loop keeps trying new proxies.
                        log('文章页未获取有效代理')
                    if blocked_marker in resp.text:
                        proxies = abuyun_proxy()
                        resp = requests.get(self.url, proxies=proxies, timeout=21)
                        log('代理无效:访问过于频繁,请用微信扫描二维码进行访问')
                    else:
                        break
                break
            except requests.exceptions.ProxyError as err:
                log('代理请求ProxyError:{}'.format(err))
            except requests.exceptions.ConnectionError as err:
                log('代理请求ConnectionError:{}'.format(err))

    if resp is None:
        # Every proxy attempt failed before a response was received; fail
        # explicitly instead of hitting an unbound local below.
        raise RuntimeError('文章页未获取有效代理')

    e = pq(resp.text)

    # Pages without ``var ct=`` are handled as shared/removed content.
    if 'var ct=' not in resp.text:
        if '此内容因违规无法查看' in resp.text:
            self.title = '此内容因违规无法查看'
            return
        if '此内容被投诉且经审核涉嫌侵权' in resp.text:
            self.title = '此内容被投诉且经审核涉嫌侵权,无法查看。'
            return
        self.is_share = True
        self.title = e("title").text()
        self.content = e(".share_notice").text()
        time_find = re.search(r'createDate=new Date\("\d*', resp.text)
        self.time = time_find.group() if time_find else ''
        if '视频' == self.title:
            self.set_time(resp, content_type='video')
        return

    if '分享' in e('.share_notice').text():
        self.is_share = True
    self.set_time(resp, content_type='article')
    self.account = account_model.account
    self.title = e('.rich_media_title').text().replace(' ', '')
    # TODO: shared articles and videos.
    self.content = e("#js_content").text().replace('\n', '')
    self.author = account_model.name

    # Collect every <img> ``data-src`` value into a '|'-separated string.
    image_sources = (pq(img_div).attr('data-src') for img_div in e('img'))
    self.image_url = '|'.join(src for src in image_sources if src is not None)