def _text(self, url):
    """
    Fetch the content of ``url``, going through a crawled proxy when enabled.

    When ``config.REQUEST_PROXY_RETRY`` is truthy and ``self.__proxy()``
    yields a proxy, failed requests are retried through that proxy. Once a
    proxy has been retried more than ``config.REQUEST_PROXY_RETRY`` times it
    is deleted and a new one is fetched. If no replacement proxy is
    available, a single direct request is made and its result is returned,
    instead of looping on a missing proxy.

    :param url: address of the page to request
    :return: the value returned by ``self.__request`` for the last attempt
             (``None`` when the final direct attempt failed)
    """
    # Get one proxy
    proxy = self.__proxy()

    # Proxy usage is disabled or no proxy is available: request directly
    if not (config.REQUEST_PROXY_RETRY and proxy):
        return self.__request(None, url)

    text = self.__request(proxy, url)
    # Number of retries made with the current proxy
    retried = 0
    # Total number of retries across all proxies
    try_total = 0
    while text is None:
        retried += 1
        try_total += 1
        # Retry limit reached: delete the current proxy and fetch another one
        if retried > config.REQUEST_PROXY_RETRY:
            self.__proxy_delete(proxy['ip'], proxy['port'])
            proxy = self.__proxy()
            # Reset the per-proxy retry counter
            retried = 0
            if not proxy:
                # No replacement proxy: indexing it later would raise, so
                # make one last direct attempt and stop retrying.
                return self.__request(None, url)
        text = self.__request(proxy, url)
        config.console_log('使用代理请求页面 %s ,总尝试第 %s 次' % (url, try_total), 'white')
    return text
def __request(self, proxy, url):
    """
    Request ``url`` with GET, optionally through a proxy.

    :param proxy: dict with ``ip`` and ``port`` keys; any falsy or non-dict
                  value means the request is sent without a proxy
    :param url: address to request
    :return: the response text when the status code is 200, otherwise
             ``None`` (also ``None`` when the request itself fails)
    """
    # Request settings
    kwargs = {'timeout': 10, 'headers': config.get_http_header()}
    # Route both schemes through the proxy when one is given
    if isinstance(proxy, dict) and proxy:
        proxy_url = 'http://%s:%s' % (proxy['ip'], proxy['port'])
        kwargs['proxies'] = {'http': proxy_url, 'https': proxy_url}

    try:
        response = requests.get(url, **kwargs)
    except requests.RequestException as e:
        # Timeouts and connection/proxy errors are reported as a failed
        # attempt (None) so callers that retry on None keep working.
        config.console_log('请求异常: %s URL: %s' % (e, url), 'red')
        return None

    if response.status_code == 200:
        return response.text

    # Arguments follow the order of the labels: status code, URL, content
    config.console_log(
        '请求返回的状态码: %s URL: %s 内容: %s' % (str(response.status_code), url, response.text),
        'red')
    return None
def _text(self, url):
    """
    Download ``url`` directly and return the page text.

    :param url: address of the page to request
    :return: the response text when ``response.ok`` is true; ``None`` when
             the status is not ok or the request times out
    """
    try:
        reply = requests.get(url, timeout=10, headers=config.get_http_header())
        if not reply.ok:
            # TODO: retry the download through a proxy
            config.console_log('请求返回的状态码: ' + str(reply.status_code), 'red')
            return None
        return reply.text
    except Timeout as err:
        config.console_log('请求超时: ' + str(err), 'red')
        return None
def crawl_handle(protocal, proxy, queue_persistence):
    """
    Validate ``proxy`` for one protocol and queue it when it works.

    ``connect`` is called with the httpbin URL of the protocol and must
    return a 3-tuple: a truthy flag on success, the anonymity value and the
    measured interval. On success the proxy dict is updated in place with
    ``protocol``, ``anonymity`` and ``speed`` and put on
    ``queue_persistence``; the outcome is logged either way.

    :param protocal: ``'http'`` or ``'https'``; any other value is ignored
    :param proxy: proxy dict, mutated in place on success
    :param queue_persistence: object with a ``put`` method receiving valid proxies
    :return: None
    """
    # Compare by value: identity checks (`is`) against string literals only
    # succeed for interned strings and miss runtime-built protocol names.
    if protocal == 'http':
        url = 'http://httpbin.org/get'
        passed_prefix = '验证通过的 http 代理: '
        failed_prefix = '无效的 http 代理: '
    elif protocal == 'https':
        url = 'https://httpbin.org/get'
        passed_prefix = '验证通过的 https 代理: '
        failed_prefix = '无效的https代理: '
    else:
        return

    usable, anonymity, interval = connect(url, proxy)
    if usable:
        proxy['protocol'] = protocal
        proxy['anonymity'] = anonymity
        proxy['speed'] = interval
        queue_persistence.put(proxy)
        config.console_log(
            passed_prefix + json.dumps(proxy, ensure_ascii=False), 'green')
    else:
        config.console_log(
            failed_prefix + json.dumps(proxy, ensure_ascii=False), 'red')
def __proxy_delete(self, ip, port):
    """
    Ask the web API to delete a crawled proxy and log the outcome.

    :param ip: IP of the proxy to delete
    :param port: port of the proxy to delete
    :return: None
    """
    endpoint = 'http://{}:{}/proxy/delete?ip={}&port={}'.format(
        config.WEB_API_IP, config.WEB_API_PORT, ip, port)
    reply = requests.get(endpoint, timeout=5)
    # The API answers 204 when the proxy was removed
    if reply.status_code == 204:
        template, colour = '删除代理成功 %s:%s', 'green'
    else:
        template, colour = '删除代理失败 %s:%s', 'red'
    config.console_log(template % (ip, port), colour)
def process_report(text_content, html_content):
    """
    Deliver the report by mail, or print it to the console.

    When ``config.config['method']`` is ``'mail'`` the report is sent through
    the SMTP server in ``config.config['smtp_server']`` (HTML body when
    ``mail_format`` is ``'html'``, plain text otherwise). If connecting or
    sending fails, the error is logged and the text report is printed to the
    console instead. For any other method the text report is printed.

    Updates the module globals ``server`` and ``server_connected``.

    :param text_content: plain-text version of the report
    :param html_content: HTML version of the report
    :return: None
    """
    global server_connected
    global server

    if config.config['method'] != 'mail':
        config.console_log("\n")
        config.console_log(text_content)
        return

    if config.config['mail_format'] == 'html':
        message = MIMEText(html_content, 'html', 'utf-8')
    else:
        message = MIMEText(text_content, 'plain', 'utf-8')
    message['Subject'] = 'Movies torrents digest'
    message['From'] = config.config['from']
    message['To'] = config.config['to']

    # Reset the flag so a failed connection cannot reuse the stale state
    # (and stale `server`) left behind by an earlier successful call.
    server_connected = False
    try:
        server = smtplib.SMTP(config.config['smtp_server'])
        server_connected = True
    except Exception as exc:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        config.log_message("Unexpected error while connecting to mail server:" + str(type(exc)), 'error')
        config.log_message("Printing report to console\n")
        config.console_log("\n")
        config.console_log(text_content)

    if server_connected:
        try:
            server.ehlo()
            server.starttls()
            server.login(config.config['username'], config.config['password'])
            server.sendmail(config.config['from'], config.config['to'], message.as_string())
            server.quit()
            config.log_message("Report sent by mail")
        except Exception as exc:
            config.log_message("Unexpected error while sending mail :" + str(type(exc)), 'error')
            config.log_message("Printing report to console")
            config.console_log("\n")
            config.console_log(text_content)
        finally:
            # Always release the socket, whether or not sending succeeded
            server.close()