def _crawl_site_6(self):
    '''
    Zdaye proxy (ip.zdaye.com): crawl the proxies from the three most recent updates
    :return:
    '''
    main_url = 'http://ip.zdaye.com/dayProxy.html'
    resp = requests.get(main_url, headers=self.header)
    soup = BeautifulSoup(resp.content, 'lxml')
    urls = [u.find('a').attrs['href'] for u in soup.find_all('h3', class_='thread_title')]
    header = copy(self.header)
    header['referer'] = main_url
    for url in urls[0:3]:
        resp = requests.get('http://ip.zdaye.com' + url, headers=header)
        soup = BeautifulSoup(resp.content, 'lxml')
        text = soup.find('div', class_='cont').text
        pattern = r'((\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])):(\d*)@(HTTP|HTTPS)'
        ip_list = re.findall(pattern, text)
        for ip_items in ip_list:
            item = {
                'ip': ip_items[0],
                'port': ip_items[5],
                'type': ip_items[-1]
            }
            # print(item)
            ProxyManager.feed_pool(json.dumps(item))
def _crawl_site_5(self):
    '''
    Xsdaili proxy (xsdaili.com): crawl the proxies updated in the last two days
    :return:
    '''
    main_url = 'http://www.xsdaili.com'
    resp = requests.get(main_url, headers=self.header)
    soup = BeautifulSoup(resp.text, 'lxml')
    urls = [u.find('a').attrs['href'] for u in soup.find_all('div', class_='title')]
    for url in urls[0:2]:
        resp = requests.get(main_url + url, headers=self.header)
        soup = BeautifulSoup(resp.content, 'lxml')
        text = soup.find('div', class_='cont').text
        pattern = r'((\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])):(\d*)@(HTTP|HTTPS)'
        ip_list = re.findall(pattern, text)
        for ip_items in ip_list:
            item = {
                'ip': ip_items[0],
                'port': ip_items[5],
                'type': ip_items[-1]
            }
            ProxyManager.feed_pool(json.dumps(item))
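# NOTE: illustrative sketch, not part of the original code. _crawl_site_6 and
# _crawl_site_5 above both scan a text blob for entries shaped like
# "1.2.3.4:8080@HTTP" with the same regex; the parsing step could be factored
# into one helper. The name _parse_ip_port_type is hypothetical, and `re` is
# assumed to be imported at module level (it already is for re.findall above).
def _parse_ip_port_type(self, text):
    '''Yield {'ip', 'port', 'type'} dicts for every ip:port@TYPE entry in a text block.'''
    pattern = re.compile(
        r'((?:\d{1,2}|1\d\d|2[0-4]\d|25[0-5])'
        r'(?:\.(?:\d{1,2}|1\d\d|2[0-4]\d|25[0-5])){3}):(\d+)@(HTTPS?)')
    for ip, port, proxy_type in pattern.findall(text):
        yield {'ip': ip, 'port': port, 'type': proxy_type}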
def run(self):
    try:
        pm = ProxyManager()
        proxies = set()
        tasks = []
        self.__gen_fetch_tasks(tasks, proxies)
        self.__wait_fetch(tasks)
        logger.info('Fetched %d proxies' % len(proxies))
        proxies = self.__remove_exist_proxies(pm, proxies)
        if proxies:
            verify_tasks = self.__gen_gevent_tasks(proxies)
            self.__wait_for_gevent_tasks(verify_tasks)
            self.__write_verify_result(pm, proxies)
        pm.close()
        gc.collect()
        logger.info('ProxyModel Fetch Finished, wait for 10 min')
    except Exception as e:
        logger.exception(e)
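# NOTE: illustrative sketch only; the real bodies of __gen_fetch_tasks and
# __wait_fetch are not shown in this excerpt. One plausible shape, using the
# gevent dependency the verifier already relies on, is to spawn every
# _crawl_site_* method as a greenlet and join them. The names below are
# hypothetical and the timeout is an arbitrary example.
def _gen_fetch_tasks_sketch(self, tasks):
    crawlers = [getattr(self, name) for name in dir(self)
                if name.startswith('_crawl_site_') and callable(getattr(self, name))]
    tasks.extend(gevent.spawn(crawl) for crawl in crawlers)

def _wait_fetch_sketch(self, tasks):
    gevent.joinall(tasks, timeout=120)  # example timeout; assumes gevent is imported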
def _crawl_site_3(self):
    '''
    Crawl the data5u.com free proxies; only 10 are listed
    :return:
    '''
    url = 'http://www.data5u.com/'
    resp = requests.get(url, headers=self.header)
    soup = BeautifulSoup(resp.text, 'lxml')
    rows = soup.find_all('ul', class_='l2')
    for row in rows:
        tds = row.find_all('li')
        item = {
            'ip': tds[0].text,
            'port': tds[1].text,
            'type': tds[3].text
        }
        ProxyManager.feed_pool(json.dumps(item))
def _crawl_site_7(self):
    '''
    Mimvp proxy (proxy.mimvp.com): only the first page is visible to guests
    :return:
    '''
    url = 'https://proxy.mimvp.com/freeopen.php'
    resp = requests.get(url, headers=self.header)
    soup = BeautifulSoup(resp.text, 'lxml')
    rows = soup.find('div', class_='free-list').find_all('tr')
    for row in rows[1:]:
        tds = row.find_all('td')
        item = {
            'ip': tds[0].text,
            'port': tds[1].text,  # the port is rendered as an image; OCR is needed to turn it into text
            'type': tds[3].text
        }
        ProxyManager.feed_pool(json.dumps(item))
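# NOTE: illustrative sketch, not part of the original code. On mimvp the port
# cell contains an <img> rather than text, so tds[1].text above comes back
# empty; one way to recover the digits is OCR via Pillow + pytesseract. The
# helper name _ocr_port and the assumption that the image URL sits in the
# <img> 'src' attribute are hypothetical.
def _ocr_port(self, td):
    from io import BytesIO
    from urllib.parse import urljoin
    from PIL import Image
    import pytesseract  # requires the tesseract binary to be installed

    img_url = urljoin('https://proxy.mimvp.com/', td.find('img').attrs['src'])
    resp = requests.get(img_url, headers=self.header)
    image = Image.open(BytesIO(resp.content))
    # treat the image as a single line of digits
    digits = pytesseract.image_to_string(
        image, config='--psm 7 -c tessedit_char_whitelist=0123456789')
    return digits.strip()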
def _crawl_site_4(self):
    '''
    Yun proxy (ip3366.net): crawl 10 pages
    :return:
    '''
    for i in range(1, 11):
        url = 'http://www.ip3366.net/?stype=1&page={0}'.format(i)
        resp = requests.get(url, headers=self.header)
        soup = BeautifulSoup(resp.text, 'lxml')
        rows = soup.find('div', id='list').find_all('tr')
        for row in rows[1:]:
            tds = row.find_all('td')
            item = {
                'ip': tds[0].text,
                'port': tds[1].text,
                'type': tds[3].text
            }
            ProxyManager.feed_pool(json.dumps(item))
def _crawl_site_0(self):
    '''
    Crawl the IPs on the first four pages of Xici proxy (xicidaili.com)
    :return:
    '''
    for i in range(1, 5):
        url = 'https://www.xicidaili.com/nn/{0}'.format(i)
        resp = requests.get(url, headers=self.header)
        soup = BeautifulSoup(resp.text, 'lxml')
        rows = soup.find('table', id='ip_list').find_all('tr')
        for row in rows[1:]:
            tds = row.find_all('td')
            item = {
                'ip': tds[1].text,
                'port': tds[2].text,
                'type': tds[5].text
            }
            ProxyManager.feed_pool(json.dumps(item))
def _crawl_site_2(self):
    '''
    Crawl the first 10 pages of Superfastip proxy
    :return:
    '''
    for i in range(1, 11):
        url = 'http://www.superfastip.com/welcome/freeip/{0}'.format(i)
        resp = requests.get(url, headers=self.header)
        soup = BeautifulSoup(resp.text, 'lxml')
        rows = soup.find_all('div', class_='row clearfix')[2].find_all('tr')
        for row in rows[1:]:
            tds = row.find_all('td')
            item = {
                'ip': tds[0].text,
                'port': tds[1].text,
                'type': tds[3].text
            }
            ProxyManager.feed_pool(json.dumps(item))
def _crawl_site_10(self):
    '''
    Wndaili proxy (wndaili.cn): crawl the first 10 pages
    :return:
    '''
    for i in range(1, 11):
        url = 'http://wndaili.cn/?page={0}'.format(i)
        resp = requests.get(url, headers=self.header)
        soup = BeautifulSoup(resp.text, 'lxml')
        rows = soup.find('div', id='list').find_all('tr')
        for row in rows[1:]:
            tds = row.find_all('td')
            item = {
                'ip': tds[0].text,
                'port': tds[1].text,
                'type': tds[3].text
            }
            # print(item)
            ProxyManager.feed_pool(json.dumps(item))
def _crawl_site_1(self):
    '''
    Crawl the IPs on the first three pages of Kuaidaili
    :return:
    '''
    for i in range(1, 4):
        url = 'https://www.kuaidaili.com/free/inha/{0}'.format(i)
        resp = requests.get(url, headers=self.header)
        soup = BeautifulSoup(resp.text, 'lxml')
        rows = soup.find('div', id='list').find_all('tr')
        for row in rows[1:]:
            tds = row.find_all('td')
            item = {
                'ip': tds[0].text,
                'port': tds[1].text,
                'type': tds[3].text
            }
            ProxyManager.feed_pool(json.dumps(item))
        time.sleep(2)
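# NOTE: illustrative sketch, not part of the original code. The table-style
# crawlers above (_crawl_site_0 through _crawl_site_4 and _crawl_site_10) all
# repeat the same fetch -> find rows -> pick <td> columns -> feed_pool loop,
# so they could share one helper. The name _crawl_table and its parameters are
# hypothetical.
def _crawl_table(self, url, find_rows, ip_idx=0, port_idx=1, type_idx=3, skip=1):
    '''find_rows: callable mapping a BeautifulSoup document to its list of <tr> rows.'''
    resp = requests.get(url, headers=self.header)
    soup = BeautifulSoup(resp.text, 'lxml')
    for row in find_rows(soup)[skip:]:
        tds = row.find_all('td')
        item = {
            'ip': tds[ip_idx].text.strip(),
            'port': tds[port_idx].text.strip(),
            'type': tds[type_idx].text.strip()
        }
        ProxyManager.feed_pool(json.dumps(item))

# Example: _crawl_site_1 could then become
#   for i in range(1, 4):
#       self._crawl_table('https://www.kuaidaili.com/free/inha/{0}'.format(i),
#                         lambda s: s.find('div', id='list').find_all('tr'))
#       time.sleep(2)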
def _crawl_site_8(self):
    '''
    Xiladaili proxy (xiladaili.com)
    :return:
    '''
    url = 'http://www.xiladaili.com'
    resp = requests.get(url, headers=self.header)
    soup = BeautifulSoup(resp.text, 'lxml')
    tables = soup.find_all('table', class_='fl-table')
    # crawl both the HTTP and HTTPS sections
    for t in tables[1:3]:
        for row in t.find_all('tr')[2:]:
            tds = row.find_all('td')
            ip, port = tds[0].text.split(':')
            item = {
                'ip': ip,
                'port': port,
                'type': tds[2].text
            }
            # print(item)
            ProxyManager.feed_pool(json.dumps(item))
def _crawl_site_9(self):
    '''
    Nima proxy (nimadaili.com)
    :return:
    '''
    url = 'http://www.nimadaili.com'
    resp = requests.get(url, headers=self.header)
    soup = BeautifulSoup(resp.text, 'lxml')
    tables = soup.find_all('div', id='overflow')
    # crawl both the HTTP and HTTPS sections
    for t in tables[2:]:
        for row in t.find_all('tr')[1:-1]:
            tds = row.find_all('td')
            ip, port = tds[0].text.split(':')
            item = {
                'ip': ip,
                'port': port,
                'type': tds[2].text
            }
            # print(item)
            ProxyManager.feed_pool(json.dumps(item))
def run(self):
    try:
        proxy_manager = ProxyManager()
        logger.info("Start proxy verify")
        while True:
            proxies = proxy_manager.proxy_verified_before(minutes=30, limit=1000)
            if not len(proxies):
                logger.info('No proxy needs to be verified! Sleep [ 5 ] minutes.')
                proxy_manager.close()
                break
            verifier = ProxyGeventVerifier()
            start = time.time()
            tasks = verifier.generate_tasks(proxies)
            logger.info('Created %d verify tasks' % len(proxies))
            gevent.joinall(tasks)
            logger.info('Proxy Verify Using %d sec.' % (time.time() - start))
            passed, failed = 0, 0
            for proxy in proxies:
                if proxy.usable:
                    passed += 1
                    proxy_manager.verify_passed(proxy)
                else:
                    failed += 1
                    proxy_manager.verify_failed(proxy)
            proxy_manager.commit()
            logger.info('Verify Complete! %d passed / %d failed' % (passed, failed))
            proxy_manager.remove_bad_proxy()
    except Exception as e:
        logger.exception(e)
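# NOTE: illustrative sketch only; ProxyGeventVerifier's real implementation is
# not shown in this excerpt. A minimal gevent-based checker could spawn one
# greenlet per proxy and flip proxy.usable based on a test request. The test
# URL, the timeout, and the assumption that ProxyModel exposes ip/port/type
# attributes are all hypothetical. Assumes gevent.monkey.patch_all() has been
# applied so requests yields cooperatively.
class ProxyGeventVerifierSketch:
    TEST_URL = 'http://httpbin.org/ip'  # example target, not from the source

    def generate_tasks(self, proxies):
        return [gevent.spawn(self._check, proxy) for proxy in proxies]

    def _check(self, proxy):
        address = '{0}://{1}:{2}'.format(proxy.type.lower(), proxy.ip, proxy.port)
        try:
            resp = requests.get(self.TEST_URL,
                                proxies={'http': address, 'https': address},
                                timeout=10)
            proxy.usable = resp.status_code == 200
        except requests.RequestException:
            proxy.usable = False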
from manager import ProxyManager
from model import ProxyModel

if __name__ == '__main__':
    # Config.Base.metadata.drop_all(Config.engine)
    # Config.Base.metadata.create_all(Config.engine)
    pm = ProxyManager()
    pm.add_proxy(ProxyModel.instance('http://27.208.25.190:8060'))
import time
import traceback
from configparser import ConfigParser

from manager import ProxyManager
from utils import Util

# refresh available ip proxy pool
log_file = 'refresh.log'
Util.log_to_file(
    'Refresh job is starting up at {0}.'.format(
        time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))),
    0, log_file)

while True:
    try:
        Util.log_to_file('Start refreshing proxy pool.', 0, log_file)
        result = ProxyManager.refresh_proxy_pool()
        Util.log_to_file('Refresh finished.', 0, log_file)
        Util.log_to_file(result, 0, log_file)
        cp = ConfigParser()
        cp.read('config', encoding='utf-8')
        interval = cp.get('scheduler', 'refresh_interval')
        Util.log_to_file('Refresh job is going to sleep.', 0, log_file)
        time.sleep(int(interval) * 60)
        Util.log_to_file('Refresh job woke up, starting next refresh.', 0, log_file)
    except:
        Util.log_to_file(traceback.format_exc(), 1, log_file)
        Util.log_to_file(
            'Refresh job failed running, this job will be shut down.', 0, log_file)
        break
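# NOTE: illustrative sketch, not part of the original code. The loop above
# expects an INI-style file named 'config' with a [scheduler] section holding
# refresh_interval (in minutes). The value 30 is only an example, and any
# other sections the project uses are not shown here; the snippet writes to
# 'config.example' so it cannot clobber a real config file.
from configparser import ConfigParser

sample = ConfigParser()
sample['scheduler'] = {'refresh_interval': '30'}  # minutes between refreshes
with open('config.example', 'w', encoding='utf-8') as f:
    sample.write(f)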
# coding:utf-8
import sys
from datetime import datetime

from flask import Flask, render_template, json, request

sys.path.append('../')
from utils import Config
from manager import ProxyManager

app = Flask(__name__)
app.config['JSONIFY_PRETTYPRINT_REGULAR'] = True
proxy_manager = ProxyManager()


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/api/proxy', methods=['POST'])
def api_proxy():
    args = request.json or {}
    start = int(args['start'] if 'start' in args else 0)
    length = int(args['length'] if 'length' in args else 10)
    draw = int(args['draw'] if 'draw' in args else 1)
    order = args['order'] if 'order' in args else []
    if order:
        column_name = args['columns'][order[0]['column']]['name']
        print('Order by', column_name)
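# NOTE: illustrative usage sketch, not part of the original code. The endpoint
# above reads DataTables-style paging parameters from a JSON POST body; a
# minimal client call could look like this. The host/port assume the default
# Flask dev server, and the 'ip' column name is only an example.
import requests

payload = {
    'draw': 1,
    'start': 0,
    'length': 10,
    'order': [{'column': 0, 'dir': 'asc'}],
    'columns': [{'name': 'ip'}],
}
resp = requests.post('http://127.0.0.1:5000/api/proxy', json=payload)
print(resp.status_code, resp.text)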
import time

from manager import ProxyManager

s = time.time()
ProxyManager.validate()
e = time.time()
print(str(e - s))
import time
import traceback

from manager import ProxyManager
from utils import Util

# validate the ip proxies in the pool
log_file = 'validation.log'
Util.log_to_file(
    'Validation job is starting up at {0}.'.format(
        time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))),
    0, log_file)

while True:
    try:
        Util.log_to_file('Begin validating ip proxy queue.', 0, log_file)
        result = ProxyManager.validate()
    except:
        Util.log_to_file(traceback.format_exc(), 1, log_file)
        Util.log_to_file(
            'Validation job failed running, this job will be shut down.', 0, log_file)
        break
    Util.log_to_file('Validation job complete.', 0, log_file)
    time.sleep(600)

Util.log_to_file(
    'Validation job is ending at {0}.'.format(
        time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))),
    0, log_file)
from model import ProxyModel
from manager import ProxyManager

if __name__ == '__main__':
    # Config.Base.metadata.drop_all(Config.engine)
    # Config.Base.metadata.create_all(Config.engine)
    pm = ProxyManager()
    pm.add_proxy(ProxyModel.from_url('http://27.208.25.190:8060'))