def test_create_ip(self):
    ip_str = gen_random_ip()
    ip = ProxyIP(ip=ip_str, port=3306)
    ip.save()
    count = ProxyIP.select().count()
    assert count > 0
    self.delete_ip(ip_str)
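# gen_random_ip is used above but not defined in this section; a plausible
# sketch (the exact implementation is an assumption, not the project's code):
import random

def gen_random_ip() -> str:
    # four random octets in 1..254, good enough for a throwaway test row
    return '.'.join(str(random.randint(1, 254)) for _ in range(4))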
def test_validate(self):
    p = ProxyIP(ip='159.203.186.40', port='8080')
    if ValidateManager.should_validate(p):
        v = ValidateManager(p)
        v.validate()
        # peewee keeps field values in __data__, not __dict__
        s = '\n'.join(['%s:%s' % item for item in p.__data__.items()])
        print("\n")
        print("\033[;35m\t{}\033[0m".format(s))
        p.merge()
def crawl_callback(self, future):
    pw = provider = exc = None
    try:
        provider, exc = future.result()
        proxies = list(set(provider.proxies))
        pw = ProxyWebSite(site_name=provider.name)
        if exc is None:
            pw.stats = 'OK'
        else:
            pw.stats = exc.__class__.__name__
            logger.debug("{} crawl error:{}".format(provider.name, exc))
        pw.proxy_count = len(proxies)
        logger.info("{} crawl proxies:{}".format(provider.name, pw.proxy_count))
        for p in proxies:
            self.validator_queue.put(ProxyIP(ip=p[0], port=p[1]))
        logger.info("{} proxies enqueued:{}".format(provider.name, pw.proxy_count))
        pw.merge()
        logger.info("{} proxies saved to db:{}".format(provider.name, pw.proxy_count))
        logger.info("{} crawl END".format(provider.name))
    except Exception as e:
        # future.result() may raise before pw is created, so guard the update
        # and log the exception actually caught here, not the stale exc
        if pw is not None:
            pw.stats = e.__class__.__name__
        logger.debug("{} crawl callback error:{}".format(
            provider.name if provider else 'unknown', e))
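# Illustrative only: one way the crawl futures could be produced and wired to
# crawl_callback. crawl_one() and provider.crawl() are assumptions; the real
# project may dispatch providers differently.
from concurrent.futures import ThreadPoolExecutor

def crawl_one(provider):
    # hypothetical wrapper: run the provider and return (provider, exc) so
    # crawl_callback can unpack future.result() exactly as it does above
    try:
        provider.crawl()
        return provider, None
    except Exception as e:
        return provider, e

def crawl_all(self, providers):
    with ThreadPoolExecutor(max_workers=8) as executor:
        for provider in providers:
            executor.submit(crawl_one, provider).add_done_callback(self.crawl_callback)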
def __init__(self, proxy: ProxyIP):
    # reset every measured field so stale values never survive a revalidation
    proxy.latency = -1
    proxy.google = 0
    proxy.http_weight = 0
    proxy.https_weight = 0
    proxy.http_anonymous = 0
    proxy.https_anonymous = 0
    proxy.http_pass_proxy_ip = None
    proxy.https_pass_proxy_ip = None
    self._proxy = proxy
def testFeedFromDb(self):
    while True:
        # comparing updated_at against the raw string "datetime('now',...)"
        # would match nothing in peewee; bind a real datetime instead
        proxies = ProxyIP.select(ProxyIP.ip) \
            .where((ProxyIP.https_weight > 0) & (ProxyIP.http_weight > 0)) \
            .where(ProxyIP.updated_at > datetime.now() - timedelta(hours=1))
        for x in proxies.iterator():
            print(x.ip)
        sleep(1)
def sche_validate_from_db():
    try:
        i = 0
        # with _db.connection_context():
        proxies = ProxyIP.select().where(
            (ProxyIP.updated_at < datetime.now() - timedelta(minutes=5))
            & (ProxyIP.https_weight + ProxyIP.http_weight > 0))
        for p in proxies.iterator():
            Scheduler.validator_queue.put(p)
            i += 1
        logger.info('proxies from db: {}'.format(i))
    except Exception as e:
        logger.error("error:%s", str(e), exc_info=True)
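# Illustrative only: one way to run sche_validate_from_db periodically. The
# 300-second interval and the daemon-thread wiring are assumptions.
import threading
import time

def start_db_feeder(interval: float = 300.0):
    # hypothetical helper: re-enqueue stale proxies every `interval` seconds
    def _loop():
        while True:
            sche_validate_from_db()
            time.sleep(interval)
    threading.Thread(target=_loop, daemon=True).start()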
@classmethod
def should_validate(cls, proxy_ip: ProxyIP) -> bool:
    # decorator restored: the method takes cls and is called on the class
    # (see ValidateManager.should_validate(p) in the test above)
    if proxy_ip.id is None:
        # with _db.connection_context():
        p = ProxyIP.get_or_none(ProxyIP.ip == proxy_ip.ip)
        if p is not None:
            if p.updated_at > datetime.now() - timedelta(minutes=20):
                return False
            if p.latency > 40 and p.updated_at > datetime.now() - timedelta(hours=12):
                return False
            if p.http_weight + p.https_weight <= 0 and \
                    p.updated_at > datetime.now() - timedelta(hours=12):
                return False
    return True
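# In plain terms, should_validate skips a known ip when any of these hold
# (all thresholds taken from the checks above):
#   - the row was refreshed within the last 20 minutes
#   - latency > 40 and the row was refreshed within the last 12 hours
#   - combined http/https weight <= 0 and the row was refreshed within 12 hours
# Everything else, including never-seen ips, gets (re)validated.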
def reconfigure(self):
    # with _db.connection_context():
    ps = ProxyIP.select() \
        .where(ProxyIP.updated_at > datetime.now() - timedelta(minutes=60)) \
        .where(ProxyIP.https_anonymous > 0).where(ProxyIP.http_anonymous > 0) \
        .where(ProxyIP.https_weight > 0) \
        .where(ProxyIP.latency < 25) \
        .order_by(ProxyIP.https_weight.desc(), ProxyIP.https_anonymous.desc(),
                  ProxyIP.http_weight.desc(), ProxyIP.http_anonymous.desc(),
                  ProxyIP.latency) \
        .limit(300).execute()
    logger.info('squid reconfigure...')
    with open(self.SQUID_CONF_TPL, "r") as f:
        squid_conf = f.readlines()
    for c in self.HIDE:
        squid_conf.append(c + '\n')
    i = 0
    for i, p in enumerate(ps):
        squid_conf.append(self.PEER.format(p.ip, p.port,
                                           p.http_weight + p.https_weight, i))
        if p.http_weight == 0:
            # peers with no plain-HTTP weight get an extra cache_peer_access rule
            squid_conf.append('cache_peer_access p{} deny acl_p443\n'.format(i))
    squid_conf.append('#%s' % datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    if i > 20:
        # only rewrite the config when enough peers are available
        with open(self.SQUID_CONF, "w") as f:
            f.writelines(squid_conf)
        subprocess.call(['squid', '-k', 'reconfigure'], shell=False)
        logger.info('squid reconfigured !!')
    else:
        logger.info('squid reconfigure cancelled !!')
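# The class constants are not shown in this section; a plausible shape,
# assuming standard squid directives (every string below is an assumption):
SQUID_CONF_TPL = '/etc/squid/squid.conf.tpl'
SQUID_CONF = '/etc/squid/squid.conf'
# cache_peer <host> parent <port> 0 no-query weight=<n> name=p<i>
PEER = 'cache_peer {0} parent {1} 0 no-query weight={2} name=p{3}\n'
HIDE = [
    'request_header_access Via deny all',
    'request_header_access X-Forwarded-For deny all',
]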
def do_GET(self):
    print('client :%s' % str(self.client_address))
    self.do_HEAD()
    # /api?format=json
    params = parse.parse_qs(parse.urlparse(self.path).query)
    # with _db.connection_context():
    ps = ProxyIP.select() \
        .where(ProxyIP.updated_at > datetime.now() - timedelta(minutes=60)) \
        .where(ProxyIP.https_anonymous > 0).where(ProxyIP.http_anonymous > 0) \
        .where(ProxyIP.https_weight > 0) \
        .where(ProxyIP.latency < 25) \
        .order_by(ProxyIP.https_weight.desc(), ProxyIP.https_anonymous.desc(),
                  ProxyIP.http_weight.desc(), ProxyIP.http_anonymous.desc(),
                  ProxyIP.latency) \
        .limit(500).execute()
    if params.get('format') and 'csv' == params['format'][0]:
        for p in ps:
            self.wfile.write(('{}:{}:{}:{},'.format(
                p.ip, p.port, p.http_weight, p.https_weight)).encode("utf-8"))
        self.wfile.flush()
    elif params.get('format') and 'json' == params['format'][0]:
        jtxt = {p.ip: p.port for p in ps}
        self.wfile.write(json.dumps(jtxt).encode("utf-8"))
        self.wfile.flush()
    else:
        # with _db.connection_context():
        total_count = ProxyIP.select().count()
        valid_count = _valid_proxies_query().count()
        ul = '''<div style='margin-left:20px'><h3> {0}:{1}</h3></div>
            <ul><li><b>{2}</b> proxy ips in total</li>
            <li><b>{3}</b> of them are valid</li></ul>'''.format(
            datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            self.client_address, total_count, valid_count)
        arr = []
        # with _db.connection_context():
        pw = ProxyWebSite.select().order_by(ProxyWebSite.this_fetch.desc())
        for x in pw:
            arr.append(
                '<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td></tr>'
                .format(
                    x.site_name, x.proxy_count,
                    x.last_fetch.strftime("%Y-%m-%d %H:%M:%S") if x.last_fetch else '',
                    # guard this_fetch like last_fetch: it can also be NULL
                    x.this_fetch.strftime("%Y-%m-%d %H:%M:%S") if x.this_fetch else '',
                    x.stats))
        sites = ''.join(arr)
        arr.clear()
        # with _db.connection_context():
        detail = _valid_proxies_query().order_by(ProxyIP.google.desc(),
                                                 ProxyIP.https_weight.desc(),
                                                 ProxyIP.http_weight.desc(),
                                                 ProxyIP.latency).limit(100)
        for i, p in enumerate(detail):
            arr.append(
                "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td>"
                "<td>{6}</td><td>{7}</td><td>{8}</td><td>{9}</td><td nowrap='nowrap'>{10}</td>"
                "<td>{11}</td><td>{12}</td></tr>".format(
                    i + 1, '%s:%s' % (p.ip, p.port), p.latency, p.google,
                    p.http_pass_proxy_ip, p.https_pass_proxy_ip,
                    p.http_anonymous, p.https_anonymous,
                    p.http_weight, p.https_weight,
                    p.updated_at.strftime("%Y-%m-%d %H:%M:%S"),
                    p.country, p.city))
        ips = ''.join(arr)
        html = '''<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
            <title>http/pikapi</title></head>
            <body>{0}
            <table width="800" style="margin-left:20px; border-collapse:collapse; padding-left:10px" border="1" cellPadding=3 bordercolor="#BBBBBB">
            <thead bgcolor="#DDDDDD"><tr><th>site</th><th>proxy</th><th>last crawl</th><th>this crawl</th><th>state</th></tr></thead>
            <tbody>{1} </tbody></table><br/>
            <table width="900" style="margin-left:20px; border-collapse:collapse; padding-left:10px" border="1" cellPadding=3 bordercolor="#BBBBBB">
            <thead bgcolor="#DDDDDD"><tr><th>row</th><th>proxy</th><th>elapsed</th><th>google</th>
            <th>http_pass_ip</th><th>https_pass_ip</th>
            <th>http_ano</th><th>https_ano</th>
            <th>http_weight</th><th>https_weight</th>
            <th>updated_at</th><th>country_name</th><th>city</th></tr></thead>
            <tbody>{2}</tbody></table>
            </body></html>'''.format(ul, sites, ips)
        self.wfile.write(html.encode('utf-8'))
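# The handler serves three views; host and port below are illustrative:
#   curl 'http://127.0.0.1:8899/?format=csv'   -> ip:port:http_weight:https_weight, ...
#   curl 'http://127.0.0.1:8899/?format=json'  -> {"<ip>": <port>, ...}
#   curl 'http://127.0.0.1:8899/'              -> HTML status page with crawl
#                                                 sites and the top 100 proxies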
from http.server import HTTPStatus, HTTPServer, BaseHTTPRequestHandler
from urllib import parse
from pikapi.database import ProxyIP, ProxyWebSite
import logging
from threading import Thread
from datetime import datetime, timedelta
import json

logger = logging.getLogger(__name__)


def _valid_proxies_query():
    # build a fresh query per call so the 30-minute window tracks "now";
    # a module-level query would freeze datetime.now() at import time
    return ProxyIP.select() \
        .where(ProxyIP.updated_at > datetime.now() - timedelta(minutes=30)) \
        .where(ProxyIP.http_weight + ProxyIP.https_weight > 0)


class ResquestHandler(BaseHTTPRequestHandler):
    def do_HEAD(self):
        self.send_response(HTTPStatus.OK)
        self.send_header("Content-type", "text/html")
        self.end_headers()
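# Illustrative only: a minimal way to serve ResquestHandler in a background
# thread, reusing the imports above; the port and entry point are assumptions.
def serve(port: int = 8899) -> HTTPServer:
    httpd = HTTPServer(('0.0.0.0', port), ResquestHandler)
    Thread(target=httpd.serve_forever, daemon=True).start()
    return httpd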
@staticmethod
def delete_ip(ip: str):
    # staticmethod: takes no self, but is invoked as self.delete_ip(...) in
    # test_create_ip above
    ProxyIP.delete().where(ProxyIP.ip == ip).execute()