示例#1
0
    def test_create_ip(self):
        """Persist one randomly-generated proxy IP and verify the table is non-empty."""
        random_ip = gen_random_ip()
        proxy = ProxyIP(ip=random_ip, port=3306)
        proxy.save()

        total = ProxyIP.select().count()
        assert total > 0

        # Clean up the row we just created.
        self.delete_ip(random_ip)
示例#2
0
 def test_validate(self):
     """Run a full validation pass on a fixed proxy and dump its fields."""
     proxy = ProxyIP(ip='159.203.186.40', port='8080')
     if not ValidateManager.should_validate(proxy):
         return
     validator = ValidateManager(proxy)
     validator.validate()
     dump = '\n'.join('%s:%s' % pair for pair in proxy.__data__.items())
     print("\n")
     # Magenta ANSI colouring so the dump stands out in test output.
     print("\033[;35m\t{}\033[0m".format(dump))
     proxy.merge()
示例#3
0
文件: scheduler.py 项目: yeshl/pikapi
    def crawl_callback(self, future):
        """Handle a finished provider-crawl future.

        Records per-site stats in a ProxyWebSite row and enqueues every crawled
        (ip, port) pair onto the validator queue.
        """
        pw = provider = exc = None
        try:
            provider, exc = future.result()
            proxies = list(set(provider.proxies))  # dedupe crawled proxies
            pw = ProxyWebSite(site_name=provider.name)
            if exc is None:
                pw.stats = 'OK'
            else:
                pw.stats = exc.__class__.__name__
                logger.debug("{} crawl error:{}".format(provider.name, exc))

            pw.proxy_count = len(proxies)
            logger.info("{} crawl proxies:{}".format(provider.name,
                                                     pw.proxy_count))
            for p in proxies:
                self.validator_queue.put(ProxyIP(ip=p[0], port=p[1]))
            logger.info("{} proxies enqueue:{}".format(provider.name,
                                                       pw.proxy_count))
            logger.info("{} proxies save to db:{}".format(
                provider.name, pw.proxy_count))
            pw.merge()
            logger.info("{} crawl END".format(provider.name))
        except Exception as e:
            # BUG FIX: pw/provider may still be None when future.result() itself
            # raised — guard before use so we don't mask the real error with an
            # AttributeError.
            if pw is not None:
                pw.stats = e.__class__.__name__
            name = provider.name if provider is not None else '<unknown>'
            # BUG FIX: log the exception caught here (`e`); the original logged
            # `exc`, which is None on this path.
            logger.debug("{} crawl callback error:{}".format(name, e))
示例#4
0
 def __init__(self, proxy: ProxyIP):
     """Reset *proxy*'s validation metrics to their defaults and keep a reference."""
     proxy.latency = -1
     # All weight/anonymity counters start from zero before validation runs.
     for counter in ('google', 'http_weight', 'https_weight',
                     'http_anonymous', 'https_anonymous'):
         setattr(proxy, counter, 0)
     proxy.http_pass_proxy_ip = None
     proxy.https_pass_proxy_ip = None
     self._proxy = proxy
示例#5
0
 def testFeedFromDb(self):
     """Continuously poll the DB and print IPs of proxies with positive weights.

     NOTE: loops forever (``while True``) — this is a manual/diagnostic test,
     not meant for an automated suite.
     """
     # Local import: this snippet's module import list is not visible here.
     from datetime import datetime, timedelta
     while True:
         # BUG FIX: the original compared updated_at against the literal string
         # "datetime('now','-1 hour','localtime')" — peewee binds that as a plain
         # string value, not SQL, so the filter never matched as intended. Use a
         # real datetime bound, consistent with the rest of the project.
         one_hour_ago = datetime.now() - timedelta(hours=1)
         proxies = ProxyIP.select(ProxyIP.ip) \
             .where((ProxyIP.https_weight > 0) & (ProxyIP.http_weight > 0)) \
             .where(ProxyIP.updated_at > one_hour_ago)
         for row in proxies.iterator():
             print(row.ip)
         sleep(1)
示例#6
0
文件: scheduler.py 项目: yeshl/pikapi
 def sche_validate_from_db():
     """Re-enqueue stale proxies for validation.

     A proxy is stale when it has not been updated for 5+ minutes and still has
     a positive combined weight.
     """
     try:
         stale = ProxyIP.select().where(
             (ProxyIP.updated_at < datetime.now() - timedelta(minutes=5))
             & (ProxyIP.https_weight + ProxyIP.http_weight > 0))
         count = 0
         # iterator() streams rows instead of materializing the result set.
         for proxy in stale.iterator():
             Scheduler.validator_queue.put(proxy)
             count += 1
         logger.info('proxy from db :{}'.format(count))
     except Exception as e:
         logger.error("error:%s", str(e), exc_info=True)
示例#7
0
 def should_validate(cls, proxy_ip: ProxyIP) -> bool:
     """Decide whether *proxy_ip* is worth (re-)validating.

     Returns False when the DB already holds a copy of this IP that is either
     freshly checked, persistently slow, or persistently weightless; otherwise
     True.
     """
     if proxy_ip.id is not None:
         # Already a persisted row — always validate.
         return True
     stored = ProxyIP.get_or_none(ProxyIP.ip == proxy_ip.ip)
     if stored is None:
         return True
     now = datetime.now()
     if stored.updated_at > now - timedelta(minutes=20):
         # Checked within the last 20 minutes — skip.
         return False
     checked_within_12h = stored.updated_at > now - timedelta(hours=12)
     if stored.latency > 40 and checked_within_12h:
         # Known slow and recently confirmed — skip.
         return False
     if stored.http_weight + stored.https_weight <= 0 and checked_within_12h:
         # Known useless and recently confirmed — skip.
         return False
     return True
示例#8
0
    def reconfigure(self):
        """Regenerate the squid config from the best proxies and reload squid.

        Selects recent, anonymous, https-capable, low-latency proxies (best
        weights first, capped at 300), renders them as cache_peer lines onto the
        template, and triggers `squid -k reconfigure` when enough peers exist.
        """
        # with _db.connection_context():
        ps = ProxyIP.select() \
            .where(ProxyIP.updated_at > datetime.now() - timedelta(minutes=60)) \
            .where(ProxyIP.https_anonymous > 0).where(ProxyIP.http_anonymous > 0) \
            .where(ProxyIP.https_weight > 0) \
            .where(ProxyIP.latency < 25) \
            .order_by(ProxyIP.https_weight.desc(), ProxyIP.https_anonymous.desc(),
                      ProxyIP.http_weight.desc(), ProxyIP.http_anonymous.desc(), ProxyIP.latency) \
            .limit(300).execute()
        logger.info('squid reconfigure...')
        with open(self.SQUID_CONF_TPL, "r") as f:
            squid_conf = f.readlines()

        for c in self.HIDE:
            squid_conf.append(c + '\n')
        i = 0
        for i, p in enumerate(ps):
            # Deduplicated: both original branches emitted the identical PEER
            # line; only http-incapable peers additionally get a deny rule for
            # port-443 (CONNECT) traffic.
            squid_conf.append(
                self.PEER.format(p.ip, p.port,
                                 p.http_weight + p.https_weight, i))
            if p.http_weight == 0:
                squid_conf.append(
                    'cache_peer_access p{} deny acl_p443\n'.format(i))
        # Timestamp comment marks when this config was generated.
        squid_conf.append('#%s' % datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        # NOTE(review): `i` is the last enumerate index, so this fires only with
        # 22+ peers (and stays 0 for an empty result) — confirm the threshold
        # intent before changing it.
        if i > 20:
            with open(self.SQUID_CONF, "w") as f:
                f.writelines(squid_conf)
            subprocess.call(['squid', '-k', 'reconfigure'], shell=False)
            logger.info('squid reconfigured !!')
        else:
            logger.info('squid reconfigure cancel !!')
示例#9
0
    def do_GET(self):
        """Serve proxy listings.

        ?format=csv  -> "ip:port:http_weight:https_weight," records
        ?format=json -> {ip: port} mapping
        otherwise    -> HTML dashboard (totals, crawl-site stats, proxy detail)
        """
        print('client :%s' % str(self.client_address))
        self.do_HEAD()
        # /api?format=json
        params = parse.parse_qs(parse.urlparse(self.path).query)
        # with _db.connection_context():
        # Best proxies from the last hour: anonymous on both schemes,
        # https-capable, latency < 25, strongest weights first, capped at 500.
        ps = ProxyIP.select() \
            .where(ProxyIP.updated_at > datetime.now() - timedelta(minutes=60)) \
            .where(ProxyIP.https_anonymous > 0).where(ProxyIP.http_anonymous > 0) \
            .where(ProxyIP.https_weight > 0) \
            .where(ProxyIP.latency < 25) \
            .order_by(ProxyIP.https_weight.desc(), ProxyIP.https_anonymous.desc(),
                      ProxyIP.http_weight.desc(), ProxyIP.http_anonymous.desc(), ProxyIP.latency) \
            .limit(500).execute()
        if params.get('format') and 'csv' == params['format'][0]:
            # Plain-text export, one comma-terminated record per proxy.
            for p in ps:
                self.wfile.write(
                    ('{}:{}:{}:{},'.format(p.ip, p.port, p.http_weight,
                                           p.https_weight)).encode("utf-8"))
            self.wfile.flush()
        elif params.get('format') and 'json' == params['format'][0]:
            jtxt = {p.ip: p.port for p in ps}
            self.wfile.write(json.dumps(jtxt).encode("utf-8"))
            self.wfile.flush()
        else:
            # HTML dashboard. Header block: totals and request metadata.
            # with _db.connection_context():
            total_count = ProxyIP.select().count()
            valid_count = _valid_proxies_query.count()
            ul = '''<div style='margin-left:20px'><h3> {0}:{1}</h3></div>
                    <ul><li><b>{2}</b> proxy ips in total</li>
                    <li><b>{3}</b> of them are valid</li></ul>'''.format(
                datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                self.client_address, total_count, valid_count)

            # First table: per-site crawl statistics, newest fetch first.
            arr = []
            # with _db.connection_context():
            pw = ProxyWebSite.select().order_by(ProxyWebSite.this_fetch.desc())
            for x in pw:
                arr.append(
                    '<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td></tr>'
                    .format(
                        x.site_name, x.proxy_count,
                        x.last_fetch.strftime("%Y-%m-%d %H:%M:%S")
                        if x.last_fetch else '',
                        x.this_fetch.strftime("%Y-%m-%d %H:%M:%S"), x.stats))
            sites = ''.join(arr)

            # Second table: top-100 valid proxies with full detail columns.
            arr.clear()
            # with _db.connection_context():
            detail = _valid_proxies_query.order_by(ProxyIP.google.desc(),
                                                   ProxyIP.https_weight.desc(),
                                                   ProxyIP.http_weight.desc(),
                                                   ProxyIP.latency).limit(100)
            for i, p in enumerate(detail):
                # <td nowrap='nowrap'></td>
                arr.append(
                    "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td>"
                    "<td>{6}</td><td>{7}</td><td>{8}</td><td>{9}</td><td nowrap='nowrap'>{10}</td>"
                    "<td>{11}</td><td>{12}</td></tr>".format(
                        i + 1, '%s:%s' % (p.ip, p.port), p.latency, p.google,
                        p.http_pass_proxy_ip, p.https_pass_proxy_ip,
                        p.http_anonymous, p.https_anonymous, p.http_weight,
                        p.https_weight,
                        p.updated_at.strftime("%Y-%m-%d %H:%M:%S"), p.country,
                        p.city))
            ips = ''.join(arr)

            # Assemble the final page and write it out.
            html = '''<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
                                 <title>http/pikapi</title></head>
                <body>{0}
                <table width="800" style="margin-left:20px; border-collapse:collapse; padding-left:10px"  border="1"  cellPadding=3 bordercolor="#BBBBBB">
                <thead bgcolor="#DDDDDD"><tr><th>site</th><th>proxy</th><th>last crawl</th><th>this crawl</th><th>state</th></tr></thead>
                <tbody>{1}
                </tbody></table></br>
                <table width="900" style="margin-left:20px; border-collapse:collapse; padding-left:10px"  border="1"  cellPadding=3 bordercolor="#BBBBBB">
                <thead bgcolor="#DDDDDD"><tr><th>row</th><th>proxy</th><th>elapsed</th><th>google</th>
                  <th>http_pass_ip</th><th>https_pass_ip</th>
                  <th>http_ano</th><th>https_ano</th>
                  <th>http_weight</th><th>https_weight</th>
                  <th>updated_at</th><th>country_name</th><th>city</th></tr></thead>
                <tbody>{2}</tbody></table>
                </body></html>'''.format(ul, sites, ips)
            self.wfile.write(html.encode('utf-8'))
示例#10
0
from http.server import HTTPStatus, HTTPServer, BaseHTTPRequestHandler
from urllib import parse
from pikapi.database import ProxyIP, ProxyWebSite
import logging
from threading import Thread
from datetime import datetime, timedelta
import json

logger = logging.getLogger(__name__)

# Base query reused by request handlers: proxies updated in the last 30 minutes
# with a positive combined weight.
# BUG FIX: removed the stray trailing line-continuation backslash — the
# statement only terminated because the next line happened to be blank, so any
# later reformat would have silently joined it with the following code.
_valid_proxies_query = ProxyIP.select() \
    .where(ProxyIP.updated_at > datetime.now() - timedelta(minutes=30)) \
    .where(ProxyIP.http_weight + ProxyIP.https_weight > 0)


class ResquestHandler(BaseHTTPRequestHandler):
    def do_HEAD(self):
        """Send a 200 OK status line and an HTML content-type header."""
        self.send_response(HTTPStatus.OK)
        self.send_header("Content-type", "text/html")
        self.end_headers()

    def do_GET(self):
        print('client :%s' % str(self.client_address))
        self.do_HEAD()
        # /api?format=json
        params = parse.parse_qs(parse.urlparse(self.path).query)
        # with _db.connection_context():
        ps = ProxyIP.select() \
            .where(ProxyIP.updated_at > datetime.now() - timedelta(minutes=60)) \
            .where(ProxyIP.https_anonymous > 0).where(ProxyIP.http_anonymous > 0) \
            .where(ProxyIP.https_weight > 0) \
示例#11
0
 def delete_ip(ip: str):
     """Delete every ProxyIP row whose ip column equals *ip*."""
     query = ProxyIP.delete().where(ProxyIP.ip == ip)
     query.execute()