Пример #1
0
 def checkdbexist(self):
     """Create the SQLite proxy database and its ``proxy`` table if missing.

     When ``self.dbExists`` is falsy, a database is opened at ``self.dbPath``
     and the table is created with this schema::

         CREATE TABLE proxy(
             id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
             ipport TEXT NOT NULL,
             protocol TEXT NOT NULL,
             score INTEGER NOT NULL DEFAULT 100,
             UNIQUE(ipport, protocol))

     The ``UNIQUE(ipport, protocol)`` constraint rejects duplicate entries.

     :return: ``True`` when the database already existed or was created,
         ``False`` when creation failed.
     """
     if self.dbExists:
         # TODO: also check that the ``proxy`` table itself exists.
         return True

     try:
         print("create proxydb: " + proxyDbName + " and table: proxy ")
         conn = sqlite3.connect(self.dbPath)
         try:
             conn.execute(
                 '''CREATE TABLE proxy(id  INTEGER  PRIMARY KEY AUTOINCREMENT NOT NULL,ipport  TEXT NOT NULL ,protocol  TEXT NOT NULL, score INTEGER NOT NULL DEFAULT 100 ,UNIQUE(ipport, protocol))'''
             )
             conn.commit()
         finally:
             # Release the handle even when the CREATE statement fails.
             conn.close()
     except (IOError, sqlite3.Error):
         # sqlite3 reports failures as sqlite3.Error, which is not an
         # IOError subclass, so it has to be caught explicitly.
         cprint("no proxydb !!", color="red")
         return False
     else:
         cprint("pls get proxy ip to insert to the db ", color="red")
         return True
Пример #2
0
 def publicip(self):
     """Return the public IP address reported by the net.cn lookup page.

     The page is fetched over HTTP and the first dotted-quad found inside
     an ``<h2>`` element is returned.

     :return: the IP address as a string, or ``None`` when the request
         does not answer with status 200 or the page holds no address.
     :raises requests.RequestException: when the request itself fails,
         including the timeout expiring.
     """
     # Bound the wait so a dead endpoint cannot hang the caller forever;
     # 10 seconds matches the timeout used for the proxy checks.
     req = requests.get("http://www.net.cn/static/customercare/yourip.asp",
                        timeout=10)
     if req.status_code != 200:
         cprint("获取公网IP失败", color="red")
         return None

     find_ip = re.compile(r'<h2>((\d{1,3})(\.\d{1,3}){3})</h2>')
     match = find_ip.search(req.text)
     if match is None:
         # A 200 page without an address would otherwise raise IndexError.
         cprint("获取公网IP失败", color="red")
         return None
     return match.group(1)
Пример #3
0
 def run(self):
     """Run every registered crawl function and store the proxies found.

     For each index below ``self.crawler.__CrawlFuncCount__`` the matching
     entry of ``self.crawler.__CrawlFunc__`` is passed to
     ``self.crawler.get_proxies``; every returned proxy is converted to a
     list and handed to ``self.sqlite3.add``.
     """
     cprint('获取器开始执行')
     for index in range(self.crawler.__CrawlFuncCount__):
         crawl_name = self.crawler.__CrawlFunc__[index]
         # Collect the proxies produced by this crawl function.
         fetched = self.crawler.get_proxies(crawl_name)
         sys.stdout.flush()
         cprint("插入数据到sqlite3 proxy 表")
         for item in fetched:
             self.sqlite3.add(list(item))
Пример #4
0
 def random(self):
     """Return a randomly chosen high-score proxy from the database.

     Rows with ``score=100`` are preferred; when there are none, a random
     row with ``score>80`` is used instead.

     :return: an ``(ipport, protocol)`` row, or ``None`` when the database
         is not available, no row qualifies, or the query fails.
     """
     print(self.dbPath)
     if not self.dbCreated:
         cprint("no proxydb plse check...", color="red")
         return None

     # Bound before the try block so the cleanup below is safe even when
     # sqlite3.connect() itself raises.
     conn = None
     try:
         conn = sqlite3.connect(self.dbPath)
         cursor = conn.cursor()

         rows = cursor.execute(
             "select ipport,protocol from proxy where score=100").fetchall()
         if rows:
             print("打印获取的随机代理,score=100")
             return choice(rows)

         rows = cursor.execute(
             '''select  ipport,protocol from proxy where score>80'''
         ).fetchall()
         if rows:
             return choice(rows)

         cprint("no ip where score > 80 ", color="red")
     except Exception as ex:
         # Best effort: report the failure and fall through to None.
         print(type(ex))
         cprint("get random proxy  fail!!", color="red")
     finally:
         if conn is not None:
             conn.close()
     return None
Пример #5
0
    def add(self, proxy):
        """Insert one proxy into the ``proxy`` table.

        :param proxy: a two-item sequence ``(ipport, protocol)``, for
            example ``('114.114.114.114:8080', 'http')``.
        :return: ``True`` when the row was stored, ``False`` when the
            database is not available or the insert failed (for example
            because the same ``ipport``/``protocol`` pair already exists).
        """
        if not self.dbCreated:
            cprint("数据库不存在,请在setting检查数据库路径,并执行checkdbexist....", color="red")
            return False

        conn = None
        try:
            conn = sqlite3.connect(self.dbPath)
            conn.execute(
                'INSERT INTO proxy(ipport, protocol)  VALUES (?,?)',
                proxy)
            conn.commit()
        except Exception:
            # Best effort: duplicates and malformed rows are reported, not
            # raised, so a crawl run is not interrupted by one bad proxy.
            print("add ip  into db WRONG!!")
            return False
        finally:
            # Close on every path; a failed insert used to leak the handle.
            if conn is not None:
                conn.close()

        cprint("success: add proxy ip  into db !!")
        return True
Пример #6
0
 def run(self):
     """Verify every stored proxy in batches and update its score.

     The proxy count comes from ``self.sqlite.count()``. Proxies are read
     in slices of ``BATCH_TEST_SIZE`` through ``self.sqlite.batch``, with
     a one-second pause before each slice. Each proxy is checked with
     ``self.veriy``: a pass calls ``self.sqlite.max`` on its address, a
     failure calls ``self.sqlite.decrease``.
     """
     cprint("代理有效性验证开始")
     print('\n')
     total = self.sqlite.count()
     cprint("共有代理数量:" + str(total))
     for first in range(0, total, BATCH_TEST_SIZE):
         # The final slice may be shorter than BATCH_TEST_SIZE.
         last = min(first + BATCH_TEST_SIZE, total)
         print('正在测试第', first + 1, '-', last, '个代理')
         time.sleep(1)
         batch = self.sqlite.batch(first, last)
         print(batch)
         for entry in batch:
             passed = self.veriy(entry)
             address = entry[0]
             if not passed:
                 self.sqlite.decrease(address)
                 cprint(str(address) + "分数降级", color="red")
                 continue
             cprint(str(address) + "可用")
             self.sqlite.max(address)
Пример #7
0
 def veriy(self, proxy):
     """Check whether a proxy hides this machine's public IP.

     ``TEST_URL`` is requested through the proxy and the IP address found
     inside the ``<h2>`` element of the response is compared with
     ``self.public_ip``.

     :param proxy: sequence whose first item is the ``host:port`` string
         and whose second item is the protocol name; ``"http"``
         (case-insensitive) selects the HTTP proxy mapping, anything else
         the HTTPS one.
     :return: ``True`` when the reported address differs from
         ``self.public_ip``; ``False`` when it matches, when the response
         is not status 200 or contains no address, or when the request
         fails.
     """
     cprint("正在验证" + str(proxy[0]))
     proxy_url = "http://" + proxy[0]
     headers = {
         'user-agent':
         'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
         'Referer': 'https://www.google.com/'
     }
     time.sleep(0.3)
     try:
         if str(proxy[1]).lower() == "http":
             proxies = {"http": proxy_url}
             cprint("测试的代理ip" + str(proxies))
             status_error = "代理测试公网IP失败获取公网IP失败"
         else:
             proxies = {"https": proxy_url}
             cprint(proxies)
             status_error = "代理测试公网IP失败"

         response = requests.get(TEST_URL,
                                 proxies=proxies,
                                 timeout=10,
                                 headers=headers)
         if response.status_code != 200:
             # Fail explicitly; this case used to reach False only through
             # a NameError swallowed by the handler below.
             cprint(status_error, color="red")
             return False

         match = re.search(r'<h2>((\d{1,3})(\.\d{1,3}){3})</h2>',
                           response.text)
         if match is None:
             cprint("代理有效性测试失败", color="red")
             return False

         # The proxy is useful only if the remote side sees another address.
         return self.public_ip != match.group(1)
     except Exception:
         # Best effort: any request failure marks the proxy as unusable so
         # one bad proxy cannot abort the whole validation run.
         cprint("代理有效性测试失败", color="red")
         return False
Пример #8
0
 def get_proxies(self, callback):
     """Call the crawl method named ``callback`` and collect its proxies.

     :param callback: name of a method on this object that takes no
         arguments and returns an iterable of proxies.
     :return: list of the proxies produced, in iteration order.
     :raises AttributeError: when no attribute named ``callback`` exists.
     """
     # Look the method up by name instead of eval()-ing a formatted string:
     # same dispatch for a method name, but the string is never run as code.
     crawl = getattr(self, callback)
     proxies = []
     for proxy in crawl():
         cprint("成功获取到代理:" + str(proxy))
         proxies.append(proxy)
     return proxies