Exemplo n.º 1
0
    def checkout(self, host, port):
        """Probe the proxy at ``host:port`` with a test request.

        Returns the measured response time in seconds (rounded to 2 dp) when
        the proxy answers with an OK status; otherwise deletes the proxy from
        storage (best-effort) and returns None.
        """
        proxy_url = '{}:{}'.format(host, port)
        proxies = {'http': proxy_url, 'https': proxy_url}
        start = time.time()
        try:
            r = requests.get(url='https://www.baidu.com',
                             headers=get_header(),
                             timeout=TIMEOUT,
                             proxies=proxies)
            # Let chardet pick the encoding so r.text decodes correctly.
            r.encoding = chardet.detect(r.content)['encoding']
            if r.ok:
                return round(time.time() - start, 2)
        except Exception:
            # Timeout / connection error / bad proxy: fall through to removal.
            pass
        # The request failed or the status was not OK: drop the dead proxy.
        # Deletion is best-effort; a storage error must not break the check.
        try:
            get_sqlhelper(self.proxy_type).delete({'ip': host, 'port': port})
        except Exception:
            pass
        # Implicit None signals "proxy unusable" to the caller.
Exemplo n.º 2
0
    def run(self):
        """Collector main loop.

        Keeps the pool topped up: while the stored count is below the
        configured minimum, crawls every configured source with a bounded
        number of gevent greenlets; once the target is met, sleeps for the
        configured update interval. Runs forever.
        """
        while True:
            sqlhelper = get_sqlhelper(self.proxy_type)
            gather_count = sqlhelper.get_total()
            # Hoist the repeated config lookup for this proxy type.
            type_conf = PROXY_CONF['proxy_type'][self.proxy_type]
            if gather_count < type_conf['min_num']:
                msg = "----->>>>>>ProxyPool starting"
                sys.stdout.write(msg + '\r\n')
                sys.stdout.flush()
                # Not enough IPs: crawl every configured source.
                target_list = type_conf['parse_list']
                spawns = []
                for proxy in target_list:
                    spawns.append(gevent.spawn(self.crawl, proxy))
                    # Cap the number of concurrently running greenlets.
                    if len(spawns) >= type_conf['max_download_num']:
                        gevent.joinall(spawns)
                        spawns = []
                gevent.joinall(spawns)
            else:
                msg = '\r\nIPProxyPool----->>>>>>>>now ip num meet the requirement,wait UPDATE_TIME...'
                sys.stdout.write(msg + "\r\n")
                sys.stdout.flush()
                time.sleep(type_conf['update_time'])
Exemplo n.º 3
0
    def re_verificate(self, proxy_type):
        """Re-verify stored proxies using a bounded pool of worker processes.

        Finished workers announce their pid on ``self.cntl_q`` and are reaped
        here. Runs forever.

        NOTE(review): config lookups below use ``self.proxy_type`` while the
        proxy list is selected with the *proxy_type* parameter — looks
        inconsistent, preserved as-is; confirm against callers.
        """
        sqlhelper = get_sqlhelper(proxy_type)
        # Snapshot of proxies to check; never refreshed inside the loop.
        proxy_list = sqlhelper.select()
        msg = ">>>>>>>>Verificating Proxies Now,Please Ignor This Message>>>>>>>>>>\n"
        sys.stdout.write(msg)
        sys.stdout.flush()
        while 1:
            if not proxy_list:
                # Nothing to verify: just slow the loop down.
                time.sleep(5)
            if not self.cntl_q.empty():
                # Reap a worker process that reported itself finished.
                try:
                    pid = self.cntl_q.get()
                    self.proc_pool.pop(pid)
                    proc_ps = psutil.Process(pid)
                    proc_ps.kill()
                    proc_ps.wait()
                except Exception:
                    # Process may already be gone; cleanup is best-effort.
                    pass
            # Wait until there is room for one more checker process.
            while 1:
                if len(self.proc_pool) >= PROXY_CONF['proxy_type'][
                        self.proxy_type]['max_check_process_num']:
                    time.sleep(PROXY_CONF['proxy_type'][self.proxy_type]
                               ['check_wait_time'])
                    continue
                else:
                    p = Process(target=self.process_start, args=(proxy_list, ))
                    p.start()
                    self.proc_pool[p.pid] = p
                    break
Exemplo n.º 4
0
 def del_proxy(self, name, proxies):
     """Delete a dead proxy from storage.

     *proxies* is a requests-style dict such as ``{'http': 'http:ip:port'}``.
     The pool builds URLs as ``'http:{ip}:{port}'`` (no ``//``), but a
     standard ``'http://ip:port'`` URL is accepted too — the original
     ``split(':')`` parse crashed on the standard form (``int('//ip')``).
     Non-dict input is ignored, as before.
     """
     if not isinstance(proxies, dict):
         return
     proxy_type = SPIDER_CONF[name]['proxy_type']
     sqlhelper = get_sqlhelper(proxy_type)
     url = proxies['http']
     # Port is everything after the last ':'; the host sits between the
     # scheme and the port, with any '//' stripped.
     addr, _, port = url.rpartition(':')
     ip = addr.split(':', 1)[1].lstrip('/')
     sqlhelper.delete({'ip': ip, 'port': int(port)})
Exemplo n.º 5
0
    def crawl(self, parser):
        """Crawl one proxy source described by *parser*, verify each scraped
        proxy and insert the working ones into storage.

        For the special ``'dedicated'`` pool, ``parse_list`` already contains
        proxies, so they are verified and inserted directly without crawling.
        """
        sqlhelper = get_sqlhelper(self.proxy_type)
        if self.proxy_type != 'dedicated':
            html_parser = Html_Parser()
            for url in parser['urls']:
                response = Html_Downloader.download(url, self.proxy_type)
                if response is None:
                    continue
                proxylist = html_parser.parse(response, parser)
                if proxylist is None:
                    continue
                for proxy in proxylist:
                    speed = Verificate(self.proxy_type).checkout(
                        proxy['ip'], proxy['port'])
                    if speed is not None:
                        msg = '\r\n----->>>>>>>> ip is useful'
                        sys.stdout.write(msg + "\r\n")
                        sys.stdout.flush()
                        proxy['speed'] = speed
                        sqlhelper.insert(proxy)
                    # Throttle: while the pool already holds enough proxies,
                    # sleep for the update interval before continuing.
                    while 1:
                        sqlhelper = get_sqlhelper(self.proxy_type)
                        count = sqlhelper.get_total()
                        if count > PROXY_CONF['proxy_type'][
                                self.proxy_type]['min_num']:
                            msg = '\r\nIPProxyPool----->>>>>>>>now ip num meet the requirement,wait UPDATE_TIME...'
                            sys.stdout.write(msg + "\r\n")
                            sys.stdout.flush()
                            time.sleep(PROXY_CONF['proxy_type'][
                                self.proxy_type]['update_time'])
                        else:
                            break
        else:
            for proxy in PROXY_CONF['proxy_type'][
                    self.proxy_type]['parse_list']:
                speed = Verificate(self.proxy_type).checkout(
                    proxy['ip'], proxy['port'])
                # 'is not None' for consistency with the branch above: a
                # measured speed of 0.0 is still a usable proxy (the original
                # truthiness test silently dropped it here).
                if speed is not None:
                    proxy['speed'] = speed
                    sqlhelper.insert(proxy)
Exemplo n.º 6
0
 def get_proxy(self, proxy_type):
     """Return a random usable proxy as a requests-style proxies dict.

     Polls storage every 2 seconds until a proxy is available; sends a
     one-shot alarm e-mail when the wait exceeds ALARM_TIME.

     Fixes the original retry path, which recursed via
     ``self.get_proxy(proxy_type)`` but discarded the result — the caller
     got None even once a proxy appeared — and could overflow the stack
     during long starvation. Rewritten as a loop that returns the proxy.
     """
     while True:
         sqlhelper = get_sqlhelper(proxy_type)
         rows = sqlhelper.select()
         if rows:
             row = random.choice(rows)
             # NOTE: format kept as 'http:ip:port' (no '//') on purpose --
             # del_proxy parses this exact shape.
             addr = 'http:{}:{}'.format(row[0], row[1])
             return {'http': addr, 'https': addr}
         msg = "Acquiring Proxies Now,Please Wait For A Little Time--->>>>>>"
         sys.stdout.write(msg + '\r\n')
         sys.stdout.flush()
         stop_time = time.time()
         # Alert the operator once if starved past the configured alarm time.
         if convert_seconds(self.now, stop_time) > ALARM_TIME and self.flag:
             SendEmail().send_email('代理警报', '超出设定时间,无可用代理,请处理')
             self.flag = 0
         time.sleep(2)