Exemplo n.º 1
0
 def SaveProxy(self,proxies):
     """Persist a batch of proxies in one pass and return all stored rows.

     Bug fix: the original built a brand-new MysqlClient() for every proxy
     inside the loop (and one more for the final all() call), defeating the
     batching this method exists for. A single client is now reused.
     Failed inserts are skipped so one bad proxy cannot abort the batch.
     """
     client = MysqlClient()
     for proxy in proxies:
         try:
             client.add(proxy)
         except Exception:
             # Best-effort batch save: skip proxies that fail to insert.
             continue
         print("------------------")
     # Reset the in-memory buffer now that it has been flushed to MySQL.
     self.proxies = []
     return client.all()
Exemplo n.º 2
0
def start():
    """Run the crawler synchronously (no coroutines): pop domains until empty."""
    redis_client = RedisClient('url', '127.0.0.1', None)
    mysql_client = MysqlClient()
    crawler = GetIpPv(redis_client, mysql_client)
    # Keep pulling domains while the queue reports remaining work.
    while crawler.get_num():
        current_domain = crawler.get_domain()
        print(crawler.get_result(current_domain))
Exemplo n.º 3
0
def start_coro():
    """Run the crawler on the asyncio event loop (coroutine version).

    Doc fix: the original docstring said "non-coroutine start" — copied
    from start() — but this function drives the async download() coroutine.
    """
    rds = RedisClient('url', '127.0.0.1', None)
    my = MysqlClient()
    ip_pv = GetIpPv(rds, my)
    event_loop = asyncio.get_event_loop()
    try:
        event_loop.run_until_complete(ip_pv.download())
    finally:
        # Close the loop even if the crawl raises.
        event_loop.close()
Exemplo n.º 4
0
class Tester():
    """Probe stored proxies concurrently and adjust their scores in MySQL."""

    def __init__(self):
        # Only HTTP 200 counts as a working proxy.
        self.VALID_STATUS_CODES = [200]
        self.TEST_URL = "https://www.baidu.com"
        self.BATCH_TEST_SIZE = 100
        self.mysql = MysqlClient()

    async def single_proxy_handler(self,proxy):
        """Fetch TEST_URL through *proxy* and raise/lower its score accordingly."""
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                # Proxies read from storage may arrive as bytes.
                if isinstance(proxy,bytes):
                    proxy = proxy.decode("utf-8")
                real_proxy = 'http://' + proxy
                print("正在测试",proxy)
                async with session.get(self.TEST_URL,proxy=real_proxy,timeout=15) as response:
                    if response.status in self.VALID_STATUS_CODES:
                        # Working proxy: restore it to the maximum score.
                        self.mysql.max(proxy)
                        print("代理可用",proxy)
                    else:
                        self.mysql.decrease(proxy)
                        print("请求响应码不合法",proxy)
            except Exception:
                # Timeout / connection failure: penalise the proxy.
                self.mysql.decrease(proxy)
                print("代理请求失败", proxy)

    def run(self):
        """Fetch every stored proxy and test them in batches of BATCH_TEST_SIZE."""
        try:
            proxy_list = self.mysql.all()
            loop = asyncio.get_event_loop()  # build the event loop
            # Bug fix: the step was hard-coded to 100 while the slice used
            # BATCH_TEST_SIZE — use the attribute for both so they stay in sync.
            for i in range(0, len(proxy_list), self.BATCH_TEST_SIZE):
                test_proxies = proxy_list[i:i+self.BATCH_TEST_SIZE]
                tasks = [self.single_proxy_handler(proxy[0]) for proxy in test_proxies]  # build task list
                # Bug fix: asyncio.wait() no longer accepts bare coroutines
                # (removed in Python 3.11); gather() schedules them directly.
                loop.run_until_complete(asyncio.gather(*tasks))
                # Pause between batches to avoid hammering the test endpoint.
                time.sleep(5)
        except Exception:
            print("测试错误")
Exemplo n.º 5
0
    def get_domain(self):
        """Pop and return the next domain from the Redis-backed queue."""
        queue = self.redis_db
        return queue.pop()

    def get_rest_domain_num(self):
        """Return how many domains are still waiting in the queue."""
        queue = self.redis_db
        return queue.get_num()

    def save(self, text):
        """Append *text* to title.txt, creating the file if it does not exist."""
        with open('title.txt', 'a+') as out:
            out.write(text)

    def download(self):
        """Crawl every queued domain, writing 'url;title' lines via save().

        On any failure the URL is still recorded, with an empty result.
        """
        while self.get_rest_domain_num():
            domain = self.get_domain()
            logging.info('req ' + domain)
            try:
                resp = self.get_page(domain)
                # Let requests guess the real encoding before parsing.
                resp.encoding = resp.apparent_encoding
                logging.info(resp.status_code)
                parsed = self.parse(resp)
                self.save(domain + ';' + parsed + '\n')
            except Exception:
                self.save(domain + ';\n')


if __name__ == '__main__':
    # Wire the crawler to its Redis queue and MySQL store, then run it.
    redis_client = RedisClient('url', '127.0.0.1', None)
    mysql_client = MysqlClient()
    crawler = ThemeCrawler(redis_client, mysql_client)
    crawler.download()
Exemplo n.º 6
0
 def get_proxy(self):
     """Return one random proxy from MySQL.

     Bug fix: the original called MysqlClient.random() on the class itself;
     every other use of MysqlClient in this file works on an instance, so the
     unbound call would raise TypeError unless random() is a classmethod —
     assumed here to be an instance method (confirm against MysqlClient).
     """
     return MysqlClient().random()
Exemplo n.º 7
0
 def Count(self):
     """Return the number of proxies currently stored in MySQL."""
     client = MysqlClient()
     return client.count()
Exemplo n.º 8
0
def get_conn():
    """Return a MysqlClient cached on Flask's per-request ``g`` object.

    Bug fix: the original tested ``hasattr(g, 'proxiespool')`` but stored the
    client as ``g.mysql``, so the check never passed and a brand-new client
    was constructed on every call. Check the attribute that is actually set.
    """
    if not hasattr(g, 'mysql'):
        g.mysql = MysqlClient()
    return g.mysql
Exemplo n.º 9
0
 def __init__(self):
     """Set up the MySQL connection and the proxy-test configuration."""
     self.mysql = MysqlClient()
     # Only HTTP 200 is treated as a successful probe.
     self.VALID_STATUS_CODES = [200]
     self.TEST_URL = "https://www.baidu.com"
     self.BATCH_TEST_SIZE = 100