def run(self):
    """Crawl the paginated proxy listing and store verified proxies.

    Fetches BASE_URL to learn the page count, then walks pages
    ``1 .. page_count - 1``.  Every proxy whose ``anonymous_type`` is
    '高匿' (high anonymity) is tested with ``check_anonymous``; the ones
    that pass get ``delay_time``, ``created_time`` and ``is_in_china``
    filled in and are handed to ``add_proxy`` on a fresh ``ProxyManager``.

    Returns None.
    """
    def now_text():
        # Local time truncated to whole seconds: 'YYYY-MM-DD HH:MM:SS'.
        return str(datetime.now().replace(microsecond=0))

    # A single parenthesised argument prints the same text whether
    # ``print`` is the statement or the function.
    print('%s %s' % (now_text(), 'time to spider start!'))

    manager = ProxyManager()
    # The index page is transcoded from GBK to UTF-8 before it is parsed.
    index_html = unicode(get_html(BASE_URL), 'GBK').encode('UTF-8')
    total_pages = self.get_page_count(index_html)
    print('%s %s %s' % (now_text(), 'get page count:', total_pages))

    local_ip = get_default_ip()
    if total_pages == 0:
        return

    # Most recently accepted proxy; handed to get_html for the next page
    # (presumably so that page is fetched through it -- confirm in get_html).
    via_proxy = None
    for page_no in xrange(1, total_pages):
        listing = get_html(URL_HEADER + str(page_no) + URL_END, via_proxy)
        for candidate in filte(listing):
            if candidate.anonymous_type != '高匿':
                continue
            verdict = check_anonymous(candidate, local_ip)
            # Taken right after the check, before the verdict is inspected.
            checked_at = now_text()
            if not verdict[0]:
                continue
            candidate.delay_time = verdict[1]
            candidate.created_time = now_text()
            # NOTE(review): meaning of 2 is not visible here -- confirm.
            candidate.is_in_china = 2
            manager.add_proxy(candidate, checked_at)
            via_proxy = candidate
def run(self):
    """Scrape the proxy listing page by page and keep the anonymous ones.

    The page count is read from BASE_URL (transcoded GBK -> UTF-8), after
    which pages ``1 .. page_count - 1`` are fetched.  Proxies labelled
    '高匿' (high anonymity) are run through ``check_anonymous``; each one
    that passes is stamped and stored through ``ProxyManager.add_proxy``.

    Returns None.
    """
    stamp = '%Y-%m-%d %H:%M:%S'  # second-resolution timestamp layout

    # One parenthesised argument keeps the output identical under both the
    # print statement and the print function.
    print('%s %s' % (datetime.now().strftime(stamp),
                     'time to spider start!'))

    store = ProxyManager()
    raw_index = get_html(BASE_URL)
    utf8_index = unicode(raw_index, 'GBK').encode('UTF-8')
    pages = self.get_page_count(utf8_index)
    print('%s %s %s' % (datetime.now().strftime(stamp),
                        'get page count:', pages))

    own_ip = get_default_ip()
    if pages != 0:
        # Last proxy that passed the check; forwarded to get_html on the
        # following request (presumably used as the outgoing proxy).
        previous = None
        for number in xrange(1, pages):
            html = get_html(URL_HEADER + str(number) + URL_END, previous)
            high_anonymity = (item for item in filte(html)
                              if item.anonymous_type == '高匿')
            for item in high_anonymity:
                outcome = check_anonymous(item, own_ip)
                # Recorded before looking at the outcome.
                spidered_at = datetime.now().strftime(stamp)
                if outcome[0]:
                    item.delay_time = outcome[1]
                    item.created_time = datetime.now().strftime(stamp)
                    # NOTE(review): meaning of 2 is not shown here.
                    item.is_in_china = 2
                    store.add_proxy(item, spidered_at)
                    previous = item
def run(self):
    """Crawl every listing in ``self.urls`` and store verified proxies.

    For each URL the page count is read from the URL itself, then pages
    ``url + '1' .. url + str(page_count - 1)`` are fetched.  Proxies whose
    ``anonymous_type`` is '高匿' (high anonymity) are tested with
    ``check_anonymous``; the ones that pass get ``delay_time``,
    ``created_time`` and ``is_in_china`` set and are passed to
    ``add_proxy`` on a ``ProxyManager``.

    ``is_in_china`` is 1 when the listing URL ends with CHINA_ANONYMOUS or
    CHINA_NORMAL, otherwise 0.

    Returns None.
    """
    def now_text():
        # Local time truncated to whole seconds: 'YYYY-MM-DD HH:MM:SS'.
        return str(datetime.now().replace(microsecond=0))

    # A single parenthesised argument prints the same text whether
    # ``print`` is the statement or the function.
    print('%s %s' % (now_text(), 'time to spider start!'))
    proxy_manager = ProxyManager()

    for url in self.urls:
        page_count = self.get_page_count(get_html(url))
        print('%s %s %s' % (now_text(), 'get page count:', page_count))
        default_ip = get_default_ip()

        # The China flag depends only on the listing URL, so it is worked
        # out once per URL instead of once per accepted proxy.
        if url.endswith((CHINA_ANONYMOUS, CHINA_NORMAL)):
            in_china = 1
        else:
            in_china = 0

        for i in xrange(1, page_count):
            for proxy in filte(get_html(url + str(i))):
                if proxy.anonymous_type != '高匿':
                    continue
                check_result = check_anonymous(proxy, default_ip)
                # Taken right after the check, before the result is read.
                spider_time = now_text()
                if not check_result[0]:
                    continue
                proxy.delay_time = check_result[1]
                proxy.created_time = now_text()
                proxy.is_in_china = in_china
                proxy_manager.add_proxy(proxy, spider_time)
def run(self):
    """Scrape each URL in ``self.urls`` and keep the anonymous proxies.

    Per URL: read the page count from the URL, fetch pages
    ``1 .. page_count - 1`` (the page number is appended to the URL),
    and run every proxy labelled '高匿' (high anonymity) through
    ``check_anonymous``.  A proxy that passes is stamped with its delay
    and creation time, flagged with ``is_in_china`` (1 if the URL ends
    with CHINA_ANONYMOUS or CHINA_NORMAL, else 0) and stored through
    ``ProxyManager.add_proxy``.

    Returns None.
    """
    stamp = '%Y-%m-%d %H:%M:%S'  # second-resolution timestamp layout
    china_suffixes = (CHINA_ANONYMOUS, CHINA_NORMAL)

    # One parenthesised argument keeps the output identical under both the
    # print statement and the print function.
    print('%s %s' % (datetime.now().strftime(stamp),
                     'time to spider start!'))
    proxy_manager = ProxyManager()

    for url in self.urls:
        page = get_html(url)
        page_count = self.get_page_count(page)
        print('%s %s %s' % (datetime.now().strftime(stamp),
                            'get page count:', page_count))
        default_ip = get_default_ip()
        # Same answer for every proxy found under this URL, so compute it
        # once here rather than inside the innermost loop.
        in_china = 1 if url.endswith(china_suffixes) else 0

        for page_no in xrange(1, page_count):
            page = get_html(url + str(page_no))
            high_anonymity = (proxy for proxy in filte(page)
                              if proxy.anonymous_type == '高匿')
            for proxy in high_anonymity:
                check_result = check_anonymous(proxy, default_ip)
                # Recorded before looking at the check result.
                spider_time = datetime.now().strftime(stamp)
                if check_result[0]:
                    proxy.delay_time = check_result[1]
                    proxy.created_time = datetime.now().strftime(stamp)
                    proxy.is_in_china = in_china
                    proxy_manager.add_proxy(proxy, spider_time)
def check_all_proxy_anonymous(self):
    """Re-check every stored proxy and delete the ones that fail.

    Each row returned by ``self.db.query(Proxy)`` is passed to
    ``usefull_check.check_anonymous`` together with this machine's default
    IP.  When the first element of the result is falsy the proxy is
    removed with ``self.del_proxy(ip, port)``.  A line is printed for
    every proxy either way.

    Returns None.
    """
    own_ip = usefull_check.get_default_ip()
    started_at = str(datetime.now().replace(microsecond=0))
    # A single parenthesised argument prints the same text whether
    # ``print`` is the statement or the function.
    print('%s %s' % (started_at, 'start check all proxy anonymous!'))

    for entry in self.db.query(Proxy):
        verdict = usefull_check.check_anonymous(entry, own_ip)
        if verdict[0]:
            print('%s %s %s %s' % (entry.ip, ':', entry.port,
                                   ' is anonymous!'))
            continue
        self.del_proxy(entry.ip, entry.port)
        print('%s %s %s %s' % (entry.ip, ':', entry.port,
                               ' is not anonymous and delete it!'))