def check(self, amount=None):
    """
    Check whether the stored proxy addresses are alive.

    :param amount: if given, only check proxies with id <= amount
    :return: None
    """
    # TODO: switch to multi-threaded checking
    if amount:
        proxy_list = self.session.query(Proxy).filter(Proxy.id <= amount).all()
    else:
        proxy_list = self.session.query(Proxy).all()
    for proxy in proxy_list:
        logger.info("Testing %s:%s" % (proxy.ip, proxy.port))
        success, elapsed = self.__check_proxy(proxy.ip, proxy.port)
        logger.debug("Time: %s Success: %s" % (elapsed, success))
        # Update the database record; proxy is already bound to this
        # session, so there is no need to query it again by id.
        proxy.times = elapsed
        proxy.updated_time = datetime.datetime.now()
        if success:
            proxy.is_alive = 1
        self.session.add(proxy)
        self.session.commit()
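# The TODO above asks for the proxy check to be parallelized. Below is a
# minimal sketch of one way to do that, not the project's implementation:
# it assumes it is pasted into the same class (so self.__check_proxy
# resolves), that __check_proxy is thread-safe, and that the project
# exposes a per-thread session factory (called Session here purely for
# illustration), since one SQLAlchemy session must not be shared across
# threads.
import threading

try:
    import Queue as queue  # Python 2, matching the rest of the codebase
except ImportError:
    import queue            # Python 3


def check_threaded(self, amount=None, workers=8):
    """Hypothetical multi-threaded variant of check()."""
    tasks = queue.Queue()
    query = self.session.query(Proxy)
    if amount:
        query = query.filter(Proxy.id <= amount)
    for proxy in query.all():
        tasks.put((proxy.id, proxy.ip, proxy.port))

    def worker():
        session = Session()  # assumed per-thread session factory
        while True:
            try:
                proxy_id, ip, port = tasks.get_nowait()
            except queue.Empty:
                return
            success, elapsed = self.__check_proxy(ip, port)
            item = session.query(Proxy).get(proxy_id)
            item.times = elapsed
            item.updated_time = datetime.datetime.now()
            if success:
                item.is_alive = 1
            session.commit()

    threads = [threading.Thread(target=worker) for _ in range(workers)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()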
def start(self):
    if not self.al.spiders:
        logger.error("No spiders loaded. Exiting.")
        sys.exit(1)
    else:
        logger.info("Loaded spiders: " +
                    ", ".join(s.__class__.__name__ for s in self.al.spiders))
    # Create the spider thread pool
    if self.spider_threads:
        self.tp = ThreadPool(self.spider_threads)
    else:
        self.tp = ThreadPool()
    for sp in self.al.spiders:
        # Register each spider's run method with the pool
        self.tp.add_function(sp.run)
    # Start the pool without blocking
    self.tp.run(join=False)
    # Persist the results
    self.sd = SaveData(self.al.results, self.tp,
                       use_file=self.output_file,
                       use_database=self.output_db,
                       filename=self.output_filename)
    if self.save_data_threads:
        self.write_file_tp = ThreadPool(self.save_data_threads)
    else:
        self.write_file_tp = ThreadPool()
    self.write_file_tp.add_function(self.sd.write)
    self.write_file_tp.run()
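# start() drives a project-specific ThreadPool through add_function() and
# run(join=...). The sketch below is inferred from those calls alone; the
# real class likely bounds concurrency at thread_count, whereas this
# illustration simply runs one thread per registered function.
import threading
from multiprocessing import cpu_count


class IllustrativeThreadPool(object):
    """Minimal stand-in matching the ThreadPool interface used above."""

    def __init__(self, thread_count=None):
        self.thread_count = thread_count or cpu_count() * 2
        self.functions = []
        self.threads = []

    def add_function(self, func):
        self.functions.append(func)

    def run(self, join=True):
        # Start one thread per registered function; with join=False the
        # caller continues while the workers run in the background.
        for func in self.functions:
            t = threading.Thread(target=func)
            t.start()
            self.threads.append(t)
        if join:
            for t in self.threads:
                t.join()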
def _start(self, target):
    logger.debug("start spider " + target[0])
    deep = target[1]
    target = target[0]
    # Pick a random PhantomJS process from the pool
    phantomjs_tag = random.randint(0, self.phantomjs_count - 1)
    self.driver_pool_lock[phantomjs_tag].acquire()
    retry_times = 2
    while retry_times:
        try:
            self.driver_pool[phantomjs_tag].get(target)
            break
        except Exception:
            logger.error("retry %d" % retry_times)
            retry_times -= 1
            if not retry_times:
                logger.warning("Timed out fetching HTML for %s" % target)
                self.driver_pool_lock[phantomjs_tag].release()
                return
    # Fetch the page HTML
    raw_html = self.driver_pool[phantomjs_tag].execute_script(
        "return document.getElementsByTagName('html')[0].innerHTML")
    # Collect the HTTP requests made while the page loaded (HAR log)
    http_log = json.loads(self.driver_pool[phantomjs_tag].get_log("har")[0]
                          ["message"])["log"]["entries"]
    # Current page URL (after any redirects)
    base_url = self.driver_pool[phantomjs_tag].current_url
    # Release the lock on this PhantomJS process
    self.driver_pool_lock[phantomjs_tag].release()

    soup = BeautifulSoup(raw_html, "html5lib")
    logger.debug("Get %s HTML done. Deep: %s" % (target, deep))
    # Process the href attributes found in the page
    for a in soup.find_all("a", href=True):
        url = a['href'].strip()
        # Skip entries that are not real URLs
        if url.startswith('javascript:') or url.startswith('#') or not url:
            continue
        elif not url.startswith(('http://', 'https://')):
            # Resolve relative paths against the current page URL
            url = urlparse.urljoin(base_url, url)
        self.check_same_url(url, deep, self.filter_similar)
    # Process the requests triggered while loading the page
    for log in http_log:
        url = log['request']['url']
        logger.info(url)
        self.check_same_url(url, deep, self.filter_similar)

    logger.debug("".join(["Raw links: ", str(self.raw_links_num)]))
    logger.debug("".join(["Filter links: ", str(self.filter_links_num)]))
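# For reference: urlparse.urljoin resolves relative href values against
# the page that contained them, which is why base_url is read from the
# driver (after any redirects) before the lock is released.
try:
    import urlparse                      # Python 2, as used in this module
except ImportError:
    import urllib.parse as urlparse      # Python 3 fallback

base = "http://example.com/a/index.html"
print(urlparse.urljoin(base, "page.html"))         # http://example.com/a/page.html
print(urlparse.urljoin(base, "/login"))            # http://example.com/login
print(urlparse.urljoin(base, "../img/x.png"))      # http://example.com/img/x.png
print(urlparse.urljoin(base, "http://other.com"))  # absolute URLs pass through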
def __write_file(self, res):
    # res is a (meta, rows) pair: meta carries the source URL, rows hold
    # the scraped proxy records. Write the URL, a CSV header, then one
    # CSV row per proxy.
    url = res[0].get('url')
    self.ff.write(url + "\n")
    self.ff.write("ip,port,type,protocol,location,time(s)\n")
    logger.info("[*] url: " + url)
    for r in res[1]:
        line = ",".join([
            r.get('ip', 'None'), r.get('port', 'None'),
            r.get('type', 'None'), r.get('protocol', 'None'),
            r.get('location', 'None'), r.get('time', 'None'),
        ])
        logger.info("[*] " + line)
        self.ff.write((line + "\n").encode("utf8"))
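# A hedged illustration of the res structure __write_file appears to
# expect, reconstructed purely from the keys it reads; the actual shape
# is defined wherever the spiders enqueue their results.
example_res = (
    {"url": "http://www.example-proxy-list.com/"},  # hypothetical source page
    [
        {"ip": "1.2.3.4", "port": "8080", "type": "high-anonymous",
         "protocol": "http", "location": "CN", "time": "0.35"},
        {"ip": "5.6.7.8", "port": "3128", "type": "transparent",
         "protocol": "https", "location": "US", "time": "1.02"},
    ],
)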
def check_same_url(self, url, deep, filter_similar):
    # Skip URLs whose suffix marks a static resource (images, fonts, CSS)
    url_st = urlparse.urlparse(url)
    suffix = url_st.path.split(".")[-1]
    if suffix.lower() in ["jpg", "png", "gif", "jpeg", "bmp", "css", "ttf"]:
        return
    self.raw_links_num += 1
    # First check that the domain is within the target scope
    if self.check_domain_limit(url):
        # In scope: check the parameter signature.
        # If it is already in the set, a page with a similar parameter
        # pattern was crawled before, so return immediately.
        # Otherwise record it in the set and keep processing.
        formatted_url = self.format_url_param(url)
        if formatted_url is not None:
            if formatted_url not in self.url_param_set:
                self.url_param_set.add(formatted_url)
            else:
                return
        # Normalize the URL
        r, suffix = self.format_url(url)
        if suffix:
            # Has a suffix, i.e. a regular page; honor the similarity filter
            if filter_similar and (r not in self.url_set):
                self.filter_links_num += 1
                self.url_set.add(r)
                self.links.append(url)
                logger.info(url)
                if deep + 1 <= self.deep:
                    self.task_queue.put((url, deep + 1))
            elif not filter_similar and (url not in self.links):
                self.filter_links_num += 1
                self.links.append(url)
                logger.info(url)
                if deep + 1 <= self.deep:
                    self.task_queue.put((url, deep + 1))
        else:
            # No suffix, i.e. a directory: de-duplicate exactly,
            # without similarity filtering
            if url not in self.links:
                self.filter_links_num += 1
                self.links.append(url)
                logger.info(url)
                if deep + 1 <= self.deep:
                    self.task_queue.put((url, deep + 1))
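# format_url_param is not shown in this file. The sketch below is one
# plausible reading of the comments above -- treat URLs that differ only
# in query values as "similar" -- and is an assumption, not the project's
# actual implementation.
def format_url_param_sketch(url):
    """Hypothetical: collapse a URL to scheme://netloc/path?sorted-param-names."""
    parts = urlparse.urlparse(url)
    if not parts.query:
        return None  # no parameters, nothing to de-duplicate on
    # Keep parameter names but drop their values, so /item?id=1 and
    # /item?id=2 produce the same signature.
    names = sorted(pair.split("=")[0] for pair in parts.query.split("&"))
    return "%s://%s%s?%s" % (parts.scheme, parts.netloc, parts.path,
                             ",".join(names))

# format_url_param_sketch("http://example.com/item?id=1&page=2")
# -> "http://example.com/item?id,page"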
def terminate(self):
    # Signal the worker loop to exit on its next iteration
    self._exit = True
    logger.info("Sent exit signal to WebClicker. Waiting for it to quit.")
def __init__(self, target, deep=1, limit_domain=None,
             thread_count=cpu_count() * 2, phantomjs_count=cpu_count(),
             filter_similar=False):
    # Set the PhantomJS path
    SpiderBase.__init__(self)
    SpiderBase.set_phantomjs_path(self)
    # Store parameters
    self.target = target
    self.deep = deep
    if limit_domain:
        self.limit_domain = limit_domain
    else:
        # Default to the full host of the target,
        # e.g. "www.example.com" for http://www.example.com/
        self.limit_domain = ".".join(tldextract.extract(self.target))
    self.thread_count = thread_count
    self.phantomjs_count = phantomjs_count
    self.filter_similar = filter_similar
    # Sets used for de-duplication
    self.url_set = set()
    self.url_param_set = set()
    # List of crawl results
    self.links = list()
    # Queue of pages waiting to be crawled
    self.task_queue = Queue.Queue()
    self.spider_pool = None
    # Seed the queue with the initial target
    self.task_queue.put((self.target, 0))
    # Statistics
    self.raw_links_num = 0
    self.filter_links_num = 0
    self.links_num = 0

    # Initialize webdriver
    # Note: these dcap settings appear to have no effect
    self.dcap = dict(DesiredCapabilities.PHANTOMJS)
    self.dcap["phantomjs.page.settings.resourceTimeout"] = 10
    self.dcap["phantomjs.page.settings.loadImages"] = False
    self.service_args = [
        "--webdriver-loglevel=DEBUG",
        "--webdriver-logfile=phantomjs.log",
        "--load-images=no",
        "--disk-cache=true"
    ]
    # Pool of PhantomJS webdriver processes
    logger.info("initializing web spider PhantomJS process pool...")
    self.driver_pool = list()
    self.driver_pool_lock = list()
    for i in range(self.phantomjs_count):
        self.driver_pool.append(
            webdriver.PhantomJS(executable_path=self.phantomjs_path,
                                desired_capabilities=self.dcap,
                                service_args=self.service_args))
        self.driver_pool_lock.append(threading.Lock())
        logger.info("%.2f%% finished." %
                    ((float(i + 1) * 100) / float(self.phantomjs_count)))
    logger.info("initialization finished.")
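# How the default limit_domain is derived: tldextract splits a URL into
# (subdomain, domain, suffix), and joining the triple with "." rebuilds
# the host. Output shown for the tldextract releases contemporary with
# this Python 2 code, where ExtractResult is a plain namedtuple. Note the
# leading dot when the subdomain is empty, which check_domain_limit
# presumably tolerates.
import tldextract

ext = tldextract.extract("http://www.example.com/path")
# ext.subdomain == "www", ext.domain == "example", ext.suffix == "com"
print(".".join(ext))                                        # www.example.com
print(".".join(tldextract.extract("http://example.com/")))  # .example.com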