def __init__(self):
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    # self.parser = html_parser.HtmlParser()
    self.parser = html_parser_by_xpath.HtmlParserByXpath()
    self.outputer = spider_outputer.HtmlOutputer()
def __init__(self):
    self.urls = url_manager.UrlManager()
    self.downloader = downloader.Downloader()
    self.parser = parser.Parser()
    self.outputer = outputer.Outputer()
def __init__(self):
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
def __init__(self):
    # Initialize each component
    self.url = url_manager.UrlManager()
    self.parser = html_parser.HtmlParser()
    self.downloader = html_downloader.HtmlDownloader()
    self.outputer = img_outputer.ImgOutputer()
def __init__(self):
    # Initialization
    self.urls = url_manager.UrlManager()                # create the URL manager instance
    self.downloader = html_downloader.HtmlDownloader()  # create the downloader instance
    self.parser = html_parser.HtmlParser()              # create the parser instance
    self.outputer = html_outputer.HtmlOutputer()        # create the data output instance
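# Every snippet above instantiates a url_manager.UrlManager() that is never
# shown. A minimal sketch of what such a manager could look like, assuming
# only the add_new_url/add_new_urls/has_new_url/get_new_url interface implied
# by the surrounding code (the body is an illustration, not the original
# implementation):
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        # Skip URLs we have already seen in either set
        if url is None or url in self.new_urls or url in self.old_urls:
            return
        self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        # Pop one pending URL and remember it as crawled
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url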
def spider(config_path):
    """
    Start the spider; crawl and stop according to the config file.
    :param config_path: path to the spider's config file
    :return:
    """
    spider_config_obj = load_config(config_path)
    seedfile_path = spider_config_obj.seedfile
    result_path = spider_config_obj.result_path
    max_depth = spider_config_obj.max_depth
    crawl_interval = spider_config_obj.crawl_interval
    crawl_timeout = spider_config_obj.crawl_timeout
    thread_count = spider_config_obj.thread_count
    target_re = spider_config_obj.target_re
    target_re = re.compile(target_re)  # compile the target regex

    seeds = load_seeds(seedfile_path)  # load the seed list

    # Build the URL manager and add the seeds
    url_manager_obj = url_manager.UrlManager()
    if seeds and isinstance(seeds, list):
        url_manager_obj.add_new_urls(seeds)

    result_queue_obj = result_queue.ResultQueue()  # queue that holds the results
    wb_save_thread = webpage_save.WebpageSave(result_queue_obj, result_path)  # thread that saves the results

    # Create the crawl threads and start them
    crawl_threads = list()
    for index in range(thread_count):
        crawl_threads.append(
            crawl_thread.CrawlThread(url_manager_obj, result_queue_obj,
                                     crawl_interval, crawl_timeout, target_re,
                                     "crawl_thread_%s" % index))
    for crawl_thread_obj in crawl_threads:
        crawl_thread_obj.start()  # start the crawl threads
    wb_save_thread.start()  # start the result-collecting thread

    cur_depth = 0
    while True:
        logger.info("current crawl depth: %d" % cur_depth)
        url_manager_obj.join()
        if cur_depth < max_depth:
            # Add every thread's newly found URLs and the data matching the target pattern
            for crawl_thread_obj in crawl_threads:
                url_manager_obj.add_new_urls(crawl_thread_obj.get_new_urls())
                result_queue_obj.add_results(crawl_thread_obj.get_new_data())
                crawl_thread_obj.clear()
        elif cur_depth == max_depth:
            # At the maximum depth, only collect each thread's matching data,
            # then shut the crawl threads down
            for crawl_thread_obj in crawl_threads:
                result_queue_obj.add_results(crawl_thread_obj.get_new_data())
                crawl_thread_obj.clear()
                crawl_thread_obj.close()
            break
        cur_depth += 1  # one depth level finished; move one level deeper

    result_queue_obj.join()
    wb_save_thread.close()
    # To keep Queue.get from blocking forever, call get with a timeout and catch the exception
    for crawl_thread_obj in crawl_threads:
        crawl_thread_obj.join()
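# spider() above relies on load_config() and load_seeds(), which are not
# shown. A minimal sketch of both, assuming an INI-style config file with a
# [spider] section whose keys mirror the attributes read above (the section
# and key names are assumptions, not taken from the original project):
import configparser


class SpiderConfig(object):
    """Plain holder for the fields spider() reads."""

    def __init__(self, cfg):
        self.seedfile = cfg.get("spider", "url_list_file")
        self.result_path = cfg.get("spider", "output_directory")
        self.max_depth = cfg.getint("spider", "max_depth")
        self.crawl_interval = cfg.getfloat("spider", "crawl_interval")
        self.crawl_timeout = cfg.getfloat("spider", "crawl_timeout")
        self.thread_count = cfg.getint("spider", "thread_count")
        self.target_re = cfg.get("spider", "target_url")


def load_config(config_path):
    cfg = configparser.ConfigParser()
    cfg.read(config_path)
    return SpiderConfig(cfg)


def load_seeds(seedfile_path):
    # One seed URL per line; blank lines are skipped
    with open(seedfile_path) as f:
        return [line.strip() for line in f if line.strip()]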
def __init__(self):
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
    self.connect = connect_mysql.Conenct()
def __init__(self):
    self.__url_manager = url_manager.UrlManager()
    self.__html_downloader = html_downloader.Downloader()
    self.__html_parser = worker_parser.LadyParser()
    self.__data_manager = data_manager.DataManager()
def __init__(self):
    # manage -> download -> parse -> output
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
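# Each manage -> download -> parse -> output pipeline above needs an
# html_downloader.HtmlDownloader(). A minimal sketch using only the standard
# library; the download(url) -> str contract is inferred from how these
# spiders chain the components, not from the original modules:
from urllib import request


class HtmlDownloader(object):
    def download(self, url, timeout=10):
        if url is None:
            return None
        try:
            with request.urlopen(url, timeout=timeout) as resp:
                if resp.status != 200:
                    return None
                return resp.read().decode("utf-8", errors="replace")
        except OSError:  # covers URLError and socket timeouts
            return None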
from spider import html_downloader, html_parser, url_manager
from bs4 import BeautifulSoup

url = 'http://www.27270.com/ent/meinvtupian/2017/226226.html'
urlManager = url_manager.UrlManager()
urlManager.add_new_url(url)
downloader = html_downloader.Downloader()
parser = html_parser.Parser()


def parse_detail(soup):
    image = soup.find('p', align='center').find('img')
    title = soup.find('h1', 'articleV4Tit').text
    position_a = soup.find('div', 'position').find_all('a')[-1]
    position_url = position_a['href'] + '2017/'
    # '下一页' is the site's "next page" link text
    nextPageUrl = position_url + soup.find('a', text='下一页')['href']
    if nextPageUrl.endswith('.html'):
        urlManager.add_new_url(nextPageUrl)
    print(image['src'] + '---' + nextPageUrl + '----' + title)


while urlManager.has_new_url():
    new_url = urlManager.get_new_url()
    html, encode = downloader.download(new_url)
    parser.parseAll(parse_detail, html, encode)
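# The crawl loop above hands a callback to parser.parseAll(). A minimal
# sketch of a Parser with that shape, assuming the downloader returns raw
# bytes plus a detected encoding and the parser just builds a BeautifulSoup
# and passes it to the callback (assumed behavior, not the original class):
from bs4 import BeautifulSoup


class Parser(object):
    def parseAll(self, callback, html, encode):
        if html is None:
            return
        # from_encoding is honored when html is bytes
        soup = BeautifulSoup(html, "html.parser", from_encoding=encode)
        callback(soup)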
def __init__(self):
    self.urls = url_manager.UrlManager()                # URL manager
    self.downloader = html_downloader.HtmlDownloader()  # page downloader
    self.parser = html_parser.HtmlParser()              # HTML parser
    self.outputer = html_outputer.HtmlOutputer()        # HTML output writer
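# A minimal sketch of the html_parser.HtmlParser() used above, assuming the
# common parse(page_url, html_cont) -> (new_urls, new_data) contract these
# manage/download/parse/output spiders follow (assumed contract, not the
# original module):
from urllib.parse import urljoin
from bs4 import BeautifulSoup


class HtmlParser(object):
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return set(), None
        soup = BeautifulSoup(html_cont, "html.parser")
        # Collect absolute links for the URL manager
        new_urls = {urljoin(page_url, a["href"])
                    for a in soup.find_all("a", href=True)}
        # Collect whatever data the caller keeps; here just the page title
        title_tag = soup.find("title")
        new_data = {"url": page_url,
                    "title": title_tag.text if title_tag else ""}
        return new_urls, new_data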
def __init__(self):
    # A class's methods are also attributes: a function name is a variable
    # that points at the function object, so assigning the function to a
    # variable makes the variable point at the function.
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
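# Most of these spiders end with an html_outputer.HtmlOutputer() that
# collects parsed records and writes an HTML report. A minimal sketch using
# the collect_data/output_html method names commonly paired with this
# structure, assuming dict-like records (assumed names and shape; the
# original module is not shown):
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is not None:
            self.datas.append(data)

    def output_html(self, path="output.html"):
        # Render the collected records as one simple HTML table
        with open(path, "w", encoding="utf-8") as out:
            out.write("<html><body><table>\n")
            for data in self.datas:
                cells = "".join("<td>%s</td>" % data[k] for k in data)
                out.write("<tr>%s</tr>\n" % cells)
            out.write("</table></body></html>\n")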