Example #1
 def __init__(self):
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     # self.parser = html_parser.HtmlParser()
     self.parser = html_parser_by_xpath.HtmlParserByXpath()
     self.outputer = spider_outputer.HtmlOutputer()
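
All of these __init__ snippets wire up the same four collaborators (URL manager, downloader, parser, outputer), but none shows the crawl loop that drives them. Here is a minimal sketch of how such a SpiderMain-style class might use the components; the craw method name and the parse/collect_data/output_html calls are assumptions, not taken from the snippets, while add_new_url/add_new_urls/has_new_url/get_new_url/download match the usage seen in the examples below.

 def craw(self, root_url):
     self.urls.add_new_url(root_url)  # seed the URL manager with the entry page
     while self.urls.has_new_url():
         new_url = self.urls.get_new_url()  # take one pending URL
         html_cont = self.downloader.download(new_url)  # fetch the page
         new_urls, new_data = self.parser.parse(new_url, html_cont)  # extract links and data (assumed signature)
         self.urls.add_new_urls(new_urls)  # queue the newly discovered links
         self.outputer.collect_data(new_data)  # buffer the parsed data (assumed method)
     self.outputer.output_html()  # write everything out at the end (assumed method)
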
Example #2
 def __init__(self):
     self.urls = url_manager.UrlManager()
     self.downloader = downloader.Downloader()
     self.parser = parser.Parser()
     self.outputer = outputer.Outputer()
Example #3
 def __init__(self):
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
Example #4
 def __init__(self):
     # initialize each component object
     self.url = url_manager.UrlManager()
     self.parser = html_parser.HtmlParser()
     self.downloader = html_downloader.HtmlDownloader()
     self.outputer = img_outputer.ImgOutputer()
Example #5
 def __init__(self):  # initialization
     self.urls = url_manager.UrlManager()  # create the URL manager instance
     self.downloader = html_downloader.HtmlDownloader()  # create the downloader instance
     self.parser = html_parser.HtmlParser()  # create the parser instance
     self.outputer = html_outputer.HtmlOutputer()  # create the output writer instance
Example #6
def spider(config_path):
    """
    启动爬虫程序,根据配置文件进行爬取和停止
    :param config_path: 爬虫的配置文件路径
    :return:
    """
    # seedfile_path, result_path, max_depth, crawl_interval, \
    # crawl_timeout, thread_count, target_re = load_config(config_path)

    spider_config_obj = load_config(config_path)
    seedfile_path = spider_config_obj.seedfile
    result_path = spider_config_obj.result_path
    max_depth = spider_config_obj.max_depth
    crawl_interval = spider_config_obj.crawl_interval
    crawl_timeout = spider_config_obj.crawl_timeout
    thread_count = spider_config_obj.thread_count
    target_re = spider_config_obj.target_re

    target_re = re.compile(target_re)  # compile the target regex
    seeds = load_seeds(seedfile_path)  # load the seed URL list

    # build the URL manager and add the seeds
    url_manager_obj = url_manager.UrlManager()
    if seeds and isinstance(seeds, list):
        url_manager_obj.add_new_urls(seeds)

    result_queue_obj = result_queue.ResultQueue()  # queue that holds the crawl results
    wb_save_thread = webpage_save.WebpageSave(result_queue_obj,
                                              result_path)  # thread that saves the results

    # create the crawl threads and start them
    crawl_threads = list()
    for index in range(thread_count):
        crawl_threads.append(
            crawl_thread.CrawlThread(url_manager_obj, result_queue_obj,
                                     crawl_interval, crawl_timeout, target_re,
                                     "crawl_thread_%s" % index))

    for crawl_thread_obj in crawl_threads:
        crawl_thread_obj.start()  # start the crawl thread

    wb_save_thread.start()  # start the result-saving thread

    cur_depth = 0
    while True:
        logger.info("current crawl depth: %d" % cur_depth)
        url_manager_obj.join()

        if cur_depth < max_depth:  # add every thread's newly found URLs and matching data
            for crawl_thread_obj in crawl_threads:
                url_manager_obj.add_new_urls(crawl_thread_obj.get_new_urls())
                result_queue_obj.add_results(crawl_thread_obj.get_new_data())
                crawl_thread_obj.clear()
        elif cur_depth == max_depth:  # only collect each thread's matching data, then stop the crawl threads
            for crawl_thread_obj in crawl_threads:
                result_queue_obj.add_results(crawl_thread_obj.get_new_data())
                crawl_thread_obj.clear()
                crawl_thread_obj.close()
            break
        cur_depth += 1  # increase the depth by 1 after this level is done

    result_queue_obj.join()
    wb_save_thread.close()  # to keep Queue.get from blocking, the get call needs a timeout and exception handling

    for crawl_thread_obj in crawl_threads:
        crawl_thread_obj.join()
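
The load_config helper called at the top of this example is not shown. A minimal sketch of what it could look like, assuming an INI-style file with a [spider] section (the section and option names are assumptions; only the returned attribute names are taken from the code above):

import configparser
from types import SimpleNamespace


def load_config(config_path):
    """Read the spider config file and return an object with the attributes used above."""
    conf = configparser.ConfigParser()
    conf.read(config_path)
    section = conf["spider"]  # assumed section name
    return SimpleNamespace(
        seedfile=section.get("seedfile"),
        result_path=section.get("result_path"),
        max_depth=section.getint("max_depth"),
        crawl_interval=section.getfloat("crawl_interval"),
        crawl_timeout=section.getfloat("crawl_timeout"),
        thread_count=section.getint("thread_count"),
        target_re=section.get("target_re"),
    )
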
Example #7
 def __init__(self):
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
     self.connect = connect_mysql.Conenct()
Example #8
 def __init__(self):
     self.__url_manager = url_manager.UrlManager()
     self.__html_downloader = html_downloader.Downloader()
     self.__html_parser = worker_parser.LadyParser()
     self.__data_manager = data_manager.DataManager()
Example #9
 def __init__(self):  # manage -- download -- parse -- output
     self.urls = url_manager.UrlManager()
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
Example #10
from spider import html_downloader, html_parser, url_manager
from bs4 import BeautifulSoup
url = 'http://www.27270.com/ent/meinvtupian/2017/226226.html'
urlManager = url_manager.UrlManager()
urlManager.add_new_url(url)
downloader = html_downloader.Downloader()
parser = html_parser.Parser()


def parse_detail(soup):
    image = soup.find('p', align='center').find('img')
    title = soup.find('h1', 'articleV4Tit').text
    position_a = soup.find('div', 'position').find_all('a')[-1]
    # count = len(position_a_tag)
    position_url = position_a['href'] + '2017/'
    nextPageUrl = position_url + soup.find('a', text='下一页')['href']
    if nextPageUrl.endswith('.html'):
        urlManager.add_new_url(nextPageUrl)
    print(image['src'] + '---' + nextPageUrl + '----' + title)


while urlManager.has_new_url():
    new_url = urlManager.get_new_url()
    html, encode = downloader.download(new_url)
    parser.parseAll(parse_detail, html, encode)
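
This example exercises the UrlManager interface directly (add_new_url, has_new_url, get_new_url), which makes it a good place to sketch the class that every example instantiates but none shows. A minimal, set-based version supporting those calls plus the add_new_urls used in Example #6 (the two-set layout is an assumption; the threaded spider in Example #6 would additionally need a thread-safe variant with queue.Queue-style join semantics):

class UrlManager(object):
    """Track pending and already-crawled URLs with two sets."""

    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already handed out

    def add_new_url(self, url):
        # ignore empty values and URLs that were already seen
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls:
            for url in urls:
                self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url
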
Example #11
 def __init__(self):
     self.urls = url_manager.UrlManager()  # URL manager
     self.downloader = html_downloader.HtmlDownloader()  # HTML downloader
     self.parser = html_parser.HtmlParser()  # HTML parser
     self.outputer = html_outputer.HtmlOutputer()  # HTML output writer
Example #12
 def __init__(self):
     self.urls = url_manager.UrlManager()  # a method is also an attribute: a function name is just a variable bound to the function object
     self.downloader = html_downloader.HtmlDownloader()
     self.parser = html_parser.HtmlParser()
     self.outputer = html_outputer.HtmlOutputer()
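
The comment in this example makes a general point about Python's object model rather than about the spider itself; a two-line illustration:

greet = print  # a function object can be bound to another name
greet("hello")  # calling through the new name invokes the same function
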