示例#1
0
def test_spider(spider_type):
    """
    Crawl wandoujia.com and zhushou.360.cn, saving results to "out_<spider_type>.txt".

    :param spider_type: "thread" selects WebSpiderT (ThreadPool); any other
                        value selects WebSpiderP (ProcessPool)
    """
    # Define fetcher, parser and saver; any of the three classes may be overridden
    fetcher = spider.Fetcher(normal_max_repeat=3, normal_sleep_time=0, critical_max_repeat=5, critical_sleep_time=5)
    parser = MyParser(max_deep=1, max_repeat=2)
    saver = spider.Saver(file_name="out_%s.txt" % spider_type)

    # Url filtering: capacity=None makes UrlFilter use a Set, suitable when the number of Urls is small
    black_patterns = (spider.CONFIG_URLPATTERN_FILES, r"binding", r"download", )
    # raw string: "\." must reach the regex engine verbatim — a plain string
    # literal with these escapes raises SyntaxWarning on Python 3.12+
    white_patterns = (r"^http[s]{0,1}://(www\.){0,1}(wandoujia|(zhushou\.360))\.(com|cn)", )
    url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns, capacity=None)

    # Choose between ThreadPool and ProcessPool implementations
    if spider_type == "thread":
        web_spider = spider.WebSpiderT(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5)
    else:
        web_spider = spider.WebSpiderP(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5)

    # processes tolerate more parsers than threads here
    parser_num = 1 if spider_type == "thread" else 3

    # First crawl the wandoujia pages; keep the monitor alive afterwards (is_over=False)
    web_spider.set_start_url("http://www.wandoujia.com/apps", ("wandoujia",), priority=0, deep=0, critical=False)
    web_spider.start_work_and_wait_done(fetcher_num=10, parser_num=parser_num, is_over=False)

    # Then crawl the 360 app store; stop the monitor when finished (is_over=True)
    web_spider.set_start_url("http://zhushou.360.cn/", ("360app",), priority=0, deep=0, critical=False)
    web_spider.start_work_and_wait_done(fetcher_num=10, parser_num=parser_num, is_over=True)
    return
示例#2
0
文件: test.py 项目: xiaojiu01/PSpider
def test_spider(mysql, spider_type):
    """
    Crawl wandoujia.com, zhushou.360.cn and a set of deliberately-failing
    "duba" urls (to exercise the critical flag).

    :param mysql: truthy -> save to MySQL via SaverMysql and use a BloomFilter;
                  falsy -> save to "out.txt" and use a Set-based UrlFilter
    :param spider_type: "thread" selects WebSpiderT (ThreadPool); any other
                        value selects WebSpiderP (ProcessPool)
    """
    # Define fetcher, parser and saver; any of the three classes may be overridden
    fetcher = spider.Fetcher(normal_max_repeat=3, normal_sleep_time=0, critical_max_repeat=5, critical_sleep_time=5)
    parser = MyParser(max_deep=1, max_repeat=2)

    # Url filtering patterns
    black_patterns = (spider.CONFIG_URLPATTERN_FILES, r"binding", r"download", )
    # raw string: "\." and "\d" must reach the regex engine verbatim — a plain
    # string literal with these escapes raises SyntaxWarning on Python 3.12+
    white_patterns = (r"^http[s]{0,1}://(www\.){0,1}(wandoujia|(zhushou\.360)|duba_\d)\.(com|cn)", )

    if not mysql:
        saver = spider.Saver(save_pipe=open("out.txt", "w", encoding="utf-8"))

        # UrlFilter with capacity=None uses a Set, suitable when the number of Urls is small
        url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns, capacity=None)
    else:
        saver = spider.SaverMysql(host="localhost", user="******", passwd="123456", database="default")
        saver.change_sqlstr("insert into t_test(url, title, getdate) values (%s, %s, %s);")

        # UrlFilter with a capacity uses a BloomFilter, suitable for huge numbers of Urls
        url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns, capacity=10000)

    # Choose between ThreadPool and ProcessPool implementations
    if spider_type == "thread":
        web_spider = spider.WebSpiderT(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5)
    else:
        web_spider = spider.WebSpiderP(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5)

    # processes tolerate more parsers than threads here
    parser_num = 1 if spider_type == "thread" else 3

    # First crawl the wandoujia pages; keep the monitor alive afterwards (is_over=False)
    web_spider.set_start_url("http://www.wandoujia.com/apps", ("wandoujia",), priority=0, deep=0, critical=False)
    web_spider.start_work_and_wait_done(fetcher_num=10, parser_num=parser_num, is_over=False)

    # Then crawl the 360 app store and test the effect of the critical flag;
    # stop the monitor when finished (is_over=True)
    web_spider.set_start_url("http://zhushou.360.cn/", ("360app",), priority=0, deep=0, critical=False)
    for i in range(5):
        web_spider.set_start_url("https://www.duba_%d.com/" % i, ("critical",), priority=0, deep=0, critical=True)
    web_spider.start_work_and_wait_done(fetcher_num=10, parser_num=parser_num, is_over=True)
    return
示例#3
0
    def item_save(self, url, keys, item):
        """
        Write one parsed item to the output pipe selected by keys[0]
        ("Totals" or "PerGame"); the row is keys[1] plus the stringified
        item fields, tab-separated. Returns True if saved, False otherwise.
        """
        pipes = {
            "Totals": self.save_pipe_total,
            "PerGame": self.save_pipe_pergame,
        }
        target = pipes.get(keys[0])
        if target is None:
            # unknown category: nothing written
            return False
        row = [keys[1]] + [str(field) for field in item]
        target.write("\t".join(row) + "\n")
        return True


if __name__ == "__main__":
    # Entry point: crawl NBA player statistics with a process-pool spider.

    # Build the three pipeline components: fetcher, parser and saver
    nba_fetcher = NBAFetcher(critical_max_repeat=3, critical_sleep_time=0)
    nba_parser = NBAParser(max_deep=-1, max_repeat=3)
    nba_saver = NBASaver(file_name_total="nba_total.txt", file_name_pergame="nba_pergame.txt")

    # Assemble the spider (no url filtering) and seed it with the player index page
    nba_spider = spider.WebSpiderP(nba_fetcher, nba_parser, nba_saver, url_filter=None)
    nba_spider.set_start_url(url_player_index, ("index",), critical=True)

    # Fetch with 10 workers and stop the monitor when done
    nba_spider.start_work_and_wait_done(fetcher_num=10, is_over=True)

    exit()