def test_spider(): """ test spider """ # 定义fetcher,parser和saver, 你也可以重写这三个类中的任何一个 fetcher = spider.Fetcher(normal_max_repeat=3, normal_sleep_time=0, critical_max_repeat=5, critical_sleep_time=5) parser = spider.Parser(max_deep=1, max_repeat=2) saver = spider.Saver(save_pipe=open("out_spider.txt", "w")) # 定义Url过滤, UrlFilter使用Set, 适合Url数量不多的情况 black_patterns = (spider.CONFIG_URLPATTERN_FILES, r"binding", r"download", ) white_patterns = ("^http[s]{0,1}://(www\.){0,1}(wandoujia|(zhushou\.360))\.(com|cn)", ) url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns, capacity=1000) # 初始化WebSpider web_spider = spider.WebSpider(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5) # 首先抓取一次豌豆荚页面, 抓取完成之后不停止monitor web_spider.set_start_url("http://www.wandoujia.com/apps", ("wandoujia",), priority=0, deep=0, critical=False) web_spider.start_work_and_wait_done(fetcher_num=10, is_over=False) # 然后抓取360应用商店页面, 抓取完成之后停止monitor web_spider.set_start_url("http://zhushou.360.cn/", ("360app",), priority=0, deep=0, critical=False) web_spider.start_work_and_wait_done(fetcher_num=10, is_over=True) return
def test_spider(): """ test spider """ # initial fetcher / parser / saver / proxieser fetcher = MyFetcher(sleep_time=1, max_repeat=0) parser = MyParser(max_deep=2) saver = MySaver(save_pipe=open("out_thread.txt", "w")) # proxieser = MyProxies(sleep_time=5) # define url_filter url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns, capacity=None) # initial web_spider web_spider = spider.WebSpider(fetcher, parser, saver, proxieser=None, url_filter=url_filter, queue_parse_size=-1) # web_spider = spider.WebSpider(fetcher, parser, saver, proxieser=None, url_filter=url_filter, queue_parse_size=100, queue_proxies_size=100) # add start url web_spider.set_start_url("http://zhushou.360.cn/", priority=0, keys={"type": "360"}, deep=0) # start web_spider web_spider.start_working(fetcher_num=20) # wait for finished web_spider.wait_for_finished() return
def test_spider(): """ 测试函数 """ # 初始化 fetcher / parser / saver / proxieser fetcher = MyFetcher(sleep_time=0, max_repeat=1) parser = MyParser(max_deep=1) saver = MySaver(save_pipe=open("out.txt", "w")) # proxieser = MyProxies(sleep_time=5) # 定义url_filter url_filter = spider.UrlFilter(white_patterns=(re.compile(r"^http[s]?://(www\.)?petstation\.jp"),), capacity=None) # 定义爬虫web_spider web_spider = spider.WebSpider(fetcher, parser, saver, proxieser=None, url_filter=url_filter, queue_parse_size=-1, queue_save_size=-1) # web_spider = spider.WebSpider(fetcher, parser, saver, proxieser=proxieser, url_filter=url_filter, queue_parse_size=100, queue_proxies_size=100) # 添加起始的url # web_spider.set_start_url("https://www.appinn.com/", priority=0, keys={"type": "index"}, deep=0) web_spider.set_start_url("https://www.petstation.jp/animal_detail.php?animal__id=371144", priority=0, keys={"type": "index"}, deep=0) # 开启爬虫web_spider web_spider.start_working(fetcher_num=20) # 等待爬虫结束 web_spider.wait_for_finished() return
def test_spider(): """ test spider """ # initial fetcher / parser / saver, you also can rewrite this three class fetcher = spider.Fetcher(max_repeat=3, sleep_time=1) parser = spider.Parser(max_deep=2) saver = spider.Saver(save_pipe=open("out_spider_thread.txt", "w")) # define url_filter url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns, capacity=None) # initial web_spider web_spider = spider.WebSpider(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5) # add start url web_spider.set_start_url("http://zhushou.360.cn/", keys=("360web", )) # start web_spider result = web_spider.start_work_and_wait_done(fetcher_num=10, is_over=True) # print result of web_spider print(result[spider.TPEnum.URL_FETCH_SUCC], result[spider.TPEnum.HTM_PARSE_SUCC], result[spider.TPEnum.ITEM_SAVE_SUCC]) return
def test_spider(): """ test spider """ # 定义fetcher, parser和saver, 你也可以重写这三个类中的任何一个 fetcher = spider.Fetcher(max_repeat=3, sleep_time=0) parser = spider.Parser(max_deep=1) saver = spider.Saver(save_pipe=open("out_spider_thread.txt", "w")) # 定义Url过滤, UrlFilter使用Set, 适合Url数量不多的情况 black_patterns = ( spider.CONFIG_URLPATTERN_FILES, r"binding", r"download", ) white_patterns = ("^http[s]{0,1}://(www\.){0,1}(zhushou\.360)\.(com|cn)", ) url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns, capacity=1000) # 初始化WebSpider web_spider = spider.WebSpider(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5) # 添加种子Url web_spider.set_start_url("http://zhushou.360.cn/", keys=("360web", )) # 开始抓取任务并等待其结束 web_spider.start_work_and_wait_done(fetcher_num=10, is_over=True) return
def test_spider(): """ test spider """ # initial fetcher / parser / saver, you also can rewrite this three class fetcher = spider.Fetcher(max_repeat=3, sleep_time=0) parser = spider.Parser(max_deep=2) saver = spider.Saver(save_pipe=open("out_spider_thread.txt", "w")) # define url_filter black_patterns = ( spider.CONFIG_URLPATTERN_FILES, r"binding", r"download", ) white_patterns = ("^http[s]{0,1}://(www\.){0,1}(zhushou\.360)\.(com|cn)", ) url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns) # initial web_spider web_spider = spider.WebSpider(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5) # add start url web_spider.set_start_url("http://zhushou.360.cn/", keys=("360web", )) # start web_spider web_spider.start_work_and_wait_done(fetcher_num=10, is_over=True) return
def test_spider(): """ test spider """ # initial fetcher / parser / saver fetcher = MyFetcher(max_repeat=3, sleep_time=1) parser = MyParser(max_deep=3) saver = spider.Saver(save_pipe=open("./spider/out_thread.txt", "w")) # define url_filter url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns, capacity=None) # initial web_spider web_spider = spider.WebSpider(fetcher, parser, saver, proxieser=None, url_filter=url_filter, max_count=10, max_count_in_proxies=100) # add start url web_spider.set_start_url("http://blog.jobbole.com/all-posts/", priority=0, keys={}, deep=0) # start web_spider web_spider.start_working(fetcher_num=10) # wait for finished web_spider.wait_for_finished() return
def test_spider(): """ test spider """ # initial fetcher / parser / saver / proxieser fetcher = MyFetcher(sleep_time=0, max_repeat=1) parser = MyParser(max_deep=1) saver = MySaver(save_pipe=open("out.txt", "w")) # proxieser = MyProxies(sleep_time=5) # define url_filter url_filter = spider.UrlFilter(white_patterns=(re.compile(r"^http[s]?://(www\.)?appinn\.com"), ), capacity=None) # initial web_spider web_spider = spider.WebSpider(fetcher, parser, saver, proxieser=None, url_filter=url_filter, queue_parse_size=-1, queue_save_size=-1) # web_spider = spider.WebSpider(fetcher, parser, saver, proxieser=proxieser, url_filter=url_filter, queue_parse_size=100, queue_proxies_size=100) # add start url web_spider.set_start_url("https://www.appinn.com/", priority=0, keys={"type": "index"}, deep=0) # start web_spider web_spider.start_working(fetcher_num=20) # wait for finished web_spider.wait_for_finished() return
def test_spider(): """ test spider """ # initial fetcher / parser / saver, you also can rewrite this three classes fetcher = spider.Fetcher(max_repeat=1, sleep_time=0) parser = spider.Parser(max_deep=2) saver = spider.Saver(save_pipe=open("out_thread.txt", "w")) # define url_filter url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns, capacity=None) # initial web_spider web_spider = spider.WebSpider(fetcher, parser, saver, proxieser=None, url_filter=url_filter, monitor_sleep_time=5) # add start url web_spider.set_start_url("http://zhushou.360.cn/", priority=0, keys={"type": "360"}, deep=0) # start web_spider web_spider.start_work_and_wait_done(fetcher_num=10, is_over=True) return
def get_douban_movies(): headers = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36", "Host": "movie.douban.com", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Encoding": "gzip, deflate, sdch, br", "Accept-Language": "zh-CN, zh; q=0.8, en; q=0.6", "Cache-Control": "max-age=0", "Connection": "keep-alive", "Upgrade-Insecure-Requests": "1", "Cookie": "bid=Pd48iLTpsf8" } # 获取初始url all_urls = set() resp = requests.get("https://movie.douban.com/tag/", headers=headers, verify=False) assert resp.status_code == 200, resp.status_code soup = BeautifulSoup(resp.text, "html5lib") a_list = soup.find_all("a", href=re.compile(r"^/tag/", flags=re.IGNORECASE)) all_urls.update([(a_soup.get_text(), "https://movie.douban.com" + a_soup.get("href")) for a_soup in a_list]) resp = requests.get("https://movie.douban.com/tag/?view=cloud", headers=headers, verify=False) assert resp.status_code == 200, resp.status_code soup = BeautifulSoup(resp.text, "html5lib") a_list = soup.find_all("a", href=re.compile(r"^/tag/", flags=re.IGNORECASE)) all_urls.update([(a_soup.get_text(), "https://movie.douban.com" + a_soup.get("href")) for a_soup in a_list]) # 构造爬虫 dou_spider = spider.WebSpider(MovieFetcher(), MovieParser(max_deep=-1, max_repeat=1), MovieSaver(open("doubanmovie.txt", "w")), spider.UrlFilter()) # dou_spider.set_start_url("https://movie.douban.com/tag/新海诚", ("index", "test"), priority=0, critical=False) for tag, url in all_urls: dou_spider.set_start_url(url, ("index", tag), priority=1, critical=True) pass dou_spider.start_work_and_wait_done(fetcher_num=20) return
def get_dangdang_books():
    """ test the dangdang.com spider """
    fetcher_number = 10
    fetcher_list = []
    for i in range(fetcher_number):
        fetcher_list.append(BookFetcher())
    parser = BookParser()
    saver = BookSaver()
    dang_spider = spider.WebSpider(fetcher_list, parser, saver, None)

    # first pass: collect all list-page links and store them in the database; because this takes
    # a long time, crawling the links and crawling the detail pages are done separately
    url_prefix_list = [
        "http://category.dangdang.com/pg{}-cp01.41.43.05.00.00.html",
        "http://category.dangdang.com/pg{}-cp01.41.59.00.00.00.html"
    ]
    for url_prefix in url_prefix_list:
        for i in range(100):
            url = url_prefix.format(i)
            dang_spider.set_start_url(url, ("lists", ), priority=1)
    dang_spider.start_work_and_wait_done(fetcher_num=fetcher_number)

    # second pass: crawl the detail page of every stored url
    dang_spider = spider.WebSpider(fetcher_list, parser, saver, None)
    conn = pymysql.connect(host="localhost", user="******", password="******", db="dangdang_book", charset="utf8")
    cursor = conn.cursor()
    conn.autocommit(1)
    cursor.execute("select url from book_urls;")
    url_list = [item[0] for item in cursor.fetchall()]
    for url in url_list:
        dang_spider.set_start_url(url, ("detail", ), priority=1)
    dang_spider.start_work_and_wait_done(fetcher_num=fetcher_number)

    for f_er in fetcher_list:
        f_er.driver_quit()
    return
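# The two-pass dangdang crawl above relies on a staging table "book_urls" that the list pass
# fills and the detail pass reads back ("select url from book_urls;"). The schema is not part of
# the snippet; a minimal hypothetical version (table and column names are assumptions) could be:
import pymysql

def create_book_urls_table():
    conn = pymysql.connect(host="localhost", user="******", password="******", db="dangdang_book", charset="utf8")
    try:
        with conn.cursor() as cursor:
            cursor.execute(
                "CREATE TABLE IF NOT EXISTS book_urls ("
                " id INT AUTO_INCREMENT PRIMARY KEY,"
                " url VARCHAR(255) NOT NULL UNIQUE"
                ") DEFAULT CHARSET=utf8;"
            )
        conn.commit()
    finally:
        conn.close()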
def get_douban_movies(): headers = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36", "Host": "movie.douban.com", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Encoding": "gzip, deflate, sdch, br", "Accept-Language": "zh-CN, zh; q=0.8, en; q=0.6", "Cache-Control": "max-age=0", "Connection": "keep-alive", "Upgrade-Insecure-Requests": "1", "Cookie": 'bid=TWn93lyonNk; ll="118254"; gr_user_id=118696be-aa6a-42e9-a20f-932c29fcddac; viewed="5333562_5948760_4736118_4241826_1495763_1433583_2124114_6430747_24335672"; ps=y; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1490076711%2C%22https%3A%2F%2Fmovie.douban.com%2Fsubject%2F1292052%2Freviews%22%5D; _ga=GA1.2.1671303578.1469101452; ue="*****@*****.**"; dbcl2="33045345:gXYCq8g9sy4"; ck=5VGo; __utmt=1; _vwo_uuid_v2=98306AEEC1B83E40741FF0A8A58DC180|c5bbf2b10ddb9854ac614269b546a464; ap=1; push_noty_num=0; push_doumail_num=0; _pk_id.100001.8cb4=88a4be0bc4943075.1469262289.53.1490077859.1490064764.; _pk_ses.100001.8cb4=*; __utma=30149280.1671303578.1469101452.1490062608.1490076712.73; __utmb=30149280.16.10.1490076712; __utmc=30149280; __utmz=30149280.1489996683.69.35.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); __utmv=30149280.3304' } # 获取初始url all_urls = set() resp = requests.get("https://movie.douban.com/tag/", headers=headers, verify=False) assert resp.status_code == 200, resp.status_code soup = BeautifulSoup(resp.text, "html5lib") a_list = soup.find_all("a", href=re.compile(r"^/tag/", flags=re.IGNORECASE)) all_urls.update([(a_soup.get_text(), "https://movie.douban.com" + a_soup.get("href")) for a_soup in a_list]) # resp = requests.get("https://movie.douban.com/tag/?view=cloud", headers=headers, verify=False) # assert resp.status_code == 200, resp.status_code # soup = BeautifulSoup(resp.text, "html5lib") # a_list = soup.find_all("a", href=re.compile(r"^/tag/", flags=re.IGNORECASE)) # all_urls.update([(a_soup.get_text(), "https://movie.douban.com" + a_soup.get("href")) for a_soup in a_list]) logging.warning("all urls: %s", len(all_urls)) # 构造爬虫 dou_spider = spider.WebSpider(MovieFetcher(), MovieParser(max_deep=-1), spider.Saver(), spider.UrlFilter()) for tag, url in all_urls: print(tag + ":" + url) dou_spider.set_start_url(url, ("index", tag), priority=1) dou_spider.start_work_and_wait_done(fetcher_num=20) return
def get_tutorial():
    fetcher_number = 3
    fetcher_list = []
    for i in range(fetcher_number):
        fetcher_list.append(Html_Fetcher())
    parser = Html_Parser()
    saver = Html_Saver()
    html_spider = spider.WebSpider(fetcher_list, parser, saver, None)

    base_url = "http://www.liaoxuefeng.com"
    url_list = [url for url in saver.dir]
    # url = "/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000/001431608990315a01b575e2ab041168ff0df194698afac000"
    for url in url_list:
        html_spider.set_start_url(url, ("lists", ), priority=1)
    html_spider.start_work_and_wait_done(fetcher_num=fetcher_number)
def get_douban_movies(): headers = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36", "Host": "movie.douban.com", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Encoding": "gzip, deflate, sdch, br", "Accept-Language": "zh-CN, zh; q=0.8, en; q=0.6", "Cache-Control": "max-age=0", "Connection": "keep-alive", "Upgrade-Insecure-Requests": "1", "Cookie": "bid=Pd48iLTpsf8" } # 获取初始url all_urls = set() resp = requests.get("https://movie.douban.com/tag/", headers=headers, verify=False) assert resp.status_code == 200, resp.status_code soup = BeautifulSoup(resp.text, "html5lib") a_list = soup.find_all("a", href=re.compile(r"^/tag/", flags=re.IGNORECASE)) all_urls.update([(a_soup.get_text(), "https://movie.douban.com" + a_soup.get("href")) for a_soup in a_list]) resp = requests.get("https://movie.douban.com/tag/?view=cloud", headers=headers, verify=False) assert resp.status_code == 200, resp.status_code soup = BeautifulSoup(resp.text, "html5lib") a_list = soup.find_all("a", href=re.compile(r"^/tag/", flags=re.IGNORECASE)) all_urls.update([(a_soup.get_text(), "https://movie.douban.com" + a_soup.get("href")) for a_soup in a_list]) logging.warning("all urls: %s", len(all_urls)) # 查询已有数据 conn = pymysql.connect(host="xx.xx.xx.xx", user="******", password="", db="db_my", charset="utf8") cursor = conn.cursor() cursor.execute("select m_url from t_doubanmovies;") bloomfilter = spider.UrlFilter() bloomfilter.update([item[0] for item in cursor.fetchall()]) logging.warning("update bloomfilter success: %s", cursor.rowcount) cursor.close() conn.close() # 构造爬虫 dou_spider = spider.WebSpider(MovieFetcher(), MovieParser(max_deep=-1, max_repeat=1), MovieSaver(), bloomfilter) for tag, url in all_urls: dou_spider.set_start_url(url, ("index", tag), priority=1, critical=True) dou_spider.start_work_and_wait_done(fetcher_num=20) return
def get_all(movies):
    _proxy = proxy(sleep_time=0)
    _fetcher = fetcher(max_repeat=50, sleep_time=0)
    _parser = parser(max_deep=3)
    _saver = saver()

    id_spider = spider.WebSpider(proxieser=_proxy, fetcher=_fetcher, parser=_parser, saver=_saver, url_filter=None, max_count_in_parse=500, max_count_in_proxies=100)
    for key, url in movies:
        id_spider.set_start_url(url, keys={"type": key}, priority=0, deep=0)

    id_spider.start_working(fetcher_num=100)
    id_spider.wait_for_finished()
    return
def test_spider(): """ 测试函数 """ # 初始化 fetcher / parser / saver / proxieser fetcher = MyFetcher(sleep_time=0, max_repeat=1) parser = MyParser(max_deep=1) saver = MySaver(save_pipe=open("out.txt", "w")) # proxieser = MyProxies(sleep_time=5) # 定义url_filter url_filter = spider.UrlFilter( white_patterns=(re.compile(r"^http[s]?://docs\.rsshub\.app"), ), capacity=None) # 定义爬虫web_spider web_spider = spider.WebSpider(fetcher, parser, saver, proxieser=None, url_filter=url_filter, queue_parse_size=-1, queue_save_size=-1) # web_spider = spider.WebSpider(fetcher, parser, saver, proxieser=proxieser, url_filter=url_filter, queue_parse_size=100, queue_proxies_size=100) # 添加起始的url web_spider.set_start_url("https://docs.rsshub.app/", priority=0, keys={"type": "index"}, deep=0) # 开启爬虫web_spider web_spider.start_working(fetchers_num=20) # 等待爬虫结束 web_spider.wait_for_finished() return
def test_spider(): """ 测试函数 """ # 初始化 fetcher / parser / saver / proxieser fetcher = MyFetcher(sleep_time=1, max_repeat=3) parser = MyParser(max_deep=1) saver = MySaver(save_pipe=open("out.txt", "w")) # proxieser = MyProxies(sleep_time=5) # 定义url_filter url_filter = spider.UrlFilter( white_patterns=(re.compile(r"^https?://www\.appinn\.com"), )) # 定义爬虫web_spider web_spider = spider.WebSpider(fetcher, parser, saver, url_filter=url_filter, queue_parse_size=-1, queue_save_size=-1) # web_spider = spider.WebSpider(fetcher, parser, saver, proxieser=proxieser, queue_parse_size=100, queue_proxies_size=100) # 添加起始的Task web_spider.set_start_task( spider.TaskFetch(priority=0, keys={"type": "index"}, deep=0, url="https://www.appinn.com/")) # 开启爬虫web_spider web_spider.start_working(fetchers_num=5) # 等待爬虫结束 web_spider.wait_for_finished() return
def go_spider(self): """ test spider """ # initial fetcher / parser / saver fetcher = BookExportingThread.MyFetcher(max_repeat=3, sleep_time=5) parser = BookExportingThread.MyParser(max_deep=2) saver = BookExportingThread.MySaver( save_pipe=open("book_info_list.txt", "w", encoding='utf8')) parser.instance = self # initial web_spider web_spider = spider.WebSpider(fetcher, parser, saver, proxieser=None, url_filter=None, monitor_sleep_time=5) # add start url web_spider.set_start_url(start_url, priority=0, keys={"type": None}, deep=0) # start web_spider web_spider.start_working(fetcher_num=10) # stop web_spider # time.sleep(10) # web_spider.stop_working() # wait for finished web_spider.wait_for_finished(is_over=True) return
    def item_save(self, url, keys, item):
        # NOTE: this snippet is truncated; the method signature and the leading "Total" branch are
        # reconstructed from the elif/else structure below and the file names used, so treat them
        # as assumptions.
        if keys[0] == "Total":
            self.save_pipe_total.write("\t".join([keys[1]] + [str(i) for i in item]) + "\n")
        elif keys[0] == "PerGame":
            self.save_pipe_pergame.write("\t".join([keys[1]] + [str(i) for i in item]) + "\n")
        else:
            return False
        return True


if __name__ == "__main__":
    """ main procedure """
    # initialize fetcher, parser and saver
    fetcher = NBAFetcher(critical_max_repeat=3, critical_sleep_time=0)
    parser = NBAParser(max_deep=-1, max_repeat=3)
    saver = NBASaver(file_name_total="nba_total.txt", file_name_pergame="nba_pergame.txt")

    # initialize the spider and pass in the start url
    nba_spider = spider.WebSpider(fetcher, parser, saver, url_filter=None)
    nba_spider.set_start_url(url_player_index, ("index", ), critical=True)

    # start 10 threads to crawl the data
    nba_spider.start_work_and_wait_done(fetcher_num=10, is_over=True)
    exit()