def __init__(self): """构造函数,初始化属性""" self.urls = UrlManager() self.log = MyLog("spider_main", "logs") self.downloader = HtmlDownloader() self.parser = HtmlParser() self.outputer = HtmlOutputer()
class SpiderMan(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parser_url(root_url, content)
        for url in urls:
            try:
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                           '?Ajax_CallBack=true' \
                           '&Ajax_CallBackType=Mtime.Library.Services' \
                           '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                           '&Ajax_CrossDomain=1' \
                           '&Ajax_RequestUrl=%s' \
                           '&t=%s' \
                           '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1])
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                print('Crawl failed')
        self.output.output_end()
        print('Crawl finish')
class SpiderMain:
    def __init__(self):
        """Initializer: instantiate the other components."""
        self.url_manager = UrlManager()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.data_storage = DataStorage()

    def start(self):
        """
        Main entry point of the spider.
        :return:
        """
        # Page numbers 2..9 of the listing.
        title = set()
        for a in range(2, 10):
            html = self.html_downloader.download(
                'http://ggzy.foshan.gov.cn/jyxx/fss/zfcg_1108551/zbxx/index_' + str(a) + '.html?1')
            _title = self.html_parser.titleParer(html)
            for i in _title:
                title.add(i)
        for i in title:
            print(i)
            html = self.html_downloader.download(i)
            _product = self.html_parser.contextParer(html)
            self.data_storage.storage(_product)
def __init__(self): """ 初始化方法,主要是将其他组件实例化 """ self.url_manager = UrlManager() self.html_downloader = HtmlDownloader() self.html_parser = HtmlParser() self.data_storage = DataStorage()
class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
        self.m.connect()
        self.task = self.m.get_task_queue()
        print(self.task.qsize())
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        import time
        while True:
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('Control node told this worker node to stop...')
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('Worker node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
            except EOFError as e:
                print('Failed to connect to the worker node')
                return
            except Exception as e:
                print(e)
                print('Crawl fail')
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = HtmlOutputer()

    def craw(self, root_url, page_amount=5, time_sleep=None):
        count = 1
        # Add the first URL to crawl.
        self.urls.add_new_url(root_url)
        # While the URL set still has entries, pop one and request it; stop when none are left.
        while self.urls.has_new_url():
            try:
                # Start crawling.
                new_url = self.urls.get_new_url()
                print(f'craw{count}:{new_url}')
                # Request the URL and get the HTML back.
                html_content = self.downloader.download(new_url)
                # Parse the HTML (XPath) to extract the new links and the data we need.
                new_urls, new_data = self.parser.parse(html_content)
                # Queue the <a> links found on this page in the URL manager for later crawling.
                self.urls.add_new_urls(new_urls)
                self.output.collect_data(new_url, new_data)
                count += 1
                if count > page_amount:
                    break
                # Honor the optional delay if given; otherwise keep the 2-second default pause.
                time.sleep(time_sleep or 2)
            except Exception as e:
                print(e)
                print(f'Crawl failed: {new_url}')
        self.output.output_html()
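# Usage sketch (assumed entry point, not part of the original snippet): crawl a handful of
# pages starting from a single entry page; the root URL and delay below are illustrative.
if __name__ == '__main__':
    spider = SpiderMain()
    spider.craw('https://baike.baidu.com/item/Python', page_amount=5, time_sleep=2)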
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, url):
        count = 1
        self.urls.add_new_url(url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                html_cont = self.downloader.download(new_url)
                new_urls, html_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(html_data)
                print("%d craw success : %s" % (count, new_url))
                if count >= 10:
                    break
                count = count + 1
            except Exception as e:
                print(str(e))
                print("%d craw failed : %s" % (count, new_url))
        self.outputer.output()
def craw(self):
    # Download the root page.
    downloader = HtmlDownloader()
    root_cont = downloader.download(self.url)
    parser = HtmlParser()
    urls, data = parser.parse(self.url, root_cont, True)
    result = ""
    for url in urls:
        cont = downloader.download(url)
        newurls, month = parser.parse(url, cont, False)
        if month is not None:
            result += month.getMonthly()
            month = None
            # print(month.getMonthly())
    result = "## 阿里巴巴数据库内核月报\n\n" + result
    with open("阿里巴巴数据库内核组月报.md", "w+", encoding='utf-8') as f:
        f.write(result)
class Spider:
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parse_url(root_url, content)
        for url in urls:
            try:
                # http://service.library.mtime.com/Movie.api
                # ?Ajax_CallBack=true
                # &Ajax_CallBackType=Mtime.Library.Services
                # &Ajax_CallBackMethod=GetMovieOverviewRating
                # &Ajax_CrossDomain=1
                # &Ajax_RequestUrl=http%3A%2F%2Fmovie.mtime.com%2F246526%2F&t=201710117174393728&Ajax_CallBackArgument0=246526
                t = time.strftime('%Y%m%d%H%M%S3282', time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                           '?Ajax_CallBack=true' \
                           '&Ajax_CallBackType=Mtime.Library.Services' \
                           '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                           '&Ajax_CrossDomain=1' \
                           '&Ajax_RequestUrl=%s' \
                           '&t=%s' \
                           '&Ajax_CallbackArgument0=%s' % (url[0].replace('://', '%3A%2F%2F')[:-1], t, url[1])
                rank_content = self.downloader.download(rank_url)
                if rank_content is None:
                    print('None')
                data = self.parser.parse_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                raise e
                # print(e)
                # print('Crawl failed')
        self.output.output_end()
        print('Crawl finish')
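# Usage sketch (assumed, not shown in this snippet): the Mtime rating crawl is normally
# started from a movie listing page; the exact root URL below is an assumption.
if __name__ == '__main__':
    spider = Spider()
    spider.crawl('http://theater.mtime.com/China_Beijing/')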
class SpiderWorker:
    def __init__(self, address='127.0.0.1', port=8001, authkey=b'baike'):
        """Set up this worker node's connection within the distributed crawler."""
        # Register the methods used to fetch the shared queues.
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # Connect to the server.
        print('Connect to server %s:%s...' % (address, port))
        self.manager = BaseManager(address=(address, port), authkey=authkey)
        # Establish the connection.
        self.manager.connect()
        # Obtain the Queue proxies.
        self.task_q = self.manager.get_task_queue()
        self.result_q = self.manager.get_result_queue()
        # Initialize the downloader and parser.
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        while True:
            try:
                if not self.task_q.empty():
                    url = self.task_q.get()
                    if url == 'end':
                        print('Control node told this worker node to stop...')
                        # Pass the stop signal on to the other nodes.
                        self.result_q.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('Worker node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parse(url, content)
                    self.result_q.put({'new_urls': new_urls, 'data': data})
                else:
                    print('task queue is empty', self.task_q.empty())
            except EOFError:
                print('Failed to connect to the worker node')
                return
            except Exception as e:
                print(e)
                print('crawl fail')
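# Usage sketch (assumed): start one worker that pulls URLs from the control node's task
# queue; the address, port and authkey must match the ones served by the control node.
if __name__ == '__main__':
    worker = SpiderWorker(address='127.0.0.1', port=8001, authkey=b'baike')
    worker.crawl()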
class SpiderMain():
    def __init__(self):
        # URL manager
        # self.urls = UrlManager.UrlManager()
        self.urls = UrlManager()
        # URL downloader
        # self.downloader = HtmlDownloader.HtmlDownloader()
        self.downloader = HtmlDownloader()
        # URL parser
        # self.parser = html_parser.HtmlParser()
        self.parser = HtmlParser()
        # self.outputer = html_outputer.HtmlOutputer()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        count = 1
        originSet = set()
        originSet.add(root_url)
        self.urls.add_new_urls(originSet)
        while self.urls.has_new_rul():
            try:
                new_url = self.urls.get_new_url()
                print("craw %d : %s" % (count, new_url))
                html_cont = self.downloader.downloader(new_url)
                # Report the download status.
                downStat = "ERROR"
                if html_cont is not None:
                    downStat = "SUCCESS"
                print("[Page ID : %d downloader %s!]" % (count, downStat))
                new_urls, new_data = self.parser.parser(new_url, html_cont)
                # print("\nnew_urls[%s], new_data[%s]" % (new_urls, new_data))
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 15:
                    break
                count = count + 1
            except Exception as err:
                print("craw failed! ERROR information : %s" % err)
        self.outputer.output_html()
class CodeSpider(object):
    def __init__(self):
        # Instantiate the other module classes.
        # self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.path = "/Users/spike/python_项目/get_cd_school/"
        # # Start URL for the crawl
        # self.root_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages=1'
        # # Prefix used to build follow-up URLs
        # self.split_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages='
        # school info
        # self.school_infos = []

    def craw(self, downloading_url):
        try:
            # Keep track of the URL being downloaded/parsed, to help diagnose errors.
            # downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # First argument: the HTML to parse.
            # Second argument: the URL prefix used for joining URLs.
            self.school_infos = self.html_parser.province_parser(html_content)
            # print(self.school_infos)
            # exit()
            if len(self.school_infos) != 20:
                print(downloading_url + " parsed successfully")
                print("Records on this page: " + str(len(self.school_infos)))
            # print(self.province_url_list)
            with open(self.path + "school.txt", "a") as f:
                # print("writing")
                for mc, xd, qy, xz, dh, dz in self.school_infos:
                    # One school per line, tab-separated fields.
                    f.write(mc + "\t" + xd + "\t" + qy + "\t" + xz + "\t" + dh + "\t" + dz + "\n")
            return len(self.school_infos)
        except Exception as e:
            print('[ERROR] Craw Failed! Url:', downloading_url, 'Info:', e)
            # Use traceback to locate the exception.
            traceback.print_exc()
class CodeSpider(object):
    def __init__(self):
        # Instantiate the other module classes.
        self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        # Start URL for the crawl
        # self.root_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages=1'
        # # Prefix used to build follow-up URLs
        # self.split_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages='
        # # school info
        # self.school_infos = []
        # The log file path needs to be adjusted for your machine.
        # self.last_log_path = "d:\\log.txt"
        # self.last_log_path = "/Users/spike/spider_log.txt"

    def craw(self, downloading_url):
        try:
            # Keep track of the URL being downloaded/parsed, to help diagnose errors.
            # downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # First argument: the HTML to parse.
            self.school_infos = self.html_parser.province_parser(html_content)
            # print(self.school_infos)
            if len(self.school_infos) != 20:
                print(downloading_url + " parsed successfully")
                print("Records on this page: " + str(len(self.school_infos)))
            for mc, xd, qy, xz, dh, dz in self.school_infos:
                # print(mc + xd + qy + xz + dh + dz)
                province_id = self.mysql_handler.insert(mc, xd, qy, xz, dh, dz)
                # print(province_id)
                # exit()
            # Keep track of the URL being downloaded/parsed, to help diagnose errors.
            # self.mysql_handler.close()
            return len(self.school_infos)
        except Exception as e:
            print('[ERROR] Craw Failed! Url:', downloading_url, 'Info:', e)
            # Use traceback to locate the exception.
            traceback.print_exc()
            time.sleep(60)
class SpiderMain:
    def __init__(self):
        self.url_manager = UrlManager()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.data_storage = DataStorage()

    def start(self):
        """
        Main entry point of the spider.
        :return:
        """
        self.url_manager.add_new_url(
            "http://127.0.0.1:8848/xiaomi-master/index.html")
        # Get a URL from the URL manager.
        url = self.url_manager.get_new_url()
        # Download the retrieved URL with the downloader.
        html = self.html_downloader.download(url)
        # Parse the HTML.
        res = self.html_parser.parser(html)
        # Store the data.
        self.data_storage.storage(res)
class SpiderMain(object): """docstring for SpiderMain""" def __init__(self): self.urlManage = UrlManage() self.downloader = HtmlDownloader() self.parser = HtmlParser() self.outputer = HtmlOutputer() def craw(self,url): self.urlManage.add_new_url(url) count = 1 while self.urlManage.has_new_url(): url = self.urlManage.get_new_url() print '%dth page,address:%s' % (count,url) html_content = self.downloader.downloadPage(url) new_urls,new_data = self.parser.parse(html_content,url) self.urlManage.add_new_urls(new_urls) self.outputer.collect_data(new_data) if count == 10: break count = count + 1 self.outputer.output_html()
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()                            # fetch a new URL
                html_cont = self.downloader.download(new_url)                # download its content
                new_urls, new_data = self.parser.parse(new_url, html_cont)   # parse the content
                self.urls.add_new_urls(new_urls)                             # feed newly found URLs back into the manager
                self.outputer.collect_data(new_data)                         # collect the parsed data
                if count == 200:
                    break
                count = count + 1
            except Exception:
                print("craw failed")
        self.outputer.output_html()
class UrlManager(object):
    def __init__(self):
        args = ArgumentParser()
        index_start = 1
        try:
            with open(args.index_end_path, 'r', encoding='utf-8') as f:
                index_end = int(f.readline().strip('\n'))
        except Exception as e:
            print(e)
            sys.exit(-1)
        self.new_urls = set()
        print("Adding all urls ...")
        for index in range(index_start, index_end):
            url = "https://baike.baidu.com/view/" + str(index)
            self.new_urls.add(url)
        print("Done.")
        self.old_urls = set()
        self.fail_urls = set()
        self.fail_url_mark = True
        self.downloader = HtmlDownloader()
        # self.update_new_url(index_end, args)

    def update_new_url(self, index_end, args):
        err_cnt = 0
        start = index_end
        end = start
        while True:
            if err_cnt > 10:
                break
            url = "https://baike.baidu.com/view/" + str(start)
            response, response_url = self.downloader.download_update(url)
            if not response_url or response_url == 'https://baike.baidu.com/error.html':
                err_cnt += 1
            else:
                err_cnt = 0
                self.new_urls.add(url)
                end = start
            start += 1
        with open(args.index_end_path, 'w', encoding='utf-8') as f:
            f.write(str(end))

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_old_url(self, url):
        if url is None:
            return
        if url not in self.new_urls:
            self.old_urls.add(url)

    def add_fail_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.fail_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            if url not in self.old_urls:
                self.new_urls.add(url)

    def has_new_url(self):
        if len(self.new_urls) != 0:
            return True
        elif self.fail_url_mark:
            self.new_urls = self.fail_urls.copy()
            self.fail_urls.clear()
            self.fail_url_mark = False
            return True
        else:
            return False

    def get_new_url(self):
        new_url = self.new_urls.pop()
        return new_url
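# Usage sketch (assumed driver loop, not in the original): drain the managed URL set and
# sort results into old/failed URLs. Constructing UrlManager requires the index_end_path
# file read in __init__; HtmlDownloader.download is assumed to return None on failure.
if __name__ == '__main__':
    manager = UrlManager()
    downloader = HtmlDownloader()
    while manager.has_new_url():
        url = manager.get_new_url()
        page = downloader.download(url)
        if page is None:
            manager.add_fail_url(url)
        else:
            manager.add_old_url(url)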
class SpiderMain(): """爬虫程序主模块""" def __init__(self): """构造函数,初始化属性""" self.urls = UrlManager() self.log = MyLog("spider_main", "logs") self.downloader = HtmlDownloader() self.parser = HtmlParser() self.outputer = HtmlOutputer() #self.util=utill.DBConn() def craw(self, root_url): """爬虫入口函数""" areas = { "gulou": 100, "jianye": 72, "qinhuai": 100, "xuanwu": 67, "yuhuatai": 32, "qixia": 62, "baijiahu": 33, "chalukou1": 26, "jiangningqita11": 3, "dongshanzhen": 29, "jiangningdaxuecheng": 15, "jiulonghu": 12, "jiangjundadao11": 22, "kexueyuan": 9, "qilinzhen": 42, "tiexinqiao": 9, "pukou": 100, "liuhe": 1, } #areas = {"gulou":1} #1、抓取所有二手房详情界面链接,并将所有连接放入URL管理模块 for area, pg_sum in areas.items(): for num in range(1, pg_sum + 1): #1.1 拼接页面地址: https://nj.lianjia.com/ershoufang/gulou/pg2/ pg_url = root_url + area + "/pg" + str(num) + "/" self.log.logger.info("1.1 拼接页面地址:" + pg_url) print("1.1 拼接页面地址:" + pg_url) #1.2 启动下载器,下载页面. try: html_cont = self.downloader.download(pg_url) except Exception as e: self.log.logger.error("1.2 下载页面出现异常:" + repr(e)) time.sleep(60 * 30) else: #1.3 解析PG页面,获得二手房详情页面的链接,并将所有链接放入URL管理模块 try: ershoufang_urls = self.parser.get_erhoufang_urls( html_cont) except Exception as e: self.log.logger.error("1.3 页面解析出现异常:" + repr(e)) else: self.urls.add_new_urls(ershoufang_urls) #暂停0~3秒的整数秒,时间区间:[0,3] time.sleep(random.randint(0, 3)) time.sleep(60 * 20) #2、解析二手房具体细心页面 id = 1 stop = 1 while self.urls.has_new_url(): #2.1 获取url try: detail_url = self.urls.get_new_url() self.log.logger.info("2.1 二手房页面地址:" + detail_url) print("2.1 二手房页面地址:" + detail_url) except Exception as e: print("2.1 拼接地址出现异常") self.log.logger.error("2.1 拼接地址出现异常:" + detail_url) #2.2 下载页面 try: detail_html = self.downloader.download(detail_url) except Exception as e: self.log.logger.error("2.2 下载页面出现异常:" + repr(e)) self.urls.add_new_url(detail_url) time.sleep(60 * 30) else: #2.3 解析页面 try: ershoufang_data = self.parser.get_ershoufang_data( detail_html, id) except Exception as e: self.log.logger.error("2.3 解析页面出现异常:" + repr(e)) else: #2.4 输出数据 try: self.outputer.collect_data(ershoufang_data) except Exception as e: self.log.logger.error("2.4 输出数据出现异常:" + repr(e)) else: print(id) id = id + 1 stop = stop + 1 #暂停0~3秒的整数秒,时间区间:[0,3] time.sleep(random.randint(0, 3)) if stop == 2500: stop = 1 time.sleep(60 * 20)
class LinkExtractor(object):
    def __init__(self):
        self.counter = 0
        self.k_count = 0
        self.downloader = HtmlDownloader()

    def get_menu_page_info(self, menu_page_url):
        if menu_page_url is None:
            return None
        html_text = self.downloader.download(menu_page_url)
        if html_text is None:
            return None
        self.counter = (self.counter + 1) % 100
        if self.counter == 0:
            self.k_count += 1
            print('Got menu pages: %d00' % (self.k_count))
        return self.parse_menu_page_info(html_text)

    def parse_menu_page_info(self, html_text):
        if html_text is None:
            return None
        soup = BeautifulSoup(html_text, 'lxml')
        menu_page_data = []
        for entry in soup.select('.r-ent'):
            data = {
                'title': entry.select('.title')[0].text.strip(),
                'post_url': PTT_HOST_URL + entry.select('.title > a')[0].get('href') if entry.select('.title > a') else None,
                'date': entry.select('.date')[0].text.strip(),
                'author': entry.select('.author')[0].text.strip(),
                'visited': 0
            }
            menu_page_data.append(data)
        return menu_page_data

    # Fetch the post links into the post_url_infos table.
    def fetch_menu_page_links(self, menu_page_url):
        menu_page_data = self.get_menu_page_info(menu_page_url)
        if menu_page_data is not None:
            url_manager.add_new_url_infos(menu_page_data)

    def next_page(self, html_text):
        soup = BeautifulSoup(html_text, 'lxml')
        if soup.find_all('a', class_='btn wide', text='下頁 ›'):
            return PTT_HOST_URL + soup.find_all('a', class_='btn wide', text='下頁 ›')[0].get('href')
        return None

    def run(self, root_menu_page, min_menu_page_index=1, max_menu_page_index=6000, threadNum=5):
        print('===================== start run extractor() ========================')
        try:
            pool = threadpool.ThreadPool(threadNum)
            menu_page_urls = [root_menu_page.format(i) for i in range(min_menu_page_index, max_menu_page_index)]
            requests = threadpool.makeRequests(self.fetch_menu_page_links, menu_page_urls)
            [pool.putRequest(req) for req in requests]
            pool.wait()
            print('link extractor done.')
        except:
            print('link_extractor exception')
            raise
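# Usage sketch (assumed; PTT_HOST_URL, url_manager and the board name are not defined in
# this snippet): run the extractor over a range of index pages of a single PTT board.
if __name__ == '__main__':
    extractor = LinkExtractor()
    extractor.run('https://www.ptt.cc/bbs/Gossiping/index{}.html',
                  min_menu_page_index=1, max_menu_page_index=100, threadNum=5)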
class CodeSpider(object):
    def __init__(self):
        # Instantiate the other module classes.
        self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        # Start URL for the crawl.
        self.root_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
        # Prefix used to build follow-up URLs.
        self.split_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
        # Province page list.
        self.province_url_list = []
        # City page list.
        self.city_url_list = []
        # County/district page list.
        self.county_url_list = []
        # Town/street page list.
        self.town_url_list = []
        self.last_log_path = "d:\\log.txt"

    def craw(self):
        try:
            # Keep track of the URL being downloaded/parsed, to help diagnose errors.
            downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # First argument: the HTML to parse.
            # Second argument: the URL prefix used for joining URLs.
            self.province_url_list = self.html_parser.province_parser(html_content, self.split_url)
            # print(self.province_url_list)
            pro = self.province_url_list
            # print(self.province_url_list[0][0])
            with open(self.last_log_path, "r") as r:
                last_log = r.read()
            # print(last_log)
            if last_log != "":
                last_log_index = pro.index(tuple(last_log.split(';')))
                # print("inde:" + str(last_log_index))
                for i in range(last_log_index):
                    del self.province_url_list[0]
                print("Provinces left after removing the ones already downloaded: "
                      + str(len(self.province_url_list)) + " (out of 31)")
                # print(self.province_url_list)
                # exit()
            # else:
            #     print("Download starts, total: " + str(len(pro)))
            # print(last_log_index)
            # exit()
            for province_name, province_url, province_code in self.province_url_list:
                # print(province_code)
                # Record the last province being downloaded.
                last_record = (province_name, province_url, province_code)
                # print(last_record)
                with open(self.last_log_path, "w") as l:
                    # last_name = province_name.encode('utf8')
                    l.write(last_record[0] + ";" + last_record[1] + ";" + last_record[2])
                # exit()
                province_id = self.mysql_handler.insert(province_code + '0000000000', province_name)
                # print(province_id)
                # Keep track of the URL being downloaded/parsed, to help diagnose errors.
                downloading_url = province_url
                html_content = self.html_downloader.download(downloading_url)
                self.city_url_list = self.html_parser.city_parser(html_content, self.split_url)
                for city_name, city_url, city_code in self.city_url_list:
                    city_id = self.mysql_handler.insert(city_code, city_name)
                    # Municipalities, for example, have no lower-level page.
                    if city_url is None:
                        continue
                    # Keep track of the URL being downloaded/parsed, to help diagnose errors.
                    downloading_url = city_url
                    html_content = self.html_downloader.download(downloading_url)
                    self.county_url_list = self.html_parser.county_parser(
                        html_content, self.split_url + province_code + "/")
                    for county_name, county_url, county_code in self.county_url_list:
                        county_id = self.mysql_handler.insert(county_code, county_name)
                        if county_url is None:
                            continue
                        # Keep track of the URL being downloaded/parsed, to help diagnose errors.
                        downloading_url = county_url
                        html_content = self.html_downloader.download(downloading_url)
                        self.town_url_list = self.html_parser.town_parser(html_content, self.split_url)
                        for town_name, town_url, town_code in self.town_url_list:
                            # Print the name, link (not actually needed) and code of each town/street.
                            print(town_name, town_url, town_code)
                            self.mysql_handler.insert(town_code, town_name)
            self.mysql_handler.close()
        except Exception as e:
            print('[ERROR] Craw Failed! Url:', downloading_url, 'Info:', e)
            # Use traceback to locate the exception.
            traceback.print_exc()
            time.sleep(60)
            return self.craw()
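# Usage sketch (assumed): a single call is enough, because craw() resumes from the province
# recorded in the log file and calls itself again after an error.
if __name__ == '__main__':
    CodeSpider().craw()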
from html_downloader import HtmlDownloader
from html_paraser import HtmlParser
import pymysql
from date_provider import getAllDayPerYear
import time

conn = pymysql.connect(host='192.168.64.135', port=3306, user='******', passwd='123456', db='comp')
cursor = conn.cursor()

if __name__ == '__main__':
    hd = HtmlDownloader()
    hp = HtmlParser()
    province = 'zhejiang'
    for year in range(2019, 1949, -1):
        print(year)
        year_date_list = getAllDayPerYear(year)
        # print(year_date_list)
        for comregdate in year_date_list:
            print(comregdate)
            errcnt = 0
            pagecnt_tmp = 0
            for pagecnt in range(0, 1000):
                url = r'https://gongshang.mingluji.com/' + province + r'/riqi/' + comregdate + r'?page=' + str(pagecnt)
class CodeSpider(object):
    def __init__(self):
        # Instantiate the other module classes.
        # self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.path = "D:\\python_work\\get_diqu_dm\\"
        # Start URL for the crawl.
        self.root_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
        # Prefix used to build follow-up URLs.
        self.split_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
        # Province page list.
        self.province_url_list = []
        # City page list.
        self.city_url_list = []
        # County/district page list.
        self.county_url_list = []
        # Town/street page list.
        self.town_url_list = []

    def craw(self):
        try:
            # Keep track of the URL being downloaded/parsed, to help diagnose errors.
            downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # First argument: the HTML to parse.
            # Second argument: the URL prefix used for joining URLs.
            self.province_url_list = self.html_parser.province_parser(html_content, self.split_url)
            # print(self.province_url_list)
            with open(self.path + "shen_daima.txt", "a") as f:
                for province_name, province_url, province_code in self.province_url_list:
                    province_code = province_code + '0000000000'
                    f.write(province_code + "\t" + province_name + "\n")
                    # First argument: 1 - insert a province; 2 - a city; 3 - a county; 4 - a town/street.
                    # Second argument: the province/city/county/street name.
                    # Third argument: the parent id (a province has no parent id).
                    # Fourth argument: the administrative-division code of the city/county/street.
                    # province_id = self.mysql_handler.insert(1, province_name, None, None)
                    # Keep track of the URL being downloaded/parsed, to help diagnose errors.
                    downloading_url = province_url
                    html_content = self.html_downloader.download(downloading_url)
                    self.city_url_list = self.html_parser.city_parser(html_content, self.split_url)
                    with open(self.path + "other_daima.txt", "a") as o:
                        for city_name, city_url, city_code in self.city_url_list:
                            o.write(city_code + "\t" + city_name + "\n")
                            # city_id = self.mysql_handler.insert(2, city_name, province_id, city_code)
                            # Municipalities, for example, have no lower-level page.
                            if city_url is None:
                                continue
                            # Keep track of the URL being downloaded/parsed, to help diagnose errors.
                            downloading_url = city_url
                            html_content = self.html_downloader.download(downloading_url)
                            self.county_url_list = self.html_parser.county_parser(
                                html_content, self.split_url + province_code + "/")
                            for county_name, county_url, county_code in self.county_url_list:
                                o.write(county_code + "\t" + county_name + "\n")
                                # county_id = self.mysql_handler.insert(3, county_name, city_id, county_code)
                                if county_url is None:
                                    continue
                                # Keep track of the URL being downloaded/parsed, to help diagnose errors.
                                print('To deal with county')
                                downloading_url = county_url
                                html_content = self.html_downloader.download(downloading_url)
                                self.town_url_list = self.html_parser.town_parser(html_content, self.split_url)
                                for town_name, town_url, town_code in self.town_url_list:
                                    # Write the name, link (not actually needed) and code of each town/street.
                                    o.write(town_code + "\t" + town_name + "\n")
                                    print(town_name, town_url, town_code)
                                    # self.mysql_handler.insert(4, town_name, county_id, town_code)
            # self.mysql_handler.close()
        except Exception as e:
            print('[ERROR] Craw Failed! Url:', downloading_url, 'Info:', e)
            # Use traceback to locate the exception.
            traceback.print_exc()