class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, url):
        count = 1
        self.urls.add_new_url(url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                html_cont = self.downloader.download(new_url)
                new_urls, html_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(html_data)
                print "%d craw success : %s" % (count, new_url)
                if count >= 10:
                    break
                count = count + 1
            except Exception as e:
                print str(e)
                print "%d craw failed : %s" % (count, new_url)
        self.outputer.output()
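# All of the spider "main" classes in this collection drive a UrlManager with
# roughly the same interface (add_new_url / add_new_urls / has_new_url /
# get_new_url). The managers themselves are not shown here, so the following is
# only a minimal in-memory sketch, assuming the usual two-set implementation.
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url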
def url_manager_proc(self, url_q: Queue, conn_q: Queue, root_url):
    print('url manager process start...')
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    print('url manager process started...')
    while True:
        while url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            print('new_url', new_url)
            # Send the new URL to a crawl node
            url_q.put(new_url)
            # Stop after 2000 links have been crawled and save progress
            if url_manager.old_url_size() > 2000:
                # Notify the crawl nodes that the job is finished
                url_q.put('end')
                print('control node issued the end notice')
                # Shut down the manager node and persist the URL set state
                url_manager.save_process(path.join('dist', 'new_urls.txt'), url_manager.new_urls)
                url_manager.save_process(path.join('dist', 'old_urls.txt'), url_manager.old_urls)
                return
        # Add the URLs received from result_solve_proc to the URL manager
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except BaseException as e:
            time.sleep(0.1)
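# The manager above only defines one side of the master/worker protocol: it pushes
# URLs (and an 'end' sentinel) onto url_q and expects batches of newly discovered
# URLs back on conn_q. A hedged sketch of the crawl-node loop on the other side,
# assuming the HtmlDownloader/HtmlParser components used elsewhere in this section:
def crawl_node_proc(url_q, conn_q):
    downloader = HtmlDownloader()
    parser = HtmlParser()
    while True:
        url = url_q.get()
        if url == 'end':              # sentinel sent by url_manager_proc
            print('crawl node received the end notice')
            return
        try:
            html = downloader.download(url)
            new_urls, data = parser.parse(url, html)
            conn_q.put(new_urls)      # hand discovered URLs back to the manager
        except Exception as e:
            print('crawl failed:', url, e)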
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = HtmlOutputer()

    def craw(self, root_url, page_amount=5, time_sleep=None):
        count = 1
        # Add the first URL to crawl
        self.urls.add_new_url(root_url)
        # While the set still holds URLs, take one and request it; stop when none are left
        while self.urls.has_new_url():
            try:
                # Start crawling
                new_url = self.urls.get_new_url()
                print(f'craw{count}:{new_url}')
                # Request the URL and get the HTML back
                html_content = self.downloader.download(new_url)
                # Parse the HTML with XPath and extract the needed data
                new_urls, new_data = self.parser.parse(html_content)
                # Add the <a> links found on this entry page to the URL manager for later crawling
                self.urls.add_new_urls(new_urls)
                self.output.collect_data(new_url, new_data)
                count += 1
                if count > page_amount:
                    break
                time.sleep(2)
            except Exception as e:
                print(e)
                print(f'craw failed: {new_url}')
        self.output.output_html()
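# HtmlDownloader is referenced throughout this section but never shown. A minimal
# requests-based sketch with the download(url) -> html signature used above; the
# User-Agent header and encoding handling are assumptions, not the original code.
import requests

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        headers = {'User-Agent': 'Mozilla/5.0'}   # placeholder UA, adjust as needed
        resp = requests.get(url, headers=headers, timeout=10)
        if resp.status_code != 200:
            return None
        resp.encoding = resp.apparent_encoding    # best-effort charset detection
        return resp.text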
class Scheduler(object):
    def __init__(self):
        self.url_manager = UrlManager()
        self.downloader = Downloader()
        self.parser = Parser()
        self.data_output = DadaOutput()

    def crawl(self, start_url, max_page):
        self.url_manager.add_new_url(start_url)
        while self.url_manager.has_url() and self.url_manager.old_url_size() < max_page:
            page_url = self.url_manager.get_new_url()
            page_html = self.downloader.down(page_url)
            new_urls, new_data = self.parser.parse(start_url, page_html)
            self.url_manager.add_new_urls(new_urls)
            self.data_output.store_data(new_data)
            self.data_output.output_html()
            print('record %s written' % (self.url_manager.old_url_size()))
class SpiderMain():
    def __init__(self):
        # URL manager
        # self.urls = UrlManager.UrlManager()
        self.urls = UrlManager()
        # URL downloader
        # self.downloader = HtmlDownloader.HtmlDownloader()
        self.downloader = HtmlDownloader()
        # URL parser
        # self.parser = html_parser.HtmlParser()
        self.parser = HtmlParser()
        # self.outputer = html_outputer.HtmlOutputer()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        count = 1
        originSet = set()
        originSet.add(root_url)
        self.urls.add_new_urls(originSet)
        while self.urls.has_new_rul():
            try:
                new_url = self.urls.get_new_url()
                print "craw %d : %s" % (count, new_url)
                html_cont = self.downloader.downloader(new_url)
                # Report whether the download succeeded
                downStat = "ERROR"
                if html_cont != None:
                    downStat = "SUCCESS"
                print "[Page ID : %d downloader %s!]" % (count, downStat)
                new_urls, new_data = self.parser.parser(new_url, html_cont)
                # print "\nnew_urls[%s], new_data[%s]" % (new_urls, new_data)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 15:
                    break
                count = count + 1
            except Exception as err:
                print "craw failed! ERROR information : %s" % err
        self.outputer.output_html()
class Spider():
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HTMLDownloader()
        self.parser = HTMLParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_urls_size() < 50:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("crawled %s links so far" % self.manager.old_urls_size())
            except Exception as e:
                print(e)
        self.output.output_html()
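# A hedged sketch of the DataOutput component assumed above: store_data() buffers
# parsed records and output_html() dumps them into a simple HTML table. The output
# file name and the one-column layout are assumptions, not the original module.
class DataOutput(object):
    def __init__(self):
        self.datas = []

    def store_data(self, data):
        if data is not None:
            self.datas.append(data)

    def output_html(self, path='output.html'):
        with open(path, 'w', encoding='utf-8') as fout:
            fout.write('<html><body><table>\n')
            for data in self.datas:
                fout.write('<tr><td>%s</td></tr>\n' % data)
            fout.write('</table></body></html>\n')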
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downlaoder = HtmlDownlaoder()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        count = 1
        # Add the root URL to the URL manager
        self.urls.add_url(root_url)
        # Page-crawling loop
        while self.urls.has_new_url():
            try:
                # Take one URL that is waiting to be crawled
                new_url = self.urls.get_new_url()
                print('craw %d: %s' % (count, new_url))
                # Download the content of this URL
                html_cont = self.downlaoder.download(new_url)
                # Parse the downloaded content to get new URLs and new data
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                # Add the newly found URLs to the URL manager
                self.urls.add_new_urls(new_urls)
                # Hand the new data to the outputer
                self.outputer.collect_data(new_data)
                if count == 100:
                    break
                count += 1
            except Exception as e:
                print('craw failed')
        self.outputer.output_html()
def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    print(url_q)
    while True:
        while url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            url_q.put(new_url)
            print('old_url=%s' % url_manager.old_url_size())
            if url_manager.old_url_size() > 2000:
                url_q.put('end')
                print('control node issued the end notice!')
                url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                return
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except BaseException as e:
            time.sleep(0.1)
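# save_progress is called above but not shown. In the real code it would be a
# UrlManager method; a hedged stand-alone sketch that serializes each URL set with
# pickle so a later run can resume (the pickle format itself is an assumption):
import os
import pickle

def save_progress(path, data):
    # Persist a URL set (new_urls or old_urls) to disk.
    with open(path, 'wb') as f:
        pickle.dump(data, f)

def load_progress(path):
    # Reload a previously saved URL set, or start with an empty one.
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    return set()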
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()  # fetch a new URL
                html_cont = self.downloader.download(new_url)  # download its content
                new_urls, new_data = self.parser.parse(new_url, html_cont)  # parse the content
                self.urls.add_new_urls(new_urls)  # put the newly parsed URLs into the URL manager
                self.outputer.collect_data(new_data)  # collect the parsed data
                if count == 200:
                    break
                count = count + 1
            except:
                print("craw failed")
        self.outputer.output_html()
class SpiderMain():
    """Main module of the crawler."""

    def __init__(self):
        """Constructor: initialize the components."""
        self.urls = UrlManager()
        self.log = MyLog("spider_main", "logs")
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()
        # self.util = utill.DBConn()

    def craw(self, root_url):
        """Crawler entry point."""
        areas = {
            "gulou": 100, "jianye": 72, "qinhuai": 100, "xuanwu": 67,
            "yuhuatai": 32, "qixia": 62, "baijiahu": 33, "chalukou1": 26,
            "jiangningqita11": 3, "dongshanzhen": 29, "jiangningdaxuecheng": 15,
            "jiulonghu": 12, "jiangjundadao11": 22, "kexueyuan": 9,
            "qilinzhen": 42, "tiexinqiao": 9, "pukou": 100, "liuhe": 1,
        }
        # areas = {"gulou": 1}

        # 1. Collect all second-hand-house detail page links and add them to the URL manager
        for area, pg_sum in areas.items():
            for num in range(1, pg_sum + 1):
                # 1.1 Build the listing page URL, e.g. https://nj.lianjia.com/ershoufang/gulou/pg2/
                pg_url = root_url + area + "/pg" + str(num) + "/"
                self.log.logger.info("1.1 built listing page URL: " + pg_url)
                print("1.1 built listing page URL: " + pg_url)
                # 1.2 Start the downloader and fetch the page
                try:
                    html_cont = self.downloader.download(pg_url)
                except Exception as e:
                    self.log.logger.error("1.2 exception while downloading page: " + repr(e))
                    time.sleep(60 * 30)
                else:
                    # 1.3 Parse the listing page, collect the detail-page links and add them to the URL manager
                    try:
                        ershoufang_urls = self.parser.get_erhoufang_urls(html_cont)
                    except Exception as e:
                        self.log.logger.error("1.3 exception while parsing page: " + repr(e))
                    else:
                        self.urls.add_new_urls(ershoufang_urls)
                        # Sleep a whole number of seconds in the range [0, 3]
                        time.sleep(random.randint(0, 3))
        time.sleep(60 * 20)

        # 2. Parse the individual detail pages
        id = 1
        stop = 1
        while self.urls.has_new_url():
            # 2.1 Get a URL
            try:
                detail_url = self.urls.get_new_url()
                self.log.logger.info("2.1 detail page URL: " + detail_url)
                print("2.1 detail page URL: " + detail_url)
            except Exception as e:
                print("2.1 exception while getting the URL")
                self.log.logger.error("2.1 exception while getting the URL: " + detail_url)
            # 2.2 Download the page
            try:
                detail_html = self.downloader.download(detail_url)
            except Exception as e:
                self.log.logger.error("2.2 exception while downloading page: " + repr(e))
                self.urls.add_new_url(detail_url)
                time.sleep(60 * 30)
            else:
                # 2.3 Parse the page
                try:
                    ershoufang_data = self.parser.get_ershoufang_data(detail_html, id)
                except Exception as e:
                    self.log.logger.error("2.3 exception while parsing page: " + repr(e))
                else:
                    # 2.4 Output the data
                    try:
                        self.outputer.collect_data(ershoufang_data)
                    except Exception as e:
                        self.log.logger.error("2.4 exception while outputting data: " + repr(e))
                    else:
                        print(id)
                        id = id + 1
                        stop = stop + 1
            # Sleep a whole number of seconds in the range [0, 3]
            time.sleep(random.randint(0, 3))
            if stop == 2500:
                stop = 1
                time.sleep(60 * 20)
class spider_main():
    def __init__(self):
        self.urls = UrlManager()
        self.parser = HtmlParser()
        self.downloader = UrlDownloader()
        self.log = MyLog("spider", "logs")
        self.output = HtmlOutPut()

    # Crawl entry point of the main module
    def Crawling(self, root_url):
        # Map each area name to its number of listing pages
        areas = {
            "gulou": 100, "jianye": 100, "qinhuai": 100, "xuanwu": 100,
            "yuhuatai": 100, "qixia": 100, "baijiahu": 64, "jiangningqita11": 5,
            "chalukou1": 63, "dongshanzhen": 42, "jiangningdaxuecheng": 28,
            "jiulonghu": 28, "jiangjundadao11": 50, "kexueyuan": 16,
            "pukou": 100, "liuhe": 13, "lishui": 9, "jiangning": 100,
            "qilinzhen": 83, "tangshanzhen": 2, "fenghuangxijie1": 82,
            "xianlin2": 33, "yaohuamen": 4, "maigaoqiao1": 33, "maqun1": 31,
            "qixiaqita1": 5, "xiaozhuang": 9, "yanziji": 2, "yueyuan": 15,
            "wanshou1": 5, "hongshan1": 16, "caochangmendajie": 27,
            "dinghuaimendajie": 37, "fujianlu": 9, "hanzhongmendajie": 19,
            "huxijie": 15, "jiangdong2": 8, "nanhu4": 38, "nanyuan2": 38,
            "shuiximen1": 13, "wandaguangchang1": 25, "xiaoxing": 13,
            "yuhuaxincun": 15, "lukou": 14, "dingshanjiedao": 8,
            "gaoxinqu2": 12, "jiangpujiedao": 29, "pukouqita11": 8,
            "qiaobei": 100, "taishanjiedao": 12
        }
        # Build every listing URL by concatenation and save all of the links
        for area, num in areas.items():
            for n in range(1, num + 1):
                # Build the URL, base: https://nj.lianjia.com/ershoufang/
                splice_url = root_url + area + "/pg" + str(n) + "/"
                # Write the built URL to the log
                self.log.logger.info("built url " + splice_url)
                # Print to the console
                print("built url " + splice_url)
                # Download the page once the URL is built
                try:
                    html_down = self.downloader.download(splice_url)
                except Exception as e:
                    # Log the error
                    self.log.logger.error("html download error " + repr(e))
                    # Back off for a while
                    time.sleep(60)
                else:
                    # If the download succeeded, parse the page
                    try:
                        secondhome_urls = self.parser.get_secondhandhome_urls(html_down)
                    except Exception as e:
                        # Log the error
                        self.log.logger.error("html page parse error " + repr(e))
                    else:
                        # Parsing succeeded
                        self.urls.add_new_urls(secondhome_urls)
                        # time.sleep(random.randint(0,3))
        time.sleep(60)

        # Parse each detail page and extract the needed data
        id = 1  # starting id
        stop = 1
        while self.urls.isEmpty_new_urls():
            # Take a URL
            try:
                temp_url = self.urls.get_new_url()
                # Print to the console
                print("html page URL " + temp_url)
                # Write to the log
                self.log.logger.info("html page URL " + temp_url)
            except Exception as e:
                # Log the error and print to the console
                print("failed to get html page URL " + temp_url)
                self.log.logger.error("error getting url " + repr(e))
            # URL fetched, download the page
            try:
                temp_data = self.downloader.download(temp_url)
            except Exception as e:
                # Print to the console
                print("page download failed " + temp_url)
                # Log the error
                self.log.logger.error("page download failed " + repr(e))
                self.urls.add_new_url(temp_url)
                time.sleep(10)
            else:
                # Download succeeded, parse the page
                try:
                    temp_parser = self.parser.get_secondhandhome_data(temp_data, id)
                except Exception as e:
                    self.log.logger.error("html page parse error " + repr(e))
                    print("html page parse error " + repr(e))
                else:
                    # Parsing succeeded, write the data out
                    try:
                        self.output.write_data(temp_parser)
                    except Exception as e:
                        self.log.logger.error("data write error " + repr(e))
                        print("data write error " + repr(e))
                    else:
                        print(id)
                        id = id + 1
                        stop = stop + 1
            time.sleep(0.2)
            if stop == 2500:
                stop = 1
                time.sleep(60)
class GrabMain(object):
    def __init__(self, url):
        self.root_url = url
        self.urlManager = UrlManager()
        self.dLoader = HtmlDLoader()
        self.contParser = HtmlParser()
        self.contOutputer = HtmlOutputer()

    def grabText(self):
        if self.root_url is None:
            return
        self.urlManager.add_new_next_url(self.root_url)
        self.contParser.parser_set(None, None, None, None, None)
        while self.urlManager.get_new_next_count():
            try:
                new_url = self.urlManager.get_new_next_url()
                html_cont = self.dLoader.download(new_url)
                urls, nexts = self.contParser.parser_text_urls(html_cont)
                self.urlManager.add_new_next_urls(nexts)
                self.urlManager.add_new_urls(urls)
            except:
                print "url is error."
        pool = threadpool.ThreadPool(10)
        requests = threadpool.makeRequests(self.thread_grabText, self.urlManager.new_urls)
        [pool.putRequest(req) for req in requests]
        pool.wait()

    def thread_grabText(self, url):
        try:
            print "curr url is %s." % url
            html_cont = self.dLoader.download(url)
            title, cont = self.contParser.parser_text_cont(html_cont)
            self.contOutputer.output_cont(title, cont)
        except:
            print "url is %s, error." % url

    def grabImgs(self):
        if self.root_url is None:
            return None
        self.urlManager.add_new_next_url(self.root_url)
        self.contParser.parser_set(None, None, None, None, None)
        while self.urlManager.get_new_next_count():
            try:
                new_url = self.urlManager.get_new_next_url()
                html_cont = self.dLoader.download(new_url)
                urls, nexts = self.contParser.parser_text_urls(html_cont)
                self.urlManager.add_new_next_urls(nexts)
                self.urlManager.add_new_urls(urls)
            except:
                print "url is error."
        pool = threadpool.ThreadPool(10)
        requests = threadpool.makeRequests(self.thread_grabImg, self.urlManager.new_urls)
        [pool.putRequest(req) for req in requests]
        pool.wait()

    def thread_grabImg(self, url):
        try:
            print "curr url is %s." % url
            html_cont = self.dLoader.download(url)
            title, links = self.contParser.parser_img_cont(html_cont)
            if links is None or len(links) == 0:
                print "url is %s, not src." % url
                return None
            if title is None:
                title = time.time()
            try:
                if not os.path.isdir(title):
                    os.mkdir(title)
            except:
                title = time.time()
                if not os.path.isdir(title):
                    os.mkdir(title)
            params = []
            index = 0
            for link in links:
                params.append(([title, link, index], None))
                index += 1
            pool = threadpool.ThreadPool(12)
            requests = threadpool.makeRequests(self.contOutputer.output_img, params)
            [pool.putRequest(req) for req in requests]
            pool.wait()
        except:
            print "url is %s, error." % url
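# The fan-out above depends on the third-party threadpool package. A hedged
# standard-library equivalent, reusing the same thread_grabText(url) worker on a
# GrabMain instance (only the names come from the class above):
from concurrent.futures import ThreadPoolExecutor

def grab_text_concurrently(main, max_workers=10):
    # main is a GrabMain instance whose urlManager.new_urls has already been filled
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        list(pool.map(main.thread_grabText, list(main.urlManager.new_urls)))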
class Wenku():
    def __init__(self):
        self.authority = r'https://www.wenku8.net'
        self.loginurl = r'https://www.wenku8.net/login.php?do=submit&jumpurl=http%3A%2F%2Fwww.wenku8.net%2Findex.php'
        self.pageurl = r"/modules/article/articlelist.php?page="
        self.username = r''
        self.password = r''
        self.formdata = {}
        self.formdata['username'] = self.username
        self.formdata['password'] = self.password
        self.formdata['usecookie'] = '0'
        self.formdata['action'] = r'login'
        self.formdata['submit'] = r'%26%23160%3B%B5%C7%26%23160%3B%26%23160%3B%C2%BC%26%23160%3B'
        self.headers = {}
        self.headers['origin'] = r'https://www.wenku8.net'
        self.headers['referer'] = r'https://www.wenku8.net/login.php?jumpurl=http%3A%2F%2Fwww.wenku8.net%2Findex.php'
        self.headers['upgrade-insecure-requests'] = '1'
        self.headers['user-agent'] = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
        self.session = requests.Session()
        self.manager = UrlManager()
        self.downloader = UrlDownloader()
        self.parser = UrlParser(self.session)

    def login(self):
        response = self.session.post(self.loginurl, data=self.formdata, headers=self.headers)
        if response.status_code == 200:
            return True
        return False

    def parser_some_pages(self, begin, end):
        for turn in range(begin, end):
            index = self.session.get(self.authority + self.pageurl + str(turn))
            if index.status_code != 200:
                print('get page error page num: ' + str(turn) + ' ,error code: ' + str(index.status_code))
                return
            index.encoding = 'gbk'
            self.parser_one_page(index)
            print('parser page ' + str(turn) + ' done!')
            time.sleep(random.random() * 3)

    def parser_one_page(self, index):
        data = self.parser.parser(index, index.text)
        self.manager.add_new_urls(data)

    def save_2_files(self, filename):
        self.manager.save_2_file(filename)

    def Run(self):
        thread_pool = []
        for i in range(0, 10):
            t = threading.Thread(target=self.parser_some_pages, args=(1 + i * 10, 1 + (i + 1) * 10))
            thread_pool.append(t)
        for t in thread_pool:
            t.start()
        for t in thread_pool:
            t.join()
        self.save_2_files('dict.txt')

    def load_and_download(self):
        with open('dict.txt', 'r') as f:
            urls = json.load(fp=f)
        for k, v in urls.items():
            name = (k + '.txt').replace('?', '!')
            self.downloader.download(v[0], name)
            print('download done ' + name)
            time.sleep(random.random() * 3)
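# A hedged driver for the Wenku class above: fill in username/password in
# __init__, log in, crawl the list pages with Run(), then fetch everything
# recorded in dict.txt. Only the method names come from the class; the wiring
# here is an assumption.
if __name__ == '__main__':
    wenku = Wenku()
    if wenku.login():
        wenku.Run()                   # builds dict.txt from the article list pages
        wenku.load_and_download()     # downloads each entry listed in dict.txt
    else:
        print('login failed: check the username/password fields in Wenku.__init__')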
class NewsCrawler:
    def __init__(self):
        self.seed = ['', 'http://news.163.com/']  # NetEase news homepage
        self.downloader = multiThreadDownloader.downloader()
        self.analyze = HtmlAnalyze()
        self.craw_url_man = UrlManager()
        self.page_url_man = UrlManager()
        self.conn = MySQLdb.connect(
            host='localhost', user='******', passwd='toor', db='newsGather', charset='utf8')
        self.cur = self.conn.cursor()
        # Add the URLs already stored in the database to the manager's old_urls
        self.cur.execute("select url from news_info;")
        results = self.cur.fetchall()
        exist_urls = list()
        if results == ():
            pass
        else:
            for i in results:
                exist_urls.append(i[0])
        self.page_url_man.add_old_urls(exist_urls)

    def get_news(self, website):
        # Process the new news URLs held by the URL manager
        news = list()
        dic = dict()
        count = 0
        new_urls = self.page_url_man.get_new_urls(len(self.page_url_man.new_urls))
        print "fetching news pages:"
        pages = self.downloader.download(new_urls, 6)
        print "analyzing news pages and storing news...."
        for page in pages:
            dic = self.analyze.Content(website, page['content'])
            if dic:
                dic['url'] = page['url']
                news.append(dic)
                try:
                    print 'save ', dic['url']
                    sql_raw = "INSERT IGNORE INTO news_info (url, post_time, title, keywords, content, source, origin) VALUES ('%s', '%s', '%s', '%s', '%s', '%s', \"%s\")" % (dic['url'], dic['post_time'], dic['title'], dic['keywords'], raw(dic['content']), dic['source'], dic['origin'])
                    self.cur.execute(sql_raw)
                    self.conn.commit()
                    count += 1
                except:
                    print "save error!"
        print 'news items crawled: %d' % count
        return news

    def craw(self, news_num, website, expand_patt, news_patt):
        # News crawl entry point
        self.craw_url_man.add_new_url(self.seed[website])
        news = list()
        dic = dict()
        count = 0
        i = 0
        while self.craw_url_man.has_new_url():
            print "expansion round %d:" % i
            # print "fetching pages to expand:"
            craw_num = len(self.craw_url_man.new_urls)
            if craw_num < 60:
                new_urls = self.craw_url_man.get_new_urls(craw_num)
            else:
                new_urls = self.craw_url_man.get_new_urls(60)
            pages = self.downloader.download(new_urls, 6)
            print "analyzing pages to expand....."
            for page in pages:
                craw_new_urls = self.analyze.getUrl(page['content'], expand_patt)
                self.craw_url_man.add_new_urls(craw_new_urls)
                page_new_urls = self.analyze.getUrl(page['content'], news_patt)
                # count = count + len(page_new_urls)
                self.page_url_man.add_new_urls(page_new_urls)
            count = len(self.page_url_man.new_urls)
            if count > news_num:
                news += self.get_news(website)
                break
            else:
                i = i + 1
                news += self.get_news(website)
                continue
        return news
class SpiderMain():
    def __init__(self):
        self.urlDownLoader = HtmlDownloader()
        self.htmlParser = HtmlParser()
        self.urlManager = UrlManager()
        self.jsondataParser = JsonData_Parser()
        self.htmlOutPuter = HtmlOutPuter()

    def _get_from_discover_toplist(self, url):
        urls = self.htmlParser.parse(htmlContent=self.urlDownLoader.download(url), type='discover_toplist')
        self.urlManager.add_new_urls(urls)

    def _get_from_discover_artist(self, url):
        urls = self.htmlParser.parse(htmlContent=self.urlDownLoader.download(url), type='discover_artist')
        self.urlManager.add_new_urls(urls)

    def _get_from_artist(self, url):
        results = self.htmlParser.parse(htmlContent=self.urlDownLoader.download(url), type='artist')
        for name, urls in results.items():
            print(name)
            self.urlManager.add_new_urls(urls)

    def _get_from_song(self, url):
        tmp = {}
        name = self.htmlParser.parse(htmlContent=self.urlDownLoader.download(url), type='song')
        print("collecting: " + name)
        comments = self.jsondataParser.parse(self.urlDownLoader.downloadJsonData(url))
        tmp[name] = comments
        self.htmlOutPuter.collect_datas(tmp)

    def _parse_url(self, url):
        res = ''
        SONG = 'song'
        DISCOVER = 'discover'
        ARTIST = 'artist'
        TOPLIST = 'toplist'
        if url.find(DISCOVER) != -1:
            res += DISCOVER
        if url.find(ARTIST) != -1:
            if res != '':
                res += '_' + ARTIST
            else:
                res += ARTIST
        if url.find(TOPLIST) != -1:
            if res != '':
                res += '_' + TOPLIST
            else:
                res += TOPLIST
        if url.find(SONG) != -1:
            res += SONG
        return res

    def craw(self, rootUrl, direction=""):
        if rootUrl.find('#') != -1:
            pos = rootUrl.find('#')
            rootUrl = rootUrl[:pos] + rootUrl[pos + 2:]
        self.urlManager.add_new_url(rootUrl)
        while self.urlManager.has_new_url():
            url = self.urlManager.get_url()
            methodName = '_get_from_' + self._parse_url(url)
            method = getattr(self, methodName, None)
            if method is not None:
                method(url)
        self.htmlOutPuter.output_html(direction=direction)