import time

import zmq


class SpiderMan(object):
    def __init__(self, prefix, root_url):
        self.url_manager = UrlManager(root_url, prefix)
        self.output_manager = OutputManager()

    def start_push(self):
        # Push url_node objects to the worker nodes over a ZeroMQ PUSH socket
        context = zmq.Context()
        sender = context.socket(zmq.PUSH)
        sender.bind("tcp://*:15677")
        while True:
            if self.url_manager.has_url_node():
                url_node = self.url_manager.get_url_node()
                sender.send_pyobj(url_node)
            time.sleep(1)

    def start_pull(self):
        # Collect parse results from the worker nodes over a ZeroMQ PULL socket
        context = zmq.Context()
        receiver = context.socket(zmq.PULL)
        receiver.connect("tcp://localhost:15678")
        while True:
            s = receiver.recv_pyobj()
            new_urls, new_data, url_node = s[0], s[1], s[2]
            if len(new_data) >= 1:
                current = DataNode(url_node, new_data)
                self.output_manager.add(current)
                self.url_manager.add_urls(url_node.id, new_urls)
                print "[" + str(url_node.parent_id) + "-->" + str(url_node.id) + "]" + new_data[0].string
            else:
                print "[" + str(url_node.parent_id) + "-->" + str(url_node.id) + "]" + "--"

    def get_points_and_edges(self):
        return self.output_manager.points_and_edges()
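start_push and start_pull only make sense together with a worker process on the other end of the two sockets. The sketch below shows one plausible worker loop mirroring the bind/connect directions used above (PULL connects to port 15677, PUSH binds port 15678, so it assumes a single worker on the same host); the fetch_and_parse helper is a stand-in for the real download/parse step and is not part of the original code.

import time

import zmq


def worker_loop():
    # Hypothetical worker-side counterpart to SpiderMan above: it pulls
    # url_node objects from the control node's PUSH socket on port 15677
    # and pushes (new_urls, new_data, url_node) tuples back on port 15678,
    # matching what start_pull() unpacks.
    context = zmq.Context()
    receiver = context.socket(zmq.PULL)
    receiver.connect("tcp://localhost:15677")
    sender = context.socket(zmq.PUSH)
    sender.bind("tcp://*:15678")
    while True:
        url_node = receiver.recv_pyobj()
        new_urls, new_data = fetch_and_parse(url_node)  # hypothetical helper
        sender.send_pyobj((new_urls, new_data, url_node))
        time.sleep(1)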
class Crawler(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def get_urls(self, keywords):
        # Build a dict mapping each keyword to its parsed search results
        data = {}
        for word in keywords:
            url = self.crawl(word)
            data[word] = url
        return data

    def crawl(self, word):
        url = self.manager.get_url(word)
        page = self.downloader.download(url)
        return self.parser.search(page)
def url_manager_proc(self, url_q: Queue, conn_q: Queue, root_url):
    print('url manager process start...')
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    print('url manager process started...')
    while True:
        while url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            print('new_url', new_url)
            # Hand the new URL to a worker node
            url_q.put(new_url)
            # Stop condition: shut down after 2000 links have been crawled and save progress
            if url_manager.old_url_size() > 2000:
                # Notify the crawler nodes that work is finished
                url_q.put('end')
                print('control node issued the end notification')
                # Shut down the manager node and persist the URL sets at the same time
                url_manager.save_process(path.join('dist', 'new_urls.txt'), url_manager.new_urls)
                url_manager.save_process(path.join('dist', 'old_urls.txt'), url_manager.old_urls)
                return
        # Add the URLs received from result_solve_proc to the URL manager
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except BaseException as e:
            time.sleep(0.1)
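For context, url_q and conn_q are multiprocessing queues shared with the other control-node processes: new URLs flow out to the worker nodes through url_q, and URLs extracted by the result-handling process flow back through conn_q. The sketch below shows one plausible way to wire this process up; the NodeManager class name and the example root URL are assumptions for illustration only. In the full pattern the same queues would also be shared with a result-handling process and a data-store process.

from multiprocessing import Process, Queue

if __name__ == '__main__':
    node = NodeManager()          # assumed owner class of url_manager_proc above
    url_q = Queue()               # control node -> worker nodes: URLs to crawl
    conn_q = Queue()              # result process -> control node: extracted URLs
    url_manager = Process(target=node.url_manager_proc,
                          args=(url_q, conn_q, 'http://example.com/'))
    url_manager.start()
    url_manager.join()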
def __init__(self):
    self.url_store = None
    self.url_manager = UrlManager()
def __init__(self, pageLimit):
    self.urls = UrlManager()          # URL manager, responsible for handing out URLs
    self.downloader = Downloader()    # fetches page data for each URL
    self.parser = Parser()            # parses the downloaded pages
    self.pipeline = Pipeline()        # data cleaning, validation and storage
class NewsCrawler:
    def __init__(self):
        self.seed = ['', 'http://news.163.com/']  # NetEase news home page
        self.downloader = multiThreadDownloader.downloader()
        self.analyze = HtmlAnalyze()
        self.craw_url_man = UrlManager()
        self.page_url_man = UrlManager()
        self.conn = MySQLdb.connect(
            host='localhost',
            user='******',
            passwd='toor',
            db='newsGather',
            charset='utf8')
        self.cur = self.conn.cursor()
        # Add the URLs already stored in the database to the URL manager's old_urls
        self.cur.execute("select url from news_info;")
        results = self.cur.fetchall()
        exist_urls = [row[0] for row in results]
        self.page_url_man.add_old_urls(exist_urls)

    def get_news(self, website):
        # Process the new news URLs held by the page URL manager
        news = list()
        count = 0
        new_urls = self.page_url_man.get_new_urls(len(self.page_url_man.new_urls))
        print "Fetching news pages:"
        pages = self.downloader.download(new_urls, 6)
        print "Parsing news pages and saving news...."
        for page in pages:
            dic = self.analyze.Content(website, page['content'])
            if dic:
                dic['url'] = page['url']
                news.append(dic)
                try:
                    print 'save ', dic['url']
                    sql_raw = ("INSERT IGNORE INTO news_info (url, post_time, title, keywords, content, source, origin) "
                               "VALUES ('%s', '%s', '%s', '%s', '%s', '%s', \"%s\")" % (
                                   dic['url'], dic['post_time'], dic['title'], dic['keywords'],
                                   raw(dic['content']), dic['source'], dic['origin']))
                    self.cur.execute(sql_raw)
                    self.conn.commit()
                    count += 1
                except:
                    print "save error!"
        print 'News items crawled: %d' % count
        return news

    def craw(self, news_num, website, expand_patt, news_patt):
        # Crawl news starting from the seed page of the given website
        self.craw_url_man.add_new_url(self.seed[website])
        news = list()
        count = 0
        i = 0
        while self.craw_url_man.has_new_url():
            print "Expansion round %d:" % i
            # Expand at most 60 pages per round
            craw_num = len(self.craw_url_man.new_urls)
            if craw_num < 60:
                new_urls = self.craw_url_man.get_new_urls(craw_num)
            else:
                new_urls = self.craw_url_man.get_new_urls(60)
            pages = self.downloader.download(new_urls, 6)
            print "Parsing pages to expand....."
            for page in pages:
                craw_new_urls = self.analyze.getUrl(page['content'], expand_patt)
                self.craw_url_man.add_new_urls(craw_new_urls)
                page_new_urls = self.analyze.getUrl(page['content'], news_patt)
                self.page_url_man.add_new_urls(page_new_urls)
            count = len(self.page_url_man.new_urls)
            if count > news_num:
                news += self.get_news(website)
                break
            else:
                i = i + 1
                news += self.get_news(website)
                continue
        return news
def __init__(self):
    self.urls = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = HtmlOutputer()
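None of the examples in this section include the UrlManager itself. The sketch below is a minimal, set-based implementation consistent with the calls made here (add_new_url, add_new_urls, add_old_urls, has_new_url, get_new_url, old_url_size); individual projects add extras such as get_new_urls(n) or persistence via save_process, so treat this as an illustration of the common pattern rather than any one project's actual class.

class UrlManager(object):
    # Minimal sketch: new_urls holds URLs still to be crawled,
    # old_urls holds URLs that have already been seen.
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        # Queue a single URL unless it has already been queued or crawled
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def add_old_urls(self, urls):
        # Mark URLs (e.g. ones already stored in a database) as crawled
        self.old_urls.update(urls)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def old_url_size(self):
        return len(self.old_urls)

    def get_new_url(self):
        # Pop one URL to crawl and remember it as already seen
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url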
# Crawl pages from the URL list
def craw(root_url):
    urlManager.add_new_url(root_url)
    while urlManager.has_new_url():
        try:
            new_url = urlManager.get_new_url()
            html_cont = downloader.download(new_url)
            new_data = parser.parse(new_url, html_cont)
            printer.collect_data(new_data)
        except:
            print "crawl failed!"
    printer.output_sql()


# Program entry point
if __name__ == "__main__":
    print "Welcome to EMM-Mall-ArknightDataSpider."
    # Instantiate each module
    urlManager = UrlManager()
    downloader = Downloader()
    parser = ContentParser()
    printer = ResultPrinter()
    # Create the main entry object and start crawling
    spider_main = SpiderMain()
    change_working_dir()
    # craw(raw_input("Enter Root Url : "))
    craw("http://prts.wiki/w/Lancet-2")
    print("Everything is done. Result is in result.sql")
class SpiderMain():
    def __init__(self):
        self.urlDownLoader = HtmlDownloader()
        self.htmlParser = HtmlParser()
        self.urlManager = UrlManager()
        self.jsondataParser = JsonData_Parser()
        self.htmlOutPuter = HtmlOutPuter()

    def _get_from_discover_toplist(self, url):
        urls = self.htmlParser.parse(
            htmlContent=self.urlDownLoader.download(url),
            type='discover_toplist')
        self.urlManager.add_new_urls(urls)

    def _get_from_discover_artist(self, url):
        urls = self.htmlParser.parse(
            htmlContent=self.urlDownLoader.download(url),
            type='discover_artist')
        self.urlManager.add_new_urls(urls)

    def _get_from_artist(self, url):
        results = self.htmlParser.parse(
            htmlContent=self.urlDownLoader.download(url),
            type='artist')
        for name, urls in results.items():
            print(name)
            self.urlManager.add_new_urls(urls)

    def _get_from_song(self, url):
        tmp = {}
        name = self.htmlParser.parse(
            htmlContent=self.urlDownLoader.download(url),
            type='song')
        print("Collecting: " + name)
        comments = self.jsondataParser.parse(
            self.urlDownLoader.downloadJsonData(url))
        tmp[name] = comments
        self.htmlOutPuter.collect_datas(tmp)

    def _parse_url(self, url):
        # Map a URL to the suffix of the handler method to dispatch to
        res = ''
        SONG = 'song'
        DISCOVER = 'discover'
        ARTIST = 'artist'
        TOPLIST = 'toplist'
        if url.find(DISCOVER) != -1:
            res += DISCOVER
        if url.find(ARTIST) != -1:
            if res != '':
                res += '_' + ARTIST
            else:
                res += ARTIST
        if url.find(TOPLIST) != -1:
            if res != '':
                res += '_' + TOPLIST
            else:
                res += TOPLIST
        if url.find(SONG) != -1:
            res += SONG
        return res

    def craw(self, rootUrl, direction=""):
        # NetEase URLs carry a '#/' fragment; strip it before crawling
        if rootUrl.find('#') != -1:
            pos = rootUrl.find('#')
            rootUrl = rootUrl[:pos] + rootUrl[pos + 2:]
        self.urlManager.add_new_url(rootUrl)
        while self.urlManager.has_new_url():
            url = self.urlManager.get_url()
            methodName = '_get_from_' + self._parse_url(url)
            method = getattr(self, methodName, None)
            if method is not None:
                method(url)
        self.htmlOutPuter.output_html(direction=direction)
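A minimal usage sketch for the crawler above; the toplist URL and the direction value are illustrative assumptions rather than values from the original project. Note how craw() strips the '#' from the URL before _parse_url() dispatches it to _get_from_discover_toplist().

# Hypothetical driver: crawl the NetEase Music toplist page and write the
# collected comments out through HtmlOutPuter.
if __name__ == '__main__':
    spider = SpiderMain()
    spider.craw('https://music.163.com/#/discover/toplist', direction='output')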
import random
import time


class spider_main():
    def __init__(self):
        self.urls = UrlManager()
        self.parser = HtmlParser()
        self.downloader = UrlDownloader()
        self.log = MyLog("spider", "logs")
        self.output = HtmlOutPut()

    # Entry point of the main module: start the crawler
    def Crawling(self, root_url):
        # Map each area name to its number of listing pages
        areas = {
            "gulou": 100, "jianye": 100, "qinhuai": 100, "xuanwu": 100,
            "yuhuatai": 100, "qixia": 100, "baijiahu": 91, "jiangningqita11": 19,
            "chalukou1": 71, "dongshanzhen": 98, "jiangningdaxuecheng": 74,
            "jiulonghu": 49, "jiangjundadao11": 51, "kexueyuan": 28,
            "pukou": 100, "liuhe": 100, "lishui": 100, "gaochun11": 2,
            "jiangning": 100, "qilinzhen": 100, "tangshanzhen": 19,
            "fenghuangxijie1": 40, "xianlin": 72, "yaohuamen": 32,
            "maigaoqiao1": 50, "maqun1": 44, "qixiaqita1": 19,
            "xiaozhuang": 18, "yanziji": 8, "yueyuan": 18, "wanshou1": 15,
            "hongshan1": 25, "caochangmendajie": 10, "dinghuaimendajie": 17,
            "fujianlu": 12, "hanzhongmendajie": 10, "huxijie": 11,
            "jiangdong2": 3, "jiangxinzhou": 6, "nanhu4": 21, "nanyuan2": 17,
            "shuiximen1": 7, "wandaguangchang1": 28, "xiaoxing": 10,
            "yuhuaxincun": 16, "jiangningbinjiang": 16, "lukou": 70,
            "dingshanjiedao": 22, "gaoxinqu2": 44, "jiangpujiedao": 100,
            "pukouqita11": 36, "qiaobei": 100, "taishanjiedao": 32,
            "dachang12": 69, "liuheqita1": 14, "longchi": 26,
            "luhezhucheng": 21, "nanmenxincheng": 26, "wutangguangchang": 17,
        }
        # Build every page URL by concatenation and queue the listing URLs found on each page
        for area, num in areas.items():
            for n in range(1, num + 1):
                # Joined URL, e.g. root_url = https://nj.lianjia.com/ershoufang/
                splice_url = root_url + area + "/pg" + str(n) + "/"
                # Log the joined URL and echo it to the console
                self.log.logger.info("joined url: " + splice_url)
                print("joined url: " + splice_url)
                # Download the page once the URL is built
                try:
                    html_down = self.downloader.download(splice_url)
                except Exception as e:
                    # Log the error and back off (30 minutes)
                    self.log.logger.error("html download error: " + repr(e))
                    time.sleep(60 * 30)
                else:
                    # If the download succeeded, parse the page
                    try:
                        secondhome_urls = self.parser.get_secondhandhome_urls(html_down)
                    except Exception as e:
                        self.log.logger.error("html parse error: " + repr(e))
                    else:
                        # Parse succeeded, queue the listing URLs
                        self.urls.add_new_urls(secondhome_urls)
                        time.sleep(random.randint(0, 3))
        time.sleep(60 * 10)

        # Parse the individual listing pages and extract the data set we need
        id = 1  # starting index
        stop = 1
        while self.urls.isEmpty_new_urls():  # loops while the manager still has new URLs
            # Take a URL from the manager
            try:
                temp_url = self.urls.get_new_url()
                print("page url: " + temp_url)
                self.log.logger.info("page url: " + temp_url)
            except Exception as e:
                print("failed to get page url: " + repr(e))
                self.log.logger.error("url fetch error: " + repr(e))
            # URL obtained, download the page
            try:
                temp_data = self.downloader.download(temp_url)
            except Exception as e:
                print("page download failed: " + temp_url)
                self.log.logger.error("page download failed: " + repr(e))
                # Put the URL back and retry later
                self.urls.add_new_url(temp_url)
                time.sleep(60)
            else:
                # Download succeeded, parse the page
                try:
                    temp_parser = self.parser.get_secondhandhome_data(temp_data, id)
                except Exception as e:
                    self.log.logger.error("html parse error: " + repr(e))
                    print("html parse error: " + repr(e))
                else:
                    # Parse succeeded, write the record out
                    try:
                        self.output.write_data(temp_parser)
                    except Exception as e:
                        self.log.logger.error("data write error: " + repr(e))
                        print("data write error: " + repr(e))
                    else:
                        print(id)
                        id = id + 1
                        stop = stop + 1
                        time.sleep(1)
                        # Pause for a minute after every 2500 pages
                        if stop == 2500:
                            stop = 1
                            time.sleep(60)
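The example above constructs MyLog("spider", "logs") and then logs through its .logger attribute. A minimal sketch compatible with that usage is shown below; the log file name, format, and level are assumptions, not the original implementation.

import logging
import os


class MyLog(object):
    # Minimal sketch of the logger helper used above: MyLog("spider", "logs")
    # is assumed to create logs/spider.log and expose a standard
    # logging.Logger via self.logger.  Details of the real class may differ.
    def __init__(self, name, log_dir):
        os.makedirs(log_dir, exist_ok=True)
        self.logger = logging.getLogger(name)
        self.logger.setLevel(logging.INFO)
        handler = logging.FileHandler(os.path.join(log_dir, name + ".log"), encoding="utf-8")
        handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
        self.logger.addHandler(handler)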