import csv

# UrlManager, HtmlParser, and DataOutput are project-local modules.


class Spider(object):
    def __init__(self):
        self.manage = UrlManager()

    def spider(self, url, param):
        page_num = HtmlParser.get_page_num(url)
        print('page_num:', page_num)
        # Write the CSV header once before crawling.
        with open('./name.csv', 'a') as csvfile:
            fieldnames = ['title', 'url', 'down']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
        # Queue the detail URLs from every listing page.
        for i in range(1, page_num + 1):
            page_url = url + param + str(i)
            print(page_url)
            new_urls = HtmlParser.get_page_urls(page_url)
            self.manage.add_new_urls(new_urls)
        while self.manage.has_new_url():
            try:
                new_url = self.manage.get_new_url()
                data = HtmlParser.get_data(new_url)
                DataOutput.write_data(data)
                print(data)
            except Exception as e:
                print('Crawl failed! error:', e)
            print('Crawled {} records so far'.format(self.manage.old_urls_size()))
import random
import time


class SpiderMain(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = FileDownLoader()
        self.parser = FileParser()
        self.output = DataOutput()

    def crawl(self, root_files):
        for root_file in root_files:
            new_urls = self.parser.parser(root_file)
            self.manager.add_new_urls(new_urls)
            while self.manager.has_new_url():
                try:
                    new_url = self.manager.get_new_url()
                    data = self.downloader.download(new_url)
                    self.output.store_data(data, root_file, new_url)
                    print("Crawled %s links" % self.manager.old_url_size())
                    # Sleep 1-3 seconds between requests to stay polite.
                    interval = random.randint(1, 3)
                    time.sleep(interval)
                    print("sleep: %d" % interval)
                except Exception as err:
                    self.output.mark_result(root_file, new_url, False)
                    print("crawl failed: " + str(err))
def __init__(self):
    self.manager = UrlManager()
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
class Main(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HttpDownloader()
        self.parser = ContentParser()

    def process_solo(self, name_id, fold):
        # next_page: //tieba.baidu.com/f?kw=%E5%A5%B3%E4%BA%BA&ie=utf-8&pn=50
        solo_ba = 'https:'
        url = solo_ba + name_id
        response = self.downloader.download(url)
        p_lists, next_page, current_page = self.parser.parser_solo_ba(response)
        print('Page', current_page, 'posts,', 'next_page:', next_page, p_lists)
        for p in p_lists:
            # process_p takes a single page argument in this variant.
            self.process_p(p)
        if next_page is not None:
            self.process_solo(next_page, fold)
        else:
            print('HAPPY! program finished!')

    def process_p(self, page):
        # e.g. http://scxx.whfcj.gov.cn/xmqk.asp?page=1
        base_url = 'http://scxx.whfcj.gov.cn/xmqk.asp?page='
        url = base_url + str(page)
        self.manager.save_url(url)
        while self.manager.has_url():
            next_url = self.manager.next_url()
            print('detail_p :', next_url)
            response = self.downloader.download(next_url)
            self.parser.parser_detail_p(response)
class MainSearch(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HttpDownloader()
        self.parser = ContentParser()

    def process_page(self, name_id):
        solo_ba = 'http://weixin.sogou.com/weixin'
        url = solo_ba + name_id
        response = self.downloader.download(url)
        result = self.parser.parser_solo_ba(response)
        if result is None:
            return
        p_lists, next_page, current_page = result
        print('Page:', current_page, ', ListSize:', len(p_lists))
        if len(p_lists) > 0:
            for p in p_lists:
                self.process_article(p)
        if next_page is not None:
            self.process_page(next_page)

    def process_article(self, url):
        self.manager.save_url(url)
        while self.manager.has_url():
            next_url = self.manager.next_url()
            print('detail_article :', next_url)
            response = self.downloader.download(next_url)
            self.parser.parser_detail_p(response)
def main():
    idi = 1405150114
    urlmanager = UrlManager()
    pageurl = urlmanager.url_login(idi)
    infourl = urlmanager.url_userinfo(idi)
    htmldownloader = HtmlDownloader()
    htmldownloader.download(
        'http://ygcp.njtech.edu.cn/User/LoginInSampleAction.aspx',
        idi, pageurl, infourl)
def __init__(self):
    # Create a URL manager.
    self.urlManager = UrlManager()
    # Create an HTML downloader.
    self.downloader = htmlDownloader()
    # Create an HTML parser.
    self.htmlparser = htmlParser()
    # Create an HTML storage component.
    self.htmlSave = htmlSave()
def __init__(self, sort, sort_url, sortFilename):
    threading.Thread.__init__(self)
    self.sort = sort
    self.sort_url = sort_url
    self.sortFilename = sortFilename
    self.manager = UrlManager(self.sort)
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
def __init__(self, bind_domain):
    # URL manager that records the URLs already crawled;
    # links outside bind_domain are disabled.
    self.urlManager = UrlManager(enable_external_link=False,
                                 bind_domain=bind_domain)
    # Downloader that issues the HTTP requests.
    self.downloader = HtmlDownloader()
    # Parser that turns the HTML source into an lxml.html object
    # and extracts new links.
    self.parser = HtmlParser()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Seed the URL manager with the entry URL.
        self.manager.add_new_url(root_url)
        # Crawl while there are new URLs and fewer than 100 have been fetched.
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager.
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader.
                html = self.downloader.download(new_url)
                # Extract the page data and new links with the HTML parser.
                new_urls, data = self.parser.parser(new_url, html)
                # Feed the extracted links back into the URL manager.
                self.manager.add_new_urls(new_urls)
                # Store the extracted data.
                self.output.store_data(data)
                print("Crawled %s links" % self.manager.old_url_size())
            except Exception as e:
                print(e)
                print("Crawl failed")
        # Write the collected data out in the target format.
        self.output.output_html()
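# None of these schedulers define UrlManager itself. The sketch below is a
# minimal guess at the interface they rely on (add_new_url, add_new_urls,
# has_new_url, get_new_url, old_url_size), backed by two sets for
# deduplication; each project's real implementation may add persistence,
# domain filters, or differently named methods.
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already handed out

    def add_new_url(self, url):
        # Skip empty, already-queued, and already-visited URLs.
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        # Move one URL from the pending set to the visited set.
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def old_url_size(self):
        return len(self.old_urls)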
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Seed the URL manager with the entry URL.
        self.manager.add_new_url(root_url)
        # Crawl while there are new URLs and fewer than 100 have been fetched.
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new link from the URL manager.
                new_url = self.manager.get_new_url()
                print('1-------->new_url', new_url)
                # Download the page.
                html = self.downloader.download(new_url)
                print('2-------->html')
                # Parse and extract the page.
                new_urls, data = self.parser.parser(new_url, html)
                print('3-------->new_urls, data', new_urls, data)
                # Feed the extracted URLs back into the manager.
                self.manager.add_new_urls(new_urls)
                print('4-------->new_urls', new_urls)
                # Store the extracted data.
                self.output.store_data(data)
                print('Crawled %d links' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed %s' % e)
        # Write the collected data out in the target format.
        self.output.output_html()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def start(self, url, numMax=50):
        self.manager.addUrl(url)
        num = 0
        errorsNum = 0
        while self.manager.sizeofNew() != 0 and num < numMax:
            try:
                num += 1
                url = self.manager.getUrl()
                print('%d\n %s' % (num, url))
                html = self.downloader.download(url)
                newUrls, data = self.parser.parser(url, html)
                self.output.addData(data)
                # Only queue more URLs while still under the crawl budget.
                if self.manager.sizeofNew() + self.manager.sizeofOld() < numMax:
                    self.manager.addUrls(newUrls)
                print(data['title'])
            except Exception:
                num -= 1
                errorsNum += 1
                print('crawl failed %d' % errorsNum)
        self.output.outputData()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.download = HtmlDownload()
        self.parse = HtmlParse()
        self.output = DataOuput()

    def crawl(self, root_url):
        """
        Add the entry URL and drive the crawl.
        :param root_url: entry URL
        :return: None
        """
        self.manager.add_new_url(root_url)
        # Crawl while there are new URLs and fewer than 100 have been fetched.
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Get a new URL from the URL manager.
                new_url = self.manager.get_new_url()
                # Download the page.
                html = self.download.download(new_url)
                # Parse the page for data and new links.
                new_urls, data = self.parse.parse(new_url, html)
                # Feed the extracted URLs back into the URL manager.
                self.manager.add_new_urls(new_urls)
                # Store the data.
                self.output.store_data(data)
                self.output.ouput_html()
                print('Crawled %s links' % self.manager.old_url_size())
            except Exception as e:
                print('crawl fail', e)
def craw(self, root_url):
    # todo: make the crawl depth configurable?
    page_count = self.page_count
    UrlManager.add_new_url(root_url)
    try:
        while UrlManager.has_new_url():
            todo_url = UrlManager.get_new_url()
            try:
                print("\nNO: %d checking link: %s" % (page_count, todo_url))
            except Exception:
                print("print error", todo_url)
            # todo: would `continue` be better here?
            # page_status, bad_links = Downloader.test(todo_url)
            # print("\nresult: %s" % page_status)
            # todo: document the status codes
            state, content = Downloader.download(todo_url)
            if state:
                parse_state, new_urls = Parser.parse(todo_url, content)
                if parse_state:
                    UrlManager.add_new_urls(new_urls)
                else:
                    Outputer.collect_data(new_urls)
            else:
                Outputer.collect_data(content)
            page_count += 1
            # except:
            #     print("page crawl failed")
            #     UrlManager.add_wrong_url(todo_url)
            # todo: test code
            if page_count == 5000:
                self.page_count = page_count
                break
        print(UrlManager.num_new_url())
    finally:
        Outputer.output_txt(self.page_count)
class Main(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HttpDownloader()
        self.parser = ContentParser()
        self.num = 1

    def process_solo(self, name, fold):
        # next_page: //tieba.baidu.com/f?kw=%E5%A5%B3%E4%BA%BA&ie=utf-8&pn=50
        ba = 'https:'
        url = ba + name
        response = self.downloader.download(url)
        p_lists, next_page, current_page = self.parser.parser_solo_ba(response)
        print('Page', current_page, 'posts,', 'next_page:', next_page, p_lists)
        for p in p_lists:
            self.process_p(p[0], p[1], fold)
        if next_page is not None:
            self.process_solo(next_page, fold)
        else:
            print('HAPPY! program finished!')

    def process_p(self, title, short_url, ba_name):
        # e.g. http://tieba.baidu.com/p/5287680253
        base_url = 'http://tieba.baidu.com'
        # Use != rather than `is not` to compare integers.
        if self.num != 1:
            url = base_url + short_url + "?pn=" + str(self.num)
        else:
            url = base_url + short_url
        self.manager.save_url(url)
        while self.manager.has_url():
            next_url = self.manager.next_url()
            print('detail_p :', next_url)
            response = self.downloader.download(next_url)
            new_img_urls, big_img_urls, total_num = self.parser.parser_detail_p(
                response)
            # Image host: http://imgsrc.baidu.com/forum/pic/item/
            self.downloader.load_imgs(big_img_urls, title, ba_name, True)
            self.num += 1
            if self.num > int(total_num):
                self.num = 1
                return
            else:
                self.process_p(title, short_url, ba_name)
class Spider(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def crawl(self, idi):
        rootloginurl = 'http://ygcp.njtech.edu.cn/User/LoginInSampleAction.aspx'
        pageurl = self.manager.url_login(idi)
        infourl = self.manager.url_userinfo(idi)
        htmlf, htmli = self.downloader.download(rootloginurl, idi, pageurl,
                                                infourl)
        xuehao, xingming, changpao, chenpao = self.parser.parser(
            infourl, pageurl, htmli, htmlf)
        print("Student ID: " + xuehao[0], "Name: " + xingming[0],
              changpao, chenpao)
import traceback


class SpiderMan(object):
    """Spider scheduler."""

    def __init__(self):
        self.urlManager = UrlManager()
        self.htmlDownloader = HtmlDownloader()
        self.htmlParser = HtmlParser()
        self.htmlOutput = DataOutput()

    def crawl(self, root_url):
        # Seed the URL manager with the entry URL.
        self.urlManager.add_new_url(root_url)
        # Crawl while there are new URLs and fewer than 100 have been fetched.
        while (self.urlManager.has_new_url()
               and self.urlManager.old_url_size() < 100):
            try:
                # Get a new URL from the URL manager.
                new_url = self.urlManager.get_new_url()
                # Download the page with the HTML downloader.
                html = self.htmlDownloader.download(new_url)
                # Extract the page data and new links with the HTML parser.
                new_urls, data = self.htmlParser.parser(new_url, html)
                # Feed the extracted URLs back into the URL manager.
                self.urlManager.add_new_urls(new_urls)
                # Store the extracted data.
                self.htmlOutput.store_data(data)
            except Exception:
                print(traceback.format_exc())
        # Write the collected data out in the target format.
        self.htmlOutput.output_html()
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url, dir, logFile):
        # Seed the URL manager with the entry URL.
        self.manager.add_new_url(root_url)
        # Crawl while the URL manager still has new URLs.
        # while self.manager.has_new_url() and self.manager.old_url_size() < 2:
        while self.manager.has_new_url():
            try:
                # Get a new URL from the URL manager.
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader.
                html = self.downloader.download(new_url)
                # Extract the page data and new links with the HTML parser.
                new_urls, data = self.parser.parser(new_url, html)
                # Feed the extracted URLs back into the URL manager.
                self.manager.add_new_urls(new_urls)
                # Store the extracted data.
                self.output.store_data(data)
                print("Crawled %s links" % self.manager.old_url_size())
            except Exception as e:
                print(e)
                print("crawl failed")
        # Write the collected data out in the target format.
        self.output.output_html(dir, logFile)
def __init__(self):
    self.G_STATE_OK = 200
    self.crawMaxNum = -1
    self.crawCountNum = 0
    self.urlManager = UrlManager()
    self.dispatch = Dispatch()
    self.htmlParser = HtmlParser("http://baike.baidu.com")
    self.applicationShow = ApplicationShow()
def __init__(self, url_argv):
    """
    Wire up the database interface, the initializer, the URL manager,
    the crawler, and the analyser.
    """
    sys.setrecursionlimit(10000000)
    self.db = DbManager.DbManager(db_config).mysql_connect()
    self.config = spider_config.spider_config()
    self.initialization = Initialization.Initialization(
        self.db, self.config, url_argv)
    self.manager = UrlManager.UrlManager(self.db, self.config)
    self.craw = UrlCraw.UrlCraw(self.db, self.config)
    self.analyse = UrlAnalyse.UrlAnalyse(self.db, self.config)
    self.sprint = SpiderPrint.SpiderPrint()
    self.initialize_spider()
class manage(object):
    def __init__(self):
        # Create a URL manager.
        self.urlManager = UrlManager()
        # Create an HTML downloader.
        self.downloader = htmlDownloader()
        # Create an HTML parser.
        self.htmlparser = htmlParser()
        # Create an HTML storage component.
        self.htmlSave = htmlSave()

    def action(self):
        # Give the URL manager a root URL.
        root_url = "https://baike.baidu.com/item/网络爬虫"
        self.urlManager.add_new_url(root_url)
        n = 0
        # Ask the URL manager whether there are URLs left to crawl.
        while self.urlManager.has_new_url() and n <= 100:
            n += 1
            # Take one uncrawled URL.
            new_url = self.urlManager.get_new_url()
            # Hand the URL to the downloader to fetch the HTML.
            htmlStr = self.downloader.download(new_url)
            # Hand the HTML string to the parser; it returns a tuple:
            # the first element is the set of URLs linked from this page,
            # the second is a dict of the page's data.
            urls, data = self.htmlparser.parser(new_url, htmlStr)
            # Hand the URLs to the URL manager.
            self.urlManager.add_new_urls(urls)
            # Hand the data to the storage component.
            self.htmlSave.saveData(data)
            print("Data from page %s" % n)
        self.htmlSave.output()
class Spider(object):
    def __init__(self):
        print('init')
        self.urlManager = UrlManager()
        self.downloader = Downloader()
        self.praser = HtmlPraser()
        self.outputer = Output()

    def craw(self, rootUrl):
        self.urlManager.addUrl(rootUrl)
        count = 1
        while self.urlManager.hasNewUrl():
            newUrl = self.urlManager.getNewUrl()
            print('Crawling url #', count, ', url:', newUrl)
            htmlContent = self.downloader.download(newUrl)
            newUrls, newData = self.praser.praser(newUrl, htmlContent)
            self.urlManager.addUrls(newUrls)
            self.outputer.collect(newData)
            if count == 10:
                break
            count += 1
        self.outputer.output()
import time


class Spider(object):
    def __init__(self):
        self.manage = UrlManager()
        self.output = DataOutput()
        self.parse = HtmlParser()

    def crawl(self):
        print(self.parse.page_num)
        for i in range(1, self.parse.page_num + 1):
            new_urls = self.parse.get_page_urls(i)
            print(new_urls)
            self.manage.add_new_urls(new_urls)
        while self.manage.has_new_url():
            new_url = ''
            try:
                new_url = self.manage.get_new_url()
                print(new_url)
                data = self.parse.get_data(new_url)
                print(data)
                self.output.save_mongo(data)
                time.sleep(1)
            except Exception as e:
                print('Crawl failed:', new_url, e)
            print('Crawled {} records so far'.format(self.output.data_size()))
import socket
import sys

# HOST, PORT, connectionlist, and handshake are module-level names
# defined elsewhere in this project.


def start_server(path):
    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    try:
        server.bind((HOST, PORT))
        server.listen(100)
        print('bind %s, ready to use' % PORT)
    except OSError:
        print('Server is already running, quit')
        sys.exit()
    i = 0
    while True:
        connection, address = server.accept()
        username = address[0]
        connectionlist['connection' + str(i)] = connection
        if handshake(connection):
            print('handshake success')
            try:
                manager = UrlManager(connection, r'http://www.meitulu.com',
                                     '', path, True)
                manager.start()
            except Exception:
                print('start new thread error')
                connection.close()
        i += 1
class SpiderMain(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("Crawled %s links" % self.manager.old_url_size())
            except Exception as e:
                print(e)
        self.output.output_html()
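# A typical entry point for a scheduler such as SpiderMain above. The seed
# URL is the Baidu Baike page used elsewhere in this collection; it is
# illustrative here, not required by the class.
if __name__ == '__main__':
    spider = SpiderMain()
    spider.crawl('https://baike.baidu.com/item/网络爬虫')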
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url():
            new_url = self.manager.get_new_url()
            html = self.downloader.download(new_url)
            new_urls, data = self.parser.parser(new_url, html)
            self.manager.add_new_urls(new_urls)
import re


class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def get_url_page(self, url):
        # Extract the page number from a "page=N" query parameter.
        pattern1 = re.compile(r'page=\d+')
        pattern2 = re.compile(r'\d+')
        rx = re.search(pattern1, url)
        rxx = re.search(pattern2, rx.group())
        return int(rxx.group())

    def crawl(self, root_url):
        # Seed the URL manager with the entry URL.
        self.manager.add_new_url(root_url)
        page_number = self.get_url_page(root_url)
        # Crawl while there are new URLs and fewer than 163 have been fetched.
        while self.manager.has_new_urls() and self.manager.old_url_size() < 163:
            try:
                new_url = self.manager.get_new_urls()
                print(new_url)
                html = self.downloader.download(new_url)
                page_number += 1
                print('page=%s' % page_number)
                new_urls, data = self.parser.parser(new_url, html, page_number)
                # print(new_urls)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("Crawled %s links" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed:", e)
                break
        print(self.output.datas)
        self.output.output_html()
import pandas as pd


class MenetSpider:
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.urlmanager = UrlManager()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, i):
        try:
            print(f"Process {i} is running")
            url = self.urlmanager.get_new_url(i)
            html = self.downloader.download(url)
            data = self.parser.parser(html)
            return data
        except Exception:
            print(f"crawl failed at {i}")
            # Return a one-row placeholder frame so downstream concatenation
            # keeps its shape; the row must be nested in a list to match the
            # nine column names.
            return pd.DataFrame(
                [[0, 0, 0, 0, 0, 0, 0, 0, 0]],
                columns=['编码', '药品名称', '生产企业', '批文文号',
                         '商品名', '剂型', '规格', '进口国产', '批准日期'])
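# MenetSpider.crawl prints "Process {i} is running" but its driver is not
# shown. The sketch below is one plausible wiring with multiprocessing.Pool;
# PAGES, run, and the output filename are assumptions, not project code.
from multiprocessing import Pool

import pandas as pd

PAGES = range(1, 11)  # assumed range of page indices


def run(i):
    # Build a fresh spider per task so nothing unpicklable crosses processes.
    return MenetSpider().crawl(i)


if __name__ == '__main__':
    with Pool(4) as pool:
        frames = pool.map(run, PAGES)
    pd.concat(frames, ignore_index=True).to_csv('menet.csv', index=False)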
def url_manager_process(self, task_queue):
    """
    URL manager process: reads pending records from the database and
    dispatches them to the worker nodes.
    :param task_queue: queue of tasks handed to the worker nodes
    :return: None
    """
    sql = ('SELECT id,bname FROM ' + TABLE_NAME +
           ' WHERE bdoubanlink IS NULL OR bdoubanlink=""')
    url_manager = UrlManager()
    db = MysqlHelper(DATABASE_NAME)
    while True:
        if not url_manager.has_new_url():
            datas = db.select(sql)
            if datas:
                for data in datas:
                    task_data = str(data[0]) + '$$' + data[1].strip()
                    url_manager.add_new_url(task_data)
                print('[√] Data has been read from the database!')
            else:
                print('[!] The database returned no rows.')
                exit(-1)
        # Stop condition: nothing left to crawl.
        if not url_manager.has_new_url():
            # Tell the worker nodes to stop.
            task_queue.put('end')
            print('[·] Controller sent the "end" command.')
            return
        # Keep the task queue topped up.
        while (task_queue.qsize() < _config.QUEUE_NUM
               and url_manager.has_new_url()):
            # Take a new URL from the URL manager and dispatch it.
            new_url = url_manager.get_new_url()
            task_queue.put(new_url)
            print('[+] >>> %s' % new_url)
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Seed the URL manager with the entry URL.
        self.manager.add_new_url(root_url)
        # Crawl while there are new URLs and fewer than 10000 have been fetched.
        while self.manager.has_new_url() and self.manager.old_url_size() < 10000:
            try:
                # Get a new URL from the URL manager.
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader.
                html = self.downloader.download(new_url)
                # Debug helper: dump each downloaded page to disk.
                # with open(r"%s.html" % self.manager.old_url_size(), 'wb') as f:
                #     f.write(html)
                #     f.flush()
                # Extract the page data and new links with the HTML parser.
                new_urls, data = self.parser.parser(new_url, html)
                # Feed the extracted URLs back into the URL manager.
                self.manager.add_new_urls(new_urls)
                # Store the extracted data.
                self.output.store_data(data)
                print("Crawled %s links" % self.manager.old_url_size())
            except Exception as e:
                print(e)
                print("crawl failed")
        # Write the collected data out in the target format.
        self.output.output_html()
def url_manager_proc(self, url_q):
    """
    URL manager process: passes the pending city URLs through url_q to the
    crawler nodes.
    :param url_q: channel through which the manager process sends URLs
                  to the crawler nodes
    :return: None
    """
    url_manager = UrlManager()
    while True:
        while url_manager.has_new_url():
            # Take a new URL from the URL manager...
            new_url = url_manager.get_new_url()
            # ...and send it to a crawler node.
            url_q.put(new_url)
        # Tell the crawler nodes to stop working.
        url_q.put('end')
        # Shut the manager node down, persisting the URL sets.
        url_manager.save_progress('new_city.txt', url_manager.new_urls)
        url_manager.save_progress('old_city.txt', url_manager.old_urls)
        return
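# A manager process like url_manager_proc above is normally wired to its
# crawler nodes through a shared queue. The sketch below guesses at that
# wiring; NodeManager and worker_proc are illustrative stand-ins, not the
# original project's code.
from multiprocessing import Process, Queue


def worker_proc(url_q):
    # Hypothetical worker loop: drain the queue until the 'end' sentinel.
    while True:
        url = url_q.get()
        if url == 'end':
            break
        print('worker got:', url)  # download/parse would happen here


if __name__ == '__main__':
    url_q = Queue()
    node = NodeManager()  # assumed owner of url_manager_proc
    manager = Process(target=node.url_manager_proc, args=(url_q,))
    worker = Process(target=worker_proc, args=(url_q,))
    manager.start()
    worker.start()
    manager.join()
    worker.join()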
class Main(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HttpDownloader()
        self.parser = ContentParser()

    def process_solo(self, name_id, fold):
        # next_page: //tieba.baidu.com/f?kw=%E5%A5%B3%E4%BA%BA&ie=utf-8&pn=50
        solo_ba = 'https:'
        url = solo_ba + name_id
        response = self.downloader.download(url)
        p_lists, next_page, current_page = self.parser.parser_solo_ba(response)
        print('Page', current_page, 'posts,', 'next_page:', next_page, p_lists)
        for p in p_lists:
            # process_p takes only the short URL (p[1]) in this variant.
            self.process_p(p[1])
        if next_page is not None:
            self.process_solo(next_page, fold)
        else:
            print('HAPPY! program finished!')

    def process_p(self, short_url):
        # e.g. https://zhuanlan.zhihu.com/p/26647066
        base_url = 'https://zhuanlan.zhihu.com'
        url = base_url + short_url
        self.manager.save_url(url)
        while self.manager.has_url():
            next_url = self.manager.next_url()
            print('detail_p :', next_url)
            response = self.downloader.download(next_url)
            print(response)
            title, img_urls, links = self.parser.parser_detail_p(response)
            if title is not None and img_urls is not None:
                self.downloader.load_imgs(title, img_urls)
            if links is not None:
                self.manager.save_urls(links)