class Scheduler:
    def __init__(self, path, root_url, count):
        # Initialize each component
        self.url_manager = URLManager()
        self.data_save = DataSave(path)
        self.html_parser = HtmlParse()
        self.downloader = HtmlDownloader()
        self.root_url = root_url
        self.count = count

    def run_spider(self):
        # Seed the uncrawled-URL set with the root URL
        self.url_manager.save_new_url(self.root_url)
        # Keep crawling while there are still uncrawled URLs and the target article count has not been reached
        while self.url_manager.get_new_url_num() and self.url_manager.get_old_url_num() < self.count:
            try:
                # Take one uncrawled URL
                url = self.url_manager.get_new_url()
                # Download the page
                response = self.downloader.download(url)
                # Parse the page, returning new URLs and the article data
                new_urls, data = self.html_parser.parse_data(url, response)
                # Add the newly found URLs to the uncrawled set
                self.url_manager.save_new_urls(new_urls)
                # Save the data to a local file
                self.data_save.save(data)
                print("Already crawled {0} articles".format(len(self.url_manager.old_urls)))
            except Exception as e:
                print("Crawling of this article stopped: {0}".format(e))

class Scheduler:
    def __init__(self, path, base_url, count):
        self.url_manager = URLManager()
        self.data_save = DataSave(path)
        self.html_parser = HtmlParse()
        self.downloader = HtmlDownloader()
        self.root_url = base_url
        self.count = count

    def run_spider(self):
        self.url_manager.save_new_url(self.root_url)
        while self.url_manager.get_new_url_num() and self.url_manager.get_old_url_num() < self.count:
            try:
                url = self.url_manager.get_new_url()
                response = self.downloader.download(url)
                new_urls, data = self.html_parser.parse_data(response, url)
                self.url_manager.save_new_urls(new_urls)
                self.data_save.save(data)
                print("Already spider {} pages".format(len(self.url_manager.old_urls)))
            except Exception as e:
                print("Error", e)

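Both Scheduler variants above lean on a URLManager that is not shown here. The following is a minimal sketch of such a manager, assuming only the interface the schedulers actually call (save_new_url, save_new_urls, get_new_url, get_new_url_num, get_old_url_num) and the old_urls attribute they read; the real project's implementation may differ.

class URLManager:
    """Minimal sketch of the URL manager interface used by Scheduler above (assumed, not the original implementation)."""

    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs that have already been crawled

    def save_new_url(self, url):
        # Only queue a URL that has not been seen before
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def save_new_urls(self, urls):
        for url in urls:
            self.save_new_url(url)

    def get_new_url(self):
        # Move one URL from the pending set to the crawled set and return it
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def get_new_url_num(self):
        return len(self.new_urls)

    def get_old_url_num(self):
        return len(self.old_urls)
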
if __name__ == "__main__":
    import sys
    print(sys.argv)

    # Target site comes from the command line, defaulting to vatgia.com
    rootURL = ''
    if len(sys.argv) > 1:
        rootURL = sys.argv[1]
    else:
        rootURL = 'vatgia.com'
    print('Crawl', rootURL)

    from urlmanager import URLManager
    from urlrequest import URLRequest
    from storage import MySQLDBStorage
    from htmlprocessor import VGHMTLProcessor

    storage = MySQLDBStorage('localhost', 'root', 'adminpass', 'nc_vgdb')
    urlManager = URLManager(storage)
    processor = VGHMTLProcessor(storage)

    # Seed the queue with a few entry points on the target site
    urlManager.addURL('http://{0}'.format(rootURL))
    urlManager.addURL('http://{0}/home'.format(rootURL))
    urlManager.addURL('http://{0}/home/'.format(rootURL))

    urlRequest = URLRequest('testlink.txt')
    # Keep fetching until the URL manager runs out of URLs
    while 1:
        nextURL = urlManager.nextURL()
        if not nextURL:
            break
        urlRequest.execute(nextURL, urlManager, processor)

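The script above wires together URLManager, URLRequest, MySQLDBStorage, and VGHMTLProcessor without showing their code. The sketch below illustrates one plausible shape of the execute() step; only the constructor argument and the execute(nextURL, urlManager, processor) signature come from the original, and the processor.process(url, html) call is a hypothetical interface used here for illustration.

import urllib.request


class URLRequest:
    """Hypothetical sketch of URLRequest; the real project's class may differ."""

    def __init__(self, log_path):
        self.log_path = log_path

    def execute(self, url, url_manager, processor):
        # Fetch the page, log the URL, and hand the HTML to the processor,
        # which is assumed to store the page and return newly discovered links.
        try:
            html = urllib.request.urlopen(url, timeout=10).read()
        except Exception as e:
            print("Failed to fetch", url, e)
            return
        with open(self.log_path, "a") as log:
            log.write(url + "\n")
        for link in processor.process(url, html):   # assumed processor interface
            url_manager.addURL(link)
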
from peewee import *
from urlmanager import URLManager
from time import sleep

urlman = URLManager()
db = MySQLDatabase("dev_bookmarks", user="******", passwd="hkGVUX26w8ivEP")


class Bookmarks(Model):
    id = IntegerField()
    url = CharField()
    domain = CharField()
    title = CharField()

    class Meta:
        database = db


# Backfill missing domains and re-fetch titles that were blocked or rate-limited
for bk in Bookmarks.select():
    updated = False
    if bk.domain is None:
        bk.domain = urlman.get_domain(bk.url)
        updated = True
    if bk.title is None or "CloudFlare" in bk.title or bk.title == "Too Many Requests":
        bk.title = urlman.get_title(bk.url)
        updated = True
    if updated:
        print("Updating " + bk.url + "...")
        bk.save()
        sleep(3)

from BeautifulSoup import BeautifulSoup
import urllib

from urlmanager import URLManager

url = 'http://www.aflcio.org/Legislation-and-Politics/Legislative-Alerts'

umgr = URLManager()
print(umgr.get_title(url))
print(umgr.get_domain(url))

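The last two snippets call umgr.get_domain(url) and umgr.get_title(url), but the URLManager behind them is not shown. Below is a minimal sketch of those two helpers, written for Python 3 with urllib.parse and BeautifulSoup 4 rather than the legacy BeautifulSoup import above; treat it as an illustration of the idea, not the original code.

from urllib.parse import urlparse
from urllib.request import urlopen
from bs4 import BeautifulSoup


class URLManager:
    """Sketch of the get_domain/get_title helpers used above (assumed implementation)."""

    def get_domain(self, url):
        # Extract the host part of the URL, e.g. 'www.aflcio.org'
        return urlparse(url).netloc

    def get_title(self, url):
        # Fetch the page and return the text of its <title> tag, if any
        try:
            html = urlopen(url, timeout=10).read()
        except Exception:
            return None
        soup = BeautifulSoup(html, "html.parser")
        if soup.title and soup.title.string:
            return soup.title.string.strip()
        return None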