def __init__(self):
    """Wire up the crawler's collaborators: URL manager, downloader, parser, arranger."""
    self.UM = UrlManager.UrlManager()            # frontier of URLs to crawl
    self.HD = HtmlDownloader.HtmlDownloader()    # fetches raw pages
    self.HP = HtmlParser.HtmlParser()            # extracts data/links
    self.DA = DataArranger.DataArranger()        # arranges parsed results
def __init__(self):
    """Initialize the crawl components and launch a Chrome WebDriver session."""
    self.UM = UrlManager.UrlManager()
    self.HP = HtmlParser.HtmlParser()
    # NOTE(review): class is spelled `DataArrange` (no trailing 'r') here,
    # unlike `DataArranger` elsewhere — assumed intentional; verify module API.
    self.DA = DataArranger.DataArrange()
    # Starting the browser last, after the cheap components are in place.
    self.driver = webdriver.Chrome()
def __init__(self, Response):
    """Build and store an HtmlParser for *Response*.

    Args:
        Response: downloaded response object; its ``get_url()`` is used in
            the error message when parser construction fails.

    Raises:
        ValueError: if no parser can be constructed for the response.
    """
    try:
        parser = HtmlParser.HtmlParser(Response)
    except Exception as err:
        # BUG FIX: the original did `raise msg` with a plain string, which
        # itself raises TypeError in Python 3 (string exceptions are not
        # allowed). Raise a real exception and chain the original cause.
        # Also narrowed the bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        raise ValueError('There is no parser for "%s".' % Response.get_url()) from err
    self._parser = parser
def __init__(self):
    """Assemble the crawl pipeline: URL manager, downloader, parser, outputer."""
    self.UrlsManager = UrlManager.UrlManager()        # pending/seen URLs
    self.Downloader = HtmlDownloader.HtmlDownloader() # page fetcher
    self.Parser = HtmlParser.HtmlParser()             # content extractor
    self.Outputer = HtmlOutputer.HtmlOutputer()       # result writer
    # Maximum crawl depth, measured in pages.
    self.maxPageDeep = 1
def main():
    """Fetch and parse a fixed list of Baidu Scholar profile pages.

    Each scholar URL is downloaded and parsed, and the parsed data is
    written to ``scholar<i>out_dict.txt`` (i is the URL's position).
    """
    name = "scholar"
    url_list = [
        'http://xueshu.baidu.com/scholarID/CN-BT73WSNJ',
        'http://xueshu.baidu.com/scholarID/CN-B3742FWJ',
        'http://xueshu.baidu.com/scholarID/CN-B0746Q8J',
        'http://xueshu.baidu.com/scholarID/CN-B97472MJ',
        'http://xueshu.baidu.com/scholarID/CN-BN733MNJ',
    ]
    # Reuse one downloader/parser for every URL (the originals were created
    # once outside the loop; the per-iteration copies were dead code).
    download = HtmlDownloader()
    parser = HtmlParser()
    # enumerate replaces the original manual `i += 1` counter; the output
    # filenames are unchanged (index starts at 0).
    for i, url in enumerate(url_list):
        html_cont = download.download(url)
        data = parser.parser(url, html_cont, url_list)
        output_dict(name + str(i) + "out_dict.txt", data)
    return
def __init__(self):
    """Connect this worker node to the distributed crawl master.

    Registers the queue accessors, connects to the master process at
    127.0.0.1:8001, obtains proxies for the task/result queues, and
    creates this worker's downloader and parser.
    """
    # Step 1: register the method names used to fetch the shared queues.
    BaseManager.register('get_task_queue')
    BaseManager.register('get_result_queue')
    # Step 2: connect to the server. Port and authkey must match the
    # master process exactly.
    server_addr = '127.0.0.1'
    print('Connect to server %s...' % server_addr)
    # BUG FIX: BaseManager requires a *bytes* authkey on Python 3; the
    # original passed the str 'baike', which raises TypeError. The sibling
    # worker in this project already uses 'baike'.encode('utf-8').
    self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
    self.m.connect()
    # Step 3: obtain proxy objects for the shared queues.
    self.task = self.m.get_task_queue()
    self.result = self.m.get_result_queue()
    # Per-worker downloader and parser.
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    print('init finish.')
def __init__(self):
    """Hook this worker node up to the master's task and result queues."""
    # Register accessors for the shared queues before connecting.
    for accessor in ('get_task_queue', 'get_result_queue'):
        BaseManager.register(accessor)
    server_addr = '127.0.0.1'
    print('Connect to server %s...' % server_addr)
    # Port and authkey must match the master process exactly.
    self.m = BaseManager(address=(server_addr, 8001),
                         authkey='baike'.encode('utf-8'))
    self.m.connect()
    # Proxy objects for the shared queues.
    self.task = self.m.get_task_queue()
    self.result = self.m.get_result_queue()
    # Per-worker downloader and parser.
    self.downloader = HtmlDownloader.HtmlDownloader()
    self.parser = HtmlParser.HtmlParser()
    print('init finish')
def __init__(self):
    """Create the four collaborators the spider delegates to."""
    self.urls, self.downloader, self.parser, self.outputer = (
        UrlManager.UrlManager(),
        HtmlDownloader.HtmlDownloader(),
        HtmlParser.HtmlParser(),
        HtmlOutputer.HtmlOutputer(),
    )
def __init__(self):
    """Initialize the parser, downloader, and file outputer."""
    self.parser = HtmlParser.HtmlParser()        # content extractor
    self.downloader = downloader.Downloader()    # page fetcher
    self.outputer = FileOutputer.FileOutputer()  # result writer
def __init__(self):
    """Build the pipeline from the project's abbreviated module aliases."""
    self.UM, self.HD = UM.UrlManager(), HD.HtmlDownloader()
    self.HP, self.DA = HP.HtmlParser(), DA.DataArrange()
def __init__(self):
    """Wire the spider's URL manager, downloader, parser, and data output."""
    self.manager, self.downloader, self.parser, self.output = (
        UrlManager.UrlManager(),
        HtmlDownloader.HtmlDownloader(),
        HtmlParser.HtmlParser(),
        DataOutput.DataOutput(),
    )
def __init__(self):
    """Set up the URL manager, downloader, parser, and file outputer."""
    self.urlManager = UrlManager.UrlManager()            # crawl frontier
    self.htmlDownloader = HtmlDownloader.HtmlDownloader()
    self.htmlParser = HtmlParser.HtmlParser()
    self.fileOutputer = FileOutputer.FileOutputer()      # persists results
# coding=utf-8 import IOUtils import HtmlDownloader import HtmlParser import HtmlOutputer import bd L=[] hd = HtmlDownloader.HtmlDownloader() hp = HtmlParser.HtmlParser() io =IOUtils.IOUtils() ho = HtmlOutputer.HtmlOutputer() datas = io.getListOrDictFromJsonFile("F:\\ajk\\info.json") print len(datas) L_res=[] for data in datas: name = data[u'小区名称'] print name #下面这一步是为了url编码 name = name.encode('utf-8') url = data[u'网站'] d = bd.getPos(name) if d is None: data[u'精度']=u"未找到" data[u'纬度']=u"未找到" data[u'附件500米幼儿园']=u"未找到" data[u'附近3000米幼儿园']=u"未找到" data[u'附近500米医院']=u"未找到" data[u'附近3000米医院']=u"未找到" data[u'附近500米商场']=u"未找到"
def __init__(self):
    """Create the downloader, parser, and output sink."""
    self.output = DataOutput()          # result writer
    self.parser = HtmlParser()          # content extractor
    self.downloader = HtmlDownloader()  # page fetcher
def __init__(self):
    """Initialize the URL manager, downloader, and parser."""
    self.manager, self.downloader, self.parser = (
        UrlManager.UrlManager(),
        HtmlDownloader.HtmlDownloader(),
        HtmlParser.HtmlParser(),
    )
def __init__(self):
    """Initialize the base class, then build the crawl components."""
    # Explicit two-argument super kept for Python 2 compatibility (other
    # snippets in this project are Python 2).
    super(SpiderMain, self).__init__()
    self.urls = UrlManager.UrlManager()              # crawl frontier
    self.downloader = HtmlDownloader.HtmlDownloader()
    self.parser = HtmlParser.HtmlParser()
    self.outputer = HtmlOutputer.HtmlOutputer()      # result writer