Пример #1
0
 def __init__(self):
     """Create and attach the crawler's collaborators.

     DA arranges scraped data, HD downloads pages, HP parses HTML,
     and UM manages the URL frontier.
     """
     # Build each component first, then attach; construction order is
     # preserved from the original implementation.
     arranger = DataArranger.DataArranger()
     fetcher = HtmlDownloader.HtmlDownloader()
     parser = HtmlParser.HtmlParser()
     url_manager = UrlManager.UrlManager()
     self.DA = arranger
     self.HD = fetcher
     self.HP = parser
     self.UM = url_manager
Пример #2
0
 def __init__(self):
     """Initialise the crawler's helper objects and a Chrome WebDriver."""
     # Lightweight helpers first; the browser launch (a heavier side
     # effect) stays last, exactly as in the original order.
     url_manager = UrlManager.UrlManager()
     parser = HtmlParser.HtmlParser()
     arranger = DataArranger.DataArrange()
     self.UM = url_manager
     self.HP = parser
     self.DA = arranger
     self.driver = webdriver.Chrome()
Пример #3
0
    def __init__(self, Response):
        """Build and store an HTML parser for *Response*.

        Parameters:
            Response: a response object exposing ``get_url()``; handed
                straight to ``HtmlParser.HtmlParser``.

        Raises:
            ValueError: if no parser can be constructed for the response.
        """
        try:
            parser = HtmlParser.HtmlParser(Response)
        except Exception:
            # BUG FIX: the original did ``raise msg`` with a plain string,
            # which is itself a TypeError in Python 3 (exceptions must
            # derive from BaseException). Raise a real exception instead,
            # and catch Exception rather than using a bare ``except:``.
            msg = 'There is no parser for "%s".' % Response.get_url()
            raise ValueError(msg)

        self._parser = parser
Пример #4
0
 def __init__(self):
     """Assemble the crawler pipeline and set its crawl-depth limit."""
     # Maximum crawl depth, counted in pages.
     self.maxPageDeep = 1
     # Pipeline stages, constructed in the same order as before.
     urls = UrlManager.UrlManager()            # URL bookkeeping
     fetcher = HtmlDownloader.HtmlDownloader() # raw-HTML fetcher
     parser = HtmlParser.HtmlParser()          # HTML -> data/links
     writer = HtmlOutputer.HtmlOutputer()      # result output
     self.UrlsManager = urls
     self.Downloader = fetcher
     self.Parser = parser
     self.Outputer = writer
Пример #5
0
def main():
    """Crawl a fixed list of Baidu Xueshu scholar pages and dump each result.

    Each page is downloaded, parsed, and written out to a file named
    ``scholar<i>out_dict.txt``, where ``i`` is the page's index in the list.
    """
    name = "scholar"
    # The scholar pages to crawl (previously built with repeated .append calls).
    url_list = [
        'http://xueshu.baidu.com/scholarID/CN-BT73WSNJ',
        'http://xueshu.baidu.com/scholarID/CN-B3742FWJ',
        'http://xueshu.baidu.com/scholarID/CN-B0746Q8J',
        'http://xueshu.baidu.com/scholarID/CN-B97472MJ',
        'http://xueshu.baidu.com/scholarID/CN-BN733MNJ',
    ]

    # One downloader/parser pair is reused for every URL.
    download = HtmlDownloader()
    parser = HtmlParser()

    # enumerate replaces the original hand-maintained ``i`` counter; the
    # large blocks of commented-out experiment code have been removed.
    for i, url in enumerate(url_list):
        html_cont = download.download(url)
        # NOTE(review): parser.parser also receives the full url_list —
        # its use is defined inside HtmlParser; confirm before changing.
        data = parser.parser(url, html_cont, url_list)
        output_dict(name + str(i) + "out_dict.txt", data)
Пример #6
0
 def __init__(self):
     """Connect this worker node to the master's task/result queues.

     Registers the queue accessors on BaseManager, connects to the master
     process at 127.0.0.1:8001, fetches proxies for both queues, and builds
     the downloader/parser this worker will use.
     """
     # Step 1: register the names of the methods that expose the queues.
     BaseManager.register('get_task_queue')
     BaseManager.register('get_result_queue')
     # Step 2: connect to the server.
     server_addr = '127.0.0.1'
     print('Connect to server %s...' % server_addr)
     # Port and authkey must match the server process exactly.
     # BUG FIX: BaseManager requires a *byte string* authkey on Python 3;
     # the original passed the str 'baike', which raises a TypeError.
     # (The sibling worker implementation in this file already encodes it.)
     self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
     # Establish the network connection.
     self.m.connect()
     # Step 3: obtain proxies for the shared queues.
     self.task = self.m.get_task_queue()
     self.result = self.m.get_result_queue()
     # Per-worker downloader and parser.
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     print('init finish.')
Пример #7
0
 def __init__(self):
     """Hook this worker node up to the distributed master process."""
     # Register accessors for the two shared queues before connecting.
     BaseManager.register('get_task_queue')
     BaseManager.register('get_result_queue')
     server_addr = '127.0.0.1'
     print('Connect to server %s...' % server_addr)
     # Address, port and authkey must mirror the server's settings exactly.
     self.m = BaseManager(address=(server_addr, 8001), authkey='baike'.encode('utf-8'))
     self.m.connect()
     # Grab proxies for the task queue and the result queue.
     self.task = self.m.get_task_queue()
     self.result = self.m.get_result_queue()
     # This worker's own downloader and parser instances.
     self.downloader = HtmlDownloader.HtmlDownloader()
     self.parser = HtmlParser.HtmlParser()
     print('init finish')
Пример #8
0
 def __init__(self):
     """Wire up the four crawler components this spider drives."""
     # Constructed in the original order: URLs, downloader, parser, outputer.
     urls = UrlManager.UrlManager()
     fetcher = HtmlDownloader.HtmlDownloader()
     parser = HtmlParser.HtmlParser()
     writer = HtmlOutputer.HtmlOutputer()
     self.urls = urls
     self.downloader = fetcher
     self.parser = parser
     self.outputer = writer
Пример #9
0
 def __init__(self):
     """Build the downloader, the file outputer, and the HTML parser."""
     # PEP 8 spacing applied; attribute names and order unchanged.
     self.downloader = downloader.Downloader()
     self.outputer = FileOutputer.FileOutputer()
     self.parser = HtmlParser.HtmlParser()
Пример #10
0
 def __init__(self):
     """Instantiate the four abbreviated crawler components.

     UM: URL manager, HD: HTML downloader, HP: HTML parser,
     DA: data arranger.
     """
     url_manager = UM.UrlManager()
     fetcher = HD.HtmlDownloader()
     parser = HP.HtmlParser()
     arranger = DA.DataArrange()
     self.UM = url_manager
     self.HD = fetcher
     self.HP = parser
     self.DA = arranger
Пример #11
0
 def __init__(self):
     """Create the manager, downloader, parser, and output components."""
     # Same components, same order — attached via local names.
     url_manager = UrlManager.UrlManager()
     fetcher = HtmlDownloader.HtmlDownloader()
     parser = HtmlParser.HtmlParser()
     writer = DataOutput.DataOutput()
     self.manager = url_manager
     self.downloader = fetcher
     self.parser = parser
     self.output = writer
Пример #12
0
 def __init__(self):
     """Set up URL management, downloading, parsing, and file output."""
     # Construction order preserved from the original implementation.
     url_manager = UrlManager.UrlManager()
     fetcher = HtmlDownloader.HtmlDownloader()
     parser = HtmlParser.HtmlParser()
     writer = FileOutputer.FileOutputer()
     self.urlManager = url_manager
     self.htmlDownloader = fetcher
     self.htmlParser = parser
     self.fileOutputer = writer
# coding=utf-8
import IOUtils
import HtmlDownloader
import HtmlParser
import HtmlOutputer
import bd

# Load apartment-complex records from a JSON dump, look up each complex's
# position via bd.getPos, and mark facility fields when the lookup fails.
# Python 2 script (print statements, u'' literals).
L=[]
hd = HtmlDownloader.HtmlDownloader()
hp = HtmlParser.HtmlParser()
io =IOUtils.IOUtils()
ho = HtmlOutputer.HtmlOutputer()
# Records parsed from the JSON file: a list of dicts keyed by Chinese labels.
datas = io.getListOrDictFromJsonFile("F:\\ajk\\info.json")
print len(datas)
L_res=[]
for data in datas:
    name = data[u'小区名称']  # complex name
    print name
    # This step is for URL encoding (the name must be UTF-8 bytes).
    name = name.encode('utf-8') 
    url = data[u'网站']  # listing URL

    d = bd.getPos(name)  # position lookup; None when nothing is found
    if d is None:
        # Lookup failed: fill every derived field with "未找到" ("not found").
        data[u'精度']=u"未找到"
        data[u'纬度']=u"未找到"
        data[u'附件500米幼儿园']=u"未找到"
        data[u'附近3000米幼儿园']=u"未找到"
        data[u'附近500米医院']=u"未找到"
        data[u'附近3000米医院']=u"未找到"
        data[u'附近500米商场']=u"未找到"
        # NOTE(review): this branch appears truncated at the chunk boundary;
        # the remainder of the loop body is not visible here.
Пример #14
0
 def __init__(self):
     """Create the downloader, parser, and data-output components."""
     fetcher = HtmlDownloader()
     parser = HtmlParser()
     writer = DataOutput()
     self.downloader = fetcher
     self.parser = parser
     self.output = writer
Пример #15
0
 def __init__(self):
     """Create the URL manager, downloader, and parser for this spider."""
     url_manager = UrlManager.UrlManager()
     fetcher = HtmlDownloader.HtmlDownloader()
     parser = HtmlParser.HtmlParser()
     self.manager = url_manager
     self.downloader = fetcher
     self.parser = parser
Пример #16
0
 def __init__(self):
     """Initialise the base class, then wire up the crawler components."""
     # Base-class initialisation must come first, as before.
     super(SpiderMain, self).__init__()
     urls = UrlManager.UrlManager()
     fetcher = HtmlDownloader.HtmlDownloader()
     parser = HtmlParser.HtmlParser()
     writer = HtmlOutputer.HtmlOutputer()
     self.urls = urls
     self.downloader = fetcher
     self.parser = parser
     self.outputer = writer