Example #1
class SpiderMan(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parser_url(root_url, content)
        for url in urls:
            try:
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                    '?Ajax_CallBack=true' \
                    '&Ajax_CallBackType=Mtime.Library.Services' \
                    '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                    '&Ajax_CrossDomain=1' \
                    '&Ajax_RequestUrl=%s' \
                    '&t=%s' \
                    '&Ajax_CallBackArgument0=%s' % (url[0],t,url[1])
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                print('Crawl failed')
        self.output.output_end()
        print('Crawl finish')
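Example #1 leans on companion classes (HtmlDownloader, HtmlParser, DataOutput) that live elsewhere in that project and are not shown here. As a rough sketch of what the downloader half of such a spider typically looks like (the requests usage and header below are assumptions, not the original code):

import requests


class HtmlDownloader(object):
    # Illustrative sketch only; the real HtmlDownloader in Example #1 may differ.
    def download(self, url):
        if url is None:
            return None
        headers = {'User-Agent': 'Mozilla/5.0'}  # assumed UA header, as in several examples below
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            return None
        response.encoding = 'utf-8'
        return response.text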
Example #2
 def __init__(self):
     """构造函数,初始化属性"""
     self.urls = UrlManager()
     self.log = MyLog("spider_main", "logs")
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.outputer = HtmlOutputer()
Example #3
class SpiderMain:

    def __init__(self):
        """
        Initialization: instantiate the other components
        """
        self.url_manager = UrlManager()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.data_storage = DataStorage()

    def start(self):
        """
        Main entry point of the crawler
        :return:
        """
        """ 页码 """
        title = set()
        for a in range(2, 10):
            html = self.html_downloader.download(
                'http://ggzy.foshan.gov.cn/jyxx/fss/zfcg_1108551/zbxx/index_'+str(a)+'.html?1')
            _title = self.html_parser.titleParer(html)
            for i in _title:
                title.add(i)
        for i in title:
            print(i)
            html = self.html_downloader.download(i)
            _product = self.html_parser.contextParer(html)
            self.data_storage.storage(_product)
Example #4
 def __init__(self, url):
     self.root_url = url
     self.urlManager = UrlManager()
     self.dLoader = HtmlDLoader()
     self.contParser = HtmlParser()
     self.contOutputer = HtmlOutputer()
     pass
Example #5
 def __init__(self):
     """
     Initialization: instantiate the other components
     """
     self.url_manager = UrlManager()
     self.html_downloader = HtmlDownloader()
     self.html_parser = HtmlParser()
     self.data_storage = DataStorage()
Example #6
 def __init__(self):
     self.url = UrlManager()
     self.downloader = Downloader()
     self.parser = HtmlParser()
     self.output = OutputUse()
     self.headers = {
         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.100 Safari/537.36',
     }
Example #7
 def test_get_links(self):
     """
     Tests get_links method
     """
     file_util = FileUtil()
     expected_links = file_util.get_file_contents("links_test_data.txt")
     html_parser = HtmlParser()
     html_test_data = file_util.get_file_contents("html_test_data.html")
     actual_links = html_parser.get_links(html_test_data)
     self.assertEqual(expected_links, actual_links)
Example #8
 def test_get_web_pages(self):
     """
     Tests get_web_pages method
     """
     file_util = FileUtil()
     expected_web_pages = file_util.get_file_contents("web_pages_test_data.txt")
     html_parser = HtmlParser()
     same_hostname_urls = file_util.get_file_contents("same_hostname_urls_test_data.txt")
     actual_web_pages = html_parser.get_web_pages(same_hostname_urls)
     self.assertEqual(expected_web_pages, actual_web_pages)
Example #9
 def test_get_same_hostname_urls(self):
     """
     Tests get_same_hostname_urls method
     """
     file_util = FileUtil()
     expected_same_hostname_urls = file_util.get_file_contents("same_hostname_urls_test_data.txt")
     html_parser = HtmlParser()
     hostname = "http://www.domain.com/"
     links = file_util.get_file_contents("links_test_data.txt")
     actual_same_hostname_urls = html_parser.get_same_hostname_urls(hostname, links)
     self.assertEqual(expected_same_hostname_urls, actual_same_hostname_urls)
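Examples #7-#9 exercise an HtmlParser that exposes get_links, get_web_pages and get_same_hostname_urls. The implementation under test is not included here; a minimal sketch of the hostname filter, assuming urllib.parse semantics and plain string URLs, could look like this:

from urllib.parse import urljoin, urlparse


def get_same_hostname_urls(hostname, links):
    # Hypothetical stand-in for the method the tests above call:
    # keep only links that resolve to the same hostname as the base URL.
    base_host = urlparse(hostname).hostname
    same_host_urls = []
    for link in links:
        absolute = urljoin(hostname, link)  # resolve relative links against the base
        if urlparse(absolute).hostname == base_host:
            same_host_urls.append(absolute)
    return same_host_urls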
Example #10
 def __init__(self):
     # URL manager
     # self.urls = UrlManager.UrlManager()
     self.urls = UrlManager()
     # URL downloader
     # self.downloader = HtmlDownloader.HtmlDownloader()
     self.downloader = HtmlDownloader()
     # URL parser
     # self.parser = html_parser.HtmlParser()
     self.parser = HtmlParser()
     # self.outputer = html_outputer.HtmlOutputer()
     self.outputer = HtmlOutputer()
Example #11
 def parse_html(page_url):
     html_string = ''
     try:
         response = urlopen(page_url, timeout=5)
         if 'text/html' in response.getheader('Content-Type'):
             html_bytes = response.read()
             html_string = html_bytes.decode("utf-8")
         finder = HtmlParser(Spider.base_url, page_url)
         finder.feed(html_string)
     except Exception as e:
         print(str(e))
         return set(), html_string
     return finder.page_links(), html_string
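Example #11 uses HtmlParser in the html.parser.HTMLParser style: the object is built with a base URL and the current page URL, fed raw HTML via feed(), and then asked for page_links(). The class itself is not shown; a minimal link-finder sketch on top of the standard-library parser (an assumption about how that project implements it) might be:

from html.parser import HTMLParser
from urllib.parse import urljoin


class LinkFinder(HTMLParser):
    # Hypothetical stand-in for the HtmlParser used in Example #11.
    def __init__(self, base_url, page_url):
        super().__init__()
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        # Collect href values from <a> tags, resolved against the current page URL.
        if tag == 'a':
            for attr, value in attrs:
                if attr == 'href' and value:
                    self.links.add(urljoin(self.page_url, value))

    def page_links(self):
        return self.links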
Example #12
 def __init__(self):
     BaseManager.register('get_task_queue')
     BaseManager.register('get_result_queue')
     server_addr = '127.0.0.1'
     print('Connect to server %s...' % server_addr)
     self.m = BaseManager(address=(server_addr,8001),authkey=b'baike')
     self.m.connect()
     self.task = self.m.get_task_queue()
     print(self.task.qsize())
     self.result = self.m.get_result_queue()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     print('init finish')
Example #13
    async def parse(self):
        retries = PasswordPageParser.RETRIES
        status = None
        while retries > 0 and status != 200:
            html, status = await self.get_page()
            retries -= 1

        table_data = None
        if status == 200:
            table_parser = HtmlParser(html)
            table_data = table_parser.parse('get_table_data')

        if table_data:
            arranged_table_data = self.arrange_table_data(table_data)
            return arranged_table_data
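Example #13 awaits a self.get_page() coroutine that returns an (html, status) pair, which is not shown. A plausible sketch using aiohttp (an assumption; the original HTTP client and the self.url attribute are not part of the excerpt) would be:

import aiohttp


async def get_page(self):
    # Hypothetical helper: fetch self.url and return the body plus HTTP status.
    async with aiohttp.ClientSession() as session:
        async with session.get(self.url) as response:
            html = await response.text()
            return html, response.status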
Example #14
class SpiderWork(object):
    def __init__(self):
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        self.m = BaseManager(address=(server_addr,8001),authkey=b'baike')
        self.m.connect()
        self.task = self.m.get_task_queue()
        print(self.task.qsize())
        self.result = self.m.get_result_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):
        import time
        while(True):
            try:
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('Control node notified the spider node to stop...')
                        self.result.put({'new_urls':'end', 'data':'end'})
                        return
                    print('Spider node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls,data = self.parser.parser(url,content)
                    self.result.put({'new_urls':new_urls, 'data':data})
            except EOFError as e:
                print('Connection to the worker node failed')
                return
            except Exception as e:
                print(e)
                print('Crawl fail')
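Examples #12, #14, #20 and #27 are worker nodes: they expect a control node that exposes get_task_queue and get_result_queue through multiprocessing.managers.BaseManager on port 8001 with authkey b'baike'. That control node is not part of these excerpts; the serving side of the protocol generally follows the standard pattern sketched below (queue setup and the seed URL are assumptions):

import queue
from multiprocessing.managers import BaseManager

task_queue = queue.Queue()    # URLs waiting to be crawled
result_queue = queue.Queue()  # {'new_urls': ..., 'data': ...} dicts sent back by workers

# Expose the two queues under the names the worker nodes register and call.
BaseManager.register('get_task_queue', callable=lambda: task_queue)
BaseManager.register('get_result_queue', callable=lambda: result_queue)

manager = BaseManager(address=('127.0.0.1', 8001), authkey=b'baike')
manager.start()

# Seed the task queue; putting 'end' later tells the workers to shut down.
manager.get_task_queue().put('http://example.com/seed-page')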
Example #15
File: spider.py  Project: ZhitongLei/web
	def start(self):
		url_queue = Queue.Queue()
		url_queue.put((self.root_request_info.url, 0))

		request_info = RequestInfo('', None, self.root_request_info.headers)
		fetcher = Fetcher()

		while not url_queue.empty():
			curr_url, depth = url_queue.get()		
			#print 'url=%s, depth=%d' % (curr_url, depth)
			print curr_url

			if depth > self.depth_limit:
				continue
			
			depth += 1
			request_info.url = curr_url
			page_content = fetcher.request(request_info)

			## parse page
			## Content.parse(page_content)

			url_list = HtmlParser.extract_url(curr_url, page_content)
			if url_list:
				for url in url_list:
					url_queue.put((url, depth))
Example #16
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, url):
        count = 1
        self.urls.add_new_url(url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                html_cont = self.downloader.download(new_url)
                new_urls, html_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(html_data)
                print "%d craw success : %s" % (count, new_url)
                if count >= 10:
                    break
                count = count + 1
            except Exception as e:
                print str(e)
                print "%d craw failed : %s" % (count, new_url)
        self.outputer.output()
Example #17
    def get_parser(self, dom):
        lettingInformationDiv = dom.find("div", id="lettingInformation")

        if lettingInformationDiv:
            return HtmlParser(dom)
        else:
            return PageModelParser(dom)
Example #18
class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = HtmlOutputer()

    def craw(self, root_url, page_amount=5, time_sleep=None):
        count = 1
        # Add the first URL to crawl
        self.urls.add_new_url(root_url)
        # While the set still has URLs, take one and request it; exit the loop when none are left.
        while self.urls.has_new_url():
            try:
                # Start crawling
                new_url = self.urls.get_new_url()
                print(f'craw{count}:{new_url}')
                # Request the URL and get the HTML back
                html_content = self.downloader.download(new_url)
                # Parse the HTML with XPath to pull out the data we need
                new_urls, new_data = self.parser.parse(html_content)
                # Queue the entry links found on this page in the URL manager
                self.urls.add_new_urls(new_urls)
                self.output.collect_data(new_url, new_data)
                count += 1
                if count > page_amount:
                    break

                # honour the time_sleep parameter when given, defaulting to 2 seconds
                time.sleep(time_sleep or 2)
            except Exception as e:
                print(e)
                print(f'Crawl failed: {new_url}')
        self.output.output_html()
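Most of the SpiderMain-style examples (#16, #18, #28, #33) drive the crawl through a UrlManager offering add_new_url, add_new_urls, has_new_url and get_new_url. A minimal set-based sketch of that interface (the usual new/old bookkeeping; the real implementations may differ) is:

class UrlManager(object):
    # Illustrative sketch of the interface the craw() loops above rely on.
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    def get_new_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url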
Example #19
 def __init__(self):
     # Instantiate the other component classes
     self.mysql_handler = MysqlHandler()
     self.html_downloader = HtmlDownloader()
     self.html_parser = HtmlParser()
     # Starting URL for the crawl
     self.root_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
     # Base used to join later URLs
     self.split_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
     # Province page list
     self.province_url_list = []
     # City page list
     self.city_url_list = []
     # District/county page list
     self.county_url_list = []
     # Township/street page list
     self.town_url_list = []
Example #20
 def __init__(self, address='127.0.0.1', port=8001, authkey=b'baike'):
     """初始化分布式进程中工作节点的连接工作"""
     # 注册用于获取Queue的方法名称
     BaseManager.register('get_task_queue')
     BaseManager.register('get_result_queue')
     # 连接到服务器
     print('Connect to server %s:%s...' % (address, port))
     self.manager = BaseManager(address=(address, port), authkey=authkey)
     # 开始连接
     self.manager.connect()
     # 获取Queue对象
     self.task_q = self.manager.get_task_queue()
     self.result_q = self.manager.get_result_queue()
     # 初始化下载器和解析器
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     print('init finish')
Example #21
class Scheduler(object):
    def __init__(self):
        self.url = UrlManager()
        self.downloader = Downloader()
        self.parser = HtmlParser()
        self.output = OutPutUse()
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.100 Safari/537.36',
        }

    def run(self):
        url_seed_main = self.url.get_main_seed_url()
        content = self.downloader.download(url_seed_main,
                                           retry_count=2,
                                           headers=self.headers).decode('utf8')
        subject_urls = self.parser.parse_main_subjects(content)
        for subject_url in subject_urls:
            self._run_subject(subject_url)

    def _run_subject(self, subject_url):
        print('#subject_url#:' + subject_url)
        content = self.downloader.download(subject_url,
                                           retry_count=2,
                                           headers=self.headers).decode('utf8')
        mj_info = self.parser.parse_subject_mj_info(content)
        if mj_info is None:
            return
        mj_max_count = int(mj_info['count'])
        mj_name = str(mj_info['mj_name'])
        cur_count = 1
        index = 1
        while cur_count <= mj_max_count:
            real_url = subject_url
            if index > 1:
                real_url = subject_url[0:len(subject_url) -
                                       5] + ('_' + str(index) + '.html')
            index = index + 1
            # Normally each page holds 4 full-size images
            cur_count = cur_count + 4
            print('Fetching large-image page: ' + real_url)
            content = self.downloader.download(
                real_url, retry_count=2, headers=self.headers).decode('utf8')
            pic_urls = self.parser.parse_page_pics(content)
            for pic_url in pic_urls:
                self.output.download_and_save(pic_url, mj_name)
Example #22
 def main():
     sd = input("Start Date(yyyy,m,d): ")
     ed = input("End Date(yyyy,m,d): ")
     print(datetime.datetime.now())
     multiParsedTagList = hp.get_fullParsedTagList(sd, ed)
     tagSelect = sc.get_singlePageInfo(multiParsedTagList)
     pageInfos = sc.get_pageInfos(tagSelect)
     #pp.print_mergedList(pageInfos)
     pp.save_csv(pageInfos, sd, ed)
     print(datetime.datetime.now())
Example #23
    def __init__(self, max_tasks=10, loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.parser = HtmlParser()
        self.saver = Saver()
        self.url_manager = UrlManager()

        self.max_tasks = max_tasks

        # Initialize the queue
        if not self.url_manager:
            self.url_manager.put("http://company.yellowurl.cn/")
Example #24
class WebCrawler:
    def __init__(self):
        self.url_util = UrlUtil()
        self.html_requester = HtmlRequester()
        self.html_parser = HtmlParser()

    def crawl(self, url):
        """
        Returns the URLs reachable from the parameter URL
        The assets of each URL are also returned.
        Only URLs with the same hostname (including subdomain) as the parameter URL are returned.
        """

        url = self.url_util.normalise_url(url)
        hostname = self.url_util.get_hostname(url)

        urlsToVisit = [url]
        urlsVisted = []
        output = []
        # Each iteration of this loop processes the next URL to visit.
        while (len(urlsToVisit) > 0):

            url = urlsToVisit.pop(0)
            urlsVisted.append(url)

            html = self.html_requester.get_html(url)
            links = self.html_parser.get_links(html)
            same_hostname_urls = self.html_parser.get_same_hostname_urls(
                hostname, links)
            assets = self.html_parser.get_assets(same_hostname_urls)
            web_pages = self.html_parser.get_web_pages(same_hostname_urls)

            output.append({"url": url, "assets": assets})
            print json.dumps({"url": url, "assets": assets}, indent=4)

            for web_page in web_pages:
                # Do not visit a page more than once
                if not web_page in urlsToVisit and web_page not in urlsVisted:
                    urlsToVisit.append(web_page)

        return json.dumps(output, indent=4).splitlines()
Example #25
    def craw(self):
        # Downloader
        downloader = HtmlDownloader()

        root_cont = downloader.download(self.url)
        parser = HtmlParser()
        urls, data = parser.parse(self.url, root_cont, True)
        result = ""
        for url in urls:
            cont = downloader.download(url)
            newurls, month = parser.parse(url, cont, False)
            if month is not None:
                result += month.getMonthly()
            month = None
            #print(month.getMonthly())

        f = open("阿里巴巴数据库内核组月报.md", "w+", encoding='utf-8')
        result = "## 阿里巴巴数据库内核月报\n\n" + result
        f.write(result)
        f.close()

        pass
Example #26
class Spider:
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parse_url(root_url, content)
        for url in urls:
            try:
                # http://service.library.mtime.com/Movie.api
                # ?Ajax_CallBack=true
                # &Ajax_CallBackType=Mtime.Library.Services
                # &Ajax_CallBackMethod=GetMovieOverviewRating
                # &Ajax_CrossDomain=1
                # &Ajax_RequestUrl=http%3A%2F%2Fmovie.mtime.com%2F246526%2F&t=201710117174393728&Ajax_CallBackArgument0=246526
                t = time.strftime('%Y%m%d%H%M%S3282', time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                           '?Ajax_CallBack=true' \
                           '&Ajax_CallBackType=Mtime.Library.Services' \
                           '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                           '&Ajax_CrossDomain=1' \
                           '&Ajax_RequestUrl=%s' \
                           '&t=%s' \
                           '&Ajax_CallbackArgument0=%s' % (url[0].replace('://', '%3A%2F%2F')[:-1], t, url[1])
                rank_content = self.downloader.download(rank_url)
                if rank_content is None:
                    print('None')
                data = self.parser.parse_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                raise e
                # print(e)
                # print('Crawl failed')

        self.output.output_end()
        print('Crawl finish')
Example #27
class SpiderWorker:
    def __init__(self, address='127.0.0.1', port=8001, authkey=b'baike'):
        """初始化分布式进程中工作节点的连接工作"""
        # 注册用于获取Queue的方法名称
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # 连接到服务器
        print('Connect to server %s:%s...' % (address, port))
        self.manager = BaseManager(address=(address, port), authkey=authkey)
        # 开始连接
        self.manager.connect()
        # 获取Queue对象
        self.task_q = self.manager.get_task_queue()
        self.result_q = self.manager.get_result_queue()
        # 初始化下载器和解析器
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish')

    def crawl(self):

        while True:
            try:
                if not self.task_q.empty():
                    url = self.task_q.get()

                    if url == 'end':
                        print('Control node notified the spider node to stop...')
                        # Then pass the stop signal on to the other nodes
                        self.result_q.put({'new_urls': 'end', 'data': 'end'})
                        return

                    print('Spider node is parsing: %s' % url)
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parse(url, content)
                    self.result_q.put({'new_urls': new_urls, 'data': data})

                else:
                    print('task queue is empty', self.task_q.empty())
            except EOFError:
                print('Connection to the worker node failed')
                return
            except Exception as e:
                print(e)
                print('crawl fail')
Example #28
class SpiderMain():
    def __init__(self):
        # URL manager
        # self.urls = UrlManager.UrlManager()
        self.urls = UrlManager()
        # URL downloader
        # self.downloader = HtmlDownloader.HtmlDownloader()
        self.downloader = HtmlDownloader()
        # URL parser
        # self.parser = html_parser.HtmlParser()
        self.parser = HtmlParser()
        # self.outputer = html_outputer.HtmlOutputer()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        count = 1
        originSet = set()
        originSet.add(root_url)
        self.urls.add_new_urls(originSet)
        while self.urls.has_new_rul():
            try:
                new_url = self.urls.get_new_url()
                print "craw %d : %s" % (count, new_url)
                html_cont = self.downloader.downloader(new_url)

                # Output status information
                downStat = "ERROR"
                if html_cont != None:
                    downStat = "SUCCESS"
                    print "[Page ID : %d downloader %s!]" % (count, downStat)

                new_urls, new_data = self.parser.parser(new_url, html_cont)
                # print "\nnew_urls[%s], new_data[%s]" % (new_urls, new_data)

                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)

                if count == 15:
                    break
                count = count + 1
            except Exception as err:
                print "craw failed! ERROR infomation : %s" % err
        self.outputer.output_html()
Example #29
class Crawler(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def get_urls(self, keywords):
        data = {}
        for word in keywords:
            url = self.crawl(word)
            data[word] = url

        return data

    def crawl(self, word):
        results = {}
        url = self.manager.get_url(word)
        page = self.downloader.download(url)

        return self.parser.search(page)
Example #30
class Crawler(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def get_urls(self, keywords):
        data = {}
        for word in keywords:
            url = self.crawl(word)
            data[word] = url
        
        return data
    
    def crawl(self, word):
        results = {}
        url = self.manager.get_url(word)
        page = self.downloader.download(url)
        
        return self.parser.search(page)
Example #31
class CodeSpider(object):
    def __init__(self):
        # Instantiate the other component classes
        #self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.path = "/Users/spike/python_项目/get_cd_school/"
        # # Starting URL for the crawl
        # self.root_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages=1'
        # # Base used to join later URLs
        # self.split_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages='
        # school info
        # self.school_infos = []

    def craw(self, downloading_url):
        try:
            # Record the URL being downloaded/parsed to make debugging easier
            # downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # First argument: the HTML to parse
            # Second argument: the base URL used to join relative URLs
            self.school_infos = self.html_parser.province_parser(html_content)
            # print(self.school_infos)
            #exit()
            if (len(self.school_infos) != 20):
                print(downloading_url + " parsed successfully")
                print("Records on this page: " + str(len(self.school_infos)))
            #print(self.province_url_list)
            with open(self.path + "school.txt", "a") as f:
                # print("writting")
                for mc, xd, qy, xz, dh, dz in self.school_infos:
                    f.write(mc + "\t" + xd + "\t" + qy + "\t" + xz + "\t" +
                            dh + "\t" + dz)
            f.close()
            return len(self.school_infos)

        except Exception as e:
            print('[ERROR] Craw failed! Url:', downloading_url, 'Info:', e)
            # Use traceback to locate the exception
            traceback.print_exc()
Example #32
class CodeSpider(object):
    def __init__(self):
        # Instantiate the other component classes
        self.mysql_handler = MysqlHandler()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        # Starting URL for the crawl
        # self.root_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages=1'
        # # Base used to join later URLs
        # self.split_url = 'http://infomap.cdedu.gov.cn/Home/Index?all=1&pages='
        # # school info
        # self.school_infos = []
        # Adjust the log file path for your environment
        # self.last_log_path = "d:\\log.txt"
        # self.last_log_path = "/Users/spike/spider_log.txt"
    def craw(self,downloading_url):
        try:
            # Record the URL being downloaded/parsed to make debugging easier
            # downloading_url = self.root_url
            html_content = self.html_downloader.download(downloading_url)
            # First argument: the HTML to parse
            self.school_infos = self.html_parser.province_parser(html_content)
            # print(self.school_infos)
            if (len(self.school_infos)!=20):
                print(downloading_url+" parsed successfully")
                print("Records on this page: "+str(len(self.school_infos)))
            for mc,xd,qy,xz,dh,dz in self.school_infos:
                # print(mc+xd+qy+xz+dh+dz)
                province_id = self.mysql_handler.insert(mc,xd,qy,xz,dh,dz)     
                # print(province_id)
                # exit()
                # Record the URL being downloaded/parsed to make debugging easier
            # self.mysql_handler.close()
            return len(self.school_infos)
        except Exception as e:
            print('[ERROR] Craw failed! Url:', downloading_url, 'Info:', e)
            # Use traceback to locate the exception
            traceback.print_exc()
            time.sleep(60)            
Example #33
class SpiderMain(object):
    """docstring for SpiderMain"""
    def __init__(self):
        self.urlManage = UrlManage()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()
    def craw(self,url):
        self.urlManage.add_new_url(url)
        
        count = 1
        while self.urlManage.has_new_url():
            url = self.urlManage.get_new_url()
            print '%dth page,address:%s' % (count,url)
            html_content = self.downloader.downloadPage(url)
            new_urls,new_data = self.parser.parse(html_content,url)
            self.urlManage.add_new_urls(new_urls)
            self.outputer.collect_data(new_data)

            if count == 10:
                break

            count = count + 1 
        self.outputer.output_html()
Example #34
def parse_html(input_file_path):
    result_file_path = "".join([os.path.splitext(input_file_path)[0], '.content'])
    with open(input_file_path, 'r') as html_file:
        doc = HtmlParser(html_file.read(), "lxml")
        doc.write_to_file(result_file_path)
Example #35
 def __init__(self):
     self.urlManage = UrlManage()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.outputer = HtmlOutputer()
Example #36
 def __init__(self):
     self.manager = UrlManager()
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()