Пример #1
0
class SpiderMain(object):
    """Sequential crawler that wires a URL manager, downloader, parser and outputer."""

    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, url, max_pages=10):
        """Crawl from ``url`` until the queue drains or ``max_pages`` pages succeed.

        Args:
            url: Seed URL handed to the URL manager.
            max_pages: Number of successfully processed pages after which the
                crawl stops (defaults to 10).

        A page that fails is reported and skipped; it does not advance the
        page counter. The outputer is always flushed at the end.
        """
        count = 1
        self.urls.add_new_url(url)
        while self.urls.has_new_url():
            # Bound up front so the failure report below can never hit an
            # unbound name (or a stale URL from the previous iteration).
            new_url = None
            try:
                new_url = self.urls.get_new_url()
                html_cont = self.downloader.download(new_url)
                new_urls, html_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(html_data)
            except Exception as e:
                print(str(e))
                print("%d craw failed : %s" % (count, new_url))
                continue
            # Single-argument print(...) behaves identically on Python 2 and 3.
            print("%d craw success : %s" % (count, new_url))
            if count >= max_pages:
                break
            count = count + 1
        self.outputer.output()
Пример #2
0
 def url_manager_proc(self, url_q: Queue, conn_q: Queue, root_url):
     """Run the URL-manager loop of the control node.

     Args:
         url_q: Queue on which pending URLs (and finally the string
             ``'end'``) are published for the crawler nodes.
         conn_q: Queue from which batches of newly discovered URLs are read.
         root_url: Seed URL that starts the crawl.

     The loop returns once more than 2000 URLs have been handed out, after
     publishing ``'end'`` and saving both URL sets under ``dist``.
     """
     print('url manager process start...')
     url_manager = UrlManager()
     url_manager.add_new_url(root_url)
     print('url manager process started...')
     while True:
         while url_manager.has_new_url():
             new_url = url_manager.get_new_url()
             print('new_url', new_url)
             # Hand the new URL to the worker nodes.
             url_q.put(new_url)
             # Stop after more than 2000 links and persist the progress.
             if url_manager.old_url_size() > 2000:
                 # Tell the crawler nodes that the work is finished.
                 url_q.put('end')
                 print('控制节点发起结束通知')
                 # Shut the manager down and store both URL sets.
                 url_manager.save_process(path.join('dist', 'new_urls.txt'),
                                          url_manager.new_urls)
                 url_manager.save_process(path.join('dist', 'old_urls.txt'),
                                          url_manager.old_urls)
                 return
         # Feed URLs coming back from the result-processing side into the
         # manager.
         try:
             if conn_q.empty():
                 # Nothing to ingest yet: back off briefly instead of
                 # spinning the outer loop at full CPU.
                 time.sleep(0.1)
             else:
                 url_manager.add_new_urls(conn_q.get())
         except Exception:
             # Best-effort: a failed read/merge is retried on the next pass.
             # KeyboardInterrupt/SystemExit are deliberately not swallowed.
             time.sleep(0.1)
Пример #3
0
class SpiderMain(object):
    """Sequential crawler that wires a URL manager, downloader, parser and output."""

    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = HtmlOutputer()

    def craw(self, root_url, page_amount=5, time_sleep=None):
        """Crawl from ``root_url`` until ``page_amount`` pages were processed.

        Args:
            root_url: Seed URL handed to the URL manager.
            page_amount: Number of successfully processed pages after which
                the crawl stops.
            time_sleep: Pause in seconds between two pages. ``None`` keeps
                the historical 2-second pause.

        A page that fails is reported and skipped. The collected data is
        written out once the loop ends.
        """
        # The parameter used to be ignored in favour of a hard-coded 2 s.
        delay = 2 if time_sleep is None else time_sleep
        count = 1
        # Queue the first URL to crawl.
        self.urls.add_new_url(root_url)
        # Keep taking URLs while the manager has any left.
        while self.urls.has_new_url():
            # Bound up front so the failure report never hits an unbound name.
            new_url = None
            try:
                new_url = self.urls.get_new_url()
                print(f'craw{count}:{new_url}')
                # Request the URL; the downloader returns the HTML.
                html_content = self.downloader.download(new_url)
                # Parse the HTML into follow-up links and page data.
                new_urls, new_data = self.parser.parse(html_content)
                # Links found on the page go back into the URL manager.
                self.urls.add_new_urls(new_urls)
                self.output.collect_data(new_url, new_data)
                count += 1
                if count > page_amount:
                    break

                time.sleep(delay)
            except Exception as e:
                print(e)
                print(f'抓取失败:{new_url}')
        self.output.output_html()
class Scheduler(object):
    """Drives a crawl: URL manager -> downloader -> parser -> data output."""

    def __init__(self):
        self.url_manager = UrlManager()
        self.downloader = Downloader()
        self.parser = Parser()
        self.data_output = DadaOutput()

    def crawl(self, start_url, max_page):
        """Process queued pages until the queue drains or ``max_page`` is reached.

        Each page is downloaded and parsed (the parser is always given
        ``start_url`` as its first argument), its links are queued, its data
        is stored and the HTML output is rewritten right away.
        """
        manager = self.url_manager
        manager.add_new_url(start_url)
        while True:
            # Same two-step, short-circuiting loop condition as before.
            if not manager.has_url():
                break
            if not (manager.old_url_size() < max_page):
                break
            current = manager.get_new_url()
            html = self.downloader.down(current)
            links, record = self.parser.parse(start_url, html)
            manager.add_new_urls(links)
            sink = self.data_output
            sink.store_data(record)
            sink.output_html()
            print('第%s条数据写入' % (manager.old_url_size()))
Пример #5
0
class SpiderMain():
    """Sequential crawler that wires a URL manager, downloader, parser and outputer."""

    def __init__(self):
        # URL manager
        self.urls = UrlManager()
        # Page downloader
        self.downloader = HtmlDownloader()
        # Page parser
        self.parser = HtmlParser()
        # Result outputer
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        """Crawl from ``root_url`` and stop after 15 successfully handled pages.

        For every page the download status (SUCCESS/ERROR) is printed, the
        page is parsed, its links are queued and its data collected. A page
        that raises is reported and skipped without advancing the counter.
        The collected data is written out at the end.
        """
        count = 1
        self.urls.add_new_urls({root_url})
        # NOTE(review): `has_new_rul` looks like a misspelling of
        # `has_new_url`; kept as-is because UrlManager is not visible here.
        while self.urls.has_new_rul():
            try:
                new_url = self.urls.get_new_url()
                print("craw %d : %s" % (count, new_url))
                html_cont = self.downloader.downloader(new_url)

                # Report the download status for both outcomes; the report
                # used to be nested in the success branch, so "ERROR" was
                # never printed.
                downStat = "SUCCESS" if html_cont is not None else "ERROR"
                print("[Page ID : %d downloader %s!]" % (count, downStat))

                new_urls, new_data = self.parser.parser(new_url, html_cont)

                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)

                if count == 15:
                    break
                count = count + 1
            except Exception as err:
                print("craw failed! ERROR infomation : %s" % err)
        self.outputer.output_html()
Пример #6
0
class Spider():
    """Crawler capped at 50 visited URLs, built from manager/downloader/parser/output."""

    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HTMLDownloader()
        self.parser = HTMLParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Crawl from ``root_url`` while URLs remain and fewer than 50 were visited.

        Any exception raised while handling a page is printed and the crawl
        moves on. The stored data is written out once the loop ends.
        """
        self.manager.add_new_url(root_url)
        while True:
            # Loop condition, evaluated outside the try exactly as before.
            if not self.manager.has_new_url():
                break
            if not (self.manager.old_urls_size() < 50):
                break
            try:
                target = self.manager.get_new_url()
                page = self.downloader.download(target)
                links, record = self.parser.parser(target, page)
                self.manager.add_new_urls(links)
                self.output.store_data(record)
                print("已经抓取%s个链接" % self.manager.old_urls_size())
            except Exception as problem:
                print(problem)
        self.output.output_html()
Пример #7
0
class SpiderMain(object):
    """Sequential crawler that stops after 100 successfully handled pages."""

    def __init__(self):
        self.urls = UrlManager()
        self.downlaoder = HtmlDownlaoder()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        """Crawl from ``root_url``; failed pages are reported and skipped.

        The page counter only advances on success and the crawl ends once it
        reaches 100 or the URL manager runs dry. The collected data is then
        written out.
        """
        crawled = 1
        # Seed the URL manager with the root URL.
        self.urls.add_url(root_url)

        # Main crawl loop.
        while self.urls.has_new_url():
            try:
                # Take one pending URL.
                target = self.urls.get_new_url()
                print('craw %d: %s' % (crawled, target))

                # Download its content.
                page = self.downlaoder.download(target)

                # Parse the content into follow-up URLs and page data.
                links, record = self.parser.parse(target, page)

                # Queue the follow-up URLs.
                self.urls.add_new_urls(links)

                # Hand the page data to the outputer.
                self.outputer.collect_data(record)
            except Exception:
                print('craw failed')
                continue

            # Only reached when the page went through without an error.
            if crawled == 100:
                break
            crawled += 1

        self.outputer.output_html()
Пример #8
0
 def url_manager_proc(self, url_q, conn_q, root_url):
     """Run the URL-manager loop of the control node.

     Args:
         url_q: Queue on which pending URLs (and finally the string
             ``'end'``) are published for the crawler nodes.
         conn_q: Queue from which batches of newly discovered URLs are read.
         root_url: Seed URL that starts the crawl.

     The loop returns once more than 2000 URLs have been handed out, after
     publishing ``'end'`` and saving both URL sets to ``new_urls.txt`` and
     ``old_urls.txt``.
     """
     url_manager = UrlManager()
     url_manager.add_new_url(root_url)
     print(url_q)
     while True:
         while url_manager.has_new_url():
             new_url = url_manager.get_new_url()
             url_q.put(new_url)
             print('old_url=%s' % url_manager.old_url_size())
             if url_manager.old_url_size() > 2000:
                 # Tell the crawler nodes to stop, then persist progress.
                 url_q.put('end')
                 print('控制节点发起结束通知!')
                 url_manager.save_progress('new_urls.txt',
                                           url_manager.new_urls)
                 url_manager.save_progress('old_urls.txt',
                                           url_manager.old_urls)
                 return
         try:
             if conn_q.empty():
                 # Nothing to ingest yet: back off briefly instead of
                 # spinning the outer loop at full CPU.
                 time.sleep(0.1)
             else:
                 url_manager.add_new_urls(conn_q.get())
         except Exception:
             # Best-effort: a failed read/merge is retried on the next pass.
             # KeyboardInterrupt/SystemExit are deliberately not swallowed.
             time.sleep(0.1)
Пример #9
0
class SpiderMain(object):
    """Sequential crawler that stops after 200 successfully handled pages."""

    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        """Crawl from ``root_url``; failed pages are reported and skipped.

        The page counter only advances on success and the crawl ends once it
        reaches 200 or the URL manager runs dry. The collected data is then
        written out.
        """
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()  # next pending URL
                html_cont = self.downloader.download(new_url)  # fetch it
                new_urls, new_data = self.parser.parse(new_url,
                                                       html_cont)  # parse it
                self.urls.add_new_urls(new_urls)  # queue discovered URLs
                self.outputer.collect_data(new_data)  # keep the parsed data
                if count == 200:
                    break
                count = count + 1
            except Exception:
                # Only ordinary errors are treated as a failed page; a bare
                # ``except:`` would also swallow KeyboardInterrupt and
                # SystemExit and make the crawl impossible to interrupt.
                print("craw failed")
        self.outputer.output_html()
Пример #10
0
class SpiderMain():
    """Main module of the second-hand-housing crawler.

    The crawl runs in two phases: first every listing page of every area is
    fetched to collect detail-page links, then each detail page is
    downloaded, parsed and handed to the outputer.
    """

    # Number of listing pages to fetch per area slug.
    DEFAULT_AREAS = {
        "gulou": 100,
        "jianye": 72,
        "qinhuai": 100,
        "xuanwu": 67,
        "yuhuatai": 32,
        "qixia": 62,
        "baijiahu": 33,
        "chalukou1": 26,
        "jiangningqita11": 3,
        "dongshanzhen": 29,
        "jiangningdaxuecheng": 15,
        "jiulonghu": 12,
        "jiangjundadao11": 22,
        "kexueyuan": 9,
        "qilinzhen": 42,
        "tiexinqiao": 9,
        "pukou": 100,
        "liuhe": 1,
    }

    # Pauses in seconds. Class-level so a subclass or an instance can tune
    # them; the values are the ones that used to be hard-coded.
    DOWNLOAD_ERROR_PAUSE = 60 * 30  # after any failed download
    PHASE_PAUSE = 60 * 20           # between the listing and detail phases
    BATCH_PAUSE = 60 * 20           # each time the batch counter wraps
    BATCH_SIZE = 2500               # batch counter value that triggers it
    MAX_RANDOM_PAUSE = 3            # upper bound of the random per-page pause

    def __init__(self):
        """Create the collaborating components."""
        self.urls = UrlManager()
        self.log = MyLog("spider_main", "logs")
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, root_url, areas=None):
        """Entry point of the crawler.

        Args:
            root_url: Base URL; listing pages are ``root_url + area +
                "/pg<N>/"`` (e.g. https://nj.lianjia.com/ershoufang/gulou/pg2/).
            areas: Optional mapping of area slug to number of listing pages.
                Defaults to ``DEFAULT_AREAS``.
        """
        if areas is None:
            areas = self.DEFAULT_AREAS
        # 1. Collect every detail-page link into the URL manager.
        self._collect_detail_urls(root_url, areas)
        time.sleep(self.PHASE_PAUSE)
        # 2. Download, parse and output every detail page.
        self._crawl_details()

    def _collect_detail_urls(self, root_url, areas):
        """Fetch each listing page and queue the detail links it contains."""
        for area, pg_sum in areas.items():
            for num in range(1, pg_sum + 1):
                # 1.1 Build the listing-page address.
                pg_url = root_url + area + "/pg" + str(num) + "/"
                self.log.logger.info("1.1 拼接页面地址:" + pg_url)
                print("1.1 拼接页面地址:" + pg_url)
                # 1.2 Download the listing page.
                try:
                    html_cont = self.downloader.download(pg_url)
                except Exception as e:
                    self.log.logger.error("1.2 下载页面出现异常:" + repr(e))
                    time.sleep(self.DOWNLOAD_ERROR_PAUSE)
                    continue
                # 1.3 Extract the detail-page links.
                try:
                    ershoufang_urls = self.parser.get_erhoufang_urls(
                        html_cont)
                except Exception as e:
                    self.log.logger.error("1.3 页面解析出现异常:" + repr(e))
                    continue
                self.urls.add_new_urls(ershoufang_urls)
                # Pause a whole number of seconds in [0, MAX_RANDOM_PAUSE].
                time.sleep(random.randint(0, self.MAX_RANDOM_PAUSE))

    def _crawl_details(self):
        """Process queued detail pages until the URL manager is empty."""
        record_id = 1
        batch = 1
        while self.urls.has_new_url():
            # 2.1 Take the next URL. On failure skip this round: the handler
            # used to reference a possibly-unbound `detail_url` and then fall
            # through to download a stale (or missing) URL.
            try:
                detail_url = self.urls.get_new_url()
                self.log.logger.info("2.1 二手房页面地址:" + detail_url)
                print("2.1 二手房页面地址:" + detail_url)
            except Exception as e:
                print("2.1 拼接地址出现异常")
                self.log.logger.error("2.1 拼接地址出现异常:" + repr(e))
                continue

            # 2.2 Download the page; on failure re-queue the URL and wait.
            try:
                detail_html = self.downloader.download(detail_url)
            except Exception as e:
                self.log.logger.error("2.2 下载页面出现异常:" + repr(e))
                self.urls.add_new_url(detail_url)
                time.sleep(self.DOWNLOAD_ERROR_PAUSE)
                continue

            # 2.3 Parse the page.
            try:
                ershoufang_data = self.parser.get_ershoufang_data(
                    detail_html, record_id)
            except Exception as e:
                self.log.logger.error("2.3 解析页面出现异常:" + repr(e))
                continue

            # 2.4 Output the data.
            try:
                self.outputer.collect_data(ershoufang_data)
            except Exception as e:
                self.log.logger.error("2.4 输出数据出现异常:" + repr(e))
                continue

            print(record_id)
            record_id = record_id + 1
            batch = batch + 1
            # Pause a whole number of seconds in [0, MAX_RANDOM_PAUSE].
            time.sleep(random.randint(0, self.MAX_RANDOM_PAUSE))
            if batch == self.BATCH_SIZE:
                batch = 1
                time.sleep(self.BATCH_PAUSE)
Пример #11
0
class spider_main():
    """Two-phase second-hand-housing crawler.

    Phase one fetches every listing page of every area to collect
    detail-page links; phase two downloads, parses and writes out each
    detail page.
    """

    # Number of listing pages to fetch per area slug.
    DEFAULT_AREAS = {
        "gulou": 100,
        "jianye": 100,
        "qinhuai": 100,
        "xuanwu": 100,
        "yuhuatai": 100,
        "qixia": 100,
        "baijiahu": 64,
        "jiangningqita11": 5,
        "chalukou1": 63,
        "dongshanzhen": 42,
        "jiangningdaxuecheng": 28,
        "jiulonghu": 28,
        "jiangjundadao11": 50,
        "kexueyuan": 16,
        "pukou": 100,
        "liuhe": 13,
        "lishui": 9,
        "jiangning": 100,
        "qilinzhen": 83,
        "tangshanzhen": 2,
        "fenghuangxijie1": 82,
        "xianlin2": 33,
        "yaohuamen": 4,
        "maigaoqiao1": 33,
        "maqun1": 31,
        "qixiaqita1": 5,
        "xiaozhuang": 9,
        "yanziji": 2,
        "yueyuan": 15,
        "wanshou1": 5,
        "hongshan1": 16,
        "caochangmendajie": 27,
        "dinghuaimendajie": 37,
        "fujianlu": 9,
        "hanzhongmendajie": 19,
        "huxijie": 15,
        "jiangdong2": 8,
        "nanhu4": 38,
        "nanyuan2": 38,
        "shuiximen1": 13,
        "wandaguangchang1": 25,
        "xiaoxing": 13,
        "yuhuaxincun": 15,
        "lukou": 14,
        "dingshanjiedao": 8,
        "gaoxinqu2": 12,
        "jiangpujiedao": 29,
        "pukouqita11": 8,
        "qiaobei": 100,
        "taishanjiedao": 12,
    }

    # Pauses in seconds. Class-level so a subclass or an instance can tune
    # them; the values are the ones that used to be hard-coded.
    LISTING_ERROR_PAUSE = 60   # after a failed listing-page download
    PHASE_PAUSE = 60           # between the listing and detail phases
    DETAIL_ERROR_PAUSE = 10    # after a failed detail-page download
    ITEM_PAUSE = 0.2           # after each written record
    BATCH_PAUSE = 60           # each time the batch counter wraps
    BATCH_SIZE = 2500          # batch counter value that triggers it

    def __init__(self):
        self.urls = UrlManager()
        self.parser = HtmlParser()
        self.downloader = UrlDownloader()
        self.log = MyLog("spider", "logs")
        self.output = HtmlOutPut()

    def Crawling(self, root_url, areas=None):
        """Start the crawl from the main module.

        Args:
            root_url: Base URL; listing pages are ``root_url + area +
                "/pg<N>/"`` (e.g. https://nj.lianjia.com/ershoufang/).
            areas: Optional mapping of area slug to number of listing pages.
                Defaults to ``DEFAULT_AREAS``.
        """
        if areas is None:
            areas = self.DEFAULT_AREAS
        self._collect_detail_urls(root_url, areas)
        time.sleep(self.PHASE_PAUSE)
        self._crawl_details()

    def _collect_detail_urls(self, root_url, areas):
        """Fetch each listing page and queue the detail links it contains."""
        for area, num in areas.items():
            for n in range(1, num + 1):
                # Build the listing-page URL, log it and echo it.
                splice_url = root_url + area + "/pg" + str(n) + "/"
                self.log.logger.info("url地址拼接" + splice_url)
                print("url地址拼接" + splice_url)
                # Download the listing page; on failure log and wait.
                try:
                    html_down = self.downloader.download(splice_url)
                except Exception as e:
                    self.log.logger.error("html下载出现错误" + repr(e))
                    time.sleep(self.LISTING_ERROR_PAUSE)
                    continue
                # Extract the detail links; on failure just log.
                try:
                    secondhome_urls = self.parser.get_secondhandhome_urls(
                        html_down)
                except Exception as e:
                    self.log.logger.error("html页面解析错误" + repr(e))
                    continue
                self.urls.add_new_urls(secondhome_urls)

    def _crawl_details(self):
        """Process queued detail pages and write out the parsed records."""
        record_id = 1  # running record number handed to the parser
        batch = 1
        # NOTE(review): the loop runs while `isEmpty_new_urls()` is truthy,
        # which reads inverted; UrlManager is not visible here, so the call
        # is kept unchanged. Confirm it returns True while URLs remain.
        while self.urls.isEmpty_new_urls():
            # Take the next URL. On failure skip this round: the handler
            # used to reference a possibly-unbound `temp_url` and then fall
            # through to download a stale (or missing) URL.
            try:
                temp_url = self.urls.get_new_url()
                print("html页面地址" + temp_url)
                self.log.logger.info("html页面地址" + temp_url)
            except Exception as e:
                print("html页面地址获取失败")
                self.log.logger.error("获取url错误" + repr(e))
                continue

            # Download the page; on failure re-queue the URL and wait.
            try:
                temp_data = self.downloader.download(temp_url)
            except Exception as e:
                print("页面下载失败" + temp_url)
                self.log.logger.error("页面下载失败" + repr(e))
                self.urls.add_new_url(temp_url)
                time.sleep(self.DETAIL_ERROR_PAUSE)
                continue

            # Parse the page.
            try:
                temp_parser = self.parser.get_secondhandhome_data(
                    temp_data, record_id)
            except Exception as e:
                self.log.logger.error("html页面解析错误" + repr(e))
                print("html页面解析错误" + repr(e))
                continue

            # Write the parsed record out.
            try:
                self.output.write_data(temp_parser)
            except Exception as e:
                self.log.logger.error("数据集写出错误" + repr(e))
                print("数据集写出错误" + repr(e))
                continue

            print(record_id)
            record_id = record_id + 1
            batch = batch + 1
            time.sleep(self.ITEM_PAUSE)
            if batch == self.BATCH_SIZE:
                batch = 1
                time.sleep(self.BATCH_PAUSE)
Пример #12
0
class GrabMain(object):
    """Grabs text or images from the pages reachable from a root URL.

    Link collection runs sequentially; the per-page work is fanned out over
    ``threadpool`` worker threads.
    """

    def __init__(self, url):
        self.root_url = url
        self.urlManager = UrlManager()
        self.dLoader = HtmlDLoader()
        self.contParser = HtmlParser()
        self.contOutputer = HtmlOutputer()

    def _collect_urls(self):
        """Walk the "next" links from the root URL and queue every content URL."""
        self.urlManager.add_new_next_url(self.root_url)
        self.contParser.parser_set(None, None, None, None, None)
        while self.urlManager.get_new_next_count():
            try:
                new_url = self.urlManager.get_new_next_url()
                html_cont = self.dLoader.download(new_url)
                urls, nexts = self.contParser.parser_text_urls(html_cont)
                self.urlManager.add_new_next_urls(nexts)
                self.urlManager.add_new_urls(urls)
            except Exception:
                print("url is error.")

    def _run_pool(self, worker, jobs, size):
        """Run ``worker`` over ``jobs`` on ``size`` threads and wait for all."""
        pool = threadpool.ThreadPool(size)
        for req in threadpool.makeRequests(worker, jobs):
            pool.putRequest(req)
        pool.wait()

    def grabText(self):
        """Collect all content URLs, then grab their text on 10 threads."""
        if self.root_url is None:
            return
        self._collect_urls()
        self._run_pool(self.thread_grabText, self.urlManager.new_urls, 10)

    def thread_grabText(self, url):
        """Worker: download ``url``, extract title and text, and output them."""
        try:
            print("curr url is %s." % url)
            html_cont = self.dLoader.download(url)
            title, cont = self.contParser.parser_text_cont(html_cont)
            self.contOutputer.output_cont(title, cont)
        except Exception:
            print("url is %s, error." % url)

    def grabImgs(self):
        """Collect all content URLs, then grab their images on 10 threads."""
        if self.root_url is None:
            return None
        self._collect_urls()
        self._run_pool(self.thread_grabImg, self.urlManager.new_urls, 10)

    def thread_grabImg(self, url):
        """Worker: download ``url`` and save each image into a per-title directory.

        The directory is named after the page title. When there is no title,
        or the title cannot be used as a directory name, a timestamp string
        is used instead.
        """
        try:
            print("curr url is %s." % url)
            html_cont = self.dLoader.download(url)
            title, links = self.contParser.parser_img_cont(html_cont)
            if links is None or len(links) == 0:
                print("url is %s, not src." % url)
                return None

            # The timestamp must be a string: os.path.isdir/os.mkdir reject
            # a float, which made the untitled case fail every time.
            if title is None:
                title = str(time.time())
            try:
                if not os.path.isdir(title):
                    os.mkdir(title)
            except Exception:
                # Title not usable as a directory name; fall back.
                title = str(time.time())
                if not os.path.isdir(title):
                    os.mkdir(title)

            # One job per image: positional args (title, link, index).
            params = [([title, link, index], None)
                      for index, link in enumerate(links)]
            self._run_pool(self.contOutputer.output_img, params, 12)
        except Exception:
            print("url is %s, error." % url)
Пример #13
0
class Wenku():
    """Client for www.wenku8.net: logs in, indexes article-list pages, downloads files."""

    def __init__(self):
        self.authority = r'https://www.wenku8.net'
        self.loginurl = r'https://www.wenku8.net/login.php?do=submit&jumpurl=http%3A%2F%2Fwww.wenku8.net%2Findex.php'
        self.pageurl = r"/modules/article/articlelist.php?page="
        self.username = r''
        self.password = r''

        # Login form payload.
        self.formdata = {
            'username': self.username,
            'password': self.password,
            'usecookie': '0',
            'action': r'login',
            'submit': r'%26%23160%3B%B5%C7%26%23160%3B%26%23160%3B%C2%BC%26%23160%3B',
        }

        # Request headers sent with the login POST.
        self.headers = {
            'origin': r'https://www.wenku8.net',
            'referer': r'https://www.wenku8.net/login.php?jumpurl=http%3A%2F%2Fwww.wenku8.net%2Findex.php',
            'upgrade-insecure-requests': '1',
            'user-agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        }

        self.session = requests.Session()
        self.manager = UrlManager()
        self.downloader = UrlDownloader()
        self.parser = UrlParser(self.session)

    def login(self):
        """POST the login form; return True when the response status is 200."""
        reply = self.session.post(self.loginurl,
                                  data=self.formdata,
                                  headers=self.headers)
        return reply.status_code == 200

    def parser_some_pages(self, begin, end):
        """Index list pages ``begin`` .. ``end - 1``; stop at the first non-200 page."""
        for page_no in range(begin, end):
            page = self.session.get(self.authority + self.pageurl +
                                    str(page_no))
            if page.status_code != 200:
                print('get page error page num: ' + str(page_no) +
                      ' ,error code: ' + str(page.status_code))
                return
            page.encoding = 'gbk'
            self.parser_one_page(page)
            print('parser page ' + str(page_no) + ' done!')
            # Random pause of up to 3 seconds between pages.
            time.sleep(random.random() * 3)

    def parser_one_page(self, index):
        """Parse one list-page response and pass the result to the URL manager."""
        self.manager.add_new_urls(self.parser.parser(index, index.text))

    def save_2_files(self, filename):
        """Ask the URL manager to save its content to ``filename``."""
        self.manager.save_2_file(filename)

    def Run(self):
        """Index pages 1-100 on ten threads (ten pages each), then save to dict.txt."""
        workers = [
            threading.Thread(target=self.parser_some_pages,
                             args=(1 + i * 10, 1 + (i + 1) * 10))
            for i in range(0, 10)
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        self.save_2_files('dict.txt')

    def load_and_download(self):
        """Read dict.txt and download the first entry of every item to ``<key>.txt``."""
        with open('dict.txt', 'r') as handle:
            urls = json.load(fp=handle)
        for key, value in urls.items():
            # '?' is replaced by '!' in the target file name.
            name = (key + '.txt').replace('?', '!')
            self.downloader.download(value[0], name)
            print('download done ' + name)
            time.sleep(random.random() * 3)
Пример #14
0
class NewsCrawler:
    """News crawler that expands from a seed page and stores articles in MySQL."""

    # Parameterised insert: the driver escapes every value, so scraped text
    # containing quotes can neither break nor inject into the statement.
    _INSERT_NEWS_SQL = (
        "INSERT IGNORE INTO news_info "
        "(url, post_time, title, keywords, content, source, origin) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s)"
    )

    def __init__(self):
        self.seed = ['', 'http://news.163.com/']  # NetEase News home page
        self.downloader = multiThreadDownloader.downloader()
        self.analyze = HtmlAnalyze()
        self.craw_url_man = UrlManager()
        self.page_url_man = UrlManager()
        self.conn = MySQLdb.connect(
            host='localhost', user='******', passwd='toor',
            db='newsGather', charset='utf8')
        self.cur = self.conn.cursor()

        # Mark URLs already stored in the database as old so they are not
        # downloaded again.
        self.cur.execute("select url from news_info;")
        exist_urls = [row[0] for row in self.cur.fetchall()]
        if exist_urls:
            self.page_url_man.add_old_urls(exist_urls)

    def get_news(self, website):
        """Download, analyse and store every pending news URL.

        Args:
            website: Site identifier passed through to the analyser.

        Returns:
            The list of analysed article dicts (each with its ``url`` added),
            including those whose database insert failed.
        """
        news = []
        count = 0
        new_urls = self.page_url_man.get_new_urls(len(self.page_url_man.new_urls))
        print("获取新闻网页:")
        pages = self.downloader.download(new_urls, 6)
        print("分析新闻网页并存储新闻....")
        for page in pages:
            dic = self.analyze.Content(website, page['content'])
            if not dic:
                continue
            dic['url'] = page['url']
            news.append(dic)
            try:
                print('save  %s' % dic['url'])
                # Use this instance's cursor/connection (the code used to
                # reach for a module-level `spider` global).
                # NOTE(review): content was previously passed through
                # `raw()` before being spliced into the SQL text; with
                # driver-side parameter binding that pre-escaping is no
                # longer applied.
                self.cur.execute(self._INSERT_NEWS_SQL, (
                    dic['url'], dic['post_time'], dic['title'],
                    dic['keywords'], dic['content'], dic['source'],
                    dic['origin']))
                self.conn.commit()
                count += 1
            except Exception:
                print("save error!")
        print('抓取新闻数:%d' % count)
        return news

    def craw(self, news_num, website, expand_patt, news_patt):
        """Expand from the seed page and collect news articles.

        Args:
            news_num: Once more than this many news URLs are pending, they
                are fetched and the crawl stops.
            website: Index into ``self.seed``; also passed to the analyser.
            expand_patt: Pattern selecting links to expand further.
            news_patt: Pattern selecting links to news articles.

        Returns:
            The list of article dicts gathered by ``get_news``.
        """
        self.craw_url_man.add_new_url(self.seed[website])
        news = []
        i = 0
        # `has_new_url` must be called: the bare bound method is always
        # truthy, so the loop could never end by running out of URLs.
        while self.craw_url_man.has_new_url():
            print("第%d次扩展:" % i)
            # Expand at most 60 pages per round.
            batch = min(len(self.craw_url_man.new_urls), 60)
            new_urls = self.craw_url_man.get_new_urls(batch)

            pages = self.downloader.download(new_urls, 6)
            print("分析待扩展页面.....")
            enough = False
            for page in pages:
                self.craw_url_man.add_new_urls(
                    self.analyze.getUrl(page['content'], expand_patt))
                self.page_url_man.add_new_urls(
                    self.analyze.getUrl(page['content'], news_patt))
                if len(self.page_url_man.new_urls) > news_num:
                    enough = True
                    break
            # Pending news pages are fetched after every round, whether or
            # not the target was reached.
            news += self.get_news(website)
            if enough:
                break
            i = i + 1
        return news
Пример #15
0
class SpiderMain():
    """Crawler that dispatches each URL to a handler chosen from keywords in the URL."""

    def __init__(self):
        self.urlDownLoader = HtmlDownloader()
        self.htmlParser = HtmlParser()
        self.urlManager = UrlManager()
        self.jsondataParser = JsonData_Parser()
        self.htmlOutPuter = HtmlOutPuter()

    def _queue_parsed_urls(self, url, page_type):
        """Download ``url``, parse it as ``page_type`` and queue the resulting URLs."""
        urls = self.htmlParser.parse(
            htmlContent=self.urlDownLoader.download(url), type=page_type)
        self.urlManager.add_new_urls(urls)

    def _get_from_discover_toplist(self, url):
        """Handle a discover/toplist page: queue the URLs it yields."""
        self._queue_parsed_urls(url, 'discover_toplist')

    def _get_from_discover_artist(self, url):
        """Handle a discover/artist page: queue the URLs it yields."""
        self._queue_parsed_urls(url, 'discover_artist')

    def _get_from_artist(self, url):
        """Handle an artist page: print each name and queue its URLs."""
        results = self.htmlParser.parse(
            htmlContent=self.urlDownLoader.download(url), type='artist')
        for name, urls in results.items():
            print(name)
            self.urlManager.add_new_urls(urls)

    def _get_from_song(self, url):
        """Handle a song page: collect its comments under the song name."""
        name = self.htmlParser.parse(
            htmlContent=self.urlDownLoader.download(url), type='song')
        print("正收集:" + name)
        comments = self.jsondataParser.parse(
            self.urlDownLoader.downloadJsonData(url))
        self.htmlOutPuter.collect_datas({name: comments})

    def _parse_url(self, url):
        """Derive the handler suffix from the keywords contained in ``url``.

        'discover', 'artist' and 'toplist' are joined with '_' in that order
        when present; 'song' is appended without a separator. Returns '' when
        no keyword matches.
        """
        present = [kw for kw in ('discover', 'artist', 'toplist') if kw in url]
        res = '_'.join(present)
        if 'song' in url:
            res += 'song'
        return res

    def craw(self, rootUrl, direction=""):
        """Crawl from ``rootUrl`` and write the result via the outputer.

        Args:
            rootUrl: Start URL. A '#' and the character after it (e.g. the
                '#/' of a hash-routed URL) are removed first.
            direction: Passed through to ``output_html``.

        URLs whose keyword combination has no ``_get_from_*`` handler are
        skipped.
        """
        pos = rootUrl.find('#')
        if pos != -1:
            rootUrl = rootUrl[:pos] + rootUrl[pos + 2:]
        self.urlManager.add_new_url(rootUrl)
        while self.urlManager.has_new_url():
            url = self.urlManager.get_url()
            # Default of None makes the guard effective: a plain getattr
            # raised AttributeError for any unrecognised URL type.
            method = getattr(self, '_get_from_' + self._parse_url(url), None)
            if method is not None:
                method(url)
        self.htmlOutPuter.output_html(direction=direction)