示例#1
0
    def addKu6Url(self):
        """Enqueue the Ku6 ranking pages.

        Keys are page-URL templates with a '%d' page-number slot; the mapped
        value is how many pages of that ranking to enqueue.
        """
        siteId = tools.getWebsiteId(Constance.KU6)
        templates = {
            # videos / most played / today
            'http://top.ku6.com/v_t1d1c126000p%d.html': 15,
            # videos / most played / this week
            'http://top.ku6.com/v_t1d2c126000p%d.html': 15,
            # videos / most played / this month
            'http://top.ku6.com/v_t1d3c126000p%d.html': 15,
            # videos / newest recommendations
            'http://top.ku6.com/v_t7d4c126000p%d.html': 30,

            # albums / most played / today
            'http://top.ku6.com/p_t1d1c126000p%d.html': 1,
            # albums / most played / this week
            'http://top.ku6.com/p_t1d2c126000p%d.html': 1,
            # albums / most played / this month
            'http://top.ku6.com/p_t1d3c126000p%d.html': 15,
            # albums / newest recommendations
            'http://top.ku6.com/p_t2d4c126000p%d.html': 30
        }

        for template, pages in templates.items():
            pageNo = 1
            while pageNo <= pages:
                pageUrl = template % pageNo
                log.debug("ku6 base url = %s" % pageUrl)
                self.addUrl(pageUrl, siteId)
                pageNo += 1
示例#2
0
 def addPPTVUrl(self):
     """Enqueue PPTV channel-list pages 1 through 17."""
     siteId = tools.getWebsiteId(Constance.PPTV)
     for pageNo in range(1, 18):
         pageUrl = 'http://list.pptv.com/channel_list.html?page=%d&type=210548&sort=1' % pageNo
         log.debug("pptv base url = %s" % pageUrl)
         self.addUrl(pageUrl, siteId)
示例#3
0
 def addWangYiUrl(self):
     """Enqueue all 614 NetEase documentary search-result pages."""
     template = 'http://so.v.163.com/search/000-0-0000-1-%d-0-纪录片/'
     totalPages = 614
     siteId = tools.getWebsiteId(Constance.WANG_YI)
     for pageIdx in range(totalPages):
         pageUrl = template % (pageIdx + 1)
         log.debug("wangyi base url = %s" % pageUrl)
         self.addUrl(pageUrl, siteId)
示例#4
0
    def addTouDouUrl(self):
        """Seed the first page of each Tudou listing (episodes, videos, columns).

        Only page 1 of each listing is enqueued here; subsequent pages are
        discovered and added by the tudou parser.
        """
        # All three listings belong to the same website — resolve the id once
        # (the original recomputed it identically three times).
        websiteId = tools.getWebsiteId(Constance.TUDOU)

        seeds = (
            # all episodes
            ('http://www.tudou.com/s3portal/service/pianku/data.action?pageSize=90&app=mainsitepc&deviceType=1&tags=&tagType=3&firstTagId=8&areaCode=&initials=&hotSingerId=&pageNo=1&sortDesc=quality',
             Constance.EPISODE),
            # videos
            ('http://www.tudou.com/list/itemData.action?tagType=1&firstTagId=8&areaCode=&tags=&initials=&hotSingerId=&page=1&sort=2&key=',
             Constance.VIDEO),
            # columns (playlists)
            ('http://www.tudou.com/list/playlistData.action?tagType=2&firstTagId=8&areaCode=&tags=&initials=&hotSingerId=&page=1&sort=2&key=',
             Constance.ITERM),
        )
        for baseUrl, urlType in seeds:
            self.addUrl(baseUrl, websiteId, urlType)
示例#5
0
 def addTencentUrl(self):
     """Seed Tencent documentary list pages (offset-paged, 20 items per page)."""
     baseUrl = 'http://v.qq.com/x/documentarylist/?itype=-1&offset=%d&sort=4'
     # TODO: scrape the last-page count dynamically; regex: 'data-total="(.+)"></div)'
     pageCount = 121
     websiteId = tools.getWebsiteId(Constance.TENCENT)
     # Use the baseUrl template (the original defined it but then re-hardcoded
     # the same literal inside the loop, leaving baseUrl unused).
     for offset in range(0, pageCount * 20, 20):
         url = baseUrl % offset
         log.debug("tencent base url = %s" % url)
         self.addUrl(url, websiteId)
示例#6
0
    def addCCTVUrl(self):
        """Load CCTV seed URLs from a conf file and enqueue them.

        Conf lines look like ``key = url``; the value after ``= `` is
        extracted with tools.getInfo.
        """
        urlsList = []
        # NOTE(review): Windows-style relative path — confirm deployment OS.
        filepath = '..\\urls\\cctv.conf'
        # 'with' guarantees the handle is closed (the original fileinput
        # stream was never closed).
        with open(filepath) as confFile:
            for line in confFile:
                url = tools.getInfo(line, '= (.+?)\n')
                if url:
                    urlsList.extend(url)

        websiteId = tools.getWebsiteId(Constance.CCTV)

        for url in urlsList:
            log.debug("Add url %s to DB" % url)
            self.addUrl(url, websiteId)
示例#7
0
def myTest():
    """Ad-hoc test: extract urls from a saved html dump and feed them to parseLeafUrl."""
    websiteId = tools.getWebsiteId('cctv.com')
    urls = []
    # 'with' closes the handle (the original leaked it); iterate lines directly
    # instead of the manual readline/break loop.
    with open('D:\html.txt', 'r') as f:
        for line in f:
            url = tools.getInfo(line, '"url".+"(.+?)"')
            if url:
                urls.extend(url)

    for url in urls:
        parseLeafUrl(url, websiteId)

#myTest()
示例#8
0
    def addKanKanUrl(self):
        """Seed KanKan documentary listing pages: the full list plus every genre sub-list."""
        def getPageNum(url):
            # Scrape the pager block on *url* and return the last page number.
            html = tools.getHtml(url)
            regex = 'list-pager-v2.*>(.*?)</a><a id="pagenav_next"'
            pageNum = tools.getInfo(html, regex)
            # legacy and/or conditional: falls back to '1' when no pager matched
            pageNum = len(pageNum) == 0 and '1' or pageNum[0]
            # log.debug(pageNum)
            return int(pageNum)

        # all videos
        websiteId = tools.getWebsiteId(Constance.KAN_KAN)
        baseUrl = 'http://movie.kankan.com/type/documentary/'
        log.debug("kankan base url = %s" % baseUrl)
        self.addUrl(baseUrl, websiteId)

        pageCount = getPageNum(baseUrl)
        log.debug("kankan 页数 = %d" % pageCount)
        # page 1 is baseUrl itself (added above); pages 2..N use the 'pageN/' suffix
        for i in range(2, pageCount + 1):
            url = baseUrl + 'page%d/' % i
            log.debug("kankan base url = %s" % url)
            self.addUrl(url, websiteId)

        # by genre
        log.debug('----------------------按类型分类-----------------------')
        regex = '"div_genre">(.*?)</dd>'
        html = tools.getHtml(baseUrl)
        typeBlockUrl = tools.getInfo(html, regex)

        # pull every genre link out of the genre block
        regex = ' href="(.*?)"'
        typeUrls = tools.getInfo(typeBlockUrl, regex)
        for typeBaseUrl in typeUrls:
            log.debug("kankan type base url = %s" % typeBaseUrl)
            pageCount = getPageNum(typeBaseUrl)
            log.debug("kankan 页数 = %d" % pageCount)

            self.addUrl(typeBaseUrl, websiteId)

            # same pagination scheme as the full list: pages 2..N
            for i in range(2, pageCount + 1):
                url = typeBaseUrl + 'page%d/' % i
                log.debug("kankan type base url = %s" % url)
                self.addUrl(url, websiteId)
 def addSoHuUrl(self):
     """Enqueue the Sohu homepage as a seed URL."""
     siteId = tools.getWebsiteId(Constance.SOHU)
     self.addUrl("http://www.sohu.com/", siteId)
示例#10
0
 def addTencentUrl(self):
     """Enqueue the Tencent (QQ) homepage as a seed URL."""
     siteId = tools.getWebsiteId(Constance.TENCENT)
     self.addUrl("http://www.qq.com", siteId)
示例#11
0
    def addYoukuUrl(self):
        """Seed Youku documentary pages: the flat show/video lists, then show
        pages broken down by region+genre and video pages broken down by genre."""
        # get the page count
        def getPageNum(pageUrl):
            # Scrape the pager on *pageUrl*; two regex variants cover the two
            # pager layouts; defaults to 0 (no pages) when neither matches.
            html = tools.getHtml(pageUrl)

            pageNumRegexs = [
                '<ul class="yk-pages">.*>(.*?)</a></li><li class="next"',
                '<ul class="yk-pages">.*>(.*?)</span></li><li class="next"'
            ]
            pageNum = tools.getInfo(html, pageNumRegexs)
            # legacy and/or conditional: '0' when no pager matched
            pageNum = len(pageNum) == 0 and '0' or pageNum[0]
            # log.debug(pageNum)
            return int(pageNum)

        # '_p_1.html' is the page-number slot that gets substituted below
        showUrl = 'http://list.youku.com/category/show/c_84_s_1_d_1_p_1.html'
        videoUrl = 'http://list.youku.com/category/video/c_84_d_1_s_1_p_1.html'
        showPageCount = getPageNum(showUrl)
        videoPageCount = getPageNum(videoUrl)
        websiteId = tools.getWebsiteId(Constance.YOUKU)

        ## all shows
        for i in range(1, showPageCount + 1):
            url = showUrl.replace('_p_1.html', '_p_%d.html' % i)
            # log.debug("youku base url = %s"%url)
            self.addUrl(url, websiteId, 'show')

        ## all videos
        for i in range(1, videoPageCount + 1):
            url = videoUrl.replace('_p_1.html', '_p_%d.html' % i)
            # log.debug("youku base url = %s"%url)
            self.addUrl(url, websiteId, 'video')

        log.debug('----------------------按节目分类-----------------------')
        ## shows, by category
        html = tools.getHtml(showUrl)
        # extract the region urls
        regex = '<label>地区:</label>(.*?)</ul>'
        regionUrlBlock = tools.getInfo(html, regex)
        regex = 'href="(.*?)">'
        regionUrls = tools.getInfo(regionUrlBlock, regex)
        for regionUrl in regionUrls:
            log.debug("地区url = " + regionUrl)
            # extract the genre urls under this region
            html = tools.getHtml(regionUrl)
            regex = "<label>类型:</label>(.*?)</ul>"
            typeUrlBlock = tools.getInfo(html, regex)
            regex = 'href="(.*?)">'
            typeUrls = tools.getInfo(typeUrlBlock, regex)
            # get each genre's page count, build every page url, store to the DB
            for typeUrl in typeUrls:
                log.debug("typeUrl = " + typeUrl)
                pageNum = getPageNum(typeUrl)
                for i in range(1, pageNum + 1):
                    url = typeUrl.replace('.html', '_p_%d.html' % i)
                    self.addUrl(url, websiteId, 'show')

        log.debug('----------------------按视频分类-----------------------')
        ## videos, by category
        regex = "<label>类型:</label>(.*?)</ul>"
        html = tools.getHtml(videoUrl)
        typeUrlBlock = tools.getInfo(html, regex)
        regex = 'href="(.*?)">'
        typeUrls = tools.getInfo(typeUrlBlock, regex)
        # get each genre's page count, build every page url, store to the DB
        for typeUrl in typeUrls:
            log.debug("typeUrl = " + typeUrl)
            pageNum = getPageNum(typeUrl)
            for i in range(1, pageNum + 1):
                url = typeUrl.replace('.html', '_p_%d.html' % i)
                self.addUrl(url, websiteId, 'video')
示例#12
0
 def addIFengUrl(self):
     """Enqueue the iFeng homepage as a seed URL."""
     siteId = tools.getWebsiteId(Constance.IFENG)
     self.addUrl("http://www.ifeng.com/", siteId)
示例#13
0
 def addV1Url(self):
     """Enqueue the V1 JSONP listing endpoint; later pages are added by the v1 parser."""
     # only the first page is seeded here; pagination happens in the v1 module
     seedUrl = 'http://api.v1.cn/v1Enhanced/interfaceForJsonP?callback=jQuery18308286485691806487_1477619118750&obj=cms.getArticle&cid=1147&page=1&nums=24&_=1477619416282'
     siteId = tools.getWebsiteId(Constance.V1)
     self.addUrl(seedUrl, siteId)
示例#14
0
 def addWangYiUrl(self):
     """Enqueue the NetEase homepage as a seed URL."""
     siteId = tools.getWebsiteId(Constance.WANG_YI)
     self.addUrl("http://www.163.com/", siteId)
示例#15
0
 def addPeopleUrl(self):
     """Enqueue the People's Daily homepage as a seed URL."""
     siteId = tools.getWebsiteId(Constance.PEOPLE)
     self.addUrl("http://www.people.com.cn/", siteId)
示例#16
0
 def addCCTVUrl(self):
     """Enqueue the CCTV homepage as a seed URL."""
     siteId = tools.getWebsiteId(Constance.CCTV)
     self.addUrl("http://www.cctv.com/", siteId)
示例#17
0
 def addSinaUrl(self):
     """Enqueue the Sina homepage as a seed URL."""
     siteId = tools.getWebsiteId(Constance.SINA)
     self.addUrl("http://www.sina.com.cn/", siteId)
    def __inputData(self):
        """Refill Collector._urls with a batch of TODO urls from Mongo and mark them DOING.

        Batch size, crawl depth and target website come from the [collector]
        config section; each phase's timing is logged.
        """
        # NOTE(review): size check happens before the lock is taken — looks
        # intentionally best-effort, confirm no stricter bound is required
        if len(Collector._urls) > int(
                tools.getConfValue("collector", "max_size")):
            log.debug("collector 已满 size = %d" % len(Collector._urls))
            return
        mylock.acquire()  # lock

        website = tools.getConfValue("collector", "website")
        depth = int(tools.getConfValue("collector", "depth"))
        urlCount = int(tools.getConfValue("collector", "url_count"))

        beginTime = time.time()

        # Three query variants share the same projection; only the filter differs.
        if DEBUG:
            # debug mode: only urls at the fixed module-level DEPTH
            urlsList = Collector._db.urls.find(
                {
                    "status": Constance.TODO,
                    "depth": DEPTH
                }, {
                    "url": 1,
                    "_id": 0,
                    "depth": 1,
                    "description": 1,
                    "website_id": 1
                }).sort([("depth", 1)]).limit(urlCount)
        elif website == 'all':
            # every website, up to the configured depth
            urlsList = Collector._db.urls.find(
                {
                    "status": Constance.TODO,
                    "depth": {
                        "$lte": depth
                    }
                }, {
                    "url": 1,
                    "_id": 0,
                    "depth": 1,
                    "description": 1,
                    "website_id": 1
                }).sort([("depth", 1)]).limit(urlCount)  # sort: -1 descending, 1 ascending
        else:
            # restricted to a single website
            websiteId = tools.getWebsiteId(website)
            urlsList = Collector._db.urls.find(
                {
                    "status": Constance.TODO,
                    "website_id": websiteId,
                    "depth": {
                        "$lte": depth
                    }
                }, {
                    "url": 1,
                    "_id": 0,
                    "depth": 1,
                    "description": 1,
                    "website_id": 1
                }).sort([("depth", 1)]).limit(urlCount)

        endTime = time.time()

        # materialize the cursor so it can be sized, buffered and re-iterated
        urlsList = list(urlsList)

        log.debug('get url time ' + str(endTime - beginTime) + " size " +
                  str(len(urlsList)))

        beginTime = time.time()
        Collector._urls.extend(urlsList)
        log.debug('put get url time ' + str(time.time() - beginTime) +
                  " size " + str(len(urlsList)))

        # mark the fetched urls as DOING
        beginTime = time.time()
        for url in urlsList:
            Collector._db.urls.update(url,
                                      {'$set': {
                                          'status': Constance.DOING
                                      }})
        endTime = time.time()
        log.debug('update url time ' + str(endTime - beginTime))

        # once everything reports done, stop the collector and export results
        if self.isAllHaveDone():
            self.stop()
            exportData.export()

        mylock.release()
示例#19
0
    def __inputData(self):
        """Refill Collector._urls with a batch of TODO urls from Mongo and mark them DOING.

        Simpler variant: same queries as the timed version, but without the
        timing logs or the all-done/export check.
        """
        # NOTE(review): size check happens before the lock is taken — best-effort
        if len(Collector._urls) > int(
                tools.getConfValue("collector", "max_size")):
            return
        mylock.acquire()  # lock

        website = tools.getConfValue("collector", "website")
        depth = int(tools.getConfValue("collector", "depth"))
        urlCount = int(tools.getConfValue("collector", "url_count"))
        # Three query variants share the same projection; only the filter differs.
        if DEBUG:
            # debug mode: only urls at the fixed module-level DEPTH
            urlsList = Collector._db.urls.find(
                {
                    "status": Constance.TODO,
                    "depth": DEPTH
                }, {
                    "url": 1,
                    "_id": 0,
                    "depth": 1,
                    "description": 1,
                    "website_id": 1
                }).sort([("depth", 1)]).limit(urlCount)
        elif website == 'all':
            # every website, up to the configured depth
            urlsList = Collector._db.urls.find(
                {
                    "status": Constance.TODO,
                    "depth": {
                        "$lte": depth
                    }
                }, {
                    "url": 1,
                    "_id": 0,
                    "depth": 1,
                    "description": 1,
                    "website_id": 1
                }).sort([("depth", 1)]).limit(urlCount)  # sort: -1 descending, 1 ascending
        else:
            # restricted to a single website
            websiteId = tools.getWebsiteId(website)
            urlsList = Collector._db.urls.find(
                {
                    "status": Constance.TODO,
                    "website_id": websiteId,
                    "depth": {
                        "$lte": depth
                    }
                }, {
                    "url": 1,
                    "_id": 0,
                    "depth": 1,
                    "description": 1,
                    "website_id": 1
                }).sort([("depth", 1)]).limit(urlCount)

        # materialize the cursor before buffering and re-iterating it
        urlsList = list(urlsList)
        Collector._urls.extend(urlsList)
        # mark the fetched urls as DOING
        for url in urlsList:
            Collector._db.urls.update(url,
                                      {'$set': {
                                          'status': Constance.DOING
                                      }})

        mylock.release()
    def __inputData(self):
        """Refill the ring-buffer with a batch of TODO urls from Mongo and mark them DOING.

        Buffer-based variant: capacity comes from getMaxWriteSize() and the
        batch is capped by Collector._urlCount; config is read from the
        Collector class attributes instead of per-call getConfValue.
        """
        log.debug('buffer size %d' % self.getMaxReadSize())
        log.debug('buffer can write size = %d' % self.getMaxWriteSize())
        # nothing to do when the buffer has no free slots
        if self.getMaxWriteSize() == 0:
            log.debug("collector 已满 size = %d" % self.getMaxReadSize())
            return

        beginTime = time.time()

        # cap the batch at the buffer's remaining capacity
        urlCount = Collector._urlCount if Collector._urlCount <= self.getMaxWriteSize(
        ) else self.getMaxWriteSize()

        # Three query variants share the same projection; only the filter differs.
        if DEBUG:
            # debug mode: only urls at the fixed module-level DEPTH
            urlsList = Collector._db.urls.find(
                {
                    "status": Constance.TODO,
                    "depth": DEPTH
                }, {
                    "url": 1,
                    "_id": 0,
                    "depth": 1,
                    "description": 1,
                    "website_id": 1
                }).sort([("depth", 1)]).limit(urlCount)
        elif Collector._website == 'all':
            # every website, up to the configured depth
            urlsList = Collector._db.urls.find(
                {
                    "status": Constance.TODO,
                    "depth": {
                        "$lte": Collector._depth
                    }
                }, {
                    "url": 1,
                    "_id": 0,
                    "depth": 1,
                    "description": 1,
                    "website_id": 1
                }).sort([("depth", 1)]).limit(urlCount)  # sort: -1 descending, 1 ascending
        else:
            # restricted to a single website
            websiteId = tools.getWebsiteId(Collector._website)
            urlsList = Collector._db.urls.find(
                {
                    "status": Constance.TODO,
                    "website_id": websiteId,
                    "depth": {
                        "$lte": Collector._depth
                    }
                }, {
                    "url": 1,
                    "_id": 0,
                    "depth": 1,
                    "description": 1,
                    "website_id": 1
                }).sort([("depth", 1)]).limit(urlCount)

        endTime = time.time()

        # materialize the cursor so it can be sized, buffered and re-iterated
        urlsList = list(urlsList)

        log.debug('get url time ' + str(endTime - beginTime) + " size " +
                  str(len(urlsList)))

        # store the urls into the buffer
        self.putUrls(urlsList)

        # mark the fetched urls as DOING
        beginTime = time.time()
        for url in urlsList:
            Collector._db.urls.update(url,
                                      {'$set': {
                                          'status': Constance.DOING
                                      }})
        endTime = time.time()
        log.debug('update url time ' + str(endTime - beginTime))

        # once everything reports done, stop the collector and export results
        if self.isAllHaveDone():
            self.stop()
            exportData.export()
示例#21
0
 def addXinHuaUrl(self):
     """Enqueue the Xinhua homepage as a seed URL."""
     siteId = tools.getWebsiteId(Constance.XIN_HUA)
     self.addUrl("http://www.xinhuanet.com/", siteId)