def parseUrl(urlInfo):
    """Parse a gb2312-encoded documentary list page and store each video entry.

    urlInfo: dict with at least 'url' and 'website_id' keys.
    On fetch failure the url is marked TODO for a later retry; on success DONE.
    """
    log.debug('处理 %s' % urlInfo)
    sourceUrl = urlInfo['url']
    websiteId = urlInfo['website_id']
    html = tools.getHtml(sourceUrl, 'gb2312')
    if html is None:
        # Fetch failed; mark the url TODO so it will be retried later.
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return
    regex = '<span class="length">(.*?)</span>.*? href="(.*?)">(.*?)</a>.*?<p>(.*?)</p>'
    infos = tools.getInfo(html, regex)
    for info in infos:
        videoLength = info[0]
        videoUrl = info[1]
        videoName = info[2]
        videoReleaseTime = info[3]
        # The title may embed markup such as <span id='video_hl'>...</span>;
        # strip every such tag pair out of the name.
        rubbishs = tools.getInfo(videoName, '<span.*?</span>')
        for rubbish in rubbishs:
            videoName = videoName.replace(rubbish, "")
        log.debug('\n片名 %s\n发布时间 %s\n时长 %s\nurl %s\n' % (videoName, videoReleaseTime, videoLength, videoUrl))
        basePaser.addDocumentary(websiteId, videoName, '', videoUrl, 1, '', videoLength, videoReleaseTime)
    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseVideoInfo(sourceUrl, websiteId):
    """Parse a gbk-encoded video listing page.

    Stores each video's metadata and queues the detail url (VIDEO_ABSTRACT)
    so its abstract can be crawled later.
    """
    log.debug('取视频信息 %s' % sourceUrl)
    html = tools.getHtml(sourceUrl, 'gbk')
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)  # retry later
        return
    regex = 'class="time">(.*?)<.*?href="(http.*?)".*?title="(.*?)".*?播放:(.*?)<.*?发布:(.*?)<'
    infos = tools.getInfo(html, regex)
    for info in infos:
        length = info[0]
        url = info[1]
        videoName = info[2]
        playCount = info[3]
        releaseTime = info[4]
        log.debug('url : %s\n片名 : %s\n发布时间 : %s\n时长 : %s\n播放次数 : %s' % (url, videoName, releaseTime, length, playCount))
        # Queue the detail page for the abstract-parsing pass.
        basePaser.addUrl(url, websiteId, VIDEO_ABSTRACT)
        basePaser.addDocumentary(websiteId, videoName, '', url, '', playCount, length, releaseTime)
    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseItermInfo(sourceUrl, websiteId):
    """Parse a JSON column listing: queue the next page and every item's play url.

    Removed a stray debug print(websiteId) that wrote to stdout.
    """
    log.debug('解析栏目信息 %s' % sourceUrl)
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return
    json = tools.getJson(html)
    jsonArray = json['data']
    # jsonArray is [] once past the last page; only queue a next page before that.
    if jsonArray != []:
        currentPageRegex = r'page=(\d*?)&'
        currentPage = tools.getInfo(sourceUrl, currentPageRegex)[0]
        nextPage = int(currentPage) + 1
        nextPageUrl = sourceUrl.replace('page=%s' % currentPage, 'page=%d' % nextPage)
        log.debug('nextPageUrl = %s' % nextPageUrl)
        # Queue the next listing page.
        basePaser.addUrl(nextPageUrl, websiteId, ITERM_JSON, Constance.ITERM)
    for info in jsonArray:
        title = info['name']
        url = info['playUrl']
        releaseTime = info['createdTime']
        itemsCount = str(info['itemsCount'])
        log.debug('视频:%s 发布时间:%s 集数:%s url: %s' % (title, releaseTime, itemsCount, url))
        basePaser.addUrl(url, websiteId, ITERM_URL, Constance.ITERM)
        basePaser.addDocumentary(websiteId, title, '', url, itemsCount, '', '', releaseTime)
    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseUrl(urlInfo):
    """Parse a list page; handles entries with and without an episode/date marker.

    Fixed the 'nurl' typo in the debug message and removed stray debug prints.
    """
    log.debug('处理 %s' % urlInfo)
    sourceUrl = urlInfo['url']
    websiteId = urlInfo['website_id']
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return
    # Entries that carry an episode/date marker (msk-txt span).
    regex = 'ui-list-ct.*?href=\'(.*?)\'.*?class="msk-txt">(.*?)<.*?class="main-tt">(.*?)</span>'
    infos = tools.getInfo(html, regex)
    for info in infos:
        videoUrl = info[0]
        videoReleaseTime = info[1]
        videoName = info[2]
        log.debug('\n片名 %s\n发布时间 %s\nurl %s\n' % (videoName, videoReleaseTime, videoUrl))
        basePaser.addDocumentary(websiteId, videoName, '', videoUrl, '', '', '', videoReleaseTime)
    # Entries without the marker (name and url only).
    regex = 'ui-list-ct.*?href=\'(.*?)\'.*?class="main-tt">(.*?)</span>'
    infos = tools.getInfo(html, regex)
    for info in infos:
        videoUrl = info[0]
        videoName = info[1]
        log.debug('\n片名 %s\nurl %s\n' % (videoName, videoUrl))
        basePaser.addDocumentary(websiteId, videoName, '', videoUrl, '', '', '', '', '')
    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseShowInfo(sourceUrl, websiteId):
    """Parse a show detail page: name, play count, year, episodes, length, abstract."""
    log.debug('解析节目信息%s' % sourceUrl)
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return
    # Show name (inner html tags stripped)
    regex = "<h1>(.*?)</h1>"
    showName = tools.getInfo(html, regex)
    showName = showName[0] if showName else ''
    showName = tools.replaceStr(showName, '<.*?>')
    log.debug('片名:%s' % showName)
    # Play count
    regex = "播放次数.*?>(.*?)<"
    playCount = tools.getInfo(html, regex)
    playCount = playCount[0] if playCount else ''
    log.debug('播放次数: %s' % playCount)
    # Release year
    regex = '<li>年份.*?>(.*?)<'
    releaseTime = tools.getInfo(html, regex)
    releaseTime = releaseTime[0] if releaseTime else ''
    log.debug('发布时间: %s' % releaseTime)
    # Episode count
    regex = r'更新至\s*?(.*?)<'
    episodeNum = tools.getInfo(html, regex)
    episodeNum = episodeNum[0] if episodeNum else ''
    log.debug('集数: %s' % episodeNum)
    # Running length
    regex = '片长.*?>(.*?)<'
    showLength = tools.getInfo(html, regex)
    showLength = showLength[0] if showLength else ''
    log.debug('片长: %s' % showLength)
    # Abstract — pages come either with an expanded detail section or without.
    regexs = [
        'intro_cont_all.*?<p>(.*?)<span',
        'introduction.*?<p>(.*?)</div>'
    ]
    abstract = tools.getInfo(html, regexs)
    abstract = abstract[0] if abstract else ''
    abstract = tools.replaceStr(abstract, '<.*?>')
    abstract = tools.replaceStr(abstract, '“|”')
    log.debug('简介: %s\n' % abstract)
    basePaser.addDocumentary(websiteId, showName, abstract, sourceUrl, episodeNum, playCount, showLength, releaseTime)
    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseUrl(urlInfo):
    """Parse a JSONP response: unwrap the callback, de-escape, page, store items."""
    log.debug('处理 %s' % urlInfo)
    sourceUrl = urlInfo['url']
    websiteId = urlInfo['website_id']
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return
    # Pull the JSON payload out of the jsonp callback wrapper: callback('...').
    regex = r"\('(.*?)'\)"
    jsonStr = tools.getInfo(html, regex)[0]
    # Remove single escaping backslashes while preserving doubled ones.
    jsonStr = jsonStr.replace('\\\\', '~~~')
    jsonStr = jsonStr.replace('\\', '')
    jsonStr = jsonStr.replace('~~~', '\\')
    json = tools.getJson(jsonStr)
    jsonArray = json['result']['data']['items']
    if jsonArray is not None:
        # Queue the next page url (depth 0).
        currentPageRegex = r'page=(\d*?)&'
        currentPage = tools.getInfo(sourceUrl, currentPageRegex)[0]
        nextPage = int(currentPage) + 1
        nextPageUrl = sourceUrl.replace('page=%s' % currentPage, 'page=%d' % nextPage)
        log.debug('nextPageUrl = %s' % nextPageUrl)
        basePaser.addUrl(nextPageUrl, websiteId, 0)
        # Store every item on the current page.
        for info in jsonArray:
            url = info['url']
            videoName = info['title']
            releaseTime = info['create_time']
            source = info['source']
            abstract = info['intro']
            length = info['duration']
            playtimes = info['pv']
            log.debug(
                'url : %s\n片名 : %s\n发布时间 : %s\n时长 : %s\n播放次数 : %s\n来源 : %s\n简介 : %s'
                % (url, videoName, releaseTime, length, playtimes, source, abstract))
            basePaser.addDocumentary(websiteId, videoName, abstract, url, '', playtimes, length, releaseTime, source)
    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseInfo(sourceUrl, websiteId=''):
    """Parse a JSONP interface response, queue the next page and store each item.

    websiteId: site identifier recorded with queued urls and stored rows.
    It was referenced in the body but never defined (NameError at runtime);
    it is now an optional parameter so single-argument callers keep working.
    """
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return
    # Unwrap the jsonp callback: the payload sits between ('...').
    regex = r"\('(.*?)'\)"
    jsonStr = tools.getInfo(html, regex)[0]
    # Remove single escaping backslashes while preserving doubled ones.
    jsonStr = jsonStr.replace('\\\\', '~~~')
    jsonStr = jsonStr.replace('\\', '')
    jsonStr = jsonStr.replace('~~~', '\\')
    json = tools.getJson(jsonStr)
    jsonArray = json['result']['data']['items']
    if jsonArray is not None:
        # Queue the next page url (depth 0).
        currentPageRegex = r'page=(\d*?)&'
        currentPage = tools.getInfo(sourceUrl, currentPageRegex)[0]
        nextPage = int(currentPage) + 1
        nextPageUrl = sourceUrl.replace('page=%s' % currentPage, 'page=%d' % nextPage)
        log.debug('nextPageUrl = %s' % nextPageUrl)
        basePaser.addUrl(nextPageUrl, websiteId, 0)
        # Store every item on the current page.
        for info in jsonArray:
            url = info['url']
            videoName = info['title']
            releaseTime = info['create_time']
            source = info['source']
            abstract = info['intro']
            length = info['duration']
            playtimes = info['pv']
            log.debug(
                'url : %s\n片名 : %s\n发布时间 : %s\n时长 : %s\n播放次数 : %s\n来源 : %s\n简介 : %s'
                % (url, videoName, releaseTime, length, playtimes, source, abstract))
            basePaser.addDocumentary(websiteId, videoName, abstract, url, '', playtimes, length, releaseTime, source)
    basePaser.updateUrl(sourceUrl, Constance.DONE)

# url = 'http://api.v1.cn/v1Enhanced/interfaceForJsonP?callback=jQuery18308286485691806487_1477619118750&obj=cms.getArticle&cid=1147&page=1&nums=24&_=1477619416282'
# parseInfo(url)
def parseVideoAbstract(sourceUrl, websiteId):
    """Fetch a gbk-encoded video page and store its abstract paragraph."""
    log.debug('取视频 %s' % sourceUrl)
    html = tools.getHtml(sourceUrl, 'gbk')
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return
    regex = 'class="ckl_neir".*<p>(.*?)</p>'
    abstract = tools.getInfo(html, regex)
    # The original `abstract == [] and '' or abstract[0]` raised IndexError on
    # an empty match list ('' is falsy, so the `or` arm still indexed [0]).
    abstract = abstract[0] if abstract else ''
    # NOTE(review): this replaces '"' with itself — likely an html-entity
    # replacement (e.g. &ldquo;) mangled in transcoding; confirm intent.
    abstract = abstract.replace('"', '"')
    log.debug("url :%s\n简介:%s" % (sourceUrl, abstract))
    basePaser.addDocumentary(websiteId, '', abstract, sourceUrl)
    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseVideoAbstract(sourceUrl, websiteId):
    """Open a video page and store its description as the abstract.

    The original fetched the same url twice back to back; the redundant
    network request is removed.
    """
    videoHtml = tools.getHtml(sourceUrl)
    if videoHtml is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return
    regex = 'class="v_desc">(.*?)</p>'
    abstract = tools.getInfo(videoHtml, regex)
    abstract = abstract[0] if abstract else ''
    # abstract = tools.replaceStr(abstract, '<.*?>')
    log.debug('url: %s\n简介: %s\n' % (sourceUrl, abstract))
    basePaser.addDocumentary(websiteId, '', abstract, sourceUrl)
    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseShowInfo(sourceUrl, websiteId):
    """Parse a show detail page: name, episode count, play count and abstract."""
    log.debug('解析节目信息%s' % sourceUrl)
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return
    # Show name
    regexs = '<h1 class="title">.*?class="name">(.+?)</span>'
    showName = tools.getInfo(html, regexs)
    showName = showName[0] if showName else ''
    log.debug('片名:%s' % showName)
    # Episode count
    regexs = r'class="basenotice">.*?([\d-]+).*?</div>'
    episodeNum = tools.getInfo(html, regexs)
    episodeNum = episodeNum[0] if episodeNum else ''
    log.debug('集数: %s' % episodeNum)
    # Play count
    regexs = r"总播放:.*?>([\d,]+).*?</"
    playCount = tools.getInfo(html, regexs)
    playCount = playCount[0] if playCount else ''
    log.debug('播放量: %s' % playCount)
    # Abstract
    regexs = '<div class="detail">(.*?)</div>'
    abstract = tools.getInfo(html, regexs)
    abstract = abstract[0] if abstract else ''
    # Strip html tags found inside the abstract.
    for rubbish in tools.getInfo(abstract, '<.*?>'):
        abstract = abstract.replace(rubbish, "")
    # Strip whitespace characters (spaces, tabs, form feeds, ...).
    for rubbish in tools.getInfo(abstract, r'\s'):
        abstract = abstract.replace(rubbish, "")
    log.debug('简介: %s' % abstract)
    basePaser.addDocumentary(websiteId, showName, abstract, sourceUrl, episodeNum, playCount)
    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseItermAbstract(sourceUrl, websiteId):
    """Open an item page and store its description as the abstract.

    The original fetched the same url twice back to back; the redundant
    network request is removed.
    """
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return
    regex = '<span class="desc">(.*?)</span>'
    abstract = tools.getInfo(html, regex)
    abstract = abstract[0] if abstract else ''
    # abstract = tools.replaceStr(abstract, '<.*?>')
    log.debug('url: %s\n简介: %s\n' % (sourceUrl, abstract))
    basePaser.addDocumentary(websiteId, '', abstract, sourceUrl)
    basePaser.updateUrl(sourceUrl, Constance.DONE)

# sourceUrl = 'http://www.tudou.com/list/playlistData.action?tagType=2&firstTagId=8&areaCode=&tags=&initials=&hotSingerId=&page=1&sort=2&key='
# parseItermInfo(sourceUrl, '')
def parseVideoInfo(sourceUrl, websiteId):
    """Parse a JSON video listing: queue the next page and store every video."""
    log.debug('解析视频信息 %s' % sourceUrl)
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return
    json = tools.getJson(html)
    jsonArray = json['data']
    # jsonArray is [] once past the last page; only queue a next page before that.
    if jsonArray != []:
        currentPageRegex = r'page=(\d*?)&'
        currentPage = tools.getInfo(sourceUrl, currentPageRegex)[0]
        nextPage = int(currentPage) + 1
        nextPageUrl = sourceUrl.replace('page=%s' % currentPage, 'page=%d' % nextPage)
        log.debug('nextPageUrl = %s' % nextPageUrl)
        # Queue the next listing page.
        basePaser.addUrl(nextPageUrl, websiteId, VIDEO_JSON, Constance.VIDEO)
    # Store the videos on the current page.
    for info in jsonArray:
        title = info['title']
        playTimes = str(info['playTimes'])
        pubDate = info['pubDate']
        totalTimeStr = info['totalTimeStr']
        urlCode = info['code']
        url = 'http://www.tudou.com/programs/view/%s/' % urlCode
        log.debug('视频:%s 播放次数:%s 发布时间:%s 总时长:%s url: %s' % (title, playTimes, pubDate, totalTimeStr, url))
        # The abstract is crawled later from this queued detail url.
        basePaser.addUrl(url, websiteId, VIDEO_URL, Constance.VIDEO)
        basePaser.addDocumentary(websiteId, title, '', url, '', playTimes, totalTimeStr, pubDate)
    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseEpisodeInfo(sourceUrl, websiteId):
    """Parse a gbk episode page: name, release time, play count, episodes, abstract."""
    log.debug('解析剧集信息%s' % sourceUrl)
    html = tools.getHtml(sourceUrl, 'gbk')
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return
    # Show name
    regex = 'class="cover_info">.*?title="(.*?)"'
    showName = tools.getInfo(html, regex)
    showName = showName[0] if showName else ''
    log.debug('片名:%s' % showName)
    # Release time
    regex = 'class="first".*?>(.*?)<'
    releaseTime = tools.getInfo(html, regex)
    releaseTime = releaseTime[0] if releaseTime else ''
    log.debug('发布时间: %s' % releaseTime)
    # Play count
    regex = 'class="key_item t_1".*?</span>(.*?)</span>'
    playCount = tools.getInfo(html, regex)
    playCount = playCount[0] if playCount else ''
    log.debug('播放次数: %s' % playCount)
    # Episode count
    regex = 'update:\'(.*?)\''
    episodeNum = tools.getInfo(html, regex)
    episodeNum = episodeNum[0] if episodeNum else ''
    log.debug('集数: %s' % episodeNum)
    # Abstract (inner html tags stripped)
    regex = 'class=\'desc\'>(.*?)</div>'
    abstract = tools.getInfo(html, regex)
    abstract = abstract[0] if abstract else ''
    abstract = tools.replaceStr(abstract, '<.*?>')
    log.debug('简介: %s\n' % abstract)
    basePaser.addDocumentary(websiteId, showName, abstract, sourceUrl, episodeNum, playCount, '', releaseTime)
    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseVideoInfo(sourceUrl, websiteId):
    """Parse a video list page; store each video's name, play count and url."""
    log.debug("解析视频 baserul = %s" % sourceUrl)
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return
    # url + name + play count captured in one pass.
    regexs = r'class="info-list">.*?href="(.+?)".*?title="\s*(.+?)\s*">.*?<li class=" ">\s*(.+?)\s*</li>'
    videosInfo = tools.getInfo(html, regexs)
    for videoInfo in videosInfo:
        videoUrl = videoInfo[0]
        videoName = videoInfo[1]
        videoPlayNum = videoInfo[2]
        log.debug("视频:%s\n播放量:%s\nurl: %s\n" % (videoName, videoPlayNum, videoUrl))
        basePaser.addDocumentary(websiteId, videoName, '', videoUrl, 1, videoPlayNum)
    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseLeafUrl(sourceUrl, websiteId):
    """Render a leaf page with PhantomJS and record its url to a local file.

    NOTE(review): an unconditional return currently disables the detail
    parsing further down; that code is kept intact but unreachable.
    """
    log.debug('解析 LeafNode url = %s begin...' % sourceUrl)
    # Construct the driver outside the try block: if PhantomJS() itself failed,
    # the original raised NameError on driver.quit() in the finally clause.
    driver = webdriver.PhantomJS()
    try:
        driver.get(sourceUrl)
        time.sleep(2)  # give js-rendered content time to load
        html = driver.page_source
    finally:
        driver.quit()
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        log.debug('未能正确获取此URL源码%s !!!' % sourceUrl)
        return
    # Append the url to the capture file; 'with' guarantees the handle closes.
    with open(r'D:\cctv_html.txt', 'a+') as f:
        f.write(sourceUrl)
        f.write('\n')
    return  # NOTE(review): everything below is currently disabled — confirm intent
    log.debug('URL=%s正则匹配详细信息开始。。。' % sourceUrl)
    # Album name
    videoName = ''
    regExs = [r'player_title">(.+?)[\s]*<']
    for reg in regExs:
        videoName = ''.join(tools.getInfo(html, reg))
        if videoName != '':
            break
    log.debug('专辑名称: %s' % videoName)
    # Episode count
    videoNumber = ''
    regExs = [r'专辑总数据.+共([\d]+?)个']
    for reg in regExs:
        videoNumber = ''.join(tools.getInfo(html, reg))
        if videoNumber != '':
            break
    log.debug('集数: %s' % videoNumber)
    # Abstract
    videoDescription = ''
    regExs = ['itemprop="description" content="(.*?)">?']
    for reg in regExs:
        videoDescription = ''.join(tools.getInfo(html, reg))
        if videoDescription != '':
            break
    log.debug('简介: %s' % videoDescription)
    # Total play count
    videoPlayCount = ''
    regExs = ['mod_album_total.+?total_count">总播放量.+?>(.*?)</em>']
    for reg in regExs:
        videoPlayCount = ''.join(tools.getInfo(html, reg))
        if videoPlayCount != '':
            break
    log.debug('总播放量: %s' % videoPlayCount)
    # url
    log.debug('URL = %s' % sourceUrl)
    # Total running time (in seconds)
    videoAllTime = ''
    regExs = ['<span class="figure_info">(.*?)</span>']
    for reg in regExs:
        videoAllTime = ''.join(tools.timeListToString(tools.getInfo(html, reg)))
        if videoAllTime != '':
            break
    log.debug('总片长 : %s' % videoAllTime)
    # Release time
    videoReleaseTime = ''
    regExs = ['meta itemprop="datePublished" content="(.*?)"']
    for reg in regExs:
        videoReleaseTime = ''.join(tools.getInfo(html, reg))
        if videoReleaseTime != '':
            break
    log.debug('发布时间 : %s' % videoReleaseTime)
    # Broadcasting organization — not available on this site
    videoPlayCompany = ''
    log.debug('播出机构暂无。。。')
    # Baidu Baike info — not available
    videoBaiduInfo = ''
    log.debug('百度百科上的信息暂无。。。')
    log.debug('URL=%s正则匹配详细信息结束。。。' % sourceUrl)
    basePaser.addDocumentary(websiteId, videoName, videoDescription, sourceUrl, videoNumber, videoPlayCount, videoAllTime, videoReleaseTime)
    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseLeafUrl(sourceUrl, websiteId):
    """Fetch a leaf page over httplib(2) and record its url to a local file.

    NOTE(review): an unconditional return currently disables the detail
    parsing further down; that code is kept intact but unreachable.
    """
    log.debug('解析 LeafNode url = %s begin...' % sourceUrl)
    h = httplib.Http(timeout=3)
    resp, content = h.request(sourceUrl)
    html = content.decode('utf-8', 'ignore')
    # NOTE(review): decode() never yields None, so this guard is effectively
    # dead — a fetch failure raises from h.request() instead; confirm.
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        log.debug('未能正确获取此URL源码%s !!!' % sourceUrl)
        return
    # Append the url to the capture file; 'with' guarantees the handle closes.
    with open(r'D:\cctv_html.txt', 'a+') as f:
        f.write(sourceUrl)
        f.write('\n')
    return  # NOTE(review): everything below is currently disabled — confirm intent
    log.debug('URL=%s正则匹配详细信息开始。。。' % sourceUrl)
    # Album name — two page layouts
    videoName = ''
    regExs = ['text_mod.+?<h3>(.+?)</h3>', '<td>名.*?称:</td>.+?<a.+?>(.+?)</a></td>']
    for reg in regExs:
        videoName = ''.join(tools.getInfo(html, reg))
        if videoName != '':
            break
    log.debug('专辑名称: %s' % videoName)
    # Episode count — two page layouts
    videoNumber = ''
    regExs = ['<p><span>集.*?数:.*?</span>(.*?)</p>', '<td>集.*?数:</td>.+?<a.+?>(.*?)</a></td>']
    for reg in regExs:
        videoNumber = ''.join(tools.getInfo(html, reg))
        if videoNumber != '':
            break
    log.debug('集数: %s' % videoNumber)
    # Abstract — two page layouts
    videoDescription = ''
    regExs = ['<p id="shuoqi".+?简.*?介:</span>(.+?)<a', '内.*?容.*?简.*?介:</td>.+?brief=\'(.+?)\'']
    for reg in regExs:
        videoDescription = ''.join(tools.getInfo(html, reg))
        if videoDescription != '':
            break
    log.debug('简介: %s' % videoDescription)
    # Fields this site does not expose
    videoPlayCount = ''
    log.debug('总播放量暂无。。。')
    log.debug('URL = %s' % sourceUrl)
    videoAllTime = ''
    log.debug('总片长暂无。。。')
    videoPlayCompany = ''
    log.debug('播出机构暂无。。。')
    videoReleaseTime = ''
    log.debug('发布时间暂无。。。')
    videoBaiduInfo = ''
    log.debug('百度百科上的信息暂无。。。')
    log.debug('URL=%s正则匹配详细信息结束。。。' % sourceUrl)
    basePaser.addDocumentary(websiteId, videoName, videoDescription, sourceUrl, videoNumber)
    basePaser.updateUrl(sourceUrl, Constance.DONE)