def parseVideoInfo(sourceUrl, websiteId):
    log.debug('fetch video info %s' % sourceUrl)
    html = tools.getHtml(sourceUrl, 'gbk')
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    # the Chinese literals in the pattern match labels in the page markup
    regex = 'class="time">(.*?)<.*?href="(http.*?)".*?title="(.*?)".*?播放:(.*?)<.*?发布:(.*?)<'
    infos = tools.getInfo(html, regex)
    for info in infos:
        length = info[0]
        url = info[1]
        videoName = info[2]
        playCount = info[3]
        releaseTime = info[4]

        log.debug('url : %s\ntitle : %s\nrelease time : %s\nduration : %s\nplay count : %s'
                  % (url, videoName, releaseTime, length, playCount))

        basePaser.addUrl(url, websiteId, VIDEO_ABSTRACT)
        basePaser.addDocumentary(websiteId, videoName, '', url, '', playCount, length, releaseTime)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
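# Every parser in this file leans on tools.getInfo, passing either one pattern
# or a list of patterns and reading back a list of capture-group matches. The
# tools module itself is not part of this file; the sketch below is an
# assumption about its contract made for readability, not the repo's actual
# implementation (the third argument seen in parseEpisodeUrl below is likewise
# guessed to mean "keep duplicates").
import re

def getInfo(html, regexs, allowRepeat=False):
    """Return all capture-group matches of the first pattern that hits."""
    if isinstance(regexs, str):
        regexs = [regexs]
    for regex in regexs:
        matches = re.findall(regex, html, re.S)
        if matches:
            # de-duplicate while preserving order unless repeats are wanted
            return matches if allowRepeat else list(dict.fromkeys(matches))
    return []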
def parseItermInfo(sourceUrl, websiteId):
    print(websiteId)
    log.debug('parse column info %s' % sourceUrl)
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    json = tools.getJson(html)
    jsonArray = json['data']  # past the last page there is no data and jsonArray is []

    # add the next page's url (a reusable sketch of this bump follows this function)
    if jsonArray != []:
        currentPageRegex = r'page=(\d*?)&'
        currentPage = tools.getInfo(sourceUrl, currentPageRegex)[0]
        nextPage = int(currentPage) + 1
        nextPageUrl = sourceUrl.replace('page=%s' % currentPage, 'page=%d' % nextPage)
        log.debug('nextPageUrl = %s' % nextPageUrl)
        # add to the urls table with depth 0
        basePaser.addUrl(nextPageUrl, websiteId, ITERM_JSON, Constance.ITERM)

    for info in jsonArray:
        title = info['name']
        url = info['playUrl']
        releaseTime = info['createdTime']
        itemsCount = str(info['itemsCount'])

        log.debug('video: %s  release time: %s  episode count: %s  url: %s'
                  % (title, releaseTime, itemsCount, url))

        basePaser.addUrl(url, websiteId, ITERM_URL, Constance.ITERM)
        basePaser.addDocumentary(websiteId, title, '', url, itemsCount, '', '', releaseTime)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
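# The next-page bump above — regex out the current page number, then
# string-replace it with page + 1 — recurs in parseEpisodeUrl, parseVideoInfo
# and the v1.cn parsers below. A reusable sketch, as a hypothetical helper that
# is not part of this repo, assuming page numbers always appear as
# '<param>=<n>' in the query string:
import re

def buildNextPageUrl(url, param='page'):
    """Return url with the '<param>=<n>' query value bumped by one,
    or None when the parameter is absent."""
    match = re.search(r'%s=(\d+)' % re.escape(param), url)
    if match is None:
        return None
    current = int(match.group(1))
    return url.replace('%s=%d' % (param, current),
                       '%s=%d' % (param, current + 1))

# Usage sketch:
#   buildNextPageUrl('http://example.com/list?page=3&nums=24')          # ...page=4&nums=24
#   buildNextPageUrl('http://example.com/list?pageNo=3&x=1', 'pageNo')  # ...pageNo=4&x=1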
def parseEpisodeUrl(sourceUrl, websiteId):
    log.debug('fetch episode urls %s' % sourceUrl)
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regex = '"playUrl":"(.*?)"'
    urls = tools.getInfo(html, regex, True)
    for url in urls:
        log.debug("episode url: %s" % url)
        basePaser.addUrl(url, websiteId, EPISODE_DESCRIBE, Constance.EPISODE)

    basePaser.updateUrl(sourceUrl, Constance.DONE)

    # add the next page's url
    if urls != []:
        currentPageRegex = r'pageNo=(\d*?)&'
        currentPage = tools.getInfo(sourceUrl, currentPageRegex)[0]
        nextPage = int(currentPage) + 1
        nextPageUrl = sourceUrl.replace('pageNo=%s' % currentPage, 'pageNo=%d' % nextPage)
        log.debug('nextPageUrl = %s' % nextPageUrl)
        # add to the urls table with depth 0
        basePaser.addUrl(nextPageUrl, websiteId, EPISODE_URL, Constance.EPISODE)
def parseUrl(urlInfo):
    log.debug('processing %s' % urlInfo)
    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    html = tools.getHtml(sourceUrl)
    if not DEBUG:
        if html is None:
            basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
            return

    # skip pages that contain no Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chineseWord = tools.getInfo(html, regex)
    if not chineseWord:
        basePaser.updateUrl(sourceUrl, Constance.DONE)
        return

    # collect every url on the current page
    urls = tools.getUrls(html)
    # drop external links, then store the rest
    fitUrl = tools.fitUrl(urls, ['news.cn', 'xinhuanet.com'])
    for url in fitUrl:
        # log.debug('url = ' + url)
        basePaser.addUrl(url, websiteId, depth + 1)

    # extract the article on the current page
    # title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title and title[0] or ''
    title = tools.delHtmlTag(title)

    # body (the Chinese comment markers are literal strings in the page markup)
    regexs = ['<div id="content">(.*?)<div class="clear"></div>',
              '<div class="article">(.*?)<!--文章操作-->',
              '<div id="videoArea">(.*?)<!--文章操作-->',
              '<div class="content">(.*?)<div id="articleEdit">']
    content = tools.getInfo(html, regexs)
    content = content and content[0] or ''
    content = tools.delHtmlTag(content)

    log.debug('''
        depth     = %d
        sourceUrl = %s
        title     = %s
        content   = %s
    ''' % (depth, sourceUrl, title, content))

    if not DEBUG:
        if content and title:
            basePaser.addTextInfo(websiteId, sourceUrl, title, content)

    # mark sourceUrl as done
    basePaser.updateUrl(sourceUrl, Constance.DONE)
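# The '[\u4e00-\u9fa5]+' filter used above (and in the other parseUrl variants
# below) matches the basic CJK Unified Ideographs block, i.e. it asks "does
# this page contain any Chinese text at all". A minimal self-contained
# illustration of the same check, using only the standard library:
import re

def containsChinese(text):
    """True if text contains at least one basic CJK ideograph."""
    return re.search('[\u4e00-\u9fa5]', text) is not None

# containsChinese('hello world')  -> False
# containsChinese('hello 世界')   -> True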
def parseUrl(urlInfo):
    log.debug('processing %s' % urlInfo)
    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    html = tools.getHtml(sourceUrl, 'gb2312')
    if not DEBUG:
        if html is None:
            basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
            return

    # skip pages that contain no Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chineseWord = tools.getInfo(html, regex)
    if not chineseWord:
        basePaser.updateUrl(sourceUrl, Constance.DONE)
        return

    # collect every url on the current page
    urls = tools.getUrls(html)
    # drop external links, then store the rest
    fitUrl = tools.fitUrl(urls, "163.com")
    for url in fitUrl:
        # log.debug('url = ' + url)
        basePaser.addUrl(url, websiteId, depth + 1)

    # extract the article on the current page
    # title
    regexs = '<h1>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title and title[0] or ''
    title = tools.replaceStr(title, '&.*?;')

    # body
    regexs = ['<div id="endText".*?>(.*?)<div class="post_btmshare">',
              '<div class="post_text".*?>(.*?)<div class="post_btmshare">']
    content = tools.getInfo(html, regexs)
    content = content and content[0] or ''
    content = tools.delHtmlTag(content)

    log.debug('''
        depth     = %d
        sourceUrl = %s
        title     = %s
        content   = %s
    ''' % (depth, sourceUrl, title, content))

    if not DEBUG:
        if content and title:
            basePaser.addTextInfo(websiteId, sourceUrl, title, content)

    # mark sourceUrl as done
    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseUrl(urlInfo):
    log.debug('processing %s' % urlInfo)
    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
        return

    # skip pages that contain no Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chineseWord = tools.getInfo(html, regex)
    if not chineseWord:
        basePaser.updateUrl(sourceUrl, Constance.DONE)
        return

    # collect every url on the current page
    urls = tools.getUrls(html)
    # drop external links, then store the rest
    fitUrl = tools.fitUrl(urls, "feng.com")
    for url in fitUrl:
        # log.debug('url = ' + url)
        basePaser.addUrl(url, websiteId, depth + 1)

    # extract the article on the current page
    # title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title and title[0] or ''
    title = tools.delHtmlTag(title)

    # body
    regexs = ['<div id="main_content".*?>(.*?)</div>',
              '<div class="yc_con_l">(.*?)<div class="txt_share_box"',
              '<div id="slideNoInsertDefault"></div>(.*?)</div>']
    content = tools.getInfo(html, regexs)
    content = content and content[0] or ''
    content = tools.delHtmlTag(content)

    log.debug('''
        depth     = %d
        sourceUrl = %s
        title     = %s
        content   = %s
    ''' % (depth, sourceUrl, title, content))

    if content:
        basePaser.addTextInfo(websiteId, sourceUrl, title, content)

    # mark sourceUrl as done
    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseUrl(urlInfo): log.debug('处理 %s'%urlInfo) sourceUrl = urlInfo['url'] depth = urlInfo['depth'] websiteId = urlInfo['website_id'] description = urlInfo['description'] html = tools.getHtml(sourceUrl) if html == None: basePaser.updateUrl(sourceUrl, Constance.EXCEPTION) return # 判断中英文 regex = '[\u4e00-\u9fa5]+' chineseWord = tools.getInfo(html, regex) if not chineseWord: basePaser.updateUrl(sourceUrl, Constance.DONE) return # 取当前页面的全部url urls = tools.getUrls(html) # 过滤掉外链接 添加到数据库 fitUrl = tools.fitUrl(urls, "cctv.com") for url in fitUrl: # log.debug('url = ' + url) basePaser.addUrl(url, websiteId, depth + 1) # 取当前页的文章信息 # 标题 regexs = '<h1><!--repaste.title.begin-->(.*?)<!--repaste.title.end-->' title = tools.getInfo(html, regexs) title = title and title[0] or '' title = tools.delHtmlTag(title) # 内容 regexs = ['<!--repaste.body.begin-->(.*?)<!--repaste.body.end-->'] content = tools.getInfo(html, regexs) content = content and content[0] or '' content = tools.delHtmlTag(content) log.debug(''' depth = %d sourceUrl = %s title = %s content = %s '''%(depth, sourceUrl, title, content)) if content and title: basePaser.addTextInfo(websiteId, sourceUrl, title, content) # 更新sourceUrl为done basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseUrl(urlInfo):
    log.debug('processing %s' % urlInfo)
    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    html = tools.getHtml(sourceUrl, 'gb2312')
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
        return

    # collect every url on the current page
    urls = tools.getUrls(html)
    # drop external links, then store the rest
    fitUrl = tools.fitUrl(urls, "sohu.com")
    for url in fitUrl:
        # log.debug('url = ' + url)
        basePaser.addUrl(url, websiteId, depth + 1)

    # extract the article on the current page
    # title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title and title[0] or ''
    title = tools.delHtmlTag(title)

    # body
    regexs = ['<div class="content clear clearfix".*?>(.*?)</div>',
              '<div class="box_con".*?>(.*?)<div class="edit clearfix"',
              '<div class="show_text">(.*?)</div>',
              '<div class="text">(.*?)<hr class="nonehr">',
              '<div itemprop="articleBody">(.*?)<div style="display:none;">',
              '<article>(.*?)</article>']
    content = tools.getInfo(html, regexs)
    content = content and content[0] or ''
    content = tools.delHtmlTag(content)

    log.debug('''
        depth     = %d
        sourceUrl = %s
        title     = %s
        content   = %s
    ''' % (depth, sourceUrl, title, content))

    if content:
        basePaser.addTextInfo(websiteId, sourceUrl, title, content)

    # mark sourceUrl as done
    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseUrl(urlInfo):
    log.debug('processing %s' % urlInfo)
    sourceUrl = urlInfo['url']
    websiteId = urlInfo['website_id']

    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return
    # print(html)

    # the response is JSONP; pull the escaped JSON payload out of callback('...')
    regex = r"\('(.*?)'\)"
    jsonStr = tools.getInfo(html, regex)[0]

    # strip the extra backslashes, protecting genuine '\\' pairs with a sentinel
    jsonStr = jsonStr.replace('\\\\', '~~~')
    jsonStr = jsonStr.replace('\\', '')
    jsonStr = jsonStr.replace('~~~', '\\')
    # log.debug(u'%s' % jsonStr)

    json = tools.getJson(jsonStr)
    jsonArray = json['result']['data']['items']
    if jsonArray is not None:
        # add the next page's url
        currentPageRegex = r'page=(\d*?)&'
        currentPage = tools.getInfo(sourceUrl, currentPageRegex)[0]
        nextPage = int(currentPage) + 1
        nextPageUrl = sourceUrl.replace('page=%s' % currentPage, 'page=%d' % nextPage)
        log.debug('nextPageUrl = %s' % nextPageUrl)
        # add to the urls table with depth 0
        basePaser.addUrl(nextPageUrl, websiteId, 0)

        # extract the info on the current page
        for info in jsonArray:
            url = info['url']
            videoName = info['title']
            releaseTime = info['create_time']
            source = info['source']
            abstract = info['intro']
            length = info['duration']
            playtimes = info['pv']

            log.debug('url : %s\ntitle : %s\nrelease time : %s\nduration : %s\n'
                      'play count : %s\nsource : %s\nabstract : %s'
                      % (url, videoName, releaseTime, length, playtimes, source, abstract))

            basePaser.addDocumentary(websiteId, videoName, abstract, url, '',
                                     playtimes, length, releaseTime, source)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseInfo(sourceUrl, websiteId):
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return
    # print(html)

    # the response is JSONP; pull the escaped JSON payload out of callback('...')
    regex = r"\('(.*?)'\)"
    jsonStr = tools.getInfo(html, regex)[0]

    # strip the extra backslashes, protecting genuine '\\' pairs with a sentinel
    jsonStr = jsonStr.replace('\\\\', '~~~')
    jsonStr = jsonStr.replace('\\', '')
    jsonStr = jsonStr.replace('~~~', '\\')
    # log.debug(u'%s' % jsonStr)

    json = tools.getJson(jsonStr)
    jsonArray = json['result']['data']['items']
    if jsonArray is not None:
        # add the next page's url
        currentPageRegex = r'page=(\d*?)&'
        currentPage = tools.getInfo(sourceUrl, currentPageRegex)[0]
        nextPage = int(currentPage) + 1
        nextPageUrl = sourceUrl.replace('page=%s' % currentPage, 'page=%d' % nextPage)
        log.debug('nextPageUrl = %s' % nextPageUrl)
        # add to the urls table with depth 0
        basePaser.addUrl(nextPageUrl, websiteId, 0)

        # extract the info on the current page
        for info in jsonArray:
            url = info['url']
            videoName = info['title']
            releaseTime = info['create_time']
            source = info['source']
            abstract = info['intro']
            length = info['duration']
            playtimes = info['pv']

            log.debug('url : %s\ntitle : %s\nrelease time : %s\nduration : %s\n'
                      'play count : %s\nsource : %s\nabstract : %s'
                      % (url, videoName, releaseTime, length, playtimes, source, abstract))

            basePaser.addDocumentary(websiteId, videoName, abstract, url, '',
                                     playtimes, length, releaseTime, source)

    basePaser.updateUrl(sourceUrl, Constance.DONE)

# url = 'http://api.v1.cn/v1Enhanced/interfaceForJsonP?callback=jQuery18308286485691806487_1477619118750&obj=cms.getArticle&cid=1147&page=1&nums=24&_=1477619416282'
# parseInfo(url, websiteId)
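# Both v1.cn parsers above unwrap a JSONP response: the payload arrives as
# callback('{escaped json}'), so the code extracts the quoted string and then
# removes one level of backslash-escaping while preserving genuine '\\' pairs
# via a sentinel. A standalone sketch of the same unwrapping, using only the
# standard library:
import json
import re

def unwrapJsonp(body):
    """Extract and decode the single-quoted, backslash-escaped JSON payload
    from a JSONP response body."""
    payload = re.search(r"\('(.*?)'\)", body).group(1)
    payload = payload.replace('\\\\', '~~~')   # protect real backslashes
    payload = payload.replace('\\', '')        # drop the escaping layer
    payload = payload.replace('~~~', '\\')     # restore real backslashes
    return json.loads(payload)

# For a body like:  jQuery123('{\"items\": []}')
# unwrapJsonp(body) returns {'items': []}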
def parseRootUrl(sourceUrl, websiteId, depth):
    # html = tools.getHtml(sourceUrl)
    # httplib2, not the stdlib httplib: only httplib2 provides the Http class
    h = httplib2.Http()
    resp, content = h.request(sourceUrl)
    html = content.decode('utf-8', 'ignore')

    regexs = 'data-trigger-class="list_item_hover">.+?href="(.+?)"'
    urls = tools.getInfo(html, regexs)
    for url in urls:
        log.debug("saving video url to DB: %s" % url)
        basePaser.addUrl(url, websiteId, depth + 1, '')

    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseEpisodeDescribeUrl(sourceUrl, websiteId):
    log.debug('fetch episode description url ' + sourceUrl)
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regex = 'videoKw.*?href="(.*?)"'
    urls = tools.getInfo(html, regex)
    for url in urls:
        log.debug("episode description url: %s" % url)
        basePaser.addUrl(url, websiteId, EPISODE_INFO, Constance.EPISODE)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseShowDescribeUrl(sourceUrl, websiteId): log.debug('取节目简介 url ' + sourceUrl) html = tools.getHtml(sourceUrl) if html == None: basePaser.updateUrl(sourceUrl, Constance.TODO) return regexs = 'class="desc-link".*?href="(.+?)"' urls = tools.getInfo(html, regexs) for url in urls: log.debug("节目简介url: %s" % url) basePaser.addUrl(url, websiteId, SHOW_INFO, 'show') basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseShowDescribeUrl(sourceUrl, websiteId):
    log.debug('fetch show description url ' + sourceUrl)
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regex = 'movieTitle.*?href="(.*?)"'
    urls = tools.getInfo(html, regex)
    for url in urls:
        log.debug("show detail url: %s" % url)
        basePaser.addUrl(url, websiteId, SHOW_INFO)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseRootUrl(sourceUrl, websiteId, depth):
    log.debug('parse RootNode url = %s begin...' % sourceUrl)
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    reg = '<ul.*?<h3><a href="(.+?)".*?<h3><a href="(.+?)".*?<h3><a href="(.+?)".*?<h3><a href="(.+?)".*?<h3><a href="(.+?)".*?</ul>'
    urlss = tools.getInfo(html, reg)
    for urls in urlss:
        for url in urls:
            log.debug("saving video url to DB: %s" % url)
            basePaser.addUrl(url, websiteId, depth + 1, '')

    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseShowUrl(sourceUrl, websiteId):
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    # the extraction patterns for this site are stored in the database
    regTypeId = basePaser.getRegexTypeId(Constance.VIDEO_URL)
    regexs = basePaser.getRegex(websiteId, regTypeId)
    urls = tools.getInfo(html, regexs)
    for url in urls:
        log.debug("show url: %s" % url)
        basePaser.addUrl(url, websiteId, SHOW_DESCRIBE, 'show')

    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseShowUrl(sourceUrl, websiteId):
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    regex = 'movielist_tt.*?href="(.*?)"'
    urls = tools.getInfo(html, regex)
    for url in urls:
        log.debug("show url: %s" % url)
        if url.endswith('.shtml'):
            basePaser.addUrl(url, websiteId, SHOW_DESCRIBE)
        else:
            basePaser.addUrl(url, websiteId, SHOW_INFO)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseVideoInfo(sourceUrl, websiteId):
    log.debug('parse video info %s' % sourceUrl)
    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    json = tools.getJson(html)
    jsonArray = json['data']  # past the last page there is no data and jsonArray is []

    # add the next page's url
    if jsonArray != []:
        currentPageRegex = r'page=(\d*?)&'
        currentPage = tools.getInfo(sourceUrl, currentPageRegex)[0]
        nextPage = int(currentPage) + 1
        nextPageUrl = sourceUrl.replace('page=%s' % currentPage, 'page=%d' % nextPage)
        log.debug('nextPageUrl = %s' % nextPageUrl)
        # add to the urls table with depth 0
        basePaser.addUrl(nextPageUrl, websiteId, VIDEO_JSON, Constance.VIDEO)

    # parse the info on the current page
    for info in jsonArray:
        title = info['title']
        playTimes = str(info['playTimes'])
        pubDate = info['pubDate']
        totalTimeStr = info['totalTimeStr']
        urlCode = info['code']
        url = 'http://www.tudou.com/programs/view/%s/' % urlCode

        log.debug('video: %s  play count: %s  release time: %s  duration: %s  url: %s'
                  % (title, playTimes, pubDate, totalTimeStr, url))

        # # fetch the abstract from the video page
        # videoHtml = tools.getHtml(url)
        # regex = 'class="v_desc">(.*?)</p>'
        # abstract = tools.getInfo(videoHtml, regex)
        # abstract = len(abstract) > 0 and abstract[0] or ''
        # # abstract = tools.replaceStr(abstract, '<.*?>')
        # log.debug('abstract: %s\n' % abstract)

        basePaser.addUrl(url, websiteId, VIDEO_URL, Constance.VIDEO)
        basePaser.addDocumentary(websiteId, title, '', url, '', playTimes, totalTimeStr, pubDate)

    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseUrl(urlInfo):
    log.debug('processing %s' % urlInfo)
    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    # urlopen sometimes returned mojibake; a plain GET with an explicit
    # encoding fixed the problem
    html = tools.getHtmlByGet(sourceUrl)
    if not DEBUG:
        if html is None:
            if sourceUrl == Constance.SINA:
                basePaser.updateUrl(sourceUrl, Constance.TODO)
            else:
                basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
            return

    # skip pages that contain no Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chineseWord = tools.getInfo(html, regex)
    if not chineseWord:
        basePaser.updateUrl(sourceUrl, Constance.DONE)
        return

    # collect every url on the current page
    urls = tools.getUrls(html)
    # drop external links, then store the rest
    fitUrl = tools.fitUrl(urls, "sina.com.cn")
    for url in fitUrl:
        # log.debug('url = ' + url)
        basePaser.addUrl(url, websiteId, depth + 1)

    # extract the article on the current page
    # title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title and title[0] or ''
    title = tools.delHtmlTag(title)
    if title == '加载中...':  # the page is still loading its content dynamically; retry later
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    # body (the Chinese fragments are literal anchors in the page markup)
    regexs = ['id="artibody".*?>(.*?)<!-- 吸顶导航结束定位标记 -->',
              'id="artibody".*?>(.*?)<div id="left_hzh_ad">',
              '<!-- 正文内容 begin -->(.*?)<!-- 正文内容 end -->',
              'id="articleContent".*?>(.*?)<div class="spacer"></div>']
    content = tools.getInfo(html, regexs)
    content = content and content[0] or ''
    content = tools.delHtmlTag(content)

    log.debug('''
        depth     = %d
        sourceUrl = %s
        title     = %s
        content   = %s
    ''' % (depth, sourceUrl, title, content))

    if not DEBUG:
        if content and title:
            basePaser.addTextInfo(websiteId, sourceUrl, title, content)

    # mark sourceUrl as done
    basePaser.updateUrl(sourceUrl, Constance.DONE)
def parseUrl(urlInfo):
    log.debug('processing %s' % urlInfo)
    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    # urlopen sometimes returned mojibake; a plain GET with an explicit
    # encoding fixed the problem
    html = tools.getHtmlByGet(sourceUrl, '')
    if not DEBUG:
        if html is None:
            if sourceUrl == Constance.TENCENT:
                basePaser.updateUrl(sourceUrl, Constance.TODO)
            else:
                basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
            return

    # skip pages that contain no Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chineseWord = tools.getInfo(html, regex)
    if not chineseWord:
        basePaser.updateUrl(sourceUrl, Constance.DONE)
        return

    # collect every url on the current page
    urls = tools.getUrls(html)
    # drop external links and rule-filtered urls, then store the rest
    fitUrl = tools.fitUrl(urls, "qq.com")
    fitUrl = tools.filterRule(fitUrl, lineList)
    for url in fitUrl:
        # log.debug('url = ' + url)
        basePaser.addUrl(url, websiteId, depth + 1)

    # extract the article on the current page
    # title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title and title[0] or ''
    title = tools.delHtmlTag(title)

    # body ('正文已结束' is the literal end-of-article marker in the page markup)
    regexs = ['bossZone="content">(.+?)正文已结束.+?</span>',
              'id="articleContent">(.*?)<div class="hasc">']
    content = tools.getInfo(html, regexs)
    content = content and content[0] or ''
    content = tools.delHtmlTag(content)

    log.debug('''
        depth     = %d
        sourceUrl = %s
        title     = %s
        content   = %s
    ''' % (depth, sourceUrl, title, content))

    if not DEBUG:
        if content and title:
            basePaser.addTextInfo(websiteId, sourceUrl, title, content)

    # mark sourceUrl as done
    basePaser.updateUrl(sourceUrl, Constance.DONE)
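# The two parsers above note that plain urlopen sometimes produced mojibake and
# that a GET with a forced encoding fixed it. The repo's tools.getHtmlByGet is
# not shown here; a minimal sketch of that approach with the requests library
# (an illustration under that assumption, not the actual helper):
import requests

def getHtmlByGet(url, encoding='utf-8'):
    """GET url and decode with a forced encoding; fall back to the detected
    encoding when none is given; return None on any request failure."""
    try:
        resp = requests.get(url, timeout=10)
        # an empty encoding argument defers to requests' charset detection
        resp.encoding = encoding or resp.apparent_encoding
        return resp.text
    except requests.RequestException:
        return None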