# news.cn / xinhuanet.com parser. All the parsers in this file share the same
# project-internal dependencies (an assumption -- the surrounding project is
# not shown): the tools, basePaser, and Constance modules, a module-level log
# object, and a DEBUG switch.
def parseUrl(urlInfo):
    log.debug('processing %s' % urlInfo)
    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']  # carried in the record; unused here

    html = tools.getHtml(sourceUrl)
    if not DEBUG:
        if html is None:
            basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
            return

    # Pages with no Chinese text are not articles we want; mark them done
    regex = '[\u4e00-\u9fa5]+'
    chineseWord = tools.getInfo(html, regex)
    if not chineseWord:
        basePaser.updateUrl(sourceUrl, Constance.DONE)
        return

    # Extract every url on the current page
    urls = tools.getUrls(html)
    # Drop external links, then queue the rest in the database
    fitUrl = tools.fitUrl(urls, ['news.cn', 'xinhuanet.com'])
    for url in fitUrl:
        # log.debug('url = ' + url)
        basePaser.addUrl(url, websiteId, depth + 1)

    # Extract the article from the current page
    # Title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title[0] if title else ''
    title = tools.delHtmlTag(title)

    # Body: try the site's known content wrappers in order
    regexs = ['<div id="content">(.*?)<div class="clear"></div>',
              '<div class="article">(.*?)<!--文章操作-->',
              '<div id="videoArea">(.*?)<!--文章操作-->',
              '<div class="content">(.*?)<div id="articleEdit">'
              ]
    content = tools.getInfo(html, regexs)
    content = content[0] if content else ''
    content = tools.delHtmlTag(content)

    log.debug('''
        depth     = %d
        sourceUrl = %s
        title     = %s
        content   = %s
    ''' % (depth, sourceUrl, title, content))

    if not DEBUG:
        if content and title:
            basePaser.addTextInfo(websiteId, sourceUrl, title, content)
        # Mark sourceUrl as done
        basePaser.updateUrl(sourceUrl, Constance.DONE)
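# A minimal usage sketch (an assumption -- the scheduler that builds these
# records is not shown): the dict keys below are exactly the ones every
# parseUrl in this file reads; the values are hypothetical.
if __name__ == '__main__':
    parseUrl({
        'url': 'http://www.xinhuanet.com/politics/',  # hypothetical seed url
        'depth': 0,
        'website_id': 1,             # hypothetical site id
        'description': 'xinhuanet',
    })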
# 163.com (NetEase) parser; pages are gb2312-encoded, so the encoding is
# passed to getHtml explicitly
def parseUrl(urlInfo):
    log.debug('processing %s' % urlInfo)
    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    html = tools.getHtml(sourceUrl, 'gb2312')
    if not DEBUG:
        if html is None:
            basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
            return

    # Pages with no Chinese text are not articles we want; mark them done
    regex = '[\u4e00-\u9fa5]+'
    chineseWord = tools.getInfo(html, regex)
    if not chineseWord:
        basePaser.updateUrl(sourceUrl, Constance.DONE)
        return

    # Extract every url on the current page
    urls = tools.getUrls(html)
    # Drop external links, then queue the rest in the database
    fitUrl = tools.fitUrl(urls, "163.com")
    for url in fitUrl:
        # log.debug('url = ' + url)
        basePaser.addUrl(url, websiteId, depth + 1)

    # Extract the article from the current page
    # Title; strip HTML entities such as &nbsp;
    regexs = '<h1>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title[0] if title else ''
    title = tools.replaceStr(title, '&.*?;')

    # Body
    regexs = [
        '<div id="endText".*?>(.*?)<div class="post_btmshare">',
        '<div class="post_text".*?>(.*?)<div class="post_btmshare">'
    ]
    content = tools.getInfo(html, regexs)
    content = content[0] if content else ''
    content = tools.delHtmlTag(content)

    log.debug('''
        depth     = %d
        sourceUrl = %s
        title     = %s
        content   = %s
    ''' % (depth, sourceUrl, title, content))

    if not DEBUG:
        if content and title:
            basePaser.addTextInfo(websiteId, sourceUrl, title, content)
        # Mark sourceUrl as done
        basePaser.updateUrl(sourceUrl, Constance.DONE)
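# A standalone check of the Chinese-text filter the parsers rely on (a sketch
# using only the standard library, independent of the project's tools module):
import re

def hasChinese(text):
    # One or more CJK unified ideographs -- the same range the parsers match
    return re.search('[\u4e00-\u9fa5]+', text) is not None

# hasChinese('breaking news')  -> False: the page is marked DONE and skipped
# hasChinese('新闻')           -> True:  the page is parsed as an article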
# feng.com (ifeng) parser
def parseUrl(urlInfo):
    log.debug('processing %s' % urlInfo)
    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
        return

    # Check whether the page is Chinese; skip English-only pages
    regex = '[\u4e00-\u9fa5]+'
    chineseWord = tools.getInfo(html, regex)
    if not chineseWord:
        basePaser.updateUrl(sourceUrl, Constance.DONE)
        return

    # Extract every url on the current page
    urls = tools.getUrls(html)
    # Drop external links, then queue the rest in the database
    fitUrl = tools.fitUrl(urls, "feng.com")
    for url in fitUrl:
        # log.debug('url = ' + url)
        basePaser.addUrl(url, websiteId, depth + 1)

    # Extract the article from the current page
    # Title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title[0] if title else ''
    title = tools.delHtmlTag(title)

    # Body
    regexs = ['<div id="main_content".*?>(.*?)</div>',
              '<div class="yc_con_l">(.*?)<div class="txt_share_box"',
              '<div id="slideNoInsertDefault"></div>(.*?)</div>']
    content = tools.getInfo(html, regexs)
    content = content[0] if content else ''
    content = tools.delHtmlTag(content)

    log.debug('''
        depth     = %d
        sourceUrl = %s
        title     = %s
        content   = %s
    ''' % (depth, sourceUrl, title, content))

    if content:
        basePaser.addTextInfo(websiteId, sourceUrl, title, content)
    # Mark sourceUrl as done
    basePaser.updateUrl(sourceUrl, Constance.DONE)
# cctv.com parser; articles are delimited by the site's repaste comment markers
def parseUrl(urlInfo):
    log.debug('processing %s' % urlInfo)
    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    html = tools.getHtml(sourceUrl)
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
        return

    # Check whether the page is Chinese; skip English-only pages
    regex = '[\u4e00-\u9fa5]+'
    chineseWord = tools.getInfo(html, regex)
    if not chineseWord:
        basePaser.updateUrl(sourceUrl, Constance.DONE)
        return

    # Extract every url on the current page
    urls = tools.getUrls(html)
    # Drop external links, then queue the rest in the database
    fitUrl = tools.fitUrl(urls, "cctv.com")
    for url in fitUrl:
        # log.debug('url = ' + url)
        basePaser.addUrl(url, websiteId, depth + 1)

    # Extract the article from the current page
    # Title
    regexs = '<h1><!--repaste.title.begin-->(.*?)<!--repaste.title.end-->'
    title = tools.getInfo(html, regexs)
    title = title[0] if title else ''
    title = tools.delHtmlTag(title)

    # Body
    regexs = ['<!--repaste.body.begin-->(.*?)<!--repaste.body.end-->']
    content = tools.getInfo(html, regexs)
    content = content[0] if content else ''
    content = tools.delHtmlTag(content)

    log.debug('''
        depth     = %d
        sourceUrl = %s
        title     = %s
        content   = %s
    ''' % (depth, sourceUrl, title, content))

    if content and title:
        basePaser.addTextInfo(websiteId, sourceUrl, title, content)
    # Mark sourceUrl as done
    basePaser.updateUrl(sourceUrl, Constance.DONE)
# sohu.com parser; pages are gb2312-encoded, so the encoding is passed explicitly
def parseUrl(urlInfo):
    log.debug('processing %s' % urlInfo)
    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    html = tools.getHtml(sourceUrl, 'gb2312')
    if html is None:
        basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
        return

    # Extract every url on the current page
    urls = tools.getUrls(html)
    # Drop external links, then queue the rest in the database
    fitUrl = tools.fitUrl(urls, "sohu.com")
    for url in fitUrl:
        # log.debug('url = ' + url)
        basePaser.addUrl(url, websiteId, depth + 1)

    # Extract the article from the current page
    # Title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title[0] if title else ''
    title = tools.delHtmlTag(title)

    # Body: sohu uses several page templates, so try each known wrapper in turn
    regexs = [
        '<div class="content clear clearfix".*?>(.*?)</div>',
        '<div class="box_con".*?>(.*?)<div class="edit clearfix"',
        '<div class="show_text">(.*?)</div>',
        '<div class="text">(.*?)<hr class="nonehr">',
        '<div itemprop="articleBody">(.*?)<div style="display:none;">',
        '<article>(.*?)</article>'
    ]
    content = tools.getInfo(html, regexs)
    content = content[0] if content else ''
    content = tools.delHtmlTag(content)

    log.debug('''
        depth     = %d
        sourceUrl = %s
        title     = %s
        content   = %s
    ''' % (depth, sourceUrl, title, content))

    if content:
        basePaser.addTextInfo(websiteId, sourceUrl, title, content)
    # Mark sourceUrl as done
    basePaser.updateUrl(sourceUrl, Constance.DONE)
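# tools.getInfo is project-internal. A plausible minimal equivalent, inferred
# from its call sites (it accepts one regex or a list, returns a list of
# matches, and -- judging by the ordered regex lists above -- tries patterns
# until one hits); this is an assumption, not the project's implementation:
import re

def getInfoSketch(html, regexs):
    if isinstance(regexs, str):
        regexs = [regexs]
    for regex in regexs:
        # re.S lets '.' span newlines, which article bodies always do
        result = re.findall(regex, html, re.S)
        if result:
            return result
    return []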
# qq.com (Tencent) parser
def parseUrl(urlInfo):
    log.debug('processing %s' % urlInfo)
    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    # urlopen sometimes returned garbled pages; fetching with a GET request
    # and setting the encoding explicitly fixed it
    html = tools.getHtmlByGet(sourceUrl, '')
    if not DEBUG:
        if html is None:
            if sourceUrl == Constance.TENCENT:
                # Never abandon the seed url; requeue it for retry
                basePaser.updateUrl(sourceUrl, Constance.TODO)
            else:
                basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
            return

    # Pages with no Chinese text are not articles we want; mark them done
    regex = '[\u4e00-\u9fa5]+'
    chineseWord = tools.getInfo(html, regex)
    if not chineseWord:
        basePaser.updateUrl(sourceUrl, Constance.DONE)
        return

    # Extract every url on the current page
    urls = tools.getUrls(html)
    # Drop external links, then queue the rest in the database;
    # lineList is assumed to be a module-level url filter rule list loaded elsewhere
    fitUrl = tools.fitUrl(urls, "qq.com")
    fitUrl = tools.filterRule(fitUrl, lineList)
    for url in fitUrl:
        # log.debug('url = ' + url)
        basePaser.addUrl(url, websiteId, depth + 1)

    # Extract the article from the current page
    # Title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title[0] if title else ''
    title = tools.delHtmlTag(title)

    # Body
    regexs = [
        'bossZone="content">(.+?)正文已结束.+?</span>',
        'id="articleContent">(.*?)<div class="hasc">'
    ]
    content = tools.getInfo(html, regexs)
    content = content[0] if content else ''
    content = tools.delHtmlTag(content)

    log.debug('''
        depth     = %d
        sourceUrl = %s
        title     = %s
        content   = %s
    ''' % (depth, sourceUrl, title, content))

    if not DEBUG:
        if content and title:
            basePaser.addTextInfo(websiteId, sourceUrl, title, content)
        # Mark sourceUrl as done
        basePaser.updateUrl(sourceUrl, Constance.DONE)
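# tools.filterRule is also project-internal; from its use above it appears to
# drop urls matching any pattern in lineList. A hedged sketch of that behavior,
# not the actual implementation:
import re

def filterRuleSketch(urls, rules):
    return [url for url in urls
            if not any(re.search(rule, url) for rule in rules)]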
# sina.com.cn parser
def parseUrl(urlInfo):
    log.debug('processing %s' % urlInfo)
    sourceUrl = urlInfo['url']
    depth = urlInfo['depth']
    websiteId = urlInfo['website_id']
    description = urlInfo['description']

    # urlopen sometimes returned garbled pages; fetching with a GET request
    # and setting the encoding explicitly fixed it
    html = tools.getHtmlByGet(sourceUrl)
    if not DEBUG:
        if html is None:
            if sourceUrl == Constance.SINA:
                # Never abandon the seed url; requeue it for retry
                basePaser.updateUrl(sourceUrl, Constance.TODO)
            else:
                basePaser.updateUrl(sourceUrl, Constance.EXCEPTION)
            return

    # Pages with no Chinese text are not articles we want; mark them done
    regex = '[\u4e00-\u9fa5]+'
    chineseWord = tools.getInfo(html, regex)
    if not chineseWord:
        basePaser.updateUrl(sourceUrl, Constance.DONE)
        return

    # Extract every url on the current page
    urls = tools.getUrls(html)
    # Drop external links, then queue the rest in the database
    fitUrl = tools.fitUrl(urls, "sina.com.cn")
    for url in fitUrl:
        # log.debug('url = ' + url)
        basePaser.addUrl(url, websiteId, depth + 1)

    # Extract the article from the current page
    # Title
    regexs = '<h1.*?>(.*?)</h1>'
    title = tools.getInfo(html, regexs)
    title = title[0] if title else ''
    title = tools.delHtmlTag(title)
    if title == '加载中...':
        # The page is rendered client-side and not ready yet; requeue
        # sourceUrl as TODO so it is fetched again later
        basePaser.updateUrl(sourceUrl, Constance.TODO)
        return

    # Body
    regexs = [
        'id="artibody".*?>(.*?)<!-- 吸顶导航结束定位标记 -->',
        'id="artibody".*?>(.*?)<div id="left_hzh_ad">',
        '<!-- 正文内容 begin -->(.*?)<!-- 正文内容 end -->',
        'id="articleContent".*?>(.*?)<div class="spacer"></div>'
    ]
    content = tools.getInfo(html, regexs)
    content = content[0] if content else ''
    content = tools.delHtmlTag(content)

    log.debug('''
        depth     = %d
        sourceUrl = %s
        title     = %s
        content   = %s
    ''' % (depth, sourceUrl, title, content))

    if not DEBUG:
        if content and title:
            basePaser.addTextInfo(websiteId, sourceUrl, title, content)
        # Mark sourceUrl as done
        basePaser.updateUrl(sourceUrl, Constance.DONE)
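# tools.delHtmlTag is used by every parser to turn a captured HTML fragment
# into plain text. A minimal stand-in with the behavior its call sites imply
# (an assumption -- the project's version may strip more aggressively):
import re

def delHtmlTagSketch(html):
    html = re.sub(r'<(script|style)[^>]*>.*?</\1>', '', html, flags=re.S | re.I)
    html = re.sub(r'<[^>]+>', '', html)           # drop remaining tags
    return re.sub(r'\s+', ' ', html).strip()      # collapse whitespace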