def parseArticle(self, response):
    """Parse an article page: collect and inline its CSS, extract the main
    content node, and fill those fields on the ContentItem passed via meta.

    Expects response.meta to carry 'source_url' and a partially-filled
    'contentItem'. Returns the ContentItem; 'content_html' is only set when
    the '#imedia-article' container exists.
    """
    source_url = response.meta['source_url']
    body = EncodeUtil.toUnicode(response.body)
    self.infoStr(u'开始解析界面')
    # NOTE(review): title/post_date are read but unused here -- kept for
    # parity with the meta contract; confirm against callers before removing.
    title = response.meta['title']
    post_date = response.meta['post_date']
    contentItem = response.meta['contentItem']
    selector = Selector(text=body)
    # Inline <style> blocks are already CSS text; only <link> hrefs need
    # downloading. (Previously both were fed to CssUtil.downLoad, which
    # cannot fetch raw CSS text as a URL.)
    pageStyles = selector.xpath('//style/text()').extract()
    styleUrls = selector.xpath('//link[@rel="stylesheet"]/@href').extract()
    styleList = []
    for styleUrl in styleUrls:
        # Protocol-relative URLs cannot be downloaded as-is.
        if styleUrl.startswith('//'):
            styleUrl = 'http:' + styleUrl
        # md5 of the URL is the cache key.
        styleUrlHash = EncryptUtil.md5(styleUrl)
        if not self.css.get(styleUrlHash):
            # Not cached yet: download and remember it.
            self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
        styleList.append(self.css[styleUrlHash])
    styleList = pageStyles + styleList
    # Compress, escape quotes/backslashes for later embedding, then strip
    # url() references.
    styles = CssUtil.compressCss(styleList).replace('\'', '"').replace('\\', '\\\\')
    styles = CssUtil.clearUrl(styles)
    contentItem['styles'] = styles
    content_html = selector.xpath('//*[@id="imedia-article"]')
    if len(content_html):
        contentItem['content_html'] = content_html.extract_first('')
    return contentItem
def parseArticle(self, response):
    """Parse a Tencent Tech (腾讯科技) article page into a ContentItem.

    Expects response.meta to carry 'title', 'post_date' and 'source_url'.
    Returns a filled ContentItem (src_source_id=3), or None when the
    article body is missing.
    """
    body = EncodeUtil.toUnicode(response.body)
    # NOTE(review): dead branch -- condition is hard-coded False, only the
    # else-path ever runs. Presumably a placeholder for ban detection.
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        title = response.meta['title']
        post_date = response.meta['post_date']
        source_url = response.meta['source_url']
        self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' + source_url)
        selector = Selector(text=body)
        # Collect linked stylesheets.
        styleUrls = selector.xpath(
            '//link[@rel="stylesheet"]/@href').extract()
        styleList = []
        for styleUrl in styleUrls:
            # Protocol-relative URLs need a scheme before download.
            if styleUrl.startswith('//'):
                styleUrl = 'http:' + styleUrl
            # md5 of the URL is the cache key.
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and remember it.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        # Compress and escape for later embedding.
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
            '\\', '\\\\')
        # Strip url() references inside the styles.
        styles = CssUtil.clearUrl(styles)
        category = selector.xpath(
            '//*[@class="a_catalog"]/a/text()|//*[@class="a_catalog"]/text()'
        ).extract_first('')
        post_user = selector.xpath(
            '//*[@class="a_author"]/text() | //*[@class="where"]/text()| //*[@class="where"]/a/text()'
        ).extract_first('')
        src_ref = selector.xpath(
            '//*[@class="a_source"]/text() | //*[@class="a_source"]/a/text()'
        ).extract_first('')
        a_time = selector.xpath(
            '//*[@class="a_time"]/text() | //*[@class="pubTime"]/text()'
        ).extract_first('')
        # Prefer the on-page timestamp over the one passed in meta.
        if a_time:
            post_date = a_time
        else:
            post_date = (post_date or '')
        # Normalize Chinese date markers (年/月/日) and non-breaking spaces.
        post_date = post_date.replace(u'年', '-').replace(
            u'月', '-').replace(u'日', u' ').replace(u'\xa0', u' ')
        try:
            post_date = time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.strptime(post_date, "%Y-%m-%d %H:%M"))
        except Exception:
            # Unparseable dates are kept as-is.
            pass
        content_html = selector.xpath('//div[@id="Cnt-Main-Article-QQ"]')
        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Drop unwanted child tags (script/style/iframe and the video root).
        # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
        content_items = content_html.xpath(
            '*[not(name(.)="script") and not(name(.)="style") and not(name(.)="iframe") '
            'and not(boolean(@class="rv-root-v2 rv-js-root"))]')
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Build the plain-text version.
        content_txt = []
        for item in content_items:
            # All descendant text of this node.
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap the kept children in a fresh container tag.
        outHtml = """<div id="Cnt-Main-Article-QQ" class="Cnt-Main-Article-QQ" bosszone="content">${++content++}</div>"""
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++content++}', content_items)
        selector = Selector(text=content_html)
        # Collect every image URL in the document and normalize it in place.
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            # Images may live in src (or data-src on other sites).
            image_url_base = img.xpath('@src').extract_first('')
            if image_url_base.startswith('//'):
                image_url = 'http:' + image_url_base
            else:
                image_url = image_url_base
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
                content_html = content_html.replace(
                    image_url_base, image_url)
        # Persist the raw page keyed by md5 of the source URL.
        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        self.saveFile(urlHash, body)
        # Dedup hash for this article.
        hash_code = self.checkDao.getHashCode(source_url)
        # Strip img alt/title text out of the html.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = category
        contentItem['post_user'] = post_user
        contentItem['tags'] = ''
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 3
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = '腾讯科技'
        contentItem['src_ref'] = src_ref
        return contentItem
def parseArticle(self, response):
    """Parse a Jiemian (界面) article page into a ContentItem.

    Expects response.meta to carry 'title', 'post_date', 'source_url',
    'sub_channel', 'src_channel' and 'tags' (a list). Returns a filled
    ContentItem (src_source_id=5), or None when the article body is missing.
    """
    body = EncodeUtil.toUnicode(response.body)
    # NOTE(review): dead branch -- condition is hard-coded False, only the
    # else-path ever runs.
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        title = response.meta['title']
        post_date = response.meta['post_date']
        source_url = response.meta['source_url']
        sub_channel = response.meta['sub_channel']
        src_channel = response.meta['src_channel']
        tags = response.meta['tags']
        self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' + source_url)
        selector = Selector(text=body)
        # Collect linked stylesheets.
        styleUrls = selector.xpath(
            '//link[@rel="stylesheet"]/@href').extract()
        styleList = []
        for styleUrl in styleUrls:
            # Protocol-relative URLs need a scheme before download.
            if styleUrl.startswith('//'):
                styleUrl = 'http:' + styleUrl
            # md5 of the URL is the cache key.
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and remember it.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        # Compress and escape for later embedding.
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
            '\\', '\\\\')
        # Strip url() references inside the styles.
        styles = CssUtil.clearUrl(styles)
        styles = CssUtil.clearBackgroundColor(styles, ['#f5f5f5'])
        post_user = selector.xpath(
            '//div[@class="article-info"]//span[@class="author"]//text()'
        ).extract_first('')
        # Default the source reference to the channel; may be overridden
        # below by the in-article source line.
        src_ref = src_channel
        post_date = selector.xpath(
            '//div[@class="article-info"]//span[@class="date"]//text()'
        ).extract_first('')
        try:
            post_date = time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.strptime(post_date, "%Y/%m/%d %H:%M"))
        except Exception:
            # Unparseable dates are kept as-is.
            pass
        # Page-level tags are appended to those passed via meta.
        tags_ = selector.xpath(
            '//div[@class="article-info"]//*[@class="tags"]//text()'
        ).extract()
        tags = tags + tags_
        tags = ','.join(tags)
        # Page layout sketch (kept from the original author):
        """ article-main article-img article-content p article-source p:来源 p:点击下载“界面新闻”APP 不抓 """
        # Grab the lead image block.
        article_img = selector.xpath(
            '//div[@class="article-main"]/div[@class="article-img"]'
        ).extract_first('')
        # Grab the article body block.
        article_content = selector.xpath(
            '//div[@class="article-main"]/div[@class="article-content"]'
        ).extract_first('')
        if not article_content:
            self.logDao.info(u'文章不存在' + title + ':' + source_url + ':' + post_date)
            return
        contentSelector = Selector(text=article_content)
        # Keep body children except scripts/iframes/styles, app-download
        # links, share widgets and the source footer.
        content_items = contentSelector.xpath(
            '//div[@class="article-content"]/*[not(name(.)="script") and not('
            'name(.)="iframe") and not(name(.)="style") and not(boolean( '
            'contains(a//@href,"?m=app"))) and not(boolean(@class="share-view" '
            'or @class="article-source"))]')
        # Extract the real source line and use it as src_ref.
        contentSource = contentSelector.xpath(
            '//div[@class="article-content"]/div[@class="article-source"]/p/text()'
        ).extract_first('')
        if contentSource:
            contentSource = contentSource.replace(u'来源:', u'')
            src_ref = contentSource
        # Build the plain-text version.
        content_txt = []
        for item in content_items:
            # All descendant text of this node.
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap the kept children in a fresh container tag.
        outHtml = u"""<div class="article-main">${++articleImg++}<div class="article-content" style="font-family: 'Microsoft YaHei', 黑体;">${++content++}</div></div> """
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++articleImg++}',
                                       article_img).replace(
                                           '${++content++}', content_items)
        selector = Selector(text=content_html)
        # Collect every image URL in the document and normalize it in place.
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            # Images may live in src (or data-src on other sites).
            image_url_base = img.xpath('@src').extract_first('')
            if image_url_base.startswith('//'):
                image_url = 'http:' + image_url_base
            else:
                image_url = image_url_base
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
                content_html = content_html.replace(
                    image_url_base, image_url)
        # Persist the raw page keyed by md5 of the source URL.
        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        self.saveFile(urlHash, body)
        # Dedup hash for this article.
        hash_code = self.checkDao.getHashCode(source_url)
        # Strip img alt/title text out of the html.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = sub_channel
        contentItem['post_user'] = post_user
        contentItem['tags'] = tags
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 5
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = src_channel
        contentItem['src_ref'] = src_ref
        return contentItem
def parseDetail2(self, response):
    """Parse a Sina Tech (新浪科技) article page into a ContentItem.

    Expects response.meta to carry 'category', 'title' and 'source_url'.
    Returns a filled ContentItem (src_source_id=2), or None when the
    article body is missing.
    """
    body = EncodeUtil.toUnicode(response.body)
    # NOTE(review): dead branch -- condition is hard-coded False, only the
    # else-path ever runs.
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        category = response.meta['category']
        title = response.meta['title']
        source_url = response.meta['source_url']
        self.logDao.info(u'开始解析文章:' + title + ':' + category + ':' + source_url)
        selector = Selector(text=body)
        # Collect linked stylesheets.
        styleUrls = selector.xpath('//link[@rel="stylesheet"]/@href').extract()
        styleList = []
        for styleUrl in styleUrls:
            # md5 of the URL is the cache key.
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and remember it.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        # Compress, escape for later embedding, strip url() references, and
        # drop overflow rules that would clip the re-rendered article.
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace('\\', '\\\\')
        styles = CssUtil.clearUrl(styles)
        styles = styles.replace('overflow-x:hidden', '').replace('overflow:hidden', '')
        post_date = selector.xpath('//*[@id="pub_date"]/text() | //*[@class="titer"]/text()').extract_first('')
        # Normalize Chinese date markers (年/月/日) and surrounding noise.
        post_date = post_date.replace('\r\n', '').strip(' ').replace(u'年', '-').replace(u'月', '-').replace(u'日', ' ')
        try:
            post_date = time.strftime("%Y-%m-%d %H:%M:%S", time.strptime(post_date, "%Y-%m-%d %H:%M"))
        except Exception:
            # Unparseable dates are kept as-is.
            pass
        src_ref = selector.xpath(
            '//*[@id="media_name"]/a[1]/text() | //*[@class="source"]/a/text() | //*[@class="source"]/text()').extract_first(
            '')
        post_user = selector.xpath('//*[@id="author_ename"]/a/text()').extract_first('')
        tags = selector.xpath('//p[@class="art_keywords"]/a/text()').extract() or []
        tags = ','.join(tags)
        content_html = selector.xpath('//*[@id="artibody"][1]')
        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Drop unwanted child tags.  2017-07-24 19:23
        # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
        content_items = content_html.xpath('*[not(boolean(@class="entHdPic")) and not(name(.)="script")]')
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Build the plain-text version.
        content_txt = []
        for item in content_items:
            # All descendant text of this node.
            # TODO...之后处理 取出标题类型
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            # A short line containing "来源:" is the real source attribution.
            if u'来源:' in allTxt and len(allTxt) < 25:
                if not post_user:
                    # Use the previous src_ref as the author when no author
                    # was found.
                    post_user = src_ref
                src_ref = allTxt.replace(u'来源:', '').strip(u' ')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap the kept children in a fresh container tag.
        outHtml = """<div class="content_wrappr_left"><div class="content"><div class="BSHARE_POP blkContainerSblkCon clearfix blkContainerSblkCon_16" id="artibody">${++content++}</div></div></div>"""
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++content++}', content_items)
        selector = Selector(text=content_html)
        # Collect every image URL in the document.
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            # Images live in src here.
            image_url = img.xpath('@src').extract_first()
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
        # Persist the raw page keyed by md5 of the source URL.
        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        self.saveFile(urlHash, body)
        # Dedup hash for this article.
        hash_code = self.checkDao.getHashCode(source_url)
        # Strip img alt/title text out of the html.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = category
        contentItem['post_user'] = post_user
        contentItem['tags'] = tags
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 2
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = '新浪科技'
        contentItem['src_ref'] = src_ref
        return contentItem
def parseArticle(self, response):
    """Parse an article page (div.m-text layout) into a ContentItem.

    Expects response.meta to carry 'src_channel', 'sub_channel', 'title',
    'post_date' and 'source_url'. Returns a filled ContentItem
    (src_source_id=9), or None when the article body is missing.
    """
    body = EncodeUtil.toUnicode(response.body)
    # NOTE(review): dead branch -- condition is hard-coded False, only the
    # else-path ever runs.
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        title = response.meta['title']
        post_date = response.meta['post_date']
        source_url = response.meta['source_url']
        self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' + source_url)
        selector = Selector(text=body)
        # Inline <style> blocks already present on the page.
        pageStyles = selector.xpath('//style/text()').extract()
        # Collect linked stylesheets.
        styleUrls = selector.xpath('//link[@rel="stylesheet"]/@href').extract()
        styleList = []
        for styleUrl in styleUrls:
            # Protocol-relative URLs need a scheme before download.
            if styleUrl.startswith('//'):
                styleUrl = 'http:' + styleUrl
            # md5 of the URL is the cache key.
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and remember it.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        styleList = pageStyles + styleList
        # Compress and escape for later embedding.
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace('\\', '\\\\')
        # Strip url() references inside the styles.
        styles = CssUtil.clearUrl(styles)
        tags = selector.xpath('//meta[@name="keywords"]/@content').extract_first('')
        src_ref = selector.xpath('//*[@class="m-title f-pr"]/h2[@class="f-ff1 f-fwn f-fs14"]/i//text()').extract_first('')
        content_html = selector.xpath('//div[@class="m-text"]')
        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Drop unwanted child tags (script/style/iframe).
        # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
        content_items = content_html.xpath('*[not(name(.)="script") and not(name(.)="style") and not(name(.)="iframe")]')
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Build the plain-text version.
        content_txt = []
        for item in content_items:
            # All descendant text of this node.
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap the kept children in a fresh container tag.
        outHtml = """<div class="m-text">${++content++}</div>"""
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++content++}', content_items)
        selector = Selector(text=content_html)
        # Collect every image URL in the document and normalize it in place.
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            # Images may live in src (or data-src on other sites).
            image_url_base = img.xpath('@src').extract_first('')
            if image_url_base.startswith('//'):
                image_url = 'http:' + image_url_base
            else:
                image_url = image_url_base
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
                content_html = content_html.replace(image_url_base, image_url)
        # Persist the raw page keyed by md5 of the source URL.
        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        self.saveFile(urlHash, body)
        # Dedup hash for this article.
        hash_code = self.checkDao.getHashCode(source_url)
        # Strip img alt/title text out of the html.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = sub_channel
        contentItem['post_user'] = ''
        contentItem['tags'] = tags
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 9
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = src_channel
        contentItem['src_ref'] = src_ref
        return contentItem
def parseArticle(self, response):
    """Parse an article page (div.art_contextBox layout) into a ContentItem.

    Expects response.meta to carry 'src_channel', 'sub_channel', 'title',
    'post_date' and 'source_url'. Returns a filled ContentItem
    (src_source_id=6), or None when the article body is missing.
    """
    body = EncodeUtil.toUnicode(response.body)
    # NOTE(review): dead branch -- condition is hard-coded False, only the
    # else-path ever runs.
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        title = response.meta['title']
        post_date = response.meta['post_date']
        source_url = response.meta['source_url']
        self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' + source_url)
        selector = Selector(text=body)
        # Collect linked stylesheets.
        styleUrls = selector.xpath(
            '//link[@rel="stylesheet"]/@href').extract()
        styleList = []
        for styleUrl in styleUrls:
            # Protocol-relative URLs need a scheme before download.
            if styleUrl.startswith('//'):
                styleUrl = 'http:' + styleUrl
            # md5 of the URL is the cache key.
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and remember it.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        # Compress and escape for later embedding.
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
            '\\', '\\\\')
        # Strip url() references inside the styles.
        styles = CssUtil.clearUrl(styles)
        post_user = selector.xpath(
            '//div[@class="tip fl"]/text()').extract_first('').replace(
                '\r', '').replace('\t', '').replace('\n', '')
        src_ref = selector.xpath(
            '//div[@class="tip fl"]/a/text()').extract_first('')
        post_date = selector.xpath(
            '//div[@class="tip fl"]/span[@class="pr20"]/text()'
        ).extract_first('')
        if post_date:
            try:
                post_date = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.strptime(post_date, "%Y-%m-%d %H:%M:%S"))
            except Exception as e:
                # str(e) instead of the deprecated e.message (removed in
                # Python 3, often empty in Python 2); date kept as-is.
                self.logDao.warn(str(e))
        content_html = selector.xpath('//div[@class="art_contextBox"]')
        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Drop unwanted child tags (script/style/iframe and the right-
        # aligned fine-print div).
        # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
        content_items = content_html.xpath(
            '*[not(name(.)="script") and not(name(.)="style") and not(name(.)="iframe") and not(boolean(name(.)="div" and @style="text-align:right;font-size:12px"))]'
        )
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Build the plain-text version.
        content_txt = []
        for item in content_items:
            # All descendant text of this node.
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap the kept children in a fresh container tag.
        outHtml = """<div class="art_context"><div class="art_contextBox" style="visibility: visible; height:auto;">${++content++}</div></div> """
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++content++}', content_items)
        selector = Selector(text=content_html)
        # Collect every image URL in the document and normalize it in place.
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            # Images may live in src (or data-src on other sites).
            image_url_base = img.xpath('@src').extract_first('')
            if image_url_base.startswith('//'):
                image_url = 'http:' + image_url_base
            else:
                image_url = image_url_base
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
                content_html = content_html.replace(
                    image_url_base, image_url)
        # Persist the raw page keyed by md5 of the source URL.
        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        self.saveFile(urlHash, body)
        # Dedup hash for this article.
        hash_code = self.checkDao.getHashCode(source_url)
        # Strip img alt/title text out of the html.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = sub_channel
        contentItem['post_user'] = post_user
        contentItem['tags'] = ''
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 6
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = src_channel
        contentItem['src_ref'] = src_ref
        return contentItem
def parseArticle(self, response):
    """Parse an iFeng (凤凰网) article page into a ContentItem.

    Expects response.meta to carry 'src_channel', 'sub_channel',
    'styleUrlDefault' (a list of fallback stylesheet URLs), 'title' and
    'source_url'. Returns a filled ContentItem (src_source_id=8), or None
    when the article body is missing.
    """
    body = EncodeUtil.toUnicode(response.body)
    # NOTE(review): dead branch -- condition is hard-coded False, only the
    # else-path ever runs.
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        styleUrlDefault = response.meta['styleUrlDefault']
        title = response.meta['title']
        source_url = response.meta['source_url']
        self.logDao.info(u'开始解析文章:' + title + ':' + source_url)
        selector = Selector(text=body)
        # Collect linked stylesheets plus the caller-provided defaults.
        styleUrls = selector.xpath(
            '//link[@rel="stylesheet"]/@href').extract()
        styleUrls = styleUrls + styleUrlDefault
        styleList = []
        for styleUrl in styleUrls:
            # Protocol-relative URLs need a scheme before download.
            if styleUrl.startswith('//'):
                styleUrl = 'http:' + styleUrl
            # md5 of the URL is the cache key.
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and remember it.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        # Compress and escape for later embedding.
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
            '\\', '\\\\')
        # Strip url() references inside the styles.
        styles = CssUtil.clearUrl(styles)
        styles = CssUtil.clearBackgroundColor(styles, ['#eaeaea'])
        tags = selector.xpath(
            '//meta[@name="keywords"]/@content').extract_first('')
        # Page-declared category is appended to the meta sub_channel.
        category = selector.xpath(
            '//meta[boolean(contains(@name, "og:category"))]/@content'
        ).extract_first('')
        if category:
            sub_channel = sub_channel + ',' + category
        src_ref = selector.xpath(
            '//span[@class="ss03"]//text()').extract_first('')
        # Fallback: pull the source from the article header paragraph.
        if not src_ref.replace('\n', '').replace(' ', ''):
            src_ref = selector.xpath(
                '//div[@id="artical_sth"]/p/text()').extract()
            src_ref = ''.join(src_ref).replace('\n', '').replace(
                u'来源:', '').replace(' ', '')
        post_date = selector.xpath(
            '//meta[@name="og:time"]/@content').extract_first('')
        # Normalize Chinese date markers (年/月/日) and non-breaking spaces,
        # then try the two observed timestamp formats.
        post_date = post_date.replace(u'年', '-').replace(
            u'月', '-').replace(u'日', u' ').replace(u'\xa0', u' ')
        try:
            post_date = time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.strptime(post_date, "%Y-%m-%d %H:%M:%S"))
        except Exception as e:
            try:
                post_date = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.strptime(post_date, "%Y-%m-%d %H:%M"))
            except Exception as e:
                # NOTE(review): e.message is deprecated (removed in
                # Python 3); consider str(e).
                self.logDao.warn(e.message)
                pass
            pass
        content_html = selector.xpath('//div[@id="main_content"]')
        # The site watermark span is removed from the final html below.
        logoHtml = selector.xpath(
            '//span[@class="ifengLogo"]').extract_first('')
        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Drop unwanted child tags (script/style/iframe).
        # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
        content_items = content_html.xpath(
            '*[not(name(.)="script") and not(name(.)="style") and not(name(.)="iframe")]'
        )
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Build the plain-text version.
        content_txt = []
        for item in content_items:
            # All descendant text of this node.
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap the kept children in a fresh container tag.
        outHtml = """<div id="artical_real" class="js_img_share_area"><div id="main_content" class="js_selection_area" bosszone="content">${++content++}</div></div>"""
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++content++}', content_items)
        content_html = content_html.replace(logoHtml, '')
        selector = Selector(text=content_html)
        # Collect every image URL in the document and normalize it in place.
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            # Images may live in src (or data-src on other sites).
            image_url_base = img.xpath('@src').extract_first('')
            if image_url_base.startswith('//'):
                image_url = 'http:' + image_url_base
            else:
                image_url = image_url_base
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
                content_html = content_html.replace(
                    image_url_base, image_url)
        # Persist the raw page keyed by md5 of the source URL.
        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        self.saveFile(urlHash, body)
        # Dedup hash for this article.
        hash_code = self.checkDao.getHashCode(source_url)
        # Strip img alt/title text out of the html.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = sub_channel
        contentItem['post_user'] = ''
        contentItem['tags'] = tags
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 8
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = src_channel
        contentItem['src_ref'] = src_ref
        return contentItem
def parseArticle(self, response):
    """Parse a WeChat public-account (微信公众号) article into a ContentItem.

    Detects the anti-crawler captcha page / 302 redirect and, in that case,
    records the broken account, forces a new IP, sleeps, and re-opens the
    URL in a browser instead of parsing. Otherwise returns a filled
    ContentItem (src_source_id=1).

    Expects response.meta to carry 'source_url', 'wx_account', and (on the
    parse path) 'title' and 'wx_account_id'.
    """
    body = EncodeUtil.toUnicode(response.body)
    selector = Selector(text=body)
    title = selector.xpath('//title/text()').extract_first('').strip(u' ')
    source_url = response.meta['source_url']
    wx_account = response.meta['wx_account']
    # Captcha page title means the crawler has been rate-limited.
    isN = u"请输入验证码" == title
    if isN or response.status == 302:
        self.logDao.info(u'访问过多被禁止,重新拨号')
        # Remember which account triggered the ban.
        self.brokenAccounts.append(wx_account)
        # Force a new IP; pause ~80s while the line re-dials.
        NetworkUtil.getNewIp()
        TimerUtil.sleep(80)
        NetworkUtil.openWebbrowser(source_url)
    else:
        title = response.meta['title']
        source_url = response.meta['source_url']
        wx_account_id = response.meta['wx_account_id']
        self.logDao.info(u'开始解析文章' + wx_account + ':' + title + ':' + source_url)
        self.logDao.info(u'开始解析文章:' + source_url)
        # Begin parsing the article page.
        post_date = selector.xpath(
            '//*[@id="post-date"]/text()').extract_first('')
        try:
            post_date = time.strftime("%Y-%m-%d %H:%M:%S",
                                      time.strptime(post_date, "%Y-%m-%d"))
        except Exception:
            # Unparseable dates are kept as-is.
            pass
        # WeChat pages inline all CSS in <style> blocks.
        styles = selector.xpath('//style/text()').extract()
        styles = CssUtil.compressCss(styles).replace('\'', '"').replace(
            '\\', '\\\\')
        styles = CssUtil.clearUrl(styles)
        styles = CssUtil.clearBackgroundColor(styles, ['#f3f3f3'])
        post_user = selector.xpath(
            '//*[@id="post-user"]/text()').extract_first('')
        content_html = selector.xpath('//*[@id="js_content"]')
        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Keep all direct children of the content node.
        content_items = content_html.xpath('*')
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Build the plain-text version.
        content_txt = []
        for item in content_items:
            # All descendant text of this node.
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap the kept children in a fresh container tag.
        outHtml = """<div class="rich_media_content " id="js_content">${++content++}</div>"""
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++content++}', content_items)
        selector = Selector(text=content_html)
        # Collect every image URL in the document.
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            # Images may live in src or data-src.
            image_url = img.xpath('@src | @data-src').extract_first('')
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
        self.logDao.info(wx_account + u'得到文章:' + title + ":" + post_date + ':' + post_user)
        self.logDao.info(u'得到文章:' + source_url)
        # Dedup hash keyed on (title, account, type); raw page saved under it.
        hash_code = self.checkDao.getHashCode(title, wx_account, 1)
        self.saveFile(hash_code, body)
        # Strip img alt/title text out of the html.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = ''
        contentItem['post_user'] = post_user
        contentItem['tags'] = ''
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 1
        contentItem['src_account_id'] = wx_account_id
        contentItem['src_channel'] = '微信公众号'
        contentItem['src_ref'] = ''
        contentItem['wx_account'] = wx_account
        return contentItem