def parseArticle(self, response):
    """Parse an article page: collect/compress its CSS and attach the styles
    plus the '#imedia-article' HTML to the ContentItem from response.meta.

    :param response: scrapy Response carrying 'contentItem' in its meta dict.
    :return: the enriched contentItem (content_html is only set when the
             '#imedia-article' node exists).
    """
    body = EncodeUtil.toUnicode(response.body)
    if False:  # placeholder for an anti-ban check, currently disabled
        self.infoStr(u'访问过多被禁止')
    else:
        self.infoStr(u'开始解析界面')
        contentItem = response.meta['contentItem']
        selector = Selector(text=body)
        # External stylesheet urls and inline <style> blocks are handled
        # separately: inline CSS text must NOT be fed to CssUtil.downLoad
        # as if it were a url (consistent with the sibling spiders).
        styleUrls = selector.xpath('//link[@rel="stylesheet"]/@href').extract()
        pageStyles = selector.xpath('//style/text()').extract()
        styleList = []
        for styleUrl in styleUrls:
            # Normalize protocol-relative urls before downloading.
            if styleUrl.startswith('//'):
                styleUrl = 'http:' + styleUrl
            # Cache downloaded css keyed by the md5 of its url.
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and store.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        styleList = pageStyles + styleList
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
            '\\', '\\\\')
        styles = CssUtil.clearUrl(styles)
        contentItem['styles'] = styles
        content_html = selector.xpath('//*[@id="imedia-article"]')
        if len(content_html):
            contentItem['content_html'] = content_html.extract_first('')
        return contentItem
def process_item(self, item, spider): image_urls = [] for image_url in item['image_urls']: url = image_url.get('url') urlHash = EncryptUtil.md5(url) path = 'full/' + str(urlHash) + '.jpg' detailPath = self.savePath + '/' + path # 创建目录 saveDir = self.savePath + '/full' if not FileUtil.dirIsExist(saveDir): FileUtil.createDir(saveDir) if FileUtil.fileIsExist(detailPath): image_url_new = {'ok': True, 'x': {'url': url, 'path': path}} else: try: fileResponse = requests.get(url, timeout=10) req_code = fileResponse.status_code req_msg = fileResponse.reason if req_code == 200: open(detailPath, 'wb').write(fileResponse.content) image_url_new = { 'ok': True, 'x': { 'url': url, 'path': path } } else: print '下载图片失败', url image_url_new = { 'ok': False, 'x': { 'url': url, } } except Exception, e: print e print '下载图片失败', url image_url_new = { 'ok': False, 'x': { 'url': url, } } image_urls.append(image_url_new) # 空转2s TimerUtil.sleep(2)
def getHashCode(self, source_url):
    """Return the md5 digest of source_url, used as the article hash code."""
    digest = EncryptUtil.md5(source_url)
    return digest
def parseArticle(self, response):
    """Parse a Tencent Tech (腾讯科技) article page into a ContentItem.

    Expects 'title', 'post_date' and 'source_url' in response.meta.
    Returns a populated ContentItem, or None when the article body
    ('#Cnt-Main-Article-QQ') or its usable children are missing.
    """
    body = EncodeUtil.toUnicode(response.body)
    if False:  # placeholder for an anti-ban check, currently disabled
        self.logDao.info(u'访问过多被禁止')
    else:
        title = response.meta['title']
        post_date = response.meta['post_date']
        source_url = response.meta['source_url']
        self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' +
                         source_url)
        selector = Selector(text=body)
        # Collect the external stylesheet urls.
        styleUrls = selector.xpath(
            '//link[@rel="stylesheet"]/@href').extract()
        styleList = []
        for styleUrl in styleUrls:
            if styleUrl.startswith('//'):
                styleUrl = 'http:' + styleUrl
            # md5 of the url serves as the cache key.
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and store.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
            '\\', '\\\\')
        # Rewrite the links inside the styles.
        styles = CssUtil.clearUrl(styles)
        category = selector.xpath(
            '//*[@class="a_catalog"]/a/text()|//*[@class="a_catalog"]/text()'
        ).extract_first('')
        post_user = selector.xpath(
            '//*[@class="a_author"]/text() | //*[@class="where"]/text()| //*[@class="where"]/a/text()'
        ).extract_first('')
        src_ref = selector.xpath(
            '//*[@class="a_source"]/text() | //*[@class="a_source"]/a/text()'
        ).extract_first('')
        a_time = selector.xpath(
            '//*[@class="a_time"]/text() | //*[@class="pubTime"]/text()'
        ).extract_first('')
        # Prefer the publish time found on the page over the meta value.
        if a_time:
            post_date = a_time
        else:
            post_date = (post_date or '')
        # Normalize Chinese date markers (年/月/日) and nbsp to ASCII.
        post_date = post_date.replace(u'年', '-').replace(
            u'月', '-').replace(u'日', u' ').replace(u'\xa0', u' ')
        try:
            post_date = time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.strptime(post_date, "%Y-%m-%d %H:%M"))
        except Exception:
            # Keep the raw string when it doesn't match the format.
            pass
        content_html = selector.xpath('//div[@id="Cnt-Main-Article-QQ"]')
        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Drop unwanted child tags (scripts, styles, iframes, video root).
        # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
        content_items = content_html.xpath(
            '*[not(name(.)="script") and not(name(.)="style") and not(name(.)="iframe") and not(boolean(@class="rv-root-v2 rv-js-root"))]'
        )
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Build the plain-text rendition of the article.
        content_txt = []
        for item in content_items:
            # Concatenate all descendant text of this child node.
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap the kept children in a fresh container tag.
        outHtml = """<div id="Cnt-Main-Article-QQ" class="Cnt-Main-Article-QQ" bosszone="content">${++content++}</div>"""
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++content++}', content_items)
        selector = Selector(text=content_html)
        # Collect every absolute image url; fix protocol-relative src in-place.
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            # The image may live in src or data-src.
            image_url_base = img.xpath('@src').extract_first('')
            if image_url_base.startswith('//'):
                image_url = 'http:' + image_url_base
            else:
                image_url = image_url_base
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
                content_html = content_html.replace(
                    image_url_base, image_url)
        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        # Persist the raw page body keyed by the url hash.
        self.saveFile(urlHash, body)
        # De-duplication hash for this source url.
        hash_code = self.checkDao.getHashCode(source_url)
        # Strip the text of img alt/title attributes out of the html.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = category
        contentItem['post_user'] = post_user
        contentItem['tags'] = ''
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 3
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = '腾讯科技'
        contentItem['src_ref'] = src_ref
        return contentItem
def parseArticle(self, response):
    """Parse a Jiemian (界面) article page into a ContentItem.

    Expects 'title', 'post_date', 'source_url', 'sub_channel',
    'src_channel' and 'tags' in response.meta.  Returns a populated
    ContentItem, or None when the article content block is missing.
    """
    body = EncodeUtil.toUnicode(response.body)
    if False:  # placeholder for an anti-ban check, currently disabled
        self.logDao.info(u'访问过多被禁止')
    else:
        title = response.meta['title']
        post_date = response.meta['post_date']
        source_url = response.meta['source_url']
        sub_channel = response.meta['sub_channel']
        src_channel = response.meta['src_channel']
        tags = response.meta['tags']
        self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' +
                         source_url)
        selector = Selector(text=body)
        # Collect the external stylesheet urls.
        styleUrls = selector.xpath(
            '//link[@rel="stylesheet"]/@href').extract()
        styleList = []
        for styleUrl in styleUrls:
            if styleUrl.startswith('//'):
                styleUrl = 'http:' + styleUrl
            # md5 of the url serves as the cache key.
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and store.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
            '\\', '\\\\')
        # Rewrite the links inside the styles.
        styles = CssUtil.clearUrl(styles)
        styles = CssUtil.clearBackgroundColor(styles, ['#f5f5f5'])
        post_user = selector.xpath(
            '//div[@class="article-info"]//span[@class="author"]//text()'
        ).extract_first('')
        # Fall back to the channel name; refined below from article-source.
        src_ref = src_channel
        post_date = selector.xpath(
            '//div[@class="article-info"]//span[@class="date"]//text()'
        ).extract_first('')
        try:
            post_date = time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.strptime(post_date, "%Y/%m/%d %H:%M"))
        except Exception:
            # Keep the raw string when it doesn't match the format.
            pass
        tags_ = selector.xpath(
            '//div[@class="article-info"]//*[@class="tags"]//text()'
        ).extract()
        tags = tags + tags_
        tags = ','.join(tags)
        """
        article-main article-img article-content p article-source p:来源 p:点击下载“界面新闻”APP 不抓
        """
        # Extract the lead-image block.
        article_img = selector.xpath(
            '//div[@class="article-main"]/div[@class="article-img"]'
        ).extract_first('')
        # Extract the article-content block.
        article_content = selector.xpath(
            '//div[@class="article-main"]/div[@class="article-content"]'
        ).extract_first('')
        if not article_content:
            self.logDao.info(u'文章不存在' + title + ':' + source_url + ':' +
                             post_date)
            return
        contentSelector = Selector(text=article_content)
        # Keep only usable children: no scripts/iframes/styles, no app
        # download links, no share widget and no source footer.
        content_items = contentSelector.xpath(
            '//div[@class="article-content"]/*[not(name(.)="script") and not('
            'name(.)="iframe") and not(name(.)="style") and not(boolean( '
            'contains(a//@href,"?m=app"))) and not(boolean(@class="share-view" '
            'or @class="article-source"))]')
        # Extract the real source line and use it as src_ref.
        contentSource = contentSelector.xpath(
            '//div[@class="article-content"]/div[@class="article-source"]/p/text()'
        ).extract_first('')
        if contentSource:
            contentSource = contentSource.replace(u'来源:', u'')
            src_ref = contentSource
        # Build the plain-text rendition of the article.
        content_txt = []
        for item in content_items:
            # Concatenate all descendant text of this child node.
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap image block and kept children in a fresh container.
        outHtml = u"""<div class="article-main">${++articleImg++}<div class="article-content" style="font-family: 'Microsoft YaHei', 黑体;">${++content++}</div></div> """
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++articleImg++}',
                                       article_img).replace(
            '${++content++}', content_items)
        selector = Selector(text=content_html)
        # Collect every absolute image url; fix protocol-relative src in-place.
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            # The image may live in src or data-src.
            image_url_base = img.xpath('@src').extract_first('')
            if image_url_base.startswith('//'):
                image_url = 'http:' + image_url_base
            else:
                image_url = image_url_base
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
                content_html = content_html.replace(
                    image_url_base, image_url)
        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        # Persist the raw page body keyed by the url hash.
        self.saveFile(urlHash, body)
        # De-duplication hash for this source url.
        hash_code = self.checkDao.getHashCode(source_url)
        # Strip the text of img alt/title attributes out of the html.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = sub_channel
        contentItem['post_user'] = post_user
        contentItem['tags'] = tags
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 5
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = src_channel
        contentItem['src_ref'] = src_ref
        return contentItem
def getHashCode(self, title, wx_account, source_id):
    """Return the md5 over utf-8 title + account + source id (dedup key)."""
    key = title.encode('utf8') + wx_account.encode('utf8') + str(source_id)
    return EncryptUtil.md5(key)
def parseDetail2(self, response):
    """Parse a Sina Tech (新浪科技) article page into a ContentItem.

    Expects 'category', 'title' and 'source_url' in response.meta.
    Returns a populated ContentItem, or None when '#artibody' or its
    usable children are missing.
    """
    body = EncodeUtil.toUnicode(response.body)
    if False:  # placeholder for an anti-ban check, currently disabled
        self.logDao.info(u'访问过多被禁止')
    else:
        category = response.meta['category']
        title = response.meta['title']
        source_url = response.meta['source_url']
        self.logDao.info(u'开始解析文章:' + title + ':' + category + ':' + source_url)
        selector = Selector(text=body)
        # Collect the external stylesheet urls.
        styleUrls = selector.xpath('//link[@rel="stylesheet"]/@href').extract()
        styleList = []
        for styleUrl in styleUrls:
            # md5 of the url serves as the cache key.
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and store.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace('\\', '\\\\')
        styles = CssUtil.clearUrl(styles)
        # Drop overflow rules so the saved page doesn't clip content.
        styles = styles.replace('overflow-x:hidden', '').replace('overflow:hidden', '')
        post_date = selector.xpath('//*[@id="pub_date"]/text() | //*[@class="titer"]/text()').extract_first('')
        # Normalize whitespace and Chinese date markers (年/月/日).
        post_date = post_date.replace('\r\n', '').strip(' ').replace(u'年', '-').replace(u'月', '-').replace(u'日', ' ')
        try:
            post_date = time.strftime("%Y-%m-%d %H:%M:%S", time.strptime(post_date, "%Y-%m-%d %H:%M"))
        except Exception:
            # Keep the raw string when it doesn't match the format.
            pass
        src_ref = selector.xpath(
            '//*[@id="media_name"]/a[1]/text() | //*[@class="source"]/a/text() | //*[@class="source"]/text()').extract_first(
            '')
        post_user = selector.xpath('//*[@id="author_ename"]/a/text()').extract_first('')
        tags = selector.xpath('//p[@class="art_keywords"]/a/text()').extract() or []
        tags = ','.join(tags)
        content_html = selector.xpath('//*[@id="artibody"][1]')
        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Drop unwanted child tags. 2017-07-24 19:23
        # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
        content_items = content_html.xpath('*[not(boolean(@class="entHdPic")) and not(name(.)="script")]')
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Build the plain-text rendition of the article.
        content_txt = []
        for item in content_items:
            # TODO...handle later: extract heading types
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            # A short line containing '来源:' is the real source credit.
            if u'来源:' in allTxt and len(allTxt) < 25:
                if not post_user:
                    # Move the previous src_ref into post_user if empty.
                    post_user = src_ref
                src_ref = allTxt.replace(u'来源:', '').strip(u' ')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap the kept children in a fresh container tag.
        outHtml = """<div class="content_wrappr_left"><div class="content"><div class="BSHARE_POP blkContainerSblkCon clearfix blkContainerSblkCon_16" id="artibody">${++content++}</div></div></div>"""
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++content++}', content_items)
        selector = Selector(text=content_html)
        # Collect every absolute image url from the rebuilt document.
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            # The image url lives in src.
            image_url = img.xpath('@src').extract_first()
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        # Persist the raw page body keyed by the url hash.
        self.saveFile(urlHash, body)
        # De-duplication hash for this source url.
        hash_code = self.checkDao.getHashCode(source_url)
        # Strip the text of img alt/title attributes out of the html.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = category
        contentItem['post_user'] = post_user
        contentItem['tags'] = tags
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 2
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = '新浪科技'
        contentItem['src_ref'] = src_ref
        return contentItem
def parseArticle(self, response):
    """Parse an article page whose body lives in 'div.m-text' into a
    ContentItem (src_source_id 9).

    Expects 'src_channel', 'sub_channel', 'title', 'post_date' and
    'source_url' in response.meta.  Returns a populated ContentItem,
    or None when the body or its usable children are missing.
    """
    body = EncodeUtil.toUnicode(response.body)
    if False:  # placeholder for an anti-ban check, currently disabled
        self.logDao.info(u'访问过多被禁止')
    else:
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        title = response.meta['title']
        post_date = response.meta['post_date']
        source_url = response.meta['source_url']
        self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' + source_url)
        selector = Selector(text=body)
        # Inline <style> blocks present on the page itself.
        pageStyles = selector.xpath('//style/text()').extract()
        # Collect the external stylesheet urls.
        styleUrls = selector.xpath('//link[@rel="stylesheet"]/@href').extract()
        styleList = []
        for styleUrl in styleUrls:
            if styleUrl.startswith('//'):
                styleUrl = 'http:' + styleUrl
            # md5 of the url serves as the cache key.
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and store.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        styleList = pageStyles + styleList
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace('\\', '\\\\')
        # Rewrite the links inside the styles.
        styles = CssUtil.clearUrl(styles)
        tags = selector.xpath('//meta[@name="keywords"]/@content').extract_first('')
        src_ref = selector.xpath('//*[@class="m-title f-pr"]/h2[@class="f-ff1 f-fwn f-fs14"]/i//text()').extract_first('')
        content_html = selector.xpath('//div[@class="m-text"]')
        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Drop unwanted child tags (scripts, styles, iframes).
        # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
        content_items = content_html.xpath('*[not(name(.)="script") and not(name(.)="style") and not(name(.)="iframe")]')
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Build the plain-text rendition of the article.
        content_txt = []
        for item in content_items:
            # Concatenate all descendant text of this child node.
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap the kept children in a fresh container tag.
        outHtml = """<div class="m-text">${++content++}</div>"""
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++content++}', content_items)
        selector = Selector(text=content_html)
        # Collect every absolute image url; fix protocol-relative src in-place.
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            # The image may live in src or data-src.
            image_url_base = img.xpath('@src').extract_first('')
            if image_url_base.startswith('//'):
                image_url = 'http:' + image_url_base
            else:
                image_url = image_url_base
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
                content_html = content_html.replace(image_url_base, image_url)
        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        # Persist the raw page body keyed by the url hash.
        self.saveFile(urlHash, body)
        # De-duplication hash for this source url.
        hash_code = self.checkDao.getHashCode(source_url)
        # Strip the text of img alt/title attributes out of the html.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = sub_channel
        contentItem['post_user'] = ''
        contentItem['tags'] = tags
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 9
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = src_channel
        contentItem['src_ref'] = src_ref
        return contentItem
def parseArticle(self, response):
    """Parse an article page whose body lives in 'div.art_contextBox'
    into a ContentItem (src_source_id 6).

    Expects 'src_channel', 'sub_channel', 'title', 'post_date' and
    'source_url' in response.meta.  Returns a populated ContentItem,
    or None when the body or its usable children are missing.
    """
    body = EncodeUtil.toUnicode(response.body)
    if False:  # placeholder for an anti-ban check, currently disabled
        self.logDao.info(u'访问过多被禁止')
    else:
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        title = response.meta['title']
        post_date = response.meta['post_date']
        source_url = response.meta['source_url']
        self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' + source_url)
        selector = Selector(text=body)
        # Collect the external stylesheet urls.
        styleUrls = selector.xpath(
            '//link[@rel="stylesheet"]/@href').extract()
        styleList = []
        for styleUrl in styleUrls:
            if styleUrl.startswith('//'):
                styleUrl = 'http:' + styleUrl
            # md5 of the url serves as the cache key.
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and store.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
            '\\', '\\\\')
        # Rewrite the links inside the styles.
        styles = CssUtil.clearUrl(styles)
        post_user = selector.xpath(
            '//div[@class="tip fl"]/text()').extract_first('').replace(
            '\r', '').replace('\t', '').replace('\n', '')
        src_ref = selector.xpath(
            '//div[@class="tip fl"]/a/text()').extract_first('')
        post_date = selector.xpath(
            '//div[@class="tip fl"]/span[@class="pr20"]/text()'
        ).extract_first('')
        if post_date:
            try:
                post_date = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.strptime(post_date, "%Y-%m-%d %H:%M:%S"))
            except Exception as e:
                # Keep the raw string when it doesn't match the format.
                self.logDao.warn(e.message)
                pass
        content_html = selector.xpath('//div[@class="art_contextBox"]')
        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Drop unwanted child tags, incl. the right-aligned editor note div.
        # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
        content_items = content_html.xpath(
            '*[not(name(.)="script") and not(name(.)="style") and not(name(.)="iframe") and not(boolean(name(.)="div" and @style="text-align:right;font-size:12px"))]'
        )
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Build the plain-text rendition of the article.
        content_txt = []
        for item in content_items:
            # Concatenate all descendant text of this child node.
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap the kept children in a fresh container tag.
        outHtml = """<div class="art_context"><div class="art_contextBox" style="visibility: visible; height:auto;">${++content++}</div></div> """
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++content++}', content_items)
        selector = Selector(text=content_html)
        # Collect every absolute image url; fix protocol-relative src in-place.
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            # The image may live in src or data-src.
            image_url_base = img.xpath('@src').extract_first('')
            if image_url_base.startswith('//'):
                image_url = 'http:' + image_url_base
            else:
                image_url = image_url_base
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
                content_html = content_html.replace(
                    image_url_base, image_url)
        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        # Persist the raw page body keyed by the url hash.
        self.saveFile(urlHash, body)
        # De-duplication hash for this source url.
        hash_code = self.checkDao.getHashCode(source_url)
        # Strip the text of img alt/title attributes out of the html.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = sub_channel
        contentItem['post_user'] = post_user
        contentItem['tags'] = ''
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 6
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = src_channel
        contentItem['src_ref'] = src_ref
        return contentItem
def parseArticle(self, response):
    """Parse an article page whose body lives in '#main_content' into a
    ContentItem (src_source_id 8).

    Expects 'src_channel', 'sub_channel', 'styleUrlDefault', 'title' and
    'source_url' in response.meta.  Returns a populated ContentItem, or
    None when the body or its usable children are missing.
    """
    body = EncodeUtil.toUnicode(response.body)
    if False:  # placeholder for an anti-ban check, currently disabled
        self.logDao.info(u'访问过多被禁止')
    else:
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        styleUrlDefault = response.meta['styleUrlDefault']
        title = response.meta['title']
        source_url = response.meta['source_url']
        self.logDao.info(u'开始解析文章:' + title + ':' + source_url)
        selector = Selector(text=body)
        # Collect the external stylesheet urls plus defaults from meta.
        styleUrls = selector.xpath(
            '//link[@rel="stylesheet"]/@href').extract()
        styleUrls = styleUrls + styleUrlDefault
        styleList = []
        for styleUrl in styleUrls:
            if styleUrl.startswith('//'):
                styleUrl = 'http:' + styleUrl
            # md5 of the url serves as the cache key.
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and store.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
            '\\', '\\\\')
        # Rewrite the links inside the styles.
        styles = CssUtil.clearUrl(styles)
        styles = CssUtil.clearBackgroundColor(styles, ['#eaeaea'])
        tags = selector.xpath(
            '//meta[@name="keywords"]/@content').extract_first('')
        category = selector.xpath(
            '//meta[boolean(contains(@name, "og:category"))]/@content'
        ).extract_first('')
        # Append the page-declared category to the sub channel.
        if category:
            sub_channel = sub_channel + ',' + category
        src_ref = selector.xpath(
            '//span[@class="ss03"]//text()').extract_first('')
        # Fall back to the article header's source line when ss03 is blank.
        if not src_ref.replace('\n', '').replace(' ', ''):
            src_ref = selector.xpath(
                '//div[@id="artical_sth"]/p/text()').extract()
            src_ref = ''.join(src_ref).replace('\n', '').replace(
                u'来源:', '').replace(' ', '')
        post_date = selector.xpath(
            '//meta[@name="og:time"]/@content').extract_first('')
        # Normalize Chinese date markers (年/月/日) and nbsp to ASCII.
        post_date = post_date.replace(u'年', '-').replace(
            u'月', '-').replace(u'日', u' ').replace(u'\xa0', u' ')
        try:
            post_date = time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.strptime(post_date, "%Y-%m-%d %H:%M:%S"))
        except Exception as e:
            # Second chance: the page may omit seconds.
            try:
                post_date = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.strptime(post_date, "%Y-%m-%d %H:%M"))
            except Exception as e:
                # Keep the raw string when neither format matches.
                self.logDao.warn(e.message)
                pass
            pass
        content_html = selector.xpath('//div[@id="main_content"]')
        # Site watermark logo to be removed from the rebuilt html.
        logoHtml = selector.xpath(
            '//span[@class="ifengLogo"]').extract_first('')
        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Drop unwanted child tags (scripts, styles, iframes).
        # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
        content_items = content_html.xpath(
            '*[not(name(.)="script") and not(name(.)="style") and not(name(.)="iframe")]'
        )
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Build the plain-text rendition of the article.
        content_txt = []
        for item in content_items:
            # Concatenate all descendant text of this child node.
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap the kept children in a fresh container tag.
        outHtml = """<div id="artical_real" class="js_img_share_area"><div id="main_content" class="js_selection_area" bosszone="content">${++content++}</div></div>"""
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++content++}', content_items)
        # Strip the site watermark logo markup.
        content_html = content_html.replace(logoHtml, '')
        selector = Selector(text=content_html)
        # Collect every absolute image url; fix protocol-relative src in-place.
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            # The image may live in src or data-src.
            image_url_base = img.xpath('@src').extract_first('')
            if image_url_base.startswith('//'):
                image_url = 'http:' + image_url_base
            else:
                image_url = image_url_base
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
                content_html = content_html.replace(
                    image_url_base, image_url)
        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        # Persist the raw page body keyed by the url hash.
        self.saveFile(urlHash, body)
        # De-duplication hash for this source url.
        hash_code = self.checkDao.getHashCode(source_url)
        # Strip the text of img alt/title attributes out of the html.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = sub_channel
        contentItem['post_user'] = ''
        contentItem['tags'] = tags
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 8
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = src_channel
        contentItem['src_ref'] = src_ref
        return contentItem