def parseArticleList(self, response):
    """Parse a NetEase (wangyi) ``var data={...};`` JSONP article listing and
    yield a detail request for every article not yet stored.

    Bug fixes vs. the original:
    - ``body.lstrip('var data=')`` stripped the character *set*
      {'v','a','r',' ','d','t','='} and could eat the start of the payload;
      the exact prefix is now removed instead.
    - ``sub_channel`` was reassigned inside the loop, so each article's
      category accumulated onto all later articles; the label is now built
      per article.
    """
    body = EncodeUtil.toUnicode(response.body)
    if False:  # ban-detection placeholder (currently disabled)
        self.logDao.info(u'访问过多被禁止')
    else:
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        self.logDao.info(u'开始解析列表')
        # Strip the exact JS assignment wrapper before decoding.
        prefix = 'var data='
        if body.startswith(prefix):
            body = body[len(prefix):]
        body = body.rstrip(';')
        # Decode the (possibly lax) JSON payload.
        jsonStr = demjson.decode(body) or {}
        articles = jsonStr.get('news') or []
        categoryList = jsonStr.get('category') or []
        for article_ins in articles:
            for article in article_ins:
                source_url = article.get('l', '')
                title = article.get('t', '')
                timeStr = article.get('p', '')
                # Skip articles that were already crawled.
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在' + title + timeStr + source_url)
                    continue
                categoryIndex = article.get('c')
                category = ''
                if 0 <= categoryIndex < len(categoryList):
                    category = categoryList[categoryIndex].get('n')
                # Build the per-article channel label without mutating the
                # shared sub_channel.
                item_sub_channel = sub_channel
                if category:
                    item_sub_channel = EncodeUtil.toUnicode(sub_channel) + u',' + EncodeUtil.toUnicode(category)
                post_date = article.get('p')
                self.logDao.info(u'抓取文章' + title + ':' + post_date + ':' + source_url)
                yield scrapy.Request(url=source_url,
                                     meta={
                                         'request_type': 'wangyi_detail',
                                         'title': title,
                                         'post_date': post_date,
                                         'source_url': source_url,
                                         'src_channel': src_channel,
                                         'sub_channel': item_sub_channel
                                     },
                                     callback=self.parseArticle)
def parseArticle(self, response):
    """Fill the ContentItem passed in ``response.meta['contentItem']`` with
    the page's styles and, when present, the '#imedia-article' content HTML,
    then return it.
    """
    source_url = response.meta['source_url']
    body = EncodeUtil.toUnicode(response.body)
    if False:  # ban-detection placeholder (currently disabled)
        self.infoStr(u'访问过多被禁止')
    else:
        self.infoStr(u'开始解析界面')
        title = response.meta['title']
        post_date = response.meta['post_date']
        source_url = response.meta['source_url']
        contentItem = response.meta['contentItem']
        selector = Selector(text=body)
        # Collect page styles: external sheet URLs and inline <style> text.
        # NOTE(review): inline <style> text is also fed to CssUtil.downLoad
        # below — confirm that helper tolerates non-URL input.
        styleUrls = selector.xpath(
            '//link[@rel="stylesheet"]/@href | //style/text()').extract()
        styleList = []
        for styleUrl in styleUrls:
            # Use the URL hash as the cache key.
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and keep it.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
            '\\', '\\\\')
        styles = CssUtil.clearUrl(styles)
        contentItem['styles'] = styles
        content_html = selector.xpath('//*[@id="imedia-article"]')
        if len(content_html):
            contentItem['content_html'] = content_html.extract_first('')
        return contentItem
def parseList2(self, response):
    """Parse a Sina science/tech JSON listing and request every article
    that has not been crawled yet."""
    data = EncodeUtil.toUnicode(response.body)
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        url = response.meta['url']
        self.logDao.info(u'开始解析列表' + url)
        # Decode the JSON payload and walk result.data.
        payload = demjson.decode(data) or {}
        entries = payload.get('result', {}).get('data', [])
        for entry in entries:
            channel_name = u'科技'
            title = entry.get('title', '')
            source_url = entry.get('url', '')
            callback = self.parseDetail2
            # Skip anything already persisted.
            if self.checkDao.checkExist(source_url):
                self.logDao.info(u'文章已经存在:' + title + source_url)
                continue
            self.logDao.info(u"开始抓取文章:" + source_url)
            # item['url'] = "http://tech.sina.com.cn/d/f/2017-07-31/doc-ifyinvwu3872514.shtml"
            yield scrapy.Request(url=entry['url'],
                                 meta={'request_type': 'sina_detail',
                                       'category': channel_name,
                                       'title': title,
                                       'source_url': source_url},
                                 callback=callback)
def parseArticleList(self, response):
    """Walk the Phoenix (fenghuang) news list page and request every
    article that has not been crawled yet."""
    body = EncodeUtil.toUnicode(response.body)
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        styleUrlDefault = response.meta['styleUrlDefault']
        self.logDao.info(u'开始解析列表')
        page = Selector(text=body)
        for node in page.xpath('//div[@class="newsList"]//li'):
            # Trim the encoded/plain leading spaces off the link.
            link = node.xpath('./a/@href').extract_first('').lstrip('%20').lstrip(' ')
            headline = node.xpath('./a/text()').extract_first('')
            # Skip anything already persisted.
            if self.checkDao.checkExist(link):
                self.logDao.info(u'文章已经存在' + headline + ':' + link)
                continue
            self.logDao.info(u'抓取文章' + headline + ':' + link)
            yield scrapy.Request(url=link,
                                 meta={
                                     'request_type': 'fenghuang_detail',
                                     "title": headline,
                                     "source_url": link,
                                     'src_channel': src_channel,
                                     'sub_channel': sub_channel,
                                     'styleUrlDefault': styleUrlDefault
                                 },
                                 callback=self.parseArticle)
def parseList(self, response):
    """Parse a Sogou account-search result list and record whether the
    target WeChat account (``wx_account``) appears in it.

    On HTTP 302 the crawler is considered blocked: mark the source as
    'updateFail', rotate the IP and pause 30s.

    Bug fix: on a miss the original logged ``wx_account_``, which is None
    when the last row has no label text and *unbound* (NameError) when the
    result list is empty; the searched-for ``wx_account`` is logged instead.
    """
    source = response.meta['source']
    wx_account = response.meta['wx_account']
    url = response.meta['url']
    body = EncodeUtil.toUnicode(response.body)
    # Blocked: the route must be re-dialed and cookies cleared.
    if response.status == 302:
        # Mark the account's update as failed.
        self.logDao.warn(u'您的访问过于频繁,重新拨号')
        self.wxSourceDao.updateStatus(wx_account, 'updateFail')
        # Acquire a fresh IP, then idle this thread for 30s.
        NetworkUtil.getNewIp()
        TimerUtil.sleep(30)
    else:
        self.logDao.info(u'开始解析:' + wx_account)
        selector = Selector(text=body)
        results = selector.xpath('//*[@id="main"]/div[4]/ul/li')
        self.logDao.info(u'列表长度:' + str(len(results)))
        hasCatch = False
        for result in results:
            # NOTE(review): these '//' XPaths search the whole document, not
            # the current <li>, so every iteration sees the same first match.
            # Left unchanged here — verify against a live page before fixing.
            wx_name = result.xpath('//*[@id="sogou_vr_11002301_box_0"]/div/div[2]/p[1]/a/text()').extract_first()
            wx_account_ = result.xpath('//p[@class="info"]/label/text()').extract_first()
            wx_url = result.xpath('//p[@class="tit"]/a/@href').extract_first()
            if wx_account_ == wx_account:
                self.logDao.info(u'成功抓取:' + wx_account_)
                self.wxSourceDao.updateSource(wx_account, wx_name, wx_url, 'last')
                hasCatch = True
                break
        if not hasCatch:
            # Log the account we searched for (wx_account_ may be unset here).
            self.logDao.info(u'没有抓到:' + wx_account)
            self.wxSourceDao.updateStatus(wx_account, 'none')
def parseArticleList(self, response):
    """Parse a Jiemian JSONP listing ('(...)'-wrapped) and request each
    unseen article found in the embedded HTML fragment ('rst')."""
    body = EncodeUtil.toUnicode(response.body)
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        # Unwrap the JSONP parentheses, then decode.
        payload = demjson.decode(body.lstrip('(').rstrip(')')) or {}
        fragment = payload.get('rst', '')
        if not fragment:
            self.logDao.info(u'不存在内容')
            return
        self.logDao.info(u'开始解析列表')
        doc = Selector(text=fragment)
        for anchor in doc.xpath('//div[@class="news-img"]/a'):
            link = anchor.xpath('@href').extract_first('')
            headline = anchor.xpath('@title').extract_first('')
            # Skip anything already persisted.
            if self.checkDao.checkExist(link):
                self.logDao.info(u'文章已经存在' + headline + ':' + link)
                continue
            if not link:
                self.logDao.info(u'文章不存在' + headline + ':' + link)
                continue
            self.logDao.info(u'抓取文章' + headline + ':' + link)
            yield scrapy.Request(url=link,
                                 meta={
                                     'request_type': 'jiemian_detail',
                                     "title": headline,
                                     "source_url": link
                                 },
                                 callback=self.parseArticle)
def parseArticleList2(self, response):
    """Parse the Tencent 'yaowen' list page and queue every unseen
    article for detail crawling."""
    body = EncodeUtil.toUnicode(response.body)
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        self.logDao.info(u'开始解析列表')
        page = Selector(text=body)
        anchors = page.xpath('//div[@class="list yaowen"]//li[boolean(contains(@class, "item"))]/a[1]')
        for anchor in anchors:
            link = anchor.xpath('./@href').extract_first('')
            headline = anchor.xpath('./text()').extract_first('')
            if not link:
                continue
            # Skip anything already persisted.
            if self.checkDao.checkExist(link):
                self.logDao.info(u'文章已经存在' + headline + ':' + link)
                continue
            self.logDao.info(u'抓取文章' + headline + ':' + link)
            yield scrapy.Request(url=link,
                                 meta={
                                     'request_type': 'tengxun_detail',
                                     "title": headline,
                                     'post_date': '',
                                     'source_url': link,
                                     'src_channel': src_channel,
                                     'sub_channel': sub_channel,
                                 },
                                 callback=self.parseArticle)
def parseList(self, response):
    """Parse a Sina ``var jsonData = {...};`` listing and schedule detail
    crawls for articles not yet stored.

    Bug fix: ``data.lstrip('var jsonData = ')`` stripped a character *set*
    and could eat the beginning of the JSON payload; the exact prefix is now
    removed instead. The local ``list`` was also renamed to avoid shadowing
    the builtin.
    """
    data = EncodeUtil.toUnicode(response.body)
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        url = response.meta['url']
        self.logDao.info(u'开始解析列表' + url)
        # Strip the exact JS assignment wrapper before decoding.
        prefix = 'var jsonData = '
        if data.startswith(prefix):
            data = data[len(prefix):]
        data = data.rstrip(';')
        # Decode the (possibly lax) JSON payload.
        data = demjson.decode(data) or {}
        items = data.get('list', [])
        for item in items:
            channel = item.get('channel', {})
            channel_name = channel.get('title', '')
            title = item.get('title', '')
            source_url = item.get('url', '')
            callback = self.parseDetail2
            # Skip articles that were already crawled.
            if self.checkDao.checkExist(source_url):
                self.logDao.info(u'文章已经存在:' + title + source_url)
                continue
            self.logDao.info(u"开始抓取文章:" + item['url'])
            yield scrapy.Request(url=item['url'],
                                 meta={
                                     'request_type': 'sina_detail',
                                     'category': channel_name,
                                     'title': title,
                                     'source_url': source_url
                                 },
                                 callback=callback)
def parseList3(self, response):
    """Parse the Sina finance tab block ('#fin_tabs0_c0') and queue each
    unseen article link for detail crawling."""
    body = EncodeUtil.toUnicode(response.body)
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        url = response.meta['url']
        self.logDao.info(u'开始解析列表' + url)
        page = Selector(text=body)
        # Every anchor inside the tab container is a candidate article.
        for anchor in page.xpath('//*[@id="fin_tabs0_c0"]//a'):
            headline = anchor.xpath('./text()').extract_first('')
            link = anchor.xpath('./@href').extract_first('')
            if not link:
                continue
            handler = self.parseDetail2
            # Skip anything already persisted.
            if self.checkDao.checkExist(link):
                self.logDao.info(u'文章已经存在:' + headline + link)
                continue
            self.logDao.info(u"开始抓取文章:" + link)
            yield scrapy.Request(url=link,
                                 meta={
                                     'request_type': 'sina_detail',
                                     'title': headline,
                                     'source_url': link,
                                     'src_channel': src_channel,
                                     'sub_channel': sub_channel
                                 },
                                 callback=handler)
def parseArticleList(self, response):
    """Parse the 'mod newslist' listing and queue unseen articles,
    prefixing the displayed date with the current year."""
    body = EncodeUtil.toUnicode(response.body)
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        self.logDao.info(u'开始解析列表')
        page = Selector(text=body)
        for node in page.xpath('//div[@class="mod newslist"]//li'):
            link = node.xpath('a/@href').extract_first('')
            headline = node.xpath('a/text()').extract_first('')
            day_part = node.xpath('span/text()').extract_first('')
            # The page omits the year; prepend the current one.
            stamp = time.strftime('%Y', time.localtime(time.time())) + u'年' + day_part
            if not link:
                continue
            # Skip anything already persisted.
            if self.checkDao.checkExist(link):
                self.logDao.info(u'文章已经存在' + headline + ':' + stamp + ':' + link)
                continue
            self.logDao.info(u'抓取文章' + headline + ':' + stamp + ':' + link)
            yield scrapy.Request(url=link,
                                 meta={
                                     'request_type': 'demoName_detail',
                                     "title": headline,
                                     'post_date': stamp,
                                     "source_url": link
                                 },
                                 callback=self.parseArticle)
def parseArticleList(self, response):
    """Parse a Sogou WeChat account page: detect CAPTCHA/302 blocks,
    otherwise pull the embedded ``var msgList`` JS variable and schedule a
    detail request for every article not yet stored for this account.
    """
    body = EncodeUtil.toUnicode(response.body)
    selector = Selector(text=body)
    source_url = response.meta['source_url']
    print source_url
    title = selector.xpath('//title/text()').extract_first('').strip(u' ')
    # A CAPTCHA page titles itself '请输入验证码'.
    isN = u"请输入验证码" == title
    if isN or response.status == 302:
        self.logDao.info(u'访问过多被禁止,重新拨号')
        # Acquire a fresh IP.
        # Also idle the thread for a while.
        NetworkUtil.getNewIp()
        TimerUtil.sleep(50)
        NetworkUtil.openWebbrowser(source_url)
    else:
        source = response.meta['source']
        wx_account = response.meta['wx_account']
        wx_account_id = response.meta['wx_account_id']
        self.logDao.info(u'开始解析列表:' + wx_account)
        # Scan the page's inline scripts for the article list variable.
        articleJS = selector.xpath('//script/text()').extract()
        for js in articleJS:
            if 'var msgList = ' in js:
                p8 = re.compile('var\s*msgList\s*=.*;')
                matchList = p8.findall(js)
                for match in matchList:
                    # NOTE(review): lstrip() strips a character *set*, not
                    # this exact prefix; it works only while the JSON starts
                    # with '{' — confirm before reusing this pattern.
                    match = match.lstrip('var msgList = ').rstrip(';')
                    # Decode the embedded (lax) JSON.
                    articles = demjson.decode(match) or {}
                    articles = articles['list'] or []
                    self.logDao.info(u'匹配到文章列表' + wx_account)
                    for article in articles:
                        app_msg_ext_info = article.get(
                            'app_msg_ext_info') or {}
                        desc = app_msg_ext_info.get('digest') or ''
                        title = app_msg_ext_info.get('title') or ''
                        # Skip articles already stored for this account.
                        if self.checkDao.checkExist(title, wx_account, 1):
                            self.logDao.info(u'已经存在' + wx_account + ':' + title)
                            continue
                        detailUrl = app_msg_ext_info['content_url'] or ''
                        if not detailUrl:
                            continue
                        # content_url is relative and HTML-escaped; fix both.
                        detailUrl = "http://mp.weixin.qq.com" + detailUrl
                        detailUrl = detailUrl.replace("amp;", "")
                        self.logDao.info(u'抓取' + wx_account + ':' + title + ':' + detailUrl)
                        yield scrapy.Request(url=detailUrl,
                                             meta={
                                                 'request_type': 'weixin_detail',
                                                 'wx_account': wx_account,
                                                 "source": source,
                                                 "title": title,
                                                 'wx_account_id': wx_account_id,
                                                 "source_url": detailUrl
                                             },
                                             callback=self.parseArticle)
def parseArticleList(self, response):
    """Parse a Jiemian JSONP 'news-view' listing and queue unseen articles
    together with their date and tag metadata."""
    body = EncodeUtil.toUnicode(response.body)
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        # Unwrap the JSONP parentheses, then decode.
        payload = demjson.decode(body.lstrip('(').rstrip(')')) or {}
        fragment = payload.get('rst', '')
        if not fragment:
            self.logDao.info(u'不存在内容')
            return
        self.logDao.info(u'开始解析列表')
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        doc = Selector(text=fragment)
        for node in doc.xpath('//div[boolean(contains(@class,"news-view"))]'):
            link = node.xpath('.//div[@class="news-header"]//a/@href').extract_first('')
            headline = node.xpath(
                './/div[@class="news-header"]//a/@title | .//div[@class="news-header"]//a/text()'
            ).extract_first('')
            stamp = node.xpath('.//div[@class="news-footer"]//span[@class="date"]/text()').extract_first('')
            tag_names = node.xpath('.//div[@class="news-tag"]/a/text()').extract()
            if not link:
                self.logDao.info(u'文章不存在' + headline + ':' + link + ':' + stamp)
                continue
            # Skip anything already persisted.
            if self.checkDao.checkExist(link):
                self.logDao.info(u'文章已经存在' + headline + ':' + link + ':' + stamp)
                continue
            self.logDao.info(u'抓取文章' + headline + ':' + link + ':' + stamp)
            yield scrapy.Request(url=link,
                                 meta={
                                     'request_type': 'jiemian_detail',
                                     "title": headline,
                                     'post_date': stamp,
                                     'sub_channel': sub_channel,
                                     'src_channel': src_channel,
                                     'tags': tag_names,
                                     'source_url': link
                                 },
                                 callback=self.parseArticle)
def parseArticleList(self, response):
    """Parse a Sohu ``/**/([...])`` JSONP article listing and queue unseen
    articles with author/tag/date metadata.

    Bug fix: ``body.lstrip('/**/')`` stripped the character set {'/', '*'},
    not the literal comment prefix; the exact '/**/' prefix is now removed
    before unwrapping the parentheses.
    """
    body = EncodeUtil.toUnicode(response.body)
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        # Remove the literal JSONP comment prefix, then the wrapper.
        if body.startswith('/**/'):
            body = body[len('/**/'):]
        articles = demjson.decode(body.lstrip('(').rstrip(';').rstrip(')')) or []
        if not articles:
            self.logDao.info(u'不存在内容')
            return
        self.logDao.info(u'开始解析列表')
        for article in articles:
            id = article.get('id', '')
            authorId = article.get('authorId', '')
            if not id or not authorId:
                continue
            # The canonical article URL is built from id + authorId.
            pageId = str(id) + '_' + str(authorId)
            source_url = 'http://www.sohu.com/a/' + pageId + '?loc=1&focus_pic=0'
            title = article.get('title', '')
            post_user = article.get('authorName', '')
            tags = article.get('tags', [])
            tagsStr = [tag.get('name', '') for tag in tags]
            # publicTime is epoch milliseconds; default to "now".
            publicTime = article.get('publicTime', time.time() * 1000)
            post_date = time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(publicTime / 1000))
            # Skip articles that were already crawled.
            if self.checkDao.checkExist(source_url):
                self.logDao.info(u'文章已经存在' + title + ':' + post_date + ':' + source_url)
                continue
            self.logDao.info(u'抓取文章' + title + ':' + post_date + ':' + source_url)
            yield scrapy.Request(url=source_url,
                                 meta={
                                     'request_type': 'sohu_detail',
                                     'title': title,
                                     'post_date': post_date,
                                     'post_user': post_user,
                                     'tags': tagsStr,
                                     'source_url': source_url,
                                     'src_channel': src_channel,
                                     'sub_channel': sub_channel
                                 },
                                 callback=self.parseArticle)
def parse(self, response):
    """Parse a JSON feed of articles (the 'result' array), build a
    ContentItem skeleton per entry and request the detail page for the full
    content. Video entries and entries without title/url are skipped.
    """
    body = EncodeUtil.toUnicode(response.body)
    if False:  # ban-detection placeholder (currently disabled)
        pass
    else:
        # Decode the (possibly lax) JSON feed.
        jsonStr = demjson.decode(body) or {}
        articles = jsonStr.get('result') or []
        for article in articles:
            default_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                         time.localtime(time.time()))
            title = article.get('title', '')
            post_date = article.get('date', default_time)
            summary = article.get('summary', '')
            keywords = article.get('keywords', [])
            source_url = article.get('url')
            content_type = article.get('content_type', 'news')  # news | video
            image_urls = article.get('image_urls', [])
            if not title or not source_url or content_type == 'video':
                continue
            # Skip articles that were already crawled.
            if self.checkDao.checkExist(source_url):
                self.infoStr('已经存在')
                continue
            contentItem = ContentItem()
            contentItem['title'] = title
            contentItem['post_date'] = post_date
            contentItem['summary'] = summary
            contentItem['keywords'] = ','.join(keywords)
            contentItem['source_url'] = source_url
            contentItem['content_type'] = content_type
            image_urls_new = []
            for image_url in image_urls:
                # Relative image ids are routed through the thumbnail proxy.
                if not image_url.startswith('http'):
                    image_url = 'http://i1.go2yd.com/image.php?type=thumbnail_336x216&url=%s' % image_url
                image_urls_new.append(image_url)
            contentItem['image_urls'] = ','.join(image_urls_new)
            self.infoStr(u'抓取文章' + title + ':' + post_date + ':' + source_url)
            yield scrapy.Request(url=source_url,
                                 meta={
                                     'request_type': 'zixun_detail',
                                     "title": title,
                                     'contentItem': contentItem,
                                     "source_url": source_url
                                 },
                                 callback=self.parseArticle)
def parseArticleList(self, response):
    """Parse the Yicai (diyicaijing) 'dl-item' listing and queue unseen
    articles, appending each item's own column name to the channel label.

    Bug fix: the original reassigned ``sub_channel`` inside the loop, so
    column names accumulated onto every later article; the combined label
    is now built per item.
    """
    body = EncodeUtil.toUnicode(response.body)
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        self.logDao.info(u'开始解析列表')
        selector = Selector(text=body)
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        articles = selector.xpath('//dl[@class="f-cb dl-item"]')
        for article in articles:
            source_url = article.xpath('.//h3[@class="f-ff1 f-fwn f-fs22"]/a/@href').extract_first('')
            title = article.xpath('.//h3[@class="f-ff1 f-fwn f-fs22"]/a/text()').extract_first('')
            sub_channel_ = article.xpath('.//h5[@class="f-ff1 f-fwn f-fs14"]/a/text()').extract_first('')
            post_date = article.xpath('.//h4[@class="f-ff1 f-fwn f-fs14"]/span/text()').extract_first('')
            if not source_url:
                continue
            # Per-item channel label (do not mutate the shared sub_channel).
            item_sub_channel = sub_channel
            if sub_channel_:
                item_sub_channel = EncodeUtil.toUnicode(sub_channel) + u',' + EncodeUtil.toUnicode(sub_channel_)
            # Skip articles that were already crawled.
            if self.checkDao.checkExist(source_url):
                self.logDao.info(u'文章已经存在' + title + ':' + post_date + ':' + source_url)
                continue
            self.logDao.info(u'抓取文章' + title + ':' + post_date + ':' + source_url)
            yield scrapy.Request(url=source_url,
                                 meta={
                                     'request_type': 'diyicaijing_detail',
                                     "title": title,
                                     'post_date': post_date,
                                     'source_url': source_url,
                                     'sub_channel': item_sub_channel,
                                     'src_channel': src_channel,
                                 },
                                 callback=self.parseArticle)
def parseArticleList(self, response):
    """Parse the Hexun ``TradeTab_JsonData=[...]`` listing and queue
    unseen articles with their title/date metadata.

    Bug fix: ``body.lstrip('TradeTab_JsonData=')`` stripped a character
    *set* rather than the exact prefix and could eat the start of the
    payload; the literal prefix is now removed instead.
    """
    body = EncodeUtil.toUnicode(response.body)
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        # Strip the exact JS assignment wrapper before decoding.
        prefix = 'TradeTab_JsonData='
        if body.startswith(prefix):
            body = body[len(prefix):]
        articles = demjson.decode(body) or []
        if not articles:
            self.logDao.info(u'不存在内容')
            return
        self.logDao.info(u'开始解析列表')
        for article in articles:
            source_url = article.get('titleLink', '')
            title = article.get('title', '')
            # Fall back to "now" when the item carries no date.
            post_date = article.get(
                'dateInf',
                time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time())))
            if not source_url:
                self.logDao.info(u'文章不存在' + title + ':' + source_url + ':' + post_date)
                continue
            # Age-based cutoff (skip items older than N) is still TODO.
            # TODO..
            # Skip articles that were already crawled.
            if self.checkDao.checkExist(source_url):
                self.logDao.info(u'文章已经存在' + title + ':' + post_date + ':' + source_url)
                continue
            self.logDao.info(u'抓取文章' + title + ':' + post_date + ':' + source_url)
            yield scrapy.Request(url=source_url,
                                 meta={
                                     'request_type': 'hexun_detail',
                                     'title': title,
                                     'post_date': post_date,
                                     'source_url': source_url,
                                     'src_channel': src_channel,
                                     'sub_channel': sub_channel
                                 },
                                 callback=self.parseArticle)
def parseList(self, response):
    """Parse a Sina ``var jsonData = {...};`` listing and schedule detail
    crawls, tagging each request with its per-item channel label.

    Bug fixes vs. the original:
    - ``data.lstrip('var jsonData = ')`` stripped a character *set* and
      could eat the start of the JSON payload; the exact prefix is removed.
    - ``sub_channel`` was reassigned inside the loop, so channel names
      accumulated across iterations; the label is now built per item.
    - The local ``list`` no longer shadows the builtin.
    """
    data = EncodeUtil.toUnicode(response.body)
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        url = response.meta['url']
        self.logDao.info(u'开始解析列表' + url)
        # Strip the exact JS assignment wrapper before decoding.
        prefix = 'var jsonData = '
        if data.startswith(prefix):
            data = data[len(prefix):]
        data = data.rstrip(';')
        # Decode the (possibly lax) JSON payload.
        data = demjson.decode(data) or {}
        items = data.get('list', [])
        for item in items:
            channel = item.get('channel', {})
            channel_name = channel.get('title', '')
            # Per-item channel label (do not mutate the shared sub_channel).
            item_sub_channel = sub_channel
            if channel_name:
                item_sub_channel = sub_channel + ',' + channel_name
            title = item.get('title', '')
            source_url = item.get('url', '')
            callback = self.parseDetail2
            # Skip articles that were already crawled.
            if self.checkDao.checkExist(source_url):
                self.logDao.info(u'文章已经存在:' + title + source_url)
                continue
            self.logDao.info(u"开始抓取文章:" + item['url'])
            # item['url'] = "http://tech.sina.com.cn/d/f/2017-07-31/doc-ifyinvwu3872514.shtml"
            yield scrapy.Request(url=item['url'],
                                 meta={
                                     'request_type': 'sina_detail',
                                     'title': title,
                                     'source_url': source_url,
                                     'src_channel': src_channel,
                                     'sub_channel': item_sub_channel
                                 },
                                 callback=callback)
def parseArticle(self, response):
    """Parse a Sohu article page into a ContentItem.

    Extracts stylesheet CSS, byline/source/date from the '.tip fl' header
    block, the article body from '.art_contextBox', a plain-text rendering
    and all image URLs. Returns the populated ContentItem, or None when no
    usable content is found.
    """
    body = EncodeUtil.toUnicode(response.body)
    if False:  # ban-detection placeholder (currently disabled)
        self.logDao.info(u'访问过多被禁止')
    else:
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        title = response.meta['title']
        post_date = response.meta['post_date']
        source_url = response.meta['source_url']
        self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' + source_url)
        selector = Selector(text=body)
        # Collect page styles.
        styleUrls = selector.xpath(
            '//link[@rel="stylesheet"]/@href').extract()
        styleList = []
        for styleUrl in styleUrls:
            if styleUrl.startswith('//'):
                styleUrl = 'http:' + styleUrl
            # Use the URL hash as the cache key.
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and keep it.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
            '\\', '\\\\')
        # Neutralise URLs inside the styles.
        styles = CssUtil.clearUrl(styles)
        post_user = selector.xpath(
            '//div[@class="tip fl"]/text()').extract_first('').replace(
                '\r', '').replace('\t', '').replace('\n', '')
        src_ref = selector.xpath(
            '//div[@class="tip fl"]/a/text()').extract_first('')
        post_date = selector.xpath(
            '//div[@class="tip fl"]/span[@class="pr20"]/text()'
        ).extract_first('')
        if post_date:
            # Normalise the on-page timestamp; keep the raw text on failure.
            try:
                post_date = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.strptime(post_date, "%Y-%m-%d %H:%M:%S"))
            except Exception as e:
                self.logDao.warn(e.message)
                pass
        content_html = selector.xpath('//div[@class="art_contextBox"]')
        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Drop unwanted inner tags.
        # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
        content_items = content_html.xpath(
            '*[not(name(.)="script") and not(name(.)="style") and not(name(.)="iframe") and not(boolean(name(.)="div" and @style="text-align:right;font-size:12px"))]'
        )
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Build the plain-text version.
        content_txt = []
        for item in content_items:
            # Text only.
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            # Append.
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap the kept nodes in a fresh container.
        outHtml = """<div class="art_context"><div class="art_contextBox" style="visibility: visible; height:auto;">${++content++}</div></div> """
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++content++}', content_items)
        selector = Selector(text=content_html)
        # Resolve every image URL in the document, then substitute it back in.
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            # The image may live in src or data-src.
            image_url_base = img.xpath('@src').extract_first('')
            if image_url_base.startswith('//'):
                image_url = 'http:' + image_url_base
            else:
                image_url = image_url_base
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
                content_html = content_html.replace(
                    image_url_base, image_url)
        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        self.saveFile(urlHash, body)
        # Hash code for dedup bookkeeping.
        hash_code = self.checkDao.getHashCode(source_url)
        # Strip image alt/title text.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        # Remove tooltip text carried in //img/@alt|//img/@title.
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = sub_channel
        contentItem['post_user'] = post_user
        contentItem['tags'] = ''
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 6
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = src_channel
        contentItem['src_ref'] = src_ref
        return contentItem
def parseArticle(self, response):
    """Parse a Tencent (QQ) article page into a ContentItem.

    Pulls the category/author/source/time header fields, the
    '#Cnt-Main-Article-QQ' body, a plain-text rendering and all image URLs.
    Returns None when no usable content is found.
    """
    body = EncodeUtil.toUnicode(response.body)
    if False:  # ban-detection placeholder (currently disabled)
        self.logDao.info(u'访问过多被禁止')
    else:
        title = response.meta['title']
        post_date = response.meta['post_date']
        source_url = response.meta['source_url']
        self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' + source_url)
        selector = Selector(text=body)
        # Collect page styles.
        styleUrls = selector.xpath(
            '//link[@rel="stylesheet"]/@href').extract()
        styleList = []
        for styleUrl in styleUrls:
            if styleUrl.startswith('//'):
                styleUrl = 'http:' + styleUrl
            # Use the URL hash as the cache key.
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and keep it.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
            '\\', '\\\\')
        # Neutralise URLs inside the styles.
        styles = CssUtil.clearUrl(styles)
        category = selector.xpath(
            '//*[@class="a_catalog"]/a/text()|//*[@class="a_catalog"]/text()'
        ).extract_first('')
        post_user = selector.xpath(
            '//*[@class="a_author"]/text() | //*[@class="where"]/text()| //*[@class="where"]/a/text()'
        ).extract_first('')
        src_ref = selector.xpath(
            '//*[@class="a_source"]/text() | //*[@class="a_source"]/a/text()'
        ).extract_first('')
        a_time = selector.xpath(
            '//*[@class="a_time"]/text() | //*[@class="pubTime"]/text()'
        ).extract_first('')
        # Prefer the on-page timestamp over the one passed in meta.
        if a_time:
            post_date = a_time
        else:
            post_date = (post_date or '')
        # Normalise Chinese date markers, then try to parse; the raw text is
        # kept when parsing fails.
        post_date = post_date.replace(u'年', '-').replace(
            u'月', '-').replace(u'日', u' ').replace(u'\xa0', u' ')
        try:
            post_date = time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.strptime(post_date, "%Y-%m-%d %H:%M"))
        except Exception:
            pass
        content_html = selector.xpath('//div[@id="Cnt-Main-Article-QQ"]')
        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Drop unwanted inner tags.
        # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
        content_items = content_html.xpath(
            '*[not(name(.)="script") and not(name(.)="style") and not(name(.)="iframe") and not(boolean(@class="rv-root-v2 rv-js-root"))]'
        )
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Build the plain-text version.
        content_txt = []
        for item in content_items:
            # Text only.
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            # Append.
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap the kept nodes in a fresh container.
        outHtml = """<div id="Cnt-Main-Article-QQ" class="Cnt-Main-Article-QQ" bosszone="content">${++content++}</div>"""
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++content++}', content_items)
        selector = Selector(text=content_html)
        # Resolve every image URL in the document, then substitute it back in.
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            # The image may live in src or data-src.
            image_url_base = img.xpath('@src').extract_first('')
            if image_url_base.startswith('//'):
                image_url = 'http:' + image_url_base
            else:
                image_url = image_url_base
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
                content_html = content_html.replace(
                    image_url_base, image_url)
        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        self.saveFile(urlHash, body)
        # Hash code for dedup bookkeeping.
        hash_code = self.checkDao.getHashCode(source_url)
        # Strip image alt/title text.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        # Remove tooltip text carried in //img/@alt|//img/@title.
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = category
        contentItem['post_user'] = post_user
        contentItem['tags'] = ''
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 3
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = '腾讯科技'
        contentItem['src_ref'] = src_ref
        return contentItem
def parseArticle(self, response):
    """Parse a Jiemian article page into a ContentItem.

    Combines meta tags with on-page tags, extracts the 'article-img' /
    'article-content' sections, resolves the real source from the
    'article-source' footer, and collects text, images and styles.
    Returns None when the content section is missing.
    """
    body = EncodeUtil.toUnicode(response.body)
    if False:  # ban-detection placeholder (currently disabled)
        self.logDao.info(u'访问过多被禁止')
    else:
        title = response.meta['title']
        post_date = response.meta['post_date']
        source_url = response.meta['source_url']
        sub_channel = response.meta['sub_channel']
        src_channel = response.meta['src_channel']
        tags = response.meta['tags']
        self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' + source_url)
        selector = Selector(text=body)
        # Collect page styles.
        styleUrls = selector.xpath(
            '//link[@rel="stylesheet"]/@href').extract()
        styleList = []
        for styleUrl in styleUrls:
            if styleUrl.startswith('//'):
                styleUrl = 'http:' + styleUrl
            # Use the URL hash as the cache key.
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and keep it.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
            '\\', '\\\\')
        # Neutralise URLs and the page background colour in the styles.
        styles = CssUtil.clearUrl(styles)
        styles = CssUtil.clearBackgroundColor(styles, ['#f5f5f5'])
        post_user = selector.xpath(
            '//div[@class="article-info"]//span[@class="author"]//text()'
        ).extract_first('')
        src_ref = src_channel
        post_date = selector.xpath(
            '//div[@class="article-info"]//span[@class="date"]//text()'
        ).extract_first('')
        # Normalise the on-page timestamp; keep the raw text on failure.
        try:
            post_date = time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.strptime(post_date, "%Y/%m/%d %H:%M"))
        except Exception:
            pass
        tags_ = selector.xpath(
            '//div[@class="article-info"]//*[@class="tags"]//text()'
        ).extract()
        # Merge meta tags with on-page tags into one comma-separated string.
        tags = tags + tags_
        tags = ','.join(tags)
        """
        article-main article-img article-content p article-source p:来源 p:点击下载“界面新闻”APP 不抓
        """
        # Grab the lead-image block.
        article_img = selector.xpath(
            '//div[@class="article-main"]/div[@class="article-img"]'
        ).extract_first('')
        # Grab the body block.
        article_content = selector.xpath(
            '//div[@class="article-main"]/div[@class="article-content"]'
        ).extract_first('')
        if not article_content:
            self.logDao.info(u'文章不存在' + title + ':' + source_url + ':' + post_date)
            return
        contentSelector = Selector(text=article_content)
        content_items = contentSelector.xpath(
            '//div[@class="article-content"]/*[not(name(.)="script") and not('
            'name(.)="iframe") and not(name(.)="style") and not(boolean( '
            'contains(a//@href,"?m=app"))) and not(boolean(@class="share-view" '
            'or @class="article-source"))]')
        # Resolve the true source for attribution.
        contentSource = contentSelector.xpath(
            '//div[@class="article-content"]/div[@class="article-source"]/p/text()'
        ).extract_first('')
        if contentSource:
            contentSource = contentSource.replace(u'来源:', u'')
            src_ref = contentSource
        # Build the plain-text version.
        content_txt = []
        for item in content_items:
            # Text only.
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            # Append.
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap the kept nodes in a fresh container.
        outHtml = u"""<div class="article-main">${++articleImg++}<div class="article-content" style="font-family: 'Microsoft YaHei', 黑体;">${++content++}</div></div> """
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++articleImg++}',
                                       article_img).replace(
                                           '${++content++}', content_items)
        selector = Selector(text=content_html)
        # Resolve every image URL in the document, then substitute it back in.
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            # The image may live in src or data-src.
            image_url_base = img.xpath('@src').extract_first('')
            if image_url_base.startswith('//'):
                image_url = 'http:' + image_url_base
            else:
                image_url = image_url_base
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
                content_html = content_html.replace(
                    image_url_base, image_url)
        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        self.saveFile(urlHash, body)
        # Hash code for dedup bookkeeping.
        hash_code = self.checkDao.getHashCode(source_url)
        # Strip image alt/title text.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        # Remove tooltip text carried in //img/@alt|//img/@title.
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = sub_channel
        contentItem['post_user'] = post_user
        contentItem['tags'] = tags
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 5
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = src_channel
        contentItem['src_ref'] = src_ref
        return contentItem
def parseArticle(self, response):
    """Parse a WeChat official-account article page into a ContentItem.

    Flow:
      1. Detect the anti-crawl block page (title == captcha prompt) or a 302
         redirect; if blocked, record the broken account, force a re-dial for
         a new IP, and re-open the URL (no item is returned in that case).
      2. Otherwise extract date / author / inline styles / body content,
         rebuild the content inside a fresh wrapper div, collect image URLs,
         and return a populated ContentItem.

    :param response: scrapy Response; meta carries 'source_url', 'wx_account',
                     'title', 'wx_account_id' set by the list-page callback.
    :returns: ContentItem, or None when blocked / no content found.
    """
    body = EncodeUtil.toUnicode(response.body)
    selector = Selector(text=body)
    # Page <title> is only used for ban detection here; the real article
    # title comes from response.meta below.
    title = selector.xpath('//title/text()').extract_first('').strip(u' ')
    source_url = response.meta['source_url']
    wx_account = response.meta['wx_account']
    # u"请输入验证码" == "please enter the captcha" -> we have been blocked.
    isN = u"请输入验证码" == title
    if isN or response.status == 302:
        self.logDao.info(u'访问过多被禁止,重新拨号')
        # Remember which account triggered the block.
        self.brokenAccounts.append(wx_account)
        # Get a fresh IP (re-dial), then idle before retrying.
        # NOTE(review): original comment said "30s" but the sleep is 80 —
        # presumably tuned later; confirm intended value.
        NetworkUtil.getNewIp()
        TimerUtil.sleep(80)
        # Re-open the page in a browser, presumably to warm the new
        # session / re-trigger crawling — TODO confirm.
        NetworkUtil.openWebbrowser(source_url)
    else:
        title = response.meta['title']
        source_url = response.meta['source_url']
        wx_account_id = response.meta['wx_account_id']
        self.logDao.info(u'开始解析文章' + wx_account + ':' + title + ':' + source_url)
        self.logDao.info(u'开始解析文章:' + source_url)
        # Start extracting the page.
        post_date = selector.xpath(
            '//*[@id="post-date"]/text()').extract_first('')
        # Normalize "YYYY-MM-DD" to "YYYY-MM-DD HH:MM:SS"; on any parse
        # failure keep the raw string (best-effort, deliberate swallow).
        try:
            post_date = time.strftime("%Y-%m-%d %H:%M:%S",
                                      time.strptime(post_date, "%Y-%m-%d"))
        except Exception:
            pass
        # Inline <style> blocks are compressed and escaped so they can be
        # stored inside a quoted string downstream.
        styles = selector.xpath('//style/text()').extract()
        styles = CssUtil.compressCss(styles).replace('\'', '"').replace(
            '\\', '\\\\')
        styles = CssUtil.clearUrl(styles)
        styles = CssUtil.clearBackgroundColor(styles, ['#f3f3f3'])
        post_user = selector.xpath(
            '//*[@id="post-user"]/text()').extract_first('')
        # #js_content is the WeChat article body container.
        content_html = selector.xpath('//*[@id="js_content"]')
        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Direct children of the body container.
        content_items = content_html.xpath('*')
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # (disabled) filter that dropped "subscribe" promo blocks:
        # content_items_new = []
        # for item in content_items:
        #     itemStr = item.extract()
        #     if u'订阅微信' in itemStr:
        #         continue
        #     content_items_new.append(item)
        # content_items = content_items_new
        # Plain-text version of the article: one line per top-level node.
        content_txt = []
        for item in content_items:
            # All descendant text of this node, tabs stripped.
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap the extracted children in a clean container div.
        outHtml = """<div class="rich_media_content " id="js_content">${++content++}</div>"""
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++content++}', content_items)
        selector = Selector(text=content_html)
        # Collect image URLs; WeChat may use src or data-src (lazy load).
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            image_url = img.xpath('@src | @data-src').extract_first('')
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
        self.logDao.info(wx_account + u'得到文章:' + title + ":" + post_date +
                         ':' + post_user)
        self.logDao.info(u'得到文章:' + source_url)
        # Dedup key is (title, account, type=1), not the URL.
        hash_code = self.checkDao.getHashCode(title, wx_account, 1)
        # Persist the raw page keyed by the hash.
        self.saveFile(hash_code, body)
        # Strip every non-blank img alt/title value out of the HTML so the
        # stored markup carries no hover/placeholder text.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = ''
        contentItem['post_user'] = post_user
        contentItem['tags'] = ''
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        # src_source_id 1 identifies the WeChat source in the sink.
        contentItem['src_source_id'] = 1
        contentItem['src_account_id'] = wx_account_id
        contentItem['src_channel'] = '微信公众号'
        contentItem['src_ref'] = ''
        contentItem['wx_account'] = wx_account
        return contentItem
def parseArticle(self, response):
    """Parse an ifeng (凤凰网) article page into a ContentItem.

    Downloads/caches linked stylesheets, extracts tags/category/source/date,
    strips the ifeng logo span, rewrites protocol-relative image URLs,
    removes img alt/title text, and returns a populated ContentItem
    (src_source_id=8).

    :param response: scrapy Response; meta carries 'src_channel',
                     'sub_channel', 'styleUrlDefault', 'title', 'source_url'.
    :returns: ContentItem, or None when the content container is empty.
    """
    body = EncodeUtil.toUnicode(response.body)
    # Dead branch: disabled placeholder for a ban check.
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        styleUrlDefault = response.meta['styleUrlDefault']
        title = response.meta['title']
        source_url = response.meta['source_url']
        self.logDao.info(u'开始解析文章:' + title + ':' + source_url)
        selector = Selector(text=body)
        # Collect stylesheet URLs (page links + caller-provided defaults).
        styleUrls = selector.xpath(
            '//link[@rel="stylesheet"]/@href').extract()
        styleUrls = styleUrls + styleUrlDefault
        styleList = []
        for styleUrl in styleUrls:
            # Protocol-relative URL -> force http.
            if styleUrl.startswith('//'):
                styleUrl = 'http:' + styleUrl
            # md5 of the URL is the cache key.
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and remember.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        # Compress and escape so the CSS survives being embedded in a
        # quoted string downstream.
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
            '\\', '\\\\')
        # Neutralize url(...) references inside the styles.
        styles = CssUtil.clearUrl(styles)
        styles = CssUtil.clearBackgroundColor(styles, ['#eaeaea'])
        tags = selector.xpath(
            '//meta[@name="keywords"]/@content').extract_first('')
        # og:category meta (matched by substring) extends the sub-channel.
        category = selector.xpath(
            '//meta[boolean(contains(@name, "og:category"))]/@content'
        ).extract_first('')
        if category:
            sub_channel = sub_channel + ',' + category
        # Source attribution: prefer span.ss03, fall back to #artical_sth.
        src_ref = selector.xpath(
            '//span[@class="ss03"]//text()').extract_first('')
        if not src_ref.replace('\n', '').replace(' ', ''):
            src_ref = selector.xpath(
                '//div[@id="artical_sth"]/p/text()').extract()
            src_ref = ''.join(src_ref).replace('\n', '').replace(
                u'来源:', '').replace(' ', '')
        # Normalize Chinese date punctuation, then try the two observed
        # timestamp formats; on total failure the raw string is kept.
        post_date = selector.xpath(
            '//meta[@name="og:time"]/@content').extract_first('')
        post_date = post_date.replace(u'年', '-').replace(
            u'月', '-').replace(u'日', u' ').replace(u'\xa0', u' ')
        try:
            post_date = time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.strptime(post_date, "%Y-%m-%d %H:%M:%S"))
        except Exception as e:
            try:
                post_date = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.strptime(post_date, "%Y-%m-%d %H:%M"))
            except Exception as e:
                # NOTE(review): e.message is Python-2-only; under Python 3
                # this line would itself raise. Confirm target runtime.
                self.logDao.warn(e.message)
                pass
            pass
        content_html = selector.xpath('//div[@id="main_content"]')
        # The ifeng watermark span, removed from the final HTML below.
        logoHtml = selector.xpath(
            '//span[@class="ifengLogo"]').extract_first('')
        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Keep only child nodes that are not script/style/iframe.
        # Fuller example of this filter pattern:
        # content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
        content_items = content_html.xpath(
            '*[not(name(.)="script") and not(name(.)="style") and not(name(.)="iframe")]'
        )
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Plain-text version: one line per kept top-level node.
        content_txt = []
        for item in content_items:
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap the filtered children in ifeng's own container markup.
        outHtml = """<div id="artical_real" class="js_img_share_area"><div id="main_content" class="js_selection_area" bosszone="content">${++content++}</div></div>"""
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++content++}', content_items)
        # Drop the watermark markup.
        content_html = content_html.replace(logoHtml, '')
        selector = Selector(text=content_html)
        # Collect image URLs and rewrite protocol-relative ones in place.
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            image_url_base = img.xpath('@src').extract_first('')
            if image_url_base.startswith('//'):
                image_url = 'http:' + image_url_base
            else:
                image_url = image_url_base
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
                content_html = content_html.replace(
                    image_url_base, image_url)
        # Persist the raw page keyed by md5 of the URL.
        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        self.saveFile(urlHash, body)
        # Dedup key is the source URL.
        hash_code = self.checkDao.getHashCode(source_url)
        # Strip non-blank img alt/title values from the stored HTML.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = sub_channel
        contentItem['post_user'] = ''
        contentItem['tags'] = tags
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        # src_source_id 8 identifies ifeng in the sink.
        contentItem['src_source_id'] = 8
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = src_channel
        contentItem['src_ref'] = src_ref
        return contentItem
def parseDetail2(self, response):
    """Parse a Sina Tech (新浪科技) article page into a ContentItem.

    Downloads/caches linked stylesheets, extracts date/source/author/tags,
    applies an in-text "来源:" heuristic to correct the source attribution,
    collects image URLs, removes img alt/title text, and returns a
    populated ContentItem (src_source_id=2).

    :param response: scrapy Response; meta carries 'category', 'title',
                     'source_url' from the list-page callback.
    :returns: ContentItem, or None when the content container is empty.
    """
    body = EncodeUtil.toUnicode(response.body)
    # Dead branch: disabled placeholder for a ban check.
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        category = response.meta['category']
        title = response.meta['title']
        source_url = response.meta['source_url']
        self.logDao.info(u'开始解析文章:' + title + ':' + category + ':' + source_url)
        selector = Selector(text=body)
        # Download each linked stylesheet once; md5(url) is the cache key.
        styleUrls = selector.xpath('//link[@rel="stylesheet"]/@href').extract()
        styleList = []
        for styleUrl in styleUrls:
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and remember.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        # Compress/escape for storage inside a quoted string downstream.
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace('\\', '\\\\')
        styles = CssUtil.clearUrl(styles)
        # Remove overflow:hidden so the stored page can scroll.
        styles = styles.replace('overflow-x:hidden', '').replace('overflow:hidden', '')
        # Publish date appears under either #pub_date or .titer, with
        # Chinese date punctuation; normalize then try "Y-m-d H:M".
        post_date = selector.xpath('//*[@id="pub_date"]/text() | //*[@class="titer"]/text()').extract_first('')
        post_date = post_date.replace('\r\n', '').strip(' ').replace(u'年', '-').replace(u'月', '-').replace(u'日', ' ')
        try:
            post_date = time.strftime("%Y-%m-%d %H:%M:%S", time.strptime(post_date, "%Y-%m-%d %H:%M"))
        except Exception:
            # Best-effort: keep the raw string when parsing fails.
            pass
        # Source attribution can live in several places across templates.
        src_ref = selector.xpath(
            '//*[@id="media_name"]/a[1]/text() | //*[@class="source"]/a/text() | //*[@class="source"]/text()').extract_first(
            '')
        post_user = selector.xpath('//*[@id="author_ename"]/a/text()').extract_first('')
        tags = selector.xpath('//p[@class="art_keywords"]/a/text()').extract() or []
        tags = ','.join(tags)
        content_html = selector.xpath('//*[@id="artibody"][1]')
        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Keep children that are neither the header picture nor scripts.
        # (filter tweaked 2017-07-24 19:23)
        # Fuller example of this filter pattern:
        # content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
        content_items = content_html.xpath('*[not(boolean(@class="entHdPic")) and not(name(.)="script")]')
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Plain-text pass; also sniff the real source from body text.
        content_txt = []
        for item in content_items:
            # TODO (original note): later also classify heading types here.
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            # A short line containing "来源:" is taken as the true source.
            if u'来源:' in allTxt and len(allTxt) < 25:
                if not post_user:
                    # The previously scraped src_ref was actually the
                    # author; move it there before overwriting.
                    post_user = src_ref
                src_ref = allTxt.replace(u'来源:', '').strip(u' ')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap the filtered children in Sina's own container markup.
        outHtml = """<div class="content_wrappr_left"><div class="content"><div class="BSHARE_POP blkContainerSblkCon clearfix blkContainerSblkCon_16" id="artibody">${++content++}</div></div></div>"""
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++content++}', content_items)
        selector = Selector(text=content_html)
        # Collect absolute image URLs (src attribute only on this site).
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            image_url = img.xpath('@src').extract_first()
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
        # Persist the raw page keyed by md5 of the URL.
        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        self.saveFile(urlHash, body)
        # Dedup key is the source URL.
        hash_code = self.checkDao.getHashCode(source_url)
        # Strip non-blank img alt/title values from the stored HTML.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = category
        contentItem['post_user'] = post_user
        contentItem['tags'] = tags
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        # src_source_id 2 identifies Sina Tech in the sink.
        contentItem['src_source_id'] = 2
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = '新浪科技'
        contentItem['src_ref'] = src_ref
        return contentItem
def parseArticle(self, response):
    """Parse a NetEase (网易) article page into a ContentItem.

    Combines inline page styles with downloaded/cached stylesheets,
    extracts tags and source attribution, rebuilds the body inside the
    site's own container, rewrites protocol-relative image URLs, removes
    img alt/title text, and returns a populated ContentItem
    (src_source_id=9).

    :param response: scrapy Response; meta carries 'src_channel',
                     'sub_channel', 'title', 'post_date', 'source_url'.
    :returns: ContentItem, or None when the content container is empty.
    """
    body = EncodeUtil.toUnicode(response.body)
    # Dead branch: disabled placeholder for a ban check.
    if False:
        self.logDao.info(u'访问过多被禁止')
    else:
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        title = response.meta['title']
        post_date = response.meta['post_date']
        source_url = response.meta['source_url']
        self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' + source_url)
        selector = Selector(text=body)
        # CSS embedded directly in the page.
        pageStyles = selector.xpath('//style/text()').extract()
        # Linked stylesheets, downloaded once and cached under md5(url).
        styleUrls = selector.xpath('//link[@rel="stylesheet"]/@href').extract()
        styleList = []
        for styleUrl in styleUrls:
            # Protocol-relative URL -> force http.
            if styleUrl.startswith('//'):
                styleUrl = 'http:' + styleUrl
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and remember.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        # Inline styles first, then downloaded ones.
        styleList = pageStyles + styleList
        # Compress/escape for storage inside a quoted string downstream.
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace('\\', '\\\\')
        # Neutralize url(...) references inside the styles.
        styles = CssUtil.clearUrl(styles)
        tags = selector.xpath('//meta[@name="keywords"]/@content').extract_first('')
        # Source attribution sits in an <i> inside the title block.
        src_ref = selector.xpath('//*[@class="m-title f-pr"]/h2[@class="f-ff1 f-fwn f-fs14"]/i//text()').extract_first('')
        content_html = selector.xpath('//div[@class="m-text"]')
        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Keep only child nodes that are not script/style/iframe.
        # Fuller example of this filter pattern:
        # content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
        content_items = content_html.xpath('*[not(name(.)="script") and not(name(.)="style") and not(name(.)="iframe")]')
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Plain-text version: one line per kept top-level node.
        content_txt = []
        for item in content_items:
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap the filtered children in the site's container markup.
        outHtml = """<div class="m-text">${++content++}</div>"""
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++content++}', content_items)
        selector = Selector(text=content_html)
        # Collect image URLs and rewrite protocol-relative ones in place.
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            image_url_base = img.xpath('@src').extract_first('')
            if image_url_base.startswith('//'):
                image_url = 'http:' + image_url_base
            else:
                image_url = image_url_base
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
                content_html = content_html.replace(image_url_base, image_url)
        # Persist the raw page keyed by md5 of the URL.
        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        self.saveFile(urlHash, body)
        # Dedup key is the source URL.
        hash_code = self.checkDao.getHashCode(source_url)
        # Strip non-blank img alt/title values from the stored HTML.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = sub_channel
        contentItem['post_user'] = ''
        contentItem['tags'] = tags
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        # src_source_id 9 identifies NetEase in the sink.
        contentItem['src_source_id'] = 9
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = src_channel
        contentItem['src_ref'] = src_ref
        return contentItem