Example No. 1
	def fetcharticle2(self, url, decoder):
		#url = self.http_daili % url[7:]
		opener = URLOpener(self.host, timeout=self.timeout)
		result = opener.open(url)
		print result.realurl
		status_code, content = result.code, result.content
		if status_code != 200 or not content:
			self.log.warn('fetch article failed(%d):%s.' % (status_code,url))
			return None
		soup = BeautifulSoup(content,'lxml')
		cont = soup.findAll(attrs={"align":"right"})
		url = cont[0].a['href']

		url = self.trueURL_zzh(url)
		#the real article url
		result = opener.open(url)
		status_code, content = result.code, result.content
		if status_code != 200 or not content:
			self.log.warn('fetch article failed(%d):%s.' % (status_code,url))
			return None

		if self.page_encoding:
			return content.decode(self.page_encoding)
		else:
			return decoder.decode(content,url)
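
Example No. 1 and the other fetcharticle variants below all follow the same fetch-and-decode pattern: open the URL, check the status code, then decode with a configured encoding or fall back to auto-detection. A minimal standalone sketch of that pattern is shown here; the URLOpener import path is copied from Example No. 29, while the AutoDecoder import path and the fetch_page helper are assumptions, not part of the original project.

#A minimal sketch of the shared fetch-and-decode pattern (assumed helper, not
#part of the original code). result.code, result.content and opener.realurl
#are the attributes used throughout the examples on this page.
from lib.url_req import URLOpener        #path taken from Example No. 29
from lib.autodecoder import AutoDecoder  #assumed path

def fetch_page(url, host=None, timeout=30, page_encoding=None):
    """Fetch url and return its content as unicode, or None on failure."""
    opener = URLOpener(host, timeout=timeout)
    result = opener.open(url)
    if result.code != 200 or not result.content:
        return None
    if page_encoding:
        return result.content.decode(page_encoding)
    #no fixed encoding configured: auto-detect, keyed by the final URL
    return AutoDecoder(False).decode(result.content, opener.realurl)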
Example No. 2
	def ParseFeedUrls(self):
		urls = []
		urladded = set()
		url = self.feeds[0][1]
		section = self.feeds[0][0]
		opener = URLOpener(self.host, timeout=self.timeout)
		result = opener.open(url)
		if result.code == 200 and result.content:
			content = result.content.decode(self.feed_encoding)
			soup = BeautifulSoup(content, "lxml")
			tag_a = soup.find_all('a')
			href = tag_a[1]['href']
			temp_url = href[0:6]
			url = 'http://www.housebook.com.cn/'+ href
			result = opener.open(url)
			if result.code != 200:
				self.log.warn('fetch rss failed:%s' % url)
				return []
			content = result.content.decode(self.feed_encoding)
			soup = BeautifulSoup(content, "lxml")
			tag_a = soup.find_all('a')
			for art in tag_a:
				if art['href'] == '../main.htm':
					continue
				urlfeed = 'http://www.housebook.com.cn/' + temp_url +'/' +art['href']
				title = art.text
				urls.append((section, title, urlfeed, None))
				urladded.add(urlfeed)
		else:
			self.log.warn('fetch rss failed(%d):%s'%(result.code,url))
		return urls
Example No. 3
    def ParseFeedUrls(self):
        #Parse the XML and return the relevant info
        """ return list like [(section,title,url,desc),..] """
        urls = []
        tnow = datetime.datetime.utcnow()
        urladded = set()

        for feed in self.feeds:
            section, url = feed[0], feed[1]
            isfulltext = feed[2] if len(feed) > 2 else False
            timeout = self.timeout + 10 if isfulltext else self.timeout
            opener = URLOpener(self.host, timeout=timeout)
            result = opener.open(url)

            if result.code == 200 and result.content:
                if self.feed_encoding:
                    content = result.content.decode(self.feed_encoding)
                else:
                    content = AutoDecoder(True).decode(result.content, url)
                feed = feedparser.parse(content)  #parse the feed content

                #walk through the parsed entries
                for e in feed['entries'][:self.max_articles_per_feed]:  #limit to max_articles_per_feed
                    if self.oldest_article > 0 and hasattr(e, 'updated_parsed'):  #entry carries an update time
                        updated = e.updated_parsed
                        if updated:
                            delta = tnow - datetime.datetime(*(updated[0:6]))
                            #skip articles older than oldest_article days
                            if delta.days * 86400 + delta.seconds > 86400 * self.oldest_article:
                                self.log.info("Skip old article: %s" % e.link)
                                continue
                    #keep HTTPS if the feed address itself uses HTTPS
                    urlfeed = e.link.replace(
                        'http://',
                        'https://') if url.startswith('https://') else e.link
                    if urlfeed in urladded:
                        continue

                    desc = None
                    if isfulltext:
                        if hasattr(e, 'content') and e.content[0]['value']:
                            desc = e.content[0]['value']
                        elif hasattr(e, 'description'):
                            desc = e.description
                        else:
                            self.log.warn(
                                'fulltext feed item has no desc, link to webpage for article.(%s)'
                                % e.title)
                    urls.append((section, e.title, urlfeed, desc))
                    urladded.add(urlfeed)
            else:
                self.log.warn('fetch rss failed(%d):%s' % (result.code, url))

        return urls
Example No. 4
	def fetcharticle(self, url, decoder):
		opener = URLOpener(self.host, timeout=self.timeout)
		result = opener.open(url)
		status_code, content = result.code, result.content
		if status_code != 200 or not content:
			self.log.warn('fetch article failed(%d):%s.' % (status_code,url))
			return None

		if self.page_encoding:
			return content.decode(self.page_encoding)
		else:
			return decoder.decode(content,url)
Example No. 5
    def fetcharticle(self, url, decoder):
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        status_code, content = result.code, result.content
        if status_code != 200 or not content:
            self.log.warn('fetch article failed(%d):%s.' % (status_code, url))
            return None

        if self.page_encoding:
            return content.decode(self.page_encoding)
        else:
            return decoder.decode(content, url)
Example No. 6
	def ParseFeedUrls(self):
		#Parse the XML and return the relevant info
		""" return list like [(section,title,url,desc),..] """
		urls = []
		tnow = datetime.datetime.utcnow()
		urladded = set()

		for feed in self.feeds:
			section,url = feed[0],feed[1]
			isfulltext = feed[2] if len(feed)>2 else False
			timeout = self.timeout+10 if isfulltext else self.timeout
			opener = URLOpener(self.host,timeout=timeout)
			result = opener.open(url)

			if result.code == 200 and result.content:
				if self.feed_encoding:
					content = result.content.decode(self.feed_encoding)
				else:
					content = AutoDecoder(True).decode(result.content,url)
				feed = feedparser.parse(content)  #parse the feed content

				#walk through the parsed entries
				for e in feed['entries'][:self.max_articles_per_feed]:#limit to max_articles_per_feed
					if self.oldest_article > 0 and hasattr(e,'updated_parsed'):#entry carries an update time
						updated = e.updated_parsed
						if updated:
							delta = tnow - datetime.datetime(*(updated[0:6]))
							#skip articles older than oldest_article days
							if delta.days*86400+delta.seconds > 86400*self.oldest_article:
								self.log.info("Skip old article: %s" % e.link)
								continue
					#keep HTTPS if the feed address itself uses HTTPS
					urlfeed = e.link.replace('http://','https://') if url.startswith('https://') else e.link
					if urlfeed in urladded:
						continue

					desc = None
					if isfulltext:
						if hasattr(e,'content') and e.content[0]['value']:
							desc = e.content[0]['value']
						elif hasattr(e,'description'):
							desc = e.description
						else:
							self.log.warn('fulltext feed item has no desc, link to webpage for article.(%s)' % e.title)
					urls.append((section, e.title, urlfeed, desc))
					urladded.add(urlfeed)
			else:
				self.log.warn('fetch rss failed(%d):%s'%(result.code,url))

		return urls
Example No. 7
	def ParseFeedUrls(self):
		""" return list like [(section,title,url,desc),..] """
		mainurl = 'http://www.economist.com/printedition'
		urls = []
		urladded = set()
		opener = URLOpener(self.host, timeout=30)
		result = opener.open(mainurl)
		if result.code != 200:
			self.log.warn('fetch rss failed:%s'%mainurl)
			return []

		content = result.content.decode(self.feed_encoding)
		soup = BeautifulSoup(content, "lxml")
		#href=re.compile("elsie")
		for section in soup.find_all(id=re.compile("section-")):
			h4 = section.find('h4')
			if h4 is None:
				self.log.warn('h4 is empty')
				continue
			sectitle = string_of_tag(h4).strip()
			if not sectitle:
				self.log.warn('h4 string is empty')
				continue
			#self.log.info('Found section: %s' % section_title)
			articles = []
			subsection = ''
			for node in section.find_all(class_='article'):
				subsec = node.find('h5')
				if subsec is not None:
					subsection = string_of_tag(subsec)
				prefix = (subsection + ': ') if subsection else ''
				a = node.find('a', attrs={"href":True}, recursive=False)
				if a is not None:
					url = a['href']
					if url.startswith(r'/'):
						url = 'http://www.economist.com' + url
					url += '/print'
					title = string_of_tag(a)
					if title:
						title = prefix + title
						#self.log.info('\tFound article:%s' % title)
						if url not in urladded:
							urls.append((sectitle,title,url,None))
							urladded.add(url)
		if len(urls) == 0:
			self.log.warn('len of urls is zero.')
		return urls
Example No. 8
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        mainurl = 'http://www.economist.com/printedition'
        urls = []
        urladded = set()
        opener = URLOpener(self.host, timeout=30)
        result = opener.open(mainurl)
        if result.code != 200:
            self.log.warn('fetch rss failed:%s' % mainurl)
            return []

        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")
        #href=re.compile("elsie")
        for section in soup.find_all(id=re.compile("section-")):
            h4 = section.find('h4')
            if h4 is None:
                self.log.warn('h4 is empty')
                continue
            sectitle = string_of_tag(h4).strip()
            if not sectitle:
                self.log.warn('h4 string is empty')
                continue
            #self.log.info('Found section: %s' % section_title)
            articles = []
            subsection = ''
            for node in section.find_all(class_='article'):
                subsec = node.find('h5')
                if subsec is not None:
                    subsection = string_of_tag(subsec)
                prefix = (subsection + ': ') if subsection else ''
                a = node.find('a', attrs={"href": True}, recursive=False)
                if a is not None:
                    url = a['href']
                    if url.startswith(r'/'):
                        url = 'http://www.economist.com' + url
                    url += '/print'
                    title = string_of_tag(a)
                    if title:
                        title = prefix + title
                        #self.log.info('\tFound article:%s' % title)
                        if url not in urladded:
                            urls.append((sectitle, title, url, None))
                            urladded.add(url)
        if len(urls) == 0:
            self.log.warn('len of urls is zero.')
        return urls
Example No. 9
	def ParseFeedUrls(self):
		urls = []
		urladded = set()
		url = self.feeds[0][1]
		opener = URLOpener(self.host, timeout=self.timeout)
		result = opener.open(url)
		section = self.feeds[0][0]
		if result.code == 200 and result.content:
			soup = BeautifulSoup(result.content,'lxml')
			cont = soup.findAll('item')
			for con in cont:
				title = con.title.get_text()
				href = con.contents[2]
				urls.append((section, title, href, None))
		else:
			self.log.warn('fetch rss failed(%d):%s'%(result.code,url))
		return urls
Example No. 10
    def fetcharticle(self, url, decoder):
        """链接网页获取一篇文章"""
        if self.fulltext_by_instapaper and not self.fulltext_by_readability:
            #用instapaper进行初期的内容提取
            url = "http://www.instapaper.com/m?u=%s" % self.url_unescape(url)

        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        code, content = result.code, result.content
        if code != 200 or not content:
            self.log.warn('fetch article failed(%d):%s.' % (code, url))
            return None

        if self.page_encoding:
            return content.decode(self.page_encoding)
        else:
            return decoder.decode(content, url)
Example No. 11
	def ParseFeedUrls(self):
		urls = []
		urladded = set()
		url = self.feeds[0][1]
		opener = URLOpener(self.host, timeout=self.timeout)
		result = opener.open(url)
		section = self.feeds[0][0]
		if result.code == 200 and result.content:
			soup = BeautifulSoup(result.content,'lxml')
			cont = soup.findAll(attrs={"class":"feed_item_question"})
			for con in cont:
				title = con.a.get_text()
				href = con.a['href']
				urls.append((section, title, href, None))
		else:
			self.log.warn('fetch rss failed(%d):%s'%(result.code,url))
		return urls
Example No. 12
	def ParseFeedUrls(self):
		urls = []
		urladded = set()
		url = self.feeds[0][1]
		opener = URLOpener(self.host, timeout=self.timeout)
		result = opener.open(url)
		section = self.feeds[0][0]
		if result.code == 200 and result.content:
			soup = BeautifulSoup(result.content,'lxml')
			cont = soup.findAll(attrs={"class":"feed_item_question"})
			for con in cont:
				title = con.a.get_text()
				href = "http://chuansongme.com%s" % con.a['href']
				urls.append((section, title, href, None))
		else:
			self.log.warn('fetch rss failed(%d):%s'%(result.code,url))
		return urls
Example No. 13
	def fetcharticle(self,url,decoder):
		"""链接网页获取一篇文章"""
		if self.fulltext_by_instapaper and not self.fulltext_by_readability:
			#用instapaper进行初期的内容提取
			url = "http://www.instapaper.com/m?u=%s" % self.url_unescape(url)

		opener = URLOpener(self.host, timeout=self.timeout)
		result = opener.open(url)
		code, content = result.code, result.content
		if code != 200 or not content:
			self.log.warn('fetch article failed(%d):%s.' % (code,url))
			return None

		if self.page_encoding:
			return content.decode(self.page_encoding)
		else:
			return decoder.decode(content,url)
Example No. 14
	def ParseFeedUrls(self):
		urls = []
		urladded = set()
		url = self.feeds[0][1]
		opener = URLOpener(self.host, timeout=self.timeout)
		result = opener.open(url)
		section = self.feeds[0][0]
		if result.code == 200 and result.content:
			soup = BeautifulSoup(result.content,'lxml')
			cont = soup.findAll(attrs={"class":"field field-name-title field-type-ds field-label-hidden"})
			root_url = 'https://s3.amazonaws.com/pao-pao/%s'
			for con in cont:
				title = con.a.get_text()
				href = root_url % con.a['href']
				urls.append((section, title, href, None))
		else:
			self.log.warn('fetch rss failed(%d):%s'%(result.code,url))
		return urls
Example No. 15
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        url = self.feeds[0][1]
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        if result.code != 200 or not result.content:
            self.log.warn('fetch webpage failed(%d):%s.' % (result.code, url))
            return []

        if self.feed_encoding:
            try:
                content = result.content.decode(self.feed_encoding)
            except UnicodeDecodeError:
                content = AutoDecoder(False).decode(result.content,
                                                    opener.realurl)
        else:
            content = AutoDecoder(False).decode(result.content, opener.realurl)

        soup = BeautifulSoup(content, 'lxml')
        for article in soup.find_all('div', attrs={'class': 'post'}):
            title = article.find('a', attrs={'class': 'title'})
            if not title or not title.string.startswith(u'安邦'):
                continue

            #get the publication date
            pubdate = article.find('span', attrs={'class': 'date'})
            if not pubdate:
                continue
            mt = re.match(ur'(\d{4})年(\d{1,2})月(\d{1,2})日', pubdate.string)
            if not mt:
                continue
            pubdate = datetime.datetime(int(mt.group(1)), int(mt.group(2)),
                                        int(mt.group(3)))

            #decide whether the article should be pushed; the timezone is fixed to Beijing time (UTC+8)
            tnow = datetime.datetime.utcnow() + datetime.timedelta(hours=8)
            delta = tnow - pubdate
            if self.oldest_article > 0 and delta.days > self.oldest_article:
                continue

            urls.append((u'安邦咨询', title.string, title['href'], None))

        return urls
Example No. 16
	def ParseFeedUrls(self):
		urls = []
		urladded = set()
		url = self.feeds[0][1]
		opener = URLOpener(self.host, timeout=self.timeout)
		result = opener.open(url)
		if result.code == 200 and result.content:
			feed = json.loads(result.content.decode(self.feed_encoding))

			for partition,section in self.partitions:
				for item in feed[partition]:
					urlfeed = item['share_url']
					if urlfeed in urladded:
						self.log.info('skipped %s' % urlfeed)
						continue
					urls.append((section, item['title'], urlfeed, None))
					urladded.add(urlfeed)
		else:
			self.log.warn('fetch rss failed(%d):%s'%(result.code,url))
		return urls
Example No. 17
    def ParseFeedUrls(self):
        urls = []
        urladded = set()
        url = self.feeds[0][1]
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        if result.code == 200 and result.content:
            feed = json.loads(result.content.decode(self.feed_encoding))

            for partition, section in self.partitions:
                for item in feed[partition]:
                    urlfeed = item['share_url']
                    if urlfeed in urladded:
                        self.log.info('skipped %s' % urlfeed)
                        continue
                    urls.append((section, item['title'], urlfeed, None))
                    urladded.add(urlfeed)
        else:
            self.log.warn('fetch rss failed(%d):%s' % (result.code, url))
        return urls
Example No. 18
	def ParseFeedUrls(self):
		urls = []
		urladded = set()
		url = self.feeds[0][1]
		opener = URLOpener(self.host, timeout=self.timeout)
		result = opener.open(url)
		section = self.feeds[0][0]
		if result.code == 200 and result.content:
			soup = BeautifulSoup(result.content,'lxml')
			cont1 = soup.findAll("title")
			cont2 = soup.findAll("guid")
			nums = len(cont2)
			for i in range(nums):
				title = cont1[i+2].string
				href = cont2[i].string
				url = self.trueURL_zzh(href)
				urls.append((section, title, url, None))
		else:
			self.log.warn('fetch rss failed(%d):%s'%(result.code,url))
		return urls
Example No. 19
    def ParseFeedUrls(self):
        urls = []
        urladded = set()
        url = self.feeds[0][1]
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        section = self.feeds[0][0]
        if result.code == 200 and result.content:
            soup = BeautifulSoup(result.content, 'lxml')
            cont = soup.findAll(attrs={
                "class":
                "field field-name-title field-type-ds field-label-hidden"
            })
            root_url = 'https://s3.amazonaws.com/pao-pao/%s'
            for con in cont:
                title = con.a.get_text()
                href = root_url % con.a['href']
                urls.append((section, title, href, None))
        else:
            self.log.warn('fetch rss failed(%d):%s' % (result.code, url))
        return urls
Example No. 20
    def GET(self):
        code = web.input().get('code')
        client = Client(
            KEY_Q,
            SECRET_Q,
            site='https://graph.qq.com',
            authorize_url='https://graph.qq.com/oauth2.0/authorize',
            token_url='https://graph.qq.com/oauth2.0/token')

        if not code:
            try:
                authorize_url = client.auth_code.authorize_url(
                    redirect_uri=CALLBACK_Q, scope='get_user_info')
                web.seeother(authorize_url)
            except:
                raise web.seeother(r'/')
        else:
            try:
                access_token = client.auth_code.get_token(
                    code, redirect_uri=CALLBACK_Q, parse='query')
                url = "https://graph.qq.com/oauth2.0/me?access_token=%s" % access_token.token
                opener = URLOpener()
                result = opener.open(url)
                r_code, content = result.code, result.content
            except:
                raise web.seeother(r'/')
            if content.find('error') == 0:
                raise web.seeother(r'/')

            if content.find("callback") == 0:
                lp = content.find('(')
                rp = content.find(')')
                con = content[lp + 1:rp - 1]

                try:
                    data = json.loads(con)

                    openid = data['openid']
                    clientid = data['client_id']

                    url2 = "https://graph.qq.com/user/get_user_info?oauth_consumer_key=%s&access_token=%s&openid=%s&format=json" % (
                        KEY_Q, access_token.token, openid)

                    r2 = opener.open(url2)
                    content2 = r2.content
                    data2 = json.loads(content2)
                    ret = data2['ret']
                except:
                    raise web.seeother(r'/')
                if ret == 0:
                    #name = data2['nickname']+'('+openid[2:6]+')'
                    name = openid[2:6]
                    #already registered: log in
                    if model.isuser(name, 'qq') == 1:
                        session.login = 1
                        session.username = name
                        model.update_logintime(local_time(), name)
                        raise web.seeother(r'/')
                    else:
                        #not registered yet: register, log in, then return
                        #register
                        model.input_user(name, 'qq')
                        if model.isuser(name, 'qq') == 1:
                            session.login = 1
                            session.username = name
                            raise web.seeother(r'/my')
                        else:
                            return jjenv.get_template("register.html").render(
                                nickname='', title='Register', tips="")
                else:
                    raise web.seeother(r'/')
            else:
                raise web.seeother(r'/')
Example No. 21
	def GET(self):
		code = web.input().get('code')
		client = Client(KEY_Q, SECRET_Q,
				site='https://graph.qq.com',
				authorize_url='https://graph.qq.com/oauth2.0/authorize',
				token_url='https://graph.qq.com/oauth2.0/token')

		if not code:
			try:
				authorize_url = client.auth_code.authorize_url(redirect_uri=CALLBACK_Q,scope='get_user_info')
				web.seeother(authorize_url)
			except:
				raise web.seeother(r'/')
		else:
			try:
				access_token = client.auth_code.get_token(code, redirect_uri=CALLBACK_Q, parse='query')
				url = "https://graph.qq.com/oauth2.0/me?access_token=%s" % access_token.token;
				opener = URLOpener()
				result = opener.open(url)
				r_code, content = result.code, result.content
			except:
				raise web.seeother(r'/')
			if content.find('error') == 0:
				raise web.seeother(r'/')

			if content.find("callback") == 0:
				lp = content.find('(')
				rp = content.find(')')
				con = content[lp+1:rp-1]

				try:
					data = json.loads(con)

					openid = data['openid']
					clientid = data['client_id']

					url2 = "https://graph.qq.com/user/get_user_info?oauth_consumer_key=%s&access_token=%s&openid=%s&format=json" % (KEY_Q,access_token.token,openid)

					r2 = opener.open(url2)
					content2 =  r2.content
					data2 = json.loads(content2)
					ret = data2['ret']
				except:
					raise web.seeother(r'/')
				if ret == 0:
					#name = data2['nickname']+'('+openid[2:6]+')'
					name = openid[2:6]
					#already registered: log in
					if model.isuser(name,'qq') == 1:
						session.login = 1
						session.username = name
						model.update_logintime(local_time(),name)
						raise web.seeother(r'/')
					else:
						#not registered yet: register, log in, then return
						#register
						model.input_user(name,'qq')
						if model.isuser(name,'qq') == 1:
							session.login = 1
							session.username = name
							raise web.seeother(r'/my')
						else:
							return jjenv.get_template("register.html").render(nickname='',title='Register',tips="")
				else:
					raise web.seeother(r'/')
			else:
				raise web.seeother(r'/')
Example No. 22
encoding = chardet.detect(content)['encoding']
print encoding
result = content.decode(encoding)

netloc = urlparse.urlsplit(url)[1]

print netloc
r.set(netloc,encoding)

print r.get(netloc)
'''
#url='http://tech.sina.com.cn/internet/'
#url='http://tech.sina.com.cn/i/2014-01-08/08039077686.shtml'
#url='http://blog.knownsec.com/2012/04/about-content-encoding-gzip/'
url ='http://book.douban.com/review/6549990/'
zzh=URLOpener()
re=zzh.open(url)
#print re.info()
#print re.content.decode('GBK').encode('utf-8')
#print re.content
fout=open('zhang_test','wb')
fout.write(re.content)
fout.close()
'''
encoding = chardet.detect(re.content)['encoding']
print encoding
print re.headers
print isinstance(re.content,unicode)
print re.content.decode(encoding,'ignore').encode('utf-8')
'''
doc = readability.Document(re.content)
Example No. 23
encoding = chardet.detect(content)['encoding']
print encoding
result = content.decode(encoding)

netloc = urlparse.urlsplit(url)[1]

print netloc
r.set(netloc,encoding)

print r.get(netloc)
'''
#url='http://tech.sina.com.cn/internet/'
#url='http://tech.sina.com.cn/i/2014-01-08/08039077686.shtml'
#url='http://blog.knownsec.com/2012/04/about-content-encoding-gzip/'
url = 'http://book.douban.com/review/6549990/'
zzh = URLOpener()
re = zzh.open(url)
#print re.info()
#print re.content.decode('GBK').encode('utf-8')
#print re.content
fout = open('zhang_test', 'wb')
fout.write(re.content)
fout.close()
'''
encoding = chardet.detect(re.content)['encoding']
print encoding
print re.headers
print isinstance(re.content,unicode)
print re.content.decode(encoding,'ignore').encode('utf-8')
'''
doc = readability.Document(re.content)
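
In Examples No. 22 and 23 the object r is never defined in the excerpt; judging from the r.set(netloc, encoding) and r.get(netloc) calls, it is some key-value store used to cache the chardet-detected encoding per host. A minimal sketch of that idea follows, assuming a plain dict-backed cache (the original r could just as well be a redis client).

#Sketch of the per-host encoding cache implied by r.set(netloc, encoding) and
#r.get(netloc) above. EncodingCache and decode_with_cache are assumptions.
import urlparse
import chardet

class EncodingCache(object):
    def __init__(self):
        self._store = {}
    def set(self, key, value):
        self._store[key] = value
    def get(self, key):
        return self._store.get(key)

r = EncodingCache()

def decode_with_cache(url, content):
    """Decode content, reusing a previously detected encoding for this netloc."""
    netloc = urlparse.urlsplit(url)[1]
    encoding = r.get(netloc)
    if not encoding:
        encoding = chardet.detect(content)['encoding']
        r.set(netloc, encoding)
    return content.decode(encoding, 'ignore')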
Example No. 24
    def readability(self, article, url, opts=None):
        """ 使用readability-lxml处理全文信息 """
        content = self.preprocess(article)
        #		print '--------------'
        #		print content
        #		print '---------------'
        # extract the main content
        try:
            doc = readability.Document(content)
            summary = doc.summary(html_partial=True)
        except:
            self.log.warn('article is invalid.[%s]' % url)
            return

        title = doc.short_title()
        title = self.processtitle(title)
        #		print '=================='
        #		print summary
        #		print '==================='

        soup = BeautifulSoup(summary, 'lxml')
        #	soup = BeautifulSoup(content,'lxml')
        '''
		#no head element
		h = soup.find('head')
		if not h:
			h = soup.new_tag('head')
			t = soup.new_tag('title')
			t.string = title
			h.append(t)
			soup.html.insert(0,h)

		#no h1/h2 heading
		t = soup.html.body.find(['h1','h2'])
		if not t:
			t = soup.new_tag('h1')
			t.string = title
			soup.html.body.insert(0,t)
		else:
			totallen = 0
			for ps in t.previous_siblings:
				totallen += len(string_of_tag(ps))
				if totallen > 40:
					t = soup.new_tag('h1')
					t.string = title
					soup.html.body.insert(0,t)
					break
		'''
        self.soupbeforeimage(soup)

        if self.remove_tags:
            for tag in soup.find_all(self.remove_tags):
                tag.decompose()
        for id in self.remove_ids:
            for tag in soup.find_all(attrs={"id": id}):
                tag.decompose()
        for cls in self.remove_classes:
            for tag in soup.find_all(attrs={"class": cls}):
                tag.decompose()
        for attr in self.remove_attrs:
            for tag in soup.find_all(attrs={attr: True}):
                del tag[attr]
        for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        if self.extra_css:
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            soup.html.head.append(sty)

        if self.keep_image:
            opener = URLOpener(self.host, timeout=self.timeout)
            for img in soup.find_all('img', attrs={'src': True}):
                imgurl = img['src']
                if img.get('height') in ('1','2','3','4','5') \
                 or img.get('width') in ('1','2','3','4','5'):
                    self.log.warn('img size too small,take it away : %s' %
                                  imgurl)
                    img.decompose()
                    continue
                if not imgurl.startswith('http'):
                    imgurl = self.urljoin(url, imgurl)
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    imgurl = imgurl.replace('http://', 'https://')
                if self.isfiltered(imgurl):
                    self.log.warn('img filtered : %s' % imgurl)
                    img.decompose()
                    continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(
                    imgresult.content, opts) if imgresult.code == 200 else None
                if imgcontent:
                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype
                                              == 'jpeg' else imgtype)
                        img['src'] = fnimg
                        yield (imgmime, imgurl, fnimg, imgcontent, None)
                    else:
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(err:%d):%s' %
                                  (imgresult.code, imgurl))
                    img.decompose()
            #strip links wrapping images
            for img in soup.find_all('img'):
                if img.parent and img.parent.parent and \
                 img.parent.name == 'a':
                    img.parent.replace_with(img)
        else:
            for img in soup.find_all('img'):
                img.decompose()

        self.soupprocessex(soup)
        #		print '====-=-=-=-=-=-=-='
        #		print soup
        #		print '-=-=-=-=-=-=-=-=-=-=-'
        cc = soup.body.contents[0]
        #		cc.name = "articleblock"
        #		print cc
        #		print soup.body.renderContents()
        #content = unicode(soup)
        content = unicode(cc)

        #print soup.find('body').contents
        #print soup.body.contents

        #use the beginning of the article as the summary
        brief = u''
        if GENERATE_TOC_DESC:
            body = soup.find('body')
            for h in body.find_all(['h1', 'h2']):  # drop h1/h2 to avoid duplicating the title
                h.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break

        soup = None
        yield (title, None, None, content, brief)
Example No. 25
	def Items(self,opts=None):
		decoder = AutoDecoder(False)
		timeout = self.timeout
		for section ,url in self.feeds:
			opener = URLOpener(self.host,timeout=timeout)
			result = opener.open(url)
			code ,content = result.code,result.content
			if code!=200 or not content:
				self.log.warn('fetch article failed(%d):%s.' % (code,url))
				continue

			if self.page_encoding:
				try:
					content = content.decode(self.page_encoding)
				except UnicodeDecodeError:
					content = decoder.decode(content,opener.realurl)
			else:
				content = decoder.decode(content,opener.realurl)

			content = self.preprocess(content)
			soup = BeautifulSoup(content, "lxml")

			h=soup.find('head')
			if not h:
				h = soup.new_tag('head')
				t = soup.new_tag('title')
				t.string = section
				h.append(t)
				soup.html.insert(0, h)
			try:
				title = soup.html.head.title.string
			except AttributeError:
				title = section

			title = self.processtitle(title)

			if self.keep_only_tags:
				body = soup.new_tag('body')
				try:
					if isinstance(self.keep_only_tags,dict):
						keep_only_tags= [self.keep_only_tags]
					else:
						keep_only_tags = self.keep_only_tags
					for spec in keep_only_tags:
						for tag in soup.find('body').find_all(**spec):
							body.insert(len(body.contents), tag)
					soup.find('body').replace_with(body)
				except AttributeError: # soup has no body element
					pass


			for spec in self.remove_tags_after:
				tag = soup.find(**spec)
				remove_beyond(tag, 'next_sibling')

			for spec in self.remove_tags_before:
				tag = soup.find(**spec)
				remove_beyond(tag, 'previous_sibling')

			remove_tags = self.insta_remove_tags + self.remove_tags
			remove_ids = self.insta_remove_ids + self.remove_ids
			remove_classes = self.insta_remove_classes + self.remove_classes
			remove_attrs = self.insta_remove_attrs + self.remove_attrs
			for tag in soup.find_all(remove_tags):
				tag.decompose()
			for id in remove_ids:
				for tag in soup.find_all(attrs={"id":id}):
					tag.decompose()
			for cls in remove_classes:
				for tag in soup.find_all(attrs={"class":cls}):
					tag.decompose()
			for attr in remove_attrs:
				for tag in soup.find_all(attrs={attr:True}):
					del tag[attr]
			for cmt in soup.find_all(text=lambda text:isinstance(text, Comment)):
				cmt.extract()

			if self.extra_css:
				sty = soup.new_tag('style', type="text/css")
				sty.string = self.extra_css
				soup.html.head.append(sty)

			if self.keep_image:
				self.soupbeforeimage(soup)
				for img in soup.find_all('img',attrs={'src':True}):
					imgurl = img['src']
					if img.get('height') in ('1','2','3','4','5') \
						or img.get('width') in ('1','2','3','4','5'):
						self.log.warn('img size too small, take it away:%s' % imgurl)
						img.decompose()
						continue
					if not imgurl.startswith('http'):
						imgurl = self.urljoin(url, imgurl)
					if self.fetch_img_via_ssl and url.startswith('https://'):
						imgurl = imgurl.replace('http://', 'https://')
					if self.isfiltered(imgurl):
						self.log.warn('img filtered:%s' % imgurl)
						img.decompose()
						continue

					imgresult = opener.open(imgurl)
					imgcontent = self.process_image(imgresult.content,opts) if imgresult.code==200 else None
					if imgcontent:
						imgtype = imghdr.what(None, imgcontent)
						if imgtype:
							imgmime = r"image/" + imgtype
							fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype=='jpeg' else imgtype)
							img['src'] = fnimg
							yield (imgmime, imgurl, fnimg, imgcontent, None)
						else:
							img.decompose()
					else:
						self.log.warn('fetch img failed(err:%d):%s' % (imgresult.code,imgurl))
						img.decompose()

				for img in soup.find_all('img'):
					if img.parent and img.parent.parent and img.parent.name == 'a':
						img.parent.replace_with(img)
			else:
				for img in soup.find_all('img'):
					img.decompose()

			self.soupprocessex(soup)
			content = unicode(soup)


			brief = u''
			if GENERATE_TOC_DESC:
				body = soup.find('body')
				for h in body.find_all(['h1','h2']): # drop h1/h2 to avoid duplicating the title
					h.decompose()
				for s in body.stripped_strings:
					brief += unicode(s) + u' '
					if len(brief) >= TOC_DESC_WORD_LIMIT:
						brief = brief[:TOC_DESC_WORD_LIMIT]
						break

			soup = None
			content =  self.postprocess(content)
			yield (section, url, title, content, brief)
Example No. 26
	def readability_by_soup(self,article,url,opts=None):
		content = self.preprocess(article)
		soup = BeautifulSoup(content,"lxml")

		try:
			title = soup.html.head.title.string
		except AttributeError:
			self.log.warn('object soup invalid!(%s)'%url)
			return

		title = self.processtitle(title)
		soup.html.head.title.string = title

		if self.keep_only_tags:
			body = soup.new_tag('body')
			try:
				if isinstance(self.keep_only_tags, dict):
					keep_only_tags = [self.keep_only_tags]
				else:
					keep_only_tags = self.keep_only_tags
				for spec in keep_only_tags:
					for tag in soup.find('body').find_all(**spec):
						body.insert(len(body.contents), tag)
				soup.find('body').replace_with(body)
			except AttributeError:
				pass

		for spec in self.remove_tags_after:
			tag = soup.find(**spec)
			remove_beyond(tag, 'next_sibling')

		for spec in self.remove_tags_before:
			tag = soup.find(**spec)
			remove_beyond(tag, 'previous_sibling')

		remove_tags = self.insta_remove_tags + self.remove_tags
		remove_ids = self.insta_remove_ids + self.remove_ids
		remove_classes = self.insta_remove_classes + self.remove_classes
		remove_attrs = self.insta_remove_attrs + self.remove_attrs

		for tag in soup.find_all(remove_tags):
			tag.decompose()
		for id in remove_ids:
			for tag in soup.find_all(attrs={"id":id}):
				tag.decompose()
		for cls in remove_classes:
			for tag in soup.find_all(attrs={"class":cls}):
				tag.decompose()
		for attr in remove_attrs:
			for tag in soup.find_all(attrs={attr:True}):
				del tag[attr]
		for cmt in soup.find_all(text=lambda text:isinstance(text, Comment)):
			cmt.extract()

		if self.extra_css:
			sty = soup.new_tag('style', type="text/css")
			sty.string = self.extra_css
			soup.html.head.append(sty)

		if self.keep_image:
			opener = URLOpener(self.host, timeout=self.timeout)
			self.soupbeforeimage(soup)
			for img in soup.find_all('img',attrs={'src':True}):
				imgurl = img['src']
				if img.get('height') in ('1','2','3','4','5') \
					or img.get('width') in ('1','2','3','4','5'):
					self.log.warn('img size too small, take it away:%s' % imgurl)
					img.decompose()
					continue
				if not imgurl.startswith('http'):
					imgurl = self.urljoin(url, imgurl)
					print url
					print imgurl
				if self.fetch_img_via_ssl and url.startswith('https://'):
					imgurl = imgurl.replace('http://', 'https://')
				if self.isfiltered(imgurl):
					self.log.warn('img filtered:%s' % imgurl)
					img.decompose()
					continue
				imgresult = opener.open(imgurl)
				imgcontent = self.process_image(imgresult.content,opts) if imgresult.code==200 else None
				if imgcontent:
					imgtype = imghdr.what(None, imgcontent)
					if imgtype:
						imgmime = r"image/" + imgtype
						fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype=='jpeg' else imgtype)
						img['src'] = fnimg
						yield (imgmime, imgurl, fnimg, imgcontent, None)
					else:
						img.decompose()
				else:
					self.log.warn('fetch img failed(err:%d):%s' % (imgresult.code,imgurl))
					img.decompose()

			for img in soup.find_all('img'):#strip links wrapping images
				if img.parent and img.parent.parent and \
					img.parent.name == 'a':
					img.parent.replace_with(img)

		else:
			for img in soup.find_all('img'):
				img.decompose()

		#add a title heading if the content has none
		t = soup.html.body.find(['h1','h2'])
		if not t:
			t = soup.new_tag('h1')
			t.string = title
			soup.html.body.insert(0, t)
		else:
			totallen = 0
			for ps in t.previous_siblings:
				totallen += len(string_of_tag(ps))
				if totallen > 40: #this H1/H2 appears mid-article, so it is not the article title
					t = soup.new_tag('h1')
					t.string=title
					soup.html.body.insert(0, t)
					break

		self.soupprocessex(soup)
		content = unicode(soup)

		#use the beginning of the article as the summary
		brief = u''
		if GENERATE_TOC_DESC:
			body = soup.find('body')
			for h in body.find_all(['h1','h2']): # drop h1/h2 to avoid duplicating the title
				h.decompose()
			for s in body.stripped_strings:
				brief += unicode(s) + u' '
				if len(brief) >= TOC_DESC_WORD_LIMIT:
					brief = brief[:TOC_DESC_WORD_LIMIT]
					break
		soup = None

		yield (title, None, None, content, brief)
Example No. 27
	def readability(self,article,url,opts=None):
		""" 使用readability-lxml处理全文信息 """
		content = self.preprocess(article)
#		print '--------------'
#		print content
#		print '---------------'
		# extract the main content
		try:
			doc = readability.Document(content)
			summary = doc.summary(html_partial=True)
		except:
			self.log.warn('article is invalid.[%s]' % url)
			return

		title = doc.short_title()
		title = self.processtitle(title)
#		print '=================='
#		print summary
#		print '==================='

		soup = BeautifulSoup(summary,'lxml')
#	soup = BeautifulSoup(content,'lxml')
		'''
		#no head element
		h = soup.find('head')
		if not h:
			h = soup.new_tag('head')
			t = soup.new_tag('title')
			t.string = title
			h.append(t)
			soup.html.insert(0,h)

		#no h1/h2 heading
		t = soup.html.body.find(['h1','h2'])
		if not t:
			t = soup.new_tag('h1')
			t.string = title
			soup.html.body.insert(0,t)
		else:
			totallen = 0
			for ps in t.previous_siblings:
				totallen += len(string_of_tag(ps))
				if totallen > 40:
					t = soup.new_tag('h1')
					t.string = title
					soup.html.body.insert(0,t)
					break
		'''
		self.soupbeforeimage(soup)

		if self.remove_tags:
			for tag in soup.find_all(self.remove_tags):
				tag.decompose()
		for id in self.remove_ids:
			for tag in soup.find_all(attrs={"id":id}):
				tag.decompose()
		for cls in self.remove_classes:
			for tag in soup.find_all(attrs={"class":cls}):
				tag.decompose()
		for attr in self.remove_attrs:
			for tag in soup.find_all(attrs={attr:True}):
				del tag[attr]
		for cmt in soup.find_all(text=lambda text:isinstance(text,Comment)):
			cmt.extract()

		if self.extra_css:
			sty = soup.new_tag('style', type="text/css")
			sty.string = self.extra_css
			soup.html.head.append(sty)

		if self.keep_image:
			opener = URLOpener(self.host, timeout=self.timeout)
			for img in soup.find_all('img',attrs={'src':True}):
				imgurl = img['src']
				if img.get('height') in ('1','2','3','4','5') \
					or img.get('width') in ('1','2','3','4','5'):
					self.log.warn('img size too small,take it away : %s' % imgurl)
					img.decompose()
					continue
				if not imgurl.startswith('http'):
					imgurl = self.urljoin(url, imgurl)
				if self.fetch_img_via_ssl and url.startswith('https://'):
					imgurl = imgurl.replace('http://', 'https://')
				if self.isfiltered(imgurl):
					self.log.warn('img filtered : %s' % imgurl)
					img.decompose()
					continue
				imgresult = opener.open(imgurl)
				imgcontent = self.process_image(imgresult.content,opts) if imgresult.code==200 else None
				if imgcontent:
					imgtype = imghdr.what(None, imgcontent)
					if imgtype:
						imgmime = r"image/" + imgtype
						fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype=='jpeg' else imgtype)
						img['src'] = fnimg
						yield (imgmime, imgurl, fnimg, imgcontent, None)
					else:
						img.decompose()
				else:
					self.log.warn('fetch img failed(err:%d):%s' % (imgresult.code,imgurl))
					img.decompose()
			#strip links wrapping images
			for img in soup.find_all('img'):
				if img.parent and img.parent.parent and \
					img.parent.name == 'a':
					img.parent.replace_with(img)
		else:
			for img in soup.find_all('img'):
				img.decompose()


		self.soupprocessex(soup)
#		print '====-=-=-=-=-=-=-='
#		print soup
#		print '-=-=-=-=-=-=-=-=-=-=-'
		cc = soup.body.contents[0]
#		cc.name = "articleblock"
#		print cc
#		print soup.body.renderContents()
		#content = unicode(soup)
		content = unicode(cc)

#print soup.find('body').contents
#print soup.body.contents

		#use the beginning of the article as the summary
		brief = u''
		if GENERATE_TOC_DESC:
			body = soup.find('body')
			for h in body.find_all(['h1','h2']): # drop h1/h2 to avoid duplicating the title
				h.decompose()
			for s in body.stripped_strings:
				brief += unicode(s) + u' '
				if len(brief) >= TOC_DESC_WORD_LIMIT:
					brief = brief[:TOC_DESC_WORD_LIMIT]
					break

		soup = None
		yield (title, None, None, content, brief)
Example No. 28
    def readability_by_soup(self, article, url, opts=None):
        content = self.preprocess(article)
        soup = BeautifulSoup(content, "lxml")

        try:
            title = soup.html.head.title.string
        except AttributeError:
            self.log.warn('object soup invalid!(%s)' % url)
            return

        title = self.processtitle(title)
        soup.html.head.title.string = title

        if self.keep_only_tags:
            body = soup.new_tag('body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    keep_only_tags = [self.keep_only_tags]
                else:
                    keep_only_tags = self.keep_only_tags
                for spec in keep_only_tags:
                    for tag in soup.find('body').find_all(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replace_with(body)
            except AttributeError:
                pass

        for spec in self.remove_tags_after:
            tag = soup.find(**spec)
            remove_beyond(tag, 'next_sibling')

        for spec in self.remove_tags_before:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previous_sibling')

        remove_tags = self.insta_remove_tags + self.remove_tags
        remove_ids = self.insta_remove_ids + self.remove_ids
        remove_classes = self.insta_remove_classes + self.remove_classes
        remove_attrs = self.insta_remove_attrs + self.remove_attrs

        for tag in soup.find_all(remove_tags):
            tag.decompose()
        for id in remove_ids:
            for tag in soup.find_all(attrs={"id": id}):
                tag.decompose()
        for cls in remove_classes:
            for tag in soup.find_all(attrs={"class": cls}):
                tag.decompose()
        for attr in remove_attrs:
            for tag in soup.find_all(attrs={attr: True}):
                del tag[attr]
        for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        if self.extra_css:
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            soup.html.head.append(sty)

        if self.keep_image:
            opener = URLOpener(self.host, timeout=self.timeout)
            self.soupbeforeimage(soup)
            for img in soup.find_all('img', attrs={'src': True}):
                imgurl = img['src']
                if img.get('height') in ('1','2','3','4','5') \
                 or img.get('width') in ('1','2','3','4','5'):
                    self.log.warn('img size too small, take it away:%s' %
                                  imgurl)
                    img.decompose()
                    continue
                if not imgurl.startswith('http'):
                    imgurl = self.urljoin(url, imgurl)
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    imgurl = imgurl.replace('http://', 'https://')
                if self.isfiltered(imgurl):
                    self.log.warn('img filtered:%s' % imgurl)
                    img.decompose()
                    continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(
                    imgresult.content, opts) if imgresult.code == 200 else None
                if imgcontent:
                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype
                                              == 'jpeg' else imgtype)
                        img['src'] = fnimg
                        yield (imgmime, imgurl, fnimg, imgcontent, None)
                    else:
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(err:%d):%s' %
                                  (imgresult.code, imgurl))
                    img.decompose()

            for img in soup.find_all('img'):  #strip links wrapping images
                if img.parent and img.parent.parent and \
                 img.parent.name == 'a':
                    img.parent.replace_with(img)

        else:
            for img in soup.find_all('img'):
                img.decompose()

        #add a title heading if the content has none
        t = soup.html.body.find(['h1', 'h2'])
        if not t:
            t = soup.new_tag('h1')
            t.string = title
            soup.html.body.insert(0, t)
        else:
            totallen = 0
            for ps in t.previous_siblings:
                totallen += len(string_of_tag(ps))
                if totallen > 40:  #this H1/H2 appears mid-article, so it is not the article title
                    t = soup.new_tag('h1')
                    t.string = title
                    soup.html.body.insert(0, t)
                    break

        self.soupprocessex(soup)
        content = unicode(soup)

        #use the beginning of the article as the summary
        brief = u''
        if GENERATE_TOC_DESC:
            body = soup.find('body')
            for h in body.find_all(['h1', 'h2']):  # drop h1/h2 to avoid duplicating the title
                h.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break
        soup = None

        yield (title, None, None, content, brief)
Example No. 29
from lib.img import rescale_image
from lib.url_req import URLOpener
import os

#rescale_image(data, maxsizeb=4000000, dimen=None, png2jpg=False,     graying=True, reduceto=(600,800)):


test = URLOpener().open('http://img.xinjunshi.com/uploads/allimg/140224/11-140224101225.jpg')
#test=URLOpener().open('http://www.sucaitianxia.com/d/file/20131222/28caa29d1ddad3c085035e024a9f0b02.png')
con = test.content

con = rescale_image(con,reduceto=(400,600),graying=False)
fout = open('zzh.jpg', "wb")
fout.write(con)
fout.close()

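Example No. 29 is a throwaway test script. A slightly more reusable sketch of the same download-and-rescale flow is given below; the rescale_image keyword arguments are the ones quoted in the signature comment above, while the function name, error handling and output path handling are additions.

#Sketch: download an image and rescale it, based on Example No. 29.
#fetch_and_rescale is an assumed helper, not part of the original code.
from lib.img import rescale_image
from lib.url_req import URLOpener

def fetch_and_rescale(url, outpath, reduceto=(600, 800), graying=True):
    result = URLOpener().open(url)
    if result.code != 200 or not result.content:
        return False
    data = rescale_image(result.content, reduceto=reduceto, graying=graying)
    with open(outpath, 'wb') as fout:
        fout.write(data)
    return True
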
Example No. 30
    def Items(self, opts=None):
        decoder = AutoDecoder(False)
        timeout = self.timeout
        for section, url in self.feeds:
            opener = URLOpener(self.host, timeout=timeout)
            result = opener.open(url)
            code, content = result.code, result.content
            if code != 200 or not content:
                self.log.warn('fetch article failed(%d):%s.' % (code, url))
                continue

            if self.page_encoding:
                try:
                    content = content.decode(self.page_encoding)
                except UnicodeDecodeError:
                    content = decoder.decode(content, opener.realurl)
            else:
                content = decoder.decode(content, opener.realurl)

            content = self.preprocess(content)
            soup = BeautifulSoup(content, "lxml")

            h = soup.find('head')
            if not h:
                h = soup.new_tag('head')
                t = soup.new_tag('title')
                t.string = section
                h.append(t)
                soup.html.insert(0, h)
            try:
                title = soup.html.head.title.string
            except AttributeError:
                title = section

            title = self.processtitle(title)

            if self.keep_only_tags:
                body = soup.new_tag('body')
                try:
                    if isinstance(self.keep_only_tags, dict):
                        keep_only_tags = [self.keep_only_tags]
                    else:
                        keep_only_tags = self.keep_only_tags
                    for spec in keep_only_tags:
                        for tag in soup.find('body').find_all(**spec):
                            body.insert(len(body.contents), tag)
                    soup.find('body').replace_with(body)
                except AttributeError:  # soup has no body element
                    pass

            for spec in self.remove_tags_after:
                tag = soup.find(**spec)
                remove_beyond(tag, 'next_sibling')

            for spec in self.remove_tags_before:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previous_sibling')

            remove_tags = self.insta_remove_tags + self.remove_tags
            remove_ids = self.insta_remove_ids + self.remove_ids
            remove_classes = self.insta_remove_classes + self.remove_classes
            remove_attrs = self.insta_remove_attrs + self.remove_attrs
            for tag in soup.find_all(remove_tags):
                tag.decompose()
            for id in remove_ids:
                for tag in soup.find_all(attrs={"id": id}):
                    tag.decompose()
            for cls in remove_classes:
                for tag in soup.find_all(attrs={"class": cls}):
                    tag.decompose()
            for attr in remove_attrs:
                for tag in soup.find_all(attrs={attr: True}):
                    del tag[attr]
            for cmt in soup.find_all(
                    text=lambda text: isinstance(text, Comment)):
                cmt.extract()

            if self.extra_css:
                sty = soup.new_tag('style', type="text/css")
                sty.string = self.extra_css
                soup.html.head.append(sty)

            if self.keep_image:
                self.soupbeforeimage(soup)
                for img in soup.find_all('img', attrs={'src': True}):
                    imgurl = img['src']
                    if img.get('height') in ('1','2','3','4','5') \
                     or img.get('width') in ('1','2','3','4','5'):
                        self.log.warn('img size too small, take it away:%s' %
                                      imgurl)
                        img.decompose()
                        continue
                    if not imgurl.startswith('http'):
                        imgurl = self.urljoin(url, imgurl)
                    if self.fetch_img_via_ssl and url.startswith('https://'):
                        imgurl = imgurl.replace('http://', 'https://')
                    if self.isfiltered(imgurl):
                        self.log.warn('img filtered:%s' % imgurl)
                        img.decompose()
                        continue

                    imgresult = opener.open(imgurl)
                    imgcontent = self.process_image(
                        imgresult.content,
                        opts) if imgresult.code == 200 else None
                    if imgcontent:
                        imgtype = imghdr.what(None, imgcontent)
                        if imgtype:
                            imgmime = r"image/" + imgtype
                            fnimg = "img%d.%s" % (self.imgindex,
                                                  'jpg' if imgtype == 'jpeg'
                                                  else imgtype)
                            img['src'] = fnimg
                            yield (imgmime, imgurl, fnimg, imgcontent, None)
                        else:
                            img.decompose()
                    else:
                        self.log.warn('fetch img failed(err:%d):%s' %
                                      (imgresult.code, imgurl))
                        img.decompose()

                for img in soup.find_all('img'):
                    if img.parent and img.parent.parent and img.parent.name == 'a':
                        img.parent.replace_with(img)
            else:
                for img in soup.find_all('img'):
                    img.decompose()

            self.soupprocessex(soup)
            content = unicode(soup)

            brief = u''
            if GENERATE_TOC_DESC:
                body = soup.find('body')
                for h in body.find_all(['h1', 'h2']):  # drop h1/h2 to avoid duplicating the title
                    h.decompose()
                for s in body.stripped_strings:
                    brief += unicode(s) + u' '
                    if len(brief) >= TOC_DESC_WORD_LIMIT:
                        brief = brief[:TOC_DESC_WORD_LIMIT]
                        break

            soup = None
            content = self.postprocess(content)
            yield (section, url, title, content, brief)