예제 #1
0
def generate_tags(request):
    """Return tags generated from posted content as a callback-wrapped JSON body.

    Reads ``contents`` and ``callback`` from ``request.POST``.  Responds with
    HTTP 501 when either field is missing, or when
    ``taggit.settings.TAGGIT_TAG_GENERATE_FUNC`` is ``None``.  Otherwise the
    generator's output is passed through ``utils.filter_tags``, serialised
    with ``simplejson`` and returned as ``<callback>(<json>)`` with an
    ``application/json`` content type.
    """
    post = request.POST
    if not ('contents' in post and 'callback' in post):
        return HttpResponse(status=501)

    # The setting is imported only after the request fields have been checked.
    from taggit.settings import TAGGIT_TAG_GENERATE_FUNC
    if TAGGIT_TAG_GENERATE_FUNC is None:
        return HttpResponse(status=501)

    generated = TAGGIT_TAG_GENERATE_FUNC(post['contents'])

    # Removal of tags on the 'bad' list is delegated to utils.filter_tags
    # (the original author marked this step as a stub).
    payload = simplejson.dumps(utils.filter_tags(generated))

    # NOTE(review): the callback name is echoed into the response without
    # validation -- confirm that is acceptable for this endpoint.
    body = u'%s(%s)' % (post['callback'], payload)
    return HttpResponse(body, content_type='application/json')
예제 #2
0
 def parse_article(self):
     """Collect the article's title, author, time and content into a dict.

     Title, author and time are text nodes selected from ``self.tree`` by
     XPath and encoded as ISO-8859-1.  A missing title falls back to
     ``'no_title'``; the author, time and content lookups take the first
     XPath hit directly, so an empty result raises ``IndexError``.  The
     content element is serialised to HTML (ISO-8859-1) and then passed
     through ``filter_tags``, a helper defined outside this method.

     Returns:
         dict with keys ``article_title``, ``author_name``,
         ``article_time`` and ``article_content``.
     """
     charset = 'ISO-8859-1'
     tree = self.tree
     article = {}

     # Title, with a placeholder when the page has none.
     titles = tree.xpath('//p[@class="article-title"]/text()')
     article['article_title'] = titles[0].encode(charset) if titles else 'no_title'

     # Author name: the union of two anchor layouts, first hit wins.
     authors = tree.xpath(
         '//a[contains(@class, "author-name author-info__detail")]/text() | //a[@class="author-info__detail"]//text()'
     )
     article['author_name'] = authors[0].encode(charset)

     # Publication time.
     times = tree.xpath('//p[@class="author-info__detail"]/text()')
     article['article_time'] = times[0].encode(charset)

     # Body: serialise the container element to HTML, then clean it.
     container = tree.xpath('//div[@class="article-info"]')[0]
     markup = etree.tostring(container, encoding=charset, method="html")
     article['article_content'] = filter_tags(markup)

     return article
예제 #3
0
	def prase_book(self, book, html):
		"""Populate ``book`` with fields scraped from a product page.

		Each field is located with a regular expression; a field whose
		pattern does not match is left untouched in ``book``.  The patterns
		are used without ``re.DOTALL``, so a captured span cannot cross a
		line break.  Matched text is cleaned with ``filter_tags`` and
		``filter_r_and_n``, helpers defined outside this method (presumably
		they strip markup and CR/LF characters -- confirm against their
		definitions).

		Args:
			book: Mutable mapping updated in place.  Keys that may be set:
				name, author, press, publictime, price, sell_price, print,
				language, isbn, pagecnt, menu, authordesc, desc, meidum,
				award.
			html: Page source as text.

		Returns:
			The same ``book`` object that was passed in.
		"""
		def capture(pattern, text):
			# First capture group of the first match, or None without a match.
			hit = re.search(pattern, text)
			return None if hit is None else hit.group(1)

		def scrub(fragment):
			# Same cleaning order as before: filter_tags, then filter_r_and_n.
			return filter_r_and_n(filter_tags(fragment))

		# Book title.
		title = capture(u'<h1>(.*?)</h1>', html)
		if title is not None:
			book["name"] = scrub(title)

		# Summary box: every entry is looked up inside the PI_info <div> only.
		summary = capture(u'<div class="PI_info">(.*?)</div>', html)
		if summary is not None:
			summary_fields = (
				("author", u'<h3 class="PI_item">作者(.*?)</h3>'),  # author
				("press", u'<h3 class="PI_item">出版社(.*?)</h3>'),  # publisher
				("publictime", u'<h3 class="PI_item">出版日期(.*?)</h3>'),  # publication date
				("price", u'<h3 class="PI_item">定價(.*?)</h3>'),  # list price
				("sell_price", u'<h3 class="PI_item">售價(.*?)</h3>'),  # selling price
				("print", u'class="PI_item">裝訂(.*?)<'),  # binding
				("language", u'class="PI_item">商品語言(.*?)<'),  # product language
			)
			for key, pattern in summary_fields:
				value = capture(pattern, summary)
				if value is not None:
					# Drop the " / " separator that follows each label.
					book[key] = scrub(value).replace(" / ", "")

		# Detail box: ISBN and page count are read from the tag-free text.
		details = capture(u'<div class="C_box"><h2>詳細資料</h2>(.*?)</div>', html)
		if details is not None:
			details = filter_tags(details).replace("\t", "")
			detail_fields = (
				("isbn", u'ISBN 13 /(\\d+)'),
				("pagecnt", u'頁數/(\\d+)'),  # page count
			)
			for key, pattern in detail_fields:
				value = capture(pattern, details)
				if value is not None:
					book[key] = value

		# Long-form sections.  Everything after the table of contents used to
		# sit behind an early ``return book`` and was never parsed; all five
		# sections are now handled the same way.  The third item is the
		# section heading that gets removed from the cleaned text.
		# NOTE(review): the headings are plain (non-u) literals as in the
		# original; on Python 2 with unicode input that relies on the
		# interpreter's default encoding -- confirm.
		sections = (
			# table of contents
			("menu", u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_catelog" class="C_box" style="display:none;">(.*?)</div>', "本書目錄"),
			# about the author
			("authordesc", u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_all_character" class="C_box" style="display:none;">(.*?)</div>', "作者介紹"),
			# content introduction
			("desc", u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_introduction" class="C_box" style="display:block;">(.*?)</div>', "內容簡介"),
			# media recommendations
			("meidum", u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_medium" class="C_box" style="display:none;">(.*?)</div>', "媒體推薦"),
			# award record
			("award", u'<div id="ctl00_ContentPlaceHolder1_Product_info_more1_award" class="C_box" style="display:none;">(.*?)</div>', "得獎紀錄"),
		)
		for key, pattern, heading in sections:
			section = capture(pattern, html)
			if section is not None:
				book[key] = filter_tags(section).replace(heading, "")

		return book
예제 #4
0
# -*- coding: utf-8 -*-
import re
from utils import filter_tags, clean_html, replace_charentity

# Sample markup fed to the demo below.
html = """
aaa<p>11111111111</p>sss
"""

if __name__ == '__main__':
    # Replace every closing </p> tag (any letter case) with a newline.
    re_pp = re.compile('</p>', re.I)
    s = re_pp.sub('\n', html)
    # Single-argument print(...) gives the same output on Python 2 and 3;
    # the bare ``print s`` statement is a syntax error on Python 3.
    print(s)
    print(filter_tags(html))