示例#1
0
    p = re.compile(r'< style[^<>]*?>.*?< / style >')
    data = p.sub('', data)

    # remove html comments
    p = re.compile(r'')
    data = p.sub('', data)

    # remove all the tags
    p = re.compile(r'<[^<]*?>')
    data = p.sub('', data)

    return data


# Walk every page returned by Stock().visit() and reduce its markup/text to a
# list of plain lower-case ASCII word tokens.
db = Stock()
pages = db.visit()
for page in pages:
    # NOTE(review): the handler (except/finally) that closes this try is not
    # visible in this excerpt -- confirm what happens to failing pages.
    try:
        # Use the 'html' field when it is longer than 100 characters,
        # otherwise fall back to the 'text' field.
        if (page['html'].__len__() > 100):
            html = page['html']
        else:
            html = page['text']

        # Delete every "<...>" span (tag-like markup).
        clear_html = re.sub('<[^<]+?>', '', html)
        # Decode the bytes as UTF-8, apply NFKD normalization, then re-encode
        # as ASCII dropping anything that cannot be represented, and
        # lower-case the result.
        # NOTE(review): `normalize` is presumably unicodedata.normalize; its
        # import is not visible here -- confirm.
        normalizado = normalize('NFKD', clear_html.decode('utf-8')).encode(
            'ASCII', 'ignore').lower()
        # Keep only ASCII letters, hyphens and spaces.
        text = re.sub(r'[^a-zA-Z\-\ ]', '', normalizado)
        # Replace hyphens/underscores/slashes, runs of 13 or more lower-case
        # letters, runs of non-word characters and runs of spaces/tabs with a
        # single space each.
        text = re.sub(r'[-_\/]|[a-z]{13,}|\W+|[ \t]+', ' ', text)

        # Split on whitespace into word tokens.
        token = text.split()
        print page['_id']
示例#2
0
import re
from datastorage import Stock

# Patterns are compiled once here instead of being handed to re.sub as
# literals on every loop iteration.
# Matches any character that is not an ASCII letter, a hyphen or a space.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\-\ ]')
# Matches the separator characters that are turned into spaces.
_SEPARATOR_RE = re.compile(r'[-_\/]')

db = Stock()

# Print one line per page: "<_id> <lower-cased, letters-and-spaces-only text>".
for page in db.visit():
    try:
        # Replace non-breaking spaces and collapse every whitespace run into
        # a single space; the cleaned text is stored back on the page.
        page['text'] = u" ".join(
            page['text'].replace(u"\xa0", u" ").strip().split())

        # Drop everything except ASCII letters, hyphens and spaces, then turn
        # the remaining separators into spaces.
        letters_only = _NON_LETTER_RE.sub('', page['text'].lower())
        cleaned = _SEPARATOR_RE.sub(' ', letters_only)

        # Single-argument call form: same output as the Python 2 print
        # statement, and valid syntax on Python 3 as well.
        print(str(page['_id']) + " " + cleaned)
    except Exception:
        # Deliberate best-effort: a page that cannot be processed (for
        # example, one without a usable 'text' value) is skipped so the
        # remaining pages are still printed.
        continue
		
示例#3
0
文件: sample.py 项目: KOS-mo/nlp
	# remove the css styles
	p = re.compile(r'< style[^<>]*?>.*?< / style >')
	data = p.sub('', data)

	# remove html comments
	p = re.compile(r'')
	data = p.sub('', data)

	# remove all the tags
	p = re.compile(r'<[^<]*?>')
	data = p.sub('', data)

	return data

db = Stock()
pages = db.visit();
for page in pages:
	try:
		if (page['html'].__len__() > 100):
			html = page['html']
		else:
			html = page['text']

		clear_html  = re.sub('<[^<]+?>','',html)
		normalizado = normalize('NFKD',clear_html.decode('utf-8')).encode('ASCII','ignore').lower()
		text        = re.sub(r'[^a-zA-Z\-\ ]','',normalizado)
		text        = re.sub(r'[-_\/]|[a-z]{13,}|\W+|[ \t]+',' ',text)

		

		token       = text.split()