Python Stock.visit示例

编程语言: Python

命名空间/包名称: datastorage

类/类型: Stock

方法/功能: visit

hotexamples.com的示例: 3

Python Stock.visit - 已找到 3 个示例。以下是从开源项目中提取的、评价最高的 datastorage.Stock.visit 实际 Python 代码示例。您可以为示例评分，以帮助我们提高示例质量。

常用方法

显示/隐藏

url(2)

Stock(2)

visit(2)

checksum(1)

count(1)

exist_url(1)

save_data(1)

update(1)

示例#1

显示文件

    p = re.compile(r'< style[^<>]*?>.*?< / style >')
    data = p.sub('', data)

    # remove html comments
    p = re.compile(r'')
    data = p.sub('', data)

    # remove all the tags
    p = re.compile(r'<[^<]*?>')
    data = p.sub('', data)

    return data


db = Stock()
pages = db.visit()
for page in pages:
    try:
        if (page['html'].__len__() > 100):
            html = page['html']
        else:
            html = page['text']

        clear_html = re.sub('<[^<]+?>', '', html)
        normalizado = normalize('NFKD', clear_html.decode('utf-8')).encode(
            'ASCII', 'ignore').lower()
        text = re.sub(r'[^a-zA-Z\-\ ]', '', normalizado)
        text = re.sub(r'[-_\/]|[a-z]{13,}|\W+|[ \t]+', ' ', text)

        token = text.split()
        print page['_id']

示例#2

显示文件

文件： generator_corpus.py 项目： dmouse/nlp

import re
from datastorage import Stock

# Patterns applied to each page's lowercased text before it is printed:
# first drop everything except ASCII letters, hyphens and spaces, then
# turn the remaining separator characters into spaces.
_NON_LETTERS = re.compile(r'[^a-zA-Z\-\ ]')
_SEPARATORS = re.compile(r'[-_\/]')

db = Stock()

for page in db.visit():
    try:
        # Replace non-breaking spaces with plain ones and collapse every
        # whitespace run to a single space; the result is stored back on
        # the page record.
        words = page['text'].replace(u"\xa0", u" ").strip().split()
        page['text'] = u" ".join(words)

        page_id = str(page['_id'])
        letters_only = _NON_LETTERS.sub('', page['text'].lower())

        # One output line per page: "<_id> <cleaned text>". A single
        # parenthesised argument prints the same under the print statement.
        print(page_id + " " + _SEPARATORS.sub(' ', letters_only))
    except Exception:
        # Any failure while processing a page skips that page.
        continue

示例#3

显示文件

文件： sample.py 项目： KOS-mo/nlp

	# remove the css styles
	p = re.compile(r'< style[^<>]*?>.*?< / style >')
	data = p.sub('', data)

	# remove html comments
	p = re.compile(r'')
	data = p.sub('', data)

	# remove all the tags
	p = re.compile(r'<[^<]*?>')
	data = p.sub('', data)

	return data

db = Stock()
pages = db.visit();
for page in pages:
	try:
		if (page['html'].__len__() > 100):
			html = page['html']
		else:
			html = page['text']

		clear_html  = re.sub('<[^<]+?>','',html)
		normalizado = normalize('NFKD',clear_html.decode('utf-8')).encode('ASCII','ignore').lower()
		text        = re.sub(r'[^a-zA-Z\-\ ]','',normalizado)
		text        = re.sub(r'[-_\/]|[a-z]{13,}|\W+|[ \t]+',' ',text)

		

		token       = text.split()