Example #1
from datastorage import Stock  # interacts with MongoDB

db = Stock()

site = db.url()  # first unvisited URL, or a falsy value when none is left

while site:
	print site['url']
	# db.update(site)  # would mark the site as visited
	site = db.url()
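None of these snippets define Stock. As a rough sketch of the interface they rely on (url(), visit(), count(), save_data(), update()), here is one plausible pymongo-backed implementation; the database and collection names are invented, and only the {'visit': ..., 'url': ...} document shape is taken from the save_data() call in Example #4:

from pymongo import MongoClient

class Stock(object):
    """Hypothetical pymongo-backed store; names here are assumptions."""

    def __init__(self):
        self.col = MongoClient()['nlp']['pages']  # db/collection names invented

    def url(self):
        # one document that has not been crawled yet
        return self.col.find_one({'visit': False})

    def visit(self):
        # cursor over every already-crawled page
        return self.col.find({'visit': True})

    def count(self):
        return self.col.count_documents({})

    def save_data(self, doc):
        self.col.insert_one(doc)

    def update(self, site):
        # flag a page as visited
        self.col.update_one({'_id': site['_id']}, {'$set': {'visit': True}})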
Example #2
import re
from datastorage import Stock

db = Stock()

for page in db.visit():
	try:
		# collapse runs of whitespace, including non-breaking spaces
		page['text'] = u" ".join(page['text'].replace(u"\xa0", u" ").strip().split())
		# keep only letters, hyphens and spaces, then turn the surviving hyphens into spaces
		print str(page['_id']) + " " + re.sub(r'[-_\/]', ' ', re.sub(r'[^a-zA-Z\-\ ]', '', page['text'].lower()))
	except Exception:
		continue  # skip pages whose text cannot be processed
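A quick standalone run of those same two substitutions shows the effect (the input string is invented). Note that accented letters are simply dropped at this stage; Example #3 strips the accents but keeps the base letters:

import re

raw = u"Caf\xe9\xa0 Con-Leche_2024 / menu"  # made-up input
text = u" ".join(raw.replace(u"\xa0", u" ").strip().split())
text = re.sub(r'[-_\/]', ' ', re.sub(r'[^a-zA-Z\-\ ]', '', text.lower()))
print text  # -> caf con leche  menu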
		
Example #3
File: sample.py  Project: KOS-mo/nlp
import re
from unicodedata import normalize
from datastorage import Stock


def strip_html(data):  # function header reconstructed; the original snippet starts mid-function
    # remove the css styles
    p = re.compile(r'<style[^<>]*?>.*?</style>', re.DOTALL)
    data = p.sub('', data)

    # remove html comments (pattern restored; it had been eaten by HTML extraction)
    p = re.compile(r'<!--.*?-->', re.DOTALL)
    data = p.sub('', data)

    # remove all the tags
    p = re.compile(r'<[^<]*?>')
    data = p.sub('', data)

    return data


db = Stock()
pages = db.visit()
for page in pages:
    try:
        # prefer the raw HTML when it looks substantial, else fall back to the text field
        if len(page['html']) > 100:
            html = page['html']
        else:
            html = page['text']

        clear_html = re.sub('<[^<]+?>', '', html)
        # decompose accented characters and drop the non-ASCII remainder
        normalizado = normalize('NFKD', clear_html.decode('utf-8')).encode(
            'ASCII', 'ignore').lower()
        text = re.sub(r'[^a-zA-Z\-\ ]', '', normalizado)
        # separators, over-long tokens and leftover non-word runs become spaces
        text = re.sub(r'[-_\/]|[a-z]{13,}|\W+|[ \t]+', ' ', text)

        token = text.split()
    except Exception:
        continue  # except clause added so the truncated try block parses
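The normalize('NFKD', ...).encode('ASCII', 'ignore') step is what keeps the base letters of accented characters instead of losing them the way Example #2 does; a one-liner shows the effect:

from unicodedata import normalize

print normalize('NFKD', u'Canci\xf3n \xfanica').encode('ASCII', 'ignore')  # -> Cancion unica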
Example #4
import time
#import nltk # NLP
import hashlib

from spider import Spider     # class that visits web sites
from datastorage import Stock # interacts with MongoDB

from unidecode import unidecode

stop = True
db = Stock() # storage instance

if not db.count():
	db.save_data({'visit': False, 'url': ''})

while stop:
	site = db.url()     # fetch an unvisited URL
	if not site:
		break

	url  = site['url']  # pull the URL out of the document
	m    = hashlib.sha1()
	date = time.strftime("%Y-%m-%d %H:%M")  # %M is minutes; %m would be the month

	print "[ Visit  ] " + url

	response = Spider.get_source(url) # fetch the page's HTML

	if not response:          # no response: mark the URL as visited and move on
		db.update(site)       # assumed from the commented-out db.update(site) in Example #1
		continue
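Spider.get_source is not shown anywhere in these snippets. Assuming it returns the raw HTML, or None when the request fails (which is what the if not response check implies), a minimal Python 2 stand-in might be:

import urllib2

class Spider(object):
    @staticmethod
    def get_source(url):
        # return the page body, or None so the caller can mark the URL visited
        try:
            return urllib2.urlopen(url, timeout=10).read()
        except Exception:
            return None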