Пример #1
0
# Download a web page, extract its text with usefulText, decode it, and
# import the sentence-segmentation module used by the code that follows.
import sys

import nltk  # no longer used in this section; kept in case later code relies on it
import requests

import usefulText as u

url = 'http://www.cnn.com/2014/07/19/world/europe/ukraine-malaysia-airlines-crash/'

## fetch html
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(url, timeout=30)
html = r.content

## fetch text based on density: useful text
# The former nltk.clean_html(html) call was dropped: its result was
# overwritten by this assignment before ever being read, and NLTK 3 made
# clean_html a stub that raises NotImplementedError.
text = u.extract_text(html)

## unicode
# NOTE(review): assumes extract_text returns a byte string -- confirm.
text = text.decode('utf-8', 'ignore')

## segment into sentences
# The search path is extended before the import; presumably segment_sentence
# lives in ../version0.0/ -- verify.
sys.path.append('../version0.0/')
import segment_sentence as ss


def isProper(sentence):
	"""Reject sentences that are too short or that contain a pipe character.

	Returns False when ``sentence`` has a length of five or less, or when it
	contains '|'. In the lines visible here no other value is returned
	explicitly, so any other input falls through and yields None.
	"""
	too_short = len(sentence) <= 5
	if too_short or '|' in sentence:
		return False
Пример #2
0
	def __init__(self, url):
		"""Download the page at ``url`` and set up the text and segmenter state.

		Stores the text extracted from the page, decoded as UTF-8 with
		undecodable bytes ignored, in ``self.text``, and the object returned
		by ``ss.segmentor(1)`` in ``self.segmentor``.

		Exceptions raised by ``requests.get`` (connection failure, timeout)
		are not caught here and propagate to the caller.
		"""
		# Fix: the URL was never passed to requests.get(), so every call
		# raised TypeError. The timeout stops an unresponsive server from
		# blocking construction indefinitely.
		r = requests.get(url, timeout=30)
		html = r.content
		# Original author's note: scope of improvement.
		# NOTE(review): assumes extract_text returns a byte string -- confirm.
		self.text = usefulText.extract_text(html).decode('utf-8', 'ignore')
		# NOTE(review): the meaning of the argument 1 is not visible here -- confirm.
		self.segmentor = ss.segmentor(1)