## Fetch an article page, extract its readable text, and prepare sentence
## segmentation.
url = 'http://www.cnn.com/2014/07/19/world/europe/ukraine-malaysia-airlines-crash/'

## fetch html
import requests
r = requests.get(url)
html = r.content

## fetch text based on density: useful text
# NOTE(review): the original first computed text = nltk.clean_html(html) and
# then immediately overwrote it with the density-based extraction below (dead
# code). nltk.clean_html was also removed in NLTK 3.0 (it now raises
# NotImplementedError), so that call would crash the script -- dropped.
import usefulText as u
text = u.extract_text(html)

## unicode: decode, silently dropping undecodable bytes
text = text.decode('utf-8', 'ignore')

## segment into sentences
import sys
sys.path.append('../version0.0/')
import segment_sentence as ss


def isProper(sentence):
    """Return True if *sentence* looks like real article text.

    Rejects very short fragments (<= 5 characters) and any string
    containing '|' (typical of nav-bar / boilerplate remnants).
    """
    if len(sentence) <= 5:
        return False
    if '|' in sentence:
        return False
    # BUG FIX: the original ended after the two guards above, implicitly
    # returning None (falsy) for every sentence -- so nothing was ever
    # classified as proper.
    return True
def __init__(self, url):
    """Fetch *url*, extract its readable text, and build a sentence segmentor.

    Parameters:
        url -- address of the page to download.

    Attributes set:
        self.text      -- density-extracted page text, decoded as UTF-8
                          with undecodable bytes dropped.
        self.segmentor -- sentence segmentor constructed as ss.segmentor(1)
                          (semantics of the mode flag defined in the
                          segment_sentence module -- not visible here).
    """
    # BUG FIX: the original called requests.get() with no argument
    # (TypeError at runtime) and never used the `url` parameter at all.
    r = requests.get(url)
    html = r.content
    self.text = usefulText.extract_text(html).decode('utf-8', 'ignore')
    ## Scope of improvement: consider moving network I/O out of the
    ## constructor so instances can be built without a live connection.
    self.segmentor = ss.segmentor(1)