def review_to_wordlist(review, remove_stopwords=False):
    """Convert a raw review into a list of lower-case words.

    The review is passed through BeautifulSoup to obtain its text, every
    character that is not an ASCII letter is replaced by a space, and the
    result is lower-cased and split on whitespace.

    Args:
        review: Raw review markup/text.
        remove_stopwords: When true, drop every word contained in
            ``stopwords.words("english")``. Defaults to False.

    Returns:
        The list of words, in their original order.
    """
    # Strip HTML markup, then blank out everything that is not a letter.
    plain = BeautifulSoup(review).getText()
    letters_only = re.sub("[^a-zA-Z]", " ", plain)
    tokens = letters_only.lower().split()

    if not remove_stopwords:
        return tokens

    # A set keeps the per-word membership test cheap.
    english_stops = set(stopwords.words("english"))
    return [tok for tok in tokens if tok not in english_stops]
def paragraph_to_wordlist(raw_review):
    """Clean a raw paragraph and return its words.

    Args:
        raw_review: Raw paragraph markup/text.

    Returns:
        A list of lower-case words: the BeautifulSoup text of the input
        with every non-ASCII-letter character turned into a separator.
    """
    # Extract the text content via BeautifulSoup (drops HTML tags).
    text_without_markup = BeautifulSoup(raw_review).text
    # Anything that is not a-z / A-Z becomes a space.
    alphabetic_only = re.sub("[^a-zA-Z]", " ", text_without_markup)
    # Normalise case and split on whitespace.
    return alphabetic_only.lower().split()
def paragraph_to_wordlist(raw_review):
    """Return the lower-case alphabetic words found in ``raw_review``.

    Args:
        raw_review: Raw paragraph markup/text.

    Returns:
        A list of words obtained by taking the BeautifulSoup text of the
        input, replacing non-letters with spaces, lower-casing, and
        splitting on whitespace.
    """
    soup = BeautifulSoup(raw_review)
    # Replace every non-letter with a space and fold to lower case in one go.
    cleaned = re.sub("[^a-zA-Z]", " ", soup.text).lower()
    return cleaned.split()
def generate_snippet(self, url):
    """Fetch ``url`` and append an excerpt around each query term to ``self.snippet``.

    For every item of ``self.query`` that is not in ``self.stopwords``, the
    first occurrence of the item in the lower-cased page body text is
    located and up to 30 characters on either side of that position are
    appended to ``self.snippet``, followed by ``u"... "``. A position that
    has already produced an excerpt is not used again.

    Args:
        url: Address of the page to download.

    Returns:
        None when the page was fetched and processed;
        ``(None, status_code)`` when the request raised ``urllib2.HTTPError``.
    """
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/5.0')
    try:
        page = urllib2.urlopen(request)
        try:
            html = page.read()
        finally:
            # Release the connection even if reading fails.
            page.close()
    except urllib2.HTTPError as err:
        return (None, err.code)

    text = BeautifulSoup(html).body.getText()
    # Lower-case once instead of once per query term.
    lowered = text.lower()
    seen_positions = set()
    for term in self.query:
        if term in self.stopwords:
            continue
        pos = lowered.find(term)
        # find() returns -1 for a missing term; without this check the
        # opening characters of the page would be emitted as a bogus excerpt.
        if pos == -1 or pos in seen_positions:
            continue
        seen_positions.add(pos)
        # Slicing clamps at the end of the string, so no upper bound is
        # needed (clamping to len(text) - 1 would drop the last character).
        self.snippet += text[max(pos - 30, 0):pos + 30].strip()
        self.snippet += u"... "
def review_to_wordlist(raw_review, remove_stopwords=False):
    """Convert a raw review into a list of lower-case words.

    Args:
        raw_review: Raw review markup/text.
        remove_stopwords: When true, words contained in ``stops`` are
            removed. Defaults to False.

    Returns:
        The list of words, in their original order.
    """
    # Text content without HTML tags.
    plain_text = BeautifulSoup(raw_review).text
    # Non-letters become spaces; then lower-case and split on whitespace.
    tokens = re.sub("[^a-zA-Z]", " ", plain_text).lower().split()
    if remove_stopwords:
        # NOTE(review): `stops` is not defined in this function; it is
        # expected to exist at module level - confirm.
        tokens = [token for token in tokens if token not in stops]
    return tokens
def review_to_wordlist(raw_review, remove_stopwords=False):
    """Split a raw review into lower-case words, optionally filtering stop words.

    Args:
        raw_review: Raw review markup/text.
        remove_stopwords: When true, drop every word found in ``stops``.
            Defaults to False.

    Returns:
        The list of remaining words, in their original order.
    """
    markup_free = BeautifulSoup(raw_review).text
    letters_only = re.sub("[^a-zA-Z]", " ", markup_free)
    word_list = letters_only.lower().split()

    if not remove_stopwords:
        return word_list

    # NOTE(review): `stops` comes from outside this function (presumably a
    # module-level collection of stop words) - verify it is defined.
    return [word for word in word_list if word not in stops]