def review_to_wordlist(review, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(review).getText()
    #
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    # Commented-out experiment: drop nouns ('NN') via POS tagging
    #taggedwords= nltk.pos_tag(words)
    #taggedwords= [s for s in taggedwords if s[1] != 'NN']
    #lists= []
    #for i in xrange( 0,len(taggedwords)):
    #    lists.append(taggedwords[i][0])
    #taggedwords.
    # 5. Return a list of words
    return (words)
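# A minimal usage sketch for review_to_wordlist above. The imports are the ones
# the function actually needs (re, BeautifulSoup, NLTK stopwords); the sample
# review string is made up for illustration, and NLTK's stopword list may need
# a one-off nltk.download("stopwords") before it is available.
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

sample_review = "<p>This movie was <b>not</b> great, honestly!</p>"  # hypothetical input
print(review_to_wordlist(sample_review))
# ['this', 'movie', 'was', 'not', 'great', 'honestly']
print(review_to_wordlist(sample_review, remove_stopwords=True))
# ['movie', 'great', 'honestly']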
def paragraph_to_wordlist(raw_review):
    # Function to clean the raw review text
    #
    # remove HTML tags using the BeautifulSoup API
    review_text = BeautifulSoup(raw_review).text
    #
    # remove non-alphabetical characters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # convert to consistent lowercase and split into words
    words = review_text.lower().split()
    return (words)
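# A quick illustrative call for paragraph_to_wordlist above, assuming the same
# re / BeautifulSoup imports as in the first snippet; the sample string is made
# up. Note that digits are stripped by the [^a-zA-Z] regex, not just punctuation.
sample_paragraph = "<div>Word2Vec needs 1000s of tokenised paragraphs!</div>"  # hypothetical input
print(paragraph_to_wordlist(sample_paragraph))
# ['word', 'vec', 'needs', 's', 'of', 'tokenised', 'paragraphs']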
    def generate_snippet(self, url):
        # Fetch the page at `url` (Python 2: urllib2) and build a short text
        # snippet around each non-stopword query term; on an HTTP error,
        # return (None, status code) instead.
        snippet_list = []
        req = urllib2.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0')
        try:
            page = urllib2.urlopen(req)
            code = page.getcode()
            text = BeautifulSoup(page.read()).body.getText()
            for s in self.query:
                if s not in self.stopwords:
                    ind = text.lower().find(s)
                    # skip terms that do not occur (find returns -1) and
                    # positions already covered by an earlier snippet
                    if ind != -1 and ind not in snippet_list:
                        # take up to 30 characters of context on either side
                        self.snippet += text[max(ind - 30, 0):min(ind + 30, len(text) - 1)].strip()
                        self.snippet += u"... "
                        snippet_list.append(ind)

        except urllib2.HTTPError, err:
            return (None, err.code)
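# generate_snippet above is a method lifted out of a larger (Python 2) class;
# the original class is not shown, so the sketch below only reconstructs the
# attributes the method relies on (query, stopwords, snippet). The class name
# and attribute semantics here are assumptions, not the original code.
class SnippetGenerator(object):
    def __init__(self, query, stopwords):
        self.query = query            # list of lower-cased query terms
        self.stopwords = stopwords    # terms to ignore when building snippets
        self.snippet = u""            # accumulated "... "-separated context text

# Assuming generate_snippet is defined inside a class like the one above:
#   gen = SnippetGenerator(["beautifulsoup", "parser"], set(["the", "a", "of"]))
#   gen.generate_snippet("http://example.com/")   # fills gen.snippet on success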
def review_to_wordlist( raw_review, remove_stopwords=False ):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).text
    #  
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    #
    # 5. Return a list of words
    return(words)
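# The function above is typically applied over a whole collection of reviews;
# a minimal sketch, assuming the imports shown in the first snippet and that
# `reviews` is simply a list of raw HTML review strings (the two entries here
# are invented examples):
reviews = [
    "<p>An excellent, moving film.</p>",
    "<p>Two hours I will never get back...</p>",
]
clean_reviews = [review_to_wordlist(r, remove_stopwords=True) for r in reviews]
print(clean_reviews)
# [['excellent', 'moving', 'film'], ['two', 'hours', 'never', 'get', 'back']]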
Example #6
def review_to_wordlist(raw_review, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).text
    #
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    #
    # 5. Return a list of words
    return (words)
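# A small downstream sketch: once reviews are reduced to word lists, a quick
# way to inspect the vocabulary is a frequency count. `reviews` is assumed to
# be a list of raw HTML review strings, as in the earlier usage sketch.
from collections import Counter

word_counts = Counter()
for r in reviews:
    word_counts.update(review_to_wordlist(r, remove_stopwords=True))
print(word_counts.most_common(10))   # ten most frequent non-stopword tokens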