Example #1
 def tokeniseText(self, doc, isFile, stemFlag):
     stemmer = PorterStemmer()
     tokens = dict()
     stopWords = self.loadStopWords()
     fh = list()
     if isFile is True:
         fh = open(doc)
     else:
         fh.append(doc)
     for line in fh:
         line = re.sub('(<.*>)', '', line)
         line = re.sub('[^0-9a-zA-Z]+', ' ', line)
         line = line.strip().lower()
         words = line.split()
         if stemFlag is True:
             for word in words:
                 if word not in stopWords:
                     word = stemmer.stem(word, 0,
                                         len(word) - 1)
                     if len(word) > 1 and word not in stopWords:
                         tokens[word] = tokens.get(word, 0) + 1
         else:
             for word in words:
                 if len(word) > 1:
                     tokens[word] = tokens.get(word, 0) + 1
     return tokens
Example #2
def group_stems(total_count, individual_counts, occurence_per_doc):
    """Use the Porter Stemmer algorithm to take only the stems of words and then group them together as a single
        count. For instance, run and running might both be in the counts, hence we reduce this to just run."""
    stemmer = PorterStemmer()
    new_individual_counts = {}
    new_total_counts = Counter()
    new_occurences_per_doc = Counter()
    for file_name, counts in individual_counts.iteritems():
        file_counts = Counter()
        for word, count in counts.iteritems():
            word_stem = stemmer.stem(word, 0, len(word) - 1)
            file_counts[word_stem] += count
        new_individual_counts[file_name] = file_counts

    for word, count in total_count.iteritems():
        word_stem = stemmer.stem(word, 0, len(word) - 1)
        new_total_counts[word_stem] += count

    for word, count in occurence_per_doc.iteritems():
        word_stem = stemmer.stem(word, 0, len(word) - 1)
        new_occurences_per_doc[word_stem] += count

    print "Finished grouping words by their stems."

    return new_total_counts, new_individual_counts, new_occurences_per_doc
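
The docstring above describes the core idea: different surface forms ("run", "running", "runs") collapse onto a single stem so their counts can be merged. Below is a minimal Python 3 sketch of that merging step, using NLTK's PorterStemmer (whose stem() takes a single argument, unlike the three-argument stem(word, 0, len(word) - 1) interface the snippet relies on).

from collections import Counter
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
total_count = {"run": 3, "running": 2, "runs": 1}

grouped = Counter()
for word, count in total_count.items():
    grouped[stemmer.stem(word)] += count

print(grouped)  # Counter({'run': 6}) -- all three surface forms share the stem 'run'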
Example #3
def load_data(stem=True):
    ps = PorterStemmer()
    positive = []
    negative = []

    directory = os.fsencode("./NEG")
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        f = open('./NEG/' + filename, 'r')
        text = f.read()
        if stem:
            text = ps.stem(text, 0, len(text) - 1)
        negative.append(text)

    directory = os.fsencode("./POS")
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        f = open('./POS/' + filename, 'r')
        text = f.read()
        if stem:
            text = ps.stem(text, 0, len(text) - 1)
        positive.append(text)

    target_pos = []
    target_neg = []
    for i in range(0, 1000):
        target_pos.append(0)
    for i in range(0, 1000):
        target_neg.append(1)

    X = positive + negative
    y = target_pos + target_neg
    return X, y
Example #4
    def __init__(self, stopwords_io_stream=None):
        self.stemmer = PorterStemmer()

        if not stopwords_io_stream:
            stopwords_io_stream = open(Parser.STOP_WORDS_FILE, 'r')

        self.stopwords = stopwords_io_stream.read().split()
Example #5
def processTweet(tweet):
    # Remove HTML special entities (e.g. &amp;)
    tweet = re.sub(r'\&\w*;', '', tweet)
    # Remove @username mentions
    tweet = re.sub(r'@[^\s]+', '', tweet)
    # Remove tickers
    tweet = re.sub(r'\$\w*', '', tweet)
    # To lowercase
    tweet = tweet.lower()
    # Remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*\/\w*', '', tweet)
    # Remove hashtags
    tweet = re.sub(r'#\w*', '', tweet)
    # Remove Punctuation and split 's, 't, 've with a space for filter
    tweet = re.sub(r'[' + string.punctuation.replace('@', '') + ']+', ' ',
                   tweet)
    # Remove words with 2 or fewer letters
    tweet = re.sub(r'\b\w{1,2}\b', '', tweet)
    # Remove whitespace (including new line characters)
    tweet = re.sub(r'\s\s+', ' ', tweet)
    # Remove single space remaining at the front of the tweet.
    tweet = tweet.lstrip(' ')
    # Removing Stopwords from tweet using sklearn.feature_extraction
    split_list = tweet.split(" ")
    tweet = [
        word for word in split_list
        if word not in stop_words.ENGLISH_STOP_WORDS
    ]
    # Stem each remaining word
    ps = PorterStemmer()
    tweet = [ps.stem(word) for word in tweet]
    tweet = ' '.join(tweet)
    return tweet
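
The comments above walk through each cleaning step. A minimal usage sketch, assuming the processTweet function above is defined in the same module; the imports shown are hypothetical reconstructions of what the snippet relies on (the single-argument ps.stem(word) matches NLTK's PorterStemmer, and ENGLISH_STOP_WORDS lives in sklearn.feature_extraction.text in current scikit-learn releases; older releases exposed it via sklearn.feature_extraction.stop_words).

import re
import string
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction import text as stop_words   # exposes ENGLISH_STOP_WORDS

print(processTweet("@user Loving the new #Python release!! https://t.co/abc123 &amp;"))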
Example #6
from typing import List

def stem(words: List[str]):
    p = PorterStemmer()
    output = []
    for word in words:
        if word.isalpha():
            output.append(p.stem(word, 0, len(word) - 1))
    return output
Example #7
def stem():
    for line in sys.stdin:
        line = line.strip('\n')
        if line:
            token = line.split('\t')[1]
            ps = PorterStemmer().stem(token, 0, len(token) - 1)
            print line + '\t' + ps
Example #8
class Tokenizer(object):
	def __init__(self):
		self.stemmer = PorterStemmer()

	def __call__(self, tweet):
		
		# TODO: This function takes in a single tweet (just the text part)
		# then it will process/clean the tweet and return a list of tokens (words).
		# For example, if tweet was 'I eat', the function returns ['i', 'eat']
		# Lowercase the tweet and strip punctuation
		parsed = tweet.lower().replace(',',' ').replace('.',' ').replace('?',' ').replace('!',' ').replace('-',' ').replace('#',' ').replace(':',' ').replace('(',' ').replace(')',' ').replace('=',' ').replace('...',' ').split()
		for i, word in enumerate(parsed):
			# do replacement
			if '@' in word:
				parsed[i] = 'AT_USER'
			if ('www.' in word) or ('.com' in word) or ('http://' in word) or ('http(s)://' in word):
				parsed[i] = 'URL'
			# Replacing three or more occurrence of the same character with one occurrence using regular expression
			parsed[i] = re.sub(r'(.)\1{2,}', r'\1', parsed[i])
		# Apply stemming to reduce each term to its stem
		res = [self.stemmer.stem(kw, 0, len(kw)-1) for kw in parsed]
		# De-duplicate the term list so that it only contains distinct terms
		res = list(set(res))
		# You will not need to call this function explicitly.
		# Once you initialize your vectorizer with this tokenizer,
		# then 'vectorizer.fit_transform()' will implicitly call this function to
		# extract features from the training set, which is a list of tweet texts.
		# So once you call 'fit_transform()', the '__call__' function will be applied
		# on each tweet text in the training set (a list of tweet texts).

		return res
Example #9
    def __call__(self, tweet):
        # TODO: This function takes in a single tweet (just the text part)
        # then it will process/clean the tweet and return a list of tokens (words).
        # For example, if tweet was 'I eat', the function returns ['i', 'eat']

        # You will not need to call this function explicitly.
        # Once you initialize your vectorizer with this tokenizer,
        # then 'vectorizer.fit_transform()' will implicitly call this function to
        # extract features from the training set, which is a list of tweet texts.
        # So once you call 'fit_transform()', the '__call__' function will be applied
        # on each tweet text in the training set (a list of tweet texts).
        # 1. Lowercase all letters
        tweet = tweet.lower()
        for i in string.punctuation:
            tweet = tweet.replace(i, " ")
        words = tweet.split()

        result = []
        for word in words:
            if word[0] == "@":  # 7. Removing user references
                word = "AT_USER"
            if word[0] == "#":  # 5. Removing hashtags
                word = word[1:]
            if word[0].isalpha():  # Ignoring words that don't start with a letter
                if word.startswith("www.") or word.startswith(
                        "https://") or word.startswith("http://"):
                    word = "URL"
                word = PorterStemmer().stem(word, 0, len(word) - 1)  # 2. Applying stemming
                word = re.sub(r'([a-z])\1+', r'\1\1', word)
                result.append(word)

        return result
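
The comment blocks in the two tokenizers above explain that scikit-learn invokes __call__ once per document when the tokenizer is passed to a vectorizer. A minimal sketch of that wiring, assuming the Tokenizer class above (and a PorterStemmer implementation it can import) is available in scope:

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=Tokenizer())      # __call__ runs on every document
X = vectorizer.fit_transform(["I eat", "I really eeeat!"])
print(vectorizer.get_feature_names_out())                # stemmed, cleaned vocabulary
print(X.toarray())                                       # document-term counts
# Note: use get_feature_names() instead on scikit-learn releases older than 1.0.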
Example #10
class Tokenizer:
    def __init__(self, ngrams=1):
        self.stemmer = PorterStemmer()
        self.ngrams = ngrams
        print ' -- initializing tokenizer with maximum ngram = %d' % ngrams

    def tokenize(self, tweet):
        words = set()
        ng = []

        # fix the most common HTML escapes
        tweet = tweet.replace('&quot;', '').replace('&nbsp;',
                                                    ' ').replace('&amp;', ' ')

        # replace certain word-splitting with spaces (e.g. 'foo...bar' should
        # be split). do not include '.' because this will incorrectly split URLs
        for c in ('...', '..', '&', ','):
            tweet = tweet.replace(c, ' ')

        for w in tweet.split():
            w = w.lower()  # convert to lowercase

            # get rid of links and user mentions
            if any(s in w for s in links): continue
            if w.startswith('@'): continue

            w = w.strip(punct)  # strip punctuation (including hashtag)

            if len(w) == 0: continue  # ignore now-blank words

            if w in stopwords: continue  # ignore stopwords

            # replace two or more occurrence of same char with two
            notrip = ''
            for i, x in enumerate(w):
                if i < 2 or x != w[i - 2] or x != w[i - 1]: notrip += x

            notrip = self.stemmer.stem(
                notrip, 0,
                len(notrip) - 1)  # apply stemming using Porter Stemmer

            # ignore words that don't start with a letter
            if not notrip[0].isalpha(): continue

            words.add(notrip)

            if self.ngrams > 1: ng.append(notrip)

        # Generate all ngrams and add them
        if self.ngrams > 1 and len(words) > 1:
            if self.ngrams > len(words): self.ngrams = len(words)
            for i in range(len(ng) - self.ngrams + 1):
                ngram = ng[i]
                for j in range(1, self.ngrams):
                    ngram += ' ' + ng[i + j]
                    if ngram == 'madison squar garden': print 'msg'
                    words.add(ngram)

        return words
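
Two of the commented steps above are easy to miss inside the loop: collapsing runs of three or more identical characters down to two, and building word n-grams up to a maximum length. A small self-contained Python 3 sketch of just those two ideas (for simplicity the n-gram helper also emits the unigrams):

def collapse_repeats(word):
    # drop a character when the two preceding characters are the same as it,
    # so runs of three or more collapse to exactly two
    out = ""
    for i, ch in enumerate(word):
        if i < 2 or ch != word[i - 1] or ch != word[i - 2]:
            out += ch
    return out

def ngrams_up_to(tokens, n):
    # every contiguous run of 1..n tokens, joined with spaces
    grams = []
    for i in range(len(tokens)):
        for j in range(i + 1, min(i + n, len(tokens)) + 1):
            grams.append(" ".join(tokens[i:j]))
    return grams

print(collapse_repeats("sooooo"))                        # 'soo'
print(ngrams_up_to(["madison", "squar", "garden"], 3))   # unigrams, bigrams, trigram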
Example #11
class Tokenizer:
	def __init__(self, ngrams=1):
		self.stemmer = PorterStemmer()
		self.ngrams = ngrams
		print ' -- initializing tokenizer with maximum ngram = %d' % ngrams

	def tokenize(self, tweet):
		words = set()
		ng = []

		# fix the most common HTML escapes
		tweet = tweet.replace('&quot;','').replace('&nbsp;',' ').replace('&amp;',' ')

		# replace certain word-splitting with spaces (e.g. 'foo...bar' should 
		# be split). do not include '.' because this will incorrectly split URLs 
		for c in ('...','..','&',','):
			tweet = tweet.replace(c,' ')


		for w in tweet.split():
			w = w.lower() # convert to lowercase
			
			# get rid of links and user mentions
			if any(s in w for s in links): continue
			if w.startswith('@'): continue

			w = w.strip(punct) # strip punctuation (including hashtag)

			if len(w) == 0: continue  # ignore now-blank words

			if w in stopwords: continue # ignore stopwords
			
			# replace two or more occurrence of same char with two
			notrip = '' 
			for i,x in enumerate(w):
				if i < 2 or x != w[i-2] or x != w[i-1]: notrip += x

			notrip = self.stemmer.stem(notrip, 0, len(notrip)-1) # apply stemming using Porter Stemmer

			# ignore words that don't start with a letter
			if not notrip[0].isalpha(): continue

			words.add(notrip)

			if self.ngrams>1: ng.append(notrip)


		# Generate all ngrams and add them
		if self.ngrams>1 and len(words)>1:
			if self.ngrams > len(words): self.ngrams = len(words)
			for i in range(len(ng)-self.ngrams+1):
				ngram = ng[i]
				for j in range(1,self.ngrams):
					ngram += ' '+ng[i+j]
					if ngram == 'madison squar garden': print 'msg'
					words.add(ngram)


		return words
Example #12
class Tokenizer(object):
    def __init__(self):
        self.stemmer = PorterStemmer()

    # only admit non-number with length>2
    def qualify(self, word):
        return len(word)>2 and not word.isdigit()

    # TODO: Change the text processing here
    # You only need to edit the below function
    def process_tweet(self, tweet):
        tweet = tweet.lower()
        exclude = set(string.punctuation)
        tweet = ''.join(ch for ch in tweet if ch not in exclude)
        words = tweet.split()

        for i in range(len(words)):
            words[i] = self.stemmer.stem(words[i], 0, len(words[i]) - 1)
        
        tweet = ' '.join(words)
        return tweet

    def __call__(self, tweet):
        # This function takes in a single tweet (just the text part)
        # then it will process/clean the tweet and return a list of tokens (words).
        # For example, if tweet was 'I eat', the function returns ['i', 'eat']

        # You will not need to call this function explicitly.
        # Once you initialize your vectorizer with this tokenizer,
        # then 'vectorizer.fit_transform()' will implicitly call this function to
        # extract features from the training set, which is a list of tweet texts.
        # So once you call 'fit_transform()', the '__call__' function will be applied
        # on each tweet text in the training set (a list of tweet texts).
        features = []
        for word in self.process_tweet(tweet).split():
            if self.qualify(word):
                # Stem
                word = self.stemmer.stem(word, 0, len(word) - 1)

                features.append(word)

        return features
Example #13
class Tokenizer(object):
    def __init__(self, stopwords_file):
        self.stemmer = PorterStemmer()
        self.stop_words = set(line.strip() for line in open(stopwords_file))

    def tokenize(self, text):
        tokens = []
        words = text.lower().split()
        for word in words:
            word = re.sub(r'[^\w\s]', '', word)  # remove punctuation
            word = self.stemmer.stem(word, 0, len(word) - 1)  # stem

            # skip stop words and words that don't start with a letter
            if word and word not in self.stop_words and word[0].isalpha():
                tokens.append(word)

        return tokens
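
A minimal usage sketch for the class above, assuming it is defined in the current module along with a three-argument PorterStemmer (the reference-style stem(word, 0, len(word) - 1) interface used throughout these examples); the stop-word file written here is purely hypothetical, created for the demo:

with open("demo_stopwords.txt", "w") as fh:    # throwaway stop-word list for the demo
    fh.write("the\nare\nis\n")

tok = Tokenizer("demo_stopwords.txt")
print(tok.tokenize("The cats are running, aren't they?"))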
Example #14
class Tokenizer(object):
    def __init__(self, stopwords_file):
        self.stemmer = PorterStemmer()
        self.stop_words = set(line.strip() for line in open(stopwords_file))

    def tokenize(self, text):
        tokens = []
        words = text.lower().split()
        for word in words:
            word = re.sub(r'[^\w\s]', '', word) # remove punctuation
            word = self.stemmer.stem(word, 0, len(word) - 1) # stem

            # skip stop words and words that don't start with a letter
            if word and word not in self.stop_words and word[0].isalpha():
                tokens.append(word)

        return tokens
Example #15
class Parser:
    STOP_WORDS_FILE = '%s/../data/english.stop' % os.path.dirname(
        os.path.realpath(__file__))

    stemmer = None
    stopwords = []

    def __init__(self, stopwords_io_stream=None):
        self.stemmer = PorterStemmer()

        if (not stopwords_io_stream):
            stopwords_io_stream = open(Parser.STOP_WORDS_FILE, 'r')

        self.stopwords = stopwords_io_stream.read().split()

    def tokenise_and_remove_stop_words(self, document_list):
        if not document_list:
            return []

        vocabulary_string = " ".join(document_list)

        tokenised_vocabulary_list = self._tokenise(vocabulary_string)
        clean_word_list = self._remove_stop_words(tokenised_vocabulary_list)
        return clean_word_list

    def _remove_stop_words(self, list):
        """ Remove common words which have no search value """
        return [word for word in list if word not in self.stopwords]

    def _tokenise(self, string):
        """ break string up into tokens and stem words """
        string = self._clean(string)
        words = string.split(" ")

        return [self.stemmer.stem(word, 0, len(word) - 1) for word in words]

    def _clean(self, string):
        """ remove any nasty grammar tokens from string """
        string = "".join(l for l in string if l not in stg.punctuation)
        #string = string.replace("\""," ")
        #string = string.replace("\'"," ")
        #string = string.replace(":","")
        string = " ".join(string.split())  # collapse whitespace (str.replace does not take a regex)
        string = string.lower()
        return string
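
A minimal usage sketch for the Parser above; io.StringIO stands in for the stop-word file so nothing has to exist on disk, and the class is assumed to live in a module that already imports os, string as stg, and a three-argument PorterStemmer:

import io

parser = Parser(stopwords_io_stream=io.StringIO("the a an of and"))
print(parser.tokenise_and_remove_stop_words(["The cats are running", "An apple a day"]))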
Example #16
class Parser:
    STOP_WORDS_FILE = '%s/../data/english.stop' % os.path.dirname(os.path.realpath(__file__))

    stemmer = None
    stopwords = []

    def __init__(self, stopwords_io_stream=None):
        self.stemmer = PorterStemmer()

        if not stopwords_io_stream:
            stopwords_io_stream = open(Parser.STOP_WORDS_FILE, 'r')

        self.stopwords = stopwords_io_stream.read().split()

    def tokenise_and_remove_stop_words(self, document_list):
        if not document_list:
            return []

        vocabulary_string = " ".join(document_list)

        tokenised_vocabulary_list = self._tokenise(vocabulary_string)
        clean_word_list = self._remove_stop_words(tokenised_vocabulary_list)
        return clean_word_list

    def _remove_stop_words(self, list):
        """ Remove common words which have no search value """
        return [word for word in list if word not in self.stopwords]

    def _tokenise(self, string):
        """ break string up into tokens and stem words """
        string = self._clean(string)
        words = string.split(" ")

        return [self.stemmer.stem(word, 0, len(word) - 1) for word in words]

    def _clean(self, string):
        """ remove any nasty grammar tokens from string """
        string = string.replace(".", "")
        string = " ".join(string.split())  # collapse whitespace (str.replace does not take a regex)
        string = string.lower()
        return string
Example #17
class Tokenizer(object):
	def __init__(self):
		self.stemmer = PorterStemmer()

	def process_review(self, tweet):
		#TODO: pre-process tweet
		# this is a helper function for __call__
		
		tweet = tweet.lower() # To lower case
		tweet = re.sub('[^\x20-\x7E]*','',tweet)
		tweet = re.sub(r'([a-z0-9])\1+',r'\1\1',tweet) #Replace two or more occurrences of the same character with two occurrences

		stopword_file= open("stopwords.txt")
		stopwords =set()
		stopword_content = stopword_file.readlines()
		for word in stopword_content:
			stopwords.add(word.strip().lower())


		tweet_words = tweet.split()
		tweet = ""
		for w in tweet_words:
			if len(w)< 1 or w in stopwords:
				continue
			elif not w[0].isalpha(): 
				continue
			else:
				tweet+= " "+w
		tweet = tweet.strip()
		tweet = ''.join(ch for ch in tweet if ch not in string.punctuation)

		tweet_words = [self.stemmer.stem(word, 0, len(word)-1) for word in tweet.split(" ")]

		return tweet_words

	def __call__(self, doc):
		# this function will tokenize the given document and return a list of extracted features (tokens)
		processed_doc = self.process_review(doc)
		#TODO: return a list of features extracted from processed_doc
		return processed_doc
Example #18
 def __init__(self):
     self.stemmer = PorterStemmer()
Example #19
# dic = {"joy": 0, "surprise": 0, "sad": 0, "angry": 0}


def get_text_fromcsv(filename):
    with open(filename, "r") as file:
        f = csv.reader(file)
        d = []
        next(file)
        for r in f:
            l = get_label(r[10:])
            if l != "None":
                cleaned = ' '.join(clean_words(special_split(r[1])))
                # dic[l] = dic[l] + 1
                d.append(cleaned + " __label__" + l)
                # d.append(cleaned)
        return d

delimiters = ['\n', ' ', ',', '.', '?', '!', ':', ';', '#', '$', '[', ']',
              '(', ')', '-', '=', '@', '%', '&', '*', '_', '>', '<',
              '{', '}', '|', '/', '\\', '\'', '"', '\t', '+', '~',
              '^']
stop_words = load_stop_words()
porter = PorterStemmer()
data = get_text_fromcsv("combined.csv")

fi = open("preprocess_combined.txt", "w")
for line in data:
    fi.write(line + "\n")
fi.close()
Example #20
	def __init__(self):
		self.stemmer = PorterStemmer()
Example #21
from porter_stemmer import PorterStemmer

stemmer = PorterStemmer()
resume = open('assets/resume.txt', 'r').read()
print(stemmer.stem_document(resume))
Example #22
	def __init__(self, ngrams=1):
		self.stemmer = PorterStemmer()
		self.ngrams = ngrams
		print ' -- initializing tokenizer with maximum ngram = %d' % ngrams
Example #23
 def __init__(self, ngrams=1):
     self.stemmer = PorterStemmer()
     self.ngrams = ngrams
     print ' -- initializing tokenizer with maximum ngram = %d' % ngrams
Example #24
from porter_stemmer import PorterStemmer

stemmer = PorterStemmer()

print stemmer.stem_word("adoption")
print stemmer.stem_word("controll")
print stemmer.stem_word("roll")
print stemmer.stem_word("agreed")
Example #25
 def __init__(self, stopwords_file):
     self.stemmer = PorterStemmer()
     self.stop_words = set(line.strip() for line in open(stopwords_file))
Example #26
# custom porter stemmer
from porter_stemmer import PorterStemmer

stemmer = PorterStemmer()
document = "day"
print(stemmer.stem_document(document))

# using the nltk library
# from nltk.stem import PorterStemmer
#
# stemmer = PorterStemmer()
# print(stemmer.stem('day'))
Example #27
    def extract_sentiment_for_movies(self, preprocessed_input):
        """Creative Feature: Extracts the sentiments from a line of
        pre-processed text that may contain multiple movies. Note that the
        sentiments toward the movies may be different.

        You should use the same sentiment values as extract_sentiment,
        described above.
        Hint: feel free to call previously defined functions to implement this.

        Example:
          sentiments = chatbot.extract_sentiment_for_text(
                           chatbot.preprocess(
                           'I liked both "Titanic (1997)" and "Ex Machina".'))
          print(sentiments) // prints [("Titanic (1997)", 1), ("Ex Machina", 1)]

        :param preprocessed_input: a user-supplied line of text that has been
        pre-processed with preprocess()
        :returns: a list of tuples, where the first item in the tuple is a movie
        title, and the second is the sentiment in the text toward that movie
        """
        #don't need to consider the case where some movies are in the database while the rest are not
        title_array = self.extract_titles(preprocessed_input)
        if len(title_array) == 1:
            return [(title_array[0],
                     self.extract_sentiment(preprocessed_input))]

        stemmer = PorterStemmer()
        split_input = preprocessed_input.lower().split()
        negate = 1
        num_conjunctions = 0
        count = 0
        in_quotes = False
        power = 1
        conjunctions = ['and', 'nor', 'but', 'or', 'yet']
        neg_list = [
            "no", "not", "rather", "couldn't", "wasn't", "didn't", "wouldn't",
            "shouldn't", "weren't", "don't", "doesn't", "haven't", "hasn't",
            "won't", "wont", "hadn't", "never", "none", "nobody", "nothing",
            "neither", "nowhere", "isn't", "can't", "cannot", "mustn't",
            "mightn't", "shan't", "without", "needn't"
        ]
        power_list = [
            "really", "reeally", "loved", "love", "hate", "hated", "terrible",
            "amazing", "fantastic", "incredible", "dreadful", "horrible",
            "horrid", "horrendous"
        ]
        sentiment_list = []
        for word in split_input:
            word = word.strip()
            word_no_punc = word.rstrip(",.")
            stem = stemmer.stem(word_no_punc, 0, len(word_no_punc) - 1)
            if stem.endswith('i'):
                stem = stem[:-1] + 'y'
            if word.startswith("\""):
                in_quotes = True
            if word.endswith("\""):
                in_quotes = False
                continue
            if in_quotes:
                continue
            # if word is in neg_list but ends in a comma, negate stays positive
            if word in neg_list and not word.endswith(","):
                negate = -1  # or have negate * -1
            else:
                has_comma = False
                # maybe include other punctuation?
                if word.endswith(","):
                    has_comma = True
                if self.creative:
                    if word_no_punc in power_list or stem in power_list or word.endswith(
                            "!"):
                        power = 2
                    if word_no_punc in conjunctions or stem in conjunctions:
                        if (count == 0):
                            if num_conjunctions != 0:
                                sentiment_list.append(
                                    sentiment_list[num_conjunctions - 1])
                            else:
                                sentiment_list.append(0)
                        else:
                            sentiment_list.append(count)
                        count = 0
                        num_conjunctions += 1
                if word_no_punc in self.sentiment:
                    if self.sentiment[word_no_punc] == "pos":
                        count += 1 * negate
                    else:
                        count += -1 * negate
                elif stem in self.sentiment:
                    if self.sentiment[stem] == "pos":
                        count += 1 * negate
                    else:
                        count += -1 * negate
                if has_comma:
                    negate = 1

        if (count == 0):
            sentiment_list.append(sentiment_list[num_conjunctions - 1])
        else:
            sentiment_list.append(count)

        res = []
        i = 0
        for title in title_array:
            curr_count = 0
            if sentiment_list[i] > 0:
                curr_count = 1 * power
            elif sentiment_list[i] < 0:
                curr_count = -1 * power
            res.append((title, curr_count))
            i += 1
        return res
Example #28
class Tokenizer(object):
    def __init__(self):
        self.stemmer = PorterStemmer()

    # only admit non-number with length>2
    def qualify(self, word):
        return len(word) > 2 and not word.isdigit()

    def process_desc(self, desc):

        ndesc = []

        for word in desc.split():

            # lowercase all characters
            word = word.lower()
            # replace words with hashtags with just the words
            if word[0] == "#":
                word = word[1:]
            # replace words with @ with "AT_USER"
            elif word[0] == "@":
                word = "AT_USER"
            # replace words with url beginnings with "URL"
            elif word.startswith("www.") or word.startswith("http://") or word.startswith("https://"):
                word = "URL"
            # strip punctuation using translate string method
            translator = str.maketrans('', '', string.punctuation)
            word = word.translate(translator)

            # use stop words list to filter out low value words
            if word not in stop:
                # ignore words that are one letter long
                if len(word) > 1:
                    # check to see if the first letter of the word is an alphabetic character
                    if word[0].isalpha():

                        # finally check for duplicates
                        if word not in ndesc:
                            ndesc.append(word)

        return ' '.join(ndesc)

    def __call__(self, desc):
        # This function takes in a single desc (just the text part)
        # then it will process/clean the desc and return a list of tokens (words).
        # For example, if desc was 'I eat', the function returns ['i', 'eat']

        # You will not need to call this function explicitly.
        # Once you initialize your vectorizer with this tokenizer,
        # then 'vectorizer.fit_transform()' will implicitly call this function to
        # extract features from the training set, which is a list of desc texts.
        # So once you call 'fit_transform()', the '__call__' function will be applied
        # on each desc text in the training set (a list of desc texts).
        features = []
        for word in self.process_desc(desc).split():
            if self.qualify(word):
                # Stem
                word = self.stemmer.stem(word, 0, len(word) - 1)

                features.append(word)

        return features
Example #29
 def tokenise(self, string):
     """ break string up into tokens and stem words """
     stemmer = PorterStemmer()
     string = self.clean(string)
     words = string.split(" ")
     return [stemmer.stem(word, 0, len(word) - 1) for word in words]
Example #30
    def extract_sentiment(self, preprocessed_input):
        """Extract a sentiment rating from a line of pre-processed text.

        You should return -1 if the sentiment of the text is negative, 0 if the
        sentiment of the text is neutral (no sentiment detected), or +1 if the
        sentiment of the text is positive.

        As an optional creative extension, return -2 if the sentiment of the
        text is super negative and +2 if the sentiment of the text is super
        positive.

        Example:
          sentiment = chatbot.extract_sentiment(chatbot.preprocess(
                                                    'I liked "The Titanic"'))
          print(sentiment) // prints 1

        :param preprocessed_input: a user-supplied line of text that has been
        pre-processed with preprocess()
        :returns: a numerical value for the sentiment of the text
        """
        stemmer = PorterStemmer()
        split_input = preprocessed_input.lower().split()
        negate = 1
        count = 0
        in_quotes = False
        power = 1
        neg_list = [
            "no", "not", "rather", "couldn't", "wasn't", "didn't", "wouldn't",
            "shouldn't", "weren't", "don't", "doesn't", "haven't", "hasn't",
            "won't", "wont", "hadn't", "never", "none", "nobody", "nothing",
            "neither", "nor", "nowhere", "isn't", "can't", "cannot", "mustn't",
            "mightn't", "shan't", "without", "needn't"
        ]
        power_list = [
            "really", "reeally", "loved", "love", "hate", "hated", "terrible",
            "amazing", "fantastic", "incredible", "dreadful", "horrible",
            "horrid", "horrendous"
        ]
        for word in split_input:
            word = word.strip()
            word_no_punc = word.rstrip(",.")
            stem = stemmer.stem(word_no_punc, 0, len(word_no_punc) - 1)
            if stem.endswith('i'):
                stem = stem[:-1] + 'y'
            if word.startswith("\""):
                in_quotes = True
            if word.endswith("\""):
                in_quotes = False
                continue
            if in_quotes:
                continue
            # if word is in neg_list but ends in a comma, negate stays positive
            if word in neg_list and not word.endswith(","):
                negate = -1  # or have negate * -1
            else:
                has_comma = False
                # maybe include other punctuation?
                if word.endswith(","):
                    has_comma = True
                if self.creative:
                    if word_no_punc in power_list or stem in power_list or word.endswith(
                            "!"):
                        power = 2
                if word_no_punc in self.sentiment:
                    if self.sentiment[word_no_punc] == "pos":
                        count += 1 * negate
                    else:
                        count += -1 * negate
                elif stem in self.sentiment:
                    if self.sentiment[stem] == "pos":
                        count += 1 * negate
                    else:
                        count += -1 * negate
                if has_comma:
                    negate = 1
        if count > 0:
            return 1 * power
        elif count < 0:
            return -1 * power
        return 0
Example #31
 def __init__(self, stopwords_file):
     self.stemmer = PorterStemmer()
     self.stop_words = set(line.strip() for line in open(stopwords_file))