def __init__(self, stopwords_io_stream=None):
    """Initialize the parser with a Porter stemmer and a stop-word list.

    :param stopwords_io_stream: optional readable stream containing
        whitespace-separated stop words; when None, the file named by
        Parser.STOP_WORDS_FILE is read instead.
    """
    self.stemmer = PorterStemmer()
    if stopwords_io_stream:
        # Caller supplied the stream, so the caller owns its lifetime.
        self.stopwords = stopwords_io_stream.read().split()
    else:
        # We open the default file ourselves, so close it deterministically
        # (the original left the handle open for the GC to reclaim).
        with open(Parser.STOP_WORDS_FILE, 'r') as fh:
            self.stopwords = fh.read().split()
def stem(): for line in sys.stdin: line = line.strip('\n') if line: token = line.split('\t')[1] ps = PorterStemmer().stem(token, 0, len(token) - 1) print line + '\t' + ps
def load_data(stem=True):
    """Load the ./POS and ./NEG review corpora into one labelled dataset.

    :param stem: when True, each document is run through the Porter
        stemmer before being added.
    :returns: (X, y) where X lists the documents (positives first) and
        y holds 0 for each positive and 1 for each negative example.
    """
    ps = PorterStemmer()

    def _read_dir(path):
        # Read every file under *path*, optionally stemming its contents.
        docs = []
        for entry in os.listdir(os.fsencode(path)):
            filename = os.fsdecode(entry)
            # Close each file deterministically — the original leaked
            # every handle it opened.
            with open(os.path.join(path, filename), 'r') as f:
                text = f.read()
            if stem:
                text = ps.stem(text, 0, len(text) - 1)
            docs.append(text)
        return docs

    negative = _read_dir("./NEG")
    positive = _read_dir("./POS")
    # Label vectors follow the actual corpus sizes; the original
    # hard-coded 1000 of each, which desyncs X and y whenever the
    # directories hold a different number of files.
    X = positive + negative
    y = [0] * len(positive) + [1] * len(negative)
    return X, y
def __call__(self, tweet):
    """Process/clean a single tweet text into a list of tokens (words).

    For example, if tweet was 'I eat', the function returns ['i', 'eat'].
    You will not need to call this function explicitly: once a vectorizer
    is initialized with this tokenizer, vectorizer.fit_transform() will
    implicitly apply it to each tweet text in the training set.
    """
    # 1. Lowercase all letters.  The original called tweet.lower()
    # without assigning the result back, so lowercasing never happened.
    tweet = tweet.lower()
    # Replace every punctuation character with a space.
    for ch in string.punctuation:
        tweet = tweet.replace(ch, " ")
    result = []
    for word in tweet.split():
        # NOTE(review): '@' and '#' are themselves in string.punctuation,
        # so after the strip above these two branches can only fire if the
        # punctuation table is ever customized — kept for that case.
        if word[0] == "@":  # 7. Removing user references
            word = "AT_USER"
        if word[0] == "#":  # 5. Removing hashtags
            # The original did word[0] = ... (TypeError: str is immutable)
            # and compared against the two-character literal "\#".
            word = word.lstrip("#")
        # 6. Ignore words that don't start with an alphabetic letter.
        if word and word[0].isalpha():
            # NOTE(review): ':' and '/' were stripped above, so these
            # prefixes can only match bare 'www.'-style leftovers.
            if word.startswith(("www.", "https://", "http://")):
                word = "URL"
            # 2. Applying stemming
            word = PorterStemmer().stem(word, 0, len(word) - 1)
            # Collapse runs of a repeated letter down to exactly two.
            word = re.sub(r'([a-z])\1+', r'\1\1', word)
            result.append(word)
    return result
def group_stems(total_count, individual_counts, occurence_per_doc): """Use the Porter Stemmer algorithm to take only the stems of words and then group them together as a single count. For instance, run and running might both be in the counts, hence we reduce this to just run.""" stemmer = PorterStemmer() new_individual_counts = {} new_total_counts = Counter() new_occurences_per_doc = Counter() for file_name, counts in individual_counts.iteritems(): file_counts = Counter() for word, count in counts.iteritems(): word_stem = stemmer.stem(word, 0, len(word) - 1) file_counts[word_stem] += count new_individual_counts[file_name] = file_counts for word, count in total_count.iteritems(): word_stem = stemmer.stem(word, 0, len(word) - 1) new_total_counts[word_stem] += count for word, count in occurence_per_doc.iteritems(): word_stem = stemmer.stem(word, 0, len(word) -1) new_occurences_per_doc[word_stem] += count print "Finished grouping words by their stems." return new_total_counts, new_individual_counts, new_occurences_per_doc
def tokeniseText(self, doc, isFile, stemFlag):
    """Tokenise a document into a {token: frequency} dictionary.

    :param doc: a file path (when isFile is True) or a raw text string.
    :param isFile: whether *doc* names a file to read line by line.
    :param stemFlag: when True, stop words are dropped and the remaining
        words are Porter-stemmed before counting.
    :returns: dict mapping token -> occurrence count; tokens of length
        <= 1 are ignored.
    """
    stemmer = PorterStemmer()
    tokens = dict()
    stopWords = self.loadStopWords()
    # Normalise both input forms to an iterable of lines.
    if isFile is True:
        fh = open(doc)
    else:
        fh = [doc]
    try:
        for line in fh:
            # Strip markup, then keep only alphanumeric characters.
            line = re.sub('(<.*>)', '', line)
            line = re.sub('[^0-9a-zA-Z]+', ' ', line)
            words = line.strip().lower().split()
            if stemFlag is True:
                for word in words:
                    if word not in stopWords:
                        word = stemmer.stem(word, 0, len(word) - 1)
                        # Re-check: the stem itself may be a stop word.
                        if len(word) > 1 and word not in stopWords:
                            tokens[word] = tokens.get(word, 0) + 1
            else:
                for word in words:
                    if len(word) > 1:
                        tokens[word] = tokens.get(word, 0) + 1
    finally:
        # The original leaked the handle whenever isFile was True.
        if isFile is True:
            fh.close()
    return tokens
def processTweet(tweet):
    """Normalize a raw tweet into a cleaned, stemmed, space-joined string.

    Strips HTML entities, @usernames, tickers, hyperlinks, hashtags,
    punctuation, words of one or two letters, extra whitespace and
    English stop words, then Porter-stems what remains.
    """
    # Remove HTML special entities (e.g. &amp;)
    tweet = re.sub(r'\&\w*;', '', tweet)
    # Remove @username mentions
    tweet = re.sub('@[^\s]+', '', tweet)
    # Remove tickers ($XYZ)
    tweet = re.sub(r'\$\w*', '', tweet)
    # To lowercase
    tweet = tweet.lower()
    # Remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*\/\w*', '', tweet)
    # Remove hashtags
    tweet = re.sub(r'#\w*', '', tweet)
    # Punctuation (except '@') -> space, so 's / 't / 've split off
    tweet = re.sub(r'[' + string.punctuation.replace('@', '') + ']+', ' ',
                   tweet)
    # Remove words with 2 or fewer letters
    tweet = re.sub(r'\b\w{1,2}\b', '', tweet)
    # Collapse runs of whitespace (including newlines) to single spaces
    tweet = re.sub(r'\s\s+', ' ', tweet)
    # Drop a single leading space, if any
    tweet = tweet.lstrip(' ')
    # Drop sklearn's English stop words, then Porter-stem the survivors.
    stemmer = PorterStemmer()
    stemmed = [
        stemmer.stem(token)
        for token in tweet.split(" ")
        if token not in stop_words.ENGLISH_STOP_WORDS
    ]
    return ' '.join(stemmed)
def stem(words: List[str]):
    """Return the Porter stems of the purely alphabetic words in *words*.

    Entries containing any non-alphabetic character are silently dropped.
    """
    stemmer = PorterStemmer()
    return [
        stemmer.stem(word, 0, len(word) - 1)
        for word in words
        if word.isalpha()
    ]
def __init__(self, stopwords_file):
    """Initialize with a Porter stemmer and a stop-word set.

    :param stopwords_file: path to a text file holding one stop word
        per line (surrounding whitespace is stripped).
    """
    self.stemmer = PorterStemmer()
    # Read inside a context manager — the original left the file handle
    # open for the garbage collector to reclaim.
    with open(stopwords_file) as fh:
        self.stop_words = set(line.strip() for line in fh)
# dic = {"joy": 0, "surprise": 0, "sad": 0, "angry": 0}
def get_text_fromcsv(filename):
    """Read the labelled CSV and return fastText-style training lines.

    Each returned entry is the cleaned text of column 1 followed by
    ' __label__<label>', where the label is derived from columns 10+.
    Rows whose label comes back as "None" are skipped.
    """
    lines = []
    with open(filename, "r") as csv_fh:
        reader = csv.reader(csv_fh)
        next(csv_fh)  # skip the header row
        for row in reader:
            label = get_label(row[10:])
            if label == "None":
                continue
            cleaned = ' '.join(clean_words(special_split(row[1])))
            # dic[label] = dic[label] + 1
            lines.append(cleaned + " __label__" + label)
            # lines.append(cleaned)
    return lines


# Characters treated as token separators by the splitting helpers.
delimiters = ['\n', ' ', ',', '.', '?', '!', ':', ';', '#', '$', '[', ']',
              '(', ')', '-', '=', '@', '%', '&', '*', '_', '>', '<', '{',
              '}', '|', '/', '\\', '\'', '"', '\t', '+', '~', '^']
stop_words = load_stop_words()
porter = PorterStemmer()

# Convert the combined CSV and dump one training line per row.
data = get_text_fromcsv("combined.csv")
with open("preprocess_combined.txt", "w") as out_fh:
    for line in data:
        out_fh.write(line + "\n")
def __init__(self):
    """Initialize with a single shared Porter stemmer instance."""
    self.stemmer = PorterStemmer()
def extract_sentiment_for_movies(self, preprocessed_input):
    """Creative Feature: Extracts the sentiments from a line of
    pre-processed text that may contain multiple movies. Note that
    the sentiments toward the movies may be different.

    You should use the same sentiment values as extract_sentiment,
    described above. Hint: feel free to call previously defined
    functions to implement this.

    Example:
        sentiments = chatbot.extract_sentiment_for_text(
                        chatbot.preprocess(
                        'I liked both "Titanic (1997)" and "Ex Machina".'))
        print(sentiments) // prints [("Titanic (1997)", 1), ("Ex Machina", 1)]

    :param preprocessed_input: a user-supplied line of text that has
    been pre-processed with preprocess()
    :returns: a list of tuples, where the first item in the tuple is a
    movie title, and the second is the sentiment in the text toward that
    movie
    """
    #don't need to consider the case where some movies are in the database while the rest are not
    title_array = self.extract_titles(preprocessed_input)
    # Single-title input: delegate to the single-sentiment extractor.
    if len(title_array) == 1:
        return [(title_array[0], self.extract_sentiment(preprocessed_input))]
    stemmer = PorterStemmer()
    split_input = preprocessed_input.lower().split()
    negate = 1            # -1 while a negation word is in effect (until a comma)
    num_conjunctions = 0  # how many clause boundaries seen so far
    count = 0             # signed sentiment tally for the current clause
    in_quotes = False     # True while scanning inside a quoted movie title
    power = 1             # bumped to 2 by intensifiers/'!' in creative mode
    conjunctions = ['and', 'nor', 'but', 'or', 'yet']
    neg_list = [
        "no", "not", "rather", "couldn't", "wasn't", "didn't", "wouldn't",
        "shouldn't", "weren't", "don't", "doesn't", "haven't", "hasn't",
        "won't", "wont", "hadn't", "never", "none", "nobody", "nothing",
        "neither", "nowhere", "isn't", "can't", "cannot", "mustn't",
        "mightn't", "shan't", "without", "needn't"
    ]
    power_list = [
        "really", "reeally", "loved", "love", "hate", "hated", "terrible",
        "amazing", "fantastic", "incredible", "dreadful", "horrible",
        "horrid", "horrendous"
    ]
    # One entry per clause: the signed sentiment tally for that clause.
    sentiment_list = []
    for word in split_input:
        word = word.strip()
        word_no_punc = word.rstrip(",.")
        stem = stemmer.stem(word_no_punc, 0, len(word_no_punc) - 1)
        # Undo Porter's trailing y -> i rewrite (e.g. 'happi') so the
        # stem can match lexicon entries again.
        if stem.endswith('i'):
            stem = stem[:-1] + 'y'
        # Skip all words inside a quoted movie title.
        if word.startswith("\""):
            in_quotes = True
        if word.endswith("\""):
            in_quotes = False
            continue
        if in_quotes:
            continue
        if word in neg_list and not word.endswith(
                ","
        ):  # if word in neg_list but ends in comma, negate would be positive
            negate = -1  # or have negate * -1
        else:
            has_comma = False  # maybe include other punctuation?
            if word.endswith(","):
                has_comma = True
            if self.creative:
                # Intensifier word or exclamation doubles the magnitude.
                if word_no_punc in power_list or stem in power_list or word.endswith(
                        "!"):
                    power = 2
            if word_no_punc in conjunctions or stem in conjunctions:
                # Clause boundary: record the tally for the finished clause.
                if (count == 0):
                    if num_conjunctions != 0:
                        # No sentiment words seen: inherit the previous
                        # clause's sentiment (e.g. "... and X too").
                        sentiment_list.append(
                            sentiment_list[num_conjunctions - 1])
                    else:
                        sentiment_list.append(0)
                else:
                    sentiment_list.append(count)
                count = 0
                num_conjunctions += 1
            # Accumulate lexicon sentiment, trying the raw word first
            # and falling back to its stem.
            if word_no_punc in self.sentiment:
                if self.sentiment[word_no_punc] == "pos":
                    count += 1 * negate
                else:
                    count += -1 * negate
            elif stem in self.sentiment:
                if self.sentiment[stem] == "pos":
                    count += 1 * negate
                else:
                    count += -1 * negate
            # A comma ends the scope of any active negation.
            if has_comma:
                negate = 1
    # Flush the final clause (inheriting the previous one when empty).
    if (count == 0):
        sentiment_list.append(sentiment_list[num_conjunctions - 1])
    else:
        sentiment_list.append(count)
    # Pair each title with the sign of its clause tally, scaled by power.
    res = []
    i = 0
    for title in title_array:
        curr_count = 0
        if sentiment_list[i] > 0:
            curr_count = 1 * power
        elif sentiment_list[i] < 0:
            curr_count = -1 * power
        res.append((title, curr_count))
        i += 1
    return res
def extract_sentiment(self, preprocessed_input):
    """Extract a sentiment rating from a line of pre-processed text.

    You should return -1 if the sentiment of the text is negative, 0 if
    the sentiment of the text is neutral (no sentiment detected), or +1
    if the sentiment of the text is positive.

    As an optional creative extension, return -2 if the sentiment of the
    text is super negative and +2 if the sentiment of the text is super
    positive.

    Example:
        sentiment = chatbot.extract_sentiment(chatbot.preprocess(
                                                'I liked "The Titanic"'))
        print(sentiment) // prints 1

    :param preprocessed_input: a user-supplied line of text that has
    been pre-processed with preprocess()
    :returns: a numerical value for the sentiment of the text
    """
    stemmer = PorterStemmer()
    split_input = preprocessed_input.lower().split()
    negate = 1          # -1 while a negation word is in effect (until a comma)
    count = 0           # running signed tally of sentiment words
    in_quotes = False   # True while scanning inside a quoted movie title
    power = 1           # bumped to 2 by intensifiers/'!' in creative mode
    neg_list = [
        "no", "not", "rather", "couldn't", "wasn't", "didn't", "wouldn't",
        "shouldn't", "weren't", "don't", "doesn't", "haven't", "hasn't",
        "won't", "wont", "hadn't", "never", "none", "nobody", "nothing",
        "neither", "nor", "nowhere", "isn't", "can't", "cannot", "mustn't",
        "mightn't", "shan't", "without", "needn't"
    ]
    power_list = [
        "really", "reeally", "loved", "love", "hate", "hated", "terrible",
        "amazing", "fantastic", "incredible", "dreadful", "horrible",
        "horrid", "horrendous"
    ]
    for word in split_input:
        word = word.strip()
        word_no_punc = word.rstrip(",.")
        stem = stemmer.stem(word_no_punc, 0, len(word_no_punc) - 1)
        # Undo Porter's trailing y -> i rewrite (e.g. 'happi') so the
        # stem can match lexicon entries again.
        if stem.endswith('i'):
            stem = stem[:-1] + 'y'
        # Skip all words inside a quoted movie title.
        if word.startswith("\""):
            in_quotes = True
        if word.endswith("\""):
            in_quotes = False
            continue
        if in_quotes:
            continue
        if word in neg_list and not word.endswith(
                ","
        ):  # if word in neg_list but ends in comma, negate would be positive
            negate = -1  # or have negate * -1
        else:
            has_comma = False  # maybe include other punctuation?
            if word.endswith(","):
                has_comma = True
            if self.creative:
                # Intensifier word or exclamation doubles the magnitude.
                if word_no_punc in power_list or stem in power_list or word.endswith(
                        "!"):
                    power = 2
            # Accumulate lexicon sentiment, trying the raw word first
            # and falling back to its stem.
            if word_no_punc in self.sentiment:
                if self.sentiment[word_no_punc] == "pos":
                    count += 1 * negate
                else:
                    count += -1 * negate
            elif stem in self.sentiment:
                if self.sentiment[stem] == "pos":
                    count += 1 * negate
                else:
                    count += -1 * negate
            # A comma ends the scope of any active negation.
            if has_comma:
                negate = 1
    # Only the sign of the tally matters; power scales it to +/-2 in
    # creative mode.
    if count > 0:
        return 1 * power
    elif count < 0:
        return -1 * power
    return 0
def tokenise(self, string):
    """Break *string* into tokens and stem each word.

    The input is first passed through self.clean(), split on single
    spaces, and every resulting word is Porter-stemmed.
    """
    porter = PorterStemmer()
    cleaned = self.clean(string)
    tokens = []
    for token in cleaned.split(" "):
        tokens.append(porter.stem(token, 0, len(token) - 1))
    return tokens
def __init__(self, ngrams=1):
    """Initialize the tokenizer.

    :param ngrams: maximum n-gram length to produce (default 1).
    """
    self.stemmer = PorterStemmer()
    self.ngrams = ngrams
    # Python 2 print statement; announces the configured maximum n-gram.
    print ' -- initializing tokenizer with maximum ngram = %d' % ngrams