Example #1
    def __init__(self, stopwords_io_stream=None):
        self.stemmer = PorterStemmer()

        if not stopwords_io_stream:
            stopwords_io_stream = open(Parser.STOP_WORDS_FILE, 'r')

        self.stopwords = stopwords_io_stream.read().split()
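A hedged construction sketch for the snippet above (Parser and Parser.STOP_WORDS_FILE belong to the surrounding class, which is not shown; the stop words here are made up):

import io

# Hypothetical usage: inject an in-memory stream instead of the default
# Parser.STOP_WORDS_FILE.
parser = Parser(stopwords_io_stream=io.StringIO("a an the"))
print(parser.stopwords)  # ['a', 'an', 'the']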
Example #2
def stem():
    for line in sys.stdin:
        line = line.strip('\n')
        if line:
            token = line.split('\t')[1]
            ps = PorterStemmer().stem(token, 0, len(token) - 1)
            print(line + '\t' + ps)
Example #3
def load_data(stem=True):
    ps = PorterStemmer()
    positive = []
    negative = []

    for filename in os.listdir("./NEG"):
        with open(os.path.join("./NEG", filename), 'r') as f:
            text = f.read()
        if stem:
            # note: this stems the whole document as if it were one token
            text = ps.stem(text, 0, len(text) - 1)
        negative.append(text)

    for filename in os.listdir("./POS"):
        with open(os.path.join("./POS", filename), 'r') as f:
            text = f.read()
        if stem:
            text = ps.stem(text, 0, len(text) - 1)
        positive.append(text)

    X = positive + negative
    # label 0 for positive documents, 1 for negative, one label per document
    y = [0] * len(positive) + [1] * len(negative)
    return X, y
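A minimal hypothetical call; it assumes ./POS and ./NEG directories of plain-text documents exist in the working directory:

X, y = load_data(stem=False)
print(len(X), len(y))  # equal lengths; 0 marks positive, 1 negative here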
Example #4
    def __call__(self, tweet):
        # This function takes in a single tweet (just the text part),
        # then processes/cleans the tweet and returns a list of tokens (words).
        # For example, if tweet was 'I eat', the function returns ['i', 'eat']

        # You will not need to call this function explicitly.
        # Once you initialize your vectorizer with this tokenizer,
        # then 'vectorizer.fit_transform()' will implicitly call this function to
        # extract features from the training set, which is a list of tweet texts.
        # So once you call 'fit_transform()', the '__call__' function will be applied
        # to each tweet text in the training set (a list of tweet texts).
        tweet = tweet.lower()  # 1. Lowercase all letters
        for ch in string.punctuation:
            if ch not in "@#":  # keep '@' and '#' so they can be handled per word
                tweet = tweet.replace(ch, " ")
        words = tweet.split()

        result = []
        for word in words:
            if word[0] == "@":  # 7. Replacing user references
                word = "AT_USER"
            if word[0] == "#":  # 5. Removing the hashtag marker
                word = word[1:]
            # Ignore words that don't start with an alphabetic letter
            if word and word[0].isalpha():
                if word.startswith("www.") or word.startswith(
                        "https://") or word.startswith("http://"):
                    word = "URL"
                word = PorterStemmer().stem(word, 0, len(word) - 1)  # 2. Applying stemming
                # Collapse runs of repeated letters down to at most two
                word = re.sub(r'([a-z])\1+', r'\1\1', word)
                result.append(word)

        return result
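The comment block above says the vectorizer calls __call__ once per tweet during fit_transform(). A minimal wiring sketch, assuming scikit-learn and a hypothetical Tokenizer class that carries the __call__ above:

from sklearn.feature_extraction.text import CountVectorizer

# 'Tokenizer' is hypothetical: any class exposing the __call__ shown above.
tweets = ['I eat', 'I really loooove this!']
vectorizer = CountVectorizer(tokenizer=Tokenizer())
X = vectorizer.fit_transform(tweets)  # invokes Tokenizer.__call__ on each tweet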
Example #5
def group_stems(total_count, individual_counts, occurence_per_doc):
    """Use the Porter Stemmer algorithm to take only the stems of words and then
    group them together as a single count. For instance, 'run' and 'running' might
    both be in the counts, so we reduce both to just 'run'."""
    stemmer = PorterStemmer()
    new_individual_counts = {}
    new_total_counts = Counter()
    new_occurences_per_doc = Counter()
    for file_name, counts in individual_counts.items():
        file_counts = Counter()
        for word, count in counts.items():
            word_stem = stemmer.stem(word, 0, len(word) - 1)
            file_counts[word_stem] += count
        new_individual_counts[file_name] = file_counts

    for word, count in total_count.items():
        word_stem = stemmer.stem(word, 0, len(word) - 1)
        new_total_counts[word_stem] += count

    for word, count in occurence_per_doc.items():
        word_stem = stemmer.stem(word, 0, len(word) - 1)
        new_occurences_per_doc[word_stem] += count

    print("Finished grouping words by their stems.")

    return new_total_counts, new_individual_counts, new_occurences_per_doc
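A small hypothetical run of group_stems; the input shapes (a corpus-wide Counter, a dict of per-file Counters, and a document-frequency Counter) are inferred from the loops above:

total = Counter({"running": 3, "run": 2})
per_file = {"a.txt": Counter({"running": 3}), "b.txt": Counter({"run": 2})}
per_doc = Counter({"running": 1, "run": 1})
totals, per_file_stemmed, doc_freqs = group_stems(total, per_file, per_doc)
# 'running' and 'run' share the stem 'run', so totals['run'] == 5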
Example #6
 def tokeniseText(self, doc, isFile, stemFlag):
     stemmer = PorterStemmer()
     tokens = dict()
     stopWords = self.loadStopWords()
     if isFile:
         fh = open(doc)
     else:
         fh = [doc]
     for line in fh:
         line = re.sub('(<.*>)', '', line)
         line = re.sub('[^0-9a-zA-Z]+', ' ', line)
         line = line.strip().lower()
         words = line.split()
         if stemFlag:
             for word in words:
                 if word not in stopWords:
                     word = stemmer.stem(word, 0, len(word) - 1)
                     if len(word) > 1 and word not in stopWords:
                         tokens[word] = tokens.get(word, 0) + 1
         else:
             for word in words:
                 if len(word) > 1:
                     tokens[word] = tokens.get(word, 0) + 1
     return tokens
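A hedged call sketch for tokeniseText ('indexer' stands in for an instance of the enclosing class, which must provide loadStopWords(); isFile=False treats doc as raw text):

tokens = indexer.tokeniseText("The running runner runs again", False, True)
print(tokens)  # e.g. {'run': 2, 'runner': 1, ...} depending on the stop-word list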
Example #7
def processTweet(tweet):
    # Remove HTML special entities (e.g. &amp;)
    tweet = re.sub(r'\&\w*;', '', tweet)
    #remove @username
    tweet = re.sub(r'@[^\s]+', '', tweet)
    # Remove tickers
    tweet = re.sub(r'\$\w*', '', tweet)
    # To lowercase
    tweet = tweet.lower()
    # Remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*\/\w*', '', tweet)
    # Remove hashtags
    tweet = re.sub(r'#\w*', '', tweet)
    # Remove Punctuation and split 's, 't, 've with a space for filter
    tweet = re.sub(r'[' + string.punctuation.replace('@', '') + ']+', ' ',
                   tweet)
    # Remove words with 2 or fewer letters
    tweet = re.sub(r'\b\w{1,2}\b', '', tweet)
    # Remove whitespace (including new line characters)
    tweet = re.sub(r'\s\s+', ' ', tweet)
    # Remove single space remaining at the front of the tweet.
    tweet = tweet.lstrip(' ')
    # Removing Stopwords from tweet using sklearn.feature_extraction
    split_list = tweet.split(" ")
    tweet = [
        word for word in split_list
        if word not in stop_words.ENGLISH_STOP_WORDS
    ]
    # Stemming the remaining words
    ps = PorterStemmer()
    tweet = [ps.stem(word) for word in tweet]
    tweet = ' '.join(tweet)
    return tweet
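For reference, a hypothetical call to processTweet; the exact output depends on sklearn's ENGLISH_STOP_WORDS and on the single-argument (NLTK-style) PorterStemmer this snippet assumes:

print(processTweet("@user I loved the movie!!! #cinema https://t.co/abc123"))
# -> something like 'love movi'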
Example #8
def stem(words: List[str]):
    p = PorterStemmer()
    output = []
    for word in words:
        if word.isalpha():
            output.append(p.stem(word, 0, len(word) - 1))
    return output
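A quick hypothetical run, assuming the three-argument reference PorterStemmer used by this snippet:

print(stem(["running", "flies", "1990s"]))
# e.g. ['run', 'fli'] -- '1990s' fails isalpha() and is dropped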
Example #9
 def __init__(self, stopwords_file):
     self.stemmer = PorterStemmer()
     with open(stopwords_file) as f:
         self.stop_words = set(line.strip() for line in f)
Example #10
# dic = {"joy": 0, "surprise": 0, "sad": 0, "angry": 0}


def get_text_fromcsv(filename):
    with open(filename, "r") as file:
        f = csv.reader(file)
        d = []
        next(f)  # skip the header row
        for r in f:
            label = get_label(r[10:])
            if label != "None":
                cleaned = ' '.join(clean_words(special_split(r[1])))
                # dic[label] = dic[label] + 1
                d.append(cleaned + " __label__" + label)
                # d.append(cleaned)
        return d

delimiters = ['\n', ' ', ',', '.', '?', '!', ':', ';', '#', '$', '[', ']',
              '(', ')', '-', '=', '@', '%', '&', '*', '_', '>', '<',
              '{', '}', '|', '/', '\\', '\'', '"', '\t', '+', '~',
              '^']
stop_words = load_stop_words()
porter = PorterStemmer()
data = get_text_fromcsv("combined.csv")

with open("preprocess_combined.txt", "w") as fi:
    for line in data:
        fi.write(line + "\n")
Example #11
 def __init__(self):
     self.stemmer = PorterStemmer()
Example #12
    def extract_sentiment_for_movies(self, preprocessed_input):
        """Creative Feature: Extracts the sentiments from a line of
        pre-processed text that may contain multiple movies. Note that the
        sentiments toward the movies may be different.

        You should use the same sentiment values as extract_sentiment,
        described above.
        Hint: feel free to call previously defined functions to implement this.

        Example:
          sentiments = chatbot.extract_sentiment_for_movies(
                           chatbot.preprocess(
                           'I liked both "Titanic (1997)" and "Ex Machina".'))
          print(sentiments) // prints [("Titanic (1997)", 1), ("Ex Machina", 1)]

        :param preprocessed_input: a user-supplied line of text that has been
        pre-processed with preprocess()
        :returns: a list of tuples, where the first item in the tuple is a movie
        title, and the second is the sentiment in the text toward that movie
        """
        # don't need to consider the case where some movies are in the database while the rest are not
        title_array = self.extract_titles(preprocessed_input)
        if len(title_array) == 1:
            return [(title_array[0],
                     self.extract_sentiment(preprocessed_input))]

        stemmer = PorterStemmer()
        split_input = preprocessed_input.lower().split()
        negate = 1
        num_conjunctions = 0
        count = 0
        in_quotes = False
        power = 1
        conjunctions = ['and', 'nor', 'but', 'or', 'yet']
        neg_list = [
            "no", "not", "rather", "couldn't", "wasn't", "didn't", "wouldn't",
            "shouldn't", "weren't", "don't", "doesn't", "haven't", "hasn't",
            "won't", "wont", "hadn't", "never", "none", "nobody", "nothing",
            "neither", "nowhere", "isn't", "can't", "cannot", "mustn't",
            "mightn't", "shan't", "without", "needn't"
        ]
        power_list = [
            "really", "reeally", "loved", "love", "hate", "hated", "terrible",
            "amazing", "fantastic", "incredible", "dreadful", "horrible",
            "horrid", "horrendous"
        ]
        sentiment_list = []
        for word in split_input:
            word = word.strip()
            word_no_punc = word.rstrip(",.")
            stem = stemmer.stem(word_no_punc, 0, len(word_no_punc) - 1)
            if stem.endswith('i'):
                stem = stem[:-1] + 'y'
            if word.startswith("\""):
                in_quotes = True
            if word.endswith("\""):
                in_quotes = False
                continue
            if in_quotes:
                continue
            # if word is in neg_list but ends in a comma, negate stays positive
            if word in neg_list and not word.endswith(","):
                negate = -1  # or have negate * -1
            else:
                has_comma = False
                # maybe include other punctuation?
                if word.endswith(","):
                    has_comma = True
                if self.creative:
                    if (word_no_punc in power_list or stem in power_list
                            or word.endswith("!")):
                        power = 2
                    if word_no_punc in conjunctions or stem in conjunctions:
                        if (count == 0):
                            if num_conjunctions != 0:
                                sentiment_list.append(
                                    sentiment_list[num_conjunctions - 1])
                            else:
                                sentiment_list.append(0)
                        else:
                            sentiment_list.append(count)
                        count = 0
                        num_conjunctions += 1
                if word_no_punc in self.sentiment:
                    if self.sentiment[word_no_punc] == "pos":
                        count += 1 * negate
                    else:
                        count += -1 * negate
                elif stem in self.sentiment:
                    if self.sentiment[stem] == "pos":
                        count += 1 * negate
                    else:
                        count += -1 * negate
                if has_comma:
                    negate = 1

        if (count == 0):
            sentiment_list.append(sentiment_list[num_conjunctions - 1])
        else:
            sentiment_list.append(count)

        res = []
        i = 0
        for title in title_array:
            curr_count = 0
            if sentiment_list[i] > 0:
                curr_count = 1 * power
            elif sentiment_list[i] < 0:
                curr_count = -1 * power
            res.append((title, curr_count))
            i += 1
        return res
Example #13
    def extract_sentiment(self, preprocessed_input):
        """Extract a sentiment rating from a line of pre-processed text.

        You should return -1 if the sentiment of the text is negative, 0 if the
        sentiment of the text is neutral (no sentiment detected), or +1 if the
        sentiment of the text is positive.

        As an optional creative extension, return -2 if the sentiment of the
        text is super negative and +2 if the sentiment of the text is super
        positive.

        Example:
          sentiment = chatbot.extract_sentiment(chatbot.preprocess(
                                                    'I liked "The Titanic"'))
          print(sentiment) // prints 1

        :param preprocessed_input: a user-supplied line of text that has been
        pre-processed with preprocess()
        :returns: a numerical value for the sentiment of the text
        """
        stemmer = PorterStemmer()
        split_input = preprocessed_input.lower().split()
        negate = 1
        count = 0
        in_quotes = False
        power = 1
        neg_list = [
            "no", "not", "rather", "couldn't", "wasn't", "didn't", "wouldn't",
            "shouldn't", "weren't", "don't", "doesn't", "haven't", "hasn't",
            "won't", "wont", "hadn't", "never", "none", "nobody", "nothing",
            "neither", "nor", "nowhere", "isn't", "can't", "cannot", "mustn't",
            "mightn't", "shan't", "without", "needn't"
        ]
        power_list = [
            "really", "reeally", "loved", "love", "hate", "hated", "terrible",
            "amazing", "fantastic", "incredible", "dreadful", "horrible",
            "horrid", "horrendous"
        ]
        for word in split_input:
            word = word.strip()
            word_no_punc = word.rstrip(",.")
            stem = stemmer.stem(word_no_punc, 0, len(word_no_punc) - 1)
            if stem.endswith('i'):
                stem = stem[:-1] + 'y'
            if word.startswith("\""):
                in_quotes = True
            if word.endswith("\""):
                in_quotes = False
                continue
            if in_quotes:
                continue
            # if word is in neg_list but ends in a comma, negate stays positive
            if word in neg_list and not word.endswith(","):
                negate = -1  # or have negate * -1
            else:
                has_comma = False
                # maybe include other punctuation?
                if word.endswith(","):
                    has_comma = True
                if self.creative:
                    if (word_no_punc in power_list or stem in power_list
                            or word.endswith("!")):
                        power = 2
                if word_no_punc in self.sentiment:
                    if self.sentiment[word_no_punc] == "pos":
                        count += 1 * negate
                    else:
                        count += -1 * negate
                elif stem in self.sentiment:
                    if self.sentiment[stem] == "pos":
                        count += 1 * negate
                    else:
                        count += -1 * negate
                if has_comma:
                    negate = 1
        if count > 0:
            return 1 * power
        elif count < 0:
            return -1 * power
        return 0
Example #14
 def tokenise(self, string):
     """ break string up into tokens and stem words """
     stemmer = PorterStemmer()
     string = self.clean(string)
     words = string.split(" ")
     return [stemmer.stem(word, 0, len(word) - 1) for word in words]
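A hedged usage sketch ('index' stands in for an instance of the enclosing class; its clean() method is assumed to lowercase and strip punctuation):

print(index.tokenise("Running quickly"))
# e.g. ['run', 'quickli'] under the reference Porter stemmer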
Example #15
 def __init__(self, ngrams=1):
     self.stemmer = PorterStemmer()
     self.ngrams = ngrams
     print(' -- initializing tokenizer with maximum ngram = %d' % ngrams)