# Split the corpus: the first 4000 tweets of each class go to training, the
# remainder to testing.
train_pos = all_positive_tweets[:4000]
test_pos = all_positive_tweets[4000:]  # FIX: was missing — test_x referenced an undefined name
train_neg = all_negative_tweets[:4000]
test_neg = all_negative_tweets[4000:]

# Combine the positive and negative tweets into the training and testing sets
# (positives first, so the label vectors below line up element-by-element).
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# Label column vectors: 1.0 for each positive tweet, 0.0 for each negative,
# stacked in the same order as train_x / test_x.
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

# Frequency dictionary: maps each (word, label) pair to the number of times
# that word occurs in tweets of that label.
freqs = build_freqs(train_x, train_y)

# process_tweet applies the preprocessing pipeline:
#   - tokenizing
#   - lowercasing
#   - removing stop words & punctuation
#   - stemming
print('This is an example of a positive tweet: \n', train_x[0])
print('\nThis is an example of the processed version of the tweet: \n', process_tweet(train_x[0]))

import math  # NOTE(review): unused in this chunk (sigmoid uses np.exp) — kept in case a later chunk needs it


def sigmoid(z):
    """Return the logistic sigmoid 1 / (1 + exp(-z)).

    z may be a scalar or a NumPy array; the result has the same shape,
    with every value in (0, 1).
    """
    return 1 / (1 + np.exp(-z))
# As shown above, each key is a 2-element tuple containing a `(word, y)` pair.
# The `word` is an element in a processed tweet while `y` is an integer
# representing the corpus: `1` for the positive tweets and `0` for the
# negative tweets. The value associated with this key is the number of times
# that word appears in the specified corpus. For example:
#
# ```
# # "followfriday" appears 25 times in the positive tweets
# ('followfriday', 1.0): 25
#
# # "shame" appears 19 times in the negative tweets
# ('shame', 0.0): 19
# ```
#
# Now, it is time to use the dictionary returned by the `build_freqs()`
# function. First, let us feed our `tweets` and `labels` lists then print a
# basic report:

# In[14]:

# Build the (word, label) -> count frequency dictionary.
freqs = build_freqs(tweets, labels)

# Basic report: container type and number of distinct (word, label) keys.
print(f'type(freqs) = {type(freqs)}')
print(f'len(freqs) = {len(freqs)}')

# Now print the frequency of each word depending on its class.

# In[15]:

print(freqs)

# Unfortunately, this does not help much to understand the data. It would be
# better to visualize this output to gain better insights.