# Split each 5000-tweet corpus: the first 4000 tweets of each class go to
# training, the remaining tweets to testing.
train_pos = all_positive_tweets[:4000]
# BUG FIX: test_pos was never defined, so building test_x below raised a
# NameError. Mirror the negative-class split.
test_pos = all_positive_tweets[4000:]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

# Combine the positive and negative tweets into the training and testing sets.
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# Build the label column vectors: 1 for positive tweets, 0 for negative,
# stacked in the same order as train_x / test_x (positives first).
train_y = np.append(np.ones((len(train_pos), 1)),
                    np.zeros((len(train_neg), 1)),
                    axis=0)
test_y = np.append(np.ones((len(test_pos), 1)),
                   np.zeros((len(test_neg), 1)),
                   axis=0)

# Build the frequency dictionary mapping (word, label) -> count of how often
# each processed word occurs in positive (1.0) vs negative (0.0) tweets.
freqs = build_freqs(train_x, train_y)
# process_tweet performs the preprocessing pipeline:
#   - tokenizing
#   - lowercasing
#   - removing stop words & punctuation
#   - stemming

print('This is an example of a positive tweet: \n', train_x[0])
print('\nThis is an example of the processed version of the tweet: \n',
      process_tweet(train_x[0]))
import math


def sigmoid(z):
    """Return the logistic sigmoid of *z*: 1 / (1 + e^(-z)).

    Works element-wise on scalars or NumPy arrays; output lies in (0, 1).
    """
    return 1.0 / (1.0 + np.exp(-z))
# Example #2
# (notebook extraction residue: the original cell's printed output was "0")
# As shown above, each key is a 2-element tuple containing a `(word, y)` pair. The `word` is an element in a processed tweet while `y` is an integer representing the corpus: `1` for the positive tweets and `0` for the negative tweets. The value associated with this key is the number of times that word appears in the specified corpus. For example:
#
# ```
# # "followfriday" appears 25 times in the positive tweets
# ('followfriday', 1.0): 25
#
# # "shame" appears 19 times in the negative tweets
# ('shame', 0.0): 19
# ```

# Now, it is time to use the dictionary returned by the `build_freqs()` function. First, let us feed our `tweets` and `labels` lists then print a basic report:

# In[14]:

# create frequency dictionary
# NOTE(review): this rebinds the module-level `freqs` name (previously built
# from train_x/train_y) using `tweets` and `labels`, which are presumably
# defined in an earlier cell — verify they exist before running this cell.
freqs = build_freqs(tweets, labels)

# check data type — build_freqs is expected to return a dict keyed by
# (word, label) tuples with integer counts as values
print(f'type(freqs) = {type(freqs)}')

# check length of the dictionary (number of distinct (word, label) pairs)
print(f'len(freqs) = {len(freqs)}')

# Now print the frequency of each word depending on its class.

# In[15]:

# Dump the raw dictionary; mainly useful as a sanity check, not for reading.
print(freqs)

# Unfortunately, this does not help much to understand the data. It would be better to visualize this output to gain better insights.