Example #1
def getSentimentAnalyzer():
    stop_words = stopwords.words('english')
    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)

    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)
    train_data = dataset
    classifier = NaiveBayesClassifier.train(train_data)
    return classifier
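
Note: the examples on this page all call helper functions such as remove_noise, get_all_words and get_tweets_for_model that are defined elsewhere in their source repositories and omitted from the snippets. The sketch below is an assumption of what those helpers typically look like, modeled on the widely used NLTK twitter_samples sentiment tutorial; the actual implementations behind each example may differ.

import re
import string

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag

# Requires the nltk data packages punkt, averaged_perceptron_tagger, wordnet and
# stopwords (see Example #8 for one way to download them).

def remove_noise(tweet_tokens, stop_words=()):
    """Drop URLs, @-mentions, punctuation and stop words; lemmatize what remains."""
    cleaned_tokens = []
    lemmatizer = WordNetLemmatizer()
    for token, tag in pos_tag(tweet_tokens):
        token = re.sub(r'https?://\S+|www\.\S+', '', token)
        token = re.sub(r'@[A-Za-z0-9_]+', '', token)

        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = lemmatizer.lemmatize(token, pos)

        if token and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

def get_all_words(cleaned_tokens_list):
    """Flatten a list of token lists into a single stream of words (input for FreqDist)."""
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

def get_tweets_for_model(cleaned_tokens_list):
    """Convert each token list into the {token: True} feature dict that NaiveBayesClassifier expects."""
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)
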
Example #2
def startAnalysis():
    # tokenize positive_tweets
    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens))
    
    pos_tweet_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    pos_dataset = [(tweet_dict,"Positive") for tweet_dict in pos_tweet_for_model]
    neg_tweet_for_model = get_tweets_for_model(negative_cleaned_tokens_list)
    neg_dataset = [(tweet_dict,"Negative") for tweet_dict in neg_tweet_for_model]
  
    dataset = pos_dataset + neg_dataset
    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data= dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)
    print("Accuracy is:", classify.accuracy(classifier, test_data))
    print(classifier.show_most_informative_features(10))
Example #3
    def train(self):
        positive_tweet_tokens = twitter_samples.tokenized(
            "positive_tweets.json")
        negative_tweet_tokens = twitter_samples.tokenized(
            "negative_tweets.json")

        positive_cleaned_tokens_list = []
        negative_cleaned_tokens_list = []

        for tokens in positive_tweet_tokens:
            positive_cleaned_tokens_list.append(
                self.clear_data(tokens, self.stop_words))

        for tokens in negative_tweet_tokens:
            negative_cleaned_tokens_list.append(
                self.clear_data(tokens, self.stop_words))

        positive_tokens_for_model = self.get_tweets_for_model(
            positive_cleaned_tokens_list)
        negative_tokens_for_model = self.get_tweets_for_model(
            negative_cleaned_tokens_list)

        positive_dataset = [(tweet_dict, "Positive")
                            for tweet_dict in positive_tokens_for_model]

        negative_dataset = [(tweet_dict, "Negative")
                            for tweet_dict in negative_tokens_for_model]

        dataset = positive_dataset + negative_dataset

        random.shuffle(dataset)

        return NaiveBayesClassifier.train(dataset)
Example #4
    def fit(self, dataset=None):
        """
        This method initializes training for the model using the sample tweets bundled with the NLTK corpora.
        :param dataset:
        :return: model object
        """
        positive_tweet_tokens = twitter_samples.tokenized(
            'positive_tweets.json')
        negative_tweet_tokens = twitter_samples.tokenized(
            'negative_tweets.json')

        positive_cleaned_tokens_list = []
        negative_cleaned_tokens_list = []

        for tokens in positive_tweet_tokens:
            positive_cleaned_tokens_list.append(cleanData(tokens, stop_words))

        for tokens in negative_tweet_tokens:
            negative_cleaned_tokens_list.append(cleanData(tokens, stop_words))

        positive_tokens_for_model = get_tweets_for_model(
            positive_cleaned_tokens_list)
        negative_tokens_for_model = get_tweets_for_model(
            negative_cleaned_tokens_list)
        positive_dataset = [(tweet_dict, "Positive")
                            for tweet_dict in positive_tokens_for_model]

        negative_dataset = [(tweet_dict, "Negative")
                            for tweet_dict in negative_tokens_for_model]

        dataset = positive_dataset + negative_dataset

        random.shuffle(dataset)
        self.classifier = nltk.NaiveBayesClassifier.train(dataset)
        return self
Example #5
 def __init__(self):
     print("normalizing twitter samples...")
     self.stop_words = stopwords.words('english')
     self.positive_tweet_tokens = twitter_samples.tokenized(
         'positive_tweets.json')
     self.negative_tweet_tokens = twitter_samples.tokenized(
         'negative_tweets.json')
Example #6
def getSentimentAnalyzer():
    stop_words = stopwords.words('english')
    posTokens = twitter_samples.tokenized('positive_tweets.json')
    negTokens = twitter_samples.tokenized('negative_tweets.json')

    posCleanedTokens = []
    negCleanedTokens = []

    for tokens in posTokens:
        posCleanedTokens.append(removeNoise(tokens, stop_words))

    for tokens in negTokens:
        negCleanedTokens.append(removeNoise(tokens, stop_words))

    allPosWords = get_all_words(posCleanedTokens)

    freqDistPos = FreqDist(allPosWords)
    print(freqDistPos.most_common(10))

    posTokensModel = get_tweets_for_model(posCleanedTokens)
    negTokensModel = get_tweets_for_model(negCleanedTokens)

    posData = [(tweet_dict, "Positive") for tweet_dict in posTokensModel]

    negData = [(tweet_dict, "Negative") for tweet_dict in negTokensModel]

    dataset = posData + negData
    random.shuffle(dataset)
    train_data = dataset
    classifier = NaiveBayesClassifier.train(train_data)
    return classifier
Example #7
    def preprocess_data(self):
        self.pos_tweets = twitter_samples.strings('positive_tweets.json')
        self.neg_tweets = twitter_samples.strings('negative_tweets.json')
        self.text = twitter_samples.strings('tweets.20150430-223406.json')
        self.pos_tweet_tokens = twitter_samples.tokenized(
            'positive_tweets.json')
        self.neg_tweet_tokens = twitter_samples.tokenized(
            'negative_tweets.json')
        self.pos_cleaned_tokens_list = [
            self.remove_noise(tokens) for tokens in self.pos_tweet_tokens
        ]
        self.neg_cleaned_tokens_list = [
            self.remove_noise(tokens) for tokens in self.neg_tweet_tokens
        ]
        self.all_pos_words = self.get_all_words(self.pos_cleaned_tokens_list)
        self.all_neg_words = self.get_all_words(self.neg_cleaned_tokens_list)
        self.freq_dist_pos = FreqDist(self.all_pos_words)
        self.freq_dist_neg = FreqDist(self.all_neg_words)
        self.pos_tokens_for_model = self.get_tweets_for_model(
            self.pos_cleaned_tokens_list)
        self.neg_tokens_for_model = self.get_tweets_for_model(
            self.neg_cleaned_tokens_list)

        self.pos_dataset = [(tweet_dict, "Positive")
                            for tweet_dict in self.pos_tokens_for_model]
        self.neg_dataset = [(tweet_dict, "Negative")
                            for tweet_dict in self.neg_tokens_for_model]
        self.dataset = self.pos_dataset + self.neg_dataset
        random.shuffle(self.dataset)
        mid = len(self.dataset) // 2
        self.train_data = self.dataset[:mid]
        self.test_data = self.dataset[mid:]
Example #8
def prepare_sentiment_classifier():
    if(not os.path.exists('./.venv/nltk_data')):
        os.makedirs('./.venv/nltk_data')
        nltk.download('twitter_samples', './.venv/nltk_data/')
        nltk.download('punkt', './.venv/nltk_data/')
        nltk.download('averaged_perceptron_tagger','./.venv/nltk_data/')
        nltk.download('wordnet','./.venv/nltk_data/')
        nltk.download('stopwords','./.venv/nltk_data/')

    # Normalization - conversion to canonical form

    # Determine the lexical (POS) tags of the tweets and lemmatize (strip past-tense forms, endings, etc.),
    # then remove noise and stop words
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    # List the most used positive words
    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    # Prepare a dictionary for Bayes
    positive_tokens_for_model = prepare_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = prepare_tweets_for_model(negative_cleaned_tokens_list)

    negative_sentiment_value = -10
    positive_sentiment_value = 10
    positive_dataset = [(tweet_dict, positive_sentiment_value)
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, negative_sentiment_value)
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    #Mix the data to avoid bias
    random.shuffle(dataset)

    train_data = dataset[:7000]

    #The rest 3k of 10k tweets are for testing
    test_data = dataset[7000:]
    #Train the model
    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))

    #print(classifier.show_most_informative_features(10))
    return classifier
Example #9
    def __init__(self):
        """ This class trains the data on 10000 tweets """

        self.stop_words = stopwords.words('english')
        self.positive_cleaned_tokens_list = []
        self.negative_cleaned_tokens_list = []
        self.positive_tweets_tokens = twitter_samples.tokenized('positive_tweets.json')
        self.negative_tweets_tokens = twitter_samples.tokenized('negative_tweets.json')
Example #10
 def __init__(self):
     self.stop_words = stopwords.words('english')
     self.positive_cleaned_tokens_list = []
     self.negative_cleaned_tokens_list = []
     self.positive_tweets_tokens = twitter_samples.tokenized(
         'positive_tweets.json')
     self.negative_tweets_tokens = twitter_samples.tokenized(
         'negative_tweets.json')
Example #11
 def assign_cleans(self):
     print('Training AI...')
     print('Learning Positive Tweets')
     for tokens in twitter_samples.tokenized('positive_tweets.json'):
         self.clean_positive.append(remove_noise(tokens))
     print('Learning Negative Tweets')
     for tokens in twitter_samples.tokenized('negative_tweets.json'):
         self.clean_negative.append(remove_noise(tokens))
Example #12
    def load_raw_training_data(self):
        # Load the built-in NLTK training data. Use other data sets if necessary
        # to improve accuracy.
        positive_data_tokenized = twitter_samples.tokenized(
            'positive_tweets.json')
        negative_data_tokenized = twitter_samples.tokenized(
            'negative_tweets.json')

        return positive_data_tokenized, negative_data_tokenized
Example #13
def driver():

    # String variables of the dataset
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []
    text = twitter_samples.strings('tweets.20150430-223406.json')
    stop_words = stopwords.words('english')

    # Tokenized variables of dataset
    # tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    # Cleaning the noise in the tweet tokens
    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    # Frequency distribution for cleaned words
    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)

    # Creating positive and negative dictionaries
    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)

    # Creating the final dataset for training
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    # Shuffling for pseudo-randomness to avoid bias
    random.shuffle(dataset)

    # Dividing shuffled data into train and test data
    train_data = dataset[:7000]
    test_data = dataset[7000:]

    # Initializing the classifier
    classifier = NaiveBayesClassifier.train(train_data)

    custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

    custom_tokens = remove_noise(word_tokenize(custom_tweet))

    print(custom_tweet, classifier.classify(
        dict([token, True] for token in custom_tokens)))
Example #14
def train_model():
    """
    Trains a Naive Bayes sentiment classifier using the twitter_samples
    dataset from NLTK. Each tweet is tokenized and cleaned to produce a training
    dataset for the machine learning model.
    Parameters
    ----------
    Returns
    -------
    NaiveBayesClassifier
    """
    #Load dataset from nltk data
    positive_tweets = twitter_samples.strings("positive_tweets.json")
    negative_tweets = twitter_samples.strings("negative_tweets.json")

    #Retrieve english stop words
    stop_words = stopwords.words("english")

    #Tweet tokenization
    positive_tweet_tokens = twitter_samples.tokenized("positive_tweets.json")
    negative_tweet_tokens = twitter_samples.tokenized("negative_tweets.json")

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    #Token cleaning
    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    #Extract words from tokens
    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    #Frequency distribution of words
    freq_dist_pos = FreqDist(all_pos_words)

    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)

    #Create datasets
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]
    #Merge individual datasets into singular training data
    dataset = positive_dataset + negative_dataset

    train_data = dataset

    classifier = NaiveBayesClassifier.train(train_data)
    return classifier
Example #15
def main_code(analysis_input):

    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    # print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)

    # print("Accuracy is:", classify.accuracy(classifier, test_data))

    # print(classifier.show_most_informative_features(10))

    custom_tokens = remove_noise(word_tokenize(analysis_input))

    pos_or_neg = str(
        classifier.classify(dict([token, True] for token in custom_tokens)))

    return pos_or_neg
Example #16
def create_model():
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    # print(positive_tweet_tokens)
    # print(positive_cleaned_tokens_list)

    #FINDING WORD DISTRIBUTION
    # all_pos_words = get_all_words(positive_cleaned_tokens_list)
    # freq_dist_pos = FreqDist(all_pos_words)
    # print(freq_dist_pos.most_common(10))

    # MODEL PREPARATION
    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)
    # code attaches a Positive or Negative label to each tweet
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    # print(positive_dataset)
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]
    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)
    # number of tweets is 10000 -- ratio of 70:30 for training and testing
    train_data = dataset[:7000]
    test_data = dataset[7000:]

    # MODEL TRAINING
    classifier = NaiveBayesClassifier.train(train_data)
    # MODEL SAVING WITH PICKLE
    filename = 'model/model_pickle1.sav'
    pickle.dump(classifier, open(filename, 'wb'))
    # MODEL SAVING WITH cPICKLE
    filename = 'model/model_cpickle1.sav'
    cPickle.dump(classifier, open(filename, 'wb'))
    # MODEL SAVING WITH JOBLIB
    filename = 'model/model_joblib1.sav'
    joblib.dump(classifier, filename)

    # MODEL ACCURACY
    print("Accuracy is:", classify.accuracy(classifier, test_data))
    print(classifier.show_most_informative_features(10))
Example #17
def calibrate():
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized(
        'positive_tweets.json'
    )  #files downloaded from setup.py used to calibrate the classifier for sentiment analysis
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    #print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)

    positive_dataset = [
        (tweet_dict, "Positive")  #calibrating positive
        for tweet_dict in positive_tokens_for_model
    ]

    negative_dataset = [
        (tweet_dict, "Negative")  #calibrating negative
        for tweet_dict in negative_tokens_for_model
    ]

    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]
    global classifier

    classifier = NaiveBayesClassifier.train(train_data)  #trains the data!
    print("Calibration complete!")

    print("Accuracy is:", classify.accuracy(classifier, test_data))
Example #18
def train_social():
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    # tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                         for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                         for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    # Classifier - TODO Add persistence
    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))

    print(classifier.show_most_informative_features(100))

    custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

    custom_tokens = remove_noise(word_tokenize(custom_tweet))

    print(custom_tweet, classifier.classify(dict([token, True] for token in custom_tokens)))
    return classifier
Example #19
def main():
    custom_input = get_custom_input()
    stop_words = stopwords.words('english')
    custom_tokens = remove_noise(word_tokenize(custom_input), stop_words)

    nltk_downloader()

    sid = SentimentIntensityAnalyzer()

    print("")
    print(
        "1: Sentiment Intensity Analysis: poitive, negative, neutral percentages"
    )
    print("Sentiment Intensity without noise removal")
    ss = sid.polarity_scores(custom_input)
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')
    print()

    print("Sentiment Intensity with noise removal")
    ss = sid.polarity_scores(" ".join(custom_tokens))
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')
    print()

    print("")
    print('2: Basic Sentiment Analysis: Only Positive or Negative')
    positive_tweets, negative_tweets, text = fetch_twitter_samples()

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = list()
    negative_cleaned_tokens_list = list()

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    freq_dist_pos = FreqDist(get_all_words(positive_cleaned_tokens_list))
    freq_dist_neg = FreqDist(get_all_words(negative_cleaned_tokens_list))

    train_data, test_data = get_train_test_data(positive_cleaned_tokens_list,
                                                negative_cleaned_tokens_list)

    classifier = get_model_classifier(train_data, test_data,
                                      NaiveBayesClassifier)

    print("Result: ",
          classifier.classify(dict([token, True] for token in custom_tokens)))
Example #20
def train():
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))

    print(classifier.show_most_informative_features(10))

    f = open('classifier.pickle', 'wb')
    pickle.dump(classifier, f)
    f.close()
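
A hypothetical usage sketch (not part of the original example) showing how the classifier pickled above might later be reloaded and applied to a new sentence; it assumes a remove_noise helper like the one sketched after Example #1.

import pickle

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Reload the classifier saved by train() above.
with open('classifier.pickle', 'rb') as f:
    classifier = pickle.load(f)

custom_tweet = "The delivery was late and support never answered."
custom_tokens = remove_noise(word_tokenize(custom_tweet), stopwords.words('english'))
print(classifier.classify(dict([token, True] for token in custom_tokens)))
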
Example #21
def train_twtr_classifier():
    if os.path.isfile(SAVED_CLSR_LOC):
        return load_classifier()

    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

    stop_words = stopwords.words('english')

    pos_twt_toks = twitter_samples.tokenized('positive_tweets.json')
    neg_twt_toks = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = [
        remove_noise(toks, stop_words) for toks in pos_twt_toks
    ]
    negative_cleaned_tokens_list = [
        remove_noise(toks, stop_words) for toks in neg_twt_toks
    ]

    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))

    save_classifier(classifier)

    return classifier
Example #22
def build_classifier():
    print('reading data')
    stop_words = stopwords.words('english')
    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    print('cleaning tokens')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    print('building freq dist...')

    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    print('training...')

    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))

    print(classifier.show_most_informative_features(10))

    return classifier
Example #23
    def __init__(self):
        """
        Gather data
        """
        positive = twitter_samples.strings('positive_tweets.json')
        negative = twitter_samples.strings('negative_tweets.json')
        self.stop_words = list(set(stopwords.words('english')))

        positive_tokens = twitter_samples.tokenized('positive_tweets.json')
        negative_tokens = twitter_samples.tokenized('negative_tweets.json')
        """
        Clean the data
        """
        positive_clean = []
        negative_clean = []

        for token in positive_tokens:
            positive_clean.append(self.clean(token))

        for token in negative_tokens:
            negative_clean.append(self.clean(token))

        positive_model_tokens = self.final_token_generator(positive_clean)
        negative_model_tokens = self.final_token_generator(negative_clean)
        """
        Use generator to make datasets
        """
        positive_dataset = [(token, "Positive")
                            for token in positive_model_tokens]

        negative_dataset = [(token, "Negative")
                            for token in negative_model_tokens]

        dataset = positive_dataset + negative_dataset
        """
        Shake it all about
        """
        random.shuffle(dataset)
        random.shuffle(dataset)
        random.shuffle(dataset)
        """
        Split them up
        """
        training = dataset[:7000]
        testing = dataset[7000:]
        """
        Train the classifier
        """
        self.classifier = NaiveBayesClassifier.train(training)
        """
Example #24
def read_input(infile, NUM_TRAIN, NUM_TEST):
    train = []
    test = []
    pos_tweets = 0
    neg_tweets = 0
    for line in twitter_samples.tokenized("positive_tweets.json"):
        sent = "Positive"
        #Remove usernames, urls
        for i, token in enumerate(line):

            line[i] = re.sub(r"@[\S]+", "USERNAME", line[i])
            line[i] = re.sub(r"www\.[\S]+|https://[\S]+|http://[\S]+", "URL",
                             line[i])
            newstr = ""
            for ch in line[i]:
                if ord(ch) > 128:
                    newstr += "EMOJI_{0}".format(ord(ch))
                    #print [ch], ord(ch)
                else:
                    newstr += (ch)
            line[i] = newstr

        pos_tweets += 1
        if pos_tweets < NUM_TRAIN:
            train.append((line, sent))
        else:
            test.append((line, sent))

    for line in twitter_samples.tokenized("negative_tweets.json"):
        sent = "Negative"
        neg_tweets += 1
        #Remove usernames, urls
        for i, token in enumerate(line):

            line[i] = re.sub(r"@[\S]+", "USERNAME", line[i])
            line[i] = re.sub(r"www\.[\S]+|https://[\S]+|http://[\S]+", "URL", line[i])
            newstr = ""
            for ch in line[i]:
                if ord(ch) > 128:
                    newstr += "EMOJI_{0}".format(ord(ch))
                    #print [ch], ord(ch)
                else:
                    newstr += (ch)
            line[i] = newstr
        if neg_tweets < NUM_TRAIN:
            train.append((line, sent))
        else:
            test.append((line, sent))
    return test, train
Example #25
def read_input(infile, NUM_TRAIN, NUM_TEST):
	train = []
	test = []
	pos_tweets = 0
	neg_tweets = 0
	for line in twitter_samples.tokenized("positive_tweets.json"):
		sent = "Positive"
		#Remove usernames, urls
		for i,token in enumerate(line):
			
			line[i] = re.sub(r"@[\S]+", "USERNAME", line[i])
			line[i] = re.sub(r"www\.[\S]+|https://[\S]+|http://[\S]+", "URL", line[i])
			newstr = ""
			for ch in line[i]:
				if ord(ch)>128:
					newstr+= "EMOJI_{0}".format(ord(ch))
					#print [ch], ord(ch)
				else:
					newstr+=(ch)
			line[i] = newstr

		pos_tweets+=1
		if pos_tweets < NUM_TRAIN:
			train.append((line, sent))
		else:			
			test.append((line, sent))			


	for line in twitter_samples.tokenized("negative_tweets.json"):
		sent = "Negative"
		neg_tweets+=1
		#Remove usernames, urls
		for i,token in enumerate(line):

			line[i] = re.sub(r"@[\S]+", "USERNAME", line[i])
			line[i] = re.sub(r"www\.[\S]+|https://[\S]+|http://[\S]+", "URL", line[i])
			newstr = ""
			for ch in line[i]:
				if ord(ch)>128:
					newstr+= "EMOJI_{0}".format(ord(ch))
					#print [ch], ord(ch)
				else:
					newstr+=(ch)
			line[i] = newstr
		if neg_tweets < NUM_TRAIN:
			train.append((line, sent))
		else:		
			test.append((line, sent))	
	return test, train
Example #26
 def __init__(self):
     self.stop_words = stopwords.words('english')
     self.positive_cleaned_tokens_list = []
     self.negative_cleaned_tokens_list = []
     self.positive_tweets_tokens = twitter_samples.tokenized(
         'positive_tweets.json')
     self.negative_tweets_tokens = twitter_samples.tokenized(
         'negative_tweets.json')
     self.non_abusive = self.positive_tweets_tokens[:808] + self.negative_tweets_tokens[:811]
     self.abusive_words = pd.read_csv('bad-words.csv')['jigaboo']
     self.abusive = []
     for word in self.abusive_words:
         self.abusive.append(word)
Example #27
def parseTweets(isNewData, tweets, posOrNeg):
    print("inside parse")
    allCleanedTokens = []
    if (isNewData == True):
        for tweet in tweets:
            tokenizedTweet = tokenizeTweet(tweet)
            cleanedTokens = removeNoise(tokenizedTweet,
                                        stopwords.words('english'))
            allCleanedTokens.append(cleanedTokens)
            #wordsAllTweets = get_all_words(allCleanedTokens)
            #print(FreqDist(wordsAllTweets).most_common(25))
        tokensForModel = get_tweets_for_model(allCleanedTokens)

    else:
        tweets = 'positive_tweets.json' if (
            posOrNeg == "positive") else 'negative_tweets.json'
        tweet_tokens = twitter_samples.tokenized(tweets)
        for tokens in tweet_tokens:
            allCleanedTokens.append(
                removeNoise(tokens, stopwords.words('english')))

        tokensForModel = get_tweets_for_model(allCleanedTokens)
        #wordsAllTweets = get_all_words(positive_cleaned_tokens_list)
        #print(FreqDist(wordsAllTweets).most_common(25))
    return tokensForModel
Example #28
def twitter_token():
    from nltk.corpus import twitter_samples
    from nltk.tag import pos_tag_sents

    tweets = twitter_samples.strings('positive_tweets.json')
    tweets_tokens = twitter_samples.tokenized('positive_tweets.json')

    tweets_tagged = pos_tag_sents(tweets_tokens)
    """
    JJ:Adjective
    singular nouns (NN)
    plural nouns (NNS)
    
    """
    JJ_count = 0
    NN_count = 0

    for tweet in tweets_tagged:
        for key, tag in tweet:
            #tag = pair[1]
            if tag == 'JJ':
                JJ_count += 1
            elif tag == 'NN':
                NN_count += 1

    print('Total number of adjectives = ', JJ_count)
    print('Total number of nouns = ', NN_count)
Example #29
def corpusreader_demo():
    """
    Use :module:`TwitterCorpusReader` to read a file of tweets, and print out

    * some full tweets in JSON format;
    * some raw strings from the tweets (i.e., the value of the `text` field); and
    * the result of tokenising the raw strings.

    """
    from nltk.corpus import twitter_samples as tweets

    print()
    print("Complete tweet documents")
    print(SPACER)
    for tweet in tweets.docs("tweets.20150430-223406.json")[:1]:
        print(json.dumps(tweet, indent=1, sort_keys=True))

    print()
    print("Raw tweet strings:")
    print(SPACER)
    for text in tweets.strings("tweets.20150430-223406.json")[:15]:
        print(text)

    print()
    print("Tokenized tweet strings:")
    print(SPACER)
    for toks in tweets.tokenized("tweets.20150430-223406.json")[:15]:
        print(toks)
Example #30
 def validate(self,classifier):
     """Test the accuracy of a given classifier against a test dataset with labels.
     Args:
         classifier: (Bayesian, DecisionTree, SVC, LinearSVC) for use in classifying data
     Returns:
         None
     """
     tweets = twitter_samples.fileids()
     pos_tweets = twitter_samples.tokenized(tweets[1])
     neg_tweets = twitter_samples.tokenized(tweets[0])
     # Hold out the last eighth of each corpus for testing; integer division keeps the slice index an int.
     pos_testing = pos_tweets[(len(pos_tweets) * 7 // 8):]
     neg_testing = neg_tweets[(len(neg_tweets) * 7 // 8):]
     pos_test = [(self.train_feats(f), 'positive') for f in pos_testing]
     neg_test = [(self.train_feats(f), 'negative') for f in neg_testing]
     testfeats = pos_test + neg_test
     print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testfeats))*100)
Example #31
def create_and_train():
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]
    
    positive_cleaned_tokens_list = tokenize('positive_tweets.json')
    negative_cleaned_tokens_list = tokenize('negative_tweets.json')

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)

    return classifier
Example #32
def corpusreader_demo():
    """
    Use `TwitterCorpusReader` to read a file of tweets, and print out

    * some full tweets in JSON format;
    * some raw strings from the tweets (i.e., the value of the `text` field); and
    * the result of tokenising the raw strings.

    """
    from nltk.corpus import twitter_samples as tweets

    print()
    print("Complete tweet documents")
    print(SPACER)
    for tweet in tweets.docs("tweets.20150430-223406.json")[:1]:
        print(json.dumps(tweet, indent=1, sort_keys=True))

    print()
    print("Raw tweet strings:")
    print(SPACER)
    for text in tweets.strings("tweets.20150430-223406.json")[:15]:
        print(text)

    print()
    print("Tokenized tweet strings:")
    print(SPACER)
    for toks in tweets.tokenized("tweets.20150430-223406.json")[:15]:
        print(toks)
Example #33
def word_delegation():
    positive = twitter_samples.tokenized('positive_tweets.json')
    negative = twitter_samples.tokenized('negative_tweets.json')

    tweets, all_words = [], []

    for tweet in positive:
        tweets.append((tweet, 'pos'))
        for word in tweet:
            all_words.append(word.lower())

    for tweet in negative:
        tweets.append((tweet, 'neg'))
        for word in tweet:
            all_words.append(word.lower())

    word_features = list(nltk.FreqDist(all_words).keys())[:1000]

    return tweets, word_features
Example #34
from collections import Counter
from nltk.corpus import twitter_samples, stopwords
from nltk import bigrams
from nltk import trigrams


tokenizedWordsFromTwitter = twitter_samples.tokenized('tweets.20150430-223406.json')
tokenizedWordsFromTwitter += twitter_samples.tokenized('negative_tweets.json')
tokenizedWordsFromTwitter += twitter_samples.tokenized('positive_tweets.json')
#tokenizedWordsFromTwitter = twitter_samples.tokenized('positive_tweets.json')

for tweetIndex in range(len(tokenizedWordsFromTwitter)):
    tokenizedWordsFromTwitter[tweetIndex] = [item for item in tokenizedWordsFromTwitter[tweetIndex] if item not in [".",",",":",";","!","?","(",")"]]
    tokenizedWordsFromTwitter[tweetIndex] = [item for item in tokenizedWordsFromTwitter[tweetIndex] if item not in stopwords.words('english')]

print("Number of Tweets: " + str(len(tokenizedWordsFromTwitter)))

twitter_unigrams = Counter({})
twitter_bigrams = Counter({})
twitter_trigrams = Counter({})

for i in tokenizedWordsFromTwitter:

    twitter_unigrams += Counter(i)

    individual_bigrams = bigrams(i)
    twitter_bigrams += Counter(individual_bigrams)

    individual_trigrams = trigrams(i)
    twitter_trigrams += Counter(individual_trigrams)
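
A small assumed follow-up (not in the original snippet) that inspects the n-gram counters built above:

# Print the ten most frequent unigrams, bigrams and trigrams across all tweets.
print(twitter_unigrams.most_common(10))
print(twitter_bigrams.most_common(10))
print(twitter_trigrams.most_common(10))
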
Example #35
    list_of_tokenized_words = [each_word for each_word in list_of_tokenized_words if each_word not in [",", ".", "..", "...", ":", "?", "!", "\'", "\"", "#", "-", "_", "(", ")"]]
    list_of_tokenized_words = [stemmer.stem(each_word) for each_word in list_of_tokenized_words]
    print(list_of_tokenized_words)
    return list_of_tokenized_words


def word_feats(words):
    return dict([(word, True) for word in get_words_from_sentence(words)])


def tokenize_sentence_into_list_of_words(sentence):
    tknzr = TweetTokenizer()
    list_of_tokenized_words = tknzr.tokenize(sentence)
    return list_of_tokenized_words

negative_tokenizedWordsFromTwitter = twitter_samples.tokenized('negative_tweets.json')
positive_tokenizedWordsFromTwitter = twitter_samples.tokenized('positive_tweets.json')

negfeats = [(word_feats(f), 'neg') for f in negative_tokenizedWordsFromTwitter]
posfeats = [(word_feats(f), 'pos') for f in positive_tokenizedWordsFromTwitter]

negcutoff = int(len(negfeats)*3/4)
poscutoff = int(len(posfeats)*3/4)

trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

classifier = NaiveBayesClassifier.train(trainfeats)
print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
print(classifier.show_most_informative_features())
Example #36
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag_sents

tweets = twitter_samples.strings('positive_tweets.json')
tweets_tokens = twitter_samples.tokenized('positive_tweets.json')

JJ_count = 0
NN_count = 0

tweets_tagged = pos_tag_sents(tweets_tokens)

for tweet in tweets_tagged:
    for pair in tweet:
        tag = pair[1]
        if tag == 'JJ':
            JJ_count += 1
        elif tag == 'NN':
            NN_count += 1

print('Total number of adjectives = ', JJ_count)
print('Total number of nouns = ', NN_count)
Example #37
import nltk
import plotly.plotly as py
import plotly.graph_objs as go
from tqdm import tqdm
from nltk.corpus import twitter_samples

# These are corpus files. One positive, one negative and one mixed
print(twitter_samples.fileids())

# Empty dictionaries to store the collected tags
posTags = {}
negTags = {}

# The corpora are already tokenized; POS tags are added below with nltk.pos_tag

# Load the positive tweets and load the POS tags into 'posTags'
tokenized = twitter_samples.tokenized('positive_tweets.json')
print "Loaded"
for toks in tqdm(tokenized):
	toks = nltk.pos_tag(toks)
	for word in toks:
		if word[1] in posTags:
			posTags[word[1]] += 1
		else:
			posTags[word[1]] = 1

# Load the negative tweets and load the POS tags into 'negTags'
tokenized = twitter_samples.tokenized('negative_tweets.json')
for toks in tqdm(tokenized):
	toks = nltk.pos_tag(toks)
	for word in toks:
		if word[1] in negTags: