def get_correctly_classified_tweets(self, tweets_and_sentiment):
    """
    Classifies the given set of tweets and returns the ones that were
    correctly classified.

    tweets_and_sentiment is an iterable of (tweet, sentimentvalues) pairs.
    The tweets are run through self.classify(), the expected targets are
    obtained from utils.make_subjectivity_targets(), and every tweet whose
    prediction equals its target is kept.

    Returns a pair of lists (correct_tweets, correct_sentimentvalues) that
    are aligned with each other. Both are empty when no pairs are given.
    """
    # Materialise first so a one-shot iterator can be checked for emptiness.
    pairs = list(tweets_and_sentiment)
    if not pairs:
        # zip(*[]) produces nothing to unpack; without tweets there is
        # nothing that could have been classified correctly.
        return [], []
    tweets, sentimentvalues = zip(*pairs)

    # classify() runs the same vectorise-and-predict steps this method used
    # to repeat inline, and stores sentimentvalues in
    # self.test_words_and_values as before.
    predictions = self.classify(tweets, sentimentvalues)

    # NOTE(review): the loop below pairs the returned tweets positionally
    # with predictions and sentimentvalues, so it presumes
    # make_subjectivity_targets keeps order and length - confirm.
    tweets, targets = utils.make_subjectivity_targets(tweets)

    # Keep the tweets where the target matches the prediction.
    correct_tweets = []
    correct_sentimentvalues = []
    for tweet, values, predicted, target in zip(tweets, sentimentvalues,
                                                predictions, targets):
        if predicted == target:
            correct_tweets.append(tweet)
            correct_sentimentvalues.append(values)
    return correct_tweets, correct_sentimentvalues
def set_feature_set(self, featureset, sentimentvalues):
    """
    Stores the chosen feature set name and extracts the matching features.

    For the 'SA' and 'PA' feature sets only text features are used:
    self.only_text_features is switched on and self.feature_set becomes an
    empty dict. For every other name, self.feature_set becomes a list with
    one features.get_feature_set() result per training tweet, each tweet
    being paired positionally with an entry of sentimentvalues.
    """
    self.featureset = featureset

    if featureset not in ('SA', 'PA'):
        # NOTE(review): only_text_features is not touched on this path, so
        # it keeps whatever value it had before - confirm that is intended.
        extracted = []
        for tweet, values in zip(self.train_tweets, sentimentvalues):
            extracted.append(
                features.get_feature_set(tweet, self.featureset, values))
        self.feature_set = extracted
        return

    # Text-only feature sets need no per-tweet feature dictionaries.
    self.only_text_features = True
    self.feature_set = {}
def classify(self, tweets, sentimentvalues=None):
    """
    Performs the classification process on list of tweets.

    The text of every tweet is transformed by self.vect and then by
    self.tfidf_transformer. Unless self.only_text_features is set, a
    feature dict per tweet (features.get_feature_set) is additionally
    transformed by self.dict_vectorizer and self.dict_transformer and
    stacked next to the text vector with sp.hstack.

    When sentimentvalues is given it is stored in
    self.test_words_and_values; otherwise the previously stored values are
    reused for the feature extraction.

    Returns whatever self.best_estimator.predict() yields for the combined
    vector.
    """
    # Identity test instead of `!= None`: array-like arguments evaluate an
    # inequality element-wise, which makes the truth test ambiguous.
    if sentimentvalues is not None:
        self.test_words_and_values = sentimentvalues

    count_vector = self.vect.transform([t.text for t in tweets])
    text_vector = self.tfidf_transformer.transform(count_vector)

    if self.only_text_features:
        combined_vector = text_vector
    else:
        # Each tweet is paired positionally with its stored sentiment values.
        feature_dicts = [
            features.get_feature_set(tweet, self.featureset, values)
            for tweet, values in zip(tweets, self.test_words_and_values)
        ]
        dict_vector = self.dict_vectorizer.transform(feature_dicts)
        weighted_dict_vector = self.dict_transformer.transform(dict_vector)
        combined_vector = sp.hstack([text_vector, weighted_dict_vector])

    return self.best_estimator.predict(combined_vector)