def __init__(self, tweet):
    """Create the feature pipeline and train the linear SVM.

    Args:
        tweet: the training data, forwarded unchanged to ``self.train``.
            NOTE(review): sibling constructors name this parameter
            ``trainset``; presumably a list of (tweet_tokens, label)
            pairs -- confirm against ``train``.
    """
    print('Loading training modules')
    self.bag_of_words = []
    self.vectorizer = DictVectorizer(dtype=int, sparse=True)
    self.encoder = LabelEncoder()
    self.lexicon_classifier = LexiconClassifier()
    self.classifier = LinearSVC(C=0.005)
    # The previous code passed the undefined name ``trainset`` here, which
    # raised NameError on every construction; train on the argument instead.
    self.train(tweet)
def __init__(self, tweets=[]):
    """Load the cached ML model for the configured classifier, or train one.

    The cache file is named ``<var.model_classifier>-model_python<N>.pkl``
    where ``N`` is the major version of the running interpreter. When the
    file is missing, a new ``MachineLearningClassifier`` is trained on
    ``tweets`` and pickled under that name.

    Args:
        tweets: iterable of (tweet_message, label) pairs used only when no
            cached model exists. The default list is never mutated here.
    """
    # initialize internal variables
    self.rules_classifier = RulesClassifier()
    self.lexicon_classifier = LexiconClassifier()
    self.ml_classifier = None

    # One file name serves both loading and saving, so the two can never
    # disagree about which interpreter version a pickle belongs to.
    python_version = sys.version_info[0]
    model_name = str(var.model_classifier) + '-model_python' + str(
        python_version) + '.pkl'

    # if the ML model has already been generated, load it from the pickle
    if os.path.exists(model_name):
        print('Reading the ' + str(var.model_classifier) +
              ' model from model_python' + str(python_version) + '.pkl')
        # NOTE: unpickling executes arbitrary code; this file must only ever
        # be one written by this program.
        with open(model_name, 'rb') as model_file:
            self.ml_classifier = pickle.load(model_file)

    if self.ml_classifier is None:
        # Preprocess the data and train a new model
        print('Preprocessing the training data')
        tweet_messages = [tweet_message for tweet_message, label in tweets]
        tweet_labels = [label for tweet_message, label in tweets]

        # preprocess all the messages (tokenization, POS and normalization)
        tweet_tokens = pre_process(tweet_messages)

        # pair every token list with its label (positive, negative or
        # neutral); indexing keeps the IndexError if pre_process returns
        # fewer entries than there are labels
        trainset = [(tweet_tokens[i], label)
                    for i, label in enumerate(tweet_labels)]

        # initialize the classifier and train it
        classifier = MachineLearningClassifier(trainset)

        # dump the model into the pickle; the context manager guarantees the
        # file is flushed and closed
        print('Saving the trained model at ' + model_name)
        with open(model_name, 'wb') as model_file:
            pickle.dump(classifier, model_file)
        self.ml_classifier = classifier
def __init__(self, trainset=[]):
    """Prepare the vectorizer, label encoder, lexicon and SVM, then train.

    Args:
        trainset: training data handed straight to ``self.train``.
    """
    print('Loading training modules')
    # Vocabulary starts out empty.
    self.bag_of_words = []
    self.vectorizer, self.encoder = (
        DictVectorizer(dtype=int, sparse=True),
        LabelEncoder(),
    )
    self.lexicon_classifier = LexiconClassifier()
    svm = LinearSVC(C=0.005)
    self.classifier = svm
    self.train(trainset)
def __init__(self, trainset=[]):
    """Build the estimator named by ``var.model_classifier`` and train it.

    Recognised names are ``"svm"``, ``"randomForest"``, ``"naive"``,
    ``"lreg"`` and ``"sgd"``.

    Args:
        trainset: training data handed straight to ``self.train``.

    Raises:
        ValueError: if ``var.model_classifier`` is not a recognised name.
            Previously this left ``self.classifier`` unset and only failed
            later, inside ``train``, with an AttributeError.
    """
    print('Loading training modules')
    self.bag_of_words = []
    self.vectorizer = DictVectorizer(dtype=int, sparse=True)
    self.encoder = LabelEncoder()
    self.lexicon_classifier = LexiconClassifier()

    # Factories rather than instances, so only the selected estimator is
    # ever constructed.
    estimator_factories = {
        "svm": lambda: LinearSVC(C=0.005),
        "randomForest": RandomForestClassifier,
        "naive": GaussianNB,
        "lreg": LogisticRegression,
        # NOTE(review): ``n_iter`` was removed from SGDClassifier in newer
        # scikit-learn releases in favour of ``max_iter``; kept as-is because
        # the installed version is not visible from here -- confirm.
        "sgd": lambda: SGDClassifier(penalty='elasticnet',
                                     alpha=0.001,
                                     l1_ratio=0.85,
                                     n_iter=1000),
    }
    factory = estimator_factories.get(var.model_classifier)
    if factory is None:
        raise ValueError('Unknown model_classifier: %r (expected one of %s)'
                         % (var.model_classifier,
                            ', '.join(sorted(estimator_factories))))
    self.classifier = factory()
    self.train(trainset)
def __init__(self, tweets=[]):
    """Load the cached ML model from disk, or train and cache a new one.

    The cache file is ``model_python<N>.pkl`` where ``N`` is the major
    version of the running interpreter.

    Args:
        tweets: iterable of (tweet_message, label) pairs used only when no
            cached model exists. The default list is never mutated here.
    """
    # initialize internal variables
    self.rules_classifier = RulesClassifier()
    self.lexicon_classifier = LexiconClassifier()
    self.ml_classifier = None

    # One file name serves both loading and saving.
    model_name = 'model_python' + str(sys.version_info[0]) + '.pkl'

    # if the ML model has already been generated, load it from the pickle
    if os.path.exists(model_name):
        print('Reading the model from ' + model_name)
        # NOTE: unpickling executes arbitrary code; this file must only ever
        # be one written by this program.
        with open(model_name, 'rb') as model_file:
            self.ml_classifier = pickle.load(model_file)

    if self.ml_classifier is None:
        # Preprocess the data and train a new model
        print('Preprocessing the training data')
        tweet_messages = [tweet_message for tweet_message, label in tweets]
        tweet_labels = [label for tweet_message, label in tweets]

        # preprocess all the messages (tokenization, POS and normalization)
        tweet_tokens = pre_process(tweet_messages)

        # pair every token list with its label (positive, negative or
        # neutral); indexing keeps the IndexError if pre_process returns
        # fewer entries than there are labels
        trainset = [(tweet_tokens[i], label)
                    for i, label in enumerate(tweet_labels)]

        # initialize the classifier and train it
        classifier = MachineLearningClassifier(trainset)

        # dump the model into the pickle; the context manager guarantees the
        # file is flushed and closed
        print('Saving the trained model at ' + model_name)
        with open(model_name, 'wb') as model_file:
            pickle.dump(classifier, model_file)
        self.ml_classifier = classifier
def extract_features(self, tweet_tokens):
    """Build the feature dictionary for one tokenised tweet.

    Args:
        tweet_tokens: sequence of (token, tag) pairs for a single tweet.

    Returns:
        dict mapping feature names to values:
          * ``has_<ngram>``: True for every n-gram of the tweet that is in
            ``self.bag_of_words`` (absent otherwise);
          * ``num_<TAG>``: count of each listed tag;
          * ``has_negator``: True when a negation word occurs (absent
            otherwise);
          * ``has_char_ngrams`` / ``has_punct_ngrams``: a lowercase letter,
            or ``!``/``?``, repeated three or more times in some token;
          * ``num_all_caps``: number of upper-case tokens of length >= 3;
          * ``pos_lexicon`` / ``neg_lexicon``: scores returned by
            ``self.lexicon_classifier.classify`` (negative one negated).
    """
    if not self.bag_of_words:
        print('Bag-of-Words empty!')

    unigrams = [w.lower() for w, t in tweet_tokens]
    word_trigrams = list(trigrams(unigrams))

    # Start from a COPY of the unigrams. The previous code aliased the two
    # lists, so every ``+=`` also grew ``unigrams`` and the trigrams and
    # skip-grams were then computed over a list already containing bigram
    # strings.
    tokens = list(unigrams)
    tokens += ['_'.join(b) for b in bigrams(unigrams)]
    tokens += ['_'.join(t) for t in word_trigrams]
    tokens += [t1 + '_*_' + t3 for t1, t2, t3 in word_trigrams]
    token_set = set(tokens)

    tweet_tags = [tag for token, tag in tweet_tokens]
    feature_set = {}

    # 1st set of features: bag-of-words
    for token in token_set.intersection(self.bag_of_words):
        feature_set['has_' + token] = True

    # 2nd set of features: the count for each tag type present in the
    # message. Tweet_nlp tagset info:
    # http://www.ark.cs.cmu.edu/TweetNLP/annot_guidelines.pdf
    for tag in [
            'CC', 'CD', 'DT', 'EX', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
            'NN', 'NNP', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR',
            'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP',
            'VBZ', 'WDT', 'WP', 'WRB'
    ]:
        feature_set['num_' + tag] = tweet_tags.count(tag)

    # 3rd feature: negation is present?
    # NOTE(review): a fresh LexiconClassifier is built on every call;
    # self.lexicon_classifier could probably be reused -- confirm that
    # read_negation_words() does not depend on fresh state.
    negators = set(LexiconClassifier().read_negation_words())
    if negators.intersection(token_set):
        feature_set['has_negator'] = True

    # 4th feature: character ngrams (same lowercase letter 3+ times)
    regexp = re.compile(r"([a-z])\1{2,}")
    feature_set['has_char_ngrams'] = any(
        regexp.search(token) for token, tag in tweet_tokens)

    # 5th feature: punctuation ngrams (same ! or ? 3+ times)
    regexp = re.compile(r"([!\?])\1{2,}")
    feature_set['has_punct_ngrams'] = any(
        regexp.search(token) for token, tag in tweet_tokens)

    # 6th feature: the number of all upper cased words
    feature_set['num_all_caps'] = sum(
        1 for token, tag in tweet_tokens
        if token.isupper() and len(token) >= 3)

    # 7th and 8th feature: the positive and negative score from the lexicon
    # classifier
    positive_score, negative_score = self.lexicon_classifier.classify(
        tweet_tokens)
    feature_set['pos_lexicon'] = positive_score
    feature_set['neg_lexicon'] = -1 * negative_score

    return feature_set
class MachineLearningClassifier(object):
    """Linear-SVM sentiment classifier over n-gram, tag and lexicon features.

    Feature dictionaries from ``extract_features`` are vectorised with a
    ``DictVectorizer``; labels are encoded with a ``LabelEncoder``.
    """

    # Tags counted by the 2nd feature group. Info:
    # http://www.ark.cs.cmu.edu/TweetNLP/annot_guidelines.pdf
    _COUNTED_TAGS = (
        'CC', 'CD', 'DT', 'EX', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN',
        'NNP', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP',
        'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP',
        'WRB',
    )

    # Constructor
    def __init__(self, tweet):
        """Create the pipeline and train it.

        Args:
            tweet: list of (tweet_tokens, label) pairs, passed to ``train``.
        """
        print('Loading training modules')
        self.bag_of_words = []
        self.vectorizer = DictVectorizer(dtype=int, sparse=True)
        self.encoder = LabelEncoder()
        self.lexicon_classifier = LexiconClassifier()
        self.classifier = LinearSVC(C=0.005)
        # The previous code passed the undefined name ``trainset`` here,
        # which raised NameError on every construction.
        self.train(tweet)

    @staticmethod
    def _ngram_tokens(tweet_tokens):
        """Return unigrams, bigrams, trigrams and skip-grams of one tweet."""
        unigrams = [w.lower() for w, t in tweet_tokens]
        word_trigrams = list(trigrams(unigrams))
        # Copy the unigrams so extending the result never feeds back into
        # the n-gram computation.
        tokens = list(unigrams)
        tokens += ['_'.join(b) for b in bigrams(unigrams)]
        tokens += ['_'.join(t) for t in word_trigrams]
        tokens += [t1 + '_*_' + t3 for t1, t2, t3 in word_trigrams]
        return tokens

    # Extract features for ML process
    # Some insights from http://aclweb.org/anthology/S/S13/S13-2053.pdf
    def extract_features(self, tweet_tokens):
        """Build the feature dictionary for one tokenised tweet.

        Args:
            tweet_tokens: sequence of (token, tag) pairs for a single tweet.

        Returns:
            dict of feature name -> value (``has_<ngram>``, ``num_<TAG>``,
            ``has_negator``, ``has_char_ngrams``, ``has_punct_ngrams``,
            ``num_all_caps``, ``pos_lexicon``, ``neg_lexicon``).
        """
        if not self.bag_of_words:
            print('Bag-of-Words empty!')

        # The previous code aliased ``tokens`` to ``unigrams``, so trigrams
        # and skip-grams were computed over a list that already contained
        # bigram strings; the shared helper builds them from the unigrams.
        token_set = set(self._ngram_tokens(tweet_tokens))
        tweet_tags = [tag for token, tag in tweet_tokens]
        feature_set = {}

        # 1st set of features: bag-of-words
        for token in token_set.intersection(self.bag_of_words):
            feature_set['has_' + token] = True

        # 2nd set of features: the count for each tag type in the message
        for tag in self._COUNTED_TAGS:
            feature_set['num_' + tag] = tweet_tags.count(tag)

        # 3rd feature: negation is present?
        # NOTE(review): a fresh LexiconClassifier is built on every call;
        # self.lexicon_classifier could probably be reused -- confirm.
        negators = set(LexiconClassifier().read_negation_words())
        if negators.intersection(token_set):
            feature_set['has_negator'] = True

        # 4th feature: character ngrams (same lowercase letter 3+ times)
        regexp = re.compile(r"([a-z])\1{2,}")
        feature_set['has_char_ngrams'] = any(
            regexp.search(token) for token, tag in tweet_tokens)

        # 5th feature: punctuation ngrams (same ! or ? 3+ times)
        regexp = re.compile(r"([!\?])\1{2,}")
        feature_set['has_punct_ngrams'] = any(
            regexp.search(token) for token, tag in tweet_tokens)

        # 6th feature: the number of all upper cased words
        feature_set['num_all_caps'] = sum(
            1 for token, tag in tweet_tokens
            if token.isupper() and len(token) >= 3)

        # 7th and 8th feature: the positive and negative score from the
        # lexicon classifier
        positive_score, negative_score = self.lexicon_classifier.classify(
            tweet_tokens)
        feature_set['pos_lexicon'] = positive_score
        feature_set['neg_lexicon'] = -1 * negative_score

        return feature_set

    # train the classifier
    def train(self, tweets):
        """Build the n-gram vocabulary and fit the classifier.

        Args:
            tweets: list of (tweet_tokens, label) pairs, where
                ``tweet_tokens`` is a sequence of (token, tag) pairs.
        """
        # 1st step: build the bag-of-words model from every tweet's n-grams
        print('Computing the trainset vocabulary of n-grams')
        vocabulary = set()
        for tweet_tokens, label in tweets:
            vocabulary.update(self._ngram_tokens(tweet_tokens))
        self.bag_of_words = vocabulary

        total_tweets = len(tweets)
        features_list = list()
        for index, (tweet_tokens, label) in enumerate(tweets):
            print('Training for tweet n. {}/{}'.format(index + 1,
                                                       total_tweets))
            features_list.append(self.extract_features(tweet_tokens))

        # Train the classifier
        print('Vectorizing the features')
        data = self.vectorizer.fit_transform(features_list)
        target = self.encoder.fit_transform(
            [label for tweet_tokens, label in tweets])
        print('Builing the model')
        self.classifier.fit(data, target)

    # classify a new message
    def classify(self, tweet_tokens):
        """Return ``{class_label: decision score}`` for one tokenised tweet.

        Same result as ``decision_function``; kept as a separate entry
        point for existing callers.
        """
        return self.decision_function(tweet_tokens)

    # return the decision score for each of the classes
    def decision_function(self, tweet_tokens):
        """Return ``{class_label: decision score}`` for one tokenised tweet.

        The values come from the classifier's ``decision_function``, keyed
        by the labels stored in ``self.encoder.classes_``.
        """
        data = self.vectorizer.transform(self.extract_features(tweet_tokens))
        probs = self.classifier.decision_function(data)
        classes = self.encoder.classes_
        return {classes.item(i): probs.item(i) for i in range(len(classes))}
class TwitterHybridClassifier(object):
    """Sentiment classifier combining rules, a lexicon and an ML model.

    ``classify`` runs the full cascade (rules, then lexicon, then ML);
    ``classify_batch`` uses only the ML model and picks the decision rule
    from ``var.model_classifier``.
    """

    def __init__(self, tweets=[]):
        """Load the cached ML model for the configured classifier, or train.

        The cache file is ``<var.model_classifier>-model_python<N>.pkl``,
        where ``N`` is the major version of the running interpreter.

        Args:
            tweets: iterable of (tweet_message, label) pairs used only when
                no cached model exists. The default list is never mutated.
        """
        # initialize internal variables
        self.rules_classifier = RulesClassifier()
        self.lexicon_classifier = LexiconClassifier()
        self.ml_classifier = None

        # One file name serves both loading and saving.
        python_version = sys.version_info[0]
        model_name = str(var.model_classifier) + '-model_python' + str(
            python_version) + '.pkl'

        # if the ML model has already been generated, load it
        if os.path.exists(model_name):
            print('Reading the ' + str(var.model_classifier) +
                  ' model from model_python' + str(python_version) + '.pkl')
            # NOTE: unpickling executes arbitrary code; this file must only
            # ever be one written by this program.
            with open(model_name, 'rb') as model_file:
                self.ml_classifier = pickle.load(model_file)

        if self.ml_classifier is None:
            # Preprocess the data and train a new model
            print('Preprocessing the training data')
            tweet_messages = [message for message, label in tweets]
            tweet_labels = [label for message, label in tweets]

            # preprocess all the messages (tokenization, POS, normalization)
            tweet_tokens = pre_process(tweet_messages)

            # pair every token list with its label; indexing keeps the
            # IndexError if pre_process returns fewer entries than labels
            trainset = [(tweet_tokens[i], label)
                        for i, label in enumerate(tweet_labels)]

            # initialize the classifier and train it
            classifier = MachineLearningClassifier(trainset)

            # dump the model into the pickle, closing the file afterwards
            print('Saving the trained model at ' + model_name)
            with open(model_name, 'wb') as model_file:
                pickle.dump(classifier, model_file)
            self.ml_classifier = classifier

    @staticmethod
    def _svm_sentiment(positive_conf, negative_conf, neutral_conf):
        """Apply the SVM decision thresholds to the three scores."""
        if negative_conf >= -0.4:
            return ('negative', 'ML')
        if positive_conf > neutral_conf:
            return ('positive', 'ML')
        return ('neutral', 'ML')

    @staticmethod
    def _highest_confidence(positive_conf, negative_conf, neutral_conf):
        """Pick the class with the strictly highest score, with tie rules.

        On ties: positive if it ties with neutral, else negative if it ties
        with neutral, else neutral.
        """
        if positive_conf > negative_conf and positive_conf > neutral_conf:
            return ('positive', 'ML')
        if negative_conf > positive_conf and negative_conf > neutral_conf:
            return ('negative', 'ML')
        if neutral_conf > positive_conf and neutral_conf > negative_conf:
            return ('neutral', 'ML')
        if positive_conf == neutral_conf:
            return ('positive', 'ML')
        if negative_conf == neutral_conf:
            return ('negative', 'ML')
        return ('neutral', 'ML')

    # Apply the classifier over a tweet message in String format
    def classify(self, tweet_text):
        """Classify one tweet with the rules -> lexicon -> ML cascade.

        Args:
            tweet_text: the raw tweet message.

        Returns:
            list with one ``(sentiment, source)`` tuple per entry returned by
            ``pre_process``; ``source`` is ``'RB'`` (rules), ``'LB'``
            (lexicon) or ``'ML'``.
        """
        # 0. Pre-process the tweet (tokenization, tagger, normalizations)
        print('Preprocessing the string')
        tweet_tokens_list = pre_process([tweet_text])

        predictions = []
        for tweet_tokens in tweet_tokens_list:
            # 1. Rule-based classifier. Look for emoticons basically.
            # If a rule fires, the tweet is classified here.
            positive_score, negative_score = self.rules_classifier.classify(
                tweet_tokens)
            if positive_score >= 1 and negative_score == 0:
                predictions.append(('positive', 'RB'))
                continue
            elif positive_score == 0 and negative_score <= -1:
                predictions.append(('negative', 'RB'))
                continue

            # 2. Lexicon-based classifier. If the scores are within the
            # thresholds, the tweet is classified here.
            positive_score, negative_score = (
                self.lexicon_classifier.classify(tweet_tokens))
            if positive_score >= 1 and negative_score == 0:
                predictions.append(('positive', 'LB'))
                continue
            elif negative_score <= -2:
                predictions.append(('negative', 'LB'))
                continue

            # 3. Machine learning based classifier.
            # NOTE(review): the SVM thresholds are applied here regardless
            # of var.model_classifier -- confirm this is intended.
            result = self.ml_classifier.classify(tweet_tokens)
            predictions.append(
                self._svm_sentiment(result['positive'], result['negative'],
                                    result['neutral']))

        return predictions

    # Apply the classifier in batch over a list of tweet messages
    def classify_batch(self, tweet_texts):
        """Classify many tweets using only the ML model.

        The rule-based and lexicon-based stages are deliberately bypassed
        here (the original author disabled them so that every message is
        classified by the ML method). The raw scores of every tweet are
        appended to ``<var.model_classifier>_test_results.txt``.

        Args:
            tweet_texts: list of raw tweet messages.

        Returns:
            list of predictions, one per tweet: ``(sentiment, 'ML')``
            tuples, or ``""`` for the ``"naive"`` model. Empty list for
            empty input.

        Raises:
            ValueError: if ``var.model_classifier`` is not a known model
                (previously an UnboundLocalError on the first tweet).
        """
        # 0. Pre-process the tweets (tokenization, tagger, normalizations)
        if len(tweet_texts) == 0:
            return []

        print('Preprocessing the test data')
        tweet_tokens_list = pre_process(tweet_texts)

        predictions = []
        total_tweets = len(tweet_tokens_list)
        line_save = []

        for index, tweet_tokens in enumerate(tweet_tokens_list):
            print('Testing for tweet n. {}/{}'.format(index + 1,
                                                      total_tweets))

            # 3. Machine learning based classifier
            result = self.ml_classifier.classify(tweet_tokens)
            positive_conf = result['positive']
            negative_conf = result['negative']
            neutral_conf = result['neutral']

            line_save.append(
                str(positive_conf) + '\t' + str(negative_conf) + '\t' +
                str(neutral_conf))

            if var.model_classifier == "svm":
                sentiment = self._svm_sentiment(positive_conf, negative_conf,
                                                neutral_conf)
            elif var.model_classifier in ("randomForest", "lreg", "sgd"):
                # "lreg" and "sgd" previously had no tie handling, so a tie
                # silently reused the previous tweet's sentiment (or crashed
                # on the first tweet); they now share the randomForest rules.
                sentiment = self._highest_confidence(
                    positive_conf, negative_conf, neutral_conf)
            elif var.model_classifier == "naive":
                # No per-tweet decision is made for the naive model here.
                sentiment = ""
            else:
                raise ValueError('Unknown model_classifier: %r' %
                                 (var.model_classifier,))

            predictions.append(sentiment)

        results_path = str(var.model_classifier) + '_test_results.txt'
        print('Saving the predictions values of ' +
              str(var.model_classifier) + ' on file ' + results_path)

        with open(results_path, 'a') as fr:
            for ii, pred in enumerate(line_save):
                # NOTE(review): the [2:-2] slice is applied to the string
                # for randomForest but to the element itself for svm/naive;
                # left exactly as found -- confirm against the shape of the
                # var.*_predicts data.
                if var.model_classifier == "randomForest":
                    fr.write(pred + '\t' + str(var.rf_predicts[ii])[2:-2] +
                             '\n')
                elif var.model_classifier == "svm":
                    fr.write(pred + '\t' + str(var.svm_predicts[ii][2:-2]) +
                             '\n')
                elif var.model_classifier == "naive":
                    fr.write(pred + '\t' +
                             str(var.naive_predicts[ii][2:-2]) + '\n')
                elif var.model_classifier == "lreg":
                    fr.write(pred + '\t' + str(var.lreg_predicts[ii]) + '\n')
                elif var.model_classifier == "sgd":
                    fr.write(pred + '\t' + str(var.sgd_predicts[ii]) + '\n')

        return predictions

    # Output individual scores for each method
    def output_individual_scores(self, tweets):
        """Write every stage's scores per tweet to ``individual_scores.tab``.

        Args:
            tweets: iterable of (tweet_message, label) pairs.
        """
        tweet_texts = [tweet_message for tweet_message, label in tweets]
        tweet_labels = [label for tweet_message, label in tweets]

        # 0. Pre-process the text (emoticons, misspellings, tagger)
        tweet_tokens_list = pre_process(tweet_texts)

        # write the log; the context manager closes (and flushes) the file,
        # which the previous code never did
        with codecs.open('individual_scores.tab', 'w',
                         encoding='utf8') as fp:
            fp.write('pos_score_rule\tneg_score_rule\tpos_score_lex\tneg_score_lex\tpos_conf\tneg_conf\tneutral_conf\tclass\tmessage\n')

            for index, tweet_tokens in enumerate(tweet_tokens_list):
                line = ''

                # 1. Rule-based classifier. Look for emoticons basically
                positive_score, negative_score = (
                    self.rules_classifier.classify(tweet_tokens))
                line += str(positive_score) + '\t' + str(
                    negative_score) + '\t'

                # 2. Lexicon-based classifier
                positive_score, negative_score = (
                    self.lexicon_classifier.classify(tweet_tokens))
                line += str(positive_score) + '\t' + str(
                    negative_score) + '\t'

                # 3. Machine learning based classifier
                result = self.ml_classifier.decision_function(tweet_tokens)
                line += str(result['positive']) + '\t' + str(
                    result['negative']) + '\t' + str(
                        result['neutral']) + '\t'

                # double quotes are stripped so the quoted column stays valid
                line += tweet_labels[index] + '\t"' + tweet_texts[
                    index].replace('"', '') + '"\n'

                fp.write(line)

        print('Indivual score saved in the file: individual_scores.tab')
class TwitterHybridClassifier(object):
    """Sentiment classifier combining rules, a lexicon and an ML model."""

    # Class-level list kept only for backward compatibility; no method of
    # this class reads or writes it (``classify`` uses a local list).
    predictions = []

    def __init__(self, tweets=[]):
        """Load the cached ML model from disk, or train and cache a new one.

        The cache file is ``model_python<N>.pkl`` where ``N`` is the major
        version of the running interpreter.

        Args:
            tweets: iterable of (tweet_message, label) pairs used only when
                no cached model exists. The default list is never mutated.
        """
        # initialize internal variables
        self.rules_classifier = RulesClassifier()
        self.lexicon_classifier = LexiconClassifier()
        self.ml_classifier = None

        # One file name serves both loading and saving.
        model_name = 'model_python' + str(sys.version_info[0]) + '.pkl'

        # if the ML model has already been generated, load it
        if os.path.exists(model_name):
            print('Reading the model from ' + model_name)
            # NOTE: unpickling executes arbitrary code; this file must only
            # ever be one written by this program.
            with open(model_name, 'rb') as model_file:
                self.ml_classifier = pickle.load(model_file)

        if self.ml_classifier is None:
            # Preprocess the data and train a new model
            print('Preprocessing the training data')
            tweet_messages = [message for message, label in tweets]
            tweet_labels = [label for message, label in tweets]

            # preprocess all the messages (tokenization, POS, normalization)
            tweet_tokens = pre_process(tweet_messages)

            # pair every token list with its label; indexing keeps the
            # IndexError if pre_process returns fewer entries than labels
            trainset = [(tweet_tokens[i], label)
                        for i, label in enumerate(tweet_labels)]

            # initialize the classifier and train it
            classifier = MachineLearningClassifier(trainset)

            # dump the model into the pickle, closing the file afterwards
            print('Saving the trained model at ' + model_name)
            with open(model_name, 'wb') as model_file:
                pickle.dump(classifier, model_file)
            self.ml_classifier = classifier

    # Apply the classifier over a tweet message in String format
    def classify(self, tweet_text):
        """Classify one tweet with the rules -> lexicon -> ML cascade.

        Fixes over the previous version:
          * ``pre_process`` returns one token list per message (see its use
            in ``__init__``), so the single tweet's tokens are taken from
            that list instead of passing the whole list to the classifiers;
          * each stage now stops the cascade once it has decided, as the
            stage comments always described;
          * the ML result is read with the key ``'neutral'`` (the old
            misspelt key could only match a label spelt the same wrong way);
          * a debug ``print`` of an always-empty list was removed.

        Args:
            tweet_text: the raw tweet message.

        Returns:
            list with one ``(sentiment, source)`` tuple per entry returned by
            ``pre_process``; ``source`` is ``'EB'`` (rules), ``'LB'``
            (lexicon) or ``'ML'``.
        """
        # 0. Pre-process the tweet (tokenization, tagger, normalizations)
        print('Preprocessing the string')
        tweet_tokens_list = pre_process([tweet_text])

        predictions = []
        for tweet_tokens in tweet_tokens_list:
            # 1. Rule-based classifier. Look for emoticons basically.
            # If a rule fires, the tweet is classified here.
            positive_score, negative_score = self.rules_classifier.classify(
                tweet_tokens)
            if positive_score >= 1 and negative_score == 0:
                predictions.append(('positive', 'EB'))
                continue
            elif positive_score == 0 and negative_score <= -1:
                predictions.append(('negative', 'EB'))
                continue

            # 2. Lexicon-based classifier
            positive_score, negative_score = (
                self.lexicon_classifier.classify(tweet_tokens))
            if positive_score >= 1 and negative_score == 0:
                predictions.append(('positive', 'LB'))
                continue
            elif negative_score <= -2:
                predictions.append(('negative', 'LB'))
                continue

            # 3. Machine learning based classifier
            result = self.ml_classifier.classify(tweet_tokens)
            positive_conf = result['positive']
            negative_conf = result['negative']
            neutral_conf = result['neutral']

            if negative_conf >= -0.4:
                sentiment = ('negative', 'ML')
            elif positive_conf > neutral_conf:
                sentiment = ('positive', 'ML')
            else:
                sentiment = ('neutral', 'ML')
            predictions.append(sentiment)

        return predictions
def __init__(self, trainset=[]):
    """Create the three stage classifiers, training the ML one right away.

    Args:
        trainset: training data handed to ``MachineLearningClassifier``.
    """
    # Built in the same order as before: rules, lexicon, then ML.
    rules, lexicon = RulesClassifier(), LexiconClassifier()
    learner = MachineLearningClassifier(trainset)

    self.rules_classifier = rules
    self.lexicon_classifier = lexicon
    self.ml_classifier = learner
class TwitterHybridClassifier(object):
    """Three-stage sentiment classifier: rules, then lexicon, then ML."""

    def __init__(self, trainset=[]):
        """Create the stage classifiers; the ML one is trained on ``trainset``."""
        rules, lexicon = RulesClassifier(), LexiconClassifier()
        learner = MachineLearningClassifier(trainset)
        self.rules_classifier = rules
        self.lexicon_classifier = lexicon
        self.ml_classifier = learner

    def classify(self, tweet_text):
        """Return 'positive', 'negative' or 'neutral' for one tweet message.

        Stages, each deciding only when it is confident:
          1. rules: any non-zero summed score decides by its sign;
          2. lexicon: a summed score of exactly 0 is neutral, >= 3 is
             positive, <= -3 is negative;
          3. ML: positive when its confidence is >= 0.3 and above the
             negative one, negative when that confidence is >= 0.3,
             neutral otherwise.
        """
        # Stage 0: pre-process the text (emoticons, misspellings, tagger)
        processed = pre_process(tweet_text)

        # Stage 1: rule-based classifier (emoticons, basically)
        rule_pos, rule_neg = self.rules_classifier.classify(processed)
        rules_total = rule_pos + rule_neg
        if rules_total > 0:
            return 'positive'
        if rules_total != 0:
            return 'negative'

        # Stage 2: lexicon-based classifier
        lex_pos, lex_neg = self.lexicon_classifier.classify(processed)
        lexicon_total = lex_pos + lex_neg
        for decided, label in ((lexicon_total == 0, 'neutral'),
                               (lexicon_total >= 3, 'positive'),
                               (lexicon_total <= -3, 'negative')):
            if decided:
                return label

        # Stage 3: machine-learning classifier; all three confidences are
        # read up front, exactly as before, even though only two are used
        scores = self.ml_classifier.classify(processed)
        positive_conf = scores[0][1]
        negative_conf = scores[1][1]
        _neutral_conf = scores[2][1]

        if positive_conf >= 0.3 and negative_conf < positive_conf:
            return 'positive'
        return 'negative' if negative_conf >= 0.3 else 'neutral'
class TwitterHybridClassifier(object):
    """Sentiment classifier combining rules, a lexicon and an ML model.

    Every tweet goes through a cascade: the rule-based stage decides first,
    then the lexicon stage, and the ML model decides whatever is left.
    """

    def __init__(self, tweets=[]):
        """Load the cached ML model from disk, or train and cache a new one.

        The cache file is ``model_python<N>.pkl`` where ``N`` is the major
        version of the running interpreter.

        Args:
            tweets: iterable of (tweet_message, label) pairs used only when
                no cached model exists. The default list is never mutated.
        """
        # initialize internal variables
        self.rules_classifier = RulesClassifier()
        self.lexicon_classifier = LexiconClassifier()
        self.ml_classifier = None

        # One file name serves both loading and saving.
        model_name = 'model_python' + str(sys.version_info[0]) + '.pkl'

        # if the ML model has already been generated, load it
        if os.path.exists(model_name):
            print('Reading the model from ' + model_name)
            # NOTE: unpickling executes arbitrary code; this file must only
            # ever be one written by this program.
            with open(model_name, 'rb') as model_file:
                self.ml_classifier = pickle.load(model_file)

        if self.ml_classifier is None:
            # Preprocess the data and train a new model
            print('Preprocessing the training data')
            tweet_messages = [message for message, label in tweets]
            tweet_labels = [label for message, label in tweets]

            # preprocess all the messages (tokenization, POS, normalization)
            tweet_tokens = pre_process(tweet_messages)

            # pair every token list with its label; indexing keeps the
            # IndexError if pre_process returns fewer entries than labels
            trainset = [(tweet_tokens[i], label)
                        for i, label in enumerate(tweet_labels)]

            # initialize the classifier and train it
            classifier = MachineLearningClassifier(trainset)

            # dump the model into the pickle, closing the file afterwards
            print('Saving the trained model at ' + model_name)
            with open(model_name, 'wb') as model_file:
                pickle.dump(classifier, model_file)
            self.ml_classifier = classifier

    def _classify_tokens(self, tweet_tokens):
        """Run the rules -> lexicon -> ML cascade on one tokenised tweet.

        Returns:
            ``(sentiment, source)`` where ``source`` is ``'RB'`` (rules),
            ``'LB'`` (lexicon) or ``'ML'``.
        """
        # 1. Rule-based classifier. Look for emoticons basically.
        # If a rule fires, the tweet is classified here.
        positive_score, negative_score = self.rules_classifier.classify(
            tweet_tokens)
        if positive_score >= 1 and negative_score == 0:
            return ('positive', 'RB')
        if positive_score == 0 and negative_score <= -1:
            return ('negative', 'RB')

        # 2. Lexicon-based classifier. If the scores are within the
        # thresholds, the tweet is classified here.
        positive_score, negative_score = self.lexicon_classifier.classify(
            tweet_tokens)
        if positive_score >= 1 and negative_score == 0:
            return ('positive', 'LB')
        if negative_score <= -2:
            return ('negative', 'LB')

        # 3. Machine learning based classifier
        result = self.ml_classifier.classify(tweet_tokens)
        positive_conf = result['positive']
        negative_conf = result['negative']
        neutral_conf = result['neutral']

        if negative_conf >= -0.4:
            return ('negative', 'ML')
        if positive_conf > neutral_conf:
            return ('positive', 'ML')
        return ('neutral', 'ML')

    # Apply the classifier over a tweet message in String format
    def classify(self, tweet_text):
        """Classify one tweet message.

        Args:
            tweet_text: the raw tweet message.

        Returns:
            list with one ``(sentiment, source)`` tuple per entry returned by
            ``pre_process``.
        """
        # 0. Pre-process the tweet (tokenization, tagger, normalizations)
        print('Preprocessing the string')
        tweet_tokens_list = pre_process([tweet_text])

        return [self._classify_tokens(tweet_tokens)
                for tweet_tokens in tweet_tokens_list]

    # Apply the classifier in batch over a list of tweet messages
    def classify_batch(self, tweet_texts):
        """Classify a list of tweet messages, printing progress per tweet.

        Args:
            tweet_texts: list of raw tweet messages.

        Returns:
            list of ``(sentiment, source)`` tuples, one per pre-processed
            tweet; an empty list for empty input.
        """
        # 0. Pre-process the tweets (tokenization, tagger, normalizations)
        if len(tweet_texts) == 0:
            return []

        print('Preprocessing the test data')
        tweet_tokens_list = pre_process(tweet_texts)

        predictions = []
        total_tweets = len(tweet_tokens_list)

        for index, tweet_tokens in enumerate(tweet_tokens_list):
            print('Testing for tweet n. {}/{}'.format(index + 1,
                                                      total_tweets))
            predictions.append(self._classify_tokens(tweet_tokens))

        return predictions

    # Output individual scores for each method
    def output_individual_scores(self, tweets):
        """Write every stage's scores per tweet to ``individual_scores.tab``.

        Args:
            tweets: iterable of (tweet_message, label) pairs.
        """
        tweet_texts = [tweet_message for tweet_message, label in tweets]
        tweet_labels = [label for tweet_message, label in tweets]

        # 0. Pre-process the text (emoticons, misspellings, tagger)
        tweet_tokens_list = pre_process(tweet_texts)

        # write the log; the context manager closes (and flushes) the file,
        # which the previous code never did
        with codecs.open('individual_scores.tab', 'w',
                         encoding='utf8') as fp:
            fp.write('pos_score_rule\tneg_score_rule\tpos_score_lex\tneg_score_lex\tpos_conf\tneg_conf\tneutral_conf\tclass\tmessage\n')

            for index, tweet_tokens in enumerate(tweet_tokens_list):
                line = ''

                # 1. Rule-based classifier. Look for emoticons basically
                positive_score, negative_score = (
                    self.rules_classifier.classify(tweet_tokens))
                line += str(positive_score) + '\t' + str(
                    negative_score) + '\t'

                # 2. Lexicon-based classifier
                positive_score, negative_score = (
                    self.lexicon_classifier.classify(tweet_tokens))
                line += str(positive_score) + '\t' + str(
                    negative_score) + '\t'

                # 3. Machine learning based classifier
                result = self.ml_classifier.decision_function(tweet_tokens)
                line += str(result['positive']) + '\t' + str(
                    result['negative']) + '\t' + str(
                        result['neutral']) + '\t'

                # double quotes are stripped so the quoted column stays valid
                line += tweet_labels[index] + '\t"' + tweet_texts[
                    index].replace('"', '') + '"\n'

                fp.write(line)

        print('Indivual score saved in the file: individual_scores.tab')
class MachineLearningClassifier(object):
    """Sentiment classifier feeding hand-built tweet features to a LinearSVC.

    A tweet is a sequence of ``(token, tag)`` pairs. Feature dictionaries are
    vectorized with a ``DictVectorizer`` and labels are encoded with a
    ``LabelEncoder``. The model is trained as soon as the object is built.
    """

    # Tags counted as features. Tweet NLP tagger info:
    # http://www.ark.cs.cmu.edu/TweetNLP/annot_guidelines.pdf
    _POS_TAGS = ('N', 'O', '^', 'S', 'Z', 'V', 'A', 'R', '!', 'D', 'P', '&',
                 'T', 'X', '#', '@', '~', 'U', 'E', '$', ',', 'G', 'L', 'M',
                 'Y')
    # A lower-case letter repeated three or more times in a row ("sooo").
    _CHAR_RUN = re.compile(r"([a-z])\1{2,}")
    # "!" or "?" repeated three or more times in a row ("!!!").
    _PUNCT_RUN = re.compile(r"([!\?])\1{2,}")

    def __init__(self, trainset=None):
        """Create the vectorizer, encoder and model, then train immediately.

        Args:
            trainset: sequence of ``(tweet_tokens, label)`` pairs, see
                ``train``. Omitting it is the same as passing an empty list.
        """
        print ('Loading training modules')
        self.bag_of_words = []
        self.vectorizer = DictVectorizer(dtype=int, sparse=True)
        self.encoder = LabelEncoder()
        self.lexicon_classifier = LexiconClassifier()
        self.classifier = LinearSVC(C=0.005)
        self.train([] if trainset is None else trainset)

    @staticmethod
    def _ngram_tokens(unigrams):
        """Return unigrams plus joined bigrams, trigrams and skip-trigrams.

        A new list is built so ``unigrams`` itself is never extended; every
        n-gram is therefore computed over the plain unigram sequence.
        """
        tokens = list(unigrams)
        tokens += ['_'.join(b) for b in bigrams(unigrams)]
        tokens += ['_'.join(t) for t in trigrams(unigrams)]
        tokens += [t1 + '_*_' + t3 for t1, t2, t3 in trigrams(unigrams)]
        return tokens

    @classmethod
    def _tag_counts(cls, tweet_tags):
        """Return ``{'num_<tag>': count}`` for every tag in ``_POS_TAGS``."""
        return {
            'num_' + tag: sum(1 for t in tweet_tags if t == tag)
            for tag in cls._POS_TAGS
        }

    @classmethod
    def _style_features(cls, tweet_tokens):
        """Return the elongation, repeated-punctuation and all-caps features."""
        return {
            'has_char_ngrams': any(
                cls._CHAR_RUN.search(token) for token, tag in tweet_tokens),
            'has_punct_ngrams': any(
                cls._PUNCT_RUN.search(token) for token, tag in tweet_tokens),
            # only words of three or more characters count as "shouted"
            'num_all_caps': sum(
                1 for token, tag in tweet_tokens
                if token.isupper() and len(token) >= 3),
        }

    # Extract features for ML process
    # Some insights from http://aclweb.org/anthology/S/S13/S13-2053.pdf
    def extract_features(self, tweet_tokens):
        """Build the feature dictionary for one tweet.

        Args:
            tweet_tokens: re-iterable sequence of ``(token, tag)`` pairs.

        Returns:
            dict mapping feature names to values: ``has_<ngram>`` flags for
            n-grams present in ``self.bag_of_words``, ``num_<tag>`` counts,
            ``has_negator`` (only set when true), ``has_char_ngrams``,
            ``has_punct_ngrams``, ``num_all_caps``, ``pos_lexicon`` and
            ``neg_lexicon``.
        """
        if len(self.bag_of_words) == 0:
            print('Bag-of-Words empty!')

        unigrams = [w.lower() for w, t in tweet_tokens]
        # Same expansion as in train(), so the tokens line up with the
        # vocabulary stored in bag_of_words.
        token_set = set(self._ngram_tokens(unigrams))
        tweet_tags = [tag for token, tag in tweet_tokens]

        feature_set = {}

        # 1st set of features: bag-of-words
        for token in token_set.intersection(self.bag_of_words):
            feature_set['has_' + token] = True

        # 2nd set of features: the count for each tag type in the message
        feature_set.update(self._tag_counts(tweet_tags))

        # 3rd feature: negation is present?
        # Reuses the lexicon classifier created in __init__ instead of
        # building a new LexiconClassifier for every tweet.
        # NOTE(review): presumes read_negation_words() does not depend on
        # per-instance state -- confirm against LexiconClassifier.
        negators = set(self.lexicon_classifier.read_negation_words())
        if negators.intersection(token_set):
            feature_set['has_negator'] = True

        # 4th, 5th and 6th features: character n-grams, punctuation n-grams
        # and the number of all upper cased words
        feature_set.update(self._style_features(tweet_tokens))

        # 7th and 8th feature: the positive and negative score from lexicon
        # classifier (i.e., number of positive and negative words from lexicon)
        positive_score, negative_score = self.lexicon_classifier.classify(
            tweet_tokens)
        feature_set['pos_lexicon'] = positive_score
        # negated before storing, as in the rest of the pipeline where the
        # negative score is compared against negative thresholds
        feature_set['neg_lexicon'] = -1 * negative_score

        return feature_set

    def train(self, tweets):
        """Build the n-gram vocabulary and fit the vectorizer, encoder, model.

        Args:
            tweets: sequence of ``(tweet_tokens, label)`` pairs, where
                ``tweet_tokens`` is a sequence of ``(token, tag)`` pairs.
        """
        # 1st step: build the bag-of-words model
        print('Computing the trainset vocabulary of n-grams')
        vocabulary = set()
        for tweet_tokens, label in tweets:
            unigrams = [w.lower() for w, t in tweet_tokens]
            vocabulary.update(self._ngram_tokens(unigrams))
        self.bag_of_words = vocabulary

        # 2nd step: extract the features of every training tweet
        total_tweets = len(tweets)
        features_list = []
        for index, (tweet_tokens, label) in enumerate(tweets):
            print('Training for tweet n. {}/{}'.format(index+1, total_tweets))
            features_list.append(self.extract_features(tweet_tokens))

        # 3rd step: train a SVM classifier
        print('Vectorizing the features')
        data = self.vectorizer.fit_transform(features_list)
        target = self.encoder.fit_transform(
            [label for tweet_tokens, label in tweets])

        print('Building the model')
        self.classifier.fit(data, target)

    def classify(self, tweet_tokens):
        """Score a new message; same result as ``decision_function``."""
        return self.decision_function(tweet_tokens)

    def decision_function(self, tweet_tokens):
        """Return the model's decision_function value for each class.

        Args:
            tweet_tokens: sequence of ``(token, tag)`` pairs.

        Returns:
            dict mapping each label in ``self.encoder.classes_`` to the value
            at the same position of the classifier's ``decision_function``
            output. These are decision scores, not probabilities.
        """
        data = self.vectorizer.transform(self.extract_features(tweet_tokens))
        scores = self.classifier.decision_function(data)
        classes = self.encoder.classes_
        # NOTE(review): indexes the flattened scores by class position, so it
        # presumes one score per class -- confirm for two-label training sets.
        return {classes.item(i): scores.item(i) for i in range(len(classes))}
class MachineLearningClassifier(object):
    """Sentiment classifier over hand-built tweet features.

    The underlying model is chosen by ``var.model_classifier`` ("svm",
    "randomForest", "naive", "lreg" or "sgd"). A tweet is a sequence of
    ``(token, tag)`` pairs. Feature dictionaries are vectorized with a
    ``DictVectorizer`` and labels are encoded with a ``LabelEncoder``. The
    model is trained as soon as the object is built.
    """

    # Tags counted as features. Tweet NLP tagger info:
    # http://www.ark.cs.cmu.edu/TweetNLP/annot_guidelines.pdf
    _POS_TAGS = ('N', 'O', '^', 'S', 'Z', 'V', 'A', 'R', '!', 'D', 'P', '&',
                 'T', 'X', '#', '@', '~', 'U', 'E', '$', ',', 'G', 'L', 'M',
                 'Y')
    # A lower-case letter repeated three or more times in a row ("sooo").
    _CHAR_RUN = re.compile(r"([a-z])\1{2,}")
    # "!" or "?" repeated three or more times in a row ("!!!").
    _PUNCT_RUN = re.compile(r"([!\?])\1{2,}")

    # model name -> attribute of ``var`` that collects its predictions
    _PREDICTION_LOGS = {
        "svm": "svm_predicts",
        "randomForest": "rf_predicts",
        "naive": "naive_predicts",
        "lreg": "lreg_predicts",
        "sgd": "sgd_predicts",
    }
    # models scored with decision_function(); the others use predict_proba()
    _MARGIN_MODELS = frozenset(["svm", "sgd"])
    # models whose predicted label is also printed by classify()
    _ECHO_MODELS = frozenset(["lreg", "sgd"])

    def __init__(self, trainset=None):
        """Create the vectorizer, encoder and model, then train immediately.

        Args:
            trainset: sequence of ``(tweet_tokens, label)`` pairs, see
                ``train``. Omitting it is the same as passing an empty list.

        Raises:
            ValueError: if ``var.model_classifier`` is not a known model name.
        """
        print('Loading training modules')
        self.bag_of_words = []
        self.vectorizer = DictVectorizer(dtype=int, sparse=True)
        self.encoder = LabelEncoder()
        self.lexicon_classifier = LexiconClassifier()
        self.classifier = self._build_model(var.model_classifier)
        self.train([] if trainset is None else trainset)

    @staticmethod
    def _build_model(name):
        """Instantiate the estimator registered under ``name``.

        Fails here, before any feature extraction, rather than later with a
        missing ``classifier`` attribute.
        """
        if name == "svm":
            return LinearSVC(C=0.005)
        if name == "randomForest":
            return RandomForestClassifier()
        if name == "naive":
            return GaussianNB()
        if name == "lreg":
            return LogisticRegression()
        if name == "sgd":
            # NOTE(review): n_iter is kept as written; newer scikit-learn
            # releases may expect max_iter instead -- confirm the version.
            return SGDClassifier(penalty='elasticnet', alpha=0.001,
                                 l1_ratio=0.85, n_iter=1000)
        raise ValueError('Unknown model_classifier: ' + repr(name))

    @staticmethod
    def _model_input(data):
        """Return ``data`` in the form the selected model is fed with.

        The "naive" model is given a dense array, every other model gets the
        vectorizer output unchanged.
        """
        if var.model_classifier == "naive":
            return data.toarray()
        return data

    @staticmethod
    def _scores_by_class(classes, scores):
        """Pair each class label with the score at the same flat position."""
        # NOTE(review): presumes one score per class -- confirm for
        # two-label training sets scored with decision_function().
        return {classes.item(i): scores.item(i) for i in range(len(classes))}

    @staticmethod
    def _ngram_tokens(unigrams):
        """Return unigrams plus joined bigrams, trigrams and skip-trigrams.

        A new list is built so ``unigrams`` itself is never extended; every
        n-gram is therefore computed over the plain unigram sequence.
        """
        tokens = list(unigrams)
        tokens += ['_'.join(b) for b in bigrams(unigrams)]
        tokens += ['_'.join(t) for t in trigrams(unigrams)]
        tokens += [t1 + '_*_' + t3 for t1, t2, t3 in trigrams(unigrams)]
        return tokens

    @classmethod
    def _tag_counts(cls, tweet_tags):
        """Return ``{'num_<tag>': count}`` for every tag in ``_POS_TAGS``."""
        return {
            'num_' + tag: sum(1 for t in tweet_tags if t == tag)
            for tag in cls._POS_TAGS
        }

    @classmethod
    def _style_features(cls, tweet_tokens):
        """Return the elongation, repeated-punctuation and all-caps features."""
        return {
            'has_char_ngrams': any(
                cls._CHAR_RUN.search(token) for token, tag in tweet_tokens),
            'has_punct_ngrams': any(
                cls._PUNCT_RUN.search(token) for token, tag in tweet_tokens),
            # only words of three or more characters count as "shouted"
            'num_all_caps': sum(
                1 for token, tag in tweet_tokens
                if token.isupper() and len(token) >= 3),
        }

    # Extract features for ML process
    # Some insights from http://aclweb.org/anthology/S/S13/S13-2053.pdf
    def extract_features(self, tweet_tokens):
        """Build the feature dictionary for one tweet.

        Args:
            tweet_tokens: re-iterable sequence of ``(token, tag)`` pairs.

        Returns:
            dict mapping feature names to values: ``has_<ngram>`` flags for
            n-grams present in ``self.bag_of_words``, ``num_<tag>`` counts,
            ``has_negator`` (only set when true), ``has_char_ngrams``,
            ``has_punct_ngrams``, ``num_all_caps``, ``pos_lexicon`` and
            ``neg_lexicon``.
        """
        if len(self.bag_of_words) == 0:
            print('Bag-of-Words empty!')

        unigrams = [w.lower() for w, t in tweet_tokens]
        # Same expansion as in train(), so the tokens line up with the
        # vocabulary stored in bag_of_words.
        token_set = set(self._ngram_tokens(unigrams))
        tweet_tags = [tag for token, tag in tweet_tokens]

        feature_set = {}

        # 1st set of features: bag-of-words
        for token in token_set.intersection(self.bag_of_words):
            feature_set['has_' + token] = True

        # 2nd set of features: the count for each tag type in the message
        feature_set.update(self._tag_counts(tweet_tags))

        # 3rd feature: negation is present?
        # Reuses the lexicon classifier created in __init__ instead of
        # building a new LexiconClassifier for every tweet.
        # NOTE(review): presumes read_negation_words() does not depend on
        # per-instance state -- confirm against LexiconClassifier.
        negators = set(self.lexicon_classifier.read_negation_words())
        if negators.intersection(token_set):
            feature_set['has_negator'] = True

        # 4th, 5th and 6th features: character n-grams, punctuation n-grams
        # and the number of all upper cased words
        feature_set.update(self._style_features(tweet_tokens))

        # 7th and 8th feature: the positive and negative score from lexicon
        # classifier (i.e., number of positive and negative words from lexicon)
        positive_score, negative_score = self.lexicon_classifier.classify(
            tweet_tokens)
        feature_set['pos_lexicon'] = positive_score
        feature_set['neg_lexicon'] = -1 * negative_score

        return feature_set

    def train(self, tweets):
        """Build the n-gram vocabulary and fit the vectorizer, encoder, model.

        Args:
            tweets: sequence of ``(tweet_tokens, label)`` pairs, where
                ``tweet_tokens`` is a sequence of ``(token, tag)`` pairs.
        """
        # 1st step: build the bag-of-words model
        print('Computing the trainset vocabulary of n-grams')
        vocabulary = set()
        for tweet_tokens, label in tweets:
            unigrams = [w.lower() for w, t in tweet_tokens]
            vocabulary.update(self._ngram_tokens(unigrams))
        self.bag_of_words = vocabulary

        # 2nd step: extract the features of every training tweet
        total_tweets = len(tweets)
        features_list = []
        for index, (tweet_tokens, label) in enumerate(tweets):
            print('Training for tweet n. {}/{}'.format(index + 1, total_tweets))
            features_list.append(self.extract_features(tweet_tokens))

        # 3rd step: fit the selected model
        print('Vectorizing the features')
        data = self.vectorizer.fit_transform(features_list)
        target = self.encoder.fit_transform(
            [label for tweet_tokens, label in tweets])

        print('Builing the model')
        self.classifier.fit(self._model_input(data), target)

    def classify(self, tweet_tokens):
        """Score a new message and record the prediction in ``var``.

        Side effects: appends the feature dictionary to ``var.features_test``
        and the predicted label to the ``var`` list registered for the
        selected model. For "lreg" and "sgd" the prediction is also printed.

        Args:
            tweet_tokens: sequence of ``(token, tag)`` pairs.

        Returns:
            dict mapping each label in ``self.encoder.classes_`` to its score
            (``decision_function`` for "svm"/"sgd", ``predict_proba`` for the
            other models), or ``None`` when ``var.model_classifier`` is not a
            known model name.
        """
        ft = self.extract_features(tweet_tokens)
        data = self.vectorizer.transform(ft)
        var.features_test.append(ft)

        model = var.model_classifier
        if model not in self._PREDICTION_LOGS:
            return None

        data = self._model_input(data)
        if model in self._MARGIN_MODELS:
            probs = self.classifier.decision_function(data)
        else:
            probs = self.classifier.predict_proba(data)

        classes = self.encoder.classes_
        predicted = classes[self.classifier.predict(data)]
        getattr(var, self._PREDICTION_LOGS[model]).append(predicted)
        if model in self._ECHO_MODELS:
            print(str(predicted))
        return self._scores_by_class(classes, probs)

    def predict_proba(self, tweet_tokens):
        """Return the model's ``predict_proba`` value for each class.

        The input is passed through ``_model_input`` so the "naive" model
        gets a dense array here as well, matching ``train`` and ``classify``.
        Only usable with models that provide ``predict_proba``.
        """
        data = self._model_input(
            self.vectorizer.transform(self.extract_features(tweet_tokens)))
        probs = self.classifier.predict_proba(data)
        return self._scores_by_class(self.encoder.classes_, probs)

    def decision_function(self, tweet_tokens):
        """Return the model's ``decision_function`` value for each class.

        Only usable with models that provide ``decision_function``.
        """
        data = self.vectorizer.transform(self.extract_features(tweet_tokens))
        probs = self.classifier.decision_function(data)
        return self._scores_by_class(self.encoder.classes_, probs)
class TwitterHybridClassifier(object):
    """Three-stage tweet sentiment classifier: rules, lexicon, then ML.

    Each stage only runs when the previous one did not reach a decision.
    """

    def __init__(self, trainset=None):
        """Create the three classifiers; the ML one is trained on ``trainset``.

        Args:
            trainset: training data handed to ``MachineLearningClassifier``.
                Omitting it is the same as passing an empty list.
        """
        self.rules_classifier = RulesClassifier()
        self.lexicon_classifier = LexiconClassifier()
        self.ml_classifier = MachineLearningClassifier(
            [] if trainset is None else trainset)

    @staticmethod
    def _confidences(scores):
        """Return ``(positive, negative)`` confidences from an ML result.

        ``MachineLearningClassifier.classify`` returns a dict keyed by class
        label, so the values are looked up by label. Any other result is read
        positionally (entry 0 as positive, entry 1 as negative, each a
        ``(label, score)`` pair), which is the layout this method previously
        assumed for every result.
        """
        if isinstance(scores, dict):
            return scores['positive'], scores['negative']
        return scores[0][1], scores[1][1]

    @staticmethod
    def _ml_sentiment(positive_conf, negative_conf):
        """Turn ML confidences into a label.

        Positive wins when its confidence is >= 0.3 and strictly above the
        negative one; otherwise negative wins when its confidence is >= 0.3;
        otherwise the tweet is neutral.
        """
        if positive_conf >= 0.3 and negative_conf < positive_conf:
            return 'positive'
        if negative_conf >= 0.3:
            return 'negative'
        return 'neutral'

    # Apply the classifier over a tweet message in String format
    def classify(self, tweet_text):
        """Classify one tweet as 'positive', 'negative' or 'neutral'.

        Args:
            tweet_text: the tweet message; it is passed through
                ``pre_process`` before any classifier sees it.

        Returns:
            The sentiment label as a string.
        """
        # 0. Pre-process the text (emoticons, misspellings, tagger)
        # NOTE(review): other callers in this file give pre_process a list of
        # messages -- confirm it also accepts a single string.
        tweet_text = pre_process(tweet_text)

        # 1. Rule-based classifier. Look for emoticons basically.
        #    If any rule fires, classify the tweet here; otherwise continue
        #    with the lexicon classifier.
        positive_score, negative_score = self.rules_classifier.classify(
            tweet_text)
        rules_score = positive_score + negative_score
        if rules_score != 0:
            return 'positive' if rules_score > 0 else 'negative'

        # 2. Lexicon-based classifier. If the lexicon score is 0 (strictly
        #    neutral), >= 3 (positive with confidence) or <= -3 (negative
        #    with confidence), classify the tweet here. If not, continue
        #    with the ML classifier.
        positive_score, negative_score = self.lexicon_classifier.classify(
            tweet_text)
        lexicon_score = positive_score + negative_score
        if lexicon_score == 0:
            return 'neutral'
        if lexicon_score >= 3:
            return 'positive'
        if lexicon_score <= -3:
            return 'negative'

        # 3. Machine learning based classifier - used the training set to
        #    define the best features to classify new instances
        scores = self.ml_classifier.classify(tweet_text)
        positive_conf, negative_conf = self._confidences(scores)
        return self._ml_sentiment(positive_conf, negative_conf)