Example #1
 def __init__(self, trainset=[]):
     print('Loading training modules')
     self.bag_of_words = []
     self.vectorizer = DictVectorizer(dtype=int, sparse=True)
     self.encoder = LabelEncoder()
     self.lexicon_classifier = LexiconClassifier()
     self.classifier = LinearSVC(C=0.005)
     self.train(trainset)
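For context, a minimal, self-contained sketch of the scikit-learn pipeline this constructor wires together (DictVectorizer -> LabelEncoder -> LinearSVC), using toy feature dicts rather than the output of the project's extract_features:

from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

# toy feature dicts standing in for extract_features() results
features = [{'has_good': True, 'num_all_caps': 0},
            {'has_bad': True, 'num_all_caps': 2},
            {'num_all_caps': 0}]
labels = ['positive', 'negative', 'neutral']

vectorizer = DictVectorizer(dtype=int, sparse=True)
encoder = LabelEncoder()
X = vectorizer.fit_transform(features)  # sparse matrix of indicator/count columns
y = encoder.fit_transform(labels)       # string labels -> integer ids

classifier = LinearSVC(C=0.005)
classifier.fit(X, y)
print(encoder.inverse_transform(classifier.predict(X)))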
Example #2
    def __init__(self, tweets=[]):
        # initialize internal variables
        self.rules_classifier = RulesClassifier()
        self.lexicon_classifier = LexiconClassifier()
        self.ml_classifier = None

        # if the ML model has been generated, load the model from model.pkl
        if sys.version_info >= (3, 0):
            if os.path.exists(
                    str(var.model_classifier) + '-model_python3.pkl'):
                print('Reading the ' + str(var.model_classifier) +
                      ' model from model_python3.pkl')
                self.ml_classifier = pickle.load(
                    open(
                        str(var.model_classifier) + '-model_python3.pkl',
                        'rb'))
        else:
            if os.path.exists(
                    str(var.model_classifier) + '-model_python2.pkl'):
                print('Reading the ' + str(var.model_classifier) +
                      ' model from model_python2.pkl')
                self.ml_classifier = pickle.load(
                    open(
                        str(var.model_classifier) + '-model_python2.pkl',
                        'rb'))

        if self.ml_classifier is None:
            # Preprocess the data and train a new model
            print('Preprocessing the training data')
            tweet_messages = [tweet_message for tweet_message, label in tweets]
            tweet_labels = [label for tweet_message, label in tweets]

            # pre-process all the tweet_messages (tokenization, POS and normalization)
            tweet_tokens = pre_process(tweet_messages)

            # compile a trainset with tweet_tokens and labels (positive,
            # negative or neutral)

            trainset = [(tweet_tokens[i], tweet_labels[i])
                        for i in range(len(tweets))]

            # initialize the classifier and train it
            classifier = MachineLearningClassifier(trainset)

            # dump the model into the pickle file
            python_version = sys.version_info[0]
            model_name = str(var.model_classifier) + '-model_python' + str(
                python_version) + '.pkl'
            print('Saving the trained model at ' + model_name)
            pickle.dump(classifier, open(model_name, 'wb'))
            self.ml_classifier = classifier
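The load-or-train logic above leaves its pickle file handles open; a hedged sketch of the same pattern with explicit with-blocks (load_or_train and train_fn are names assumed here, not project API):

import os
import pickle

def load_or_train(model_name, train_fn):
    """Load a pickled model if one exists, otherwise train and persist one."""
    if os.path.exists(model_name):
        with open(model_name, 'rb') as fh:  # 'with' guarantees the handle is closed
            return pickle.load(fh)
    model = train_fn()
    with open(model_name, 'wb') as fh:
        pickle.dump(model, fh)
    return model

# Mirroring the constructor above (assumed usage):
# model_name = str(var.model_classifier) + '-model_python' + str(sys.version_info[0]) + '.pkl'
# self.ml_classifier = load_or_train(model_name, lambda: MachineLearningClassifier(trainset))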
Example #3
 def __init__(self, trainset=[]):
     print('Loading training modules')
     self.bag_of_words = []
     self.vectorizer = DictVectorizer(dtype=int, sparse=True)
     self.encoder = LabelEncoder()
     self.lexicon_classifier = LexiconClassifier()
     self.classifier = LinearSVC(C=0.005)
     self.train(trainset)
Example #4
 def __init__(self, trainset=[]):
     print('Loading training modules')
     self.bag_of_words = []
     self.vectorizer = DictVectorizer(dtype=int, sparse=True)
     self.encoder = LabelEncoder()
     self.lexicon_classifier = LexiconClassifier()
     if var.model_classifier == "svm":
         self.classifier = LinearSVC(C=0.005)
     elif var.model_classifier == "randomForest":
         self.classifier = RandomForestClassifier()
     elif var.model_classifier == "naive":
         self.classifier = GaussianNB()
     elif var.model_classifier == "lreg":
         self.classifier = LogisticRegression()
     elif var.model_classifier == "sgd":
         self.classifier = SGDClassifier(penalty='elasticnet',
                                         alpha=0.001,
                                         l1_ratio=0.85,
                                         n_iter=1000)
     self.train(trainset)
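One possible refactoring of the if/elif chain above is a dispatch table from var.model_classifier values to estimator factories. A sketch under the assumption that the same scikit-learn estimators are wanted (note that recent scikit-learn versions renamed SGDClassifier's n_iter parameter to max_iter):

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

CLASSIFIERS = {
    'svm': lambda: LinearSVC(C=0.005),
    'randomForest': lambda: RandomForestClassifier(),
    'naive': lambda: GaussianNB(),
    'lreg': lambda: LogisticRegression(),
    'sgd': lambda: SGDClassifier(penalty='elasticnet', alpha=0.001,
                                 l1_ratio=0.85, max_iter=1000),
}

def make_classifier(name):
    try:
        return CLASSIFIERS[name]()
    except KeyError:
        raise ValueError('unknown model_classifier: %r' % name)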
Example #5
    def __init__(self, tweets=[]):
        # initialize internal variables
        self.rules_classifier = RulesClassifier()
        self.lexicon_classifier = LexiconClassifier()
        self.ml_classifier = None

        # if the ML model has been generated, load the model from model.pkl
        if sys.version_info >= (3,0):
            if os.path.exists('model_python3.pkl'):
                print ('Reading the model from model_python3.pkl')
                self.ml_classifier = pickle.load(open('model_python3.pkl','rb'))
        else:
            if os.path.exists('model_python2.pkl'):
                print ('Reading the model from model_python2.pkl')
                self.ml_classifier = pickle.load(open('model_python2.pkl','rb'))

        if self.ml_classifier is None:
            # Preprocess the data and train a new model
            print ('Preprocessing the training data')
            tweet_messages = [tweet_message for tweet_message,label in tweets]
            tweet_labels = [label for tweet_message,label in tweets]

            # pre-process all the tweet_messages (tokenization, POS and normalization)
            tweet_tokens = pre_process(tweet_messages)

            # compile a trainset with tweet_tokens and labels (positive,
            # negative or neutral)

            trainset = [(tweet_tokens[i],tweet_labels[i]) for i in range(len(tweets))]

            # initialize the classifier and train it
            classifier = MachineLearningClassifier(trainset)

            # dump the model into the pickle file
            python_version = sys.version_info[0]
            model_name = 'model_python' + str(python_version) + '.pkl'
            print ('Saving the trained model at ' + model_name)
            pickle.dump(classifier, open(model_name, 'wb'))
            self.ml_classifier = classifier
Example #6
    def extract_features(self, tweet_tokens):

        if len(self.bag_of_words) == 0:
            print('Bag-of-Words empty!')

        unigrams = [w.lower() for w, t in tweet_tokens]
        tokens = list(unigrams)  # copy so the += extensions below do not mutate unigrams
        tokens += ['_'.join(b) for b in bigrams(unigrams)]
        tokens += ['_'.join(t) for t in trigrams(unigrams)]
        tokens += [t1 + '_*_' + t3 for t1, t2, t3 in trigrams(unigrams)]

        tweet_tags = [tag for token, tag in tweet_tokens]

        feature_set = {}

        # 1st set of features: bag-of-words
        for token in set(tokens).intersection(self.bag_of_words):
            feature_set['has_' + token] = True

        # 2nd set of features: the count for each tag type present in the message
        # Tweet_nlp tagset. Info:
        # http://www.ark.cs.cmu.edu/TweetNLP/annot_guidelines.pdf
        for tag in [
                'CC', 'CD', 'DT', 'EX', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
                'NN', 'NNP', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR',
                'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP',
                'VBZ', 'WDT', 'WP', 'WRB'
        ]:
            feature_set['num_' + tag] = sum(
                [1 for t in tweet_tags if t == tag])

        # 3rd feature: is negation present?
        negators = set(LexiconClassifier().read_negation_words())
        if len(negators.intersection(set(tokens))) > 0:
            feature_set['has_negator'] = True

        # 4th feature: character ngrams
        regexp = re.compile(r"([a-z])\1{2,}")
        feature_set['has_char_ngrams'] = False
        for token, tag in tweet_tokens:
            if regexp.search(token):
                feature_set['has_char_ngrams'] = True
                break

        # 5th feature: punctuation ngrams
        regexp = re.compile(r"([!\?])\1{2,}")
        feature_set['has_punct_ngrams'] = False
        for token, tag in tweet_tokens:
            if regexp.search(token):
                feature_set['has_punct_ngrams'] = True
                break

        # 6th feature: the number of all upper cased words
        feature_set['num_all_caps'] = sum([
            1 for token, tag in tweet_tokens
            if token.isupper() and len(token) >= 3
        ])

        # 7th and 8th feature: the positive and negative score from lexicon
        # classifier (i.e., number of positive and negative words from lexicon)
        positive_score, negative_score = self.lexicon_classifier.classify(
            tweet_tokens)
        feature_set['pos_lexicon'] = positive_score
        feature_set['neg_lexicon'] = -1 * negative_score

        return feature_set
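A self-contained illustration of the n-gram and regex features above on a toy (token, tag) list, using nltk's bigrams/trigrams helpers that this code relies on:

import re
from nltk import bigrams, trigrams

tweet_tokens = [('I', 'PRP'), ('LOVE', 'VBP'), ('this', 'DT'),
                ('sooo', 'RB'), ('much', 'RB'), ('!!!', ',')]

unigrams = [w.lower() for w, t in tweet_tokens]
print(['_'.join(b) for b in bigrams(unigrams)])   # ['i_love', 'love_this', ...]
print(bool(re.search(r"([a-z])\1{2,}", 'sooo')))  # True: elongated character run
print(bool(re.search(r"([!\?])\1{2,}", '!!!')))   # True: repeated punctuation
print(sum(1 for w, t in tweet_tokens if w.isupper() and len(w) >= 3))  # 1 ('LOVE')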
Example #7
class MachineLearningClassifier(object):

    # Constructor
    def __init__(self, trainset=[]):
        print('Loading training modules')
        self.bag_of_words = []
        self.vectorizer = DictVectorizer(dtype=int, sparse=True)
        self.encoder = LabelEncoder()
        self.lexicon_classifier = LexiconClassifier()
        self.classifier = LinearSVC(C=0.005)
        self.train(trainset)

    # Extract features for ML process
    # Some insights from http://aclweb.org/anthology/S/S13/S13-2053.pdf
    def extract_features(self, tweet_tokens):

        if len(self.bag_of_words) == 0:
            print('Bag-of-Words empty!')

        unigrams = [w.lower() for w, t in tweet_tokens]
        tokens = list(unigrams)  # copy so the += extensions below do not mutate unigrams
        tokens += ['_'.join(b) for b in bigrams(unigrams)]
        tokens += ['_'.join(t) for t in trigrams(unigrams)]
        tokens += [t1 + '_*_' + t3 for t1, t2, t3 in trigrams(unigrams)]

        tweet_tags = [tag for token, tag in tweet_tokens]

        feature_set = {}

        # 1st set of features: bag-of-words
        for token in set(tokens).intersection(self.bag_of_words):
            feature_set['has_' + token] = True

        # 2nd set of features: the count for each tag type present in the message
        # Tweet_nlp tagset. Info:
        # http://www.ark.cs.cmu.edu/TweetNLP/annot_guidelines.pdf
        for tag in [
                'CC', 'CD', 'DT', 'EX', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
                'NN', 'NNP', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR',
                'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP',
                'VBZ', 'WDT', 'WP', 'WRB'
        ]:
            feature_set['num_' + tag] = sum(
                [1 for t in tweet_tags if t == tag])

        # 3rd feature: is negation present?
        negators = set(LexiconClassifier().read_negation_words())
        if len(negators.intersection(set(tokens))) > 0:
            feature_set['has_negator'] = True

        # 4th feature: character ngrams
        regexp = re.compile(r"([a-z])\1{2,}")
        feature_set['has_char_ngrams'] = False
        for token, tag in tweet_tokens:
            if regexp.search(token):
                feature_set['has_char_ngrams'] = True
                break

        # 5th feature: punctuation ngrams
        regexp = re.compile(r"([!\?])\1{2,}")
        feature_set['has_punct_ngrams'] = False
        for token, tag in tweet_tokens:
            if regexp.search(token):
                feature_set['has_punct_ngrams'] = True
                break

        # 6th feature: the number of all upper cased words
        feature_set['num_all_caps'] = sum([
            1 for token, tag in tweet_tokens
            if token.isupper() and len(token) >= 3
        ])

        # 7th and 8th feature: the positive and negative score from lexicon
        # classifier (i.e., number of positive and negative words from lexicon)
        positive_score, negative_score = self.lexicon_classifier.classify(
            tweet_tokens)
        feature_set['pos_lexicon'] = positive_score
        feature_set['neg_lexicon'] = -1 * negative_score

        return feature_set

    # train the classifier
    # The tweets argument must be a list of (tweet_tokens, label) tuples,
    # where label is the classification class ('positive', 'negative' or
    # 'neutral').
    def train(self, tweets):
        # 1st step: build the bag-of-words model
        tweet_tokens_list = [tweet_tokens for tweet_tokens, label in tweets]
        tokens = []
        print('Computing the trainset vocabulary of n-grams')
        for tweet_tokens in tweet_tokens_list:
            unigrams = [w.lower() for w, t in tweet_tokens]
            tokens += unigrams
            tokens += ['_'.join(b) for b in bigrams(unigrams)]
            tokens += ['_'.join(t) for t in trigrams(unigrams)]
            tokens += [t1 + '_*_' + t3 for t1, t2, t3 in trigrams(unigrams)]

        # build the bag-of-words list using all the tokens
        self.bag_of_words = set(tokens)

        data = list()
        total_tweets = len(tweets)
        features_list = list()
        for index, (tweet_tokens, label) in enumerate(tweets):
            print('Training for tweet n. {}/{}'.format(index + 1,
                                                       total_tweets))
            features_list.append(self.extract_features(tweet_tokens))

        # Train an SVM classifier
        #data = self.vectorizer.fit_transform([features for features,label in self.train_set_features])
        print('Vectorizing the features')
        data = self.vectorizer.fit_transform(features_list)
        target = self.encoder.fit_transform(
            [label for tweet_tokens, label in tweets])
        print('Building the model')
        self.classifier.fit(data, target)

    # classify a new message. Return the decision-function score for each
    # classification class
    def classify(self, tweet_tokens):
        data = self.vectorizer.transform(self.extract_features(tweet_tokens))
        probs = self.classifier.decision_function(data)
        classes = self.encoder.classes_
        return {classes.item(i): probs.item(i) for i in range(len(classes))}

    # return the decision-function score for each of the three classes
    def decision_function(self, tweet_tokens):
        data = self.vectorizer.transform(self.extract_features(tweet_tokens))
        probs = self.classifier.decision_function(data)
        classes = self.encoder.classes_
        return {classes.item(i): probs.item(i) for i in range(len(classes))}
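A hedged usage sketch for the class above; it assumes the project's LexiconClassifier and the scikit-learn/nltk imports are available, and spells out by hand the (token, tag) pairs that pre_process would normally produce:

trainset = [
    ([('i', 'PRP'), ('love', 'VBP'), ('it', 'PRP')], 'positive'),
    ([('i', 'PRP'), ('hate', 'VBP'), ('it', 'PRP')], 'negative'),
    ([('it', 'PRP'), ('is', 'VBZ'), ('ok', 'JJ')], 'neutral'),
]
mlc = MachineLearningClassifier(trainset)
scores = mlc.classify([('i', 'PRP'), ('love', 'VBP'), ('it', 'PRP')])
print(max(scores, key=scores.get))  # class with the highest decision score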
Example #8
class TwitterHybridClassifier(object):
    def __init__(self, tweets=[]):
        # initialize internal variables
        self.rules_classifier = RulesClassifier()
        self.lexicon_classifier = LexiconClassifier()
        self.ml_classifier = None

        # if the ML model has been generated, load the model from model.pkl
        if sys.version_info >= (3, 0):
            if os.path.exists(
                    str(var.model_classifier) + '-model_python3.pkl'):
                print('Reading the ' + str(var.model_classifier) +
                      ' model from model_python3.pkl')
                self.ml_classifier = pickle.load(
                    open(
                        str(var.model_classifier) + '-model_python3.pkl',
                        'rb'))
        else:
            if os.path.exists(
                    str(var.model_classifier) + '-model_python2.pkl'):
                print('Reading the ' + str(var.model_classifier) +
                      ' model from model_python2.pkl')
                self.ml_classifier = pickle.load(
                    open(
                        str(var.model_classifier) + '-model_python2.pkl',
                        'rb'))

        if self.ml_classifier is None:
            # Preprocess the data and train a new model
            print('Preprocessing the training data')
            tweet_messages = [tweet_message for tweet_message, label in tweets]
            tweet_labels = [label for tweet_message, label in tweets]

            # pre-process all the tweet_messages (tokenization, POS and normalization)
            tweet_tokens = pre_process(tweet_messages)

            # compile a trainset with tweet_tokens and labels (positive,
            # negative or neutral)

            trainset = [(tweet_tokens[i], tweet_labels[i])
                        for i in range(len(tweets))]

            # initialize the classifier and train it
            classifier = MachineLearningClassifier(trainset)

            # dump the model into the pickle file
            python_version = sys.version_info[0]
            model_name = str(var.model_classifier) + '-model_python' + str(
                python_version) + '.pkl'
            print('Saving the trained model at ' + model_name)
            pickle.dump(classifier, open(model_name, 'wb'))
            self.ml_classifier = classifier

    # Apply the classifier over a tweet message in String format
    def classify(self, tweet_text):

        # 0. Pre-process the tweets (tokenization, tagger, normalizations)
        tweet_tokens_list = []

        print('Preprocessing the string')
        # pre-process the tweets
        tweet_tokens_list = pre_process([tweet_text])

        predictions = []
        total_tweets = len(tweet_tokens_list)

        # iterate over the tweet_tokens
        for index, tweet_tokens in enumerate(tweet_tokens_list):

            # 1. Rule-based classifier. Look for emoticons basically
            positive_score, negative_score = self.rules_classifier.classify(
                tweet_tokens)

            # 1. Apply the rules. If any fire, classify the tweet here; if none do, continue to the lexicon classifier.
            if positive_score >= 1 and negative_score == 0:
                sentiment = ('positive', 'RB')
                predictions.append(sentiment)
                continue
            elif positive_score == 0 and negative_score <= -1:
                sentiment = ('negative', 'RB')
                predictions.append(sentiment)
                continue

            # 2. Lexicon-based classifier
            positive_score, negative_score = self.lexicon_classifier.classify(
                tweet_tokens)
            lexicon_score = positive_score + negative_score

            # 2. Apply the lexicon classifier.
            # If the scores fall within the thresholds, classify the tweet here. If not, continue to the ML classifier
            if positive_score >= 1 and negative_score == 0:
                sentiment = ('positive', 'LB')
                predictions.append(sentiment)
                continue
            elif negative_score <= -2:
                sentiment = ('negative', 'LB')
                predictions.append(sentiment)
                continue

            # 3. Machine learning based classifier - uses the Train+Dev sets to define the best features to classify new instances
            result = self.ml_classifier.classify(tweet_tokens)
            positive_conf = result['positive']
            negative_conf = result['negative']
            neutral_conf = result['neutral']

            if negative_conf >= -0.4:
                sentiment = ('negative', 'ML')
            elif positive_conf > neutral_conf:
                sentiment = ('positive', 'ML')
            else:
                sentiment = ('neutral', 'ML')

            predictions.append(sentiment)

        return predictions

    # Apply the classifier in batch over a list of tweet messages in String format
    def classify_batch(self, tweet_texts):

        # 0. Pre-process the tweets (tokenization, tagger, normalizations)
        tweet_tokens_list = []

        if len(tweet_texts) == 0:
            return tweet_tokens_list

        print('Preprocessing the test data')
        # pre-process the tweets
        tweet_tokens_list = pre_process(tweet_texts)

        predictions = []
        total_tweets = len(tweet_tokens_list)

        line_save = []

        my_index = 0

        # iterate over the tweet_tokens
        for index, tweet_tokens in enumerate(tweet_tokens_list):

            print('Testing for tweet n. {}/{}'.format(index + 1, total_tweets))
            '''
            I commented this part out to classify all the messages using only the ML method (airtonbjunior)

            # 1. Rule-based classifier. Look for emoticons basically
            positive_score,negative_score = self.rules_classifier.classify(tweet_tokens)

            # 1. Apply the rules, If any found, classify the tweet here. If none found, continue for the lexicon classifier.
            if positive_score >= 1 and negative_score == 0:
                sentiment = ('positive','RB')
                predictions.append(sentiment)
                continue
            elif positive_score == 0 and negative_score <= -1:
                sentiment = ('negative','RB')
                predictions.append(sentiment)
                continue

            # 2. Lexicon-based classifier
            positive_score, negative_score = self.lexicon_classifier.classify(tweet_tokens)
            lexicon_score = positive_score + negative_score

            # 2. Apply lexicon classifier,
            # If in the threshold classify the tweet here. If not, continue for the ML classifier
            if positive_score >= 1 and negative_score == 0:
                sentiment = ('positive','LB')
                predictions.append(sentiment)
                continue
            elif negative_score <= -2:
                sentiment = ('negative','LB')
                predictions.append(sentiment)
                continue
            '''

            # 3. Machine learning based classifier - uses the Train+Dev sets to define the best features to classify new instances
            result = self.ml_classifier.classify(tweet_tokens)
            #print(str(result))
            #input("Press enter to continue...")
            positive_conf = result['positive']
            negative_conf = result['negative']
            neutral_conf = result['neutral']

            line_save.append(
                str(positive_conf) + '\t' + str(negative_conf) + '\t' +
                str(neutral_conf))

            #print(str(positive_conf))
            #print(str(negative_conf))
            #print(str(neutral_conf))

            if var.model_classifier == "svm":
                if negative_conf >= -0.4:
                    sentiment = ('negative', 'ML')
                elif positive_conf > neutral_conf:
                    sentiment = ('positive', 'ML')
                else:
                    sentiment = ('neutral', 'ML')
            elif var.model_classifier == "randomForest":
                if positive_conf > negative_conf and positive_conf > neutral_conf:
                    sentiment = ('positive', 'ML')
                elif negative_conf > positive_conf and negative_conf > neutral_conf:
                    sentiment = ('negative', 'ML')
                elif neutral_conf > positive_conf and neutral_conf > negative_conf:
                    sentiment = ('neutral', 'ML')
                else:
                    if positive_conf == neutral_conf:
                        sentiment = ('positive', 'ML')
                    elif negative_conf == neutral_conf:
                        sentiment = ('negative', 'ML')
                    else:
                        sentiment = ('neutral', 'ML')
            elif var.model_classifier == "naive":
                #sentiment = var.naive_raw_predict[my_index]
                #print(str(sentiment))
                sentiment = ""

            elif var.model_classifier == "lreg":
                if positive_conf > negative_conf and positive_conf > neutral_conf:
                    sentiment = ('positive', 'ML')
                elif negative_conf > positive_conf and negative_conf > neutral_conf:
                    sentiment = ('negative', 'ML')
                elif neutral_conf > positive_conf and neutral_conf > negative_conf:
                    sentiment = ('neutral', 'ML')

            elif var.model_classifier == "sgd":
                if positive_conf > negative_conf and positive_conf > neutral_conf:
                    sentiment = ('positive', 'ML')
                elif negative_conf > positive_conf and negative_conf > neutral_conf:
                    sentiment = ('negative', 'ML')
                elif neutral_conf > positive_conf and neutral_conf > negative_conf:
                    sentiment = ('neutral', 'ML')

            predictions.append(sentiment)
            my_index += 1

        print('Saving the predictions values of ' + str(var.model_classifier) +
              ' on file ' + str(var.model_classifier) + '_test_results.txt')
        with open(str(var.model_classifier) + '_test_results.txt', 'a') as fr:
            ii = 0
            for pred in line_save:
                if (var.model_classifier) == "randomForest":
                    fr.write(pred + '\t' + str(var.rf_predicts[ii])[2:-2] +
                             '\n')
                elif (var.model_classifier) == "svm":
                    fr.write(pred + '\t' + str(var.svm_predicts[ii][2:-2]) +
                             '\n')
                elif (var.model_classifier) == "naive":
                    fr.write(pred + '\t' + str(var.naive_predicts[ii][2:-2]) +
                             '\n')
                elif (var.model_classifier) == "lreg":
                    fr.write(pred + '\t' + str(var.lreg_predicts[ii]) + '\n')
                elif (var.model_classifier) == "sgd":
                    fr.write(pred + '\t' + str(var.sgd_predicts[ii]) + '\n')
                ii += 1

        return predictions

    # Output Individual scores for each method
    def output_individual_scores(self, tweets):

        tweet_texts = [tweet_message for tweet_message, label in tweets]
        tweet_labels = [label for tweet_message, label in tweets]

        # write the log
        fp = codecs.open('individual_scores.tab', 'w', encoding='utf8')
        line = 'pos_score_rule\tneg_score_rule\tpos_score_lex\tneg_score_lex\tpos_conf\tneg_conf\tneutral_conf\tclass\tmessage\n'
        fp.write(line)

        # 0. Pre-process the text (emoticons, misspellings, tagger)
        tweet_tokens_list = None
        tweet_tokens_list = pre_process(tweet_texts)

        predictions = []
        for index, tweet_tokens in enumerate(tweet_tokens_list):
            line = ''

            # 1. Rule-based classifier. Look for emoticons basically
            positive_score, negative_score = self.rules_classifier.classify(
                tweet_tokens)
            line += str(positive_score) + '\t' + str(negative_score) + '\t'

            # 2. Lexicon-based classifier (using url_score obtained from RulesClassifier)
            positive_score, negative_score = self.lexicon_classifier.classify(
                tweet_tokens)
            lexicon_score = positive_score + negative_score
            line += str(positive_score) + '\t' + str(negative_score) + '\t'

            # 3. Machine learning based classifier - uses the training set to define the best features to classify new instances
            result = self.ml_classifier.decision_function(tweet_tokens)
            line += str(result['positive']) + '\t' + str(
                result['negative']) + '\t' + str(result['neutral']) + '\t'

            line += tweet_labels[index] + '\t"' + tweet_texts[index].replace(
                '"', '') + '"\n'

            fp.write(line)
        print('Individual scores saved in the file: individual_scores.tab')
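The three-stage cascade inside classify() above, distilled into a pure function with the same thresholds (a readability sketch, not project code; the RB/LB/ML tag records which stage decided):

def cascade(rule_pos, rule_neg, lex_pos, lex_neg, ml_scores):
    # 1. rule stage (emoticons)
    if rule_pos >= 1 and rule_neg == 0:
        return ('positive', 'RB')
    if rule_pos == 0 and rule_neg <= -1:
        return ('negative', 'RB')
    # 2. lexicon stage
    if lex_pos >= 1 and lex_neg == 0:
        return ('positive', 'LB')
    if lex_neg <= -2:
        return ('negative', 'LB')
    # 3. machine-learning stage
    if ml_scores['negative'] >= -0.4:
        return ('negative', 'ML')
    if ml_scores['positive'] > ml_scores['neutral']:
        return ('positive', 'ML')
    return ('neutral', 'ML')

print(cascade(1, 0, 0, 0, {}))  # ('positive', 'RB'): the rule stage short-circuits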
Example #9
class TwitterHybridClassifier(object):
    predictions = []

    def __init__(self, tweets=[]):
        # initialize internal variables
        self.rules_classifier = RulesClassifier()
        self.lexicon_classifier = LexiconClassifier()
        self.ml_classifier = None

        # if the ML model has been generated, load the model from model.pkl
        if sys.version_info >= (3, 0):
            if os.path.exists('model_python3.pkl'):
                print('Reading the model from model_python3.pkl')
                self.ml_classifier = pickle.load(
                    open('model_python3.pkl', 'rb'))
        else:
            if os.path.exists('model_python2.pkl'):
                print('Reading the model from model_python2.pkl')
                self.ml_classifier = pickle.load(
                    open('model_python2.pkl', 'rb'))

        if self.ml_classifier is None:
            # Preprocess the data and train a new model
            print('Preprocessing the training data')
            tweet_messages = [tweet_message for tweet_message, label in tweets]
            tweet_labels = [label for tweet_message, label in tweets]

            # pre-process all the tweet_messages (tokenization, POS and normalization)
            tweet_tokens = pre_process(tweet_messages)

            # compile a trainset with tweet_tokens and labels (positive,
            # negative or neutral)

            trainset = [(tweet_tokens[i], tweet_labels[i])
                        for i in range(len(tweets))]

            # initialize the classifier and train it
            classifier = MachineLearningClassifier(trainset)

            # dump the model into the pickle file
            python_version = sys.version_info[0]
            model_name = 'model_python' + str(python_version) + '.pkl'
            print('Saving the trained model at ' + model_name)
            pickle.dump(classifier, open(model_name, 'wb'))
            self.ml_classifier = classifier

    # Apply the classifier over a tweet message in String format
    def classify(self, tweet_text):

        # 0. Pre-process the tweet (tokenization, tagger, normalizations)
        predictions = []

        print('Preprocessing the string')
        # pre-process the tweets

        # pre_process returns a list of token lists; take this tweet's entry
        tweet_tokens = pre_process([tweet_text])[0]

        print(tweet_tokens)

        # 1. Rule-based classifier. Look for emoticons basically
        positive_score, negative_score = self.rules_classifier.classify(
            tweet_tokens)

        # 1. Apply the rules. If any fire, classify the tweet here; if none do, continue to the lexicon classifier.
        if positive_score >= 1 and negative_score == 0:
            sentiment = ('positive', 'EB')
            predictions.append(sentiment)
            #continue
        elif positive_score == 0 and negative_score <= -1:
            sentiment = ('negative', 'EB')
            predictions.append(sentiment)
            #continue

        # 2. Lexicon-based classifier
        positive_score, negative_score = self.lexicon_classifier.classify(
            tweet_tokens)
        lexicon_score = positive_score + negative_score

        if positive_score >= 1 and negative_score == 0:
            sentiment = ('positive', 'LB')
            predictions.append(sentiment)
            #continue

        elif negative_score <= -2:
            sentiment = ('negative', 'LB')
            predictions.append(sentiment)
        #continue

        # 3. Machine learning based classifier - uses the Train+Dev sets to define the best features to classify new instances
        result = self.ml_classifier.classify(tweet_tokens)
        positive_conf = result['positive']
        negative_conf = result['negative']
        neutral_conf = result['neutral']

        if negative_conf >= -0.4:
            sentiment = ('negative', 'ML')
        elif positive_conf > neutral_conf:
            sentiment = ('positive', 'ML')
        else:
            sentiment = ('neutral', 'ML')

        predictions.append(sentiment)

        return predictions
Example #10
 def __init__(self, trainset=[]):
     self.rules_classifier = RulesClassifier()
     self.lexicon_classifier = LexiconClassifier()
     self.ml_classifier = MachineLearningClassifier(trainset)
Example #11
class TwitterHybridClassifier(object):

    def __init__(self, trainset=[]):
        self.rules_classifier = RulesClassifier()
        self.lexicon_classifier = LexiconClassifier()
        self.ml_classifier = MachineLearningClassifier(trainset)

    # Apply the classifier over a tweet message in String format
    def classify(self,tweet_text):

        # 0. Pre-process the text (emoticons, misspellings, tagger)
        tweet_text = pre_process(tweet_text)

        # 1. Rule-based classifier. Look for emoticons basically
        positive_score,negative_score = self.rules_classifier.classify(tweet_text)
        rules_score = positive_score + negative_score

        # 1. Apply the rules. If any fire, classify the tweet here; if none do, continue to the lexicon classifier.
        if rules_score != 0:
            if rules_score > 0:
                sentiment = 'positive'
            else:
                sentiment = 'negative'
            return sentiment

        # 2. Lexicon-based classifier
        positive_score, negative_score = self.lexicon_classifier.classify(tweet_text)
        lexicon_score = positive_score + negative_score

        # 2. Apply the lexicon classifier. If the lexicon score is
        # 0 (strictly neutral), >=3 (positive with confidence) or
        # <=-3 (negative with confidence), classify the tweet here. If not,
        # continue to the SVM classifier
        if lexicon_score == 0:
            sentiment = 'neutral'
            return sentiment

        if lexicon_score >= 3:
            sentiment = 'positive'
            return sentiment

        if lexicon_score <= -3:
            sentiment = 'negative'
            return sentiment

        # 3. Machine learning based classifier - uses the training set to define the best features to classify new instances
        scores = self.ml_classifier.classify(tweet_text)
        positive_conf = scores[0][1]
        negative_conf = scores[1][1]
        neutral_conf = scores[2][1]

        # 3. Apply the machine learning classifier. If the positive or negative
        # confidence (probability) is >=0.3, classify with that sentiment.
        # Otherwise, classify as neutral
        if positive_conf >= 0.3 and negative_conf < positive_conf:
            sentiment = 'positive'
        elif negative_conf >= 0.3:
            sentiment = 'negative'
        else:
            sentiment = 'neutral'

        return sentiment
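This variant thresholds on probabilities (>= 0.3), but LinearSVC only exposes decision_function, not predict_proba. If calibrated probabilities are wanted, one option (an assumption, not what this repository does) is scikit-learn's CalibratedClassifierCV:

from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC

base = LinearSVC(C=0.005)
clf = CalibratedClassifierCV(base, cv=3)  # sigmoid (Platt-style) calibration by default
# After clf.fit(X, y), clf.predict_proba(X) returns per-class probabilities
# to which the >= 0.3 thresholds above can be applied.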
Example #12
class TwitterHybridClassifier(object):

    def __init__(self, tweets=[]):
        # initialize internal variables
        self.rules_classifier = RulesClassifier()
        self.lexicon_classifier = LexiconClassifier()
        self.ml_classifier = None

        # if the ML model has been generated, load the model from model.pkl
        if sys.version_info >= (3,0):
            if os.path.exists('model_python3.pkl'):
                print ('Reading the model from model_python3.pkl')
                self.ml_classifier = pickle.load(open('model_python3.pkl','rb'))
        else:
            if os.path.exists('model_python2.pkl'):
                print ('Reading the model from model_python2.pkl')
                self.ml_classifier = pickle.load(open('model_python2.pkl','rb'))

        if self.ml_classifier is None:
            # Preprocess the data and train a new model
            print ('Preprocessing the training data')
            tweet_messages = [tweet_message for tweet_message,label in tweets]
            tweet_labels = [label for tweet_message,label in tweets]

            # pre-process all the tweet_messages (tokenization, POS and normalization)
            tweet_tokens = pre_process(tweet_messages)

            # compile a trainset with tweet_tokens and labels (positive,
            # negative or neutral)

            trainset = [(tweet_tokens[i],tweet_labels[i]) for i in range(len(tweets))]

            # initialize the classifier and train it
            classifier = MachineLearningClassifier(trainset)

            # dump the model into the pickle file
            python_version = sys.version_info[0]
            model_name = 'model_python' + str(python_version) + '.pkl'
            print ('Saving the trained model at ' + model_name)
            pickle.dump(classifier, open(model_name, 'wb'))
            self.ml_classifier = classifier

    # Apply the classifier over a tweet message in String format
    def classify(self,tweet_text):

        # 0. Pre-process the tweets (tokenization, tagger, normalizations)
        tweet_tokens_list = []

        print ('Preprocessing the string')
        # pre-process the tweets
        tweet_tokens_list = pre_process([tweet_text])

        predictions = []
        total_tweets = len(tweet_tokens_list)

        # iterate over the tweet_tokens
        for index, tweet_tokens in enumerate(tweet_tokens_list):

            # 1. Rule-based classifier. Look for emoticons basically
            positive_score,negative_score = self.rules_classifier.classify(tweet_tokens)

            # 1. Apply the rules. If any fire, classify the tweet here; if none do, continue to the lexicon classifier.
            if positive_score >= 1 and negative_score == 0:
                sentiment = ('positive','RB')
                predictions.append(sentiment)
                continue
            elif positive_score == 0 and negative_score <= -1:
                sentiment = ('negative','RB')
                predictions.append(sentiment)
                continue

            # 2. Lexicon-based classifier
            positive_score, negative_score = self.lexicon_classifier.classify(tweet_tokens)
            lexicon_score = positive_score + negative_score

            # 2. Apply the lexicon classifier.
            # If the scores fall within the thresholds, classify the tweet here. If not, continue to the ML classifier
            if positive_score >= 1 and negative_score == 0:
                sentiment = ('positive','LB')
                predictions.append(sentiment)
                continue
            elif negative_score <= -2:
                sentiment = ('negative','LB')
                predictions.append(sentiment)
                continue

            # 3. Machine learning based classifier - uses the Train+Dev sets to define the best features to classify new instances
            result = self.ml_classifier.classify(tweet_tokens)
            positive_conf = result['positive']
            negative_conf = result['negative']
            neutral_conf = result['neutral']

            if negative_conf >= -0.4:
                sentiment = ('negative','ML')
            elif positive_conf > neutral_conf:
                sentiment = ('positive','ML')
            else:
                sentiment = ('neutral','ML')

            predictions.append(sentiment)

        return predictions

    # Apply the classifier in batch over a list of tweet messages in String format
    def classify_batch(self,tweet_texts):

        # 0. Pre-process the tweets (tokenization, tagger, normalizations)
        tweet_tokens_list = []

        if len(tweet_texts) == 0:
            return tweet_tokens_list

        print ('Preprocessing the test data')
        # pre-process the tweets
        tweet_tokens_list = pre_process(tweet_texts)

        predictions = []
        total_tweets = len(tweet_tokens_list)

        # iterate over the tweet_tokens
        for index, tweet_tokens in enumerate(tweet_tokens_list):

            print('Testing for tweet n. {}/{}'.format(index+1,total_tweets))

            # 1. Rule-based classifier. Look for emoticons basically
            positive_score,negative_score = self.rules_classifier.classify(tweet_tokens)

            # 1. Apply the rules. If any fire, classify the tweet here; if none do, continue to the lexicon classifier.
            if positive_score >= 1 and negative_score == 0:
                sentiment = ('positive','RB')
                predictions.append(sentiment)
                continue
            elif positive_score == 0 and negative_score <= -1:
                sentiment = ('negative','RB')
                predictions.append(sentiment)
                continue

            # 2. Lexicon-based classifier
            positive_score, negative_score = self.lexicon_classifier.classify(tweet_tokens)
            lexicon_score = positive_score + negative_score

            # 2. Apply the lexicon classifier.
            # If the scores fall within the thresholds, classify the tweet here. If not, continue to the ML classifier
            if positive_score >= 1 and negative_score == 0:
                sentiment = ('positive','LB')
                predictions.append(sentiment)
                continue
            elif negative_score <= -2:
                sentiment = ('negative','LB')
                predictions.append(sentiment)
                continue

            # 3. Machine learning based classifier - uses the Train+Dev sets to define the best features to classify new instances
            result = self.ml_classifier.classify(tweet_tokens)
            positive_conf = result['positive']
            negative_conf = result['negative']
            neutral_conf = result['neutral']

            if negative_conf >= -0.4:
                sentiment = ('negative','ML')
            elif positive_conf > neutral_conf:
                sentiment = ('positive','ML')
            else:
                sentiment = ('neutral','ML')

            predictions.append(sentiment)

        return predictions

    # Output Individual scores for each method
    def output_individual_scores(self,tweets):

        tweet_texts = [tweet_message for tweet_message,label in tweets]
        tweet_labels = [label for tweet_message,label in tweets]

        # write the log
        fp = codecs.open('individual_scores.tab','w',encoding='utf8')
        line = 'pos_score_rule\tneg_score_rule\tpos_score_lex\tneg_score_lex\tpos_conf\tneg_conf\tneutral_conf\tclass\tmessage\n'
        fp.write(line)

        # 0. Pre-process the text (emoticons, misspellings, tagger)
        tweet_tokens_list = None
        tweet_tokens_list = pre_process(tweet_texts)

        predictions = []
        for index,tweet_tokens in enumerate(tweet_tokens_list):
            line = ''

            # 1. Rule-based classifier. Look for emoticons basically
            positive_score,negative_score = self.rules_classifier.classify(tweet_tokens)
            line += str(positive_score) + '\t' + str(negative_score) + '\t'

            # 2. Lexicon-based classifier (using url_score obtained from RulesClassifier)
            positive_score, negative_score = self.lexicon_classifier.classify(tweet_tokens)
            lexicon_score = positive_score + negative_score
            line += str(positive_score) + '\t' + str(negative_score) + '\t'

            # 3. Machine learning based classifier - uses the training set to define the best features to classify new instances
            result = self.ml_classifier.decision_function(tweet_tokens)
            line += str(result['positive']) + '\t' + str(result['negative']) + '\t' + str(result['neutral']) + '\t'

            line += tweet_labels[index] + '\t"' + tweet_texts[index].replace('"','') + '"\n'

            fp.write(line)
        print('Individual scores saved in the file: individual_scores.tab')
Example #13
class MachineLearningClassifier(object):

    # Constructor
    def __init__(self, trainset=[]):
        print ('Loading training modules')
        self.bag_of_words = []
        self.vectorizer = DictVectorizer(dtype=int, sparse=True)
        self.encoder = LabelEncoder()
        self.lexicon_classifier = LexiconClassifier()
        self.classifier = LinearSVC(C=0.005)
        self.train(trainset)

    # Extract features for ML process
    # Some insights from http://aclweb.org/anthology/S/S13/S13-2053.pdf
    def extract_features(self, tweet_tokens):

        if len(self.bag_of_words) == 0:
            print('Bag-of-Words empty!')

        unigrams = [w.lower() for w,t in tweet_tokens]
        tokens = list(unigrams)  # copy so the += extensions below do not mutate unigrams
        tokens += ['_'.join(b) for b in bigrams(unigrams)]
        tokens += ['_'.join(t) for t in trigrams(unigrams)]
        tokens += [t1 + '_*_' + t3 for t1,t2,t3 in trigrams(unigrams)]

        tweet_tags = [tag for token, tag in tweet_tokens]

        feature_set = {}

        # 1st set of features: bag-of-words
        for token in set(tokens).intersection(self.bag_of_words):
            feature_set['has_'+token] = True

        # 2nd set of features: the count for each tag type present in the message
        # Tweet_nlp tagset. Info:
        # http://www.ark.cs.cmu.edu/TweetNLP/annot_guidelines.pdf
        for tag in ['N','O','^','S','Z','V','A','R','!','D','P','&','T','X','#','@','~','U','E','$',',','G','L','M','Y']:
            feature_set['num_'+tag] = sum([1 for t in tweet_tags if t == tag])

        # 3rd feature: is negation present?
        negators = set(LexiconClassifier().read_negation_words())
        if len(negators.intersection(set(tokens))) > 0:
            feature_set['has_negator'] = True

        # 4th feature: character ngrams
        regexp = re.compile(r"([a-z])\1{2,}")
        feature_set['has_char_ngrams'] = False
        for token,tag in tweet_tokens:
            if regexp.search(token):
                feature_set['has_char_ngrams'] = True
                break

        # 5th feature: punctuation ngrams
        regexp = re.compile(r"([!\?])\1{2,}")
        feature_set['has_punct_ngrams'] = False
        for token,tag in tweet_tokens:
            if regexp.search(token):
                feature_set['has_punct_ngrams'] = True
                break

        # 6th feature: the number of all upper cased words
        feature_set['num_all_caps'] = sum([1 for token,tag in tweet_tokens if token.isupper() and len(token)>=3])

        # 7th and 8th feature: the positive and negative score from lexicon
        # classifier (i.e., number of positive and negative words from lexicon)
        positive_score, negative_score = self.lexicon_classifier.classify(tweet_tokens)
        feature_set['pos_lexicon'] = positive_score
        feature_set['neg_lexicon'] = -1 * negative_score

        return feature_set


    # train the classifier
    # The tweets argument must be a list of (tweet_tokens, label) tuples,
    # where label is the classification class ('positive', 'negative' or
    # 'neutral').
    def train(self,tweets):
        # 1st step: build the bag-of-words model
        tweet_tokens_list = [tweet_tokens for tweet_tokens,label in tweets]
        tokens = []
        print('Computing the trainset vocabulary of n-grams')
        for tweet_tokens in tweet_tokens_list:
            unigrams = [w.lower() for w,t in tweet_tokens]
            tokens += unigrams
            tokens += ['_'.join(b) for b in bigrams(unigrams)]
            tokens += ['_'.join(t) for t in trigrams(unigrams)]
            tokens += [t1 + '_*_' + t3 for t1,t2,t3 in trigrams(unigrams)]

        # build the bag-of-words list using all the tokens
        self.bag_of_words = set(tokens)

        data = list()
        total_tweets = len(tweets)
        features_list = list()
        for index,(tweet_tokens,label) in enumerate(tweets):
            print('Training for tweet n. {}/{}'.format(index+1,total_tweets))
            features_list.append(self.extract_features(tweet_tokens))

        # Train an SVM classifier
        #data = self.vectorizer.fit_transform([features for features,label in self.train_set_features])
        print('Vectorizing the features')
        data = self.vectorizer.fit_transform(features_list)
        target = self.encoder.fit_transform([label for tweet_tokens,label in tweets])
        print('Building the model')
        self.classifier.fit(data, target)



    # classify a new message. Return the decision-function score for each
    # classification class
    def classify(self, tweet_tokens):
        data = self.vectorizer.transform(self.extract_features(tweet_tokens))
        probs = self.classifier.decision_function(data)
        classes = self.encoder.classes_
        return {classes.item(i): probs.item(i) for i in range(len(classes))}


    # return the decision-function score for each of the three classes
    def decision_function(self, tweet_tokens):
        data = self.vectorizer.transform(self.extract_features(tweet_tokens))
        probs = self.classifier.decision_function(data)
        classes = self.encoder.classes_
        return {classes.item(i): probs.item(i) for i in range(len(classes))}
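For reference, the dict comprehension in classify()/decision_function() relies on LabelEncoder sorting its classes alphabetically, so the i-th score lines up with classes_[i]. A toy illustration:

import numpy as np
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder().fit(['positive', 'negative', 'neutral'])
print(encoder.classes_)               # ['negative' 'neutral' 'positive'] (sorted)
scores = np.array([-0.2, 0.1, 0.4])   # one decision score per class, toy values
print({encoder.classes_.item(i): scores.item(i) for i in range(3)})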
Example #14
class MachineLearningClassifier(object):

    # Constructor
    def __init__(self, trainset=[]):
        print('Loading training modules')
        self.bag_of_words = []
        self.vectorizer = DictVectorizer(dtype=int, sparse=True)
        self.encoder = LabelEncoder()
        self.lexicon_classifier = LexiconClassifier()
        if var.model_classifier == "svm":
            self.classifier = LinearSVC(C=0.005)
        elif var.model_classifier == "randomForest":
            self.classifier = RandomForestClassifier()
        elif var.model_classifier == "naive":
            self.classifier = GaussianNB()
        elif var.model_classifier == "lreg":
            self.classifier = LogisticRegression()
        elif var.model_classifier == "sgd":
            self.classifier = SGDClassifier(penalty='elasticnet',
                                            alpha=0.001,
                                            l1_ratio=0.85,
                                            n_iter=1000)
        self.train(trainset)

    # Extract features for ML process
    # Some insights from http://aclweb.org/anthology/S/S13/S13-2053.pdf
    def extract_features(self, tweet_tokens):

        if len(self.bag_of_words) == 0:
            print('Bag-of-Words empty!')

        #print(str(self.bag_of_words))
        #input("enter 2...")

        unigrams = [w.lower() for w, t in tweet_tokens]
        tokens = list(unigrams)  # copy so the += extensions below do not mutate unigrams
        tokens += ['_'.join(b) for b in bigrams(unigrams)]
        tokens += ['_'.join(t) for t in trigrams(unigrams)]
        tokens += [t1 + '_*_' + t3 for t1, t2, t3 in trigrams(unigrams)]

        tweet_tags = [tag for token, tag in tweet_tokens]

        feature_set = {}

        # 1st set of features: bag-of-words
        for token in set(tokens).intersection(self.bag_of_words):
            feature_set['has_' + token] = True

        # 2nd set of features: the count for each tag type present in the message
        # Tweet_nlp tagset. Info:
        # http://www.ark.cs.cmu.edu/TweetNLP/annot_guidelines.pdf
        for tag in [
                'N', 'O', '^', 'S', 'Z', 'V', 'A', 'R', '!', 'D', 'P', '&',
                'T', 'X', '#', '@', '~', 'U', 'E', '$', ',', 'G', 'L', 'M', 'Y'
        ]:
            feature_set['num_' + tag] = sum(
                [1 for t in tweet_tags if t == tag])

        # 3rd feature: is negation present?
        negators = set(LexiconClassifier().read_negation_words())
        if len(negators.intersection(set(tokens))) > 0:
            feature_set['has_negator'] = True

        # 4th feature: character ngrams
        regexp = re.compile(r"([a-z])\1{2,}")
        feature_set['has_char_ngrams'] = False
        for token, tag in tweet_tokens:
            if regexp.search(token):
                feature_set['has_char_ngrams'] = True
                break

        # 5th feature: punctuation ngrams
        regexp = re.compile(r"([!\?])\1{2,}")
        feature_set['has_punct_ngrams'] = False
        for token, tag in tweet_tokens:
            if regexp.search(token):
                feature_set['has_punct_ngrams'] = True
                break

        # 6th feature: the number of all upper cased words
        feature_set['num_all_caps'] = sum([
            1 for token, tag in tweet_tokens
            if token.isupper() and len(token) >= 3
        ])

        # 7th and 8th feature: the positive and negative score from lexicon
        # classifier (i.e., number of positive and negative words from lexicon)
        positive_score, negative_score = self.lexicon_classifier.classify(
            tweet_tokens)
        feature_set['pos_lexicon'] = positive_score
        feature_set['neg_lexicon'] = -1 * negative_score

        return feature_set

    # train the classifier
    # The tweets argument must be a list of (tweet_tokens, label) tuples,
    # where label is the classification class ('positive', 'negative' or
    # 'neutral').
    def train(self, tweets):
        # 1st step: build the bag-of-words model
        tweet_tokens_list = [tweet_tokens for tweet_tokens, label in tweets]
        tokens = []
        print('Computing the trainset vocabulary of n-grams')
        for tweet_tokens in tweet_tokens_list:
            unigrams = [w.lower() for w, t in tweet_tokens]
            tokens += unigrams
            tokens += ['_'.join(b) for b in bigrams(unigrams)]
            tokens += ['_'.join(t) for t in trigrams(unigrams)]
            tokens += [t1 + '_*_' + t3 for t1, t2, t3 in trigrams(unigrams)]

        # build the bag-of-words list using all the tokens
        self.bag_of_words = set(tokens)

        data = list()
        total_tweets = len(tweets)
        features_list = list()
        for index, (tweet_tokens, label) in enumerate(tweets):
            print('Training for tweet n. {}/{}'.format(index + 1,
                                                       total_tweets))
            features_list.append(self.extract_features(tweet_tokens))

        #import pickle

        #pickle_out = open("../../features.pickle","wb")
        #pickle.dump(features_list, pickle_out)
        #pickle_out.close()

        # Train the selected classifier
        #data = self.vectorizer.fit_transform([features for features,label in self.train_set_features])
        print('Vectorizing the features')
        data = self.vectorizer.fit_transform(features_list)
        target = self.encoder.fit_transform(
            [label for tweet_tokens, label in tweets])
        print('Building the model')
        if (var.model_classifier) == "naive":
            self.classifier.fit(data.toarray(), target)
        else:
            self.classifier.fit(data, target)

    # classify a new message. Return the per-class scores: decision-function
    # values for svm/sgd, probabilities for the other models
    def classify(self, tweet_tokens):
        #print(str(self.extract_features(tweet_tokens)))
        #input("press enter...")
        ft = self.extract_features(tweet_tokens)
        data = self.vectorizer.transform(ft)

        var.features_test.append(ft)

        #data = self.vectorizer.transform(self.extract_features(tweet_tokens))
        if var.model_classifier == "svm":
            probs = self.classifier.decision_function(data)
            classes = self.encoder.classes_
            var.svm_predicts.append(classes[self.classifier.predict(data)])

            return {
                classes.item(i): probs.item(i)
                for i in range(len(classes))
            }
        elif var.model_classifier == "randomForest":
            probs = self.classifier.predict_proba(data)
            classes = self.encoder.classes_
            var.rf_predicts.append(classes[self.classifier.predict(data)])

            return {
                classes.item(i): probs.item(i)
                for i in range(len(classes))
            }

        elif var.model_classifier == "naive":
            probs = self.classifier.predict_proba(data.toarray())
            classes = self.encoder.classes_
            var.naive_predicts.append(classes[self.classifier.predict(
                data.toarray())])

            return {
                classes.item(i): probs.item(i)
                for i in range(len(classes))
            }

        elif var.model_classifier == "lreg":
            probs = self.classifier.predict_proba(data)
            classes = self.encoder.classes_
            a = classes[self.classifier.predict(data)]
            var.lreg_predicts.append(a)
            print(str(a))

            return {
                classes.item(i): probs.item(i)
                for i in range(len(classes))
            }

        elif var.model_classifier == "sgd":
            probs = self.classifier.decision_function(data)
            classes = self.encoder.classes_
            a = classes[self.classifier.predict(data)]
            var.sgd_predicts.append(a)
            print(str(a))

            return {
                classes.item(i): probs.item(i)
                for i in range(len(classes))
            }

    # return the probability of classification into one of the three classes
    #def decision_function(self, tweet_tokens):
    def predict_proba(self, tweet_tokens):
        data = self.vectorizer.transform(self.extract_features(tweet_tokens))
        #probs = self.classifier.decision_function(data)
        probs = self.classifier.predict_proba(data)

        #        if(var.model_classifier == "naive"):
        #a = self.classifier.predict(data)
        #var.naive_raw_predict.append(a)
        #print(str(a))

        classes = self.encoder.classes_
        return {classes.item(i): probs.item(i) for i in range(len(classes))}

    def decision_function(self, tweet_tokens):
        data = self.vectorizer.transform(self.extract_features(tweet_tokens))
        probs = self.classifier.decision_function(data)
        #print(self.classifier.predict(data))
        #input("Press AGAIN...")
        #probs = self.classifier.predict_proba(data)
        classes = self.encoder.classes_
        return {classes.item(i): probs.item(i) for i in range(len(classes))}
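A note on the toarray() calls in the naive branches above: GaussianNB requires dense input, while DictVectorizer(sparse=True) produces scipy sparse matrices. A minimal demonstration on toy data:

from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import GaussianNB

X = DictVectorizer(sparse=True).fit_transform([{'a': 1}, {'b': 2}])
clf = GaussianNB()
# clf.fit(X, [0, 1])          # raises TypeError: GaussianNB cannot take sparse input
clf.fit(X.toarray(), [0, 1])  # densifying first works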
Example #15
 def __init__(self, trainset=[]):
     self.rules_classifier = RulesClassifier()
     self.lexicon_classifier = LexiconClassifier()
     self.ml_classifier = MachineLearningClassifier(trainset)
Example #16
class TwitterHybridClassifier(object):
    def __init__(self, trainset=[]):
        self.rules_classifier = RulesClassifier()
        self.lexicon_classifier = LexiconClassifier()
        self.ml_classifier = MachineLearningClassifier(trainset)

    # Apply the classifier over a tweet message in String format
    def classify(self, tweet_text):

        # 0. Pre-process the text (emoticons, misspellings, tagger)
        tweet_text = pre_process(tweet_text)

        # 1. Rule-based classifier. Look for emoticons basically
        positive_score, negative_score = self.rules_classifier.classify(
            tweet_text)
        rules_score = positive_score + negative_score

        # 1. Apply the rules. If any fire, classify the tweet here; if none do, continue to the lexicon classifier.
        if rules_score != 0:
            if rules_score > 0:
                sentiment = 'positive'
            else:
                sentiment = 'negative'
            return sentiment

        # 2. Lexicon-based classifier
        positive_score, negative_score = self.lexicon_classifier.classify(
            tweet_text)
        lexicon_score = positive_score + negative_score

        # 2. Apply the lexicon classifier. If the lexicon score is
        # 0 (strictly neutral), >=3 (positive with confidence) or
        # <=-3 (negative with confidence), classify the tweet here. If not,
        # continue to the SVM classifier
        if lexicon_score == 0:
            sentiment = 'neutral'
            return sentiment

        if lexicon_score >= 3:
            sentiment = 'positive'
            return sentiment

        if lexicon_score <= -3:
            sentiment = 'negative'
            return sentiment

        # 3. Machine learning based classifier - uses the training set to define the best features to classify new instances
        scores = self.ml_classifier.classify(tweet_text)
        positive_conf = scores[0][1]
        negative_conf = scores[1][1]
        neutral_conf = scores[2][1]

        # 3. Apply the machine learning classifier. If the positive or negative
        # confidence (probability) is >=0.3, classify with that sentiment.
        # Otherwise, classify as neutral
        if positive_conf >= 0.3 and negative_conf < positive_conf:
            sentiment = 'positive'
        elif negative_conf >= 0.3:
            sentiment = 'negative'
        else:
            sentiment = 'neutral'

        return sentiment
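Note that this variant indexes scores[0][1], scores[1][1] and scores[2][1], i.e. it expects ml_classifier.classify to return an ordered list of (label, confidence) pairs, unlike the dict returned by the MachineLearningClassifier variants above. A small adapter bridging the two shapes (a sketch; the names here are assumptions, not project code):

def as_ordered_pairs(score_dict, order=('positive', 'negative', 'neutral')):
    """Convert {'positive': p, ...} into [('positive', p), ...] in a fixed order."""
    return [(label, score_dict[label]) for label in order]

# scores = as_ordered_pairs(self.ml_classifier.classify(tweet_text))
# positive_conf = scores[0][1]; negative_conf = scores[1][1]; neutral_conf = scores[2][1]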