Пример #1
0
 def __init__(self, tweet):
     """Create the feature/label encoders and the SVM, then train.

     Args:
         tweet: training data, forwarded unchanged to ``self.train``.
     """
     print('Loading training modules')
     self.bag_of_words = []
     self.vectorizer = DictVectorizer(dtype=int, sparse=True)
     self.encoder = LabelEncoder()
     self.lexicon_classifier = LexiconClassifier()
     self.classifier = LinearSVC(C=0.005)
     # Train on the data actually handed to the constructor; the name
     # ``trainset`` is not defined anywhere in this method.
     self.train(tweet)
Пример #2
0
    def __init__(self, tweets=None):
        """Set up the rules, lexicon and machine-learning classifiers.

        If a pickled ML model for the running Python major version exists on
        disk it is loaded; otherwise a new model is trained from ``tweets``
        and pickled for later runs.

        Args:
            tweets: iterable of ``(tweet_message, label)`` pairs. Only used
                when no pickled model is found. Defaults to no data.
        """
        # initialize internal variables
        self.rules_classifier = RulesClassifier()
        self.lexicon_classifier = LexiconClassifier()
        self.ml_classifier = None

        # One model file per classifier name and Python major version
        # (pickles are not shared between Python 2 and Python 3).
        pickle_suffix = 'model_python' + str(sys.version_info[0]) + '.pkl'
        model_name = str(var.model_classifier) + '-' + pickle_suffix

        # if the ML model has been generated, load the model from the pickle
        if os.path.exists(model_name):
            print('Reading the ' + str(var.model_classifier) +
                  ' model from ' + pickle_suffix)
            # NOTE(security): unpickling executes arbitrary code; only load
            # model files that this program produced itself.
            with open(model_name, 'rb') as model_file:
                self.ml_classifier = pickle.load(model_file)

        if self.ml_classifier is None:
            # Preprocess the data and train a new model
            print('Preprocessing the training data')
            # A None default (instead of a shared ``[]``) is materialised
            # here so the data can be traversed more than once.
            tweets = [] if tweets is None else list(tweets)
            tweet_messages = [message for message, _ in tweets]
            tweet_labels = [label for _, label in tweets]

            # preprocess all the tweet messages (tokenization, POS tagging
            # and normalization)
            tweet_tokens = pre_process(tweet_messages)

            # compile a trainset with tweet tokens and labels (positive,
            # negative or neutral)
            trainset = list(zip(tweet_tokens, tweet_labels))

            # initialize the classifier and train it
            classifier = MachineLearningClassifier(trainset)

            # dump the model into the pickle file
            print('Saving the trained model at ' + model_name)
            with open(model_name, 'wb') as model_file:
                pickle.dump(classifier, model_file)
            self.ml_classifier = classifier
Пример #3
0
 def __init__(self, trainset=None):
     """Create the encoders and the configured estimator, then train.

     The estimator is selected by ``var.model_classifier``: ``"svm"``,
     ``"randomForest"``, ``"naive"``, ``"lreg"`` or ``"sgd"``.

     Args:
         trainset: training data forwarded to ``self.train``. Defaults to
             an empty list.

     Raises:
         ValueError: if ``var.model_classifier`` is not one of the names
             listed above.
     """
     print('Loading training modules')
     self.bag_of_words = []
     self.vectorizer = DictVectorizer(dtype=int, sparse=True)
     self.encoder = LabelEncoder()
     self.lexicon_classifier = LexiconClassifier()

     model_name = var.model_classifier
     if model_name == "svm":
         self.classifier = LinearSVC(C=0.005)
     elif model_name == "randomForest":
         self.classifier = RandomForestClassifier()
     elif model_name == "naive":
         self.classifier = GaussianNB()
     elif model_name == "lreg":
         self.classifier = LogisticRegression()
     elif model_name == "sgd":
         # NOTE(review): ``n_iter`` is the keyword of older scikit-learn
         # releases; newer ones call it ``max_iter`` - confirm against the
         # installed version before upgrading.
         self.classifier = SGDClassifier(penalty='elasticnet',
                                         alpha=0.001,
                                         l1_ratio=0.85,
                                         n_iter=1000)
     else:
         # Fail here with a clear message instead of leaving
         # ``self.classifier`` undefined.
         raise ValueError(
             'Unknown model_classifier: ' + repr(model_name))

     # A None default avoids sharing one mutable list between instances.
     self.train([] if trainset is None else trainset)
Пример #4
0
    def extract_features(self, tweet_tokens):
        """Build the feature dictionary for one tokenized tweet.

        Args:
            tweet_tokens: sequence of ``(token, tag)`` pairs; it is iterated
                several times, so it must not be a one-shot iterator.

        Returns:
            dict mapping feature names to values: ``has_<ngram>`` flags for
            n-grams present in ``self.bag_of_words``, ``num_<tag>`` counts,
            ``has_negator``, ``has_char_ngrams``, ``has_punct_ngrams``,
            ``num_all_caps``, ``pos_lexicon`` and ``neg_lexicon``.
        """
        if len(self.bag_of_words) == 0:
            print('Bag-of-Words empty!')

        unigrams = [word.lower() for word, _ in tweet_tokens]
        word_trigrams = list(trigrams(unigrams))

        # Collect the n-grams in a *copy* of the unigram list. Extending the
        # same list object would feed the bigram strings back into the
        # trigram computation.
        tokens = list(unigrams)
        tokens += ['_'.join(pair) for pair in bigrams(unigrams)]
        tokens += ['_'.join(triple) for triple in word_trigrams]
        # skip-grams: first and third word with a wildcard in between
        tokens += [first + '_*_' + third for first, _, third in word_trigrams]
        token_set = set(tokens)

        tweet_tags = [tag for _, tag in tweet_tokens]

        feature_set = {}

        # 1st set of features: bag-of-words
        for token in token_set.intersection(self.bag_of_words):
            feature_set['has_' + token] = True

        # 2nd set of features: the count for each tag type present in the
        # message. Tweet_nlp tagset info:
        # http://www.ark.cs.cmu.edu/TweetNLP/annot_guidelines.pdf
        for tag in (
                'CC', 'CD', 'DT', 'EX', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
                'NN', 'NNP', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR',
                'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP',
                'VBZ', 'WDT', 'WP', 'WRB'
        ):
            feature_set['num_' + tag] = tweet_tags.count(tag)

        # 3rd feature: negation is present?
        # Reuse the lexicon classifier this object already holds instead of
        # constructing a new one for every tweet.
        negators = set(self.lexicon_classifier.read_negation_words())
        if negators.intersection(token_set):
            feature_set['has_negator'] = True

        # 4th feature: character ngrams (a lowercase letter repeated 3+ times)
        stretched_chars = re.compile(r"([a-z])\1{2,}")
        feature_set['has_char_ngrams'] = any(
            stretched_chars.search(token) for token, _ in tweet_tokens)

        # 5th feature: punctuation ngrams ('!' or '?' repeated 3+ times)
        stretched_punct = re.compile(r"([!\?])\1{2,}")
        feature_set['has_punct_ngrams'] = any(
            stretched_punct.search(token) for token, _ in tweet_tokens)

        # 6th feature: the number of all upper cased words (3+ characters)
        feature_set['num_all_caps'] = sum(
            1 for token, _ in tweet_tokens
            if token.isupper() and len(token) >= 3)

        # 7th and 8th feature: the positive and negative score from lexicon
        # classifier (i.e., number of positive and negative words from lexicon)
        positive_score, negative_score = self.lexicon_classifier.classify(
            tweet_tokens)
        feature_set['pos_lexicon'] = positive_score
        # sign of the negative score is flipped before it is stored
        feature_set['neg_lexicon'] = -1 * negative_score

        return feature_set
 def __init__(self, trainset=None):
     """Create the rules, lexicon and machine-learning classifiers.

     Args:
         trainset: training data passed to ``MachineLearningClassifier``.
             Defaults to an empty list.
     """
     self.rules_classifier = RulesClassifier()
     self.lexicon_classifier = LexiconClassifier()
     # A None default avoids sharing one mutable list between instances.
     self.ml_classifier = MachineLearningClassifier(
         [] if trainset is None else trainset)