def __init__(self, tweet):
    """Create the feature/label encoders and a linear SVM, then train.

    Args:
        tweet: Training data forwarded unchanged to ``self.train``.
            NOTE(review): the parameter name suggests a single tweet, but
            it is the only candidate for the training set -- confirm the
            expected shape against ``train``.
    """
    print('Loading training modules')
    self.bag_of_words = []
    self.vectorizer = DictVectorizer(dtype=int, sparse=True)
    self.encoder = LabelEncoder()
    self.lexicon_classifier = LexiconClassifier()
    self.classifier = LinearSVC(C=0.005)
    # The previous code passed ``trainset`` here, a name that is not
    # defined in this method, while the ``tweet`` argument went unused.
    self.train(tweet)
def __init__(self, tweets=None):
    """Load a cached ML model if one exists, otherwise train and cache one.

    The model file name is built from ``var.model_classifier`` and the
    major Python version (``<model>-model_python<major>.pkl``), so each
    interpreter major version gets its own pickle.

    Args:
        tweets: Sequence of ``(tweet_message, label)`` pairs used only when
            no cached model is found. Defaults to an empty training set.
    """
    if tweets is None:
        tweets = []

    # initialize internal variables
    self.rules_classifier = RulesClassifier()
    self.lexicon_classifier = LexiconClassifier()
    self.ml_classifier = None

    # One expression for both the load and the save path, so the two can
    # never drift apart.
    python_version = sys.version_info[0]
    model_name = (str(var.model_classifier) + '-model_python' +
                  str(python_version) + '.pkl')

    # if the ML model has already been generated, load it from disk
    if os.path.exists(model_name):
        print('Reading the ' + str(var.model_classifier) +
              ' model from model_python' + str(python_version) + '.pkl')
        # NOTE: unpickling executes arbitrary code; only load model files
        # that were produced locally by this program.
        with open(model_name, 'rb') as model_file:
            self.ml_classifier = pickle.load(model_file)

    if self.ml_classifier is None:
        # Preprocess the data and train a new model
        print('Preprocessing the training data')
        tweet_messages = [message for message, _ in tweets]
        tweet_labels = [label for _, label in tweets]

        # preprocess all the tweet messages (per the original comment:
        # tokenization, POS tagging and normalization)
        tweet_tokens = pre_process(tweet_messages)

        # pair every token list with its label; assumes pre_process
        # returns one entry per input message, in order
        trainset = list(zip(tweet_tokens, tweet_labels))

        # initialize the classifier and train it
        classifier = MachineLearningClassifier(trainset)

        # dump the trained model into the pickle file
        print('Saving the trained model at ' + model_name)
        with open(model_name, 'wb') as model_file:
            pickle.dump(classifier, model_file)

        self.ml_classifier = classifier
def __init__(self, trainset=None):
    """Create the encoders, pick the estimator from config, then train.

    The estimator is selected by ``var.model_classifier``; recognised
    values are ``"svm"``, ``"randomForest"``, ``"naive"``, ``"lreg"`` and
    ``"sgd"``.

    Args:
        trainset: Training data forwarded to ``self.train``. Defaults to a
            fresh empty list on every call.

    Raises:
        ValueError: If ``var.model_classifier`` is not a recognised name.
            Previously ``self.classifier`` was silently left unset.
    """
    # A fresh list per call: a shared default list would be handed to
    # ``train`` and could be mutated or retained across instances.
    if trainset is None:
        trainset = []

    print('Loading training modules')
    self.bag_of_words = []
    self.vectorizer = DictVectorizer(dtype=int, sparse=True)
    self.encoder = LabelEncoder()
    self.lexicon_classifier = LexiconClassifier()

    model_name = var.model_classifier
    if model_name == "svm":
        self.classifier = LinearSVC(C=0.005)
    elif model_name == "randomForest":
        self.classifier = RandomForestClassifier()
    elif model_name == "naive":
        self.classifier = GaussianNB()
    elif model_name == "lreg":
        self.classifier = LogisticRegression()
    elif model_name == "sgd":
        # NOTE(review): newer scikit-learn releases replaced ``n_iter``
        # with ``max_iter``; kept as-is to match the version this project
        # was written against -- confirm before upgrading.
        self.classifier = SGDClassifier(penalty='elasticnet',
                                        alpha=0.001,
                                        l1_ratio=0.85,
                                        n_iter=1000)
    else:
        raise ValueError(
            'Unknown model_classifier: ' + repr(model_name))

    self.train(trainset)
def extract_features(self, tweet_tokens):
    """Build the feature dictionary for one tokenized, tagged message.

    Args:
        tweet_tokens: Sequence of ``(token, tag)`` pairs for one message.
            It is iterated several times, so it must not be a one-shot
            iterator.

    Returns:
        dict: Feature name -> value, containing
            * ``has_<ngram>`` (True) for every word 1/2/3-gram or
              ``w1_*_w3`` skip-gram of the message found in
              ``self.bag_of_words``;
            * ``num_<TAG>`` counts for a fixed list of POS tags;
            * ``has_negator`` (only present when a negation word occurs);
            * ``has_char_ngrams`` / ``has_punct_ngrams`` booleans;
            * ``num_all_caps``;
            * ``pos_lexicon`` / ``neg_lexicon`` from the lexicon classifier.
    """
    if len(self.bag_of_words) == 0:
        print('Bag-of-Words empty!')

    unigrams = [word.lower() for word, _ in tweet_tokens]

    # Copy before extending: ``tokens = unigrams`` followed by ``+=`` grew
    # ``unigrams`` in place, so the trigram passes used to run over a list
    # that already contained bigram strings and emitted spurious n-grams.
    tokens = list(unigrams)
    tokens += ['_'.join(pair) for pair in bigrams(unigrams)]
    word_trigrams = list(trigrams(unigrams))
    tokens += ['_'.join(triple) for triple in word_trigrams]
    tokens += [first + '_*_' + third for first, _, third in word_trigrams]
    token_set = set(tokens)

    tweet_tags = [tag for _, tag in tweet_tokens]

    feature_set = {}

    # 1st set of features: bag-of-words
    for token in token_set.intersection(self.bag_of_words):
        feature_set['has_' + token] = True

    # 2nd set of features: the count for each tag type present in the
    # message. The original comment points at the TweetNLP guidelines:
    # http://www.ark.cs.cmu.edu/TweetNLP/annot_guidelines.pdf
    pos_tags = (
        'CC', 'CD', 'DT', 'EX', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
        'NN', 'NNP', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR',
        'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
        'WDT', 'WP', 'WRB',
    )
    for tag in pos_tags:
        feature_set['num_' + tag] = tweet_tags.count(tag)

    # 3rd feature: negation is present?
    # Reuse the classifier held by this instance instead of constructing
    # a new LexiconClassifier for every message.
    negators = set(self.lexicon_classifier.read_negation_words())
    if negators.intersection(token_set):
        feature_set['has_negator'] = True

    # 4th feature: character ngrams (a lowercase letter repeated 3+ times)
    elongated_chars = re.compile(r"([a-z])\1{2,}")
    feature_set['has_char_ngrams'] = any(
        elongated_chars.search(token) for token, _ in tweet_tokens)

    # 5th feature: punctuation ngrams ('!' or '?' repeated 3+ times)
    repeated_punct = re.compile(r"([!\?])\1{2,}")
    feature_set['has_punct_ngrams'] = any(
        repeated_punct.search(token) for token, _ in tweet_tokens)

    # 6th feature: the number of all upper cased words (3+ characters)
    feature_set['num_all_caps'] = sum(
        1 for token, _ in tweet_tokens
        if token.isupper() and len(token) >= 3)

    # 7th and 8th feature: the positive and negative score from the
    # lexicon classifier; the negative score is stored multiplied by -1
    positive_score, negative_score = self.lexicon_classifier.classify(
        tweet_tokens)
    feature_set['pos_lexicon'] = positive_score
    feature_set['neg_lexicon'] = -1 * negative_score

    return feature_set
def __init__(self, trainset=None):
    """Create the rules, lexicon and machine-learning classifiers.

    Args:
        trainset: Training data passed to ``MachineLearningClassifier``.
            Defaults to a fresh empty list on every call.
    """
    # Avoid a shared mutable default: the list is forwarded to another
    # object, which could keep a reference to it or modify it.
    if trainset is None:
        trainset = []

    self.rules_classifier = RulesClassifier()
    self.lexicon_classifier = LexiconClassifier()
    self.ml_classifier = MachineLearningClassifier(trainset)