예제 #1
0
class MachineLearningNLP:
    """Train and evaluate a sentiment classifier on 'pos'/'neg' feature sets.

    The underlying model is NLTK's ``NaiveBayesClassifier``,
    ``MaxentClassifier`` or a ``SklearnClassifier`` wrapping ``LinearSVC``,
    selected by name.  ``feats`` is the callable that turns a token sequence
    into the feature set handed to the classifier.
    """

    def __init__(self, classifier_type='NaiveBayes', feats=word_feats):
        """Select the classifier implementation and the feature extractor.

        :param classifier_type: 'NaiveBayes', 'MaximumEntropy' or 'SVM'.
        :param feats: callable applied to token sequences to build features.
        :raises ValueError: if ``classifier_type`` is not a supported name.
        """
        # Classifier choices follow "Thumbs up? Sentiment Classification
        # using Machine Learning Techniques".
        classifier_list = ['NaiveBayes', 'MaximumEntropy', 'SVM']
        if classifier_type not in classifier_list:
            # Fail here with a clear message instead of printing and then
            # hitting an AttributeError on the unset attribute below.
            raise ValueError(
                "Classifier Type is not implemented: {}".format(
                    classifier_type))
        self.classifier_type = classifier_type

        if classifier_type == 'MaximumEntropy':
            self.classifier = MaxentClassifier
        elif classifier_type == 'SVM':
            self.classifier = SklearnClassifier(LinearSVC(), sparse=False)
        else:
            self.classifier = NaiveBayesClassifier
        self.feats = feats

    def convert_txt(self, file_neg, file_pos):
        """Build labelled feature sets from a negative and a positive source.

        Both inputs are passed through ``word_preprocess`` and ``self.feats``
        is applied to every item of the result.

        :return: ``(negfeats, posfeats)``, lists of ``(features, 'neg')`` and
            ``(features, 'pos')`` pairs respectively.
        """
        negfeats = [(self.feats(item), 'neg')
                    for item in word_preprocess(file_neg)]
        posfeats = [(self.feats(item), 'pos')
                    for item in word_preprocess(file_pos)]
        return (negfeats, posfeats)

    def train(self, train_data, **kwargs):
        """Train on ``train_data`` and keep the trained classifier.

        Extra keyword arguments are forwarded to the classifier's ``train``.
        """
        self.classifier = self.classifier.train(train_data, **kwargs)

    def predict(self, test_data):
        """Return the predicted label for each ``(features, label)`` pair."""
        return [self.classifier.classify(feats) for feats, label in test_data]

    def annotate(self, text):
        """Classify a single string, tokenised by whitespace."""
        assert isinstance(text, str)
        text_Encoded = self.feats(text.split())
        return self.classifier.classify(text_Encoded)

    def performance(self, test_data):
        """Print accuracy plus per-class precision and recall for test_data.

        ``test_data`` is a sequence of ``(features, label)`` pairs; every
        label other than 'pos' is counted as negative.
        """

        def fmt(score):
            # nltk.precision / nltk.recall return None when the score is
            # undefined (empty test or reference set); '%.3f' % None raises.
            return 'n/a' if score is None else '%.3f' % score

        prediction = self.predict(test_data)
        all_indices = set(range(len(prediction)))
        pos_loc = {i for i, label in enumerate(prediction) if label == 'pos'}
        neg_loc = all_indices - pos_loc
        pos_ref = {i for i, (_, label) in enumerate(test_data)
                   if label == 'pos'}
        neg_ref = all_indices - pos_ref

        print('===============================\n')
        print('Model Summary:\n')
        print(self.classifier_type + ' with features ' + self.feats.__name__ +
              '\n')
        print('Overall Accuracy: %.3f\n' %
              (nltk.classify.util.accuracy(self.classifier, test_data)))
        print('Positive Precision: ' + fmt(nltk.precision(pos_ref, pos_loc)) +
              '\n')
        print('Positive Recall: ' + fmt(nltk.recall(pos_ref, pos_loc)) + '\n')
        print('Negative Precision: ' + fmt(nltk.precision(neg_ref, neg_loc)) +
              '\n')
        print('Negative Recall: ' + fmt(nltk.recall(neg_ref, neg_loc)) + '\n')
예제 #2
0
def funcn():
    """Train three classifiers on ``amazon_data.txt`` and classify a sample.

    Each line of the file is split on tabs; a second field of ``'0'`` marks
    the line as negative, anything else as positive.  Words of three or more
    characters are lower-cased and used as bag-of-words features.  The
    predictions of NLTK Naive Bayes, scikit-learn BernoulliNB and
    scikit-learn SVC for the sentence 'it is not bad' are printed.
    """
    pos_tweets = []
    neg_tweets = []
    # Context manager so the file is closed even if a malformed line raises.
    with open("amazon_data.txt") as f:
        for line in f:
            words = line.split("\t")
            if words[1] in ('0\n', '0'):
                neg_tweets.append(words)
            else:
                pos_tweets.append(words)

    # Unpacking assumes exactly two tab-separated fields per line.
    tweets = []
    for (words, sentiment) in pos_tweets + neg_tweets:
        words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
        tweets.append((words_filtered, sentiment))

    def get_words_in_tweets(tweets):
        """Flatten the token lists of all tweets into one list."""
        all_words = []
        for (words, sentiment) in tweets:
            all_words.extend(words)
        return all_words

    def get_word_features(wordlist):
        """Return the distinct words of ``wordlist`` (FreqDist keys)."""
        return nltk.FreqDist(wordlist).keys()

    word_features = get_word_features(get_words_in_tweets(tweets))

    def extract_features(document):
        """Map every known word to whether it occurs in ``document``."""
        document_words = set(document)
        return {'contains(%s)' % word: (word in document_words)
                for word in word_features}

    training_set = nltk.classify.apply_features(extract_features, tweets)
    classifie = nltk.NaiveBayesClassifier.train(training_set)

    classifier = SklearnClassifier(BernoulliNB()).train(training_set)

    tweet = 'it is not bad'
    print(classifie.classify(extract_features(tweet.split())))
    print(classifier.classify(extract_features(tweet.split())))

    classif = SklearnClassifier(SVC(), sparse=False).train(training_set)
    print(classif.classify(extract_features(tweet.split())))
예제 #3
0
def searchSGDClassifier_classifier(title, train_departments):
    """Train an SGD classifier and classify ``title`` with it.

    :param title: input handed to ``word_feats`` to build the feature set
        that gets classified.
    :param train_departments: labelled feature sets used for training; the
        slice from index 1000 onward is also used for the accuracy figure.
    :return: list of ``[label, probability of that label, accuracy,
        seconds spent classifying, seconds spent training]``.
    """
    training_started = time.time()
    model = SklearnClassifier(SGDClassifier(loss='log'))
    model.train(train_departments)
    training_seconds = time.time() - training_started

    title_features = word_feats(title)

    classify_started = time.time()
    label = model.classify(title_features)
    classify_seconds = time.time() - classify_started

    distribution = model.prob_classify(title_features)
    print(distribution.prob(label))

    label_probability = distribution.prob(label)
    holdout_accuracy = accuracy(model, train_departments[1000:])
    return [
        label,
        label_probability,
        holdout_accuracy,
        classify_seconds,
        training_seconds,
    ]
예제 #4
0
def searchLinearSVC(title, train_departments):
    """Train a linear-kernel SVC with probability estimates and classify.

    :param title: input handed to ``word_feats`` to build the feature set
        that gets classified.
    :param train_departments: labelled feature sets used for training; the
        slice from index 1000 onward is also used for the accuracy figure.
    :return: list of ``[label, probability of that label, accuracy,
        seconds spent classifying, seconds spent training]``.
    """
    training_started = time.time()
    # SVC(kernel='linear', probability=True) is used here rather than
    # LinearSVC.
    model = SklearnClassifier(SVC(kernel='linear', probability=True))
    model.train(train_departments)
    training_seconds = time.time() - training_started

    title_features = word_feats(title)

    classify_started = time.time()
    label = model.classify(title_features)
    classify_seconds = time.time() - classify_started

    distribution = model.prob_classify(title_features)
    print(distribution.prob(label))

    label_probability = distribution.prob(label)
    holdout_accuracy = accuracy(model, train_departments[1000:])
    return [
        label,
        label_probability,
        holdout_accuracy,
        classify_seconds,
        training_seconds,
    ]
예제 #5
0
def searchNuSVC_classifier(title, train_departments):
    """Train a Nu-Support Vector classifier and classify ``title``.

    :param title: input handed to ``word_feats`` to build the feature set
        that gets classified.
    :param train_departments: labelled feature sets used for training.
    :return: the label predicted for ``title``.
    """
    model = SklearnClassifier(NuSVC())
    model.train(train_departments)
    title_features = word_feats(title)
    label = model.classify(title_features)
    return label
예제 #6
0
def predict_nltk(in_text='', n=2):
    ''' Classify the language of ``in_text`` and print the result.

        A training set is built from the module-level ``text`` mapping
        (label -> sample text) via ``text_features``; a scikit-learn
        MultinomialNB wrapped in NLTK's SklearnClassifier is trained on it
        and used to classify the features of ``in_text``.

        in_text -- text to classify.
        n       -- forwarded to ``text_features`` for ``in_text`` only.
    '''
    # NOTE(review): the training features are built without passing ``n``,
    # so they use text_features' own default -- confirm that it matches.
    trainingset = [(text_features(text[label]), label) for label in text]
    classifier = SklearnClassifier(MultinomialNB()).train(trainingset)
    in_features = text_features(in_text, n=n)
    lang = classifier.classify(in_features)
    # Single-argument print call: same output as the Python 2 statement
    # ``print 'Language:', lang`` and also valid on Python 3.
    print('Language: %s' % (lang,))
예제 #7
0
def predict_nltk(in_text='', n=2):
    ''' Classify the language of ``in_text`` and print the result.

        A training set is built from the module-level ``text`` mapping
        (label -> sample text) via ``text_features``; a scikit-learn
        MultinomialNB wrapped in NLTK's SklearnClassifier is trained on it
        and used to classify the features of ``in_text``.

        in_text -- text to classify.
        n       -- forwarded to ``text_features`` for ``in_text`` only.
    '''
    # NOTE(review): the training features are built without passing ``n``,
    # so they use text_features' own default -- confirm that it matches.
    trainingset = [(text_features(text[label]), label) for label in text]
    classifier = SklearnClassifier(MultinomialNB()).train(trainingset)
    in_features = text_features(in_text, n=n)
    lang = classifier.classify(in_features)
    # Single-argument print call: same output as the Python 2 statement
    # ``print 'Language:', lang`` and also valid on Python 3.
    print('Language: %s' % (lang,))
예제 #8
0
class LinearSVC2Model(SKLearnModel):
    """Tweet classifier built on a TF-IDF + LinearSVC pipeline.

    Tweets are tokenised, turned into token-frequency distributions and
    classified into the emoji label carried by the training tweets.
    """

    def __init__(self, balanced=False, C=1.0, dual=True, tol=1e-4, max_iter=1000, loss="squared_hinge") -> None:
        # Tweet tokenizer configured to ignore case, reduce lengthened
        # words and strip user handles.
        tweet_tokenizer = TweetTokenizer(preserve_case=False,
                                         reduce_len=True,
                                         strip_handles=True)
        self.tokenizer = tweet_tokenizer.tokenize

        # "balanced" class weighting is opt-in; otherwise no weighting.
        class_weight = "balanced" if balanced else None  # type: Optional[str]

        # LinearSVC rather than SVC with a linear kernel: per the original
        # author it is backed by liblinear instead of libsvm and runs much
        # faster.
        svm = LinearSVC(class_weight=class_weight, C=C, dual=dual, tol=tol,
                        max_iter=max_iter, loss=loss)
        steps = [('tfidf', TfidfTransformer()), ('linearsvc', svm)]
        self.classif = SklearnClassifier(Pipeline(steps))

    @staticmethod
    def get_extra_configs():
        """Return the parent's extra configs plus this model's own options."""
        own_configs = [
            {"name": "balanced", "default": False},
            {"name": "C", "default": 1.0},
            {"name": "dual", "default": True},
            {"name": "tol", "default": 1e-4},
            {"name": "max_iter", "default": 1000},
            {"name": "loss", "default": "squared_hinge"},
        ]
        inherited = super(LinearSVC2Model, LinearSVC2Model).get_extra_configs()
        return inherited + own_configs

    def train(self, tweets: List[Tweet]) -> None:
        """Train the pipeline on ``(token frequencies, emoji)`` pairs."""
        # Lazily yields one labelled frequency distribution per tweet.
        corpus = ((FreqDist(self.tokenizer(tweet.text)), tweet.emoji)
                  for tweet in tweets)
        self.classif.train(corpus)

    def predict(self, text):
        """Return the label predicted for the raw tweet ``text``."""
        token_counts = FreqDist(self.tokenizer(text))
        return self.classif.classify(token_counts)
예제 #9
0
  def leaveKOutValidation(k=1):
    """Run leave-k-out cross-validation over ``feats`` and print the accuracy.

    ``feats`` (from the enclosing scope) is cut into consecutive clusters of
    ``k`` items.  For each cluster a TF-IDF + MultinomialNB pipeline is
    trained on every item not equal to a member of the cluster, and each
    held-out item counts as correct when the sign test ``> 0`` agrees between
    its label and the prediction.
    """
    print("Performing leave-"+str(k)+"-out cross-validation")
    gamesClusters = [feats[int(i*k):int((i+1)*k)] for i in range(int(len(feats)/k))]
    correct = 0
    evaluated = 0
    for games in gamesClusters:
      training = [x for x in feats if x not in games]

      pipeline = Pipeline([('tfidf', TfidfTransformer()),
        #('chi2', SelectKBest(chi2, k=250)),  
        ('nb', MultinomialNB())])

      classifier = SklearnClassifier(pipeline).train(training)

      for game in games:
        classification = classifier.classify(game[0])
        correct += int((game[1] > 0) == (classification > 0))
        evaluated += 1
    # Divide by the number of items actually evaluated: when len(feats) is
    # not a multiple of k the remainder is never tested, and dividing by
    # len(feats) would count those items as misclassified.
    accuracy = correct / float(evaluated) if evaluated else 0.0
    print("With leave-"+str(k)+"-out cross-validation, the algorithm is "+str(round(accuracy*100,4))+"% accurate")
def evaluate_classifier(featx,collocationFunc):
    """Train BernoulliNB on movie_reviews and return its evaluation metrics.

    Every review is converted with ``featx(words, collocationFunc)``.  The
    first three quarters of each polarity are used for training, the rest
    for testing.

    :return: dict with keys 'accuracy', 'posPrec', 'posRecall',
        'posF_Score', 'negPrec', 'negRecall' and 'negF_Score'.
    """

    def labelled_reviews(polarity):
        # One (features, polarity) pair per review file of that polarity.
        return [(featx(movie_reviews.words(fileids=[fileid]), collocationFunc),
                 polarity)
                for fileid in movie_reviews.fileids(polarity)]

    negfeats = labelled_reviews('neg')
    posfeats = labelled_reviews('pos')

    # 75/25 train/test split within each polarity.
    neg_split = int(len(negfeats) * 3 / 4)
    pos_split = int(len(posfeats) * 3 / 4)
    trainfeats = negfeats[:neg_split] + posfeats[:pos_split]
    testfeats = negfeats[neg_split:] + posfeats[pos_split:]

    classifier = SklearnClassifier(BernoulliNB()).train(trainfeats)

    # Index sets per label: reference labels versus predicted labels.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for index, (features, gold_label) in enumerate(testfeats):
        refsets[gold_label].add(index)
        predicted_label = classifier.classify(features)
        testsets[predicted_label].add(index)

    print(classifier)
    evaluationMetrics = {
        'accuracy': nltk.classify.util.accuracy(classifier, testfeats),
    }
    for polarity in ('pos', 'neg'):
        reference = refsets[polarity]
        predicted = testsets[polarity]
        evaluationMetrics[polarity + 'Prec'] = nltk.precision(reference, predicted)
        evaluationMetrics[polarity + 'Recall'] = nltk.recall(reference, predicted)
        evaluationMetrics[polarity + 'F_Score'] = nltk.f_measure(reference, predicted)
    return evaluationMetrics
예제 #11
0
파일: test3.py 프로젝트: jshenaop/soleka
def evaluate_classifier(featx):
    """Train Naive Bayes, MaxEnt and SVM classifiers on a 75/25 split.

    Features come from ``featx`` applied to the items of
    ``word_split(negdata)`` and ``word_split(posdata)``.  For every
    classifier the reference/predicted index sets and the accuracy on the
    test split are computed.

    NOTE(review): in the visible code the computed values are neither
    printed nor returned.
    """
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    # Floor division: true division yields a float on Python 3, which is
    # not a valid slice index.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    # using 3 classifiers
    classifier_list = ['nb', 'maxent', 'svm']

    for cl in classifier_list:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats,
                                                'GIS',
                                                trace=0,
                                                encoding=None,
                                                labels=None,
                                                sparse=True,
                                                gaussian_prior_sigma=0,
                                                max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        else:
            classifierName = 'Naive Bayes'
            print(trainfeats)
            classifier = NaiveBayesClassifier.train(trainfeats)

        # Index sets per label: reference labels versus predicted labels.
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        accuracy = nltk.classify.util.accuracy(classifier, testfeats)
예제 #12
0
 def ml_sentiment(self, text):
     ''' Classify the sentiment of ``text`` with a freshly trained model.

         A MultinomialNB classifier (via NLTK's SklearnClassifier) is
         trained on ``self.sentiment_featrues(tweet)`` for every tweet in
         ``self.data``.  ``text`` is tagged with ``self.ml_tag``, converted
         to features the same way and classified.  The tweet is printed and
         shown, and the predicted sentiment is returned.
     '''
     trainingset = [self.sentiment_featrues(tweet) for tweet in self.data]
     classifier = SklearnClassifier(MultinomialNB()).train(trainingset)
     tokenz = self.ml_tag(text, print_tags=False)
     tweet = {
         'tokens': tokenz,
         'sentiment': ''
     }
     # sentiment_featrues yields a (features, label) pair; only the
     # features are needed to classify.
     tokenz_features = self.sentiment_featrues(tweet)
     sentiment = classifier.classify(tokenz_features[0])
     tweet['sentiment'] = sentiment
     # Single-argument print call: same output as the Python 2 statement
     # ``print '\nTweet:', text`` and also valid on Python 3.
     print('\nTweet: %s' % (text,))
     self.show_tweet(tweet)
     return sentiment
예제 #13
0
class SVCModel(SKLearnModel):
    """Tweet classifier built on a TF-IDF + SVC pipeline.

    Tweets are tokenised, turned into token-frequency distributions and
    classified into the emoji label carried by the training tweets.
    """

    def __init__(self, kernel: str = "") -> None:
        # NOTE(review): the default kernel is an empty string and is passed
        # to SVC unchanged -- confirm callers always supply a kernel name.

        # Tweet tokenizer configured to ignore case, reduce lengthened
        # words and strip user handles.
        tweet_tokenizer = TweetTokenizer(preserve_case=False,
                                         reduce_len=True,
                                         strip_handles=True)
        self.tokenizer = tweet_tokenizer.tokenize

        # The pipeline step is named after the kernel, e.g. "<kernel>svc".
        step_name = '{}svc'.format(kernel)
        steps = [('tfidf', TfidfTransformer()),
                 (step_name, SVC(kernel=kernel))]
        self.classif = SklearnClassifier(Pipeline(steps))

    def train(self, tweets: List[Tweet]) -> None:
        """Train the pipeline on ``(token frequencies, emoji)`` pairs."""
        # Lazily yields one labelled frequency distribution per tweet.
        corpus = ((FreqDist(self.tokenizer(tweet.text)), tweet.emoji)
                  for tweet in tweets)
        self.classif.train(corpus)

    def predict(self, text):
        """Return the label predicted for the raw tweet ``text``."""
        token_counts = FreqDist(self.tokenizer(text))
        return self.classif.classify(token_counts)

    def tokenize(self, text):
        """Tokenise ``text`` with the model's tweet tokenizer."""
        tokens = self.tokenizer(text)
        return tokens
예제 #14
0
# Single-fold evaluation of a LinearSVC sentiment classifier.
# The leading part of each class (up to its cutoff) is used for training,
# the remainder for testing.
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

########################################################################################
########################################################################################

# Index sets per label: refsets holds the true labels, testsets the predictions.
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

# NOTE(review): this rebinds the builtin ``str`` at module level; any later
# call to str(...) in this module would fail -- consider renaming.
str = 'SINGLE FOLD RESULT ' + '(' + 'linear-svc' + ')'
# Train with LinearSVC.
classifier = SklearnClassifier(LinearSVC())
classifier.train(trainfeats)
for i, (feats, label) in enumerate(testfeats):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)
# Accuracy is scaled to a percentage; the other metrics stay as fractions.
accuracy = nltk.classify.util.accuracy(classifier, testfeats) * 100
pos_precision = nltk.precision(refsets['pos'], testsets['pos'])

# NOTE(review): precision is called as nltk.precision above but the calls
# below use bare names -- presumably imported from nltk.metrics; confirm.
pos_recall = recall(refsets['pos'], testsets['pos'])
pos_fmeasure = f_measure(refsets['pos'], testsets['pos'])
neg_precision = precision(refsets['neg'], testsets['neg'])
neg_recall = recall(refsets['neg'], testsets['neg'])
neg_fmeasure = f_measure(refsets['neg'], testsets['neg'])
print('')
print('---------------------------------------')
print(str)
print('---------------------------------------')
print('accuracy: ', accuracy, '%')
# Unweighted mean of the positive and negative class precision.
print('precision', (pos_precision + neg_precision) / 2)

# Train a polynomial-kernel SVC (dense feature matrix) on train_set and
# report its accuracy on test_set.
nonlinear_svm = SklearnClassifier(SVC(gamma='scale', kernel='poly', coef0 = 5.0, degree = 5, C = 5.0, shrinking=True, probability=False, tol=1e-3), sparse=False).train(train_set)
print("Accuracy - Nonlinear SVM: ")
print(nltk.classify.accuracy(nonlinear_svm, test_set))


# Train a random forest with explicit hyperparameters on the same train_set
# and report its accuracy on test_set.
random_forest = SklearnClassifier(RandomForestClassifier(n_estimators = 100,
                                                         criterion = 'gini',
                                                         max_depth = 5,
                                                         min_samples_split = 2,
                                                         min_samples_leaf = 1,
                                                         min_weight_fraction_leaf = 0.0,
                                                         max_features = 25,
                                                         max_leaf_nodes = 20,
                                                         min_impurity_decrease = 0.0,
                                                         bootstrap = True,
                                                         oob_score = False,
                                                         random_state = None ),
                                  sparse = False)
random_forest.train(train_set)
print("Accuracy - Random Forest Classifier: ")
print(nltk.classify.accuracy(random_forest, test_set))


# Classify one sample tweet with the linear and the nonlinear SVM.
test_tweet = "75% of illegal Aliens commit Felons such as ID, SSN and Welfare Theft Illegal #Immigration is not a Victimless Crime !"
# print(naive_bayes.classify(extract_features_of_tweet(test_tweet, raw=True)))
# print(maxent.classify(extract_features_of_tweet(test_tweet, raw=True)))
# NOTE(review): the linear SVM is called with raw=False while the other
# calls use raw=True -- confirm this difference is intentional.
# linear_svm_classifier is not defined in this part of the file.
print(linear_svm_classifier.classify(extract_features_of_tweet(test_tweet, raw=False)))
print(nonlinear_svm.classify(extract_features_of_tweet(test_tweet, raw=True)))
예제 #16
0
def evaluate_classifier(featx):
    """Train three classifiers and run them over tweets read from a file.

    Training data is the first three quarters of the features built from
    ``word_split(negdata)`` and ``word_split(posdata)``.  The test data is
    the ``text`` field of every JSON line in ``20161019_202620.txt``; every
    tweet is labelled 'neg', so the reported accuracy is the fraction of
    tweets classified as 'neg'.

    For Naive Bayes, MaxEnt and SVM in turn, the index set of the last
    observed label and the accuracy are printed.
    """
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]
    # Floor division keeps the cutoffs usable as slice indices on both
    # Python 2 and Python 3.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]

    print('Reading Tweets\n')
    tweets_data_path = '20161019_202620.txt'
    tweets_data = []
    # Context manager so the file is closed once reading is done.
    with open(tweets_data_path, "r") as tweets_file:
        for line in tweets_file:
            try:
                tweets_data.append(json.loads(line))
            except ValueError:
                # Skip lines that are not valid JSON (best effort).
                continue

    tweets = pd.DataFrame()
    tweets['text'] = [tweet.get('text', '') for tweet in tweets_data]

    tdata = tweets['text']
    # Every tweet gets the placeholder label 'neg'.
    negfeats = [(featx(f), 'neg') for f in word_split(tdata)]
    testfeats = negfeats

    print(np.shape(testfeats))

    # using 3 classifiers
    classifier_list = ['nb', 'maxent', 'svm']

    for cl in classifier_list:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0, encoding=None, labels=None, gaussian_prior_sigma=0, max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        else:
            classifierName = 'Naive Bayes'
            classifier = NaiveBayesClassifier.train(trainfeats)

        # Index sets per label: reference labels versus predicted labels.
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        # Indices that received the same label as the last classified tweet.
        # NOTE(review): ``observed`` is undefined when testfeats is empty.
        print(testsets[observed])

        accuracy = nltk.classify.util.accuracy(classifier, testfeats)

        # Single-argument print calls produce the same output under the
        # Python 2 print statement and the Python 3 print function.
        print('')
        print('---------------------------------------')
        print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy: %s' % (accuracy,))
예제 #17
0
def _svm_split_scores(trainfeats, testfeats):
    """Train a LinearSVC on ``trainfeats`` and score it on ``testfeats``.

    :return: dict with keys 'accuracy' and, for each of 'pos' and 'neg',
        '<label>_precision', '<label>_recall' and '<label>_fmeasure'.
    """
    classifier = SklearnClassifier(LinearSVC(), sparse=False).train(trainfeats)

    # Index sets per label: reference labels versus predicted labels.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for index, (feats, label) in enumerate(testfeats):
        refsets[label].add(index)
        testsets[classifier.classify(feats)].add(index)

    scores = {'accuracy': nltk.classify.util.accuracy(classifier, testfeats)}
    for polarity in ('pos', 'neg'):
        reference = refsets[polarity]
        predicted = testsets[polarity]
        scores[polarity + '_precision'] = nltk.precision(reference, predicted)
        scores[polarity + '_recall'] = nltk.recall(reference, predicted)
        scores[polarity + '_fmeasure'] = nltk.f_measure(reference, predicted)
    return scores


def evaluate_classifier(featx):
    """Evaluate a LinearSVC sentiment classifier and print the results.

    Features come from ``featx`` applied to the items of
    ``word_split(negdata)`` and ``word_split(posdata)``.  Two reports are
    printed: a single 75/25 split, then a 5-fold cross-validation over the
    shuffled data.  Precision, recall and f-measure are reported as the
    unweighted mean of the 'pos' and 'neg' values.
    """
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifierName = 'SVM'
    single = _svm_split_scores(trainfeats, testfeats)

    print('')
    print('---------------------------------------')
    print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
    print('---------------------------------------')
    print('accuracy:', single['accuracy'])
    print('precision',
          (single['pos_precision'] + single['neg_precision']) / 2)
    print('recall', (single['pos_recall'] + single['neg_recall']) / 2)
    print('f-measure', (single['pos_fmeasure'] + single['neg_fmeasure']) / 2)

    print('')

    ## CROSS VALIDATION

    allfeats = negfeats + posfeats

    # Shuffle first: otherwise a test chunk might contain only negative or
    # only positive examples.
    random.shuffle(allfeats)
    n = 5  # 5-fold cross-validation

    subset_size = int(len(allfeats) / n)
    folds = []
    for fold in range(n):
        start = fold * subset_size
        stop = start + subset_size
        testing_this_round = allfeats[start:stop]
        training_this_round = allfeats[:start] + allfeats[stop:]
        folds.append(_svm_split_scores(training_this_round,
                                       testing_this_round))

    def mean(key):
        # Average of one metric over all folds.
        return sum(scores[key] for scores in folds) / n

    print('---------------------------------------')
    print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')')
    print('---------------------------------------')
    print('accuracy:', mean('accuracy'))
    print('precision', (mean('pos_precision') + mean('neg_precision')) / 2)
    print('recall', (mean('pos_recall') + mean('neg_recall')) / 2)
    print('f-measure', (mean('pos_fmeasure') + mean('neg_fmeasure')) / 2)
    print('')
예제 #18
0
class RandomForestCascadeClassifier():
    """Random-forest classifier over tweet cascades.

    Features combine the stemmed tokens of the root tweet (English tweets
    only) with optional numeric features computed from the root tweet and
    the first ``k + 1`` cascade nodes.  The trained classifier is cached in
    ``<ClassName>.pickle`` in the current working directory.
    """

    def __init__(self,
                 dataset,
                 k,
                 user_followers=True,
                 users_reachable=True,
                 average_time=True,
                 time_to_k=True):
        """Store the configuration and train (or load) the classifier.

        :param dataset: iterable of cascades used for training.
        :param k: at most ``k + 1`` cascade nodes are used per cascade.
        :param user_followers: include the root user's follower count.
        :param users_reachable: include the summed followee counts.
        :param average_time: include the average time between nodes.
        :param time_to_k: include the time from the first to the k-th node.
        """
        self.k = k
        self._twtokenize = TweetTokenizer(strip_handles=True)
        self._dataset = dataset
        self._user_followers = user_followers
        self._users_reachable = users_reachable
        self._average_time = average_time
        self._time_to_k = time_to_k
        self._stopwords = stopwords.words('english')
        self._stemmer = PorterStemmer()
        # Every feature value ever computed is recorded in these lists;
        # classify_cascades prints summary statistics for four of them.
        self._f_count = []
        self._r_count = []
        self._rt_count = []
        self._avg = []
        self._time = []
        self._train()

    def _tokenize(self, tweet_text):
        """Return the stemmed, non-stopword tokens of ``tweet_text``."""
        return [
            self._stemmer.stem(token)
            for token in self._twtokenize.tokenize(tweet_text)
            if token not in self._stopwords
        ]

    def _sorted_cascade_nodes(self, cascade):
        """Return ``(int(key), node)`` pairs sorted by the integer key."""
        nodes = cascade['cascade']
        cascade_nodes = [(int(key), node) for key, node in nodes.items()]
        return sorted(cascade_nodes, key=lambda pair: pair[0])

    def _tweet_length_feature(self, cascade):
        """Return the character length of the root tweet's text."""
        return len(cascade['root_tweet']['text'])

    def _user_followers_feature(self, cascade):
        """Return (and record) the root user's follower count."""
        followers = cascade['root_tweet']['user']['followers_count']
        self._f_count.append(followers)
        return followers

    def _users_reachable_feature(self, nodes):
        """Return (and record) the summed followee counts of the nodes used."""
        # zip with range(k + 1) limits the sum to the first k + 1 nodes.
        reachable = 0
        for _, node in zip(range(self.k + 1), nodes):
            reachable += node[1]['user_followees_count']
        self._r_count.append(reachable)
        return reachable

    def _average_time_feature(self, nodes):
        """Return (and record) the average timestamp gap, divided by 1000.

        The summed gaps between consecutive nodes are divided by the number
        of timestamps, not by the number of gaps.
        """
        timestamp = [
            int(node[1]['created_at'])
            for _, node in zip(range(self.k + 1), nodes)
        ]
        average = (sum(numpy.diff(timestamp)) / float(len(timestamp))) / 1000
        self._avg.append(average)
        return average

    def _users_retweet_feature(self, cascade):
        """Return (and record) the root tweet's retweet count."""
        retweets = cascade['root_tweet']['retweet_count']
        self._rt_count.append(retweets)
        return retweets

    def _time_to_k_feature(self, nodes):
        """Return (and record) last-used minus first timestamp, over 1000."""
        first = int(nodes[0][1]['created_at'])
        # Last of the (at most) k + 1 nodes that are taken into account.
        considered = [node for _, node in zip(range(self.k + 1), nodes)]
        kth = int(considered[-1][1]['created_at'])
        diff = (kth - first) / 1000
        self._time.append(diff)
        return diff

    def _extract_features(self, cascade):
        """Build the feature dict for one cascade.

        Token features are only produced when the root tweet's language is
        'en'; numeric features are added according to the constructor flags.
        """
        if cascade['root_tweet']['lang'] == 'en':
            tweet_tokens = self._tokenize(cascade['root_tweet']['text'])
            features = {
                "contains({0})".format(token): True
                for token in tweet_tokens
            }
        else:
            features = {}

        features['tweet_length'] = self._tweet_length_feature(cascade)
        # features['rtweet'] = self._users_retweet_feature(cascade)

        if self._user_followers:
            features["user_followers"] = self._user_followers_feature(cascade)

        cascade_nodes = self._sorted_cascade_nodes(cascade)

        if self._users_reachable:
            features['reachable'] = self._users_reachable_feature(
                cascade_nodes)
        if self._average_time:
            features['average'] = self._average_time_feature(cascade_nodes)
        if self._time_to_k:
            features['timetok'] = self._time_to_k_feature(cascade_nodes)

        return features

    def _train(self):
        """Load the cached classifier, or train one and cache it."""
        pickle_filename = "{0}.pickle".format(self.__class__.__name__)
        if os.path.isfile(pickle_filename):
            # NOTE(security): unpickling can execute arbitrary code; this is
            # only safe while the pickle file comes from a trusted source.
            with open(pickle_filename, "rb") as classifier_f:
                self._classifier = pickle.load(classifier_f)
            return

        train_set = [(self._extract_features(cascade), cascade['label'])
                     for cascade in self._dataset]
        pipeline = Pipeline([('tfidf', TfidfTransformer()),
                             ('chi2', SelectKBest(chi2, k=1000)),
                             ('rf',
                              RandomForestClassifier(n_estimators=1000))])
        self._classifier = SklearnClassifier(pipeline,
                                             sparse=False).train(train_set)

        with open(pickle_filename, "wb") as save_classifier:
            pickle.dump(self._classifier, save_classifier)

    def classify(self, cascade):
        """Return the predicted label for ``cascade``."""
        features = self._extract_features(cascade)
        return self._classifier.classify(features)

    def classify_prob(self, cascade):
        """Return the probabilities of the ``True`` and ``False`` labels."""
        features = self._extract_features(cascade)
        result = self._classifier.prob_classify(features)
        return {"positive": result.prob(True), "negative": result.prob(False)}

    def _metrics(self, results):
        """Print a classification report for the collected results."""
        print(
            metrics.classification_report(results['actual'],
                                          results['prediction']))

    def classify_cascades(self, test_dataset):
        """Classify every cascade, then print a report and feature stats."""
        results = {"prediction": [], "actual": []}

        for cascade in test_dataset:
            results["prediction"].append(self.classify(cascade))
            results["actual"].append(cascade['label'])

        self._metrics(results)

        # Same summary line for followers, reachable users, average time and
        # time-to-k, in that order.
        for values in (self._f_count, self._r_count, self._avg, self._time):
            print("Average: {0}, Median: {1}, Std: {2}".format(
                numpy.average(values), numpy.median(values),
                numpy.std(values)))

    def classify_cascades_prob_export(self, test_dataset):
        """Write the class probabilities of every cascade to a JSON file.

        The file is ``dataset/<ClassName>_results.json`` and maps each
        cascade's ``url`` to its ``classify_prob`` result.
        """
        export = "dataset/" + self.__class__.__name__ + "_results.json"
        results = {}

        for cascade in test_dataset:
            results[cascade['url']] = self.classify_prob(cascade)

        # Context manager so the output is flushed and the handle closed.
        with open(export, 'w') as export_file:
            export_file.write(json.dumps(results))
예제 #19
0
    observed = logit_classifier.classify(feats)
    testset[observed].add(i)
# NLTK's recall() takes (reference, test); passing the predicted set first
# would report precision instead of recall.
print("UnigramsLogit Recall")
print('Bullying recall:', recall(refset['Bullying'], testset['Bullying']))
print("")

# In[14]:

#Support Vector Machine for Unigrams
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
SVM_classifier = SklearnClassifier(SVC(), sparse=False).train(train_set)

# Drop the previous classifier's predictions so that the SVM is scored on
# its own output only.
for _predicted_indices in testset.values():
    _predicted_indices.clear()

for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = SVM_classifier.classify(feats)
    testset[observed].add(i)

print("UnigramSVM Recall")
print('Bullying recall:', recall(refset['Bullying'], testset['Bullying']))

# In[15]:

#Same thing with Bigrams
from nltk import bigrams, trigrams
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

# In[16]:

# Pair every tweet with its label.
combined = zip(Tweet, Labels)
예제 #20
0
def evaluate_classifier(data):
    """Evaluate Naive Bayes and SVM classifiers on labelled feature sets.

    ``data`` is split 70/30 with ``train_test_split`` (``random_state=0``).
    For each classifier the function prints accuracy and the averaged
    positive/negative precision, recall and F-measure, first for the
    held-out 30% and then averaged over a 5-fold cross-validation of the
    training portion.

    Trained classifiers are cached as pickles in the ``pkl`` directory
    (which must already exist): ``<name>.pkl`` for the single-fold model
    and ``<name>_cv<fold>.pkl`` for each cross-validation fold. A cached
    pickle is reused without checking that it was trained on the same data.

    Args:
        data: ``(featureset, label)`` pairs with the labels ``'positive'``
            and ``'negative'``. The cross-validation concatenates slices
            of the training split with ``+``, so this is presumably a list.

    Returns:
        None. All results are printed.
    """
    trainfeats, testfeats = train_test_split(data, test_size=0.3, random_state=0)

    classifier_list = ['nb', 'svm']
    classifier_dict = {'nb': 'Naive Bayes', 'svm': 'SVM'}

    def load_or_train(cl, pkl_path, feats):
        """Return the classifier cached at pkl_path, training it if absent."""
        if os.path.exists('./%s' % pkl_path):
            # NOTE: unpickling can execute arbitrary code; only load cache
            # files written by this program.
            with open(pkl_path, 'rb') as cache:
                return pickle.load(cache)
        if cl == 'svm':
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(feats)
        else:
            classifier = NaiveBayesClassifier.train(feats)
        with open(pkl_path, 'wb') as cache:
            pickle.dump(classifier, cache)
        return classifier

    def score(classifier, feats):
        """Return accuracy and per-class precision/recall/F for feats."""
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for idx, (featureset, label) in enumerate(feats):
            refsets[label].add(idx)
            testsets[classifier.classify(featureset)].add(idx)

        def value(metric, label):
            # nltk returns None when a metric is undefined (empty set);
            # count that as 0.0 so the averages below cannot fail.
            result = metric(refsets[label], testsets[label])
            return 0.0 if result is None else result

        return {
            'accuracy': nltk.classify.util.accuracy(classifier, feats),
            'pos_precision': value(precision, 'positive'),
            'pos_recall': value(recall, 'positive'),
            'pos_fmeasure': value(f_measure, 'positive'),
            'neg_precision': value(precision, 'negative'),
            'neg_recall': value(recall, 'negative'),
            'neg_fmeasure': value(f_measure, 'negative'),
        }

    def print_report(title, cl, runs):
        """Print the metrics of one or more runs, averaged over the runs."""
        def mean(key):
            return sum(run[key] for run in runs) / len(runs)

        print('---------------------------------------')
        print(title + '(' + classifier_dict[cl] + ')')
        print('---------------------------------------')
        print('accuracy:', mean('accuracy'))
        print('precision', (mean('pos_precision') + mean('neg_precision')) / 2)
        print('recall', (mean('pos_recall') + mean('neg_recall')) / 2)
        print('f-measure', (mean('pos_fmeasure') + mean('neg_fmeasure')) / 2)

    # Single 70/30 split.
    for cl in classifier_list:
        classifier = load_or_train(cl, os.path.join('pkl', cl + ".pkl"),
                                   trainfeats)
        print('')
        print_report('SINGLE FOLD RESULT ', cl, [score(classifier, testfeats)])

    print('')

    n = 5  # 5-fold cross-validation
    subset_size = len(trainfeats) // n

    for cl in classifier_list:
        runs = []
        for fold in range(n):
            start = fold * subset_size
            stop = start + subset_size
            testing_this_round = trainfeats[start:stop]
            training_this_round = trainfeats[:start] + trainfeats[stop:]
            # One cache file per fold: a single shared file would make every
            # fold reuse the model trained for fold 0, which has already
            # seen the other folds' test data.
            fold_pkl = os.path.join('pkl', '%s_cv%d.pkl' % (cl, fold))
            classifier = load_or_train(cl, fold_pkl, training_this_round)
            runs.append(score(classifier, testing_this_round))

        print_report('N-FOLD CROSS VALIDATION RESULT ', cl, runs)
        print('')
예제 #21
0
def evaluate_classifier(featx):
    """Train and score MaxEnt and SVM classifiers on three sentiment classes.

    ``featx`` converts each item produced by ``splitter`` into a featureset.
    The items come from the module-level ``negative``, ``positive`` and
    ``neautral`` collections; the first three quarters of each class are
    used for training and the rest for testing.

    For every classifier the accuracy and the averaged (over the three
    classes) precision, recall and F-measure are printed and appended to
    the module-level lists ``acrcy``, ``prcsn``, ``rcall`` and ``fmsr``.
    Metrics reported as ``None`` are counted as 0.0.
    """
    def labelled(corpus, tag):
        return [(featx(item), tag) for item in splitter(corpus)]

    def or_zero(score):
        return 0.0 if score is None else score

    negfeats = labelled(negative, 'negative')
    posfeats = labelled(positive, 'positive')
    neautralfeats = labelled(neautral, 'neautral')

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)
    neautcutoff = int(len(neautralfeats) * 3 / 4)

    trainfeats = (negfeats[:negcutoff] + posfeats[:poscutoff] +
                  neautralfeats[:neautcutoff])
    testfeats = (negfeats[negcutoff:] + posfeats[poscutoff:] +
                 neautralfeats[neautcutoff:])

    # Max Entropy and SVM classifiers
    for cl in ['maxent', 'svm']:
        if cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        else:
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats,
                                                'GIS',
                                                trace=0,
                                                encoding=None,
                                                labels=None,
                                                gaussian_prior_sigma=0,
                                                max_iter=1)

        # Index sets per label: gold labels vs. predicted labels.
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for index, (feats, label) in enumerate(testfeats):
            refsets[label].add(index)
            testsets[classifier.classify(feats)].add(index)

        accuracy = nltk.classify.util.accuracy(classifier, testfeats)

        per_class = {}
        for tag in ('positive', 'neautral', 'negative'):
            reference, predicted = refsets[tag], testsets[tag]
            per_class[tag] = [or_zero(metric(reference, predicted))
                              for metric in (precision, recall, f_measure)]
        pos_precision, pos_recall, pos_fmeasure = per_class['positive']
        neut_precision, neut_recall, neut_fmeasure = per_class['neautral']
        neg_precision, neg_recall, neg_fmeasure = per_class['negative']

        mean_precision = (pos_precision + neg_precision + neut_precision) / 3
        mean_recall = (pos_recall + neg_recall + neut_recall) / 3
        mean_fmeasure = (pos_fmeasure + neg_fmeasure + neut_fmeasure) / 3

        print('\n')
        print(classifierName)
        print('accuracy:', accuracy)
        acrcy.append(accuracy)
        print('precision', mean_precision)
        prcsn.append(mean_precision)
        print('recall', mean_recall)
        rcall.append(mean_recall)
        print('f-measure', mean_fmeasure)
        fmsr.append(mean_fmeasure)
def evaluate_classifier(featx):
    """Compare Naive Bayes, MaxEnt and SVM on positive/negative data.

    ``featx`` converts each item produced by ``word_split`` into a
    featureset. The items come from the module-level ``negdata`` and
    ``posdata`` collections.

    Two evaluations are printed for every classifier:

    * a single split using the first three quarters of each class for
      training and the remainder for testing;
    * a 5-fold cross-validation over all featuresets, shuffled with
      ``random.shuffle`` so that each fold is likely to contain both
      classes.

    Each report shows accuracy and the pos/neg-averaged precision, recall
    and F-measure. A metric that nltk reports as ``None`` (undefined
    because a set is empty) is counted as 0.0 instead of raising a
    ``TypeError``.

    Returns:
        None. All results are printed.
    """
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    # using 3 classifiers
    classifier_list = ['nb', 'maxent', 'svm']
    classifier_names = {
        'nb': 'Naive Bayes',
        'maxent': 'Maximum Entropy',
        'svm': 'SVM',
    }

    def train(cl, feats):
        """Train and return the classifier identified by cl on feats."""
        if cl == 'maxent':
            return MaxentClassifier.train(feats,
                                          'GIS',
                                          trace=0,
                                          encoding=None,
                                          labels=None,
                                          gaussian_prior_sigma=0,
                                          max_iter=1)
        if cl == 'svm':
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(feats)
            return classifier
        return NaiveBayesClassifier.train(feats)

    def score(classifier, feats):
        """Return accuracy and per-class precision/recall/F for feats."""
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for idx, (featureset, label) in enumerate(feats):
            refsets[label].add(idx)
            testsets[classifier.classify(featureset)].add(idx)

        def value(metric, label):
            result = metric(refsets[label], testsets[label])
            return 0.0 if result is None else result

        return {
            'accuracy': nltk.classify.util.accuracy(classifier, feats),
            'pos_precision': value(precision, 'pos'),
            'pos_recall': value(recall, 'pos'),
            'pos_fmeasure': value(f_measure, 'pos'),
            'neg_precision': value(precision, 'neg'),
            'neg_recall': value(recall, 'neg'),
            'neg_fmeasure': value(f_measure, 'neg'),
        }

    def print_report(title, cl, runs):
        """Print the metrics of one or more runs, averaged over the runs."""
        def mean(key):
            return sum(run[key] for run in runs) / len(runs)

        print('---------------------------------------')
        print(title + '(' + classifier_names[cl] + ')')
        print('---------------------------------------')
        print('accuracy:', mean('accuracy'))
        print('precision', (mean('pos_precision') + mean('neg_precision')) / 2)
        print('recall', (mean('pos_recall') + mean('neg_recall')) / 2)
        print('f-measure', (mean('pos_fmeasure') + mean('neg_fmeasure')) / 2)

    for cl in classifier_list:
        classifier = train(cl, trainfeats)
        print('')
        print_report('SINGLE FOLD RESULT ', cl, [score(classifier, testfeats)])

    print('')

    ## CROSS VALIDATION

    trainfeats = negfeats + posfeats

    # SHUFFLE TRAIN SET
    # Without shuffling, a cross-validation test chunk might contain only
    # negative or only positive data.
    random.shuffle(trainfeats)
    n = 5  # 5-fold cross-validation
    subset_size = int(len(trainfeats) / n)

    for cl in classifier_list:
        runs = []
        for fold in range(n):
            start = fold * subset_size
            stop = start + subset_size
            testing_this_round = trainfeats[start:stop]
            training_this_round = trainfeats[:start] + trainfeats[stop:]
            classifier = train(cl, training_this_round)
            runs.append(score(classifier, testing_this_round))

        print_report('N-FOLD CROSS VALIDATION RESULT ', cl, runs)
        print('')
예제 #23
0




tweets = []
stop_words = set(stopwords.words('english'))

for (words, sentiment) in train:
    # Lower-case every token *before* the stop-word lookup. NLTK's English
    # stop-word list is lower-case, so testing the raw token let
    # capitalised stop words ("The", "I", ...) through.
    lowered = (token.lower() for token in words.split())
    words_filtered = [token for token in lowered if token not in stop_words]
    tweets.append((words_filtered, sentiment))

# print tweets
# word_features = get_word_features(get_words_in_tweets(tweets))
# training_set = nltk.classify.apply_features(extract_features, tweets)

training_set = traindict(tweets)
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(training_set)

# classifier = nltk.NaiveBayesClassifier.train(training_set)

classifier = SklearnClassifier(SVC(), sparse=False).train(training_set)

# Classify one sample tweet; its features are the token counts of the
# cleaned, lower-cased text.
tweetd = 'I have cows :('
print(classifier.classify(dict(Counter(clean(tweetd.lower())))))



# tweetd = 'Obama is boring :('
# print classifier.classify(extract_features(tweetd.lower().split()))
예제 #24
0
    if r['tag'] == 2:
        train.append((tx, "obj"))
    # elif r['tag']==0:
    #  	train.append((tx,"neg"))
    else:
        train.append((tx, "subj"))

tweets = []
stop_words = set(stopwords.words('english'))

for (words, sentiment) in train:
    # Lower-case every token *before* the stop-word lookup. NLTK's English
    # stop-word list is lower-case, so testing the raw token let
    # capitalised stop words ("The", "I", ...) through.
    lowered = (token.lower() for token in words.split())
    words_filtered = [token for token in lowered if token not in stop_words]
    tweets.append((words_filtered, sentiment))

# print tweets
# word_features = get_word_features(get_words_in_tweets(tweets))
# training_set = nltk.classify.apply_features(extract_features, tweets)

training_set = traindict(tweets)
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(training_set)

# classifier = nltk.NaiveBayesClassifier.train(training_set)

classifier = SklearnClassifier(SVC(), sparse=False).train(training_set)

# Classify one sample tweet; its features are the token counts of the
# cleaned, lower-cased text.
tweetd = 'I have cows :('
print(classifier.classify(dict(Counter(clean(tweetd.lower())))))

# tweetd = 'Obama is boring :('
# print classifier.classify(extract_features(tweetd.lower().split()))
예제 #25
0
def create_classifier(featx):
    """Train NB, SVM and MaxEnt sentiment classifiers and pickle them.

    Positive and negative reviews are loaded from ``pos_review.pkl`` and
    ``neg_review.pkl`` in ``config.test_path``. ``featx`` converts each
    loaded item into a featureset. The first 90% of each class is used for
    training and the remaining 10% for testing. The accuracy of every
    classifier on the test split is printed, and the three trained models
    are written to ``my_classifier_svm.pkl``, ``my_classifier_maxent.pkl``
    and ``my_classifier_nb.pkl`` in ``config.test_path``.

    All output uses single-argument ``print(...)`` with ``%`` formatting so
    the text is identical under Python 2 and Python 3.

    Returns:
        None.
    """
    def load_reviews(filename):
        # Pickles are binary data, so open them in 'rb'; the context manager
        # closes the handle (the original opened in text mode and leaked it).
        # NOTE: pickle.load can execute arbitrary code - trusted files only.
        with open(os.path.join(config.test_path, filename), 'rb') as f:
            return pickle.load(f)

    def show_size(tag, items):
        # Same text the original produced with `print a, b, c`.
        print('%s %s %s' % (items is None, tag, len(items)))

    pos_words = load_reviews('pos_review.pkl')
    neg_words = load_reviews('neg_review.pkl')

    print('%s ------ %s' % (len(pos_words), len(neg_words)))
    pos_features = [(featx(w_lst), 'pos') for w_lst in pos_words]
    neg_features = [(featx(w_lst), 'neg') for w_lst in neg_words]

    negoff = int(len(neg_features) * 0.9)
    posoff = int(len(pos_features) * 0.9)

    r_pos_cut = pos_features[:posoff]
    r_neg_cut = neg_features[:negoff]
    show_size('---r_pos_cut----', r_pos_cut)
    show_size('---r_neg_cut----', r_neg_cut)

    t_pos_cut = pos_features[posoff:]
    t_neg_cut = neg_features[negoff:]
    show_size('---t_pos_cut----', t_pos_cut)
    show_size('---t_neg_cut----', t_neg_cut)

    train_set = r_pos_cut + r_neg_cut
    test_set = t_pos_cut + t_neg_cut
    show_size('---train_set----', train_set)
    show_size('-----test_set--', test_set)

    # The original also filled per-label index sets for every classifier but
    # never read them; only the accuracies are reported, so they are gone.
    nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
    nba = nltk.classify.accuracy(nb_classifier, test_set)
    print("NBayes accuracy is %.7f" % nba)  # 0.5325077

    svm_classifier = SklearnClassifier(LinearSVC()).train(train_set)
    svmm = nltk.classify.accuracy(svm_classifier, test_set)
    print("SVM accuracy is %.7f" % svmm)  # 0.6604747

    maxent_classifier = nltk.classify.MaxentClassifier.train(train_set,
                                                             max_iter=7)
    maxent = nltk.classify.accuracy(maxent_classifier, test_set)
    print("MaxentClassifier accuracy is %.7f" % maxent)  # 0.6449948

    # Persist the trained models, in the same order as before.
    trained_models = (
        ('my_classifier_svm.pkl', svm_classifier),
        ('my_classifier_maxent.pkl', maxent_classifier),
        ('my_classifier_nb.pkl', nb_classifier),
    )
    for filename, model in trained_models:
        classifier_pkl = os.path.join(config.test_path, filename)
        with open(classifier_pkl, 'wb') as f:
            pickle.dump(model, f)

    print('done!')
예제 #26
0
def hello():
    """Train a review classifier and classify submitted or batch text.

    On every call a Bernoulli Naive Bayes classifier is trained from
    ``Training_amazon_data.txt``, whose lines have the form
    ``<text>\\t<label>``; label ``0`` marks a negative review. Features are
    ``contains(<word>)`` flags over all training words of three or more
    characters.

    On a POST request every line of ``test.txt`` is classified and written,
    followed by its predicted label, to ``result.txt``; the submitted
    ``name`` form field is classified too and its label is flashed when the
    form validates.

    Returns:
        The rendered ``analysis.html`` template.
    """
    # NOTE(review): the model is retrained on every request; consider
    # training once at start-up if this view is called often.
    pos_tweets = []
    neg_tweets = []
    with open("Training_amazon_data.txt") as training_file:
        for line in training_file:
            words = line.split("\t")
            if words[1] in ('0\n', '0'):
                neg_tweets.append(words)
            else:
                pos_tweets.append(words)

    tweets = []
    for (words, sentiment) in pos_tweets + neg_tweets:
        words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
        tweets.append((words_filtered, sentiment))

    # This helper used to be defined inside the loop above, so it did not
    # exist (NameError) when the training file had no rows.
    def get_words_in_tweets(tweets):
        all_words = []
        for (words, sentiment) in tweets:
            all_words.extend(words)
        return all_words

    def get_word_features(wordlist):
        wordlist = nltk.FreqDist(wordlist)
        word_features = wordlist.keys()
        return word_features

    word_features = get_word_features(get_words_in_tweets(tweets))

    def extract_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    training_set = nltk.classify.apply_features(extract_features, tweets)
    #classifie = nltk.NaiveBayesClassifier.train(training_set)

    classifie = SklearnClassifier(BernoulliNB()).train(training_set)

    form = ReusableForm(request.form)
    print(form.errors)

    if request.method == 'POST':
        name = request.form['name']
        # Context managers close both files even if classification fails.
        with open("test.txt") as test_file, \
                open("result.txt", "w") as resfile:
            for line in test_file:
                review = classifie.classify(extract_features(line.split()))
                resfile.write(line)
                resfile.write(review)
        name = classifie.classify(extract_features(name.split()))
        print(name)

        if form.validate():
            # Save the comment here.
            flash(name)
        else:
            flash('Error: All the form fields are required. ')

    return render_template('analysis.html', form=form)
예제 #27
0
from sklearn.svm import SVC
#TRAINING AND TEST DATA
def SVM_Classifier():
    """Train an SVM on a toy sentiment corpus and classify a sample sentence.

    Every sentence is represented by one boolean feature per vocabulary
    word (the lower-cased tokens of the training sentences) that tells
    whether the word occurs in the sentence.

    Returns:
        The label predicted for the sample sentence.
    """
    # TRAINING DATA
    train = [('I love this sandwich.', 'pos'),
             ('This is an amazing place!', 'pos'),
             ('I feel very good about these beers.', 'pos'),
             ('This is my best work.', 'pos'),
             ("What an awesome view", 'pos'),
             ('I do not like this restaurant', 'neg'),
             ('I am tired of this stuff.', 'neg'),
             ("I can't deal with this", 'neg'),
             ('He is my sworn enemy!', 'neg'),
             ('My boss is horrible.', 'neg')]
    test_sentence = "This is the best band I've ever heard!"

    def lowered_tokens(text):
        # Tokenise once per sentence and lower-case the tokens so they are
        # comparable with the lower-cased vocabulary. The original compared
        # against the raw tokens, so e.g. "This" never matched "this".
        return set(word.lower() for word in word_tokenize(text))

    # FEATURESETS
    train_tokens = [(lowered_tokens(text), label) for text, label in train]
    all_words = set()
    for tokens, _ in train_tokens:
        all_words.update(tokens)

    def features(tokens):
        return {word: (word in tokens) for word in all_words}

    t = [(features(tokens), label) for tokens, label in train_tokens]
    test_sent_features = features(lowered_tokens(test_sentence))

    # CLASSIFICATION
    # SUPPORT VECTOR MACHINE
    classif1 = SklearnClassifier(SVC(), sparse=False).train(t)
    # Return the prediction; the original computed it and discarded it.
    return classif1.classify(test_sent_features)
print('neg F-measure:', f_measure(refsets['neg'], testsets['neg']))

# Model #4: **UNIGRAMS** & SVM Model

# In[28]:

# #Create an SVM to compare which is the better performing model

import collections

from nltk.classify import SklearnClassifier
from sklearn.svm import SVC

SVM_classifier = SklearnClassifier(SVC(), sparse=False).train(train_set)

# Start from empty index sets: otherwise the predictions recorded for the
# previous model stay in `testsets` and distort the SVM scores.
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

for i, (uni_featureset, label) in enumerate(test_set):
    refsets[label].add(i)  # indices whose gold label is `label`
    observed = SVM_classifier.classify(uni_featureset)
    testsets[observed].add(i)  # indices the SVM assigned to `observed`

print('pos precision:', precision(refsets['pos'], testsets['pos']))
print('pos recall:', recall(refsets['pos'], testsets['pos']))
print('pos F-measure:', f_measure(refsets['pos'], testsets['pos']))
print('neg precision:', precision(refsets['neg'], testsets['neg']))
print('neg recall:', recall(refsets['neg'], testsets['neg']))
print('neg F-measure:', f_measure(refsets['neg'], testsets['neg']))

# Model #5 In order to get more context, we should start modeling **BIGRAMS** & Naive Bayes with the same dataset and compare

# In[29]:

# Pair the items of reviews and ratings positionally.
# NOTE(review): on Python 3 zip() is a one-shot iterator - confirm it is
# consumed only once downstream.
rl = zip(reviews, ratings)
예제 #29
0
def evaluate_bow():
    """Evaluate a bag-of-words Naive Bayes tagger on the BC3 labelled file.

    Every usable line of ``BC3_LABELLED_FILE`` has the form
    ``<label> <tag> <token> ...``. Lines with at most two fields or with
    the tag ``"none"`` are skipped. Along with the bag of words and the tag
    of a line, the first field of the following line is kept as
    ``next_label`` (``"T"`` when there is no following line or it has at
    most two fields); ``"T"`` marks the end of a segment.

    The first 80% of the usable lines train a ``MultinomialNB`` wrapped in
    ``SklearnClassifier``. The remaining lines are tagged in two ways:

    * "unsegmented": each line gets its own predicted tag;
    * "segmented": all lines of a segment get one voted tag.

    The per-line results and the precision, recall and F1 of both variants
    are printed.

    Returns:
        None.
    """
    with codecs.open(BC3_LABELLED_FILE, "r") as labelled_file:
        lines = labelled_file.readlines()

    data = []
    gold = []

    for i, line in enumerate(lines):
        tokens = line.strip().split()

        if len(tokens) > 2:
            tokens.pop(0)  # the line's own label is not used
            tag = tokens.pop(0)

            if tag == "none":
                continue

            # The bound must be checked on i + 1; the original tested
            # `i < len(lines)`, which is always true and overran the list
            # on the last line.
            if i + 1 < len(lines):
                next_tokens = lines[i + 1].strip().split()
            else:
                next_tokens = []
            next_label = next_tokens[0] if len(next_tokens) > 2 else "T"

            gold.append(tag)
            data.append((FreqDist(tokens), tag, next_label))

    limit = int(float(len(data)) * 0.8)

    # training set: bags-of-words and tag tuples
    train = [(bow, tag) for bow, tag, next_label in data[:limit]]
    # training the classifier
    classifier = SklearnClassifier(MultinomialNB()).train(train)

    results = {
        "segmented": [],
        "unsegmented": []
    }

    def segment_vote(segment_choices, all_choices):
        """Pick one tag for a segment from its per-line choices."""
        most_common = Counter(segment_choices).most_common()
        # Default to the segment's most frequent tag. The original stored
        # the whole (tag, count) tuple here.
        vote = most_common[0][0]
        if len(most_common) > 1:
            # Several candidates: prefer the one chosen most often over all
            # lines seen so far, if any was chosen more than once.
            tf = FreqDist(all_choices)
            best = 1
            for candidate, occ in most_common:
                if tf[candidate] > best:
                    vote = candidate
                    best = tf[candidate]
        return vote

    all_choices = []  # all choices made
    choices = []  # choices for the current segment

    for bow, tag, next_label in data[limit:]:
        # bow classification
        choice = classifier.classify(bow)
        choices.append(choice)
        all_choices.append(choice)

        # line by line classification for unsegmented results
        results["unsegmented"].append(choice)

        # more complex classification for segmented results
        if next_label == "T":
            vote = segment_vote(choices, all_choices)
            results["segmented"] += [vote] * len(choices)
            choices = []

    if choices:
        # The data ended inside a segment; vote on what is left so both
        # result lists stay aligned with the gold tags.
        vote = segment_vote(choices, all_choices)
        results["segmented"] += [vote] * len(choices)

    test_gold = gold[limit:]
    for i, label in enumerate(test_gold):
        next_label = data[i + limit][2]
        print("# {0}\t{1}\t{2}".format(label, results["unsegmented"][i], results["segmented"][i]))
        if next_label == "T":
            print("# ------------------")

    def f1(p, r):
        # Harmonic mean, defined as 0.0 when both inputs are 0.
        return (2.0 * (r * p)) / (r + p) if (r + p) else 0.0

    # segmented metrics
    sp = metrics.precision_score(test_gold, results["segmented"])
    sr = metrics.recall_score(test_gold, results["segmented"])
    sf = f1(sp, sr)

    # unsegmented metrics
    up = metrics.precision_score(test_gold, results["unsegmented"])
    ur = metrics.recall_score(test_gold, results["unsegmented"])
    uf = f1(up, ur)

    print("#")
    print("#                Pre.:\t\tRec:\t\tF1:")
    print("# segmented:     {0}%\t\t{1}%\t\t{2}%".format(dec(sp * 100), dec(sr * 100), dec(sf * 100)))
    print("# non-segmented: {0}%\t\t{1}%\t\t{2}%".format(dec(up * 100), dec(ur * 100), dec(uf * 100)))
예제 #30
0
print("UnigramsLogit Recall")
# nltk's recall() expects the reference (gold) set first and the test
# (predicted) set second; with the arguments reversed it computes precision.
print('Bullying recall:', recall(refset['Bullying'], testset['Bullying']))
print("")



# In[34]:

#Run Support Vector Machine for Unigrams
import collections

from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
SVM_classifier = SklearnClassifier(SVC(), sparse=False).train(train_set)

# Start from empty index sets: otherwise the predictions recorded for the
# previous classifier stay in `testset` and distort the SVM score.
refset = collections.defaultdict(set)
testset = collections.defaultdict(set)

for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)  # indices whose gold label is `label`
    observed = SVM_classifier.classify(feats)
    testset[observed].add(i)  # indices the SVM assigned to `observed`

print("UniigramSVM Recall")
print('Bullying recall:', recall(refset['Bullying'], testset['Bullying']))


# In[35]:

#Do the same thing with bigrams
from nltk import bigrams, trigrams
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures


# In[36]:
예제 #31
0
def evaluate_classifier(featx):
    """Train an SVM sentiment classifier and tally predictions per date.

    ``featx`` converts each item produced by ``word_split`` into a
    featureset. The classifier is trained on the first three quarters of
    the module-level ``negdata`` and ``posdata`` collections.

    ``test_half_half.txt`` is then read as alternating records: a date
    line followed by one text line (an empty line in the text position is
    skipped). Every text line is classified, and the number of non-``neg``
    and ``neg`` predictions per date is written to ``TEST_result.txt`` as
    ``<date>`` on one line and ``<pos count>, <neg count>`` on the next.

    Returns:
        None.
    """
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]

    classifier = SklearnClassifier(LinearSVC(), sparse=False).train(trainfeats)

    # date -> [count of non-'neg' predictions, count of 'neg' predictions]
    newsdata = {}
    date = ''
    got_date = False

    # Context managers close the files even if classification fails. The
    # input is read completely before the result file is opened, so a
    # failure no longer leaves an empty TEST_result.txt behind.
    with open('test_half_half.txt', 'r') as fp:
        for line in fp:
            if not got_date:
                date = line.replace('\n', '')
                got_date = True
                if date not in newsdata:
                    newsdata[date] = [0, 0]
                continue

            # The line after a date is the text; the next line is a date
            # again in either case.
            got_date = False
            if len(line.strip()) == 0:
                continue

            text = [line]
            print(text)
            newsfeat = [(featx(f), date) for f in word_split(text)]
            observed = classifier.classify(newsfeat[0][0])
            if observed == 'neg':
                newsdata[date][1] += 1
                print('------------------------------ ' + 'neg')
            else:
                newsdata[date][0] += 1
                print('------------------------------ ' + 'pos')

    with open('TEST_result.txt', 'w') as out_:
        for date, (pos_count, neg_count) in newsdata.items():
            out_.write(date + '\n' + str(pos_count) + ', ' +
                       str(neg_count) + '\n')
예제 #32
0
def _train_by_code(cl, train_set):
    """Train the classifier selected by the short code *cl*.

    Args:
        cl: ``'maxent'`` or ``'svm'``; any other value selects Naive Bayes.
        train_set: list of ``(featureset, label)`` pairs.

    Returns:
        A ``(display_name, trained_classifier)`` tuple.
    """
    if cl == 'maxent':
        classifier = MaxentClassifier.train(train_set,
                                            'GIS',
                                            trace=0,
                                            encoding=None,
                                            labels=None,
                                            gaussian_prior_sigma=0,
                                            max_iter=1)
        return 'Maximum Entropy', classifier
    if cl == 'svm':
        classifier = SklearnClassifier(LinearSVC(), sparse=False)
        classifier.train(train_set)
        return 'SVM', classifier
    return 'Naive Bayes', NaiveBayesClassifier.train(train_set)


def _index_sets_by_label(classifier, labelled_feats):
    """Classify every item and group item indices by label.

    Returns:
        ``(refsets, testsets, predictions)`` where ``refsets`` maps each gold
        label to the indices carrying it, ``testsets`` maps each predicted
        label to the indices it was assigned to, and ``predictions`` is the
        list of predicted labels in input order.
    """
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    predictions = []
    for idx, (feats, label) in enumerate(labelled_feats):
        refsets[label].add(idx)
        observed = classifier.classify(feats)
        predictions.append(observed)
        testsets[observed].add(idx)
    return refsets, testsets, predictions


def _precision_recall_f(refsets, testsets, label):
    """Return ``(precision, recall, f_measure)`` of *label* via nltk."""
    reference, predicted = refsets[label], testsets[label]
    return (nltk.precision(reference, predicted),
            nltk.recall(reference, predicted),
            nltk.f_measure(reference, predicted))


def evaluate_classifier(featx):
    """Evaluate Naive Bayes, SVM and MaxEnt classifiers on the review data.

    Three reports are printed:

    1. Single-fold results: each classifier is trained on the first 3/4 of
       the negative and positive feature sets and scored on the rest.
    2. A majority-vote ensemble of the three classifiers on that same test
       split (confusion matrix and accuracy).
    3. 5-fold cross-validation results per classifier on the shuffled union
       of all feature sets.

    The function reads the module-level ``negdata`` and ``posdata`` through
    ``word_split`` and appends each classifier's mean cross-validation
    accuracy to the module-level lists ``nb_accuracy``, ``svm_accuracy`` and
    ``maxent_accuracy``.

    Args:
        featx: callable turning one tokenised document into a featureset.

    Returns:
        None.
    """
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]
    # Floor division: the cut-offs are slice indices and must stay integers
    # on Python 3 as well as Python 2.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    print("No of training reviews: {}".format(len(trainfeats)))
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    print("No of testing reviews: {}".format(len(testfeats)))

    # using 3 classifiers
    classifier_list = ['nb', 'svm', 'maxent']
    true_labels = [label for _, label in testfeats]
    # Predictions are stored per classifier instead of in one flat list that
    # is later cut at fixed offsets, so any test-set size works.
    predictions = {}
    for cl in classifier_list:
        classifierName, classifier = _train_by_code(cl, trainfeats)
        refsets, testsets, predictions[cl] = _index_sets_by_label(
            classifier, testfeats)

        accuracy = nltk.classify.util.accuracy(classifier, testfeats)
        pos_precision, pos_recall, pos_fmeasure = _precision_recall_f(
            refsets, testsets, 'pos')
        neg_precision, neg_recall, neg_fmeasure = _precision_recall_f(
            refsets, testsets, 'neg')

        print('')
        print('---------------------------------------')
        print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', accuracy)
        print('precision', (pos_precision + neg_precision) / 2)
        print('recall', (pos_recall + neg_recall) / 2)
        print('f-measure', (pos_fmeasure + neg_fmeasure) / 2)

    print('')

    # Majority vote: an item is 'pos' when at least two of the three
    # classifiers say so, otherwise 'neg'.
    final_pred = []
    for votes in zip(predictions['nb'], predictions['maxent'],
                     predictions['svm']):
        final_pred.append('pos' if votes.count('pos') >= 2 else 'neg')

    print("-----------------------")
    print("-----------------------")

    print("Results of ensemble: NB + SVM + ME::")
    print("----------Confusion Matrix--------------")
    # scikit-learn expects (y_true, y_pred); rows are the true labels.
    cm = confusion_matrix(true_labels, final_pred)
    print(cm)
    print("")
    print("The accuracy score of ensemble is {:.2%}".format(
        accuracy_score(true_labels, final_pred)))
    print("##############################################")

    ## CROSS VALIDATION

    trainfeats = negfeats + posfeats

    # SHUFFLE TRAIN SET
    # As in cross validation, the test chunk might have only negative or only positive data
    random.shuffle(trainfeats)
    n = 5  # 5-fold cross-validation
    subset_size = len(trainfeats) // n

    for cl in classifier_list:
        accuracy = []
        pos_precision = []
        pos_recall = []
        neg_precision = []
        neg_recall = []
        pos_fmeasure = []
        neg_fmeasure = []
        for fold in range(n):
            start = fold * subset_size
            stop = start + subset_size
            testing_this_round = trainfeats[start:stop]
            training_this_round = trainfeats[:start] + trainfeats[stop:]

            classifierName, classifier = _train_by_code(
                cl, training_this_round)
            refsets, testsets, _ = _index_sets_by_label(
                classifier, testing_this_round)

            accuracy.append(
                nltk.classify.util.accuracy(classifier, testing_this_round))
            cv_pos = _precision_recall_f(refsets, testsets, 'pos')
            cv_neg = _precision_recall_f(refsets, testsets, 'neg')
            pos_precision.append(cv_pos[0])
            pos_recall.append(cv_pos[1])
            pos_fmeasure.append(cv_pos[2])
            neg_precision.append(cv_neg[0])
            neg_recall.append(cv_neg[1])
            neg_fmeasure.append(cv_neg[2])

        mean_accuracy = sum(accuracy) / n
        print('---------------------------------------')
        print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', mean_accuracy)
        print('precision',
              (sum(pos_precision) / n + sum(neg_precision) / n) / 2)
        print('recall', (sum(pos_recall) / n + sum(neg_recall) / n) / 2)
        print('f-measure', (sum(pos_fmeasure) / n + sum(neg_fmeasure) / n) / 2)
        # Record the mean accuracy in the matching module-level history list.
        if cl == 'maxent':
            maxent_accuracy.append(mean_accuracy)
        elif cl == 'svm':
            svm_accuracy.append(mean_accuracy)
        else:
            nb_accuracy.append(mean_accuracy)
예제 #33
0
def _zero_if_none(score):
    """Return *score*, or 0.0 when nltk reported the metric as undefined."""
    return 0.0 if score is None else score


def evaluate_mult_classifiers(feature_x, n_folds=5):
    """Cross-validate Naive Bayes and linear SVM classifiers and print results.

    Feature sets are built from the module-level ``neg_data`` and ``pos_data``
    (via ``word_split``), pooled, shuffled and cut into ``n_folds`` contiguous
    chunks of equal size; any remainder items at the end of the shuffled list
    are only ever used for training. For each classifier, every fold's
    accuracy, precision, recall and F-measure are printed, followed by the
    averages over all folds.

    Args:
        feature_x: callable turning one tokenised document into a featureset.
        n_folds: number of cross-validation folds (default 5).

    Returns:
        None.
    """
    neg_feats = [(feature_x(i), 'neg') for i in word_split(neg_data)]
    pos_feats = [(feature_x(i), 'pos') for i in word_split(pos_data)]

    classifier_list = ['NB', 'SVM']

    ## CROSS VALIDATION
    all_feats = neg_feats + pos_feats

    # Shuffle so a fold is unlikely to contain a single class only
    random.shuffle(all_feats)
    subset_size = len(all_feats) // n_folds

    for cl in classifier_list:
        accuracy = []
        pos_precision = []
        pos_recall = []
        neg_precision = []
        neg_recall = []
        pos_fmeasure = []
        neg_fmeasure = []

        print('--------------------------')
        print('Beginning Cross-validation')
        print('--------------------------')

        for cv_count, fold in enumerate(range(n_folds), start=1):
            start = fold * subset_size
            stop = start + subset_size
            testing_this_round = all_feats[start:stop]
            training_this_round = all_feats[:start] + all_feats[stop:]

            if cl == 'NB':
                classifierName = 'Naive Bayes'
                # Using NLTK NaiveBayesClassifier
                classifier = NaiveBayesClassifier.train(training_this_round)
            else:
                classifierName = 'SVM'
                classifier = SklearnClassifier(LinearSVC(), sparse=False)
                classifier.train(training_this_round)

            ref_sets = collections.defaultdict(set)
            test_sets = collections.defaultdict(set)

            for idx, (feats, label) in enumerate(testing_this_round):
                ref_sets[label].add(idx)
                observed = classifier.classify(feats)
                test_sets[observed].add(idx)

            cv_accuracy = nltk.classify.util.accuracy(classifier,
                                                      testing_this_round)
            # nltk returns None for a metric whose denominator set is empty
            # (e.g. a label that was never predicted in this fold); count it
            # as 0.0 so formatting and averaging below do not raise.
            cv_pos_precision = _zero_if_none(
                nltk.precision(ref_sets['pos'], test_sets['pos']))
            cv_pos_recall = _zero_if_none(
                nltk.recall(ref_sets['pos'], test_sets['pos']))
            cv_pos_fmeasure = _zero_if_none(
                nltk.f_measure(ref_sets['pos'], test_sets['pos']))
            cv_neg_precision = _zero_if_none(
                nltk.precision(ref_sets['neg'], test_sets['neg']))
            cv_neg_recall = _zero_if_none(
                nltk.recall(ref_sets['neg'], test_sets['neg']))
            cv_neg_fmeasure = _zero_if_none(
                nltk.f_measure(ref_sets['neg'], test_sets['neg']))

            print('Fold: {} Acc       : {:.4F}'.format(cv_count, cv_accuracy))
            print('Fold: {} pos_prec  : {:.4F} neg_prec  : {:.4F}'.format(
                cv_count, cv_pos_precision, cv_neg_precision))
            print('Fold: {} pos_recall: {:.4F} neg_recall: {:.4F}'.format(
                cv_count, cv_pos_recall, cv_neg_recall))
            print('Fold: {} pos_fmeas : {:.4F} neg_fmeas : {:.4F}'.format(
                cv_count, cv_pos_fmeasure, cv_neg_fmeasure))
            print('--')

            accuracy.append(cv_accuracy)
            pos_precision.append(cv_pos_precision)
            pos_recall.append(cv_pos_recall)
            neg_precision.append(cv_neg_precision)
            neg_recall.append(cv_neg_recall)
            pos_fmeasure.append(cv_pos_fmeasure)
            neg_fmeasure.append(cv_neg_fmeasure)

        print('----------------------------------------------------------')
        print('{}-Fold Cross Validation results for {} Classifier'.format(
            n_folds, classifierName))
        print('----------------------------------------------------------')
        print('accuracy : {:.4F}'.format(sum(accuracy) / n_folds))
        print('precision: {:.4F}'.format(
            (sum(pos_precision) / n_folds + sum(neg_precision) / n_folds) / 2))
        print('recall   : {:.4F}'.format(
            (sum(pos_recall) / n_folds + sum(neg_recall) / n_folds) / 2))
        print('f-measure: {:.4F}'.format(
            (sum(pos_fmeasure) / n_folds + sum(neg_fmeasure) / n_folds) / 2))
        print('\n')
예제 #34
0
                        "to": "2018-07-18 00:00:00"
                    }
                }
            }]
        }
    }
}
# Run the query and open a scroll cursor that the server keeps for 1 minute.
res = es.search(index='twint', doc_type='items', body=doc, scroll='1m')
count = 0
# Walk the result set page by page. The loop ends when a page has no hits;
# testing len(res) would never end because the response dict always has keys.
while res['hits']['hits']:
    scrollId = res['_scroll_id']
    print('scroll page with {} hits'.format(len(res['hits']['hits'])))
    # The loop variable is named `hit` so it does not overwrite the query
    # body held in `doc`.
    for hit in res['hits']['hits']:
        # word_split takes a list of texts; only the first (and here only)
        # tokenised entry is classified.
        tokenised = word_split([hit['_source']['tweet']])
        observed = classifier.classify(word_feats(tokenised[0]))
        count = count + 1
        print(hit['_id'], observed, count)
        # Store the predicted polarity back on the indexed tweet.
        es.update(index='twint',
                  doc_type='items',
                  id=hit['_id'],
                  body={"doc": {
                      "polarity": observed
                  }})
    # Advance the cursor; without this the first page is processed forever.
    res = es.scroll(scroll_id=scrollId, scroll='1m')
예제 #35
0
def _ratio_or_zero(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
    return numerator / denominator if denominator else 0.0


def runClassifiers(positives, negatives, featuresToUse, outFile, verbose, classifiersToUse):
    """Train the selected classifiers and collect their evaluation rows.

    Each item of ``positives`` / ``negatives`` is converted to a featureset
    with ``FeatureExtractor.langFeatures`` and labelled ``True`` / ``False``.
    Both lists are shuffled and split into a training and a test set, every
    enabled classifier is trained on the training set, and one result row per
    classifier is appended to the returned table.

    Args:
        positives: iterable of positive examples.
        negatives: iterable of negative examples.
        featuresToUse: mapping of feature switches; the keys ``"max_ent"``,
            ``"laugh_count"`` and ``"Dim Reduction"`` are read here, and the
            whole mapping is passed to ``FeatureExtractor``.
        outFile: path of a file to append the result table to, or ``""`` to
            only print the feature description.
        verbose: when truthy, print extra diagnostics for some classifiers.
        classifiersToUse: sequence of flags, by index: 0 Naive Bayes,
            1 Decision Tree, 2 Maximum Entropy, 3 Linear SVC, 4 AdaBoost,
            5 Random Forest, 6 Combo (Naive Bayes AND AdaBoost). Missing
            trailing flags count as ``False``; the caller's sequence is not
            modified. Assumes NUM_CLASSIFIERS >= 7 -- TODO confirm.

    Returns:
        list: one row per classifier that ran. Rows come from
        ``assess_classifier``, except the Combo row, which is
        ``["COMBO", accuracy, pos precision, pos recall, pos f1,
        neg precision, neg recall, neg f1]``.
    """
    table = []

    # Pad a private copy up to NUM_CLASSIFIERS entries instead of appending to
    # the list object owned by the caller.
    classifiersToUse = list(classifiersToUse)
    classifiersToUse += [False] * (NUM_CLASSIFIERS - len(classifiersToUse))

    # print which features we are using
    print("Using these features: ", FeatureExtractor.featuresToString(featuresToUse))

    pos = [(FeatureExtractor.langFeatures(data, featuresToUse), True)
           for data in positives]
    neg = [(FeatureExtractor.langFeatures(data, featuresToUse), False)
           for data in negatives]

    random.shuffle(pos)
    random.shuffle(neg)

    # The test set takes posCut items from each class, where posCut is a
    # quarter of the smaller class.
    minLen = min(len(pos), len(neg))
    posCut = minLen//4
    negCut = posCut*2

    # splits training and test sets
    # NOTE(review): neg[posCut:negCut] ends up in neither set -- confirm that
    # dropping these negatives is intended.
    train_data = pos[posCut:] + neg[negCut:]
    test_data = pos[:posCut] + neg[:posCut]

    maxEntSupport = featuresToUse["max_ent"]

    if classifiersToUse[0]:
        print("Running Naive Bayes classifier")
        timeStart = time.time()

        # NLTK's built-in implementation of the Naive Bayes classifier is trained
        classifier = nltk.NaiveBayesClassifier.train(train_data)

        # get the time it takes to train Naive Bayes
        print ("\nTime to train in seconds: ", time.time() - timeStart)

        # store the accuracy in the table
        table.append(assess_classifier(classifier, test_data, "Naive Bayes", maxEntSupport))

        if verbose:
            print("\n\n")
            # Prints the most informative features itself and returns None,
            # so its result is not passed to print().
            classifier.show_most_informative_features(20)

    if classifiersToUse[1]:
        print("Running Decision Tree classifier")
        timeStart = time.time()

        # NLTK's built-in implementation of the Decision Tree classifier is trained
        classifier = nltk.DecisionTreeClassifier.train(train_data)

        # get the time to train Decision tree
        print ("\nTime to train in seconds: ", time.time() - timeStart)

        # store the accuracy in the table
        table.append(assess_classifier(classifier, test_data, "Decision Tree"))

        if verbose:
            print("Printing tree")
            # Show expected vs. predicted label for the first 20 test items.
            for feats, cor in test_data[:20]:
                classification = classifier.classify(feats)
                print("Correct: ", cor, " Result: ", classification)

    if classifiersToUse[2]:
        print("Running Maximum Entropy classifier")
        timeStart = time.time()

        # NLTK's built-in implementation of the Max Entropy classifier is trained
        classifier = nltk.MaxentClassifier.train(train_data, max_iter=25)

        # Persist the trained model under a name that records whether the
        # laugh-count feature was enabled.
        if featuresToUse["laugh_count"]:
            DataCreator.pickleData("pickled_data/MaxEnt_Full", classifier)
        else:
            DataCreator.pickleData("pickled_data/MaxEnt_Part", classifier)

        # get the time to train Maximum Entropy (includes the pickling above)
        print ("\nTime to train in seconds: ", time.time() - timeStart)

        # store the accuracy in the table
        table.append(assess_classifier(classifier, test_data, "Maximum Entropy"))

        if verbose:
            # Prints the most informative features itself and returns None.
            classifier.show_most_informative_features(20)

    if classifiersToUse[3]:
        print("Running SVM classifier")
        timeStart = time.time()

        # Scikit-learn's LinearSVC classifier, wrapped up in NLTK's wrapper class
        clf = LinearSVC()

        if featuresToUse["Dim Reduction"]:
            pipeline = Pipeline([('PCA', PCA()), ('classifier', clf)])
            classifier = SklearnClassifier(pipeline)
        else:
            classifier = SklearnClassifier(clf)

        classifier.train(train_data)

        # get the time to train a Support Vector Machine
        print ("\nTime to train in seconds: ", time.time() - timeStart)

        # store the accuracy in the table
        table.append(assess_classifier(classifier, test_data, "Linear SVC"))

    if classifiersToUse[4]:
        numEstimators = 50
        print("Running AdaBoost classifier")
        timeStart = time.time()

        # Scikit-learn's AdaBoost classifier wrapped up in NLTK's wrapper class
        # The main parameters to tune to obtain good results are:
        # n_estimators and the complexity of the base estimators
        clf = AdaBoostClassifier(n_estimators=numEstimators)

        if featuresToUse["Dim Reduction"]:
            pipeline = Pipeline([('TruncatedSVD', TruncatedSVD()), ('classifier', clf)])
            classifier = SklearnClassifier(pipeline)
        else:
            classifier = SklearnClassifier(clf)
        classifier.train(train_data)

        # get the time to train
        print ("\nTime to train in seconds: ", time.time() - timeStart)

        # store the accuracy in the table
        table.append(assess_classifier(classifier, test_data, "AdaBoost(" + str(numEstimators) + ")", maxEntSupport))

    if classifiersToUse[5]:
        print("Running Random Forest Classifier classifier")
        timeStart = time.time()

        # Scikit-learn's Random Forest classifier wrapped up in NLTK's
        # wrapper class
        # The main parameters to tune to obtain good results are:
        # n_estimators
        clf = RandomForestClassifier()

        if featuresToUse["Dim Reduction"]:
            pipeline = Pipeline([('TruncatedSVD', TruncatedSVD()), ('classifier', clf)])
            classifier = SklearnClassifier(pipeline)
        else:
            classifier = SklearnClassifier(clf)
        classifier.train(train_data)

        # get the time to train
        print ("\nTime to train in seconds: ", time.time() - timeStart)

        # store the accuracy in the table
        table.append(assess_classifier(classifier, test_data, "Random Forest", maxEntSupport))

    if classifiersToUse[6]:
        numEstimators = 50
        print("Running Combo classifier")
        timeStart = time.time()

        adaclf = SklearnClassifier(AdaBoostClassifier(n_estimators=numEstimators))
        adaclf.train(train_data)
        naive = nltk.NaiveBayesClassifier.train(train_data)

        # get the time to train
        print ("\nTime to train in seconds: ", time.time() - timeStart)

        # Build the confusion counts: the combo predicts True only when both
        # Naive Bayes and AdaBoost predict True.
        TP = TN = FP = FN = 0
        for feats, label in test_data:
            observed = bool(naive.classify(feats) and adaclf.classify(feats))
            if label == observed:
                if observed:
                    TP += 1
                else:
                    TN += 1
            elif observed:
                FP += 1
            else:
                FN += 1

        # A metric whose denominator is zero (empty test set, or a class that
        # is never predicted / never present) is reported as 0.0 instead of
        # raising ZeroDivisionError.
        accuracy = _ratio_or_zero(TP + TN, TP + FP + TN + FN)
        p_prec = _ratio_or_zero(TP, TP + FP)
        p_rec = _ratio_or_zero(TP, TP + FN)
        f1Pos = 2 * _ratio_or_zero(p_prec * p_rec, p_prec + p_rec)
        n_prec = _ratio_or_zero(TN, TN + FN)
        n_rec = _ratio_or_zero(TN, TN + FP)
        f1Neg = 2 * _ratio_or_zero(n_prec * n_rec, n_prec + n_rec)
        table.append(["COMBO", accuracy, p_prec, p_rec, f1Pos, n_prec, n_rec, f1Neg])

    if outFile == "":
        # Without an output file only the feature description is printed; the
        # table itself is returned to the caller.
        print("\n", FeatureExtractor.featuresToString(featuresToUse))
    else:
        with open(outFile, 'a') as out:
            out.write("\n")
            out.write(FeatureExtractor.featuresToString(featuresToUse))
            out.write(tabulate(table, headers=["Classifier", "accuracy", "pos precision", "pos recall", "pos f1", "neg precision", "neg recall", "neg f1"]))
            out.write("\n")

    return table
예제 #36
0
def evaluate_classifier(featx, balance=False):
    global negdata
    global neudata
    global posdata

    if balance:

        neudata = resample(neudata, n_samples=len(negdata))
        posdata = resample(posdata, n_samples=len(negdata))

    # using 3 classifiers
    classifier_list = ['svm', 'nb', 'maxent']

    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]
    neufeats = [(featx(f), 'neu') for f in word_split(neudata)]
    alldata = negdata + posdata + neudata
    allfeats = negfeats + posfeats + neufeats

    #10-fold cross-validation
    correct = []
    incorrect = []
    for n in [10]:  #range(2,6):
        negfeatssplit = chunkIt(negfeats, n)
        negdatasplit = chunkIt(negdata, n)
        posfeatssplit = chunkIt(posfeats, n)
        posdatasplit = chunkIt(posdata, n)
        neufeatssplit = chunkIt(neufeats, n)
        neudatasplit = chunkIt(neudata, n)
        for cl in classifier_list:
            accuracy = []
            pos_precision = []
            pos_recall = []
            neg_precision = []
            neg_recall = []
            neu_precision = []
            neu_recall = []
            pos_fmeasure = []
            neg_fmeasure = []
            neu_fmeasure = []
            cv_count = 1
            res = {}
            res["neg"] = 0
            res["pos"] = 0
            res["neu"] = 0

            for i in range(n):
                testing_this_round = negfeatssplit[i - 1] + posfeatssplit[
                    i - 1] + neufeatssplit[i - 1]
                training_this_round = gettrainfeat(
                    negfeatssplit, i) + gettrainfeat(
                        posfeatssplit, i) + gettrainfeat(neufeatssplit, i)

                if cl == 'maxent':
                    classifierName = 'Maximum Entropy'
                    classifier = MaxentClassifier.train(training_this_round,
                                                        'GIS',
                                                        trace=0,
                                                        encoding=None,
                                                        labels=None,
                                                        gaussian_prior_sigma=0,
                                                        max_iter=1)
                elif cl == 'svm':
                    classifierName = 'SVM'
                    classifier = SklearnClassifier(LinearSVC(), sparse=False)
                    classifier.train(training_this_round)
                else:
                    classifierName = 'Naive Bayes'
                    classifier = NaiveBayesClassifier.train(
                        training_this_round)

                refsets = collections.defaultdict(set)
                testsets = collections.defaultdict(set)
                aux_test = {}
                auxFP_test = {}
                aux_test['pos'] = 0
                aux_test['neu'] = 0
                aux_test['neg'] = 0
                auxFP_test['pos'] = 0
                auxFP_test['neu'] = 0
                auxFP_test['neg'] = 0
                for ii, (feats, label) in enumerate(testing_this_round):
                    refsets[label].add(ii)
                    observed = classifier.classify(feats)
                    testsets[observed].add(ii)
                    res[observed] = res[observed] + 1
                    auxFP_test[observed] = auxFP_test[observed] + 1
                    if (observed == label):
                        correct.append((feats, label))
                        aux_test[label] = aux_test[label] + 1
                    else:
                        incorrect.append((feats, label))

                cv_accuracy = nltk.classify.util.accuracy(
                    classifier, testing_this_round)
                cv_neg_precision = float(aux_test['neg']) / float(
                    len(negfeatssplit[i - 1]))
                print cv_neg_precision

                cv_neg_recall = float(aux_test['neg']) / float(
                    auxFP_test['neg'])
                cv_neg_fmeasure = 2 * ((cv_neg_precision * cv_neg_recall) /
                                       (cv_neg_precision + cv_neg_recall))
                cv_pos_precision = float(aux_test['pos']) / float(
                    len(posfeatssplit[i - 1]))
                cv_pos_recall = float(aux_test['pos']) / float(
                    auxFP_test['pos'])
                cv_pos_fmeasure = 2 * ((cv_pos_precision * cv_pos_recall) /
                                       (cv_pos_precision + cv_pos_recall))
                cv_neu_precision = float(aux_test['neu']) / float(
                    len(neufeatssplit[i - 1]))
                cv_neu_recall = float(aux_test['neu']) / float(
                    auxFP_test['neu'])
                cv_neu_fmeasure = 2 * ((cv_neu_precision * cv_neu_recall) /
                                       (cv_neu_precision + cv_neu_recall))
                #cv_accuracy = float(aux_test['neg'] + aux_test['pos']+ aux_test['neu'])/float(len(testing_this_round))

                accuracy.append(cv_accuracy)
                pos_precision.append(cv_pos_precision)
                neg_precision.append(cv_neg_precision)
                neu_precision.append(cv_neu_precision)
                pos_recall.append(cv_pos_recall)
                neg_recall.append(cv_neg_recall)
                neu_recall.append(cv_neu_recall)
                pos_fmeasure.append(cv_pos_fmeasure)
                neg_fmeasure.append(cv_neg_fmeasure)
                neu_fmeasure.append(cv_neu_fmeasure)

                cv_count += 1

            print "Balance = ", balance
            print '---------------------------------------'
            print str(
                n
            ) + '-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')'
            print "Nbr = ", res
            print 'accuracy:', sum(accuracy) / n
            print 'precision', ((sum(pos_precision) / n) +
                                (sum(neg_precision) / n) +
                                (sum(neu_precision) / n)) / 3.0
            print sum(pos_precision) / n, sum(neg_precision) / n, sum(
                neu_precision) / n
            print 'recall', (sum(pos_recall) / n + sum(neg_recall) / n +
                             sum(neu_recall) / n) / 3.0
            print sum(pos_recall) / n, sum(neg_recall) / n, sum(neu_recall) / n
            print 'f-measure', (sum(pos_fmeasure) / n + sum(neg_fmeasure) / n +
                                sum(neu_fmeasure) / n) / 3.0
            print sum(pos_fmeasure) / n, sum(neg_fmeasure) / n, sum(
                neu_fmeasure) / n

            print "*********CORRECT****"
            print(len(correct), len(incorrect))
            #print (correct,incorrect)

            for tt in correct:
                print(tt[1], alldata[allfeats.index(tt)])
            print "***INCORRECT**********"
            for tt in incorrect:
                print(tt[1], alldata[allfeats.index(tt)])  #.index(correct[0]))
            print "..."