示例#1
0
	def train(self, observations, k=5):
		'''
		Fit one random-forest classifier per fold and keep them as an ensemble.

		The observations are cut into ``k`` chunks with ``np.array_split``.
		Round ``i`` trains on every chunk except chunk ``i`` and measures
		accuracy on chunk ``i``. When ``k`` is 1 the single classifier is
		trained and scored on the complete input. Every trained classifier is
		appended to ``self.forest`` and the mean accuracy over the rounds is
		printed.
		'''
		self.forest = []
		folds = np.array_split(observations, k)
		total_accuracy = 0
		for held_out in range(k):
			if k == 1:
				# A single fold leaves nothing to hold out: reuse everything.
				fit_rows = observations
				eval_rows = observations
			else:
				# Flatten all folds except the held-out one, in fold order.
				fit_rows = [
					row
					for position, fold in enumerate(folds)
					if position != held_out
					for row in fold
				]
				eval_rows = folds[held_out]
			member = SklearnClassifier(RandomForestClassifier())
			member.train(fit_rows)
			total_accuracy += nltk.classify.accuracy(member, eval_rows)
			self.forest.append(member)

		print('Accuracy on Train data(Using K fold)= ', total_accuracy/k )
class SentimentMNB(SentimentClassifier):
    """Sentiment classifier: chi-squared feature selection followed by multinomial NB."""

    def __init__(self, chiK=3368):
        """Store the number of features to keep, then run the base constructor.

        ``chiK`` is assigned before the base constructor is called, so the
        attribute already exists while the base class initialises itself.
        """
        self.chiK = chiK
        super(SentimentMNB, self).__init__()

    def initPipeline(self):
        """Build ``self.pipeline``: SelectKBest(chi2, k=self.chiK) -> MultinomialNB.

        An earlier variant of this pipeline also had a TF-IDF step and a fixed
        ``k=1000``; the current one feeds the selector directly.
        """
        steps = [
            ('chi2', SelectKBest(chi2, k=self.chiK)),
            ('nb', MultinomialNB()),
        ]
        self.pipeline = Pipeline(steps)

    def trainClassifier(self):
        """Rebuild the pipeline, wrap it for NLTK and train it on ``self.trainingSet``."""
        self.initPipeline()
        # The wrapper is stored first, then trained in place.
        self.classifier = SklearnClassifier(self.pipeline)
        self.classifier.train(self.trainingSet)
示例#3
0
class SVCModel(SKLearnModel):
    """Tweet classifier backed by a TF-IDF + SVC pipeline.

    Tweets are tokenized, turned into token-frequency distributions and fed to
    a support vector classifier whose kernel is chosen at construction time.
    """

    def __init__(self, kernel: str = "") -> None:
        """Create the tokenizer and the (untrained) classifier.

        :param kernel: kernel name handed to ``SVC`` and also used as a prefix
            for the name of the SVC pipeline step.
        """
        # Lower-cases text, shortens elongated character runs and removes
        # @handles before tokenizing.
        tweet_tokenizer = TweetTokenizer(preserve_case=False,
                                         reduce_len=True,
                                         strip_handles=True)
        self.tokenizer = tweet_tokenizer.tokenize

        # Two-step pipeline: TF-IDF weighting, then the SVC itself.
        svc_step_name = '{}svc'.format(kernel)
        steps = [('tfidf', TfidfTransformer()),
                 (svc_step_name, SVC(kernel=kernel))]
        self.classif = SklearnClassifier(Pipeline(steps))

    def train(self, tweets: List[Tweet]) -> None:
        """Train on ``tweets``, using each tweet's ``text`` as input and ``emoji`` as label."""
        # Lazily yields (token frequency distribution, label) pairs.
        corpus = ((FreqDist(self.tokenizer(tweet.text)), tweet.emoji)
                  for tweet in tweets)
        self.classif.train(corpus)

    def predict(self, text):
        """Return the label the trained classifier assigns to ``text``."""
        token_counts = FreqDist(self.tokenizer(text))
        return self.classif.classify(token_counts)

    def tokenize(self, text):
        """Tokenize ``text`` with this model's tweet tokenizer."""
        return self.tokenizer(text)
示例#4
0
文件: test3.py 项目: jshenaop/eko
def evaluate_classifier(featx):
    """Train Naive Bayes, MaxEnt and SVM classifiers on a 3/4 split and score each.

    Features are built by applying ``featx`` to every item produced by
    ``word_split`` over the module-level ``negdata`` and ``posdata``. The first
    three quarters of each class are used for training and the remainder for
    testing.

    :param featx: callable turning one split item into a feature set.
    """
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    # Floor division keeps the cutoffs integral: on Python 3 ``/`` returns a
    # float, and a float cannot be used as a slice index.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    # using 3 classifiers
    classifier_list = ['nb', 'maxent', 'svm']

    for cl in classifier_list:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0, encoding=None, labels=None, sparse=True,
                                                gaussian_prior_sigma=0, max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        else:
            classifierName = 'Naive Bayes'
            # Dumps the complete training set to stdout (kept from the original).
            print(trainfeats)
            classifier = NaiveBayesClassifier.train(trainfeats)

        # Index sets per label: refsets holds the gold labels, testsets the
        # labels predicted by the classifier.
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        accuracy = nltk.classify.util.accuracy(classifier, testfeats)
示例#5
0
def bag_of_words_model(df, column_name, target='label', k=1000):
    """Train a TF-IDF -> chi2 -> multinomial NB bag-of-words classifier.

    Rows of ``df`` whose ``target`` column equals 1 are used as positive
    examples and rows equal to 0 as negative examples. Every cell of
    ``column_name`` is converted to a ``FreqDist`` before training.

    :param df: data frame holding the text column and the label column.
    :param column_name: column whose cells are iterables of words.
    :param target: name of the 0/1 label column.
    :param k: number of features kept by the chi-squared selector.
    :return: whatever ``SklearnClassifier.train`` returns for the fitted model.
    """
    positive_cells = df[(df[target] == 1)][column_name].values
    negative_cells = df[(df[target] == 0)][column_name].values

    steps = [('tfidf', TfidfTransformer()),
             ('chi2', SelectKBest(chi2, k=k)),
             ('nb', MultinomialNB())]
    clf = SklearnClassifier(Pipeline(steps))

    # Positive examples first, then negative ones, each paired with its label.
    labeled = [(FreqDist(word_list), 1) for word_list in positive_cells]
    labeled += [(FreqDist(word_list), 0) for word_list in negative_cells]

    return clf.train(labeled)
示例#6
0
def bag_of_words_model(df, column_name, target='label', k=1000):
    """Fit a bag-of-words sentiment model on the labelled rows of ``df``.

    The model is an NLTK-wrapped scikit-learn pipeline: TF-IDF weighting,
    chi-squared selection of the ``k`` best features, then multinomial naive
    Bayes. Rows with ``df[target] == 1`` are labelled 1 and rows with
    ``df[target] == 0`` are labelled 0; each ``column_name`` cell becomes a
    ``FreqDist`` of its words.

    :return: the result of ``SklearnClassifier.train`` on the labelled data.
    """
    cells_by_label = [
        (1, df[(df[target] == 1)][column_name].values),
        (0, df[(df[target] == 0)][column_name].values),
    ]

    model_steps = [('tfidf', TfidfTransformer()),
                   ('chi2', SelectKBest(chi2, k=k)),
                   ('nb', MultinomialNB())]
    clf = SklearnClassifier(Pipeline(model_steps))

    # Label-1 examples are listed before label-0 examples.
    training_pairs = []
    for label, cells in cells_by_label:
        training_pairs.extend((FreqDist(word_list), label) for word_list in cells)

    return clf.train(training_pairs)
def _train_named_classifier(cl, feats):
    """Train the classifier selected by code ``cl`` on ``feats``; return (display name, classifier)."""
    if cl == 'maxent':
        classifier = MaxentClassifier.train(feats,
                                            'GIS',
                                            trace=0,
                                            encoding=None,
                                            labels=None,
                                            gaussian_prior_sigma=0,
                                            max_iter=1)
        return 'Maximum Entropy', classifier
    if cl == 'svm':
        classifier = SklearnClassifier(LinearSVC(), sparse=False)
        classifier.train(feats)
        return 'SVM', classifier
    # Any other code (in practice 'nb') falls back to Naive Bayes.
    return 'Naive Bayes', NaiveBayesClassifier.train(feats)


def _pos_neg_scores(classifier, test_feats):
    """Score ``classifier`` on ``test_feats``; return a dict of accuracy and per-label metrics.

    Keys are 'accuracy' plus '<pos|neg>_<precision|recall|fmeasure>'. A metric
    that NLTK reports as ``None`` (undefined for an empty set) is stored as 0.0
    so that callers can always average the values.
    """
    refsets = collections.defaultdict(set)   # gold label -> item indices
    testsets = collections.defaultdict(set)  # predicted label -> item indices
    for idx, (feats, label) in enumerate(test_feats):
        refsets[label].add(idx)
        testsets[classifier.classify(feats)].add(idx)

    scores = {'accuracy': nltk.classify.util.accuracy(classifier, test_feats)}
    metrics = (('precision', precision),
               ('recall', recall),
               ('fmeasure', f_measure))
    for label in ('pos', 'neg'):
        for metric_name, metric in metrics:
            value = metric(refsets[label], testsets[label])
            scores[label + '_' + metric_name] = 0.0 if value is None else value
    return scores


def evaluate_classifier(featx):
    """Evaluate Naive Bayes, MaxEnt and SVM on a single split and with 5-fold CV.

    Features are built by applying ``featx`` to every item produced by
    ``word_split`` over the module-level ``negdata`` and ``posdata``. First a
    3/4 train / 1/4 test split is scored for each classifier, then the combined
    and shuffled data is scored with 5-fold cross-validation. All results are
    printed; nothing is returned.

    :param featx: callable turning one split item into a feature set.
    """
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    # using 3 classifiers
    classifier_list = ['nb', 'maxent', 'svm']

    for cl in classifier_list:
        classifierName, classifier = _train_named_classifier(cl, trainfeats)
        scores = _pos_neg_scores(classifier, testfeats)

        print('')
        print('---------------------------------------')
        print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', scores['accuracy'])
        print('precision',
              (scores['pos_precision'] + scores['neg_precision']) / 2)
        print('recall', (scores['pos_recall'] + scores['neg_recall']) / 2)
        print('f-measure',
              (scores['pos_fmeasure'] + scores['neg_fmeasure']) / 2)

    print('')

    ## CROSS VALIDATION

    trainfeats = negfeats + posfeats

    # SHUFFLE TRAIN SET
    # Without shuffling, a test chunk could hold only negative or only
    # positive data.
    random.shuffle(trainfeats)
    n = 5  # 5-fold cross-validation
    # The fold size does not depend on the classifier, so compute it once.
    subset_size = int(len(trainfeats) / n)

    for cl in classifier_list:
        totals = collections.defaultdict(list)  # score name -> per-fold values
        for fold in range(n):
            start = fold * subset_size
            stop = start + subset_size
            testing_this_round = trainfeats[start:stop]
            training_this_round = trainfeats[:start] + trainfeats[stop:]

            classifierName, classifier = _train_named_classifier(
                cl, training_this_round)
            fold_scores = _pos_neg_scores(classifier, testing_this_round)
            for score_name, value in fold_scores.items():
                totals[score_name].append(value)

        print('---------------------------------------')
        print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', sum(totals['accuracy']) / n)
        print('precision',
              (sum(totals['pos_precision']) / n +
               sum(totals['neg_precision']) / n) / 2)
        print('recall',
              (sum(totals['pos_recall']) / n +
               sum(totals['neg_recall']) / n) / 2)
        print('f-measure',
              (sum(totals['pos_fmeasure']) / n +
               sum(totals['neg_fmeasure']) / n) / 2)
        print('')
示例#8
0
# Python 2 script fragment: `per`, `classif` and `add_label` are defined
# outside this excerpt.

# Report the size of each of the five buckets in `per` (the trailing comma
# suppresses the newline in a Python 2 print statement).
print len(per[0]), len(per[1]), len(per[2]), len(per[3]), len(per[4]), 

# Training cutoff per bucket: nine tenths of its length. These values are used
# as slice indices below, so this relies on Python 2 integer division.
train1 = (9*len(per[0]))/10
train2 = (9*len(per[1]))/10
train3 = (9*len(per[2]))/10
train4 = (9*len(per[3]))/10
train5 = (9*len(per[4]))/10

# One frequency distribution per item of each bucket.
ones = [FreqDist(x) for x in per[0]]
twos = [FreqDist(x) for x in per[1]]
threes = [FreqDist(x) for x in per[2]]
fours = [FreqDist(x) for x in per[3]]
fives = [FreqDist(x) for x in per[4]]

print "Starting to train"
# Train on the leading 9/10 of every bucket, labelled '1' .. '5'.
classif.train(add_label(ones[:train1], '1') + add_label(twos[:train2], '2') + add_label(threes[:train3], '3') + add_label(fours[:train4], '4') + add_label(fives[:train5], '5')) 
print "Done learning"
# Classify the held-out tail of each bucket; the predictions are kept as numpy
# arrays so they can be compared element-wise against label strings later.
l_ones = np.array(classif.batch_classify(ones[train1:]))
print "one done"
l_twos = np.array(classif.batch_classify(twos[train2:]))
print "two done"
l_threes = np.array(classif.batch_classify(threes[train3:]))
print "three done"
l_fours = np.array(classif.batch_classify(fours[train4:]))
print "four done"
l_fives = np.array(classif.batch_classify(fives[train5:]))
print "five done"

con_ma = [[(l_ones == '1').sum(), (l_ones == '2').sum(), (l_ones == '3').sum(), (l_ones == '4').sum(), (l_ones == '5').sum()],
          [(l_twos == '1').sum(), (l_twos == '2').sum(), (l_twos == '3').sum(), (l_twos == '4').sum(), (l_twos == '5').sum()],
          [(l_threes == '1').sum(), (l_threes == '2').sum(), (l_threes == '3').sum(), (l_threes == '4').sum(), (l_threes == '5').sum()],
        lines[i].remove('$')
    #print(lines[i])

# Bucket the entries of `lines` (defined above this excerpt) into three
# vocabularies according to the numeric score stored at index 2.
positive_vocab = []
negative_vocab = []
neutral_vocab = []

for l in lines:
    # l[2] is parsed as the score; l[0] is the entry that gets bucketed
    # (presumably the token or emoji itself — confirm against the loader).
    k = float(l[2])
    # The thresholds are asymmetric: above 0.2 is positive, below -0.1 is
    # negative, everything in between is neutral.
    if(k > 0.2):
        positive_vocab.append(l[0])
    elif(k < -0.1):
        negative_vocab.append(l[0])
    else:
        neutral_vocab.append(l[0])

# print(neutral_vocab)
# print(positive_vocab)

# Turn every vocabulary entry into a (features, label) pair via word_feats.
positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab]
negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab]
neutral_features = [(word_feats(neu), 'neu') for neu in neutral_vocab]

# Combine, shuffle in place, and train an NLTK-wrapped SGD classifier.
train_set = positive_features + negative_features + neutral_features
random.shuffle(train_set)
cls = SklearnClassifier(SGDClassifier())
classifier = cls.train(train_set)

# Sample input made of three emoji; `w` holds its tokens, and the print shows
# the length of the raw string, not of the token list.
sent = '☺😅😭'
w = word_tokenize(sent)
print(len(sent))
示例#10
0
def find_feature(document):
    """Return a presence feature dict for ``document``.

    Every word of the module-level ``words_feature`` collection becomes a key;
    its value is ``True`` when the word occurs in ``document`` and ``False``
    otherwise.

    :param document: iterable of words.
    :return: dict mapping each feature word to a bool.
    """
    # A set gives O(1) membership tests for the lookups below.
    words = set(document)
    # Membership (`in`), not identity (`is`): a word is never the set object
    # itself, so an identity test would mark every feature as absent.
    return {w: (w in words) for w in words_feature}


# Build (features, category) pairs for every document, then split them:
# the first 1900 pairs train the models, the rest evaluate them.
features = [(find_feature(rev), category) for (rev, category) in documents]

testing_set = features[1900:]
training_set = features[:1900]

# Train the NLTK Naive Bayes classifier once and cache it on disk; later runs
# reuse the pickled model stored at the path held in `naivebayes`.
if not os.path.isfile(naivebayes):
    classifier = nltk.NaiveBayesClassifier.train(training_set)

    # The context manager closes the file even if pickling fails.
    with open(naivebayes, "wb") as save_classifier:
        pickle.dump(classifier, save_classifier)
else:
    # NOTE(security): unpickling can execute arbitrary code, so only load
    # model files that this script wrote itself.
    with open(naivebayes, "rb") as classifier_f:
        classifier = pickle.load(classifier_f)

print("Original Naive Bayes Classifier accuracy precent:", (nltk.classify.accuracy(classifier, testing_set) * 100))

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
# Score the multinomial model itself, not the NLTK Naive Bayes `classifier`.
print("Multinomial Naive Bayes Classifier accuracy precent:", (nltk.classify.accuracy(MNB_classifier, testing_set) * 100))
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC


# Train several classifiers on the same split and print their accuracies.
# `x_train` / `x_test` come from outside this excerpt.
testing_set=x_test
training_set=x_train


from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(x_train)
# The accuracy is printed twice: first as a raw fraction, then as a percentage.
print(nltk.classify.accuracy(classifier, x_test))
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

# For each scikit-learn model below, the same accuracy value is computed and
# printed twice under two different labels.
Random_Forest_Classifier=SklearnClassifier(RandomForestClassifier())
Random_Forest_Classifier.train(training_set)
# Random_Forest_Classifier_Normal=RandomForestClassifier()
# Random_Forest_Classifier_Normal.fit(x_train)
print("Random Forest Classifier After Ontology Matching percent:", (nltk.classify.accuracy(Random_Forest_Classifier, testing_set))*100)
print("Random Forest Classifier :", (nltk.classify.accuracy(Random_Forest_Classifier, testing_set))*100)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy After Ontology Matching percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy After Ontology Matching percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)
示例#12
0
            out = out + l[c]
    return out


# Resample the neutral and positive data so each has as many samples as the
# negative data (`resample` is presumably sklearn.utils.resample — confirm).
neudata = resample(neudata, n_samples=len(negdata))
posdata = resample(posdata, n_samples=len(negdata))

# Build (features, label) pairs for each class from the split data.
negfeats = [(word_feats(f), 'neg') for f in word_split(negdata)]
posfeats = [(word_feats(f), 'pos') for f in word_split(posdata)]
neufeats = [(word_feats(f), 'neu') for f in word_split(neudata)]

# Concatenate raw data and features in the same class order: neg, pos, neu.
alldata = negdata + posdata + neudata
allfeats = negfeats + posfeats + neufeats

# Train a linear SVM on every feature set; no hold-out split is made here.
classifier = SklearnClassifier(LinearSVC(), sparse=False)
classifier.train(allfeats)

es = Elasticsearch(['http://localhost:9200/'])
doc = {
    "query": {
        "bool": {
            "must_not": [
                {
                    "exists": {
                        "field": "likes"
                    }
                },
                {
                    "exists": {
                        "field": "replies"
                    }
示例#13
0
# Python 2 script fragment: `features` and the scikit-learn estimators are
# defined/imported outside this excerpt.
#randomly shuffle the features
random.shuffle(features)

#splitting into training and testing sets
# NOTE(review): items 5000-9999 end up in neither split — confirm the gap is
# intentional.
train_set = features[:5000]
test_set = features[10000:]

#print len(train_set),len(test_set)
import nltk

# Each section trains one classifier on train_set and prints its accuracy on
# test_set as a percentage.
nltk_nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
print "NLTK NB classifier score : ", nltk.classify.accuracy(
    nltk_nb_classifier, test_set) * 100.0

mnb_classifier = SklearnClassifier(MultinomialNB())
mnb_classifier.train(train_set)
print "mnb_classfier score : ", nltk.classify.accuracy(mnb_classifier,
                                                       test_set) * 100.0

bnb_classifier = SklearnClassifier(BernoulliNB())
bnb_classifier.train(train_set)
print "bnb_classfier score : ", nltk.classify.accuracy(bnb_classifier,
                                                       test_set) * 100.0

svc = SklearnClassifier(SVC(kernel='rbf'))
svc.train(train_set)
print "SVC : ", nltk.classify.accuracy(svc, test_set) * 100.0

lin_svc = SklearnClassifier(LinearSVC())
lin_svc.train(train_set)
print "Linear SCV : ", nltk.classify.accuracy(lin_svc, test_set) * 100.0
示例#14
0
# Maximum-entropy classifier trained on the complete feature list with GIS.
mec = nltk.classify.MaxentClassifier.train(train_features,
                                           'GIS',
                                           trace=0,
                                           max_iter=1000)

# 10-fold cross-validation of a Naive Bayes classifier over the same features.
from sklearn import cross_validation
cv = cross_validation.KFold(len(train_features),
                            n_folds=10,
                            indices=True,
                            shuffle=False,
                            random_state=None)

for traincv, evalcv in cv:
    # traincv / evalcv are arrays of item indices. Select exactly those items:
    # slicing from the first to the last index would pull the evaluation fold
    # into the training data for every middle fold and drop the final item.
    classifier = nltk.NaiveBayesClassifier.train(
        [train_features[idx] for idx in traincv])
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('accuracy: %.3f' % nltk.classify.util.accuracy(
        classifier, [train_features[idx] for idx in evalcv]))

import sklearn
from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# scikit-learn pipeline wrapped for NLTK: TF-IDF weighting, chi-squared
# selection of the 2000 best features, then multinomial naive Bayes.
pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k=2000)),
                     ('nb', MultinomialNB())])
pipecl = SklearnClassifier(pipeline)
# Trained on the complete train_features list.
pipecl.train(train_features)
示例#15
0
def evaluate_classifier(featx):
    """Score MaxEnt and SVM classifiers on a three-class 3/4 train / 1/4 test split.

    Features come from applying ``featx`` to the items that ``splitter`` yields
    for the module-level ``negative``, ``positive`` and ``neautral`` data. For
    each classifier the accuracy and the unweighted three-class mean of
    precision, recall and f-measure are printed and appended to the
    module-level lists ``acrcy``, ``prcsn``, ``rcall`` and ``fmsr``.

    :param featx: callable turning one split item into a feature set.
    """

    def _zero_if_none(value):
        # The metric functions return None when undefined; count that as 0.0.
        return 0.0 if value is None else value

    negfeats = [(featx(f), 'negative') for f in splitter(negative)]
    posfeats = [(featx(f), 'positive') for f in splitter(positive)]
    neautralfeats = [(featx(f), 'neautral') for f in splitter(neautral)]

    # The first three quarters of each class train, the rest test; classes are
    # concatenated in the order negative, positive, neautral.
    trainfeats = []
    testfeats = []
    for class_feats in (negfeats, posfeats, neautralfeats):
        cutoff = int(len(class_feats) * 3 / 4)
        trainfeats = trainfeats + class_feats[:cutoff]
        testfeats = testfeats + class_feats[cutoff:]

    # Max Entropy and SVM classifiers
    for cl in ['maxent', 'svm']:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats,
                                                'GIS',
                                                trace=0,
                                                encoding=None,
                                                labels=None,
                                                gaussian_prior_sigma=0,
                                                max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)

        # Index sets per label: gold labels vs. predicted labels.
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for position, (feats, label) in enumerate(testfeats):
            refsets[label].add(position)
            testsets[classifier.classify(feats)].add(position)

        accuracy = nltk.classify.util.accuracy(classifier, testfeats)

        # (precision, recall, f-measure) per label, with None mapped to 0.0.
        per_label = {}
        for label in ('positive', 'neautral', 'negative'):
            per_label[label] = tuple(
                _zero_if_none(metric(refsets[label], testsets[label]))
                for metric in (precision, recall, f_measure))
        pos_precision, pos_recall, pos_fmeasure = per_label['positive']
        neut_precision, neut_recall, neut_fmeasure = per_label['neautral']
        neg_precision, neg_recall, neg_fmeasure = per_label['negative']

        mean_precision = (pos_precision + neg_precision + neut_precision) / 3
        mean_recall = (pos_recall + neg_recall + neut_recall) / 3
        mean_fmeasure = (pos_fmeasure + neg_fmeasure + neut_fmeasure) / 3

        print('\n')
        print(classifierName)
        print('accuracy:', accuracy)
        acrcy.append(accuracy)
        print('precision', mean_precision)
        prcsn.append(mean_precision)
        print('recall', mean_recall)
        rcall.append(mean_recall)
        print('f-measure', mean_fmeasure)
        fmsr.append(mean_fmeasure)
示例#16
0
def trainClassifier(trainData):
    """Train an NLTK-wrapped ``LinearSVC`` on ``trainData`` and return the train result.

    :param trainData: labelled feature sets accepted by ``SklearnClassifier.train``.
    :return: the value returned by ``SklearnClassifier.train``.
    """
    svm_wrapper = SklearnClassifier(LinearSVC())
    return svm_wrapper.train(trainData)
示例#17
0
# Split featuresContainer once: the leading TRAIN_TEST_RATIO share of the
# 2 * NUMBER_OF_POS_AND_NEG_COMMENTS items trains, the remainder tests.
_split_index = int(TRAIN_TEST_RATIO * (NUMBER_OF_POS_AND_NEG_COMMENTS * 2))
testingData = featuresContainer[_split_index:]
trainingData = featuresContainer[:_split_index]
# train naive bayes classifier
naiveBayesClassifier = nltk.classify.NaiveBayesClassifier.train(trainingData)
# print Naive Bayes accuracy
print("Naive Bayes accuracy in percent:",
      (nltk.classify.util.accuracy(naiveBayesClassifier, testingData)) * 100)
# save trained naive bayes classifier; the context manager closes the file
# even when pickling raises
with open("naiveBayes.pickle", "wb") as classifier_to_save:
    pickle.dump(naiveBayesClassifier, classifier_to_save)
# train multinomial naive bayes classifier
multinomial_naive_bayes_classifier = SklearnClassifier(MultinomialNB())
multinomial_naive_bayes_classifier.train(trainingData)
# print multinomial Naive Bayes classifier accuracy
print("multinomial naive bayes accuracy in percent:",
      (nltk.classify.util.accuracy(multinomial_naive_bayes_classifier,
                                   testingData)) * 100)
# save trained multinomial naive bayes classifier
with open("multiNaiveBayes.pickle", "wb") as classifier_to_save:
    pickle.dump(multinomial_naive_bayes_classifier, classifier_to_save)
# train Bernoulli naive bayes classifier
bernoulli_naive_bayes_classifier = SklearnClassifier(BernoulliNB())
bernoulli_naive_bayes_classifier.train(trainingData)
# print Bernoulli Naive Bayes classifier accuracy
print("Bernoulli naive bayes accuracy in percent",
      (nltk.classify.util.accuracy(bernoulli_naive_bayes_classifier,
                                   testingData)) * 100)
示例#18
0
rating_names = [student['name'] for student in ratings]
data_names = list(set([student['Name'] for student in data]))
#cleans text for classifying
for i,student in enumerate(data):
	text = tech.cleanse(student['Student Comment'])
	data[i]['Student Comment'] = text

#split into testing and training sets
n = len(data)
test_idx = random.sample(xrange(n),int(n*0.5))
train_idx = set(xrange(n))-set(test_idx)

test_set = filter(lambda item: item[1] ,map(extract_featurelabel,[data[i] for i in test_idx]))
train_set = filter(lambda item: item[1] ,map(extract_featurelabel,[data[i] for i in train_idx]))

#classifier = NaiveBayesClassifier.train(train_set)
classif.train(test_set)
#Compute accuracy
test_data,test_label = zip(*test_set)
train_data,train_label = zip(*train_set)

predictions = classif.classify_many(test_data)

print confusion_matrix(test_label,predictions)
print matthews_corrcoef(test_label,predictions)
'''
#Only work if using built-in NLTK classifier
print ('Accuracy: {0:.2f}%'.format(100 * nltk.classify.accuracy(classif, test_set)))
classif.show_most_informative_features(20)
'''
示例#19
0
 def train(self, features_label):
     """Train an RBF-default ``SVC`` (C=10.0, gamma=0.0001) wrapped for NLTK.

     The result of ``SklearnClassifier.train`` is stored in
     ``self._classifier``; nothing is returned.
     """
     wrapped_svc = SklearnClassifier(SVC(C=10.0, gamma=0.0001))
     trained = wrapped_svc.train(features_label)
     self._classifier = trained
示例#20
0
import scipy

from nltk.classify import maxent
# Bare attribute access: shows the list of supported algorithms when run in an
# interactive session, and has no effect in a script.
nltk.classify.MaxentClassifier.ALGORITHMS
# ['GIS','IIS','CG','BFGS','Powell','LBFGSB','Nelder-Mead','MEGAM','TADM']

# MEGAM or TADM are not rec'd for text classification
mec = nltk.classify.MaxentClassifier.train(train_features, 'GIS', trace=0, max_iter=1000)

# 10-fold cross-validation of a Naive Bayes classifier over the same features.
from sklearn import cross_validation
cv = cross_validation.KFold(len(train_features), n_folds=10, indices=True, shuffle=False, random_state=None)

for traincv, evalcv in cv:
    # traincv / evalcv are arrays of item indices. Select exactly those items:
    # slicing from the first to the last index would pull the evaluation fold
    # into the training data for every middle fold and drop the final item.
    classifier = nltk.NaiveBayesClassifier.train([train_features[idx] for idx in traincv])
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('accuracy: %.3f' % nltk.classify.util.accuracy(classifier, [train_features[idx] for idx in evalcv]))



import sklearn
from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# scikit-learn pipeline wrapped for NLTK: TF-IDF weighting, chi-squared
# selection of the 2000 best features, then multinomial naive Bayes.
pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k=2000)),
                     ('nb', MultinomialNB())])
pipecl = SklearnClassifier(pipeline)
# Trained on the complete train_features list.
pipecl.train(train_features)
    return features


# Build (features, category) pairs; the first 1900 train, the rest test.
featureset = [(find_features(rev), category) for (rev, category) in dataset]
training_set = featureset[:1900]
testing_set = featureset[1900:]

# Load the previously trained NLTK Naive Bayes model. The context manager
# closes the file even if unpickling fails.
# NOTE(security): unpickling can execute arbitrary code, so only load model
# files from a trusted source.
with open("naive_bayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)
print("Inbuilt Naive Bayes accuracy = ",
      (nltk.classify.accuracy(classifier, testing_set)) * 100)
# classifier.show_most_informative_features()

# Each section below trains one scikit-learn model through the NLTK wrapper
# and prints its accuracy on the test split as a percentage.
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MN Naive Bayes accuracy = ",
      (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100)

BE_classifier = SklearnClassifier(BernoulliNB())
BE_classifier.train(training_set)
print("BE Naive Bayes accuracy = ",
      (nltk.classify.accuracy(BE_classifier, testing_set)) * 100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression accuracy = ",
      (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) *
      100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
示例#22
0
class MyClassifier:
    """
    SVC-backed classifier that labels tweet text as "VER" or "NVER".

    Each text is turned into a vector of counts, one slot per entry of
    ``self.features`` (read from "verifiability_features.txt"), by matching the
    tag of every POS-tagged token against the feature list.

    ``self.training_data`` holds ``(feature_dict, label)`` tuples and is kept
    index-aligned with ``self.all_tweets``: ``training_data[i]`` belongs to
    ``all_tweets[i]``.
    """

    def __init__(self, load_clf=False, load_tr_data=False):
        """
        Load the feature list and the raw tweets, then set up the classifier.

        :param load_clf: when true, load the saved classifier from
            "twitterClassifier.pkl" instead of creating an untrained SVC
        :param load_tr_data: when true, load the most recently saved training
            data set from disk
        """
        self.features = self.__load_support_vector_features()
        self.training_data = []
        self.n_samples = 0
        self.all_tweets = self.__load_tweets_from_file()  # list not dict

        # Classifier loading
        if load_clf:
            self.load_clf()
        else:
            self.clf = SklearnClassifier(SVC(), sparse=False)

        # Training Data loading
        if load_tr_data:
            self.__load_training_data()

    def __load_tweets_from_file(self):
        """
        Read the most recently created raw tweet file.

        Every line is split on the "%\\t%" separator; the first two fields are
        kept as ``(tweet_text, tweet_id)``. The id keeps whatever trailing
        newline the line had.

        :return: list of (tweet_text, tweet_id) tuples
        """
        # open latest file
        list_of_files = glob.glob(
            "datasets_twitter/twitter_training_data_raw*.txt")
        latest_file = max(list_of_files, key=os.path.getctime)

        tweet_list = []
        # context manager: the handle is closed even if a line is malformed
        with open(latest_file, "r", encoding="UTF-8") as f:
            for line in f:
                parts = line.split("%\t%")
                tweet_list.append((parts[0], parts[1]))

        return tweet_list

    def __load_support_vector_features(self):
        """
        Read the feature names, one per line, from "verifiability_features.txt".

        :return: list of feature strings with newline characters removed
        """
        with open("verifiability_features.txt", "r") as feature_f:
            return [line_f.replace("\n", "") for line_f in feature_f]

    def __to_feature_dict(self, sample):
        """
        Pair every feature name with its value in ``sample``.

        :param sample: list of int as returned by __get_sample()
        :return: dict mapping feature name -> value, the format the wrapped
            classifier is trained and queried with
        """
        return dict(zip(self.features, sample))

    def __get_sample(self, text_str):
        """
        Changes the text_str into a sample of data in the form of [0, 0, 0, ...]
        This is to be used by the classifier, when
            1) Assembling Training Data, and
            2) Testing data.
        It returns a list of int, which is basically a count of how many of each feature existed in text_str.

        :param text_str: a string of text which is to be verified
        :return: curr_sample, a list of int, sort of mapped to self.features
        """
        tokens = pos_tag(word_tokenize(text_str))

        # list of n_features of 0s ex. [0, 0, 0, ..]
        curr_sample = [0] * len(self.features)

        for t_text, t_feature in tokens:
            for index, feature in enumerate(self.features):
                if t_feature != feature:
                    continue

                if feature == self.features[-1]:
                    # the last feature only counts, negatively, when the
                    # token itself is a "?"; otherwise keep scanning
                    if t_text == "?":
                        curr_sample[index] -= 1
                        break
                else:
                    curr_sample[index] += 1
                    break

        return curr_sample

    def __get_training_target(self, sample):
        """
        Returns the label depending on the sample given.

        :param sample: int[] from self.__get_sample()
        :return: "VER" or "NVER", representing the two labels Verifiable and Non-Verifiable
        """
        # a negative value only appears when the text contained a "?"
        if any(v < 0 for v in sample):
            return "NVER"

        return "VER" if sum(sample) > 0 else "NVER"

    def __assemble_training_data(self):
        """
        Construct the training data using the twitter training data set.

        To be used directly prior to training the Classifier.

        Only tweets that do not have a training entry yet are processed.
        Appending a fresh entry for every tweet on every call would duplicate
        the data on each retrain, bury corrected labels under heuristic ones,
        and break the index alignment with ``self.all_tweets``.

        :return:
        """
        for tweet in self.all_tweets[len(self.training_data):]:
            # get the sample and target for each tweet
            curr_sample = self.__get_sample(tweet[0])
            curr_target = self.__get_training_target(curr_sample)

            # add to self.training_data in trainable format
            self.training_data.append(
                (self.__to_feature_dict(curr_sample), curr_target))

    def __save_training_data(self):
        """
        Dump ``self.training_data`` as JSON into a new timestamped file under
        "datasets_twitter/".
        """
        timestamp = '{:%Y_%m_%d_%H_%M_%S}'.format(datetime.datetime.now())
        path = "datasets_twitter/twitter_training_dataset" + timestamp + ".json"
        with open(path, "w+") as f:
            f.write(json.dumps(self.training_data))

    def __load_training_data(self):
        """
        Append the contents of the most recently created training data file to
        ``self.training_data``.

        Only the first line of the file is parsed; __save_training_data()
        writes the whole data set without adding line breaks.
        """
        list_of_files = glob.glob(
            "datasets_twitter/twitter_training_dataset*.json")
        latest_file = max(list_of_files, key=os.path.getctime)
        with open(latest_file, "r") as f:
            js = json.loads(f.readline())

        for i in js:
            self.training_data.append((i[0], i[1]))  # sample, target

    def train_with_svc(self):
        """
        Complete the training data, train the classifier and save it to disk.
        """
        # make the training data
        self.__assemble_training_data()

        # Train the classifier
        self.clf.train(self.training_data)

        # save classifier as soon as it is trained
        self.save_clf()

    def predict_single(self, test_text):
        """
        Predict a single sample.

        :param test_text: the text to classify
        :return: tuple (predicted label, sample vector computed for test_text)
        """
        test_sample = self.__get_sample(test_text)
        pred = self.clf.classify_many([self.__to_feature_dict(test_sample)])
        return (pred[0], test_sample)

    def predict_multiple(self, test_list):
        """
        Predict more than one sample at a time.

        :param test_list: iterable of texts to classify
        :return: the predictions returned by the classifier, one per text
        """
        # translate test_list into clf passable data format
        test_data = [
            self.__to_feature_dict(self.__get_sample(text))
            for text in test_list
        ]

        # predict
        return self.clf.classify_many(test_data)

    def update_pred_into_training(self, test_tweet, pred_val):
        """
        Adds predicted ( {feat:sample}, target ) to training data
        then saves the training data

        if test_text already exists in the training data
            update the target value instead
            then save the training data

        Finally the classifier is retrained (and saved) on the updated data.

        :param test_tweet: a tweet in the form of (tweet_text, tweet_id)
        :param pred_val: the value of the prediction made by the classifier
        :return:
        """
        # localise
        test_tweet_text = test_tweet[0]

        # every known tweet needs an entry at its own index before one of them
        # can be replaced; otherwise the replacement below raises IndexError
        # on a classifier whose training data was never built
        self.__assemble_training_data()

        # make into trainable data format
        tup = (self.__to_feature_dict(self.__get_sample(test_tweet_text)),
               pred_val)

        for i, tweet in enumerate(self.all_tweets):
            if test_tweet_text == tweet[0]:
                # text is already known: replace its training entry
                # (there should only be one tweet with the same text)
                self.training_data[i] = tup
                break
        else:
            # unknown text: add tweet to all_tweets and training data
            self.all_tweets.append(test_tweet)
            self.training_data.append(tup)

        # save the training data to file
        self.__save_training_data()
        # train the classifier again
        self.train_with_svc()

    def load_clf(self):
        """
        Load a previously trained and saved classifier.

        NOTE: joblib files are pickle based; only load files you trust.
        :return:
        """
        self.clf = joblib.load("twitterClassifier.pkl")

    def save_clf(self):
        """
        Save the current classifier to file
        :return:
        """
        joblib.dump(self.clf, "twitterClassifier.pkl")
示例#23
0
# Randomise the negative tweets before the hold-out slice is taken.
shuffle(neg_tweets_set)

# The first 2500 tweets of each polarity are held out for testing,
# everything after them is used for training.
train_set = pos_tweets_set[2500:] + neg_tweets_set[2500:]
test_set = pos_tweets_set[:2500] + neg_tweets_set[:2500]
#train_set = pos_tweets_set + neg_tweets_set

# Train the three classifiers on the same training set.
ME_classifier = MaxentClassifier.train(
    train_set, 'GIS',
    trace=0, encoding=None, labels=None,
    gaussian_prior_sigma=0, max_iter=1)
NB_classifier = NaiveBayesClassifier.train(train_set)
SVM_classifier = SklearnClassifier(LinearSVC(), sparse=False)
SVM_classifier.train(train_set)

#ME_accuracy = classify.accuracy(ME_classifier, test_set)
#NB_accuracy = classify.accuracy(NB_classifier, test_set)
#SVM_accuracy = classify.accuracy(SVM_classifier, test_set)
#print(ME_accuracy, NB_accuracy, SVM_accuracy)

# For every label, collect the test-set indices that carry it (actual) and
# the indices the Naive Bayes classifier assigns to it (predicted).
predicted_set = defaultdict(set)
actual_set = defaultdict(set)

for index, (feature, actual_label) in enumerate(test_set):
    predicted_label = NB_classifier.classify(feature)
    actual_set[actual_label].add(index)
    predicted_set[predicted_label].add(index)

accuracy = classify.accuracy(NB_classifier, test_set)
def word_feats(words):
    """Return a feature dict that maps every item of ``words`` to True."""
    return {word: True for word in words}


def create_word_features(words):
    """
    Return a feature dict mapping every non-stopword item of ``words`` to True.

    The English stopword list is fetched once and turned into a set, instead
    of being re-read and scanned linearly for every single word.
    """
    english_stopwords = set(stopwords.words("english"))
    return {word: True for word in words if word not in english_stopwords}

# Label every vocabulary entry with its polarity, in the (featureset, label)
# form expected by the NLTK classifier wrapper.
negative_features = [(word_feats(neg), 'neg') for neg in neg_vocab]
positive_features = [(word_feats(pos), 'pos') for pos in pos_vocab]

# Negative examples first, then positive ones.
train_set = negative_features + positive_features

# Logistic regression, trained through NLTK's scikit-learn wrapper.
LRclassifier = SklearnClassifier(LogisticRegression())
LRclassifier.train(train_set)



def pre(text):
    """
    Score the share of positive and negative tokens in ``text``.

    ``text`` is joined into one string, lower-cased and tokenized; every token
    is classified on its own with ``LRclassifier``.

    :param text: a string, or an iterable of strings that is concatenated
    :return: dict with keys 'pos' and 'neg'; each value is the fraction of
        tokens classified with that label, as a string. Text without any
        token yields '0.0' for both instead of failing on an unbound result.
    """
    tokens = word_tokenize(''.join(text).lower())
    if not tokens:
        return {'pos': '0.0', 'neg': '0.0'}

    neg = 0
    pos = 0
    for word in tokens:
        classResult = LRclassifier.classify(word_feats(word))
        if classResult == 'neg':
            neg += 1
        elif classResult == 'pos':
            pos += 1

    # build the result once, after all tokens were counted
    return {'pos': str(float(pos) / len(tokens)),
            'neg': str(float(neg) / len(tokens))}
示例#25
0
def evaluate_classifier(featx):
    """
    Train three classifiers on the labelled data and run them on a tweet dump.

    The first three quarters of the negative and of the positive features
    (built from the module-level ``negdata`` / ``posdata``) form the training
    set. The test set is built from the 'text' field of the tweets stored one
    JSON document per line in '20161019_202620.txt'; every test sample is
    labelled 'neg', so the reported accuracy is the share of tweets classified
    as 'neg'.

    :param featx: feature extractor applied to every item yielded by word_split
    """
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]
    # floor division keeps the cutoffs usable as slice indices on Python 3 too
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]

    print('Reading Tweets\n')
    tweets_data_path = '20161019_202620.txt'
    tweets_data = []
    # context manager: the file is closed even when reading fails
    with open(tweets_data_path, "r") as tweets_file:
        for line in tweets_file:
            try:
                tweets_data.append(json.loads(line))
            except ValueError:
                # skip lines that are not valid JSON
                continue

    tweets = pd.DataFrame()
    tweets['text'] = [tweet.get('text', '') for tweet in tweets_data]

    tdata = tweets['text']
    testfeats = [(featx(f), 'neg') for f in word_split(tdata)]

    print(np.shape(testfeats))

    # using 3 classifiers
    classifier_list = ['nb', 'maxent', 'svm']

    for cl in classifier_list:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats, 'GIS', trace=0, encoding=None, labels=None, gaussian_prior_sigma=0, max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        else:
            classifierName = 'Naive Bayes'
            classifier = NaiveBayesClassifier.train(trainfeats)

        # test-set indices grouped by the label the classifier assigned
        testsets = collections.defaultdict(set)

        for i, (feats, label) in enumerate(testfeats):
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        # indices that share the label of the last classified sample
        print(testsets[observed])

        accuracy = nltk.classify.util.accuracy(classifier, testfeats)

        print('')
        print('---------------------------------------')
        print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy: %s' % (accuracy,))
示例#26
0
def evaluate_mult_classifiers(feature_x, n_folds=5):
    """
    Cross-validate a Naive Bayes and a linear SVM classifier and print scores.

    The features built from the module-level ``neg_data`` and ``pos_data`` are
    merged and shuffled once; both classifiers are then evaluated on the same
    ``n_folds`` consecutive folds. Per fold and as an average over all folds,
    accuracy, precision, recall and f-measure are printed.

    A metric that nltk reports as None (for example precision for a label
    that was never predicted in a fold) is counted as 0.0, so that it can be
    formatted and averaged.

    :param feature_x: feature extractor applied to every item yielded by
        word_split
    :param n_folds: number of cross-validation folds (5 by default)
    """
    neg_feats = [(feature_x(i), 'neg') for i in word_split(neg_data)]
    pos_feats = [(feature_x(i), 'pos') for i in word_split(pos_data)]

    classifier_list = ['NB', 'SVM']

    ## CROSS VALIDATION
    train_feats = neg_feats + pos_feats

    # Shuffle training set
    random.shuffle(train_feats)

    # fold size; samples beyond n_folds * subset_size are never tested
    subset_size = len(train_feats) // n_folds

    def _score(metric, reference, predicted):
        """Return metric(reference, predicted), with None replaced by 0.0."""
        value = metric(reference, predicted)
        return 0.0 if value is None else value

    for cl in classifier_list:

        accuracy = []
        pos_precision = []
        pos_recall = []
        neg_precision = []
        neg_recall = []
        pos_fmeasure = []
        neg_fmeasure = []

        print('--------------------------')
        print('Beginning Cross-validation')
        print('--------------------------')

        for fold in range(n_folds):
            cv_count = fold + 1
            start = fold * subset_size
            stop = start + subset_size
            testing_this_round = train_feats[start:stop]
            training_this_round = train_feats[:start] + train_feats[stop:]

            if cl == 'NB':
                classifierName = 'Naive Bayes'
                # Using NLTK NaiveBayesClassifier
                classifier = NaiveBayesClassifier.train(training_this_round)
            else:
                classifierName = 'SVM'
                classifier = SklearnClassifier(LinearSVC(), sparse=False)
                classifier.train(training_this_round)

            # sample indices grouped by true label / by predicted label
            ref_sets = collections.defaultdict(set)
            test_sets = collections.defaultdict(set)

            # dedicated index name: the fold counter must not be shadowed
            for idx, (feats, label) in enumerate(testing_this_round):
                ref_sets[label].add(idx)
                observed = classifier.classify(feats)
                test_sets[observed].add(idx)

            cv_accuracy = nltk.classify.util.accuracy(classifier,
                                                      testing_this_round)
            cv_pos_precision = _score(nltk.precision, ref_sets['pos'],
                                      test_sets['pos'])
            cv_pos_recall = _score(nltk.recall, ref_sets['pos'],
                                   test_sets['pos'])
            cv_pos_fmeasure = _score(nltk.f_measure, ref_sets['pos'],
                                     test_sets['pos'])
            cv_neg_precision = _score(nltk.precision, ref_sets['neg'],
                                      test_sets['neg'])
            cv_neg_recall = _score(nltk.recall, ref_sets['neg'],
                                   test_sets['neg'])
            cv_neg_fmeasure = _score(nltk.f_measure, ref_sets['neg'],
                                     test_sets['neg'])

            print('Fold: {} Acc       : {:.4F}'.format(cv_count, cv_accuracy))
            print('Fold: {} pos_prec  : {:.4F} neg_prec  : {:.4F}'.format(
                cv_count, cv_pos_precision, cv_neg_precision))
            print('Fold: {} pos_recall: {:.4F} neg_recall: {:.4F}'.format(
                cv_count, cv_pos_recall, cv_neg_recall))
            print('Fold: {} pos_fmeas : {:.4F} neg_fmeas : {:.4F}'.format(
                cv_count, cv_pos_fmeasure, cv_neg_fmeasure))
            print('--')

            accuracy.append(cv_accuracy)
            pos_precision.append(cv_pos_precision)
            pos_recall.append(cv_pos_recall)
            neg_precision.append(cv_neg_precision)
            neg_recall.append(cv_neg_recall)
            pos_fmeasure.append(cv_pos_fmeasure)
            neg_fmeasure.append(cv_neg_fmeasure)

        print('----------------------------------------------------------')
        print('{}-Fold Cross Validation results for {} Classifier'.format(
            n_folds, classifierName))
        print('----------------------------------------------------------')
        print('accuracy : {:.4F}'.format(sum(accuracy) / n_folds))
        print('precision: {:.4F}'.format(
            (sum(pos_precision) / n_folds + sum(neg_precision) / n_folds) / 2))
        print('recall   : {:.4F}'.format(
            (sum(pos_recall) / n_folds + sum(neg_recall) / n_folds) / 2))
        print('f-measure: {:.4F}'.format(
            (sum(pos_fmeasure) / n_folds + sum(neg_fmeasure) / n_folds) / 2))
        print('\n')
示例#27
0
def runClassifiers(positives, negatives, featuresToUse, outFile, verbose, classifiersToUse):
    """
    Train the selected classifiers on the given data and collect their scores.

    :param positives: data items labelled True
    :param negatives: data items labelled False
    :param featuresToUse: feature configuration passed to FeatureExtractor;
        the keys "max_ent", "laugh_count" and "Dim Reduction" are also read here
    :param outFile: path of a file the result table is appended to; with ""
        only the feature description is printed
    :param verbose: when true, print extra diagnostics for some classifiers
    :param classifiersToUse: list of flags selecting the classifiers by
        position: 0 Naive Bayes, 1 Decision Tree, 2 Maximum Entropy,
        3 Linear SVC, 4 AdaBoost, 5 Random Forest, 6 Combo (Naive Bayes AND
        AdaBoost). Missing trailing flags count as False; the caller's list
        is not modified.
    :return: list with one result row per classifier that was run
    """
    table = []

    # Pad a private copy up to NUM_CLASSIFIERS flags instead of appending to
    # the list owned by the caller.
    classifiersToUse = list(classifiersToUse)
    classifiersToUse.extend([False] * (NUM_CLASSIFIERS - len(classifiersToUse)))

    def _ratio(numerator, denominator):
        """Return numerator / denominator, or 0.0 for a zero denominator."""
        return float(numerator) / denominator if denominator else 0.0

    # print which features we are using
    print("Using these features: ", FeatureExtractor.featuresToString(featuresToUse))

    pos = [(FeatureExtractor.langFeatures(data, featuresToUse), True) for data in positives]
    neg = [(FeatureExtractor.langFeatures(data, featuresToUse), False) for data in negatives]

    random.shuffle(pos)
    random.shuffle(neg)

    # Testing is 1/4 of the data set, so we will cut it off there
    minLen = min(len(pos), len(neg))
    posCut = minLen//4
    negCut = posCut*2

    # splits training and test sets
    # NOTE(review): neg[posCut:negCut] ends up in neither set - confirm that
    # dropping these samples is intended.
    train_data = pos[posCut:] + neg[negCut:]
    test_data = pos[:posCut] + neg[:posCut]

    maxEntSupport = featuresToUse["max_ent"]

    if classifiersToUse[0]:
        print("Running Naive Bayes classifier")
        timeStart = time.time()

        # NLTK's built-in implementation of the Naive Bayes classifier is trained
        classifier = nltk.NaiveBayesClassifier.train(train_data)

        # get the time it takes to train Naive Bayes
        print ("\nTime to train in seconds: ", time.time() - timeStart)

        # store the accuracy in the table
        table.append(assess_classifier(classifier, test_data, "Naive Bayes", maxEntSupport))

        if verbose:
            # this is a nice function that reports the top most impactful features the NB classifier found
            print("\n\n")
            print (classifier.show_most_informative_features(20))

    if classifiersToUse[1]:
        print("Running Decision Tree classifier")
        timeStart = time.time()

        # NLTK's built-in implementation of the Decision Tree classifier is trained
        classifier = nltk.DecisionTreeClassifier.train(train_data)

        # get the time to train Decision tree
        print ("\nTime to train in seconds: ", time.time() - timeStart)

        # store the accuracy in the table
        table.append(assess_classifier(classifier, test_data, "Decision Tree"))

        if verbose:
            print("Printing tree")
            # show the predictions for the first 20 test samples
            for (feats, cor) in test_data[:20]:
                classification = classifier.classify(feats)
                print("Correct: ", cor, " Result: ", classification)

    if classifiersToUse[2]:
        print("Running Maximum Entropy classifier")
        timeStart = time.time()

        # NLTK's built-in implementation of the Max Entropy classifier is trained
        classifier = nltk.MaxentClassifier.train(train_data, max_iter=25)

        if featuresToUse["laugh_count"]:
            DataCreator.pickleData("pickled_data/MaxEnt_Full", classifier)
        else:
            DataCreator.pickleData("pickled_data/MaxEnt_Part", classifier)

        # get the time to train Maximum Entropy
        print ("\nTime to train in seconds: ", time.time() - timeStart)

        # store the accuracy in the table
        table.append(assess_classifier(classifier, test_data, "Maximum Entropy"))

        if verbose:
            # reports the most impactful features the classifier found
            print (classifier.show_most_informative_features(20))

    if classifiersToUse[3]:
        print("Running SVM classifier")
        timeStart = time.time()

        # Scikit-learn's LinearSVC classifier, wrapped up in NLTK's wrapper class
        clf = LinearSVC()

        if featuresToUse["Dim Reduction"]:
            pipeline = Pipeline([('PCA', PCA()), ('classifier', clf)])
            classifier = SklearnClassifier(pipeline)
        else:
            classifier = SklearnClassifier(clf)

        classifier.train(train_data)

        # get the time to train a Support Vector Machine
        print ("\nTime to train in seconds: ", time.time() - timeStart)

        # store the accuracy in the table
        table.append(assess_classifier(classifier, test_data, "Linear SVC"))

    if classifiersToUse[4]:
        numEstimators = 50
        print("Running AdaBoost classifier")
        timeStart = time.time()

        # Scikit-learn's AdaBoost classifier wrapped up in NLTK's wrapper class
        # The main parameters to tune to obtain good results are:
        # n_estimators and the complexity of the base estimators
        clf = AdaBoostClassifier(n_estimators=numEstimators)

        if featuresToUse["Dim Reduction"]:
            pipeline = Pipeline([('TruncatedSVD', TruncatedSVD()), ('classifier', clf)])
            classifier = SklearnClassifier(pipeline)
        else:
            classifier = SklearnClassifier(clf)
        classifier.train(train_data)

        # get the time to train
        print ("\nTime to train in seconds: ", time.time() - timeStart)

        # store the accuracy in the table
        table.append(assess_classifier(classifier, test_data, "AdaBoost(" + str(numEstimators) + ")", maxEntSupport))

    if classifiersToUse[5]:
        print("Running Random Forest Classifier classifier")
        timeStart = time.time()

        # Scikit-learn's Random Forest classifier wrapped up in NLTK's
        # wrapper class
        # The main parameters to tune to obtain good results are:
        # n_estimators
        clf = RandomForestClassifier()

        if featuresToUse["Dim Reduction"]:
            pipeline = Pipeline([('TruncatedSVD', TruncatedSVD()), ('classifier', clf)])
            classifier = SklearnClassifier(pipeline)
        else:
            classifier = SklearnClassifier(clf)
        classifier.train(train_data)

        # get the time to train
        print ("\nTime to train in seconds: ", time.time() - timeStart)

        # store the accuracy in the table
        table.append(assess_classifier(classifier, test_data, "Random Forest", maxEntSupport))

    if classifiersToUse[6]:
        numEstimators = 50
        print("Running Combo classifier")
        timeStart = time.time()

        adaclf = SklearnClassifier(AdaBoostClassifier(n_estimators=numEstimators))
        adaclf.train(train_data)
        naive = nltk.NaiveBayesClassifier.train(train_data)

        # get the time to train
        print ("\nTime to train in seconds: ", time.time() - timeStart)

        # confusion counts; a sample is predicted True only when both
        # classifiers say True
        TP = TN = FP = FN = 0
        for feats, label in test_data:
            observed = bool(naive.classify(feats) and adaclf.classify(feats))
            if label == observed:
                if observed:
                    TP += 1
                else:
                    TN += 1
            else:
                if observed:
                    FP += 1
                else:
                    FN += 1

        # _ratio yields 0.0 instead of raising ZeroDivisionError when a class
        # is never predicted or the test set is empty
        accuracy = _ratio(TP + TN, TP + FP + TN + FN)
        p_prec = _ratio(TP, TP + FP)
        p_rec = _ratio(TP, TP + FN)
        f1Pos = 2 * _ratio(p_prec * p_rec, p_prec + p_rec)
        n_prec = _ratio(TN, TN + FN)
        n_rec = _ratio(TN, TN + FP)
        f1Neg = 2 * _ratio(n_prec * n_rec, n_prec + n_rec)
        table.append(["COMBO", accuracy, p_prec, p_rec, f1Pos, n_prec, n_rec, f1Neg])

    if (outFile == ""):
        print("\n", FeatureExtractor.featuresToString(featuresToUse))
    else:
        with open(outFile, 'a') as out:
            out.write("\n")
            out.write(FeatureExtractor.featuresToString(featuresToUse))
            out.write(tabulate(table, headers=["Classifier", "accuracy", "pos precision", "pos recall", "pos f1", "neg precision", "neg recall", "neg f1"]))
            out.write("\n")

    return table
示例#28
0
def evaluate_classifier(featx, balance=False):
    """
    10-fold cross-validation of SVM, Naive Bayes and Maximum Entropy on
    three-class data ('neg', 'pos', 'neu').

    The features are built from the module-level ``negdata``, ``posdata`` and
    ``neudata``. For every classifier the averaged accuracy, precision, recall
    and f-measure are printed, followed by the correctly and the incorrectly
    classified samples.

    Precision is the share of correct predictions among all predictions of a
    label; recall is the share of correct predictions among all test samples
    of that label. A ratio with a zero denominator counts as 0.0.

    :param featx: feature extractor applied to every item yielded by word_split
    :param balance: when true, the module-level ``neudata`` and ``posdata``
        are replaced by resamples of the size of ``negdata``
    """
    global negdata
    global neudata
    global posdata

    if balance:
        neudata = resample(neudata, n_samples=len(negdata))
        posdata = resample(posdata, n_samples=len(negdata))

    # using 3 classifiers
    classifier_list = ['svm', 'nb', 'maxent']
    labels = ('pos', 'neg', 'neu')

    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]
    neufeats = [(featx(f), 'neu') for f in word_split(neudata)]
    alldata = negdata + posdata + neudata
    allfeats = negfeats + posfeats + neufeats

    def _ratio(numerator, denominator):
        """Return numerator / denominator, or 0.0 for a zero denominator."""
        return float(numerator) / float(denominator) if denominator else 0.0

    def _mean(values, count):
        """Return sum(values) / count."""
        return sum(values) / count

    #10-fold cross-validation
    # NOTE(review): these two lists are shared by all classifiers, so the
    # samples printed for a later classifier include those of the earlier
    # ones - confirm that this is intended.
    correct = []
    incorrect = []
    for n in [10]:  #range(2,6):
        negfeatssplit = chunkIt(negfeats, n)
        posfeatssplit = chunkIt(posfeats, n)
        neufeatssplit = chunkIt(neufeats, n)
        splits = {'neg': negfeatssplit, 'pos': posfeatssplit,
                  'neu': neufeatssplit}
        for cl in classifier_list:
            accuracy = []
            precision = dict((label, []) for label in labels)
            recall = dict((label, []) for label in labels)
            fmeasure = dict((label, []) for label in labels)
            # number of predictions per label over all folds
            res = {"neg": 0, "pos": 0, "neu": 0}

            for i in range(n):
                # NOTE(review): the test chunk is i - 1 (round 0 uses the last
                # chunk); gettrainfeat is defined elsewhere - verify that it
                # leaves out that same chunk.
                testing_this_round = negfeatssplit[i - 1] + posfeatssplit[
                    i - 1] + neufeatssplit[i - 1]
                training_this_round = gettrainfeat(
                    negfeatssplit, i) + gettrainfeat(
                        posfeatssplit, i) + gettrainfeat(neufeatssplit, i)

                if cl == 'maxent':
                    classifierName = 'Maximum Entropy'
                    classifier = MaxentClassifier.train(training_this_round,
                                                        'GIS',
                                                        trace=0,
                                                        encoding=None,
                                                        labels=None,
                                                        gaussian_prior_sigma=0,
                                                        max_iter=1)
                elif cl == 'svm':
                    classifierName = 'SVM'
                    classifier = SklearnClassifier(LinearSVC(), sparse=False)
                    classifier.train(training_this_round)
                else:
                    classifierName = 'Naive Bayes'
                    classifier = NaiveBayesClassifier.train(
                        training_this_round)

                # per label: correct predictions / all predictions, this fold
                hits = dict((label, 0) for label in labels)
                predicted = dict((label, 0) for label in labels)
                for feats, label in testing_this_round:
                    observed = classifier.classify(feats)
                    res[observed] = res[observed] + 1
                    predicted[observed] = predicted[observed] + 1
                    if observed == label:
                        correct.append((feats, label))
                        hits[label] = hits[label] + 1
                    else:
                        incorrect.append((feats, label))

                accuracy.append(
                    nltk.classify.util.accuracy(classifier,
                                                testing_this_round))

                for label in labels:
                    # precision: correct / predicted as this label
                    cv_precision = _ratio(hits[label], predicted[label])
                    # recall: correct / test samples that carry this label
                    cv_recall = _ratio(hits[label], len(splits[label][i - 1]))
                    precision[label].append(cv_precision)
                    recall[label].append(cv_recall)
                    fmeasure[label].append(
                        2 * _ratio(cv_precision * cv_recall,
                                   cv_precision + cv_recall))

                print(precision['neg'][-1])

            print("Balance =  %s" % (balance,))
            print('---------------------------------------')
            print(str(n) + '-FOLD CROSS VALIDATION RESULT ' + '(' +
                  classifierName + ')')
            print("Nbr =  %s" % (res,))
            print('accuracy: %s' % (_mean(accuracy, n),))
            for title, scores in (('precision', precision),
                                  ('recall', recall),
                                  ('f-measure', fmeasure)):
                pos_avg = _mean(scores['pos'], n)
                neg_avg = _mean(scores['neg'], n)
                neu_avg = _mean(scores['neu'], n)
                print('%s %s' % (title, (pos_avg + neg_avg + neu_avg) / 3.0))
                print('%s %s %s' % (pos_avg, neg_avg, neu_avg))

            print("*********CORRECT****")
            print(len(correct), len(incorrect))

            for tt in correct:
                print(tt[1], alldata[allfeats.index(tt)])
            print("***INCORRECT**********")
            for tt in incorrect:
                print(tt[1], alldata[allfeats.index(tt)])
            print("...")
示例#29
0
def evaluate_classifier(featx):
    """Score a LinearSVC sentiment model on a hold-out split and with 5-fold CV.

    Every item of ``word_split(negdata)`` and ``word_split(posdata)`` is passed
    through ``featx`` and paired with the label 'neg' or 'pos'.  The first 3/4
    of each class trains the model and the remainder tests it; afterwards all
    examples are shuffled together and cross-validated in five folds.
    Accuracy plus precision, recall and F-measure (each averaged over the two
    labels) are printed.  Nothing is returned.

    :param featx: callable applied to each item of ``word_split(...)``; its
        result is used as the featureset handed to the classifier
    """
    model_name = 'SVM'
    rule = '---------------------------------------'

    def fit(examples):
        """Train a fresh LinearSVC (wrapped for NLTK, dense vectors)."""
        model = SklearnClassifier(LinearSVC(), sparse=False)
        model.train(examples)
        return model

    def measure(model, examples):
        """Return accuracy and per-label precision/recall/F-measure."""
        # label -> positions of the examples carrying / predicted as that label
        gold = collections.defaultdict(set)
        guessed = collections.defaultdict(set)
        for position, (feats, label) in enumerate(examples):
            gold[label].add(position)
            guessed[model.classify(feats)].add(position)

        scores = {'accuracy': nltk.classify.util.accuracy(model, examples)}
        for tag in ('pos', 'neg'):
            scores[tag + '_precision'] = nltk.precision(gold[tag], guessed[tag])
            scores[tag + '_recall'] = nltk.recall(gold[tag], guessed[tag])
            scores[tag + '_fmeasure'] = nltk.f_measure(gold[tag], guessed[tag])
        return scores

    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    ## SINGLE FOLD: first 3/4 of each class trains, the last 1/4 tests

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)
    train_part = negfeats[:negcutoff] + posfeats[:poscutoff]
    test_part = negfeats[negcutoff:] + posfeats[poscutoff:]

    held_out = measure(fit(train_part), test_part)

    print('')
    print(rule)
    print('SINGLE FOLD RESULT ' + '(' + model_name + ')')
    print(rule)
    print('accuracy:', held_out['accuracy'])
    print('precision',
          (held_out['pos_precision'] + held_out['neg_precision']) / 2)
    print('recall', (held_out['pos_recall'] + held_out['neg_recall']) / 2)
    print('f-measure',
          (held_out['pos_fmeasure'] + held_out['neg_fmeasure']) / 2)

    print('')

    ## CROSS VALIDATION

    pool = negfeats + posfeats

    # Shuffle first: without it a test chunk might hold only negative or only
    # positive examples.
    random.shuffle(pool)
    n = 5  # 5-fold cross-validation
    subset_size = int(len(pool) / n)

    per_fold = []
    for fold in range(n):
        start = fold * subset_size
        stop = (fold + 1) * subset_size
        validation = pool[start:][:subset_size]
        fitting = pool[:start] + pool[stop:]
        per_fold.append(measure(fit(fitting), validation))

    def mean(key):
        """Average one metric over the n folds."""
        return sum(scores[key] for scores in per_fold) / n

    print(rule)
    print('N-FOLD CROSS VALIDATION RESULT ' + '(' + model_name + ')')
    print(rule)
    print('accuracy:', mean('accuracy'))
    print('precision', (mean('pos_precision') + mean('neg_precision')) / 2)
    print('recall', (mean('pos_recall') + mean('neg_recall')) / 2)
    print('f-measure', (mean('pos_fmeasure') + mean('neg_fmeasure')) / 2)
    print('')
示例#30
0
def evaluate_classifier(featx):

    #negfeats = [(featx(mark_negation(f)), 'neg') for f in word_split(negdata)]
    #posfeats = [(featx(mark_negation(f)), 'pos') for f in word_split(posdata)]
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    #print negfeats[1:25]
    #raw_input('>')
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]
    negcutoff = len(negfeats) * 3 / 4
    poscutoff = len(posfeats) * 3 / 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    print "No of training reviews:", len(trainfeats)
    #print trainfeats
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    print "No of testing reviews:", len(testfeats)

    # using 3 classifiers
    classifier_list = ['nb', 'svm', 'maxent']  #
    NB_pred = []
    new_label = []
    for cl in classifier_list:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(trainfeats,
                                                'GIS',
                                                trace=0,
                                                encoding=None,
                                                labels=None,
                                                gaussian_prior_sigma=0,
                                                max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        else:
            classifierName = 'Naive Bayes'
            classifier = NaiveBayesClassifier.train(trainfeats)

        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        original_label = []

        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            original_label.append(label)
            #print feats
            #raw_input('> ')
            observed = classifier.classify(feats)
            NB_pred.append(observed)

            testsets[observed].add(i)

        #print refsets['pos']
        #print testsets['pos']
        #print original_label
        #print NB_Pred
        #cm = confusion_matrix(original_label,NB_pred)
        #print cm
        #print "The accuracy score is {:.2%}".format(accuracy_score(original_label,NB_pred))
        new_label = original_label
        accuracy = nltk.classify.util.accuracy(classifier, testfeats)
        pos_precision = nltk.precision(refsets['pos'], testsets['pos'])
        pos_recall = nltk.recall(refsets['pos'], testsets['pos'])
        pos_fmeasure = nltk.f_measure(refsets['pos'], testsets['pos'])
        neg_precision = nltk.precision(refsets['neg'], testsets['neg'])
        neg_recall = nltk.recall(refsets['neg'], testsets['neg'])
        neg_fmeasure = nltk.f_measure(refsets['neg'], testsets['neg'])

        print('')
        print('---------------------------------------')
        print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', accuracy)
        print('precision', (pos_precision + neg_precision) / 2)
        print('recall', (pos_recall + neg_recall) / 2)
        print('f-measure', (pos_fmeasure + neg_fmeasure) / 2)

        #classifier.show_most_informative_features(50)

    print('')

    #print len(NB_pred)

    ME_pred = NB_pred[982:]
    SVM_pred = NB_pred[491:982]
    NB_pred = NB_pred[0:491]
    #print NB_pred
    #print "-----------------------"
    #print ME_pred
    #print "-----------------------"
    #print SVM_pred
    #print "-----------------------"
    #cm = confusion_matrix(SVM_pred,NB_pred)
    #print cm
    #print "The accuracy score is {:.2%}".format(accuracy_score(SVM_pred,NB_pred))
    #cm = confusion_matrix(ME_pred,NB_pred)
    #print cm
    #print "The accuracy score is {:.2%}".format(accuracy_score(ME_pred,NB_pred))
    #cm = confusion_matrix(SVM_pred,ME_pred)
    #print cm
    #print "The accuracy score is {:.2%}".format(accuracy_score(SVM_pred,ME_pred))

    final_pred = []
    for i in range(0, 491):
        c1 = 0
        if NB_pred[i] == 'pos':
            c1 = c1 + 1
        if ME_pred[i] == 'pos':
            c1 = c1 + 1
        if SVM_pred[i] == 'pos':
            c1 = c1 + 1
        #print i
        if c1 == 3 or c1 == 2:
            final_pred.append('pos')
        else:
            final_pred.append('neg')

    print "-----------------------"
    #print final_pred
    print "-----------------------"
    #print new_label

    print "Results of ensemble: NB + SVM + ME::"
    print "----------Confusion Matrix--------------"
    cm = confusion_matrix(final_pred, new_label)
    print cm
    print ""
    print "The accuracy score of ensemble is {:.2%}".format(
        accuracy_score(final_pred, new_label))
    print "##############################################"

    ## CROSS VALIDATION

    trainfeats = negfeats + posfeats

    # SHUFFLE TRAIN SET
    # As in cross validation, the test chunk might have only negative or only positive data
    random.shuffle(trainfeats)
    n = 5  # 5-fold cross-validation

    for cl in classifier_list:

        subset_size = len(trainfeats) / n
        accuracy = []
        pos_precision = []
        pos_recall = []
        neg_precision = []
        neg_recall = []
        pos_fmeasure = []
        neg_fmeasure = []
        cv_count = 1
        for i in range(n):
            testing_this_round = trainfeats[i * subset_size:][:subset_size]
            training_this_round = trainfeats[:i * subset_size] + trainfeats[
                (i + 1) * subset_size:]

            if cl == 'maxent':
                classifierName = 'Maximum Entropy'
                classifier = MaxentClassifier.train(training_this_round,
                                                    'GIS',
                                                    trace=0,
                                                    encoding=None,
                                                    labels=None,
                                                    gaussian_prior_sigma=0,
                                                    max_iter=1)
            elif cl == 'svm':
                classifierName = 'SVM'
                classifier = SklearnClassifier(LinearSVC(), sparse=False)
                classifier.train(training_this_round)
            else:
                classifierName = 'Naive Bayes'
                classifier = NaiveBayesClassifier.train(training_this_round)

            refsets = collections.defaultdict(set)
            testsets = collections.defaultdict(set)
            for i, (feats, label) in enumerate(testing_this_round):
                refsets[label].add(i)
                observed = classifier.classify(feats)
                testsets[observed].add(i)

            cv_accuracy = nltk.classify.util.accuracy(classifier,
                                                      testing_this_round)
            cv_pos_precision = nltk.precision(refsets['pos'], testsets['pos'])
            cv_pos_recall = nltk.recall(refsets['pos'], testsets['pos'])
            cv_pos_fmeasure = nltk.f_measure(refsets['pos'], testsets['pos'])
            cv_neg_precision = nltk.precision(refsets['neg'], testsets['neg'])
            cv_neg_recall = nltk.recall(refsets['neg'], testsets['neg'])
            cv_neg_fmeasure = nltk.f_measure(refsets['neg'], testsets['neg'])

            accuracy.append(cv_accuracy)
            pos_precision.append(cv_pos_precision)
            pos_recall.append(cv_pos_recall)
            neg_precision.append(cv_neg_precision)
            neg_recall.append(cv_neg_recall)
            pos_fmeasure.append(cv_pos_fmeasure)
            neg_fmeasure.append(cv_neg_fmeasure)

            cv_count += 1

        print('---------------------------------------')
        print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', sum(accuracy) / n)
        print('precision',
              (sum(pos_precision) / n + sum(neg_precision) / n) / 2)
        print('recall', (sum(pos_recall) / n + sum(neg_recall) / n) / 2)
        print('f-measure', (sum(pos_fmeasure) / n + sum(neg_fmeasure) / n) / 2)
        if cl == 'maxent':
            maxent_accuracy_next = (sum(accuracy) / n)
            maxent_accuracy.append(maxent_accuracy_next)
        elif cl == 'svm':
            svm_accuracy_next = (sum(accuracy) / n)
            svm_accuracy.append(svm_accuracy_next)
        else:
            nb_accuracy_next = (sum(accuracy) / n)
            nb_accuracy.append(nb_accuracy_next)

# Polynomial-kernel SVM trained on train_set and scored on test_set.
nonlinear_svm = SklearnClassifier(
    SVC(
        C=5.0,
        kernel='poly',
        degree=5,
        gamma='scale',
        coef0=5.0,
        shrinking=True,
        probability=False,
        tol=1e-3,
    ),
    sparse=False,
)
nonlinear_svm.train(train_set)
print("Accuracy - Nonlinear SVM: ")
print(nltk.classify.accuracy(nonlinear_svm, test_set))


# Random forest trained and scored on the same split.
random_forest = SklearnClassifier(
    RandomForestClassifier(
        n_estimators=100,
        criterion='gini',
        max_depth=5,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=25,
        max_leaf_nodes=20,
        min_impurity_decrease=0.0,
        bootstrap=True,
        oob_score=False,
        random_state=None,
    ),
    sparse=False,
).train(train_set)
print("Accuracy - Random Forest Classifier: ")
print(nltk.classify.accuracy(random_forest, test_set))


# Classify one sample tweet with the linear and the nonlinear SVM.
# NOTE(review): the two calls pass different `raw` flags to
# extract_features_of_tweet -- confirm this is intended.
test_tweet = "75% of illegal Aliens commit Felons such as ID, SSN and Welfare Theft Illegal #Immigration is not a Victimless Crime !"
# print(naive_bayes.classify(extract_features_of_tweet(test_tweet, raw=True)))
# print(maxent.classify(extract_features_of_tweet(test_tweet, raw=True)))
print(
    linear_svm_classifier.classify(
        extract_features_of_tweet(test_tweet, raw=False)))
print(
    nonlinear_svm.classify(
        extract_features_of_tweet(test_tweet, raw=True)))
示例#32
0
class MyClassifier:
    """SVC-based text classifier with the two labels "VER" and "NVER".

    A text is POS-tagged and turned into a vector of counts, one entry per
    feature listed in ``verifiability_features.txt`` (features are compared
    against the POS tags of the tokens).  The vectors are handed, as feature
    dicts, to an NLTK ``SklearnClassifier`` wrapping scikit-learn's ``SVC``.
    """

    def __init__(self):
        # Feature names, one per line of verifiability_features.txt.  The
        # loader is a private (name-mangled) method, so it must be called
        # with its double-underscore name.
        self.features = self.__get_support_vector_features()
        # (feature dict, label) tuples consumed by train_with_SVC().
        self.training_data = []
        self.n_samples = 0
        # (tweet_text, tweet_id) tuples read from the newest training file.
        self.tweets_from_file = self.__get_tweets_from_file()
        # Trained SklearnClassifier; None until train_with_SVC() has run.
        self.clf = None

    def __get_tweets_from_file(self):
        """
        Reads the tweets of the latest training data set file.

        The file with the greatest os.path.getctime among
        datasets_twitter/twitter_training_data_set*.txt is used.  Each
        non-empty line must look like "<text>%\\t%<id>".

        :return: list of (tweet_text, tweet_id) tuples
        :raises ValueError: when no file matches the pattern
        :raises IndexError: when a non-empty line lacks the "%\\t%" separator
        """
        list_of_files = glob.glob(
            "datasets_twitter/twitter_training_data_set*.txt")
        if not list_of_files:
            raise ValueError(
                "no file matches "
                "datasets_twitter/twitter_training_data_set*.txt")
        latest_file = max(list_of_files, key=os.path.getctime)

        tweet_list = []
        with open(latest_file, "r") as f:
            for line in f:
                line = line.rstrip("\n")  # keep the newline out of the id
                if not line:
                    continue
                fields = line.split("%\t%")
                # list.append takes one argument: store the pair as a tuple
                tweet_list.append((fields[0], fields[1]))

        return tweet_list

    def __get_support_vector_features(self):
        """
        Reads the feature names from verifiability_features.txt.

        :return: list of str, one per line of the file, newlines removed
        """
        with open("verifiability_features.txt", "r") as feature_f:
            return [line_f.replace("\n", "") for line_f in feature_f]

    def __get_sample(self, text_str):
        """
        Changes the text_str into a sample of data in the form of [0, 0, 0, ...]
        This is to be used by the classifier, when
            1) Assembling Training Data, and
            2) Testing data.
        It returns a list of int, which is basically a count of how many of each feature existed in text_str.

        The last feature is special: a token whose POS tag equals it is only
        counted when the token text is "?", and then it is counted as -1.
        When a feature name occurs more than once in self.features, only its
        first position is ever updated.

        :param text_str: a string of text which is to be verified
        :return: curr_sample, a list of int, sort of mapped to self.features
        """
        tokens = pos_tag(word_tokenize(text_str))

        curr_sample = [0] * len(
            self.features)  # list of n_features of 0s ex. [0, 0, 0, ..]

        # feature name -> first position of that name in self.features
        first_index = {}
        for index, feature in enumerate(self.features):
            first_index.setdefault(feature, index)
        last_feature = self.features[-1] if self.features else None

        for t_text, t_feature in tokens:
            index = first_index.get(t_feature)
            if index is None:
                # the tag isn't one of the features
                continue

            if t_feature == last_feature:
                # checking if there is a "?" in the text
                if t_text == "?":
                    curr_sample[index] -= 1
            else:
                curr_sample[index] += 1

        return curr_sample

    def __get_training_target(self, sample):
        """
        Returns the label depending on the sample given.

        :param sample: int[] from self.__get_sample()
        :return: "VER" or "NVER", representing the two labels Verifiable and Non-Verifiable
        """
        # A negative entry can only come from a "?" in the sample text, which
        # makes the text non-verifiable regardless of the other counts.
        if any(v < 0 for v in sample):
            return "NVER"

        return "VER" if sum(sample) > 0 else "NVER"

    def assemble_training_data(self):
        """Placeholder; not implemented yet."""
        pass

    def train_with_SVC(self):
        """Trains a fresh SVC on self.training_data and stores it in self.clf."""
        self.clf = SklearnClassifier(SVC(), sparse=False)
        self.clf.train(self.training_data)

    def predict_single(self, test_text):
        """
        Classifies one text, prints the prediction and asks for feedback.

        The sample is appended to self.training_data with the predicted
        label when the user confirms it ("Y"/"y"), otherwise with the
        opposite label.

        :param test_text: a string of text which is to be verified
        """
        test_sample = self.__get_sample(test_text)
        test_dict = dict(zip(self.features, test_sample))

        # SklearnClassifier exposes classify()/classify_many(), not predict()
        pred = self.clf.classify(test_dict)
        print("Prediction:", pred)
        feedback = input("Is this prediction correct? Y/N")

        # Make data + target into a tuple
        if feedback in ("Y", "y"):
            tup = (test_dict, pred)
        elif pred == "VER":
            # correct the target and make into a tuple
            tup = (test_dict, "NVER")
        else:
            tup = (test_dict, "VER")

        # add tuple to training data
        self.training_data.append(tup)

    def predict_multiple(self, test_list):
        """
        Classifies several texts, prints the predictions and asks for feedback.

        The samples are appended to self.training_data with their predicted
        labels only when the user confirms all of them ("Y"/"y").

        :param test_list: iterable of strings of text which are to be verified
        """
        test_data = [
            dict(zip(self.features, self.__get_sample(text)))
            for text in test_list
        ]

        # classify_many() takes the list of feature dicts itself
        pred = self.clf.classify_many(test_data)
        print("Prediction:", pred)

        feedback = input("Are these predictions correct? Y/N")

        if feedback in ("Y", "y"):
            # Make each data + target pair into a tuple and add it to the
            # training data
            self.training_data.extend(zip(test_data, pred))
        else:
            # must correct test data manually before adding into training data
            print(
                "Please predict each separately to add samples into training dataset."
            )
示例#33
0
## split the training sets into training and validation sets
training_set, validation_set = train_test_split(features_sets, test_size=0.2)


###create all the classifier we gonna use
classifier = nltk.NaiveBayesClassifier.train(training_set)
print "Original Naive Bayes Algo accuracy:", nltk.classify.accuracy(classifier, validation_set)*100
classifier.show_most_informative_features(15)
## save the classifier
save_classifier = open("pickled_algos/naivebayes5k.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
# print "MNB_classifier accuracy:", nltk.classify.accuracy(MNB_classifier, testing_set)*100
# MNB_classifier.show_most_informative_features(15)   ##show which features are most distinctive
save_classifier = open("pickled_algos/MNB_5k.pickle", "wb")
pickle.dump(MNB_classifier, save_classifier)
save_classifier.close()


BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
# print "BernoulliNB_classifier accuracy:", nltk.classify.accuracy(BernoulliNB_classifier, testing_set)*100
save_classifier = open("pickled_algos/BernoulliNB_5k.pickle", "wb")
pickle.dump(BernoulliNB_classifier, save_classifier)
save_classifier.close()
#
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
示例#34
0
 def train(self, features_label):
     """Fit an SVC (C=1000.0, gamma=0.0001) wrapped in an NLTK SklearnClassifier.

     The trained wrapper is stored on ``self._classifier``; returns None.

     :param features_label: training data passed unchanged to
         ``SklearnClassifier.train``
     """
     estimator = SVC(C=1000.0, gamma=0.0001)
     self._classifier = SklearnClassifier(estimator).train(features_label)
示例#35
0
random.shuffle(rawData)
testData = rawData[:size]
trainData = rawData[size:]
random.shuffle(trainData)

# Generate TermFrequency for each doc
trainTF = [(FreqDist(tokenize(text)), tag) for text, tag in trainData]
testTF = [(FreqDist(tokenize(text)), tag) for text, tag in testData]

# Create classifier
pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k=1000)),
                     ('nb', MultinomialNB())])
classif = SklearnClassifier(pipeline)
# Train classifier
classif.train(trainTF)


# Evaluate
testTags = [tag for tf, tag in testTF]
testResults = classif.batch_classify([tf for tf, tag in testTF])

right = 0
for i, tg in enumerate(testTags):
    if testResults[i] == tg:
        right += 1

print 'Results: ------------------------------------'
print testResults
print 'Accuracy:', right / float(len(testTags))
print '---------------------------------------------'
示例#36
0
def _load_or_train_classifier(kind, training_feats, pkl_path):
    """Return the classifier cached at ``pkl_path``, training and caching it if absent.

    ``kind`` 'svm' selects a LinearSVC wrapped in SklearnClassifier; anything
    else selects NLTK's NaiveBayesClassifier.
    """
    if os.path.exists('./%s' % pkl_path):
        # NOTE: unpickling runs arbitrary code; acceptable only because these
        # cache files are written by this script itself.  A cached model is
        # NOT retrained when the data changes -- delete pkl/ to refresh.
        with open(pkl_path, 'rb') as cache:
            return pickle.load(cache)

    if kind == 'svm':
        classifier = SklearnClassifier(LinearSVC(), sparse=False)
        classifier.train(training_feats)
    else:
        classifier = NaiveBayesClassifier.train(training_feats)
    with open(pkl_path, 'wb') as cache:
        pickle.dump(classifier, cache)
    return classifier


def _binary_sentiment_scores(classifier, feats):
    """Return accuracy and per-label precision/recall/F-measure on ``feats``."""
    # label -> positions of the examples carrying / predicted as that label
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for idx, (featureset, label) in enumerate(feats):
        refsets[label].add(idx)
        testsets[classifier.classify(featureset)].add(idx)

    scores = {'accuracy': nltk.classify.util.accuracy(classifier, feats)}
    for short, label in (('pos', 'positive'), ('neg', 'negative')):
        scores[short + '_precision'] = precision(refsets[label], testsets[label])
        scores[short + '_recall'] = recall(refsets[label], testsets[label])
        scores[short + '_fmeasure'] = f_measure(refsets[label], testsets[label])
    return scores


def evaluate_classifier(data):
    """Evaluate Naive Bayes and SVM sentiment classifiers on ``data``.

    ``data`` is split 70/30 (fixed random_state=0).  Each classifier is scored
    once on the 30% test part, then 5-fold cross-validated on the 70% training
    part.  Results are printed; nothing is returned.

    Trained models are cached as pickles under ``pkl/`` (the directory must
    exist).  Every cross-validation fold gets its own cache file: sharing one
    file across folds would score folds 1..n-1 with the fold-0 model, which
    was trained on their test data.

    :param data: labeled featuresets, i.e. (featureset, label) pairs with the
        labels 'positive' / 'negative'
    """
    trainfeats, testfeats = train_test_split(data, test_size=0.3, random_state=0)

    # using 2 classifiers
    classifier_list = ['nb', 'svm']
    classifier_dict = {'nb': 'Naive Bayes', 'svm': 'SVM'}
    rule = '---------------------------------------'

    for cl in classifier_list:
        classifierPkl = os.path.join('pkl', cl + ".pkl")
        classifier = _load_or_train_classifier(cl, trainfeats, classifierPkl)
        scores = _binary_sentiment_scores(classifier, testfeats)

        print('')
        print(rule)
        print('SINGLE FOLD RESULT ' + '(' + classifier_dict[cl] + ')')
        print(rule)
        print('accuracy:', scores['accuracy'])
        print('precision', (scores['pos_precision'] + scores['neg_precision']) / 2)
        print('recall', (scores['pos_recall'] + scores['neg_recall']) / 2)
        print('f-measure', (scores['pos_fmeasure'] + scores['neg_fmeasure']) / 2)

    print('')

    n = 5  # 5-fold cross-validation
    subset_size = len(trainfeats) // n

    for cl in classifier_list:
        fold_scores = collections.defaultdict(list)
        for fold in range(n):
            testing_this_round = trainfeats[fold * subset_size:][:subset_size]
            training_this_round = (trainfeats[:fold * subset_size] +
                                   trainfeats[(fold + 1) * subset_size:])
            # one cache file per fold, so no fold reuses another fold's model
            classifierPkl = os.path.join('pkl', '%s_cv_%d.pkl' % (cl, fold))
            classifier = _load_or_train_classifier(cl, training_this_round,
                                                   classifierPkl)
            scores = _binary_sentiment_scores(classifier, testing_this_round)
            for key in scores:
                fold_scores[key].append(scores[key])

        print(rule)
        print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifier_dict[cl] + ')')
        print(rule)
        print('accuracy:', sum(fold_scores['accuracy']) / n)
        print('precision', (sum(fold_scores['pos_precision']) / n +
                            sum(fold_scores['neg_precision']) / n) / 2)
        print('recall', (sum(fold_scores['pos_recall']) / n +
                         sum(fold_scores['neg_recall']) / n) / 2)
        print('f-measure', (sum(fold_scores['pos_fmeasure']) / n +
                            sum(fold_scores['neg_fmeasure']) / n) / 2)
        print('')
示例#37
0
# https://pythonprogramming.net/sklearn-scikit-learn-nltk-tutorial/
import nltk  # needed for nltk.classify.accuracy below
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk.classify import SklearnClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


def _to_labeled_featuresets(frame):
    """Turn the 'text'/'sentiment' columns into NLTK (featureset, label) pairs.

    SklearnClassifier.train and nltk.classify.accuracy need a list of
    (feature dict, label) tuples and cannot consume a DataFrame.  The
    featureset used here is a minimal bag of words: every lower-cased,
    whitespace-separated token of the text maps to True.
    NOTE(review): replace with the project's real feature extractor if any.
    """
    return [({token: True for token in str(text).lower().split()}, label)
            for text, label in zip(frame['text'], frame['sentiment'])]


data = pd.read_csv('reviews.csv')

print(data[['text', 'sentiment']])
data = data[['text', 'sentiment']]

# 90% of the labeled featuresets train the model, 10% test it
training_set, testing_set = train_test_split(_to_labeled_featuresets(data),
                                             test_size=0.1)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:",
      (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) *
      100)
示例#38
0
print "Accuracy :"
accuracy = nltk.classify.util.accuracy(classifier, test_set)
print(accuracy * 100)

print "Building model : Naive Bayes"
classifier = NaiveBayesClassifier.train(train_set)

import pickle
f = open('naive_classifier.pickle', 'wb')
pickle.dump(classifier, f)
f.close()

print "Accuracy :"
accuracy = nltk.classify.util.accuracy(classifier, test_set)
print(accuracy * 100)

print "Building model : SVM"
classifier = SklearnClassifier(LinearSVC(), sparse=True)
classifier.train(train_set)

import pickle
f = open('svm_classifier.pickle', 'wb')
pickle.dump(classifier, f)
f.close()

print "Accuracy :"
accuracy = nltk.classify.util.accuracy(classifier, test_set)
print(accuracy * 100)

print "model saved"
            for f in word_split(posdata)]
# The first 10/11 of each class trains the model, the last 1/11 tests it.
negcutoff = int(len(negfeats) * 10 / 11)
poscutoff = int(len(posfeats) * 10 / 11)
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

########################################################################################
########################################################################################

# label -> positions of the test examples carrying / predicted as that label
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

# Deliberately not named `str`: rebinding the builtin at module level breaks
# every later str(...) call.
result_title = 'SINGLE FOLD RESULT ' + '(' + 'linear-svc' + ')'
#training with LinearSVC
classifier = SklearnClassifier(LinearSVC())
classifier.train(trainfeats)
for i, (feats, label) in enumerate(testfeats):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)
accuracy = nltk.classify.util.accuracy(classifier, testfeats) * 100  # percent
pos_precision = precision(refsets['pos'], testsets['pos'])

pos_recall = recall(refsets['pos'], testsets['pos'])
pos_fmeasure = f_measure(refsets['pos'], testsets['pos'])
neg_precision = precision(refsets['neg'], testsets['neg'])
neg_recall = recall(refsets['neg'], testsets['neg'])
neg_fmeasure = f_measure(refsets['neg'], testsets['neg'])
print('')
print('---------------------------------------')
print(result_title)
示例#40
0
# just break on gaps -- note that this doesn't filter out punctuation
tokenizer = RegexpTokenizer('[\w\d]+')

training_set = []
with open('train_jlm.csv', 'rb') as f:
    reader = csv.reader(f)
    for row in reader:
        if row[0] != 'OrganisationId': # header
            words = tokenizer.tokenize(row[1])
            if row[4] == 'Academic':
                training_set.append((words, 'academic'))
            else:
                training_set.append((words, 'private'))

classif.train([(FreqDist(words), label) for (words, label) in training_set])
training_set_classification = classif.batch_classify(
    [FreqDist(words) for (words, label) in training_set])

print "training set score:", sum([training_set[i][1] == training_set_classification[i] for i in range(len(training_set))]), '/', len(training_set)

training_set_prob_classification = classif.batch_prob_classify(
    [FreqDist(words) for (words, label) in training_set])

full_set = []
with open('organisations.csv', 'rb') as f:
    reader = csv.reader(f)
    for row in reader:
        if row[0] != 'OrganisationId': # header
            organisation_ids = row[0]
            words = tokenizer.tokenize(row[1])