Пример #1
0
class SKClassifier:
    """Thin wrapper around NLTK's ``SklearnClassifier`` chosen by name.

    ``cls`` selects the scikit-learn estimator: ``'SVC'``,
    ``'LogisticRegression'`` or ``'BernoulliNB'``.  Any other value falls
    back to ``SVC``.
    """

    classifier = None

    def __init__(self, cls='SVC'):
        # Map names to classes (not instances) so only the requested
        # estimator is constructed.
        estimators = {
            'SVC': SVC,
            'LogisticRegression': LogisticRegression,
            'BernoulliNB': BernoulliNB,
        }
        # .get() makes the SVC fallback reachable; a plain [] lookup raised
        # KeyError before the fallback could ever run.
        self.classifier = SklearnClassifier(estimators.get(cls, SVC)())

    def train(self, trainset):
        """Train the wrapped classifier on labelled feature sets."""
        self.classifier.train(trainset)

    def test(self, tagged, featuresets):
        """Classify ``featuresets``, print the predictions and return the
        ``accuracy_score`` against the labels in ``tagged``."""
        predict = self.classifier.classify_many(featuresets)
        print(predict)
        return accuracy_score(tagged, predict)

    def classify(self, featureset):
        """Return the predicted label for a single feature set."""
        return self.classifier.classify(featureset)

    def classify_many(self, featuresets):
        """Return the predicted labels for several feature sets."""
        return self.classifier.classify_many(featuresets)
Пример #2
0
def classifier_for_lemma(lemma, filenames):
    """Load the training data for ``lemma`` and train a classifier on it.

    ``filenames`` supplies the ``bitextfn``, ``alignfn`` and ``annotatedfn``
    attributes passed to ``load_training_for_word``.  At most 20,000
    instances are used.

    Returns the trained ``SklearnClassifier``, or ``None`` when there are
    no instances or fewer than two distinct labels.
    """
    max_instances = 20 * 1000

    # XXX: always doing non-null and Random Forest for initial version
    model = SklearnClassifier(RandomForestClassifier(), sparse=False)
    print("loading training data for", lemma)
    load_training_for_word(lemma, filenames.bitextfn, filenames.alignfn,
                           filenames.annotatedfn)

    instances = trainingdata.trainingdata_for(lemma, nonnull=True)
    print("got {0} instances for {1}".format(len(instances), lemma))

    # The instances are all that is needed from here on; drop the sentences.
    trainingdata.set_examples([], [])
    trainingdata.set_sl_annotated([])
    gc.collect()

    if len(instances) > max_instances:
        print("capping to 20k instances to fit in memory")
        instances = instances[:max_instances]

    distinct_labels = set(label for (_, label) in instances)
    print("loaded training data for", lemma)
    if not instances or len(distinct_labels) < 2:
        return None
    model.train(instances)
    return model
Пример #3
0
def trainClassifiers(tweets):
    """Train a Naive Bayes and a linear SVC classifier on ``tweets``.

    The Naive Bayes classifier is also pickled (protocol 1) to
    ``data/trained_classifiers/NBClassifier.pickle``.

    Returns:
        The tuple ``(training_set, NBClassifier, linear_SVC_classifier)``.
    """
    # Generate the training set
    training_set = nltk.classify.util.apply_features(extract_features, tweets)
    print("Training set created!")

    # Train and save the Naive Bayes classifier to a file
    NBClassifier = nltk.NaiveBayesClassifier.train(training_set)
    # The context manager closes the file even when pickling raises.
    with open('data/trained_classifiers/NBClassifier.pickle', 'wb') as f:
        pickle.dump(NBClassifier, f, 1)
    print("NBClassifier Classifier Trained")

    # Train linear SVC
    linear_SVC_classifier = SklearnClassifier(LinearSVC())
    linear_SVC_classifier.train(training_set)

    # Train Max Entropy Classifier
    # MaxEntClassifier = nltk.classify.maxent.MaxentClassifier.train(training_set, 'IIS', trace=2, \
    #                        encoding=None, labels=None, sparse=True, gaussian_prior_sigma=0, max_iter = 5)
    # f = open('data/trained_classifiers/MaxEntClassifier.pickle', 'wb')
    # pickle.dump(MaxEntClassifier, f, 1)
    # f.close()
    # print("MaxEntClassifier Classifier Trained")

    # return (training_set, NBClassifier, MaxEntClassifier)
    return (training_set, NBClassifier, linear_SVC_classifier)
Пример #4
0
def learn_model(data,target):
    """Train a multinomial Naive Bayes classifier on word-presence features.

    Each text is segmented with ``jieba.cut`` and reduced to the words
    returned by ``best_of_words``.  A held-out part of the data is
    classified and handed to ``evaluate_model``.

    Returns:
        The tuple ``(classifier, bestwords)``.
    """
    bestwords = best_of_words(data, target)
    # Split validation: test_size=0.1 holds out 10% for testing, 90% trains.
    data_train, data_test, target_train, target_test = cross_validation.train_test_split(
        data, target, test_size=0.1, random_state=43)

    def word_features(text):
        # Presence features for the selected words only.
        return {word: True
                for word in jieba.cut(text, cut_all=False)
                if word in bestwords}

    # Pair texts with labels by position instead of indexing by counter.
    train_feature = [[word_features(text), label]
                     for text, label in zip(data_train, target_train)]
    test_feature = [word_features(text) for text in data_test]

    classifier = SklearnClassifier(MultinomialNB())
    classifier.train(train_feature)

    predicted = classifier.classify_many(test_feature)

    evaluate_model(target_test,predicted)

    return classifier, bestwords
def clf_score(classifier):
    """Train ``classifier`` on the global ``train_set`` and score it.

    The estimator is wrapped in an NLTK ``SklearnClassifier``; the return
    value is the ``accuracy_score`` of its predictions for the global
    ``test`` features against ``tag_test``.
    """
    wrapped = SklearnClassifier(classifier)
    wrapped.train(train_set)
    predicted_labels = wrapped.classify_many(test)
    return accuracy_score(tag_test, predicted_labels)
Пример #6
0
def sentiment_classifier(debug):
	"""Train a linear SVC on the tweets stored in ``training.csv``.

	Every ``TweetText`` value is passed through ``tweet_to_words`` and
	paired with the ``Sentiment`` value of the same row.  When ``debug`` is
	true a progress line is printed every 1000 tweets.

	Returns the trained ``SklearnClassifier``.
	"""
	train = pd.read_csv( 'training.csv', delimiter=',', quotechar='"', escapechar='\\',header=0 )
	tweet_texts = train['TweetText']
	sentiments = train['Sentiment']
	num_tweets = tweet_texts.size

	cleantweets = []
	for position in xrange(0, num_tweets):
		tweet_number = position + 1
		if debug and tweet_number % 1000 == 0:
			print("Tweet %d of %d\n" % ( tweet_number, num_tweets ))
		features = tweet_to_words(tweet_texts[position])
		cleantweets.append((features, sentiments[position]))

	# An earlier CountVectorizer + RandomForestClassifier variant was
	# replaced by the linear SVC below.
	classif = SklearnClassifier(LinearSVC())
	classif.train(cleantweets)

	return classif
def score(classifier):
    """Return the accuracy of ``classifier`` after training it.

    The estimator is wrapped in a ``SklearnClassifier``, trained on the
    global ``trainset`` and evaluated on the global ``test`` features
    against the labels in ``tag_test``.
    """
    model = SklearnClassifier(classifier)
    model.train(trainset)

    predictions = model.classify_many(test)
    return accuracy_score(tag_test, predictions)
Пример #8
0
def buildclassifiers(featureslist, SAMPLE_PROPORTION, n):
	"""Train each model ``n`` times on reshuffled data and report the results.

	For every model name the feature list is shuffled in place, split with
	``buildsets`` and the classifier is scored with ``evaluate``.  The
	per-run measures are summed element-wise and passed to
	``printperformance`` together with ``n``.

	Args:
		featureslist: labelled feature sets; shuffled in place.
		SAMPLE_PROPORTION: forwarded to ``buildsets``.
		n: number of runs per model; must be at least 1.

	Returns:
		A list with the last classifier trained for each model.

	Raises:
		ValueError: if ``n`` is smaller than 1.
	"""
	# With no runs there would be neither a classifier nor any measures.
	if n < 1:
		raise ValueError("n must be at least 1")

	classnames = ['Naive Bayes', 'Logistic Regression', 'Linear SCV']
	allclassifiers = []
	for name in classnames:
		perfmeasures_n = None
		for i in range(n):
			random.shuffle(featureslist)
			train_set, test_set = buildsets(featureslist, SAMPLE_PROPORTION)

			if name == 'Naive Bayes':
				spamclassifier = NaiveBayesClassifier.train(train_set)
			elif name == 'Logistic Regression':
				spamclassifier = SklearnClassifier(LogisticRegression())
				spamclassifier.train(train_set)
			elif name == 'Linear SCV':
				spamclassifier = SklearnClassifier(LinearSVC(C=0.01))
				spamclassifier.train(train_set)
			perfmeasures_i = evaluate(train_set, test_set, spamclassifier, name)
			if perfmeasures_n is None:
				perfmeasures_n = perfmeasures_i
			else:
				# Build a real list: on Python 3 map() is a lazy iterator
				# that cannot be indexed and chains one layer per run.
				perfmeasures_n = [add(total, current)
						for total, current in zip(perfmeasures_n, perfmeasures_i)]

		# Store last classifier built per model
		allclassifiers.append(spamclassifier)

		# Print performance measures per classifier
		printperformance(name, perfmeasures_n, n)

	return allclassifiers
Пример #9
0
def evaluate(train_qs, test_qs, params, d):

    data = [train_qs, test_qs]
    (W, b, W2, b2, W3, b3, L) = params

    train_feats = []
    test_feats = []

    for tt, split in enumerate(data):

        for qs, ans in split:

            prev_qs = zeros((d, 1))
            prev_sum = zeros((d, 1))
            count = 0.
            history = []

            for dist in qs:

                sent = qs[dist]

                # input is average of all nouns in sentence
                # av = average(L[:, sent], axis=1).reshape((d, 1))
                history += sent
                prev_sum += sum(L[:, sent], axis=1).reshape((d, 1))
                if len(history) == 0:
                    av = zeros((d, 1))
                else:
                    av = prev_sum / len(history)

                # apply non-linearity
                p = relu(W.dot(av) + b)
                p2 = relu(W2.dot(p) + b2)
                p3 = relu(W3.dot(p2) + b3)

                curr_feats = {}
                for dim, val in ndenumerate(p3):
                    curr_feats['__' + str(dim)] = val

                if tt == 0:
                    train_feats.append( (curr_feats, ans[0]) )

                else:
                    test_feats.append( (curr_feats, ans[0]) )

    print 'total training instances:', len(train_feats)
    print 'total testing instances:', len(test_feats)
    random.shuffle(train_feats)

    # can modify this classifier / do grid search on regularization parameter using sklearn
    classifier = SklearnClassifier(LogisticRegression(C=10))
    classifier.train(train_feats)

    print 'accuracy train:', nltk.classify.util.accuracy(classifier, train_feats)
    print 'accuracy test:', nltk.classify.util.accuracy(classifier, test_feats)
    print ''

    print 'dumping classifier'
    cPickle.dump(classifier, open('data/deep/classifier', 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)
def evaluate(classifier_alo):
    """Train ``classifier_alo`` and print its 'pos'/'neg' scores.

    The estimator is trained on the global ``trainFeatures`` and evaluated
    on the global ``testFeatures``.  One line is printed with six values
    formatted to three decimals: positive precision and recall, negative
    precision and recall, positive F1 and negative F1.  Undefined values
    are reported as 0.
    """
    classifier = SklearnClassifier(classifier_alo)  # use the scikit-learn interface from within NLTK
    classifier.train(trainFeatures)  # train the classifier

    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, item in enumerate(testFeatures):
        referenceSets[item[1]].add(i)
        predicted = classifier.classify(item[0])
        testSets[predicted].add(i)

    def _as_float(value):
        # nltk.metrics returns None when a score is undefined (empty set).
        return 0.0 if value is None else float(value)

    def _f1(prec, rec):
        # Avoid ZeroDivisionError when both precision and recall are 0.
        total = rec + prec
        return 2 * (prec * rec) / total if total else 0.0

    pos_pre = _as_float(nltk.metrics.precision(referenceSets['pos'], testSets['pos']))
    pos_recall = _as_float(nltk.metrics.recall(referenceSets['pos'], testSets['pos']))
    neg_pre = _as_float(nltk.metrics.precision(referenceSets['neg'], testSets['neg']))
    neg_recall = _as_float(nltk.metrics.recall(referenceSets['neg'], testSets['neg']))

    values = (pos_pre, pos_recall, neg_pre, neg_recall,
              _f1(pos_pre, pos_recall), _f1(neg_pre, neg_recall))
    print("  ".join('{0:.3f}'.format(value) for value in values))
Пример #11
0
def validate(data, params, d):
    """Train a logistic regression on averaged node vectors and score it.

    ``data`` holds two splits of trees: the first is used for training, the
    second for validation.  After ``forward_prop`` each tree is represented
    by the mean ``p_norm`` of its nodes, skipping the first node and nodes
    whose word is an English stop word.

    Args:
        data: sequence of splits, each an iterable of trees.
        params: the tuple ``(rel_dict, Wv, b, L)``.
        d: number of rows of ``L``.

    Returns:
        The tuple ``(train_acc, val_acc)``.
    """
    # A set makes the per-node stop-word test O(1) instead of a list scan.
    stop = set(stopwords.words("english"))

    (rel_dict, Wv, b, L) = params

    print("validating, adding lookup")
    for split in data:
        for tree in split:
            for node in tree.get_nodes():
                node.vec = L[:, node.ind].reshape((d, 1))

    train_feats = []
    val_feats = []

    for tt, split in enumerate(data):

        if tt == 0:
            print("processing train")

        else:
            print("processing val")

        for tree in split:

            # process validation trees
            forward_prop(None, params, tree, d, labels=False)

            ave = zeros((d, 1))
            count = 0
            for ex, node in enumerate(tree.get_nodes()):

                if ex != 0 and node.word not in stop:
                    ave += node.p_norm
                    count += 1

            # Leave the zero vector untouched when nothing was summed;
            # dividing by a zero count would produce NaN features.
            if count:
                ave = ave / count
            featvec = ave.flatten()

            curr_feats = {}
            for dim, val in ndenumerate(featvec):
                curr_feats["_" + str(dim)] = val

            if tt == 0:
                train_feats.append((curr_feats, tree.ans))

            else:
                val_feats.append((curr_feats, tree.ans))

    print("training")
    classifier = SklearnClassifier(LogisticRegression(C=10))
    classifier.train(train_feats)

    print("predicting...")
    train_acc = nltk.classify.util.accuracy(classifier, train_feats)
    val_acc = nltk.classify.util.accuracy(classifier, val_feats)
    return train_acc, val_acc
Пример #12
0
def svm(train_data,preprocessing=True):
    """Train a linear SVC on ``train_data``.

    Each item contributes ``preprocess(item[0], label=item[1])`` as one
    training instance.  ``preprocessing`` is accepted but not used.

    Returns the trained ``SklearnClassifier``.
    """
    instances = [preprocess(item[0], label=item[1]) for item in train_data]
    model = SklearnClassifier(LinearSVC())
    model.train(instances)
    return model
Пример #13
0
class chatBot(object):
    """Chat bot that answers with posts from the NLTK NPS chat corpus.

    A linear SVC predicts the category of the user's message;
    ``mapper`` translates it into the category to answer with and a
    random corpus post of that category is printed.
    """

    def __init__(self):
        self.posts = nltk.corpus.nps_chat.xml_posts()
        self.categories = ['Emotion', 'ynQuestion', 'yAnswer', 'Continuer',
                'whQuestion', 'System', 'Accept', 'Clarify', 'Emphasis',
                'nAnswer', 'Greet', 'Statement', 'Reject', 'Bye', 'Other']
        # mapper[i] is the index of the category used to answer category i.
        self.mapper = [0, 2, 6, 3, 11, 5, 8, 1, 8, 3, 10, 11, 13, 13, 13]
        self.responses = {}
        self.featuresets = []
        self.train = []
        self.test = []
        self.testSet = []
        self.testSetClass = []
        self.classif = SklearnClassifier(LinearSVC())
        for i in range(0, 15):
            self.responses[i] = []
        for post in self.posts:
            # Look the category index up once per post.
            category = self.categories.index(post.get('class'))
            self.featuresets.append((self.tokenize(post.text), category))
            self.temp = self.responses[category]
            self.temp.append(post.text)

    def tokenize(self, sentence):
        """
            Extracts a set of features from a message.
        """
        features = {}
        tokens = nltk.word_tokenize(sentence)
        for t in tokens:
            features['contains(%s)' % t.lower()] = True
        return features

    def talk(self):
        """Read messages from stdin forever and print a reply to each."""
        # Local import: the file's top-level imports are not visible here.
        import random

        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input  # Python 3

        while 1:
            inp = read_line("YOU: ")
            features = self.tokenize(inp)
            # classify_many expects a sequence of feature dicts; passing the
            # bare dict made it iterate over the dict's keys instead.
            pp = int(self.classif.classify_many([features])[0])
            m = self.mapper[pp]
            r = self.responses[m]
            # random.choice cannot index past the end, unlike r[randint(0, len(r))].
            print("BOT: " + random.choice(r))

    def trainSet(self):
        """Shuffle the data, train on 90% and classify the other 10%."""
        shuffle(self.featuresets)
        size = int(len(self.featuresets) * .1) # 10% is used for the test set
        self.train = self.featuresets[size:]
        self.test = self.featuresets[:size]
        self.classif.train(self.train)

        self.testSet = []
        self.testSetClass = []
        for i in self.test:
            self.testSet.append(i[0])
            self.testSetClass.append(i[1])
        self.batch = self.classif.classify_many(self.testSet)

    def statistics(self):
        """Print a classification report for the held-out test set."""
        # Report only the categories present in the test set and give each
        # its own name, so labels and target_names line up one-to-one.
        labels = sorted(set(self.testSetClass))
        names = [self.categories[label] for label in labels]
        print (classification_report(self.testSetClass, self.batch, labels=labels, target_names=names))
Пример #14
0
def main3():
    """Train a hinge-loss linear SVC and report how it does on ``testData``.

    Trains on the global ``trainData``, prints the accuracy, the predicted
    labels, a classification report against the global ``t_test_skl`` and
    the confusion matrix, then draws the matrix with ``pyplot.imshow``.
    """
    from nltk.classify.scikitlearn import SklearnClassifier
    from sklearn.svm import LinearSVC
    from sklearn.metrics import classification_report
    from sklearn.metrics import confusion_matrix
    from matplotlib import pyplot
    import numpy as np

    svm = SklearnClassifier(LinearSVC(loss="hinge"))
    svm.train(trainData)
    print("SVM: ", nltk.classify.accuracy(svm, testData))
    results = svm.classify_many(item[0] for item in testData)

    print(results)

    # getting a full report; one display name per distinct label (the
    # original passed the whole label list as target_names)
    labels = sorted(set(t_test_skl))
    print(classification_report(t_test_skl, results, labels=labels,
                                target_names=[str(label) for label in labels]))

    # Compute confusion matrix
    cmm = confusion_matrix([x[1] for x in testData], results)

    print(cmm)
    # The np.float alias was removed from NumPy; the builtin float is the
    # same dtype.
    cmm = np.array(cmm, dtype=float)
    print(cmm.shape)

    #f=figure()
    #ax = f.add_subplot(111)
    #show()
    #%pylab inline

    # Draw the confusion matrix (no pyplot.show() call is made here)
    print(pyplot.imshow(cmm, interpolation='nearest'))
Пример #15
0
def SVM(training_set, test_set):
    """Train a linear SVC and return a predictor together with its scores.

    Args:
        training_set: labelled feature sets used for training.
        test_set: labelled feature sets used for evaluation.

    Returns:
        The tuple ``(runTrained, accuracy, predictedLabels, trueLabels)``
        where ``runTrained(test_set, hasTags=False)`` classifies new data
        with the trained model.
    """
    classifier = SklearnClassifier(LinearSVC())
    print("Training a new SVM classifier")
    classifier.train(training_set)
    # Evaluate once and reuse the value; every accuracy() call classifies
    # the whole test set again.
    accuracy = nltk.classify.accuracy(classifier, test_set)
    # NOTE: the label says "in training" but the figure is measured on test_set.
    print("Accuracy of SVM in training:", accuracy)
    trueLabels = [l for d, l in test_set]
    predictedLabels = classifier.classify_many([d for d,t in test_set])

    def runTrained(test_set, hasTags=False):
        """Classify ``test_set`` with the trained model.

        With ``hasTags`` the items are ``(data, tag)`` pairs and the result
        is ``(pairs, accuracy)``; otherwise the items are bare feature sets
        and only the ``(data, prediction)`` pairs are returned.
        """
        if hasTags:
            tagglessTest_set = [data for data, tag in test_set]
            acc = nltk.classify.accuracy(classifier, test_set)
            print("Accuracy:", acc)
            predictions = classifier.classify_many(tagglessTest_set)
            return (list(zip(tagglessTest_set, predictions)), acc)
        predictions = classifier.classify_many(test_set)
        return list(zip(test_set, predictions))

    return (runTrained, accuracy, predictedLabels, trueLabels)
def train(cleanedDataCollection, tagPool):
	"""Train a multinomial Naive Bayes classifier and print its scores.

	Samples labelled "trash" are the negatives, all others the positives.
	The first ten samples of each group form the test set and the rest
	the training set.  Accuracy plus the precision, recall and F-measure
	reported by ``precision_recall_fmeasure`` for "useful" are printed.

	Returns the trained ``SklearnClassifier``.
	"""
	featuresets = [(extractFeatures(d,tagPool), c) for (d,c) in cleanedDataCollection]
	negSamples = [sample for sample in featuresets if sample[1] == "trash"]
	posSamples = [sample for sample in featuresets if not sample[1] == "trash"]

	held_out = 10
	train_set = negSamples[held_out:]+posSamples[held_out:]
	test_set = negSamples[:held_out]+posSamples[:held_out]

	sk_classifier = SklearnClassifier(MultinomialNB())
	sk_classifier.train(train_set)
	print("accuracy is: %s" % (accuracy(sk_classifier, test_set)))

	precision, recall, fMeasure = precision_recall_fmeasure(sk_classifier,  test_set, "useful")

	print("precision is: %s" % (precision))
	print("recall is: %s" % (recall))
	print("F-measure is: %s" % (fMeasure))
	return sk_classifier
Пример #17
0
def score(trainset, testset, classifier):
    """Return the accuracy of ``classifier`` on ``testset``.

    The estimator is wrapped in a ``SklearnClassifier`` whose vectorizer
    has feature sorting switched off, trained on ``trainset`` and
    evaluated on the ``(features, label)`` pairs of ``testset``.
    """
    wrapped = SklearnClassifier(classifier)
    wrapped._vectorizer.sort = False
    wrapped.train(trainset)
    features, gold_labels = zip(*testset)
    predictions = wrapped.classify_many(features)
    return accuracy_score(gold_labels, predictions)
Пример #18
0
def svm(trainfeats, testfeats):
	"""Train a linear SVC (C=0.032) and print its accuracy on ``testfeats``.

	The accuracy is printed as a one-element list; nothing is returned.
	"""
	classif = SklearnClassifier(LinearSVC(C=0.032))
	classif.train(trainfeats)
	print("SVM output")
	print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))
	y = [nltk.classify.util.accuracy(classif, testfeats)]
	print(y)
def svm(total_train_feats,total_test_feats):
    """Train a linear SVC (C=0.032) and record its test accuracy.

    The accuracy is printed as a one-element list and that list is
    appended to the global ``all_results``.
    """
    model = SklearnClassifier(LinearSVC(C=0.032))
    model.train(total_train_feats)
    print('train on %d instances, test on %d instances' % (len(total_train_feats), len(total_test_feats)))
    y = [nltk.classify.util.accuracy(model, total_test_feats)]
    print(y)
    all_results.append(y)
def buildClassifier_score(trainSet,devtestSet,classifier):
    """Train ``classifier`` on ``trainSet`` and score it on ``devtestSet``.

    Args:
        trainSet: labelled feature sets used for training.
        devtestSet: ``(features, label)`` pairs of the development test set.
        classifier: scikit-learn estimator to wrap and train.

    Returns:
        The ``accuracy_score`` of the predictions against the labels.
    """
    # Split the (already featurised and labelled) dev-test set into data
    # and labels.
    dev, tag_dev = zip(*devtestSet)
    classifier = SklearnClassifier(classifier)  # use the scikit-learn interface from within NLTK
    classifier.train(trainSet)  # train the classifier
    # Predict a label for every item of the dev-test set.
    pred = classifier.classify_many(dev)
    # Compare the predictions with the hand-annotated labels.
    return accuracy_score(tag_dev, pred)
def store_classifier(clf, trainset, filepath):
    """Train ``clf`` and write class probabilities to ``filepath``.

    The features extracted from the global ``sentiment`` are classified;
    for each prediction, paired with the matching entry of the global
    ``sen_cur``, one line ``<P(pos)>\\t<P(neg)>\\t<entry>`` is written.
    """
    classifier = SklearnClassifier(clf)
    classifier.train(trainset)

    pred = classifier.prob_classify_many(extract_features(sentiment))
    # Write the results to the file; the context manager closes it even
    # when a write fails.
    with open(filepath, 'w+') as p_file:
        for (i,j) in zip(pred,sen_cur):
            p_file.write(str(i.prob('pos'))+'\t'+str(i.prob('neg'))+'\t'+j + '\n')
Пример #22
0
def train():
    """Train a tf-idf / chi2 / multinomial NB pipeline on two text files.

    Every line of ``useful.txt`` becomes a 'pos' instance and every line
    of ``not.txt`` a 'neg' instance.

    Returns the trained ``SklearnClassifier``.
    """
    pipeline = Pipeline([('tfidf', TfidfTransformer()),
                         ('chi2', SelectKBest(chi2, k=40)),
                         ('nb', MultinomialNB())])
    classif = SklearnClassifier(pipeline)

    def labelled_lines(path, label):
        # The context manager closes the file once its lines are consumed.
        # NOTE(review): FreqDist over a str counts characters, not words -
        # confirm that character counts are the intended features.
        with open(path, 'r') as handle:
            return [(FreqDist(line), label) for line in handle]

    pos = labelled_lines('/home/mel/workspace/datascience/assignment5_kaggle/data/useful.txt', 'pos')
    neg = labelled_lines('/home/mel/workspace/datascience/assignment5_kaggle/data/not.txt', 'neg')
    classif.train(pos + neg)
    return classif
Пример #23
0
	def learn_model(self,featuresets):
		"""
		trains and tests the logistic regression classifier on the data
		"""
		random.shuffle(featuresets)
	
		limit = int(0.75*len(featuresets)) #partitioning 3:1 for train:test
		train_set = featuresets[:limit]
		test_set = featuresets[limit:]
	
		lr_classifier = SklearnClassifier(LogisticRegression())
		lr_classifier.train(train_set)
		
		print 'Logistic classifier Accuracy : ',str(nltk.classify.accuracy(lr_classifier,test_set)*100)
Пример #24
0
    def handle(self, *args, **options):
        """Classify unclassified tweets forever, retraining when needed.

        Trains a Naive Bayes and a linear SVC classifier on the training
        tweets, then loops: every tweet with ``train=False`` and no
        ``klass`` gets both predictions saved, and the classifiers are
        retrained whenever the number of training tweets has changed.
        Otherwise the loop sleeps for three seconds.

        Raises:
            CommandError: if there are no training tweets at start-up.
        """
        def build_classifiers(train_tweets):
            # Fit both models on the same generated training set.
            train_set = generate_trainset(train_tweets)
            naive_bayes = nltk.NaiveBayesClassifier.train(train_set)
            linear_svc = SklearnClassifier(LinearSVC())
            linear_svc.train(train_set)
            return naive_bayes, linear_svc

        trains = get_train_tweets()
        if not trains:
            raise CommandError('No train data, please add some from the admin page!')

        train_count = trains.count()
        nb_classifier, sci_classifier = build_classifiers(trains)

        while True:
            pending = Tweet.objects.filter(train=False, klass=None)
            total_count = pending.count()
            if total_count > 0:
                print('Classifying %d tweets...' % total_count)
                counts_nb = defaultdict(int)
                counts_svm = defaultdict(int)
                started = time.time()
                for tweet in pending:
                    vector = get_feature_vector(process_tweet(tweet.body))
                    features = extract_features(vector)
                    label_nb = nb_classifier.classify(features)
                    label_svm = sci_classifier.classify(features)
                    counts_nb[label_nb] += 1
                    counts_svm[label_svm] += 1
                    tweet.klass = label_nb
                    tweet.klass_svm = label_svm
                    # Running totals per class, rewritten on the same line.
                    msg_nb = ['%d %s' % (counts_nb[k], v) for k, v in Tweet.CLASSES]
                    msg_svm = ['%d %s' % (counts_svm[k], v) for k, v in Tweet.CLASSES]
                    print('\rNB: ' + ', '.join(msg_nb) + ';\tSVM: ' + ', '.join(msg_svm), end='')
                    tweet.save()
                    if settings.DEBUG:
                        db.reset_queries()
                elapsed = int(time.time() - started)
                print('\nClassifying finished in %d seconds.' % elapsed)

            new_trains = get_train_tweets()
            if new_trains.count() == train_count:
                print('Waiting...')
                time.sleep(3)
            else:
                print('Train set has been changed, retraining...')
                trains = new_trains
                train_count = new_trains.count()
                nb_classifier, sci_classifier = build_classifiers(trains)
def get_performance(clf_sel, train_features, test_features):
    """Train ``clf_sel`` and print its accuracy, precision and recall.

    One comma-separated line is printed: the estimator class, the
    accuracy, and precision/recall for the 'pos' and 'neg' labels.  A
    trained ``MultinomialNB`` is additionally handed to ``pickle_cls``.
    Estimators that fail to train are skipped silently.

    Args:
        clf_sel: scikit-learn estimator to wrap and train.
        train_features: labelled feature sets used for training.
        test_features: ``(features, label)`` pairs used for evaluation.
    """
    ref_set = collections.defaultdict(set)
    test_set = collections.defaultdict(set)

    clf = SklearnClassifier(clf_sel)
    try:
        classifier = clf.train(train_features)
    except Exception:
        # Best effort: skip estimators that cannot be trained.  Returning
        # here keeps the code below from touching an unbound `classifier`,
        # and KeyboardInterrupt/SystemExit are no longer swallowed.
        return

    if str(clf_sel.__class__) == "<class 'sklearn.naive_bayes.MultinomialNB'>":
        pickle_cls(classifier, 'MultinomialNB')

    clf_acc = nltk.classify.accuracy(classifier, test_features)

    for i, (features, label) in enumerate(test_features):
        ref_set[label].add(i)
        predicted = classifier.classify(features)
        test_set[predicted].add(i)

    pos_precision = precision(ref_set['pos'], test_set['pos'])
    pos_recall = recall(ref_set['pos'], test_set['pos'])
    neg_precision = precision(ref_set['neg'], test_set['neg'])
    neg_recall = recall(ref_set['neg'], test_set['neg'])

    print(
        "{0},{1},{2},{3},{4},{5}".format(clf_sel.__class__, clf_acc, pos_precision, pos_recall, neg_precision,
                                         neg_recall))
Пример #26
0
def classifier_for_lemma(lemma):
    """Train a classifier for ``lemma`` from the available training data.

    At most 20,000 instances are used.  Returns the trained
    ``SklearnClassifier``, or ``None`` when there are no instances or
    fewer than two distinct labels.
    """
    max_instances = 20 * 1000

    # always doing non-null and Random Forest for initial version
    model = SklearnClassifier(RandomForestClassifier(), sparse=False)
    instances = trainingdata.trainingdata_for(lemma, nonnull=True)
    print("got {0} instances for {1}".format(len(instances), lemma))

    if len(instances) > max_instances:
        print("capping to 20k instances to fit in memory")
        instances = instances[:max_instances]

    distinct_labels = set(label for (_, label) in instances)
    print("loaded training data for", lemma)
    if not instances or len(distinct_labels) < 2:
        return None
    model.train(instances)
    return model
Пример #27
0
 def trainPosNeg(self):
     """Train a linear SVC on the files under ./positive and ./negative.

     Each file is split on single spaces and turned into features with
     ``self.getBigrams``; files from ./positive are labelled 'positive'
     and files from ./negative 'negative'.

     Returns the trained ``SklearnClassifier``.
     """
     corpora = [(ptr("./positive", '.*'), 'positive'),
                (ptr("./negative", '.*'), 'negative')]
     # Read every file of both corpora before extracting any features.
     tokenised = [([reader.raw(fileid).split(" ")
                    for fileid in reader.fileids()], label)
                  for reader, label in corpora]
     pos_neg_trainfeats = [(self.getBigrams(words), label)
                           for documents, label in tokenised
                           for words in documents]
     classifier = SklearnClassifier(LinearSVC())
     classifier.train(pos_neg_trainfeats)
     return classifier
Пример #28
0
def cross_validation(data_set, n_folds=8):
    """Pick the best tf-idf + linear SVC classifier over ``n_folds`` folds.

    Args:
        data_set: indexable sequence of labelled feature sets.
        n_folds: number of folds passed to ``KFold``.

    Returns:
        The tuple ``(best_classifier, training_accuracy, best_accuracy)``
        for the fold with the highest validation accuracy;
        ``best_classifier`` is ``None`` when ``KFold`` yields no folds.
    """
    kf = KFold(len(data_set), n_folds=n_folds)
    best_classifier = None
    best_accuracy = -1
    training_accuracy = 0
    for train, cv in kf:
        classifier = SklearnClassifier(
            Pipeline([('tfidf', TfidfTransformer()),
                      ('nb', LinearSVC(C=1, tol=0.000001))]))
        # Select by the fold's own indices.  The previous slice
        # data_set[cv[-1]:] put the last validation sample into the
        # training data as well.
        training_data = [data_set[int(i)] for i in train]
        cv_data = [data_set[int(i)] for i in cv]
        classifier.train(training_data)
        accuracy = classify.accuracy(classifier, cv_data)
        if accuracy > best_accuracy:
            best_classifier = classifier
            best_accuracy = accuracy
            training_accuracy = classify.accuracy(classifier, training_data)
    return best_classifier, training_accuracy, best_accuracy
Пример #29
0
def cross_validate(data,model=None):
    training_set = nltk.classify.apply_features(preprocess,data)
    cv = cross_validation.KFold(len(training_set), n_folds=10, shuffle=False, random_state=None)
    if model == "svm" or model=="SVM":
        svm = SklearnClassifier(LinearSVC())
        for traincv, testcv in cv:
            classifier = svm.train(training_set[traincv[0]:traincv[len(traincv)-1]])
            print 'accuracy:', nltk.classify.util.accuracy(classifier, training_set[testcv[0]:testcv[len(testcv)-1]])
Пример #30
0
def TrainClassifiers():
    """Train six classifiers, print their accuracies and save them.

    The data comes from ``TestTrainData``; the trained classifiers and
    their names are passed to ``SaveClassifiers``.

    Returns:
        The list of trained classifiers, in training order.
    """
    training_set, testing_set = TestTrainData()

    # (name used when saving, label used when reporting, trained model)
    trained = [("NaiveBayesClassifier",
                "Naive_Bayes Algo accuracy percent:",
                NaiveBayesClassifier.train(training_set))]

    sklearn_models = (
        ("MultinomialNBClassifier", "MNB_classifier accuracy percent:", MultinomialNB),
        ("BernoulliNBClassifier", "BernoulliNB_classifier accuracy percent:", BernoulliNB),
        ("LogisticRegressionClassifier", "LogisticRegression_classifier accuracy percent:", LogisticRegression),
        ("LinearSVCClassifier", "LinearSVC_classifier accuracy percent:", LinearSVC),
        ("SGDClassifier", "SGDClassifier accuracy percent:", SGDClassifier),
    )
    for saved_name, report_label, estimator_class in sklearn_models:
        model = SklearnClassifier(estimator_class())
        model.train(training_set)
        # Each name is stored with the model trained for it; previously the
        # logistic regression model was stored as "LinearSVCClassifier".
        trained.append((saved_name, report_label, model))

    classifiers = [model for _, _, model in trained]
    classifier_name = [saved_name for saved_name, _, _ in trained]

    for _, report_label, model in trained:
        print(report_label, (classify.accuracy(model, testing_set))*100)

    SaveClassifiers(classifiers, classifier_name)

    return classifiers
Пример #31
0
def train(trainfeats, testfeats, nlt=True, skl=True, most=10):
    """Train NLTK and scikit-learn classifiers and collect their scores.

    Args:
        trainfeats: labelled feature sets used for training.
        testfeats: labelled feature sets used for evaluation.
        nlt: when true, train and evaluate ``NaiveBayesClassifier``.
        skl: when true, train and evaluate five scikit-learn classifiers
            and a ``VoteClassifier`` built from them.
        most: not used; kept for backward compatibility.

    Returns:
        The tuple ``(nltk_output, sklearn_output)``.  ``nltk_output`` holds
        accuracy, precision and recall for labels 4 ('pos') and 0 ('neg')
        as percentages rounded to one decimal, plus the names of the three
        most informative features.  ``sklearn_output`` holds the rounded
        accuracy percentage of each scikit-learn classifier.  A dict is
        empty when its group is disabled.
    """
    nltk_output = dict()
    sklearn_output = dict()

    def _percent(value):
        # precision()/recall() return None when the score is undefined;
        # report 0 for that score only instead of aborting the rest.
        return 0 if value is None else value * 100

    if nlt:

        my_classifier = NaiveBayesClassifier.train(trainfeats)
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = my_classifier.classify(feats)
            testsets[observed].add(i)

        try:
            accuracy = nltk.classify.util.accuracy(my_classifier,
                                                   testfeats) * 100
        except ZeroDivisionError:
            # NOTE(review): presumably only possible with an empty test set
            # on some NLTK versions; keep the old default of 0.
            accuracy = 0

        # precision and recall, each computed independently
        pos_prec = _percent(precision(refsets[4], testsets[4]))
        pos_rec = _percent(recall(refsets[4], testsets[4]))
        neg_prec = _percent(precision(refsets[0], testsets[0]))
        neg_rec = _percent(recall(refsets[0], testsets[0]))

        # Compute the feature ranking once instead of once per entry.
        top_features = my_classifier.most_informative_features()

        nltk_output['accuracy'] = round(accuracy, 1)
        nltk_output['pos_prec'] = round(pos_prec, 1)
        nltk_output['neg_prec'] = round(neg_prec, 1)
        nltk_output['pos_rec'] = round(pos_rec, 1)
        nltk_output['neg_rec'] = round(neg_rec, 1)
        nltk_output['most1'] = top_features[0][0]
        nltk_output['most2'] = top_features[1][0]
        nltk_output['most3'] = top_features[2][0]

    if skl:

        def _fit(estimator):
            # Wrap, train and score one scikit-learn estimator.
            wrapped = SklearnClassifier(estimator)
            wrapped._vectorizer.sort = False
            wrapped.train(trainfeats)
            score = (nltk.classify.accuracy(wrapped, testfeats)) * 100
            return wrapped, score

        MNB_classifier, mnb = _fit(MultinomialNB())
        BernoulliNB_classifier, bnb = _fit(BernoulliNB())
        LogisticRegression_classifier, lr = _fit(LogisticRegression())
        LinearSVC_classifier, lsvc = _fit(LinearSVC())
        NuSVC_classifier, nsvc = _fit(NuSVC())

        voted_classifier = VoteClassifier(NuSVC_classifier,
                                          LinearSVC_classifier, MNB_classifier,
                                          BernoulliNB_classifier,
                                          LogisticRegression_classifier)
        voted = (nltk.classify.accuracy(voted_classifier, testfeats)) * 100

        sklearn_output['mnb'] = round(mnb, 1)
        sklearn_output['bnb'] = round(bnb, 1)
        sklearn_output['lr'] = round(lr, 1)
        sklearn_output['lsvc'] = round(lsvc, 1)
        sklearn_output['nsvc'] = round(nsvc, 1)
        sklearn_output['voted'] = round(voted, 1)

    return (nltk_output, sklearn_output)
Пример #32
0
class Classifier:
    """Tweet sentiment classifier built on an NLTK-wrapped ``LinearSVC``.

    On construction the model is either loaded from ``MODEL_PATH`` or
    trained from per-label text files found in ``trainDirectory`` and then
    pickled to ``MODEL_PATH``.

    Args:
        trainNew: When truthy, always retrain instead of loading the pickle.
        trainDirectory: Path prefix of the training files; each label is
            read from ``trainDirectory + label + ".txt"`` (so the prefix
            must already end with a path separator).
    """

    # Location of the pickled model, used for both loading and saving.
    MODEL_PATH = "./data/models/svm.pickle"

    def __init__(self, trainNew, trainDirectory):
        self.data = {}
        self.labels = ["bearish", "bullish", "neutral"]
        self.trainSet = []
        self.testSet = []
        self.featureList = []
        self.featureSets = []
        self.trainDir = trainDirectory

        if not trainNew and exists(self.MODEL_PATH):
            # NOTE: unpickling can execute arbitrary code; only load model
            # files that come from a trusted source.
            with open(self.MODEL_PATH, 'rb') as model_file:
                self.model = pickle.load(model_file)
        else:
            print("Training model")
            self.model = SklearnClassifier(LinearSVC(random_state=0, tol=1e-5))
            self.load_data()
            self.extract_features()
            self.train_model()

    def load_data(self):
        """Read every label file and map each raw line to its label.

        Lines are stored exactly as read (including the trailing newline).
        A line that occurs under several labels keeps the last label read.

        Returns:
            The ``{line: label}`` dictionary, also stored in ``self.data``.
        """
        self.data = {}
        for label in self.labels:
            # The context manager closes each file even if reading fails.
            with open(self.trainDir + label + ".txt", 'r') as label_file:
                for line in label_file:
                    self.data[line] = label
        return self.data

    def extract_features(self):
        """Turn the loaded tweets into NLTK-style training examples.

        Loads the data first if none is present. Tweets whose feature
        vector is empty are skipped.

        Returns:
            ``self.trainSet``: a list of ``({feature_tuple: True}, label)``.
        """
        if len(self.data) == 0:
            self.load_data()

        extractor = FeatureExtractor()
        for tweet, label in self.data.items():
            featureVector = extractor.get_feature_vector(tweet)
            if len(featureVector) > 0:
                # (feature vector, sentiment label) pair for this tweet
                self.featureSets.append((featureVector, label))
                # Collect all features; duplicates are removed after the
                # loop. extend() avoids copying the whole list per tweet.
                self.featureList.extend(featureVector)
                self.trainSet.append(
                    ({tuple(word): True for word in featureVector}, label))
        print(len(self.featureSets), "tweets total")
        self.featureList = list(set(tuple(i) for i in self.featureList))
        print(len(self.featureList), "unique features")
        print(len(self.trainSet), "training tweets")

        return self.trainSet

    def train_model(self):
        """Train the model on ``self.trainSet`` and pickle it to disk.

        Extracts features first if the training set is empty.

        Returns:
            The trained model.
        """
        if len(self.trainSet) == 0:
            self.extract_features()

        self.model.train(self.trainSet)

        # Close the file deterministically so the pickle is fully flushed.
        with open(self.MODEL_PATH, "wb") as model_file:
            pickle.dump(self.model, model_file)

        return self.model

    def classify(self, tweetText):
        """Return the predicted label for a single tweet.

        Args:
            tweetText: The tweet; it is converted with ``str()`` first.
        """
        extractor = FeatureExtractor()
        tweetText = str(tweetText)
        featureVector = extractor.get_feature_vector(tweetText)
        features = {tuple(word): True for word in featureVector}

        prediction = self.model.classify(features)

        return prediction
Пример #33
0
### Evaluate the classifier's final performance on the test set
# Split the (features, tag) pairs of testSet into parallel tuples.
test, tag_test = zip(*testSet)


def final_score(classifier):
    """Train *classifier* and return its accuracy on the test data.

    The estimator is wrapped in an NLTK ``SklearnClassifier``, trained on
    the module-level ``train`` set, and its predictions for ``test`` are
    compared against ``tag_test``.
    """
    wrapped = SklearnClassifier(classifier)
    wrapped.train(train)
    predictions = wrapped.classify_many(test)
    return accuracy_score(tag_test, predictions)


# Full training data: positive examples followed by negative ones.
trainSet = posFeatures + negFeatures

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(trainSet)
# Persist the trained model; the context manager closes the file so the
# pickle is completely flushed to disk.
with open('classifier1.pkl', 'wb+') as model_file:
    pickle.dump(BernoulliNB_classifier, model_file)


def getgood():
    """Print the accuracy of each candidate estimator.

    For every estimator except NuSVC two lines are printed: the value
    returned by ``score`` and the value returned by ``final_score``.
    NuSVC only gets the ``score`` line.
    """
    # (message template, estimator factory); a fresh estimator instance is
    # created for every call, as in a hand-written sequence of prints.
    candidates = (
        ('BernoulliNB`s accuracy is %f', BernoulliNB),
        ('MultinomiaNB`s accuracy is %f', MultinomialNB),
        ('LogisticRegression`s accuracy is %f', LogisticRegression),
        ('SVC`s accuracy is %f', SVC),
        ('LinearSVC`s accuracy is %f', LinearSVC),
    )
    for template, make_estimator in candidates:
        print(template % score(make_estimator()))
        # Final check of the classifier selected on the development set.
        print(final_score(make_estimator()))
    print('NuSVC`s accuracy is %f' % score(NuSVC()))
Пример #34
0
def store_classifier(clf, trainset, filepath):
    """Train *clf* on *trainset* and pickle the trained wrapper to *filepath*.

    Args:
        clf: A scikit-learn estimator; it is wrapped in ``SklearnClassifier``.
        trainset: Training data passed to ``SklearnClassifier.train``.
        filepath: Destination of the pickle file (overwritten if present).
    """
    classifier = SklearnClassifier(clf)
    classifier.train(trainset)
    # pickle produces bytes, so the file must be opened in binary mode
    # (text mode raises TypeError on Python 3). The context manager also
    # guarantees the file is flushed and closed.
    with open(filepath, 'wb') as out_file:
        pickle.dump(classifier, out_file)
Пример #35
0
def clf_score(classifier):
    """Train *classifier* on ``train_set`` and return its accuracy on ``test``.

    The estimator is wrapped in ``SklearnClassifier``; predictions for the
    module-level ``test`` features are compared against ``tag_test``.
    """
    classifier = SklearnClassifier(classifier)
    classifier.train(train_set)
    # NLTK 3 renamed batch_classify() to classify_many(); prefer the new
    # name and fall back to the old one for NLTK 2 installations.
    classify_many = getattr(classifier, 'classify_many', None)
    if classify_many is None:
        classify_many = classifier.batch_classify
    predict = classify_many(test)
    return accuracy_score(tag_test, predict)
Пример #36
0
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features


# Build (feature dict, label) pairs for every tweet.
featuresets = [(extract_features(d), c) for (d, c) in tweets]
print('featuresets: ', len(featuresets))
# First 1900 examples for training, the remainder for testing.
train_set, test_set = featuresets[:1900], featuresets[1900:]

# Multinomial Naive Bayes classifier:
# TF-IDF weighting -> chi-squared feature selection -> MultinomialNB.
# k='all' keeps every feature, so the selection step does not drop anything.
pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k='all')),
                     ('nb', MultinomialNB())])

classif = SklearnClassifier(pipeline)
classif.train(train_set)

# Max entropy classifier (disabled alternative, kept as a string literal)
"""
classif = MaxentClassifier.train(train_set, 'megam')
"""
print(nltk.classify.accuracy(classif, test_set))

# Confusion matrix of true labels vs. predictions on the test set.
pred = classif.classify_many([feature for feature, sentiment in test_set])
test_true = [sentiment for feature, sentiment in test_set]
matx = confusion_matrix(test_true, pred)
print(matx)

#joblib.dump(tweets, 'tweets.pkl')
#joblib.dump(classif, 'classif.pkl')
"""
Пример #37
0
    b for t in tweets for b in zip(t.split(" ")[:-1],
                                   t.split(" ")[1:])
]
b_features = Counter(all_bigrams).most_common(500)
bigram_features = []
for (bigram, freq) in b_features:
    bigram_features.append(bigram)


def find_features(single_tweet):
    """Return a ``{bigram: bool}`` presence map for one tweet.

    Args:
        single_tweet: Either the raw tweet text or an iterable of items
            (e.g. bigram tuples) already extracted from the tweet.

    Returns:
        A dict with one entry per element of ``bigram_features`` telling
        whether that bigram occurs in the tweet.
    """
    if isinstance(single_tweet, str):
        # set() of a string would yield single characters, which can never
        # equal a bigram tuple. Build the tweet's bigrams by splitting on
        # single spaces and pairing adjacent tokens.
        tokens = single_tweet.split(" ")
        present = set(zip(tokens[:-1], tokens[1:]))
    else:
        present = set(single_tweet)
    return {bigram: (bigram in present) for bigram in bigram_features}


# One (feature dict, stance) pair per labelled tweet, in random order.
featuresets = [(find_features(tweets), stances)
               for (tweets, stances) in tweets_with_labels]
random.shuffle(featuresets)
# First 500 examples train, everything after them tests. The test slice
# starts at 500 (not 501) so that no example is silently dropped.
training_set = featuresets[:500]
testing_set = featuresets[500:]
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percentage:",
      (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100)
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percentage:",
      (nltk.classify.accuracy(SVC_classifier, testing_set)) * 100)
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier


# In[34]:


# Wrap a default-parameter SVC so it exposes the NLTK classifier interface.
svc = SVC()
classifier_sklearn = SklearnClassifier(svc)


# In[35]:


# Fit the wrapped SVC on the training examples.
classifier_sklearn.train(training_data)


# In[36]:


# Notebook cell: the accuracy value is the cell's displayed result.
nltk.classify.accuracy(classifier_sklearn, testing_data)


# In[37]:


from sklearn.ensemble import RandomForestClassifier


# In[38]:
Пример #39
0
## Load classifier
## Open pickle file to read, rb = read in bytes; the context manager
## closes the file even if unpickling fails.
## NOTE: only unpickle files from a trusted source.
with open("naivebayes.pickle", "rb") as fh_in:
    classifier = pickle.load(fh_in)  ## load classifier

################################## sk-learn classifiers ##################################################

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

# Multinomial Naive Bayes.
MNB_clf = SklearnClassifier(MultinomialNB())
MNB_clf.train(training_set)
# NOTE(review): this accuracy is printed as a fraction, while the two
# classifiers below print it multiplied by 100.
print("MNB_classifier accuracy", nltk.classify.accuracy(MNB_clf, test_set))

# Bernoulli Naive Bayes.
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy:",
      (nltk.classify.accuracy(BernoulliNB_classifier, test_set)) * 100)

# Support vector classifier with default parameters.
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:",
      (nltk.classify.accuracy(SVC_classifier, test_set)) * 100)

#### Naive Bayes classification
## Prepare data - could have done this above, but the analyses below forced a new loop
x = []
def buildFeatures(tokenized):
    

# Label every line of the positive corpus and collect the lower-cased
# words whose POS tag starts with one of the allowed letters.
for p in short_pos.split('\n'):
    documents.append( (p, "pos") )
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

# Same for the negative corpus.
for p in short_neg.split('\n'):
    documents.append( (p, "neg") )
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

# Context managers close the pickle files even if dumping fails.
with open("documents.pickle","wb") as save_documents:
    pickle.dump(documents, save_documents)

all_words = nltk.FreqDist(all_words)

# Use the 5000 most frequent words as features. Slicing keys() does not
# give a frequency ordering on NLTK 3, where FreqDist is a Counter.
word_features = [word for word, _count in all_words.most_common(5000)]

with open("word_features5k.pickle","wb") as save_word_features:
    pickle.dump(word_features, save_word_features)

def find_features(document):
    """Return a ``{word: bool}`` presence map for *document*.

    The document is tokenized with ``word_tokenize``; the result has one
    entry per word in the module-level ``word_features``.
    """
    # A set makes each membership test O(1) instead of scanning the token
    # list once per feature word; the resulting dict is identical.
    words = set(word_tokenize(document))
    return {w: (w in words) for w in word_features}

# Feature dict + label for every document, shuffled before splitting.
featuresets = [(find_features(rev), category) for (rev, category) in documents]
random.shuffle(featuresets)
print(len(featuresets))

# First 10000 examples train, the remainder tests.
testing_set = featuresets[10000:]
training_set = featuresets[:10000]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

###############
# Each trained model is pickled inside a context manager so the file is
# closed (and flushed) even if pickling raises.
with open("originalnaivebayes5k.pickle","wb") as save_classifier:
    pickle.dump(classifier, save_classifier)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:",(nltk.classify.accuracy(MNB_classifier, testing_set))*100)

with open("MNB_classifier5k.pickle","wb") as save_classifier:
    pickle.dump(MNB_classifier, save_classifier)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:",(nltk.classify.accuracy(BernoulliNB_classifier,testing_set))*100)

with open("BernoulliNB_classifier5k.pickle","wb") as save_classifier:
    pickle.dump(BernoulliNB_classifier, save_classifier)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:",(nltk.classify.accuracy(LogisticRegression_classifier,testing_set))*100)

with open("LogisticRegression_classifier5k.pickle","wb") as save_classifier:
    pickle.dump(LogisticRegression_classifier, save_classifier)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:",(nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

with open("LinearSVC_classifier5k.pickle","wb") as save_classifier:
    pickle.dump(LinearSVC_classifier, save_classifier)

SGDC_classifier = SklearnClassifier(SGDClassifier())
SGDC_classifier.train(training_set)
print("SGDClassifier accuracy percent:",nltk.classify.accuracy(SGDC_classifier,testing_set)*100)

with open("SGDC_classifier5k.pickle","wb") as save_classifier:
    pickle.dump(SGDC_classifier, save_classifier)
Пример #41
0
#classifier = nltk.NaiveBayesClassifier.train(train_set)

# Load the previously trained Naive Bayes model; the context manager
# closes the file even if unpickling fails.
# NOTE: only unpickle files from a trusted source.
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)

print("Original Naive Bayes Accuracy Percentage: ",
      (nltk.classify.accuracy(classifier, test_set)) * 100)
classifier.show_most_informative_features(30)

#save_classifier = open("naivebayes.pickle", "wb")
#pickle.dump(classifier, save_classifier)
#save_classifier.close()

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(train_set)
print("MNB_classifier Accuracy Percentage: ",
      (nltk.classify.accuracy(MNB_classifier, test_set)) * 100)

# NOTE(review): the three assignments below rebind the names BernoulliNB,
# LogisticRegression and SGDClassifier from the estimator classes to
# trained wrapper instances, so the classes cannot be instantiated again
# after this point. The names are kept because later code may use them.
BernoulliNB = SklearnClassifier(BernoulliNB())
BernoulliNB.train(train_set)
print("BernoulliNB Accuracy Percentage: ",
      (nltk.classify.accuracy(BernoulliNB, test_set)) * 100)

LogisticRegression = SklearnClassifier(LogisticRegression())
LogisticRegression.train(train_set)
print("LogisticRegression Accuracy Percentage: ",
      (nltk.classify.accuracy(LogisticRegression, test_set)) * 100)

SGDClassifier = SklearnClassifier(SGDClassifier())
SGDClassifier.train(train_set)
print("here")

featuresets = [(find_features(rev), category) for (rev, category) in documents]
# Disabled decision-tree experiment, kept as a string literal.
'''DecisionTreeClassifier_classifier = SklearnClassifier(tree.DecisionTreeClassifier())
DecisionTreeClassifier_classifier.train(training_set)
print(nltk.classify.accuracy(DecisionTreeClassifier_classifier, testing_set))'''

print("here")

# Sum of the accuracies over ten random train/test splits.
accuracy_sum = 0

for j in range(0, 10):

    # Reshuffle so every round uses a different 1900-example training set.
    random.shuffle(featuresets)

    testing_set = featuresets[1900:]
    training_set = featuresets[:1900]

    # classifier = OpinionLexiconClassifier()
    # accuracy = nltk.classify.accuracy(classifier)
    NuSVC_classifier = SklearnClassifier(NuSVC(nu=0.8))
    NuSVC_classifier.train(training_set)
    accuracy = nltk.classify.accuracy(NuSVC_classifier, testing_set)

    accuracy_sum += accuracy
    print("NuSVC_classifier accuracy percent:", str(accuracy * 100))

# NOTE(review): the average is printed as a fraction, whereas the per-round
# values above are printed as percentages.
print("Average of the ten accuracies with top 4000 features:",
      str(accuracy_sum / 10))

# Feature dict + label for every document, in random order.
featureSets = [(find_features(rev), category) for (rev, category) in documents]
random.shuffle(featureSets)

# training_set = featureSets[:750]
# testing_set = featureSets[750:]

# First 360 examples train, the remainder tests.
training_set = featureSets[:360]
testing_set = featureSets[360:]

# NLTK's own Naive Bayes implementation.
NB_classifier = nltk.NaiveBayesClassifier.train(training_set)
# classifier.show_most_informative_features(15)

# scikit-learn estimators wrapped for the NLTK classifier interface,
# all trained on the same training set.
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)

LR_classifier = SklearnClassifier(LogisticRegression())
LR_classifier.train(training_set)

# stochastic gradient descent
SGD_classifier = SklearnClassifier(SGDClassifier())
SGD_classifier.train(training_set)

SV_classifier = SklearnClassifier(SVC())
SV_classifier.train(training_set)

LSV_classifier = SklearnClassifier(LinearSVC())
LSV_classifier.train(training_set)

# Created here; no training call for it is visible in this excerpt.
RF_classifier = SklearnClassifier(RandomForestClassifier())
Пример #44
0
def find_features(document):
    """Return a ``{word: bool}`` presence map for *document*.

    The document is tokenized with ``word_tokenize``; the result has one
    entry per word in the module-level ``word_features``.
    """
    # A set makes each membership test O(1) instead of scanning the token
    # list once per feature word; the resulting dict is identical.
    words = set(word_tokenize(document))
    return {w: (w in words) for w in word_features}


# Feature dict + label for every document, shuffled before splitting.
featuresets = [(find_features(rev), category) for (rev, category) in documents]

random.shuffle(featuresets)
print(len(featuresets))

# Everything after the first 10000 examples is held out for testing.
testing_set = featuresets[10000:]
# Train on examples 9000..9999 only. The earlier assignment of the full
# first 10000 examples was immediately overwritten and has been dropped.
training_set = featuresets[9000:10000]

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:",
      (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)

# The context manager closes the pickle file even if dumping fails.
with open("pickled_algos/LinearSVC_classifier5k.pickle", "wb") as save_classifier:
    pickle.dump(LinearSVC_classifier, save_classifier)
print("Execution time: ")
# NOTE(review): time.clock() was removed in Python 3.8. It is kept here
# because start_time is set outside this excerpt and must use the same
# clock; switch both to time.perf_counter() together.
print((time.clock() - start_time))
Пример #45
0

# Initial training of NLTK's Naive Bayes classifier.
classifier = nltk.NaiveBayesClassifier.train(training_set)
# save classifier to a file (via the project's `dump` helper)
dump.dump(classifier, "naivebayes")
print("naive Bayes accuracy: ", nltk.classify.accuracy(
    classifier, testing_set) * 100)
# classifier.show_most_informative_features(10)


# MultinomialNB, BernoulliNB

# train & save
MultinomialNB_classifier = SklearnClassifier(MultinomialNB())
MultinomialNB_classifier.train(training_set)
dump.dump(MultinomialNB_classifier, "MNB")
print("MultinomialNB_classifier accuracy: ", nltk.classify.accuracy(
    MultinomialNB_classifier, testing_set) * 100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
dump.dump(BernoulliNB_classifier, "BNB")
print("BernoulliNB_classifier accuracy: ", nltk.classify.accuracy(
    BernoulliNB_classifier, testing_set) * 100)


# LogisticRegression, SGDClassifier
# SVC, LinearSVC, NuSVC

# LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
# Feature dict + label for every document, shuffled before splitting.
featuresets = [(find_features(rev), category) for (rev, category) in documents]
random.shuffle(featuresets)

# First 17500 examples train, the remainder tests.
training_set = featuresets[:17500]
test_set = featuresets[17500:]

#Fitting Naive Bayes
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Classifier accuracy percent:",
      (nltk.classify.accuracy(classifier, test_set)) * 100)
classifier.show_most_informative_features(15)

#Fitting Multinomial Naive Bayes
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MultinomialNB accuracy percent:",
      (nltk.classify.accuracy(MNB_classifier, test_set)) * 100)

#Fitting Bernoulli Naive Bayes
BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("BernoulliNB accuracy percent:",
      (nltk.classify.accuracy(BNB_classifier, test_set)) * 100)

#Fitting Logistic Regression
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:",
      (nltk.classify.accuracy(LogisticRegression_classifier, test_set)) * 100)
Пример #47
0
    words = word_tokenize(sent)
    features = {}
    for w in BOW:
        features[w] = (w in words)

    return features


# Merge the test examples into the training data. From here on `test` is
# a subset of `train`, so the disabled accuracy prints below would no
# longer measure held-out performance.
train += test

NBclassifier = nltk.NaiveBayesClassifier.train(train)
#print("orginal NB accuracy",(nltk.classify.accuracy(NBclassifier,test))*100)

MNBclassifier = SklearnClassifier(MultinomialNB())
MNBclassifier.train(train)
#print("classifier accuracy",(nltk.classify.accuracy(MNBclassifier,test))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(train)
#print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, test))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(train)
#print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, test))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(train)
#print("BernoulliNB_classifier accuracy",(nltk.classify.accuracy(BernoulliNB_classifier,test))*100)

print("Trained")
Пример #48
0
# First 10000 examples train, the remainder tests.
testing_set = featuresets[10000:]
training_set = featuresets[:10000]


classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

###############
# Each trained model is pickled inside a context manager so the file is
# closed (and flushed) even if pickling raises.
with open("pickled_algos/originalnaivebayes5k.pickle","wb") as save_classifier:
    pickle.dump(classifier, save_classifier)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

with open("pickled_algos/MNB_classifier5k.pickle","wb") as save_classifier:
    pickle.dump(MNB_classifier, save_classifier)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

with open("pickled_algos/BernoulliNB_classifier5k.pickle","wb") as save_classifier:
    pickle.dump(BernoulliNB_classifier, save_classifier)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
Пример #49
0
# * The labels are categories (so the event numbers in our case)
# Two toy examples: (feature dict, label) with labels 1 and 2.
featureSet = [({
    'Dit': 1.1,
    'zijn': 0.9,
    'features': 0.4,
    "testFeature": False
}, 1), ({
    'Dit': 1.1,
    'ook': 1.0,
    "testFeature": True
}, 2)]

# train the NLTK Naive Bayes classifier
NLTK_NB = NaiveBayesClassifier.train(featureSet)
# train the scikit-learn MultinomialNB classifier
SCI_NB.train(featureSet)
# train the scikit-learn SVM classifier
SVM.train(featureSet)

classifier = SVM
# Quickly try out the classifier; the trailing comments give the label
# the author expects for each call.
print(classifier.classify({"Dit": 1.1, "zijn": 0.9}))  # 1
print(classifier.classify({
    "zijn": 0.5,
    "Dit": 1.1,
    "testFeature": True,
}))  # 2
"""
# * OPTIONEEL
# TEST DIT MET MEER DATA: vind de beste parameters voor de SVM (automatisch) met GridSearch
# Weet nog niet echt zeker of dit gaat werken. Eerst maar eens zien of het uberhaupt allemaal
Пример #50
0
classifier=nltk.NaiveBayesClassifier.train(training_set)


print("original_accuracy by naive_bayes:",nltk.classify.accuracy(classifier,test_set)*100)

classifier.show_most_informative_features(15)

# Each trained model is pickled inside a context manager so the file is
# closed (and flushed) even if pickling raises.
with open("originalnaivebayes.pickle","wb") as save_classifier:
    pickle.dump(classifier,save_classifier)

#MNB CLASSIFIER

MNB_classifier=SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("mnb_accuracy:",nltk.classify.accuracy(MNB_classifier,test_set)*100)

with open("MNB_classifier.pickle","wb") as save_classifier:
    pickle.dump(MNB_classifier,save_classifier)

#USING BERNOULLI CLASSIFIER

BN_classifier=SklearnClassifier(BernoulliNB())
BN_classifier.train(training_set)
print("bn_accuracy:",nltk.classify.accuracy(BN_classifier,test_set)*100)

# Pickle the classifier trained above. The name BernoulliNB_classifier is
# not assigned anywhere in this script, so dumping it could not have saved
# this model.
with open("BernoulliNB_classifier.pickle","wb") as save_classifier:
    pickle.dump(BN_classifier,save_classifier)
Пример #51
0
    classifier_l2 = pickle.load(f)

"""# Using Scikit-Learn API"""

from nltk.classify.scikitlearn import SklearnClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

# Each block trains one wrapped scikit-learn estimator and prints its
# test-set accuracy as a percentage, labelled with the estimator's name.
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(train_set)
print("Test Accuracy of MNB Classifier: %0.2f " %(nltk.classify.accuracy(MNB_classifier, test_set)*100))

# GNB_classifier = SklearnClassifier(GaussianNB())
# GNB_classifier.train(train_set)
# print("Test Accuracy of MNB Classifier: %0.2f " %(nltk.classify.accuracy(GNB_classifier, test_set)*100))

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(train_set)
print("Test Accuracy of LogisticRegression Classifier: %0.2f " %(nltk.classify.accuracy(LogisticRegression_classifier, test_set)*100))

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(train_set)
print("Test Accuracy of SVC Classifier: %0.2f " %(nltk.classify.accuracy(SVC_classifier, test_set)*100))

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
Пример #52
0
def final_score(classifier):
    """Train *classifier* and return its accuracy on the test data.

    The estimator is wrapped in an NLTK ``SklearnClassifier``, trained on
    the module-level ``train`` set, and its predictions for ``test`` are
    compared against ``tag_test``.
    """
    wrapped = SklearnClassifier(classifier)
    wrapped.train(train)
    predictions = wrapped.classify_many(test)
    return accuracy_score(tag_test, predictions)
Пример #53
0
def score(classifier,train_set,test,tag_test):
    """Train *classifier* on *train_set* and return its accuracy on *test*.

    Args:
        classifier: A scikit-learn estimator; wrapped in ``SklearnClassifier``.
        train_set: Training data passed to ``SklearnClassifier.train``.
        test: Feature sets to predict labels for.
        tag_test: True labels, aligned with *test*.

    Returns:
        The value of ``accuracy_score(tag_test, predictions)``.
    """
    classifier = SklearnClassifier(classifier)
    classifier.train(train_set)

    # NLTK 3 renamed batch_classify() to classify_many(); prefer the new
    # name and fall back to the old one for NLTK 2 installations.
    classify_many = getattr(classifier, 'classify_many', None)
    if classify_many is None:
        classify_many = classifier.batch_classify
    pred = classify_many(test)
    return accuracy_score(tag_test, pred)
Пример #54
0
    testing_set = []
    training_set = []
    dates = []

    #split the sets into training and testing sets
    for n in (featuresets):  #adding training data and +/- for
        training_set.append([dict(n[1]), n[2]])
    for line in test_featuresets:
        testing_set.append(ast.literal_eval(line[1]))
    #train data

    classifier = nltk.NaiveBayesClassifier.train(training_set)

    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(training_set)

    #Maxent_classifier = SklearnClassifier(MaxentClassifier())
    #Maxent_classifier.train(training_set)

    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier.train(training_set)

    RandomForest_classifier = SklearnClassifier(
        RandomForestClassifier(n_estimators=100))
    RandomForest_classifier.train(training_set)

    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(training_set)

    SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
                for (rev, category) in documents]

# First 1900 examples train, the remainder tests.
training_set = feature_sets[:1900]
testing_set = feature_sets[1900:]

########################################################################################################################
# #  Naive-Bayes (posterior = (prior occurrences * likelihood) / evidence)
classifier = nltk.NaiveBayesClassifier.train(training_set)
accuracy = nltk.classify.accuracy(classifier, testing_set) * 100

print("Original NB Accuracy: ", accuracy)
classifier.show_most_informative_features()

# # Multinomial Naive-Bayes
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
multi_accuracy = nltk.classify.accuracy(MNB_classifier, testing_set) * 100

print("\nMultinomial NB Accuracy: ", multi_accuracy)

# # Bernoulli Naive-Bayes
Bern_classifier = SklearnClassifier(BernoulliNB())
Bern_classifier.train(training_set)
bern_accuracy = nltk.classify.accuracy(Bern_classifier, testing_set) * 100

print("\nBernoulli NB Accuracy: ", bern_accuracy)

# # Logistic Regression
logistic_regression_classifier = SklearnClassifier(LogisticRegression())
logistic_regression_classifier.train(training_set)
log_accuracy = nltk.classify.accuracy(logistic_regression_classifier,
Пример #56
0
def test():
    """Train several classifiers on the short-review corpus and print accuracies.

    Reads ``resources/positive.txt`` and ``resources/negative.txt``, builds
    word-presence features, loads a pickled Naive Bayes model, trains six
    scikit-learn estimators on the first 10000 shuffled examples, and
    prints each model's accuracy (plus that of a ``VoteClassifier`` over
    all of them) on the remaining examples.
    """
    # Context managers close the corpus files once they have been read.
    with open('resources/positive.txt', 'r', errors='ignore') as pos_file:
        short_pos = pos_file.read()
    with open('resources/negative.txt', 'r', errors='ignore') as neg_file:
        short_neg = neg_file.read()

    # One labelled document per line of each corpus.
    documents = [(r, 'pos') for r in short_pos.split("\n")]
    documents.extend((r, 'neg') for r in short_neg.split("\n"))

    # Lower-cased vocabulary over both corpora.
    all_words = [w.lower() for w in word_tokenize(short_pos)]
    all_words.extend(w.lower() for w in word_tokenize(short_neg))
    all_words = nltk.FreqDist(all_words)

    # NOTE(review): this takes the first 5000 keys, which is presumably not
    # the 5000 most frequent words on NLTK 3. Left unchanged because the
    # pickled model loaded below was presumably trained with this feature
    # set - confirm before switching to most_common(5000).
    word_features = list(all_words.keys())[:5000]

    def find_features(document):
        # A set makes each membership test O(1); the resulting dict is
        # the same as with the token list.
        words = set(word_tokenize(document))
        return {w: (w in words) for w in word_features}

    features_set = [(find_features(rev), category)
                    for (rev, category) in documents]

    random.shuffle(features_set)

    training_set = features_set[:10000]
    testing_set = features_set[10000:]

    # clf = nltk.NaiveBayesClassifier.train(training_set)

    # NOTE: only unpickle files from a trusted source.
    with open("resources/nb_basic_classifier1.pickle", "rb") as clf_file:
        clf = pickle.load(clf_file)

    print("Naive Bayes Algo accuracy score:",
          nltk.classify.accuracy(clf, testing_set))
    clf.show_most_informative_features(15)

    MNB_clf = SklearnClassifier(MultinomialNB())
    MNB_clf.train(training_set)
    print("MNB_clf Algo accuracy score:",
          nltk.classify.accuracy(MNB_clf, testing_set))

    BernoulliNB_clf = SklearnClassifier(BernoulliNB())
    BernoulliNB_clf.train(training_set)
    print("BernoulliNB_clf Algo accuracy score:",
          nltk.classify.accuracy(BernoulliNB_clf, testing_set))

    LogisticRegression_clf = SklearnClassifier(LogisticRegression())
    LogisticRegression_clf.train(training_set)
    print("LogisticRegression_clf Algo accuracy score:",
          nltk.classify.accuracy(LogisticRegression_clf, testing_set))

    SGDClassifier_clf = SklearnClassifier(SGDClassifier())
    SGDClassifier_clf.train(training_set)
    print("SGDClassifier_clf Algo accuracy score:",
          nltk.classify.accuracy(SGDClassifier_clf, testing_set))

    # SVC_clf = SklearnClassifier(SVC())
    # SVC_clf.train(training_set)
    # print("SVC_clf Algo accuracy score:", nltk.classify.accuracy(SVC_clf, testing_set))

    LinearSVC_clf = SklearnClassifier(LinearSVC())
    LinearSVC_clf.train(training_set)
    print("LinearSVC_clf Algo accuracy score:",
          nltk.classify.accuracy(LinearSVC_clf, testing_set))

    NuSVC_clf = SklearnClassifier(NuSVC())
    NuSVC_clf.train(training_set)
    print("NuSVC_clf Algo accuracy score:",
          nltk.classify.accuracy(NuSVC_clf, testing_set))

    # Ensemble over all seven individual classifiers.
    vote_clf = VoteClassifier(clf, MNB_clf, BernoulliNB_clf,
                              LogisticRegression_clf, SGDClassifier_clf,
                              LinearSVC_clf, NuSVC_clf)
    print("vote_clf Algo accuracy score:",
          nltk.classify.accuracy(vote_clf, testing_set))
Пример #57
0
print("Starting the first round of training")
print("Length of featureset is:", len(featureset))

for i in range(0, 30):

    random.shuffle(featureset)

    testing_set = featureset[(int(len(featureset) * 0.9)):]
    training_set = featureset[:(int(len(featureset) * 0.9))]

    start_time = gettime.time()

    NuSVClassifier = SklearnClassifier(
        NuSVC(nu=0.8, decision_function_shape="ovr"))
    NuSVClassifier.train(training_set)
    NuSVClassifier_accuracy = nltk.classify.accuracy(NuSVClassifier,
                                                     testing_set)

    print("NuSVC done.")

    RFC = SklearnClassifier(
        RandomForestClassifier(n_estimators=25, min_samples_leaf=6))
    RFC.train(training_set)
    RFC_accuracy = nltk.classify.accuracy(RFC, testing_set)

    print("RFC done.")

    # OLC = OpinionLexiconClassifier()
    # OLC_accuracy = nltk.classify.accuracy(OLC, testing_set)
Пример #58
0
# The first 100 feature sets are used for testing.
testing_set = featuresets[:100]

classfier = nltk.NaiveBayesClassifier.train(
    traning_set)  # use the Naive Bayes algorithm to classify pos or neg movie reviews

#classfier_f = open("NaiveBayesSentiment.pickle","rb") #loading the trained model using pickle
#classfier = pickle.load(classfier_f)
#classfier_f.close()

print("ORiginal Naive Bayes Algorithm Accuracy Percent : ",
      (nltk.classify.accuracy(classfier, testing_set)) *
      100)  # accuracy of the model on the test set, as a percentage
classfier.show_most_informative_features(30)

MNB_classfier = SklearnClassifier(MultinomialNB())
MNB_classfier.train(traning_set)

print("Multinomial Naive Bayes Algorithm Accuracy Percent : ",
      (nltk.classify.accuracy(MNB_classfier, testing_set)) *
      100)  # accuracy of the model on the test set, as a percentage

B_classfier = SklearnClassifier(BernoulliNB())
B_classfier.train(traning_set)

print("Bernoulli Naive Bayes Algorithm Accuracy Percent : ",
      (nltk.classify.accuracy(B_classfier, testing_set)) *
      100)  # accuracy of the model on the test set, as a percentage

#LogisticRegression,SGDClassifier
#SVC,LinearSVC,NuSVC
Пример #59
0
# Feature dict of one sample negative review.
fe = find_features(movie_reviews.words('neg/cv000_29416.txt'))

featuresets = [(find_features(rev), category) for (rev, category) in documents]

#or rev in documents:
#  featuresets.append((find_features(rev)))

# First 1900 examples train, the remainder tests.
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("accuracy : ", (nltk.classify.accuracy(classifier, testing_set)))
classifier.show_most_informative_features(15)

# Accuracy is measured on the held-out testing_set, like the Naive Bayes
# model above; scoring on the data a model was trained on overstates it.
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("mnb classifier accuracy status is: ",
      (nltk.classify.accuracy(MNB_classifier, testing_set)))

#Gau_classifier = SklearnClassifier(GaussianNB())
#Gau_classifier.train(training_set)
#print("gaussian classifier acccuracy is:",(nltk.classify.accuracy(Gau_classifier,training_set)))

bernoulliNB_classifier = SklearnClassifier(BernoulliNB())
bernoulliNB_classifier.train(training_set)
print("bernoulli classifier acccuracy is:",
      (nltk.classify.accuracy(bernoulliNB_classifier, testing_set)))

#LinearRegression, SGDClassifier

# NOTE(review): LinearRegression is a regressor, not a classifier;
# LogisticRegression was presumably intended. Left unchanged because the
# file's imports are not visible from this excerpt.
LinearRegression_classifier = SklearnClassifier(LinearRegression())
Пример #60
0
def _report_accuracy(label, classifier, test_set):
    """Print the accuracy of *classifier* on *test_set* as a percentage.

    The printed line has the form ``<label> accuracy percent: <value>``.
    """
    print(label + " accuracy percent:",
          (nltk.classify.accuracy(classifier, test_set)) * 100)


def _fit_and_report(label, estimator, train_set, test_set, **wrapper_options):
    """Wrap *estimator* in a SklearnClassifier, train it once, and report it.

    ``wrapper_options`` are forwarded to ``SklearnClassifier`` (for example
    ``sparse=False``). Returns the trained wrapper.
    """
    classifier = SklearnClassifier(estimator, **wrapper_options)
    classifier.train(train_set)
    _report_accuracy(label, classifier, test_set)
    return classifier


def train_and_test_classifiers(train_set, test_set):
    """Train a suite of classifiers and print each one's test accuracy.

    Trains NLTK's Naive Bayes classifier plus a range of scikit-learn
    estimators wrapped in ``SklearnClassifier``, printing the accuracy of
    each on ``test_set``. A ``VoteClassifier`` is then built from seven of the
    trained models; its accuracy is printed, followed by its classification
    and confidence for a few individual test samples.

    Args:
        train_set: Labeled feature sets used for training, in the form
            accepted by ``NaiveBayesClassifier.train`` and
            ``SklearnClassifier.train``.
        test_set: Indexable sequence of ``(featureset, label)`` pairs used
            for scoring; must contain at least five items because samples
            0, 2, 3 and 4 are classified individually.
    """
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    _report_accuracy("Classic Naive Bayes Classifier", classifier, test_set)

    MNB_classifier = _fit_and_report(
        "Multinomial Naive Bayes Classifier",
        MultinomialNB(alpha=0.01, fit_prior=False), train_set, test_set)

    # Gaussian Naive Bayes is not trained here; only this notice is printed.
    print("Skipping Gaussian Bayes Classifier accuracy percent")

    BNB_classifier = _fit_and_report(
        "Bernoulli Naive Bayes Classifier",
        BernoulliNB(alpha=.01), train_set, test_set)

    LG_classifier = _fit_and_report(
        "Logistic Regression Classifier",
        LogisticRegression(random_state=42), train_set, test_set)

    # SGD with hinge loss and an L2 penalty.
    SGD_classifier1 = _fit_and_report(
        "Stochastic Gradient Descent Classifier 1",
        SGDClassifier(loss='hinge',
                      penalty='l2',
                      alpha=1e-3,
                      random_state=42,
                      max_iter=1000,
                      tol=None),
        train_set, test_set)

    # SGD with an Elastic Net penalty.
    SGD_classifier2 = _fit_and_report(
        "Stochastic Gradient Descent Classifier 2",
        SGDClassifier(alpha=1e-3,
                      random_state=42,
                      penalty="elasticnet",
                      max_iter=1000,
                      tol=None),
        train_set, test_set)

    # Trained exactly once, on a dense feature matrix (sparse=False).
    SVC_classifier = _fit_and_report(
        "C-Support Vector Classifier", SVC(), train_set, test_set,
        sparse=False)

    LinearSVC_classifier1 = _fit_and_report(
        "Linear Support Vector Classifier 1",
        SVC(kernel='linear', probability=True, tol=1e-3),
        train_set, test_set)

    # `penalty` is passed by keyword: scikit-learn estimator constructors do
    # not accept positional arguments in current releases.
    LinearSVC_classifier2 = _fit_and_report(
        "Linear Support Vector Classifier 2",
        LinearSVC(penalty="l1", dual=False, tol=1e-3),
        train_set, test_set)

    LinearSVC_classifier3 = _fit_and_report(
        "Linear Support Vector Classifier 3",
        LinearSVC(penalty="l2", dual=False, tol=1e-3),
        train_set, test_set)

    NuSVC_classifier = _fit_and_report(
        "Nu-Support Vector Classifier", NuSVC(), train_set, test_set)

    # NearestCentroid (aka Rocchio classifier) without a shrink threshold.
    Nearest_Centroid_classifier = _fit_and_report(
        "Nearest Centroid Classifier", NearestCentroid(), train_set, test_set)

    Ridge_classifier = _fit_and_report(
        "Ridge Classifier",
        RidgeClassifier(alpha=0.5, tol=1e-2, solver="sag"),
        train_set, test_set)

    Perceptron_classifier = _fit_and_report(
        "Perceptron Classifier", Perceptron(max_iter=1000),
        train_set, test_set)

    Passive_Aggressive_classifier = _fit_and_report(
        "Passive-Aggressive Classifier",
        PassiveAggressiveClassifier(max_iter=1000), train_set, test_set)

    kNN_classifier = _fit_and_report(
        "kNN Classifier", KNeighborsClassifier(n_neighbors=10),
        train_set, test_set)

    # Ensemble over seven of the models trained above.
    voted_classifier = VoteClassifier(classifier, MNB_classifier,
                                      BNB_classifier, LG_classifier,
                                      SGD_classifier2, LinearSVC_classifier2,
                                      NuSVC_classifier)
    _report_accuracy("Voted Classifier Classifier", voted_classifier, test_set)

    # Spot-check individual samples (index 1 is not part of the sample list).
    for sample_index in (0, 2, 3, 4):
        sample_features = test_set[sample_index][0]
        print("Classification: ", voted_classifier.classify(sample_features),
              "Confidence: %",
              voted_classifier.confidence(sample_features) * 100)