Exemplo n.º 1
0
    def test_plot(self):
        """Check that ``ConfusionMatrix.plot`` runs with the supported backends.

        The test is skipped when matplotlib cannot be imported. It builds a
        small three-class confusion matrix, plots it with the default backend
        and with ``backend='seaborn'``, and finally verifies that an unknown
        backend name raises ``ValueError``.
        """
        try:
            import matplotlib.pyplot        # noqa
        except ImportError:
            # Skip through unittest itself rather than importing nose: if
            # nose is not installed, the nose import would raise ImportError
            # and turn the intended skip into a test error.
            self.skipTest('matplotlib is not installed')

        y_true = ['rabbit', 'cat', 'rabbit', 'rabbit', 'cat', 'dog',
                  'dog', 'rabbit', 'rabbit', 'cat', 'dog', 'rabbit']
        y_pred = ['cat', 'cat', 'rabbit', 'dog', 'cat', 'rabbit', 'dog',
                  'cat', 'rabbit', 'cat', 'rabbit', 'rabbit']

        cm = ConfusionMatrix(y_true, y_pred)

        # check plot works
        cm.plot()
        cm.plot(backend='seaborn')

        # An unsupported backend name must be rejected.
        with self.assertRaises(ValueError):
            cm.plot(backend='xxx')
                                   predictions.tolist())
confusion_matrix

# Few stats: overall accuracy and kappa, rounded to two decimals.
# NOTE(review): `confusion_matrix` must already exist here; it is only
# (re)assigned further down in the visible lines — confirm it is built above.
cms = confusion_matrix.stats()
print("Overall Accuracy is ", round(cms['overall']['Accuracy'], 2),
      ", Kappa is ", round(cms['overall']['Kappa'], 2))

# Predict on test data, then rebuild the confusion matrix from the true
# responses and the predictions (both converted to plain lists).
predictions = classifier.predict(test[listAllPredictiveFeatures])
confusion_matrix = ConfusionMatrix(test[strResponse].tolist(),
                                   predictions.tolist())
# Bare expression: shows the object in an interactive session / notebook,
# and has no visible effect when run as a plain script.
confusion_matrix

# normalized confusion matrix
confusion_matrix.plot(normalized=True)
plt.show()

# Statistics are also available as follows: print the full report, then
# fetch the stats mapping to report overall accuracy and kappa.
confusion_matrix.print_stats()
cms = confusion_matrix.stats()
print("Overall Accuracy is ", round(cms['overall']['Accuracy'], 2),
      ", Kappa is ", round(cms['overall']['Kappa'], 2))
# Recorded output of an earlier run labelled "5000" (presumably the sample
# size — TODO confirm): Overall Accuracy is  0.8 , Kappa is  0.24

# Per-class statistics. After reset_index() the statistic names are read from
# the 'index' column and filtered by substring.
# NOTE(review): assumes cms['class'] is a pandas DataFrame — confirm.
df = cms['class'].reset_index()
# Bare expressions again: each selects the rows for one kind of statistic.
df[df['index'].str.contains('Precision')]
df[df['index'].str.contains('Sensitivity')]
df[df['index'].str.contains('Specificity')]

# How to predict probabilities of each class
def evaluate(textFile,
             valueFile=None,
             varStatusBar=None,
             varCmOutput=None,
             varOutput=None):
    """Predict an emotion for every line of ``textFile`` and log the results.

    For each line the words returned by ``WordFilter.filterWords`` are scored
    with ``evaluateWord``; the per-word values are summed column-wise, added
    to the log10 priors read from ``./data/Priors.csv`` and passed to
    ``guessEmotion``. One CSV row per line is appended to a timestamped file
    under ``./evaluations/`` and a readable entry to a timestamped file under
    ``./reports/``.

    When ``valueFile`` is given, one line is read from it per text line; its
    comma-separated fields after the first are parsed as floats and passed to
    ``guessEmotion`` to obtain the reference emotion. Mismatches are written
    to ``./data/RealPred.csv``, a ``ConfusionMatrix`` is built and written to
    a timestamped file under ``./statistics/``, and the user is asked via
    ``tkMessageBox`` whether to display the plot.

    Args:
        textFile: Iterable of text lines; the text before the first comma of
            each line is used as the line ID.
        valueFile: Optional object with ``readline()`` supplying the
            reference values, one line per line of ``textFile``.
        varStatusBar: Optional object with ``set(str)`` that receives status
            messages (presumably a Tk variable — confirm with the caller).
        varCmOutput: Optional object with ``set(str)`` that receives the
            confusion matrix text.
        varOutput: Optional object with ``set(str)`` that receives the
            accuracy text.

    Raises:
        IOError: Re-raised from ``evaluateWord`` after the status message has
            been set.
    """
    # NOTE(review): the timestamp contains ':' which is not a valid file name
    # character on every platform — confirm the target OS.
    timestamp = strftime("%Y-%m-%d:%H-%M-%S")
    reportFile = "./reports/" + timestamp + ".txt"
    outputFile = "./evaluations/" + timestamp + ".csv"
    statsFile = "./statistics/" + timestamp + ".txt"
    wf = WordFilter()
    totalReal = []
    totalPred = []
    with open("./data/Priors.csv", "r") as priorFile:
        print(priorFile)
        # First field of the priors line is skipped; the rest are
        # probabilities that are converted to log10.
        priors = priorFile.readline().strip().split(',')[1:]
        priors = [log10(float(x)) for x in priors]

    lst = []
    lst.append(("Real Emotion", "Predicted Emotion", "Tweet"))
    for line in tqdm(textFile):
        lineID = line.split(',')[0]
        words = wf.filterWords(line)

        predValues = []
        unfound = []

        for word in words:
            try:
                values = evaluateWord(word)
            except IOError:
                # Only report to the status bar when one was supplied, so a
                # missing status bar cannot mask the original IOError.
                if varStatusBar is not None:
                    varStatusBar.set(
                        "WordMap not found. Please train system first.")
                raise
            if values is not None:
                predValues.append(values)
            else:
                unfound.append(word)

        # Materialise both sums as lists: predValues is consumed twice (for
        # predProb and for valueFormat), and a Python 3 ``map`` iterator
        # would already be exhausted on the second use.
        predValues = [sum(column) for column in zip(*predValues)]
        predProb = [sum(pair) for pair in zip(priors, predValues)]
        predEmotion = guessEmotion(predProb)
        valueFormat = ",".join("%.2f" % n for n in predValues)

        if valueFile:
            realValues = [
                float(i) for i in valueFile.readline().strip().split(',')[1:]
            ]
            realEmotion = guessEmotion(realValues)
            # Lines without any known word are left out of the statistics.
            if predEmotion != "No Words Found":
                totalReal.append(realEmotion)
                totalPred.append(predEmotion)

                if realEmotion != predEmotion:
                    lst.append((realEmotion, predEmotion, line))

        with open(outputFile, "a+") as output:
            output.write("{},{},{}\n".format(lineID, predEmotion, valueFormat))

        with open(reportFile, "a+") as report:
            report.write("{}\n".format(line))
            report.write("Filtered: {}\n".format(words))
            report.write("Words not found:{}\n".format(unfound))
            report.write("Emotion probabilities: {}\n".format(valueFormat))
            report.write("Predicted emotion: {}\n".format(predEmotion))
            if valueFile:
                report.write("Correct emotion: {}\n".format(realEmotion))
            report.write("-" * 70)
            report.write("\n")

    if valueFile:
        if varStatusBar is not None:
            varStatusBar.set("Evaluation Complete.")

        with open('./data/RealPred.csv', 'w') as realpredFile:
            writer = csv.writer(realpredFile, delimiter=',')
            writer.writerows(lst)

        cm = ConfusionMatrix(totalReal, totalPred)
        viewPlot = tkMessageBox.askyesno("Confusion Matrix",
                                         "View confusion matrix plot?")
        if viewPlot:
            normaliseData = tkMessageBox.askyesno("Confusion Matrix",
                                                  "Normalise plot?")

            # Compute the statistics once and reuse them below.
            stats = cm.stats()
            if varOutput is not None:
                varOutput.set("Accuracy: " +
                              str(stats['overall']['Accuracy']))
            if varCmOutput is not None:
                varCmOutput.set("Confusion Matrix: \n" + str(stats['cm']))

            for key, value in stats.items():
                print(key, value)

            cm.plot(normalized=normaliseData)
            plt.show()

        # "w+" truncates the file, so writing starts at offset 0.
        with open(statsFile, "w+") as report:
            report.write(str(cm))
            report.write("\n")
Exemplo n.º 4
0
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt
from pandas_ml import ConfusionMatrix

# The 20 newsgroups dataset comprises around 18000 newsgroups posts
# on 20 topics split in two subsets

# Fetch the train and test subsets of the 20 newsgroups dataset.
# NOTE(review): `datasets` and `TfidfVectorizer` are not imported in the
# visible lines — confirm they are imported above this fragment.
newsgroups_train = datasets.fetch_20newsgroups(subset='train')
newsgroups_test = datasets.fetch_20newsgroups(subset='test')

# Fit the vectorizer on the training documents only, then reuse the fitted
# vectorizer to transform the test documents.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_test = vectorizer.transform(newsgroups_test.data)

y_train = newsgroups_train.target
y_test = newsgroups_test.target

# Train a multinomial naive Bayes classifier on the training features.
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict the targets of the test documents.
predictions = model.predict(X_test)

# print model.score(X_test, y_test)
# print metrics.classification_report(y_test, predictions)

# Build and plot the confusion matrix; the newsgroup names are passed as the
# third positional argument.
# NOTE(review): y_test / predictions come from `.target` while `labels` holds
# the target names — confirm ConfusionMatrix pairs them as intended.
labels = list(newsgroups_train.target_names)
print(labels)
cm = ConfusionMatrix(y_test, predictions, labels)
cm.plot()
plt.show()
#for i in range(0,len(y_predictions)):
#print y_predictions[i], y_test.as_matrix()[i]

# NOTE(review): this fragment uses Python 2 print statements and will not
# parse on Python 3. `clf`, `X`, `y`, `X_test`, `y_test`, `y_train` and
# `y_predictions` are defined outside the visible lines.

# Print the score of the fitted classifier on the test split.
print 'Accuracy:', clf.score(X_test, y_test)

# Count the training samples per class: factorize()[0] yields one integer
# code per sample, and Counter tallies how often each code occurs.
print collections.Counter(y_train.factorize()[0])

# Draw the normalized confusion matrix for the test split.
# The commented-out lines below are a disabled attempt to collect all labels.
#le = preprocessing.LabelEncoder()
#le.fit(y.as_matrix())
#labels = le.classes_
# NOTE(review): `as_matrix()` is no longer available in recent pandas
# releases (`.values` is the usual replacement) — confirm the pandas version
# this fragment targets.
cm = ConfusionMatrix(y_test.as_matrix(), y_predictions)
cm.plot(normalized=True)

# Cross-validation scores with cv=5; print the mean followed by the
# individual scores.
scores = cross_val_score(clf, X, y, cv=5)
print scores.mean(), scores

# Fit a label encoder on all labels so its classes can be used as the
# display names in the classification report.
from sklearn.metrics import classification_report
le = preprocessing.LabelEncoder()
le.fit(y.as_matrix())
target_names = le.classes_

print classification_report(y_test, y_predictions, target_names=target_names)

# Count the test samples per class, same approach as for the training split.
print collections.Counter(y_test.factorize()[0])
'''
This will plot the correlation between the attributes
Exemplo n.º 6
0
# Restore the saved variables into the session and fetch the initial state.
saver = tf.train.Saver()
saver.restore(session, savepoint)
state = session.run(graph.initial_state)

# Run the test data through the graph batch by batch and collect one
# prediction row per sample.
for batch, labelx in enumerate(
        getBatches(testData, hyperparameters['batchSize']), 1):
    feedDict = {
        graph.input_data: labelx,
        # NOTE(review): the training dropout setting is fed during
        # evaluation; if keep_prob controls dropout it is normally 1.0 at
        # test time — confirm.
        graph.keep_prob: hyperparameters['dropoutProb'],
        graph.initial_state: state
    }

    getPredictions = session.run(graph.predictions, feed_dict=feedDict)

    # Append every row of the batch output.
    finalPredictions.extend(getPredictions)

finalPredictions = np.asarray(finalPredictions)
# Predicted class = index of the highest score in each row.
predictions = np.argmax(finalPredictions, axis=1)
# True class indices, truncated to the number of predictions actually made.
actualPredictions = testRatings.argmax(axis=1)[:predictions.shape[0]]

cm = ConfusionMatrix(actualPredictions, predictions)
cm.plot(backend='seaborn', normalized=True)
plt.title('Confusion Matrix Stars prediction')
# NOTE(review): this opens a new figure after the plot has been drawn; it
# may have been intended to come before cm.plot() — confirm.
plt.figure(figsize=(12, 10))

# Compare class indices with class indices. The raw `testRatings` array is
# neither argmax'd nor truncated to the number of predictions, so comparing
# against it does not measure accuracy.
test_correct_pred = np.equal(predictions, actualPredictions)
test_accuracy = np.mean(test_correct_pred.astype(float))

print("Test accuracy is: " + str(test_accuracy))
# Fit a Gaussian naive Bayes model and predict on the same data it was
# fitted on (X), so the figures below are training-set metrics.
model = GaussianNB()
model.fit(X, target)
yhat = model.predict(X)

print('Accuracy:')
print(metrics.accuracy_score(target, yhat))
print('Classification report:')
print(metrics.classification_report(target, yhat))

print('Confusion matrix:')
cm = ConfusionMatrix(target, yhat)
print(cm)
print('Stats:')
cm.print_stats()
ax = cm.plot(backend='seaborn', annot=True, fmt='g')
ax.set_title('Confusion Matrix')
plt.show()
plt.clf()

# The ROC curve needs a continuous score per sample. Hard class predictions
# give only a single operating point, so use the predicted probability of
# the positive class (second column of predict_proba) instead of yhat.
# NOTE(review): assumes a binary target whose positive label sorts last
# (e.g. {0, 1}) — confirm.
fpr, tpr, threshold = metrics.roc_curve(target, model.predict_proba(X)[:, 1])
roc_auc = metrics.auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
# Diagonal reference line (the chance level).
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()