print(reflist[:30])
print(testlist[:30])

dir(nltk.metrics)
from nltk.metrics import *


reflist = []
testlist = []
for (features, label) in test_set:
    reflist.append(label) 
    testlist.append(classifier.classify(features))


cm = ConfusionMatrix(reflist, testlist)
print(cm)


from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier
classifier = nltk.classify.SklearnClassifier(LinearSVC()).train(train_set)
nltk.classify.accuracy(classifier, test_set)


# calculating true negative, false positive, false negative, true positive
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(reflist, testlist).ravel()
(tn, fp, fn, tp)
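# A minimal follow-up sketch (not part of the original snippet): deriving
# precision, recall and F1 directly from the four counts unpacked above.
prec = tp / (tp + fp) if (tp + fp) else 0.0
rec = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0
print((prec, rec, f1))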
Example no. 2
    print("Creating average feature vecs for test reviews")

    testDataVecs = getAvgFeatureVecs(getCleanReviews(test), model,
                                     num_features)

    classificador = nltk.NaiveBayesClassifier.train(trainDataVecs)
    print(classificador.labels())
    classificador.show_most_informative_features(20)

    # ****** Fit a random forest to the training set, then make predictions
    #
    # Fit a random forest to the training data, using 100 trees
    forest = RandomForestClassifier(n_estimators=100)

    print("Fitting a random forest to labeled training data...")
    forest = forest.fit(trainDataVecs, train["sentiment"])

    # Test & extract results
    result = forest.predict(testDataVecs)

    # Write the test results
    output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
    output.to_csv("saida.csv", index=False, quoting=3)

    print("Accuracy: ", accuracy_score(list(test['sentiment']), result))
    print("F1: ", f1_score(list(test['sentiment']), result))
    print("Precision: ", precision_score(list(test['sentiment']), result))
    matriz = ConfusionMatrix(test['sentiment'], result)

    print(matriz)
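    # A small addition (not part of the original snippet): recall completes the
    # accuracy / F1 / precision report printed above; recall_score is the
    # matching scikit-learn helper, assuming binary labels as elsewhere here.
    from sklearn.metrics import recall_score
    print("Recall: ", recall_score(list(test['sentiment']), result))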
Example no. 3
def report(expected, predicted, labels, log):
    
    cm = ConfusionMatrix(expected, predicted)

    log.info("Confusion matrix:\n%s", cm)

    log.info("Confusion matrix: sorted by count\n%s", cm.pretty_format(sort_by_count=True))
      
    true_positives = Counter()
    false_negatives = Counter()
    false_positives = Counter()
    missing_labels = Counter()

    #merge expected & predicted, & get unique values
    tested_labels = set(expected + predicted)

    for i in tested_labels:
        for j in tested_labels:
            if i == j:
                true_positives[i] += cm[i,j]
            else:
                false_negatives[i] += cm[i,j]
                false_positives[j] += cm[i,j]

    sb = ''
    for value, count in true_positives.most_common():
        s = '{0}={1}, '.format(value, count)
        sb += s
    log.info("True Positives (%d): %s\n", sum(true_positives.values()), sb)
    
    sb = ''
    for value, count in false_negatives.most_common():
        s = '{0}={1}, '.format(value, count)
        sb += s
    log.info("False Negatives (%d): %s\n", sum(false_negatives.values()), sb)

    sb = ''
    for value, count in false_positives.most_common():
        s = '{0}={1}, '.format(value, count)
        sb += s
    log.info("False Positives (%d): %s\n", sum(false_positives.values()), sb)

    sb = ''
    last = len(tested_labels) - 1
    for i, x in enumerate(sorted(tested_labels)):
        if true_positives[x] == 0:
            fscore = 0
        else:
            precision = true_positives[x] / float(true_positives[x]+false_positives[x])
            recall = true_positives[x] / float(true_positives[x]+false_negatives[x])
            fscore = 2 * (precision * recall) / float(precision + recall)

        if i != last:
            sb += '{0}={1}, '.format(x, fscore)
        else:
            sb += '{0}={1}'.format(x, fscore)

    log.info('F Scores: {0}\n'.format(sb))

    untested_labels = set(labels) - tested_labels

    if (len(untested_labels)):
        log.info('No F Scores for untested categories: {0}\n'.format(list(untested_labels)))
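# A minimal usage sketch (not in the original): expected/predicted are flat
# label lists and `labels` is the full label inventory; assumes the same
# module-level imports report() relies on (Counter, ConfusionMatrix) plus a
# plain stdlib logger.
import logging
logging.basicConfig(level=logging.INFO)
demo_log = logging.getLogger("report-demo")
expected_demo = ['PER', 'LOC', 'LOC', 'ORG']
predicted_demo = ['PER', 'LOC', 'ORG', 'ORG']
report(expected_demo, predicted_demo, ['PER', 'LOC', 'ORG', 'MISC'], demo_log)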
Example no. 4
t0 = DefaultTagger('NN')
t1 = UnigramTagger(text, backoff=t0)
t2 = BigramTagger(text, backoff=t1)

bigram_tagged = t2.tag(tokens_updated)
ref_tag_set = [word[1] for word in bigram_tagged]

print("********************\n")
print("*** UNDERSTANDING TAGGING ERRORS ***\n")
print("POS tags from the default pos_tag() method: \n", tags_set)
print("POS tags from the Bigram tagger in nltk: \n", ref_tag_set)

from nltk.metrics import ConfusionMatrix
from collections import Counter

cm = ConfusionMatrix(ref_tag_set, tags_set)  #Creating a Confusion Matrix

labels = set(ref_tag_set + tags_set)  #Getting all the tags present

true_positives = Counter()
false_negatives = Counter()
false_positives = Counter()

for i in labels:
    for j in labels:
        if i == j:
            true_positives[i] += cm[i, j]
        else:
            false_negatives[i] += cm[i, j]
            false_positives[j] += cm[i, j]
print(
Example no. 5
    print('- - - {} - - -'.format(label))
    print('Agreement on: {}/{}'.format(same, same + diff))
    print('Average observed agreement: {}'.format(t.avg_Ao()))
    print('Krippendorff\'s alpha: {}'.format(t.alpha()))

if len(set([t[0] for t in task])) == 2:
    # number of raters = 2
    type_arr1 = []
    type_arr2 = []
    att = align_annot_task(annot_task_type)
    att.sort(key=itemgetter(1))
    for key in set([t[1] for t in att]):
        r1, r2 = [t for t in att if t[1] == key]
        type_arr1.append(r1[2])
        type_arr2.append(r2[2])
    cm = ConfusionMatrix(type_arr1, type_arr2)

    types = ['claim', 'ne', 'example', 'other']
    print()
    print('\t'.join([''] + types))
    for tx in types:
        vals = []
        for ty in types:
            vals.append(cm[tx, ty])
        print('\t'.join([tx] + [str(v) for v in vals]))

    users_sep = list(users.items())
    users_both = [('all', users_sep[0][1] + users_sep[1][1])]
    users_comb = users_sep + users_both
else:
    users_comb = list(users.items())
dictionaries = [liwc, oplexicon, sentilex]

for dictionary in dictionaries:

    # from LexiconClassifier library
    classifier = Classifier(dictionary)

    # build the train and test set
    word_vector = negative_words + positive_words
    gold_standard = [-1 for i in range(len(negative_words))
                     ] + [1 for i in range(len(positive_words))]
    results = [classifier.classify(s) for s in word_vector]

    # print the classification results
    print 'Dictionary : ', dictionary.get_name(), '\n'
    print ConfusionMatrix(gold_standard, results).pp()
    print 'Accuracy: ', accuracy(gold_standard, results)
    for c in [0, 1, -1]:
        print 'Metrics for class ', c
        gold = set()
        test = set()
        for i, x in enumerate(gold_standard):
            if x == c:
                gold.add(i)
        for i, x in enumerate(results):
            if x == c:
                test.add(i)
        print 'Precision: ', precision(gold, test)
        print 'Recall   : ', recall(gold, test)
        print 'F_measure: ', f_measure(gold, test)
    print '\n\n'
Example no. 7
def main(argv):

    listFile1 = []
    listFile2 = []

    #choose which tags you want the program to use
    tags = ['https', 'http']
    #tags = ['COU', 'NAT', 'ENT', 'SPO', 'ANI', 'ORG', 'CIT', 'PER']

    f = open("test.set.ent")
    lines = []
    for line in f:
        tag = 0
        lines.append(line.strip())
        line = line[:-1]
        wordsline = line.rstrip("\r").split()
        tagged = ifTag(wordsline, tags)
        if tagged:
            listFile1.append(tagged)
        else:
            listFile1.append('x')
    print(listFile1)
    g = open("test.set")
    for line in g:
        tag = 0
        lines.append(line.strip())
        line = line[:-1]
        wordsline = line.rstrip("\r").split()
        tagged = ifTag(wordsline, tags)
        if tagged:
            listFile2.append(tagged)
        else:
            listFile2.append('x')
    #print(listFile2)
    #count how many links are the same in both lists
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    c = 0
    d = 0
    Hoi = 0
    TOTAL = 0
    goldstandard = 0
    onze = 0

    for link in listFile1:
        c = c + 1
        if link != "x":
            goldstandard += 1
        if listFile2[c - 1] != "x":
            onze += 1
        if link == listFile2[c - 1] and link != "x":
            TP += 1
        if listFile2[c - 1] != "x" and link == "x":
            FN += 1
        if link == "x" and listFile2[c - 1] == "x":
            TN += 1
        if link != "x" and listFile2[c - 1] == "x":
            FP += 1
        if link != "x" and listFile2[c - 1] != "x" and listFile2[c -
                                                                 1] != link:
            Hoi += 1

    print(goldstandard, "gs")
    print(onze, "ons")
    print("TP", TP)
    print("FP", FP)
    print("TN", TN)
    print("FN", FN)
    print("hoi", Hoi)
    #--------------------------------------------------------------------------

    cm = ConfusionMatrix(listFile1, listFile2)
    print(cm)

    # choose which labels you want the program to use
    labels = set("http x".split())
    #labels = set("COU NAT ENT SPO ANI ORG CIT PER x".split())

    true_positives = Counter()
    false_negatives = Counter()
    false_positives = Counter()

    for i in labels:
        for j in labels:
            if i == j:
                true_positives[i] += cm[i, j]
            else:
                false_negatives[i] += cm[i, j]
                false_positives[j] += cm[i, j]

    print("TP:", sum(true_positives.values()), true_positives)
    print("FN:", sum(false_negatives.values()), false_negatives)
    print("FP:", sum(false_positives.values()), false_positives)
    print()

    for i in sorted(labels):
        if true_positives[i] == 0:
            fscore = 0
            print("fscore = 0")
        else:
            precision = true_positives[i] / float(true_positives[i] +
                                                  false_positives[i])
            recall = true_positives[i] / float(true_positives[i] +
                                               false_negatives[i])
            fscore = 2 * (precision * recall) / float(precision + recall)
            print(i, 'fscore:', fscore, 'precision:', precision, 'recall',
                  recall)
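    # A hedged extension (not in the original): fold the per-label scores from
    # the loop above into a single macro-averaged F-score.
    fscores = []
    for i in sorted(labels):
        tp_i, fp_i, fn_i = true_positives[i], false_positives[i], false_negatives[i]
        if tp_i == 0:
            fscores.append(0.0)
        else:
            p = tp_i / float(tp_i + fp_i)
            r = tp_i / float(tp_i + fn_i)
            fscores.append(2 * p * r / (p + r))
    print("macro fscore:", sum(fscores) / len(fscores))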
Example no. 8
def main(args):
    infile = args.infile

    # Read CSV and map input columns -> answer columns
    df_input = pd.read_csv(infile)
    iomap = dict(filter(lambda s: s, map(find_answer_field, df_input.columns)))

    dfs = []
    for i, o in iomap.items():
        idata = df_input[i].fillna("-NONE-")
        odata = df_input[o].fillna("-NONE-")
        id = (df_input['HITId'] + "-\"" + df_input[i] + "\"").map(make_GUID)
        df_new = pd.DataFrame({
            "input": idata,
            "output": odata,
            "id": id,
            'worker_id': df_input['WorkerId']
        })
        dfs.append(df_new.set_index("id"))

    df = pd.concat(dfs, ignore_index=False)
    print df.shape

    # Print feedback
    print "Feedback:"
    s = df_input['Answer.Feedback']
    for line in s.loc[s.notnull()]:
        print ">> " + line

    # Collect all labels for a given input
    datamap = collections.defaultdict(lambda: [])
    for i, o in zip(df.input, df.output):
        datamap[i].append(o)

    # Compute confusion matrix
    from nltk.metrics import ConfusionMatrix
    x, ys = zip(*datamap.items())
    y0, y1 = zip(*ys)  # get first, second element of each
    c = ConfusionMatrix(y0, y1)
    print c
    acc = nltk.metrics.accuracy(y0, y1)
    print "Full cross-annotator accuracy: %.02f%%" % (100 * acc)
    print ""

    # Try binarizing: sentence/not
    ys_b = map(lambda ls: [(l if l == '-SENTENCE-' else '-NOT-') for l in ls],
               ys)
    y0, y1 = zip(*ys_b)
    c_b = ConfusionMatrix(y0, y1)
    print c_b
    acc = nltk.metrics.accuracy(y0, y1)
    print "Binarized cross-annotator accuracy: %.02f%%" % (100 * acc)
    print ""

    # Save data to file
    # d = df.to_dict('records')
    # print "Saving %d records to %s" % (len(d), args.outfile)
    # with open(args.outfile, 'w') as fd:
    #     for r in d:
    #         print >> fd, json.dumps(r)

    # Match format of old annotator script
    print "Saving %d unique lines to %s" % (len(datamap), args.outfile)
    outmap = {
        make_GUID(str(i) + args.infile): (str(i), o)
        for (i, o) in datamap.items()
    }
    with open(args.outfile, 'w') as fd:
        print >> fd, json.dumps(outmap, indent=1)
Example no. 9
def get_ConfusionMatrix(true, predicted):
    # Confusion Matrix is only valid for partial evaluation.
    true_chain = list(itertools.chain.from_iterable(true))
    predicted_chain = list(itertools.chain.from_iterable(predicted))
    print("Confusion Matrix of combined folds (partial evaluation)\n{0}".format(ConfusionMatrix(true_chain, predicted_chain)))
Example no. 10
    ['2', 55829, 'LOC'],
    ['1', 259742, 'PER'],
    ['2', 259742, 'LOC'],
    ['1', 269340, 'PER'],
    ['2', 269340, 'LOC']
]
task = AnnotationTask(data=toy_data)
print(task.kappa())
print(task.alpha())
# 16:52: Yes! It works!

# The annotator is replaced by splitting into two variables
# The item is replaced by its position in the list
toy1 = ['ORG', 'LOC', 'PER', 'PER']
toy2 = ['ORG', 'LOC', 'LOC', 'LOC']
cm = ConfusionMatrix(toy1, toy2)
print(cm)

# multilabel for a single class (one goal)
# only 2 raters

rater1 = ['no', 'no', 'no', 'no', 'no', 'yes', 'no', 'no', 'no', 'no']
rater2 = ['yes', 'no', 'no', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'yes']

if len(rater1) != len(rater2):
    raise Exception('Not good')
nb = 0
toy_data = []
while nb < len(rater1):
    toy_data.append(['1', nb, rater1[nb]])
    toy_data.append(['2', nb, rater2[nb]])
    nb += 1
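# A plausible continuation (not shown in the excerpt), mirroring the
# AnnotationTask and ConfusionMatrix calls used earlier in this example.
task = AnnotationTask(data=toy_data)
print(task.kappa())
print(task.alpha())
print(ConfusionMatrix(rater1, rater2))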
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

time_out_format_log = '%Y-%m-%d %H:%M:%S'
utc_timestamp = datetime.datetime.utcnow()
classifier = nltk.NaiveBayesClassifier.train(training_set)
time_taken_to_train = (datetime.datetime.utcnow() - utc_timestamp).total_seconds()
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set)) * 100)
print("Time taken to train: "+str(time_taken_to_train))
classifier.show_most_informative_features(15)
result_set = []
for test_rev in test_reviews:
    result_set.append(classifier.classify(test_rev))
print(ConfusionMatrix(test_labels,result_set))

###############
save_classifier = open("pickled_algos/originalnaivebayes5k.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

utc_timestamp = datetime.datetime.utcnow()
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100)
time_taken_to_train = (datetime.datetime.utcnow() - utc_timestamp).total_seconds()
print("Time taken to train: "+str(time_taken_to_train))
result_set = []
for test_rev in test_reviews:
    result_set.append(MNB_classifier.classify(test_rev))
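# A small addition mirroring the confusion matrix printed above for the
# original Naive Bayes model (not part of the original snippet).
print(ConfusionMatrix(test_labels, result_set))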
Example no. 12
tags_words_test = replace_with_UNKING(tags_words_test)
tags_words_test = replace_with_UNKED(tags_words_test)
words_train = ([w for (_, w) in tags_words_train])
words_test = ([w for (_, w) in tags_words_test])
tags_train = ([t for (t, _) in tags_words_train])
tags_test = ([t for (t, _) in tags_words_test])
distinct_tags = set(tags_train)
# calculating transition probability
cfd_tags = ConditionalFreqDist(nltk.bigrams(tags_train))
cpd_tags = ConditionalProbDist(cfd_tags, MLEProbDist)
# calculating observation likelihood
cfd_tagwords = ConditionalFreqDist(tags_words_train)
cpd_tagwords = ConditionalProbDist(cfd_tagwords, MLEProbDist)
backpointer = find_tag_for_sentences(words_test)
accuracy_with_UNKING_UNKED_tag = calculate_accuracy(tags_test, backpointer)
cm = ConfusionMatrix(tags_test, backpointer)
'''
test and train with UNK-CAP tag, UNK-ING tag and UNK-ED tag
'''
tags_words_train = add_start_end(0, 10000)
tags_words_test = add_start_end(10001, 10501)
tags_words_train = replace_with_UNKCAP(tags_words_train)
tags_words_train = replace_with_UNKING(tags_words_train)
tags_words_train = replace_with_UNKED(tags_words_train)
tags_words_test = replace_with_UNKCAP(tags_words_test)
tags_words_test = replace_with_UNKING(tags_words_test)
tags_words_test = replace_with_UNKED(tags_words_test)
words_train = ([w for (_, w) in tags_words_train])
words_test = ([w for (_, w) in tags_words_test])
tags_train = ([t for (t, _) in tags_words_train])
tags_test = ([t for (t, _) in tags_words_test])
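# A plausible continuation (not in the original excerpt), mirroring the
# transition/emission estimation and evaluation applied above to the
# UNK-ING/UNK-ED variant; the accuracy variable name is only illustrative.
cfd_tags = ConditionalFreqDist(nltk.bigrams(tags_train))
cpd_tags = ConditionalProbDist(cfd_tags, MLEProbDist)
cfd_tagwords = ConditionalFreqDist(tags_words_train)
cpd_tagwords = ConditionalProbDist(cfd_tagwords, MLEProbDist)
backpointer = find_tag_for_sentences(words_test)
accuracy_with_UNKCAP_UNKING_UNKED_tag = calculate_accuracy(tags_test, backpointer)
cm = ConfusionMatrix(tags_test, backpointer)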
Example no. 13
    predicted_tagged_list = []
    for sent in test_document:
        predicted_tagged_list.extend(
            sent_tag_list(tagger.tag(sent_word_list(sent))))

    ref = tag_list(test_document)
    print 'test tagset_______'
    print set(ref)

    print 'predicted tagset_______'
    print set(predicted_tagged_list)

    # print 'chito'
    # print ref

    cm = ConfusionMatrix(ref, predicted_tagged_list)
    print cm
    # print cm.pretty_format(show_percents=True, values_in_chart=True,
    #           truncate=None, sort_by_count=False)
    print len(predicted_tagged_list)
    print len(set(predicted_tagged_list))
    print len(ref)
    print len(set(ref))

    print_cm_accuracy(cm)

    normalized_cm = get_normalized_cm(cm)

    # print normalized_cm[1][1]
    l = np.array(normalized_cm)
    cm_out_file.write(np.array2string(l, max_line_width=200, separator=', '))
Example no. 14
    # Loading the model
    print "Loading the CRF model..."
    tagger = pycrfsuite.Tagger()
    tagger.open(model)

    # Testing progress
    #sys.stdout.write("Testing: ")
    #sys.stdout.flush()
    #pred = []
    #idx = 0
    #for i in featset.items():
    #    idx += 1
    #    if idx % 1000 == 0:
    #        sys.stdout.write('.')
    #        sys.stdout.flush()
    #    pred.append(str(tagger.tag(i)))
    print "Testing..."
    pred = tagger.tag(featset)
    tagger.close()
    pred = [str(p) for p in pred]

    # Show result
    accuracy = scores.accuracy(ref, pred)
    print "\nAccuracy: %.4f" % accuracy
    cm = ConfusionMatrix(ref, pred)
    print "Confusion Matrix:"
    print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

    # Finished?
    print "DONE!!"
Example no. 15
        print("Running on dev")
        test_data = [json.loads(line) for line in open_file("twt.dev.json")]
    else:
        print("Running on test")
        test_data = [json.loads(line) for line in open_file("twt.test.json")]
    test_data = handle_lowfreq_words(vocab)(test_data)
    twitter_model = hmm.HiddenMarkovModelTagger(symbols=hmm_model.symbols,
                                                states=tagset,
                                                transitions=transition_model,
                                                outputs=emission_model,
                                                priors=init_model)

    # Compute the accuracy - we can call this, but then we just do extra decoding
    # work. What we really need is just call nltk.metrics.accuracy on the gold and
    # predicted.
    # twitter_model.test( test_data )

    # Compute the confusion matrix, technically we would be doing this twice, as
    # when computing accuracy we would've already done this. It would be more
    # optimal to modify the hmm library. But meh.
    gold = tag_list(test_data)
    unlabeled_data = LazyMap(unlabeled_words, test_data)
    predicted_labels = list(LazyMap(twitter_model.tag, unlabeled_data))
    predicted = tag_list(predicted_labels)

    acc = accuracy(gold, predicted)
    print("Accuracy: ", acc)
    cm = ConfusionMatrix(gold, predicted)
    print(cm.pretty_format(sort_by_count=True, show_percents=True,
                           truncate=25))
Example no. 16
def evaluate(word, dlc, test_corpus, sample_size=3):

    result = {}

    # do some baseline counting
    word_senses = list(set([w[0] for w in test_corpus]))
    word_sense, word_sense_star = word_senses[0], word_senses[1]
    word_sense_count = 0
    word_sense_star_count = 0
    word_sense_majority = ''
    for w in test_corpus:
        if w[0] == word_sense:
            word_sense_count += 1
        else:
            word_sense_star_count += 1

    if word_sense_count/(word_sense_star_count+word_sense_count) > \
        word_sense_star_count/(word_sense_star_count+word_sense_count) :
        word_sense_majority = word_senses[0]
    else:
        word_sense_majority = word_senses[1]

    # baseline testing
    baseline_correct = 0
    result["majority_baseline"] = word_sense_majority
    for row in test_corpus:
        if row[0] == word_sense_majority:
            baseline_correct += 1
    result["majority_baseline_percent_correct"] = round(
        baseline_correct / len(test_corpus) * 100, 2)

    # analyze using model prediction
    result["correct_count"], result["incorrect_count"] = 0, 0
    guesses = []  # what we guessed
    actual = []  # what actually was
    correctly_guessed = []
    incorrectly_guessed = []
    for row in test_corpus:
        g = dlc.predict(word, row[1])
        guesses.append(g)
        actual.append(row[0])
        if g == row[0]:
            result["correct_count"] += 1
            correctly_guessed.append(row[1])
        else:
            result["incorrect_count"] += 1
            incorrectly_guessed.append(row[1])

    result["correct_guess_sample"] = random.sample(correctly_guessed,
                                                   sample_size)
    result["incorrect_guess_sample"] = random.sample(incorrectly_guessed,
                                                     sample_size)
    result["percent_correct"] = round(
        (result["correct_count"] / len(test_corpus)) * 100, 2)

    # confusion
    result["confusion_matrix"] = ConfusionMatrix(actual, guesses)

    # calculate true/false_positive/negatives for both senses
    true_pos = Counter()
    false_neg = Counter()
    false_pos = Counter()
    for i in [word_sense, word_sense_star]:
        for j in [word_sense, word_sense_star]:
            if i == j:
                true_pos[i] += result["confusion_matrix"][i, j]
            else:
                false_neg[i] += result["confusion_matrix"][i, j]
                false_pos[j] += result["confusion_matrix"][i, j]

    # # precision
    # result["precision_word"] = true_pos[word_sense] / float(true_pos[word_sense]+false_pos[word_sense])
    # if float(true_pos[word_sense_star]+false_pos[word_sense_star]) == 0:
    #     result["precision_word_star"] = 0
    # else:
    #     result["precision_word_star"] = true_pos[word_sense_star] / float(true_pos[word_sense_star]+false_pos[word_sense_star])
    #
    # # recall
    # result["recall_word"] = true_pos[word_sense] / float(true_pos[word_sense]+false_neg[word_sense])
    # result["recall_word_star"] = true_pos[word_sense_star] / float(true_pos[word_sense_star]+false_neg[word_sense_star])
    #
    # # macros
    # result["macro_precision"] = (float(result["recall_word"]) + float(result["recall_word_star"])) / 2.0
    # result["macro_recall"] = (float(result["recall_word"]) + float(result["recall_word_star"])) / 2.0
    #
    # result["word_sense"] = word_sense
    # result["word_sense_star"] = word_sense_star

    return result
Example no. 17
for i in labels:
    for j in labels:
        if i == j:
            true_positives[i] += cm[i,j]
        else:
            false_negatives[i] += cm[i,j]
            false_positives[j] += cm[i,j]

print "TP:", sum(true_positives.values()), true_positives
print "FN:", sum(false_negatives.values()), false_negatives
print "FP:", sum(false_positives.values()), false_positives
"""

ref    = 'DET NN VB DET JJ JJ NN  IN NN VB DET NN'.split()
tagged = 'DET VB VB DET VB VB DET NN JJ JJ NN  IN'.split()
cm = ConfusionMatrix(ref, tagged)
labels_ref = set(ref)
labels_tagged=set(tagged)
print cm


true_positives = {}
false_negatives = Counter()
false_positives = Counter()

print ref
print tagged

set_ref=[]
set_tagged=[]
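# A hedged continuation (not in the original excerpt): set_ref/set_tagged are
# presumably meant to hold, per label, the token positions carrying that
# label, so nltk's precision/recall can be applied per tag (assumes those
# functions are imported from nltk.metrics).
for label in sorted(labels_ref | labels_tagged):
    ref_idx = set(i for i, t in enumerate(ref) if t == label)
    tag_idx = set(i for i, t in enumerate(tagged) if t == label)
    print label, precision(ref_idx, tag_idx), recall(ref_idx, tag_idx)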
Example no. 18
    result = classifier.classify(feature)
    if result != target:
        errors.append((target, result, feature))

for (target, result, feature) in errors:
    print(target, result, feature)

# using the confusion matrix to see how the data breaks down into errors and correct predictions
y_test = []
y_pred = []
for feature, target in dataset_test:
    result = classifier.classify(feature)
    y_test.append(target)
    y_pred.append(result)

cm = ConfusionMatrix(y_test, y_pred)
print(cm)

# 1. Scenario
# 2. Number of classes 16%
# 3. ZeroRules 21.051%

# Testing the classifier on new data
phrase_test = 'eu sinto amor por voce'
test_stemming = []
stemmer = RSLPStemmer()
for word in phrase_test.split():
    w_stemming = [p for p in word.split()]
    test_stemming.append(str(stemmer.stem(w_stemming[0])))
#print(test_stemming)
Example no. 19
# -- Evaluating the results --
erros = []
for (frase, classe) in base_compl_teste:
    resultado = classificador.classify(frase)  # the algorithm's prediction
    if resultado != classe:
        erros.append((classe, resultado, frase))

# -- Viewing the results --
resultado_esperado = []
resultado_previsto = []
for (frase, classe) in base_compl_teste:
    resultado = classificador.classify(frase)
    resultado_previsto.append(resultado)
    resultado_esperado.append(classe)

matriz = ConfusionMatrix(resultado_esperado, resultado_previsto)
print(matriz)

# -- Test phrase --
teste = 'estou muito alegre hoje'
teste_stemming = []
stemmer = nltk.stem.RSLPStemmer()
for (palavras_treinamento) in teste.split():
    com_stem = [p for p in palavras_treinamento.split()]
    teste_stemming.append(str(stemmer.stem(com_stem[0])))

novo = seleciona_palavra_frase(teste_stemming)
print(teste_stemming)
print(teste)
distribuicao = classificador.prob_classify(novo)
for classe in distribuicao.samples():
Example no. 20
        erros.append((classe, resultado, frase))
#for (classe, resultado, frase) in erros:
#    print(classe, resultado, frase)

from nltk.metrics import ConfusionMatrix

esperado = []
previsto = []
for (frase, classe) in basecompletateste:
    resultado = classificador.classify(frase)
    previsto.append(resultado)
    esperado.append(classe)

#esperado = 'alegria alegria alegria alegria medo medo surpresa surpresa'.split()
#previsto = 'alegria alegria medo surpresa medo medo medo surpresa'.split()
matriz = ConfusionMatrix(esperado, previsto)
print(matriz)

# 1. Scenario
# 2. Number of classes - 16%
# 3. ZeroRules - 21.05%

teste = 'eu sinto amor por voce'
testestemming = []
stemmer = nltk.stem.RSLPStemmer()
for (palavrastreinamento) in teste.split():
    comstem = [p for p in palavrastreinamento.split()]
    testestemming.append(str(stemmer.stem(comstem[0])))
#print(testestemming)

novo = extratorpalavras(testestemming)
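# A plausible continuation (not shown in this excerpt), mirroring the
# prob_classify readout used in the previous example.
distribuicao = classificador.prob_classify(novo)
for classe in distribuicao.samples():
    print(classe, distribuicao.prob(classe))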
Example no. 21
errosMatriz = []
for (termos, classe) in dicionariofinalteste:
    result = classificador.classify(termos)
    if result != classe:
       errosMatriz.append((classe, result, termos))

from nltk.metrics import ConfusionMatrix
espera = []
real = []
for (texto, classe) in dicionariofinalteste:
    result = classificador.classify(texto)
    real.append(result)
    espera.append(classe)

matrizConfusao = ConfusionMatrix(espera, real)
print(matrizConfusao)


noticia = 'Em uma derrota para o governo do presidente Jair Bolsonaro, a Câmara aprovou na noite desta terça-feira, 5, uma versão desidratada do projeto de lei do Executivo que tratava sobre a posse e o porte de armas. Depois de uma série de tentativas de se aprovar a matéria em plenário, parlamentares fecharam um acordo para votar apenas partes do projeto que tratam de regras para colecionadores, atiradores e caçadores (CACs), além de mudar penas de crimes com armas e outros temas. O projeto foi aprovado com 283 votos a favor e 140 contra, além de duas abstenções.\
Em maio, Bolsonaro editou um decreto que facilitou o porte de arma e o acesso a munições para os CACs. Mas, no fim de junho, o presidente revogou o texto e outros dois, também sobre armas, e enviou ao Congresso esse projeto que originalmente tratava também sobre o registro, posse e comercialização de armas de fogo e munição e também sobre o Sistema Nacional de Armas (Sinarm). Além de retirar algumas medidas previstas no texto do governo, os deputados também incluíram aumento de algumas penas previstas, como a para quem for flagrado em posse ou portando, de maneira irregular, uma arma.\
Foi aprovada ainda uma emenda do deputado Arthur Lira (PP-AL) que especifica que o atirador esportivo, maior de 25 anos, terá direito ao porte de armas somente depois de cinco anos da primeira emissão do certificado de registro, em vez de dois anos depois, como constava da redação proposta pelo relator, deputado Alexandre Leite (DEM-SP). Foram retiradas do texto qualquer possibilidade de estender porte e posse a outras categorias, como queria o governo. Agora, o Executivo deve enviar um novo texto à Câmara amanhã para tratar da ampliação das categorias que têm direito a porte de arma para o exercício de sua profissão e outros assuntos. No fim de agosto, a Câmara aprovou a flexibiização da posse estendida de armas de fogo em propriedades rurais, a primeira legislação pró-arma aprovada no Congresso desde o início do governo Bolsonaro.'
noticiastemming = []
stemmer = nltk.stem.RSLPStemmer()
for (termostreino) in noticia.split():
    temstem = [p for p in termostreino.split()]
    noticiastemming.append(str(stemmer.stem(temstem[0])))

noticiaclass = extracaotermos(noticiastemming)
distribuir = classificador.prob_classify(noticiaclass)
Example no. 22
    if "|" in str2tuple(j)[1]:
        temp=str2tuple(j)
        b.append((temp[0],temp[1].split("|")[0]))
    else:
        b.append(str2tuple(j))

#Creating lists of predicted and actual tags from a and b
predicted=[]
for i in a:
    predicted.append(i[1])
actual=[]
for i in b:
    actual.append(i[1])
    
#Creating Confusion Matrix
matrix = ConfusionMatrix(actual, predicted)
print("Confusion Matrix")
print(matrix)

#Check for Accuracy
print("Accuracy: " +str(accuracy_score(actual,predicted)))   

#Writing Confusion Matrix to pos-taggingreport.txt file
with open('pos-taggingreport.txt', 'w') as f:
    #Writing to file
    f.write("Confusion Matrix: "+str(matrix))
    f.write("Accuracy: "+str(accuracy_score(actual,predicted)))             ##Accuracy: 0.8485675066873152

#Record the end time to measure the total runtime of scorer.py
end=time.time()
Example no. 23
    predicted_set[predicted_label].add(index)
    predicted_set_cm.append(predicted_label)

from nltk.metrics import precision, recall, f_measure, ConfusionMatrix

#print('pos precision:', precision(actual_set['pos'], predicted_set['pos']))
#print('pos recall:', recall(actual_set['pos'], predicted_set['pos']))
#print('pos F-measure:', f_measure(actual_set['pos'], predicted_set['pos']))
#print('neg precision:', precision(actual_set['neg'], predicted_set['neg']))
#print('neg recall:', recall(actual_set['neg'], predicted_set['neg']))
#print('neg F-measure:', f_measure(actual_set['neg'], predicted_set['neg']))

# The confusion matrix is a table we will use to describe the performance of the classifier
print('*****************| Performance of the classifier |*******************')
cm = ConfusionMatrix(actual_set_cm, predicted_set_cm)
print(cm)

# The table printed by print(cm) can be read as follows:
# – 761 negative tweets were correctly classified as negative (TN)
# – 239 negative tweets were incorrectly classified as positive (FP)
# – 231 positive tweets were incorrectly classified as negative (FN)
# – 769 positive tweets were correctly classified as positive (TP)

# Print the performance as percentages
print('*****************| Percentage of the performance |*******************')
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))
# – 38.0% negative tweets were correctly classified as negative (TN)
# – 11.9% negative tweets were incorrectly classified as positive (FP)
# – 11.6% positive tweets were incorrectly classified as negative (FN)
# – 38.5% positive tweets were correctly classified as positive (TP)
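# A small sketch (not part of the original snippet): the four counts described
# in the comments above can be read straight off the nltk ConfusionMatrix by
# indexing it as cm[reference_label, predicted_label]; 'neg'/'pos' are assumed
# to be the two class labels used by this classifier.
tn = cm['neg', 'neg']  # negative tweets correctly classified as negative
fp = cm['neg', 'pos']  # negative tweets incorrectly classified as positive
fn = cm['pos', 'neg']  # positive tweets incorrectly classified as negative
tp = cm['pos', 'pos']  # positive tweets correctly classified as positive
print('TN={} FP={} FN={} TP={}'.format(tn, fp, fn, tp))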