Example No. 1
def kfold_svm(data):
    start = timeit.default_timer()
    accuracy, precision, recall, f1, stdev = kfoldcv(SVM, data, folds=10, type=CLASSIFICATION, kernel=POLYNOMIAL)
    stop = timeit.default_timer()
    print '*SVM*'
    print 'Accuracy: ' + str(accuracy)
    print 'Precision: ' + str(precision)
    print 'Recall: ' + str(recall)
    print 'F1-score: ' + str(f1)
    print 'STDev: ' + str(stdev)
    print 'Time: ' + str(stop - start)
    print
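For context, a minimal sketch of how the data argument might be prepared before calling kfold_svm(). The CSV path, the labeling rule, and the csv() import from pattern.db (used by later examples) are our own assumptions, not part of the snippet:

from pattern.db import csv
from pattern.vector import Document

# Hypothetical input: a CSV with (text, label) rows.
data = csv('reviews.csv')
data = [Document(text, type=label, stopwords=True) for text, label in data]
kfold_svm(data)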
Example No. 2
def kfold_slp(data, itr=3):
    start = timeit.default_timer()
    accuracy, precision, recall, f1, stdev = kfoldcv(SLP, data, folds=10, iterations=itr)
    stop = timeit.default_timer()
    print '*SLP' + str(itr) + '*'
    print 'Accuracy: ' + str(accuracy)
    print 'Precision: ' + str(precision)
    print 'Recall: ' + str(recall)
    print 'F1-score: ' + str(f1)
    print 'STDev: ' + str(stdev)
    print 'Time: ' + str(stop - start)
    print
Example No. 3
def kfold_nb(data):
    start = timeit.default_timer()
    accuracy, precision, recall, f1, stdev = kfoldcv(NB, data, folds=10, method=MULTINOMIAL)
    stop = timeit.default_timer()
    print '*NB*'
    print 'Accuracy: ' + str(accuracy)
    print 'Precision: ' + str(precision)
    print 'Recall: ' + str(recall)
    print 'F1-score: ' + str(f1)
    print 'STDev: ' + str(stdev)
    print 'Time: ' + str(stop - start)
    print
Example No. 4
def kfold_knn(data, kk=9):
    start = timeit.default_timer()
    accuracy, precision, recall, f1, stdev = kfoldcv(KNN, data, folds=10, k=kk, distance=COSINE)
    stop = timeit.default_timer()
    print '*KNN' + str(kk) + '*'
    print 'Accuracy: ' + str(accuracy)
    print 'Precision: ' + str(precision)
    print 'Recall: ' + str(recall)
    print 'F1-score: ' + str(f1)
    print 'STDev: ' + str(stdev)
    print 'Time: ' + str(stop - start)
    print
Example No. 5
def validate(trainingSet):
    # Prints an (accuracy, precision, recall, F1, stdev)-tuple.
    print '\n10-fold cross-validation results on training set:'
    print kfoldcv(NB, trainingSet, folds=10)
    print ''
Example No. 6
    def vector(self, name): 
        """ Returns a dictionary with character bigrams and suffix.
            For example, "Felix" => {"Fe":1, "el":1, "li":1, "ix":1, "ix$":1, 5:1}
        """
        v = chngrams(name, n=2)
        v = count(v)
        v[name[-2:]+"$"] = 1
        v[len(name)] = 1
        return v

data = csv("given-names.csv")

# Test average (accuracy, precision, recall, F-score).

print kfoldcv(GenderByName, data, folds=3) # (0.81, 0.79, 0.77, 0.78)

# Train and save the classifier.
# With final=True, discards the original training data (= smaller file).

g = GenderByName(train=data)
g.save("gender-by-name.svm", final=True)

# Next time, we can simply load the trained classifier.
# Keep in mind that the script that loads the classifier
# must include the code for the GenderByName class description,
# otherwise Python won't know how to load the data.

g = GenderByName.load("gender-by-name.svm")

for name in (
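Only the vector() method survives in this and similar snippets below; the class around it is elided. A plausible reconstruction, assuming GenderByName wraps one of pattern's classifiers (SVM is a guess, suggested by the .svm filename) and routes raw names through vector(); a sketch, not the original class:

from pattern.vector import SVM, chngrams, count

class GenderByName(SVM):

    def train(self, name, gender=None):
        SVM.train(self, self.vector(name), gender)

    def classify(self, name):
        return SVM.classify(self, self.vector(name))

    def vector(self, name):
        # Character bigrams + "$"-marked suffix + name length, as above.
        v = count(chngrams(name, n=2))
        v[name[-2:] + "$"] = 1
        v[len(name)] = 1
        return v

With a class like this, explicit training and prediction would look like:

g = GenderByName()
for name, gender in csv("given-names.csv"):
    g.train(name, gender)
print(g.classify("Felix"))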
Example No. 7
# (accuracy, precision, recall, f1) as returned by an earlier test/kfoldcv call:
print accuracy, precision, recall, f1
# class distribution and confusion matrix
print nb.distribution
print nb.confusion_matrix(data[500:])
print nb.confusion_matrix(data[500:])(True)  # (TP, TN, FP, FN)
# precision and recall, overall and per class
print nb.test(data[500:], target=True)   # metrics for the positive class
print nb.test(data[500:], target=False)  # metrics for the negative class
print nb.test(data[500:])                # overall (accuracy, precision, recall, F1)
# k-fold cross validation
data = csv('data/input/reviews.csv')
data = [(review, int(rating) >= 3) for review, rating in data]  # label: rating >= 3
data = [
    Document(review, type=label, stopwords=True) for review, label in data
]
print kfoldcv(NB, data, folds=10)
print kfoldcv(KNN, data, folds=10, k=3, distance=EUCLIDEAN)
# feature selection


def v(review):
    """ Returns a count of lemmata for adjectives, nouns, verbs and exclamations
        in the first sentence of the given review.
    """
    s = parsetree(review, lemmata=True)[0]  # first sentence only
    words = [w.lemma for w in s if w.tag.startswith(('JJ', 'NN', 'VB', '!'))]
    return count(words)


data = csv('data/input/reviews.csv')
data = [(v(review), int(rating) >= 3) for review, rating in data]
print kfoldcv(NB, data)
data = csv('data/input/reviews.csv')
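To make the custom vector concrete: v() keeps only adjectives, nouns, verbs and exclamations, lemmatized and counted. A sketch, assuming parsetree was imported from pattern.en; the exact output depends on pattern's tagger:

print(v("The food was very nice"))
# Something like: {u'food': 1, u'be': 1, u'nice': 1}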
Example No. 8
    def vector(self, name):
        """ Returns a dictionary with character bigrams and suffix.
            For example, "Felix" => {"Fe":1, "el":1, "li":1, "ix":1, "ix$":1, 5:1}
        """
        v = chngrams(name, n=2)
        v = count(v)
        v[name[-2:] + "$"] = 1
        v[len(name)] = 1
        return v


data = csv(pd("given-names.csv"))

# Test average (accuracy, precision, recall, F-score, standard deviation).

print(kfoldcv(GenderByName, data, folds=3))  # (0.81, 0.79, 0.77, 0.78, 0.00)

# Train and save the classifier in the current folder.
# With final=True, discards the original training data (= smaller file).

g = GenderByName(train=data)
g.save(pd("gender-by-name.svm"), final=True)

# Next time, we can simply load the trained classifier.
# Keep in mind that the script that loads the classifier
# must include the code for the GenderByName class description,
# otherwise Python won't know how to load the data.

g = GenderByName.load(pd("gender-by-name.svm"))

for name in ("Felix", "Felicia", "Rover", "Kitty", "Legolas", "Arwen", "Jabba",
Example No. 9
URL = re.compile(r"https?://[^\s]+")  # http://www.emrg.be
REF = re.compile(r"@[a-z0-9_./]+", flags=re.I)  # @tom_de_smedt

from pattern.db import Datasheet, pd

train = []
for name, alignment, tweet in Datasheet.load(pd("good-evil.csv")):
    tweet = URL.sub("http://", tweet)  # Anonymize URLs.
    tweet = REF.sub("@friend", tweet)  # Anonymize usernames.
    train.append((ngram_vector(tweet, 5), alignment))

# ------------------------------------------------------------------------------------

# Let's look at the statistical accuracy of the classifier:
print kfoldcv(SVM, train, folds=3)
print

# This returns an (accuracy, precision, recall, F1-score, stdev)-tuple.
# The F1-score is the most important.
# An SVM trained on our data would be 94.6% accurate at telling good from evil
# (this is a suspiciously high accuracy).
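Since kfoldcv() returns that 5-tuple, it can be unpacked for reporting. A trivial sketch, reusing the train list built above:

accuracy, precision, recall, f1, stdev = kfoldcv(SVM, train, folds=3)
print("F1: %.3f (+/- %.3f)" % (f1, stdev))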

# ------------------------------------------------------------------------------------

classifier = SVM(train)

print classifier.distribution
print
# This reveals that there are 13,000 good tweets and 5,000 evil tweets.
# This means the classifier is biased to predict "good",
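One way to see why 94.6% is suspicious is to compare it against the majority-class baseline implied by the skew. A sketch; the 13,000 / 5,000 counts come from the comment above:

counts = classifier.distribution  # e.g. {'good': 13000, 'evil': 5000}
baseline = max(counts.values()) / float(sum(counts.values()))
print(baseline)  # ~0.72 for always guessing "good"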
Example No. 10
    def vector(self, name): 
        """ Returns a dictionary with character bigrams and suffix.
            For example, "Felix" => {"Fe":1, "el":1, "li":1, "ix":1, "ix$":1, 5:1}
        """
        v = chngrams(name, n=2)
        v = count(v)
        v[name[-2:] + "$"] = 1
        v[len(name)] = 1
        return v

data = csv(pd("given-names.csv"))

# Test average (accuracy, precision, recall, F-score, standard deviation).

print(kfoldcv(GenderByName, data, folds=3))  # (0.81, 0.79, 0.77, 0.78, 0.00)

# Train and save the classifier in the current folder.
# With final=True, discards the original training data (= smaller file).

g = GenderByName(train=data)
g.save(pd("gender-by-name.svm"), final=True)

# Next time, we can simply load the trained classifier.
# Keep in mind that the script that loads the classifier
# must include the code for the GenderByName class description,
# otherwise Python won't know how to load the data.

g = GenderByName.load(pd("gender-by-name.svm"))

for name in (
Example No. 11
URL = re.compile(r"https?://[^\s]+")           # http://www.emrg.be
REF = re.compile(r"@[a-z0-9_./]+", flags=re.I) # @tom_de_smedt

from pattern.db import Datasheet, pd

train = []
for name, alignment, tweet in Datasheet.load(pd("good-evil.csv")):
    tweet = URL.sub("http://", tweet) # Anonymize URLs.
    tweet = REF.sub("@friend", tweet) # Anonymize usernames.
    train.append((ngram_vector(tweet, 5), alignment))
    
# ------------------------------------------------------------------------------------

# Let's look at the statistical accuracy of the classifier:
print kfoldcv(SVM, train, folds=3)
print

# This returns an (accuracy, precision, recall, F1-score, stdev)-tuple.
# The F1-score is the most important.
# An SVM trained on our data would be 94.6% accurate at telling good from evil
# (this is a suspiciously high accuracy).

# ------------------------------------------------------------------------------------

classifier = SVM(train)

print classifier.distribution
print
# This reveals that there are 13,000 good tweets and 5,000 evil tweets.
# This means the classifier is biased to predict "good",
Example No. 12
# How many features were there before filtering?
print len(refinedfeatures)


cleanFeatures = [i for i in refinedfeatures if i not in avoidList]


# How many features remain after filtering out the avoid list?
print len(cleanFeatures)

# Rebuild the model with only the filtered features.
model = model.filter(features=cleanFeatures)

# k-fold cross-validation results; instead of Naive Bayes (NB) you can also try SVM, SLP, KNN, etc.

print kfoldcv(NB, model)
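As the comment says, any pattern classifier class can be dropped into kfoldcv(). A sketch on the same model, assuming the classes are imported from pattern.vector:

from pattern.vector import SVM, SLP, KNN

print(kfoldcv(SVM, model))
print(kfoldcv(SLP, model, iterations=3))
print(kfoldcv(KNN, model, k=5))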

# Collect each feature and its information-gain weight, for writing to a CSV file.

listofFeatures = []

for i in cleanFeatures:
    innerList = [i, model.ig(i)]
    listofFeatures.append(innerList)
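The loop above collects the rows, but the snippet stops before anything is written. A minimal completion with the standard library csv module (the filename and header row are our own):

import csv

with open("feature-weights.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerow(["feature", "information_gain"])
    writer.writerows(listofFeatures)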
Example No. 13
    #print(vectors.vectors)
else:
    vectors = documents

if options["train"]:
    if classifier_type == "SVM":
        classifier = SVM(train=vectors,
                         type=svm_type,
                         kernel=svm_kernel)
    else:
        classifier = getattr(pattern.vector, classifier_type)(train=vectors)

    print("Classes: " + repr(classifier.classes))

    #performance = kfoldcv(NB, vectors, folds=n_fold)
    performance = kfoldcv(type(classifier), vectors, folds=n_fold)
    print("Accuracy: %.3f\n" \
          "Precision: %.3f\n" \
          "Recall: %.3f\n" \
          "F1: %.3f\n" \
          "Stddev:%.3f" % performance)
    print()
    print("Confusion matrx:")
    print(classifier.confusion_matrix(vectors).table)

    classifier.save(trained_filename)
elif options["predict"]:
    classifier = Classifier.load(trained_filename)

    print("#Author\tURL\tPrediction\tActual")
    for v in vectors:
Example No. 14
    def vector(self, name):
        """ Returns a dictionary with character bigrams and suffix.
            For example, "Felix" => {"Fe":1, "el":1, "li":1, "ix":1, "ix$":1, 5:1}
        """
        v = chngrams(name, n=2)
        v = count(v)
        v[name[-2:] + "$"] = 1
        v[len(name)] = 1
        return v


data = csv("given-names.csv")

# Test average (accuracy, precision, recall, F-score).

print kfoldcv(GenderByName, data, folds=3)  # (0.81, 0.79, 0.77, 0.78)

# Train and save the classifier.
# With final=True, discards the original training data (= smaller file).

g = GenderByName(train=data)
g.save("gender-by-name.svm", final=True)

# Next time, we can simply load the trained classifier.
# Keep in mind that the script that loads the classifier
# must include the code for the GenderByName class description,
# otherwise Python won't know how to load the data.

g = GenderByName.load("gender-by-name.svm")

for name in ("Felix", "Felicia", "Rover", "Kitty", "Legolas", "Arwen", "Jabba",