Example #1
def category_by_movie():
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.corpus import names
    from nltk.classify import apply_features
    import random

    documents = [(list(mr.words(f)), c)
                 for c in mr.categories() for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    word_features = all_words.keys()[:2000]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    #print document_features(mr.words('pos/cv957_8737.txt'))
    #print documents[0]

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    print classify.accuracy(classifier, test_set)
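
Note: classify.accuracy simply returns the fraction of (featureset, label) pairs the classifier labels correctly. A minimal self-contained sketch with toy data (the feature names here are illustrative only):

from nltk import NaiveBayesClassifier, classify

toy_train = [({'contains(good)': True}, 'pos'),
             ({'contains(bad)': True}, 'neg')]
toy_test = [({'contains(good)': True}, 'pos')]
toy_clf = NaiveBayesClassifier.train(toy_train)
print(classify.accuracy(toy_clf, toy_test))  # 1.0 on this toy split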
Example #2
def cross_validate(classifier, training_set, test_set):
    chosen_classif = classifier
    best_accuracy = 0.0
    best_train_accuracy = None
    best_classifier = None
    k_fold = cross_validation.KFold(len(training_set), n_folds=10)
    for train_indices, test_indices in k_fold:
        train = itemgetter(*train_indices)(training_set)
        test = itemgetter(*test_indices)(training_set)
        classifier = chosen_classif.train(train)
        print '--------------------------------'
        train_accuracy = classify.accuracy(classifier, train)
        print 'Training set accuracy:' + str(train_accuracy)
        if len(test_indices) == 1:
            test = (test,)
        accuracy = classify.accuracy(classifier, test)
        if accuracy > best_accuracy:
            best_classifier = classifier
            best_accuracy = accuracy
            best_train_accuracy = train_accuracy
        print 'Cross validation set accuracy: ' + str(accuracy)
        get_fscore(classifier, test)
    print 'Best classifier CV accuracy: ' + str(best_accuracy)
    test_accuracy = classify.accuracy(best_classifier, test_set)
    print 'Best classifier accuracy: ' + str(test_accuracy)
    print 'Best classifier precision recall fscore: '
    print get_fscore(best_classifier, test_set)
    return [test_accuracy, best_train_accuracy, best_classifier]
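
Note: cross_validation.KFold(len(...), n_folds=10) is the pre-0.18 scikit-learn API, which later releases removed. A sketch of the same index split under the current sklearn.model_selection API (assuming scikit-learn >= 0.18):

from sklearn.model_selection import KFold

training_set = list(range(20))  # stand-in for the list of (featureset, label) pairs
for train_indices, test_indices in KFold(n_splits=10).split(training_set):
    train = [training_set[i] for i in train_indices]
    test = [training_set[i] for i in test_indices]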
Example #3
def evaluate(train_set, test_set, classifier, name):
	refsets = collections.defaultdict(set)
	testsets = collections.defaultdict(set)
	for i, (features, label) in enumerate(test_set):
		refsets[label].add(i)
		observed = classifier.classify(features)
		testsets[observed].add(i)
	# Get accuracy on training set, test set and get positive and negative recall.
	trainacc = 100 * classify.accuracy(classifier, train_set)
	testacc = 100 * classify.accuracy(classifier, test_set)
	spam_false = 100 - nltk.recall(refsets['spam'], testsets['spam'])*100
	ham_false = 100 - nltk.recall(refsets['ham'], testsets['ham'])*100
	return trainacc, testacc, spam_false, ham_false
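
Note: nltk.recall and nltk.precision operate on plain sets of item indices: the reference set holds the items whose gold label is the class, the test set the items the classifier assigned to it. A toy sketch with made-up index sets:

from nltk.metrics import precision, recall

refset = {0, 1, 2, 3}  # items whose gold label is 'spam'
testset = {0, 1, 5}    # items the classifier labelled 'spam'
print(precision(refset, testset))  # 2 of 3 guesses correct -> 0.666...
print(recall(refset, testset))     # 2 of 4 spam items found -> 0.5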
Example #4
def main():
    spam = load_dataset('spam', sys.argv[1], True)
    ham = load_dataset('ham', sys.argv[2], True)
    training_spam = spam[:11500]
    training_ham = ham[:11500]
    test_spam = spam[11500:]  # hold out the remainder so train and test don't overlap
    test_ham = ham[11500:]

    nbc = NaiveBayesClassifier.train(training_ham + training_spam)
    cPickle.dump(nbc, sys.stdout)

    sys.stderr.writelines(['Spam accuracy: %f\n' % accuracy(nbc, test_spam),
                           'Ham accuracy: %f\n' % accuracy(nbc, test_ham)])
Example #5
def classifier(lambda_):
  clf = get_classifier('%f' % lambda_, __train_fs, lambda_)
  logging.debug("Finished training the classifier lambda=%f ..." % lambda_)
  dev_acc = accuracy(clf, __dev_fs)
  logging.debug("classifier lambda=%f accuracy on DEV is: %3.5f",
                lambda_, dev_acc)
  train_acc = accuracy(clf, __train_fs)
  logging.debug("classifier lambda=%f accuracy on TRAIN is: %3.5f",
                lambda_, train_acc)
#  clf.show_most_informative_features()
  result = [clf.classify(fs) for fs,label in __dev_fs]
  gold = [label for fs,label in __dev_fs]
  cm = nltk.ConfusionMatrix(gold, result)
  cf_text = cm.pp(sort_by_count=True, show_percents=True, truncate=20)
  return (lambda_, dev_acc, train_acc, cf_text, clf)
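
Note: ConfusionMatrix.pp is the older NLTK spelling; newer releases name it pretty_format with the same keyword arguments. A small self-contained sketch:

import nltk

gold = ['pos', 'neg', 'pos', 'neg']
pred = ['pos', 'pos', 'pos', 'neg']
cm = nltk.ConfusionMatrix(gold, pred)
print(cm.pretty_format(sort_by_count=True, show_percents=True))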
Example #6
def update_category_by_pos():
    from nltk.corpus import brown
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.tag import untag
    from nltk import DecisionTreeClassifier

    def pos_features(sentence, i):
        features = {'suffix(1)':sentence[i][-1:],
                    'suffix(2)':sentence[i][-2:],
                    'suffix(3)':sentence[i][-3:]
                    }
        features['prev-word'] = '<start>' if i==0 else sentence[i-1]
        return features

    print pos_features(brown.sents()[0], 8)

    tagged_sents = brown.tagged_sents(categories='news')
    featuresets = []

    for tagged_sent in tagged_sents:
        untagged_sent = untag(tagged_sent)
        for i, (word, tag) in enumerate(tagged_sent):
            featuresets.append((pos_features(untagged_sent, i), tag))

    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
#    classifier = NaiveBayesClassifier.train(train_set)
    classifier = DecisionTreeClassifier.train(train_set)
    print 'DecisionTree %f' % classify.accuracy(classifier, test_set)
Example #7
def main_function():
    conn = MySQLdb.connect(
        host=DATABASES["date_cutoff"]["HOST"],
        user=DATABASES["date_cutoff"]["USER"],
        passwd=DATABASES["date_cutoff"]["PASSWORD"],
        db=DATABASES["date_cutoff"]["NAME"],
    )

    training_tweets = get_training_tweets(conn)
    training_feature_set = classify.process_tweets(training_tweets)

    classifier = NaiveBayesClassifier.train(training_feature_set)

    error_dict = {"+": 0, "-": 0, "I": 0, "O": 0}
    count_dict = {"+": 0, "-": 0, "I": 0, "O": 0}
    guess_dict = {"+": 0, "-": 0, "I": 0, "O": 0}

    full_matrix = {
        "+": {"+": 0, "-": 0, "I": 0, "O": 0},
        "-": {"+": 0, "-": 0, "I": 0, "O": 0},
        "I": {"+": 0, "-": 0, "I": 0, "O": 0},
        "O": {"+": 0, "-": 0, "I": 0, "O": 0},
    }

    count_table = {"+": 0, "-": 0, "I": 0, "O": 0}

    test_tweets = get_test_tweets(conn)
    test_feature_set = classify.process_tweets(test_tweets)

    classifier_accuracy = accuracy(classifier, test_feature_set)

    print count_table
    print "classifier accuracy: " + repr(classifier_accuracy)
Example #8
def main_function():
	conn = MySQLdb.connect(host="localhost", user="******", passwd="tanzania", db="twitter_analysis")
	hq_conn = MySQLdb.connect(host="localhost", user="******", passwd="tanzania", db="twitter")

	training_tweets = get_test_tweets(conn)
	training_feature_set = process_tweets(training_tweets)

	classifier = DecisionTreeClassifier.train(training_feature_set)

	test_tweets = get_training_tweets(conn)
	test_feature_set = process_tweets(test_tweets)

	classifier_accuracy = accuracy(classifier, test_feature_set)

	alt_full_matrix = {'+':{'+':0, '-':0, 'E':0}, 
				'-':{'+':0, '-':0, 'E':0}, 
				'E':{'+':0, '-':0, 'E':0}}

	#for f in test_tweets:
	#f = test_tweets[0]

	#print f
	#guess = classifier.classify(process_tweet(f[1]))
	#print guess
	#	update_tweet_polarity(f[0], guess, conn)
	##	pl = classifier.prob_classify(process_tweet(f[1]))
	#	idx = f[2]
	#	if idx == 'I' or idx == 'O':
	#		idx = 'E'
	#	alt_full_matrix[idx][guess] += 1

	#print alt_full_matrix

	print "classifier accuracy: " + repr(classifier_accuracy)
Example #9
 def benchmarking(self, classifier, _test_set, all_f_measure=[], all_precision=[], all_recall=[]):
     from nltk import classify
     accuracy = classify.accuracy(classifier, _test_set)
     
     print("accuracy:",accuracy)
     
     from nltk.metrics import precision
     from nltk.metrics import recall
     from nltk.metrics import f_measure
     
     import collections
     refsets = collections.defaultdict(set)
     testsets = collections.defaultdict(set)
     for i, (feats, label) in enumerate(_test_set):
         refsets[label].add(i)
         observed = classifier.classify(feats)
         testsets[observed].add(i)
         
     prec=precision(refsets['class'], testsets['class'])
     rec=recall(refsets['class'], testsets['class'])
     f1=f_measure(refsets['class'], testsets['class'])
     print('precision:', prec)
     print('recall:', rec)
     print('F-measure:', f1)
             
     all_f_measure.append(f1)
     all_precision.append(prec)
     all_recall.append(rec)
     print('========Show top 10 most informative features========')
     classifier.show_most_informative_features(10)
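
Note: in the method above, 'class' is a placeholder key into refsets/testsets; presumably the intent is to report metrics for each observed label. A hedged per-label variant over hypothetical index sets:

from nltk.metrics import precision, recall, f_measure

refsets = {'pos': {0, 1}, 'neg': {2, 3}}   # gold indices per label
testsets = {'pos': {0, 2}, 'neg': {1, 3}}  # predicted indices per label
for label in refsets:
    print(label,
          precision(refsets[label], testsets[label]),
          recall(refsets[label], testsets[label]),
          f_measure(refsets[label], testsets[label]))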
Example #10
File: svm.py Project: trunghlt/nltk
def demo():
    def gender_features(word):
        return {"last_letter": word[-1], "penultimate_letter": word[-2]}

    from nltk.classify import accuracy
    from nltk.corpus import names

    import random

    names = [(name, "male") for name in names.words("male.txt")] + [
        (name, "female") for name in names.words("female.txt")
    ]

    random.seed(60221023)
    random.shuffle(names)

    featuresets = [(gender_features(n), g) for (n, g) in names]
    train_set, test_set = featuresets[500:], featuresets[:500]

    print "--- nltk.classify.svm demo ---"
    print "Number of training examples:", len(train_set)
    classifier = SvmClassifier.train(train_set)
    print "Total SVM dimensions:", len(classifier._svmfeatureindex)
    print "Label mapping:", classifier._labelmapping
    print "--- Processing an example instance ---"
    print "Reference instance:", names[0]
    print "NLTK-format features:\n    " + str(test_set[0])
    print "SVMlight-format features:\n    " + str(
        map_instance_to_svm(test_set[0], classifier._labelmapping, classifier._svmfeatureindex)
    )
    distr = classifier.prob_classify(test_set[0][0])
    print "Instance classification and confidence:", distr.max(), distr.prob(distr.max())
    print "--- Measuring classifier performance ---"
    print "Overall accuracy:", accuracy(classifier, test_set)
Example #11
def weka(train_set, test_set, algorithm="svm"):
    from nltk.classify import weka

    print "--- nltk.classify.weka %s ---" % algorithm
    temp_dir = tempfile.mkdtemp()
    classifier = nltk.classify.WekaClassifier.train(temp_dir + "/cred.model", train_set, algorithm)
    print "Overall accuracy:", accuracy(classifier, test_set)
Example #12
def demo():

    def gender_features(word):
        return {'last_letter': word[-1], 'penultimate_letter': word[-2]}

    from nltk.classify import accuracy
    from nltk.corpus import names
    
    
    import random
    names = ([(name, 'male') for name in names.words('male.txt')] +
             [(name, 'female') for name in names.words('female.txt')])
    random.seed(60221023)
    random.shuffle(names)

    featuresets = [(gender_features(n), g) for (n,g) in names]
    train_set, test_set = featuresets[500:], featuresets[:500]

    print '--- nltk.classify.svm demo ---'
    print 'Number of training examples:', len(train_set)
    classifier = SvmClassifier.train(train_set)
    print 'Total SVM dimensions:', len(classifier._svmfeatureindex)
    print 'Label mapping:', classifier._labelmapping
    print '--- Processing an example instance ---'
    print 'Reference instance:', names[0]
    print 'NLTK-format features:\n    ' + str(test_set[0])
    print 'SVMlight-format features:\n    ' + str(map_instance_to_svm(test_set[0], classifier._labelmapping, classifier._svmfeatureindex))
    distr = classifier.prob_classify(test_set[0][0])
    print 'Instance classification and confidence:', distr.max(), distr.prob(distr.max())
    print '--- Measuring classifier performance ---'
    print 'Overall accuracy:', accuracy(classifier, test_set)
Example #13
def test_raw_mail(org_email):

	features_test = {}
	wordtokens_test = [word_limit.lemmatize(key.lower())
	                   for key in word_tokenize(org_email)]
	for key in wordtokens_test:
		if key not in stpwords:
			features_test[key] = True
	return features_test

	#Extracting the features(Tonenized, stemmed and non-stopwords emails) from all the emails
	feature_sets = [(raw_mail(n), g) for (n,g) in mail_shuffle]

	#Splitting the test and training data sets from the whole email set features
	size_feature = int(len(feature_sets) * 0.10)
	train_set, test_set = feature_sets[size_feature:], feature_sets[:size_feature]
	classifier = NaiveBayesClassifier.train(train_set)
	#print (test_set[1:5])

	#Printing the accuracy of the machine
	print ('accuracy of the machine: ', (classify.accuracy(classifier,test_set))*100) 
	
	#Printing the top 50 features
	classifier.show_most_informative_features(50) 

	#Printing the spam and ham labels
	print ('labels:',classifier.labels())

	#Classification of user entered email
	while(True):
		featset = raw_mail(input("Enter text to classify: "))
		print (classifier.classify(featset))
Example #14
def main_function():
	conn = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'], 
			user=DATABASES['date_cutoff']['USER'], 
			passwd=DATABASES['date_cutoff']['PASSWORD'], 
			db=DATABASES['date_cutoff']['NAME'])

	training_tweets = classify.get_training_tweets(conn)
	training_feature_set = process_tweets(training_tweets)

	config_megam('/opt/packages')
	classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0)

	error_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	count_dict = {'+':0, '-':0, 'I':0, 'O':0} 
	guess_dict = {'+':0, '-':0, 'I':0, 'O':0} 

	full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0}, 
				'-':{'+':0, '-':0, 'I':0, 'O':0}, 
				'I':{'+':0, '-':0, 'I':0, 'O':0}, 
				'O':{'+':0, '-':0, 'I':0, 'O':0}}


	test_tweets = classify.get_test_tweets(conn)
	test_feature_set = process_tweets(test_tweets)

	classifier.show_most_informative_features(10)
	classifier_accuracy = accuracy(classifier, test_feature_set)
	print "classifier accuracy: " + repr(classifier_accuracy)
Example #15
def cross_validation(data_set, n_folds=8):
    kf = KFold(len(data_set), n_folds=n_folds)
    best_accuracy = -1
    training_accuracy = 0
    for train, cv in kf:
        classifier = SklearnClassifier(
            Pipeline([('tfidf', TfidfTransformer()),
                      ('nb', LinearSVC(C=1, tol=0.000001))]))
        training_data = data_set[0:cv[0]] + data_set[cv[-1]:]
        cv_data = data_set[cv[0]:cv[-1]+1]
        classifier.train(training_data)
        accuracy = classify.accuracy(classifier, cv_data)
        if accuracy > best_accuracy:
            best_classifier = classifier
            best_accuracy = accuracy
            training_accuracy = classify.accuracy(classifier, training_data)
    return best_classifier, training_accuracy, best_accuracy
Example #16
def wsd_classifier(trainer, word, features, stopwords_list = STOPWORDS, number=300, log=False, distance=3, confusion_matrix=False):
    
    print "Reading data..."
    global _inst_cache
    if word not in _inst_cache:
        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    events = _inst_cache[word][:]
    senses = list(set(l for (i, l) in events))
    instances = [i for (i, l) in events]
    vocab = extract_vocab(instances, stopwords=stopwords_list, n=number)
    print ' Senses: ' + ' '.join(senses)

    # Split the instances into a training and test set,
    #if n > len(events): n = len(events)
    n = len(events)
    random.seed(5444522)
    random.shuffle(events)
    training_data = events[:int(0.8 * n)]
    test_data = events[int(0.8 * n):n]
    # Train classifier
    print 'Training classifier...'
    classifier = trainer([(features(i, vocab, distance), label) for (i, label) in training_data])
    # Test classifier
    print 'Testing classifier...'
    acc = accuracy(classifier, [(features(i, vocab, distance), label) for (i, label) in test_data] )
    print 'Accuracy: %6.4f' % acc
    if log==True:
        #write error file
        print 'Writing errors to errors.txt'
        output_error_file = open('errors.txt', 'w')
        errors = []
        for (i, label) in test_data:
            guess = classifier.classify(features(i, vocab, distance))
            if guess != label:
                con =  i.context
                position = i.position
                item_number = str(test_data.index((i, label)))
                word_list = []
                for (word, tag) in con:
                    word_list.append(word)
                hard_highlighted = word_list[position].upper()
                word_list_highlighted = word_list[0:position] + [hard_highlighted] + word_list[position+1:]
                sentence = ' '.join(word_list_highlighted)
                errors.append([item_number, sentence, guess,label])
        error_number = len(errors)
        output_error_file.write('There are ' + str(error_number) + ' errors!' + '\n' + '----------------------------' +
                                '\n' + '\n')
        for error in errors:
            output_error_file.write(str(errors.index(error)+1) +') ' + 'example number: ' + error[0] + '\n' +
                                    '    sentence: ' + error[1] + '\n' +
                                    '    guess: ' + error[2] + ';  label: ' + error[3] + '\n' + '\n')
        output_error_file.close()
    if confusion_matrix==True:
        gold = [label for (i, label) in test_data]
        derived = [classifier.classify(features(i, vocab, distance)) for (i, label) in test_data]
        cm = nltk.ConfusionMatrix(gold,derived)
        print cm
        return cm
Example #17
	def testall_accuracy(self, testset=[]):
		# default test set is class field (all test files)
		if not testset:
			testset = self.test_feature_set_custom
		print 'Measuring classifier performance...'
		acc = accuracy(self.classifier, testset)
		print 'Overall accuracy:', acc
		
		return acc
Example #18
def buildClassifier(hamDir, spamDir):
	spamEmails = []
	hamEmails = []
	allEmails = []
	features = []

	# Using glob instead of os.listdir to ignore hidden files

	for email in glob.glob(spamDir + "/*"):
		f = open(email)
		spamEmails.append(f.read())
		f.close()

	for email in glob.glob(hamDir + "/*"):
		f = open(email)
		hamEmails.append(f.read())
		f.close()

	for email in spamEmails:
		allEmails.append((email, 'spam'))

	for email in hamEmails:
		allEmails.append((email, 'ham'))

	# Shuffle so the 70:30 train/test split is random; without the held-out accuracy check, shuffling would be unnecessary.
	random.shuffle(allEmails)

	# Make a list of feature per email
	for (email, label) in allEmails:
		features.append((emailFeatures(email), label))

	# 70:30 ratio for training:testing
	print "Using a 70:30 ratio for training:testing, the accuracy is as follows: "
	totalSize = int(len(features) * 0.7)
	trainingEmails, testingEmails = features[:totalSize], features[totalSize:]

	print "training size: %d; testing size: %d" %(len(trainingEmails), len(testingEmails))
	classifier = NaiveBayesClassifier.train(trainingEmails)
	print classify.accuracy(classifier, testingEmails)

	print "Now creating and saving a full size classifier made up of %d emails..." %len(features)
	classifier = NaiveBayesClassifier.train(features)

	saveClassifier(classifier, "full-classifier.pickle")
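
Note: saveClassifier is not shown in this snippet; a minimal sketch of the pickle round trip it presumably performs (both helper names are hypothetical):

import pickle

def saveClassifier(classifier, path):
	# serialize the trained classifier to disk
	with open(path, 'wb') as f:
		pickle.dump(classifier, f)

def loadClassifier(path):
	# restore a previously saved classifier
	with open(path, 'rb') as f:
		return pickle.load(f)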
Example #19
    def naives_classifier(self, training_set, dev_set, log=0):

        classifier = NaiveBayesClassifier.train(training_set)
        accuracy = classify.accuracy(classifier, dev_set)

        print('Naive Bayes accuracy dev percent: ', (accuracy * 100))
        if log == 1:
            classifier.show_most_informative_features(20)

        return classifier
Example #20
def maximum_entropy(train_set, test_set):
    print "--- nltk.classify.maximum_entropy ---"

    from nltk.classify import megam

    megam.config_megam()

    classifier = nltk.classify.MaxentClassifier.train(train_set, "megam")

    print "Overall accuracy:", accuracy(classifier, test_set)
    classifier.show_most_informative_features(10)
Example #21
def cross_validate(classifier, training_set, test_set):
    """
     Performs 10-fold cross validation
     parameters: a classifier, training set, test set
     returns: best classifier, f-score, accuracy
    """
    chosen_classif = classifier
    best_accuracy = 0.0
    best_train_accuracy = None
    best_classifier = None

    # retrieve indices of data to be used as training and test sets in cross-validation
    k_fold = cross_validation.KFold(len(training_set), n_folds=10)

    # loop through tweets with those indices
    for train_indices, test_indices in k_fold:

        # retrieve training and test (cross validation) sets
        train = itemgetter(*train_indices)(training_set)
        test = itemgetter(*test_indices)(training_set)
        
        classifier = chosen_classif.train(train)
        train_accuracy = classify.accuracy(classifier, train)
        
        if len(test_indices) == 1:
            test = (test,)

        # retrieve accuracy
        accuracy = classify.accuracy(classifier, test)

        # if accuracy is greater than the best accuracy, store as best classifier
        if accuracy > best_accuracy:
            best_classifier = classifier
            best_accuracy = accuracy
            best_train_accuracy = train_accuracy
        fscore = get_fscore(classifier, test)
    
    test_accuracy = classify.accuracy(best_classifier, test_set)
    accuracy = {'test_accuracy': test_accuracy, 'best_train_accuracy': best_train_accuracy}
    to_return = {'classifier': best_classifier, 'fscore': fscore, 'accuracy': accuracy}
    return to_return
Example #22
def cross_validate(content_set, times, words, amount_of_words):
    incr = len(content_set) // times
    document_extraction = Doc_extract(words, amount_of_words)
    for i in xrange(times):
        train_set = apply_features(document_extraction,
                                   content_set[:i * incr] + content_set[(i + 1) * incr:])
        test_set = apply_features(document_extraction,
                                  content_set[i * incr: min((i + 1) * incr, len(content_set))])
        classifier = get_trained_classifier(train_set)
        acc = accuracy(classifier, test_set)
        print('\n{0} classifier\n\tAccuracy:  {1:.6}'.format(i + 1, acc))
        print('\tPrecision: {0:.6}\n\tRecall:    {1:.6}\n\tF_measure: {2:.6}'.format(*get_f_measure(classifier, test_set)))
Example #23
 def _cross_train(self, fold_sz):
     rid2shard = ST.random_shardlize(fold_sz, len(self._train_xs), load=True)
     precision = 0
     for fid,sd in rid2shard.items():
         tmp_train_xs = [self._train_xs[i] for i in sd]
         tmp_train_ys = [self._train_ys[i] for i in sd]
         test_set = [(self._feature_encoding(self._train_xs[i]), self._train_ys[i]) for i in sd]
         classifier = self._train(tmp_train_xs, tmp_train_ys)
         p = classify.accuracy(classifier, test_set)
         linfo('maxent classifier precision: %.4f' % p)
         precision += p
     linfo('average maxent classifier precision: %.4f' % (precision / fold_sz))
Example #24
def category_by_name():
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.corpus import names
    from nltk.classify import apply_features
    import random

    names = ([(name, 'male') for name in names.words('male.txt')] +
             [(name, 'female') for name in names.words('female.txt')])

    random.shuffle(names)

    def gender_features(word):
        return {'last_letter':word[-1]}

    train_set = apply_features(gender_features, names[500:])
    test_set = apply_features(gender_features, names[:500])

    classifier = NaiveBayesClassifier.train(train_set)
    print classifier.classify(gender_features('Neo')) 
    print classify.accuracy(classifier, test_set)
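
Note: apply_features returns a lazy sequence, so feature dictionaries are computed on demand instead of all being held in memory. A small self-contained illustration:

from nltk.classify import apply_features

def gender_features(word):
    return {'last_letter': word[-1]}

lazy_set = apply_features(gender_features, [('Neo', 'male'), ('Ada', 'female')])
print(lazy_set[0])  # ({'last_letter': 'o'}, 'male'), built only when accessed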
Example #25
def model_dev(func_name): 
	from nltk.corpus import names   
	names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])
	random.shuffle(names)
	print "Length of dataset %d"%len(names)
	random.shuffle(names)
	random.shuffle(names)
	print "How the data set looks"
	print names[0:10]
	print "Testing the output of feature extraction"
	print "For name Gary -- %s"%func_name('Gary')
	featuresets = [(func_name(n), g) for (n, g) in names]
	print "length of featureset data %d"%len(featuresets)
	print featuresets[0:10]
	train_set, test_set = featuresets[500:], featuresets[:500]
	print "Length of train data %d"%len(train_set)
	print "length of test data %d"%len(test_set)
	time.sleep(10)
	os.system('clear')

	print "\n\nNaive Bayes Classification\n\n"
	nb_classifier = NaiveBayesClassifier.train(train_set)
	check_list=['Gary', 'Shivam', 'Grace', 'Sarah', 'Shaym', 'Richa', 'Abhisheyk']
	for name in check_list:
		print "Naive gender classification of ---%s --is-- %s---"%(name,nb_classifier.classify(func_name(name)))
	print "The accuracy of the naive classifier is"
	print classify.accuracy(nb_classifier, test_set)
	print "The most informative features are:"
	print nb_classifier.show_most_informative_features(5)

	time.sleep(10)
	os.system('clear')
	print "\n\nMaxent Classification\n\n"
	mod=MaxentClassifier.train(train_set)
	for name in check_list:
		print "Maxent gender classification of ---%s --is-- %s---"%(name,mod.classify(func_name(name)))
	print "The accuracy of maxent is"
	print classify.accuracy(mod, test_set)
	print "The most informative features are:"
	print mod.show_most_informative_features(5)
Example #26
def test():
    naive_test_data = datapreparation.create_naive_test_data()
# print naive_data[-1]
# d1 = doc_cls[1:100]
#gc.collect()
# naive_data2 = naive_train_data
# random.shuffle(naive_data2)
# edge = (len(naive_data)/3) * 2

# print '##############################################################'
    print 'accuracy: ', classify.accuracy(classifier,naive_test_data)
    print classifier.most_informative_features()
    print classifier.show_most_informative_features()
Example #27
	def train_and_test(self, num_folds=10):
		shuffle(self.comment_list)

		fold_size = int(ceil(len(self.comment_list) / float(num_folds)))

		folds = []
		for i in range(num_folds):
			start = i * fold_size
			end = (i+1) * fold_size

			if end > len(self.comment_list):
				end = len(self.comment_list)

			train_comments = self.comment_list[:start] + self.comment_list[end:]
			test_comments = self.comment_list[start:end]

			train_set = data_set(train_comments)
			test_set = data_set(test_comments)

			classifier = NaiveBayesClassifier.train(train_set)

			print accuracy(classifier, test_set)
Example #28
def main():
    global best_words
    tweets = get_tweets_from_db()
    tweet_list = tweets[1000:1599000]
    test_list = tweets[:1000]+ tweets[1599000:]
    word_scores = create_word_scores()
    best_words = find_best_words(word_scores, 500000)
    f = open('bestwords.pickle', 'wb')
    pickle.dump(best_words, f)
    f.close()
    training_set = classify.apply_features(best_word_features, tweet_list)
    print "extracted features"
    # train the classifier with the training set
    classifier = NaiveBayesClassifier.train(training_set)
    print "trained classifier"
    # create the pickle file
    f = open('NBclassifier_new.pickle', 'wb')
    pickle.dump(classifier, f)
    f.close()
    print "created pickle"
    # test for precision and recall
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    test_set = classify.apply_features(best_word_features, test_list)
 
    for i, (feats, label) in enumerate(test_set):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
     
    print 'neg precision:', metrics.precision(refsets['0'], testsets['0'])
    print 'neg recall:', metrics.recall(refsets['0'], testsets['0'])
    print 'pos precision:', metrics.precision(refsets['4'], testsets['4'])
    print 'pos recall:', metrics.recall(refsets['4'], testsets['4'])
    # test_set = classify.apply_features(extract_features, test_list)
    # print "extracted features"
    print classify.accuracy(classifier, test_set)
    print classifier.show_most_informative_features(30)
Example #29
def sentim(self, data):
    stop_words = ['the', 'an', 'i', 'a', 'and', 'to'] #, 'none'] #, 'heartworm', ' distemper/parvo'] #stopwords.words('english')

    path_csv = '../data/csv/tf_idf_adoptable_csv.csv'
    df = read_df_csv(path_csv)
    X_negative = df["description"] #data
    corpus_dirty = []
    for doc in range(len(X_negative)):
        str_corpus = str(X_negative[doc])
        corpus_dirty.append(str_corpus)

    negative_documents = []
    for doc in range(len(X_negative)):
        record = X_negative[doc]
        record = (record.lower())
        replaced = record.replace(", '...'", "").replace("...", '')
        remove_digits = str.maketrans('', '', digits)
        replaced = replaced.translate(remove_digits)  # strip digits
        clean = replaced.replace(", '...'", "").replace("...", '')
        negative_documents.append(clean)
    # print(documents)
    # 2. Create a set of tokenized documents.
    negative_descriptions = [word_tokenize(content) for content in negative_documents]

    negative_cleaned_tokens_list = []
    for tokens in negative_descriptions:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_neg_words = get_all_words(negative_cleaned_tokens_list)
    
    
    freq_dist_neg = FreqDist(all_neg_words)
    print("most common ADOPTABLE words: ", freq_dist_neg.most_common(10))

    ##################################################################
    ##################################################################
    ##################################################################

    
    path_csv = '../data/csv/tf_idf_adopted_csv.csv'
    df = read_df_csv(path_csv)
    X_positive = df["description"] #data
    corpus_dirty = []
    for doc in range(len(X_positive)):
        str_corpus = str(X_positive[doc])
        corpus_dirty.append(str_corpus)

    positive_documents = []
    for doc in range(len(X_positive)):
        record = X_positive[doc]
        record = (record.lower())
        replaced = record.replace(", '...'", "").replace("...", '')
        remove_digits = str.maketrans('', '', digits)
        replaced = replaced.translate(remove_digits)  # strip digits
        clean = replaced.replace(", '...'", "").replace("...", '')
        positive_documents.append(clean)
    # print(documents)
    # 2. Create a set of tokenized documents.
    positive_descriptions = [word_tokenize(content) for content in positive_documents]
    # print("\n\nPositive Descriptions Tokenized: ", positive_descriptions)
    # ['dora', 'female', 'shep', 'mix', 'brindle', 'dhpp', 'kc', '//', 'no', 'puppy', 'hi', 'cathleen', ',', 'she', 'is', 'doing', 'great', 'and', 'really', 'starting'], ['meet', 'nova', '!', 'now', 'that', 'she', 'is', 'done', 'raising', 'her', 'pups', 'she', 'is', 'looking', 'for', 'a', 'home', 'of', 'her', 'own', 'where']]
    
    positive_cleaned_tokens_list = []
    for tokens in positive_descriptions:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))


    
    
    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    
    # save_documents = open("pickled_algos/all_pos_words.pickle","wb")
    # pickle.dump(positive_cleaned_tokens_list, save_documents)
    # save_documents.close()
    

    freq_dist_pos = FreqDist(all_pos_words)
    print("most common ADOPTED words: ", freq_dist_pos.most_common(10))

    ##################################################################
    ##################################################################
    ##################################################################
    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    # positive_tokens_for_model = all_pos_words.pickle
    
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)
    

    
    
    
    positive_dataset = [(description_dict, "Positive")
                    for description_dict in positive_tokens_for_model]

    negative_dataset = [(description_dict, "Negative")
                        for description_dict in negative_tokens_for_model]
    
    # print("positive_dataset: ", positive_dataset)
    # print("negative_dataset: ", negative_dataset)


    dataset = positive_dataset + negative_dataset
    seventy_percent_of_data = int(len(dataset) * .7)
    thirty_percent_of_data = int(len(dataset) * .3)
    # print(thirty_percent_of_data) #361

    random.shuffle(dataset) #to avoid bias

    train_data = dataset[:seventy_percent_of_data]
    test_data = dataset[seventy_percent_of_data:]  # held-out 30%, disjoint from train_data

    classifier = NaiveBayesClassifier.train(train_data)
    # classifier = MultinomialNB.fit(train_data)
    save_classifier = open("naivebayes_pet.pickle","wb")
    pickle.dump(classifier, save_classifier)
    save_classifier.close()

    print("%%%%%%%%%%%%%%%%%%%Accuracy is:", classify.accuracy(classifier, test_data))

    print(classifier.show_most_informative_features(10))
    
    # from nltk.corpus import twitter_samples
    # print("&&&&&&&&&&&&&&&&&&&&&&&&&")
    # print(twitter_samples)
    data = str(data)
    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    for ele in data:  
        if ele in punc:  
            data = data.replace(ele, "")
    data = data.split()
    # print("tokenized data: ", data)
    
    #breakdown parts of speech
    parts_of_speech = [] 
    parts_of_speech.append(nltk.pos_tag(data))
    print("parts of speech tagging: ", parts_of_speech) 
    #lemmatized data:
    stop_words = [] #left here in case I want to add words in the future
    cleaned_tokens = []


    for token, tag in nltk.pos_tag(data):
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos) 



        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    
    custom_tokens = remove_noise(word_tokenize(str(data)))

    print(str(data), classifier.classify(dict([token, True] for token in custom_tokens)))

    sentiment_result = [classifier.classify(dict([token, True] for token in custom_tokens))]

    print("sentiment_result: ", type(sentiment_result), sentiment_result)

    data = sentiment_result
    return data
Example #30
featuresets = [(gender_features(n), g) for (n, g) in names]

print(len(featuresets))

print(featuresets[0:10])

train_set, test_set = featuresets[500:], featuresets[:500]

print(len(train_set))
print(len(test_set))

nb_classifier = NaiveBayesClassifier.train(train_set)
print(nb_classifier.classify(gender_features('Gary')))
print(nb_classifier.classify(gender_features('Grace')))

print(classify.accuracy(nb_classifier, test_set))
print(nb_classifier.show_most_informative_features(5))

me_classifier = MaxentClassifier.train(train_set)

print(me_classifier.classify(gender_features('Gary')))
print(me_classifier.classify(gender_features('Grace')))

classify.accuracy(me_classifier, test_set)

me_classifier.show_most_informative_features(5)


def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    return features
Example #31
def create_features(article):
    article_features = set(re.sub("[^a-z]", "", article.lower()).split())
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in article_features)
    return features


print("[=] Generating features...")
feature_set = []
i = 0
for (article, score) in data[:500]:
    feature_set.append([create_features(article), score])
    i = i + 1
    print("Generated feature " + str(i) + "/" + str(len(data)))
print("[+] Generated features...")
training_set, testing_set = feature_set[:int(
    len(feature_set) * 3 / 4)], feature_set[int(len(feature_set) * 3 / 4):]
print("[+] Generated training set of length " + str(len(training_set)) +
      ", and testing set of length " + str(len(testing_set)))
print("[*] Training...")
classifier = NaiveBayesClassifier.train(training_set)
print("[+] Finished training.")
f_dos = open('models/checkpoints/headline_classifier.pickle', 'wb')
pickle.dump(classifier, f_dos)
f_dos.close()
print("[+] Saved classifier.")
print("[=] Accuracy: " + str(classify.accuracy(classifier, testing_set)))
print("")
print("[=] Analyzing features...")
classifier.show_most_informative_features(5)
Example #32
pro_dataset = [(tweet_dict, "Professional") for tweet_dict in pro_tokens]
unpro_dataset = [(tweet_dict, "Unprofessional") for tweet_dict in unpro_tokens]

#print(pro_dataset[0])
dataset = pro_dataset + unpro_dataset

random.shuffle(dataset)

seventy = int(len(dataset) * 0.70)
train_data = dataset[:seventy]  #70%
test_data = dataset[seventy:]  #30%

#-----model
classifier = NaiveBayesClassifier.train(train_data)

accuracy = classify.accuracy(classifier, test_data)
#print("\nAccuracy is:", classify.accuracy(classifier, test_data))

#print(classifier.show_most_informative_features(10))

#-----testing on custom tweet
custom = sys.argv

custom_tweet = ""

for i in range(len(custom)):
    if not i == 0:
        custom_tweet = custom_tweet + " " + custom[i]

custom_tokens = remove_noise(word_tokenize(custom_tweet))
Example #33
test_set = pos_reviews_set[:200] + neg_reviews_set[:200]
train_set = pos_reviews_set[200:] + neg_reviews_set[200:]  # keep train and test disjoint

print(len(test_set), len(train_set))

# #### This part of code trains the classifier and then print the accuracy gained(which can be different evertime)
#

# In[25]:

from nltk import classify
from nltk import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(train_set)

accuracy = classify.accuracy(classifier, test_set)
print(accuracy)

# #### Voice input
# Here we are first taking input in the formm of live audio stream and then converting it into text.

# In[26]:

import wave
import pyaudio
from os import path
from pydub import AudioSegment
import nltk
import nltk.corpus
import os
import re
Example #34
 def test(self, test_set):
     return classify.accuracy(self.classifier, test_set)
Example #35
    def train_test_model(self):
        '''
        This function is an entirely self-contained, trained Naive Bayes model for text sentiment analysis with 75.467% accuracy.

        Importing more positive and negative classified tweets could be used to improve the model.

        The results are stored in the self.trained_model variable for the DataTransform class
        '''

        print('Preprocessing classified tweets for model.')
        from nltk.corpus import twitter_samples
        import random

        positive_tweets = twitter_samples.strings('positive_tweets.json')
        negative_tweets = twitter_samples.strings('negative_tweets.json')

        positive_df = pd.DataFrame(positive_tweets).rename(columns={0: 'text'})
        negative_df = pd.DataFrame(negative_tweets).rename(columns={0: 'text'})

        dict_samp = {}
        positive_dict = []
        positive = []
        negative=[]
        negative_dict = []

        datatransform_positive = DataTransform()
        datatransform_positive.set_df(positive_df)
        datatransform_positive.clean_text('text','token_text')

        for i in range(len(datatransform_positive.output_df.index)):
            for j in range(len(datatransform_positive.output_df['token_text'][i])):
                dict_samp.update({datatransform_positive.output_df['token_text'][i][j]: True})
            positive_dict.append(dict_samp)
            dict_samp = {}
        
        for w in positive_dict:
            positive.append((w, 'Positive'))

        datatransform_negative = DataTransform()
        datatransform_negative.set_df(negative_df)
        datatransform_negative.clean_text('text','token_text')

        for i in range(len(datatransform_negative.output_df.index)):
            for j in range(len(datatransform_negative.output_df['token_text'][i])):
                dict_samp.update({datatransform_negative.output_df['token_text'][i][j]: True})
            negative_dict.append(dict_samp)
            dict_samp = {}
        
        for w in negative_dict:
            negative.append((w, 'Negative'))

        dataset = positive+negative

        random.shuffle(dataset)

        train_data = dataset[:7000]
        test_data = dataset[7000:]

        self.trained_model = NaiveBayesClassifier.train(train_data)

        print("Accuracy is:", classify.accuracy(self.trained_model, test_data))
        return




            
Example #36
db = Query('canada_subreddit.db')
db.connect()
cur = db.cursor()
cur.execute('''
SELECT c.body, s.label
FROM submissions as s, comments as c
WHERE s.submission_id = c.submission_id
AND (s.label = "Climate"
OR s.label = "Housing");
''')
data = cur.fetchall()
db.close()

feature_set = FeaturePipeline().create_set(data)

split = lambda x: -int(len(x) / 5)
k = split(feature_set)
training_set = feature_set[:k]
testing_set = feature_set[k:]

print('Now training...')

Naive_classifier = NaiveBayesClassifier.train(training_set)
print("Naive Bayes Algo accuracy percent:",
      (classify.accuracy(Naive_classifier, testing_set)))
Naive_classifier.show_most_informative_features(30)
#
# with open(pickle_file,"wb") as save_classifier:
#     pickle.dump(Naive_classifier, save_classifier)
#     save_classifier.close()
Example #37
random.shuffle(labeled_names)
# featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
# # entries are    ({'last_letter': 'g'}, 'male')
# train_set, test_set = featuresets[500:], featuresets[:500]
#
# classifier = nltk.NaiveBayesClassifier.train(train_set)
#
# ans1 = classifier.classify(gender_features('Mark'))
# ans2 = classifier.classify(gender_features('Precilla'))
#
# print("Mark is:", ans1)
# print("Precilla is:", ans2)
# print(accuracy(classifier, test_set))
# classifier.show_most_informative_features(5)
# print(nltk.classify.accuracy(classifier, test_set))
acc = []
for i in range(12):
    featuresets = [(gender_features(n)[i], gender)
                   for (n, gender) in labeled_names]
    train_set, test_set = featuresets[500:], featuresets[:500]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    acc.append(accuracy(classifier, test_set))

print(acc)
plt.plot(range(12), acc)
plt.xlabel('Features')
plt.ylabel('Accuracy')
plt.xticks(range(12))
plt.savefig('bc.png', dpi=100, bbox_inches='tight')
plt.show()
Example #38
#from pprint import pprint
##pprint.pprint(training_featureset)
##pprint.pprint(results)
#pprint(results)
import sys
#sys.exit()

print '''
Classifier accuracy (Bayes): %s
B Precision (Bayes): %s
B Recall (Bayes): %s
I Precision (Bayes): %s
I Recall (Bayes): %s
O Precision (Bayes): %s
O Recall (Bayes): %s
''' % (accuracy(bayes_classifier, test_featureset),
       precision(results[0]['B-SNP'], results[1]['B-SNP']),
       recall(results[0]['B-SNP'], results[1]['B-SNP']),
       precision(results[0]['I-SNP'], results[1]['I-SNP']),
       recall(results[0]['I-SNP'], results[1]['I-SNP']),
       precision(results[0]['O'],
                 results[1]['O']), recall(results[0]['O'], results[1]['O']))

#bayes_classifier.show_most_informative_features(10)

sys.exit()

maxent_classifier = nltk.classify.MaxentClassifier.train(training_featureset)
maxent_results = get_results(maxent_classifier)

print '''
Example #39
def wst_classifier(trainer,
                   word,
                   features,
                   stopwords_list=STOPWORDS,
                   number=300,
                   log=False,
                   distance=3,
                   confusion_matrix=False):
    """
    This function takes as arguments:
        a trainer (e.g., NaiveBayesClassifier.train);
        a target word from senseval2 (you can find these out with senseval.fileids(),
            and they are 'hard.pos', 'interest.pos', 'line.pos' and 'serve.pos');
        a feature set (this can be wsd_context_features or wsd_word_features);
        a number (defaults to 300), which determines for wsd_word_features the number of
            most frequent words within the context of a given sense that you use to classify examples;
        a distance (defaults to 3) which determines the size of the window for wsd_context_features (if distance=3, then
            wsd_context_features gives 3 words and tags to the left and 3 words and tags to
            the right of the target word);
        log (defaults to false), which if set to True outputs the errors into a file errors.txt
        confusion_matrix (defaults to False), which if set to True prints a confusion matrix.

    Calling this function splits the senseval data for the word into a training set and a test set (the way it does
    this is the same for each call of this function, because the argument to random.seed is specified,
    but removing this argument would make the training and testing sets different each time you build a classifier).

    It then trains the trainer on the training set to create a classifier that performs WSD on the word,
    using features (with number or distance where relevant).

    It then tests the classifier on the test set, and prints its accuracy on that set.

    If log==True, then the errors of the classifier over the test set are written to errors.txt.
    For each error four things are recorded: (i) the example number within the test data (this is simply the index of the
    example within the list test_data); (ii) the sentence that the target word appeared in, (iii) the
    (incorrect) derived label, and (iv) the gold label.

    If confusion_matrix==True, then calling this function prints out a confusion matrix, where each cell [i,j]
    indicates how often label j was predicted when the correct label was i (so the diagonal entries indicate labels
    that were correctly predicted).
    """
    print "Reading data..."
    global _inst_cache
    if word not in _inst_cache:
        _inst_cache[word] = [(i, i.senses[0])
                             for i in senseval.instances(word)]
    events = _inst_cache[word][:]
    senses = list(set(l for (i, l) in events))
    instances = [i for (i, l) in events]
    vocab = extract_vocab(instances, stopwords=stopwords_list, n=number)
    print ' Senses: ' + ' '.join(senses)

    # Split the instances into a training and test set,
    #if n > len(events): n = len(events)
    n = len(events)
    random.seed(5444522)
    random.shuffle(events)
    training_data = events[:int(0.8 * n)]
    test_data = events[int(0.8 * n):n]
    # Train classifier
    print 'Training classifier...'
    classifier = trainer([(features(i, vocab, distance), label)
                          for (i, label) in training_data])
    # Test classifier
    print 'Testing classifier...'
    acc = accuracy(classifier, [(features(i, vocab, distance), label)
                                for (i, label) in test_data])
    print 'Accuracy: %6.4f' % acc
    if log == True:
        #write error file
        print 'Writing errors to errors.txt'
        output_error_file = open('errors.txt', 'w')
        errors = []
        for (i, label) in test_data:
            guess = classifier.classify(features(i, vocab, distance))
            if guess != label:
                con = i.context
                position = i.position
                item_number = str(test_data.index((i, label)))
                word_list = []
                for (word, tag) in con:
                    word_list.append(word)
                hard_highlighted = word_list[position].upper()
                word_list_highlighted = word_list[0:position] + [
                    hard_highlighted
                ] + word_list[position + 1:]
                sentence = ' '.join(word_list_highlighted)
                errors.append([item_number, sentence, guess, label])
        error_number = len(errors)
        output_error_file.write('There are ' + str(error_number) + ' errors!' +
                                '\n' + '----------------------------' + '\n' +
                                '\n')
        for error in errors:
            output_error_file.write(
                str(errors.index(error) + 1) + ') ' + 'example number: ' +
                error[0] + '\n' + '    sentence: ' + error[1] + '\n' +
                '    guess: ' + error[2] + ';  label: ' + error[3] + '\n' +
                '\n')
        output_error_file.close()
    if confusion_matrix == True:
        gold = [label for (i, label) in test_data]
        derived = [
            classifier.classify(features(i, vocab, distance)) for (i, label) in test_data
        ]
        cm = nltk.ConfusionMatrix(gold, derived)
        print cm
        return cm
Example #40
 def test(classifier, test_set):
     print('Testing classifier...')
     return classify.accuracy(classifier, test_set)
Example #41
def sentim_twitter(self, data):
    '''heavily borrowed from https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk
    to show functioning model'''
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
    
    

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)

    train_data = dataset[:700]
    test_data = dataset[700:]

    classifier = NaiveBayesClassifier.train(train_data)
    print("twitter data **********************************")

    print("%%%%%%%%%%%%%%%%%%% Twitter Accuracy is:", classify.accuracy(classifier, test_data))
    print("twitter data **********************************")

    print(classifier.show_most_informative_features(10))

    # data = (data)

    # custom_tweet = str(data) 
    print("twitter data **********************************")
    print("twitter data **********************************")
    print("is this reading data correctly???: ", type(str(data)))
    custom_tweet = str(data)
    # this gives negative
    
    
    
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    print("twitter data **********************************")
    print(custom_tweet, classifier.classify(dict([token, True] for token in custom_tokens)))
    twitter =  classifier.classify(dict([token, True] for token in custom_tokens))
    return twitter
Example #42
def train_test_evaluation():
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')

    print('Total number of positive_tweets are : ', len(positive_tweets))
    print('Total number of negative_tweets are : ', len(negative_tweets))
    print('-------------------------')
    print('one sample of positive_tweets : ', positive_tweets[0])
    print('one sample of negative_tweets : ', negative_tweets[0])
    print('-------------------------\n\n')

    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
    print('Total number of positive_tweet_tokens are : ',
          len(positive_tweet_tokens))
    print('Total number of negative_tweet_tokens are : ',
          len(negative_tweet_tokens))
    print('-------------------------')
    print('one sample of positive_tweet_tokens : ', positive_tweet_tokens[0])
    print('one sample of negative_tweet_tokens : ', negative_tweet_tokens[0])
    print('-------------------------\n\n')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    # all_pos_words = get_all_words(positive_cleaned_tokens_list)
    # freq_dist_pos = FreqDist(all_pos_words)
    # print('Most Frequent Items in Positive Tweets',freq_dist_pos.most_common(10))
    #
    # all_neg_words = get_all_words(negative_cleaned_tokens_list)
    # freq_dist_neg = FreqDist(all_neg_words)
    # print('Most Frequent Items in negative Tweets',freq_dist_neg.most_common(10))
    # print('-------------------------')

    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)

    train_data = dataset[:9000]
    test_data = dataset[9000:]

    print('Length of Train Data is : ', len(train_data))
    print(' A sample of Training Data : ', train_data[0])
    print('-------------------------')
    print('Length of Test Data is : ', len(test_data))
    print(' A sample of Test Data : ', test_data[0])
    print('-------------------------')

    classifier = NaiveBayesClassifier.train(train_data)

    print("\n\n Accuracy is:", classify.accuracy(classifier, test_data))

    print(classifier.show_most_informative_features(10))

    f = open('tweeter_trained_cls.pickle', 'wb')
    pickle.dump(classifier, f)
    f.close()

    return classifier
Example #43
neutral_dataset = [(tweet_dict, "Neutral")
                     for tweet_dict in neutral_tokens_for_model]

dataset = positive_dataset + negative_dataset

random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]

from nltk import classify
from nltk import  NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(train_data)

print('Accuracy is:', classify.accuracy(classifier, test_data))
print(classifier.show_most_informative_features(10))

from nltk.tokenize import word_tokenize

custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

custom_tokens = remove_noise(word_tokenize(custom_tweet))

print(classifier.classify(dict([token, True] for token in custom_tokens)))

custom_tweets = 'Congrats #SportStar on your 7th best goal from last season winning goal of the year :) #Baller #Topbin #oneofmanyworldies'

tet = 'I am a very good boy, do you know that'
custom_tokens = remove_noise(word_tokenize(tet))
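
Note: the dict([token, True] for token in tokens) idiom used above just builds the bag-of-words featureset NLTK classifiers expect:

tokens = ['good', 'boy']
print(dict([token, True] for token in tokens))  # {'good': True, 'boy': True}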
Example #44
    test_data = pd.read_csv(test_url)

    train_data['0'] = train_data['0'].apply(lambda x: ast.literal_eval(x))
    test_data['0'] = test_data['0'].apply(lambda x: ast.literal_eval(x))

    train_data_tuple = list(zip(train_data['0'], train_data['1']))
    test_data_tuple = list(zip(test_data['0'], test_data['1']))
    # # st.write(type(train_data['0'][0]))

    # clf_nb = NaiveBayesClassifier.train(train_data_tuple)

    # with open('nb_nltk.pkl', 'wb') as f:
    #     pickle.dump(clf_nb, f)
    nb_pickle_url = 'https://github.com/boblandsky/onion_ml/raw/master/nb_nltk.pkl'
    clf_nb = pd.read_pickle(nb_pickle_url)
    nb_acc = round(classify.accuracy(clf_nb, test_data_tuple), 4)*100
    st.write(f'Accuracy on test set: {nb_acc}%')

    test_nb_headline = st.text_input("Give me a headline to predict. A sample one is provided.",
                                      "MLS Commissioner Relieved That Nobody Knows Him by Name")

    if st.button('Onion or not? NLTK Edition'):
        test_nb_tokens = remove_noise(word_tokenize(test_nb_headline))
        results_nb = clf_nb.classify(dict([token, True] for token in test_nb_tokens))
        if results_nb == 1:
            st.write("It's from the Onion!")
        else:
            st.write("It's not from the Onion!")

    # nb_worked = st.radio('Did the Naive Bayes model make an accurate prediction?',
    #                     ('Yes', 'No')
def predict():

    import nltk
    nltk.download('twitter_samples')
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('punkt')

    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import twitter_samples, stopwords
    from nltk.tag import pos_tag
    from nltk.tokenize import word_tokenize
    from nltk import FreqDist, classify, NaiveBayesClassifier
    import re, string, random
    import pickle

    def remove_noise(tweet_tokens, stop_words=()):

        cleaned_tokens = []

        for token, tag in pos_tag(tweet_tokens):
            token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                           '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
            token = re.sub("(@[A-Za-z0-9_]+)", "", token)

            if tag.startswith("NN"):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            else:
                pos = 'a'

            lemmatizer = WordNetLemmatizer()
            token = lemmatizer.lemmatize(token, pos)

            if (len(token) > 0 and token not in string.punctuation
                    and token.lower() not in stop_words):
                cleaned_tokens.append(token.lower())
        return cleaned_tokens

    def get_all_words(cleaned_tokens_list):
        for tokens in cleaned_tokens_list:
            for token in tokens:
                yield token

    def get_tweets_for_model(cleaned_tokens_list):
        for tweet_tokens in cleaned_tokens_list:
            yield dict([token, True] for token in tweet_tokens)

    # NOTE: this guard reads the module-level __name__, so the block below
    # only runs when the file is executed directly (python app.py).
    if __name__ == "__main__":

        positive_tweets = twitter_samples.strings('positive_tweets.json')
        negative_tweets = twitter_samples.strings('negative_tweets.json')
        text = twitter_samples.strings('tweets.20150430-223406.json')
        tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

        stop_words = stopwords.words('english')

        positive_tweet_tokens = twitter_samples.tokenized(
            'positive_tweets.json')
        negative_tweet_tokens = twitter_samples.tokenized(
            'negative_tweets.json')

        positive_cleaned_tokens_list = []
        negative_cleaned_tokens_list = []

        for tokens in positive_tweet_tokens:
            positive_cleaned_tokens_list.append(
                remove_noise(tokens, stop_words))

        for tokens in negative_tweet_tokens:
            negative_cleaned_tokens_list.append(
                remove_noise(tokens, stop_words))

        all_pos_words = get_all_words(positive_cleaned_tokens_list)

        freq_dist_pos = FreqDist(all_pos_words)
        print(freq_dist_pos.most_common(10))

        positive_tokens_for_model = get_tweets_for_model(
            positive_cleaned_tokens_list)
        negative_tokens_for_model = get_tweets_for_model(
            negative_cleaned_tokens_list)

        positive_dataset = [(tweet_dict, "Positive")
                            for tweet_dict in positive_tokens_for_model]

        negative_dataset = [(tweet_dict, "Negative")
                            for tweet_dict in negative_tokens_for_model]

        dataset = positive_dataset + negative_dataset

        random.shuffle(dataset)

        train_data = dataset[:7000]
        test_data = dataset[7000:]

        classifier = NaiveBayesClassifier.train(train_data)

        print("Accuracy is:", classify.accuracy(classifier, test_data))

        classifier.show_most_informative_features(10)

        custom_tweet = ""

        if request.method == 'POST':
            custom_tweet = request.form['text']

        custom_tokens = remove_noise(word_tokenize(custom_tweet))

        NB_Cls = classifier.classify(
            dict([token, True] for token in custom_tokens))

        print(custom_tweet, NB_Cls)

        # persist the trained classifier itself, not just one predicted label
        pickle.dump(classifier, open('sentimental_101.pkl', 'wb'))

        return render_template('results.html', result=NB_Cls)
Example #46
0
    def run(self):
        kf = KFold(n_splits=10, shuffle=False, random_state=None)
        #features_set = [(self.corpora.get_sentence_by_id(key).opinion_finder_features(), value) for (key, value) in self.items]
        #features_set = [(self.corpora.get_sentence_by_id(key).arguing_features(), value) for (key, value) in self.items]
        #features_set = [(self.corpora.get_sentence_by_id(key).verb_features(), value) for (key, value) in self.items]
        #features_set = [(self.corpora.get_sentence_by_id(key).strong_subjectivity_feature(), value) for (key, value) in self.items]
        features_set = [
            (self.corpora.get_sentence_by_id(key).get_all_features(), value)
            for (key, value) in self.items
        ]
        #print('features_set: ',features_set)
        accuracy_list = []
        arg_precision_list = []
        arg_recall_list = []
        arg_f_measure_list = []
        non_arg_precision_list = []
        non_arg_recall_list = []
        non_arg_f_measure_list = []
        for train_index, test_index in kf.split(features_set):
            #print("TRAIN:", train_index, "TEST:", test_index) #SVC(), sparse=False
            #svm = SVC(kernel='linear',degree = 10 )
            #classifier = SklearnClassifier(svm, sparse=False).train(features_set[train_index[0]:train_index[len(train_index) - 1]])
            #print('coef :',svm.coef_)
            #print('_________________________________________')
            # index each fold by its actual indices; the original slice dropped
            # an item and let interior test folds leak into the training data
            train_fold = [features_set[i] for i in train_index]
            test_fold = [features_set[i] for i in test_index]
            classifier = nltk.NaiveBayesClassifier.train(train_fold)
            #print('most_informative_features: ',classifier.most_informative_features(10))

            #print('training set:', features_set[train_index[0]:train_index[len(train_index) - 1]])

            refsets = collections.defaultdict(set)
            testsets = collections.defaultdict(set)

            for i, (feats, label) in enumerate(test_fold):
                refsets[label].add(i)
                observed = classifier.classify(feats)
                testsets[observed].add(i)
            arg_precision = precision(refsets['arg'], testsets['arg'])
            arg_recall = recall(refsets['arg'], testsets['arg'])
            arg_f_measure = f_measure(refsets['arg'], testsets['arg'])
            non_arg_precision = precision(refsets['non-arg'],
                                          testsets['non-arg'])
            non_arg_recall = recall(refsets['non-arg'], testsets['non-arg'])
            non_arg_f_measure = f_measure(refsets['non-arg'],
                                          testsets['non-arg'])
            accuracy_ = accuracy(classifier, test_fold)

            accuracy_list.append(accuracy_)
            arg_precision_list.append(arg_precision)
            arg_recall_list.append(arg_recall)
            arg_f_measure_list.append(arg_f_measure)
            non_arg_precision_list.append(non_arg_precision)
            non_arg_recall_list.append(non_arg_recall)
            non_arg_f_measure_list.append(non_arg_f_measure)
        print('accuracy per fold: ', accuracy_list)
        print('arg_precision per fold: ', arg_precision_list)
        print('arg_recall per fold: ', arg_recall_list)
        print('arg_f_measure per fold: ', arg_f_measure_list)
        print('non_arg_precision per fold: ', non_arg_precision_list)
        print('non_arg_recall per fold: ', non_arg_recall_list)
        print('non_arg_f_measure per fold: ', non_arg_f_measure_list)
@profile
def doc_features(doc):
    doc_words = FreqDist(w for w in doc if not isStopWord(w))
    features = {}
    for word in word_features:
        features['count (%s)' % word] = (doc_words.get(word, 0))
    return features


@profile
def make_features(docs):
    return [(doc_features(d), c) for (d, c) in docs]


@profile
def split_data(sets):
    return sets[200:], sets[:200]


if __name__ == "__main__":
    labeled_docs = label_docs()

    sw = set(stopwords.words('english'))
    filtered = filter_corpus()
    word_features = select_word_features(filtered)
    featuresets = make_features(labeled_docs)
    train_set, test_set = split_data(featuresets)
    classifier = NaiveBayesClassifier.train(train_set)
    print("Accuracy", accuracy(classifier, test_set))
    classifier.show_most_informative_features()
Example #48
0
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]

    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))

    classifier.show_most_informative_features(10)

    custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    print("Testing with custom tweet")
    print(custom_tweet,
          classifier.classify(dict([token, True] for token in custom_tokens)))

    # added to store the model
    pickle_model(classifier)

    # persist the trained model so the block below can reload it
    pickle.dump(
        classifier,
        open(
            os.path.join(os.path.dirname(__file__),
                         'models/MaxEnt/maxent.pkl'), 'wb'))

    if not TRAIN:
        classifier = pickle.load(
            open(
                os.path.join(os.path.dirname(__file__),
                             'models/MaxEnt/maxent_85.pkl'), 'rb'))

    pred = []
    actual = [x[1] for x in test_set]
    for t, l in test_set:
        pred.append(classifier.classify(t))

    print(classify.accuracy(classifier, test_set))
    '''
    questions = open(os.path.join(os.path.dirname(__file__), 'datasets/ADA_v2_Exercise_Questions.txt')).read().split('\n')

    X_ada = []
    X_ada_orig = []
    for q in questions:
        X_ada.append(clean(q, return_as_list=False, stem=False))
        X_ada_orig.append(q)
    X_ada = get_filtered_questions(X_ada, threshold=0.75, what_type='ada')

    X_ada_features = [features(k.split()) for k in X_ada]

    preds = []
    for t in X_ada_features:
        preds.append(classifier.classify(t))
Example #50
0
vocabulary = set()
for fileid in train_fileids:
    for word in movie_reviews.words(fileid):
        vocabulary.add(word)

# Try a feature set of 500 random words
vocabulary = list(vocabulary)
random.shuffle(vocabulary)
random_featureset = vocabulary[:500]

train_set = format_dataset(train_fileids, random_featureset)
test_set = format_dataset(test_fileids, random_featureset)
bayes = NaiveBayesClassifier.train(train_set)

print("Random words: ", random_featureset)
print("Naive Bayes accuracy:", accuracy(bayes, test_set))

# Try a feature set of the 500 words that appear most often in the training examples
common_words = dict()
for fileid in train_fileids:
    for word in movie_reviews.words(fileid):
        if word not in common_words:
            common_words[word] = 1
        else:
            common_words[word] += 1

# keep only the words (not the counts) of the 500 most frequent entries
sorted_common = [word for word, count in
                 sorted(common_words.items(),
                        key=operator.itemgetter(1))[-500:]]
train_set = format_dataset(train_fileids, sorted_common)
test_set = format_dataset(test_fileids, sorted_common)
Example #51
0
print(len(dataset))

train_data = dataset[:5]
test_data = dataset[5:]

names = [
    "MultinomialNBclassifier", "BernoulliNB", "LogisticRegression_classifier",
    "SGDClassifier_classifier", "SVC_classifier", "LinearSVC_classifier",
    "NaiveBayesClassifier"
]

MultinomialNBclassifier = SklearnClassifier(MultinomialNB())
MultinomialNBclassifier.train(train_data)
print("\nMultinomialNB Accuracy is:",
      (classify.accuracy(MultinomialNBclassifier, test_data)) * 100)

# GaussianNB expects dense arrays, so pass sparse=False to the wrapper:
# GaussianNBclassifier = SklearnClassifier(GaussianNB(), sparse=False)
# GaussianNBclassifier.train(train_data)
# print("\nGaussianNB Accuracy is:", classify.accuracy(GaussianNBclassifier, test_data))

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())  # avoid shadowing the imported class
BernoulliNB_classifier.train(train_data)
print("BernoulliNB Algo Accuracy: ",
      (nltk.classify.accuracy(BernoulliNB_classifier, test_data)) * 100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(train_data)
print("LogisticRegression Algo Accuracy: ",
      (nltk.classify.accuracy(LogisticRegression_classifier, test_data)) * 100)

# positive reviews feature set
pos_features = []
for words in pos_reviews:
    pos_features.append((bag_of_words(words), 'pos'))

# negative reviews feature set
neg_features = []
for words in neg_reviews:
    neg_features.append((bag_of_words(words), 'neg'))

shuffle(pos_features)
shuffle(neg_features)

test_feature_set = pos_features[:200] + neg_features[:200]
train_feature_set = pos_features[200:] + neg_features[200:]

classifier = NBC.train(train_feature_set)

accuracy = classify.accuracy(classifier, test_feature_set)
print(accuracy)
#f = open('unigram_classifier.pickle', 'wb')
#pickle.dump(classifier, f)
#f.close()

while True:
    custom_review = input(
        "Enter a custom movie review (Press ENTER key to exit):\n")
    if len(custom_review) < 1:
        break
        break
    custom_review_tokens = word_tokenize(custom_review)
    custom_feature_set = bag_of_words(custom_review_tokens)
    print(classifier.classify(custom_feature_set))
    prob_result = classifier.prob_classify(custom_feature_set)
    print("confidence: " + (str)(prob_result.prob(prob_result.max())))
Example #53
0
    def accuracy_test(self):
        print('Performing Accuracy Test')
        print('Accuracy is:')
        print(classify.accuracy(self.classifier, self.test_data))
        print('---')
        self.classifier.show_most_informative_features(25)
Example #54
0
from nltk import classify

db = Query('canada_subreddit.db')
db.connect()
cur = db.cursor()
cur.execute('''
SELECT c.body, s.label
FROM submissions as s, comments as c
WHERE s.submission_id = c.submission_id
AND (s.label = "Climate"
OR s.label = "Housing");
''')
data = cur.fetchall()
feature_set = FeaturePipeline().create_set(data)

# hold out the last fifth of the feature set for testing
split = lambda x: -int(len(x) / 5)
k = split(feature_set)
training_set = feature_set[:k]
testing_set = feature_set[k:]


print('Now training...')

Naive_classifier = NaiveBayesClassifier.train(training_set)
print("Naive Bayes Algo accuracy percent:", (classify.accuracy(Naive_classifier, testing_set)))
Naive_classifier.show_most_informative_features(30)
#
# with open(pickle_file,"wb") as save_classifier:
#     pickle.dump(Naive_classifier, save_classifier)
#     save_classifier.close()
Example #55
0
def WSDClasifier(trainer,
                 word,
                 features,
                 stopwords=STOPWORDS,
                 number=300,
                 distance=3,
                 log=False,
                 confusion_matrix=False):
	"""
	Build a classifier instance for the senseval2 senses of a word and applies it

	:param word: from senseval2 (we have 'hard.pos', 'interest.pos', 'line.pos' and 'serve.pos')
	:type string:
	:param features: selector to which feature set to use
	:type features: str (word, context)
	:param n: passed to extract_vocab when constructing the second argument to the feature set constructor
	:type int:
	:param dist: passed to the feature set constructor as 3rd argument
	:type int:
	:param log: if set to True outputs any errors into a file errors.txt
	:type bool:
	:param confusion_matrix: if set to True prints a confusion matrix
	:type bool:

	Calling this function splits the senseval data for the word into a training set and a test set (the way it does
	this is the same for each call of this function, because the argument to random.seed is specified,
	but removing this argument would make the training and testing sets different each time you build a classifier).

	It then trains the trainer on the training set to create a classifier that performs WSD on the word,
	using features (with number or distance where relevant).

	It then tests the classifier on the test set, and prints its accuracy on that set.

	If log==True, then the errors of the classifier over the test set are written to errors.txt.
	For each error four things are recorded: (i) the example number within the test data (this is simply the index of the
	example within the list test_data); (ii) the sentence that the target word appeared in, (iii) the
	(incorrect) derived label, and (iv) the gold label.

	If confusion_matrix==True, then calling this function prints out a confusion matrix, where each cell [i,j]
	indicates how often label j was predicted when the correct label was i (so the diagonal entries indicate labels
	that were correctly predicted).
	"""
	global inst_cache

	if word not in inst_cache:
		inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
		
	events = inst_cache[word][:]
	senses = list(set(l for (i, l) in events))
	instances = [i for (i, l) in events]
	vocab = extract_vocab(instances, number)
	print(' Senses: ' + ' '.join(senses))
	# Split the instances into a training and test set,
	#if N > len(events): N = len(events)
	N = len(events)
	random.seed(123456789) 
	random.shuffle(events)
	train_data = events[:int(0.8 * N)]
	test_data = events[int(0.8 * N):N]

	# Train classifier
	print('Training classifier...')
	classifier = trainer([(features(i, vocab, distance), label) for (i, label) in train_data])
	# Test classifier
	print('Testing classifier...')
	acc = accuracy(classifier, [(features(i, vocab, distance), label) for (i, label) in test_data] )
	print('Accuracy: {:6.4f}'.format(acc))

	if log:
		#write error file
		print('Writing errors to errors.txt')
		with open('errors.txt', 'w') as file:
			errors = []
			for (i, label) in test_data:
				guess = classifier.classify(features(i, vocab, distance))
				if guess != label:
					con =  i.context
					position = i.position
					item_number = str(test_data.index((i, label)))
					word_list=[cv[0] if isinstance(cv,tuple) else cv for cv in con]
					hard_highlighted = word_list[position].upper()
					word_list_highlighted = word_list[0:position] + [hard_highlighted] + word_list[position+1:]
					sentence = ' '.join(word_list_highlighted)
					errors.append([item_number, sentence, guess,label])
			file.write('There are {} errors\n'.format(len(errors)))
			file.write('----------------------------\n')
			for error in errors:
				idx = errors.index(error)+1
				num, snt, guess, label = error
				file.write('{}) example #: {} \n sentence: {}\n guess: {}\n label: {}\n'.format(idx, num, snt, guess, label))
					
	if confusion_matrix:
		gold = [label for (i, label) in test_data]
		derived = [classifier.classify(features(i, vocab, distance)) for (i, label) in test_data]
		cm = nltk.ConfusionMatrix(gold,derived)
		print(cm)
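
A minimal usage sketch for the function above. NaiveBayesClassifier.train serves as the trainer, and context_features is an illustrative window extractor written here for the example (the real module presumably supplies its own; extract_vocab, STOPWORDS and inst_cache are assumed to be defined earlier in the file):

from nltk import NaiveBayesClassifier

def context_features(instance, vocab, dist):
    # bag of the words within `dist` positions of the target word;
    # `vocab` is accepted to match the expected signature but unused here
    left = max(instance.position - dist, 0)
    window = instance.context[left:instance.position + dist + 1]
    return {'contains(%s)' % (w[0] if isinstance(w, tuple) else w): True
            for w in window}

WSDClasifier(NaiveBayesClassifier.train,
             'hard.pos',
             features=context_features,
             number=300,
             distance=3,
             confusion_matrix=True)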
Example #56
0
print("Dictionary with Negative class : ", negativeReviewDataset[7])
#print("tagged neg :",negative_dataset[0])

dataset = positiveReviewDataset + negativeReviewDataset

print("Dataset[0] :", dataset[0])
print("Dataset length", len(dataset))

random.shuffle(dataset)

trainData = dataset[:7000]
testData = dataset[7000:]

trainedModel = NaiveBayesClassifier.train(trainData)

print("Accuracy of the model : ", classify.accuracy(trainedModel, testData))

review = "This is a bad product."
reviewTokens = noiseRemoval(word_tokenize(review))

# Test print
print(review, " : ",
      trainedModel.classify(dict([token, True] for token in reviewTokens)))

#Text = "j@nittha"
#Text = re.sub("@", "a", Text)
#print(Text)


# Flask API to be used in backend
@app.route("/NLP")
Example #57
0
def evaluate(train_set, test_set, classifier):
    print('Accuracy (training set) = ' + str(classify.accuracy(classifier, train_set)))
    print('Accuracy (test set) = ' + str(classify.accuracy(classifier, test_set)))

    classifier.show_most_informative_features(20)
Example #58
0
tagged_words = brown.tagged_words(categories='news')

print(tagged_words)
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]
featuresets[0]

from nltk import DecisionTreeClassifier
from nltk.classify import accuracy

cutoff = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[cutoff:], featuresets[:cutoff]

classifier = DecisionTreeClassifier.train(train_set) # NLTK is a teaching toolkit which is not really optimized for speed. Therefore, this may take forever. For speed, use scikit-learn for the classifiers.


accuracy(classifier, test_set)
classifier.classify(pos_features('cats'))
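
As the comment above notes, NLTK's pure-Python learners are slow; the same feature dicts can be handed to scikit-learn through NLTK's SklearnClassifier wrapper. A minimal sketch reusing the train_set/test_set built above (requires scikit-learn):

# SklearnClassifier vectorizes the NLTK-style feature dicts internally and
# fits scikit-learn's optimized decision tree instead of NLTK's own.
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.tree import DecisionTreeClassifier as SkDecisionTree

sk_classifier = SklearnClassifier(SkDecisionTree()).train(train_set)
print("scikit-learn tree accuracy:", accuracy(sk_classifier, test_set))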


'''

To accompany the video, here is the sample code for NLTK part of speech tagging with lots of comments and info as well:

POS tag list:

CC coordinating conjunction
CD cardinal digit
DT determiner
EX existential there (like: "there is" ... think of it like "there exists")
FW foreign word
IN preposition/subordinating conjunction
Example #59
-1
def category_by_pos():
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])

    common_suffixes = suffix_fdist.keys()[:100]
#    print common_suffixes

    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
#    classifier = DecisionTreeClassifier.train(train_set)
#    print 'Decision Tree %f' % classify.accuracy(classifier, test_set)

    classifier = NaiveBayesClassifier.train(train_set)
    print 'Naive Bayes %f' % classify.accuracy(classifier, test_set)
Example #60
-1
File: classifier.py  Project: anov/honors
	def validate(self, validation_set):
		if self.classifier is None:
			raise Exception("self.classifier is None")
		reference=defaultdict(set)
		observed=defaultdict(set)
		observed['neutral']=set()

		for i, (tweet, label) in enumerate(validation_set):
			reference[label].add(i)
			observation=self.classify(tweet)
			observed[observation].add(i)
		acc=classify.accuracy(self.classifier, validation_set)
		posp=precision(reference['positive'],observed['positive'])
		posr=recall(reference['positive'], observed['positive'])
		posf=f_measure(reference['positive'], observed['positive'])
		negp=precision(reference['negative'],observed['negative'])
		negr=recall(reference['negative'], observed['negative'])
		negf=f_measure(reference['negative'], observed['negative'])
		
		print "accuracy: %s" % acc
		print "pos precision: %s" % posp
		print "pos recall: %s" % posr
		print "pos f-measure: %s" % posf
		print "neg precision: %s" % negp
		print "neg recall: %s" % negr
		print "neg f-measure: %s" % negf
		return (acc, posp, posr, posf, negp, negr, negf)