def train(self, training_corpus):
    """Train ``self.classifier`` on the non-denied entries of a corpus.

    ``training_corpus`` must be a list or tuple whose first element is a
    dict (both checked with ``assert``).  Every entry whose ``"denied"``
    value equals 0 contributes one training sample: the features of its
    ``"text"`` (via ``twit_features``) labelled with its ``"polarity"``.
    """
    assert isinstance(training_corpus, (list, tuple))
    assert isinstance(training_corpus[0], dict)

    labelled_samples = []
    for entry in training_corpus:
        if entry["denied"] == 0:
            sample = (twit_features(entry["text"]), entry["polarity"])
            labelled_samples.append(sample)

    self.classifier = NaiveBayesClassifier.train(labelled_samples)
def get_sentiment_data(query, training_set):
    """Classify the tweets matching ``query`` and summarise them per timestamp.

    A Naive Bayes classifier is trained from
    ``training/<training_set>/training.txt``.  Each line of that file is
    split on tabs; field 0 is used as the label and field 1 is turned into
    features with ``get_features``.

    Returns a dict mapping every ``tweet.created_at`` value to the fraction
    (float in [0, 1]) of that key's tweets that were classified as ``'1'``.
    """
    train = []
    with open('training/' + training_set + '/training.txt') as f:
        for line in f:
            temp = line.split('\t')
            train.append((get_features(temp[1]), temp[0]))
    clf = NaiveBayesClassifier.train(train)

    tweets = grab_tweets(query)
    print("HERE")

    # Group predicted labels by creation time.  setdefault() replaces the
    # linear ``in classified.keys()`` scan and the list re-concatenation.
    classified = {}
    for tweet in tweets:
        label = clf.classify(get_features(tweet.text))
        classified.setdefault(tweet.created_at, []).append(label)
    print(classified)

    returndata = {}
    for key, labels in classified.items():
        positives = sum(1 for v in labels if v == '1')
        returndata[key] = float(positives) / len(labels)
    print(returndata)
    return returndata
Exemplo n.º 3
0
def nltk_model():
    """Fit nltk's naive Bayes classifier on the names dataset and report on it.

    Names are read from ``MALE_FILE`` and ``FEMALE_FILE`` (one per line),
    shuffled, featurized with ``nltk_featurize`` and split at
    ``TRAIN_PCT``.  The held-out accuracy (as a whole percentage) and the
    10 most informative features are printed; nothing is returned.
    """
    # each element of all_names is a (name, gender) tuple
    all_names = []
    for path, gender in ((MALE_FILE, "male"), (FEMALE_FILE, "female")):
        with open(path, "r") as handle:
            # rstrip drops the trailing newline / whitespace
            all_names.extend((line.rstrip(), gender) for line in handle)

    # sanity check on the expected size of the dataset
    assert len(all_names) == 7944

    random.shuffle(all_names)  # in place

    # ({'feature_type': feature_value}, gender) tuples
    features = [(nltk_featurize(name), gender) for name, gender in all_names]
    split_pt = int(TRAIN_PCT * len(features))
    train_set = features[:split_pt]
    test_set = features[split_pt:]

    nb = NaiveBayesClassifier.train(train_set)

    percent = int(100 * nltk.classify.accuracy(nb, test_set))
    print("accuracy = {0} %".format(percent))
    nb.show_most_informative_features(10)
def test_raw_mail(org_email):
	"""Build a bag-of-words feature dict for one raw email.

	The text is tokenized with ``word_tokenize``; each token is lower-cased
	and lemmatized with ``word_limit.lemmatize``.  Every resulting token that
	is not in ``stpwords`` becomes a key mapped to ``True``.

	NOTE(review): every statement after ``return features_test`` is
	unreachable.  It looks like script-level training code that was indented
	into this function by mistake -- confirm and move it to module level.
	"""

	features_test = {}
	wordtokens_test = [word_limit.lemmatize(key.lower()) for key in
	word_tokenize(org_email)]
	for key in wordtokens_test:
		if key not in stpwords:
			features_test[key] = True
	return features_test

	# --- unreachable from here on (see NOTE in the docstring) ---
	# Extract the features (tokenized, lemmatized, non-stopword tokens) from all the emails
	feature_sets = [(raw_mail(n), g) for (n,g) in mail_shuffle]

	# Split the feature sets: the first 10% is the test set, the rest is the training set
	size_feature = int(len(feature_sets) * 0.10)
	train_set, test_set = feature_sets[size_feature:], feature_sets[:size_feature]
	classifier = NaiveBayesClassifier.train(train_set)
	#print (test_set[1:5])

	# Print the accuracy on the test set, as a percentage
	print ('accuracy of the machine: ', (classify.accuracy(classifier,test_set))*100) 
	
	# Print the top 50 features
	classifier.show_most_informative_features(50) 

	# Print the labels known to the classifier
	print ('labels:',classifier.labels())

	# Classify text entered by the user, forever (no exit condition)
	while(True):
		featset = raw_mail(input("Enter text to classify: "))
		print (classifier.classify(featset))
Exemplo n.º 5
0
    def __init__(self, chatbot, **kwargs):
        """Train the classifier that recognises "what time is it" questions.

        Keyword arguments:
            positive: list of sentences that ARE time questions (label 1).
            negative: list of sentences that are NOT time questions (label 0).

        Both fall back to built-in example lists.  Each sentence is turned
        into features with ``self.time_question_features`` and the result is
        used to train ``self.classifier``.
        """
        super().__init__(chatbot, **kwargs)
        from nltk import NaiveBayesClassifier

        self.positive = kwargs.get('positive', [
            'what time is it',
            'hey what time is it',
            'do you have the time',
            'do you know the time',
            'do you know what time it is',
            'what is the time'
        ])

        # The comma after '...to do all this' was missing, which silently
        # concatenated it with 'what is it' into a single bogus sentence.
        self.negative = kwargs.get('negative', [
            'it is time to go to sleep',
            'what is your favorite color',
            'i had a great time',
            'thyme is my favorite herb',
            'do you have time to look at my essay',
            'how do you have the time to do all this',
            'what is it'
        ])

        labeled_data = (
            [(name, 0) for name in self.negative] +
            [(name, 1) for name in self.positive]
        )

        train_set = [
            (self.time_question_features(text), n) for (text, n) in labeled_data
        ]

        self.classifier = NaiveBayesClassifier.train(train_set)
def check_classifier(feature_extractor, **kwargs):
    '''
    Train the classifier on the training spam and ham, then check its accuracy
    on the test data, and show the classifier's most informative features.

    ``feature_extractor`` and ``**kwargs`` are forwarded unchanged to
    ``make_train_test_sets``, which supplies the training set and the
    separate spam / ham test sets.  Nothing is returned; results are printed.
    '''
    # Make training and testing sets of (features, label) data
    train_set, test_spam, test_ham = \
        make_train_test_sets(feature_extractor, **kwargs)

    classifier = NaiveBayesClassifier.train(train_set)
    spam_accuracy = nltk.classify.accuracy(classifier, test_spam)
    ham_accuracy = nltk.classify.accuracy(classifier, test_ham)

    # How accurate is the classifier on the test sets?
    print('Test Spam accuracy: {0:.2f}%'.format(100 * spam_accuracy))
    print('Test Ham accuracy: {0:.2f}%'.format(100 * ham_accuracy))

    # Show the top 20 informative features.  The method prints its own
    # report and returns None, so wrapping it in print only added a stray
    # "None" line (and was a Python 2-only print statement).
    classifier.show_most_informative_features(20)
    def train(self):
        """Train ``self.classifier`` to map noun terms to Subject tags.

        Uses the ``noun_terms`` and ``Subject`` indexes of ``portal_catalog``.
        For every catalog id present in both indexes, one training sample is
        produced per tag: a dict over all known nouns (1 if the object has
        the noun, else 0) labelled with that tag.  The classifier is only
        (re)assigned when at least one sample exists.
        """
        catalog = getToolByName(self, "portal_catalog")

        # Template feature dict: every known noun, marked absent.
        absent = dict.fromkeys(catalog.uniqueValuesFor("noun_terms"), 0)

        subjectIndex = catalog._catalog.getIndex("Subject")
        nounTermsIndex = catalog._catalog.getIndex("noun_terms")

        # Internal catalog ids of the objects that have noun terms ...
        idsWithNouns = IISet(nounTermsIndex._unindex.keys())
        # ... and of the objects that have subjects.
        idsWithSubjects = IISet(subjectIndex._unindex.keys())

        samples = []
        for cid in intersection(idsWithSubjects, idsWithNouns):
            presence = absent.copy()
            presence.update(dict.fromkeys(nounTermsIndex._unindex[cid], 1))
            # The same feature dict is shared by every tag of the object.
            samples.extend((presence, tag) for tag in subjectIndex._unindex[cid])

        if samples:
            self.classifier = NaiveBayesClassifier.train(samples)
Exemplo n.º 8
0
def train_nltk(data, labels):
    '''
    Returns the best nltk.NaiveBayesClassifier found by K-fold validation.

    The data is split into ``N_FOLDS`` shuffled folds; one model is trained
    per fold and the one with the highest held-out accuracy is kept.  Each
    fold's accuracy and the best model's 30 most informative features are
    printed.

    Inputs
    ---------
    data -- np.array of tuples
    labels -- array indexable by the same fold index arrays as ``data``

    Returns ``None`` when the splitter yields no folds.
    '''
    # Shuffle for now: only the post text itself is assumed to matter for
    # the offensiveness measure, so sample order carries no information.
    kf = cv.KFold(n=len(data), n_folds=N_FOLDS, shuffle=True)

    best_model = None
    max_acc = float('-inf')
    for train_index, test_index in kf:
        X_train, Y_train = data[train_index], labels[train_index]
        X_test, Y_test = data[test_index], labels[test_index]

        features_train = bulk_extract_features(X_train)
        features_test = bulk_extract_features(X_test)

        # Materialise the pairs: accuracy() walks the gold set twice, which
        # silently yields nothing on the second pass with a Python 3 zip.
        train_set = list(zip(features_train, Y_train))
        test_set = list(zip(features_test, Y_test))

        model = nbc.train(train_set)

        acc = nltk.classify.accuracy(model, test_set)
        print(str(acc))
        if acc > max_acc:
            max_acc = acc
            best_model = model

    if best_model is not None:
        best_model.show_most_informative_features(30)
    return best_model
    def __init_naive_bayes( self ):
        """
        Create and train the NaiveBayes classifier.

        Every file under ``corpora/corpus2/spam`` and ``corpora/corpus2/ham``
        is handed to ``self.__make_featured_set`` with the matching label
        and the combined result trains ``self.classifier``.  The corpus name
        is currently hard-coded to ``corpus2``.

        Raises:
            Exception: wrapping any error, with the original exception type
                name, file name, line number and message as arguments.
        """
        try:
            corpus = 'corpus2'

            path = os.path.join('corpora/', corpus)
            spam_path = os.path.join(path, 'spam')
            ham_path = os.path.join(path, 'ham')

            train_spam_filelist = [os.path.join(spam_path, f)
                                   for f in os.listdir(spam_path)]
            train_ham_filelist = [os.path.join(ham_path, f)
                                  for f in os.listdir(ham_path)]

            train_spam_set = self.__make_featured_set(train_spam_filelist, 'spam')
            train_ham_set = self.__make_featured_set(train_ham_filelist, 'ham')
            train_set = train_spam_set + train_ham_set

            self.classifier = NaiveBayesClassifier.train(train_set)

        except Exception:
            # ``except Exception`` lets KeyboardInterrupt / SystemExit pass
            # through untouched.  str(exc_value) replaces ``.message``, which
            # does not exist on Python 3 and would have masked the real error.
            exc_type, exc_value, exc_tb = sys.exc_info()
            raise Exception(
                "Unexpected error in SpamFilter: __spamFilter:",
                exc_type.__name__,
                os.path.basename(exc_tb.tb_frame.f_code.co_filename),
                exc_tb.tb_lineno,
                str(exc_value))
Exemplo n.º 10
0
def get_matrix(spam_set, ham_set, num_folds):
    '''
    Compute evaluation metrics averaged over K folds.

    For every (train, test_spam, test_ham) triple produced by
    ``utils.get_kfold_data`` a Naive Bayes classifier is trained and
    evaluated.  Spam is the positive class: a spam sample predicted as
    label 0 is a true positive, a ham sample predicted as label 1 is a true
    negative.

    Returns a tuple
    ``(precision, recall, F1, spam_accuracy_percent, ham_accuracy_percent)``,
    each averaged over ``num_folds``.  A ratio whose denominator is zero in
    a fold (e.g. nothing was predicted as spam) counts as 0 for that fold
    instead of raising ZeroDivisionError.
    '''
    def _ratio(numerator, denominator):
        # Safe division: an undefined metric contributes 0 to the average.
        return numerator / float(denominator) if denominator else 0.0

    total_precision = total_recall = F1 = spam_accuracy = ham_accuracy = 0

    for train_set, test_spam_set, test_ham_set in utils.get_kfold_data(spam_set, ham_set, num_folds):
        classifier = NaiveBayesClassifier.train(train_set)
        true_positive = false_positive = true_negative = false_negative = 0

        for features, _label in test_spam_set:
            if classifier.classify(features) == 0:
                true_positive += 1
            else:
                false_negative += 1

        for features, _label in test_ham_set:
            if classifier.classify(features) == 1:
                true_negative += 1
            else:
                false_positive += 1

        precision = _ratio(true_positive, true_positive + false_positive)
        recall = _ratio(true_positive, true_positive + false_negative)
        F1 += _ratio(2 * precision * recall, precision + recall)
        # Per-class accuracy; for spam this is the same quantity as recall.
        spam_accuracy += recall
        ham_accuracy += _ratio(true_negative, true_negative + false_positive)
        total_precision += precision
        total_recall += recall

    return (total_precision / num_folds, total_recall / num_folds,
            F1 / num_folds, spam_accuracy * 100 / num_folds,
            ham_accuracy * 100 / num_folds)
Exemplo n.º 11
0
    def __init__(self, **kwargs):
        """Set up the adapter and train its time-question classifier.

        The built-in ``self.positive`` sentences are labelled 1 and the
        ``self.negative`` sentences 0; each sentence is converted with
        ``self.time_question_features`` before training ``self.classifier``.
        """
        super(TimeLogicAdapter, self).__init__(**kwargs)
        from nltk import NaiveBayesClassifier

        self.positive = [
            'what time is it',
            'do you know the time',
            'do you know what time it is',
            'what is the time'
        ]

        self.negative = [
            'it is time to go to sleep',
            'what is your favorite color',
            'i had a great time',
            'what is'
        ]

        # (sentence, label) pairs: negatives first, then positives.
        labeled_data = [(sentence, 0) for sentence in self.negative]
        labeled_data.extend((sentence, 1) for sentence in self.positive)

        train_set = []
        for sentence, label in labeled_data:
            train_set.append((self.time_question_features(sentence), label))

        self.classifier = NaiveBayesClassifier.train(train_set)
Exemplo n.º 12
0
def buildclassifiers(featureslist, SAMPLE_PROPORTION, n):
    """Train and evaluate three spam classifiers, ``n`` runs each.

    For each model ('Naive Bayes', 'Logistic Regression', 'Linear SCV') the
    feature list is reshuffled in place and re-split with ``buildsets`` on
    every run.  The per-run measures returned by ``evaluate`` are summed
    element-wise and handed to ``printperformance`` together with ``n``.

    Returns a list with the last classifier trained for each model, in the
    order listed above.

    Raises:
        ValueError: if ``n`` is smaller than 1 (previously this surfaced as
            an UnboundLocalError after the empty inner loop).
    """
    if n < 1:
        raise ValueError("n must be at least 1, got %r" % (n,))

    classnames = ['Naive Bayes', 'Logistic Regression', 'Linear SCV']
    allclassifiers = []
    for name in classnames:
        for i in range(n):
            random.shuffle(featureslist)
            train_set, test_set = buildsets(featureslist, SAMPLE_PROPORTION)

            if name == 'Naive Bayes':
                spamclassifier = NaiveBayesClassifier.train(train_set)
            elif name == 'Logistic Regression':
                spamclassifier = SklearnClassifier(LogisticRegression())
                spamclassifier.train(train_set)
            else:  # 'Linear SCV'
                spamclassifier = SklearnClassifier(LinearSVC(C=0.01))
                spamclassifier.train(train_set)

            perfmeasures_i = evaluate(train_set, test_set, spamclassifier, name)
            if i == 0:
                perfmeasures_n = perfmeasures_i
            else:
                # list(): on Python 3 a bare map() is a lazy one-shot
                # iterator, which breaks indexing / len() downstream.
                perfmeasures_n = list(map(add, perfmeasures_n, perfmeasures_i))

        # Store last classifier built per model
        allclassifiers.append(spamclassifier)

        # Print performance measures per classifier
        printperformance(name, perfmeasures_n, n)

    return allclassifiers
Exemplo n.º 13
0
 def train_classifiers(self):
     """Train one Naive Bayes classifier per word in ``self.senses``.

     For each word, every vector stored under each sense id becomes one
     training sample ``[dict(vector), sense_id]``; the trained classifier is
     stored in ``self.classifiers[word]``.
     """
     for word in self.senses:
         by_sense = self.senses[word]
         samples = [
             [dict(lsa_vector), senseId]
             for senseId in by_sense
             for lsa_vector in by_sense[senseId]
         ]
         self.classifiers[word] = NaiveBayesClassifier.train(samples)
Exemplo n.º 14
0
def category_by_movie():
    """Train a pos/neg movie-review classifier and print its test accuracy.

    Each review in nltk's ``movie_reviews`` corpus becomes a feature dict
    with one ``contains(<word>)`` boolean for each of the 2000 most frequent
    (lower-cased) corpus words.  After shuffling, the first 100 documents
    are held out for testing and the rest are used for training.
    """
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    import random

    documents = [(list(mr.words(f)), c)
                 for c in mr.categories()
                 for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    # most_common() guarantees frequency order; ``keys()[:2000]`` is neither
    # subscriptable on Python 3 nor frequency-ordered in current NLTK.
    word_features = [word for word, _count in all_words.most_common(2000)]

    def document_features(document):
        # Set lookup keeps the 2000 membership tests cheap.
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    # Evaluate on the held-out documents; scoring the training set (as
    # before) only measures how well the model memorised it.
    print(classify.accuracy(classifier, test_set))
Exemplo n.º 15
0
def training(features, method, proportion_training):
    """Split ``features`` and train a classifier on the leading part.

    Args:
        features: sequence of labelled feature sets.
        method: name of the learning method; only ``'NaiveBayes'`` is
            supported.
        proportion_training: fraction of ``features`` (from the start) used
            for training, e.g. 2/3; the remainder is the testing set.

    Returns:
        ``(training_set, testing_set, classifier)``.

    Raises:
        ValueError: if ``method`` is not supported (previously this fell
            through to an UnboundLocalError at the return statement).
    """
    if method != 'NaiveBayes':
        raise ValueError("unsupported training method: %r" % (method,))

    cut = int(proportion_training * len(features))
    training_set = features[:cut]
    testing_set = features[cut:]

    classifier = NaiveBayesClassifier.train(training_set)

    return training_set, testing_set, classifier
Exemplo n.º 16
0
  def train(self, foldPercent=.8):
    """Split the built features and train ``self.classifier`` on the head.

    The first ``foldPercent`` of ``self.buildFeatures()`` is stored in
    ``self.setTrain`` and used for training; the rest goes to
    ``self.setTest``.
    """
    features = self.buildFeatures()
    cut = int(foldPercent * len(features))

    self.setTrain, self.setTest = features[:cut], features[cut:]
    self.classifier = nbc.train(self.setTrain)
Exemplo n.º 17
0
def train(features, samples_proportion):
    """Split ``features`` and train a Naive Bayes classifier on the head.

    The first ``samples_proportion`` of ``features`` is the training set and
    the remainder the test set; both sizes are printed.  The classifier is
    trained on a tuple copy of the training set.

    Returns ``(train_set, test_set, classifier)``.
    """
    train_size = int(len(features) * samples_proportion)
    train_set = features[:train_size]
    test_set = features[train_size:]

    print('Training set size = ' + str(len(train_set)) + ' emails')
    print('Test set size = ' + str(len(test_set)) + ' emails')

    classifier = NaiveBayesClassifier.train(tuple(train_set))
    return train_set, test_set, classifier
Exemplo n.º 18
0
def textClass(path="all.txt", max_records=150000):
    """Train rating and usefulness classifiers from a review dump.

    Records are read one at a time with ``readRec`` until it returns an
    empty record or ``max_records`` have been read.  Each review's words are
    stripped of punctuation and converted with ``word_feats``; the same
    feature dict is paired once with the rating (``str(score[0])``) and once
    with the usefulness class reported by ``parse4ftrs``.

    Args:
        path: file to read (defaults to the original ``"all.txt"``).
        max_records: upper bound on the number of records read.

    Returns:
        ``(rate_cl, use_cl)``, the rating and usefulness classifiers.
    """
    ratings = []      # (features, rating) pairs
    usefulness = []   # (features, usefulness class) pairs

    num_records = 0   # records actually parsed
    len_tot = 0
    mlen = 0

    # ``with`` closes the file even if a record fails to parse.
    with open(path) as dbFile:
        while num_records < max_records:
            if num_records % 1000 == 0:
                print("num records: %s" % (num_records,))
            raw_rec = readRec(dbFile)
            if len(raw_rec) == 0:
                break
            # Count only real records; counting before the emptiness check
            # inflated the denominator of the average by one at end of file.
            num_records += 1

            review_text = [word.strip(punctuation) for word in raw_rec["text"]]
            rate_val = str(raw_rec["score"][0])

            prs_rec = parse4ftrs(raw_rec)
            len_tot += prs_rec["length"]
            mlen = max(mlen, prs_rec["length"])
            use_val = str(prs_rec["class"])

            # word feature dictionary, shared by both training sets
            wfd = word_feats(review_text)
            ratings.append((wfd, rate_val))
            usefulness.append((wfd, use_val))

    avg_len = float(len_tot) / num_records if num_records else 0.0
    print("avg length: %s" % (avg_len,))
    print("max len: %s" % (mlen,))

    rate_cl = NaiveBayesClassifier.train(ratings)
    use_cl = NaiveBayesClassifier.train(usefulness)
    return rate_cl, use_cl
def evaluate_classifier(train_set, test_spam, test_ham):
    """Train a Naive Bayes classifier and report its accuracy.

    The classifier is trained on ``train_set`` (spam + ham) with NLTK's
    ``NaiveBayesClassifier.train``.  Its accuracy on ``test_spam`` and on
    ``test_ham`` is printed as a percentage, followed by the 20 most
    informative features.  Nothing is returned.
    """
    classifier = NaiveBayesClassifier.train(train_set)
    print("Test Spam accuracy: {0:.2f} %".format(100 * nltk.classify.accuracy(classifier, test_spam)))
    print("Test Ham accuracy: {0:.2f} %".format(100 * nltk.classify.accuracy(classifier, test_ham)))
    # The method prints its own report and returns None; printing its
    # result only emitted a stray "None" (and was Python 2-only syntax).
    classifier.show_most_informative_features(20)
Exemplo n.º 20
0
def train(features, samples_proportion):
    """Split ``features`` by ``samples_proportion`` and train on the head.

    Prints the size of both partitions and returns
    ``(train_set, test_set, classifier)``.
    """
    boundary = int(len(features) * samples_proportion)

    # leading slice trains the model, trailing slice is held out
    train_set = features[:boundary]
    test_set = features[boundary:]

    for caption, subset in (('Training', train_set), ('Test', test_set)):
        print(caption + ' set size = ' + str(len(subset)) + ' emails')

    return train_set, test_set, NaiveBayesClassifier.train(train_set)
Exemplo n.º 21
0
def _readEmails(directory):
    """Return the contents of every non-hidden file directly in ``directory``."""
    contents = []
    # glob instead of os.listdir so hidden files are ignored
    for path in glob.glob(directory + "/*"):
        # ``with`` closes the handle even if read() fails
        with open(path) as handle:
            contents.append(handle.read())
    return contents


def buildClassifier(hamDir, spamDir):
    """Evaluate a 70:30 spam classifier, then train and save a full one.

    All files in ``spamDir`` / ``hamDir`` are read, labelled ``'spam'`` /
    ``'ham'``, shuffled and converted with ``emailFeatures``.  A classifier
    trained on the first 70% is scored on the remaining 30% (accuracy is
    printed).  A second classifier trained on every email is then saved via
    ``saveClassifier`` as ``full-classifier.pickle``.
    """
    allEmails = [(email, 'spam') for email in _readEmails(spamDir)]
    allEmails += [(email, 'ham') for email in _readEmails(hamDir)]

    # Shuffle so that the 70:30 split mixes both classes; not needed for
    # the final full-size classifier.
    random.shuffle(allEmails)

    # One feature set per email
    features = [(emailFeatures(email), label) for (email, label) in allEmails]

    # 70:30 ratio for training:testing
    print("Using a 70:30 ratio for training:testing, the accuracy is as follows: ")
    totalSize = int(len(features) * 0.7)
    trainingEmails, testingEmails = features[:totalSize], features[totalSize:]

    print("training size: %d; testing size: %d" % (len(trainingEmails), len(testingEmails)))
    classifier = NaiveBayesClassifier.train(trainingEmails)
    print(classify.accuracy(classifier, testingEmails))

    print("Now creating and saving a full size classifier made up of %d emails..." % len(features))
    classifier = NaiveBayesClassifier.train(features)

    saveClassifier(classifier, "full-classifier.pickle")
Exemplo n.º 22
0
    def __init__(self,classifierType):
        """Train a Naive Bayes classifier from ``sfIsGood.csv``.

        The CSV (next to this module) is read with its first row skipped as
        a header.  Column 0 is used as the title, column 3 as the body,
        column 10 as the driver label, and a row counts as invalid when
        column 6 equals ``'invalid'``.

        Args:
            classifierType: ``'driver'`` trains on the valid rows only,
                labelled with their driver; ``'invalid'`` trains on every
                row, labelled ``'True'`` / ``'False'`` for invalidity.

        Sets ``self.word_features`` (the 500 most frequent lower-cased
        tokens), ``self.training_set`` and ``self.classifier``.

        Raises:
            ValueError: for any other ``classifierType`` (previously a
                NameError on ``documents``).
        """
        if classifierType not in ('driver', 'invalid'):
            raise ValueError("unknown classifierType: %r" % (classifierType,))

        rows = []  # (title, body, is_invalid, driver)
        dirname = os.path.dirname(__file__)
        # NOTE: 'rb' is the Python 2 convention for the csv module.
        with open(os.path.join(dirname,'sfIsGood.csv'), 'rb') as csvfile:
            spamreader = csv.reader(csvfile, delimiter=',')
            next(spamreader, None)  # skip the header row
            for row in spamreader:
                rows.append((row[0], row[3], row[6] == 'invalid', row[10]))

        if classifierType == 'driver':
            samples = [(title, body, driver)
                       for (title, body, invalid, driver) in rows
                       if not invalid]
        else:
            # Iterate over ALL rows.  The old code looped over
            # range(len(ctitles)) (the valid-only count) while indexing the
            # full lists, silently dropping the trailing rows.
            samples = [(title, body, str(invalid))
                       for (title, body, invalid, _driver) in rows]

        words = []
        documents = []
        for title, body, label in samples:
            tokens = nltk.word_tokenize(title) + nltk.word_tokenize(body)
            words += tokens
            documents.append((tokens, label))
        random.shuffle(documents)

        all_words = nltk.FreqDist(w.lower() for w in words)
        # most_common() guarantees frequency order, unlike keys()[:500].
        self.word_features = [w for w, _count in all_words.most_common(500)]
        self.training_set = [(self.document_features(d), c) for (d,c) in documents]
        self.classifier = NaiveBayesClassifier.train(self.training_set)
Exemplo n.º 23
0
    def naives_classifier(self, training_set, dev_set, log=0):
        """Train a Naive Bayes classifier and print its dev-set accuracy.

        When ``log`` equals 1 the 20 most informative features are shown as
        well.  Returns the trained classifier.
        """
        classifier = NaiveBayesClassifier.train(training_set)
        dev_accuracy = classify.accuracy(classifier, dev_set)
        print('Naive Bayes accuracy dev percent: ', (dev_accuracy * 100))

        if log != 1:
            return classifier

        classifier.show_most_informative_features(20)
        return classifier
def user_name_classify(user_name, classifier):
    """Infer a gender for a User given any name, using a Naive Bayes classifier.

    A classifier is trained on nltk's ``names`` corpus (all entries after
    the first 500 of male-then-female names, as before) and applied to
    ``user_name``.

    Args:
        user_name: the name to classify.
        classifier: kept for interface compatibility; as in the original
            code it is ignored and replaced by a freshly trained model.

    Returns:
        ``'male'`` or ``'female'``.
    """
    def _name_features(name):
        # Naive Bayes needs a feature *dict*, not a raw string.  The last
        # letter is the feature used in the NLTK book's gender example.
        # NOTE(review): confirm this is the intended feature set.
        return {'last_letter': name[-1:].lower()}

    # A distinct local name is required: assigning to ``names`` inside the
    # function shadowed the corpus reader and raised UnboundLocalError.
    labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                     [(name, 'female') for name in names.words('female.txt')])

    training_set = [(_name_features(name), gender)
                    for (name, gender) in labeled_names[500:]]
    classifier = NaiveBayesClassifier.train(training_set)
    return classifier.classify(_name_features(user_name))
Exemplo n.º 25
0
def classify(text, sender=None, subject=None):
    """Classify ``text`` and return the name of the most likely category.

    A classifier is trained from ``load_training_set()`` on every call.  The
    features are the bag of words of the text's bigrams, plus ``sender`` and
    ``subject`` as extra boolean features when given.  The probability of
    every category is pretty-printed before the best one is returned.
    """
    classifier = NaiveBayesClassifier.train(load_training_set())

    test_data = bag_of_words(extract_bigrams(text))
    for extra_feature in (sender, subject):
        if extra_feature is not None:
            test_data[extra_feature] = True

    classified = classifier.prob_classify(test_data)

    probabilities = {}
    for sample in classified.samples():
        probabilities[categories[sample]] = classified.prob(sample)
    pprint(probabilities)

    return categories[classified.max()]
Exemplo n.º 26
0
 def train(self, data):
     """Train ``self.classifier`` on sliding windows over the encoded data.

     ``data`` is encoded with ``self._represent`` into
     ``self.result_string``, whose symbol frequencies are stored in
     ``self.labels``.  Windows of ``self.n_w`` symbols are taken every
     ``self.n_w - 1`` positions; each window and the symbol that follows it
     are passed to ``self._gen_feature`` to build one training sample.
     """
     self.result_string = self._represent(data)
     # Dropped two dead stores: ``self.labels = defaultdict(int)`` (it was
     # overwritten right away) and the unused ``result_string_len``.
     self.labels = FreqDist(self.result_string)
     train = []
     for start in range(0, len(self.result_string) - self.n_w, self.n_w - 1):
         window = self.result_string[start:start + self.n_w]
         # symbol immediately after the window
         x_key = self.result_string[start + self.n_w]
         train.append(self._gen_feature(window, x_key))
     self.classifier = NaiveBayesClassifier.train(train)
Exemplo n.º 27
0
def train(positiveFile='positive.csv', negativeFile='negative.csv', nOccurrences=25, trainProportion=0.9):
    """Train, evaluate and pickle a tweet sentiment classifier.

    Both files are read as tab-separated rows whose second field is the
    tweet text (rows without exactly 9 fields have their text printed but
    are still used).  Texts are converted with ``featurify``; rows from
    ``negativeFile`` are labelled ``"neg"``, all others ``"pos"``.

    Side effects:
        * ``features.lst`` / ``features.pickle``: the features seen at
          least ``nOccurrences`` times.
        * ``classifier.pickle``: the classifier trained on the first
          ``trainProportion`` of the shuffled samples.
        * prints the feature count, the held-out accuracy and the most
          informative features.
    """
    tweetfeats = []
    masterfeats = {}
    for fn in [positiveFile, negativeFile]:
        theclass = "neg" if fn == negativeFile else "pos"
        # ``with`` closes the input; the old inner loop variable ``f``
        # clobbered the file handle, so it was never closed.
        with open(fn, 'r') as source:
            for line in csv.reader(source, delimiter='\t'):
                text = line[1]
                if len(line) != 9:
                    print(text)
                # break up into tokens removing all non-word chars
                feat = featurify(text)
                for token in feat:
                    # The first sighting counts as 1; it used to be stored
                    # as 0, making every count one too low.
                    masterfeats[token] = masterfeats.get(token, 0) + 1
                if len(feat) > 0:
                    tweetfeats.append((feat, theclass))

    # Keep only the features seen at least nOccurrences times.
    masterfeats = {token: count for token, count in masterfeats.items()
                   if count >= nOccurrences}
    with open("features.lst", "w") as out:
        out.write('\n'.join(list(masterfeats.keys())))
    print("Number of Features = %i" % len(masterfeats))

    train_cut = int(len(tweetfeats) * trainProportion)
    random.shuffle(tweetfeats)
    trainfeats = tweetfeats[:train_cut]
    testfeats = tweetfeats[train_cut:]

    print("Training sentiment classifier...")
    sys.stdout.flush()
    classifier = NaiveBayesClassifier.train(trainfeats)
    print('accuracy: %s' % (nltk.classify.util.accuracy(classifier, testfeats),))
    classifier.show_most_informative_features()
    sys.stdout.flush()

    # SAVE the classifier & features.  Pickles are binary data: text mode
    # fails on Python 3 and can corrupt the file on Windows.
    with open("classifier.pickle", 'wb') as out:
        pickle.dump(classifier, out)
    with open("features.pickle", 'wb') as out:
        pickle.dump(masterfeats, out)
Exemplo n.º 28
0
def cross_validate():
    """Run 10-fold cross-validation on the training set and print accuracies.

    The training set from ``load_training_set()`` is shuffled once, then
    split into 10 folds.  For every fold the index range of the evaluation
    part and its accuracy are printed, followed by the average accuracy.
    """
    n_folds = 10
    training_set = load_training_set()
    random.shuffle(training_set)
    average = 0
    cv = KFold(len(training_set), n_folds=n_folds, indices=True, shuffle=False, random_state=None)
    for traincv, evalcv in cv:
        # Select samples by index.  The training indices of a middle fold
        # are NOT contiguous, so slicing from the first to the last index
        # (as before) pulled the evaluation fold into the training data and
        # also dropped the final sample of each range.
        train_fold = [training_set[i] for i in traincv]
        eval_fold = [training_set[i] for i in evalcv]

        classifier = NaiveBayesClassifier.train(train_fold)
        acc = accuracy(classifier, eval_fold)
        print('Range:  %s to %s' % (evalcv[0], evalcv[len(evalcv) - 1]))
        print('Accuracy: %4.2f' % acc)
        average += acc
    print('Average accuracy: %4.2f' % (average / n_folds))
Exemplo n.º 29
0
   def buildRevClassifier(self, features, normalize, validity):
      """Train a Naive Bayes classifier that predicts a review's reviewer.

      Args:
         features: callable turning one review into a feature dict.
         normalize: currently unused.
         validity: currently unused.

      Returns:
         The classifier trained on ``(features(rev), rev.reviewer)`` for
         every review in ``self.values()``, in shuffled order.
      """
      # Copy to a list first: a dict view cannot be shuffled on Python 3.
      revs = list(self.values())
      random.shuffle(revs)

      # Build from the shuffled list.  The old code called self.values()
      # again here, which threw the shuffle away.
      featureSets = [(features(rev), rev.reviewer) for rev in revs]

      return NaiveBayesClassifier.train(featureSets)
# positive reviews feature set
pos_features = [(bag_of_words(words), 'pos') for words in pos_reviews]

# negative reviews feature set
neg_features = [(bag_of_words(words), 'neg') for words in neg_reviews]

shuffle(pos_features)
shuffle(neg_features)

# Hold out the first 200 reviews of each polarity for testing.
test_feature_set = pos_features[:200] + neg_features[:200]
train_feature_set = pos_features[200:] + neg_features[200:]

classifier = NBC.train(train_feature_set)

accuracy = classify.accuracy(classifier, test_feature_set)
print(accuracy)
# (saving the classifier to 'unigram_classifier.pickle' is disabled)

# Interactive loop: classify reviews typed by the user until an empty line.
while True:
    custom_review = input(
        "Enter a custom movie review (Press ENTER key to exit):\n")
    if not custom_review:
        break
    custom_review_tokens = word_tokenize(custom_review)
    custom_feature_set = bag_of_words(custom_review_tokens)
    print(classifier.classify(custom_feature_set))
Exemplo n.º 31
0
def nbtrain(train_set):
    """Return a Naive Bayes classifier trained on ``train_set``."""
    return NaiveBayesClassifier.train(train_set)
Exemplo n.º 32
0
ts = ts[:2]

# Materialise the (sentence, tag) pairs: they are traversed more than once
# below, and a Python 3 zip object can only be consumed a single time.
training_data = list(zip(tl, ts))

# Tokenize every sentence exactly once (it used to be re-tokenized for each
# vocabulary word) and keep the tokens as a set for O(1) membership tests.
tokenized_data = [(set(word_tokenize(sentence.lower())), tag)
                  for sentence, tag in training_data]

vocabulary = set(chain.from_iterable(tokens for tokens, _tag in tokenized_data))

# One boolean "word present?" feature per vocabulary word.
feature_set = [
    ({i: (i in tokens) for i in vocabulary}, tag)
    for tokens, tag in tokenized_data
]

classifier = nbc.train(feature_set)

# for classifying a new sentence

test_sentence = tl[1]
test_tokens = set(word_tokenize(test_sentence.lower()))
featurized_test_sentence = {i: (i in test_tokens) for i in vocabulary}

print("test_sent: %s" % (test_sentence,))
print("tag: %s" % (classifier.classify(featurized_test_sentence),))
Exemplo n.º 33
0
        feature[u_word] = (u_word in doc)
    return feature


extract = extract_words(['admir', 'med', 'pesso'])

# For each document: check which of the known words it contains and attach
# its class at the end (joy or fear).
dataset_train = apply_features(extract_words, words_stemmer_train)
dataset_test = apply_features(extract_words, words_stemmer_test)

# BUILDING THE MODEL WITH NAIVE BAYES

# builds a probability table
classifier = NaiveBayesClassifier.train(dataset_train)

# Collect every misclassified test sample as (expected, predicted, features).
errors = []
for feature, target in dataset_test:
    result = classifier.classify(feature)
    if result == target:
        continue
    errors.append((target, result, feature))

for (target, result, feature) in errors:
    print(target, result, feature)

# use the confusion matrix to see how the data fares in terms of errors and hits
y_test = []
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

# Label every tweet feature dict with its sentiment.
positive_dataset = [(tweet_dict, "Positive")
                    for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                    for tweet_dict in negative_tokens_for_model]

# total dataset, positives and negatives, in random order
dataset = positive_dataset + negative_dataset
random.shuffle(dataset)

# The first 7000 samples train the model, the rest test it.
# NOTE(review): this is a 70/30 split only if the dataset holds 10000 samples.
train_data = dataset[:7000]
test_data = dataset[7000:]
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))
# Prints the 20 most informative words itself and returns None, so it must
# not be wrapped in print() (that only added a stray "None" line).
classifier.show_most_informative_features(20)

custom_tokens = remove_noise(word_tokenize(data))  # using our data
print(classifier.classify({token: True for token in custom_tokens}))
print(custom_tokens)

unique_words = set(custom_tokens)

# Count every token in a single pass instead of calling list.count() once
# per unique word (quadratic in the number of tokens).
token_counts = {}
for token in custom_tokens:
    token_counts[token] = token_counts.get(token, 0) + 1

freq_list = []
for words in unique_words:
    freq_list.append([token_counts[words], words])
Exemplo n.º 35
0
 def train(self, train_set):
     """Train a Naive Bayes classifier, store it on ``self`` and return it."""
     trained = NaiveBayesClassifier.train(train_set)
     self.classifier = trained
     return trained
Exemplo n.º 36
0
import nltk
from nltk import NaiveBayesClassifier
from nltk.corpus import names
import random

# NOTE: this rebinds ``names`` from the corpus reader to the list of
# (name, gender) pairs; the corpus reader is not reachable afterwards.
names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)
print(names[:3])

feature_sets = [(gender_features(n), g) for (n, g) in names]
print(feature_sets[:3])
# first 500 samples are held out for testing, the rest train the model
train_set, test_set = feature_sets[500:], feature_sets[:500]

classifier = NaiveBayesClassifier.train(train_set)
print(classifier.classify(gender_features('Neo')))
print(classifier.classify(gender_features('Trinity')))
print(nltk.classify.accuracy(classifier, test_set))
# Prints its own report and returns None; wrapping it in print() only
# emitted a stray "None" line.
classifier.show_most_informative_features(5)

# apply_features builds the feature sets lazily instead of holding every
# feature dict in memory at once.
from nltk.classify import apply_features
train_set = apply_features(gender_features, names[500:])
test_set = apply_features(gender_features, names[:500])

"""
素性抽出関数を改善する
訓練データに偏った素性になってしまう -> 過学習
"""

from collections import OrderedDict
Exemplo n.º 37
0
def get_classifier():
    """Return a Naive Bayes classifier trained on the result of ``get_trains_set()``."""
    return NaiveBayesClassifier.train(get_trains_set())
Exemplo n.º 38
0
# Wrap the labelled sentences in arrays so they can be indexed with the index
# arrays produced by KFold.split below.
features_data = np.array(sentences)
features_data_test = np.array(testSentences)

# 10 shuffled folds; the fixed random_state makes the fold assignment repeatable.
k_fold = KFold(n_splits=10, random_state=1992, shuffle=True)
word_features = None
accuracy_scores = []
accuracy_data_scores = []
for train_set, test_set in k_fold.split(features_data):
    # Rebuild the vocabulary from the training part of this fold only.
    # NOTE(review): presumably extract_features reads this module-level
    # word_features - confirm against its definition.
    word_features = get_word_features(
        get_words_in_sentences(features_data[train_set].tolist()))
    train_features = apply_features(extract_features,
                                    features_data[train_set].tolist())
    test_features = apply_features(extract_features,
                                   features_data[test_set].tolist())
    classifier = NaiveBayesClassifier.train(train_features)

    # label -> set of example indices: reference labels vs. predicted labels
    # for the held-out part of this fold.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    # The separate test sentences are featurised again for every fold, and get
    # their own pair of reference/predicted index sets.
    testdata_features = apply_features(extract_features,
                                       features_data_test.tolist())
    refdatasets = collections.defaultdict(set)
    testdatasets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(test_features):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    # NOTE(review): the body of this loop is missing from the snippet.
    for i, (feats, label) in enumerate(testdata_features):
Exemplo n.º 39
0
def train(labeled_featuresets, estimator=ELEProbDist):
    """Build a ``NaiveBayesClassifier`` from a label distribution and no feature distributions.

    :param labeled_featuresets: accepted but not read by this function.
    :param estimator: callable applied to ``label_freqdist`` to obtain the
        label probability distribution.
    :returns: a classifier constructed with that label distribution and an
        empty feature-distribution dictionary.

    NOTE(review): ``label_freqdist`` is not defined inside this function; it
    has to exist at module level or this raises ``NameError`` - confirm.
    """
    return NaiveBayesClassifier(estimator(label_freqdist), {})
Exemplo n.º 40
0
def evaluate_model(dataset, train_percentage=0.9):
    """Train on the leading part of ``dataset`` and return the accuracy on the rest.

    :param dataset: iterable of ``(item, label)`` pairs; every item is passed
        through ``get_features``.
    :param train_percentage: fraction of the examples, taken from the front,
        that is used for training.
    :returns: the value of ``nltk.classify.accuracy`` on the remaining examples.
    """
    labelled = [(get_features(sample), label) for (sample, label) in dataset]
    split_at = int(len(labelled) * train_percentage)
    model = NaiveBayesClassifier.train(labelled[:split_at])
    return nltk.classify.accuracy(model, labelled[split_at:])
Exemplo n.º 41
0
    def train_test_model(self):
        '''
        Train and evaluate a Naive Bayes sentiment model on NLTK's twitter_samples corpus.

        The positive and negative sample tweets are tokenised with
        ``DataTransform.clean_text`` and turned into ``{token: True}`` feature
        dictionaries labelled 'Positive' / 'Negative'. The combined examples
        are shuffled; the first 7000 train the model and the remainder is used
        to print the accuracy (the original author reported 75.467%).

        Importing more positive and negative classified tweets could be used to improve the model.

        The trained classifier is stored in ``self.trained_model``; nothing is returned.
        '''

        print('Preprocessing classified tweets for model.')
        from nltk.corpus import twitter_samples
        import random

        def labelled_featuresets(corpus_file, label):
            # Tokenise every tweet of `corpus_file` and pair its token dict with `label`.
            tweets_df = pd.DataFrame(
                twitter_samples.strings(corpus_file)).rename(columns={0: 'text'})
            transformer = DataTransform()
            transformer.set_df(tweets_df)
            transformer.clean_text('text', 'token_text')
            # Iterate the column values directly instead of indexing by
            # range(len(index)), which only works while the index is 0..n-1.
            return [({token: True for token in tokens}, label)
                    for tokens in transformer.output_df['token_text']]

        dataset = (labelled_featuresets('positive_tweets.json', 'Positive')
                   + labelled_featuresets('negative_tweets.json', 'Negative'))

        random.shuffle(dataset)

        train_data = dataset[:7000]
        test_data = dataset[7000:]

        self.trained_model = NaiveBayesClassifier.train(train_data)

        print("Accuracy is:", classify.accuracy(self.trained_model, test_data))




            
 def trainModel(self, train_data, test_data):
     """Return a Naive Bayes classifier trained on ``train_data``; ``test_data`` is not used."""
     classifier = NaiveBayesClassifier.train(train_data)
     return classifier
    all_words += tweet[0]
# Keep the 200 most frequent words as the feature vocabulary.
freq = fd(all_words)
common = freq.most_common(200)
features = [word for word, _count in common]


def get_feature_dict(words):
    """Map every entry of the module-level ``features`` list to whether it occurs in ``words``."""
    present = set(words)
    return {feature: feature in present for feature in features}


# Labelled feature dicts for training; unlabelled feature dicts for prediction.
training_data = [(get_feature_dict(words), label)
                 for words, label in clean_words_train]
testing_data = list(map(get_feature_dict, clean_words_test))
print(training_data)
print(testing_data[0])
classifier = nb.train(training_data)
# One predicted label per test tweet, written one per line to the CSV file.
output = list(map(classifier.classify, testing_data))
print(output)
np.savetxt("predictions_twitter_sentimental.csv",
           output,
           fmt="%s",
           delimiter=" ")
Exemplo n.º 44
0
def train_test_evaluation():
    """Train, evaluate and pickle a Naive Bayes tweet sentiment classifier.

    The positive and negative tweets of NLTK's ``twitter_samples`` corpus are
    cleaned with ``remove_noise``, converted to feature dicts with
    ``get_tweets_for_model`` and labelled "Positive" / "Negative". The
    shuffled examples are split into the first 9000 for training and the rest
    for testing. Progress information and the test accuracy are printed, and
    the classifier is pickled to ``tweeter_trained_cls.pickle`` in the
    current working directory.

    :returns: the trained classifier.
    """
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')

    print('Total number of positive_tweets are : ', len(positive_tweets))
    print('Total number of negative_tweets are : ', len(negative_tweets))
    print('-------------------------')
    print('one smaple of positive_tweets : ', positive_tweets[0])
    print('one smaple of negative_tweets : ', negative_tweets[0])
    print('-------------------------\n\n')

    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
    print('Total number of positive_tweet_tokens are : ',
          len(positive_tweet_tokens))
    print('Total number of negative_tweet_tokens are : ',
          len(negative_tweet_tokens))
    print('-------------------------')
    print('one smaple of positive_tweet_tokens : ', positive_tweet_tokens[0])
    print('one smaple of negative_tweet_tokens : ', negative_tweet_tokens[0])
    print('-------------------------\n\n')

    positive_cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in positive_tweet_tokens
    ]
    negative_cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in negative_tweet_tokens
    ]

    positive_tokens_for_model = get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(
        negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)

    train_data = dataset[:9000]
    test_data = dataset[9000:]

    print('Length of Train Data is : ', len(train_data))
    print(' A sample of Traing Data : ', train_data[0])
    print('-------------------------')
    # Report the size of the test split (this used to print len(train_data)).
    print('Length of Test Data is : ', len(test_data))
    print(' A sample of Test Data : ', test_data[0])
    print('-------------------------')

    classifier = NaiveBayesClassifier.train(train_data)

    print("\n\n Accuracy is:", classify.accuracy(classifier, test_data))

    # show_most_informative_features() prints its own table and returns None,
    # so it is called directly instead of being wrapped in print().
    classifier.show_most_informative_features(10)

    # The context manager closes the file even if pickling fails.
    with open('tweeter_trained_cls.pickle', 'wb') as f:
        pickle.dump(classifier, f)

    return classifier
def predict():
    """Train a tweet sentiment classifier and classify the text posted with the request.

    Downloads the NLTK resources it needs, trains a Naive Bayes classifier on
    the ``twitter_samples`` corpus (first 7000 shuffled examples for training,
    the rest for the printed accuracy), classifies the ``text`` form field of
    a POST request (an empty string otherwise), pickles the predicted label to
    ``sentimental_101.pkl`` and renders ``results.html`` with it.

    NOTE(review): all of the work is guarded by ``if __name__ == "__main__"``,
    so the function returns ``None`` when the defining module is imported
    rather than run as a script - confirm this is intended.
    """

    import nltk
    nltk.download('twitter_samples')
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('punkt')

    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import twitter_samples, stopwords
    from nltk.tag import pos_tag
    from nltk.tokenize import word_tokenize
    from nltk import FreqDist, classify, NaiveBayesClassifier
    import re
    import string
    import random
    import pickle

    # Raw strings keep regex escapes such as \( from being read as (invalid)
    # string escapes; the compiled patterns are the same as before.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_pattern = re.compile(r"(@[A-Za-z0-9_]+)")

    def remove_noise(tweet_tokens, stop_words=()):
        # Strip URLs and @mentions, lemmatise, lowercase, and drop empty
        # tokens, punctuation and stop words.
        cleaned_tokens = []
        # One lemmatizer per call instead of one per token.
        lemmatizer = WordNetLemmatizer()

        for token, tag in pos_tag(tweet_tokens):
            token = url_pattern.sub('', token)
            token = mention_pattern.sub("", token)

            # Map the tag prefix to the part-of-speech code passed to the lemmatizer.
            if tag.startswith("NN"):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            else:
                pos = 'a'

            token = lemmatizer.lemmatize(token, pos)

            if (len(token) > 0 and token not in string.punctuation
                    and token.lower() not in stop_words):
                cleaned_tokens.append(token.lower())
        return cleaned_tokens

    def get_all_words(cleaned_tokens_list):
        # Yield every token of every token list, in order.
        for tokens in cleaned_tokens_list:
            yield from tokens

    def get_tweets_for_model(cleaned_tokens_list):
        # Yield one {token: True} feature dict per token list.
        for tweet_tokens in cleaned_tokens_list:
            yield {token: True for token in tweet_tokens}

    if __name__ == "__main__":

        stop_words = stopwords.words('english')

        positive_tweet_tokens = twitter_samples.tokenized(
            'positive_tweets.json')
        negative_tweet_tokens = twitter_samples.tokenized(
            'negative_tweets.json')

        positive_cleaned_tokens_list = [
            remove_noise(tokens, stop_words)
            for tokens in positive_tweet_tokens
        ]
        negative_cleaned_tokens_list = [
            remove_noise(tokens, stop_words)
            for tokens in negative_tweet_tokens
        ]

        all_pos_words = get_all_words(positive_cleaned_tokens_list)

        freq_dist_pos = FreqDist(all_pos_words)
        print(freq_dist_pos.most_common(10))

        positive_tokens_for_model = get_tweets_for_model(
            positive_cleaned_tokens_list)
        negative_tokens_for_model = get_tweets_for_model(
            negative_cleaned_tokens_list)

        positive_dataset = [(tweet_dict, "Positive")
                            for tweet_dict in positive_tokens_for_model]

        negative_dataset = [(tweet_dict, "Negative")
                            for tweet_dict in negative_tokens_for_model]

        dataset = positive_dataset + negative_dataset

        random.shuffle(dataset)

        train_data = dataset[:7000]
        test_data = dataset[7000:]

        classifier = NaiveBayesClassifier.train(train_data)

        print("Accuracy is:", classify.accuracy(classifier, test_data))

        # show_most_informative_features() prints its own table and returns
        # None, so it is called directly instead of being wrapped in print().
        classifier.show_most_informative_features(10)

        custom_tweet = ""

        if request.method == 'POST':
            custom_tweet = request.form['text']

        custom_tokens = remove_noise(word_tokenize(custom_tweet))

        NB_Cls = classifier.classify(
            {token: True for token in custom_tokens})

        print(custom_tweet, NB_Cls)

        # Only the predicted label is pickled, not the classifier; the context
        # manager closes the file handle that used to be leaked.
        with open('sentimental_101.pkl', 'wb') as label_file:
            pickle.dump(NB_Cls, label_file)

        return render_template('results.html', result=NB_Cls)
Exemplo n.º 46
0
    # Feature vocabulary: at most 2000 distinct words of all_words.
    # NOTE(review): a set has no defined order, so which words are kept is
    # arbitrary rather than "the most frequent" - confirm this is intended.
    word_features = list(set(all_words))[:2000]

    def find_features(wordList):
        # Map every vocabulary word to whether it occurs in wordList.
        present = set(wordList)
        return {w: (w in present) for w in word_features}

    # One (feature dict, category) pair per document.
    training_set = [(find_features(wordList), category)
                    for wordList, category in documents]

    classifier = NaiveBayesClassifier.train(training_set)

while True:
    choose = 0

    print("Opinion List")
    print("============")

    # Show the stored opinions as a list numbered from 1, or a placeholder line.
    if len(opinionList) == 0:
        print("No opinion inserted")
    else:
        for number, opinion in enumerate(opinionList, start=1):
            print(str(number) + ". " + opinion)

    print("Opinion Analysis")
    print("1. Insert Opinion")
class NBClassifier(TransformerMixin):
    """Naive Bayes classifier for part-of-text classification.

    The classifier creates a wrapper around NLTK NaiveBayesClassifier
    and implements `transform` and `fit_transform` methods suitable for
    pipeline integration.

        :param label_probdist:
            P(label), the probability distribution over labels.

            It is expressed as a ``ProbDistI`` whose samples are labels.
            I.e., P(label) = ``label_probdist.prob(label)``.

        :param feature_probdist:
            P(fname=fval|label), the probability distribution for feature values, given labels.

            It is expressed as a dictionary whose keys are ``(label, fname)``
            pairs and whose values are ``ProbDistI`` objects over feature values.
            I.e., P(fname=fval|label) = ``feature_probdist[label,fname].prob(fval)``.
            If a given ``(label,fname)`` is not a key in ``feature_probdist``,
            then it is assumed that the corresponding P(fname=fval|label)
            is 0 for all values of ``fval``.

        :param estimator:
            Estimator passed to ``NaiveBayesClassifier.train`` when `fit` is called.
    """
    def __init__(self,
                 label_probdist=None,
                 feature_probdist=None,
                 estimator=ELEProbDist):

        self._estimator = estimator

        # in case arguments are specified (ie. when restoring the classifier)
        if all([label_probdist, feature_probdist]):
            self._classifier = NaiveBayesClassifier(
                label_probdist=label_probdist,
                feature_probdist=feature_probdist,
            )
        else:
            # not trained yet; `fit` creates the classifier
            self._classifier = None

    @property
    def features(self):
        """Most informative features of the trained classifier, or None if untrained."""
        if self._classifier is None:
            return None

        return self._classifier.most_informative_features()

    # noinspection PyPep8Naming, PyUnusedLocal
    def fit(self, X: typing.Iterable, y=None, **fit_params):  # pylint: disable=invalid-name,unused-argument
        """Fits the classifier to the given data set.

        :param X: Iterable, output of FeatureExtractor

            The X is expected to be an iterable of tuples (tagged_word, feature_set, label),
            where feature set is a dictionary of evaluated features.
            The format of X matches the output of `FeatureExtractor`.

        :param y: redundant (included to preserve base class method definition)
        :returns: self
        """

        # NLTK classifier expects stacked featuresets for the training,
        # so the per-entry lists are flattened into one list of
        # (featureset, label) pairs; the tagged word is dropped
        labeled_featuresets = [
            (featureset, feature_label)
            for entry in X
            for _, featureset, feature_label in entry
        ]

        # initialize the NLTK classifier
        self._classifier = NaiveBayesClassifier.train(
            labeled_featuresets, estimator=self._estimator)

        return self

    # noinspection PyPep8Naming, PyUnusedLocal
    def transform(self, X):  # pylint: disable=invalid-name,unused-argument
        """Auxiliary function to be used in pipeline; returns self and ignores X."""

        return self

    # noinspection PyPep8Naming
    def evaluate(
            self,
            X: typing.Iterable,  # pylint: disable=invalid-name
            y: typing.Iterable,
            sample,
            n=3):
        """Perform evaluation of the classifier instance.

        :param X: Iterable, test data

            Same shape as for `fit` and `fit_predict` methods

        :param y: Iterable, of labels
        :param sample:

            one of labels to get the prediction for (for example,
            if labels are ['class_A', 'class_B', 'class_C'], the sample
            could be 'class_A'.

        :param n: int, number of candidates to output
        :returns: the result of `precision` for the number of entries whose
            label was found among the top `n` candidates
        :raises ValueError: if `X` and `y` differ in length
        """
        # noinspection PyTypeChecker,PyTypeChecker
        if len(X) != len(y):
            raise ValueError("`X` and `y` must be of the same length.")

        candidate_arr = self.fit_predict(X, n=n, sample=sample)

        correctly_predicted = 0
        for candidates, label in zip(candidate_arr, y):
            pred = self._valid_candidates(candidates, label)
            correctly_predicted += int(pred)

        # return the accuracy score
        # noinspection PyTypeChecker
        return precision(total=len(y), correct=correctly_predicted)

    # noinspection PyPep8Naming
    def fit_predict(self, X: typing.Iterable, y=None, **fit_params):  # pylint: disable=invalid-name,unused-argument
        """Makes prediction about the given data.

        Despite its name this method does not train the classifier; it only
        scores the candidates with the already trained one.

        :param X: Iterable, prediction data

            The prediction data is expected to be of type List[(name_tuple, feature_set [,feature,label)]
            where feature_set corresponds to the output of FeatureExtractor and feature labels (if provided)
            should be None (will be ignored anyway).

        :param y: redundant (included to preserve base class method definition)
        :param fit_params: kwargs, fit parameters

            n: number of candidates to output
            sample: one of labels to get the prediction for (for example,
            if labels are ['class_A', 'class_B', 'class_C'], the sample
            could be 'class_A'.

        :returns: object array holding, for every entry of `X`, its `n` best
            ``(name_tuple, probability)`` pairs sorted by descending probability
        :raises ValueError: if `sample` is not given
        :raises TypeError: if `X` or `y` has no `__len__`
        """
        # get fit parameters
        n = fit_params.get('n', 3)
        sample = fit_params.get('sample', None)

        # do not allow sample to be `None` (wouldn't be possible to sort
        # the candidates in a logical way)
        if sample is None:
            raise ValueError("`fit_parameter` `sample` was not specified."
                             " This is not allowed in `fit_predict` method")

        # `y or []` would raise on array-like `y` (ambiguous truth value),
        # so only a missing `y` is replaced
        labels = [] if y is None else y
        if not all(hasattr(var, '__len__') for var in (X, labels)):
            raise TypeError("`X` and `y` must implement `__len__` method")

        # noinspection PyTypeChecker
        predictions = [None] * len(X)
        for i, x in enumerate(X):
            candidate_pred = [None] * len(x)
            for j, candidate in enumerate(x):
                if len(candidate) == 3:
                    # feature label was provided as part of X set (usual case), ignore it
                    name_tuple, features, _ = candidate
                else:
                    name_tuple, features = candidate
                candidate_pred[j] = (name_tuple,
                                     self.predict(features, sample=sample))

            sorted_pred = sorted(candidate_pred,
                                 key=lambda t: t[1],
                                 reverse=True)
            predictions[i] = sorted_pred[:n]

        # The nested (name_tuple, probability) pairs are ragged; NumPy >= 1.24
        # raises ValueError for such input unless dtype=object is requested.
        return np.array(predictions, dtype=object)

    def predict(self, features: dict, sample=None) -> typing.Any:
        """Make predictions based on given features.

        :param features: dict, features to be used for prediction

            Dictionary of (feature_key, feature_value)

        :param sample:

            one of labels to get the prediction for (for example,
            if labels are ['class_A', 'class_B', 'class_C'], the sample
            could be 'class_A'.

        :returns: Union[float, dict]

            If `sample` is specified, returns P(sample|features),
            ie the probability of `sample` given features,
            where `sample` is one of labels.
            Otherwise returns dict of (label: probability) for all
            known labels.

        :raises ValueError: if the classifier has not been trained yet
        """
        if self._classifier is None:
            raise ValueError("Unable to make predictions. "
                             "Classifier has not been trained yet!")

        prob_dist = self._classifier.prob_classify(features)

        if sample is not None:
            probs = prob_dist.prob(sample)
        else:
            probs = {s: prob_dist.prob(s) for s in self._classifier.labels()}

        return probs

    def show_most_informative_features(self):
        """Print the most informative features; does nothing if untrained."""
        if self._classifier is None:
            return

        self._classifier.show_most_informative_features()

    def export(self, export_dir=None, export_name=None) -> str:
        """Exports timestamped pickled classifier to the given directory.

        :param export_dir: target directory, 'export/' by default; created if missing
        :param export_name: base file name, 'classifier' by default; a trailing
            '.checkpoint' extension is stripped
        :returns: file name (not the full path) of the timestamped .checkpoint file
        """
        export_dir = export_dir or 'export/'
        export_name = export_name or 'classifier'

        if export_name.endswith('.checkpoint'):
            export_name = ".".join(export_name.split('.')[:-1])

        time_stamp = str(datetime.datetime.now().timestamp())

        # create export directory
        os.makedirs(export_dir, exist_ok=True)

        time_stamped_fname = ".".join([export_name, time_stamp, 'checkpoint'])
        time_stamped_fpath = os.path.join(export_dir, time_stamped_fname)

        # pickle and export the classifier
        with open(time_stamped_fpath, 'wb') as exp_file:
            pickle.dump(self, exp_file)

        return time_stamped_fname

    @staticmethod
    def restore(checkpoint) -> "NBClassifier":
        """Restores the classifier from a checkpoint file.

        :param checkpoint: path to directory or specific checkpoint

            If path to directory provided, the checkpoint whose path sorts
            last is restored, which is the newest one as long as the
            checkpoints share the export name.

        :raises ValueError: if the directory contains no checkpoint
        """
        def _restore_checkpoint(fp):
            with open(fp, 'rb') as checkpoint_file:
                # load the exported classifier
                # SECURITY: unpickling executes code stored in the file;
                # only restore checkpoints from trusted sources.
                return pickle.load(checkpoint_file)

        if os.path.isdir(checkpoint):
            checkpoint_dir = checkpoint
            checkpoints = [
                os.path.join(checkpoint_dir, f) for f in os.listdir(checkpoint)
                if f.endswith('.checkpoint')
            ]
            # find the latest
            if not checkpoints:
                raise ValueError(
                    "No checkpoints were found in `{}`.".format(checkpoint))
            latest_checkpoint = max(checkpoints)
            clf = _restore_checkpoint(latest_checkpoint)

        else:
            clf = _restore_checkpoint(checkpoint)

        return clf

    @staticmethod
    def _valid_candidates(candidates: typing.Iterable, label):
        """Check whether the correct label is among candidates.

        Every candidate name is used as a case-insensitive regular expression
        searched for in `label`.
        """
        for candidate, _ in candidates:
            candidate_name, _ = candidate
            try:
                if re.search(candidate_name, label, flags=re.IGNORECASE):
                    return True
            except (re.error, TypeError):
                # NLTK lets names such as '**' through, which are not valid
                # patterns; skip that candidate instead of giving up on the
                # remaining ones. TypeError covers non-string names/labels.
                continue

        return False
Exemplo n.º 48
0
 def __init__(self, feat_sets):
     """Split ``feat_sets`` at index 9500 and set up the three classifiers.

     The first 9500 feature sets become ``self.train_set`` and the rest
     ``self.test_set``. The two scikit-learn wrappers are only constructed
     here; the NLTK Naive Bayes classifier is trained immediately on the
     training part.
     """
     self.train_set, self.test_set = feat_sets[:9500], feat_sets[9500:]
     self.Multinomial_classifier = SklearnClassifier(MultinomialNB())
     self.bernoulli_classifier = SklearnClassifier(BernoulliNB())
     self.naivebayes_classifier = NaiveBayesClassifier.train(self.train_set)
def sentim(self, data):
    """Classify ``data`` as "Positive" or "Negative" with a freshly trained Naive Bayes model.

    Every call reads the pet descriptions from the two CSV files below
    ("adoptable" rows are labelled "Negative", "adopted" rows "Positive"),
    trains a classifier on the first 70% of the shuffled examples, prints its
    accuracy on the remaining 30%, pickles it to ``naivebayes_pet.pickle``
    and finally classifies ``data``.

    :param data: text to classify; it is converted with ``str()`` first.
    :returns: a one-element list holding the predicted label.
    """
    stop_words = ['the', 'an', 'the', 'i', 'a', 'and', 'to']

    def load_cleaned_tokens(path_csv):
        # Return one noise-free token list per "description" row of path_csv.
        descriptions = read_df_csv(path_csv)["description"]
        remove_digits = str.maketrans('', '', digits)
        cleaned = []
        for record in descriptions:
            # str() guards against non-string cells such as missing values.
            text = str(record).lower()
            text = text.replace(", '...'", "").replace("...", '')
            # Digits are removed here; the former .replace('\d+', '') was a
            # literal (non-regex) replacement and never matched anything.
            text = text.translate(remove_digits)
            # Removing digits can create new "..." runs, so strip them again.
            text = text.replace(", '...'", "").replace("...", '')
            cleaned.append(remove_noise(word_tokenize(text), stop_words))
        return cleaned

    negative_cleaned_tokens_list = load_cleaned_tokens(
        '../data/csv/tf_idf_adoptable_csv.csv')
    freq_dist_neg = FreqDist(get_all_words(negative_cleaned_tokens_list))
    print("most common ADOPTABLE words: ", freq_dist_neg.most_common(10))

    positive_cleaned_tokens_list = load_cleaned_tokens(
        '../data/csv/tf_idf_adopted_csv.csv')
    freq_dist_pos = FreqDist(get_all_words(positive_cleaned_tokens_list))
    print("most common ADOPTED words: ", freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(description_dict, "Positive")
                        for description_dict in positive_tokens_for_model]
    negative_dataset = [(description_dict, "Negative")
                        for description_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    seventy_percent_of_data = int(len(dataset) * .7)

    random.shuffle(dataset)  # to avoid bias

    # Split at a single index so the test examples never overlap the training
    # examples (the test slice used to start at the 30% mark).
    train_data = dataset[:seventy_percent_of_data]
    test_data = dataset[seventy_percent_of_data:]

    classifier = NaiveBayesClassifier.train(train_data)
    # The context manager closes the file even if pickling fails.
    with open("naivebayes_pet.pickle", "wb") as save_classifier:
        pickle.dump(classifier, save_classifier)

    print("%%%%%%%%%%%%%%%%%%%Accuracy is:", classify.accuracy(classifier, test_data))

    # show_most_informative_features() prints its own table and returns None,
    # so it is called directly instead of being wrapped in print().
    classifier.show_most_informative_features(10)

    # Strip punctuation from the input in one pass and split it into words.
    punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    words = str(data).translate(str.maketrans('', '', punc)).split()

    # breakdown parts of speech
    parts_of_speech = [nltk.pos_tag(words)]
    print("parts of speech tagging: ", parts_of_speech)

    # Tokenise the cleaned text itself; tokenising str(list) fed the list's
    # brackets and quotes to the tokenizer.
    custom_tokens = remove_noise(word_tokenize(" ".join(words)))
    label = classifier.classify({token: True for token in custom_tokens})

    print(str(words), label)

    sentiment_result = [label]

    print("sentiment_result: ", type(sentiment_result), sentiment_result)

    return sentiment_result
Exemplo n.º 50
0
#print(positive)

# Turn the cleaned token lists into feature dicts, one per tweet.
sad_token = get_tweets_for_model(negative)
joy_token = get_tweets_for_model(positive)

negative_dataset = [(tweet_dict, "negative") for tweet_dict in sad_token]

positive_dataset = [(tweet_dict, "positive") for tweet_dict in joy_token]

dataset = positive_dataset + negative_dataset

# Shuffle, then use the first 900 examples for training and the rest for testing.
random.shuffle(dataset)
train_data = dataset[:900]
test_data = dataset[900:]

classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

# show_most_informative_features() prints its own table and returns None, so
# it is called directly; wrapping it in print() only added a stray "None".
classifier.show_most_informative_features(10)

# Connect to MariaDB Platform; the script exits with status 1 if the
# connection cannot be established.
try:
    conn = mariadb.connect(
        user="******",  # enter your username
        # password="******" - enter your password (no password is passed while this stays commented out)
        database="tcsproject"  # enter your database name
    )
except mariadb.Error as e:
    print(f"Error connecting to MariaDB Platform: {e}")
    sys.exit(1)
Exemplo n.º 51
0
            # Add this sample's per-word values to the per-label totals.
            for word in features:

                # NOTE(review): membership is tested with `word` but the entry
                # is stored under `word.lower()`, so a mixed-case word resets
                # its lower-case entry every time - confirm.
                # NOTE(review): `label_count` is assigned without copying; if
                # it is one shared object, every word updates the same
                # counts - verify where it is created.
                if word not in labelled_features:
                    labelled_features[word.lower()] = label_count

                labelled_features[word.lower()][label] += features[word]

            # Python 2 print statement: progress report.
            print "Currently at %d distinct tokens and %d papers" % (
                len(labelled_features), samplecount)

    # Build the classifier directly from the hand-computed distributions.
    label_probdist = get_label_probdist(labelled_features)

    feature_probdist = get_feature_probdist(labelled_features)

    classifier = NaiveBayesClassifier(label_probdist, feature_probdist)

    for samplefile in test_samples:
        # Presence features: every token of the paper maps to True.
        features = {}

        p = PaperParser()
        p.parsePaper(samplefile)

        for sentence in p.extractRawSentences():
            tokens = nltk.word_tokenize(sentence)

            for word in tokens:
                features[word] = True

        # The label is looked up by the name of the directory holding the sample file.
        dirname = os.path.basename(os.path.dirname(samplefile))
        label = labels[dirname]
Exemplo n.º 52
0
def train(all_features, ratio):
    """Split ``all_features`` by position and fit a Naive Bayes classifier.

    Args:
        all_features: Sliceable sequence of training examples, passed
            unchanged to ``NaiveBayesClassifier.train``.
        ratio: Fraction of ``all_features`` (taken from the front) used
            for training; the remainder becomes the test set.

    Returns:
        A ``(train_set, test_set, clf)`` tuple.
    """
    cut = int(len(all_features) * ratio)
    train_set = all_features[:cut]
    test_set = all_features[cut:]
    clf = NaiveBayesClassifier.train(train_set)
    return train_set, test_set, clf
Exemplo n.º 53
0
# Sanity check: show one labelled example from each class.
print("Dictionary with Positive class : ", positiveReviewDataset[7])
print("Dictionary with Negative class : ", negativeReviewDataset[7])
#print("tagged neg :",negative_dataset[0])

# Merge both classes into one labelled dataset.
dataset = positiveReviewDataset + negativeReviewDataset

print("Dataset[0] :", dataset[0])
print("Dataset length", len(dataset))

# Shuffle before the positional split so both halves contain both classes.
random.shuffle(dataset)

trainData, testData = dataset[:7000], dataset[7000:]

trainedModel = NaiveBayesClassifier.train(trainData)

print("Accuracy of the model : ", classify.accuracy(trainedModel, testData))

# Classify a single hand-written review as a smoke test.
review = "This is a bad product."
reviewTokens = noiseRemoval(word_tokenize(review))
reviewFeatures = {token: True for token in reviewTokens}

# Test print
print(review, " : ", trainedModel.classify(reviewFeatures))

#Text = "j@nittha"
#Text = re.sub("@", "a", Text)
#print(Text)

Exemplo n.º 54
0
 def train_topic_classifier(self, train_set):
     """Fit a Naive Bayes classifier on ``train_set`` and return it.

     ``train_set`` is handed unchanged to ``NaiveBayesClassifier.train``;
     no state is stored on ``self``.
     """
     return NaiveBayesClassifier.train(train_set)
    print("Also see: Hindu Marriage Act")
elif resultc != -1 or y == "Christian":
    f1 = open("Christian.txt")
    f2 = open("christian01.txt")
    l1 = f1.read()
    arr = sent_tokenize(l1)
    l2 = f2.read()
    arr2 = word_tokenize(l2)
    for i in range(0, len(arr)):
        li1.append(tuple((arr[i], arr2[i])))
    f1.close()
    f2.close()
    print("Also see: Indian Divorce Act")
# The case text to evaluate is taken from the third command-line argument.
mycase = sys.argv[3]
#mycase=input("enter your case ")
c1 = 0  # sentences classified with label "0"
c2 = 0  # sentences classified with any other label
model = NaiveBayesClassifier(li1)
#model=nltk.NaiveBayesClassifier.train(li1)
#print(model.classify(mycase))
case = sent_tokenize(mycase)
print(mycase)
# Classify each sentence on its own and tally the two outcomes.
for sentence in case:
    if model.classify(sentence) == "0":
        c1 += 1
    else:
        c2 += 1

total = c1 + c2
if total:
    print("Probability of winning case", (c1 / total) * 100)
else:
    # An empty case text yields no sentences; the ratio would divide by zero.
    print("No sentences found in the case text; cannot estimate a probability")
def sentim_twitter(self, data):
    '''Train a Naive Bayes model on NLTK's twitter samples and classify ``data``.

    Heavily borrowed from https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk
    to show a functioning model.

    The model is retrained from scratch on every call, and diagnostic output
    (top positive words, accuracy, most informative features) is printed.

    Args:
        data: The text to classify; it is converted with ``str()`` first.

    Returns:
        The label the classifier assigns to ``data``; the training labels
        are "Positive" and "Negative".
    '''
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    # Clean every tokenized tweet before feature extraction.
    positive_cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in positive_tweet_tokens
    ]
    negative_cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in negative_tweet_tokens
    ]

    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    # Shuffle so the positional split below mixes both classes.
    random.shuffle(dataset)

    # NOTE(review): only the first 700 shuffled examples are used for
    # training; everything after that is the test set -- confirm intended.
    train_data = dataset[:700]
    test_data = dataset[700:]

    classifier = NaiveBayesClassifier.train(train_data)
    print("twitter data **********************************")

    print("%%%%%%%%%%%%%%%%%%% Twitter Accuracy is:", classify.accuracy(classifier, test_data))
    print("twitter data **********************************")

    # show_most_informative_features() prints its own table and returns
    # None, so it is called directly instead of being passed to print().
    classifier.show_most_informative_features(10)

    print("twitter data **********************************")
    print("twitter data **********************************")
    print("is this reading data correctly???: ", type(str(data)))
    custom_tweet = str(data)

    # NOTE(review): unlike the training tweets, remove_noise is called here
    # without stop_words -- presumably relying on its default; confirm.
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    print("twitter data **********************************")
    # Classify once and reuse the label for both the log line and the return.
    twitter = classifier.classify({token: True for token in custom_tokens})
    print(custom_tweet, twitter)
    return twitter
Exemplo n.º 57
0
 def train_model(self, data):
     """Fit a Naive Bayes classifier on ``data`` and store it as ``self.model``."""
     fitted = NaiveBayesClassifier.train(data)
     self.model = fitted
Exemplo n.º 58
0
    def train(self, corpus, selected_feats):
        """Build the n-gram vocabularies and fit the classifier on ``corpus``.

        The corpus is parsed with ``self.parse_corpus`` into
        ``(chat, win, duration, extra)`` records.  For n = 1..5 the method
        stores a ``Counter`` of all n-grams (``self.unigrams`` ...
        ``self.fivegrams``) and the list of n-grams seen more than once
        (``self.common_unigrams`` ... ``self.common_fivegrams``).  It then
        trains ``self.classifier`` on ``(features, win)`` pairs and prints
        the 20 most informative features.

        Args:
            corpus: Raw corpus, passed to ``self.parse_corpus``.
            selected_feats: Feature selection forwarded to
                ``self.get_features``.
        """
        train_set = self.parse_corpus(corpus)
        print('Train set:', len(train_set))

        def count_ngrams(size):
            # Every window of `size` consecutive chat items, space-joined.
            return Counter(
                ' '.join(chat[start:start + size])
                for chat, _win, _duration, _extra in train_set
                for start in range(len(chat) - size + 1)
            )

        def repeated(counts):
            # Keep only the entries that occur more than once.
            return [gram for gram, seen in counts.items() if seen > 1]

        # # unigram
        self.unigrams = Counter(
            word
            for chat, _win, _duration, _extra in train_set
            for word in chat
        )
        self.common_unigrams = repeated(self.unigrams)
        # print(len(self.unigrams), len(self.common_unigrams))

        # # bigram
        self.bigrams = count_ngrams(2)
        self.common_bigrams = repeated(self.bigrams)
        # print(len(self.bigrams), len(self.common_bigrams))

        # # trigram
        self.trigrams = count_ngrams(3)
        self.common_trigrams = repeated(self.trigrams)
        # print(len(self.trigrams), len(self.common_trigrams))

        # # fourgram
        self.fourgrams = count_ngrams(4)
        self.common_fourgrams = repeated(self.fourgrams)
        # print(len(self.fourgrams), len(self.common_fourgrams))

        # # fivegram
        self.fivegrams = count_ngrams(5)
        self.common_fivegrams = repeated(self.fivegrams)
        # print(len(self.fivegrams), len(self.common_fivegrams))

        ###### WP30 PLOT #######
        # wp30s = [len(chat) // (duration / 1800) for chat,win,duration,extra in train_set]
        # n, bins, patches = plt.hist(wp30s, 100,alpha=0.75)
        # plt.show()
        # self.doclen = Counter([len(chat) for chat,win,duration in train_set])

        ###### CHATTER PLOT ######
        # data = []
        # for chat, win, duration,extra in w8m8.iterate(train_set, out='Training'):
        #     nchars = [0,0,0,0,0]
        #     for player, message in extra:
        #         nchars[player] += len(message)
        #     avg = sum(nchars) / 5
        #     data.append(max(nchars) / avg)
        # n, bins, patches = plt.hist(data, 1000,alpha=0.75)
        # plt.show()

        # Feature extraction runs after the vocabularies above are stored.
        # NOTE(review): w8m8.iterate presumably wraps the records with
        # progress reporting -- confirm.
        labelled = [
            (self.get_features(chat, duration, extra, selected_feats), win)
            for chat, win, duration, extra in w8m8.iterate(train_set,
                                                           out='Training')
        ]
        self.classifier = NaiveBayesClassifier.train(labelled)
        self.classifier.show_most_informative_features(20)
Exemplo n.º 59
0
def main():
    """Train and return a Naive Bayes sentiment classifier for tweets.

    Training data is NLTK's positive/negative twitter samples plus any
    custom tweets configured below.  Tweets are tokenized, cleaned with
    ``helpers.remove_noise``, converted to feature dicts, labelled
    "Positive"/"Negative", shuffled and split 70:30 into train and test
    sets.  The top words, model accuracy and most informative features are
    printed along the way.

    Returns:
        The trained classifier.
    """
    print('Building model...')
    print('Gathering training data...')

    # set nltk twitter samples as list of strings
    pos_sample_tweets = twitter_samples.strings('positive_tweets.json')
    neg_sample_tweets = twitter_samples.strings('negative_tweets.json')

    #### UPDATE HERE: Option to add your own tweet samples
    #### Remove the empty list, uncomment and update filepaths below
    pos_custom_tweets = []  ## helpers.import_csv('positive_tweets.csv')
    neg_custom_tweets = []  ## helpers.import_csv('negative_tweets.csv')

    # combine nltk twitter samples and custom tweets
    positive_tweets = pos_sample_tweets + pos_custom_tweets
    negative_tweets = neg_sample_tweets + neg_custom_tweets

    # tokenize tweets
    positive_tweet_tokens = [casual_tokenize(i) for i in positive_tweets]
    negative_tweet_tokens = [casual_tokenize(i) for i in negative_tweets]

    stop_words = stopwords.words('english')

    # get cleaned positive and negative tokens
    positive_cleaned_tokens_list = [
        helpers.remove_noise(tokens, stop_words)
        for tokens in positive_tweet_tokens
    ]
    negative_cleaned_tokens_list = [
        helpers.remove_noise(tokens, stop_words)
        for tokens in negative_tweet_tokens
    ]

    # convert tokens into iterable word lists
    all_pos_words = helpers.get_all_words(positive_cleaned_tokens_list)
    all_neg_words = helpers.get_all_words(negative_cleaned_tokens_list)

    # get frequency distribution of word lists
    freq_dist_pos = FreqDist(all_pos_words)
    freq_dist_neg = FreqDist(all_neg_words)

    # print top 10 positive and negative words
    print('Top 10 positive and negative words:')
    print(freq_dist_pos.most_common(10))
    print(freq_dist_neg.most_common(10))

    # convert tokens to a dictionary for modelling
    positive_tokens_for_model = helpers.get_tweets_for_model(
        positive_cleaned_tokens_list)
    negative_tokens_for_model = helpers.get_tweets_for_model(
        negative_cleaned_tokens_list)

    # assign a label to positive tokens
    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    # assign a label to negative tokens
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    # set dataset and randomize to train model
    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    # split the data into a 70:30 ratio; computing the cut from the dataset
    # size keeps the ratio when custom tweets are added (10K tweets -> 7000)
    split_at = len(dataset) * 7 // 10
    train_data = dataset[:split_at]
    test_data = dataset[split_at:]

    # train a Naive Bayes model
    classifier = NaiveBayesClassifier.train(train_data)

    # print model accuracy
    print("Model accuracy is:", classify.accuracy(classifier, test_data))
    # show_most_informative_features() prints its own table and returns
    # None, so it is called directly instead of being passed to print().
    classifier.show_most_informative_features(10)
    print('Model complete!\n')

    return classifier
Exemplo n.º 60
-1
def category_by_pos():
    """Train a suffix-based Naive Bayes tagger on Brown 'news' and print its accuracy.

    Features are booleans saying whether a word ends with each of 100
    suffixes taken from a frequency distribution over the whole Brown
    corpus.  The first 10% of the feature sets is held out for testing.

    NOTE(review): this is Python 2 / old-NLTK code (print statement,
    ``FreqDist.inc``, slicing ``keys()``); it will not run unchanged on
    Python 3 with NLTK 3.
    """
    from nltk.corpus import brown
    from nltk import FreqDist
    # DecisionTreeClassifier is only referenced by the commented-out lines below.
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    # Count the last 1, 2 and 3 characters of every lowercased Brown word.
    # For words shorter than 3 characters the slices coincide, so the same
    # string is counted more than once.
    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])

    # NOTE(review): presumably relies on keys() returning a list ordered by
    # decreasing frequency, so this takes the 100 most common suffixes -- confirm.
    common_suffixes = suffix_fdist.keys()[:100]
#    print common_suffixes

    def pos_features(word):
        # One boolean feature per common suffix: does the lowercased word end with it?
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    # Each tagged word becomes a (feature dict, tag) training example.
    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    # The first 10% is the test set; the remaining 90% is used for training.
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
#    classifier = DecisionTreeClassifier.train(train_set)
#    print 'Decision Tree %f' % classify.accuracy(classifier, test_set)

    classifier = NaiveBayesClassifier.train(train_set)
    print 'NaiveBay %f' % classify.accuracy(classifier, test_set)