def train(samples_proportion=0.7):
    """Prepare the spam/ham feature sets, split them, and build the classifier.

    The corpus comes from ``read_spam_ham``; it is merged, shuffled, and turned
    into ``(features, label)`` pairs via ``Preprocessor.get_features``.  The
    leading ``samples_proportion`` share of those pairs becomes the training
    split and the remainder the test split.  The globals ``words_in_ham``,
    ``words_in_spam``, ``raw_ham_prob`` and ``raw_spam_prob`` are rebound here;
    ``ham_word_count`` and ``spam_word_count`` are only read.

    Args:
        samples_proportion: Fraction of the feature sets used for training.

    Returns:
        None.
    """
    global words_in_ham, ham_word_count, words_in_spam, spam_word_count
    global raw_ham_prob, raw_spam_prob

    ham_emails, spam_emails = read_spam_ham()
    print("Spam size: {} Ham size: {}".format(len(spam_emails), len(ham_emails)))

    corpus = append_ham_and_spam(ham_emails, spam_emails)
    random.shuffle(corpus)
    print('Corpus size = {} emails'.format(len(corpus)))

    feature_sets = []
    for email, label in corpus:
        feature_sets.append((Preprocessor.get_features(email, ' '), label))
    print('Collected {} feature sets'.format(len(feature_sets)))

    # Disabled experiment (support-based frequent itemsets), kept for reference:
    #   # define Support value in %
    #   support = 10
    #   spam_support_count = (spam_size * 10) / 100;
    #   ham_support_count = (ham_size * 10) / 100;
    #   print('Spam support count:' + str(spam_support_count))
    #   print('Ham support count:' + str(ham_support_count))
    #   # get the spam frequent itemset and ham frequent itemset
    #   # spam_frequent, ham_frequent = get_frequent(all_features, spam_support_count, ham_support_count)
    #   # train the our own naivebayes classifier and collect dictionary of raw probabilities of words

    # Leading slice trains, trailing slice tests.
    split_at = int(len(feature_sets) * samples_proportion)
    train_set = feature_sets[:split_at]
    test_set = feature_sets[split_at:]

    # Class priors are the share of each class among the training mails.
    ham_mail_count, spam_mail_count = mails_in_ham_spam(train_set)
    train_total = len(train_set)
    spam_prior = 1.0 * spam_mail_count / train_total
    ham_prior = 1.0 * ham_mail_count / train_total

    words_in_ham, words_in_spam = frequency_in_ham_spam(train_set)
    spam_vocab = len(spam_word_count)
    ham_vocab = len(ham_word_count)

    packed = get_probabilities_in_each_class(
        ham_prior, words_in_ham, ham_vocab, ham_word_count,
        raw_ham_prob, raw_spam_prob,
        spam_prior, words_in_spam, spam_vocab, spam_word_count,
        test_set, train_set)
    (ham_prior, words_in_ham, ham_vocab, raw_ham_prob, raw_spam_prob,
     spam_prior, words_in_spam, spam_vocab, test_set, train_set) = get_parameters(packed)

    #print("Train Size:" + str(len(train_set)) + str(' Test size:') + str(len(test_set)))
    #evaluate(train_set, test_set, raw_spam_prob, raw_ham_prob, words_in_spam, words_in_ham, spam_vocab, ham_vocab,
    #         spam_prior,
    #         ham_prior)

    classifier = NaiveBayesClassifier(list(spam_word_count), list(ham_word_count))
    # NOTE(review): the classifier is passed to its own bound method as the
    # first argument - confirm this matches prob_classify's signature.
    best = classifier.prob_classify(classifier, train_set).max()
def train(labeled_featuresets, estimator=ELEProbDist):
    """Build a ``NaiveBayesClassifier`` from an estimated label distribution.

    Args:
        labeled_featuresets: Labeled training data.  Not read by the code
            visible in this block.
        estimator: Callable applied to ``label_freqdist`` to obtain the
            P(label) distribution.  Defaults to ``ELEProbDist``.

    Returns:
        A ``NaiveBayesClassifier`` constructed from the estimated label
        distribution and an (empty) feature distribution mapping.
    """
    # NOTE(review): label_freqdist is not defined inside this block; it is
    # presumably computed from labeled_featuresets elsewhere - confirm.

    # Create the P(label) distribution.  The parameter name was previously
    # misspelled here, which raised NameError on every call.
    label_probdist = estimator(label_freqdist)

    # Create the P(fval | label, fname) distribution
    feature_probdist = {}

    return NaiveBayesClassifier(label_probdist, feature_probdist)
def __init__(self, label_probdist=None, feature_probdist=None, estimator=ELEProbDist):
    """Set up the wrapper, optionally rebuilding a stored classifier.

    Args:
        label_probdist: Label distribution used to rebuild the classifier.
        feature_probdist: Feature distribution used to rebuild the classifier.
        estimator: Stored on the instance as ``_estimator``.

    ``_classifier`` is built only when both distributions are truthy;
    otherwise it is left as ``None``.
    """
    self._estimator = estimator

    # Nothing to restore unless both distributions were supplied.
    if not (label_probdist and feature_probdist):
        self._classifier = None
        return

    self._classifier = NaiveBayesClassifier(
        label_probdist=label_probdist,
        feature_probdist=feature_probdist,
    )
def train(labeled_featuresets, estimator=ELEProbDist):
    """Return a ``NaiveBayesClassifier`` built from the estimated label distribution.

    Args:
        labeled_featuresets: Labeled training data (not read in this block).
        estimator: Callable applied to ``label_freqdist``.
    """
    # P(fval|label, fname): starts out as an empty mapping.
    fval_dist = {}
    # P(label): produced by running the estimator over label_freqdist.
    label_dist = estimator(label_freqdist)
    return NaiveBayesClassifier(label_dist, fval_dist)
from nltk import NaiveBayesClassifier

# NOTE(review): nltk's NaiveBayesClassifier constructor is documented as
# taking label_probdist and feature_probdist; a no-argument call looks like it
# would raise TypeError - confirm, and consider NaiveBayesClassifier.train(...).
classifier = NaiveBayesClassifier()
for fileid in movie_reviews.fileids(category)] random.shuffle(documents) allWords = [] for w in movie_reviews.words(): allWords.append(w.lower()) allWords = nltk.FreqDist(allWords) wordFeatures = list(allWords.keys())[:3000] def findFeatures(document): words = set(document) features = {} for w in wordFeatures: features[w] = (w in words) return features #print((findFeatures(movie_reviews.words('neg/cv000_29416.txt')))) featureSets = [(findFeatures(rev), category) for (rev, category) in documents ] new_training_set = featureSets[:100] testing_set = featureSets[100:] cl = NaiveBayesClassifier(new_training_set) print(cl.accuracy(testing_set))
print("Also see: Hindu Marriage Act") elif resultc != -1 or y == "Christian": f1 = open("Christian.txt") f2 = open("christian01.txt") l1 = f1.read() arr = sent_tokenize(l1) l2 = f2.read() arr2 = word_tokenize(l2) for i in range(0, len(arr)): li1.append(tuple((arr[i], arr2[i]))) f1.close() f2.close() print("Also see: Indian Divorce Act") mycase = sys.argv[3] #mycase=input("enter your case ") c1 = 0 c2 = 0 model = NaiveBayesClassifier(li1) #model=nltk.NaiveBayesClassifier.train(li1) #print(model.classify(mycase)) case = sent_tokenize(mycase) print(mycase) for i in range(0, len(case)): temp = model.classify(case[i]) if temp == "0": c1 = c1 + 1 else: c2 = c2 + 1 print("Probability of winning case", (c1 / (c1 + c2)) * 100)
def train(labeled_featuresets, estimator=ELEProbDist):
    """Construct a ``NaiveBayesClassifier`` from ``estimator(label_freqdist)``.

    Args:
        labeled_featuresets: Labeled training data (not read in this block).
        estimator: Callable applied to ``label_freqdist``.

    Returns:
        The classifier built from the estimator's result and an empty dict.
    """
    label_dist = estimator(label_freqdist)
    feature_dist = dict()
    return NaiveBayesClassifier(label_dist, feature_dist)
for word in features: if word not in labelled_features: labelled_features[word.lower()] = label_count labelled_features[word.lower()][label] += features[word] print "Currently at %d distinct tokens and %d papers" % ( len(labelled_features), samplecount) label_probdist = get_label_probdist(labelled_features) feature_probdist = get_feature_probdist(labelled_features) classifier = NaiveBayesClassifier(label_probdist, feature_probdist) for samplefile in test_samples: features = {} p = PaperParser() p.parsePaper(samplefile) for sentence in p.extractRawSentences(): tokens = nltk.word_tokenize(sentence) for word in tokens: features[word] = True dirname = os.path.basename(os.path.dirname(samplefile)) label = labels[dirname]
def updateNaiveBayes():
    """Build a classifier from ``new_training_set`` and print its accuracy on ``testing_set``."""
    model = NaiveBayesClassifier(new_training_set)
    score = model.accuracy(testing_set)
    print(score)
def train():
    """Build a classifier from ``training_data`` and pickle it to disk.

    The classifier is written to ``algorithm.pickle`` in the current working
    directory, replacing any existing file of that name.

    Returns:
        None.
    """
    classifier = NaiveBayesClassifier(training_data)
    # The context manager closes the file even if pickling raises, which the
    # previous explicit open()/close() pair did not guarantee.
    with open('algorithm.pickle', 'wb') as model_file:
        pickle.dump(classifier, model_file)
import json
import re

from nltk import NaiveBayesClassifier

# Matches @mentions, any character that is not alphanumeric/space/tab, and
# URL-like "scheme://..." tokens.  Written as a raw string so the regex escapes
# (\w, \/, \S) are not treated as invalid string escapes.
_TWEET_NOISE_RE = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")


def clean_tweet(tweet):
    """Return ``tweet`` with noise replaced by spaces and whitespace collapsed.

    Args:
        tweet: Raw tweet text.

    Returns:
        The text with mentions, non-alphanumeric characters and URLs removed,
        and words separated by single spaces.
    """
    return ' '.join(_TWEET_NOISE_RE.sub(" ", tweet).split())


mydata = []

# Use a context manager so the JSON file handle is closed once it is parsed.
with open('convertcsv.json', 'r') as json_data:
    data = json.load(json_data)

for d in data:
    # Records whose hate_speech value equals 0 are labelled "pos"; all others "neg".
    label = "pos" if d.get('hate_speech') == 0 else "neg"
    mydata.append({"text": clean_tweet(d.get('tweet')), "label": label})

# NOTE(review): passing training data with format="json" and classifying raw
# text matches textblob.classifiers.NaiveBayesClassifier rather than the nltk
# class imported above - confirm which library is intended.
cl = NaiveBayesClassifier(mydata, format="json")
cl.classify("This is an amazing library!")