def train_nlp(self, features_train, labels_train):
    """
    Fit two Naive Bayes classifiers: one on titles, one on author names.

    The fitted models are stored on the instance as ``self.nlp_title`` and
    ``self.nlp_author``; nothing is returned.

    :param features_train: Feature records. Element 0 of each record is
        passed to ``title_prep`` and element 1 to ``author_prep``.
    :param labels_train: Labels, paired positionally with ``features_train``.
    """
    # Materialise the pairing once so both passes below see the same records.
    labelled_records = list(zip(features_train, labels_train))

    title_data = []
    for features, label in labelled_records:
        title_data.append((title_prep(features[0]), label))

    author_data = []
    for features, label in labelled_records:
        author_data.append((author_prep(features[1]), label))

    self.nlp_title = NaiveBayesClassifier.train(title_data)
    self.nlp_author = NaiveBayesClassifier.train(author_data)
def train(self, clf_type):
    """
    Train one classifier of the requested kind and register it.

    Words and labels are loaded from ``self.train_path``, one feature set is
    built per word index via ``self.features``, and the fitted model is
    stored in ``self.dict_classifiers[clf_type]``.

    :param clf_type: ``'SVM'``, ``'MLP'`` or ``'Naive Bayes'``. Any other
        value trains a ``MaxentClassifier`` with ``self.max_iter`` iterations.
    """
    print('Training classifier...')
    words, labels = self.load_data(self.train_path)

    # Scratch state that is cleared again at the end of this method;
    # presumably consulted by self.features() -- verify against that method.
    self.pos = [tagged[1] for tagged in nltk.pos_tag(words)]
    self.previous_labels = ["O"] + labels

    features = [self.features(words, position) for position in range(len(words))]
    train_samples = list(zip(features, labels))

    def fit_svm():
        return SklearnClassifier(LinearSVC()).train(train_samples)

    def fit_mlp():
        return SklearnClassifier(MLPClassifier()).train(train_samples)

    def fit_naive_bayes():
        return NaiveBayesClassifier.train(train_samples)

    def fit_maxent():
        return MaxentClassifier.train(train_samples, max_iter=self.max_iter)

    # Builders are looked up lazily so only the selected model is constructed.
    builders = {'SVM': fit_svm, 'MLP': fit_mlp, 'Naive Bayes': fit_naive_bayes}
    classifier = builders.get(clf_type, fit_maxent)()

    self.dict_classifiers[clf_type] = classifier
    self.pos = self.previous_labels = None
def analyze_data(pos_train, neg_train, pos_test, neg_test):
    """
    Train a Naive Bayes sentiment classifier and print its test accuracy.

    Training tweets are appended to the module-level ``tweets`` list (it is
    not cleared first), the classifier is trained on the features extracted
    from that list, and the share of correctly classified test tweets is
    printed.

    :param pos_train: Source passed to ``read_tweets`` for positive training data.
    :param neg_train: Source passed to ``read_tweets`` for negative training data.
    :param pos_test: Source passed to ``read_tweets`` for positive test data.
    :param neg_test: Source passed to ``read_tweets`` for negative test data.
    """
    global tweets

    positives = read_tweets(pos_train, 'positive')
    negatives = read_tweets(neg_train, 'negative')

    # Words shorter than three characters are dropped from the training data.
    for text, sentiment in positives + negatives:
        kept = [token.lower() for token in text.split() if len(token) >= 3]
        tweets.append((kept, sentiment))

    training_set = nltk.classify.util.apply_features(extract_features, tweets)
    classifier = NaiveBayesClassifier.train(training_set)

    test_tweets = read_tweets(pos_test, 'positive')
    test_tweets.extend(read_tweets(neg_test, 'negative'))

    total = float(len(test_tweets))
    misses = sum(
        1 for sample in test_tweets
        if classify_tweet(sample[0], classifier) != sample[1]
    )
    accuracy = total - misses
    tot_accuracy = accuracy / total * 100

    banner = "######################################"
    print("\n\nResults:")
    print(banner)
    print(" Total accuracy: %.3f%% (%d/%d)! " % (tot_accuracy, accuracy, total))
    print(banner)
def train(self):
    """
    Train ``self.classifier`` on all stored features and print test metrics.

    The training set is every entry of ``self.positiveFeatures`` followed by
    every entry of ``self.negativeFeatures``; the evaluation set comes from
    ``self.test()``. Accuracy plus precision, recall and F-measure for the
    ``'pos'`` and ``'neg'`` labels are printed.

    Output is produced with single-argument ``print(...)`` calls so the text
    is the same under Python 2 and Python 3.
    """
    print('Classifier Training in progress....')
    poscutoff = len(self.positiveFeatures)
    negcutoff = len(self.negativeFeatures)
    print("Train Pos Cutoff: " + str(poscutoff) + " Train Neg Cutoff: " + str(negcutoff))

    trainfeats = self.positiveFeatures[:poscutoff] + self.negativeFeatures[:negcutoff]
    testfeats = self.test()
    print('Train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

    self.classifier = NaiveBayesClassifier.train(trainfeats)
    print('accuracy: %s' % (accuracy(self.classifier, testfeats),))

    # refsets: indices of test items per true label;
    # testsets: indices of test items per predicted label.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = self.classifier.classify(feats)
        testsets[observed].add(i)

    metrics = (
        ('precision', nltk.metrics.precision),
        ('recall', nltk.metrics.recall),
        ('F-measure', nltk.metrics.f_measure),
    )
    for label in ('pos', 'neg'):
        for title, metric in metrics:
            score = metric(refsets[label], testsets[label])
            print('%s %s: %s' % (label, title, score))
def __init__(self, classList, featureMatrix):
    """
    Train a Naive Bayes model and show its most informative features.

    :param classList: Labels, paired positionally with ``featureMatrix``.
    :param featureMatrix: Feature sets, one per label.
    """
    super(NaiveBayes, self).__init__()
    print("\n-------------------------\nNaive Bayes:\n-------------------------\n")
    self.classes = classList
    self.featureMatrix = featureMatrix
    # zip() is a one-shot iterator on Python 3; hand the trainer a real list
    # (on Python 2 zip() already returned a list, so this changes nothing there).
    self.nb = NB.train(list(zip(featureMatrix, classList)))
    self.showMostInformativeFeatures()
def build_classifier(self):
    """
    Train a classifier on the informal set followed by the formal set.

    ``self.labeled_features`` is set to the list returned by
    ``self.build_informal_set()`` and then extended in place with the result
    of ``self.build_formal_set()``.

    :return: The classifier returned by ``learner.train``.
    """
    combined = self.build_informal_set()
    # Publish the list before building the formal set, as the original did.
    self.labeled_features = combined
    combined.extend(self.build_formal_set())
    return learner.train(combined)
def train(self):
    """
    Train ``self._classifier`` on bag-of-words features over ``_LEXICON``.

    For every ``(text, label)`` entry of ``__TRAIN_SET__`` a feature dict is
    built that maps each lexicon word to whether it occurs in the tokenized
    text. Despite its name, ``self._test_set`` holds these labelled training
    feature sets.
    """
    labelled = []
    for sample in __TRAIN_SET__:
        # Tokenize once per sample instead of once per lexicon word.
        # Assumes applyTokenizer returns a reusable container (list/set/str),
        # not a one-shot iterator -- TODO confirm.
        tokens = pt.applyTokenizer(sample[0])
        featureset = {word: (word in tokens) for word in _LEXICON}
        labelled.append((featureset, sample[1]))

    self._test_set = labelled
    self._classifier = NaiveBayesClassifier.train(self._test_set)
def evaluate_features(feature_select, best_words):
    """
    Train and evaluate a Naive Bayes classifier for one feature selector.

    Tweets are read via ``read_in_tweets(twitter_data)``, shuffled and capped
    at 100000. Entries whose first element is ``'4'`` are treated as positive
    and ``'0'`` as negative; anything else is ignored. Three quarters of each
    class is used for training and the rest for testing. Accuracy and
    per-class precision/recall are printed, followed by the ten most
    informative features.

    Output is produced with single-argument ``print(...)`` calls so the text
    is the same under Python 2 and Python 3.

    :param feature_select: Callable ``(words, best_words) -> feature dict``.
    :param best_words: Passed through unchanged to ``feature_select``.
    """
    token_pattern = r"[\w']+|[.,!?;]"

    def labelled_features(texts, label):
        # Split each text into words/punctuation and attach the class label.
        return [
            (feature_select(re.findall(token_pattern, text.rstrip()), best_words), label)
            for text in texts
        ]

    sentences = read_in_tweets(twitter_data)
    random.shuffle(sentences)
    sentences = sentences[:100000]

    posSentences = [tup[1] for tup in sentences if tup[0] == '4']
    negSentences = [tup[1] for tup in sentences if tup[0] == '0']

    posFeatures = labelled_features(posSentences, 'pos')
    negFeatures = labelled_features(negSentences, 'neg')

    # 3/4 of each class for training, 1/4 for testing (integer floor).
    posCutoff = len(posFeatures) * 3 // 4
    negCutoff = len(negFeatures) * 3 // 4
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

    classifier = NaiveBayesClassifier.train(trainFeatures)

    # referenceSets: test indices per true label;
    # testSets: test indices per predicted label.
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    print('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
    print('accuracy: %s' % (nltk.classify.util.accuracy(classifier, testFeatures),))
    for label in ('pos', 'neg'):
        print('%s precision: %s' % (label, nltk.metrics.precision(referenceSets[label], testSets[label])))
        print('%s recall: %s' % (label, nltk.metrics.recall(referenceSets[label], testSets[label])))
    classifier.show_most_informative_features(10)
def _lexicon_featureset(sample):
    """Score one ``(document, label)`` sample and wrap it as a labelled featureset."""
    document = sample[0]
    tagged = pos_tag(document)
    negated = mark_negation(document)
    # tagCount is given zeroed starting scores and a cleared negation flag;
    # the third (objective) score it returns is not used as a feature.
    pos_score, neg_score, _obj_score = tagCount(sample, tagged, negated, 0, 0, 0, False)
    return ({'positive': pos_score, 'negative': neg_score}, sample[1])


def modelTrainingLexicon(traginingData, testData):
    """
    Train and evaluate a Naive Bayes classifier on lexicon score features.

    Each sample is reduced to ``{'positive': ..., 'negative': ...}`` using
    ``tagCount``. The classifier is trained on ``traginingData`` and applied
    to ``testData``; accuracy is printed.

    :param traginingData: Iterable of ``(document, label)`` training samples.
    :param testData: Iterable of ``(document, label)`` test samples.
    :return: ``(realSet, testSet, tabPr, tabOut)`` where ``realSet`` maps each
        true label to the set of test indices carrying it, ``testSet`` maps
        each predicted label to the set of test indices that received it, and
        ``tabPr`` / ``tabOut`` are two separate lists holding the predictions
        in test order.
    """
    print("--Lexicon Model--")
    dataLexiconFeature = [_lexicon_featureset(sample) for sample in traginingData]
    dataLexiconFeatureT = [_lexicon_featureset(sample) for sample in testData]

    classifier = NaiveBayesClassifier.train(dataLexiconFeature)

    realSet = collections.defaultdict(set)
    testSet = collections.defaultdict(set)
    tabPr = []
    tabOut = []
    for i, (feat, ovAll) in enumerate(dataLexiconFeatureT):
        realSet[ovAll].add(i)
        predicted = classifier.classify(feat)
        tabOut.append(predicted)
        tabPr.append(predicted)
        testSet[predicted].add(i)

    print("Accuracy Naive Bayes for Lexicon Model : ",
          nltk.classify.util.accuracy(classifier, dataLexiconFeatureT))
    return realSet, testSet, tabPr, tabOut
def trainCustom(self, trainSet):
    """
    Train ``self._classifier`` on bag-of-words features over ``trainSet``.

    For every ``(text, label)`` entry of ``__TRAIN_SET__`` a feature dict is
    built that maps each word of ``trainSet`` to whether it occurs in the
    tokenized text. Despite its name, ``self._test_set`` holds these labelled
    training feature sets.

    :param trainSet: Iterable of words to use as features.
    """
    labelled = []
    for sample in __TRAIN_SET__:
        # Tokenize once per sample instead of once per feature word.
        # Assumes applyTokenizer returns a reusable container (list/set/str),
        # not a one-shot iterator -- TODO confirm.
        tokens = pt.applyTokenizer(sample[0])
        featureset = {word: (word in tokens) for word in trainSet}
        labelled.append((featureset, sample[1]))

    self._test_set = labelled
    self._classifier = NaiveBayesClassifier.train(self._test_set)
def create_model(pos_tweets, neg_tweets, neu_tweets, classifier_param='LinearSVC'):
    """
    Cross-validate a sentiment classifier and save the best fold's model.

    Tweets are cleaned with ``util.clean_text``, lower-cased, stripped of
    words shorter than three characters and shuffled. A 10-fold split is
    evaluated; the classifier with the highest specificity (as reported by
    ``util.accuracy``) is dumped to ``model/<classifier_param>_classifier.pkl``.

    Each fold is built from the exact index arrays KFold yields. Slicing from
    the first to the last index would include the held-out samples of every
    middle fold in the training data and drop the last sample of each split.

    :param pos_tweets: List of ``(text, sentiment)`` pairs.
    :param neg_tweets: List of ``(text, sentiment)`` pairs.
    :param neu_tweets: List of ``(text, sentiment)`` pairs.
    :param classifier_param: ``'LinearSVC'``, ``'Tfid'``, ``'Bernoulli'`` or
        ``'NaiveBayes'``. Any other value prints an error and exits with
        status 1.
    """
    tweets = []
    for (words, sentiment) in pos_tweets + neg_tweets + neu_tweets:
        words = util.clean_text(words, True)
        words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
        tweets.append((words_filtered, sentiment))

    # make sure tweets are shuffled randomly
    shuffle(tweets)

    training_set = nltk.classify.util.apply_features(extract_features, tweets)

    max_specificity = -1
    best_classifier = None
    average_accuracy = 0.0

    cv = cross_validation.KFold(len(training_set), n_folds=10, shuffle=False, random_state=None)
    for traincv, testcv in cv:
        train_fold = [training_set[int(i)] for i in traincv]
        # util.accuracy is handed the raw (words, sentiment) tweets, as before.
        test_fold = [tweets[int(i)] for i in testcv]

        if classifier_param == "LinearSVC":
            classifier = SklearnClassifier(LinearSVC()).train(train_fold)
        elif classifier_param == "Tfid":
            # TF-IDF weighting, then the 1000 best features by chi2,
            # then a multinomial naive Bayes classifier.
            pipeline = Pipeline([('tfidf', TfidfTransformer()),
                                 ('chi2', SelectKBest(chi2, k=1000)),
                                 ('nb', MultinomialNB())])
            classifier = SklearnClassifier(pipeline).train(train_fold)
        elif classifier_param == "Bernoulli":
            classifier = SklearnClassifier(BernoulliNB()).train(train_fold)
        elif classifier_param == "NaiveBayes":
            classifier = NaiveBayesClassifier.train(train_fold)
        else:
            print("Classifier option not available:  %s" % (classifier_param,))
            sys.exit(1)

        accuracy_of_classifier, specificity = util.accuracy(classifier, test_fold)
        average_accuracy += accuracy_of_classifier
        if specificity > max_specificity:
            max_specificity = specificity
            best_classifier = classifier

    print("\naverage accuracy:  %s" % (average_accuracy / cv.n_folds,))

    # save the classifier
    joblib.dump(best_classifier, "model/%s_classifier.pkl" % classifier_param)
    print("saved classifier")
def _loadClassifier(self):
    """
    Build ``self._classifier`` from the stored frequency distributions.

    ``self._label_freqdist`` becomes the P(label) distribution and each entry
    of ``self._feature_freqdist`` becomes a P(fval | label, fname)
    distribution. ``ELEProbDist`` is the estimator for both; the number of
    bins for a feature is the number of values recorded for it in
    ``self._feature_values``.
    """
    label_probdist = ELEProbDist(self._label_freqdist)

    feature_probdist = {
        (label, fname): ELEProbDist(freqdist, bins=len(self._feature_values[fname]))
        for (label, fname), freqdist in self._feature_freqdist.items()
    }

    self._classifier = NaiveBayesClassifier(label_probdist, feature_probdist)
def run_train(self, mode='agreeable'):
    """
    Train the classifier for one target attribute and store it.

    Every text of every entry in ``self.train`` becomes one training sample
    labelled with that entry's ``mode`` value, so all texts of an entry share
    the same label. The fitted model is stored in ``self.classifier[mode]``.

    :param mode: Key of the target attribute. The original comment lists
        'gender', 'age_group', 'extroverted', 'stable', 'agreeable',
        'conscientious' and 'openness'.
    """
    print(f"making train_input: {mode}")
    train_input = [
        (self.get_feature_dict(text), entry[mode])
        for entry in tqdm(self.train.values())
        for text in entry['text']
    ]

    print(f"running trainer... {mode}")
    self.classifier[mode] = NB.train(train_input)
    print("running trainer done")
def main():
    """
    Train a Naive Bayes classifier on the corpus and print a few diagnostics.

    Prints the ten most informative features and, for the words 'good' and
    'service', the probability that a document containing just that word is
    classified as 'useful'.

    Both files are opened in a ``with`` block so their handles are closed
    even if the diagnostics raise.
    """
    rdr = CategorizedPlaintextCorpusReader(
        '/home/mel/workspace/datascience/assignment5_kaggle/data/',
        r'.*\.txt',
        cat_pattern=r'(.*)\.txt')
    clf = NaiveBayesClassifier.train(list(make_training_data(rdr)))
    clf.show_most_informative_features(10)

    review_path = "/home/mel/workspace/datascience/assignment5_kaggle/data/yelp_test_set/yelp_test_set_review.json"
    output_path = '/home/mel/workspace/datascience/assignment5_kaggle/output.csv'
    with open(review_path) as review_file, open(output_path, 'w+') as output_file:
        # NOTE(review): neither `lines` nor `output_file` is used in the
        # visible part of this function. They are kept so the review file is
        # still read and output.csv is still created/truncated as before.
        lines = review_file.readlines()
        for word in ('good', 'service'):
            print('probability {w!r} is useful: {p:.2%}'.format(
                w=word,
                p=clf.prob_classify({word: True}).prob('useful')))
def evaluateclassifier(self, featureselection):
    """
    Train a Naive Bayes classifier from the tweet corpus and print metrics.

    Up to ``self.corpuslength`` positive (column 0 == '4') and negative
    (column 0 == '0') tweets are read from the corpus CSV; the tweet text is
    taken from column 5. Features are stored on ``self.positivefeatures`` and
    ``self.negativefeatures``. The evaluation set comes from
    ``self.test(featureselection)``.

    Output is produced with single-argument ``print(...)`` calls so the text
    is the same under Python 2 and Python 3.
    NOTE(review): the corpus is still opened in 'rb' mode as in the original,
    which is the Python 2 convention for ``csv.reader``.

    :param featureselection: Callable ``(tokens) -> feature dict``.
    """
    positivetweets = []
    negativetweets = []
    print(featureselection)

    with open(r'..\polarityData\TweetCorpus\training.1600000.processed.noemoticon.csv', 'rb') as f:
        for row in csv.reader(f):
            if row[0] == '4' and len(positivetweets) < self.corpuslength:
                positivetweets.append(row[5])
            elif row[0] == '0' and len(negativetweets) < self.corpuslength:
                negativetweets.append(row[5])
            # Stop scanning once both quotas are filled; the remaining rows
            # could not add anything.
            if (len(positivetweets) >= self.corpuslength
                    and len(negativetweets) >= self.corpuslength):
                break

    tokenizer = WhitespaceTokenizer()
    self.positivefeatures = [(featureselection(tokenizer.tokenize(tweet)), 'pos')
                             for tweet in positivetweets]
    self.negativefeatures = [(featureselection(tokenizer.tokenize(tweet)), 'neg')
                             for tweet in negativetweets]

    poscutoff = len(self.positivefeatures)
    negcutoff = len(self.negativefeatures)
    print("Train Pos Cutoff: " + str(poscutoff) + " Train Neg Cutoff: " + str(negcutoff))

    trainfeats = self.positivefeatures + self.negativefeatures
    testfeats = self.test(featureselection)
    print('Train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

    classifier = NaiveBayesClassifier.train(trainfeats)
    print('accuracy: %s' % (nltk.classify.util.accuracy(classifier, testfeats),))

    # refsets: test indices per true label;
    # testsets: test indices per predicted label.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    metrics = (
        ('precision', nltk.metrics.precision),
        ('recall', nltk.metrics.recall),
        ('F-measure', nltk.metrics.f_measure),
    )
    for label in ('pos', 'neg'):
        for title, metric in metrics:
            print('%s %s: %s' % (label, title, metric(refsets[label], testsets[label])))
def _train(self):
    """
    Load the cached classifier for this class, or train and cache a new one.

    The cache file is ``<ClassName>.pickle`` in the current working
    directory. When it exists it is unpickled into ``self._classifier``;
    otherwise a Naive Bayes classifier is trained on ``self._dataset`` (using
    ``self._extract_features`` and each cascade's ``'label'``) and pickled to
    that file.
    """
    pickle_filename = "{0}.pickle".format(self.__class__.__name__)

    if os.path.isfile(pickle_filename):
        # NOTE: unpickling runs code embedded in the file; only load cache
        # files this program produced itself.
        with open(pickle_filename, "rb") as classifier_f:
            self._classifier = pickle.load(classifier_f)
        return

    train_set = [(self._extract_features(cascade), cascade['label'])
                 for cascade in self._dataset]
    self._classifier = NaiveBayesClassifier.train(train_set)
    # The with-block closes the file; no explicit close() is needed.
    with open(pickle_filename, "wb") as save_classifier:
        pickle.dump(self._classifier, save_classifier)
def train(filename):
    """
    Train a Naive Bayes classifier from a ``sentence ,,, category`` file.

    Each non-blank line is split on the first ``' ,,, '`` separator; the left
    part is turned into features with ``extract_features`` and the stripped
    right part is the label. Blank lines (for example a trailing newline at
    the end of the file) are skipped; a non-blank line without the separator
    still raises ``ValueError``.

    :param filename: Path of the training file.
    :return: The trained classifier.
    """
    print('Reading data from the file ' + filename)
    labeled_featuresets = []
    with open(filename) as f:
        for line in f:
            if not line.strip():
                continue
            sentence, category = line.split(' ,,, ', 1)
            labeled_featuresets.append((extract_features(sentence), category.strip()))

    print('Training started')
    classifier = NaiveBayesClassifier.train(labeled_featuresets)
    print('Training completed\n')
    return classifier
def __init__(self): """ Gather data """ positive = twitter_samples.strings('positive_tweets.json') negative = twitter_samples.strings('negative_tweets.json') self.stop_words = list(set(stopwords.words('english'))) positive_tokens = twitter_samples.tokenized('positive_tweets.json') negative_tokens = twitter_samples.tokenized('negative_tweets.json') """ Clean the data """ positive_clean = [] negative_clean = [] for token in positive_tokens: positive_clean.append(self.clean(token)) for token in negative_tokens: negative_clean.append(self.clean(token)) positive_model_tokens = self.final_token_generator(positive_clean) negative_model_tokens = self.final_token_generator(negative_clean) """ Use generator to make datasets """ positive_dataset = [(token, "Positive") for token in positive_model_tokens] negative_dataset = [(token, "Negative") for token in negative_model_tokens] dataset = positive_dataset + negative_dataset """ Shake it all about """ random.shuffle(dataset) random.shuffle(dataset) random.shuffle(dataset) """ Split them up """ training = dataset[:7000] testing = dataset[7000:] """ Train the classifier """ self.classifier = NaiveBayesClassifier.train(training) """
def train_classifier(self, data):
    """
    Train a Naive Bayes sentiment model and pickle it to the model path.

    Positive samples are labelled ``+1`` and negative samples ``-1``; every
    token of a cleaned tweet becomes a ``True`` feature. Any exception is
    caught and printed rather than propagated.

    :param data: Mapping with ``"positive"`` and ``"negative"`` entries that
        are passed to ``self.cleaner.clean_tweets``.
    """
    try:
        positive = self.cleaner.clean_tweets(data["positive"])
        negative = self.cleaner.clean_tweets(data["negative"])

        dataset = [({token: True for token in tokens}, +1) for tokens in positive]
        dataset.extend(({token: True for token in tokens}, -1) for tokens in negative)

        random.shuffle(dataset)
        model = NaiveBayesClassifier.train(dataset)

        with open(self.get_model_path(), "wb") as analyser:
            analyser.write(pickle.dumps(model))
    except Exception as ex:
        print(ex)
def test_simple(self):
    """A word seen only with one label must make that label the likelier one."""
    positive_sample = ({'nice': True, 'good': True}, 'positive')
    negative_sample = ({'bad': True, 'mean': True}, 'negative')
    classifier = NaiveBayesClassifier.train([positive_sample, negative_sample])

    distribution = classifier.prob_classify({'nice': True})
    self.assertTrue(distribution.prob('positive') > distribution.prob('negative'))
    self.assertEqual(distribution.max(), 'positive')

    distribution = classifier.prob_classify({'bad': True})
    self.assertTrue(distribution.prob('positive') < distribution.prob('negative'))
    self.assertEqual(distribution.max(), 'negative')
def run_naive_bayes(self, language):
    """
    Train and apply a Naive Bayes classifier for each of ``self.k`` folds.

    :param language: Language passed to the language check, the review
        loader and the per-fold training/test data accessors.
    :return: A list with one entry per fold; each entry is the list of
        labels predicted for that fold's test data.
    """
    self.__check_language(language)
    util.time_log("starting nb...")
    self.load_data_reviews(language)

    predictions_per_fold = []
    for fold in range(self.k):
        util.time_log("learning...")
        training_data = self.training_data_text_vectorized_nb(language, fold)
        classifier = NaiveBayesClassifier.train(training_data)

        util.time_log("classifying")
        fold_predictions = []
        for sample in self.test_data_text_vectorized_nb(language, fold):
            fold_predictions.append(classifier.classify(sample))
        predictions_per_fold.append(fold_predictions)

    return predictions_per_fold
def build_classifier(self):
    """
    Train a classifier on the data sets of all known category labels.

    ``self.labeled_features`` is rebuilt from ``self.build_data_set(label)``
    for each label, with progress and the accumulated features printed along
    the way. The most informative features are shown before returning.

    Output is produced with single-argument ``print(...)`` calls so the text
    is the same under Python 2 and Python 3.
    NOTE(review): the source was collapsed onto one line, so which of the
    debug prints sat inside the loop is an assumption -- confirm.

    :return: The classifier returned by ``learner.train``.
    """
    labels = ['arts', 'business', 'computers', 'home', 'recreation',
              'science', 'shopping', 'knowledge']
    self.labeled_features = []
    for label in labels:
        print(label.upper())
        self.labeled_features.extend(self.build_data_set(label))
        print(self.labeled_features)
    print(self.labeled_features)
    print("Labeled Features:  %s" % (self.labeled_features,))

    classifier = learner.train(self.labeled_features)
    classifier.show_most_informative_features()
    return classifier
def main():
    """
    Train a Naive Bayes document classifier on ``my_corpus`` and log results.

    Documents are lower-cased word lists labelled with
    ``categoryMapper(category)``, shuffled, and converted with
    ``document_features``. Accuracy and a confusion matrix for the test split
    are logged; the 20 most informative features are printed.
    """
    logger.info("Start app")

    documents = [([w.lower() for w in my_corpus.words(fileid)], categoryMapper(category))
                 for category in my_corpus.categories()
                 for fileid in my_corpus.fileids(category)]
    random.shuffle(documents)
    featuresets = [(document_features(d), c) for (d, c) in documents]

    # NOTE(review): items 50..249 end up in neither split; kept as in the
    # original -- confirm whether [50:] / [:50] was intended.
    train_set, test_set = featuresets[250:], featuresets[:50]

    clf = NaiveBayesClassifier.train(train_set)
    logger.info("Accuracy: " + str(nltk.classify.accuracy(clf, test_set)))

    ref = [cat for features, cat in test_set]
    test = [clf.classify(features) for features, cat in test_set]

    # show_most_informative_features() prints its table and returns None,
    # so it is called directly instead of logging its return value.
    clf.show_most_informative_features(20)
    logger.info("\n" + nltk.ConfusionMatrix(ref, test).pp())
    logger.info("Exit app")
def test_simple(self):
    """Check that each training word pulls the prediction to its own label."""
    training_features = []
    training_features.append(({'nice': True, 'good': True}, 'positive'))
    training_features.append(({'bad': True, 'mean': True}, 'negative'))
    classifier = NaiveBayesClassifier.train(training_features)

    nice_result = classifier.prob_classify({'nice': True})
    self.assertTrue(nice_result.prob('positive') > nice_result.prob('negative'))
    self.assertEqual(nice_result.max(), 'positive')

    bad_result = classifier.prob_classify({'bad': True})
    self.assertTrue(bad_result.prob('positive') < bad_result.prob('negative'))
    self.assertEqual(bad_result.max(), 'negative')
def finalclassification(self):
    """
    Train a Naive Bayes classifier on 'positive.txt' and 'negative.txt'.

    Each line of a file is split into words and punctuation marks, converted
    with ``negativevalues.makeadict`` and labelled 'pos' or 'neg' according
    to the file it came from.

    :return: The trained classifier.
    """
    def load_labelled(path, label):
        # One [features, label] entry per line of the file.
        entries = []
        with open(path, 'r') as handle:
            for sentence in handle:
                words = re.findall(r"[\w']+|[.,!?;]", sentence.rstrip())
                entries.append([negativevalues.makeadict(words), label])
        return entries

    positive_words = load_labelled('positive.txt', 'pos')
    negative_words = load_labelled('negative.txt', 'neg')
    return NaiveBayesClassifier.train(positive_words + negative_words)
def train(records):
    """
    Train (or re-train) the module-level classifier and pickle it.

    On the first call (``CUR_CL is None``) a classifier of the kind named by
    ``CLASSIFIER`` is trained and stored in ``CUR_CL``. On later calls
    ``CUR_CL.train`` is invoked with the new data. In both cases ``CUR_CL``
    is pickled to ``<pickles_dir>/news_based_<CLASSIFIER>.pickle``.

    Output is produced with single-argument ``print(...)`` calls so the text
    is the same under Python 2 and Python 3.

    :param records: Iterable of records where element 0 is the class label
        and element 1 the text.
    :raises ValueError: If ``CLASSIFIER`` names no supported classifier
        (previously this surfaced as an ``UnboundLocalError``).
    """
    global CUR_CL

    train_data = [features_from_text(record[1], record[0], stopwords=sw)
                  for record in records]

    if CUR_CL is None:
        if CLASSIFIER == "NaiveBayesClassifier":
            classifier = NaiveBayesClassifier.train(train_data)
        else:
            # Final pipeline step per supported sklearn classifier name.
            final_steps = {
                "sklearnLinSVC": lambda: LinearSVC(multi_class="ovr"),
                "BernoulliNB": BernoulliNB,
                "MultinomialNB": MultinomialNB,
            }
            if CLASSIFIER not in final_steps:
                raise ValueError("Unsupported CLASSIFIER: %r" % (CLASSIFIER,))
            pipeline = Pipeline(
                [
                    ("tfidf", TfidfTransformer()),
                    ("chi2", SelectKBest(chi2, k=1000)),
                    ("nb", final_steps[CLASSIFIER]()),
                ]
            )
            classifier = SklearnClassifier(pipeline).train(train_data)
        print(CLASSIFIER)
        CUR_CL = classifier
    else:
        print("Partial fitting.. \n\n")
        # NOTE(review): the return value is discarded. If train() returns a
        # new model instead of updating CUR_CL in place (as appears to be the
        # case for NLTK's NaiveBayesClassifier.train), this call has no
        # lasting effect -- confirm the intended behaviour.
        CUR_CL.train(train_data)

    pickle_path = "%s/%s.pickle" % (pickles_dir, "news_based_" + CLASSIFIER)
    with open(pickle_path, "wb") as f:
        pickle.dump(CUR_CL, f)
    print("%s/%s.pickle saved" % (pickles_dir, "news_based_" + CLASSIFIER))
    gc.collect()
def test_simple(self):
    """Train on one positive and one negative sample and probe both sides."""
    classifier = NaiveBayesClassifier.train([
        ({"nice": True, "good": True}, "positive"),
        ({"bad": True, "mean": True}, "negative"),
    ])

    outcome = classifier.prob_classify({"nice": True})
    positive_prob = outcome.prob("positive")
    negative_prob = outcome.prob("negative")
    self.assertTrue(positive_prob > negative_prob)
    self.assertEqual(outcome.max(), "positive")

    outcome = classifier.prob_classify({"bad": True})
    positive_prob = outcome.prob("positive")
    negative_prob = outcome.prob("negative")
    self.assertTrue(positive_prob < negative_prob)
    self.assertEqual(outcome.max(), "negative")
def modelUnigram(trainData, testData):
    """
    Train a Naive Bayes classifier on unigram features and evaluate it.

    :param trainData: Labelled feature sets used for training.
    :param testData: Iterable of ``(features, label)`` pairs used for testing.
    :return: ``(realSet, testSet, tab, tabOut, tabOver)`` where ``realSet``
        maps each true label to the set of test indices carrying it,
        ``testSet`` maps each predicted label to the set of test indices that
        received it, ``tab`` and ``tabOut`` are two separate lists of the
        predictions in test order, and ``tabOver`` lists the true labels in
        test order.
    """
    print("--MODEL UNIGRAM--")
    classifier = NaiveBayesClassifier.train(trainData)

    realSet = collections.defaultdict(set)
    testSet = collections.defaultdict(set)
    predictions = []
    true_labels = []
    for position, sample in enumerate(testData):
        wordFeat, overall = sample
        realSet[overall].add(position)
        guess = classifier.classify(wordFeat)
        predictions.append(guess)
        true_labels.append(overall)
        testSet[guess].add(position)

    print("Accuracy Naive Bayes for Unigram Model : ", nltk.classify.util.accuracy(classifier, testData))
    # The caller receives two independent lists with the same predictions.
    return realSet, testSet, predictions, list(predictions), true_labels
def train(self, training_set=None):
    """
    Trains the BOW NaiveBayes classifier.

    :param training_set: Items to train on. When omitted, a list of
        ``(sent, sent.certainty)`` pairs is built from
        ``self._corpus.sents()``.

    NOTE(review): the loop below reads ``sent.certainty`` from every item,
    which the tuples of the default training set do not provide -- confirm
    what ``_build_bow_features`` and callers expect.
    """
    # Identity test: an `==` comparison could be intercepted by a custom __eq__.
    if training_set is None:
        training_set = [(sent, sent.certainty) for sent in self._corpus.sents()]

    # build features
    self._build_bow_features(training_set)

    # build featuresets for each sentence
    labeled_featuresets = []
    for sent in training_set:
        featureset = self.sentenceFeatures(sent)
        labeled_featuresets.append((featureset, sent.certainty))

    debug('Size of training set: ' + str(len(labeled_featuresets)))

    # train the NaiveBayes
    self._classifier = NaiveBayesClassifier.train(labeled_featuresets)
def start():
    """
    Train a Naive Bayes classifier on review features and print its accuracy.

    The 2000 top words (per ``get_top_words``) define the feature space. The
    first 20000 feature sets form the test set and the remainder the training
    set.
    """
    reviews = get_reviews()
    top_words = [entry[0] for entry in get_top_words(reviews, 2000)]

    print ("Generate Feature_set for all documents: Started")
    feature_set = [(get_features(review, top_words), category)
                   for review, category in reviews]
    print("Generate Feature_set for all documents: Completed")

    test_set, train_set = feature_set[:20000], feature_set[20000:]

    print("Training Started")
    classifier = NaiveBayesClassifier.train(train_set)
    # The original repeated "Training Started" here although training is done.
    print("Training Completed")

    print("Testing Now....")
    print(nltk.classify.accuracy(classifier, test_set))
def create_classifier(feature_select, filename):
    """
    Train a Naive Bayes classifier on the tweet data and pickle it.

    Tweets are read via ``read_in_tweets(twitter_data)``, shuffled and capped
    at 100000. Entries whose first element is ``'4'`` are treated as positive
    and ``'0'`` as negative; anything else is ignored. All labelled features
    are used for training (there is no train/test split here).

    :param feature_select: Callable ``(words) -> feature dict``.
    :param filename: Path the pickled classifier is written to.
    """
    token_pattern = r"[\w']+|[.,!?;]"

    def labelled_features(texts, label):
        # Split each text into words/punctuation and attach the class label.
        return [
            (feature_select(re.findall(token_pattern, text.rstrip())), label)
            for text in texts
        ]

    sentences = read_in_tweets(twitter_data)
    random.shuffle(sentences)
    sentences = sentences[:100000]

    posSentences = [tup[1] for tup in sentences if tup[0] == '4']
    negSentences = [tup[1] for tup in sentences if tup[0] == '0']

    posFeatures = labelled_features(posSentences, 'pos')
    negFeatures = labelled_features(negSentences, 'neg')

    trainFeatures = negFeatures + posFeatures
    classifier = NaiveBayesClassifier.train(trainFeatures)

    # The with-block closes the file even if pickling fails.
    with open(filename, 'wb') as f:
        pickle.dump(classifier, f)
def train(self, training_set=None):
    """
    Trains the BOW NaiveBayes classifier.

    :param training_set: Items to train on. When omitted, a list of
        ``(sent, sent.certainty)`` pairs is built from
        ``self._corpus.sents()``.

    NOTE(review): every item is later accessed as ``sent.certainty``, which
    the tuples of the default training set do not provide -- confirm what
    ``_build_bow_features`` and callers expect.
    """
    # Use an identity check; `== None` may invoke a user-defined __eq__.
    if training_set is None:
        training_set = [(sent, sent.certainty) for sent in self._corpus.sents()]

    # build features
    self._build_bow_features(training_set)

    # build featuresets for each sentence
    labeled_featuresets = [
        (self.sentenceFeatures(sent), sent.certainty) for sent in training_set
    ]
    debug('Size of training set: ' + str(len(labeled_featuresets)))

    # train the NaiveBayes
    self._classifier = NaiveBayesClassifier.train(labeled_featuresets)
def get_baseline_method(x_train, y_train, x_test, y_test, method=None, keywords=None):
    """
    Score train and test texts with one of several baseline methods.

    The method is chosen by membership test against ``method``: ``'nb'``
    (NLTK Naive Bayes on bag-of-words features), ``'randomforest'``,
    ``'knn'`` or ``'gloverank'`` (cosine similarity between the GloVe
    features of ``keywords`` and each text).

    :param x_train: Training texts.
    :param y_train: Training labels, returned unchanged.
    :param x_test: Test texts.
    :param y_test: Test labels, returned unchanged.
    :param method: Name selecting the baseline; must contain one of the
        supported names.
    :param keywords: Keyword text, only used by ``'gloverank'``.
    :return: ``(score_test, y_test, score_train, y_train)``.
    :raises ValueError: If ``method`` is ``None`` or matches no supported
        baseline (previously a ``TypeError`` / ``UnboundLocalError``).
    """
    def transform_features(sentence):
        words = sentence.lower().split()
        return dict(('contains(%s)' % w, True) for w in words)

    supported = ('nb', 'randomforest', 'knn', 'gloverank')
    if method is None or not any(name in method for name in supported):
        raise ValueError(
            "method must contain one of %s, got %r" % (', '.join(supported), method))

    if 'nb' in method:
        x_train = list(map(transform_features, x_train))
        x_test = list(map(transform_features, x_test))
        train_set = list(zip(x_train, y_train))
        clf = NaiveBayesClassifier.train(train_set)
        score_test = np.array([clf.prob_classify(t).prob(1.0) for t in x_test])
        score_train = np.array(
            [clf.prob_classify(t).prob(1.0) for t in x_train])
    else:
        x_train = [extract_glove_feature(text) for text in x_train]
        x_test = [extract_glove_feature(text) for text in x_test]
        if 'randomforest' in method:
            clf = RandomForestClassifier(max_depth=5,
                                         n_estimators=10,
                                         max_features=1).fit(x_train, y_train)
            score_train = clf.predict_proba(x_train)[:, 1]
            score_test = clf.predict_proba(x_test)[:, 1]
        elif 'knn' in method:
            clf = KNeighborsClassifier(10).fit(x_train, y_train)
            score_train = clf.predict_proba(x_train)[:, 1]
            score_test = clf.predict_proba(x_test)[:, 1]
        else:
            # 'gloverank' is the only supported name left at this point.
            from sklearn.metrics.pairwise import cosine_similarity
            # -1 lets the row width follow the embedding size instead of
            # hard-coding 50 dimensions.
            keyword_doc = extract_glove_feature(keywords).reshape(1, -1)
            score_train = cosine_similarity(keyword_doc, x_train)[0]
            score_test = cosine_similarity(keyword_doc, x_test)[0]
    return score_test, y_test, score_train, y_train
def train(self):
    """
    Train ``self.classifier`` on all stored features and print test metrics.

    The training set is every entry of ``self.positiveFeatures`` followed by
    every entry of ``self.negativeFeatures``; the evaluation set comes from
    ``self.test()``. Accuracy plus precision, recall and F-measure for the
    ``'pos'`` and ``'neg'`` labels are printed.

    Output is produced with single-argument ``print(...)`` calls so the text
    is the same under Python 2 and Python 3.
    """
    print('Classifier Training in progress....')
    poscutoff = len(self.positiveFeatures)
    negcutoff = len(self.negativeFeatures)
    print("Train Pos Cutoff: " + str(poscutoff) + " Train Neg Cutoff: " + str(negcutoff))

    trainfeats = self.positiveFeatures[:poscutoff] + self.negativeFeatures[:negcutoff]
    testfeats = self.test()
    print('Train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

    self.classifier = NaiveBayesClassifier.train(trainfeats)
    print('accuracy: %s' % (accuracy(self.classifier, testfeats),))

    # refsets: test indices per true label;
    # testsets: test indices per predicted label.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = self.classifier.classify(feats)
        testsets[observed].add(i)

    def report(label):
        # Print precision, recall and F-measure for one class label.
        reference, predicted = refsets[label], testsets[label]
        print('%s precision: %s' % (label, nltk.metrics.precision(reference, predicted)))
        print('%s recall: %s' % (label, nltk.metrics.recall(reference, predicted)))
        print('%s F-measure: %s' % (label, nltk.metrics.f_measure(reference, predicted)))

    report('pos')
    report('neg')
def train(records):
    """
    Train (or re-train) the module-level classifier and pickle it.

    On the first call (``CUR_CL is None``) a classifier of the kind named by
    ``CLASSIFIER`` is trained and stored in ``CUR_CL``. On later calls
    ``CUR_CL.train`` is invoked with the new data. In both cases ``CUR_CL``
    is pickled to ``<pickles_dir>/news_based_<CLASSIFIER>.pickle``.

    Output is produced with single-argument ``print(...)`` calls so the text
    is the same under Python 2 and Python 3.

    :param records: Iterable of records where element 0 is the class label
        and element 1 the text.
    :raises ValueError: If ``CLASSIFIER`` names no supported classifier
        (previously this surfaced as an ``UnboundLocalError``).
    """
    global CUR_CL

    train_data = []
    for record in records:
        class_label, text = record[0], record[1]
        train_data.append(features_from_text(text, class_label, stopwords=sw))

    if CUR_CL is None:
        if CLASSIFIER == 'NaiveBayesClassifier':
            classifier = NaiveBayesClassifier.train(train_data)
        else:
            # All sklearn variants share the tfidf + chi2 front end and
            # differ only in the final estimator.
            if CLASSIFIER == 'sklearnLinSVC':
                estimator = LinearSVC(multi_class='ovr')
            elif CLASSIFIER == 'BernoulliNB':
                estimator = BernoulliNB()
            elif CLASSIFIER == 'MultinomialNB':
                estimator = MultinomialNB()
            else:
                raise ValueError('Unsupported CLASSIFIER: %r' % (CLASSIFIER,))
            pipeline = Pipeline([('tfidf', TfidfTransformer()),
                                 ('chi2', SelectKBest(chi2, k=1000)),
                                 ('nb', estimator)])
            classifier = SklearnClassifier(pipeline).train(train_data)
        print(CLASSIFIER)
        CUR_CL = classifier
    else:
        print('Partial fitting.. \n\n')
        # NOTE(review): the return value is discarded. If train() returns a
        # new model instead of updating CUR_CL in place (as appears to be the
        # case for NLTK's NaiveBayesClassifier.train), this call has no
        # lasting effect -- confirm the intended behaviour.
        CUR_CL.train(train_data)

    with open("%s/%s.pickle" % (pickles_dir, 'news_based_' + CLASSIFIER), 'wb') as f:
        pickle.dump(CUR_CL, f)
    print("%s/%s.pickle saved" % (pickles_dir, 'news_based_' + CLASSIFIER))
    gc.collect()
for word in word_features: features['contains(%s)' % word] = (word in document_words) return features def classify_tweet(tweet): return classifier.classify(extract_features(nltk.word_tokenize(tweet))) pos_tweets = read_tweets('Training_Data/Social_Inspirer.txt', 'positive') neg_tweets = read_tweets('Training_Data/Negative.txt', 'negative') tweets = [] for (words, sentiment) in pos_tweets + neg_tweets: words_filtered = [e.lower() for e in words.split() if len(e) >= 3] tweets.append((words_filtered, sentiment)) word_features = get_word_features(get_words_in_tweets(tweets)) training_set = nltk.classify.util.apply_features(extract_features, tweets) classifier = NaiveBayesClassifier.train(training_set) test_tweets = read_tweets('Test_Tweets/Tweets_Positive.txt', 'positive') test_tweets.extend(read_tweets('Test_Tweets/Tweets_Negative.txt', 'negative')) total = accuracy = float(len(test_tweets)) for tweet in test_tweets: if classify_tweet(tweet[0]) != tweet[1]: accuracy -= 1 result = accuracy / total * 100
for key, value in {'noun': nouns, 'verb': verbs, 'adj': adj, 'adv': adv}.items(): value.sort() for idx, word in enumerate(value[:3]): features[key + '-' + str(idx)] = word[1].lower() return features train_set = [] for sent in train_data: tagged_sent = [(word[2], word[0]) for word in sent] for idx, word in enumerate(sent): features = wsd_features(tagged_sent, idx) sense = word[1] train_set.append((features, sense)) classifier = NaiveBayesClassifier.train(train_set) code.interact(local=locals()) class Concept(object): def __init__(self, *args): if args: synsets = [wordnet.synsets(x) for x in args] self.synsets = self._common_synsets(synsets) if len(args) > 1: isas = [self._isa_synsets(synsets, x) for x in synsets] self.synsets = set.union(self.synsets, *isas) else: self.synsets = set()
tweets.append((words_filtered, sentiment)) # extract the word features out from the training data word_features = get_word_features(get_words_in_tweets(tweets)) # get the training set and train the Naive Bayes Classifier print("Aplicando o treino com o Naive Bayes Classifier (by NLTK)...\n") training_set = nltk.classify.util.apply_features(extract_features, tweets) cv = cross_validation.KFold(len(training_set), n_folds=number_cross, indices=True, shuffle=False, random_state=None, k=None) totalaccuracy = 0 test = { 'positive': 0, 'negative': 0, 'totpos': 0, 'totneg': 0 } for tweet, testcv in cv: classifier = NaiveBayesClassifier.train(training_set[tweet[0]:tweet[len(tweet)-1]]) accuracy = nltk.classify.util.accuracy(classifier, training_set[testcv[0]:testcv[len(testcv)-1]]) totalaccuracy += accuracy classified = classify_tweet(in_tweets[testcv[0]][0]) # print 'accuracy:', accuracy # print ("Tweet: ... : Pre-class: %s || Classificado como: %s" % (in_tweets[testcv[0]][1], classified)) if classified == 'positive': test['positive'] += 1 else: test['negative'] += 1 if in_tweets[testcv[0]][1] == 'positive': test['totpos'] += 1 else:
def __init__(self, *args, **kwargs):
    """Load the training data and fit the Naive Bayes classifier.

    Positional and keyword arguments are accepted but not used here.
    """
    # presumably populates self.all_words and self.training_tweets,
    # both of which are read below -- verify against load_training_data.
    self.load_training_data()

    # The feature vocabulary is the set of distinct entries in all_words.
    word_frequencies = nltk.FreqDist(self.all_words)
    self.word_features = word_frequencies.keys()

    # Train the classifier on the feature-mapped training tweets.
    self.classifier = NaiveBayesClassifier.train(
        nltk.classify.util.apply_features(
            self.extract_features, self.training_tweets
        )
    )
def train(self, pairs):
    """Fit ``self.model`` on labeled pairs and print its most informative features.

    :param pairs: iterable of ``(x, y, judgment)`` triples. ``x`` and ``y``
        are passed to ``self.features``; ``judgment`` is used as the label.
    """
    labeled_featuresets = [
        (self.features(x, y), judgment) for x, y, judgment in pairs
    ]
    self.model = NaiveBayesClassifier.train(labeled_featuresets)
    # NOTE: MaxentClassifier.train(features, max_iter=10) was previously
    # tried here as an alternative model.

    # A parenthesised single-argument print produces the same output on
    # Python 2 and is valid syntax on Python 3.
    print(self.model.most_informative_features())
from nltk import NaiveBayesClassifier
from nltk import classify

# Number of leading feature sets held out for evaluation; the remainder
# is used for training.
TEST_SET_SIZE = 500

# Show the first (name, gender) pair as a quick look at the data.
# Single-argument print(...) behaves identically on Python 2 and 3.
for n, g in names:
    print(n)
    print(g)
    break

featuresets = [(gender_features(n), g) for (n, g) in names]

# NOTE: the bare expressions below only display a value when run in an
# interactive session; as a script their results are discarded.
featuresets
len(featuresets)

train_set = featuresets[TEST_SET_SIZE:]
test_set = featuresets[:TEST_SET_SIZE]
train_set

nb_classifier = NaiveBayesClassifier.train(train_set)
nb_classifier.classify(gender_features('Gary'))
nb_classifier.classify(gender_features('Grace'))

classify.accuracy(nb_classifier, test_set)
nb_classifier.show_most_informative_features(5)
iteracao = iteracao + 1 arquivoMedicoes = open('medicoes_analise_threads_' + str(iteracao) + '.txt', 'w') precisao = accuracy(classificador, featuresClassificados) * 100 arquivoMedicoes.write('Tempo de Execução = ' + str(tempo) + '\nPrecisão = {0:.2f}%'.format(precisao)) arquivoMedicoes.close() features = resultadoPositivos.get() + resultadoNegativos.get() + resultadosNeutros.get() pool1.terminate() pool1.close() pool2.terminate() pool2.close() pool3.terminate() pool3.close() if precisao > 50: features.extend(featuresClassificados) shuffle(features) classificador = NaiveBayesClassifier.train(features) arquivoClassificador = open('classificador.pickle', 'wb') dump(classificador, arquivoClassificador, protocol=HIGHEST_PROTOCOL) arquivoClassificador.close() arquivoPositivos = open('positivos.json', 'w') ujson.dump(positivos, arquivoPositivos) arquivoPositivos.close() arquivoNegativos = open('negativos.json', 'w') ujson.dump(negativos, arquivoNegativos) arquivoNegativos.close() arquivoNeutros = open('neutros.json', 'w') ujson.dump(neutros, arquivoNeutros) arquivoNeutros.close() arquivoResultados = open('resultados_sem_stopwords' + str(iteracao) + '.csv', 'w', newline='') w = writer(arquivoResultados, delimiter=',') linhas = [['Resposta', 'Pontos', 'Sentimento - Naive Bayes', 'Sentimento - AlchemyAPI']]
import base64
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify import PositiveNaiveBayesClassifier
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# NOTE(review): `pickle` and `zlib` are used below but are not imported in
# this section -- confirm they are imported earlier in the file.

corpusdir = "./text"
newcorpus = PlaintextCorpusReader(corpusdir, ".*")

# Every word of each corpus file becomes one labeled training example.
labeled_names = (
    [(name, "comp") for name in newcorpus.words("comp.txt")]
    + [(name, "animal") for name in newcorpus.words("animal.txt")]
    + [(word, "ignore") for word in newcorpus.words("ignorethese.txt")]
)

# Each feature set is a one-entry dict mapping the word to itself.
features = [({n: n}, thing) for (n, thing) in labeled_names]
training = features[:]
testing = "What color is the mouse?".lower().split(" ")

classifier = NaiveBayesClassifier.train(training)

# Persist the classifier as zlib-compressed, base64-encoded pickle bytes.
pickleclf = pickle.dumps(classifier)
compressed = base64.b64encode(zlib.compress(pickleclf, 9))
with open("PickledClassifier.txt", "wb") as outobj:
    outobj.write(compressed)

compScore = 0
animalScore = 0

# Punctuation marks stripped (one character at most) from the end of a token.
TRAILING_PUNCTUATION = (".", ",", "?", "!")

for word in testing:
    # endswith() is safe on empty tokens (which split(" ") produces for
    # consecutive spaces), whereas indexing the last character raises
    # IndexError.
    if word.endswith(TRAILING_PUNCTUATION):
        word = word[:-1]
    result = classifier.classify({word: word})
def _train_and_pickle_classifier(records, classes, classification_name, stop_words):
    """Train a Naive Bayes classifier on *records* and pickle it to disk.

    :param records: dicts with ``'text'`` and ``'classId'`` keys.
    :param classes: class definitions handed to ``get_class_label``.
    :param classification_name: used to build the ``<name>.pickle`` path.
    :param stop_words: forwarded to ``features_from_tweet`` as ``stopwords``.
    """
    featuresets = [
        features_from_tweet(
            record['text'],
            get_class_label(record['classId'], classes),
            word_indicator,
            stopwords=stop_words,
        )
        for record in records
    ]
    classifier = NaiveBayesClassifier.train(featuresets)
    # The context manager closes the file even if pickling raises.
    with open("%s.pickle" % classification_name, 'wb') as pickle_file:
        pickle.dump(classifier, pickle_file)


def start():
    """Retrain and persist a classifier for each classification that needs it.

    Tweets are grouped by classification id. For every classification the
    number of complete batches of ``thr`` tweets is compared with the value
    remembered in ``global_count``:

    * id not seen before: the batch count is recorded, and a classifier is
      trained if at least one full batch exists;
    * id seen before: a classifier is trained only when the batch count has
      grown, and the stored count is updated after training succeeds.

    Reads the module-level ``tweets_collection`` and
    ``classifications_collection`` and mutates ``global_count`` in place.
    """
    # NOTE(review): the .find() calls and "_id" field suggest these are
    # MongoDB collections -- confirm against where they are created.
    sw = stopwords.words('english')
    thr = 5

    # Group tweets by classification id:
    # {clasf_id: [{'text': ..., 'classId': ...}, ...]}
    refactored_tweets = {}
    records = tweets_collection.find()
    for record in records:
        tweet = record['text']
        # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
        for clasf_id, class_id in record['classifications'].items():
            refactored_tweets.setdefault(clasf_id, []).append(
                {'text': tweet, 'classId': class_id}
            )
    # Drop the reference to the query result before the training pass.
    records = None
    gc.collect()

    for classification in classifications_collection.find():
        classification_name = classification['classification']
        classification_id = str(classification["_id"])
        classes = classification['classes']

        try:
            records = refactored_tweets[classification_id]
        except KeyError:
            # Single-argument print(...) keeps the exact Python 2 output
            # (including the double space) and is valid on Python 3.
            print("No tweets for classification  %s" % (classification_name,))
            continue

        records_count = len(records)
        print("%s %s" % (classification_name, records_count))

        # Number of complete batches of `thr` tweets currently available.
        batches = records_count // thr

        if classification_id in global_count:
            if batches > global_count[classification_id]:
                print("Exceeded threshold. Training started")
                _train_and_pickle_classifier(
                    records, classes, classification_name, sw
                )
                # Updated only after training, so a failed run is retried.
                global_count[classification_id] = batches
        else:
            global_count[classification_id] = batches
            if batches >= 1:
                print("New classification or just started monitor")
                _train_and_pickle_classifier(
                    records, classes, classification_name, sw
                )