class SVMTweetClassifier(TweetClassifier):
    """
    A simple SVM classifier.
    Documents are tokenized and stemmed, and then converted to bag-of-words format.
    The preprocessed documents are then handled by NLTK's wrapper around scikit-learn's SVC.
    """

    def __init__(self, trainfile=None, datafile=None, outfile=None):
        super(SVMTweetClassifier, self).__init__(trainfile, datafile, outfile)
        self.dictionary = SimpleDict()
        self.scores = {}
        self.stemmer = PorterStemmer()

    def getFeatures(self, tweet):
        """
        Replace this method to select different features than just the bag-of-words
        representation of the whole tweet. This is probably the one piece of code we
        should work on most, since the features basically decide whether we have a
        good or a bad classifier.
        """
        return self.getFeatures2(tweet)
        #tokens = string.lower(tweet.tweet.translate(string.maketrans("",""), string.punctuation)).split(" ")
        #tokens = [self.stemmer.stem(token) for token in tokens]
        #tokens = [token for token in tokens if not token[0:4] == "http"] #remove links
        #for stop in STOPWORDS:
        #    if stop in tokens:
        #        tokens.remove(stop)
        #return self.dictionary.doc2bow(tokens, True)

    def getFeatures2(self, tweet):
        """ POS tag the tweet and keep only nouns, adjectives, verbs and adverbs. """
        text = nltk.word_tokenize(tweet.tweet)
        return self.dictionary.doc2bow([pos for pos in nltk.pos_tag(text)
                                        if pos[1] in ["NN", "JJ", "JJR", "JJS", "VBD", "VBG",
                                                      "VBN", "VBP", "VBZ", "RB"]])

    def train(self, trainfile=None):
        self.readTrainingData((trainfile or self.trainfile))
        print "getting features.."
        # the classifier expects a list of (feature_set, label) elements, where each
        # feature_set is a dictionary of {feature_name: value, ...} mappings
        train_set = [(self.getFeatures(tweet), tweet.sentiment) for tweet in self.trainingTweets]
        print train_set
        print "training SVM classifier"
        self.classifier = SklearnClassifier(SVC(), sparse=True).train(train_set)

    def classifyTweets(self, datafile=None, outfile=None):
        print "reading dataset"
        self.readDataset(datafile)
        print "classifying Tweets with SVM classifier"
        # we could use prob_classify here so we can remember the scores. That way we could
        # later judge the certainty of a prediction and, if it's low, change the sentiment.
        res = self.classifier.batch_classify([self.getFeatures(tweet) for tweet in self.evalTweets])
        print "assigning sentiments"
        for idx, tweet in enumerate(self.evalTweets):
            tweet.sentiment = res[idx]
            #self.scores[(tweet.id1,tweet.id2)] = res
            #tweet.sentiment = res.max()
        self.writeResults(outfile)
def get_ten_fold_accuracy_score(classifier, train_set):
    classifier = SklearnClassifier(classifier)
    k_fold = cross_validation.KFold(len(train_set), n_folds=10)
    metric = []
    for train_range, test_range in k_fold:
        train_data = []
        test_data = []
        for i in train_range:
            train_data.append(train_set[i])
        for i in test_range:
            test_data.append(train_set[i])
        classifier.train(train_data)
        test_fea, test_tag = zip(*test_data)
        tag_pred = classifier.batch_classify(test_fea)
        metric.append(accuracy_score(test_tag, tag_pred, 'macro'))
    metric_array = np.array(metric)
    return np.mean(metric_array[0:])
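# Hypothetical invocation sketch (not part of the original snippet): the toy sentences,
# bag-of-words feature dicts and the LinearSVC estimator below are made up purely to show
# how get_ten_fold_accuracy_score might be called; it assumes SklearnClassifier,
# cross_validation, accuracy_score and np are already imported as in the function above.
from sklearn.svm import LinearSVC

docs = [("a great and moving film", "pos"), ("a dull and lifeless film", "neg")] * 20
train_set = [(dict((w, True) for w in text.split()), label) for text, label in docs]
print get_ten_fold_accuracy_score(LinearSVC(), train_set)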
def getAccuracy(self, classifier):
    classifier = SklearnClassifier(classifier)
    accuracy = 0
    for fold in range(0, self.n_fold):
        log(str(fold+1) + " iteration...")
        log("  Partitioning...")
        datacv = self.getCrossValidationData(self.tweets, fold)
        traincv = datacv[0]
        testcv = datacv[1]
        testlabel = datacv[2]
        log("  Training...")
        classifier.train(traincv)
        log("  Classifying...")
        label_pred = classifier.batch_classify(testcv)
        tempScore = accuracy_score(testlabel, label_pred)
        log("  Accuracy for this iteration: " + str(tempScore))
        accuracy += tempScore
    return accuracy/self.n_fold
def runClassifier(train, test, algo='LogisticRegression'):
    train_features = []
    for co in train:
        train_features.append((co.featureset, co.isbug))
    test_features = []
    for c in test:
        if c is None:
            continue
        test_features.append(c.featureset)

    if algo == 'LogisticRegression':
        print 'LogisticRegression'
        try:
            from sklearn.linear_model.sparse import LogisticRegression
        except ImportError:
            # separate sparse LR to be removed in 0.12
            from sklearn.linear_model import LogisticRegression
        classif = SklearnClassifier(LogisticRegression(C=1000))
    else:
        # if not logistic, assume SVM for now
        # SVM with a Linear Kernel and default parameters
        from sklearn.svm import LinearSVC
        print 'svm'
        classif = SklearnClassifier(LinearSVC())

    classif.train(train_features)
    try:
        p = classif.classify_many(test_features)
    except AttributeError:
        p = classif.batch_classify(test_features)

    test_commits = []
    for idx, val in enumerate(p):
        t = test[idx]
        t.isbug = val
        test_commits.append(t)
    return test_commits
def get_accuracy_score(classifier, train_set, test, tag_test):
    classifier = SklearnClassifier(classifier)
    classifier.train(train_set)
    pred = classifier.batch_classify(test)
    return accuracy_score(tag_test, pred, 'macro')  # averaged accuracy over the positive and negative classes
def evaluate(data_split, model_file, d, rnn_feats=True, bow_feats=False, rel_feats=False):
    stop = stopwords.words("english")
    vocab, rel_list, ans_list, tree_dict = cPickle.load(open(data_split, "rb"))
    train_trees = tree_dict["train"] + tree_dict["dev"]
    test_trees = tree_dict["test"] + tree_dict["devtest"]

    params, vocab, rel_list = cPickle.load(open(model_file, "rb"))
    (rel_dict, Wv, b, We) = params

    data = [train_trees, test_trees]

    # get rid of trees that the parser messed up on
    for sn, split in enumerate(data):
        bad_trees = []
        for ind, tree in enumerate(split):
            if tree.get(0).is_word == 0:
                # print tree.get_words()
                bad_trees.append(ind)
                continue
        # print 'removed', len(bad_trees)
        for ind in bad_trees[::-1]:
            split.pop(ind)

    # adding lookup
    ans_list = array([vocab.index(ans) for ans in ans_list])
    for split in data:
        for tree in split:
            for node in tree.get_nodes():
                node.vec = We[:, node.ind].reshape((d, 1))
            tree.ans_list = ans_list[ans_list != tree.ans_ind]

    train_q, test_q = collapse_questions(train_trees, test_trees)
    # print 'number of training questions:', len(train_q)
    # print 'number of testing questions:', len(test_q)

    train_feats = []
    test_feats = []
    test_ord = []

    for tt, split in enumerate([train_q, test_q]):
        # if tt == 0:
        #     print 'processing train'
        # else:
        #     print 'processing test'

        # for each question in the split
        for qid in split:
            q = split[qid]
            ave = zeros((d, 1))
            words = zeros((d, 1))
            bow = []
            count = 0.0
            curr_ave = None
            curr_words = None

            # for each sentence in the question, generate features
            for i in range(0, len(q)):
                try:
                    tree = q[i]
                except:
                    continue

                curr_feats = {}
                if rnn_feats:
                    forward_prop(None, params, tree, d, labels=False)

                    # features: average of hidden representations and average of word embeddings
                    for ex, node in enumerate(tree.get_nodes()):
                        if node.word not in stop:
                            ave += node.p_norm
                            words += node.vec
                            count += 1.0

                    if count > 0:
                        curr_ave = ave / count
                        curr_words = words / count

                    featvec = concatenate([curr_ave.flatten(), curr_words.flatten()])

                    # add QANTA's features to the current feature set
                    for dim, val in ndenumerate(featvec):
                        curr_feats["__" + str(dim)] = val

                # add unigram indicator features to the current feature set
                if bow_feats:
                    bow += [l.word for l in tree.get_nodes()]
                    for word in bow:
                        curr_feats[word] = 1.0

                # add dependency relation indicator features to the current feature set
                if rel_feats:
                    for l in tree.get_nodes():
                        if len(l.parent) > 0:
                            par, rel = l.parent[0]
                            this_rel = l.word + "__" + rel + "__" + tree.get(par).word
                            curr_feats[this_rel] = 1.0

                if tt == 0:
                    train_feats.append((curr_feats, tree.ans.lower()))
                else:
                    test_feats.append((curr_feats, tree.ans.lower()))
                    test_ord.append(tree)

    # print 'total training instances:', len(train_feats)
    # print 'total testing instances:', len(test_feats)

    # can modify this classifier / do grid search on regularization parameter using sklearn
    classifier = SklearnClassifier(LogisticRegression(C=10))
    classifier.train(train_feats)

    print "accuracy train:", nltk.classify.util.accuracy(classifier, train_feats)
    print "accuracy test:", nltk.classify.util.accuracy(classifier, test_feats)
    print ""

    # finer-grained evaluation, see how well QANTA does at each sentence position
    pred = classifier.batch_classify([fs for (fs, l) in test_feats])
    count_dists = Counter()
    corr_dists = Counter()

    for ind, tree in enumerate(test_ord):
        curr_dist = tree.dist
        count_dists[curr_dist] += 1.0
        label = tree.ans
        if label == pred[ind]:
            corr_dists[curr_dist] += 1.0

    prob_dists = {}

    print "sentence position: correctly answered at that position, total sentences at that position,", "accuracy"
    for key in corr_dists:
        prob_dists[key] = corr_dists[key] / count_dists[key]
        print key, ": ", corr_dists[key], count_dists[key], prob_dists[key]
class movie_sentiment:
    def __init__(self):
        self.config = ConfigParser.ConfigParser()
        self.config.read("senti_analysis.config")

        # File names:
        cur_dir = os.getcwd()
        rel_dir_path = self.config.get('GLOBAL', 'reviews_file_dir')
        self.reviews_file_dir = os.path.join(cur_dir, rel_dir_path)
        self.d2_pos_reviews_file_dir = os.path.join(cur_dir, self.config.get('GLOBAL', 'pos_reviews_dir'))
        self.d2_neg_reviews_file_dir = os.path.join(cur_dir, self.config.get('GLOBAL', 'neg_reviews_dir'))
        self.pos_rev_file_name = self.config.get('GLOBAL', 'pos_reviews_file_name')
        self.pos_rev_file = os.path.join(self.reviews_file_dir, self.pos_rev_file_name)
        self.neg_rev_file_name = self.config.get('GLOBAL', 'neg_reviews_file_name')
        self.neg_rev_file = os.path.join(self.reviews_file_dir, self.neg_rev_file_name)
        self.logger_file = os.path.join("OUTPUT", "senti_analysis.log")

        # Dataset 2
        self.bow_file_dir = self.config.get('GLOBAL', 'bag_of_words_file_dir')
        self.bow_file_name = self.config.get('GLOBAL', 'bag_of_words_file_name')
        self.bow_file = os.path.join(self.bow_file_dir, self.bow_file_name)
        self.train_feat_file_dir = self.config.get('GLOBAL', 'feat_file_dir')
        self.train_feat_file_name = self.config.get('GLOBAL', 'feat_file_name')
        self.train_file = os.path.join(self.train_feat_file_dir, self.train_feat_file_name)
        self.test_feat_file_dir = self.config.get('GLOBAL', 'test_feat_file_dir')
        self.test_feat_file_name = self.config.get('GLOBAL', 'test_feat_file_name')
        self.test_file = os.path.join(self.test_feat_file_dir, self.test_feat_file_name)

        # Global data structures
        self.pos_reviews_list = []
        self.neg_reviews_list = []
        self.bow_dict = {}
        #self.words_selection_dict = {"top_100":100, "top_500":500, "top_1000":1000, "top_5000":5000, "top_10000":10000, "top_20000":20000, "bigram":10000, "all_words":10000}
        self.words_selection_dict = {"top_12500": 12500}
        self.stopwords_set = set(stopwords.words('english'))
        self.stemmer = nltk.stem.PorterStemmer()

    def initialize_logger(self):
        logging.basicConfig(filename=self.logger_file, level=logging.INFO)
        logging.info("Initialized logger")

    def run_main(self):
        self.preprocessing()
        # The classifier is trained on all features, a specified number of top features,
        # or a specified number of bigram features
        for key, words_count in self.words_selection_dict.iteritems():
            self.all_feat = self.bigram_active = self.best_feat = 0
            print "Training classifier on %s" % (key)
            if key == "all_words":
                # Select all feats as features
                self.all_feat = 1
            elif key == "bigram":
                # Select top k bigram feats and top n unigram feats
                self.bigram_active = 1
            else:
                # Select top n unigram feats
                self.best_feat = 1
            self.words_count = words_count
            self.feature_extraction()
            self.classification()
            self.cross_validation()

    def preprocessing(self):
        self.initialize_logger()
        self.open_files()
        self.load_data()
        self.close_files()
        self.compute_word_scores()

    def open_files(self):
        self.pos_rev_fd = open(self.pos_rev_file, 'r')
        self.neg_rev_fd = open(self.neg_rev_file, 'r')
        self.bow_fd = open(self.bow_file, 'r')
        self.train_file_fd = open(self.train_file, 'r')
        self.test_file_fd = open(self.test_file, 'r')

    def load_data(self):
        self.load_bow()
        self.load_reviews()

    def load_bow(self):
        counter = 0
        for word in self.bow_fd.readlines():
            self.bow_dict[counter] = word and word.strip()
            counter += 1

    def load_reviews(self):
        # Loading pos reviews
        for review in self.pos_rev_fd.readlines():
            self.pos_reviews_list.append(review)
        # Loading neg reviews
        for review in self.neg_rev_fd.readlines():
            self.neg_reviews_list.append(review)
        #self.load_dataset_two()
        #self.load_d2_reviews()

    def load_d2_reviews(self):
        d2_pos_reviews = []
        d2_neg_reviews = []
        pos_files = os.listdir(self.d2_pos_reviews_file_dir)
        neg_files = os.listdir(self.d2_neg_reviews_file_dir)
        for pos_file in pos_files:
            pos_filename = os.path.join(self.d2_pos_reviews_file_dir, pos_file)
            pos_fd = open(pos_filename, 'r')
            for lines in pos_fd.readlines():
                d2_pos_reviews.append(lines)
        for neg_file in neg_files:
            neg_filename = os.path.join(self.d2_neg_reviews_file_dir, neg_file)
            neg_fd = open(neg_filename, 'r')
            for lines in neg_fd.readlines():
                d2_neg_reviews.append(lines)
        self.pos_reviews_list.extend(d2_pos_reviews[:1000])
        self.neg_reviews_list.extend(d2_neg_reviews[:5000])

    def load_dataset_two(self):
        d2_pos_reviews_list = []
        d2_neg_reviews_list = []
        for review in self.train_file_fd.readlines():
            label = review[0]
            if int(label) >= 7:
                kv_list = review.split(" ")[1:]
                sent = ""
                for kv in kv_list:
                    sent = sent + " " + self.bow_dict.get(int(kv.split(":")[0]))
                d2_pos_reviews_list.append(sent)
            if int(label) <= 4:
                kv_list = review.split(" ")[1:]
                sent = ""
                for kv in kv_list:
                    sent = sent + " " + self.bow_dict.get(int(kv.split(":")[0]))
                d2_neg_reviews_list.append(sent)
        self.pos_reviews_list.extend(d2_pos_reviews_list)
        self.neg_reviews_list.extend(d2_neg_reviews_list)

    def close_files(self):
        self.pos_rev_fd.close()
        self.neg_rev_fd.close()
        self.bow_fd.close()
        self.train_file_fd.close()
        self.test_file_fd.close()

    def feature_selection(self, features_list):
        selected_feat_list = []
        self.bestwords = list(set([w for w, s in self.best[:self.words_count]]))
        for feat in features_list:
            if feat and feat in self.bestwords:
                selected_feat_list.append((feat, True))
        return dict(selected_feat_list)

    def all_feature_selection(self, features_list):
        selected_feat_list = []
        for feat in features_list:
            if feat:
                selected_feat_list.append((feat, True))
        return dict(selected_feat_list)

    def bigram_feature_selection(self, features_list):
        score = BigramAssocMeasures.chi_sq
        n = 250
        all_bigrams = BigramCollocationFinder.from_words(features_list)
        best_bigrams = all_bigrams.nbest(score, n)
        selected_bigrams = dict([(bigram, True) for bigram in best_bigrams])
        selected_monograms = self.feature_selection(features_list)
        selected_bigrams.update(selected_monograms)
        return selected_bigrams

    def compute_word_scores(self):
        # Core module which assigns scores to features; top features are selected based on this score.
        freq_dist_obj = FreqDist()
        cond_freq_dist_obj = ConditionalFreqDist()

        # Iterating over pos reviews, to calculate scores for pos feats
        for review in self.pos_reviews_list:
            review_words = self.apply_preprocessing(review)
            for word in review_words:
                freq_dist_obj.inc(word)
                cond_freq_dist_obj['pos'].inc(word)

        # Iterating over neg reviews, to calculate scores for neg feats
        for review in self.neg_reviews_list:
            review_words = self.apply_preprocessing(review)
            for word in review_words:
                freq_dist_obj.inc(word)
                cond_freq_dist_obj['neg'].inc(word)

        pos_word_count = cond_freq_dist_obj['pos'].N()
        neg_word_count = cond_freq_dist_obj['neg'].N()
        total_word_count = pos_word_count + neg_word_count
        word_score_dict = {}

        # Finding the scores using chi square
        for word, freq in freq_dist_obj.iteritems():
            pos_score = BigramAssocMeasures.chi_sq(cond_freq_dist_obj['pos'][word], (freq, pos_word_count), total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(cond_freq_dist_obj['neg'][word], (freq, neg_word_count), total_word_count)
            word_score_dict[word] = pos_score + neg_score

        #self.best = sorted(word_score_dict.iteritems(), key=lambda (w,s): s, reverse=True)
        self.best = sorted(word_score_dict.iteritems(), key=operator.itemgetter(1), reverse=True)

    def feature_extraction(self):
        self.pos_feat_extraction()
        self.neg_feat_extraction()

    def apply_preprocessing(self, review):
        cleaned_review = []
        for word in review.split():
            if word and word not in self.stopwords_set:
                cleaned_review.append(word.lower().strip())
        return cleaned_review

    def pos_feat_extraction(self):
        self.selected_pos_feats = []
        # Select positive features
        for review in self.pos_reviews_list:
            review_words = self.apply_preprocessing(review)
            # Top n best pos features are selected
            if self.best_feat:
                selected_review_words = self.feature_selection(review_words)
            # All pos features are selected
            elif self.all_feat:
                selected_review_words = self.all_feature_selection(review_words)
            # Top k bigram features are selected along with top n best pos features
            elif self.bigram_active:
                selected_review_words = self.bigram_feature_selection(review_words)
            self.selected_pos_feats.append((selected_review_words, 'pos'))

    def neg_feat_extraction(self):
        self.selected_neg_feats = []
        # Select negative features
        for review in self.neg_reviews_list:
            review_words = self.apply_preprocessing(review)
            # Top n best neg features are selected
            if self.best_feat:
                selected_review_words = self.feature_selection(review_words)
            # All neg features are selected
            elif self.all_feat:
                selected_review_words = self.all_feature_selection(review_words)
            # Top k bigram features are selected along with the top n best neg features
            elif self.bigram_active:
                selected_review_words = self.bigram_feature_selection(review_words)
            self.selected_neg_feats.append((selected_review_words, 'neg'))

    def classification(self):
        # Use 75% of the data for training and 25% for testing
        pos_feats_count = int(len(self.selected_pos_feats) * 0.75)
        neg_feats_count = int(len(self.selected_neg_feats) * 0.75)
        pos_train_features = self.selected_pos_feats[:pos_feats_count]
        neg_train_features = self.selected_neg_feats[:neg_feats_count]
        pos_test_features = self.selected_pos_feats[pos_feats_count:]
        neg_test_features = self.selected_neg_feats[neg_feats_count:]
        self.train_features = pos_train_features + neg_train_features
        self.test_features = pos_test_features + neg_test_features

        # Naive Bayes classification
        self.NaiveBayesClassification(self.train_features, self.test_features)
        # Support vector machine - linear support vector classification
        self.SVMClassification(self.train_features, self.test_features)
        self.testing("75-25")

    def NaiveBayesClassification(self, train_features, test_features):
        # Training and finding accuracy of the Naive Bayes classifier
        # Training
        self.nb_classifier = NaiveBayesClassifier.train(train_features)
        # Testing
        #print '\n ACCURACY - NAIVE BAYES CLASSIFIER: %s \n' % (nltk.classify.util.accuracy(self.nb_classifier, test_features))
        #self.nb_classifier.show_most_informative_features()

    def SVMClassification(self, train_features, test_features):
        # Training and finding accuracy of the SVM (Linear SVC) classifier
        test_feat_list = []
        test_feat_labels_list = []
        # Training
        self.svm_classifier = SklearnClassifier(LinearSVC())
        self.svm_classifier.train(train_features)
        # Testing
        for test_feat in test_features:
            test_feat_list.append(test_feat[0])
            test_feat_labels_list.append(test_feat[1])
        svm_test = self.svm_classifier.batch_classify(test_feat_list)
        #print classification_report(test_feat_labels_list, svm_test, labels=['pos','neg'], target_names=['pos', 'neg'])

    def testing(self, iteration):
        # Finding precision, recall and F-measure for both classifiers
        # Naive Bayes classification
        print "NAIVE BAYES - ITERATION %s" % (iteration)
        actual_pol_dict, predicted_pol_dict = self.get_actual_and_predicted_polarity_dict(self.nb_classifier)
        pos_precision, neg_precision = self.find_precision(actual_pol_dict, predicted_pol_dict)
        pos_recall, neg_recall = self.find_recall(actual_pol_dict, predicted_pol_dict)
        self.find_fmeasure(pos_precision, neg_precision, pos_recall, neg_recall)

        print "SVM - Linear SVC - ITERATION %s" % (iteration)
        # Support Vector Machine - Linear SVC classification
        actual_pol_dict, predicted_pol_dict = self.get_actual_and_predicted_polarity_dict(self.svm_classifier)
        pos_precision, neg_precision = self.find_precision(actual_pol_dict, predicted_pol_dict)
        pos_recall, neg_recall = self.find_recall(actual_pol_dict, predicted_pol_dict)
        self.find_fmeasure(pos_precision, neg_precision, pos_recall, neg_recall)

    def cross_validation(self):
        # 10-fold cross validation for both classifiers
        pos_feats_count = len(self.selected_pos_feats)
        neg_feats_count = len(self.selected_neg_feats)
        pos_feats_fold_size = int(pos_feats_count / 10)
        neg_feats_fold_size = int(neg_feats_count / 10)
        for a in range(10):
            start_pos_feats_test_index = a * pos_feats_fold_size
            end_pos_feats_test_index = start_pos_feats_test_index + pos_feats_fold_size
            start_neg_feats_test_index = a * neg_feats_fold_size
            end_neg_feats_test_index = start_neg_feats_test_index + neg_feats_fold_size

            pos_test_features = self.selected_pos_feats[start_pos_feats_test_index:end_pos_feats_test_index]
            neg_test_features = self.selected_neg_feats[start_neg_feats_test_index:end_neg_feats_test_index]
            pos_train_features = self.selected_pos_feats[:start_pos_feats_test_index]
            pos_train_features += self.selected_pos_feats[end_pos_feats_test_index:]
            neg_train_features = self.selected_neg_feats[:start_neg_feats_test_index]
            neg_train_features += self.selected_neg_feats[end_neg_feats_test_index:]

            train_features = pos_train_features + neg_train_features
            test_features = pos_test_features + neg_test_features

            # Naive Bayes classification
            self.NaiveBayesClassification(train_features, test_features)
            # SVM classification
            self.SVMClassification(train_features, test_features)
            self.testing(a)

    def get_actual_and_predicted_polarity_dict(self, classifier):
        # Collect the actual and predicted polarity of each test instance
        actual_polarity_dict = {}
        predicted_polarity_dict = {}
        for i, (features, label) in enumerate(self.test_features):
            actual_polarity_dict.setdefault(label, set()).add(i)
            predicted_polarity = classifier.classify(features)
            predicted_polarity_dict.setdefault(predicted_polarity, set()).add(i)
        return (actual_polarity_dict, predicted_polarity_dict)

    def find_precision(self, actual_polarity_dict, predicted_polarity_dict):
        # Finding precision values
        pos_precision = self.pos_precision(actual_polarity_dict, predicted_polarity_dict)
        neg_precision = self.neg_precision(actual_polarity_dict, predicted_polarity_dict)
        return (pos_precision, neg_precision)

    def pos_precision(self, actual_polarity_dict, predicted_polarity_dict):
        pos_val_precision = nltk.metrics.precision(actual_polarity_dict['pos'], predicted_polarity_dict['pos'])
        print "Pos values precision %s" % (pos_val_precision)
        return pos_val_precision

    def neg_precision(self, actual_polarity_dict, predicted_polarity_dict):
        neg_val_precision = nltk.metrics.precision(actual_polarity_dict['neg'], predicted_polarity_dict['neg'])
        print "Neg values precision %s\n" % (neg_val_precision)
        return neg_val_precision

    def find_recall(self, actual_polarity_dict, predicted_polarity_dict):
        # Finding recall values
        pos_recall = self.pos_recall(actual_polarity_dict, predicted_polarity_dict)
        neg_recall = self.neg_recall(actual_polarity_dict, predicted_polarity_dict)
        return (pos_recall, neg_recall)

    def pos_recall(self, actual_polarity_dict, predicted_polarity_dict):
        pos_val_recall = nltk.metrics.recall(actual_polarity_dict['pos'], predicted_polarity_dict['pos'])
        print "Pos values recall %s" % (pos_val_recall)
        return pos_val_recall

    def neg_recall(self, actual_polarity_dict, predicted_polarity_dict):
        neg_val_recall = nltk.metrics.recall(actual_polarity_dict['neg'], predicted_polarity_dict['neg'])
        print "Neg values recall %s\n" % (neg_val_recall)
        return neg_val_recall

    def find_fmeasure(self, pos_precision, neg_precision, pos_recall, neg_recall):
        # Finding F-measure
        pos_f_measure = self.pos_fmeasure(pos_precision, pos_recall)
        neg_f_measure = self.neg_fmeasure(neg_precision, neg_recall)
        print "Average F-measure %s\n" % ((pos_f_measure + neg_f_measure)/2)

    def pos_fmeasure(self, pos_precision, pos_recall):
        pos_fmeasure_val = 2 * (pos_precision * pos_recall) / float(pos_precision + pos_recall)
        print "F-measure for pos val %s" % (pos_fmeasure_val)
        return pos_fmeasure_val

    def neg_fmeasure(self, neg_precision, neg_recall):
        neg_fmeasure_val = 2 * (neg_precision * neg_recall) / float(neg_precision + neg_recall)
        print "F-measure for neg val %s" % (neg_fmeasure_val)
        return neg_fmeasure_val
def score(classifier):
    classifier = SklearnClassifier(classifier)
    classifier.train(trainset)
    pred = classifier.batch_classify(test)
    return accuracy_score(tag_test, pred)
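# A minimal sketch of how score could be used to compare several scikit-learn estimators;
# it assumes trainset, test and tag_test already exist as in the function above, and the
# particular estimators chosen here are illustrative, not prescribed by the original code.
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC

for clf in [BernoulliNB(), MultinomialNB(), LinearSVC()]:
    print clf.__class__.__name__, score(clf)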
from sklearn.pipeline import Pipeline

DATA_PATH = '../../datasets/sentiment_analysis/en/rt-polaritydata'

def word_feats(words):
    return dict([(word, True) for word in sent_tokenize(words)])

add_label = lambda lst, lab: [(x, lab) for x in lst]

pipeline = Pipeline([('tfidf', TfidfTransformer()),
                     ('chi2', SelectKBest(chi2, k=1000)),
                     ('nb', MultinomialNB())])
classifier = SklearnClassifier(pipeline)

pos = map(word_feats, open(os.path.join(DATA_PATH, 'rt-polarity.pos')).readlines())
neg = map(word_feats, open(os.path.join(DATA_PATH, 'rt-polarity.neg')).readlines())

features = zip(pos[:len(pos) / 2], itertools.repeat("pos")) + \
           zip(neg[:len(neg) / 2], itertools.repeat("neg"))

classifier.train(features)

l_pos = np.array(classifier.batch_classify(pos[len(pos) / 2:]))
l_neg = np.array(classifier.batch_classify(neg[len(neg) / 2:]))

print "Confusion matrix:\n%d\t%d\n%d\t%d" % (
    (l_pos == 'pos').sum(), (l_pos == 'neg').sum(),
    (l_neg == 'pos').sum(), (l_neg == 'neg').sum())
def score(classifier):
    classifier = SklearnClassifier(classifier)   # use the scikit-learn wrapper provided by NLTK
    classifier.train(train)                      # train the classifier
    pred = classifier.batch_classify(testSet)    # classify the development test set and return the predicted labels
    return accuracy_score(tag_test, pred)        # compare predictions against the manually annotated labels to get the accuracy
# label set
cls_set = ['Emotion', 'ynQuestion', 'yAnswer', 'Continuer', 'whQuestion',
           'System', 'Accept', 'Clarify', 'Emphasis', 'nAnswer',
           'Greet', 'Statement', 'Reject', 'Bye', 'Other']
featuresets = []  # list of tuples of the form (features, label index)

for post in posts:
    # applying the feature extractor to each post
    # post.get('class') is the label of the current post
    featuresets.append((dialogue_act_features(post.text), cls_set.index(post.get('class'))))

from random import shuffle
shuffle(featuresets)
size = int(len(featuresets) * .1)  # 10% is used for the test set
train = featuresets[size:]
test = featuresets[:size]
print(train)

from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier

# SVM with a Linear Kernel and default parameters
classif = SklearnClassifier(LinearSVC())
classif.train(train)

test_skl = []
t_test_skl = []
for d in test:
    test_skl.append(d[0])
    t_test_skl.append(d[1])

# run the classifier on the test set
p = classif.batch_classify(test_skl)

from sklearn.metrics import classification_report
# getting a full report
print(classification_report(t_test_skl, p, labels=list(set(t_test_skl)), target_names=cls_set))
def evaluate(data_split, model_file, d, rnn_feats=True, bow_feats=False, rel_feats=False):
    stop = stopwords.words('english')
    vocab, rel_list, ans_list, tree_dict = \
        cPickle.load(open(data_split, 'rb'))
    train_trees = tree_dict['train'] + tree_dict['dev']
    test_trees = tree_dict['test'] + tree_dict['devtest']

    params, vocab, rel_list = cPickle.load(open(model_file, 'rb'))
    (rel_dict, Wv, b, We) = params

    data = [train_trees, test_trees]

    # get rid of trees that the parser messed up on
    for sn, split in enumerate(data):
        bad_trees = []
        for ind, tree in enumerate(split):
            if tree.get(0).is_word == 0:
                # print tree.get_words()
                bad_trees.append(ind)
                continue
        # print 'removed', len(bad_trees)
        for ind in bad_trees[::-1]:
            split.pop(ind)

    # adding lookup
    ans_list = array([vocab.index(ans) for ans in ans_list])
    for split in data:
        for tree in split:
            for node in tree.get_nodes():
                node.vec = We[:, node.ind].reshape((d, 1))
            tree.ans_list = ans_list[ans_list != tree.ans_ind]

    train_q, test_q = collapse_questions(train_trees, test_trees)
    # print 'number of training questions:', len(train_q)
    # print 'number of testing questions:', len(test_q)

    train_feats = []
    test_feats = []
    test_ord = []

    for tt, split in enumerate([train_q, test_q]):
        # if tt == 0:
        #     print 'processing train'
        # else:
        #     print 'processing test'

        # for each question in the split
        for qid in split:
            q = split[qid]
            ave = zeros((d, 1))
            words = zeros((d, 1))
            bow = []
            count = 0.
            curr_ave = None
            curr_words = None

            # for each sentence in the question, generate features
            for i in range(0, len(q)):
                try:
                    tree = q[i]
                except:
                    continue

                forward_prop(params, tree, d, labels=False)

                # features: average of hidden representations and average of word embeddings
                for ex, node in enumerate(tree.get_nodes()):
                    if node.word not in stop:
                        ave += node.p_norm
                        words += node.vec
                        count += 1.

                if count > 0:
                    curr_ave = ave / count
                    curr_words = words / count

                featvec = concatenate([curr_ave.flatten(), curr_words.flatten()])
                curr_feats = {}

                # add QANTA's features to the current feature set
                if rnn_feats:
                    for dim, val in ndenumerate(featvec):
                        curr_feats['__' + str(dim)] = val

                # add unigram indicator features to the current feature set
                if bow_feats:
                    bow += [l.word for l in tree.get_nodes()]
                    for word in bow:
                        curr_feats[word] = 1.0

                # add dependency relation indicator features to the current feature set
                if rel_feats:
                    for l in tree.get_nodes():
                        if len(l.parent) > 0:
                            par, rel = l.parent[0]
                            this_rel = l.word + '__' + rel + '__' + tree.get(par).word
                            curr_feats[this_rel] = 1.0

                if tt == 0:
                    train_feats.append((curr_feats, tree.ans.lower()))
                else:
                    test_feats.append((curr_feats, tree.ans.lower()))
                    test_ord.append(tree)

    # print 'total training instances:', len(train_feats)
    # print 'total testing instances:', len(test_feats)

    # can modify this classifier / do grid search on regularization parameter using sklearn
    classifier = SklearnClassifier(LogisticRegression(C=10))
    classifier.train(train_feats)

    print 'accuracy train:', nltk.classify.util.accuracy(classifier, train_feats)
    print 'accuracy test:', nltk.classify.util.accuracy(classifier, test_feats)
    print ''

    # finer-grained evaluation, see how well QANTA does at each sentence position
    pred = classifier.batch_classify([fs for (fs, l) in test_feats])
    count_dists = Counter()
    corr_dists = Counter()

    for ind, tree in enumerate(test_ord):
        curr_dist = tree.dist
        count_dists[curr_dist] += 1.0
        label = tree.ans
        if label == pred[ind]:
            corr_dists[curr_dist] += 1.0

    prob_dists = {}

    print 'sentence position: correctly answered at that position, total sentences at that position,',\
        'accuracy'
    for key in corr_dists:
        prob_dists[key] = corr_dists[key] / count_dists[key]
        print key, ': ', corr_dists[key], count_dists[key], prob_dists[key]
pred_NB = cf.batch_classify(test_feat)
#results = [cf.classify(test[a][0]) for a in range(size)]
#gold = [test[a][1] for a in range(size)]
cm_NB = nltk.ConfusionMatrix(test_tag, pred_NB)
print cm_NB.pp(sort_by_count=True, show_percents=False, truncate=10)

# create structures for classification
test_doc = [a[0] for a in test]

# build, train, and test classifiers
from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier

sv = SklearnClassifier(LinearSVC())
sv.train(train)

# note that train performance matches tmp.sum()
pred_train_sv = sv.batch_classify(train_feat)
nltk.ConfusionMatrix(train_tag, pred_train_sv)

# also test performance matches tmp_test.sum()
pred_sv = sv.batch_classify(test_feat)

# confusion matrices
cmsv = nltk.ConfusionMatrix(test_tag, pred_sv)
print cmsv.pp(sort_by_count=True, show_percents=False, truncate=5)

# some SklearnClassifier internals
featsets, labs = zip(*train)
X = sv._convert(featsets)
import numpy
y = numpy.array([sv._label_index[l] for l in labs])
# then to train one would use sv._clf.fit(X, y)

#-------------------------------------
# To vectorize/classify all in sklearn
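# The closing comment above points at doing the vectorization and classification directly
# in scikit-learn. A minimal sketch of that idea, reusing featsets, labs and test from the
# snippet above; DictVectorizer with LinearSVC is one plausible choice here, not necessarily
# what the original author went on to use.
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import LinearSVC

vec = DictVectorizer()
X_train = vec.fit_transform(featsets)                   # same feature dicts as above
clf = LinearSVC().fit(X_train, labs)
X_test = vec.transform([feats for feats, tag in test])
pred_sk = clf.predict(X_test)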
class MNBayes(text_classifier.TextClassifier):
    def __init__(self, trainDir, labelFile, numTrees=10):
        self.classifier = None
        self.labelFile = labelFile
        self.trainingDir = trainDir
        self.labels = None
        self.all_words = None
        self.numTrees = numTrees
        self.classifier = SklearnClassifier(MultinomialNB())
        #self.labels = training.setup(labelFile)
        #self.train()

    def train(self):
        feature_sets = self.getFeatures()
        self.classifier.train(feature_sets)

    def trainingError(self):
        """ Determines training error """
        feature_sets = self.getFeatures()
        p = nltk.classify.accuracy(self.classifier, feature_sets)
        return p

    def kfoldCrossValidation(self, k):
        """ Make sure that the algorithm works on training data
            using a k fold cross validation scheme """
        feature_sets = self.getFeatures()
        error = 0
        for i in range(k):
            self.classifier = SklearnClassifier(MultinomialNB())
            n = len(feature_sets)/k
            test_set1 = feature_sets[:n*i]
            train_set = feature_sets[n*i:n*(i+1)]
            test_set2 = feature_sets[n*(i+1):]
            test_set = test_set1 + test_set2
            self.classifier.train(train_set)
            p = nltk.classify.accuracy(self.classifier, test_set)
        return p

    def leave1OutCrossValidation(self):
        """ Make sure that the algorithm works on training data
            using a leave one out cross validation scheme """
        error = 0
        feature_sets = self.getFeatures()
        N = len(feature_sets)
        for i in range(N):
            self.classifier = SklearnClassifier(MultinomialNB())
            train_set1, test_set, train_set2 = feature_sets[:i], feature_sets[i], feature_sets[i+1:]
            train_set = train_set1 + train_set2
            test_set = [test_set]
            self.classifier.train(train_set)
            p = nltk.classify.accuracy(self.classifier, test_set)
            error += p
        return error/N

    def learningCurve(self, numTrials=4):
        """ Construct a learning curve to see if there is overfitting """
        accuracies = []
        feature_sets = self.getFeatures()
        for k in xrange(1, len(feature_sets)-1):
            total = 0
            for i in xrange(numTrials):
                self.classifier = SklearnClassifier(MultinomialNB())
                random.shuffle(feature_sets)
                train_set, test_set = feature_sets[:k], feature_sets[k:]
                self.classifier.train(train_set)
                p = nltk.classify.accuracy(self.classifier, test_set)
                total += p
            accuracies.append(total/numTrials)
        return accuracies

    def testClassify(self, k):
        """ Train on only k features and return training labels and predicted labels """
        feature_sets = self.getFeatures()
        random.shuffle(feature_sets)
        self.classifier = SklearnClassifier(MultinomialNB())
        self.classifier.train(feature_sets[k:])
        features, ref_labels = zip(*feature_sets[:k])
        pred_labels = self.classifier.classify_many(features)
        return ref_labels, pred_labels

    def confusionMatrix(self, ref, test):
        """ nltk confusion matrix """
        ref.sort(key=lambda x: x[0])
        test.sort(key=lambda x: x[0])
        _, ref_labels = zip(*ref)
        _, test_labels = zip(*test)
        cm = ConfusionMatrix(ref_labels, test_labels)
        return cm

    def classify(self, db, fastain):
        """ Classifies proteins based on their text """
        proIDs, features, labels = [], [], []
        prevFeatureset = ''
        prevText = ''
        for seq_record in SeqIO.parse(fastain, "fasta"):
            title = seq_record.id
            toks = title.split("|")
            proteinID = toks[5]
            query_rows = genbank.proteinQuery(proteinID, db)
            ids, text = zip(*query_rows)
            text = ''.join(map(str, text))
            if text == '':
                label = ['na']
            else:
                text = word_reg.findall(text)
                featureset = self.gene_features(text)
                assert text != prevText
                assert featureset != prevFeatureset
                prevFeatureset = featureset
                prevText = text
                label = self.classifier.batch_classify([featureset])
            proIDs.append(proteinID)
            labels += label
        return zip(proIDs, labels)
class RForests(text_classifier.TextClassifier):
    def __init__(self, trainDir, labelFile, numTrees=10, numJobs=1):
        self.classifier = None
        self.labelFile = labelFile
        self.trainingDir = trainDir
        self.labels = None
        self.all_words = None
        self.numTrees = numTrees
        self.numJobs = numJobs
        self.classifier = SklearnClassifier(RandomForestClassifier(
            n_estimators=self.numTrees,
            n_jobs=numJobs), sparse=False)
        #self.labels = training.setup(labelFile)
        #self.train()

    def train(self):
        feature_sets = self.getFeatures()
        self.classifier.train(feature_sets)

    def trainingError(self):
        """ Determines training error """
        feature_sets = self.getFeatures()
        p = nltk.classify.accuracy(self.classifier, feature_sets)
        return p

    def kfoldCrossValidation(self, k):
        """ Make sure that the algorithm works on training data
            using a k fold cross validation scheme """
        feature_sets = self.getFeatures()
        error = 0
        for i in range(k):
            self.classifier = SklearnClassifier(RandomForestClassifier(
                n_estimators=self.numTrees), sparse=False)
            n = len(feature_sets)/k
            test_set1 = feature_sets[:n*i]
            train_set = feature_sets[n*i:n*(i+1)]
            test_set2 = feature_sets[n*(i+1):]
            test_set = test_set1 + test_set2
            self.classifier.train(train_set)
            p = nltk.classify.accuracy(self.classifier, test_set)
        return p

    def leave1OutCrossValidation(self):
        """ Make sure that the algorithm works on training data
            using a leave one out cross validation scheme """
        error = 0
        feature_sets = self.getFeatures()
        N = len(feature_sets)
        for i in range(N):
            self.classifier = SklearnClassifier(RandomForestClassifier(
                n_estimators=self.numTrees), sparse=False)
            train_set1, test_set, train_set2 = feature_sets[:i], feature_sets[i], feature_sets[i+1:]
            train_set = train_set1 + train_set2
            test_set = [test_set]
            self.classifier.train(train_set)
            p = nltk.classify.accuracy(self.classifier, test_set)
            error += p
        return error/N

    def learningCurve(self, numTrials=4):
        """ Construct a learning curve to see if there is overfitting """
        accuracies = []
        feature_sets = self.getFeatures()
        for k in xrange(1, len(feature_sets)-1):
            total = 0
            for i in xrange(numTrials):
                self.classifier = SklearnClassifier(RandomForestClassifier(
                    n_estimators=self.numTrees), sparse=False)
                random.shuffle(feature_sets)
                train_set, test_set = feature_sets[:k], feature_sets[k:]
                self.classifier.train(train_set)
                p = nltk.classify.accuracy(self.classifier, test_set)
                print len(train_set), len(test_set), p
                total += p
            accuracies.append(total/numTrials)
        return accuracies

    def testClassify(self, k):
        """ Train on only k features and return training labels and predicted labels """
        feature_sets = self.getFeatures()
        random.shuffle(feature_sets)
        self.classifier = SklearnClassifier(RandomForestClassifier(
            n_estimators=self.numTrees), sparse=False)
        self.classifier.train(feature_sets[k:])
        features, ref_labels = zip(*feature_sets[:k])
        pred_labels = self.classifier.batch_classify(features)
        return ref_labels, pred_labels

    def confusionMatrix(self, ref, test):
        """ nltk confusion matrix """
        ref.sort(key=lambda x: x[0])
        test.sort(key=lambda x: x[0])
        _, ref_labels = zip(*ref)
        _, test_labels = zip(*test)
        cm = ConfusionMatrix(ref_labels, test_labels)
        return cm

    def prob_classify(self, db, fastain):
        proIDs, pds, labels = [], [], []
        prevFeatureset = ''
        prevText = ''
        for seq_record in SeqIO.parse(fastain, "fasta"):
            title = seq_record.id
            toks = title.split("|")
            proteinID = toks[5]
            query_rows = genbank.proteinQuery(proteinID, db)
            ids, text = zip(*query_rows)
            text = ''.join(map(str, text))
            if text == '':
                label = ['na']
                pd = None
            else:
                text = word_reg.findall(text)
                featureset = self.gene_features(text)
                assert text != prevText
                assert featureset != prevFeatureset
                prevFeatureset = featureset
                prevText = text
                label = self.classifier.batch_classify([featureset])
                pd = self.classifier.prob_classify(featureset)
            proIDs.append(proteinID)
            pds.append(pd)
            labels += label
        return proIDs, labels, pds

    def classifyPickle(self, pickle, fastain):
        """ Classifies proteins based on their text from a pickled GenBank table """
        proIDs, features, labels = [], [], []
        prevFeatureset = ''
        prevText = ''
        gbkTable = genbank.GenBankTable()
        gbkTable.load(pickle)
        for seq_record in SeqIO.parse(fastain, "fasta"):
            title = seq_record.id
            toks = title.split("|")
            locus_tag = toks[5]
            text = gbkTable.getLocusText(locus_tag)
            if text == '':
                label = 'na'
            else:
                text = word_reg.findall(text)
                featureset = self.gene_features(text)
                #assert text != prevText
                #assert featureset != prevFeatureset
                prevFeatureset = featureset
                prevText = text
                label = self.classifier.classify(featureset)
                #print label, text
            proIDs.append(locus_tag)
            labels.append(label)
        return zip(proIDs, labels)

    def classifyDB(self, db, fastain):
        """ Classifies proteins based on their text from an sqlite3 database """
        proIDs, features, labels = [], [], []
        prevFeatureset = ''
        prevText = ''
        for seq_record in SeqIO.parse(fastain, "fasta"):
            title = seq_record.id
            toks = title.split("|")
            locus_tag = toks[5]
            locus_rows = genbank_sqlite3.locusQuery(locus_tag, db)
            protein_rows = []
            for row in locus_rows:
                locus, proteinID = row
                query_rows = genbank_sqlite3.proteinQuery(proteinID, db)
                protein_rows += query_rows
            #print len(protein_rows), locus_tag
            if len(protein_rows) == 0:
                label = 'na'
            else:
                ids, text = zip(*protein_rows)
                text = ''.join(map(str, text))
                if text == '':
                    label = 'na'
                else:
                    text = word_reg.findall(text)
                    featureset = self.gene_features(text)
                    #assert text != prevText
                    #assert featureset != prevFeatureset
                    prevFeatureset = featureset
                    prevText = text
                    label = self.classifier.classify(featureset)
                    #print label, text
            proIDs.append(locus_tag)
            labels.append(label)
        return zip(proIDs, labels)

    def classify(self, dbin, fastain, type='sqlite3'):
        if type == 'sqlite3':
            return self.classifyDB(dbin, fastain)
        else:
            return self.classifyPickle(dbin, fastain)