def test_bernoullinb_returns_correct_result(self):
    train_data = [({"a": 4, "b": 1, "c": 0}, "ham"),
                  ({"a": 5, "b": 2, "c": 1}, "ham"),
                  ({"a": 0, "b": 3, "c": 4}, "spam"),
                  ({"a": 5, "b": 1, "c": 1}, "ham"),
                  ({"a": 1, "b": 4, "c": 3}, "spam")]
    classif = SklearnClassifier(BernoulliNB()).train(train_data)
    test_data = [{"a": 3, "b": 2, "c": 1}, {"a": 0, "b": 3, "c": 7}]
    ccm = classif.classify_many(test_data)
    self.assertEqual(ccm, ['ham', 'spam'])
def test_svc_returns_correct_result(self):
    train_data = [({"a": 4, "b": 1, "c": 0}, "ham"),
                  ({"a": 5, "b": 2, "c": 1}, "ham"),
                  ({"a": 0, "b": 3, "c": 4}, "spam"),
                  ({"a": 5, "b": 1, "c": 1}, "ham"),
                  ({"a": 1, "b": 4, "c": 3}, "spam")]
    classif = SklearnClassifier(SVC(), sparse=False).train(train_data)
    test_data = [{"a": 3, "b": 2, "c": 1}, {"a": 0, "b": 3, "c": 7}]
    ccm = classif.classify_many(test_data)
    self.assertEqual(ccm, ['ham', 'spam'])
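# A minimal, self-contained sketch of the wrapper round trip the two tests
# above exercise. The feature dicts and labels here are invented toy values,
# not from the test suite; SklearnClassifier also exposes prob_classify_many
# for per-label probabilities.
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB

toy_train = [({"hot": 1, "cold": 0}, "summer"),
             ({"hot": 0, "cold": 1}, "winter")]
toy_clf = SklearnClassifier(BernoulliNB()).train(toy_train)

# classify_many returns one label per feature dict...
print(toy_clf.classify_many([{"hot": 1, "cold": 0}]))
# ...and prob_classify_many returns a probability distribution per sample.
for dist in toy_clf.prob_classify_many([{"hot": 1, "cold": 0}]):
    print(dist.prob("summer"), dist.prob("winter"))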
def read(filename):
    # Build a tiny word2vec model over the vocabulary file
    # (gensim's pre-4.0 API: `size` and direct `voc_vec[key]` lookup).
    with open(filename, "r") as fp:
        vocab = [s.encode('utf-8').split() for s in fp.readlines()]
    voc_vec = word2vec.Word2Vec(vocab, min_count=1, size=4)

    # Read the test data; each token is "word|tag".
    with open("test_data.txt", "r") as fp:
        tokens = nltk.word_tokenize(fp.read())

    D = OrderedDict()
    sentences = []
    for word in tokens[0:200]:
        D[word.split("|")[0]] = word.split("|")[1]
        sentences.append(word.split("|")[0])

    # Map the four word2vec dimensions to features a-d. Note that
    # BernoulliNB binarizes its inputs (threshold 0.0 by default), so these
    # continuous values are reduced to sign information.
    train_data = []
    for key in D:
        l = voc_vec[key]
        x = {'a': l[0], 'b': l[1], 'c': l[2], 'd': l[3]}
        train_data.append((x, D[key]))
    classif = SklearnClassifier(BernoulliNB()).train(train_data)

    # Build the test samples and their expected tags the same way.
    test_data = []
    D2 = OrderedDict()
    for word in tokens[200:300]:
        D2[word.split("|")[0]] = word.split("|")[1]
    expected_list = []
    for key in D2:
        l = voc_vec[key]
        x = {'a': l[0], 'b': l[1], 'c': l[2], 'd': l[3]}
        test_data.append(x)
        expected_list.append(D2[key])

    predicted = classif.classify_many(test_data)
    print(len(predicted))
    print(len(expected_list))
    print(accuracy_score(expected_list, predicted, normalize=False))
do_evaluation(pairs)
do_evaluation(pairs, pos_cls='neg')

#%% Other classifier: SVM ###################################################
# http://www.nltk.org/howto/classify.html
# Run example
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC

t0 = time.time()
classif = SklearnClassifier(SVC(), sparse=False).train(train_set)
print(round(time.time() - t0, 2))
classif.classify_many(test_set[0][0])

sizeTrain = [800]      # the first 100, the first 300, etc.
testDoc = [800, 1000]  # 800 to 999
classif.classify_many(test_set[0][0])

#%% SVM Class ################################################################
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC


class SVM:
    """
    SVM
    data = dict (key = pos or neg), value = list of filenames
ngram_truncate(ngram_records, 1000)
print('done truncation')

train_max = 55000
train_data4 = [(x, y) for x, y in zip(ngram_records[:train_max], labels[:train_max])]

# Sweep the inverse regularisation strength C for L1-penalised logistic regression.
for C in [1, 0.8, 0.6, 0.4, 0.2, 0.1, 0.08, 0.06, 0.04, 0.02]:
    print('C=', C)
    classifier4 = SklearnClassifier(LogisticRegression(C=C, penalty='l1'),
                                    sparse=False).train(train_data4)
    # Held-out accuracy on the records beyond train_max.
    val_labels4 = classifier4.classify_many(ngram_records[train_max:])
    aa = [x == y for x, y in zip(val_labels4, labels[train_max:])]
    print(np.mean(aa))
    accu.append(np.mean(aa))
    # Training-set accuracy, to gauge over/under-fitting.
    train_labels4 = classifier4.classify_many(ngram_records[:train_max])
    aatr = [x == y for x, y in zip(train_labels4, labels[:train_max])]
    print(np.mean(aatr))
    classifiers.append(classifier4)

# with open('classifiers_temp_b0p03_v5_n6_l1_sweep_th1000_scale.pkl', 'wb') as f:
#     pickle.dump(classifiers, f)
with open('DEBU_classifiers_temp_b0p1_v5_n6_l1_VFB_sweep_th1000_scale.pkl', 'wb') as f:
    pickle.dump(classifiers, f)
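# A hedged alternative to the manual C sweep above: let GridSearchCV pick C
# by cross-validation. Sketch only; the feature dicts and labels below are
# invented stand-ins for ngram_records/labels, and newer scikit-learn needs
# solver='liblinear' for the L1 penalty.
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

toy_records = [{"aa": 2, "ab": 1}, {"ba": 3}, {"aa": 1, "ba": 1}, {"ab": 2}]
toy_labels = ["x", "y", "x", "y"]

X = DictVectorizer().fit_transform(toy_records)
grid = GridSearchCV(
    LogisticRegression(penalty='l1', solver='liblinear'),
    param_grid={'C': [1, 0.8, 0.6, 0.4, 0.2, 0.1, 0.08, 0.06, 0.04, 0.02]},
    cv=2,  # tiny fold count only because the toy set has four samples
)
grid.fit(X, toy_labels)
print(grid.best_params_, grid.best_score_)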
    isNER = False
    w = lancaster_stemmer.stem(word[0])
    features['{}'.format(w.lower()), '{}'.format(word[1]), '{}'.format(isNER)] = 'inference'
    return features

trainFeaturesets0 = [(get_features_basic(post), post.get('value')) for post in trainRoot]
testFeaturesets0 = [(get_features_basic(post), post.get('value')) for post in testRoot]

classifier0 = SklearnClassifier(BernoulliNB()).train(trainFeaturesets0)
actual = [t[1] for t in testFeaturesets0]
prediction = classifier0.classify_many([fs for (fs, l) in testFeaturesets0])
result = zip(actual, prediction)

# Tally the confusion-matrix cells for the binary 'TRUE' label.
truePositive = 0
falseNegative = 0
falsePositive = 0
trueNegative = 0
for a in result:
    if a[0] == 'TRUE':
        if a[1] == 'TRUE':
            truePositive = truePositive + 1
        else:
            falseNegative = falseNegative + 1
    else:
        if a[1] == 'TRUE':
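# The manual tally above reimplements a 2x2 confusion matrix. A hedged
# equivalent with sklearn.metrics (the 'TRUE' label convention mirrors the
# code above; actual/prediction here are toy stand-ins):
from sklearn.metrics import confusion_matrix

toy_actual = ['TRUE', 'TRUE', 'FALSE', 'FALSE']
toy_prediction = ['TRUE', 'FALSE', 'TRUE', 'FALSE']

# With labels=['TRUE', 'FALSE'] the layout is [[TP, FN], [FP, TN]].
tp, fn, fp, tn = confusion_matrix(toy_actual, toy_prediction,
                                  labels=['TRUE', 'FALSE']).ravel()
print(tp, fn, fp, tn)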
#region SVMClassifier
WriteLog("\nEntering SVM", ClassificationLogFile)

trainD = list()
testD = list()
gTruth = list()

# Formatting the data: keep (features, label) pairs for training,
# split features from ground-truth labels for testing.
for dictPair in training_set:
    trainD.append(dictPair)
for dictPair in testing_set:
    testD.append(dictPair[0])
    gTruth.append(dictPair[1])

WriteLog("Starting SVM Training", ClassificationLogFile)
SVMClassifier = SklearnClassifier(SVC(), sparse=False).train(trainD)
SVMPredictions = SVMClassifier.classify_many(testD)
WriteLog("SVM Test Set Accuracy:", ClassificationLogFile)
WriteLog(str(accuracy_score(gTruth, SVMPredictions, normalize=True, sample_weight=None)), ClassificationLogFile)

#SVM Classification
WriteLog("SVM Classification", ClassificationLogFile)
DoClassify(SVMClassifier, SVMtopicResultsTxt, topicTweetsLDATxt)

#SVM Predictions
WriteLog("SVM Predictions:", ClassificationLogFile)
WriteLog(str(SVMPredictions), ClassificationLogFile)
#endregion

#region NaiveBayes
WriteLog("\nNaive Bayes Training", ClassificationLogFile)
def LinSVC():
    classifierLinSVC = SklearnClassifier(LinearSVC(), sparse=False).train(train_set)
    return classifierLinSVC.classify_many(test)
rating_names = [student['name'] for student in ratings]
data_names = list(set([student['Name'] for student in data]))

# Clean text for classifying.
for i, student in enumerate(data):
    text = tech.cleanse(student['Student Comment'])
    data[i]['Student Comment'] = text

# Split into testing and training sets.
n = len(data)
test_idx = random.sample(range(n), int(n * 0.5))
train_idx = set(range(n)) - set(test_idx)
test_set = list(filter(lambda item: item[1], map(extract_featurelabel, [data[i] for i in test_idx])))
train_set = list(filter(lambda item: item[1], map(extract_featurelabel, [data[i] for i in train_idx])))

#classifier = NaiveBayesClassifier.train(train_set)
classif.train(train_set)  # train on the training split, not the test split

# Compute accuracy.
test_data, test_label = zip(*test_set)
train_data, train_label = zip(*train_set)
predictions = classif.classify_many(test_data)
print(confusion_matrix(test_label, predictions))
print(matthews_corrcoef(test_label, predictions))
'''
#Only works with a built-in NLTK classifier:
print('Accuracy: {0:.2f}%'.format(100 * nltk.classify.accuracy(classif, test_set)))
classif.show_most_informative_features(20)
'''
def NBtfidf():
    classifierTF = SklearnClassifier(pipeline).train(train_set)
    return classifierTF.classify_many(test)
""" Linear (Bernoulli) SVC Implementation of Support Vector Machine classifier using libsvm: the kernel can be non-linear but its SMO algorithm does not scale to large number of samples as LinearSVC does. """ from nltk.classify import SklearnClassifier from sklearn.naive_bayes import BernoulliNB from sklearn.svm import SVC print " " print "=============================" print "Bernoulli SVC Classifier:" classifierBi = SklearnClassifier(BernoulliNB()).train(train_set) classifierBi.classify_many(test) for pdist in classifierBi.prob_classify_many(test): print pdist.prob("human"), pdist.prob("auto") for i in range(len(classifierBi.classify_many(test))): print classifierBi.classify_many(test)[i] classifierSVC = SklearnClassifier(SVC(), sparse=True).train(train_set) classifierSVC.classify_many(test) # svc = nltk.classify.accuracy(classifierSVC, test_set) # print 'accuracy is %.2f' %round(svc*100,4), '%' def SVC(): classifierBi = SklearnClassifier(BernoulliNB()).train(train_set) return classifierSVC.classify_many(test)
class MyClassifier:
    def __init__(self, load_clf=False, load_tr_data=False):
        self.features = self.__load_support_vector_features()
        self.training_data = []
        self.n_samples = 0
        self.all_tweets = self.__load_tweets_from_file()  # list, not dict

        # Classifier loading
        if load_clf:
            self.load_clf()
        else:
            self.clf = SklearnClassifier(SVC(), sparse=False)

        # Training data loading
        if load_tr_data:
            self.__load_training_data()

    def __load_tweets_from_file(self):
        # Open the latest raw training-data file.
        list_of_files = glob.glob("datasets_twitter/twitter_training_data_raw*.txt")
        latest_file = max(list_of_files, key=os.path.getctime)
        tweet_list = []
        with open(latest_file, "r", encoding="UTF-8") as f:
            for line in f:
                line = line.split("%\t%")
                tweet_text, tweet_id = line[0], line[1]
                tweet_list.append((tweet_text, tweet_id))
        return tweet_list

    def __load_support_vector_features(self):
        # Read all feature names, one per line.
        support_vector_features = []
        with open("verifiability_features.txt", "r") as feature_f:
            for line_f in feature_f:
                support_vector_features.append(line_f.replace("\n", ""))
        return support_vector_features

    def __get_sample(self, text_str):
        """
        Changes text_str into a sample of data in the form [0, 0, 0, ...],
        used by the classifier when 1) assembling training data and
        2) testing data. Returns a list of ints counting how many of each
        feature occurred in text_str.

        :param text_str: a string of text which is to be verified
        :return: curr_sample, a list of ints mapped to self.features
        """
        tokens = pos_tag(word_tokenize(text_str))
        curr_sample = [0] * len(self.features)  # one counter per feature
        for token in tokens:
            t_text, t_feature = token[0], token[1]
            try:
                for index in range(len(self.features)):
                    if t_feature == self.features[index]:
                        # The last feature tracks "?" and decrements instead
                        # of incrementing.
                        if self.features[index] == self.features[-1]:
                            if token[0] == "?":
                                curr_sample[index] -= 1
                                break
                        else:
                            curr_sample[index] += 1
                            break
            except IndexError:
                # The feature isn't in the sv_features list.
                pass
        return curr_sample

    def __get_training_target(self, sample):
        """
        Returns the label for the given sample.

        :param sample: int[] from self.__get_sample()
        :return: "VER" or "NVER", for Verifiable and Non-Verifiable
        """
        # A negative value means a "?" was found (the only reason there would
        # be a negative value in the sample), which forces NVER.
        t_sum = 0
        for v in sample:
            if v < 0:
                t_sum = -1
                break
            t_sum += v
        if t_sum > 0:
            return "VER"
        return "NVER"

    def __assemble_training_data(self):
        """
        Construct the training data using the twitter training data set.
        To be used directly prior to training the classifier.
        :return:
        """
        for tweet in self.all_tweets:
            # Get the sample and target for each tweet.
            tweet_text = tweet[0]
            curr_sample = self.__get_sample(tweet_text)
            curr_target = self.__get_training_target(curr_sample)
            # Turn them into a ({feature: count}, target) training pair.
            tr_dict = {}
            for i in range(len(self.features)):
                tr_dict[self.features[i]] = curr_sample[i]
            tup = (tr_dict, curr_target)
            self.training_data.append(tup)

    def __save_training_data(self):
        timestamp = '{:%Y_%m_%d_%H_%M_%S}'.format(datetime.datetime.now())
        with open("datasets_twitter/twitter_training_dataset" + timestamp + ".json", "w+") as f:
            f.write(json.dumps(self.training_data))

    def __load_training_data(self):
        list_of_files = glob.glob("datasets_twitter/twitter_training_dataset*.json")
        latest_file = max(list_of_files, key=os.path.getctime)
        with open(latest_file, "r") as f:
            js = json.loads(f.readline())
        for i in js:
            tup = (i[0], i[1])  # sample, target
            self.training_data.append(tup)

    def train_with_svc(self):
        # Make the training data, train the classifier, and save it as soon
        # as it is trained.
        self.__assemble_training_data()
        self.clf.train(self.training_data)
        self.save_clf()

    def predict_single(self, test_text):
        """
        Predict a single sample. Then, based on the user's input, add the
        sample to the training data with the correct label.

        :param test_text:
        :return:
        """
        test_sample = self.__get_sample(test_text)
        test_dict = {}
        for index in range(len(self.features)):
            test_dict[self.features[index]] = test_sample[index]
        pred = self.clf.classify_many([test_dict])
        return (pred[0], test_sample)

    def predict_multiple(self, test_list):
        """
        Predict more than one sample at a time.

        :param test_list:
        :return:
        """
        # Translate test_list into the classifier's feature-dict format.
        test_data = []
        for i in test_list:
            curr_test_sample = self.__get_sample(i)
            test_dict = {}
            for index in range(len(self.features)):
                test_dict[self.features[index]] = curr_test_sample[index]
            test_data.append(test_dict)
        return self.clf.classify_many(test_data)

    def update_pred_into_training(self, test_tweet, pred_val):
        """
        Adds a predicted ({feat: sample}, target) pair to the training data,
        then saves it. If test_text already exists in the training data,
        updates the target value instead, then saves.

        :param test_tweet: a tweet in the form (tweet_text, tweet_id)
        :param pred_val: the value of the prediction made by the classifier
        :return:
        """
        # A flag to make sure only one branch of the code runs.
        updated = False
        test_tweet_text = test_tweet[0]

        # If the text exists in the training data already, update its target.
        for i in range(len(self.all_tweets)):
            tweet = self.all_tweets[i]
            if test_tweet_text == tweet[0]:
                test_sample = self.__get_sample(test_tweet_text)
                # Convert into the trainable data format.
                test_dict = {}
                for j in range(len(self.features)):
                    test_dict[self.features[j]] = test_sample[j]
                tup = (test_dict, pred_val)
                # Replace the current pair for this text; there should be
                # only one tweet with the same text.
                self.training_data[i] = tup
                updated = True
                break

        # If test_text is not in the training data already:
        if not updated:
            # Convert into the trainable data format.
            test_sample = self.__get_sample(test_tweet_text)
            test_dict = {}
            for j in range(len(self.features)):
                test_dict[self.features[j]] = test_sample[j]
            tup = (test_dict, pred_val)
            # Add the tweet to all_tweets and to the training data.
            self.all_tweets.append(test_tweet)
            self.training_data.append(tup)
            updated = True  # consistency

        # Save the training data to file, then train the classifier again.
        self.__save_training_data()
        self.train_with_svc()

    def load_clf(self):
        """
        Load a previously trained and saved classifier.
        :return:
        """
        self.clf = joblib.load("twitterClassifier.pkl")

    def save_clf(self):
        """
        Save the current classifier to file.
        :return:
        """
        joblib.dump(self.clf, "twitterClassifier.pkl")
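# A hedged usage sketch of MyClassifier, assuming the data files it reads
# (verifiability_features.txt, datasets_twitter/...) exist on disk; the
# example sentence is invented.
clf = MyClassifier()   # builds the feature list and loads raw tweets
clf.train_with_svc()   # assembles training data, trains, saves via joblib

label, sample = clf.predict_single("You can verify this claim in the report.")
print(label)           # "VER" or "NVER"

# A later session can restore the persisted model instead of retraining.
clf2 = MyClassifier(load_clf=True)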
## Feature indices: 0-Suffix, 1-Previous number, 2-Next number, 3-Previous wordform,
## 4-Next wordform, 5-Postposition, 6-Present wordform, 7-POS ##
from preprocess_train import features, number
from preprocess_test import features_test, number_test
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

# Placeholders, filled in below.
train_data = [[None, None] for _ in range(357)]
test_data = [None] * 11

for i in range(0, 357):
    train_data[i][0] = {'Suffix': features[i][0], 'Previous morph': features[i][1],
                        'Next morph': features[i][2], 'Previous wordform': features[i][3],
                        'Next wordform': features[i][4], 'postposition': features[i][5],
                        'wordform': features[i][6], 'pos': features[i][7]}
    train_data[i][1] = number[i]

for i in range(0, 11):
    test_data[i] = {'Suffix': features_test[i][0], 'Previous morph': features_test[i][1],
                    'Next morph': features_test[i][2], 'Previous wordform': features_test[i][3],
                    'Next wordform': features_test[i][4], 'postposition': features_test[i][5],
                    'wordform': features_test[i][6], 'pos': features_test[i][7]}

classif = SklearnClassifier(SVC(), sparse=False).train(train_data)
result = classif.classify_many(test_data)
classif1 = SklearnClassifier(BernoulliNB()).train(train_data)
result1 = classif1.classify_many(test_data)
print(result1)
        label_train.append(X_label[item])
    print("test")
    for item in test_indices:
        list_test_data.append(X[item])
        label_test.append(X_label[item])
    break  # only the first train/test split is used

print(list_train_data)
print(label_train)
print(list_test_data)
print(label_test)

# Represent the input as feature vectors.
train_data = buildVectorTrainData(list_feature, list_train_data, label_train)
# example: train_data = [({"stupid": 0, "lovely": 1, "dog": 2, "cat": 0}, "positive_dog"),
#                        ({"stupid": 1, "lovely": 0, "dog": 0, "cat": 2}, "negative_cat"),
#                        ({"stupid": 0, "lovely": 0, "dog": 0, "cat": 0}, "normal")]
test_data = buildVectorTestData(list_feature, list_test_data)

# Models and evaluation.
classif = SklearnClassifier(BernoulliNB()).train(train_data)
# classif = SklearnClassifier(SVC(C=1.0, kernel='rbf', degree=3), sparse=False).train(train_data)

# Measure accuracy.
y_pred = classif.classify_many(test_data)
y_true = label_test
print(accuracy_score(y_true, y_pred))
# y_true = [0, 1, -1, -1, 0]
# y_pred = [0, 0, -1, 1, 0]
# target_names = ['class 0', 'class 1', 'class 2']
# print(classification_report(y_true, y_pred, target_names=target_names))
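# The commented-out lines above hint at a fuller per-class report. A hedged,
# self-contained version of that metric call (toy labels only; note that
# target_names must match the sorted label set, here [-1, 0, 1]):
from sklearn.metrics import accuracy_score, classification_report

toy_true = [0, 1, -1, -1, 0]
toy_pred = [0, 0, -1, 1, 0]
toy_names = ['class -1', 'class 0', 'class 1']

print(accuracy_score(toy_true, toy_pred))
print(classification_report(toy_true, toy_pred, target_names=toy_names))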
}, "ham"), ({ "a": 0, "b": 3, "c": 4 }, "spam"), ({ "a": 5, "b": 1, "c": 1 }, "ham"), ({ "a": 1, "b": 4, "c": 3 }, "spam")] classif = SklearnClassifier(BernoulliNB()).train(train_data) test_data = [{"a": 3, "b": 2, "c": 1}, {"a": 0, "b": 3, "c": 7}] classif.classify_many(test_data) classif = SklearnClassifier(SVC(), sparse=False).train(train_data) classif.classify_many(test_data) def print_maxent_test_header(): print(' ' * 11 + ''.join([' test[%s] ' % i for i in range(len(test))])) print(' ' * 11 + ' p(x) p(y)' * len(test)) print('-' * (11 + 15 * len(test))) def test_maxent(algorithm): print('%11s' % algorithm) try: classifier = nltk.classify.MaxentClassifier.train(train,
""" Linear (Bernoulli) SVC Implementation of Support Vector Machine classifier using libsvm: the kernel can be non-linear but its SMO algorithm does not scale to large number of samples as LinearSVC does. """ from nltk.classify import SklearnClassifier from sklearn.naive_bayes import BernoulliNB from sklearn.svm import SVC print ' ' print '=============================' print 'Bernoulli SVC Classifier:' classifierBi = SklearnClassifier(BernoulliNB()).train(train_set) classifierBi.classify_many(test) for pdist in classifierBi.prob_classify_many(test): print pdist.prob('human'), pdist.prob('auto') for i in range(len(classifierBi.classify_many(test))): print classifierBi.classify_many(test)[i] classifierSVC = SklearnClassifier(SVC(), sparse=True).train(train_set) classifierSVC.classify_many(test) # svc = nltk.classify.accuracy(classifierSVC, test_set) # print 'accuracy is %.2f' %round(svc*100,4), '%' def SVC(): classifierBi = SklearnClassifier(BernoulliNB()).train(train_set) return classifierSVC.classify_many(test)