def funcn():
    """Train three sentiment classifiers on ``amazon_data.txt`` and print their verdicts.

    Every line of the file is expected to hold ``<text>\\t<label>``.  Lines
    labelled ``0`` are collected as negative, all others as positive.  The
    text is split on whitespace, words shorter than three characters are
    dropped and the rest lower-cased; each document then becomes a dict of
    boolean ``contains(<word>)`` features over the whole vocabulary.

    An NLTK naive Bayes model, a Bernoulli naive Bayes model and an SVC are
    trained on those features, and each model's label for the sample sentence
    ``'it is not bad'`` is printed.
    """
    pos_tweets = []
    neg_tweets = []
    # The context manager closes the file even when a malformed line raises.
    with open("amazon_data.txt") as f:
        for line in f:
            # Strip the line terminator first so the label is the same string
            # on every line (a final line without '\n' used to yield a
            # different label string than the others).
            words = line.rstrip("\n").split("\t")
            if words[1] == '0':
                neg_tweets.append(words)
            else:
                pos_tweets.append(words)

    tweets = []
    for (words, sentiment) in pos_tweets + neg_tweets:
        words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
        tweets.append((words_filtered, sentiment))

    def get_words_in_tweets(tweets):
        """Flatten the word lists of all (words, sentiment) pairs."""
        all_words = []
        for (words, sentiment) in tweets:
            all_words.extend(words)
        return all_words

    def get_word_features(wordlist):
        """Return the distinct words of ``wordlist`` (keys of its FreqDist)."""
        wordlist = nltk.FreqDist(wordlist)
        word_features = wordlist.keys()
        return word_features

    word_features = get_word_features(get_words_in_tweets(tweets))

    def extract_features(document):
        """Map a list of words to ``{'contains(w)': bool}`` over the vocabulary."""
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    training_set = nltk.classify.apply_features(extract_features, tweets)
    classifie = nltk.NaiveBayesClassifier.train(training_set)
    classifier = SklearnClassifier(BernoulliNB()).train(training_set)
    tweet = 'it is not bad'
    print(classifie.classify(extract_features(tweet.split())))
    print(classifier.classify(extract_features(tweet.split())))
    classif = SklearnClassifier(SVC(), sparse=False).train(training_set)
    print(classif.classify(extract_features(tweet.split())))
def treina_classificadores():
    """Train MaxEnt, SVM and naive Bayes classifiers on the EPTC training files.

    The first column of each of the three ``train_EPTC_POA_v3nbal_*.data``
    CSV files is read (``_1`` -> 'pos', ``_0`` -> 'neg', ``_2`` -> 'neu'),
    passed through ``divide`` and ``bag_of_words``, and the labelled feature
    sets are concatenated in the order neg, pos, neu.

    Returns:
        list: ``[classificadorME, classificadorSVM, classificadorNB]``.
    """
    def primeira_coluna(caminho):
        # Reads the first field of every row; blank rows (which csv.reader
        # yields as empty lists) are skipped instead of raising IndexError.
        # NOTE(review): 'rb' is what the Python 2 csv module expects; under
        # Python 3 csv.reader needs a text-mode file - confirm the interpreter.
        with open(caminho, 'rb') as myfile:
            return [val[0] for val in csv.reader(myfile, delimiter=',') if val]

    posdados = primeira_coluna('./dadostreino/train_EPTC_POA_v3nbal_1.data')
    negdados = primeira_coluna('./dadostreino/train_EPTC_POA_v3nbal_0.data')
    neudados = primeira_coluna('./dadostreino/train_EPTC_POA_v3nbal_2.data')

    negfeats = [(bag_of_words(f), 'neg') for f in divide(negdados)]
    posfeats = [(bag_of_words(f), 'pos') for f in divide(posdados)]
    neufeats = [(bag_of_words(f), 'neu') for f in divide(neudados)]
    treino = negfeats + posfeats + neufeats

    # Maximum Entropy
    classificadorME = MaxentClassifier.train(treino, 'GIS', trace=0,
                                             encoding=None, labels=None,
                                             gaussian_prior_sigma=0,
                                             max_iter=1)
    # SVM
    classificadorSVM = SklearnClassifier(LinearSVC(), sparse=False)
    classificadorSVM.train(treino)
    # Naive Bayes
    classificadorNB = NaiveBayesClassifier.train(treino)
    return ([classificadorME, classificadorSVM, classificadorNB])
def m_train():
    """Build per-position training samples from the chunked data file and fit a model.

    Each line of ``data/train_chunked_double.data`` is split on commas; the
    first two fields are used.  For every index ``j`` of the second field one
    sample ``(gen_x(first, second, j), second[j])`` is produced.  A logistic
    regression wrapped in ``SklearnClassifier`` is trained on the samples and
    handed to ``save_model``.

    Returns:
        The trained classifier, or ``None`` if training or saving raised
        (the error is printed).
    """
    samples = []
    with codecs.open('data/train_chunked_double.data', mode='r',
                     encoding='UTF-8') as handle:
        for raw in handle.readlines():
            fields = raw.strip('\n').strip('\r').split(',')
            source, tags = fields[0], fields[1]
            samples.extend(
                (gen_x(source, tags, idx), tag) for idx, tag in enumerate(tags))

    try:
        model = SklearnClassifier(
            LogisticRegression(solver='lbfgs', n_jobs=-1,
                               max_iter=200)).train(samples)
        save_model(model)
    except Exception as err:
        print('Error: %r' % err)
        return None
    return model
def test_bernollinb_returns_correct_result(self):
    """A Bernoulli naive Bayes model labels the two probes 'ham' then 'spam'."""
    train_data = [
        ({"a": 4, "b": 1, "c": 0}, "ham"),
        ({"a": 5, "b": 2, "c": 1}, "ham"),
        ({"a": 0, "b": 3, "c": 4}, "spam"),
        ({"a": 5, "b": 1, "c": 1}, "ham"),
        ({"a": 1, "b": 4, "c": 3}, "spam"),
    ]
    classif = SklearnClassifier(BernoulliNB()).train(train_data)
    test_data = [{"a": 3, "b": 2, "c": 1}, {"a": 0, "b": 3, "c": 7}]
    ccm = classif.classify_many(test_data)
    # (A stray ['ham', 'spam'] expression statement with no effect was removed.)
    self.assertEqual(ccm, ['ham', 'spam'])
def trainClassifier(trainData):
    """Fit a single-step LinearSVC pipeline on ``trainData`` via SklearnClassifier.

    Args:
        trainData: labelled feature sets passed straight to
            ``SklearnClassifier.train``.

    Returns:
        Whatever ``SklearnClassifier(...).train(trainData)`` returns.
    """
    # Every LinearSVC option is spelled out explicitly.
    svc_options = dict(
        C=0.01,
        class_weight=None,
        dual=True,
        fit_intercept=True,
        intercept_scaling=1,
        loss='squared_hinge',
        max_iter=1000,
        multi_class='ovr',
        penalty='l2',
        random_state=0,
        tol=0.0001,
        verbose=0,
    )
    steps = [('svc', LinearSVC(**svc_options))]
    return SklearnClassifier(Pipeline(steps)).train(trainData)
def searchLinearSVC(title, train_departments):
    """Classify ``title`` with a linear-kernel SVC and report timings.

    :param title: text handed to ``word_feats`` to build the query features
    :param train_departments: labelled feature sets used for training; the
        slice ``[1000:]`` of the same data is also used for the accuracy figure
    :return: ``[label, probability of label, accuracy, classify seconds,
        training seconds]``
    """
    started = time.time()
    # SVC(kernel='linear') with probability=True, since prob_classify is
    # needed further down.
    classifier = SklearnClassifier(SVC(kernel='linear', probability=True))
    classifier.train(train_departments)
    timeTraning = time.time() - started

    query = word_feats(title)

    started = time.time()
    found_department = classifier.classify(query)
    timeClassify = time.time() - started

    confidence = classifier.prob_classify(query).prob(found_department)
    print(confidence)

    return [
        found_department,
        confidence,
        accuracy(classifier, train_departments[1000:]),
        timeClassify,
        timeTraning,
    ]
def searchSGDClassifier_classifier(title, train_departments):
    """Classify ``title`` with an SGD-trained logistic model and report timings.

    :param title: text handed to ``word_feats`` to build the query features
    :param train_departments: labelled feature sets used for training; the
        slice ``[1000:]`` of the same data is also used for the accuracy figure
    :return: ``[label, probability of label, accuracy, classify seconds,
        training seconds]``
    """
    started = time.time()
    # NOTE(review): recent scikit-learn releases spell this loss 'log_loss';
    # confirm against the installed version.
    classifier = SklearnClassifier(SGDClassifier(loss='log'))
    classifier.train(train_departments)
    timeTraning = time.time() - started

    query = word_feats(title)

    started = time.time()
    found_department = classifier.classify(query)
    timeClassify = time.time() - started

    confidence = classifier.prob_classify(query).prob(found_department)
    print(confidence)

    return [
        found_department,
        confidence,
        accuracy(classifier, train_departments[1000:]),
        timeClassify,
        timeTraning,
    ]
def test_svc_returns_correct_result(self):
    """An SVC trained on dense features labels the two probes 'ham' then 'spam'."""
    ham_rows = [
        {"a": 4, "b": 1, "c": 0},
        {"a": 5, "b": 2, "c": 1},
        {"a": 5, "b": 1, "c": 1},
    ]
    spam_rows = [
        {"a": 0, "b": 3, "c": 4},
        {"a": 1, "b": 4, "c": 3},
    ]
    # Same five samples in the same order as the fixture has always used.
    labelled = [
        (ham_rows[0], "ham"),
        (ham_rows[1], "ham"),
        (spam_rows[0], "spam"),
        (ham_rows[2], "ham"),
        (spam_rows[1], "spam"),
    ]
    model = SklearnClassifier(SVC(), sparse=False).train(labelled)
    probes = [{"a": 3, "b": 2, "c": 1}, {"a": 0, "b": 3, "c": 7}]
    self.assertEqual(model.classify_many(probes), ['ham', 'spam'])
def run_program(is_testing, mode):
    """Load a dataset, select features and train a LinearSVC on them.

    Args:
        is_testing: when truthy the test CSV is loaded, otherwise the
            training CSV.
        mode: forwarded unchanged to ``print_vals`` and ``load_csv``.

    Returns:
        None.  The trained classifier is not returned or stored.
    """
    # Show what the program is about to execute.
    print(" ")
    print(print_vals(is_testing, mode))

    if is_testing:
        file_path = 'Data/datasets/test.csv'
    else:
        file_path = 'Data/datasets/training.csv'
    load_csv(file_path, mode)

    features = feature_choices()
    weighted_data = select_features(features)

    print("Training Classifier: ")
    # The fitted model is currently discarded; the make_predictions() step
    # that would presumably consume it is disabled.
    SklearnClassifier(
        LinearSVC(loss='squared_hinge', max_iter=999999)).train(weighted_data)
    return None
def train_and_save_model(data_set_name="NB_Model_Tatoeba_", n=2):
    """Train a multinomial naive Bayes model on the module-level data and save it.

    Args:
        data_set_name: prefix of the name passed to ``save``.
        n: forwarded to ``text_features`` and appended to the saved name.

    Returns:
        The trained classifier.
    """
    # Pair the features of data[i] with the i-th entry of targets.
    trainingset = [
        (text_features(data[idx], n), label)
        for idx, label in enumerate(targets)
    ]
    model = SklearnClassifier(MultinomialNB()).train(trainingset)
    model_name = data_set_name + str(n) + "n"
    save(model_name, model)
    return model
def train_using_SklearnClassifier(self, training_data, test_data):
    """Train a BernoulliNB and an SVC model and print both accuracies.

    Only the BernoulliNB model and its figures are returned; the SVC results
    are printed for comparison.

    Returns:
        tuple: (classifier, classifier class name, test accuracy,
        training accuracy) for the BernoulliNB model.
    """
    # Giving bad results. Don't use.
    nb_model = SklearnClassifier(BernoulliNB()).train(training_data)
    svc_model = SklearnClassifier(SVC(), sparse=False).train(training_data)
    print(nb_model)
    classifier_name = type(nb_model).__name__

    score = nltk.classify.accuracy
    nb_train_acc = score(nb_model, training_data)
    svc_train_acc = score(svc_model, training_data)
    nb_test_acc = score(nb_model, test_data)
    svc_test_acc = score(svc_model, test_data)

    print(">>>>>>>>")
    print(nb_train_acc, nb_test_acc)
    print(svc_train_acc, svc_test_acc)
    return nb_model, classifier_name, nb_test_acc, nb_train_acc
def LG_gender(train_set, test_set):
    """Train a logistic-regression gender model and print a sample and its accuracy.

    Prints the probability table for ``gender_features('mark')`` and the
    accuracy of the model on ``test_set``.  Returns nothing.
    """
    print('== SkLearn MaxEnt ==')
    from nltk.classify import SklearnClassifier
    from sklearn.linear_model import LogisticRegression

    estimator = LogisticRegression(C=10e5)
    sklearn_classifier = SklearnClassifier(estimator).train(train_set)

    sample_dist = sklearn_classifier.prob_classify(gender_features('mark'))
    # Reads the distribution's private dict directly, as before.
    print(sample_dist._prob_dict)
    print(nltk.classify.accuracy(sklearn_classifier, test_set))
def main():
    """Build an RBF-kernel SVC classifier, then train and test it."""
    from sklearn.svm import SVC
    from nltk.classify import SklearnClassifier

    rbf_svc = SVC(kernel="rbf")
    classifier = SklearnClassifier(rbf_svc, sparse=False)
    _train(classifier)
    _test(classifier)
def LG_gender(train_set):
    """Train a logistic-regression model on ``train_set`` and return it."""
    print('== SkLearn MaxEnt ==')
    from nltk.classify import SklearnClassifier
    from sklearn.linear_model import LogisticRegression

    estimator = LogisticRegression(C=10e5)
    return SklearnClassifier(estimator).train(train_set)
def searchNuSVC_classifier(title, train_departments):
    """Classify ``title`` with a Nu-Support Vector Classification model.

    :param title: text handed to ``word_feats`` to build the query features
    :param train_departments: labelled feature sets used for training
    :return: the label predicted for ``title``
    """
    model = SklearnClassifier(NuSVC())
    model.train(train_departments)
    return model.classify(word_feats(title))
def train_scikit_model(best_features, feature_set, split_name,
                       classifier_name):
    """Train the classifier selected by ``classifier_name`` on ``best_features``.

    Args:
        best_features: labelled feature sets used for training.
        feature_set: unused here; kept for interface compatibility.
        split_name: unused here; kept for interface compatibility.
        classifier_name: one of ``nb``, ``dt``, ``svm``, ``nb_sk``,
            ``dt_sk``, ``svm_sk`` (``svm`` and ``svm_sk`` are equivalent).

    Returns:
        The trained classifier.

    Raises:
        AssertionError: if ``classifier_name`` is not a known name.
    """
    if classifier_name == "nb":
        cls = nltk.classify.NaiveBayesClassifier.train(best_features)
    elif classifier_name == "nb_sk":
        cls = SklearnClassifier(BernoulliNB()).train(best_features)
    elif classifier_name == "dt":
        cls = nltk.classify.DecisionTreeClassifier.train(best_features)
    elif classifier_name == "dt_sk":
        cls = SklearnClassifier(
            tree.DecisionTreeClassifier()).train(best_features)
    elif classifier_name == "svm_sk" or classifier_name == "svm":
        # This branch used to hand back an untrained wrapper, unlike every
        # other branch; train it too.
        cls = SklearnClassifier(svm.SVC()).train(best_features)
    else:
        # Raised explicitly (same exception type as the former `assert False`)
        # so the check still fires when Python runs with -O.
        raise AssertionError(
            "unknown classifier name:{}; known names: nb, dt, svm, nb_sk, dt_sk, svm_sk".format(
                classifier_name))
    return cls
def read(filename):
    """Train word vectors from ``filename`` and evaluate a tag classifier.

    A 4-dimensional Word2Vec model is trained on the whitespace-split lines
    of ``filename``.  ``test_data.txt`` is tokenised into ``word|tag`` tokens;
    the first 200 tokens train a Bernoulli naive Bayes model on the four
    vector components of each word, and tokens 200-299 are used to evaluate
    it.  Three values are printed: the number of predictions, the number of
    expected tags, and the count of correct predictions.
    """
    def as_features(vector):
        # Name the four embedding components to form an NLTK feature dict.
        return {'a': vector[0], 'b': vector[1], 'c': vector[2], 'd': vector[3]}

    def tag_map(tagged_tokens):
        # word -> tag, keeping first-seen order; a later duplicate word
        # overwrites the earlier tag.
        mapping = OrderedDict()
        for token in tagged_tokens:
            parts = token.split("|")
            mapping[parts[0]] = parts[1]
        return mapping

    with open(filename, "r") as fp:
        f = fp.readlines()
    # NOTE(review): encoding each line before splitting looks like a Python 2
    # idiom; confirm it is still wanted on the target interpreter.
    vocab = [s.encode('utf-8').split() for s in f]
    voc_vec = word2vec.Word2Vec(vocab, min_count=1, size=4)

    # The data file is now closed deterministically (it used to be left open).
    with open("test_data.txt", "r") as fp:
        tokens = nltk.word_tokenize(fp.read())

    D = tag_map(tokens[0:200])
    train_data = [(as_features(voc_vec[key]), D[key]) for key in D]
    classif = SklearnClassifier(BernoulliNB()).train(train_data)

    D2 = tag_map(tokens[200:300])
    test_data = []
    expected_list = []
    for key in D2:
        test_data.append(as_features(voc_vec[key]))
        expected_list.append(D2[key])

    predicted = classif.classify_many(test_data)
    # Single-argument print() produces the same output on Python 2 and 3.
    print(len(predicted))
    print(len(expected_list))
    print(accuracy_score(expected_list, predicted, normalize=False))
def predict_nltk(in_text='', n=2):
    '''
    Text language classification.

    Trains a multinomial naive Bayes model (a scikit-learn classifier used
    from within NLTK) on the module-level ``text`` mapping of label -> sample
    text, then classifies ``in_text`` and prints the predicted label.

    :param in_text: text to classify
    :param n: forwarded to ``text_features`` for both the training samples
        and ``in_text``
    '''
    trainingset = []
    for label in text:
        # Use the same n as for the query below; the training features were
        # previously built with text_features' default n regardless of the
        # value requested here.
        featurs = text_features(text[label], n=n)
        trainingset.append((featurs, label))
    classifier = SklearnClassifier(MultinomialNB()).train(trainingset)
    in_features = text_features(in_text, n=n)
    lang = classifier.classify(in_features)
    # Single-argument print() gives identical output on Python 2 and 3.
    print('Language: %s' % (lang,))
def create_classifier(featx):
    """Train naive Bayes and linear SVM models and pickle the more accurate one.

    Positive/negative training and test reviews are loaded from pickles under
    ``config.pkl_path``, converted to features with ``featx`` and labelled
    'pos'/'neg'.  Both models are scored on the test set; the one with the
    higher accuracy is written to ``my_classifier.pkl`` (the SVM wins ties).

    Args:
        featx: callable turning one review's word list into a feature dict.
    """
    def load_reviews(name):
        # SECURITY: pickle.load can execute arbitrary code from the file;
        # only load pickles this project produced itself.
        # The context manager closes the handle (it used to be leaked).
        with open(os.path.join(config.pkl_path, name), 'rb') as handle:
            return pickle.load(handle)

    pos_data = load_reviews('pos_reviews.pkl')
    neg_data = load_reviews('neg_reviews.pkl')
    pos_test_data = load_reviews('test_pos_reviews.pkl')
    neg_test_data = load_reviews('test_neg_reviews.pkl')
    # Each print() below takes one pre-formatted argument so the output is
    # identical on Python 2 and 3.
    print('%d ---++--- %d' % (len(pos_data), len(neg_data)))

    pos_features = [(featx(w_lst), 'pos') for w_lst in pos_data]
    neg_features = [(featx(w_lst), 'neg') for w_lst in neg_data]
    pos_test_features = [(featx(w_lst), 'pos') for w_lst in pos_test_data]
    neg_test_features = [(featx(w_lst), 'neg') for w_lst in neg_test_data]

    pos_features.extend(neg_features)
    train_set = pos_features
    pos_test_features.extend(neg_test_features)
    test_set = pos_test_features

    print('%s ---train_set---- %d' % (train_set is None, len(train_set)))
    print('%s -----test_set-- %d' % (test_set is None, len(test_set)))

    # Train two classifiers.
    nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
    nba = nltk.classify.accuracy(nb_classifier, test_set)
    print("NBayes accuracy is %.7f" % nba)  # author's recorded result: 86.78%

    svm_classifier = SklearnClassifier(LinearSVC()).train(train_set)
    svmm = nltk.classify.accuracy(svm_classifier, test_set)
    print("svm_classifier accuracy is %.7f" % svmm)  # author's recorded result: 89.124%

    # Save whichever model has the higher accuracy.
    # (An earlier comment on the next line read "negative corpus", which does
    # not describe this path.)
    classifier_pkl = os.path.join(config.pkl_path, 'my_classifier.pkl')
    with open(classifier_pkl, 'wb') as f:
        if nba > svmm:
            pickle.dump(nb_classifier, f)
            print('NBayes')
        else:
            pickle.dump(svm_classifier, f)
            print('SVM')

    print('done!')
def __init__(self, classifier_type='NaiveBayes', feats=word_feats):
    """Select the classifier backend and the feature extractor.

    Args:
        classifier_type: 'NaiveBayes', 'MaximumEntropy' or 'SVM'.
        feats: feature-extraction callable stored on ``self.feats``.

    Raises:
        ValueError: if ``classifier_type`` is not one of the supported names.
            Previously only a message was printed and ``self.classifier_type``
            was left unassigned, so the dispatch below failed with an
            AttributeError (unless the class supplies a default elsewhere -
            NOTE(review): confirm no class-level fallback is relied on).
    """
    # "Thumbs up? Sentiment Classification using Machine Learning Techniques"
    classifier_list = ['NaiveBayes', 'MaximumEntropy', 'SVM']
    if classifier_type not in classifier_list:
        raise ValueError(
            "Classifier Type is not implemented: %s" % (classifier_type,))
    self.classifier_type = classifier_type

    # MaxEnt and naive Bayes are stored as classes; the SVM is stored as a
    # ready-made (untrained) wrapper instance.
    if self.classifier_type == 'MaximumEntropy':
        self.classifier = MaxentClassifier
    elif self.classifier_type == 'SVM':
        self.classifier = SklearnClassifier(LinearSVC(), sparse=False)
    elif self.classifier_type == 'NaiveBayes':
        self.classifier = NaiveBayesClassifier
    self.feats = feats
def classifyUsingSVM(feature):
    """Return the SVC model with the best held-out accuracy over 5 folds.

    Args:
        feature: sequence of labelled feature sets; it is converted to a
            numpy array so each fold's index arrays can select from it.

    Returns:
        The ``SklearnClassifier`` trained on the fold with the highest test
        accuracy, or an untrained ``SklearnClassifier(SVC())`` if no fold
        scored above zero.
    """
    # Define folds for cross validation.
    # NOTE(review): this is the legacy cross_validation.KFold call signature;
    # confirm it matches the installed scikit-learn.
    kf = cross_validation.KFold(len(feature), n_folds=5, shuffle=False)
    features = numpy.array(feature)
    max2 = 0
    SVMmodel = SklearnClassifier(SVC())
    for x, y in kf:
        train_set = list(features[x])
        test_set = list(features[y])
        # SV Classifier
        classifier2 = SklearnClassifier(SVC()).train(train_set)
        accuracy2 = nltk.classify.accuracy(classifier2, test_set) * 100
        # Use the best model.  The running maximum must be updated too -
        # it used to stay at 0, so the last fold with any non-zero accuracy
        # always won.
        if accuracy2 > max2:
            max2 = accuracy2
            SVMmodel = classifier2
    return SVMmodel
def build_classifier(classifier_name):
    """
    Build (without training) the classifier selected by name.

    Accepted names: nb, dt, svm, nb_sk, dt_sk, svm_sk.
    svm and svm_sk return the same type of classifier.

    :param classifier_name: one of the accepted names above
    :return: the NLTK classifier class for ``nb``/``dt``; a fresh
        ``SklearnClassifier`` instance for the others
    :raises AssertionError: if ``classifier_name`` is not an accepted name
    """
    # Factories are lazy so only the requested classifier is constructed.
    factories = {
        "nb": lambda: nltk.classify.NaiveBayesClassifier,
        "nb_sk": lambda: SklearnClassifier(BernoulliNB()),
        "dt": lambda: nltk.classify.DecisionTreeClassifier,
        "dt_sk": lambda: SklearnClassifier(tree.DecisionTreeClassifier()),
        "svm_sk": lambda: SklearnClassifier(svm.SVC()),
        "svm": lambda: SklearnClassifier(svm.SVC()),
    }
    if classifier_name not in factories:
        # Raised explicitly (same type as the former `assert False`) so the
        # check survives `python -O`; the message now lists `dt` as well.
        raise AssertionError(
            "unknown classifier name:{}; known names: nb, dt, svm, nb_sk, dt_sk, svm_sk".format(
                classifier_name))
    cls = factories[classifier_name]()
    return cls
def train(records):
    """Train (or re-train) the module-level classifier and pickle it.

    Args:
        records: iterable of records where ``record[0]`` is the class label
            and ``record[1]`` the text.

    When ``CUR_CL`` is None a new classifier of the kind named by
    ``CLASSIFIER`` is trained and stored in ``CUR_CL``; otherwise
    ``CUR_CL.train`` is called with the new data.  The result is pickled to
    ``<pickles_dir>/news_based_<CLASSIFIER>.pickle``.

    Raises:
        ValueError: if ``CUR_CL`` is None and ``CLASSIFIER`` names no known
            classifier (this used to surface as an UnboundLocalError).
    """
    global CUR_CL
    train_data = [
        features_from_text(record[1], record[0], stopwords=sw)
        for record in records
    ]

    if CUR_CL is None:
        if CLASSIFIER == 'NaiveBayesClassifier':
            classifier = NaiveBayesClassifier.train(train_data)
        else:
            if CLASSIFIER == 'sklearnLinSVC':
                estimator = LinearSVC(multi_class='ovr')
            elif CLASSIFIER == 'BernoulliNB':
                estimator = BernoulliNB()
            elif CLASSIFIER == 'MultinomialNB':
                estimator = MultinomialNB()
            else:
                raise ValueError("unknown CLASSIFIER: %r" % (CLASSIFIER,))
            # The final step keeps the name 'nb' for every estimator so
            # parameter names of already-saved pipelines stay the same.
            pipeline = Pipeline([('tfidf', TfidfTransformer()),
                                 ('chi2', SelectKBest(chi2, k=1000)),
                                 ('nb', estimator)])
            classifier = SklearnClassifier(pipeline).train(train_data)
        print(CLASSIFIER)
        CUR_CL = classifier
    else:
        print('Partial fitting.. \n\n')
        # NOTE(review): despite the message, .train() on these wrappers
        # appears to fit from scratch (and for NaiveBayesClassifier returns a
        # new object that is discarded here) - confirm the intended behaviour.
        CUR_CL.train(train_data)

    # The context manager closes the file even if pickling fails.
    with open("%s/%s.pickle" % (pickles_dir, 'news_based_' + CLASSIFIER),
              'wb') as f:
        pickle.dump(CUR_CL, f)
    print("%s/%s.pickle saved" % (pickles_dir, 'news_based_' + CLASSIFIER))
    gc.collect()
def trainModel(self, size):
    """Train an SVC on the first ``size`` positive and negative samples.

    Features come from ``self.extract_features`` with the unigram extractor;
    the trained classifier is stored on ``self.classif``.
    """
    def labelled(label):
        # First `size` items of self.data[label], featurised and labelled.
        return self.extract_features(self.mr, self.data[label][:size], label,
                                     feature_extractor=self.unigram_features)

    # Negative features are extracted first, but positives lead the training set.
    neg_training = labelled('neg')
    pos_training = labelled('pos')
    self.classif = SklearnClassifier(
        SVC(), sparse=False).train(pos_training + neg_training)
def __init__(self, load_clf=False, load_tr_data=False):
    """Set up features, tweets and the classifier.

    Args:
        load_clf: when truthy, obtain the classifier via ``self.load_clf()``;
            otherwise start with a fresh dense-feature SVC wrapper.
        load_tr_data: when truthy, also load previously stored training data.
    """
    self.features = self.__load_support_vector_features()
    self.training_data = []
    self.n_samples = 0
    self.all_tweets = self.__load_tweets_from_file()  # list not dict

    # Classifier: fresh unless the caller asked for a stored one.
    if not load_clf:
        self.clf = SklearnClassifier(SVC(), sparse=False)
    else:
        self.load_clf()

    # Training data is only loaded on request.
    if load_tr_data:
        self.__load_training_data()
def SKClassifierSVM(self, dati):
    """Train a LinearSVC on ``dati`` and print its accuracy on the test split.

    Args:
        dati: data handed to ``__CreaDatasetTrainTest`` to obtain the
            train/test split.

    Returns:
        The trained classifier, or ``None`` if anything raised (the error
        arguments are printed instead).
    """
    try:
        train, test = self.__CreaDatasetTrainTest(dati)
        classifier = SklearnClassifier(LinearSVC()).train(train)
        # One pre-formatted argument keeps the output identical on
        # Python 2 and 3.
        print("ACCURACY SVM: %s" % (nltk.classify.accuracy(classifier, test),))
        return classifier
    except Exception as e:
        # `except Exception, e` and iterating the exception object only work
        # on Python 2; `as` and `.args` behave the same on both versions.
        print('errore in SVM')
        for i in e.args:
            print(i)
        print('')
def __init__(self, kernel: str = "") -> None:
    """Create the tokenizer and an (untrained) tf-idf + SVC classifier.

    Args:
        kernel: kernel name passed to ``SVC``; it also prefixes the name of
            the pipeline's SVC step.
            NOTE(review): the comments here speak of a linear or RBF kernel;
            the empty default does not look like a usable kernel name -
            confirm callers always pass one.
    """
    # Same tweet tokenizer as the baseline; model_naive_bayes_baselines has
    # the full description.
    tweet_tokenizer = TweetTokenizer(preserve_case=False,
                                     reduce_len=True,
                                     strip_handles=True)
    self.tokenizer = tweet_tokenizer.tokenize

    # Pipeline: tf-idf weighting (also as in the baseline) followed by a
    # Support Vector Machine with the configured kernel.  The model's readme
    # discusses the details.
    svc_step_name = '{}svc'.format(kernel)
    steps = [
        ('tfidf', TfidfTransformer()),
        (svc_step_name, SVC(kernel=kernel)),
    ]
    self.classif = SklearnClassifier(Pipeline(steps))
def _train(self):
    """Load the cached classifier, or train a gradient-boosting one and cache it.

    The cache file is ``<ClassName>.pickle`` in the current directory.  When
    it is missing, a ``GradientBoostingClassifier`` with 1000 estimators is
    trained on features extracted from ``self._dataset`` and then pickled.
    """
    pickle_filename = "{0}.pickle".format(self.__class__.__name__)

    if not os.path.isfile(pickle_filename):
        train_set = [(self._extract_features(cascade), cascade['label'])
                     for cascade in self._dataset]
        booster = GradientBoostingClassifier(n_estimators=1000)
        self._classifier = SklearnClassifier(
            booster, sparse=False).train(train_set)
        # The with-block closes the file on exit.
        with open(pickle_filename, "wb") as sink:
            pickle.dump(self._classifier, sink)
        return

    # Cache hit: reuse the stored model.
    with open(pickle_filename, "rb") as source:
        self._classifier = pickle.load(source)
def _train(self):
    """Load the cached classifier, or train a tf-idf/chi2/SVC pipeline and cache it.

    The cache file is ``<ClassName>.pickle`` in the current directory.  When
    it is missing, a pipeline of tf-idf weighting, chi-squared selection of
    1000 features and a linear-kernel SVC with probability estimates is
    trained on features extracted from ``self._dataset`` and then pickled.
    """
    pickle_filename = "{0}.pickle".format(self.__class__.__name__)

    if not os.path.isfile(pickle_filename):
        train_set = [(self._extract_features(cascade), cascade['label'])
                     for cascade in self._dataset]
        steps = [
            ('tfidf', TfidfTransformer()),
            ('chi2', SelectKBest(chi2, k=1000)),
            # The step is named 'rf' although it holds an SVC.
            ('rf', SVC(kernel='linear', probability=True)),
        ]
        self._classifier = SklearnClassifier(
            Pipeline(steps), sparse=False).train(train_set)
        # The with-block closes the file on exit.
        with open(pickle_filename, "wb") as sink:
            pickle.dump(self._classifier, sink)
        return

    # Cache hit: reuse the stored model.
    with open(pickle_filename, "rb") as source:
        self._classifier = pickle.load(source)
def __init__(self):
    """Cross-validate and then train a multinomial naive Bayes tweet classifier.

    Negative and positive tweets from the ``twitter_samples`` corpus are
    processed with ``self.process_tweet`` and labelled.  The classifier is
    cross-validated with ``self.cross_validate`` (third argument 10), trained
    on all tweets, and the accuracy summary and confusion matrix are printed.
    """
    self.pre_pro = TweetPreprocessor()
    self.classifier = SklearnClassifier(MultinomialNB(alpha=1.375))

    def labelled(fileid, label):
        # Process every tweet of one corpus file and attach its label.
        return [(self.process_tweet(twt), label)
                for twt in twitter_samples.strings(fileid)]

    all_twts = (labelled('negative_tweets.json', "negative")
                + labelled('positive_tweets.json', "positive"))

    acc_scores, confusion_matrix = self.cross_validate(
        self.classifier, all_twts, 10)
    self.classifier.train(all_twts)

    summary = "Initialised classifier with an accuracy of {:.2f}%, +/- {:.2f}%"
    print(summary.format(mean(acc_scores) * 100, stdev(acc_scores) * 2 * 100))
    print("Confusion matrix: \n{}".format(confusion_matrix))