def classifier_chain(self):
    # initialize a classifier chains multi-label classifier
    # with a random forest base classifier
    print("build classifier...")
    classifier = ClassifierChain(RandomForestClassifier())
    # classifier = LabelPowerset(RandomForestClassifier())
    print("end...")

    print("start training...")
    classifier.fit(self.X_train, self.y_train)
    print("end...")

    # predict
    print("start test...")
    predictions = classifier.predict(self.X_test)
    print("end...")

    print("result as following:")
    result = hamming_loss(self.y_test, predictions)
    print("hamming_loss: ", result)
    print("accuracy score: ", accuracy_score(self.y_test, predictions))
    result = f1_score(self.y_test, predictions, average='micro')
    print("micro-f1_score: ", result)
def buildCCClassifier(xTrain, yTrain):
    # initialize a classifier chains multi-label classifier
    # with a Gaussian naive Bayes base classifier
    classifier = ClassifierChain(GaussianNB())
    # train
    classifier.fit(xTrain, yTrain)
    return classifier
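# A minimal usage sketch for buildCCClassifier (an assumption, not part of the
# original project): it trains on synthetic multi-label data and reports the
# Hamming loss of the returned chain.
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import hamming_loss
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import ClassifierChain

X, y = make_multilabel_classification(n_samples=200, n_features=20,
                                      n_classes=5, random_state=42)
xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.3,
                                                random_state=42)

clf = buildCCClassifier(xTrain, yTrain)   # defined above
predictions = clf.predict(xTest)          # sparse label-indicator matrix
print("hamming loss:", hamming_loss(yTest, predictions))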
def check(request):
    vect = TfidfVectorizer(max_features=40000, stop_words='english')
    target = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    data = pd.read_csv('train.csv')
    test_data = pd.read_csv('D:/T.Y.BTECH/BML/Project/test.csv')
    X = data.comment_text
    test_X = test_data.comment_text
    xt = vect.fit_transform(X)
    yt = vect.transform(test_X)
    y_trans = data.iloc[:, 2:8]
    X_train, X_test, y_train, y_test = train_test_split(xt, y_trans, test_size=0.3)

    input_comment = ''
    output_class = None
    toxic = None
    severe_toxic = None
    obscene = None
    threat = None
    insult = None
    identity_hate = None

    posts = Post.objects.all()
    for post in posts:
        cmnt = post
        input_comment1 = str(cmnt)
        input_comment1 = [input_comment1]
        input_comment1 = vect.transform(input_comment1)

        from skmultilearn.problem_transform import ClassifierChain
        classifier = ClassifierChain(LogisticRegression(), require_dense=[False, True])
        classifier.fit(X_train, y_train)
        output_class = classifier.predict_proba(input_comment1).toarray()
        # load_model = joblib.load('knn.pkl')
        # load_model = joblib.load('lr.pkl')
        # output_class = load_model.predict_proba(input_comment1).toarray()
        # output_class = output_class.tolist()
        output_class = list(chain.from_iterable(output_class))
        toxic = output_class[0]
        severe_toxic = output_class[1]
        obscene = output_class[2]
        threat = output_class[3]
        insult = output_class[4]
        identity_hate = output_class[5]
        print(output_class)

    context = dict()
    context['input_comment'] = input_comment
    context['output_class1'] = toxic
    context['output_class2'] = severe_toxic
    context['output_class3'] = obscene
    context['output_class4'] = threat
    context['output_class5'] = insult
    context['output_class6'] = identity_hate
    return render(request, 'polls/comment_details.html', context)
def test_if_order_is_set(self):
    classifier = ClassifierChain(
        classifier=GaussianNB(),
        require_dense=[True, True],
        order=None
    )
    X, y = self.get_multilabel_data_for_tests(sparsity_indicator='sparse')[0]
    classifier.fit(X, y)
    self.assertEqual(classifier._order(), list(range(y.shape[1])))
def train(self):
    classifier = ClassifierChain(LogisticRegression())
    classifier.fit(self.x_data, self.y_data)
    predictions = classifier.predict(self.x_test)
    return {
        'accuracy': accuracy_score(self.y_test, predictions),
        'f1_score': f1_score(self.y_test, predictions, average='micro')
    }
def test_if_order_is_set_when_explicitly_given(self):
    X, y = self.get_multilabel_data_for_tests(sparsity_indicator='sparse')[0]
    reversed_chain = list(reversed(range(y.shape[1])))
    classifier = ClassifierChain(
        classifier=GaussianNB(),
        require_dense=[True, True],
        order=reversed_chain
    )
    classifier.fit(X, y)
    self.assertEqual(classifier._order(), reversed_chain)
class ClassifierChains:
    def __init__(self):
        self.model = ClassifierChain(LGBMClassifier())

    def set_grow_step(self, new_step):
        self.grow_boost_round = new_step

    def fit(self, X_train, y_train):
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        return self.model.predict(X_test).A
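# A hedged usage sketch for the ClassifierChains wrapper above (the synthetic
# data and split below are assumptions, not from the original repository).
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

X, y = make_multilabel_classification(n_samples=300, n_classes=4, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

wrapper = ClassifierChains()       # wraps ClassifierChain(LGBMClassifier())
wrapper.fit(X_train, y_train)
y_pred = wrapper.predict(X_test)   # .A inside predict() yields a dense array
print("micro-F1:", f1_score(y_test, y_pred, average='micro'))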
def classifiers(X_train, Y_train, X_test):
    classifier1 = BinaryRelevance(GaussianNB())
    classifier2 = ClassifierChain(GaussianNB())
    classifier3 = LabelPowerset(GaussianNB())
    classifier1.fit(X_train, Y_train)
    classifier2.fit(X_train, Y_train)
    classifier3.fit(X_train, Y_train)
    predictions1 = classifier1.predict(X_test)
    predictions2 = classifier2.predict(X_test)
    predictions3 = classifier3.predict(X_test)
    return predictions1, predictions2, predictions3
def RecommendByClassifierChain(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Classifier chain."""
    classifier = ClassifierChain(RandomForestClassifier(oob_score=True, max_depth=10,
                                                        min_samples_split=20))
    classifier.fit(train_data, train_data_y)
    predictions = classifier.predict_proba(test_data)
    predictions = predictions.todense().getA()
    recommendList = DataProcessUtils.getListFromProbable(
        predictions, range(1, train_data_y.shape[1] + 1), recommendNum)
    answerList = test_data_y
    print(predictions)
    print(test_data_y)
    print(recommendList)
    print(answerList)
    return [recommendList, answerList]
def ClassifierChain_method(X_train, y_train, samples_leaf, samples_split):
    """
    Problem transformation --> classifier chain method.
    :param X_train: input data
    :param y_train: corresponding label data
    :return: the fitted classifier, or None on failure
    """
    try:
        classifier = ClassifierChain(
            DecisionTreeClassifier(min_samples_leaf=int(samples_leaf),
                                   min_samples_split=int(samples_split)))
        classifier.fit(X_train, y_train)
        return classifier
    except Exception as e:
        print("warning----classifier chain|ClassifierChain_method----" + str(e))
        return None
def train_model(X, y, strategy):
    X = np.array(X)
    y = np.array(y)
    clf = lightgbm.sklearn.LGBMClassifier(max_depth=9, num_leaves=500,
                                          n_estimators=50, n_jobs=-1)  # 0.8
    print(clf)
    if strategy == 'ovr':
        # OneVsRest strategy, also known as the BinaryRelevance strategy
        ovr = OneVsRestClassifier(clf)
        ovr.fit(X, y)
        save_model(ovr, "model/flow/ovr")
        return ovr
    elif strategy == 'classifier_chains':
        cc = ClassifierChain(clf)
        cc.fit(X, y)
        save_model(cc, "model/flow/cc")
        return cc
    else:
        raise Exception("Correct strategies: ovr or classifier_chains")
def randomForestClassifierChain():
    print("Random forest classifier chain")
    start = time.time()
    classifier = ClassifierChain(classifier=RandomForestClassifier(), require_dense=[False, True])
    filename = "randomForestClassifierChain"
    classifier.fit(train_x, train_y)
    # save
    pickle.dump(classifier, open(filename, 'wb'))
    # load the model from disk
    classifier = pickle.load(open(filename, 'rb'))
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def gaussianNaiveBayes():
    print("Gaussian naive bayes")
    start = time.time()
    classifier = ClassifierChain(GaussianNB())
    filename = "gaussianNaiveBayes"
    classifier.fit(train_x, train_y)
    # save
    pickle.dump(classifier, open(filename, 'wb'))
    # load the model from disk
    classifier = pickle.load(open(filename, 'rb'))
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def knnClassifierChain():
    print("knn classifier chain")
    start = time.time()
    classifier = ClassifierChain(KNeighborsClassifier())
    filename = "knnChain"
    classifier.fit(train_x, train_y)
    # save
    pickle.dump(classifier, open(filename, 'wb'))
    # load the model from disk
    classifier = pickle.load(open(filename, 'rb'))
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def supportVectorMachineChain():
    print("Support vector machine")
    start = time.time()
    classifier = ClassifierChain(classifier=svm.SVC(), require_dense=[False, True])
    filename = "SupportVectorMachine"
    classifier.fit(train_x, train_y)
    # save
    pickle.dump(classifier, open(filename, 'wb'))
    # load the model from disk
    classifier = pickle.load(open(filename, 'rb'))
    print('training time taken: ', round(time.time() - start, 0), 'seconds')
    predictions_new = classifier.predict(test_x)
    accuracy(test_y, predictions_new)
def train_model(X, y, strategy):
    X = np.array(X)
    y = np.array(y)
    # clf = SVC(C=1, kernel='rbf', probability=True, gamma='scale')  # svc without class_weight
    # clf = SVC(C=10, kernel='rbf', class_weight='balanced', probability=True, gamma='scale')  # svc with class_weight
    clf = XGBClassifier(subsample=0.8, colsample_bytree=0.8)
    # clf = XGBClassifier(learning_rate=0.1, n_estimators=150, max_depth=5,
    #                     min_child_weight=1, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
    #                     objective='binary:logistic', nthread=4, scale_pos_weight=1)
    print(clf)
    if strategy == 'ovr':
        # OneVsRest strategy, also known as the BinaryRelevance strategy
        ovr = OneVsRestClassifier(clf)
        ovr.fit(X, y)
        save_model(ovr, "model/ovr")
        return ovr
    elif strategy == 'classifier_chains':
        cc = ClassifierChain(clf)
        cc.fit(X, y)
        save_model(cc, "model/cc")
        return cc
    else:
        raise Exception("Correct strategies: ovr or classifier_chains")
def classifier_chain():
    # Train-Test Split =======================================================
    print("setting up a classifier chain with logistic regression...")
    from sklearn.model_selection import train_test_split
    train, test = train_test_split(df, test_size=0.33, shuffle=True)
    train_text = train['Book_Text']
    test_text = test['Book_Text']

    # TF-IDF ==================================================================
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word',
                                 ngram_range=(1, 3), norm='l2')
    # fit the vectorizer on the training text only to avoid leaking test data
    vectorizer.fit(train_text)
    x_train = vectorizer.transform(train_text)
    y_train = train.drop(labels=['Book_Text'], axis=1)
    x_test = vectorizer.transform(test_text)
    y_test = test.drop(labels=['Book_Text'], axis=1)

    # using classifier chains
    from skmultilearn.problem_transform import ClassifierChain
    from sklearn.linear_model import LogisticRegression

    # initialize classifier chains multi-label classifier
    classifier = ClassifierChain(LogisticRegression())
    # train the logistic regression chain on the training data
    classifier.fit(x_train, y_train)
    # predict
    predictions = classifier.predict(x_test)
    # accuracy
    print("Accuracy = ", accuracy_score(y_test, predictions))
    print("\n")
def train_model(X, y, strategy):
    X = np.array(X)
    y = np.array(y, dtype=int)
    # clf = SVC(C=1, kernel='rbf', probability=True, gamma='scale')  # svc with class_weight
    # clf = XGBClassifier(max_depth=9, n_estimators=50, n_jobs=-1)
    # clf = XGBClassifier(learning_rate=0.1, n_estimators=150, max_depth=5,
    #                     min_child_weight=1, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
    #                     objective='binary:logistic', nthread=4, scale_pos_weight=1)
    # clf = RandomForestClassifier(max_depth=20, n_estimators=2000, n_jobs=-1)
    clf = lightgbm.sklearn.LGBMClassifier(max_depth=9, num_leaves=500,
                                          n_estimators=50, n_jobs=-1)
    print(clf)
    if strategy == 'ovr':
        # OneVsRest strategy, also known as the BinaryRelevance strategy
        ovr = OneVsRestClassifier(clf)
        ovr.fit(X, y)
        save_model(ovr, "model/flow/ovr")
        return ovr
    elif strategy == 'classifier_chains':
        cc = ClassifierChain(clf)
        cc.fit(X, y)
        save_model(cc, "model/flow/cc")
        return cc
    else:
        raise Exception("Correct strategies: ovr or classifier_chains")
def Classifier_Chain(ytrain, yvalid, ytest, base_model):
    """
    Fits a Classifier Chain model with LinearSVC as the base classifier,
    specifying either themes or subthemes for Y. Appends a row of results to
    results_dict with train, validation, and test accuracy, plus recall,
    precision, and f1 scores for the validation and test data.
    """
    classifier_chain = ClassifierChain(base_model)
    model = classifier_chain.fit(X_train, ytrain)
    train = model.score(X_train, np.array(ytrain))
    valid = model.score(X_valid, np.array(yvalid))
    test = model.score(X_test, np.array(ytest))

    # validation scores
    predictions = model.predict(X_valid)
    recall = recall_score(np.array(yvalid), predictions, average='micro')
    precision = precision_score(np.array(yvalid), predictions, average='micro')
    f1 = f1_score(np.array(yvalid), predictions, average='micro')

    # test scores
    predictions_test = model.predict(X_test)
    recall_test = recall_score(np.array(ytest), predictions_test, average='micro')
    precision_test = precision_score(np.array(ytest), predictions_test, average='micro')
    f1_test = f1_score(np.array(ytest), predictions_test, average='micro')

    # all rounded to 3 decimal places
    case = {
        'Model': "TF-IDF + LinearSVC",
        'Train Accuracy': round(train, 3),
        'Validation Accuracy': round(valid, 3),
        'Test Accuracy': round(test, 3),
        'Valid Recall': round(recall, 3),
        'Valid Precision': round(precision, 3),
        'Valid F1': round(f1, 3),
        'Test Recall': round(recall_test, 3),
        'Test Precision': round(precision_test, 3),
        'Test F1': round(f1_test, 3)
    }
    results_dict.append(case)
def binary_relevance(train_data, test_data):
    """
    Runs and predicts correctly. Uses binary relevance.
    Only a single class label is kept per sample, which reduces the problem to
    multi-class single-label classification, while the real problem is
    multi-class multi-label.
    :param train_data:
    :param test_data:
    :return:
    """
    from skmultilearn.problem_transform import BinaryRelevance
    from skmultilearn.problem_transform import ClassifierChain
    from sklearn.naive_bayes import GaussianNB

    # use a Gaussian naive Bayes base classifier
    # classifier = BinaryRelevance(GaussianNB())  # initialize a binary relevance multi-label classifier
    classifier = ClassifierChain(GaussianNB())

    # X_train = train
    X_train, y_train = train_data.iloc[:, [0]], train_data.iloc[:, list(range(1, 21))]
    X_test, y_test = test_data.iloc[:, [0]], test_data.iloc[:, list(range(1, 21))]
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

    # train
    temp = X_train.values.tolist()
    X = []
    for i in range(len(temp)):
        X.append(temp[i][0])
    x = tfidf.transform(X)

    y = y_train.values.tolist()
    Y = []  # collapse the 20 label columns into a single class index
    for j in range(len(y)):
        if "1" in y[j]:
            indexs = y[j].index("1")
            Y.append(indexs + 1)
        else:
            # print("0")
            Y.append(0)  # actually 21 classes, because of the empty class
    Y = np.array(Y)  # can Y not hold multiple values??
    classifier.fit(x, Y)  # predict the numeric class directly?
    """
    Error raised:
        raise TypeError('no supported conversion for types: %r' % (args,))
        TypeError: no supported conversion for types: (dtype('O'),)
    Could that be the cause??
    """

    # predict
    temp = X_test.values.tolist()
    X_ts = []
    for i in range(len(temp)):
        X_ts.append(temp[i][0])
    x_test = tfidf.transform(X_ts)

    y_test = y_test.values.tolist()
    Y_test = []
    for j in range(len(y_test)):
        if "1" in y_test[j]:
            indexs = y_test[j].index("1")
            Y_test.append(indexs + 1)
        else:
            # print("0")
            Y_test.append(0)  # actually 21 classes, because of the empty class
    Y_test = np.array(Y_test)  # form an array
    unique_test, counts_test = np.unique(Y_test, return_counts=True)
    print("truth=", dict(zip(unique_test, counts_test)))

    predictions = classifier.predict(x_test)  # a csr_matrix at this point
    predictions = predictions.toarray()  # does it contain zeros??
    unique, counts = np.unique(predictions, return_counts=True)
    print("predictions=", dict(zip(unique, counts)))

    from sklearn.metrics import accuracy_score
    score = accuracy_score(Y_test, predictions)
    print(score)
# predict for Binary Relevance
predictions_binary = classifier_binary.predict(X_test)

# Hamming Loss for Binary Relevance
hamm_loss_binary = hamming_loss(y_test, predictions_binary)
print("Hamming Loss:", hamm_loss_binary)

print("\n\n\nTraining data with Classifier Chains using Gaussian Naive Bayes")
# initialize Classifier Chains multi-label classifier
# with a Gaussian naive Bayes base classifier
classifier_cc = ClassifierChain(GaussianNB())

# train for Classifier Chains
classifier_cc.fit(X_train, y_train)

# predict for Classifier Chains
predictions_cc = classifier_cc.predict(X_test)

# Hamming Loss for Classifier Chains
hamm_loss_cc = hamming_loss(y_test, predictions_cc)
print("Hamming Loss:", hamm_loss_cc)

print("\n\n\nTraining data with Label Powerset using Gaussian Naive Bayes")
# initialize Label Powerset multi-label classifier
# with a Gaussian naive Bayes base classifier
classifier_lp = LabelPowerset(GaussianNB())
Y_train = train.iloc[:, 4:].values
Y_test = test.iloc[:, 4:].values
print(Y_test)

"""
Naive Bayes Classifier
"""
# naiveBayes = GaussianNB()
classifier = ClassifierChain(GaussianNB())
classifier.fit(X_train_idf, Y_train)
predictions = classifier.predict(X_test_idf)
print(accuracy_score(Y_test, predictions))

"""
Get training and test dataset
"""
"""
naiveBayes.fit(X_train_idf, Y_train[:, 97:98].flatten())
y_pred = naiveBayes.predict(X_test_idf)
"""
# id = 0.0
# for i in range(1, 20):
#     classifier = MLkNN(k=i)
#     prediction = classifier.fit(train['X'], train['y']).predict(test['X'])
#     if (1 - metrics.hamming_loss(prediction, test['y']) > best):
#         best = 1 - metrics.hamming_loss(prediction, test['y'])
#         id = i
#
# classifier = MLkNN(k=id)
# prediction = classifier.fit(train['X'], train['y']).predict(test['X'])
# print(classifier)
# print('Subset Accuracy: ', metrics.accuracy_score(prediction, test['y']))
# print('Hamming Loss: ', metrics.hamming_loss(prediction, test['y']))
# print('Accuracy: ', 1 - metrics.hamming_loss(prediction, test['y']))

classifier = ClassifierChain(SVC())
prediction = classifier.fit(train['X'], train['y']).predict(test['X'])
print('------------------------------------------')
print(classifier)
print('Subset Accuracy: ', metrics.accuracy_score(prediction, test['y']))
print('Hamming Loss: ', metrics.hamming_loss(prediction, test['y']))
print('Accuracy: ', 1 - metrics.hamming_loss(prediction, test['y']))

classifier = BinaryRelevance(SVC())
prediction = classifier.fit(train['X'], train['y']).predict(test['X'])
print('------------------------------------------')
print(classifier)
print('Subset Accuracy: ', metrics.accuracy_score(prediction, test['y']))
print('Hamming Loss: ', metrics.hamming_loss(prediction, test['y']))
print('Accuracy: ', 1 - metrics.hamming_loss(prediction, test['y']))
class MultiLabelClassifier(object):
    def __init__(self):
        self.total_data_df = pd.read_csv(os.path.join("data", "cleaned_data.csv"),
                                         encoding="ISO-8859-1")
        self.data_df = self.total_data_df[~self.total_data_df.Tags.isnull()]
        self.total_records = len(self.data_df.index)
        self.train_df = self.data_df.tail(int(self.total_records * .67))
        self.test_df = self.data_df.head(int(self.total_records * .23))
        self.total_tag_list = self.get_tag_list()
        self.total_word_list = self.get_word_list()
        self.modified_train_df = pd.DataFrame()
        self.modified_test_df = pd.DataFrame()
        self.classifier = BernoulliNB()
        self.classifier_multilabel = ClassifierChain(BernoulliNB())
        self.classifier_dt = DecisionTreeRegressor(max_depth=2000)
        self.classifier_random_forest = RandomForestRegressor(max_depth=100)
        self.classifier_svm = svm.SVC(kernel='linear')
        self.test_tags = pd.DataFrame()

    def get_tag_list(self):
        tag_set = set()
        for tags in self.train_df.Tags:
            if tags is not nan:
                tag_set.update(tags.split(','))
        return sorted(list(tag_set))

    def get_word_list(self):
        word_set = set()
        for words in self.train_df.stemmed_words:
            if words is not nan:
                word_set.update(words.split(' '))
        return sorted(list(word_set))

    def setup_data_frame(self):
        for each in self.total_word_list:
            self.modified_train_df[each] = pd.Series([
                1 if each in words.split(' ') else 0
                for words in self.train_df.stemmed_words
            ], index=self.train_df.index)
            self.modified_test_df[each] = pd.Series([
                1 if each in words.split(' ') else 0
                for words in self.test_df.stemmed_words
            ], index=self.test_df.index)
        for tag in self.total_tag_list:
            self.modified_train_df[tag] = pd.Series([
                1 if tag in tags.split(',') else 0
                for tags in self.train_df.Tags
            ], index=self.train_df.index)
            self.test_tags[tag] = pd.Series([
                1 if tag in tags.split(',') else 0
                for tags in self.test_df.Tags
            ], index=self.test_df.index)
        pca = PCA(n_components=966)
        principal = pca.fit(self.modified_train_df)
        # self.modified_train_df = principal
        return self.modified_train_df

    def multi_label_naive_bayes_classifier(self):
        test_rows = self.modified_test_df.values
        self.modified_test_df['predicted_labels'] = pd.Series(
            ['' for each in self.modified_test_df.index],
            index=self.modified_test_df.index)
        for tag in self.total_tag_list:
            self.classifier.fit(
                self.modified_train_df[self.total_word_list].values,
                self.modified_train_df[tag].tolist())
            self.modified_test_df[tag] = pd.Series(
                self.classifier.predict(test_rows),
                index=self.modified_test_df.index)
            self.modified_test_df['predicted_labels'] = pd.Series(
                [
                    each + ',' + tag if value == 1 else each
                    for each, value in zip(
                        self.modified_test_df.predicted_labels,
                        self.modified_test_df[tag])
                ],
                index=self.modified_test_df.index)

    def multi_label_naive_bayes_classifier_sklearn(self):
        test_rows = self.modified_test_df.values
        self.classifier_multilabel.fit(
            self.modified_train_df[self.total_word_list].values,
            self.modified_train_df[self.total_tag_list])
        c = self.classifier_multilabel.predict(test_rows)
        print(c.shape)
        print(sps.csc_matrix(self.test_tags.values).shape)
        print(accuracy_score(sps.csc_matrix(self.test_tags.values), c))

    def multi_label_decision_tree_regressor(self):
        test_rows = self.modified_test_df.values
        self.classifier_dt.fit(
            self.modified_train_df[self.total_word_list].values,
            self.modified_train_df[self.total_tag_list])
        predictions = self.classifier_dt.predict(test_rows)
        temp_df = pd.DataFrame(predictions, columns=self.total_tag_list)
        self.test_df['predicted_labels'] = pd.Series(
            ['' for each in self.modified_test_df.index],
            index=self.modified_test_df.index)
        for tag in self.total_tag_list:
            self.test_df['predicted_labels'] = pd.Series(
                [
                    each + ',' + tag if value == 1 else each
                    for each, value in zip(self.test_df.predicted_labels,
                                           temp_df[tag])
                ],
                index=self.test_df.index)
        self.test_df[['stemmed_words', 'Tags', 'predicted_labels']].to_csv(
            os.path.join("data", "decision_tree_result.csv"), index=False)

    def multi_label_random_forest(self):
        test_rows = self.modified_test_df.values
        self.classifier_random_forest.fit(
            self.modified_train_df[self.total_word_list].values,
            self.modified_train_df[self.total_tag_list])
        predictions = self.classifier_random_forest.predict(test_rows)
        temp_df = pd.DataFrame(predictions, columns=self.total_tag_list)
        self.test_df['predicted_labels'] = pd.Series(
            ['' for each in self.modified_test_df.index],
            index=self.modified_test_df.index)
        for tag in self.total_tag_list:
            self.test_df['predicted_labels'] = pd.Series(
                [
                    each + ',' + tag if value == 1 else each
                    for each, value in zip(self.test_df.predicted_labels,
                                           temp_df[tag])
                ],
                index=self.test_df.index)
        self.test_df[['stemmed_words', 'Tags', 'predicted_labels']].to_csv(
            os.path.join("data", "random_forest_result.csv"), index=False)

    def multi_label_svm(self):
        test_rows = self.modified_test_df.values
        tags = array(self.modified_train_df[self.total_tag_list])
        temp_df = pd.DataFrame()
        for col in range(tags.shape[1]):
            self.classifier_svm.fit(
                self.modified_train_df[self.total_word_list].values,
                tags[:, col])
            predictions = self.classifier_svm.predict(test_rows)
            temp_df[self.total_tag_list[col]] = pd.Series(predictions)
        # temp_df = pd.DataFrame(predictions, columns=self.total_tag_list)
        self.test_df['predicted_labels'] = pd.Series(
            ['' for each in self.modified_test_df.index],
            index=self.modified_test_df.index)
        for tag in self.total_tag_list:
            self.test_df['predicted_labels'] = pd.Series(
                [
                    each + ',' + tag if value == 1 else each
                    for each, value in zip(self.test_df.predicted_labels,
                                           temp_df[tag])
                ],
                index=self.test_df.index)
        self.test_df[['stemmed_words', 'Tags', 'predicted_labels']].to_csv(
            os.path.join("data", "linear_svm.csv"), index=False)
test_data = pd.read_csv('.data/test-data.dat', delimiter='\n', header=None)
train_labels = pd.read_csv('.data/train-label.dat', sep=' ', header=None)
test_labels = pd.read_csv('.data/test-label.dat', sep=' ', header=None)

# strip the <D> markers from the data
train_data = train_data.iloc[:, 0].str.replace(r'<\d+>', '')
test_data = test_data.iloc[:, 0].str.replace(r'<\d+>', '')

# count the frequency of every vocabulary word in each document
vectorizer = CountVectorizer()
train_data_vector = vectorizer.fit_transform(train_data)
test_data_vector = vectorizer.transform(test_data)

# train the classifier
model = ClassifierChain(RandomForestClassifier(n_jobs=-1, verbose=1))
model.fit(train_data_vector, train_labels)

# test the classifier
predicted_labels = model.predict(test_data_vector)
predicted_labels_train = model.predict(train_data_vector)
predicted_probabilities = model.predict_proba(test_data_vector)

# test accuracy
# ~7% with random forest and binary relevance
# ~7% with random forest and classifier chain
# ~5% with random forest and label powerset
# ~4% with multilabel knn
test_acc = accuracy_score(test_labels, predicted_labels)
train_acc = accuracy_score(train_labels, predicted_labels_train)
test_hamm_loss = hamming_loss(test_labels, predicted_labels)
test_cov_err = coverage_error(test_labels, predicted_probabilities.toarray())
# In[68]:

log_classifier.fit(x_train, y_train)
print('Accuracy_score using LabelPowerset is ',
      round(accuracy_score(y_test, log_classifier.predict(x_test)) * 100, 1), '%')
print('-------------------------------------------------')
print('roc_auc_score using LabelPowerset is ',
      roc_auc_score(y_test, log_classifier.predict_proba(x_test).toarray()))

# # ClassifierChain
# * This method uses a chain of binary classifiers
# * Each new classifier uses the predictions of all previous classifiers
# * This way the correlation between labels is taken into account

# In[69]:

chain = ClassifierChain(LogisticRegression())

# In[70]:

chain.fit(x_train, y_train)
print('Accuracy_score using ClassifierChain is ',
      round(accuracy_score(y_test, chain.predict(x_test)) * 100, 1), '%')
print('-------------------------------------------------')
print('roc_auc_score using ClassifierChain is ',
      roc_auc_score(y_test, chain.predict_proba(x_test).toarray()))
    res = res / y_pred.shape[0]
    return np.round(res, 2)


for i in range(5):
    log = LogisticRegression()
    # each classifier is trained with the previous labels appended as extra features
    log.fit(np.hstack((X, Y[:, 0:i])), Y[:, i])
    logs.append(log)

results = []
for i in range(5):
    res = logs[i].predict(np.hstack((X, Y[:, 0:i])))
    results.append(res)

fres = []
for i in range(len(results[0])):
    a = [
        results[0][i], results[1][i], results[2][i], results[3][i],
        results[4][i]
    ]
    fres.append(a)
fres = np.matrix(fres)
print(accuracy_score(fres, Y))

test = datasets.make_multilabel_classification()

# verify the result with the library's classifier chain implementation
cl = ClassifierChain(LogisticRegression())
cl.fit(data[0], data[1])
pred = cl.predict(test[0])
print(accuracy_score(pred, test[1]))
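# Note on the manual chain above: it feeds the true labels Y back in when
# predicting. At inference time the true labels are unknown, so a classifier
# chain normally feeds each model the labels *predicted* so far. A minimal
# sketch of that inference step (an illustration, not part of the original
# script; chain_predict is a hypothetical helper name):
import numpy as np

def chain_predict(logs, X_new):
    # logs: list of fitted per-label LogisticRegression models, in chain order
    preds = np.zeros((X_new.shape[0], len(logs)))
    for i, log in enumerate(logs):
        features = np.hstack((X_new, preds[:, :i]))  # append labels predicted so far
        preds[:, i] = log.predict(features)
    return preds

# y_hat = chain_predict(logs, test[0])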
class Multi_labeling:
    def __init__(self, label_dict, train_labels, train_data, test_labels, test_data):
        self.label_dict = label_dict
        self.train_labels = train_labels
        self.train_data = train_data
        self.test_labels = test_labels
        self.test_data = test_data

    def classify(self):
        from skmultilearn.problem_transform import ClassifierChain
        from sklearn.svm import SVC, LinearSVC
        import sklearn.metrics as metrics

        # =============================
        # ClassifierChain
        # =============================
        from sklearn.multiclass import OneVsRestClassifier
        # from sklearn.multioutput import ClassifierChain
        from sklearn.linear_model import LogisticRegression

        # cc = ClassifierChain(LogisticRegression())
        self.cc = ClassifierChain(LinearSVC())
        self.cc.fit(self.train_data, self.train_labels)
        # y_pred = self.cc.predict(self.test_data)
        # cc_art_f1 = metrics.f1_score(self.test_labels, y_pred, average='micro')

        # # initialize Classifier Chain multi-label classifier
        # # with an SVM classifier
        # # SVM in scikit only supports the X matrix in sparse representation
        # classifier = ClassifierChain(
        #     classifier=SVC(),
        #     require_dense=[False, True]
        # )
        # # train
        # classifier.fit(self.train_data, self.train_labels)
        # # predict
        # predictions = classifier.predict(self.test_data)
        # print(predictions)
        # art_f1 = metrics.f1_score(self.test_labels, predictions, average='macro')
        # return art_f1

        # =============================
        # KNeighborsClassifier
        # =============================
        from sklearn.neighbors import KNeighborsClassifier
        knc = KNeighborsClassifier()
        knc.fit(self.train_data, self.train_labels)
        # Y_pred = knc.predict(self.test_data)
        # knc_art_f1 = metrics.f1_score(self.test_labels, Y_pred, average='micro')

        # =============================
        # SGDClassifier
        # =============================
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.linear_model import SGDClassifier
        sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                            random_state=0, max_iter=6, tol=None)
        clf = OneVsRestClassifier(sgd)
        clf.fit(self.train_data, self.train_labels)
        # y_pred = clf.predict(self.test_data)
        # sgd_art_f1 = metrics.f1_score(self.test_labels, y_pred, average='micro')
        # return cc_art_f1, knc_art_f1, sgd_art_f1

    def pred_all_other(self, input_data):
        y_pred = self.cc.predict(input_data)
        return y_pred
x_train = x_train['Documentation Text'].tolist()
x_test = x_test['Documentation Text'].tolist()
y_train = y_train.values
y_test = y_test.values

# n-gram
# tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')
# tfidf = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1, 1), norm='l2')
tfidf = CountVectorizer()
tfidf.fit(x_train)
x_train = tfidf.transform(x_train)
x_test = tfidf.transform(x_test)

# train
# classifier = BinaryRelevance(GaussianNB())
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)
y_pred = []
for i in predictions:
    y_pred.append(list(i.A[0]))
# print(y_pred)

y_test = y_test.tolist()
# print(len(y_test))

y_pred_dataframe = pd.DataFrame(y_pred, columns=categories)
y_test_dataframe = pd.DataFrame(y_test, columns=categories)
# print(len(y_pred_dataframe))
this_pred_list = y_pred_dataframe[category].tolist()
class ArticleClassifier(ClassifierMixin):
    def __init__(self, ngram=(1, 3), tokenizer=prepareText, max_feature=20000):
        """
        A multi-label classifier trained on the octo-articles dataset.
        You can train it using the fit function.

        :parameter
        ----------
        :param ngram {tuple}: default '(1,3)', ngram_range for the TfidfVectorizer
        :param tokenizer {func}: tokenizer used by the TfidfVectorizer to prepare the data
        :param max_feature {int}: limit the matrix to the 'max_feature' most important elements
        """
        self.vectorizer_ = TfidfVectorizer(strip_accents='unicode',
                                           analyzer='word',
                                           ngram_range=ngram,
                                           norm='l2',
                                           tokenizer=tokenizer,
                                           max_features=max_feature)

    def fit(self, X, y):
        """
        Fit the model to the data and train the classifier.
        Note: you should use zodiac.cleaner.TextCleaner on all texts before fitting.

        :parameter
        ----------
        :param X: (list) list of clean texts (you can use zodiac.cleaner.TextCleaner)
        :param y: (numpy.array) array of labels
        """
        self.x_vec_ = self.vectorizer_.fit_transform(X)
        # initialize classifier chains multi-label classifier
        self.classifier_ = ClassifierChain(SVC(probability=True))
        # train the SVC chain on the training data
        self.classifier_.fit(self.x_vec_, y)

    def score(self, X, y, average='samples', threshold=0.5):
        """
        Compute the Jaccard score using the given parameters.

        :parameter
        -----------
        :param X (list): list of texts
        :param y (list): text labels
        :param average: default 'samples'
        :return:
        -------
        score : float
            Jaccard score
        """
        self.x_test_vec_ = self.vectorizer_.transform(X)
        predictions = self.classifier_.predict_proba(self.x_test_vec_)
        score = jaccard_score(y, predictions >= threshold, average=average)
        return score

    def show_stats(self, x_test, y):
        """
        Compute the Jaccard score for different thresholds and display the
        scores with a plotly scatter plot.

        :parameter
        ----------
        :param x_test: (list) list of texts
        :param y: list of labels
        """
        thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        x_test_vec = self.vectorizer_.transform(x_test)
        predictions_probas = self.classifier_.predict_proba(x_test_vec)
        jaccard_scores = []
        for threshold in thresholds:
            # print("For threshold: ", val)
            pred = predictions_probas.copy()
            ensemble_jaccard_score = jaccard_score(
                y, predictions_probas >= threshold, average='samples')
            jaccard_scores.append(ensemble_jaccard_score)
        self.jaccard_scores_threshold_df_ = pd.DataFrame({
            'threshold': thresholds,
            'jaccard_score': jaccard_scores
        })
        return px.scatter(self.jaccard_scores_threshold_df_,
                          x='threshold',
                          y='jaccard_score',
                          color='threshold',
                          title='Jaccard score depending on threshold')

    def load_weights(self, path):
        """
        Load the model weights from path.

        :parameter
        ----------
        :param path {str}: path to the model weights
        """
        self.classifier_ = joblib.load(path)

    def save_weights(self, path):
        """
        Save the model weights locally.

        :parameter
        ----------
        :param path {str}: path to the directory in which to store the classifier weights
        """
        joblib.dump(self.classifier_, path)