def test_classifier_chain_fit_and_predict_with_sparse_data():
    """Predictions from a chain fit on sparse input must match dense input."""
    X, Y = generate_multilabel_dataset_with_correlations()
    X_sparse = sp.csr_matrix(X)

    # Train one chain on the sparse matrix ...
    chain_sparse = ClassifierChain(LogisticRegression())
    chain_sparse.fit(X_sparse, Y)
    pred_sparse = chain_sparse.predict(X_sparse)

    # ... and an identically configured chain on the dense array.
    chain_dense = ClassifierChain(LogisticRegression())
    chain_dense.fit(X, Y)
    pred_dense = chain_dense.predict(X)

    assert_array_equal(pred_sparse, pred_dense)
def train_model(model, df):
    """Train a multilabel model on the 'notes*' columns of ``df``.

    model = 'multi' or 'chain'

    'multi' -> sklearn MultiOutputClassifier backed by a Random Forest.
    'chain' -> sklearn ClassifierChain backed by a Random Forest, chained
               in a fixed note-attribute order.

    Returns (fitted_model, label_column_order).
    """
    X = df.iloc[:, 0:13]
    note_columns = [c for c in df.columns if str(c).startswith('notes')]
    y = df[note_columns]

    if model == 'multi':
        multi = MultiOutputClassifier(RandomForestClassifier()).fit(X, y)
        return (multi, list(y.columns))
    elif model == 'chain':
        # Map each label column name to its positional index in y.
        columns = {value: index for index, value in enumerate(y.columns)}
        constant = [
            'notes_type_0', 'notes_lineIndex_0', 'notes_lineLayer_0',
            'notes_cutDirection_0', 'notes_type_1', 'notes_lineIndex_1',
            'notes_lineLayer_1', 'notes_cutDirection_1', 'notes_type_3',
            'notes_lineIndex_3', 'notes_lineLayer_3', 'notes_cutDirection_3'
        ]
        # Translate the desired chain order from names to column indices.
        order = [columns[name] for name in constant]
        chain = ClassifierChain(RandomForestClassifier(), order=order).fit(X, y)
        return (chain, constant)
def test_randForest(df, truth, eval_type):
    """Cross-validate every RandomForest hyper-parameter combination
    wrapped in a ClassifierChain.

    Parameters
    ----------
    df : DataFrame of input features (``df.values`` is used).
    truth : array-like multilabel ground truth.
    eval_type : scoring string forwarded to cross_val_score.

    Returns
    -------
    (keys, results) : parallel lists of combination labels and CV score arrays.
    """
    param_randForest = {
        'n_estimators': [10, 50, 100, 200],
        'max_depth': [None, 50, 80, 100],
        'max_features': ['auto', 'sqrt', 'log2'],
        'criterion': ['gini', 'entropy']
    }
    combinations_randForest = it.product(
        *(param_randForest[name] for name in param_randForest))

    # BUG FIX: KFold with a random_state requires shuffle=True — recent
    # scikit-learn raises ValueError otherwise.  The split is also invariant
    # across combinations, so build it once outside the loop.
    kfold = KFold(n_splits=10, shuffle=True, random_state=26)

    # Test the combinations for Random Forest with cross validation
    results_randForest = []
    keys = []
    for values in combinations_randForest:
        key = "RF" + "-".join([str(item) for item in values])
        clf = RandomForestClassifier(n_estimators=values[0],
                                     max_depth=values[1],
                                     max_features=values[2],
                                     criterion=values[3])
        classifier = ClassifierChain(clf)
        scores = cross_val_score(classifier, df.values, truth,
                                 cv=kfold, scoring=eval_type)
        keys.append(key)
        results_randForest.append(scores)
        msg = "%s: %f (%f)" % (key, scores.mean(), scores.std())
        print(msg)
    return keys, results_randForest
def test_AdaBoost(df, truth, eval_type):
    """Cross-validate AdaBoost n_estimators settings wrapped in a
    ClassifierChain.

    Returns (keys, results): parallel lists of combination labels and
    CV score arrays.
    """
    param_adaBoost = {'n_estimators': [50, 100]}
    combinations_adaBoost = it.product(
        *(param_adaBoost[name] for name in param_adaBoost))

    # BUG FIX: KFold with a random_state requires shuffle=True — recent
    # scikit-learn raises ValueError otherwise.  Hoisted out of the loop
    # since the split does not depend on the combination.
    kfold = KFold(n_splits=10, shuffle=True, random_state=26)

    # Test the combinations for AdaBoost with cross validation
    results_adaBoost = []
    keys = []
    for values in combinations_adaBoost:
        key = "ADA" + "-".join([str(item) for item in values])
        clf = AdaBoostClassifier(n_estimators=values[0])
        classifier = ClassifierChain(clf)
        scores = cross_val_score(classifier, df.values, truth,
                                 cv=kfold, scoring=eval_type)
        keys.append(key)
        results_adaBoost.append(scores)
        msg = "%s: %f (%f)" % (key, scores.mean(), scores.std())
        print(msg)
    return keys, results_adaBoost
def train_and_pred(dictTrainMats, Trainlabel, dictTestMats, lian):
    # Two-stage stacked multilabel model:
    #   stage 1: `lian` randomly ordered classifier chains (each wrapping a
    #            one-vs-rest extra-trees model) predict the labels;
    #   stage 2: those predictions are concatenated column-wise into a new
    #            feature matrix fed to a second one-vs-rest extra-trees model.
    # NOTE(review): the `i % 8` cycling assumes dictTrainMats/dictTestMats are
    # dicts keyed 0..7 — confirm with the caller.
    chain = OneVsRestClassifier(ExtraTreesClassifier(bootstrap=True, n_estimators=120), n_jobs=8)
    chains = [ClassifierChain(chain, order="random") for i in range(lian)]
    # Stage-2 meta model, trained on the chains' stacked predictions.
    model = OneVsRestClassifier(ExtraTreesClassifier(bootstrap=True, n_estimators=200), n_jobs=8)
    fea_train = np.array([])
    fea_test = np.array([])
    for i in range(lian):
        # Cycle through the 8 available feature matrices.
        X_train, X_test = dictTrainMats[i % 8], dictTestMats[i % 8]
        clf = chains[i]
        clf.fit(X_train, Trainlabel)
        y_pred = clf.predict(X_test)
        if i == 0:
            # First chain seeds the stacked feature matrices.
            fea_train = clf.predict(X_train)
            fea_test = y_pred
        else:
            # Later chains append their predictions as extra columns.
            fea_train = np.hstack([fea_train, clf.predict(X_train)])
            fea_test = np.hstack([fea_test, y_pred])
        # Progress: cumulative stacked shapes after each chain.
        print(fea_train.shape, fea_test.shape)
    model.fit(fea_train, Trainlabel)
    y_pred = model.predict(fea_test)
    print(y_pred.shape)
    # Persist the final test-set predictions.
    save_tmp(y_pred, "./data/mlamp_train_710Test.pickle")
def test_svm(df, truth, eval_type):
    """Cross-validate SVM hyper-parameter combinations wrapped in a
    ClassifierChain.

    A wider grid (C in {1, 10, 100, 1000}, kernel in {linear, rbf},
    gamma in {auto, scale}) was previously explored; the current grid keeps
    only the chosen combination.

    Returns (keys, results): parallel lists of combination labels and
    CV score arrays.
    """
    param_svm = {'C': [100], 'kernel': ['linear'], 'gamma': ['auto']}
    combinations_svm = it.product(*(param_svm[name] for name in param_svm))

    # BUG FIX: KFold with a random_state requires shuffle=True — recent
    # scikit-learn raises ValueError otherwise.  Hoisted out of the loop.
    kfold = KFold(n_splits=10, shuffle=True, random_state=26)

    # Test the combinations for SVM with cross validation
    results_svm = []
    keys = []
    for values in combinations_svm:
        key = "SVM" + "-".join([str(item) for item in values])
        clf = svm.SVC(C=values[0], kernel=values[1], gamma=values[2])
        classifier = ClassifierChain(clf)
        scores = cross_val_score(classifier, df.values, truth,
                                 cv=kfold, scoring=eval_type)
        keys.append(key)
        results_svm.append(scores)
        msg = "%s: %f (%f)" % (key, scores.mean(), scores.std())
        print(msg)
    return keys, results_svm
def train_baseline(ds_name, train_input, train_labels):
    """Grid-search two baselines: a random-order LogisticRegression
    classifier chain (tuned over the chain's random seed) and a random
    forest (tuned over size/depth).

    Returns
    -------
    Pair of ("name", best_estimator) tuples for the chain and the forest.
    """
    tuned_params = {"random_state": [i for i in np.arange(10)]}
    base_lr = LogisticRegression(C=1, max_iter=500, fit_intercept=True,
                                 tol=1e-15, class_weight="balanced")
    gs_chain = GridSearchCV(ClassifierChain(base_lr, order="random"),
                            tuned_params, cv=3, scoring=scorer)
    gs_chain.fit(train_input, train_labels)
    print("best order according to grid search is %s" % gs_chain.best_estimator_.order_)
    # BUG FIX: this line printed the best *score* but labelled it "best order".
    print("best score according to grid search is %s" % gs_chain.best_score_)

    from sklearn.ensemble import RandomForestClassifier
    tuned_params = {
        "n_estimators": [i for i in np.arange(1, 100, 10)],
        "max_depth": [i for i in np.arange(1, 100, 10)],
    }
    gs_forest = GridSearchCV(RandomForestClassifier(random_state=1),
                             tuned_params, cv=3, scoring=scorer)
    gs_forest.fit(train_input, train_labels)
    return (
        ("classifier_chain", gs_chain.best_estimator_),
        ("random_forest", gs_forest.best_estimator_),
    )
def calc_Fitness(train_d):
    """Score a logistic-regression classifier chain trained on TF-IDF text.

    Fitness is the mean of accuracy, (1 - Hamming loss) and weighted
    precision, evaluated on the global `test` set.
    """
    vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word',
                                 ngram_range=(1, 3), norm='l2')
    x_train = vectorizer.fit_transform(train_d.comment_text)
    y_train = train_d.drop(labels=['id', 'comment_text'], axis=1)
    x_test = vectorizer.transform(test.comment_text)
    y_test = test.drop(labels=['id', 'comment_text'], axis=1)

    # using classifier chains
    from sklearn.multioutput import ClassifierChain
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, hamming_loss, precision_score

    # initialize classifier chains multi-label classifier and train it
    chain = ClassifierChain(LogisticRegression())
    chain.fit(x_train, y_train)
    predictions = chain.predict(x_test)

    # Average three complementary multilabel quality measures.
    quality = (accuracy_score(y_test, predictions)
               + (1 - hamming_loss(y_test, predictions))
               + precision_score(y_test, predictions, average='weighted')) / 3
    return quality
def CommonFunction(est, Xtrain, Ytrain, Xtest, parameters):
    """Grid-search `est` wrapped in a ClassifierChain and return the
    predictions for Xtest."""
    n_cv = 5  # Number of cross-validations
    # multi_est=MultiOutputClassifier(est)
    multi_est = ClassifierChain(est)
    n_trainsamples, n_decisions = Ytrain.shape

    # Total size of the hyper-parameter grid.
    tparameters = 1
    for values in parameters.values():
        tparameters *= len(values)
    print('Total number of hyperparameter combinations to be tested is', str(tparameters))

    def ScoreFunction(decision, Y):  # Scoring function
        # Percentage of correct individual decisions, scaled by n_cv.
        return round(
            numpy.sum(decision == Y) / (n_trainsamples * n_decisions) * 100 * n_cv, 2)

    score = make_scorer(ScoreFunction, greater_is_better=True)
    multi_est_GS = dcv.GridSearchCV(multi_est, param_grid=parameters,
                                    scoring=score, cv=n_cv,
                                    n_jobs=-1).fit(Xtrain, Ytrain)
    decision = multi_est_GS.predict(Xtest)
    printer(multi_est_GS.best_score_, multi_est_GS.best_params_, tparameters)
    return decision
def build_model():
    """Build (but do not fit) the full classification pipeline.

    TF-IDF text features feed an ensemble of five random-order RandomForest
    classifier chains, stacked under a multi-output AdaBoost meta
    classifier, all wrapped in a GridSearchCV over the vectorizer's
    n-gram range.

    Returns
    -------
    GridSearchCV ready to be fitted.
    """
    print('=============================')
    print('Building Model:')
    print('-----------------------------')

    # Base learners: five classifier chains with random label orders,
    # later fed to the meta classifier.
    print('Creating ClassifierChains...')
    chains = []
    for _ in range(5):
        chains.append(ClassifierChain(
            base_estimator=RandomForestClassifier(n_estimators=100),
            order='random',
            random_state=42))

    # The meta classifier learns how to weigh each chain's label outputs.
    print('Adding Meta Classifier...')
    meta_clf = MultiOutputClassifier(AdaBoostClassifier())

    # Stack the base learners under the meta classifier.
    print('Stacking Meta Classifier on top of ClassifierChains...')
    sclf = StackingClassifier(classifiers=chains, meta_classifier=meta_clf)

    # Final pipeline: text features -> stacked ensemble.
    print('Building Pipeline...')
    text_pipeline = Pipeline([
        ('tfidf_vect', TfidfVectorizer(tokenizer=tokenize)),
    ])
    pipeline = Pipeline([
        ('features', FeatureUnion([('text_pipeline', text_pipeline)])),
        ('sclf', sclf),
    ])

    parameters = {
        'features__text_pipeline__tfidf_vect__ngram_range': ((1, 2), (1, 10))
    }

    print('Initializing GridSearchCV...')
    return GridSearchCV(pipeline, param_grid=parameters, cv=5)
def test_classifier_chain_fit_and_predict_with_sparse_data_and_cv():
    """A chain using internal cross_val_predict must accept sparse input."""
    X, Y = generate_multilabel_dataset_with_correlations()
    sparse_X = sp.csr_matrix(X)

    chain = ClassifierChain(LogisticRegression(), cv=3)
    chain.fit(sparse_X, Y)

    predictions = chain.predict(sparse_X)
    assert_equal(predictions.shape, Y.shape)
def chain_classifiers(x, y):
    """Compare an independent one-vs-rest baseline against ten classifier
    chains and their averaged ensemble, plot per-model Jaccard scores, and
    return the last fitted chain.
    """
    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    # Baseline: fully independent per-label logistic regressions.
    base_lr = LogisticRegression()
    ovr = OneVsRestClassifier(base_lr)
    ovr.fit(x_train, y_train)
    y_pred_ovr = ovr.predict(x_test)
    from sklearn.metrics import jaccard_score
    ovr_jaccard_score = jaccard_score(y_test, y_pred_ovr, average='samples')

    # Ten chains, each with a different random label ordering.
    from sklearn.multioutput import ClassifierChain
    chains = [
        ClassifierChain(base_lr, order='random', random_state=i)
        for i in range(10)
    ]
    for chain in chains:
        chain.fit(x_train, y_train)
    y_pred_chains = np.array([chain.predict(x_test) for chain in chains])
    chain_jaccard_scores = [
        jaccard_score(y_test, y_pred_chain >= 0.5, average='samples')
        for y_pred_chain in y_pred_chains
    ]

    # Ensemble: average the chains' predictions, then threshold at 0.5.
    y_pred_ensemble = y_pred_chains.mean(axis=0)
    ensemble_jaccard_score = jaccard_score(y_test,
                                           y_pred_ensemble >= 0.5,
                                           average='samples')

    model_scores = [ovr_jaccard_score] + chain_jaccard_scores
    model_scores.append(ensemble_jaccard_score)
    model_names = ('Independent', 'Chain 1', 'Chain 2', 'Chain 3', 'Chain 4',
                   'Chain 5', 'Chain 6', 'Chain 7', 'Chain 8', 'Chain 9',
                   'Chain 10', 'Ensemble')
    x_pos = np.arange(len(model_names))

    # Plot the Jaccard similarity scores for the independent model, each of the
    # chains, and the ensemble (note that the vertical axis on this plot does
    # not begin at 0).
    fig, ax = plt.subplots(figsize=(7, 4))
    ax.grid(True)
    ax.set_title('Classifier Chain Ensemble Performance Comparison')
    ax.set_xticks(x_pos)
    ax.set_xticklabels(model_names, rotation='vertical')
    ax.set_ylabel('Jaccard Similarity Score')
    ax.set_ylim([min(model_scores) * .9, max(model_scores) * 1.1])
    # Red = independent baseline, blue = individual chains, green = ensemble.
    colors = ['r'] + ['b'] * len(chain_jaccard_scores) + ['g']
    ax.bar(x_pos, model_scores, alpha=0.5, color=colors)
    plt.tight_layout()
    plt.show()
    return chains[-1]
def cc():
    """Train an ensemble of ten randomly ordered LogisticRegression
    classifier chains on DNN features and report per-chain log-likelihood
    on the test split plus the ensemble mean.

    BUG FIXES: converted Python-2 `print` statements to Python 3 calls, and
    replaced the obscene (and syntactically invalid) `f**k` counter with an
    enumerate-driven loop.
    """
    print('reading npy...')
    data = np.load('../data/1st.npy')
    feature_data = np.load('dnn_feature.npy')
    train_order = np.load('../data/train.npy')
    validation_order = np.load('../data/validation.npy')
    test_order = np.load('../data/test.npy')
    train_nlcd = get_data.get_feature(feature_data, train_order)
    train_label = get_data.get_label(data, train_order)
    test_nlcd = get_data.get_feature(feature_data, test_order)
    test_label = get_data.get_label(data, test_order)

    print('chaining')
    # Fit an ensemble of logistic regression classifier chains and take the
    # average prediction of all the chains.
    chains = [
        ClassifierChain(LogisticRegression(), order='random', random_state=i)
        for i in range(10)
    ]
    for index, chain in enumerate(chains):
        print(index + 1)  # 1-based progress counter
        chain.fit(train_nlcd, train_label)

    print('testing')
    scores = []
    for chain in chains:
        pre = chain.predict_proba(test_nlcd)
        chain_score = log_likelihood(test_label, pre)
        print(chain_score)
        scores.append(chain_score)
    scores = np.array(scores)
    print('mean:')
    print(np.mean(scores))
def test_classifier_chain_tuple_invalid_order():
    """An out-of-range tuple `order` must raise ValueError at fit time."""
    X = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [2, 3], [3, 2]]

    # Label indices 1 and 2 — index 2 does not exist for two labels.
    chain = ClassifierChain(RandomForestClassifier(), order=(1, 2))
    with pytest.raises(ValueError, match='invalid order'):
        chain.fit(X, y)
def test_classifier_chain_crossval_fit_and_predict():
    """A chain trained via cross_val_predict must predict with decent
    quality and differ from a plain (non-CV) chain."""
    X, Y = generate_multilabel_dataset_with_correlations()

    chain_cv = ClassifierChain(LogisticRegression(), cv=3)
    chain_cv.fit(X, Y)
    chain_plain = ClassifierChain(LogisticRegression())
    chain_plain.fit(X, Y)

    pred_cv = chain_cv.predict(X)
    pred_plain = chain_plain.predict(X)

    assert_equal(pred_cv.shape, Y.shape)
    assert_greater(jaccard_similarity_score(Y, pred_cv), 0.4)
    assert_not_equal(jaccard_similarity_score(Y, pred_cv),
                     jaccard_similarity_score(Y, pred_plain))
def _set_estimators_reset_fitted(self):
    """Rebuild ``estimators_`` as ``k_`` fresh random-order chains, reseed
    them, and mark the ensemble as not fitted."""
    fresh = []
    for _ in range(self.k_):
        fresh.append(ClassifierChain(clone(self.base_estimator),
                                     order="random",
                                     cv=self.cv,
                                     random_state=None))
    self.estimators_ = fresh
    self._set_random_state_of_estimators()
    self.fitted_ = False
def test_base_chain_fit_and_predict_with_sparse_data_and_cv():
    """Both chain flavours must handle sparse input when cv is enabled."""
    X, Y = generate_multilabel_dataset_with_correlations()
    sparse_X = sp.csr_matrix(X)

    for chain in (ClassifierChain(LogisticRegression(), cv=3),
                  RegressorChain(Ridge(), cv=3)):
        chain.fit(sparse_X, Y)
        predictions = chain.predict(sparse_X)
        assert_equal(predictions.shape, Y.shape)
def test_naiveBayes(df, truth, eval_type):
    """Cross-validate a Multinomial Naive Bayes classifier chain.

    Returns (["NB"], [scores]) in the same shape as the other test_*
    helpers so results can be aggregated uniformly.
    """
    classifier = ClassifierChain(MultinomialNB())
    # BUG FIX: KFold with a random_state requires shuffle=True — recent
    # scikit-learn raises ValueError otherwise.
    kfold = KFold(n_splits=10, shuffle=True, random_state=26)
    scores = cross_val_score(classifier, df.values, truth,
                             cv=kfold, scoring=eval_type)
    return ["NB"], [scores]
def test_chainclassifier(implementation):
    """A saved-and-reloaded ClassifierChain must predict identically."""
    name = "test_ls_cc"
    features, labels = make_multilabel_classification()
    x_train, x_test, y_train, y_test = train_test_split(features, labels)

    original = ClassifierChain(LinearSVC())
    original.fit(x_train, y_train)

    implementation.save(original, name)
    restored = implementation.load(name)

    expected = original.predict(x_test)
    got = restored.predict(x_test)
    assert_array_equal(got, expected)
def test_classifier_chain_tuple_order(order_type):
    """A chain fit with an explicit order container reproduces the labels."""
    X = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [2, 3], [3, 2]]

    chain = ClassifierChain(RandomForestClassifier(),
                            order=order_type([1, 0]))
    chain.fit(X, y)

    # The third training sample should be recovered exactly.
    assert_array_almost_equal(chain.predict([[1.5, 2.5, 3.5]]), [[3, 2]])
def test_best_AdaBoost(df, truth, eval_type):
    """Cross-validate the best AdaBoost configuration (n_estimators=50)
    wrapped in a ClassifierChain and return the per-fold scores.
    """
    clf = AdaBoostClassifier(n_estimators=50)
    classifier = ClassifierChain(clf)
    # BUG FIX: KFold with a random_state requires shuffle=True — recent
    # scikit-learn raises ValueError otherwise.
    kfold = KFold(n_splits=10, shuffle=True, random_state=26)
    print("Start crossvalidation...")
    scores = cross_val_score(classifier, df.values, truth,
                             cv=kfold, scoring=eval_type)
    print(f"Crossvalidation done. Mean: {np.mean(scores)}")
    return scores
def run(classifier, train_test_set):
    """Wrap `classifier` in a random-order ClassifierChain, fit on the
    train split and predict the test split.

    Returns (y_test, y_pred) for downstream evaluation.
    """
    X_train, X_test, y_train, y_test = train_test_set

    model = ClassifierChain(classifier, order='random', random_state=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print('\n--------Classifier chains with {:}'.format(classifier))
    return y_test, y_pred
def fit(self, train_x, train_y):
    """Fit an ensemble of decision-tree classifier chains, each with its
    own random label ordering.

    Returns self (sklearn fit convention).
    """
    self._estimators = []
    self._feature_number = train_y.shape[1]
    for _ in range(self._no_of_estimators):
        # BUG FIX: the original drew one random order for printing and a
        # *different* one for the estimator, so the logged order never
        # matched the order actually used.  Draw once and reuse it.
        order = random.sample(range(0, self._feature_number),
                              self._feature_number)
        print(order)
        estimator = ClassifierChain(DecisionTreeClassifier(), order=order)
        estimator.fit(train_x, train_y)
        self._estimators.append(estimator)
    return self
def chaining_svm(X, Y, max_iter=-1):
    """Sweep the SVM regularization parameter C for a random-order
    classifier chain, print and pickle the metrics, and plot each metric
    against C on a log scale.

    Parameters
    ----------
    X, Y : feature matrix and multilabel target.
    max_iter : iteration cap passed to each SVC (-1 = no limit).
    """
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2,
                                                        random_state=0)
    # 30 candidate C values, log-spaced between 1e-2 and 1e10.
    Cs = np.logspace(-2, 10, 30)
    res = []
    print(f'Trying Cs: {Cs}')
    # BUG FIX: the header previously listed precision before recall, but the
    # rows printed below emit recall before precision.
    print('C \t accuracy \t f1 \t recall \t precision')
    for C in Cs:
        base_clf = SVC(C=C, kernel='rbf', max_iter=max_iter)
        chain = ClassifierChain(base_clf, cv=2, order='random', random_state=0)
        chain.fit(X_train, Y_train)
        y_pred = chain.predict(X_test)
        # Stored layout: [[accuracy, f1, recall, precision], C].
        res.append([[
            get_accuracy(Y_test, y_pred),
            get_f1(Y_test, y_pred),
            get_recall(Y_test, y_pred),
            get_precision(Y_test, y_pred)
        ], C])
        print(
            f'{C}\t{get_accuracy(Y_test, y_pred)}\t{get_f1(Y_test, y_pred)}\t{get_recall(Y_test, y_pred)}\t{get_precision(Y_test, y_pred)}'
        )

    store_data_as_pickle(res, f'svm-chain-logscale-values')

    # Reshape into (value, C) pairs per metric for reporting/plotting.
    acc = np.asarray([[a[0][0], a[1]] for a in res])
    f1 = np.asarray([[a[0][1], a[1]] for a in res])
    recall = np.asarray([[a[0][2], a[1]] for a in res])
    precision = np.asarray([[a[0][3], a[1]] for a in res])
    print("Max acc without question at default_dist: ",
          acc[np.argmax(acc[:, 0]), 1], " ", np.max(acc[:, 0]))
    print("Max f1 without question at default_dist: ",
          f1[np.argmax(f1[:, 0]), 1], " ", np.max(f1[:, 0]))
    print("Max recall without question at default_dist: ",
          recall[np.argmax(recall[:, 0]), 1], " ", np.max(recall[:, 0]))
    print("Max precision without question at default_dist: ",
          precision[np.argmax(precision[:, 0]), 1], " ", np.max(precision[:, 0]))

    plt.plot(acc[:, 1], acc[:, 0], label='Accuracy')
    plt.plot(f1[:, 1], f1[:, 0], label='F1-Score')
    plt.plot(recall[:, 1], recall[:, 0], label='Recall')
    plt.plot(precision[:, 1], precision[:, 0], label='Precision')
    plt.legend()
    plt.xscale('log')
    plt.xlabel("C regularization parameter")
    # NOTE(review): the chain uses cv=2 internally; "10 folds" in this title
    # looks stale — confirm the intended wording before changing user output.
    plt.title("SVM with ClassifierChain 10 folds")
    plt.show()
def test_classifier_chain_random_order():
    """A random-order chain must fit a genuine permutation and behave
    identically to a fixed-order chain built from that permutation."""
    X, Y = generate_multilabel_dataset_with_correlations()
    classifier_chain_random = ClassifierChain(LogisticRegression(),
                                              order='random',
                                              random_state=42)
    classifier_chain_random.fit(X, Y)
    Y_pred_random = classifier_chain_random.predict(X)

    # BUG FIX: the original asserted on `.order` — the literal string
    # 'random' — whose list of characters is trivially unequal to
    # range(4), making the check vacuous.  The fitted `order_` attribute
    # is what must differ from the identity order.
    assert_not_equal(list(classifier_chain_random.order_), list(range(4)))
    assert_equal(len(classifier_chain_random.order_), 4)
    assert_equal(len(set(classifier_chain_random.order_)), 4)

    classifier_chain_fixed = \
        ClassifierChain(LogisticRegression(),
                        order=classifier_chain_random.order_)
    classifier_chain_fixed.fit(X, Y)
    Y_pred_fixed = classifier_chain_fixed.predict(X)

    # Randomly ordered chain should behave identically to a fixed order chain
    # with the same order.
    assert_array_equal(Y_pred_random, Y_pred_fixed)
def chaining_adaboost(X, Y):
    """Fit one AdaBoost-based classifier chain and print its test metrics
    (accuracy, F1, recall, precision) tab-separated."""
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2,
                                                        random_state=0)

    booster = AdaBoostClassifier(algorithm="SAMME", n_estimators=200)
    chain = ClassifierChain(booster, cv=2, order='random', random_state=0)
    chain.fit(X_train, Y_train)

    predicted = chain.predict(X_test)
    print(
        f'{get_accuracy(Y_test, predicted)}\t{get_f1(Y_test, predicted)}\t{get_recall(Y_test, predicted)}\t{get_precision(Y_test, predicted)}'
    )
def XGBoostChain(X_train, y_train, X_test):
    """Average predicted probabilities over ten randomly ordered XGBoost
    classifier chains and print the ensemble matrix."""
    print("fitting the data")
    # Fitting X-Gradient boosting
    booster = xgb.XGBClassifier(objective="binary:logistic", random_state=42)

    chains = []
    for seed in range(10):
        chain = ClassifierChain(booster, order='random', random_state=seed)
        chain.fit(X_train, y_train)
        chains.append(chain)

    stacked = np.array([chain.predict_proba(X_test) for chain in chains])
    ensemble = stacked.mean(axis=0)
    print(ensemble)
def test_classifier_chain_fit_and_predict_with_linear_svc():
    """A LinearSVC chain exposes decision_function but no predict_proba,
    and thresholding decisions at zero reproduces predict()."""
    X, Y = generate_multilabel_dataset_with_correlations()
    chain = ClassifierChain(LinearSVC())
    chain.fit(X, Y)

    predictions = chain.predict(X)
    assert_equal(predictions.shape, Y.shape)

    decisions = chain.decision_function(X)
    assert_array_equal(decisions >= 0, predictions)

    assert not hasattr(chain, 'predict_proba')
def test_classifier_chain_fit_and_predict_with_logistic_regression():
    """Chain predictions are the thresholded probabilities, and each link
    is trained on one more feature than the previous."""
    X, Y = generate_multilabel_dataset_with_correlations()
    chain = ClassifierChain(LogisticRegression())
    chain.fit(X, Y)

    predictions = chain.predict(X)
    assert_equal(predictions.shape, Y.shape)

    probabilities = chain.predict_proba(X)
    assert_array_equal(probabilities >= .5, predictions)

    # Estimator i sees the original features plus the i previous labels.
    expected_sizes = list(range(X.shape[1], X.shape[1] + Y.shape[1]))
    assert_equal([est.coef_.size for est in chain.estimators_],
                 expected_sizes)
def test_best_rf(df, truth, eval_type):
    """Cross-validate the best Random Forest configuration wrapped in a
    ClassifierChain and return the per-fold scores.
    """
    clf = RandomForestClassifier(n_estimators=200, max_depth=50,
                                 max_features='auto', criterion='entropy')
    classifier = ClassifierChain(clf)
    # BUG FIX: KFold with a random_state requires shuffle=True — recent
    # scikit-learn raises ValueError otherwise.
    kfold = KFold(n_splits=10, shuffle=True, random_state=26)
    print("Start crossvalidation...")
    scores = cross_val_score(classifier, df.values, truth,
                             cv=kfold, scoring=eval_type)
    print(f"Crossvalidation done. Mean: {np.mean(scores)}")
    return scores