import numpy as np
import matplotlib.pyplot as plt
import joblib

from sklearn import preprocessing, svm
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import (average_precision_score, brier_score_loss,
                             f1_score, precision_recall_curve,
                             precision_score, recall_score, roc_auc_score)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB


class Classifier(object):
    """This is a classifier for codon reassignment"""

    def __init__(self, method, classifier_spec=None, scale=False,
                 n_estimators=1000):
        # avoid a mutable default argument
        classifier_spec = classifier_spec or {}
        if method == 'rf':
            self.clf = RandomForestClassifier(
                n_estimators=n_estimators, n_jobs=-1,
                max_leaf_nodes=1000, **classifier_spec)
        elif method == 'svc':
            self.clf = svm.SVC(probability=True, **classifier_spec)
        elif method == 'etc':
            self.clf = ExtraTreesClassifier(
                n_estimators=n_estimators, **classifier_spec)
        elif method == 'gnb':
            self.clf = GaussianNB()
        else:
            raise NotImplementedError(
                "The method you chose (%s) is not implemented" % method)
        self.method = method
        self.trained = False
        self.scale = scale

    @classmethod
    def load_from_file(cls, loadfile):
        """Load model from a file"""
        try:
            clf = joblib.load(loadfile)
            return clf
        except IOError:
            print('Problem with file %s, cannot open it' % loadfile)
        except Exception as e:
            raise e
        return None

    def save_model(self, outfile):
        """Save model to a file"""
        joblib.dump(self, outfile)

    def train(self, X=None, Y=None):
        """Train the model"""
        if self.scale:
            X = preprocessing.scale(X)
        self.clf.fit(X, Y)
        self.X = X
        self.y = Y
        self.trained = True

    @classmethod
    def from_classifier(cls, clfier):
        """Build a new Classifier from an existing instance"""
        newclf = cls(clfier.method, {}, clfier.scale)
        newclf.__dict__.update(clfier.__dict__)
        return newclf

    def get_score(self, X, Y):
        """Return score for classification on X"""
        if self.scale:
            X = preprocessing.scale(X)
        return self.clf.score(X, Y)

    def predict(self, X):
        """Predict values for X"""
        if not self.trained:
            raise ValueError("Classifier is not trained")
        if self.scale:
            X = preprocessing.scale(X)
        return self.clf.predict(X)

    def predict_proba(self, X):
        """Return probability for each class prediction"""
        return self.clf.predict_proba(X)

    def feature_importance(self, outfile="importance.png", features_list=()):
        """Show each feature importance"""
        if self.method in ['rf', 'etc']:
            importances = self.clf.feature_importances_
            if len(features_list) > 0 and len(features_list) != len(importances):
                raise ValueError("Number of features does not fit!")
            indices = np.argsort(importances)[::-1]
            # use the number of importances, so the plot also works when
            # no feature names were given (the original used
            # len(features_list), which is 0 in that case)
            n_feats = len(importances)
            np.savetxt(outfile + ".txt",
                       np.array([tree.feature_importances_
                                 for tree in self.clf.estimators_]),
                       delimiter=',', fmt='%1.3e')
            std = np.std([tree.feature_importances_
                          for tree in self.clf.estimators_], axis=0)
            plt.figure()
            plt.title("Feature importances")
            plt.bar(range(n_feats), importances[indices], width=0.5,
                    color="b", yerr=std[indices], align="center")
            if len(features_list) > 0:
                features_list = np.asarray(features_list)[indices]
                plt.xticks(range(n_feats), features_list, rotation='vertical')
            plt.xlim([-1, n_feats])
            plt.margins(0.2)
            plt.subplots_adjust(bottom=0.15)
            plt.savefig(outfile, bbox_inches='tight')
        else:
            raise NotImplementedError(
                "Not supported for classifiers other than ensemble trees")

    def cross_validation(self, X, Y, X_test=None, Y_test=None, tsize=0.3):
        """Cross validation on X and Y, using a sub sample"""
        if X_test is None or Y_test is None:
            X_train, X_test, Y_train, Y_test = train_test_split(
                X, Y, test_size=tsize)
        else:
            X_train = X
            Y_train = Y
        self.train(X_train, Y_train)
        Y_predicted = self.predict(X_test)
        self.get_stat(X_test, Y_test)
        return Y_predicted, self.get_score(X_test, Y_test)

    def plot_precision_recall(self, X_test, y_test, infos="",
                              outfile="precision_recall.png"):
        """Plot precision-recall curve"""
        if self.trained:
            try:
                y_score = self.clf.decision_function(X_test)
            except AttributeError:
                y_score = self.clf.predict_proba(X_test)[:, 1]
            precision, recall, _ = precision_recall_curve(y_test, y_score)
            average_precision = average_precision_score(
                y_test, y_score, average="micro")
            # plot the precision-recall curve
            plt.clf()
            plt.plot(recall, precision,
                     label='Average Precision-recall curve (area = {0:0.2f})'
                     ''.format(average_precision))
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.title('Precision-Recall curve for %s (%s)'
                      % (self.method, infos))
            plt.legend(loc="lower right")
            plt.savefig(outfile)
        else:
            raise ValueError("Classifier is not trained")

    def get_stat(self, X_test, y_test):
        """Print a list of scores for the current classifier"""
        y_pred = self.predict(X_test)
        if hasattr(self.clf, "predict_proba"):
            prob_pos = self.clf.predict_proba(X_test)[:, 1]
        else:
            # use the decision function, rescaled to [0, 1]
            prob_pos = self.clf.decision_function(X_test)
            prob_pos = (prob_pos - prob_pos.min()) / \
                (prob_pos.max() - prob_pos.min())
        clf_score = brier_score_loss(y_test, prob_pos)
        print("%s:" % self.method)
        print("\tBrier: %1.3f" % clf_score)
        print("\tPrecision: %1.3f" % precision_score(y_test, y_pred))
        print("\tRecall: %1.3f" % recall_score(y_test, y_pred))
        print("\tF1: %1.3f" % f1_score(y_test, y_pred))
        print("\tROC AUC score: %1.3f\n" % roc_auc_score(y_test, prob_pos))
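
# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal end-to-end run of the Classifier wrapper above on synthetic
# binary data. make_classification and the "model.pkl" file name are
# assumptions of this sketch, not the original pipeline.
from sklearn.datasets import make_classification

X, Y = make_classification(n_samples=200, n_features=10, random_state=0)
model = Classifier('rf', n_estimators=100)
Y_pred, score = model.cross_validation(X, Y, tsize=0.3)  # also prints stats
print("held-out accuracy: %.3f" % score)
model.save_model("model.pkl")
reloaded = Classifier.load_from_file("model.pkl")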
# SVC basics: fit, predict, decision_function, and scoring.
# The training data below is an assumed minimal setup (the original snippet
# did not show how X, y, and clf were built); the labels 1 and 2 match the
# scores queried at the end.
from sklearn.svm import SVC

X = [[-1, -1], [-2, -1], [1, 1], [2, 1]]
y = [1, 1, 2, 2]
clf = SVC(kernel='linear')
clf.fit(X, y)
# fitted estimator, as echoed by the interactive session:
# SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
#     gamma='auto', kernel='linear', max_iter=-1, probability=False,
#     random_state=None, shrinking=True, tol=0.001, verbose=False)
print(clf.predict([[-0.8, -1]]))
print(clf.decision_function([[-0.8, -1]]))
print(clf.get_params())
print(clf.score([[-0.8, -1]], [1]))
print(clf.score([[-0.8, -1]], [2]))

########################################################################
########################################################################

import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.datasets import make_blobs
import mglearn

X, y = make_blobs(random_state=42)
linear_svm = LinearSVC().fit(X, y)

mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
plt.xlabel("Feature 0")
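
# --- Plausible continuation (illustrative) ---
# The snippet above stops mid-plot. A common way to finish this figure is
# to draw the one-vs-rest decision line of each fitted LinearSVC class from
# its coefficients; the exact original continuation is unknown, so this is
# a sketch.
import numpy as np

line = np.linspace(-15, 15)
for coef, intercept in zip(linear_svm.coef_, linear_svm.intercept_):
    # points where coef[0]*x0 + coef[1]*x1 + intercept == 0
    plt.plot(line, -(line * coef[0] + intercept) / coef[1])
plt.ylabel("Feature 1")
plt.show()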
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import label_binarize
from sklearn.svm import LinearSVC

# (fragment) -- this excerpt starts mid-statement inside a loop over a
# classifier index i; the surrounding context (skf, X, X_new, y, n_classes,
# the earlier branches) is not shown. The truncated constructor parameters
# (learning_rate_init, learning_rate='adaptive') suggest an MLPClassifier,
# which is assumed here.
elif i == 3:
    clf = MLPClassifier(learning_rate_init=1e-3, learning_rate='adaptive',
                        tol=1e-4, max_iter=200)
elif i == 4:
    clf = LinearSVC(penalty='l2', random_state=0, tol=1e-4)

skf_accuracy1 = []
skf_accuracy2 = []
for train, test in skf.split(X, y):
    clf.fit(X[train], y[train])
    if n_classes.size < 3:
        # binary task: micro-averaged ROC AUC on the positive-class score
        # (LinearSVC, i == 4, has no predict_proba, so fall back to its
        # decision function)
        skf_accuracy1.append(
            roc_auc_score(y[test],
                          clf.predict_proba(X[test])[:, 1]
                          if i != 4 else clf.decision_function(X[test]),
                          average='micro'))
        clf.fit(X_new[train], y[train])
        skf_accuracy2.append(
            roc_auc_score(y[test],
                          clf.predict_proba(X_new[test])[:, 1]
                          if i != 4 else clf.decision_function(X_new[test]),
                          average='micro'))
    else:
        # multiclass task: binarize the labels one-vs-rest first
        # (classes must be passed by keyword in recent scikit-learn)
        ytest_one_hot = label_binarize(y[test], classes=n_classes)
        skf_accuracy1.append(
            roc_auc_score(ytest_one_hot,
                          clf.predict_proba(X[test])
                          if i != 4 else clf.decision_function(X[test]),
                          average='micro'))
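
# --- Illustrative aside ---
# A compact, self-contained version of the multiclass scoring used above:
# binarize the labels one-vs-rest, then micro-average the ROC AUC over all
# (sample, class) decisions. All names here are local to this sketch.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize

X_demo, y_demo = load_iris(return_X_y=True)
Xtr, Xte, ytr, yte = train_test_split(X_demo, y_demo, random_state=0)
probs = LogisticRegression(max_iter=1000).fit(Xtr, ytr).predict_proba(Xte)
yte_one_hot = label_binarize(yte, classes=np.unique(y_demo))
print(roc_auc_score(yte_one_hot, probs, average='micro'))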
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB

# (fragment) -- the excerpt starts mid-script: the first clf and the
# NMF*/LSI* feature matrices come from earlier, unshown code.
svc_prediction = clf.predict(NMF1_test)
# GaussianNB has no decision_function; score with the positive-class
# probability (the original reused predict() here, which just duplicates
# svc_prediction and yields hard labels instead of scores)
svc_score = clf.predict_proba(NMF1_test)[:, 1]
evaluate(test_label, svc_score)

# min_df=5
clf = GaussianNB().fit(NMF2_train, train_label)
svc_prediction = clf.predict(NMF2_test)
svc_score = clf.predict_proba(NMF2_test)[:, 1]
evaluate(test_label, svc_score)

# (h)
print('(h)------------------------------------------')
print('Logistic Regression with min_df=2: \n')
clf = linear_model.LogisticRegression().fit(LSI1_train, train_label)
svc_prediction = clf.predict(LSI1_test)
svc_score = clf.decision_function(LSI1_test)
evaluate(test_label, svc_score)

print('Logistic Regression with min_df=5: \n')
clf = linear_model.LogisticRegression().fit(LSI2_train, train_label)
svc_prediction = clf.predict(LSI2_test)
svc_score = clf.decision_function(LSI2_test)
evaluate(test_label, svc_score)

# (i)
cross_list = []
for c in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:
    # l1-penalized logistic regression; note that recent scikit-learn needs
    # solver='liblinear' (or 'saga') for penalty='l1'. The original called
    # fit() twice here; once is enough.
    linear_svc = linear_model.LogisticRegression(penalty='l1', C=c).fit(
        LSI1_train, train_label)
    svc_prediction = linear_svc.predict(LSI1_test)
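
# --- Hypothetical helper (not in the original source) ---
# evaluate() is called above but never defined in this excerpt. This is a
# sketch of what such a helper commonly computes from true labels and
# continuous scores, assuming binary 0/1 labels; the threshold of 0.0 suits
# decision_function scores (use 0.5 for probabilities).
from sklearn import metrics

def evaluate(y_true, y_score, threshold=0.0):
    """Report thresholded accuracy/precision/recall plus ROC AUC."""
    y_pred = (y_score > threshold).astype(int)
    print('accuracy:  %.3f' % metrics.accuracy_score(y_true, y_pred))
    print('precision: %.3f' % metrics.precision_score(y_true, y_pred))
    print('recall:    %.3f' % metrics.recall_score(y_true, y_pred))
    print('ROC AUC:   %.3f' % metrics.roc_auc_score(y_true, y_score))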
import logging
import pickle

from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import auc, roc_curve
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler


class Learning(object):
    """
    usage:
      >> from feelit.features import Learning
      >> learner = Learning(verbose=args.verbose, debug=args.debug)
      >> learner.set(X_train, y_train, feature_name)
      >>
      >> scores = {}
      >> for c in Cs:
      >>     for gamma in gammas:
      >>         score = learner.kfold(kfolder, classifier='SVM',
      >>                               kernel='rbf', prob=False,
      >>                               C=c, scaling=True, gamma=gamma)
      >>         scores.update({(c, gamma): score})
      >>
      >> best_C, best_gamma = max(scores.items(), key=operator.itemgetter(1))[0]
      >> learner.train(classifier='SVM', kernel='rbf', prob=True, C=best_C,
      >>               gamma=best_gamma, scaling=True,
      >>               random_state=np.random.RandomState(0))
      >> results = learner.predict(X_test, y_test, weighted_score=True,
      >>                           X_predict_prob=True, auc=True)
    """
    def __init__(self, X=None, y=None, **kwargs):
        loglevel = logging.ERROR if 'loglevel' not in kwargs else kwargs['loglevel']
        logging.basicConfig(format='[%(levelname)s][%(name)s] %(message)s',
                            level=loglevel)
        self.logger = logging.getLogger(__name__ + '.' + self.__class__.__name__)
        self.X = X
        self.y = y
        self.kfold_results = []
        self.Xs = {}
        self.ys = {}
        self.scaling = False if 'scaling' not in kwargs else kwargs['scaling']

    def set(self, X, y, feature_name):
        self.X = X
        self.y = y
        self.feature_name = feature_name

    def train(self, **kwargs):
        self._train(self.X, self.y, **kwargs)

    def _train(self, X_train, y_train, **kwargs):
        """
        required: X_train, y_train
        options:
          classifier: 'SVM', 'SGD', 'GaussianNB'
          with_mean: True/False
          with_std: True/False
          scaling: True/False
          prob: True/False. Estimate probability during training
          random_state: seed, RandomState instance or None; for probability estimation
          kernel: 'rbf', ...
          C: float; SVM parameter
          shuffle: True/False; for SGD
        """
        # set up a classifier
        classifier = "SVM" if "classifier" not in kwargs else kwargs["classifier"]

        # ## slice
        # delete = None if "delete" not in kwargs else kwargs["delete"]
        # if delete:
        #     X_train = np.delete(utils.toDense(self.X), delete, axis=0)
        #     y_train = np.delete(self.y, delete, axis=0)
        # else:

        self.logger.debug("%d samples x %d features in X_train" % (
            X_train.shape[0], X_train.shape[1]))
        self.logger.debug("%d samples in y_train" % (y_train.shape[0]))

        with_mean = True if 'with_mean' not in kwargs else kwargs['with_mean']
        with_std = True if 'with_std' not in kwargs else kwargs['with_std']

        # Cannot center sparse matrices, `with_mean` should be set to `False`
        # Douglas: this doesn't make sense
        # if utils.isSparse(self.X):
        #     with_mean = False

        self.scaling = False if 'scaling' not in kwargs else kwargs['scaling']
        if self.scaling:
            self.scaler = StandardScaler(with_mean=with_mean, with_std=with_std)
            # apply scaling on X
            self.logger.debug("applying a standard scaling with_mean=%d, with_std=%d"
                              % (with_mean, with_std))
            X_train = self.scaler.fit_transform(X_train)

        # determine whether to use predict or predict_proba
        self.prob = False if 'prob' not in kwargs else kwargs["prob"]
        random_state = None if 'random_state' not in kwargs else kwargs["random_state"]

        if classifier == "SVM":
            # set up an SVM classifier
            kernel = "rbf" if 'kernel' not in kwargs else kwargs["kernel"]
            # cost: default 1
            C = 1.0 if "C" not in kwargs else kwargs["C"]
            # gamma: default (1/num_features)
            num_features = X_train.shape[1]
            gamma = (1.0 / num_features) if "gamma" not in kwargs else kwargs["gamma"]
            # self.clf = svm.SVC(C=C, gamma=gamma, kernel=kernel,
            #                    probability=self.prob,
            #                    random_state=random_state,
            #                    class_weight='auto')
            self.clf = svm.SVC(C=C, gamma=gamma, kernel=kernel,
                               probability=self.prob, random_state=random_state)
            self.params = "%s_%s C=%f gamma=%f probability=%d" % (
                classifier, kernel, C, gamma, self.prob)
        elif classifier == "SGD":
            shuffle = True if 'shuffle' not in kwargs else kwargs['shuffle']
            if self.prob:
                self.clf = SGDClassifier(loss="log", shuffle=shuffle)
            else:
                self.clf = SGDClassifier(shuffle=shuffle)
            self.params = "%s_%s" % (classifier, 'linear')
        elif classifier == "GaussianNB":
            self.clf = GaussianNB()
            self.params = "%s_%s" % (classifier, 'NB')
        else:
            raise Exception("currently only SVM, SGD and GaussianNB "
                            "classifiers are supported")

        self.logger.debug(self.params)
        self.clf.fit(X_train, y_train)

    def dump_model(self, file_name):
        try:
            # binary mode is required for pickle
            pickle.dump(self.clf, open(file_name, "wb"))
        except ValueError:
            self.logger.error("failed to dump %s" % (file_name))

    def dump_scaler(self, file_name):
        try:
            if self.scaling:
                pickle.dump(self.scaler, open(file_name, "wb"))
            else:
                self.logger.warning("scaler doesn't exist")
        except ValueError:
            self.logger.error("failed to dump %s" % (file_name))

    def load_model(self, file_name):
        try:
            self.clf = pickle.load(open(file_name, "rb"))
        except ValueError:
            self.logger.error("failed to load %s" % (file_name))

    def load_scaler(self, file_name):
        try:
            self.scaler = pickle.load(open(file_name, "rb"))
            if self.scaler:
                self.scaling = True
        except ValueError:
            self.logger.error("failed to load %s" % (file_name))

    def predict(self, X_test, y_test, **kwargs):
        """Return a dictionary of results"""
        if self.scaling:
            X_test = self.scaler.transform(X_test)

        self.logger.info('y_test = %s', str(y_test.shape))
        y_predict = self.clf.predict(X_test)
        # only svm.SVC exposes a .probability attribute; guard with getattr
        # so SGD and GaussianNB models don't raise an AttributeError here
        X_predict_prob = (self.clf.predict_proba(X_test)
                          if getattr(self.clf, "probability", False) else 0)

        results = {}
        if 'score' in kwargs and kwargs['score'] == True:
            results.update({'score': self.clf.score(X_test, y_test.tolist())})
            self.logger.info('score = %f', results['score'])
        if 'weighted_score' in kwargs and kwargs['weighted_score'] == True:
            results.update({'weighted_score':
                            self._weighted_score(y_test.tolist(), y_predict)})
            self.logger.info('weighted_score = %f', results['weighted_score'])
        if 'y_predict' in kwargs and kwargs['y_predict'] == True:
            results.update({'y_predict': y_predict})
            self.logger.info('y_predict = %s', str(results['y_predict']))
        if 'X_predict_prob' in kwargs and kwargs['X_predict_prob'] == True:
            results.update({'X_predict_prob': X_predict_prob[:, 1]})
            self.logger.info('X_predict_prob = %s', str(results['X_predict_prob']))
        if 'auc' in kwargs and kwargs['auc'] == True:
            fpr, tpr, thresholds = roc_curve(y_test, X_predict_prob[:, 1])
            results.update({'auc': auc(fpr, tpr)})
            self.logger.info('auc = %f', results['auc'])
        if 'decision_value' in kwargs and kwargs['decision_value'] == True:
            results.update({'decision_value': self.clf.decision_function(X_test)})
            self.logger.debug('decision_value = %s', str(results['decision_value']))
        return results

    def _weighted_score(self, y_test, y_predict):
        # compute a class-balanced score: each class contributes equally,
        # regardless of its size
        n_pos = len([val for val in y_test if val == 1])
        n_neg = len([val for val in y_test if val == -1])
        temp_min = min(n_pos, n_neg)
        # float() keeps the ratios from being truncated under Python 2
        weight_pos = 1.0 / (float(n_pos) / temp_min)
        weight_neg = 1.0 / (float(n_neg) / temp_min)
        correct_predict = [i for i, j in zip(y_test, y_predict) if i == j]
        weighted_sum = 0.0
        for answer in correct_predict:
            weighted_sum += weight_pos if answer == 1 else weight_neg
        wscore = weighted_sum / (n_pos * weight_pos + n_neg * weight_neg)
        return wscore

    def kfold(self, kfolder, **kwargs):
        """
        return: mean score over the k folds
        required:
          kfolder: generated by sklearn.cross_validation.KFold (or any
                   iterable of (train_index, test_index) pairs)
        options: same as _train
        """
        # amend = False if "amend" not in kwargs else kwargs["amend"]
        # if amend:
        #     # amend dense matrix: replace NaN and None with float values
        #     self.check_and_amend()
        # else:
        #     self.logger.debug("skip the amending process")

        sum_score = 0.0
        for (i, (train_index, test_index)) in enumerate(kfolder):
            self.logger.info("cross-validation fold %d: train=%d, test=%d"
                             % (i, len(train_index), len(test_index)))
            X_train, X_test = self.X[train_index], self.X[test_index]
            y_train, y_test = self.y[train_index], self.y[test_index]
            self._train(X_train, y_train, **kwargs)
            score = self.predict(X_test, y_test, score=True)['score']
            self.logger.info('score = %.5f' % (score))
            sum_score += score
        mean_score = sum_score / len(kfolder)
        self.logger.info('*** C = %f, mean_score = %f' % (kwargs['C'], mean_score))
        return mean_score
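
# --- Usage sketch (illustrative, not part of the original module) ---
# Learning.kfold() iterates over the folder directly (the old
# sklearn.cross_validation protocol the docstring mentions), so with the
# modern API we materialize the splits as a list of (train_index,
# test_index) pairs. The data below is synthetic.
import numpy as np
from sklearn.model_selection import KFold

rng = np.random.RandomState(0)
X_demo = rng.randn(100, 5)
y_demo = np.where(X_demo[:, 0] > 0, 1, -1)

learner = Learning()
learner.set(X_demo, y_demo, feature_name='demo')
kfolder = list(KFold(n_splits=5, shuffle=True, random_state=0).split(X_demo))
mean_score = learner.kfold(kfolder, classifier='SVM', kernel='rbf',
                           C=1.0, gamma=0.2, scaling=True)
print('mean accuracy: %.3f' % mean_score)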
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier


class SoundClassifier:
    def __init__(self, algorithm):
        if algorithm == 'knn':
            self._classifier = KNeighborsClassifier(n_neighbors=6)
        elif algorithm == 'linear':
            self._classifier = LogisticRegression()  # alternative => (C=100) / (C=0.01)
        elif algorithm == 'linearMulti':
            self._classifier = LinearSVC()
        elif algorithm == 'sgd':
            self._classifier = SGDClassifier(random_state=0)
        elif algorithm == 'decisionTree':
            self._classifier = DecisionTreeClassifier(random_state=0)  # alternative => max_depth=4
        elif algorithm == 'randomForest':
            self._classifier = RandomForestClassifier(n_estimators=10,
                                                      max_features=1300,
                                                      max_depth=8,
                                                      random_state=0)
        elif algorithm == 'gradientBoosting':
            self._classifier = GradientBoostingClassifier(
                random_state=0)  # alternative => max_depth=1, learning_rate=0.01
        elif algorithm == 'svm':
            self._classifier = SVC(
                C=1.3, kernel='rbf', gamma='scale'
            )  # alternative => C=1000, gamma=1000. Also pre-process data
        elif algorithm == 'neuralNetworks':
            self._classifier = MLPClassifier(
                random_state=0
            )  # alternative => max_iter=1000, alpha=1. Also pre-process data
        elif algorithm == 'gmm':
            # note: despite the 'gmm' key, this is a Gaussian process classifier
            self._classifier = GaussianProcessClassifier(
                kernel=RationalQuadratic(alpha=1, length_scale=1),
                random_state=0)
        elif algorithm == 'gnb':
            self._classifier = GaussianNB()
        else:
            # raising here keeps later method calls from failing on an
            # undefined self._classifier
            raise ValueError('Algorithm not found: %s' % algorithm)

    def train_classifier(self, X_train, y_train):
        self._classifier.fit(X_train, y_train)

    def get_predictions(self, X_test):
        return self._classifier.predict(X_test)

    def get_accuracy(self, X_test, y_test):
        return self._classifier.score(X_test, y_test)

    def show_feature_importance(self, data, target):
        """Only works with algorithms that have a feature_importances_ attribute"""
        plt.plot(self._classifier.feature_importances_, 'o')
        plt.xticks(range(data.shape[1]), target, rotation=90)
        plt.show()

    def print_decision_function(self, X_test):
        """Only works with algorithms that have a decision_function method"""
        print(self._classifier.decision_function(X_test))
        # We can recover the predictions by computing the argmax
        print(np.argmax(self._classifier.decision_function(X_test), axis=1))
        print(self._classifier.predict(X_test))

    def print_prediction_probability(self, X_test):
        """Only works with algorithms that have a predict_proba method"""
        print(self._classifier.predict_proba(X_test))
        # We can recover the predictions by computing the argmax
        # (of predict_proba; the original mistakenly used decision_function here)
        print(np.argmax(self._classifier.predict_proba(X_test), axis=1))
        print(self._classifier.predict(X_test))
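
# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal run of the wrapper above; the synthetic data stands in for the
# sound features the class was presumably written for, and 'knn' is one of
# the algorithm keys defined in __init__.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
sound_clf = SoundClassifier('knn')
sound_clf.train_classifier(X_train, y_train)
print(sound_clf.get_predictions(X_test))
print(sound_clf.get_accuracy(X_test, y_test))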
import numpy as np
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB


def plotshow(trainingSet, testSet, method):
    y = []
    data = []
    label = []
    data_test = []
    label_test = []
    # rows are lists of numeric strings; the features are every column but
    # the last, and the label is the last column. float() replaces the
    # original eval(), which is unsafe and also wrapped each label in a
    # one-element list.
    for i in range(len(trainingSet)):
        data.append(list(map(float, trainingSet[i][:-1])))
        label.append(float(trainingSet[i][-1]))
        y.append(float(trainingSet[i][-1]))
    for n in range(len(testSet)):
        data_test.append(list(map(float, testSet[n][:-1])))
        label_test.append(float(testSet[n][-1]))
        y.append(float(testSet[n][-1]))

    global clf
    if method == "高斯朴素贝叶斯":  # Gaussian naive Bayes
        clf = GaussianNB(priors=None)
    if method == "多项式分布贝叶斯":  # multinomial naive Bayes
        clf = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
    if method == "伯努利朴素贝叶斯":  # Bernoulli naive Bayes
        clf = BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True,
                          class_prior=None)

    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    ax2 = plt.subplot2grid((3, 1), (2, 0))
    ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

    for clf, name in [(clf, "Naive Bayes")]:
        clf = clf.fit(data, label)
        if hasattr(clf, "predict_proba"):
            prob_pos = clf.predict_proba(data_test)[:, 1]
        else:
            # rescale decision values into [0, 1]
            prob_pos = clf.decision_function(data_test)
            prob_pos = (prob_pos - prob_pos.min()) / \
                (prob_pos.max() - prob_pos.min())
        clf_score = brier_score_loss(label_test, prob_pos,
                                     pos_label=np.array(y).max())
        fraction_of_positives, mean_predicted_value = \
            calibration_curve(label_test, prob_pos, n_bins=10)
        ax1.plot(mean_predicted_value, fraction_of_positives, "s-",
                 label="%s (%1.3f)" % (name, clf_score))
        ax2.hist(prob_pos, range=(0, 1), bins=10, label=name,
                 histtype="step", lw=2)

    ax1.set_ylabel("Fraction of positives")
    ax1.set_ylim([-0.05, 1.05])
    ax1.legend(loc="lower right")
    ax1.set_title('Calibration plots (reliability curve)')
    ax2.set_xlabel("Mean predicted value")
    ax2.set_ylabel("Count")
    ax2.legend(loc="upper center", ncol=2)
    plt.tight_layout()
    plt.savefig("E:/Anaconda/Scripts/CorsApi/snippets/static/picture/bayes.jpg")

    score = clf.score(data_test, label_test)
    return score
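
# --- Usage sketch (illustrative, not part of the original source) ---
# plotshow() expects rows of numeric strings with the class label in the
# last column; the tiny dataset below is made up for demonstration. Note
# that the function saves its figure to a hardcoded Windows path, which
# must exist (or be edited) for the call to succeed.
trainingSet = [['1.0', '2.1', '0'], ['0.9', '1.8', '0'],
               ['3.2', '4.0', '1'], ['3.5', '4.2', '1'],
               ['1.1', '2.0', '0'], ['3.3', '4.1', '1']]
testSet = [['1.0', '2.0', '0'], ['3.4', '4.1', '1']]
print(plotshow(trainingSet, testSet, "高斯朴素贝叶斯"))  # Gaussian naive Bayes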