def sk_demo_1():
    """Fit a Gaussian Naive Bayes classifier on a toy dataset and print results.

    Six 2-D points are labelled 1 or 2; the classifier is fitted on them and
    the predicted label for the point ``[-0.8, -1]`` is printed, followed by
    the dictionary returned by ``get_params()``.
    """
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    Y = np.array([1, 1, 1, 2, 2, 2])
    clf = GaussianNB()
    clf.fit(X, Y)
    test_item = np.array([[-0.8, -1]])
    # Single-argument parenthesised prints give the same output whether
    # ``print`` is the Python 2 statement or the Python 3 function.
    print(clf.predict(test_item))  # [1]
    print(clf.get_params())
def analyses():
    """Explore the custom dataset and score a Gaussian Naive Bayes fit on it.

    Loads the dataset through ``my_datasets.load_my_fancy_dataset()``, prints
    the number of samples per class, draws a seaborn pair plot, fits a
    ``GaussianNB`` on all samples and reports how well it predicts those same
    samples (error count, percentage correct and ``accuracy_score``).

    Returns:
        tuple: always ``()``. The original author noted that a meaningful
        result variable (true/false) still has to be chosen.
    """
    # TODO (original author): decide what this function should receive as
    # arguments, e.g. the data to analyse.
    mfd = my_datasets.load_my_fancy_dataset()
    print(dir(mfd))
    data = mfd.data      # samples
    target = mfd.target  # efficiency labels

    for i in [0, 1]:  # the different cooldowns
        print("cooldowns : %s, nb exemplaires: %s" % (i, len(target[target == i])))

    # Original author's note: a 2-D numpy array of 200 records of 4 values.
    print(type(data), data.ndim, data.shape)

    sns.set()
    df = pd.DataFrame(data, columns=mfd['feature_names'])
    df['target'] = target
    # NOTE(review): the label column added above is named 'target', yet the
    # plot is coloured by 'efficacite'. This only works if 'efficacite' is one
    # of mfd['feature_names'] -- confirm.
    # NOTE(review): newer seaborn releases call the `size` argument `height`
    # -- confirm against the installed version.
    sns.pairplot(df, hue='efficacite', vars=mfd['feature_names'], size=2)

    # Training.
    # Several algorithms could be used here. We start with Naive Bayes
    # classification, which assumes each class is drawn from an axis-aligned
    # Gaussian distribution. It needs no hyper-parameters and is very fast.
    clf = GaussianNB()
    clf.fit(data, target)  # the dataframe df could have been used as well

    # Run the prediction on the training data itself.
    result = clf.predict(data)
    print(result)

    # Inspect the quality of the prediction.
    print(result - target)
    errors = sum(result != target)
    # Derive the sample count from the data instead of hard-coding 200 so the
    # percentage stays correct if the dataset size changes.
    n_samples = len(target)
    print("Nb erreurs:", errors)
    print("Pourcentage de prediction juste:", (n_samples - errors) * 100 / n_samples)

    # Same figure computed by scikit-learn (arguments are y_true, y_pred).
    from sklearn.metrics import accuracy_score
    precision = accuracy_score(target, result)
    print(precision)
    return ()
def test_gaussian_naive_bayes_defaults():
    """Unit test for the Gaussian Naive Bayes classifier container.

    Checks that the container, built with default parameters, exposes the same
    parameters as a plain sklearn ``GaussianNB`` and that its ``evaluate``
    output matches the predictions obtained by running sklearn directly.
    """
    datasets = generate_tutorial_data()

    # Reference predictions: fit sklearn by hand on each dataset and predict
    # on the same samples.
    reference_clf = GaussianNB()
    expected_predictions = []
    for name in datasets:
        features, labels = datasets[name]
        reference_clf.fit(features, labels)
        expected_predictions.append(reference_clf.predict(features))

    clf_container = classifiers.GaussianNBContainer()

    # Default parameters must be equal.
    container_params = clf_container.create_clf().get_params()
    assert (container_params == reference_clf.get_params())

    # The evaluate function must reproduce the reference predictions.
    for name, expected in zip(datasets, expected_predictions):
        features, labels = datasets[name]
        pipeline_ds = generate_pipeline_dataset(features, labels)
        actual = clf_container.evaluate(pipeline_ds.training_set,
                                        pipeline_ds.testing_set)
        assert len(actual) == len(expected)
        assert (actual == expected).all()
def naive_bayes_k(k,
                  sequence_origin='DairyDB',
                  primers_origin='DairyDB',
                  taxonomy_level: int = 1,
                  selected_primer: str = 'V4',
                  model_preprocessing='Computing frequency of {}-mer (ATCG) in every sequence',
                  test_size=0.2):
    """Fit Gaussian Naive Bayes on k-mer preprocessed sequences and report stats.

    The train/test split comes from ``ETL_NB_k_mer``; predictions on the test
    part are handed to ``main_stats_model`` together with the run description.

    :param k: k-mer length; also substituted into ``model_preprocessing``.
    :param test_size: forwarded to ``main_stats_model`` only.
        NOTE(review): it is not passed to ``ETL_NB_k_mer`` -- confirm that the
        split size is meant to be decided there.
    :return: ``(test_size, prop_main_class, accuracy)`` as returned by
        ``main_stats_model``.
    """
    model_preprocessing = model_preprocessing.format(k)

    # Options shared by the ETL step and the statistics report.
    data_selection = {
        'sequence_origin': sequence_origin,
        'primers_origin': primers_origin,
        'taxonomy_level': taxonomy_level,
        'selected_primer': selected_primer,
    }
    X_train, X_test, y_train, y_test = ETL_NB_k_mer(k=k, **data_selection)

    GNB = GaussianNB()
    GNB.fit(X_train, y_train)
    y_pred = GNB.predict(X_test)

    test_size, prop_main_class, accuracy = main_stats_model(
        y_train=y_train,
        y_test=y_test,
        y_pred=y_pred,
        model_name='Naive Bayes - NB({})'.format(k),
        model_parameters=GNB.get_params(),
        model_preprocessing=model_preprocessing,
        test_size=test_size,
        **data_selection)
    return test_size, prop_main_class, accuracy
def Guassian_NB(train_bow_tf_idf, train_labels, bow_test_tf_idf, test_labels):
    """Train a Gaussian Naive Bayes model and print train/test accuracy.

    Args:
        train_bow_tf_idf: training feature matrix; a sparse matrix exposing
            ``toarray()`` or an already dense array.
        train_labels: labels for the training rows.
        bow_test_tf_idf: test feature matrix, same kind as the training one.
        test_labels: labels for the test rows.

    Returns:
        The fitted ``GaussianNB`` model.
    """
    def _dense(matrix):
        # GaussianNB needs dense input. The original densified only the
        # training matrix, so predicting on a sparse test matrix failed.
        return matrix.toarray() if hasattr(matrix, "toarray") else matrix

    # Densify each matrix once instead of on every use.
    train_features = _dense(train_bow_tf_idf)
    test_features = _dense(bow_test_tf_idf)

    # training the Gaussian_NB model
    model = GaussianNB()
    model.fit(train_features, train_labels)
    print()
    print('------- Gaussian Naive Bayes -------')

    # evaluate the model
    print('Default hyperparameters:')
    print(model.get_params())
    train_pred = model.predict(train_features)
    print('Gaussian NB train accuracy = {}'.format(
        (train_pred == train_labels).mean()))
    test_pred = model.predict(test_features)
    print('Gaussian NB test accuracy = {}'.format(
        (test_pred == test_labels).mean()))
    return model
def perform_GaussianNB(self):
    """Fit Gaussian Naive Bayes on the training split and report test results.

    Stores a dict with the estimator parameters, the predicted test labels and
    the test score in ``self.GNB_result``, prints it with ``print_dict`` and
    then prints the macro-averaged F1 score on the test split.
    """
    classifier = GaussianNB().fit(self.data_train, self.labels_train)

    summary = {}
    summary["parameters"] = classifier.get_params()
    summary["labels_test_data"] = classifier.predict(self.data_test)
    summary["score"] = classifier.score(self.data_test, self.labels_test)
    self.GNB_result = summary

    print_dict(self.GNB_result)
    print("f1_score:")
    macro_f1 = f1_score(self.labels_test,
                        self.GNB_result["labels_test_data"],
                        average='macro')
    print(macro_f1)
class GaussianNBClass(ClassicalModel):
    """Classical-model wrapper around sklearn's ``GaussianNB``."""

    def __init__(self, input_size, output_size, labels, class_weights=None,
                 **kwargs):
        """Build the wrapped estimator.

        The first four arguments are handed to ``ClassicalModel``; any extra
        keyword arguments go to the ``GaussianNB`` constructor. ``self.name``
        is a display string that includes the estimator's parameters.
        """
        super().__init__(input_size, output_size, labels, class_weights)
        estimator = GaussianNB(**kwargs)
        self.model = estimator
        title = "Gaussian Naive Bayes Classifier: \n"
        self.name = title + str(estimator.get_params())
def prediction(x, y):
    """Fit Gaussian Naive Bayes on an 80/20 split and return the test report.

    The split is seeded with the module-level ``SEED``. Returns whatever
    ``classification_report`` produces for the held-out 20 %.
    """
    split = train_test_split(x, y, random_state=SEED, test_size=0.20)
    x_train, x_test, y_train, y_test = split

    clf = GaussianNB()
    print('Gaussian Fitting with params=', clf.get_params())
    clf.fit(x_train, y_train)

    print("Predicting")
    return classification_report(y_test, clf.predict(x_test))
class ModelGaussNB(Model, BaseEstimator, ClassifierMixin):
    """Gaussian Naive Bayes wrapped in the project's ``Model`` interface.

    Also exposes ``fit`` / ``predict`` / ``score`` / ``get_params`` /
    ``set_params`` in the scikit-learn estimator style.
    """

    def __init__(self, run_fold_name, priors=None, var_smoothing=1e-09):
        # Presumably Model.__init__ stores the dict as self.params, which is
        # read on the next line -- verify against the base class.
        super().__init__(run_fold_name,
                         dict(priors=priors, var_smoothing=var_smoothing))
        self.model = GaussianNB(**self.params)

    def train(self, tr_x, tr_y, va_x=None, va_y=None):
        """Fit the wrapped estimator; the validation arguments are not used."""
        self.model = self.model.fit(tr_x, tr_y)

    def fit(self, tr_x, tr_y):
        """Train and return ``self``."""
        self.train(tr_x, tr_y)
        return self

    def predict(self, te_x):
        """Return the wrapped estimator's predictions for ``te_x``."""
        return self.model.predict(te_x)

    def score(self, te_x, te_y):
        """Return the sample-averaged F1 between one-hot targets and predictions.

        Labels index the rows of a 5x5 identity matrix, so they have to be
        integers below 5.
        """
        y_pred = self.predict(te_x)
        one_hot = np.identity(5)
        return f1_score(one_hot[te_y], one_hot[y_pred], average='samples')

    def get_params(self, deep=True):
        """Return the estimator parameters plus ``run_fold_name``."""
        return {**self.model.get_params(deep),
                "run_fold_name": self.run_fold_name}

    def set_params(self, **parameters):
        """Update ``run_fold_name`` and/or estimator parameters; return ``self``."""
        if "run_fold_name" in parameters:
            self.run_fold_name = parameters.pop("run_fold_name")
        self.params.update(parameters)
        self.model.set_params(**self.params)
        return self

    def _model_path(self, feature):
        # Location of the serialized model for this feature set and run/fold.
        return os.path.join(f'../model/model/{feature}',
                            f'{self.run_fold_name}.model')

    def save_model(self, feature):
        """Dump the estimator with ``Util.dump``, creating the folder if needed."""
        target_path = self._model_path(feature)
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        Util.dump(self.model, target_path)

    def load_model(self, feature):
        """Replace the estimator with the one stored for ``feature``."""
        self.model = Util.load(self._model_path(feature))
class model(object):
    """Gaussian Naive Bayes classifier over averaged word embeddings."""

    def __init__(self):
        self.embeddings = load_embedding_matrix(
            wv_path=general_config.wv_path,
            int2vocabPath=general_config.global_static_i2v_path)
        self.model = GaussianNB()
        self.log_dir = ensure_dir_exist(general_config.log_dir + "/NB")
        self.save_dir = ensure_dir_exist(general_config.save_dir + "/NB")
        self.logger = my_logger(self.log_dir + "/log.txt")

    def _sentence_vectors(self, sentences):
        # One vector per sentence: the mean over axis 0 of the embedding rows
        # the sentence selects (presumably its token ids -- verify).
        return [self.embeddings[tokens].mean(axis=0) for tokens in sentences]

    def train(self,
              trainPath=general_config.data_dir + "/training_label_new.txt"):
        """Fit on the file at ``trainPath``, log training accuracy, save the model."""
        indices, sentences, labels = readNewFile(
            file=trainPath,
            vocab2intPath=general_config.global_static_v2i_path)
        features = self._sentence_vectors(sentences)

        self.model.fit(X=features, y=labels)
        self.logger.info(self.model.get_params())
        train_accuracy = self.model.score(X=features, y=labels)
        self.logger.info("Training Accuracy: %s" % train_accuracy)

        joblib.dump(self.model, self.save_dir + "/model.pkl")

    def test(self,
             testPath=general_config.data_dir + "/testing_data_new.txt"):
        """Load the saved model, predict ``testPath`` and write a submission file."""
        indices, sentences, labels = readNewFile(
            file=testPath,
            vocab2intPath=general_config.global_static_v2i_path)
        features = self._sentence_vectors(sentences)

        self.model = joblib.load(self.save_dir + "/model.pkl")
        predicted = self.model.predict(features)

        # Two columns: sample index, predicted label.
        index_column = np.array(indices).reshape((-1, 1))
        label_column = np.array(predicted).reshape((-1, 1))
        res = np.concatenate([index_column, label_column], axis=1)

        results_dir = self.save_dir.replace("checkpoints", "results")
        WriteToSubmission(res, fileName=results_dir + "/predicted.csv")
class GenerativeModel(Classifier):
    """Classifier backed by sklearn's ``GaussianNB``."""

    def __init__(self, **kwargs):
        super().__init__()
        estimator = GaussianNB(**kwargs)
        self._model = estimator
        self.hyperparams = estimator.get_params()

    def fit(self, X, Y):
        """
        Train the model on the given data.

        Parameters
        ==========
        X: Pandas DataFrame. Attribute values.

        Y: Pandas Series. Object labels.

        Returns
        =======
        None.
        """
        # The estimator is fitted on the underlying ``.values`` of both inputs.
        self._model.fit(X=X.values, y=Y.values)

    def predict(self, X):
        """
        Return the predicted labels for X.

        Parameters
        ==========
        X: Pandas DataFrame -> data to predict labels for. It is passed to the
           estimator as given.

        Returns
        =======
        Prediction labels: array like of size (nsamples, [n_features])
        """
        return self._model.predict(X)
def Gaussian_Naive_Bayes_Model(X_train, y_train, X_test, y_test):
    '''
    Fit Gaussian Naive Bayes, print its scores and plot its diagnostics.

    Author's rationale: Isomap showed the data is distributed in overlapping
    groups, so samples are allocated to a digit based on a Gaussian
    distribution.

    Prints the test score and 3-fold cross-validation scores, shows the
    estimator parameters as a table and the confusion matrix as a heatmap,
    and returns the test score.

    NOTE(review): the cross-validation is run on the test split -- confirm
    that this is intended.
    '''
    model = GaussianNB().fit(X_train, y_train)
    predicted = model.predict(X_test)
    score = model.score(X_test, y_test)
    cv_scores = cross_val_score(model, X_test, y_test, cv=3)

    for line in (
            (' ',),
            ('===== Gaussian Naive Bayes Model =====',),
            ('score:', score),
            ('cross validation scores:', cv_scores)):
        print(*line)

    # Visualize parameters in a table.
    visualize_params(model.get_params())

    # Display confusion matrix.
    visualize_heatmap(y_test, predicted, 'Gaussian Naive Bayes')
    return score
# Script section (Python 2 print statements): shuffle the data, split it,
# train a Gaussian Naive Bayes classifier and print train/test accuracy.
np.random.shuffle(trainingSet)
trainingSetLabels = trainingSet[:, 12]  # putting labels in separate array
trainingSetLabels[trainingSetLabels == 0] = -1  # replacing all 0 with -1 to match sklearn format
trainingSet = trainingSet[:, 1:11]  # keep columns 1..10 as inputs, dropping the label column
# 60 % of the rows go to the test split (test_size=0.6).
trainingSet, testingSet, trainingSetLabels, testingSetLabels = train_test_split(
    trainingSet, trainingSetLabels, test_size=0.6,
    random_state=0)  # fixes random_state so results reproducible
startTime = time.time()
print "Time before training = ", startTime
clf = GaussianNB()
clf = clf.fit(trainingSet, trainingSetLabels)
print "Params after training:"
print clf.get_params()
trainingAccuracy = clf.score(trainingSet, trainingSetLabels)
print "Training accuracy = ", trainingAccuracy
testingAccuracy = clf.score(testingSet, testingSetLabels)
print "Testing accuracy = ", testingAccuracy
# Elapsed time covers training plus both scoring passes.
print "Done training and testing! Time = ", time.time() - startTime, "seconds"
# Script section: standardize the features, fit Gaussian Naive Bayes and
# report accuracy two ways (estimator score and a manual count).
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Don't cheat - fit only on training data
scaler.fit(training)
training = scaler.transform(training)
# apply same transformation to test data
test = scaler.transform(test)
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(training, ground_training)
print(clf.get_params())
results = clf.score(test, ground_test)
print("mean accuracy: ", results)
results = clf.predict(test)
# Count correct predictions without writing into `results`. The original loop
# overwrote every correctly predicted entry with the comparison result, which
# left the prediction array corrupted after the count.
correct = sum(1 for i in range(len(results)) if results[i] == ground_test[i])
print("accuracy: ", correct * 100 / len(results))
print(cp) # 获取各类标记对应的训练样本数 cc = bn1.class_count_ print(cc) # 获取各个类标记在各个特征上的均值 ct = bn1.theta_ print(ct) # 获取各个类标记在各个特征上的方差 cs = bn1.sigma_ print(cs) # 获取参数 gp = bn1.get_params(deep=True) print(gp) # 训练样本 bn = GaussianNB() bn.fit(X, y, np.array([0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2])) print(bn) print(bn.theta_) print(bn.sigma_) # 增量式训练 bn2 = GaussianNB() bn2.partial_fit(X, y, classes=[1, 2], sample_weight=np.array([0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2])) # 输出测试机预测的类标记 pr1 = bn1.predict([[-6, -6], [4, 5]])
# Train Test Split. X_train, X_test, Y_train, Y_test = train_test_split(data, ActionDf, test_size=0.55, stratify=ActionDf, random_state=1) model = GaussianNB() model.fit(X_train, Y_train) expected = Y_test predicted = model.predict(X_test) print("\nA: Simple Gaussian\nNaive Bayes") print("-------------------") print(" * Params", model.get_params()) print(" * Class labels", model.get_params()) print(" * Class labels", model.classes_) print(" * Probability of each class", model.class_prior_) print(" * Absolute additive value to variances", model.epsilon_) print(" * Variance of each feature per class", model.sigma_) print(" * Mean of each feature per class", model.theta_) print('\nAccuracy on test set: {:.2f}'.format(model.score(X_test, Y_test))) print("Confusion_Matrix...") print(metrics.confusion_matrix(expected, predicted)) print( "\n--> Part 1 Analysis: With no priors and no weights, the simple gaussian analysis\n" "\tresults in an accuracy of 70% and the highest achievable with this classifier.\n"
''' #GaussianNB一个重要的功能是有 partial_fit方法,这个方法的一般用在如果训练集数据量非常大,一次不能全部载入 #内存的时候。这时我们可以把训练集分成若干等分,重复调用partial_fit来一步步的学习训练集,非常方便 #在第一次调用partial_fit函数时,必须制定classes参数,在随后的调用可以忽略 clf.partial_fit(iris.data, iris.target,classes=[0,1,2]) ''' #学习后模型中的一些参数 clf.set_params( priors=[0.333, 0.333, 0.333]) #这里要设一下各个类标记对应的先验概率,如果不设置直接clf.priors返回的是None(不知道为什么?) print(clf.priors) #获取各个类标记对应的先验概率 print(clf.class_prior_ ) #同priors一样,都是获取各个类标记对应的先验概率,区别在于priors属性返回列表,class_prior_返回的是数组 print(clf.get_params(deep=True)) #返回priors与其参数值组成字典 print(clf.class_count_) #获取各类标记对应的训练样本数 print(clf.theta_) #获取各个类标记在各个特征上的均值 print(clf.sigma_) #获取各个类标记在各个特征上的方差 #测试数据 data_test = np.array([6, 4, 6, 2]) data = data_test.reshape(1, -1) Result_predict = clf.predict(data) Score = clf.score([[6, 8, 5, 3], [5, 3, 4, 2], [4, 6, 7, 2]], [2, 0, 1], sample_weight=[0.3, 0.5, 0.2]) Result_predict_proba = clf.predict_proba(data) Result_predict_log_proba = clf.predict_log_proba(data) print(Result_predict) #预测所属类别
# Script section: read sample/label arrays named on the command line, time a
# Gaussian Naive Bayes fit, pickle the model and write a short summary file.
ap.add_argument("-s", "--samples", required=True, help="Path to samples")
ap.add_argument('-l', '--labels', required=True, help="Path to labels")
ap.add_argument('-c', '--saveAs', required=True, help="Save as")
args = vars(ap.parse_args())
samples = np.load(args['samples'])
labels = np.load(args['labels'])
startTimeNaiveBayes = timeit.default_timer()
naiveBayesClassifier = GaussianNB()
naiveBayesClassifier.fit(samples, labels)
elapsedTimeNaiveBayes = timeit.default_timer() - startTimeNaiveBayes
with open(args['saveAs'] + ".pkl", "wb") as f:
    joblib.dump(naiveBayesClassifier, f, compress=3)
print()
print("Time taken: ", elapsedTimeNaiveBayes)
print()
print("Parameters:")
print()
print(naiveBayesClassifier.get_params())
print()
# Write the summary inside a `with` block so the file is always closed (the
# original never closed it). Plain write() calls replace the Python-2-only
# `print >> file` form; the file is opened in text mode to match.
with open('summary_naive_bayes_' + args['saveAs'] + '.txt', 'w') as summary:
    summary.write("Time taken: " + str(elapsedTimeNaiveBayes) + "\n")
    summary.write("Parameters:" + "\n")
    summary.write(str(naiveBayesClassifier.get_params()) + "\n")
def recompute_model(model_name):
    """Recompute recommendations and feature weights for a stored model.

    Looks the model up with ``models.find_one``. When it exists, the rated
    products' attribute vectors are assembled, a ``GaussianNB`` and a
    ``LinearSVC`` are fitted on them, every candidate recommendation is
    classified, and the model document is updated with the feature means,
    normalized feature weights and the "good" / "average" recommendation lists.

    This is Python 2 code: print statements, ``dict.iteritems`` and ``map`` /
    ``dict.keys`` results that are used as lists.
    """
    def inner_map(x):
        # Replace an attribute value of 0 with 3; leave other values alone.
        if x == 0:
            return 3
        else:
            return x

    def map_replace(x_list):
        # Apply inner_map to every value of one attribute vector.
        return map(inner_map, x_list)

    print "In recompute model"
    model = models.find_one({"model": model_name})
    if model:
        """ Schema is: {<model_name>, <product_id>: <rec_score> } """
        print "Current model: "
        print model
        # we need to recompute the model
        # NOTE(review): the three files opened here are never closed, and
        # pickle.load must only be used on trusted files.
        product_attributes = pickle.load(open(DUMP_PRODUCTS, "rb"))
        attribute_list = pickle.load(open(DUMP_ATTR, "rb"))  # not used below
        recommendation_attributes = pickle.load(open(DUMP_RECOMMENDATIONS_ATTR,"rb"))
        model_scores = model["scores"]
        # list of ratings
        scores_list = []
        rated_product_attributes = []
        error_count = 0
        for product, score in model_scores.iteritems():
            # <product_id>: <score>
            try:
                rating = map(inner_map, product_attributes[str(product)])
                print "Success: "
                print product
                rated_product_attributes.append(rating)
                scores_list.append([score])
            except:
                # NOTE(review): the bare except hides every kind of error, not
                # only the missing-product case described below.
                error_count +=1
                # error occurs if product is not in pickle dump of rated_product
                print "Error finding product attributes"
                print product
                continue
        print error_count
        print len(scores_list)
        # matrix, rated products X attributes
        X = np.array(rated_product_attributes)
        # vector, includes all of the scores
        y = np.array(scores_list)
        # Append the score as the last column, then sort the rows by it.
        Total = np.append(X, y, 1)
        num_products, num_attributes = Total.shape
        SortedTotal = Total[Total[:,(num_attributes - 1)].argsort()]
        print SortedTotal
        X = SortedTotal[:, :-1]  # attributes only, in score order
        # Class labels come from discreteClasses(num_products), not from the
        # raw scores.
        ratings = discreteClasses(num_products)
        print "Ratings"
        print ratings
        mnb = GaussianNB()
        mnb.fit(X, ratings)
        # get factors to cluster with
        lcv = LinearSVC(dual=False)
        lcv.fit(X, ratings)
        # Row 2 of the coefficient matrix, as absolute values divided by
        # their mean.
        coefs = lcv.coef_.tolist()[2]
        a_coefs = np.abs(np.array(coefs))
        norm_coefs = a_coefs/np.mean(a_coefs)
        mean_list = get_feature_means(model_scores)
        recommendations = recommendation_attributes.keys()
        pre_proc = [recommendation_attributes[rec] for rec in recommendations]
        print "Pre proc"
        print pre_proc
        # Same 0 -> 3 replacement as for the rated products.
        test_data_list = map(map_replace, pre_proc)
        print test_data_list
        test_data = np.array(test_data_list)
        classification = mnb.predict(test_data).tolist()
        print classification
        print "Getting params: "
        print mnb.get_params(True)
        print mnb.theta_
        print mnb.sigma_
        print mnb.class_prior_
        good_recs = []
        average_recs = []
        # Predicted class 3 -> good, class 2 -> average, anything else is
        # only printed.
        for i in range(len(classification)):
            if classification[i] == 3:
                good_recs.append(recommendations[i])
            elif classification[i] == 2:
                average_recs.append(recommendations[i])
            else:
                print recommendations[i]
        print "Good recs: "
        print good_recs
        print "Average recs: "
        print average_recs
        #empirical_log = mnb.coef_.flatten().tolist()
        empirical_log = [1, 2, 3]  # placeholder; not used below
        # update model with values
        models.update({"_id": ObjectId(model["_id"])},
                      {"$set": {"feature_means": mean_list,
                                "feature_weights": norm_coefs.tolist(),
                                "good_recs": good_recs,
                                "average_recs": average_recs}})
# Script section: train Gaussian Naive Bayes on the first `trainingSetLength`
# feature rows and pickle both the model and its parameter dict.
y = diagnostic[:trainingSetLength, 1:]  # target values (i.e. expected output for X)
for i in range(len(y)):
    y[i] = int(y[i])
y = np.transpose(y).astype('int')
trainingSet = extractedFeatures[:trainingSetLength]
bys = GaussianNB()
bys.fit(trainingSet, y[0])  # letting the algorithm know which sample in X belongs to which class labelled in y
bys_params = bys.get_params()
params_bys = 'params_bys.sav'
filename_bys = 'bys_model.sav'
# save the model to disk; the `with` block closes (and flushes) the file,
# which the original bare open() call left to the garbage collector
with open(filename_bys, 'wb') as model_file:
    pickle.dump(bys, model_file)
#testSet=extractedFeatures[trainingSetLength:trainingSetLength+10]
#prediction=bys.predict(testSet)
# save the params to disk
with open(params_bys, 'wb') as params_file:
    pickle.dump(bys_params, params_file)
#%%TEST CLASSIFICATION - kNN
excelAddress = 'C:\\Users\\theor\\Downloads\\Ground_truth_ISIC_1.xlsx'
trainingSetLength = 100
diagnostic = preProcessing(excelAddress)
# # Naive Bayes (Likelihood Ratio) # log.info('Starting to process %s Naive Bayes (Likelihood Ratio) %s' % (Fore.BLUE,Fore.WHITE)) nb_best_clf = GaussianNB() # There is no tuning of a likelihood ratio! if args.verbose: log.info('Parameters of the best classifier: A simple likelihood ratio has no parameters to be tuned!') nb_best_clf.verbose = 2 nb_best_clf.fit(X_train,y_train) nb_disc = nb_best_clf.predict_proba(X_test)[:,1] nb_fpr, nb_tpr, nb_thresholds = roc_curve(y_test, nb_disc) Classifiers["NB"]=(nb_best_clf,y_test,nb_disc,nb_fpr,nb_tpr,nb_thresholds) OutFile.write("NB: " + str(nb_best_clf.get_params()) + "\n") # # Multi-Layer Perceptron (Neural Network) # log.info('Starting to process %s Multi-Layer Perceptron (Neural Network) %s' % (Fore.BLUE,Fore.WHITE)) mlp_parameters = {'activation':list(['tanh','relu']), 'hidden_layer_sizes':list([5,10,15]), 'algorithm':list(['sgd','adam']), 'alpha':list([0.0001,0.00005,0.0005]), 'tol':list([0.00001,0.0001])} mlp_clf = GridSearchCV(MLPClassifier(learning_rate = 'adaptive'), mlp_parameters, n_jobs=-1, verbose=3, cv=2) if args.verbose else GridSearchCV(MLPClassifier(learning_rate = 'adaptive'), mlp_parameters, n_jobs=-1, verbose=0, cv=2) mlp_clf.fit(X_train_skimmed,y_train_skimmed) mlp_best_clf = mlp_clf.best_estimator_ if args.verbose: log.info('Parameters of the best classifier: %s' % str(mlp_best_clf.get_params()))
# Script section: hold-out evaluation of Gaussian Naive Bayes.
X = datasetx
y = datasety
""" Testes usando Holdout """
# Split the data into training (70 %) and test (30 %), stratified on y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                    random_state=42, stratify=y)
# Create the classifier
clfa = GaussianNB(var_smoothing=1e-10)
print(clfa.get_params())
# Train the classifier; the labels are flattened to 1-D first
clfa = clfa.fit(X_train, y_train.values.ravel())
# Predictions on the test split
predicted = clfa.predict(X_test)
# Accuracy on the test split
score = clfa.score(X_test, y_test)
# Build the confusion matrix
matrix = confusion_matrix(y_test, predicted)
print("Accuracia = %.4f " % score)
print("Matriz de confusao:")
print(matrix)
funcEvltime = (time.time() - start_timeFun) / len(X_test) #Report #y_pred = model.predict(X_test) errorTst = model.score(X_test, y_test) now = datetime.now() current_time = now.strftime("%H:%M:%S") print(current_time, ' ', exp_num, data_file, ' ', runOpt, ' test accuracy:', errorTst) #collection error = [errorTrn, errorTst, 'none', funcEvltime, 'empty'] data_results_coll.update({ str(data_file.split('.')[0]) + "_" + str(exp_num) + "_" + runOpt: error }) #collect all SGDs #end for all solve #end for each runs data_results_coll.update({'model_config': model.get_params()}) np.save( os.getcwd() + os.sep + 'outputs' + os.sep + data_file.split('.')[0] + '_Ep_' + str(EPOCHS) + '_B_All_' + runOpt, data_results_coll) #end for each dataum #%% save np.load print('All experiments end. for') print(data_file_set) #a = np.load('outputs'+os.sep+'mlp_friedman_Ep_500_B1Sig_l2.npy.npy').item()
Valeur = [Colonne[-1] for Colonne in voisin] prediction = max(set(Valeur), key=Valeur.count) return prediction #Creaction base de donnée BaseDeDonne=ChargerFichier1('iris.data') colonne = [5.7,2.9,4.2,1.3] print("Choisir un nombre de points sur lequel faire l'etude compris entre 1 et 150" ) nombrepoints=eval(input()) echantillon = Prediction(BaseDeDonne, colonne, nombrepoints) print('Donnee=%s, Prediction: %s' % (colonne, echantillon)) target = iris.target data = iris.data clf = GaussianNB() clf.fit(data, target) clf.get_params() result = clf.predict(data) conf = confusion_matrix(target, result) print("la matrice de confusion est : ") print(conf)
x = df.drop(['label', 'lld'], axis=1).values y = df['label'].values #preprocessing standardized_x = preprocessing.scale(x) #create a test set of size of about 20% of the dataset x_train, x_test, y_train, y_test = train_test_split(standardized_x, y, test_size=0.2, random_state=42, stratify=y) model = GaussianNB() f.write('Default parameters %s \n' % str(model.get_params())) model.fit(x_train, y_train) y_pred = model.predict(x_test) y_true = y_test print('Recall (TRP) %.2f (1 = best 0 = worse)' % recall_score(y_test, y_pred)) print("Accuracy score: %.2f" % model.score(x_test, y_test)) if args.deploy: print("[+] Model ready for deployment") joblib.dump(model, 'models/nb_model.pkl') """ Performance - Confusion matrix
class GaussianNB(Classifier):
    r"""Gaussian Naive Bayes classifier backed by scikit-learn.

    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT

    Reference:
        Murphy, Kevin P. "Naive bayes classifiers." University of British Columbia 18 (2006): 60.

    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

    See Also:
        * :class:`niaaml.classifiers.Classifier`
    """
    Name = 'Gaussian Naive Bayes'

    def __init__(self, **kwargs):
        r"""Create the wrapped estimator and silence sklearn warnings.

        The keyword arguments are accepted but not used here; parameters are
        applied through :meth:`set_parameters`.
        """
        silenced_categories = (
            ChangedBehaviorWarning,
            ConvergenceWarning,
            DataConversionWarning,
            DataDimensionalityWarning,
            EfficiencyWarning,
            FitFailedWarning,
            NonBLASDotWarning,
            UndefinedMetricWarning,
        )
        for category in silenced_categories:
            warnings.filterwarnings(action='ignore', category=category)

        self.__gaussian_nb = GNB()
        super(GaussianNB, self).__init__()

    def set_parameters(self, **kwargs):
        r"""Forward the given parameters to the wrapped estimator.
        """
        self.__gaussian_nb.set_params(**kwargs)

    def fit(self, x, y, **kwargs):
        r"""Fit the wrapped estimator.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to classify.
            y (pandas.core.series.Series): n classes of the samples in the x array.

        Returns:
            None
        """
        self.__gaussian_nb.fit(x, y)

    def predict(self, x, **kwargs):
        r"""Predict the class of each sample (row) in x.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to classify.

        Returns:
            pandas.core.series.Series: n predicted classes.
        """
        return self.__gaussian_nb.predict(x)

    def to_string(self):
        r"""Return a user friendly description of the classifier.

        Returns:
            str: the base class template filled with the classifier name and
            the wrapped estimator's parameters.
        """
        template = Classifier.to_string(self)
        described_params = self._parameters_to_string(
            self.__gaussian_nb.get_params())
        return template.format(name=self.Name, args=described_params)
# Script section: finish the KNN analysis, then run the same analysis for
# Gaussian Naive Bayes (cross-validation, test accuracy, learning curve).
plt.show()
print("finished validation analysis for KNN")
classifierKNN = classifierKNN.fit(features_train, labels_train)
kNN_predictions = classifierKNN.predict(features_test)
kNN_acc += accuracy_score(labels_test, kNN_predictions)
# cross_val_score returns a 1-D array with one score per fold, so the mean is
# taken over the whole array; the original axis=1 raised an AxisError.
KNN_valid_acc = np.mean(cross_val_score(classifierKNN, features, labels, cv=5))
print("KNN accuracy is" + str(kNN_acc))
# Gaussian Naive Bayes analysis
classifierNB = GaussianNB()
params = classifierNB.get_params()
gnb_valid_accs = cross_val_score(classifierNB, features, labels, cv=5)
print("gnb valid accs are" + str(gnb_valid_accs))
print("nb params are" + str(params))
classifierNB = classifierNB.fit(features_train, labels_train)
gnb_predictions = classifierNB.predict(features_test)
gnb_valid_acc = np.mean(gnb_valid_accs)
gnb_acc = accuracy_score(labels_test, gnb_predictions)
print("gnb has validation accuracy:" + str(gnb_valid_acc))
print("gnb has regular testing accuracy:" + str(gnb_acc))
train_sizes, train_scores, validation_scores = learning_curve(
    classifierNB, X=features, y=labels, scoring="accuracy")
# One row per training size; average and spread are taken across the folds.
mean_train_score = np.mean(train_scores, axis=1)
std_train_score = np.std(train_scores, axis=1)
# tunig RF param_grid_RF = { "max_depth": [2, 3, 4, 5, 6, None], "max_features": [2, 3, 4], "min_samples_split": [2, 4, 6], "min_samples_leaf": [1, 2, 3], "bootstrap": [True, False], "criterion": ["gini", "entropy"] } # grid_search_RF = GridSearchCV(clfRF, param_grid=param_grid_RF, scoring="f1") grid_search_RF.fit(features, labels) print grid_search_RF.best_params_ #tuning Gaussian not possible print clfGNB.get_params().keys() # Example starting point. Try investigating other evaluation techniques! from sklearn.cross_validation import train_test_split features_train, features_test, labels_train, labels_test = \ train_test_split(features, labels, test_size=0.3, random_state=42) ### Task 6: Dump your classifier, dataset, and features_list so anyone can ### check your results. You do not need to change anything below, but make sure ### that the version of poi_id.py that you submit can be run on its own and ### generates the necessary .pkl files for validating your results. print "Gaussian" test_classifier(clfGNB, my_dataset, features_list) dump_classifier_and_data(clfGNB, my_dataset, features_list)
# Script section: save the tuning plot, then train the final XGBoost model
# and report its accuracy.
pyplot.xlabel('n_estimators')
pyplot.ylabel('Log Loss')
pyplot.savefig('n_estimators_vs_learning_rate.png')
#The output for this line of code can be found at : https://tinyurl.com/y9js976p
#Learning rate graph can be found at : https://tinyurl.com/ycttuck3
############# XGboost - Picking the best values for learning rate and estimators
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
model = xgb.XGBClassifier(learning_rate=0.1, n_estimators=500, max_depth=5,
                          min_child_weight=4)
final_m = model.fit(X_train, y_train)
xgb.plot_importance(final_m)
plt.show()
# The training figure is now computed on the training split; the original
# predicted on X_test for both lines, so the two numbers were always equal.
# Both figures come from accuracy_score, so the labels say "accuracy"
# instead of "auc".
train_predictions = model.predict(X_train)
print("training set accuracy:", accuracy_score(y_train, train_predictions))
predictions = model.predict(X_test)
XGBA = accuracy_score(y_test, predictions)
print("test set accuracy:", XGBA)
print(model.get_params())
print("The Accuracy is {}".format(XGBA))
#The accuracy can be viewed at : https://tinyurl.com/y8l65kcv
#As you can see the max accuracy is achieved with XgBoost.
# Score the tuned XGB model on the test set, using column 1 of predict_proba.
preds = final_model.predict_proba(test_features)[:, 1]
baseline_auc44 = roc_auc_score(test_labels, preds)
print(
    'The final tuned XGB_model scores {:.5f} ROC AUC on the test set.'.format(
        baseline_auc44))
# In[19]:
#-----<Model5: GaussianNB>-------
from sklearn.naive_bayes import GaussianNB
# Establish a baseline model
base_NB = GaussianNB()
# Default hyperparameters
hyperparameters = base_NB.get_params()
print(hyperparameters)
# 10-fold cross-validated ROC AUC on the training data
NB_scores = cross_val_score(base_NB, train_features, train_labels,
                            scoring='roc_auc', cv=10)
print('The mean AUC for GaussianNB is:', NB_scores.mean())
base_NB.fit(train_features, train_labels)
# Actual class predictions
NB_predictions = base_NB.predict(test_features)
# Predicted probability from column 1 of predict_proba only
base_NB_probs = base_NB.predict_proba(test_features)[:, 1]
plt.ylabel(f[y_axis]) plt.legend() plt.show() """ ############################################# ######## 3. Fit and Tune Classifier ######### ############################################# from sklearn.model_selection import GridSearchCV from sklearn.naive_bayes import GaussianNB from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier print "\nStart training Naive Bayes..." NBclf = GaussianNB() NBclf.fit(X_train, y_train) print NBclf.get_params() print "\nStart training SVC..." #SVMparam = {'kernel':['linear', 'rbf', 'poly'], 'C':[1e2, 1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]} SVMparam = { 'kernel': ['linear', 'rbf'], 'C': [1e7, 1e6, 1e5, 1e4], 'gamma': [0.00001, 0.000001, 0.0000001] } #SVclf = SVC(kernel='rbf', C=1e7, gamma=0.00001) SVclf = GridSearchCV(SVC(), SVMparam) t0 = time() SVclf.fit(X_train, y_train) print "SVC completes in ", time() - t0, "seconds" print SVclf.best_estimator_
def _announce_stage(ftype, title):
    """Log the banner that marks the start of one classifier's training stage."""
    log.info('%s %s %s: Starting to process %s %s %s' % (Fore.GREEN,ftype,Fore.WHITE,Fore.BLUE,title,Fore.WHITE))


def _grid_search_best(estimator, param_grid, X_skim, y_skim):
    """Run a 2-fold grid search on the skimmed sample and return its best estimator."""
    search = GridSearchCV(estimator, param_grid, n_jobs=-1,
                          verbose=3 if args.verbose else 0, cv=2)
    search.fit(X_skim, y_skim)
    return search.best_estimator_


def _roc_record(clf, X_test, y_test):
    """Score ``clf`` on the test split and bundle the classifier with its ROC data.

    Returns the tuple ``(clf, y_test, disc, fpr, tpr, thresholds)``, where
    ``disc`` is column 1 of ``clf.predict_proba(X_test)`` and the last three
    items are the outputs of ``roc_curve(y_test, disc)``.
    """
    disc = clf.predict_proba(X_test)[:,1]
    fpr, tpr, thresholds = roc_curve(y_test, disc)
    return (clf, y_test, disc, fpr, tpr, thresholds)


def proc_type(idx,ftype):
    """Train, tune and evaluate all classifiers for one type directory.

    Reads ``featurenames.pkl`` and ``tree.pkl`` from ``args.indir + ftype + "/"``,
    splits the rows into signal and background by their last column (looked up
    in ``flav_dict``), and trains seven classifiers: GBC, RF, SGD, kNN, NB, MLP
    and SVM. Every classifier except NB is tuned with a 2-fold grid search on
    10% of the training split; every classifier except SVM is then fitted on
    the full training split.

    Outputs written into the type directory:
      * ``OptimizedClassifiers.txt`` - one line of ``get_params()`` per classifier
      * ``ROCcurves.png``            - only when ``args.dumpROC`` is set
      * ``TrainingOutputs.pkl``      - dict: name -> (clf, y_test, disc, fpr, tpr, thresholds)

    Args:
        idx: zero-based position of this type; only used in the progress banner
            together with the module-level ``ntypes``.
        ftype: name of the type; it is the sub-directory name under ``args.indir``.

    Returns:
        None.

    Note:
        ``args.pickEvery`` is applied as a slice step, so it must be a positive
        integer.
    """
    typedir = args.indir+ftype+"/"
    log.info('************ Processing Type (%s/%s): %s %s %s ****************' % (str(idx+1),str(ntypes),Fore.GREEN,ftype,Fore.WHITE))
    if args.verbose:
        log.info('Working in directory: %s' % typedir)

    # NOTE(security): pickle.load executes arbitrary code embedded in the file;
    # only point args.indir at trusted data.
    # Pickles are binary, so they are opened in "rb", and the handles are
    # closed deterministically instead of being leaked.
    with open(typedir + "featurenames.pkl","rb") as handle:
        # The result is not used in this function; the load is kept so that a
        # missing or unreadable file is still reported before training starts.
        pickle.load(handle)
    with open(typedir + "tree.pkl","rb") as handle:
        X_full = pickle.load(handle)

    # The last column of each row is matched against flav_dict and then dropped.
    X_signal = np.asarray([x for x in X_full if x[-1] in flav_dict[args.signal]])[:,0:-1]
    X_bkg = np.asarray([x for x in X_full if x[-1] in flav_dict[args.bkg]])[:,0:-1]

    # Keep only every 'pickEvery'-th row of each sample.
    X_signal = X_signal[::args.pickEvery]
    X_bkg = X_bkg[::args.pickEvery]

    X = np.concatenate((X_signal,X_bkg))
    y = np.concatenate((np.ones(len(X_signal)),np.zeros(len(X_bkg))))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    # Hyper-parameter optimization only runs on 10 % of the training split.
    X_train_skimmed = X_train[::10]
    y_train_skimmed = y_train[::10]

    n_skim = len(X_train_skimmed)
    min_split_options = [int(0.005*n_skim), int(0.01*n_skim)]

    # One row per classifier, in processing order:
    #   (key, banner title, estimator, parameter grid or None, refit on full split?)
    # NOTE(review): the 'n_iter' (SGD) and 'algorithm' (MLP) grid keys look like
    # parameter names from older scikit-learn releases - confirm against the
    # version this project pins before upgrading.
    stages = [
        ("GBC", 'Gradient Boosting Classifier',
         GradientBoostingClassifier(),
         {'n_estimators':[50,100,200], 'max_depth':[5,10,15],
          'min_samples_split':min_split_options, 'learning_rate':[0.05,0.1]},
         True),
        ("RF", 'Randomized Forest Classifier',
         RandomForestClassifier(n_jobs=5),
         {'n_estimators':[50,100,200], 'max_depth':[5,10,15],
          'min_samples_split':min_split_options, 'max_features':["sqrt","log2",0.5]},
         True),
        ("SGD", 'Stochastic Gradient Descent',
         SGDClassifier(learning_rate='optimal'),
         {'loss':['log','modified_huber'], 'penalty':['l2','l1','elasticnet'],
          'alpha':[0.0001,0.00005,0.001], 'n_iter':[10,50,100]},
         True),
        ("kNN", 'Nearest Neighbors',
         KNeighborsClassifier(),
         {'n_neighbors':[5,10,50,100], 'algorithm':['ball_tree','kd_tree','brute'],
          'leaf_size':[20,30,40], 'metric':['euclidean','minkowski','manhattan','chebyshev']},
         True),
        # There is no tuning of a likelihood ratio, hence no grid.
        ("NB", 'Naive Bayes (Likelihood Ratio)',
         GaussianNB(),
         None,
         True),
        ("MLP", 'Multi-Layer Perceptron (Neural Network)',
         MLPClassifier(max_iter = 500),
         {'activation':['tanh','relu'], 'hidden_layer_sizes':[10,(5,10),(10,15)],
          'algorithm':['adam'], 'alpha':[0.0001,0.00005],
          'tol':[0.00001,0.00005,0.0001], 'learning_rate_init':[0.001,0.005,0.0005]},
         True),
        # The SVM keeps the fit from the grid search on the skimmed sample;
        # its refit on the full training split was already disabled.
        ("SVM", 'Support Vector Machine',
         SVC(probability=True),
         {'kernel':['rbf'], 'gamma':['auto',0.05], 'C':[0.9,1.0]},
         False),
    ]

    Classifiers = {}
    # The with-block closes the report file even if a training stage raises.
    with open(typedir+'OptimizedClassifiers.txt', 'w') as OutFile:
        for key, title, estimator, param_grid, refit_on_full in stages:
            _announce_stage(ftype, title)
            if param_grid is None:
                best_clf = estimator
                if args.verbose:
                    log.info('Parameters of the best classifier: A simple likelihood ratio has no parameters to be tuned!')
                    best_clf.verbose = 2
            else:
                best_clf = _grid_search_best(estimator, param_grid, X_train_skimmed, y_train_skimmed)
                if args.verbose:
                    log.info('Parameters of the best classifier: %s' % str(best_clf.get_params()))
                    best_clf.verbose = 2
            if refit_on_full:
                best_clf.fit(X_train,y_train)
            Classifiers[key] = _roc_record(best_clf, X_test, y_test)
            OutFile.write(key + ": " + str(best_clf.get_params()) + "\n")

    if args.dumpROC:
        # Curves are drawn in the original legend order.
        for key in ("GBC", "RF", "SVM", "SGD", "kNN", "NB", "MLP"):
            fpr, tpr = Classifiers[key][3], Classifiers[key][4]
            plt.semilogy(tpr, fpr, label=key)
        #plt.semilogy([0,0.1,0.2,0.3,0.4,0.5,0.6,0.8,1], [0.00001,0.002,0.01,0.04,0.1,0.2,0.3,0.6,1],label='Current c-tagger')
        plt.ylabel(args.bkg + " Efficiency")
        plt.xlabel(args.signal + " Efficiency")
        plt.legend(loc='best')
        plt.grid(True)
        plt.savefig("%sROCcurves.png" % typedir)
        plt.clf()

    log.info('Done Processing Type: %s, dumping output in %sTrainingOutputs.pkl' % (ftype,typedir))
    # Single-argument call: prints an empty line on both Python 2 and 3.
    print("")
    with open(typedir + "TrainingOutputs.pkl", "wb") as handle:
        pickle.dump(Classifiers, handle)
# Inspect the fitted GaussianNB. The commented values under each line are
# the outputs recorded by the original author.
re6 = clf.class_count_
# print(re6)
# [4. 3.]
re7 = clf.theta_
# print(re7)
# [[-2.5 -2.5]
# [ 2. 2. ]]
# scikit-learn renamed ``sigma_`` to ``var_`` (deprecated in 1.0, removed in
# 1.2); read whichever attribute this installation provides.
re8 = clf.var_ if hasattr(clf, "var_") else clf.sigma_
# print(re8)
# [[1.25000001 1.25000001]
# [0.66666667 0.66666667]]
re9 = clf.get_params(deep=True)
# print(re9)
# {'priors': [0.625, 0.375], 'var_smoothing': 1e-09}
re10 = clf.get_params()
# print(re10)
# {'priors': [0.625, 0.375], 'var_smoothing': 1e-09}
re11 = clf.set_params(priors=[0.625, 0.375])
# print(re11)
# GaussianNB(priors=[0.625, 0.375], var_smoothing=1e-09)

# Refit with one weight per sample; the weights are passed by keyword so the
# third argument's meaning is explicit.
re12 = clf.fit(X, y, sample_weight=np.array([0.05, 0.05, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2]))
re13 = clf.theta_
re14 = clf.var_ if hasattr(clf, "var_") else clf.sigma_
print(re12)