def MungeData(train, test):
    """Replace categorical columns with naive-Bayes probability encodings.

    A fixed list of columns is dropped from both frames (in place). Every
    remaining feature column of dtype ``object`` is expanded through
    ``Binarize``, a ``BernoulliNB`` is fitted on the training indicator
    columns against ``train.target``, and the column is overwritten with
    the second column of ``predict_proba``. The temporary indicator
    columns are removed again. Finally all feature columns are cast to
    float and missing values are filled with ``-1``.

    Feature columns are everything from the third column on.
    NOTE(review): presumably the first two columns are an id and the
    target - confirm against the caller.

    Returns:
        tuple: ``(train, test)`` after encoding.
    """
    todrop = ['v22', 'v112', 'v125', 'v74', 'v1', 'v110', 'v47']
    print(todrop)
    for frame in (train, test):
        frame.drop(todrop, axis=1, inplace=True)

    for col in train.columns[2:]:
        if train[col].dtype != 'object':
            continue
        print(col)
        train, binfeatures = Binarize(col, train)
        # Re-use the training categories so both frames get the same columns.
        test, _ = Binarize(col, test, binfeatures)
        indicator_cols = col + '_' + binfeatures

        encoder = BernoulliNB()
        encoder.fit(train[indicator_cols].values, train.target.values)
        train[col] = encoder.predict_proba(train[indicator_cols].values)[:, 1]
        test[col] = encoder.predict_proba(test[indicator_cols].values)[:, 1]

        train.drop(indicator_cols, inplace=True, axis=1)
        test.drop(indicator_cols, inplace=True, axis=1)

    features = train.columns[2:]
    for frame in (train, test):
        frame[features] = frame[features].astype(float)
    for frame in (train, test):
        frame.fillna(-1, inplace=True)
    return train, test
def predict(cur, plyr_id, game_plyrs):
    """Estimate a player's expected fantasy score for one game line-up.

    Builds a games x players training matrix for ``plyr_id``, fits a
    Bernoulli naive Bayes classifier on it, and turns the predicted class
    probabilities for the line-up ``game_plyrs`` into an expected value.

    Args:
        cur: database cursor handed through to the query helpers.
        plyr_id: id of the player to predict for.
        game_plyrs: players taking part in the game to predict; passed to
            ``update_training_matrix`` to fill the single test row.

    Returns:
        ``0`` when the player has no games on record, otherwise the
        expected value rounded to one decimal.
    """
    all_plyrs = all_player_ids(cur)          # np.array - all NFL players (and coaches)
    games = games_played_in(cur, plyr_id)    # np.array - game ids the player played in
    n_cols = all_plyrs.shape[0]
    m_rows = games.shape[0]

    # Training set: one row per game, one column per player.
    X = pd.DataFrame(np.zeros((m_rows, n_cols)), index=games, columns=all_plyrs)
    populate_training_set(cur, X, games, plyr_id)
    print("X: ", X.values)

    # Vector of known outputs for the training games.
    Y = training_output_vector(cur, games, plyr_id)
    print("(len) Y: ", len(Y), Y)

    # The test frame gets its OWN single-row buffer. The original built it
    # from the training `zeros` array, which gave it m_rows rows and could
    # let it share memory with X.
    test_X = pd.DataFrame(np.zeros((1, n_cols)), columns=all_plyrs)
    update_training_matrix(game_plyrs, 0, test_X)

    if len(X.values) == 0:
        return 0

    nb_clf = BernoulliNB()
    nb_clf.fit(X, Y)
    print("test_X: ", test_X.values)

    raw_probs = nb_clf.predict_proba(test_X)[0]
    # Pass a copy so the raw vector printed below is unaffected even if
    # normalize_probs works in place.
    nb_norm_prob = normalize_probs(raw_probs.copy())
    # One representative score per output class.
    # NOTE(review): presumably bucket midpoints produced by the
    # discretisation of the scores - confirm.
    avgs = [1.5, 4.5, 7.5, 10.5, 13.5, 16.5, 19.5, 22.5, 25.5, 28.5, 31.5]
    print("param vector: ", raw_probs)
    print("probs: ", nb_norm_prob)
    print(avgs)
    ev = expected_val(nb_norm_prob, avgs)    # could also be a dot product
    return round(ev, 1)
def MungeData(train, test, validation):
    """Encode categorical features of three frames with naive Bayes.

    For every feature column (third column onwards) of dtype ``object``
    other than ``"v22"``, the column is expanded through ``Binarize``, a
    ``BernoulliNB`` is fitted on the training indicators against
    ``train.target``, and the column is replaced in all three frames by
    the second column of ``predict_proba``. The temporary indicator
    columns are dropped and the encoded column is cast to float.

    Returns:
        tuple: ``(train, test, validation)``.
    """
    features = train.columns[2:]
    print(type(features))
    for col in features:
        if train[col].dtype != 'object' or col == "v22":
            continue
        print(col)
        train, binfeatures = Binarize(col, train)
        # Categories found in train define the indicator columns everywhere.
        test, _ = Binarize(col, test, binfeatures)
        validation, _ = Binarize(col, validation, binfeatures)
        indicator_cols = col + '_' + binfeatures

        nb = BernoulliNB()
        nb.fit(train[indicator_cols].values, train.target.values)
        train[col] = nb.predict_proba(train[indicator_cols].values)[:, 1]
        test[col] = nb.predict_proba(test[indicator_cols].values)[:, 1]
        validation[col] = \
            nb.predict_proba(validation[indicator_cols].values)[:, 1]

        train.drop(indicator_cols, inplace=True, axis=1)
        test.drop(indicator_cols, inplace=True, axis=1)
        validation.drop(indicator_cols, inplace=True, axis=1)

        train[col] = train[col].astype(float)
        test[col] = test[col].astype(float)
        validation[col] = validation[col].astype(float)
    return train, test, validation
def BernoulliNB_pred(X_train, X_test, y_train):
    """Fit a Bernoulli naive Bayes model and score both data sets.

    Args:
        X_train: training features.
        X_test: features to predict for.
        y_train: training labels.

    Returns:
        tuple: ``(test_scores, train_scores)`` - the second column of
        ``predict_proba`` for ``X_test`` and for ``X_train``. Note that
        the test scores come first.
    """
    model = BernoulliNB()
    model.fit(X_train, y_train)
    train_scores = model.predict_proba(X_train)[:, 1]
    test_scores = model.predict_proba(X_test)[:, 1]
    return test_scores, train_scores
def main():
    """Train and compare Bernoulli and Multinomial NB for one player.

    Reads the game, player and fantasy-score CSV files from the working
    directory, builds the training matrix for a hard-coded player, holds
    out the tail of the games for evaluation, and prints predictions,
    accuracies, probability estimates and an expected value.
    """
    start_time = time.time()

    games_data = pd.read_csv('games-data.csv')
    player_data = pd.read_csv('players.csv')
    plyr_ids = np.unique(np.array(player_data['ID']))
    fantasy_scores = pd.read_csv('fantasy_scores.csv')

    # Player training matrix, split into a leading train part and a
    # trailing hold-out part.
    plyr_id = 8439
    X = create_training_set(plyr_id, games_data, plyr_ids)
    index = get_ninety_percent(len(np.array(X.index)))
    train_X = X[:index]
    test_X = X[index:]

    # Labels must cover ALL games. The original derived them from
    # train_X.index only, which left test_Y empty after the split below.
    plyr_game_ids = np.array(X.index)
    scores = plyr_fantasy_pts(plyr_id, plyr_game_ids, fantasy_scores)
    Y = discretize(scores.values)
    train_Y = Y[:index]
    test_Y = Y[index:]

    nb_clf = BernoulliNB()
    nb_clf.fit(train_X, train_Y)
    nb_predictions = nb_clf.predict(test_X)

    mn_clf = MultinomialNB()
    mn_clf.fit(train_X, train_Y)
    # Fixed: the original predicted with nb_clf here.
    mn_predictions = mn_clf.predict(test_X)

    # Check game / fantasy score alignment on the hold-out rows.
    for offset in range(test_Y.shape[0]):
        row = index + offset
        print(plyr_game_ids[row], scores.values[row], test_Y[offset],
              nb_predictions[offset], mn_predictions[offset])

    nb_proba = nb_clf.predict_proba(test_X)
    print("Bernoulli NB accuracy: ", nb_clf.score(test_X, test_Y))
    print("Bernoulli NB prob estimates: ", nb_proba)
    print("Multinomial NB accuracy: ", mn_clf.score(test_X, test_Y))
    print("Multinomial NB prob estimates: ", mn_clf.predict_proba(test_X))
    print(len(nb_proba[0]))

    # Copy so the helper cannot modify nb_proba in place.
    nb_norm_prob = normalize_probs(nb_proba[0].copy())
    vals = [1.5, 4.5, 7.5, 10.5, 13.5, 16.5, 19.5, 22.5, 25.5, 28.5, 31.5]
    ev = expected_val(nb_norm_prob, vals)
    print("EV: ", ev)

    end_time = time.time()
    print("Elapsed time was %g seconds" % (end_time - start_time))
def test_alpha():
    """Check how the discrete NB estimators treat alpha=0 and alpha<0.

    With ``alpha=0`` fitting must emit a ``UserWarning`` and still give
    finite probabilities, even when p(x_i|y_j)=0 occurs; this is checked
    for dense and sparse input. A negative ``alpha`` must raise
    ``ValueError`` from both ``fit`` and ``partial_fit``.
    """
    X = np.array([[1, 0], [1, 1]])
    y = np.array([0, 1])
    # Expected posteriors on the training data, per estimator.
    cases = [
        (BernoulliNB, np.array([[1, 0], [0, 1]])),
        (MultinomialNB, np.array([[2. / 3, 1. / 3], [0, 1]])),
    ]

    # Dense input: both partial_fit and fit must warn.
    for estimator, prob in cases:
        nb = estimator(alpha=0.)
        assert_warns(UserWarning, nb.partial_fit, X, y, classes=[0, 1])
        assert_warns(UserWarning, nb.fit, X, y)
        assert_array_almost_equal(nb.predict_proba(X), prob)

    # Sparse input.
    X_sparse = scipy.sparse.csr_matrix(X)
    for estimator, prob in cases:
        nb = estimator(alpha=0.)
        assert_warns(UserWarning, nb.fit, X_sparse, y)
        assert_array_almost_equal(nb.predict_proba(X_sparse), prob)

    # alpha < 0 is rejected with a fixed message.
    expected_msg = ('Smoothing parameter alpha = -1.0e-01. '
                    'alpha should be > 0.')
    for estimator, _ in cases:
        assert_raise_message(ValueError, expected_msg,
                             estimator(alpha=-0.1).fit, X, y)
    for estimator, _ in cases:
        assert_raise_message(ValueError, expected_msg,
                             estimator(alpha=-0.1).partial_fit, X, y,
                             classes=[0, 1])
def bayes_model(self):
    """Fit a Bernoulli NB model and score the test and prediction sets.

    Returns:
        tuple: ``(test_y_predict, predict_y)`` where ``test_y_predict`` is
        a DataFrame of class probabilities for ``self.test_x`` (one column
        per class label, with class-1 probabilities <= 0.01 clamped to 0)
        and ``predict_y`` is the list of class-1 probabilities for
        ``self.predict_x``.

    Raises:
        ValueError: if label ``1`` is not among the fitted classes.
    """
    logger.info('Bayes_model beginning ...')
    classifier = BernoulliNB()
    classifier.fit(self.train_x, self.train_y)

    classes = list(classifier.classes_)
    positive_pos = classes.index(1)  # column position of class 1 in predict_proba
    test_y_predict = pd.DataFrame(classifier.predict_proba(self.test_x),
                                  columns=classes)
    # The frame's columns are class LABELS, so address the positive column
    # by its label. The original used the position, which only coincides
    # with the label when the classes are exactly 0..k-1.
    test_y_predict[1] = test_y_predict[1].apply(
        lambda p: 0 if p <= 0.01 else p)

    predict_y = list(classifier.predict_proba(self.predict_x)[:, positive_pos])
    data_results.save_model(obj=classifier,
                            path="../../data/results_2/bayes_model.pk")
    return test_y_predict, predict_y
def model_comparison(train, validation, features):
    """Print the validation log-loss of Bernoulli NB and logistic regression.

    Args:
        train: frame with the ``features`` columns and a ``'crime'`` target.
        validation: frame with the same columns, used for scoring.
        features: list of feature column names.
    """
    # Fixed: the original read an undefined name `training` and ignored
    # the `train` parameter.
    contenders = [
        ("BernoulliNB", BernoulliNB()),
        # Logistic regression is included for comparison.
        ("LogisticRegression", LogisticRegression(C=.01)),
    ]
    for label, model in contenders:
        model.fit(train[features], train['crime'])
        predicted = np.array(model.predict_proba(validation[features]))
        print(label)
        print(log_loss(validation['crime'], predicted))
def estimate(self):
    """Fit Bernoulli NB on ``self.X``/``self.Y`` and score ``self.test_data``.

    When ``self.ids`` is set, the ids and the second ``predict_proba``
    column are written into the first two columns of ``self.estimates``.
    Otherwise ``self.crosscheck_estimates`` is set to a dict holding the
    predicted labels and those probabilities.

    If ``self.calc_start`` is set, the seconds elapsed since it are
    printed after fitting and again after predicting.
    """
    def report_elapsed(prefix):
        # Both timings are measured from calc_start, not from each other.
        if self.calc_start is not None:
            seconds = (dt.datetime.now() - self.calc_start).total_seconds()
            print(prefix + str(seconds) + 's')

    # Class priors are left to be estimated from the data. Earlier
    # experiments used MultinomialNB, a decision tree, k-nearest
    # neighbours and a random forest instead.
    classifier = BernoulliNB()
    classifier.fit(self.X, self.Y)
    report_elapsed('Fitting time: ')

    results_proba = classifier.predict_proba(self.test_data)
    report_elapsed('Prediction time: ')
    print(results_proba[:100])

    if self.ids is None:
        self.crosscheck_estimates = {
            'labels': classifier.predict(self.test_data),
            'proba': results_proba[:, 1]
        }
        return
    self.estimates[:, 0] = self.ids
    self.estimates[:, 1] = results_proba[:, 1]
def _simple_cross_validate(self):
    """
    Cross-validate one fixed Bernoulli NB model as a sanity check.

    The fixed model is used to double check sklearn's random search and
    the project's own random search, and to confirm that the API is
    compatible with late fusion. The per-fold and weighted-average f1
    scores are logged, after which the process exits via ``quit()``.
    :return: never returns normally
    """
    # NOTE(review): recent scikit-learn releases reject random_state
    # unless shuffle=True - confirm against the pinned version.
    kf = KFold(n_splits=self.args.cv_num,
               random_state=self.args.random_seed)
    metric_values = {name: [] for name in self.metric_names}
    base = BernoulliNB(alpha=0.8490, binarize=0.3086, fit_prior=True)
    clf = OneVsRestClassifier(base, n_jobs=self.args.n_jobs)

    for train_idx_list, test_idx_list in kf.split(self.data_x, self.data_y):
        X_test = self.data_x[test_idx_list]
        y_test = self.data_y[test_idx_list]
        clf.fit(self.data_x[train_idx_list], self.data_y[train_idx_list])

        y_predict = np.argmax(clf.predict_proba(X_test), axis=-1)
        metric_results = utils.evaluate_any_type(y_test, y_predict,
                                                 self.id2label)
        # Keep the fold size next to each score for the weighted average.
        for name in self.metric_names:
            metric_values[name].append([metric_results[name], len(y_test)])

    metric_weighted_avg = self._get_weighted_avg(metric_values)
    metric_name = 'f1'
    print_to_log('The {0} score in cross validation is {1}'.format(
        metric_name, metric_values[metric_name]))
    print_to_log('The average {0} score is {1}'.format(
        metric_name, metric_weighted_avg[metric_name]))
    quit()
def navie_bayes(df):
    """
    Fit a naive Bayes model and log its training time and log-loss.

    :param df: frame holding the feature columns listed below and a
        'crime' target column
    :return: None
    """
    # Only the day of week and the police district are used as inputs ...
    weekdays = [
        'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
        'Sunday'
    ]
    districts = [
        'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK',
        'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN'
    ]
    # ... plus the hour of the day at which the crime happened.
    hours = list(np.arange(0, 24))
    features = weekdays + districts + hours

    # Split into training (70%) and validation (30%) sets.
    training, validation = train_test_split(df, test_size=0.3)

    # Fit the model, timing only the fit itself, then compute the log-loss.
    model = BernoulliNB()
    started = time.time()
    model.fit(training[features], training['crime'])
    cost_time = time.time() - started

    predicted = np.array(model.predict_proba(validation[features]))
    loss = log_loss(validation['crime'], predicted)
    logging.info(f'朴素贝叶斯建模耗时{cost_time}秒')
    logging.info(f'朴素贝叶斯log 损失为{loss}')
def _bernoulli_NB(self):
    """Fit Bernoulli NB, print its test accuracy and run ``ks`` on it.

    ``ks`` receives the FIRST ``predict_proba`` column (the probability
    of ``classes_[0]``) together with the true test labels.
    """
    model = BernoulliNB()
    model.fit(self.X_train, self.y_train)
    accuracy = model.score(self.X_test, self.y_test)
    print('Accuracy rate of Naive Bayes: {0:.3f}'.format(accuracy))

    proba = model.predict_proba(self.X_test)
    ks(proba[:, 0], self.y_test)
def anomality_model(history, predict):
    '''
    Score how likely an event belongs to its user (Bernoulli naive Bayes).

    The history is relabelled as "this user" (1) versus "everyone else"
    (0); rows of other users are de-duplicated to reduce the negatives.

    :param history: the dataframe with which to fit the model; needs
        'timestamp' and 'display_name' columns
    :param predict: the element to be used for the prediction, indexed by
        the same column names
    :return: the predicted probability of class 1 (the user) for the given
        element; 0.0 when the user never occurs in the history
    '''
    # Local import: DataFrame.append was removed in pandas 2.0, so
    # pd.concat is needed and this block did not use `pd` before.
    import pandas as pd

    user = predict["display_name"]
    history = history.drop(['timestamp'], axis=1)
    history["display_name"] = np.where(history["display_name"] == user, 1, 0)

    is_user = history["display_name"] == 1
    df_target = history[is_user]
    df_core = history[~is_user].drop_duplicates()  # reduce the "no-events"
    df = pd.concat([df_target, df_core])

    model = BernoulliNB()
    model.fit(df.drop(['display_name'], axis=1), df['display_name'])

    predict = predict.drop(['timestamp', 'display_name'])
    output = model.predict_proba(np.array(predict).reshape(1, -1))

    # Look the positive column up by label. With only one class present,
    # predict_proba has a single column and output[0][1] would fail.
    classes = list(model.classes_)
    if 1 not in classes:
        return 0.0
    return output[0][classes.index(1)]
def _eval_(self):
    '''
    INPUT: None
    OUTPUT: None

    Re-runs naive Bayes on an 80/20 split and stores on ``self``: the
    confusion matrix, overall accuracy, classification report, the
    majority-class baseline, the predicted probabilities, and the top-2
    accuracy (the share of test rows whose true cluster is among the two
    most probable predicted clusters).
    '''
    X_train, X_test, y_train, y_test = \
        train_test_split(self.country_dummies, self.cars['cluster'],
                         test_size=0.2, random_state=42)
    nb = BernoulliNB(class_prior=self.new_priors)
    nb.fit(X_train, y_train)
    pred = nb.predict(X_test)

    # Positional view of the labels. After the shuffle a pandas Series
    # keeps its original index, so `y_test[i]` would be a label lookup.
    y_true = np.asarray(y_test)

    confusion = confusion_matrix(y_test, pred)
    self.nb_confusion = confusion
    self.nb_accuracy = np.trace(confusion) / float(np.sum(confusion))
    self.nb_reports = classification_report(y_test, pred)
    self.baseline = np.bincount(y_true).max() / float(y_true.shape[0])

    self.probs = nb.predict_proba(X_test)
    # argsort yields column positions; map them to class labels before
    # comparing with the true labels.
    top2 = nb.classes_[np.argsort(self.probs, axis=1)[:, -2:]]
    hits = (top2 == y_true[:, None]).any(axis=1)
    self.nb_top2_accuracy = hits.sum() / float(len(hits))
def plot_roc_curves(X, y):
    """Plot validation ROC curves for three classifiers and save the figure.

    The data is split 80/20, the training part is oversampled with
    ADASYN, and logistic regression, Bernoulli naive Bayes and an SVC are
    each fitted on the oversampled data. Their ROC curves on the
    untouched validation part are drawn into one figure, which is written
    to 'ROC Curves for top 3 Contending Models'.

    Args:
        X: feature matrix.
        y: binary target.
    """
    plt.figure(figsize=(10, 6))
    lw = 2

    # Train/validation split; only the training part is oversampled.
    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      test_size=.2,
                                                      random_state=0)
    adasyn = ADASYN(random_state=44)
    # Newer imbalanced-learn only offers fit_resample; older releases only
    # offered fit_sample. Prefer the new name and fall back to the old.
    resample = getattr(adasyn, 'fit_resample', None) or adasyn.fit_sample
    X_oversampled_train, y_oversampled_train = resample(X_train, y_train)

    contenders = [
        (LogisticRegression(max_iter=5000, n_jobs=-1, random_state=44),
         'b', 'Logistic Regression'),
        (BernoulliNB(), 'r', 'Bernoulli Naive Bayes'),
        (SVC(probability=True, random_state=1), 'g', 'SVC'),
    ]
    for model, color, name in contenders:
        # Fit on oversampled data, score probabilities on validation data.
        model.fit(X_oversampled_train, y_oversampled_train)
        positive_scores = model.predict_proba(X_val)[:, 1]
        fpr, tpr, _ = roc_curve(y_val, positive_scores)
        model_auc = roc_auc_score(y_val, positive_scores)
        plt.plot(fpr,
                 tpr,
                 color=color,
                 lw=lw,
                 label=f'{name}, AUC: {model_auc:.4f}')

    plt.plot([0, 1], [0, 1], c='violet', ls='--', label='Chance Line')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC Curves for top 3 Contending Models')
    plt.legend(loc='lower right', prop={'size': 10}, frameon=True)
    plt.savefig('ROC Curves for top 3 Contending Models')
def bnb(self):
    """Fit Bernoulli NB and print accuracy, reports and ROC AUC scores.

    Prints the test accuracy in percent, the classification reports for
    the training and test sets, and the one-vs-one and one-vs-rest ROC
    AUC of the test probabilities.
    """
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.metrics import classification_report, roc_auc_score

    divider = '\n-------------------------------------------------------\n'

    model = BernoulliNB()
    model.fit(self.X_train, self.y_train)
    train_pred = model.predict(self.X_train)
    test_pred = model.predict(self.X_test)

    accuracy_pct = round(model.score(self.X_test, self.y_test) * 100, 2)
    print('Model Accuracy: ', accuracy_pct)

    print('Naive Bayes:\n 1. train 2. test')
    train_report = classification_report(self.y_train, train_pred)
    test_report = classification_report(self.y_test, test_pred)
    print(train_report, test_report, sep=divider)

    y_score = model.predict_proba(self.X_test)
    auc_ovo = roc_auc_score(self.y_test, y_score, multi_class='ovo')
    auc_ovr = roc_auc_score(self.y_test, y_score, multi_class='ovr')
    print('ovo', auc_ovo, 'ovr', auc_ovr, sep=divider)
class NaiveBayesClassifierBernoulli:
    """Sentence classifier backed by scikit-learn's ``BernoulliNB``.

    The model is trained at construction time from a svmlight-format
    matrix file; a pickled dictionary is used to turn sentences into
    feature rows.
    """

    def __init__(
            self,
            matrixFileName="/Users/chengyu/Documents/python/data/matrixForLearning",
            dicFileName="/Users/chengyu/Documents/python/data/dictionary"):
        """Load the training matrix and dictionary, then fit the model.

        Args:
            matrixFileName: path of the svmlight training matrix.
            dicFileName: path of the pickled dictionary.
        """
        self.X, self.Y = load_svmlight_file(matrixFileName)
        # NOTE: unpickling can execute arbitrary code - only load
        # dictionary files from a trusted source.
        with open(dicFileName, "rb") as dic_file:
            self.dictionary = pickle.load(dic_file)
        self.bernoulliNB = BernoulliNB()
        self.bernoulliNB.fit(self.X, self.Y)
        self.matrixParser = Parser.MatrixParserForLearning()

    def _row_for(self, string):
        """Return the feature row for ``string`` (``None`` if the parser gives none)."""
        return self.matrixParser.getRowForClassify(string, self.dictionary)

    def classifyOneSentence(self, string):
        """Return the predicted label array for ``string``, or ``None``."""
        row = self._row_for(string)
        # `is None`, not `!= None`: the row may be an array or sparse
        # matrix, for which `!=` compares element-wise.
        if row is None:
            return None
        return self.bernoulliNB.predict(row)

    def classifyOneSentenceWithProbability(self, string):
        """Return P(second class) - P(first class) for ``string``, or ``None``."""
        row = self._row_for(string)
        if row is None:
            return None
        proba = self.bernoulliNB.predict_proba(row)
        return proba[0][1] - proba[0][0]
def bayes1():
    """Toy Bernoulli naive Bayes demo: will it rain tomorrow?

    Prints the per-class feature counts, fits the model on seven observed
    days, and prints the prediction and class probabilities for one new
    day.
    """
    # Columns mean: [north wind, muggy, cloudy, rain in the forecast]
    X = np.array([[0, 1, 0, 1], [1, 1, 1, 0], [0, 1, 1, 0], [0, 0, 0, 1],
                  [0, 1, 1, 0], [0, 1, 0, 1], [1, 0, 0, 1]])
    # Whether it actually rained on each of the 7 days: 0 = dry, 1 = rain
    y = np.array([0, 1, 1, 0, 1, 0, 0])

    counts = {label: X[y == label].sum(axis=0) for label in np.unique(y)}
    print('feature counts:\n{}'.format(counts))

    clf = BernoulliNB()
    clf.fit(X, y)
    Next_day = [[0, 0, 1, 0]]
    pre = clf.predict(Next_day)

    print('------------')
    print('要下雨了' if pre == [1] else '晴天')
    print('--------------')
    # Naive Bayes is poor at estimating exact values; treat the
    # probabilities it reports as indicative only.
    print('模型预测分类的概率:{}'.format(clf.predict_proba(Next_day)))
    print('--------------')
def test_compare_to_sklearn_very_simple(self):
    """DiscreteNaiveBayes must predict like sklearn's NB on a tiny frame.

    The sklearn models are fitted without smoothing on one-hot encoded
    columns; their predictions must equal those of ``DiscreteNaiveBayes``
    fitted on the raw columns. Probabilities are only printed.
    """
    data = pd.DataFrame([(True, 1, 1), (False, 2, 2), (True, 3, 1)],
                        columns=["y", "col1", "col2"])
    feature_cols = ["col1", "col2"]

    dnb = DiscreteNaiveBayes()
    dnb = dnb.fit(data[feature_cols], data["y"])
    print(dnb.predict(data[feature_cols]))

    # One-hot encode each column for the sklearn estimators.
    X = pd.concat([pd.get_dummies(data[name]) for name in feature_cols],
                  axis=1)

    sk_models = []
    # Compare to MultinomialNB first, then to BernoulliNB.
    for estimator in (MultinomialNB, BernoulliNB):
        model = estimator(alpha=0)
        model = model.fit(X, data["y"])
        print(model.predict(X))
        self.assertListEqual(
            dnb.predict(data[feature_cols]).tolist(),
            model.predict(X).astype(int).tolist())
        sk_models.append(model)

    print(dnb.predict_proba(data[feature_cols]))
    for model in sk_models:
        print(model.predict_proba(X))
class BernoulliNBImpl():
    """Thin wrapper that builds and delegates to an ``SKLModel`` instance.

    The hyperparameters are stored at construction time; the wrapped
    model is (re)created on every call to ``fit``.
    """

    def __init__(self,
                 alpha=1.0,
                 binarize=0.0,
                 fit_prior=True,
                 class_prior=None):
        """Record the hyperparameters passed on to ``SKLModel``."""
        self._hyperparams = dict(alpha=alpha,
                                 binarize=binarize,
                                 fit_prior=fit_prior,
                                 class_prior=class_prior)

    def fit(self, X, y=None):
        """Create a fresh model, fit it and return ``self``.

        ``y`` is only forwarded to the model when it is not ``None``.
        """
        model = SKLModel(**self._hyperparams)
        self._sklearn_model = model
        fit_args = (X,) if y is None else (X, y)
        model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's class probabilities for ``X``."""
        return self._sklearn_model.predict_proba(X)
def predict(self, item, **kwargs):
    """Predict the response ("A" or "B") for ``item``.

    Two estimates of the probability of "B" are blended with weight
    ``w``: a frequency estimate from ``self.mfa`` and the prediction of a
    Bernoulli NB model fitted on ``self.cond_data``.

    NOTE(review): ``b`` and ``w`` are not defined in this function; they
    are presumably module-level names - confirm.

    Returns:
        str: "A" if the blended probability is below 0.5, else "B".
    """
    seq = item.sequence_number
    tsk = item.task
    x = self.cond_data[0]
    y = self.cond_data[seq + 1]

    # Task info: 1 wherever the task element equals `b`, else 0.
    enc_vars = [1 if element == b else 0 for element in tsk]

    # System 1: relative frequency taken from the mfa counts.
    counts = self.mfa[seq][1]
    p_sys1 = counts[1] / (counts[0] + counts[1])

    # System 2: conditional model.
    cond_model = BernoulliNB()
    cond_model.fit(x, y)
    p_sys2 = cond_model.predict_proba([enc_vars])[0][1]

    # Apply weighting.
    p_val = w * p_sys1 + (1 - w) * p_sys2
    return "A" if p_val < 0.5 else "B"
class bernoullinbreadmeonly(ClassificationModule):
    """A Bernoulli Naive Bayes classifier working on the readme text only."""

    def __init__(self, text_corpus):
        """Fit the text vectorizer on ``text_corpus`` and create the model."""
        ClassificationModule.__init__(self,
                                      "Readme Only Bernoulli Naive Bayes",
                                      "A Bernoulli Naive Bayes-Classifier")
        # Create vectorizer and fit on all available descriptions.
        self.vectorizer = getTextVectorizer(9000)  # maximum number of columns
        corpus = [process_text(description) for description in text_corpus]
        self.vectorizer.fit(corpus)
        self.clf = BernoulliNB()
        print("\t-", self.name)

    def resetAllTraining(self):
        """Reset classification module to status before training."""
        self.clf = sklearn.base.clone(self.clf)
        # The clone is unfitted; without this reset the predict methods
        # would call an unfitted estimator.
        self.isTrained = False

    def trainOnSample(self, sample, shuffle=True, verbose=True):
        """Train on a single sample.

        NOTE(review): the original docstring describes this as incremental
        training, but ``fit`` replaces any earlier training - confirm
        whether ``partial_fit`` was intended.
        """
        readme_vec = self.formatInputData(sample)
        label_index = getLabelIndex(sample)
        result = self.clf.fit(readme_vec, np.expand_dims(label_index, axis=0))
        # Mark as trained, consistent with train().
        self.isTrained = True
        return result

    def train(self, samples, shuffle=True, verbose=True):
        """Train on a list of samples.

        Further parameters (nb_epoch, learning_rate, ...) may be needed.
        """
        train_samples = []
        train_lables = []
        for sample in samples:
            train_samples.append(self.formatInputData(sample)[0].tolist())
            train_lables.append(getLabelIndex(sample))
        train_result = self.clf.fit(train_samples, np.asarray(train_lables))
        self.isTrained = True
        return train_result

    def predictLabel(self, sample):
        """Return the label the classifier would assign to ``sample``.

        Returns ``0`` while the module is untrained.
        """
        if not self.isTrained:
            return 0
        sample = self.formatInputData(sample)
        return self.clf.predict(sample)[0]

    def predictLabelAndProbability(self, sample):
        """Return ``[best_label_index, p_0, p_1, ...]`` for ``sample``.

        Returns a list of eight zeros while the module is untrained.
        """
        if not self.isTrained:
            return [0, 0, 0, 0, 0, 0, 0, 0]
        sample = self.formatInputData(sample)
        prediction = self.clf.predict_proba(sample)[0]
        return [np.argmax(prediction)] + list(prediction)

    def formatInputData(self, sample):
        """Extract the readme of ``sample`` and transform it to a vector."""
        sd = getReadme(sample)
        # Returns a numpy array containing one array of features.
        return self.vectorizer.transform([sd]).toarray()
class Agent_skatkar(Agent):
    """Agent that picks the product with the best expected profit.

    A Bernoulli naive Bayes model is refitted on the agent's known
    products on every call. Other estimators tried before: logistic
    regression, Gaussian NB, k-NN, SVC, decision tree, random forest and
    AdaBoost.
    """

    def __init__(self, name, seed=0):
        """Create the agent; ``seed`` is accepted but not used."""
        super(Agent_skatkar, self).__init__(name)
        self.clf = BernoulliNB()

    def choose_one_product(self, products):
        """Return the index of the product with the highest expected profit.

        Expected profit is ``P(second class) * (value - price)``. Index 0
        is returned when no product has a positive expected profit or
        ``products`` is empty.
        """
        XX = [product.features for product in self.my_products]
        yy = list(self.product_labels)
        self.clf.fit(XX, yy)

        best_index = 0
        best_profit = 0
        for i, product in enumerate(products):
            # predict_proba needs a 2-D input, so wrap the single feature
            # vector; a bare 1-D sample is rejected by current scikit-learn.
            prob = self.clf.predict_proba([product.features])[0][1]
            profit = prob * (product.value - product.price)
            if profit > best_profit:
                best_profit = profit
                best_index = i
        return best_index
def run(self):
    """Fit Bernoulli NB on a 75% stratified sample and score the tournament data.

    Reads the training and tournament CSV files from the task inputs,
    fits the model on the sampled training rows, and writes the
    ``t_id`` and ``probability`` columns (second ``predict_proba``
    column) to the task output.
    """
    inputs = self.input()
    target = self.output()
    test_df = pd.read_csv(inputs['training_data.csv'].path)
    pred_df = pd.read_csv(inputs['tournament_data.csv'].path)

    # Only the training indices are used; the hold-out part is discarded.
    training_indices, _ = train_test_split(
        test_df.index,
        stratify=test_df['target'].values,
        train_size=0.75,
        test_size=0.25)

    result1 = test_df.copy()
    train_features = result1.loc[training_indices].drop('target', axis=1)
    train_target = result1.loc[training_indices, 'target']

    # Perform classification with a BernoulliNB classifier.
    bnb1 = BernoulliNB(alpha=0.64, binarize=0.23)
    bnb1.fit(train_features.values, train_target.values)

    # Perform prediction on everything except the id column.
    tournament_features = pred_df.drop('t_id', axis=1)
    probabilities = bnb1.predict_proba(tournament_features)
    pred_df['probability'] = probabilities[:, 1]
    pred_df.to_csv(target.path, columns=('t_id', 'probability'), index=None)
class NaiveBayesClassifierBernoulli:
    """
    Encapsulates the Bernoulli naive Bayes functions of scikit-learn's
    BernoulliNB class for sentence classification.
    """

    def __init__(self, matrixFileName = matrixFilePath, dicFileName = dictFilePath):
        """Load the training matrix and dictionary, then fit the model.

        Args:
            matrixFileName: path of the svmlight training matrix.
            dicFileName: path of the pickled dictionary.
        """
        self.X, self.Y = load_svmlight_file(matrixFileName)
        # NOTE: unpickling can execute arbitrary code - only load
        # dictionary files from a trusted source.
        with open(dicFileName, "rb") as dic_file:
            self.dictionary = pickle.load(dic_file)
        self.bernoulliNB = BernoulliNB()
        self.bernoulliNB.fit(self.X, self.Y)
        self.matrixParser = Parser.MatrixParserForLearning()

    def _row_for(self, string):
        """Return the feature row for ``string`` (``None`` if the parser gives none)."""
        return self.matrixParser.getRowForClassify(string, self.dictionary)

    def classifyOneSentence(self, string):
        """Return the predicted label array for ``string``, or ``None``."""
        row = self._row_for(string)
        # `is None`, not `!= None`: the row may be an array or sparse
        # matrix, for which `!=` compares element-wise.
        if row is None:
            return None
        return self.bernoulliNB.predict(row)

    def classifyOneSentenceWithProbability(self, string):
        """Return P(second class) - P(first class) for ``string``, or ``None``."""
        row = self._row_for(string)
        if row is None:
            return None
        proba = self.bernoulliNB.predict_proba(row)
        return proba[0][1] - proba[0][0]
class Model(object):
    """Text classifier: TF-IDF features fed into a Bernoulli naive Bayes."""

    def __init__(self):
        """Create the (unfitted) classifier and vectorizer."""
        # A GradientBoostingClassifier was used here previously.
        self.model = BernoulliNB(alpha=1)
        self.tfidf = TfidfVectorizer(max_df=1.0,
                                     min_df=1,
                                     stop_words='english',
                                     lowercase=True)

    def fit(self, X, y):
        """Fit vectorizer and classifier on raw text, then pickle ``self``.

        Args:
            X: iterable of raw text documents.
            y: labels.

        Returns:
            Model: ``self``. The fitted object is also written to
            'data/model.pkl'.
        """
        features = self.tfidf.fit_transform(X)
        self.model.fit(features, y)
        filename = 'data/model.pkl'
        # Context manager so the file is flushed and closed even if
        # pickling fails.
        with open(filename, 'wb') as model_file:
            pickle.dump(self, model_file)
        return self

    def predict(self, X):
        """Return predicted labels for raw text ``X``."""
        return self.model.predict(self.tfidf.transform(X))

    def predict_proba(self, X):
        """Return class probabilities for raw text ``X``."""
        return self.model.predict_proba(self.tfidf.transform(X))

    def score(self, X, y):
        """Return the classifier's score for raw text ``X`` and labels ``y``."""
        return self.model.score(self.tfidf.transform(X), y)
def NB(x_train, x_test, y_train, y):
    """Print the accuracy of a combined Gaussian + Bernoulli naive Bayes.

    Columns 0-10 are modelled with ``GaussianNB`` and columns 11-29 with
    ``BernoulliNB``; the two probability matrices are multiplied
    element-wise and the most probable class is predicted.

    Args:
        x_train: training features (DataFrame, at least 30 columns).
        x_test: test features with the same column layout.
        y_train: training labels (DataFrame or Series).
        y: true test labels, in the same row order as ``x_test``.
    """
    gau = GaussianNB()
    ber = BernoulliNB()
    labels = y_train.values.ravel()

    gau.fit(x_train.iloc[:, 0:11], labels)
    ber.fit(x_train.iloc[:, 11:30], labels)

    probs = np.multiply(gau.predict_proba(x_test.iloc[:, 0:11]),
                        ber.predict_proba(x_test.iloc[:, 11:30]))

    # Map the winning column to its class label. The original assumed the
    # labels were exactly 1..k by using `argmax + 1`.
    preds = gau.classes_[np.argmax(probs, axis=1)]

    # Compare positionally; indexing `y[i]` would be a label lookup on
    # pandas input.
    truth = np.asarray(y).ravel()
    acc = float(np.mean(preds == truth))
    print("NB Accuracy: ", acc)
def _test_bernoulinb_classifer(self,
                               num_classes,
                               alpha=1.0,
                               binarize=None,
                               fit_prior=False,
                               class_prior=None,
                               labels_shift=0,
                               backend="torch"):
    """Convert a fitted BernoulliNB and compare its probabilities.

    Random data is generated with a fixed seed: 0/1 integers when
    ``binarize`` is ``None``, uniform floats otherwise. The converted
    model's ``predict_proba`` must match scikit-learn's within tolerance.
    """
    model = BernoulliNB(alpha=alpha,
                        binarize=binarize,
                        fit_prior=fit_prior,
                        class_prior=class_prior)
    np.random.seed(0)
    shape = (100, 200)
    if binarize is not None:
        raw = np.random.rand(*shape)
    else:
        # Without binarisation the model needs already-binary features.
        raw = np.random.randint(2, size=shape)
    X = np.array(raw, dtype=np.float32)
    y = np.random.randint(num_classes, size=100) + labels_shift

    model.fit(X, y)
    torch_model = hummingbird.ml.convert(model, backend, X)
    self.assertTrue(torch_model is not None)

    expected = model.predict_proba(X)
    actual = torch_model.predict_proba(X)
    np.testing.assert_allclose(expected, actual, rtol=1e-6, atol=1e-5)
class Dota_NB:
    """Bernoulli naive Bayes over one-hot encoded hero picks.

    Training data is read from 'X.csv': columns 0-4 and 5-9 hold the
    1-based hero ids of the two sides, column 10 the label. Each match
    becomes a boolean vector of length ``2 * hero_num``.
    """

    def __init__(self):
        """Load the hero names and training data, then fit the model."""
        # Context manager so the file is closed even if parsing fails.
        with open("id_name.json", "r") as fo:
            self.id_name_dic = json.load(fo)
        # Largest hero id (0 when the mapping is empty).
        self.hero_num = max([0] + [int(i) for i in self.id_name_dic])

        dataset = pd.read_csv("X.csv", header=None).values
        X_0 = dataset[:, 0:10]
        self.Y = dataset[:, 10]
        self.X = np.zeros([X_0.shape[0], self.hero_num * 2], dtype=np.bool_)

        # One-hot encode all matches at once: the first side uses columns
        # [0, hero_num), the second side [hero_num, 2 * hero_num).
        rows = np.arange(X_0.shape[0])[:, None]
        self.X[rows, X_0[:, 0:5] - 1] = True
        self.X[rows, X_0[:, 5:10] - 1 + self.hero_num] = True

        self.clf = BernoulliNB()
        self.clf.fit(self.X, self.Y)

    def predict_one(self, X_test):
        """Return the predicted label for one encoded match."""
        return self.clf.predict([X_test])[0]

    def predict_one_proba(self, X_test):
        """Return the highest class probability for one encoded match."""
        return max(self.clf.predict_proba([X_test])[0])

    def predict_many(self, X_test):
        """Return the predicted labels for several encoded matches."""
        return self.clf.predict(X_test)
def main(X_data, y_data, test_size):
    """Collect the winning-class probabilities of right and wrong predictions.

    Splits the data, fits a Bernoulli naive Bayes classifier and, for each
    test sample, records the highest predicted class probability.

    Args:
        X_data: sparse feature matrix.
        y_data: labels.
        test_size: share of the data held out for testing.

    Returns:
        tuple: ``(correct_hist, error_hist)`` - the maximum probabilities
        of the correctly and of the incorrectly classified test samples.
    """
    # NOTE(review): the `cross_validation` module was removed from
    # scikit-learn in favour of `model_selection` - confirm what the file
    # imports under this name.
    X_train, X_test, label_train, label_test = cross_validation.train_test_split(
        X_data, y_data, test_size=test_size)
    X_train = X_train.toarray()

    # Create the classifier.
    gnb = BernoulliNB()
    gnb.fit(X_train, label_train)

    # Classifier predictions.
    label_pred = gnb.predict(X_test)
    probs = gnb.predict_proba(X_test)

    correct_hist = []
    error_hist = []
    # zip pairs the rows positionally, which also works when the labels
    # are a pandas Series with a shuffled index.
    for predicted, actual, sample_probs in zip(label_pred, label_test, probs):
        max_prob = max(sample_probs)
        if predicted == actual:
            correct_hist.append(max_prob)
        else:
            error_hist.append(max_prob)
    return correct_hist, error_hist
def test_predict():
    """The pipeline's NB intent classifier must agree with sklearn's.

    Trains the NLU pipeline, featurizes the training examples, fits an
    equivalent ``BernoulliNB`` on the same data and compares both the
    predicted probabilities and the predicted labels.
    """
    config_path = "tests/configs/sparse-naive-bayes-intent-classifier-config.yml"
    model_path = train_nlu(
        nlu_data=NLU_DATA_PATH,
        config=config_path,
        output="models",
    )
    interpreter = load_interpreter(model_path)

    # Run the examples through the pipeline's featurizers and put the
    # data into the format sklearn expects.
    training_data = load_data(NLU_DATA_PATH)
    for example in training_data.intent_examples:
        interpreter.featurize_message(example)
    # The classifier under test is the last pipeline component.
    model = interpreter.interpreter.pipeline[-1]
    X, y = model.prepare_data(training_data)

    # Fit the equivalent sklearn classifier.
    from sklearn.naive_bayes import BernoulliNB
    reference = BernoulliNB(alpha=0.1, binarize=0.0, fit_prior=True)
    reference.fit(X, y)

    # Check that predictions agree.
    same_probabilities = reference.predict_proba(X) == model.predict_prob(X)
    assert same_probabilities.all()
    same_labels = reference.predict(X) == model.predict(X)[0][:, 0]
    assert same_labels.all()
def run(X_train, y_train, X_test, seed):
    """Fit Bernoulli NB (alpha=0.9) and score ``X_test``.

    Args:
        X_train: training features.
        y_train: training labels.
        X_test: features to score.
        seed: accepted but not used.

    Returns:
        The second column of ``predict_proba`` for ``X_test``.
    """
    model = BernoulliNB(alpha=0.9)
    model.fit(X_train, y_train)
    return model.predict_proba(X_test)[:, 1]
def score(train_X, train_y):
    """Return the log-loss of Bernoulli NB on a 1% hold-out split.

    The split uses a fixed ``random_state`` of 10.
    """
    X_train, X_valid, y_train, y_valid = train_test_split(
        train_X, train_y, test_size=0.01, random_state=10)

    model = BernoulliNB(binarize=False, fit_prior=True, alpha=0.7)
    model.fit(X_train, y_train)
    valid_proba = model.predict_proba(X_valid)
    return log_loss(y_valid, valid_proba)
def bnb_model(self):
    """Fit Bernoulli NB, save it and return class-1 test probabilities.

    Returns:
        list: the predicted probability of class ``1`` for each row of
        ``self.test_x``.
    """
    model = BernoulliNB()
    model.fit(self.train_x, self.train_label)

    # Position of label 1 among the fitted classes.
    positive_col = list(model.classes_).index(1)
    predict_y = [row[positive_col] for row in model.predict_proba(self.test_x)]

    self.save_model(model, path="../../data/results/Bayes_model.pk")
    logger.info('bnb_model finished ...')
    return predict_y
def ewh_hsi(rs):
    """Relate the daily direction of ^DJI to the next HSI open direction.

    Downloads price history for both indices, turns each into up/down
    flags, pairs every ^DJI record with the HSI record of the following
    trade date (as given by ``rs.after``), fits a Bernoulli naive Bayes
    on the pairs and prints its predictions and probabilities.

    Args:
        rs: object whose ``after(datetime)`` returns the next trade date.
    """

    def daily_change(code, frdate, todate, base, numerator):
        """Return ``(date, up_flag, numerator_value, base_value)`` tuples.

        Each data row from the third on is compared with the row before
        it in the returned list. ``up_flag`` is 1 when the row's
        ``numerator`` column exceeds the preceding row's ``base`` column.
        NOTE(review): presumably the first row is a header - confirm.
        """
        e0 = yq.get_historical_prices(code, frdate, todate)
        print(e0)
        e1 = e0[1:]
        e2 = e0[2:]
        return [
            (num_row[0],
             1 if (float(num_row[numerator]) - float(base_row[base])) /
             float(base_row[base]) > 0 else 0,
             num_row[numerator],
             base_row[base])
            for base_row, num_row in zip(e1, e2)
        ]

    def next_trade_date(record):
        """Return the trade date following the record's date, as a string."""
        return datetime2ystr(rs.after(ystr2datetime(record[0])))

    idx = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Clos']
    EWH = daily_change('^DJI', '20150901', '20160330', idx.index('Adj Clos'),
                       idx.index('Adj Clos'))
    # 1 if it opens high and 0 otherwise.
    HSI = daily_change('^HSI', '20150901', '20160330', idx.index('Open'),
                       idx.index('Adj Clos'))
    print(len(EWH), ''.join('%s,' % x[0] for x in EWH))
    print(len(HSI), ''.join('%s,' % x[0] for x in HSI))

    # Real lists are required below: len() and .index() do not work on
    # the lazy map/filter objects of Python 3.
    HSI_dates = [x[0] for x in HSI]
    # Keep only records that have a matching next-trade-date record in
    # HSI; e.g. for trade date 2016-02-29 the HSI record is 2016-03-01.
    EWH_filtered = [x for x in EWH if next_trade_date(x) in HSI_dates]
    print(len(EWH_filtered), EWH_filtered)

    hsi_ewh = [(HSI[HSI_dates.index(next_trade_date(x))][1], x[1])
               for x in EWH_filtered]
    xx = np.array([[pair[1], 0] for pair in hsi_ewh])
    yy = np.array([pair[0] for pair in hsi_ewh])

    model = BernoulliNB()
    model.fit(xx, yy)
    predicted = model.predict([[0, 0], [1, 0]])
    print(predicted)
    print(model.predict_proba([[0, 0], [1, 0]]))
    print(model.feature_count_)
class InstaNaiveBayesSybilRankerFactory:
    """Trains, stores and applies a Bernoulli NB ranker for Instagram accounts.

    The model is trained from a CSV file next to this module when the
    object is created, and the whole object is dumped to ``modelFilename``.
    """

    # Directory containing this source file.
    basePath = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    modelFilename = basePath + \
        '/modelfiles/InstaNaiveBayesSybilRanker.joblib'
    trainDataFilename = basePath + \
        '/traindata/insta_train_data.csv'

    def __init__(self):
        """Load the training data, fit the classifier and save the object."""
        print(pd.__path__)
        self.data = pd.read_csv(self.trainDataFilename,
                                sep=",",
                                encoding='latin1')
        # `axis` must be passed by keyword: the positional form was
        # removed in pandas 2.0.
        self.data = self.data.drop(['username'], axis=1)
        self.X = self.data.drop('bot', axis=1)
        self.Y = self.data['bot']

        # Create and train the classifier.
        self.gnb = BernoulliNB()
        self.gnb.fit(self.X, self.Y)

        # Save the whole factory, including the fitted model.
        jl.dump(self, self.modelFilename)

    def validate(self):
        """Print the mean 10-fold cross-validated accuracy on the training data."""
        bScores = cross_val_score(self.gnb, self.X, self.Y, cv=10)
        print('\tcrossvalidated accuracy after NaiveBayesSybilRanker: {}'
              .format(bScores.mean()))

    def getRank(self, nodeName):
        """Scrape ``nodeName`` and return a rank between 0 and 100.

        The rank is the predicted probability of the first fitted class,
        multiplied by 100.
        """
        args = dict(username=nodeName,
                    usernames=[],
                    login_user=SybilRanking.settings.insta_username,
                    login_pass=SybilRanking.settings.insta_password)
        scraper = InstagramScraper(**args)
        scraper.login()
        userData = scraper.scrapeUser(username=nodeName)
        # Drop the first scraped field, which is not a model feature.
        # NOTE(review): presumably the username - confirm.
        userData.pop(0)

        detectedClass = self.gnb.predict([userData])
        print("\tdetectedClass = ", detectedClass)
        predict_proba = self.gnb.predict_proba([userData])
        print("\tpredict_proba = ", predict_proba)
        return predict_proba[0][0] * 100
def bnb(X, y, Z, test_data):
    """Fit a BernoulliNB on (X, y) and write class-1 probabilities for Z.

    The probabilities are stored in a ``truth`` column indexed by
    ``test_data["enrollment_id"]`` and written to
    ``data\\result\\sixth_bnb.csv``.  Nothing is returned.
    """
    from sklearn.naive_bayes import BernoulliNB

    classifier = BernoulliNB()
    classifier.fit(X, y)
    positive_probs = classifier.predict_proba(Z)[:, 1]

    submission = pd.DataFrame({
        'enrollment_id': test_data["enrollment_id"],
        'truth': positive_probs,
    })
    submission = submission.set_index("enrollment_id")
    submission.to_csv('data\\result\\sixth_bnb.csv')
def test_alpha():
    """Check how BernoulliNB and MultinomialNB handle the alpha parameter.

    alpha=0 must emit a UserWarning and must not produce nan probabilities
    when p(x_i|y_j)=0 occurs; a negative alpha must raise ValueError.
    """
    X = np.array([[1, 0], [1, 1]])
    y = np.array([0, 1])

    # (estimator class, expected predict_proba(X) after fitting with alpha=0)
    zero_alpha_cases = [
        (BernoulliNB, np.array([[1, 0], [0, 1]])),
        (MultinomialNB, np.array([[2./3, 1./3], [0, 1]])),
    ]

    # Dense X: both partial_fit and fit warn, probabilities stay finite.
    for estimator_cls, expected in zero_alpha_cases:
        nb = estimator_cls(alpha=0.)
        assert_warns(UserWarning, nb.partial_fit, X, y, classes=[0, 1])
        assert_warns(UserWarning, nb.fit, X, y)
        assert_array_almost_equal(nb.predict_proba(X), expected)

    # Sparse X: same expectations through fit.
    X_sparse = scipy.sparse.csr_matrix(X)
    for estimator_cls, expected in zero_alpha_cases:
        nb = estimator_cls(alpha=0.)
        assert_warns(UserWarning, nb.fit, X_sparse, y)
        assert_array_almost_equal(nb.predict_proba(X_sparse), expected)

    # alpha < 0 is rejected by both fit and partial_fit.
    expected_msg = ('Smoothing parameter alpha = -1.0e-01. '
                    'alpha should be > 0.')
    fit_variants = (('fit', {}), ('partial_fit', {'classes': [0, 1]}))
    for method_name, extra_kwargs in fit_variants:
        for estimator_cls in (BernoulliNB, MultinomialNB):
            nb = estimator_cls(alpha=-0.1)
            assert_raise_message(ValueError, expected_msg,
                                 getattr(nb, method_name), X, y,
                                 **extra_kwargs)
def prob_couple_will_not_break_up(args):
    """Return the normalised probability of the first class for ``args``.

    A BernoulliNB (``binarize=None``) is fitted on the data returned by
    ``createObservations()`` and asked for the class probabilities of the
    observation built from ``args``.  The first probability is treated as
    "not broken up" and divided by the sum of both.
    """
    observations, classes = createObservations()
    features = np.array(observations)
    labels = np.array(classes)

    # make naive classifier
    naive = BernoulliNB(binarize=None)
    naive.fit(features, labels)

    sample = create_observations_from_dict(args)
    prob_not_broken_up, prob_broken_up = naive.predict_proba([sample])[0]
    total = prob_not_broken_up + prob_broken_up
    return prob_not_broken_up / total
def convertToNumeric(df):
    """Add a ``<col>_binarized`` probability column for each object column.

    Columns from the third one onwards are inspected.  Every column of
    dtype ``object`` is label-encoded in place via ``labelEncode`` and a
    BernoulliNB fitted on that single column against ``df['target']``;
    its class-1 probability is stored in ``<col>_binarized``.  ``df`` is
    modified in place and nothing is returned.
    """
    for col in df.columns[2:]:
        if df[col].dtype != 'object':
            continue

        print("Converting {0} to numerical data".format(col))
        labelEncode(df, col)

        encoded = df[[col]]
        nb = BernoulliNB()
        nb.fit(encoded, df['target'])
        df[col + "_binarized"] = nb.predict_proba(encoded)[:, 1]
class TextClassifier(object):
    """A text classifier model:
        - Vectorize the raw text into features.
        - Fit a naive bayes model to the resulting features.
    """

    def __init__(self):
        self._vectorizer = CountVectorizer()
        self._classifier = BernoulliNB()

    def fit(self, X, y):
        """Fit a text classifier model.

        Parameters
        ----------
        X: A numpy array or list of text fragments, to be used as predictors.
        y: A numpy array or python list of labels, to be used as responses.

        Returns
        -------
        self: The fit model object.
        """
        features = self._vectorizer.fit_transform(X, y)
        self._classifier.fit(features, y=y)
        return self

    def predict_proba(self, X):
        """Make probability predictions on new data."""
        features = self._vectorizer.transform(X)
        return self._classifier.predict_proba(features)

    def predict(self, X):
        """Make predictions on new data."""
        features = self._vectorizer.transform(X)
        return self._classifier.predict(features)

    def score(self, X, y):
        """Return a classification accuracy score on new data."""
        features = self._vectorizer.transform(X)
        return self._classifier.score(features, y)
def _iter_onehot_batches(sequences, dictionary, batch_size):
    """Yield ``(start, end, batch)`` with one-hot rows of sequences[start:end]."""
    total = len(sequences)
    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        batch = [data_process.seq2onehot(seq, dictionary)
                 for seq in sequences[start:end]]
        yield start, end, batch


def nb_onehot():
    """Train one BernoulliNB per label column and write a submission.

    For every column of ``Y_tr`` a classifier is trained incrementally on
    one-hot encoded batches, scored on the validation split with
    ``util.auc`` and applied to the test split.  The mean validation AUC
    is logged and the test predictions are passed to ``util.submission``.
    """
    X_tr, Y_tr, X_va, Y_va, dictionary, X_te, id_list = \
        util.create_or_load_data(freq_threshold=50)
    batch_size = 10000
    n_tags = Y_tr.shape[1]

    Y_te_pred_list = []
    sum_auc_va = 0.0
    for i in range(n_tags):
        nb = BernoulliNB()
        # Batches run up to len(X_tr); the previous bound of len(X_tr) - 1
        # silently dropped the last training sample.
        for start, end, batch in _iter_onehot_batches(
                X_tr, dictionary, batch_size):
            nb.partial_fit(batch, Y_tr[start:end, i], classes=[0, 1])
        logging.info("Finish training")

        Y_va_pred = []
        for _, _, batch in _iter_onehot_batches(X_va, dictionary, batch_size):
            Y_va_pred.extend(nb.predict_proba(batch))
        auc_va = util.auc(Y_va[:, i], Y_va_pred)
        logging.info("tag{}, valid auc: ".format(i) + str(auc_va))
        sum_auc_va += auc_va

        Y_te_pred = []
        for _, _, batch in _iter_onehot_batches(X_te, dictionary, batch_size):
            Y_te_pred.extend(nb.predict_proba(batch))
        Y_te_pred_list.append(Y_te_pred)

    logging.info("Avg auc: {}".format(sum_auc_va / n_tags))
    util.submission(Y_te_pred_list, id_list)
def ImpactData(train, test, exclude=()):
    """Replace object columns by a BernoulliNB class-1 probability.

    Every column from the third one onwards whose dtype is ``object`` and
    that is not listed in ``exclude`` is binarized with ``Binarize``; a
    BernoulliNB is fitted on the binarized train columns against
    ``train.target`` and its class-1 probability replaces the column in
    both frames.  The temporary binarized columns are dropped again.

    Args:
        train: Training frame with a ``target`` column.
        test: Test frame with the same feature columns.
        exclude: Column names to leave untouched.

    Returns:
        The ``(train, test)`` frames after conversion.
    """
    features = train.columns[2:]
    print(type(features))
    for col in features:
        if train[col].dtype != 'object' or col in exclude:
            continue

        print(col)
        train, binfeatures = Binarize(col, train)
        test, _ = Binarize(col, test, binfeatures)
        # Names of the temporary indicator columns for this feature.
        bin_cols = col + '_' + binfeatures

        nb = BernoulliNB()
        nb.fit(train[bin_cols].values, train.target.values)
        train[col] = nb.predict_proba(train[bin_cols].values)[:, 1]
        test[col] = nb.predict_proba(test[bin_cols].values)[:, 1]

        train.drop(bin_cols, inplace=True, axis=1)
        test.drop(bin_cols, inplace=True, axis=1)
        train[col] = train[col].astype(float)
        test[col] = test[col].astype(float)
    return train, test
def fit_model_8(self, lol=0.0, toWrite=False):
    """Fit a BernoulliNB on every CV split and print its log loss.

    Args:
        lol: Value passed as ``alpha`` to ``BernoulliNB``.
        toWrite: When true, pickle the model (as left by the last split)
            to ``model8/model.pkl``.
    """
    model = BernoulliNB(alpha=lol, binarize=0.0)
    for X_train, X_test, Y_train, Y_test in self.cv_data:
        model.fit(X_train, Y_train)
        pred = model.predict_proba(X_test)[:, 1]
        print("Model 8 score %f" % (logloss(Y_test, pred),))

    if toWrite:
        # Pickle data is binary: open in 'wb' and let ``with`` close the
        # file even if dumping fails.
        with open('model8/model.pkl', 'wb') as f2:
            pickle.dump(model, f2)
def naivebayesian(dataset, configFIUse, configFI, alpha, binerize):
    """Train a BernoulliNB on ``dataset`` and report its test performance.

    Args:
        dataset: Object exposing ``X_train``, ``X_test``, ``y_train``,
            ``y_test`` and a ``dprint`` logging method.
        configFIUse: When true, the columns in ``configFI`` are dropped
            from both feature frames before training.
        configFI: Column labels to exclude.
        alpha: Passed to ``BernoulliNB(alpha=...)``.
        binerize: Passed to ``BernoulliNB(binarize=...)``.

    Returns:
        Tuple ``(mislabeled message, ROC AUC, predict_proba output)``.
    """
    ds = dataset
    ds.dprint("Start Creating BernoulliNB Bayesian Network")

    if not configFIUse:
        X_train = ds.X_train
        X_test = ds.X_test
    else:
        ds.dprint("Excluding following columns: " + str(configFI))
        X_train = ds.X_train.drop(configFI, axis=1)
        X_test = ds.X_test.drop(configFI, axis=1)

    bnb = BernoulliNB(alpha=alpha, binarize=binerize)
    bnb.fit(X_train, ds.y_train)
    y_pred = bnb.predict(X_test)
    y_pred_proba = bnb.predict_proba(X_test)

    n_wrong = (ds.y_test != y_pred).sum()
    mislabeled = "Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], n_wrong)
    auc = roc_auc_score(ds.y_test, y_pred_proba[:, 1])
    return (mislabeled, auc, y_pred_proba)
def test_bnb():
    """
    Tests that BernoulliNB when alpha=1.0 gives the same values as
    those given for the toy example in Manning, Raghavan, and
    Schuetze's "Introduction to Information Retrieval" book:
    http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
    """
    # Training data points are:
    # Chinese Beijing Chinese (class: China)
    # Chinese Chinese Shanghai (class: China)
    # Chinese Macao (class: China)
    # Tokyo Japan Chinese (class: Japan)

    # Features are Beijing, Chinese, Japan, Macao, Shanghai, and Tokyo
    X = np.array([[1, 1, 0, 0, 0, 0],
                  [0, 1, 0, 0, 1, 0],
                  [0, 1, 0, 1, 0, 0],
                  [0, 1, 1, 0, 0, 1]])

    # Classes are China (0), Japan (1)
    Y = np.array([0, 0, 0, 1])

    # Fit BernoulliNB w/ alpha = 1.0
    clf = BernoulliNB(alpha=1.0)
    clf.fit(X, Y)

    # Check the class prior is correct
    class_prior = np.array([0.75, 0.25])
    assert_array_almost_equal(np.exp(clf.class_log_prior_), class_prior)

    # Check the feature probabilities are correct
    feature_prob = np.array([[0.4, 0.8, 0.2, 0.4, 0.4, 0.2],
                             [1/3.0, 2/3.0, 2/3.0, 1/3.0, 1/3.0, 2/3.0]])
    assert_array_almost_equal(np.exp(clf.feature_log_prob_), feature_prob)

    # Testing data point is:
    # Chinese Chinese Chinese Tokyo Japan
    # It is a single sample, so it is given as one row of a 2-D array:
    # predict_proba expects input of shape (n_samples, n_features).
    X_test = np.array([[0, 1, 1, 0, 0, 1]])

    # Check the predictive probabilities are correct
    unnorm_predict_proba = np.array([[0.005183999999999999,
                                      0.02194787379972565]])
    predict_proba = unnorm_predict_proba / np.sum(unnorm_predict_proba)
    assert_array_almost_equal(clf.predict_proba(X_test), predict_proba)
def get_numbers(spam_list, nonspam_list, test):
    """Return the spam probability of every document in ``test``.

    A TF-IDF vectorizer and a BernoulliNB are fitted on
    ``spam_list + nonspam_list`` (label 1 for spam, 0 for non-spam).

    Args:
        spam_list: List of spam training documents.
        nonspam_list: List of non-spam training documents.
        test: Iterable of documents to score.

    Returns:
        List with the class-1 probability for each test document, in order.
    """
    corpus = spam_list + nonspam_list
    print("Training data size: ", len(corpus))

    vectorizer = TfidfVectorizer(min_df=1)
    X = vectorizer.fit_transform(corpus).toarray()
    # One label per document: the zeros must be sized by nonspam_list,
    # otherwise labels and rows disagree whenever the lists differ in length.
    Y = [1] * len(spam_list) + [0] * len(nonspam_list)

    clf = BernoulliNB()
    clf.fit(X, Y)
    print("Data Fitted. Predicting reviews: ", len(test))

    return_list = [
        clf.predict_proba(vectorizer.transform([data]).toarray())[0][1]
        for data in test
    ]
    print("Returning data test size: ", len(return_list))
    return return_list
def naive_bayes(train, validation):
    """Fit a BernoulliNB on ``train``, print metrics and dump predictions.

    ``train`` is split 90/10 inside the function; the ``validation``
    argument is overwritten by that split and its incoming value is not
    used.  Relies on the module-level names ``start`` (timer start) and
    ``le_crime`` (fitted label encoder for ``Category``).

    Side effects: prints timing, precision, recall, accuracy and log loss,
    and writes the per-class probabilities plus the predicted label to
    ``naiveBayes_test.csv``.
    """
    # Feature columns.
    season = ['Fall', 'Spring', 'Summer', 'Winter']
    district = ['BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN',
                'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']
    day_part = ['first', 'second', 'third']
    features2 = list(range(0, 24))
    Minute = list(range(100, 160))
    features = district + day_part + Minute + season + features2

    # Split set into train, validation.
    train, validation = train_test_split(train, train_size=0.9)
    model = BernoulliNB()
    model.fit(train[features], train['Category'])

    # Time calculation.
    stop = timeit.default_timer()
    print("Runnin time naive bayes is ", stop - start)

    predicted = np.array(model.predict_proba(validation[features]))
    model1 = model.predict(validation[features])
    model2 = model.predict(train[features])
    y_valid = validation['Category'].values.tolist()

    print("-----------------------------Naive Bayes----------------------------------------------------------------------------")
    print("Precision is ", precision_score(y_valid, model1, average='macro'))
    print("Recall is ", recall_score(y_valid, model1, average='macro'))
    print("Accuracy is ", accuracy_score(y_valid, model1))
    print("Training Accuracy is ",
          accuracy_score(train['Category'].values.tolist(), model2))

    # Map the predicted (encoded) labels back to category names.
    Category_new = [le_crime.classes_[label] for label in model1]

    # Store result into file.
    result = pd.DataFrame(predicted, columns=le_crime.classes_)
    result['Predicted'] = Category_new
    result.to_csv('naiveBayes_test.csv', index=True, index_label='Id')

    # Log loss function.
    print("Log loss is", log_loss(validation['Category'], predicted,
                                  eps=1e-15, normalize=True,
                                  sample_weight=None))
def multinomialNB(rawX, rawY, rawXTesting, rawYTesting):
    """Train a BernoulliNB on the first two fields and evaluate it.

    Despite its name the function fits a ``BernoulliNB`` (alpha 0.0).
    Fields 0 and 1 of every ``rawX`` row are the features (relatedness and
    commonness); fields 2 and 3 (sense and word) are forwarded to
    ``evaluation.getPredictedY`` together with the class-1 probabilities
    predicted for ``rawXTesting``.

    NOTE(review): ``words`` and ``senses`` are taken from ``rawX`` (the
    training rows) while the probabilities belong to ``rawXTesting`` --
    confirm this is what ``evaluation.getPredictedY`` expects.

    Returns:
        Whatever ``evaluation.evaluationMetrics(sampleY, predictedY)`` returns.
    """
    train_features = np.array([[row[0], row[1]] for row in rawX])
    senses = [row[2] for row in rawX]  # which sense it comes from
    words = [row[3] for row in rawX]   # which word it comes from
    train_labels = np.array(rawY)

    clf = BernoulliNB(alpha=0.0, class_prior=None, fit_prior=True)
    clf.fit(train_features, train_labels)

    # This part needs to be changed to a sample
    test_features = np.array([[row[0], row[1]] for row in rawXTesting])
    test_labels = np.array(rawYTesting)

    probabilities = clf.predict_proba(test_features)
    predictedProb = [pair[1] for pair in probabilities]
    predictedY = evaluation.getPredictedY(
        words, senses, predictedProb, rawXTesting, rawYTesting)
    return evaluation.evaluationMetrics(test_labels, predictedY)
class NaiveBayesClassifierBernoulli:
    """BernoulliNB sentence classifier trained from an svmlight matrix file."""

    def __init__(self, matrixFileName="/Users/chengyu/Documents/python/data/matrixForLearning", dicFileName="/Users/chengyu/Documents/python/data/dictionary"):
        """Load the training matrix and dictionary, then fit the model.

        Args:
            matrixFileName: Path of the svmlight-format training matrix.
            dicFileName: Path of the pickled dictionary used to build rows.
        """
        self.X, self.Y = load_svmlight_file(matrixFileName)
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # trusted dictionary files.
        with open(dicFileName, "rb") as dic_file:
            self.dictionary = pickle.load(dic_file)
        self.bernoulliNB = BernoulliNB()
        self.bernoulliNB.fit(self.X, self.Y)
        self.matrixParser = Parser.MatrixParserForLearning()

    def classifyOneSentence(self, string):
        """Return the predicted class for ``string``, or None without a row."""
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        # Identity test: ``row != None`` is evaluated elementwise when the
        # row is an array and cannot be used as a condition.
        if row is None:
            return None
        return self.bernoulliNB.predict(row)

    def classifyOneSentenceWithProbability(self, string):
        """Return P(second class) - P(first class), or None without a row."""
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        if row is None:
            return None
        proba = self.bernoulliNB.predict_proba(row)
        return proba[0][1] - proba[0][0]
class Agent_skatkar(Agent):
    """Agent that picks the product with the best expected gain.

    A BernoulliNB is refitted on the agent's own labelled products on
    every call to ``choose_one_product``.
    """

    def __init__(self, name, seed=0):
        """Create the agent; ``seed`` is accepted but not used here."""
        super(Agent_skatkar, self).__init__(name)
        self.clf = BernoulliNB()

    def choose_one_product(self, products):
        """Return the index of the product maximising ``p * (value - price)``.

        ``p`` is the classifier's class-1 probability for the product's
        features.  Index 0 is returned when no product has a strictly
        positive expected gain (or ``products`` is empty).
        """
        XX = [product.features for product in self.my_products]
        yy = list(self.product_labels)
        self.clf.fit(XX, yy)

        best_gain = 0
        best_index = 0
        for index, product in enumerate(products):
            # One sample must be passed as a single-row 2-D input.
            prob = self.clf.predict_proba([product.features])[0][1]
            gain = prob * (product.value - product.price)
            if gain > best_gain:
                best_gain = gain
                best_index = index
        return best_index
# Load the data (Assumes your current working directory is the Classify Job Titles problem directory) job_titles = pd.read_csv("_Data/jobtitles.csv") # Convert the categories Technology, Sales, and Finance to numbers 0, 1, and 2 y = list(map(lambda x: {'finance':0, 'sales':1, 'technology':2}[x], job_titles.job_category[:10])) # Buid an MXN matrix where M is the number of samples, N is the number of unique words in the corpus (subject to parameters) and element [i,j] is the # whether or not sample i contains word j count_vectorizer = CountVectorizer() count_vectorizer.fit(job_titles.job_title) X = count_vectorizer.transform(job_titles.job_title[:10]) # Dump results into a pandas DataFrame since this is a small example for illustrative purposes df = pd.DataFrame(X.toarray(), columns=count_vectorizer.get_feature_names()) df # Now consider a new title X_new = count_vectorizer.transform(job_titles.job_title[10:12]) df_new = pd.DataFrame(X_new.toarray(), columns=count_vectorizer.get_feature_names()) df_new # Check our results with scikit-learn's Bernoulli Naive Bayes classifier naive_bayes = BernoulliNB(alpha=0.000000001) # make alpha virtually 0 naive_bayes.fit(X=df, y=y) naive_bayes.predict_proba(X=df_new) # Re-run with alpha = 1 naive_bayes = BernoulliNB(alpha=1) # make alpha 1 naive_bayes.fit(X=df, y=y) naive_bayes.predict_proba(X=df_new)
# Training file (sys.argv[1]): one sample per line, whitespace separated:
#   <label-name> <integer class> <feature> <feature> ...
# NOTE(review): X is appended to below but is not initialised in this part of
# the script; it is presumably set to [] just before -- confirm.
Y = []
L = []
with open(sys.argv[1]) as trainf:
    for l in trainf:
        sl = l.strip().split()
        L.append(sl[0])
        Y.append(int(sl[1]))
        xx = [float(value) for value in sl[2:]]
        X.append(xx)

# Fit on the training rows and report the ROC AUC on those same rows.
clf = BernoulliNB()
clf = clf.fit(X, Y)
print(clf.feature_log_prob_)
Yp = clf.predict_proba(X)
YY = [p[1] for p in Yp]
print(roc_auc_score(Y, YY))

# Pickle data is binary, so the file is opened in 'wb'; ``with`` closes it
# even if dumping fails.
with open('clfNB.pkl', 'wb') as pf:
    pickle.dump(clf, pf)

# Test file (sys.argv[2]): <label-name> <feature> <feature> ... (no class column).
X = []
L = []
with open(sys.argv[2]) as testf:
    for l in testf:
        sl = l.strip().split()
        L.append(sl[0])
        xx = [float(value) for value in sl[1:]]
        X.append(xx)
# Scatter plot of the reduced data, coloured by label.
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, s=50)
ax.set_title("PCA reduction (2d) of transformed data (%dd)" % X_transformed.shape[1])
ax.set_xticks(())
ax.set_yticks(())

# Plot the decision in original space. For that, we will assign a color to each
# point in the mesh [x_min, x_max] x [y_min, y_max].
h = .01  # mesh step size
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# transform grid using RandomTreesEmbedding
# (`hasher` and `nb` are defined earlier in the script, outside this excerpt)
transformed_grid = hasher.transform(np.c_[xx.ravel(), yy.ravel()])
y_grid_pred = nb.predict_proba(transformed_grid)[:, 1]

ax = pl.subplot(223)
ax.set_title("Naive Bayes on Transformed data")
ax.pcolormesh(xx, yy, y_grid_pred.reshape(xx.shape))
ax.scatter(X[:, 0], X[:, 1], c=y, s=50)
ax.set_ylim(-1.4, 1.4)
ax.set_xlim(-1.4, 1.4)
ax.set_xticks(())
ax.set_yticks(())

# transform grid using ExtraTreesClassifier
# (`trees` predicts directly on the untransformed mesh points)
y_grid_pred = trees.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

ax = pl.subplot(224)
ax.set_title("ExtraTrees predictions")
Danger_Date_feature, Not_Danger_Date_feature, #District_Crime_feature, District_Other_feature ] features = [str(x[1]) for x in feature_list if x[0]] # # Trainding and Validation # In[10]: training, validation = train_test_split(train_df_new, train_size=.60) #Score to beat 2.55250 model = BernoulliNB() model.fit(training[features], training['dummy_Category']) predicted = np.array(model.predict_proba(validation[features])) log_loss(validation['dummy_Category'], predicted) # In[ ]: from xgboost import XGBClassifier model = XGBClassifier(max_depth=8,n_estimators=128) model.fit(training[features], training['dummy_Category']) predicted = np.array(model.predict_proba(validation[features])) log_loss(validation['dummy_Category'], predicted) # In[ ]: from sklearn.ensemble import RandomForestClassifier
# Build one 44-column indicator row per test record.
finalPredictList = np.empty([len(testData),44], dtype=int)
counter = 0
for testList in testData:
    # Fields 2 and 3 are concatenated into the key looked up in `pair`.
    oriDestPair = testList[2] + testList[3]
    paxCount = int(testList[-2])
    tempXList = [0] * 44
    tempXList[pair[oriDestPair]] = 1
    # Column 43 flags paxCount > 1, column 42 flags the opposite case.
    if paxCount > 1:
        tempXList[43] = 1
    else:
        tempXList[42] = 1
    finalPredictList[counter] = tempXList
    counter += 1

predicted = model.predict_proba(finalPredictList)
predictedValue = model.predict(finalPredictList)

#Result Array
# Expected class per record, mapped from the last field through `fareClass`.
result = np.empty(len(testData),dtype=int)
ctr = 0
for tempList in testData:
    result[ctr] = fareClass[tempList[-1]]
    ctr += 1

# NOTE(review): normalize=False makes accuracy_score return the NUMBER of
# correct predictions, not a fraction, although the label says "Accuracy".
print("Accuracy : ", Metrices.accuracy_score(result, predictedValue, normalize=False))

#Writing to a CSV file
csvFileHandle = open("data\\NBBernoulli.csv",'w', newline='')
with csvFileHandle:
    csvWriter = csv.writer(csvFileHandle)
# list with active/inactive info ys_fit = [1]*num_actives + [0]*(len(training_list)-num_actives) # training fps train_fps = np_fps_act + [np_fps_dcy[i] for i in training_list[num_actives:]] # fit Naive Bayes ml.fit(train_fps, ys_fit) # test fps and molecule info test_fps = [np_fps_div_act[i] for i in test_list[:num_test_actives]] test_fps += [np_fps_dcy[i] for i in test_list[num_test_actives:]] test_mols = [[div_actives[i][0], 1] for i in test_list[:num_test_actives]] test_mols += [[decoys[i][0], 0] for i in test_list[num_test_actives:]] # rank based on probability single_score = ml.predict_proba(test_fps) # store: [probability, internal ID, active/inactive] single_score = [[s[1], m[0], m[1]] for s,m in zip(single_score, test_mols)] single_score.sort(reverse=True) scores['nb_'+fp_build].append(single_score) # write scores to file if do_append: outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'ab+') # binary format else: outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'wb+') # binary format for fp in ['nb_'+fp_build]: cPickle.dump([fp, scores[fp]], outfile, 2) outfile.close() print "scoring done and scored lists written"
#collect eigenvectors that correspond to 95% variance ninetyfiveIndex = np.nonzero(cum_var_exp > 0.95)[0][0] pca = PCA(n_components = ninetyfiveIndex) pca.fit(x_train_std) x_train_pca = pca.transform(x_train_std) x_test_pca = pca.transform(x_test_std) #Run a Bernoulli Naive Bayes classifier on the data #While we use the predict_proba method here, Naive Bayes is notorious #as a poor predictor, so it's likely that these probabilities will be #way off #In addition, Bernoulli Naive Bayes doesn't jive well with the fact that #age is not a binary variable. It will be binarized, though this is less #harmful than it may seem, as the cutoff of a single year of age seems #to be an important factor in determining outcome bnb = BernoulliNB() bnb.fit(x_train_pca, y_train) y_pred = bnb.predict_proba(x_test_pca) #Label data appropriately and export to csv data = {'Return_to_owner': y_pred[:,0],\ 'Euthanasia': y_pred[:,1],\ 'Died': y_pred[:,2], \ 'Adoption': y_pred[:,3],\ 'Transfer': y_pred[:, 4]} results = pd.DataFrame(data = data) results.to_csv("naive_bayes_results.csv")
def BernoulliNBClassify_Proba(enrollment_id, trainData, trainLabel, testData):
    """Fit a BernoulliNB and return the class-1 probability of each test row.

    The training labels are flattened with ``ravel`` before fitting.  The
    probabilities are also handed to ``saveResult`` together with
    ``enrollment_id`` and the file name
    ``Proba_sklearn_BernoulliNB_Result.csv``.
    """
    classifier = BernoulliNB()
    flat_labels = ravel(trainLabel)
    classifier.fit(trainData, flat_labels)

    class_probabilities = classifier.predict_proba(testData)
    testLabel = class_probabilities[:, 1]
    saveResult(enrollment_id, testLabel,
               'Proba_sklearn_BernoulliNB_Result.csv')
    return testLabel
if test_Y[i] == 0 and preds[i] == 1: incorrect_uncensored += 1 print "Total Accuracy: " + str(correct/float(correct + incorrect)) print "Censored Accuracy: " + str(correct_censored/float(correct_censored + incorrect_censored)) print "Uncensored Accuracy: " + str(correct_uncensored/float(correct_uncensored + incorrect_uncensored)) print "Rankings: " # print clf.ranking_ print "Training Bernoulli Naive Bayes" clf = BernoulliNB(alpha=1.0) clf.fit(train_X, train_Y) preds = clf.predict(test_X) probs = clf.predict_proba(test_X) fpr_lr_nb, tpr_lr_nb, thresholds_lr_nb = metrics.roc_curve(test_Y, probs.T[1], pos_label=1) roc_auc_lr_nb = metrics.auc(fpr_lr_nb, tpr_lr_nb) print "AUC nb: " + str(roc_auc_lr_nb) correct = 0 incorrect = 0 correct_censored = 0 incorrect_censored = 0 correct_uncensored = 0 incorrect_uncensored = 0 for i in range(len(preds)):