def MungeData(train, test):

    todrop = ['v22', 'v112', 'v125', 'v74', 'v1', 'v110', 'v47']
    print(todrop)

    train.drop(todrop,
               axis=1, inplace=True)
    test.drop(todrop,
              axis=1, inplace=True)

    features = train.columns[2:]
    for col in features:
        if((train[col].dtype == 'object')):
            print(col)
            train, binfeatures = Binarize(col, train)
            test, _ = Binarize(col, test, binfeatures)
            nb = BernoulliNB()
            nb.fit(train[col+'_'+binfeatures].values, train.target.values)
            train[col] = \
                nb.predict_proba(train[col+'_'+binfeatures].values)[:, 1]
            test[col] = \
                nb.predict_proba(test[col+'_'+binfeatures].values)[:, 1]
            train.drop(col+'_'+binfeatures, inplace=True, axis=1)
            test.drop(col+'_'+binfeatures, inplace=True, axis=1)

    features = train.columns[2:]
    train[features] = train[features].astype(float)
    test[features] = test[features].astype(float)
    train.fillna(-1, inplace=True)
    test.fillna(-1, inplace=True)
    return train, test
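
Binarize is a project helper that these MungeData/ImpactData snippets call but never define. A minimal sketch of what it plausibly does (hypothetical reconstruction inferred from the call sites: one-hot expand a categorical column and return the category list so the test/validation frames reuse the categories learned on train):

import numpy as np

def Binarize(columnName, df, features=None):
    # Hypothetical helper: expand a categorical column into 0/1 indicator columns.
    df[columnName] = df[columnName].astype(str)
    if features is None:
        # Learn the category list from this frame (object dtype, so the
        # callers' col+'_'+binfeatures string concatenation broadcasts).
        features = np.unique(df[columnName].values)
    for x in features:
        df[columnName + '_' + x] = df[columnName].map(lambda y: 1 if y == x else 0)
    return df, features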
Example No. 2
def predict(cur, plyr_id, game_plyrs): 
  #creates training set (called 'X') for plyr
  all_plyrs = all_player_ids(cur) #np.array - all NFL players (and coaches)
  games = games_played_in(cur, plyr_id) #np.array - the games_ids the player played in
  n_cols = all_plyrs.shape[0] #int 
  m_rows = games.shape[0] #int
  zeros = np.zeros((m_rows, n_cols)) #2darr - used to initialize DF
  X = pd.DataFrame(zeros, index=games, columns=all_plyrs) #dataframe
  populate_training_set(cur, X, games, plyr_id)
  print "X: ", X.values
  
  
  #creates vector of known output values
  Y = training_output_vector(cur, games, plyr_id)
  print "(len) Y: ", len(Y), Y
  test_zeros = np.zeros((1, n_cols)) #2darr - used to initialize DF
  test_X = pd.DataFrame(test_zeros, columns=all_plyrs) #dataframe
  update_training_matrix(game_plyrs, 0, test_X)
  
  #run Bernoulli NB Classifier
  nb_clf = BernoulliNB()
  
  if len(X.values) == 0:
    return 0
  nb_clf.fit(X, Y)
  nb_predictions = nb_clf.predict(test_X)
  print "test_X: ", test_X.values
  nb_norm_prob = normalize_probs(nb_clf.predict_proba(test_X)[0])
  avgs = [1.5, 4.5, 7.5, 10.5, 13.5, 16.5, 19.5, 22.5, 25.5, 28.5, 31.5]
  print "param vector: ", nb_clf.predict_proba(test_X)[0]
  print "probs: ", nb_norm_prob
  print avgs
  ev = expected_val(nb_norm_prob, avgs) #can also calc dot product
  return round(ev, 1)
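
The helpers normalize_probs and expected_val used above are not shown in this listing. A plausible minimal sketch (my own assumption: the probability vector is re-scaled to sum to 1 and the expected value is the dot product against the bucket midpoints in avgs):

def normalize_probs(probs):
    # Hypothetical helper: rescale a probability vector so it sums to 1.
    total = float(sum(probs))
    return [p / total for p in probs] if total > 0 else list(probs)

def expected_val(probs, avgs):
    # Hypothetical helper: expected fantasy score as the dot product of the
    # class probabilities and the midpoints of the score buckets.
    return sum(p * a for p, a in zip(probs, avgs))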
Example No. 3
def MungeData(train, test, validation):

    features = train.columns[2:]
    print(type(features))
    for col in features:
        if((train[col].dtype == 'object') and (col!="v22")):
            print(col)
            train, binfeatures = Binarize(col, train)
            test, _ = Binarize(col, test, binfeatures)
            validation, _ = Binarize(col, validation, binfeatures)
            nb = BernoulliNB()
            nb.fit(train[col+'_'+binfeatures].values, train.target.values)
            train[col] = \
                nb.predict_proba(train[col+'_'+binfeatures].values)[:, 1]
            test[col] = \
                nb.predict_proba(test[col+'_'+binfeatures].values)[:, 1]
            validation[col] = \
                nb.predict_proba(validation[col+'_'+binfeatures].values)[:, 1]
            train.drop(col+'_'+binfeatures, inplace=True, axis=1)
            test.drop(col+'_'+binfeatures, inplace=True, axis=1)
            validation.drop(col+'_'+binfeatures, inplace=True, axis=1)
            train[col] = train[col].astype(float)
            test[col] = test[col].astype(float)
            validation[col] = validation[col].astype(float)
    return train, test, validation
Example No. 4
def BernoulliNB_pred(X_train, X_test, y_train):
    clf_NB = BernoulliNB()
    clf_NB.fit(X_train, y_train)

    # Converting back (a sklearn standardization function could be used for both encoding and decoding)
    predictions_train = clf_NB.predict_proba(X_train)
    predictions = clf_NB.predict_proba(X_test)

    return predictions[:, 1], predictions_train[:, 1]
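
A brief usage sketch for BernoulliNB_pred (hypothetical data names; assumes binary labels and scikit-learn available):

from sklearn.metrics import roc_auc_score

test_proba, train_proba = BernoulliNB_pred(X_train, X_test, y_train)
print('train AUC:', roc_auc_score(y_train, train_proba))
# Scoring the test predictions would additionally require y_test:
# print('test AUC:', roc_auc_score(y_test, test_proba))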
Example No. 5
def main():
  start_time = time.time()
  #read in game IDs
  games_data = pd.read_csv('games-data.csv')
  all_games = np.array(games_data['game_id'])
  all_plyrs = np.array(games_data['plyr_id'])
  uni_game_ids = np.unique(all_games)
  
  #read in player IDs
  player_data = pd.read_csv('players.csv')
  plyr_ids = np.unique(np.array(player_data['ID']))
  
  #read in fantasy scores
  fantasy_scores = pd.read_csv('fantasy_scores.csv')
  
  #gets player training matrix
  plyr_id = 8439
  X = create_training_set(plyr_id, games_data, plyr_ids)
  index = get_ninety_percent(len(np.array(X.index))) #for cross-validation
  train_X = X[:index]
  test_X = X[index:]
  
  #gets training output vector
  plyr_game_ids = np.array(train_X.index)
  scores = plyr_fantasy_pts(plyr_id, plyr_game_ids, fantasy_scores)
  Y = discretize(scores.values)
  train_Y = Y[:index]
  test_Y = Y[index:]
  
  #run Bernoulli NB Classifier
  nb_clf = BernoulliNB()
  nb_clf.fit(train_X, train_Y)
  nb_predictions = nb_clf.predict(test_X)
  
  #run Multinomial NB Classifier
  mn_clf = MultinomialNB()
  mn_clf.fit(train_X, train_Y)
  mn_predictions = mn_clf.predict(test_X)
  
  #test for game, fantasy score alignment  
  for i in xrange(test_Y.shape[0]):
    print plyr_game_ids[i], scores.values[i], test_Y[i], nb_predictions[i], mn_predictions[i]
  
  print "Bernoulli NB accuracy: ", nb_clf.score(test_X, test_Y)
  
  print "Bernoulli NB prob estimates: ", nb_clf.predict_proba(test_X)
  print "Multinomial NB accuracy: ", mn_clf.score(test_X, test_Y)
  print "Bernoulli NB prob estimates: ", mn_clf.predict_proba(test_X)
  print len(nb_clf.predict_proba(test_X)[0])
  nb_norm_prob = normalize_probs(nb_clf.predict_proba(test_X)[0])
  vals = [1.5, 4.5, 7.5, 10.5, 13.5, 16.5, 19.5, 22.5, 25.5, 28.5, 31.5]
  ev = expected_val(nb_norm_prob, vals)
  print "EV: ", ev
  end_time = time.time()
  print("Elapsed time was %g seconds" % (end_time - start_time))  
Example No. 6
def test_alpha():
    # Setting alpha=0 should not output nan results when p(x_i|y_j)=0 is a case
    X = np.array([[1, 0], [1, 1]])
    y = np.array([0, 1])
    nb = BernoulliNB(alpha=0.)
    assert_warns(UserWarning, nb.partial_fit, X, y, classes=[0, 1])
    assert_warns(UserWarning, nb.fit, X, y)
    prob = np.array([[1, 0], [0, 1]])
    assert_array_almost_equal(nb.predict_proba(X), prob)

    nb = MultinomialNB(alpha=0.)
    assert_warns(UserWarning, nb.partial_fit, X, y, classes=[0, 1])
    assert_warns(UserWarning, nb.fit, X, y)
    prob = np.array([[2. / 3, 1. / 3], [0, 1]])
    assert_array_almost_equal(nb.predict_proba(X), prob)

    # Test sparse X
    X = scipy.sparse.csr_matrix(X)
    nb = BernoulliNB(alpha=0.)
    assert_warns(UserWarning, nb.fit, X, y)
    prob = np.array([[1, 0], [0, 1]])
    assert_array_almost_equal(nb.predict_proba(X), prob)

    nb = MultinomialNB(alpha=0.)
    assert_warns(UserWarning, nb.fit, X, y)
    prob = np.array([[2. / 3, 1. / 3], [0, 1]])
    assert_array_almost_equal(nb.predict_proba(X), prob)

    # Test for alpha < 0
    X = np.array([[1, 0], [1, 1]])
    y = np.array([0, 1])
    expected_msg = ('Smoothing parameter alpha = -1.0e-01. '
                    'alpha should be > 0.')
    b_nb = BernoulliNB(alpha=-0.1)
    m_nb = MultinomialNB(alpha=-0.1)
    assert_raise_message(ValueError, expected_msg, b_nb.fit, X, y)
    assert_raise_message(ValueError, expected_msg, m_nb.fit, X, y)

    b_nb = BernoulliNB(alpha=-0.1)
    m_nb = MultinomialNB(alpha=-0.1)
    assert_raise_message(ValueError,
                         expected_msg,
                         b_nb.partial_fit,
                         X,
                         y,
                         classes=[0, 1])
    assert_raise_message(ValueError,
                         expected_msg,
                         m_nb.partial_fit,
                         X,
                         y,
                         classes=[0, 1])
Example No. 7
 def bayes_model(self):
     logger.info('Bayes_model beginning ...')
     classifier = BernoulliNB()
     classifier.fit(self.train_x, self.train_y)
     index = list(classifier.classes_).index(1)
     test_y_predict = pd.DataFrame(classifier.predict_proba(self.test_x),
                                   columns=list(classifier.classes_))
     test_y_predict[index] = test_y_predict[index].apply(
         lambda x: 0 if x <= 0.01 else x)
     predict_y = list(
         map(lambda x: x[index], classifier.predict_proba(self.predict_x)))
     data_results.save_model(obj=classifier,
                             path="../../data/results_2/bayes_model.pk")
     return test_y_predict, predict_y
Example No. 8
def model_comparison(train, validation, features):
    # Naive Bayes
    model = BernoulliNB()
    model.fit(train[features], train['crime'])
    predicted = np.array(model.predict_proba(validation[features]))
    print("BernoulliNB")
    print(log_loss(validation['crime'], predicted))

    # Logistic Regression for comparison
    model = LogisticRegression(C=.01)
    model.fit(train[features], train['crime'])
    predicted = np.array(model.predict_proba(validation[features]))
    print("LogisticRegression")
    print(log_loss(validation['crime'], predicted))
Example No. 9
    def estimate(self):
        # class_prior = [.9, .1] - we don't know
        classifier = BernoulliNB()
        #classifier = MultinomialNB(alpha = 0.02)
        #classifier = DecisionTreeClassifier(class_weight = { 0: 1, 1: 9 })
        #classifier = KNeighborsClassifier(n_neighbors=50, metric='minkowski', p=3)
        # classifier = RandomForestClassifier(
        #   max_depth = 32,
        #   n_estimators = 64,
        #   max_features = 0.25,
        #   class_weight = { 0: 1, 1: 9 },
        #   n_jobs = 3
        # )
        classifier.fit(self.X, self.Y)
        if self.calc_start is not None:
            print('Fitting time: ' +
                  str((dt.datetime.now() - self.calc_start).total_seconds()) +
                  's')

        #if self.ids is not None:
        results_proba = classifier.predict_proba(self.test_data)
        if self.calc_start is not None:
            print('Prediction time: ' +
                  str((dt.datetime.now() - self.calc_start).total_seconds()) +
                  's')
        print(results_proba[:100])
        if self.ids is not None:
            self.estimates[:, 0] = self.ids
            self.estimates[:, 1] = results_proba[:, 1]
        else:
            self.crosscheck_estimates = {
                'labels': classifier.predict(self.test_data),
                'proba': results_proba[:, 1]
            }
Example No. 10
    def _simple_cross_validate(self):
        """
        Use a simple fixed NB model to double-check the correctness of sklearn's random search and my random search.
        It confirms that our API, which is compatible with late fusion, is correct.
        :return:
        """
        kf = KFold(n_splits=self.args.cv_num,
                   random_state=self.args.random_seed)
        metric_values = {metric_name: [] for metric_name in self.metric_names}
        clf = BernoulliNB(alpha=0.8490, binarize=0.3086, fit_prior=True)
        clf = OneVsRestClassifier(clf, n_jobs=self.args.n_jobs)
        for train_idx_list, test_idx_list in kf.split(self.data_x,
                                                      self.data_y):
            X_train = self.data_x[train_idx_list]
            y_train = self.data_y[train_idx_list]
            X_test = self.data_x[test_idx_list]
            y_test = self.data_y[test_idx_list]
            clf.fit(X_train, y_train)
            y_predict_score = clf.predict_proba(X_test)
            y_predict = np.argmax(y_predict_score, axis=-1)
            metric_results = utils.evaluate_any_type(y_test, y_predict,
                                                     self.id2label)
            for metric_name in self.metric_names:
                metric_values[metric_name].append(
                    [metric_results[metric_name],
                     len(y_test)])

        metric_weighted_avg = self._get_weighted_avg(metric_values)
        for metric_name in ['f1']:
            print_to_log('The {0} score in cross validation is {1}'.format(
                metric_name, metric_values[metric_name]))
            print_to_log('The average {0} score is {1}'.format(
                metric_name, metric_weighted_avg[metric_name]))
        quit()
Example No. 11
def navie_bayes(df):
    """
    Prediction with Naive Bayes
    :param df:
    :return:
    """
    # Use only the day of week and the police district as classifier input features
    features = [
        'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
        'Sunday', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN',
        'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN'
    ]
    # Add the hour of the crime as a feature
    hour_fea = [x for x in np.arange(0, 24)]
    features = features + hour_fea

    # Split into a training set (70%) and a validation set (30%)
    training, validation = train_test_split(df, test_size=0.3)
    # Fit the Naive Bayes model and compute the log loss
    model = BernoulliNB()
    start = time.time()
    model.fit(training[features], training['crime'])
    cost_time = time.time() - start
    predicted = np.array(model.predict_proba(validation[features]))
    loss = log_loss(validation['crime'], predicted)
    logging.info(f'Naive Bayes modelling took {cost_time} seconds')
    logging.info(f'Naive Bayes log loss: {loss}')
Example No. 12
 def _bernoulli_NB(self):
     clf = BernoulliNB()
     clf.fit(self.X_train, self.y_train)
     score = clf.score(self.X_test, self.y_test)
     print('Accuracy rate of Naive Bayes: {0:.3f}'.format(score))
     y_pred = clf.predict_proba(self.X_test)
     ks(y_pred.T[0], self.y_test)
Example No. 13
def anomality_model(history, predict):
    '''
    model used: Bernoulli Naive Bayes
    :param history: the dataframe with which fit the model
    :param predict: the element to be used for the prediction
    :return: the probability of the user for the given array.

    '''
    user = predict["display_name"]
    history = history.drop(['timestamp'], axis=1)

    history["display_name"] = np.where(history["display_name"] == user, 1, 0)

    df_target = history[history["display_name"] == 1]
    df_core = history[history["display_name"] == 0]

    df_core = df_core.drop_duplicates()  # Reduce the "No-events"

    df = df_target.append(df_core)

    model = BernoulliNB()
    label = df['display_name']
    main = df.drop(['display_name'], axis=1)

    model.fit(main, label)

    #print(user)
    predict = predict.drop(['timestamp', 'display_name'])
    #print("### predict #####")

    output = model.predict_proba(np.array(predict).reshape(1, -1))

    return output[0][1]
Example No. 14
    def _eval_(self):
        '''
        INPUT: None
        OUTPUT: None

        This function simply re-runs Naive Bayes, creates a classifier report,
        and a confusion matrix. It also tests for accuracy, creates an accuracy
        score based on overall predictions and the top 2 predicted car types.
        '''
        X_train, X_test, y_train, y_test = \
            train_test_split(self.country_dummies, self.cars['cluster'],
                             test_size=0.2, random_state=42)

        nb = BernoulliNB(class_prior=self.new_priors)
        nb.fit(X_train, y_train)
        pred = nb.predict(X_test)
        self.nb_confusion = confusion_matrix(y_test, pred)
        diag_sum = np.trace(confusion_matrix(y_test, pred))
        total = np.sum(confusion_matrix(y_test, pred))
        self.nb_accuracy = diag_sum / float(total)
        self.nb_reports = classification_report(y_test, pred)
        self.baseline = np.bincount(y_test.T).max() / float(y_test.shape[0])

        self.probs = nb.predict_proba(X_test)
        top2 = np.argsort(self.probs, axis=1)[:, -2:]
        t_f = []
        for i in xrange(y_test.shape[0]):
            t_f.append(np.in1d(y_test[i], top2[i])[0])
        self.nb_top2_accuracy = sum(t_f) / float(len(t_f))
Example No. 15
def plot_roc_curves(X, y):
    plt.figure(figsize=(10, 6))
    lw = 2

    # train-val split and oversample
    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      test_size=.2,
                                                      random_state=0)
    adasyn = ADASYN(random_state=44)
    X_oversampled_train, y_oversampled_train = adasyn.fit_sample(
        X_train, y_train)

    # Logistic Regression
    # fit model and predict probabilities of validation data
    log_reg = LogisticRegression(max_iter=5000, n_jobs=-1, random_state=44)
    log_reg.fit(X_oversampled_train, y_oversampled_train)
    y_pred = log_reg.predict_proba(X_val)

    fpr, tpr, thresholds = roc_curve(y_val, y_pred[:, 1])
    model_auc = roc_auc_score(y_val, y_pred[:, 1])
    plt.plot(fpr,
             tpr,
             color='b',
             lw=lw,
             label=f'Logistic Regression, AUC: {model_auc:.4f}')

    # Naive Bayes
    # fit model and predict probabilities of validation data
    nb = BernoulliNB()
    nb.fit(X_oversampled_train, y_oversampled_train)
    y_pred = nb.predict_proba(X_val)

    fpr, tpr, thresholds = roc_curve(y_val, y_pred[:, 1])
    model_auc = roc_auc_score(y_val, y_pred[:, 1])
    plt.plot(fpr,
             tpr,
             color='r',
             lw=lw,
             label=f'Bernoulli Naive Bayes, AUC: {model_auc:.4f}')

    # SVC
    # fit model and predict probabilities of validation data
    svc = SVC(probability=True, random_state=1)
    svc.fit(X_oversampled_train, y_oversampled_train)
    y_pred = svc.predict_proba(X_val)

    fpr, tpr, thresholds = roc_curve(y_val, y_pred[:, 1])
    model_auc = roc_auc_score(y_val, y_pred[:, 1])
    plt.plot(fpr, tpr, color='g', lw=lw, label=f'SVC, AUC: {model_auc:.4f}')

    plt.plot([0, 1], [0, 1], c='violet', ls='--', label='Chance Line')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])

    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC Curves for top 3 Contending Models')
    plt.legend(loc='lower right', prop={'size': 10}, frameon=True)
    plt.savefig('ROC Curves for top 3 Contending Models')
Example No. 16
    def bnb(self):
        from sklearn.naive_bayes import BernoulliNB
        from sklearn.metrics import classification_report, roc_auc_score

        bnb = BernoulliNB()
        bnb.fit(self.X_train, self.y_train)

        y_hat_train = bnb.predict(self.X_train)
        y_hat_test = bnb.predict(self.X_test)

        acc_bnb = round(bnb.score(self.X_test, self.y_test) * 100, 2)
        print('Model Accuracy: ', acc_bnb)

        print('Naive Bayes:\n 1. train 2. test')
        print(
            classification_report(self.y_train, y_hat_train),
            classification_report(self.y_test, y_hat_test),
            sep='\n-------------------------------------------------------\n')

        y_score = bnb.predict_proba(self.X_test)
        print(
            'ovo',
            roc_auc_score(self.y_test, y_score, multi_class='ovo'),
            'ovr',
            roc_auc_score(self.y_test, y_score, multi_class='ovr'),
            sep='\n-------------------------------------------------------\n')
class NaiveBayesClassifierBernoulli:
    def __init__(
            self,
            matrixFileName="/Users/chengyu/Documents/python/data/matrixForLearning",
            dicFileName="/Users/chengyu/Documents/python/data/dictionary"):
        self.X, self.Y = load_svmlight_file(matrixFileName)
        self.dictionary = pickle.load(open(dicFileName, "rb"))
        self.bernoulliNB = BernoulliNB()
        self.bernoulliNB.fit(self.X, self.Y)
        self.matrixParser = Parser.MatrixParserForLearning()

    def classifyOneSentence(self, string):
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        if row != None:
            #             return self.bernoulliNB.predict(row)
            return self.bernoulliNB.predict(row)
        else:
            return None

    def classifyOneSentenceWithProbability(self, string):
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        if row != None:
            #             return self.bernoulliNB.predict(row)
            a = self.bernoulliNB.predict_proba(row)
            return a[0][1] - a[0][0]
        else:
            return None
Example No. 18
def bayes1():
    # Columns are: [north wind, muggy, cloudy, rain in the weather forecast]
    X = np.array([[0, 1, 0, 1],
                  [1, 1, 1, 0],
                  [0, 1, 1, 0],
                  [0, 0, 0, 1],
                  [0, 1, 1, 0],
                  [0, 1, 0, 1],
                  [1, 0, 0, 1]])
    # Whether it actually rained on each of the 7 days: 0 = no rain, 1 = rain
    y = np.array([0, 1, 1, 0, 1, 0, 0])
    counts = {}
    for label in np.unique(y):
        counts[label] = X[y == label].sum(axis=0)
    print('feature counts:\n{}'.format(counts))
    clf = BernoulliNB()
    clf.fit(X, y)
    Next_day = [[0, 0, 1, 0]]
    pre = clf.predict(Next_day)
    print('------------')
    if pre == [1]:
        print('Rain is coming')
    else:
        print('Sunny')
    print('--------------')
    # Naive Bayes is not good at predicting exact values; the probabilities it gives are for reference only
    print('Predicted class probabilities: {}'.format(clf.predict_proba(Next_day)))
    print('--------------')
Example No. 19
    def test_compare_to_sklearn_very_simple(self):
        data = pd.DataFrame([(True, 1, 1), (False, 2, 2), (True, 3, 1)],
                            columns=["y", "col1", "col2"])

        dnb = DiscreteNaiveBayes()
        dnb = dnb.fit(data[["col1", "col2"]], data["y"])
        print(dnb.predict(data[["col1", "col2"]]))

        # compare to MultinomialNB
        X = pd.concat(
            [pd.get_dummies(data["col1"]),
             pd.get_dummies(data["col2"])],
            axis=1)

        nb = MultinomialNB(alpha=0)
        nb = nb.fit(X, data["y"])
        print(nb.predict(X))

        self.assertListEqual(
            dnb.predict(data[["col1", "col2"]]).tolist(),
            nb.predict(X).astype(int).tolist())

        # compare to BernoulliNB
        bnb = BernoulliNB(alpha=0)
        bnb = bnb.fit(X, data["y"])
        print(bnb.predict(X))

        self.assertListEqual(
            dnb.predict(data[["col1", "col2"]]).tolist(),
            bnb.predict(X).astype(int).tolist())

        print(dnb.predict_proba(data[["col1", "col2"]]))
        print(nb.predict_proba(X))
        print(bnb.predict_proba(X))
Example No. 20
class BernoulliNBImpl():
    def __init__(self,
                 alpha=1.0,
                 binarize=0.0,
                 fit_prior=True,
                 class_prior=None):
        self._hyperparams = {
            'alpha': alpha,
            'binarize': binarize,
            'fit_prior': fit_prior,
            'class_prior': class_prior
        }

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        return self._sklearn_model.predict_proba(X)
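
A quick usage sketch of the wrapper above, assuming the class and this snippet sit in one module where SKLModel is an alias for sklearn's BernoulliNB (which the constructor arguments suggest):

import numpy as np
from sklearn.naive_bayes import BernoulliNB as SKLModel

X = np.array([[1, 0], [0, 1], [1, 1], [0, 0]])
y = np.array([1, 0, 1, 0])

model = BernoulliNBImpl(alpha=0.5).fit(X, y)
print(model.predict(X))               # hard class labels
print(model.predict_proba(X)[:, 1])   # probability of class 1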
Example No. 21
    def predict(self, item, **kwargs):
        seq = item.sequence_number
        tsk = item.task
        len_tsk = len(tsk)
        enc_vars = [0] * len_tsk
        x = self.cond_data[0]
        y = self.cond_data[seq + 1]

        # Task info
        for i in range(len_tsk):
            if tsk[i] == b:
                enc_vars[i] = 1

        # Conditional
        # System 1
        temp_sum = self.mfa[seq][1][0] + self.mfa[seq][1][1]
        p_sys1 = self.mfa[seq][1][1] / temp_sum
        # System 2
        cond_model = BernoulliNB()
        cond_model.fit(x, y)
        p_sys2 = cond_model.predict_proba([enc_vars])[:, 1][0]

        # Apply weighting
        p_val = w * p_sys1 + (1 - w) * p_sys2

        if p_val < 0.5:
            p_resp = str("A")
        else:
            p_resp = str("B")

        # PRINT SECTION TO BE DELETED
        #print("MFA:", self.mfa)
        #print("Conditional:", self.cond_data)

        return p_resp
Example No. 22
class bernoullinbreadmeonly(ClassificationModule):
    """A Bernoulli Naive Bayes"""
    def __init__(self, text_corpus):
        ClassificationModule.__init__(self,
                                      "Readme Only Bernoulli Naive Bayes",
                                      "A Bernoulli Naive Bayes-Classifier")
        # Create vectorizer and fit on all available Descriptions
        self.vectorizer = getTextVectorizer(
            9000)  # Maximum of different columns
        corpus = []
        for description in text_corpus:
            corpus.append(process_text(description))
        self.vectorizer.fit(corpus)

        self.clf = BernoulliNB()
        print "\t-", self.name

    def resetAllTraining(self):
        """Reset classification module to status before training"""
        self.clf = sklearn.base.clone(self.clf)

    def trainOnSample(self, sample, shuffle=True, verbose=True):
        """Trainiere (inkrementell) mit Sample. Evtl zusätzlich mit best. Menge alter Daten, damit overfitten auf neue Daten verhindert wird."""
        readme_vec = self.formatInputData(sample)
        label_index = getLabelIndex(sample)
        return self.clf.fit(readme_vec, np.expand_dims(label_index, axis=0))

    def train(self, samples, shuffle=True, verbose=True):
        """Trainiere mit Liste von Daten. Evtl weitere Paramter nötig (nb_epoch, learning_rate, ...)"""
        train_samples = []
        train_lables = []
        for sample in samples:
            formatted_sample = self.formatInputData(sample)[0].tolist()
            train_samples.append(formatted_sample)
            train_lables.append(getLabelIndex(sample))
        train_lables = np.asarray(train_lables)
        train_result = self.clf.fit(train_samples, train_lables)
        self.isTrained = True
        return train_result

    def predictLabel(self, sample):
        """Gibt zurück, wie der Klassifikator ein gegebenes Sample klassifizieren würde"""
        if not self.isTrained:
            return 0
        sample = self.formatInputData(sample)
        return self.clf.predict(sample)[0]

    def predictLabelAndProbability(self, sample):
        """Return the probability the module assignes each label"""
        if not self.isTrained:
            return [0, 0, 0, 0, 0, 0, 0, 0]
        sample = self.formatInputData(sample)
        prediction = self.clf.predict_proba(sample)[0]
        return [np.argmax(prediction)] + list(prediction)

    def formatInputData(self, sample):
        """Extract readme and transform to vector"""
        sd = getReadme(sample)
        # Returns numpy array which contains 1 array with features
        return self.vectorizer.transform([sd]).toarray()
Example No. 23
class Agent_skatkar(Agent):
    def __init__(self, name, seed=0):
        super(Agent_skatkar, self).__init__(name)

        self.clf = BernoulliNB()
        # self.clf = LogisticRegression()
        # self.clf = GaussianNB()
        # self.clf = KNeighborsClassifier()
        # self.clf = SVC()
        # self.clf = DecisionTreeClassifier()
        # self.clf = RandomForestClassifier()
        # self.clf = AdaBoostClassifier()

    def choose_one_product(self, products):

        XX = [i.features for i in self.my_products]

        yy = [i for i in self.product_labels]

        self.clf.fit(XX, yy)

        v = 0
        g_val = 0
        for i in range(len(products)):
            feat = self.clf.predict_proba(products[i].features)[0][1]
            val = products[i].value
            cost = products[i].price
            prob = feat
            temp = prob * (val - cost)
            if (temp > v):
                v = temp
                g_val = i
        return g_val
Example No. 24
    def run(self):
        data = self.input()
        out = self.output()

        test_df = pd.read_csv(data['training_data.csv'].path)
        pred_df = pd.read_csv(data['tournament_data.csv'].path)

        training_indices, testing_indices = train_test_split(
            test_df.index,
            stratify=test_df['target'].values,
            train_size=0.75,
            test_size=0.25)

        result1 = test_df.copy()

        # Perform classification with a BernoulliNB classifier
        bnb1 = BernoulliNB(alpha=0.64, binarize=0.23)
        bnb1.fit(result1.loc[training_indices].drop('target', axis=1).values,
                 result1.loc[training_indices, 'target'].values)

        # Perform prediction
        val = pred_df.drop('t_id', axis=1)
        nb = bnb1.predict_proba(val)
        pred_df['probability'] = nb[:, 1]

        pred_df.to_csv(out.path,
                       columns=('t_id', 'probability'),
                       index=None)
class NaiveBayesClassifierBernoulli:
    """
    this class capsules the Bernoulli NaiveBayes functions of scikit-learn in BernoulliNB class
"""
    def __init__(self, matrixFileName = matrixFilePath, dicFileName = dictFilePath):
        self.X,self.Y = load_svmlight_file(matrixFileName)
        self.dictionary = pickle.load(open(dicFileName, "rb"))
        self.bernoulliNB = BernoulliNB()
        self.bernoulliNB.fit(self.X, self.Y)
        self.matrixParser = Parser.MatrixParserForLearning()
        
    def classifyOneSentence(self, string):
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        if row != None:
#             return self.bernoulliNB.predict(row)
            return self.bernoulliNB.predict(row)
        else : return None
    
    def classifyOneSentenceWithProbability(self,string):
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        if row != None:
#             return self.bernoulliNB.predict(row)
            a = self.bernoulliNB.predict_proba(row)
            return a[0][1] - a[0][0]
        else : return None
Example No. 26
class Model(object):
    def __init__(self):
        # self.model = GradientBoostingClassifier(learning_rate=0.01, max_depth=8,
        # 	max_features=5, min_samples_leaf=5, n_estimators=1500)
        self.model = BernoulliNB(alpha=1)
        self.tfidf = TfidfVectorizer(max_df=1.0,
                                     min_df=1,
                                     stop_words='english',
                                     lowercase=True)
        pass

    def fit(self, X, y):
        # Import X and y as text
        X = self.tfidf.fit_transform(X)
        y = y
        self.model.fit(X, y)
        filename = 'data/model.pkl'
        pickle.dump(self, open(filename, 'wb'))
        return self

    def predict(self, X):
        X = self.tfidf.transform(X)
        predictions = self.model.predict(X)
        return predictions

    def predict_proba(self, X):
        X = self.tfidf.transform(X)
        proba_predictions = self.model.predict_proba(X)
        return proba_predictions

    def score(self, X, y):
        X = self.tfidf.transform(X)
        score = self.model.score(X, y)
        return score
Example No. 27
def NB(x_train, x_test, y_train, y):

    gau = GaussianNB()
    ber = BernoulliNB()

    ctstrain = x_train.iloc[:, 0:11]
    ctstest = x_test.iloc[:, 0:11]

    cattrain = x_train.iloc[:, 11:30]
    cattest = x_test.iloc[:, 11:30]

    gau.fit(ctstrain, y_train.values.ravel())
    ber.fit(cattrain, y_train.values.ravel())

    predprobs = gau.predict_proba(ctstest)
    predprobss = ber.predict_proba(cattest)

    probs = np.multiply(predprobs, predprobss)

    preds = []

    for i in probs:
        i.tolist()
        index_min = np.argmax(i)
        preds.append(index_min + 1)

    acc = 0
    i = 0
    for item in preds:
        if (preds[i] == y[i]):
            acc += 1
        i += 1

    acc = acc / len(preds)
    print("NB Accuracy: ", acc)
Example No. 28
    def _test_bernoulinb_classifer(self,
                                   num_classes,
                                   alpha=1.0,
                                   binarize=None,
                                   fit_prior=False,
                                   class_prior=None,
                                   labels_shift=0,
                                   backend="torch"):
        model = BernoulliNB(alpha=alpha,
                            binarize=binarize,
                            fit_prior=fit_prior,
                            class_prior=class_prior)
        np.random.seed(0)
        if binarize is None:
            X = np.random.randint(2, size=(100, 200))
        else:
            X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=100) + labels_shift

        model.fit(X, y)
        torch_model = hummingbird.ml.convert(model, backend, X)
        self.assertTrue(torch_model is not None)
        np.testing.assert_allclose(model.predict_proba(X),
                                   torch_model.predict_proba(X),
                                   rtol=1e-6,
                                   atol=1e-5)
Example No. 29
class Dota_NB:
    def __init__(self):
        fo = open("id_name.json", "r")
        self.id_name_dic = json.loads(fo.read())
        fo.close()
        self.hero_num = 0
        for i in self.id_name_dic:
            if int(i) > self.hero_num:
                self.hero_num = int(i)

        dataset = pd.read_csv("X.csv", header=None).values
        X_0 = dataset[:, 0:10]
        self.Y = dataset[:, 10]

        self.X = np.zeros([X_0.shape[0], self.hero_num * 2], dtype=np.bool_)
        for i in range(0, X_0.shape[0]):
            for k in range(0, 5):
                self.X[i][X_0[i][k] - 1] = True
            for j in range(5, 10):
                self.X[i][X_0[i][j] - 1 + self.hero_num] = True

        self.clf = BernoulliNB()
        self.clf.fit(self.X, self.Y)

    def predict_one(self, X_test):
        return self.clf.predict([X_test])[0]

    def predict_one_proba(self, X_test):
        return max(self.clf.predict_proba([X_test])[0])

    def predict_many(self, X_test):
        return self.clf.predict(X_test)
Example No. 30
def main(X_data, y_data, test_size):
    X_train, X_test, label_train, label_test = cross_validation.train_test_split(
        X_data, y_data, test_size=test_size)

    X_train = X_train.toarray()
    # create the classifier

    gnb = BernoulliNB()

    gnb.fit(X_train, label_train)

    # classifier prediction
    label_pred = gnb.predict(X_test)

    probs = gnb.predict_proba(X_test)

    correct_hist = []
    error_hist = []

    for i in xrange(0, len(label_pred)):
        max_prob = max(probs[i])
        if label_pred[i] == label_test[i]:
            correct_hist.append(max_prob)
        else:
            error_hist.append(max_prob)
    return correct_hist, error_hist
Example No. 31
def test_predict():
    model_path = train_nlu(
        nlu_data=NLU_DATA_PATH,
        config="tests/configs/sparse-naive-bayes-intent-classifier-config.yml",
        output="models",
    )

    interpreter = load_interpreter(model_path)

    # Get features from the pipeline and prepare data in the format sklearn
    # expects.
    training_data = load_data(NLU_DATA_PATH)
    for example in training_data.intent_examples:
        interpreter.featurize_message(example)
    model = interpreter.interpreter.pipeline[-1]
    X, y = model.prepare_data(training_data)

    # Fit the equivalent sklearn classifier.
    from sklearn.naive_bayes import BernoulliNB
    clf = BernoulliNB(alpha=0.1, binarize=0.0, fit_prior=True)
    clf.fit(X, y)

    # Check that predictions agree.
    assert (clf.predict_proba(X) == model.predict_prob(X)).all()
    assert (clf.predict(X) == model.predict(X)[0][:, 0]).all()
def run(X_train, y_train, X_test, seed):
    # Train
    clf = BernoulliNB(alpha=0.9)
    clf.fit(X_train, y_train)

    # Test
    y_score = clf.predict_proba(X_test)[:, 1]
    return y_score
Example No. 33
def score(train_X, train_y):

    X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size=0.01, random_state=10)

    clf = BernoulliNB(binarize=False, fit_prior=True, alpha=0.7)
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_valid)
    return log_loss(y_valid, y_pred)
Example No. 34
 def bnb_model(self):
     classifier = BernoulliNB()
     classifier.fit(self.train_x, self.train_label)
     index = list(classifier.classes_).index(1)
     predict_y = list(map(lambda x: x[index], classifier.predict_proba(self.test_x)))
     self.save_model(classifier, path="../../data/results/Bayes_model.pk")
     logger.info('bnb_model finished ...')
     return predict_y
Example No. 35
def ewh_hsi(rs):
    def daily_change(code, frdate, todate, base, numerator):
        e0 = yq.get_historical_prices(code, frdate, todate)
        print e0
        e1 = e0[1:]
        e2 = e0[2:]

        e3 = map(
            lambda i:
            (e2[i][0], 1 if (float(e2[i][numerator]) - float(e1[i][base])) /
             float(e1[i][base]) > 0 else 0, e2[i][numerator], e1[i][base]),
            [i for i in range(len(e2))])
        return e3

    idx = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Clos']
    EWH = daily_change('^DJI', '20150901', '20160330', idx.index('Adj Clos'),
                       idx.index('Adj Clos'))
    #EWH = EWH[:20]
    # 1 if opens high and 0 otherwise
    HSI = daily_change('^HSI', '20150901', '20160330', idx.index('Open'),
                       idx.index('Adj Clos'))
    #HSI = HSI[:20]
    print len(EWH), ''.join('%s,' % x[0] for x in EWH)
    print len(HSI), ''.join('%s,' % x[0] for x in HSI)
    HSI_dates = map(lambda x: x[0], HSI)
    # filter EWH entries for which a record has a corresponding next trade record in HSI
    # example, EWH trade date 2016-02-29 the corresponding record for HSI is 2016-03-01
    EWH_filtered = filter(
        lambda x: datetime2ystr(rs.after(ystr2datetime(x[0]))) in HSI_dates,
        EWH)
    print len(EWH_filtered), EWH_filtered
    hsi_ewh = map(
        lambda x: (HSI[HSI_dates.index(
            datetime2ystr(rs.after(ystr2datetime(x[0]))))][1], x[1]),
        EWH_filtered)

    xx = np.array(map(lambda x: [x[1], 0], hsi_ewh))
    yy = np.array(map(lambda x: x[0], hsi_ewh))

    model = BernoulliNB()
    model.fit(xx, yy)
    predicted = model.predict([[0, 0], [1, 0]])
    print predicted
    print model.predict_proba([[0, 0], [1, 0]])
    print model.feature_count_
class InstaNaiveBayesSybilRankerFactory:
	basePath = os.path.dirname(
			os.path.abspath(inspect.getfile(inspect.currentframe()))
		)
	modelFilename = basePath + \
		'/modelfiles/InstaNaiveBayesSybilRanker.joblib'
	trainDataFilename = basePath + \
		'/traindata/insta_train_data.csv'

	def __init__(self):
		# Importing dataset
		print(pd.__path__)
		self.data = pd.read_csv( self.trainDataFilename, sep=",",
									encoding='latin1')
		# data = self.data
		# data.fillna('', inplace=True)
		#
		# data = pd.read_csv( self.trainDataFilename,
		# 	sep=",", encoding='latin1' )

		self.data = self.data.drop(['username'], 1)
		self.X = self.data.drop('bot', 1)
		self.Y = self.data['bot']

		# create model
		self.gnb = BernoulliNB()

		# Train classifier
		self.gnb.fit(self.X, self.Y)

		# save model
		jl.dump(self, self.modelFilename)

	def validate(self):
		# Importing dataset
		X = self.X
		Y = self.Y
		bScores = cross_val_score(self.gnb, X, Y, cv=10)
		print('\tcrossvalidated accuracy after NaiveBayesSybilRanker: {}'\
			.format(bScores.mean()))

	def getRank(self, nodeName):
		args = dict(
				username = nodeName,
				usernames = [],
				login_user= SybilRanking.settings.insta_username,
				login_pass= SybilRanking.settings.insta_password)

		scraper = InstagramScraper(**args)
		scraper.login()
		userData = scraper.scrapeUser( username = nodeName)
		userData.pop(0)
		detectedClass = self.gnb.predict( [userData] )
		print("\tdetectedClass = ", detectedClass)
		predict_proba = self.gnb.predict_proba( [userData] )
		print("\tpredict_proba = ", predict_proba)
		return predict_proba[0][0]*100
Example No. 37
def bnb(X,y,Z,test_data):  
    from sklearn.naive_bayes import BernoulliNB
    bnb = BernoulliNB()
    bnb.fit(X,y)
    #MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)  
    test_probs_bnb = bnb.predict_proba(Z)[:, 1]
    sub = pd.DataFrame({'enrollment_id':test_data["enrollment_id"], 
                        'truth':test_probs_bnb}).set_index("enrollment_id")
    sub.to_csv('data\\result\\sixth_bnb.csv')
Example No. 39
def prob_couple_will_not_break_up(args):
	(observations, classes) = createObservations()
	observations = np.array(observations)
	classes = np.array(classes)
	# make naive classifier
	naive = BernoulliNB(binarize=None)
	naive.fit(observations, classes)
	probs = naive.predict_proba([create_observations_from_dict(args)])[0]
	prob_not_broken_up, probs_broken_up = probs
	return prob_not_broken_up / (prob_not_broken_up + probs_broken_up)
def convertToNumeric(df):
    features = df.columns[2:]
    for col in features:
        if((df[col].dtype == 'object')):
            print "Converting {0} to numerical data".format(col)
            labelEncode(df, col)
            nb = BernoulliNB()
            nb.fit(df[[col]], df['target'])
            new_col = col + "_binarized"
            df[new_col] = nb.predict_proba(df[[col]])[:, 1]
Example No. 41
class TextClassifier(object):
    """A text classifier model:
        - Vectorize the raw text into features.
        - Fit a naive bayes model to the resulting features.
    """
    def __init__(self):
        #self._vectorizer = TfidfVectorizer(stop_words='english')
        self._vectorizer = CountVectorizer()
        self._classifier = BernoulliNB()
        #self._classifier = MultinomialNB()

    def fit(self, X, y):
        """Fit a text classifier model.

        Parameters
        ----------
        X: A numpy array or list of text fragments, to be used as predictors.
        y: A numpy array or python list of labels, to be used as responses.

        Returns
        -------
        self: The fit model object.
        """
        # Code to fit the model.

        train_stuff = self._vectorizer.fit_transform(X, y)

        self._classifier.fit(train_stuff, y=y)

        return self

    def predict_proba(self, X):
        """Make probability predictions on new data."""

        stuff = self._vectorizer.transform(X)
        result = self._classifier.predict_proba(stuff)
        return result
        pass

    def predict(self, X):
        """Make predictions on new data."""

        stuff = self._vectorizer.transform(X)
        result = self._classifier.predict(stuff)
        return result
        pass

    def score(self, X, y):
        """Return a classification accuracy score on new data."""

        stuff = self._vectorizer.transform(X)
        result = self._classifier.score(stuff, y)

        return result
        pass
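
A minimal usage sketch for the TextClassifier above (toy data, not from the source project; assumes CountVectorizer and BernoulliNB are imported in the same module as the class):

docs = ['free prize money now', 'meeting agenda for monday',
        'win money fast', 'lunch with the team tomorrow']
labels = [1, 0, 1, 0]

clf = TextClassifier().fit(docs, labels)
print(clf.predict(['claim your free prize']))        # predicted label
print(clf.predict_proba(['claim your free prize']))  # class probabilities
print(clf.score(docs, labels))                       # training accuracy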
def nb_onehot():
    X_tr, Y_tr, X_va, Y_va, dictionary, X_te, id_list = util.create_or_load_data(freq_threshold=50)

    Y_te_pred_list = []
    sum_auc_va = 0.0
    for i in range(Y_tr.shape[1]):
        nb = BernoulliNB()

        j = 0
        batch_size = 10000
        while j < len(X_tr):
            end = min(j + batch_size, len(X_tr))
            batch = [data_process.seq2onehot(seq, dictionary) for seq in X_tr[j:end]]
            nb.partial_fit(batch, Y_tr[j:end, i], classes=[0, 1])
            j += batch_size

        logging.info("Finish training")

        Y_va_pred = []
        j = 0
        while j < len(X_va):
            end = min(j + batch_size, len(X_va))
            batch = [data_process.seq2onehot(seq, dictionary) for seq in X_va[j:end]]
            Y_va_pred.extend(nb.predict_proba(batch))
            j += batch_size

        auc_va = util.auc(Y_va[:, i], Y_va_pred)
        logging.info("tag{}, valid auc: ".format(i) + str(auc_va))
        sum_auc_va += auc_va

        Y_te_pred = []
        j = 0
        while j < len(X_te):
            end = min(j + batch_size, len(X_te))
            batch = [data_process.seq2onehot(seq, dictionary) for seq in X_te[j:end]]
            Y_te_pred.extend(nb.predict_proba(batch))
            j += batch_size
        Y_te_pred_list.append(Y_te_pred)

    logging.info("Avg auc: {}".format(sum_auc_va / Y_tr.shape[1]))

    util.submission(Y_te_pred_list, id_list)
Example No. 43
def ImpactData(train, test, exclude = []):

    features = train.columns[2:]
    print(type(features))
    for col in features:
        if((train[col].dtype == 'object') and (col not in exclude)):
            print(col)
            train, binfeatures = Binarize(col, train)
            test, _ = Binarize(col, test, binfeatures)
            nb = BernoulliNB()
            nb.fit(train[col+'_'+binfeatures].values, train.target.values)
            train[col] = \
                nb.predict_proba(train[col+'_'+binfeatures].values)[:, 1]
            test[col] = \
                nb.predict_proba(test[col+'_'+binfeatures].values)[:, 1]
            train.drop(col+'_'+binfeatures, inplace=True, axis=1)
            test.drop(col+'_'+binfeatures, inplace=True, axis=1)
            train[col] = train[col].astype(float)
            test[col] = test[col].astype(float)
    return train, test
Example No. 44
    def fit_model_8(self,lol = 0.0, toWrite=False):
        model = BernoulliNB(alpha = lol, binarize = 0.0)

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict_proba(X_test)[:,1]
            print("Model 8 score %f" % (logloss(Y_test,pred),))

        if toWrite:
            f2 = open('model8/model.pkl','w')
            pickle.dump(model,f2)
            f2.close()
Example No. 45
File: nbn.py Project: Dismeth/gui
def naivebayesian(dataset,configFIUse,configFI,alpha,binerize):
    ds = dataset
    ds.dprint("Start Creating BernoulliNB Bayesian Network")
    if configFIUse:
        ds.dprint("Excluding following columns: " + str(configFI))
        X_train = ds.X_train.drop(configFI, inplace=False, axis=1)
        X_test = ds.X_test.drop(configFI, inplace=False, axis=1)
    else:
        X_train = ds.X_train
        X_test = ds.X_test
    bnb = BernoulliNB(alpha=alpha, binarize=binerize)
    y_pred = bnb.fit(X_train, ds.y_train).predict(X_test)
    y_pred_proba = bnb.predict_proba(X_test)
    mislabeled = "Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0] ,(ds.y_test != y_pred).sum())
    auc = roc_auc_score(ds.y_test, y_pred_proba[:, 1])
    return (mislabeled,auc,y_pred_proba)
Example No. 46
def test_bnb():
    """
    Tests that BernoulliNB when alpha=1.0 gives the same values as
    those given for the toy example in Manning, Raghavan, and
    Schuetze's "Introduction to Information Retrieval" book:
    http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
    """

    # Training data points are:
    # Chinese Beijing Chinese (class: China)
    # Chinese Chinese Shanghai (class: China)
    # Chinese Macao (class: China)
    # Tokyo Japan Chinese (class: Japan)

    # Features are Beijing, Chinese, Japan, Macao, Shanghai, and Tokyo
    X = np.array([[1, 1, 0, 0, 0, 0],
                  [0, 1, 0, 0, 1, 0],
                  [0, 1, 0, 1, 0, 0],
                  [0, 1, 1, 0, 0, 1]])

    # Classes are China (0), Japan (1)
    Y = np.array([0, 0, 0, 1])

    # Fit BernoulliNB with alpha = 1.0
    clf = BernoulliNB(alpha=1.0)
    clf.fit(X, Y)

    # Check the class prior is correct
    class_prior = np.array([0.75, 0.25])
    assert_array_almost_equal(np.exp(clf.class_log_prior_), class_prior)

    # Check the feature probabilities are correct
    feature_prob = np.array([[0.4, 0.8, 0.2, 0.4, 0.4, 0.2],
                             [1/3.0, 2/3.0, 2/3.0, 1/3.0, 1/3.0, 2/3.0]])
    assert_array_almost_equal(np.exp(clf.feature_log_prob_), feature_prob)

    # Testing data point is:
    # Chinese Chinese Chinese Tokyo Japan
    X_test = np.array([[0, 1, 1, 0, 0, 1]])

    # Check the predictive probabilities are correct
    unnorm_predict_proba = np.array([[0.005183999999999999,
                                      0.02194787379972565]])
    predict_proba = unnorm_predict_proba / np.sum(unnorm_predict_proba)
    assert_array_almost_equal(clf.predict_proba(X_test), predict_proba)
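
The unnormalized posterior values asserted above can be reproduced directly from the class prior and the per-class feature probabilities; a short stand-alone check (my own illustration of the Bernoulli likelihood, not part of the original test):

import numpy as np

prior = np.array([0.75, 0.25])
p = np.array([[0.4, 0.8, 0.2, 0.4, 0.4, 0.2],
              [1/3.0, 2/3.0, 2/3.0, 1/3.0, 1/3.0, 2/3.0]])
x = np.array([0, 1, 1, 0, 0, 1])

# Bernoulli NB likelihood: multiply p_i where x_i = 1 and (1 - p_i) where x_i = 0
unnorm = prior * np.prod(p**x * (1 - p)**(1 - x), axis=1)
print(unnorm)                 # [0.005184   0.02194787]
print(unnorm / unnorm.sum())  # matches clf.predict_proba(X_test)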
Example No. 47
def get_numbers(spam_list, nonspam_list, test):
    corpus = spam_list + nonspam_list
    print "Training data size: ", len(corpus)
    vectorizer = TfidfVectorizer(min_df=1)
    X = vectorizer.fit_transform(corpus).toarray()
    Y = [1]*len(spam_list) + [0]*len(nonspam_list)
    # clf = LogisticRegression(penalty='l1')
    clf = BernoulliNB()
    clf.fit(X, Y)
    print "Data Fitted. Predicting reviews: ", len(test)
    return_list = []
    for data in test:
        result = clf.predict_proba(\
                    vectorizer.transform(\
                    [data]).toarray())[0][1]
        return_list.append(result)
    print "Returning data test size: ", len(return_list)
    return return_list
Example No. 48
def naive_bayes(train,validation):

    #features
    season=['Fall','Spring','Summer','Winter']
    #season=['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
    district=['BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION','NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']
    time=['first','second','third']
    features2 = [x for x in range(0,24)]
    Minute=[x for x in range(100,160)]

    features=district+time+Minute+season+features2

    #split set into train, validation
    train,validation= train_test_split(train, train_size=0.9)
    model = BernoulliNB()
    model.fit(train[features],train['Category'])

    #time calculation
    stop = timeit.default_timer()
    print "Runnin  time naive bayes is ", stop-start

    predicted = np.array(model.predict_proba(validation[features]))
    model1=model.predict(validation[features])
    model2=model.predict(train[features])

    print "-----------------------------Naive Bayes----------------------------------------------------------------------------"
    print "Precision is ",precision_score(validation['Category'].values.tolist(),model1,average='macro')
    print "Recall is ",recall_score(validation['Category'].values.tolist(),model1,average='macro')
    print "Accuracy is ", accuracy_score(validation['Category'].values.tolist(),model1)
    print "Training Accuracy is ", accuracy_score(train['Category'].values.tolist(),model2)
    Category_new=[]
    for i in range(0,len(model1)):
        Category_new.append(le_crime.classes_[model1[i]])

    #store result into file
    result=pd.DataFrame(predicted, columns=le_crime.classes_)
    result['Predicted']=Category_new
    result.to_csv('naiveBayes_test.csv', index = True, index_label = 'Id' )


    #log loss function
    print "Log loss is", log_loss(validation['Category'],predicted,eps=1e-15, normalize=True, sample_weight=None)
def multinomialNB(rawX, rawY, rawXTesting, rawYTesting): 
  X = np.array([[elem[0], elem[1]] for elem in rawX]) # relatedness and commonness
  senses = [elem[2] for elem in rawX] # which sense it comes from
  words = [elem[3] for elem in rawX] # which word it comes from
 
  Y = np.array(rawY)

  clf = BernoulliNB(alpha = 0.0, class_prior = None, fit_prior = True)
  # clf = MultinomialNB(alpha = 0.1, class_prior = None, fit_prior = True)
  clf.fit(X, Y)
  
  # This part needs to be changed to a sample

  sampleX = np.array([[elem[0], elem[1]] for elem in rawXTesting])
  sampleY = np.array(rawYTesting)
  
  q = clf.predict_proba(sampleX)
  predictedProb = [elem[1] for elem in q]

  predictedY = evaluation.getPredictedY(words, senses, predictedProb, rawXTesting, rawYTesting)
  return evaluation.evaluationMetrics(sampleY, predictedY)
Example No. 52
# Load the data (Assumes your current working directory is the Classify Job Titles problem directory)
job_titles = pd.read_csv("_Data/jobtitles.csv")

# Convert the categories Technology, Sales, and Finance to numbers 0, 1, and 2
y = list(map(lambda x: {'finance':0, 'sales':1, 'technology':2}[x], job_titles.job_category[:10]))

# Build an MxN matrix where M is the number of samples, N is the number of unique words in the corpus
# (subject to parameters), and element [i,j] indicates whether or not sample i contains word j
count_vectorizer = CountVectorizer()
count_vectorizer.fit(job_titles.job_title)
X = count_vectorizer.transform(job_titles.job_title[:10])

# Dump results into a pandas DataFrame since this is a small example for illustrative purposes
df = pd.DataFrame(X.toarray(), columns=count_vectorizer.get_feature_names())
df

# Now consider a new title
X_new = count_vectorizer.transform(job_titles.job_title[10:12])
df_new = pd.DataFrame(X_new.toarray(), columns=count_vectorizer.get_feature_names())
df_new

# Check our results with scikit-learn's Bernoulli Naive Bayes classifier
naive_bayes = BernoulliNB(alpha=0.000000001)  # make alpha virtually 0
naive_bayes.fit(X=df, y=y)
naive_bayes.predict_proba(X=df_new)

# Re-run with alpha = 1
naive_bayes = BernoulliNB(alpha=1)  # make alpha 1
naive_bayes.fit(X=df, y=y)
naive_bayes.predict_proba(X=df_new)
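
# A minimal, self-contained sketch (not part of the original example) showing how the
# alpha=1 run above can be checked by hand: BernoulliNB estimates the smoothed per-class
# feature probability P(w|c) = (N_cw + alpha) / (N_c + 2*alpha) and exposes it through
# feature_log_prob_. The toy matrix below is purely an illustrative assumption.
import numpy as np
from sklearn.naive_bayes import BernoulliNB

X_toy = np.array([[1, 0],
                  [1, 1],
                  [0, 1],
                  [0, 0]])
y_toy = np.array([0, 0, 1, 1])
toy_nb = BernoulliNB(alpha=1)
toy_nb.fit(X_toy, y_toy)
# word 0 appears in both class-0 samples: (2 + 1) / (2 + 2) = 0.75
print(np.exp(toy_nb.feature_log_prob_)[0, 0])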
Exemplo n.º 53
0
Arquivo: nb.py Projeto: choupi/KDD2014
X=[]
Y=[]
L=[]
trainf=open(sys.argv[1])
for l in trainf.readlines():
    sl = l.strip().split()
    L.append(sl[0])
    Y.append(int(sl[1]))
    xx=map(float,sl[2:])
    X.append(xx)

#print X
clf = BernoulliNB()
clf = clf.fit(X, Y)
print clf.feature_log_prob_
#print clf.oob_score_
Yp=clf.predict_proba(X)
YY=[p[1] for p in Yp]
print roc_auc_score(Y,YY)

pf=open('clfNB.pkl','w')
s = pickle.dump(clf, pf)
pf.close()

X=[]
L=[]
testf=open(sys.argv[2])
for l in testf.readlines():
    sl = l.strip().split()
    L.append(sl[0])
    xx=map(float,sl[1:])
    X.append(xx)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, s=50)
ax.set_title("PCA reduction (2d) of transformed data (%dd)" %
             X_transformed.shape[1])
ax.set_xticks(())
ax.set_yticks(())

# Plot the decision in original space. For that, we will assign a color to each
# point in the mesh [x_min, x_max] x [y_min, y_max].
h = .01
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# transform grid using RandomTreesEmbedding
transformed_grid = hasher.transform(np.c_[xx.ravel(), yy.ravel()])
y_grid_pred = nb.predict_proba(transformed_grid)[:, 1]

ax = pl.subplot(223)
ax.set_title("Naive Bayes on Transformed data")
ax.pcolormesh(xx, yy, y_grid_pred.reshape(xx.shape))
ax.scatter(X[:, 0], X[:, 1], c=y, s=50)
ax.set_ylim(-1.4, 1.4)
ax.set_xlim(-1.4, 1.4)
ax.set_xticks(())
ax.set_yticks(())

# transform grid using ExtraTreesClassifier
y_grid_pred = trees.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

ax = pl.subplot(224)
ax.set_title("ExtraTrees predictions")
Exemplo n.º 55
0
                Danger_Date_feature, Not_Danger_Date_feature,
                #District_Crime_feature, District_Other_feature
               ]
features = [str(x[1]) for x in feature_list if x[0]]


# # Training and Validation

# In[10]:

training, validation = train_test_split(train_df_new, train_size=.60)
#Score to beat 2.55250 

model = BernoulliNB()
model.fit(training[features], training['dummy_Category'])
predicted = np.array(model.predict_proba(validation[features]))
log_loss(validation['dummy_Category'], predicted)


# In[ ]:

from xgboost import XGBClassifier
model = XGBClassifier(max_depth=8,n_estimators=128)
model.fit(training[features], training['dummy_Category'])
predicted = np.array(model.predict_proba(validation[features]))
log_loss(validation['dummy_Category'], predicted)


# In[ ]:

from sklearn.ensemble import RandomForestClassifier
Exemplo n.º 56
0
finalPredictList = np.empty([len(testData),44], dtype=int)

counter = 0
for testList in testData:
    oriDestPair = testList[2] + testList[3]
    paxCount = int(testList[-2])
    tempXList = [0] * 44
    tempXList[pair[oriDestPair]] = 1
    if paxCount > 1:
        tempXList[43] = 1
    else:
        tempXList[42] = 1
    finalPredictList[counter] = tempXList
    counter += 1

predicted = model.predict_proba(finalPredictList)
predictedValue = model.predict(finalPredictList)

#Result Array
result = np.empty(len(testData),dtype=int)
ctr = 0
for tempList in testData:
    result[ctr] = fareClass[tempList[-1]]
    ctr += 1

print("Accuracy : ", Metrices.accuracy_score(result, predictedValue, normalize=False))

#Writing to a CSV file
csvFileHandle = open("data\\NBBernoulli.csv",'w', newline='')
with csvFileHandle:
    csvWriter = csv.writer(csvFileHandle)
            # list with active/inactive info
            ys_fit = [1]*num_actives + [0]*(len(training_list)-num_actives)
            # training fps
            train_fps = np_fps_act + [np_fps_dcy[i] for i in training_list[num_actives:]]
            # fit Naive Bayes
            ml.fit(train_fps, ys_fit)

            # test fps and molecule info
            test_fps = [np_fps_div_act[i] for i in test_list[:num_test_actives]]
            test_fps += [np_fps_dcy[i] for i in test_list[num_test_actives:]]
            test_mols = [[div_actives[i][0], 1] for i in test_list[:num_test_actives]]
            test_mols += [[decoys[i][0], 0] for i in test_list[num_test_actives:]]

            # rank based on probability
            single_score = ml.predict_proba(test_fps)
            # store: [probability, internal ID, active/inactive]
            single_score = [[s[1], m[0], m[1]] for s,m in zip(single_score, test_mols)]
            single_score.sort(reverse=True)
            scores['nb_'+fp_build].append(single_score)

        # write scores to file
        if do_append:
            outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'ab+') # binary format
        else:
            outfile = gzip.open(outpath+'/list_'+str(target)+'.pkl.gz', 'wb+') # binary format
        for fp in ['nb_'+fp_build]:
            cPickle.dump([fp, scores[fp]], outfile, 2)
        outfile.close()
        print "scoring done and scored lists written"
Exemplo n.º 58
0
#collect eigenvectors that correspond to 95% variance
ninetyfiveIndex =  np.nonzero(cum_var_exp > 0.95)[0][0] 
pca = PCA(n_components = ninetyfiveIndex)
pca.fit(x_train_std)
x_train_pca = pca.transform(x_train_std)
x_test_pca = pca.transform(x_test_std)


#Run a Bernoulli Naive Bayes classifier on the data
#While we use the predict_proba method here, Naive Bayes is notorious
#for being a poor probability estimator, so it's likely that these
#probabilities will be way off
#In addition, Bernoulli Naive Bayes doesn't sit well with the fact that
#age is not a binary variable. It will be binarized, though this is less
#harmful than it may seem, as a cutoff at a single year of age seems
#to be an important factor in determining the outcome
#(a hedged sketch after this snippet illustrates both points)
bnb = BernoulliNB()
bnb.fit(x_train_pca, y_train)
y_pred = bnb.predict_proba(x_test_pca)

#Label data appropriately and export to csv
data = {'Return_to_owner': y_pred[:,0],\
        'Euthanasia': y_pred[:,1],\
        'Died': y_pred[:,2], \
        'Adoption': y_pred[:,3],\
        'Transfer': y_pred[:, 4]}
results = pd.DataFrame(data = data)
results.to_csv("naive_bayes_results.csv")
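
# Hedged follow-up sketch (not from the original snippet): both caveats above can be
# addressed with scikit-learn tools -- BernoulliNB's `binarize` threshold controls where a
# continuous feature such as age is cut, and CalibratedClassifierCV can rescale the
# typically over-confident Naive Bayes probabilities. The synthetic one-feature data set
# below is an assumption made only so the example runs on its own.
import numpy as np
from sklearn.naive_bayes import BernoulliNB
from sklearn.calibration import CalibratedClassifierCV

rng = np.random.RandomState(0)
age = rng.uniform(0, 15, size=(300, 1))      # synthetic "age in years" feature
label = (age[:, 0] < 1).astype(int)          # outcome dominated by a one-year cutoff
flip = rng.rand(300) < 0.1                   # add some label noise
label = np.where(flip, 1 - label, label)

# binarize=1.0 makes the Bernoulli split coincide with the one-year cutoff,
# and sigmoid calibration tempers the raw Naive Bayes probabilities
calibrated = CalibratedClassifierCV(BernoulliNB(binarize=1.0), method='sigmoid', cv=3)
calibrated.fit(age, label)
print(calibrated.predict_proba(age[:5]))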

Exemplo n.º 59
0
def BernoulliNBClassify_Proba(enrollment_id, trainData, trainLabel, testData):
    nbClf = BernoulliNB()
    nbClf.fit(trainData, ravel(trainLabel))
    testLabel = nbClf.predict_proba(testData)[:,1]
    saveResult(enrollment_id, testLabel, 'Proba_sklearn_BernoulliNB_Result.csv')
    return testLabel
Exemplo n.º 60
0
	if test_Y[i] == 0 and preds[i] == 1:
		incorrect_uncensored += 1

print "Total Accuracy: " + str(correct/float(correct + incorrect))
print "Censored Accuracy: " + str(correct_censored/float(correct_censored + incorrect_censored))
print "Uncensored Accuracy: " + str(correct_uncensored/float(correct_uncensored + incorrect_uncensored))
print "Rankings: "
# print clf.ranking_


print "Training Bernoulli Naive Bayes"

clf = BernoulliNB(alpha=1.0)
clf.fit(train_X, train_Y)
preds = clf.predict(test_X)
probs = clf.predict_proba(test_X)
fpr_lr_nb, tpr_lr_nb, thresholds_lr_nb = metrics.roc_curve(test_Y, probs.T[1], pos_label=1)
roc_auc_lr_nb = metrics.auc(fpr_lr_nb, tpr_lr_nb)

print "AUC nb: " + str(roc_auc_lr_nb)



correct = 0
incorrect = 0
correct_censored = 0
incorrect_censored = 0
correct_uncensored = 0
incorrect_uncensored = 0

for i in range(len(preds)):