Python BernoulliNB.fit示例，sklearn.naive_bayes.BernoulliNB.fit Python示例

示例#1

0

显示文件

文件： Models.py 项目： ineilm/BountyApp

def BernoulliNB_1(train_predictors,test_predictors,train_target,test_target):
    """Fit a default BernoulliNB on the training split and score it on the test split.

    Prints the test accuracy and returns ``(accuracy, predicted)``, where
    ``predicted`` holds the labels predicted for ``test_predictors``.
    """
    model = BernoulliNB()
    model.fit(train_predictors, train_target)
    predicted = model.predict(test_predictors)
    accuracy = accuracy_score(test_target, predicted)
    # A single parenthesised argument prints the same text on Python 2 and 3.
    print("Accuracy for Bernoulli Naive Bayes: " + str(accuracy))
    return accuracy, predicted

示例#2

0

显示文件

文件： dataset_one_learner.py 项目： Ikram/DUMLS14

def tryBinomialNaiveBayes(goFast):
  """Grid-search BernoulliNB hyper-parameters on the dt1 svmlight data sets.

  Every combination of ``alpha``, ``binarize`` and ``fit_prior`` is fitted on
  the training file and scored on the validation file; progress and each new
  best score are printed.

  goFast -- when true, use the smaller ``dt1_1500.*`` files (loaded with a
            fixed feature count); otherwise use the full ``dt1.*`` files.
  """
  from sklearn.datasets import load_svmlight_file
  from sklearn.naive_bayes import BernoulliNB

  best_score = 0

  if goFast:
    training_data, training_labels = load_svmlight_file("dt1_1500.trn.svm", n_features=253659, zero_based=True)
    validation_data, validation_labels = load_svmlight_file("dt1_1500.vld.svm", n_features=253659, zero_based=True)
    testing_data, testing_labels = load_svmlight_file("dt1_1500.tst.svm", n_features=253659, zero_based=True)
  else:
    # NOTE(review): no n_features is given here, so the three matrices only
    # line up if every file happens to mention the highest feature index.
    training_data, training_labels = load_svmlight_file("dt1.trn.svm")
    validation_data, validation_labels = load_svmlight_file("dt1.vld.svm")
    testing_data, testing_labels = load_svmlight_file("dt1.tst.svm")
  # The testing split is loaded (as before) but not used by the search.

  grid = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
  for alpha_value in grid:
    for binarize_value in grid:
      for fit_prior_value in [True, False]:
        # Pass hyper-parameters by keyword: recent scikit-learn releases
        # reject positional constructor arguments, and keywords do not
        # depend on the parameter order.
        binary_operator = BernoulliNB(alpha=alpha_value,
                                      binarize=binarize_value,
                                      fit_prior=fit_prior_value)
        binary_operator.fit(training_data, training_labels)
        current_score = binary_operator.score(validation_data, validation_labels)

        parameters = " ".join([str(alpha_value), str(binarize_value), str(fit_prior_value)])
        print("Current test: " + parameters)
        print("Current score: " + str(current_score))

        if current_score > best_score:
          best_score = current_score
          print("***NEW MAXIMUM SCORE: " + str(best_score))
          print("***NEW MAXIMUM PARAMETERS: " + parameters)

  print("Best score was " + str(best_score))

示例#3

0

显示文件

文件： NaiveBayes_skl.py 项目： bsherin/tactic

    def render_content(self):
        """Train a BernoulliNB on the selected text column and report its fit.

        Returns an HTML snippet with the accuracy (rounded to two decimals)
        and the confusion matrix, both measured on the same data the
        classifier was trained on. Returns a plain message when no text
        source is selected.
        """
        if self.text_source is None:
            return "No text source selected."
        from sklearn import metrics
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.naive_bayes import BernoulliNB

        self.dm("creating vectorizer")
        stop_words = self.get_user_list(self.stop_list)
        vectorizer = CountVectorizer(stop_words=stop_words, max_features=self.vocab_size)
        documents = self.get_column_data(self.text_source)

        self.dm("using vectorizer")
        features = vectorizer.fit_transform(documents)
        codes = self.get_column_data(self.code_source)

        self.dm("creating classifier")
        classifier = BernoulliNB()
        classifier.fit(features, codes)
        accuracy = classifier.score(features, codes)

        self.dm("predicting")
        confusion = metrics.confusion_matrix(codes, classifier.predict(features))

        self.dm("displaying result")
        return "accuracy is " + str(round(accuracy, 2)) + '<pre>' + str(confusion) + '</pre>'

示例#4

0

显示文件

文件： LearningModel.py 项目： s3341458/rmit-twitter-amalysis-heroku

class NaiveBayesClassifierBernoulli:
    """Wrap scikit-learn's BernoulliNB, trained once at construction time.

    The training matrix is read from an svmlight file and the dictionary used
    to vectorise sentences is unpickled from ``dicFileName``.
    """

    def __init__(self, matrixFileName = matrixFilePath, dicFileName = dictFilePath):
        self.X,self.Y = load_svmlight_file(matrixFileName)
        # Close the handle deterministically instead of leaking it.
        # NOTE: unpickling runs arbitrary code; only load trusted dictionary files.
        with open(dicFileName, "rb") as dictionary_file:
            self.dictionary = pickle.load(dictionary_file)
        self.bernoulliNB = BernoulliNB()
        self.bernoulliNB.fit(self.X, self.Y)
        self.matrixParser = Parser.MatrixParserForLearning()

    def classifyOneSentence(self, string):
        """Return the predicted label array for ``string``, or None when the
        parser produces no row for it."""
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        # Identity test: ``row != None`` is evaluated elementwise (or is
        # ambiguous) when the row is an array-like object.
        if row is None:
            return None
        return self.bernoulliNB.predict(row)

    def classifyOneSentenceWithProbability(self,string):
        """Return P(class index 1) - P(class index 0) for ``string``, or None
        when the parser produces no row for it."""
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        if row is None:
            return None
        probabilities = self.bernoulliNB.predict_proba(row)
        return probabilities[0][1] - probabilities[0][0]

示例#5

0

显示文件

文件： algorithms.py 项目： HugoLG/SFCrimeClassification

def NB_train_classifier(train_x, train_y):
    """Return a default BernoulliNB fitted on ``train_x`` / ``train_y``."""
    model = BernoulliNB()
    model.fit(train_x, train_y)
    return model

示例#6

0

显示文件

文件： main.py 项目： jaksah/MLProject

def _read_bernoulli_articles(pattern):
    """Collect samples and targets from every article file matching ``pattern``.

    The target name of a file is its base name without the
    ``-articles.json`` suffix.
    """
    data = []
    targets = []
    for f in glob.glob(pattern):
        target = f.replace("-articles.json", "")
        target = re.sub(r".*\/+", "", target)
        output = readWholeFileBernoulli(f, target)
        data.extend(output[0])
        targets.extend(output[1])
    return data, targets


def bernoulli_classify():
    """Train a BernoulliNB on the training articles and report test accuracy.

    Prints one line per test sample (true vs. predicted target) followed by a
    summary line with the number of correct predictions and the ratio.
    """
    traindata, traintarget = _read_bernoulli_articles(
        "../../../res/articles/training_data/*-articles.json")
    testdata, testtarget = _read_bernoulli_articles(
        "../../../res/articles/test_data/*-articles.json")

    clf = BernoulliNB()
    clf.fit(traindata, traintarget)

    ncorrect = 0
    total = len(testdata)
    for sample, correct in zip(testdata, testtarget):
        # NOTE(review): samples are passed one at a time exactly as before;
        # scikit-learn expects a 2-D input here - confirm what
        # readWholeFileBernoulli yields per sample.
        predict = clf.predict(sample)
        if correct == predict[0]:
            ncorrect += 1

        print ("Correct: {0} - Predicted: {1}".format(correct, predict[0]))

    # Guard the ratio: with no test files the original divided by zero.
    correctness = ncorrect * 1.0 / total if total else 0.0
    print(" ".join(["Correct ", str(ncorrect), " Total ", str(total),
                    " Correctness ", str(correctness)]))

示例#7

0

显示文件

文件： test_CountVector.py 项目： luoyan/miniprog

def test_BernouliNB2():
    """Fit BernoulliNB on eight 2-feature points with distinct labels and
    print the prediction for each of eight probe points."""
    X = np.array([
        [0, 1],
        [1, 1],
        [1, 0],
        [-1, 1],
        [1000, 1000],
        [1000, 10001],
        [998, 800],
        [990, 1100],
    ])
    print('X ' + str(X))
    Y = np.array([1, 2, 3, 4, 5, 6, 7, 8])
    print('Y ' + str(Y))
    clf = BernoulliNB(alpha = 1)
    clf.fit(X, Y)
    X2 = np.array([
        [1002, 1010],
        [1010, 910],
        [1003, 980],
        [1008, 1030],
        [-1, -1],
        [-3, -10],
        [40, 1],
        [1, -100],
    ])
    for i, sample in enumerate(X2):
        # Estimators expect a 2-D (n_samples, n_features) input, so a single
        # sample is reshaped into a one-row matrix.
        pred_ret = clf.predict(sample.reshape(1, -1))
        # Report the probe point that was actually classified (the original
        # printed the training row X[i] next to the prediction for X2[i]).
        print('X[' + str(i) + '] = ' + str(sample) + ' pred_ret ' + str(pred_ret))

示例#8

0

显示文件

文件： Benouilli Naive Bayes.py 项目： Datainsightx/kaggleprojects

def MungeData(train, test):
    """Drop a fixed set of columns and replace each object-typed feature with
    a naive-Bayes probability, then cast features to float and fill NaNs.

    For every object-typed column from the third column onward, the column is
    binarized via ``Binarize``, a BernoulliNB is fitted on the binarized
    training columns against ``train.target``, and the column is overwritten
    in both frames with the predicted probability at class index 1.
    Returns the processed ``(train, test)`` frames.
    """
    todrop = ['v22', 'v112', 'v125', 'v74', 'v1', 'v110', 'v47']
    print(todrop)

    for frame in (train, test):
        frame.drop(todrop, axis=1, inplace=True)

    for col in train.columns[2:]:
        if train[col].dtype != 'object':
            continue
        print(col)
        train, binfeatures = Binarize(col, train)
        test, _ = Binarize(col, test, binfeatures)
        # Names of the binarized helper columns created for this feature.
        binary_cols = col+'_'+binfeatures
        nb = BernoulliNB()
        nb.fit(train[binary_cols].values, train.target.values)
        train[col] = nb.predict_proba(train[binary_cols].values)[:, 1]
        test[col] = nb.predict_proba(test[binary_cols].values)[:, 1]
        train.drop(binary_cols, inplace=True, axis=1)
        test.drop(binary_cols, inplace=True, axis=1)

    features = train.columns[2:]
    train[features] = train[features].astype(float)
    test[features] = test[features].astype(float)
    for frame in (train, test):
        frame.fillna(-1, inplace=True)
    return train, test

示例#9

0

显示文件

文件： main.py 项目： Alitzlan/cs578

def bnb_fit(train_data, train_lbl_data):
    """Print a start message and return a default BernoulliNB fitted on the
    given data and labels."""
    from sklearn.naive_bayes import BernoulliNB
    print("Starts bnb")

    model = BernoulliNB()
    model.fit(train_data, train_lbl_data)
    return model

示例#10

0

显示文件

文件： predictions.py 项目： kwheeler27/insight_datasci

def predict(cur, plyr_id, game_plyrs):
  """Return the expected value predicted for ``plyr_id`` in an upcoming game.

  A games-by-players matrix is built for the games the player took part in,
  a BernoulliNB is fitted against the known outputs, and the class
  probabilities for the upcoming game's roster (``game_plyrs``) are
  normalised and combined with fixed bucket midpoints. Returns 0 when the
  player has no games, otherwise the expected value rounded to one decimal.
  """
  # Training set 'X': one row per game played, one column per known player.
  all_plyrs = all_player_ids(cur)
  games = games_played_in(cur, plyr_id)
  n_cols = all_plyrs.shape[0]
  m_rows = games.shape[0]
  X = pd.DataFrame(np.zeros((m_rows, n_cols)), index=games, columns=all_plyrs)
  populate_training_set(cur, X, games, plyr_id)
  print(" ".join(["X: ", str(X.values)]))

  # Vector of known output values.
  Y = training_output_vector(cur, games, plyr_id)
  print(" ".join(["(len) Y: ", str(len(Y)), str(Y)]))

  # The probe frame gets its own single-row buffer. The original built it
  # from the training-sized zero array, which gave it the wrong number of
  # rows and could leave both frames backed by the same memory.
  test_X = pd.DataFrame(np.zeros((1, n_cols)), columns=all_plyrs)
  update_training_matrix(game_plyrs, 0, test_X)

  if len(X.values) == 0:
    return 0

  # Run the Bernoulli NB classifier.
  nb_clf = BernoulliNB()
  nb_clf.fit(X, Y)
  print(" ".join(["test_X: ", str(test_X.values)]))
  raw_probs = nb_clf.predict_proba(test_X)[0]
  # Hand over a copy so the raw vector printed below stays untouched even if
  # normalize_probs works in place.
  nb_norm_prob = normalize_probs(raw_probs.copy())
  avgs = [1.5, 4.5, 7.5, 10.5, 13.5, 16.5, 19.5, 22.5, 25.5, 28.5, 31.5]
  print(" ".join(["param vector: ", str(raw_probs)]))
  print(" ".join(["probs: ", str(nb_norm_prob)]))
  print(avgs)
  ev = expected_val(nb_norm_prob, avgs) #can also calc dot product
  return round(ev, 1)

示例#11

0

显示文件

文件： feature_classification_functions.py 项目： svenvdbeukel/Short-text-corpus-with-focus-on-humor-detection

def combined_experiment(train_x,train_y,test_x,test_y,train_f_x,train_f_y,test_f_x,test_f_y, bias):
    """Return the accuracy of a three-classifier vote.

    Content classifiers (MultinomialNB, BernoulliNB) are trained on
    ``train_x``; feature classifiers (linear and RBF SVC) on ``train_f_x``.

    bias -- 'content': vote between MultinomialNB, BernoulliNB and the linear
            SVC, falling back to BernoulliNB when MultinomialNB agrees with
            neither of the others.
            'syntax': vote between MultinomialNB, linear SVC and RBF SVC,
            falling back to the linear SVC.
            Any other value prints a hint and yields an accuracy of 0.0.
    """
    if bias not in ('content', 'syntax'):
        # The original fell through and compared an empty label array with
        # test_y, which older NumPy evaluated to 0.0 and newer NumPy rejects.
        print('Please enter a valid bias ("syntax" or "content")!')
        return 0.0

    # Only the classifiers taking part in the requested vote are trained.
    clf_c1 = MultinomialNB()
    clf_c1.fit(train_x,train_y)
    p1 = clf_c1.predict(test_x)
    clf_f1 = svm.SVC(kernel='linear',cache_size = 512)
    clf_f1.fit(train_f_x,train_f_y)
    p3 = clf_f1.predict(test_f_x)

    if bias == 'content':
        clf_c2 = BernoulliNB()
        clf_c2.fit(train_x,train_y)
        p2 = clf_c2.predict(test_x)
        p_combined = np.where((p1 == p2) | (p1 == p3), p1, p2)
    else:
        clf_f2 = svm.SVC(kernel='rbf',cache_size = 512)
        clf_f2.fit(train_f_x,train_f_y)
        p4 = clf_f2.predict(test_f_x)
        p_combined = np.where((p1 == p3) | (p1 == p4), p1, p3)

    # The builtin float replaces the np.float_ alias, which newer NumPy no
    # longer provides.
    accuracy = np.sum(p_combined == test_y) / float(len(test_y))
    return accuracy

示例#12

0

显示文件

文件： classfication.py 项目： Skylatitude/BNP-Kaggle-Competition

 def doclassify(self, type='normal'):
     """Fit a default BernoulliNB on the training data and print its
     training-set score. Only ``type == 'normal'`` does anything."""
     if type == 'normal':
         clf = BernoulliNB()
         clf.fit(self.train_x, self.train_y)
         # The stray statement that built and discarded a second BernoulliNB
         # was removed; it had no effect.
         score = clf.score(self.train_x, self.train_y)
         print(' '.join(['score = ', str(score)]))

示例#13

0

显示文件

文件： ml_docs_classification_2.py 项目： RaoUmer/docs_classification

def BNB(data_train, data_train_vectors, data_test_vectors, **kwargs):
    """Fit BernoulliNB(alpha=.01) on the training vectors against
    ``data_train.target`` and return the labels predicted for the test
    vectors. Extra keyword arguments are accepted and ignored."""
    model = BernoulliNB(alpha=.01)
    model.fit(data_train_vectors, data_train.target)
    return model.predict(data_test_vectors)

示例#14

0

显示文件

文件： classifiers.py 项目： nate-parrott/relationship-thing

def compareClassifiers():
	"""Compare four classifiers on the observations from createObservations().

	Returns a list of ``(training accuracy, mean 10-fold CV accuracy)`` tuples
	in the order: decision tree, Bernoulli naive Bayes, linear SVM, logistic
	regression.
	"""
	(observations, classes) = createObservations()
	observations = np.array(observations)
	classes = np.array(classes)

	# Listed in the order of the returned results.
	candidates = [
		tree.DecisionTreeClassifier(),
		BernoulliNB(binarize=None),
		LinearSVC(),
		LogisticRegression(),
	]

	results = []
	for model in candidates:
		model.fit(observations, classes)
		train_score = model.score(observations, classes)
		cv_scores = cross_validation.cross_val_score(model, observations, classes, scoring='accuracy', cv=10)
		results.append((train_score, np.mean(cv_scores)))
	return results

示例#15

0

显示文件

文件： test_CountVector.py 项目： luoyan/miniprog

def test_BernouliNB4():
    """Fit BernoulliNB on ten binary 2-feature points with binary labels and
    print the prediction for the probe point [1, 1]."""
    X = np.array([
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [0, 0],
        [0, 0],
        [1, 0],
    ])
    print('X ' + str(X))
    Y = np.array([1, 1, 0, 1, 0, 0, 0, 1, 1, 0])
    print('Y ' + str(Y))
    clf = BernoulliNB(alpha = 1)
    clf.fit(X, Y)
    X2 = np.array([
        [1, 1],
    ])
    for i, sample in enumerate(X2):
        # Estimators expect a 2-D (n_samples, n_features) input, so a single
        # sample is reshaped into a one-row matrix.
        pred_ret = clf.predict(sample.reshape(1, -1))
        print('X[' + str(i) + '] = ' + str(sample) + ' pred_ret ' + str(pred_ret))

示例#16

0

显示文件

文件： extra_trees_classifier.py 项目： bigning/kaggle_bnp

def MungeData(train, test, validation):
    """Replace each object-typed feature (except ``v22``) with a naive-Bayes
    probability in all three frames.

    For every such column from the third column onward, the column is
    binarized via ``Binarize``, a BernoulliNB is fitted on the binarized
    training columns against ``train.target``, and the column is overwritten
    with the predicted probability at class index 1, cast to float.
    Returns the processed ``(train, test, validation)`` frames.
    """
    features = train.columns[2:]
    print(type(features))
    for col in features:
        if train[col].dtype != 'object' or col == "v22":
            continue
        print(col)
        train, binfeatures = Binarize(col, train)
        test, _ = Binarize(col, test, binfeatures)
        validation, _ = Binarize(col, validation, binfeatures)
        # Names of the binarized helper columns created for this feature.
        binary_cols = col+'_'+binfeatures
        nb = BernoulliNB()
        nb.fit(train[binary_cols].values, train.target.values)
        frames = (train, test, validation)
        for frame in frames:
            frame[col] = nb.predict_proba(frame[binary_cols].values)[:, 1]
        for frame in frames:
            frame.drop(binary_cols, inplace=True, axis=1)
        for frame in frames:
            frame[col] = frame[col].astype(float)
    return train, test, validation

示例#17

0

显示文件

文件： naive_bayes.py 项目： phecy/cdips-kaggle

def main(output_file=time.strftime('%h%d-%Hh%Mm')+'.csv', in_pkl=None):
    """Fit a Bernoulli naive Bayes model and write a ranked submission file.

    output_file -- path of the CSV to write. The default name is built from
                   the date/time at which this module was imported, because
                   default arguments are evaluated once at definition time.
    in_pkl      -- joblib file holding (trainFeatures, trainTargets,
                   trainItemIds, testFeatures, testItemIds); required.

    Returns an error string when ``in_pkl`` is missing, otherwise None.
    """
    logging.info("Loading features...")
    if not in_pkl:
        return "input .plk required"
    trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(in_pkl)
    logging.info("Loaded features, fitting model...")
    # Bernoulli Naive Bayes on features that are used as-is (binarize=None).
    clf = BernoulliNB(alpha=1.0, binarize=None, fit_prior=True)
    clf.fit(trainFeatures,trainTargets)
    logging.info("Predicting...")
    # Use log-probabilities of class index 1 instead of hard labels so the
    # items can be ranked.
    predicted_scores = clf.predict_log_proba(testFeatures).T[1]

    logging.info("Write results...")
    logging.info("Writing submission to %s" % output_file)
    # The context manager closes the file even if a write fails.
    with open(output_file, "w") as f:
        f.write("id\n")
        ranked = sorted(zip(predicted_scores, testItemIds), reverse = True)
        for pred_score, item_id in ranked:
            # Only item_id is written per the output spec; the score just
            # determines the order.
            f.write("%d\n" % (item_id))

    logging.info("Done.")

示例#18

0

显示文件

文件： DataSimulator.py 项目： ds-ga-1007/final_project

    def generatePredictingModel(data):
        """
            Build the model that predicts a video's category from its title
            and description.

            Two BernoulliNB models are trained on bag-of-words counts (one for
            titles, one for descriptions); a decision tree is then trained on
            their two predictions. 80% of ``data`` is used for training.

            Returns (tree, title model, description model, title vectorizer,
            description vectorizer).
            Raises VideoAnalysisException if anything goes wrong while building.
        """
        try:
            # Initialize a timer to compute the time needed to build the model
            start = time.time()

            # Split into train-test data set
            X = data[[x for x in data.columns if x in ('title', 'description')]]
            # Exact match: ``x in ('video_category_id')`` tested against a
            # plain string, i.e. it also selected any column whose name is a
            # substring of 'video_category_id'.
            Y = data[[x for x in data.columns if x == 'video_category_id']]
            X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.80, random_state = 10)

            # Build the 2 text corpora
            corpus_title = X_train['title'].values.tolist()
            corpus_description = X_train['description'].values.tolist()

            # Initialize the 2 vectorizers and learn their vocabularies
            count_vectorizer_title = CountVectorizer()
            count_vectorizer_description = CountVectorizer()
            count_vectorizer_title.fit(corpus_title)
            count_vectorizer_description.fit(corpus_description)

            # Build the sparse matrices
            X_train_count_title = count_vectorizer_title.transform(X_train['title'])
            X_train_count_description = count_vectorizer_description.transform(X_train['description'])
            X_test_count_title = count_vectorizer_title.transform(X_test['title'])
            X_test_count_description = count_vectorizer_description.transform(X_test['description'])

            # Set and train the models (for title and description features)
            model_count_title = BernoulliNB()
            model_count_description = BernoulliNB()
            model_count_title.fit(X_train_count_title, Y_train['video_category_id'])
            model_count_description.fit(X_train_count_description, Y_train['video_category_id'])

            # Stack the title and description predictions and train a tree on them
            new_df_train = pd.DataFrame()
            new_df_train['title_prediction'] = model_count_title.predict(X_train_count_title)
            new_df_train['description_prediction'] = model_count_description.predict(X_train_count_description)
            # The test-set predictions are assembled as before but not used here.
            new_df_test = pd.DataFrame()
            new_df_test['title_prediction'] = model_count_title.predict(X_test_count_title)
            new_df_test['description_prediction'] = model_count_description.predict(X_test_count_description)
            tree = DecisionTreeClassifier()
            tree.fit(new_df_train, Y_train)

            end = time.time()
            execution_time = end - start

            print("Time to build this incredibly amazing model, only : {} seconds!!!!!!".format(execution_time))
            time.sleep(3)

            return tree, model_count_title, model_count_description,count_vectorizer_title,count_vectorizer_description

        except Exception:
            # ``except Exception`` lets KeyboardInterrupt / SystemExit
            # propagate instead of being turned into a domain error.
            raise VideoAnalysisException(" Error while creation of predictive model ")

示例#19

0

显示文件

文件： naivebayes.py 项目： byted/ShelterAnimalOutcomeKaggle

def score(train_X, train_y):
    """Return the log loss of a BernoulliNB on a 1% hold-out split.

    The data is split with ``test_size=0.01`` and ``random_state=10``; the
    model is fitted on the larger part and its class probabilities on the
    hold-out part are scored with ``log_loss``.
    """
    split = train_test_split(train_X, train_y, test_size=0.01, random_state=10)
    X_train, X_valid, y_train, y_valid = split

    # NOTE(review): binarize=False behaves like a threshold of 0 - confirm
    # that "no binarization" (None) was not intended.
    model = BernoulliNB(binarize=False, fit_prior=True, alpha=0.7)
    model.fit(X_train, y_train)
    return log_loss(y_valid, model.predict_proba(X_valid))

示例#20

0

显示文件

文件： exp_sentiment.py 项目： appscluster/sentiment-CNN

	def testBoGNB(self):
		'''
		Test on sentiment analysis task using Naive Bayes classifier
		with Bag-of-Word feature vectors.

		Builds binary word-presence features over the joint vocabulary of the
		training and test sentences, fits a BernoulliNB and reports training
		accuracy, test accuracy and the class balance of both sets.
		'''
		def tokenize(sent):
			# Lower-cased whitespace tokens longer than two characters.
			return [word.lower() for word in sent.split() if len(word) > 2]

		# Preprocessing of original txt data set (each sentence tokenized once)
		train_tokens = [tokenize(sent) for sent in self.senti_train_txt]
		test_tokens = [tokenize(sent) for sent in self.senti_test_txt]
		wordlist = []
		for words in train_tokens:
			wordlist.extend(words)
		for words in test_tokens:
			wordlist.extend(words)
		word_dict = set(wordlist)
		word2index = dict(zip(word_dict, range(len(word_dict))))
		# Build BoG feature
		train_size = len(self.senti_train_txt)
		test_size = len(self.senti_test_txt)
		pprint('Training set size: %d' % train_size)
		pprint('Test set size: %d' % test_size)
		# The builtin float is what the np.float alias referred to; newer
		# NumPy no longer provides the alias.
		train_feat = np.zeros((train_size, len(word_dict)), dtype=float)
		test_feat = np.zeros((test_size, len(word_dict)), dtype=float)
		# Using binary feature
		start_time = time.time()
		for i, words in enumerate(train_tokens):
			# A real list is needed for fancy indexing (map() is lazy on Python 3).
			indices = [word2index[word] for word in words]
			train_feat[i, indices] = 1.0
		for i, words in enumerate(test_tokens):
			indices = [word2index[word] for word in words]
			test_feat[i, indices] = 1.0
		end_time = time.time()
		pprint('Finished building training and test feature matrix, time used: %f seconds.' % (end_time-start_time))
		pprint('Classification using Bernoulli Naive Bayes classifier: ')
		clf = BernoulliNB()
		clf.fit(train_feat, self.senti_train_label)
		train_pred_label = clf.predict(train_feat)
		train_acc = np.sum(train_pred_label == self.senti_train_label) / float(train_size)
		pprint('Training accuracy = %f' % train_acc)
		pred_label = clf.predict(test_feat)
		acc = np.sum(pred_label == self.senti_test_label) / float(test_size)
		pprint('Accuracy: %f' % acc)
		train_pos_count = np.sum(self.senti_train_label == 1)
		train_neg_count = np.sum(self.senti_train_label == 0)
		test_pos_count = np.sum(self.senti_test_label == 1)
		test_neg_count = np.sum(self.senti_test_label == 0)
		pprint('Positive count in training set: %d' % train_pos_count)
		pprint('Negative count in training set: %d' % train_neg_count)
		pprint('Ratio: pos/neg = %f' % (float(train_pos_count) / train_neg_count))
		pprint('Positive count in test set: %d' % test_pos_count)
		pprint('Negative count in test set: %d' % test_neg_count)
		pprint('Ratio: pos/neg = %f' % (float(test_pos_count) / test_neg_count))

示例#21

0

显示文件

文件： ran_inference.py 项目： rAnYKM/rAnPrivGP

 def nb_classifier(self, secret):
     """Train a BernoulliNB for ``secret`` on feature-selected attributes.

     Returns ``(classifier, feature selector, evaluation)``, where the
     evaluation compares the classifier's predictions on its own training
     data with the true labels.
     """
     attributes = self.raw_attr_vector(secret)
     labels = self.get_labels(secret)
     selector = self.feature_sel(secret)
     selected = selector.transform(attributes)
     model = BernoulliNB()
     model.fit(selected, labels)
     predicted = model.predict(selected)
     return model, selector, self.evaluate(predicted, labels)

示例#22

0

显示文件

文件： main.py 项目： TythonLee/kddcup-2015

def bnb(X,y,Z,test_data):
    """Fit a default BernoulliNB on ``X`` / ``y`` and write the probability at
    class index 1 for each row of ``Z`` to a CSV keyed by enrollment id."""
    from sklearn.naive_bayes import BernoulliNB
    classifier = BernoulliNB()
    classifier.fit(X, y)
    positive_probs = classifier.predict_proba(Z)[:, 1]
    sub = pd.DataFrame({'enrollment_id': test_data["enrollment_id"],
                        'truth': positive_probs})
    sub = sub.set_index("enrollment_id")
    sub.to_csv('data\\result\\sixth_bnb.csv')

示例#23

0

显示文件

文件： sentiment.py 项目： jannson/crfseg

def train(neg=None, pos=None):
    """Train a BernoulliNB sentiment classifier on TF-IDF features and save it.

    neg -- path to the negative examples (one per line, UTF-8); defaults to
           ``../origin/neg.txt`` relative to this file.
    pos -- path to the positive examples; defaults to ``../origin/pos.txt``.

    The fitted classifier is dumped to ``data/sgdc_clf.pkl`` next to this
    file, and the prediction for one sample sentence is printed.
    """
    the_file = os.path.dirname(os.path.abspath(__file__))
    if not neg:
        neg = os.path.join(the_file, '..', 'origin', 'neg.txt')
    if not pos:
        pos = os.path.join(the_file, '..', 'origin', 'pos.txt')

    tagger = crfseg.create_tagger()

    def tok_cn(x):
        # Segment Chinese text with the CRF tagger created above.
        return crfseg.cut_zh(x, tagger)

    tfidf = TfidfVectorizer(tokenizer=tok_cn, sublinear_tf=True, max_df=0.5)
    pipe = Pipeline([
        ('tfidf', tfidf),
        ])

    clf = BernoulliNB()

    # Read both corpora with context managers so the files are closed.
    with codecs.open(neg, 'r', 'utf-8') as neg_file:
        neg_lines = list(neg_file)
    with codecs.open(pos, 'r', 'utf-8') as pos_file:
        pos_lines = list(pos_file)

    # Negative examples are labelled 0, positive examples 1.
    x_train = neg_lines + pos_lines
    y_train = [0] * len(neg_lines) + [1] * len(pos_lines)

    print('begin transform')
    x_train = pipe.fit_transform(x_train)
    print('begin fit')
    clf.fit(x_train, y_train)

    print('begin save')
    # NOTE(review): only the classifier is persisted; the fitted vectorizer
    # is not saved here - confirm that is intended.
    clf_file = os.path.join(the_file, 'data', 'sgdc_clf.pkl')
    _ = joblib.dump(clf, clf_file, compress=9)

    print('begin test')
    x_test = [u'这个东西真心很赞']
    x_test = pipe.transform(x_test)
    print(clf.predict(x_test))

示例#24

0

显示文件

文件： naive_bayes_noFS.py 项目： poddar/predictive

def BernoulliNB_pred(X_train, X_test, y_train):
    """Fit a default BernoulliNB and return the class-index-1 probabilities
    as ``(test probabilities, train probabilities)``."""
    model = BernoulliNB()
    model.fit(X_train, y_train)

    train_probs = model.predict_proba(X_train)[:, 1]
    test_probs = model.predict_proba(X_test)[:, 1]
    return test_probs, train_probs

示例#25

0

显示文件

文件： classification.py 项目： FindBoat/Kaggle

def bernoulli_naive_bayes(x_train, y_train, x_cv, y_cv):
    """Fit a default BernoulliNB, print its accuracy on the training and
    cross-validation sets, and return the fitted classifier."""
    print('Training with NB...')
    model = BernoulliNB()
    model.fit(x_train, y_train)

    train_accuracy = model.score(x_train, y_train)
    print('Accuracy in training set: %f' % train_accuracy)
    cv_accuracy = model.score(x_cv, y_cv)
    print('Accuracy in cv set: %f' % cv_accuracy)
    return model

示例#26

0

显示文件

文件： 01_withoutNAPattern.py 项目： BitTigerKaggle/bnp-chenditc-repo1

def convertToNumeric(df):
    """Add a ``<col>_binarized`` column for every object-typed feature.

    For each object-typed column from the third column onward, the column is
    label-encoded via ``labelEncode``, a BernoulliNB is fitted on it against
    ``df['target']``, and the predicted probability at class index 1 is
    stored in the new column. ``df`` is modified in place.
    """
    for col in df.columns[2:]:
        if df[col].dtype != 'object':
            continue
        print("Converting {0} to numerical data".format(col))
        labelEncode(df, col)
        encoded = df[[col]]
        nb = BernoulliNB()
        nb.fit(encoded, df['target'])
        df[col + "_binarized"] = nb.predict_proba(encoded)[:, 1]

示例#27

0

显示文件

文件： Classify.py 项目： tbs1980/Kaggle_DecMeg2014

def BernoulliNaiveBayes(x_train, y_train, x_cv, y_cv):
	"""
	Return a default BernoulliNB fitted on the training data.

	The cross-validation arguments are accepted but not used.
	"""
	model = BernoulliNB()
	model.fit(x_train, y_train)
	return model

示例#28

0

显示文件

文件： test_CountVector.py 项目： luoyan/miniprog

def test_BernouliNB():
    """Fit BernoulliNB on six random binary 100-feature rows and print the
    prediction for each training row."""
    X = np.random.randint(2, size=(6, 100))
    print('X ' + str(X))
    Y = np.array([1, 2, 3, 4, 4, 5])
    print('Y ' + str(Y))
    clf = BernoulliNB()
    clf.fit(X, Y)
    for i, sample in enumerate(X):
        # Estimators expect a 2-D (n_samples, n_features) input, so a single
        # sample is reshaped into a one-row matrix.
        pred_ret = clf.predict(sample.reshape(1, -1))
        print('X[' + str(i) + '] = ' + str(sample) + ' pred_ret ' + str(pred_ret))

示例#29

0

显示文件

文件： naive_bayes.py 项目： TomWerner/sentiment_analysis

def evaluate_baseline():
    """Log the baseline BernoulliNB performance on the aclImdb data.

    Logs the mean and standard deviation of 10-fold cross-validation scores
    on the training set, then fits on the full training set and logs the
    accuracy on the test set.
    """
    inputs, outputs, words = preprocessing.build_data_target_matrices(
        "aclImdb/train/pos", "aclImdb/train/neg", binary_output=True)
    tst_inputs, tst_outputs, _ = preprocessing.build_test_data_target_matrices(
        "aclImdb/test/pos", "aclImdb/test/neg", words, binary_output=True)
    model = BernoulliNB()
    train_labels = outputs.ravel()

    scores = cross_val_score(model, inputs, train_labels, cv=10)
    logging.info("Accuracy for %s: %.02f, std: %.02f" % ("Baseline BernoulliNB", scores.mean(), scores.std()))

    model.fit(inputs, train_labels)
    test_accuracy = accuracy_score(tst_outputs.ravel(), model.predict(tst_inputs))
    logging.info(test_accuracy)

示例#30

0

显示文件

文件： model.py 项目： di0cvg/MEDIA_PROJECT

class NaiveBayes(StatModel):
	"""StatModel wrapper around a default-configured BernoulliNB."""

	def __init__(self):
		self.model = BernoulliNB()
		self.name  = "nb"

	def train(self, samples, labels):
		"""Fit the wrapped classifier on ``samples`` / ``labels``."""
		self.model.fit(samples, labels)

	def predict(self, samples):
		"""Return the wrapped classifier's predictions for ``samples``."""
		predictions = self.model.predict(samples)
		return predictions

示例#31

0

显示文件

文件： model_training_bagOfwords.py 项目： sacred-deer/Sentiment-Analysis-of-Movie-Reviews

    plt.ylabel('F1 Score')
    plt.xlabel('Log (' + param_name + ')')
    plt.title('Plot - Validation Set Performance of ' + classifier_name +
              ' w.r.t. ' + param_name)
    plt.show()


# Naive bayes classifier

print("Bernoulli Naive Bayes Classifier")
# Tune the smoothing hyper-parameter alpha: sweep it geometrically from
# alpha_from up to (but excluding) alpha_to, multiplying by alpha_step, and
# record the validation F1 score of a BernoulliNB trained at each value.
# NOTE(review): assumes alpha_from > 0 and alpha_step > 1; otherwise
# math.log10 fails or the loop never ends - confirm where they are defined.
hp_f1 = []  # rows of [log10(alpha), F1 score, alpha]
a = alpha_from
while a < alpha_to:
    classifier = BernoulliNB(alpha=a)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_valid)
    score = f1_score(y_valid, y_pred, average=f1_avg_param)
    hp_f1.append([math.log10(a), score, a])
    print("Alpha " + str(a) + " : " + str(score))
    a *= alpha_step
# Select the row with the highest F1 score; its third entry is the alpha.
selected_alpha = max(hp_f1, key=lambda item: item[1])
print("Alpha with best performance : " + str(selected_alpha[2]))

# Plot F1 score against log10(alpha); the whole selected row is handed over.
performance_plot(
    np.asarray(hp_f1)[:, 0],
    np.asarray(hp_f1)[:, 1], selected_alpha, "Naive Bayes classifier", "Alpha")

#Training the classifier on the selected alpha

示例#32

0

显示文件

文件： Assignment 4.py 项目： deepakorantak/Python

# Bag-of-words representation of every processed document.
corpus = [dictionary.doc2bow(text) for text in processed_texts]

# ## Initializing TFIDF parameters from corpus
tfidf = models.TfidfModel(corpus)

# ## Creating TFIDF Matrix from data
corpus_tfidf = tfidf[corpus]
print(corpus_tfidf.obj)

## Creating LSA model on the tfidf
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=120)
lsi.print_topics(10)

import numpy as np

# Project the TF-IDF vectors, the same representation the LSI model was
# trained on (the original projected the raw bag-of-words counts). Rows are
# filled by topic id because gensim may leave out topics with negligible
# weight, which would otherwise produce rows of different lengths.
lsi_corpus = np.zeros((len(corpus), lsi.num_topics))
for doc_index, lsi_doc in enumerate(lsi[corpus_tfidf]):
    for topic_id, weight in lsi_doc:
        lsi_corpus[doc_index, topic_id] = weight
print(lsi_corpus.shape)

from sklearn.naive_bayes import BernoulliNB
nb_model = BernoulliNB()
nb_model.fit(lsi_corpus, all_categories)

from sklearn.metrics import accuracy_score

# NOTE: the accuracy below is measured on the same documents the model was
# trained on, despite the wording of the message.
accuracy = accuracy_score(all_categories, nb_model.predict(lsi_corpus))
print('Accuracy on test data: {}%'.format(accuracy * 100))

示例#33

0

显示文件

文件： model.py 项目： anukat2015/dossier.models

def extract(positive_fcs, negative_fcs, features=None):
    '''Rank feature keys by how predictive they are of each label.

    Takes a labeled set of feature collections (positive and negative)
    and the features wanted, and trains a Bernoulli Naive Bayes
    classifier on the underlying keys of the selected features.

    ``*_fcs`` is the list of feature collections, positive label and
            negative label respectively. They are concatenated with
            ``+``, so both must be lists.

    ``features`` designates which specific features get vectorized;
               the other features are ignored. If ``None``, every
               feature present in each feature collection is used.

    Returns two lists of ``(keyword, strength)`` tuples ordered by
    decreasing strength (the classifier's per-class feature log
    probability). The first holds the keys scored for the positive
    label and the second the keys scored for the negative label.

    '''

    # Vector of labels: 1 for every positive fc, 0 for every negative fc,
    # in the same order the fcs are iterated below.
    labels = np.array([1] * len(positive_fcs) + [0] * len(negative_fcs))

    # Used to convert the feature collection keys into a sklearn
    # compatible format
    v = DictVectorizer(sparse=False)

    D = list()
    for fc in (positive_fcs + negative_fcs):
        feat = StringCounter()

        if not fc:
            logger.warning('how did we get an empty fc? %r', fc)

        else:
            # The features used to pull the keys for the classifier.
            # ``features=None`` means "use all of them", as documented;
            # iterating ``None`` used to raise a TypeError here.
            # NOTE(review): assumes every feature in the fc can be added to
            # a StringCounter — confirm for collections with mixed types.
            selected = fc.keys() if features is None else features
            for f in selected:
                feat += fc[f]

        # An empty fc still contributes an (empty) row so that D stays
        # aligned with ``labels``.
        D.append(feat)

    # Convert the list of Counters into an sklearn compatible format
    X = v.fit_transform(D)

    # Fit the sklearn Bernoulli Naive Bayes classifier
    clf = BernoulliNB()
    clf.fit(X, labels)

    # Extract the learned per-class feature weights, mapped back to their
    # keys. The row is sliced (``[1:2]``) rather than indexed so that
    # inverse_transform receives the 2-D array it expects.
    positive_keywords = v.inverse_transform(clf.feature_log_prob_[1:2])[0]
    negative_keywords = v.inverse_transform(clf.feature_log_prob_[0:1])[0]

    pos_words = Counter(positive_keywords)
    neg_words = Counter(negative_keywords)

    # Make lists ordered by weight, strongest first
    by_weight = operator.itemgetter(1)
    pos_ordered = sorted(pos_words.items(), key=by_weight, reverse=True)
    neg_ordered = sorted(neg_words.items(), key=by_weight, reverse=True)

    return pos_ordered, neg_ordered

示例#34

0

显示文件

文件： Untitled23.py 项目： mshivi123/machinelearning_prob

y_train

# In[16]:

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

# In[17]:

myber = BernoulliNB()
mygau = GaussianNB()
mymul = MultinomialNB()

# In[19]:

mygaumodel = mygau.fit(x_train, y_train)
mybermodel = myber.fit(x_train, y_train)
mymulmodel = mymul.fit(x_train, y_train)

# In[20]:

ypgau = mygaumodel.predict(x_test)
ypber = mybermodel.predict(x_test)
ypmul = mymulmodel.predict(x_test)

# In[21]:

from sklearn import metrics

# In[24]:

acc_gau = metrics.accuracy_score(y_target, ypgau)

示例#35

0

显示文件

from __future__ import division
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
import sys
import image_util

(train_set, train_label,
 count_label) = image_util.load_dataset(image_util.DS2_TRAIN_PATH,
                                        image_util.DS2_LABEL_SIZE)

# clf = GaussianNB()
clf = BernoulliNB()
clf.fit(train_set, train_label)

(val_set, val_label,
 val_count_label) = image_util.load_dataset(image_util.DS2_VAL_PATH,
                                            image_util.DS2_LABEL_SIZE)

predictions = clf.predict(val_set)

correct_count = 0
for row in range(image_util.DS2_VAL_SIZE):
    print("prediction: " + str(predictions[row]))
    print("actual: " + str(val_label[row]))

    if predictions[row] == val_label[row]:
        correct_count = correct_count + 1

print(correct_count / image_util.DS2_VAL_SIZE)

示例#36

0

显示文件

文件： Classifiers.py 项目： hd0125/SpamMessage-1

def bernoulli_naive_bayes_classifier(train_x, train_y):
    """Fit and return a Bernoulli Naive Bayes model.

    ``train_x`` and ``train_y`` are passed straight to ``BernoulliNB.fit``.
    The model is created with smoothing ``alpha=0.01``.
    """
    # Import the class that is actually instantiated below; the previous
    # local import brought in MultinomialNB, leaving BernoulliNB unresolved
    # unless it happened to be imported elsewhere in the module.
    from sklearn.naive_bayes import BernoulliNB
    model = BernoulliNB(alpha=0.01)
    model.fit(train_x, train_y)
    return model

示例#37

0

显示文件

文件： bernoulliNBclassifier.py 项目： Jtrice/CS-175-project

def bernNBClassifier(trainingVectors, targetValues):
    """Fit a default ``BernoulliNB`` on the given vectors and return it.

    Each sample is weighted by ``targetValues * 10000`` during fitting.
    """
    model = BernoulliNB()
    # NOTE(review): the weights are derived from the labels themselves, so
    # any sample whose target is 0 gets weight 0 — confirm this is intended.
    weights = targetValues * 10000
    model.fit(trainingVectors, targetValues, sample_weight=weights)
    return model

示例#38

0

显示文件

print("\n" + "SVC_classifier")
log_model3 = LinearSVC()
log_model3 = log_model3.fit(X=X_train, y=y_train)
y_pred = log_model3.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

from sklearn.naive_bayes import MultinomialNB, BernoulliNB

# MultinomialNB_classifier
print("\n" + "MultinomialNB")
log_model_multinomial = MultinomialNB()
log_model_multinomial = log_model_multinomial.fit(X=X_train, y=y_train)
y_pred = log_model_multinomial.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

# BernoulliNB classifier
print("\n" + "BernoulliNB")
log_model_bernoulli = BernoulliNB()
log_model_bernoulli = log_model_bernoulli.fit(X=X_train, y=y_train)
y_pred = log_model_bernoulli.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

示例#39

0

显示文件

文件： tpot_mnist_pipeline_triangulateAggregationLevelParticipantSplitaggr_4_groups2.py 项目： brains-on-code/conducting-and-analyzing-human-studies

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.9051851851851852
exported_pipeline = BernoulliNB(alpha=0.001, fit_prior=True)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

示例#40

0

显示文件

文件：伯努利贝叶斯文本分类.py 项目： zjutcy/data-mining

# Build the word-vector table
data = pd.read_excel('C:/Users/64191/Desktop/Contents.xlsx', sheetname=0)
# Strip ASCII digits and letters from the text. The character class used to
# read `a-zA-A`, which removed only the capital letter 'A' instead of A-Z.
# NOTE(review): relies on str.replace treating the pattern as a regex by
# default; newer pandas releases need regex=True (and spell the read_excel
# argument `sheet_name`) — confirm the pandas version this targets.
data.Content = data.Content.str.replace('[0-9a-zA-Z]', '')
# Register the custom dictionary with jieba before any segmentation happens.
jieba.load_userdict(r'C:/Users/64191/Desktop/all_words.txt')
# One stop word per line; only the trailing newline is stripped.
with open(r'C:/Users/64191/Desktop/mystopwords.txt', encoding='UTF-8') as f:
    stop_words = [i.strip('\n') for i in f.readlines()]


def cut(x):
    """Segment ``x`` with jieba and drop stop words.

    Returns the remaining tokens, in their original order, joined by single
    spaces.
    """
    kept = [token for token in jieba.lcut(x) if token not in stop_words]
    return ' '.join(kept)


word = data.Content.apply(cut)
counts = CountVectorizer(min_df=0.01)
data_matrix = counts.fit_transform(word).toarray()
#进行分类与测试
X = pd.DataFrame(data_matrix, columns=counts.get_feature_names())
Y = data.Type
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=1)
bnb = BernoulliNB()
bnb.fit(X_train, Y_train)
pred = bnb.predict(X_test)
print(classification_report(Y_test, pred))

示例#41

0

显示文件

文件： NaiveBayes_TEST.py 项目： Ryuk17/MachineLearning

trainData = pd.read_table('../dataset1/train.txt',
                          header=None,
                          encoding='gb2312',
                          delim_whitespace=True)
testData = pd.read_table('../dataset1/test.txt',
                         header=None,
                         encoding='gb2312',
                         delim_whitespace=True)
trainLabel = np.array(trainData.pop(3))
trainData = np.array(trainData)
testLabel = np.array(testData.pop(3))
testData = np.array(testData)

time_start1 = time.time()
clf1 = BayesClassifier()
clf1.train(trainData, trainLabel)
clf1.predict(testData)
score1 = clf1.accuarcy(testLabel)
time_end1 = time.time()
print("Accuracy of self-Bayes: %f" % score1)
print("Runtime of self-Bayes:", time_end1 - time_start1)

time_start = time.time()
clf = BernoulliNB()
clf.fit(trainData, trainLabel)
clf.predict(testData)
score = clf.score(testData, testLabel, sample_weight=None)
time_end = time.time()
print("Accuracy of sklearn-Bayes: %f" % score)
print("Runtime of sklearn-Bayes:", time_end - time_start)

示例#42

0

显示文件

文件： Q1.py 项目： XilunWu/courses

 clf_sum = 0
 lr_sum = 0
 svm_li_sum = 0
 svm_rbf_sum = 0
 i = -1
 for train, test in kf:
     #NBC
     i += 1
     train_1 = train_disc_data[train]
     train_2 = train_conti_data[train]
     test_1 = train_disc_data[test]
     test_2 = train_conti_data[test]
     train_true = train_target[train]
     test_true = train_target[test]
     clf_train_disc = BernoulliNB()
     clf_train_disc.fit(train_1, train_true)
     clf_train_conti = GaussianNB()
     clf_train_conti.fit(train_2, train_true)
     result1 = clf_train_disc.predict_proba(test_1)
     result2 = clf_train_conti.predict_proba(test_2)
     result_arr = np.zeros(len(test), dtype=int)
     for index in range(len(test)):
         result_a = result1[index, 0] * result2[index, 0]
         result_b = result1[index, 1] * result2[index, 1]
         if (result_a < result_b): result_arr[index] = 1
         else: result_arr[index] = 0
     clf_sum += f1_score(test_true, result_arr)
     if (k_value == 50):
         t_test_value[i, 0] = f1_score(test_true, result_arr)
     #logistic
     lr_data = np.column_stack((train_conti_data, train_disc_data))

示例#43

0

显示文件

文件： ParkinsonDataAllProcess.py 项目： gulcesaydur/Parkinson-Data

testdf = testdf.dropna()
testLabel = testLabel.dropna()
testLabel = testLabel.apply(int)

try:
    svmModel = svmAlg.fit(trainDf, trainLabel)
    svmpred = svmModel.predict(testdf)
    svmAcc = accuracy_score(testLabel, svmpred)
    print 'SVM Accuracy : ' + str(svmAcc)

    gnbModel = gnb.fit(trainDf, trainLabel)
    gnbpred = gnbModel.predict(testdf)
    gnbAcc = accuracy_score(testLabel, gnbpred)
    print 'GNB Accuracy : ' + str(gnbAcc)

    bnbModel = bnb.fit(trainDf, trainLabel)
    bnbpred = bnbModel.predict(testdf)
    bnbAcc = accuracy_score(testLabel, bnbpred)
    print 'BNB Accuracy : ' + str(bnbAcc)

    treeModel = tree.fit(trainDf, trainLabel)
    treepred = treeModel.predict(testdf)
    treeAcc = accuracy_score(testLabel, treepred)
    print 'Decision Tree Accuracy : ' + str(treeAcc)

    rndModel = rnd.fit(trainDf, trainLabel)
    rndpred = rndModel.predict(testdf)
    rndAcc = accuracy_score(testLabel, rndpred)
    print 'Random Forest Accuracy : ' + str(rndAcc)
except Exception, e:
    print 'model error ' + str(e)

示例#44

0

显示文件

#############################SUPPORT VECTOR MACHINES###########################

from sklearn.svm import SVC

svc = SVC(verbose=True, random_state=0)

svc.fit(X_train, y_train)

############################### Naive Bayes ##################################

from sklearn.naive_bayes import BernoulliNB

# NOTE(review): `binarize` is a numeric threshold in scikit-learn; passing
# True presumably acts as a threshold of 1 (True == 1), so only feature
# values greater than 1 would map to 1 — confirm this is intended.
BernNB = BernoulliNB(binarize=True)

# Fit on the same training split that the SVC above was fitted on.
BernNB.fit(X_train, y_train)

#### 3 SINGLE LAYER NEURAL NETWORK - PERCEPTRON ###############################

X_ann = X.copy()
X_train_ann = np.array(X_train.copy())
X_test_ann = np.array(X_test.copy())
y_train_ann = np.array(y_train.copy())


class NeuralNetwork():
    def __init__(self):
        np.random.seed(4)
        self.synaptic_weights = 2 * np.random.random((12, 1)) - 1

    def sigmoid(self, x):

示例#45

0

显示文件

from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Train KNeighborsClassifier Model
KNN_Classifier = KNeighborsClassifier(n_jobs=-1)
KNN_Classifier.fit(X_train, Y_train)

# Train LogisticRegression Model
LGR_Classifier = LogisticRegression(n_jobs=-1, random_state=0)
LGR_Classifier.fit(X_train, Y_train)

# Train Bernoulli Naive Bayes Model
BNB_Classifier = BernoulliNB()
BNB_Classifier.fit(X_train, Y_train)

# Train Decision Tree Model
DTC_Classifier = tree.DecisionTreeClassifier(criterion='entropy',
                                             random_state=0)
DTC_Classifier.fit(X_train, Y_train)
#Evaluate Models
from sklearn import metrics

models = []
models.append(('Naive Baye Classifier', BNB_Classifier))
models.append(('Decision Tree Classifier', DTC_Classifier))
models.append(('KNeighborsClassifier', KNN_Classifier))
models.append(('LogisticRegression', LGR_Classifier))

for i, v in models:

示例#46

0

显示文件

文件： BNB_sentiment.py 项目： thekana/cs9414-opinion-mining

X_train, X_test, y_train, y_test = train_test_split(
    text_data, Y, test_size=0.25, shuffle=False)

count = CountVectorizer(preprocessor=myPreprocessor,
                        lowercase=False, tokenizer=myTokenizer, max_features=size)

X_train = count.fit_transform(X_train).toarray()
print("----------Train vector------------", len(X_train))
print(X_train)
X_test = count.transform(X_test).toarray()
print("----------Test vector------------", len(X_test))
print(X_test)

start_time = time.time()
clf = BernoulliNB()
model = clf.fit(X_train, y_train)
training_time = (time.time() - start_time)

# print(y_test, y_pred)
# print(model.predict_proba(X_test))
# print(precision_score(y_test, y_pred, average='micro'))
# print(recall_score(y_test, y_pred, average='micro'))
# print(f1_score(y_test, y_pred, average='micro'))
# print(f1_score(y_test, y_pred, average='macro'))

y_pred = model.predict(X_test)
# print(classification_report(y_test, y_pred))
# print('Accuracy score:', accuracy_score(y_test, y_pred))
testtime = time.time() - start_time
test_report = classification_report(y_test, y_pred, output_dict=True)

示例#47

0

显示文件

文件： 3.Kaggle_San_Francisco_Crime_Classification.py 项目： LittleHeap/MachineLearning-Projects

'''
    模型搭建
'''

# 只取星期几和街区作为分类器输入特征
features = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'BAYVIEW', 'CENTRAL',
            'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']

# 分割训练集(3/5)和测试集(2/5)
training, validation = train_test_split(trainData, train_size=.60)

# 朴素贝叶斯建模，计算log_loss
model = BernoulliNB()
nbStart = time.time()
model.fit(training[features], training['crime'])
nbCostTime = time.time() - nbStart
predicted = np.array(model.predict_proba(validation[features]))
print("朴素贝叶斯建模耗时 %f 秒" % (nbCostTime))
# 朴素贝叶斯建模耗时 0.591072 秒
print("朴素贝叶斯log损失为 %f" % (log_loss(validation['crime'], predicted)))
# 朴素贝叶斯log损失为 2.615596

# 逻辑回归建模，计算log_loss
model = LogisticRegression(C=.01)
lrStart = time.time()
model.fit(training[features], training['crime'])
lrCostTime = time.time() - lrStart
predicted = np.array(model.predict_proba(validation[features]))
log_loss(validation['crime'], predicted)
print("逻辑回归建模耗时 %f 秒" % (lrCostTime))

示例#48

0

显示文件

文件： SVM_LR_DT.py 项目： KramerJack/Python-Scripts

plt.title('ROC curve for SMV Fraud Classification')
plt.xlabel('False Positive Rate (1-Specificity)')
plt.ylabel('True Possitive Rate (Sensitivity)')
plt.grid(True)
plt.show()

#END SVM MODEL

#START NAIVE BAYES MODEL

#Creating the train and test populations (33% held out as the testing data set) for Naive Bayes and Decision Tree
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, Y1, test_size = .33, random_state = 17)

#NB1 BernoulliNB
BernNB = BernoulliNB(binarize = 0.025) # use either 0.025 0.1 or True
BernNB.fit(X1_train, Y1_train)
print(BernNB)

Y1_expect = Y1_test
Y1_pred = BernNB.predict(X1_test)
print(accuracy_score(Y1_expect, Y1_pred))

#BernNB Evalutation
#Confusion Matrix
confusion_matrix(Y1_expect, Y1_pred)
#AUCROC Curve
fpr, tpr, thresholds = metrics.roc_curve(Y1_expect, Y1_pred)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve for Beroulli Naives bays Fraud Classification')

示例#49

0

显示文件

文件： code.py 项目： philiphossu/MachineLearning-CS584

# Setup data structures to hold train, test results
train_jll = np.zeros((10, 15))
test_jll = np.zeros((10, 15))

for i in range(0, 10):
    idx = 0
    # Split datasets
    x_train, x_test, y_train, y_test = train_test_split(Xs[i],
                                                        ys[i],
                                                        test_size=1. / 3,
                                                        random_state=7000)
    for j in alphas:
        # 1. Create new Bernoulli Naive Bayes model using alpha value
        mod = BernoulliNB(alpha=j)
        # Fit the model to the training set
        mod.fit(x_train, y_train)
        # Compute the joint log likelihood for the training set, store it train_jll 2d array
        total_res = mod._joint_log_likelihood(x_train)
        y_train_binary = y_train * 1
        entry_val = 0
        # Sum-up by matching true labels
        for k in range(0, len(y_train)):
            entry_val += total_res[k][y_train_binary[k]]
        # Store result
        train_jll[i][idx] = entry_val
        # 2. Compute the joint log likelihood for the testing set, store it test_jll 2d array
        total_res = mod._joint_log_likelihood(x_test)
        y_test_binary = y_test * 1
        entry_val = 0
        # Sum-up by matching true labels
        for k in range(0, len(y_test)):

示例#50

0

显示文件

le.fit(dataset["Sex"])
dataset["Sex"] = le.transform(dataset["Sex"])

#assigning DV to y and IDV to x
y = dataset["Pclass"]
X = dataset[["Survived", "Sex", "Age", "SibSp", "Parch", "Fare"]]

print(y.count())

#training the model
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

#applying naive bayes algorithm
from sklearn.naive_bayes import BernoulliNB

clf = BernoulliNB()

#prediction
y_pred = clf.fit(X_train, y_train).predict(X_test)

#accuracy score
print("The accuracy score is : ", accuracy_score(y_test,
                                                 y_pred,
                                                 normalize=True))

#confusion matrix
print("The confusion matrix is: \n", confusion_matrix(y_test, y_pred))

示例#51

0

显示文件

文件： basic_classifier.py 项目： eshita-g/DataAnalyticsAndMining

classifier = SVC()
classifier2 = DecisionTreeClassifier()
classifier3 = BernoulliNB()
classifier4 = GaussianNB()

# shape,size,color
# spherical,oval,long
# small,medium,large

train_x = [[0, 1, 0], [0, 2, 1], [1, 2, 2], [0, 1, 2], [2, 1, 2], [0, 0, 1],
           [0, 0, 1], [0, 0, 1], [0, 0, 1], [0, 1, 0], [0, 1, 0], [0, 2, 1]]

train_y = [0, 1, 2, 3, 4, 5, 5, 5, 5, 0, 0, 1]

test_x = [[1, 2, 2], [0, 1, 0]]
# test_y = [2,5]
classifier.fit(train_x, train_y)
classifier2.fit(train_x, train_y)
classifier3.fit(train_x, train_y)
classifier4.fit(train_x, train_y)
prediction = classifier.predict(test_x)
prediction2 = classifier2.predict(test_x)
prediction3 = classifier3.predict(test_x)
prediction4 = classifier4.predict(test_x)

print("prediction  :", prediction)
print("prediction2 :", prediction2)
print("prediction3 :", prediction3)
print("prediction4 :", prediction4)

示例#52

0

显示文件

文件： build_matrix.py 项目： mvanderlyn27/multifunction_binary_classifier

def _load_pickle(file_path):
    """Return the object unpickled from ``file_path``, closing the file."""
    # NOTE: unpickling can execute arbitrary code. These files are expected
    # to be caches written by this script itself, never untrusted input.
    with open(file_path, "rb") as fp:
        return pk.load(fp)


def _dump_pickle(obj, file_path):
    """Pickle ``obj`` to ``file_path``, closing the file afterwards."""
    with open(file_path, "wb") as fp:
        pk.dump(obj, fp)


def main():
    """Compare Bernoulli NB and decision-tree models and write predictions.

    Steps:
      1. Load the training matrix/labels and the test matrix.
      2. Build (or load from ``./pickles``) four reduced representations:
         truncated SVD, RFE, RFECV (applied on top of RFE) and chi-square.
      3. Oversample each representation with SMOTE.
      4. Fit Bernoulli NB and decision-tree classifiers on the plain and
         the resampled variants.
      5. Print the macro F1 of every model, measured on the training data.
      6. Predict the test file with the chi-square NB model and write one
         prediction per line to ``test_file_prediction.dat``.
      7. Print the number of predictions and the elapsed time in seconds.
    """
    #start timer
    start = time.time()
    #import data from training file and test file
    sparse_matrix, ranks = importData("train_drugs.dat", True, 0)
    test_data = importData("test.dat", False, sparse_matrix.shape[1])

    #run dimensionality reduction on input data
    if (path.exists("./pickles/selector_tsvd.p")
            and path.exists("./pickles/sparse_matrix_tsvd.p")):
        selector_tsvd = _load_pickle("./pickles/selector_tsvd.p")
        sparse_matrix_tsvd = _load_pickle("./pickles/sparse_matrix_tsvd.p")
    else:
        svd = TruncatedSVD(n_components=200, n_iter=7, random_state=42)
        selector_tsvd = svd.fit(sparse_matrix, ranks)
        sparse_matrix_tsvd = selector_tsvd.transform(sparse_matrix)
        _dump_pickle(selector_tsvd, "./pickles/selector_tsvd.p")
        _dump_pickle(sparse_matrix_tsvd, "./pickles/sparse_matrix_tsvd.p")

    #run features selection on data to remove unimportant features
    #recursive features selection to remove most of the features that are least important
    if (path.exists("./pickles/selector_rfe.p")
            and path.exists("./pickles/sparse_matrix_rfe.p")):
        selector_rfe = _load_pickle("./pickles/selector_rfe.p")
        sparse_matrix_rfe = _load_pickle("./pickles/sparse_matrix_rfe.p")
    else:
        selector_rfe, sparse_matrix_rfe = rfeFeatureSelection(
            sparse_matrix, ranks)
        _dump_pickle(selector_rfe, "./pickles/selector_rfe.p")
        _dump_pickle(sparse_matrix_rfe, "./pickles/sparse_matrix_rfe.p")

    #recursive features selection with cross validation to choose best of remaining features
    if (path.exists("./pickles/selector_rfecv.p")
            and path.exists("./pickles/sparse_matrix_rfecv.p")):
        selector_rfecv = _load_pickle("./pickles/selector_rfecv.p")
        sparse_matrix_rfecv = _load_pickle("./pickles/sparse_matrix_rfecv.p")
    else:
        selector_rfecv, sparse_matrix_rfecv = rfecvFeatureSelection(
            sparse_matrix_rfe, ranks)
        _dump_pickle(selector_rfecv, "./pickles/selector_rfecv.p")
        _dump_pickle(sparse_matrix_rfecv, "./pickles/sparse_matrix_rfecv.p")

    #run chi^2 selection on original data to see how accurate it is
    # The cache check now looks at the same ./pickles paths that are written
    # below. It used to test "sparse_matrix_chi.p" in the working directory
    # and load the selector from the absolute path "/pickles/...", so the
    # cache was never hit (or the load failed).
    if (path.exists("./pickles/selector_chi.p")
            and path.exists("./pickles/sparse_matrix_chi.p")):
        sparse_matrix_chi = _load_pickle("./pickles/sparse_matrix_chi.p")
        selector_chi = _load_pickle("./pickles/selector_chi.p")
    else:
        selector_chi, sparse_matrix_chi = chiSquareSelection(
            sparse_matrix, ranks)
        _dump_pickle(sparse_matrix_chi, "./pickles/sparse_matrix_chi.p")
        _dump_pickle(selector_chi, "./pickles/selector_chi.p")

    #account for imbalanced data with SMOTE over sampling
    Orig_X_resampled, Orig_y_resampled = SMOTE().fit_resample(
        sparse_matrix.todense(), ranks)

    TSVD_X_resampled, TSVD_y_resampled = SMOTE().fit_resample(
        sparse_matrix_tsvd, ranks)

    # NOTE(review): the RFE resample is not consumed by any model below; the
    # call is kept so the sequence of SMOTE runs is unchanged.
    rfe_X_resampled, rfe_y_resampled = SMOTE().fit_resample(
        sparse_matrix_rfe, ranks)

    rfecv_X_resampled, rfecv_y_resampled = SMOTE().fit_resample(
        sparse_matrix_rfecv, ranks)

    chi_X_resampled, chi_y_resampled = SMOTE().fit_resample(
        sparse_matrix_chi, ranks)

    #set up classifiers, train on data
    #Bernoulli naive bayes
    nb_orig = BernoulliNB()
    nb_orig.fit(sparse_matrix, ranks)

    nb_orig_resampled = BernoulliNB()
    nb_orig_resampled.fit(Orig_X_resampled, Orig_y_resampled)

    nb_tsvd = BernoulliNB()
    nb_tsvd.fit(TSVD_X_resampled, TSVD_y_resampled)

    nb_tsvd_non_sampled = BernoulliNB()
    nb_tsvd_non_sampled.fit(sparse_matrix_tsvd, ranks)

    nb_rfec = BernoulliNB()
    nb_rfec.fit(selector_rfe.transform(sparse_matrix), ranks)

    nb_rfecv = BernoulliNB()
    nb_rfecv.fit(rfecv_X_resampled, rfecv_y_resampled)

    nb_rfecv_non_sampled = BernoulliNB()
    nb_rfecv_non_sampled.fit(sparse_matrix_rfecv, ranks)

    nb_chi = BernoulliNB()
    nb_chi.fit(chi_X_resampled, chi_y_resampled)

    #decision tree classifier
    dt_rfecv_resampled = DecisionTreeClassifier(random_state=0)
    dt_rfecv_resampled.fit(rfecv_X_resampled, rfecv_y_resampled)

    dt_rfecv = DecisionTreeClassifier(random_state=0)
    dt_rfecv.fit(sparse_matrix_rfecv, ranks)

    dt_orig = DecisionTreeClassifier(random_state=0)
    dt_orig.fit(sparse_matrix, ranks)

    dt_orig_resampled = DecisionTreeClassifier(random_state=0)
    dt_orig_resampled.fit(Orig_X_resampled, Orig_y_resampled)

    dt_tsvd = DecisionTreeClassifier(random_state=0)
    dt_tsvd.fit(sparse_matrix_tsvd, ranks)

    dt_tsvd_resampled = DecisionTreeClassifier(random_state=0)
    dt_tsvd_resampled.fit(TSVD_X_resampled, TSVD_y_resampled)

    dt_chi = DecisionTreeClassifier(random_state=0)
    dt_chi.fit(chi_X_resampled, chi_y_resampled)

    #run test predictions
    # Every model is evaluated on the (non-resampled) training matrix, so
    # each representation is computed once and shared by all its models.
    train_tsvd = selector_tsvd.transform(sparse_matrix)
    train_rfe = selector_rfe.transform(sparse_matrix)
    train_rfecv = selector_rfecv.transform(train_rfe)
    train_chi = selector_chi.transform(sparse_matrix)

    #run naive bayes predictions
    orig_pred = nb_orig.predict(sparse_matrix)
    orig_pred_resamp = nb_orig_resampled.predict(sparse_matrix)
    tsvd_resampled_pred = nb_tsvd.predict(train_tsvd)
    tsvd_non_sampled_pred = nb_tsvd_non_sampled.predict(train_tsvd)
    rfe_pred = nb_rfec.predict(train_rfe)
    rfecv_pred = nb_rfecv_non_sampled.predict(train_rfecv)
    rfecv_resampled_pred = nb_rfecv.predict(train_rfecv)
    chi_pred = nb_chi.predict(train_chi)

    #run decision tree predictions
    dt_rfecv_resampled_pred = dt_rfecv_resampled.predict(train_rfecv)
    dt_rfecv_pred = dt_rfecv.predict(train_rfecv)
    dt_orig_pred = dt_orig.predict(sparse_matrix)
    dt_orig_resampled_pred = dt_orig_resampled.predict(sparse_matrix)
    dt_tsvd_pred = dt_tsvd.predict(train_tsvd)
    dt_tsvd_resampled_pred = dt_tsvd_resampled.predict(train_tsvd)
    dt_chi_pred = dt_chi.predict(train_chi)

    #test the output f1 score
    #test for naive bayes
    orig_f1 = f1_score(ranks, orig_pred, average='macro')
    orig_f1_resampled = f1_score(ranks, orig_pred_resamp, average='macro')
    # The plain score now comes from the model fitted on the non-resampled
    # TSVD data and the "resampled" score from the SMOTE-fitted model; the
    # two were previously reported under each other's label.
    tsvd_f1 = f1_score(ranks, tsvd_non_sampled_pred, average='macro')
    tvsd_resampled_f1 = f1_score(ranks, tsvd_resampled_pred, average='macro')
    rfe_f1 = f1_score(ranks, rfe_pred, average='macro')
    rfecv_f1 = f1_score(ranks, rfecv_pred, average='macro')
    # Score of the NB model fitted on the SMOTE-resampled RFECV data.
    rfecv_reasmple_f1_non_sampled = f1_score(ranks,
                                             rfecv_resampled_pred,
                                             average='macro')
    chi_f1 = f1_score(ranks, chi_pred, average='macro')

    #test for decision trees
    dt_rfec_resampled_f1 = f1_score(ranks,
                                    dt_rfecv_resampled_pred,
                                    average='macro')
    dt_rfecv_f1 = f1_score(ranks, dt_rfecv_pred, average='macro')
    dt_orig_f1 = f1_score(ranks, dt_orig_pred, average='macro')
    dt_orig_resampled_f1 = f1_score(ranks,
                                    dt_orig_resampled_pred,
                                    average='macro')
    dt_tsvd_f1 = f1_score(ranks, dt_tsvd_pred, average='macro')
    dt_tsvd_resampled_f1 = f1_score(ranks,
                                    dt_tsvd_resampled_pred,
                                    average='macro')
    dt_chi_f1 = f1_score(ranks, dt_chi_pred, average='macro')

    #output the different test results
    print('orig:', orig_f1, 'orig_resampled:', orig_f1_resampled, 'tsvd:',
          tsvd_f1, 'tvsd_resampled_f1:', tvsd_resampled_f1, 'rfe_f1:', rfe_f1,
          'rfecv_f1:', rfecv_f1, 'rfecv_reasmple_f1_non_sampled:',
          rfecv_reasmple_f1_non_sampled, 'chi_f1:', chi_f1)
    # dt_chi_f1 was computed but never reported; it is appended here.
    print('dt_rfec_resampled_f1:', dt_rfec_resampled_f1, 'dt_rfecv_f1:',
          dt_rfecv_f1, 'dt_orig_f1:', dt_orig_f1, 'dt_orig_resampled_f1:',
          dt_orig_resampled_f1, 'dt_tsvd_f1:', dt_tsvd_f1,
          'dt_tsvd_resampled_f1:', dt_tsvd_resampled_f1, 'dt_chi_f1:',
          dt_chi_f1)

    #test with testfile using best classifier
    test_predict = nb_chi.predict(selector_chi.transform(test_data))
    with open('test_file_prediction.dat', "w") as fp2:
        for num in test_predict:
            fp2.write(str(num) + '\n')
    print(len(test_predict))

    #print out time elapsed
    end = time.time()
    print(end - start)

示例#53

0

显示文件

文件： 4_get_feature_device_start_close_tfidf_1_2.py 项目： GaryLIA/yiguan_sex_age_predict_1st_solution

df_stack.to_csv('feature/tfidf_ridge_1_3_error_single_classfiy.csv',
                index=None,
                encoding='utf8')
print('ridge特征已保存\n')

########################### bnb(BernoulliNB) ################################
print('BernoulliNB stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0

for i, (tr, va) in enumerate(
        StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
    print('stack:%d/%d' % ((i + 1), n_folds))
    bnb = BernoulliNB()
    bnb.fit(train_feature[tr], score[tr])
    score_va = bnb.predict_proba(train_feature[va])
    score_te = bnb.predict_proba(test_feature)
    print(score_va)
    print('得分' +
          str(mean_squared_error(score[va], bnb.predict(train_feature[va]))))
    stack_train[va] += score_va
    stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
    df_stack['tfidf_bnb_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_bnb_1_3_error_single_classfiy.csv',
                index=None,
                encoding='utf8')

示例#54

0

显示文件

clf_prob = LogisticRegression(C=1,
                              class_weight='balanced',
                              dual=False,
                              fit_intercept=True,
                              intercept_scaling=0.2,
                              max_iter=100,
                              multi_class='ovr',
                              n_jobs=1,
                              penalty='l2',
                              random_state=None,
                              solver='liblinear',
                              tol=0.0001,
                              verbose=0,
                              warm_start=False)

clf_text.fit(trn_text_bow, trn_text_classes_bow)
clf_description.fit(trn_description_bow, trn_description_classes_bow)
trn_prob, trn_prob_class = prepare_combined_model(clf_text, clf_description,
                                                  vectorizer_text,
                                                  vectorizer_description,
                                                  test_set)
clf_prob.fit(trn_prob, trn_prob_class)

# ###################### Add genders for each user #######################
print 'Predicting...'
user = defaultdict(list)
image_url = {}
for id in database:
    document = database[str(id)]
    if document['gender'] == None and len(user[document['user']['id']]) < 5:
        if document['user']['profile_image_url'] != None and document['user'][

示例#55

0

显示文件

文件： p2.py 项目： ShellyPant/spam-detection-project

    after_stem_words = []
    for w in new_words:
        after_stem_words.append(ps.stem(w))
    clean_msg = ' '.join(after_stem_words)
    return clean_msg


df['msg'] = df.msg.apply(clean_text)

print('data cleaned...')

X = cv.fit_transform(df.msg).toarray()
new_X = pca.fit_transform(X)
y = df.iloc[:, 0].values
print('going for training...')
log.fit(new_X, y)
print('model trained....')

root = Tk()
root.state('zoomed')
root.configure(background='yellow')
l1 = Label(root,
           text='Spam Detection',
           bg='yellow',
           fg='blue',
           font=('', 40, 'bold'))
l1.place(x=190, y=20)

l2 = Label(root,
           text='Enter msg:',
           bg='yellow',

示例#56

0

显示文件

# Vectorise the new test data with `cv` (transform only, no refit) and show
# the resulting array.
# NOTE(review): cv, new_test_data, numbers and y are defined in earlier
# cells outside this excerpt.
test_numbers = cv.transform(new_test_data).toarray()
print(test_numbers)

# # Multinomial Naive Bayes :

# In[10]:

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

# In[11]:

# Fit a multinomial naive Bayes model (default settings) on numbers / y.
mnb = MultinomialNB()
mnb.fit(numbers, y)

# In[12]:

# Predict labels for the vectorised test data.
mnb.predict(test_numbers)

# # Bernoulli Naive Bayes :

# In[13]:

# Fit a Bernoulli naive Bayes model (default settings) on the same data.
bnb = BernoulliNB()
bnb.fit(numbers, y)

# In[14]:

# Predict labels for the vectorised test data.
bnb.predict(test_numbers)

# In[ ]:

示例#57

0

显示文件

def bernoulliNB():
    """Fit a BernoulliNB on random binary data and print one prediction.

    Six samples of 100 random 0/1 features are drawn, labelled with
    [1, 2, 3, 4, 4, 5] and used to fit the classifier; the prediction for
    the third sample (kept as a one-row slice) is printed.
    """
    features = np.random.randint(2, size=(6, 100))
    labels = np.array([1, 2, 3, 4, 4, 5])
    model = BernoulliNB()
    model.fit(features, labels)
    third_sample = features[2:3]
    print(model.predict(third_sample))

示例#58

0

显示文件

class NBClassifier(object):
    """Bernoulli naive Bayes classifier over binarised graph features.

    Graphs are converted to a feature matrix by ``vectorize_graphs`` using
    an encoder built from ``decompose_func``. The stored values of that
    matrix are mapped to 0/1 before they reach the underlying
    ``BernoulliNB``, which is created with ``binarize=None`` and therefore
    does no thresholding of its own.
    """

    def __init__(self,
                 decompose_func=None,
                 preprocessor=None,
                 nbits=15,
                 seed=1):
        """Set up the feature encoder and the underlying classifier.

        Args:
            decompose_func: decomposition handed to ``make_encoder`` and,
                in ``explain``, to ``get_feature_dict``.
            preprocessor: forwarded to ``make_encoder`` as ``preprocessors``.
            nbits: forwarded to ``set_feature_size`` to obtain the feature
                size and the bitmask.
            seed: forwarded to ``make_encoder``.
        """
        self.decompose_func = decompose_func
        self.nbits = nbits
        feature_size, bitmask = set_feature_size(nbits=nbits)
        self.feature_size = feature_size
        self.bitmask = bitmask
        self.encoding_func = make_encoder(decompose_func,
                                          preprocessors=preprocessor,
                                          bitmask=self.bitmask,
                                          seed=seed)
        self.classifier = BernoulliNB(alpha=0.1,
                                      binarize=None,
                                      fit_prior=True,
                                      class_prior=None)

    def _binary_features(self, graphs):
        """Return the feature matrix of ``graphs`` with values mapped to 0/1."""
        data_mtx = vectorize_graphs(graphs,
                                    encoding_func=self.encoding_func,
                                    feature_size=self.feature_size)
        # binarize: rewrite the stored values only (presumably a scipy
        # sparse matrix, given the ``.data`` attribute)
        data_mtx.data = np.where(data_mtx.data > 0, 1, 0)
        return data_mtx

    def fit(self, graphs, targets):
        """Fit the classifier on ``graphs`` with labels ``targets``.

        Returns:
            self, so calls can be chained.
        """
        self.classifier.fit(self._binary_features(graphs), targets)
        return self

    def decision_function(self, graphs):
        """Return, for each graph, the predicted probability of column 1.

        Returns:
            1-D array with one probability per graph.
        """
        preds = self.classifier.predict_proba(self._binary_features(graphs))
        # assuming binary classification and column 1 to represent positives
        return preds[:, 1].reshape(-1)

    def predict(self, graphs):
        """Return the predicted class for each graph."""
        return self.classifier.predict(self._binary_features(graphs))

    def explain(self, graphs, top_k):
        """Summarise which feature signatures dominate the top features.

        The ``top_k`` features with the highest log-odds in favour of
        class 1 are selected; for each signature among them, the number of
        selected features is divided by the log of that signature's total
        count in ``graphs``.

        Args:
            graphs: graphs from which the feature dictionary and feature
                counts are extracted via ``get_feature_dict``.
            top_k: number of top-ranked features to consider.

        Returns:
            List of ``(fraction, score, signature)`` tuples sorted by
            descending ``score``, where ``fraction`` is ``score`` divided
            by the sum of all scores.
        """
        feature_dict, feature_counts = get_feature_dict(
            graphs,
            decomposition_funcs=self.decompose_func,
            nbits=self.nbits,
            return_counts=True)
        # compute log-odds: the difference of the per-class log
        # probabilities. Dividing them (as was done before) ranks features
        # backwards, because log probabilities are negative and the ratio
        # grows as the feature becomes *less* likely under class 1.
        log_prob = self.classifier.feature_log_prob_
        scores = log_prob[1, :] - log_prob[0, :]
        ranked_pos_features = np.argsort(-scores)
        # aggregate counts according to same signature
        sig_dict = dict()
        for fid in feature_dict:
            sig = feature_dict[fid].graph['signature']
            sig_dict[sig] = sig_dict.get(sig, 0) + feature_counts[fid]
        # take logs
        # NOTE(review): a signature whose total count is 1 gives log 0 and
        # makes the division below raise ZeroDivisionError -- confirm that
        # counts are always > 1 or decide how such signatures should score.
        for sig in sig_dict:
            sig_dict[sig] = math.log(sig_dict[sig])
        # select top_k
        # NOTE(review): assumes every top-ranked feature index is a key of
        # feature_dict; a missing one raises KeyError -- verify.
        feature_graphs = [
            feature_dict[fid] for fid in ranked_pos_features[:top_k]
        ]
        c = Counter(g.graph['signature'] for g in feature_graphs)
        cnt = {sig: c[sig] / sig_dict[sig] for sig in c}
        tot = sum(cnt.values())
        res = [
            (cnt[sig] / tot, cnt[sig], sig)
            for sig in sorted(cnt, key=cnt.get, reverse=True)
        ]
        return res

示例#59

0

显示文件

                                                    test_size=0.000001,
                                                    random_state=0)
# Train three classifiers on the same split and print each one's score on
# the test part.
# NOTE(review): the split call just above requests test_size=0.000001, so
# X_test is presumably very small -- the printed scores deserve a sanity
# check.

# --- SVC with default settings ---
print('Training SVC: ')
clf = svm.SVC()
clf.fit(X_train, y_train)
print("SVC Accuracy Test: ", clf.score(X_test, y_test))
########################
# --- Gaussian naive Bayes ---
print('Training GNB: ')
gnb = GaussianNB()
gnb.fit(X_train, y_train)
print("GNB Accuracy Test: ", gnb.score(X_test, y_test))
# Class probabilities for the first test sample; reshape(1, -1) turns it
# into a single-row 2-D input.
print(gnb.predict_proba(X_test[0].reshape(1, -1)))
########################
# --- Bernoulli naive Bayes ---
print('Training BNB: ')
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
print("BNB Accuracy Test: ", bnb.score(X_test, y_test))
#######################
'''
print('Training kNN: ')
test_result=list()
for jj in range (1,200,20):
	neigh = KNeighborsClassifier(n_neighbors=jj,p=1)
	neigh.fit(X_train,y_train)
	print("kNN Accuracy Test for",jj," neighbors: ",neigh.score(X_test,y_test))
	test_result.append(neigh.score(X_test,y_test))

plt.plot(test_result)
plt.ylabel('some numbers')
plt.show()'''
'''

示例#60

0

显示文件

            randForrC.fit(trainX, yTrain)
            tmpSCR = randForrC.score(testX, yTest)
            scores['randForr'][label].append(tmpSCR)
        else: 
            randForrR.fit(trainX, yTrain)
            tmpSCR = randForrR.score(testX, yTest)
            scores['randForr'][label].append(tmpSCR)

        # print("start adaBoost")
        # adaBoostC.fit(trainX, yTrain)
        # tmpSCR = adaBoostC.score(testX, yTest)
        # scores['adaBoost'][label].append(tmpSCR)

        # Naive Bayes step: when cnt < 2 fit and score bernNB, otherwise fit
        # and score gausRidge instead.
        # NOTE(review): both branches append to scores['bernNB'], so that
        # key mixes results from two different models -- presumably
        # gausRidge is the stand-in for non-classification labels; confirm.
        print("start bernoulli NB")
        if cnt < 2:
            bernNB.fit(trainX, yTrain)
            tmpSCR = bernNB.score(testX, yTest)
            scores['bernNB'][label].append(tmpSCR)
        else:
            gausRidge.fit(trainX, yTrain)
            tmpSCR = gausRidge.score(testX, yTest)
            scores['bernNB'][label].append(tmpSCR)

        # print("start gradient boost")
        # gradBoostC.fit(trainX, yTrain)
        # tmpSCR = gradBoostC.score(trainX, yTest)
        # scores['gradBoost'][label].append(tmpSCR)

        print("start SVM")
        if cnt < 2:
            svmC.fit(trainX, yTrain)