Example #1
def naive_bayes(df,column):
    reviews_pn = df[df['class'].isin(['positive','negative'])]
    comments = list(reviews_pn[column].values)
    classes = list(reviews_pn['class'].values)
    
    # preprocess creates the term frequency matrix for the review data set
    stop = stopwords.words('english')
    count_vectorizer = CountVectorizer(stop_words = stop, ngram_range=(1,3))
    comments1 = count_vectorizer.fit_transform(comments)
    tfidf_transformer = TfidfTransformer(use_idf=True)
    tfidf_comments = tfidf_transformer.fit_transform(comments1)
    
    # preparing data for split validation. 60% training, 40% test
    data_train,data_test,target_train,target_test = cross_validation.train_test_split(tfidf_comments,classes,test_size=0.4,random_state=43)
    classifier = BernoulliNB().fit(data_train,target_train)
    predicted = classifier.predict(data_test)
    
    print classification_report(target_test,predicted)
    print "The accuracy score is {:.2%}".format(accuracy_score(target_test,predicted))
    
    most_informative_feature_for_binary_classification(count_vectorizer,classifier,n=20)
    
    #predict on unknown
    reviews_nc = df[df['class'] == '']
    comments_nc = list(reviews_nc[column].values)
    comments_nc1 = count_vectorizer.transform(comments_nc)    
    tfidf_comments_nc = tfidf_transformer.transform(comments_nc1)
    new_predicted = classifier.predict(tfidf_comments_nc)
    
    print "negative = %s" %sum(new_predicted == 'negative')
    print "positive = %s" %sum(new_predicted == 'positive')
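The helper most_informative_feature_for_binary_classification is not defined in this example; a minimal sketch of what such a helper typically looks like (hypothetical implementation, assuming a fitted CountVectorizer and a fitted BernoulliNB):

def most_informative_feature_for_binary_classification(vectorizer, classifier, n=20):
    # feature_log_prob_ has shape (n_classes, n_features); the difference between
    # the two class rows ranks each term by how strongly it signals the second class
    names = vectorizer.get_feature_names()  # get_feature_names_out() in newer sklearn
    deltas = classifier.feature_log_prob_[1] - classifier.feature_log_prob_[0]
    for delta, name in sorted(zip(deltas, names), reverse=True)[:n]:
        print("%8.3f  %s" % (delta, name))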
Example #2
def main():
	# Get the data and targets
	df = pd.read_csv('train1.csv')
	df = df[df.rating != 'rating']
	corpus = [review for review in df.review]
	splitPoint = len(corpus)*2/3
	trainingCorpus = corpus[:splitPoint]
	testCorpus = corpus[splitPoint:]
	target = [rating for rating in df.rating]
	trainingTarget = np.array(target[:splitPoint])
	testTarget = np.array(target[splitPoint:])

	# Train the algorithm
	train_X, vocabList = createVectorizer(trainingCorpus, 'None', True)
	NB_Bern_model = BernoulliNB().fit(train_X, trainingTarget)

	# Test the algorithm
	test_X = createVectorizer(testCorpus, vocabList, True)
	test_predict = NB_Bern_model.predict(test_X)
	print(np.mean(test_predict == testTarget))	
	print metrics.classification_report(testTarget, test_predict, target_names=['0', '1'])

	# Make Predictions
	predict_df = pd.read_csv('test2.csv')
	predictCorpus = [review for review in predict_df.review]
	member = [memberid for memberid in predict_df.ID]
	predict_X = createVectorizer(predictCorpus, vocabList, True)
	predictions = NB_Bern_model.predict(predict_X)
	predict_df.columns = ['ID', 'Predicted']
	for i in range(len(member)):
	 	predict_df.loc[predict_df['ID'] == member[i], 'Predicted'] = predictions[i]
	predict_df.to_csv('submission1.csv', sep = ',', index=False)
Example #3
    def generatePredictingModel(data):
        """
            Build the prediction model (based on the data set we have) in order to be able to predict the category
            of a new video from the user input
            Return a classifier able to predict the category of a video based on its title and description.
        """
        try:
            # Intitialize a timer to compute the time to build the model
            start = time.time()

            # Split into train-test data set
            X = data[[x for x in data.columns if x in ('title', 'description')]]
            Y = data[[x for x in data.columns if x in ('video_category_id',)]]
            X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.80, random_state = 10)

            # Build the 2 text corpus
            corpus_title = X_train['title'].values.tolist()
            corpus_description = X_train['description'].values.tolist()

            # initializes the 2 vectorizers.
            count_vectorizer_title = CountVectorizer()
            count_vectorizer_description = CountVectorizer()

            # learn the 2 vocabulary dictionary
            count_vectorizer_title.fit(corpus_title)
            count_vectorizer_description.fit(corpus_description)

            # Build the sparse matrices
            X_train_count_title = count_vectorizer_title.transform(X_train['title'])
            X_train_count_description = count_vectorizer_description.transform(X_train['description'])
            X_test_count_title = count_vectorizer_title.transform(X_test['title'])
            X_test_count_description = count_vectorizer_description.transform(X_test['description'])

            # Set and train the models (for title and description features)
            model_count_title = BernoulliNB()
            model_count_description = BernoulliNB()
            model_count_title.fit(X_train_count_title, Y_train['video_category_id'])
            model_count_description.fit(X_train_count_description, Y_train['video_category_id'])

            # Merge the title and description predictions and build a new prediction based on these 2 predictions combined
            new_df_train = pd.DataFrame()
            new_df_train['title_prediction'] = model_count_title.predict(X_train_count_title)
            new_df_train['description_prediction'] = model_count_description.predict(X_train_count_description)
            new_df_test = pd.DataFrame()
            new_df_test['title_prediction'] = model_count_title.predict(X_test_count_title)
            new_df_test['description_prediction'] = model_count_description.predict(X_test_count_description)
            tree = DecisionTreeClassifier()
            tree.fit(new_df_train, Y_train)

            end = time.time()
            execution_time = end - start

            print "Time to build the model: {} seconds".format(execution_time)
            time.sleep(3)

            return tree, model_count_title, model_count_description,count_vectorizer_title,count_vectorizer_description

        except Exception:
            raise VideoAnalysisException("Error while creating the predictive model")
Example #4
	def testBoGNB(self):
		'''
		Test on sentiment analysis task using Naive Bayes classifier 
		with Bag-of-Word feature vectors.
		'''
		wordlist = []
		# Preprocessing of original txt data set
		for i, sent in enumerate(self.senti_train_txt):
			words = sent.split()
			words = [word.lower() for word in words if len(word) > 2]
			wordlist.extend(words)
		for i, sent in enumerate(self.senti_test_txt):
			words = sent.split()
			words = [word.lower() for word in words if len(word) > 2]
			wordlist.extend(words)
		word_dict = set(wordlist)
		word2index = dict(zip(word_dict, range(len(word_dict))))
		# Build BoG feature
		train_size = len(self.senti_train_txt)
		test_size = len(self.senti_test_txt)
		pprint('Training set size: %d' % train_size)
		pprint('Test set size: %d' % test_size)
		train_feat = np.zeros((train_size, len(word_dict)), dtype=np.float)
		test_feat = np.zeros((test_size, len(word_dict)), dtype=np.float)
		# Using binary feature
		start_time = time.time()
		for i, sent in enumerate(self.senti_train_txt):
			words = sent.split()
			words = [word.lower() for word in words if len(word) > 2]
			indices = map(lambda x: word2index[x], words)
			train_feat[i, indices] = 1.0
		for i, sent in enumerate(self.senti_test_txt):
			words = sent.split()
			words = [word.lower() for word in words if len(word) > 2]
			indices = map(lambda x: word2index[x], words)
			test_feat[i, indices] = 1.0
		end_time = time.time()
		pprint('Finished building training and test feature matrix, time used: %f seconds.' % (end_time-start_time))
		pprint('Classification using Bernoulli Naive Bayes classifier: ')
		clf = BernoulliNB()
		# clf = LogisticRegression()
		clf.fit(train_feat, self.senti_train_label)
		train_pred_label = clf.predict(train_feat)
		train_acc = np.sum(train_pred_label == self.senti_train_label) / float(train_size)
		pprint('Training accuracy = %f' % train_acc)
		pred_label = clf.predict(test_feat)
		acc = np.sum(pred_label == self.senti_test_label) / float(test_size)
		pprint('Accuracy: %f' % acc)
		train_pos_count = np.sum(self.senti_train_label == 1)
		train_neg_count = np.sum(self.senti_train_label == 0)
		test_pos_count = np.sum(self.senti_test_label == 1)
		test_neg_count = np.sum(self.senti_test_label == 0)
		pprint('Positive count in training set: %d' % train_pos_count)
		pprint('Negative count in training set: %d' % train_neg_count)
		pprint('Ratio: pos/neg = %f' % (float(train_pos_count) / train_neg_count))
		pprint('Positive count in test set: %d' % test_pos_count)
		pprint('Negative count in test set: %d' % test_neg_count)
		pprint('Ratio: pos/neg = %f' % (float(test_pos_count) / test_neg_count))
Example #5
File: sentiment.py Project: jannson/crfseg
def train(neg=None, pos=None):
    the_file = os.path.dirname(os.path.abspath(__file__))
    if not neg:
        neg = os.path.join(the_file, '..', 'origin', 'neg.txt')
    if not pos:
        pos = os.path.join(the_file, '..', 'origin', 'pos.txt')
    
    tagger = crfseg.create_tagger()
    tok_cn = lambda x: crfseg.cut_zh(x, tagger)
    
    tfidf = TfidfVectorizer(tokenizer=tok_cn, sublinear_tf=True, max_df=0.5)
    pipe = Pipeline([
        ('tfidf', tfidf),
    #    ('svd', TruncatedSVD(32)),
    #    ('normal', Normalizer(copy=False))
        ])
    '''
    hasher = HashingVectorizer(n_features=2**16,
                               tokenizer=tok_cn, non_negative=True,
                               norm=None, binary=False)
    '''

    #clf = SGDClassifier(loss='log', penalty='l2', alpha=0.00001, n_iter=50, fit_intercept=True)
    #clf = MultinomialNB()
    clf = BernoulliNB()
    
    neg_file = codecs.open(neg, 'r', 'utf-8')
    pos_file = codecs.open(pos, 'r', 'utf-8')

    x_train = []
    y_train = []
    
    i = 0
    for line in neg_file:
        x_train.append(line)
        y_train.append(0)
    for line in pos_file:
        x_train.append(line)
        y_train.append(1)
    
    print 'begin transform'
    #x_train = hasher.transform(x_train)
    x_train = pipe.fit_transform(x_train)
    print 'begin fit'
    clf.fit(x_train, y_train)

    print 'begin save'
    tfidf_file = os.path.join(the_file, 'data', 'tfidf.pkl')
    clf_file = os.path.join(the_file, 'data', 'sgdc_clf.pkl')
    #_ = joblib.dump(tfidf, tfidf_file, compress=9)
    _ = joblib.dump(clf, clf_file, compress=9)

    print 'begin test'
    x_test = [u'这个东西真心很赞']
    #x_test = hasher.transform(x_test)
    x_test = pipe.transform(x_test)
    print clf.predict(x_test)
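Note that train() above persists only the classifier (the tfidf dump is commented out), so new text cannot be transformed in another process. A sketch of persisting and reloading the fitted pipeline as well (same paths and joblib import as in the example):

_ = joblib.dump(pipe, tfidf_file, compress=9)
# ...later, in the consuming process:
pipe = joblib.load(tfidf_file)
clf = joblib.load(clf_file)
print(clf.predict(pipe.transform([u'这个东西真心很赞'])))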
Example #6
def main():
  start_time = time.time()
  #read in game IDs
  games_data = pd.read_csv('games-data.csv')
  all_games = np.array(games_data['game_id'])
  all_plyrs = np.array(games_data['plyr_id'])
  uni_game_ids = np.unique(all_games)
  
  #read in player IDs
  player_data = pd.read_csv('players.csv')
  plyr_ids = np.unique(np.array(player_data['ID']))
  
  #read in fantasy scores
  fantasy_scores = pd.read_csv('fantasy_scores.csv')
  
  #gets player training matrix
  plyr_id = 8439
  X = create_training_set(plyr_id, games_data, plyr_ids)
  index = get_ninety_percent(len(np.array(X.index))) #for cross-validation
  train_X = X[:index]
  test_X = X[index:]
  
  #gets training output vector
  plyr_game_ids = np.array(train_X.index)
  scores = plyr_fantasy_pts(plyr_id, plyr_game_ids, fantasy_scores)
  Y = discretize(scores.values)
  train_Y = Y[:index]
  test_Y = Y[index:]
  
  #run Bernoulli NB Classifier
  nb_clf = BernoulliNB()
  nb_clf.fit(train_X, train_Y)
  nb_predictions = nb_clf.predict(test_X)
  
  #run Multinomial NB Classifier
  mn_clf = MultinomialNB()
  mn_clf.fit(train_X, train_Y)
  mn_predictions = mn_clf.predict(test_X)
  
  #test for game, fantasy score alignment  
  for i in xrange(test_Y.shape[0]):
    print plyr_game_ids[i], scores.values[i], test_Y[i], nb_predictions[i], mn_predictions[i]
  
  print "Bernoulli NB accuracy: ", nb_clf.score(test_X, test_Y)
  
  print "Bernoulli NB prob estimates: ", nb_clf.predict_proba(test_X)
  print "Multinomial NB accuracy: ", mn_clf.score(test_X, test_Y)
  print "Multinomial NB prob estimates: ", mn_clf.predict_proba(test_X)
  print len(nb_clf.predict_proba(test_X)[0])
  nb_norm_prob = normalize_probs(nb_clf.predict_proba(test_X)[0])
  vals = [1.5, 4.5, 7.5, 10.5, 13.5, 16.5, 19.5, 22.5, 25.5, 28.5, 31.5]
  ev = expected_val(nb_norm_prob, vals)
  print "EV: ", ev
  end_time = time.time()
  print("Elapsed time was %g seconds" % (end_time - start_time))  
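normalize_probs and expected_val are project helpers that are not shown; the intended computation appears to be a probability-weighted average over the score-bucket midpoints in vals (hypothetical sketch):

def normalize_probs(probs):
    # rescale the probability vector so it sums to 1
    total = float(sum(probs))
    return [p / total for p in probs]

def expected_val(probs, vals):
    # dot product of class probabilities with the midpoint of each score bucket
    return sum(p * v for p, v in zip(probs, vals))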
Example #7
class MuscleClassifier():

	def __init__(self, auto_load=True):
		""" Initializes our MuscleClassifier
			Option to preload it or start from fresh model 
		"""

		#=====[ If auto_load, then we rehydrate our existing models ]=====
		if auto_load:

			self.model = pickle.load(open('modules/pickled/muscle_classifier.p','rb'))
			self.le = pickle.load(open('modules/pickled/muscle_classifier_le.p','rb'))
			self.vectorizer = pickle.load(open('modules/pickled/muscle_classifier_vectorizer.p','rb'))

		else:

			self.model = BernoulliNB()

	def train(self, muscle_groups, labels):
		""" 
			Vectorizes raw input and trains our classifier 
		"""

		#=====[ Instantiate label encoder to turn text labels into ints ]=====
		self.le = preprocessing.LabelEncoder()

		#=====[ Declare vectorizers and merge them via a FeatureUnion ]=====
		char_vzr = feature_extraction.text.CountVectorizer(lowercase=True, ngram_range=(3,8), analyzer='char', encoding='utf-8')
		word_vzr = feature_extraction.text.CountVectorizer(lowercase=True, ngram_range=(1,5), analyzer='word', encoding='utf-8')

		self.vectorizer = FeatureUnion([('char',char_vzr),('word',word_vzr)])

		#=====[ Transform our input and labels ]=====
		X = self.vectorizer.fit_transform(muscle_groups).toarray()
		Y = self.le.fit_transform(labels)

		#=====[ Fit our model and then run inference on training data ]=====
		self.model.fit(X,Y)
		y = self.model.predict(X)

		#=====[ Report Training Accuracy ]=====
		print "Training Accuracy: %f " % (sum(y == Y)/float(len(Y)))

	def predict(self, exercises):
		""" Takes in raw input, vectorizes it, and reports back predicted muscle group """

		X = self.vectorizer.transform(exercises).toarray()
		y = self.model.predict(X)

		return self.le.classes_[y]
Example #8
File: q1.py Project: mchenchen/Course-Work
def bnb_baseline(bow_train, train_labels, bow_test, test_labels):
    # training the baseline model
    binary_train = (bow_train>0).astype(int)
    binary_test = (bow_test>0).astype(int)

    model = BernoulliNB()
    model.fit(binary_train, train_labels)

    #evaluate the baseline model
    train_pred = model.predict(binary_train)
    print('BernoulliNB baseline train accuracy = {}'.format((train_pred == train_labels).mean()))
    test_pred = model.predict(binary_test)
    print('BernoulliNB baseline test accuracy = {}'.format((test_pred == test_labels).mean()))

    return model
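A usage sketch for bnb_baseline with toy data (the texts and labels below are illustrative):

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

train_texts = ["good movie", "great plot", "bad film", "awful acting"]
test_texts = ["great film", "bad plot"]
train_labels = np.array([1, 1, 0, 0])
test_labels = np.array([1, 0])

vec = CountVectorizer()
bow_train = vec.fit_transform(train_texts)  # sparse count matrices
bow_test = vec.transform(test_texts)
model = bnb_baseline(bow_train, train_labels, bow_test, test_labels)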
Example #9
File: Models.py Project: ineilm/BountyApp
def BernoulliNB_1(train_predictors,test_predictors,train_target,test_target):
    clf = BernoulliNB()
    clf.fit(train_predictors,train_target)
    predicted = clf.predict(test_predictors)
    accuracy = accuracy_score(test_target, predicted)
    print "Accuracy for Bernoulli Naive Bayes: "+str(accuracy)
    return accuracy,predicted  
Example #10
def test_discretenb_predict_proba():
    """Test discrete NB classes' probability scores"""

    # The 100s below distinguish Bernoulli from multinomial.
    X_bernoulli = [[1, 100, 0], [0, 1, 0], [0, 100, 1]]
    X_multinomial = [[0, 1], [1, 3], [4, 0]]

    # Confirm that the 100s above distinguish Bernoulli from multinomial
    y = [0, 0, 1]
    cls_b = BernoulliNB().fit(X_bernoulli, y)
    cls_m = MultinomialNB().fit(X_bernoulli, y)
    assert_not_equal(cls_b.predict(X_bernoulli)[-1],
                     cls_m.predict(X_bernoulli)[-1])

    # test binary case (1-d output)
    y = [0, 0, 2]   # 2 is regression test for binary case, 02e673
    for cls, X in zip([BernoulliNB, MultinomialNB],
                      [X_bernoulli, X_multinomial]):
        clf = cls().fit(X, y)
        assert_equal(clf.predict(X[-1]), 2)
        assert_equal(clf.predict_proba(X[0]).shape, (1, 2))
        assert_array_almost_equal(clf.predict_proba(X[:2]).sum(axis=1),
                                  np.array([1., 1.]), 6)

    # test multiclass case (2-d output, must sum to one)
    y = [0, 1, 2]
    for cls, X in zip([BernoulliNB, MultinomialNB],
                      [X_bernoulli, X_multinomial]):
        clf = cls().fit(X, y)
        assert_equal(clf.predict_proba(X[0]).shape, (1, 3))
        assert_equal(clf.predict_proba(X[:2]).shape, (2, 3))
        assert_almost_equal(np.sum(clf.predict_proba(X[1])), 1)
        assert_almost_equal(np.sum(clf.predict_proba(X[-1])), 1)
        assert_almost_equal(np.sum(np.exp(clf.class_log_prior_)), 1)
        assert_almost_equal(np.sum(np.exp(clf.intercept_)), 1)
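The reason the 100s distinguish the two models: BernoulliNB binarizes its input at threshold 0.0 by default, discarding feature magnitude, while MultinomialNB treats the values as counts. A small illustration of what BernoulliNB actually sees:

import numpy as np
X_bernoulli = np.array([[1, 100, 0], [0, 1, 0], [0, 100, 1]])
# BernoulliNB(binarize=0.0) internally reduces X to presence/absence:
print((X_bernoulli > 0).astype(int))
# [[1 1 0]
#  [0 1 0]
#  [0 1 1]]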
Example #11
def train_model(data, target):
    """
    Splits the data into a training set and test set

    Instantiate a Bernoulli Naive Bayes classifier, train it on the training set,
    and then evaluate the model based upon the test set
    """

    # Using cross-validation
    # TO TRY: stratification for dividing preclassified tweets into homogeneous subgroups before
    # sampling in order to improve the representativeness of the sampling

    train_tweets, validation_tweets, train_sentiment, validation_sentiment = cross_validation.train_test_split(data, 
                                                                                                target,
                                                                                                test_size=0.4)

    
    # Fitting the Naive Bayes classifier with the training tweets and corresponding sentiment
    classifier = BernoulliNB().fit(train_tweets, train_sentiment)


    predicted = classifier.predict(validation_tweets)

    # Using the cross-validation split, evaluate the accuracy of the predicted tweets
    evaluate_model(validation_sentiment, predicted)

    # Pickling the classifier
    pickle_file = open('nb_classifier.pickle', 'wb')
    pickle.dump(classifier, pickle_file)
    pickle_file.close()

    return classifier
Example #12
def predict(cur, plyr_id, game_plyrs): 
  #creates training set (called 'X') for plyr
  all_plyrs = all_player_ids(cur) #np.array - all NFL players (and coaches)
  games = games_played_in(cur, plyr_id) #np.array - the games_ids the player played in
  n_cols = all_plyrs.shape[0] #int 
  m_rows = games.shape[0] #int
  zeros = np.zeros((m_rows, n_cols)) #2darr - used to initialize DF
  X = pd.DataFrame(zeros, index=games, columns=all_plyrs) #dataframe
  populate_training_set(cur, X, games, plyr_id)
  print "X: ", X.values
  
  
  #creates vector of known output values
  Y = training_output_vector(cur, games, plyr_id)
  print "(len) Y: ", len(Y), Y
  test_zeros = np.zeros((1, n_cols)) #2darr - used to initialize DF
  test_X = pd.DataFrame(test_zeros, columns=all_plyrs) #dataframe
  update_training_matrix(game_plyrs, 0, test_X)
  
  #run Bernoulli NB Classifier
  nb_clf = BernoulliNB()
  
  if len(X.values) == 0:
    return 0
  nb_clf.fit(X, Y)
  nb_predictions = nb_clf.predict(test_X)
  print "test_X: ", test_X.values
  nb_norm_prob = normalize_probs(nb_clf.predict_proba(test_X)[0])
  avgs = [1.5, 4.5, 7.5, 10.5, 13.5, 16.5, 19.5, 22.5, 25.5, 28.5, 31.5]
  print "param vector: ", nb_clf.predict_proba(test_X)[0]
  print "probs: ", nb_norm_prob
  print avgs
  ev = expected_val(nb_norm_prob, avgs) #can also calc dot product
  return round(ev, 1)
Example #13
def test_BernouliNB2():
    X = np.array([
        [0, 1],
        [1, 1],
        [1, 0],
        [-1, 1],
        [1000, 1000],
        [1000, 10001],
        [998, 800],
        [990, 1100],
        ]
            )
    print 'X ' + str(X)
    #Y = np.array([1, 1, 1, 1, 2, 2, 2, 2])
    Y = np.array([1, 2, 3, 4, 5, 6, 7, 8])
    print 'Y ' + str(Y)
    clf = BernoulliNB(alpha = 1)
    clf.fit(X, Y)
    X2 = np.array(
            [
            [1002, 1010],
            [1010, 910],
            [1003, 980],
            [1008, 1030],
            [-1, -1],
            [-3, -10],
            [40, 1],
            [1, -100],
            ]
            )
    for i in xrange(len(X2)):
        #pred_ret = clf.predict_proba(X2[i])
        pred_ret = clf.predict(X2[i])
        print 'X[' + str(i) + '] = ' + str(X2[i]) + ' pred_ret ' + str(pred_ret)
Example #14
def test_BernouliNB4():
    X = np.array([
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [0, 0],
        [0, 0],
        [1, 0],
        ]
            )
    print 'X ' + str(X)
    #Y = np.array([1, 1, 1, 1, 2, 2, 2, 2])
    Y = np.array([1, 1, 0, 1, 0, 0, 0, 1, 1, 0])
    print 'Y ' + str(Y)
    clf = BernoulliNB(alpha = 1)
    clf.fit(X, Y)
    X2 = np.array(
            [
            [1, 1],
            ]
            )
    for i in xrange(len(X2)):
        #pred_ret = clf.predict_proba(X2[i])
        pred_ret = clf.predict(X2[i])
        print 'X[' + str(i) + '] = ' + str(X2[i]) + ' pred_ret ' + str(pred_ret)
def BNB(data_train, data_train_vectors, data_test_vectors, **kwargs):
    # Implementing classification model- using BernoulliNB
    clf_BNB = BernoulliNB(alpha=.01)
    clf_BNB.fit(data_train_vectors, data_train.target)
    y_pred = clf_BNB.predict(data_test_vectors)
    
    return y_pred
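BNB above expects an object with a .target attribute (such as a scikit-learn dataset bunch) plus pre-vectorized matrices; a usage sketch assuming the 20 newsgroups data:

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

data_train = fetch_20newsgroups(subset='train')
data_test = fetch_20newsgroups(subset='test')
vec = CountVectorizer(binary=True)
train_vectors = vec.fit_transform(data_train.data)
test_vectors = vec.transform(data_test.data)
y_pred = BNB(data_train, train_vectors, test_vectors)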
class NaiveBayesClassifierBernoulli:
    """
    This class encapsulates scikit-learn's Bernoulli Naive Bayes functionality (the BernoulliNB class).
    """
    def __init__(self, matrixFileName = matrixFilePath, dicFileName = dictFilePath):
        self.X,self.Y = load_svmlight_file(matrixFileName)
        self.dictionary = pickle.load(open(dicFileName, "rb"))
        self.bernoulliNB = BernoulliNB()
        self.bernoulliNB.fit(self.X, self.Y)
        self.matrixParser = Parser.MatrixParserForLearning()
        
    def classifyOneSentence(self, string):
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        if row is not None:
            return self.bernoulliNB.predict(row)
        else:
            return None
    
    def classifyOneSentenceWithProbability(self,string):
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        if row is not None:
            a = self.bernoulliNB.predict_proba(row)
            # signed confidence: positive-class minus negative-class probability
            return a[0][1] - a[0][0]
        else:
            return None
def combined_experiment(train_x,train_y,test_x,test_y,train_f_x,train_f_y,test_f_x,test_f_y, bias):
    labels = [] # Will contain all the final labels that result from the voting
    clf_c1 = MultinomialNB()
    clf_c1.fit(train_x,train_y)
    clf_c2 = BernoulliNB()
    clf_c2.fit(train_x,train_y)
    clf_f1 = svm.SVC(kernel='linear',cache_size = 512)
    clf_f1.fit(train_f_x,train_f_y)
    clf_f2 = svm.SVC(kernel='rbf',cache_size = 512)
    clf_f2.fit(train_f_x,train_f_y)
    
    p1 = clf_c1.predict(test_x)
    p2 = clf_c2.predict(test_x)
    p3 = clf_f1.predict(test_f_x)
    p4 = clf_f2.predict(test_f_x)
    if bias == 'content':
        for i in range(len(p1)):
            if p1[i] == p2[i] or p1[i] == p3[i]:
                labels.append(p1[i])
            else:
                labels.append(p2[i])
    elif bias == "syntax":
        for i in range(len(p1)):
            if p1[i] == p3[i] or p1[i] == p4[i]:
                labels.append(p1[i])
            else:
                labels.append(p3[i])
    else:
        print 'Please enter a valid bias ("syntax" or "content")!'
    p_combined = np.array(labels)
    accuracy = (np.sum(p_combined == test_y)/np.float_(len(test_y)))
    return accuracy
Example #18
    def render_content(self):
        if self.text_source is None:
            return "No text source selected."
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.naive_bayes import BernoulliNB
        from sklearn import metrics
        self.dm("creating vectorizer")
        vectorizer = CountVectorizer(stop_words=self.get_user_list(self.stop_list), max_features=self.vocab_size)
        data = self.get_column_data(self.text_source)
        self.dm("using vectorizer")
        X_train = vectorizer.fit_transform(data)
        Y_train = self.get_column_data(self.code_source)
        self.dm("creating classifier")
        clf = BernoulliNB()
        clf.fit(X_train, Y_train)
        
        accuracy = clf.score(X_train, Y_train)
        self.dm("predicting")
        pred = clf.predict(X_train)
        cm = metrics.confusion_matrix(Y_train, pred)

        self.dm("displaying result")
        html_output = "accuracy is " + str(round(accuracy, 2))
        html_output += '<pre>'+ str(cm) + '</pre>'

        return html_output
Example #19
File: main.py Project: jaksah/MLProject
def bernoulli_classify():
    clf = BernoulliNB()
    traindata = []
    traintarget = []
    for f in glob.glob("../../../res/articles/training_data/*-articles.json"):
        target = f.replace("-articles.json", "")
        target = re.sub(r".*\/+", "", target)
        output = readWholeFileBernoulli(f, target)
        traindata.extend(output[0])
        traintarget.extend(output[1])

    testdata = []
    testtarget = []
    for f in glob.glob("../../../res/articles/test_data/*-articles.json"):
        target = f.replace("-articles.json", "")
        target = re.sub(r".*\/+", "", target)
        output = readWholeFileBernoulli(f, target)
        testdata.extend(output[0])
        testtarget.extend(output[1])

    clf.fit(traindata, traintarget)
    ncorrect = 0
    total = len(testdata)
    for i in range(len(testdata)):
        predict = clf.predict(testdata[i])
        correct = testtarget[i]
        if correct == predict[0]:
            ncorrect += 1

        print ("Correct: {0} - Predicted: {1}".format(correct, predict[0]))

    print "Correct ", ncorrect, " Total ", total, " Correctness ", ncorrect * 1.0 / total
def learn_model(data, target):
    # preparing data for split validation. 80% training, 20% test
    data_train, data_test, target_train, target_test = cross_validation.train_test_split(
        data, target, test_size=0.2, random_state=43
    )
    classifier = BernoulliNB().fit(data_train, target_train)
    predicted = classifier.predict(data_test)
    evaluate_model(target_test, predicted)
Example #21
def naive_bayesB_classifier(X_train, categories, X_test, test_categories):
    from sklearn.naive_bayes import BernoulliNB   
    clf = BernoulliNB(alpha=0.1).fit(X_train, categories)
    y_nb_predicted = clf.predict(X_test)
    print "\n Here is the classification report for the Naive Bayes classifier:"
    print metrics.classification_report(test_categories, y_nb_predicted)
    print "Accuracy score:"
    print metrics.accuracy_score(test_categories, y_nb_predicted)
    to_latex(test_categories, y_nb_predicted)  
def learnBModel(ip,label,tst,tst_label):
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english')
    X_train = vectorizer.fit_transform(ip.data)
    X_test = vectorizer.transform(tst.data)
    transformer = TfidfTransformer(use_idf=False).fit(X_train)
    tfidf_train = transformer.transform(X_train)
    tfidf_test = transformer.transform(X_test)
    classifier = BernoulliNB().fit(tfidf_train,label)
    predicted_BModel = classifier.predict(tfidf_test)
    evaluate_model(tst_label,predicted_BModel)
Example #23
 def nb_classifier(self, secret):
     clf = BernoulliNB()
     x = self.raw_attr_vector(secret)
     y = self.get_labels(secret)
     fsl = self.feature_sel(secret)
     new_x = fsl.transform(x)
     clf.fit(new_x, y)
     new_y = clf.predict(new_x)
     return clf, fsl, self.evaluate(new_y, y)
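raw_attr_vector, get_labels, and feature_sel are project-specific helpers; feature_sel presumably returns a fitted feature selector whose transform() keeps the informative columns. A typical pattern it might follow (hypothetical, using SelectKBest):

from sklearn.feature_selection import SelectKBest, chi2

def feature_sel(x, y, k=100):
    # fit a univariate selector; .transform() then keeps the k best features
    return SelectKBest(chi2, k=k).fit(x, y)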
Example #24
 def classify(self):
     '''
     using default classifiers to train and test  
     
     Returns
     -------
     label: the array of predicted result        
     '''
     
     if self.typeStr == 'NB':
         clf = BernoulliNB()
         clf.fit(self.X, self.Y)  
         self.y_hat = clf.predict(self.x)
     elif self.typeStr == 'Tree':
         clf = tree.DecisionTreeClassifier()
         clf.fit(self.X, self.Y)
         self.y_hat = clf.predict(self.x)
         
     return self.y_hat            
Example #25
class NaiveBayes:
    def train(self, x, y, weight = None):
        self.classifier = BernoulliNB()
        self.classifier.fit(x, y, sample_weight = weight)
    def predict(self, x):
        return self.classifier.predict(x)[0]
    def newInstance(self):
        return NaiveBayes()
    def name(self):
        return "NaiveBayes"
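A usage sketch for the NaiveBayes wrapper above; note that predict() returns only the label of the first sample:

nb = NaiveBayes()
nb.train([[0, 1], [1, 0], [1, 1]], [0, 1, 1])
print(nb.predict([[1, 0]]))  # scalar label for the single query row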
Example #26
def evaluate_baseline():
    inputs, outputs, words = preprocessing.build_data_target_matrices("aclImdb/train/pos", "aclImdb/train/neg", binary_output=True)
    tst_inputs, tst_outputs, _ = preprocessing.build_test_data_target_matrices("aclImdb/test/pos", "aclImdb/test/neg", words, binary_output=True)
    model = BernoulliNB()

    scores = cross_val_score(model, inputs, outputs.ravel(), cv=10)
    logging.info("Accuracy for %s: %.02f, std: %.02f" % ("Baseline BernoulliNB", scores.mean(), scores.std()))

    model.fit(inputs, outputs.ravel())
    logging.info(accuracy_score(tst_outputs.ravel(), model.predict(tst_inputs)))
Example #27
class NaiveBayes(StatModel):
	def __init__(self):
		self.name  = "nb"
		self.model = BernoulliNB()

	def train(self, samples, labels):
		self.model.fit(samples, labels)
				
	def predict(self, samples):
		return self.model.predict(samples)
Example #28
def test_BernouliNB():
    X = np.random.randint(2, size=(6, 100))
    print 'X ' + str(X)
    Y = np.array([1, 2, 3, 4, 4, 5])
    print 'Y ' + str(Y)
    clf = BernoulliNB()
    clf.fit(X, Y)
    for i in xrange(6):
        pred_ret = clf.predict(X[i])
        print 'X[' + str(i) + '] = ' + str(X[i]) + ' pred_ret ' + str(pred_ret)
Example #29
def learn_model(data,target):
    # preparing data for split validation. 80% training, 20% test
    state = randrange(1, 23432) + 123
    print "random state:"
    print state

    data_train,data_test,target_train,target_test = cross_validation.train_test_split(data,target,test_size=0.2,random_state=state)
    classifier = BernoulliNB().fit(data_train,target_train)
    predicted = classifier.predict(data_test)
    evaluate_model(target_test,predicted)
Example #30
File: NaiveBayes.py Project: AravindRam/ML
def naive_bayes(train,validation):

    #features
    season=['Fall','Spring','Summer','Winter']
    #season=['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
    district=['BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION','NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']
    time=['first','second','third']
    features2 = [x for x in range(0,24)]
    Minute=[x for x in range(100,160)]

    features=district+time+Minute+season+features2

    #split set into train, validation
    train,validation= train_test_split(train, train_size=0.9)
    model = BernoulliNB()
    model.fit(train[features],train['Category'])

    #time calculation
    stop = timeit.default_timer()
    print "Running time of naive bayes is ", stop - start

    predicted = np.array(model.predict_proba(validation[features]))
    model1=model.predict(validation[features])
    model2=model.predict(train[features])

    print "-----------------------------Naive Bayes----------------------------------------------------------------------------"
    print "Precision is ",precision_score(validation['Category'].values.tolist(),model1,average='macro')
    print "Recall is ",recall_score(validation['Category'].values.tolist(),model1,average='macro')
    print "Accuracy is ", accuracy_score(validation['Category'].values.tolist(),model1)
    print "Training Accuracy is ", accuracy_score(train['Category'].values.tolist(),model2)
    Category_new=[]
    for i in range(0,len(model1)):
        Category_new.append(le_crime.classes_[model1[i]])

    #store result into file
    result=pd.DataFrame(predicted, columns=le_crime.classes_)
    result['Predicted']=Category_new
    result.to_csv('naiveBayes_test.csv', index = True, index_label = 'Id' )


    #log loss function
    print "Log loss is", log_loss(validation['Category'],predicted,eps=1e-15, normalize=True, sample_weight=None)
Example #31
File: by_action.py Project: iihcy/Rob
class MyApp(QtGui.QMainWindow, Ui_MainWindow):
    def __init__(self):
        self.x_data = list()
        self.y_data = list()
        QtGui.QMainWindow.__init__(self)
        Ui_MainWindow.__init__(self)
        self.setupUi(self)
        self.dt_action.clicked.connect(self.ss)
        self.mm_Button01.clicked.connect(self.ptu01)
        self.mm_Button02.clicked.connect(self.ptu02)
        self.roc_Button.clicked.connect(self.pro)
        self.save_button.clicked.connect(self.out_model)
        # Add the standardization radio buttons to button group bg01
        self.bg01 = QtGui.QButtonGroup()
        self.bg01.addButton(self.s_radio_1, 1)
        self.bg01.addButton(self.s_radio_2, 2)
        # Select s_radio_1 by default
        self.s_radio_1.setChecked(True)
        # Add the data-split radio buttons to button group bg02
        self.bg02 = QtGui.QButtonGroup()
        self.bg02.addButton(self.d_radio_1, 1)
        self.bg02.addButton(self.d_radio_2, 2)
        # Select d_radio_1 by default
        self.d_radio_1.setChecked(True)
        # Add the three Bayes classifier radio buttons to button group bg03
        self.bg03 = QtGui.QButtonGroup()
        self.bg03.addButton(self.radioButton, 1)
        self.bg03.addButton(self.radioButton_2, 2)
        self.bg03.addButton(self.radioButton_3, 3)
        # Select radioButton by default
        self.radioButton.setChecked(True)

    def ss(self):
        self.bz()   # standardize
        self.stt()  # split the data set

        self.dtc()  # train and evaluate the classifier

    # Data standardization
    def bz(self):
        if self.bg01.checkedId() == 1:
            self.x = preprocessing.scale(self.x_data)
        else:
            min_max_scaler = preprocessing.MinMaxScaler()
            self.x = min_max_scaler.fit_transform(self.x_data)

    # Split into training and test data
    def stt(self):
        # Split both the features and the targets,
        # producing four data sets: x_train, x_test, y_train, y_test
        self.x_train = list()
        self.x_test = list()
        self.y_train = list()
        self.y_test = list()
        if self.bg02.checkedId() == 1:
            strte = self.tt_box.itemText(self.tt_box.currentIndex())
            s01 = str(strte).split(':')
            if len(s01) == 2:
                xnum = math.ceil((int(s01[0]) * 1.0 / 10) * len(self.x_data))

                for i in range(len(self.x_data)):
                    if i <= xnum:
                        self.x_train.append(self.x_data[i])
                        self.y_train.append(self.y_data[i])
                    else:
                        self.x_test.append(self.x_data[i])
                        self.y_test.append(self.y_data[i])
        else:
            ts01 = int(self.train.text())
            ts02 = int(self.test.text())
            for i in range(ts01 + ts02):
                if i < ts01:
                    self.x_train.append(self.x_data[i])
                    self.y_train.append(self.y_data[i])
                else:
                    self.x_test.append(self.x_data[i])
                    self.y_test.append(self.y_data[i])

    # Compute per-class classification metrics
    def cmm(self, cm):
        ls = dict()
        for i in range(len(cm)):
            tmp = list()
            tp = cm[i][i]
            fp = sum(cm.T[i]) - tp
            fn = sum(cm[i]) - tp
            tn = sum(sum(cm)) - tp - fp - fn
            # Metrics for this class as the positive label
            if tp != 0:
                TPR = float(tp) / (tp + fn)  # true positive rate
            else:
                TPR = 0
            if fn != 0:
                FNR = float(fn) / (tp + fn)  # false negative rate
            else:
                FNR = 0
            if fp != 0:
                FPR = float(fp) / (fp + tn)  # false positive rate
            else:
                FPR = 0
            if tn != 0:
                TNR = float(tn) / (tn + fp)  # true negative rate
            else:
                TNR = 0

            tmp.append(TPR)
            tmp.append(FNR)
            tmp.append(FPR)
            tmp.append(TNR)
            if tp != 0:
                P = float(tp) / (tp + fp)  # precision
                R = float(tp) / (tp + fn)  # recall
                F_score = 2 * P * R / (P + R)  # harmonic mean of precision and recall
            else:
                P = 0
                R = 0
                F_score = 0
            tmp.append(P)
            tmp.append(R)
            tmp.append(F_score)

            ls[self.labels[i]] = tmp
        return ls

    '''
    Check whether each parameter value is empty
    '''

    def para(self):
        # Check whether each parameter value is empty
        # M_alpha parameter
        if not self.mal_edit.text().strip():
            self.m_alpha = 1.0
        else:
            self.m_alpha = float(self.mal_edit.text())

        # M_fit_prior parameter
        if self.mfp_box.itemText(self.mfp_box.currentIndex()) == 'False':
            self.m_fit_prior = False
        else:
            self.m_fit_prior = True

        # B_alpha parameter
        if not self.bal_edit.text().strip():
            self.b_alpha = 1.0
        else:
            self.b_alpha = float(self.bal_edit.text())

        # binarize parameter
        if not self.bi_edit.text().strip():
            self.binarize = None
        else:
            self.binarize = float(self.bi_edit.text())

        # fit_prior parameter
        if self.bfp_box.itemText(self.bfp_box.currentIndex()) == 'False':
            self.b_fit_prior = False
        else:
            self.b_fit_prior = True

    '''
    Main routine
    '''

    def dtc(self):
        self.para()  # fetch all parameters
        # Flatten y to 1-D: self.y_train, self.y_test
        self.y01_train = list()
        self.y01_test = list()
        for a in range(len(self.y_train)):
            self.y01_train.append(self.y_train[a][0])
        for b in range(len(self.y_test)):
            self.y01_test.append(self.y_test[b][0])

        # Collect the distinct labels
        self.labels = list()
        for c in range(len(self.y_test)):
            if self.labels.count(self.y_test[c][0]) == 0:
                self.labels.append(self.y_test[c][0])
        print(self.labels)
        '''
        Bayes classifier selection
        '''
        if self.bg03.checkedId() == 1:
            self.clf = GaussianNB()
        elif self.bg03.checkedId() == 2:
            self.clf = MultinomialNB(alpha=self.m_alpha,
                                     fit_prior=self.m_fit_prior)
        else:
            self.clf = BernoulliNB(alpha=self.b_alpha,
                                   binarize=self.binarize,
                                   fit_prior=self.b_fit_prior)

        self.clf.fit(self.x_train, self.y01_train)

        self.y_pred = self.clf.predict(self.x_test)
        self.x_pred = self.clf.predict(self.x_train)
        '''
        Populate dtable01 with the per-class metrics for the training and test sets
        '''
        # Set the row and column counts
        self.dtable01.setRowCount(3 * len(self.labels))
        self.dtable01.setColumnCount(8)
        lab = [
            'True positive rate (TPR)', 'False negative rate (FNR)',
            'False positive rate (FPR)', 'True negative rate (TNR)',
            'Precision (PRE)', 'Recall (REC)', 'F-SCORE'
        ]

        # Training data
        xcm = confusion_matrix(self.y01_train, self.x_pred)
        self.train_precision = 0.0
        for i in range(len(xcm)):
            self.train_precision += xcm[i][i]
        self.train_precision = self.train_precision / sum(sum(xcm))
        print(xcm)

        # Test data
        tcm = confusion_matrix(self.y01_test, self.y_pred)
        self.test_precision = 0.0
        for i in range(len(tcm)):
            self.test_precision += tcm[i][i]
        self.test_precision = self.test_precision / sum(sum(tcm))
        print(tcm)
        # For each label as the positive class, compute TP/FP/FN/TN and store the metrics in a dict
        xls = self.cmm(xcm)
        tls = self.cmm(tcm)
        # Fill the table with the metric values for each class
        num = 0
        for key in xls:

            tmp01 = xls[key]
            tmp02 = tls[key]
            mlan = "Class: " + str(key)
            self.dtable01.setItem(num, 0,
                                  QtGui.QTableWidgetItem(mlan.decode('utf-8')))
            self.dtable01.setItem(num + 1, 0, QtGui.QTableWidgetItem('train'))
            self.dtable01.setItem(num + 2, 0, QtGui.QTableWidgetItem('test'))
            for j in range(len(tmp01)):

                self.dtable01.setItem(
                    num, j + 1, QtGui.QTableWidgetItem(lab[j].decode('utf-8')))
                self.dtable01.setItem(
                    num + 1, j + 1,
                    QtGui.QTableWidgetItem(str(round(tmp01[j], 2))))
                self.dtable01.setItem(
                    num + 2, j + 1,
                    QtGui.QTableWidgetItem(str(round(tmp02[j], 2))))
            num = num + 3
        '''
        Populate dtable02 with real vs. predicted labels on the training set
        '''
        # Set the row and column counts
        self.dtable02.setRowCount(len(self.x_pred))
        self.dtable02.setColumnCount(2)
        self.dtable02.setHorizontalHeaderLabels(['real', 'pred'])

        for s in range(len(self.x_pred)):
            if self.y01_train[s] == self.x_pred[s]:
                self.dtable02.setItem(
                    s, 0, QtGui.QTableWidgetItem(str(self.y01_train[s])))
                self.dtable02.setItem(
                    s, 1, QtGui.QTableWidgetItem(str(self.x_pred[s])))
                self.dtable02.item(s, 0).setBackgroundColor(
                    QtGui.QColor(214, 71, 0))
                self.dtable02.item(s, 1).setBackgroundColor(
                    QtGui.QColor(214, 71, 0))
            else:
                self.dtable02.setItem(
                    s, 0, QtGui.QTableWidgetItem(str(self.y01_train[s])))
                self.dtable02.setItem(
                    s, 1, QtGui.QTableWidgetItem(str(self.x_pred[s])))
        '''
        Populate dtable03 with real vs. predicted labels on the test set
        '''
        # Set the row and column counts
        self.dtable03.setRowCount(len(self.y_pred))
        self.dtable03.setColumnCount(2)
        self.dtable03.setHorizontalHeaderLabels(['real', 'pred'])

        for s in range(len(self.y_pred)):
            if self.y01_test[s] == self.y_pred[s]:
                self.dtable03.setItem(
                    s, 0, QtGui.QTableWidgetItem(str(self.y01_test[s])))
                self.dtable03.setItem(
                    s, 1, QtGui.QTableWidgetItem(str(self.y_pred[s])))
                self.dtable03.item(s, 0).setBackgroundColor(
                    QtGui.QColor(214, 71, 0))
                self.dtable03.item(s, 1).setBackgroundColor(
                    QtGui.QColor(214, 71, 0))
            else:
                self.dtable03.setItem(
                    s, 0, QtGui.QTableWidgetItem(str(self.y01_test[s])))
                self.dtable03.setItem(
                    s, 1, QtGui.QTableWidgetItem(str(self.y_pred[s])))
        '''
        Show the overall training and test precision in train_e / test_e
        '''
        self.train_e.setText(str(round(self.train_precision, 3)))
        self.test_e.setText(str(round(self.test_precision, 3)))

    # Save the model
    def out_model(self):
        self.filepath = str(
            QtGui.QFileDialog.getSaveFileName(self, "Save File", "F:/",
                                              "Model Files (*.model)"))
        joblib.dump(self.clf, self.filepath.decode('GB2312'))

    # Handler for mm_Button01
    def ptu01(self):
        # Plot the training-set confusion matrix window
        mm = mm_matrix.c_matrix()
        mm.labels = self.labels
        mm.y_true = self.y01_train
        mm.y_pred = self.x_pred
        mm.p_tu()

    # Handler for mm_Button02
    def ptu02(self):
        # Plot the test-set confusion matrix window
        mm = mm_matrix.c_matrix()
        mm.labels = self.labels
        mm.y_true = self.y01_test
        mm.y_pred = self.y_pred
        mm.p_tu()

    # Handler for roc_Button
    def pro(self):

        p_roc = proc_s.proc()
        p_roc.y_true = self.y01_test
        p_roc.y_pred = self.y_pred
        p_roc.labels = self.labels
        p_roc.mroc()
Example #32
def main():
    data = []
    with open('data-1_train.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            data.append(row)
    fields = data[0]
    data = np.array(data[1:], dtype=object)
    print(data.shape, fields)
    words = filterData(data)
    print(words.shape)

    x_train = []
    y_train = []
    for i in range(len(data)):
        x_train.append(words[i][1])
        y_train.append(data[i][4])
    x_train = np.array(x_train)
    y_train = np.array(y_train)

    # 10-Fold Cross Validation
    kf = KFold(n_splits=10)
    kf.get_n_splits(x_train)

    precision_nb = np.array([0.0, 0.0])
    recall_nb = np.array([0.0, 0.0])
    f_score_nb = np.array([0.0, 0.0])
    precision_svm = np.array([0.0, 0.0])
    recall_svm = np.array([0.0, 0.0])
    f_score_svm = np.array([0.0, 0.0])
    count = 1
    for train_index, test_index in kf.split(x_train):
        x_train_kf, x_test_kf = x_train[train_index], x_train[test_index]
        y_train_kf, y_test_kf = y_train[train_index], y_train[test_index]

        onehot_enc = MultiLabelBinarizer()
        onehot_enc.fit(x_train)

        bnbc = BernoulliNB(binarize=None)
        bnbc.fit(onehot_enc.transform(x_train_kf), y_train_kf)

        predicted_y = bnbc.predict(onehot_enc.transform(x_test_kf))
        print(onehot_enc.transform(x_test_kf))
        print(onehot_enc.transform(x_test_kf).shape)
        print('length of predicted', len(predicted_y))
        score = bnbc.score(onehot_enc.transform(x_test_kf), y_test_kf)

        precision_nb += precision_score(y_test_kf,
                                        predicted_y,
                                        labels=['-1', '1'],
                                        average=None)
        recall_nb += recall_score(y_test_kf,
                                  predicted_y,
                                  labels=['-1', '1'],
                                  average=None)
        f_score_nb += f1_score(y_test_kf,
                               predicted_y,
                               labels=['-1', '1'],
                               average=None)

        print(count, "Naive Bayesian Accuracy: ", score)
        # print(bnbc.predict(onehot_enc.transform(x_test_kf)))

        lsvm = LinearSVC()
        lsvm.fit(onehot_enc.transform(x_train_kf), y_train_kf)

        predicted_y = lsvm.predict(onehot_enc.transform(x_test_kf))
        precision_svm += precision_score(y_test_kf,
                                         predicted_y,
                                         labels=['-1', '1'],
                                         average=None)
        recall_svm += recall_score(y_test_kf,
                                   predicted_y,
                                   labels=['-1', '1'],
                                   average=None)
        f_score_svm += f1_score(y_test_kf,
                                predicted_y,
                                labels=['-1', '1'],
                                average=None)

        score = lsvm.score(onehot_enc.transform(x_test_kf), y_test_kf)
        print(count, "Linear SVM Accuracy: ", score)
        print("")
        count += 1

    print('NB Avg. Precisions', precision_nb / 10)

    print('NB Avg. Recalls', recall_nb / 10)

    print('NB Avg. F-Scores', f_score_nb / 10)

    print('SVM Avg. Precisions', precision_svm / 10)
    print('SVM Avg. Recalls', recall_svm / 10)
    print('SVM Avg. F-Scores', f_score_svm / 10)

    # Neural network
    batch_size = 300
    tf.reset_default_graph()

    vocab_len = len(onehot_enc.classes_)
    inputs_ = tf.placeholder(dtype=tf.float32,
                             shape=[None, vocab_len],
                             name="inputs")
    targets_ = tf.placeholder(dtype=tf.float32,
                              shape=[None, 3],
                              name="targets")

    h1 = tf.layers.dense(inputs_, 500, activation=tf.nn.relu)
    logits = tf.layers.dense(h1, 3, activation=None)
    output = tf.nn.sigmoid(logits)

    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                   labels=targets_))

    optimizer = tf.train.AdamOptimizer(0.001).minimize(loss)

    correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(targets_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32),
                              name='accuracy')

    sess = tf.Session()

    sess.run(tf.global_variables_initializer())

    for train_index, test_index in kf.split(x_train):
        x_train_kf, x_test_kf = x_train[train_index], x_train[test_index]
        y_train_kf, y_test_kf = y_train[train_index], y_train[test_index]

        for i in range(10):
            for x_batch, y_batch in get_batch(onehot_enc.transform(x_train_kf),
                                              label2bool(y_train_kf),
                                              batch_size):
                loss_value, _ = sess.run([loss, optimizer],
                                         feed_dict={
                                             inputs_: x_batch,
                                             targets_: y_batch
                                         })

        test_acc = sess.run(accuracy,
                            feed_dict={
                                inputs_: onehot_enc.transform(x_test_kf),
                                targets_: label2bool(y_test_kf)
                            })

        print("Test Accuracy: {}".format(test_acc))
Example #33
important_words.sort(reverse=True)

# In[245]:

important_words

# ## Report the classification accuracy and confusion matrix. Inspecting the weight vector of the logistic regression, what are the words that play the most important roles in deciding the sentiment of the reviews?

# In[246]:

clf = BernoulliNB()

# In[247]:

clf.fit(traindata_x, trainlabels_x)
k1 = clf.predict(traindata_x)
sklearn.metrics.accuracy_score(trainlabels_x,
                               k1,
                               normalize=True,
                               sample_weight=None)

# In[248]:

clf.fit(traindata_y, trainlabels_y)
k2 = clf.predict(traindata_y)
sklearn.metrics.accuracy_score(trainlabels_y,
                               k2,
                               normalize=True,
                               sample_weight=None)
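The heading above also asks for a confusion matrix; a minimal sketch reusing the stored predictions (same variable names as the cells above):

# In[ ]:

print(sklearn.metrics.confusion_matrix(trainlabels_x, k1))
print(sklearn.metrics.confusion_matrix(trainlabels_y, k2))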

# In[249]:
Example #34
# element 1 of each line in the training set holds the label
train_labels = [line[1] for line in lines]

# print(train_labels)

# use scikit learn to vectorize

from sklearn.feature_extraction.text import CountVectorizer

# instantiate sklearn's count vectorizer with binary features
count_vectorizer = CountVectorizer(binary=True)

# use the count vectorizer's fit_transform to convert the training documents
# into vectors of numbers indicating which words occur

train_documents = count_vectorizer.fit_transform(train_documents)

# print(train_documents) # prints only the nonzero entries of the sparse matrix

# Training phase

from sklearn.naive_bayes import BernoulliNB
classifier = BernoulliNB().fit(train_documents, train_labels)

# Testing phase

testing = classifier.predict(
    count_vectorizer.transform(["this is the worst movie"]))

print(testing)
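To see class probabilities instead of a hard label, the same fitted objects can be reused (sketch):

probabilities = classifier.predict_proba(
    count_vectorizer.transform(["this is the worst movie"]))
print(probabilities)  # one row per document, one column per class in classifier.classes_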
Example #35
        y_valid = dataset["yvalidate"]

        print("d = %d" % X.shape[1])
        print("n = %d" % X.shape[0])
        print("t = %d" % X_valid.shape[0])
        print("Num classes = %d" % len(np.unique(y)))

        model = NaiveBayes(num_classes=4)
        model.fit(X, y)
        y_pred = model.predict(X_valid)
        v_error = np.mean(y_pred != y_valid)
        print("Naive Bayes (ours) validation error: %.3f" % v_error)

        model2 = BernoulliNB()
        model2.fit(X, y)
        y_pred = model2.predict(X_valid)
        v_error2 = np.mean(y_pred != y_valid)
        print("Scikit-learn Bernoulli NB validation error: %.3f" % v_error2)

    elif question == '3':
        with open(os.path.join('..', 'data', 'citiesSmall.pkl'), 'rb') as f:
            dataset = pickle.load(f)

        X = dataset['X']
        y = dataset['y']
        Xtest = dataset['Xtest']
        ytest = dataset['ytest']
        for k in [1, 3, 10]:
            model = KNN(k)
            model.fit(X, y)
            y_pred = model.predict(Xtest)
Example #36
#clf1.predict([[33,57,57,84,57,84,84,58,33,68,68,68,68,68,68,68,68,68,57,89],[0,0,0,0,57,11,37,37,37,37,37,84,57,37,37,37,37,37,37,37]])
A = [
    33, 57, 57, 84, 57, 84, 84, 58, 33, 68, 68, 68, 68, 68, 68, 68, 68, 68, 57,
    89
]
B = [
    0, 0, 0, 0, 57, 11, 37, 37, 37, 37, 37, 84, 57, 37, 37, 37, 37, 37, 37, 37
]
#print('Size of A=',len(A))
#print('Size of B=',len(B))
for i in range((len(A)), 198):
    A.append(0)
for i in range((len(B)), 198):
    B.append(0)
#print('Size of A=',len(A))
clf1.predict([A, B])

# In[42]:

clf3 = MultinomialNB()
clf3.fit(tennis_data, output_class)
#clf3.predict([[33,57,57,84,57,84,84,58,33,68,68,68,68,68,68,68,68,68,57,89],[0,0,0,0,57,11,37,37,37,37,37,84,57,37,37,37,37,37,37,37]])
A = [
    33, 57, 57, 84, 57, 84, 84, 58, 33, 68, 68, 68, 68, 68, 68, 68, 68, 68, 57,
    89
]
B = [
    0, 0, 0, 0, 57, 11, 37, 37, 37, 37, 37, 84, 57, 37, 37, 37, 37, 37, 37, 37
]
#print('Size of A=',len(A))
#print('Size of B=',len(B))
Example #37
trainData = pd.read_table('./dataset1/train.txt',
                          header=None,
                          encoding='gb2312',
                          delim_whitespace=True)
testData = pd.read_table('./dataset1/test.txt',
                         header=None,
                         encoding='gb2312',
                         delim_whitespace=True)
trainLabel = np.array(trainData.pop(3))
trainData = np.array(trainData)
testLabel = np.array(testData.pop(3))
testData = np.array(testData)

time_start1 = time.time()
clf1 = BayesClassifier()
clf1.train(trainData, trainLabel)
clf1.predict(testData)
score1 = clf1.accuarcy(testLabel)
time_end1 = time.time()
print("Accuracy of self-Bayes: %f" % score1)
print("Runtime of self-Bayes:", time_end1 - time_start1)

time_start = time.time()
clf = BernoulliNB()
clf.fit(trainData, trainLabel)
clf.predict(testData)
score = clf.score(testData, testLabel, sample_weight=None)
time_end = time.time()
print("Accuracy of sklearn-Bayes: %f" % score)
print("Runtime of sklearn-Bayes:", time_end - time_start)
Example #38
from sklearn import cross_validation
from sklearn.metrics import confusion_matrix

#Only the file name
if __name__ == '__main__':
    for file in glob.glob(sys.argv[1] + '*.mat'):
        data = scipy.io.loadmat(file)

        #print("\nTraining Naive Bayes...")
        clf = BernoulliNB(alpha=0.2)

        ytrain = data['Ytrain'].T.reshape(data['Ytrain'].shape[1])
        Xtrain = data['Xtrain']
        Xval = data['Xval']
        clf.fit(Xtrain, ytrain)
        predict = clf.predict(Xval)

        yVal = data['Yval'].T.reshape(data['Yval'].shape[1])
        print "\nAccuracy: ", accuracy_score(yVal, predict)
        X_train = data["Xtrain"]
        X_val = data["Xval"]

        cm = confusion_matrix(yVal, predict)
        total = numpy.sum(cm, axis=1)

        if (cm.shape[0] < 2):
            acc = 1.0
        else:
            acc = []
            for i in range(total.shape[0]):
                if (total[i] > 0):
Example #39
        stop_words=stop_words(),
        #ngram_range=(1,2),
        #max_features=4000
        )

X.columns
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split

# Split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# My first Naive Bayes classifier!
clf = BernoulliNB()
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
print(np.mean([prediction == y_test]))

# 1 Train the classifier
clf = BernoulliNB()
clf.fit(X_train, y_train)

# 2 Predict the data (We need to tokenize the data using the same vectorizer object)
X_test = vectorizer.transform(test_df['text']).toarray()
prediction = clf.predict(X_test)

# 3 Create the results file
output = pd.DataFrame({'Party': prediction})
output.index.name = 'Id'
output.to_csv('sample_submission.csv')
Example #40
                                                    random_state=0)

# Scaling
sc = StandardScaler()

X_train = sc.fit_transform(x_train)
X_test = sc.transform(x_test)

# Model Building

# Choose the Bernoulli Naive Bayes algorithm to predict binary (boolean) values such as female or male
b_nb = BernoulliNB()

b_nb.fit(X_train, y_train.ravel())

y_pred = b_nb.predict(X_test)  # making prediction as always
"""
Naive Bayes  ->
    
    Bernoulli Naive Bayes   :   The naive Bayes training and classification algorithms for data 
                                that is distributed according to multivariate Bernoulli distributions. 
                                there may be multiple features but each one is assumed to be a binary-valued 
                                (Bernoulli, boolean) variable. 
    
    Gaussian Naive Bayes    :   If your data is increasing continuously you can implement
                                the Gaussian Naive Bayes algorithm for classification.
                            
    Multinomial Naive Bayes :   The naive Bayes algorithm for multinomially distributed data,
                                and is one of the two classic naive Bayes variants used in text classification
                                (where the data are typically represented as word vector counts, although 
                                tf-idf vectors are also known to work well in practice).
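# A minimal sketch contrasting the three variants on matching feature types;
# the toy arrays are illustrative assumptions, not data from this project.
import numpy as np
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB

y = np.array([0, 0, 1, 1])
X_binary = np.array([[1, 0], [1, 1], [0, 1], [0, 0]])  # presence/absence -> Bernoulli
X_counts = np.array([[3, 0], [2, 1], [0, 4], [1, 5]])  # word counts -> Multinomial
X_real = np.array([[0.1, 2.3], [0.4, 1.9], [5.2, 0.3], [4.8, 0.7]])  # continuous -> Gaussian

print(BernoulliNB().fit(X_binary, y).predict(X_binary))
print(MultinomialNB().fit(X_counts, y).predict(X_counts))
print(GaussianNB().fit(X_real, y).predict(X_real))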
예제 #41
0
import getopt
import sys
import time

from sklearn.feature_selection import SelectKBest, SelectFpr, f_classif
from sklearn.naive_bayes import BernoulliNB

def main(argv):
	
	start_time = time.time()
	print "running main()"

	bowfile = ''
	clsfile = ''
	tstfile = ''

	# Parse arguments
	try:
		opts, args = getopt.getopt(argv,"b:c:t:T:k:a:",["bow=","cls=","tst=","tstcls=","k=","alpha="])
	except getopt.GetoptError:
		print('Usage: \n python naiveBayes.py -b <bagofwords_csv> -c <classes_txt> -t <tst_bagofwords_csv> -T <tst_classes_txt> -k <kth_best> -a <alpha>')
		sys.exit(2)
	for opt, arg in opts:
		if opt == '-h':
			print('Usage: \n python naiveBayes.py -b <bagofwords_csv> -c <classes_txt> -t <tst_bagofwords_csv> -T <tst_classes_txt> -k <kth_best> -a <alpha>')
			sys.exit()
		elif opt in ("-b", "--bow"):
			bowfile = arg
		elif opt in ("-c", "--cls"):
			clsfile = arg
		elif opt in ("-t", "--tst"):
			tstfile = arg
		elif opt in ("-T", "--tstcls"):
			tstclsfile = arg
		elif opt in ("-k", "--k"):
			k = int(arg)
		elif opt in ("-a", "--alpha"):
			alpha = float(arg)

	# Get bag of words array and sentiment array
	bow = read_bagofwords_dat(bowfile)
	cls = read_class_values_dat(clsfile)
	tst = read_bagofwords_dat(tstfile)
	tstcls = read_class_values_dat(tstclsfile)

	# Standard model
	model = BernoulliNB()
	model.fit(bow, cls)

	# Predict test set
	predict = model.predict_proba(tst)
	classes = model.predict(tst)

	# Write to file
	write_probs_to_file("./BNBbi_standard_probs.txt", predict)
	write_classes_to_file("./BNBbi_standard_classes.txt", classes)

	# KBest model
	sel = SelectKBest(f_classif, k=k)
	sel.fit(bow, cls)
	bowmod = sel.transform(bow)
	tstmod = sel.transform(tst)
	varmodel = BernoulliNB()
	varmodel.fit(bowmod, cls)

	varpredict = varmodel.predict_proba(tstmod)
	varclasses = varmodel.predict(tstmod)

	# Write to file
	write_probs_to_file("./BNBbi_kbest_probs.txt", varpredict)
	write_classes_to_file("./BNBbi_kbest_classes.txt", varclasses)

	# FPR model
	sel = SelectFpr(f_classif, alpha=alpha)
	sel.fit(bow, cls)
	bowmod = sel.transform(bow)
	tstmod = sel.transform(tst)
	varmodel = BernoulliNB()
	varmodel.fit(bowmod, cls)
	varpredict = varmodel.predict_proba(tstmod)
	varclasses = varmodel.predict(tstmod)

	# Write to file
	write_probs_to_file("./BNBbi_fpr_probs.txt", varpredict)
	write_classes_to_file("./BNBbi_fpr_classes.txt", varclasses)

	# Runtime
	print('Runtime:', str(time.time() - start_time))
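# For reference: with f_classif, SelectKBest keeps the k features with the
# highest ANOVA F-scores, while SelectFpr keeps every feature whose test
# p-value falls below alpha, controlling the false positive rate instead of
# fixing a feature count.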
예제 #42
0
import pickle
import sqlite3

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.preprocessing import LabelEncoder


def create_and_save_model():

    # In[3]:

    data = pd.read_csv('character-predictions_pose.csv')
    data4 = pd.read_csv('uci-news-aggregator.csv')

    # In[4]:

    # Label-encode every column up front to avoid the
    # 'could not convert string to float' error on this dataset;
    # astype(str) makes mixed-type columns safe to encode.
    for column in data.columns:
        le = LabelEncoder()
        data[column] = le.fit_transform(data[column].astype(str))

    for column in data4.columns:
        le = LabelEncoder()
        data4[column] = le.fit_transform(data4[column].astype(str))

    # In[5]:

    x = data.drop('isAlive', axis=1)
    y = data['isAlive']
    x2 = data4.drop('CATEGORY', axis=1)
    y2 = data4['CATEGORY']

    # In[6]:

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    x2_train, x2_test, y2_train, y2_test = train_test_split(x2,
                                                            y2,
                                                            test_size=0.2)

    # In[7]:

    gnb1 = GaussianNB()
    gnb2 = MultinomialNB()
    gnb3 = BernoulliNB()

    gnb1.fit(x_train, y_train)
    y_pred = gnb1.predict(x_test)
    print(gnb1.score(x_test, y_test))

    # In[8]:

    gnb2.fit(x2_train, y2_train)
    y2_pred = gnb2.predict(x2_test)
    print(gnb2.score(x2_test, y2_test))

    # In[9]:

    gnb3.fit(x_train, y_train)
    y_pred = gnb3.predict(x_test)
    print(gnb3.score(x_test, y_test))

    gnb3.fit(x2_train, y2_train)
    y3_pred = gnb3.predict(x2_test)
    print(gnb3.score(x2_test, y2_test))
    y3_pred

    # In[17]:
    # "DROP TABLE models;"
    data = pd.read_csv('character-predictions_pose.csv')
    data4 = pd.read_csv('uci-news-aggregator.csv')

    x = data.drop('isAlive', axis=1)
    y = data['isAlive']
    x2 = data4.drop('CATEGORY', axis=1)
    y2 = data4['CATEGORY']

    # In[6]:

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    x2_train, x2_test, y2_train, y2_test = train_test_split(x2,
                                                            y2,
                                                            test_size=0.2)
    database = "/Users/ziba/PycharmProjects/try6/realtime/database.db"
    sql_create_models_table = """CREATE TABLE IF NOT EXISTS models(
                                    id integer PRIMARY KEY,
                                    name text NOT NULL,
                                    model BLOB NOT NULL);"""
    conn = sqlite3.connect(database)
    # create models table
    cur = conn.cursor()
    cur.execute(sql_create_models_table)
    # Here we force pickle to use the efficient binary protocol
    # (protocol=2). This means you absolutely must use an SQLite BLOB field
    # and make sure you use sqlite3.Binary() to bind a BLOB parameter.
    modelName1 = "Gaussian"
    modelName2 = "Multinomial"
    modelName3 = "Bernoulli"
    cur.execute("insert into models(name,model) values (?,?)", (
        modelName1,
        sqlite3.Binary(pickle.dumps(gnb1, protocol=2)),
    ))
    cur.execute("insert into models(name,model) values (?,?)", (
        modelName2,
        sqlite3.Binary(pickle.dumps(gnb2, protocol=2)),
    ))
    cur.execute("insert into models(name,model) values (?,?)", (
        modelName3,
        sqlite3.Binary(pickle.dumps(gnb3, protocol=2)),
    ))
    conn.commit()  # persist the inserted rows
    # With the old pickle protocol (protocol=0, formerly the default), plain
    # ASCII bytestrings can be sent to SQLite directly:
    # cur.execute("insert into models(name,model) values (?,?)", (pickle.dumps(gnb1, protocol=0),))

    # Fetch a model BLOB back from SQLite, selecting by name so the loaded
    # model matches the features we predict on (gnb3, "Bernoulli", was the
    # last estimator fit on the x2 feature set).
    nameOfModel = "Bernoulli"
    cur.execute("SELECT model FROM models WHERE name=?", (nameOfModel,))
    for row in cur:
        serializedModel = row[0]
        # Deserialize the BLOB back into a Python object; pickle.loads()
        # takes a bytestring.
        loadedModel = pickle.loads(serializedModel)
        y_pred = loadedModel.predict(x2_test)
        return y_pred
예제 #43
0
param_grid = [{
    'alpha': alpha_list,
}]

clf = GridSearchCV(MultinomialNB(), param_grid, cv=5)
clf.fit(traindata, trainlabel)
print("best param: {0}\nbest score: {1}".format(clf.best_params_,
                                                clf.best_score_))

# In[28]:

# Bernoulli naive Bayes
from sklearn.naive_bayes import BernoulliNB
ber_model = BernoulliNB(alpha=0.001)
ber_model.fit(traindata, trainlabel)
ber_predict = ber_model.predict(testdata)
print("BernoulliNB text classification accuracy:",
      metrics.accuracy_score(testlabel, ber_predict))

# In[29]:

# Gaussian naive Bayes classifier
gauss_model = GaussianNB()
gauss_model.fit(traindata.toarray(), trainlabel)
gauss_predict = gauss_model.predict(testdata.toarray())
print("GaussianNB text classification accuracy:",
      metrics.accuracy_score(testlabel, gauss_predict))
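# The same search could tune BernoulliNB's smoothing instead of the
# hard-coded alpha=0.001 above; a sketch reusing alpha_list and the
# GridSearchCV already in scope (not part of the original run):
ber_search = GridSearchCV(BernoulliNB(), [{'alpha': alpha_list}], cv=5)
ber_search.fit(traindata, trainlabel)
print("best BernoulliNB alpha:", ber_search.best_params_['alpha'])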

# In[1]:

# Create a file
import os
예제 #44
0
def nb():
    naive_bayes = BernoulliNB()
    naive_bayes.fit(train_x, train_y)
    return accuracy(
        test_y,
        naive_bayes.predict(test_x)), naive_bayes.predict_proba(test_x), test_y
예제 #45
0
# Alright, guess I'll just make my own grid-search-style thing, with the
# train/test split looped in as well.

from sklearn.model_selection import KFold, cross_val_score

k_fold = KFold(n_splits=10, shuffle=True, random_state=123)
model = BernoulliNB(alpha=1)
print(cross_val_score(model, X, y, cv=k_fold, n_jobs=1))

# Going by best AUC, the winner is "the default": alpha=1 and binarize=0.
model = BernoulliNB(alpha=1)

# Train the model using the training sets
model.fit(x_train_sm, y_train_sm.ravel())

#prediction
prediction = model.predict(x_test)

#Metrics
print(
    "\n\n Welcome to Naive Bayes. \n It 'naively' assumes independence between variables. \n Its best feature is being very quick and relatively easy to build.\n Used mostly in text classification and recommender systems, \n and here we can see that it is awful \n\n"
)

print("CONFUSION MATRIX: \n", skmet.confusion_matrix(y_test, prediction))
print("\n CLASSIFICATION REPORT:\n\n",
      skmet.classification_report(y_test, prediction))
print('ACCURACY -> ', round(100 * skmet.accuracy_score(y_test, prediction),
                            2), '%')

print("recall:", skmet.recall_score(y_test, prediction))
print("precision:", skmet.precision_score(y_test, prediction))
print("f1_score:", skmet.f1_score(y_test, prediction))
예제 #46
0
    x, y = l.split(' ')
    Y_test.append(y)
    temp = open(path4 + x, 'r')
    temp = temp.read()
    X_test.append(temp)

x_train = X_train
y_train = Y_train
x_test = X_test
y_test = Y_test

print("bow initiated")

vect = fe.text.CountVectorizer(max_features=2000)
X_train_dtm = vect.fit_transform(x_train)
# pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names())
X_test_dtm = vect.transform(x_test)
# pd.DataFrame(X_test_dtm.toarray(), columns=vect.get_feature_names())

tf_trans = fe.text.TfidfTransformer()
X_train_tfidf = tf_trans.fit_transform(X_train_dtm)
X_test_tfidf = tf_trans.transform(X_test_dtm)

# creating and training a Bernoulli naive Bayes model
print("training begins")
BNBC = BernoulliNB()
BNBC.fit(X_train_tfidf, y_train)
print("test begins")
y_predicted = BNBC.predict(X_test_tfidf)

print(ac(y_test, y_predicted))
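# Worth noting: BernoulliNB binarizes its input at 0 by default, so the
# tf-idf weights above collapse to presence/absence indicators; the model
# effectively sees a binary bag of words, and MultinomialNB is the variant
# that exploits tf-idf magnitudes.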
예제 #47
0
Gauss.fit(x_train, y_train)

# In[11]:

y_predicted_g = Gauss.predict(x_test)
print(mean_squared_error(y_predicted_g, y_test))
print(y_test.values[1])

# In[12]:

Bern = BernoulliNB()
Bern.fit(x_train, y_train)

# In[13]:

y_predicted_b = Bern.predict(x_test)
print(mean_squared_error(y_predicted_b, y_test))

# In[14]:

Mult = MultinomialNB()
Mult.fit(x_train, y_train)

# In[15]:

y_predicted_m = Mult.predict(x_test)
print(mean_squared_error(y_predicted_m, y_test))

# In[16]:

g = 0
예제 #48
0
X_train = matr[0:150, 1:]
y_train = matr[0:150, 0]
X_test = matr[150:194, 1:]
y_test = matr[150:194, 0]

from sklearn.preprocessing import StandardScaler

# Note: X_all comes from earlier in the source; the scaled copy is not used
# by the train/test matrices above, which keep the raw features.
scaler = StandardScaler()
X_all = scaler.fit_transform(X_all)

from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB()
model.fit(X_train, y_train)
expected = y_test
predicted = model.predict(X_test)
print(predicted)

# Import the random forest package
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=10,
                               criterion='gini',
                               max_depth=None,
                               min_samples_split=2,
                               min_samples_leaf=1,
                               min_weight_fraction_leaf=0.0,
                               max_features='auto',
                               max_leaf_nodes=None,
                               bootstrap=True,
                               oob_score=False,
예제 #49
0
# In[108]:


bnb = BernoulliNB()


# In[109]:


bnb.fit(X_train, y_train)


# In[110]:


y_pred = bnb.predict(X_test)


# In[111]:


confusion_matrix(y_test, y_pred)


# In[112]:


pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)


# In[113]:
예제 #50
0
			verbose=1,
			population_size=50,
			gene_mutation_prob=0.10,
			gene_crossover_prob=0.5,
			tournament_size=3,
			generations_number=6,
			n_jobs=4)
	rnds.fit(X_train_little, y_train_little)
	# summarize the results of the random parameter search
	print(rnds.best_score_)
	print('\nalpha: ')
	print(rnds.best_estimator_.alpha)
	# apply best parameters
	nbb = BernoulliNB(alpha=rnds.best_estimator_.alpha)
	nbb.fit(X_train_pca,y_train)
	pred = nbb.predict(X_test_pca)
	pred_train = nbb.predict(X_train_pca)
if learner:
	# Random Forest
	from sklearn.ensemble import RandomForestClassifier
	min_samples_leaf_r = np.round(np.linspace(1, 80, 30))
	min_samples_leaf_range = min_samples_leaf_r.astype(int)
	max_depth_range = np.round(np.linspace(5, 15, 30))
	param_dist = dict(min_samples_leaf=min_samples_leaf_range, max_depth=max_depth_range)
	cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
	rnds = EvolutionaryAlgorithmSearchCV(
			estimator=RandomForestClassifier(n_estimators=int(1 + num_features / 2)),
			params=param_dist,
			scoring="f1",
			cv=cv,
			verbose=1,
예제 #51
0
plt.plot(fprTREE, tprTREE, '', label="Flight, auc= %0.2f" % aucTREE)
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive')
plt.ylabel('True Positive')
plt.legend(loc=4)
plt.show()
'''
######################################## NAIVE BAYES ########################################
'''

from sklearn.naive_bayes import BernoulliNB  # import the algorithm's BernoulliNB class
classificadorNB = BernoulliNB()
classificadorNB.fit(
    previsores_treinamento,
    classe_treinamento)  # train the algorithm (builds the probability table)
previsoesNB = classificadorNB.predict(
    previsores_teste)  # evaluate on the test data to measure the hit rate
# Returns the mean accuracy on the given test data and labels.
print("Test score is {}".format(
    classificadorNB.score(previsores_teste, classe_teste)))
# Returns the mean accuracy on the given training data and labels.
print("Training score is {}".format(
    classificadorNB.score(previsores_treinamento, classe_treinamento)))

# Metric that combines precision and recall values
f1NB = f1_score(classe_teste, previsoesNB, average='micro')

# Confusion matrix comparing predicted and true classes
matrizNB = confusion_matrix(classe_teste, previsoesNB)

# Result of cross-validation with 3 folds (k=3)
resultado_cvNB = cross_val_score(classificadorNB, previsores, classe, cv=3)
예제 #52
0
def BernoulliNB_Text(X, Y, testcase):
    clf = BernoulliNB()
    clf.fit(X, Y)
    result = clf.predict(testcase)
    return result
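# A hypothetical call with binary feature rows (the arrays below are
# illustrative only, not data from the original project):
#   BernoulliNB_Text([[1, 0, 1], [0, 1, 0]], [1, 0], [[1, 0, 0]])  # -> array([1])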
예제 #53
0
def document_features(document):
    document_words = set(document)
    features = [1 if word in document_words else 0 for word in word_features]
    return features

train_features = [document_features(d) for (d,c) in train_docs]  # creates feature sets with features and classes
train_labels = [c for (d,c) in train_docs]
test_features = [document_features(d) for (d,c) in test_docs]  # creates feature sets with features and classes
test_labels = [c for (d,c) in test_docs]

#%% TRAIN CLASSIFIER

## Bernoulli NB
clf_country = BernoulliNB()
clf_country.fit(train_features, train_labels)
test_predictions = clf_country.predict(test_features)
accuracy = np.mean(test_predictions == test_labels)

print("Accuracy: %f" % (accuracy))
print(classification_report(test_labels, test_predictions))

#%%  trying with MULTINOMIAL NAIVE BAYES

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
예제 #54
0
                u'\ud83c[\udf00-\udfff]|'
                u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
                u'[\u2600-\u26FF\u2700-\u27BF])+', re.UNICODE)
            twt = myre.sub(r'', twt)
            twt = re.sub('[\s]+', ' ', twt)  # remove additional white spaces

            return twt

        outtweets = processTweet(texts)  # texts is already a str in Python 3

        print(outtweets)

        test_vectors = vectorizer.transform([outtweets])

        prediction_nb = classifier_nb.predict(test_vectors)
        prediction_rf = classifier_rf.predict(test_vectors)
        prediction_sgd = classifier_sgd.predict(test_vectors)
        prediction_svm = classifier_svm.predict(test_vectors)
        prediction_dt = classifier_dt.predict(test_vectors)

        #if prediction_nb==1 or prediction_svm==1 or prediction_nb==1 or prediction_rf==1 or prediction_sgd:
        test_vectors2 = vectorizer2.transform([outtweets])

        prediction_nb2 = classifier_nb2.predict(test_vectors2)
        prediction_rf2 = classifier_rf2.predict(test_vectors2)
        prediction_sgd2 = classifier_sgd2.predict(test_vectors2)
        prediction_svm2 = classifier_svm2.predict(test_vectors2)
        prediction_dt2 = classifier_dt2.predict(test_vectors2)

        #NB
예제 #55
0
########################### bnb(BernoulliNB) ################################
print('BernoulliNB stacking')
stack_train = np.zeros((train_feature.shape[0], number))
stack_test = np.zeros((test_feature.shape[0], number))
score_va = 0

for i, (tr, va) in enumerate(kfold.split(train_feature, score)):
    print('stack:%d/%d' % ((i + 1), n_folds))
    bnb = BernoulliNB()
    bnb.fit(train_feature[tr], score[tr])
    score_va = bnb.predict_proba(train_feature[va])
    score_te = bnb.predict_proba(test_feature)
    print(score_va)
    print('score: ' +
          str(mean_squared_error(score[va], bnb.predict(train_feature[va]))))
    stack_train[va] += score_va
    stack_test += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
    df_stack['advertiser_id_tfidf_bnb_classfiy_{}'.format(i)] = np.around(
        stack[:, i], 6)
df_stack.to_csv(path +
                'feature/advertiser_id_tfidf_bnb_error_single_classfiy.csv',
                index=None,
                encoding='utf8')
print('BernoulliNB features saved\n')
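# Pattern recap: this is out-of-fold stacking. Each fold's model scores the
# held-out rows (stack_train) and the full test set (stack_test); the test
# predictions are averaged over the folds, and the resulting class
# probabilities are saved as new features for a downstream model.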

########################### mnb(MultinomialNB) ################################
예제 #56
0
#                Predicted Fact     Predicted Opinion      Total
# Actual Fact:        1007                    114            1121
# Actual Opinion:      102                   1008            1110
#                     1109                   1122

#==============================================================================
# NAIVE BAYES
#==============================================================================
from sklearn.naive_bayes import BernoulliNB
nb_classifier, precision, recall, accuracy, f1 = test_classifier(
    train[features], y_train, test[features], test['y_label'], BernoulliNB())
nb_cv_scores = cv(BernoulliNB(), data[features], data['y_label'])

nb_classifier = BernoulliNB().fit(train[features], train['y_label'])
nb_preds_epos = nb_classifier.predict(test[features])
nb_cm = metrics.confusion_matrix(test['y_label'], nb_preds_epos)
print_cm(nb_cm)

#                Predicted Fact     Predicted Opinion      Total
# Actual Fact:         895                    226            1121
# Actual Opinion:       77                   1033            1110
#                      972                   1259

#==============================================================================
# SUPPORT VECTOR MACHINE
#==============================================================================
from sklearn import svm

svm_classifier = svm.SVC()
svm_classifier, precision, recall, accuracy, f1 = test_classifier(
예제 #57
0
def Q3():
    """
    3.	Perform classification of ‘Ecoli’ dataset using \
        all Naive Bayes / KNN / Decision Tree / Random Forests / SVM methods. \
        (15 points)
        A. Which of the methods show the best performance \
            in terms of accuracy? Explain why \
            (use 5-fold cross validation and report averaged performance). \
            (10 points)
        B. In random forests model, list THREE most important attributes \
            in classifying localization site. (5 points)

    Attribute Information:
        1. Sequence Name: Accession number for the SWISS-PROT database
        2. mcg: McGeoch's method for signal sequence recognition.
        3. gvh: von Heijne's method for signal sequence recognition.
        4. lip: von Heijne's Signal Peptidase II consensus sequence score. \
            Binary attribute.
        5. chg: Presence of charge on N-terminus of predicted lipoproteins. \
            Binary attribute.
        6. aac: score of discriminant analysis of the amino acid content of \
            outer membrane and periplasmic proteins.
        7. alm1: score of the ALOM membrane spanning region prediction program.
        8. alm2: score of ALOM program after excluding \
            putative cleavable signal regions from the sequence.
        (9. protein localization sites)
    """

    f3 = "ecoli.data"

    df = pd.read_csv(
        f3,
        sep=r"\s+",  # whitespace-delimited (delim_whitespace is deprecated)
        header=None,
        names=[
            "Sequence Name",
            "mcg",
            "gvh",
            "lip",
            "chg",
            "aac",
            "alm1",
            "alm2",
            "protein localization sites",
        ],
    )
    # print(df.head())
    # print(df.describe())

    df_x = df.iloc[:, 1:-1]
    df_y = df.iloc[:, -1]

    kf = KFold(n_splits=5, shuffle=True)
    accuracy_score_list_GNB = []
    accuracy_score_list_BNB = []
    accuracy_score_list_MNB = []
    accuracy_score_list_KNN = []
    accuracy_score_list_DT = []
    accuracy_score_list_RF = []
    accuracy_score_list_SVM = []
    features = df_x.columns

    for idx_train, idx_test in kf.split(df_x):
        x_train, x_test = df_x.to_numpy()[idx_train], df_x.to_numpy()[idx_test]
        y_train, y_test = df_y.to_numpy()[idx_train], df_y.to_numpy()[idx_test]

        GNB_model = GaussianNB().fit(x_train, y_train)
        y_GNB = GNB_model.predict(x_test)
        BNB_model = BernoulliNB().fit(x_train, y_train)
        y_BNB = BNB_model.predict(x_test)
        MNB_model = MultinomialNB().fit(x_train, y_train)
        y_MNB = MNB_model.predict(x_test)
        KNN_model = KNeighborsClassifier().fit(x_train, y_train)
        y_KNN = KNN_model.predict(x_test)
        DT_model = DecisionTreeClassifier().fit(x_train, y_train)
        y_DT = DT_model.predict(x_test)
        RF_model = RandomForestClassifier(n_jobs=-1).fit(x_train, y_train)
        y_RF = RF_model.predict(x_test)
        SVM_model = SVC(kernel="rbf").fit(x_train, y_train)  # high-dimension
        y_SVM = SVM_model.predict(x_test)

        accuracy_score_list_GNB.append(accuracy_score(y_test, y_GNB))
        accuracy_score_list_BNB.append(accuracy_score(y_test, y_BNB))
        accuracy_score_list_MNB.append(accuracy_score(y_test, y_MNB))
        accuracy_score_list_KNN.append(accuracy_score(y_test, y_KNN))
        accuracy_score_list_DT.append(accuracy_score(y_test, y_DT))
        accuracy_score_list_RF.append(accuracy_score(y_test, y_RF))
        accuracy_score_list_SVM.append(accuracy_score(y_test, y_SVM))

        importances = RF_model.feature_importances_
        indices = np.argsort(importances)
        # print(features[indices], importances[indices])
        # Rank of attributes are always same in every fold.

    print("Accuracy score using Gaussian Naive Bayes: %.2f" %
          mean(accuracy_score_list_GNB))
    print("Accuracy score using Bernoulli Naive Bayes: %.2f" %
          mean(accuracy_score_list_BNB))
    print("Accuracy score using Multinomial Naive Bayes: %.2f" %
          mean(accuracy_score_list_MNB))
    print("Accuracy score using KNN: %.2f" % mean(accuracy_score_list_KNN))
    print("Accuracy score using Decision Tree: %.2f" %
          mean(accuracy_score_list_DT))
    print("Accuracy score using Random Forests: %.2f" %
          mean(accuracy_score_list_RF))
    print("Accuracy score using SVM: %.2f" % mean(accuracy_score_list_SVM))
    # Accuracy score using Gaussian Naive Bayes: 0.76
    # Accuracy score using Bernoulli Naive Bayes: 0.42
    # Accuracy score using Multinomial Naive Bayes: 0.43
    # Accuracy score using KNN: 0.86
    # Accuracy score using Decision Tree: 0.80
    # Accuracy score using Random Forests: 0.87
    # Accuracy score using SVM: 0.87

    # np.argsort sorts ascending, so the most important attributes sit at the
    # end; reverse before taking the top three.
    important_attributes = features[indices][::-1][:3].tolist()
    print("THREE most important attributes: ", important_attributes)
예제 #58
0
    y = y.astype('int')
    y = y.flatten()

    #Load test data
    z = np.load('data/test_encoded_array_new.npy')
    t = np.load('data/test_target_array_new.npy')
    t = t.astype('int')
    t = t.flatten()

    #Predict using Naive Bayes Model
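    # Note: BernoulliNB binarizes features at 0 by default, and NMF outputs
    # are non-negative, so most components map to 1; raising the threshold,
    # e.g. BernoulliNB(alpha=1, binarize=0.01), may be worth experimenting
    # with (the 0.01 value is an illustrative assumption).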
    clf = BernoulliNB(alpha=1)
    nmf = NMF(n_components=500, init='random', random_state=0)
    x_500d = nmf.fit_transform(x)
    z_500d = nmf.transform(z)
    clf.fit(x_500d, y)
    p = clf.predict(z_500d)

    # Compute training time
    endTime = datetime.datetime.now() - startTime
    print("Total time taken to train: ", endTime)
    print("\n")

    print("Bernoulli Naive Bayes with 500 features and alpha = 1")

    # Compute accuracy (as a percentage)
    accuracy = metrics.accuracy_score(t, p)
    print("Accuracy: ", accuracy * 100)

    # Confusion matrix
    confusion_matrix = metrics.confusion_matrix(t, p)
    print("Confusion Matrix:\n", confusion_matrix)
예제 #59
0
             #Upgrade the Posterior Probability of a packet being not an attack packet
             Test_Post_N = CalcPosterior(1-NewPrior, LLA, LLN)
             print "Test Posterior No attack", Test_Post_N
             attack_count = attack_count+ DetectAttack(PThres,Test_Post_A)
             
             i=i+1
             
           if i == 20:
             store_rtt=[]
             store_grat=[]
             i=0
             
          
    #We will give the training data to fit into the Naive Bayes Model
    
    ber = BernoulliNB()
    ber.fit(Train_Features, Train_Labels)

    testMat = np.zeros((Train_len,5),dtype='i,f,f,i,b')
    for rtt,rss,seq_flag,grat in zip(logtest_rtt,logtest_rss,logtest_seq,logtest_grat):
         testMat[j,1]=
         testMat[j,0]= ber.predict(testMat[j,:])  
         j=j+1
예제 #60
0
centers = clus.cluster_centers_
print("Centers:")
print(centers)
labels = clus.labels_
print("Labels:")
print(labels)
#print(UsersId)
#UsersId.astype(float)
#print(preprocessing.scale(UsersId))
#Predicting
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
clf.fit(SomeData, Popular)
print("Predict popularity according to likes, retweets and followers")
pred = clf.predict(SomeData)
print(pred)
#for s in pred:
 #   print(s)
print("Predict Probabilidade")
print(clf.predict_proba(SomeData))

from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
#dataNum = vec.fit_transform(SomeData).toarray()
#print(dataNum)