def test_BernouliNB2():
    X = np.array([
        [0, 1],
        [1, 1],
        [1, 0],
        [-1, 1],
        [1000, 1000],
        [1000, 10001],
        [998, 800],
        [990, 1100],
    ])
    print 'X ' + str(X)
    #Y = np.array([1, 1, 1, 1, 2, 2, 2, 2])
    Y = np.array([1, 2, 3, 4, 5, 6, 7, 8])
    print 'Y ' + str(Y)

    clf = BernoulliNB(alpha=1)
    clf.fit(X, Y)

    X2 = np.array([
        [1002, 1010],
        [1010, 910],
        [1003, 980],
        [1008, 1030],
        [-1, -1],
        [-3, -10],
        [40, 1],
        [1, -100],
    ])
    for i in xrange(len(X2)):
        #pred_ret = clf.predict_proba(X2[i])
        pred_ret = clf.predict(X2[i])
        # print the test sample X2[i] (the original printed X[i] by mistake)
        print 'X2[' + str(i) + '] = ' + str(X2[i]) + ' pred_ret ' + str(pred_ret)
def train_model(data, target):
    """
    Split the data into a training set and a test set,
    instantiate a Bernoulli Naive Bayes classifier, train it on the
    training set, and then evaluate the model on the test set.
    """
    # Using cross-validation
    # TO TRY: stratification for dividing preclassified tweets into homogeneous subgroups before
    # sampling, in order to improve the representativeness of the sampling (see the sketch below)
    train_tweets, validation_tweets, train_sentiment, validation_sentiment = \
        cross_validation.train_test_split(data, target, test_size=0.4)

    # Fit the Naive Bayes classifier with the training tweets and corresponding sentiment
    classifier = BernoulliNB().fit(train_tweets, train_sentiment)
    predicted = classifier.predict(validation_tweets)

    # Using the cross-validation split, evaluate the accuracy of the predicted tweets
    evaluate_model(validation_sentiment, predicted)

    # Pickle the classifier
    pickle_file = open('nb_classifier.pickle', 'wb')
    pickle.dump(classifier, pickle_file)
    pickle_file.close()

    return classifier
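# The "TO TRY" note above can be prototyped with the stratify argument of
# train_test_split. A minimal sketch, assuming the newer sklearn
# model_selection API (the snippet above uses the older cross_validation
# module); the function name is illustrative, not part of the original code.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

def train_model_stratified(data, target):
    # stratify=target keeps the sentiment class proportions identical
    # in the training and validation splits
    train_tweets, validation_tweets, train_sentiment, validation_sentiment = \
        train_test_split(data, target, test_size=0.4, stratify=target)
    classifier = BernoulliNB().fit(train_tweets, train_sentiment)
    predicted = classifier.predict(validation_tweets)
    return classifier, predicted, validation_sentiment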
def train(cutoffs):
    print "\n========== Start Training =========="

    if len(__TRAIN_DATA) == 3:
        list_io_addr = get_io_addr(__TRAIN_DATA[0], __TRAIN_DATA[1], __TRAIN_DATA[2])
    else:
        list_io_addr = get_io_addr_random_sample(__TRAIN_DATA[0], __TRAIN_DATA[1])

    clf = BernoulliNB(fit_prior=True)

    for i in range(len(list_io_addr)):
        path_in = list_io_addr[i]
        print "\nGenerating training set from {}".format(path_in)

        # the sparse matrix is stored in binary form, so open the file in binary mode
        with open(path_in, "rb") as file_in:
            X = Sparse_Matrix_IO.load_sparse_csr(file_in)

        if len(cutoffs) > 0:
            print "Discarding selected features......"
            X = discard_vars(X, cutoffs)

        vector_len = len(X[0])
        X_train = X[:, 0:vector_len-1]
        y_train = X[:, vector_len-1]
        print "Done"

        # sm = SMOTE(ratio=0.9)
        # X_train_sm, y_train_sm = sm.fit_sample(X_train, y_train)

        print "Fitting Model......"
        clf.partial_fit(X_train, y_train, classes=[0, 1])
        print "Done"

    # pickle needs a binary file handle as well
    with open(__ROOT_MODEL, "wb") as file_out:
        pickle.dump(clf, file_out)
def test_discretenb_predict_proba(): """Test discrete NB classes' probability scores""" # The 100s below distinguish Bernoulli from multinomial. X_bernoulli = [[1, 100, 0], [0, 1, 0], [0, 100, 1]] X_multinomial = [[0, 1], [1, 3], [4, 0]] # Confirm that the 100s above distinguish Bernoulli from multinomial y = [0, 0, 1] cls_b = BernoulliNB().fit(X_bernoulli, y) cls_m = MultinomialNB().fit(X_bernoulli, y) assert_not_equal(cls_b.predict(X_bernoulli)[-1], cls_m.predict(X_bernoulli)[-1]) # test binary case (1-d output) y = [0, 0, 2] # 2 is regression test for binary case, 02e673 for cls, X in zip([BernoulliNB, MultinomialNB], [X_bernoulli, X_multinomial]): clf = cls().fit(X, y) assert_equal(clf.predict(X[-1]), 2) assert_equal(clf.predict_proba(X[0]).shape, (1, 2)) assert_array_almost_equal(clf.predict_proba(X[:2]).sum(axis=1), np.array([1., 1.]), 6) # test multiclass case (2-d output, must sum to one) y = [0, 1, 2] for cls, X in zip([BernoulliNB, MultinomialNB], [X_bernoulli, X_multinomial]): clf = cls().fit(X, y) assert_equal(clf.predict_proba(X[0]).shape, (1, 3)) assert_equal(clf.predict_proba(X[:2]).shape, (2, 3)) assert_almost_equal(np.sum(clf.predict_proba(X[1])), 1) assert_almost_equal(np.sum(clf.predict_proba(X[-1])), 1) assert_almost_equal(np.sum(np.exp(clf.class_log_prior_)), 1) assert_almost_equal(np.sum(np.exp(clf.intercept_)), 1)
def main(output_file=time.strftime('%h%d-%Hh%Mm')+'.csv', in_pkl=None):
    """
    Generates features and fits classifier.
    Input command line argument is optional run name, defaults to date/time.
    """
    logging.info("Loading features...")
    if not in_pkl:
        return "input .pkl required"
    trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(in_pkl)
    logging.info("Loaded features, fitting model...")

    # Bernoulli Naive Bayes
    clf = BernoulliNB(alpha=1.0, binarize=None, fit_prior=True)
    clf.fit(trainFeatures, trainTargets)

    logging.info("Predicting...")
    # Use probabilities instead of binary class prediction in order to generate a ranking
    predicted_scores = clf.predict_log_proba(testFeatures).T[1]

    logging.info("Write results...")
    logging.info("Writing submission to %s" % output_file)
    f = open(output_file, "w")
    f.write("id\n")
    for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse=True):
        # only writes item_id per output spec, but may want to look at predicted_scores
        f.write("%d\n" % (item_id))
    f.close()
    logging.info("Done.")
def tryBinomialNaiveBayes(goFast):
    best_score = 0
    from sklearn.datasets import dump_svmlight_file, load_svmlight_file
    if goFast:
        training_data, training_labels = load_svmlight_file("dt1_1500.trn.svm", n_features=253659, zero_based=True)
        validation_data, validation_labels = load_svmlight_file("dt1_1500.vld.svm", n_features=253659, zero_based=True)
        testing_data, testing_labels = load_svmlight_file("dt1_1500.tst.svm", n_features=253659, zero_based=True)
    else:
        training_data, training_labels = load_svmlight_file("dt1.trn.svm")
        validation_data, validation_labels = load_svmlight_file("dt1.vld.svm")
        testing_data, testing_labels = load_svmlight_file("dt1.tst.svm")

    from sklearn.naive_bayes import BernoulliNB
    for alpha_value in [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
        for binarize_value in [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
            for fit_prior_value in [True, False]:
                binary_operator = BernoulliNB(alpha=alpha_value, binarize=binarize_value, fit_prior=fit_prior_value)
                binary_operator.fit(training_data, training_labels)
                current_score = binary_operator.score(validation_data, validation_labels)
                print "Current test: " + str(alpha_value), str(binarize_value), fit_prior_value
                print "Current score: " + str(current_score)
                if current_score > best_score:
                    best_score = current_score
                    print "***NEW MAXIMUM SCORE: " + str(best_score)
                    print "***NEW MAXIMUM PARAMETERS: " + str(alpha_value), str(binarize_value), fit_prior_value
    print "Best score was " + str(best_score)
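# The manual triple loop above can also be expressed as a grid search, which
# scores every alpha/binarize/fit_prior combination by cross-validation rather
# than against the single validation file. A minimal sketch assuming the
# training_data/training_labels arrays loaded above; cv=3 is an assumption.
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV

param_grid = {
    "alpha": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "binarize": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "fit_prior": [True, False],
}
search = GridSearchCV(BernoulliNB(), param_grid, cv=3)
search.fit(training_data, training_labels)
print("Best parameters: " + str(search.best_params_))
print("Best CV score: " + str(search.best_score_))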
def compareClassifiers(): (observations, classes) = createObservations() observations = np.array(observations) classes = np.array(classes) # make tree classifier my_tree = tree.DecisionTreeClassifier() my_tree.fit(observations, classes) tree_score = my_tree.score(observations, classes) tree_cv = cross_validation.cross_val_score(my_tree, observations, classes, scoring='accuracy', cv=10) #print "tree score:", tree_score, "tree cv", np.mean(tree_cv) # make naive classifier naive = BernoulliNB(binarize=None) naive.fit(observations, classes) naive_score = naive.score(observations, classes) naive_cv = cross_validation.cross_val_score(naive, observations, classes, scoring='accuracy', cv=10) #print "naive score:", naive_score, "naive cv", np.mean(naive_cv) # make SVM classifier svm = LinearSVC() svm.fit(observations, classes) svm_score = svm.score(observations, classes) svm_cv = cross_validation.cross_val_score(svm, observations, classes, scoring='accuracy', cv=10) #print "svm score:", svm_score, "svm cv", np.mean(svm_cv) # make Log classifier log = LogisticRegression() log.fit(observations, classes) log_score = log.score(observations, classes) log_cv = cross_validation.cross_val_score(log, observations, classes, scoring='accuracy', cv=10) #print "log score:", log_score, "log cv", np.mean(log_cv) return [(tree_score, np.mean(tree_cv)), (naive_score, np.mean(naive_cv)), (svm_score, np.mean(svm_cv)), (log_score, np.mean(log_cv))]
def main(): # Get the data and targets df = pd.read_csv('train1.csv') df = df[df.rating != 'rating'] corpus = [review for review in df.review] splitPoint = len(corpus)*2/3 trainingCorpus = corpus[:splitPoint] testCorpus = corpus[splitPoint:] target = [rating for rating in df.rating] trainingTarget = np.array(target[:splitPoint]) testTarget = np.array(target[splitPoint:]) # Train the algorithm train_X, vocabList = createVectorizer(trainingCorpus, 'None', True) NB_Bern_model = BernoulliNB().fit(train_X, trainingTarget) # Test the algorithm test_X = createVectorizer(testCorpus, vocabList, True) test_predict = NB_Bern_model.predict(test_X) print(np.mean(test_predict == testTarget)) print metrics.classification_report(testTarget, test_predict, target_names=['0', '1']) # Make Predictions predict_df = pd.read_csv('test2.csv') predictCorpus = [review for review in predict_df.review] member = [memberid for memberid in predict_df.ID] predict_X = createVectorizer(predictCorpus, vocabList, True) predictions = NB_Bern_model.predict(predict_X) predict_df.columns = ['ID', 'Predicted'] for i in range(len(member)): predict_df.loc[predict_df['ID'] == member[i], 'Predicted'] = predictions[i] predict_df.to_csv('submission1.csv', sep = ',', index=False)
def synergy_naive_bayes(data, target):
    # generate champion relations as binaries
    for i in xrange(len(data)):
        temp = []
        for j in xrange(len(data[i])):
            if data[i][j] == -1:
                temp.append(1)
            else:
                temp.append(0)
        for j in xrange(len(data[i])):
            if data[i][j] == 1:
                temp.append(1)
            else:
                temp.append(0)
        num_champ = 124
        for j in xrange(num_champ):
            for k in xrange(j, num_champ):
                temp.append(temp[j] * temp[k])
                temp.append(temp[j + num_champ] * temp[k + num_champ])
        data[i] = temp

    X = array(data)
    y = array(target)
    # shuffle X and y together and unpack; shuffling only the zipped list
    # would leave the X and y arrays used below in their original order
    combined = zip(X, y)
    shuffle(combined)
    X = array([pair[0] for pair in combined])
    y = array([pair[1] for pair in combined])

    gnb = BernoulliNB()
    y_pred = gnb.fit(X[:len(X) * 4 / 5], y[:len(y) * 4 / 5]).predict(X[len(X) * 4 / 5:])
    print (metrics.classification_report(y[len(y) * 4 / 5:], y_pred))
def combined_experiment(train_x,train_y,test_x,test_y,train_f_x,train_f_y,test_f_x,test_f_y, bias): labels = [] # Will contain all the final labels that result from the voting clf_c1 = MultinomialNB() clf_c1.fit(train_x,train_y) clf_c2 = BernoulliNB() clf_c2.fit(train_x,train_y) clf_f1 = svm.SVC(kernel='linear',cache_size = 512) clf_f1.fit(train_f_x,train_f_y) clf_f2 = svm.SVC(kernel='rbf',cache_size = 512) clf_f2.fit(train_f_x,train_f_y) p1 = clf_c1.predict(test_x) p2 = clf_c2.predict(test_x) p3 = clf_f1.predict(test_f_x) p4 = clf_f2.predict(test_f_x) if bias == 'content': for i in range(len(p1)): if p1[i] == p2[i] or p1[i] == p3[i]: labels.append(p1[i]) else: labels.append(p2[i]) elif bias == "syntax": for i in range(len(p1)): if p1[i] == p3[i] or p1[i] == p4[i]: labels.append(p1[i]) else: labels.append(p3[i]) else: print 'Please enter a valid bias ("syntax" or "content")!' p_combined = np.array(labels) accuracy = (np.sum(p_combined == test_y)/np.float_(len(test_y))) return accuracy
class NaiveBayesClassifierBernoulli:
    """
    This class encapsulates the Bernoulli Naive Bayes functionality of
    scikit-learn's BernoulliNB class.
    """

    def __init__(self, matrixFileName=matrixFilePath, dicFileName=dictFilePath):
        self.X, self.Y = load_svmlight_file(matrixFileName)
        self.dictionary = pickle.load(open(dicFileName, "rb"))
        self.bernoulliNB = BernoulliNB()
        self.bernoulliNB.fit(self.X, self.Y)
        self.matrixParser = Parser.MatrixParserForLearning()

    def classifyOneSentence(self, string):
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        if row is not None:
            return self.bernoulliNB.predict(row)
        else:
            return None

    def classifyOneSentenceWithProbability(self, string):
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        if row is not None:
            # return the margin between the positive- and negative-class probabilities
            a = self.bernoulliNB.predict_proba(row)
            return a[0][1] - a[0][0]
        else:
            return None
def naive_bayes(data, target):
    # change data to binary
    for i in xrange(len(data)):
        temp = []
        for j in xrange(len(data[i])):
            if data[i][j] == -1:
                temp.append(1)
            else:
                temp.append(0)
        for j in xrange(len(data[i])):
            if data[i][j] == 1:
                temp.append(1)
            else:
                temp.append(0)
        data[i] = temp

    X = array(data)
    y = array(target)
    # shuffle X and y together and unpack; shuffling only the zipped list
    # would leave the X and y arrays used below in their original order
    combined = zip(X, y)
    shuffle(combined)
    X = array([pair[0] for pair in combined])
    y = array([pair[1] for pair in combined])

    gnb = BernoulliNB()
    y_pred = gnb.fit(X[:len(X) * 4 / 5], y[:len(y) * 4 / 5]).predict(X[len(X) * 4 / 5:])
    print (metrics.classification_report(y[len(y) * 4 / 5:], y_pred))
def bernoulli_classify(): clf = BernoulliNB() traindata = [] traintarget = [] for f in glob.glob("../../../res/articles/training_data/*-articles.json"): target = f.replace("-articles.json", "") target = re.sub(r".*\/+", "", target) output = readWholeFileBernoulli(f, target) traindata.extend(output[0]) traintarget.extend(output[1]) testdata = [] testtarget = [] for f in glob.glob("../../../res/articles/test_data/*-articles.json"): target = f.replace("-articles.json", "") target = re.sub(r".*\/+", "", target) output = readWholeFileBernoulli(f, target) testdata.extend(output[0]) testtarget.extend(output[1]) clf.fit(traindata, traintarget) ncorrect = 0 total = len(testdata) for i in range(len(testdata)): predict = clf.predict(testdata[i]) correct = testtarget[i] if correct == predict[0]: ncorrect += 1 print ("Correct: {0} - Predicted: {1}".format(correct, predict[0])) print "Correct ", ncorrect, " Total ", total, " Correctness ", ncorrect * 1.0 / total
def NB_train_classifier(train_x, train_y):
    """
    Fits a Bernoulli Naive Bayes classifier on the training set
    and returns the trained classifier.
    """
    classifier = BernoulliNB()
    classifier.fit(train_x, train_y)
    return classifier
def MungeData(train, test): todrop = ['v22', 'v112', 'v125', 'v74', 'v1', 'v110', 'v47'] print(todrop) train.drop(todrop, axis=1, inplace=True) test.drop(todrop, axis=1, inplace=True) features = train.columns[2:] for col in features: if((train[col].dtype == 'object')): print(col) train, binfeatures = Binarize(col, train) test, _ = Binarize(col, test, binfeatures) nb = BernoulliNB() nb.fit(train[col+'_'+binfeatures].values, train.target.values) train[col] = \ nb.predict_proba(train[col+'_'+binfeatures].values)[:, 1] test[col] = \ nb.predict_proba(test[col+'_'+binfeatures].values)[:, 1] train.drop(col+'_'+binfeatures, inplace=True, axis=1) test.drop(col+'_'+binfeatures, inplace=True, axis=1) features = train.columns[2:] train[features] = train[features].astype(float) test[features] = test[features].astype(float) train.fillna(-1, inplace=True) test.fillna(-1, inplace=True) return train, test
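# The Binarize helper called in MungeData above is not defined in this
# snippet. Judging from how its output is used (dummy columns named
# col+'_'+binfeatures that are fed to BernoulliNB), it appears to one-hot
# encode a single categorical column. A hypothetical reconstruction under
# that assumption -- not necessarily the author's implementation:
import numpy as np

def Binarize(columnName, df, features=None):
    # encode the column as strings, then add one 0/1 indicator column per level
    df[columnName] = df[columnName].astype(str)
    if features is None:
        features = np.unique(df[columnName].values)
    for x in features:
        df[columnName + '_' + x] = df[columnName].map(lambda y: 1 if y == x else 0)
    return df, features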
def bnb_fit(train_data, train_lbl_data):
    from sklearn.naive_bayes import BernoulliNB
    print "Starts bnb"
    bnb = BernoulliNB()
    bnb.fit(train_data, train_lbl_data)
    return bnb
def naive_bayes(df,column): reviews_pn = df[df['class'].isin(['positive','negative'])] comments = list(reviews_pn[column].values) classes = list(reviews_pn['class'].values) # preprocess creates the term frequency matrix for the review data set stop = stopwords.words('english') count_vectorizer = CountVectorizer(stop_words = stop, ngram_range=(1,3)) comments1 = count_vectorizer.fit_transform(comments) tfidf_comments = TfidfTransformer(use_idf=True).fit_transform(comments1) # preparing data for split validation. 60% training, 40% test data_train,data_test,target_train,target_test = cross_validation.train_test_split(tfidf_comments,classes,test_size=0.4,random_state=43) classifier = BernoulliNB().fit(data_train,target_train) predicted = classifier.predict(data_test) print classification_report(target_test,predicted) print "The accuracy score is {:.2%}".format(accuracy_score(target_test,predicted)) most_informative_feature_for_binary_classification(count_vectorizer,classifier,n=20) #predict on unknown reviews_nc = reviews_df[reviews_df['class'] == ''] comments_nc = list(reviews_nc[column].values) comments_nc1 = count_vectorizer.transform(comments_nc) tfidf_comments_nc = TfidfTransformer(use_idf=True).fit_transform(comments_nc1) new_predicted = classifier.predict(tfidf_comments_nc) print "negative = %s" %sum(new_predicted == 'negative') print "positive = %s" %sum(new_predicted == 'positive')
def doclassify(self, type='normal'):
    if type == 'normal':
        # default hyperparameters: alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True
        clf = BernoulliNB()
        clf.fit(self.train_x, self.train_y)
        score = clf.score(self.train_x, self.train_y)
        print 'score = ', score
def BernoulliNB_1(train_predictors, test_predictors, train_target, test_target):
    clf = BernoulliNB()
    clf.fit(train_predictors, train_target)
    predicted = clf.predict(test_predictors)
    accuracy = accuracy_score(test_target, predicted)
    print "Accuracy for Bernoulli Naive Bayes: " + str(accuracy)
    return accuracy, predicted
def BNB(data_train, data_train_vectors, data_test_vectors, **kwargs):
    # Implementing classification model - using BernoulliNB
    clf_BNB = BernoulliNB(alpha=.01)
    clf_BNB.fit(data_train_vectors, data_train.target)
    y_pred = clf_BNB.predict(data_test_vectors)
    return y_pred
def test_BernouliNB4():
    X = np.array([
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [0, 0],
        [0, 0],
        [1, 0],
    ])
    print 'X ' + str(X)
    #Y = np.array([1, 1, 1, 1, 2, 2, 2, 2])
    Y = np.array([1, 1, 0, 1, 0, 0, 0, 1, 1, 0])
    print 'Y ' + str(Y)

    clf = BernoulliNB(alpha=1)
    clf.fit(X, Y)

    X2 = np.array([
        [1, 1],
    ])
    for i in xrange(len(X2)):
        #pred_ret = clf.predict_proba(X2[i])
        pred_ret = clf.predict(X2[i])
        print 'X2[' + str(i) + '] = ' + str(X2[i]) + ' pred_ret ' + str(pred_ret)
def render_content(self): if self.text_source is None: return "No text source selected." from sklearn.feature_extraction.text import CountVectorizer from sklearn.naive_bayes import BernoulliNB from sklearn import metrics self.dm("creating vectorizer") vectorizer = CountVectorizer(stop_words=self.get_user_list(self.stop_list), max_features=self.vocab_size) data = self.get_column_data(self.text_source) self.dm("using vectorizer") X_train = vectorizer.fit_transform(data) Y_train = self.get_column_data(self.code_source) self.dm("creating classifier") clf = BernoulliNB() clf.fit(X_train, Y_train) accuracy = clf.score(X_train, Y_train) self.dm("predicting") pred = clf.predict(X_train) cm = metrics.confusion_matrix(Y_train, pred) self.dm("displaying result") html_output = "accuracy is " + str(round(accuracy, 2)) html_output += '<pre>'+ str(cm) + '</pre>' return html_output
def generatePredictingModel(data): """ Build the prediction model (based on the data set we have) in order to be able to predict the category of a new video from the user input Return a classifier able to predict the category of a video based on its title and description. """ try: # Intitialize a timer to compute the time to build the model start = time.time() # Split into train-test data set X = data[[x for x in data.columns if x in ('title', 'description')]] Y = data[[x for x in data.columns if x in ('video_category_id')]] X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.80, random_state = 10) # Build the 2 text corpus corpus_title = X_train['title'].values.tolist() corpus_description = X_train['description'].values.tolist() # initializes the 2 vectorizers. count_vectorizer_title = CountVectorizer() count_vectorizer_description = CountVectorizer() # learn the 2 vocabulary dictionary count_vectorizer_title.fit(corpus_title) count_vectorizer_description.fit(corpus_description) # Build the sparse matrices X_train_count_title = count_vectorizer_title.transform(X_train['title']) X_train_count_description = count_vectorizer_description.transform(X_train['description']) X_test_count_title = count_vectorizer_title.transform(X_test['title']) X_test_count_description = count_vectorizer_description.transform(X_test['description']) # Set and train the models (for title and description features) model_count_title = BernoulliNB() model_count_description = BernoulliNB() model_count_title.fit(X_train_count_title, Y_train['video_category_id']) model_count_description.fit(X_train_count_description, Y_train['video_category_id']) # Merge the title and description predictions and build a new prediction based on these 2 predictions combined new_df_train = pd.DataFrame() new_df_train['title_prediction'] = model_count_title.predict(X_train_count_title) new_df_train['description_prediction'] = model_count_description.predict(X_train_count_description) new_df_test = pd.DataFrame() new_df_test['title_prediction'] = model_count_title.predict(X_test_count_title) new_df_test['description_prediction'] = model_count_description.predict(X_test_count_description) tree = DecisionTreeClassifier() tree.fit(new_df_train, Y_train) end = time.time() execution_time = end - start print "Time to build this incredibly amazing model, only : {} seconds!!!!!!".format(execution_time) time.sleep(3) return tree, model_count_title, model_count_description,count_vectorizer_title,count_vectorizer_description except: raise VideoAnalysisException(" Error while creation of predictive model ")
def learn_model(data, target):
    # preparing data for split validation. 80% training, 20% test
    data_train, data_test, target_train, target_test = cross_validation.train_test_split(
        data, target, test_size=0.2, random_state=43
    )
    classifier = BernoulliNB().fit(data_train, target_train)
    predicted = classifier.predict(data_test)
    evaluate_model(target_test, predicted)
def score(train_X, train_y):
    X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size=0.01, random_state=10)
    clf = BernoulliNB(binarize=False, fit_prior=True, alpha=0.7)
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_valid)
    return log_loss(y_valid, y_pred)
def testBoGNB(self): ''' Test on sentiment analysis task using Naive Bayes classifier with Bag-of-Word feature vectors. ''' wordlist = [] # Preprocessing of original txt data set for i, sent in enumerate(self.senti_train_txt): words = sent.split() words = [word.lower() for word in words if len(word) > 2] wordlist.extend(words) for i, sent in enumerate(self.senti_test_txt): words = sent.split() words = [word.lower() for word in words if len(word) > 2] wordlist.extend(words) word_dict = set(wordlist) word2index = dict(zip(word_dict, range(len(word_dict)))) # Build BoG feature train_size = len(self.senti_train_txt) test_size = len(self.senti_test_txt) pprint('Training set size: %d' % train_size) pprint('Test set size: %d' % test_size) train_feat = np.zeros((train_size, len(word_dict)), dtype=np.float) test_feat = np.zeros((test_size, len(word_dict)), dtype=np.float) # Using binary feature start_time = time.time() for i, sent in enumerate(self.senti_train_txt): words = sent.split() words = [word.lower() for word in words if len(word) > 2] indices = map(lambda x: word2index[x], words) train_feat[i, indices] = 1.0 for i, sent in enumerate(self.senti_test_txt): words = sent.split() words = [word.lower() for word in words if len(word) > 2] indices = map(lambda x: word2index[x], words) test_feat[i, indices] = 1.0 end_time = time.time() pprint('Finished building training and test feature matrix, time used: %f seconds.' % (end_time-start_time)) pprint('Classification using Bernoulli Naive Bayes classifier: ') clf = BernoulliNB() # clf = LogisticRegression() clf.fit(train_feat, self.senti_train_label) train_pred_label = clf.predict(train_feat) train_acc = np.sum(train_pred_label == self.senti_train_label) / float(train_size) pprint('Training accuracy = %f' % train_acc) pred_label = clf.predict(test_feat) acc = np.sum(pred_label == self.senti_test_label) / float(test_size) pprint('Accuracy: %f' % acc) train_pos_count = np.sum(self.senti_train_label == 1) train_neg_count = np.sum(self.senti_train_label == 0) test_pos_count = np.sum(self.senti_test_label == 1) test_neg_count = np.sum(self.senti_test_label == 0) pprint('Positive count in training set: %d' % train_pos_count) pprint('Negative count in training set: %d' % train_neg_count) pprint('Ratio: pos/neg = %f' % (float(train_pos_count) / train_neg_count)) pprint('Positive count in test set: %d' % test_pos_count) pprint('Negative count in test set: %d' % test_neg_count) pprint('Ratio: pos/neg = %f' % (float(test_pos_count) / test_neg_count))
def learnBModel(ip, label, tst, tst_label):
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    X_train = vectorizer.fit_transform(ip.data)
    X_test = vectorizer.transform(tst.data)
    tfidf_train = TfidfTransformer(use_idf=False).fit_transform(X_train)
    tfidf_test = TfidfTransformer(use_idf=False).fit_transform(X_test)
    classifier = BernoulliNB().fit(tfidf_train, label)
    predicted_BModel = classifier.predict(tfidf_test)
    evaluate_model(tst_label, predicted_BModel)
def naive_bayesB_classifier(X_train, categories, X_test, test_categories):
    from sklearn.naive_bayes import BernoulliNB
    clf = BernoulliNB(alpha=0.10000000000000001).fit(X_train, categories)
    y_nb_predicted = clf.predict(X_test)
    print "\n Here is the classification report for Naive Bayes classifier:"
    print metrics.classification_report(test_categories, y_nb_predicted)
    print "Accuracy score:"
    print metrics.accuracy_score(test_categories, y_nb_predicted)
    to_latex(test_categories, y_nb_predicted)
def BernoulliNB_pred(X_train, X_test, y_train):
    clf_NB = BernoulliNB()
    clf_NB.fit(X_train, y_train)
    # Return class probabilities for both sets; the positive-class column is
    # selected below (sklearn utilities could also handle this encoding/decoding)
    predictions_train = clf_NB.predict_proba(X_train)
    predictions = clf_NB.predict_proba(X_test)
    return predictions[:, 1], predictions_train[:, 1]
def nb_classifier(self, secret):
    clf = BernoulliNB()
    x = self.raw_attr_vector(secret)
    y = self.get_labels(secret)
    fsl = self.feature_sel(secret)
    new_x = fsl.transform(x)
    clf.fit(new_x, y)
    new_y = clf.predict(new_x)
    return clf, fsl, self.evaluate(new_y, y)
# TF-IDF processing
vectorizer = TfidfVectorizer(input='content', stop_words='english', max_df=0.5, sublinear_tf=True)
x_train = vectorizer.fit_transform(data_train.data)
x_test = vectorizer.transform(data_test.data)
print('Training set samples: %d, number of features: %d' % x_train.shape)
print('Stop words:\n', end=' ')
#pprint(vectorizer.get_stop_words())
feature_names = np.asarray(vectorizer.get_feature_names())

# Compare classifier results
clfs = (MultinomialNB(), BernoulliNB())
result = []
for clf in clfs:
    r = make_test(clf)
    result.append(r)
    print('\n')
result = np.array(result)
time_train, time_test, err, names = result.T
time_train = time_train.astype(np.float)
time_test = time_test.astype(np.float)
err = err.astype(np.float)

x = np.arange(len(time_train))
mpl.rcParams['font.sans-serif'] = ['simHei']
mpl.rcParams['axes.unicode_minus'] = False
plt.figure(figsize=(10, 7), facecolor='w')
##datasets with a validation set X_train2 = full_df[:1120000, :] X_valid = full_df[1120000:1600000, :] from sklearn.preprocessing import LabelEncoder le = LabelEncoder() y_train1 = le.fit_transform(full_data['Sentiment']) y_train2 = le.transform(full_data['Sentiment'][:1120000]) y_valid = le.transform(full_data['Sentiment'][1120000:]) y_test = le.transform(test_data_pos_neg['Sentiment']) ######Try Binomial Naive Bayes Model without word stemming###### from sklearn.naive_bayes import BernoulliNB ##Convert word frequency matrix into binary matrix X_train1_bin = X_train1.copy() X_train1_bin[X_train1_bin > 0] = 1 clf_ber_bayes = BernoulliNB() clf_ber_bayes.fit(X_train1_bin, y_train1) train_preds = clf_ber_bayes.predict(X_train1_bin) accuracy_score(train_preds, y_train1) #Convert test dataframe to binary X_test_bin = X_test.copy() X_test_bin[X_test_bin > 0] = 1 test_preds = clf_ber_bayes.predict(X_test_bin) accuracy_score(y_test, test_preds) ##84.12 % accuracy_score
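# The manual copy-and-threshold step above can also be delegated to the
# estimator: BernoulliNB has a binarize parameter that thresholds features on
# the fly, so the raw count matrices can be passed in unchanged. A minimal
# sketch under that assumption, reusing the variables from the snippet above:
from sklearn.naive_bayes import BernoulliNB

# binarize=0.0 maps every count > 0 to 1 inside the estimator, which is
# equivalent to the explicit X[X > 0] = 1 conversion above
clf_ber_bayes = BernoulliNB(binarize=0.0)
clf_ber_bayes.fit(X_train1, y_train1)
test_preds = clf_ber_bayes.predict(X_test)
accuracy_score(y_test, test_preds)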
def modelTraining(X_train, X_test, y_train, y_test, f): models = {} # Linear SVC try: lsvc = LinearSVC() y_pred = lsvc.fit(X_train, y_train).predict(X_test) model_accr = metrics.accuracy_score(y_test, y_pred) * 100 models["Linear Support Vector Classifier"] = model_accr f.writelines( "\n Accuracy of Linear Support Vector Classifier is " + str(model_accr)) except: logging.info("LSVC is throwing exception") f.writelines("\n LSVC is throwing exception") # KNN try: knn = KNeighborsClassifier() y_pred = knn.fit(X_train, y_train).predict(X_test) model_accr = metrics.accuracy_score(y_test, y_pred) * 100 models["KNN Classifier"] = model_accr f.writelines("\n Accuracy of KNN Classifier is " + str(model_accr)) except: logging.info("KNN is throwing exception") f.writelines("\n KNN is throwing exception") # DTC try: clf_gini = DecisionTreeClassifier(criterion="gini", random_state=0) y_pred = clf_gini.fit(X_train, y_train).predict(X_test) model_accr = metrics.accuracy_score(y_test, y_pred) * 100 models["Decision Tree Classifier - GINI"] = model_accr f.writelines( "\n Accuracy of Decision Tree Classifier - GINI is " + str(model_accr)) except: logging.info("DTC GINI is throwing exception") f.writelines("\n DTC GINI is throwing exception") try: clf_entropy = DecisionTreeClassifier(criterion="entropy", random_state=0) y_pred = clf_entropy.fit(X_train, y_train).predict(X_test) model_accr = metrics.accuracy_score(y_test, y_pred) * 100 models["Decision Tree Classifier - ENTROPY"] = model_accr f.writelines( "\n Accuracy of Decision Tree Classifier - ENTROPY is " + str(model_accr)) except: logging.info("DTC ENTROPY is throwing exception") f.writelines("\n DTC ENTROPY is throwing exception") # Multinomial NB try: mnb_model = MultinomialNB() y_pred = mnb_model.fit(X_train, y_train).predict(X_test) model_accr = metrics.accuracy_score(y_test, y_pred) * 100 models["Multinomial Naive Bayes"] = model_accr f.writelines("\n Accuracy of Multinomial NB is " + str(model_accr)) except: logging.info("Multinomial NB is throwing exception") f.writelines("\n Multinomial NB is throwing exception") # Bernoulli NB try: bnb_model = BernoulliNB() y_pred = bnb_model.fit(X_train, y_train).predict(X_test) model_accr = metrics.accuracy_score(y_test, y_pred) * 100 models["Bernoulli Naive Bayes"] = model_accr f.writelines("\n Accuracy of Bernoulli NB is " + str(model_accr)) except: logging.info("Bernoulli NB is throwing exception") f.writelines("\n Bernoulli NB is throwing exception") # Gaussian NB try: gnb_model = GaussianNB() y_pred = gnb_model.fit(X_train, y_train).predict(X_test) model_accr = metrics.accuracy_score(y_test, y_pred) * 100 models["Gaussian Naive Bayes"] = model_accr f.writelines("\n Accuracy of GaussianNB is " + str(model_accr)) except: logging.info("GaussianNB is throwing exception") f.writelines("\n GaussianNB is throwing exception") # ADB try: adb = AdaBoostClassifier(n_estimators=200, learning_rate=1) # Train Adaboost Classifer y_pred = adb.fit(X_train, y_train).predict(X_test) model_accr = metrics.accuracy_score(y_test, y_pred) * 100 models["AdaBoost Classifier"] = model_accr f.writelines("\n Accuracy of AdaBoost Classifier is " + str(model_accr)) except: logging.info("AdaBoost Classifier is throwing exception") f.writelines("\n AdaBoost Classifier is throwing exception") # Random Forest Classifier try: rfc = RandomForestClassifier(n_estimators=100) y_pred = rfc.fit(X_train, y_train).predict(X_test) model_accr = metrics.accuracy_score(y_test, y_pred) * 100 models["Random Forest Classifier"] = model_accr 
f.writelines("\n Accuracy of Random Forest Classifier is " + str(model_accr)) except: logging.info("Random Forest Classifier is throwing exception") f.writelines( "\n Random Forest Classifier is throwing exception") return (models)
def classification_voting(X, y, nome):
    clf2 = RandomForestClassifier(n_estimators=30, max_depth=None, min_samples_split=2, random_state=0)
    clf3 = BernoulliNB()
    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('bnb', clf3)], voting='soft')
    classification_model_cv(X, y, eclf2, "Voting Model " + nome)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) # call garbage collection to release some memory del train, test, user, tf_csr, tfidf_csr gc.collect() # ------------------------------------------------------------------------------------------------- print('-' * 100) print(f'Gender prediction with {TARGET_FEAT}\n') models = dict( lr=LogisticRegression(random_state=seed, C=5, solver='sag'), svm=LinearSVC(random_state=seed, C=0.5), pac=PassiveAggressiveClassifier(random_state=seed, C=0.05), ridge=RidgeClassifier(random_state=seed, alpha=5), sgd=SGDClassifier(random_state=seed, penalty='l1', loss='log', alpha=1e-6), bnb=BernoulliNB(alpha=0.1), mnb=MultinomialNB(alpha=0.1), ) # specify target label y_train = label_gender # define features train_feat_gender = pd.DataFrame() test_feat_gender = pd.DataFrame() for name, model in models.items(): timer.start() stack_train, stack_test = kfold_stack_binary(kfold, model, x_train, y_train, x_test) timer.stop()
############### save_classifier = open("originalnaivebayes5k.pickle", "wb") pickle.dump(classifier, save_classifier) save_classifier.close() MNB_classifier = SklearnClassifier(MultinomialNB()) MNB_classifier.train(training_set) print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100) save_classifier = open("MNB_classifier5k.pickle", "wb") pickle.dump(MNB_classifier, save_classifier) save_classifier.close() BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) BernoulliNB_classifier.train(training_set) print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set)) * 100) save_classifier = open("BernoulliNB_classifier5k.pickle", "wb") pickle.dump(BernoulliNB_classifier, save_classifier) save_classifier.close() LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) LogisticRegression_classifier.train(training_set) print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) * 100) save_classifier = open("LogisticRegression_classifier5k.pickle", "wb")
from sklearn.naive_bayes import BernoulliNB from sklearn.pipeline import make_pipeline, make_union from sklearn.tree import DecisionTreeClassifier from tpot.builtins import StackingEstimator from xgboost import XGBClassifier # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \ train_test_split(features, tpot_data['target'].values, random_state=42) # Score on the training set was:0.8702380952380953 exported_pipeline = make_pipeline( StackingEstimator(estimator=BernoulliNB(alpha=100.0, fit_prior=True)), StackingEstimator(estimator=DecisionTreeClassifier(criterion="gini", max_depth=7, min_samples_leaf=8, min_samples_split=20)), XGBClassifier(learning_rate=0.1, max_depth=5, min_child_weight=4, n_estimators=100, nthread=1, subsample=0.7000000000000001)) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)
def classify(X, y, clf_type='nbc'): """ Preprocess the input documents to extract feature vector representations of them. Your features should be N-gram counts, for N<=2. 1. Experiment with the complexity of the N-gram features (i.e., unigrams, or unigrams and bigrams): `gram_min` + `gram_max` 2. Experiment with removing stop words. (see NLTK) 3. Remove infrequently occurring words and bigrams as features. You may tune the threshold at which to remove infrequent words and bigrams. 4. Search over hyperparameters for the three models (nb, svm, lr) to find the best performing model. All 4 of the above are done in the context of 10-fold cross validation on the data. On the training data, 3-fold cross validation is done to find the optimal hyperparameters (using randomized CV), which are then tested on held-out data. """ if clf_type == 'nbc': clf = BernoulliNB() params = SETTINGS_NB elif clf_type == 'svc': clf = LinearSVC() params = SETTINGS_SVC elif clf_type == 'lrc': clf = LogisticRegression() params = SETTINGS_LR else: raise Exception('invalid clf {}: {nbc, svc, lrc}'.format(clf_type)) # pipeline runs preprocessing and model during every CV loop pipe = Pipeline([ ('pre', CountVectorizer()), ('clf', clf), ]) model = RandomizedSearchCV( pipe, params, n_jobs=-1, n_iter=N_CV, cv=INNER, scoring='f1_macro' ) results = { 'test': {'loss': [], 'accuracy': [], 'confusion': [], 'errors': []}, 'train': {'loss': [], 'accuracy': [], 'confusion': []}, 'cv': {} } kf = StratifiedKFold(n_splits=FOLDS, shuffle=True) for i, (train_idx, test_idx) in enumerate(kf.split(X, y)): print("[{}] {}/{}".format(clf_type, i+1, FOLDS)) # split training and test sets X_train = X[train_idx] X_test = X[test_idx] y_train = y[train_idx] y_test = y[test_idx] # fit model model.fit(X_train, y_train) # save the best parameters from the inner-fold cross validation best_params = model.best_estimator_.get_params() for p in sorted(params.keys()): results['cv'][p] = best_params[p] # make predictions on train and test set y_test_pred = model.predict(X_test) y_train_pred = model.predict(X_train) # record some misclassified sentences idx_errors = np.where(y_test_pred != y_test)[0] np.random.shuffle(idx_errors) errors = X_test[idx_errors[:5]] results['test']['errors'].extend(errors) # store results results['test']['loss'].append(log_loss(y_test, y_test_pred)) results['test']['accuracy'].append(accuracy_score(y_test, y_test_pred)) results['test']['confusion'].append(confusion_matrix(y_test, y_test_pred)) results['train']['loss'].append(log_loss(y_train, y_train_pred)) results['train']['accuracy'].append(accuracy_score(y_train, y_train_pred)) results['train']['confusion'].append(confusion_matrix(y_train, y_train_pred)) return(results)
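# SETTINGS_NB, SETTINGS_SVC and SETTINGS_LR are defined elsewhere in the
# module; for the pipeline above their keys would use the step-name prefixes
# 'pre__' and 'clf__'. A hypothetical example of what the Naive Bayes search
# space could look like -- the values are assumptions, not the author's
# settings:
import numpy as np

SETTINGS_NB = {
    'pre__ngram_range': [(1, 1), (1, 2)],   # unigrams vs. unigrams + bigrams
    'pre__stop_words': [None, 'english'],   # with / without stop-word removal
    'pre__min_df': [1, 2, 5],               # drop infrequent words and bigrams
    'clf__alpha': np.logspace(-3, 1, 20),   # smoothing strength for BernoulliNB
}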
from scorer_semeval18 import main as eval tokenized_tweets = pickle.load(open(TOK_TWEETS_PATH, 'rb')) print('loaded tweets') data_matrix = construct_data_matrix(tokenized_tweets) print('constructed data matrix') print('Dim:', data_matrix.shape) print('Density:', np.count_nonzero(data_matrix) / np.size(data_matrix)) labels = np.asarray(open(CLEAN_LABELS_PATH).read().splitlines()) data_train, data_test, labels_train, labels_test = split_data( data_matrix, labels) print('split data') bern = BernoulliNB() bern.fit(data_train, labels_train) print("\nbern", bern.score(data_test, labels_test)) eval(labels_test, bern.predict(data_test)) multi = MultinomialNB() multi.fit(data_train + abs(np.min(data_train)), labels_train) print("\nmulti", multi.score(data_test + abs(np.min(data_test)), labels_test)) eval(labels_test, multi.predict(data_test)) tree = DecisionTreeClassifier(max_depth=10) tree.fit(data_train, labels_train) print("\ntree", tree.score(data_test, labels_test)) eval(labels_test, tree.predict(data_test)) clf = RandomForestClassifier(max_depth=3)
# [5] Results as percentages
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    results = []
    for clf, name in [
        (BernoulliNB(alpha=0.4), 'Naive Bayes'),
        (LinearSVC(C=9), 'SVC'),
        (DecisionTreeClassifier(max_depth=26), 'DecisionTreeClassifier'),
        # (LogisticRegression(C=12), 'LogisticRegression'),
        # (RandomForestClassifier(max_depth=2, random_state=0), 'RandomForest'),
        (KNeighborsClassifier(n_neighbors=13), 'KNN')
    ]:
        # Y_train.reshape(Y_train.shape[0],)
        # Y_test.reshape(Y_test.shape[0])
        clf.fit(X_train, Y_train)
        predictions = clf.predict(X_train)
        training_accuracy = accuracy_score(predictions, Y_train)
votes.append(v) return str(mode(votes)[0]) def confidence(self, features): votes =[] for c in self._classifiers: v = c.predict(features) votes.append(v) choice_votes = int(mode(votes)[1]) conf = choice_votes / len(votes) return conf #def test_accuracy(self, x2,x3,x4,x5,x6, x7): # average = mean([x2,x3,x4,x5,x6, x7]) # return average BNB = BernoulliNB() BNB.fit(tfidf_train, y_train) pred = BNB.predict(tfidf_test) score = metrics.accuracy_score(y_test, pred) x2 = metrics.accuracy_score(y_test, pred) print("BernoulliNB Naive Bayes Accuracy: %0.3f" % score) #cm = metrics.confusion_matrix(y_test, pred, labels=[0,1]) #plot_confusion_matrix(cm, classes=[0, 1]) save_classifier = open("Pickled/BernoulliNB.pickle", "wb") pickle.dump(BNB, save_classifier) save_classifier.close() LR = LogisticRegression() LR.fit(tfidf_train, y_train) pred = LR.predict(tfidf_test)
def main(): show_plots = False #set to True to show plots, False to not show plots #read categories from arguments. e.g. "python3 test.py Comedy Drama Documentary Horror" categories = [] for arg in sys.argv[1:]: categories.append(arg) X, y, files_used = read_files(categories) try: high_info_words = high_information_words(X, y) X_high_info = [] for bag in X: new_bag = [] for words in bag: if words in high_info_words: new_bag.append(words) X_high_info.append(new_bag) except ZeroDivisionError: print("Not enough information too get high information words, please try again with more files.", file=sys.stderr) X_high_info = X X_wpm = wpm(files_used, categories, show_plots) X_dpm = dpm(files_used, categories, show_plots) X_wd = word_distribution(files_used, categories) doc2vec_model = Doc2Vec.load("d2v_150.model") #doc2vec_model = Doc2Vec.load("d2v_400.model") #Reason I don't infer the vector is that I used the data already while training the vector model (with tagged docoments), so I can just retrieve the data X_d2v = [doc2vec_model.docvecs[str(i)] for i in range(len(X))] #X_d2v = [doc2vec_model.infer_vector(to_list(str(i))) for i in X] X = [(str(x), str(x_high), wpm, dpm, wd, d2v) for x, x_high, wpm, dpm, wd, d2v in zip(X, X_high_info, X_wpm, X_dpm, X_wd, X_d2v)] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10) clfs = [ SVC(C=10, cache_size=500, class_weight=None, coef0=0.0, #parameters found using grid_search.py decision_function_shape=None, degree=3, gamma=0.0001, kernel='linear', max_iter=100000, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False), MultinomialNB(alpha=1.0), BernoulliNB(), ] pipeline = Pipeline([ # Extract the features ('features', FeaturesExtractor()), # Use FeatureUnion to combine the features from subject and body ('union', FeatureUnion( transformer_list=[ #Pipeline bag-of-words model ('text', Pipeline([ ('selector', ItemSelector(key='text')), ('tfidf', TfidfVectorizer(sublinear_tf=True, binary=True, norm='l2', ngram_range=(1,3))), #('chi-square', SelectKBest(chi2, 300)), ])), #Pipeline for high info words bag-of-words model ('text_high', Pipeline([ ('selector', ItemSelector(key='text_high')), ('tfidf', TfidfVectorizer(sublinear_tf=True, norm='l2')), ])), #Pipeline for wpm feature ('wpm', Pipeline([ ('selector', ItemSelector(key='wpm')), ('scaler', MinMaxScaler()), ])), #Pipeline for dpm feature ('dpm', Pipeline([ ('selector', ItemSelector(key='dpm')), ('scaler', MinMaxScaler()), ])), #Pipeline for wd feature ('wd', Pipeline([ ('selector', ItemSelector(key='wd')), ('scaler', MinMaxScaler()), ])), #Pipeline for d2v feature ('d2v', Pipeline([ ('selector', ItemSelector(key='d2v')), ('scaler', MinMaxScaler()), ])), #Pipeline for POS tag features # ('pos', Pipeline([ # ('selector', ItemSelector(key='pos')), # ('words', TfidfVectorizer(sublinear_tf=True, binary=True, norm='l2', ngram_range=(1,3))) # ])), ], # weight components in FeatureUnion transformer_weights={ 'text': 0.2, 'text_high' : 1, 'wpm': 0, 'dpm': 0.2, 'wd': 0, 'd2v': 0, #'pos': 0, }, )), # Use a classifier on the combined features ('classifier', clfs[0]), ]) train(pipeline, X_train, y_train, categories, show_plots) final_pred = pipeline.predict(X_test) print("\nScores on test set:\n") print(metrics.accuracy_score(y_test, final_pred)) print(metrics.classification_report(y_test, final_pred, digits=3)) confusion_m = metrics.confusion_matrix(y_test, final_pred, labels=categories) plt.figure(figsize = (16, 9), dpi=150) sn.set(font_scale=1.4) #label 
size hm = sn.heatmap(confusion_m, annot=True, fmt='g', annot_kws={"size": 16}) #font size hm.set(xticklabels = categories, yticklabels = categories) plt.title(str(pipeline.named_steps['classifier']).split("(")[0] + ' Confusion Matrix') if show_plots: plt.show() hm.figure.savefig(str(pipeline.named_steps['classifier']).split("(")[0] + '_confusion_matrix_test' + '.png', figsize = (16, 9), dpi=150) plt.close()
# Naive Bayes Model
from sklearn.model_selection import train_test_split

quora_train, cv = train_test_split(quora_train, test_size=0.2)
x_train = quora_train.drop(['target'], axis=1)
y_train = quora_train['target']
x_cv = cv.drop(['target'], axis=1)
y_cv = cv['target']

from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vect = TfidfVectorizer()
reviews_tfidf = tf_idf_vect.fit_transform(x_train['question_text'].values)
reviews_tfidf1 = tf_idf_vect.transform(x_cv['question_text'].values)
reviews_tfidf2 = tf_idf_vect.transform(quora_test['question_text'].values)

from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB

nb = BernoulliNB()
param_grid = {'alpha': [1000, 100, 10, 1, 0.1, 0.01, 0.001]}  # params we need to try on classifier
gsv = GridSearchCV(nb, param_grid, cv=2, verbose=1, n_jobs=-1, scoring='f1')
gsv.fit(reviews_tfidf, y_train)

nb = BernoulliNB(alpha=0.1)
nb.fit(reviews_tfidf, y_train)

train_pred = nb.predict(reviews_tfidf)
cv_pred = nb.predict(reviews_tfidf1)
test_pred = nb.predict(reviews_tfidf2)

print("Train Set Accuracy: {}".format(accuracy_score(train_pred, y_train)))
print("Train Set ROC: {}".format(roc_auc_score(train_pred, y_train)))
print("Train Set F1 Score: {}\n".format(f1_score(train_pred, y_train)))
print("Validation Set Accuracy: {}".format(accuracy_score(cv_pred, y_cv)))
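# After the grid search above, the selected smoothing value can be read back
# from the fitted search object instead of hard-coding alpha=0.1. A minimal
# sketch, reusing the gsv object from the snippet above:
print("Best params: {}  best CV F1: {}".format(gsv.best_params_, gsv.best_score_))

# refit with whatever alpha the search actually selected
nb_best = BernoulliNB(**gsv.best_params_)
nb_best.fit(reviews_tfidf, y_train)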
print Matr.shape Matr=Matr[1:] print len(Yval) a=1000 b=100000 prior1=(a+spamc-1)*1.0/(a+b+spamc+legitc-2) prior2=(a+legitc-1)*1.0/(a+b+spamc+legitc-2) # y=beta.pdf(x, a, b) from sklearn.metrics import precision_recall_curve from sklearn.naive_bayes import MultinomialNB from sklearn.naive_bayes import BernoulliNB from sklearn.cross_validation import train_test_split a_train, a_test, b_train, b_test = train_test_split(Matr, Yval, test_size=0.2, random_state=42) clf = MultinomialNB(class_prior=[1,2]) clf2= BernoulliNB(class_prior=[prior1,prior2]) clf.fit(a_train, b_train) clf2.fit(a_train, b_train) Ax=clf.predict(a_test) Bx=clf2.predict(a_test) from sklearn.metrics import f1_score #print f1_score(b_test, Ax, average='macro') print f1_score(b_test, Bx, average='macro') import matplotlib.pyplot as plt precision, recall, _ = precision_recall_curve(b_test, Bx) plt.step(recall, precision, color='b', alpha=0.2,where='post') plt.fill_between(recall, precision, step='post', alpha=0.2,color='b')
def main(): x = [0, 1, 2, 3, 4, 5] LABELS = [ 'simple_nb', 'svm', 'KNN', 'gausian_nb', 'bernoulli', 'random_forest' ] plt.title("Accuracy of different algorithm on different user chat") plt.xlabel("Algorithms used") plt.ylabel("Accuracy") path = './chats_process' #test_negative = convert_float(test_nega) #labels_test_negative = get_labels(test_negative) count = 0 results = [0, 0, 0, 0, 0, 0] for filename in os.listdir(path): count += 1 #print filename t = path + '/' + filename + '/train.csv' splitRatio = .5 dataset = loadCsv(t) trainingSet, testSet = splitDataset(dataset, splitRatio) #testSet = testSet + test_nega trainset_copy = trainingSet test_copy = testSet trainingSet = convert_float(trainingSet) testSet = convert_float(testSet) #print testSet summaries = summarizeByClass(trainingSet) predictions = getPredictions(summaries, testSet) acc_NB = getAccuracy1(testSet, predictions) #print "accuracy_simpleNB= " + str(acc_NB) results[0] += acc_NB train_set = convert_float(trainset_copy) labels_train = get_labels(trainset_copy) test_set = convert_float(test_copy) #testSet = testSet + test_negative labels_test = get_labels(test_copy) #labels_test = labels_test + labels_test_negative #print labels_test tp_NB = TruePositive(predictions, testSet) tn_NB = TrueNegative(predictions, testSet) fp_NB = FalsePositive(predictions, testSet) fn_NB = FalseNegative(predictions, testSet) prec_NB = tp_NB / (tp_NB + fp_NB) rec_NB = tp_NB / (tp_NB + fn_NB) # SVM clf = svm.SVC(probability=True) clf.fit(train_set, labels_train) #clf.decision_function(test_set) results_SVM = clf.predict(test_set) a = clf.predict_proba(test_set) acc_svm = getAccuracy(results_SVM, labels_test) #print "accuracy_svm= " + str(acc_svm) results[1] += acc_svm tp_SVM = TruePositive(results_SVM, labels_test) tn_SVM = TrueNegative(results_SVM, labels_test) fp_SVM = FalsePositive(results_SVM, labels_test) fn_SVM = FalseNegative(results_SVM, labels_test) prec_SVM = tp_SVM / (tp_SVM + fp_SVM) rec_SVM = tp_SVM / (tp_SVM + fn_SVM) #KNN neigh = KNeighborsClassifier(n_neighbors=3) neigh.fit(train_set, labels_train) results_KNN = neigh.predict(test_set) b = neigh.predict_proba(test_set) acc_knn = getAccuracy(results_KNN, labels_test) #print "accuracy_knn= " + str(acc_knn) results[2] += acc_knn tp_knn = TruePositive(results_KNN, labels_test) tn_knn = TrueNegative(results_KNN, labels_test) fp_knn = FalsePositive(results_KNN, labels_test) fn_knn = FalseNegative(results_KNN, labels_test) prec_knn = tp_knn / (tp_knn + fp_knn) rec_knn = tp_knn / (tp_knn + fn_knn) #gausianNB clf = GaussianNB() clf.fit(train_set, labels_train) results_GausianNB = clf.predict(test_set) c = clf.predict_proba(test_set) acc_gausNB = getAccuracy(results_GausianNB, labels_test) #print "accuracy_gausNB= " + str(acc_gausNB) results[3] += acc_gausNB tp_gnb = TruePositive(results_GausianNB, labels_test) tn_gnb = TrueNegative(results_GausianNB, labels_test) fp_gnb = FalsePositive(results_GausianNB, labels_test) fn_gnb = FalseNegative(results_GausianNB, labels_test) prec_gnb = tp_gnb / (tp_gnb + fp_gnb) rec_gnb = tp_gnb / (tp_gnb + fn_gnb) #BernoiliNB clf = BernoulliNB() clf.fit(train_set, labels_train) results_BernoulliNB = clf.predict(test_set) d = clf.predict_proba(test_set) acc_BernoNB = getAccuracy(results_BernoulliNB, labels_test) #print "accuracy_bernoNB= " + str(acc_BernoNB) results[4] += acc_BernoNB tp_gnb = TruePositive(results_BernoulliNB, labels_test) tn_gnb = TrueNegative(results_BernoulliNB, labels_test) fp_gnb = FalsePositive(results_BernoulliNB, labels_test) fn_gnb = 
FalseNegative(results_BernoulliNB, labels_test) prec_bnb = tp_gnb / (tp_gnb + fp_gnb) rec_bnb = tp_gnb / (tp_gnb + fn_gnb) #randomforests clf = RandomForestClassifier(n_estimators=10) clf.fit(train_set, labels_train) results_randomforest = clf.predict(test_set) e = clf.predict_proba(test_set) acc_random_F = getAccuracy(results_randomforest, labels_test) #print "accuracy_random_forest= " + str(acc_random_F) results[5] += acc_random_F tp_gnb = TruePositive(results_randomforest, labels_test) tn_gnb = TrueNegative(results_randomforest, labels_test) fp_gnb = FalsePositive(results_randomforest, labels_test) fn_gnb = FalseNegative(results_randomforest, labels_test) prec_rf = tp_gnb / (tp_gnb + fp_gnb) rec_rf = tp_gnb / (tp_gnb + fn_gnb) #print "-------------\n" #print results_SVM #print results_KNN #print results_GausianNB #print results_BernoulliNB #print results_randomforest #print "\n" #print labels_test #print results #plt.plot(x,results,marker='o') ''' s = open('results.txt','a') with open('./chats_process/'+filename+'/'+'ml_training_'+'.csv', 'w') as csvoutput: writer = csv.writer(csvoutput) for a1,b1,c1,d1,e1,label in zip(a,b,c,d,e,labels_test): writer.writerow([a1[1],b1[1],c1[1],d1[1],e1[1],label]) s.write("%s\n" % a1) s.write("%s\n" % b1) s.write("%s\n" % c1) s.write("%s\n" % d1) s.write("%s\n" % e1) #s.write(b1) #s.write(str(c1)) #s.write(d1) #s.write(e1) s.write("................\n") print('Split {0} rows into train={1} and test={2} rows').format(len(dataset), len(trainingSet), len(testSet)) # prepare model summaries = summarizeByClass(trainingSet) # test model predictions = getPredictions(summaries, testSet) accuracy = getAccuracy(testSet, predictions) print('Accuracy: {0}%').format(accuracy) ''' t = open('remove_one5.txt', 'a') t.write(str(prec_NB) + " , " + str(rec_NB) + '\n') t.write(str(prec_SVM) + " , " + str(rec_SVM) + '\n') t.write(str(prec_gnb) + " , " + str(rec_gnb) + '\n') t.write(str(prec_bnb) + " , " + str(rec_bnb) + '\n') t.write(str(prec_rf) + " , " + str(rec_rf) + '\n') t.write(str(prec_knn) + " , " + str(rec_knn) + '\n')
with open('Kfold_acc.pickle', 'wb') as f: pickle.dump(alternative_Kfold_mean, f) #support vector machine from sklearn.svm import LinearSVC SVM = LinearSVC(random_state=123) SVM.fit(X_train,y_train) #decision tree from sklearn.tree import DecisionTreeClassifier DT = DecisionTreeClassifier(random_state=123) DT.fit(X_train,y_train) #naive bayes from sklearn.naive_bayes import BernoulliNB NB = BernoulliNB() NB.fit(X_train,y_train) y_pred_log = Log_Reg.predict(X_test) y_pred_svm = SVM.predict(X_test) y_pred_DT = DT.predict(X_test) y_pred_NB = NB.predict(X_test) ### validation score ## ## 10-fold cross validation ## from sklearn.model_selection import cross_val_score cross_val = (cross_val_score(Log_reg_fitted, X_train, y_train, cv=10)) alternative_Kfold_mean = np.mean(cross_val) print('Average validation score Log Reg: ',alternative_Kfold_mean,'\n', 'Validation score per fold: ','\n',cross_val) ### rest of the classifiers' K-fold validation scores ###
def train(self, with_trees, with_print): # if fetch_from_server: # self.fetch_tweets(with_print=with_print,pth=pth,remove_stopwords=remove_stopwords,ngrams=ngrams,n_min=n_min,n_max=n_max) # else: # self.train_test_split(with_print) # Logistic Regression print( '------------------------------------------------------------------------\n', 'Logistic Regression:') start_clf_time = time.time() LogisticRegression_classifier = LogisticRegression(fit_intercept=True) LogisticRegression_classifier.fit(X=self.X_train, y=self.y_train) output = Kappa(LogisticRegression_classifier, X_test=self.X_test, y_test=self.y_test).output output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\LogisticRegression.pickle", "wb") as classifier_f: pickle.dump(LogisticRegression_classifier, classifier_f) classifier_f.close() print( '------------------------------------------------------------------------\n', 'Naive Bayes:') start_clf_time = time.time() Naivebayes_classifier = GaussianNB() Naivebayes_classifier.fit(X=self.X_train, y=self.y_train) output = Kappa(Naivebayes_classifier, X_test=self.X_test, y_test=self.y_test).output output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) # Naivebayes_classifier.show_most_informative_features(15) with open( getcwd() + "\\classifiers\\words_as_features\\Naivebayes_classifier.pickle", "wb") as classifier_f: pickle.dump(Naivebayes_classifier, classifier_f) classifier_f.close() print( '------------------------------------------------------------------------\n', 'Multinomial Naive Bayes:') start_clf_time = time.time() MNB_classifier = MultinomialNB() MNB_classifier.fit(X=self.X_train, y=self.y_train) output = Kappa(MNB_classifier, X_test=self.X_test, y_test=self.y_test).output output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\MNB_classifier.pickle", "wb") as classifier_f: pickle.dump(MNB_classifier, classifier_f) classifier_f.close() print( '------------------------------------------------------------------------\n', 'Bernoulli Naive Bayes:') start_clf_time = time.time() BernoulliNB_classifier = BernoulliNB() BernoulliNB_classifier.fit(X=self.X_train, y=self.y_train) output = Kappa(BernoulliNB_classifier, X_test=self.X_test, y_test=self.y_test).output output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\BernoulliNB_classifier.pickle", "wb") as classifier_f: pickle.dump(BernoulliNB_classifier, classifier_f) classifier_f.close() ''' ================================================================================================================================================ ~~~ SVM KERNELS ~~~ SVM KERNELS ~~~ SVM KERNELS ~~~ SVM KERNELS ~~~ SVM KERNELS ~~~ SVM KERNELS ~~~ SVM KERNELS ~~~ SVM KERNELS ~~~ SVM KERNELS ================================================================================================================================================ ''' print( 
'------------------------------------------------------------------------\n', 'C-Support Vector Machine:') print('======================\n', 'Linear Kernel') start_clf_time = time.time() SVC_lin_classifier = SVC(kernel='linear') SVC_lin_classifier.fit(X=self.X_prep_train, y=self.y_train) output = Kappa(SVC_lin_classifier, X_test=self.X_prep_test, y_test=self.y_test).output output['Kernel'] = 'linear' output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\SVC_lin.pickle", "wb") as classifier_f: pickle.dump(SVC_lin_classifier, classifier_f) classifier_f.close() print('======================\n', 'Polynomial Kernel') start_clf_time = time.time() SVC_poly_classifier = SVC(kernel='poly', C=1, gamma=1) SVC_poly_classifier.fit(X=self.X_prep_train, y=self.y_train) output = Kappa(SVC_poly_classifier, X_test=self.X_prep_test, y_test=self.y_test).output output['Kernel'] = 'poly' output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\SVC_poly.pickle", "wb") as classifier_f: pickle.dump(SVC_poly_classifier, classifier_f) classifier_f.close() # Also default kernel print('======================\n', 'Radial Basis Function Kernel') start_clf_time = time.time() SVC_classifier = SVC(kernel='rbf', gamma=0.1, C=1.38) SVC_classifier.fit(X=self.X_prep_train, y=self.y_train) output = Kappa(SVC_classifier, X_test=self.X_prep_test, y_test=self.y_test).output output['Kernel'] = 'rbf' output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\SVC_rbf.pickle", "wb") as classifier_f: pickle.dump(SVC_classifier, classifier_f) classifier_f.close() print('======================\n', 'Sigmoid Kernel') start_clf_time = time.time() SVC_sig_classifier = SVC(kernel='sigmoid', gamma=10) SVC_sig_classifier.fit(X=self.X_prep_train, y=self.y_train) output = Kappa(SVC_sig_classifier, X_test=self.X_prep_test, y_test=self.y_test).output output['Kernel'] = 'sigmoid' output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\SVC_sigmoid.pickle", "wb") as classifier_f: pickle.dump(SVC_sig_classifier, classifier_f) classifier_f.close() ''' ================================================================================================================================================ ''' print( '------------------------------------------------------------------------\n', 'Stochastic Gradient Descent:') start_clf_time = time.time() SGD_classifier = SGDClassifier() SGD_classifier.fit(X=self.X_train, y=self.y_train) output = Kappa(SGD_classifier, X_test=self.X_test, y_test=self.y_test).output output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\SGD_classifier.pickle", "wb") as classifier_f: pickle.dump(SGD_classifier, 
classifier_f) classifier_f.close() print( '------------------------------------------------------------------------\n', 'Multi-layer Perceptron:') start_clf_time = time.time() MLP_Classifier = MLPClassifier(alpha=1) MLP_Classifier.fit(X=self.X_train, y=self.y_train) output = Kappa(MLP_Classifier, X_test=self.X_test, y_test=self.y_test).output output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\MLP_Classifier.pickle", "wb") as classifier_f: pickle.dump(MLP_Classifier, classifier_f) classifier_f.close() ''' Apart from training the forest classifier, both .dot and .png files are created with a visual representation of the trees ''' print( '------------------------------------------------------------------------\n', 'Random Forest:') start_clf_time = time.time() rnd_forest = RandomForestClassifier(n_jobs=-1, n_estimators=25, warm_start=True, max_features=7) RandomForest_Classifier = rnd_forest RandomForest_Classifier.fit(X=self.X_train, y=self.y_train) if with_trees: # Export trees i_tree = 0 for tree_in_forest in rnd_forest.estimators_: tree_dot_str = getcwd() + '/trees/tree_' + str(i_tree) + '.dot' with open(tree_dot_str, 'w') as tree_dot_file: tree_dot_file = tree.export_graphviz( tree_in_forest, out_file=tree_dot_file) (graph, ) = pydot.graph_from_dot_file(tree_dot_str) graph.write_png(tree_dot_str.replace('.dot', '.png')) i_tree = i_tree + 1 output = Kappa(RandomForest_Classifier, X_test=self.X_test, y_test=self.y_test).output output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\RandomForest_Classifier.pickle", "wb") as classifier_f: pickle.dump(RandomForest_Classifier, classifier_f) classifier_f.close() print( '------------------------------------------------------------------------\n', 'Adaptive Boosting:') start_clf_time = time.time() AdaBoost_Classifier = AdaBoostClassifier() AdaBoost_Classifier.fit(X=self.X_train, y=self.y_train) output = Kappa(AdaBoost_Classifier, X_test=self.X_test, y_test=self.y_test).output output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = self.output_log.append(output) with open( getcwd() + "\\classifiers\\words_as_features\\AdaBoost_Classifier.pickle", "wb") as classifier_f: pickle.dump(AdaBoost_Classifier, classifier_f) classifier_f.close() print( '------------------------------------------------------------------------\n', 'Voted Classifier:') start_clf_time = time.time() voted_classifier = VoteClassifier( Naivebayes_classifier, # SVR_classifier, MLP_Classifier, RandomForest_Classifier, # QDA_Classifier, AdaBoost_Classifier, SVC_lin_classifier, # SVC_poly_classifier, SVC_sig_classifier, SVC_classifier, SGD_classifier, MNB_classifier, BernoulliNB_classifier, LogisticRegression_classifier) with open( getcwd() + "\\classifiers\\words_as_features\\voted_classifier.pickle", "wb") as classifier_f: pickle.dump(voted_classifier, classifier_f) classifier_f.close() output = Kappa(voted_classifier, X_test=self.X_test, y_test=self.y_test).output output['duration'] = round(time.time() - start_clf_time, 3) output['time_stamp'] = datetime.datetime.now().strftime( "%Y_%m_%d_%H:%M:%S") self.output_log = 
self.output_log.append(output) print( '------------------------------------------------------------------------' ) self.output_log['Train_News'] = self.sizes_df.loc['Training']['News'] self.output_log['Train_Spam'] = self.sizes_df.loc['Training'][ 'Not-News'] self.output_log['Test_News'] = self.sizes_df.loc['Testing']['News'] self.output_log['Test_Spam'] = self.sizes_df.loc['Testing']['Not-News'] self.output_log['feature_cnt'] = None self.output_log['type'] = 'descriptive_features' # Reorder output log self.output_log = self.output_log[[ # ID 'time_stamp', 'Name', 'Kernel', 'feature_cnt', 'type', # Sizes 'Train_News', 'Train_Spam', 'Test_News', 'Test_Spam', 'True_News', 'True_Spam', 'False_News', 'False_Spam', # Measures 'Accuracy', 'Kappa', 'rauc', 'duration', 'News_TPR', 'News_FPR', 'News_Prec', 'News_Recall', 'News_F1', 'Spam_TPR', 'Spam_FPR', 'Spam_Prec', 'Spam_Recall', 'Spam_F1', ]] # Saving results to file df = pd.DataFrame() if os.path.isfile( getcwd() + "\\classifiers\\words_as_features\\desc_weighted_confs.csv"): retry = 5 while retry > 0: try: df = pd.DataFrame().from_csv( getcwd() + "\\classifiers\\words_as_features\\desc_weighted_confs.csv", sep=";") except Exception as e: retry -= 1 time.sleep(60) print('Error reading file.', retry, 'attempts remaining ...') continue break df = self.output_log.append(df, ignore_index=True) else: df = self.output_log retry = 5 while retry > 0: try: df.to_csv( getcwd() + "\\classifiers\\words_as_features\\desc_weighted_confs.csv", sep=";") print( 'saved to', getcwd() + "\\classifiers\\words_as_features\\desc_weighted_confs.csv" ) except Exception as e: retry -= 1 time.sleep(60) print('Error writing to file.', retry, 'attempts remaining ...') continue break
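The VoteClassifier and Kappa helpers used in the method above are not defined in this file; as a rough sketch under that assumption (not the author's implementation), a hard-voting wrapper with the same call pattern could look like this:

from collections import Counter

class VoteClassifierSketch:
    # Hypothetical stand-in for the VoteClassifier used above: it takes
    # already-fitted classifiers and predicts by simple majority vote.
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def predict(self, X):
        all_preds = [clf.predict(X) for clf in self._classifiers]
        return [Counter(votes).most_common(1)[0][0] for votes in zip(*all_preds)]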
# Each vector has the length of the entire vocabulary and # an integer count for the number of times each word appeared in the document. myPattern = r'[a-z]{4,}' if token_pattern else r'(?u)\b\w\w+\b' vectorizer = CountVectorizer(stop_words=stop_words, max_df=max_df, min_df=min_df, token_pattern=myPattern) counts = vectorizer.fit_transform(X_train) # Create classifier and fit for multinomial model. clfMulti = MultinomialNB() clfMulti.fit(counts, Y_train) # Create classifier and fit for bernoulli model clfBernoulli = BernoulliNB(binarize=1) clfBernoulli.fit(counts, Y_train) X_test = df_test.text Y_test = df_test.label # Transforms each document into a vector (with length of vocabulary of train documents) with an # integer count for the number of times each word appeared in the document example_count = vectorizer.transform(X_test) # Predict labels on the test data set predictionsMulti = clfMulti.predict(example_count) predictionsBernoulli = clfBernoulli.predict(example_count) def getPercentageCorrect(predictions):
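The body of getPercentageCorrect is cut off above; assuming it simply compares predictions with the test labels from the surrounding snippet, a sketch would be:

def getPercentageCorrect(predictions):
    # Assumed completion: share of predictions matching Y_test, as a percentage.
    correct = sum(1 for pred, true in zip(predictions, Y_test) if pred == true)
    return 100.0 * correct / len(Y_test)

# e.g. getPercentageCorrect(predictionsMulti) vs. getPercentageCorrect(predictionsBernoulli)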
svm_cv.fit(train_X, train_y) print(svm_cv.best_params_) print(svm_cv.cv_results_)""" gamma_best = 1.0#svm_cv.best_params_["gamma"] # final experiments (e.g., to get standard error) numruns = 8 # try a neural network since svm can take too long to converge nn = MLPClassifier(hidden_layer_sizes = (16, 8), alpha = 0.0, max_iter = 10, random_state = None) final_algs = { "Logistic Regression": LogisticRegression(penalty = "l1", solver = "saga", random_state = None, class_weight = "balanced", max_iter = 90, C = C_best), "SVM": SVC(kernel = "rbf", random_state = None, class_weight = "balanced", gamma = gamma_best, max_iter = 1000), "Naive Bayes": BernoulliNB(alpha = 1.0, fit_prior = True) #"Neural Network": nn } print("Starting final experiments") conf_mats = {} # holds the confusion matrices for each algorithm f1 = {} # holds the list of macro f1 scores for each algorithm for name in final_algs.keys(): conf_mats[name] = pd.DataFrame([[0, 0], [0, 0]]) f1[name] = [] # compute macro average of f1 score (i.e., f1 score for every run) so that we may calculate a confidence interval for i in range(numruns):
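The experiment loop itself is truncated above; a sketch of what each run presumably does (re-split, fit each algorithm, accumulate the 2x2 confusion matrix and the macro F1 so a standard error can be computed; the names X and y for the full data are assumptions):

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
import numpy as np

for i in range(numruns):
    # X and y are assumed to hold the full feature matrix and labels.
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2)
    for name, model in final_algs.items():
        preds = model.fit(X_tr, y_tr).predict(X_te)
        conf_mats[name] += confusion_matrix(y_te, preds)
        f1[name].append(f1_score(y_te, preds, average="macro"))

# Mean and standard error of the macro F1 per algorithm:
for name in final_algs:
    print(name, np.mean(f1[name]), np.std(f1[name], ddof=1) / np.sqrt(numruns))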
#####TRYING MIX OF ALL MODELS from sklearn.naive_bayes import MultinomialNB, BernoulliNB from sklearn.linear_model import LogisticRegression, SGDClassifier from sklearn.svm import LinearSVC from sklearn.ensemble import RandomForestClassifier from sklearn.neural_network import MLPClassifier from sklearn.model_selection import ShuffleSplit from imblearn.over_sampling import SMOTE X = train_data.tweet y = train_data.label cv = ShuffleSplit(n_splits=20, test_size=0.2) models = [ MultinomialNB(), BernoulliNB(), LogisticRegression(), SGDClassifier(), LinearSVC(), RandomForestClassifier(), MLPClassifier() ] sm = SMOTE() # Init a dictionary for storing results of each run for each model results = { model.__class__.__name__: { 'accuracy': [], 'f1_score': [], 'confusion_matrix': []
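The results dictionary above is cut off; a sketch of an evaluation loop consistent with it (the per-fold TfidfVectorizer is an assumption, since the text vectorization step is not shown here):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

for train_idx, test_idx in cv.split(X):
    # Vectorize inside the fold, oversample the training portion with SMOTE,
    # then fit and score every model in the list.
    vec = TfidfVectorizer()
    X_tr = vec.fit_transform(X.iloc[train_idx])
    X_te = vec.transform(X.iloc[test_idx])
    y_tr, y_te = y.iloc[train_idx], y.iloc[test_idx]
    X_tr, y_tr = sm.fit_resample(X_tr, y_tr)
    for model in models:
        preds = model.fit(X_tr, y_tr).predict(X_te)
        res = results[model.__class__.__name__]
        res['accuracy'].append(accuracy_score(y_te, preds))
        res['f1_score'].append(f1_score(y_te, preds, average='weighted'))
        res['confusion_matrix'].append(confusion_matrix(y_te, preds))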
train_label_fn = 'train-labels-100.txt' test_data_fn = 'test-features.txt' test_label_fn = 'test-labels.txt' (train_data, train_label) = read_data(train_data_fn, train_label_fn) (test_data, test_label) = read_data(test_data_fn, test_label_fn) clf = MultinomialNB() clf.fit(train_data, train_label) y_pred = clf.predict(test_data) print('Training size = %d, accuracy = %.2f%%' % \ (train_data.shape[0],accuracy_score(test_label, y_pred)*100)) train_data_fn = 'train-features-50.txt' train_label_fn = 'train-labels-50.txt' test_data_fn = 'test-features.txt' test_label_fn = 'test-labels.txt' (train_data, train_label) = read_data(train_data_fn, train_label_fn) (test_data, test_label) = read_data(test_data_fn, test_label_fn) clf = MultinomialNB() clf.fit(train_data, train_label) y_pred = clf.predict(test_data) print('Training size = %d, accuracy = %.2f%%' % \ (train_data.shape[0],accuracy_score(test_label, y_pred)*100)) clf = BernoulliNB(binarize = .5) clf.fit(train_data, train_label) y_pred = clf.predict(test_data) print('Training size = %d, accuracy = %.2f%%' % \ (train_data.shape[0],accuracy_score(test_label, y_pred)*100))
def classification_naive_bayes(X, Y, nome): nb_model = BernoulliNB() classification_model_cv(X, Y, nb_model, "Naive Bayes "+nome)
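classification_model_cv is not defined in this file; a plausible minimal sketch, assuming it only reports cross-validated accuracy for the model it is given:

from sklearn.model_selection import cross_val_score

def classification_model_cv(X, Y, model, label):
    # Assumed behaviour: 5-fold cross-validated accuracy, printed with the label.
    scores = cross_val_score(model, X, Y, cv=5, scoring="accuracy")
    print("%s: mean accuracy %.3f (+/- %.3f)" % (label, scores.mean(), scores.std()))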
vecCount = CountVectorizer(min_df=3) vecCount.fit(X_train["text"]) # Number of distinct words print("word size: ", len(vecCount.vocabulary_)) # Show the first 5 words print("word content: ", dict(list(vecCount.vocabulary_.items())[0:5])) # Vectorize the training and test data X_train_vec = vecCount.transform(X_train["text"]) X_test_vec = vecCount.transform(X_test["text"]) # Show the first 5 vectorized rows print("First 5 vectorized rows") print(pd.DataFrame(X_train_vec.toarray()[0:5], columns=vecCount.get_feature_names())) # -Build the model- # Bernoulli model model = BernoulliNB() model.fit(X_train_vec, Y_train["class"]) # -Evaluation- print("Train accuracy = %.3f" % model.score(X_train_vec, Y_train)) print("Test accuracy = %.3f" % model.score(X_test_vec, Y_test)) # -Prediction- # Create text data for prediction data = np.array([ "I am happy.", "Are you happy? 00", "Free service! Please contact me immediately. But it is 300 US dollars next month." ]) df_data = pd.DataFrame(data, columns=["text"]) # Vectorize the prediction text data
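The snippet above stops right before the prediction step; the likely continuation (a sketch reusing vecCount and model from above) vectorizes the new texts and classifies them:

# Sketch of the continuation: vectorize the prediction texts and classify them.
data_vec = vecCount.transform(df_data["text"])
print(pd.DataFrame({"text": df_data["text"], "predicted": model.predict(data_vec)}))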
print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] clf.fit(X_train, y_train) score = f05_scorer(clf, X_test, y_test) if score > best_score: best_clf = clf best_score = score fout = open('kbest-multinomialNB.pickle','w') pickle.dump(clf,fout) fout.close() ####################### print "Bernoulli NB" clf = BernoulliNB(binarize = 0.0, alpha = 0.25, fit_prior = False) kf = KFold(72000, n_folds=10, shuffle=True) best_score = 0 best_clf = 0 for train_index, test_index in kf: print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] clf.fit(X_train, y_train) score = f05_scorer(clf, X_test, y_test) if score > best_score: best_clf = clf best_score = score fout = open('kbest-bernoulliNB.pickle','w')
def __init__(self, info, verbose=True, debug_mode=False, run_on_gpu=False): self.label_num = info['label_num'] self.target_num = info['target_num'] self.task = info['task'] self.metric = info['metric'] self.postprocessor = MultiLabelEnsemble( LogisticRegression(), balance=False) # To calibrate proba if debug_mode >= 2: self.name = "RandomPredictor" self.model = RandomPredictor(self.target_num) self.predict_method = self.model.predict_proba return if info['task'] == 'regression': if info['is_sparse'] == True: self.name = "BaggingRidgeRegressor" self.model = BaggingRegressor( base_estimator=Ridge(), n_estimators=1, verbose=verbose, random_state=1) # unfortunately, no warm start... # Lukasz uses BernoulliNB() instead of Ridge() else: #self.name = "GradientBoostingRegressor" #self.model = GradientBoostingRegressor(n_estimators=1, verbose=verbose, warm_start = True, random_state=1) # There is a problem with "GradientBoostingRegressor", which does not accept non c-contiguous arrays. self.name = "RandomForestRegressor" self.model = RandomForestRegressor(n_estimators=1, random_state=1, warm_start=True) self.predict_method = self.model.predict else: if info['has_categorical']: # Out of laziness, we do not convert categorical variables... self.name = "RandomForestClassifier" self.model = RandomForestClassifier( n_estimators=1, verbose=verbose, random_state=1 ) # New: warm_start = True; warm start exists in sklearn 0.16.1 but is not used here for backward compatibility elif info['format'] == 'sparse_binary': self.name = "BaggingBernoulliNBClassifier" self.model = BaggingClassifier( base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose, random_state=1) # unfortunately, no warm start... elif info['format'] == 'sparse': self.name = "BaggingMultinomialNBClassifier" self.model = BaggingClassifier( base_estimator=MultinomialNB(), n_estimators=1, verbose=verbose, random_state=1) # unfortunately, no warm start... else: if info['label_num'] > 100: self.name = "BaggingGaussianNBClassifier" self.model = BaggingClassifier( base_estimator=GaussianNB(), n_estimators=1, verbose=verbose, random_state=1) # unfortunately, no warm start... else: #self.name = "RandomForestClassifier" #self.model = RandomForestClassifier(n_estimators=1, verbose=verbose, warm_start = True , random_state=1) # New: now there is warm start in sklearn 0.16.1 self.name = "GradientBoostingClassifier" self.model = GradientBoostingClassifier( n_estimators=1, verbose=verbose, random_state=1, min_samples_split=10, warm_start=False) # New bug: warm start no longer works if info['task'] == 'multilabel.classification': self.model = MultiLabelEnsemble(self.model) self.predict_method = self.model.predict_proba
def training_step(data, vectorizer): training_text = data['Lyrics'] training_result = data['Year'] training_text = vectorizer.fit_transform(training_text) return BernoulliNB().fit(training_text, training_result)
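A possible way to call training_step, assuming a DataFrame with 'Lyrics' and 'Year' columns and a shared CountVectorizer (the sample rows below are purely illustrative):

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB  # used inside training_step above

# Illustrative data only; the column names follow training_step's assumptions.
songs = pd.DataFrame({
    'Lyrics': ["la la la love", "guitars and trucks", "neon lights tonight"],
    'Year': [1995, 2004, 2012],
})
vectorizer = CountVectorizer()
clf = training_step(songs, vectorizer)
print(clf.predict(vectorizer.transform(["love tonight"])))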
SGD_clf = Pipeline([ ('vect', CountVectorizer(ngram_range=(1, 4))), ('tfidf', TfidfTransformer()), ('clf', LogisticRegression()), ]) elif algo == "Perceptron" or algo == "perceptron": SGD_clf = Pipeline([ ('vect', CountVectorizer(ngram_range=(1, 4))), ('tfidf', TfidfTransformer()), ('clf', Perceptron()), ]) elif algo == "BernoulliNB" or algo == "bernoulliNB": SGD_clf = Pipeline([ ('vect', CountVectorizer(ngram_range=(1, 4))), ('tfidf', TfidfTransformer()), ('clf', BernoulliNB()), ]) elif algo == "SGDClassifier" or algo == "sgdClassifier": SGD_clf = Pipeline([ ('vect', CountVectorizer(ngram_range=(1, 4))), ('tfidf', TfidfTransformer()), ('clf', SGDClassifier()), ]) # Fit model to training set SGD_clf.fit(X_train, y_train) # Predict on test set SVM_pred = SGD_clf.predict(X_test)
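Once SVM_pred is available, the fitted pipeline can be scored; a small follow-up sketch using the snippet's own variable names:

from sklearn.metrics import accuracy_score, classification_report

# Sketch: overall accuracy plus per-class precision/recall/F1 for the pipeline.
print("Accuracy:", accuracy_score(y_test, SVM_pred))
print(classification_report(y_test, SVM_pred))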
import pandas as pd import numpy as np from sklearn.model_selection import cross_val_score, cross_val_predict from sklearn import datasets from sklearn.tree import DecisionTreeClassifier from sklearn.naive_bayes import GaussianNB from sklearn.naive_bayes import BernoulliNB from sklearn.metrics import accuracy_score import random from sklearn.ensemble import VotingClassifier random.seed(2002) iris = datasets.load_iris() X = iris.data Y = iris.target tree = DecisionTreeClassifier() GNB = GaussianNB() BNB = BernoulliNB() vote = VotingClassifier(estimators=[('tree', tree), ('Gnb', GNB), ('Bnb', BNB)], weights=[2, 1, 1]) vote.fit(X, Y) pred = vote.predict(X) print(accuracy_score(Y, pred))
tweet_data = train_data['tweet_text'] topic_data = topic_analysis(train_data) count = CountVectorizer(token_pattern=r'[a-zA-Z0-9#@%_$]+[a-zA-Z0-9#@%_$]+', lowercase=False) bag_of_words = count.fit_transform(tweet_data) bag_of_words_2 = count.transform(test_data['tweet_text']) X = bag_of_words.toarray() Y = np.array(topic_data) x_train = X x_test = bag_of_words_2.toarray() y_train = Y from sklearn.naive_bayes import BernoulliNB #from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score, classification_report clf = BernoulliNB() model = clf.fit(x_train, y_train) predictions = model.predict(x_test) instance = test_data['instance_number'] dic = OrderedDict() for i in range(len(instance)): dic[instance[i]] = predictions[i] for k, v in dic.items(): print(str(k) + ' ' + str(v))
classifiers.append(GradientBoostingClassifier(random_state=random_state)) classifiers.append(RandomForestClassifier(random_state=random_state)) #Gaussian process classifiers.append(GaussianProcessClassifier(random_state=random_state)) #Generalized linear models classifiers.append(LogisticRegressionCV(random_state=random_state)) classifiers.append(PassiveAggressiveClassifier(random_state=random_state)) classifiers.append(RidgeClassifierCV()) classifiers.append(SGDClassifier(random_state=random_state)) classifiers.append(Perceptron(random_state=random_state)) classifiers.append(MLPClassifier(random_state=random_state)) #Naive Bayes classifiers.append(BernoulliNB()) classifiers.append(GaussianNB()) #Nearest Neighbors classifiers.append(KNeighborsClassifier()) #Discriminant analysis classifiers.append(LinearDiscriminantAnalysis()) #Support vector machine classifiers.append(SVC(random_state=random_state, probability=True)) classifiers.append(NuSVC(random_state=random_state, probability=True)) classifiers.append(LinearSVC(random_state=random_state)) #Trees classifiers.append(DecisionTreeClassifier(random_state=random_state))