Example #1
def BernoulliNB_1(train_predictors,test_predictors,train_target,test_target):
    clf = BernoulliNB()
    clf.fit(train_predictors,train_target)
    predicted = clf.predict(test_predictors)
    accuracy = accuracy_score(test_target, predicted)
    print "Accuracy for Bernoulli Naive Bayes: "+str(accuracy)
    return accuracy,predicted  
Example #2
def tryBinomialNaiveBayes(goFast):
  best_score = 0

  from sklearn.datasets import dump_svmlight_file, load_svmlight_file
  if goFast:
    training_data, training_labels = load_svmlight_file("dt1_1500.trn.svm", n_features=253659, zero_based=True)
    validation_data, validation_labels = load_svmlight_file("dt1_1500.vld.svm", n_features=253659, zero_based=True)
    testing_data, testing_labels = load_svmlight_file("dt1_1500.tst.svm", n_features=253659, zero_based=True)
  else:
    training_data, training_labels = load_svmlight_file("dt1.trn.svm")
    validation_data, validation_labels = load_svmlight_file("dt1.vld.svm")
    testing_data, testing_labels = load_svmlight_file("dt1.tst.svm")

  from sklearn.naive_bayes import BernoulliNB

  for alpha_value in [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
    for binarize_value in [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
      for fit_prior_value in [True, False]:
        binary_operator = BernoulliNB(alpha=alpha_value, binarize=binarize_value, fit_prior=fit_prior_value)
        binary_operator.fit(training_data,training_labels)
        current_score = binary_operator.score(validation_data,validation_labels)

        print "Current test: " + str(alpha_value), str(binarize_value), fit_prior_value
        print "Current score: " + str(current_score)

        if current_score > best_score:
          best_score = current_score
          print "***NEW MAXIMUM SCORE: " + str(best_score)
          print "***NEW MAXIMUM PARAMETERS: " + str(alpha_value), str(binarize_value), fit_prior_value

  print "Best score was " + str(best_score)
Example #3
    def render_content(self):
        if self.text_source is None:
            return "No text source selected."
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.naive_bayes import BernoulliNB
        from sklearn import metrics
        self.dm("creating vectorizer")
        vectorizer = CountVectorizer(stop_words=self.get_user_list(self.stop_list), max_features=self.vocab_size)
        data = self.get_column_data(self.text_source)
        self.dm("using vectorizer")
        X_train = vectorizer.fit_transform(data)
        Y_train = self.get_column_data(self.code_source)
        self.dm("creating classifier")
        clf = BernoulliNB()
        clf.fit(X_train, Y_train)
        
        accuracy = clf.score(X_train, Y_train)
        self.dm("predicting")
        pred = clf.predict(X_train)
        cm = metrics.confusion_matrix(Y_train, pred)

        self.dm("displaying result")
        html_output = "accuracy is " + str(round(accuracy, 2))
        html_output += '<pre>'+ str(cm) + '</pre>'

        return html_output
class NaiveBayesClassifierBernoulli:
    """
    this class capsules the Bernoulli NaiveBayes functions of scikit-learn in BernoulliNB class
"""
    def __init__(self, matrixFileName = matrixFilePath, dicFileName = dictFilePath):
        self.X,self.Y = load_svmlight_file(matrixFileName)
        self.dictionary = pickle.load(open(dicFileName, "rb"))
        self.bernoulliNB = BernoulliNB()
        self.bernoulliNB.fit(self.X, self.Y)
        self.matrixParser = Parser.MatrixParserForLearning()
        
    def classifyOneSentence(self, string):
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        if row is not None:
            return self.bernoulliNB.predict(row)
        else:
            return None
    
    def classifyOneSentenceWithProbability(self, string):
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        if row is not None:
            a = self.bernoulliNB.predict_proba(row)
            return a[0][1] - a[0][0]
        else:
            return None
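
A sketch of how this wrapper might be used (not from the source; it assumes the module-level matrixFilePath and dictFilePath defaults point at a valid svmlight matrix and a pickled dictionary):

classifier = NaiveBayesClassifierBernoulli()
label = classifier.classifyOneSentence("an example sentence")
margin = classifier.classifyOneSentenceWithProbability("an example sentence")
if label is not None:
    print(label, margin)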
Example #5
def NB_train_classifier(train_x, train_y):
    """ Returns the predictions on the validation set
    """
    classifier = BernoulliNB()
    classifier.fit(train_x, train_y)

    return classifier
Example #6
def bernoulli_classify():
    clf = BernoulliNB()
    traindata = []
    traintarget = []
    for f in glob.glob("../../../res/articles/training_data/*-articles.json"):
        target = f.replace("-articles.json", "")
        target = re.sub(r".*\/+", "", target)
        output = readWholeFileBernoulli(f, target)
        traindata.extend(output[0])
        traintarget.extend(output[1])

    testdata = []
    testtarget = []
    for f in glob.glob("../../../res/articles/test_data/*-articles.json"):
        target = f.replace("-articles.json", "")
        target = re.sub(r".*\/+", "", target)
        output = readWholeFileBernoulli(f, target)
        testdata.extend(output[0])
        testtarget.extend(output[1])

    clf.fit(traindata, traintarget)
    ncorrect = 0
    total = len(testdata)
    for i in range(len(testdata)):
        predict = clf.predict([testdata[i]])  # predict expects 2-D input, so wrap the single sample
        correct = testtarget[i]
        if correct == predict[0]:
            ncorrect += 1

        print ("Correct: {0} - Predicted: {1}".format(correct, predict[0]))

    print "Correct ", ncorrect, " Total ", total, " Correctness ", ncorrect * 1.0 / total
Example #7
def test_BernouliNB2():
    X = np.array([
        [0, 1],
        [1, 1],
        [1, 0],
        [-1, 1],
        [1000, 1000],
        [1000, 10001],
        [998, 800],
        [990, 1100],
        ]
            )
    print('X ' + str(X))
    #Y = np.array([1, 1, 1, 1, 2, 2, 2, 2])
    Y = np.array([1, 2, 3, 4, 5, 6, 7, 8])
    print('Y ' + str(Y))
    clf = BernoulliNB(alpha = 1)
    clf.fit(X, Y)
    X2 = np.array(
            [
            [1002, 1010],
            [1010, 910],
            [1003, 980],
            [1008, 1030],
            [-1, -1],
            [-3, -10],
            [40, 1],
            [1, -100],
            ]
            )
    for i in range(len(X2)):
        #pred_ret = clf.predict_proba(X2[i].reshape(1, -1))
        pred_ret = clf.predict(X2[i].reshape(1, -1))  # predict expects a 2-D array
        print('X2[' + str(i) + '] = ' + str(X2[i]) + ' pred_ret ' + str(pred_ret))
def MungeData(train, test):

    todrop = ['v22', 'v112', 'v125', 'v74', 'v1', 'v110', 'v47']
    print(todrop)

    train.drop(todrop,
               axis=1, inplace=True)
    test.drop(todrop,
              axis=1, inplace=True)

    features = train.columns[2:]
    for col in features:
        if((train[col].dtype == 'object')):
            print(col)
            train, binfeatures = Binarize(col, train)
            test, _ = Binarize(col, test, binfeatures)
            nb = BernoulliNB()
            nb.fit(train[col+'_'+binfeatures].values, train.target.values)
            train[col] = \
                nb.predict_proba(train[col+'_'+binfeatures].values)[:, 1]
            test[col] = \
                nb.predict_proba(test[col+'_'+binfeatures].values)[:, 1]
            train.drop(col+'_'+binfeatures, inplace=True, axis=1)
            test.drop(col+'_'+binfeatures, inplace=True, axis=1)

    features = train.columns[2:]
    train[features] = train[features].astype(float)
    test[features] = test[features].astype(float)
    train.fillna(-1, inplace=True)
    test.fillna(-1, inplace=True)
    return train, test
Example #9
def bnb_fit(train_data, train_lbl_data):
    from sklearn.naive_bayes import BernoulliNB
    print "Starts bnb"

    bnb = BernoulliNB()
    bnb.fit(train_data, train_lbl_data)
    return bnb
Example #10
def predict(cur, plyr_id, game_plyrs): 
  #creates training set (called 'X') for plyr
  all_plyrs = all_player_ids(cur) #np.array - all NFL players (and coaches)
  games = games_played_in(cur, plyr_id) #np.array - the games_ids the player played in
  n_cols = all_plyrs.shape[0] #int 
  m_rows = games.shape[0] #int
  zeros = np.zeros((m_rows, n_cols)) #2darr - used to initialize DF
  X = pd.DataFrame(zeros, index=games, columns=all_plyrs) #dataframe
  populate_training_set(cur, X, games, plyr_id)
  print "X: ", X.values
  
  
  #creates vector of known output values
  Y = training_output_vector(cur, games, plyr_id)
  print "(len) Y: ", len(Y), Y
  test_zeros = np.zeros((1, n_cols)) #2darr - used to initialize DF
  test_X = pd.DataFrame(zeros, columns=all_plyrs) #dataframe
  update_training_matrix(game_plyrs, 0, test_X)
  
  #run Bernoulli NB Classifier
  nb_clf = BernoulliNB()
  
  if len(X.values) == 0:
    return 0
  nb_clf.fit(X, Y)
  nb_predictions = nb_clf.predict(test_X)
  print "test_X: ", test_X.values
  nb_norm_prob = normalize_probs(nb_clf.predict_proba(test_X)[0])
  avgs = [1.5, 4.5, 7.5, 10.5, 13.5, 16.5, 19.5, 22.5, 25.5, 28.5, 31.5]
  print "param vector: ", nb_clf.predict_proba(test_X)[0]
  print "probs: ", nb_norm_prob
  print avgs
  ev = expected_val(nb_norm_prob, avgs) #can also calc dot product
  return round(ev, 1)
def combined_experiment(train_x,train_y,test_x,test_y,train_f_x,train_f_y,test_f_x,test_f_y, bias):
    labels = [] # Will contain all the final labels that result from the voting
    clf_c1 = MultinomialNB()
    clf_c1.fit(train_x,train_y)
    clf_c2 = BernoulliNB()
    clf_c2.fit(train_x,train_y)
    clf_f1 = svm.SVC(kernel='linear',cache_size = 512)
    clf_f1.fit(train_f_x,train_f_y)
    clf_f2 = svm.SVC(kernel='rbf',cache_size = 512)
    clf_f2.fit(train_f_x,train_f_y)
    
    p1 = clf_c1.predict(test_x)
    p2 = clf_c2.predict(test_x)
    p3 = clf_f1.predict(test_f_x)
    p4 = clf_f2.predict(test_f_x)
    if bias == 'content':
        for i in range(len(p1)):
            if p1[i] == p2[i] or p1[i] == p3[i]:
                labels.append(p1[i])
            else:
                labels.append(p2[i])
    elif bias == "syntax":
        for i in range(len(p1)):
            if p1[i] == p3[i] or p1[i] == p4[i]:
                labels.append(p1[i])
            else:
                labels.append(p3[i])
    else:
        print('Please enter a valid bias ("syntax" or "content")!')
    p_combined = np.array(labels)
    accuracy = (np.sum(p_combined == test_y)/np.float_(len(test_y)))
    return accuracy
 def doclassify(self, type='normal'):
     if type == 'normal':
         clf = BernoulliNB()
         clf.fit(self.train_x, self.train_y)
         score = clf.score(self.train_x, self.train_y)
         print('score = ', score)
def BNB(data_train, data_train_vectors, data_test_vectors, **kwargs):
    # Implementing classification model- using BernoulliNB
    clf_BNB = BernoulliNB(alpha=.01)
    clf_BNB.fit(data_train_vectors, data_train.target)
    y_pred = clf_BNB.predict(data_test_vectors)
    
    return y_pred
Example #14
def compareClassifiers():
	(observations, classes) = createObservations()
	observations = np.array(observations)
	classes = np.array(classes)

	# make tree classifier
	my_tree = tree.DecisionTreeClassifier()
	my_tree.fit(observations, classes)
	tree_score = my_tree.score(observations, classes)
	tree_cv = cross_validation.cross_val_score(my_tree, observations, classes, scoring='accuracy', cv=10)
	#print "tree score:", tree_score, "tree cv", np.mean(tree_cv)

	# make naive classifier
	naive = BernoulliNB(binarize=None)
	naive.fit(observations, classes)
	naive_score = naive.score(observations, classes)
	naive_cv = cross_validation.cross_val_score(naive, observations, classes, scoring='accuracy', cv=10)
	#print "naive score:", naive_score, "naive cv", np.mean(naive_cv)

	# make SVM classifier
	svm = LinearSVC()
	svm.fit(observations, classes)
	svm_score = svm.score(observations, classes)
	svm_cv = cross_validation.cross_val_score(svm, observations, classes, scoring='accuracy', cv=10)
	#print "svm score:", svm_score, "svm cv", np.mean(svm_cv)

	# make Log classifier
	log = LogisticRegression()
	log.fit(observations, classes)
	log_score = log.score(observations, classes)
	log_cv = cross_validation.cross_val_score(log, observations, classes, scoring='accuracy', cv=10)
	#print "log score:", log_score, "log cv", np.mean(log_cv)

	return [(tree_score, np.mean(tree_cv)), (naive_score, np.mean(naive_cv)), (svm_score, np.mean(svm_cv)), (log_score, np.mean(log_cv))]
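
Note that this example relies on the old sklearn.cross_validation module, which was deprecated in scikit-learn 0.18 and removed in 0.20. On a current install the equivalent call lives in model_selection, e.g.:

from sklearn.model_selection import cross_val_score
tree_cv = cross_val_score(my_tree, observations, classes, scoring='accuracy', cv=10)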
Example #15
def test_BernouliNB4():
    X = np.array([
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [0, 0],
        [0, 0],
        [1, 0],
        ]
            )
    print('X ' + str(X))
    #Y = np.array([1, 1, 1, 1, 2, 2, 2, 2])
    Y = np.array([1, 1, 0, 1, 0, 0, 0, 1, 1, 0])
    print('Y ' + str(Y))
    clf = BernoulliNB(alpha = 1)
    clf.fit(X, Y)
    X2 = np.array(
            [
            [1, 1],
            ]
            )
    for i in range(len(X2)):
        #pred_ret = clf.predict_proba(X2[i].reshape(1, -1))
        pred_ret = clf.predict(X2[i].reshape(1, -1))  # predict expects a 2-D array
        print('X2[' + str(i) + '] = ' + str(X2[i]) + ' pred_ret ' + str(pred_ret))
Example #16
def MungeData(train, test, validation):

    features = train.columns[2:]
    print(type(features))
    for col in features:
        if((train[col].dtype == 'object') and (col!="v22")):
            print(col)
            train, binfeatures = Binarize(col, train)
            test, _ = Binarize(col, test, binfeatures)
            validation , _ = Binarize(col, validation, binfeatures)
            nb = BernoulliNB()
            nb.fit(train[col+'_'+binfeatures].values, train.target.values)
            train[col] = \
                nb.predict_proba(train[col+'_'+binfeatures].values)[:, 1]
            test[col] = \
                            nb.predict_proba(test[col+'_'+binfeatures].values)[:, 1]
            validation[col] = \
                            nb.predict_proba(validation[col+'_'+binfeatures].values)[:, 1]
            train.drop(col+'_'+binfeatures, inplace=True, axis=1)
            test.drop(col+'_'+binfeatures, inplace=True, axis=1)
            validation.drop(col+'_'+binfeatures, inplace=True, axis=1)
            train[col] = train[col].astype(float)
            test[col] = test[col].astype(float)
            validation[col] = validation[col].astype(float)
    return train, test, validation
Example #17
def main(output_file=time.strftime('%h%d-%Hh%Mm')+'.csv', in_pkl=None):
    """ Generates features and fits classifier. 
    Input command line argument is optional run name, defaults to date/time.
    """
    logging.info("Loading features...")
    if not in_pkl:
        return "input .plk required"
    trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(in_pkl)
    logging.info("Loaded features, fitting model...")
    # Bernoulli Naive Bayes
    clf = BernoulliNB(alpha=1.0, binarize=None, fit_prior=True)
    clf.fit(trainFeatures,trainTargets)
    logging.info("Predicting...")
    # Use probabilities instead of binary class prediction in order to generate a ranking    
    predicted_scores = clf.predict_log_proba(testFeatures).T[1]

    logging.info("Write results...")
    logging.info("Writing submission to %s" % output_file)
    f = open(output_file, "w")
    f.write("id\n")

    for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse = True):
        # only writes item_id per output spec, but may want to look at predicted_scores
        f.write("%d\n" % (item_id))

    f.close()
    logging.info("Done.")
Example #18
    def generatePredictingModel(data):
        """
            Build the prediction model (based on the data set we have) in order to be able to predict the category
            of a new video from the user input
            Return a classifier able to predict the category of a video based on its title and description.
        """
        try:
            # Initialize a timer to compute the time to build the model
            start = time.time()

            # Split into train-test data set
            X = data[[x for x in data.columns if x in ('title', 'description')]]
            Y = data[[x for x in data.columns if x in ('video_category_id',)]]
            X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.80, random_state = 10)

            # Build the 2 text corpora
            corpus_title = X_train['title'].values.tolist()
            corpus_description = X_train['description'].values.tolist()

            # initializes the 2 vectorizers.
            count_vectorizer_title = CountVectorizer()
            count_vectorizer_description = CountVectorizer()

            # learn the 2 vocabulary dictionaries
            count_vectorizer_title.fit(corpus_title)
            count_vectorizer_description.fit(corpus_description)

            # Build the sparse matrices
            X_train_count_title = count_vectorizer_title.transform(X_train['title'])
            X_train_count_description = count_vectorizer_description.transform(X_train['description'])
            X_test_count_title = count_vectorizer_title.transform(X_test['title'])
            X_test_count_description = count_vectorizer_description.transform(X_test['description'])

            # Set and train the models (for title and description features)
            model_count_title = BernoulliNB()
            model_count_description = BernoulliNB()
            model_count_title.fit(X_train_count_title, Y_train['video_category_id'])
            model_count_description.fit(X_train_count_description, Y_train['video_category_id'])

            # Merge the title and description predictions and build a new prediction based on these 2 predictions combined
            new_df_train = pd.DataFrame()
            new_df_train['title_prediction'] = model_count_title.predict(X_train_count_title)
            new_df_train['description_prediction'] = model_count_description.predict(X_train_count_description)
            new_df_test = pd.DataFrame()
            new_df_test['title_prediction'] = model_count_title.predict(X_test_count_title)
            new_df_test['description_prediction'] = model_count_description.predict(X_test_count_description)
            tree = DecisionTreeClassifier()
            tree.fit(new_df_train, Y_train)

            end = time.time()
            execution_time = end - start

            print "Time to build this incredibly amazing model, only : {} seconds!!!!!!".format(execution_time)
            time.sleep(3)

            return tree, model_count_title, model_count_description,count_vectorizer_title,count_vectorizer_description

        except:
            raise VideoAnalysisException(" Error while creation of predictive model ")
Example #19
def score(train_X, train_y):

    X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size=0.01, random_state=10)

    clf = BernoulliNB(binarize=False, fit_prior=True, alpha=0.7)
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_valid)
    return log_loss(y_valid, y_pred)
Example #20
	def testBoGNB(self):
		'''
		Test on sentiment analysis task using Naive Bayes classifier 
		with Bag-of-Word feature vectors.
		'''
		wordlist = []
		# Preprocessing of original txt data set
		for i, sent in enumerate(self.senti_train_txt):
			words = sent.split()
			words = [word.lower() for word in words if len(word) > 2]
			wordlist.extend(words)
		for i, sent in enumerate(self.senti_test_txt):
			words = sent.split()
			words = [word.lower() for word in words if len(word) > 2]
			wordlist.extend(words)
		word_dict = set(wordlist)
		word2index = dict(zip(word_dict, range(len(word_dict))))
		# Build BoG feature
		train_size = len(self.senti_train_txt)
		test_size = len(self.senti_test_txt)
		pprint('Training set size: %d' % train_size)
		pprint('Test set size: %d' % test_size)
		train_feat = np.zeros((train_size, len(word_dict)), dtype=float)
		test_feat = np.zeros((test_size, len(word_dict)), dtype=float)
		# Using binary feature
		start_time = time.time()
		for i, sent in enumerate(self.senti_train_txt):
			words = sent.split()
			words = [word.lower() for word in words if len(word) > 2]
			indices = [word2index[w] for w in words]
			train_feat[i, indices] = 1.0
		for i, sent in enumerate(self.senti_test_txt):
			words = sent.split()
			words = [word.lower() for word in words if len(word) > 2]
			indices = [word2index[w] for w in words]
			test_feat[i, indices] = 1.0
		end_time = time.time()
		pprint('Finished building training and test feature matrix, time used: %f seconds.' % (end_time-start_time))
		pprint('Classification using Bernoulli Naive Bayes classifier: ')
		clf = BernoulliNB()
		# clf = LogisticRegression()
		clf.fit(train_feat, self.senti_train_label)
		train_pred_label = clf.predict(train_feat)
		train_acc = np.sum(train_pred_label == self.senti_train_label) / float(train_size)
		pprint('Training accuracy = %f' % train_acc)
		pred_label = clf.predict(test_feat)
		acc = np.sum(pred_label == self.senti_test_label) / float(test_size)
		pprint('Accuracy: %f' % acc)
		train_pos_count = np.sum(self.senti_train_label == 1)
		train_neg_count = np.sum(self.senti_train_label == 0)
		test_pos_count = np.sum(self.senti_test_label == 1)
		test_neg_count = np.sum(self.senti_test_label == 0)
		pprint('Positive count in training set: %d' % train_pos_count)
		pprint('Negative count in training set: %d' % train_neg_count)
		pprint('Ratio: pos/neg = %f' % (float(train_pos_count) / train_neg_count))
		pprint('Positive count in test set: %d' % test_pos_count)
		pprint('Negative count in test set: %d' % test_neg_count)
		pprint('Ratio: pos/neg = %f' % (float(test_pos_count) / test_neg_count))
Example #21
 def nb_classifier(self, secret):
     clf = BernoulliNB()
     x = self.raw_attr_vector(secret)
     y = self.get_labels(secret)
     fsl = self.feature_sel(secret)
     new_x = fsl.transform(x)
     clf.fit(new_x, y)
     new_y = clf.predict(new_x)
     return clf, fsl, self.evaluate(new_y, y)
Example #22
def bnb(X,y,Z,test_data):  
    from sklearn.naive_bayes import BernoulliNB
    bnb = BernoulliNB()
    bnb.fit(X,y)
    #MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)  
    test_probs_bnb = bnb.predict_proba(Z)[:, 1]
    sub = pd.DataFrame({'enrollment_id':test_data["enrollment_id"], 
                        'truth':test_probs_bnb}).set_index("enrollment_id")
    sub.to_csv('data\\result\\sixth_bnb.csv')
Example #23
def train(neg=None, pos=None):
    the_file = os.path.dirname(os.path.abspath(__file__))
    if not neg:
        neg = os.path.join(the_file, '..', 'origin', 'neg.txt')
    if not pos:
        pos = os.path.join(the_file, '..', 'origin', 'pos.txt')
    
    tagger = crfseg.create_tagger()
    tok_cn = lambda x: crfseg.cut_zh(x, tagger)
    
    tfidf = TfidfVectorizer(tokenizer=tok_cn, sublinear_tf=True, max_df=0.5)
    pipe = Pipeline([
        ('tfidf', tfidf),
    #    ('svd', TruncatedSVD(32)),
    #    ('normal', Normalizer(copy=False))
        ])
    '''
    hasher = HashingVectorizer(n_features=2**16,
                               tokenizer=tok_cn, non_negative=True,
                               norm=None, binary=False)
    '''

    #clf = SGDClassifier(loss='log', penalty='l2', alpha=0.00001, n_iter=50, fit_intercept=True)
    #clf = MultinomialNB()
    clf = BernoulliNB()
    
    neg_file = codecs.open(neg, 'r', 'utf-8')
    pos_file = codecs.open(pos, 'r', 'utf-8')

    x_train = []
    y_train = []
    
    i = 0
    for line in neg_file:
        x_train.append(line)
        y_train.append(0)
    for line in pos_file:
        x_train.append(line)
        y_train.append(1)
    
    print('begin transform')
    #x_train = hasher.transform(x_train)
    x_train = pipe.fit_transform(x_train)
    print('begin fit')
    clf.fit(x_train, y_train)

    print('begin save')
    tfidf_file = os.path.join(the_file, 'data', 'tfidf.pkl')
    clf_file = os.path.join(the_file, 'data', 'sgdc_clf.pkl')
    #_ = joblib.dump(tfidf, tfidf_file, compress=9)
    _ = joblib.dump(clf, clf_file, compress=9)

    print('begin test')
    x_test = [u'这个东西真心很赞']  # "this thing is really great"
    #x_test = hasher.transform(x_test)
    x_test = pipe.transform(x_test)
    print(clf.predict(x_test))
Example #24
def BernoulliNB_pred(X_train, X_test, y_train):
    clf_NB = BernoulliNB()
    clf_NB.fit(X_train, y_train)

    # Converting back (sklearn preprocessing utilities could be used for both encoding and decoding)
    predictions_train = clf_NB.predict_proba(X_train)
    predictions = clf_NB.predict_proba(X_test)

    return predictions[:, 1], predictions_train[:, 1]
Example #25
def bernoulli_naive_bayes(x_train, y_train, x_cv, y_cv):
    """ Using Naive Bayes to classify the data. """

    print('Training with NB...')
    clf = BernoulliNB()
    clf.fit(x_train, y_train)

    print('Accuracy in training set: %f' % clf.score(x_train, y_train))
    print('Accuracy in cv set: %f' % clf.score(x_cv, y_cv))
    return clf
def convertToNumeric(df):
    features = df.columns[2:]
    for col in features:
        if((df[col].dtype == 'object')):
            print "Converting {0} to numerical data".format(col)
            labelEncode(df, col)
            nb = BernoulliNB()
            nb.fit(df[[col]], df['target'])
            new_col = col + "_binarized"
            df[new_col] = nb.predict_proba(df[[col]])[:, 1]
Example #27
def BernoulliNaiveBayes(x_train, y_train, x_cv, y_cv):
	"""
	Bernoulli Naive Bayes
	"""
	#print "Classifier: Bernoulli Naive Bayes"
	clfr = BernoulliNB()
	clfr.fit(x_train, y_train)
	#print 'Accuracy in training set: %f' % clfr.score(x_train, y_train)
	#print 'Accuracy in cv set: %f' % clfr.score(x_cv, y_cv)
	return clfr
Example #28
def test_BernouliNB():
    X = np.random.randint(2, size=(6, 100))
    print('X ' + str(X))
    Y = np.array([1, 2, 3, 4, 4, 5])
    print('Y ' + str(Y))
    clf = BernoulliNB()
    clf.fit(X, Y)
    for i in range(6):
        pred_ret = clf.predict(X[i].reshape(1, -1))  # predict expects a 2-D array
        print('X[' + str(i) + '] = ' + str(X[i]) + ' pred_ret ' + str(pred_ret))
Example #29
def evaluate_baseline():
    inputs, outputs, words = preprocessing.build_data_target_matrices("aclImdb/train/pos", "aclImdb/train/neg", binary_output=True)
    tst_inputs, tst_outputs, _ = preprocessing.build_test_data_target_matrices("aclImdb/test/pos", "aclImdb/test/neg", words, binary_output=True)
    model = BernoulliNB()

    scores = cross_val_score(model, inputs, outputs.ravel(), cv=10)
    logging.info("Accuracy for %s: %.02f, std: %.02f" % ("Baseline BernoulliNB", scores.mean(), scores.std()))

    model.fit(inputs, outputs.ravel())
    logging.info(accuracy_score(tst_outputs.ravel(), model.predict(tst_inputs)))
Example #30
class NaiveBayes(StatModel):
	def __init__(self):
		self.name  = "nb"
		self.model = BernoulliNB()

	def train(self, samples, labels):
		self.model.fit(samples, labels)
				
	def predict(self, samples):
		return self.model.predict(samples)
    plt.ylabel('F1 Score')
    plt.xlabel('Log (' + param_name + ')')
    plt.title('Plot - Validation Set Performance of ' + classifier_name +
              ' w.r.t. ' + param_name)
    plt.show()


# Naive bayes classifier

print("Bernoulli Naive Bayes Classifier")
# Tuning Hyper Parameter alpha
hp_f1 = []
a = alpha_from
while a < alpha_to:
    classifier = BernoulliNB(alpha=a)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_valid)
    score = f1_score(y_valid, y_pred, average=f1_avg_param)
    hp_f1.append([math.log10(a), score, a])
    print("Alpha " + str(a) + " : " + str(score))
    a *= alpha_step
# select the alpha with the best F1 score
selected_alpha = max(hp_f1, key=lambda item: item[1])
print("Alpha with best performance : " + str(selected_alpha[2]))

#plot the graph
performance_plot(
    np.asarray(hp_f1)[:, 0],
    np.asarray(hp_f1)[:, 1], selected_alpha, "Naive Bayes classifier", "Alpha")

#Training the classifier on the selected alpha
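# (The snippet is cut off here; a minimal sketch of the step the comment above
# announces, assuming the same x_train / y_train used in the tuning loop.)
classifier = BernoulliNB(alpha=selected_alpha[2])
classifier.fit(x_train, y_train)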
Example #32
corpus = [dictionary.doc2bow(text) for text in processed_texts]
#print(corpus[1])

# ## Initializing TFIDF parameters from corpus
tfidf = models.TfidfModel(corpus)

# ## Creating TFIDF Matrix from data
corpus_tfidf = tfidf[corpus]
print(corpus_tfidf.obj)

## Creating LSA model on the tfidf
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=120)
lsi.print_topics(10)

lsi_corpus = []
for lsi_doc in lsi[corpus]:
    lsi_corpus.append([topic_component[1] for topic_component in lsi_doc])
import numpy as np
lsi_corpus = np.array(lsi_corpus)
print(lsi_corpus.shape)

from sklearn.naive_bayes import BernoulliNB
nb_model = BernoulliNB()
nb_model.fit(lsi_corpus, all_categories)

from sklearn.metrics import accuracy_score

#the backslashes continue the expression on the next line
print('Accuracy on training data: {}%'.format(\
                                          accuracy_score(all_categories, nb_model.predict(lsi_corpus))\
                                          *100))
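
The loop above that builds lsi_corpus assumes every document comes back with all 120 topic components; gensim ships a helper that handles this conversion robustly. A minimal sketch of the same conversion (same lsi and corpus objects as above):

from gensim import matutils
# corpus2dense returns a (num_terms, num_docs) array, hence the transpose
lsi_corpus = matutils.corpus2dense(lsi[corpus], num_terms=120).T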
Example #33
def extract(positive_fcs, negative_fcs, features=None):
    '''Takes a labeled set of feature collections (positive and negative)
       and the features wanted, and trains a Naive Bayes classifier on
       the underlying keys of the selected features.
       If no features are selected, all are used.

       Returns two lists of (keyword, strength) tuples ordered by strength. The
       first are feature keys that were predictive of the positive
       label and the second are feature keys that were predictive
       of the negative label.

    ``*_fcs`` is the list of feature collections, positive label and
            negative label respectively.

    ``features`` designates which specific feature gets vectorized; the
               other features are ignored.

    '''

    # Vector of labels
    labels = np.array([1] * len(positive_fcs) + [0] * len(negative_fcs))

    # Used to convert the feature collection keys into a sklearn
    # compatible format
    v = DictVectorizer(sparse=False)

    D = list()
    for fc in (positive_fcs + negative_fcs):
        feat = StringCounter()

        if not fc:
            logger.warn('how did we get an empty fc? %r', fc)

        else:
            # The features used to pull the keys for the classifier
            for f in features:
                feat += fc[f]

        D.append(feat)

    # Convert the list of Counters into an sklearn compatible format
    X = v.fit_transform(D)

    # Fit the sklearn Bernoulli Naive Bayes classifier
    clf = BernoulliNB()
    clf.fit(X, labels)

    # Extract the learned features that are predictive of the positive
    # and negative class
    positive_keywords = v.inverse_transform(clf.feature_log_prob_[1])[0]
    negative_keywords = v.inverse_transform(clf.feature_log_prob_[0])[0]

    pos_words = Counter(positive_keywords)
    neg_words = Counter(negative_keywords)

    ## make a list ordered by their weight
    pos_ordered = sorted(pos_words.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
    neg_ordered = sorted(neg_words.items(),
                         key=operator.itemgetter(1),
                         reverse=True)

    return pos_ordered, neg_ordered
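
A sketch of how extract might be called (not from the source; positive_fcs, negative_fcs, and the 'bow' feature name are placeholders):

pos_ordered, neg_ordered = extract(positive_fcs, negative_fcs, features=['bow'])
for keyword, strength in pos_ordered[:10]:
    print(keyword, strength)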
Example #34
y_train

# In[16]:

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

# In[17]:

myber = BernoulliNB()
mygau = GaussianNB()
mymul = MultinomialNB()

# In[19]:

mygaumodel = mygau.fit(x_train, y_train)
mybermodel = myber.fit(x_train, y_train)
mymulmodel = mymul.fit(x_train, y_train)

# In[20]:

ypgau = mygaumodel.predict(x_test)
ypber = mybermodel.predict(x_test)
ypmul = mymulmodel.predict(x_test)

# In[21]:

from sklearn import metrics

# In[24]:

acc_gau = metrics.accuracy_score(y_target, ypgau)
Example #35
from __future__ import division
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
import sys
import image_util

(train_set, train_label,
 count_label) = image_util.load_dataset(image_util.DS2_TRAIN_PATH,
                                        image_util.DS2_LABEL_SIZE)

# clf = GaussianNB()
clf = BernoulliNB()
clf.fit(train_set, train_label)

(val_set, val_label,
 val_count_label) = image_util.load_dataset(image_util.DS2_VAL_PATH,
                                            image_util.DS2_LABEL_SIZE)

predictions = clf.predict(val_set)

correct_count = 0
for row in range(image_util.DS2_VAL_SIZE):
    print("prediction: " + str(predictions[row]))
    print("actual: " + str(val_label[row]))

    if predictions[row] == val_label[row]:
        correct_count = correct_count + 1

print(correct_count / image_util.DS2_VAL_SIZE)
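
The counting loop above computes plain accuracy, which the classifier also exposes directly; an equivalent one-liner (same val_set / val_label):

print(clf.score(val_set, val_label))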
Example #36
def bernoulli_naive_bayes_classifier(train_x, train_y):
    from sklearn.naive_bayes import BernoulliNB
    model = BernoulliNB(alpha=0.01)
    model.fit(train_x, train_y)
    return model
Example #37
def bernNBClassifier(trainingVectors, targetValues):

    clf = BernoulliNB()
    clf.fit(trainingVectors, targetValues, sample_weight=targetValues * 10000)  # the third fit argument is sample_weight

    return (clf)
Example #38
print("\n" + "SVC_classifier")
log_model3 = LinearSVC()
log_model3 = log_model3.fit(X=X_train, y=y_train)
y_pred = log_model3.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

from sklearn.naive_bayes import MultinomialNB, BernoulliNB

# MultinomialNB_classifier
print("\n" + "MultinomialNB")
log_model_multinomial = MultinomialNB()
log_model_multinomial = log_model_multinomial.fit(X=X_train, y=y_train)
y_pred = log_model_multinomial.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

# BernoulliNB Classifier
print("\n" + "BernoulliNB")
log_model_bernoulli = BernoulliNB()
log_model_bernoulli = log_model_bernoulli.fit(X=X_train, y=y_train)
y_pred = log_model_bernoulli.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.9051851851851852
exported_pipeline = BernoulliNB(alpha=0.001, fit_prior=True)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
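
A quick sanity check of the exported pipeline (not part of the TPOT export itself), scoring its predictions on the held-out split created above:

from sklearn.metrics import accuracy_score
print(accuracy_score(testing_target, results))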
# Build the term matrix (word vectors)
data = pd.read_excel('C:/Users/64191/Desktop/Contents.xlsx', sheetname=0)
data.Content = data.Content.str.replace('[0-9a-zA-Z]', '')
jieba.load_userdict(r'C:/Users/64191/Desktop/all_words.txt')
with open(r'C:/Users/64191/Desktop/mystopwords.txt', encoding='UTF-8') as f:
    stop_words = [i.strip('\n') for i in f.readlines()]


def cut(x):
    words = []
    for i in jieba.lcut(x):
        if i not in stop_words:
            words.append(i)
    result = ' '.join(words)
    return result


word = data.Content.apply(cut)
counts = CountVectorizer(min_df=0.01)
data_matrix = counts.fit_transform(word).toarray()
# Perform classification and testing
X = pd.DataFrame(data_matrix, columns=counts.get_feature_names())
Y = data.Type
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=1)
bnb = BernoulliNB()
bnb.fit(X_train, Y_train)
pred = bnb.predict(X_test)
print(classification_report(Y_test, pred))
Example #41
trainData = pd.read_table('../dataset1/train.txt',
                          header=None,
                          encoding='gb2312',
                          delim_whitespace=True)
testData = pd.read_table('../dataset1/test.txt',
                         header=None,
                         encoding='gb2312',
                         delim_whitespace=True)
trainLabel = np.array(trainData.pop(3))
trainData = np.array(trainData)
testLabel = np.array(testData.pop(3))
testData = np.array(testData)

time_start1 = time.time()
clf1 = BayesClassifier()
clf1.train(trainData, trainLabel)
clf1.predict(testData)
score1 = clf1.accuarcy(testLabel)
time_end1 = time.time()
print("Accuracy of self-Bayes: %f" % score1)
print("Runtime of self-Bayes:", time_end1 - time_start1)

time_start = time.time()
clf = BernoulliNB()
clf.fit(trainData, trainLabel)
clf.predict(testData)
score = clf.score(testData, testLabel, sample_weight=None)
time_end = time.time()
print("Accuracy of sklearn-Bayes: %f" % score)
print("Runtime of sklearn-Bayes:", time_end - time_start)
Example #42
 clf_sum = 0
 lr_sum = 0
 svm_li_sum = 0
 svm_rbf_sum = 0
 i = -1
 for train, test in kf:
     #NBC
     i += 1
     train_1 = train_disc_data[train]
     train_2 = train_conti_data[train]
     test_1 = train_disc_data[test]
     test_2 = train_conti_data[test]
     train_true = train_target[train]
     test_true = train_target[test]
     clf_train_disc = BernoulliNB()
     clf_train_disc.fit(train_1, train_true)
     clf_train_conti = GaussianNB()
     clf_train_conti.fit(train_2, train_true)
     result1 = clf_train_disc.predict_proba(test_1)
     result2 = clf_train_conti.predict_proba(test_2)
     result_arr = np.zeros(len(test), dtype=int)
     for index in range(len(test)):
         result_a = result1[index, 0] * result2[index, 0]
         result_b = result1[index, 1] * result2[index, 1]
         if (result_a < result_b): result_arr[index] = 1
         else: result_arr[index] = 0
     clf_sum += f1_score(test_true, result_arr)
     if (k_value == 50):
         t_test_value[i, 0] = f1_score(test_true, result_arr)
     #logistic
     lr_data = np.column_stack((train_conti_data, train_disc_data))
testdf = testdf.dropna()
testLabel = testLabel.dropna()
testLabel = testLabel.apply(int)

try:
    svmModel = svmAlg.fit(trainDf, trainLabel)
    svmpred = svmModel.predict(testdf)
    svmAcc = accuracy_score(testLabel, svmpred)
    print('SVM Accuracy : ' + str(svmAcc))

    gnbModel = gnb.fit(trainDf, trainLabel)
    gnbpred = gnbModel.predict(testdf)
    gnbAcc = accuracy_score(testLabel, gnbpred)
    print('GNB Accuracy : ' + str(gnbAcc))

    bnbModel = bnb.fit(trainDf, trainLabel)
    bnbpred = bnbModel.predict(testdf)
    bnbAcc = accuracy_score(testLabel, bnbpred)
    print('BNB Accuracy : ' + str(bnbAcc))

    treeModel = tree.fit(trainDf, trainLabel)
    treepred = treeModel.predict(testdf)
    treeAcc = accuracy_score(testLabel, treepred)
    print('Decision Tree Accuracy : ' + str(treeAcc))

    rndModel = rnd.fit(trainDf, trainLabel)
    rndpred = rndModel.predict(testdf)
    rndAcc = accuracy_score(testLabel, rndpred)
    print('Random Forest Accuracy : ' + str(rndAcc))
except Exception as e:
    print('model error ' + str(e))
Example #44
#############################SUPPORT VECTOR MACHINES###########################

from sklearn.svm import SVC

svc = SVC(verbose=True, random_state=0)

svc.fit(X_train, y_train)

############################### Naive Bayes ##################################

from sklearn.naive_bayes import BernoulliNB

BernNB = BernoulliNB(binarize=True)

BernNB.fit(X_train, y_train)

#### 3 SINGLE LAYER NEURAL NETWORK - PERCEPTRON ###############################

X_ann = X.copy()
X_train_ann = np.array(X_train.copy())
X_test_ann = np.array(X_test.copy())
y_train_ann = np.array(y_train.copy())


class NeuralNetwork():
    def __init__(self):
        np.random.seed(4)
        self.synaptic_weights = 2 * np.random.random((12, 1)) - 1

    def sigmoid(self, x):
Example #45
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Train KNeighborsClassifier Model
KNN_Classifier = KNeighborsClassifier(n_jobs=-1)
KNN_Classifier.fit(X_train, Y_train)

# Train LogisticRegression Model
LGR_Classifier = LogisticRegression(n_jobs=-1, random_state=0)
LGR_Classifier.fit(X_train, Y_train)

# Train Bernoulli Naive Bayes Model
BNB_Classifier = BernoulliNB()
BNB_Classifier.fit(X_train, Y_train)

# Train Decision Tree Model
DTC_Classifier = tree.DecisionTreeClassifier(criterion='entropy',
                                             random_state=0)
DTC_Classifier.fit(X_train, Y_train)
#Evaluate Models
from sklearn import metrics

models = []
models.append(('Naive Bayes Classifier', BNB_Classifier))
models.append(('Decision Tree Classifier', DTC_Classifier))
models.append(('KNeighborsClassifier', KNN_Classifier))
models.append(('LogisticRegression', LGR_Classifier))

for i, v in models:
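    # (The loop body is cut off in the source; a plausible completion using
    # the cross_val_score imported above:)
    scores = cross_val_score(v, X_train, Y_train, cv=10)
    print(i, 'mean cross-validated accuracy:', scores.mean())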
Example #46
X_train, X_test, y_train, y_test = train_test_split(
    text_data, Y, test_size=0.25, shuffle=False)

count = CountVectorizer(preprocessor=myPreprocessor,
                        lowercase=False, tokenizer=myTokenizer, max_features=size)

X_train = count.fit_transform(X_train).toarray()
print("----------Train vector------------", len(X_train))
print(X_train)
X_test = count.transform(X_test).toarray()
print("----------Test vector------------", len(X_test))
print(X_test)

start_time = time.time()
clf = BernoulliNB()
model = clf.fit(X_train, y_train)
training_time = (time.time() - start_time)

# print(y_test, y_pred)
# print(model.predict_proba(X_test))
# print(precision_score(y_test, y_pred, average='micro'))
# print(recall_score(y_test, y_pred, average='micro'))
# print(f1_score(y_test, y_pred, average='micro'))
# print(f1_score(y_test, y_pred, average='macro'))

y_pred = model.predict(X_test)
# print(classification_report(y_test, y_pred))
# print('Accuracy score:', accuracy_score(y_test, y_pred))
testtime = time.time() - start_time
test_report = classification_report(y_test, y_pred, output_dict=True)
'''
    Model construction
'''

# Use only day-of-week and police district as input features for the classifier
features = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'BAYVIEW', 'CENTRAL',
            'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']

# Split into a training set (3/5) and a validation set (2/5)
training, validation = train_test_split(trainData, train_size=.60)

# Fit a Naive Bayes model and compute its log loss
model = BernoulliNB()
nbStart = time.time()
model.fit(training[features], training['crime'])
nbCostTime = time.time() - nbStart
predicted = np.array(model.predict_proba(validation[features]))
print("Naive Bayes training took %f seconds" % (nbCostTime))
# Naive Bayes training took 0.591072 seconds
print("Naive Bayes log loss: %f" % (log_loss(validation['crime'], predicted)))
# Naive Bayes log loss: 2.615596

# Fit a logistic regression model and compute its log loss
model = LogisticRegression(C=.01)
lrStart = time.time()
model.fit(training[features], training['crime'])
lrCostTime = time.time() - lrStart
predicted = np.array(model.predict_proba(validation[features]))
log_loss(validation['crime'], predicted)
print("Logistic regression training took %f seconds" % (lrCostTime))
Example #48
plt.title('ROC curve for SVM Fraud Classification')
plt.xlabel('False Positive Rate (1-Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.grid(True)
plt.show()

#END SVM MODEL

#START NAIVE BAYES MODEL

#Creating the train and test populations, 33% in the testing data set, for Naive Bayes and Decision Tree
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, Y1, test_size = .33, random_state = 17)

#NB1 BernoulliNB
BernNB = BernoulliNB(binarize = 0.025) # use either 0.025 0.1 or True
BernNB.fit(X1_train, Y1_train)
print(BernNB)

Y1_expect = Y1_test
Y1_pred = BernNB.predict(X1_test)
print(accuracy_score(Y1_expect, Y1_pred))

#BernNB Evalutation
#Confusion Matrix
confusion_matrix(Y1_expect, Y1_pred)
#AUCROC Curve
fpr, tpr, thresholds = metrics.roc_curve(Y1_expect, Y1_pred)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve for Bernoulli Naive Bayes Fraud Classification')
Example #49
# Setup data structures to hold train, test results
train_jll = np.zeros((10, 15))
test_jll = np.zeros((10, 15))

for i in range(0, 10):
    idx = 0
    # Split datasets
    x_train, x_test, y_train, y_test = train_test_split(Xs[i],
                                                        ys[i],
                                                        test_size=1. / 3,
                                                        random_state=7000)
    for j in alphas:
        # 1. Create new Bernoulli Naive Bayes model using alpha value
        mod = BernoulliNB(alpha=j)
        # Fit the model to the training set
        mod.fit(x_train, y_train)
        # Compute the joint log likelihood for the training set, store it train_jll 2d array
        total_res = mod._joint_log_likelihood(x_train)
        y_train_binary = y_train * 1
        entry_val = 0
        # Sum-up by matching true labels
        for k in range(0, len(y_train)):
            entry_val += total_res[k][y_train_binary[k]]
        # Store result
        train_jll[i][idx] = entry_val
        # 2. Compute the joint log likelihood for the testing set, store it test_jll 2d array
        total_res = mod._joint_log_likelihood(x_test)
        y_test_binary = y_test * 1
        entry_val = 0
        # Sum-up by matching true labels
        for k in range(0, len(y_test)):
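            # (The source is cut off here; a plausible completion mirroring
            # the training-set block above:)
            entry_val += total_res[k][y_test_binary[k]]
        # Store result and advance to the next alpha slot
        test_jll[i][idx] = entry_val
        idx += 1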
Example #50
le.fit(dataset["Sex"])
dataset["Sex"] = le.transform(dataset["Sex"])

#assigning DV to y and IDV to x
y = dataset["Pclass"]
X = dataset[["Survived", "Sex", "Age", "SibSp", "Parch", "Fare"]]

print(y.count())

#training the model
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

#applying naive bayes algorithm
from sklearn.naive_bayes import BernoulliNB

clf = BernoulliNB()

#prediction
y_pred = clf.fit(X_train, y_train).predict(X_test)

#accuracy score
print("The accuracy score is : ", accuracy_score(y_test,
                                                 y_pred,
                                                 normalize=True))

#confusion matrix
print("The confusion matrix is: \n", confusion_matrix(y_test, y_pred))
classifier = SVC()
classifier2 = DecisionTreeClassifier()
classifier3 = BernoulliNB()
classifier4 = GaussianNB()

# shape,size,color
# spherical,oval,long
# small,medium,large

train_x = [[0, 1, 0], [0, 2, 1], [1, 2, 2], [0, 1, 2], [2, 1, 2], [0, 0, 1],
           [0, 0, 1], [0, 0, 1], [0, 0, 1], [0, 1, 0], [0, 1, 0], [0, 2, 1]]

train_y = [0, 1, 2, 3, 4, 5, 5, 5, 5, 0, 0, 1]

test_x = [[1, 2, 2], [0, 1, 0]]
# test_y = [2,5]
classifier.fit(train_x, train_y)
classifier2.fit(train_x, train_y)
classifier3.fit(train_x, train_y)
classifier4.fit(train_x, train_y)
prediction = classifier.predict(test_x)
prediction2 = classifier2.predict(test_x)
prediction3 = classifier3.predict(test_x)
prediction4 = classifier4.predict(test_x)

print("prediction  :", prediction)
print("prediction2 :", prediction2)
print("prediction3 :", prediction3)
print("prediction4 :", prediction4)
def main():
    #start timer
    start = time.time()
    #import data from training file and test file
    sparse_matrix, ranks = importData("train_drugs.dat", True, 0)
    test_data = importData("test.dat", False, sparse_matrix.shape[1])

    #run dimensionality reduction on input data
    selector_tsvd = None
    sparse_matrix_tsvd = []
    if (path.exists("./pickles/selector_tsvd.p")
            and path.exists("./pickles/sparse_matrix_tsvd.p")):
        selector_tsvd = pk.load(open("./pickles/selector_tsvd.p", "rb"))
        sparse_matrix_tsvd = pk.load(
            open("./pickles/sparse_matrix_tsvd.p", "rb"))
    else:
        svd = TruncatedSVD(n_components=200, n_iter=7, random_state=42)
        selector_tsvd = svd.fit(sparse_matrix, ranks)
        sparse_matrix_tsvd = selector_tsvd.transform(sparse_matrix)
        pk.dump(selector_tsvd, open("./pickles/selector_tsvd.p", "wb"))
        pk.dump(sparse_matrix_tsvd, open("./pickles/sparse_matrix_tsvd.p",
                                         "wb"))

    #run feature selection on the data to remove unimportant features
    #recursive feature selection to remove most of the features that are least important
    selector_rfe = None
    sparse_matrix_rfe = []
    if (path.exists("./pickles/selector_rfe.p")
            and path.exists("./pickles/sparse_matrix_rfe.p")):
        selector_rfe = pk.load(open("./pickles/selector_rfe.p", "rb"))
        sparse_matrix_rfe = pk.load(open("./pickles/sparse_matrix_rfe.p",
                                         "rb"))
    else:
        selector_rfe, sparse_matrix_rfe = rfeFeatureSelection(
            sparse_matrix, ranks)
        pk.dump(selector_rfe, open("./pickles/selector_rfe.p", "wb"))
        pk.dump(sparse_matrix_rfe, open("./pickles/sparse_matrix_rfe.p", "wb"))

    #recursive feature selection with cross-validation to choose the best of the remaining features
    selector_rfecv = None
    sparse_matrix_rfecv = []
    if (path.exists("./pickles/selector_rfecv.p")
            and path.exists("./pickles/sparse_matrix_rfecv.p")):
        selector_rfecv = pk.load(open("./pickles/selector_rfecv.p", "rb"))
        sparse_matrix_rfecv = pk.load(
            open("./pickles/sparse_matrix_rfecv.p", "rb"))
    else:
        selector_rfecv, sparse_matrix_rfecv = rfecvFeatureSelection(
            sparse_matrix_rfe, ranks)
        pk.dump(selector_rfecv, open("./pickles/selector_rfecv.p", "wb"))
        pk.dump(sparse_matrix_rfecv,
                open("./pickles/sparse_matrix_rfecv.p", "wb"))

    #run chi^2 selection on original data to see how accurate it is
    sparse_matrix_chi = []
    selector_chi = None
    if (path.exists("sparse_matrix_chi.p")):
        sparse_matrix_chi = pickle.load(
            open("./pickles/sparse_matrix_chi.p", "rb"))
        selector_chi = pickle.load(open("/pickles/selector_chi.p", "rb"))
    else:
        selector_chi, sparse_matrix_chi = chiSquareSelection(
            sparse_matrix, ranks)
        pk.dump(sparse_matrix_chi, open("./pickles/sparse_matrix_chi.p", "wb"))
        pk.dump(selector_chi, open("./pickles/selector_chi.p", "wb"))

    #account for imbalanced data with SMOTE over sampling
    Orig_X_resampled, Orig_y_resampled = SMOTE().fit_resample(
        sparse_matrix.todense(), ranks)

    TSVD_X_resampled, TSVD_y_resampled = SMOTE().fit_resample(
        sparse_matrix_tsvd, ranks)

    rfe_X_resampled, rfe_y_resampled = SMOTE().fit_resample(
        sparse_matrix_rfe, ranks)

    rfecv_X_resampled, rfecv_y_resampled = SMOTE().fit_resample(
        sparse_matrix_rfecv, ranks)

    chi_X_resampled, chi_y_resampled = SMOTE().fit_resample(
        sparse_matrix_chi, ranks)

    #set up classifiers, train on data
    #Bernoulli naive bayes
    nb_orig = BernoulliNB()
    nb_orig.fit(sparse_matrix, ranks)

    nb_orig_resampled = BernoulliNB()
    nb_orig_resampled.fit(Orig_X_resampled, Orig_y_resampled)

    nb_tsvd = BernoulliNB()
    nb_tsvd.fit(TSVD_X_resampled, TSVD_y_resampled)

    nb_tsvd_non_sampled = BernoulliNB()
    nb_tsvd_non_sampled.fit(sparse_matrix_tsvd, ranks)

    nb_rfec = BernoulliNB()
    nb_rfec.fit(selector_rfe.transform(sparse_matrix), ranks)

    nb_rfecv = BernoulliNB()
    nb_rfecv.fit(rfecv_X_resampled, rfecv_y_resampled)

    nb_rfecv_non_sampled = BernoulliNB()
    nb_rfecv_non_sampled.fit(sparse_matrix_rfecv, ranks)

    nb_chi = BernoulliNB()
    nb_chi.fit(chi_X_resampled, chi_y_resampled)

    #decision tree classifier
    dt_rfecv_resampled = DecisionTreeClassifier(random_state=0)
    dt_rfecv_resampled.fit(rfecv_X_resampled, rfecv_y_resampled)

    dt_rfecv = DecisionTreeClassifier(random_state=0)
    dt_rfecv.fit(sparse_matrix_rfecv, ranks)

    dt_orig = DecisionTreeClassifier(random_state=0)
    dt_orig.fit(sparse_matrix, ranks)

    dt_orig_resampled = DecisionTreeClassifier(random_state=0)
    dt_orig_resampled.fit(Orig_X_resampled, Orig_y_resampled)

    dt_tsvd = DecisionTreeClassifier(random_state=0)
    dt_tsvd.fit(sparse_matrix_tsvd, ranks)

    dt_tsvd_resampled = DecisionTreeClassifier(random_state=0)
    dt_tsvd_resampled.fit(TSVD_X_resampled, TSVD_y_resampled)

    dt_chi = DecisionTreeClassifier(random_state=0)
    dt_chi.fit(chi_X_resampled, chi_y_resampled)

    #run test predictions
    #run naive bayes predictions
    orig_pred = nb_orig.predict(sparse_matrix)
    orig_pred_resamp = nb_orig_resampled.predict(sparse_matrix)
    tsvd_pred = nb_tsvd.predict(selector_tsvd.transform(sparse_matrix))
    tsvd_non_sampled_pred = nb_tsvd_non_sampled.predict(
        selector_tsvd.transform(sparse_matrix))
    rfe_pred = nb_rfec.predict(selector_rfe.transform(sparse_matrix))
    rfecv_pred = nb_rfecv_non_sampled.predict(
        selector_rfecv.transform(selector_rfe.transform(sparse_matrix)))
    rfecv_pred_non_sampeld = nb_rfecv.predict(
        selector_rfecv.transform(selector_rfe.transform(sparse_matrix)))
    chi_pred = nb_chi.predict(selector_chi.transform(sparse_matrix))

    #run decision tree predictions
    dt_rfecv_resampled_pred = dt_rfecv_resampled.predict(
        selector_rfecv.transform(selector_rfe.transform(sparse_matrix)))
    dt_rfecv_pred = dt_rfecv.predict(
        selector_rfecv.transform(selector_rfe.transform(sparse_matrix)))
    dt_orig_pred = dt_orig.predict(sparse_matrix)
    dt_orig_resampled_pred = dt_orig_resampled.predict(sparse_matrix)
    dt_tsvd_pred = dt_tsvd.predict(selector_tsvd.transform(sparse_matrix))
    dt_tsvd_resampled_pred = dt_tsvd_resampled.predict(
        selector_tsvd.transform(sparse_matrix))
    dt_chi_pred = dt_chi.predict(selector_chi.transform(sparse_matrix))

    #test the output f1 score
    #test for naive bayes
    orig_f1 = f1_score(ranks, orig_pred, average='macro')
    orig_f1_resampled = f1_score(ranks, orig_pred_resamp, average='macro')
    tsvd_f1 = f1_score(ranks, tsvd_pred, average='macro')
    tvsd_resampled_f1 = f1_score(ranks, tsvd_non_sampled_pred, average='macro')
    rfe_f1 = f1_score(ranks, rfe_pred, average='macro')
    rfecv_f1 = f1_score(ranks, rfecv_pred, average='macro')
    rfecv_reasmple_f1_non_sampled = f1_score(ranks,
                                             rfecv_pred_non_sampeld,
                                             average='macro')
    chi_f1 = f1_score(ranks, chi_pred, average='macro')

    #test for decision trees
    dt_rfec_resampled_f1 = f1_score(ranks,
                                    dt_rfecv_resampled_pred,
                                    average='macro')
    dt_rfecv_f1 = f1_score(ranks, dt_rfecv_pred, average='macro')
    dt_orig_f1 = f1_score(ranks, dt_orig_pred, average='macro')
    dt_orig_resampled_f1 = f1_score(ranks,
                                    dt_orig_resampled_pred,
                                    average='macro')
    dt_tsvd_f1 = f1_score(ranks, dt_tsvd_pred, average='macro')
    dt_tsvd_resampled_f1 = f1_score(ranks,
                                    dt_tsvd_resampled_pred,
                                    average='macro')
    dt_chi_f1 = f1_score(ranks, dt_chi_pred, average='macro')

    #output the different test results
    print('orig:', orig_f1, 'orig_resampled:', orig_f1_resampled, 'tsvd:',
          tsvd_f1, 'tvsd_resampled_f1:', tvsd_resampled_f1, 'rfe_f1:', rfe_f1,
          'rfecv_f1:', rfecv_f1, 'rfecv_reasmple_f1_non_sampled:',
          rfecv_reasmple_f1_non_sampled, 'chi_f1:', chi_f1)
    print('dt_rfec_resampled_f1:', dt_rfec_resampled_f1, 'dt_rfecv_f1:',
          dt_rfecv_f1, 'dt_orig_f1:', dt_orig_f1, 'dt_orig_resampled_f1:',
          dt_orig_resampled_f1, 'dt_tsvd_f1:', dt_tsvd_f1,
          'dt_tsvd_resampled_f1:', dt_tsvd_resampled_f1)

    #test with testfile using best classifier
    transformed_data = selector_rfe.transform(test_data)
    test_predict = nb_chi.predict(selector_chi.transform(test_data))
    with open('test_file_prediction.dat', "w") as fp2:
        for num in test_predict:
            fp2.write(str(num) + '\n')
    print(len(test_predict))

    #print out time elapsed
    end = time.time()
    print(end - start)
df_stack.to_csv('feature/tfidf_ridge_1_3_error_single_classfiy.csv',
                index=None,
                encoding='utf8')
print('ridge features saved\n')

########################### bnb(BernoulliNB) ################################
print('BernoulliNB stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0

# note: uses the modern StratifiedKFold API (n_splits + .split); shuffle=True
# is required for random_state to take effect
for i, (tr, va) in enumerate(
        StratifiedKFold(n_splits=n_folds, shuffle=True,
                        random_state=1017).split(train_feature, score)):
    print('stack:%d/%d' % ((i + 1), n_folds))
    bnb = BernoulliNB()
    bnb.fit(train_feature[tr], score[tr])
    score_va = bnb.predict_proba(train_feature[va])
    score_te = bnb.predict_proba(test_feature)
    print(score_va)
    print('score: ' +
          str(mean_squared_error(score[va], bnb.predict(train_feature[va]))))
    stack_train[va] += score_va  # out-of-fold probabilities for the train rows
    stack_test += score_te       # accumulate test probabilities across folds
stack_test /= n_folds  # average the per-fold test probabilities
stack = np.vstack([stack_train, stack_test])  # train rows stacked above test rows
df_stack = pd.DataFrame()
for i in range(stack.shape[1]):
    df_stack['tfidf_bnb_classfiy_{}'.format(i)] = np.around(stack[:, i], 6)
df_stack.to_csv('feature/tfidf_bnb_1_3_error_single_classfiy.csv',
                index=None,
                encoding='utf8')
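
# A hypothetical consumption step (not in the snippet above): the saved
# out-of-fold/averaged BernoulliNB probabilities are typically merged back and
# fed to a second-level model. y_train, the labels for the len(train) stacked
# training rows, is assumed to exist here.
import pandas as pd
from sklearn.linear_model import LogisticRegression

feats = pd.read_csv('feature/tfidf_bnb_1_3_error_single_classfiy.csv')
level2 = LogisticRegression(max_iter=1000)
level2.fit(feats.iloc[:len(y_train)], y_train)           # out-of-fold train rows
test_pred = level2.predict(feats.iloc[len(y_train):])    # averaged test rows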
Exemplo n.º 54
0
clf_prob = LogisticRegression(C=1,
                              class_weight='balanced',
                              dual=False,
                              fit_intercept=True,
                              intercept_scaling=0.2,
                              max_iter=100,
                              multi_class='ovr',
                              n_jobs=1,
                              penalty='l2',
                              random_state=None,
                              solver='liblinear',
                              tol=0.0001,
                              verbose=0,
                              warm_start=False)

clf_text.fit(trn_text_bow, trn_text_classes_bow)
clf_description.fit(trn_description_bow, trn_description_classes_bow)
trn_prob, trn_prob_class = prepare_combined_model(clf_text, clf_description,
                                                  vectorizer_text,
                                                  vectorizer_description,
                                                  test_set)
clf_prob.fit(trn_prob, trn_prob_class)
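
# prepare_combined_model is not shown in this snippet; purely as illustration
# (names and feature layout below are guesses, not the original helper),
# probability stacking usually concatenates each base model's predict_proba
# output per sample:
import numpy as np

def stack_probabilities(clf_a, clf_b, vec_a, vec_b, texts_a, texts_b):
    pa = clf_a.predict_proba(vec_a.transform(texts_a))
    pb = clf_b.predict_proba(vec_b.transform(texts_b))
    return np.hstack([pa, pb])  # one row of combined class probabilities per sample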

# ###################### Add genders for each user #######################
print 'Predicting...'
user = defaultdict(list)
image_url = {}
for id in database:
    document = database[str(id)]
    if document['gender'] is None and len(user[document['user']['id']]) < 5:
        if document['user']['profile_image_url'] is not None and document['user'][
Exemplo n.º 55
0
    after_stem_words = []
    for w in new_words:
        after_stem_words.append(ps.stem(w))
    clean_msg = ' '.join(after_stem_words)
    return clean_msg


df['msg'] = df.msg.apply(clean_text)

print('data cleaned...')

X = cv.fit_transform(df.msg).toarray()
new_X = pca.fit_transform(X)
y = df.iloc[:, 0].values
print('going for training...')
log.fit(new_X, y)
print('model trained....')
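
# A hedged sketch (not in the original) of the inference path a GUI callback
# below would run, reusing the fitted cv, pca, and log objects and the
# clean_text helper from above; predict_message is a hypothetical name.
def predict_message(raw_msg):
    cleaned = clean_text(raw_msg)            # mirror the training-time cleaning
    vec = cv.transform([cleaned]).toarray()
    reduced = pca.transform(vec)             # transform, not fit_transform, at inference
    return log.predict(reduced)[0]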

root = Tk()
root.state('zoomed')
root.configure(background='yellow')
l1 = Label(root,
           text='Spam Detection',
           bg='yellow',
           fg='blue',
           font=('', 40, 'bold'))
l1.place(x=190, y=20)

l2 = Label(root,
           text='Enter msg:',
           bg='yellow',
Exemplo n.º 56
0
test_numbers = cv.transform(new_test_data).toarray()
print(test_numbers)

# # Multinomial Naive Bayes :

# In[10]:

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

# In[11]:

mnb = MultinomialNB()
mnb.fit(numbers, y)

# In[12]:

mnb.predict(test_numbers)

# # Bernoulli Naive Bayes :

# In[13]:

bnb = BernoulliNB()
bnb.fit(numbers, y)

# In[14]:

bnb.predict(test_numbers)

# In[ ]:
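
# In[15]:  (illustrative cell, not in the original notebook)

# Comparing the two models would need held-out labels; y_test below is a
# hypothetical array of true labels for new_test_data, if available.
from sklearn.metrics import accuracy_score

print('MultinomialNB:', accuracy_score(y_test, mnb.predict(test_numbers)))
print('BernoulliNB:  ', accuracy_score(y_test, bnb.predict(test_numbers)))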
Exemplo n.º 57
0
import numpy as np
from sklearn.naive_bayes import BernoulliNB


def bernoulliNB():
    # toy example: 6 samples of 100 binary features, labels from 5 classes
    X = np.random.randint(2, size=(6, 100))
    Y = np.array([1, 2, 3, 4, 4, 5])
    clf = BernoulliNB()
    clf.fit(X, Y)
    print(clf.predict(X[2:3]))
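
# A minimal companion sketch (not in the original) inspecting per-class
# probabilities instead of hard labels on the same kind of toy data.
import numpy as np
from sklearn.naive_bayes import BernoulliNB

rng = np.random.RandomState(0)
X = rng.randint(2, size=(6, 100))
Y = np.array([1, 2, 3, 4, 4, 5])
clf = BernoulliNB().fit(X, Y)
print(clf.predict_proba(X[2:3]))  # one row, one probability column per class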
Exemplo n.º 58
0
class NBClassifier(object):
    def __init__(self,
                 decompose_func=None,
                 preprocessor=None,
                 nbits=15,
                 seed=1):
        self.decompose_func = decompose_func
        self.nbits = nbits
        feature_size, bitmask = set_feature_size(nbits=nbits)
        self.feature_size = feature_size
        self.bitmask = bitmask
        self.encoding_func = make_encoder(decompose_func,
                                          preprocessors=preprocessor,
                                          bitmask=self.bitmask,
                                          seed=seed)
        self.classifier = BernoulliNB(alpha=0.1,
                                      binarize=None,
                                      fit_prior=True,
                                      class_prior=None)

    def fit(self, graphs, targets):
        data_mtx = vectorize_graphs(graphs,
                                    encoding_func=self.encoding_func,
                                    feature_size=self.feature_size)
        # binarize
        data_mtx.data = np.where(data_mtx.data > 0, 1, 0)
        self.classifier.fit(data_mtx, targets)
        return self

    def decision_function(self, graphs):
        # return the predicted probability of the positive class (column 1)
        data_mtx = vectorize_graphs(graphs,
                                    encoding_func=self.encoding_func,
                                    feature_size=self.feature_size)
        # binarize
        data_mtx.data = np.where(data_mtx.data > 0, 1, 0)
        preds = self.classifier.predict_proba(data_mtx)
        # assuming binary classification and column 1 to represent positives
        preds = preds[:, 1].reshape(-1)
        return preds

    def predict(self, graphs):
        data_mtx = vectorize_graphs(graphs,
                                    encoding_func=self.encoding_func,
                                    feature_size=self.feature_size)
        # binarize
        data_mtx.data = np.where(data_mtx.data > 0, 1, 0)
        preds = self.classifier.predict(data_mtx)
        return preds

    def explain(self, graphs, top_k):
        feature_dict, feature_counts = get_feature_dict(
            graphs,
            decomposition_funcs=self.decompose_func,
            nbits=self.nbits,
            return_counts=True)
        # compute log-odds: the difference (not ratio) of class-conditional
        # log probabilities; larger values mean stronger association with class 1
        scores = (self.classifier.feature_log_prob_[1, :] -
                  self.classifier.feature_log_prob_[0, :])
        ranked_pos_features = np.argsort(-scores)
        # signature-counts
        stats = [(feature_dict[id].graph['signature'], feature_counts[id])
                 for id in feature_dict]
        # aggregate counts according to same signature
        sig_dict = dict()
        for sig, c in stats:
            if sig in sig_dict:
                sig_dict[sig] += c
            else:
                sig_dict[sig] = c
        # take logs
        for id in sig_dict:
            sig_dict[id] = math.log(sig_dict[id])
        # select top_k
        feature_graphs = [
            feature_dict[fid] for fid in ranked_pos_features[:top_k]
        ]
        c = Counter([g.graph['signature'] for g in feature_graphs])
        cnt = dict([(id, c[id] / sig_dict[id]) for id in c])
        tot = sum(cnt[id] for id in cnt)
        res = [
            (cnt[id] / tot, cnt[id], id)
            for id in sorted(cnt.keys(), key=lambda id: cnt[id], reverse=True)
        ]
        return res
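
# The ranking inside explain() reduces to sorting features by Bernoulli
# log-odds. A self-contained sketch of that core step on synthetic binary
# data (all names below are illustrative, not from the class above):
import numpy as np
from sklearn.naive_bayes import BernoulliNB

rng = np.random.RandomState(1)
X = rng.randint(2, size=(200, 30))
y = (X[:, 0] | X[:, 1]).astype(int)  # class 1 driven by the first two features

clf = BernoulliNB(alpha=0.1, binarize=None).fit(X, y)
log_odds = clf.feature_log_prob_[1, :] - clf.feature_log_prob_[0, :]
print(np.argsort(-log_odds)[:5])  # features most indicative of class 1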
Exemplo n.º 59
0
                                                    # near-zero hold-out: the scores below approximate training accuracy
                                                    test_size=0.000001,
                                                    random_state=0)
print('Training SVC: ')
clf = svm.SVC()
clf.fit(X_train, y_train)
print("SVC Accuracy Test: ", clf.score(X_test, y_test))
########################
print('Training GNB: ')
gnb = GaussianNB()
gnb.fit(X_train, y_train)
print("GNB Accuracy Test: ", gnb.score(X_test, y_test))
print(gnb.predict_proba(X_test[0].reshape(1, -1)))
########################
print('Training BNB: ')
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
print("BNB Accuracy Test: ", bnb.score(X_test, y_test))
#######################
'''
print('Training kNN: ')
test_result=list()
for jj in range (1,200,20):
	neigh = KNeighborsClassifier(n_neighbors=jj,p=1)
	neigh.fit(X_train,y_train)
	print("kNN Accuracy Test for",jj," neighbors: ",neigh.score(X_test,y_test))
	test_result.append(neigh.score(X_test,y_test))

plt.plot(test_result)
plt.ylabel('some numbers')
plt.show()'''
'''
Exemplo n.º 60
0
            randForrC.fit(trainX, yTrain)  # classifier variant
            tmpSCR = randForrC.score(testX, yTest)
            scores['randForr'][label].append(tmpSCR)
        else:
            randForrR.fit(trainX, yTrain)  # regressor variant
            tmpSCR = randForrR.score(testX, yTest)
            scores['randForr'][label].append(tmpSCR)

        # print("start adaBoost")
        # adaBoostC.fit(trainX, yTrain)
        # tmpSCR = adaBoostC.score(testX, yTest)
        # scores['adaBoost'][label].append(tmpSCR)

        print("start bernoulli NB")
        if cnt < 2:
            bernNB.fit(trainX, yTrain)
            tmpSCR = bernNB.score(testX, yTest)
            scores['bernNB'][label].append(tmpSCR)
        else:
            gausRidge.fit(trainX, yTrain)
            tmpSCR = gausRidge.score(testX, yTest)
            scores['bernNB'][label].append(tmpSCR)

        # print("start gradient boost")
        # gradBoostC.fit(trainX, yTrain)
        # tmpSCR = gradBoostC.score(trainX, yTest)
        # scores['gradBoost'][label].append(tmpSCR)

        print("start SVM")
        if cnt < 2:
            svmC.fit(trainX, yTrain)