Exemplo n.º 1
0
def bernoulli_naive_bayes(x_train, y_train, x_cv, y_cv):
    """Fit a Bernoulli Naive Bayes classifier and report its accuracy.

    Args:
        x_train: Training features passed to ``BernoulliNB.fit``.
        y_train: Training labels.
        x_cv: Held-out (cross-validation) features.
        y_cv: Held-out labels.

    Returns:
        The fitted ``BernoulliNB`` instance.
    """
    # Single-argument print(...) emits the same text on Python 2 and 3,
    # whereas the former print statements were a SyntaxError on Python 3.
    print('Training with NB...')
    model = BernoulliNB()
    model.fit(x_train, y_train)

    train_accuracy = model.score(x_train, y_train)
    print('Accuracy in training set: %f' % train_accuracy)
    cv_accuracy = model.score(x_cv, y_cv)
    print('Accuracy in cv set: %f' % cv_accuracy)
    return model
Exemplo n.º 2
0
def classify(opts, data_train, data_test, labels_train, labels_test):
    """Train the estimator chosen by ``opts.classifier`` and evaluate it.

    Args:
        opts: Options object. ``opts.classifier`` selects the estimator:
            ``'nb'`` (BernoulliNB), ``'lr'`` (LinearRegression), ``'log'``
            (LogisticRegression) or ``'svm'`` (LinearSVC). ``opts.verbose``
            enables progress output.
        data_train: Training feature matrix.
        data_test: Test feature matrix.
        labels_train: Training labels.
        labels_test: Test labels.

    Returns:
        ``[train_score, test_score]`` as reported by the estimator's
        ``score`` method.

    Raises:
        ValueError: If ``opts.classifier`` is not one of the keys above.
    """
    # ----- Train the model -------------------------------------------
    if opts.classifier == 'nb':
        # binarize=None makes BernoulliNB use the features as given,
        # without thresholding them first.
        classifier = BernoulliNB(binarize=None)
    elif opts.classifier == 'lr':
        classifier = LinearRegression()
    elif opts.classifier == 'log':
        classifier = LogisticRegression()
    elif opts.classifier == 'svm':
        classifier = LinearSVC()
    else:
        # ValueError is still caught by callers using ``except Exception``.
        raise ValueError('Unrecognized classifier!')

    classifier.fit(data_train, labels_train)

    # ----- Validate the model ----------------------------------------
    # The print calls below use one pre-formatted argument so the output is
    # identical on Python 2 and 3 (the doubled space mirrors the separator
    # the Python 2 print statement inserted).
    accuracy = classifier.score(data_train, labels_train)
    if opts.verbose:
        print("accuracy =  %s" % (accuracy,))

    # 10-fold cross-validation with accuracy scoring; skipped for the
    # linear-regression option.
    if opts.classifier != 'lr':
        cross_val_scores = cross_validation.cross_val_score(
            classifier, data_train, labels_train, scoring='accuracy', cv=10)
        if opts.verbose:
            print("cross val mean =  %s" % (cross_val_scores.mean(),))
            print("cross val stdev =  %s" % (cross_val_scores.std(),))

    test_accuracy = classifier.score(data_test, labels_test)
    if opts.verbose:
        print("test accuracy =  %s" % (test_accuracy,))

    # Predict labels for the test set, rounded to hundredths for readability.
    labels_predicted = np.round(classifier.predict(data_test), 2)
    if opts.verbose:
        print("actual labels:\n%s" % (labels_test,))
        print("predicted labels:\n%s" % (labels_predicted,))

    return [accuracy, test_accuracy]
def compareClassifiers():
    """Compare four classifiers on the data from ``createObservations()``.

    Each classifier is fitted on all observations, scored on that same data,
    and then evaluated with 10-fold cross-validation (accuracy scoring).

    Returns:
        A list of ``(training_score, mean_cv_score)`` tuples in this order:
        decision tree, Bernoulli Naive Bayes, linear SVM, logistic regression.
    """
    observations, classes = createObservations()
    observations = np.array(observations)
    classes = np.array(classes)

    # Factories keep construction lazy, so each estimator is created right
    # before it is fitted.
    builders = (
        lambda: tree.DecisionTreeClassifier(),
        lambda: BernoulliNB(binarize=None),
        lambda: LinearSVC(),
        lambda: LogisticRegression(),
    )

    summary = []
    for build in builders:
        estimator = build()
        estimator.fit(observations, classes)
        training_score = estimator.score(observations, classes)
        fold_scores = cross_validation.cross_val_score(
            estimator, observations, classes, scoring='accuracy', cv=10)
        summary.append((training_score, np.mean(fold_scores)))
    return summary
def evaluateSubjectivity(k, tokenizer: Tokenizer, alphas):
    """Score BernoulliNB on the subjectivity corpus for several alphas.

    The first 4000 documents of each class form the training split and the
    remainder the test split.

    Args:
        k: Unused by this function.
        tokenizer: Tokenizer handed to ``CountVectorizer``.
        alphas: Iterable of smoothing values, one classifier per value.

    Returns:
        A list with the test-set score for each alpha, in input order.
    """
    train_size = 4000
    count_vectorizer = CountVectorizer(tokenizer=tokenizer)
    objective_data_stream = stream_subjectivity_documents(PATH_TO_SUBJECTIVITY_DATA_OBJECTIVE, Labels.strong_pos)
    subjective_data_stream = stream_subjectivity_documents(PATH_TO_SUBJECTIVITY_DATA_SUBJECTIVE, Labels.strong_neg)

    X_objective_data, y_obj_labels = zip(*objective_data_stream)
    X_subjective_data, y_subj_labels = zip(*subjective_data_stream)

    # zip(*...) yields tuples, so "+" concatenates objective then subjective.
    train_documents = X_objective_data[:train_size] + X_subjective_data[:train_size]
    train_labels = y_obj_labels[:train_size] + y_subj_labels[:train_size]
    test_documents = X_objective_data[train_size:] + X_subjective_data[train_size:]
    test_labels = y_obj_labels[train_size:] + y_subj_labels[train_size:]

    # get vector counts
    X_train_counts = count_vectorizer.fit_transform(train_documents)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # The test matrix does not depend on alpha, so build it once instead of
    # re-vectorizing the whole test split on every iteration.
    # NOTE(review): training uses TF-IDF weights while scoring uses raw
    # counts; with BernoulliNB's default binarize=0.0 both presumably reduce
    # to presence/absence -- confirm the mismatch is intended.
    X_test_counts = count_vectorizer.transform(test_documents)

    accuracies = []
    for alpha in alphas:
        classifier = BernoulliNB(alpha=alpha)
        classifier.fit(X_train_tfidf, train_labels)
        accuracies.append(classifier.score(X_test_counts, test_labels))
    return accuracies
Exemplo n.º 5
0
def nb():
    """Fit BernoulliNB on the module-level training data and print results.

    Reads the globals ``Xtrain``, ``Ytrain_labels``, ``Xtest`` and
    ``Ytest_labels``. Prints the predictions for training rows 2..299 and the
    score on the test set. Returns nothing.
    """
    from sklearn.naive_bayes import BernoulliNB

    clf = BernoulliNB()
    clf.fit(Xtrain, Ytrain_labels)
    # A stray ``BernoulliNB(alpha=1.0, ...)`` expression used to follow the
    # fit; it only built and discarded an unused estimator, so it is gone.
    print(clf.predict(Xtrain[2:300]))
    print(clf.score(Xtest, Ytest_labels))
Exemplo n.º 6
0
 def _bernoulli_NB(self):
     """Fit BernoulliNB on the train split, print test accuracy, run ``ks``.

     Reads ``self.X_train``, ``self.y_train``, ``self.X_test`` and
     ``self.y_test``. The first column of the predicted probabilities is
     passed to ``ks`` together with the true test labels.
     """
     model = BernoulliNB()
     model.fit(self.X_train, self.y_train)
     accuracy = model.score(self.X_test, self.y_test)
     print('Accuracy rate of Naive Bayes: {0:.3f}'.format(accuracy))
     class_probabilities = model.predict_proba(self.X_test)
     first_class_probability = class_probabilities.T[0]
     ks(first_class_probability, self.y_test)
Exemplo n.º 7
0
def getscores(target, data):
    """Score BernoulliNB on the extreme ends of a continuous target.

    Samples whose target is at or below the 20th smallest value get class 0,
    samples at or above the 20th largest value get class 2, and everything in
    between is dropped. The remaining samples are split 70/30, a BernoulliNB
    model is fitted on the larger part and scored on the smaller one.

    Args:
        target: Array-like of numeric targets exposing ``tolist()``; must hold
            at least 20 values.
        data: Feature rows aligned with ``target`` along axis 0.

    Returns:
        The score of the fitted model on the held-out 30%.
    """
    ordered = sorted(target.tolist())
    neg = ordered[19]    # 20th smallest target value
    pos = ordered[-20]   # 20th largest target value

    values = np.asarray(target)
    # Same precedence as an if/elif chain: the low band wins on overlap.
    t = np.where(values <= neg, 0.0, np.where(values >= pos, 2.0, 1.0))

    # A boolean mask replaces collecting indices for np.delete; it also works
    # when no sample falls in the middle band, where np.delete would receive
    # an empty float index array.
    keep = t != 1
    y = t[keep]
    x = np.asarray(data)[keep]

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    clf = BernoulliNB()
    clf.fit(x_train, y_train)
    return clf.score(x_test, y_test)
Exemplo n.º 8
0
    def render_content(self):
        """Train BernoulliNB on the selected text column and render a summary.

        Returns:
            ``"No text source selected."`` when ``self.text_source`` is None;
            otherwise an HTML fragment with the training-set accuracy (rounded
            to two decimals) and the confusion matrix inside a ``<pre>`` tag.
            Both are computed on the same data the model was trained on.
        """
        if self.text_source is None:
            return "No text source selected."

        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.naive_bayes import BernoulliNB
        from sklearn import metrics

        self.dm("creating vectorizer")
        stop_words = self.get_user_list(self.stop_list)
        vectorizer = CountVectorizer(stop_words=stop_words, max_features=self.vocab_size)
        documents = self.get_column_data(self.text_source)

        self.dm("using vectorizer")
        features = vectorizer.fit_transform(documents)
        labels = self.get_column_data(self.code_source)

        self.dm("creating classifier")
        classifier = BernoulliNB()
        classifier.fit(features, labels)
        accuracy = classifier.score(features, labels)

        self.dm("predicting")
        predicted = classifier.predict(features)
        confusion = metrics.confusion_matrix(labels, predicted)

        self.dm("displaying result")
        parts = ["accuracy is " + str(round(accuracy, 2)), '<pre>', str(confusion), '</pre>']
        return "".join(parts)
def evaluateIMDB(k, tokenizer: Tokenizer, alphas):
    """Score BernoulliNB on the IMDB train/test folders for several alphas.

    Args:
        k: Unused by this function.
        tokenizer: Tokenizer handed to ``CountVectorizer``.
        alphas: Iterable of smoothing values, one classifier per value.

    Returns:
        A list with the test-set score for each alpha, in input order.
    """
    count_vectorizer = CountVectorizer(tokenizer=tokenizer)

    train_pos_path = os.path.join(PATH_TO_IMDB_TRAIN_DATA, POS_LABEL)
    train_neg_path = os.path.join(PATH_TO_IMDB_TRAIN_DATA, NEG_LABEL)

    train_pos_data_stream = stream_documents(Labels.strong_pos, train_pos_path, os.listdir(train_pos_path))
    train_neg_data_stream = stream_documents(Labels.strong_neg, train_neg_path, os.listdir(train_neg_path))

    X_pos_train_data, y_pos_train_labels = zip(*train_pos_data_stream)
    X_neg_train_data, y_neg_train_labels = zip(*train_neg_data_stream)

    test_pos_path = os.path.join(PATH_TO_IMDB_TEST_DATA, POS_LABEL)
    test_neg_path = os.path.join(PATH_TO_IMDB_TEST_DATA, NEG_LABEL)

    test_pos_data_stream = stream_documents(Labels.strong_pos, test_pos_path, os.listdir(test_pos_path))
    test_neg_data_stream = stream_documents(Labels.strong_neg, test_neg_path, os.listdir(test_neg_path))

    X_pos_test_data, y_pos_test_labels = zip(*test_pos_data_stream)
    X_neg_test_data, y_neg_test_labels = zip(*test_neg_data_stream)

    # Training rows are ordered negative-then-positive and test rows
    # positive-then-negative; documents and labels use the same order within
    # each split, so they stay aligned.
    train_documents = X_neg_train_data + X_pos_train_data
    train_labels = y_neg_train_labels + y_pos_train_labels
    test_documents = X_pos_test_data + X_neg_test_data
    test_labels = y_pos_test_labels + y_neg_test_labels

    # get vector counts
    X_train_counts = count_vectorizer.fit_transform(train_documents)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # The test matrix does not depend on alpha, so build it once instead of
    # re-vectorizing the whole test split on every iteration.
    # NOTE(review): training uses TF-IDF weights while scoring uses raw
    # counts; with BernoulliNB's default binarize=0.0 both presumably reduce
    # to presence/absence -- confirm the mismatch is intended.
    X_test_counts = count_vectorizer.transform(test_documents)

    accuracies = []
    for alpha in alphas:
        classifier = BernoulliNB(alpha=alpha)
        classifier.fit(X_train_tfidf, train_labels)
        accuracies.append(classifier.score(X_test_counts, test_labels))
    return accuracies
Exemplo n.º 10
0
def test_BernoulliNB():
    """Fit and score one BernoulliNB per ``ptype`` in 1..7 and print scores.

    For every ``ptype`` the train/test frames and the feature column list
    come from ``get_dataframe(ptype)``; the label column is ``'tag'``. The
    same estimator object is re-fitted on each iteration.
    """
    model = BernoulliNB()
    for ptype in range(1, 8):
        train_set, test_set, fea_lst = get_dataframe(ptype)
        model.fit(train_set[fea_lst], train_set['tag'])
        held_out_score = model.score(test_set[fea_lst], test_set['tag'])
        print('ptype:', ptype, ' score:', held_out_score)
Exemplo n.º 11
0
def train():
    """Compare four classifiers on bag-of-words features and print scores.

    Data comes from ``preprocessing()`` and is split by ``split_data``. For
    each of MultinomialNB, BernoulliNB, LogisticRegression and ``svm.SVC``,
    two lines are printed: the score on the held-out split, and the accuracy
    over the complete data set (which includes the training rows).
    """
    data, labels = preprocessing()
    train_data, test_data, y_train, y_test = split_data(data, labels)

    vectorizer = CountVectorizer(max_df=0.5, min_df=1, stop_words=None)
    X_train = vectorizer.fit_transform(train_data)
    X_test = vectorizer.transform(test_data)
    X = vectorizer.transform(data)

    # (factory, held-out message, full-data message); the factories keep
    # construction lazy so each model is created right before it is fitted.
    experiments = (
        (lambda: MultinomialNB(),
         "多项式贝叶斯分类器交叉验证得分: ", "多项式贝叶斯分类器准确率: "),
        (lambda: BernoulliNB(),
         "伯努利贝叶斯分类器交叉验证得分: ", "伯努利贝叶斯分类器准确率: "),
        (lambda: LogisticRegression(),
         "罗吉斯回归分类器交叉验证得分: ", "罗吉斯回归二分类器准确率: "),
        (lambda: svm.SVC(),
         "支持向量机分类器交叉验证得分: ", "支持向量机二分类器准确率: "),
    )
    for build, held_out_message, full_data_message in experiments:
        model = build()
        model.fit(X_train, y_train)
        print(held_out_message, model.score(X_test, y_test))
        print(full_data_message, accuracy_score(labels, model.predict(X)))
def BernoulliNB_classification(train, test, train_labels, test_labels, res=None):
    """Train BernoulliNB (alpha=0.7), report on the test set, record the result.

    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict that receives a ``"BernoulliNB"`` entry holding
        the model, its accuracy and its name; a fresh dict is used when
        omitted
    :return: tuple ``(score, model)`` with the test-set score and the fitted
        classifier
    """
    # A mutable ``{}`` default would be shared by every call that omits
    # ``res``, silently accumulating state across calls.
    if res is None:
        res = {}

    print("Classifying with Bernoulli Nive Bayes...")

    bernNB = BernoulliNB(alpha=0.7)
    bernNB.fit(train, train_labels)

    prediction = bernNB.predict(test)
    utils.report_and_confmat(test_labels, prediction, "BernoulliNB")
    score = bernNB.score(test, test_labels)
    res["BernoulliNB"] = {
        "model": bernNB,
        "accuracy": score,
        "name": "BernoulliNB",
    }
    print("Bernoulli ended...")

    return score, bernNB
Exemplo n.º 13
0
def train_idf():
    """Compare four classifiers on TF-IDF features and print their scores.

    Data comes from ``preprocessing()`` and is split by ``split_data``. For
    each of MultinomialNB, BernoulliNB, LogisticRegression and ``svm.SVC``,
    two lines are printed: the score on the held-out split, and the accuracy
    over the complete data set (which includes the training rows).
    """
    data, labels = preprocessing()
    train_data, test_data, y_train, y_test = split_data(data, labels)

    vectorizer = TfidfVectorizer(max_df=0.5, min_df=1, stop_words=None, use_idf=True)
    X_train = vectorizer.fit_transform(train_data)
    X_test = vectorizer.transform(test_data)
    X = vectorizer.transform(data)

    # (factory, held-out message, full-data message); the factories keep
    # construction lazy so each model is created right before it is fitted.
    experiments = (
        (lambda: MultinomialNB(),
         "多项式贝叶斯分类器交叉验证得分: ", "多项式贝叶斯分类器准确率: "),
        (lambda: BernoulliNB(),
         "伯努利贝叶斯分类器交叉验证得分: ", "伯努利贝叶斯分类器准确率: "),
        (lambda: LogisticRegression(),
         "罗吉斯回归分类器交叉验证得分: ", "罗吉斯回归二分类器准确率: "),
        (lambda: svm.SVC(),
         "支持向量机分类器交叉验证得分: ", "支持向量机二分类器准确率: "),
    )
    for build, held_out_message, full_data_message in experiments:
        model = build()
        model.fit(X_train, y_train)
        print(held_out_message, model.score(X_test, y_test))
        print(full_data_message, accuracy_score(labels, model.predict(X)))
Exemplo n.º 14
0
class Model(object):
    """Text classifier: TF-IDF features fed into a Bernoulli Naive Bayes model."""

    def __init__(self):
        """Create the unfitted classifier and its TF-IDF vectorizer."""
        self.model = BernoulliNB(alpha=1)
        self.tfidf = TfidfVectorizer(max_df=1.0,
                                     min_df=1,
                                     stop_words='english',
                                     lowercase=True)

    def fit(self, X, y):
        """Fit vectorizer and classifier, then pickle this object to disk.

        Args:
            X: Iterable of raw text documents.
            y: Labels aligned with ``X``.

        Returns:
            ``self``, after writing a pickle of it to ``data/model.pkl``.
        """
        features = self.tfidf.fit_transform(X)
        self.model.fit(features, y)
        filename = 'data/model.pkl'
        # The context manager guarantees the file is flushed and closed even
        # if pickling fails part-way through.
        with open(filename, 'wb') as handle:
            pickle.dump(self, handle)
        return self

    def predict(self, X):
        """Return predicted labels for the raw text documents in ``X``."""
        features = self.tfidf.transform(X)
        return self.model.predict(features)

    def predict_proba(self, X):
        """Return class probabilities for the raw text documents in ``X``."""
        features = self.tfidf.transform(X)
        return self.model.predict_proba(features)

    def score(self, X, y):
        """Return the classifier's score for raw documents ``X`` and labels ``y``."""
        features = self.tfidf.transform(X)
        return self.model.score(features, y)
Exemplo n.º 15
0
class BernoulliNaiveBayesClassifier:
    """Thin wrapper around ``BernoulliNB`` that owns its training data."""

    def __init__(self, x_train, y_train):
        """Store the training data and create an unfitted ``BernoulliNB``."""
        self._x_train = x_train
        self._y_train = y_train
        self._bernoulli_naive_bayes = BernoulliNB()

    def train(self):
        """Fit the classifier on the stored training data."""
        self._bernoulli_naive_bayes.fit(self._x_train, self._y_train)

    def test(self, x_test):
        """Return the predicted labels for ``x_test``."""
        return self._bernoulli_naive_bayes.predict(x_test)

    def accuracy(self, x_test, y_test):
        """Return the classifier's score on ``x_test`` / ``y_test``."""
        return self._bernoulli_naive_bayes.score(x_test, y_test)

    def get_average_f1_score(self, x_test, y_test):
        """Append predictions to the label file and return the weighted F1.

        The predicted labels are appended, one per line, to
        ``saved_model_data/naive_bayes_labels.txt`` under the directory three
        levels above this file. The F1 score is computed over the labels
        ``[1, 0, -1]`` with ``average='weighted'``.
        """
        labels = [1, 0, -1]
        y_pred = self._bernoulli_naive_bayes.predict(x_test)

        # Save predicted labels
        project_relative_path = os.path.dirname(
            os.path.dirname(os.path.dirname(__file__)))
        print(project_relative_path)
        labels_path = os.path.join(project_relative_path,
                                   'saved_model_data/naive_bayes_labels.txt')
        # The context manager closes (and flushes) the file; previously the
        # handle was left open for the rest of the process.
        with open(labels_path, 'a') as output_file_sentiment_label:
            for label in y_pred:
                output_file_sentiment_label.write(str(label))
                output_file_sentiment_label.write('\n')

        return f1_score(y_test, y_pred, average='weighted', labels=labels)
Exemplo n.º 16
0
    def bnb(self):
        """Fit BernoulliNB and print accuracy, reports and ROC AUC values.

        Reads ``self.X_train``, ``self.y_train``, ``self.X_test`` and
        ``self.y_test``. Prints the test accuracy as a percentage, the
        classification reports for the train and test splits, and the
        one-vs-one and one-vs-rest ROC AUC on the test split.
        """
        from sklearn.naive_bayes import BernoulliNB
        from sklearn.metrics import classification_report, roc_auc_score

        model = BernoulliNB()
        model.fit(self.X_train, self.y_train)

        y_hat_train = model.predict(self.X_train)
        y_hat_test = model.predict(self.X_test)

        test_accuracy = model.score(self.X_test, self.y_test)
        acc_bnb = round(test_accuracy * 100, 2)
        print('Model Accuracy: ', acc_bnb)

        print('Naive Bayes:\n 1. train 2. test')
        train_report = classification_report(self.y_train, y_hat_train)
        test_report = classification_report(self.y_test, y_hat_test)
        print(
            train_report,
            test_report,
            sep='\n-------------------------------------------------------\n')

        y_score = model.predict_proba(self.X_test)
        auc_ovo = roc_auc_score(self.y_test, y_score, multi_class='ovo')
        auc_ovr = roc_auc_score(self.y_test, y_score, multi_class='ovr')
        print(
            'ovo',
            auc_ovo,
            'ovr',
            auc_ovr,
            sep='\n-------------------------------------------------------\n')
Exemplo n.º 17
0
def tryBinomialNaiveBayes(goFast):
    """Grid-search BernoulliNB settings and print the best validation score.

    Every combination of ``alpha`` and ``binarize`` in 0, 0.1, ..., 1 and
    ``fit_prior`` in (True, False) is fitted on the training file and scored
    on the validation file; progress and each new maximum are printed.

    Args:
        goFast: When true, load the ``dt1_1500.*`` files with a fixed
            feature count; otherwise load the full ``dt1.*`` files.
    """
    best_score = 0

    from sklearn.datasets import load_svmlight_file
    if goFast:
        training_data, training_labels = load_svmlight_file("dt1_1500.trn.svm", n_features=253659, zero_based=True)
        validation_data, validation_labels = load_svmlight_file("dt1_1500.vld.svm", n_features=253659, zero_based=True)
        # The test split is loaded but never used below; kept so a missing
        # file is still reported the same way.
        testing_data, testing_labels = load_svmlight_file("dt1_1500.tst.svm", n_features=253659, zero_based=True)
    else:
        training_data, training_labels = load_svmlight_file("dt1.trn.svm")
        validation_data, validation_labels = load_svmlight_file("dt1.vld.svm")
        testing_data, testing_labels = load_svmlight_file("dt1.tst.svm")

    from sklearn.naive_bayes import BernoulliNB

    grid_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    for alpha_value in grid_values:
        for binarize_value in grid_values:
            for fit_prior_value in [True, False]:
                # Keyword arguments: the estimator's parameters cannot be
                # passed positionally in current scikit-learn releases.
                binary_operator = BernoulliNB(alpha=alpha_value,
                                              binarize=binarize_value,
                                              fit_prior=fit_prior_value)
                binary_operator.fit(training_data, training_labels)
                current_score = binary_operator.score(validation_data, validation_labels)

                # One pre-formatted argument per print call gives the same
                # output on Python 2 and 3.
                parameters = "%s %s %s" % (alpha_value, binarize_value, fit_prior_value)
                print("Current test: " + parameters)
                print("Current score: " + str(current_score))

                if current_score > best_score:
                    best_score = current_score
                    print("***NEW MAXIMUM SCORE: " + str(best_score))
                    print("***NEW MAXIMUM PARAMETERS: " + parameters)

    print("Best score was " + str(best_score))
 def doclassify(self, type='normal'):
     """Fit BernoulliNB on the training data and print its training score.

     Only ``type == 'normal'`` does anything; any other value is a no-op.
     Reads ``self.train_x`` and ``self.train_y``.
     """
     if type == 'normal':
         clf = BernoulliNB()
         clf.fit(self.train_x, self.train_y)
         # A stray ``BernoulliNB(alpha=1.0, ...)`` expression used to follow
         # the fit; it only built and discarded an unused estimator.
         score = clf.score(self.train_x, self.train_y)
         # One pre-formatted argument prints the same on Python 2 and 3; the
         # doubled space mirrors the Python 2 print-statement separator.
         print('score =  %s' % (score,))
Exemplo n.º 19
0
    def registerInitialState(self, state):
        """Load the recorded moves and compute every classifier metric.

        Reads ``good-moves.txt``, converts each line with
        ``self.convertToArray`` and splits it into a feature row (all values
        but the last) and a target (the last value). It then evaluates the
        custom Naive Bayes implementation (train/test split, 10-fold
        cross-validation, training score and matrix) and scikit-learn's
        ``BernoulliNB`` on the same data for comparison.

        Args:
            state: Unused by this method.
        """
        # The context manager closes the file even if reading fails;
        # ``self.datafile`` still ends up holding the closed handle.
        with open('good-moves.txt', 'r') as self.datafile:
            content = self.datafile.readlines()

        # data: one list of feature values per line.
        # target: the last value of each line.
        self.data = []
        self.target = []
        for line in content:
            lineAsArray = self.convertToArray(line)
            self.data.append(list(lineAsArray[:-1]))
            self.target.append(lineAsArray[-1])

        # =====================================================================
        # Start: Running the classifier
        # =====================================================================

        # Score of the custom classifier on a 0.2 train/test split.
        self.split_score = self.train_test_splitter(self.data, self.target,
                                                    0.2)

        # Custom-built 10-fold cross-validation score.
        self.cross_val_score = self.k_fold(self.data, self.target, 10)

        # Train the custom classifier on all the data.
        self.probabilities, self.prior = self.Naive_Bayes_Train(
            self.data, self.target)

        self.score = 0  # This allows us to only print the metrics once (see getAction)
        # Training score and matrix of the custom classifier.
        self.training_score, self.matrix = self.Bayes_score(
            self.data, self.target)

        # scikit-learn metrics, used to compare the two implementations.
        clf = BernoulliNB().fit(self.data, self.target)
        self.scikit_score = clf.score(self.data, self.target)
        self.scikit_cross_val = cross_val_score(clf,
                                                self.data,
                                                self.target,
                                                cv=10).mean()
        self.scikit_matrix = confusion_matrix(self.target,
                                              clf.predict(self.data))
Exemplo n.º 20
0
def naive_bayes(X_train, Y_train, X_test, Y_test):
    """Fit BernoulliNB, print its test score and pickle the model.

    Args:
        X_train: Training features.
        Y_train: Training labels.
        X_test: Test features.
        Y_test: Test labels.

    The fitted classifier is written to ``./naive_bayes_glove.sav``.
    """
    classifier = BernoulliNB()
    classifier.fit(X_train, Y_train)
    print("accuracy score of naive bayes")
    print(classifier.score(X_test, Y_test))
    filename = './naive_bayes_glove.sav'
    # The context manager guarantees the pickle is flushed and the handle
    # closed; the bare ``open(...)`` left that to garbage collection.
    with open(filename, 'wb') as handle:
        pickle.dump(classifier, handle)
Exemplo n.º 21
0
def bernoulli_nb(X_train, y_train, X_test, y_test):
    """Print the test score of BernoulliNB for a fixed list of alpha values.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
    """
    alpha_values = (0.0000000001, 0.000001, 0.0001, 0.1, 0.2, 0.5, 0.8, 1.0, 1.5, 2.0)

    for alpha in alpha_values:
        model = BernoulliNB(alpha=alpha)
        model.fit(X_train, y_train)
        test_score = model.score(X_test, y_test)
        message = 'Score bnb(alpha=%s): %s' % (str(alpha), str(test_score))
        print(message)
Exemplo n.º 22
0
def cross_validate(X, y):
    """Return the BernoulliNB score, in percent, on a fixed 70/30 split.

    The split uses ``random_state=0`` with 30% of the rows held out; the
    model is fitted on the remaining 70%.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.3, random_state=0)

    model = BernoulliNB()
    model.fit(X_train, y_train)
    held_out_score = model.score(X_test, y_test)
    return held_out_score * 100
def BNB(X_train, y_train, X_test, y_test, weights={0: 1, 1: 1}, alpha = 1.0, folder = "bush_models"):
    """Fit BernoulliNB, dump it with joblib and print its test score.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
        weights: Accepted but not used by this function.
        alpha: Smoothing parameter passed to ``BernoulliNB``.
        folder: Directory that receives ``<alpha>_bnb.joblib``.
    """
    model = BernoulliNB(alpha=alpha)
    # fit() returns the estimator itself, so rebinding keeps the same object.
    model = model.fit(X_train, y_train)

    model_path = folder + "/" + str(alpha) + '_bnb.joblib'
    joblib.dump(model, model_path)

    test_score = model.score(X_test, y_test)
    print(test_score)
def predict_NB_Bernoulli(X, Y):
    """Fit BernoulliNB and report accuracy on a validation split.

    The data is split 70/15/15 into train/validation/test with
    ``random_state=42``; the test part is not used here. Besides the
    accuracy, the average probability the model assigned to the true class
    of each validation sample is printed.

    Args:
        X: Feature matrix.
        Y: Labels. They are used directly as column indices into
            ``predict_proba`` output, so they are assumed to be integers
            0..k-1 matching the model's class order -- TODO confirm.

    Returns:
        The validation-set score.
    """
    X_dev, X_test, y_dev, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.15,
                                                    random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_dev,
                                                      y_dev,
                                                      test_size=0.15 / 0.85,
                                                      random_state=42)
    clf = BernoulliNB()
    clf.fit(X_train, y_train)

    # Scored once and reused for both the printout and the return value.
    accuracy = clf.score(X_val, y_val)
    print("Accuracy: " + str(accuracy))

    prob_X = clf.predict_proba(X_val)
    prob_score = 0
    # zip walks the labels positionally. ``y_val[i]`` is a label-based lookup
    # on a pandas Series, whose index is no longer 0..n-1 after the shuffled
    # split.
    for row, label in zip(prob_X, y_val):
        prob_score += row[label]
    print("Average prob for correct classes: " + str(prob_score / len(y_val)))
    return accuracy
Exemplo n.º 25
0
def ml_algo(inp):
    """Train six models on the preprocessed CSV and return the k-NN prediction.

    The data in ``data/final_preprocess.csv`` is shuffled (seed 1) and split
    80/20. Nearest-centroid, k-NN (k=25), SVC, linear regression, Bernoulli
    Naive Bayes and a depth-3 entropy decision tree are all fitted, scored on
    the held-out part and asked to predict ``inp``; only the k-NN result is
    returned.

    Args:
        inp: Samples to predict, in the layout the estimators' ``predict``
            expects.

    Returns:
        The first element of the k-NN prediction for ``inp``.
    """
    frame = pd.read_csv("data/final_preprocess.csv")
    features = np.array(frame.drop(['Result'], axis=1))
    target = np.array(frame['Result'])
    features, target = shuffle(features, target, random_state=1)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        features, target, test_size=0.2)

    # Factories keep construction lazy; the order matches the fitting order.
    builders = (
        ('centroid', lambda: NearestCentroid()),
        ('knn', lambda: KNeighborsClassifier(25)),
        ('svm', lambda: SVC()),
        ('lr', lambda: LinearRegression()),
        ('nb', lambda: BernoulliNB()),
        # Entropy criterion, fixed seed, shallow tree with at least five
        # samples per leaf.
        ('dtree', lambda: DecisionTreeClassifier(criterion="entropy",
                                                 random_state=100,
                                                 max_depth=3,
                                                 min_samples_leaf=5)),
    )

    fitted = [(name, build().fit(X_train, y_train)) for name, build in builders]

    # Held-out scores of every model (computed, not reported).
    accuracies = dict(
        (name, model.score(X_test, y_test)) for name, model in fitted)

    # Predictions of every model for the requested input.
    predictions = dict(
        (name, model.predict(inp)) for name, model in fitted)

    return predictions['knn'][0]
Exemplo n.º 26
0
 def doclassify(self, type='normal'):
     """Fit BernoulliNB on the training data and print its training score.

     Only ``type == 'normal'`` does anything; any other value is a no-op.
     Reads ``self.train_x`` and ``self.train_y``.
     """
     if type == 'normal':
         clf = BernoulliNB()
         clf.fit(self.train_x, self.train_y)
         # A stray ``BernoulliNB(alpha=1.0, ...)`` expression used to follow
         # the fit; it only built and discarded an unused estimator.
         score = clf.score(self.train_x, self.train_y)
         # One pre-formatted argument prints the same on Python 2 and 3; the
         # doubled space mirrors the Python 2 print-statement separator.
         print('score =  %s' % (score,))
Exemplo n.º 27
0
def getBernoulliNaiveBayesPredictions(bestAlpha, X_train, y_train, X_test,
                                      y_test):
    """Fit BernoulliNB with the given alpha and evaluate it on the test set.

    Args:
        bestAlpha: Smoothing parameter for ``BernoulliNB``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        Tuple ``(predictions, score)`` for the test set. The fitted model is
        also printed.
    """
    classifier = BernoulliNB(alpha=bestAlpha)
    classifier.fit(X_train, y_train)

    print(classifier)

    predictions = classifier.predict(X_test)
    test_score = classifier.score(X_test, y_test)
    return predictions, test_score
def BNB(train_x, train_y, test_x, test_y):  # print the BernoulliNB results
    """Fit BernoulliNB and print its test predictions and accuracy.

    The predictions are reshaped to a 10 x 12 grid before printing, so the
    test set must contain exactly 120 samples.
    """
    model = BernoulliNB()
    model.fit(train_x, train_y)
    prediction_grid = model.predict(test_x).reshape(10, 12)

    print('BernoulliNB의 테스트 세트 예측 :\n{}'.format(prediction_grid))
    accuracy_percent = model.score(test_x, test_y) * 100
    print('BernoulliNB의 테스트 세트 정확도 : {0:0.2f}%'.format(accuracy_percent))
    print('------------------------------------------------------')
Exemplo n.º 29
0
class TextClassifier(object):
    """Bag-of-words text classifier.

    Raw text is turned into token counts by a ``CountVectorizer`` and then
    classified by a ``BernoulliNB`` model.
    """

    def __init__(self):
        self._vectorizer = CountVectorizer()
        self._classifier = BernoulliNB()

    def _featurize(self, X):
        """Vectorize raw text with the already fitted vectorizer."""
        return self._vectorizer.transform(X)

    def fit(self, X, y):
        """Fit the vectorizer and the classifier.

        Parameters
        ----------
        X: A numpy array or list of text fragments, to be used as predictors.
        y: A numpy array or python list of labels, to be used as responses.

        Returns
        -------
        self: The fit model object.
        """
        features = self._vectorizer.fit_transform(X, y)
        self._classifier.fit(features, y=y)
        return self

    def predict_proba(self, X):
        """Return class probabilities for the text fragments in ``X``."""
        return self._classifier.predict_proba(self._featurize(X))

    def predict(self, X):
        """Return predicted labels for the text fragments in ``X``."""
        return self._classifier.predict(self._featurize(X))

    def score(self, X, y):
        """Return the classifier's score for text ``X`` against labels ``y``."""
        return self._classifier.score(self._featurize(X), y)
Exemplo n.º 30
0
def main():
    """Train Bernoulli and Multinomial NB on one player's games and report.

    Builds the feature matrix for player 8439, uses the first 90% of rows
    for training and the rest for testing, prints per-game predictions,
    accuracies and probability estimates for both models, and finally the
    expected value derived from the Bernoulli probabilities of the first
    test row.
    """
    start_time = time.time()
    games_data = pd.read_csv('games-data.csv')

    # read in player IDs
    player_data = pd.read_csv('players.csv')
    plyr_ids = np.unique(np.array(player_data['ID']))

    # read in fantasy scores
    fantasy_scores = pd.read_csv('fantasy_scores.csv')

    # gets player training matrix
    plyr_id = 8439
    X = create_training_set(plyr_id, games_data, plyr_ids)
    index = get_ninety_percent(len(np.array(X.index)))  # for cross-validation
    train_X = X[:index]
    test_X = X[index:]

    # gets training output vector
    # NOTE(review): the game ids come from train_X only, yet Y is split at
    # ``index`` again and the report loop below indexes these arrays from 0
    # while test rows start at ``index`` -- confirm the alignment against
    # plyr_fantasy_pts.
    plyr_game_ids = np.array(train_X.index)
    scores = plyr_fantasy_pts(plyr_id, plyr_game_ids, fantasy_scores)
    Y = discretize(scores.values)
    train_Y = Y[:index]
    test_Y = Y[index:]

    # run Bernoulli NB Classifier
    nb_clf = BernoulliNB()
    nb_clf.fit(train_X, train_Y)
    nb_predictions = nb_clf.predict(test_X)

    # run Multinomial NB Classifier
    mn_clf = MultinomialNB()
    mn_clf.fit(train_X, train_Y)
    # Fixed: these predictions previously came from nb_clf, so the
    # "multinomial" column merely repeated the Bernoulli predictions.
    mn_predictions = mn_clf.predict(test_X)

    # test for game, fantasy score alignment
    # One pre-formatted argument per print call gives the same output on
    # Python 2 and 3; doubled spaces mirror the Python 2 separator.
    for i in range(test_Y.shape[0]):
        print("%s %s %s %s %s" % (plyr_game_ids[i], scores.values[i],
                                  test_Y[i], nb_predictions[i],
                                  mn_predictions[i]))

    print("Bernoulli NB accuracy:  %s" % (nb_clf.score(test_X, test_Y),))

    # Bernoulli probabilities are computed once and reused below.
    nb_probabilities = nb_clf.predict_proba(test_X)
    print("Bernoulli NB prob estimates:  %s" % (nb_probabilities,))
    print("Multinomial NB accuracy:  %s" % (mn_clf.score(test_X, test_Y),))
    # Fixed label: this line prints the Multinomial model's estimates.
    print("Multinomial NB prob estimates:  %s" % (mn_clf.predict_proba(test_X),))
    print(len(nb_probabilities[0]))
    nb_norm_prob = normalize_probs(nb_probabilities[0])
    vals = [1.5, 4.5, 7.5, 10.5, 13.5, 16.5, 19.5, 22.5, 25.5, 28.5, 31.5]
    ev = expected_val(nb_norm_prob, vals)
    print("EV:  %s" % (ev,))
    end_time = time.time()
    print("Elapsed time was %g seconds" % (end_time - start_time))
def trainBernoulliNB(X,y,loadweights):
    """Incrementally train BernoulliNB, persist it and print its score.

    Args:
        X: Training features.
        y: Training labels; the classes are fixed to ``[0, 1]``.
        loadweights: When true, continue from the model pickled in
            ``weights/BernoulliNB.pickle`` instead of a fresh estimator.

    The model receives ten ``partial_fit`` passes over the same data, is
    pickled back to ``weights/BernoulliNB.pickle`` and its score on
    ``(X, y)`` is printed.
    """
    weights_path = 'weights/BernoulliNB.pickle'
    print("Training BernoulliNB...")
    BN_classifier = BernoulliNB()
    if loadweights:
        # NOTE: unpickling executes arbitrary code; only load weight files
        # from a trusted source.
        with open(weights_path, 'rb') as handle:
            BN_classifier = pickle.load(handle)

    passes_left = 10
    while passes_left > 0:
        BN_classifier.partial_fit(X, y, classes=[0, 1])
        passes_left -= 1

    with open(weights_path, 'wb') as handle:
        pickle.dump(BN_classifier, handle, protocol=pickle.HIGHEST_PROTOCOL)
    training_score = BN_classifier.score(X, y)
    print (training_score)
Exemplo n.º 32
0
    def train_classifier(self):
        """Fit BernoulliNB, cross-validate it and report test-set metrics.

        Uses the names ``x_train``, ``y_train``, ``x_test`` and ``y_test``
        from the enclosing scope rather than attributes of ``self``.

        Returns:
            The fitted ``BernoulliNB`` classifier.
        """
        # ----- train the classifier -----
        print ("Fitting data ...")
        clf = BernoulliNB()
        clf.fit(x_train, y_train)

        # ----- 10-fold cross-validation (scoring is 'f1_weighted') -----
        scores = cross_val_score(clf, x_train, y_train, cv=10, scoring='f1_weighted')
        print ("Cross validation score: "+str(scores))

        # Mean fold score with a two-standard-deviation spread.
        mean_score = scores.mean()
        spread = scores.std() * 2
        print("Cross validation accuracy: %0.2f (+/- %0.2f)" % (mean_score, spread))

        # ----- evaluate on the test data -----
        y_predicted = clf.predict(x_test)

        test_score = clf.score(x_test,y_test)
        print ("Classifier score on test data is: %0.2f " % test_score)

        report = metrics.classification_report(y_test, y_predicted)
        print(report)
        cm = metrics.confusion_matrix(y_test, y_predicted)
        print(cm)

        return clf
Exemplo n.º 33
0
def main(X_data, y_data, test_size):
    """Fit BernoulliNB on a random split of the data and score it.

    The value handed to ``train_test_split`` as its ``test_size`` is
    ``1 - test_size``. Only the training features are densified with
    ``.toarray()``; the held-out features are scored exactly as returned by
    the split.

    Returns the classifier's ``score`` on the held-out part.
    """
    holdout_fraction = 1 - test_size
    split = cross_validation.train_test_split(
        X_data, y_data, test_size=holdout_fraction)
    X_train, X_test, y_train, y_test = split

    # create the classifier and fit it on the dense training matrix
    model = BernoulliNB()
    model.fit(X_train.toarray(), y_train)

    # report the classifier's result on the test set
    return model.score(X_test, y_test)
class SKLearnBernoulliNB(ClassificationModel):
    """Thin wrapper that delegates train/score/predict to a ``BernoulliNB``.

    The wrapped estimator is kept in ``self.bernoulli_nb``.
    """

    def __init__(self, alpha=1.0):
        """Create the wrapped estimator; ``alpha`` is forwarded to ``BernoulliNB``."""
        self.bernoulli_nb = BernoulliNB(alpha=alpha)

    def train(self, data, labels):
        """Fit the wrapped estimator on ``data``/``labels``; returns nothing."""
        self.bernoulli_nb.fit(data, labels)

    def score(self, data, labels):
        """Return the wrapped estimator's ``score`` for ``data``/``labels``."""
        return self.bernoulli_nb.score(data, labels)

    def predict(self, data):
        """Return the wrapped estimator's predictions for ``data``."""
        return self.bernoulli_nb.predict(data)
Exemplo n.º 35
0
def BNB(alphas):
    """Print Bernoulli naive Bayes dev-set accuracy for several smoothing values.

    Models are fitted on the module-level ``train_data``/``train_labels`` and
    scored on ``dev_data``/``dev_labels``. ``alphas`` is passed to
    ``GridSearchCV`` as its ``param_grid``; the search itself is fitted on
    the training data. Everything is printed and nothing is returned.
    """
    def dev_accuracy(**params):
        # Every model binarizes its features at 0.5; only alpha varies.
        model = BernoulliNB(binarize=0.5, **params)
        model.fit(train_data, train_labels)
        return model.score(dev_data, dev_labels)

    print(
        "\n\nBernoulli Naive Bayes accuracy when alpha = 1 (the default value):",
        dev_accuracy())

    print("BNB accuracy when alpha = 0:", dev_accuracy(alpha=0))

    search = GridSearchCV(BernoulliNB(binarize=0.5), param_grid=alphas)
    search.fit(train_data, train_labels)
    print("Best parameter for BNB on the dev data:", search.best_params_)

    # The "tuned" alpha below is a fixed constant; it is not read back from
    # the grid search result.
    print("Accuracy using the tuned Laplace smoothing parameter:",
          dev_accuracy(alpha=0.00000000000000000000001), "\n\n")
Exemplo n.º 36
0
    def performBernoulli(self, alpha=1.0):
        """Train BernoulliNB on the stored split and return its test accuracy.

        ``alpha`` is forwarded to ``BernoulliNB``. The method reads
        ``self.X_train``, ``self.Y_train``, ``self.X_test`` and
        ``self.Y_test``, and prints the time spent fitting, the time spent
        scoring, and the accuracy.

        Returns the value of ``score`` on the test split.
        """
        t0 = time.time()
        clf = BernoulliNB(alpha=alpha)
        clf.fit(self.X_train, self.Y_train)
        print("Time taken to Train: %s seconds ---" % (time.time() - t0))

        t0 = time.time()
        accuracy = clf.score(self.X_test, self.Y_test)
        print("Time taken to Tests: %s seconds ---" % (time.time() - t0))
        # Call form with one pre-formatted string: same output on Python 2,
        # and no longer a syntax error on Python 3 like the old statement.
        print("Accuracy : %s" % accuracy)

        return accuracy
Exemplo n.º 37
0
def linear_svm(training_data, testing_data, training_target, testing_target):
    """Fit a BernoulliNB model (despite the function name) and return ROC AUC.

    The AUC is computed from column 1 of ``predict_proba`` on the test data.
    The first 30 scores and first 30 targets are printed, followed by the
    elapsed time and the test-set ``score``.
    """
    started = time()
    model = BernoulliNB()
    model.fit(training_data, training_target)

    # Column 1 of predict_proba is used as the ranking score for the AUC.
    positive_scores = model.predict_proba(testing_data)[:, 1]
    print(positive_scores[:30])
    print(testing_target[:30])
    auc_value = roc_auc_score(testing_target, positive_scores)
    finished = time()

    # The interval printed as "Training time" spans fitting, prediction and
    # the AUC computation.
    print("Training time: {}".format(finished - started))
    print("mean accuracy:{}".format(model.score(testing_data, testing_target)))
    return auc_value
Exemplo n.º 38
0
def predict_scores(markers, threshold=0.05):
	"""Rank markers by training-set score and keep the top fraction.

	For each marker a fresh ``BNB`` model is fitted on
	``marker["individuals"]`` / ``marker["population_labels"]`` and scored on
	the same data. A marker whose fit or score raises gets a score of 0.0.

	Parameters:
		markers: sequence of dict-like objects with the two keys above.
		threshold: fraction of markers to keep (truncated towards zero).

	Returns:
		A list of ``(score, marker_index)`` tuples, best first, of length
		``int(threshold * len(markers))``.
	"""
	scores = []
	for i, marker in enumerate(markers):
		try:
			bnb = BNB()
			bnb.fit(marker["individuals"], marker["population_labels"])
			score = bnb.score(marker["individuals"], marker["population_labels"])
		except Exception:
			# Best effort: a marker that cannot be modelled ranks last.
			# KeyboardInterrupt/SystemExit are deliberately not swallowed.
			score = 0.0
		scores.append((score, i))

	# Descending by score, ties broken by descending index (tuple order).
	scores.sort(reverse=True)

	cutoff_idx = int(threshold * len(scores))

	return scores[:cutoff_idx]
Exemplo n.º 39
0
def do_TRT(ne=10, md=3):
    """Score BernoulliNB on a random-trees embedding of the glass data.

    Parameters:
        ne: number of trees for ``RandomTreesEmbedding`` (``n_estimators``).
        md: maximum tree depth (``max_depth``).

    The embedding is fitted on the stacked train and test features, then the
    transformed rows are split back into their train and test parts.

    Returns the classifier's ``score`` on the transformed test rows.
    """
    from sklearn.ensemble import RandomTreesEmbedding
    from sklearn.naive_bayes import BernoulliNB
    train_X, train_Y, test_X, test_Y = analysis_glass()

    # Split point derived from the data instead of a hard-coded 149; the fit
    # below needs as many rows as there are training labels anyway.
    n_train = len(train_X)

    all_X = np.vstack((train_X, test_X))
    hasher = RandomTreesEmbedding(n_estimators=ne,
                                  random_state=0, max_depth=md)
    all_X_trans = hasher.fit_transform(all_X)

    # Slice the TRANSFORMED matrix. The previous code sliced ``all_X``, so
    # the embedding was computed and then never used.
    train_X_trans = all_X_trans[:n_train]
    test_X_trans = all_X_trans[n_train:]

    nb = BernoulliNB()
    nb.fit(train_X_trans, train_Y)

    return nb.score(test_X_trans, test_Y)
Exemplo n.º 40
0
	def compare_sklearn(self):
		'''
			Compares our implementation to sklearn's BernoulliNB.

			Fits BernoulliNB(alpha=0) on self.np_reps / self.gold, scores it
			on the same data and prints that score next to self.accuracy.

			Raises RuntimeError if self.accuracy_tested is falsy, i.e. the
			accuracy of our own classifier has not been evaluated yet.
		'''
		if not self.accuracy_tested:
			# Raising a bare string is not valid; it produced a TypeError
			# that hid this message from the caller.
			raise RuntimeError('you must test the accuracy of the classifier before comparing to sklearn')
		print("--> Checking sklearn's accuracy...")
		X = np.array(self.np_reps)
		y = np.array(self.gold)
		nb = BernoulliNB(alpha=0)
		nb.fit(X, y)
		print("...done.")
		print("sklearn accuracy is %f. Our accuracy was %f. " % (nb.score(X, y), self.accuracy))
Exemplo n.º 41
0
def plot_scores(markers, flname):
	"""Save a histogram of per-marker training-set scores to ``flname``.

	For each marker a fresh ``BNB`` model is fitted on
	``marker["individuals"]`` / ``marker["population_labels"]`` and scored on
	the same data; a marker whose fit or score raises counts as 0.0.

	Parameters:
		markers: iterable of dict-like objects with the two keys above.
		flname: output path handed to ``plt.savefig``.
	"""
	plt.clf()
	scores = []
	for marker in markers:
		try:
			mnb = BNB()
			mnb.fit(marker["individuals"], marker["population_labels"])
			scores.append(mnb.score(marker["individuals"], marker["population_labels"]))
		except Exception:
			# Best effort: unmodellable markers land in the lowest bin.
			scores.append(0.0)

	# 101 edges give 100 bins of width 0.01 over [0, 1]. The previous
	# np.arange(0.0, 1.0, 0.01) stopped at 0.99, so scores above 0.99
	# (including perfect ones) were silently left out of the plot.
	plt.hist(scores, bins=np.linspace(0.0, 1.0, 101))

	plt.xlabel("Score", fontsize=18)
	plt.ylabel("Occurrences", fontsize=18)

	# savefig's resolution keyword is lowercase ``dpi``.
	plt.savefig(flname, dpi=200)
Exemplo n.º 42
0
def bnb(training_data, training_target, testing_data, testing_target):
    """
    DESCRIPTION:
        Fit a BernoulliNB classifier with default settings on the training
        data and evaluate it on the testing data.

    INPUTS:
        training_data, training_target: features and labels passed to fit.
        testing_data, testing_target: features and labels passed to score.

    OUTPUTS:
        The value returned by the classifier's score method.

    EXAMPLE USAGE:
        acc = bnb(X_train, y_train, X_test, y_test)
    """
    model = BernoulliNB().fit(training_data, training_target)
    return model.score(testing_data, testing_target)
def BernoulliNaiveBayes(listOfTrainComments, listOfTestComments, listOfUniqueTokens):
	"""Train BernoulliNB on bag-of-words rows and print the test accuracy.

	Each comment is turned into a feature row with
	``generateBOW(comment, listOfUniqueTokens)`` and its label is read with
	``comment.getStatus()``. Nothing is returned; the accuracy is printed as
	a percentage rounded to two decimals.
	"""
	def vectorize(comments):
		# One feature row and one label per comment, in input order.
		rows, statuses = [], []
		for comment in comments:
			rows.append(generateBOW(comment, listOfUniqueTokens))
			statuses.append(comment.getStatus())
		return rows, statuses

	xTrain, yTrain = vectorize(listOfTrainComments)
	xTest, yTest = vectorize(listOfTestComments)

	model = BernoulliNB()
	model.fit(xTrain, yTrain)
	accUsingSklearn = model.score(xTest, yTest)
	print('Bernoulli Naive Bayes Classifier, Accuracy - ' + str(round(accUsingSklearn*100, 2)) + '%', '\n')
Exemplo n.º 44
0
def classify(opts, data_train, labels_train):
	"""Build the model selected by ``opts.classifier`` and fit it.

	Recognised values are 'nb' (BernoulliNB with binarize=None), 'lr'
	(LinearRegression), 'log' (LogisticRegression) and 'svm' (LinearSVC).
	Any other value raises ``Exception('Unrecognized classifier!')``.

	Returns the fitted model.
	"""
	# Guard clause first, so an unknown option never touches a constructor.
	if opts.classifier not in ('nb', 'lr', 'log', 'svm'):
		raise Exception('Unrecognized classifier!')

	# Constructors are wrapped in lambdas so only the requested one runs.
	factories = {
		'nb': lambda: BernoulliNB(binarize=None),
		'lr': lambda: LinearRegression(),
		'log': lambda: LogisticRegression(),
		'svm': lambda: LinearSVC(),
	}
	classifier = factories[opts.classifier]()
	classifier.fit(data_train, labels_train)

	# The training-set score is still computed, as before, but its value is
	# neither printed nor returned.
	classifier.score(data_train, labels_train)

	return classifier
import scipy.io
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

# Labels come from the .mat file, features from the pre-split .npy files.
data = scipy.io.loadmat('NewsGroup.mat')
TRAIN_LABEL = data['TRAIN_LABEL']
TEST_LABEL = data['TEST_LABEL']
split_TEST_DATA = np.load("split_TEST_DATA.npy")
split_TRAIN_DATA = np.load("split_TRAIN_DATA.npy")
# Call form with a single argument prints the same text on Python 2 and 3,
# matching the print(...) calls used further down.
print(np.shape(split_TEST_DATA))
print(np.shape(split_TRAIN_DATA))

# Reshape each label array to one dimension once, instead of repeating the
# reshape in every fit/score call.
train_labels_1d = np.reshape(TRAIN_LABEL, (np.shape(TRAIN_LABEL)[0]))
test_labels_1d = np.reshape(TEST_LABEL, (np.shape(TEST_LABEL)[0]))

# Multinomial naive Bayes: print train score, then test score.
mnb = MultinomialNB(alpha=1, fit_prior=True)
# NOTE: despite its name, y_pred holds the value returned by fit(), not
# predictions. The name is kept because it is a module-level variable.
y_pred = mnb.fit(split_TRAIN_DATA, train_labels_1d)
print(mnb.score(split_TRAIN_DATA, train_labels_1d))
print(mnb.score(split_TEST_DATA, test_labels_1d))

# Bernoulli naive Bayes on the same data: print train score, then test score.
bnb = BernoulliNB(alpha=1, fit_prior=True)
y_pred = bnb.fit(split_TRAIN_DATA, train_labels_1d)
print(bnb.score(split_TRAIN_DATA, train_labels_1d))
print(bnb.score(split_TEST_DATA, test_labels_1d))
Exemplo n.º 46
0
def run_bernoulli_naive_bayes(training_data,training_target,testing_data,testing_target):
	"""Fit a default BernoulliNB on the training data; return its test score."""
	model = BernoulliNB().fit(training_data, training_target)
	return model.score(testing_data, testing_target)
Exemplo n.º 47
0
def main():
	"""Train, validate and evaluate a binary tweet classifier from the command line.

	Steps, in order:
	  1. Parse the options (-training, -business_file, -c, -top, -test).
	  2. Read the training CSV (column 1 = integer label, column 2 = text)
	     and vectorize the text with a binary CountVectorizer.
	  3. Fit the classifier chosen by -c ('nb', 'log' or 'svm').
	  4. Print training accuracy and 10-fold cross-validation accuracy.
	  5. Optionally print the most informative features (-top).
	  6. Without -test, classify one sample tweet. With -test, print the test
	     accuracy and, for non-SVM models, plot the ROC curve and write
	     'positive_bias.csv' / 'negative_bias.csv' with the name and address
	     of up to 100 businesses per file.

	Raises Exception('Unrecognized classifier!') for an unknown -c value.
	"""
	##### DO NOT MODIFY THESE OPTIONS ##########################
	parser = argparse.ArgumentParser()
	parser.add_argument('-training', required=True, help='Path to training data')
	parser.add_argument('-business_file', required=True, help='Path to business data')
	parser.add_argument('-c', '--classifier', default='nb', help='nb | log | svm')
	parser.add_argument('-top', type=int, help='Number of top features to show')
	parser.add_argument('-test', help='Path to test data')
	opts = parser.parse_args()
	############################################################

	##### BUILD TRAINING SET ###################################
	# Tokenizer comes from the project's tokenizer module.
	tokenizer = Tokenizer()
	vectorizer = CountVectorizer(binary=True, lowercase=True, decode_error='replace', tokenizer=tokenizer)

	# Load training text and labels. Labels are converted to int so that the
	# label data is numeric (0 or 1, not '0' or '1').
	tweets = []
	lable = []
	with open(opts.training) as csv_file:
		for line in csv.reader(csv_file):
			tweets.append(line[2])
			lable.append(int(line[1]))

	# Training features from the vectorizer; labels as a numpy array.
	vocabulary = vectorizer.fit_transform(tweets)
	lable = np.array(lable)
	############################################################

	##### TRAIN THE MODEL ######################################
	if opts.classifier == 'nb':
		classifier = BernoulliNB(binarize=None)
	elif opts.classifier == 'log':
		classifier = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None)
	elif opts.classifier == 'svm':
		classifier = LinearSVC(penalty='l2', loss='l2', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None)
	else:
		raise Exception('Unrecognized classifier!')
	classifier.fit(vocabulary, lable)
	############################################################

	###### VALIDATE THE MODEL ##################################
	# Training mean accuracy, then 10-fold cross validation (mean and std).
	print("Training accuracy: %f" % classifier.score(vocabulary, lable))
	scores = cross_validation.cross_val_score(classifier, vocabulary, lable, scoring = 'accuracy', cv=10)
	print("Cross-Validation Accuracy: %f (+/- %f)" % (scores.mean(), scores.std()))
	############################################################

	##### EXAMINE THE MODEL ####################################
	if opts.top is not None:
		# print top n most informative features for positive and negative classes
		print('Most informative features')
		util.print_most_informative_features(opts.classifier, vectorizer, classifier, opts.top)
	############################################################

	##### TEST THE MODEL #######################################
	if opts.test is None:
		# Test the classifier on one sample test tweet
		# Tim Kraska 10:43 AM - 5 Feb 13
		test_tweet = 'Water dripping from 3rd to 1st floor while the firealarm makes it hard to hear anything. BTW this is the 2nd leakage.  Love our new house'

		terms = vectorizer.transform([test_tweet])

		# Predicted label of the test tweet
		print(classifier.predict(terms))
		# Per-label confidence: probabilities where available, otherwise the
		# decision function (the SVM has no predict_proba).
		if opts.classifier != 'svm':
			print(classifier.predict_proba(terms))
		else:
			print(classifier.decision_function(terms))
	else:
		# Test set CSV: column 0 = business id, column 1 = label, column 2 = text.
		business = []
		test_tweets = []
		true_lable = []
		with open(opts.test) as csv_file:
			for line in csv.reader(csv_file):
				business.append(line[0])
				test_tweets.append(line[2])
				true_lable.append(int(line[1]))
		terms = vectorizer.transform(test_tweets)
		true_lable = np.array(true_lable)
		predict_lable = classifier.predict(terms)

		# Fraction of matching labels. The previous int/int expression was
		# truncated by Python 2 integer division, and its XOR-based error
		# count was only meaningful for 0/1 labels.
		accuracy = np.mean(true_lable == predict_lable)
		print("Test accuracy: %f" % accuracy)

		if opts.classifier != 'svm':
			test_predicted_proba = classifier.predict_proba(terms)
			util.plot_roc_curve(true_lable, test_predicted_proba)

			# Rows are [test index, proba column 0, proba column 1], grouped
			# by the TRUE label of the tweet.
			positive_prob = []
			negative_prob = []
			for i, label in enumerate(true_lable):
				row = [i, test_predicted_proba[i][0], test_predicted_proba[i][1]]
				if label == 1:
					positive_prob.append(row)
				else:
					negative_prob.append(row)

			# Truly positive tweets with the highest column-0 probability and
			# truly negative tweets with the lowest: up to 100 of each.
			positive_bias = sorted(positive_prob, key=itemgetter(1), reverse=True)[0:100]
			negative_bias = sorted(negative_prob, key=itemgetter(1))[0:100]

			# business_id -> [name, full_address], one JSON object per line.
			bdic = {}
			with open(opts.business_file, 'r') as bfile:
				for raw in bfile:
					record = json.loads(raw)
					bdic[record['business_id']] = [record['name'], record['full_address']]

			# Context managers make sure both CSV files are flushed and closed.
			with open('positive_bias.csv', 'w') as positive, open('negative_bias.csv', 'w') as negative:
				writer_positive = csv.writer(positive)
				writer_negative = csv.writer(negative)
				for item in positive_bias:
					name, address = bdic[business[item[0]]]
					writer_positive.writerow((name, address))
				for item in negative_bias:
					name, address = bdic[business[item[0]]]
					writer_negative.writerow((name, address))

		'''
Exemplo n.º 48
0
def main():
    """Pull California bill and legislator data from MongoDB and fit a naive Bayes model.

    Steps, in order: query ``bills_details`` and ``legislators``, load both
    results into DataFrames, merge in a median-income CSV, derive per-bill
    columns, build a design matrix with ``dmatrices`` and fit ``BernoulliNB``.
    Progress is reported through ``logger``; the merged frame and the model
    diagnostics are printed. Nothing is returned.
    """

    logger.info('Started')
    #============================================================================
    #Establish connection and make database object
    #============================================================================
    db  = EstablishConnection()

    # NOTE(review): these four handles are not referenced again in this
    # function; the queries below go through ``db`` directly.
    bill_table = db.ca_bills               #bill table
    bill_d_table = db.bills_details        #bill details table
    legislator_table = db.legislators      #legislator table
    committee_table = db.committees        #committee table

    #==================================================================================================
    #Query MongoDB to pull relevant data
    #==================================================================================================

    # try:
    #     bills_details = list(db.bills_details.find({'state':'ca', 'type': 'bill'},
    #         {'_id': 1, 'session':1, 'chamber': 1, 'sponsors': 1, 'sponsors.leg_id':1, 'scraped_subjects': 1, 'subjects':1, 'type': 1,
    #         'action_dates': 1, 'votes': 1, 'actions': 1}).limit(10000) )

    #     legis_details = list(db.legislators.find({'state': 'ca','level':'state'},
    #         {'_id': 1,'leg_id': 1,'party': 1,'district': 1,'active': 1 ,'chamber': 1}).limit(10000) )

    #     logger.info('Data succesfully obtained from MongoDB.\n')
    # except:
    #     logger.info('Something went with wrong Querying MongoDB.\n')
    #     pass

    # Both queries are capped at 5000 documents and project only the listed fields.
    bills_details = list(db.bills_details.find({'state':'ca', 'type': 'bill'}, 
        {'_id': 1, 'session':1, 'chamber': 1, 'sponsors': 1, 'sponsors.leg_id':1, 'scraped_subjects': 1, 'subjects':1, 'type': 1,
           'action_dates': 1, 'votes': 1, 'actions': 1}).limit(5000) )

    legis_details = list(db.legislators.find({'state': 'ca','level':'state'}, 
            {'_id': 1,'leg_id': 1,'party': 1,'district': 1,'active': 1 ,'chamber': 1}).limit(5000) )

    logger.info('Data succesfully obtained from MongoDB.\n')

    logger.info('Creating legis dataframe...........\n')
    df_legis = pd.DataFrame(legis_details)
    df_bills_d = pd.DataFrame(bills_details)
    logger.info('Finished creating DataFrame........\n')

    logger.info('Uploading median income by district data')
    fnames = np.array(['locations', 'district', 'chamber', 'med_ann_income'])
    # NOTE(review): absolute, user-specific path; this only runs on a machine
    # that has the file at exactly this location.
    income_df = pd.read_csv('/Users/ppchow/data_science/CA-leg-predict/Med_Family_Income_20082012.csv', names=fnames)
    # Right join: every legislator row is kept, with or without income data.
    legis_income_df = pd.merge(income_df, df_legis, on=['chamber', 'district'], how='right')
    legis_income_df = legis_income_df.drop(['_id', 'district', 'chamber'], axis=1)
    logger.info('Combined legislation and income dataframes')

    logger.info('Apply transformation to DataFrame......\n')
    # Derived columns are computed by helpers defined outside this function.
    df_bills_d['bill_duration'] = df_bills_d['action_dates'].apply(lambda lst: billDuration(lst))
    df_bills_d['bill_status'] = df_bills_d['actions'].map(lambda lst: billStatus(lst))
    df_bills_d['primary_sponsors'] = df_bills_d['sponsors'].map(lambda lst: primarySponsors(lst))
    df_bills_d['co_sponsors'] = df_bills_d['sponsors'].map(lambda lst: coSponsors(lst))
    # leg_id of the first listed sponsor; this indexing requires a non-empty
    # sponsors list on every row.
    df_bills_d['leg_id'] = df_bills_d['sponsors'].map(lambda lst: lst[0]['leg_id'])
    df_bills_d = df_bills_d.drop(['action_dates', 'actions', 'session', 'subjects', 'scraped_subjects', 'votes', 'type', 'sponsors'], axis = 1)
    df_bills_d.fillna(0, inplace = True)   
    df_bills_d_merged = pd.merge(legis_income_df, df_bills_d, on='leg_id', how='outer')
    print 'Prints Merged Dataframe', df_bills_d_merged
    logger.info('Done applying transformation to DataFrame........\n')


    #===============================================================================
    # APPLY NAIVE BAYES MODEL TO DATAFRAME
    #===============================================================================
    # NOTE(review): the results of the next three calls are discarded, so
    # they have no visible effect when run as a script.
    df_bills_d_merged.describe()
    df_bills_d_merged[df_bills_d_merged['bill_status'] == 1 ].describe()
    df_bills_d_merged.head()
    # Formula-based design matrices; '- 1' is written into the formula.
    y, X = dmatrices('bill_status ~ bill_duration + primary_sponsors + co_sponsors + locations + party - 1', data=df_bills_d_merged, return_type='dataframe')
    # The target is the 'bill_status[yes]' column of the response frame.
    yy = y['bill_status[yes]']

    # The model is scored below on the same data it was fitted on.
    clf = BernoulliNB().fit(X, yy)
    print clf.intercept_
    print math.exp(clf.intercept_)
    print 'NB Score/R2', clf.score(X,yy)

    print "Coefs", clf.coef_[0]

    # Column indices ordered by ascending coefficient.
    top = np.argsort(clf.coef_[0])
    print top
    print clf.coef_[0][top]
    print 'X.columns top', X.columns[top]
Exemplo n.º 49
0
def classify(data, Sensoren, classifier="Bayes"):
    """Train the selected classifier on a fixed split and log the results.

    Parameters:
        data: 2-D array; column 1 is used as the target.
        Sensoren: column selector for the feature columns of ``data``.
        classifier: one of "Bayes", "Gradient", "Linear", "LDA", "AdaBoost",
            "Forest", "SVM", "DecisionTree". Any other value prints a warning
            and falls back to ``GaussianNB``.

    The first 7000 rows are the training set and the remaining rows the test
    set. The number of misclassified test rows, the score and the confusion
    matrix are printed and appended to "History.txt".

    Returns:
        (clf, X_train, X_test, y_train, y_test)
    """
    # Fixed split. The former train_test_split call was removed because its
    # result was overwritten by these four assignments before any use.
    X_train = data[:7000, Sensoren]
    y_train = data[:7000, 1]
    X_test = data[7000:, Sensoren]
    y_test = data[7000:, 1]

    # Choice of classifier: name -> (label written to the history, factory).
    # Looked up by string EQUALITY; the previous ``is`` comparisons tested
    # object identity and only worked when the strings happened to be interned.
    choices = {
        "Bayes": ("Naive Bayes", lambda: BernoulliNB()),
        "Gradient": ("Gradient Decent", lambda: SGDClassifier()),
        "Linear": ("Linear Regression", lambda: linear_model.LinearRegression()),
        "LDA": ("LDA", lambda: LDA()),
        "AdaBoost": ("AdaBoost", lambda: AdaBoostClassifier(n_estimators=100)),
        "Forest": ("Forest", lambda: RandomForestClassifier(n_estimators=100)),
        "SVM": ("SVN", lambda: svm.SVC()),
        "DecisionTree": ("DecisionTree", lambda: tree.DecisionTreeClassifier(criterion="entropy")),
    }
    choice = choices.get(classifier)
    if choice is not None:
        label, make_clf = choice
        clf = make_clf()
    else:
        print("kein korrekter Klassifizierer gewawehlt,Naive Bayes wurde verwendet")
        label = "Fehler"
        clf = GaussianNB()
    history = "Klassifizierer: " + label + "\n"

    # Train the classifier
    clf.fit(X_train, y_train)

    # Count the test rows whose prediction differs from the target.
    predictions = clf.predict(X_test)
    misses = sum(1 for diff in (predictions - y_test) if diff)

    score = clf.score(X_test, y_test)
    confusionMatrix = confusion_matrix(y_test, predictions)

    print("Fehlerkennung: " + str(misses))
    print("Score: " + str(score))
    print(confusionMatrix)

    history = history + "Score: " + str(score) + "\n"
    history = history + "Fehlerkennung: " + str(misses) + "\n"
    history = history + "Confusionsmatrix: " + "\n"
    history = history + str(confusionMatrix) + "\n"

    # Append to the run history; the context manager closes the file even if
    # the write fails.
    with open("History.txt", "a") as fd:
        fd.write(history)

    return clf, X_train, X_test, y_train, y_test
def main():
	"""Train, validate and test a binary blog-text classifier from the command line.

	Options: -training (required CSV), -test (optional CSV), -c/--classifier,
	-top (number of informative features to print) and -trees (forest size
	for the 'rf' classifier). The classifier values handled below are 'nb',
	'log', 'svm', 'rf' and 'knn'; anything else raises
	Exception('Unrecognized classifier!').

	Prints training accuracy, 10-fold cross-validation scores and, depending
	on -test, either the prediction for one built-in sample text or a full
	test report (score, classification report, confusion matrix and a plot).
	"""
	parser = argparse.ArgumentParser()
	parser.add_argument('-training', required=True, help='Path to training data')
	parser.add_argument('-test', help='Path to test data')
	# NOTE(review): the help text lists three values, but 'rf' and 'knn' are
	# accepted further down as well.
	parser.add_argument('-c', '--classifier', default='nb', help='nb | log | svm')
	parser.add_argument('-top', type=int, help='Number of top features to show')
	parser.add_argument('-trees',type=int,help="Number of trees (if random forest for classifier)")
	opts = parser.parse_args()

	##### BUILD TRAINING SET ###################################
	# Initialize CountVectorizer
	vectorizer = CountVectorizer(binary=True, lowercase=True, decode_error='replace')

	# Load training text and training labels
	# (make sure to convert labels to integers (0 or 1, not '0' or '1')
	#  so that we can enforce the condition that label data is binary)

	# NOTE(review): ``count`` is not used again in this function.
	count = 0
	# NOTE(review): the training file is opened with mode 'rU' but the test
	# file below with 'rb'; confirm which Python version this targets.
	with open(opts.training, 'rU') as f:
		reader = csv.reader(f)
		train_data = list(reader)

	# Integer array of the right length; every slot is overwritten with the
	# real label in the loop below.
	train_labels = numpy.arange(len(train_data))
	train_text = []


	i = 0
	for blog in train_data:
		# Training rows: label in the first column, text in the second.
		label = blog[0]
		text = blog[1]

		train_text.append(text)
		train_labels[i] = int(label)
		i+=1

	print("ready to vectorize training data")
	# Get training features using vectorizer
	train_features = vectorizer.fit_transform(train_text)
	# Transform training labels to numpy array (numpy.array)
	print("done vectorizing")
	############################################################


	##### TRAIN THE MODEL ######################################
	# Initialize the corresponding type of the classifier and train it (using 'fit')
	if opts.classifier == 'nb':
		classifier = BernoulliNB(binarize=None)
		print("Naive Bayes")
	elif opts.classifier == 'log':
		classifier = LogisticRegression(C=.088)
		print("Log")
	elif opts.classifier == 'svm':
		classifier = LinearSVC()
		print("Support Vector Machine")
	elif opts.classifier == 'rf':
		# Default to 10 trees when -trees is missing (or 0).
		if not opts.trees:
			trees = 10
		else:
			trees = opts.trees
		classifier = RandomForestClassifier(n_estimators=trees)
		# The random forest is trained on a dense copy of the features.
		train_features = train_features.toarray()
	elif opts.classifier == 'knn':
		classifier = KNeighborsClassifier(n_neighbors=10)
	else:
		raise Exception('Unrecognized classifier!')
	classifier.fit(train_features,train_labels)
	############################################################


	###### VALIDATE THE MODEL ##################################
	# Print training mean accuracy using 'score'
	print(classifier.score(train_features,train_labels))
	scores = cross_validation.cross_val_score(classifier,train_features,train_labels,cv=10,scoring='accuracy')
	print("Cross Validation Scores Calculated")
	print(scores)
	print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
	############################################################


	##### EXAMINE THE MODEL ####################################
	if opts.top is not None:
		print("Got "+str(opts.top)+" tops")

		# print top n most informative features for positive and negative classes
		util.print_most_informative_features(opts.classifier, vectorizer, classifier, opts.top)
	############################################################


	##### TEST THE MODEL #######################################
	if opts.test is None:
		# No test file given: classify one built-in sample text.
		test_blog = "uses yahoo boss support search experience general web search perform query application set term candidates using key terms term its within result set its global measure similar 1ST_PERSON former colleagues 1ST_PERSON enterprise try yourself URL rough edges produces considering example 1ST_PERSON application explore learn 1ST_PERSON started 1ST_PERSON term 1ST_PERSON suggestions looked name caught 1ST_PERSON following 1ST_PERSON 1ST_PERSON again results 1ST_PERSON immediately had document further made clear someone 1ST_PERSON get home can_t you_ll experience 1ST_PERSON did 1ST_PERSON encourage"
		# Print the predicted label of the test blog
		features = vectorizer.transform([test_blog])

		if opts.classifier == 'rf':
			features = features.toarray()

		print("Prediction (1 == correct): ")
		print(classifier.predict(features))
		# Print the predicted probability of each label.
		if opts.classifier != 'svm':
			# Use predict_proba
			print("User predict prob ")
			print(classifier.predict_proba(features))

		else:
			# Use decision_function
			print("use decision ")
			print(classifier.decision_function(features))

	else:
		with open(opts.test, 'rb') as f:
			reader = csv.reader(f)
			test_data = list(reader)

		test_labels = numpy.arange(len(test_data))
		test_text = []


		i = 0
		for blog in test_data:
			# NOTE(review): the test text is read from the LAST column,
			# whereas the training text is read from column 1.
			label = blog[0]
			text = blog[-1]

			test_text.append(text)
			test_labels[i] = int(label)
			i+=1

		print("ready to vectorize testing data")
		# Get test features using the vectorizer fitted on the training text
		# NOTE(review): unlike the single-sample branch above, the features
		# are not densified here for the 'rf' classifier.
		test_features = vectorizer.transform(test_text)

		print("Score")
		print(classifier.score(test_features,test_labels))

		# Test the classifier on the given test set
		# Extract features from the test set and transform it using vectorizer

		# Print test mean accuracy

		# Predict labels for the test set
		predictions = classifier.predict(test_features)

		# Print the classification report
		print("Classification report")
		print(classification_report(test_labels,predictions))
		# Print the confusion matrix
		print("Classifier uses: Confusion!")
		print(confusion_matrix(test_labels,predictions))
		print("It's super effective!")

		# Get predicted label of the test set
		if opts.classifier != 'svm':
			print("Predicted Probability")
			test_predicted_proba = classifier.predict_proba(test_features)


			# NOTE(review): len() of a zip object only works where zip
			# returns a list (Python 2). ``num`` and ``counter`` are not used
			# by any live code below.
			blogs = zip(test_labels,predictions,test_predicted_proba,test_text)
			num = len(blogs)
			counter = 0
			"""for tup in reversed(sorted(blogs,key=lambda x:x[2][1])):

				if tup[0] == tup[1]:
					if counter < 5:
						print(tup)
					counter+=1
			counter = 0
			for tup in reversed(sorted(blogs,key=lambda x:x[2][0])):
				if tup[0] == tup[1]:
					if counter < 5:
						print(tup)
					counter+=1"""

			util.plot_roc_curve(test_labels, test_predicted_proba)


		else:
			print("Decision Function")
			decisions = classifier.decision_function(test_features)
			# NOTE(review): ``plt`` must already be imported at file level;
			# the local import below is commented out.
			#import matplotlib.pyplot as plt
			x = numpy.arange(0,len(decisions),1)
			plt.plot(x,decisions)
			plt.show()
Exemplo n.º 51
0
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pandas as pd

# Red-wine quality data: every column but the last is a feature, and the
# 'quality' column is the target.
df = pd.read_csv('dataset/winequality-red.csv', header=0, sep=';')

X = df[list(df.columns)[:-1]]
y = df['quality']
X_train, X_test, y_train, y_test = train_test_split(X, y)


def _fit_and_report(model, score_label):
    """Fit ``model`` on the train split, then print its test score and RMSE.

    ``score_label`` is printed verbatim in front of the score. Returns the
    test predictions and the mean squared error between them and ``y_test``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(score_label + str(model.score(X_test, y_test)))
    error = mean_squared_error(predictions, y_test)
    print("RMSE:" + str(error ** 0.5))
    return predictions, error


# The same procedure for the three naive Bayes variants. y_predict and mse
# are reassigned each time, so they end up holding the BernoulliNB results,
# exactly as before.
modelg = GaussianNB()
y_predict, mse = _fit_and_report(modelg, "GausseanNB Score:")

modelm = MultinomialNB()
y_predict, mse = _fit_and_report(modelm, "MultinomialNB Score:")

modelb = BernoulliNB()
y_predict, mse = _fit_and_report(modelb, "BernoulliNB Score: ")
Exemplo n.º 52
0
def main():
	"""Train a classifier on expensive/cheap data and cross-validate it.

	Options: -training_expensive and -training_cheap (required paths),
	-c/--classifier ('nb', 'log' or 'svm'), -top and -p (print flag).
	Training labels and features come from ``load_file``. When -p is truthy
	the training accuracy and the mean/std of a 10-fold cross validation are
	printed; otherwise nothing is printed by this part.

	Raises Exception('Unrecognized classifier!') for an unknown -c value.
	"""
	##### DO NOT MODIFY THESE OPTIONS ##########################
	parser = argparse.ArgumentParser()
	parser.add_argument('-training_expensive', required=True, help='Path to expensive training data')
	parser.add_argument('-training_cheap', required=True, help='Path to cheap training data')
	# parser.add_argument('-test', help='Path to test data')
	parser.add_argument('-c', '--classifier', default='nb', help='nb | log | svm')
	parser.add_argument('-top', type=int, help='Number of top features to show')
	# NOTE(review): with type=bool every non-empty value (even "False")
	# becomes True; only omitting -p leaves the flag off.
	parser.add_argument('-p', type=bool, default='', help='If true, prints out information')
	opts = parser.parse_args()
	############################################################
	# Note: anytime the print flag is set to '', you should not print anything out!

	##### BUILD TRAINING SET ###################################

	# Load training labels and features, then convert both to numpy arrays.
	(training_labels, training_features) = load_file(opts.training_expensive, opts.training_cheap)
	training_labels = numpy.array(training_labels)
	training_features = numpy.array(training_features)
	############################################################

	##### TRAIN THE MODEL ######################################
	# Pick the estimator, then fit once (the fit call was the same in every branch).
	if opts.classifier == 'nb':
		classifier = BernoulliNB(binarize=None)
	elif opts.classifier == 'log':
		classifier = LogisticRegression(penalty='l2')
	elif opts.classifier == 'svm':
		classifier = LinearSVC()
	else:
		raise Exception('Unrecognized classifier!')
	classifier.fit(training_features, training_labels)
	############################################################

	###### VALIDATE THE MODEL ##################################
	# Training mean accuracy (only computed when it will be printed).
	if opts.p:
		print("training mean accuracy using score " + str(classifier.score(training_features, training_labels)))

	# 10-fold cross validation with scoring='accuracy'.
	est_scores = cross_validation.cross_val_score(classifier, training_features, training_labels, scoring='accuracy', cv=10)

	mean_est_scores = numpy.mean(est_scores)
	std_est_scores = numpy.std(est_scores)

	if opts.p:
		print("10 fold cross training mean accuracy " + str(mean_est_scores))
		print("10 fold cross training standard deviation " + str(std_est_scores))
	test_texts = [item[0] for item in test_x_y]
	test_labels = [item[1] for item in test_x_y]

	# Get test features using vectorizer
	test_features = vectorizer.transform(test_texts)

	# Transform test labels to numpy array (numpy.array)
	test_labels = numpy.array(test_labels)

	predicted = classifier.predict(test_features)
	cm = confusion_matrix(test_labels, predicted)
	
	print cm

	cm_normalized = cm.astype('float') / cm.sum(axis=1)[:,numpy.newaxis]
	print cm_normalized

	plt.figure()
	plot_confusion_matrix(cm_normalized)
	plt.show()

	test_scores = classifier.score(test_features, test_labels)
	print 'Mean Score: ', test_scores.mean()

	############################################################



print 'Train Number: ', len(train_x_y)
print 'Test Number: ', len(test_x_y)
    NuSVC_classifier.fit(train_arrays, train_labels)
    print('NuSVC Accuracy: %.2f' %NuSVC_classifier.score(test_arrays, test_labels))
except:
    pass

# Each classifier is tried independently (best effort): a failure in one must
# not stop the others. It is now reported instead of being silently
# discarded, and KeyboardInterrupt/SystemExit are no longer swallowed.
try:
    MultinomialNB_classifier = MultinomialNB()
    MultinomialNB_classifier.fit(train_arrays, train_labels)
    print('MultinomialNB Accuracy: %.2f' %MultinomialNB_classifier.score(test_arrays, test_labels))
except Exception as err:
    print('MultinomialNB failed: %s' % err)

try:
    BernoulliNB_classifier = BernoulliNB()
    BernoulliNB_classifier.fit(train_arrays, train_labels)
    print('BernoulliNB Accuracy: %.2f' %BernoulliNB_classifier.score(test_arrays, test_labels))
except Exception as err:
    print('BernoulliNB failed: %s' % err)

try:
    GaussianNB_classifier = GaussianNB()
    GaussianNB_classifier.fit(train_arrays, train_labels)
    print('GaussianNB Accuracy: %.2f' %GaussianNB_classifier.score(test_arrays, test_labels))
except Exception as err:
    print('GaussianNB failed: %s' % err)

################################################################################# Confusion_matrix

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from itertools import cycle
Exemplo n.º 55
0
# Sweep the BernoulliNB `binarize` threshold and, for every value, fit the
# model on the training data, plot its ROC curve and print evaluation numbers
# for the test data.
#
# Earlier MultinomialNB experiment, left disabled:
#mnb = MultinomialNB()
#mnb.fit(tfidf_train, svm_train_tag)
#MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
#score = mnb.score(tfidf_test, svm_test_tag)
#print score
# Three evenly spaced thresholds: -0.007, -0.0075 and -0.008.
binarizes=np.linspace(-0.007,-0.008,3)#best -0.0075
for b in binarizes:
    # Class priors are passed explicitly rather than left at their default.
    # NOTE(review): scikit-learn documents that an explicit class_prior is not
    # adjusted from the data, so fit_prior=True presumably has no effect here
    # -- confirm for the installed version.
    bnb = BernoulliNB(alpha=1.0, binarize=b, class_prior=[0.5047619048,0.4952380952], fit_prior=True)
    bnb.fit(svm_train_data, svm_train_tag)
    # Hard class predictions and per-class probabilities for the test data.
    bnb_predict=bnb.predict(svm_test_data)
    test_score=bnb.predict_proba(svm_test_data)
    # NOTE(review): precision_recall_curve is given the hard predictions, not
    # continuous scores; index 1 of its outputs (printed below) is presumably
    # meant as the operating point of the predicted labels -- confirm this
    # holds when all predictions fall in a single class.
    precision, recall, thresholds = precision_recall_curve(svm_test_tag, bnb_predict)
    # ROC curve built from the second probability column.
    bnb_fpr,bnb_tpr,bnb_thr=roc_curve(svm_test_tag,test_score[:,1])
    bnb_auc = auc(bnb_fpr, bnb_tpr)
    # A new figure is opened and shown for every threshold in the sweep.
    plt.figure()
    plt.plot(bnb_fpr, bnb_tpr, lw = 1)
    #plt.legend(loc = 'lower right')
    plt.title("ROC curve of naive Bayes classifier")
    plt.show()
    #
    #BernoulliNB(alpha=1.0, binarize=0.5, class_prior=None, fit_prior=True)
    # Result of bnb.score on the test data for this threshold.
    score=bnb.score(svm_test_data,svm_test_tag)
    # Per-threshold report: threshold, confusion matrix, score, precision,
    # recall and ROC AUC, followed by a blank separator.
    print "BernoulliNB,",b
    print "confusion matrix:","\n",confusion_matrix(svm_test_tag, bnb_predict)
    print "score=",score
    print "precision=",precision[1]
    print "recall=",recall[1]
    print "auc=",bnb_auc
    print "\n"
Exemplo n.º 56
0
	def compare_sklearn(self, np_reps, gold):
		"""Fit scikit-learn's BernoulliNB on the data and score it on that same data.

		np_reps -- feature rows; converted with np.array before fitting.
		gold    -- labels, one per row; converted with np.array before fitting.

		Returns the value of the fitted model's ``score`` on (np_reps, gold),
		i.e. the result on the very data the model was trained on.
		"""
		features = np.array(np_reps)
		# The model is built with alpha=0, as in the reference comparison.
		model = BernoulliNB(alpha=0)
		labels = np.array(gold)
		# fit() returns the estimator itself, so the score call can be chained.
		return model.fit(features, labels).score(features, labels)
# Build bigram TF-IDF features, train three classifiers on at most the first
# 1000 training rows, and print each classifier's score on the test features.
#
# Alternative vectorizer configuration, left disabled:
#tf = TfidfVectorizer(sublinear_tf = True, analyzer='word', ngram_range=(1,3), lowercase=True, min_df=0, stop_words='english')
tf = TfidfVectorizer(ngram_range=(2,2), lowercase=True,min_df=1)
# The vocabulary is fitted on the training texts only and reused for the test
# texts; both results are converted with .toarray().
features_train = tf.fit_transform(features_train).toarray()
features_test  = tf.transform(features_test).toarray()

# Total number of feature-matrix entries, then total number of labels.
print(features_train.size + features_test.size)
print(len(labels_train)+ len(labels_test))

# Keep only the first 1000 training rows and their labels.
features_train = features_train[:1000]
labels_train   = labels_train[:1000]


###BernoulliNB
# The instance gets its own name so the BernoulliNB class stays callable for
# any later code (rebinding the class name to an instance would make a later
# BernoulliNB(...) call raise TypeError).
bernoulli_nb = BernoulliNB()
bernoulli_nb.fit(features_train,labels_train)
print("BernoulliNB_classifier accuracy percent:", (bernoulli_nb.score(features_test,labels_test)))

###trains a Naive Bayes Classifier
#classifier = NaiveBayesClassifier.train(trainFeatures)

###trains a MultinomialNB Classifier
# Same reasoning as above: do not shadow the MultinomialNB class.
multinomial_nb = MultinomialNB()
multinomial_nb.fit(features_train,labels_train)
print("MNB_classifier accuracy percent:", (multinomial_nb.score(features_test,labels_test)))

#LogisticRegression
lr = LogisticRegression()
lr.fit(features_train,labels_train)
print("Logistic Regression_classifier accuracy percent:",(lr.score(features_test,labels_test)))

######
# Train a Bernoulli naive Bayes classifier on the training features, report
# its training-set accuracy, then load the test CSV and start iterating over it.

# Convert the training features with .toarray() before fitting.
# NOTE(review): presumably this turns a sparse matrix into a dense array -- confirm.
train_data_features = train_data_features.toarray()

print train_data_features.shape

print "Training the random NB..."

# Initialize a Bernoulli naive Bayes classifier with alpha=1000.
# (No random forest / trees are involved, despite the printed message above.)
NB = BernoulliNB(alpha = 1000) 

# Fit the classifier on the training features, using train["label"]
# as the target.
NB = NB.fit( train_data_features, train["label"] )

# Accuracy measured on the same data the model was fitted on.
print('accuracy on the training set: %f' %NB.score( train_data_features, train["label"] ))

# Read the test data: first row is the header; quoting=3 corresponds to
# csv.QUOTE_NONE, so quote characters are not treated specially.
test = pd.read_csv("testDataFormated.csv", header=0, quoting=3 )

# Print the shape of the test frame.
# NOTE(review): an earlier comment expected 25,000 rows and 2 columns -- confirm
# against testDataFormated.csv.
print test.shape

# Number of test texts and the list intended to hold the cleaned versions.
num_test_data = len(test["text"])
clean_test_text = [] 

print "Cleaning and parsing the test set...\n"
# Print a progress message every 1000 rows.
# NOTE(review): in the visible code nothing is appended to clean_test_text;
# the cleaning step appears to be cut off after the progress message.
for i in xrange(0,num_test_data):
    if( (i+1) % 1000 == 0 ):
        print "test %d of %d\n" % (i+1, num_test_data)
Exemplo n.º 59
0
# Each sample is a (document, target) pair.  The vectorizer is fitted on the
# training documents only and then reused, unchanged, for the test documents.
train_docs = [text for text, _label in train_samples]
train_vectors = vectorizer.fit_transform(train_docs)
test_docs = [text for text, _label in test_samples]
test_vectors = vectorizer.transform(test_docs)

# Targets are pulled out in the same order as the documents above.
train_targets = [label for _text, label in train_samples]
test_targets = [label for _text, label in test_samples]

# <codecell>

# Bernoulli naive Bayes with its default settings.
classifier = BernoulliNB()

# <codecell>

classifier.fit(train_vectors, train_targets)

# <codecell>

# Score of the fitted classifier on the held-out samples.
classifier.score(test_vectors, test_targets)

# <codecell>

# A helper function to see which features affect the classification the most
def show_most_informative_features(vectorizer, classifier, n=10):
    neg = classifier.feature_log_prob_[0]
    pos = classifier.feature_log_prob_[1]
    valence = (pos - neg)
    ordered = np.argsort(valence)
    interesting = np.hstack([ordered[:n], ordered[-n:]])
    feature_names = vectorizer.get_feature_names()
    for index in ordered[:n]:
        print "%+4.4f\t%s" % (valence[index], feature_names[index])
    print '\t...'
    for index in ordered[-n:]:
Exemplo n.º 60
0
                                      #[trainFiles["neg"]["path"]+x for x in trainFiles["neg"]["files"]])
    xTrain = vectorizer.fit_transform([trainFiles["pos"]["path"]+x for x in trainFiles["pos"]["files"]]+
                                      [trainFiles["neg"]["path"]+x for x in trainFiles["neg"]["files"]])
    xTest = vectorizer.transform([testFiles["pos"]["path"]+x for x in testFiles["pos"]["files"]]+
                                      [testFiles["neg"]["path"]+x for x in testFiles["neg"]["files"]])

    
    clf =BernoulliNB(alpha=.01)
    #clf = MultinomialNB(alpha=.01)
  
    clf.fit(xTrain, [1]*max+[0]*max)
    print xTest.get_shape()
    y_score = clf.predict(xTest)
    y_prob = clf.predict_proba(xTest)
    y_test=[1]*199+[0]*199
    scores=clf.score(xTest,[1]*199+[0]*199)
    #scores=clf.score(y_prob[:,0],[1]*200+[0]*200)
    print roc_auc_score([1]*199+[0]*199, y_prob[:,1], average='macro', sample_weight=None)
    #from sklearn.externals import joblib
    joblib.dump(clf, 'pickle/bernouliAdjAdv.pkl')
    joblib.dump(vectorizer, 'pickle/vecAdjAdv.pkl')
  


# In[7]:

# Containers for ROC results.
# NOTE(review): fpr and tpr are created as empty dicts here but are rebound to
# the values returned by roc_curve a few lines below; roc_auc stays an empty
# dict in the visible code.
fpr = dict()
tpr = dict()
roc_auc = dict()
print 0
# ROC points from the second column of y_prob against 200 ones followed by
# 200 zeros; the third return value of roc_curve is discarded.
# NOTE(review): the code visible above pairs y_prob with 199 + 199 labels --
# confirm that 200 + 200 matches the number of rows in y_prob here.
fpr, tpr, _ = roc_curve([1]*200+[0]*200, y_prob[:,1])