Example #1
class GNB(object):
	def __init__(self):
		self.gnb = GaussianNB()
	def predict(self, X):
		return self.gnb.predict_proba(X)[:,1][:,np.newaxis]
	def fit(self, X, y):
		self.gnb.fit(X,y)
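A quick usage sketch for this wrapper (toy data; the GaussianNB import the snippet assumes is shown explicitly):

import numpy as np
from sklearn.naive_bayes import GaussianNB

model = GNB()
model.fit(np.array([[0.0], [1.0], [2.0], [3.0]]), np.array([0, 0, 1, 1]))
print(model.predict(np.array([[1.5]])))  # P(class 1) as an (n, 1) column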
Example #2
def test_gnb_priors():
    """Test whether the class prior override is properly used"""
    clf = GaussianNB(priors=np.array([0.3, 0.7])).fit(X, y)
    assert_array_almost_equal(clf.predict_proba([[-0.1, -0.1]]),
                              np.array([[0.825303662161683,
                                         0.174696337838317]]), 8)
    assert_array_equal(clf.class_prior_, np.array([0.3, 0.7]))
Example #3
def gnbmodel(d,X_2,y_2,X_3,y_3,X_test,y_test):
    X_3_copy = X_3.copy(deep=True)
    X_3_copy['chance']=0
    index = 0    
    
########## k-fold cross-validation ###########################
    scores = cross_val_score(GaussianNB(), X_2, y_2, cv=5, scoring='accuracy')
    score_mean = scores.mean()
    print(d + ' 5-fold cross-validation: ' + str(score_mean))
#################################################
    
    gnb = GaussianNB().fit(X_2,y_2)

################ predict on the test set ################
    answer_gnb = gnb.predict(X_test)
    accuracy = metrics.accuracy_score(y_test,answer_gnb)
    print(d + ' prediction: ' + str(accuracy))
###############################################
    
    chance = gnb.predict_proba(X_3)[:,1]
    for c in chance:
        X_3_copy.iloc[index,len(X_3_copy.columns)-1]=c
        index += 1
    chance_que = X_3_copy.iloc[:,len(X_3_copy.columns)-1]
    return chance_que
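The row-by-row loop above can be collapsed into one vectorized assignment; a minimal equivalent sketch using the same names as in gnbmodel:

# write P(class 1) for every row of X_3 at once
X_3_copy['chance'] = gnb.predict_proba(X_3)[:, 1]
chance_que = X_3_copy['chance']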
Example #4
def performNB(trainingScores, trainingResults, testScores):
	print("->Gaussian NB")
	X = []
	# any key works here: all score lists share the same length
	for currMark in trainingScores:
		pass
	for idx in range(0, len(trainingScores[currMark])):
		X.append([])

	for currMark in trainingScores:
		if "Asym" in currMark:
			continue
		print(currMark, end=' ')
		for idx in range(0, len(trainingScores[currMark])):
			X[idx].append(trainingScores[currMark][idx])

	X_test = []
	for idx in range(0, len(testScores[currMark])):
		X_test.append([])

	for currMark in trainingScores:
		if "Asym" in currMark:
			continue
		for idx in range(0, len(testScores[currMark])):
			X_test[idx].append(testScores[currMark][idx])
	gnb = GaussianNB()
	gnb.fit(X, np.array(trainingResults))
	y_pred = gnb.predict_proba(X_test)[:, 1]
	print "->Gaussian NB"
	return y_pred
Example #5
class GaussianColorClassifier(ContourClassifier):
    '''
    A contour classifier which classifies a contour
    based on its mean color in the BGR, HSV, and LAB colorspaces,
    using a Gaussian classifier for these features.

    For more usage info, see class ContourClassifier
    '''
    FEATURES = ['B', 'G', 'R', 'H', 'S', 'V', 'L', 'A', 'B']

    def __init__(self, classes, **kwargs):
        super(GaussianColorClassifier, self).__init__(classes, **kwargs)
        self.classifier = GaussianNB()

    def get_features(self, img, mask):
        mean = cv2.mean(img, mask)
        mean = np.array([[mean[:3]]], dtype=np.uint8)
        mean_hsv = cv2.cvtColor(mean, cv2.COLOR_BGR2HSV)
        mean_lab = cv2.cvtColor(mean, cv2.COLOR_BGR2LAB)
        features = np.hstack((mean.flatten(), mean_hsv.flatten(), mean_lab.flatten()))
        return features

    def classify_features(self, features):
        return self.classifier.predict(features)

    def feature_probabilities(self, features):
        return self.classifier.predict_proba(features)

    def train(self, features, classes):
        self.classifier.fit(features, classes)
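A minimal sketch of the same feature pipeline outside the class, on a synthetic image and mask (these names are illustrative, not part of the original code):

import cv2
import numpy as np
from sklearn.naive_bayes import GaussianNB

img = np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8)  # fake BGR image
mask = np.zeros((64, 64), dtype=np.uint8)
mask[16:48, 16:48] = 255  # fake contour mask

mean = np.array([[cv2.mean(img, mask)[:3]]], dtype=np.uint8)
feats = np.hstack((mean.flatten(),
                   cv2.cvtColor(mean, cv2.COLOR_BGR2HSV).flatten(),
                   cv2.cvtColor(mean, cv2.COLOR_BGR2LAB).flatten()))

clf = GaussianNB()
clf.fit(np.stack([feats, feats + 1.0]), [0, 1])  # two toy samples
print(clf.predict_proba([feats]))  # per-class probabilities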
Example #6
def trainData(username):
	"""
	Trains the data based on the users performance so far
	Returns a trained Gaussian Naive Bayes model and updates result collection
	"""
	X = getFeatures(username)
	Y = getClassList(username)
	
	trainX = np.array(X)
	trainY = np.array(Y)

	gnb = GaussianNB()
	gnb.fit(trainX, trainY)
	print "Score with Naive Bayes: ", gnb.score(trainX, trainY)

	testData = words.posts.find({}, {'id' : 1,
									'points' : 1,
									'diff' : 1,
									'_id' : 0})
	testData = map(lambda x : (x['id'], x['points'], x['diff']), testData)

	with warnings.catch_warnings():
		warnings.simplefilter('ignore')
		for data in testData:
			testWord = words.posts.find_one({'id' : data[0]}, {'word' : 1, '_id' : 0})['word']
			wordClass = setWordClass(gnb.predict_proba([data])[0])  # predict_proba expects a 2-D array
			classWord = result.posts.update_one({'username' : username}, {'$set' : {testWord : wordClass}}, upsert = True)  # update() is deprecated in pymongo
Example #7
def naiveBayesClassifierTraining(compounds_all):
    print("Building naive Bayes classifier (" + str(NB_FOLDS) + "-fold cross-validation)...")
    # get the data
    keys = compounds_all.keys()
    fingerprint_data = [compounds_all[cmpnd_id]['fingerprint'] for cmpnd_id in keys]
    fingerprint_data = numpy.asarray(fingerprint_data)
    activity_data = [compounds_all[cmpnd_id]['active'] for cmpnd_id in keys]
    activity_data = numpy.asarray(activity_data)

    # perform K-fold cross-validation
    classifier = GaussianNB()
    kfold_xv_strat = cross_validation.StratifiedKFold(activity_data, NB_FOLDS, indices=False)
    confusion_matrices = []
    probabilities = []
    scores = []
    models = []
    true_activities = []
    aucs = []
    for train, test in kfold_xv_strat:
        fingerprint_data_train = fingerprint_data[train]
        fingerprint_data_test = fingerprint_data[test]
        activity_data_train = activity_data[train]
        activity_data_test = activity_data[test]

        # model building (fresh estimator per fold so the saved models stay distinct)
        classifier = GaussianNB()
        classifier.fit(fingerprint_data_train, activity_data_train)

        # testing
        activity_data_predictions = classifier.predict(fingerprint_data_test)
        models.append(classifier)

        probability_estimates = classifier.predict_proba(fingerprint_data_test)
        probabilities.append(probability_estimates)

        scores.append(classifier.score(fingerprint_data_test, activity_data_test))

        activity_confusion_matrix = confusion_matrix(activity_data_test, activity_data_predictions)
        confusion_matrices.append(activity_confusion_matrix)

        true_activities.append(activity_data_test)

        # ROC curves
        fpr, tpr, thresholds = roc_curve(activity_data_test, probability_estimates[:, 1])
        aucs.append(auc(fpr, tpr))
    classifier.fit(fingerprint_data, activity_data)
    print "Done."
    return {
        'confusion_matrices' : confusion_matrices
        , 'probabilities' : probabilities
        , 'scores' : scores
        , 'models' : models
        , 'true_activity_data' : true_activities
        , 'AUCs' : aucs
        , 'fingerprint_data' : fingerprint_data
        , 'activity_data' : activity_data
        , 'final_model' : classifier
    }
Example #8
def bayseFilter(X,y):
    clf = GaussianNB()
    clf.fit(X,y)
    bayseX = clf.predict_proba(X)
    t = np.ones(bayseX.shape[0])    
    for i in range(0,bayseX.shape[1]):
        t = t*bayseX[:,i]
        
    bayseXfilter = t
    return bayseXfilter
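The class-column loop is just a row-wise product; with numpy it collapses to one call (same names as in the function):

# equivalent to the loop: product of the class probabilities per sample
bayseXfilter = np.prod(bayseX, axis=1)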
Example #9
class NaiveBayes:
    __theta = 0
    __sigma = 0

    def __init__(self):
        pass 
        #self.__new_data = 0

    def learning(self,x_data,y_data):
        self.rssi = np.loadtxt(x_data, delimiter=',')
        print(self.rssi)

        self.position = np.loadtxt(y_data, delimiter=',')
        print(self.position)

        self.gaussian_nb = GaussianNB()

        from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in 0.20
        rssi_train, rssi_test, position_train, position_test = train_test_split(self.rssi, self.position, random_state=0)

        self.gaussian_nb.fit(rssi_train,position_train)
        print("theta",self.gaussian_nb.theta_)
        print("sigma",self.gaussian_nb.sigma_)

        predicted = self.gaussian_nb.predict(rssi_test)

        print(metrics.accuracy_score(position_test, predicted))
    '''
    def set_params(self,theta,sigma):
        __theta = theta
        __sigma = sigma
        print __theta
        print __sigma
        '''

    def inference(self,r_data):
        self.predicted_class = self.gaussian_nb.predict(r_data)

        post_prob = self.gaussian_nb.predict_proba(r_data)
        log_prob = self.gaussian_nb.predict_log_proba(r_data)
        self.post_prob_float16 = post_prob.astype(np.float16)
        #E = 1*self.post_prob_float16[0][0]+2*self.post_prob_float16[0][1]+3*self.post_prob_float16[0][2]
        #var = (1*self.post_prob_float16[0][0]+4*self.post_prob_float16[0][1]+9*self.post_prob_float16[0][2])-E**2
        #print(self.post_prob_float16)
        #print(self.post_prob_float16[0])
        #print(var)
        print(self.predicted_class)
        #print(self.gaussian_nb.class_prior_)
        #print(log_prob)

        return self.predicted_class

    def output(self):
        output = graph.Graph()
        output.bar_graph(self.post_prob_float16[0])
Example #10
def nbayes(source, target):
    """ Naive Bayes Classifier
    """
    source = SMOTE(source)
    clf = GaussianNB()
    features = source.columns[:-1]
    klass = source[source.columns[-1]]
    clf.fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])
    return preds, distr[:,1]
Example #11
def main():
  train = p.read_table('../train.tsv').replace('?',0)
  # target = np.array(train)[:,-1]
  train['alchemy_category'] = train.groupby('alchemy_category').grouper.group_info[0]
  train['alchemy_category_score'] = train['alchemy_category_score'].astype(float)
  # train = np.array(train)[:,:-1]
  train = np.array(train)[:,3:]
  test = p.read_table('../test.tsv').replace('?',0)
  test['alchemy_category'] = test.groupby('alchemy_category').grouper.group_info[0]
  test['alchemy_category_score'] = test['alchemy_category_score'].astype(float)
  valid_index = list(np.array(test)[:,1])
  orig_test = np.array(test)[:,3:]
  test = train
  test = outlier(test,20)
  target = test[:,-1]
  test = test[:,:-1]
  print(len(test))
  r = []
  r.append([0,0.000])
  for j in range(1,10):
    n = int((8.5*len(train))/10)
    X_train = test[:n]
    X_test = test[n:]
    y_train = target[:n]
    y_test = target[n:]
    # run the model
    #classifier = RandomForestClassifier(n_estimators=1000,verbose=0,n_jobs=20,min_samples_split=5,random_state=1034324)
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    pred = classifier.predict_proba(X_test)
    fpr, tpr, thresholds = roc_curve(y_test,pred[:,1])
    roc_auc = auc(fpr, tpr)
    print("%d Area under the ROC curve : %f" %(i,roc_auc))
    r.append([j,roc_auc])
    plt.grid(True)
    #print r
    x = [i[0]*10 for i in r]
    y = [i[1]*100 for i in r]
    plt.plot(x,y,linewidth=3)
    plt.axis([0,100,0,100])
    plt.xlabel("training % data")
    plt.ylabel('Accuracy (CV score k=20)')
    plt.show()
  # gnb.fit(X_train, y_train)
  # pred = gnb.predict(X_test)
  # fpr, tpr, thresholds = roc_curve(y_test,pred)
  # roc_auc = auc(fpr, tpr)
  # print("Area under the ROC curve : %f" % roc_auc)

  # write
  writer = csv.writer(open("predictions", "w"), lineterminator="\n")
  rows = [x for x in zip(valid_index, classifier.predict(orig_test))]
  writer.writerow(("urlid","label"))
  writer.writerows(rows)
Example #12
def train_NB_model(trackset, training_set):
  useful_features = ['acousticness','danceability','instrumentalness','energy','speechiness','tempo','valence']
  X = training_set[useful_features]
  Y = training_set.status
  w = training_set.weight
  clf = GaussianNB()
  clf.fit(X, Y, sample_weight=w)
  predicts = pd.DataFrame(clf.predict_proba(trackset[useful_features]))
  predicts.columns = ['P_reject','P_accept']
  trackset['P_accept'] = predicts['P_accept'].values  # column assignment, not attribute assignment
  return trackset.sort_values(by=['P_accept'], ascending=False)
Example #13
def gNB(train_data, train_labels, test, save_result=False):
    log_state('Use Gaussian Naive Bayes classifier')
    clf = GaussianNB()
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    predict_proba = clf.predict_proba(test)
    if save_result:
        dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
        dump_picle(predict_proba, './data/predict_labels/predict_proba.p')
    logger.info('Classifier training complete, saved predict labels to pickle')
    return predict_labels
Example #14
def nb(data, yind, xind):
    model = NB()
    Y = data.iloc[:, yind]
    X = data.iloc[:, xind]
    model.fit(X, Y)
    # keep the last row 2-D so sklearn sees one sample, not a 1-D Series
    Z = model.predict(X.iloc[[-1], :])
    Z = Z.tolist()
    prob = model.predict_proba(X.iloc[[-1], :])
    prob = prob.tolist()
    classes = model.classes_.tolist()
    output = [Z, prob, classes]
    return output
Example #15
def trainModel(X_train, Y_train, X_test, Y_test, model="NB"):
    if model == "NB":
        clf = GaussianNB()
    elif model == "RF":
        clf = ensemble.RandomForestClassifier()
    elif model == "GB":
        clf = ensemble.GradientBoostingClassifier(learning_rate=0.1,
                                                  n_estimators=100, verbose=1)
    clf.fit(X_train, Y_train)
    Y_score = clf.predict_proba(X_test)
    auc = em.get_roc(Y_test, Y_score[:, 1])
    return clf, auc
Example #16
def naiveBayesModel(train_data, test_data, train_Y, test_Y):

    # Build Naive Bayes Model
    model = GaussianNB()
    model.fit(train_data, train_Y)
    # print(model)

    # Make predictions
    predicted = model.predict_proba(test_data)
    # print predicted[0:,1]
    print "Naive Bayes :"
    print 'Log Loss :', metrics.log_loss(test_Y, predicted[0:,1])
Example #17
def naive_bayes_crossval_network(title):
    csv = pandas.read_csv("data/cables2009WithRefAttributes.csv", sep=";")
    X, Y = get_xy_from_csv2(csv)
    fold_size = len(Y) // 10
    for fold in range(0, 10):
        if fold == 9:
            # the last fold also takes the remainder rows
            last = len(Y) - (fold + 1) * fold_size
        else:
            last = 0
        test = list(range(fold * fold_size, (fold + 1) * fold_size + last))
        train = list(set(range(len(Y))) - set(test))
        clf = GaussianNB()
        clf.fit(X[train, :], Y[train])
        if fold == 0:
            print("Naive bayes 10-fold crossval: 0", end=' ')
            probs = clf.predict_proba(X[test, :])
        else:
            print(fold, end=' ')
            probs = np.concatenate([probs, clf.predict_proba(X[test, :])])
    print(" ")
    plot_ROC_of_graph(0, 0, True, Y, probs, title)
Example #18
def main():
    args = getOptions()
    print("options:")
    print(args)
    fn = "nbsubmission.csv"
    print(fn)
    print("train file read")
    train_x, train_y = readfile(args.train,'train')
    print("test file read")
    test_x, test_y = readfile(args.test,'test')
    #remove features with no distinction and less importance
    print("remove features with no distinction and less importance")
    indices = [i for i in range(len(train_x[0]))]
    frqIndex = trimfrq(train_x)
    
    for i in frqIndex:
        indices.remove(i)
    train_x_uniq = indexTodata(train_x, indices)
    test_x_uniq = indexTodata(test_x, indices)
    
    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)
    
    #feature selection
    print "feature selection"
    ftsel = ExtraTreesClassifier()
    ftsel.fit(train_x_nor, train_y)
#     importances = ftsel.feature_importances_
#     indices_test = np.argsort(importances)[::-1]
#     indices_test = indices_test.tolist()
    train_x_trans = ftsel.transform(train_x_nor)
    test_x_trans = ftsel.transform(test_x_nor)
    
    #modeling
    print("modeling")
    clf = GaussianNB()
    clf.fit(train_x_trans, train_y)
    train_pdt = clf.predict(train_x_trans)
    MCC, Acc_p , Acc_n, Acc_all = get_Accs(train_y, train_pdt) 
    print "MCC, Acc_p , Acc_n, Acc_all(train): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))
    test_pdt = clf.predict_proba(test_x_trans)
#     MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt) 
#     print "MCC, Acc_p , Acc_n, Acc_all(test): "
#     print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))   
    
    fout=open(fn,'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(test_x[index][0])),str(test_pdt[index][1])))
    fout.close()
Example #19
    def fit_model_9(self,toWrite=False):
        model = GaussianNB()

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict_proba(X_test)[:,1]
            print("Model 9 score %f" % (logloss(Y_test,pred),))

        if toWrite:
            f2 = open('model9/model.pkl','wb')  # binary mode for pickle
            pickle.dump(model,f2)
            f2.close()
Example #20
class GaussianNaiveBayes(AbstractLearner):

    def __init__(self):
        self.learner = GaussianNB()

    def _train(self, x_train, y_train):
        self.learner = self.learner.fit(x_train, y_train)

    def _predict(self, x):
        return self.learner.predict(x)

    def _predict_proba(self, x):
        return self.learner.predict_proba(x)
Example #21
def Gaussian_NB_predict(new_train_data, new_train_labels, test_data, test_labels):

    # Create a classifier: a Gaussian naive Bayes
    classifier = GaussianNB()

    # We learn the digits on the first half of the data
    classifier.fit(new_train_data, new_train_labels)
    
    # Now predict the value of the digit on the second half:
    expected = test_labels
    predicted = classifier.predict_proba(test_data)

    return predicted
Example #22
def train_by_lr(conf,ctype):
    """
    
    Arguments:
    - `conf`:
    """
    #read train test y
    print "load data..."
    train,test,y,test_label = read_data(conf)
    train,test,y = np.array(train),np.array(test),np.array(y)

    print "train shape",train.shape
    print "test shape",test.shape

    print "norm"
    scaler = preprocessing.StandardScaler().fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)

    print "pca"
    pca = PCA(n_components=23,whiten=True)
    pca.fit(train)
    train = pca.transform(train)
    test = pca.transform(test)

    #clf = LogisticRegression(penalty='l2',dual=True,fit_intercept=False,C=2,tol=1e-9,class_weight=None, random_state=None, intercept_scaling=1.0)
    clf = GaussianNB()
    #clf = MultinomialNB()
    #clf = GradientBoostingClassifier(n_estimators=400)
    #clf = RandomForestClassifier(n_estimators=400)
    #clf = RandomForestClassifier(n_estimators=100,max_depth=8,min_samples_leaf=4,n_jobs=3)
    #clf = SGDClassifier(loss="log", penalty="l2",alpha=0.1)
    #clf = svm.SVC(C = 1.0, kernel = 'rbf', probability = True)
    if ctype == "cv":
        print "交叉验证"
        hehe = cross_validation.cross_val_score(clf,train,y,cv=3,scoring='roc_auc',n_jobs=-1)
        print hehe
        print np.mean(hehe)

    elif ctype =="predict":
        clf.fit(train,y)
        predict = clf.predict_proba(test)[:,1]

        if len(predict)!=len(test_label):
            print "predict!=test label"
            sys.exit(1)

        rf = open(conf["result_dir"],"w")
        rf.write("id,repeatProbability\n")
        for i in range(len(predict)):
            rf.write("%s,%s\n"%(test_label[i],predict[i]))
Example #23
def main():
    #create the training & test sets, skipping the header row with [1:]
    fnc = loadarff(open('Train/train_FNC_attrSelected.arff','r'))
    sbm = loadarff(open('Train/train_SBM_attrSelected.arff','r'))
    testf = genfromtxt(open('Test/test_FNC.csv','r'), delimiter=',', dtype='f8')[1:]
    tests = genfromtxt(open('Test/test_SMB.csv','r'), delimiter=',', dtype='f8')[1:]

    gnb = GaussianNB()
    # note: `iris` and `test` are not defined in this snippet; the FNC/SBM
    # arrays loaded above are unused by the lines below as written
    y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)
    predicted_probs = [[index + 1, x[1]] for index, x in enumerate(gnb.predict_proba(test))]

    savetxt('Data/submission.csv', predicted_probs, delimiter=',', fmt='%d,%f',
            header='MoleculeId,PredictedProbability', comments = '')
Example #24
def main(path):
    X_train, X_test, Y_train, Y_test, X_train_case, X_test_case, X_train_judge, X_test_judge = load_data(path)

    output_imp = pd.DataFrame(columns=['rf_imp','rf_name','rf_yerr','rf_case_imp','rf_case_name',
                                       'rf_case_yerr','rf_judge_imp','rf_judge_name','rf_judge_yerr'])

    col_names = X_train.columns.values
    col_names_case = X_train_case.columns.values
    col_names_judge = X_train_judge.columns.values

    ytest = pd.DataFrame(Y_test)
    ytest.to_csv('y_test.csv',index=False)

    rf = RandomForestClassifier(n_estimators=500, random_state=123, bootstrap=False).fit(X_train, Y_train)
    rf_case = RandomForestClassifier(n_estimators=200, random_state=123, bootstrap=False).fit(X_train_case,Y_train)
    rf_judge = RandomForestClassifier(n_estimators=500, random_state=123, bootstrap=False).fit(X_train_judge,Y_train)

    importances = rf.feature_importances_
    std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]
    output_imp.rf_name = col_names[indices[:20]]
    output_imp.rf_imp = importances[indices[:20]]
    output_imp.rf_yerr = std[indices[:20]]

    importances = rf_case.feature_importances_
    std = np.std([tree.feature_importances_ for tree in rf_case.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]
    output_imp.rf_case_name = col_names_case[indices[:20]]
    output_imp.rf_case_imp = importances[indices[:20]]
    output_imp.rf_case_yerr = std[indices[:20]]

    importances = rf_judge.feature_importances_
    std = np.std([tree.feature_importances_ for tree in rf_judge.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]
    output_imp.rf_judge_name = col_names_judge[indices[:20]]
    output_imp.rf_judge_imp = importances[indices[:20]]
    output_imp.rf_judge_yerr = std[indices[:20]]

    output_imp.to_csv('importance.csv',index=False)
    lr_l1 = LogisticRegression(penalty='l1', random_state=123).fit(X_train, Y_train)
    lr_l2 = LogisticRegression(penalty='l2', random_state=123).fit(X_train, Y_train)
    nb = GaussianNB().fit(X_train, Y_train)

    pred = [lr_l1.predict_proba(X_test)[:,1], lr_l2.predict_proba(X_test)[:,1],
            rf.predict_proba(X_test)[:,1], rf_case.predict_proba(X_test_case)[:,1],
            rf_judge.predict_proba(X_test_judge)[:,1],nb.predict_proba(X_test)[:,1]]

    labels = ['LR_L1','LR_L2','RF','RF_case','RF_judge','NB']
    output_data = pd.DataFrame(np.array(pred).T, columns = labels)
    output_data.to_csv('output_plot_auc.csv',index=False)
Example #25
def bayes_ROC(features, target):
	model = GaussianNB().fit(features,target)
	target_predicted_proba =  model.predict_proba(features)
	fpr, tpr, thresholds = roc_curve(target, target_predicted_proba[:, 1])
	roc_auc = auc(fpr, tpr)
	plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
	plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
	plt.xlim([0.0, 1.0])
	plt.ylim([0.0, 1.0])
	plt.xlabel('False Positive Rate or (1 - Specificity)')
	plt.ylabel('True Positive Rate or (Sensitivity)')
	plt.title('Receiver Operating Characteristic')
	plt.legend(loc="lower right")	
	plt.show()
Example #26
def nb_xyat_weight1(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["x"] = df["x"]
        df_new["y"] = df["y"]
        df_new["hour"] = df["hour"]
        df_new["weekday"] = df["weekday"]
        df_new["accuracy"] = df["accuracy"].apply(np.log10)
        return df_new
    logging.info("train nb_xyat_weight1 model")
    clf = GaussianNB()
    # the third positional argument is sample_weight: weight later events more
    clf.fit(prepare_feats(df_cell_train_feats), y_train, df_cell_train_feats["time"] ** 2)
    y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
    return y_test_pred
Example #27
def svm_classify(threshold):
    global data
    data = pd.DataFrame()
    # i=0
    # xprev=0
    # xprev2=0
    for x in cot.columns[:-1]:
        data[x] = cot[x] / cot[x].rolling(5).mean()  # pd.rolling_mean was removed from pandas
        # data[x+'_polynomial2']=data[x]*data[x]
        # data[x+'_polynomial3']=data[x]*data[x]*data[x]
        # if (xprev!=0):
        #    data[x+'_polynomial_x_2']=data[x]*data[xprev]
        # if (xprev2!=0):
        #    data[x+'_polynomial_x_3']=data[x]*data[xprev2]*data[xprev]
        # i=i+1
        # xprev=x
        # xprev2=xprev

    data["return"] = ((futures.shift(-4).Rate / futures.shift(-1).Rate) - 1) > 0
    data = data[8:].dropna(axis=1)
    x_train, x_test, y_train, y_test = train_test_split(data.iloc[:-1, :-1], data.iloc[:-1, -1], test_size=0.5)
    classifier = GaussianNB()  # SVC (kernel='linear',probability=True,C=1)
    classifier.fit(x_train, y_train)
    # min_max_scaler=MinMaxScaler()
    # mms=min_max_scaler.fit(list(max(a) for a in classifier.predict_proba(x_train)))
    pr = list(max(a) for a in classifier.predict_proba(x_test))
    Y = pd.DataFrame()
    Y["actual"] = y_test
    Y["predicted"] = classifier.predict(x_test)
    Y["P"] = list(max(a) for a in classifier.predict_proba(x_test))
    Y_filtered = Y[Y.P > threshold]
    cm = confusion_matrix(Y_filtered.actual, Y_filtered.predicted)
    # return [cm,'Prediction of UP is %s; P = %s' %(classifier.predict(data.iloc[-1:,:-1])[0],
    # list((max(x)) for x in classifier.predict_proba(data.iloc[-1:,:-1]))[0]
    # ),futures]
    cr = classification_report(Y_filtered.actual, Y_filtered.predicted)
    return [cm, cr]
Example #28
def decision_surface(first,second):
    """
    Draws a scatter plot for two features with decision surface for classifying persons into POI/not POI
    """
    
    features_list = ['poi',first,second]
    data_dict = pickle.load(open("final_project_dataset.pkl", "rb"))  # binary mode for pickle

    createFraction(data_dict)

    features = data_dict["TOTAL"]
    
    data_dict.pop("TOTAL",0)

    for i in features:
        poi,notpoi = gather_values(data_dict,i)
        print(i, round(poi.count("NaN")/18.0, 2), round(notpoi.count("NaN")/127.0, 2), poi.count("NaN") > 5)

    data = featureFormat(data_dict, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    from sklearn.naive_bayes import GaussianNB
    clf = GaussianNB()  # Provided to give you a starting point. Try a variety of classifiers.

    clf.fit(features,labels)
    predictions = clf.predict(features)

    from sklearn.metrics import classification_report
    print(classification_report(labels,predictions))

    x = data[:,1]
    y = data[:,2]
    color = data[:,0]
    
    xlim = (int(min(x)*0.9),int(max(x)*1.1))
    ylim = (int(min(y)*0.9),int(max(y)*1.1))

    import numpy as np
    xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 71),
                     np.linspace(ylim[0], ylim[1], 81))

    z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    z = z[:, 1].reshape(xx.shape)


    plt.scatter(x,y,c=color,s=50)
    plt.contour(xx,yy,z,[0.5],colors="k")

    plt.show()
Example #29
def nbayes(source, target):
    """ Naive Bayes Classifier
    """
    clf = GaussianNB()
    source.loc[source[source.columns[-1]] > 0, source.columns[-1]] = 1
    source.loc[source[source.columns[-1]] < 1, source.columns[-1]] = 0
    # set_trace()
    # source = SMOTE(source, k=1)
    # set_trace()
    features = source.columns[:-1]
    klass = source[source.columns[-1]]
    clf.fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])
    return preds, distr[:,1]
Example #30
def test_gnb():
    """
    Gaussian Naive Bayes classification.

    This checks that GaussianNB implements fit and predict and returns
    correct values for a simple toy dataset.
    """

    clf = GaussianNB()
    y_pred = clf.fit(X, y).predict(X)
    assert_array_equal(y_pred, y)

    y_pred_proba = clf.predict_proba(X)
    y_pred_log_proba = clf.predict_log_proba(X)
    assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8)
Example #31
class Classifier:
    def __init__(self, method):
        if method == 'knn':
            self.name = 'knn_classifier'
            self.fit = self._knn_fit
            self.predict = self._knn_predict
            self.predict_proba = self._knn_predict_proba
        elif method == 'random_forest':
            self.name = 'random_forest_classifier'
            self.fit = self._randomf_fit
            self.predict = self._randomf_predict
            self.predict_proba = self._randomf_predict_proba
        elif method == 'bayes':
            self.name = 'naive_bayes_classifier'
            self.fit = self._bayes_fit
            self.predict = self._bayes_predict
            self.predict_proba = self._bayes_predict_proba
        elif method == 'tree':
            self.name = 'decision_tree_classifier'
            self.fit = self._tree_fit
            self.predict = self._tree_predict
            self.predict_proba = self._tree_predict_proba
        elif method == 'svc':
            self.name = 'support_vector_classification'
            self.fit = self._svc_fit
            self.predict = self._svc_predict
            self.predict_proba = self._svc_predict_proba
        elif method == 'linearsvc':
            self.name = 'linear_support_vector_classification'
            self.fit = self._lsvc_fit
            self.predict = self._lsvc_predict
            self.predict_proba = self._lsvc_predict_proba
        elif method == 'logisticregression':
            self.name = 'logistic_regression'
            self.fit = self._lr_fit
            self.predict = self._lr_predict
            #self.predict_proba = self._lsvc_predict_proba
        else:
            print('Classifying method not found')
            sys.exit(-1)

    def _knn_fit(self, X, y):
        print('Training the knn classifier...')
        self._classifier = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
        self._classifier.fit(X, y)
        print('Done!')

    def _knn_predict(self, X):
        predictions = self._classifier.predict(X)
        values, counts = np.unique(predictions, return_counts=True)
        return values[np.argmax(counts)]

    def _knn_predict_proba(self, X):
        pred_probabilities = self._classifier.predict_proba(X)
        print('Predicted probabilities')
        return pred_probabilities

    def _lr_fit(self, X, y):
        print('Training the logistic regression...')
        self._classifier = LogisticRegression()
        self._classifier.fit(X, y)
        print('Done!')

    def _lr_predict(self, X):
        predictions = self._classifier.predict(X)
        values, counts = np.unique(predictions, return_counts=True)
        return values[np.argmax(counts)]

    def _randomf_fit(self, X, y):
        print('Training the Random forest classifier...')
        self._classifier = RandomForestClassifier(max_depth=3,
                                                  random_state=0,
                                                  n_jobs=-1)
        self._classifier.fit(X, y)
        print('Done!')

    def _randomf_predict(self, X):
        predictions = self._classifier.predict(X)
        values, counts = np.unique(predictions, return_counts=True)
        return values[np.argmax(counts)]

    def _randomf_predict_proba(self, X):
        pred_probabilities = self._classifier.predict_proba(X)
        print('Predicted probabilities')
        return pred_probabilities

    def _bayes_fit(self, X, y):
        print('Training the Gaussian Naive Bayes classifier...')
        self._classifier = GaussianNB()
        self._classifier.fit(X, y)
        print('Done!')

    def _bayes_predict(self, X):
        predictions = self._classifier.predict(X)
        values, counts = np.unique(predictions, return_counts=True)
        return values[np.argmax(counts)]

    def _bayes_predict_proba(self, X):
        pred_probabilities = self._classifier.predict_proba(X)
        print('Predicted probabilities')
        print(pred_probabilities)
        return pred_probabilities

    def _tree_fit(self, X, y):
        print('Training the Decision tree classifier...')
        self._classifier = tree.DecisionTreeClassifier()
        self._classifier.fit(X, y)
        print('Done!')

    def _tree_predict(self, X):
        predictions = self._classifier.predict(X)
        values, counts = np.unique(predictions, return_counts=True)
        return values[np.argmax(counts)]

    def _tree_predict_proba(self, X):
        pred_probabilities = self._classifier.predict_proba(X)
        print('Predicted probabilities')
        return pred_probabilities

    def _svc_fit(self, X, y):
        print('Training the Support Vector classifier...')
        # ovr = one vs. rest | ovo = one vs. one
        # probability=True is required for predict_proba to be available
        self._classifier = svm.SVC(decision_function_shape='ovr', probability=True)
        self._classifier.fit(X, y)
        print('Done!')

    def _svc_predict(self, X):
        predictions = self._classifier.predict(X)
        values, counts = np.unique(predictions, return_counts=True)
        return values[np.argmax(counts)]

    def _svc_predict_proba(self, X):
        pred_probabilities = self._classifier.predict_proba(X)
        print('Predicted probabilities')
        return pred_probabilities

    def _lsvc_fit(self, X, y):
        print('Training the Linear Support Vector classifier...')
        self._classifier = svm.LinearSVC()
        self._classifier.fit(X, y)
        print('Done!')

    def _lsvc_predict(self, X):
        predictions = self._classifier.predict(X)
        values, counts = np.unique(predictions, return_counts=True)
        return values[np.argmax(counts)]

    def _lsvc_predict_proba(self, X):
        # note: LinearSVC does not implement predict_proba, so this call will raise
        pred_probabilities = self._classifier.predict_proba(X)
        print('Predicted probabilities')
        return pred_probabilities
Example #32
data.shape, iris.target.shape
((150, 4), (150, ))

X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    test_size=0.5,
                                                    random_state=0)

X_train.shape, y_train.shape
((75, 4), (75, ))
X_test.shape, y_test.shape
((75, 4), (75, ))

classifier = GaussianNB()
model = classifier.fit(X_train, y_train)
y = classifier.predict_proba(X_train)
print(y)
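# each row of predict_proba is a probability distribution over the three
# iris classes; a quick sanity check: y.sum(axis=1) is (numerically) all 1.0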

abc = classifier.predict(X_test)
print(abc)
print(metrics.accuracy_score(y_test, abc))

sl = LabelEncoder()
r_data = np.array(sl.fit_transform(data.Species))

vpred = cross_val_predict(classifier, iris.data, iris.target, cv=5)
print(vpred)
print(metrics.accuracy_score(iris.target, vpred))

x1 = metrics.mean_absolute_error(r_data, vpred)
x12 = math.sqrt(metrics.mean_squared_error(r_data, vpred))  # RMSE, i.e. sqrt of the MSE
Example #33
		x_train, x_test, y_train, y_test = train_test_split(selected, labels, random_state=0)
		y_train = y_train.ravel()
		y_test_hot = label_binarize(y_test, classes=range(1, 10))
		knn = KNeighborsClassifier(n_neighbors=3)
		knn.fit(x_train, y_train)
		knn_score = knn.predict_proba(x_test)
		knn_fpr, knn_tpr, knn_thresholds = roc_curve(y_test_hot.ravel(), knn_score.ravel())
		knn_auc = auc(knn_fpr, knn_tpr)
		knn_accuracy = knn.score(x_test, y_test)
		accuracy[0].append(knn_accuracy)
		AUC[0].append(knn_auc)
		print("KNN分类精度:", knn_accuracy)
		print("AUC值:", knn_auc)
		NB = GaussianNB()
		NB.fit(x_train, y_train)
		NB_score = NB.predict_proba(x_test)
		NB_fpr, NB_tpr, NB_thresholds = roc_curve(y_test_hot.ravel(), NB_score.ravel())
		NB_auc = auc(NB_fpr, NB_tpr)
		NB_accuracy = NB.score(x_test, y_test)
		accuracy[1].append(NB_accuracy)
		AUC[1].append(NB_auc)
		print("NB分类精度:", NB_accuracy)
		print("AUC值:", NB_auc)
		clf = svm.SVC(probability=True)
		clf.fit(x_train, y_train)
		svm_score = clf.predict_proba(x_test)
		svm_fpr, svm_tpr, svm_thresholds = roc_curve(y_test_hot.ravel(), svm_score.ravel())
		svm_auc = auc(svm_fpr, svm_tpr)
		svm_accuracy = clf.score(x_test, y_test)
		accuracy[2].append(svm_accuracy)
		AUC[2].append(svm_auc)
Example #34
# Save the full_test file as a Pandas Dataframe
file = response["Body"].read()
test = pd.read_csv(io.BytesIO(file), delimiter=",")


# Fill Nan values with 0
test = test.fillna(0)


# Assign Features Columns from Test dataset to variables 'features' to be used
# in predicting the targets
features = list(test.values[:, 4:])


# Predict Target Probability for the test
target_pred = clf.predict_proba(features)


# Create a for loop to predict each row of the final test and save it to
# final_pred dataframe
final_pred = []
for i in range(len(test)):
    # print(i)
    test_id = str(test.id.iloc[i])

    # predict_proba returns an array with a probability for each class
    # (0 and 1 in our case); we only use the probability of class 1,
    # which is the second element of the array

    predicted_rating = target_pred[i][1]
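The loop is only needed to pair each probability with its test id; the class-1 probabilities themselves can be taken in one slice (same names as above):

# probability of class 1 for every test row at once
predicted_ratings = target_pred[:, 1]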
Example #35
# 1/5 (20%) test
from sklearn.model_selection import train_test_split
x = df.iloc[:, 0:4]  # features # ending index is exclusive
y = df.iloc[:, 4]  # target (index:5)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Fit - Train Naive Bayes
print('Train Naive Bayes')
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(x_train, y_train)

# Predict / Test
print('Predict')
y_pred = clf.predict(x_test)  # Predicted classes
y_pred_prob = clf.predict_proba(x_test)[:, 1]  # Probability

# Accuracy Score
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print('\nAccuracy Score: ' + str(accuracy))

print('\nConfusion Matrix - Check Accuracy')
# Confusion Matrix - Check Accuracy
confusion_matrix = pd.crosstab(y_test,
                               y_pred,
                               rownames=['Actual'],
                               colnames=['Prediction'])
print(confusion_matrix)

# Precision, recall and f1-score
    print("\n")

    print("W2V Gaussian Naive Bayes")

    # Compute accuracy
    accuracy = metrics.accuracy_score(t, p, normalize=False)
    print("Accuracy: ", (accuracy / len(t)) * 100)

    # Confusion matrix
    confusion_matrix = metrics.confusion_matrix(t, p)
    print("Confusion Matrix:\n", confusion_matrix)

    # Replace 4s with 1s
    t[np.where(t == 4)] = 1
    p[np.where(p == 4)] = 1

    y_score = clf.predict_proba(z)

    # Plot the Precision-Recall curve
    precision, recall, _ = metrics.precision_recall_curve(t, y_score[:, 1])
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    average_precision = metrics.average_precision_score(t, p)
    plt.title('W2V Gaussian NB Precision-Recall curve: AP={0:0.2f}'.format(
        average_precision))
    plt.savefig('data/w2v_GaussianNB_precisionRecall.png')
    plt.show()
Example #37
#Decision tree
from sklearn.tree import DecisionTreeClassifier

rf3 = DecisionTreeClassifier()
rf3.fit(X_train, y_train)
y_val_pred3 = rf3.predict_proba(X_val)
y_val_pred_acc3 = rf3.predict(X_val)
print(log_loss(y_val, y_val_pred3))
print(accuracy_score(y_val, y_val_pred_acc3))

#Naive Bayes
from sklearn.naive_bayes import GaussianNB

rf4 = GaussianNB()
rf4.fit(X_train, y_train)
y_val_pred4 = rf4.predict_proba(X_val)
y_val_pred_acc4 = rf4.predict(X_val)
print(log_loss(y_val, y_val_pred4))
print(accuracy_score(y_val, y_val_pred_acc4))

#Bagging
from sklearn.ensemble import BaggingClassifier

rf5 = BaggingClassifier()
rf5.fit(X_train, y_train)
y_val_pred5 = rf5.predict_proba(X_val)
y_val_pred_acc5 = rf5.predict(X_val)
print(log_loss(y_val, y_val_pred5))
print(accuracy_score(y_val, y_val_pred_acc5))

#KNN
Example #38
df = pd.read_csv("~/Desktop/My DM/Baltimore/Baltimore.csv", low_memory=False)

features = [
    "Month of the Crime", "Mean Temperature", "Mean Dew Point",
    "Mean Visibility", "Max Humidity", "Mean Wind Speed", "Max Sea Level"
]

x = df[features]
y = df["Crime Type"]

print('Partial Fit - training classifier')
clf_pf = GaussianNB()
clf_pf.partial_fit(x, y, np.unique(y))

print('--Cross Validation--')
scores = cross_validation.cross_val_score(clf_pf, x, y, cv=5)
print(scores.mean())

print('--Random Split--')
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
    x, y, test_size=0.2, random_state=0)
clf1 = GaussianNB().fit(X_train, Y_train)
print(clf1.score(X_test, Y_test))

# Test file
df_test = pd.read_csv("~/Desktop/My DM/Baltimore/Test_Baltimore.csv",
                      low_memory=False)
xt = df_test[features]
print('Partial Fit Predicted - ' + str(clf_pf.predict(xt)))
print('Predict Probability - ' + str(clf_pf.predict_proba(xt)))
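Note that partial_fit is given np.unique(y) on the first call: the complete label set must be declared up front because later mini-batches may not contain every class. Further batches can then be streamed; a sketch with hypothetical extra data x_more, y_more:

# classes are already registered, so later batches can omit the argument
clf_pf.partial_fit(x_more, y_more)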
Example #39
    dropSimilarity = [
        p for col, p in zip(namesToPlot, densitySimilarity) if p > th
    ]
    #g = sns.FacetGrid(df, hue='Class')
    X = df[namesToPlot].drop(dropList, axis=1)
    y = df['Class']

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42,
                                                        stratify=y)

    clf_nb = GaussianNB()

    clf_nb.fit(X_train, y_train)

    y_pred = clf_nb.predict(X_test)
    y_pred_prob = clf_nb.predict_proba(X_test)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print('balanced accuracy score', balanced_accuracy_score(y_test, y_pred))
    print(roc_auc_score(1 - y_test, y_pred_prob[:, 0]))
    print(roc_auc_score(y_test, y_pred_prob[:, 1]))
    accuracy.append(balanced_accuracy_score(y_test, y_pred))

plt.figure(3, figsize=(6, 6))

plt.plot(thresh, accuracy)
plt.xlabel('thresh')
plt.ylabel('accuracy score')
Example #40
decTreeClassifier.fit(X_train, y_train)
predTree = decTreeClassifier.predict(X_test)
probaTree = decTreeClassifier.predict_proba(X_test)
probaTree = probaTree[:, 1]

# Look to change n_estimators (trees), criterion, max_depth and others
randomForest = RandomForestClassifier(random_state=24)
randomForest.fit(X_train, y_train)
predForest = randomForest.predict(X_test)
probaForest = randomForest.predict_proba(X_test)
probaForest = probaForest[:, 1]

gaussNB = GaussianNB()  # GaussianNB takes no random_state parameter
gaussNB.fit(X_train, y_train)
predGauss = gaussNB.predict(X_test)
probaGauss = gaussNB.predict_proba(X_test)
probaGauss = probaGauss[:, 1]

names = ["Logistic", "Gaussian", "RanForest", "DecTree", "SVC", "KNN"]
predictions = {"Logistic" : [probaLog, predLog], "Gaussian" : [probaGauss, predGauss], "RanForest" : [probaForest, predForest], "DecTree" : [probaTree, predTree],\
         "SVC": [probaSVC, predSVC],"KNN" : [probaNeigh, predNeigh]}

auc_results = {}  # store ROC-AUC results
accuracy_results = {}
class_report_results = {}


def metrics_calculator(y_test, predictions, names):
    for name in names:
        auc_score = roc_auc_score(y_test, predictions[name][0])
        accuracy = accuracy_score(y_test, predictions[name][1])
Example #41
        rec_loss_train[epoch] /= N_batch
        kl_loss_train[epoch] /= N_batch
        total_loss_train[epoch] /= N_batch

        codings_val = sess.run([code],
                               {data: config.reshape([-1, size, size])})[0]

    output_config = sess.run([decoder],
                             {data: config.reshape([-1, size, size])})[0]

#-----------------------gaussian naive bayes classification score---------------------
clf = GaussianNB()
clf.fit(codings_val, labels_actual)
MAP_score = np.round(clf.score(codings_val, labels_actual), 4)
print('classification score = ', MAP_score)
predict = clf.predict_proba(codings_val)

#sort the labels in descending order of probability, then add 1 (because it was counting from 0)
sorted_predict = np.argsort(-predict) + 1

#calculate hamming distance between predicted history and actual history
hamming_dist_all = np.zeros(N)

for i in range(N):
    hamming_dist_all[i] = distance.hamming(field_history[i], sorted_predict[i])

hamming_dist = np.mean(hamming_dist_all)

#-------------------------------------------------plotting code-----------------------------------------------------

idx = np.random.choice(test_index, 12)
Example #42
#Plot precision rate of each method
index = 0
for method in method_list.loc[0, :]:
    clf = method
    clf.fit(xtrain, ytrain)
    buypredicted = clf.predict_proba(xtest)
    precision, recall, threshold = precision_recall_curve(
        ytest, buypredicted[:, 1])
    plot_precision_recall_vs_threshold(index, stock, method_list, precision,
                                       recall, threshold)
    plt.show()
    index = index + 1
#%%       Naive Bayes
clfbuy = GaussianNB(var_smoothing=1)
clfbuy.fit(xtrain, ytrain)
buypredicted = clfbuy.predict_proba(xshow)
dfplot = pd.DataFrame()
dfplot.loc[:, 'Close'] = rawdata
dfplot.loc[:, 'GoodBuyProb'] = buypredicted[:, 1]
plot_buy('Naive Bayes', dfplot, stock, 0.9, 1, 0.03)
#%%  SVM
clfbuy = svm.SVC(C=1, kernel='linear', probability=True)
clfbuy.fit(xtrain, ytrain)
buypredicted = clfbuy.predict_proba(xshow)
dfplot = pd.DataFrame()
dfplot.loc[:, 'Close'] = rawdata
dfplot.loc[:, 'GoodBuyProb'] = buypredicted[:, 1]
plot_buy('SVM Linear', dfplot, stock, 0.9, 0.99, 0.02)
#%%  SVM
clfbuy = svm.SVC(C=1, probability=True)
clfbuy.fit(xtrain, ytrain)
Example #43
clf.partial_fit(iris.data, iris.target, classes=[0, 1, 2])

'''

# some of the model's parameters after learning
clf.set_params(
    priors=[0.333, 0.333,
            0.333])  # set the per-class priors here; if not set, clf.priors returns None (not sure why?)
print(clf.priors)  # the prior probability of each class label
print(clf.class_prior_)  # also the per-class priors; priors returns a list, class_prior_ an array
print(clf.get_params(deep=True))  # dict of priors and the other parameter values

print(clf.class_count_)  # number of training samples in each class
print(clf.theta_)  # per-class mean of each feature
print(clf.sigma_)  # per-class variance of each feature

#test data
data_test = np.array([6, 4, 6, 2])
data = data_test.reshape(1, -1)
Result_predict = clf.predict(data)
Score = clf.score([[6, 8, 5, 3], [5, 3, 4, 2], [4, 6, 7, 2]], [2, 0, 1],
                  sample_weight=[0.3, 0.5, 0.2])

Result_predict_proba = clf.predict_proba(data)
Result_predict_log_proba = clf.predict_log_proba(data)
print(Result_predict)  # predicted class
print(Result_predict_proba)  # predicted probability of each class for the test sample
print(Result_predict_log_proba)  # log of those predicted probabilities
print(Score)  # mean accuracy on the given test samples and labels
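When priors are not supplied, GaussianNB derives class_prior_ empirically from the training data; a quick check of that relationship using the attributes printed above:

# empirical priors: per-class sample counts over the total count
print(clf.class_count_ / clf.class_count_.sum())  # matches clf.class_prior_ when priors=None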
Example #44
    #print(testdata.shape)
    #print(traindata.shape)
    #X represents model input, Y represents binary labels
    traindataX = traindata.iloc[:, 4:622]
    traindataY = traindata.iloc[:, 622]
    testdataX = testdata.iloc[:, 4:622]
    testdataY = testdata.iloc[:, 622]

    #Create Naive Bayes Model
    gnb = GaussianNB()

    #train model
    gnb.fit(traindataX, traindataY)
    #print("Model started")

    predictionsproba = gnb.predict_proba(testdataX)[:, 1]
    #print(roc_auc_score(testdataY, predictionsproba))
    AUC.append(roc_auc_score(testdataY, predictionsproba))
    globpred += predictionsproba.tolist()
    globy_test += testdataY.tolist()

#print out AUC and AUC graph
print "The AUC is"
print(roc_auc_score(globy_test, globpred))
#print np.mean(AUC)
false_positive_rate, true_positive_rate, thresholds = roc_curve(
    globy_test, globpred)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('Receiver Operating Characteristic GNB Custom Split')
plt.plot(false_positive_rate,
         true_positive_rate,
Example #45
The naive assumption is that all the features are independent.
"""
"""
Probably the easiest naive Bayes classifier to understand is the Gaussian one.
The assumption in this classifier is that the data of every category are drawn
from a simple normal distribution (with no covariance between the dimensions).
"""
fig, ax = plt.subplots()
X, y = make_blobs(100, 2, centers=2, random_state=2, cluster_std=1.5)
ax.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='RdBu')
plt.savefig('images\\gaussian1')

model = GaussianNB()
model.fit(X, y)

rng = np.random.RandomState(0)
Xnew = [-6, -14] + [14, 18] * rng.rand(2000, 2)
ynew = model.predict(Xnew)

fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='RdBu')
ax.scatter(Xnew[:, 0], Xnew[:, 1], c=ynew, s=20, cmap='RdBu', alpha=0.1)
plt.savefig('images\\gaussian2')
"""
Положительная сторона этого байесовского формального представления 
заключается в возможности естественной вероятности классификатора
"""
yprob = model.predict_proba(Xnew)
print(yprob.round(2))

plt.show()
Example #46
        rm = comments['replies'].max()
        if lm != 0:
            comments['likes'] = comments['likes'] / lm
        if rm != 0:
            comments['replies'] = comments['replies'] / rm
        comments['sum'] = comments['likes'] + comments['replies']
        comments = comments.sort_values('sum', ascending=False)
        comments = comments.reset_index(drop=True)
        ver_kw = pd.read_csv('./verification-keywords')
        ver_kw = ver_kw.columns
        comments['ver'] = 0
        for com in range(len(comments)):
            ver = 0
            for word in comments.loc[com, 's_text']:
                if word in ver_kw:
                    ver = 1
            comments.loc[com, 'ver'] = ver
        comments.sort_values('ver', ascending=False)
        comments['len'] = comments['text'].apply(len)
        # The file which contains the comment data (pandas):
        # if the file does not exist, write the header

X_test = comments[['len', 'likes', 'replies']]
Y_test = comments['ver']

print(model.predict(X_test))
print(model.predict_proba(X_test))
pred = model.predict_proba(X_test)
pred = pd.DataFrame(pred)
comments['result'] = pred.iloc[:, 1]  # keep only the positive-class probability column
Example #47
#!/usr/bin/env python
# -*- coding=utf-8 -*-
__author__ = "柯博文老師 Powen Ko, www.powenko.com"
from sklearn.naive_bayes import GaussianNB
import numpy as np

X = np.array([[9, 9], [9.2, 9.2], [9.6, 9.2], [9.2, 9.2], [6.7, 7.1], [7, 7.4],
              [7.6, 7.5], [7.2, 10.3], [7.3, 10.5], [7.2, 9.2], [7.3, 10.2],
              [7.2, 9.7], [7.3, 10.1], [7.3, 10.1]])
Y = np.array([1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2])

model = GaussianNB()
model.fit(X, Y)
print(model.class_prior_)
print(model.get_params())

#Predict Output
x_test = np.array([[8, 8], [8.3, 8.3]])
predicted = model.predict(x_test)
print(predicted)
print(model.predict_proba(x_test))
Example #48
def single_modality_classification(modality_type):
    """Classify data based on the data's modality

	:param modality_type: string. Data's modality, namely 'a' for audio and 'v' for video

	The number of crossvalidation folds is taken from globals.number_of_folds.
	"""
    #path = '/home/samuel/Dropbox/Dissertacao/repo/samples/smalldataset/'

    globals.path_init('geometry')

    X = []
    Y = []
    mean_acc = []
    std = []

    with open(globals.path + modality_type + '_features.csv', 'r') as csvfile:  # text mode for csv in Python 3
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        for row in reader:
            X.append([float(x) for x in row[:-1]])
            Y.append(int(row[-1]))

    # if modality_type is 'v':# or modality_type is 'av':
    # 	transformer = TfidfTransformer()
    # 	X = transformer.fit_transform(X).todense().tolist()
    #X = transformer.toarray()
    #E = np.random.uniform(0, 0.1, size=(len(X), 20))

# Add the noisy data to the informative features

# X = np.array(X)
# X_indices = np.arange(X.shape[-1])
# selector = SelectPercentile(f_classif, percentile=10)
# selector.fit(X, Y)
# scores = -np.log10(selector.pvalues_)
# scores /= scores.max()
# pl.clf()
# pl.bar(X_indices - .45, scores, width=.2, color='g')
# pl.ylabel(r'Escore univariado ($-Log(p_{value})$)')
# pl.xlabel('Numero da feature')
# pl.title('Discriminancia das features - ' + modality_type)
# pl.show()
# print '!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
# print len(scores)
    Y_ground_truth = []
    Y_predicted = []
    Y_prob = []

    for classifier in ['NaiveBayes', 'RandomForest']:
        #NaiveBayes', 'DecisionTree', 'LogisticRegression', 'LDA', 'Adaboost', 'GradientBoosting', 'RandomForest', 'ANN', 'SVM', 'KNN']:
        #print "Training %s" % modality_type

        Y_ground_truth = []
        Y_predicted = []
        Y_prob = []

        acc = []
        precision = []
        f1 = []

        kf = KFold(n_splits=globals.number_of_folds).split(X)  # modern sklearn KFold API

        for train_index, test_index in kf:

            # KFold split
            x_train = np.zeros(shape=(len(train_index), len(X[0])))
            y_train = np.zeros(shape=(len(train_index)))
            for i in range(len(train_index)):
                for j in range(len(X[0])):
                    x_train[i][j] = X[train_index[i]][j]
                y_train[i] = Y[train_index[i]]
            x_test = np.zeros(shape=(len(test_index), len(X[0])))
            y_test = np.zeros(shape=(len(test_index)))
            for i in range(len(test_index)):
                for j in range(len(X[0])):
                    x_test[i][j] = X[test_index[i]][j]
                y_test[i] = Y[test_index[i]]
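            # the element-by-element copies above are equivalent to numpy fancy
            # indexing, e.g. (a sketch with the same variables):
            #   X_arr, Y_arr = np.asarray(X), np.asarray(Y)
            #   x_train, y_train = X_arr[train_index], Y_arr[train_index]
            #   x_test, y_test = X_arr[test_index], Y_arr[test_index]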

            #subsetSize = 0.6

            #x_train, unusedX, y_train, unusedY = train_test_split(x_train, y_train, train_size=subsetSize, random_state=1)

            clf = None
            if classifier == 'NaiveBayes':
                clf = GaussianNB().fit(x_train, y_train)
            elif classifier == 'DecisionTree':
                clf = DecisionTreeClassifier().fit(x_train, y_train)
            elif classifier == 'LogisticRegression':
                clf = LogisticRegression().fit(x_train, y_train)
            elif classifier == 'LDA':
                clf = LDA().fit(x_train, y_train)
            elif classifier == 'Adaboost':
                clf = AdaBoostClassifier(n_estimators=100).fit(
                    x_train, y_train)
            elif classifier == 'GradientBoosting':
                clf = GradientBoostingClassifier(n_estimators=100,
                                                 learning_rate=1.0,
                                                 max_depth=1,
                                                 random_state=0).fit(
                                                     x_train, y_train)
            elif classifier == 'RandomForest':
                clf = RandomForestClassifier(n_estimators=100).fit(
                    x_train, y_train)
            elif classifier == 'ANN':
                clf = Perceptron(penalty=None,
                                 alpha=0.0001,
                                 fit_intercept=True,
                                 n_iter=20,
                                 shuffle=False,
                                 verbose=0,
                                 eta0=1.0,
                                 n_jobs=1,
                                 random_state=0,
                                 class_weight=None,
                                 warm_start=False,
                                 seed=None).fit(x_train, y_train)
            elif classifier == 'SVM':
                clf = SVC(C=1.0,
                          cache_size=200,
                          class_weight=None,
                          coef0=0.0,
                          degree=3,
                          gamma=0.0,
                          kernel='linear',
                          max_iter=-1,
                          probability=True,
                          random_state=None,
                          shrinking=True,
                          tol=0.001,
                          verbose=False).fit(x_train, y_train)
            elif classifier == 'KNN':
                clf = KNeighborsClassifier(n_neighbors=20).fit(
                    x_train, y_train)
            else:
                pass

            y_pred = clf.predict(x_test)

            acc.append(accuracy_score(y_test, y_pred))

            Y_ground_truth.extend(y_test)
            Y_predicted.extend(y_pred)
            Y_prob.extend(clf.predict_proba(x_test))

        # print '#################################'
        # print acc
        # print np.mean(acc)
        # print np.std(acc)

        # # print get_mean_ci(acc)
        # print '----------------------------------'

        # fpr, tpr, thresholds = roc_curve(Y_ground_truth, Y_prob)
        # roc_auc = auc(fpr, tpr)
        # print auc(Y_ground_truth, Y_predicted), f1_score(Y_ground_truth, Y_predicted), accuracy_score(Y_ground_truth, Y_predicted)

        #Mean accuracy and std of each classifier
        mean_acc.append(100 * np.mean(acc))
        std.append(100 * np.std(acc))

    cm = confusion_matrix(Y_ground_truth, Y_predicted)

    return mean_acc, std, cm
Example #49
# Training the model or loading the trained model-----------------------------------------------------------------------------------------------------------------------------------------------------------
from sklearn.preprocessing import LabelEncoder  #, OneHotEncoder
# Encoding the Dependent Variable
labelencoder_query_class = LabelEncoder()
query_class = labelencoder_query_class.fit_transform(query_class)

# Fitting Naive Bayes to the Dataset
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(corpus_piazza_classify, query_class)

# Predicting the category in which the latest read mail belongs to....
test_input_clean = clean_dataset([test_input_og])
test_input = cv.transform(test_input_clean).toarray()
test_prediction = classifier.predict(test_input)
test_prediction_proba = classifier.predict_proba(
    test_input)  # Get probability for each category for given test input

test_prediction_text = labelencoder_query_class.inverse_transform(
    test_prediction)
print("The query belong to category --> ", test_prediction_text[0])

# Setting the label to the mail using gmail API---------------------------------------------------------------------------------------------------------------------------------------------------------
# Modifying message's label
print("Adding label to the latest received message....")
msg_labels = CreateMsgLabels()  # Create object to update labels
msg_labels['addLabelIds'] = [
    get_label_id(test_prediction_text[0], service, user_id)
]
ModifyMessage(service, user_id, latest_received_msg_id, msg_labels)

# Selecting the most appropriate reply by sentence similarity-------------------------------------------------------------------------------------------------------------------------------------------
Example #50
0
y_pred3 = classifier3.predict(X_test)
cm3 = confusion_matrix(y_test, y_pred3)

# SVM with Gaussian RBF kernel (strongest performer so far)
from sklearn.svm import SVC
classifier4 = SVC(kernel = 'rbf', probability = True)
classifier4.fit(X_train, y_train)
y_prob4 = classifier4.predict_proba(X_test)
y_pred4 = classifier4.predict(X_test)
cm4 = confusion_matrix(y_test, y_pred4)

# Bayes'-theorem-based algorithm (Gaussian naive Bayes)
from sklearn.naive_bayes import GaussianNB
classifier5 = GaussianNB()
classifier5.fit(X_train, y_train)
y_prob5 = classifier5.predict_proba(X_test)
y_pred5 = classifier5.predict(X_test)
cm5 = confusion_matrix(y_test, y_pred5)

# Decision tree: prone to severe overfitting
from sklearn.tree import DecisionTreeClassifier
classifier6 = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier6.fit(X_train, y_train)
y_prob6 = classifier6.predict_proba(X_test)
y_pred6 = classifier6.predict(X_test)
cm6 = confusion_matrix(y_test, y_pred6)

# Random forest: also prone to overfitting, but far more useful in practice
from sklearn.ensemble import RandomForestClassifier
classifier7 = RandomForestClassifier(n_estimators = 50, criterion ="entropy")
classifier7.fit(X_train, y_train)
W_train = data_train[:, 31][r < p]
# Last 10% are validation
Y_valid = data_train[:, 32][r >= p]
X_valid = data_train[:, 1:31][r >= p]
W_valid = data_train[:, 31][r >= p]

##############################################################
########## Training the Classifier and select threshold ######
##############################################################

# Training a Gaussian naive Bayes classifier
classifier = GaussianNB()
classifier.fit(X_train, Y_train, W_train)

#Testing the classifier
prob_predict_train = classifier.predict_proba(X_train)[:, 1]
prob_predict_valid = classifier.predict_proba(X_valid)[:, 1]

# decide the threshold
amstrain = []
amsvalid = []
x_axis = []

for i in range(1, 100):
    pcut = np.percentile(prob_predict_train, i)
    #print(i)
    # This are the final signal and background predictions
    Yhat_train = prob_predict_train > pcut
    Yhat_valid = prob_predict_valid > pcut

    # To calculate the AMS data, first get the true positives and true negatives
Example #52
0
                                       C11=tmp_v[7][1, 1])
                    svm_best_f1 = f1(C00=tmp[7][0, 0],
                                     C01=tmp[7][0, 1],
                                     C10=tmp[7][1, 0],
                                     C11=tmp[7][1, 1])
                    svm_matrix[K] = tmp[7]
                    svm_matrix_v[K] = tmp_v[7]
                    fpr, tpr, thresholds = metrics.roc_curve(label_total,
                                                             df,
                                                             pos_label=1)
                    svm_auc[K] = metrics.auc(fpr, tpr)
                    svm_auc_v[K] = auc_v

            gb = GaussianNB()
            predict_label = gb.fit(data1, label1).predict(data)
            df = gb.predict_proba(data)[:, 1]
            df = np.concatenate((df, df_fixed), axis=0)
            tmp = np.zeros((10, 2, 2))
            tmp[7] = confusion_matrix(label, predict_label)
            tmp[7][0, 0] = tmp[7][0, 0] + n00
            tmp[7][1, 0] = tmp[7][1, 0] + n10
            tmp[7][0, 1] = tmp[7][0, 1] + n01
            tmp[7][1, 1] = tmp[7][1, 1] + n11

            gb_v = GaussianNB()
            predict_label_v = gb_v.fit(data1, label1).predict(data_v)
            df_v = gb_v.predict_proba(data_v)[:, 1]
            df_v = np.concatenate((df_v, df_fixed_v), axis=0)
            tmp_v = np.zeros((10, 2, 2))
            tmp_v[7] = confusion_matrix(label_v, predict_label_v)
            tmp_v[7][0, 0] = tmp_v[7][0, 0] + n00_v
Example #53
0
    ans = []
    for j in dataset_train[i]:
        ans.append(float(j))
    dataset_train_float.append(ans)

for i in range(len(dataset_test)):
    ans = []
    for j in dataset_test[i]:
        ans.append(float(j))
    dataset_test_float.append(ans)

clf4 = GaussianNB()
clf4 = clf4.fit(dataset_train_float, list(chain.from_iterable(datalabels_train)))

predicted4 = clf4.predict(dataset_test_float)
probas_ = clf4.predict_proba(dataset_test_float)
fpr_NB, tpr_NB, thresholds_NB = metrics.roc_curve(datalabels_test, probas_[:, 1])
roc_auc_NB = metrics.auc(fpr_NB, tpr_NB)
#plt.plot(fpr_NB, tpr_NB, lw=1, label='naive bayes' )


print("roc_auc_NB", roc_auc_NB)
print('naive bayes accuracy:', clf4.score(dataset_test_float, datalabels_test))
print(metrics.classification_report(datalabels_test, predicted4,))
# print(metrics.confusion_matrix(datalabels_test, predicted4))


#  xgboost
print("Xgboost")
seed = 2
test_size = 0.2
def ranked_panchayats(request):

    df = pd.read_csv(cs)
    district = df.District
    taluk = df.Taluk
    gram_panchayat = df.Grampanchayat
    stdofliving = df.Standardoflivingindex
    health = df.Healthindex
    education = df.Educationindex
    hdi = df.HDI

    n_points = 999

    village_info = [[districtz, talukz, gram_panchayatz]
                    for districtz, talukz, gram_panchayatz in zip(
                        district, taluk, gram_panchayat)]
    village_number = [[stdoflivingz, healthz, educationz, hdiz]
                      for stdoflivingz, healthz, educationz, hdiz in zip(
                          stdofliving, health, education, hdi)]

    village = [[
        districtz, talukz, gram_panchayatz, stdoflivingz, healthz, educationz,
        hdiz
    ] for districtz, talukz, gram_panchayatz, stdoflivingz, healthz,
               educationz, hdiz in zip(district, taluk, gram_panchayat,
                                       stdofliving, health, education, hdi)]

    avg_stdofliving = np.sum(stdofliving) / n_points
    avg_health = np.sum(health) / n_points
    avg_education = np.sum(education) / n_points
    avg_hdi = np.sum(hdi) / n_points

    Y = [0] * n_points

    for i in range(0, n_points):
        count = 0
        if stdofliving[i] < avg_stdofliving:
            count = count + 1
        if health[i] < avg_health:
            count = count + 1
        if education[i] < avg_education:
            count = count + 1
        if hdi[i] < avg_hdi:
            count = count + 1
        if count >= 2:
            Y[i] = 1

    print(np.sum(Y))

    X_train = village_number[:750]
    X_test = village_number[750:]
    Y_train = Y[:750]
    Y_test = Y[750:]

    from sklearn.naive_bayes import GaussianNB
    clf = GaussianNB()
    from time import time
    t0 = time()
    clf.fit(X_train, Y_train)
    print("Classification training time:", round(time() - t0, 3), "s")
    pred = clf.predict(X_test)
    # print(pred)
    prob = clf.predict_proba(X_test)
    # print(prob)
    from sklearn.metrics import accuracy_score
    print("Accuracy of Program: ", accuracy_score(pred, Y_test) * 100, "%")

    # print (hdi)
    probability = []
    for i in range(0, 249):
        ss = (1 - prob[i][1]) * 100
        probability.append(ss)

    # print(final_list)
    rank_list = []
    for i in range(0, 249):
        rank_list.append(i + 1)

    final_list = [[
        probab, gram, ran
    ] for probab, gram, ran in zip(probability, gram_panchayat, rank_list)]
    final_list.sort()

    RankedPanchayat.objects.all().delete()

    for ii in range(0, 249):

        panchayat = RankedPanchayat()
        panchayat.panchayat = final_list[ii][1]
        panchayat.dev_index = final_list[ii][0]
        panchayat.rank = ii + 1
        panchayat.save()

    return render(request, 'gaa/index.html')
Example #55
0
target_names = ['Helmet', 'No Helmet']
print("\n\nClassification Report: \n")
print("Accuracy: %s" % round(accuracy_score(y_test, y_pred), 4))
print("Precision \t: %s" %
      round(precision_score(y_test, y_pred, average='macro'), 4))
print("Recall \t\t: %s" %
      round(recall_score(y_test, y_pred, average='macro'), 4))
print("F1 \t\t: %s" % round(f1_score(y_test, y_pred, average='macro'), 4))

#Percentage of False Negatives
y = y_test - y_pred
fn = sum(y[y > 0]) * 100 / len(y_test)
print("There are %s%% False Negatives" % round(fn, 4))

print("\nExecution time: %s ms" % round((end - start) * 1000, 4))

#ROC curve
y_prob = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob, pos_label=1)
roc_auc = auc(fpr, tpr)
plt.title('Naive Bayes')
plt.plot(fpr, tpr, 'b', label='AUC = %s' % round(roc_auc, 4))
print("\nAUC \t: %s" % round(roc_auc, 4))
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
Example #56
0
def run_training(fold_):
    total_roc = []
    total_conf =[]
    
    t0=time.time()
    #df = pd.read_csv("../input/embedded_train_tiny_folds.csv")
    df = pd.read_hdf(
        path_or_buf="../input/tiny_data/full_data_folds.h5",
        key='dataset'
        )
    #print("tg\n",df.target.value_counts())   
    #print(" ") 
    t1=time.time()
    total_time = t1-t0
    print("time to read file",total_time)

    print(f"fold: {fold_}")
        
    t0=time.time()

    train_df = df[df.kfold != fold_].reset_index(drop = True)
    test_df = df[df.kfold == fold_].reset_index(drop = True)
    #    print("train shape\n", train_df.shape)
     #   print("test shape\n", test_df.shape)
        
        #features
    xtrain = train_df.drop(["kfold","target"],axis=1)
    xtest =  test_df.drop(["kfold","target"],axis=1)
        # Standard scaler
        
    sc = StandardScaler()
    sc.fit(xtrain)

    xtrain = sc.transform(xtrain)
    xtest = sc.transform(xtest)
        
    # target
    # First make the target binary
    train_df.target = train_df.target.apply(
        lambda x:'open' if x=='open' else 'closed'
        )

    test_df.target = test_df.target.apply(
        lambda x:'open' if x=='open' else 'closed'
        )    
    ytrain = train_df.target
    ytest = test_df.target
        
    # model

    model=GaussianNB()
    #fit the model on training data 
    model.fit(xtrain,ytrain)
    # make predictions
    preds = model.predict(xtest)
    preds_proba=model.predict_proba(xtest)[:,1]   
    # print('preds shape',preds_proba.shape) 
    
    t1=time.time()
    total_time = t1-t0    
    print('time to fit model:', total_time)
       
    # Renamed from 'accuracy_score' to avoid shadowing sklearn's function.
    fold_accuracy = np.sum(preds == ytest) / len(ytest)
    # log_loss = metrics.log_loss(train_df.OpenStatus, preds)

    # print(f"Fold: {fold_}")
    # print(f"Accuracy={fold_accuracy}")
    conf_m = confusion_matrix(ytest, preds)
    # print('Confusion matrix\n', conf_m)
    roc_score=roc_auc_score(ytest, preds_proba)
    print('ROC AUC score\n', roc_score)
    t=[fold_,roc_score]
    total_conf.append(conf_m)
    total_roc.append(t)
    test_df.loc[:,"GNB_pred"] = preds_proba

    return test_df[["id","target","kfold","GNB_pred"]], np.mean(total_roc,axis=0)[1] 
Example #57
0
class EnsembleModel:
    def __init__(self, models, **params):
        self.models = models.values()
        self.model_funcs = [j.model for j in models.values()]
        self.params = params
        self._pca = PCA(n_components=0.99)
        self._clf = None

    def fit(self, x, y):
        train_x, test_x, train_y, test_y, = train_test_split(x,
                                                             y,
                                                             test_size=0.2)
        pca_train_x = self._pca.fit_transform(train_x)
        pca_test_x = self._pca.transform(test_x)
        for model, model_func in zip(self.models, self.model_funcs):
            # Use a local name so a PCA model does not permanently replace
            # train_x/test_x for the models that follow: the original
            # reassignment leaked into later iterations and handed PCA-reduced
            # data to the meta-estimator call below, which applies PCA itself.
            if model.json.get("use_pca", False):
                fit_x = pca_train_x
            else:
                fit_x = train_x
            model_func.fit(fit_x, train_y)
        self._fit_meta_estimator(test_x, test_y)
        return self

    def _fit_meta_estimator(self, x, y):
        predictions = self._predictions(x).T
        y = numpy.atleast_2d(y).T
        labels = numpy.argmin(
            abs(predictions - y * numpy.ones((1, predictions.shape[1]))), 1)
        self._clf = GaussianNB().fit(x, labels)

    def _predictions(self, x):
        pca_x = self._pca.transform(x)
        predictions = []
        weights = []

        for model, model_func in zip(self.models, self.model_funcs):
            if model.json.get("use_pca", False):
                test_x = pca_x
            else:
                test_x = x
            predictions.append(model_func.predict_proba(test_x)[:, 1])
            weights.append(model.best_params()["loss"])
        return numpy.array(predictions)

    def predict_proba(self, x):
        blend = self.params.get("blend", "mean")
        predictions = self._predictions(x)
        if blend == "median":
            return numpy.median(predictions, 0)
        if blend == "meta":
            probs = self._clf.predict_proba(x)
            preds = []
            for row, prob in zip(predictions.T, probs):
                if max(prob) > 0.99:
                    preds.append(row[numpy.argmax(prob)])
                else:
                    preds.append(numpy.median(row))
            return numpy.array(preds)

        return predictions.mean(0)
def list(request):

    df = pd.read_csv(cs)

    district = df.District
    taluk = df.Taluk
    gram_panchayat = df.Grampanchayat
    stdofliving = df.Standardoflivingindex
    health = df.Healthindex
    education = df.Educationindex
    hdi = df.HDI

    n_points = 999

    village_info = [[districtz, talukz, gram_panchayatz]
                    for districtz, talukz, gram_panchayatz in zip(
                        district, taluk, gram_panchayat)]
    village_number = [[stdoflivingz, healthz, educationz, hdiz]
                      for stdoflivingz, healthz, educationz, hdiz in zip(
                          stdofliving, health, education, hdi)]

    avg_stdofliving = np.sum(stdofliving) / n_points
    avg_health = np.sum(health) / n_points
    avg_education = np.sum(education) / n_points
    avg_hdi = np.sum(hdi) / n_points

    Y = [0] * n_points

    for i in range(0, n_points):
        count = 0
        if stdofliving[i] < avg_stdofliving:
            count = count + 1
        if health[i] < avg_health:
            count = count + 1
        if education[i] < avg_education:
            count = count + 1
        if hdi[i] < avg_hdi:
            count = count + 1
        if count >= 2:
            Y[i] = 1

    print(np.sum(Y))

    X_train = village_number[:750]
    X_test = village_number[750:]
    Y_train = Y[:750]
    Y_test = Y[750:]

    from sklearn.naive_bayes import GaussianNB
    clf = GaussianNB()
    from time import time
    t0 = time()
    clf.fit(X_train, Y_train)
    print("Classification training time:", round(time() - t0, 3), "s")
    pred = clf.predict(X_test)
    # print(pred)
    prob = clf.predict_proba(X_test)
    # print(prob)
    from sklearn.metrics import accuracy_score
    print("Accuracy of Program: ", accuracy_score(pred, Y_test) * 100, "%")

    # print (hdi)
    probability = []
    for i in range(0, 249):
        ss = (1 - prob[i][1]) * 100
        probability.append(ss)

    #print(final_list)
    rank_list = []
    for i in range(0, 249):
        rank_list.append(i + 1)

    final_list = [[
        probab, gram, ran
    ] for probab, gram, ran in zip(probability, gram_panchayat, rank_list)]
    final_list.sort()

    for i in range(0, 249):
        final_list[i][2] = i + 1

    #for ii in range(0,249):
    #final_list[ii][0] final_list[ii][1] final_list[ii][2]

    page = request.GET.get('page', 1)

    paginator = Paginator(final_list, 10)

    try:
        users = paginator.page(page)
    except PageNotAnInteger:
        users = paginator.page(1)
    except EmptyPage:
        users = paginator.page(paginator.num_pages)

    return render(request, 'gaa/index.html', {
        'users': users,
        'panchayat': panchayat,
    })
Example #59
0

#Naive Bayes
#Fit The Model
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)

print("Naive Bayes")
print()

train_accuracy = nb_clf.score(X_train, y_train)
test_accuracy = nb_clf.score(X_test, y_test)

#Calculate Out of Sample Predictions
y_pred = nb_clf.predict(X_test)
y_pred_prob = nb_clf.predict_proba(X_test)

#K Fold Validation
results = cross_val_score(nb_clf, X, y, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))

#Model Train
classification_output_report(X_train, y_train, X_test, y_test, y_pred,
                             y_pred_prob, train_accuracy, test_accuracy)

print('''
    Analyst Comments:
    The k-fold accuracy is 0.63, while the train and test accuracies were 0.66 and 0.67 respectively.
    The recall from the model is 0.84.
''')
Example #60
0
path = 'lr_submission.csv'
out = open(path, "w")
out.write("id,hotel_cluster\n")
for i in range(len(test['id'].values)):
    out.write(str(test['id'].values[i]) + ',' + ' ' + str(lr_preds[i][0]) + ' ' + str(lr_preds[i][1]) + ' ' + str(lr_preds[i][2]) + ' ' + str(lr_preds[i][3]) + ' ' + str(lr_preds[i][4]))
    out.write("\n")
out.close()

print('Starting Gaussian Naive Bayes')
train_gnb = train.drop("hotel_cluster", axis = 1)
test_gnb = test.drop("id", axis = 1)
train_gnb = train.fillna(0)
test_gnb = test.fillna(0)
gnb_clf = GaussianNB()
gnb_clf.fit(train_gnb, train_gnb['hotel_cluster'].values)
prediction = gnb_clf.predict_proba(test_gnb)
gnb_preds = []
for i in range(len(prediction)):
    gnb_preds.append(prediction[i].argsort()[-5:][::-1])
path = 'gnb_submission.csv'
out = open(path, "w")
out.write("id,hotel_cluster\n")
for i in range(len(test['id'].values)):
    out.write(str(test['id'].values[i]) + ',' + ' ' + str(gnb_preds[i][0]) + ' ' + str(gnb_preds[i][1]) + ' ' + str(gnb_preds[i][2]) + ' ' + str(gnb_preds[i][3]) + ' ' + str(gnb_preds[i][4]))
    out.write("\n")
out.close()

print('Starting KNN')
train_knn = train.drop("hotel_cluster", axis = 1)
test_knn = test.drop("id", axis = 1)
train_knn = train.fillna(0)