from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes trained on the features present in count_train.
nb_classifier = MultinomialNB()

# scikit-learn estimators are trained with fit(); MultinomialNB has no train().
nb_classifier.fit(count_train, y_train)

# Predict a label for every row of count_test.
pred = nb_classifier.predict(count_test)

# Fraction of test labels that were predicted correctly.
score = metrics.accuracy_score(y_test, pred)
print(score)

# Confusion matrix with rows/columns in the order FAKE, REAL.
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
print(cm)

# Same classifier, this time trained on the features created by the
# tf-idf vectorizer.
nb_classifier2 = MultinomialNB()

nb_classifier2.fit(tfidf_train, y_train)

pred2 = nb_classifier2.predict(tfidf_test)

# Accuracy of the tf-idf based model.
score2 = metrics.accuracy_score(y_test, pred2)
print(score2)

# Confusion matrix of the tf-idf based model; printed like the first one so
# the two models can be compared side by side.
cm = metrics.confusion_matrix(y_test, pred2, labels=['FAKE', 'REAL'])
print(cm)
# Example no. 2
# 0
# Integer codes for the EmploymentType column; the blank (' ') category is
# coded the same as 'Self employed'.
employment = {'Salaried': 0, 'Self employed': 1, ' ': 1}

# Load the dataset and drop every row that contains a missing value.
dataset = pd.read_csv('CreditCardData.csv')
dataset = dataset.dropna()

# Replace the EmploymentType labels by their integer codes. A label that is
# not a key of `employment` raises KeyError instead of silently becoming NaN.
dataset['EmploymentType'] = [
    employment[label] for label in dataset['EmploymentType']
]

# Target (kept as a one-column DataFrame) and the predictor columns.
# NOTE(review): the column names below must match the CSV header exactly --
# confirm 'loandefault', 'disbursedamount' and 'assetcost' against the file.
loanDefaulter = dataset[['loandefault']]
factors = dataset[[
    'disbursedamount', 'assetcost', 'EmploymentType', 'PRI.CURRENT.BALANCE',
    'PRI.SANCTIONED.AMOUNT', 'PRIMARY.INSTAL.AMT'
]]

# 80/20 shuffled train/test split.
# NOTE(review): `sl` is presumably an alias of sklearn.model_selection --
# confirm against the file's imports.
factorsTrain, factorsTest, loanTrain, loanTest = sl.train_test_split(
    factors, loanDefaulter, test_size=0.2, shuffle=True)

# Standardise the predictors. The scaler is fitted on the training set only
# and then re-used for the test set; refitting it on the test data would
# scale the two sets differently and leak test statistics into evaluation.
scaler = StandardScaler()
factorsTrain = scaler.fit_transform(factorsTrain)
factorsTest = scaler.transform(factorsTest)

# Fit the logistic regression. The target is flattened to 1-D because
# loanTrain is a one-column DataFrame and the estimator expects a vector.
LogisticRegressor = LogisticRegression()
LogisticRegressor.fit(factorsTrain, loanTrain.values.ravel())
predictedResult = LogisticRegressor.predict(factorsTest)

# Report the confusion matrix and the accuracy on the test set.
print('Confusion matrix using solver: ')
print(metrics.confusion_matrix(loanTest, predictedResult))
print('Accuracy using solver: ')
print('Accuracy:', metrics.accuracy_score(loanTest, predictedResult))
    # NOTE(review): these lines are indented as the body of an enclosing
    # block (presumably a loop that advances `i`) whose header is not visible
    # in this excerpt -- confirm against the complete script.
    # Build a 100-tree random forest (depth capped at 10) and fit it on
    # fps_train / act_train.
    rf = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=2, min_samples_leaf=1)
    rf.fit(fps_train, act_train)
    # Serialise the fitted model to `outfile` using pickle protocol 2.
    cPickle.dump(rf, outfile, 2)
    i += 1
# Close the output file once the indented section above has finished.
outfile.close()
print "done"
# Evaluate the fitted random forest `rf` on fps_test / act_test.
# NOTE(review): the original indentation of this section was not valid
# Python; it is flattened to top level here. `i`, `n`, `mean_auc` and
# `model_id` are expected to be defined by surrounding code that is not
# visible in this excerpt (the AUC lines look like they once lived inside a
# per-fold loop) -- confirm the intended nesting.

# Predict a class label for every test compound.
test_SVMpredictions = rf.predict(fps_test)

# 5-fold cross-validation on the training data.
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# on current versions cross_val_score lives in sklearn.model_selection.
scores = cross_validation.cross_val_score(rf, fps_train, act_train, cv=5)

# Confusion matrix and accuracy on the test set.
print(metrics.confusion_matrix(act_test, test_SVMpredictions))
accuracy = rf.score(fps_test, act_test)
print(accuracy)

# Class-membership probabilities, one row per test molecule and one column
# per class.
test_SVMprobabilities = rf.predict_proba(fps_test)

# roc_curve expects a single score per sample, so pass the probability of
# the second class only (assumes a binary problem whose positive class is
# rf.classes_[1] -- TODO confirm).
fpr, tpr, thresholds = metrics.roc_curve(act_test, test_SVMprobabilities[:, 1])
roc_auc = metrics.auc(fpr, tpr)
print("AUC (fold %d/%d): %f" % (i + 1, n, roc_auc))
mean_auc += roc_auc

print("Mean AUC: %f" % (mean_auc / n))
plt.plot(fpr, tpr, label="Model#%d (AUC=%.2f)" % (model_id + 1, roc_auc))
# Fingerprint every test molecule (Morgan fingerprint, radius 2, as a bit
# vector) and collect its activity label. The keys of testmols_dic are the
# molecules and the values their activities.
for k, v in testmols_dic.items():
    test_fp = AllChem.GetMorganFingerprintAsBitVect(k, 2)
    test_fps.append(test_fp)
    test_acts.append(v)

# Convert once after the loop; rebuilding the arrays on every iteration is
# quadratic and leaves them undefined when the dictionary is empty.
test_fps_array = np.array(test_fps)
test_acts_array = np.array(test_acts)

# NOTE(review): the original indentation of the section below was not valid
# Python; it is flattened to top level here. `i`, `n`, `mean_auc` and
# `model_id` are expected to be defined by surrounding code that is not
# visible in this excerpt -- confirm the intended nesting.

# Predict a class label for every test compound.
test_SVMpredictions = rf.predict(test_fps_array)

# 5-fold cross-validation on the training data.
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# on current versions cross_val_score lives in sklearn.model_selection.
scores = cross_validation.cross_val_score(rf, fps_array, acts_array, cv=5)

# Confusion matrix and accuracy on the test set.
print(metrics.confusion_matrix(test_acts_array, test_SVMpredictions))
accuracy = rf.score(test_fps_array, test_acts_array)
print(accuracy)

# Class-membership probabilities, one row per test molecule and one column
# per class.
test_SVMprobabilities = rf.predict_proba(test_fps_array)

# roc_curve expects a single score per sample, so pass the probability of
# the second class only (assumes a binary problem whose positive class is
# rf.classes_[1] -- TODO confirm).
fpr, tpr, thresholds = metrics.roc_curve(test_acts_array, test_SVMprobabilities[:, 1])
roc_auc = metrics.auc(fpr, tpr)
print("AUC (fold %d/%d): %f" % (i + 1, n, roc_auc))
mean_auc += roc_auc

print("Mean AUC: %f" % (mean_auc / n))
plt.plot(fpr, tpr, label="Model#%d (AUC=%.2f)" % (model_id + 1, roc_auc))