import collections
import math

import nltk.classify.util
import nltk.metrics
from nltk.classify import NaiveBayesClassifier


def divide_and_test(pos_features, neg_features, t, num_training_sets,
                    num_features, ROC_data):
    # Selects the slice of features held out for testing on this fold;
    # everything outside the slice is used for training
    pos_start = int(math.floor(len(pos_features) * t / num_training_sets))
    neg_start = int(math.floor(len(neg_features) * t / num_training_sets))
    pos_cutoff = int(math.floor(len(pos_features) * (t + 1) / num_training_sets))
    neg_cutoff = int(math.floor(len(neg_features) * (t + 1) / num_training_sets))
    train_features = pos_features[:pos_start] + pos_features[pos_cutoff:] + \
        neg_features[:neg_start] + neg_features[neg_cutoff:]
    test_features = pos_features[pos_start:pos_cutoff] + \
        neg_features[neg_start:neg_cutoff]

    # Trains a Naive Bayes classifier on the remaining folds
    classifier = NaiveBayesClassifier.train(train_features)

    # Initializes reference_sets (true labels) and test_sets (predictions)
    reference_sets = collections.defaultdict(set)
    test_sets = collections.defaultdict(set)

    # Puts each test instance's index under its true label in reference_sets
    # and under its predicted label in test_sets
    for i, (features, label) in enumerate(test_features):
        reference_sets[label].add(i)
        predicted = classifier.classify(features)
        test_sets[predicted].add(i)

    curr_accuracy = nltk.classify.util.accuracy(classifier, test_features)
    curr_pos_precision = nltk.metrics.precision(reference_sets[1], test_sets[1])
    curr_pos_recall = nltk.metrics.recall(reference_sets[1], test_sets[1])
    curr_neg_precision = nltk.metrics.precision(reference_sets[0], test_sets[0])
    curr_neg_recall = nltk.metrics.recall(reference_sets[0], test_sets[0])

    # Builds the ROC curve and AUC for this fold, if an accumulator was given
    auc = 0
    if ROC_data:
        from pyroc import ROCData
        roc_data = ROCData((label, classifier.prob_classify(feature_set).prob(1))
                           for feature_set, label in test_features)
        auc = roc_data.auc()
        ROC_data[0].append(roc_data)
        ROC_data[1].append(str(num_features) + " Features: set " + str(t + 1) +
                           " of " + str(num_training_sets) + ", AUC = " + str(auc))

    # Prints metrics to show how well the feature selection did
    print 'testing on set %d of %d, positive indices %d:%d, negative indices %d:%d' \
        % ((t + 1), num_training_sets, pos_start, pos_cutoff, neg_start, neg_cutoff)
    print 'train on %d instances, test on %d instances' \
        % (len(train_features), len(test_features))
    print 'accuracy:', curr_accuracy
    print 'pos precision:', curr_pos_precision
    print 'pos recall:', curr_pos_recall
    print 'neg precision:', curr_neg_precision
    print 'neg recall:', curr_neg_recall
    if ROC_data:
        print 'AUC:', auc
    classifier.show_most_informative_features(10)
    return [curr_accuracy, curr_pos_precision, curr_pos_recall,
            curr_neg_precision, curr_neg_recall, auc]
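# Usage sketch (hypothetical, not from the original code): drives the
# cross-validation fold above with an invented toy corpus of word-presence
# features. Requires NLTK, plus pyroc because a truthy ROC_data accumulator
# (the [curves, labels] pair appended to above) is passed in.
toy_pos = [({'good': True, 'great': True}, 1), ({'fine': True}, 1)] * 20
toy_neg = [({'bad': True, 'awful': True}, 0), ({'poor': True}, 0)] * 20
num_sets = 4
roc_acc = [[], []]  # accumulates ROCData curves and their legend labels
fold_metrics = [divide_and_test(toy_pos, toy_neg, t, num_sets,
                                num_features=2, ROC_data=roc_acc)
                for t in range(num_sets)]
print 'mean accuracy over %d folds: %f' \
    % (num_sets, sum(m[0] for m in fold_metrics) / num_sets)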
from pyroc import random_mixture_model, ROCData

# Memoizes generated samples so repeated calls with the same parameters
# return the same curve
CACHED_DATA = {}


def random_roc_data(auc=.7, std_dev=.2, size=300):
    args = dict(pos_mu=auc, pos_sigma=std_dev,
                neg_mu=1 - auc, neg_sigma=std_dev, size=size)
    arg_hash = hash(frozenset(args.items()))
    if arg_hash in CACHED_DATA:
        random_sample = CACHED_DATA[arg_hash]
    else:
        random_sample = random_mixture_model(**args)
        CACHED_DATA[arg_hash] = random_sample
    roc = ROCData(random_sample)
    roc.auc()  # called before reading derived_points, so the curve is built
    roc_x = [x[0] for x in roc.derived_points]
    roc_y = [y[1] for y in roc.derived_points]
    return dict(x=roc_x, y=roc_y)
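# Usage sketch (hypothetical): random_roc_data returns plain x/y lists, so
# the curve can be handed to any front end; here it is drawn with pylab.
import pylab

curve = random_roc_data(auc=.8, std_dev=.15, size=500)
pylab.plot(curve['x'], curve['y'])
pylab.title('Synthetic ROC curve (target AUC 0.8)')
pylab.xlabel('false positive rate')
pylab.ylabel('true positive rate')
pylab.show()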
def ROCPlot(title, labels=None, *args):
    '''
    If the PyROC (https://github.com/marcelcaraciolo/PyROC) module is
    installed, display the ROC curve for SVM/Logistic Regression classifiers.

    Inputs:
    =======
    title  : Title for the plot
    labels : Labels for the legend
    args   : Variable-length arguments of the form:
             actual_1[], predicted_1[], actual_2[], predicted_2[], ...
    '''
    try:
        from pyroc import random_mixture_model, ROCData, plot_multiple_roc
        import pylab
    except ImportError:
        print 'PyROC does not exist, skipping ROC demo. Install PyROC from: https://github.com/marcelcaraciolo/PyROC'
        return

    if len(args) == 0:
        # No data supplied: demo with two synthetic score distributions
        x = random_mixture_model()
        r1 = ROCData(x)
        y = random_mixture_model()
        r2 = ROCData(y)
        lista = [r1, r2]
        labels = ['Algorithm-1', 'Algorithm-2']
    else:
        # Pair up the (actual, predicted) argument lists, one curve per pair
        lista = []
        for i in range(0, len(args), 2):
            x1 = args[i]
            y1 = args[i + 1]
            x1y1 = ((x1[k], y1[k]) for k in range(len(x1)))
            r1 = ROCData(x1y1)
            auc = '%.2f' % r1.auc()
            if labels:
                labels[i // 2] = labels[i // 2] + ', AUC: {0}'.format(auc)
            lista.append(r1)
    plot_multiple_roc(lista, title, include_baseline=True, labels=labels)
    pylab.close()
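# Usage sketch (hypothetical): the labels and scores below are invented to
# show the calling convention -- actual_* hold 0/1 ground truth, predicted_*
# hold classifier scores, passed as alternating positional arguments.
actual_1 = [1, 1, 0, 0, 1, 0, 1, 0]
predicted_1 = [.9, .8, .3, .2, .6, .4, .7, .1]
actual_2 = [1, 1, 0, 0, 1, 0, 1, 0]
predicted_2 = [.7, .9, .5, .4, .3, .6, .8, .2]
ROCPlot('SVM vs. Logistic Regression', ['SVM', 'Logistic Regression'],
        actual_1, predicted_1, actual_2, predicted_2)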
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from pyroc import ROCData

# housing, influence, contact and df are assumed to be defined earlier
# (e.g. columns of a survey DataFrame with a "satisfaction" column)
X = pd.DataFrame({
    "housing": housing,
    "influence": influence,
    "contact": contact
})
# adding a constant for the intercept
X = sm.add_constant(X, prepend=False)
y = df['satisfaction'].apply(lambda x: 0 if x == "low" else 1)

# holds out 80% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.8)
print y_train

# create a GLM with a binomial family (logistic regression)
glm = sm.GLM(y_train, X_train, family=sm.families.Binomial())

# fit the model
res = glm.fit()

# take a look at the output
print res.summary()
print res.params
print res.conf_int()
print res.aic

# generate an ROC curve from (actual, predicted probability) pairs
roc = ROCData(zip(y_test, res.predict(X_test)))
print 'AUC:', roc.auc()
roc.plot(title='ROC Curve', include_baseline=True)
# plot_multiple_roc(rocs, 'Multiple ROC Curves', include_baseline=True)
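# Sketch (hypothetical) of the multi-curve call commented out above: compare
# the fitted GLM's curve against a synthetic baseline generated by PyROC's
# random_mixture_model.
from pyroc import random_mixture_model, plot_multiple_roc

rocs = [roc, ROCData(random_mixture_model())]
plot_multiple_roc(rocs, 'Multiple ROC Curves', include_baseline=True,
                  labels=['GLM, AUC: %.2f' % roc.auc(), 'random scores'])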