def divide_and_test(pos_features, neg_features, t, num_training_sets, num_features, ROC_data):
    """Train and score a Naive Bayes classifier on one cross-validation fold.

    Fold ``t`` is cut out of both ``pos_features`` and ``neg_features`` and
    used for testing; everything outside that fold is used for training.

    Args:
        pos_features: sliceable sequence of ``(feature_set, label)`` pairs for
            the positive class. The metrics below treat label ``1`` as positive.
        neg_features: same, for the negative class (label ``0``).
        t: 0-based index of the fold held out for testing.
        num_training_sets: total number of folds.
        num_features: only used in the legend text appended to ``ROC_data[1]``.
        ROC_data: falsy to skip the ROC/AUC step. Otherwise an indexable whose
            element 0 receives the fold's ``ROCData`` object and whose
            element 1 receives a legend string (both via ``append``).

    Returns:
        ``[accuracy, pos_precision, pos_recall, neg_precision, neg_recall, auc]``.
        ``auc`` is ``0`` when the ROC step is skipped.
    """

    def fold_bound(items, fold_index):
        # Same arithmetic as before: index where fold `fold_index` begins.
        return int(math.floor(len(items) * fold_index / num_training_sets))

    # Selects features to be used for training and testing
    pos_start = fold_bound(pos_features, t)
    neg_start = fold_bound(neg_features, t)
    pos_cutoff = fold_bound(pos_features, t + 1)
    neg_cutoff = fold_bound(neg_features, t + 1)

    train_features = (pos_features[:pos_start] + pos_features[pos_cutoff:] +
                      neg_features[:neg_start] + neg_features[neg_cutoff:])
    test_features = (pos_features[pos_start:pos_cutoff] +
                     neg_features[neg_start:neg_cutoff])

    # Trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(train_features)

    # reference_sets[label] holds the indices of test items whose true label is
    # `label`; test_sets[label] holds the indices the classifier assigned to it.
    reference_sets = collections.defaultdict(set)
    test_sets = collections.defaultdict(set)
    for i, (features, label) in enumerate(test_features):
        reference_sets[label].add(i)
        test_sets[classifier.classify(features)].add(i)

    curr_accuracy = nltk.classify.util.accuracy(classifier, test_features)
    # Positive precision must be measured against the positive reference set
    # (label 1); it was previously compared with the negative one.
    curr_pos_precision = nltk.metrics.precision(reference_sets[1], test_sets[1])
    curr_pos_recall = nltk.metrics.recall(reference_sets[1], test_sets[1])
    curr_neg_precision = nltk.metrics.precision(reference_sets[0], test_sets[0])
    curr_neg_recall = nltk.metrics.recall(reference_sets[0], test_sets[0])

    # Collect ROC curve data and AUC for this fold
    auc = 0
    if ROC_data:
        from pyroc import ROCData
        # Each ROC point pairs the true label with the predicted P(label == 1).
        roc_data = ROCData((label, classifier.prob_classify(feature_set).prob(1))
                           for feature_set, label in test_features)
        auc = roc_data.auc()
        ROC_data[0].append(roc_data)
        ROC_data[1].append(str(num_features) + " Features: set " + str(t + 1) +
                           " of " + str(num_training_sets) + ", AUC = " + str(auc))

    # Prints metrics to show how well the feature selection did.
    # Every print takes one pre-formatted string so the output is the same
    # whether `print` is a statement or a function.
    print('testing on %d of %d sets, from positive index %d to index %d and from negative index %d to index %d:'
          % ((t + 1), num_training_sets, pos_start, pos_cutoff, neg_start, neg_cutoff))

    print('train on %d instances, test on %d instances'
          % (len(train_features), len(test_features)))
    print('accuracy: %s' % (curr_accuracy,))
    print('pos precision: %s' % (curr_pos_precision,))
    print('pos recall: %s' % (curr_pos_recall,))
    print('neg precision: %s' % (curr_neg_precision,))
    print('neg recall: %s' % (curr_neg_recall,))
    if ROC_data:
        print('AUC: %s' % (auc,))
    classifier.show_most_informative_features(10)
    return [curr_accuracy, curr_pos_precision, curr_pos_recall,
            curr_neg_precision, curr_neg_recall, auc]
示例#2
0
def random_roc_data(auc=.7, std_dev=.2, size=300):
    """Return ROC curve coordinates for a (cached) random mixture sample.

    The sample is drawn with ``random_mixture_model`` using ``auc`` as the
    positive mean, ``1 - auc`` as the negative mean and ``std_dev`` as both
    sigmas. Samples are stored in ``CACHED_DATA`` under a hash of the
    parameter set, so repeated calls with the same arguments reuse one sample.

    Returns a dict with keys ``'x'`` and ``'y'``: the first and second
    components of every entry in the ROC object's ``derived_points``.
    """
    mixture_params = {
        'pos_mu': auc,
        'pos_sigma': std_dev,
        'neg_mu': 1 - auc,
        'neg_sigma': std_dev,
        'size': size,
    }
    cache_key = hash(frozenset(mixture_params.items()))

    sample = (CACHED_DATA[cache_key] if cache_key in CACHED_DATA
              else random_mixture_model(**mixture_params))
    # Written back on hits as well as misses, as the original code did.
    CACHED_DATA[cache_key] = sample

    curve = ROCData(sample)
    # Return value unused; presumably auc() fills derived_points — confirm
    # against PyROC before removing this call.
    curve.auc()

    xs = [point[0] for point in curve.derived_points]
    ys = [point[1] for point in curve.derived_points]
    return {'x': xs, 'y': ys}
示例#3
0
def ROCPlot(title, labels=None, *args):
    '''
       If the PyROC (https://github.com/marcelcaraciolo/PyROC)
       module is installed, display the ROC curve for SVM/Logistic Regression classifiers.
       If PyROC cannot be imported, a notice is printed and nothing is plotted.
       When no ``args`` are given, two curves built from ``random_mixture_model()``
       are plotted as a demo with default labels.

       Inputs:
       =======
       title : Title handed to ``plot_multiple_roc``
       labels : Labels for the legend, one per (actual, predicted) pair. Each label
                gets ", AUC: <value> " appended. The caller's list is left untouched.
       args: Variable length arguments of the form : actual_1[], predicted_1[], actual_2[], predicted_2[], ....
    '''
    try:
        from pyroc import random_mixture_model, ROCData, plot_multiple_roc
    except ImportError:
        print('PyROC does not exist, skipping ROC demo. Install PyROC from : https://github.com/marcelcaraciolo/PyROC ')
        return
    # pylab is only needed to close the figure afterwards; if it is missing we
    # still plot instead of failing with a NameError at the end.
    try:
        import pylab
    except ImportError:
        pylab = None

    if not args:
        curves = [ROCData(random_mixture_model()), ROCData(random_mixture_model())]
        labels = ['Algorithm-1', 'Algorithm-2']
    else:
        if labels:
            # Work on a copy so repeated calls with the same list do not
            # accumulate AUC suffixes on the caller's labels.
            labels = list(labels)
        curves = []
        for i in range(0, len(args), 2):
            actual = args[i]
            predicted = args[i + 1]
            pairs = ((actual[k], predicted[k]) for k in range(len(actual)))
            curve = ROCData(pairs)
            # auc() is always called, exactly as before, even without labels.
            auc = '%.2f' % curve.auc()
            if labels:
                # Floor division keeps the index an int under true division too.
                pair_index = i // 2
                labels[pair_index] = labels[pair_index] + ', AUC: {0} '.format(auc)
            curves.append(curve)
    plot_multiple_roc(curves, title, include_baseline=True, labels=labels)
    if pylab is not None:
        pylab.close()
示例#4
0
def ROCPlot(title, labels=None, *args):
    '''
       If the PyROC (https://github.com/marcelcaraciolo/PyROC)
       module is installed, display the ROC curve for SVM/Logistic Regression classifiers.
       If PyROC cannot be imported, a notice is printed and nothing is plotted.
       When no ``args`` are given, two curves built from ``random_mixture_model()``
       are plotted as a demo with default labels.

       Inputs:
       =======
       title : Title handed to ``plot_multiple_roc``
       labels : Labels for the legend, one per (actual, predicted) pair. Each label
                gets ", AUC: <value> " appended. The caller's list is left untouched.
       args: Variable length arguments of the form : actual_1[], predicted_1[], actual_2[], predicted_2[], ....
    '''
    try:
        from pyroc import random_mixture_model, ROCData, plot_multiple_roc
    except ImportError:
        print('PyROC does not exist, skipping ROC demo. Install PyROC from : https://github.com/marcelcaraciolo/PyROC ')
        return
    # pylab is only needed to close the figure afterwards; if it is missing we
    # still plot instead of failing with a NameError at the end.
    try:
        import pylab
    except ImportError:
        pylab = None

    if not args:
        curves = [ROCData(random_mixture_model()), ROCData(random_mixture_model())]
        labels = ['Algorithm-1', 'Algorithm-2']
    else:
        if labels:
            # Work on a copy so repeated calls with the same list do not
            # accumulate AUC suffixes on the caller's labels.
            labels = list(labels)
        curves = []
        for i in range(0, len(args), 2):
            actual = args[i]
            predicted = args[i + 1]
            pairs = ((actual[k], predicted[k]) for k in range(len(actual)))
            curve = ROCData(pairs)
            # auc() is always called, exactly as before, even without labels.
            auc = '%.2f' % curve.auc()
            if labels:
                # Floor division keeps the index an int under true division too.
                pair_index = i // 2
                labels[pair_index] = labels[pair_index] + ', AUC: {0} '.format(auc)
            curves.append(curve)
    plot_multiple_roc(curves, title, include_baseline=True, labels=labels)
    if pylab is not None:
        pylab.close()
示例#5
0
# Design matrix built from the three predictor columns.
X = pd.DataFrame(dict(housing=housing, influence=influence, contact=contact))

# Append (rather than prepend) a constant column so the model has an intercept.
X = sm.add_constant(X, prepend=False)
# Binary target: "low" satisfaction maps to 0, every other level to 1.
y = df['satisfaction'].apply(lambda level: 1 if level != "low" else 0)

# test_size=0.8 holds out 80% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8)

print(y_train)
# Binomial-family GLM fitted on the training rows.
glm = sm.GLM(y_train, X_train, family=sm.families.Binomial())
res = glm.fit()
# Inspect the fit: summary table, coefficients, confidence intervals, AIC.
print(res.summary())
print(res.params)
print(res.conf_int())
print(res.aic)

# ROC curve from (true label, predicted value) pairs on the held-out rows.
roc = ROCData(zip(y_test, res.predict(X_test)))
# Result is discarded; the call is kept because later steps may rely on it
# having run (presumably it prepares the curve points — confirm in PyROC).
roc.auc()
print(roc)
roc.plot(include_baseline=True, title='ROC Curve')

# plot_multiple_roc(rocs,'Multiple ROC Curves',include_baseline=True)