def predict(model, features, cluster_type, atom_type, start_doc, ntest, **cluster_args):
    '''
    Clusters each feature, computes per-feature confidences, and plugs those
    confidences into <model>, returning the model's weighted confidence of
    plagiarism for every passage parsed from documents start_doc through
    start_doc + ntest.

    TODO: process one document at a time
    '''
    test_matrix, actuals = _get_feature_conf_and_actuals(features, cluster_type, atom_type, start_doc, ntest)
    
    predicted = model.predict(test_matrix)
    # predict_proba returns, for each passage, the probability of belonging to
    # each class. Since there are only two classes (0 == nonplag, 1 == plag),
    # keep just the probability/confidence of plag.
    confidences = [x[1] for x in model.predict_proba(test_matrix)]
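    # e.g. predict_proba(test_matrix) could return [[0.8, 0.2], [0.3, 0.7], ...],
    # in which case confidences == [0.2, 0.7, ...]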

    pos_predictions = [x for x, y in zip(confidences, actuals) if y == 1]
    neg_predictions = [x for x, y in zip(confidences, actuals) if y == 0]
    print 'Confidence summary for plagiarized passages:'
    print five_num_summary(pos_predictions)
    print 'Confidence summary for non-plagiarized passages:'
    print five_num_summary(neg_predictions)

    print 'pct. plag:', sum(actuals) / float(len(actuals))
    print 'pct. correct:', sum(x == y for x, y in zip(predicted, actuals)) / float(len(predicted))

    metadata = {
        'features' : features,
        'cluster_type' : cluster_type,
        'feature_selection' : True,
        'atom_type' : atom_type,
        'start_doc' : start_doc,
        'ntest' : ntest
    }

    path, auc = BaseUtility.draw_roc(actuals, confidences, **metadata)
    print 'ROC written to', path, 'AUC:', auc

    return confidences
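
# A usage sketch for predict() (the document counts below are illustrative, and
# it is assumed, not confirmed here, that train()'s regularization/class_weight
# arguments have defaults):
#
#   features = FeatureExtractor.get_all_feature_function_names()
#   model = train(features, 'outlier', 'paragraph', 100, start_doc=0)
#   confidences = predict(model, features, 'outlier', 'paragraph', 100, 200)
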
def compare_params():
    '''
    Grid-searches the logistic regression's regularization penalty ('l1'/'l2')
    and class_weight ('auto'/None) options, drawing an ROC curve for each
    combination. Each result tuple is (regularization, class_weight, auc, roc_path).

    Results from a previous run:
    [('l1', 'auto', 0.59759576698869676, 'plagcomps/shared/../figures/roc1390881314.99.pdf'),
     ('l1', None, 0.60174204862821445, 'plagcomps/shared/../figures/roc1390881397.91.pdf'),
     ('l2', 'auto', 0.60095727893574291, 'plagcomps/shared/../figures/roc1390881480.62.pdf'),
     ('l2', None, 0.5977554082484301, 'plagcomps/shared/../figures/roc1390881563.36.pdf')
    ]
    '''
    features = FeatureExtractor.get_all_feature_function_names()
    features = [f for f in features if 'unigram' not in f and 'trigram' not in f]
    cluster_type = 'outlier'
    atom_type = 'paragraph' 
    start_doc = 0
    ntrain = 100
    ntest = 200

    # Build the test matrix once; the test documents start right after the training documents
    test_matrix, actuals = _get_feature_conf_and_actuals(features, cluster_type, atom_type, ntrain, ntest)

    # Options for the logistic regression model
    regularization_options = ['l1', 'l2']
    class_weight_options = ['auto', None]

    results = []
    for regularization in regularization_options:
        for class_weight in class_weight_options:
            model = train(features, cluster_type, atom_type, ntrain, start_doc=start_doc, regularization=regularization, class_weight=class_weight)
            # As in predict(), keep only the confidence of plagiarism (class 1)
            confidences = [x[1] for x in model.predict_proba(test_matrix)]
            path, auc = BaseUtility.draw_roc(actuals, confidences, combination='Using Combination')

            results.append((regularization, class_weight, auc, path))

            # Print intermediate results as each configuration finishes
            print results

    print results
    return results
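

if __name__ == '__main__':
    # Minimal driver sketch (an assumption, not part of the original module):
    # run the grid search and report the configuration with the highest AUC.
    results = compare_params()
    best = max(results, key=lambda result: result[2])
    print 'Best (regularization, class_weight, auc, roc_path):', best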