def _benno_granularity(plag_spans, detected_spans):
    '''
    Granularity is defined in the paper ("Overview of the 1st International
    Competition on Plagiarism Detection"): for each plag_span that overlaps at
    least one detected_span, count how many detected_spans overlap it, then
    average those counts. A granularity of 1.0 means each detected case is
    reported as a single contiguous detection rather than split into pieces.
    '''
    if len(detected_spans) == 0:
        return 1.0

    util = BaseUtility()
    # The S_R defined in the paper: set of plag_spans that overlap 
    # some detected span
    detected_overlaps = []

    # The C_s defined in the paper: set of detected_spans that overlap 
    # plag_span s
    # actual_overlaps[plag_span] = [list of detected_spans that overlap plag_span]
    actual_overlaps = {}
    
    for pspan in plag_spans:
        for dspan in detected_spans:
            if util.overlap(pspan, dspan) > 0:
                if tuple(pspan) not in actual_overlaps:
                    # Only record each plag_span once so detected_overlaps
                    # behaves like the set S_R described in the paper
                    detected_overlaps.append(pspan)
                actual_overlaps.setdefault(tuple(pspan), []).append(dspan)

    gran_sum = 0.0
    for d_overlap in detected_overlaps:
        gran_sum += len(actual_overlaps[tuple(d_overlap)])

    if len(detected_overlaps) == 0:
        gran = 1.0 
    else:
        gran = gran_sum / len(detected_overlaps)

    return gran
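# A self-contained worked example of the granularity measure above. Span format
# and overlap behavior are assumptions: spans are (start_char, end_char) pairs
# and overlap is counted in characters (the role BaseUtility.overlap plays in
# _benno_granularity). One plag span is covered by two separate detections and
# one is missed entirely, so |S_R| = 1, sum of |C_s| = 2, granularity = 2.0.
def _granularity_example():
    def overlap(a, b):
        return max(0, min(a[1], b[1]) - max(a[0], b[0]))

    plag_spans = [(0, 100), (200, 300)]
    detected_spans = [(0, 40), (50, 100), (400, 500)]

    # cases[plag_span] = detections overlapping it (C_s); its keys form S_R
    cases = {}
    for pspan in plag_spans:
        for dspan in detected_spans:
            if overlap(pspan, dspan) > 0:
                cases.setdefault(pspan, []).append(dspan)

    return sum(len(dspans) for dspans in cases.values()) / float(len(cases))  # 2.0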
def _deprecated_benno_precision_and_recall(plag_spans, detected_spans):
    '''
    NOTE (nj): this is the way the competition specified precision and recall, but it doesn't
    seem to make much sense: when choosing a threshold, it's in our best interest to
    call everything non-plagiarized, which yields prec and recall of 1.0 for every
    non-plagiarized document. We could evaluate only on a corpus of docs containing plag.,
    but that also doesn't seem to be in the spirit of detection in general.
    
    Paper referred to is "Overview of the 1st International Competition on Plagiarism Detection"
    <plag_spans> (set S in paper) is a list of spans like (start_char, end_char) of plag. spans
    <detected_spans> (set R in paper) is a list of spans like (start_char, end_char) that we defined as plag.
    '''
    util = BaseUtility()

    # Edge cases -- defined according to performance_measures script provided online
    # http://www.uni-weimar.de/medien/webis/research/events/pan-09/pan09-code/pan09-plagiarism-detection-performance-measures.py
    if len(plag_spans) == 0 and len(detected_spans) == 0:
        prec = 1.0
        recall = 1.0
    elif len(plag_spans) == 0 or len(detected_spans) == 0:
        prec = 0.0
        recall = 0.0
    else:
        recall_sum = 0.0

        # recall defined over all plag spans
        for pspan in plag_spans:
            pspan_len = float(pspan[1] - pspan[0])

            for dspan in detected_spans:
                temp_recall = util.overlap(pspan, dspan) / pspan_len
                recall_sum += temp_recall

        recall = recall_sum / len(plag_spans)

        # precision defined over all detected spans
        prec_sum = 0.0
        for dspan in detected_spans:
            dspan_len = float(dspan[1] - dspan[0])

            for pspan in plag_spans:
                temp_prec = util.overlap(dspan, pspan) / dspan_len
                prec_sum += temp_prec

        prec = prec_sum / len(detected_spans)

    return prec, recall
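# Illustration of the NOTE in the docstring above: under these edge-case rules,
# reporting nothing earns a perfect (1.0, 1.0) on every document that contains
# no plagiarism, which is what makes the measure easy to game on mostly-clean
# corpora. This sketch only exercises the edge cases (no overlap sums are
# computed); it assumes BaseUtility is importable, as it is for the rest of
# this module.
def _deprecated_edge_case_example():
    # No plagiarism and nothing detected: rewarded with perfect scores
    assert _deprecated_benno_precision_and_recall([], []) == (1.0, 1.0)
    # Plagiarism present but nothing detected: penalized
    assert _deprecated_benno_precision_and_recall([(0, 100)], []) == (0.0, 0.0)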
def _benno_precision_and_recall(plag_spans, detected_spans):
    '''
    Paper referred to is "Overview of the 1st International Competition on Plagiarism Detection"
    <plag_spans> (set S in paper) is a list of spans like (start_char, end_char) of plag. spans
    <detected_spans> (set R in paper) is a list of spans like (start_char, end_char) that we defined as plag.

    Edge cases: if there are no plagiarized spans, there is no notion of recall. Returns None.
    If we detect nothing, then there is no notion of precision. Returns None.
    '''
    util = BaseUtility()

    if len(plag_spans) == 0:
        recall = None
    else:
        recall_sum = 0.0

        # recall defined over all plag spans
        for pspan in plag_spans:
            pspan_len = float(pspan[1] - pspan[0])

            for dspan in detected_spans:
                temp_recall = util.overlap(pspan, dspan) / pspan_len
                recall_sum += temp_recall

        recall = recall_sum / len(plag_spans)

    if len(detected_spans) == 0:
        prec = None
    else:
        # precision defined over all detected spans
        prec_sum = 0.0

        for dspan in detected_spans:
            dspan_len = float(dspan[1] - dspan[0])

            for pspan in plag_spans:
                temp_prec = util.overlap(dspan, pspan) / dspan_len
                prec_sum += temp_prec

        prec = prec_sum / len(detected_spans)

    return prec, recall
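# A self-contained worked example of the character-level precision and recall
# above. Span format and overlap behavior are assumptions: spans are
# (start_char, end_char) pairs and overlap is counted in characters. With one
# 100-char plag span and a 50-char detection entirely inside it, recall is
# 50/100 = 0.5 and precision is 50/50 = 1.0.
def _precision_recall_example():
    def overlap(a, b):
        return max(0, min(a[1], b[1]) - max(a[0], b[0]))

    plag_spans = [(0, 100)]
    detected_spans = [(25, 75)]

    recall = sum(overlap(p, d) / float(p[1] - p[0])
                 for p in plag_spans for d in detected_spans) / len(plag_spans)
    prec = sum(overlap(d, p) / float(d[1] - d[0])
               for d in detected_spans for p in plag_spans) / len(detected_spans)

    return prec, recall  # (1.0, 0.5)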
def predict(model, features, cluster_type, atom_type, start_doc, ntest, **cluster_args):
    '''
    Runs clustering for each feature to get per-passage confidences, plugs those
    confidences into <model>, and returns the model's combined confidence of plag.
    for every passage parsed from documents start_doc through start_doc + ntest.

    TODO do one document at a time
    '''
    test_matrix, actuals = _get_feature_conf_and_actuals(features, cluster_type, atom_type, start_doc, ntest)
    
    predicted = model.predict(test_matrix)
    # predict_proba returns, for each passage, a list of per-class probabilities.
    # Since we only have two classes (0 == nonplag, 1 == plag), just keep
    # the prob/confidence of plag.
    confidences = [x[1] for x in model.predict_proba(test_matrix)]

    pos_predictions = [x for x, y in zip(confidences, actuals) if y == 1]
    neg_predictions = [x for x, y in zip(confidences, actuals) if y == 0]
    print 'for those which are pos'
    print five_num_summary(pos_predictions)
    print 'for those which are neg'
    print five_num_summary(neg_predictions)

    print 'pct. plag', sum(actuals) / float(len(actuals))
    print 'pct correct:'
    print sum([x == y for x, y in zip(predicted, actuals)]) / float(len(predicted))

    metadata = {
        'features' : features,
        'cluster_type' : cluster_type,
        'feature_selection' : True,
        'atom_type' : atom_type,
        'start_doc' : start_doc,
        'ntest' : ntest
    }

    path, auc = BaseUtility.draw_roc(actuals, confidences, **metadata)
    print path, auc

    return confidences
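# A hypothetical usage sketch of predict(), mirroring the parameter choices in
# compare_params() below. The specific settings here are illustrative
# assumptions; it relies on train() (defined elsewhere in this module)
# returning a fitted classifier that exposes predict_proba, and on documents
# from start_doc onward not having been used for training.
def _predict_example():
    features = FeatureExtractor.get_all_feature_function_names()
    features = [f for f in features if 'unigram' not in f and 'trigram' not in f]
    cluster_type = 'outlier'
    atom_type = 'paragraph'
    ntrain = 100
    ntest = 200

    model = train(features, cluster_type, atom_type, ntrain, start_doc=0,
                  regularization='l2', class_weight=None)
    # Score the ntest documents that follow the training documents
    return predict(model, features, cluster_type, atom_type, ntrain, ntest)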
def compare_params():
    '''
    Compares logistic regression settings (regularization x class_weight) by AUC.
    Results from one run, as (regularization, class_weight, auc, roc_path):

    [('l1', 'auto', 0.59759576698869676, 'plagcomps/shared/../figures/roc1390881314.99.pdf'),
     ('l1', None, 0.60174204862821445, 'plagcomps/shared/../figures/roc1390881397.91.pdf'),
     ('l2', 'auto', 0.60095727893574291, 'plagcomps/shared/../figures/roc1390881480.62.pdf'),
     ('l2', None, 0.5977554082484301, 'plagcomps/shared/../figures/roc1390881563.36.pdf')
    ]
    '''
    features = FeatureExtractor.get_all_feature_function_names()
    features = [f for f in features if 'unigram' not in f and 'trigram' not in f]
    cluster_type = 'outlier'
    atom_type = 'paragraph' 
    start_doc = 0
    ntrain = 100
    ntest = 200

    # Process the test set once
    test_matrix, actuals = _get_feature_conf_and_actuals(features, cluster_type, atom_type, ntrain, ntest)

    # Options for Log regression
    regularization_options = ['l1', 'l2']
    class_weight_options = ['auto', None]

    results = []
    for regularization in regularization_options:
        for class_weight in class_weight_options:
            model = train(features, cluster_type, atom_type, ntrain, start_doc=start_doc, regularization=regularization, class_weight=class_weight)
            confidences = [x[1] for x in model.predict_proba(test_matrix)]
            path, auc = BaseUtility.draw_roc(actuals, confidences, combination='Using Combination')

            results.append((regularization, class_weight, auc, path))

            print results

    print results
    return results