def _benno_granularity(plag_spans, detected_spans):
    '''
    Granularity is defined in the paper -- it measures how many separate
    detections cover each detected plagiarized span (1.0 means each detected
    case is reported as a single contiguous detection).
    '''
    if len(detected_spans) == 0:
        return 1.0

    util = BaseUtility()

    # The S_R defined in the paper: set of plag_spans that overlap
    # some detected span
    detected_overlaps = []
    # The C_s defined in the paper: set of detected_spans that overlap
    # plag_span s
    # actual_overlaps[plag_span] = [list of detected_spans that overlap plag_span]
    actual_overlaps = {}

    for pspan in plag_spans:
        for dspan in detected_spans:
            if util.overlap(pspan, dspan) > 0:
                # Record each detected plag_span only once, so detected_overlaps
                # really is the set S_R from the paper
                if tuple(pspan) not in actual_overlaps:
                    detected_overlaps.append(pspan)
                actual_overlaps.setdefault(tuple(pspan), []).append(dspan)

    # Per the paper, gran(S, R) = (1 / |S_R|) * sum over s in S_R of |C_s|
    gran_sum = 0.0
    for d_overlap in detected_overlaps:
        gran_sum += len(actual_overlaps[tuple(d_overlap)])

    if len(detected_overlaps) == 0:
        gran = 1.0
    else:
        gran = gran_sum / len(detected_overlaps)

    return gran
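# Hedged sanity-check sketch (not part of the original module): a toy example
# for _benno_granularity. The single plagiarized span (0, 100) is covered by
# two separate detections, so granularity should come out to 2.0; the second
# plagiarized span (200, 300) is never detected and does not affect the result.
# The helper name below is hypothetical.
def _example_granularity_sanity_check():
    plag_spans = [(0, 100), (200, 300)]     # gold plagiarized spans
    detected_spans = [(0, 50), (50, 100)]   # detections split the first span in two
    gran = _benno_granularity(plag_spans, detected_spans)
    assert abs(gran - 2.0) < 1e-9
    return gran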
def _deprecated_benno_precision_and_recall(plag_spans, detected_spans):
    '''
    NOTE (nj) this is the way the competition specified precision and recall,
    but it doesn't seem to make much sense: when choosing a threshold, it's in
    our best interest to call everything non-plagiarized and get precision and
    recall values of 1.0 for all the non-plagiarized documents. We could build
    a corpus in which every document contains plagiarism, but that also doesn't
    seem to be in the spirit of detection in general.

    Paper referred to is "Overview of the 1st International Competition on
    Plagiarism Detection"

    <plag_spans> (set S in the paper) is a list of spans like
    (start_char, end_char) of plagiarized spans
    <detected_spans> (set R in the paper) is a list of spans like
    (start_char, end_char) that we flagged as plagiarized
    '''
    util = BaseUtility()

    # Edge cases -- defined according to the performance_measures script provided online:
    # http://www.uni-weimar.de/medien/webis/research/events/pan-09/pan09-code/pan09-plagiarism-detection-performance-measures.py
    if len(plag_spans) == 0 and len(detected_spans) == 0:
        prec = 1.0
        recall = 1.0
    elif len(plag_spans) == 0 or len(detected_spans) == 0:
        prec = 0.0
        recall = 0.0
    else:
        recall_sum = 0.0
        # recall is defined over all plag spans
        for pspan in plag_spans:
            pspan_len = float(pspan[1] - pspan[0])
            for dspan in detected_spans:
                temp_recall = util.overlap(pspan, dspan) / pspan_len
                recall_sum += temp_recall

        recall = recall_sum / len(plag_spans)

        prec_sum = 0.0
        # precision is defined over all detected spans
        for dspan in detected_spans:
            dspan_len = float(dspan[1] - dspan[0])
            for pspan in plag_spans:
                temp_prec = util.overlap(dspan, pspan) / dspan_len
                prec_sum += temp_prec

        prec = prec_sum / len(detected_spans)

    return prec, recall
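# Hedged illustration (not part of the original module) of the degenerate case
# the NOTE above complains about: on a document with no plagiarized spans, an
# empty set of detections scores perfect precision and recall under the
# competition's edge-case rules, which rewards never flagging anything.
# The helper name below is hypothetical.
def _example_deprecated_edge_case():
    prec, recall = _deprecated_benno_precision_and_recall([], [])
    assert prec == 1.0 and recall == 1.0
    return prec, recall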
def _benno_precision_and_recall(plag_spans, detected_spans):
    '''
    Paper referred to is "Overview of the 1st International Competition on
    Plagiarism Detection"

    <plag_spans> (set S in the paper) is a list of spans like
    (start_char, end_char) of plagiarized spans
    <detected_spans> (set R in the paper) is a list of spans like
    (start_char, end_char) that we flagged as plagiarized

    Edge cases: if there are no plagiarized spans, there is no notion of
    recall, so recall is None. If we detect nothing, there is no notion of
    precision, so precision is None.
    '''
    util = BaseUtility()

    if len(plag_spans) == 0:
        recall = None
    else:
        recall_sum = 0.0
        # recall is defined over all plag spans
        for pspan in plag_spans:
            pspan_len = float(pspan[1] - pspan[0])
            for dspan in detected_spans:
                temp_recall = util.overlap(pspan, dspan) / pspan_len
                recall_sum += temp_recall

        recall = recall_sum / len(plag_spans)

    if len(detected_spans) == 0:
        prec = None
    else:
        prec_sum = 0.0
        # precision is defined over all detected spans
        for dspan in detected_spans:
            dspan_len = float(dspan[1] - dspan[0])
            for pspan in plag_spans:
                temp_prec = util.overlap(dspan, pspan) / dspan_len
                prec_sum += temp_prec

        prec = prec_sum / len(detected_spans)

    return prec, recall
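# Hedged worked example (not part of the original module), assuming
# BaseUtility.overlap returns the length of the character overlap between two
# spans. With one plagiarized span of length 100 and two detections covering
# 50 and 10 of its characters, recall = (50 + 10) / 100 = 0.6, and precision
# averages the per-detection overlap fractions: (50/50 + 10/60) / 2 = 7/12.
# The helper name below is hypothetical.
def _example_precision_and_recall():
    plag_spans = [(0, 100)]
    detected_spans = [(0, 50), (90, 150)]
    prec, recall = _benno_precision_and_recall(plag_spans, detected_spans)
    assert abs(recall - 0.6) < 1e-9
    assert abs(prec - 7.0 / 12.0) < 1e-9
    return prec, recall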
def predict(model, features, cluster_type, atom_type, start_doc, ntest, **cluster_args):
    '''
    Runs clustering for each feature, calculates per-feature confidences, and
    plugs those confidences into <model>, returning the weighted confidences
    of plagiarism for all passages parsed from documents start_doc through
    start_doc + ntest.

    TODO do one document at a time
    '''
    test_matrix, actuals = _get_feature_conf_and_actuals(features, cluster_type, atom_type, start_doc, ntest)
    predicted = model.predict(test_matrix)
    # predict_proba returns a list of probabilities of being in class number i.
    # Since we only have two classes (0 == nonplag, 1 == plag), just keep
    # the prob/confidence of plag.
    confidences = [x[1] for x in model.predict_proba(test_matrix)]

    pos_predictions = [x for x, y in zip(confidences, actuals) if y == 1]
    neg_predictions = [x for x, y in zip(confidences, actuals) if y == 0]

    print 'for those which are pos'
    print five_num_summary(pos_predictions)
    print 'for those which are neg'
    print five_num_summary(neg_predictions)

    print 'pct. plag', sum(actuals) / float(len(actuals))
    print 'pct correct:'
    print sum([x == y for x, y in zip(predicted, actuals)]) / float(len(predicted))

    metadata = {
        'features' : features,
        'cluster_type' : cluster_type,
        'feature_selection' : True,
        'atom_type' : atom_type,
        'start_doc' : start_doc,
        'ntest' : ntest
    }

    path, auc = BaseUtility.draw_roc(actuals, confidences, **metadata)
    print path, auc

    return confidences
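# Hedged usage sketch (not part of the original module): train a combination
# model on the first <ntrain> documents, then evaluate it on the <ntest>
# documents that follow, so the two ranges don't overlap. Assumes the corpus
# behind _get_feature_conf_and_actuals is available and that train() accepts
# these arguments as it does in compare_params() below; the helper name is
# hypothetical.
def _example_predict_usage():
    features = FeatureExtractor.get_all_feature_function_names()
    cluster_type = 'outlier'
    atom_type = 'paragraph'
    ntrain = 100
    ntest = 200

    model = train(features, cluster_type, atom_type, ntrain,
                  start_doc=0, regularization='l1', class_weight='auto')
    confidences = predict(model, features, cluster_type, atom_type, ntrain, ntest)
    return confidences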
def compare_params():
    '''
    Results from one run, as (regularization, class_weight, auc, roc_path):
    [('l1', 'auto', 0.59759576698869676, 'plagcomps/shared/../figures/roc1390881314.99.pdf'),
     ('l1', None, 0.60174204862821445, 'plagcomps/shared/../figures/roc1390881397.91.pdf'),
     ('l2', 'auto', 0.60095727893574291, 'plagcomps/shared/../figures/roc1390881480.62.pdf'),
     ('l2', None, 0.5977554082484301, 'plagcomps/shared/../figures/roc1390881563.36.pdf')]
    '''
    features = FeatureExtractor.get_all_feature_function_names()
    features = [f for f in features if 'unigram' not in f and 'trigram' not in f]
    cluster_type = 'outlier'
    atom_type = 'paragraph'
    start_doc = 0
    ntrain = 100
    ntest = 200

    # Process the test set once
    test_matrix, actuals = _get_feature_conf_and_actuals(features, cluster_type, atom_type, ntrain, ntest)

    # Options for logistic regression
    regularization_options = ['l1', 'l2']
    class_weight_options = ['auto', None]

    results = []

    for regularization in regularization_options:
        for class_weight in class_weight_options:
            model = train(features, cluster_type, atom_type, ntrain,
                          start_doc=start_doc,
                          regularization=regularization,
                          class_weight=class_weight)
            confidences = [x[1] for x in model.predict_proba(test_matrix)]
            path, auc = BaseUtility.draw_roc(actuals, confidences, combination='Using Combination')

            results.append((regularization, class_weight, auc, path))

    print results

    return results
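# Hedged follow-up sketch (not part of the original module): each entry in the
# list returned by compare_params() is (regularization, class_weight, auc,
# roc_path), so the best-performing setting can be picked by AUC like this.
# The helper name below is hypothetical.
def _example_pick_best_params():
    results = compare_params()
    best = max(results, key=lambda result: result[2])
    print 'best (regularization, class_weight, auc, path):', best
    return best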