def feature_test(formula, feature_mapping):
    actuals = []
    confidences = []
    files = training_files + test_files
    for doc_num, doc in enumerate(get_cached_reduced_docs("paragraph", files)):
        print str(doc_num)
        for span in doc.get_spans():
            actuals.append(1 if doc.span_is_plagiarized(span) else 0)
        computed_feature_vector = []
        feature_vectors = doc.get_feature_vectors(features, session)
        for feature_tuple in feature_vectors:
            ith_feature_slice = {features[j]: value for j, value in enumerate(feature_tuple)}
            computed_feature = test_evaluate(formula, ith_feature_slice, feature_mapping)
            computed_feature_vector.append([computed_feature])
        confidences += cluster("kmeans", 2, computed_feature_vector)
    # Return (1 - AUC) over all documents for our ROC calculation
    return 1 - BaseUtility.draw_roc(actuals, confidences, save_figure=False)[1]
def fitness(self, training=True):
    # choose 10 random values
    badness = 0.0
    actuals = []
    confidences = []
    files = training_files if training else test_files
    try:
        for doc in get_cached_reduced_docs(atom_type, files):
            for span in doc.get_spans():
                actuals.append(1 if doc.span_is_plagiarized(span) else 0)
            computed_feature_vector = []
            feature_vectors = doc.get_feature_vectors(features, session)
            for feature_tuple in feature_vectors:
                # Map each feature value to a single-letter variable name (A, B, C, ...)
                # so it can be passed to the evolved formula as keyword arguments
                ith_feature_slice = {chr(ord("A") + j): value for j, value in enumerate(feature_tuple)}
                computed_feature = self.calc(**ith_feature_slice)
                computed_feature_vector.append([computed_feature])
            confidences += cluster(cluster_type, 2, computed_feature_vector)
            # For each document, add (1 - AUC) for our ROC calculation
            badness += (1 - BaseUtility.draw_roc(actuals, confidences, save_figure=False)[1])
        return badness
    except OverflowError:
        return 1.0e+255  # infinitely bad
def get_confidences_actuals(session, features, cluster_type, k, atom_type, docs,
                            corpus='intrinsic', save_roc_figure=True, reduced_docs=None,
                            feature_vector_weights=None, metadata={}, cheating=False,
                            cheating_min_len=5000, **clusterargs):
    '''
    Return the confidences and actuals for the given list of documents parsed by
    atom_type, using the given features, cluster_type, and number of clusters k.

    features is a list of strings where each string is the name of a
    StylometricFeatureEvaluator method. cluster_type is "kmeans", "hmm", or "agglom".
    k is an integer. atom_type is "word", "sentence", or "paragraph". docs should be
    a list of full path strings.
    '''
    # TODO: Return more statistics, not just roc curve things.

    # If a previous call cached <reduced_docs>, don't re-query the DB
    if not reduced_docs:
        reduced_docs = _get_reduced_docs(atom_type, docs, session, corpus=corpus)

    plag_likelihoods = []
    actuals = []

    count = 0
    valid_reduced_docs = []
    for d in reduced_docs:
        count += 1
        if DEBUG:
            print "On document", d, ". The", count, "th document."
        feature_vecs = d.get_feature_vectors(features, session, cheating=cheating,
                                             cheating_min_len=cheating_min_len)
        # Skip documents with too few feature vectors (when cheating)
        if cheating and len(feature_vecs) < 7:  # 7, because that's what Benno did
            continue
        valid_reduced_docs.append(d)

        # Add to actuals
        spans = d.get_spans()
        for i in xrange(len(spans)):
            span = spans[i]
            actuals.append(1 if d.span_is_plagiarized(span) else 0)

        if feature_vector_weights:
            weighted_vecs = []
            for vec in feature_vecs:
                cur_weight_vec = []
                for i, weight in enumerate(feature_vector_weights, 0):
                    cur_weight_vec.append(vec[i] * weight)
                weighted_vecs.append(cur_weight_vec)
            feature_vecs = weighted_vecs

        likelihood = cluster(cluster_type, k, feature_vecs, **clusterargs)
        plag_likelihoods.append(likelihood)

    session.close()

    all_confidences = []
    for likelihood_list in plag_likelihoods:
        all_confidences += likelihood_list

    return all_confidences, actuals
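# Example usage of get_confidences_actuals (a minimal sketch, not part of the original
# module): the document path, feature names, and helper function name below are
# illustrative assumptions; it relies on Session() and sklearn.metrics being available
# at module level, as they are used elsewhere in this file.
def _example_confidences_to_auc():
    session = Session()
    docs = ['/path/to/suspicious-document00001.txt']  # hypothetical path
    features = ['average_word_length', 'stopword_percentage']
    confidences, actuals = get_confidences_actuals(session, features, 'kmeans', 2,
                                                   'paragraph', docs)
    fpr, tpr, _ = sklearn.metrics.roc_curve(actuals, confidences, pos_label=1)
    print 'AUC:', sklearn.metrics.auc(fpr, tpr)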
def extract_and_serialize(txt_file, xml_file, out_file, atom_type='paragraph',
                          cluster_method='kmeans', k=2):
    '''
    Performs all of intrinsic (feature extraction, clustering, etc.) and creates
    Passage objects for each passage in <txt_file>. Writes a CSV file out to
    <out_file> containing all the features of <txt_file>.

    The CSV files can be read easily by R in order to create plots.
    '''
    f = file(txt_file, 'r')
    text = f.read()
    f.close()

    util = IntrinsicUtility()
    feature_names = [
        'average_word_length',
        'average_sentence_length',
        'stopword_percentage',
        'punctuation_percentage',
        'syntactic_complexity',
        'avg_internal_word_freq_class',
        'avg_external_word_freq_class'
    ]

    ext = FeatureExtractor(text)
    print 'Initialized extractor'
    # Note that passages don't know their ground truths yet
    passages = ext.get_passages(feature_names, atom_type)
    print 'Extracted passages'
    util.add_ground_truth_to_passages(passages, xml_file)

    feature_vecs = [p.features.values() for p in passages]

    # If just testing feature extraction, don't cluster passages
    if cluster_method != 'none':
        # Cluster the passages and set their confidences
        confidences = cluster(cluster_method, k, feature_vecs)
        for psg, conf in zip(passages, confidences):
            psg.set_plag_confidence(conf)

    f = file(out_file, 'wb')
    csv_writer = csv.writer(f)
    # Writes out the header for the corresponding CSV
    csv_writer.writerow(IntrinsicPassage.serialization_header(feature_names))
    for p in passages:
        csv_writer.writerow(p.to_list(feature_names))
    f.close()
    print 'Finished writing', out_file
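# Example usage of extract_and_serialize (a minimal sketch): the file paths are
# hypothetical; the arguments mirror the defaults defined above. Passing
# cluster_method='none' would skip clustering and only write extracted features.
def _example_extract_and_serialize():
    extract_and_serialize('/path/to/suspicious-document00001.txt',
                          '/path/to/suspicious-document00001.xml',
                          '/tmp/document00001_features.csv',
                          atom_type='paragraph', cluster_method='kmeans', k=2)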
def evaluate(reduced_docs, session, features, cluster_type, k, atom_type, docs,
             corpus='intrinsic', save_roc_figure=True, feature_vector_weights=None,
             metadata={}, cheating=False, cheating_min_len=5000, **clusterargs):
    '''
    Return the ROC curve path and the area under the ROC curve for the given list of
    documents parsed by atom_type, using the given features, cluster_type, and number
    of clusters k.

    features is a list of strings where each string is the name of a
    StylometricFeatureEvaluator method. cluster_type is "kmeans", "hmm", or "agglom".
    k is an integer. atom_type is "word", "sentence", or "paragraph". docs should be
    a list of full path strings.
    '''
    # <reduced_docs> is passed in by the caller, so the DB is not re-queried here
    plag_likelihoods = []
    doc_plag_assignments = {}

    count = 0
    valid_reduced_docs = []
    for d in reduced_docs:
        count += 1
        if DEBUG:
            print "On document", d, ". The", count, "th document."
        feature_vecs = d.get_feature_vectors(features, session, cheating=cheating,
                                             cheating_min_len=cheating_min_len)
        # Skip documents with too few feature vectors (when cheating)
        if cheating and len(feature_vecs) < 7:  # 7, because that's what Benno did
            continue
        valid_reduced_docs.append(d)

        if feature_vector_weights:
            weighted_vecs = []
            for vec in feature_vecs:
                cur_weight_vec = []
                for i, weight in enumerate(feature_vector_weights, 0):
                    cur_weight_vec.append(vec[i] * weight)
                weighted_vecs.append(cur_weight_vec)
            feature_vecs = weighted_vecs

        likelihood = cluster(cluster_type, k, feature_vecs, **clusterargs)
        doc_plag_assignments[d] = likelihood
        plag_likelihoods.append(likelihood)

    metadata['features'] = features
    metadata['cluster_type'] = cluster_type
    metadata['k'] = k
    metadata['atom_type'] = atom_type
    metadata['n'] = len(reduced_docs)

    roc_path, roc_auc = _roc(valid_reduced_docs, plag_likelihoods,
                             save_roc_figure=save_roc_figure, cheating=cheating,
                             cheating_min_len=cheating_min_len, **metadata)

    return roc_path, roc_auc
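# Example usage of evaluate (a minimal sketch): the paths and feature list are
# illustrative assumptions; _get_reduced_docs and Session are called as they are
# elsewhere in this module. Reusing <reduced_docs> across calls avoids re-querying
# the DB.
def _example_evaluate():
    session = Session()
    docs = ['/path/to/suspicious-document00001.txt']  # hypothetical path
    features = ['average_word_length', 'punctuation_percentage']
    reduced_docs = _get_reduced_docs('paragraph', docs, session)
    roc_path, roc_auc = evaluate(reduced_docs, session, features, 'kmeans', 2,
                                 'paragraph', docs)
    print 'ROC figure written to', roc_path, 'with AUC =', roc_auc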
def get_cached_confidences(doc):
    if doc in cached_confidences:
        return cached_confidences[doc]

    confidence_vectors = [[] for x in range(len(doc.get_spans()))]
    feature_vectors = doc.get_feature_vectors(features, session)
    num_passages = len(feature_vectors)
    num_features = len(features)

    # Cluster each feature on its own, then collect the per-passage confidences
    # into one vector per passage
    for feature_index in range(num_features):
        one_feature_vector = [passage_features[feature_index]
                              for passage_features in feature_vectors]
        one_feature_confidences = cluster(cluster_type, 2,
                                          [[feature_value] for feature_value in one_feature_vector])
        for passage_index in range(num_passages):
            confidence_vectors[passage_index].append(one_feature_confidences[passage_index])

    cached_confidences[doc] = confidence_vectors
    return confidence_vectors
def _get_feature_conf_and_actuals(features, cluster_type, atom_type, start_doc, n,
                                  pct_plag=None, **cluster_args):
    '''
    Returns a matrix of dimension <num_passages> x <num_features> where each row
    holds the confidence that that row was plagiarized according to each feature.
    In other words, mat[passage_num][feat_num] is the plag. confidence of
    <passage_num> according to <feat_num>.

    Note that the transpose of this matrix is built below, and then transposed
    before returning.
    '''
    first_training_files = IntrinsicUtility().get_n_training_files(n, first_doc_num=start_doc,
                                                                   pct_plag=pct_plag)
    session = Session()
    reduced_docs = _get_reduced_docs(atom_type, first_training_files, session)

    actuals = []

    # feature_conf_matrix[feat][span_index] == Conf. that <span_index>
    # was plag. according to <feat>
    # NOTE that we're ignoring document boundaries in the storage of this
    # matrix. So <span_index> is not relative to any document
    feature_conf_matrix = [[] for i in xrange(len(features))]

    for doc_index in xrange(len(reduced_docs)):
        if doc_index % 10 == 0:
            print 'Working on doc number (in training corpus)', start_doc + doc_index
        doc = reduced_docs[doc_index]
        spans = doc.get_spans()

        for feat_num in xrange(len(features)):
            feat = features[feat_num]
            feature_vecs = doc.get_feature_vectors([feat], session)
            # One column, i.e. confidence values for <feat> over all passages in <doc>
            confidences = cluster(cluster_type, 2, feature_vecs, **cluster_args)
            # Use append if we care about document_num
            feature_conf_matrix[feat_num].extend(confidences)

        for span_index in xrange(len(spans)):
            span = spans[span_index]
            actuals.append(1 if doc.span_is_plagiarized(span) else 0)

    rotated = np.matrix(feature_conf_matrix).T

    return rotated, actuals
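# Example usage of _get_feature_conf_and_actuals (a minimal sketch): start_doc, n,
# and the feature list are arbitrary illustrative values. The returned matrix has
# one row per passage and one column per feature, as described in the docstring.
def _example_per_feature_confidences():
    features = ['average_word_length', 'stopword_percentage', 'punctuation_percentage']
    conf_matrix, actuals = _get_feature_conf_and_actuals(features, 'kmeans',
                                                         'paragraph', start_doc=0, n=10)
    print 'Confidence matrix shape:', conf_matrix.shape
    print 'Number of labeled passages:', len(actuals)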
def _cluster_auc_test(num_plag, num_noplag, mean_diff, std, dimensions=1, repetitions=1):
    '''
    ROC area-under-curve evaluation of various clustering techniques.
    Creates two peaks based on normal distributions and tries to cluster them.
    Prints out the AUC stat for each cluster type.
    '''
    print "running cluster auc test with", num_plag, num_noplag, mean_diff, std, dimensions, repetitions
    if repetitions > 1:
        averages = {}

    for rep in range(repetitions):
        noplag_features = []
        for i in range(num_noplag):
            cur = []
            for j in range(dimensions):
                cur.append(scipy.random.normal(0, std))
            noplag_features.append(cur)
        plag_features = []
        for i in range(num_plag):
            cur = []
            for j in range(dimensions):
                cur.append(scipy.random.normal(mean_diff, std))
            plag_features.append(cur)

        features = noplag_features + plag_features
        actuals = [0] * num_noplag + [1] * num_plag

        for clus_type in ["kmeans", "agglom", "hmm"]:
            confidences = cluster(clus_type, 2, features)
            fpr, tpr, thresholds = sklearn.metrics.roc_curve(actuals, confidences, pos_label=1)
            roc_auc = sklearn.metrics.auc(fpr, tpr)
            if repetitions == 1:
                print clus_type, roc_auc
            else:
                averages[clus_type] = averages.get(clus_type, []) + [roc_auc]

    if repetitions > 1:
        for key in averages:
            print key, sum(averages[key]) / float(max(1, len(averages[key])))
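# Example usage of _cluster_auc_test (a minimal sketch): the parameter values are
# arbitrary. With mean_diff large relative to std, the two synthetic clusters are
# well separated and each cluster type should report a high AUC; shrinking mean_diff
# pushes the AUC toward 0.5 (chance).
def _example_cluster_auc_test():
    _cluster_auc_test(num_plag=50, num_noplag=450, mean_diff=2.0, std=1.0,
                      dimensions=3, repetitions=5)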
def prec_recall_evaluate(reduced_docs, session, features, cluster_type, k, atom_type,
                         corpus='intrinsic', feature_vector_weights=None, metadata={},
                         cheating=False, cheating_min_len=5000, **clusterargs):
    '''
    Evaluate precision, recall, F-measure, granularity, and the overall measure for
    <reduced_docs> over a range of confidence thresholds. Returns one dict per
    measure mapping threshold -> average value across documents.
    '''
    thresholds = [.05 * i for i in range(20)]
    thresh_to_prec = {}
    thresh_to_recall = {}
    thresh_to_fmeasure = {}
    thresh_to_granularity = {}
    thresh_to_overall = {}

    # doc_to_thresh_to_result[i] = {thresh -> (prec, recall)}
    doc_to_thresh_to_result = []

    count = 0
    valid_reduced_docs = []
    for i, d in enumerate(reduced_docs):
        doc_to_thresh_to_result.append({})
        count += 1
        print "On document", d, ". The", count, "th document."
        feature_vecs = d.get_feature_vectors(features, session, cheating=cheating,
                                             cheating_min_len=cheating_min_len)
        # Skip documents with too few feature vectors (when cheating)
        if cheating and len(feature_vecs) < 7:  # 7, because that's what Benno did
            continue
        valid_reduced_docs.append(d)

        if feature_vector_weights:
            weighted_vecs = []
            for vec in feature_vecs:
                cur_weight_vec = []
                # Use a distinct index variable here so we don't clobber the
                # document index <i> used below
                for feat_index, weight in enumerate(feature_vector_weights, 0):
                    cur_weight_vec.append(vec[feat_index] * weight)
                weighted_vecs.append(cur_weight_vec)
            feature_vecs = weighted_vecs

        # Grab the plagiarized spans
        spans = d.get_spans(cheating, cheating_min_len)
        actual_plag_spans = d.get_plag_spans()

        # Cluster to get plag probs
        plag_likelihoods = cluster(cluster_type, k, feature_vecs, **clusterargs)

        # Make sure we have a confidence level for every span
        assert len(spans) == len(plag_likelihoods)

        # thresh => detected_spans
        all_detected_spans = {}
        for thresh in thresholds:
            prec, rec, fmeasure, granularity, overall, plag_spans, detected_spans = \
                get_all_measures(actual_plag_spans, spans, plag_likelihoods, thresh,
                                 cheating=cheating, cheating_min_len=cheating_min_len)
            all_detected_spans[thresh] = detected_spans

            # If a measure wasn't well defined, None is returned. NOTE (nj) sneaky bug:
            # if we use the construct
            #     if prec:
            #         <add it to the dict>
            # we never add prec or recall when they're 0.
            # Thus we explicitly check for None-ness
            if prec is not None:
                thresh_to_prec.setdefault(thresh, []).append(prec)
            if rec is not None:
                thresh_to_recall.setdefault(thresh, []).append(rec)
            if fmeasure is not None:
                thresh_to_fmeasure.setdefault(thresh, []).append(fmeasure)
            if granularity is not None:
                thresh_to_granularity.setdefault(thresh, []).append(granularity)
            if overall is not None:
                thresh_to_overall.setdefault(thresh, []).append(overall)

            doc_to_thresh_to_result[i][thresh] = (prec, rec, fmeasure, granularity, overall)

        # Pass relevant data to the plotting function
        doc_name = os.path.basename(d._short_name).replace('.txt', '')
        #visualize_overlaps(plag_spans, all_detected_spans, doc_name=doc_name)

    # For a given threshold, how many documents had valid precisions?
    print 'Valid precision:', sorted([(th, len(l)) for th, l in thresh_to_prec.iteritems()])
    # For a given threshold, how many documents had valid recall?
    print 'Valid recall (this number should not change):', sorted([(th, len(l)) for th, l in thresh_to_recall.iteritems()])

    thresh_prec_avgs = {t: sum(l) / len(l) for t, l in thresh_to_prec.iteritems()}
    thresh_recall_avgs = {t: sum(l) / len(l) for t, l in thresh_to_recall.iteritems()}
    thresh_fmeasure_avgs = {t: sum(l) / len(l) for t, l in thresh_to_fmeasure.iteritems()}
    thresh_granularity_avgs = {t: sum(l) / len(l) for t, l in thresh_to_granularity.iteritems()}
    thresh_overall_avgs = {t: sum(l) / len(l) for t, l in thresh_to_overall.iteritems()}

    if DEBUG:
        for thresh in sorted(thresh_prec_avgs.keys()):
            print thresh
            print 'Prec:', thresh_prec_avgs[thresh]
            print 'Recall:', thresh_recall_avgs[thresh]
            print 'F-Measure:', thresh_fmeasure_avgs[thresh]
            print 'Granularity:', thresh_granularity_avgs[thresh]
            print 'Overall:', thresh_overall_avgs[thresh]
            print '-' * 40

    return thresh_prec_avgs, thresh_recall_avgs, thresh_fmeasure_avgs, thresh_granularity_avgs, thresh_overall_avgs
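# Example usage of prec_recall_evaluate (a minimal sketch): the path and feature
# list are illustrative assumptions; the call mirrors the signature above and then
# picks the threshold with the best average F-measure from the returned dicts.
def _example_prec_recall():
    session = Session()
    docs = ['/path/to/suspicious-document00001.txt']  # hypothetical path
    features = ['average_word_length', 'stopword_percentage']
    reduced_docs = _get_reduced_docs('paragraph', docs, session)
    prec, rec, fmeas, gran, overall = prec_recall_evaluate(reduced_docs, session,
                                                           features, 'kmeans', 2,
                                                           'paragraph')
    best_thresh = max(fmeas, key=fmeas.get)
    print 'Best threshold by F-measure:', best_thresh, fmeas[best_thresh]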