def eval_func_confidences(self, feature_weights):
    weights_sum = float(sum(feature_weights))
    # Normalize the weights so they sum to 1, and make sure none are equal to 0
    feature_weights = [max(0.00001, x / weights_sum) for x in feature_weights]

    IU = IntrinsicUtility()
    all_test_files = IU.get_n_training_files(n=self.num_documents,
                                             first_doc_num=self.first_doc_num,
                                             min_len=35000,
                                             pct_plag=1)
    reduced_docs = _get_reduced_docs(self.atom_type, all_test_files, session)

    actuals = []
    confidences = []
    confidence_vectors = []

    # Build one confidence vector per passage: each entry is a single feature's
    # cluster confidence, scaled by that feature's weight
    for feature, weight in zip(self.features, feature_weights):
        vi = 0
        for doc in reduced_docs:
            feature_vectors = doc.get_feature_vectors([feature], session)
            confs = cluster(self.cluster_type, 2, feature_vectors)
            for i, confidence in enumerate(confs):
                if len(confidence_vectors) <= vi:
                    confidence_vectors.append([])
                confidence_vectors[vi].append(confidence * weight)
                vi += 1

    # Ground truth for every passage, in the same order as the confidence vectors
    for doc in reduced_docs:
        for span in doc._spans:
            actual = 1 if doc.span_is_plagiarized(span) else 0
            actuals.append(actual)

    # Combine the weighted per-feature confidences into one confidence, capped at 1
    for vec in confidence_vectors:
        confidences.append(min(1, sum(vec)))

    fpr, tpr, thresholds = sklearn.metrics.roc_curve(actuals, confidences, pos_label=1)
    roc_auc = sklearn.metrics.auc(fpr, tpr)
    print 'evaluated:', roc_auc, [w for w in feature_weights]

    return roc_auc
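# A minimal sketch of how eval_func_confidences might drive a weight search.
# Assumptions (not from the original code): `tool` is an already-constructed
# instance of the class that owns eval_func_confidences (i.e. it has .features,
# .num_documents, .first_doc_num, .atom_type, and .cluster_type set); we simply
# try a handful of random weight vectors and keep the best-scoring one.
import random

def random_weight_search(tool, num_trials=20):
    best_auc, best_weights = 0.0, None
    for _ in xrange(num_trials):
        # Draw one candidate weight per feature; eval_func_confidences
        # re-normalizes them, so the scale used here does not matter
        weights = [random.random() for _ in tool.features]
        auc = tool.eval_func_confidences(weights)
        if auc > best_auc:
            best_auc, best_weights = auc, weights
    return best_weights, best_auc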
def get_training_set_files():
    util = IntrinsicUtility()
    full_paths = util.get_n_training_files()
    relative_paths = util.get_relative_training_set(IntrinsicUtility.TRAINING_LOC)
    # Strip the leading '/' and replace remaining '/'s with '-'s to prepare for use in a URL
    relative_paths = [r[1:].replace('/', '-') for r in relative_paths]

    return relative_paths, full_paths
def extract_and_serialize(txt_file, xml_file, out_file, atom_type='paragraph',
                          cluster_method='kmeans', k=2):
    '''
    Performs all of intrinsic (feature extraction, clustering, etc.) and creates
    Passage objects for each passage in <txt_file>. Writes a CSV file to <out_file>
    containing all the features of <txt_file>.

    The CSV files can be read easily by R in order to create plots.
    '''
    f = file(txt_file, 'r')
    text = f.read()
    f.close()

    util = IntrinsicUtility()
    feature_names = [
        'average_word_length',
        'average_sentence_length',
        'stopword_percentage',
        'punctuation_percentage',
        'syntactic_complexity',
        'avg_internal_word_freq_class',
        'avg_external_word_freq_class'
    ]

    ext = FeatureExtractor(text)
    print 'Initialized extractor'

    # Note that passages don't know their ground truths yet
    passages = ext.get_passages(feature_names, atom_type)
    print 'Extracted passages'

    util.add_ground_truth_to_passages(passages, xml_file)
    feature_vecs = [p.features.values() for p in passages]

    # If just testing feature extraction, don't cluster passages
    if cluster_method != 'none':
        # Cluster the passages and set their confidences
        confidences = cluster(cluster_method, k, feature_vecs)
        for psg, conf in zip(passages, confidences):
            psg.set_plag_confidence(conf)

    f = file(out_file, 'wb')
    csv_writer = csv.writer(f)

    # Write out the header for the corresponding CSV
    csv_writer.writerow(IntrinsicPassage.serialization_header(feature_names))
    for p in passages:
        csv_writer.writerow(p.to_list(feature_names))
    f.close()

    print 'Finished writing', out_file
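# A small sketch of reading one of the CSVs written by extract_and_serialize,
# e.g. to sanity-check the output without going through R. The header row is
# whatever IntrinsicPassage.serialization_header() produced; nothing here is
# specific to the project beyond that file layout.
import csv

def read_serialized_passages(csv_path):
    f = file(csv_path, 'rb')
    reader = csv.reader(f)
    header = reader.next()
    rows = [dict(zip(header, row)) for row in reader]
    f.close()
    return header, rows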
def batch_serialize(n=100):
    '''
    Writes CSV files ('serializations') of the passages parsed from the first <n>
    training files.
    '''
    out_dir = os.path.join(os.path.dirname(__file__), 'serialized')
    util = IntrinsicUtility()
    training_files = util.get_n_training_files(n, include_txt_extension=False)

    text_files = [t + '.txt' for t in training_files]
    xml_files = [t + '.xml' for t in training_files]
    out_files = [os.path.join(out_dir, os.path.basename(t) + '.csv') for t in training_files]

    for tf, xf, of in zip(text_files, xml_files, out_files):
        # Only populate if the outfile doesn't already exist
        if not os.path.exists(of):
            print of, 'did not exist. Working on it now.'
            extract_and_serialize(tf, xf, of)
def doc_lengths(thresh=35000):
    '''
    Prints the pct. of documents which contain more than <thresh> characters.
    '''
    util = IntrinsicUtility()
    training_docs = util.get_n_training_files()

    lengths = []
    long_enough = 0

    for fname in training_docs:
        f = file(fname, 'rb')
        text = f.read()
        f.close()

        lengths.append(len(text))
        if len(text) > thresh:
            long_enough += 1

    print float(long_enough) / len(training_docs), 'were long enough'
def construct_and_train_nn(self, features, num_files, epochs, filepath, session,
                           num_hidden_layer_nodes=20):
    # num_hidden_layer_nodes: size of the hidden layer passed to create_nn;
    # the default of 20 is a placeholder, not a tuned value
    from plagcomps.evaluation.intrinsic import _get_reduced_docs

    IU = IntrinsicUtility()
    all_test_files = IU.get_n_training_files(n=num_files)
    reduced_docs = _get_reduced_docs("paragraph", all_test_files, session)

    print 'constructing datasets...'
    # dataset = self.construct_confidence_vectors_dataset(reduced_docs, features, session)
    dataset = self.read_dataset()
    training_dataset, testing_dataset = dataset.splitWithProportion(0.75)
    print 'dataset lengths:', len(dataset), len(training_dataset), len(testing_dataset)
    print

    print 'creating neural network...'
    net = self.create_nn(features, num_hidden_layer_nodes)

    print 'creating trainer...'
    trainer = self.create_trainer(net, training_dataset)

    print 'training neural network for', epochs, 'epochs...'
    trainer.trainEpochs(epochs)

    print 'writing neural network to ' + str(filepath) + '...'
    NetworkWriter.writeToFile(net, filepath)

    print 'testing neural network...'
    confidences = []
    actuals = []
    for point in testing_dataset:
        confidences.append(net.activate(point[0])[0])
        actuals.append(point[1][0])
    print 'confidences|actuals ', zip(confidences, actuals)

    print 'generating ROC curve...'
    matplotlib.use('pdf')
    path, auc = self.roc(confidences, actuals)
    print 'area under curve =', auc
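# A sketch of reloading the network that construct_and_train_nn saved with
# NetworkWriter and activating it on a single confidence vector. This assumes
# pybrain's NetworkReader, the counterpart to NetworkWriter, is available.
from pybrain.tools.customxml.networkreader import NetworkReader

def load_and_activate(filepath, confidence_vector):
    net = NetworkReader.readFrom(filepath)
    # The network's single output is taken as the combined plagiarism confidence
    return net.activate(confidence_vector)[0]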
def explore_training_corpus(n=1000):
    '''
    Summarizes the first <n> training documents: length (in characters), percent
    plagiarized, and number of paragraphs. Writes the summary to a CSV and returns
    (length, pct_plag) pairs.
    '''
    util = IntrinsicUtility()
    training_texts = util.get_n_training_files(n)
    training_xmls = [s.replace('txt', 'xml') for s in training_texts]

    file_lengths = []
    pct_plags = []
    total_paragraphs = []

    for text_file, xml_file in zip(training_texts, training_xmls):
        with file(text_file) as f:
            text = f.read()

        paragraphs_spans = tokenize(text, 'paragraph')
        num_paragraphs = len(paragraphs_spans)

        text_len = len(text)
        plag_spans = util.get_plagiarized_spans(xml_file)
        plag_len = sum([end - start for start, end in plag_spans])
        plag_pct = float(plag_len) / text_len

        file_lengths.append(text_len)
        pct_plags.append(plag_pct)
        total_paragraphs.append(num_paragraphs)

    #outfile = os.path.join(os.path.dirname(__file__), 'training_lengths.csv')
    outfile = 'training_lengths.csv'
    f = file(outfile, 'wb')
    f.write('file_num, length, pct_plag, num_paragraphs\n')
    for i in xrange(len(file_lengths)):
        line = '%i, %i, %f, %i\n' % (i, file_lengths[i], pct_plags[i], total_paragraphs[i])
        f.write(line)
    f.close()

    return zip(file_lengths, pct_plags)
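# A sketch of plotting the document-length distribution gathered by
# explore_training_corpus, assuming matplotlib is available (it is already
# used elsewhere in this code for ROC curves). The output filename is arbitrary.
import matplotlib
matplotlib.use('pdf')
import matplotlib.pyplot as plt

def plot_length_distribution(lengths, out_path='training_lengths_hist.pdf'):
    plt.hist(lengths, bins=50)
    plt.xlabel('document length (characters)')
    plt.ylabel('count')
    plt.savefig(out_path)
    plt.close()

# Example:
# lengths = [length for length, _ in explore_training_corpus(1000)]
# plot_length_distribution(lengths)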
def _get_feature_conf_and_actuals(features, cluster_type, atom_type, start_doc, n,
                                  pct_plag=None, **cluster_args):
    '''
    Returns a matrix of dimension <num_passages> x <num_features> where each row holds
    the confidences that that passage was plagiarized according to each feature. In
    other words, mat[passage_num][feat_num] is the plag. confidence of <passage_num>
    according to <feat_num>.

    Note that the transpose of this matrix is built below, and then transposed before
    returning.
    '''
    first_training_files = IntrinsicUtility().get_n_training_files(n,
                                                                   first_doc_num=start_doc,
                                                                   pct_plag=pct_plag)
    session = Session()
    reduced_docs = _get_reduced_docs(atom_type, first_training_files, session)
    actuals = []

    # feature_conf_matrix[feat][span_index] == Conf. that <span_index>
    # was plag. according to <feat>
    # NOTE that we're ignoring document boundaries in the storage of this
    # matrix. So <span_index> is not relative to any document
    feature_conf_matrix = [[] for i in xrange(len(features))]

    for doc_index in xrange(len(reduced_docs)):
        if doc_index % 10 == 0:
            print 'Working on doc number (in training corpus)', start_doc + doc_index
        doc = reduced_docs[doc_index]
        spans = doc.get_spans()

        for feat_num in xrange(len(features)):
            feat = features[feat_num]
            feature_vecs = doc.get_feature_vectors([feat], session)
            # One column, i.e. confidence values for <feat> over all passages in <doc>
            confidences = cluster(cluster_type, 2, feature_vecs, **cluster_args)
            # Use append if we care about document_num
            feature_conf_matrix[feat_num].extend(confidences)

        for span_index in xrange(len(spans)):
            span = spans[span_index]
            actuals.append(1 if doc.span_is_plagiarized(span) else 0)

    rotated = np.matrix(feature_conf_matrix).T

    return rotated, actuals
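# One plausible consumer of _get_feature_conf_and_actuals (an assumption, not part
# of the original code): fit a simple logistic regression on the per-feature
# confidences and report its ROC AUC via sklearn.
import numpy as np
import sklearn.linear_model
import sklearn.metrics

def fit_combined_confidence(features, cluster_type, atom_type, start_doc, n):
    mat, actuals = _get_feature_conf_and_actuals(features, cluster_type,
                                                 atom_type, start_doc, n)
    X = np.asarray(mat)
    model = sklearn.linear_model.LogisticRegression()
    model.fit(X, actuals)
    # Combined confidence is the model's predicted probability of the plag. class
    combined = model.predict_proba(X)[:, 1]
    fpr, tpr, _ = sklearn.metrics.roc_curve(actuals, combined, pos_label=1)
    return sklearn.metrics.auc(fpr, tpr)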
def main(m, training_percent=0.7):
    random.seed(1337)

    suspects_base_path = "/copyCats/pan-plagiarism-corpus-2009/external-detection-corpus/suspicious-documents/"
    suspects_dirs = ["part1/", "part2/", "part3/", "part4/", "part5/", "part6/", "part7/", "part8/"]
    sources_base_path = "/copyCats/pan-plagiarism-corpus-2009/external-detection-corpus/source-documents/"
    sources_dirs = ["part1/", "part2/", "part3/", "part4/", "part5/", "part6/", "part7/", "part8/"]

    # Without extensions
    all_base_files = []
    # List of tuples where tuple[0] is the absolute path of the text document
    # and tuple[1] is the absolute path of the xml file
    all_files = []

    # Put all the suspect files in a list
    for d in suspects_dirs:
        p = os.path.join(suspects_base_path, d)
        for f in os.listdir(p):
            all_base_files.append(os.path.splitext(f)[0])
            if f[-4:] == ".txt":
                all_files.append((p + f, (p + f)[:-4] + ".xml"))

    # Make sure all of these files actually exist
    worked = True
    for suspect in all_files:
        if not os.path.exists(suspect[0]):
            worked = False
            print ".txt file does not exist:", suspect[0]
        if not os.path.exists(suspect[1]):
            worked = False
            print ".xml file does not exist:", suspect[1]
    assert(worked)

    # Shuffle and take files from the front of the list
    print 'Shuffling ', len(all_files), 'suspect files...'
    random.shuffle(all_files)

    print 'Grabbing all valid suspects...'
    # Grab every suspect that contains plagiarism and is at most <m> paragraphs long
    training_suspect_partition = []
    for filepaths in all_files:
        plag_spans = IntrinsicUtility.get_plagiarized_spans(filepaths[1])
        if len(plag_spans) > 0:
            # Make sure it's at most <m> paragraphs
            f = open(filepaths[0], 'r')
            text = f.read()
            f.close()
            paragraphs = tokenize(text, 'paragraph')
            if len(paragraphs) > m:
                continue
            training_suspect_partition.append(filepaths)
            if len(training_suspect_partition) % 10 == 0:
                print len(training_suspect_partition)
    print len(training_suspect_partition)

    # print 'Writing partitions to disk...'
    # suspect_training_file = file("crisp_extrinsic_training_suspect_files.txt", 'w')
    # for suspect in training_suspect_partition:
    #     rel_path_start = suspect[0].index('/part')
    #     suspect_training_file.write(suspect[0][rel_path_start:-4] + '\n')
    # suspect_training_file.close()

    print 'Determining source documents for training partition...'
    training_sources = {}
    training_sources_suspects = {}
    num_files = 0
    for filenames in training_suspect_partition:
        tree = ET.parse(filenames[1])
        for feature in tree.iter("feature"):
            if feature.get("name") == "artificial-plagiarism" and feature.get("source_reference") \
                    and feature.get("source_reference")[:-4] not in training_sources:
                # Figure out which partX the doc is in...so annoying...
                for p in sources_dirs:
                    if os.path.exists(sources_base_path + p + feature.get("source_reference")):
                        short_name = "/" + p + feature.get("source_reference")[:-4]
                        long_name = sources_base_path + p + feature.get("source_reference")
                        training_sources[short_name] = 1
                        if filenames[1] not in training_sources_suspects:
                            training_sources_suspects[filenames[1]] = [long_name]
                        else:
                            training_sources_suspects[filenames[1]].append(long_name)
        num_files += 1
        if num_files % 100 == 0:
            print num_files,
            sys.stdout.flush()
    print
    print len(training_sources.keys()), 'sources for the training partition were found...'

    print 'Removing invalid suspects because of long sources...'
    # Get rid of the suspects whose sources are too long
    final_training_suspect_partition = []
    for _, xml in training_suspect_partition:
        # Are all of its sources at most <m> paragraphs?
        short_enough = True
        for source_filename in training_sources_suspects[xml]:
            f = open(source_filename, 'r')
            text = f.read()
            f.close()
            paragraphs = tokenize(text, 'paragraph')
            if len(paragraphs) > m:
                short_enough = False
                break
        if short_enough:
            final_training_suspect_partition.append(xml)

    print 'Constructing final source partition...'
    final_training_source_partition = []
    for suspect in final_training_suspect_partition:
        for long_name in training_sources_suspects[suspect]:
            short_name = '/' + re.sub(sources_base_path, '', long_name)
            if short_name not in final_training_source_partition:
                final_training_source_partition.append(short_name)

    print 'Converting suspect names...'
    final_training_suspect_partition = ['/' + re.sub('.xml', '', re.sub(suspects_base_path, '', xml))
                                        for xml in final_training_suspect_partition]

    print len(final_training_suspect_partition), final_training_suspect_partition
    print len(final_training_source_partition), final_training_source_partition

    print 'Writing suspect documents to disk...'
    suspects_training_file = file("crisp_corpus_suspect_files.txt", 'w')
    for filename in final_training_suspect_partition:
        suspects_training_file.write(filename + '\n')
    suspects_training_file.close()

    print 'Writing source documents to disk...'
    sources_training_file = file("crisp_corpus_source_files.txt", 'w')
    for filename in final_training_source_partition:
        sources_training_file.write(filename + '\n')
    sources_training_file.close()
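# A small sketch of reading the partition files written by main() back into lists,
# for use by downstream training scripts. The file names match those written above;
# stripping whitespace is the only processing needed.
def read_partition_file(path):
    f = file(path, 'r')
    names = [line.strip() for line in f if line.strip()]
    f.close()
    return names

# Example:
# suspects = read_partition_file('crisp_corpus_suspect_files.txt')
# sources = read_partition_file('crisp_corpus_source_files.txt')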