def eval_func_confidences(self, feature_weights):
    weights_sum = float(sum(feature_weights))
    # Normalize the weights so they sum to 1, and make sure none are equal to 0
    feature_weights = [max(0.00001, x / weights_sum) for x in feature_weights]

    IU = IntrinsicUtility()
    all_test_files = IU.get_n_training_files(n=self.num_documents,
                                             first_doc_num=self.first_doc_num,
                                             min_len=35000,
                                             pct_plag=1)
    reduced_docs = _get_reduced_docs(self.atom_type, all_test_files, session)

    actuals = []
    confidences = []
    confidence_vectors = []

    # For each feature, cluster every document's passages and accumulate the
    # weighted confidence for each passage into confidence_vectors
    for feature, weight in zip(self.features, feature_weights):
        vi = 0
        for doc in reduced_docs:
            feature_vectors = doc.get_feature_vectors([feature], session)
            confs = cluster(self.cluster_type, 2, feature_vectors)
            for i, confidence in enumerate(confs):
                if len(confidence_vectors) <= vi:
                    confidence_vectors.append([])
                confidence_vectors[vi].append(confidence * weight)
                vi += 1

    for doc in reduced_docs:
        for span in doc._spans:
            actual = 1 if doc.span_is_plagiarized(span) else 0
            actuals.append(actual)

    # Combine the weighted per-feature confidences for each passage, capped at 1
    for vec in confidence_vectors:
        confidences.append(min(1, sum(vec)))

    fpr, tpr, thresholds = sklearn.metrics.roc_curve(actuals, confidences, pos_label=1)
    roc_auc = sklearn.metrics.auc(fpr, tpr)
    print 'evaluated:', roc_auc, feature_weights

    return roc_auc

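# A minimal, standalone sketch of the scoring step above: per-feature confidences
# are scaled by their weights, summed per passage (capped at 1), and the combined
# confidences are scored against the ground truth with ROC AUC. The toy numbers
# below are made up purely for illustration; only the sklearn calls mirror the
# real function.
def _weighted_auc_sketch():
    import sklearn.metrics

    # per_feature_confs[feature][passage]: per-feature plag. confidences (toy values)
    per_feature_confs = [[0.9, 0.2, 0.7, 0.1],
                         [0.6, 0.3, 0.8, 0.2]]
    weights = [0.7, 0.3]
    actuals = [1, 0, 1, 0]

    combined = []
    for passage_idx in range(len(actuals)):
        weighted = sum(w * confs[passage_idx]
                       for w, confs in zip(weights, per_feature_confs))
        combined.append(min(1, weighted))

    fpr, tpr, _ = sklearn.metrics.roc_curve(actuals, combined, pos_label=1)
    return sklearn.metrics.auc(fpr, tpr)
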
def get_training_set_files():
    util = IntrinsicUtility()
    full_paths = util.get_n_training_files()
    relative_paths = util.get_relative_training_set(IntrinsicUtility.TRAINING_LOC)
    # Strip the leading '/' and replace the remaining '/'s to prepare for use in a URL
    relative_paths = [r[1:].replace('/', '-') for r in relative_paths]

    return relative_paths, full_paths

def extract_and_serialize(txt_file, xml_file, out_file, atom_type='paragraph',
                          cluster_method='kmeans', k=2):
    '''
    Runs the full intrinsic pipeline (feature extraction, clustering, etc.) and
    creates Passage objects for each passage in <txt_file>. Writes a CSV file to
    <out_file> containing all the features of <txt_file>. The CSV files can be
    read easily by R in order to create plots.
    '''
    f = file(txt_file, 'r')
    text = f.read()
    f.close()

    util = IntrinsicUtility()
    feature_names = [
        'average_word_length',
        'average_sentence_length',
        'stopword_percentage',
        'punctuation_percentage',
        'syntactic_complexity',
        'avg_internal_word_freq_class',
        'avg_external_word_freq_class'
    ]

    ext = FeatureExtractor(text)
    print 'Initialized extractor'
    # Note that passages don't know their ground truths yet
    passages = ext.get_passages(feature_names, atom_type)
    print 'Extracted passages'
    util.add_ground_truth_to_passages(passages, xml_file)

    feature_vecs = [p.features.values() for p in passages]

    # If just testing feature extraction, don't cluster passages
    if cluster_method != 'none':
        # Cluster the passages and set their confidences
        confidences = cluster(cluster_method, k, feature_vecs)
        for psg, conf in zip(passages, confidences):
            psg.set_plag_confidence(conf)

    f = file(out_file, 'wb')
    csv_writer = csv.writer(f)
    # Write out the header for the corresponding CSV
    csv_writer.writerow(IntrinsicPassage.serialization_header(feature_names))
    for p in passages:
        csv_writer.writerow(p.to_list(feature_names))
    f.close()
    print 'Finished writing', out_file

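# The cluster() helper used above is internal to this project. As a rough,
# illustrative sketch only, a k-means based plag. confidence could be derived
# from distances to cluster centers like this; the mapping from distances to
# confidences here is an assumption, not the project's actual implementation.
def _kmeans_confidence_sketch(feature_vecs, k=2):
    import numpy as np
    from sklearn.cluster import KMeans

    X = np.array(feature_vecs, dtype=float)
    km = KMeans(n_clusters=k).fit(X)

    # Treat the smaller cluster as the "plagiarized" one, and turn each point's
    # relative distance to that cluster's center into a crude confidence in [0, 1]
    sizes = np.bincount(km.labels_, minlength=k)
    plag_cluster = int(np.argmin(sizes))
    dists = km.transform(X)  # distances to each cluster center
    total = dists.sum(axis=1)
    confidences = 1.0 - dists[:, plag_cluster] / np.where(total == 0, 1, total)

    return list(confidences)
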
def _get_feature_conf_and_actuals(features, cluster_type, atom_type, start_doc, n,
                                  pct_plag=None, **cluster_args):
    '''
    Returns a matrix of dimension <num_passages> x <num_features> where each row
    holds the confidences that the corresponding passage was plagiarized according
    to each feature. In other words, mat[passage_num][feat_num] is the plag.
    confidence of <passage_num> according to <feat_num>.

    Note that the transpose of this matrix is built below, and then transposed
    before returning.
    '''
    first_training_files = IntrinsicUtility().get_n_training_files(n,
                                                                   first_doc_num=start_doc,
                                                                   pct_plag=pct_plag)
    session = Session()
    reduced_docs = _get_reduced_docs(atom_type, first_training_files, session)
    actuals = []

    # feature_conf_matrix[feat][span_index] == Conf. that <span_index>
    # was plag. according to <feat>
    # NOTE that we're ignoring document boundaries in the storage of this
    # matrix. So <span_index> is not relative to any document
    feature_conf_matrix = [[] for i in xrange(len(features))]

    for doc_index in xrange(len(reduced_docs)):
        if doc_index % 10 == 0:
            print 'Working on doc number (in training corpus)', start_doc + doc_index
        doc = reduced_docs[doc_index]
        spans = doc.get_spans()

        for feat_num in xrange(len(features)):
            feat = features[feat_num]
            feature_vecs = doc.get_feature_vectors([feat], session)
            # One column, i.e. confidence values for <feat> over all passages
            # in <doc>
            confidences = cluster(cluster_type, 2, feature_vecs, **cluster_args)
            # Use append if we care about document_num
            feature_conf_matrix[feat_num].extend(confidences)

        for span_index in xrange(len(spans)):
            span = spans[span_index]
            actuals.append(1 if doc.span_is_plagiarized(span) else 0)

    rotated = np.matrix(feature_conf_matrix).T

    return rotated, actuals

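# Shape sanity check for the matrix built above (toy values, purely illustrative):
# the per-feature confidence lists are stacked as rows (num_features x num_passages)
# and then transposed, so each row of the returned matrix corresponds to one passage.
def _conf_matrix_shape_sketch():
    import numpy as np

    feature_conf_matrix = [[0.9, 0.1, 0.4],   # feature 0 over 3 passages
                           [0.7, 0.2, 0.5]]   # feature 1 over 3 passages
    rotated = np.matrix(feature_conf_matrix).T

    assert rotated.shape == (3, 2)    # <num_passages> x <num_features>
    assert rotated[0, 1] == 0.7       # passage 0's confidence according to feature 1
    return rotated
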
def doc_lengths(thresh=35000):
    '''
    Prints the pct. of documents which contain more than <thresh> characters
    '''
    util = IntrinsicUtility()
    training_docs = util.get_n_training_files()
    lengths = []
    long_enough = 0

    for fname in training_docs:
        f = file(fname, 'rb')
        text = f.read()
        f.close()

        lengths.append(len(text))
        if len(text) > thresh:
            long_enough += 1

    print float(long_enough) / len(training_docs), 'were long enough'

def construct_and_train_nn(self, features, num_files, epochs, filepath, session,
                           num_hidden_layer_nodes):
    from plagcomps.evaluation.intrinsic import _get_reduced_docs

    IU = IntrinsicUtility()
    all_test_files = IU.get_n_training_files(n=num_files)
    reduced_docs = _get_reduced_docs("paragraph", all_test_files, session)

    print 'constructing datasets...'
    # dataset = self.construct_confidence_vectors_dataset(reduced_docs, features, session)
    dataset = self.read_dataset()
    training_dataset, testing_dataset = dataset.splitWithProportion(0.75)
    print 'dataset lengths:', len(dataset), len(training_dataset), len(testing_dataset)
    print

    print 'creating neural network...'
    net = self.create_nn(features, num_hidden_layer_nodes)

    print 'creating trainer...'
    trainer = self.create_trainer(net, training_dataset)

    print 'training neural network for', epochs, 'epochs...'
    trainer.trainEpochs(epochs)

    print 'writing neural network to ' + str(filepath) + '...'
    NetworkWriter.writeToFile(net, filepath)

    print 'testing neural network...'
    confidences = []
    actuals = []
    for point in testing_dataset:
        confidences.append(net.activate(point[0])[0])
        actuals.append(point[1][0])
    print 'confidences|actuals ', zip(confidences, actuals)

    print 'generating ROC curve...'
    matplotlib.use('pdf')
    path, auc = self.roc(confidences, actuals)
    print 'area under curve =', auc

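# create_nn(), create_trainer(), and read_dataset() are methods on this class and
# are not shown here. As a rough sketch of the PyBrain pieces they presumably wrap
# (the layer sizes and toy samples below are assumptions for illustration only):
def _pybrain_flow_sketch():
    from pybrain.datasets import SupervisedDataSet
    from pybrain.tools.shortcuts import buildNetwork
    from pybrain.supervised.trainers import BackpropTrainer

    num_features = 3
    dataset = SupervisedDataSet(num_features, 1)   # 3 inputs, 1 target (plag. label)
    dataset.addSample((0.1, 0.5, 0.3), (0,))
    dataset.addSample((0.9, 0.8, 0.7), (1,))

    net = buildNetwork(num_features, 5, 1)         # 5 hidden nodes, chosen arbitrarily
    trainer = BackpropTrainer(net, dataset)
    trainer.trainEpochs(10)

    return net.activate((0.6, 0.6, 0.6))[0]        # network's confidence for a new point
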
def batch_serialize(n=100):
    '''
    Writes csv files ('serializations') of the passages parsed from the
    first <n> training files
    '''
    out_dir = os.path.join(os.path.dirname(__file__), 'serialized')
    util = IntrinsicUtility()
    training_files = util.get_n_training_files(n, include_txt_extension=False)

    text_files = [t + '.txt' for t in training_files]
    xml_files = [t + '.xml' for t in training_files]
    out_files = [os.path.join(out_dir, os.path.basename(t) + '.csv')
                 for t in training_files]

    for tf, xf, of in zip(text_files, xml_files, out_files):
        # Only populate if outfile doesn't already exist
        if not os.path.exists(of):
            print of, 'did not exist. Working on it now.'
            extract_and_serialize(tf, xf, of)

def explore_training_corpus(n=1000):
    '''
    Writes per-file summary stats (length, pct. plagiarized, paragraph count) for
    the first <n> training files to a CSV, and returns (length, pct_plag) pairs
    '''
    util = IntrinsicUtility()
    training_texts = util.get_n_training_files(n)
    training_xmls = [s.replace('txt', 'xml') for s in training_texts]

    file_lengths = []
    pct_plags = []
    total_paragraphs = []

    for text_file, xml_file in zip(training_texts, training_xmls):
        with file(text_file) as f:
            text = f.read()

        paragraphs_spans = tokenize(text, 'paragraph')
        num_paragraphs = len(paragraphs_spans)
        text_len = len(text)

        plag_spans = util.get_plagiarized_spans(xml_file)
        plag_len = sum([end - start for start, end in plag_spans])
        plag_pct = float(plag_len) / text_len

        file_lengths.append(text_len)
        pct_plags.append(plag_pct)
        total_paragraphs.append(num_paragraphs)

    #outfile = os.path.join(os.path.dirname(__file__), 'training_lengths.csv')
    outfile = 'training_lengths.csv'

    f = file(outfile, 'wb')
    f.write('file_num, length, pct_plag, num_paragraphs\n')
    for i in xrange(len(file_lengths)):
        line = '%i, %i, %f, %i\n' % (i, file_lengths[i], pct_plags[i], total_paragraphs[i])
        f.write(line)
    f.close()

    return zip(file_lengths, pct_plags)