def eval_func_confidences(self, feature_weights): weights_sum = float(sum(feature_weights)) # "normalize" (I don't know if that's the right word) the weights, and make sure none are equal to 0 feature_weights = [max(0.00001, x/weights_sum) for x in feature_weights] IU = IntrinsicUtility() all_test_files = IU.get_n_training_files(n=self.num_documents, first_doc_num=self.first_doc_num, min_len=35000, pct_plag=1) reduced_docs = _get_reduced_docs(self.atom_type, all_test_files, session) actuals = [] confidences = [] confidence_vectors = [] for feature, weight in zip(self.features, feature_weights): vi = 0 for doc in reduced_docs: feature_vectors = doc.get_feature_vectors([feature], session) confs = cluster(self.cluster_type, 2, feature_vectors) for i, confidence in enumerate(confs, 0): if len(confidence_vectors) <= vi: confidence_vectors.append([]) confidence_vectors[vi].append(confidence * weight) vi += 1 for doc in reduced_docs: for span in doc._spans: actual = 1 if doc.span_is_plagiarized(span) else 0 actuals.append(actual) for vec in confidence_vectors: confidences.append(min(1, sum(vec))) fpr, tpr, thresholds = sklearn.metrics.roc_curve(actuals, confidences, pos_label=1) roc_auc = sklearn.metrics.auc(fpr, tpr) print 'evaluated:', roc_auc, [w for w in feature_weights] return roc_auc
def get_training_set_files():
    '''
    Returns a pair (relative_paths, full_paths) describing the training set.

    full_paths is whatever IntrinsicUtility.get_n_training_files() returns
    with its default arguments. relative_paths are the relative training
    set paths with their first character dropped and every remaining '/'
    replaced by '-', so that they can be embedded in a URL.
    '''
    intrinsic_util = IntrinsicUtility()
    full_paths = intrinsic_util.get_n_training_files()

    url_ready_paths = []
    for rel_path in intrinsic_util.get_relative_training_set(IntrinsicUtility.TRAINING_LOC):
        # The first character is expected to be a leading '/'; drop it,
        # then turn the remaining separators into dashes.
        without_leading_char = rel_path[1:]
        url_ready_paths.append(without_leading_char.replace('/', '-'))

    return url_ready_paths, full_paths
def batch_serialize(n=100): ''' Writes csv files ('serializations') of the passages parsed from first <n> training files ''' out_dir = os.path.join(os.path.dirname(__file__), 'serialized') util = IntrinsicUtility() training_files = util.get_n_training_files(n, include_txt_extension=False) text_files = [t + '.txt' for t in training_files] xml_files = [t + '.xml' for t in training_files] out_files = [os.path.join(out_dir, os.path.basename(t) + '.csv') for t in training_files] for tf, xf, of in zip(text_files, xml_files, out_files): # Only populate if outfile doesn't already exist if not os.path.exists(of): print of, 'did not exist. Working on it now.' extract_and_serialize(tf, xf, of)
def doc_lengths(thresh=35000): ''' Prints the pct. of documents which contain at least <thresh> characters ''' util = IntrinsicUtility() training_docs = util.get_n_training_files() lengths = [] long_enough = 0 for fname in training_docs: f = file(fname, 'rb') text = f.read() f.close() lengths.append(len(text)) if len(text) > thresh: long_enough += 1 print float(long_enough) / len(training_docs), 'were long enough'
def doc_lengths(thresh=35000): """ Prints the pct. of documents which contain at least <thresh> characters """ util = IntrinsicUtility() training_docs = util.get_n_training_files() lengths = [] long_enough = 0 for fname in training_docs: f = file(fname, "rb") text = f.read() f.close() lengths.append(len(text)) if len(text) > thresh: long_enough += 1 print float(long_enough) / len(training_docs), "were long enough"
def construct_and_train_nn(self, features, num_files, epochs, filepath, session): from plagcomps.evaluation.intrinsic import _get_reduced_docs IU = IntrinsicUtility() all_test_files = IU.get_n_training_files(n=num_files) reduced_docs = _get_reduced_docs("paragraph", all_test_files, session) print 'constructing datasets...' # dataset = self.construct_confidence_vectors_dataset(reduced_docs, features, session) dataset = self.read_dataset() training_dataset, testing_dataset = dataset.splitWithProportion(0.75) print 'dataset lengths:', len(dataset), len(training_dataset), len( testing_dataset) print print 'creating neural network...' net = self.create_nn(features, num_hidden_layer_nodes) print 'creating trainer...' trainer = self.create_trainer(net, training_dataset) print 'training neural network for', epochs, 'epochs...' trainer.trainEpochs(epochs) print 'writing neural network to ' + str(filepath) + '...' NetworkWriter.writeToFile(net, filepath) print 'testing neural network...' confidences = [] actuals = [] for point in testing_dataset: confidences.append(net.activate(point[0])[0]) actuals.append(point[1][0]) print 'confidences|actuals ', zip(confidences, actuals) print 'generating ROC curve...' matplotlib.use('pdf') path, auc = self.roc(confidences, actuals) print 'area under curve =', auc
def batch_serialize(n=100): ''' Writes csv files ('serializations') of the passages parsed from first <n> training files ''' out_dir = os.path.join(os.path.dirname(__file__), 'serialized') util = IntrinsicUtility() training_files = util.get_n_training_files(n, include_txt_extension=False) text_files = [t + '.txt' for t in training_files] xml_files = [t + '.xml' for t in training_files] out_files = [ os.path.join(out_dir, os.path.basename(t) + '.csv') for t in training_files ] for tf, xf, of in zip(text_files, xml_files, out_files): # Only populate if outfile doesn't already exist if not os.path.exists(of): print of, 'did not exist. Working on it now.' extract_and_serialize(tf, xf, of)
def explore_training_corpus(n=1000):
    '''
    Gathers basic statistics for the first <n> training files and writes
    them to 'training_lengths.csv' in the current working directory.

    For each file the csv holds: its index, the length of its text, the
    fraction of that length covered by plagiarized spans, and the number of
    paragraphs found by tokenize(text, 'paragraph').

    Returns a list of (length, plagiarized fraction) tuples, one per file.
    '''
    import os

    util = IntrinsicUtility()
    training_texts = util.get_n_training_files(n)

    file_lengths = []
    pct_plags = []
    total_paragraphs = []

    for text_file in training_texts:
        # Swap only the extension. A plain replace('txt', 'xml') would also
        # rewrite any 'txt' appearing in a directory or file name.
        xml_file = os.path.splitext(text_file)[0] + '.xml'

        with open(text_file) as f:
            text = f.read()

        text_len = len(text)
        num_paragraphs = len(tokenize(text, 'paragraph'))

        plag_spans = util.get_plagiarized_spans(xml_file)
        plag_len = sum(end - start for start, end in plag_spans)
        # An empty document has no plagiarized portion; avoid dividing by 0
        plag_pct = float(plag_len) / text_len if text_len else 0.0

        file_lengths.append(text_len)
        pct_plags.append(plag_pct)
        total_paragraphs.append(num_paragraphs)

    # Written relative to the current working directory, not to the
    # directory containing this module.
    outfile = 'training_lengths.csv'
    with open(outfile, 'wb') as f:
        f.write('file_num, length, pct_plag, num_paragraphs\n')
        rows = zip(file_lengths, pct_plags, total_paragraphs)
        for i, (length, pct, num_paragraphs) in enumerate(rows):
            f.write('%i, %i, %f, %i\n' % (i, length, pct, num_paragraphs))

    return zip(file_lengths, pct_plags)
def explore_training_corpus(n=1000):
    """
    Collects per-file statistics over the first <n> training files and
    dumps them to "training_lengths.csv" in the current working directory.

    Each csv row contains the file's index, its text length, the fraction
    of the text covered by plagiarized spans, and its paragraph count as
    given by tokenize(text, "paragraph").

    Returns a list of (length, plagiarized fraction) tuples in file order.
    """
    import os

    util = IntrinsicUtility()
    stats = []

    for text_file in util.get_n_training_files(n):
        # Replace just the extension; str.replace("txt", "xml") would touch
        # every occurrence of "txt" in the path.
        xml_file = os.path.splitext(text_file)[0] + ".xml"

        with open(text_file) as f:
            text = f.read()

        text_len = len(text)
        num_paragraphs = len(tokenize(text, "paragraph"))

        plag_len = sum(end - start for start, end in util.get_plagiarized_spans(xml_file))
        if text_len:
            plag_pct = float(plag_len) / text_len
        else:
            # Empty file: nothing can be plagiarized, and dividing would raise
            plag_pct = 0.0

        stats.append((text_len, plag_pct, num_paragraphs))

    # Output path is relative to the working directory, not this module
    outfile = "training_lengths.csv"
    with open(outfile, "wb") as f:
        f.write("file_num, length, pct_plag, num_paragraphs\n")
        for i, (text_len, plag_pct, num_paragraphs) in enumerate(stats):
            f.write("%i, %i, %f, %i\n" % (i, text_len, plag_pct, num_paragraphs))

    return [(text_len, plag_pct) for text_len, plag_pct, _ in stats]
def construct_and_train_nn(self, features, num_files, epochs, filepath, session): from plagcomps.evaluation.intrinsic import _get_reduced_docs IU = IntrinsicUtility() all_test_files = IU.get_n_training_files(n=num_files) reduced_docs = _get_reduced_docs("paragraph", all_test_files, session) print 'constructing datasets...' # dataset = self.construct_confidence_vectors_dataset(reduced_docs, features, session) dataset = self.read_dataset() training_dataset, testing_dataset = dataset.splitWithProportion(0.75) print 'dataset lengths:', len(dataset), len(training_dataset), len(testing_dataset) print print 'creating neural network...' net = self.create_nn(features, num_hidden_layer_nodes) print 'creating trainer...' trainer = self.create_trainer(net, training_dataset) print 'training neural network for', epochs, 'epochs...' trainer.trainEpochs(epochs) print 'writing neural network to ' + str(filepath) + '...' NetworkWriter.writeToFile(net, filepath) print 'testing neural network...' confidences = [] actuals = [] for point in testing_dataset: confidences.append(net.activate(point[0])[0]) actuals.append(point[1][0]) print 'confidences|actuals ', zip(confidences, actuals) print 'generating ROC curve...' matplotlib.use('pdf') path, auc = self.roc(confidences, actuals) print 'area under curve =', auc