def eval_func_confidences(self, feature_weights):
        weights_sum = float(sum(feature_weights))
        # "normalize" (I don't know if that's the right word) the weights, and make sure none are equal to 0
        feature_weights = [max(0.00001, x/weights_sum) for x in feature_weights]
        IU = IntrinsicUtility()
        all_test_files = IU.get_n_training_files(n=self.num_documents, first_doc_num=self.first_doc_num, min_len=35000, pct_plag=1)
        reduced_docs = _get_reduced_docs(self.atom_type, all_test_files, session)

        actuals = []
        confidences = []

        confidence_vectors = []
        for feature, weight in zip(self.features, feature_weights):
            vi = 0
            for doc in reduced_docs:
                feature_vectors = doc.get_feature_vectors([feature], session)
                confs = cluster(self.cluster_type, 2, feature_vectors)
                for confidence in confs:
                    if len(confidence_vectors) <= vi:
                        confidence_vectors.append([])
                    confidence_vectors[vi].append(confidence * weight)
                    vi += 1
                    
        for doc in reduced_docs:
            for span in doc._spans:
                actual = 1 if doc.span_is_plagiarized(span) else 0
                actuals.append(actual)

        for vec in confidence_vectors:
            confidences.append(min(1, sum(vec)))

        fpr, tpr, thresholds = sklearn.metrics.roc_curve(actuals, confidences, pos_label=1)
        roc_auc = sklearn.metrics.auc(fpr, tpr)
        print 'evaluated:', roc_auc, feature_weights
        return roc_auc
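# A minimal, self-contained sketch of the weighting scheme above: per-feature
# confidence scores for a few passages are combined with normalized weights and
# scored with ROC AUC. The feature rows, weights, and labels are invented for
# illustration; only sklearn.metrics is assumed.
import sklearn.metrics

per_feature_confidences = [
    [0.9, 0.2, 0.7, 0.1],   # hypothetical confidences from feature A, one per passage
    [0.6, 0.3, 0.8, 0.2],   # hypothetical confidences from feature B
]
feature_weights = [3.0, 1.0]

# Normalize the weights so they sum to 1, keeping each strictly positive.
weights_sum = float(sum(feature_weights))
feature_weights = [max(0.00001, w / weights_sum) for w in feature_weights]

# Weighted sum of the per-feature confidences, capped at 1 (as in the method above).
confidences = []
for passage_idx in range(4):
    total = sum(confs[passage_idx] * weight
                for confs, weight in zip(per_feature_confidences, feature_weights))
    confidences.append(min(1, total))

actuals = [1, 0, 1, 0]   # invented ground truth: 1 = plagiarized passage
fpr, tpr, thresholds = sklearn.metrics.roc_curve(actuals, confidences, pos_label=1)
print 'ROC AUC =', sklearn.metrics.auc(fpr, tpr)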
Example No. 2
def get_training_set_files():
    util = IntrinsicUtility()
    full_paths = util.get_n_training_files()
    relative_paths = util.get_relative_training_set(IntrinsicUtility.TRAINING_LOC)
    # Strip the leading '/' and replace the remaining '/'s with '-' to form URL-friendly names
    relative_paths = [r[1:].replace('/', '-') for r in relative_paths]

    return relative_paths, full_paths
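# For reference, a tiny illustration of the path rewriting above, using made-up
# relative paths:
relative_paths = ['/part1/suspicious-document00001', '/part2/suspicious-document01234']
url_names = [r[1:].replace('/', '-') for r in relative_paths]
print url_names   # ['part1-suspicious-document00001', 'part2-suspicious-document01234']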
Example No. 4
def batch_serialize(n=100):
    '''
    Writes CSV files ('serializations') of the passages parsed from the first
    <n> training files.
    '''
    out_dir = os.path.join(os.path.dirname(__file__), 'serialized')
    util = IntrinsicUtility()
    training_files = util.get_n_training_files(n, include_txt_extension=False)

    text_files = [t + '.txt' for t in training_files]
    xml_files = [t + '.xml' for t in training_files]
    out_files = [os.path.join(out_dir, os.path.basename(t) + '.csv') for t in training_files]

    for tf, xf, of in zip(text_files, xml_files, out_files):
        # Only populate if outfile doesn't already exist
        if not os.path.exists(of):
            print of, 'did not exist. Working on it now.'
            extract_and_serialize(tf, xf, of)
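# A rough sketch of the filename wiring used above, on invented training paths;
# only the standard-library os module is assumed.
import os

out_dir = 'serialized'
training_files = ['/corpus/part1/suspicious-document00001',
                  '/corpus/part1/suspicious-document00002']

text_files = [t + '.txt' for t in training_files]
xml_files = [t + '.xml' for t in training_files]
out_files = [os.path.join(out_dir, os.path.basename(t) + '.csv') for t in training_files]

for tf, xf, of in zip(text_files, xml_files, out_files):
    # Each (text, xml) pair maps to serialized/<basename>.csv, and is only
    # processed when the CSV does not already exist.
    print tf, xf, '->', of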
Example No. 5
def doc_lengths(thresh=35000):
    '''
    Prints the fraction of training documents that are longer than <thresh> characters
    '''
    util = IntrinsicUtility()
    training_docs = util.get_n_training_files()
    lengths = []
    long_enough = 0

    for fname in training_docs:
        with open(fname, 'rb') as f:
            text = f.read()

        lengths.append(len(text))
        if len(text) > thresh:
            long_enough += 1

    print float(long_enough) / len(training_docs), 'were long enough'
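# The same length-threshold statistic on an invented list of document lengths,
# for clarity:
lengths = [12000, 48000, 35500, 90000]
thresh = 35000
long_enough = sum(1 for n in lengths if n > thresh)
print float(long_enough) / len(lengths), 'were long enough'   # 0.75 were long enough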
Example No. 7
    def construct_and_train_nn(self, features, num_files, epochs, filepath,
                               session, num_hidden_layer_nodes):
        from plagcomps.evaluation.intrinsic import _get_reduced_docs

        IU = IntrinsicUtility()
        all_test_files = IU.get_n_training_files(n=num_files)
        reduced_docs = _get_reduced_docs("paragraph", all_test_files, session)

        print 'constructing datasets...'
        # dataset = self.construct_confidence_vectors_dataset(reduced_docs, features, session)
        dataset = self.read_dataset()
        training_dataset, testing_dataset = dataset.splitWithProportion(0.75)
        print 'dataset lengths:', len(dataset), len(training_dataset), len(
            testing_dataset)
        print

        print 'creating neural network...'
        # num_hidden_layer_nodes was left undefined in the original snippet; it
        # is taken as an explicit parameter above so this call resolves.
        net = self.create_nn(features, num_hidden_layer_nodes)

        print 'creating trainer...'
        trainer = self.create_trainer(net, training_dataset)

        print 'training neural network for', epochs, 'epochs...'
        trainer.trainEpochs(epochs)

        print 'writing neural network to ' + str(filepath) + '...'
        NetworkWriter.writeToFile(net, filepath)

        print 'testing neural network...'
        confidences = []
        actuals = []
        for point in testing_dataset:
            confidences.append(net.activate(point[0])[0])
            actuals.append(point[1][0])

        print 'confidences|actuals ', zip(confidences, actuals)

        print 'generating ROC curve...'
        matplotlib.use('pdf')
        path, auc = self.roc(confidences, actuals)
        print 'area under curve =', auc
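# A self-contained sketch of the PyBrain flow above (build a dataset, split it,
# train with backprop, write the network to disk, read activations back out) on
# toy XOR data. The layer sizes and output file name are assumptions, and
# nothing from plagcomps is used.
from pybrain.datasets import SupervisedDataSet
from pybrain.tools.shortcuts import buildNetwork
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.tools.customxml import NetworkWriter

# Toy dataset: 2 inputs, 1 binary target.
dataset = SupervisedDataSet(2, 1)
for a, b in [(0, 0), (0, 1), (1, 0), (1, 1)] * 25:
    dataset.addSample((a, b), (a ^ b,))

training_dataset, testing_dataset = dataset.splitWithProportion(0.75)

net = buildNetwork(2, 4, 1)                 # 4 hidden nodes, chosen arbitrarily
trainer = BackpropTrainer(net, training_dataset)
trainer.trainEpochs(50)

NetworkWriter.writeToFile(net, 'toy_net.xml')

confidences = [net.activate(point[0])[0] for point in testing_dataset]
actuals = [point[1][0] for point in testing_dataset]
print 'confidences|actuals ', zip(confidences, actuals)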
Example No. 9
def explore_training_corpus(n=1000):
    '''
    Collects document length, plagiarism percentage, and paragraph count for the
    first <n> training files and writes them to training_lengths.csv
    '''
    util = IntrinsicUtility()
    training_texts = util.get_n_training_files(n)
    training_xmls = [s.replace('txt', 'xml') for s in training_texts]

    file_lengths = []
    pct_plags = []
    total_paragraphs = []

    for text_file, xml_file in zip(training_texts, training_xmls):
        with open(text_file) as f:
            text = f.read()

        paragraphs_spans = tokenize(text, 'paragraph')
        num_paragraphs = len(paragraphs_spans)

        text_len = len(text)
        plag_spans = util.get_plagiarized_spans(xml_file)
        plag_len = sum([end - start for start, end in plag_spans])
        plag_pct = float(plag_len) / text_len

        file_lengths.append(text_len)
        pct_plags.append(plag_pct)
        total_paragraphs.append(num_paragraphs)

    #outfile = os.path.join(os.path.dirname(__file__), 'training_lengths.csv')
    outfile = 'training_lengths.csv'

    f = open(outfile, 'wb')
    f.write('file_num, length, pct_plag, num_paragraphs\n')

    for i in xrange(len(file_lengths)):
        line = '%i, %i, %f, %i\n' % (i, file_lengths[i], pct_plags[i],
                                     total_paragraphs[i])
        f.write(line)
    f.close()

    return zip(file_lengths, pct_plags)
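# A small illustration of the plagiarism-percentage arithmetic above, on
# invented character spans:
text_len = 10000
plag_spans = [(100, 600), (2000, 2500)]               # two plagiarized spans
plag_len = sum(end - start for start, end in plag_spans)
print float(plag_len) / text_len                      # 0.1, i.e. 10% plagiarized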