    def eval_func_confidences(self, feature_weights):
        weights_sum = float(sum(feature_weights))
        # "normalize" (I don't know if that's the right word) the weights, and make sure none are equal to 0
        feature_weights = [max(0.00001, x/weights_sum) for x in feature_weights]
        IU = IntrinsicUtility()
        all_test_files = IU.get_n_training_files(n=self.num_documents, first_doc_num=self.first_doc_num, min_len=35000, pct_plag=1)
        reduced_docs = _get_reduced_docs(self.atom_type, all_test_files, session)

        actuals = []
        confidences = []

        confidence_vectors = []
        # Accumulate each feature's weighted confidence for every passage; <vi>
        # indexes passages across all documents (document boundaries are ignored)
        for feature, weight in zip(self.features, feature_weights):
            vi = 0
            for doc in reduced_docs:
                feature_vectors = doc.get_feature_vectors([feature], session)
                confs = cluster(self.cluster_type, 2, feature_vectors)
                for confidence in confs:
                    if len(confidence_vectors) <= vi:
                        confidence_vectors.append([])
                    confidence_vectors[vi].append(confidence * weight)
                    vi += 1
                    
        for doc in reduced_docs:
            for span in doc._spans:
                actual = 1 if doc.span_is_plagiarized(span) else 0
                actuals.append(actual)

        for vec in confidence_vectors:
            confidences.append(min(1, sum(vec)))

        fpr, tpr, thresholds = sklearn.metrics.roc_curve(actuals, confidences, pos_label=1)
        roc_auc = sklearn.metrics.auc(fpr, tpr)
        print 'evaluated:', roc_auc, feature_weights
        return roc_auc
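For reference, a minimal toy sketch of the weighted fusion performed above: each feature's confidences are scaled by that feature's weight, summed per passage, and capped at 1. All numbers below are made up.

# Toy illustration of the weighted fusion above; all numbers are made up.
feature_weights = [0.5, 0.3, 0.2]       # normalized weights, none zero
per_feature_confs = [                   # rows: features, columns: passages
    [0.9, 0.1, 0.4],
    [0.7, 0.2, 0.3],
    [0.8, 0.0, 0.6],
]
fused = []
for passage_idx in range(3):
    weighted = sum(w * confs[passage_idx]
                   for w, confs in zip(feature_weights, per_feature_confs))
    fused.append(min(1, weighted))      # cap at 1, as above
print fused                             # approximately [0.82, 0.11, 0.41]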
Example #2
def get_training_set_files():
    util = IntrinsicUtility()
    full_paths = util.get_n_training_files()
    relative_paths = util.get_relative_training_set(IntrinsicUtility.TRAINING_LOC)
    # Strip the leading '/' and replace the remaining '/'s with '-' to make URL-friendly names
    relative_paths = [r[1:].replace('/', '-') for r in relative_paths]

    return relative_paths, full_paths
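The path-to-name transformation above, shown in isolation on a hypothetical relative path:

# Hypothetical relative path, for illustration only.
relative = '/part1/suspicious-document00001'
url_name = relative[1:].replace('/', '-')
print url_name                          # part1-suspicious-document00001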
Example #3
def extract_and_serialize(txt_file,
                          xml_file,
                          out_file,
                          atom_type='paragraph',
                          cluster_method='kmeans',
                          k=2):
    '''
    Runs the full intrinsic pipeline (feature extraction, clustering, etc.) and
    creates Passage objects for each passage in <txt_file>. Writes a CSV file
    to <out_file> containing all the features extracted from <txt_file>.

    The CSV files can easily be read by R in order to create plots.
    '''
    f = file(txt_file, 'r')
    text = f.read()
    f.close()

    util = IntrinsicUtility()

    feature_names = [
        'average_word_length', 'average_sentence_length',
        'stopword_percentage', 'punctuation_percentage',
        'syntactic_complexity', 'avg_internal_word_freq_class',
        'avg_external_word_freq_class'
    ]

    ext = FeatureExtractor(text)
    print 'Initialized extractor'
    # Note that passages don't know their ground truths yet
    passages = ext.get_passages(feature_names, atom_type)
    print 'Extracted passages'
    util.add_ground_truth_to_passages(passages, xml_file)

    feature_vecs = [p.features.values() for p in passages]

    # If just testing feature extraction, don't cluster passages
    if cluster_method != 'none':
        # Cluster the passages and set their confidences
        confidences = cluster(cluster_method, k, feature_vecs)
        for psg, conf in zip(passages, confidences):
            psg.set_plag_confidence(conf)

    f = file(out_file, 'wb')
    csv_writer = csv.writer(f)

    # Writes out the header for corresponding CSV
    csv_writer.writerow(IntrinsicPassage.serialization_header(feature_names))
    for p in passages:
        csv_writer.writerow(p.to_list(feature_names))
    f.close()
    print 'Finished writing', out_file
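A quick way to sanity-check the output. The file names below are hypothetical; the sketch only assumes the CSV layout written above (a header row followed by one row of feature values per passage).

import csv

# Hypothetical file names, for illustration only.
extract_and_serialize('suspicious-document00001.txt',
                      'suspicious-document00001.xml',
                      'suspicious-document00001.csv')

f = open('suspicious-document00001.csv', 'rb')
reader = csv.reader(f)
header = reader.next()      # column names from serialization_header()
first_row = reader.next()   # feature values for the first passage
f.close()
print header
print first_row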
def _get_feature_conf_and_actuals(features, cluster_type, atom_type, start_doc, n, pct_plag=None, **cluster_args):
    '''
    Returns a <num_passages> x <num_features> matrix in which
    mat[passage_num][feat_num] is the confidence that passage <passage_num> was
    plagiarized according to feature <feat_num>, along with a parallel list of
    ground-truth labels (1 if plagiarized, 0 otherwise) for each passage.

    Note that the transpose of this matrix is built below, and then transposed
    before returning.
    '''

    first_training_files = IntrinsicUtility().get_n_training_files(n, first_doc_num=start_doc, pct_plag=pct_plag)
    session = Session()
    reduced_docs = _get_reduced_docs(atom_type, first_training_files, session)

    actuals = []
    
    # feature_conf_matrix[feat][span_index] == Conf. that <span_index>
    # was plag. according to <feat>
    # NOTE that we're ignoring document boundaries in the storage of this 
    # matrix. So <span_index> is not relative to any document
    feature_conf_matrix = [[] for i in xrange(len(features))]
    
    for doc_index in xrange(len(reduced_docs)):
        if doc_index % 10 == 0:
            print 'Working on doc number (in training corpus)', start_doc + doc_index
        doc = reduced_docs[doc_index]
        spans = doc.get_spans()

        for feat_num in xrange(len(features)):
            feat = features[feat_num]
            feature_vecs = doc.get_feature_vectors([feat], session)
            # One column, i.e. confidence values for <feat> over all passages 
            # in <doc>
            confidences = cluster(cluster_type, 2, feature_vecs, **cluster_args)
            # extend flattens confidences across documents; use append (one
            # list per document) instead if document boundaries matter
            
            feature_conf_matrix[feat_num].extend(confidences)
            
        for span_index in xrange(len(spans)):
            span = spans[span_index]
            
            actuals.append(1 if doc.span_is_plagiarized(span) else 0)
            
    
    rotated = np.matrix(feature_conf_matrix).T

    return rotated, actuals
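One way the returned matrix can be consumed, e.g. scoring each feature separately with the same sklearn calls used elsewhere in this module. The argument values below are illustrative only.

import sklearn.metrics

# Illustrative arguments; the feature, cluster, and atom settings below all
# appear elsewhere in this module.
mat, actuals = _get_feature_conf_and_actuals(
    ['average_word_length', 'stopword_percentage'],
    'kmeans', 'paragraph', start_doc=0, n=10)

for feat_num in xrange(mat.shape[1]):
    confs = [mat[row, feat_num] for row in xrange(mat.shape[0])]
    fpr, tpr, _ = sklearn.metrics.roc_curve(actuals, confs, pos_label=1)
    print 'feature', feat_num, 'AUC =', sklearn.metrics.auc(fpr, tpr)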
def doc_lengths(thresh=35000):
    '''
    Prints the fraction of training documents that contain more than <thresh> characters
    '''
    util = IntrinsicUtility()
    training_docs = util.get_n_training_files()
    lengths = []
    long_enough = 0

    for fname in training_docs:
        f = file(fname, 'rb')
        text = f.read()
        f.close()

        lengths.append(len(text))
        if len(text) > thresh:
            long_enough += 1

    print float(long_enough) / len(training_docs), 'were long enough'
    def construct_and_train_nn(self, features, num_files, epochs, filepath,
                               session, num_hidden_layer_nodes):
        from plagcomps.evaluation.intrinsic import _get_reduced_docs

        IU = IntrinsicUtility()
        all_test_files = IU.get_n_training_files(n=num_files)
        reduced_docs = _get_reduced_docs("paragraph", all_test_files, session)

        print 'constructing datasets...'
        # dataset = self.construct_confidence_vectors_dataset(reduced_docs, features, session)
        dataset = self.read_dataset()
        training_dataset, testing_dataset = dataset.splitWithProportion(0.75)
        print 'dataset lengths:', len(dataset), len(training_dataset), len(
            testing_dataset)
        print

        print 'creating neural network...'
        net = self.create_nn(features, num_hidden_layer_nodes)

        print 'creating trainer...'
        trainer = self.create_trainer(net, training_dataset)

        print 'training neural network for', epochs, 'epochs...'
        trainer.trainEpochs(epochs)

        print 'writing neural network to ' + str(filepath) + '...'
        NetworkWriter.writeToFile(net, filepath)

        print 'testing neural network...'
        confidences = []
        actuals = []
        for point in testing_dataset:
            confidences.append(net.activate(point[0])[0])
            actuals.append(point[1][0])

        print 'confidences|actuals ', zip(confidences, actuals)

        print 'generating ROC curve...'
        matplotlib.use('pdf')
        path, auc = self.roc(confidences, actuals)
        print 'area under curve =', auc
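A sketch of reusing the serialized network later, assuming PyBrain's NetworkReader; the file name and input vector below are hypothetical.

from pybrain.tools.customxml.networkreader import NetworkReader

# Reload the network written above. The file name and the input vector are
# made up; the vector's length must match the network's input layer.
net = NetworkReader.readFrom('intrinsic_net.xml')
sample_point = [0.4, 0.7, 0.1]
print 'plag confidence:', net.activate(sample_point)[0]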
Example #7
def batch_serialize(n=100):
    '''
    Writes CSV files ('serializations') of the passages parsed from the first
    <n> training files
    '''
    out_dir = os.path.join(os.path.dirname(__file__), 'serialized')
    util = IntrinsicUtility()
    training_files = util.get_n_training_files(n, include_txt_extension=False)

    text_files = [t + '.txt' for t in training_files]
    xml_files = [t + '.xml' for t in training_files]
    out_files = [
        os.path.join(out_dir,
                     os.path.basename(t) + '.csv') for t in training_files
    ]

    for tf, xf, of in zip(text_files, xml_files, out_files):
        # Only populate if outfile doesn't already exist
        if not os.path.exists(of):
            print of, 'did not exist. Working on it now.'
            extract_and_serialize(tf, xf, of)
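A minimal usage sketch: serialize the first 10 training documents (existing CSVs are skipped above) and list whatever ends up in the 'serialized' directory.

import glob
import os

batch_serialize(n=10)
serialized_dir = os.path.join(os.path.dirname(__file__), 'serialized')
print sorted(glob.glob(os.path.join(serialized_dir, '*.csv')))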
def explore_training_corpus(n=1000):
    '''
    For the first <n> training documents, records each document's length, the
    percentage of its characters that are plagiarized, and its number of
    paragraphs. Writes the results to a CSV file and returns (length, pct_plag)
    pairs.
    '''
    util = IntrinsicUtility()
    training_texts = util.get_n_training_files(n)
    training_xmls = [s.replace('.txt', '.xml') for s in training_texts]

    file_lengths = []
    pct_plags = []
    total_paragraphs = []

    for text_file, xml_file in zip(training_texts, training_xmls):
        with file(text_file) as f:
            text = f.read()

        paragraphs_spans = tokenize(text, 'paragraph')
        num_paragraphs = len(paragraphs_spans)

        text_len = len(text)
        plag_spans = util.get_plagiarized_spans(xml_file)
        plag_len = sum([end - start for start, end in plag_spans])
        plag_pct = float(plag_len) / text_len

        file_lengths.append(text_len)
        pct_plags.append(plag_pct)
        total_paragraphs.append(num_paragraphs)

    #outfile = os.path.join(os.path.dirname(__file__), 'training_lengths.csv')
    outfile = 'training_lengths.csv'

    f = file(outfile, 'wb')
    f.write('file_num, length, pct_plag, num_paragraphs\n')

    for i in xrange(len(file_lengths)):
        line = '%i, %i, %f, %i\n' % (i, file_lengths[i], pct_plags[i],
                                     total_paragraphs[i])
        f.write(line)
    f.close()

    return zip(file_lengths, pct_plags)
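The returned (length, pct_plag) pairs can be summarized directly, e.g.:

# Summarize the (length, pct_plag) pairs returned above.
stats = explore_training_corpus(n=200)
lengths = [length for length, _ in stats]
pcts = [pct for _, pct in stats]
print 'average length:', sum(lengths) / float(len(lengths))
print 'average pct plagiarized:', sum(pcts) / len(pcts)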