def eval_func_confidences(self, feature_weights):
        weights_sum = float(sum(feature_weights))
        # "normalize" (I don't know if that's the right word) the weights, and make sure none are equal to 0
        feature_weights = [max(0.00001, x/weights_sum) for x in feature_weights]
        IU = IntrinsicUtility()
        all_test_files = IU.get_n_training_files(n=self.num_documents, first_doc_num=self.first_doc_num, min_len=35000, pct_plag=1)
        reduced_docs = _get_reduced_docs(self.atom_type, all_test_files, session)

        actuals = []
        confidences = []

        confidence_vectors = []
        for feature, weight in zip(self.features, feature_weights):
            vi = 0
            for doc in reduced_docs:
                feature_vectors = doc.get_feature_vectors([feature], session)
                confs = cluster(self.cluster_type, 2, feature_vectors)
                for confidence in confs:
                    if len(confidence_vectors) <= vi:
                        confidence_vectors.append([])
                    confidence_vectors[vi].append(confidence * weight)
                    vi += 1
                    
        for doc in reduced_docs:
            for span in doc._spans:
                actual = 1 if doc.span_is_plagiarized(span) else 0
                actuals.append(actual)

        for vec in confidence_vectors:
            confidences.append(min(1, sum(vec)))

        fpr, tpr, thresholds = sklearn.metrics.roc_curve(actuals, confidences, pos_label=1)
        roc_auc = sklearn.metrics.auc(fpr, tpr)
        print 'evaluated:', roc_auc, feature_weights
        return roc_auc
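Because eval_func_confidences maps a weight vector to a single ROC AUC score, it can be plugged into an off-the-shelf optimizer. The sketch below is an assumption, not part of the original module: it takes a hypothetical `tuner` object exposing eval_func_confidences and searches for weights by minimizing the negated AUC with scipy's Nelder-Mead method.

import scipy.optimize

def tune_feature_weights(tuner, num_features):
    # Start from uniform weights; eval_func_confidences re-normalizes internally.
    initial_weights = [1.0 / num_features] * num_features

    def objective(weights):
        # Minimize the negative AUC returned by eval_func_confidences.
        return -tuner.eval_func_confidences(list(weights))

    result = scipy.optimize.minimize(objective, initial_weights, method='Nelder-Mead')
    return list(result.x), -result.fun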
Example no. 2
def get_training_set_files():
    util = IntrinsicUtility()
    full_paths = util.get_n_training_files()
    relative_paths = util.get_relative_training_set(IntrinsicUtility.TRAINING_LOC)
    # Strip the leading '/' and replace the remaining '/'s with '-' so the path can be used in a URL
    relative_paths = [r[1:].replace('/', '-') for r in relative_paths]

    return relative_paths, full_paths
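A small usage sketch (hypothetical, not from the original code): pair the URL-safe relative names with their full paths, e.g. to look up a document by the name that appears in a URL.

relative_paths, full_paths = get_training_set_files()
name_to_path = dict(zip(relative_paths, full_paths))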
Example no. 4
def extract_and_serialize(txt_file, xml_file, out_file, atom_type='paragraph',
                          cluster_method='kmeans', k=2):
    '''
    Runs the full intrinsic pipeline (feature extraction, clustering, etc.) and creates
    Passage objects for each passage in <txt_file>. Writes a CSV file to <out_file>
    containing all the features extracted from <txt_file>.

    The CSV files can be read easily by R in order to create plots.
    '''
    f = file(txt_file, 'r')
    text = f.read()
    f.close()

    util = IntrinsicUtility() 

    feature_names = [
        'average_word_length',
        'average_sentence_length',
        'stopword_percentage',
        'punctuation_percentage',
        'syntactic_complexity',
        'avg_internal_word_freq_class',
        'avg_external_word_freq_class'
    ]
   

    ext = FeatureExtractor(text)
    print 'Initialized extractor'
    # Note that passages don't know their ground truths yet
    passages = ext.get_passages(feature_names, atom_type)
    print 'Extracted passages'
    util.add_ground_truth_to_passages(passages, xml_file)

    feature_vecs = [p.features.values() for p in passages]

    # If just testing feature extraction, don't cluster passages
    if cluster_method != 'none':
        # Cluster the passages and set their confidences
        confidences = cluster(cluster_method, k, feature_vecs)
        for psg, conf in zip(passages, confidences):
            psg.set_plag_confidence(conf)

    f = file(out_file, 'wb')
    csv_writer = csv.writer(f)

    # Writes out the header for corresponding CSV
    csv_writer.writerow(IntrinsicPassage.serialization_header(feature_names))
    for p in passages:
        csv_writer.writerow(p.to_list(feature_names))
    f.close()
    print 'Finished writing', out_file
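A hypothetical usage sketch; the file paths below are placeholders, and the read-back simply mirrors the header row written via IntrinsicPassage.serialization_header.

import csv

extract_and_serialize('suspicious-document.txt', 'suspicious-document.xml',
                      'suspicious-document.csv')

# Read the serialized passages back as dictionaries keyed by the CSV header.
with open('suspicious-document.csv', 'rb') as f:
    reader = csv.reader(f)
    header = next(reader)
    passages = [dict(zip(header, row)) for row in reader]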
Example no. 5
def extract_and_serialize(txt_file,
                          xml_file,
                          out_file,
                          atom_type='paragraph',
                          cluster_method='kmeans',
                          k=2):
    '''
    Runs the full intrinsic pipeline (feature extraction, clustering, etc.) and creates
    Passage objects for each passage in <txt_file>. Writes a CSV file to <out_file>
    containing all the features extracted from <txt_file>.

    The CSV files can be read easily by R in order to create plots.
    '''
    f = file(txt_file, 'r')
    text = f.read()
    f.close()

    util = IntrinsicUtility()

    feature_names = [
        'average_word_length', 'average_sentence_length',
        'stopword_percentage', 'punctuation_percentage',
        'syntactic_complexity', 'avg_internal_word_freq_class',
        'avg_external_word_freq_class'
    ]

    ext = FeatureExtractor(text)
    print 'Initialized extractor'
    # Note that passages don't know their ground truths yet
    passages = ext.get_passages(feature_names, atom_type)
    print 'Extracted passages'
    util.add_ground_truth_to_passages(passages, xml_file)

    feature_vecs = [p.features.values() for p in passages]

    # If just testing feature extraction, don't cluster passages
    if cluster_method != 'none':
        # Cluster the passages and set their confidences
        confidences = cluster(cluster_method, k, feature_vecs)
        for psg, conf in zip(passages, confidences):
            psg.set_plag_confidence(conf)

    f = file(out_file, 'wb')
    csv_writer = csv.writer(f)

    # Writes out the header for corresponding CSV
    csv_writer.writerow(IntrinsicPassage.serialization_header(feature_names))
    for p in passages:
        csv_writer.writerow(p.to_list(feature_names))
    f.close()
    print 'Finished writing', out_file
Example no. 6
def batch_serialize(n=100):
    '''
    Writes csv files ('serializations') of the passages parsed from first <n>
    training files 
    '''
    out_dir = os.path.join(os.path.dirname(__file__), 'serialized')
    util = IntrinsicUtility()
    training_files = util.get_n_training_files(n, include_txt_extension=False)

    text_files = [t + '.txt' for t in training_files]
    xml_files = [t + '.xml' for t in training_files]
    out_files = [os.path.join(out_dir, os.path.basename(t) + '.csv') for t in training_files]

    for tf, xf, of in zip(text_files, xml_files, out_files):
        # Only populate if outfile doesn't already exist
        if not os.path.exists(of):
            print of, 'did not exist. Working on it now.'
            extract_and_serialize(tf, xf, of)
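Because batch_serialize skips output files that already exist, it is safe to re-run with a larger n without redoing earlier work; a hypothetical usage:

# First pass over 100 files, then extend to 500; existing CSVs are skipped.
batch_serialize(n=100)
batch_serialize(n=500)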
Example no. 7
def doc_lengths(thresh=35000):
    '''
    Prints the fraction of training documents that contain more than <thresh> characters.
    '''
    util = IntrinsicUtility()
    training_docs = util.get_n_training_files()
    lengths = []
    long_enough = 0

    for fname in training_docs:
        f = file(fname, 'rb')
        text = f.read()
        f.close()

        lengths.append(len(text))
        if len(text) > thresh:
            long_enough += 1

    print float(long_enough) / len(training_docs), 'were long enough'
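doc_lengths collects every document length but only reports the fraction above the threshold. A possible extension (an assumption, not in the original) is a small helper that summarizes the collected lengths:

def summarize_lengths(lengths):
    # Report simple distribution statistics for the collected document lengths.
    ordered = sorted(lengths)
    median = ordered[len(ordered) // 2]
    mean = float(sum(ordered)) / len(ordered)
    print 'median length:', median, 'mean length:', mean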
Example no. 8
def doc_lengths(thresh=35000):
    """
	Prints the pct. of documents which contain at least <thresh> characters
	"""
    util = IntrinsicUtility()
    training_docs = util.get_n_training_files()
    lengths = []
    long_enough = 0

    for fname in training_docs:
        f = file(fname, "rb")
        text = f.read()
        f.close()

        lengths.append(len(text))
        if len(text) > thresh:
            long_enough += 1

    print float(long_enough) / len(training_docs), "were long enough"
Example no. 9
    def construct_and_train_nn(self, features, num_files, epochs, filepath,
                               session):
        from plagcomps.evaluation.intrinsic import _get_reduced_docs

        IU = IntrinsicUtility()
        all_test_files = IU.get_n_training_files(n=num_files)
        reduced_docs = _get_reduced_docs("paragraph", all_test_files, session)

        print 'constructing datasets...'
        # dataset = self.construct_confidence_vectors_dataset(reduced_docs, features, session)
        dataset = self.read_dataset()
        training_dataset, testing_dataset = dataset.splitWithProportion(0.75)
        print 'dataset lengths:', len(dataset), len(training_dataset), len(
            testing_dataset)
        print

        print 'creating neural network...'
        # num_hidden_layer_nodes is assumed to be defined elsewhere (e.g. a module-level constant)
        net = self.create_nn(features, num_hidden_layer_nodes)

        print 'creating trainer...'
        trainer = self.create_trainer(net, training_dataset)

        print 'training neural network for', epochs, 'epochs...'
        trainer.trainEpochs(epochs)

        print 'writing neural network to ' + str(filepath) + '...'
        NetworkWriter.writeToFile(net, filepath)

        print 'testing neural network...'
        confidences = []
        actuals = []
        for point in testing_dataset:
            confidences.append(net.activate(point[0])[0])
            actuals.append(point[1][0])

        print 'confidences|actuals ', zip(confidences, actuals)

        print 'generating ROC curve...'
        matplotlib.use('pdf')
        path, auc = self.roc(confidences, actuals)
        print 'area under curve =', auc
Example no. 10
def batch_serialize(n=100):
    '''
    Writes csv files ('serializations') of the passages parsed from first <n>
    training files 
    '''
    out_dir = os.path.join(os.path.dirname(__file__), 'serialized')
    util = IntrinsicUtility()
    training_files = util.get_n_training_files(n, include_txt_extension=False)

    text_files = [t + '.txt' for t in training_files]
    xml_files = [t + '.xml' for t in training_files]
    out_files = [
        os.path.join(out_dir,
                     os.path.basename(t) + '.csv') for t in training_files
    ]

    for tf, xf, of in zip(text_files, xml_files, out_files):
        # Only populate if outfile doesn't already exist
        if not os.path.exists(of):
            print of, 'did not exist. Working on it now.'
            extract_and_serialize(tf, xf, of)
Example no. 11
def explore_training_corpus(n=1000):
    '''
    Summarizes the first <n> training documents: records each document's length,
    percentage of plagiarized text, and paragraph count, and writes them to
    training_lengths.csv.
    '''
    util = IntrinsicUtility()
    training_texts = util.get_n_training_files(n)
    training_xmls = [s.replace('txt', 'xml') for s in training_texts]

    file_lengths = []
    pct_plags = []
    total_paragraphs = []

    for text_file, xml_file in zip(training_texts, training_xmls):
        with file(text_file) as f:
            text = f.read()

        paragraphs_spans = tokenize(text, 'paragraph')
        num_paragraphs = len(paragraphs_spans)

        text_len = len(text)
        plag_spans = util.get_plagiarized_spans(xml_file)
        plag_len = sum([end - start for start, end in plag_spans])
        plag_pct = float(plag_len) / text_len

        file_lengths.append(text_len)
        pct_plags.append(plag_pct)
        total_paragraphs.append(num_paragraphs)

    #outfile = os.path.join(os.path.dirname(__file__), 'training_lengths.csv')
    outfile = 'training_lengths.csv'

    f = file(outfile, 'wb')
    f.write('file_num, length, pct_plag, num_paragraphs\n')

    for i in xrange(len(file_lengths)):
        line = '%i, %i, %f, %i\n' % (i, file_lengths[i], pct_plags[i],
                                     total_paragraphs[i])
        f.write(line)
    f.close()

    return zip(file_lengths, pct_plags)
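A hypothetical follow-up: read training_lengths.csv back and report how many documents clear the 35000-character minimum used by eval_func_confidences above.

import csv

with open('training_lengths.csv', 'rb') as f:
    reader = csv.DictReader(f, skipinitialspace=True)
    rows = list(reader)

long_docs = [r for r in rows if int(r['length']) >= 35000]
print len(long_docs), 'of', len(rows), 'documents meet the length threshold'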
Example no. 12
def explore_training_corpus(n=1000):
    """
	"""
    util = IntrinsicUtility()
    training_texts = util.get_n_training_files(n)
    training_xmls = [s.replace("txt", "xml") for s in training_texts]

    file_lengths = []
    pct_plags = []
    total_paragraphs = []

    for text_file, xml_file in zip(training_texts, training_xmls):
        with file(text_file) as f:
            text = f.read()

        paragraphs_spans = tokenize(text, "paragraph")
        num_paragraphs = len(paragraphs_spans)

        text_len = len(text)
        plag_spans = util.get_plagiarized_spans(xml_file)
        plag_len = sum([end - start for start, end in plag_spans])
        plag_pct = float(plag_len) / text_len

        file_lengths.append(text_len)
        pct_plags.append(plag_pct)
        total_paragraphs.append(num_paragraphs)

    # outfile = os.path.join(os.path.dirname(__file__), 'training_lengths.csv')
    outfile = "training_lengths.csv"

    f = file(outfile, "wb")
    f.write("file_num, length, pct_plag, num_paragraphs\n")

    for i in xrange(len(file_lengths)):
        line = "%i, %i, %f, %i\n" % (i, file_lengths[i], pct_plags[i], total_paragraphs[i])
        f.write(line)
    f.close()

    return zip(file_lengths, pct_plags)
Example no. 13
    def construct_and_train_nn(self, features, num_files, epochs, filepath, session):
        from plagcomps.evaluation.intrinsic import _get_reduced_docs

        IU = IntrinsicUtility()
        all_test_files = IU.get_n_training_files(n=num_files)
        reduced_docs = _get_reduced_docs("paragraph", all_test_files, session)
        
        print 'constructing datasets...'
        # dataset = self.construct_confidence_vectors_dataset(reduced_docs, features, session)
        dataset = self.read_dataset()
        training_dataset, testing_dataset = dataset.splitWithProportion(0.75)
        print 'dataset lengths:', len(dataset), len(training_dataset), len(testing_dataset)
        print

        print 'creating neural network...'
        # num_hidden_layer_nodes is assumed to be defined elsewhere (e.g. a module-level constant)
        net = self.create_nn(features, num_hidden_layer_nodes)

        print 'creating trainer...'
        trainer = self.create_trainer(net, training_dataset)

        print 'training neural network for', epochs, 'epochs...'
        trainer.trainEpochs(epochs)

        print 'writing neural network to ' + str(filepath) + '...'
        NetworkWriter.writeToFile(net, filepath)

        print 'testing neural network...'
        confidences = []
        actuals = []
        for point in testing_dataset:
            confidences.append(net.activate(point[0])[0])
            actuals.append(point[1][0])

        print 'confidences|actuals ', zip(confidences, actuals)

        print 'generating ROC curve...'
        matplotlib.use('pdf')
        path, auc = self.roc(confidences, actuals)
        print 'area under curve =', auc
def _get_feature_conf_and_actuals(features, cluster_type, atom_type, start_doc, n, pct_plag=None, **cluster_args):
    '''
    Returns a matrix of dimension <num_passages> x <num_features> where each row holds
    the confidence that the corresponding passage was plagiarized according to each
    feature. In other words, mat[passage_num][feat_num] is the plag. confidence of
    <passage_num> according to <feat_num>.

    Note that the transpose of this matrix is built below and transposed before returning.
    '''

    first_training_files = IntrinsicUtility().get_n_training_files(n, first_doc_num=start_doc, pct_plag=pct_plag)
    session = Session()
    reduced_docs = _get_reduced_docs(atom_type, first_training_files, session)

    actuals = []
    
    # feature_conf_matrix[feat][span_index] == Conf. that <span_index>
    # was plag. according to <feat>
    # NOTE that we're ignoring document boundaries in the storage of this 
    # matrix. So <span_index> is not relative to any document
    feature_conf_matrix = [[] for i in xrange(len(features))]
    
    for doc_index in xrange(len(reduced_docs)):
        if doc_index % 10 == 0:
            print 'Working on doc number (in training corpus)', start_doc + doc_index
        doc = reduced_docs[doc_index]
        spans = doc.get_spans()

        for feat_num in xrange(len(features)):
            feat = features[feat_num]
            feature_vecs = doc.get_feature_vectors([feat], session)
            # One column, i.e. confidence values for <feat> over all passages 
            # in <doc>
            confidences = cluster(cluster_type, 2, feature_vecs, **cluster_args)
            # Use append if we care about document_num
            
            feature_conf_matrix[feat_num].extend(confidences)
            
        for span_index in xrange(len(spans)):
            span = spans[span_index]
            
            actuals.append(1 if doc.span_is_plagiarized(span) else 0)
            
    
    rotated = np.matrix(feature_conf_matrix).T

    return rotated, actuals
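A minimal sketch (not part of the original module) of one way to score the returned matrix: average the per-feature confidences for each passage and compute the ROC AUC, mirroring eval_func_confidences above.

import numpy as np
import sklearn.metrics

def score_feature_matrix(conf_matrix, actuals):
    # conf_matrix is <num_passages> x <num_features>; average across features.
    combined = np.asarray(conf_matrix).mean(axis=1)
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(actuals, combined, pos_label=1)
    return sklearn.metrics.auc(fpr, tpr)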
def main(m, training_percent=0.7):
    random.seed(1337)

    suspects_base_path = "/copyCats/pan-plagiarism-corpus-2009/external-detection-corpus/suspicious-documents/"
    suspects_dirs = ["part1/", "part2/", "part3/", "part4/", "part5/", "part6/", "part7/", "part8/"]
    sources_base_path = "/copyCats/pan-plagiarism-corpus-2009/external-detection-corpus/source-documents/"
    sources_dirs = ["part1/", "part2/", "part3/", "part4/", "part5/", "part6/", "part7/", "part8/"]

    # Base file names, without extensions
    all_base_files = []
    # List of (txt_path, xml_path) tuples holding absolute paths
    all_files = []

    # Put all the suspect files in a list
    for d in suspects_dirs:
        p = os.path.join(suspects_base_path, d)
        for f in os.listdir(p):
            all_base_files.append(os.path.splitext(f)[0])

            if f[-4:] == ".txt":
                all_files.append((p+f, (p+f)[:-4]+".xml"))
    
    # Make sure all of these files actually exist
    worked = True
    for suspect in all_files:
        if not os.path.exists(suspect[0]):
            worked = False
            print ".txt file does not exist:", suspect[0]
        if not os.path.exists(suspect[1]):
            worked = False
            print ".xml file does not exist:", suspect[1]
    assert(worked)

    # shuffle and take files from the front of the list
    print 'Shuffling', len(all_files), 'suspect files...'
    random.shuffle(all_files)

    print 'Grabbing all valid suspects...'
    # Grab every suspect file that contains plagiarism and is short enough
    training_suspect_partition = [] 
    for filepaths in all_files:
        plag_spans = IntrinsicUtility.get_plagiarized_spans(filepaths[1])
        if len(plag_spans) > 0:
            # Skip documents longer than m paragraphs
            f = open(filepaths[0], 'r')
            text = f.read()
            f.close()
            paragraphs = tokenize(text, 'paragraph')
            if len(paragraphs) > m:
                continue

            training_suspect_partition.append(filepaths)
            if len(training_suspect_partition) % 10 == 0:
                print len(training_suspect_partition)

    print len(training_suspect_partition)

    # print 'Writing partitions to disk...'
    # suspect_training_file = file("crisp_extrinsic_training_suspect_files.txt", 'w')
    # for suspect in training_suspect_partition:
    #     rel_path_start = suspect[0].index('/part')
    #     suspect_training_file.write(suspect[0][rel_path_start:-4] + '\n')
    # suspect_training_file.close()


    print 'Determining source documents for training partition...'
    training_sources = {}
    training_sources_suspects = {}
    num_files = 0
    for filenames in training_suspect_partition:
        tree = ET.parse(filenames[1])
        for feature in tree.iter("feature"):
            if feature.get("name") == "artificial-plagiarism" and feature.get("source_reference") and feature.get("source_reference")[:-4] not in training_sources:
                # Figure out which partX directory the source document lives in
                for p in sources_dirs:
                    if os.path.exists(sources_base_path + p + feature.get("source_reference")):
                        short_name = "/" + p + feature.get("source_reference")[:-4]
                        long_name = sources_base_path + p + feature.get("source_reference")
                        training_sources[short_name] = 1
                        if filenames[1] not in training_sources_suspects:
                            training_sources_suspects[filenames[1]] = [long_name]
                        else:
                            training_sources_suspects[filenames[1]].append(long_name)

        num_files += 1
        if num_files%100 == 0:
            print num_files,
            sys.stdout.flush()
    print
    print len(training_sources.keys()), 'sources for the training partition were found...'

    print 'Removing invalid suspects because of short sources...'
    # get rid of the ones that are too long...
    final_training_suspect_partition = []
    for _, xml in training_suspect_partition:
        # Are all of its sources at most m paragraphs long?
        short_enough = True
        for source_filename in training_sources_suspects[xml]:
            f = open(source_filename, 'r')
            text = f.read()
            f.close()
            paragraphs = tokenize(text, 'paragraph')
            if len(paragraphs) > m:
                short_enough = False
                break
        if short_enough:
            final_training_suspect_partition.append(xml)

    print 'Constructing final source partition...'
    final_training_source_partition = []
    for suspect in final_training_suspect_partition:
        for long_name in training_sources_suspects[suspect]:
            short_name = '/' + re.sub(sources_base_path, '', long_name)
            if short_name not in final_training_source_partition:
                final_training_source_partition.append(short_name)

    print 'Converting suspect names...'
    final_training_suspect_partition = ['/' + re.sub('.xml', '', re.sub(suspects_base_path, '', xml)) for xml in final_training_suspect_partition]

    print len(final_training_suspect_partition), final_training_suspect_partition
    print len(final_training_source_partition), final_training_source_partition

    print 'Writing suspect documents to disk...'
    suspects_training_file = file("crisp_corpus_suspect_files.txt", 'w')
    for filename in final_training_suspect_partition:
        suspects_training_file.write(filename + '\n')
    suspects_training_file.close()

    print 'Writing source documents to disk...'
    sources_training_file = file("crisp_corpus_source_files.txt", 'w')
    for filename in final_training_source_partition:
        sources_training_file.write(filename + '\n')
    sources_training_file.close()
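A hypothetical sketch of reading back the partition files that main() writes, e.g. to feed them into a later training step.

def read_partition(path):
    # Each line holds one relative document path written by main().
    with open(path, 'r') as f:
        return [line.strip() for line in f if line.strip()]

suspects = read_partition('crisp_corpus_suspect_files.txt')
sources = read_partition('crisp_corpus_source_files.txt')
print len(suspects), 'suspects,', len(sources), 'sources'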