Example #1
def explore_training_corpus(n=1000):
    """Collect length, plagiarism-percentage, and paragraph-count statistics
    for the first n training files, write them to training_lengths.csv, and
    return a list of (file_length, pct_plag) tuples.

    Relies on IntrinsicUtility and tokenize from the surrounding project.
    """
    util = IntrinsicUtility()
    training_texts = util.get_n_training_files(n)
    training_xmls = [s.replace('.txt', '.xml') for s in training_texts]

    file_lengths = []
    pct_plags = []
    total_paragraphs = []

    for text_file, xml_file in zip(training_texts, training_xmls):
        with open(text_file) as f:
            text = f.read()

        paragraphs_spans = tokenize(text, 'paragraph')
        num_paragraphs = len(paragraphs_spans)

        text_len = len(text)
        plag_spans = util.get_plagiarized_spans(xml_file)
        plag_len = sum(end - start for start, end in plag_spans)
        plag_pct = plag_len / text_len

        file_lengths.append(text_len)
        pct_plags.append(plag_pct)
        total_paragraphs.append(num_paragraphs)

    #outfile = os.path.join(os.path.dirname(__file__), 'training_lengths.csv')
    outfile = 'training_lengths.csv'

    with open(outfile, 'w') as f:
        f.write('file_num, length, pct_plag, num_paragraphs\n')
        for i in range(len(file_lengths)):
            f.write('%i, %i, %f, %i\n' % (i, file_lengths[i], pct_plags[i],
                                          total_paragraphs[i]))

    return list(zip(file_lengths, pct_plags))
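
A minimal usage sketch, assuming the project's IntrinsicUtility and tokenize are importable and the training corpus is on disk; the 0.1 threshold is purely illustrative:

stats = explore_training_corpus(n=100)
heavily_plagiarized = [(length, pct) for length, pct in stats if pct > 0.1]
print('%d of %d files are more than 10%% plagiarized'
      % (len(heavily_plagiarized), len(stats)))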
Example #2
import os
import random
import re
import sys
import xml.etree.ElementTree as ET

# IntrinsicUtility and tokenize are assumed to come from the surrounding project.


def main(m, training_percent=0.7):
    """Partition the PAN 2009 external-detection corpus into training suspect
    and source files, keeping only documents of at most m paragraphs.
    (training_percent is currently unused.)
    """
    random.seed(1337)

    suspects_base_path = "/copyCats/pan-plagiarism-corpus-2009/external-detection-corpus/suspicious-documents/"
    suspects_dirs = ["part1/", "part2/", "part3/", "part4/", "part5/", "part6/", "part7/", "part8/"]
    sources_base_path = "/copyCats/pan-plagiarism-corpus-2009/external-detection-corpus/source-documents/"
    sources_dirs = ["part1/", "part2/", "part3/", "part4/", "part5/", "part6/", "part7/", "part8/"]

    all_base_files = []  # file names without extensions
    all_files = []  # (txt_path, xml_path) tuples of absolute paths

    # Put all the suspect files in a list
    for d in suspects_dirs:
        p = os.path.join(suspects_base_path, d)
        for f in os.listdir(p):
            all_base_files.append(os.path.splitext(f)[0])

            if f.endswith(".txt"):
                all_files.append((p + f, (p + f)[:-4] + ".xml"))
    
    # Make sure all of these files actually exist
    worked = True
    for suspect in all_files:
        if not os.path.exists(suspect[0]):
            worked = False
            print(".txt file does not exist:", suspect[0])
        if not os.path.exists(suspect[1]):
            worked = False
            print(".xml file does not exist:", suspect[1])
    assert worked

    # shuffle and take files from the front of the list
    print('Shuffling', len(all_files), 'suspect files...')
    random.shuffle(all_files)

    print('Grabbing all valid suspects...')
    # keep every suspect that contains plagiarism and is at most m paragraphs long
    training_suspect_partition = []
    for filepaths in all_files:
        plag_spans = IntrinsicUtility.get_plagiarized_spans(filepaths[1])
        if len(plag_spans) > 0:
            # skip documents longer than m paragraphs
            with open(filepaths[0], 'r') as f:
                text = f.read()
            paragraphs = tokenize(text, 'paragraph')
            if len(paragraphs) > m:
                continue

            training_suspect_partition.append(filepaths)
            if len(training_suspect_partition) % 10 == 0:
                print(len(training_suspect_partition))

    print(len(training_suspect_partition))

    # print 'Writing partitions to disk...'
    # suspect_training_file = file("crisp_extrinsic_training_suspect_files.txt", 'w')
    # for suspect in training_suspect_partition:
    #     rel_path_start = suspect[0].index('/part')
    #     suspect_training_file.write(suspect[0][rel_path_start:-4] + '\n')
    # suspect_training_file.close()


    print('Determining source documents for training partition...')
    training_sources = {}
    training_sources_suspects = {}
    num_files = 0
    for filenames in training_suspect_partition:
        tree = ET.parse(filenames[1])
        for feature in tree.iter("feature"):
            source_ref = feature.get("source_reference")
            if feature.get("name") == "artificial-plagiarism" and source_ref:
                # figure out which partX directory holds the source document
                for p in sources_dirs:
                    if os.path.exists(sources_base_path + p + source_ref):
                        short_name = "/" + p + source_ref[:-4]
                        long_name = sources_base_path + p + source_ref
                        training_sources[short_name] = 1
                        suspect_sources = training_sources_suspects.setdefault(filenames[1], [])
                        if long_name not in suspect_sources:
                            suspect_sources.append(long_name)

        num_files += 1
        if num_files % 100 == 0:
            print(num_files, end=' ')
            sys.stdout.flush()
    print()
    print(len(training_sources), 'sources for the training partition were found...')

    print('Removing suspects whose sources are too long...')
    final_training_suspect_partition = []
    for _, xml in training_suspect_partition:
        # keep the suspect only if every one of its sources is at most m paragraphs
        short_enough = True
        for source_filename in training_sources_suspects.get(xml, []):
            with open(source_filename, 'r') as f:
                text = f.read()
            paragraphs = tokenize(text, 'paragraph')
            if len(paragraphs) > m:
                short_enough = False
                break
        if short_enough:
            final_training_suspect_partition.append(xml)

    print('Constructing final source partition...')
    final_training_source_partition = []
    for suspect in final_training_suspect_partition:
        for long_name in training_sources_suspects.get(suspect, []):
            # plain string replacement; re.sub would treat the path as a regex
            short_name = '/' + long_name.replace(sources_base_path, '')
            if short_name not in final_training_source_partition:
                final_training_source_partition.append(short_name)

    print('Converting suspect names...')
    final_training_suspect_partition = ['/' + xml.replace(suspects_base_path, '').replace('.xml', '')
                                        for xml in final_training_suspect_partition]

    print(len(final_training_suspect_partition), final_training_suspect_partition)
    print(len(final_training_source_partition), final_training_source_partition)

    print('Writing suspect documents to disk...')
    with open("crisp_corpus_suspect_files.txt", 'w') as suspects_training_file:
        for filename in final_training_suspect_partition:
            suspects_training_file.write(filename + '\n')

    print('Writing source documents to disk...')
    with open("crisp_corpus_source_files.txt", 'w') as sources_training_file:
        for filename in final_training_source_partition:
            sources_training_file.write(filename + '\n')
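
A minimal entry-point sketch; the paragraph cap of 40 is purely an illustrative value (the corpus paths are hard-coded inside main):

if __name__ == '__main__':
    # keep suspects and sources of at most 40 paragraphs (illustrative cap)
    main(40)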