예제 #1
0
 def __init__(self, atom_type, fingerprint_method, n, k, hash_len, confidence_method, suspect_file_list, source_file_list, search_method, search_n=1):
     '''
     Record the fingerprinting and search parameters for an evaluation run, look up
     the method id for (fingerprint_method, n, k, atom_type, hash_len) and build the
     FingerprintEvaluator over <source_file_list>.

     <atom_type> is stored as self.base_atom_type; every other argument is stored
     under its own name.
     '''
     # Corpus roots come from the ExtrinsicUtility class constants
     suspect_root = ExtrinsicUtility.CORPUS_SUSPECT_LOC
     source_root = ExtrinsicUtility.CORPUS_SRC_LOC
     self.suspicious_path_start = suspect_root
     self.corpus_path_start = source_root

     # The listing itself is never used; the call is kept because it raises
     # early when the source directory cannot be listed
     os.listdir(source_root)

     self.mid = fingerprintstorage.get_mid(fingerprint_method, n, k, atom_type, hash_len)

     # Plain parameter bookkeeping
     self.base_atom_type = atom_type
     self.fingerprint_method = fingerprint_method
     self.n, self.k, self.hash_len = n, k, hash_len
     self.confidence_method = confidence_method
     self.suspect_file_list, self.source_file_list = suspect_file_list, source_file_list

     self.evaluator = fingerprint_extraction.FingerprintEvaluator(source_file_list, fingerprint_method, n, k)

     self.search_method, self.search_n = search_method, search_n
예제 #2
0
def test(method, n, k, atom_type, hash_size, confidence_method, num_files="all", search_method='normal', search_n=5, save_to_db=True, ignore_high_obfuscation=False, show_false_negpos_info=False, get_best_of=False):
    '''
    Evaluate one extrinsic detection configuration and optionally store the results.

    <method>, <n>, <k>, <atom_type> and <hash_size> identify the fingerprint method
    (via fingerprintstorage.get_mid). <num_files> is passed to
    ExtrinsicUtility().get_corpus_files to choose the corpus files.
    <confidence_method>, <search_method> and <search_n> are handed to ExtrinsicTester;
    <ignore_high_obfuscation>, <show_false_negpos_info> and <get_best_of> are
    forwarded to ExtrinsicTester.evaluate.

    Before evaluating, checks that at least as many suspects and sources have been
    populated for the method as the corpus selection contains. For the
    "two_level_ff" and "two_level_pf" search methods the same check is made for the
    "full" and "paragraph" atom types.

    When <save_to_db> is true, one row per threshold is inserted into the
    extrinsic_results table. The ROC auc and both source accuracies are printed.

    Raises ValueError when not enough documents have been populated.
    '''
    def not_populated_error(sources, suspects):
        # Single place that builds the "not populated" error used by both checks below
        return ValueError("Not all of the documents used in this test have been populated (only "+str(sources)+" sources, "+str(suspects)+" suspects have been populated). Populate them first with fingerprintstorage.")

    session = Session()

    # Get the list of suspect files to test on
    source_file_list, suspect_file_list = ExtrinsicUtility().get_corpus_files(n = num_files, include_txt_extension = False)

    # Confirm that these suspects and enough source documents have been populated
    num_suspect_documents = len(suspect_file_list)
    num_source_documents = len(source_file_list)

    mid = fingerprintstorage.get_mid(method, n, k, atom_type, hash_size)
    num_populated_suspects = fingerprintstorage.get_number_suspects(mid)
    num_populated_sources = fingerprintstorage.get_number_sources(mid)

    if num_populated_suspects < num_suspect_documents or num_populated_sources < num_source_documents:
        raise not_populated_error(num_populated_sources, num_populated_suspects)

    # If the search method is two level, we need to check that additional things are in the database
    if search_method in ("two_level_ff", "two_level_pf"):
        full_mid = fingerprintstorage.get_mid(method, n, k, "full", hash_size)
        para_mid = fingerprintstorage.get_mid(method, n, k, "paragraph", hash_size)

        num_populated_full_suspects = fingerprintstorage.get_number_suspects(full_mid)
        num_populated_para_suspects = fingerprintstorage.get_number_suspects(para_mid)

        num_populated_full_sources = fingerprintstorage.get_number_sources(full_mid)
        num_populated_para_sources = fingerprintstorage.get_number_sources(para_mid)

        # From here on the reported (and stored) counts are the "full" ones
        num_populated_sources = num_populated_full_sources
        num_populated_suspects = num_populated_full_suspects

        if (num_populated_full_suspects < num_suspect_documents
                or num_populated_para_suspects < num_suspect_documents
                or num_populated_full_sources < num_source_documents
                or num_populated_para_sources < num_source_documents
                or num_populated_para_sources < num_populated_full_sources
                or num_populated_para_suspects < num_populated_full_suspects):
            raise not_populated_error(num_populated_sources, num_populated_suspects)

    # Single-argument print calls produce the same text on Python 2 and parse on Python 3
    print(suspect_file_list)
    # Report the number of suspects here; the list itself was printed on the line above
    print("Testing first %s suspect files against %s source documents." % (num_suspect_documents, num_populated_sources))

    tester = ExtrinsicTester(atom_type, method, n, k, hash_size, confidence_method, suspect_file_list, source_file_list, search_method, search_n)

    roc_auc, source_accuracy, true_source_accuracy, roc_path, prf_path, thresholds, precisions, recalls, fmeasures = tester.evaluate(session, ignore_high_obfuscation, show_false_negpos_info, get_best_of)

    # Save the result
    if save_to_db:
        query = "INSERT INTO extrinsic_results (method_name, n, k, atom_type, hash_size, simmilarity_method, suspect_files, source_files, auc, true_source_accuracy, source_accuracy, search_method, search_n, ignore_high_obfuscation, roc_path, prf_fig_path, threshold, precision, recall, fmeasure) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"
        # A psycopg2 connection used as a context manager only ends the transaction
        # on exit and stays open, so the connection is closed explicitly instead.
        conn = psycopg2.connect(user = username, password = password, database = dbname.split("/")[1], host="localhost", port = 5432)
        try:
            conn.autocommit = True
            cur = conn.cursor()
            try:
                # One row per threshold; the remaining columns repeat the run's settings and summary metrics
                for threshold, prec, recall, fmeasure in zip(thresholds, precisions, recalls, fmeasures):
                    args = (method, n, k, atom_type, hash_size, confidence_method, num_files, num_populated_sources, roc_auc, true_source_accuracy, source_accuracy, search_method, search_n, ignore_high_obfuscation, roc_path, prf_path, threshold, prec, recall, fmeasure)
                    cur.execute(query, args)
            finally:
                cur.close()
        finally:
            conn.close()

    print('ROC auc: %s' % (roc_auc,))
    print('Source Accuracy: %s' % (source_accuracy,))
    print('True Source Accuracy: %s' % (true_source_accuracy,))
예제 #3
0
    def get_trials(self, session, fingerprint_m=None):
        '''
        For each suspect document, split the document into atoms and classify each atom
        as plagiarized or not-plagiarized. Build a list of classifications and a list
        of the ground truths for each atom of each document.

        <fingerprint_m> defaults to self.fingerprint_method. How candidate sources are
        found depends on self.search_method:
          'two_level_ff' -- rank source documents against the whole suspect document
                            once, then compare each paragraph only against the first
                            self.search_n of them;
          'two_level_pf' -- the same, but the document-level ranking is done per
                            paragraph, and the paragraph-level comparison is skipped
                            when the best document-level confidence is 0;
          anything else  -- classify every atom of self.base_atom_type directly.

        Returns (classifications, actuals, classifications_dict, actuals_dict). The two
        lists hold, for every atom of every suspect in order, the first entry returned
        by classify_passage and the ground truth. The two dicts hold the same
        per-document lists keyed by the entries of self.suspect_file_list.
        '''
        def report_atom(top_source, acts, first_field_only=False):
            # Print the two progress lines for one classified atom. top_source[0] is
            # (source_filename, source_atom_index, did, suspect_filename, atom_index)
            # and top_source[1] is the confidence. The atom index printed and used to
            # look up the ground truth is the one carried inside top_source.
            source_filename, source_atom_index, _did, _suspect_filename, atom_index = top_source[0]
            confidence = top_source[1]
            print('atom index: ' + str(atom_index + 1) + '/' + str(len(acts)))
            actual = acts[atom_index][0] if first_field_only else acts[atom_index]
            print('confidence (actual, guess): %s %s' % (actual, (confidence, source_filename, source_atom_index)))

        classifications = []
        actuals = []
        classifications_dict = {}
        actuals_dict = {}
        if fingerprint_m is None:
            fingerprint_m = self.fingerprint_method

        # The method ids do not depend on the suspect or the atom, so they are looked
        # up once here instead of once per atom.
        outer_search_level_mid = fingerprintstorage.get_mid(fingerprint_m, self.n, self.k, "full", self.hash_len)
        paragraph_level_mid = None
        if self.search_method == 'two_level_pf':
            paragraph_level_mid = fingerprintstorage.get_mid(fingerprint_m, self.n, self.k, "paragraph", self.hash_len)

        num_suspects = len(self.suspect_file_list)

        for fi, f in enumerate(self.suspect_file_list, 1):
            # Blank line between documents
            print('')
            doc_name = f.replace(self.suspicious_path_start, "").replace(".txt", "")

            if self.search_method == 'two_level_ff':
                print('%d/%d Classifying %s (%s)' % (fi, num_suspects, doc_name, self.search_method))

                acts = ground_truth._query_ground_truth(doc_name, self.base_atom_type, session, self.suspicious_path_start).get_ground_truth(session)
                actuals += acts

                actuals_dict[f] = acts
                doc_classifications = []

                # first, get a list of the most similar full documents to this document
                full_atom_classifications = self.evaluator.classify_passage(doc_name, "full", 0, fingerprint_m,
                    self.n, self.k, self.hash_len, "containment", outer_search_level_mid)

                top_docs = full_atom_classifications[:self.search_n]
                dids = [x[0][2] for x in top_docs]

                # now, compare all paragraphs in the most similar documents to this paragraph
                for atom_index in range(len(acts)):
                    atom_classifications = self.evaluator.classify_passage(doc_name, "paragraph", atom_index,
                        fingerprint_m, self.n, self.k, self.hash_len, self.confidence_method, self.mid, dids=dids)
                    top_source = atom_classifications[0]

                    classifications.append(top_source)
                    doc_classifications.append(top_source)
                    report_atom(top_source, acts)

                classifications_dict[f] = doc_classifications

            elif self.search_method == 'two_level_pf':
                print('%d/%d Classifying %s (%s)' % (fi, num_suspects, doc_name, self.search_method))
                acts = ground_truth._query_ground_truth(doc_name, self.base_atom_type, session, self.suspicious_path_start).get_ground_truth(session)
                actuals += acts

                actuals_dict[f] = acts
                doc_classifications = []

                for atom_index in range(len(acts)):
                    # first, find most similar documents to this paragraph
                    full_atom_classifications = self.evaluator.classify_passage(doc_name, "full", atom_index,
                        fingerprint_m, self.n, self.k, self.hash_len, "containment",
                        outer_search_level_mid,
                        passage_atom_type="paragraph",
                        passage_mid=paragraph_level_mid)

                    top_docs = full_atom_classifications[:self.search_n]

                    # don't compare at the paragraph level if no full documents had any similarity to the paragraph
                    if top_docs[0][1] == 0:
                        top_source = top_docs[0]
                    else:
                        # now, compare this paragraph to all paragraphs in <top_docs>
                        dids = [x[0][2] for x in top_docs]
                        atom_classifications = self.evaluator.classify_passage(doc_name, "paragraph", atom_index,
                            fingerprint_m, self.n, self.k, self.hash_len, self.confidence_method, self.mid, dids=dids)
                        top_source = atom_classifications[0]

                    classifications.append(top_source)
                    doc_classifications.append(top_source)
                    report_atom(top_source, acts)

                classifications_dict[f] = doc_classifications

            else:
                # Unlike the two-level branches, the ground truth is queried with the
                # raw list entry <f> rather than <doc_name>
                acts = ground_truth._query_ground_truth(f, self.base_atom_type, session, self.suspicious_path_start).get_ground_truth(session)
                actuals += acts

                actuals_dict[f] = acts
                doc_classifications = []

                print(f)
                print('%d/%d Classifying %s' % (fi, num_suspects, doc_name))

                for atom_index in range(len(acts)):
                    atom_classifications = self.evaluator.classify_passage(doc_name, self.base_atom_type, atom_index, fingerprint_m, self.n, self.k, self.hash_len, self.confidence_method, self.mid)
                    top_source = atom_classifications[0]

                    classifications.append(top_source)
                    doc_classifications.append(top_source)
                    # This branch prints only the first field of the atom's ground truth
                    report_atom(top_source, acts, first_field_only=True)

                classifications_dict[f] = doc_classifications

        # NOTE: post-filtering with self.screen_crap(classifications, classifications_dict) is left disabled

        return classifications, actuals, classifications_dict, actuals_dict