Exemplo n.º 1
0
def test_n(dir="testset", debugLog=False):
    """Score EvaluatePlagiatedByWords_n against a hard-coded answer table.

    Every file in ``dir`` whose name contains "orig" is treated as an original
    text. The last character of its name before the first "." selects the task
    letter; every other file whose name contains "task<letter>" is compared
    with that original and the returned ``status`` is checked against the
    expected value for its position.

    Args:
        dir: Directory holding the test set. The name shadows the builtin but
            is kept so that existing keyword callers keep working.
        debugLog: When true, print each mismatch and a final summary line.

    Returns:
        A dict with the keys "ok" (status matched the table), "miss"
        (expected 1 but got something else) and "false" (expected 0 but got
        something else).
    """
    # NOTE(review): the expected table below is indexed by the position of a
    # file within this listing, and os.listdir() does not guarantee any
    # ordering -- confirm the listing order matches the table on the target
    # platform.
    files = os.listdir(dir)

    originals = [name for name in files if "orig" in name]

    # Expected status per task letter, in listing order of the task's files.
    expected = {
        "a": [0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1],
        "b": [1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1],
        "c": [1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1],
        "d": [1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0],
        "e": [0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0],
    }

    total = 0
    ok = 0
    false_positive = 0
    miss = 0
    for ans in originals:
        # Task letter: last character of the name before the first dot.
        ind = ans.split(".")[0][-1:]
        # Build paths from the requested directory instead of a hard-coded
        # "testset\\" prefix, so `dir` is honoured on every platform.
        f1 = os.path.join(dir, ans)
        it = 0
        for test in files:
            if "task" + ind not in test or "orig" in test:
                continue
            f2 = os.path.join(dir, test)
            res = EvaluatePlagiatedByWords_n(
                a=readFile(f1)["data"], b=readFile(f2)["data"], lang="english", debugLog=False
            )
            want = expected[ind][it]
            if res["status"] == want:
                ok += 1
            else:
                if debugLog:
                    print("%s %s => output = %d corr = %d" % (f1, f2, res["status"], want))
                if want == 1:
                    miss += 1
                elif want == 0:
                    false_positive += 1
            it += 1
            total += 1

    if debugLog:
        # Guard against an empty test set, which used to divide by zero here.
        percent = ok / float(total) if total else 0.0
        print(
            "ok = %d all = %d percent = %f. miss = %d. false_positive = %d"
            % (ok, total, percent, miss, false_positive)
        )
    return {"ok": ok, "miss": miss, "false": false_positive}
Exemplo n.º 2
0
def main(argc, argv):
    """Check a local file for plagiarised blocks against downloaded documents.

    Reads the file named by ``argv[1]``, derives keywords from it with
    ``GetKeywords``, asks ``FileDownloader`` for candidate documents and runs
    ``EvaluatePlagiatedByWords_n`` against each of them, printing the outcome.

    Args:
        argc: Unused; kept for the existing call signature.
        argv: Argument list; ``argv[1]`` is the path of the file to check.
    """
    filename = argv[1]
    contents = readFile(filename)

    if contents['status'] == 'error':
        print("Error while loading file \"%s\"" % (filename))
        # Stop here: previously execution fell through and tried to use the
        # data of a file that failed to load.
        return

    localFile = contents["data"]

    keywords = GetKeywords(localFile, 'english')

    # Keywords declared by the file itself extend the extracted ones.
    if 'keywords' in contents:
        keywords = keywords + contents['keywords']

    check_emails = contents['emails'] if 'emails' in contents else None

    print("Keywords for text : ", keywords)

    f = FileDownloader(['filetype:pdf'] + keywords, maxpages = 2, crawler = True)
    print("Fetching data from google and links")
    docs = f.get_documents()
    print("Documents from google search: ")
    for url in docs:
        print(url)

    print("Running SSK for blocks in texts:")
    for url in docs:
        print("Checking \"%s\" link" % (url))

        remote_contents = readRemoteFile(url)

        # Reject failed downloads before doing any comparison on them.
        if remote_contents['status'] == 'error':
            print("Failed to process remote file \"%s\"" % (url))
            continue

        emails = remote_contents['emails'] if 'emails' in remote_contents else None

        # NOTE(review): either argument may be None when a document carries
        # no e-mail addresses -- confirm cosine_distance accepts that.
        emails_dist = cosine_distance(check_emails, emails)

        # Documents with closely matching e-mail sets are skipped as coming
        # from similar authors.
        if emails_dist > 0.7:
            print("Emails dist = %f. Text from similar authors." % (emails_dist))
            continue

        remoteFile = remote_contents['data']

        res = EvaluatePlagiatedByWords_n(localFile, remoteFile, debugLog = False)
        if res['status'] == True:
            print("Plagiated blocks found:")
            for block in res['blocks']:
                print(block)
        else:
            print("No blocks found")
Exemplo n.º 3
0
def main(argc, argv):
    """Compare two files and print whether plagiarised blocks were found.

    Reads the files named by ``argv[1]`` and ``argv[2]``, runs
    ``EvaluatePlagiatedByWords_n`` on their data and prints the resulting
    status, followed by the matching blocks when the status is truthy.

    Args:
        argc: Unused; kept for the existing call signature.
        argv: Argument list; ``argv[1]`` and ``argv[2]`` are the file paths.
    """
    # 100 is above logging.CRITICAL (50), so every standard level is silenced.
    logging.disable(100)

    first_path = argv[1]
    second_path = argv[2]

    first = readFile(first_path)
    second = readFile(second_path)

    if first['status'] == 'error' or second['status'] == 'error':
        print('Failed to read files')
        # Stop here: previously execution fell through and read 'data' from
        # a result that had just been reported as failed.
        return

    st = EvaluatePlagiatedByWords_n(first['data'], second['data'])
    print(st['status'])

    if st['status']:
        print(st['blocks'])
def main(argc, argv):
    """Print keyword-extraction scores for every file in ``keyword_set``.

    For each file in the directory, reads it with ``readFile``, extracts
    keywords from its data with ``GetKeywords`` and prints the result of
    ``calc_st`` comparing them with the keywords stored in the file.

    Args:
        argc: Unused; kept for the existing call signature.
        argv: Unused; kept for the existing call signature.
    """
    directory = 'keyword_set'

    for test in os.listdir(directory):
        # os.path.join replaces a hard-coded "\\" separator that only worked
        # on Windows.
        f = readFile(os.path.join(directory, test))
        orig_keywords = f['keywords']
        keywords = GetKeywords(f['data'])
        print("File %s"  % (test))
        if 'emails' in f:
            print("Emails", f['emails'])
        print(calc_st(orig_keywords, keywords, True))
        print("\n")