def test_n(dir="testset", debugLog=False):
    """Score EvaluatePlagiatedByWords_n against the labelled files in *dir*.

    Every file whose name contains "orig" is treated as an original text; its
    task letter is the last character before the first "." in the file name.
    Each other file whose name contains "task<letter>" is compared with that
    original, and the detector's ``status`` is checked against the expected
    0/1 label stored at the same position in the hard-coded table below.

    Args:
        dir: Directory holding both the originals and the candidate texts.
        debugLog: When true, print every mismatch and a final summary line.

    Returns:
        A dict with "ok" (results equal to the expected label), "miss"
        (expected 1 but got something else) and "false" (expected 0 but got
        something else).
    """
    files = os.listdir(dir)
    originals = [name for name in files if "orig" in name]

    # Expected detector status per task letter, indexed by the position of the
    # candidate among the matching files in listing order.
    # NOTE(review): os.listdir() returns entries in arbitrary order, so this
    # table presumably matches the order seen on the author's machine - confirm.
    expected = {
        "a": [0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1],
        "b": [1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1],
        "c": [1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1],
        "d": [1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0],
        "e": [0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0],
    }

    total = 0
    ok = 0
    false_positive = 0
    miss = 0

    for original in originals:
        task = original.split(".")[0][-1:]
        # Build paths from the requested directory with the platform separator
        # instead of a hard-coded "testset\\" prefix.
        original_path = os.path.join(dir, original)
        position = 0
        for candidate in files:
            if "task" + task not in candidate or "orig" in candidate:
                continue
            candidate_path = os.path.join(dir, candidate)
            res = EvaluatePlagiatedByWords_n(
                a=readFile(original_path)["data"],
                b=readFile(candidate_path)["data"],
                lang="english",
                debugLog=False,
            )
            want = expected[task][position]
            if res["status"] == want:
                ok += 1
            else:
                if debugLog:
                    print(
                        "%s %s => output = %d corr = %d"
                        % (original_path, candidate_path, res["status"], want)
                    )
                if want == 1:
                    miss += 1
                if want == 0:
                    false_positive += 1
            position += 1
            total += 1

    if debugLog:
        # Report 0.0 rather than dividing by zero when nothing was evaluated.
        percent = float(ok) / total if total else 0.0
        print(
            "ok = %d all = %d percent = %f. miss = %d. false_positive = %d"
            % (ok, total, percent, miss, false_positive)
        )
    return {"ok": ok, "miss": miss, "false": false_positive}
def main(argc, argv):
    """Check the document named by ``argv[1]`` against documents found online.

    Keywords are extracted from the local text (plus any keywords the loader
    returned), passed to ``FileDownloader`` together with a ``filetype:pdf``
    term, and every returned URL is fetched and compared with the local text
    using ``EvaluatePlagiatedByWords_n``. Progress and results are printed.

    Args:
        argc: Unused.
        argv: Argument list; ``argv[1]`` is the path of the local document.
    """
    filename = argv[1]
    contents = readFile(filename)
    if contents['status'] == 'error':
        print("Error while loading file \"%s\"" % (filename))
        # Without the local text there is nothing to compare, so stop here
        # instead of reading "data" from a failed load.
        return

    localFile = contents["data"]
    keywords = GetKeywords(localFile, 'english')
    if 'keywords' in contents:
        keywords = keywords + contents['keywords']
    check_emails = None
    if 'emails' in contents:
        check_emails = contents['emails']
    print("Keywords for text : ", keywords)

    f = FileDownloader(['filetype:pdf'] + keywords, maxpages = 2, crawler = True)
    print("Fetching data from google and links")
    docs = f.get_documents()
    print("Documents from google search: ")
    for url in docs:
        print(url)

    print("Running SSK for blocks in texts:")
    for url in docs:
        print("Checking \"%s\" link" % (url))
        remote_contents = readRemoteFile(url)

        emails = None
        if 'emails' in remote_contents:
            emails = remote_contents['emails']
        # Compare e-mail lists only when both documents supplied one, so
        # cosine_distance is never handed None.
        if check_emails is not None and emails is not None:
            emails_dist = cosine_distance(check_emails, emails)
            if emails_dist > 0.7:
                # Skip documents whose e-mail lists are this similar.
                print("Emails dist = %f. Text from similar authors." % (emails_dist))
                continue

        if remote_contents['status'] == 'error':
            print("Failed to process remote file \"%s\"" % (url))
            continue

        remoteFile = remote_contents['data']
        res = EvaluatePlagiatedByWords_n(localFile, remoteFile, debugLog = False)
        if res['status']:
            print("Plagiated blocks found:")
            for block in res['blocks']:
                print(block)
        else:
            print("No blocks found")
def main(argc, argv):
    """Compare the two files named by ``argv[1]`` and ``argv[2]``.

    Prints the ``status`` returned by ``EvaluatePlagiatedByWords_n`` and, when
    that status is truthy, the ``blocks`` it returned.

    Args:
        argc: Unused.
        argv: Argument list; ``argv[1]`` and ``argv[2]`` are the file paths.
    """
    # Level 100 is above logging.CRITICAL (50), so every standard log level
    # is suppressed for the rest of the run.
    logging.disable(100)

    f1 = argv[1]
    f2 = argv[2]
    c1 = readFile(f1)
    c2 = readFile(f2)
    if c1['status'] == 'error' or c2['status'] == 'error':
        print('Failed to read files')
        # Do not go on to evaluate data from a load that failed.
        return

    st = EvaluatePlagiatedByWords_n(c1['data'], c2['data'])
    print(st['status'])
    if st['status']:
        print(st['blocks'])
def main(argc, argv):
    """Print keyword-extraction results for every file in ``keyword_set``.

    For each file, the text under ``data`` is passed to ``GetKeywords`` and
    the result is handed to ``calc_st`` together with the file's own
    ``keywords`` entry; the value returned by ``calc_st`` is printed. Any
    ``emails`` entry returned by the loader is printed as well.

    Args:
        argc: Unused.
        argv: Unused.
    """
    directory = 'keyword_set'
    for test in os.listdir(directory):
        # os.path.join uses the platform separator; a literal backslash only
        # produces a valid path on Windows.
        f = readFile(os.path.join(directory, test))
        orig_keywords = f['keywords']
        keywords = GetKeywords(f['data'])
        print("File %s" % (test))
        if 'emails' in f:
            print("Emails", f['emails'])
        print(calc_st(orig_keywords, keywords, True))
        print("\n")