def findSourceDoc(self):
    """Let the user pick a test document, search its sources and show scores.

    Opens a file dialog, runs ``FindSourceDoc.run`` on the chosen file with
    filtering disabled, makes sure the file has an entry in
    ``output/annotation.csv`` (via ``ParseXML.run``), then compares the
    candidates against that annotation with ``g.allmeasure`` and displays
    precision, recall and F1 in a new window.

    ``self`` is not used; the selected path is stored on the module-level
    ``root`` object as ``root.filename``.
    """
    root.filename = tkinter.filedialog.askopenfilename(
        initialdir=os.path.dirname(os.path.realpath(__file__)),
        title="Select file to search",
        filetypes=(("txt files", "*.txt"), ("all files", "*.*")))
    # A cancelled dialog yields an empty value ('' and, on some Tk builds,
    # an empty tuple), so test truthiness rather than equality with ''.
    if not root.filename:
        print("cancelled")
        return

    srcCandidate, dupl = FindSourceDoc.run(
        root.filename, root, filter=False, filamt=0)

    # Best effort: a failure here is reported and the evaluation continues
    # with whatever the annotation file already holds. `Exception` (not a
    # bare except) keeps Ctrl-C and interpreter shutdown working.
    try:
        ParseXML.run(root.filename)
    except Exception:
        print("already in annotation")

    dicc = g.openResult('output/annotation.csv')
    # Annotation rows are keyed by the file name with its last four
    # characters (the ".txt" extension) removed.
    doc_key = ntpath.basename(root.filename[:-4])
    annotation = dicc.get(doc_key)
    if annotation is None:
        # Without an annotation row there is nothing to evaluate against.
        print("no annotation found for " + doc_key)
        return
    annoSource = annotation[0]
    annoDup = annotation[1]

    p, r, f = g.allmeasure(srcCandidate, dupl, annoSource, annoDup)
    print(p)
    print(r)
    print(f)

    result = tkinter.Toplevel(root)
    result.minsize(200, 200)
    result.title("Evaluation " + ntpath.basename(root.filename))
    # One grid row per measure: caption in column 0, rounded value in column 1.
    measures = (("Precision ", p), ("Recall ", r), ("F1 Score ", f))
    for row, (caption, value) in enumerate(measures):
        tkinter.Label(result, text=caption).grid(row=row, column=0, padx=3)
        tkinter.Label(result, text=round(value, 3)).grid(
            row=row, column=1, padx=3)
def run(filename):
    """Build the annotation entry for one test document and save it.

    Reads the XML file that sits next to ``filename`` (same path with the
    last four characters replaced by ``.xml``), collects the referenced
    source documents, looks up their known duplicates, and stores
    ``[sources, duplicates]`` under the document's base name in
    ``output/annotation.csv``. Rows already in that file are kept.

    Args:
        filename: Path of the test document; expected to end in a
            four-character extension such as ``.txt``.
    """
    stem = filename[:-4]
    doc_key = ntpath.basename(stem)
    xmldoc = minidom.parse(stem + '.xml')

    # The two corpora mark sources with different element/attribute names.
    if "simulatedFiles" in filename:
        tag, attr = 'plagiarized', 'sourceid'
    else:
        tag, attr = 'features', 'source_reference'

    srcs = []
    for node in xmldoc.getElementsByTagName(tag):
        ref = node.attributes[attr].value
        print(ref)
        name = str(ref) + '.txt'
        if name not in srcs:
            srcs.append(name)

    if 'testdoc2' in filename:
        dupesDict = GlobalFunctions.openFiles('duplicate/FirstDuplicates.csv')
    else:
        dupesDict = GlobalFunctions.openFiles('duplicate/NumericDuplicate.csv')

    dupes = []
    # Reverse lookup: a key whose entry mentions one of the sources is a
    # duplicate of that source. `any` records each key once; the previous
    # nested loop appended it once per matching source.
    for dupe in dupesDict:
        if dupe in srcs:
            continue
        members = dupesDict.get(dupe)
        if any(src in members for src in srcs):
            dupes.append(dupe)

    # Forward lookup: everything listed under a source is a duplicate.
    # NOTE(review): assumes each entry is a list of document names - confirm
    # against GlobalFunctions.openFiles.
    for src in srcs:
        members = dupesDict.get(src)
        print(members)
        if members is None:
            print("not in dictionary")
            continue
        for dupe in members:
            if dupe not in dupes:
                dupes.append(dupe)
    print(dupes)

    tupleS = [srcs, dupes]
    print(tupleS)

    # Merge with previously saved annotations when they can be loaded;
    # otherwise start a fresh table holding only this document.
    try:
        dicc = GlobalFunctions.openResult('output/annotation.csv')
        dicc[doc_key] = tupleS
    except Exception:
        dicc = {doc_key: tupleS}

    # The `with` block closes the file; no explicit close() is needed.
    with open('output/annotation.csv', 'w', encoding='utf-8',
              newline='') as csvfile:
        fieldname = ['TestDocs', 'Source', 'Duplicates']
        writer = csv.DictWriter(csvfile, fieldnames=fieldname)
        writer.writeheader()
        for doc in dicc:
            entry = dicc.get(doc)
            row = {'TestDocs': doc, 'Source': entry[0]}
            # Entries without a duplicates part get an empty column.
            try:
                row['Duplicates'] = entry[1]
            except (IndexError, KeyError, TypeError):
                pass
            writer.writerow(row)
def _drop_listed_duplicates(result, duplicate, dupes):
    """Move documents that ``dupes`` lists as duplicates out of ``result``.

    Both ``result`` and ``duplicate`` are modified in place.
    """
    for res in result.copy():
        try:
            for x in dupes.get(res):
                if x not in duplicate and res not in duplicate:
                    duplicate.append(x)
                    result.remove(x)
        except (TypeError, ValueError):
            # TypeError: `res` has no entry, so there is nothing to iterate.
            # ValueError: `x` is not among the candidates; it has already
            # been recorded and the rest of this entry is skipped, exactly
            # as the original inline loops did.
            pass


def _write_result_table(path, table):
    """Write ``table`` (document -> [candidates, duplicates]) to ``path``."""
    # The `with` block closes the file; no explicit close() is needed.
    with open(path, 'w', encoding='utf-8', newline='') as csvfile:
        fieldname = ['TestDocs', 'Source_Candidate', 'Duplicates']
        writer = csv.DictWriter(csvfile, fieldnames=fieldname)
        writer.writeheader()
        for doc in table:
            entry = table.get(doc)
            row = {'TestDocs': doc, 'Source_Candidate': entry[0]}
            # Entries without a duplicates part get an empty column.
            try:
                row['Duplicates'] = entry[1]
            except (IndexError, KeyError, TypeError):
                pass
            writer.writerow(row)


def run(filename, root, show = True, filter = True, filamt = 5):
    """Find source-document candidates for one test document.

    Extracts search queries from ``filename``, scores documents for each
    query with ``getSource``, merges the hits into one ranked list without
    repeated documents, separates known duplicates, optionally keeps only
    the top ``filamt`` candidates, and stores the outcome in a CSV file
    under ``output/``.

    Args:
        filename: Path of the test document; its last four characters are
            dropped to form the key used in the result file.
        root: Tk parent for the windows opened when ``show`` is true.
        show: Display the queries, per-query hits and final candidates.
        filter: Keep only the first ``filamt`` candidates. (The name
            shadows the builtin but is part of the public signature.)
        filamt: Number of candidates kept when ``filter`` is true.

    Returns:
        A ``(candidates, duplicates)`` pair of lists of document names.
    """
    dicc = GlobalFunctions.openFiles('PLs/CompiledPLs/postingList.csv')
    df = GlobalFunctions.openFiles('PLs/dictionary/documentFreq.csv')
    testTF, queries = TestDocProcessing.queryExtract(filename)
    doc_label = ntpath.basename(filename)

    # --- run every query; optionally list the query terms -----------------
    if show:
        win = tkinter.Toplevel(root)
        win.minsize(200, 200)
        win.title("Search Query for " + doc_label)
    querySources = []
    for column, que in enumerate(queries):
        querySources.append(getSource(testTF, que, dicc, df))
        if not show:
            continue
        tkinter.Label(win, text="Search Query " + str(column + 1)).grid(
            row=0, column=column, padx=3)
        try:
            for row, q in enumerate(que, start=1):
                tkinter.Label(win, text=q).grid(row=row, column=column, padx=3)
        except Exception:
            print("unable to call new window")

    # --- optionally list the hits of each query ---------------------------
    if show:
        qwin = tkinter.Toplevel(win)
        qwin.minsize(200, 200)
        qwin.title("Query Result for " + doc_label)
        for index, src in enumerate(querySources):
            column = 2 * index  # each query takes a name and a score column
            tkinter.Label(qwin, text="Query Result " + str(index + 1)).grid(
                row=0, column=column, padx=3)
            tkinter.Label(qwin, text="Percentage " + str(index + 1)).grid(
                row=0, column=column + 1, padx=3)
            try:
                for row, s in enumerate(src, start=1):
                    tkinter.Label(qwin, text=s[0]).grid(
                        row=row, column=column, padx=3)
                    tkinter.Label(qwin, text=round(s[1] * 100, 2)).grid(
                        row=row, column=column + 1, padx=3)
            except Exception:
                print("no val")

    # --- merge, rank (highest score first) and keep each document once ----
    ranked = [hit for hits in querySources for hit in hits]
    # Ascending sort followed by reverse() is kept on purpose: unlike
    # sort(reverse=True) it also reverses the order of equal scores, which
    # decides which candidates survive the cut-off below.
    ranked.sort(key=operator.itemgetter(1))
    ranked.reverse()
    seen = set()
    unique_hits = []
    for hit in ranked:
        if hit[0] not in seen:
            seen.add(hit[0])
            unique_hits.append(hit)
    print(unique_hits)
    result = [hit[0] for hit in unique_hits]

    # --- separate known duplicates using both duplicate tables ------------
    first_path = 'duplicate/FirstDuplicates.csv'
    numeric_path = 'duplicate/NumericDuplicate.csv'
    if 'testdoc2' in filename:
        primary_path, secondary_path = first_path, numeric_path
    else:
        primary_path, secondary_path = numeric_path, first_path
    primary_dupes = GlobalFunctions.openFiles(primary_path)
    secondary_dupes = GlobalFunctions.openFiles(secondary_path)
    duplicate = []
    _drop_listed_duplicates(result, duplicate, primary_dupes)
    _drop_listed_duplicates(result, duplicate, secondary_dupes)
    print(result)
    print(duplicate)

    if filter:
        result = result[:filamt]
        # After the cut-off, the duplicates are rebuilt from the primary
        # table for the surviving candidates only.
        duplicate = []
        for res in result:
            try:
                for x in primary_dupes.get(res):
                    duplicate.append(x)
            except TypeError:
                # `res` has no entry in the duplicate table.
                pass

    # --- optionally show the final candidates -----------------------------
    if show:
        fres = tkinter.Toplevel(qwin)
        fres.minsize(200, 200)
        fres.title("Source Candidate for " + doc_label)
        tkinter.Label(fres, text="Query Result").grid(row=0, column=0, padx=3)
        try:
            for row, s in enumerate(result, start=1):
                tkinter.Label(fres, text=s).grid(row=row, column=0, padx=3)
            if duplicate:
                tkinter.Label(fres, text="Duplicate").grid(
                    row=0, column=1, padx=3)
            for row, d in enumerate(duplicate, start=1):
                tkinter.Label(fres, text=d).grid(row=row, column=1, padx=3)
        except Exception:
            print("no val")

    # --- persist -----------------------------------------------------------
    os.makedirs('output/', exist_ok=True)
    copyResult = list(result)
    copyDupe = list(duplicate)
    tupleS = [copyResult, copyDupe]
    print(tupleS)
    print(result)
    print(duplicate)

    # Read and write the SAME file. Previously any filamt other than 5 or
    # 10 loaded earlier rows from resultsNoFil.csv but wrote them to
    # resultsFilter<filamt>.csv, mixing unfiltered rows in and losing that
    # file's own earlier rows.
    if filter:
        saveResult = 'output/resultsFilter' + str(filamt) + '.csv'
    else:
        saveResult = 'output/resultsNoFil.csv'
    doc_key = ntpath.basename(filename[:-4])
    # Merge with previously saved results when they can be loaded;
    # otherwise start a fresh table holding only this document.
    try:
        saved = GlobalFunctions.openResult(saveResult)
        saved[doc_key] = tupleS
    except Exception:
        saved = {doc_key: tupleS}
    _write_result_table(saveResult, saved)
    return copyResult, copyDupe
def simul(filteramt, file):
    """Evaluate every test document of one corpus and plot the scores.

    Runs ``FindSourceDoc.run`` on each ``.txt`` file of the chosen corpus,
    compares the candidates with the entry in ``output/annotation.csv``
    using ``g.allmeasure``, and shows bar charts of precision, recall and
    F1 per document. Each chart title carries the mean of its measure.

    Args:
        filteramt: Number of candidates to keep per document; ``0`` turns
            filtering off.
        file: ``"Simulated"`` selects ``testdoc/simulatedFiles``; any other
            value selects ``testdoc/artificialFile``.
    """
    if file == "Simulated":
        file = glob.glob("testdoc/simulatedFiles/*.txt")
    else:
        file = glob.glob("testdoc/artificialFile/*.txt")

    precision = []
    recall = []
    F1 = []
    docname = []
    for count, fi in enumerate(file, start=1):
        if fi == '':
            print('cancelled')
            continue
        if filteramt == 0:
            srcCandidate, dupl = FindSourceDoc.run(
                fi, root, show=False, filter=False, filamt=0)
        else:
            srcCandidate, dupl = FindSourceDoc.run(
                fi, root, show=False, filamt=filteramt)
        # Best effort: a failure here is reported and the evaluation uses
        # whatever the annotation file already holds.
        try:
            ParseXML.run(fi)
        except Exception:
            print("already in annotation")
        dicc = g.openResult('output/annotation.csv')
        annotation = dicc.get(ntpath.basename(fi[:-4]))
        annoSource = annotation[0]
        annoDup = annotation[1]
        p, r, f = g.allmeasure(srcCandidate, dupl, annoSource, annoDup)
        # Record the bar position only for evaluated documents so the x
        # and y series always have the same length.
        docname.append(count)
        precision.append(p)
        recall.append(r)
        F1.append(f)

    if not precision:
        # Nothing was evaluated; the means below would divide by zero.
        print("no test documents to evaluate")
        return

    # The third chart previously drew the recall values under the F1 title.
    panels = (('Precision', precision), ('Recall', recall), ('F1 Score', F1))
    for position, (label, scores) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        plt.bar(docname, scores)
        plt.title(label + ' = ' + str(round(sum(scores) / len(scores), 3)))
        plt.ylim(top=1.05)
    plt.subplots_adjust(hspace=0.3)
    plt.show()