Пример #1
0
    def findSourceDoc(self):
        """Let the user pick a test document, evaluate it and show the scores.

        The chosen path is stored on ``root.filename``.  The document is run
        through ``FindSourceDoc.run`` (unfiltered) and ``ParseXML.run``, the
        annotation for the document is read back from
        ``output/annotation.csv``, and the three values returned by
        ``g.allmeasure`` are printed and displayed in a new "Evaluation"
        window as precision, recall and F1 score.

        Nothing is evaluated when the file dialog is cancelled.
        """
        root.filename = tkinter.filedialog.askopenfilename(
            initialdir=os.path.dirname(os.path.realpath(__file__)),
            title="Select file to search",
            filetypes=(("txt files", "*.txt"), ("all files", "*.*")))

        # A cancelled dialog yields an empty value; some Tk builds hand back
        # an empty tuple rather than '', so test truthiness instead of == ''.
        if not root.filename:
            print("cancelled")
            return

        srcCandidate, dupl = FindSourceDoc.run(root.filename,
                                               root,
                                               filter=False,
                                               filamt=0)
        try:
            ParseXML.run(root.filename)
        except Exception:
            # Best effort: a failure here is reported, not fatal, and the
            # annotation file is read regardless.
            print("already in annotation")

        dicc = g.openResult('output/annotation.csv')

        # Annotation keys are the file name without its 4-character
        # extension (e.g. ".txt").
        annotation = dicc.get(ntpath.basename(root.filename[:-4]))
        annoSource = annotation[0]
        annoDup = annotation[1]

        p, r, f = g.allmeasure(srcCandidate, dupl, annoSource, annoDup)
        print(p)
        print(r)
        print(f)

        result = tkinter.Toplevel(root)
        result.minsize(200, 200)
        result.title("Evaluation " + ntpath.basename(root.filename))

        # One grid row per measure: caption in column 0, value in column 1.
        measures = (("Precision ", p), ("Recall ", r), ("F1 Score ", f))
        for row, (caption, value) in enumerate(measures):
            tkinter.Label(result, text=caption).grid(row=row,
                                                     column=0,
                                                     padx=3)
            tkinter.Label(result, text=round(value, 3)).grid(row=row,
                                                             column=1,
                                                             padx=3)
Пример #2
0
def run(filename):
    """Record the annotated sources of a test document in the annotation CSV.

    The XML file that sits next to ``filename`` (same path with the last
    four characters replaced by ``.xml``) is parsed for source references.
    Known duplicates of those sources are looked up in a duplicates table,
    and the pair ``[sources, duplicates]`` is stored under the document's
    base name in ``output/annotation.csv``, which is rewritten in full.

    Args:
        filename: Path of the test document.  The last four characters are
            treated as the extension (e.g. ``.txt``).
    """
    stem = filename[:-4]
    xmldoc = minidom.parse(stem + '.xml')

    # The two corpora keep the source reference under different names.
    if "simulatedFiles" in filename:
        tag, attribute = 'plagiarized', 'sourceid'
    else:
        tag, attribute = 'features', 'source_reference'

    srcs = []
    for node in xmldoc.getElementsByTagName(tag):
        reference = node.attributes[attribute].value
        print(reference)
        source = str(reference) + '.txt'
        if source not in srcs:
            srcs.append(source)

    if 'testdoc2' in filename:
        dupesDict = GlobalFunctions.openFiles('duplicate/FirstDuplicates.csv')
    else:
        dupesDict = GlobalFunctions.openFiles('duplicate/NumericDuplicate.csv')

    # Pass 1: table keys whose entry mentions one of the sources.  Each key
    # is added once, even when its entry mentions several sources.
    dupes = []
    for dupe in dupesDict:
        if dupe in srcs:
            continue
        members = dupesDict.get(dupe)
        if any(src in members for src in srcs):
            dupes.append(dupe)

    # Pass 2: the documents listed under each source itself.
    # assumes table values are lists of document names - TODO confirm
    for src in srcs:
        members = dupesDict.get(src)
        print(members)
        if members is None:
            print("not in dictionary")
            continue
        for dupe in members:
            if dupe not in dupes:
                dupes.append(dupe)
    print(dupes)

    tupleS = [srcs, dupes]
    print(tupleS)

    key = ntpath.basename(stem)
    try:
        dicc = GlobalFunctions.openResult('output/annotation.csv')
        dicc[key] = tupleS
    except Exception:
        # No readable annotation file yet: start a fresh one.
        dicc = {key: tupleS}

    with open('output/annotation.csv', 'w', encoding='utf-8', newline='') as csvfile:
        fieldname = ['TestDocs', 'Source', 'Duplicates']
        writer = csv.DictWriter(csvfile, fieldnames=fieldname)
        writer.writeheader()
        for doc in dicc:
            entry = dicc.get(doc)
            row = {'TestDocs': doc, 'Source': entry[0]}
            try:
                row['Duplicates'] = entry[1]
            except (IndexError, KeyError):
                # Entry has no duplicates part; the column is left empty.
                pass
            writer.writerow(row)
Пример #3
0
def _move_known_duplicates(result, duplicate, dupes):
    """Move documents listed as duplicates of a candidate out of *result*.

    For every candidate in *result*, each document listed for it in *dupes*
    is appended to *duplicate* and removed from *result*.  Both lists are
    modified in place.
    """
    for res in list(result):
        try:
            for x in dupes.get(res):
                if x not in duplicate and res not in duplicate:
                    duplicate.append(x)
                    result.remove(x)
        except (TypeError, ValueError):
            # TypeError: the candidate has no entry (dupes.get gave None).
            # ValueError: x is not among the candidates, so remove() failed.
            # NOTE(review): a failed remove() also skips the remaining
            # entries listed for this candidate - confirm that is intended.
            pass


def _write_candidate_csv(path, saved):
    """Rewrite *path* with one row per test document in *saved*."""
    with open(path, 'w', encoding='utf-8', newline='') as csvfile:
        fieldname = ['TestDocs', 'Source_Candidate', 'Duplicates']
        writer = csv.DictWriter(csvfile, fieldnames=fieldname)
        writer.writeheader()
        for doc in saved:
            entry = saved.get(doc)
            row = {'TestDocs': doc, 'Source_Candidate': entry[0]}
            try:
                row['Duplicates'] = entry[1]
            except (IndexError, KeyError):
                # Entry has no duplicates part; the column is left empty.
                pass
            writer.writerow(row)


def run(filename, root, show=True, filter=True, filamt=5):
    """Find source candidates for a test document and save them to CSV.

    Queries are extracted from the document, each query is resolved with
    ``getSource``, and the hits are merged, ranked by score (highest first)
    and reduced to one hit per document.  Known duplicates are then split
    off into a separate list.  The outcome is stored under the document's
    base name in ``output/resultsFilter<filamt>.csv`` (when ``filter`` is
    true) or ``output/resultsNoFil.csv``.

    Args:
        filename: Path of the test document.  The last four characters are
            treated as the extension when building the CSV key.
        root: Parent tkinter widget for the windows opened when ``show`` is
            true.
        show: When true, open windows listing the queries, the per-query
            hits and the final candidates.
        filter: When true, keep only the first ``filamt`` candidates and
            rebuild the duplicate list from those candidates alone.
        filamt: Number of candidates kept when ``filter`` is true.

    Returns:
        A ``(candidates, duplicates)`` tuple of lists of document names.
    """
    posting_lists = GlobalFunctions.openFiles('PLs/CompiledPLs/postingList.csv')
    doc_freq = GlobalFunctions.openFiles('PLs/dictionary/documentFreq.csv')
    testTF, queries = TestDocProcessing.queryExtract(filename)
    doc_label = ntpath.basename(filename)

    if show:
        win = tkinter.Toplevel(root)
        win.minsize(200, 200)
        win.title("Search Query for " + doc_label)

    # One grid column per query: header in row 0, query items below it.
    querySources = []
    for col, que in enumerate(queries):
        querySources.append(getSource(testTF, que, posting_lists, doc_freq))
        if not show:
            continue
        tkinter.Label(win, text="Search Query " + str(col + 1)).grid(
            row=0, column=col, padx=3)
        try:
            for row, q in enumerate(que, start=1):
                tkinter.Label(win, text=q).grid(row=row, column=col, padx=3)
        except Exception:
            print("unable to call new window")

    if show:
        qwin = tkinter.Toplevel(win)
        qwin.minsize(200, 200)
        qwin.title("Query Result for " + doc_label)
        # Two grid columns per query: document name and score in percent.
        for idx, src in enumerate(querySources):
            col = 2 * idx
            tkinter.Label(qwin, text="Query Result " + str(idx + 1)).grid(
                row=0, column=col, padx=3)
            tkinter.Label(qwin, text="Percentage " + str(idx + 1)).grid(
                row=0, column=col + 1, padx=3)
            try:
                for row, s in enumerate(src, start=1):
                    tkinter.Label(qwin, text=s[0]).grid(
                        row=row, column=col, padx=3)
                    tkinter.Label(qwin, text=round(s[1] * 100, 2)).grid(
                        row=row, column=col + 1, padx=3)
            except Exception:
                print("no val")

    # Merge all hits and rank by score, highest first.  An ascending sort
    # followed by reverse() is kept on purpose: it orders ties differently
    # from sort(reverse=True), and that order decides which hits survive.
    ranked = [hit for hits in querySources for hit in hits]
    ranked.sort(key=operator.itemgetter(1))
    ranked.reverse()

    # Keep only the best-ranked hit of every document.
    seen = set()
    best_hits = []
    for hit in ranked:
        if hit[0] not in seen:
            seen.add(hit[0])
            best_hits.append(hit)
    print(best_hits)

    # The table consulted first depends on the test set; the other table is
    # consulted second.
    if 'testdoc2' in filename:
        primary_path = 'duplicate/FirstDuplicates.csv'
        secondary_path = 'duplicate/NumericDuplicate.csv'
    else:
        primary_path = 'duplicate/NumericDuplicate.csv'
        secondary_path = 'duplicate/FirstDuplicates.csv'

    result = [hit[0] for hit in best_hits]
    duplicate = []
    primary_dupes = GlobalFunctions.openFiles(primary_path)
    _move_known_duplicates(result, duplicate, primary_dupes)
    _move_known_duplicates(result, duplicate,
                           GlobalFunctions.openFiles(secondary_path))

    print(result)
    print(duplicate)

    if filter:
        # Keep the top candidates and list only their own duplicates, taken
        # from the primary table.
        result = result[:filamt]
        duplicate = []
        for res in result:
            try:
                duplicate.extend(primary_dupes.get(res))
            except TypeError:
                # The candidate has no entry in the duplicates table.
                pass

    if show:
        fres = tkinter.Toplevel(qwin)
        fres.minsize(200, 200)
        fres.title("Source Candidate for " + doc_label)
        tkinter.Label(fres, text="Query Result").grid(row=0, column=0, padx=3)
        try:
            for row, s in enumerate(result, start=1):
                tkinter.Label(fres, text=s).grid(row=row, column=0, padx=3)
            if duplicate:
                # Create the header once instead of once per duplicate.
                tkinter.Label(fres, text="Duplicate").grid(
                    row=0, column=1, padx=3)
            for row, d in enumerate(duplicate, start=1):
                tkinter.Label(fres, text=d).grid(row=row, column=1, padx=3)
        except Exception:
            print("no val")

    os.makedirs('output/', exist_ok=True)

    copyResult = list(result)
    copyDupe = list(duplicate)
    tupleS = [copyResult, copyDupe]
    print(tupleS)

    print(result)
    print(duplicate)

    # Read and write the same file, so earlier rows of that file survive
    # for every filter size.
    if filter:
        saveResult = 'output/resultsFilter' + str(filamt) + '.csv'
    else:
        saveResult = 'output/resultsNoFil.csv'

    key = ntpath.basename(filename[:-4])
    try:
        saved = GlobalFunctions.openResult(saveResult)
        saved[key] = tupleS
    except Exception:
        # No readable result file yet: start a fresh one.
        saved = {key: tupleS}

    _write_candidate_csv(saveResult, saved)

    return copyResult, copyDupe
Пример #4
0
def simul(filteramt, file):
    """Evaluate every test document of a corpus and plot the scores.

    Each ``*.txt`` file of the selected corpus is run through
    ``FindSourceDoc.run`` and ``ParseXML.run``; the three values returned
    by ``g.allmeasure`` are collected as precision, recall and F1 score and
    drawn as one bar chart each, with the mean in the chart title.

    Args:
        filteramt: Number of candidates to keep per document; ``0`` turns
            filtering off.
        file: ``"Simulated"`` selects ``testdoc/simulatedFiles``; any other
            value selects ``testdoc/artificialFile``.
    """
    if file == "Simulated":
        paths = glob.glob("testdoc/simulatedFiles/*.txt")
    else:
        paths = glob.glob("testdoc/artificialFile/*.txt")

    precision = []
    recall = []
    F1 = []
    docname = []
    for count, fi in enumerate(paths, start=1):
        srcCandidate, dupl = FindSourceDoc.run(fi,
                                               root,
                                               show=False,
                                               filter=filteramt != 0,
                                               filamt=filteramt)
        try:
            ParseXML.run(fi)
        except Exception:
            # Best effort: a failure here is reported, not fatal, and the
            # annotation file is read regardless.
            print("already in annotation")
        dicc = g.openResult('output/annotation.csv')

        # Annotation keys are the file name without its 4-character
        # extension.
        annotation = dicc.get(ntpath.basename(fi[:-4]))
        annoSource = annotation[0]
        annoDup = annotation[1]

        p, r, f = g.allmeasure(srcCandidate, dupl, annoSource, annoDup)
        docname.append(count)
        precision.append(p)
        recall.append(r)
        F1.append(f)

    if not docname:
        # Nothing matched the glob pattern; the means would divide by zero.
        print("no test documents found")
        return

    panels = (('Precision = ', precision),
              ('Recall = ', recall),
              ('F1 Score = ', F1))
    for position, (caption, scores) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        plt.bar(docname, scores)
        plt.title(caption + str(round(sum(scores) / len(scores), 3)))
        plt.ylim(top=1.05)

    plt.subplots_adjust(hspace=0.3)
    plt.show()