def createCategoricalPatterngram(basefile, ensemble, klength, normName, mode=CHAR):
    """Creates a Categorical Patterngram as PIL Image object.

    It "shows the degree of similarity that exists between one file
    [(the base file)] and the rest of the ensemble of files." (Rib00)

    "[A] categorical patterngram [...] [is] a visualization that displays,
    for each character position x in the base file, the number of ensemble
    files that contain the k-length n-gram that begins at that position.
    A point is plotted at coordinate (x,y) if the k-length n-gram beginning
    at character x in the base file occurs at any location in y files of
    the ensemble." (Rib00)

    Arguments:
    basefile - the single file in the 'one-to-many' comparison (string)
    ensemble - the set of files used for the comparison (list); entries
               equal to basefile are skipped
    klength  - the length of the used ngrams (integer greater zero)
    normName - the name of the normalizer used to normalize the texts
               (must be contained in normNames)
    mode     - default: CHAR (=1); WORD (=0)

    Returns the result of createChart() applied to the drawn image.

    Raises:
    NoValidArgumentError - basefile is no string, ensemble is no list,
                           klength is no integer or klength <= 0
    NoValidNormalizerNameError - normName is not contained in normNames

    Rib00 - "Using Visualization to Detect Plagiarism in Computer Science
            Classes", Randy L. Ribler, Marc Abrams, 2000
    """
    # check preconditions
    if type(basefile) != type(""):
        raise NoValidArgumentError("basefile must be of type string")
    elif type(ensemble) != type([]):
        raise NoValidArgumentError("ensemble must be of type list")
    elif type(klength) != type(1):
        raise NoValidArgumentError("klength must be of type integer")
    elif klength <= 0:
        raise NoValidArgumentError("klength must be greater zero")
    elif normName not in normNames:
        raise NoValidNormalizerNameError(normName)

    # get Normalizer
    normalize = normNames.get(normName)

    #===init base file===
    normBase = normalize(basefile)
    # ngram -> positions of that ngram in the base file (the values are
    # used as indices into nGramOccurencies below)
    nGramDict = createNGrams(normBase, klength, mode=mode)
    # the key set does not change while the ensemble is processed
    baseNGrams = set(nGramDict.keys())

    # one counter per ngram start position of the base file
    if mode == WORD:
        occSize = len(normBase.split()) - klength + 1
    else:
        occSize = len(normBase) - klength + 1
    nGramOccurencies = [0 for x in xrange(occSize)]

    #===check ensemble files===
    for text in ensemble:
        # the base file itself is not counted
        if text == basefile:
            continue
        normText = normalize(text)
        # FIX: build the ngrams from the normalized text; the raw text was
        # used before, so normalized base ngrams were compared against
        # un-normalized ensemble ngrams.
        textNGrams = createNGrams(normText, klength, mode=mode)
        # every base position whose ngram also occurs in this text gets
        # one more hit (at most one hit per ensemble file)
        for ngram in baseNGrams & set(textNGrams.keys()):
            for i in nGramDict.get(ngram):
                nGramOccurencies[i] += 1

    #===create CP Image===
    colorBG = (255, 255, 255)
    colorSelf = (0, 0, 200)
    colorInfrequently = (200, 0, 0)
    colorFrequently = (0, 200, 0)
    infrequent = 3
    ydist = 10
    # NOTE(review): reduce() without initial value raises TypeError if
    # nGramOccurencies is empty (occSize <= 0, i.e. the normalized base
    # text is shorter than klength).
    maxNb = reduce(returnGreater, nGramOccurencies)
    maxY = (maxNb + 2) * ydist
    img = Image.new("RGB", (len(nGramOccurencies), maxY), colorBG)
    draw = ImageDraw.Draw(img)

    # draw one vertical stroke per base position; its row is the hit count
    # plus one, its color depends on that row
    for x, count in enumerate(nGramOccurencies):
        yCoord = count + 1
        y = maxY - (ydist * yCoord)
        if yCoord == 1:
            color = colorSelf
        elif yCoord <= infrequent:
            color = colorInfrequently
        else:
            color = colorFrequently
        draw.line((x, y - (ydist - 6), x, y + (ydist - 6)), fill=color)

    #===paste img to chart pattern===
    img = createChart(img, maxNb + 1, ydist,
                      "N-gram starting character",
                      "Nb of files containing n-gram",
                      "Submission " + " (k=" + str(klength) + ")")
    # clean up
    del draw
    # return CP chart image
    return img
def createCategoricalPatterngram(basefile, ensemble, klength, normName, mode=CHAR):
    """Creates a Categorical Patterngram as PIL Image object.

    It "shows the degree of similarity that exists between one file
    [(the base file)] and the rest of the ensemble of files." (Rib00)

    "[A] categorical patterngram [...] [is] a visualization that displays,
    for each character position x in the base file, the number of ensemble
    files that contain the k-length n-gram that begins at that position.
    A point is plotted at coordinate (x,y) if the k-length n-gram beginning
    at character x in the base file occurs at any location in y files of
    the ensemble." (Rib00)

    Arguments:
    basefile - the single file in the 'one-to-many' comparison (string)
    ensemble - the set of files used for the comparison (list); entries
               equal to basefile are skipped
    klength  - the length of the used ngrams (integer greater zero)
    normName - the name of the normalizer used to normalize the texts
               (must be contained in normNames)
    mode     - default: CHAR (=1); WORD (=0)

    Returns the result of createChart() applied to the drawn image.

    Raises:
    NoValidArgumentError - basefile is no string, ensemble is no list,
                           klength is no integer or klength <= 0
    NoValidNormalizerNameError - normName is not contained in normNames

    Rib00 - "Using Visualization to Detect Plagiarism in Computer Science
            Classes", Randy L. Ribler, Marc Abrams, 2000
    """
    # check preconditions
    if type(basefile) != type(""):
        raise NoValidArgumentError("basefile must be of type string")
    elif type(ensemble) != type([]):
        raise NoValidArgumentError("ensemble must be of type list")
    elif type(klength) != type(1):
        raise NoValidArgumentError("klength must be of type integer")
    elif klength <= 0:
        raise NoValidArgumentError("klength must be greater zero")
    elif normName not in normNames:
        raise NoValidNormalizerNameError(normName)

    # get Normalizer
    normalize = normNames.get(normName)

    #===init base file===
    normBase = normalize(basefile)
    # ngram -> positions of that ngram in the base file (the values are
    # used as indices into nGramOccurencies below)
    nGramDict = createNGrams(normBase, klength, mode=mode)
    # the key set does not change while the ensemble is processed
    baseNGrams = set(nGramDict.keys())

    # one counter per ngram start position of the base file
    if mode == WORD:
        occSize = len(normBase.split()) - klength + 1
    else:
        occSize = len(normBase) - klength + 1
    nGramOccurencies = [0 for x in xrange(occSize)]

    #===check ensemble files===
    for text in ensemble:
        # the base file itself is not counted
        if text == basefile:
            continue
        normText = normalize(text)
        # FIX: build the ngrams from the normalized text; the raw text was
        # used before, so normalized base ngrams were compared against
        # un-normalized ensemble ngrams.
        textNGrams = createNGrams(normText, klength, mode=mode)
        # every base position whose ngram also occurs in this text gets
        # one more hit (at most one hit per ensemble file)
        for ngram in baseNGrams & set(textNGrams.keys()):
            for i in nGramDict.get(ngram):
                nGramOccurencies[i] += 1

    #===create CP Image===
    colorBG = (255, 255, 255)
    colorSelf = (0, 0, 200)
    colorInfrequently = (200, 0, 0)
    colorFrequently = (0, 200, 0)
    infrequent = 3
    ydist = 10
    # NOTE(review): reduce() without initial value raises TypeError if
    # nGramOccurencies is empty (occSize <= 0, i.e. the normalized base
    # text is shorter than klength).
    maxNb = reduce(returnGreater, nGramOccurencies)
    maxY = (maxNb + 2) * ydist
    img = Image.new("RGB", (len(nGramOccurencies), maxY), colorBG)
    draw = ImageDraw.Draw(img)

    # draw one vertical stroke per base position; its row is the hit count
    # plus one, its color depends on that row
    for x, count in enumerate(nGramOccurencies):
        yCoord = count + 1
        y = maxY - (ydist * yCoord)
        if yCoord == 1:
            color = colorSelf
        elif yCoord <= infrequent:
            color = colorInfrequently
        else:
            color = colorFrequently
        draw.line((x, y - (ydist - 6), x, y + (ydist - 6)), fill=color)

    #===paste img to chart pattern===
    img = createChart(img, maxNb + 1, ydist,
                      "N-gram starting character",
                      "Nb of files containing n-gram",
                      "Submission " + " (k=" + str(klength) + ")")
    # clean up
    del draw
    # return CP chart image
    return img
def createCompositeCategoricalPatterngram(basefile, ensemble, klength, normName, mode=CHAR):
    """Creates a Composite Categorical Patterngram as PIL Image object.

    It "shows which particular files are similar." (Rib00)

    A composite categorical patterngram is a visualization that displays,
    for each character position x in the base file, which files of the
    ensemble also contain the k-length n-gram that begins at that position.
    "A point is plotted at (x,y) if the k-length n-gram beginning at x in
    the base file occurs one or more times in file y." (Rib00)

    Arguments:
    basefile - the single file in the 'one-to-many' comparison (string)
    ensemble - the set of files used for the comparison (list); an entry
               equal to basefile gets a fully marked row and its 1-based
               index is shown in the chart title
    klength  - the length of the used ngrams (integer greater zero)
    normName - the name of the normalizer used to normalize the texts
               (must be contained in normNames)
    mode     - default: CHAR (=1); WORD (=0)

    Returns the result of createChart() applied to the drawn image.

    Raises:
    NoValidArgumentError - basefile is no string, ensemble is no list,
                           klength is no integer or klength <= 0
    NoValidNormalizerNameError - normName is not contained in normNames

    Rib00 - "Using Visualization to Detect Plagiarism in Computer Science
            Classes", Randy L. Ribler, Marc Abrams, 2000
    """
    # argument validation; every failing check raises immediately
    if type(basefile) is not str:
        raise NoValidArgumentError("basefile must be of type string")
    if type(ensemble) is not list:
        raise NoValidArgumentError("ensemble must be of type list")
    if type(klength) is not int:
        raise NoValidArgumentError("klength must be of type integer")
    if klength <= 0:
        raise NoValidArgumentError("klength must be greater zero")
    if normName not in normNames:
        raise NoValidNormalizerNameError(normName, normNames)

    #===init base file===
    normalize = normNames.get(normName)
    normBase = normalize(basefile)
    # ngram -> positions of that ngram in the base file (the values are
    # used as x coordinates below)
    basePositions = createNGrams(normBase, klength, mode)
    baseKeys = set(basePositions.keys())

    #===create CCP Image===
    white = (255, 255, 255)
    colorSelf = (0, 0, 200)
    colorInfrequently = (200, 0, 0)
    colorFrequently = (0, 200, 0)
    infrequent = 2
    ydist = 10
    halfStroke = ydist - 6

    # width: number of ngram start positions in the base file
    if mode == WORD:
        unitCount = len(normBase.split())
    else:
        unitCount = len(normBase)
    maxX = unitCount - klength + 1
    # height: one row of ydist pixels per ensemble file plus one spare row
    maxY = (len(ensemble) + 1) * ydist

    img = Image.new("RGB", (maxX, maxY), white)
    draw = ImageDraw.Draw(img)
    subNr = ''

    #===check ensemble files===
    for index, text in enumerate(ensemble):
        rowY = maxY - (ydist * (index + 1))
        top = rowY - halfStroke
        bottom = rowY + halfStroke

        if text == basefile:
            # the base file's own row is marked over the whole width
            subNr = index + 1
            for x in xrange(maxX):
                draw.line((x, top, x, bottom), fill=colorSelf)
            continue

        # any other file: mark every base position whose ngram also
        # occurs in this file, colored after the ngram's frequency there
        otherNGrams = createNGrams(normalize(text), klength, mode)
        for ngram in baseKeys & set(otherNGrams.keys()):
            if len(otherNGrams.get(ngram)) <= infrequent:
                stroke = colorInfrequently
            else:
                stroke = colorFrequently
            for pos in basePositions.get(ngram):
                draw.line((pos, top, pos, bottom), fill=stroke)

    #===paste img to chart pattern===
    title = "Submission nb: " + str(subNr) + " (k=" + str(klength) + ")"
    return createChart(img, len(ensemble), ydist,
                       "N-gram starting character", "File number", title)
def createCompositeCategoricalPatterngram(basefile, ensemble, klength, normName, mode=CHAR):
    """Creates a Composite Categorical Patterngram as PIL Image object.

    It "shows which particular files are similar." (Rib00)

    A composite categorical patterngram is a visualization that displays,
    for each character position x in the base file, which files of the
    ensemble also contain the k-length n-gram that begins at that position.
    "A point is plotted at (x,y) if the k-length n-gram beginning at x in
    the base file occurs one or more times in file y." (Rib00)

    Arguments:
    basefile - the single file in the 'one-to-many' comparison (string)
    ensemble - the set of files used for the comparison (list); an entry
               equal to basefile gets a fully marked row and its 1-based
               index is shown in the chart title
    klength  - the length of the used ngrams (integer greater zero)
    normName - the name of the normalizer used to normalize the texts
               (must be contained in normNames)
    mode     - default: CHAR (=1); WORD (=0)

    Returns the result of createChart() applied to the drawn image.

    Raises:
    NoValidArgumentError - basefile is no string, ensemble is no list,
                           klength is no integer or klength <= 0
    NoValidNormalizerNameError - normName is not contained in normNames

    Rib00 - "Using Visualization to Detect Plagiarism in Computer Science
            Classes", Randy L. Ribler, Marc Abrams, 2000
    """
    # argument validation; every failing check raises immediately
    if type(basefile) is not str:
        raise NoValidArgumentError("basefile must be of type string")
    if type(ensemble) is not list:
        raise NoValidArgumentError("ensemble must be of type list")
    if type(klength) is not int:
        raise NoValidArgumentError("klength must be of type integer")
    if klength <= 0:
        raise NoValidArgumentError("klength must be greater zero")
    if normName not in normNames:
        raise NoValidNormalizerNameError(normName, normNames)

    #===init base file===
    normalize = normNames.get(normName)
    normBase = normalize(basefile)
    # ngram -> positions of that ngram in the base file (the values are
    # used as x coordinates below)
    basePositions = createNGrams(normBase, klength, mode)
    baseKeys = set(basePositions.keys())

    #===create CCP Image===
    white = (255, 255, 255)
    colorSelf = (0, 0, 200)
    colorInfrequently = (200, 0, 0)
    colorFrequently = (0, 200, 0)
    infrequent = 2
    ydist = 10
    halfStroke = ydist - 6

    # width: number of ngram start positions in the base file
    if mode == WORD:
        unitCount = len(normBase.split())
    else:
        unitCount = len(normBase)
    maxX = unitCount - klength + 1
    # height: one row of ydist pixels per ensemble file plus one spare row
    maxY = (len(ensemble) + 1) * ydist

    img = Image.new("RGB", (maxX, maxY), white)
    draw = ImageDraw.Draw(img)
    subNr = ''

    #===check ensemble files===
    for index, text in enumerate(ensemble):
        rowY = maxY - (ydist * (index + 1))
        top = rowY - halfStroke
        bottom = rowY + halfStroke

        if text == basefile:
            # the base file's own row is marked over the whole width
            subNr = index + 1
            for x in xrange(maxX):
                draw.line((x, top, x, bottom), fill=colorSelf)
            continue

        # any other file: mark every base position whose ngram also
        # occurs in this file, colored after the ngram's frequency there
        otherNGrams = createNGrams(normalize(text), klength, mode)
        for ngram in baseKeys & set(otherNGrams.keys()):
            if len(otherNGrams.get(ngram)) <= infrequent:
                stroke = colorInfrequently
            else:
                stroke = colorFrequently
            for pos in basePositions.get(ngram):
                draw.line((pos, top, pos, bottom), fill=stroke)

    #===paste img to chart pattern===
    title = "Submission nb: " + str(subNr) + " (k=" + str(klength) + ")"
    return createChart(img, len(ensemble), ydist,
                       "N-gram starting character", "File number", title)