Python createNGrams示例

编程语言: Python

命名空间/包名称: Products.ECAssignmentBox.PlagDetector.Detection.algNGRAM

方法/功能: createNGrams

hotexamples.com的示例: 4

Python createNGrams - 共找到 4 个示例。这些是从开源项目中提取的、最受好评的 Products.ECAssignmentBox.PlagDetector.Detection.algNGRAM.createNGrams 真实 Python 示例。您可以对示例进行评价，以帮助我们提高示例质量。

示例#1

显示文件

文件： patterngram.py 项目： dtgit/dtedu

def createCategoricalPatterngram(basefile, ensemble, klength, normName, mode=CHAR):
    """Create a Categorical Patterngram chart image.

    It "shows the degree of similarity that exists between one file [(the
    base file)] and the rest of the ensemble of files." (Rib00)

    "[A] categorical patterngram [...] [is] a visualization that displays,
    for each character position x in the base file, the number of
    ensemble files that contain the k-length n-gram that begins at that
    position. A point is plotted at coordinate (x,y) if the k-length
    n-gram beginning at character x in the base file occurs at any
    location in y files of the ensemble." (Rib00)

    Arguments:
        basefile - the single file (a string) in the 'one-to-many' comparison
        ensemble - the list of files (strings) used for the comparison;
                   entries equal to basefile are skipped
        klength  - the length of the used ngrams (integer > 0)
        normName - the name of the normalizer used to normalize the texts
                   (must be a key of normNames)
        mode     - default: CHAR (=1); WORD (=0)

    Returns:
        whatever createChart() returns for the drawn PIL image.

    Raises:
        NoValidArgumentError       - basefile/ensemble/klength have the
                                     wrong type or klength is not positive
        NoValidNormalizerNameError - normName is not a known normalizer

    Rib00 - "Using Visualization to Detect Plagiarism in Computer Science Classes",
    Randy L. Ribler, Marc Abrams, 2000
    """
    # check preconditions
    if not isinstance(basefile, str):
        raise NoValidArgumentError("basefile must be of type string")
    if not isinstance(ensemble, list):
        raise NoValidArgumentError("ensemble must be of type list")
    if not isinstance(klength, int):
        raise NoValidArgumentError("klength must be of type integer")
    if klength <= 0:
        raise NoValidArgumentError("klength must be greater zero")
    if normName not in normNames:
        raise NoValidNormalizerNameError(normName)

    # get Normalizer
    normalize = normNames.get(normName)

    # ===init base file===
    normBase = normalize(basefile)
    # maps each n-gram of the base file to the positions where it starts
    nGramDict = createNGrams(normBase, klength, mode=mode)
    # one counter per n-gram start position in the base file
    if mode == WORD:
        occSize = len(normBase.split()) - klength + 1
    else:
        occSize = len(normBase) - klength + 1
    nGramOccurencies = [0] * occSize

    # ===check ensemble files===
    # the base n-gram set does not change, so build it only once
    baseNGrams = set(nGramDict.keys())
    for text in ensemble:
        # the base file itself does not count as an ensemble hit
        if text == basefile:
            continue
        # The n-grams must be built from the *normalized* text; the base
        # file n-grams are normalized too, so comparing against the raw
        # text would miss matches.
        normText = normalize(text)
        textNGrams = createNGrams(normText, klength, mode=mode)
        # every base position of a shared n-gram is found in one more file
        for ngram in baseNGrams & set(textNGrams.keys()):
            for i in nGramDict.get(ngram):
                nGramOccurencies[i] += 1

    # ===create CP Image===
    colorBG = (255, 255, 255)
    colorSelf = (0, 0, 200)
    colorInfrequently = (200, 0, 0)
    colorFrequently = (0, 200, 0)
    infrequent = 3
    ydist = 10
    # NOTE(review): reduce() raises TypeError for an empty sequence, i.e.
    # when the normalized base file has fewer than klength units.
    maxNb = reduce(returnGreater, nGramOccurencies)
    maxY = (maxNb + 2) * ydist
    img = Image.new("RGB", (len(nGramOccurencies), maxY), colorBG)
    draw = ImageDraw.Draw(img)

    # plot one short vertical mark per n-gram start position
    for x, occurrences in enumerate(nGramOccurencies):
        # row 1 is the base file itself, so every count is shifted by one
        yCoord = occurrences + 1
        y = maxY - (ydist * yCoord)
        if yCoord == 1:
            # n-gram was found in no other ensemble file
            color = colorSelf
        elif yCoord <= infrequent:
            color = colorInfrequently
        else:
            color = colorFrequently
        draw.line((x, y - (ydist - 6), x, y + (ydist - 6)), fill=color)

    # ===paste img to chart pattern===
    img = createChart(img, maxNb + 1, ydist, "N-gram starting character", "Nb of files containing n-gram", "Submission " + " (k=" + str(klength) + ")")

    # clean up
    del draw

    # return CP chart image
    return img

示例#2

显示文件

def createCategoricalPatterngram(basefile,
                                 ensemble,
                                 klength,
                                 normName,
                                 mode=CHAR):
    """Create a Categorical Patterngram chart image.

    It "shows the degree of similarity that exists between one file [(the
    base file)] and the rest of the ensemble of files." (Rib00)

    "[A] categorical patterngram [...] [is] a visualization that displays,
    for each character position x in the base file, the number of
    ensemble files that contain the k-length n-gram that begins at that
    position. A point is plotted at coordinate (x,y) if the k-length
    n-gram beginning at character x in the base file occurs at any
    location in y files of the ensemble." (Rib00)

    Arguments:
        basefile - the single file (a string) in the 'one-to-many' comparison
        ensemble - the list of files (strings) used for the comparison;
                   entries equal to basefile are skipped
        klength  - the length of the used ngrams (integer > 0)
        normName - the name of the normalizer used to normalize the texts
                   (must be a key of normNames)
        mode     - default: CHAR (=1); WORD (=0)

    Returns:
        whatever createChart() returns for the drawn PIL image.

    Raises:
        NoValidArgumentError       - basefile/ensemble/klength have the
                                     wrong type or klength is not positive
        NoValidNormalizerNameError - normName is not a known normalizer

    Rib00 - "Using Visualization to Detect Plagiarism in Computer Science Classes",
    Randy L. Ribler, Marc Abrams, 2000
    """
    # check preconditions
    if not isinstance(basefile, str):
        raise NoValidArgumentError("basefile must be of type string")
    if not isinstance(ensemble, list):
        raise NoValidArgumentError("ensemble must be of type list")
    if not isinstance(klength, int):
        raise NoValidArgumentError("klength must be of type integer")
    if klength <= 0:
        raise NoValidArgumentError("klength must be greater zero")
    if normName not in normNames:
        raise NoValidNormalizerNameError(normName)

    # get Normalizer
    normalize = normNames.get(normName)

    # ===init base file===
    normBase = normalize(basefile)
    # maps each n-gram of the base file to the positions where it starts
    nGramDict = createNGrams(normBase, klength, mode=mode)
    # one counter per n-gram start position in the base file
    if mode == WORD:
        occSize = len(normBase.split()) - klength + 1
    else:
        occSize = len(normBase) - klength + 1
    nGramOccurencies = [0] * occSize

    # ===check ensemble files===
    # the base n-gram set does not change, so build it only once
    baseNGrams = set(nGramDict.keys())
    for text in ensemble:
        # the base file itself does not count as an ensemble hit
        if text == basefile:
            continue
        # The n-grams must be built from the *normalized* text; the base
        # file n-grams are normalized too, so comparing against the raw
        # text would miss matches.
        normText = normalize(text)
        textNGrams = createNGrams(normText, klength, mode=mode)
        # every base position of a shared n-gram is found in one more file
        for ngram in baseNGrams & set(textNGrams.keys()):
            for i in nGramDict.get(ngram):
                nGramOccurencies[i] += 1

    # ===create CP Image===
    colorBG = (255, 255, 255)
    colorSelf = (0, 0, 200)
    colorInfrequently = (200, 0, 0)
    colorFrequently = (0, 200, 0)
    infrequent = 3
    ydist = 10
    # NOTE(review): reduce() raises TypeError for an empty sequence, i.e.
    # when the normalized base file has fewer than klength units.
    maxNb = reduce(returnGreater, nGramOccurencies)
    maxY = (maxNb + 2) * ydist
    img = Image.new("RGB", (len(nGramOccurencies), maxY), colorBG)
    draw = ImageDraw.Draw(img)

    # plot one short vertical mark per n-gram start position
    for x, occurrences in enumerate(nGramOccurencies):
        # row 1 is the base file itself, so every count is shifted by one
        yCoord = occurrences + 1
        y = maxY - (ydist * yCoord)
        if yCoord == 1:
            # n-gram was found in no other ensemble file
            color = colorSelf
        elif yCoord <= infrequent:
            color = colorInfrequently
        else:
            color = colorFrequently
        draw.line((x, y - (ydist - 6), x, y + (ydist - 6)), fill=color)

    # ===paste img to chart pattern===
    img = createChart(img, maxNb + 1, ydist, "N-gram starting character",
                      "Nb of files containing n-gram",
                      "Submission " + " (k=" + str(klength) + ")")

    # clean up
    del draw

    # return CP chart image
    return img

示例#3

显示文件

文件： patterngram.py 项目： dtgit/dtedu

def createCompositeCategoricalPatterngram(basefile, ensemble, klength, normName, mode=CHAR):
    """Create a Composite Categorical Patterngram chart image.

    It "shows which particular files are similar." (Rib00)

    A composite categorical patterngram is a visualization that displays,
    for each character position x in the base file, which files of the
    ensemble also contain the k-length n-gram that begins at that
    position. "A point is plotted at (x,y) if the k-length n-gram beginning
    at x in the base file occurs one or more times in file y." (Rib00)

    Arguments:
        basefile - the single file (a string) in the 'one-to-many' comparison
        ensemble - the list of files (strings) used for the comparison
        klength  - the length of the used ngrams (integer > 0)
        normName - the name of the normalizer used to normalize the texts
                   (must be a key of normNames)
        mode     - default: CHAR (=1); WORD (=0)

    Returns:
        whatever createChart() returns for the drawn PIL image.

    Raises:
        NoValidArgumentError       - basefile/ensemble/klength have the
                                     wrong type or klength is not positive
        NoValidNormalizerNameError - normName is not a known normalizer

    Rib00 - "Using Visualization to Detect Plagiarism in Computer Science Classes",
    Randy L. Ribler, Marc Abrams, 2000
    """
    # check preconditions
    if not isinstance(basefile, str):
        raise NoValidArgumentError("basefile must be of type string")
    if not isinstance(ensemble, list):
        raise NoValidArgumentError("ensemble must be of type list")
    if not isinstance(klength, int):
        raise NoValidArgumentError("klength must be of type integer")
    if klength <= 0:
        raise NoValidArgumentError("klength must be greater zero")
    if normName not in normNames:
        raise NoValidNormalizerNameError(normName, normNames)

    # get Normalizer
    normalize = normNames.get(normName)

    # ===init base file===
    normBase = normalize(basefile)
    # maps each n-gram of the base file to the positions where it starts
    nGramDict = createNGrams(normBase, klength, mode)
    # the base n-gram set does not change, so build it only once
    baseNGrams = set(nGramDict.keys())

    # ===create CPP Image===
    colorBG = (255, 255, 255)
    colorSelf = (0, 0, 200)
    colorInfrequently = (200, 0, 0)
    colorFrequently = (0, 200, 0)
    infrequent = 2
    ydist = 10
    # image width: one column per n-gram start position in the base file
    if mode == WORD:
        maxX = len(normBase.split()) - klength + 1
    else:
        maxX = len(normBase) - klength + 1
    # image height: one row of ydist pixels per ensemble file, plus one
    maxY = (len(ensemble) + 1) * ydist
    img = Image.new("RGB", (maxX, maxY), colorBG)
    draw = ImageDraw.Draw(img)
    # stays '' in the chart title if basefile is not part of the ensemble
    subNr = ''

    # ===check ensemble files===
    for y, text in enumerate(ensemble):
        yPos = maxY - (ydist * (y + 1))

        # the base file's own row is marked across its whole width
        if text == basefile:
            subNr = y + 1
            for x in xrange(maxX):
                draw.line((x, yPos - (ydist - 6), x, yPos + (ydist - 6)), fill=colorSelf)
            continue

        # any other file: mark every base position whose n-gram it shares
        normText = normalize(text)
        textNGrams = createNGrams(normText, klength, mode)
        for ngram in baseNGrams & set(textNGrams.keys()):
            # The colour depends only on how often the n-gram occurs in
            # this file, so pick it once per n-gram, not once per position.
            if len(textNGrams.get(ngram)) <= infrequent:
                color = colorInfrequently
            else:
                color = colorFrequently
            for i in nGramDict.get(ngram):
                draw.line((i, yPos - (ydist - 6), i, yPos + (ydist - 6)), fill=color)

    # ===paste img to chart pattern===
    img = createChart(img, len(ensemble), ydist, "N-gram starting character", "File number", "Submission nb: " + str(subNr) + " (k=" + str(klength) + ")")

    return img

示例#4

显示文件

def createCompositeCategoricalPatterngram(basefile,
                                          ensemble,
                                          klength,
                                          normName,
                                          mode=CHAR):
    """Create a Composite Categorical Patterngram chart image.

    It "shows which particular files are similar." (Rib00)

    A composite categorical patterngram is a visualization that displays,
    for each character position x in the base file, which files of the
    ensemble also contain the k-length n-gram that begins at that
    position. "A point is plotted at (x,y) if the k-length n-gram beginning
    at x in the base file occurs one or more times in file y." (Rib00)

    Arguments:
        basefile - the single file (a string) in the 'one-to-many' comparison
        ensemble - the list of files (strings) used for the comparison
        klength  - the length of the used ngrams (integer > 0)
        normName - the name of the normalizer used to normalize the texts
                   (must be a key of normNames)
        mode     - default: CHAR (=1); WORD (=0)

    Returns:
        whatever createChart() returns for the drawn PIL image.

    Raises:
        NoValidArgumentError       - basefile/ensemble/klength have the
                                     wrong type or klength is not positive
        NoValidNormalizerNameError - normName is not a known normalizer

    Rib00 - "Using Visualization to Detect Plagiarism in Computer Science Classes",
    Randy L. Ribler, Marc Abrams, 2000
    """
    # check preconditions
    if not isinstance(basefile, str):
        raise NoValidArgumentError("basefile must be of type string")
    if not isinstance(ensemble, list):
        raise NoValidArgumentError("ensemble must be of type list")
    if not isinstance(klength, int):
        raise NoValidArgumentError("klength must be of type integer")
    if klength <= 0:
        raise NoValidArgumentError("klength must be greater zero")
    if normName not in normNames:
        raise NoValidNormalizerNameError(normName, normNames)

    # get Normalizer
    normalize = normNames.get(normName)

    # ===init base file===
    normBase = normalize(basefile)
    # maps each n-gram of the base file to the positions where it starts
    nGramDict = createNGrams(normBase, klength, mode)
    # the base n-gram set does not change, so build it only once
    baseNGrams = set(nGramDict.keys())

    # ===create CPP Image===
    colorBG = (255, 255, 255)
    colorSelf = (0, 0, 200)
    colorInfrequently = (200, 0, 0)
    colorFrequently = (0, 200, 0)
    infrequent = 2
    ydist = 10
    # image width: one column per n-gram start position in the base file
    if mode == WORD:
        maxX = len(normBase.split()) - klength + 1
    else:
        maxX = len(normBase) - klength + 1
    # image height: one row of ydist pixels per ensemble file, plus one
    maxY = (len(ensemble) + 1) * ydist
    img = Image.new("RGB", (maxX, maxY), colorBG)
    draw = ImageDraw.Draw(img)
    # stays '' in the chart title if basefile is not part of the ensemble
    subNr = ''

    # ===check ensemble files===
    for y, text in enumerate(ensemble):
        yPos = maxY - (ydist * (y + 1))

        # the base file's own row is marked across its whole width
        if text == basefile:
            subNr = y + 1
            for x in xrange(maxX):
                draw.line((x, yPos - (ydist - 6), x, yPos + (ydist - 6)),
                          fill=colorSelf)
            continue

        # any other file: mark every base position whose n-gram it shares
        normText = normalize(text)
        textNGrams = createNGrams(normText, klength, mode)
        for ngram in baseNGrams & set(textNGrams.keys()):
            # The colour depends only on how often the n-gram occurs in
            # this file, so pick it once per n-gram, not once per position.
            if len(textNGrams.get(ngram)) <= infrequent:
                color = colorInfrequently
            else:
                color = colorFrequently
            for i in nGramDict.get(ngram):
                draw.line((i, yPos - (ydist - 6), i, yPos + (ydist - 6)),
                          fill=color)

    # ===paste img to chart pattern===
    img = createChart(
        img, len(ensemble), ydist, "N-gram starting character", "File number",
        "Submission nb: " + str(subNr) + " (k=" + str(klength) + ")")

    return img