Exemplo n.º 1
0
def histoDiversity(theData, theType, outFile):
    """Write the DRAFT and FINAL diversity histograms for one paper type.

    For every kept row of ``theData`` the draft quotient ``row[4] / row[3]``
    and the final quotient ``row[6] / row[5]`` are multiplied by 100, rounded
    to an integer and tallied.  The report header describes each quotient as
    "#uniq words / #words".  One report section per version (DRAFT first,
    then FINAL) is written to ``outFile``.

    Relies on the module-level names ``inFileName`` and ``Histogram``.

    Args:
        theData: iterable of indexable rows.  ``row[1]`` is a name whose last
            character is parsed with ``int`` and compared to ``theType``;
            ``row[3]`` .. ``row[6]`` must be convertible to ``float``.
        theType: paper type to keep; ``0`` keeps every row.
        outFile: object with a ``write`` method that receives the report.

    Raises:
        ZeroDivisionError: if ``row[3]`` or ``row[5]`` is zero for a kept row.
    """
    draftDict = defaultdict(int)
    finalDict = defaultdict(int)

    for line in theData:
        name = line[1]
        if (0 != theType) and (int(theType) != int(name[-1])):
            continue
        draftDiversity = float(line[4]) / float(line[3])
        finalDiversity = float(line[6]) / float(line[5])
        draftDict[int(round(100 * draftDiversity))] += 1
        finalDict[int(round(100 * finalDiversity))] += 1

    def writeSection(which, countDict):
        """Write one labelled histogram section (or a 'no papers' notice)."""
        headerInfo = '\nInput from file: %s\n' % (inFileName)
        # Append with '+=': a plain '=' here used to discard the file-name
        # line built just above, so it never reached the report.
        headerInfo += 'Diversity is quotient of #uniq words and #words for ' + which + '\n'
        headerInfo += 'Column 1: This quotient times 100\n'
        headerInfo += 'Column 2: Percent of papers with this quotient\n'
        headerInfo += 'Column 3: Raw number of papers with this quotient\n'
        headerInfo += 'Column 4: The histogram\n'
        label = 'DIVERSITY FOR TYPE %d\n' % (theType)
        label += headerInfo

        if len(countDict) > 0:
            # histoTheData returns a pair; only the first element is written.
            histo, shortVersion = Histogram.histoTheData(label, countDict, 1, 1, outFile)
            outFile.write('%s\n' % (histo))
        else:
            outFile.write('NO PAPERS FOR %s\n' % (label))

    writeSection('DRAFT', draftDict)
    writeSection('FINAL', finalDict)
def histoWordCounts(theData, theType, which, outFile):
    """Write a histogram of per-paper word totals to ``outFile``.

    Sentences are visited in sorted key order.  For each sentence that
    matches ``theType`` (``0`` matches every type) and ``which``, its
    ``getSentenceLength()`` is added to the running total for its
    ``getName()``.  The histogram then counts how many names share each
    total.

    Relies on the module-level names ``inFileName`` and ``Histogram``.

    Args:
        theData: mapping of sortable keys to sentence objects.
        theType: sentence type to keep; ``0`` keeps all types.
        which: version tag compared against ``sentence.getWhich()``.
        outFile: object with a ``write`` method that receives the report.
    """
    wordsPerPaper = defaultdict(int)
    for _, sentence in sorted(theData.items()):
        wrongType = (0 != theType) and (theType != sentence.getType())
        if wrongType or which != sentence.getWhich():
            continue
        paperName = sentence.getName()
        wordTotal = sentence.getSentenceLength()
        wordsPerPaper[paperName] += wordTotal

    # Invert: word total -> number of papers having that total.
    papersPerTotal = defaultdict(int)
    for paperName in sorted(wordsPerPaper):
        papersPerTotal[wordsPerPaper[paperName]] += 1

    headerInfo = (
        '\nInput from file: %s\n' % (inFileName)
        + 'Column 1: # of words ' + which + '\n'
        + 'Column 2: percent of total\n'
        + 'Column 3: raw numbers of total\n'
        + 'Column 4: the histogram\n'
    )
    label = 'WORD COUNTS FOR TYPE %d ' % (theType)
    print('%s %s' % (label, which))
    label = label + headerInfo

    if not papersPerTotal:
        outFile.write('NO SENTENCES FOR %s\n' % (label))
        return

    # histoTheData returns a pair; only the first element is written.
    histo, _ = Histogram.histoTheData(label, papersPerTotal, 50, 50, outFile)
    outFile.write('%s\n' % (histo))
def histoInsertionsByPara(theData, theType, which, outFile):
    """Write a histogram of unaligned sentences per paragraph number.

    A sentence is counted when it matches ``theType`` (``0`` matches every
    type), matches ``which`` and ``isAligned()`` is false.  It is tallied
    under its ``getRightParaSub()`` value.

    Relies on the module-level names ``inFileName`` and ``Histogram``.

    Args:
        theData: mapping of sortable keys to sentence objects.
        theType: sentence type to keep; ``0`` keeps all types.
        which: version tag compared against ``sentence.getWhich()``.
        outFile: object with a ``write`` method that receives the report.
    """
    insertionsPerPara = defaultdict(int)
    for _, sentence in sorted(theData.items()):
        if (
            ((0 != theType) and (theType != sentence.getType()))
            or which != sentence.getWhich()
            or sentence.isAligned()
        ):
            continue
        insertionsPerPara[sentence.getRightParaSub()] += 1

    headerInfo = (
        '\nInput from file: %s\n' % (inFileName)
        + 'Column 1: paragraph numbers\n'
        + 'Column 2: percent of total\n'
        + 'Column 3: raw numbers of total\n'
        + 'Column 4: the histogram\n'
    )
    label = 'INSERTIONS BY PARAGRAPH INTO FINAL FOR TYPE %d ' % (theType)
    print('%s %s' % (label, which))
    label = label + headerInfo

    if not insertionsPerPara:
        outFile.write('NO INSERTIONS FOR %s\n' % (label))
        return

    # histoTheData returns a pair; only the first element is written.
    histo, _ = Histogram.histoTheData(label, insertionsPerPara, 1, 1, outFile)
    outFile.write('%s\n' % (histo))
def histoAlignmentFractionsByLevel(theData, theType, which, outFile):
    """Write a histogram of sentences per alignment level.

    Each sentence matching ``theType`` (``0`` matches every type) and
    ``which`` is tallied under ``int(getAlignmentLevel())``; a level of
    ``-1`` is filed under the bucket ``999`` instead.

    Relies on the module-level names ``inFileName`` and ``Histogram``.

    Args:
        theData: mapping of sortable keys to sentence objects.
        theType: sentence type to keep; ``0`` keeps all types.
        which: version tag compared against ``sentence.getWhich()``.
        outFile: object with a ``write`` method that receives the report.
    """
    sentencesPerLevel = defaultdict(int)
    for _, sentence in sorted(theData.items()):
        wrongType = (0 != theType) and (theType != sentence.getType())
        if wrongType or which != sentence.getWhich():
            continue
        # Result unused; the call is retained so behaviour is unchanged.
        sentence.getName()
        level = int(sentence.getAlignmentLevel())
        bucket = 999 if -1 == level else level
        sentencesPerLevel[bucket] += 1

    headerInfo = (
        '\nInput from file: %s\n' % (inFileName)
        + 'Column 1: fraction aligned in ' + which + '\n'
        + 'Column 2: percent of total\n'
        + 'Column 3: raw numbers of total\n'
        + 'Column 4: the histogram\n'
    )
    label = 'ALIGNMENTS FOR TYPE %d ' % (theType)
    print('%s %s' % (label, which))
    label = label + headerInfo

    if not sentencesPerLevel:
        outFile.write('NO PAPERS FOR %s\n' % (label))
        return

    # histoTheData returns a pair; only the first element is written.
    histo, _ = Histogram.histoTheData(label, sentencesPerLevel, 1, 1, outFile)
    outFile.write('%s\n' % (histo))
def histoInsertionsByEditDistFrac(theData, theType, which, outFile):
    """Write a histogram of unaligned sentences by ``getPreviousDistance()``.

    Sentences matching ``theType`` (``0`` matches every type) and ``which``
    contribute the first whitespace-separated token of their key to the
    paper total.  Of those, the unaligned ones are tallied under their
    previous-distance value; a negative value is filed under ``899``.
    When ``which`` is ``"FINAL"``, a negative-distance sentence whose right
    paragraph and sentence subscripts are both ``0`` is also counted as an
    initial insertion, and that count is reported in the header.

    Relies on the module-level names ``inFileName`` and ``Histogram``.

    Args:
        theData: mapping of sortable string keys to sentence objects.
        theType: sentence type to keep; ``0`` keeps all types.
        which: version tag compared against ``sentence.getWhich()``.
        outFile: object with a ``write`` method that receives the report.
    """
    papersSeen = set()
    initialInsertions = 0
    insertionsByDist = defaultdict(int)

    for key, sentence in sorted(theData.items()):
        wrongType = (0 != theType) and (theType != sentence.getType())
        if wrongType or which != sentence.getWhich():
            continue

        # Every sentence passing the filters registers its paper, whether
        # or not the sentence itself is aligned.
        papersSeen.add(key.split()[0])
        if sentence.isAligned():
            continue

        prevDist = sentence.getPreviousDistance()
        if prevDist < 0:
            atPaperStart = (
                "FINAL" == which
                and 0 == sentence.getRightParaSub()
                and 0 == sentence.getRightSentSub()
            )
            if atPaperStart:
                initialInsertions += 1
            prevDist = 899

        insertionsByDist[prevDist] += 1

    headerInfo = (
        "\nInput from file: %s\n" % (inFileName)
        + "There were initial insertions in %3d of %3d papers\n" % (initialInsertions, len(papersSeen))
        + "Column 1: edit dist fracs before insertion\n"
        + "Column 2: percent of total\n"
        + "Column 3: raw numbers of total\n"
        + "Column 4: the histogram\n"
    )
    label = "INSERTIONS BY EDIT DIST FRAC OF PREVIOUS SENTENCE FOR TYPE %d " % (theType)
    print("%s %s" % (label, which))
    label = label + headerInfo

    if not insertionsByDist:
        outFile.write("NO INSERTIONS FOR %s\n" % (label))
        return

    # histoTheData returns a pair; only the first element is written.
    histo, _ = Histogram.histoTheData(label, insertionsByDist, 4, 4, outFile)
    outFile.write("%s\n" % (histo))
def histoSentenceCounts(theData, theType, which, outFile):
    """Write a histogram of the highest sentence subscript per paper.

    For each sentence matching ``theType`` (``0`` matches every type) and
    ``which``, the sentence subscript is read from ``getLeftSentSub()`` when
    ``which`` is ``"DRAFT"`` and from ``getRightSentSub()`` otherwise.  The
    largest subscript seen for each ``getName()`` is kept (starting from 0),
    and the histogram counts how many names share each maximum.

    Relies on the module-level names ``inFileName`` and ``Histogram``.

    Args:
        theData: mapping of sortable keys to sentence objects.
        theType: sentence type to keep; ``0`` keeps all types.
        which: version tag compared against ``sent.getWhich()``.
        outFile: object with a ``write`` method that receives the report.
    """
    localDict = defaultdict(int)
    for key, sent in sorted(theData.items()):
        if (0 != theType) and (theType != sent.getType()):
            continue
        if which != sent.getWhich():
            continue
        name = sent.getName()
        if "DRAFT" == which:
            sentNum = int(sent.getLeftSentSub())
        else:
            sentNum = int(sent.getRightSentSub())

        # Reading localDict[name] also registers the paper with a maximum
        # of 0, so papers whose only subscript is 0 are still counted.
        if sentNum > localDict[name]:
            localDict[name] = sentNum

    # Invert: maximum subscript -> number of papers with that maximum.
    localDict2 = defaultdict(int)
    for name, maxValue in sorted(localDict.items()):
        localDict2[maxValue] += 1

    headerInfo = "\nInput from file: %s\n" % (inFileName)
    # This column holds sentence subscripts; the header previously said
    # "# of paragraphs", copied from the paragraph-count report.
    headerInfo += "Column 1: # of sentences " + which + "\n"
    headerInfo += "Column 2: percent of total\n"
    headerInfo += "Column 3: raw numbers of total\n"
    headerInfo += "Column 4: the histogram\n"
    label = "SENTENCE COUNTS FOR TYPE %d " % (theType)
    print("%s %s" % (label, which))
    label += headerInfo

    if len(localDict2) > 0:
        # histoTheData returns a pair; only the first element is written.
        histo, shortVersion = Histogram.histoTheData(label, localDict2, 2, 2, outFile)
        outFile.write("%s\n" % (histo))
    else:
        outFile.write("NO SENTENCES FOR %s\n" % (label))
def histoDeletionsByPara(theData, theType, which, lastParaNumDict, theFilter, outFile):
    """Write a histogram of unaligned sentences per left paragraph number.

    A sentence is a candidate when it matches ``theType`` (``0`` matches
    every type), matches ``which`` and ``isAligned()`` is false.  Its
    ``getLeftParaSub()`` value is then tested against ``theFilter``:

    * ``'FIRST'``  keeps only paragraph ``0``;
    * ``'LAST'``   keeps only the paper's last paragraph;
    * ``'MIDDLE'`` keeps everything except those two;
    * any other value keeps every candidate.

    The last paragraph number is looked up in ``lastParaNumDict`` under
    ``"<first token of the key> <which>"``.

    Relies on the module-level names ``inFileName`` and ``Histogram``.

    Args:
        theData: mapping of sortable string keys to sentence objects.
        theType: sentence type to keep; ``0`` keeps all types.
        which: version tag compared against ``sentence.getWhich()``.
        lastParaNumDict: mapping from ``"<doc> <which>"`` to the last
            paragraph number of that document.
        theFilter: position filter, see above.
        outFile: object with a ``write`` method that receives the report.
    """
    deletionsPerPara = defaultdict(int)
    for key, sentence in sorted(theData.items()):
        if (
            ((0 != theType) and (theType != sentence.getType()))
            or which != sentence.getWhich()
            or sentence.isAligned()
        ):
            continue

        paraNum = sentence.getLeftParaSub()
        # The lookup happens for every candidate, before any filtering.
        lastParaNum = lastParaNumDict[key.split()[0] + ' ' + which]

        if (
            (('FIRST' == theFilter) and (0 != paraNum))
            or (('LAST' == theFilter) and (lastParaNum != paraNum))
            or (('MIDDLE' == theFilter) and ((0 == paraNum) or (lastParaNum == paraNum)))
        ):
            continue

        deletionsPerPara[paraNum] += 1

    headerInfo = (
        '\nInput from file: %s for %s\n' % (inFileName, theFilter)
        + 'Column 1: paragraph numbers\n'
        + 'Column 2: percent of total\n'
        + 'Column 3: raw numbers of total\n'
        + 'Column 4: the histogram\n'
    )
    label = 'DELETIONS BY PARAGRAPH FROM DRAFT FOR TYPE %d ' % (theType)
    print('%s %s' % (label, which))
    label = label + headerInfo

    if not deletionsPerPara:
        outFile.write('NO DELETIONS FOR %s\n' % (label))
        return

    # histoTheData returns a pair; only the first element is written.
    histo, _ = Histogram.histoTheData(label, deletionsPerPara, 1, 1, outFile)
    outFile.write('%s\n' % (histo))
def histoParagraphCounts(theData, theType, which, outFile):
    """Write a histogram of the highest paragraph subscript per paper.

    For each sentence matching ``theType`` (``0`` matches every type) and
    ``which``, the paragraph subscript is read from ``getLeftParaSub()``
    when ``which`` is ``'DRAFT'`` and from ``getRightParaSub()`` otherwise.
    The largest subscript seen for each ``getName()`` is kept (starting from
    0), and the histogram counts how many names share each maximum.

    Relies on the module-level names ``inFileName`` and ``Histogram``.

    Args:
        theData: mapping of sortable keys to sentence objects.
        theType: sentence type to keep; ``0`` keeps all types.
        which: version tag compared against ``sentence.getWhich()``.
        outFile: object with a ``write`` method that receives the report.
    """
    maxParaPerPaper = defaultdict(int)
    for _, sentence in sorted(theData.items()):
        wrongType = (0 != theType) and (theType != sentence.getType())
        if wrongType or which != sentence.getWhich():
            continue
        paperName = sentence.getName()
        rawSub = sentence.getLeftParaSub() if 'DRAFT' == which else sentence.getRightParaSub()
        maxParaPerPaper[paperName] = max(maxParaPerPaper[paperName], int(rawSub))

    # Invert: maximum subscript -> number of papers with that maximum.
    papersPerMax = defaultdict(int)
    for paperName in sorted(maxParaPerPaper):
        papersPerMax[maxParaPerPaper[paperName]] += 1

    headerInfo = (
        '\nInput from file: %s\n' % (inFileName)
        + 'Column 1: # of paragraphs ' + which + '\n'
        + 'Column 2: percent of total\n'
        + 'Column 3: raw numbers of total\n'
        + 'Column 4: the histogram\n'
    )
    label = 'PARAGRAPH COUNTS FOR TYPE %d ' % (theType)
    print('%s %s' % (label, which))
    label = label + headerInfo

    if not papersPerMax:
        outFile.write('NO PARAGRAPHS FOR %s\n' % (label))
        return

    # histoTheData returns a pair; only the first element is written.
    histo, _ = Histogram.histoTheData(label, papersPerMax, 1, 1, outFile)
    outFile.write('%s\n' % (histo))
def histoAlignmentLevels(theData, theType, which, outFile):
    """Write a histogram of the highest alignment level per paper.

    For each sentence matching ``theType`` (``0`` matches every type) and
    ``which``, ``int(getAlignmentLevel())`` is compared with the largest
    level recorded so far for its ``getName()`` (starting from 0).  The
    histogram counts how many names share each maximum.

    Relies on the module-level names ``inFileName`` and ``Histogram``.

    Args:
        theData: mapping of sortable keys to sentence objects.
        theType: sentence type to keep; ``0`` keeps all types.
        which: version tag compared against ``sentence.getWhich()``.
        outFile: object with a ``write`` method that receives the report.
    """
    maxLevelPerPaper = defaultdict(int)
    for _, sentence in sorted(theData.items()):
        wrongType = (0 != theType) and (theType != sentence.getType())
        if wrongType or which != sentence.getWhich():
            continue
        paperName = sentence.getName()
        level = int(sentence.getAlignmentLevel())
        maxLevelPerPaper[paperName] = max(maxLevelPerPaper[paperName], level)

    # Invert: maximum level -> number of papers with that maximum.
    papersPerMax = defaultdict(int)
    for paperName in sorted(maxLevelPerPaper):
        papersPerMax[maxLevelPerPaper[paperName]] += 1

    headerInfo = (
        "\nInput from file: %s\n" % (inFileName)
        + "Column 1: last alignment with changes\n"
        + "Column 2: percent of total\n"
        + "Column 3: raw numbers of total\n"
        + "Column 4: the histogram\n"
    )
    label = "LAST ALIGNMENT FOR TYPE %d " % (theType)
    print("%s %s" % (label, which))
    label = label + headerInfo

    if not papersPerMax:
        outFile.write("NO ALIGNMENTS FOR %s\n" % (label))
        return

    # histoTheData returns a pair; only the first element is written.
    histo, _ = Histogram.histoTheData(label, papersPerMax, 1, 1, outFile)
    outFile.write("%s\n" % (histo))
def histoEditDistance(theData, theType, which, outFile):
    """Write the edit-distance histogram and, for type 0, save a plot.

    Text report: each sentence matching ``theType`` (``0`` matches every
    type) and ``which`` is tallied under
    ``int(round(getEditDistFracOfWorst() * 100.0))``; negative values are
    filed under ``999``.  The header refers to an 'UNALIGN' row, so
    ``Histogram.histoTheData`` presumably renders the 999 bucket that way
    (TODO confirm).

    Plot: only when ``theType`` is ``0``.  One series per sentence type is
    drawn, plus an "all" series under key 0; negative values are plotted as
    ``-5``.  The figure is saved under the name
    ``'EditDistHistograms' + which`` and then closed.

    Relies on the module-level names ``inFileName``, ``Histogram`` and
    ``plt``.

    Args:
        theData: mapping of sortable keys to sentence objects.
        theType: sentence type to keep; ``0`` keeps all types and also
            triggers the plot.
        which: version tag compared against ``sent.getWhich()``.
        outFile: object with a ``write`` method that receives the report.
    """
    localDict = defaultdict(int)
    for key, sent in sorted(theData.items()):
        if (0 != theType) and (theType != sent.getType()):
            continue
        if which != sent.getWhich():
            continue
        distRounded = int(round(sent.getEditDistFracOfWorst() * 100.0))
        if distRounded < 0:
            distRounded = 999
        localDict[distRounded] += 1

    headerInfo = '\nInput from file: %s\n' % (inFileName)
    headerInfo += 'Column 1: % change in aligned sentences from draft to final\n'
    headerInfo += "          'UNALIGN' means deletions from draft, insertions into final\n"
    headerInfo += 'Column 2: % of sentences with that change\n'
    headerInfo += 'Column 3: raw numbers of sentences with that change\n'
    headerInfo += 'Column 4: the histogram\n'
    label = 'EDIT DISTANCE COMPARISONS FOR ' + which + ' OF TYPE %d ' % (theType)
    print('%s %s' % (label, which))
    label += headerInfo

    if len(localDict) > 0:
        # histoTheData returns a pair; only the first element is written.
        histo, shortVersion = Histogram.histoTheData(label, localDict, 4, 4, outFile)
        outFile.write('%s\n' % (histo))
    else:
        outFile.write('NO EDIT DIST FRACS FOR %s\n' % (label))

    # The comparison plot is only produced for the "all types" run.
    if 0 != theType:
        return

    # Keys 0..4 are pre-seeded so the number of series matches the five
    # colours/labels below.  Each list starts with a single 0 sample --
    # presumably so that no series is empty (TODO confirm); note that this
    # adds one artificial 0 to every series.
    localTypeListDict = defaultdict(list)
    for typeKey in range(0, 5):
        localTypeListDict[typeKey] = [0]

    for key, sent in sorted(theData.items()):
        if which != sent.getWhich():
            continue
        thisType = sent.getType()

        distRounded = int(round(sent.getEditDistFracOfWorst() * 100.0))
        if distRounded < 0:
            distRounded = -5

        # Record under the sentence's own type and under the "all" type 0.
        localTypeListDict[thisType].append(distRounded)
        localTypeListDict[0].append(distRounded)

    multiset = [value for key, value in sorted(localTypeListDict.items())]

    numBins = 25

    fig = plt.figure()
    try:
        ax = fig.add_subplot(111)
        ax.hist(multiset, numBins, color=['green', 'red', 'blue', 'lime', 'orange'],
                label=['0', '1', '2', '3', '4'], alpha=0.8)
        ax.legend(prop={'size': 10})
        ax.set_title('Edit Distance Histograms ' + which)
        fig.savefig('EditDistHistograms' + which)
    finally:
        # pyplot keeps every figure alive until it is closed; release it so
        # repeated calls do not accumulate open figures.
        plt.close(fig)
Exemplo n.º 11
0
def histoDiversityByWordCount(theData, theType, whichLeft, whichRight, outFile):
    """Write two diversity histograms and show a combined plot.

    For every kept row of ``theData`` the values in columns 6 and 9 are read
    as floats, multiplied by 100, rounded and tallied.  The column-6 tally
    is reported under ``whichLeft`` and the column-9 tally under
    ``whichRight``.  The rounded values are also printed and, when any row
    was kept, drawn as a two-series histogram via ``plt.show()``.

    Relies on the module-level names ``inFileName``, ``Histogram`` and
    ``plt``.

    Args:
        theData: iterable of indexable rows.  ``row[1]`` is a name; after
            ``'.txt'`` is removed its last character is parsed with ``int``
            and compared to ``theType``.
        theType: paper type to keep; ``0`` keeps every row.
        whichLeft: label for the column-6 ("all words") section.
        whichRight: label for the column-9 ("nonstop words") section.
        outFile: object with a ``write`` method that receives the report.
    """
    allDict = defaultdict(int)
    nonstopDict = defaultdict(int)

    # Column layout note carried over from the original: "general" diversity
    # files hold all-word / nonstop-word values, "student" diversity files
    # hold draft / final values.  Indices 5 and 8 were the commented-out
    # alternative to the pair below.
    ALLDIVERSITY = 6
    NONSTOPDIVERSITY = 9

    allRoundedList = []
    nonstopRoundedList = []
    for line in theData:
        name = line[1].replace('.txt', '')
        if (0 != theType) and (int(theType) != int(name[-1])):
            continue
        allRounded = round(100 * float(line[ALLDIVERSITY]))
        nonstopRounded = round(100 * float(line[NONSTOPDIVERSITY]))
        allRoundedList.append(allRounded)
        nonstopRoundedList.append(nonstopRounded)
        allDict[int(allRounded)] += 1
        nonstopDict[int(nonstopRounded)] += 1

    def writeSection(which, countDict):
        """Write one labelled histogram section (or a 'no papers' notice)."""
        headerInfo = '\nInput from file: %s\n' % (inFileName)
        # Append with '+=': a plain '=' here used to discard the file-name
        # line built just above, so it never reached the report.
        headerInfo += 'Diversity is quotient of #uniq words and #words for ' + which + '\n'
        headerInfo += 'Column 1: This quotient times 100\n'
        headerInfo += 'Column 2: Percent of papers with this quotient\n'
        headerInfo += 'Column 3: Raw number of papers with this quotient\n'
        headerInfo += 'Column 4: The histogram\n'
        label = 'DIVERSITY FOR TYPE %d\n' % (theType)
        label += headerInfo

        if len(countDict) > 0:
            # histoTheData returns a pair; only the first element is written.
            histo, shortVersion = Histogram.histoTheData(label, countDict, 1, 1, outFile)
            outFile.write('%s\n' % (histo))
        else:
            outFile.write('NO PAPERS FOR %s\n' % (label))

    writeSection(whichLeft, allDict)
    writeSection(whichRight, nonstopDict)

    print(allRoundedList)

    if len(allRoundedList) > 0:
        fig = plt.figure()
        ax = fig.add_subplot(111)

        numBins = 50
        multiset = [allRoundedList, nonstopRoundedList]
        ax.hist(multiset, numBins, color=['green', 'red'], label=['All', 'Nonstop'], alpha=0.8)
        ax.legend(prop={'size': 10})
        ax.set_title('Allword versus Nonstopword diversity')
        plt.show()