Example no. 1
    def __init__(self, variables, resource2filename, sentences, debug=False ):

        self.variables = variables
        self.resource2filename = resource2filename
        self.debug = debug

        self.sentences = sentences
        # student level (None if unknown)
        self.level = getCorpusInfo.getCorpusInfo(resource2filename["filename"])

        self.nWords = None
        self.paragraphStarts = [0] # the start position 0 is always there

        self.lemmacats = None

        self.vanalysis = None
        self.vgroups = None

        self.ddagSentences = None

        # prepare any properties required by the particular variables:
        self.variabletypes = set([x[0] for x in variables])
        self.__prepareRequiredElements()

        # variable name to function
        #self.requiredFuncs = []
        #for variable in variables:
        #    self.requiredFuncs.append( ) # todo

        self.variablevalues = [None]*len(variables) # one slot per variable, indexed by its position in the list
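
A minimal usage sketch for the constructor above. The enclosing class name is not shown in this excerpt, so DocumentVariables below is a hypothetical stand-in, and the layout of the variables entries is an assumption based on the x[0] lookup:

# Hypothetical usage; class name, variable tuples and paths are illustrative only.
variables = [("lexical", "nWords"), ("syntactic", "meanSentenceLength")]
resource2filename = {"filename": "/path/to/corpus/essay01.txt"}
sentences = []  # parsed sentence objects would normally go here
doc = DocumentVariables(variables, resource2filename, sentences, debug=True)
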
def getDocumentProperties(corpus, filename, debug=False):
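    # Build a Text object from the raw file and its parse log: group parsed sentences
    # into paragraphs, record parse-quality ratios, then attach lexicon-based and
    # verb/clause measures before returning the Text.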

    text = Text([])
    sentenceNum = 1
    currentParagraph = Paragraph([])
    wordSyllableLengths = []
    wordCharacterLengths = []

    baseFileName, extension = os.path.splitext(filename)
    processedLogFile = os.path.basename(baseFileName) + ".log"

    currentSentence = None

    # get the learner level if it's known:
    text.level = getCorpusInfo.getCorpusInfo(baseFileName)

    #print corpus, processedLogFile
    print
    print "file:", filename
    parsinginfos = getLogFileInfo(os.path.join(corpus, processedLogFile))
    if debug: print "Num sentences:", len(parsinginfos)

    with codecs.open(filename, mode='r', encoding='utf8') as infile:
        #lastParagraphSentenceBreak = 0

        lineNumber = 0
        for line in infile:
            if debug:
                print "line:", line
            lineNumber += 1

            #print processedSentenceFile, os.path.isfile(processedSentenceFile)
            if currentSentence is None:
                currentSentence = getNextSentence(corpus, baseFileName, sentenceNum, debug=debug)
            currentSentenceUntested = True
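            # Strip each matched sentence off the front of the line; once the remainder
            # no longer matches the next expected sentence, close the current paragraph
            # (if it already has sentences) and start a new one.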
            while currentSentenceUntested and currentSentence is not None:

                if debug:
                    print "sentence:", sentenceNum
                    print currentSentence.tokens
                    print "regex", currentSentence.matchregex

                if re.match(currentSentence.matchregex, line, flags=re.UNICODE):
                    # remove it from the line:
                    line = re.sub(ur'^' + currentSentence.matchregex, u'', line, flags=re.UNICODE)
                    if debug: print "newline:", line
                    sentenceNum += 1
                    # stock the current info and replace it with the next info
                    currentParagraph.sentences.append(currentSentence)
                    currentSentence = getNextSentence(corpus, baseFileName, sentenceNum)
                else:
                    currentSentenceUntested = False
                    if len(currentParagraph.sentences) > 0:
                    #if sentenceNum -1 > lastParagraphSentenceBreak :
                        text.paragraphs.append(currentParagraph)
                        currentParagraph = Paragraph([])
                        #paragraphLengths.append(sentenceNum-1-lastParagraphSentenceBreak)

                        #lastParagraphSentenceBreak = sentenceNum -1
                        #print "new paragraph:", paragraphLengths, lastParagraphSentenceBreak

                print "current para", len(currentParagraph.sentences), len(text.paragraphs)



    # wrap up the last paragraph (sentenceNum is one higher than the last sentence actually
    # observed, and here, unlike above, we want the break to fall after that last sentence)
    #if lastParagraphSentenceBreak < sentenceNum-1:
    if len(currentParagraph.sentences) > 0:
        text.paragraphs.append(currentParagraph)
        #paragraphLengths.append(sentenceNum-1-lastParagraphSentenceBreak)
        #print "final paragraph:", paragraphLengths
    paragraphLengths = [len(x.sentences) for x in text.paragraphs]
    print "all paragraphs:        ", paragraphLengths
    print "sum of para lengths:   ", sum(paragraphLengths)
    print "last real sentence:    ", sentenceNum - 1
    print "expected no. sentences:", len(parsinginfos)
    if len(parsinginfos) != sum(paragraphLengths):
        print "PROBLEM!!!!!"
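    # fraction of sentences whose status in the parse log was "ok", "robust" or "corrected"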
    text.parsedok = 1.0*parsinginfos.count("ok")/len(parsinginfos)
    text.parsedrob = 1.0*parsinginfos.count("robust")/len(parsinginfos)
    text.parsedcorr = 1.0*parsinginfos.count("corrected")/len(parsinginfos)


    #else:
    #    print "Sentence numbers match :)"

    lexiqueDict = {}
    loadLexiqueToDict(u"/home/nparslow/Documents/AutoCorrige/tools/Lexique380/Bases+Scripts/Lexique380.txt",
                      lexiqueDict)
    #print type(lexiqueDict)
    text.addLexiqueInfo(lexiqueDict)

    text.setVocabularyMeasures()
    text.setVerbClauseInfo() # must run after all sentences have been added so their info can be passed up

    return text
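
A minimal usage sketch for getDocumentProperties, assuming a corpus directory that holds the source text, its ".log" parse log, and the per-sentence parse files read by getNextSentence (all paths below are illustrative, not from the original project):

# Hypothetical call; corpus layout and paths are assumptions.
corpus = "/path/to/corpus"
filename = os.path.join(corpus, "essay01.txt")
text = getDocumentProperties(corpus, filename, debug=False)
print "paragraph lengths:", [len(p.sentences) for p in text.paragraphs]
print "fraction parsed ok:", text.parsedok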