Пример #1
0
    def rate_by_params(self, passage):
        """Score *passage* with the hand-tuned linear weights in
        self.model_params, then apply the heuristic filter adjustments.

        Returns a one-element list containing the final rate score.
        Side effects: sets passage.lf/cf/sf, rateScore, endogScore,
        filter_scores and rated.
        """
        # Linear prediction over the extracted feather (feature) vector.
        extractor = FeatherExtractor(None)
        if not passage.preprocessed:
            essayprepare.processPassage(passage)
        passage.lf = extractor.extractLangFeather(passage)
        passage.cf = extractor.extractContentFeather(passage)
        passage.sf = extractor.extractStructureFeather(passage)

        # (removed unused `exog = []` local)
        x = self.__getFeatherList(passage)
        score = dot(x, self.model_params)

        passage.rateScore = score
        passage.endogScore = score

        # Adjust the score with each heuristic filter in turn.
        passage.filter_scores = []
        filters = [self.tokenCountFilter, self.sentenceLengthAverageFilter,
                   self.wordLengthAverageFilter, self.aclWordCountFilter,
                   self.noneStopWordLengthAverageFilter, self.nounRatioFilter]

        for filter_func in filters:  # renamed: don't shadow builtin `filter`
            filter_score = filter_func(passage)
            passage.rateScore += filter_score
            passage.filter_scores.append(filter_score)

        passage.rated = True
        return [passage.rateScore]
Пример #2
0
    def rate_by_params(self, passage):
        """Score *passage* with the hand-tuned linear weights, then apply
        the heuristic filter adjustments (including total_score_filter).

        Returns a one-element list containing the final rate score.
        Side effects: sets passage.lf/cf/sf, rateScore, endogScore,
        filter_scores and rated.
        """
        # Linear prediction over the extracted feather (feature) vector.
        extractor = FeatherExtractor(None)
        if not passage.preprocessed:
            essayprepare.processPassage(passage)
        passage.lf = extractor.extractLangFeather(passage)
        passage.cf = extractor.extractContentFeather(passage)
        passage.sf = extractor.extractStructureFeather(passage)

        # (removed unused `exog = []` local)
        x = self.__getFeatherList(passage)
        score = dot(x, self.model_params)

        passage.rateScore = score
        passage.endogScore = score

        # Adjust the score with each heuristic filter in turn.
        passage.filter_scores = []
        filters = [
            self.tokenCountFilter, self.sentenceLengthAverageFilter,
            self.wordLengthAverageFilter, self.aclWordCountFilter,
            self.noneStopWordLengthAverageFilter, self.nounRatioFilter,
            self.total_score_filter
        ]

        for filter_func in filters:  # renamed: don't shadow builtin `filter`
            filter_score = filter_func(passage)
            passage.rateScore += filter_score
            passage.filter_scores.append(filter_score)

        passage.rated = True
        return [passage.rateScore]
Пример #3
0
def wordRepetitiveDemo():
    print "wordRepetitiveDemo start..."
    pkfile = open('ustcpassages_503.pkl', 'r')
    passages = pickle.load(pkfile)
    pkfile.close()

    f = FeatherExtractor()
    for p in passages[:]:
        lf = f.extractLangFeather(p)
        p.lf = lf

    f = open('wordrep.txt', 'w')
    for p in passages[:]:
        if p.lf:
            for l in p.lf.lemmaUseInfo:
                print p.id, p.score, l[0], l[1], l[2], l[3], l[4], l[5]
                s = ' '.join([
                    str(p.id),
                    str(p.score),
                    str(p.lf.overlyUseWordCount), l[0],
                    str(l[1]),
                    str(l[2]),
                    str(l[3]),
                    str(l[4]),
                    str(l[5])
                ])
                f.write(s)
                f.write('\n')
    f.close()

    print "wordRepetitiveDemo over!!!"
Пример #4
0
    def train(self, passages):
        # pre-process passage
        i = 1
        for p in passages:
            print "======================="
            print "Passage", i, p.id
            if not p.preprocessed: essayprepare.processPassage(p)
            i += 1

        self.extractor = FeatherExtractor(None)
        for p in passages:
            p.lf = self.extractor.extractLangFeather(p)
            p.cf = self.extractor.extractContentFeather(p)
            p.sf = self.extractor.extractStructureFeather(p)

        # save feathers
        f = open('fs_zhang_train.txt', 'w')
        for p in passages:
            x = self.__getFeatherList(p)
            f.write(p.id + ' ')
            f.write(str(p.score))
            for v in x:
                f.write(' ' + str(v))
            f.write('\n')
        f.close()

        # generate feather vector
        endog = []
        exog = []
        for p in passages:
            score = int(p.score)
            endog.append(score)
            x = self.__getFeatherList(p)
            exog.append(x)

        # train model
        endog = np.array(endog)
        exog = np.array(exog)

        self.gls_model = sm.GLS(endog, exog)
        results = self.gls_model.fit()
        #print results.summary()
        print results.params
Пример #5
0
def demo_one_sentence():
    # 文章
    passage = EssayPassage()
    passage.passage = 'I am a students.'
    passage.title = 'title'
    passage.score = 5
    passage.id = '1'
    passage.reviewerId = 3
    passage.content = 'I am a students.'

    # 处理文章
    essayprepare.processPassage(passage)

    extractor = FeatherExtractor()
    lf = extractor.extractLangFeather(passage)
    passage.lf = lf
    cf = extractor.extractContentFeather(passage)
    sf = extractor.extractStructureFeather(passage)

    print 'OK'
Пример #6
0
def demo_one_sentence():
    # 文章
    passage = EssayPassage()
    passage.passage = 'I am a students.'
    passage.title = 'title'
    passage.score = 5
    passage.id = '1'
    passage.reviewerId = 3
    passage.content = 'I am a students.'
    
    # 处理文章
    essayprepare.processPassage(passage)
    
    extractor = FeatherExtractor()
    lf = extractor.extractLangFeather(passage)
    passage.lf = lf
    cf = extractor.extractContentFeather(passage)
    sf = extractor.extractStructureFeather(passage)   
    
    print 'OK'
Пример #7
0
def generatePassageFeathers(passages, outFilename):
    f = open(outFilename, 'w')

    e = FeatherExtractor()

    i = 1

    for p in passages:
        print "Passage ", i
        # 处理文章
        essayprepare.processPassage(p)
        # 提取语言特征
        languageFeather = e.extractLangFeather(p)
        p.lf = languageFeather
        # 提取结构特征
        structureFeather = e.extractStructureFeather(p)
        p.sf = structureFeather

        f.write(p.id + ' ')
        f.write(str(p.score))
        f.write(' ' + str(languageFeather))
        f.write('\n')
        i += 1
    f.close()
Пример #8
0
def generatePassageFeathers(passages, outFilename):
    f = open(outFilename, 'w')
    
    e = FeatherExtractor()    
    
    i = 1
    
    for p in passages:
        print "Passage ", i
        # 处理文章
        essayprepare.processPassage(p)
        # 提取语言特征    
        languageFeather = e.extractLangFeather(p)  
        p.lf = languageFeather
        # 提取结构特征  
        structureFeather = e.extractStructureFeather(p)
        p.sf = structureFeather
        
        f.write(p.id + ' ')
        f.write(str(p.score))
        f.write(' ' + str(languageFeather))
        f.write('\n')
        i += 1
    f.close()
Пример #9
0
    def train(self, passages):
        # pre-process passage
        i = 1
        for p in passages:
            print "======================="
            print "Passage", i, p.id
            if not p.preprocessed: essayprepare.processPassage(p)
            i += 1

        self.extractor = FeatherExtractor(None)
        for p in passages:
            p.lf = self.extractor.extractLangFeather(p)
            p.cf = self.extractor.extractContentFeather(p)
            p.sf = self.extractor.extractStructureFeather(p)   
        
        # save feathers
        f = open('fs_zhang_train.txt', 'w')
        for p in passages:   
            x = self.__getFeatherList(p)       
            f.write(p.id + ' ')
            f.write(str(p.score))
            for v in x:
                f.write(' ' + str(v))
            f.write('\n')
        f.close()
        
        # generate feather vector
        endog = []
        exog = []
        for p in passages:
            score = int(p.score)
            endog.append(score)
            x = self.__getFeatherList(p)
            exog.append(x)     
        
        # train model
        endog = np.array(endog)
        exog = np.array(exog)
        
        self.gls_model = sm.GLS(endog, exog)
        results = self.gls_model.fit()
        #print results.summary()
        print results.params
Пример #10
0
def wordRepetitiveDemo():
    print "wordRepetitiveDemo start..."
    pkfile = open('ustcpassages_503.pkl', 'r')
    passages = pickle.load(pkfile)
    pkfile.close()
    
    f = FeatherExtractor()
    for p in passages[:]:
        lf = f.extractLangFeather(p)
        p.lf = lf
        
    f = open('wordrep.txt', 'w')
    for p in passages[:]:
        if p.lf:
            for l in p.lf.lemmaUseInfo:
                print p.id, p.score, l[0], l[1], l[2], l[3], l[4], l[5]
                s = ' '.join([str(p.id), str(p.score), str(p.lf.overlyUseWordCount), l[0], str(l[1]), str(l[2]), str(l[3]), str(l[4]), str(l[5])])
                f.write(s)
                f.write('\n')
    f.close()
        
    print "wordRepetitiveDemo over!!!"
Пример #11
0
            print "SPELLERROR", errorTokens
            print tokens
            print tags
            print lemmas
            print stems
            print levels
            print sentNos
            print paraNos
            print nos
            print sent.tokenCount
            print sent.wordCount
            print sent.realWordCount

    print "三元词组", passage.trigrams

    e = FeatherExtractor()

    # 提取语言特征
    languageFeather = e.extractLangFeather(passage)

    print u"词次总数", languageFeather.tokenCount
    print u"单词总数", languageFeather.wordCount
    print u"词形总数", languageFeather.wordTypeCount
    print u"词元总数", languageFeather.wordLemmaCount

    print u"介词个数", languageFeather.prepositionCount
    print u"介词比例", languageFeather.prepositionRatio
    print u"介词使用", languageFeather.prepositionUse

    print u"定冠词个数", languageFeather.definiteArticleCount
    print u"定冠词比例", languageFeather.definiteArticleRatio
Пример #12
0
 def processEssay(self):
     """Look up the essay whose id was typed into the line edit, run the
     preprocessing pipeline on it, and dump its structure and language
     feathers for manual inspection.

     Structural output goes to self.browser (presumably a Qt text
     browser — confirm against the enclosing class); the feather values
     are printed to stdout.
     """
     self.browser.clear()
     id = unicode(self.lineedit.text())
     essay = self.essayDict.get(id)
     if not essay:
         self.browser.append("<font color=red>%s is not found!</font>" % id)
         return

     self.browser.append(essay.content)

     # Build the passage object from the stored essay.
     passage = EssayPassage()
     passage.passage = essay.cleanContent()
     passage.title = essay.title
     passage.score = essay.score
     passage.id = essay.id

     # Pre-process the passage.
     essayprepare.processPassage(passage)

     # Dump the processed passage to see what it looks like.
     self.browser.append("PASSAGE=========================================")
     self.browser.append(passage.id)
     #self.browser.append(passage.title)
     self.browser.append(passage.score)
     self.browser.append(passage.passage)
     self.browser.append(str(len(passage.paragraphs)))
     self.browser.append("PARAGRAPHS---------------------------------------")
     for para in passage.paragraphs:
         self.browser.append(str(para.paragraphNo))
         self.browser.append(para.paragraph)
         for sent in para.sentences:
             self.browser.append(str(sent.sentenceNo))
             self.browser.append(str(sent.paragraphSentenceNo))
             self.browser.append(sent.sentence)
             # Per-token projections of the sentence.
             tokens = [token.token for token in sent.tokens]
             tags = [token.pos for token in sent.tokens]
             lemmas = [token.lemma for token in sent.tokens]
             stems = [token.stem for token in sent.tokens]
             levels = [token.level for token in sent.tokens]
             nos = [token.tokenNo for token in sent.tokens]
             sentNos = [token.sentenceTokenNo for token in sent.tokens]
             paraNos = [token.paragraphTokenNo for token in sent.tokens]
             errorTokens = [token.token for token in sent.tokens if token.isSpellError]
             if not sent.canParsed:
                 self.browser.append("<font color=red>SENTENCE ERROR</font>")
             self.browser.append("<font color=red>SPELLERROR %s</font>" % str(errorTokens))
             self.browser.append(str(tokens))
             self.browser.append(str(tags))
             self.browser.append(str(lemmas))
             self.browser.append(str(stems))
             self.browser.append(str(levels))
             self.browser.append(str(sentNos))
             self.browser.append(str(paraNos))
             self.browser.append(str(nos))
             self.browser.append(str(sent.tokenCount))
             self.browser.append(str(sent.wordCount))
             self.browser.append(str(sent.realWordCount))

     # "三元词组" = trigrams.
     self.browser.append(u"三元词组" + ' ' + str(passage.trigrams))


     e = FeatherExtractor()

     # Extract the language feathers.
     languageFeather = e.extractLangFeather(passage)

     # Totals: tokens / words / word types / word lemmas.
     print u"词次总数", languageFeather.tokenCount
     print u"单词总数", languageFeather.wordCount
     print u"词形总数", languageFeather.wordTypeCount
     print u"词元总数", languageFeather.wordLemmaCount

     # Preposition count / ratio / use.
     print u"介词个数", languageFeather.prepositionCount
     print u"介词比例", languageFeather.prepositionRatio
     print u"介词使用", languageFeather.prepositionUse

     # Definite-article count / ratio / use.
     print u"定冠词个数", languageFeather.definiteArticleCount
     print u"定冠词比例", languageFeather.definiteArticleRatio
     print u"定冠词使用", languageFeather.definiteArticleUse

     # Structure-feather extraction (currently disabled).
     #structureFeather = e.extractStructureFeather(passage)

     #generateUSTCFeathers('USTC2011Jan.txt', 'USTCFeathers_503.txt')

     print "...OVER"
Пример #13
0
    def processEssay(self):
        """Fetch the essay with the id entered in the line edit,
        pre-process it, and dump its structure and language feathers.

        Structural dump goes to self.browser (presumably a Qt text
        browser — confirm against the enclosing class); the feather
        values are printed to stdout.
        """
        self.browser.clear()
        id = unicode(self.lineedit.text())
        essay = self.essayDict.get(id)
        if not essay:
            self.browser.append("<font color=red>%s is not found!</font>" % id)
            return

        self.browser.append(essay.content)

        # Build the passage object from the stored essay.
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id

        # Pre-process the passage.
        essayprepare.processPassage(passage)

        # Dump the processed passage to see what it looks like.
        self.browser.append("PASSAGE=========================================")
        self.browser.append(passage.id)
        #self.browser.append(passage.title)
        self.browser.append(passage.score)
        self.browser.append(passage.passage)
        self.browser.append(str(len(passage.paragraphs)))
        self.browser.append(
            "PARAGRAPHS---------------------------------------")
        for para in passage.paragraphs:
            self.browser.append(str(para.paragraphNo))
            self.browser.append(para.paragraph)
            for sent in para.sentences:
                self.browser.append(str(sent.sentenceNo))
                self.browser.append(str(sent.paragraphSentenceNo))
                self.browser.append(sent.sentence)
                # Per-token projections of the sentence.
                tokens = [token.token for token in sent.tokens]
                tags = [token.pos for token in sent.tokens]
                lemmas = [token.lemma for token in sent.tokens]
                stems = [token.stem for token in sent.tokens]
                levels = [token.level for token in sent.tokens]
                nos = [token.tokenNo for token in sent.tokens]
                sentNos = [token.sentenceTokenNo for token in sent.tokens]
                paraNos = [token.paragraphTokenNo for token in sent.tokens]
                errorTokens = [
                    token.token for token in sent.tokens if token.isSpellError
                ]
                if not sent.canParsed:
                    self.browser.append(
                        "<font color=red>SENTENCE ERROR</font>")
                self.browser.append("<font color=red>SPELLERROR %s</font>" %
                                    str(errorTokens))
                self.browser.append(str(tokens))
                self.browser.append(str(tags))
                self.browser.append(str(lemmas))
                self.browser.append(str(stems))
                self.browser.append(str(levels))
                self.browser.append(str(sentNos))
                self.browser.append(str(paraNos))
                self.browser.append(str(nos))
                self.browser.append(str(sent.tokenCount))
                self.browser.append(str(sent.wordCount))
                self.browser.append(str(sent.realWordCount))

        # "三元词组" = trigrams.
        self.browser.append(u"三元词组" + ' ' + str(passage.trigrams))

        e = FeatherExtractor()

        # Extract the language feathers.
        languageFeather = e.extractLangFeather(passage)

        # Totals: tokens / words / word types / word lemmas.
        print u"词次总数", languageFeather.tokenCount
        print u"单词总数", languageFeather.wordCount
        print u"词形总数", languageFeather.wordTypeCount
        print u"词元总数", languageFeather.wordLemmaCount

        # Preposition count / ratio / use.
        print u"介词个数", languageFeather.prepositionCount
        print u"介词比例", languageFeather.prepositionRatio
        print u"介词使用", languageFeather.prepositionUse

        # Definite-article count / ratio / use.
        print u"定冠词个数", languageFeather.definiteArticleCount
        print u"定冠词比例", languageFeather.definiteArticleRatio
        print u"定冠词使用", languageFeather.definiteArticleUse

        # Structure-feather extraction (currently disabled).
        #structureFeather = e.extractStructureFeather(passage)

        #generateUSTCFeathers('USTC2011Jan.txt', 'USTCFeathers_503.txt')

        print "...OVER"
Пример #14
0
class GeneralEssayRater():
    """A general-purpose essay rater.

    Scores an essay either with a fitted GLS model (`train` then `rate`)
    or with the hand-tuned linear weights in `model_params`
    (`rate_by_params`), then adjusts the raw score with a set of
    heuristic filters based on length, vocabulary and POS ratios.
    """
    def __init__(self):
        # Hand-tuned linear weights; must stay aligned one-for-one with
        # the feather order produced by __getFeatherList.
        self.model_params = [
            30, -0.9, -0.55, -80, 50, -0.38, 8.0, -145, 16.8, 0.05, 0.4, -0.04,
            35, -0.4, 0.35, -0.5
        ]
        pass

    def __getFeatherList(self, passage):
        """Get feathers (features) from a preprocessed passage.

        Returns the feature vector consumed by both the GLS model and
        `rate_by_params`; the leading 1 is the constant/intercept term.
        Commented-out lines record feathers that were tried and dropped.
        """

        fs = []
        fs.append(1)  # const
        fs.append(passage.lf.sentenceErrorCount)
        fs.append(passage.lf.spellErrorCount)
        #fs.append(passage.lf.ltErrorCount)
        fs.append(passage.lf.prepositionUse)
        fs.append(passage.lf.definiteArticleUse)
        #fs.append(passage.lf.wordCombRecurrentCount)
        #fs.append(passage.lf.tokenCount)
        #fs.append(passage.lf.wordTypeCount)
        fs.append(passage.lf.wordStemCount)
        fs.append(passage.lf.wordLengthAverage)
        #fs.append(passage.lf.wordLengthSD)
        fs.append(passage.lf.wordTypeRatio)
        fs.append(passage.lf.indexOfGuiraud)
        #for x in passage.lf.wordCountInLevels:
        #    fs.append(x)
        fs.append(passage.lf.gerundCount)
        #fs.append(passage.lf.gerundRatio)
        #fs.append(passage.lf.sentenceLengthAverage)
        #fs.append(passage.lf.sentenceLengthSD)
        #fs.append(passage.lf.automatedReadabilityIndex)
        fs.append(passage.lf.sentenceComplexity)
        #fs.append(passage.lf.sentenceComplexityScale)

        #fs.append(passage.cf.lsaScore)
        #fs.append(passage.cf.proceduralVocabularyCount)
        #fs.append(passage.cf.keywordCover)

        fs.append(passage.sf.connectiveCount)
        #fs.append(passage.sf.connectiveRatio)
        #fs.append(passage.sf.specialDemonstrativePronounCount)
        #fs.append(passage.sf.specialDemonstrativePronounUse)
        #fs.append(passage.sf.restPronounCount)
        #fs.append(passage.sf.restPronounUse)
        #fs.append(passage.lf.highLowLevelRatio)
        #        fs.append((passage.lf.wordCountInLevels[3] + passage.lf.wordCountInLevels[4]) * 1.0
        #                  / (passage.lf.wordCountInLevels[1] + passage.lf.wordCountInLevels[2]))
        fs.append(passage.lf.overlyUseWordCount)
        fs.append(passage.lf.aclWordCount)
        #fs.append(passage.lf.aclWordRatio)
        fs.append(passage.lf.nominalizationCountUnique)
        #fs.append(passage.lf.pn_range_count[2])
        # NOTE(review): integer division in Python 2 if these counts are
        # ints — confirm pn_range_count element types.
        fs.append(
            (passage.lf.pn_range_count[2] + passage.lf.pn_range_count[3]) /
            passage.lf.pn_range_count[1])
        fs.append(passage.lf.top_sentence_length)
        return fs

    def train(self, passages):
        """Fit the GLS model of essay score on feather vectors.

        Pre-processes each passage, extracts lf/cf/sf feathers, dumps
        the vectors to 'fs_zhang_train.txt', then fits statsmodels GLS
        and prints the fitted parameters.
        """
        # Pre-process passages (idempotent: skips processed ones).
        i = 1
        for p in passages:
            print "======================="
            print "Passage", i, p.id
            if not p.preprocessed: essayprepare.processPassage(p)
            i += 1

        self.extractor = FeatherExtractor(None)
        for p in passages:
            p.lf = self.extractor.extractLangFeather(p)
            p.cf = self.extractor.extractContentFeather(p)
            p.sf = self.extractor.extractStructureFeather(p)

        # Save feathers to a plain-text file for inspection.
        f = open('fs_zhang_train.txt', 'w')
        for p in passages:
            x = self.__getFeatherList(p)
            f.write(p.id + ' ')
            f.write(str(p.score))
            for v in x:
                f.write(' ' + str(v))
            f.write('\n')
        f.close()

        # Generate the feather vectors for the regression.
        endog = []
        exog = []
        for p in passages:
            score = int(p.score)
            endog.append(score)
            x = self.__getFeatherList(p)
            exog.append(x)

        # Train the model.
        endog = np.array(endog)
        exog = np.array(exog)

        self.gls_model = sm.GLS(endog, exog)
        results = self.gls_model.fit()
        #print results.summary()
        print results.params

    def tokenCountFilter(self, passage):
        """Penalty for short essays (returned value is <= 0)."""
        # Adjust based on the essay's token count.
        filter = 0
        if (passage.lf.tokenCount < 100):
            filter = passage.rateScore * 0.2
        elif passage.lf.tokenCount < 120:
            filter = passage.rateScore * 0.1
        filter = -filter
        return filter

    def sentenceLengthAverageFilter(self, passage):
        """Penalty for unusually short (<10) or long (>23) average
        sentence length, capped at 6 and 9 points respectively."""
        # Adjust based on average sentence length.
        filter = 0
        slv = passage.lf.sentenceLengthAverage
        if (slv < 10):
            filter = (10 - slv) * 2
            if filter > 6: filter = 6
        elif slv > 23:
            filter = (slv - 23) * 3
            if filter > 9: filter = 9
        filter = -filter
        return filter

    def wordLengthAverageFilter(self, passage):
        """Penalty when average word length falls below 4."""
        # Adjust based on average word length.
        filter = 0
        wlv = passage.lf.wordLengthAverage
        if wlv < 4:
            filter = (4 - wlv) * 10
        filter = -filter
        return filter

    def aclWordCountFilter(self, passage):
        """Bonus (10% of current score) for rich academic vocabulary."""
        # Adjust based on the academic word count.
        filter = 0
        acl = passage.lf.aclWordCount
        if acl > 9:
            filter = passage.rateScore * 0.1
        return filter

    def noneStopWordLengthAverageFilter(self, passage):
        """Penalty when the average non-stopword length is below 5.5."""
        # Adjust based on the average length of content words.
        filter = 0
        rwlv = passage.lf.noneStopWordLengthAverage
        if rwlv < 5.5:
            filter = (5.5 - rwlv) * 10
        filter = -filter
        return filter

    def nounRatioFilter(self, passage):
        """Penalty when the noun ratio leaves the [0.2, 0.35] band."""
        # Adjust based on the part-of-speech ratio.
        filter = 0
        nr = passage.lf.nounRatio
        if nr < 0.2:
            filter = (0.2 - nr) * 100
        elif nr > 0.35:
            filter = (nr - 0.35) * 100
        filter = -filter
        return filter

    def verbRatioFilter(self, passage):
        """Penalty when the verb ratio leaves the [0.1, 0.2] band."""
        filter = 0
        vr = passage.lf.verbRatio
        if vr < 0.1:
            filter = (0.1 - vr) * 200
        elif vr > 0.2:
            filter = (vr - 0.2) * 200
        filter = -filter
        return filter

    def adjRatioFilter(self, passage):
        """Penalty when the adjective ratio falls below 0.045."""
        filter = 0
        ar = passage.lf.adjRatio
        if ar < 0.045:
            filter = (0.045 - ar) * 500
        filter = -filter
        return filter

    def posRatioFilter(self, passage):
        """Combined POS-ratio adjustment.

        Counts how many of the noun/verb/adjective ratios fall outside
        their expected bands; small bonus when all are in band and close
        to center, growing penalty as more fall out. Also records
        passage.offsetRatio as a side effect.
        """
        filter = 0
        badRatioCount = 0
        offsetRatio = 0
        nr = passage.lf.nounRatio
        vr = passage.lf.verbRatio
        ar = passage.lf.adjRatio
        if (nr < 0.2) or (nr > 0.3):
            badRatioCount += 1
        else:
            offsetRatio += abs(nr - 0.25) / 0.1
        if (vr < 0.1) or (vr > 0.2):
            badRatioCount += 1
        else:
            offsetRatio += abs(vr - 0.15) / 0.1
        if (ar < 0.06) or (ar > 0.15):
            badRatioCount += 1
        else:
            offsetRatio += abs(ar - 0.105) / 0.15
        if badRatioCount == 0:
            if offsetRatio < 0.1:
                filter = passage.rateScore * 0.05
        elif badRatioCount == 1:
            if offsetRatio > 0.6:
                filter = -passage.rateScore * 0.05
        elif badRatioCount > 1:
            filter = -passage.rateScore * 0.02 * badRatioCount * badRatioCount
        passage.offsetRatio = offsetRatio
        return filter

    def rate(self, passage):
        """Score *passage* with the fitted GLS model, then apply the
        heuristic filters; requires `train` to have been called first.

        Returns a one-element list with the final rate score.
        """
        # Linear prediction from the trained GLS model.
        if not passage.preprocessed: essayprepare.processPassage(passage)
        passage.lf = self.extractor.extractLangFeather(passage)
        passage.cf = self.extractor.extractContentFeather(passage)
        passage.sf = self.extractor.extractStructureFeather(passage)

        exog = []
        x = self.__getFeatherList(passage)
        exog.append(x)
        exog = np.array(exog)
        endog = self.gls_model.predict(exog)
        passage.rateScore = endog[0]
        passage.endogScore = endog[0]

        passage.filters = []

        # Adjust the score with each filter; the last three are computed
        # (and recorded in passage.filters) but not added to the score.
        filter = self.tokenCountFilter(passage)
        passage.rateScore += filter
        passage.filters.append(filter)

        filter = self.sentenceLengthAverageFilter(passage)
        passage.rateScore += filter
        passage.filters.append(filter)

        filter = self.wordLengthAverageFilter(passage)
        passage.rateScore += filter
        passage.filters.append(filter)

        filter = self.aclWordCountFilter(passage)
        passage.rateScore += filter
        passage.filters.append(filter)

        filter = self.noneStopWordLengthAverageFilter(passage)
        passage.rateScore += filter
        passage.filters.append(filter)

        filter = self.nounRatioFilter(passage)
        passage.rateScore += filter
        passage.filters.append(filter)

        filter = self.verbRatioFilter(passage)
        #passage.rateScore += filter
        passage.filters.append(filter)

        filter = self.adjRatioFilter(passage)
        #passage.rateScore += filter
        passage.filters.append(filter)

        filter = self.posRatioFilter(passage)
        #passage.rateScore += filter
        passage.filters.append(filter)

        passage.rated = True
        endog[0] = passage.rateScore
        return [passage.rateScore]

    def rate_by_params(self, passage):
        """Score *passage* with the hand-tuned `model_params` weights
        (no trained model needed), then apply the heuristic filters.

        Returns a one-element list with the final rate score.
        """
        # Linear prediction: dot(feathers, model_params).
        extractor = FeatherExtractor(None)
        if not passage.preprocessed: essayprepare.processPassage(passage)
        passage.lf = extractor.extractLangFeather(passage)
        passage.cf = extractor.extractContentFeather(passage)
        passage.sf = extractor.extractStructureFeather(passage)

        exog = []
        x = self.__getFeatherList(passage)

        score = dot(x, self.model_params)

        passage.rateScore = score
        passage.endogScore = score

        # Adjust the score with each heuristic filter.
        passage.filter_scores = []
        filters = [
            self.tokenCountFilter, self.sentenceLengthAverageFilter,
            self.wordLengthAverageFilter, self.aclWordCountFilter,
            self.noneStopWordLengthAverageFilter, self.nounRatioFilter
        ]

        for filter in filters:
            filter_score = filter(passage)
            passage.rateScore += filter_score
            passage.filter_scores.append(filter_score)

        passage.rated = True
        return [passage.rateScore]
Пример #15
0
class CollegeEssayRater():
    
    def __init__(self):
        self.models = {} # 存放所有的作文模型
        self.gls_model = None # 线性回归模型
        self.extractor = None # 特征提取器
        self.svm_model = None # SVN分类器
        pass
    
    def __trainModel(self, passages, model):
        pass
    
    def __loadModel(self, modelFilename):
        pass
       
    def __getFeatherList(self, passage):
        fs = []
        fs.append(1)
        fs.append(passage.lf.sentenceErrorCount)
        fs.append(passage.lf.spellErrorCount)
        #fs.append(passage.lf.ltErrorCount)
        fs.append(passage.lf.prepositionUse)
        fs.append(passage.lf.definiteArticleUse)
        fs.append(passage.lf.wordCombRecurrentCount)  
        #fs.append(passage.lf.tokenCount)
        #fs.append(passage.lf.wordTypeCount)
        fs.append(passage.lf.wordStemCount)
        fs.append(passage.lf.wordLengthAverage)
        #fs.append(passage.lf.wordLengthSD)
        fs.append(passage.lf.wordTypeRatio)
        fs.append(passage.lf.indexOfGuiraud)
        #for x in passage.lf.wordCountInLevels:
        #    fs.append(x)
        fs.append(passage.lf.gerundCount)
        #fs.append(passage.lf.gerundRatio)
        #fs.append(passage.lf.sentenceLengthAverage)
        #fs.append(passage.lf.sentenceLengthSD)
        #fs.append(passage.lf.automatedReadabilityIndex)  
        fs.append(passage.lf.sentenceComplexity)  
        #fs.append(passage.lf.sentenceComplexityScale) 
         
        fs.append(passage.cf.lsaScore)   
        #fs.append(passage.cf.proceduralVocabularyCount) 
        fs.append(passage.cf.keywordCover)
        
        fs.append(passage.sf.connectiveCount)   
        #fs.append(passage.sf.connectiveRatio)   
        #fs.append(passage.sf.specialDemonstrativePronounCount)
        #fs.append(passage.sf.specialDemonstrativePronounUse)
        #fs.append(passage.sf.restPronounCount)
        #fs.append(passage.sf.restPronounUse)        
        fs.append(passage.lf.highLowLevelRatio)
#        fs.append((passage.lf.wordCountInLevels[3] + passage.lf.wordCountInLevels[4]) * 1.0 
#                  / (passage.lf.wordCountInLevels[1] + passage.lf.wordCountInLevels[2]))
        fs.append(passage.lf.overlyUseWordCount)
        fs.append(passage.lf.aclWordCount)
        #fs.append(passage.lf.aclWordRatio)
        fs.append(passage.lf.nominalizationCountUnique)
    	return fs
    
    def train(self, passages):
        # 预处理文章
        i = 1
        for p in passages:
            #print "Passage ", i
            # 处理文章
            if not p.preprocessed: essayprepare.processPassage(p)
            i += 1
        
        # 训练模型
        passages.sort(cmp=lambda x,y: cmp(x.score, y.score), reverse=True)
        
        model = EssayModel()
        model.train(passages)
        self.models['1'] = model
        #print model.triGramDicts
        
        # 提取特征
        self.extractor = FeatherExtractor(model)
        for p in passages:
            p.lf = self.extractor.extractLangFeather(p)
            p.cf = self.extractor.extractContentFeather(p)
            p.sf = self.extractor.extractStructureFeather(p)   
        
        # 输出特征值
        f = open('fs_train.txt', 'w')
        
        # 生成特征向量
        endog = []
        exog = []
        labels = []
        for p in passages:
            score = int(p.score)
#            if score > 90: score = 90
#            if score < 35: score = 35
            endog.append(score)
            x = self.__getFeatherList(p)
            exog.append(x)

            labels.append(p.label)
            
            f.write(p.id + ' ')
            f.write(str(p.score))
            for v in x:
                f.write(' ' + str(v))
            f.write('\n')
        
        f.close()       
        
        # SVM分类器训练
        #self.svm_model = svmutil.svm_train(labels, exog, '-c 3')
        
        # 线性回归模型训练  
        endog = np.array(endog)
        exog = np.array(exog)
#        print endog
#        print exog
        
#        self.m = np.mean(exog,axis=0)
#        print self.m
#        
#        T, P, e_var = PCA_svd(exog)   
#        print T
#        print P
#        print e_var
#        
#        r, c = P.shape
#        print r, c
#        for i in xrange(11, r):
#            for j in xrange(0, c):
#                P[i, j] = 0
#        print P
#        self.p = P
#        
#        xexog = dot(P, exog.transpose())
#        print xexog
#        print xexog.shape
#        
#        xxexog = xexog.transpose() 
        
        self.gls_model = sm.GLS(endog, exog)
        self.gls_model.fit()
#        print self.gls_model.results.params
    
    def tokenCountFilter(self, passage):
        # 根据文章字数调整
        filter = 0
        if (passage.lf.tokenCount < 100):
            filter = passage.rateScore * 0.2
        elif passage.lf.tokenCount < 120:
            filter = passage.rateScore * 0.1   
        filter = - filter  
        return filter   
    
    def sentenceLengthAverageFilter(self, passage):
        # 根据平均句长调整
        filter = 0
        slv = passage.lf.sentenceLengthAverage
        if (slv < 10):
            filter = (10 - slv) * 2
            if filter > 6: filter = 6
        elif slv > 23:
            filter = (slv - 23) * 3
            if filter > 9: filter = 9
        filter = - filter
        return filter  
    
    def wordLengthAverageFilter(self, passage):
        # 根据平均词长调整 
        filter = 0
        wlv = passage.lf.wordLengthAverage
        if wlv < 4:
            filter = (4 - wlv) * 10
        filter = - filter
        return filter    
    
    def aclWordCountFilter(self, passage):
        # 根据学术词汇数调整
        filter = 0
        acl = passage.lf.aclWordCount
        if acl > 9:
            filter = passage.rateScore * 0.1
        return filter
    
    def noneStopWordLengthAverageFilter(self, passage):          
        # 根据实词平均长度调整
        filter = 0
        rwlv = passage.lf.noneStopWordLengthAverage
        if rwlv < 5.5:
            filter = (5.5 - rwlv) * 10  
        filter = -filter
        return filter
        
    def nounRatioFilter(self, passage):
        # 根据词性比例调整
        filter = 0
        nr = passage.lf.nounRatio
        if nr < 0.2:
            filter = (0.2 - nr) * 100
        elif nr > 0.35:
            filter = (nr - 0.35) * 100
        filter = - filter
        return filter
    
    def verbRatioFilter(self, passage):
        filter = 0
        vr = passage.lf.verbRatio
        if vr < 0.1:
            filter = (0.1 - vr) * 200
        elif vr > 0.2:
            filter = (vr - 0.2) * 200
        filter = - filter
        return filter
    
    def adjRatioFilter(self, passage):
        filter = 0
        ar = passage.lf.adjRatio
        if ar < 0.045:
            filter = (0.045 - ar) * 500
        filter = - filter
        return filter
    
    def posRatioFilter(self, passage):
        filter = 0
        badRatioCount = 0   
        offsetRatio = 0      
        nr = passage.lf.nounRatio
        vr = passage.lf.verbRatio
        ar = passage.lf.adjRatio
        if (nr < 0.2) or (nr > 0.3):
            badRatioCount += 1
        else:
            offsetRatio += abs(nr - 0.25) / 0.1
        if (vr < 0.1) or (vr > 0.2):
            badRatioCount += 1
        else:
            offsetRatio += abs(vr - 0.15) / 0.1
        if (ar < 0.06) or (ar > 0.15):
            badRatioCount += 1
        else:
            offsetRatio += abs(ar - 0.105) / 0.15
        if badRatioCount == 0:
           if offsetRatio < 0.1:
                filter = passage.rateScore * 0.05
        elif badRatioCount == 1:
            if offsetRatio > 0.6:
                filter = - passage.rateScore * 0.05
        elif badRatioCount > 1:
            filter = - passage.rateScore * 0.02 * badRatioCount * badRatioCount
        passage.offsetRatio = offsetRatio
        return filter  
    
    def lsaFilter(self, passage):      
        # 根据内容相似度调整
        filter = 0
        if passage.cf.lsaSimilarity < 82:
            filter = (passage.cf.lsaSimilarity - 82) * 1.5
        return filter

    def rate(self, passage):
        # 线性预测
        if not passage.preprocessed: essayprepare.processPassage(passage)
        passage.lf = self.extractor.extractLangFeather(passage)
        passage.cf = self.extractor.extractContentFeather(passage)
        passage.sf = self.extractor.extractStructureFeather(passage)
        passage.lsaScore = passage.cf.lsaScore
        passage.lsaSimilarity = passage.cf.lsaSimilarity
        passage.lsaSimilarityAll = passage.cf.lsaSimilarityAll

        exog = []
        x = self.__getFeatherList(passage)
        exog.append(x)
#        for i, xx in enumerate(x):
#            x[i] -= self.m[i]
        exog = np.array(exog)
#        xxexog = dot(self.p, exog.transpose())
#        endog = self.gls_model.predict(xxexog.transpose())
        endog = self.gls_model.predict(exog)
        passage.rateScore = endog[0]
        passage.endogScore = endog[0]
        
        # 调整分数
        passage.filter_scores = []
        filters = [self.tokenCountFilter, self.sentenceLengthAverageFilter,
                   self.wordLengthAverageFilter, self.aclWordCountFilter,
                   self.noneStopWordLengthAverageFilter, self.nounRatioFilter,
                   self.verbRatioFilter, self.adjRatioFilter,
                   self.posRatioFilter, self.lsaFilter]
        
        for filter in filters:
            filter_score = filter(passage)
            passage.rateScore += filter_score
            passage.filter_scores.append(filter_score)
        
        self.generateRateResult(passage)
        
        passage.rated = True
        endog[0] = passage.rateScore
        return [passage.rateScore]
    
    def predict(self, passages):
        # 提取特征
        for p in passages:
            if not p.preprocessed: essayprepare.processPassage(p)
            p.lf = self.extractor.extractLangFeather(p)
            p.cf = self.extractor.extractContentFeather(p)
            p.sf = self.extractor.extractStructureFeather(p)

        # 输出特征值
        f = open('fs_test.txt', 'w')
        
        # 生成特征向量
        endog = []
        exog = []
        labels = []
        for p in passages:
            score = int(p.score)
            if score < 35: score = 35
            endog.append(score)
            x = self.__getFeatherList(p)
            exog.append(x)
            labels.append(p.label)
            
            f.write(p.id + ' ')
            f.write(p.score)
            for v in x:
                f.write(' ' + str(v))
            f.write('\n')
    
        f.close()
        
        p_label, p_acc, p_val = svmutil.svm_predict(labels, exog, self.svm_model)  
        print p_label, p_acc, p_val
        
    def generateRateResult(self, passage):
        rateResult = {}
        rateResult['score'] = passage.rateScore 
        rateResult['sentences'] = []
        for para in passage.paragraphs:
            for sent in para.sentences:
                sentence = {}
                sentence['sentenceNo'] = sent.sentenceNo
                sentence['sentence'] = sent.sentence
                tokens = [token.token for token in sent.tokens]
                tags = [token.pos for token in sent.tokens]
                errorTokens = [token.token for token in sent.tokens if token.isSpellError]
                sentence['tokens'] = tokens
                sentence['tags'] = tags
                sentence['spellerror'] = errorTokens
                sentence['ltCheckResults'] = sent.ltCheckResults
                sentence['lgCheckResult'] = sent.canParsed
                sentence['complexity'] = sent.complexity
                rateResult['sentences'].append(sentence)
        passage.rateResult = rateResult
Пример #16
0
    def train(self, passages):
        """Preprocess *passages*, extract features, and fit the GLS model.

        Side effects: sorts *passages* by score (descending), stores the
        trained EssayModel in self.models['1'], writes every feature
        vector to fs_train.txt, and fits self.gls_model.
        """
        # Preprocess the passages.
        i = 1
        for p in passages:
            #print "Passage ", i
            # Process this passage if it has not been preprocessed yet.
            if not p.preprocessed: essayprepare.processPassage(p)
            i += 1
        
        # Train the essay model (Python 2 cmp-based sort, best scores first).
        passages.sort(cmp=lambda x,y: cmp(x.score, y.score), reverse=True)
        
        model = EssayModel()
        model.train(passages)
        self.models['1'] = model
        #print model.triGramDicts
        
        # Extract language / content / structure features.
        self.extractor = FeatherExtractor(model)
        for p in passages:
            p.lf = self.extractor.extractLangFeather(p)
            p.cf = self.extractor.extractContentFeather(p)
            p.sf = self.extractor.extractStructureFeather(p)   
        
        # Dump feature values for offline inspection.
        f = open('fs_train.txt', 'w')
        
        # Build the regression target (endog) and design matrix (exog).
        endog = []
        exog = []
        labels = []
        for p in passages:
            score = int(p.score)
            #if score > 95: score = 95
            if score < 40: score = 40
            endog.append(score)
            x = self.__getFeatherList(p)
            exog.append(x)

            labels.append(p.label)
            
            f.write(p.id + ' ')
            f.write(str(p.score))
            for v in x:
                f.write(' ' + str(v))
            f.write('\n')
        
        f.close()       
        
        # SVM classifier training (disabled).
        #self.svm_model = svmutil.svm_train(labels, exog, '-c 3')
        
        # Linear regression model training.
        endog = np.array(endog)
        exog = np.array(exog)
#        print endog
#        print exog
        
#        self.m = np.mean(exog,axis=0)
#        print self.m
#        
#        T, P, e_var = PCA_svd(exog)   
#        print T
#        print P
#        print e_var
#        
#        r, c = P.shape
#        print r, c
#        for i in xrange(11, r):
#            for j in xrange(0, c):
#                P[i, j] = 0
#        print P
#        self.p = P
#        
#        xexog = dot(P, exog.transpose())
#        print xexog
#        print xexog.shape
#        
#        xxexog = xexog.transpose() 
        
        self.gls_model = sm.GLS(endog, exog)
        # NOTE(review): the fit() results object is discarded -- presumably
        # a legacy statsmodels API stores enough state on the model; verify.
        self.gls_model.fit()
Пример #17
0
class NeuroRater():
    
    def __init__(self):
        self.models = {} # ᄡ₩얘ᅨᅮ￐샤ᅲ￷ᅫᅣᅣᆪ￐ᅪ
        self.gls_model = None # ᅬ￟￐ᅯᄏ￘ᄍ←ᅣᆪ￐ᅪ
        self.extractor = None # ᅩ￘ᅰ￷ᅩ£￈거￷
        self.svm_model = None # SVN위￀¢ᅥ￷
        pass
    
    def __trainModel(self, passages, model):
        pass
    
    def __loadModel(self, modelFilename):
        pass
       
    def __getFeatherList(self, passage):
        fs = []
        fs.append(1)
        fs.append(passage.lf.sentenceErrorCount)
        fs.append(passage.lf.spellErrorCount)
        #fs.append(passage.lf.ltErrorCount)
        fs.append(passage.lf.prepositionUse)
        fs.append(passage.lf.definiteArticleUse)
        fs.append(passage.lf.wordCombRecurrentCount)  
        #fs.append(passage.lf.tokenCount)
        #fs.append(passage.lf.wordTypeCount)
        fs.append(passage.lf.wordStemCount)
        fs.append(passage.lf.wordLengthAverage)
        #fs.append(passage.lf.wordLengthSD)
        fs.append(passage.lf.wordTypeRatio)
        fs.append(passage.lf.indexOfGuiraud)
        #for x in passage.lf.wordCountInLevels:
        #    fs.append(x)
        fs.append(passage.lf.gerundCount)
        #fs.append(passage.lf.gerundRatio)
        #fs.append(passage.lf.sentenceLengthAverage)
        #fs.append(passage.lf.sentenceLengthSD)
        #fs.append(passage.lf.automatedReadabilityIndex)  
        fs.append(passage.lf.sentenceComplexity)  
        #fs.append(passage.lf.sentenceComplexityScale) 
         
        #fs.append(passage.cf.lsaSimilarity)   
        #fs.append(passage.cf.proceduralVocabularyCount) 
        fs.append(passage.cf.keywordCover)
        
        fs.append(passage.sf.connectiveCount)   
        #fs.append(passage.sf.connectiveRatio)   
        #fs.append(passage.sf.specialDemonstrativePronounCount)
        #fs.append(passage.sf.specialDemonstrativePronounUse)
        #fs.append(passage.sf.restPronounCount)
        #fs.append(passage.sf.restPronounUse)        
        fs.append(passage.lf.highLowLevelRatio)
        fs.append((passage.lf.wordCountInLevels[3] + passage.lf.wordCountInLevels[4]) * 1.0 
                  / (passage.lf.wordCountInLevels[1] + passage.lf.wordCountInLevels[2]))
        fs.append(passage.lf.overlyUseWordCount)
        return fs
    
    def train(self, passages):
        # ᅯᄂᄡᆭ￀■ᅫᅣᅰᅡ
        i = 1
        for p in passages:
            #print "Passage ", i
            # ᄡᆭ￀■ᅫᅣᅰᅡ
            if not p.preprocessed: essayprepare.processPassage(p)
            i += 1
        
        # ￑ᄉ￁앿￐ᅪ
        passages.sort(cmp=lambda x,y: cmp(x.score, y.score), reverse=True)
        
        model = EssayModel()
        model.train(passages)
        self.models['1'] = model
        #print model.triGramDicts
        
        # ᅩ£￈고￘ᅰ￷
        self.extractor = FeatherExtractor(model)
        for p in passages:
            p.lf = self.extractor.extractLangFeather(p)
            p.cf = self.extractor.extractContentFeather(p)
            p.sf = self.extractor.extractStructureFeather(p)   
        
        # ᅧ¦ᄈ￶ᅩ￘ᅰ￷ᅱᄉ
        f = open('fs_train.txt', 'w')
        
        # ￉ᄈ￉ᅩ￘ᅰ￷ᅬ￲￁﾿
        endog = []
        exog = []
        labels = []
        for p in passages:
            score = int(p.score)
            #if score > 95: score = 95
            if score < 40: score = 40
            endog.append(score)
            x = self.__getFeatherList(p)
            exog.append(x)

            labels.append(p.label)
            
            f.write(p.id + ' ')
            f.write(str(p.score))
            for v in x:
                f.write(' ' + str(v))
            f.write('\n')
        
        f.close()       
        
        # SVM위￀¢ᅥ￷￑ᄉ￁ᄋ
        #self.svm_model = svmutil.svm_train(labels, exog, '-c 3')
        
        # ᅬ￟￐ᅯᄏ￘ᄍ←ᅣᆪ￐ᅪ￑ᄉ￁ᄋ  
        endog = np.array(endog)
        exog = np.array(exog)
#        print endog
#        print exog
        
#        self.m = np.mean(exog,axis=0)
#        print self.m
#        
#        T, P, e_var = PCA_svd(exog)   
#        print T
#        print P
#        print e_var
#        
#        r, c = P.shape
#        print r, c
#        for i in xrange(11, r):
#            for j in xrange(0, c):
#                P[i, j] = 0
#        print P
#        self.p = P
#        
#        xexog = dot(P, exog.transpose())
#        print xexog
#        print xexog.shape
#        
#        xxexog = xexog.transpose() 
        
        self.gls_model = sm.GLS(endog, exog)
        self.gls_model.fit()
#        print self.gls_model.results.params

    
    def rate(self, passage):
        # ᅬ￟￐ᅯᅯᄂᄇ¬
        if not passage.preprocessed: essayprepare.processPassage(passage)
        passage.lf = self.extractor.extractLangFeather(passage)
        passage.cf = self.extractor.extractContentFeather(passage)
        passage.sf = self.extractor.extractStructureFeather(passage)
        exog = []
        x = self.__getFeatherList(passage)
        exog.append(x)
#        for i, xx in enumerate(x):
#            x[i] -= self.m[i]
        exog = np.array(exog)
#        xxexog = dot(self.p, exog.transpose())
#        endog = self.gls_model.predict(xxexog.transpose())
        endog = self.gls_model.predict(exog)
        passage.rateScore = endog[0]
        passage.endogScore = endog[0]
        
        # ᄉ￷ᅰ위ᅧ�
        # ᄌᄒ￝ᅫᅣᅰᅡᅲᅱᅧ�ᄉ￷ᅰ
        if (passage.lf.tokenCount < 100):
            passage.rateScore *= 0.8
        elif passage.lf.tokenCount < 120:
            passage.rateScore *= 0.9
            
        # ᄌᄒ￝ᅥᄑᄒᄒ¦ᄈᄂᄉ￷ᅰ
        filter = 0
        slv = passage.lf.sentenceLengthAverage
        if (slv < 10):
            filter = (10 - slv) * 2
            if filter > 6: filter = 6
        elif slv > 23:
            filter = (slv - 23) * 3
            if filter > 9: filter = 9
        passage.rateScore -= filter
        
        # ᄌᄒ￝ᅥᄑᄒᄡᅧᄈᄂᄉ￷ᅰ 
        filter = 0
        wlv = passage.lf.wordLengthAverage
        if wlv < 4:
            filter = (4 - wlv) * 10
        passage.rateScore -= filter  
        
        # ᄌᄒ￝ᅧᄉᄡᅧᅥᄑᄒᄈᄂᄊ￈ᄉ￷ᅰ
        filter = 0
        rwlv = passage.lf.noneStopWordLengthAverage
        if rwlv < 5.5:
            filter = (5.5 - rwlv) * 10  
        passage.rateScore -= filter
        
        # ᄌᄒ￝ᄡᅧ￐ᅯᄆ￈￀�ᄉ￷ᅰ
        filter = 0
        nr = passage.lf.nounRatio
        if nr < 0.2:
            filter = (0.2 - nr) * 100
        elif nr > 0.35:
            filter = (nr - 0.35) * 100
        passage.rateScore -= filter        
        
        filter = 0
        vr = passage.lf.verbRatio
        if vr < 0.1:
            filter = (0.1 - vr) * 200
        elif vr > 0.2:
            filter = (vr - 0.2) * 200
        passage.rateScore -= filter     
        
        filter = 0
        ar = passage.lf.adjRatio
        if ar < 0.045:
            filter = (0.045 - ar) * 500
        passage.rateScore -= filter  
        
        filter = 0
        badRatioCount = 0   
        offsetRatio = 0       
        if (nr < 0.2) or (nr > 0.3):
            badRatioCount += 1
        else:
            offsetRatio += abs(nr - 0.25) / 0.1
        if (vr < 0.1) or (vr > 0.2):
            badRatioCount += 1
        else:
            offsetRatio += abs(vr - 0.15) / 0.1
        if (ar < 0.06) or (ar > 0.13):
            badRatioCount += 1
        else:
            offsetRatio += abs(ar - 0.095) / 0.14
        if badRatioCount == 0:
           if offsetRatio < 0.1:
                filter = passage.rateScore * 0.05
        elif badRatioCount == 1:
            if offsetRatio > 0.6:
                filter = - passage.rateScore * 0.05
        elif badRatioCount > 1:
            filter = - passage.rateScore * 0.02 * badRatioCount * badRatioCount
        passage.rateScore += filter
        passage.offsetRatio = offsetRatio
                            
        # ᄌᄒ￝ᅣᅳ￈￝ᅬ¢ᅨᅥᄊ￈ᄉ￷ᅰ
        if (passage.cf.lsaScore > 75) and (passage.cf.lsaSimilarity > 89) and (passage.rateScore > 75):
            passage.rateScore += 5
        if ((passage.cf.lsaScore < 70) and (passage.rateScore < 70)) and (passage.cf.lsaSimilarity > 89):
            passage.rateScore -=5
        filter = 0
        if ((passage.cf.lsaSimilarity <= 80) and (passage.cf.lsaSimilarity > 60)) or ((passage.cf.lsaSimilarityAll <= 56) and (passage.cf.lsaSimilarityAll > 32)):
            filter = (15 - abs(passage.cf.lsaSimilarity - 70) / 3.0)
#            if passage.rateScore < passage.cf.lsaScore:
#                passage.rateScore = passage.cf.lsaScore
        passage.rateScore += filter
        
        self.generateRateResult(passage)
        
        passage.rated = True
        endog[0] = passage.rateScore
        return [passage.rateScore]
    
    def predict(self, passages):
        # ᅩ£￈고￘ᅰ￷
        for p in passages:
            if not p.preprocessed: essayprepare.processPassage(p)
            p.lf = self.extractor.extractLangFeather(p)
            p.cf = self.extractor.extractContentFeather(p)
            p.sf = self.extractor.extractStructureFeather(p)

        # ᅧ¦ᄈ￶ᅩ￘ᅰ￷ᅱᄉ
        f = open('fs_test.txt', 'w')
        
        # ￉ᄈ￉ᅩ￘ᅰ￷ᅬ￲￁﾿
        endog = []
        exog = []
        labels = []
        for p in passages:
            score = int(p.score)
            if score < 35: score = 35
            endog.append(score)
            x = self.__getFeatherList(p)
            exog.append(x)
            labels.append(p.label)
            
            f.write(p.id + ' ')
            f.write(p.score)
            for v in x:
                f.write(' ' + str(v))
            f.write('\n')
    
        f.close()
        
        p_label, p_acc, p_val = svmutil.svm_predict(labels, exog, self.svm_model)  
        print p_label, p_acc, p_val
        
    def generateRateResult(self, passage):
        rateResult = {}
        rateResult['score'] = passage.rateScore 
        rateResult['sentences'] = []
        for para in passage.paragraphs:
            for sent in para.sentences:
                sentence = {}
                sentence['sentenceNo'] = sent.sentenceNo
                sentence['sentence'] = sent.sentence
                tokens = [token.token for token in sent.tokens]
                tags = [token.pos for token in sent.tokens]
                errorTokens = [token.token for token in sent.tokens if token.isSpellError]
                sentence['tokens'] = tokens
                sentence['tags'] = tags
                sentence['spellerror'] = errorTokens
                sentence['ltCheckResults'] = sent.ltCheckResults
                sentence['lgCheckResult'] = sent.canParsed
                sentence['complexity'] = sent.complexity
                rateResult['sentences'].append(sentence)
        passage.rateResult = rateResult
Пример #18
0
    def train(self, passages):
        """Preprocess *passages*, extract their features, dump them to
        fs_train.txt, and fit the GLS regression model.

        Side effects: sorts *passages* by score (descending), stores the
        trained EssayModel in self.models['1'], and fits self.gls_model.
        """
        # Make sure every passage has been preprocessed.
        for essay in passages:
            if not essay.preprocessed:
                essayprepare.processPassage(essay)

        # Train the essay model on the passages, best scores first.
        passages.sort(cmp=lambda a, b: cmp(a.score, b.score), reverse=True)

        essay_model = EssayModel()
        essay_model.train(passages)
        self.models['1'] = essay_model

        # Extract language / content / structure features.
        self.extractor = FeatherExtractor(essay_model)
        for essay in passages:
            essay.lf = self.extractor.extractLangFeather(essay)
            essay.cf = self.extractor.extractContentFeather(essay)
            essay.sf = self.extractor.extractStructureFeather(essay)

        # Accumulate regression targets / design matrix / class labels,
        # writing each feature vector to fs_train.txt as we go.
        endog = []
        exog = []
        labels = []
        feats_file = open('fs_train.txt', 'w')
        for essay in passages:
            endog.append(int(essay.score))
            vec = self.__getFeatherList(essay)
            exog.append(vec)
            labels.append(essay.label)

            row = [essay.id, str(essay.score)] + [str(v) for v in vec]
            feats_file.write(' '.join(row) + '\n')
        feats_file.close()

        # SVM classifier training (disabled).
        #self.svm_model = svmutil.svm_train(labels, exog, '-c 3')

        # Fit the linear regression model.
        self.gls_model = sm.GLS(np.array(endog), np.array(exog))
        self.gls_model.fit()
Пример #19
0
class GeneralEssayRater():
    """A general-purpose essay rater.

    Scores an essay with a linear model over language (.lf), content (.cf)
    and structure (.sf) features, then applies a series of heuristic
    "filter" adjustments to the raw linear score.  Two scoring paths exist:
    ``rate`` uses a GLS model fitted by ``train``; ``rate_by_params`` uses
    the hand-tuned coefficients in ``model_params``.
    (Class docstring translated from Chinese: "This is a general rater".)
    """
    
    def __init__(self):
        # Hand-tuned linear coefficients consumed by rate_by_params() via dot().
        # NOTE(review): this list holds 16 values, but __getFeatherList()
        # appends 17 features (including the leading constant) -- dot() would
        # fail on a length mismatch in rate_by_params(); confirm which
        # coefficient is missing.
        self.model_params = [30, -0.9, -0.55, -80, 50, -0.38, 8.0, -145,
                             16.8, 0.05, 0.4, -0.04, 35, -0.4, 0.35, -0.5]
        pass
       
    def __getFeatherList(self, passage):
        """Get feathers from preprocessed passage.

        Builds the numeric feature vector read by the linear models.
        The passage must already carry .lf, .cf and .sf feature objects.
        Commented-out appends are features that were tried and disabled;
        they are kept for reference.  Order matters: it must match the
        coefficient order of model_params / the fitted GLS model.
        """

        fs = []
        fs.append(1) # const
        fs.append(passage.lf.sentenceErrorCount)
        fs.append(passage.lf.spellErrorCount)
        #fs.append(passage.lf.ltErrorCount)
        fs.append(passage.lf.prepositionUse)
        fs.append(passage.lf.definiteArticleUse)
        #fs.append(passage.lf.wordCombRecurrentCount)  
        #fs.append(passage.lf.tokenCount)
        #fs.append(passage.lf.wordTypeCount)
        fs.append(passage.lf.wordStemCount)
        fs.append(passage.lf.wordLengthAverage)
        #fs.append(passage.lf.wordLengthSD)
        fs.append(passage.lf.wordTypeRatio)
        fs.append(passage.lf.indexOfGuiraud)
        #for x in passage.lf.wordCountInLevels:
        #    fs.append(x)
        fs.append(passage.lf.gerundCount)
        #fs.append(passage.lf.gerundRatio)
        #fs.append(passage.lf.sentenceLengthAverage)
        #fs.append(passage.lf.sentenceLengthSD)
        #fs.append(passage.lf.automatedReadabilityIndex)  
        fs.append(passage.lf.sentenceComplexity)  
        #fs.append(passage.lf.sentenceComplexityScale) 
         
        #fs.append(passage.cf.lsaScore)   
        #fs.append(passage.cf.proceduralVocabularyCount) 
        #fs.append(passage.cf.keywordCover)
        
        fs.append(passage.sf.connectiveCount)   
        #fs.append(passage.sf.connectiveRatio)   
        #fs.append(passage.sf.specialDemonstrativePronounCount)
        #fs.append(passage.sf.specialDemonstrativePronounUse)
        #fs.append(passage.sf.restPronounCount)
        #fs.append(passage.sf.restPronounUse)        
        #fs.append(passage.lf.highLowLevelRatio)
#        fs.append((passage.lf.wordCountInLevels[3] + passage.lf.wordCountInLevels[4]) * 1.0 
#                  / (passage.lf.wordCountInLevels[1] + passage.lf.wordCountInLevels[2]))
        fs.append(passage.lf.overlyUseWordCount)
        fs.append(passage.lf.aclWordCount)
        #fs.append(passage.lf.aclWordRatio)
        fs.append(passage.lf.nominalizationCountUnique)
        #fs.append(passage.lf.pn_range_count[2])
        # NOTE(review): raises ZeroDivisionError when pn_range_count[1] == 0
        # -- confirm upstream guarantees a non-zero count here.
        fs.append((passage.lf.pn_range_count[2] + passage.lf.pn_range_count[3])/passage.lf.pn_range_count[1])
        fs.append(passage.lf.top_sentence_length)
        return fs
    
    def train(self, passages):
        """Fit the GLS scoring model from a list of scored passages.

        Side effects: preprocesses each passage in place, attaches
        .lf/.cf/.sf feature objects, dumps the feature matrix to
        'fs_zhang_train.txt', and stores the fitted model on
        self.gls_model / self.extractor for later use by rate().
        """
        # pre-process passage
        i = 1
        for p in passages:
            print "======================="
            print "Passage", i, p.id
            if not p.preprocessed: essayprepare.processPassage(p)
            i += 1

        self.extractor = FeatherExtractor(None)
        for p in passages:
            p.lf = self.extractor.extractLangFeather(p)
            p.cf = self.extractor.extractContentFeather(p)
            p.sf = self.extractor.extractStructureFeather(p)   
        
        # save feathers
        f = open('fs_zhang_train.txt', 'w')
        for p in passages:   
            x = self.__getFeatherList(p)       
            f.write(p.id + ' ')
            f.write(str(p.score))
            for v in x:
                f.write(' ' + str(v))
            f.write('\n')
        f.close()
        
        # generate feather vector
        endog = []
        exog = []
        for p in passages:
            score = int(p.score)
            endog.append(score)
            x = self.__getFeatherList(p)
            exog.append(x)     
        
        # train model
        endog = np.array(endog)
        exog = np.array(exog)
        
        self.gls_model = sm.GLS(endog, exog)
        results = self.gls_model.fit()
        #print results.summary()
        print results.params
    
    def tokenCountFilter(self, passage):
        # Penalty for short essays: -20% of the current score under 100
        # tokens, -10% under 120 tokens.
        # (Comment translated from Chinese: "adjust by essay word count".)
        filter = 0
        if (passage.lf.tokenCount < 100):
            filter = passage.rateScore * 0.2
        elif passage.lf.tokenCount < 120:
            filter = passage.rateScore * 0.1   
        filter = - filter  
        return filter   
    
    def sentenceLengthAverageFilter(self, passage):
        # Penalty for abnormal average sentence length: capped at -6 for
        # very short sentences (<10) and -9 for very long ones (>23).
        # (Comment translated from Chinese: "adjust by mean sentence length".)
        filter = 0
        slv = passage.lf.sentenceLengthAverage
        if (slv < 10):
            filter = (10 - slv) * 2
            if filter > 6: filter = 6
        elif slv > 23:
            filter = (slv - 23) * 3
            if filter > 9: filter = 9
        filter = - filter
        return filter  
    
    def wordLengthAverageFilter(self, passage):
        # Penalty when average word length drops below 4 characters.
        # (Comment translated from Chinese: "adjust by mean word length".)
        filter = 0
        wlv = passage.lf.wordLengthAverage
        if wlv < 4:
            filter = (4 - wlv) * 10
        filter = - filter
        return filter    
    
    def aclWordCountFilter(self, passage):
        # Bonus of +10% of the current score when the essay uses more than
        # 9 academic (ACL) words.
        # (Comment translated from Chinese: "adjust by academic word count".)
        filter = 0
        acl = passage.lf.aclWordCount
        if acl > 9:
            filter = passage.rateScore * 0.1
        return filter
    
    def noneStopWordLengthAverageFilter(self, passage):          
        # Penalty when the average length of content (non-stop) words is
        # below 5.5 characters.
        # (Comment translated from Chinese: "adjust by mean content-word length".)
        filter = 0
        rwlv = passage.lf.noneStopWordLengthAverage
        if rwlv < 5.5:
            filter = (5.5 - rwlv) * 10  
        filter = -filter
        return filter
        
    def nounRatioFilter(self, passage):
        # Penalty when the noun ratio falls outside the [0.2, 0.35] band.
        # (Comment translated from Chinese: "adjust by part-of-speech ratio".)
        filter = 0
        nr = passage.lf.nounRatio
        if nr < 0.2:
            filter = (0.2 - nr) * 100
        elif nr > 0.35:
            filter = (nr - 0.35) * 100
        filter = - filter
        return filter
    
    def verbRatioFilter(self, passage):
        # Penalty when the verb ratio falls outside the [0.1, 0.2] band.
        # Computed but not applied to the score in rate() (see commented-out
        # "+=" there).
        filter = 0
        vr = passage.lf.verbRatio
        if vr < 0.1:
            filter = (0.1 - vr) * 200
        elif vr > 0.2:
            filter = (vr - 0.2) * 200
        filter = - filter
        return filter
    
    def adjRatioFilter(self, passage):
        # Penalty when the adjective ratio drops below 0.045.  Computed but
        # not applied to the score in rate().
        filter = 0
        ar = passage.lf.adjRatio
        if ar < 0.045:
            filter = (0.045 - ar) * 500
        filter = - filter
        return filter
    
    def posRatioFilter(self, passage):
        # Combined part-of-speech-balance adjustment: counts how many of the
        # noun/verb/adjective ratios fall outside their expected bands and
        # rewards well-balanced essays / penalizes unbalanced ones as a
        # fraction of the current score.  Also records passage.offsetRatio.
        # Computed but not applied to the score in rate().
        filter = 0
        badRatioCount = 0   
        offsetRatio = 0      
        nr = passage.lf.nounRatio
        vr = passage.lf.verbRatio
        ar = passage.lf.adjRatio
        if (nr < 0.2) or (nr > 0.3):
            badRatioCount += 1
        else:
            offsetRatio += abs(nr - 0.25) / 0.1
        if (vr < 0.1) or (vr > 0.2):
            badRatioCount += 1
        else:
            offsetRatio += abs(vr - 0.15) / 0.1
        if (ar < 0.06) or (ar > 0.15):
            badRatioCount += 1
        else:
            offsetRatio += abs(ar - 0.105) / 0.15
        if badRatioCount == 0:
           if offsetRatio < 0.1:
                filter = passage.rateScore * 0.05
        elif badRatioCount == 1:
            if offsetRatio > 0.6:
                filter = - passage.rateScore * 0.05
        elif badRatioCount > 1:
            filter = - passage.rateScore * 0.02 * badRatioCount * badRatioCount
        passage.offsetRatio = offsetRatio
        return filter  

    def rate(self, passage):
        """Score a passage with the GLS model fitted by train().

        Mutates the passage in place: sets .lf/.cf/.sf, .rateScore,
        .endogScore, .filters (one entry per filter, including the three
        that are computed but not added to the score) and .rated.
        Returns a one-element list [rateScore].
        Requires self.extractor and self.gls_model (i.e. train() first).
        """
        # Linear prediction (translated from Chinese).
        if not passage.preprocessed: essayprepare.processPassage(passage)
        passage.lf = self.extractor.extractLangFeather(passage)
        passage.cf = self.extractor.extractContentFeather(passage)
        passage.sf = self.extractor.extractStructureFeather(passage)

        exog = []
        x = self.__getFeatherList(passage)
        exog.append(x)
        exog = np.array(exog)
        endog = self.gls_model.predict(exog)
        passage.rateScore = endog[0]
        passage.endogScore = endog[0]
        
        passage.filters = []
        
        # Adjust the score (translated from Chinese).  Order matters:
        # several filters scale the *current* passage.rateScore.
        filter = self.tokenCountFilter(passage)
        passage.rateScore += filter
        passage.filters.append(filter)
            
        filter = self.sentenceLengthAverageFilter(passage)
        passage.rateScore += filter
        passage.filters.append(filter)
        
        filter = self.wordLengthAverageFilter(passage)
        passage.rateScore += filter  
        passage.filters.append(filter)
        
        filter = self.aclWordCountFilter(passage)
        passage.rateScore += filter
        passage.filters.append(filter)
        
        filter = self.noneStopWordLengthAverageFilter(passage)
        passage.rateScore += filter
        passage.filters.append(filter)
        
        filter = self.nounRatioFilter(passage)  
        passage.rateScore += filter      
        passage.filters.append(filter)
        
        # The remaining three filters are recorded but deliberately not
        # added to the score (their "+=" lines are commented out).
        filter = self.verbRatioFilter(passage)
        #passage.rateScore += filter     
        passage.filters.append(filter)
        
        filter = self.adjRatioFilter(passage)
        #passage.rateScore += filter  
        passage.filters.append(filter)
        
        filter = self.posRatioFilter(passage)
        #passage.rateScore += filter
        passage.filters.append(filter)
        
        passage.rated = True
        endog[0] = passage.rateScore
        return [passage.rateScore]
    
    def rate_by_params(self, passage):
        """Score a passage with the hand-tuned model_params coefficients.

        Unlike rate(), this path needs no prior train() call: it builds a
        fresh FeatherExtractor(None).  Mutates the passage in place
        (.rateScore, .endogScore, .filter_scores, .rated) and returns a
        one-element list [rateScore].
        NOTE(review): model_params has 16 entries vs. 17 features from
        __getFeatherList() -- dot() would fail; see __init__.
        """
        # Linear prediction (translated from Chinese).
        extractor = FeatherExtractor(None)
        if not passage.preprocessed: essayprepare.processPassage(passage)
        passage.lf = extractor.extractLangFeather(passage)
        passage.cf = extractor.extractContentFeather(passage)
        passage.sf = extractor.extractStructureFeather(passage)

        exog = []  # NOTE(review): built but never used in this method
        x = self.__getFeatherList(passage)
        
        score = dot(x, self.model_params)
        
        passage.rateScore = score
        passage.endogScore = score
                
        # Adjust the score (translated from Chinese).
        passage.filter_scores = []
        # 'filter' shadows the builtin here; kept as-is to preserve the code.
        filters = [self.tokenCountFilter, self.sentenceLengthAverageFilter,
                   self.wordLengthAverageFilter, self.aclWordCountFilter,
                   self.noneStopWordLengthAverageFilter, self.nounRatioFilter]
        
        for filter in filters:
            filter_score = filter(passage)
            passage.rateScore += filter_score
            passage.filter_scores.append(filter_score)
        
        passage.rated = True
        return [passage.rateScore]
Пример #20
0
            print tokens
            print tags
            print lemmas
            print stems
            print levels
            print sentNos
            print paraNos
            print nos
            print sent.tokenCount
            print sent.wordCount
            print sent.realWordCount
    
    print "三元词组", passage.trigrams


    e = FeatherExtractor()

    # 提取语言特征    
    languageFeather = e.extractLangFeather(passage)  
    
    print u"词次总数", languageFeather.tokenCount
    print u"单词总数", languageFeather.wordCount
    print u"词形总数", languageFeather.wordTypeCount
    print u"词元总数", languageFeather.wordLemmaCount
    
    print u"介词个数", languageFeather.prepositionCount
    print u"介词比例", languageFeather.prepositionRatio
    print u"介词使用", languageFeather.prepositionUse
    
    print u"定冠词个数", languageFeather.definiteArticleCount
    print u"定冠词比例", languageFeather.definiteArticleRatio