# -*- coding: utf-8 -*-
# Shared imports for the code below (Python 2 code base).
# NOTE: essayprepare, FeatherExtractor, EssayPassage and EssayModel are
# project-local modules; the import paths used here are assumptions.
import pickle

import numpy as np
import statsmodels.api as sm
import svmutil  # libsvm Python bindings, used by the SVM predict() paths

import essayprepare
from feather import FeatherExtractor              # assumed path
from essaymodel import EssayPassage, EssayModel   # assumed path


def wordRepetitiveDemo():
    print "wordRepetitiveDemo start..."
    pkfile = open('ustcpassages_503.pkl', 'rb')
    passages = pickle.load(pkfile)
    pkfile.close()

    # Use distinct names for the extractor and the output file handle
    extractor = FeatherExtractor()
    for p in passages:
        p.lf = extractor.extractLangFeather(p)

    # Dump one line per lemma-use record
    f = open('wordrep.txt', 'w')
    for p in passages:
        if p.lf:
            for l in p.lf.lemmaUseInfo:
                print p.id, p.score, l[0], l[1], l[2], l[3], l[4], l[5]
                s = ' '.join([str(p.id), str(p.score),
                              str(p.lf.overlyUseWordCount), l[0],
                              str(l[1]), str(l[2]), str(l[3]),
                              str(l[4]), str(l[5])])
                f.write(s)
                f.write('\n')
    f.close()
    print "wordRepetitiveDemo over!!!"
def demo_one_sentence():
    # Build a one-sentence essay passage
    passage = EssayPassage()
    passage.passage = 'I am a students.'
    passage.title = 'title'
    passage.score = 5
    passage.id = '1'
    passage.reviewerId = 3
    passage.content = 'I am a students.'

    # Preprocess the passage
    essayprepare.processPassage(passage)

    extractor = FeatherExtractor()
    lf = extractor.extractLangFeather(passage)
    passage.lf = lf
    cf = extractor.extractContentFeather(passage)
    sf = extractor.extractStructureFeather(passage)
    print 'OK'
def generatePassageFeathers(passages, outFilename):
    f = open(outFilename, 'w')
    e = FeatherExtractor()
    i = 1
    for p in passages:
        print "Passage ", i
        # Preprocess the passage
        essayprepare.processPassage(p)
        # Extract language features
        languageFeather = e.extractLangFeather(p)
        p.lf = languageFeather
        # Extract structure features
        structureFeather = e.extractStructureFeather(p)
        p.sf = structureFeather
        f.write(p.id + ' ')
        f.write(str(p.score))
        f.write(' ' + str(languageFeather))
        f.write('\n')
        i += 1
    f.close()
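
# A minimal usage sketch for generatePassageFeathers(), assuming the same
# pickled passage list used by wordRepetitiveDemo() above; the output
# filename mirrors the commented generateUSTCFeathers() call further down.
def demo_generate_feathers():
    pkfile = open('ustcpassages_503.pkl', 'rb')
    passages = pickle.load(pkfile)
    pkfile.close()
    generatePassageFeathers(passages, 'USTCFeathers_503.txt')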
print "SPELLERROR", errorTokens print tokens print tags print lemmas print stems print levels print sentNos print paraNos print nos print sent.tokenCount print sent.wordCount print sent.realWordCount print "三元词组", passage.trigrams e = FeatherExtractor() # 提取语言特征 languageFeather = e.extractLangFeather(passage) print u"词次总数", languageFeather.tokenCount print u"单词总数", languageFeather.wordCount print u"词形总数", languageFeather.wordTypeCount print u"词元总数", languageFeather.wordLemmaCount print u"介词个数", languageFeather.prepositionCount print u"介词比例", languageFeather.prepositionRatio print u"介词使用", languageFeather.prepositionUse print u"定冠词个数", languageFeather.definiteArticleCount print u"定冠词比例", languageFeather.definiteArticleRatio
def processEssay(self):
    self.browser.clear()
    id = unicode(self.lineedit.text())
    essay = self.essayDict.get(id)
    if not essay:
        self.browser.append("<font color=red>%s is not found!</font>" % id)
        return
    self.browser.append(essay.content)

    # Build the essay passage
    passage = EssayPassage()
    passage.passage = essay.cleanContent()
    passage.title = essay.title
    passage.score = essay.score
    passage.id = essay.id

    # Preprocess the passage
    essayprepare.processPassage(passage)

    # Dump the processed passage to see what it looks like
    self.browser.append("PASSAGE=========================================")
    self.browser.append(passage.id)
    #self.browser.append(passage.title)
    self.browser.append(str(passage.score))  # append() needs a string
    self.browser.append(passage.passage)
    self.browser.append(str(len(passage.paragraphs)))
    self.browser.append("PARAGRAPHS---------------------------------------")
    for para in passage.paragraphs:
        self.browser.append(str(para.paragraphNo))
        self.browser.append(para.paragraph)
        for sent in para.sentences:
            self.browser.append(str(sent.sentenceNo))
            self.browser.append(str(sent.paragraphSentenceNo))
            self.browser.append(sent.sentence)
            tokens = [token.token for token in sent.tokens]
            tags = [token.pos for token in sent.tokens]
            lemmas = [token.lemma for token in sent.tokens]
            stems = [token.stem for token in sent.tokens]
            levels = [token.level for token in sent.tokens]
            nos = [token.tokenNo for token in sent.tokens]
            sentNos = [token.sentenceTokenNo for token in sent.tokens]
            paraNos = [token.paragraphTokenNo for token in sent.tokens]
            errorTokens = [token.token for token in sent.tokens
                           if token.isSpellError]
            if not sent.canParsed:
                self.browser.append("<font color=red>SENTENCE ERROR</font>")
            self.browser.append("<font color=red>SPELLERROR %s</font>"
                                % str(errorTokens))
            self.browser.append(str(tokens))
            self.browser.append(str(tags))
            self.browser.append(str(lemmas))
            self.browser.append(str(stems))
            self.browser.append(str(levels))
            self.browser.append(str(sentNos))
            self.browser.append(str(paraNos))
            self.browser.append(str(nos))
            self.browser.append(str(sent.tokenCount))
            self.browser.append(str(sent.wordCount))
            self.browser.append(str(sent.realWordCount))

    self.browser.append(u"trigrams" + ' ' + str(passage.trigrams))

    e = FeatherExtractor()
    # Extract language features
    languageFeather = e.extractLangFeather(passage)
    print u"token count", languageFeather.tokenCount
    print u"word count", languageFeather.wordCount
    print u"word type count", languageFeather.wordTypeCount
    print u"word lemma count", languageFeather.wordLemmaCount
    print u"preposition count", languageFeather.prepositionCount
    print u"preposition ratio", languageFeather.prepositionRatio
    print u"preposition use", languageFeather.prepositionUse
    print u"definite article count", languageFeather.definiteArticleCount
    print u"definite article ratio", languageFeather.definiteArticleRatio
    print u"definite article use", languageFeather.definiteArticleUse
    # Extract structure features
    #structureFeather = e.extractStructureFeather(passage)

#generateUSTCFeathers('USTC2011Jan.txt', 'USTCFeathers_503.txt')
print "...OVER"
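
# processEssay() above expects an object carrying a text browser, a line
# edit and an essay dictionary. A minimal wiring sketch, assuming PyQt4
# (the GUI toolkit is not shown in these fragments, so this is an
# assumption); EssayBrowser and the essays argument are hypothetical names.
from PyQt4 import QtGui

class EssayBrowser(QtGui.QWidget):
    def __init__(self, essays, parent=None):
        QtGui.QWidget.__init__(self, parent)
        self.essayDict = essays              # maps essay id -> essay object
        self.lineedit = QtGui.QLineEdit()
        self.browser = QtGui.QTextBrowser()
        layout = QtGui.QVBoxLayout(self)
        layout.addWidget(self.lineedit)
        layout.addWidget(self.browser)
        # Process the essay whose id was typed when Enter is pressed
        self.lineedit.returnPressed.connect(self.processEssay)

    # Reuse the module-level function above as the slot
    processEssay = processEssay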
class GeneralEssayRater():
    """A general-purpose essay rater."""

    def __init__(self):
        # One coefficient per feature emitted by __getFeatherList().
        # NOTE: as written there are 16 coefficients here but
        # __getFeatherList() returns 17 values, so the two lists must be
        # realigned before rate_by_params() can run.
        self.model_params = [30, -0.9, -0.55, -80, 50, -0.38, 8.0, -145,
                             16.8, 0.05, 0.4, -0.04, 35, -0.4, 0.35, -0.5]

    def __getFeatherList(self, passage):
        """Get feathers from a preprocessed passage."""
        fs = []
        fs.append(1)  # const
        fs.append(passage.lf.sentenceErrorCount)
        fs.append(passage.lf.spellErrorCount)
        #fs.append(passage.lf.ltErrorCount)
        fs.append(passage.lf.prepositionUse)
        fs.append(passage.lf.definiteArticleUse)
        #fs.append(passage.lf.wordCombRecurrentCount)
        #fs.append(passage.lf.tokenCount)
        #fs.append(passage.lf.wordTypeCount)
        fs.append(passage.lf.wordStemCount)
        fs.append(passage.lf.wordLengthAverage)
        #fs.append(passage.lf.wordLengthSD)
        fs.append(passage.lf.wordTypeRatio)
        fs.append(passage.lf.indexOfGuiraud)
        #for x in passage.lf.wordCountInLevels:
        #    fs.append(x)
        fs.append(passage.lf.gerundCount)
        #fs.append(passage.lf.gerundRatio)
        #fs.append(passage.lf.sentenceLengthAverage)
        #fs.append(passage.lf.sentenceLengthSD)
        #fs.append(passage.lf.automatedReadabilityIndex)
        fs.append(passage.lf.sentenceComplexity)
        #fs.append(passage.lf.sentenceComplexityScale)
        #fs.append(passage.cf.lsaScore)
        #fs.append(passage.cf.proceduralVocabularyCount)
        #fs.append(passage.cf.keywordCover)
        fs.append(passage.sf.connectiveCount)
        #fs.append(passage.sf.connectiveRatio)
        #fs.append(passage.sf.specialDemonstrativePronounCount)
        #fs.append(passage.sf.specialDemonstrativePronounUse)
        #fs.append(passage.sf.restPronounCount)
        #fs.append(passage.sf.restPronounUse)
        #fs.append(passage.lf.highLowLevelRatio)
        #fs.append((passage.lf.wordCountInLevels[3] + passage.lf.wordCountInLevels[4]) * 1.0
        #          / (passage.lf.wordCountInLevels[1] + passage.lf.wordCountInLevels[2]))
        fs.append(passage.lf.overlyUseWordCount)
        fs.append(passage.lf.aclWordCount)
        #fs.append(passage.lf.aclWordRatio)
        fs.append(passage.lf.nominalizationCountUnique)
        #fs.append(passage.lf.pn_range_count[2])
        # Force float division; with integer counts Python 2's `/` truncates
        fs.append((passage.lf.pn_range_count[2] + passage.lf.pn_range_count[3]) * 1.0
                  / passage.lf.pn_range_count[1])
        fs.append(passage.lf.top_sentence_length)
        return fs

    def train(self, passages):
        # Preprocess passages
        i = 1
        for p in passages:
            print "======================="
            print "Passage", i, p.id
            if not p.preprocessed:
                essayprepare.processPassage(p)
            i += 1

        self.extractor = FeatherExtractor(None)
        for p in passages:
            p.lf = self.extractor.extractLangFeather(p)
            p.cf = self.extractor.extractContentFeather(p)
            p.sf = self.extractor.extractStructureFeather(p)

        # Save feathers
        f = open('fs_zhang_train.txt', 'w')
        for p in passages:
            x = self.__getFeatherList(p)
            f.write(p.id + ' ')
            f.write(str(p.score))
            for v in x:
                f.write(' ' + str(v))
            f.write('\n')
        f.close()

        # Generate feather vectors
        endog = []
        exog = []
        for p in passages:
            endog.append(int(p.score))
            exog.append(self.__getFeatherList(p))

        # Train the linear model
        endog = np.array(endog)
        exog = np.array(exog)
        self.gls_model = sm.GLS(endog, exog)
        results = self.gls_model.fit()
        #print results.summary()
        print results.params

    def tokenCountFilter(self, passage):
        # Adjust by essay length (token count)
        filter = 0
        if passage.lf.tokenCount < 100:
            filter = passage.rateScore * 0.2
        elif passage.lf.tokenCount < 120:
            filter = passage.rateScore * 0.1
        return -filter

    def sentenceLengthAverageFilter(self, passage):
        # Adjust by average sentence length
        filter = 0
        slv = passage.lf.sentenceLengthAverage
        if slv < 10:
            filter = (10 - slv) * 2
            if filter > 6:
                filter = 6
        elif slv > 23:
            filter = (slv - 23) * 3
            if filter > 9:
                filter = 9
        return -filter

    def wordLengthAverageFilter(self, passage):
        # Adjust by average word length
        filter = 0
        wlv = passage.lf.wordLengthAverage
        if wlv < 4:
            filter = (4 - wlv) * 10
        return -filter

    def aclWordCountFilter(self, passage):
        # Adjust by academic word count
        filter = 0
        acl = passage.lf.aclWordCount
        if acl > 9:
            filter = passage.rateScore * 0.1
        return filter

    def noneStopWordLengthAverageFilter(self, passage):
        # Adjust by average content-word (non-stopword) length
        filter = 0
        rwlv = passage.lf.noneStopWordLengthAverage
        if rwlv < 5.5:
            filter = (5.5 - rwlv) * 10
        return -filter

    def nounRatioFilter(self, passage):
        # Adjust by part-of-speech (noun) ratio
        filter = 0
        nr = passage.lf.nounRatio
        if nr < 0.2:
            filter = (0.2 - nr) * 100
        elif nr > 0.35:
            filter = (nr - 0.35) * 100
        return -filter

    def verbRatioFilter(self, passage):
        filter = 0
        vr = passage.lf.verbRatio
        if vr < 0.1:
            filter = (0.1 - vr) * 200
        elif vr > 0.2:
            filter = (vr - 0.2) * 200
        return -filter

    def adjRatioFilter(self, passage):
        filter = 0
        ar = passage.lf.adjRatio
        if ar < 0.045:
            filter = (0.045 - ar) * 500
        return -filter

    def posRatioFilter(self, passage):
        filter = 0
        badRatioCount = 0
        offsetRatio = 0
        nr = passage.lf.nounRatio
        vr = passage.lf.verbRatio
        ar = passage.lf.adjRatio
        if (nr < 0.2) or (nr > 0.3):
            badRatioCount += 1
        else:
            offsetRatio += abs(nr - 0.25) / 0.1
        if (vr < 0.1) or (vr > 0.2):
            badRatioCount += 1
        else:
            offsetRatio += abs(vr - 0.15) / 0.1
        if (ar < 0.06) or (ar > 0.15):
            badRatioCount += 1
        else:
            offsetRatio += abs(ar - 0.105) / 0.15
        if badRatioCount == 0:
            if offsetRatio < 0.1:
                filter = passage.rateScore * 0.05
        elif badRatioCount == 1:
            if offsetRatio > 0.6:
                filter = -passage.rateScore * 0.05
        elif badRatioCount > 1:
            filter = -passage.rateScore * 0.02 * badRatioCount * badRatioCount
        passage.offsetRatio = offsetRatio
        return filter

    def rate(self, passage):
        # Linear prediction
        if not passage.preprocessed:
            essayprepare.processPassage(passage)
        passage.lf = self.extractor.extractLangFeather(passage)
        passage.cf = self.extractor.extractContentFeather(passage)
        passage.sf = self.extractor.extractStructureFeather(passage)

        exog = np.array([self.__getFeatherList(passage)])
        endog = self.gls_model.predict(exog)
        passage.rateScore = endog[0]
        passage.endogScore = endog[0]

        # Adjust the score; the verb/adj/POS-ratio filters are computed
        # for inspection only and not applied to the score
        passage.filters = []
        applied = [self.tokenCountFilter, self.sentenceLengthAverageFilter,
                   self.wordLengthAverageFilter, self.aclWordCountFilter,
                   self.noneStopWordLengthAverageFilter, self.nounRatioFilter]
        for flt in applied:
            filter = flt(passage)
            passage.rateScore += filter
            passage.filters.append(filter)
        for flt in [self.verbRatioFilter, self.adjRatioFilter,
                    self.posRatioFilter]:
            passage.filters.append(flt(passage))

        passage.rated = True
        endog[0] = passage.rateScore
        return [passage.rateScore]

    def rate_by_params(self, passage):
        # Linear prediction against the fixed model parameters
        extractor = FeatherExtractor(None)
        if not passage.preprocessed:
            essayprepare.processPassage(passage)
        passage.lf = extractor.extractLangFeather(passage)
        passage.cf = extractor.extractContentFeather(passage)
        passage.sf = extractor.extractStructureFeather(passage)

        x = self.__getFeatherList(passage)
        score = np.dot(x, self.model_params)
        passage.rateScore = score
        passage.endogScore = score

        # Adjust the score
        passage.filter_scores = []
        filters = [self.tokenCountFilter, self.sentenceLengthAverageFilter,
                   self.wordLengthAverageFilter, self.aclWordCountFilter,
                   self.noneStopWordLengthAverageFilter, self.nounRatioFilter]
        for flt in filters:
            filter_score = flt(passage)
            passage.rateScore += filter_score
            passage.filter_scores.append(filter_score)

        passage.rated = True
        return [passage.rateScore]
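
# A minimal end-to-end sketch of GeneralEssayRater, reusing the toy passage
# from demo_one_sentence() above. rate_by_params() only needs the fixed
# model_params, so no training data is required (but note the
# coefficient-count caveat flagged at model_params).
def demo_general_rater():
    passage = EssayPassage()
    passage.passage = 'I am a students.'
    passage.title = 'title'
    passage.score = 5
    passage.id = '1'
    passage.reviewerId = 3
    passage.content = 'I am a students.'

    rater = GeneralEssayRater()
    print rater.rate_by_params(passage)   # [adjusted linear score]
    print passage.endogScore              # raw dot(x, model_params) score
    print passage.filter_scores           # per-filter adjustments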
class CollegeEssayRater():

    def __init__(self):
        self.models = {}        # holds all essay models
        self.gls_model = None   # linear regression model
        self.extractor = None   # feather extractor
        self.svm_model = None   # SVM classifier

    def __trainModel(self, passages, model):
        pass

    def __loadModel(self, modelFilename):
        pass

    def __getFeatherList(self, passage):
        fs = []
        fs.append(1)
        fs.append(passage.lf.sentenceErrorCount)
        fs.append(passage.lf.spellErrorCount)
        #fs.append(passage.lf.ltErrorCount)
        fs.append(passage.lf.prepositionUse)
        fs.append(passage.lf.definiteArticleUse)
        fs.append(passage.lf.wordCombRecurrentCount)
        #fs.append(passage.lf.tokenCount)
        #fs.append(passage.lf.wordTypeCount)
        fs.append(passage.lf.wordStemCount)
        fs.append(passage.lf.wordLengthAverage)
        #fs.append(passage.lf.wordLengthSD)
        fs.append(passage.lf.wordTypeRatio)
        fs.append(passage.lf.indexOfGuiraud)
        #for x in passage.lf.wordCountInLevels:
        #    fs.append(x)
        fs.append(passage.lf.gerundCount)
        #fs.append(passage.lf.gerundRatio)
        #fs.append(passage.lf.sentenceLengthAverage)
        #fs.append(passage.lf.sentenceLengthSD)
        #fs.append(passage.lf.automatedReadabilityIndex)
        fs.append(passage.lf.sentenceComplexity)
        #fs.append(passage.lf.sentenceComplexityScale)
        fs.append(passage.cf.lsaScore)
        #fs.append(passage.cf.proceduralVocabularyCount)
        fs.append(passage.cf.keywordCover)
        fs.append(passage.sf.connectiveCount)
        #fs.append(passage.sf.connectiveRatio)
        #fs.append(passage.sf.specialDemonstrativePronounCount)
        #fs.append(passage.sf.specialDemonstrativePronounUse)
        #fs.append(passage.sf.restPronounCount)
        #fs.append(passage.sf.restPronounUse)
        fs.append(passage.lf.highLowLevelRatio)
        #fs.append((passage.lf.wordCountInLevels[3] + passage.lf.wordCountInLevels[4]) * 1.0
        #          / (passage.lf.wordCountInLevels[1] + passage.lf.wordCountInLevels[2]))
        fs.append(passage.lf.overlyUseWordCount)
        fs.append(passage.lf.aclWordCount)
        #fs.append(passage.lf.aclWordRatio)
        fs.append(passage.lf.nominalizationCountUnique)
        return fs

    def train(self, passages):
        # Preprocess passages
        i = 1
        for p in passages:
            #print "Passage ", i
            if not p.preprocessed:
                essayprepare.processPassage(p)
            i += 1

        # Train the essay model (highest-scored passages first)
        passages.sort(key=lambda p: p.score, reverse=True)
        model = EssayModel()
        model.train(passages)
        self.models['1'] = model
        #print model.triGramDicts

        # Extract feathers
        self.extractor = FeatherExtractor(model)
        for p in passages:
            p.lf = self.extractor.extractLangFeather(p)
            p.cf = self.extractor.extractContentFeather(p)
            p.sf = self.extractor.extractStructureFeather(p)

        # Dump feather values and build feather vectors
        f = open('fs_train.txt', 'w')
        endog = []
        exog = []
        labels = []
        for p in passages:
            score = int(p.score)
            #if score > 90: score = 90
            #if score < 35: score = 35
            endog.append(score)
            x = self.__getFeatherList(p)
            exog.append(x)
            labels.append(p.label)
            f.write(p.id + ' ')
            f.write(str(p.score))
            for v in x:
                f.write(' ' + str(v))
            f.write('\n')
        f.close()

        # SVM classifier training (disabled)
        #self.svm_model = svmutil.svm_train(labels, exog, '-c 3')

        # Linear regression training
        endog = np.array(endog)
        exog = np.array(exog)
        # (PCA/centering experiment, left disabled)
        #self.m = np.mean(exog, axis=0)
        #T, P, e_var = PCA_svd(exog)
        #r, c = P.shape
        #for i in xrange(11, r):
        #    for j in xrange(0, c):
        #        P[i, j] = 0
        #self.p = P
        #xexog = np.dot(P, exog.transpose())
        #xxexog = xexog.transpose()
        # NOTE: relies on an older statsmodels API where the model keeps
        # its fitted parameters, so model.predict(exog) works after fit()
        self.gls_model = sm.GLS(endog, exog)
        self.gls_model.fit()
        #print self.gls_model.results.params

    def tokenCountFilter(self, passage):
        # Adjust by essay length (token count)
        filter = 0
        if passage.lf.tokenCount < 100:
            filter = passage.rateScore * 0.2
        elif passage.lf.tokenCount < 120:
            filter = passage.rateScore * 0.1
        return -filter

    def sentenceLengthAverageFilter(self, passage):
        # Adjust by average sentence length
        filter = 0
        slv = passage.lf.sentenceLengthAverage
        if slv < 10:
            filter = (10 - slv) * 2
            if filter > 6:
                filter = 6
        elif slv > 23:
            filter = (slv - 23) * 3
            if filter > 9:
                filter = 9
        return -filter

    def wordLengthAverageFilter(self, passage):
        # Adjust by average word length
        filter = 0
        wlv = passage.lf.wordLengthAverage
        if wlv < 4:
            filter = (4 - wlv) * 10
        return -filter

    def aclWordCountFilter(self, passage):
        # Adjust by academic word count
        filter = 0
        acl = passage.lf.aclWordCount
        if acl > 9:
            filter = passage.rateScore * 0.1
        return filter

    def noneStopWordLengthAverageFilter(self, passage):
        # Adjust by average content-word (non-stopword) length
        filter = 0
        rwlv = passage.lf.noneStopWordLengthAverage
        if rwlv < 5.5:
            filter = (5.5 - rwlv) * 10
        return -filter

    def nounRatioFilter(self, passage):
        # Adjust by part-of-speech (noun) ratio
        filter = 0
        nr = passage.lf.nounRatio
        if nr < 0.2:
            filter = (0.2 - nr) * 100
        elif nr > 0.35:
            filter = (nr - 0.35) * 100
        return -filter

    def verbRatioFilter(self, passage):
        filter = 0
        vr = passage.lf.verbRatio
        if vr < 0.1:
            filter = (0.1 - vr) * 200
        elif vr > 0.2:
            filter = (vr - 0.2) * 200
        return -filter

    def adjRatioFilter(self, passage):
        filter = 0
        ar = passage.lf.adjRatio
        if ar < 0.045:
            filter = (0.045 - ar) * 500
        return -filter

    def posRatioFilter(self, passage):
        filter = 0
        badRatioCount = 0
        offsetRatio = 0
        nr = passage.lf.nounRatio
        vr = passage.lf.verbRatio
        ar = passage.lf.adjRatio
        if (nr < 0.2) or (nr > 0.3):
            badRatioCount += 1
        else:
            offsetRatio += abs(nr - 0.25) / 0.1
        if (vr < 0.1) or (vr > 0.2):
            badRatioCount += 1
        else:
            offsetRatio += abs(vr - 0.15) / 0.1
        if (ar < 0.06) or (ar > 0.15):
            badRatioCount += 1
        else:
            offsetRatio += abs(ar - 0.105) / 0.15
        if badRatioCount == 0:
            if offsetRatio < 0.1:
                filter = passage.rateScore * 0.05
        elif badRatioCount == 1:
            if offsetRatio > 0.6:
                filter = -passage.rateScore * 0.05
        elif badRatioCount > 1:
            filter = -passage.rateScore * 0.02 * badRatioCount * badRatioCount
        passage.offsetRatio = offsetRatio
        return filter

    def lsaFilter(self, passage):
        # Adjust by content similarity (LSA)
        filter = 0
        if passage.cf.lsaSimilarity < 82:
            filter = (passage.cf.lsaSimilarity - 82) * 1.5
        return filter

    def rate(self, passage):
        # Linear prediction
        if not passage.preprocessed:
            essayprepare.processPassage(passage)
        passage.lf = self.extractor.extractLangFeather(passage)
        passage.cf = self.extractor.extractContentFeather(passage)
        passage.sf = self.extractor.extractStructureFeather(passage)
        passage.lsaScore = passage.cf.lsaScore
        passage.lsaSimilarity = passage.cf.lsaSimilarity
        passage.lsaSimilarityAll = passage.cf.lsaSimilarityAll

        exog = np.array([self.__getFeatherList(passage)])
        #xxexog = np.dot(self.p, exog.transpose())
        #endog = self.gls_model.predict(xxexog.transpose())
        endog = self.gls_model.predict(exog)
        passage.rateScore = endog[0]
        passage.endogScore = endog[0]

        # Adjust the score
        passage.filter_scores = []
        filters = [self.tokenCountFilter, self.sentenceLengthAverageFilter,
                   self.wordLengthAverageFilter, self.aclWordCountFilter,
                   self.noneStopWordLengthAverageFilter, self.nounRatioFilter,
                   self.verbRatioFilter, self.adjRatioFilter,
                   self.posRatioFilter, self.lsaFilter]
        for flt in filters:
            filter_score = flt(passage)
            passage.rateScore += filter_score
            passage.filter_scores.append(filter_score)

        self.generateRateResult(passage)
        passage.rated = True
        endog[0] = passage.rateScore
        return [passage.rateScore]

    def predict(self, passages):
        # Extract feathers
        for p in passages:
            if not p.preprocessed:
                essayprepare.processPassage(p)
            p.lf = self.extractor.extractLangFeather(p)
            p.cf = self.extractor.extractContentFeather(p)
            p.sf = self.extractor.extractStructureFeather(p)

        # Dump feather values and build feather vectors
        f = open('fs_test.txt', 'w')
        endog = []
        exog = []
        labels = []
        for p in passages:
            score = int(p.score)
            if score < 35:
                score = 35
            endog.append(score)
            x = self.__getFeatherList(p)
            exog.append(x)
            labels.append(p.label)
            f.write(p.id + ' ')
            f.write(str(p.score))
            for v in x:
                f.write(' ' + str(v))
            f.write('\n')
        f.close()

        p_label, p_acc, p_val = svmutil.svm_predict(labels, exog,
                                                    self.svm_model)
        print p_label, p_acc, p_val

    def generateRateResult(self, passage):
        rateResult = {}
        rateResult['score'] = passage.rateScore
        rateResult['sentences'] = []
        for para in passage.paragraphs:
            for sent in para.sentences:
                sentence = {}
                sentence['sentenceNo'] = sent.sentenceNo
                sentence['sentence'] = sent.sentence
                sentence['tokens'] = [token.token for token in sent.tokens]
                sentence['tags'] = [token.pos for token in sent.tokens]
                sentence['spellerror'] = [token.token for token in sent.tokens
                                          if token.isSpellError]
                sentence['ltCheckResults'] = sent.ltCheckResults
                sentence['lgCheckResult'] = sent.canParsed
                sentence['complexity'] = sent.complexity
                rateResult['sentences'].append(sentence)
        passage.rateResult = rateResult
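
# A minimal train/rate sketch for CollegeEssayRater, assuming the pickled
# USTC passages above carry `score` and `label` attributes; one passage is
# held out and rated against the model fitted on the rest.
def demo_college_rater():
    pkfile = open('ustcpassages_503.pkl', 'rb')
    passages = pickle.load(pkfile)
    pkfile.close()

    heldout, training = passages[0], passages[1:]
    rater = CollegeEssayRater()
    rater.train(training)                 # fits the GLS model in place
    print rater.rate(heldout)             # [adjusted score]
    print heldout.rateResult['score']     # same score via generateRateResult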
class NeuroRater():

    def __init__(self):
        self.models = {}        # holds all essay models
        self.gls_model = None   # linear regression model
        self.extractor = None   # feather extractor
        self.svm_model = None   # SVM classifier

    def __trainModel(self, passages, model):
        pass

    def __loadModel(self, modelFilename):
        pass

    def __getFeatherList(self, passage):
        fs = []
        fs.append(1)
        fs.append(passage.lf.sentenceErrorCount)
        fs.append(passage.lf.spellErrorCount)
        #fs.append(passage.lf.ltErrorCount)
        fs.append(passage.lf.prepositionUse)
        fs.append(passage.lf.definiteArticleUse)
        fs.append(passage.lf.wordCombRecurrentCount)
        #fs.append(passage.lf.tokenCount)
        #fs.append(passage.lf.wordTypeCount)
        fs.append(passage.lf.wordStemCount)
        fs.append(passage.lf.wordLengthAverage)
        #fs.append(passage.lf.wordLengthSD)
        fs.append(passage.lf.wordTypeRatio)
        fs.append(passage.lf.indexOfGuiraud)
        #for x in passage.lf.wordCountInLevels:
        #    fs.append(x)
        fs.append(passage.lf.gerundCount)
        #fs.append(passage.lf.gerundRatio)
        #fs.append(passage.lf.sentenceLengthAverage)
        #fs.append(passage.lf.sentenceLengthSD)
        #fs.append(passage.lf.automatedReadabilityIndex)
        fs.append(passage.lf.sentenceComplexity)
        #fs.append(passage.lf.sentenceComplexityScale)
        #fs.append(passage.cf.lsaSimilarity)
        #fs.append(passage.cf.proceduralVocabularyCount)
        fs.append(passage.cf.keywordCover)
        fs.append(passage.sf.connectiveCount)
        #fs.append(passage.sf.connectiveRatio)
        #fs.append(passage.sf.specialDemonstrativePronounCount)
        #fs.append(passage.sf.specialDemonstrativePronounUse)
        #fs.append(passage.sf.restPronounCount)
        #fs.append(passage.sf.restPronounUse)
        fs.append(passage.lf.highLowLevelRatio)
        fs.append((passage.lf.wordCountInLevels[3] + passage.lf.wordCountInLevels[4]) * 1.0
                  / (passage.lf.wordCountInLevels[1] + passage.lf.wordCountInLevels[2]))
        fs.append(passage.lf.overlyUseWordCount)
        return fs

    def train(self, passages):
        # Preprocess passages
        i = 1
        for p in passages:
            #print "Passage ", i
            if not p.preprocessed:
                essayprepare.processPassage(p)
            i += 1

        # Train the essay model (highest-scored passages first)
        passages.sort(key=lambda p: p.score, reverse=True)
        model = EssayModel()
        model.train(passages)
        self.models['1'] = model
        #print model.triGramDicts

        # Extract feathers
        self.extractor = FeatherExtractor(model)
        for p in passages:
            p.lf = self.extractor.extractLangFeather(p)
            p.cf = self.extractor.extractContentFeather(p)
            p.sf = self.extractor.extractStructureFeather(p)

        # Dump feather values and build feather vectors
        f = open('fs_train.txt', 'w')
        endog = []
        exog = []
        labels = []
        for p in passages:
            score = int(p.score)
            #if score > 95: score = 95
            if score < 40:
                score = 40
            endog.append(score)
            x = self.__getFeatherList(p)
            exog.append(x)
            labels.append(p.label)
            f.write(p.id + ' ')
            f.write(str(p.score))
            for v in x:
                f.write(' ' + str(v))
            f.write('\n')
        f.close()

        # SVM classifier training (disabled)
        #self.svm_model = svmutil.svm_train(labels, exog, '-c 3')

        # Linear regression training
        endog = np.array(endog)
        exog = np.array(exog)
        # (PCA/centering experiment, left disabled)
        #self.m = np.mean(exog, axis=0)
        #T, P, e_var = PCA_svd(exog)
        #r, c = P.shape
        #for i in xrange(11, r):
        #    for j in xrange(0, c):
        #        P[i, j] = 0
        #self.p = P
        #xexog = np.dot(P, exog.transpose())
        #xxexog = xexog.transpose()
        self.gls_model = sm.GLS(endog, exog)
        self.gls_model.fit()
        #print self.gls_model.results.params

    def rate(self, passage):
        # Linear prediction
        if not passage.preprocessed:
            essayprepare.processPassage(passage)
        passage.lf = self.extractor.extractLangFeather(passage)
        passage.cf = self.extractor.extractContentFeather(passage)
        passage.sf = self.extractor.extractStructureFeather(passage)

        exog = np.array([self.__getFeatherList(passage)])
        #xxexog = np.dot(self.p, exog.transpose())
        #endog = self.gls_model.predict(xxexog.transpose())
        endog = self.gls_model.predict(exog)
        passage.rateScore = endog[0]
        passage.endogScore = endog[0]

        # Adjust the score
        # Adjust by essay token count
        if passage.lf.tokenCount < 100:
            passage.rateScore *= 0.8
        elif passage.lf.tokenCount < 120:
            passage.rateScore *= 0.9

        # Adjust by average sentence length
        filter = 0
        slv = passage.lf.sentenceLengthAverage
        if slv < 10:
            filter = (10 - slv) * 2
            if filter > 6:
                filter = 6
        elif slv > 23:
            filter = (slv - 23) * 3
            if filter > 9:
                filter = 9
        passage.rateScore -= filter

        # Adjust by average word length
        filter = 0
        wlv = passage.lf.wordLengthAverage
        if wlv < 4:
            filter = (4 - wlv) * 10
        passage.rateScore -= filter

        # Adjust by average content-word length
        filter = 0
        rwlv = passage.lf.noneStopWordLengthAverage
        if rwlv < 5.5:
            filter = (5.5 - rwlv) * 10
        passage.rateScore -= filter

        # Adjust by part-of-speech ratios
        filter = 0
        nr = passage.lf.nounRatio
        if nr < 0.2:
            filter = (0.2 - nr) * 100
        elif nr > 0.35:
            filter = (nr - 0.35) * 100
        passage.rateScore -= filter

        filter = 0
        vr = passage.lf.verbRatio
        if vr < 0.1:
            filter = (0.1 - vr) * 200
        elif vr > 0.2:
            filter = (vr - 0.2) * 200
        passage.rateScore -= filter

        filter = 0
        ar = passage.lf.adjRatio
        if ar < 0.045:
            filter = (0.045 - ar) * 500
        passage.rateScore -= filter

        filter = 0
        badRatioCount = 0
        offsetRatio = 0
        if (nr < 0.2) or (nr > 0.3):
            badRatioCount += 1
        else:
            offsetRatio += abs(nr - 0.25) / 0.1
        if (vr < 0.1) or (vr > 0.2):
            badRatioCount += 1
        else:
            offsetRatio += abs(vr - 0.15) / 0.1
        if (ar < 0.06) or (ar > 0.13):
            badRatioCount += 1
        else:
            offsetRatio += abs(ar - 0.095) / 0.14
        if badRatioCount == 0:
            if offsetRatio < 0.1:
                filter = passage.rateScore * 0.05
        elif badRatioCount == 1:
            if offsetRatio > 0.6:
                filter = -passage.rateScore * 0.05
        elif badRatioCount > 1:
            filter = -passage.rateScore * 0.02 * badRatioCount * badRatioCount
        passage.rateScore += filter
        passage.offsetRatio = offsetRatio

        # Adjust by content similarity (LSA)
        if (passage.cf.lsaScore > 75) and (passage.cf.lsaSimilarity > 89) \
                and (passage.rateScore > 75):
            passage.rateScore += 5
        if ((passage.cf.lsaScore < 70) and (passage.rateScore < 70)) \
                and (passage.cf.lsaSimilarity > 89):
            passage.rateScore -= 5
        filter = 0
        if ((passage.cf.lsaSimilarity <= 80) and (passage.cf.lsaSimilarity > 60)) \
                or ((passage.cf.lsaSimilarityAll <= 56) and (passage.cf.lsaSimilarityAll > 32)):
            filter = 15 - abs(passage.cf.lsaSimilarity - 70) / 3.0
        #if passage.rateScore < passage.cf.lsaScore:
        #    passage.rateScore = passage.cf.lsaScore
        passage.rateScore += filter

        self.generateRateResult(passage)
        passage.rated = True
        endog[0] = passage.rateScore
        return [passage.rateScore]

    def predict(self, passages):
        # Extract feathers
        for p in passages:
            if not p.preprocessed:
                essayprepare.processPassage(p)
            p.lf = self.extractor.extractLangFeather(p)
            p.cf = self.extractor.extractContentFeather(p)
            p.sf = self.extractor.extractStructureFeather(p)

        # Dump feather values and build feather vectors
        f = open('fs_test.txt', 'w')
        endog = []
        exog = []
        labels = []
        for p in passages:
            score = int(p.score)
            if score < 35:
                score = 35
            endog.append(score)
            x = self.__getFeatherList(p)
            exog.append(x)
            labels.append(p.label)
            f.write(p.id + ' ')
            f.write(str(p.score))
            for v in x:
                f.write(' ' + str(v))
            f.write('\n')
        f.close()

        p_label, p_acc, p_val = svmutil.svm_predict(labels, exog,
                                                    self.svm_model)
        print p_label, p_acc, p_val

    def generateRateResult(self, passage):
        rateResult = {}
        rateResult['score'] = passage.rateScore
        rateResult['sentences'] = []
        for para in passage.paragraphs:
            for sent in para.sentences:
                sentence = {}
                sentence['sentenceNo'] = sent.sentenceNo
                sentence['sentence'] = sent.sentence
                sentence['tokens'] = [token.token for token in sent.tokens]
                sentence['tags'] = [token.pos for token in sent.tokens]
                sentence['spellerror'] = [token.token for token in sent.tokens
                                          if token.isSpellError]
                sentence['ltCheckResults'] = sent.ltCheckResults
                sentence['lgCheckResult'] = sent.canParsed
                sentence['complexity'] = sent.complexity
                rateResult['sentences'].append(sentence)
        passage.rateResult = rateResult
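
# A small worked example of the sentence-length adjustment shared by the
# raters above: deductions grow linearly outside the 10-23 token band and
# are capped (6 points on the short side, 9 on the long side). The sample
# averages are illustrative.
def demo_sentence_length_filter():
    for slv in [7.0, 10.0, 18.0, 25.0, 40.0]:
        filter = 0
        if slv < 10:
            filter = min((10 - slv) * 2, 6)
        elif slv > 23:
            filter = min((slv - 23) * 3, 9)
        print slv, -filter
    # 7.0 -> -6 (capped), 10.0 -> 0, 18.0 -> 0, 25.0 -> -6, 40.0 -> -9 (capped)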