Example No. 1
def demo_one():
    content = """At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places. At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places. At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places. At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places."""

    # Passage
    passage = EssayPassage()
    passage.passage = content
    passage.title = 'title'
    passage.score = 5
    passage.id = '1'
    passage.reviewerId = 3
    passage.content = content

    r = SimpleEssayRater()
    s = r.rate_by_params(passage)
    passage.newscore = s[0]
    print passage.id, passage.score, s

    print 'OK'
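
Every example on this page repeats the same block of EssayPassage field assignments. A small factory helper keeps that boilerplate in one place; the sketch below is Python 3 and uses a stand-in class, since the real EssayPassage comes from this project's own modules:

class EssayPassage(object):
    """Stand-in for the project's EssayPassage container (assumption:
    the real class is a plain attribute bag)."""
    pass

def make_passage(content, title='title', score=0, pid='1', reviewer_id=None):
    # Populate the fields the raters and extractors read in these examples.
    p = EssayPassage()
    p.passage = content        # text fed to the pipeline
    p.content = content        # original (possibly marked-up) text
    p.title = title
    p.score = score            # human-assigned score, if any
    p.id = pid
    p.reviewerId = reviewer_id
    return p

passage = make_passage('I am a students.', score=5, reviewer_id=3)
print(passage.id, passage.score)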
Example No. 2
def demo_one_sentence():
    # Passage
    passage = EssayPassage()
    passage.passage = 'I am a students.'
    passage.title = 'title'
    passage.score = 5
    passage.id = '1'
    passage.reviewerId = 3
    passage.content = 'I am a students.'

    # Process the passage
    essayprepare.processPassage(passage)

    extractor = FeatherExtractor()
    lf = extractor.extractLangFeather(passage)
    passage.lf = lf
    cf = extractor.extractContentFeather(passage)
    sf = extractor.extractStructureFeather(passage)

    print 'OK'
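
The call order in this demo is the pipeline's contract: essayprepare.processPassage annotates the passage with paragraphs, sentences, and tokens, and only then can the FeatherExtractor methods read those annotations. A minimal Python 3 sketch of that wrapper, with stubs standing in for the project classes:

def analyze(passage, prepare, extractor):
    # prepare must run first: the extractors read its annotations.
    prepare(passage)
    passage.lf = extractor.extractLangFeather(passage)       # language features
    passage.cf = extractor.extractContentFeather(passage)    # content features
    passage.sf = extractor.extractStructureFeather(passage)  # structure features
    return passage

# Stubs so the sketch runs on its own; the real implementations are
# essayprepare.processPassage and FeatherExtractor.
class StubExtractor(object):
    def extractLangFeather(self, p): return 'lf'
    def extractContentFeather(self, p): return 'cf'
    def extractStructureFeather(self, p): return 'sf'

class Passage(object):
    pass

p = analyze(Passage(), lambda p: None, StubExtractor())
print(p.lf, p.cf, p.sf)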
Example No. 3
def simlarityTest():
    pkfile = open('rater.pkl', 'rb')
    rater = pickle.load(pkfile)
    pkfile.close()
    
    essays = CLECReader.parseCLECFile2('clecst/ST3.txt')
    print len(essays)
    
    essayDict = {}
    
    for e in essays:
        if e.title not in essayDict:
            essayDict[e.title] = []
        essayDict[e.title].append(e)
        
    print essayDict.keys()
    
    for k, v in essayDict.items():
        print len(v), k
        
    passages = []
    count = 0 
    for e in essayDict['Global Shortage of Fresh Water'][:120]:  
        count += 1     
        newpassage = EssayPassage()
        newpassage.passage = e.content
        newpassage.id = str(count) 
        newpassage.score = e.score
        newpassage.processStatus = 0
        passages.append(newpassage)
        rater.rate(newpassage)

    for p in passages:
        print p.score, p.rateScore, p.lsaSimilarity, p.lsaSimilarityAll
    
    print "OK"
Example No. 4
def generateUSTCFeathers(ustcFilename, outFilename):
    essays = USTCReader.parseUSTCFile(ustcFilename)

    passages = []

    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        passages.append(passage)

    generatePassageFeathers(passages[:], outFilename)

    pkfile = open('ustcpassages_503_lt.pkl', 'wb')
    pickle.dump(passages, pkfile)
    pkfile.close()
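
Pickle files should be opened in binary mode, and a context manager closes them even on error; the examples on this page predate that idiom. A stdlib-only sketch of the safer save/load round trip:

import pickle

passages = [{'id': '1', 'score': 5}]  # any picklable objects

# 'wb'/'rb' avoid text-mode corruption; HIGHEST_PROTOCOL is compact.
with open('ustcpassages.pkl', 'wb') as pkfile:
    pickle.dump(passages, pkfile, protocol=pickle.HIGHEST_PROTOCOL)

with open('ustcpassages.pkl', 'rb') as pkfile:
    restored = pickle.load(pkfile)
print(restored)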
Example No. 5
def demo2():
    essays = USTCReader.parseUSTCFile("USTC2011Jan.txt")
    trains = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        trains.append(passage)
        
#    for p in trains[:30]:
#        essayprepare.processPassage(p)
        
    for p in trains[:100]:
        # Split into paragraphs
        print "+++++++++++++++++++++++"
        paras = essayprepare.para_tokenizer.tokenize(p.content)
        pcount1 = len(paras)
        scount1 = 0
        for para in paras:
            sents = essayprepare.markedSentenceTokenize(para)
            #            for sent in sents:
            #                print "### ", sent
            scount1 += len(sents)
        print "-----------------------"
        paras = essayprepare.para_tokenizer.tokenize(p.passage)
        pcount2 = len(paras)
        scount2 = 0
        for para in paras:
            sents = essayprepare.sent_tokenizer.tokenize(para)
            #            for sent in sents:
            #                print "### ", sent
            scount2 += len(sents)
        if pcount1 != pcount2 or scount1 != scount2:
            print p.content
            print p.passage
        print "\n"
            	
#    for i, p in enumerate(trains[:30]):
#    	for para in p.paragraphs:
#    		for sent in para.sentences:
#    			for token in sent.tokens:
#    				if token.isSpellError:
#    					print token.token, token.candidates
#    	for m in essays[i].findMarks():
#    		if m[0] == 'fm1' or m[0] == 'fm2' or m[0] == 'sw':
#    			print m
        
#    egrammar_vp1 = "EVP1: {<NN><RB>?<VB>}"
#    parser_evp1 = nltk.RegexpParser(egrammar_vp1)    
#
#    for p in trains[:50]:
#        for para in p.paragraphs:
#            for sent in para.sentences:
#                sentence = [(token.token, token.pos) for token in sent.tokens]
#                result = parser_evp1.parse(sentence)
#                r = str(result)
#                if r.find('EVP1') > 0: print r
        
    print "demo2 over" 
Example No. 6
    #wordRepetitiveDemo()
    #demo2()
    demo_one_sentence()
    exit()

    essays = USTCReader.parseUSTCFile('USTC2011Jan.txt')
    print len(essays)

    essay = None
    for e in essays:
        if e.id == "0092":
            essay = e
            break

    # Passage
    passage = EssayPassage()
    passage.passage = essay.cleanContent()
    passage.title = essay.title
    passage.score = essay.score
    passage.id = essay.id
    passage.reviewerId = essay.reviewerId
    passage.content = essay.content

    # Process the passage
    essayprepare.processPassage(passage)

    # Print it to see what it looks like
    print "PASSAGE========================================="
    print passage
    print passage.id
    print passage.title
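
The search-and-break loop that picks out essay "0092" can be collapsed with next() and a generator expression, which also gives a clean default when the id is absent. A self-contained sketch with a toy Essay class:

class Essay(object):
    def __init__(self, eid):
        self.id = eid

essays = [Essay('0090'), Essay('0091'), Essay('0092')]

# next() with a default replaces the manual loop-and-break search.
essay = next((e for e in essays if e.id == '0092'), None)
print(essay.id if essay else 'not found')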
Example No. 7
def do_task(task):
    newpassage = EssayPassage()
    newpassage.passage = task['input']['content']
    newpassage.orderId = task['id']
    newpassage.score = 0
    newpassage.processStatus = 0
    try:
        essayprepare.processPassage(newpassage, fn_prepare_progress)
        fe = extractor.FeatherExtractor()
        lf = fe.extractLangFeather(newpassage)
        newpassage.lf = lf
        cf = fe.extractContentFeather(newpassage)
        newpassage.cf = cf
        sf = fe.extractStructureFeather(newpassage) 
        newpassage.sf = sf
        newpassage.score = rater.rate_by_params(newpassage)[0]
    except Exception:
        task['progress'] = -2
        task['status'] = 'TUTERR'
        task['output'] = ""
        task['simple_output'] = ""
        task['detail_output'] = ""
        commit_task(task)
        return

    # Build the final output
    output = {}
    passage = {}
    passage['score'] = newpassage.score
    passage['token_count'] = lf.tokenCount
    passage['word_count'] = lf.wordCount
    passage['word_type_count'] = lf.wordTypeCount
    passage['word_lemma_count'] = lf.wordLemmaCount
    passage['word_stem_count'] = lf.wordStemCount
    passage['average_word_length'] = lf.wordLengthAverage
    passage['average_sentence_length'] = lf.sentenceLengthAverage
    passage['overly_use_word_count'] = lf.overlyUseWordCount
    passage['paragraph_count'] = len(newpassage.paragraphs)
    passage['sentence_count'] = newpassage.sentenceCount
    passage['sentences'] = []
    for para in newpassage.paragraphs:
        for sent in para.sentences:
            sentence = {}
            sentence['no'] = sent.sentenceNo
            sentence['para_no'] = para.paragraphNo
            sentence['original'] = sent.sentence
            sentence['score'] = 0
            spell_errors = []
            fs = []
            for token in sent.tokens:
                if token.isSpellError:
                    fs.append('<ESP>' + token.token + '</ESP>')
                    spell_error = {}
                    spell_error['token'] = token.token
                    spell_error['lemma'] = token.lemma
                    spell_error['suggest'] = token.candidates
                    spell_error['start_at'] = token.startAt
                    spell_error['end_at'] = token.endAt
                    spell_errors.append(spell_error)
                else:
                    fs.append(token.token)
            sentence['spell_errors'] = spell_errors
            sentence['marked'] = ' '.join(fs)
            sentence['lt_result'] = sent.ltCheckResults   
            sentence['lg_result'] = sent.lgCheckResults
            sentence['links'] = []
            passage['sentences'].append(sentence)
           
    output['passage'] = passage
    task['progress'] = 100
    task['status'] = 'DONE'
    task['output'] = json.dumps(output)
    task['simple_output'] = json.dumps(output)    
    task['detail_output'] = json.dumps(generate_detail_output(newpassage))   
        
    commit_task(task)
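
The JSON payload that do_task commits is easier to see on a toy example. The field names below are the ones assigned above; the values are invented for illustration:

import json

output = {
    'passage': {
        'score': 72.5,
        'token_count': 5,
        'paragraph_count': 1,
        'sentence_count': 1,
        'sentences': [{
            'no': 1,
            'para_no': 1,
            'original': 'I am a students.',
            'marked': 'I am a <ESP>students</ESP> .',
            'spell_errors': [{'token': 'students',
                              'suggest': ['student'],
                              'start_at': 7,
                              'end_at': 15}],
            'links': [],
        }],
    }
}
print(json.dumps(output, indent=2))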
Example No. 8
def processEssay(self):
    self.browser.clear()
    id = unicode(self.lineedit.text())
    essay = self.essayDict.get(id)
    if not essay:
        self.browser.append("<font color=red>%s is not found!</font>" % id)
        return

    self.browser.append(essay.content)

    # Passage
    passage = EssayPassage()
    passage.passage = essay.cleanContent()
    passage.title = essay.title
    passage.score = essay.score
    passage.id = essay.id

    # Process the passage
    essayprepare.processPassage(passage)

    # Print it to see what it looks like
    self.browser.append("PASSAGE=========================================")
    self.browser.append(passage.id)
    #self.browser.append(passage.title)
    self.browser.append(str(passage.score))
    self.browser.append(passage.passage)
    self.browser.append(str(len(passage.paragraphs)))
    self.browser.append("PARAGRAPHS---------------------------------------")
    for para in passage.paragraphs:
        self.browser.append(str(para.paragraphNo))
        self.browser.append(para.paragraph)
        for sent in para.sentences:
            self.browser.append(str(sent.sentenceNo))
            self.browser.append(str(sent.paragraphSentenceNo))
            self.browser.append(sent.sentence)
            tokens = [token.token for token in sent.tokens]
            tags = [token.pos for token in sent.tokens]
            lemmas = [token.lemma for token in sent.tokens]
            stems = [token.stem for token in sent.tokens]
            levels = [token.level for token in sent.tokens]
            nos = [token.tokenNo for token in sent.tokens]
            sentNos = [token.sentenceTokenNo for token in sent.tokens]
            paraNos = [token.paragraphTokenNo for token in sent.tokens]
            errorTokens = [token.token for token in sent.tokens if token.isSpellError]
            if not sent.canParsed:
                self.browser.append("<font color=red>SENTENCE ERROR</font>")
            self.browser.append("<font color=red>SPELLERROR %s</font>" % str(errorTokens))
            self.browser.append(str(tokens))
            self.browser.append(str(tags))
            self.browser.append(str(lemmas))
            self.browser.append(str(stems))
            self.browser.append(str(levels))
            self.browser.append(str(sentNos))
            self.browser.append(str(paraNos))
            self.browser.append(str(nos))
            self.browser.append(str(sent.tokenCount))
            self.browser.append(str(sent.wordCount))
            self.browser.append(str(sent.realWordCount))

    self.browser.append(u"trigrams" + ' ' + str(passage.trigrams))

    e = FeatherExtractor()

    # Extract language features
    languageFeather = e.extractLangFeather(passage)

    print u"token count", languageFeather.tokenCount
    print u"word count", languageFeather.wordCount
    print u"word type count", languageFeather.wordTypeCount
    print u"word lemma count", languageFeather.wordLemmaCount

    print u"preposition count", languageFeather.prepositionCount
    print u"preposition ratio", languageFeather.prepositionRatio
    print u"preposition use", languageFeather.prepositionUse

    print u"definite article count", languageFeather.definiteArticleCount
    print u"definite article ratio", languageFeather.definiteArticleRatio
    print u"definite article use", languageFeather.definiteArticleUse

    # Extract structural features
    #structureFeather = e.extractStructureFeather(passage)

    #generateUSTCFeathers('USTC2011Jan.txt', 'USTCFeathers_503.txt')

    print "...OVER"
Example No. 9
orderId = 1
waitingPassages = []
donePassages = {}

passage = None
while True:
    request = socket.recv()
    print request
    try:
        rs = json.loads(request)
    except Exception:
        socket.send("")
        continue
    if rs['ACTION'] == 'SUBMIT':
        orderId += 1
        newpassage = EssayPassage()
        newpassage.passage = rs['text']
        newpassage.orderId = orderId
        newpassage.score = 0
        newpassage.processStatus = 0
        waitingPassages.append(newpassage)
        if ((not passage) or passage.rated) and len(waitingPassages) > 0:
            passage = waitingPassages.pop(0)
            donePassages[passage.orderId] = passage
            rthread = RatePassageThread(rater, passage)
            rthread.start()
        reply = json.dumps({'orderId': orderId, 'progress': 0, 'rated': 0})
        socket.send_unicode(reply)
    elif rs['ACTION'] == 'QUERY':
        oId = int(rs['orderId'])
        if oId not in donePassages:
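
The recv/send_unicode pairing and the strict one-reply-per-request shape of this loop match a ZeroMQ REP socket (an assumption: the snippet never shows how socket is created). A minimal pyzmq sketch of the same skeleton, with a hypothetical endpoint:

import json
import zmq  # assumes pyzmq (pip install pyzmq)

context = zmq.Context()
socket = context.socket(zmq.REP)      # REP must alternate recv/send
socket.bind('tcp://127.0.0.1:5555')   # hypothetical endpoint

while True:
    request = socket.recv()
    try:
        rs = json.loads(request)
    except ValueError:
        socket.send(b'')              # still owe one reply per request
        continue
    if rs.get('ACTION') == 'SUBMIT':
        reply = json.dumps({'orderId': 1, 'progress': 0, 'rated': 0})
        socket.send_unicode(reply)
    else:
        socket.send_unicode(json.dumps({'error': 'unknown action'}))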
Example No. 10
def demo():
    print "rater demo" 

    # Read the training set
    essays = USTCReader.parseUSTCFile("essayreader/r1_265.txt")
    trains = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)

    # Train the rater
    r = GeneralEssayRater()
    r.train(trains)
    
    pkfile = open('zhang_trains.pkl', 'wb')
    pickle.dump(trains, pkfile)
    pkfile.close()  


    # Read the test set
    essays = USTCReader.parseUSTCFile("USTC2011Jan_Parallel_Zhang.txt")
    tests = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        tests.append(passage)

    # Rate the test set
    for p in tests:
        s = r.rate(p)
        p.newscore = s[0]
        print p.id, p.score, s
        
    for p in tests:
        print p.id, p.score, p.newscore

    
    pkfile = open('zhang_tests.pkl', 'wb')
    pickle.dump(tests, pkfile)
    pkfile.close()  
        
    print "demo over!!!"
Example No. 11
def demo2():
    print "rater demo2" 

    # Read the training set
    essays = USTCReader.parseUSTCFile("USTC2011Jan.txt")
    trains = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)

    # Train the rater
    r = CollegeEssayRater()
    r.train(trains)
    
    pkfile = open('USTC2011Jan.pkl', 'wb')
    pickle.dump(trains, pkfile)
    pkfile.close()  
    
    exit()  # NOTE: everything below is unreachable until this exit() is removed

    # Read the test set
    essays = USTCReader.parseUSTCFile("USTC2011Jan-tfidf.txt")
    tests = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        tests.append(passage)

    # Rate the test set
#    for p in tests:
#        s = r.rate(p)
#        p.newscore = s[0]
#        print p.id, p.score, s
#        
#    for p in tests:
#        print p.id, p.score, p.newscore
        
    print "SVM......"
    r.predict(tests)
    
    pkfile = open('ustc_test.pkl', 'wb')
    pickle.dump(tests, pkfile)
    pkfile.close()  
        
    print "demo2 over!!!"
Example No. 12
def demo_crossvalidate_zhang():
    print "rater demo_crossvalidate_zhang"
    
    # Read the training set
    essays = USTCReader.parseUSTCFile("essayreader/r1_265.txt")
    trains = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)

    # Read the test set
    essays = USTCReader.parseUSTCFile("USTC2011Jan_Parallel_Zhang.txt")
    tests = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        tests.append(passage)
    

    passages = []
    passages.extend(trains)
    passages.extend(tests)

    random.shuffle(passages)    
    
    scoreEssays = {}
    for p in passages:
        if p.score < 35: p.score = 35
        label = (int(p.score) + 2) // 5 - 4  # explicit integer division
        if label < 3: 
            label = 3
            #continue
        if label > 14: label = 14
        p.label = label
        if label not in scoreEssays:
            scoreEssays[label] = []
        scoreEssays[label].append(p)


    # cross validate

    ps = [[], [], [], [], []]
    left = []

    for k, v in scoreEssays.items():
        print k
        print len(v)
        if len(v) > 5:
            s = len(v) // 5
            for i in range(5):
                ps[i].extend(v[i*s: (i+1)*s])
            left.extend(v[5*s:])
        else:
            left.extend(v)
    for j in range(len(left)):
        ps[j % 5].append(left[j])
    
    print "data sets: "
    for v in ps:
        print len(v)
    

    for i in range(5):
        trains = []
        tests = []
        
        for j in range(5):
            if i == j:
                tests.extend(ps[j])
            else:
                trains.extend(ps[j])
        
        r = CollegeEssayRater()       
        r.train(trains)
        
        for p in tests:
            s = r.rate(p)
            p.newscore = s[0]
            print p.id, p.score, s
        
    s1 = []
    s2 = []    
    for p in passages:
        if p.label < 3: continue
        s1.append(int(p.score))
        s2.append(p.newscore)
        print p.id, p.score, p.endogScore, int(round(p.newscore)), p.score - int(round(p.newscore)), \
        p.lsaScore, p.lsaSimilarity, p.lsaSimilarityAll, p.lf.tokenCount, \
        p.lf.sentenceLengthAverage, p.lf.wordLengthAverage, p.lf.noneStopWordLengthAverage, \
        p.lf.nounRatio, p.lf.verbRatio, p.lf.adjRatio, p.lf.sentenceLengthSD, p.offsetRatio, \
        p.lf.aclWordCount, p.lf.aclWordRatio
        
    print scistats.pearsonr(s1, s2)      
    
    pkfile = open('zhang_all.pkl', 'wb')
    pickle.dump(passages, pkfile)
    pkfile.close()    
        
    print "demo_crossvalidate over!!!"    