def demo_one():
    """Rate one hard-coded sample essay and print the result.

    Builds an ``EssayPassage`` whose ``passage`` and ``content`` both hold
    the same sample text, passes it to ``SimpleEssayRater.rate_by_params``,
    stores the first element of the returned value in ``passage.newscore``
    and prints the passage id, its preset score and the full rating result,
    followed by ``OK``.
    """
    # Sample text: the same two sentences repeated four times, separated by
    # single spaces. The irregular spacing around punctuation is part of
    # the input and must stay as it is.
    unit = ("At present ,more and more students in the college are "
            "encouraged to go to the poor places for aid education . "
            "This activity is of great benefits for both our college "
            "students and the poor places.")
    content = " ".join([unit] * 4)

    # The essay under test
    passage = EssayPassage()
    passage.passage = content
    passage.title = 'title'
    passage.score = 5
    passage.id = '1'
    passage.reviewerId = 3
    passage.content = content

    rater = SimpleEssayRater()
    rating = rater.rate_by_params(passage)
    passage.newscore = rating[0]

    # One pre-formatted string: produces the same space-separated line
    # whether ``print`` is the statement or the function.
    print('%s %s %s' % (passage.id, passage.score, rating))
    print('OK')
def generateUSTCFeathers(ustcFilename, outFilename,
                         pickleFilename='ustcpassages_503_lt.pkl'):
    """Build passages from a USTC essay file, write their features, pickle them.

    Every essay returned by ``USTCReader.parseUSTCFile(ustcFilename)`` is
    wrapped in an ``EssayPassage``. The passages are handed to
    ``generatePassageFeathers`` together with ``outFilename`` and are then
    pickled to ``pickleFilename``.

    Args:
        ustcFilename: path of the USTC essay file to parse.
        outFilename: output path forwarded to ``generatePassageFeathers``.
        pickleFilename: path of the pickle file receiving the passage list.
            Defaults to the file name that used to be hard-coded here, so
            existing two-argument calls behave as before.
    """
    passages = []
    for essay in USTCReader.parseUSTCFile(ustcFilename):
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        passages.append(passage)

    # Shallow copy, as before: the callee gets its own list but works on the
    # same passage objects that are pickled below.
    generatePassageFeathers(passages[:], outFilename)

    # Pickles are byte streams, so the file is opened in binary mode; the
    # ``with`` block closes it even when ``pickle.dump`` raises.
    with open(pickleFilename, 'wb') as pkfile:
        pickle.dump(passages, pkfile)
def demo_one_sentence():
    """Run preprocessing and all three feature extractors on a one-sentence essay.

    The passage is preprocessed with ``essayprepare.processPassage``; then a
    ``FeatherExtractor`` extracts the language, content and structure
    features in that order. Only the language features are kept (as
    ``passage.lf``); the other two results are discarded. Prints ``OK``
    when every step has returned.
    """
    text = 'I am a students.'

    # The essay under test
    passage = EssayPassage()
    passage.passage = text
    passage.title = 'title'
    passage.score = 5
    passage.id = '1'
    passage.reviewerId = 3
    passage.content = text

    # Preprocess the essay
    essayprepare.processPassage(passage)

    extractor = FeatherExtractor()
    # The language features are attached to the passage before the content
    # and structure extractors run.
    passage.lf = extractor.extractLangFeather(passage)
    extractor.extractContentFeather(passage)
    extractor.extractStructureFeather(passage)

    print('OK')
def demo2():
    """Report essays whose two text versions tokenize to different counts.

    Parses ``"USTC2011Jan.txt"`` and wraps every essay in an
    ``EssayPassage`` (``content`` is ``essay.content``, ``passage`` is
    ``essay.cleanContent()``). For each of the first 100 passages, both
    texts are split into paragraphs with ``essayprepare.para_tokenizer``;
    sentences are counted with ``essayprepare.markedSentenceTokenize`` for
    ``content`` and with ``essayprepare.sent_tokenizer`` for ``passage``.
    When the paragraph counts or the sentence counts differ, both texts are
    printed. Separator lines are printed for every passage.
    """
    trains = []
    for essay in USTCReader.parseUSTCFile("USTC2011Jan.txt"):
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        trains.append(passage)

    # NOTE: earlier experiments that were commented out in this function
    # (preprocessing the first passages, dumping spell-error candidates and
    # marks, an "EVP1" chunk grammar) are omitted here.

    for p in trains[:100]:
        # Split the original content into paragraphs and count sentences
        print("+++++++++++++++++++++++")
        content_paras = essayprepare.para_tokenizer.tokenize(p.content)
        content_sent_total = sum(
            len(essayprepare.markedSentenceTokenize(para))
            for para in content_paras)

        # Same counts for the passage text
        print("-----------------------")
        passage_paras = essayprepare.para_tokenizer.tokenize(p.passage)
        passage_sent_total = sum(
            len(essayprepare.sent_tokenizer.tokenize(para))
            for para in passage_paras)

        same_paragraphs = len(content_paras) == len(passage_paras)
        same_sentences = content_sent_total == passage_sent_total
        if not (same_paragraphs and same_sentences):
            print(p.content)
            print(p.passage)
            print("\n")

    print("demo2 over")
print len(essays) essay = None for e in essays: if e.id == "0092": essay = e break # 文章 passage = EssayPassage() passage.passage = essay.cleanContent() passage.title = essay.title passage.score = essay.score passage.id = essay.id passage.reviewerId = essay.reviewerId passage.content = essay.content # 处理文章 essayprepare.processPassage(passage) # 输出来看看是啥样子 print "PASSAGE=========================================" print passage print passage.id print passage.title print passage.score print passage.passage print len(passage.paragraphs) print "PARAGRAPHS---------------------------------------" for para in passage.paragraphs: print para.paragraphNo