Пример #1
0
def sentenceCheckStatsDemo():
    print "sentenceCheckStatsDemo start..."
    pkfile = open('ustcpassages_503_lt.pkl', 'r')
    passages = pickle.load(pkfile)
    pkfile.close()

    sentCount = 0
    errorcount = 0
    lgcorrect = 0
    lgtotal = 0
    ltcorrect = 0
    lttotal = 0
    allcorrect = 0

    for p in passages:
        pltc = 0
        osents = []
        for para in p.paragraphs:
            osents.extend(para.sentences)

        msents = []
        paras = essayprepare.para_tokenizer.tokenize(p.content)
        for para in paras:
            msents.extend(essayprepare.markedSentenceTokenize(para))

        if len(osents) != len(msents):
            print "sentence count not equal", p.id
            print osents
            print msents
            continue

        for si, os in enumerate(osents):
            ms = msents[si]
            mkerror = 1
            lgerror = 1
            lterror = 1
            ltc = 0

            marks = USTCReader.findMarks(ms)
            onlysperror = True
            for mark in marks:
                if not mark[0] in ['fm1', 'fm2', 'sw']:
                    onlysperror = False
                    break
            if onlysperror: mkerror = 0
            #if ms.find('[') < 0 and ms.find(']') < 0:
            #    mkerror = 0
            if os.canParsed:
                lgerror = 0
            if len(os.ltCheckResults) == 0:
                lterror = 0
            else:
                ltc = len(os.ltCheckResults)
                for cr in os.ltCheckResults:
                    if cr['ruleId'] == 'WHITESPACE_RULE':
                        ltc = ltc - 1
                    elif cr['ruleId'] == 'COMMA_PARENTHESIS_WHITESPACE':
                        ltc = ltc - 1
                    elif cr['ruleId'] == 'UPPERCASE_SENTENCE_START':
                        ltc = ltc - 1
                    elif cr['ruleId'] == 'CAN_NOT':
                        ltc = ltc - 1
                    elif cr['ruleId'] == 'EN_QUOTES':
                        ltc = ltc - 1
                if ltc == 0:
                    lterror = 0

            sentCount += 1
            if mkerror == 1: errorcount += 1
            if lgerror == 1:
                lgtotal += 1
                if lgerror == mkerror:
                    lgcorrect += 1
            if lterror == 1:
                lttotal += 1
                if lterror == mkerror:
                    ltcorrect += 1
                    if lterror == lgerror:
                        allcorrect += 1
            pltc += ltc
            #print p.id, p.score, len(os.tokens), mkerror, lgerror, lterror, ltc


#            print ms #, #ms, os.sentence
#            print os.sentence
#            if len(os.ltCheckResults) > 0:
#                for cr in os.ltCheckResults:
#                    print cr
        print p.id, p.score, pltc
    print sentCount, errorcount, lgtotal, lgcorrect, lttotal, ltcorrect, allcorrect

    print "sentenceCheckStatsDemo over!!!"
Пример #2
0
def sentenceCheckStatsDemo():
    print "sentenceCheckStatsDemo start..."
    pkfile = open('ustcpassages_503_lt.pkl', 'r')
    passages = pickle.load(pkfile)
    pkfile.close()
    
    sentCount = 0
    errorcount = 0
    lgcorrect = 0
    lgtotal = 0
    ltcorrect = 0
    lttotal = 0
    allcorrect = 0
    
    for p in passages:
        pltc = 0
        osents = []
        for para in p.paragraphs:
            osents.extend(para.sentences)
    
        msents = []
        paras = essayprepare.para_tokenizer.tokenize(p.content)
        for para in paras:
            msents.extend(essayprepare.markedSentenceTokenize(para))
        
        if len(osents) != len(msents):
            print "sentence count not equal", p.id
            print osents
            print msents
            continue
        
        for si, os in enumerate(osents):
            ms = msents[si]
            mkerror = 1
            lgerror = 1
            lterror = 1
            ltc = 0
            
            marks = USTCReader.findMarks(ms)
            onlysperror = True
            for mark in marks:
                if not mark[0] in ['fm1', 'fm2', 'sw']:
                    onlysperror = False
                    break
            if onlysperror: mkerror = 0
            #if ms.find('[') < 0 and ms.find(']') < 0:
            #    mkerror = 0
            if os.canParsed:
                lgerror = 0
            if len(os.ltCheckResults) == 0:
                lterror = 0
            else:
                ltc = len(os.ltCheckResults)
                for cr in os.ltCheckResults:
                    if cr['ruleId'] == 'WHITESPACE_RULE':
                        ltc = ltc - 1
                    elif cr['ruleId'] == 'COMMA_PARENTHESIS_WHITESPACE':
                        ltc = ltc - 1
                    elif cr['ruleId'] == 'UPPERCASE_SENTENCE_START':
                        ltc = ltc - 1
                    elif cr['ruleId'] == 'CAN_NOT':
                        ltc = ltc - 1
                    elif cr['ruleId'] == 'EN_QUOTES':
                        ltc = ltc - 1
                if ltc == 0:
                    lterror = 0
                
            sentCount += 1
            if mkerror == 1: errorcount += 1
            if lgerror == 1: 
                lgtotal += 1
                if lgerror == mkerror:
                    lgcorrect += 1
            if lterror == 1:
                lttotal += 1
                if lterror == mkerror:
                    ltcorrect += 1
                    if lterror == lgerror:
                        allcorrect += 1
            pltc += ltc      
            #print p.id, p.score, len(os.tokens), mkerror, lgerror, lterror, ltc
#            print ms #, #ms, os.sentence
#            print os.sentence
#            if len(os.ltCheckResults) > 0:
#                for cr in os.ltCheckResults:
#                    print cr
        print p.id, p.score, pltc
    print sentCount, errorcount, lgtotal, lgcorrect, lttotal, ltcorrect, allcorrect
        
    print "sentenceCheckStatsDemo over!!!"  
Пример #3
0
def demo2():
    essays = USTCReader.parseUSTCFile("USTC2011Jan.txt")
    trains = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        trains.append(passage)

#    for p in trains[:30]:
#        essayprepare.processPassage(p)

    for p in trains[:100]:
        # 拆分段落
        print "+++++++++++++++++++++++"
        paras = essayprepare.para_tokenizer.tokenize(p.content)
        pcount1 = len(paras)
        scount1 = 0
        for para in paras:
            sents = essayprepare.markedSentenceTokenize(para)
            #            for sent in sents:
            #            	print "### ", sent
            scount1 += len(sents)
        print "-----------------------"
        paras = essayprepare.para_tokenizer.tokenize(p.passage)
        pcount2 = len(paras)
        scount2 = 0
        for para in paras:
            sents = essayprepare.sent_tokenizer.tokenize(para)
            #            for sent in sents:
            #            	print "### ", sent
            scount2 += len(sents)
        if pcount1 != pcount2 or scount1 != scount2:
            print p.content
            print p.passage
        print "\n"


#    for i, p in enumerate(trains[:30]):
#    	for para in p.paragraphs:
#    		for sent in para.sentences:
#    			for token in sent.tokens:
#    				if token.isSpellError:
#    					print token.token, token.candidates
#    	for m in essays[i].findMarks():
#    		if m[0] == 'fm1' or m[0] == 'fm2' or m[0] == 'sw':
#    			print m

#    egrammar_vp1 = "EVP1: {<NN><RB>?<VB>}"
#    parser_evp1 = nltk.RegexpParser(egrammar_vp1)
#
#    for p in trains[:50]:
#        for para in p.paragraphs:
#            for sent in para.sentences:
#                sentence = [(token.token, token.pos) for token in sent.tokens]
#                result = parser_evp1.parse(sentence)
#                r = str(result)
#                if r.find('EVP1') > 0: print r

    print "demo2 over"
Пример #4
0
def demo2():
    essays = USTCReader.parseUSTCFile("USTC2011Jan.txt")
    trains = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        trains.append(passage)
        
#    for p in trains[:30]:
#        essayprepare.processPassage(p)
        
    for p in trains[:100]:
        # 拆分段落
        print "+++++++++++++++++++++++"
        paras = essayprepare.para_tokenizer.tokenize(p.content)
        pcount1 = len(paras)
        scount1 = 0
        for para in paras:
            sents = essayprepare.markedSentenceTokenize(para)  
#            for sent in sents:
#            	print "### ", sent
            scount1 += len(sents)
        print "-----------------------"
        paras = essayprepare.para_tokenizer.tokenize(p.passage)
        pcount2 = len(paras)
        scount2 = 0
        for para in paras:
            sents = essayprepare.sent_tokenizer.tokenize(para)  
#            for sent in sents:
#            	print "### ", sent
            scount2 += len(sents)
        if pcount1 != pcount2 or scount1 != scount2:
            print p.content
            print p.passage
        print "\n"
            	
#    for i, p in enumerate(trains[:30]):
#    	for para in p.paragraphs:
#    		for sent in para.sentences:
#    			for token in sent.tokens:
#    				if token.isSpellError:
#    					print token.token, token.candidates
#    	for m in essays[i].findMarks():
#    		if m[0] == 'fm1' or m[0] == 'fm2' or m[0] == 'sw':
#    			print m
        
#    egrammar_vp1 = "EVP1: {<NN><RB>?<VB>}"
#    parser_evp1 = nltk.RegexpParser(egrammar_vp1)    
#
#    for p in trains[:50]:
#        for para in p.paragraphs:
#            for sent in para.sentences:
#                sentence = [(token.token, token.pos) for token in sent.tokens]
#                result = parser_evp1.parse(sentence)
#                r = str(result)
#                if r.find('EVP1') > 0: print r
        
    print "demo2 over"