def countWord(data): wDict = {} lines = data.split("\n") for line in lines: line = line.strip() if line == '': continue for w in getWordsFromLine(line): if w not in wDict: wDict[w] = 0 wDict[w] += 1 return wDict if __name__ == '__main__': print "#init db" if readDb.initDb() != True: sys.exit(-1) print "#stopwords db" initStopWordList() print "counting #records:" nuOfData = readDb.getLenOfTbl('tag_train_l') print nuOfData nuOfData = 1 g_wordCount = {} g_tagCount = {} g_wordList = [] g_wordCode = {}
#!/usr/bin/env python # -*- coding: utf-8 -*- # by zhangzhi @2013-11-05 13:18:59 # Copyright 2013 NONE rights reserved. import readDb import pickle import sys tbl = 'intro_for_event_extraction' readDb.initDb() def seperateTag(dataList): sprData = [] for i in range(0, len(dataList), 2): sprData.append([dataList[i + 1], dataList[i]]) return sprData def test(): sprRecs = [] ret = readDb.readData(tbl, ) for rec in ret: pid = int(rec['pid']) data = rec['introductionSeg'].strip() dataList = data.split("\t") sprData = seperateTag(dataList) sprRecs.append(sprData) for spr in sprRecs: for tag, data in spr:
for pid, sents in sprRecs.items(): ret = extractOnePerson(sents) #print ret #output for oneTuples in ret: oneTuples.insert(0, str(pid)) #print oneTuples #for i in range(len(oneTuples)): # if oneTuples[i] == None: # oneTuples[i] = 'None' # else: # try: # oneTuples[i] = oneTuples[i].encode('utf-8') # except Exception, e: # print "error, %s [%s]" % (e, oneTuples[i]) #print "|||".join(oneTuples) #done if __name__ == '__main__': #init DB if R.initDb() != True: print "exit" sys.exit(-1) #if we have toooo many recs(more than 100K), #we may optimise here. dbData = R.readData('intro_for_event_extraction', 2) extractData(dbData) R.quitDb()
#!/usr/bin/env python # -*- coding: utf-8 -*- # by zhangzhi @2013-11-05 13:18:59 # Copyright 2013 NONE rights reserved. import readDb import pickle import sys tbl = 'intro_for_event_extraction' readDb.initDb() def seperateTag(dataList): sprData = [] for i in range(0, len(dataList), 2): sprData.append([dataList[i+1], dataList[i]]) return sprData def test(): sprRecs = [] ret = readDb.readData(tbl,) for rec in ret: pid = int(rec['pid']) data = rec['introductionSeg'].strip() dataList = data.split("\t") sprData = seperateTag(dataList) sprRecs.append(sprData) for spr in sprRecs: for tag, data in spr: print tag, data.encode('UTF8') return sprRecs
sprRecs = sprData(recs) for pid, sents in sprRecs.items(): ret = extractOnePerson(sents) #print ret #output for oneTuples in ret: oneTuples.insert(0, str(pid)) #print oneTuples #for i in range(len(oneTuples)): # if oneTuples[i] == None: # oneTuples[i] = 'None' # else: # try: # oneTuples[i] = oneTuples[i].encode('utf-8') # except Exception, e: # print "error, %s [%s]" % (e, oneTuples[i]) #print "|||".join(oneTuples) #done if __name__ == '__main__': #init DB if R.initDb() != True: print "exit" sys.exit(-1) #if we have toooo many recs(more than 100K), #we may optimise here. dbData = R.readData('intro_for_event_extraction', 2) extractData(dbData) R.quitDb()