Пример #1
0
def countWord(data):
    """Count how often each word occurs in a multi-line string.

    ``data`` is split on newline characters; every line is stripped and
    lines that end up empty are skipped.  Each remaining line is handed to
    ``getWordsFromLine`` and every item it yields is tallied.

    Returns a plain dict mapping each word to its number of occurrences.
    """
    tally = {}
    for raw in data.split("\n"):
        text = raw.strip()
        if text != '':
            for word in getWordsFromLine(text):
                tally[word] = tally.get(word, 0) + 1
    return tally

# Script entry point: initialise the database and the stop-word list, report
# the size of the training table, then create the empty global containers.
if __name__ == '__main__':
    print "#init db"
    # Abort with sys.exit(-1) unless initDb() returns a value equal to True.
    if readDb.initDb() != True:
        sys.exit(-1)
    print "#stopwords db"
    initStopWordList()
    
    print "counting #records:"
    # Length of table 'tag_train_l' as reported by readDb; printed below.
    nuOfData = readDb.getLenOfTbl('tag_train_l')
    print nuOfData
    # NOTE(review): this overwrites the count fetched above with 1 -- looks
    # like a debugging leftover that limits the run; confirm before relying
    # on nuOfData.
    nuOfData = 1


    # Empty containers filled by later code (not visible here); presumably
    # per-word and per-tag counters -- TODO confirm.
    g_wordCount = {}
    g_tagCount = {}

    # Presumably the word vocabulary and a word -> code mapping -- TODO confirm.
    g_wordList = []
    g_wordCode = {}
Пример #2
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# by zhangzhi @2013-11-05 13:18:59
# Copyright 2013 NONE rights reserved.

import readDb
import pickle
import sys

tbl = 'intro_for_event_extraction'
readDb.initDb()


def seperateTag(dataList):
    """Group a flat list into swapped two-element pairs.

    Items are consumed two at a time; for the items at positions ``i`` and
    ``i + 1`` the result contains ``[dataList[i + 1], dataList[i]]``.

    Raises IndexError when ``dataList`` has an odd number of items, because
    the last item has no partner.
    """
    return [
        [dataList[start + 1], dataList[start]]
        for start in range(0, len(dataList), 2)
    ]


def test():
    sprRecs = []
    ret = readDb.readData(tbl, )
    for rec in ret:
        pid = int(rec['pid'])
        data = rec['introductionSeg'].strip()
        dataList = data.split("\t")
        sprData = seperateTag(dataList)
        sprRecs.append(sprData)
    for spr in sprRecs:
        for tag, data in spr:
    for pid, sents in sprRecs.items():
        ret = extractOnePerson(sents)
        #print ret
        #output
        for oneTuples in ret:
            oneTuples.insert(0, str(pid))
            #print oneTuples
            #for i in range(len(oneTuples)):
            #    if oneTuples[i] == None:
            #        oneTuples[i] = 'None'
            #    else:
            #        try:
            #            oneTuples[i] = oneTuples[i].encode('utf-8')
            #        except Exception, e:
            #            print "error, %s [%s]" % (e, oneTuples[i])
            #print "|||".join(oneTuples)
    #done


# Script entry point.
# NOTE(review): `R` and `extractData` are not defined or imported in the
# visible part of this file (only `readDb` is imported), so this block would
# raise NameError as written -- presumably `R` is an alias for the DB helper
# module; confirm and fix the import.
if __name__ == '__main__':
    # Initialise the DB; print "exit" and leave via sys.exit(-1) unless
    # initDb() returns a value equal to True.
    if R.initDb() != True:
        print "exit"
        sys.exit(-1)
    # If there are too many records (more than 100K),
    # this read may need to be optimised.
    # NOTE(review): the meaning of the second argument (2) is not visible here.
    dbData = R.readData('intro_for_event_extraction', 2)

    extractData(dbData)
    R.quitDb()
#!/usr/bin/env python
# -*- coding: utf-8 -*-  
# by zhangzhi @2013-11-05 13:18:59 
# Copyright 2013 NONE rights reserved.

import readDb
import pickle
import sys

tbl = 'intro_for_event_extraction'
readDb.initDb()

def seperateTag(dataList):
    """Walk ``dataList`` two items at a time and return the swapped pairs.

    Each result entry is ``[dataList[pos + 1], dataList[pos]]`` for every even
    position ``pos``.  An odd-length input raises IndexError on its last,
    unpaired item.
    """
    pairs = []
    pos = 0
    while pos < len(dataList):
        pairs.append([dataList[pos + 1], dataList[pos]])
        pos += 2
    return pairs

def test():
    sprRecs = []
    ret = readDb.readData(tbl,)
    for rec in ret:
        pid = int(rec['pid'])
        data = rec['introductionSeg'].strip()
        dataList = data.split("\t")
        sprData = seperateTag(dataList)
        sprRecs.append(sprData) 
    for spr in sprRecs:
        for tag, data in spr:
            print tag, data.encode('UTF8')
    return sprRecs
    sprRecs = sprData(recs)
    for pid, sents in sprRecs.items():
        ret = extractOnePerson(sents)
        #print ret
        #output 
        for oneTuples in ret:
            oneTuples.insert(0, str(pid))
            #print oneTuples
            #for i in range(len(oneTuples)):
            #    if oneTuples[i] == None:
            #        oneTuples[i] = 'None'
            #    else:
            #        try:
            #            oneTuples[i] = oneTuples[i].encode('utf-8')
            #        except Exception, e:
            #            print "error, %s [%s]" % (e, oneTuples[i])
            #print "|||".join(oneTuples)
    #done

# Script entry point.
# NOTE(review): `R` and `extractData` are not defined or imported in the
# visible part of this file (only `readDb` is imported), so this block would
# raise NameError as written -- presumably `R` is an alias for the DB helper
# module; confirm and fix the import.
if __name__ == '__main__':
    # Initialise the DB; print "exit" and leave via sys.exit(-1) unless
    # initDb() returns a value equal to True.
    if R.initDb() != True:
        print "exit"
        sys.exit(-1)    
    # If there are too many records (more than 100K),
    # this read may need to be optimised.
    # NOTE(review): the meaning of the second argument (2) is not visible here.
    dbData = R.readData('intro_for_event_extraction', 2)    

    extractData(dbData)
    R.quitDb()