Example #1
import codecs
import os

import gkseg

def seg(rawfile):
    # Segment one raw file and write the space-joined words to tests/temp
    # under the same file name.
    text = '\n'.join(codecs.open(rawfile, 'r', 'utf-8').readlines())
    wds = gkseg.seg(text)
    o = codecs.open(os.path.join('tests/temp', os.path.basename(rawfile)), 'w',
                    'utf-8')
    o.write(' '.join(wds))
    o.close()
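A minimal driver for this helper, assuming the model path and directory layout of Example #3 (the input file name is hypothetical):

import gkseg

gkseg.init('data/model.txt')
seg('tests/text/sample.txt')  # hypothetical input file
gkseg.destroy()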
Example #2
import codecs
from optparse import OptionParser

import gkseg

def main():
    usage = "usage: %prog [options] text"
    parser = OptionParser(usage)
    parser.add_option("-m", "--model", dest="model",
                      help="the path of the model file")

    (options, args) = parser.parse_args()
    if len(args) >= 1:
        gkseg.init(options.model)
        print ' '.join(gkseg.seg(codecs.decode(args[0], 'utf-8')))
    else:
        print 'error: input text should not be empty.'
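Calling the library directly does the same work as the script; a minimal sketch, reusing the model path from Example #3 and the sample sentence from Example #7:

import codecs

import gkseg

gkseg.init('data/model.txt')
print ' '.join(gkseg.seg(codecs.decode('话说天下大势', 'utf-8')))
gkseg.destroy()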
Example #3
import codecs
import os
import time

import gkseg

def main():
    gkseg.init('data/model.txt')
    count = 0
    start = time.time()
    # `listdir` is assumed to be a project helper that yields full paths
    # (bare os.listdir would return file names only); see the sketch below.
    for rawfile in listdir('tests/text'):
        text = '\n'.join(codecs.open(rawfile, 'r', 'utf-8').readlines())
        wds = gkseg.seg(text)
        o = codecs.open(os.path.join('tests/temp', os.path.basename(rawfile)), 'w', 'utf-8')
        o.write(' '.join(wds))
        o.close()
        count = count + 1
    print '---------------------------------------------------------------'
    print time.time() - start
    print count
    print '---------------------------------------------------------------'
    gkseg.destroy()
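The loop both opens `rawfile` directly and takes its basename, so `listdir` cannot be the bare `os.listdir`, which returns file names without the directory. A minimal sketch of the assumed helper:

import os

def listdir(d):
    # Yield full paths, as the loop above expects.
    return [os.path.join(d, f) for f in os.listdir(d)]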
Example #4
def segmt():
    # `request` is assumed to be provided by the enclosing web framework.
    return json.dumps(gkseg.seg(codecs.decode(request.data, 'utf-8')))
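A minimal sketch of wiring this handler into an application, assuming Flask; the route, port, and model path are assumptions (the model path follows Example #3):

import codecs
import json

import gkseg
from flask import Flask, request

app = Flask(__name__)
gkseg.init('data/model.txt')

@app.route('/seg', methods=['POST'])
def segmt():
    # Decode the POSTed bytes and return the word list as JSON.
    return json.dumps(gkseg.seg(codecs.decode(request.data, 'utf-8')))

if __name__ == '__main__':
    app.run(port=5000)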
Example #5
# -*- coding: utf-8 -*-

import gkseg
import ner

text = '【哈尔滨雾霾舆论数据分析】哈尔滨PM2.5爆表,微博讨论声量在10月20日~21日持续上升。21日负面情绪指数大幅蔓延,微博成为当地人民表达负面情绪的一大渠道。疾病指数也在21日上午10:00~11:00达到第一个高峰,社交媒体数据与实际医疗数据的强相关性在此事件中得到体现。更多数据,请参见微博长图。'.decode('utf-8')
# http://weibo.com/2392261910/Afjg5e6bQ

# init
gkseg.init('../miner/gkseg/data/model.txt')
# for the tagger to work, we need to launch the Stanford NER Java socket server
tagger = ner.SocketNER(host='localhost', port=1234)

# segment the sentence into a list of words
seg = gkseg.seg(text)
# for s in seg:
#     print s.encode('utf-8')

# extract the important words from the sentence
# terms = gkseg.term(text)
# for t in terms:
#     print t.encode('utf-8')

# label the sentence
# labels = gkseg.label(text)
# for l in labels:
#     print l.encode('utf-8')

# prepare the Chinese segmentation for NER
seg_str = ""
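The excerpt is truncated here; judging from Example #6 below, the continuation presumably joins the segmented words and queries the NER server:

for s in seg:
    seg_str += s.encode('utf-8') + " "

tags = tagger.get_entities(seg_str)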
Example #6
# NLP + NER
# the NER server should be up: see ner-server/
########################
import gkseg
import ner

print 'start NLP'

gkseg.init('../miner/gkseg/data/model.txt')
tagger = ner.SocketNER(host='localhost', port=1234)

# `tweets` is assumed to be loaded earlier in the script.
for t in tweets:

    txt = t.txt.decode('utf-8')

    # segment the sentence into a list of words
    seg = gkseg.seg(txt)

    # extract the important words from the sentence
    terms = gkseg.term(txt)
    t.keywords = terms

    # for term in terms:
    #     print term.encode('utf-8')

    # prepare the Chinese segmentation for NER
    seg_str = ""
    for s in seg:
        seg_str += s.encode('utf-8') + " "

    # get all entities
    tags = tagger.get_entities(seg_str)
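A minimal sketch of consuming `tags`; the dict-of-entity-lists shape is an assumption about what the `ner` package returns, so check your version:

# Assumed shape: {'PERSON': [...], 'LOCATION': [...], ...}
for label, entities in tags.items():
    for entity in entities:
        print label, entity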
Example #7
import gkseg

text = '话说天下大势,分久必合,合久必分'.decode('utf-8')

gkseg.init()

print gkseg.seg(text)  # segment the sentence into a list of words

print gkseg.term(text)  # extract the important words from the sentence

print gkseg.label(text)  # label the sentence

gkseg.destroy()
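In Python 2 the returned words are unicode objects; for terminal display they can be joined and re-encoded before the destroy() call, as the other examples do:

words = gkseg.seg(text)
print ' '.join(words).encode('utf-8')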