import codecs
import os

import gkseg


def seg(rawfile):
    """Segment one raw text file and write the space-joined words to
    tests/temp under the same file name."""
    text = '\n'.join(codecs.open(rawfile, 'r', 'utf-8').readlines())
    wds = gkseg.seg(text)
    o = codecs.open(os.path.join('tests/temp', os.path.basename(rawfile)), 'w', 'utf-8')
    o.write(' '.join(wds))
    o.close()
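# A hypothetical call to the helper above (the input file name is an
# assumption; tests/temp must already exist). The model path matches the
# one used in the benchmark below.
if __name__ == '__main__':
    gkseg.init('data/model.txt')
    seg('tests/text/sample.txt')
    gkseg.destroy()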
import codecs
from optparse import OptionParser

import gkseg


def main():
    usage = "usage: %prog [options] text"
    parser = OptionParser(usage)
    parser.add_option("-m", "--model", dest="model",
                      help="the path of the model file")
    (options, args) = parser.parse_args()
    if len(args) >= 1:
        gkseg.init(options.model)
        print ' '.join(gkseg.seg(codecs.decode(args[0], 'utf-8')))
    else:
        print 'error: input text should not be empty.'


if __name__ == '__main__':
    main()
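# Hypothetical invocation of the command-line tool above (the script
# name is an assumption); it prints the input text with word boundaries
# marked by spaces:
#
#   $ python gkseg-cli.py -m data/model.txt 话说天下大势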
import codecs
import os
import time

import gkseg


def main():
    gkseg.init('data/model.txt')
    count = 0
    start = time.time()
    # os.listdir yields bare file names, so join them back onto the
    # directory before opening
    for name in os.listdir('tests/text'):
        rawfile = os.path.join('tests/text', name)
        text = '\n'.join(codecs.open(rawfile, 'r', 'utf-8').readlines())
        wds = gkseg.seg(text)
        o = codecs.open(os.path.join('tests/temp', os.path.basename(rawfile)), 'w', 'utf-8')
        o.write(' '.join(wds))
        o.close()
        count = count + 1
    print '---------------------------------------------------------------'
    print time.time() - start
    print count
    print '---------------------------------------------------------------'
    gkseg.destroy()


if __name__ == '__main__':
    main()
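# The loop body above duplicates the seg() helper defined earlier; a
# sketch of the same benchmark written in terms of it:
def main_with_helper():
    gkseg.init('data/model.txt')
    start = time.time()
    names = os.listdir('tests/text')
    for name in names:
        seg(os.path.join('tests/text', name))
    print '%d files segmented in %.2fs' % (len(names), time.time() - start)
    gkseg.destroy()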
import codecs
import json

import gkseg


def segmt():
    # 'request' is supplied by the web framework that routes to this
    # handler; its body is the raw text to segment
    return json.dumps(gkseg.seg(codecs.decode(request.data, 'utf-8')))
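# A minimal sketch of serving segmt() over HTTP. The original does not
# show the framework, so Flask, the /segmt route, and the model path are
# all assumptions here.
from flask import Flask, request

app = Flask(__name__)
app.add_url_rule('/segmt', 'segmt', segmt, methods=['POST'])

if __name__ == '__main__':
    gkseg.init('data/model.txt')  # assumed model path
    app.run()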
# NLP + NER
# NER server should be up: see ner-server/
########################
# (excerpt: assumes gkseg and ner are already imported and 'tweets' was
# fetched earlier in the script)
print 'start NLP'
gkseg.init('../miner/gkseg/data/model.txt')
tagger = ner.SocketNER(host='localhost', port=1234)
for t in tweets:
    txt = t.txt.decode('utf-8')
    # segment the sentence into a list of words
    seg = gkseg.seg(txt)
    # extract the important words from the sentence
    terms = gkseg.term(txt)
    t.keywords = terms
    # for term in terms:
    #     print term.encode('utf-8')
    # prepare the Chinese segmentation for NER: Stanford NER expects
    # space-separated tokens
    seg_str = ""
    for s in seg:
        seg_str += s.encode('utf-8') + " "
    # get all entities
    tags = tagger.get_entities(seg_str)
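    # Illustrative shape of the pyner client's return value: a dict
    # keyed by entity type, e.g. {'LOCATION': [...], 'PERSON': [...]},
    # with keys depending on the classifier the NER server loaded.
    # Storing it on the tweet mirrors t.keywords above; 'entities' is a
    # hypothetical attribute, not in the original code.
    t.entities = tags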
import gkseg

# "They say the world's great trend: long divided, it must unite; long
# united, it must divide" (Romance of the Three Kingdoms)
text = '话说天下大势,分久必合,合久必分'.decode('utf-8')
gkseg.init()
print gkseg.seg(text)    # segment the sentence into a list of words
print gkseg.term(text)   # extract the important words from the sentence
print gkseg.label(text)  # label the sentence
gkseg.destory()
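# A defensive variant of the session above (a sketch, not from the
# original): release the model even if segmentation raises.
gkseg.init()
try:
    print gkseg.seg(text)
finally:
    gkseg.destory()  # method name as spelled in the example above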
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import gkseg
import ner

# Sample text, a Weibo post (http://weibo.com/2392261910/Afjg5e6bQ):
# "[Harbin smog public-opinion data analysis] Harbin's PM2.5 went off
# the charts; Weibo discussion volume kept rising over Oct 20-21. On the
# 21st the negative-sentiment index spread sharply, with Weibo becoming
# a major channel for locals to vent. The illness index also hit its
# first peak between 10:00 and 11:00 on the morning of the 21st, showing
# the strong correlation between social-media data and real medical data
# in this event. More data in the attached Weibo infographic."
text = '【哈尔滨雾霾舆论数据分析】哈尔滨PM2.5爆表,微博讨论声量在10月20日~21日持续上升。21日负面情绪指数大幅蔓延,微博成为当地人民表达负面情绪的一大渠道。疾病指数也在21日上午10:00~11:00达到第一个高峰,社交媒体数据与实际医疗数据的强相关性在此事件中得到体现。更多数据,请参见微博长图。'.decode('utf-8')

# init
gkseg.init('../miner/gkseg/data/model.txt')
# for the tagger to work, the Stanford NER Java socket server must be running
tagger = ner.SocketNER(host='localhost', port=1234)

# segment the sentence into a list of words
seg = gkseg.seg(text)
# for s in seg:
#     print s.encode('utf-8')

# extract the important words from the sentence
# terms = gkseg.term(text)
# for t in terms:
#     print t.encode('utf-8')

# label the sentence
# labels = gkseg.label(text)
# for l in labels:
#     print l.encode('utf-8')

# prepare the Chinese segmentation for NER
seg_str = ""
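# The script breaks off after initializing seg_str; a sketch of the
# likely continuation, mirroring the tweet-processing loop above:
for s in seg:
    seg_str += s.encode('utf-8') + " "
tags = tagger.get_entities(seg_str)  # dict of entity type -> entities
print tags
gkseg.destroy()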