import os

from nltk.tokenize.stanford_segmenter import StanfordSegmenter


def segment(labels, reviews):

    segmented = []

    print('Creating BOW')
    os.environ[
        "STANFORD_SEGMENTER"] = '../datasets/data-hauyi/stanford-segmenter-2018-10-16'
    seg = StanfordSegmenter(
        '../datasets/data-hauyi/stanford-segmenter-2018-10-16/stanford-segmenter-3.9.2.jar'
    )
    seg.default_config('zh')
    count = 0

    file_out = open('reviews.txt', 'a+')

    for i in range(len(reviews)):
        # segment the i-th review and prefix it with its label
        s = seg.segment(reviews[i])
        line = str(labels[i]) + ' ' + s
        file_out.write(line)
        segmented.append(s)
        count += 1
        print('Count: ', count)

    file_out.close()
    return segmented
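A minimal usage sketch for the function above (not part of the original snippet): it assumes labels and reviews are parallel lists and that the segmenter paths hard-coded inside segment() exist locally.

# Hypothetical driver; the label/review values and segmenter paths are assumptions.
sample_labels = [1, 0]
sample_reviews = [u'我在博客园开了一个博客。', u'这是斯坦福中文分词器测试']
segmented_reviews = segment(sample_labels, sample_reviews)
print(segmented_reviews)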
Example #2
def segmenter(sentence):
    r"""
	Stanford Word Segmenter for Chinese.

	Split Chinese sentence into a sequence of words.

	Args:
		sentence:A Chinese sentence

	Returns:
		A list decode in utf-8

	Example:
		sentence="广东外语外贸大学是一所具有鲜明国际化特色的广东省属重点大学,是华南地区国际化人才培养和外国语言文化、对外经济贸易、国际战略研究的重要基地。"
		[u'\u5e7f\u4e1c', u'\u5916\u8bed', u'\u5916\u8d38', u'\u5927\u5b66', u'\u662f', u'\u4e00', u'\u6240', u'\u5177\u6709', u'\u9c9c\u660e', u'\u56fd\u9645\u5316', u'\u7279\u8272', u'\u7684', u'\u5e7f\u4e1c', u'\u7701\u5c5e', u'\u91cd\u70b9', u'\u5927\u5b66', u'\uff0c', u'\u662f', u'\u534e\u5357', u'\u5730\u533a', u'\u56fd\u9645\u5316', u'\u4eba\u624d', u'\u57f9\u517b', u'\u548c', u'\u5916\u56fd', u'\u8bed\u8a00', u'\u6587\u5316', u'\u3001', u'\u5bf9\u5916', u'\u7ecf\u6d4e', u'\u8d38\u6613', u'\u3001', u'\u56fd\u9645', u'\u6218\u7565', u'\u7814\u7a76', u'\u7684', u'\u91cd\u8981', u'\u57fa\u5730', u'\u3002']

	"""

    from nltk.tokenize.stanford_segmenter import StanfordSegmenter  # initialize the Stanford Chinese segmenter
    segmenter = StanfordSegmenter(
        path_to_jar=
        'D:/python/nltk-3.1/nltk/chin/stanford-segmenter-2014-08-27/stanford-segmenter-3.4.1.jar',
        path_to_sihan_corpora_dict=
        'D:/python/nltk-3.1/nltk/chin/stanford-segmenter-2014-08-27/data',
        path_to_model=
        'D:/python/nltk-3.1/nltk/chin/stanford-segmenter-2014-08-27/data/pku.gz',
        path_to_dict=
        'D:/python/nltk-3.1/nltk/chin/stanford-segmenter-2014-08-27/data/dict-chris6.ser.gz'
    )  # load the Chinese segmentation model

    sent = segmenter.segment(sentence)  # segment the sentence
    return sent.split()
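A hedged usage sketch, reusing the sentence from the docstring; it assumes the D:/python/nltk-3.1/... paths hard-coded above are valid on the local machine.

# Hypothetical call; the segmenter jar/model paths above must exist for this to run.
words = segmenter(u"广东外语外贸大学是一所具有鲜明国际化特色的广东省属重点大学。")
print(words)  # e.g. [u'广东', u'外语', u'外贸', u'大学', ...]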
Example #3
def splitAllWord(typeOfDataset="dev"):
    from nltk.tokenize.stanford_segmenter import StanfordSegmenter

    segmenter = StanfordSegmenter()
    segmenter.default_config('zh')

    maxCount = 2000000

    pathOfDev = "dataset/task1/%s.tsv" % typeOfDataset
    dfOfDev = pd.read_csv(pathOfDev, delimiter="\t")

    pathOfNewDev = "%s_split.tsv" % typeOfDataset

    count = 0
    with open(pathOfNewDev, "w", encoding='utf-8') as fw:
        for row in dfOfDev.iterrows():
            if count >= maxCount:
                break
            if count % 100 == 0:
                print("[%s]count = %s" % (typeOfDataset, count))

            label = row[1]['label']
            fw.write(str(label))
            fw.write("\t")
            sentence = row[1]['text_a']

            segmentOfSentence = segmenter.segment(sentence)
            for word in segmentOfSentence.split():
                fw.write(word)
                fw.write(" ")
            fw.write("\n")

            count += 1
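A possible driver for the function above, assuming the dataset/task1/<split>.tsv files exist with the label and text_a columns the code reads (both assumptions).

# Hypothetical driver; the dataset layout and split names are assumptions.
if __name__ == "__main__":
    for split in ("dev", "train"):
        splitAllWord(typeOfDataset=split)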
Example #4
from nltk.tokenize.stanford_segmenter import StanfordSegmenter


def ch_standseg(mystr):
    segmenter = StanfordSegmenter(
        path_to_jar=r"E:\tools\stanfordNLTK\jar\stanford-segmenter.jar",
        path_to_slf4j=r"E:\tools\stanfordNLTK\jar\slf4j-api.jar",
        path_to_sihan_corpora_dict=r"E:\tools\stanfordNLTK\jar\data",
        path_to_model=r"E:\tools\stanfordNLTK\jar\data\pku.gz",
        path_to_dict=r"E:\tools\stanfordNLTK\jar\data\dict-chris6.ser.gz")
    result = segmenter.segment(mystr)
    print(result)
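A minimal sketch of calling the helper above, assuming the E:\tools\stanfordNLTK\jar layout it hard-codes is present on the machine.

# Hypothetical call; the jar/data paths inside ch_standseg are assumptions.
ch_standseg(u"这是斯坦福中文分词器测试")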
Example #5
class TMStanfordTokenizer():

    models = {'ZH': 'ctb.gz', 'AR': 'arabic-segmenter-atb+bn+arztrain.ser.gz'}

    dics = {'ZH': 'dict-chris6.ser.gz', 'AR': ''}

    def __init__(self, language):

        self.language = language

        model = self.models.get(language)
        dic = self.dics.get(language)
        if not model:
            raise Exception(
                "Unsupported language for tokenizer: {}".format(language))

        # Initialize Stanford Tokenizer
        self.tm_tokenize = StanfordSegmenter(
            path_to_jar=os.path.join(stanford_tokenizer_home,
                                     'stanford-segmenter-3.6.0.jar'),
            path_to_model=os.path.join(stanford_tokenizer_home, 'data', model),
            path_to_dict=os.path.join(stanford_tokenizer_home, 'data', dic),
            path_to_sihan_corpora_dict=os.path.join(stanford_tokenizer_home,
                                                    'data'),
            path_to_slf4j=os.path.join(stanford_tokenizer_home,
                                       'slf4j-api.jar'))

    #Input: String
    #Output: 这 是 斯坦福 中文 分词 器 测试
    def process(self, sentences):
        text = self.tm_tokenize.segment(sentences).strip('\n')
        if re.search(TOK_PATTERN, text):  # Check if the text have tags
            text = XmlUtils.join_tags(text, JOIN_PATTERN)
        return text

    def tokenize_sent(self, text):
        if self.language == 'ZH':
            return [s + '。' for s in text.split('。')
                    if s]  # Split by sentence chinese
        #self.tm_tokenize.segment_sents(text)
        return [text]
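A hedged instantiation sketch for the class above; it assumes stanford_tokenizer_home, TOK_PATTERN, JOIN_PATTERN and XmlUtils are defined elsewhere in the project and that the Stanford segmenter jars are installed.

# Hypothetical usage; module-level constants and jar locations are assumptions.
zh_tokenizer = TMStanfordTokenizer('ZH')
print(zh_tokenizer.tokenize_sent(u'第一句。第二句。'))   # sentence split only, no Java call
print(zh_tokenizer.process(u'这是斯坦福中文分词器测试'))  # runs the Stanford segmenter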
Example #6
# coding:utf-8


from nltk.tokenize.stanford_segmenter import StanfordSegmenter

# Chinese word segmentation
segmenter = StanfordSegmenter(
    path_to_jar="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/stanford-segmenter-3.5.2.jar",
    path_to_slf4j="/home/hsiao/Develops/nlp/stanford-corenlp-full-2016-10-31/slf4j-api.jar",
    path_to_sihan_corpora_dict="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/data",
    path_to_model="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/data/pku.gz",
    path_to_dict="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/data/dict-chris6.ser.gz"
)
sentence = "我在博客园开了一个博客。"
print(segmenter.segment(sentence))

# English tokenization


from nltk.tokenize import StanfordTokenizer
tokenizer=StanfordTokenizer(path_to_jar=r"/home/hsiao/Develops/nlp/stanford-parser-full-2016-10-31/stanford-parser.jar")
sent = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
print (tokenizer.tokenize(sent))

# Chinese named entity recognition
from nltk.tag import StanfordNERTagger
chi_tagger=StanfordNERTagger(model_filename=r'/home/hsiao/Develops/nlp/stanford-ner-2016-10-31/classifiers/chinese.misc.distsim.crf.ser.gz'
                             ,path_to_jar=r'/home/hsiao/Develops/nlp/stanford-ner-2016-10-31/stanford-ner.jar')
print (chi_tagger.tag('四川省 成都 信息 工程 大学 我 在 博客 园 开 了 一个 博客 , 我 的 博客 名叫 伏 草 惟 存 , 写 了 一些 自然语言 处理 的 文章 。\r\n'.split()))
Example #7
class NLPCore:
    """
    nlp processing including Stanford Word Segmenter, Stanford POS Tagger, 
    Stanford Named Entity Recognizer and Stanford Parser 
    """
    def __init__(self):
        self.root_path = '../Models/stanfordNLP/'

        # word segmenter
        self.segmenter = StanfordSegmenter(
            path_to_jar=self.root_path + "stanford-segmenter.jar",
            path_to_slf4j=self.root_path + "log4j-over-slf4j.jar",
            path_to_sihan_corpora_dict=self.root_path + "segmenter/",
            path_to_model=self.root_path + "segmenter/pku.gz",
            path_to_dict=self.root_path + "segmenter/dict-chris6.ser.gz")

        # pos tagger
        self.posTagger = StanfordPOSTagger(
            self.root_path + 'pos-tagger/chinese-distsim.tagger',
            path_to_jar=self.root_path + "stanford-postagger.jar")

        # named entity recognizer
        self.nerTagger = StanfordNERTagger(
            self.root_path + 'ner/chinese.misc.distsim.crf.ser.gz',
            path_to_jar=self.root_path + 'stanford-ner.jar')

        self.parser = StanfordDependencyParser(
            model_path=self.root_path + 'lexparser/chinesePCFG.ser.gz',
            path_to_jar=self.root_path + 'stanford-parser.jar',
            path_to_models_jar=self.root_path +
            'stanford-parser-3.7.0-models.jar',
            encoding='gbk')

    def split_sent_stanford(self, textPair):
        """
        Stanford Word Segmenter, input should be raw text
        :return: also TextPair with raw string of results
        """
        t1 = self.segmenter.segment(textPair.t1)
        t2 = self.segmenter.segment(textPair.t2)

        if DEBUG:
            print(t1, t2)

        return text_pair.TextPair(t1, t2, textPair.label)

    def split_sents_stanford(self, textPairs):
        """
        Stanford Word Segmenter, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1 for textPair in textPairs]
        sents2 = [textPair.t2 for textPair in textPairs]

        split1 = self.segmenter.segment_sents(sents1).split('\n')
        split2 = self.segmenter.segment_sents(sents2).split('\n')

        rlist = []
        for i in range(len(textPairs)):
            rlist.append(
                text_pair.TextPair(split1[i], split2[i], textPairs[i].label))

            if DEBUG:
                print(split1[i], split2[i])

        return rlist

    def split_sent_jieba(self, textPair):

        jieba.setLogLevel('INFO')
        ger1 = jieba.cut(textPair.t1)
        ger2 = jieba.cut(textPair.t2)

        t1 = ' '.join(ger1)
        t2 = ' '.join(ger2)

        return text_pair.TextPair(t1, t2, textPair.label)

    def pos_tag(self, textPair):
        """
        Stanford POS Tagger, input should already be split
        :return: also TextPair with raw string of results
        """
        t1_s = textPair.t1.split()
        t2_s = textPair.t2.split()

        t1_tag = ' '.join([ele[1] for ele in self.posTagger.tag(t1_s)])
        t2_tag = ' '.join([ele[1] for ele in self.posTagger.tag(t2_s)])

        if DEBUG:
            print(t1_tag, t2_tag)

        return text_pair.TextPair(t1_tag, t2_tag, textPair.label)

    def pos_tag_pairs(self, textPairs):
        """
        Stanford POS Tagger, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1.split() for textPair in textPairs]
        sents2 = [textPair.t2.split() for textPair in textPairs]

        tag1 = self.posTagger.tag_sents(sents1)
        tag2 = self.posTagger.tag_sents(sents2)

        rlist = []
        for i in range(len(tag1)):
            t1_tag = ' '.join([ele[1] for ele in tag1[i]])
            t2_tag = ' '.join([ele[1] for ele in tag2[i]])

            rlist.append(text_pair.TextPair(t1_tag, t2_tag,
                                            textPairs[i].label))

            if DEBUG:
                print(t1_tag, t2_tag)

        return rlist

    def ner_tag(self, textPair):
        """
        Stanford Named Entity Recognizer, input should already be split
        :return: also TextPair with raw string of results
        """
        t1_s = textPair.t1.split()
        t2_s = textPair.t2.split()

        t1_ner = ' '.join(
            [ele[0] + '#' + ele[1] for ele in self.nerTagger.tag(t1_s)])
        t2_ner = ' '.join(
            [ele[0] + '#' + ele[1] for ele in self.nerTagger.tag(t2_s)])

        if DEBUG:
            print(t1_ner, t2_ner)

        return text_pair.TextPair(t1_ner, t2_ner, textPair.label)

    def ner_tag_pairs(self, textPairs):
        """
        Stanford Named Entity Recognizer, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1.split() for textPair in textPairs]
        sents2 = [textPair.t2.split() for textPair in textPairs]

        tag1 = self.nerTagger.tag_sents(sents1)
        tag2 = self.nerTagger.tag_sents(sents2)

        rlist = []
        for i in range(len(tag1)):
            t1_ner = ' '.join([ele[0] + '#' + ele[1] for ele in tag1[i]])
            t2_ner = ' '.join([ele[0] + '#' + ele[1] for ele in tag2[i]])

            rlist.append(text_pair.TextPair(t1_ner, t2_ner,
                                            textPairs[i].label))

            if DEBUG:
                print(t1_ner, t2_ner)

        return rlist

    def depen_parse(self, textPair):
        """
        Stanford Dependency Parser, input should already be split
        :return: also TextPair with raw string of results
        """
        print([p.tree() for p in self.parser.raw_parse(textPair.t1)])
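A hedged usage sketch for NLPCore; it assumes the ../Models/stanfordNLP/ layout above, a text_pair.TextPair(t1, t2, label) constructor matching how the class uses it, and a module-level DEBUG flag (all assumptions).

# Hypothetical usage; model paths, the TextPair signature and DEBUG are assumptions.
nlp = NLPCore()
pair = text_pair.TextPair(u'这是斯坦福中文分词器测试', u'我在博客园开了一个博客', 1)
pair_cut = nlp.split_sent_jieba(pair)   # jieba path needs no Stanford models
pair_pos = nlp.pos_tag(pair_cut)        # requires the Stanford POS tagger jars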
Example #8
# coding: utf-8
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tag import StanfordPOSTagger

segmenter = StanfordSegmenter(
    java_class='edu.stanford.nlp.ie.crf.CRFClassifier',
    path_to_slf4j="/home/lcy/stanford-segmenter/slf4j-api.jar",
    path_to_jar="/home/lcy/stanford-segmenter/stanford-segmenter.jar",
    path_to_sihan_corpora_dict="/home/lcy/stanford-segmenter/data/",
    path_to_model="/home/lcy/stanford-segmenter/data/pku.gz",
    path_to_dict="/home/lcy/stanford-segmenter/data/dict-chris6.ser.gz"
)
f = open("wiki.zh.text", "r")  # 读取文本
string = f.read().decode("utf-8")
result = segmenter.segment(string)  # 分词
# print result
f = open("fenci.txt", "w") # 将结果保存到另一个文档中
f.write(result.encode("utf-8"))
f.close()
Example #9
    str_de1 = 'Jenes Mädchen ist sehr hübsch.'
    snt_de = "Sie schaut mich an, als ob sie mich nicht verstände."
    str_en1 = 'In other words, counterfactual thinking influences how satisfied each athlete feels.'
    snt = "Bills on ports and immigration were submitted by Senator Brownback, Republican of Kansas"
    snt_en2 = 'I like cat and dog'
    cldg = dep_parser_en.raw_parse(snt_en2)
    dep = next(cldg)
    print(dep.to_conll(10))
    cldg = dep_parser_en.collapsed_parse(snt_en2)
    dep = next(cldg)
    print(dep.to_conll(10))
    cldg = dep_parser_en.collapsed_parse(snt)
    dep = next(cldg)
    print(dep.to_conll(10))

    ch_seg_snt = segmenter.segment(snt_ch).strip('\n').strip('。')
    cldg = dep_parser_cn.collapsed_parse(ch_seg_snt)
    dep = next(cldg)
    print(dep.to_conll(10))

    cldg = dep_parser_de.raw_parse(snt_de)
    dep = next(cldg)
    print(dep.to_conll(10))

    # http://stackoverflow.com/questions/31975893/how-can-i-get-a-grammaticalstructure-object-for-a-german-sentence-using-the-stan

    cldg = get_dep_str(snt_de, lan='de', de_parser='parzu')
    print(cldg)

    #dep_str = get_dep_str(str_ch1, lan='ch', ch_parser='hit')
    #print(dep_str)
Example #10
# find_entity_t = test.find_entity()
# find_VP_t = test.firstVP()
# test.drawTree()
test.show(firstNP_t)
# test.show(find_entity_t)
# test.show(find_VP_t)
# # test.show(find_entity_t)
# test.show(firstMinNP_t)
result = test.find_realtionship(firstNP_t)
print(result)
test.drawTree()
#
#
# print(test.rel)
# test.show(test.find_realtionship())

# comparison experiment
chi_parser = StanfordParser(path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar',
                            path_to_models_jar='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
                            model_path='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
data_dir = '../stanford-segmenter-2018-02-27/'
segmenter = StanfordSegmenter(path_to_jar=data_dir+"stanford-segmenter-3.9.1.jar",
                              path_to_sihan_corpora_dict=data_dir+"/data", path_to_model=data_dir+"/data/pku.gz",
                              path_to_dict=data_dir+"/data/dict-chris6.ser.gz",
                              java_class='edu.stanford.nlp.ie.crf.CRFClassifier',
                              )
result = segmenter.segment(test_str)
result_ls = result.split()
ch_tree = list(chi_parser.parse(result_ls))[0]
ch_tree.draw()
# print(result)
Example #11
elif (tokeniser == 'stanfordpku'):
    from nltk.tokenize.stanford_segmenter import StanfordSegmenter
    from nltk.tokenize import StanfordTokenizer
    workingdirectory = os.getcwd()
    segmenter = StanfordSegmenter(
        path_to_jar=os.path.join(workingdirectory, 'stanford-segmenter.jar'),
        path_to_slf4j=os.path.join(workingdirectory, 'slf4j-api.jar'),
        path_to_sihan_corpora_dict=os.path.join(workingdirectory, 'data'),
        path_to_model=os.path.join(workingdirectory, 'data', 'pku.gz'),
        path_to_dict=os.path.join(workingdirectory, 'data',
                                  'dict-chris6.ser.gz'))
    tokenizer = StanfordTokenizer(
        path_to_jar=os.path.join(workingdirectory, 'stanford-parser.jar'))
    n = 1
    for line in open(sourcefile):
        token = segmenter.segment(line)
        words = tokenizer.tokenize(token)
        with open('%s%s.txt' % (prefix, n), "w",
                  encoding='utf-8') as resultfile:
            resultwrite = csv.writer(resultfile)
            for word in words:
                resultwrite.writerow([word])
        n = n + 1
    print('Done')
elif (tokeniser == 'stanfordctb'):
    from nltk.tokenize.stanford_segmenter import StanfordSegmenter
    from nltk.tokenize import StanfordTokenizer
    workingdirectory = os.getcwd()
    segmenter = StanfordSegmenter(
        path_to_jar=os.path.join(workingdirectory, 'stanford-segmenter.jar'),
        path_to_slf4j=os.path.join(workingdirectory, 'slf4j-api.jar'),
Example #12
###############################

### tokenize Chinese ###
print '########Starting tokenization###########\n'
##
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
pre_path = '/home/db32555/MM/stanford-segmenter/'
segmenter = StanfordSegmenter(path_to_jar=pre_path+'stanford-segmenter-3.4.1.jar', path_to_sihan_corpora_dict=pre_path+'./data', path_to_model=pre_path+'./data/pku.gz', path_to_dict=pre_path+'./data/dict-chris6.ser.gz')
##setup_end##
from nltk import word_tokenize
##setup_end_eng##
for node in corpus:
	index = corpus.index(node)
	chinese = node[1]
	chinese = unicode(chinese, 'utf-8')
	tmp_segmented = segmenter.segment(chinese)
	tmp_segmented = tmp_segmented.split(" ")
	#
	del corpus[index][1]
	corpus[index].append(tmp_segmented)	
	print tmp_segmented
	##this is chinese 
	english = node[0]
	english = unicode(english, 'utf-8')
	english = word_tokenize(english)
	del corpus[index][0]
	corpus[index].append(english)
	print english
	##this is english
	
print '########End of tokenization###########\n'
Example #13
num2word = []
word_dict = {}

lyrics = pd.read_csv('./data/%s.csv' % (lyric_document),
                     encoding='utf-8',
                     index_col='title')

song_num = len(lyrics['lyric'])  # number of songs
title_set = lyrics.index[0]  # initialize the title set and the lyric set
song_set = lyrics['lyric'][0]
for i in range(song_num):
    if i != 0:
        title_set = title_set + u'\n' + lyrics.index[i]
        song_set = song_set + u'\n' + lyrics['lyric'][i]

title_result = segmenter.segment(title_set)
song_result = segmenter.segment(song_set)

sentences = []
count_num = 0

# split song_result on newlines and collect each sentence into sentences[]
temp = ''
for i in song_result:
    if i != u'\n':
        temp += i
    else:
        temp += i
        sentences.append(temp)
        temp = ''
        count_num += 1
Example #14
# cols = ["用户ID","用户问题"]
# df = pd.read_excel(filename)
# df_selected = df[cols]
# print(df[cols].head(2))

# df_segment = segmenter.segment(df_selected["用户问题"].head(1))
# print(df_segment)
import xlrd

xls_data = xlrd.open_workbook(filename)
table = xls_data.sheets()[0]
nrows = table.nrows  # number of rows
ncols = table.ncols  # number of columns
list2 = []
for rownum in range(1, 100):
    row = table.row_values(rownum)
    if row:
        list2.append(segmenter.segment(row[2]).split(" "))
print(list2)

# In[5]:

# In[1]:

import nltk

text = nltk.word_tokenize("And now for something completely different")
print(nltk.pos_tag(text))

# In[ ]:
Example #15
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

seg = StanfordSegmenter(
    path_to_jar=r'D:\temp\stanford\stanford-segmenter.jar',
    path_to_slf4j=r'D:\temp\stanford\slf4j-api.jar',
    path_to_sihan_corpora_dict=r'D:\temp\stanford\data',
    path_to_model=r'D:\temp\stanford\data\ctb.gz',
    path_to_dict=r'D:\temp\stanford\data\dict-chris6.ser.gz',
)

# sentence = "这是斯坦福中文分词器测试"
sentence = "操你大爷,狗日的"
res = seg.segment(sentence)
print res

# import jieba
# ss = jieba.cut(sentence)
# print ' '.join(list(ss)), type(ss)

Example #16
#  # u'\u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5\n'
# segmenter.segment_file("test.simp.utf8")
# # u'\u9762\u5bf9 \u65b0 \u4e16\u7eaa \uff0c \u4e16\u754c \u5404\u56fd ...
## download
# import nltk
# nltk.download()

from nltk.tokenize.stanford_segmenter import StanfordSegmenter
segmenter = StanfordSegmenter(
    path_to_jar="stanford-segmenter-3.6.0.jar",
    path_to_slf4j="slf4j-api.jar",
    path_to_sihan_corpora_dict="./data",
    path_to_model="./data/pku.gz",
    path_to_dict="./data/dict-chris6.ser.gz")
sentence = u"这是斯坦福中文分词器测试"
segmenter.segment(sentence)
# >>> u'\u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5\n'
segmenter.segment_file("test.simp.utf8")
# >>> u'\u9762\u5bf9 \u65b0 \u4e16\u7eaa \uff0c \u4e16\u754c \u5404\u56fd ...

# English test
# import nltk
# text = 'i am a good boy.you are a bad girl'
# sens = nltk.sent_tokenize(text)
# print(sens)
# words = []
# for sent in sens:
#     words.append(nltk.word_tokenize(sent))
# for line in words:
#     print(line)
#

from nltk.tokenize.stanford_segmenter import StanfordSegmenter
seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar')

seg = StanfordSegmenter()
seg.default_config('zh')
sent = u'这是斯坦福中文分词器测试'
print(seg.segment(sent))
# encoding: utf-8
import nltk
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tag import StanfordNERTagger

segmenter = StanfordSegmenter(
    # jar that the segmenter depends on
    path_to_jar=
    r"/home/jiangix/document/stanford-segmenter/stanford-segmenter.jar",
    path_to_slf4j=r"/home/jiangix/document/slf4j/slf4j-api.jar",
    # segmentation data directory
    path_to_sihan_corpora_dict=
    r"/home/jiangix/document/stanford-segmenter/data",
    # PKU model, based on the People's Daily corpus released for the 2005 bakeoff
    path_to_model=r"/home/jiangix/document/stanford-segmenter/data/pku.gz",
    path_to_dict=
    r"/home/jiangix/document/stanford-segmenter/data/dict-chris6.ser.gz")

segmenter.default_config('zh')
result = segmenter.segment(u'我喜欢学习编程')

chi_tagger = StanfordNERTagger(
    model_filename=
    r"/home/jiangix/document/stanford-chinese-corenlp-models/chinese.misc.distsim.crf.ser.gz",
    path_to_jar=r"/home/jiangix/document/stanford-ner/stanford-ner.jar")
for word, tag in chi_tagger.tag(result.split()):
    print(word, tag)
Example #19
			sumContain = ''
			for f in fileNames:
				try:
					if data_type == 'CIRB010':
						root = ET.parse(dirPath+'/'+f).getroot()
						date = root[0][1].text.strip()
						title = root[0][2].text.strip()
						text = ''
						for p in root[0][3]:
							text += p.text.strip()
						contain = date + title + text
						sumContain += contain
					else:
						fin = open(dirPath+'/'+f, 'r')
						for line in fin.readlines():
							sumContain += line.strip()
				except:
					a = ''
				build_time += 1.

			parsed_data = segmenter.segment(sumContain).split()
			for w in parsed_data:
				vocabSet.add(w)

			print >> sys.stderr, '\rdone building '+str(float("{0:.2f}".format(build_time/total*100.)))+'% vocabulary set ', 

	print >> sys.stderr, '\nstart dumping vocabulary set'			
	vocab.dumpVocab(outputfile, vocabSet)
	print >> sys.stderr, 'done dumping vocabulary set'			

Example #20
def segment(sentence):
    path = '/media/razor/Files/Python27/nltk_data/stanford-segmenter-2015-12-09/'
    segmenter = StanfordSegmenter(
        path_to_jar=path + 'stanford-segmenter-3.6.0.jar',
        path_to_slf4j=path + 'slf4j-api.jar',
        path_to_model=path + 'data/pku.gz',
        path_to_dict=path + 'data/dict-chris6.ser.gz')

    return segmenter.segment(sentence)
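A sketch of how the function above might be called, assuming the /media/razor/... install path exists.

# Hypothetical call; the segmenter install path is an assumption.
print(segment(u'我爱北京天安门'))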
Example #21
File: NLTK.py  Project: LeonHanml/Python
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tokenize.stanford import CoreNLPTokenizer

path = "D:/www/data/nlpsoftware/stanford-segmenter"
segmenter = StanfordSegmenter(
    path_to_jar=path + "/stanford-segmenter.jar",
    path_to_sihan_corpora_dict=path + "/data",
    path_to_model=path + "/data/pku.gz",
    path_to_dict=path + "/data/dict-chris6.ser.gz",
    java_class='edu.stanford.nlp.ie.crf.CRFClassifier')
#
sentence = u"这是斯坦福中文分词器测试"
sentence = u"工信处女干事每月经过   下属   科室都要亲口交代24口交换机等技术性器件的安装工作"

segmenter.tokenize_sents(u"工信处")
result = segmenter.segment(sentence)
result2 = segmenter.segment_file(
    "D:/www/data/nlpdata/icwb2-data/testing/pku_test.utf8")
clean_content = "D:\\www\\data\\Weibo Data\\Weibo Data\\nlp/clean_content.txt"
# clean_content_out="D:\\www\\data\\Weibo Data\\Weibo Data\\nlp/clean_content_out.txt"
# result3 = segmenter.segment_file(clean_content)
print(type(result2))

# with open(clean_content_out,'wb+') as f:
#     f.writelines([(s+"\r\n").encode('utf8') for s in  clean_content_out])
print(result2)
# outfile = open("D:/www/data/nlpsoftware/outfile.txt",'w')
# outfile.write(result)
# outfile.close()
#
# stanford_postagger="D:\\www\\data/nlpsoftware/stanford-postagger-full-2017-06-09\\stanford-postagger.jar"
Example #22
#coding:UTF-8
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

segmenter = StanfordSegmenter(
    path_to_jar=
    r"D:\StanfordNLP\stanford-segmenter\stanford-segmenter-3.6.0.jar",
    path_to_slf4j=r"D:\StanfordNLP\stanford-segmenter\slf4j-api.jar",
    path_to_sihan_corpora_dict=r"D:\StanfordNLP\stanford-segmenter\data",
    path_to_model=r"D:\StanfordNLP\stanford-segmenter\data\pku.gz",
    path_to_dict=r"D:\StanfordNLP\stanford-segmenter\data\dict-chris6.ser.gz")
str = u"我在博客园开了一个博客,我的博客名叫伏草惟存,写了一些自然语言处理的文章。"
result = segmenter.segment(str)
print result
Example #23
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

segmenter = StanfordSegmenter(
    path_to_jar=
    r"/Users/cln/stanford-corenlp/segmenter/stanford-segmenter-3.9.1.jar",
    path_to_slf4j=r"/Users/cln/stanford-corenlp/slf4j-api.jar",
    java_class=r"edu.stanford.nlp.ie.crf.CRFClassifier",  # 分词模型
    path_to_model=r"/Users/cln/stanford-corenlp/segmenter/data/pku.gz",
    path_to_dict=
    r"/Users/cln/stanford-corenlp/segmenter/data/dict-chris6.ser.gz",
    path_to_sihan_corpora_dict=r"/Users/cln/stanford-corenlp/segmenter/data")

strs = "中国的一带一路带动了很多国家的发展。"

# Chinese word segmentation
ch_result = segmenter.segment(strs)
print('汉语分词:\n', ch_result)
print(type(ch_result), '\n')

from nltk.tokenize import word_tokenize

sent = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
# English tokenization
rr = word_tokenize(sent)
print('英语分词:\n', rr, '\n')

from nltk.tag import StanfordNERTagger

# English named entity recognition
eng_tagger = StanfordNERTagger(
    r'/Users/cln/stanford-corenlp/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',