def test_stanford_segmenter_arabic(self):
    """
    Test the Stanford Word Segmenter for Arabic (default config)
    """
    try:
        seg = StanfordSegmenter()
        seg.default_config('ar')
        sent = u'يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات'
        segmented_sent = seg.segment(sent.split())
        assert segmented_sent.split() == [
            'يبحث',
            'علم',
            'الحاسوب',
            'استخدام',
            'الحوسبة',
            'ب',
            'جميع',
            'اشكال',
            'ها',
            'ل',
            'حل',
            'المشكلات',
        ]
    except LookupError as e:
        raise SkipTest(str(e))
def test_stanford_segmenter_arabic(self):
    """
    Test the Stanford Word Segmenter for Arabic (default config)
    """
    try:
        seg = StanfordSegmenter()
        seg.default_config('ar')
        sent = u'يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات'
        segmented_sent = seg.segment(sent.split())
        assert segmented_sent.split() == [
            'يبحث',
            'علم',
            'الحاسوب',
            'استخدام',
            'الحوسبة',
            'ب',
            'جميع',
            'اشكال',
            'ها',
            'ل',
            'حل',
            'المشكلات',
        ]
    except LookupError as e:
        raise SkipTest(str(e)) from e
def test_stanford_segmenter_chinese(self):
    """
    Test the Stanford Word Segmenter for Chinese (default config)
    """
    seg = StanfordSegmenter()
    seg.default_config("zh")
    sent = "这是斯坦福中文分词器测试"
    segmented_sent = seg.segment(sent.split())
    assert segmented_sent.split() == ["这", "是", "斯坦福", "中文", "分词器", "测试"]
from nltk.tokenize.stanford_segmenter import StanfordSegmenter


def test_segmenter():
    # Paths below are machine-specific examples from the original snippet.
    segmenter = StanfordSegmenter(
        path_to_sihan_corpora_dict="/home/angwei/bin/stanford/stanford-segmenter/data/",
        path_to_model="/home/angwei/bin/stanford/stanford-segmenter/data/pku.gz",
        path_to_dict="/home/angwei/bin/stanford/stanford-segmenter/data/dict-chris6.ser.gz",
    )
    # segmenter = StanfordSegmenter()
    res = segmenter.segment(u"北海已成为中国对外开放中升起的一颗明星")
    print(type(res))
    print(res.encode('utf-8'))
def test_stanford_segmenter_chinese(self):
    """
    Test the Stanford Word Segmenter for Chinese (default config)
    """
    try:
        seg = StanfordSegmenter()
        seg.default_config('zh')
        sent = u"这是斯坦福中文分词器测试"
        segmented_sent = seg.segment(sent.split())
        assert segmented_sent.split() == ['这', '是', '斯坦福', '中文', '分词器', '测试']
    except LookupError as e:
        pytest.skip(str(e))
def test_stanford_segmenter_chinese(self):
    """
    Test the Stanford Word Segmenter for Chinese (default config)
    """
    try:
        seg = StanfordSegmenter()
        seg.default_config('zh')
        sent = u"这是斯坦福中文分词器测试"
        segmented_sent = seg.segment(sent.split())
        assert segmented_sent.split() == ['这', '是', '斯坦福', '中文', '分词器', '测试']
    except LookupError as e:
        raise SkipTest(str(e))
def test_stanford_segmenter_arabic(self):
    """
    Test the Stanford Word Segmenter for Arabic (default config)
    """
    seg = StanfordSegmenter()
    seg.default_config("ar")
    sent = "يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات"
    segmented_sent = seg.segment(sent.split())
    assert segmented_sent.split() == [
        "يبحث",
        "علم",
        "الحاسوب",
        "استخدام",
        "الحوسبة",
        "ب",
        "جميع",
        "اشكال",
        "ها",
        "ل",
        "حل",
        "المشكلات",
    ]
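# The default_config('zh') / default_config('ar') calls in the tests above locate the
# segmenter jar and its models through environment variables rather than explicit
# constructor arguments, and raise the LookupError that the tests turn into a skip when
# nothing is found. A minimal sketch of the environment the tests assume; the install
# path is a placeholder and the exact variables NLTK consults can vary between versions.
import os
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

SEG_HOME = '/opt/stanford-segmenter-2018-02-27'  # placeholder install directory
os.environ['STANFORD_SEGMENTER'] = SEG_HOME  # assumed variable name
os.environ['CLASSPATH'] = SEG_HOME + '/stanford-segmenter.jar'

seg = StanfordSegmenter()
seg.default_config('zh')  # raises LookupError if the jar or models cannot be found
print(seg.segment('这是斯坦福中文分词器测试'.split()))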
class NLTKStanfordSegmenter:
    r"""Apply the Stanford Chinese Word Segmenter implemented in NLTK.

    Users of this class are required to install Java and NLTK and to download the
    Stanford Word Segmenter.

    Parameters
    ----------
    segmenter_root : str, default '$MXNET_HOME/stanford-segmenter'
        Path to folder for storing the Stanford segmenter. MXNET_HOME defaults to '~/.mxnet'.
    slf4j_root : str, default '$MXNET_HOME/slf4j'
        Path to folder for storing slf4j. MXNET_HOME defaults to '~/.mxnet'.
    java_class : str, default 'edu.stanford.nlp.ie.crf.CRFClassifier'
        The learning algorithm used for segmentation.

    Examples
    --------
    >>> tokenizer = gluonnlp.data.NLTKStanfordSegmenter() #doctest:+SKIP
    >>> tokenizer('我来到北京清华大学') #doctest:+SKIP
    ['我', '来到', '北京', '清华大学']
    >>> tokenizer('小明硕士毕业于中国科学院计算所,后在日本京都大学深造') #doctest:+SKIP
    ['小明', '硕士', '毕业', '于', '中国科学院', '计算所', ',', '后', '在', '日本京都大学', '深造']

    """

    def __init__(self, segmenter_root=os.path.join(get_home_dir(), 'stanford-segmenter'),
                 slf4j_root=os.path.join(get_home_dir(), 'slf4j'),
                 java_class='edu.stanford.nlp.ie.crf.CRFClassifier'):
        is_java_exist = os.system('java -version')
        assert is_java_exist == 0, 'Java is not installed. You must install Java 8.0 ' \
                                   'in order to use the NLTKStanfordSegmenter'
        try:
            from nltk.tokenize import StanfordSegmenter
        except ImportError:
            raise ImportError(
                'NLTK or relevant packages are not installed. You must install NLTK '
                'in order to use the NLTKStanfordSegmenter. You can refer to the '
                'official installation guide at https://www.nltk.org/install.html.')
        path_to_jar = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                   'stanford-segmenter-3.9.1.jar')
        path_to_model = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                     'data', 'pku.gz')
        path_to_dict = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                    'data', 'dict-chris6.ser.gz')
        path_to_sihan_corpora_dict = os.path.join(
            segmenter_root, 'stanford-segmenter-2018-02-27', 'data')
        segmenter_url = 'https://nlp.stanford.edu/software/stanford-segmenter-2018-02-27.zip'
        segmenter_sha1 = 'aa27a6433704b7b4c6a14be1c126cb4b14b8f57b'
        stanford_segmenter = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27.zip')
        if not os.path.exists(path_to_jar) or \
                not os.path.exists(path_to_model) or \
                not os.path.exists(path_to_dict) or \
                not os.path.exists(path_to_sihan_corpora_dict) or \
                not check_sha1(filename=stanford_segmenter, sha1_hash=segmenter_sha1):
            # Automatically download the files from the website and place them in segmenter_root.
            if not os.path.exists(segmenter_root):
                os.mkdir(segmenter_root)
            download(url=segmenter_url, path=segmenter_root, sha1_hash=segmenter_sha1)
            _extract_archive(file=stanford_segmenter, target_dir=segmenter_root)
        path_to_slf4j = os.path.join(slf4j_root, 'slf4j-1.7.25', 'slf4j-api-1.7.25.jar')
        slf4j_url = 'https://www.slf4j.org/dist/slf4j-1.7.25.zip'
        slf4j_sha1 = '89ea41ad6ebe1b190139421bb7c8d981e9df1625'
        slf4j = os.path.join(slf4j_root, 'slf4j-1.7.25.zip')
        if not os.path.exists(path_to_slf4j) or \
                not check_sha1(filename=slf4j, sha1_hash=slf4j_sha1):
            # Automatically download the files from the website and place them in slf4j_root.
            if not os.path.exists(slf4j_root):
                os.mkdir(slf4j_root)
            download(url=slf4j_url, path=slf4j_root, sha1_hash=slf4j_sha1)
            _extract_archive(file=slf4j, target_dir=slf4j_root)
        self._tokenizer = StanfordSegmenter(
            java_class=java_class, path_to_jar=path_to_jar, path_to_slf4j=path_to_slf4j,
            path_to_dict=path_to_dict, path_to_sihan_corpora_dict=path_to_sihan_corpora_dict,
            path_to_model=path_to_model)

    def __call__(self, sample):
        """
        Parameters
        ----------
        sample : str
            The Chinese sentence to tokenize. Avoid passing sentences in other languages,
            since this class is mainly used for Chinese word segmentation.

        Returns
        -------
        ret : list of strs
            List of tokens
        """
        return [tok for tok in self._tokenizer.segment(sample).strip().split()]
class NLTKStanfordSegmenter(object):
    r"""Apply the Stanford Chinese Word Segmenter implemented in NLTK.

    Users of this class are required to install Java and NLTK and to download the
    Stanford Word Segmenter.

    Parameters
    ----------
    segmenter_root : str, default '$MXNET_HOME/stanford-segmenter'
        Path to folder for storing the Stanford segmenter. MXNET_HOME defaults to '~/.mxnet'.
    slf4j_root : str, default '$MXNET_HOME/slf4j'
        Path to folder for storing slf4j. MXNET_HOME defaults to '~/.mxnet'.
    java_class : str, default 'edu.stanford.nlp.ie.crf.CRFClassifier'
        The learning algorithm used for segmentation.

    Examples
    --------
    >>> tokenizer = gluonnlp.data.NLTKStanfordSegmenter() #doctest:+SKIP
    >>> tokenizer(u"我来到北京清华大学")
    ['我', '来到', '北京', '清华大学']
    >>> tokenizer(u"小明硕士毕业于中国科学院计算所,后在日本京都大学深造")
    ['小明', '硕士', '毕业', '于', '中国科学院', '计算所', ',', '后', '在', '日本京都大学', '深造']

    """

    def __init__(self, segmenter_root=os.path.join(_get_home_dir(), 'stanford-segmenter'),
                 slf4j_root=os.path.join(_get_home_dir(), 'slf4j'),
                 java_class='edu.stanford.nlp.ie.crf.CRFClassifier'):
        is_java_exist = os.system('java -version')
        assert is_java_exist == 0, 'Java is not installed. You must install Java 8.0 ' \
                                   'in order to use the NLTKStanfordSegmenter'
        try:
            from nltk.tokenize import StanfordSegmenter
        except ImportError:
            raise ImportError('NLTK or relevant packages are not installed. You must install NLTK '
                              'in order to use the NLTKStanfordSegmenter. You can refer to the '
                              'official installation guide at https://www.nltk.org/install.html.')
        path_to_jar = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                   'stanford-segmenter-3.9.1.jar')
        path_to_model = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                     'data', 'pku.gz')
        path_to_dict = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                    'data', 'dict-chris6.ser.gz')
        path_to_sihan_corpora_dict = os.path.join(segmenter_root,
                                                  'stanford-segmenter-2018-02-27', 'data')
        segmenter_url = 'https://nlp.stanford.edu/software/stanford-segmenter-2018-02-27.zip'
        segmenter_sha1 = 'aa27a6433704b7b4c6a14be1c126cb4b14b8f57b'
        stanford_segmenter = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27.zip')
        if not os.path.exists(path_to_jar) or \
                not os.path.exists(path_to_model) or \
                not os.path.exists(path_to_dict) or \
                not os.path.exists(path_to_sihan_corpora_dict) or \
                not check_sha1(filename=stanford_segmenter, sha1_hash=segmenter_sha1):
            # Automatically download the files from the website and place them in segmenter_root.
            if not os.path.exists(segmenter_root):
                os.mkdir(segmenter_root)
            download(url=segmenter_url, path=segmenter_root, sha1_hash=segmenter_sha1)
            _extract_archive(file=stanford_segmenter, target_dir=segmenter_root)
        path_to_slf4j = os.path.join(slf4j_root, 'slf4j-1.7.25', 'slf4j-api-1.7.25.jar')
        slf4j_url = 'https://www.slf4j.org/dist/slf4j-1.7.25.zip'
        slf4j_sha1 = '89ea41ad6ebe1b190139421bb7c8d981e9df1625'
        slf4j = os.path.join(slf4j_root, 'slf4j-1.7.25.zip')
        if not os.path.exists(path_to_slf4j) or \
                not check_sha1(filename=slf4j, sha1_hash=slf4j_sha1):
            # Automatically download the files from the website and place them in slf4j_root.
            if not os.path.exists(slf4j_root):
                os.mkdir(slf4j_root)
            download(url=slf4j_url, path=slf4j_root, sha1_hash=slf4j_sha1)
            _extract_archive(file=slf4j, target_dir=slf4j_root)
        self._tokenizer = StanfordSegmenter(java_class=java_class, path_to_jar=path_to_jar,
                                            path_to_slf4j=path_to_slf4j,
                                            path_to_dict=path_to_dict,
                                            path_to_sihan_corpora_dict=path_to_sihan_corpora_dict,
                                            path_to_model=path_to_model)

    def __call__(self, sample):
        """
        Parameters
        ----------
        sample : str
            The Chinese sentence to tokenize. Avoid passing sentences in other languages,
            since this class is mainly used for Chinese word segmentation.

        Returns
        -------
        ret : list of strs
            List of tokens
        """
        return [tok for tok in self._tokenizer.segment(sample).strip().split()]
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

segmenter = StanfordSegmenter(
    java_class="edu.stanford.nlp.ie.crf.CRFClassifier",
    path_to_jar="C:/Users/user/Desktop/NLP/stanford-segmenter-2015-12-09/stanford-segmenter.jar",
    # path_to_slf4j="C:/Users/user/Desktop/NLP/stanford-corenlp-full-2018-10-05/slf4j-api.jar",
    path_to_sihan_corpora_dict="C:/Users/user/Desktop/NLP/stanford-segmenter-2015-12-09/data",
    path_to_model="C:/Users/user/Desktop/NLP/stanford-segmenter-2015-12-09/data/pku.gz",
    path_to_dict="C:/Users/user/Desktop/NLP/stanford-segmenter-2015-12-09/data/dict-chris6.ser.gz",
)
text = "这是斯坦福中文分词器测试"
segmenter.segment(text)  # could not get this to run on this machine

#%% NLTK CoreNLPParser
# The CoreNLP server must be started from the command line first (see "nlp start server.txt").
from nltk.parse.corenlp import CoreNLPParser

corenlp_parser = CoreNLPParser('http://localhost:9001', encoding='utf8')
token_list = list(corenlp_parser.tokenize(ptt_sim))  # ptt_sim: sample text defined elsewhere in the original script

#%% thulac
import thulac

thu1 = thulac.thulac(seg_only=True)
thu1.cut(ptt_sim, text=True)   # ptt_sim / news_sim: sample texts defined elsewhere in the original script
thu1.cut(news_sim, text=True)
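# The CoreNLPParser call above only works once a CoreNLP server is listening on the URL it
# is given. A minimal sketch of that setup; the launch command in the comment is an
# assumption (memory, port and timeout are illustrative and must match the URL below), and
# the server needs models appropriate for the input language.
#   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9001 -timeout 15000
from nltk.parse.corenlp import CoreNLPParser

parser = CoreNLPParser('http://localhost:9001', encoding='utf8')
print(list(parser.tokenize('这是斯坦福中文分词器测试')))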
from nltk.tokenize import StanfordSegmenter
# from nltk.tokenize import StanfordTokenizer

segmenter = StanfordSegmenter(
    path_to_sihan_corpora_dict="/Users/cquan/Documents/model/stanford-segmenter-2018-10-16/data",
    path_to_model="/Users/cquan/Documents/model/stanford-segmenter-2018-10-16/data/pku.gz",
    path_to_dict="/Users/cquan/Documents/model/stanford-segmenter-2018-10-16/data/dict-chris6.ser.gz")
res = segmenter.segment(u'北海已成为中国对外开放中升起的一颗明星')
print(type(res))
print(res.encode('utf-8'))

from nltk.parse.stanford import StanfordParser

eng_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
res = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
for tree in res:
    print(tree)
    tree.draw()

ch_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
ch_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/chineseFactored.ser.gz')
res1 = list(ch_parser.parse(u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'.split()))
for tree in res1:
    print(tree)
    tree.draw()

from nltk.parse.stanford import StanfordDependencyParser

eng_parser = StanfordDependencyParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
res2 = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
# for row in res2[0].triples():
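# The final commented line above is cut off in the original. A minimal sketch of how the
# dependency triples are typically inspected, assuming res2[0] is the DependencyGraph
# produced by StanfordDependencyParser; each triple has the form
# ((governor_word, governor_tag), relation, (dependent_word, dependent_tag)).
for governor, relation, dependent in res2[0].triples():
    print(governor, relation, dependent)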
import os
from nltk.tokenize import StanfordSegmenter

os.environ['STANFORD_SEGMENTER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09'
os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09/stanford-segmenter.jar:/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09/slf4j-api.jar'
os.environ['JAVA_HOME'] = '/Library/Java/JavaVirtualMachines/jdk1.8.0_40.jdk/Contents/Home'

segmenter = StanfordSegmenter(
    path_to_sihan_corpora_dict="/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09/data/",
    path_to_model="/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09/data/pku.gz",
    path_to_dict="/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09/data/dict-chris6.ser.gz"
)

res = segmenter.segment("中山大学在西子湾")
print(res)
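# One portability note on the CLASSPATH setting above: the ':' separator is specific to
# macOS/Linux, while the Windows paths in the earlier snippet would need ';'. A small
# sketch that builds the value with os.pathsep so the same code works on either platform;
# the install directory is a placeholder.
import os

seg_dir = '/path/to/stanford-segmenter-2015-12-09'  # placeholder install directory
os.environ['CLASSPATH'] = os.pathsep.join([
    os.path.join(seg_dir, 'stanford-segmenter.jar'),
    os.path.join(seg_dir, 'slf4j-api.jar'),
])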