Example #1
 def test_stanford_segmenter_arabic(self):
     """
     Test the Stanford Word Segmenter for Arabic (default config)
     """
     try:
         seg = StanfordSegmenter()
         seg.default_config('ar')
         sent = u'يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات'
         segmented_sent = seg.segment(sent.split())
         assert segmented_sent.split() == [
             'يبحث',
             'علم',
             'الحاسوب',
             'استخدام',
             'الحوسبة',
             'ب',
             'جميع',
             'اشكال',
             'ها',
             'ل',
             'حل',
             'المشكلات',
         ]
     except LookupError as e:
         raise SkipTest(str(e))
Example #2
 def test_stanford_segmenter_arabic(self):
     """
     Test the Stanford Word Segmenter for Arabic (default config)
     """
     try:
         seg = StanfordSegmenter()
         seg.default_config('ar')
         sent = u'يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات'
         segmented_sent = seg.segment(sent.split())
         assert segmented_sent.split() == [
             'يبحث',
             'علم',
             'الحاسوب',
             'استخدام',
             'الحوسبة',
             'ب',
             'جميع',
             'اشكال',
             'ها',
             'ل',
             'حل',
             'المشكلات',
         ]
     except LookupError as e:
         raise SkipTest(str(e)) from e
 def test_stanford_segmenter_chinese(self):
     """
     Test the Stanford Word Segmenter for Chinese (default config)
     """
     seg = StanfordSegmenter()
     seg.default_config("zh")
     sent = "这是斯坦福中文分词器测试"
     segmented_sent = seg.segment(sent.split())
     assert segmented_sent.split() == ["这", "是", "斯坦福", "中文", "分词器", "测试"]
Example #4
File: test.py  Project: weiang/baike
def test_segmenter():
    segmenter = StanfordSegmenter(
            path_to_sihan_corpora_dict="/home/angwei/bin/stanford/stanford-segmenter/data/",
            path_to_model="/home/angwei/bin/stanford/stanford-segmenter/data/pku.gz",
            path_to_dict="/home/angwei/bin/stanford/stanford-segmenter/data/dict-chris6.ser.gz"
            )
#    segmenter = StanfordSegmenter()
    res = segmenter.segment(u"北海已成为中国对外开放中升起的一颗明星")
    print(type(res))
    print(res.encode('utf-8'))
Example #5
 def test_stanford_segmenter_chinese(self):
     """
     Test the Stanford Word Segmenter for Chinese (default config)
     """
     try:
         seg = StanfordSegmenter()
         seg.default_config('zh')
         sent = u"这是斯坦福中文分词器测试"
         segmented_sent = seg.segment(sent.split())
         assert segmented_sent.split() == ['这', '是', '斯坦福', '中文', '分词器', '测试']
     except LookupError as e:
         pytest.skip(str(e))
Example #6
 def test_stanford_segmenter_chinese(self):
     """
     Test the Stanford Word Segmenter for Chinese (default config)
     """
     try:
         seg = StanfordSegmenter()
         seg.default_config('zh')
         sent = u"这是斯坦福中文分词器测试"
         segmented_sent = seg.segment(sent.split())
         assert segmented_sent.split() == ['这', '是', '斯坦福', '中文', '分词器', '测试']
     except LookupError as e:
         raise SkipTest(str(e))
 def test_stanford_segmenter_arabic(self):
     """
     Test the Stanford Word Segmenter for Arabic (default config)
     """
     seg = StanfordSegmenter()
     seg.default_config("ar")
     sent = "يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات"
     segmented_sent = seg.segment(sent.split())
     assert segmented_sent.split() == [
         "يبحث",
         "علم",
         "الحاسوب",
         "استخدام",
         "الحوسبة",
         "ب",
         "جميع",
         "اشكال",
         "ها",
         "ل",
         "حل",
         "المشكلات",
     ]
Example #8
class NLTKStanfordSegmenter:
    r"""Apply the Stanford Chinese Word Segmenter implemented in NLTK.

    Users of this class are required to install Java and NLTK, and to download the Stanford Word Segmenter.

    Parameters
    ----------
    segmenter_root : str, default '$MXNET_HOME/stanford-segmenter'
        Path to folder for storing stanford segmenter.
        MXNET_HOME defaults to '~/.mxnet'.

    slf4j_root : str, default '$MXNET_HOME/slf4j'
        Path to folder for storing slf4j.
        MXNET_HOME defaults to '~/.mxnet'.

    java_class : str, default 'edu.stanford.nlp.ie.crf.CRFClassifier'
        The learning algorithm used for segmentation

    Examples
    --------
    >>> tokenizer = gluonnlp.data.NLTKStanfordSegmenter() #doctest:+SKIP
    >>> tokenizer('我来到北京清华大学') #doctest:+SKIP
    ['我', '来到', '北京', '清华大学']
    >>> tokenizer('小明硕士毕业于中国科学院计算所,后在日本京都大学深造') #doctest:+SKIP
    ['小明', '硕士', '毕业', '于', '中国科学院', '计算所', ',', '后', '在', '日本京都大学', '深造']

    """
    def __init__(self,
                 segmenter_root=os.path.join(get_home_dir(),
                                             'stanford-segmenter'),
                 slf4j_root=os.path.join(get_home_dir(), 'slf4j'),
                 java_class='edu.stanford.nlp.ie.crf.CRFClassifier'):
        is_java_exist = os.system('java -version')
        assert is_java_exist == 0, 'Java is not installed. You must install Java 8.0 ' \
                                   'in order to use the NLTKStanfordSegmenter'
        try:
            from nltk.tokenize import StanfordSegmenter
        except ImportError:
            raise ImportError(
                'NLTK or relevant packages are not installed. You must install NLTK '
                'in order to use the NLTKStanfordSegmenter. You can refer to the '
                'official installation guide in https://www.nltk.org/install.html.'
            )
        path_to_jar = os.path.join(segmenter_root,
                                   'stanford-segmenter-2018-02-27',
                                   'stanford-segmenter-3.9.1.jar')
        path_to_model = os.path.join(segmenter_root,
                                     'stanford-segmenter-2018-02-27', 'data',
                                     'pku.gz')
        path_to_dict = os.path.join(segmenter_root,
                                    'stanford-segmenter-2018-02-27', 'data',
                                    'dict-chris6.ser.gz')
        path_to_sihan_corpora_dict = os.path.join(
            segmenter_root, 'stanford-segmenter-2018-02-27', 'data')
        segmenter_url = 'https://nlp.stanford.edu/software/stanford-segmenter-2018-02-27.zip'
        segmenter_sha1 = 'aa27a6433704b7b4c6a14be1c126cb4b14b8f57b'
        stanford_segmenter = os.path.join(segmenter_root,
                                          'stanford-segmenter-2018-02-27.zip')
        if not os.path.exists(path_to_jar) or \
                not os.path.exists(path_to_model) or \
                not os.path.exists(path_to_dict) or \
                not os.path.exists(path_to_sihan_corpora_dict) or \
                not check_sha1(filename=stanford_segmenter, sha1_hash=segmenter_sha1):
            # automatically download the files from the website and place them to stanford_root
            if not os.path.exists(segmenter_root):
                os.mkdir(segmenter_root)
            download(url=segmenter_url,
                     path=segmenter_root,
                     sha1_hash=segmenter_sha1)
            _extract_archive(file=stanford_segmenter,
                             target_dir=segmenter_root)

        path_to_slf4j = os.path.join(slf4j_root, 'slf4j-1.7.25',
                                     'slf4j-api-1.7.25.jar')
        slf4j_url = 'https://www.slf4j.org/dist/slf4j-1.7.25.zip'
        slf4j_sha1 = '89ea41ad6ebe1b190139421bb7c8d981e9df1625'
        slf4j = os.path.join(slf4j_root, 'slf4j-1.7.25.zip')
        if not os.path.exists(path_to_slf4j) or \
                not check_sha1(filename=slf4j, sha1_hash=slf4j_sha1):
            # automatically download the files from the website and place them to slf4j_root
            if not os.path.exists(slf4j_root):
                os.mkdir(slf4j_root)
            download(url=slf4j_url, path=slf4j_root, sha1_hash=slf4j_sha1)
            _extract_archive(file=slf4j, target_dir=slf4j_root)
        self._tokenizer = StanfordSegmenter(
            java_class=java_class,
            path_to_jar=path_to_jar,
            path_to_slf4j=path_to_slf4j,
            path_to_dict=path_to_dict,
            path_to_sihan_corpora_dict=path_to_sihan_corpora_dict,
            path_to_model=path_to_model)

    def __call__(self, sample):
        """

        Parameters
        ----------
        sample: str
            The Chinese sentence to tokenize. Avoid passing sentences in other languages,
            since this class is mainly intended for Chinese word segmentation.

        Returns
        -------
        ret : list of strs
            List of tokens
        """
        return [tok for tok in self._tokenizer.segment(sample).strip().split()]
Example #9
class NLTKStanfordSegmenter(object):
    r"""Apply the Stanford Chinese Word Segmenter implemented in NLTK.

    Users of this class are required to install Java and NLTK, and to download the Stanford Word Segmenter.

    Parameters
    ----------
    segmenter_root : str, default '$MXNET_HOME/stanford-segmenter'
        Path to folder for storing stanford segmenter.
        MXNET_HOME defaults to '~/.mxnet'.

    slf4j_root : str, default '$MXNET_HOME/slf4j'
        Path to folder for storing slf4j.
        MXNET_HOME defaults to '~/.mxnet'.

    java_class : str, default 'edu.stanford.nlp.ie.crf.CRFClassifier'
        The learning algorithm used for segmentation

    Examples
    --------
    >>> tokenizer = gluonnlp.data.NLTKStanfordSegmenter() #doctest:+SKIP
    >>> tokenizer(u"我来到北京清华大学") #doctest:+SKIP
    ['我', '来到', '北京', '清华大学']
    >>> tokenizer(u"小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #doctest:+SKIP
    ['小明', '硕士', '毕业', '于', '中国科学院', '计算所', ',', '后', '在', '日本京都大学', '深造']

    """
    def __init__(self, segmenter_root=os.path.join(_get_home_dir(), 'stanford-segmenter'),
                 slf4j_root=os.path.join(_get_home_dir(), 'slf4j'),
                 java_class='edu.stanford.nlp.ie.crf.CRFClassifier'):
        is_java_exist = os.system('java -version')
        assert is_java_exist == 0, 'Java is not installed. You must install Java 8.0 ' \
                                   'in order to use the NLTKStanfordSegmenter'
        try:
            from nltk.tokenize import StanfordSegmenter
        except ImportError:
            raise ImportError('NLTK or relevant packages are not installed. You must install NLTK '
                              'in order to use the NLTKStanfordSegmenter. You can refer to the '
                              'official installation guide in https://www.nltk.org/install.html.')
        path_to_jar = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                   'stanford-segmenter-3.9.1.jar')
        path_to_model = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                     'data', 'pku.gz')
        path_to_dict = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                    'data', 'dict-chris6.ser.gz')
        path_to_sihan_corpora_dict = os.path.join(segmenter_root,
                                                  'stanford-segmenter-2018-02-27', 'data')
        segmenter_url = 'https://nlp.stanford.edu/software/stanford-segmenter-2018-02-27.zip'
        segmenter_sha1 = 'aa27a6433704b7b4c6a14be1c126cb4b14b8f57b'
        stanford_segmenter = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27.zip')
        if not os.path.exists(path_to_jar) or \
                not os.path.exists(path_to_model) or \
                not os.path.exists(path_to_dict) or \
                not os.path.exists(path_to_sihan_corpora_dict) or \
                not check_sha1(filename=stanford_segmenter, sha1_hash=segmenter_sha1):
            # automatically download the files from the website and place them to stanford_root
            if not os.path.exists(segmenter_root):
                os.mkdir(segmenter_root)
            download(url=segmenter_url, path=segmenter_root, sha1_hash=segmenter_sha1)
            _extract_archive(file=stanford_segmenter, target_dir=segmenter_root)

        path_to_slf4j = os.path.join(slf4j_root, 'slf4j-1.7.25', 'slf4j-api-1.7.25.jar')
        slf4j_url = 'https://www.slf4j.org/dist/slf4j-1.7.25.zip'
        slf4j_sha1 = '89ea41ad6ebe1b190139421bb7c8d981e9df1625'
        slf4j = os.path.join(slf4j_root, 'slf4j-1.7.25.zip')
        if not os.path.exists(path_to_slf4j) or \
                not check_sha1(filename=slf4j, sha1_hash=slf4j_sha1):
            # automatically download the files from the website and place them to slf4j_root
            if not os.path.exists(slf4j_root):
                os.mkdir(slf4j_root)
            download(url=slf4j_url, path=slf4j_root, sha1_hash=slf4j_sha1)
            _extract_archive(file=slf4j, target_dir=slf4j_root)
        self._tokenizer = StanfordSegmenter(java_class=java_class, path_to_jar=path_to_jar,
                                            path_to_slf4j=path_to_slf4j, path_to_dict=path_to_dict,
                                            path_to_sihan_corpora_dict=path_to_sihan_corpora_dict,
                                            path_to_model=path_to_model)

    def __call__(self, sample):
        """

        Parameters
        ----------
        sample: str
            The Chinese sentence to tokenize. Avoid passing sentences in other languages,
            since this class is mainly intended for Chinese word segmentation.

        Returns
        -------
        ret : list of strs
            List of tokens
        """
        return [tok for tok in self._tokenizer.segment(sample).strip().split()]
Example #10
#from nltk.tokenize.stanford_segmenter import StanfordSegmenter
segmenter = StanfordSegmenter(
    java_class="edu.stanford.nlp.ie.crf.CRFClassifier",
    path_to_jar=
    "C:/Users/user/Desktop/NLP/stanford-segmenter-2015-12-09/stanford-segmenter.jar",
    #path_to_slf4j="C:/Users/user/Desktop/NLP/stanford-corenlp-full-2018-10-05/slf4j-api.jar",
    path_to_sihan_corpora_dict=
    "C:/Users/user/Desktop/NLP/stanford-segmenter-2015-12-09/data",
    path_to_model=
    "C:/Users/user/Desktop/NLP/sanford-segmenter-2015-12-09/data/pku.gz",
    path_to_dict=
    "C:/Users/user/Desktop/NLP/stanford-segmenter-2015-12-09/data/dict-chris6.ser.gz",
)

text = ("这是斯坦福中文分词器测试")
segmenter.segment(text)

# couldn't get this one to run on this machine :(

#%% NLTK CoreNLPParser
# must first start the Java server from cmd (see nlp start server.txt)
from nltk.parse.corenlp import CoreNLPParser
corenlp_parser = CoreNLPParser('http://localhost:9001', encoding='utf8')
token_list = list(corenlp_parser.tokenize(ptt_sim))
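# For reference, a hedged sketch of how that local server is usually launched from the
# CoreNLP distribution directory (exact jar names, memory, and port depend on your setup):
#   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9001 -timeout 15000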

#%% thulac
import thulac

thu1 = thulac.thulac(seg_only=True)
thu1.cut(ptt_sim, text=True)
thu1.cut(news_sim, text=True)
Example #11
from nltk.tokenize import StanfordSegmenter
# from nltk.tokenize import StanfordTokenizer

segmenter = StanfordSegmenter(
    path_to_sihan_corpora_dict="/Users/cquan/Documents/model/stanford-segmenter-2018-10-16/data",
    path_to_model="/Users/cquan/Documents/model/stanford-segmenter-2018-10-16/data/pku.gz",
    path_to_dict="/Users/cquan/Documents/model/stanford-segmenter-2018-10-16/data/dict-chris6.ser.gz")
res = segmenter.segment(u'北海已成为中国对外开放中升起的一颗明星')
print(type(res))
print(res.encode('utf-8'))


from nltk.parse.stanford import StanfordParser
eng_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
res = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
for tree in res:
    print(tree)
    tree.draw()

ch_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
ch_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/chineseFactored.ser.gz')
res1 = list(ch_parser.parse(u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'.split()))
for tree in res1:
    print(tree)
    tree.draw()


from nltk.parse.stanford import StanfordDependencyParser
eng_parser = StanfordDependencyParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
res2 = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
# for row in res2[0].triples():
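# A hedged sketch of how that loop is typically completed: res2[0] is an NLTK
# DependencyGraph, and triples() yields (governor, relation, dependent) tuples in which
# governor and dependent are (word, POS-tag) pairs.
for governor, relation, dependent in res2[0].triples():
    print(governor, relation, dependent)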
Example #12
import os 
from nltk.tokenize import StanfordSegmenter

os.environ['STANFORD_SEGMENTER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09'
os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09/stanford-segmenter.jar:/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09/slf4j-api.jar'

os.environ['JAVA_HOME'] = '/Library/Java/JavaVirtualMachines/jdk1.8.0_40.jdk/Contents/Home'

segmenter = StanfordSegmenter(
	path_to_sihan_corpora_dict="/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09/data/",
	path_to_model="/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09/data/pku.gz",
	path_to_dict="/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09/data/dict-chris6.ser.gz"
)

res = segmenter.segment("中山大学在西子湾")

print(res)