Example #1
 def test_stanford_segmenter_arabic(self):
     """
     Test the Stanford Word Segmenter for Arabic (default config)
     """
     try:
         seg = StanfordSegmenter()
         seg.default_config('ar')
         sent = u'يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات'
         segmented_sent = seg.segment(sent.split())
         assert segmented_sent.split() == [
             'يبحث',
             'علم',
             'الحاسوب',
             'استخدام',
             'الحوسبة',
             'ب',
             'جميع',
             'اشكال',
             'ها',
             'ل',
             'حل',
             'المشكلات',
         ]
     except LookupError as e:
         raise SkipTest(str(e))
Example #2
 def test_stanford_segmenter_arabic(self):
     """
     Test the Stanford Word Segmenter for Arabic (default config)
     """
     try:
         seg = StanfordSegmenter()
         seg.default_config('ar')
         sent = u'يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات'
         segmented_sent = seg.segment(sent.split())
         assert segmented_sent.split() == [
             'يبحث',
             'علم',
             'الحاسوب',
             'استخدام',
             'الحوسبة',
             'ب',
             'جميع',
             'اشكال',
             'ها',
             'ل',
             'حل',
             'المشكلات',
         ]
     except LookupError as e:
         raise SkipTest(str(e)) from e
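Example #3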
def load_stanford_segmenter():
    try:
        seg = StanfordSegmenter()
        seg.default_config("ar")
        seg.default_config("zh")
        return True
    except LookupError:
        return False
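This helper only reports whether the Stanford Segmenter data can be loaded. A plausible way to use it (a sketch, assuming the helper is importable in the test module and using pytest's standard skipif marker) is to gate a test on it:

import pytest

# Hypothetical wiring: skip when the Stanford Segmenter jars/models are missing.
@pytest.mark.skipif(not load_stanford_segmenter(),
                    reason="Stanford Segmenter jars/models not found")
def test_segmenter_available():
    assert load_stanford_segmenter()

Example #4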
 def test_stanford_segmenter_chinese(self):
     """
     Test the Stanford Word Segmenter for Chinese (default config)
     """
     seg = StanfordSegmenter()
     seg.default_config("zh")
     sent = "这是斯坦福中文分词器测试"
     segmented_sent = seg.segment(sent.split())
     assert segmented_sent.split() == ["这", "是", "斯坦福", "中文", "分词器", "测试"]
Example #5
File: test.py Project: weiang/baike
def test_segmenter():
    segmenter = StanfordSegmenter(
            path_to_sihan_corpora_dict="/home/angwei/bin/stanford/stanford-segmenter/data/",
            path_to_model="/home/angwei/bin/stanford/stanford-segmenter/data/pku.gz",
            path_to_dict="/home/angwei/bin/stanford/stanford-segmenter/data/dict-chris6.ser.gz"
            )
#    segmenter = StanfordSegmenter()
    res = segmenter.segment(u"北海已成为中国对外开放中升起的一颗明星")
    print(type(res))
    print(res.encode('utf-8'))
Example #6
 def test_stanford_segmenter_chinese(self):
     """
     Test the Stanford Word Segmenter for Chinese (default config)
     """
     try:
         seg = StanfordSegmenter()
         seg.default_config('zh')
         sent = u"这是斯坦福中文分词器测试"
         segmented_sent = seg.segment(sent.split())
         assert segmented_sent.split() == ['这', '是', '斯坦福', '中文', '分词器', '测试']
     except LookupError as e:
         pytest.skip(str(e))
Example #7
 def test_stanford_segmenter_chinese(self):
     """
     Test the Stanford Word Segmenter for Chinese (default config)
     """
     try:
         seg = StanfordSegmenter()
         seg.default_config('zh')
         sent = u"这是斯坦福中文分词器测试"
         segmented_sent = seg.segment(sent.split())
         assert segmented_sent.split() == ['这', '是', '斯坦福', '中文', '分词器', '测试']
     except LookupError as e:
         raise SkipTest(str(e))
Example #8
    def __init__(self, segmenter_root=os.path.join(_get_home_dir(), 'stanford-segmenter'),
                 slf4j_root=os.path.join(_get_home_dir(), 'slf4j'),
                 java_class='edu.stanford.nlp.ie.crf.CRFClassifier'):
        is_java_exist = os.system('java -version')
        assert is_java_exist == 0, 'Java is not installed. You must install Java 8.0 ' \
                                   'in order to use the NLTKStanfordSegmenter'
        try:
            from nltk.tokenize import StanfordSegmenter
        except ImportError:
            raise ImportError('NLTK or relevant packages are not installed. You must install NLTK '
                              'in order to use the NLTKStanfordSegmenter. You can refer to the '
                              'official installation guide in https://www.nltk.org/install.html.')
        path_to_jar = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                   'stanford-segmenter-3.9.1.jar')
        path_to_model = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                     'data', 'pku.gz')
        path_to_dict = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                    'data', 'dict-chris6.ser.gz')
        path_to_sihan_corpora_dict = os.path.join(segmenter_root,
                                                  'stanford-segmenter-2018-02-27', 'data')
        segmenter_url = 'https://nlp.stanford.edu/software/stanford-segmenter-2018-02-27.zip'
        segmenter_sha1 = 'aa27a6433704b7b4c6a14be1c126cb4b14b8f57b'
        stanford_segmenter = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27.zip')
        if not os.path.exists(path_to_jar) or \
                not os.path.exists(path_to_model) or \
                not os.path.exists(path_to_dict) or \
                not os.path.exists(path_to_sihan_corpora_dict) or \
                not check_sha1(filename=stanford_segmenter, sha1_hash=segmenter_sha1):
            # automatically download the files from the website and place them in segmenter_root
            if not os.path.exists(segmenter_root):
                os.mkdir(segmenter_root)
            download(url=segmenter_url, path=segmenter_root, sha1_hash=segmenter_sha1)
            _extract_archive(file=stanford_segmenter, target_dir=segmenter_root)

        path_to_slf4j = os.path.join(slf4j_root, 'slf4j-1.7.25', 'slf4j-api-1.7.25.jar')
        slf4j_url = 'https://www.slf4j.org/dist/slf4j-1.7.25.zip'
        slf4j_sha1 = '89ea41ad6ebe1b190139421bb7c8d981e9df1625'
        slf4j = os.path.join(slf4j_root, 'slf4j-1.7.25.zip')
        if not os.path.exists(path_to_slf4j) or \
                not check_sha1(filename=slf4j, sha1_hash=slf4j_sha1):
            # automatically download the files from the website and place them in slf4j_root
            if not os.path.exists(slf4j_root):
                os.mkdir(slf4j_root)
            download(url=slf4j_url, path=slf4j_root, sha1_hash=slf4j_sha1)
            _extract_archive(file=slf4j, target_dir=slf4j_root)
        self._tokenizer = StanfordSegmenter(java_class=java_class, path_to_jar=path_to_jar,
                                            path_to_slf4j=path_to_slf4j, path_to_dict=path_to_dict,
                                            path_to_sihan_corpora_dict=path_to_sihan_corpora_dict,
                                            path_to_model=path_to_model)
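download, check_sha1 and _extract_archive above are helpers from the surrounding project, not the standard library. A rough standard-library sketch of the same verify-then-fetch idea (function names here are illustrative, not the project's API):

import hashlib
import os
import urllib.request

def sha1_matches(path, expected_sha1):
    # True only if the file exists and its SHA-1 digest matches the expected value.
    if not os.path.exists(path):
        return False
    digest = hashlib.sha1()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest() == expected_sha1

def fetch_if_needed(url, path, expected_sha1):
    # Download only when the cached copy is missing or corrupt, then re-verify.
    if not sha1_matches(path, expected_sha1):
        urllib.request.urlretrieve(url, path)
        if not sha1_matches(path, expected_sha1):
            raise IOError('checksum mismatch after downloading %s' % url)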
Example #9
def setup_module(module):
    import pytest

    try:
        seg = StanfordSegmenter()
        seg.default_config("ar")
        seg.default_config("zh")
    except LookupError as e:
        pytest.skip("Tests for nltk.tokenize.stanford_segmenter skipped: %s" %
                    str(e))

    try:
        StanfordTokenizer()
    except LookupError:
        pytest.skip(
            "Tests for nltk.tokenize.stanford are skipped because the stanford postagger jar doesn't exist"
        )
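Example #10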
 def test_stanford_segmenter_arabic(self):
     """
     Test the Stanford Word Segmenter for Arabic (default config)
     """
     seg = StanfordSegmenter()
     seg.default_config("ar")
     sent = "يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات"
     segmented_sent = seg.segment(sent.split())
     assert segmented_sent.split() == [
         "يبحث",
         "علم",
         "الحاسوب",
         "استخدام",
         "الحوسبة",
         "ب",
         "جميع",
         "اشكال",
         "ها",
         "ل",
         "حل",
         "المشكلات",
     ]
Example #11
 def define_stanford_segmenter(
         self,
         java_class="edu.stanford.nlp.ie.crf.CRFClassifier",
         path_to_model="/Library/Tools/stanford/segmenter/data/pku.gz",
         path_to_dict="/Library/Tools/stanford/segmenter/data/dict-chris6.ser.gz",
         path_to_sihan_corpora_dict="/Library/Tools/stanford/segmenter/data/"
 ):
     _stanford_segmenter = StanfordSegmenter(
         java_class=java_class,
         path_to_model=path_to_model,
         path_to_dict=path_to_dict,
         path_to_sihan_corpora_dict=path_to_sihan_corpora_dict)
     return _stanford_segmenter
Example #13
    def __init__(self, **kwargs):
        self.conf_io = conf.load("io")
        self.conf_corenlp = conf.load("stanford_corenlp")
        self.conf_embedding = conf.load("embedding")
        conf_segmenter = self.conf_corenlp["segmenter"]
        conf_tokenizer = self.conf_corenlp["tokenizer"]
        conf_postagger = self.conf_corenlp["postagger"]
        prefix = self.conf_corenlp["prefix"]

        self.segmenter = StanfordSegmenter(
            path_to_jar=prefix + conf_segmenter["path_to_jar"],
            path_to_sihan_corpora_dict=prefix +
            conf_segmenter["path_to_sihan_corpora_dict"],
            path_to_model=prefix + conf_segmenter["path_to_model"],
            path_to_dict=prefix + conf_segmenter["path_to_dict"],
            path_to_slf4j=prefix + conf_segmenter["path_to_slf4j"],
            encoding=conf_segmenter["encoding"])
        self.enTokenizer = StanfordTokenizer(path_to_jar=prefix +
                                             conf_tokenizer["path_to_jar"])
        self.zh_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_zh"],
            path_to_jar=prefix + conf_postagger["path_to_jar"])
        self.en_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_en"],
            path_to_jar=prefix + conf_postagger["path_to_jar"])
        self.frequency = defaultdict(int)
        pynlpir.open()
        pynlpir.nlpir.ImportUserDict(conf.load("pynlpir")["user_dict"],
                                     Overwrite=False)

        try:
            self.excluded_docs = kwargs["excluded_docs"]
        except KeyError:
            self.excluded_docs = [""]

        # experimental features
        self.f_token_indexes = prefix + conf.load("pynlpir")["user_dict"]
Example #14
import os

# The original snippet omits some imports; os plus these module aliases for the
# Stanford tagger/parser wrappers (assumed here) are needed for STag and SParse below.
import nltk.tag.stanford as STag
import nltk.parse.stanford as SParse
from nltk.tokenize import StanfordSegmenter

from polyglot.text import Text
from rake_nltk import Rake

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

os.environ[
    'STANFORD_MODELS'] = 'stanford-segmenter-2018-10-16/data/;stanford-postagger-full-2018-10-16/models/'
os.environ['STANFORD_PARSER'] = 'stanford-parser-full-2018-10-17'
os.environ['CLASSPATH'] = 'stanford-parser-full-2018-10-17'
os.environ['JAVAHOME'] = 'C:/Program Files/Java/jdk-11.0.1'

segmenter = StanfordSegmenter(
    'stanford-segmenter-2018-10-16/stanford-segmenter-3.9.2.jar')
segmenter.default_config('ar')
text = segmenter.segment_file('sample.txt')
print(text)

tagger = STag.StanfordPOSTagger(
    'arabic.tagger',
    'stanford-postagger-full-2018-10-16/stanford-postagger.jar')
for tag in tagger.tag(text.split()):
    print(tag[1])

parser = SParse.StanfordParser(
    model_path='edu/stanford/nlp/models/lexparser/arabicFactored.ser.gz')
sentences = parser.raw_parse_sents(text.split('.'))
for line in sentences:
    for sentence in line:
        print(sentence)  # the source snippet is truncated here; printing the parse tree is the likely intent
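Example #15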
import os

# As in the previous example, the os import and the STag/SParse aliases for the
# Stanford tagger/parser wrappers are assumed; the original snippet omits them.
import nltk.tag.stanford as STag
import nltk.parse.stanford as SParse
from nltk.tokenize import StanfordSegmenter
from rake_nltk import Rake

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

os.environ[
    'STANFORD_MODELS'] = 'C:\\Users\\snmuj\\OneDrive\\Documents\\salm\\stanford-segmenter-2018-10-16\\data;C:\\Users\\lenovo\\Documents\\salm\\stanford-postagger-full-2018-10-16\\models'
os.environ[
    'STANFORD_PARSER'] = 'C:\\Users\\snmuj\\OneDrive\\Documents\\salm\\stanford-parser-full-2018-10-17'
os.environ[
    'CLASSPATH'] = 'C:\\Users\\snmuj\\OneDrive\\Documents\\stanford-parser-full-2018-10-17'
# Note: JAVAHOME should point to the JDK installation directory, not the installer .exe
os.environ['JAVAHOME'] = 'C:\\Program Files\\Java\\jdk-14.0.2_windows-x64_bin.exe'

segmenter = StanfordSegmenter(
    'C:\\Users\\snmuj\\OneDrive\\Documents\\salm\\stanford-segmenter-2018-10-16\\stanford-segmenter-3.9.2.jar'
)
segmenter.default_config('ar')
text = segmenter.segment_file('text file')
print(text)

tagger = STag.StanfordPOSTagger(
    'arabic.tagger',
    'C:\\Users\\snmuj\\OneDrive\\Documents\\stanford-postagger-full-2018-10-16\\stanford-postagger.jar'
)
for tag in tagger.tag(text.split()):
    print(tag[1])

parser = SParse.StanfordParser(
    model_path='edu/stanford/nlp/models/lexparser/arabicFactored.ser.gz')
sentences = parser.raw_parse_sents(text.split('.'))
Example #16
import os 
from nltk.tokenize import StanfordSegmenter

os.environ['STANFORD_SEGMENTER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09'
os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09/stanford-segmenter.jar:/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09/slf4j-api.jar'

os.environ['JAVA_HOME'] = '/Library/Java/JavaVirtualMachines/jdk1.8.0_40.jdk/Contents/Home'

segmenter = StanfordSegmenter(
	path_to_sihan_corpora_dict="/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09/data/",
	path_to_model="/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09/data/pku.gz",
	path_to_dict="/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09/data/dict-chris6.ser.gz"
)

res = segmenter.segment("中山大学在西子湾")

print(res)
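The CLASSPATH above uses the Unix-style ':' separator; on Windows it has to be ';'. A small portable variant using os.pathsep (the directory below is a placeholder for wherever the segmenter actually lives):

import os

# Placeholder location; point this at your stanford-segmenter installation.
segmenter_home = '/path/to/stanford-segmenter-2015-12-09'

os.environ['STANFORD_SEGMENTER_PATH'] = segmenter_home
# os.pathsep is ':' on macOS/Linux and ';' on Windows.
os.environ['CLASSPATH'] = os.pathsep.join([
    os.path.join(segmenter_home, 'stanford-segmenter.jar'),
    os.path.join(segmenter_home, 'slf4j-api.jar'),
])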
Example #17
class NLTKStanfordSegmenter(object):
    r"""Apply the Stanford Chinese Word Segmenter implemented in NLTK.

    Users of this class are required to install Java, NLTK and download Stanford Word Segmenter

    Parameters
    ----------
    segmenter_root : str, default '$MXNET_HOME/stanford-segmenter'
        Path to folder for storing stanford segmenter.
        MXNET_HOME defaults to '~/.mxnet'.

    slf4j_root : str, default '$MXNET_HOME/slf4j'
        Path to folder for storing slf4j.
        MXNET_HOME defaults to '~/.mxnet'

    java_class : str, default 'edu.stanford.nlp.ie.crf.CRFClassifier'
        The learning algorithm used for segmentation

    Examples
    --------
    >>> tokenizer = gluonnlp.data.NLTKStanfordSegmenter() #doctest:+SKIP
    >>> tokenizer(u"我来到北京清华大学") #doctest:+SKIP
    ['我', '来到', '北京', '清华大学']
    >>> tokenizer(u"小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #doctest:+SKIP
    ['小明', '硕士', '毕业', '于', '中国科学院', '计算所', ',', '后', '在', '日本京都大学', '深造']

    """
    def __init__(self, segmenter_root=os.path.join(_get_home_dir(), 'stanford-segmenter'),
                 slf4j_root=os.path.join(_get_home_dir(), 'slf4j'),
                 java_class='edu.stanford.nlp.ie.crf.CRFClassifier'):
        is_java_exist = os.system('java -version')
        assert is_java_exist == 0, 'Java is not installed. You must install Java 8.0 ' \
                                   'in order to use the NLTKStanfordSegmenter'
        try:
            from nltk.tokenize import StanfordSegmenter
        except ImportError:
            raise ImportError('NLTK or relevant packages are not installed. You must install NLTK '
                              'in order to use the NLTKStanfordSegmenter. You can refer to the '
                              'official installation guide in https://www.nltk.org/install.html.')
        path_to_jar = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                   'stanford-segmenter-3.9.1.jar')
        path_to_model = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                     'data', 'pku.gz')
        path_to_dict = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                    'data', 'dict-chris6.ser.gz')
        path_to_sihan_corpora_dict = os.path.join(segmenter_root,
                                                  'stanford-segmenter-2018-02-27', 'data')
        segmenter_url = 'https://nlp.stanford.edu/software/stanford-segmenter-2018-02-27.zip'
        segmenter_sha1 = 'aa27a6433704b7b4c6a14be1c126cb4b14b8f57b'
        stanford_segmenter = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27.zip')
        if not os.path.exists(path_to_jar) or \
                not os.path.exists(path_to_model) or \
                not os.path.exists(path_to_dict) or \
                not os.path.exists(path_to_sihan_corpora_dict) or \
                not check_sha1(filename=stanford_segmenter, sha1_hash=segmenter_sha1):
            # automatically download the files from the website and place them in segmenter_root
            if not os.path.exists(segmenter_root):
                os.mkdir(segmenter_root)
            download(url=segmenter_url, path=segmenter_root, sha1_hash=segmenter_sha1)
            _extract_archive(file=stanford_segmenter, target_dir=segmenter_root)

        path_to_slf4j = os.path.join(slf4j_root, 'slf4j-1.7.25', 'slf4j-api-1.7.25.jar')
        slf4j_url = 'https://www.slf4j.org/dist/slf4j-1.7.25.zip'
        slf4j_sha1 = '89ea41ad6ebe1b190139421bb7c8d981e9df1625'
        slf4j = os.path.join(slf4j_root, 'slf4j-1.7.25.zip')
        if not os.path.exists(path_to_slf4j) or \
                not check_sha1(filename=slf4j, sha1_hash=slf4j_sha1):
            # automatically download the files from the website and place them in slf4j_root
            if not os.path.exists(slf4j_root):
                os.mkdir(slf4j_root)
            download(url=slf4j_url, path=slf4j_root, sha1_hash=slf4j_sha1)
            _extract_archive(file=slf4j, target_dir=slf4j_root)
        self._tokenizer = StanfordSegmenter(java_class=java_class, path_to_jar=path_to_jar,
                                            path_to_slf4j=path_to_slf4j, path_to_dict=path_to_dict,
                                            path_to_sihan_corpora_dict=path_to_sihan_corpora_dict,
                                            path_to_model=path_to_model)

    def __call__(self, sample):
        """

        Parameters
        ----------
        sample: str
            The Chinese sentence to tokenize. Better not to input sentence in other languages
            since this class is mainly used for Chinese Word Segmentation.

        Returns
        -------
        ret : list of strs
            List of tokens
        """
        return [tok for tok in self._tokenizer.segment(sample).strip().split()]
Example #18
#'/data/;C:/Users/dell/Documents/graduation_backup/standford/stanford-postagger-full' \
#'-2018-10-16/models/ ')
#os.environ['STANFORD_MODELS'] = 'C:/Users/dell/Documents/graduation_backup/standford/stanford-segmenter-2018-10-16' \
#'/data/'
path = "standford/stanford-segmenter-2018-10-16/data/"
path = pkg_resources.resource_filename(__name__, path)
os.environ['STANFORD_MODELS'] = path

path = "standford/stanford-parser-full-2018-10-16"
path = pkg_resources.resource_filename(__name__, path)
os.environ['CLASSPATH'] = path
# os.environ['JAVAHOME'] = 'C:/Program Files/Java/jre1.8.0_171'

path = "standford/stanford-segmenter-2018-10-16/stanford-segmenter-3.9.2.jar"
path = pkg_resources.resource_filename(__name__, path)
segmenter = StanfordSegmenter(path)
segmenter.default_config('ar')


def check_if_side_effect(sentence):
    for word in sentence.split(" "):
        word = stemming(word)
        if word and word in medical_terms or word in human_parts_stemmed and word.strip(
        ) != "":
            return True

    return False


def split_conjunc(sentence):
    ...  # truncated in the source
Example #19
from nltk.tokenize import StanfordSegmenter
# from nltk.tokenize import StanfordTokenizer

segmenter = StanfordSegmenter(
    path_to_sihan_corpora_dict="/Users/cquan/Documents/model/stanford-segmenter-2018-10-16/data",
    path_to_model="/Users/cquan/Documents/model/stanford-segmenter-2018-10-16/data/pku.gz",
    path_to_dict="/Users/cquan/Documents/model/stanford-segmenter-2018-10-16/data/dict-chris6.ser.gz")
res = segmenter.segment(u'北海已成为中国对外开放中升起的一颗明星')
print(type(res))
print(res.encode('utf-8'))


from nltk.parse.stanford import StanfordParser
eng_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
res = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
for tree in res:
    print(tree)
    tree.draw()

ch_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
ch_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/chineseFactored.ser.gz')
res1 = list(ch_parser.parse(u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'.split()))
for tree in res1:
    print(tree)
    tree.draw()


from nltk.parse.stanford import StanfordDependencyParser
eng_parser = StanfordDependencyParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
res2 = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
# for row in res2[0].triples():
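The example stops at a commented-out loop over the dependency triples; iterating them usually looks like the sketch below (assuming the dependency parse above succeeded):

for governor, relation, dependent in res2[0].triples():
    # each triple is ((head_word, head_tag), relation, (dependent_word, dependent_tag))
    print(governor, relation, dependent)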
Example #20
from stanfordcorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP(
    'C:/Users/user/Desktop/NLP/stanford-corenlp-full-2018-10-05', lang="zh")
nlp.word_tokenize(ptt_sim)

# not sure why this takes forever
#%% NLTK StanfordSegmenter
from nltk.tokenize import StanfordSegmenter
#from nltk.tokenize.stanford_segmenter import StanfordSegmenter
segmenter = StanfordSegmenter(
    java_class="edu.stanford.nlp.ie.crf.CRFClassifier",
    path_to_jar=
    "C:/Users/user/Desktop/NLP/stanford-segmenter-2015-12-09/stanford-segmenter.jar",
    #path_to_slf4j="C:/Users/user/Desktop/NLP/stanford-corenlp-full-2018-10-05/slf4j-api.jar",
    path_to_sihan_corpora_dict=
    "C:/Users/user/Desktop/NLP/stanford-segmenter-2015-12-09/data",
    path_to_model=
    "C:/Users/user/Desktop/NLP/sanford-segmenter-2015-12-09/data/pku.gz",
    path_to_dict=
    "C:/Users/user/Desktop/NLP/stanford-segmenter-2015-12-09/data/dict-chris6.ser.gz",
)

text = ("这是斯坦福中文分词器测试")
segmenter.segment(text)

# couldn't get this one to run on this machine QQ

#%% NLTK CoreNLPParser
# must first run java from cmd (see nlp start server.txt)
from nltk.parse.corenlp import CoreNLPParser
corenlp_parser = CoreNLPParser('http://localhost:9001', encoding='utf8')
Example #21
class NLTKStanfordSegmenter:
    r"""Apply the Stanford Chinese Word Segmenter implemented in NLTK.

    Users of this class are required to install Java, NLTK and download Stanford Word Segmenter

    Parameters
    ----------
    segmenter_root : str, default '$MXNET_HOME/stanford-segmenter'
        Path to folder for storing stanford segmenter.
        MXNET_HOME defaults to '~/.mxnet'.

    slf4j_root : str, default '$MXNET_HOME/slf4j'
        Path to folder for storing slf4j.
        MXNET_HOME defaults to '~/.mxnet'

    java_class : str, default 'edu.stanford.nlp.ie.crf.CRFClassifier'
        The learning algorithm used for segmentation

    Examples
    --------
    >>> tokenizer = gluonnlp.data.NLTKStanfordSegmenter() #doctest:+SKIP
    >>> tokenizer('我来到北京清华大学') #doctest:+SKIP
    ['我', '来到', '北京', '清华大学']
    >>> tokenizer('小明硕士毕业于中国科学院计算所,后在日本京都大学深造') #doctest:+SKIP
    ['小明', '硕士', '毕业', '于', '中国科学院', '计算所', ',', '后', '在', '日本京都大学', '深造']

    """
    def __init__(self,
                 segmenter_root=os.path.join(get_home_dir(),
                                             'stanford-segmenter'),
                 slf4j_root=os.path.join(get_home_dir(), 'slf4j'),
                 java_class='edu.stanford.nlp.ie.crf.CRFClassifier'):
        is_java_exist = os.system('java -version')
        assert is_java_exist == 0, 'Java is not installed. You must install Java 8.0 ' \
                                   'in order to use the NLTKStanfordSegmenter'
        try:
            from nltk.tokenize import StanfordSegmenter
        except ImportError:
            raise ImportError(
                'NLTK or relevant packages are not installed. You must install NLTK '
                'in order to use the NLTKStanfordSegmenter. You can refer to the '
                'official installation guide in https://www.nltk.org/install.html.'
            )
        path_to_jar = os.path.join(segmenter_root,
                                   'stanford-segmenter-2018-02-27',
                                   'stanford-segmenter-3.9.1.jar')
        path_to_model = os.path.join(segmenter_root,
                                     'stanford-segmenter-2018-02-27', 'data',
                                     'pku.gz')
        path_to_dict = os.path.join(segmenter_root,
                                    'stanford-segmenter-2018-02-27', 'data',
                                    'dict-chris6.ser.gz')
        path_to_sihan_corpora_dict = os.path.join(
            segmenter_root, 'stanford-segmenter-2018-02-27', 'data')
        segmenter_url = 'https://nlp.stanford.edu/software/stanford-segmenter-2018-02-27.zip'
        segmenter_sha1 = 'aa27a6433704b7b4c6a14be1c126cb4b14b8f57b'
        stanford_segmenter = os.path.join(segmenter_root,
                                          'stanford-segmenter-2018-02-27.zip')
        if not os.path.exists(path_to_jar) or \
                not os.path.exists(path_to_model) or \
                not os.path.exists(path_to_dict) or \
                not os.path.exists(path_to_sihan_corpora_dict) or \
                not check_sha1(filename=stanford_segmenter, sha1_hash=segmenter_sha1):
            # automatically download the files from the website and place them in segmenter_root
            if not os.path.exists(segmenter_root):
                os.mkdir(segmenter_root)
            download(url=segmenter_url,
                     path=segmenter_root,
                     sha1_hash=segmenter_sha1)
            _extract_archive(file=stanford_segmenter,
                             target_dir=segmenter_root)

        path_to_slf4j = os.path.join(slf4j_root, 'slf4j-1.7.25',
                                     'slf4j-api-1.7.25.jar')
        slf4j_url = 'https://www.slf4j.org/dist/slf4j-1.7.25.zip'
        slf4j_sha1 = '89ea41ad6ebe1b190139421bb7c8d981e9df1625'
        slf4j = os.path.join(slf4j_root, 'slf4j-1.7.25.zip')
        if not os.path.exists(path_to_slf4j) or \
                not check_sha1(filename=slf4j, sha1_hash=slf4j_sha1):
            # automatically download the files from the website and place them in slf4j_root
            if not os.path.exists(slf4j_root):
                os.mkdir(slf4j_root)
            download(url=slf4j_url, path=slf4j_root, sha1_hash=slf4j_sha1)
            _extract_archive(file=slf4j, target_dir=slf4j_root)
        self._tokenizer = StanfordSegmenter(
            java_class=java_class,
            path_to_jar=path_to_jar,
            path_to_slf4j=path_to_slf4j,
            path_to_dict=path_to_dict,
            path_to_sihan_corpora_dict=path_to_sihan_corpora_dict,
            path_to_model=path_to_model)

    def __call__(self, sample):
        """

        Parameters
        ----------
        sample: str
            The Chinese sentence to tokenize. Better not to input sentence in other languages
            since this class is mainly used for Chinese Word Segmentation.

        Returns
        -------
        ret : list of strs
            List of tokens
        """
        return [tok for tok in self._tokenizer.segment(sample).strip().split()]
Example #22
from opencc import OpenCC
from nltk.parse.stanford import StanfordParser
from nltk.tokenize import StanfordSegmenter
from nltk.tag import StanfordNERTagger
from nltk.tag import StanfordPOSTagger
import pickle, re, pymysql, jieba, os
import pandas as pd

chi_tagger = StanfordPOSTagger('./StanfordNLP/models/chinese-distsim.tagger',
							   './StanfordNLP/jars/stanford-ner.jar')
segmenter = StanfordSegmenter(
	java_class='edu.stanford.nlp.ie.crf.CRFClassifier',
	path_to_jar="./StanfordNLP/jars/stanford-segmenter-3.9.2.jar",
	path_to_slf4j="./StanfordNLP/jars/slf4j-api.jar",
	path_to_sihan_corpora_dict="./StanfordNLP/stanford-segmenter-2018-10-16/data",
	path_to_model="./StanfordNLP/stanford-segmenter-2018-10-16/data/pku.gz",
	path_to_dict="./StanfordNLP/stanford-segmenter-2018-10-16/data/dict-chris6.ser.gz"
)

os.environ["JAVA_HOME"] = "/tmp2/b05902109/jdk-12.0.1"#注意這邊你們電腦要安裝java jdk,並放入你們自己的jdk的安裝路徑
os.environ["CLASSPATH"] = "./StanfordNLP/stanford-parser-2018-10-17"
os.environ["STANFORD_MODELS"] = "./StanfordNLP/models"

ch_parser = StanfordParser(model_path='./StanfordNLP/models/chinesePCFG.ser.gz')
cc = OpenCC('t2s')  # (optional) convert Traditional Chinese to Simplified Chinese

def Get_Data_From_Mysql(source_name,keyword):
	contents_list=[]
	target_ids =[]
	db = pymysql.connect(host="18.217.252.187",port=3306, user="******",passwd="antimoneylaunderingisgood2",db="AML_News" ,charset='utf8')
	try: