def test_stanford_segmenter_arabic(self):
    """Test the Stanford Word Segmenter for Arabic (default config)."""
    expected = [
        'يبحث', 'علم', 'الحاسوب', 'استخدام', 'الحوسبة', 'ب',
        'جميع', 'اشكال', 'ها', 'ل', 'حل', 'المشكلات',
    ]
    try:
        segmenter = StanfordSegmenter()
        segmenter.default_config('ar')
        sentence = u'يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات'
        assert segmenter.segment(sentence.split()).split() == expected
    except LookupError as e:
        # Stanford jars not available in this environment -> skip, not fail.
        raise SkipTest(str(e))
def test_stanford_segmenter_arabic(self):
    """Test the Stanford Word Segmenter for Arabic (default config)."""
    expected = [
        'يبحث', 'علم', 'الحاسوب', 'استخدام', 'الحوسبة', 'ب',
        'جميع', 'اشكال', 'ها', 'ل', 'حل', 'المشكلات',
    ]
    try:
        segmenter = StanfordSegmenter()
        segmenter.default_config('ar')
        sentence = u'يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات'
        assert segmenter.segment(sentence.split()).split() == expected
    except LookupError as e:
        # Stanford jars not available -> skip, chaining the original cause.
        raise SkipTest(str(e)) from e
def load_stanford_segmenter():
    """Return True iff the Stanford segmenter can be configured for both
    Arabic and Chinese (i.e. the jars/models are locatable)."""
    try:
        segmenter = StanfordSegmenter()
        for lang in ("ar", "zh"):
            segmenter.default_config(lang)
    except LookupError:
        return False
    return True
def test_stanford_segmenter_chinese(self):
    """Test the Stanford Word Segmenter for Chinese (default config)."""
    segmenter = StanfordSegmenter()
    segmenter.default_config("zh")
    segmented = segmenter.segment("这是斯坦福中文分词器测试".split())
    assert segmented.split() == ["这", "是", "斯坦福", "中文", "分词器", "测试"]
def test_segmenter():
    """Smoke-test StanfordSegmenter on a Chinese sentence using local model paths.

    FIX: the original used Python 2 `print` statements (`print type(res)`),
    which are syntax errors under Python 3; converted to the print() function.
    """
    segmenter = StanfordSegmenter(
        path_to_sihan_corpora_dict="/home/angwei/bin/stanford/stanford-segmenter/data/",
        path_to_model="/home/angwei/bin/stanford/stanford-segmenter/data/pku.gz",
        path_to_dict="/home/angwei/bin/stanford/stanford-segmenter/data/dict-chris6.ser.gz"
    )
    # segmenter = StanfordSegmenter()
    res = segmenter.segment(u"北海已成为中国对外开放中升起的一颗明星")
    print(type(res))
    print(res.encode('utf-8'))
def test_stanford_segmenter_chinese(self):
    """Test the Stanford Word Segmenter for Chinese (default config)."""
    try:
        segmenter = StanfordSegmenter()
        segmenter.default_config('zh')
        segmented = segmenter.segment(u"这是斯坦福中文分词器测试".split())
        assert segmented.split() == ['这', '是', '斯坦福', '中文', '分词器', '测试']
    except LookupError as e:
        # Required jars/models missing -> skip this test.
        pytest.skip(str(e))
def test_stanford_segmenter_chinese(self):
    """Test the Stanford Word Segmenter for Chinese (default config)."""
    try:
        segmenter = StanfordSegmenter()
        segmenter.default_config('zh')
        segmented = segmenter.segment(u"这是斯坦福中文分词器测试".split())
        assert segmented.split() == ['这', '是', '斯坦福', '中文', '分词器', '测试']
    except LookupError as e:
        # Required jars/models missing -> skip this test.
        raise SkipTest(str(e))
def __init__(self, segmenter_root=os.path.join(_get_home_dir(), 'stanford-segmenter'),
             slf4j_root=os.path.join(_get_home_dir(), 'slf4j'),
             java_class='edu.stanford.nlp.ie.crf.CRFClassifier'):
    """Locate (downloading and extracting if necessary) the Stanford segmenter
    and slf4j jars, then build the underlying NLTK ``StanfordSegmenter``.

    Parameters
    ----------
    segmenter_root : str
        Folder where the Stanford segmenter archive is stored/extracted.
    slf4j_root : str
        Folder where the slf4j archive is stored/extracted.
    java_class : str
        Java class name passed through to the NLTK wrapper.
    """
    # Probe for a working `java` on PATH; os.system returns the exit status.
    # NOTE(review): `assert` is stripped under `python -O`, and the two string
    # fragments below concatenate without a space ("...Java 8.0in order...").
    is_java_exist = os.system('java -version')
    assert is_java_exist == 0, 'Java is not installed. You must install Java 8.0' \
                               'in order to use the NLTKStanfordSegmenter'
    try:
        from nltk.tokenize import StanfordSegmenter
    except ImportError:
        raise ImportError('NLTK or relevant packages are not installed. You must install NLTK '
                          'in order to use the NLTKStanfordSegmenter. You can refer to the '
                          'official installation guide in https://www.nltk.org/install.html.')
    # Expected on-disk layout after extracting the 2018-02-27 release.
    path_to_jar = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                               'stanford-segmenter-3.9.1.jar')
    path_to_model = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                 'data', 'pku.gz')
    path_to_dict = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                'data', 'dict-chris6.ser.gz')
    path_to_sihan_corpora_dict = os.path.join(segmenter_root,
                                              'stanford-segmenter-2018-02-27', 'data')
    segmenter_url = 'https://nlp.stanford.edu/software/stanford-segmenter-2018-02-27.zip'
    segmenter_sha1 = 'aa27a6433704b7b4c6a14be1c126cb4b14b8f57b'
    stanford_segmenter = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27.zip')
    # (Re)download only when any artifact is missing or the archive checksum fails.
    if not os.path.exists(path_to_jar) or \
            not os.path.exists(path_to_model) or \
            not os.path.exists(path_to_dict) or \
            not os.path.exists(path_to_sihan_corpora_dict) or \
            not check_sha1(filename=stanford_segmenter, sha1_hash=segmenter_sha1):
        # automatically download the files from the website and place them to stanford_root
        if not os.path.exists(segmenter_root):
            os.mkdir(segmenter_root)
        download(url=segmenter_url, path=segmenter_root, sha1_hash=segmenter_sha1)
        _extract_archive(file=stanford_segmenter, target_dir=segmenter_root)
    # Same download-on-demand dance for the slf4j logging facade jar.
    path_to_slf4j = os.path.join(slf4j_root, 'slf4j-1.7.25', 'slf4j-api-1.7.25.jar')
    slf4j_url = 'https://www.slf4j.org/dist/slf4j-1.7.25.zip'
    slf4j_sha1 = '89ea41ad6ebe1b190139421bb7c8d981e9df1625'
    slf4j = os.path.join(slf4j_root, 'slf4j-1.7.25.zip')
    if not os.path.exists(path_to_slf4j) or \
            not check_sha1(filename=slf4j, sha1_hash=slf4j_sha1):
        # automatically download the files from the website and place them to slf4j_root
        if not os.path.exists(slf4j_root):
            os.mkdir(slf4j_root)
        download(url=slf4j_url, path=slf4j_root, sha1_hash=slf4j_sha1)
        _extract_archive(file=slf4j, target_dir=slf4j_root)
    # Delegate the actual segmentation to NLTK's wrapper.
    self._tokenizer = StanfordSegmenter(java_class=java_class, path_to_jar=path_to_jar,
                                        path_to_slf4j=path_to_slf4j,
                                        path_to_dict=path_to_dict,
                                        path_to_sihan_corpora_dict=path_to_sihan_corpora_dict,
                                        path_to_model=path_to_model)
def setup_module(module):
    """Skip this whole test module when the Stanford jars are unavailable."""
    import pytest

    try:
        segmenter = StanfordSegmenter()
        segmenter.default_config("ar")
        segmenter.default_config("zh")
    except LookupError as e:
        pytest.skip("Tests for nltk.tokenize.stanford_segmenter skipped: %s" % str(e))
    try:
        StanfordTokenizer()
    except LookupError:
        pytest.skip(
            "Tests for nltk.tokenize.stanford are skipped because the stanford postagger jar doesn't exist"
        )
def test_stanford_segmenter_arabic(self):
    """Test the Stanford Word Segmenter for Arabic (default config)."""
    segmenter = StanfordSegmenter()
    segmenter.default_config("ar")
    sentence = "يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات"
    expected = [
        "يبحث", "علم", "الحاسوب", "استخدام", "الحوسبة", "ب",
        "جميع", "اشكال", "ها", "ل", "حل", "المشكلات",
    ]
    assert segmenter.segment(sentence.split()).split() == expected
def define_stanford_segmenter(
        self,
        java_class="edu.stanford.nlp.ie.crf.CRFClassifier",
        path_to_model="/Library/Tools/stanford/segmenter/data/pku.gz",
        path_to_dict="/Library/Tools/stanford/segmenter/data/dict-chris6.ser.gz",
        path_to_sihan_corpora_dict="/Library/Tools/stanford/segmenter/data/"):
    """Build and return a StanfordSegmenter configured from the given paths."""
    return StanfordSegmenter(
        java_class=java_class,
        path_to_model=path_to_model,
        path_to_dict=path_to_dict,
        path_to_sihan_corpora_dict=path_to_sihan_corpora_dict,
    )
def __init__(self, **kwargs):
    """Set up the Stanford CoreNLP segmenter/tokenizer/POS taggers and pynlpir.

    Keyword Args:
        excluded_docs: optional list of document ids to exclude; defaults
            to [""] when not supplied.
    """
    self.conf_io = conf.load("io")
    self.conf_corenlp = conf.load("stanford_corenlp")
    self.conf_embedding = conf.load("embedding")
    conf_segmenter = self.conf_corenlp["segmenter"]
    conf_tokenizer = self.conf_corenlp["tokenizer"]
    conf_postagger = self.conf_corenlp["postagger"]
    # All tool paths in the config are relative to this common prefix.
    prefix = self.conf_corenlp["prefix"]
    self.segmenter = StanfordSegmenter(
        path_to_jar=prefix + conf_segmenter["path_to_jar"],
        path_to_sihan_corpora_dict=prefix + conf_segmenter["path_to_sihan_corpora_dict"],
        path_to_model=prefix + conf_segmenter["path_to_model"],
        path_to_dict=prefix + conf_segmenter["path_to_dict"],
        path_to_slf4j=prefix + conf_segmenter["path_to_slf4j"],
        encoding=conf_segmenter["encoding"])
    self.enTokenizer = StanfordTokenizer(
        path_to_jar=prefix + conf_tokenizer["path_to_jar"])
    self.zh_tagger = StanfordPOSTagger(
        prefix + conf_postagger["tagger_zh"],
        path_to_jar=prefix + conf_postagger["path_to_jar"])
    self.en_tagger = StanfordPOSTagger(
        prefix + conf_postagger["tagger_en"],
        path_to_jar=prefix + conf_postagger["path_to_jar"])
    self.frequency = defaultdict(int)
    pynlpir.open()
    pynlpir.nlpir.ImportUserDict(conf.load("pynlpir")["user_dict"], Overwrite=False)
    # FIX: previously a bare `except:` around kwargs["excluded_docs"], which
    # silently swallowed *any* error; only the missing-key default was intended.
    self.excluded_docs = kwargs.get("excluded_docs", [""])
    # experimental features
    self.f_token_indexes = prefix + conf.load("pynlpir")["user_dict"]
from nltk.tokenize import StanfordSegmenter from polyglot.text import Text from rake_nltk import Rake import nltk nltk.download('stopwords') from nltk.corpus import stopwords os.environ[ 'STANFORD_MODELS'] = 'stanford-segmenter-2018-10-16/data/;stanford-postagger-full-2018-10-16/models/' os.environ['STANFORD_PARSER'] = 'stanford-parser-full-2018-10-17' os.environ['CLASSPATH'] = 'stanford-parser-full-2018-10-17' os.environ['JAVAHOME'] = 'C:/Program Files/Java/jdk-11.0.1' segmenter = StanfordSegmenter( 'stanford-segmenter-2018-10-16/stanford-segmenter-3.9.2.jar') segmenter.default_config('ar') text = segmenter.segment_file('sample.txt') print(text) tagger = STag.StanfordPOSTagger( 'arabic.tagger', 'stanford-postagger-full-2018-10-16/stanford-postagger.jar') for tag in tagger.tag(text.split()): print(tag[1]) parser = SParse.StanfordParser( model_path='edu/stanford/nlp/models/lexparser/arabicFactored.ser.gz') sentences = parser.raw_parse_sents(text.split('.')) for line in sentences: for sentence in line:
# Arabic NLP pipeline: Stanford segmentation -> POS tagging -> parsing (Windows paths).
from rake_nltk import Rake
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Point NLTK's Stanford wrappers at the locally extracted tool folders.
os.environ[
    'STANFORD_MODELS'] = 'C:\\Users\\snmuj\\OneDrive\\Documents\\salm\\stanford-segmenter-2018-10-16\\data;C:\\Users\\lenovo\\Documents\\salm\\stanford-postagger-full-2018-10-16\\models'
os.environ[
    'STANFORD_PARSER'] = 'C:\\Users\\snmuj\\OneDrive\\Documents\\salm\\stanford-parser-full-2018-10-17'
os.environ[
    'CLASSPATH'] = 'C:\\Users\\snmuj\\OneDrive\\Documents\\stanford-parser-full-2018-10-17'
# NOTE(review): this points at an installer .exe rather than a JDK home
# directory, and the single-backslash literal only works because '\P', '\J',
# etc. are not recognized escapes — verify this path.
os.environ['JAVAHOME'] = 'C:\Program Files\Java\jdk-14.0.2_windows-x64_bin.exe'

# Arabic word segmentation of an input file.
segmenter = StanfordSegmenter(
    'C:\\Users\\snmuj\\OneDrive\\Documents\\salm\\stanford-segmenter-2018-10-16\\stanford-segmenter-3.9.2.jar'
)
segmenter.default_config('ar')
# NOTE(review): 'text file' looks like a placeholder filename — confirm.
text = segmenter.segment_file('text file')
print(text)

# POS-tag the segmented tokens with the Arabic tagger model.
tagger = STag.StanfordPOSTagger(
    'arabic.tagger',
    'C:\\Users\\snmuj\\OneDrive\\Documents\\stanford-postagger-full-2018-10-16\\stanford-postagger.jar'
)
for tag in tagger.tag(text.split()):
    print(tag[1])

# Constituency-parse the text, naively split into sentences on '.'.
parser = SParse.StanfordParser(
    model_path='edu/stanford/nlp/models/lexparser/arabicFactored.ser.gz')
sentences = parser.raw_parse_sents(text.split('.'))
# Minimal Stanford Chinese segmenter demo (macOS paths, 2015-12-09 release).
import os
from nltk.tokenize import StanfordSegmenter

# Environment required by NLTK's wrapper: segmenter root, jars on the Java
# classpath, and the JVM location.
os.environ['STANFORD_SEGMENTER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09'
os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09/stanford-segmenter.jar:/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09/slf4j-api.jar'
os.environ['JAVA_HOME'] = '/Library/Java/JavaVirtualMachines/jdk1.8.0_40.jdk/Contents/Home'

# Segment a Chinese sentence with the PKU model.
segmenter = StanfordSegmenter(
    path_to_sihan_corpora_dict="/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09/data/",
    path_to_model="/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09/data/pku.gz",
    path_to_dict="/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-segmenter-2015-12-09/data/dict-chris6.ser.gz"
)
res = segmenter.segment("中山大学在西子湾")  # whitespace-delimited token string
print(res)
class NLTKStanfordSegmenter(object):
    r"""Apply the Stanford Chinese Word Segmenter implemented in NLTK.

    Users of this class are required to install Java, NLTK and download
    Stanford Word Segmenter

    Parameters
    ----------
    segmenter_root : str, default '$MXNET_HOME/stanford-segmenter'
        Path to folder for storing stanford segmenter.
        MXNET_HOME defaults to '~/.mxnet'.
    slf4j_root : str, default '$MXNET_HOME/slf4j'
        Path to folder for storing slf4j. MXNET_HOME defaults to '~/.mxnet'
    java_class : str, default 'edu.stanford.nlp.ie.crf.CRFClassifier'
        The learning algorithm used for segmentation

    Examples
    --------
    >>> tokenizer = gluonnlp.data.NLTKStanfordSegmenter() #doctest:+SKIP
    >>> tokenizer(u"我来到北京清华大学")
    ['我', '来到', '北京', '清华大学']
    >>> tokenizer(u"小明硕士毕业于中国科学院计算所,后在日本京都大学深造")
    ['小明', '硕士', '毕业', '于', '中国科学院', '计算所', ',', '后', '在', '日本京都大学', '深造']
    """

    def __init__(self, segmenter_root=os.path.join(_get_home_dir(), 'stanford-segmenter'),
                 slf4j_root=os.path.join(_get_home_dir(), 'slf4j'),
                 java_class='edu.stanford.nlp.ie.crf.CRFClassifier'):
        # Probe for a working `java` on PATH; os.system returns the exit status.
        # NOTE(review): `assert` is stripped under `python -O`, and the two
        # message fragments concatenate without a space ("...Java 8.0in order...").
        is_java_exist = os.system('java -version')
        assert is_java_exist == 0, 'Java is not installed. You must install Java 8.0' \
                                   'in order to use the NLTKStanfordSegmenter'
        try:
            from nltk.tokenize import StanfordSegmenter
        except ImportError:
            raise ImportError('NLTK or relevant packages are not installed. You must install NLTK '
                              'in order to use the NLTKStanfordSegmenter. You can refer to the '
                              'official installation guide in https://www.nltk.org/install.html.')
        # Expected on-disk layout after extracting the 2018-02-27 release.
        path_to_jar = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                   'stanford-segmenter-3.9.1.jar')
        path_to_model = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                     'data', 'pku.gz')
        path_to_dict = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                    'data', 'dict-chris6.ser.gz')
        path_to_sihan_corpora_dict = os.path.join(segmenter_root,
                                                  'stanford-segmenter-2018-02-27', 'data')
        segmenter_url = 'https://nlp.stanford.edu/software/stanford-segmenter-2018-02-27.zip'
        segmenter_sha1 = 'aa27a6433704b7b4c6a14be1c126cb4b14b8f57b'
        stanford_segmenter = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27.zip')
        # (Re)download only when any artifact is missing or the checksum fails.
        if not os.path.exists(path_to_jar) or \
                not os.path.exists(path_to_model) or \
                not os.path.exists(path_to_dict) or \
                not os.path.exists(path_to_sihan_corpora_dict) or \
                not check_sha1(filename=stanford_segmenter, sha1_hash=segmenter_sha1):
            # automatically download the files from the website and place them to stanford_root
            if not os.path.exists(segmenter_root):
                os.mkdir(segmenter_root)
            download(url=segmenter_url, path=segmenter_root, sha1_hash=segmenter_sha1)
            _extract_archive(file=stanford_segmenter, target_dir=segmenter_root)
        # Same download-on-demand logic for the slf4j logging facade jar.
        path_to_slf4j = os.path.join(slf4j_root, 'slf4j-1.7.25', 'slf4j-api-1.7.25.jar')
        slf4j_url = 'https://www.slf4j.org/dist/slf4j-1.7.25.zip'
        slf4j_sha1 = '89ea41ad6ebe1b190139421bb7c8d981e9df1625'
        slf4j = os.path.join(slf4j_root, 'slf4j-1.7.25.zip')
        if not os.path.exists(path_to_slf4j) or \
                not check_sha1(filename=slf4j, sha1_hash=slf4j_sha1):
            # automatically download the files from the website and place them to slf4j_root
            if not os.path.exists(slf4j_root):
                os.mkdir(slf4j_root)
            download(url=slf4j_url, path=slf4j_root, sha1_hash=slf4j_sha1)
            _extract_archive(file=slf4j, target_dir=slf4j_root)
        # Delegate the actual segmentation to NLTK's wrapper.
        self._tokenizer = StanfordSegmenter(java_class=java_class, path_to_jar=path_to_jar,
                                            path_to_slf4j=path_to_slf4j,
                                            path_to_dict=path_to_dict,
                                            path_to_sihan_corpora_dict=path_to_sihan_corpora_dict,
                                            path_to_model=path_to_model)

    def __call__(self, sample):
        """
        Parameters
        ----------
        sample: str
            The Chinese sentence to tokenize. Better not to input sentence in
            other languages since this class is mainly used for Chinese Word
            Segmentation.

        Returns
        -------
        ret : list of strs
            List of tokens
        """
        # Segment, trim surrounding whitespace, then split into tokens.
        return [tok for tok in self._tokenizer.segment(sample).strip().split()]
#'/data/;C:/Users/dell/Documents/graduation_backup/standford/stanford-postagger-full' \ #'-2018-10-16/models/ ') #os.environ['STANFORD_MODELS'] = 'C:/Users/dell/Documents/graduation_backup/standford/stanford-segmenter-2018-10-16' \ #'/data/' path = "standford/stanford-segmenter-2018-10-16/data/" path = pkg_resources.resource_filename(__name__, path) os.environ['STANFORD_MODELS'] = path path = "standford/stanford-parser-full-2018-10-16" path = pkg_resources.resource_filename(__name__, path) os.environ['CLASSPATH'] = path # os.environ['JAVAHOME'] = 'C:/Program Files/Java/jre1.8.0_171' path = "standford/stanford-segmenter-2018-10-16/stanford-segmenter-3.9.2.jar" path = pkg_resources.resource_filename(__name__, path) segmenter = StanfordSegmenter(path) segmenter.default_config('ar') def check_if_side_effect(sentence): for word in sentence.split(" "): word = stemming(word) if word and word in medical_terms or word in human_parts_stemmed and word.strip( ) != "": return True return False def split_conjunc(sentence):
from nltk.tokenize import StanfordSegmenter # from nltk.tokenize import StanfordTokenizer segmenter = StanfordSegmenter( path_to_sihan_corpora_dict="/Users/cquan/Documents/model/stanford-segmenter-2018-10-16/data", path_to_model="/Users/cquan/Documents/model/stanford-segmenter-2018-10-16/data/pku.gz", path_to_dict="/Users/cquan/Documents/model/stanford-segmenter-2018-10-16/data/dict-chris6.ser.gz") res = segmenter.segment(u'北海已成为中国对外开放中升起的一颗明星') print(type(res)) print(res.encode('utf-8')) from nltk.parse.stanford import StanfordParser eng_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz') res = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split())) for tree in res: print(tree) tree.draw() ch_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz') ch_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/chineseFactored.ser.gz') res1 = list(ch_parser.parse(u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'.split())) for tree in res1: print(tree) tree.draw() from nltk.parse.stanford import StanfordDependencyParser eng_parser = StanfordDependencyParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz') res2 = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split())) # for row in res2[0].triples():
# Compare three ways of tokenizing Chinese: the stanfordcorenlp wrapper,
# NLTK's StanfordSegmenter subprocess wrapper, and the CoreNLP HTTP server.
from stanfordcorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP(
    'C:/Users/user/Desktop/NLP/stanford-corenlp-full-2018-10-05', lang="zh")
nlp.word_tokenize(ptt_sim)  # not sure why this runs over / misbehaves

#%% NLTK StanfordSegmenter
from nltk.tokenize import StanfordSegmenter
#from nltk.tokenize.stanford_segmenter import StanfordSegmenter
segmenter = StanfordSegmenter(
    java_class="edu.stanford.nlp.ie.crf.CRFClassifier",
    path_to_jar=
    "C:/Users/user/Desktop/NLP/stanford-segmenter-2015-12-09/stanford-segmenter.jar",
    #path_to_slf4j="C:/Users/user/Desktop/NLP/stanford-corenlp-full-2018-10-05/slf4j-api.jar",
    path_to_sihan_corpora_dict=
    "C:/Users/user/Desktop/NLP/stanford-segmenter-2015-12-09/data",
    # FIX: this path previously read ".../sanford-segmenter-2015-12-09/..."
    # (missing the "t", unlike the sibling paths), so pku.gz was never found.
    path_to_model=
    "C:/Users/user/Desktop/NLP/stanford-segmenter-2015-12-09/data/pku.gz",
    path_to_dict=
    "C:/Users/user/Desktop/NLP/stanford-segmenter-2015-12-09/data/dict-chris6.ser.gz",
)
text = ("这是斯坦福中文分词器测试")
segmenter.segment(text)
# this machine could not run it :(
#%% NLTK CoreNLPParser
# Must first start the CoreNLP server from the command line (see "nlp start server.txt").
from nltk.parse.corenlp import CoreNLPParser
corenlp_parser = CoreNLPParser('http://localhost:9001', encoding='utf8')
class NLTKStanfordSegmenter:
    r"""Apply the Stanford Chinese Word Segmenter implemented in NLTK.

    Users of this class are required to install Java, NLTK and download
    Stanford Word Segmenter

    Parameters
    ----------
    segmenter_root : str, default '$MXNET_HOME/stanford-segmenter'
        Path to folder for storing stanford segmenter.
        MXNET_HOME defaults to '~/.mxnet'.
    slf4j_root : str, default '$MXNET_HOME/slf4j'
        Path to folder for storing slf4j. MXNET_HOME defaults to '~/.mxnet'
    java_class : str, default 'edu.stanford.nlp.ie.crf.CRFClassifier'
        The learning algorithm used for segmentation

    Examples
    --------
    >>> tokenizer = gluonnlp.data.NLTKStanfordSegmenter() #doctest:+SKIP
    >>> tokenizer('我来到北京清华大学') #doctest:+SKIP
    ['我', '来到', '北京', '清华大学']
    >>> tokenizer('小明硕士毕业于中国科学院计算所,后在日本京都大学深造') #doctest:+SKIP
    ['小明', '硕士', '毕业', '于', '中国科学院', '计算所', ',', '后', '在', '日本京都大学', '深造']
    """

    def __init__(self, segmenter_root=os.path.join(get_home_dir(), 'stanford-segmenter'),
                 slf4j_root=os.path.join(get_home_dir(), 'slf4j'),
                 java_class='edu.stanford.nlp.ie.crf.CRFClassifier'):
        # Probe for a working `java` on PATH; os.system returns the exit status.
        # NOTE(review): `assert` is stripped under `python -O`, and the two
        # message fragments concatenate without a space ("...Java 8.0in order...").
        is_java_exist = os.system('java -version')
        assert is_java_exist == 0, 'Java is not installed. You must install Java 8.0' \
                                   'in order to use the NLTKStanfordSegmenter'
        try:
            from nltk.tokenize import StanfordSegmenter
        except ImportError:
            raise ImportError(
                'NLTK or relevant packages are not installed. You must install NLTK '
                'in order to use the NLTKStanfordSegmenter. You can refer to the '
                'official installation guide in https://www.nltk.org/install.html.'
            )
        # Expected on-disk layout after extracting the 2018-02-27 release.
        path_to_jar = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                   'stanford-segmenter-3.9.1.jar')
        path_to_model = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                     'data', 'pku.gz')
        path_to_dict = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27',
                                    'data', 'dict-chris6.ser.gz')
        path_to_sihan_corpora_dict = os.path.join(
            segmenter_root, 'stanford-segmenter-2018-02-27', 'data')
        segmenter_url = 'https://nlp.stanford.edu/software/stanford-segmenter-2018-02-27.zip'
        segmenter_sha1 = 'aa27a6433704b7b4c6a14be1c126cb4b14b8f57b'
        stanford_segmenter = os.path.join(segmenter_root, 'stanford-segmenter-2018-02-27.zip')
        # (Re)download only when any artifact is missing or the checksum fails.
        if not os.path.exists(path_to_jar) or \
                not os.path.exists(path_to_model) or \
                not os.path.exists(path_to_dict) or \
                not os.path.exists(path_to_sihan_corpora_dict) or \
                not check_sha1(filename=stanford_segmenter, sha1_hash=segmenter_sha1):
            # automatically download the files from the website and place them to stanford_root
            if not os.path.exists(segmenter_root):
                os.mkdir(segmenter_root)
            download(url=segmenter_url, path=segmenter_root, sha1_hash=segmenter_sha1)
            _extract_archive(file=stanford_segmenter, target_dir=segmenter_root)
        # Same download-on-demand logic for the slf4j logging facade jar.
        path_to_slf4j = os.path.join(slf4j_root, 'slf4j-1.7.25', 'slf4j-api-1.7.25.jar')
        slf4j_url = 'https://www.slf4j.org/dist/slf4j-1.7.25.zip'
        slf4j_sha1 = '89ea41ad6ebe1b190139421bb7c8d981e9df1625'
        slf4j = os.path.join(slf4j_root, 'slf4j-1.7.25.zip')
        if not os.path.exists(path_to_slf4j) or \
                not check_sha1(filename=slf4j, sha1_hash=slf4j_sha1):
            # automatically download the files from the website and place them to slf4j_root
            if not os.path.exists(slf4j_root):
                os.mkdir(slf4j_root)
            download(url=slf4j_url, path=slf4j_root, sha1_hash=slf4j_sha1)
            _extract_archive(file=slf4j, target_dir=slf4j_root)
        # Delegate the actual segmentation to NLTK's wrapper.
        self._tokenizer = StanfordSegmenter(
            java_class=java_class, path_to_jar=path_to_jar, path_to_slf4j=path_to_slf4j,
            path_to_dict=path_to_dict, path_to_sihan_corpora_dict=path_to_sihan_corpora_dict,
            path_to_model=path_to_model)

    def __call__(self, sample):
        """
        Parameters
        ----------
        sample: str
            The Chinese sentence to tokenize. Better not to input sentence in
            other languages since this class is mainly used for Chinese Word
            Segmentation.

        Returns
        -------
        ret : list of strs
            List of tokens
        """
        # Segment, trim surrounding whitespace, then split into tokens.
        return [tok for tok in self._tokenizer.segment(sample).strip().split()]
from opencc import OpenCC from nltk.parse.stanford import StanfordParser from nltk.tokenize import StanfordSegmenter from nltk.tag import StanfordNERTagger from nltk.tag import StanfordPOSTagger import pickle, re, pymysql, jieba, os import pandas as pd chi_tagger = StanfordPOSTagger('./StanfordNLP/models/chinese-distsim.tagger', './StanfordNLP/jars/stanford-ner.jar') segmenter = StanfordSegmenter( java_class='edu.stanford.nlp.ie.crf.CRFClassifier', path_to_jar="./StanfordNLP/jars/stanford-segmenter-3.9.2.jar", path_to_slf4j="./StanfordNLP/jars/slf4j-api.jar", path_to_sihan_corpora_dict="./StanfordNLP/stanford-segmenter-2018-10-16/data", path_to_model="./StanfordNLP/stanford-segmenter-2018-10-16/data/pku.gz", path_to_dict="./StanfordNLP/stanford-segmenter-2018-10-16/data/dict-chris6.ser.gz" ) os.environ["JAVA_HOME"] = "/tmp2/b05902109/jdk-12.0.1"#注意這邊你們電腦要安裝java jdk,並放入你們自己的jdk的安裝路徑 os.environ["CLASSPATH"] = "./StanfordNLP/stanford-parser-2018-10-17" os.environ["STANFORD_MODELS"] = "./StanfordNLP/models" ch_parser = StanfordParser(model_path='./StanfordNLP/models/chinesePCFG.ser.gz') cc = OpenCC('t2s') # (Optional )convert from Simplified Chinese to Traditional Chinese def Get_Data_From_Mysql(source_name,keyword): contents_list=[] target_ids =[] db = pymysql.connect(host="18.217.252.187",port=3306, user="******",passwd="antimoneylaunderingisgood2",db="AML_News" ,charset='utf8') try: