Example #1
0
 def __init__(self, model_dir):
     """Load the CKIP word-segmentation and POS models from *model_dir*."""
     # Emit the progress message immediately (flush, no newline) so the
     # user sees it before the slow model load completes.
     print(f'[*] Creating CKIP tokenizer from {model_dir}...', end='', flush=True)
     # Suffix shared by punctuation POS tags (e.g. 'COMMACATEGORY') —
     # used elsewhere to recognise punctuation tokens.
     self._pos_punc_class_suffix = 'CATEGORY'
     self._ws = ckiptagger.WS(model_dir)
     self._pos = ckiptagger.POS(model_dir)
     print('done')
Example #2
0
    def _init(self):
        """Lazily construct the underlying ``ckiptagger.WS`` segmenter.

        Optional user lexicons are first turned into ckiptagger
        dictionaries: the "recommend" dictionary biases segmentation,
        while the "coerce" dictionary forces it.
        """
        import ckiptagger

        for lexicons, opt_key in (
            (self._recommend_lexicons, 'recommend_dictionary'),
            (self._coerce_lexicons, 'coerce_dictionary'),
        ):
            if lexicons:
                self._opts[opt_key] = ckiptagger.construct_dictionary(lexicons)

        self._core = ckiptagger.WS(_get_tagger_data(), disable_cuda=self._disable_cuda)
Example #3
0
 def load_model(self):
     """Ensure the model files are present, then return a ``ckiptagger.WS``.

     ``download_model_files`` is expected to be a no-op when the files
     already exist locally — TODO confirm against its implementation.
     """
     self.download_model_files()
     segmenter = ckiptagger.WS(self.data_dir, disable_cuda=self.disable_cuda)
     return segmenter
Example #4
0
File: idv_crawl.py  Project: q40603/Demo
#!/usr/bin/python3
"""CGI endpoint: crawl news articles for the requested stocks.

Reads ``src`` (newspaper sources) and ``stock`` (stock codes) from the
CGI query string and emits a plain-text error page when either is
missing.  The CKIP word-segmentation model is loaded from ``./data``,
downloading it on first use.
"""

from requests import get
from bs4 import BeautifulSoup as bs
from urllib.parse import unquote
from sklearn.feature_extraction.text import TfidfVectorizer
import cgi, json, datetime, ckiptagger, pickle, lzma, sys, codecs

field = cgi.FieldStorage()
content, models, vocabs, corpus = [], {}, {}, {}

# Map stock code -> company name; the CSV holds one "code,name" pair per
# line.  A context manager closes the handle promptly (the original
# leaked it for the lifetime of the process).
with open('stock_codec.csv', 'rb') as codec_file:
    code_to_name = dict(
        line.decode('utf-8').strip().split(',')
        for line in codec_file)

# Force UTF-8 output regardless of the CGI environment's locale.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout.detach())

# Best-effort model load: if ./data is absent or corrupt, fetch the
# ckiptagger data once and retry (the retry is allowed to raise).
try:
    ws = ckiptagger.WS('./data', True)
except Exception:
    ckiptagger.data_utils.download_data_url('./')
    ws = ckiptagger.WS('./data', True)

try:
    # FieldStorage.getvalue() returns None for a missing parameter, so
    # .split(',') raises AttributeError — catch exactly that instead of
    # a bare except (which also swallowed SystemExit/KeyboardInterrupt).
    src = field.getvalue('src').split(',')
    stock = field.getvalue('stock').split(',')
    #src, stock = ['ctee'], ['2330']
except AttributeError:
    # Header parameter syntax is 'charset=utf-8' (the original emitted
    # the malformed 'charset:utf-8', which clients ignore).
    print(
        'content-type:text/plain;charset=utf-8\n\nNo newspaper source or stock specified.'
    )
    sys.exit()

try: