def init_from_mongo(self):
    """Populate word-count features and labels from the MongoDB PTT corpus.

    Reads every post in ``ptt.gossiping_38k``, builds a per-post (and
    per-comment) word-frequency dict with jieba, and records a binary
    label (1 = positive score, 0 = negative) for each non-zero score.
    """
    client = MongoClient('mongodb://localhost:27017/')
    db = client.ptt
    posts = db.gossiping_38k
    jieba.set_dictionary('extra_dict/dict.txt.big')
    jieba.analyse.set_stop_words("extra_dict/stop_words_cht.txt")
    for post in posts.find():
        # --- post content ---
        counts = defaultdict(int)
        content = post['content']
        if post['score'] != 0:
            for line in content.split('\n'):
                if line:
                    for token in jieba.cut(line):
                        counts[token] += 1
            if counts:
                self.words.append(counts)
                self.scores.append(1 if post['score'] > 0 else 0)
        # --- comments ---
        for comment in post['comments']:
            text = comment['content'].strip()
            if text and comment['score'] != 0:
                counts = defaultdict(int)
                for token in jieba.cut(text):
                    counts[token] += 1
                if counts:
                    self.c_words.append(counts)
                    self.c_scores.append(1 if comment['score'] > 0 else 0)
    client.close()
def init(self, options):
    # type: (Dict) -> None
    """Initialise the language processor.

    Optionally points jieba at a custom dictionary (when the ``dict``
    option names an existing file) and installs a Porter stemmer —
    the fast PyStemmer implementation when available, otherwise the
    pure-Python fallback.
    """
    if JIEBA:
        dict_path = options.get('dict')
        # Only switch dictionaries when the option is set AND the file exists.
        if dict_path and os.path.isfile(dict_path):
            jieba.set_dictionary(dict_path)

    if PYSTEMMER:
        class Stemmer(object):
            def __init__(self):
                # type: () -> None
                self.stemmer = PyStemmer('porter')

            def stem(self, word):
                # type: (unicode) -> unicode
                return self.stemmer.stemWord(word)
    else:
        class Stemmer(PorterStemmer):
            """All those porter stemmer implementations look hideous;
            make at least the stem method nicer.
            """
            def stem(self, word):
                # type: (unicode) -> unicode
                return PorterStemmer.stem(self, word, 0, len(word) - 1)

    self.stemmer = Stemmer()
def start():
    """Prompt the user for a sentence and run every demo segmentation mode."""
    sentence = raw_input('請輸入句子:')
    # jieba.enable_parallel(2)  # enable parallelism; argument = thread count
    # jieba.disable_parallel()  # disable parallelism
    use_dict = True        # use the traditional-Chinese dictionary?
    use_user_dict = False  # use a user-defined dictionary?
    if use_dict:
        jieba.set_dictionary('dict/dict.txt.big')
    if use_user_dict:
        jieba.load_userdict('dict/user_dict.txt')
    # Run the sentence through each demo in the original order.
    for demo in (getFullMode, getFullModeHMM, getAccurate, getAccurateHMM,
                 getNewWord, getSearch, getPostag, getTokenize, getKeyWord):
        demo(sentence)
    getKeyWord(sentence, 'TextRank')
def testcase():
    """Run cuttest over a fixed battery of tricky Chinese sentences."""
    cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
    cuttest("我不喜欢日本和服。")
    cuttest("雷猴回归人间。")
    cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
    cuttest("我需要廉租房")
    cuttest("永和服装饰品有限公司")
    cuttest("我爱北京天安门")
    cuttest("abc")
    cuttest("隐马尔可夫")
    cuttest("雷猴是个好网站")


if __name__ == "__main__":
    # Run once with the default dictionary, then switch to a custom
    # dictionary and run again to compare segmentation results.
    testcase()
    jieba.set_dictionary("foobar.txt")
    print "================================"
    testcase()


def main():
    pass


if __name__ == "__main__":
    # NOTE(review): duplicate __main__ guard with a no-op main() —
    # looks like leftover template code; verify before removing.
    main()
def segment_pos(dir='rawdata', datetype='all', outdir='nohref_seg'): jieba.set_dictionary('dict/dict.txt.big') for tag in loadTag(): jieba.add_word(tag) chinese_postagger = StanfordPOSTagger('tagger/chinese-distsim.tagger', 'tagger/stanford-postagger.jar', encoding='utf-8') for file in parseDateType(dir,datetype): dirname, filename = os.path.split(file) head = filename.split('.')[0] outfile = outdir + '/' + head + '.txt' if os.path.isfile(outfile): print 'pass %s...' %head continue print 'segment %s ...' %head f = open(outfile, 'w') dataList = readJson(file) p = re.compile("http[s]?://.*\n") for data in dataList: content = data['content'] content = re.sub(p, '', content) segList = jieba.cut(content) wordList, tagList = postagging(chinese_postagger, segList) for w, t in zip(wordList, tagList): f.write(w.encode('utf-8')) f.write(' ') f.write(t) f.write(' ') f.write('\n') f.close()
def main():
    """Segment wiki_zh_tw.txt line-by-line with jieba into wiki_seg.txt.

    Stopwords listed in jieba_dict/stopwords.txt are dropped; progress is
    logged every 10000 lines.  Fix over the original: both files are
    managed with ``with`` so the output handle is closed (and flushed)
    even if segmentation raises mid-run.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # jieba custom setting.
    jieba.set_dictionary('jieba_dict/dict.txt.big')

    # Load stopwords set (one word per line).
    stopwordset = set()
    with io.open('jieba_dict/stopwords.txt', 'r', encoding='utf-8') as sw:
        for line in sw:
            stopwordset.add(line.strip('\n'))

    texts_num = 0
    with io.open('wiki_seg.txt', 'w', encoding='utf-8') as output, \
            io.open('wiki_zh_tw.txt', 'r', encoding='utf-8') as content:
        for line in content:
            words = jieba.cut(line, cut_all=False)
            for word in words:
                if word not in stopwordset:
                    output.write(word + ' ')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("已完成前 %d 行的斷詞" % texts_num)
def init(self, options):
    """Initialise the language processor: optional jieba dictionary + stemmer.

    Fixes over the original: the dictionary path no longer shadows the
    ``dict`` builtin, and a missing ``dict`` option no longer crashes
    (``os.path.isfile(None)`` raises TypeError).
    """
    if JIEBA:
        dict_path = options.get('dict')
        if dict_path and os.path.isfile(dict_path):
            jieba.set_dictionary(dict_path)
            print("Dictionary path:", dict_path)
    if CSTEMMER:
        class Stemmer(CStemmer):
            def stem(self, word):
                return self(word.lower())
    elif PYSTEMMER:
        class Stemmer(object):
            def __init__(self):
                self.stemmer = PyStemmer('porter')

            def stem(self, word):
                return self.stemmer.stemWord(word)
    else:
        class Stemmer(PorterStemmer):
            """All those porter stemmer implementations look hideous;
            make at least the stem method nicer.
            """
            def stem(self, word):
                word = word.lower()
                return PorterStemmer.stem(self, word, 0, len(word) - 1)

    self.stemmer = Stemmer()
def __init__(self, category=None, *args, **kwargs):
    """Initialise the crawler: build the keyword dictionaries, the jieba
    user dictionary, the allowed domains and the per-site crawl rules
    from the crawlcat.sqlite database."""
    super(crawlcatSpider, self).__init__(*args, **kwargs)
    db_conn = sqlite3.connect('crawlcat.sqlite')
    db = db_conn.cursor()
    # Initialise keywords: combine every numeral with every quantifier
    # (e.g. "十大") plus a few fixed listicle markers.
    num_list = ['0','1','2','3','4','5','6','7','8','9','一','二','三','四','五','六','七','八','九','十']
    quantifier_list = ['个','款','种','大','条','件','佳','张图']
    num_keywords = ['排行'.decode('utf-8'),'神作'.decode('utf-8'),'盘点'.decode('utf-8')]
    for num in num_list:
        for quantifier in quantifier_list:
            num_keywords.append((num+quantifier).decode('utf-8'))
    keywords = num_keywords[:]
    db.execute('SELECT node_id,keywords,alias_id,type_id,cate_id FROM nodes WHERE type_id != 2')
    for item in db.fetchall():
        if item[3] == 0:
            # Search words: also added to the segmentation dictionary.
            self.keywords_dict[item[1]] = item[0] if item[2] == 0 else item[2]
            keywords.append(item[1])
        elif item[3] == 1:
            # Curated ("featured") words.
            self.select_keywords_dict[item[4]] = item[0]
        elif item[3] == 3:
            # Keywords for which segmentation failed.
            self.keywords_dict[item[1]] = item[0] if item[2] == 0 else item[2]
            self.special_keywords_list.append(item[1])
    self.keywords_set = set(self.keywords_dict)
    self.select_keywords_set = set(num_keywords)
    # Generate the jieba user dictionary ("word freq" per line).
    fp = open('userdict.txt','w')
    for word in keywords:
        fp.write("%s 3\n" % word.encode('utf-8'))
    fp.close()
    jieba.set_dictionary("userdict.txt")
    # Initialise allowed domains.
    db.execute('SELECT DISTINCT domain FROM website')
    for item in db.fetchall():
        self.allowed_domains.append(item[0])
    # Initialise start URLs and per-site extraction rules.
    # NOTE(review): eval() on a DB column — safe only if the rules column
    # is fully trusted; consider json.loads instead.
    db.execute("SELECT url,rules,cate_id,img_rule,src_attr,list_src_attr FROM website WHERE enabled = '1'")
    for item in db.fetchall():
        self.start_urls.append(item[0])
        self.url_sel_rules[item[0]] = {'urles':eval(item[1]),'cate_id':item[2],'img_rule':item[3],'src_attr':item[4],'list_src_attr':item[5]}
    # Initialise already-stored URLs/titles for de-duplication.
    db.execute("SELECT url,title FROM feeds");
    for item in db.fetchall():
        self.stored_url_list.append(item[0])
        # Keep only the first 8 characters of the title.
        self.stored_title_list.append(item[1][:8])
    db.close()
    db_conn.close()
def init(self, options):
    # type: (Dict) -> None
    """Set up the jieba dictionary (when configured) and the stemmer."""
    if JIEBA:
        custom_dict = options.get('dict')
        # Switch dictionaries only for an existing file.
        if custom_dict and os.path.isfile(custom_dict):
            jieba.set_dictionary(custom_dict)
    self.stemmer = get_stemmer()
def loadDictionaries():
    """Register the traditional-Chinese main dictionary, all domain user
    dictionaries, and the stopword list with jieba."""
    base = '../resource/dicts/'
    jieba.set_dictionary(base + 'dict.txt.big')  # traditional-Chinese dictionary
    user_dicts = (
        'ptt_words.txt',     # PTT slang
        'restaurant.txt',    # restaurant names
        'taiwan_area.txt',   # Taiwanese place names
        'taiwan_words.txt',  # Taiwanese vocabulary
        'taiwan_party.txt',  # Taiwanese political parties
    )
    for name in user_dicts:
        jieba.load_userdict(base + name)
    jieba.analyse.set_stop_words(base + 'mystopwords.txt')  # stopwords
def testSetDictionary(self):
    """Smoke-test jieba.cut after switching to a custom dictionary."""
    jieba.set_dictionary("foobar.txt")
    for content in test_contents:
        result = jieba.cut(content)
        # cut() must stay lazy: it returns a generator, not a list.
        assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error"
        result = list(result)
        assert isinstance(result, list), "Test SetDictionary error on content: %s" % content
        print >> sys.stderr, " , ".join(result)
def load(self):
    """Load the pinyin dictionary (self.dict_path) into self.pydict.

    Each line is "word<TAB>pinyin<TAB>freq".  Per word, entries are kept in
    descending frequency order, and a duplicate pinyin keeps only the
    highest frequency seen.  Returns True on success, False on any error.

    Fix over the original: the dictionary file was only closed on the
    error path and leaked on success; it is now closed in ``finally``.
    """
    # Load jieba first.
    if not jieba.initialized:
        jieba.set_dictionary(self.jieba_dict_path)
        jieba.initialize()
    self.pydict = {}
    f = None
    try:
        # py.txt
        f = open(self.dict_path)
        for line in f:
            try:
                line = line.strip()
            except:
                continue
            sps = line.split('\t')
            if len(sps) != 3:
                print >>sys.stderr, 'bad format line [%s]' % line
                continue
            word = sps[0]
            py = sps[1]
            freq = float(sps[2])
            if word in self.pydict:
                wordInfoLen = len(self.pydict[word])
                i = 0
                dup = False
                while i < wordInfoLen:
                    if self.pydict[word][i].py == py:
                        # Same pinyin already present: keep the larger freq.
                        if self.pydict[word][i].freq < freq:
                            self.pydict[word][i].freq = freq
                        dup = True
                        break
                    if self.pydict[word][i].freq < freq:
                        # Insertion point found (list is freq-descending).
                        break
                    i += 1
                if not dup:
                    pyInfo = PyInfo()
                    pyInfo.py = py
                    pyInfo.freq = freq
                    self.pydict[word].insert(i, pyInfo)
                    wordInfoLen += 1
                    # Remove any lower-frequency duplicate of this pinyin
                    # that now sits after the inserted entry.
                    for j in range(i + 1, wordInfoLen):
                        if self.pydict[word][j].py == py:
                            del self.pydict[word][j]
                            break
            else:
                pyInfo = PyInfo()
                pyInfo.py = py
                pyInfo.freq = freq
                self.pydict[word] = [ pyInfo ]
    except Exception as e:
        return False
    finally:
        # Close unconditionally (original leaked the handle on success).
        if f is not None:
            try:
                f.close()
            except:
                pass
    self.is_load = True
    return True
def init():
    """Prepare the jieba tokenizer.

    Installs the project's custom dictionary and points jieba's temporary
    cache directory at the current working directory.
    """
    # Custom dictionary.
    jieba.set_dictionary("data/jieba_dict.txt")
    # Cache directory.
    jieba.tmp_dir = os.getcwd()
def tokenize(str_list):
    """Segment every string in *str_list* with jieba (precise mode).

    Returns one list of tokens per input string.
    """
    jieba.set_dictionary('dict.txt.big.txt')
    return [" ".join(jieba.cut(comment, cut_all=False)).split(" ")
            for comment in str_list]
def __init__(self):
    """Locate the bundled dictionary/stopword files and initialise jieba."""
    # Resolve the current package path.
    _package_path_ =_context_path
    self._user_dict = _package_path_+os.sep+"dic.data"
    self._user_stword = _package_path_+os.sep+"stword.data"
    # Build the stopword list (one UTF-8 word per line).
    self._stop_word_list = list(line.strip().decode("utf8") for line in open(self._user_stword,'r').readlines())
    # print(self._user_dict,self._user_stword)
    jieba.set_dictionary(self._user_dict)
    jieba.initialize()
def tokenizer(doc_string, parse=False):
    """Cut *doc_string* with jieba (precise mode) and remove stopwords.

    Returns a token list by default; with ``parse=True`` returns a UTF-8
    encoded comma-joined string instead.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    jieba.set_dictionary(here + '/dict/dict.txt.big')
    tokens = list(jieba.cut(doc_string, cut_all=False))
    tokens = stopWords.rmStopWords(tokens)
    if not parse:
        return tokens
    return ",".join(tokens).encode("utf-8")
def init_jieba(self, seg_dic, userdic):
    """jieba custom setting: load the main and user dictionaries, then
    boost each user word's frequency so jieba keeps it as one token."""
    jieba.load_userdict(userdic)
    jieba.set_dictionary(seg_dic)
    with open(userdic, 'r', encoding='utf-8') as user_words:
        for entry in user_words:
            jieba.suggest_freq(entry.strip('\n'), True)
def __init__(self):
    """Set up the NER tagger, the stopword list and the jieba dictionary."""
    print "init NLP toolkit"
    # Stanford NER server must already be listening on localhost:1234.
    self.tagger = ner.SocketNER(host='localhost', port=1234)
    # parse list of stopwords (one per line), extended with Weibo-specific ones
    self.stoplist=[i.strip() for i in open(stopwords_file)]
    self.stoplist+=weibo_stopwords
    # better support for traditional character
    jieba.set_dictionary(dico_file)
def participle(self):
    """Segment self.filetext according to the selected radio button and
    display the tokens joined by '/' in the text browser."""
    jieba.set_dictionary("dict/dict.txt")
    jieba.initialize()
    if self.radioButton.isChecked():
        # Full mode.
        self.result = jieba.cut(self.filetext, cut_all=True)
    elif self.radioButton_2.isChecked():
        # Precise mode.
        self.result = jieba.cut(self.filetext, cut_all=False)
    elif self.radioButton_3.isChecked():
        # Search-engine mode.
        self.result = jieba.cut_for_search(self.filetext)
    else:
        # Default: precise mode.
        self.result = jieba.cut(self.filetext, cut_all=False)
    self.textBrowser.clear()
    self.textBrowser.setText('/'.join(self.result))
def start():
    """Ask for a sentence and run the basic segmentation demos on it."""
    sentence = raw_input('請輸入句子:')
    use_dict = True  # use the traditional-Chinese dictionary?
    if use_dict:
        jieba.set_dictionary('dict/dict.txt.big')
    for demo in (getFullMode, getFullModeHMM, getAccurate,
                 getAccurateHMM, getNewWord, getSearch):
        demo(sentence)
def initialize():
    """Load the conjunction list and initialise the CJK parsing library."""
    # Load conjunction data: the first whitespace-separated field of each
    # dictionary entry is the word itself.
    global CONJUNCTIONS
    from codecs import open
    with open('vendor/moedict.dict', 'r', encoding='utf8') as data:
        CONJUNCTIONS = [entry.split()[0] for entry in data]
    # Load CJK parsing library.
    jieba.set_dictionary('vendor/jieba_tc.dict')
    jieba.load_userdict('vendor/chewing.dict')
    jieba.initialize()
def main():
    """Read lines from stdin and emit jieba-segmented output on stdout,
    logging progress markers to stderr."""
    sys.stderr.write(' >>>>>> python runed \n')
    jieba.set_dictionary('dict.txt.big')
    # Process every line piped in by the parent process.
    for line in sys.stdin:
        sys.stderr.write('get' + line + '\n')
        segmented = " ".join(jieba.cut(line, cut_all=False))
        print("0 " + segmented)
        sys.stdout.flush()
    sys.stderr.write(' >>>>>> python finished \n')
def TextSeg(datas, lag):
    """Tokenise every document in *datas*.

    ``lag == "eng"`` uses the NLTK word tokenizer; ``lag == "chs"`` uses
    jieba precise mode, optionally with the custom ./Config/dict dictionary
    when that file exists.  Returns a list of token lists.
    """
    dict_path = "./Config/dict"
    if lag == "chs" and exists(dict_path):
        # Chinese case: replace jieba's main dictionary.
        jieba.set_dictionary(dict_path)
    datasseg = []
    for data in datas:
        if lag == "eng":
            # English case.
            word_list = nltk.word_tokenize(data)
        elif lag == "chs":
            # Chinese case: precise mode returns a generator of unicode tokens.
            word_list = list(jieba.cut(data, cut_all=False))
        # print " ".join(word_list)
        datasseg.append(word_list)
    return datasseg
def run():
    """Fetch the Yahoo HK news RSS feed, build a bag-of-words per entry
    (title counted three times to weight it over the summary), cluster
    the entries, and return the clusters as a JSON string.

    NOTE(review): time.clock() was removed in Python 3.8 — presumably this
    targets Python 2 / early 3; confirm before migrating.
    """
    start_time = time.clock()
    jieba.set_dictionary('jieba/dict.txt.big')
    jieba.initialize()
    print ("jieba " + str(time.clock() - start_time))
    start_time = time.clock()
    news_rss_url = "http://hk.news.yahoo.com/rss/hong-kong"
    # news_rss_url = "http://hk.news.yahoo.com/rss/china"
    info = feedparser.parse(news_rss_url)
    start_time = time.clock()
    for entry in info.entries:
        # word count of each word of summary
        word_list = getBagOfWords(preprocess(jieba.cut(stripTag(entry.summary))))
        # word count of each word of title
        bag_of_word_of_title = getBagOfWords(preprocess(jieba.cut(stripTag(entry.title))))
        # Combine word count of both summary and title; title weights more.
        bag_of_word = Counter()
        for i in range(3):
            bag_of_word.update(bag_of_word_of_title)
        bag_of_word.update(word_list)
        entry["bag_of_words"] = bag_of_word
    print ("preprocess " + str(time.clock() - start_time))
    # result = Counter()
    # for entry in info.entries:
    #     result.update(entry["bag_of_words"])
    # printList(result)
    # Clustering them: one singleton cluster per entry as the starting point.
    start_time = time.clock()
    clusters = clustering.clustering([Cluster([Vector(entry)]) for entry in info.entries])
    print ("clustering " + str(time.clock() - start_time))
    # Serialise the result; the centroid vector of each cluster is flagged.
    newsList = []
    for (index, cluster) in enumerate(clusters):
        for vector in cluster.listOfVectors:
            news = News(index, (vector == cluster.centroidVector), vector.data["title"], vector.data["published"], vector.data["link"])
            newsList.append(news.__dict__)
    return json.dumps(newsList)
def init(jieba_parallel=False):
    """Initialise stopword sets and configure the jieba tokenizer."""
    # Load English/Chinese stopwords, from nltk and a local file respectively.
    global english_stopwords, chinese_stopwords
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    # Strip the trailing newline from each stopword line.
    chinese_stopwords = {word[:-1] for word in codecs.open("stopwords.txt", "r", encoding="utf-8")}
    # Set the jieba log level.
    jieba.setLogLevel("INFO")
    # Set the jieba dictionary file.
    jieba.set_dictionary("./jieba_dict.txt")
    # Use the current directory for jieba's temporary cache.
    jieba.tmp_dir = os.getcwd()
    # Enable parallel segmentation (one worker per CPU core) when requested.
    if jieba_parallel:
        jieba.enable_parallel()
    config.log.info("module algorithm has initialized successfully.")
def handle(data):
    """Serve one JSON-encoded jieba RPC request.

    Query commands (cut / cut_for_search / tokenize) return UTF-8 encoded
    JSON; 'stopserver'/'ping' return raw control bytes; mutating commands
    (add_word / load_userdict / set_dictionary) return None.
    """
    oper = json.loads(data)
    cmd = oper[0]
    if cmd in ('cut', 'cut_for_search', 'tokenize'):
        func = getattr(jieba, cmd)
        return json.dumps(tuple(func(*oper[1], **oper[2]))).encode('utf-8')
    if cmd == 'add_word':
        jieba.add_word(*oper[1], **oper[2])
    elif cmd == 'load_userdict':
        jieba.load_userdict(*oper[1])
    elif cmd == 'set_dictionary':
        jieba.set_dictionary(*oper[1])
    elif cmd == 'stopserver':
        return b'stop'
    elif cmd == 'ping':
        return b'pong'
def handlemsg(data):
    """Dispatch one serialized RPC request.

    Translation commands go to the ``mc`` translator, segmentation
    commands to jieba (or the jiebazhc variant).  Query commands return a
    serialized result, 'stopserver'/'ping' return raw control bytes,
    mutating commands return None, and unknown commands return a
    serialized error string.
    """
    oper = loadsjson(data)
    if oper[0] == 'c2m':
        return dumpsjson(mc.c2m.translate(*oper[1:]))
    elif oper[0] == 'm2c':
        return dumpsjson(mc.m2c.translate(*oper[1:]))
    elif oper[0] == 'c2m.raw':
        return dumpsjson(mc.c2m.rawtranslate(oper[1]))
    elif oper[0] == 'm2c.raw':
        return dumpsjson(mc.m2c.rawtranslate(oper[1]))
    elif oper[0] == 'modelname':
        return dumpsjson(mc.name())
    elif oper[0] == 'cut':
        return dumpsjson(tuple(jieba.cut(*oper[1], **oper[2])))
    elif oper[0] == 'cut_for_search':
        return dumpsjson(tuple(jieba.cut_for_search(*oper[1], **oper[2])))
    elif oper[0] == 'tokenize':
        return dumpsjson(tuple(jieba.tokenize(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.cut':
        return dumpsjson(tuple(jiebazhc.cut(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.cut_for_search':
        return dumpsjson(
            tuple(jiebazhc.cut_for_search(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.tokenize':
        return dumpsjson(tuple(jiebazhc.tokenize(*oper[1], **oper[2])))
    elif oper[0] == 'add_word':
        jieba.add_word(*oper[1], **oper[2])
    elif oper[0] == 'load_userdict':
        jieba.load_userdict(*oper[1])
    elif oper[0] == 'set_dictionary':
        jieba.set_dictionary(*oper[1])
    elif oper[0] == 'stopserver':
        return b'stop'
    elif oper[0] == 'ping':
        return b'pong'
    else:
        return dumpsjson('Command not found')
def __init__(self):
    """Set up the browser, Facebook Graph access token and jieba dictionary."""
    self.br = br = mechanize.Browser()
    self.br.set_handle_robots(False)  # ignore robots
    self.br.set_handle_refresh(False)
    # Only consider content newer than six hours.
    self.sixHourBeforeTime = time.time() - 60 * 60 * 6
    self.db_address = "127.0.0.1"  # '54.251.147.205'
    # html5lib parses more forgivingly on Windows; lxml elsewhere.
    if platform.system() == "Windows":
        self.features = "html5lib"
    else:
        self.features = "lxml"
    # NOTE(review): app credentials are hard-coded here — move them to
    # configuration / secret storage.
    oauth_args = dict(
        client_id="482698495096073",
        client_secret="8c58b055fcb762a9780638dc401c85e2",
        grant_type="client_credentials",
    )
    # Fetch an app access token via curl (client-credentials flow).
    oauth_curl_cmd = ["curl", "https://graph.facebook.com/oauth/access_token?" + urllib.urlencode(oauth_args)]
    oauth_response = subprocess.Popen(oauth_curl_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()[
        0
    ]
    print oauth_curl_cmd
    print str(oauth_response)
    try:
        oauth_access_token = urlparse.parse_qs(str(oauth_response))["access_token"][0]
        self.graph = facebook.GraphAPI(oauth_access_token)
    except KeyError:
        print ("Unable to grab an access token!")
    # self._pre_dict_combine('combine_dict.txt')
    # jieba.set_dictionary('combine_dict.txt')
    # Use the dictionary shipped next to this module.
    dict_path = os.path.dirname(os.path.abspath(__file__)) + "/dict.txt"
    print dict_path
    jieba.set_dictionary(dict_path)
    jieba.initialize()
def extract(self, text):
    """Return the top-5 jieba keywords of *text* as a bracketed UTF-8
    string, e.g. "[kw1,kw2,kw3,kw4,kw5]"."""
    jieba.set_dictionary('/usr/lib/ckan/default/src/ckanext-data_recommendation/dict.txt.big')
    keywords = jieba.analyse.extract_tags(text, topK=5)
    encoded = [kw.encode('utf8') for kw in keywords]
    return '[' + ','.join(encoded) + ']'
title = soup.find_all('a', class_='DY5T1d') first_art_link = title[0]['href'].replace('.','https://news.google.com',1) #print(first_art_link) art_request = requests.get(first_art_link) art_request.encoding='utf8' soup_art = BeautifulSoup(art_request.text,'html.parser') art_content = soup_art.find_all('p') art_texts = [p.text for p in art_content] print(art_texts) ## Create Word Cloud import jieba jieba.set_dictionary('../../../../_MySyncDrive/RepositoryData/data/jiaba/dict.txt.big.txt') art_words = [w for w in jieba.cut(' '.join(art_texts))] ## Fine-tune Word Cloud from collections import Counter import imageio from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS from matplotlib import pyplot as plt ## Check font paths ## !fc-list :lang=zh ## Load stopwords
try: msg = "From: %s\r\nTo: %s\r\nSubject: %s\r\nDate: %s\r\n\r\n%s" % ( from_address, str.join(',', to_address), Header( subject, 'utf-8').encode(), formatdate(), content) server = smtplib.SMTP(smtp_server, port) # 发件人邮箱中的SMTP服务器,端口是587 server.ehlo(name=host) server.starttls() server.ehlo(name=host) server.login(from_address, my_pass) # 括号中对应的是发件人邮箱账号、邮箱密码 server.sendmail(from_address, to_address, msg) # 括号中对应的是发件人邮箱账号、收件人邮箱账号、发送邮件 server.quit() # 关闭连接 print("邮件发送成功") except smtplib.SMTPException: # 如果 try 中的语句没有执行,则会执行下面的 ret=False ret = False print("邮件发送失败") return ret if __name__ == '__main__': train = term_frequency_train('source/train.txt') print('load training set success') jieba.set_dictionary("source/dict.txt") jieba.initialize() # if you want output QR in cmd, try: # itchat.auto_login(enableCmdQR=True) itchat.auto_login(hotReload=True, exitCallback=ex, enableCmdQR=2) itchat.run()
Notes ----- These functions are based on the text normalization functions provided in Text Analytics with Python 2ed. """ import unicodedata import re # from nltk.tokenize.toktok import ToktokTokenizer import pandas as pd import jieba ## Initialize Trad Chinese dictionary jieba.set_dictionary('../../../RepositoryData/data/jiaba/dict.txt.jiebatw.txt') ## Normalize unicode characters def remove_weird_chars(text): # ``` # (NFKD) will apply the compatibility decomposition, i.e. # replace all compatibility characters with their equivalents. # ``` text = unicodedata.normalize('NFKD', text).encode('utf-8', 'ignore').decode( 'utf-8', 'ignore') return text ## Remove extra linebreaks
import config import jieba import pandas as pd from tqdm import tqdm tqdm.pandas() # =========================== # Script purpose: # Segment translated examples # =========================== # use downloaded segmenting index jieba.set_dictionary("./data/raw/dict.txt.big.txt") zh_translations = pd.read_feather( "./data/intermediate/zh_translations_filtered.feather", ) # use space delimiter to store segmented list as string # (works because all sentence with English spaces were eliminated) def segment(sentence): return " ".join(jieba.cut_for_search(sentence, HMM=True)) # process simplified sentences zh_translations["simplified_segmented"] = zh_translations[ "simplified"].progress_apply(segment) # process traditional sentences zh_translations["traditional_segmented"] = zh_translations[
def set_default_dict(tokenizer, path_default_dict):
    """Install *path_default_dict* as the tokenizer's base dictionary.

    Returns the same tokenizer so calls can be chained.
    """
    message = "Setting Jieba Default Dictionary at " + str(path_default_dict)
    print(message)
    tokenizer.set_dictionary(path_default_dict)
    return tokenizer
title = soup.find_all('a', class_='DY5T1d') first_art_link = title[0]['href'].replace('.','https://news.google.com',1) #print(first_art_link) art_request = requests.get(first_art_link) art_request.encoding='utf8' soup_art = BeautifulSoup(art_request.text,'lxml') art_content = soup_art.find_all('p') art_texts = [p.text for p in art_content] print(art_texts) ## Create Word Cloud import jieba jieba.set_dictionary('../../../Corpus/jiaba/dict.txt.big.txt') art_words = [w for w in jieba.cut(' '.join(art_texts))] ## Fine-tune Word Cloud from collections import Counter import imageio from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS from matplotlib import pyplot as plt ## Check font paths ## !fc-list :lang=zh ## Load stopwords
# coding = utf-8 import numpy as np import jieba import preprocess as p import os from keras.preprocessing.sequence import pad_sequences # stop_word_file = 'dicts/stop_words.txt' jieba.set_dictionary('data/dict.txt.big') jieba.initialize() word_embedding_file = 'data/word_embedding_matrix.npy' def get_word_data(char_data): seq_data = [''.join(l) for l in char_data] word_data = [] # stop_words = [line.strip() for line in open(stop_word_file, 'r', encoding='utf-8')] for seq in seq_data: seq_cut = jieba.cut(seq, cut_all=False) word_data.append([w for w in seq_cut for n in range(len(w))]) return word_data def get_word2object(): word2vec = {} f = open(r'data/word2vec.bin') # load pre-trained word embedding i = 0 for line in f: tep_list = line.split() if i == 0:
def set_dict():
    """Switch jieba to the traditional-Chinese big dictionary."""
    big_dict = 'dict.txt.big.txt'
    jieba.set_dictionary(big_dict)
#/usr/bin/env python3 # -*- coding: utf-8 -*- # 测试完善中 import numpy import jieba jieba.set_dictionary("./data/jieba/dict.txt.big") jieba.load_userdict("./data/jieba/userdict.txt") import jieba.posseg import jieba.analyse from gensim.models.word2vec import Word2Vec model = Word2Vec.load_word2vec_format("./data/vectors.bin", binary=True) from mytools import time_me, get_current_time def word_similarity(w1, w2): return model.similarity(w1, w2) def sum_cosine(matrix, threshold): """ 1.计算语义Jaccard中分子total,即分词相似性矩阵的Cosine和 2.计算m: 两个集合中没有达到语义匹配标准(由阈值threshold控制)的总片段个数或者两者中取最大值 """ total = 0 count = 0 row = matrix.shape[0] col = matrix.shape[1] zero_row = numpy.zeros([1,col]) zero_col = numpy.zeros([row,1]) max = matrix.max() while max > threshold: total += max
break # get the criminal's ID and role into a dict def readRoles(): dic = {} for line in open("../resources/roles.txt"): senderQQ = line.split(",")[0] role = line.split(",")[1] dic[senderQQ] = role return dic importlib.reload(sys) # sys.setdefaultencoding("utf-8") jieba.set_dictionary('../resources/dict.txt') stopwords = [line.strip() for line in open('../resources/stopwords.txt', encoding='utf-8').readlines()] endingwords = [line.strip() for line in open('../resources/endingwords.txt', encoding='utf-8').readlines()] model = models.Word2Vec.load('model/wiki_corpus.model') freqDict = getFreqDict() sentDict = getSentDict() # print(freqDict) sentence_vecs, pca = sentence2Vec(model, sentDict.keys(), freqDict) MiscreantDict = {} ResponseQueue = queue.Queue() EndMiscreats = [] roleDict = readRoles() chatEngine()
import jieba from collections import Counter from wordcloud import WordCloud from matplotlib import pyplot as plt from PIL import Image import numpy as np import pymongo import re import test_NLU2 """ jieba.set_dictionary('C:/Users/User/Desktop/dict.txt') with open('C:/Users/User/Desktop/stop.txt', 'r', encoding='utf8') as f: # 中文的停用字,我也忘記從哪裡拿到的,效果還可以,繁體字的資源真的比較少,大家將就一下吧 stops = f.read().split('\n') testStr = """ #理財專欄作者:黃逸強最近的熱門新聞就是美國非裔男子遭白人警察壓制致死,引發全國性示威,暴動場面怵目驚心,很難想像這是標榜民主自由的美國。但這些負面消息都不影響股市的發展,美股依然上揚,還一再創近期新高,這一波的反彈令專家很不解,更別說是一般的散戶投資人。散戶是反向指標?雖然疫情趨緩很多城市開始解封,但要經濟復甦還言之過早,更別說美中貿易談判未解,美國又進一步對華為制裁,再加上街頭暴動猶如雪上加霜;經濟數據更是難看,非農就業人數大減二千多萬人,創二戰以來最慘,美股仍不甩利空,硬是漲逾數百點,真要找一個理由就是「市場把期待押在未來的復甦上。」儘管股市一直漲,散戶投資人反倒是越漲越害怕,美國個人投資者協會發布最新報告顯示,散戶投資人看空情緒升高至52.6%,創2013年來最高;反之,看多情緒23.6%,則是30年來最低水準。如果散戶看法是反指標,是否意味著市場未來向上的機率更高。台灣投資人也是一樣,舉一檔股票為例,看漲的「台灣五十ETF」只有1380張的融資,1萬多張的周均量,而看跌的「台灣五十反向ETF」,卻有近40萬張的融資,12萬張的周均量。股市愈漲做空的人愈多,是一個很奇怪的心態。漲跌不需要理由也有專家認為,最近股市大漲並不是看多的買盤所拉抬,而是空頭回補的力道所推升。依據籌碼分析,通常跌到深處融資斷頭、或漲到創高空頭回補,就是一種反轉訊號,所以有專家提醒投資人,目前這個泡沫應該要留心。其實股市漲跌並不一定要理由,那只是記者在寫稿時需要一些題材,所以上漲就去找利多的理由、下跌就去找利空消息來搪塞,都是事後諸葛無濟於事。過去很多次的上漲是無基之彈,因為實在找不到理由。像2008年的雷曼金融風暴後,股市也是沒人看好,在現金為王的氛圍中走了十年的多頭。所以股市難測,幾百萬人在進行的金錢遊戲,不是幾個數據或幾則新聞就能決定漲跌,專家的分析都是以現在的資訊,去推測未來的發展,猜對了只是運氣好,猜錯是正常。天災人禍不是意外而是無常,在做投資或資產配置時都是必須要納入的風險因子。別聽消息做股票市場很任性,當它要漲時再多的利空壞消息它還是漲,當它要跌再多的好消息也挽不住跌勢。所以回歸技術面,趨勢的力量不可擋,只要順勢操作,不要自作聰明去抓頭部或猜底部。每一個階段採用不同的工具,承平時期用基本面分析找到長線績優股,新冠肺炎把股市打到低點,基本面無用改用技術分析搶反彈;千萬不要聽消息面,新聞有太多雜訊反而會干擾投資人。有人因為漲太多、股價很高所以會怕,其實高是一種感覺,很抽象不能用來操盤。現階段不需要預測高點,拋開理智線勇敢下單,搭上趨勢的順風車,並設好停損點,就不怕懼高症。★延伸閱讀★投資不能一窩蜂!黃金變現二管道炒短恪守三原則!超前佈署以應萬變!沒有意外 只有號外﹗ """ #stops.append('\n') ## 我發現我的文章中有許多分行符號,這邊加入停用字中,可以把它拿掉 #stops.append('\n\n') # terms = [t for t in jieba.cut(testStr, cut_all=True) if t not in stops] terms = [t for t in jieba.cut(testStr) if t not in stops] sorted(Counter(terms).items(), key=lambda x:x[1], reverse=True) ## 這個寫法很常出現在Counter中,他可以排序,list每個item出現的次數。 aa = sorted(Counter(terms).items(), key=lambda x:x[1], reverse=True) """ 
# Point jieba at the custom dictionary on the desktop (Windows-only path).
jieba.set_dictionary('C:/Users/User/Desktop/dict.txt')
from bs4 import BeautifulSoup import jieba import csv import json from wordcloud import WordCloud import matplotlib.pyplot as plt import pylab horoscope = [ "Aries", "Taurus", "Gemini", "Cancer", "Leo", "Virgo", "Libra", "Scorpio", "Sagittarius", "Capricornus", "Aquarius", "Pisces" ] hash1 = {} jieba.set_dictionary('dict.txt.big.txt') jieba.load_userdict("userdict.txt") #download the information from ptt for i in range(len(horoscope)): url = "https://www.ptt.cc/bbs/" + horoscope[i] + "/index.html" title = [] for round in range(10): response = requests.get(url) soup = BeautifulSoup(response.text, "html.parser") tag_name = "div.title a" articles = soup.select(tag_name) page2 = "div.btn-group-paging a" paging = soup.select(page2) next_url = "https://www.ptt.cc" + paging[1]["href"] url = next_url
MODEL = 'ta_addtest_test7_LSTMGRU.h5' ### Read files # train files print('Loading train files ...') train_sentences, train_labels = load_data(TRAIN_X_PATH, TRAIN_Y_PATH) # test file print('Loading test file ...') with open(TEST_X_PATH, 'r', encoding='utf-8') as f: readin = f.readlines() # Use regular expression to get rid of the index test_sentences = [re.sub('^[0-9]+,', '', s) for s in readin[1:]] sentences = train_sentences + test_sentences jieba.set_dictionary(DICT_PATH) # Change dictionary (Optional) print('Jieba cutting all sets ...') sentences = [list(jieba.cut(s, cut_all=False)) for s in sentences] # Train Word2Vec model print('Training Word2Vec model ...') emb_model = Word2Vec(sentences, size=emb_dim) emb_model.save(w2v_model) print('Jieba cutting train set ...') train_sentences = [ list(jieba.cut(s, cut_all=False)) for s in train_sentences ] num_words = len(emb_model.wv.vocab) + 1 # +1 for OOV words emb_dim = emb_model.vector_size
cut_phones, pos = [], 0 sent = ''.join(cut_sent) phones = word2phones(sent, use_tone, sep='').split() for word in cut_sent: word_len = len(word) word_phones = ' '.join(phones[pos:pos + word_len]) cut_phones.append(word_phones) pos += word_len return cut_phones if __name__ == '__main__': # Configuration use_tone = False jieba.set_dictionary('scripts/dict.txt.big') jieba.initialize() # Read information of validated audios full_tsv = join(DATA_DIR, 'validated.tsv') full_df = pd.read_csv(full_tsv, sep='\t') # Exclude audios with english full_df = full_df[full_df.sentence.apply(contains_no_eng)] ''' Prepare AM data ''' print('Preparing AM data...\r', end='') # Prepare spk_id, gender and utt_id for all audios client_spk = full_df[['client_id']].drop_duplicates()
Some function are used to generate training corpus of the chatbot ''' import json import logging import os import jieba import sys import operator from tqdm import tqdm import pickle import re dict_path = os.path.join(os.getenv("JIEBA_DATA"), "dict.txt.big") ptt_path = (os.getenv("DATA")) jieba.set_dictionary(dict_path) process_files = ['Gossiping', 'Boy-Girl'] marker = {'Gossiping': '>', 'NBA': '<', 'Boy-Girl': '^'} #count_response = {} def main(): Filter = ArticleFilter() def print2file(f, title, responses, marker='', separater=True): if marker != '': f.write(marker + ' ') title_cutted = jieba.cut(title.strip(), cut_all=False)
# 词语相关信息记录 # 解决cmd命令行下输出中文字符乱码问题(必须放置在文本最前面) from __future__ import unicode_literals import os import json import sys # 操作中文必须语句,解决字符问题 reload(sys) sys.setdefaultencoding('utf8') import jieba import jieba.posseg as pseg # 加载分词字典 jieba.set_dictionary("dict_file/dict.txt.big") # 加载用户自定义词典 jieba.load_userdict("dict_file/user_dict.txt") # 加载自定义模块 import fileHandle # 词性过滤文件(保留形容词、副形词、名形词、成语、简称略语、习用语、动词、动语素、副动词、名动词、名词) ALLOW_SPEECH_TAGS = [ 'a', 'ad', 'an', 'i', 'j', 'l', 'v', 'vg', 'vd', 'vn', 'n' ] # 词语位置 Word_Location = { 'title': 1, 'section-start': 2,
def run(self):
    """Start the dialogue: load plugins, build a Conversation and serve
    it forever."""
    self.load_custom_plugins()
    conversation = Conversation(mic=self.mic, persona=self.persona,
                                profile=profile, iot_client=self.iot_client)
    conversation.handle_forever()


if __name__ == "__main__":
    # Configure logging from the command-line flags.
    loggingConfiger(info=args.info, debug=args.debug, output=args.output)
    logger = logging.getLogger()
    if args.init:
        # One-off device initialisation mode.
        print('initializing...')
        device_init()
    else:
        # Set up the Chinese word-segmentation dictionary.
        jieba.set_dictionary(APP_RESOURCES_DATA_PATH + 'jieba.small.dict')
        jieba.initialize()
        app = App()
        if profile.remote_control_service_enable:
            # Listen for server messages in a background thread.
            app.launch_server_listen_thread()
        app.run()  # start service
# new model _ = os.system('mkdir ./model/' + model_name) # new word2vec model word2vec_lookup = Word2vec(model_name) dloader = loader(word2vec_lookup, mode='new', model_name=model_name) # pre-trained word2vec model # dloader = loader(mode='pre_trained', model_name=model_name, sent_len=sent_len) # loading the training data and pre-processing dloader.data_loading(sent_len=sent_len) # hyperparameters np.set_printoptions(precision=2) jieba.set_dictionary('./libs/dict_new.txt') vec_size = len(dloader.word2vec_lookup['<unk>']) oneHot_size = dloader.voca_size enc_len = sent_len dec_len = enc_len n_layer1 = 512 l_r = 1e-3 epoch = 8 batch_size = 128 n_hiddens = 2 rnn_cell = tf.contrib.rnn.BasicLSTMCell op = tf.train.AdamOptimizer max_gradient_norm = 1 # seq2seq initializing s2s = seq2seq_chatbot(oneHot_size=oneHot_size,
#!/usr/bin/env python # encoding=utf-8 # Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG) # 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU) # Apache 2.0 from __future__ import print_function import sys import jieba reload(sys) sys.setdefaultencoding('utf-8') if len(sys.argv) < 3: sys.stderr.write( "word_segmentation.py <vocab> <trans> > <word-segmented-trans>\n") exit(1) vocab_file = sys.argv[1] trans_file = sys.argv[2] jieba.set_dictionary(vocab_file) for line in open(trans_file): key, trans = line.strip().split('\t', 1) words = jieba.cut(trans, HMM=False) # turn off new word discovery (HMM-based) new_line = key + '\t' + " ".join(words) print(new_line)
# - TF(Term Frequency): $TF_{td}$指得是在特定的文章d中特定的字t出現了幾次。這個部分同時,也表示了一個文字在一篇文章的重要性,依但出現越多次,這個字也就越能代表這篇文章。 # - IDF(Inverted Document Frequency): N指得是總共有機篇文章,$DF_t$中的DF是Document Frequency的意思,DFt則是詞彙t在幾篇文章中出現過。$\frac{DF_t}{N}$也就是所有文章當中,詞彙t在幾篇文章出現過,而其倒數則是Inverted Documnet Index,表著這個詞彙如果在很多文章裏面都出現過,則其重要性會受到懲罰,而取log則只是讓他在分數的影響上比較平滑而已。 # # # 2. Cosine Similarity # $$\cos{\theta} = \frac{A \cdot B}{\| {A} \|_2 \| {B} \|_2}$$ # - if $A = [1,2,0,4]$ and $B = [3,2,1,0]$ # - $\cos{\theta} = \frac{1 \cdot 3 + 2 \cdot 2 + 0 \cdot 1 + 4 \cdot 0} {\sqrt{1^2+2^2+0^2+4^2} \cdot \sqrt{3^2+2^2+1^2+0^2}}$ # In[15]: import jieba import sys import random sys.path.append('../dict') jieba.set_dictionary('../dict/dict.txt.big') # 如果是使用繁體文字,請記得去下載繁體字典來使用 import numpy as np import pandas as pd from collections import Counter with open('../dict/stops.txt', 'r', encoding='utf8') as f: # 中文的停用字,我也忘記從哪裡拿到的,效果還可以,繁體字的資源真的比較少,大家將就一下吧 stops = f.read().split('\n') # 把情緒資料讀出來,做成dataframe emo_dict = {'emo': [], 'text': []} # 加入正面句 with open('positive.txt', encoding='utf8') as data: pos_train_list = data.readlines() random.shuffle(pos_train_list) for line in pos_train_list[:600]:
#%% import sys import os import logging import jieba import gensim import pandas as pd import numpy as np from gensim.models.doc2vec import TaggedDocument # logging information logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) jieba.set_dictionary('resources/dict.txt.big') df_qa = pd.read_json('raw_data.json', encoding='utf8') df_question = df_qa[['question', 'ans']].copy() df_question.drop_duplicates(inplace=True) def preProcess(item): #停用字 with open('resources/stops.txt', 'r', encoding='utf8') as f: stops = f.read().split('\n') # stops.append('\n') # stops.append('\n\n') terms = [t for t in jieba.cut(item, cut_all=False) if t not in stops] return terms
import jieba, os, json #Init base_path = os.path.dirname(__file__) config = json.load(open(os.path.join(base_path, 'config.json'))) fn = config['DataSet']['NPMCorpus'] op = config['DataSet']['NPMCorpus_seg'] jieba.set_dictionary(os.path.join(base_path, 'dict.txt.big')) def main(): #load stopword set stopword_set = set() with open(os.path.join(base_path, 'stop_words.txt'), 'r', encoding='utf-8') as stopwords: for stopword in stopwords: stopword_set.add(stopword.strip('\n')) with open(op, 'a', newline='', encoding='utf-8-sig') as output: with open(fn, 'r', encoding='utf-8-sig') as sentences: for sentence in sentences: sentence = sentence.replace(" ", "") sentence = sentence.replace(",,,,", "") sentence = sentence.replace("\"", "") sentence = sentence.replace("□", "") sentence = sentence.replace("口", "") sentence = sentence.strip('\n') words = jieba.cut(sentence, cut_all=False)
# encoding=utf-8 import csv import jieba import jieba.posseg as pseg jieba.set_dictionary("../Head-first-Chinese-text-segmentation-master/data/dict.txt.big") jieba.load_userdict("../Head-first-Chinese-text-segmentation-master/data/userdict_hp_all_character.txt") # jieba.load_userdict("../Head-first-Chinese-text-segmentation-master/data/userdict_hp_all_place.txt") """ jieba.load_userdict("../Head-first-Chinese-text-segmentation-master/data/userdict_hp_all.txt") """ content=open("../context/HP1.txt","rb").read() result = jieba.tokenize(u'%s' %content, 'utf-8') """ words=pseg.cut(content) with open("out_0609_hp1.txt",'w',newline='',encoding='utf-8') as f: w=csv.writer(f) for word,flag in words: str=[word,flag] if(str[1]!='x'): w.writerow(str) """ for tk in result: #tk[0]=tk[0].encode('ascii','ignore')
"""Transform the CookieTheft control-subject CSV into a plain-text transcript file."""
import string

import jieba
import pandas as pd
import tensorflow as tf

# Path of files
CSV_DATA = '../data/CookieTheft_51.csv'
DEMENTIA_DATA = '../data/'
CONTROL_DATA = '../data/'
WORDVEC_MODEL = '../wordvec_model/'

# Variables
DEMENTIA_NUM = 51
CONTROL_NUM = 51
WV_DIIM = 500
INDEX_CONTROL_START = 68  # The end of dementia id is 67
JIEBA_DICT = '../data/dict.txt.big'

# `jieba` and `string` were used below without ever being imported
# (NameError at import time); the imports above fix that.
jieba.set_dictionary(JIEBA_DICT)

# ASCII punctuation plus the full-width CJK marks stripped during tokenization.
punctuation = set(string.punctuation + "," + "、" + "」" + "「" + "。" + " " + "!")


# csv file of control subjects, transform to txt file
def csv_to_txt(file_name):
    """Write control-subject transcripts from CSV_DATA to *file_name*.

    Each transcript line is preceded by a running subject id, starting at
    INDEX_CONTROL_START (dementia ids end at 67).
    """
    csv = pd.read_csv(CSV_DATA, header=None)
    with open(file_name, 'w', encoding='utf8') as f:
        idx = INDEX_CONTROL_START
        for line in csv.iloc[1:, 1]:
            f.write(str(idx) + '\n')
            f.write(line + '\n')
            idx += 1
    print('Control subject CSV file to txt success ...')
#!/usr/bin/python # -*- coding: utf-8 -*- import jieba import re sdict= '../../libs/dict.txt.big' jieba.set_dictionary(sdict) userdict= '../../libs/userdict.txt' stop_words= '../../libs/stop_words.txt' jieba.load_userdict(userdict) def segment(sentence, cut_all=False): # jieba.analyse.set_stop_words(stop_words) sentence = sentence.replace('\n', '').replace('\u3000', '').replace('\u00A0', '') # sentence = ' '.join(jieba.cut(sentence, cut_all=cut_all)) #jieba.cut_for_search 方法接受两个参数:需要分词的字符串;是否使用 HMM 模型。该方法适合用于搜索引擎构建倒排索引的分词,粒度比较细 sentence = ' '.join(jieba.cut_for_search(sentence)) return re.sub('[a-zA-Z0-9.。::,,))((!!??*-_/”“\"]', '', sentence).split()
def __init__(self, jieba_zh_path="dict.txt.big"):
    # type: (str) -> None
    """Point jieba at *jieba_zh_path* (default: the big Chinese dictionary).

    Note this mutates jieba's global dictionary state, not per-instance state.
    """
    jieba.set_dictionary(jieba_zh_path)
"""Loaders for a SQuAD-style QA dataset (traditional Chinese)."""
import json, jieba, sys, math

jieba.set_dictionary('./data/dict.txt.big')  # friendly to traditional chinese

class Load:
    """Reads train/test JSON files and converts them into features/labels."""

    def __init__(self):
        return

    def loadTestData(self, path):
        """Return the feature matrix for the test set at *path*."""
        test_data = self._load_json_data(path)
        return self._getTest_feature(test_data)

    def loadTrainData(self, path):
        """Return features plus labels for the training set at *path*."""
        train_data = self._load_json_data(path)
        return self._getTrain_feature_label(train_data)

    def loadTestID(self, path):
        """Collect identifiers for every QA pair in the test set at *path*."""
        test_data = self._load_json_data(path)
        test_id = []
        # [<context>,<question>] for each row
        #get data position
        # SQuAD-style nesting: data -> paragraphs -> qas.
        subjects = test_data['data']
        for subject in subjects:
            # subject contains title and *paragraphs*
            for paragraph in subject['paragraphs']:
                # paragraphs contains *context* and *qas*
                for qa in paragraph['qas']:
                    ######################################
                    # (method body continues beyond this chunk)
#coding=utf-8
"""Segment each line of source.txt with jieba (traditional-Chinese dictionary),
strip digits and punctuation, and write "<date>\t<segmented text>" lines to
jieba_test.txt."""
import pandas as pd
import jieba
import re
import sys

jieba.set_dictionary('dict.txt.big')  # switch to the traditional-Chinese dictionary

source = pd.read_csv('source.txt', sep='\t', header=None, encoding='utf8')
source.columns = ['date', 'string']
print(type(source), file=sys.stderr)

# Compile the cleanup patterns once instead of once per row.
# First pattern: ASCII/full-width punctuation and digits -> space.
_PUNCT_RE = re.compile(
    "[\s+\.\!\/_,$%^*(+\"\'\d]+|[+——!,。?、~@#¥%……&*()()::?+\d]+")
# Second pattern: collapse runs of whitespace to a single space.
_SPACE_RE = re.compile("[\s+]+")

# `with` guarantees the output file is closed even if a row raises.
with open('jieba_test.txt', 'w', encoding='utf8') as f:
    for index in range(len(source.index)):
        text = source['string'][index]
        if not isinstance(text, str):
            text = str(text)  # coerce NaN / numeric cells to str
        # ' '.join over the jieba tokens replaces the original quadratic
        # `s += tmp + ' '` accumulation.
        segmented = ' '.join(jieba.cut(text, cut_all=False))
        segmented = _PUNCT_RE.sub(" ", segmented)
        segmented = _SPACE_RE.sub(" ", segmented)
        # .at avoids chained-indexing assignment (SettingWithCopy hazard).
        source.at[index, 'string'] = segmented
        print("index: " + str(index), file=sys.stderr)
        f.write(source['date'][index] + "\t" + segmented + "\n")
# Dates are encoded as yyyymmdd integers for the SQL comparisons below.
date_list = [2016, 9, 22]
date = date_list[0] * 10000 + date_list[1] * 100 + date_list[2]
today_date = (datetime.datetime.now().year) * 10000 + (
    datetime.datetime.now().month) * 100 + datetime.datetime.now().day


def get_next_date(date_in):
    """Return the day after [year, month, day] as a [year, month, day] list."""
    date = datetime.datetime(date_in[0], date_in[1], date_in[2])
    date = date + datetime.timedelta(days=1)
    # str(datetime) is "YYYY-MM-DD HH:MM:SS"; slice the numeric parts back out.
    return [int(str(date)[0:4]), int(str(date)[5:7]), int(str(date)[8:10])]


stop_words = set(open("ref/stop_word.txt", "r").read().splitlines())
stop_words.update('\n', '\t', ' ')
jieba.set_dictionary('ref/dict.txt.big')
jieba.load_userdict("ref/userdict.txt")

# NOTE(review): `if 1 > 0` is always true — presumably a debugging leftover.
if 1 > 0:
    try:
        db = DBConfig()
        db.dbConnect()
        # Count, then fetch, every news row on/after `date`.
        query = "SELECT COUNT(*) from News WHERE date>=%s" % date
        db.executeQuery(query)
        news_num = int(db.results[0][0])
        query = "SELECT number, title, content from News WHERE date>=%s" % date
        db.executeQuery(query)
        texts = []
        # (try block continues beyond this chunk)
# (fragment: tail of a search() helper whose definition starts above this
#  chunk — indentation of the first lines reconstructed, TODO confirm)
        print("Result retrieved...")
    except httplib.BadStatusLine:
        response = ''
    if response:
        html = response.read()
    return html


# Read a file of list of terms and their classifications, one term per line.
# Each line consists of "Classification \t Term"
infile = open("drugList.txt")
lines = infile.readlines()
infile.close()

allWordList = ['drug-name', 'drug-type']  # To store all the possible features
drugDictList = []  # To store feature words of each drug in a list
jieba.set_dictionary('dict.txt.big')  # Read a better dictionary for text segmentation

i = 1
for line in lines:
    print "\nSearching for Entity:\t%s" % i
    i += 1
    fieldList = line.split("\t")
    queryStr = fieldList[1].strip()
    drugType = fieldList[0].strip()
    html = search('%s' % queryStr)
    soup = BeautifulSoup(html)  # Build a html object easy to parse
    drugDict = {'drug-name': queryStr, 'drug-type': drugType}  # Every drug has these two features
    # for content in soup.find_all("div", "c-abstract"):  # Parse Baidu page
    for content in soup.find_all("span", "st"):  # Digest for each search result from google
        # (loop body continues beyond this chunk)
# (fragment: `content` and `word_dictionary` are defined above this chunk —
#  presumably an open corpus file and an accumulator list; TODO confirm)
for texts_num, line in enumerate(content):
    line = line.strip('\n')
    # Converter('zh-hant') converts the line to traditional Chinese.
    line = Converter('zh-hant').convert(line)
    line = line.split("\t")
    dict_page = {texts_num: line[1]}
    word_dictionary.append(dict_page)
    # Progress report every 10k lines.
    if (texts_num + 1) % 10000 == 0:
        print("已完成前 %d 行的store" % (texts_num + 1))
print(word_dictionary[15])

outputfile = open('F74056166.csv', 'w', encoding='utf-8')
#jieba.load_userdict("./wiki_seg.txt")
#jieba.set_dictionary("wiki_seg.txt")
jieba.set_dictionary('extra_dict/dict.txt.big')

# Pre-trained 50-dim word2vec model over the segmented wiki corpus.
model = "wiki.word2vec_50.bin"
model_w2v = word2vec.Word2Vec.load(model)

# Each candidate line of wiki_seg.txt becomes a token list.
candidates = []
with open("wiki_seg.txt", encoding='utf-8') as f:
    for line in f:
        candidates.append(line.strip().split())

with open("oneinput.txt", encoding='utf-8') as inputline:
    for line in inputline:
        line = line.strip('\n')
        line = Converter('zh-hant').convert(line)
        # Input format: "<text>\t<answer1>\t<answer2>..."
        output = line.split("\t", 1)
        text = output[0]
        answer = output[1].split("\t")
        eachans = []
        # (loop body continues beyond this chunk)