def func_ckip(text):
    """Segment ``text`` with CKIP and return token frequencies.

    Newlines in ``text`` are replaced with spaces before segmentation.  A
    token is kept only when it is longer than one character, is not listed
    in ``./01_ref_data/stopword.txt`` (one stop word per line) and does not
    start with an ASCII letter or digit.

    Args:
        text: the raw text to segment.

    Returns:
        dict mapping each kept token to its count, inserted in descending
        count order (ties keep first-seen order).
    """
    text = text.replace('\n', ' ')

    # A set gives constant-time stop-word lookups for the filter below.
    stopword_path = r'./01_ref_data/stopword.txt'
    with open(stopword_path, 'r', encoding='utf-8') as f_stop:
        stopwords = {line.replace('\n', '') for line in f_stop}

    ws = WS("../data")
    tokens = ws([text])[0]

    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1

    # ``match`` anchors at the start, so this rejects tokens that *begin*
    # with an ASCII alphanumeric character.
    leading_alnum = re.compile(r'[0-9a-zA-Z]+')
    kept = [
        (token, freq) for token, freq in counts.items()
        if len(token) > 1
        and token not in stopwords
        and not leading_alnum.match(token)
    ]
    kept.sort(key=lambda item: item[1], reverse=True)
    return dict(kept)
def word_segmentation(inputfilename, outputfilename, stopstep):
    """Segment a text file line by line and write space-separated tokens.

    Each input line becomes one output line.  Tokens found in
    ``jieba_dict/stopwords.txt`` and tokens that start with an ASCII letter
    or digit are dropped; every kept token is followed by a single space.

    Args:
        inputfilename: UTF-8 text file to read.
        outputfilename: UTF-8 text file to (over)write.
        stopstep: processing stops after the first line whose 1-based index
            is a multiple of this value.
    """
    # The CKIP model data must already be downloaded into ./data.
    ckip_ws = WS("./data", disable_cuda=False)
    pattern = re.compile("[A-Za-z0-9]+")

    with open('jieba_dict/stopwords.txt', 'r', encoding='utf-8') as stopwords:
        jieba_stopword_set = {stopword.strip('\n') for stopword in stopwords}

    # Both files are managed by ``with`` so the output handle is closed even
    # when segmentation raises part-way through.  The output is opened first,
    # as before.
    with open(outputfilename, 'w', encoding='utf-8') as sentence_output, \
            open(inputfilename, 'r', encoding='utf-8') as content:
        for idx, line in enumerate(content):
            line = line.strip('\n')
            word_sentence_list = ckip_ws([line])
            for word in word_sentence_list[0]:
                if (word not in jieba_stopword_set
                        and pattern.match(word) is None):
                    sentence_output.write(word + ' ')
            sentence_output.write('\n')

            if (idx + 1) % 10 == 0:
                logging.info(
                    "Executed {} of lines for word segmentations.".format(
                        idx + 1))
            if (idx + 1) % stopstep == 0:
                break
def wordSplit(results):
    """Segment one text with the CKIP model in ``./data``.

    ``results`` is wrapped in a one-element list before it is handed to the
    segmenter.  The segmenter's return value is printed and then returned
    unchanged.
    """
    segmenter = WS("./data")
    segmented = segmenter([results])
    print("斷詞的list:")
    print(segmented)
    return segmented
def word_seg(text):
    """Segment ``text`` with a user dictionary and return comma-joined tokens.

    The user dictionary (``userDict.txt``, one term per line) is passed to
    the segmenter as a recommend dictionary with weight 1 per term;
    single-character terms are skipped.  Tokens listed in ``stopDict.txt``
    and tokens for which ``word_filter`` returns something empty are
    dropped.

    Args:
        text: the text to segment.

    Returns:
        The kept tokens joined with ``','``.
    """
    with open('C:/ckip-learning/project/Dict/userDict.txt', 'r',
              encoding='utf-8') as f1:
        user_terms = [line.replace('\n', '') for line in f1]
    userDic = {term: 1 for term in user_terms if len(term) != 1}
    dictionary = construct_dictionary(userDic)

    # A set keeps the per-token stop-word test constant-time.
    with open('C:/ckip-learning/project/Dict/stopDict.txt', 'r',
              encoding='utf-8') as s:
        stop_words = {line.replace('\n', '') for line in s}

    ws = WS('C:/ckip-learning/data')
    words = ws([text], recommend_dictionary=dictionary)

    # ``word_filter`` is only consulted for tokens that are not stop words.
    ws_result = [
        word for word in words[0]
        if word not in stop_words and len(word_filter(word)) != 0
    ]
    return ','.join(ws_result)
def __init__(self, ckip_data_path='./data', custom_dict_path='./dict'):
    """Load the CKIP models and build the custom dictionary.

    Args:
        ckip_data_path: directory handed to the WS, POS and NER constructors.
        custom_dict_path: location handed to the private custom-dictionary
            loader; its result is converted with ``construct_dictionary``.
    """
    # All three models are loaded from the same data directory.
    self.ws = WS(ckip_data_path)
    self.pos = POS(ckip_data_path)
    self.ner = NER(ckip_data_path)

    custom_words = self.__load_custom_dict(custom_dict_path)
    self.dictionary = construct_dictionary(custom_words)
def main(config):
    """Write per-sheet word-frequency tables for titles and references.

    For every sheet named in ``config.sheets`` the 'Title' and 'Reference'
    columns of ``config.data`` are segmented, token frequencies are counted,
    and a four-column table (words and frequencies for each source, most
    frequent first) is written to the same-named sheet of
    ``../data/<config.output>``.

    Args:
        config: object providing ``output``, ``sheets``, ``data`` and
            ``tagger_src`` attributes.
    """
    # The segmenter does not depend on the sheet, so load it once instead of
    # once per sheet.
    ws = WS(config.tagger_src)

    with pd.ExcelWriter(f'../data/{config.output}') as writer:
        for sheet in config.sheets:
            df = pd.read_excel(config.data, sheet_name=sheet)

            # Spaces inside a title are turned into commas, e.g.
            # '喜見久別的友人 再度帶來物資' => '喜見久別的友人,再度帶來物資'.
            titles = [title.strip().replace(' ', ',')
                      for title in df['Title'].to_list()]
            references = [reference.strip()
                          for reference in df['Reference'].to_list()]
            del df

            titles_tokenized, references_tokenized = ws(titles), ws(references)

            title_freq = Counter(
                word for title in titles_tokenized for word in title
            ).most_common()
            reference_freq = Counter(
                word for reference in references_tokenized for word in reference
            ).most_common()

            df_out = {
                'Words in Title': [item[0] for item in title_freq],
                'Frequency of Words in Title': [item[1] for item in title_freq],
                'Words in Reference': [item[0] for item in reference_freq],
                'Frequency of Words in Reference':
                    [item[1] for item in reference_freq],
            }

            # Pad the shorter columns with None: pandas requires all columns
            # to have the same length.
            max_len = max(len(column) for column in df_out.values())
            df_out = {
                key: column + [None] * (max_len - len(column))
                for key, column in df_out.items()
            }
            pd.DataFrame(df_out).to_excel(writer, sheet_name=sheet, index=False)

    del ws
def __init__(self):
    """Load the CKIP WS, POS and NER models from ``./ckiptagger_data``.

    Prints a progress message, asserts that the data directory exists, loads
    the three models and finally calls ``clear_output()``.
    """
    data_dir = "./ckiptagger_data"

    print("prepare ws pos ner")
    # The assertion message says the data folder is not in the same directory.
    assert os.path.exists(data_dir), "ckiptagger_data 不在同層目錄"

    self.ws = WS(data_dir)
    self.pos = POS(data_dir)
    self.ner = NER(data_dir)
    clear_output()
def word_count(name, test_data):
    """
    Count the CKIP tokens of one article.

    Tokens of length one and tokens found in the module-level
    ``stopword_list`` are left out.

    :param name: value stored under the "article_name" key
    :param test_data: article string
    :return: dict with "article_name" first, then token -> count in
        descending count order
    """
    segmenter = WS("../data")
    tokens = segmenter([test_data])[0]

    frequency = {}
    for token in tokens:
        frequency[token] = frequency.get(token, 0) + 1

    ranked = sorted(
        ((token, hits) for token, hits in frequency.items()
         if len(token) > 1 and token not in stopword_list),
        key=lambda pair: pair[1],
        reverse=True,
    )

    ckip_dict = {"article_name": name}
    for token, hits in ranked:
        ckip_dict[token] = hits
    return ckip_dict
class CKIPSegmenter:
    """Word segmenter that wraps a CKIP ``WS`` model.

    The model is created once, when the class body is executed, and shared
    by every call.
    """

    ws = WS('/home/nlpmaster/ssd-1t/weights/data')

    @classmethod
    def 斷詞(cls, 物件):
        """Segment ``物件``: chapter objects (``章``) are handled sentence by
        sentence, anything else is handled as a single sentence."""
        斷法 = cls._斷章物件詞 if isinstance(物件, 章) else cls._斷句物件詞
        return 斷法(物件)

    @classmethod
    def _斷章物件詞(cls, 章物件):
        """Return a new chapter whose sentences are the segmented versions of
        the sentences in ``章物件``."""
        新章物件 = 章()
        for 原句 in 章物件.內底句:
            新章物件.內底句.append(cls._斷句物件詞(原句))
        return 新章物件

    @classmethod
    def _斷句物件詞(cls, 語句):
        """Segment one sentence and wrap the words in a new sentence object.

        Every token from the model has '-' padded to ' - ' before it is
        parsed into word objects; the words end up in a single group, inside
        a single set, inside the returned sentence.
        """
        詞陣列 = [
            詞物件
            for 詞條 in CKIPSegmenter.ws([語句])[0]
            for 詞物件 in 拆文分析器.建立組物件(詞條.replace('-', ' - ')).內底詞
        ]

        新組物件 = 組()
        新組物件.內底詞 = 詞陣列

        新集物件 = 集()
        新集物件.內底組 = [新組物件]

        新句物件 = 句()
        新句物件.內底集 = [新集物件]
        return 新句物件
def name_extractor():
    """Load the CKIP models and return an ``extract_name`` closure over them.

    The models are read from the module-level ``ckipt_data`` location; CUDA
    is disabled unless the module-level ``use_gpu`` flag is truthy.
    """
    from ckiptagger import WS, POS, NER

    cpu_only = not use_gpu
    ws = WS(ckipt_data, disable_cuda=cpu_only)
    pos = POS(ckipt_data, disable_cuda=cpu_only)
    ner = NER(ckipt_data, disable_cuda=cpu_only)

    def extract_name(doc, attr='PERSON'):
        """Run WS -> POS -> NER on ``doc`` and collect entity strings.

        Returns a tuple ``(names, words, pos_tags, entities, seconds)``
        where ``names`` is the set of entity texts whose label equals
        ``attr``, are longer than one character, contain no '、' and do not
        end in '案' or '犯'; ``seconds`` is the time spent in the three
        model calls.
        """
        started = timeit.default_timer()
        word_s = ws([doc],
                    sentence_segmentation=True,
                    segment_delimiter_set={
                        '?', '?', '!', '!', '。', ',', ',', ';', ':', '、'
                    })
        word_p = pos(word_s)
        word_n = ner(word_s, word_p)
        elapsed = timeit.default_timer() - started

        # Each entity is indexed as e[2] (label) and e[3] (text).
        namelist = {
            e[3] for e in word_n[0]
            if e[2] == attr
            and len(e[3]) > 1
            and '、' not in e[3]
            and e[3][-1] not in '案犯'
        }
        return namelist, word_s[0], word_p[0], word_n[0], elapsed

    return extract_name
def ckip_cut(text_list):
    """Segment ``text_list`` with CKIP and return the segmenter's output.

    When ``./data`` is not an existing directory, the model data is first
    downloaded into the current directory.
    """
    data_dir = "./data"
    if not os.path.isdir(data_dir):
        print('ckip data non-exist, start download')
        # Google Drive download; needs the gdown package (pip install gdown).
        data_utils.download_data_gdown("./")

    segmenter = WS(data_dir)
    return segmenter(text_list)
# Characters removed from the input before the literal "[^\w\s]" removal.
_TEST_STRIP_FIRST = str.maketrans('', '', '+-‘’\t\xa0\n \u3000')
# Characters removed from the input after the literal "[^\w\s]" removal.
_TEST_STRIP_SECOND = str.maketrans(
    '', '', '“”/《》,。「」()!?、▲…:;—●■【】〔〕︹︺')


def test(request):
    """Django view: classify POSTed text and render the home page.

    For a POST request the 'input' field is converted with OpenCC
    ('s2tw'), stripped of a fixed set of punctuation/whitespace characters,
    segmented with CKIP, encoded with the vocabulary in
    ``./file/Encode.json`` (unknown words and ids >= 5000 become 0), padded
    to length 500 and fed to the Keras model in ``./file/my_model.h5``.
    ``country`` is set to '臺灣' when the rounded prediction is truthy and to
    '中國' otherwise; for other request methods it stays ''.

    The template receives ``locals()``, so the local variable names in this
    function are part of the template context and are deliberately kept.
    """
    country = ''
    if request.method == 'POST':
        text = request.POST.get('input')
        print(text)
        cc = OpenCC('s2tw')
        text = cc.convert(text)
        t = text

        # NOTE(review): the file is opened with the platform default
        # encoding; confirm how Encode.json was written before pinning one.
        with open("./file/Encode.json", 'r') as j:
            encode_dict = json.load(j)
        model = load_model('./file/my_model.h5')

        # Same removals, in the same order, as the former chain of
        # str.replace calls.  The middle step removes the *literal* text
        # "[^\w\s]" (str.replace does not interpret regular expressions);
        # it is now a raw string so it no longer relies on invalid escapes.
        text = text.translate(_TEST_STRIP_FIRST)
        text = text.replace(r'[^\w\s]', '')
        text = text.translate(_TEST_STRIP_SECOND)

        ws = WS("./file/data")
        ws_results = ws([text])
        del ws

        seg = ' '.join(ws_results[0])
        test = list()
        text = seg.split(' ')
        x = list()
        x.append(test)
        for voc in text:
            if voc in encode_dict:
                num = encode_dict[voc]
                if num < 5000:
                    test.append(num)
                else:
                    test.append(0)
            else:
                test.append(0)

        x = sequence.pad_sequences(x, maxlen=500)
        labels = [int(round(x[0])) for x in model.predict(x)]
        ans = labels[0]
        print(labels[0])
        if ans:
            country = '臺灣'
        else:
            country = '中國'
        print(country)
    return render(request, 'home/home.html', locals())
def ckiptagger(load_txt_path, save_path):
    """Segment a text file line by line and append the tokens to ``save_path``.

    For every input line, tokens longer than one character that are not in
    ``../jieba/stopword.txt`` are written as one UTF-8 output line, each
    token followed by a space.  Progress and elapsed time (in hours) are
    reported every 1000 lines and once at the end.

    Args:
        load_txt_path: UTF-8 text file to read.
        save_path: file that receives the segmented lines (append mode).
    """
    # Raw string: '\d' is not a valid escape sequence.  The path value is
    # unchanged.
    ws = WS(r'.\data')
    time_start = time.time()

    # ckiptagger has no stop-word option, so stop words are filtered here.
    with open('../jieba/stopword.txt', 'r', encoding='utf-8') as file:
        stopword_set = set(file.read().split('\n'))

    with open(load_txt_path, 'r', encoding='utf8') as f:
        for i, txt_line in enumerate(f, start=1):
            if i % 1000 == 0:
                logging.info("已處理 {0} ".format(i))
                cost_time = time.time() - time_start
                print('ckiptagger 花了', cost_time / 3600, '小時')

            word_s = ws([txt_line],
                        sentence_segmentation=True,
                        segment_delimiter_set={
                            '?', '?', '!', '!', '。', ',', ',', ';', ':', '、'
                        })

            try:
                str_tmp = ''.join(
                    each_word + " "
                    for each_word_list in word_s
                    for each_word in each_word_list
                    if len(each_word) > 1 and each_word not in stopword_set
                )
            except Exception as e:
                # Best effort, as before: report the problem and skip the line.
                print(e)
                continue

            # Append and close per line so finished lines are on disk even if
            # a later line fails.
            with open(save_path, 'ab') as saveFile:
                saveFile.write(str_tmp.encode('utf-8') + '\n'.encode('utf-8'))

    cost_time = time.time() - time_start
    print('ckiptagger 花了', cost_time / 3600, '小時')
    print('save_path=', save_path)
def main(sentence_list):
    """Run the CKIP WS -> POS -> NER pipeline and print the results.

    For every sentence the function prints a blank line, the quoted
    sentence, each word with its POS tag as ``word(tag)`` separated by an
    ideographic space, and then the sentence's entities in sorted order.

    The model data is expected in ``./data`` (it can be fetched beforehand
    with ``data_utils.download_data("./")``).
    """
    data_dir = "./data"
    ws = WS(data_dir)
    pos = POS(data_dir)
    ner = NER(data_dir)

    word_sentence_list = ws(sentence_list)
    pos_sentence_list = pos(word_sentence_list)
    entity_sentence_list = ner(word_sentence_list, pos_sentence_list)

    # The models are no longer needed once inference is done.
    del ws, pos, ner

    def print_word_pos_sentence(word_sentence, pos_sentence):
        """Print one sentence as ``word(tag)`` items on a single line."""
        assert len(word_sentence) == len(pos_sentence)
        for token, tag in zip(word_sentence, pos_sentence):
            print(f"{token}({tag})", end="\u3000")
        print()

    for index, sentence in enumerate(sentence_list):
        print()
        print(f"'{sentence}'")
        print_word_pos_sentence(word_sentence_list[index],
                                pos_sentence_list[index])
        for entity in sorted(entity_sentence_list[index]):
            print(entity)
def main():
    """Segment ``./input.txt`` line by line and write tokens to ``output.txt``.

    Every input line is treated as one sentence.  The segmenter is given a
    recommend dictionary containing '橋本有菜' so that the name is kept as
    one word.  Each output line holds the sentence's tokens, each followed
    by a space.

    On the very first run the model data has to be downloaded into
    ``./data`` (``data_utils.download_data("./")``).
    """
    ws = WS("./data")
    pos = POS("./data")
    ner = NER("./data")

    # CKIP does not know this name, so it is supplied as a recommended word.
    word_to_weight = {
        "橋本有菜": 1,
    }
    dictionary = construct_dictionary(word_to_weight)

    # ``with`` closes the input file; it was previously left open.
    with open('./input.txt', "r", encoding="utf-8") as txt:
        sentence_list = [line.strip('\n') for line in txt]
    print(sentence_list)

    # Use ``ws(sentence_list)`` instead to segment without the dictionary.
    word_sentence_list = ws(sentence_list, recommend_dictionary=dictionary)
    pos_sentence_list = pos(word_sentence_list)
    # NER is still run as part of the pipeline, but its result is not
    # written to the output file.
    entity_sentence_list = ner(word_sentence_list, pos_sentence_list)

    # Release the models.
    del ws
    del pos
    del ner

    # ``with`` guarantees the output is flushed and closed; the handle was
    # previously never closed.
    with open('output.txt', 'w', encoding='utf-8') as output:
        for word_sentence, pos_sentence in zip(word_sentence_list,
                                               pos_sentence_list):
            assert len(word_sentence) == len(pos_sentence)
            output.write(''.join(f"{word}" + " " for word in word_sentence))
            output.write('\n')
def load_data():
    """Load the CKIP WS, POS and NER models from ``./core/data``.

    The models are created with the library's default device setting, which
    the original author used for CPU inference.  To use a GPU instead:

    1. install tensorflow-gpu (see the installation notes);
    2. set the CUDA_VISIBLE_DEVICES environment variable, e.g.
       ``os.environ["CUDA_VISIBLE_DEVICES"] = "0"``;
    3. pass ``disable_cuda=False``, e.g. ``WS("./data", disable_cuda=False)``.

    Returns:
        The tuple ``(ws, pos, ner)``.
    """
    data_dir = "./core/data"
    return WS(data_dir), POS(data_dir), NER(data_dir)
def text_preprocess(raw_text):
    """Clean ``raw_text`` and return its CKIP tokens as one flat list.

    Every character other than ASCII letters, ASCII digits and the range
    U+4E00-U+9FA5 is replaced with a space, then all spaces are removed.
    The cleaned text is segmented with the recommend dictionary unpickled
    from the module-level ``word_dict_file``, using the model at the
    module-level ``ckip_path``.

    Args:
        raw_text: the value to process; it is converted with ``str()``.

    Returns:
        A flat list with the tokens of all returned sentences.
    """
    # NOTE: unpickling executes code from the file; only use a trusted
    # dictionary file.  ``with`` closes the handle, which was leaked before.
    with open(word_dict_file, 'rb') as dict_file:
        word_dict = pickle.load(dict_file)

    ws = WS(ckip_path)

    rule = re.compile(r'[^a-zA-Z0-9\u4e00-\u9fa5]')
    cleaned = rule.sub(' ', str(raw_text))
    cleaned = re.sub(' +', '', cleaned)

    sentences = ws([cleaned],
                   sentence_segmentation=True,
                   recommend_dictionary=word_dict)
    return [token for sentence in sentences for token in sentence]
def __init__(self, root_dir, lexicon=None, coerce_dictionary=True):
    """Load the CKIP segmenter and prepare an optional custom dictionary.

    Args:
        root_dir: model data directory handed to ``WS`` (CUDA enabled).
        lexicon: iterable of words; each gets weight 1.  With the default
            ``None`` no dictionary is built and both dictionary attributes
            stay ``None`` (previously this default raised ``TypeError``).
        coerce_dictionary: when true the dictionary is stored as
            ``self.coerce_dictionary``, otherwise as
            ``self.recommend_dictionary``; the other attribute is ``None``.
    """
    self.ws = WS(root_dir, disable_cuda=False)
    self.coerce_dictionary = None
    self.recommend_dictionary = None
    self.segment_delimiter_set = {",", "。", ":", "?", "!", ";", "-"}

    if lexicon is None:
        return

    word_to_weight = {word: 1 for word in lexicon}
    dictionary = construct_dictionary(word_to_weight)
    if coerce_dictionary:
        self.coerce_dictionary = dictionary
    else:
        self.recommend_dictionary = dictionary
def __init__(self, ws_model_path, pos_model_path, w2v_model_path,
             anti_dict_path):
    """Download the NLTK corpora and load every model the instance uses.

    Args:
        ws_model_path: data directory for the CKIP ``WS`` model.
        pos_model_path: data directory for the CKIP ``POS`` model.
        w2v_model_path: file loaded with ``Word2Vec.load``.
        anti_dict_path: handed to ``self.build_antidict``; the result is
            stored as ``self.new_anti``.
    """
    for corpus_name in ('wordnet', 'omw'):
        nltk.download(corpus_name)

    self.ws = WS(ws_model_path)
    self.pos = POS(pos_model_path)
    self.model = Word2Vec.load(w2v_model_path)
    self.new_anti = self.build_antidict(anti_dict_path)

    # OpenCC converters for the 't2s' and 's2t' configurations.
    self.cc1 = opencc.OpenCC('t2s')
    self.cc2 = opencc.OpenCC('s2t')
def seg(content):
    """Segment ``content`` and return the tokens joined with single spaces.

    Tokens found in the module-level ``stopwords`` collection are left out.
    """
    segmenter = WS('C:/ckip-learning/data')
    tokens = segmenter([content])[0]
    kept = [token for token in tokens if token not in stopwords]
    return ' '.join(kept)
def __init__(self, GPU_MEMORY_FRACTION, CUDA_VISIBLE_DEVICES):
    """Configure the GPU and load the CKIP WS, POS and NER models.

    Args:
        GPU_MEMORY_FRACTION: value assigned to
            ``gpu_options.per_process_gpu_memory_fraction``.
        CUDA_VISIBLE_DEVICES: string exported as the environment variable of
            the same name (the GPU id(s) to expose).
    """
    print("set GPU stat...")
    # Export the device selection *before* the first TensorFlow session is
    # created; once CUDA has been initialised the variable is no longer
    # consulted.
    os.environ["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES

    cfg = tf.ConfigProto()
    cfg.gpu_options.per_process_gpu_memory_fraction = GPU_MEMORY_FRACTION
    # NOTE(review): this session is never used directly; presumably it is
    # created only so the memory cap is registered with TensorFlow - confirm.
    session = tf.Session(config=cfg)

    print("prepare ws pos ner")
    path = "./module/data"
    self.ws = WS(path, disable_cuda=False)
    self.pos = POS(path, disable_cuda=False)
    # Keep the NER model on the instance; it used to be loaded into a local
    # variable and discarded.
    self.ner = NER(path, disable_cuda=False)
    clear_output()
def generate_predict_data():
    '''
    Turn the sentences to be predicted into POS-tag sequences.

    Input is ``batch_all_2.csv`` (the ALL.csv produced by the primary-school
    maths pipeline).  A row is used when its 'Result' is 'Fail', its question
    splits into exactly as many clauses as it has matched frames, and every
    one of those frames also occurs in at least one 'Good' row.

    Outputs:
        test.csv   - per clause: comma-joined POS tags ('Question'),
                     comma-joined words ('Original') and the index of the
                     owning answer ('Mapping').
        answer.txt - one answer per line, in 'Mapping' index order.
    '''
    df = pd.read_csv('batch_all_2.csv', encoding='utf-8')

    # Frames that appear in at least one 'Good' row.
    good_pattern = set()
    for _, row in df.iterrows():
        if row['Result'] == 'Good':
            good_pattern.update(row['Matched_Frame_Sequential'].split(':'))

    # Clause delimiters.  The previous pattern '?|,|。|:|:' began with a bare
    # '?' and could not be compiled; a character class needs no escaping.
    # Both the ASCII and the full-width forms are accepted.
    clause_delimiters = re.compile(r'[??,,。::]')

    fail_sentence_list = []
    mapping = []
    answers = []
    count = 0
    for _, row in df.iterrows():
        sent = row['Question'].replace("(1)", "").replace("(2)", "")
        pattern = row['Matched_Frame_Sequential'].split(':')
        split_sent = clause_delimiters.split(sent)
        # A trailing delimiter leaves an empty last piece.
        if split_sent[-1] == '':
            split_sent = split_sent[:-1]

        if (len(split_sent) != len(pattern)
                or any(p not in good_pattern for p in pattern)):
            continue

        if row['Result'] == 'Fail':
            fail_sentence_list += split_sent
            mapping += [count] * len(split_sent)
            answers.append(row['Answer'])
            count += 1

    ws = WS("./data")
    pos = POS("./data")
    fail_word_sent_list = ws(fail_sentence_list)
    fail_pos_sent_list = pos(fail_word_sent_list)

    out = {'Question': [], 'Mapping': [], 'Original': []}
    for tags, words, owner in zip(fail_pos_sent_list, fail_word_sent_list,
                                  mapping):
        out['Question'].append(','.join(tags))
        out['Original'].append(','.join(words))
        out['Mapping'].append(owner)

    out_df = pd.DataFrame.from_dict(out)
    out_df.to_csv('test.csv')

    # The leftover pdb.set_trace() breakpoint was removed so the function
    # runs to completion unattended.
    with open('answer.txt', 'w') as file:
        for ans in answers:
            file.write(str(ans) + '\n')
def count(in_file, out_file):
    """Count n-grams over the segmented titles and contents of ``in_file``.

    Each title is joined to its content with a space, the combined texts are
    segmented with CKIP, punctuation is filtered from every segmented
    sentence, and the grams produced by ``get_all_gram(..., 3)`` are counted
    and written with ``write_counter``.
    """
    from ckiptagger import WS
    from collections import Counter

    titles, contents = get_data(in_file)
    assert len(titles) == len(contents)

    corpus = [' '.join((title, contents[idx]))
              for idx, title in enumerate(titles)]

    segmenter = WS('./data')
    word_seg = [filter_punc(sentence) for sentence in segmenter(corpus)]

    gram_counts = Counter(get_all_gram(word_seg, 3))
    write_counter(gram_counts, out_file)
def word_cut(file_path, output_file, punc_pkl, word_dict_file, text_column,
             ckip_path):
    """Segment one CSV column and export the file with a 'token' column.

    Args:
        file_path: CSV file to read.
        output_file: CSV file to write (without the index).
        punc_pkl: pickle file holding the segment delimiter set.
        word_dict_file: pickle file holding the recommend dictionary.
        text_column: name of the column to segment.
        ckip_path: CKIP model data directory.

    Every token has all characters other than ASCII letters, ASCII digits
    and U+4E00-U+9FA5 removed; the tokens of a row are then joined with '@'.
    """
    ws = WS(ckip_path)
    df = pd.read_csv(file_path)

    # NOTE: unpickling executes code from the file; only use trusted files.
    # ``with`` closes both handles, which were leaked before.
    with open(punc_pkl, 'rb') as punc_file:
        punc = pickle.load(punc_file)
    with open(word_dict_file, 'rb') as dict_file:
        word_dict = pickle.load(dict_file)

    word_s = ws(df[text_column],
                sentence_segmentation=True,
                segment_delimiter_set=punc,
                recommend_dictionary=word_dict)

    # Filter and output.
    rule = re.compile(r'[^a-zA-Z0-9\u4e00-\u9fa5]')
    word_s1 = [[rule.sub('', w) for w in sent] for sent in word_s]
    df['token'] = ['@'.join(tokens) for tokens in word_s1]
    df.to_csv(output_file, index=False)
    print(output_file, ' exported.')
def ckip_cut_gpu(input_data, data_col, do_NER=False):
    """Segment one DataFrame column on GPU 0 and return the cleaned tokens.

    Args:
        input_data: the whole CSV DataFrame; NaN cells are replaced by ''.
        data_col: name of the column to segment.
        do_NER: when true nothing is produced yet ("Not yet." is printed for
            every row and the returned frame stays empty).

    Returns:
        DataFrame with a single string column 'CKIP_Result'; each row holds
        the row's tokens (stop words from ``stopwordlist()`` removed) joined
        by spaces with a trailing space, after ASCII digits, non-word
        characters and ASCII letters have been stripped.

    Changes from the previous version:
        * stop words are now tested per token - the old code compared the
          whole token list against the stop words, which never matched, and
          then stringified the list itself;
        * rows are collected in a list instead of ``DataFrame.append``,
          which no longer exists in pandas 2;
        * ``dict2.txt`` is no longer read: the dictionary built from it was
          never passed to the segmenter.
    """
    from ckiptagger import WS

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    ws = WS("./data", disable_cuda=False)

    input_data = input_data.replace(np.nan, '', regex=True)
    tmp_text = list(input_data[data_col])
    stopwordslist = stopwordlist()

    total = len(tmp_text)
    rows = []
    for counter, things in enumerate(tmp_text, start=1):
        print("Now: ", str(counter), " of ", total)
        ckip_cut = ws(
            [things],
            sentence_segmentation=True,
            segment_delimiter_set={",", "。", ":", "?", "!", ";", "、"})

        if do_NER:
            print("Not yet.")
            continue

        kept = [
            word for sentence in ckip_cut for word in sentence
            if word not in stopwordslist
        ]
        text = " ".join(kept) + " "
        text = re.sub(r'[0-9]', '', text)
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'[a-zA-Z]', '', text)
        rows.append(text)

    del ws
    return pd.DataFrame({'CKIP_Result': pd.Series(rows, dtype='object')})
def main():
    """Fetch matching news titles, then segment and POS-tag a sample sentence.

    The query selects (id, title) rows from ``bingnews2`` whose title
    contains one of five phrases (use ``SELECT [columns] FROM [table]`` to
    fetch everything).  For every fetched row the hard-coded sample sentence
    is segmented and tagged, both arrays are printed, and the word -> tag
    pairs are accumulated in a dict that is serialised to JSON.

    The model paths and the stop-word file path are placeholders that must
    be replaced with real locations (download the models from the
    CkipTagger GitHub page).
    """
    sql1 = "SELECT id,title FROM bingnews2 WHERE title LIKE '%驚呆%'UNION SELECT id,title FROM bingnews2 WHERE title LIKE'%爆氣%' UNION SELECT id,title FROM bingnews2 WHERE title LIKE'%網友這麼說%' UNION SELECT id,title FROM bingnews2 WHERE title LIKE'%網友這樣說%'UNION SELECT id,title FROM bingnews2 WHERE title LIKE'%網驚%'"
    cs1.execute(sql1)
    alldata = cs1.fetchall()
    title = [record[1] for record in alldata]
    user = {}

    # Load the models without GPU.
    ws = WS("請上CKipTagger 的github下載模型,網址詳見READ")  # word segmentation
    pos = POS("請上CKipTagger 的github下載模型,網址詳見READ")  # POS tagging
    ner = NER("請上CKipTagger 的github下載模型,網址詳見READ")  # entity recognition

    # Build the custom dictionary from the exported stop-word text file,
    # read as a single-column, tab-delimited CSV.
    df_ner_dict = pd.read_csv(r"停用詞文件儲存位置",
                              delimiter="\t",
                              quoting=csv.QUOTE_NONE,
                              header=None,
                              encoding="utf-8")
    df_ner_dict.columns = ['NER']
    word_weights = {el: 1 for el in list(df_ner_dict['NER'])}
    dict_for_CKIP = construct_dictionary(word_weights)

    for _ in title:
        # Assign the current title here instead to process every fetched row.
        # NOTE(review): a bare string (not a list of sentences) is handed to
        # the segmenter - confirm this is what the model call expects.
        sentence_list = '朴敏英進廠「修鼻子」?最新近照曝光 網驚:有點怪怪的'
        word_s = np.ravel(ws(sentence_list, coerce_dictionary=dict_for_CKIP))
        word_p = np.ravel(pos(word_s))
        pos_sentence_list = pos(word_s)
        print(word_s)
        print(word_p)
        # Store every word with its tag; the dict is then dumped as JSON.
        user.update(zip(word_s, word_p))
        jsoninfo = json.dumps(user, ensure_ascii=False)
    print("complete")

    # Release the models.
    del ws, pos, ner
def cut_func(input_data,data_col,name):
    """Segment one DataFrame column on GPU 0 and return the cleaned tokens.

    Args:
        input_data: DataFrame to process; NaN cells are replaced by ''.
        data_col: name of the column to segment.
        name: label used only in the progress message.

    Returns:
        DataFrame with a single string column 'CKIP_Result'; each row holds
        the row's tokens (stop words from ``stopwordlist()`` removed) joined
        by spaces, after ASCII digits, ASCII letters, non-word characters
        and the ``re_punctuation`` matches have been stripped.

    While a row is being cleaned, ``do_NER`` runs on the segmented row in a
    separate thread; the thread is joined before the next row starts.

    Changes from the previous version:
        * stop words are now tested per token - the old code compared the
          whole token list against the stop words, which never matched, and
          then stringified the list itself;
        * rows are collected in a list instead of ``DataFrame.append``,
          which no longer exists in pandas 2.
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    from ckiptagger import data_utils, construct_dictionary, WS

    # dict.txt holds one "word [weight]" entry per line; the weight
    # defaults to 10 when it is missing.
    User_Dict = {}
    with open("dict.txt","r",encoding = 'utf-8') as USDic:
        for tmpwords in USDic:
            words = tmpwords.strip().split(" ")
            User_Dict[words[0]] = words[1] if len(words) > 1 else 10
    dictionary = construct_dictionary(User_Dict)

    ws = WS("./data",disable_cuda=False)
    print(input_data)

    # NOTE(review): as written, the value is two adjacent string literals
    # and everything from the first '#' on is a comment, so only the leading
    # characters reach the regex.  This looks like a quoting accident -
    # confirm the intended character set before changing it.
    punctuation = " 的也//,::""()\n!!?。"#$%&'()*+-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏""<->#。!⋯.➡?=&▶_%♀!❗🎉⏰💪🔥⁉❓"
    re_punctuation = "[{}] ".format(punctuation)

    input_data = input_data.replace(np.nan,'',regex = True)
    tmp_fbtext = list(input_data[data_col])
    stopwordslist = stopwordlist()
    print("Total Data to process: ",len(tmp_fbtext),'\n','----------------')

    rows = []
    for counter, things in enumerate(tmp_fbtext, start=1):
        print("Now processing:", name," No.",counter)
        ckip_cut = ws([things],
                      sentence_segmentation=True,
                      segment_delimiter_set = {",", "。", ":", "?", "!", ";", "、"},
                      coerce_dictionary = dictionary)

        ner_thread = threading.Thread(target = do_NER, args = (ckip_cut,))
        ner_thread.start()

        kept = [
            word for sentence in ckip_cut for word in sentence
            if word not in stopwordslist
        ]
        text = " ".join(kept) + " "
        text = re.sub(r'[0-9]','',text)
        text = re.sub(r'[a-zA-Z]','',text)
        text = re.sub(r'[^\w\s]','',text)
        text = re.sub(re_punctuation,'',text)
        rows.append(text)

        ner_thread.join()

    return pd.DataFrame({'CKIP_Result': pd.Series(rows, dtype='object')})
def check_model_and_load(self):
    """Load the CKIP WS and POS models unless both are already present.

    Nothing happens when ``self._ws`` and ``self._pos`` are both truthy.
    Otherwise both models are (re)loaded from ``self._model_path``.  CUDA is
    disabled (CPU inference) unless the CUDA_VISIBLE_DEVICES environment
    variable is present.

    To run on a GPU:
    1. install tensorflow-gpu (see Installation);
    2. set CUDA_VISIBLE_DEVICES, e.g.
       ``os.environ["CUDA_VISIBLE_DEVICES"] = "0"``.
    """
    if self._ws and self._pos:
        return

    # The variable being defined at all is taken as the request for GPU use.
    disable_cuda = "CUDA_VISIBLE_DEVICES" not in os.environ

    self._logger.info("ckiptagger WS/POS: Model Loading...")
    self._ws = WS(self._model_path, disable_cuda=disable_cuda)
    self._pos = POS(self._model_path, disable_cuda=disable_cuda)
    self._logger.info("ckiptagger WS/POS: Model Done...")
def WordSegment_and_write2file(give):
    """Segment ``give`` and return the cleaned and the raw token lists.

    The segmenter is given a recommend dictionary built from the pickled
    ``WikiDict_plus_allfieldskeywordsDict.pkl``.  The raw result is also
    appended to the module-level ``All`` list.

    Returns:
        ``(new_final, word_sentence_list)`` - ``new_final`` mirrors
        ``word_sentence_list`` with every token passed through
        ``remove_punctuation`` and tokens that become empty dropped.
    """
    ws = WS("./data",disable_cuda=False)

    with open('WikiDict_plus_allfieldskeywordsDict.pkl', 'rb') as fp:
        keyword_weights = pickle.load(fp)

    word_sentence_list = ws(
        give,
        sentence_segmentation = True,
        segment_delimiter_set = {",", "。", ":", "?", "!", ";", "?", ",", "、", " ", "。", "!", "? ", "NULL","\n","\n3000","(",")","=","/"},
        recommend_dictionary = construct_dictionary(keyword_weights),
    )
    All.append(word_sentence_list)

    new_final = []
    for sentence in word_sentence_list:
        stripped = (remove_punctuation(token) for token in sentence)
        new_final.append([token for token in stripped if token != ""])

    return new_final,word_sentence_list
def generate_corpus(raw_data, train=True):
    """Build a space-joined token corpus (and labels) from a CSV file.

    The file is read one row at a time; every row is printed, its 'text'
    value is segmented with CKIP and the tokens are joined with spaces.

    Args:
        raw_data: path or buffer of a comma-separated file with a header
            row and a 'text' column (plus 'tags' when ``train`` is True).
        train: when equal to ``True`` the 'tags' value of each row is
            collected as well.

    Returns:
        ``(corpus, y_train)``; ``y_train`` stays empty unless ``train`` is
        True.
    """
    segmenter = WS("./data")
    corpus, y_train = [], []

    for chunk in pd.read_csv(raw_data, sep=',', header=0, chunksize=1):
        print(chunk)
        tokens = segmenter([chunk['text'].values[0]])[0]
        corpus.append(' '.join(str(token) for token in tokens))
        if train == True:
            y_train.append(chunk['tags'].values[0])

    return corpus, y_train