Example #1
    def _init(self):
        import ckiptagger

        if self._recommend_lexicons:
            self._opts['recommend_dictionary'] = ckiptagger.construct_dictionary(self._recommend_lexicons)
        if self._coerce_lexicons:
            self._opts['coerce_dictionary'] = ckiptagger.construct_dictionary(self._coerce_lexicons)

        self._core = ckiptagger.WS(_get_tagger_data(), disable_cuda=self._disable_cuda)
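
For reference, a minimal self-contained sketch of the same pattern outside a class; the "./data" model directory and the example lexicon are assumptions, not part of the snippet above:

from ckiptagger import WS, construct_dictionary

recommend_lexicons = {"緯來體育台": 1, "土地公": 1}  # hypothetical word -> weight mapping
recommend_dictionary = construct_dictionary(recommend_lexicons)

ws = WS("./data", disable_cuda=True)  # assumes the model files were downloaded to ./data
print(ws(["傅達仁今將執行安樂死"], recommend_dictionary=recommend_dictionary)[0])
del ws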
Example #2
def ckipnlp_cutwords(data: pd.DataFrame, ws, *args: str,
                     **kwargs) -> pd.DataFrame:
    wg_dict = {}
    for wordPackage in args:
        wg_dict.update(txt_to_dict('頻道列表/' + wordPackage + '.txt'))
    wg_dict = construct_dictionary(wg_dict)
    data2 = data.copy()
    cut = []
    for text in data['textOriginal']:
        comment_cut = ws([text], recommend_dictionary=wg_dict)[0]
        cut.append(comment_cut)
        print(comment_cut)
        # cut.append(ws([text])[0])
    data2['ckipnlp_cut'] = cut
    year_month_cut(data2)
    if kwargs.get('language'):
        # If the 'language' keyword argument is given, kwargs.get() returns its value (True or False); if it is absent, it returns None, which is treated as False
        data2['traditional'] = [
            1 if check.hasTraditional(s) else 0 for s in data2['textOriginal']
        ]
        data2['simplified'] = [
            1 if check.hasSimplified(s) else 0 for s in data2['textOriginal']
        ]
        data2['english'] = [
            1 if check.hasEnglish(s) else 0 for s in data2['textOriginal']
        ]
    data2.reset_index(inplace=True, drop=True)
    return data2
Example #3
def word_seg(text):
    userdic = []
    userDic = {}
    with open('C:/ckip-learning/project/Dict/userDict.txt',
              'r',
              encoding='utf-8') as f1:
        us = f1.readlines()
        for t in us:
            t1 = t.replace('\n', '')
            if len(t1) == 1:
                pass
            else:
                userdic.append(t1)
        for t2 in userdic:
            userDic[t2] = 1
    dictionary = construct_dictionary(userDic)
    stopWords = []
    ws_result = []
    with open('C:/ckip-learning/project/Dict/stopDict.txt',
              'r',
              encoding='utf-8') as s:
        st = s.readlines()
        for std in st:
            stopWords.append(std.replace('\n', ''))
    ws = WS('C:/ckip-learning/data')
    words = ws([text], recommend_dictionary=dictionary)
    for word in words[0]:
        if word in stopWords:
            pass
        elif len(word_filter(word)) == 0:
            pass
        else:
            ws_result.append(word)
    res = ','.join(ws_result)
    return res
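
Usage is a single call, assuming the hard-coded model, user-dictionary, and stop-word paths above exist and that the word_filter helper referenced in the loop is defined elsewhere in the module:

print(word_seg('傅達仁今將執行安樂死,卻突然爆出自己20年前遭緯來體育台封殺'))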
Example #4
 def __init__(self, ckip_data_path='./data', custom_dict_path='./dict'):
     # Load model
     self.ws = WS(ckip_data_path)
     self.pos = POS(ckip_data_path)
     self.ner = NER(ckip_data_path)
     self.dictionary = construct_dictionary(
         self.__load_custom_dict(custom_dict_path))
Example #5
def create_word_dict(legal_name_file, word_file, output_file):
    with open(legal_name_file, 'r', encoding='big5') as k1, open(word_file, 'r', encoding='big5') as k2:
        k = k1.read().split('\n') + k2.read().split('\n')
        word_to_weight = dict([(_, 1) for _ in k])
    dictionary = construct_dictionary(word_to_weight)
    pickle.dump(dictionary, open(output_file, 'wb'))
    print(output_file, ' exported.')
Example #6
def main():
    # Download data
    # data_utils.download_data("./")  # needed on the first run: remove the leading "#" to download the model data

    # Load model
    ws = WS("./data")
    pos = POS("./data")
    ner = NER("./data")

    word_to_weight = {
        "橋本有菜": 1,
    }  # CKIP does not know "橋本有菜", so we teach it via a custom dictionary
    dictionary = construct_dictionary(word_to_weight)

    # Read the input text file into the list format CKIP expects
    sentence_list = []
    with open('./input.txt', 'r', encoding='utf-8') as txt:
        for line in txt:
            sentence_list.append(line.strip('\n'))
    print(sentence_list)

    # Run WS-POS-NER pipeline
    '''sentence_list = [
        "傅達仁今將執行安樂死,卻突然爆出自己20年前遭緯來體育台封殺,他不懂自己哪裡得罪到電視台。",
        "美國參議院針對今天總統布什所提名的勞工部長趙小蘭展開認可聽證會,預料她將會很順利通過參議院支持,成為該國有史以來第一位的華裔女性內閣成員。",
        "",
        "土地公有政策??還是土地婆有政策。.",
        "… 你確定嗎… 不要再騙了……",
        "最多容納59,000個人,或5.9萬人,再多就不行了.這是環評的結論.",
        "科長說:1,坪數對人數為1:3。2,可以再增加。",
    ]'''
    # word_sentence_list = ws(sentence_list)
    word_sentence_list = ws(
        sentence_list,
        recommend_dictionary=dictionary)  # use this call (with the dictionary) so 橋本有菜 is recognized; use the commented line above to go without it
    pos_sentence_list = pos(word_sentence_list)
    entity_sentence_list = ner(word_sentence_list, pos_sentence_list)

    # Release model
    del ws
    del pos  # once deployed to the cloud we probably do not need to release the models manually
    del ner

    # Show results
    output = open('output.txt', 'w', encoding='utf-8')  # output text file

    def print_word_pos_sentence(word_sentence, pos_sentence):
        assert len(word_sentence) == len(pos_sentence)
        for word, pos in zip(word_sentence, pos_sentence):
            #print(f"{word}", end="\u3000")
            output.write(f"{word}" + " ")  # this is where the output is actually written
        #print()
        output.write('\n')

    for i, sentence in enumerate(sentence_list):
        #print()
        #print(f"'{sentence}'")
        print_word_pos_sentence(word_sentence_list[i], pos_sentence_list[i])
    output.close()  # make sure the results are flushed to output.txt
Example #7
def main():
    # Download data
    data_utils.download_data("./")

    # Load model
    ws = WS("./data")
    pos = POS("./data")
    ner = NER("./data")

    # Create custom dictionary
    word_to_weight = {
        "土地公": 1,
        "土地婆": 1,
        "公有": 2,
        "": 1,
        "來亂的": "啦",
        "緯來體育台": 1,
    }
    dictionary = construct_dictionary(word_to_weight)
    print(dictionary)

    # Run WS-POS-NER pipeline
    sentence_list = [
        "傅達仁今將執行安樂死,卻突然爆出自己20年前遭緯來體育台封殺,他不懂自己哪裡得罪到電視台。",
        "美國參議院針對今天總統布什所提名的勞工部長趙小蘭展開認可聽證會,預料她將會很順利通過參議院支持,成為該國有史以來第一位的華裔女性內閣成員。",
        "",
        "土地公有政策??還是土地婆有政策。.",
        "… 你確定嗎… 不要再騙了……",
        "最多容納59,000個人,或5.9萬人,再多就不行了.這是環評的結論.",
        "科長說:1,坪數對人數為1:3。2,可以再增加。",
    ]
    word_sentence_list = ws(sentence_list)
    # word_sentence_list = ws(sentence_list, sentence_segmentation=True)
    # word_sentence_list = ws(sentence_list, recommend_dictionary=dictionary)
    # word_sentence_list = ws(sentence_list, coerce_dictionary=dictionary)
    pos_sentence_list = pos(word_sentence_list)
    entity_sentence_list = ner(word_sentence_list, pos_sentence_list)

    # Release model
    del ws
    del pos
    del ner

    # Show results
    def print_word_pos_sentence(word_sentence, pos_sentence):
        assert len(word_sentence) == len(pos_sentence)
        for word, pos in zip(word_sentence, pos_sentence):
            print(f"{word}({pos})", end="\u3000")
        print()
        return

    for i, sentence in enumerate(sentence_list):
        print()
        print(f"'{sentence}'")
        print_word_pos_sentence(word_sentence_list[i], pos_sentence_list[i])
        for entity in sorted(entity_sentence_list[i]):
            print(entity)
    return
Example #8
    def __init__(self, model_path=None, dict_path=None, coerce_dict=None):
        self._logger = logging.getLogger(__name__)

        self._recommend_dict = {}
        if dict_path:
            self._recommend_dict = construct_dictionary(
                self.load_userdict(dict_path))

        self._coerce_dict = {}
        if coerce_dict:
            self._coerce_dict = construct_dictionary(
                self.load_userdict(coerce_dict))

        self._model_path = ""
        if model_path:
            self._model_path = model_path

        self._ws = None
        self._pos = None
Example #9
 def __init__(self, root_dir, lexicon=None, coerce_dictionary=True):
     self.ws = WS(root_dir, disable_cuda=False)
     # Guard against the default lexicon=None, which would break the dict comprehension
     word_to_weight = {word: 1 for word in lexicon} if lexicon else {}
     self.coerce_dictionary = None
     self.recommend_dictionary = None
     self.segment_delimiter_set = {",", "。", ":", "?", "!", ";", "-"}
     dictionary = construct_dictionary(word_to_weight)
     if coerce_dictionary:
         self.coerce_dictionary = dictionary
     else:
         self.recommend_dictionary = dictionary
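
A hedged sketch of how the attributes prepared in this constructor would typically be forwarded to ws() later; the segment method name is an assumption, since only __init__ is shown:

 def segment(self, sentences):
     # Forward only the dictionary that was actually built in __init__,
     # leaving the other keyword at ckiptagger's default.
     kwargs = {"sentence_segmentation": True,
               "segment_delimiter_set": self.segment_delimiter_set}
     if self.recommend_dictionary is not None:
         kwargs["recommend_dictionary"] = self.recommend_dictionary
     if self.coerce_dictionary is not None:
         kwargs["coerce_dictionary"] = self.coerce_dictionary
     return self.ws(sentences, **kwargs)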
Example #10
    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
        super(CKIPTokenizer, self).__init__(component_config)

        # 'model_path' must be configured; otherwise raise an exception
        if not self.component_config.get("model_path"):
            raise Exception("model_path must be configured")

        # construct recommend_dict if 'recommend_dict_path' is configured
        self._recommend_dict = {}
        if self.component_config.get("recommend_dict_path", None):
            self._recommend_dict = construct_dictionary(
                self.load_userdict(
                    self.component_config.get("recommend_dict_path")))

        # construct coerce_dict if 'coerce_dict_path' is configured
        self._coerce_dict = {}
        if self.component_config.get("coerce_dict_path", None):
            self._coerce_dict = construct_dictionary(
                self.load_userdict(
                    self.component_config.get("coerce_dict_path")))

        self._ws = WS(self.component_config.get("model_path"))
Example #11
def ckip(keywords):
	""" CKIP Lab Chinese NLP """

	# Point the three tools at the model data downloaded earlier
	# Load model
	ws = WS("./data")
	pos = POS("./data")
	ner = NER("./data")

	# Custom dictionaries
	if os.path.isfile('./school_data.csv'):  # check for the official school-name list
		print("發現官方學校名稱檔案,將作為強制詞加入字典")
		force_dictionary = construct_dictionary(school('school_data', True))
	else:
		force_dictionary = {}
	if os.path.isfile('./school_alias.csv'):  # aliases, abbreviations, and other informal names
		print("發現非官方學校名稱檔案,將作為推薦詞加入字典")
		encourage_dictionary = construct_dictionary(school('school_alias'))
	else:
		encourage_dictionary = {}

	# Analyze the text
	ws_results = ws(keywords, recommend_dictionary = encourage_dictionary, coerce_dictionary = force_dictionary)
	# pos_results = pos(ws_results)
	# ner_results = ner(ws_results, pos_results)  # ner(text, POS results)

	# Results
	# print(ws_results)  # word segmentation
	# print(pos_results)  # part-of-speech tags
	# for name in ner_results[0]:  # named-entity recognition
	#     print(name)

	# release memory
	del ws
	del pos
	del ner

	return ws_results
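
This example uses both dictionary keywords at once: per the ckiptagger comments echoed in Example #17 below, words in recommend_dictionary are encouraged while words in coerce_dictionary are forced, so official school names are never split apart and aliases are only favoured. A minimal sketch of the distinction; the model path and weight are illustrative:

from ckiptagger import WS, construct_dictionary

ws = WS("./data")  # assumed model directory
d = construct_dictionary({"土地公": 1})
print(ws(["土地公有政策"], recommend_dictionary=d)[0])  # "土地公" is favoured but not guaranteed
print(ws(["土地公有政策"], coerce_dictionary=d)[0])     # "土地公" is always kept as one token
del ws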
Example #12
def ckip_cut_gpu(input_data,
                 data_col,
                 do_NER=False):  # input_data: whole CSV DataFrame; data_col: name of the column to segment
    from ckiptagger import WS, construct_dictionary

    User_Dict = {}
    with open("dict2.txt", "r", encoding='utf-8') as USDic:
        for tmpwords in USDic:
            words = tmpwords.strip().split(" ")
            if len(words) > 1:
                User_Dict[words[0]] = words[1]
            else:
                User_Dict[words[0]] = 10
    dictionary = construct_dictionary(User_Dict)
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    ws = WS("./data", disable_cuda=False)

    input_data = input_data.replace(np.nan, '', regex=True)
    tmp_text = list(input_data[data_col])
    stopwordslist = stopwordlist()
    ckip_cut_result = pd.DataFrame(columns=['CKIP_Result'])
    ckip_cut_result['CKIP_Result'] = ckip_cut_result['CKIP_Result'].astype(
        'str')
    total = len(tmp_text)
    counter = 1
    tmp_things = []
    for things in tmp_text:
        print("Now: ", str(counter), " of ", total)
        tmp_things.append(things)
        ckip_cut = ws(
            tmp_things,
            sentence_segmentation=True,
            segment_delimiter_set={",", "。", ":", "?", "!", ";", "、"}
        )  #sentence_segmentation=True,segment_delimiter_set = {",", "。", ":", "?", "!", ";", "、"},coerce_dictionary = dictionary
        tmp_things.clear()
        if do_NER:
            print("Not yet.")
        else:
            text = ''
            # ws() was given a single-element list, so take its first (and only) token list
            for cutted in ckip_cut[0]:
                if cutted not in stopwordslist:
                    text = str(cutted) + " " + text
            text = re.sub(r'[0-9]', '', text)
            text = re.sub(r'[^\w\s]', '', text)
            text = re.sub(r'[a-zA-Z]', '', text)
            tmp = pd.Series({'CKIP_Result': text})
            ckip_cut_result = ckip_cut_result.append(tmp, ignore_index=True)
            counter += 1
    del ws
    return ckip_cut_result
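
The loop above calls ws() once per row, refilling a one-element list each time. ckiptagger also accepts the whole list of texts in a single call, which avoids the per-call overhead; a hedged sketch of the batched variant, reusing ws, input_data, data_col, np, and pd from the function above and leaving out the stop-word filtering:

    texts = list(input_data.replace(np.nan, '', regex=True)[data_col])
    all_cuts = ws(texts,
                  sentence_segmentation=True,
                  segment_delimiter_set={",", "。", ":", "?", "!", ";", "、"})
    ckip_cut_result = pd.DataFrame(
        {'CKIP_Result': [" ".join(map(str, tokens)) for tokens in all_cuts]})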
Example #13
def main():
    sql1 = "SELECT id,title FROM bingnews2 WHERE title LIKE '%驚呆%'UNION SELECT id,title FROM bingnews2 WHERE title LIKE'%爆氣%' UNION SELECT id,title FROM bingnews2 WHERE title LIKE'%網友這麼說%' UNION SELECT id,title FROM bingnews2 WHERE title LIKE'%網友這樣說%'UNION SELECT id,title FROM bingnews2 WHERE title LIKE'%網驚%'"
    # Fetch only part of the table; to fetch everything use: SELECT [columns] FROM [table]
    cs1.execute(sql1)
    idc = []  # ids
    title = []  # titles
    user = {}
    str4 = ""
    alldata = cs1.fetchall()
    for s in alldata:
        idc.append(s[0])
        title.append(s[1])
    #print(len(idc))
    # Load model without GPU
    ws = WS("<path to the model downloaded from the CKipTagger GitHub; see the README>")  # word segmentation
    pos = POS("<path to the model downloaded from the CKipTagger GitHub; see the README>")  # POS tagging
    ner = NER("<path to the model downloaded from the CKipTagger GitHub; see the README>")  # named-entity recognition

    # Create custom dictionary
    # Read the previously exported txt file the same way as a CSV
    df_ner_dict = pd.read_csv(r"<path to the stop-word file>",
                              delimiter="\t",
                              quoting=csv.QUOTE_NONE,
                              header=None,
                              encoding="utf-8")  # use the stop-word list
    # Store in a list
    df_ner_dict.columns = ['NER']
    list_ner_dict = list(df_ner_dict['NER'])
    dict_for_CKIP = dict((el, 1) for el in list_ner_dict)
    dict_for_CKIP = construct_dictionary(dict_for_CKIP)
    for i in range(len(title)):
        sentence_list = '朴敏英進廠「修鼻子」?最新近照曝光 網驚:有點怪怪的'  # change to sentence_list = title[i] to process every title in the table
        idh = idc[i]
        # ws() expects a list of sentences; pos() expects the resulting list of token lists
        word_sentence_list = ws([sentence_list],
                                coerce_dictionary=dict_for_CKIP)
        pos_sentence_list = pos(word_sentence_list)
        word_s = np.ravel(word_sentence_list)  # word segmentation result as a flat array
        word_p = np.ravel(pos_sentence_list)  # POS tags as a flat array
        print(word_s)
        print(word_p)

    for key, value in zip(word_s, word_p):  # store each word and its POS tag as a key-value pair for the JSON output
        user[key] = value
        jsoninfo = json.dumps(user, ensure_ascii=False)

    print("complete")
    # Release model
    del ws
    del pos
    del ner
Example #14
def cut_func(input_data,data_col,name):
	os.environ["CUDA_VISIBLE_DEVICES"] = "0"
	from ckiptagger import data_utils, construct_dictionary, WS
	User_Dict = {}
	with open("dict.txt","r",encoding = 'utf-8') as USDic:
		for tmpwords in USDic:
			words = tmpwords.strip().split(" ")
			if len(words) > 1:
				User_Dict[words[0]] = words[1]
			else:
				User_Dict[words[0]] = 10
	dictionary = construct_dictionary(User_Dict)
	ws = WS("./data",disable_cuda=False)
	# pos = POS("/data")
	# ner = NER("/data")
	print(input_data)

	punctuation = " 的也//,::""()\n!!?。"#$%&'()*+-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏""<->#。!⋯.➡?=&▶_%♀!❗🎉⏰💪🔥⁉❓"
	re_punctuation = "[{}] ".format(punctuation)
	input_data = input_data.replace(np.nan,'',regex = True)
	tmp_fbtext = list(input_data[data_col])
	stopwordslist = stopwordlist()
	ckip_pd = pd.DataFrame(columns = ['CKIP_Result'])
	ckip_pd['CKIP_Result'] = ckip_pd['CKIP_Result'].astype('str')
	print("Total Data to process: ",len(tmp_fbtext),'\n','----------------')
	counter = 1
	tmp_things = []
	for things in tmp_fbtext:
		print("Now processing:", name," No.",counter)
		tmp_things.append(things)
		ckip_cut = ws(tmp_things,sentence_segmentation=True,segment_delimiter_set = {",", "。", ":", "?", "!", ";", "、"},coerce_dictionary = dictionary)
		text = ''
		tmp_things.clear()
		ner_thread = threading.Thread(target = do_NER, args = (ckip_cut,))
		ner_thread.start()
		# ws() was given a single-element list, so take its first (and only) token list
		for cutted in ckip_cut[0]:
			if cutted not in stopwordslist:
				text = str(cutted) + " " + text
		text = re.sub(r'[0-9]','',text)
		text = re.sub(r'[a-zA-Z]','',text)
		text = re.sub(r'[^\w\s]','',text)
		text = re.sub(re_punctuation,'',text)
		tmp = pd.Series({'CKIP_Result' : text})
		ckip_pd = ckip_pd.append(tmp,ignore_index = True)
		ner_thread.join()
		counter += 1
	return ckip_pd
Example #15
def WordSegment_and_write2file(give):
    ws = WS("./data",disable_cuda=False)

    with open('WikiDict_plus_allfieldskeywordsDict.pkl', 'rb') as fp:
        WikiDict_plus_allfieldskeywordsDict = pickle.load(fp)
    fp.close()

    for i in [give]:
            # print(i)
            word_sentence_list = ws(
                i, 
                sentence_segmentation = True,
                segment_delimiter_set = {",", "。", ":", "?", "!", ";", "?", ",", "、", " ", "。", "!", "? ", "NULL","\n","\n3000","(",")","=","/"},
                recommend_dictionary = construct_dictionary(WikiDict_plus_allfieldskeywordsDict),
            )
            # print(word_sentence_list)

            # with open('allfields_list.pkl', 'wb') as fp:
            #     pickle.dump(word_sentence_list, fp)
            # fp.close()

            # print("1")
            All.append(word_sentence_list)
            # del word_sentence_list
            


            # with open("allfields_list.pkl",'rb') as f:
            #     final = pickle.loads(f.read())
                # print("2")
                # print(final)
                
            new_final = []
            for i in word_sentence_list:
                new_i = []
                # print(i)
                for j in i:
                    j = remove_punctuation(j)
                    # print(j)
                    if j != "" :
                        new_i.append(j)
                new_final.append(new_i)
                # print(new_final)
            # print("$$$$$",new_final)
            return new_final,word_sentence_list
Example #16
 def __init__(self,
              ckip_data_path='./data',
              custom_dict_path=None,
              disable_cuda=True,
              cuda_memory_limit=2048):
     if not disable_cuda:
         gpus = tf.config.experimental.list_physical_devices('GPU')
         try:
             tf.config.experimental.set_virtual_device_configuration(
                 gpus[0], [
                     tf.config.experimental.VirtualDeviceConfiguration(
                         cuda_memory_limit)
                 ])
         except RuntimeError as e:
             print(e)
     # Load model
     self.ws = WS(ckip_data_path, disable_cuda=disable_cuda)
     self.pos = POS(ckip_data_path, disable_cuda=disable_cuda)
     self.ner = NER(ckip_data_path, disable_cuda=disable_cuda)
     if (custom_dict_path is not None):
         self.dictionary = construct_dictionary(
             self.__load_custom_dict(custom_dict_path))
     else:
         self.dictionary = {}
Example #17
    print("read data in...")
    data = np.load(FILENAME)
    if (LIMIT):
        data = data[:1000]

    print("read WORD_TO_WEIGHT in...")
    word_to_weight = {}
    with open(WORD_TO_WEIGHT, encoding='utf-8') as f:
        for line in f:
            word = line.split('\n')[0]
            if (word not in word_to_weight):
                word_to_weight[word] = 1
            else:
                word_to_weight[word] += 1
    dictionary = construct_dictionary(word_to_weight)

    print("start segmentation...")
    word_sentence_list = ws(
        data,
        sentence_segmentation=True,  # To consider delimiters
        # segment_delimiter_set = {",", "。", ":", "?", "!", ";"},  # this is the default set of delimiters
        # recommend_dictionary = dictionary1, # words in this dictionary are encouraged
        # coerce_dictionary = dictionary2, # words in this dictionary are forced
    )

    print("start POS...")
    pos_sentence_list = pos(word_sentence_list)

    print("start to save the result...")
    savename = "%s_ws.json" % FILENAME[:-4]
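
The snippet stops right after computing savename; a minimal sketch of how the results might then be written out, assuming the standard json module and that word_sentence_list and pos_sentence_list are plain nested lists of strings:

    import json
    with open(savename, "w", encoding="utf-8") as f:
        json.dump({"ws": word_sentence_list, "pos": pos_sentence_list},
                  f, ensure_ascii=False)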
Example #18
import telepot
import datetime
from telethon import TelegramClient, sync
from telethon import events, functions, types
from telethon.tl.types import PeerUser, PeerChat, PeerChannel
from telethon.tl.functions.messages import AddChatUserRequest
from ckiptagger import data_utils, construct_dictionary, WS
# library imports complete

from config import base
from config import weights_dictionary

base = base.base()
ws = WS("./data")
recommend_dictionary = weights_dictionary.coerce_dictionary()
recommend_dictionary = construct_dictionary(recommend_dictionary)
coerce_dictionary = weights_dictionary.coerce_dictionary()
coerce_dictionary = construct_dictionary(coerce_dictionary)
# load configuration and resource files

owner = base['owner']
timezone = base['timezone']
bots_len = len(base['tgbots'])
group_name = base['group_name']
channel_id = base['channel_id']
interval_time = base['interval']
sleep_time = interval_time / bots_len
# basic definitions

bots = []
for i in range(len(base['tgbots'])):
Example #19
path = str(Path.home()) + '/ckip/'
zh_ws = WS(path + '/data')
weight = 1
user_words = list()
user_dict = dict()

# load user words
with open('ckip_ud.txt', 'r', encoding='utf8') as f:
    for l in f.readlines():
        user_words.append(l.strip())
user_words = list(set(user_words))
# create user dictionary
for w in user_words:
    user_dict[w] = weight
user_dictionary = construct_dictionary(user_dict)


def show(doc):
    global add_word
    title = []
    content = []
    c_sentence = doc['content']
    t_sentence = doc['title']
    n = 0
    while n < len(add_word):
        if add_word[n] in t_sentence:
            t_sentence = t_sentence.replace(add_word[n], '', 1)
        else:
            n += 1
    doc_title = zh_nlp(t_sentence)
Example #20
def tokenize(news_df):
    """
    Tokenize the news and extract keywords.
    :param news_df: (title, content, date) DataFrame sorted by date in ascending order (January => December)
    :return: df: ('ori_title', 'ori_news', 'tok_title_news', 'keyWord_algorithm')
    """
    load_start = time.time()
    define_dict_path = "./TokSentLeo/user_dict/company_dict.txt"
    model_path = './TokSentLeo/CKIP_model/'
    ws = WS(model_path)
    pos = POS(model_path)
    ner = NER(model_path)

    word_to_weight = {}
    with open(define_dict_path, "r", encoding='utf8') as file:
        for line in file:
            key, value = line.split()
            word_to_weight[str(key)] = 2
    dictionary = construct_dictionary(word_to_weight)
    all_date_li = news_df.Date.tolist()
    all_news_li = news_df.Content.tolist()
    all_title_li = news_df.Title.tolist()
    all_news_li2 = []
    for title, news in zip(all_title_li, all_news_li):
        if type(news) == float:  # news is nan, only title
            all_news_li2.append(title)
        elif type(title) == float:
            all_news_li2.append(news)
        else:
            all_news_li2.append(title + ":" + news)

    load_end = time.time() - load_start
    tokenize_start = time.time()
    print(
        "Model Load Time:",
        '{:02f}:{:02f}:{:02f}'.format(load_end // 3600,
                                      (load_end % 3600 // 60), load_end % 60))

    word_sentence_list = ws(all_news_li2,
                            recommend_dictionary=dictionary,
                            segment_delimiter_set={
                                ",", "。", ":", "?", "!", ";", ",", ":", "?",
                                "!", ";"
                            })
    pos_sentence_list = pos(word_sentence_list)
    entity_sentence_list = ner(word_sentence_list, pos_sentence_list)
    temp = []
    temp1 = []
    bad_list = [
        "<br>", "br", "BR", "<BR>", ",", "【", "】", "╱", "▲", "▼",
        "&amp;amp;amp;amp;amp;lt;br", "&amp;amp;amp;amp;amp;gt", "amp", "lt",
        "br&", "gt", "&amp", "[", "]"
    ]
    for w_s_l, e_s_l in zip(word_sentence_list, entity_sentence_list):
        # t = []
        # t1 = []
        # for i, x in enumerate(w_s_l):
        #     if x not in bad_list:
        #         t.append(x)
        #         t1.append(e_s_l[i])
        t = [x for x in w_s_l if x not in bad_list]
        temp.append(t)
        # temp1.append(t1)
    word_sentence_list = temp
    # entity_sentence_list = temp1

    tokenize_end = time.time() - tokenize_start
    print(
        "DL Tokenize Time:",
        '{:02f}:{:02f}:{:02f}'.format(tokenize_end // 3600,
                                      (tokenize_end % 3600 // 60),
                                      tokenize_end % 60))
    algo_start = time.time()

    # by news TF
    TF_news_li = []
    for i, news_toks in enumerate(word_sentence_list):
        temp = dict(Counter(news_toks))
        df = pd.DataFrame(list(temp.items()),
                          columns=['Word', 'TF_norm_score'])
        df.TF_norm_score = (df.TF_norm_score - df.TF_norm_score.min()) / (
            df.TF_norm_score.max() - df.TF_norm_score.min())
        # for item in temp.items():
        #     df.append(pd.Series(list(item)))
        all_words = df.Word.tolist()
        for i, w in enumerate(all_words):
            if len(w) < 2:
                df.iloc[i, 1] = df.iloc[i, 1] - (df.TF_norm_score.mean() +
                                                 3 * df.TF_norm_score.std())
        TF_news_li.append(df)

    # by news NER
    NER_news_li = []
    for i, en_sentence in enumerate(entity_sentence_list):
        df = pd.DataFrame(columns=['Word', 'NER'])
        word = []
        for entity in en_sentence:
            word_ = entity[-1]
            if word_ in word:
                continue
            word.append(word_)
            temp = [word_, entity[-2]]
            temp = pd.Series(temp, index=df.columns)
            df = df.append(temp, ignore_index=True)
        NER_news_li.append(df)

    # by news TFIDF
    TFIDF_df_li = []
    all_corpus = []
    for sentence in word_sentence_list:
        all_corpus.append(" ".join(sentence))
    # print(all_corpus[0])
    vectoerizer = CountVectorizer(min_df=3,
                                  max_df=0.9,
                                  token_pattern='\\b\\w+\\b')
    vectoerizer.fit(all_corpus)
    X = vectoerizer.transform(all_corpus)
    tfidf_transformer = TfidfTransformer()
    tfidf = tfidf_transformer.fit_transform(X.toarray())
    word = vectoerizer.get_feature_names()
    weight = tfidf.toarray()
    for i in range(len(weight)):
        # print("text:",i)
        df = pd.DataFrame(columns=['Word', 'Tfidf'])
        for j in range(len(word)):
            if weight[i][j] <= 0:
                continue
            temp = [word[j], weight[i][j]]
            temp = pd.Series(temp, index=df.columns)
            df = df.append(temp, ignore_index=True)
            # print(word[j],weight[i][j])
        TFIDF_df_li.append(df)

    # by NEWS TR
    TR_df_li = []
    for sentence in all_corpus:
        text_rank_words = keywords.keywords(sentence, split=True)
        all_length = len(text_rank_words)
        df = pd.DataFrame(columns=['Word', 'TR_normScore'])
        for i, words in enumerate(text_rank_words):
            word_li = words.split()
            for word in word_li:
                score = (all_length - i) / all_length
                temp = [word, score]
                temp = pd.Series(temp, index=df.columns)
                df = df.append(temp, ignore_index=True)
        all_words = df.Word.tolist()
        for i, w in enumerate(all_words):
            if len(w) < 2:
                df.iloc[i, 1] = df.iloc[i, 1] - (df.TR_normScore.mean() +
                                                 3 * df.TR_normScore.std())
        TR_df_li.append(df)

    # combine all
    COM2_df_li = []
    for tf_df, ner_df, tfidf_df, tr_df in zip(TF_news_li, NER_news_li,
                                              TFIDF_df_li, TR_df_li):
        com_df = tf_df.merge(ner_df, how='outer', on='Word')
        com_df = com_df.fillna(0)

        def transform(s):
            if s != 0:
                return (tfidf_df.Tfidf.median() + tr_df.TR_normScore.median()
                        )  # /2
            else:
                return 0.0

        com_df.NER = com_df.NER.map(transform)
        com_df['score'] = com_df.TF_norm_score + com_df.NER
        com2_df = com_df.merge(tfidf_df, on='Word', how='outer')
        com2_df = com2_df.merge(tr_df, on='Word', how='outer')
        com2_df = com2_df.fillna(0)
        com2_df['score'] = com2_df.score + com2_df.TR_normScore + com2_df.Tfidf
        COM2_df_li.append(com2_df)

    # write result
    df = pd.DataFrame(columns=[
        'Date', 'ori_title', 'ori_news', 'tok_title_news', 'keyWord_algorithm'
    ])  # df's columns
    month = []
    word_month = []
    score_month = []
    for day, title_str, news_str, news_tok_li, com_df in zip(
            all_date_li, all_title_li, all_news_li, word_sentence_list,
            COM2_df_li):
        key_words = com_df[com_df.score > com_df.score.mean() +
                           1.65 * com_df.score.std()]  # 2*
        key_words = key_words.Word.tolist()
        key_words_month = com_df[com_df.score > com_df.score.mean() +
                                 2 * com_df.score.std()]
        words_score_month = key_words_month.score.tolist()
        key_words_month = key_words_month.Word.tolist()

        temp = [
            str(day), title_str, news_str, " ".join(news_tok_li),
            "、".join(key_words)
        ]
        temp = pd.Series(temp, index=df.columns)
        for word, score in zip(key_words_month, words_score_month):
            month.append(str(day).split('/')[1])
            word_month.append(word)
            score_month.append(score)
        df = df.append(temp, ignore_index=True)

    current_month = month[0]
    need_dict = {}
    dict_order = []
    month_order = []
    for i, (m, w, s) in enumerate(zip(month, word_month, score_month)):
        if m != current_month:
            month_order.append(current_month)
            current_month = m
            for k, v in need_dict.items():
                if len(v) < 3:  # DF<3 do not take
                    need_dict[k] = 0
                else:
                    need_dict[k] = np.mean(v)
            dict_order.append(need_dict)
            need_dict = {}
            need_dict[w] = list([float(s)])
            if i == len(month) - 1:
                dict_order.append(need_dict)

        else:
            if w not in list(need_dict.keys()):
                need_dict[w] = list([float(s)])
            else:
                temp = need_dict[w]
                temp.append(float(s))
                need_dict[w] = temp
            if i == len(month) - 1:
                month_order.append(m)
                for k, v in need_dict.items():
                    if len(v) < 3:  # DF<3 do not take
                        need_dict[k] = 0
                    else:
                        need_dict[k] = np.mean(v)
                dict_order.append(need_dict)
    df_month_key = pd.DataFrame(
        columns=['Month', 'key_word',
                 'score'])  # overall month key word with score
    for mo, dict_mo in zip(month_order, dict_order):
        for k, v in dict_mo.items():
            if v < 0.1:  # DF<3 do not take
                continue
            temp = [int(mo), str(k), v]
            temp = pd.Series(temp, index=df_month_key.columns)
            df_month_key = df_month_key.append(temp, ignore_index=True)

    algo_end = time.time() - algo_start
    print(
        "KeyWord Algorithm Time:",
        '{:02f}:{:02f}:{:02f}'.format(algo_end // 3600,
                                      (algo_end % 3600 // 60), algo_end % 60))
    return df, df_month_key
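
A hedged sketch of how tokenize() might be invoked, assuming the module-level imports it relies on (pandas, numpy, scikit-learn, the keywords extractor, etc.) are in place; the CSV name and columns below are illustrative:

import pandas as pd

news_df = pd.read_csv("news_2020.csv")  # expected columns: Title, Content, Date (e.g. "2020/01/02"), sorted by date
result_df, month_key_df = tokenize(news_df)
result_df.to_csv("news_tokenized.csv", index=False)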