lv1posSTR = lv1posSTR + re.sub(removePat1, "", i).replace("</", "(").replace(">", ") ") print(lv1posSTR) #注意,lv11 會把 Verb 和 ASPECT (時態標記) 獨立切開,類比成英文就是 create-ed 視為 create / ed lv2DICT = lv2DICT(inputSTR) lv2posSTR = "" for i in lv2DICT["result_pos"]: if len(i)==1: pass else: lv2posSTR = lv2posSTR + re.sub(removePat1, "", i).replace("</", "(").replace(">", ") ") print(lv2posSTR) #注意,lv2 會把 Verb 和 ASPECT (時態標記) 結合起來,類比成英文就是 create+ed 成為 created # 如果只是想取出動詞的「原型」的話,可以利用 .getVerbStemLIST() 的函式操作。 verbStemLIST = articut.getVerbStemLIST(lv1DICT) print(verbStemLIST) verbStemLIST = articut.getVerbStemLIST(lv2DICT) print(verbStemLIST) # 以上句法結合律說明結束 # 以下利用後處理,將「了」獨立切開 resultLIST = [] lv1posLIST = lv1posSTR.split(" ") lv2posLIST = lv2posSTR.split(" ") for i in range(0, len(lv2posLIST)): if "了(VerbP)" in lv2posLIST[i]: #這裡是以「了」為例子。另一個中文的 -ed ASPECT 詞是「過」,列在下一項 resultLIST.extend(lv2posLIST[i].replace("了(VerbP)", "(ACTION_verb) 了(VerbP)").split(" "))
riseSignal = set(riseSignal) #把 downSignal 和 riseSignal 中重覆的動詞清除。它可能是「中性」或是無關漲跌的動詞。 downSignal = downSignal - riseSignal.intersection(downSignal) riseSignal = riseSignal - riseSignal.intersection(downSignal) return (downSignal, riseSignal) if __name__ == "__main__": downSignal, riseSignal = signalMaker() if None in (downSignal, riseSignal): print("Cannot proceed!") else: testSTR = "產業供應鏈分散效應看好東協布局" #測試用句。注意到這一句並沒有在前述學習的 downSample/riseSample 中。 testResult = atc.parse(testSTR, level="lv2") testVerbLIST = atc.getVerbStemLIST(testResult) resultLIST = [] for tv in testVerbLIST: if len(tv) == 0: pass else: for v in tv: if v[-1] in downSignal: if "negation" in testResult["result_pos"][0][ v[0] - 22: v[0]]: #確認是否有「否定詞」出現在 downSignal 中。如果有的話,那就是上漲囉! resultLIST.append("這句新聞標題…應該是看漲↗") else: resultLIST.append("這句新聞標題…應該是看跌↘") elif v[-1] in riseSignal: if "negation" in testResult["result_pos"][0][
from ArticutAPI import Articut

# Demo: count how many occurrences of "研究" in the sentence are verbs.
text = "研究這個研究的研究已經被研究許多年了"
articut = Articut(username="", apikey="")
result = articut.parse(text)

# The POS-tagged segmentation result is stored under "result_pos".
print(result["result_pos"])

print("有幾個動詞的「研究」呢?")
verbLIST = articut.getVerbStemLIST(result, indexWithPOS=False)
verbCounter = 0
# verbLIST holds one inner list per sentence, so walk each sentence first
# and then inspect the verb entries inside it.
for sentence in verbLIST:
    for v in sentence:
        # The last field of an entry is compared against the target word;
        # v[0] and v[1] are reported as its position in the original text.
        if v[-1] != "研究":
            continue
        verbCounter += 1
        print("發現動詞「研究」 ,位於原句的 {}~{} 位置".format(v[0], v[1]))
print("共有 {} 個『研究』是動詞。".format(verbCounter))

print("有幾個名詞的「研究」呢?")
nounLIST = articut.getNounStemLIST(result, indexWithPOS=False)
def articut4PatentBot(categoryFILE, inputSTR):
    """Find the patent in one category that is most similar to the user's text.

    The user's text and every patent text of the category are parsed with
    Articut, then compared by cosine similarity on three feature sets:
    verb stems, noun stems and TF-IDF tags.

    Args:
        categoryFILE: Key of the patent category inside ``Dataset/patent.json``.
        inputSTR: Patent-claim text supplied by the user.

    Returns:
        dict with four keys:
        ``"Verb"``, ``"Noun"``, ``"TFIDF"`` -- each ``{certificate: best score}``
        for that feature set;
        ``"All_Max"`` -- ``{certificate: [score, label]}`` for the best score
        overall, where label is "名詞", "動詞" or "TF-IDF" (ties are resolved
        in that order).

    Raises:
        KeyError: if ``categoryFILE`` is not a key of the dataset, or the
            account file lacks ``username`` / ``apikey``.
        ValueError: if the selected category contains no patents.
    """
    import os

    with open("account.info", encoding="utf-8") as f:
        userinfoDICT = json.load(f)
    articut = Articut(username=userinfoDICT["username"], apikey=userinfoDICT["apikey"], level="lv1")

    # Load the patent texts of the requested category. os.path.join replaces
    # the former "Dataset\patent.json" literal, which relied on an invalid
    # escape sequence and only resolved correctly on Windows.
    with open(os.path.join("Dataset", "patent.json"), encoding="utf-8") as f:
        AllpatentDICT = json.load(f)
    patentDICT = AllpatentDICT[categoryFILE]
    if not patentDICT:
        # Previously surfaced as an opaque "max() arg is an empty sequence".
        raise ValueError("No patents found for category {!r}".format(categoryFILE))
    CertificateNumber = list(patentDICT.keys())

    # Normalise and parse the user's text once.
    userSTR = inputSTR.replace(" ", "").replace("\n", "")
    userResultDICT = articut.parse(userSTR)

    # The user-side features do not depend on the patent being compared, so
    # they are computed once here instead of on every loop iteration.
    # NOTE(review): assumes wordExtractor / counterCosineSimilarity do not
    # mutate their arguments -- confirm against their definitions.
    userVerbCOUNT = Counter(wordExtractor(articut.getVerbStemLIST(userResultDICT), unify=True))
    userNounCOUNT = Counter(wordExtractor(articut.getNounStemLIST(userResultDICT), unify=True))
    userTFIDFCOUNT = Counter(articut.analyse.extract_tags(userResultDICT))

    # Compare the user's text against every patent in the category.
    VerbCosineSimilarity = []
    NounCosineSimilarity = []
    TFIDFCosineSimilarity = []
    for patentSTR in patentDICT.values():
        patentResultDICT = articut.parse(patentSTR.replace(" ", "").replace("\n", ""))

        # Verb stems as features.
        patentCOUNT = Counter(wordExtractor(articut.getVerbStemLIST(patentResultDICT), unify=True))
        VerbCosineSimilarity.append(counterCosineSimilarity(patentCOUNT, userVerbCOUNT))

        # Noun stems as features.
        patentCOUNT = Counter(wordExtractor(articut.getNounStemLIST(patentResultDICT), unify=True))
        NounCosineSimilarity.append(counterCosineSimilarity(patentCOUNT, userNounCOUNT))

        # TF-IDF tags as features.
        patentCOUNT = Counter(articut.analyse.extract_tags(patentResultDICT))
        TFIDFCosineSimilarity.append(counterCosineSimilarity(patentCOUNT, userTFIDFCOUNT))

    # Best-scoring patent per feature set (first occurrence wins on ties).
    ArticutresultDICT = {}
    bestDICT = {}
    for featureSTR, similarityLIST in (
        ("Verb", VerbCosineSimilarity),
        ("Noun", NounCosineSimilarity),
        ("TFIDF", TFIDFCosineSimilarity),
    ):
        maxSIM = max(similarityLIST)
        certificate = CertificateNumber[similarityLIST.index(maxSIM)]
        ArticutresultDICT[featureSTR] = {certificate: maxSIM}
        bestDICT[featureSTR] = (certificate, maxSIM)

    # Overall best score; the check order Noun -> Verb -> TF-IDF decides
    # which feature set is reported when several share the same maximum.
    ArticutresultDICT["All_Max"] = {}
    m = max(bestDICT["Verb"][1], bestDICT["Noun"][1], bestDICT["TFIDF"][1])
    for featureSTR, labelSTR in (("Noun", "名詞"), ("Verb", "動詞"), ("TFIDF", "TF-IDF")):
        certificate, maxSIM = bestDICT[featureSTR]
        if m == maxSIM:
            ArticutresultDICT["All_Max"][certificate] = [m, labelSTR]
            break
    return ArticutresultDICT
if __name__ == "__main__":
    import json

    # Credentials are read from a local "account.info" JSON file (keys
    # "username" and "apikey") rather than being hard-coded in the source.
    # NOTE(review): the API key that used to be committed on this line should
    # be treated as exposed and rotated.
    try:
        with open("account.info", encoding="utf-8") as f:
            userinfoDICT = json.load(f)
    except FileNotFoundError:
        # Fall back to empty credentials.
        # NOTE(review): presumably the service accepts these with a reduced
        # quota -- confirm this is enough for the event analysis below.
        userinfoDICT = {"username": "", "apikey": ""}
    articut = Articut(username=userinfoDICT["username"], apikey=userinfoDICT["apikey"])

    # Input texts: index 0 is the hamster article, index 1 the emperor
    # penguin article.
    fileTUPLE = ("../example/text.txt", "./A. forsteri.txt")
    MouseSTR = jsonTextReader(fileTUPLE[0])
    PenguinSTR = jsonTextReader(fileTUPLE[1])

    # Extract the event list from each text.
    MouseDICT_lv3 = EventAnalysis(MouseSTR, articut)
    MouseLIST = MouseDICT_lv3["event"]

    # lv2 verb stems of the hamster text; not used in the final output, kept
    # for inspection while debugging.
    MouseDICT_lv2 = articut.parse(MouseSTR, level="lv2")
    MouseLIST_lv2 = articut.getVerbStemLIST(MouseDICT_lv2)

    PenguinDICT_lv3 = EventAnalysis(PenguinSTR, articut)
    PenguinLIST = PenguinDICT_lv3["event"]

    # Keep only real events: drop newline markers and empty lists.
    ResultDICT = {
        "倉鼠": [item for item in MouseLIST if item != '\n' and item != []],
        "皇帝企鵝": [item for item in PenguinLIST if item != '\n' and item != []],
    }
    print(ResultDICT)