def location(inputSTR):
    global username
    global apikey
    articut = Articut(username=username, apikey=apikey)
    resultDICT = articut.parse(inputSTR, level="lv2")
    locLIST = articut.getLocationStemLIST(resultDICT)
    return locLIST
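
# A minimal usage sketch for location(), assuming the module-level username/apikey
# globals have been set and Articut has been imported from ArticutAPI. The sample
# sentence is hypothetical.
from ArticutAPI import Articut

username = ""  # Empty strings fall back to the free public quota.
apikey = ""

if __name__ == "__main__":
    # Each inner list is assumed to hold (start, end, location) tuples for one
    # sentence, following the other stem getters used in this repo.
    print(location("我昨天從台北搭高鐵到高雄。"))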
try:
    from .ArticutAPI import Articut
except: #Installed via pip
    try:
        from ArticutAPI import Articut
    except: #Installed via git clone
        import sys
        sys.path.append("../..")
        from ArticutAPI import Articut
from pprint import pprint

if __name__ == "__main__":
    username = "" #Fill in the email you registered at https://api.droidtown.co. An empty string falls back to the shared quota of 2,000 characters per hour.
    apikey = ""   #Fill in the API key obtained after logging in at https://api.droidtown.co. An empty string falls back to the shared quota of 2,000 characters per hour.
    articut = Articut(username, apikey)

    demoSTR = """美食商圈內高人氣四川牛肉麵專賣店選用大量的台灣洋蔥、
義大利番茄熬煮的自然酸甜牛肉湯底。加入大塊又有咬勁的牛肉塊,
特選訂製的Q彈拉麵條。日式拉麵完全吸附湯汁的精華,加入德國酸菜、牛油、辣油後更加美味
法式焗烤龍蝦和蕃茄牛肉炒飯還有法式起司火鍋以及五味章魚很夠味,章魚也不硬,搭配的小黃瓜很甜很好吃。魚蛋沙拉偶爾會點""".replace("\n", "")

    resultDICT = articut.parse(demoSTR)

    #Extract [food name] entries only.
    foodLIST = articut.NER.getFood(resultDICT, indexWithPOS=False)
    pprint(foodLIST)

    #Extract [regional style] + [food name] entries.
    locFoodLIST = articut.NER.getFood(resultDICT, withLocation=True, indexWithPOS=False)
    pprint(locFoodLIST)
#text = "I didn't do it." #print("用 .split():", text.split()) #print("用 .word_tokenization():", word_tokenize(text)) from ArticutAPI import Articut import re text = "整天工作不玩耍,型男也會變傻瓜。" articut = Articut(username="", apikey="") result = articut.parse(text) # 利用 ArticutAPI 做中文「句子」的 tokenization sentenceTokenLIST = [] for s in result["result_pos"]: #在 result_pos 裡,標點符號不會被加上 POS 標記,因此字串長度為 1。 if len(s) != 1: sentenceTokenLIST.append(re.sub("</?[^>]*?>", "", s)) print("句子 token 的結果為:{}".format(sentenceTokenLIST)) # 利用 ArticutAPI 做中文「詞彙」的 tokenization sentenceSpliter = [] for s in result["result_pos"]: #在 result_pos 裡,標點符號不會被加上 POS 標記,因此字串長度為 1。 if len(s) == 1: sentenceSpliter.append(s)
Articut_resultDICT["translated_pos"])) ]) return Articut_resultDICT if __name__ == "__main__": try: #使用自己的斷詞額度。 with open("../../account.info", "r") as f: userDICT = json.loads(f.read()) username = userDICT["email"] apikey = userDICT["apikey"] atc = Articut(username=userDICT["email"], apikey=userDICT["apikey"]) except: #使用免費的斷詞額度。 #實體化 Articut() atc = Articut() inputSTR = '''You know,我們company是一個比較global的corporation,所以進來的candidate都要有一定的English的communication的ability,才qualified可以這樣mix的speak的。如果你老是speak這個Chinese的話,就給人local的image,you know,這個first impression就low掉了,可以understand嗎''' resultDICT = atc.parse(inputSTR, userDefinedDictFILE="./myEnglishDICT.json") translateResult = translateWithDictionary( resultDICT, dictionaryFILE="./myEnglishDICT.json") print("轉譯前:\n{}\n".format(inputSTR)) print("-字典中沒有 company 這個字!-\n") print("轉譯後:\n{}\n".format(translateResult["translated_seg"]))
        Content = f.read()
    return Content

def jsonFileWriter(jsonDICT, jsonFileName):
    with open(jsonFileName, mode="w") as f:
        json.dump(jsonDICT, f, ensure_ascii=False)
    return None

if __name__ == "__main__":
    articut = Articut(username="******", apikey="1fawHVpX6VJJN=W5gImYKzS+q623Lup")
    MouseSTR = txtReader("../example/text.txt")
    MouseDICT = articut.parse(MouseSTR, level="lv3")
    MouseLIST = MouseDICT["event"]
    print(MouseLIST)

    PenguinSTR = txtReader("penguin.txt")
    PenguinDICT = articut.parse(PenguinSTR, level="lv3")
    PenguinLIST = PenguinDICT["event"]
    print(PenguinLIST)

    ResultDICT = {"倉鼠": [], "皇帝企鵝": []}
    for item in MouseLIST:
        if item != "\n" and item != []:
            ResultDICT["倉鼠"].append(item)
    for item in PenguinLIST:
        if item != "\n" and item != []:
            ResultDICT["皇帝企鵝"].append(item)
with open("../../account.info", "r") as f: userDICT = json.loads(f.read()) username = userDICT["email"] apikey = userDICT["apikey"] atc = Articut(username=userDICT["email"], apikey=userDICT["apikey"]) except: #使用免費的斷詞額度。 #實體化 Articut() atc = Articut() #載入 Demo 用的文字 with open("./PengHu.txt", encoding="utf-8") as f: contentLIST = [l.replace("\n", "") for l in f.readlines()] resultLIST = [] for c in contentLIST: print("Processing:{}/{} >> {}".format( contentLIST.index(c) + 1, len(contentLIST), c)) resultDICT = atc.parse(c, openDataPlaceAccessBOOL=True) locationLIST = atc.getLocationStemLIST(resultDICT) if locationLIST != None: resultLIST.extend(locationLIST) else: pass print("DetectionResult:\n") pprint(resultLIST) with open("./LocationDetectionResultLIST.json", "w", encoding="utf-8") as f: json.dump(resultLIST, f, ensure_ascii=False)
            riseSignal.append(i[-1])
    riseSignal = set(riseSignal)

    #Remove verbs that appear in both downSignal and riseSignal; they are likely neutral or unrelated to price movement.
    #Compute the intersection once before subtracting; otherwise the second subtraction sees an already-modified set and removes nothing.
    commonSignal = downSignal.intersection(riseSignal)
    downSignal = downSignal - commonSignal
    riseSignal = riseSignal - commonSignal
    return (downSignal, riseSignal)

if __name__ == "__main__":
    downSignal, riseSignal = signalMaker()
    if None in (downSignal, riseSignal):
        print("Cannot proceed!")
    else:
        testSTR = "產業供應鏈分散效應看好東協布局" #Test sentence. Note that it does not appear in the downSample/riseSample data learned from above.
        testResult = atc.parse(testSTR, level="lv2")
        testVerbLIST = atc.getVerbStemLIST(testResult)
        resultLIST = []
        for tv in testVerbLIST:
            if len(tv) == 0:
                continue
            for v in tv:
                if v[-1] in downSignal:
                    #Check whether a negation marker appears just before the downSignal verb; if so, the headline is actually bullish!
                    #The 22-character window is a heuristic wide enough to cover the preceding POS tag.
                    if "negation" in testResult["result_pos"][0][v[0] - 22:v[0]]:
                        resultLIST.append("This news headline… looks bullish ↗")
                    else:
                        resultLIST.append("This news headline… looks bearish ↘")
                elif v[-1] in riseSignal:
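
# The top of signalMaker() is not shown in this excerpt. A minimal sketch of how the
# two signal sets could be built, assuming downSample/riseSample are lists of
# bearish/bullish headlines (hypothetical names taken from the comment above) and
# the same (start, end, verb) tuples that getVerbStemLIST() returns:
def signalMakerSketch(downSample, riseSample, atc):
    """Collect candidate verbs from labeled bearish/bullish headlines."""
    downSignal, riseSignal = [], []
    for sampleLIST, signalLIST in ((downSample, downSignal), (riseSample, riseSignal)):
        for headline in sampleLIST:
            resultDICT = atc.parse(headline, level="lv2")
            for verbLIST in atc.getVerbStemLIST(resultDICT):
                for i in verbLIST:
                    signalLIST.append(i[-1])  # i is (start, end, verb); keep the verb string.
    return set(downSignal), set(riseSignal)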
def articut4PatentBot(categoryFILE, inputSTR):
    with open("account.info", encoding="utf-8") as f:
        userinfoDICT = json.loads(f.read())
    articut = Articut(username=userinfoDICT["username"],
                      apikey=userinfoDICT["apikey"],
                      level="lv1")

    # Load the patent texts for the given category.
    #patentDICT = patent[categoryFILE]
    #patent_file = categoryFILE + '.json'
    with open("Dataset/patent.json", encoding="utf-8") as f:  # Forward slash keeps the path portable.
        AllpatentDICT = json.loads(f.read())
    patentDICT = AllpatentDICT[categoryFILE]
    CertificateNumber = list(patentDICT.keys())

    # Take the patent claims entered by the user.
    userSTR = inputSTR.replace(" ", "").replace("\n", "")
    userResultDICT = articut.parse(userSTR)

    # The user-side features do not change per patent, so compute them once outside the loop.
    userVerbLIST = articut.getVerbStemLIST(userResultDICT)
    userVerbCOUNT = Counter(wordExtractor(userVerbLIST, unify=True))
    userNounLIST = articut.getNounStemLIST(userResultDICT)
    userNounCOUNT = Counter(wordExtractor(userNounLIST, unify=True))
    userTFIDFCOUNT = Counter(articut.analyse.extract_tags(userResultDICT))

    # Compare the user input against every patent claim in the category.
    VerbCosineSimilarity = []
    NounCosineSimilarity = []
    TFIDFCosineSimilarity = []
    for k in patentDICT.values():
        STR = k.replace(" ", "").replace("\n", "")
        STRResultDICT = articut.parse(STR)

        # Use the verbs as the feature list; count each verb's occurrences with Counter().
        patentVerbLIST = articut.getVerbStemLIST(STRResultDICT)
        patentCOUNT = Counter(wordExtractor(patentVerbLIST, unify=True))
        # Cosine similarity of [patent text vs. user input].
        VerbCosineSimilarity.append(counterCosineSimilarity(patentCOUNT, userVerbCOUNT))

        # Use the nouns as the feature list; count each noun's occurrences with Counter().
        patentNounLIST = articut.getNounStemLIST(STRResultDICT)
        patentCOUNT = Counter(wordExtractor(patentNounLIST, unify=True))
        NounCosineSimilarity.append(counterCosineSimilarity(patentCOUNT, userNounCOUNT))

        # Use the TF-IDF keywords as the feature list.
        patentTFIDFLIST = articut.analyse.extract_tags(STRResultDICT)
        patentCOUNT = Counter(patentTFIDFLIST)
        TFIDFCosineSimilarity.append(counterCosineSimilarity(patentCOUNT, userTFIDFCOUNT))

    ArticutresultDICT = {}
    max_Verb = max(VerbCosineSimilarity)
    v = VerbCosineSimilarity.index(max_Verb)
    # print("Verb cosine similarity [patent text vs. user input]: {}".format(VerbCosineSimilarity))
    # print("The max, {:.2f}, comes from the claims of certificate no. {}".format(max_Verb, CertificateNumber[v]))
    ArticutresultDICT["Verb"] = {CertificateNumber[v]: max_Verb}

    max_Noun = max(NounCosineSimilarity)
    n = NounCosineSimilarity.index(max_Noun)
    # print("Noun cosine similarity [patent text vs. user input]: {}".format(NounCosineSimilarity))
    # print("The max, {:.2f}, comes from the claims of certificate no. {}".format(max_Noun, CertificateNumber[n]))
    ArticutresultDICT["Noun"] = {CertificateNumber[n]: max_Noun}

    max_TFIDF = max(TFIDFCosineSimilarity)
    t = TFIDFCosineSimilarity.index(max_TFIDF)
    # print("TF-IDF keyword cosine similarity [patent text vs. user input]: {}".format(TFIDFCosineSimilarity))
    # print("The max, {:.2f}, comes from the claims of certificate no. {}".format(max_TFIDF, CertificateNumber[t]))
    ArticutresultDICT["TFIDF"] = {CertificateNumber[t]: max_TFIDF}

    ArticutresultDICT["All_Max"] = {}
    m = max(max_Verb, max_Noun, max_TFIDF)
    if m == max_Noun:
        ArticutresultDICT["All_Max"][CertificateNumber[n]] = [m, "名詞"]
    elif m == max_Verb:
        ArticutresultDICT["All_Max"][CertificateNumber[v]] = [m, "動詞"]
    elif m == max_TFIDF:
        ArticutresultDICT["All_Max"][CertificateNumber[t]] = [m, "TF-IDF"]
    return ArticutresultDICT
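
# articut4PatentBot() calls two helpers that are not defined in this excerpt:
# wordExtractor() and counterCosineSimilarity(). A minimal sketch of both,
# assuming the (start, end, word) tuple layout returned by the stem getters and
# the usual Counter-based cosine formula; the unify flag is assumed to mean
# case-folding so English terms compare consistently.
import math
from collections import Counter

def wordExtractor(stemLIST, unify=False):
    """Flatten the per-sentence [(start, end, word), ...] lists into a word list."""
    wordLIST = [w[-1] for sent in stemLIST for w in sent]
    if unify:
        wordLIST = [w.lower() for w in wordLIST]
    return wordLIST

def counterCosineSimilarity(c1, c2):
    """Cosine similarity between two Counter() term-frequency vectors."""
    terms = set(c1).union(c2)
    dotProduct = sum(c1.get(t, 0) * c2.get(t, 0) for t in terms)
    magnitude1 = math.sqrt(sum(c1.get(t, 0) ** 2 for t in terms))
    magnitude2 = math.sqrt(sum(c2.get(t, 0) ** 2 for t in terms))
    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0
    return dotProduct / (magnitude1 * magnitude2)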
    #Fall back to the free parsing quota.
    #Instantiate Articut().
    atclv2 = Articut()
    atclv3 = Articut(level="lv3")

#Load the demo text.
contentLIST = []
with open("./PengHu.txt", encoding="utf-8") as f:
    contentLIST = [l.replace("\n", "") for l in f.readlines()]

#Call parse() with openDataPlaceAccessBOOL=True to extract as much place/location information as possible.
resultLIST = []
for i, c in enumerate(contentLIST, start=1):  # enumerate() avoids the duplicate-line pitfall of list.index().
    print("Processing:{}/{} >> {}".format(i, len(contentLIST), c))
    resultDICT = atclv2.parse(c, openDataPlaceAccessBOOL=True)
    eventDICT = {"time": [], "site": [], "event": []}

    #Pass the result to getTimeLIST() to extract time expressions.
    timeLIST = atclv2.getTimeLIST(resultDICT)
    if timeLIST is not None:
        for tm in timeLIST:
            eventDICT["time"].append([t[-1] for t in tm])

    #Pass the result to getLocationStemLIST() to extract place names.
    siteLIST = []
    locationLIST = atclv2.getLocationStemLIST(resultDICT)
    if locationLIST is not None:
try:
    #Use your own parsing quota.
    with open("../../account.info", "r") as f:
        userDICT = json.loads(f.read())
    username = userDICT["email"]
    apikey = userDICT["apikey"]
    atc = Articut(username=userDICT["email"], apikey=userDICT["apikey"])
except:
    #Fall back to the free parsing quota.
    #Instantiate Articut().
    atc = Articut()

# Load the demo text; the context manager makes sure the file handle is closed.
with open("./InputString.txt", "r") as f:
    text = f.read()
sentLIST = text.split("\n")

print("ArticutAPI Term Extraction Demo")
for sentence in sentLIST:
    if "" == sentence.strip():
        continue
    result = atc.parse(sentence)
    if result["status"]:
        print("{}\nInput: {}".format('#' * 20, sentence))
        # Extract and rank the sentence's keywords with TextRank.
        wordLIST = atc.analyse.textrank(result)
        print("TextRank:", wordLIST)
        # Extract the sentence's keywords with TF-IDF.
        wordLIST = atc.analyse.extract_tags(result)
        print("TF-IDF:", wordLIST)
    Strip the POS tags and join all the tokens into a single string.
    '''
    pat = re.compile("</?[a-zA-Z]+(_[a-zA-Z]+)?>")
    resultSTR = re.sub(pat, "", posSTR)
    return resultSTR

if __name__ == "__main__":
    with open("../../account.info", encoding="utf-8") as f:
        accountINFO = json.loads(f.read())
    myDICT = "./myDICT.json"
    articut = Articut(username=accountINFO["email"], apikey=accountINFO["apikey"])
    articutResult = articut.parse(caseLIST[0], userDefinedDictFILE=myDICT)

    verbSTR = "佯裝"
    targetSentenceLIST = []
    for a in articutResult["result_pos"]:
        if verbSTR in a:
            targetSentenceLIST.append(graphExtractor_verb(a, "佯裝"))

    subjectLIST = []
    for t in targetSentenceLIST:
        for sentence in t:
            subjectLIST.append(
                posTagPurger(graphSubjectExtractor(sentence, verbSTR)[0]))

    objectLIST = []
    for t in targetSentenceLIST:
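
# A quick illustration of what posTagPurger() does, using a hypothetical
# result_pos fragment in ArticutAPI's <POS>word</POS> tag format:
taggedSTR = "<ENTITY_pronoun>他</ENTITY_pronoun><ACTION_verb>佯裝</ACTION_verb><ENTITY_noun>警察</ENTITY_noun>"
print(posTagPurger(taggedSTR))  # -> 他佯裝警察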
try:
    #Use your own parsing quota.
    with open("../../account.info", "r") as f:
        userDICT = json.loads(f.read())
    username = userDICT["email"]
    apikey = userDICT["apikey"]
    atc = Articut(username=userDICT["email"], apikey=userDICT["apikey"])
except:
    #Fall back to the free parsing quota.
    #Instantiate Articut().
    atc = Articut()

#Demo text: before loading the government agency dictionary.
inputSTR = "國軍退除役官兵輔導委員會簡稱退輔會。 "
resultDICT = atc.parse(inputSTR)
print("1. Government agency name, plain segmentation:")
pprint(resultDICT["result_pos"])
print("=====================")

inputSTR = "國軍退除役官兵輔導委員會簡稱退輔會。 "
resultDICT = atc.parse(
    inputSTR,
    userDefinedDictFILE="../../Public_UserDefinedDict/KNOWLEDGE_govTW.json"
)
print("2. Government agency name, with the user-defined dictionary:")
pprint(resultDICT["result_pos"])
print("=====================")

inputSTR = "國軍退除役官兵輔導委員會簡稱退輔會。 "
resultDICT = atc.parse(
    return moneyLIST

if __name__ == "__main__":
    articut = Articut(username="******", apikey="iuS6_RrzjVqjzMOGUu%c$@neEVPzPX4")
    FilePathTUPLE = ("../example/tourblog.json",
                     "../example/刑事判決_106,交簡,359_2017-02-21.json",
                     "../example/news.json")

    tourblogSTR = jsonTextReader(FilePathTUPLE[0], "content")
    tourblogSTR = tourblogSTR.replace(" ", "")
    #print("String read: {}\n".format(tourblogSTR))
    resultDICT = articut.parse(tourblogSTR, level="lv2", openDataPlaceAccessBOOL=True)
    locLIST = location(resultDICT)
    #print("List read: {}\n".format(locLIST))
    scenLIST = scenery(resultDICT)
    #print("List read: {}\n".format(scenLIST))
    jsonDICT = {"location": locLIST, "place": scenLIST}
    jsonFileName = "tourblog_geoinfo.json"
    jsonFileWriter(jsonDICT, jsonFileName)
    print("Dict read: {}\n".format(jsonDICT))

    lawSTR = jsonTextReader(FilePathTUPLE[1], "mainText")
    lawSTR = lawSTR.replace(" ", "")
    lawSTR = lawSTR.replace("\r\n", "")
    #print("String read: {}\n".format(lawSTR))
    resultDICT_2 = articut.parse(lawSTR, level="lv2")
    return resultDICT

if __name__ == "__main__":
    articut = Articut(username="******", apikey="4#-!FX^Vr5kBaznKL2U7egRwkU=Hx*k")
    fileTUPLE = ("../example/text.txt", "./A. forsteri.txt")
    MouseSTR = jsonTextReader(fileTUPLE[0])
    #print(MouseSTR)
    PenguinSTR = jsonTextReader(fileTUPLE[1])
    #print(PenguinSTR)

    MouseDICT_lv3 = EventAnalysis(MouseSTR, articut)
    MouseLIST = MouseDICT_lv3["event"]
    MouseDICT_lv2 = articut.parse(MouseSTR, level="lv2")
    MouseLIST_lv2 = articut.getVerbStemLIST(MouseDICT_lv2)
    #for item in MouseLIST_lv2:
    #    if item != '\n' and item != []:
    #        print(item)

    PenguinDICT_lv3 = EventAnalysis(PenguinSTR, articut)
    PenguinLIST = PenguinDICT_lv3["event"]

    ResultDICT = {"倉鼠": [], "皇帝企鵝": []}
    for item in MouseLIST:
        if item != '\n' and item != []:
            ResultDICT["倉鼠"].append(item)
    for item in PenguinLIST:
        if item != '\n' and item != []:
            ResultDICT["皇帝企鵝"].append(item)
#Get the full address.
addTWLIST = articut.getAddTWLIST(resultDICT)
#pprint(addTWLIST)
addSTR = [a for a in addTWLIST if a != []][0][0][2]
#pprint(addSTR)

#Feed the address to SPACE (Doc: https://api.droidtown.co/document/#Space).
url = "https://api.droidtown.co/Space/API/"
payload = {
    "username": "",
    "api_key": "",
    "type": "geocoding",
    "site": addSTR
}
response = post(url, json=payload)
jsonResults = response.json()
#pprint(jsonResults)

mapDICT = {
    "lat": jsonResults["results"][0]["lat"],
    "lng": jsonResults["results"][0]["lng"]
}
mapSTR = "https://www.openstreetmap.org/?mlat={lat}&mlon={lng}#map=16/{lat}/{lng}".format(**mapDICT)
#print(mapSTR)

#Get the address segments (localRE).
resultDICT = articut.parse("桃園市○○區○○路000巷0號")
city = articut.localRE.getAddressCity(resultDICT)
print(city)
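
# localRE exposes more segment getters than just the city. A short sketch pulling
# a few more pieces from the same parse result; the method names below follow the
# getAddressCity() pattern but are assumptions to verify against the ArticutAPI docs.
road = articut.localRE.getAddressRoad(resultDICT)      # Assumed sibling getter.
alley = articut.localRE.getAddressAlley(resultDICT)    # Assumed sibling getter.
number = articut.localRE.getAddressNumber(resultDICT)  # Assumed sibling getter.
print(road, alley, number)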
def main(inputSTR):
    articut = Articut(username=username, apikey=apikey)
    resultDICT = articut.parse(inputSTR)
    return resultDICT
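
# A minimal usage sketch for main(), assuming module-level username/apikey strings
# (empty strings fall back to the public quota) and that Articut is imported from
# ArticutAPI; the sample sentence is arbitrary.
from ArticutAPI import Articut

username = ""
apikey = ""

if __name__ == "__main__":
    resultDICT = main("整天工作不玩耍,型男也會變傻瓜。")
    if resultDICT["status"]:
        # result_segmentation holds the slash-delimited segmentation string.
        print(resultDICT["result_segmentation"])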