Пример #1
0
            lv1posSTR = lv1posSTR + re.sub(removePat1, "", i).replace("</", "(").replace(">", ") ")
    print(lv1posSTR) #注意,lv11 會把 Verb 和 ASPECT (時態標記) 獨立切開,類比成英文就是 create-ed 視為 create / ed


    lv2DICT = lv2DICT(inputSTR)
    lv2posSTR = ""
    for i in lv2DICT["result_pos"]:
        if len(i)==1:
            pass
        else:
            lv2posSTR = lv2posSTR + re.sub(removePat1, "", i).replace("</", "(").replace(">", ") ")
    print(lv2posSTR) #注意,lv2 會把 Verb 和 ASPECT (時態標記) 結合起來,類比成英文就是 create+ed 成為 created


        # 如果只是想取出動詞的「原型」的話,可以利用 .getVerbStemLIST() 的函式操作。
    verbStemLIST = articut.getVerbStemLIST(lv1DICT)
    print(verbStemLIST)

    verbStemLIST = articut.getVerbStemLIST(lv2DICT)
    print(verbStemLIST)

    # 以上句法結合律說明結束

    # 以下利用後處理,將「了」獨立切開
    resultLIST = []
    lv1posLIST = lv1posSTR.split(" ")
    lv2posLIST = lv2posSTR.split(" ")

    for i in range(0, len(lv2posLIST)):
        if "了(VerbP)" in lv2posLIST[i]: #這裡是以「了」為例子。另一個中文的 -ed ASPECT 詞是「過」,列在下一項
            resultLIST.extend(lv2posLIST[i].replace("了(VerbP)", "(ACTION_verb) 了(VerbP)").split(" "))
Пример #2
0
        riseSignal = set(riseSignal)

        #把 downSignal 和 riseSignal 中重覆的動詞清除。它可能是「中性」或是無關漲跌的動詞。
        downSignal = downSignal - riseSignal.intersection(downSignal)
        riseSignal = riseSignal - riseSignal.intersection(downSignal)
    return (downSignal, riseSignal)


if __name__ == "__main__":
    downSignal, riseSignal = signalMaker()
    if None in (downSignal, riseSignal):
        print("Cannot proceed!")
    else:
        testSTR = "產業供應鏈分散效應看好東協布局"  #測試用句。注意到這一句並沒有在前述學習的 downSample/riseSample 中。
        testResult = atc.parse(testSTR, level="lv2")
        testVerbLIST = atc.getVerbStemLIST(testResult)
        resultLIST = []
        for tv in testVerbLIST:
            if len(tv) == 0:
                pass
            else:
                for v in tv:
                    if v[-1] in downSignal:
                        if "negation" in testResult["result_pos"][0][
                                v[0] - 22:
                                v[0]]:  #確認是否有「否定詞」出現在 downSignal 中。如果有的話,那就是上漲囉!
                            resultLIST.append("這句新聞標題…應該是看漲↗")
                        else:
                            resultLIST.append("這句新聞標題…應該是看跌↘")
                    elif v[-1] in riseSignal:
                        if "negation" in testResult["result_pos"][0][
Пример #3
0
from ArticutAPI import Articut

text = "研究這個研究的研究已經被研究許多年了"

# Parse the sample sentence with an Articut client created from empty credentials.
articut = Articut(username="", apikey="")
result = articut.parse(text)

# Articut stores its POS-tagged segmentation under the "result_pos" key.
print(result["result_pos"])

print("有幾個動詞的「研究」呢?")
verbLIST = articut.getVerbStemLIST(result, indexWithPOS=False)

# verbLIST holds one inner list per sentence, so walk each sentence and
# count the entries whose last element is "研究".
verbCounter = 0
for sentence in verbLIST:
    for v in sentence:
        if v[-1] != "研究":
            continue
        print("發現動詞「研究」 ,位於原句的 {}~{} 位置".format(v[0], v[1]))
        verbCounter += 1

print("共有 {} 個『研究』是動詞。".format(verbCounter))

print("有幾個名詞的「研究」呢?")
nounLIST = articut.getNounStemLIST(result, indexWithPOS=False)
Пример #4
0
def articut4PatentBot(categoryFILE, inputSTR):
    """Find the patents in one category most similar to the user's text.

    The user's text and every patent text of the chosen category are parsed
    with Articut, then compared by cosine similarity over three feature
    sets: verb stems, noun stems and TF-IDF tags.

    Args:
        categoryFILE: Key selecting one category from the top-level object
            stored in ``Dataset/patent.json``. The selected value is a
            mapping of certificate number -> patent text.
        inputSTR: The user's text. Spaces and newlines are removed before
            it is parsed.

    Returns:
        A dict with four keys. ``"Verb"``, ``"Noun"`` and ``"TFIDF"`` each
        map the best-matching certificate number to its similarity for that
        feature set. ``"All_Max"`` maps the overall best certificate number
        to ``[similarity, label]``; on ties the noun result wins, then the
        verb result, then TF-IDF.

    Raises:
        KeyError: If ``categoryFILE`` is not present in the dataset, or
            ``account.info`` lacks ``username`` / ``apikey``.
        ValueError: If the selected category contains no patents.
    """
    # Local import: only needed to build the dataset path portably.
    import os

    with open("account.info", encoding="utf-8") as f:
        userinfoDICT = json.load(f)

    articut = Articut(username=userinfoDICT["username"], apikey=userinfoDICT["apikey"], level="lv1")

    # Load the patent texts of the requested category. os.path.join avoids
    # the invalid "\p" escape and the Windows-only separator of the old
    # "Dataset\patent.json" literal.
    with open(os.path.join("Dataset", "patent.json"), encoding="utf-8") as f:
        AllpatentDICT = json.load(f)
    patentDICT = AllpatentDICT[categoryFILE]

    # Fail early (before any API call) with a clear message instead of the
    # opaque "max() arg is an empty sequence" raised further down.
    if not patentDICT:
        raise ValueError("No patents found for category {!r}".format(categoryFILE))

    CertificateNumber = list(patentDICT.keys())

    # Parse the user's text once.
    userSTR = inputSTR.replace(" ", "").replace("\n", "")
    userResultDICT = articut.parse(userSTR)

    # The user-side features do not depend on the patent being compared, so
    # they are computed once here instead of on every loop iteration.
    # NOTE(review): assumes counterCosineSimilarity does not mutate its
    # arguments - confirm against its definition.
    userVerbCOUNT = Counter(wordExtractor(articut.getVerbStemLIST(userResultDICT), unify=True))
    userNounCOUNT = Counter(wordExtractor(articut.getNounStemLIST(userResultDICT), unify=True))
    userTFIDFCOUNT = Counter(articut.analyse.extract_tags(userResultDICT))

    # Compare the user's text against every patent text in the category.
    VerbCosineSimilarity = []
    NounCosineSimilarity = []
    TFIDFCosineSimilarity = []
    for patentSTR in patentDICT.values():
        STR = patentSTR.replace(" ", "").replace("\n", "")
        STRResultDICT = articut.parse(STR)

        # Verb stems as features.
        patentCOUNT = Counter(wordExtractor(articut.getVerbStemLIST(STRResultDICT), unify=True))
        VerbCosineSimilarity.append(counterCosineSimilarity(patentCOUNT, userVerbCOUNT))

        # Noun stems as features.
        patentCOUNT = Counter(wordExtractor(articut.getNounStemLIST(STRResultDICT), unify=True))
        NounCosineSimilarity.append(counterCosineSimilarity(patentCOUNT, userNounCOUNT))

        # TF-IDF tags as features.
        patentCOUNT = Counter(articut.analyse.extract_tags(STRResultDICT))
        TFIDFCosineSimilarity.append(counterCosineSimilarity(patentCOUNT, userTFIDFCOUNT))

    # The similarity lists are in the same order as CertificateNumber, so
    # the index of each maximum identifies the matching certificate.
    max_Verb = max(VerbCosineSimilarity)
    v = VerbCosineSimilarity.index(max_Verb)

    max_Noun = max(NounCosineSimilarity)
    n = NounCosineSimilarity.index(max_Noun)

    max_TFIDF = max(TFIDFCosineSimilarity)
    t = TFIDFCosineSimilarity.index(max_TFIDF)

    ArticutresultDICT = {
        "Verb": {CertificateNumber[v]: max_Verb},
        "Noun": {CertificateNumber[n]: max_Noun},
        "TFIDF": {CertificateNumber[t]: max_TFIDF},
        "All_Max": {},
    }

    # Overall winner; the branch order defines the tie-break priority.
    m = max(max_Verb, max_Noun, max_TFIDF)
    if m == max_Noun:
        ArticutresultDICT["All_Max"][CertificateNumber[n]] = [m, "名詞"]
    elif m == max_Verb:
        ArticutresultDICT["All_Max"][CertificateNumber[v]] = [m, "動詞"]
    elif m == max_TFIDF:
        ArticutresultDICT["All_Max"][CertificateNumber[t]] = [m, "TF-IDF"]

    return ArticutresultDICT

if __name__ == "__main__":
    # NOTE(review): the API key is hard-coded in source; consider loading it
    # from a config file or the environment and rotating this key.
    articut = Articut(username="******",
                      apikey="4#-!FX^Vr5kBaznKL2U7egRwkU=Hx*k")

    # Read both input texts, in the order they appear in fileTUPLE.
    fileTUPLE = ("../example/text.txt", "./A. forsteri.txt")
    MouseSTR, PenguinSTR = (jsonTextReader(path) for path in fileTUPLE)

    # Event analysis of the first text; its "event" entry is reported below.
    MouseDICT_lv3 = EventAnalysis(MouseSTR, articut)
    MouseLIST = MouseDICT_lv3["event"]

    # lv2 parse and verb stems of the first text (computed but not reported).
    MouseDICT_lv2 = articut.parse(MouseSTR, level="lv2")
    MouseLIST_lv2 = articut.getVerbStemLIST(MouseDICT_lv2)

    # Event analysis of the second text.
    PenguinDICT_lv3 = EventAnalysis(PenguinSTR, articut)
    PenguinLIST = PenguinDICT_lv3["event"]

    # Keep only real events: drop newline markers and empty lists.
    ResultDICT = {
        "倉鼠": [item for item in MouseLIST
               if item != '\n' and item != []],
        "皇帝企鵝": [item for item in PenguinLIST
                 if item != '\n' and item != []],
    }
    print(ResultDICT)