Example #1
0
def _texas_tab_view(text, lang, generator, tokens, end_pos, lemma, pos, words=None):
    """Build a TexAS document for one annotation service and return its tabular view.

    Args:
        text: the raw input text the document is built over.
        lang: ISO language code used to label the model.
        generator: service name ("stanza", "spacy", "udpipe") recorded in the
            TOKENS view metadata.
        tokens, end_pos, lemma, pos: parallel annotation lists produced by the
            corresponding get_services_* helper.
        words: optional compound-word token view (stanza only); when given, a
            "WORDS" view is added and shown.

    Returns:
        A tx.UITabularView with LEMMA and POS (and optionally WORDS) views shown.
    """
    mydoc = tx.Document(text)
    mydoc.meta().set("authors", "hegler,yiwen,celine,yuqian")
    mydoc.date().setTimestamp("2021-01-19T14:44")

    mydoc.setTokenList(tokens, indexed=True)
    mydoc.views().get("TOKENS").meta().set("generator", generator)
    mydoc.views().get("TOKENS").meta().set("model", generator + "-" + lang)
    mydoc.setSentenceList(end_pos)

    if words is not None:
        mydoc.addTokenView("WORDS", words)
    mydoc.addTokenView("LEMMA", lemma)
    mydoc.addTokenView("POS", pos)

    view = tx.UITabularView(mydoc)
    if words is not None:
        view.showView("WORDS")
    view.showView("LEMMA", labelCSS=False)
    view.showView("POS")
    return view


def _header_row(label, tokens, end_pos):
    """Summary tuple for the header table: (label, #sentences, #tokens, tokens/sent)."""
    return (label, str(len(end_pos)), str(len(tokens)),
            str(get_tokens_per_sents(end_pos)))


def load2TexAS(data):
    """
    Converting the output of your annotation service to TexAS.
    In this case, our service is tokenization and sentence separation.

    Args:
        data: dict with keys 'text' (input string), 'lang' (language code),
            and 'packages' (iterable of service names to run).

    Returns:
        An HTML string: optional warning message, header table, and one
        tabular view per requested service.
    """
    # Collect the data
    string = data['text']
    lang = data['lang']
    packages = data['packages']

    final_HTML = ""
    message_HTML = "<div class=\'message\'>"
    isMessage = False
    header_input = []
    log_row = [datetime.now().strftime('%Y-%m-%d %H:%M:%S'), lang]

    if "stanza" in packages:
        model = model_lang_map["stanza"][lang]
        docs = model(string)
        tokens, end_pos, lemma, pos, nlpWordsList, hasCompoundWords = get_services_stanza(
            docs)
        view = _texas_tab_view(
            string, lang, "stanza", tokens, end_pos, lemma, pos,
            words=nlpWordsList if hasCompoundWords else None)
        header_input.append(_header_row("Stanza", tokens, end_pos))
        final_HTML += "<div class='subtitle'>Stanza</div> <br>" + view.HTML(
        ).replace("\n", "") + "<br>"
        log_row.append("stanza")
    else:
        log_row.append("")

    if "spacy" in packages:
        # SpaCy does not support Arabic and Russian
        if lang == 'ara' or lang == 'rus':
            message_HTML += "SpaCy does not support Arabic or Russian. <br>"
            isMessage = True
        else:
            model = model_lang_map["spacy"][lang]
            docs = model(string)
            tokens, end_pos, lemma, pos = get_services_spacy(docs)
            view = _texas_tab_view(string, lang, "spacy", tokens, end_pos,
                                   lemma, pos)
            header_input.append(_header_row("SpaCy", tokens, end_pos))
            final_HTML += "<div class='subtitle'>" + "SpaCy" + "</div><br>" + view.HTML(
            ).replace("\n", "") + "<br>"
        # Logged even when the language is unsupported (matches original behavior).
        log_row.append("spacy")
    else:
        log_row.append("")

    if "udpipe" in packages:
        model = model_lang_map["udpipe"][lang]
        docs = model(string)
        tokens, end_pos, lemma, pos = get_services_udpipe(docs)
        # UDPipe re-tokenizes; rebuild the document text from its own tokens.
        string = " ".join(tokens)
        view = _texas_tab_view(string, lang, "udpipe", tokens, end_pos, lemma,
                               pos)
        header_input.append(_header_row("UDpipe", tokens, end_pos))
        final_HTML += "<div class='subtitle'>UDpipe</div> <br>" + view.HTML(
        ).replace("\n", "") + "<br>"
        log_row.append("udpipe")
    else:
        log_row.append("")

    message_HTML += "</div>"
    if isMessage:
        # NOTE(review): the log row is NOT written on this warning path —
        # confirm whether skipping writeLog here is intentional.
        return message_HTML + get_header_table(
            header_input) + "<br><br>" + final_HTML

    writeLog(log_row)
    return get_header_table(header_input) + "<br><br>" + final_HTML
# Per-token annotation accumulators for the TexAS views built below.
nlpPOSList = []
nlpDEPList = []
nlpLemmaList = []
# Cumulative token index at which each sentence ends.
nlpSentenceEndPositions = []

# NOTE(review): `doc`, `sentIndex`, `index`, `nlpTokenList`, `TXText`, `TXLang`
# and `TXSpacyModel` are presumably defined earlier in this example script —
# not visible in this chunk; confirm before reuse.
for sentence in doc.sentences:
    # Record the running token count as this sentence's end position.
    sentIndex += len(sentence.words)
    nlpSentenceEndPositions.append(sentIndex)
    for word in sentence.words:
        print(word.text, word.lemma, word.pos)
        index += 1
        nlpTokenList.append(word.text)
        nlpPOSList.append(word.pos)
        nlpLemmaList.append(word.lemma)

# Build a TexAS document over the raw text and attach the collected
# token, sentence, POS and lemma annotations.
mydoc1 = tx.Document(TXText, TXLang)
# mydoc1.meta().set("generator","stanza")
# mydoc1.meta().set("model",TXSpacyModel)

mydoc1.setTokenList(nlpTokenList, indexed=True)
mydoc1.views().get("TOKENS").meta().set("generator", "stanza")
mydoc1.views().get("TOKENS").meta().set("model", TXSpacyModel)
mydoc1.setSentenceList(nlpSentenceEndPositions)

mydoc1.addTokenView("POS", nlpPOSList)
# no "DEP" annotations resulting from stanza
# mydoc1.addTokenView( "POS-DEP", nlpDEPList )
mydoc1.addTokenView("LEMMA", nlpLemmaList)

# create another document reversing from the previous document JSON
mydoc2 = tx.reverse(mydoc1.TAS())
Example #3
0
import json
import texas as tx

# Build a small demo document by hand: raw text plus explicit tokens,
# POS tags and sentence end positions.
mydoc1 = tx.Document("Hello world!!! How are you today?", "en")
mydoc1.meta().set("authors", "hegler,yiwen,celine,yuqian")
mydoc1.date().setTimestamp("2021-01-19T14:44")  # ??
mydoc1.setTokenList("Hello world ! ! ! How are you today ?".split())
mydoc1.addTokenView(
    "POS", "? NOUN PUNCT PUNCT PUNCT ? VERB PRON ? PUNCT".split())
mydoc1.setSentenceList([5, 10])

# Round-trip: serialize to TAS JSON and rebuild a second document from it.
mydoc2 = tx.reverse(mydoc1.TAS())

print("==========")
print("mydoc2")
print("----------")
# Dump each piece of the reconstructed document under a labelled header.
for title, payload in (
    ("Token List", mydoc2.getTokenList()),
    ("Token Info", json.dumps(mydoc2.getTokenInfo())),
    ("Sentence Info", json.dumps(mydoc2.getSentenceInfo())),
    ("Document TAS", json.dumps(mydoc2.TAS())),
):
    print("--- " + title)
    print(payload)

# create a corpus with a single document
mycorpus1 = tx.Corpus()