from datetime import datetime

import texas as tx

# model_lang_map, get_services_stanza/spacy/udpipe, get_tokens_per_sents,
# get_header_table, and writeLog are assumed to be defined elsewhere in
# this module.


def load2TexAS(data):
    """
    Convert the output of our annotation services to TexAS.
    In this case, the services are tokenization and sentence segmentation.
    """
    # Collect the data
    string = data['text']
    lang = data['lang']
    packages = data['packages']

    final_HTML = ""
    message_HTML = "<div class='message'>"
    isMessage = False
    header_input = []
    log_row = [datetime.now().strftime('%Y-%m-%d %H:%M:%S'), lang]

    if "stanza" in packages:
        # Initialize the TexAS document
        mydoc = tx.Document(string)
        mydoc.meta().set("authors", "hegler,yiwen,celine,yuqian")
        mydoc.date().setTimestamp("2021-01-19T14:44")

        model = model_lang_map["stanza"][lang]
        docs = model(string)
        tokens, end_pos, lemma, pos, nlpWordsList, hasCompoundWords = \
            get_services_stanza(docs)

        mydoc.setTokenList(tokens, indexed=True)
        mydoc.views().get("TOKENS").meta().set("generator", "stanza")
        mydoc.views().get("TOKENS").meta().set("model", "stanza-" + lang)
        mydoc.setSentenceList(end_pos)
        if hasCompoundWords:
            mydoc.addTokenView("WORDS", nlpWordsList)
        mydoc.addTokenView("LEMMA", lemma)
        mydoc.addTokenView("POS", pos)

        # Extract the HTML view
        myTabView = tx.UITabularView(mydoc)
        if hasCompoundWords:
            myTabView.showView("WORDS")
        myTabView.showView("LEMMA", labelCSS=False)
        myTabView.showView("POS")

        # Concatenate myTabView.HTML()
        header_input.append(("Stanza", str(len(end_pos)), str(len(tokens)),
                             str(get_tokens_per_sents(end_pos))))
        final_HTML += ("<div class='subtitle'>Stanza</div> <br>"
                       + myTabView.HTML().replace("\n", "") + "<br>")
        log_row.append("stanza")
    else:
        log_row.append("")

    if "spacy" in packages:
        # spaCy does not support Arabic or Russian
        if lang == 'ara' or lang == 'rus':
            message_HTML += "SpaCy does not support Arabic or Russian. <br>"
            isMessage = True
            log_row.append("")  # keep the log columns aligned
        else:
            mydoc = tx.Document(string)
            mydoc.meta().set("authors", "hegler,yiwen,celine,yuqian")
            mydoc.date().setTimestamp("2021-01-19T14:44")

            model = model_lang_map["spacy"][lang]
            docs = model(string)
            tokens, end_pos, lemma, pos = get_services_spacy(docs)

            mydoc.setTokenList(tokens, indexed=True)
            mydoc.views().get("TOKENS").meta().set("generator", "spacy")
            mydoc.views().get("TOKENS").meta().set("model", "spacy-" + lang)
            mydoc.setSentenceList(end_pos)
            mydoc.addTokenView("LEMMA", lemma)
            mydoc.addTokenView("POS", pos)

            # Extract the HTML view
            myTabView = tx.UITabularView(mydoc)
            myTabView.showView("LEMMA", labelCSS=False)
            myTabView.showView("POS")

            # Concatenate myTabView.HTML()
            header_input.append(("SpaCy", str(len(end_pos)), str(len(tokens)),
                                 str(get_tokens_per_sents(end_pos))))
            final_HTML += ("<div class='subtitle'>SpaCy</div><br>"
                           + myTabView.HTML().replace("\n", "") + "<br>")
            log_row.append("spacy")
    else:
        log_row.append("")

    if "udpipe" in packages:
        model = model_lang_map["udpipe"][lang]
        docs = model(string)
        tokens, end_pos, lemma, pos = get_services_udpipe(docs)
        string = " ".join(tokens)

        # Initialize the TexAS document
        mydoc = tx.Document(string)
        mydoc.meta().set("authors", "hegler,yiwen,celine,yuqian")
        mydoc.date().setTimestamp("2021-01-19T14:44")

        mydoc.setTokenList(tokens, indexed=True)
        mydoc.views().get("TOKENS").meta().set("generator", "udpipe")
        mydoc.views().get("TOKENS").meta().set("model", "udpipe-" + lang)
        mydoc.setSentenceList(end_pos)
        mydoc.addTokenView("LEMMA", lemma)
        mydoc.addTokenView("POS", pos)

        # Extract the HTML view
        myTabView = tx.UITabularView(mydoc)
        myTabView.showView("LEMMA", labelCSS=False)
        myTabView.showView("POS")

        # Concatenate myTabView.HTML()
        header_input.append(("UDpipe", str(len(end_pos)), str(len(tokens)),
                             str(get_tokens_per_sents(end_pos))))
        final_HTML += ("<div class='subtitle'>UDpipe</div> <br>"
                       + myTabView.HTML().replace("\n", "") + "<br>")
        log_row.append("udpipe")
    else:
        log_row.append("")

    message_HTML += "</div>"
    if isMessage:
        return (message_HTML + get_header_table(header_input)
                + "<br><br>" + final_HTML)

    writeLog(log_row)
    return get_header_table(header_input) + "<br><br>" + final_HTML
# Collect tokens, POS tags, and lemmas from a stanza document, recording
# the running token index at which each sentence ends.
nlpTokenList = []            # assumed initializations: the original
nlpPOSList = []              # fragment begins mid-script, after these
nlpDEPList = []              # lists and counters were created
nlpLemmaList = []
nlpSentenceEndPositions = []
index = 0
sentIndex = 0

for sentence in doc.sentences:
    sentIndex += len(sentence.words)
    nlpSentenceEndPositions.append(sentIndex)
    for word in sentence.words:
        print(word.text, word.lemma, word.pos)  # debug output
        index += 1
        nlpTokenList.append(word.text)
        nlpPOSList.append(word.pos)
        nlpLemmaList.append(word.lemma)

mydoc1 = tx.Document(TXText, TXLang)
# mydoc1.meta().set("generator", "stanza")
# mydoc1.meta().set("model", TXSpacyModel)
mydoc1.setTokenList(nlpTokenList, indexed=True)
mydoc1.views().get("TOKENS").meta().set("generator", "stanza")
mydoc1.views().get("TOKENS").meta().set("model", TXSpacyModel)
mydoc1.setSentenceList(nlpSentenceEndPositions)
mydoc1.addTokenView("POS", nlpPOSList)
# no "DEP" annotations resulting from stanza
# mydoc1.addTokenView("POS-DEP", nlpDEPList)
mydoc1.addTokenView("LEMMA", nlpLemmaList)

# create another document reversing from the previous document JSON
mydoc2 = tx.reverse(mydoc1.TAS())
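# The snippet above assumes a stanza document `doc` plus TXText, TXLang, and
# TXSpacyModel defined earlier in the script. A minimal sketch of producing
# them (text, language, and processor choices here are assumptions):
#
#   import stanza
#   TXText = "Hello world. How are you today?"
#   TXLang = "en"
#   TXSpacyModel = "stanza-en"
#   stanza.download("en")
#   doc = stanza.Pipeline(lang="en", processors="tokenize,pos,lemma")(TXText)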
import json

import texas as tx

# create a document
mydoc1 = tx.Document("Hello world!!! How are you today?", "en")
mydoc1.meta().set("authors", "hegler,yiwen,celine,yuqian")
mydoc1.date().setTimestamp("2021-01-19T14:44")  # ??
mydoc1.setTokenList(
    ["Hello", "world", "!", "!", "!", "How", "are", "you", "today", "?"])
mydoc1.addTokenView("POS", [
    "?", "NOUN", "PUNCT", "PUNCT", "PUNCT", "?", "VERB", "PRON", "?", "PUNCT"
])
mydoc1.setSentenceList([5, 10])  # token offsets at which each sentence ends

# create another document reversing from the previous document JSON
mydoc2 = tx.reverse(mydoc1.TAS())
print("==========")
print("mydoc2")
print("----------")
print("--- Token List")
print(mydoc2.getTokenList())
print("--- Token Info")
print(json.dumps(mydoc2.getTokenInfo()))
print("--- Sentence Info")
print(json.dumps(mydoc2.getSentenceInfo()))
print("--- Document TAS")
print(json.dumps(mydoc2.TAS()))

# create a corpus with a single document
mycorpus1 = tx.Corpus()
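# Since mydoc2 was rebuilt from mydoc1's TAS JSON via tx.reverse above, the
# round trip should preserve the token list; a quick sanity check using only
# calls already shown (assuming getTokenList behaves the same on mydoc1):
assert mydoc1.getTokenList() == mydoc2.getTokenList()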