def test_NLPCube():
    """Dependency parsing with NLP-Cube on the two-sentence sample text."""
    for oper in (TeproAlgo.getSentenceSplittingOperName(),
                 TeproAlgo.getTokenizationOperName(),
                 TeproAlgo.getPOSTaggingOperName(),
                 TeproAlgo.getLemmatizationOperName()):
        tepro.configure(oper, TeproAlgo.algoCube)

    dto = tepro.pcExec(text, [TeproAlgo.getDependencyParsingOperName()])

    # Processed two sentences...
    assert dto.getNumberOfSentences() == 2

    # Check some dependency structure:
    # (sentence index, token index, word form, head, dependency relation).
    expected = [
        (0, 7, 'imobilul', 13, 'nsubj'),
        (0, 13, 'celor', 13, 'iobj'),
        (1, 0, 'Amplasarea', 5, 'nsubj'),
        (1, 1, 'construcției', 1, 'nmod'),
    ]
    for sid, tid, form, head, deprel in expected:
        token = dto.getSentenceTokens(sid)[tid]
        assert token.getWordForm() == form
        assert token.getHead() == head
        assert token.getDepRel() == deprel
def configure(self, op: str, algo: str):
    """Request that operation 'op' be performed by algorithm (NLP app) 'algo'.

    Raises RuntimeError when 'op' or 'algo' is unknown, or when 'algo'
    cannot perform 'op' (see class TeproAlgo).
    """
    availableOps = TeproAlgo.getAvailableOperations()
    availableAlgos = TeproAlgo.getAvailableAlgorithms()

    if op not in availableOps:
        raise RuntimeError("Operation '" + op + CONSTSTR1)

    if algo not in availableAlgos:
        raise RuntimeError(CONSTSTR2 + algo + CONSTSTR1)

    if not TeproAlgo.canPerform(algo, op):
        raise RuntimeError(CONSTSTR2 + algo +
                           "' cannot perform operation '" + op +
                           "'. See class TeproAlgo.")

    # inspect.stack() walks the whole call stack and is expensive;
    # capture the current frame info once instead of rebuilding the
    # stack for each format() argument.
    here = inspect.stack()[0]
    print(
        "{0}.{1}[{2}]: requesting operation '{3}' be performed with '{4}'".
        format(
            Path(here.filename).stem,
            here.function,
            here.lineno,
            op, algo),
        file=sys.stderr, flush=True)

    self._conf[op] = algo
def test_TTL():
    """Chunking via TTL, checking tokenization, MSDs, lemmas and chunks."""
    for oper in (TeproAlgo.getSentenceSplittingOperName(),
                 TeproAlgo.getTokenizationOperName(),
                 TeproAlgo.getPOSTaggingOperName(),
                 TeproAlgo.getLemmatizationOperName()):
        tepro.configure(oper, TeproAlgo.algoTTL)

    dto = tepro.pcExec(text, [TeproAlgo.getChunkingOperName()])

    # Processed two sentences...
    assert dto.getNumberOfSentences() == 2

    sent0 = dto.getSentenceTokens(0)
    sent1 = dto.getSentenceTokens(1)

    # For the first sentence:
    assert sent0[0].getWordForm() == 'La'
    assert sent0[0].getMSD() == 'Spsa'
    assert sent0[0].getLemma() == 'la'
    assert sent0[1].getWordForm() == '7'

    tok5 = sent0[5]
    assert tok5.getWordForm() == 'Brașovului'
    assert tok5.getLemma() == 'Brașov'
    assert tok5.getMSD() == 'Npmsoy'
    assert tok5.getChunk() == 'Pp#2,Np#2'

    assert sent0[22].getWordForm() == '.'
    assert sent0[22].getCTAG() == 'PERIOD'

    # For the second sentence:
    assert sent1[0].getWordForm() == 'Amplasarea'
    assert sent1[22].getWordForm() == 'metri'
    assert sent1[22].getCTAG() == 'NPN'
def pcDiac(self, text: str) -> TeproDTO:
    """Insert diacritics in a text which does not have them.

    Returns the TeproDTO produced by pcExec (the original annotation
    said 'str', but pcExec returns a TeproDTO and callers invoke
    dto.getText() on the result).
    """
    # You can specify the whole call chain, if you know it.
    return self.pcExec(text, [
        TeproAlgo.getTextNormOperName(),
        TeproAlgo.getDiacRestorationOperName()])
def test_AutoReconfiguration():
    """pcFull on text6 should auto-switch the listed operations to TTL."""
    dto = tepro.pcFull(text6)

    for oper in (TeproAlgo.getSentenceSplittingOperName(),
                 TeproAlgo.getTokenizationOperName(),
                 TeproAlgo.getPOSTaggingOperName(),
                 TeproAlgo.getChunkingOperName()):
        assert tepro.getConfiguration(oper) == TeproAlgo.algoTTL

    assert dto.getSentenceTokens(0)[0].getMSD() == 'Np'
def test_TTS():
    """TTS-oriented ops: hyphenation, stress, phonetics, abbreviation/numeral rewriting."""
    for oper in (TeproAlgo.getSentenceSplittingOperName(),
                 TeproAlgo.getTokenizationOperName(),
                 TeproAlgo.getPOSTaggingOperName(),
                 TeproAlgo.getLemmatizationOperName()):
        tepro.configure(oper, TeproAlgo.algoTTL)

    dto = tepro.pcExec(text5, [
        TeproAlgo.getHyphenationOperName(),
        TeproAlgo.getStressIdentificationOperName(),
        TeproAlgo.getPhoneticTranscriptionOperName(),
        TeproAlgo.getAbbreviationRewritingOperName(),
        TeproAlgo.getNumeralRewritingOperName(),
    ])

    # A single sentence was processed.
    assert dto.getNumberOfSentences() == 1

    sent = dto.getSentenceTokens(0)
    assert sent[3].getExpansion() == 'o sută douăzeci și trei'
    assert sent[0].getSyllables() == "a-'ceas-ta"
    assert sent[0].getPhonetical() == "a ch e@ a s t a"
    assert sent[11].getSyllables() == "'vir-gu-lă"
    assert sent[11].getPhonetical() == "v i r g u l @"
    assert sent[14].getExpansion() == \
        'patruzeci și cinci virgulă șase sute treizeci și unu'
def test_MWEsAndDepTransfer():
    """Multi-word expressions ('o_să') survive the dependency transfer."""
    for oper in (TeproAlgo.getSentenceSplittingOperName(),
                 TeproAlgo.getTokenizationOperName(),
                 TeproAlgo.getPOSTaggingOperName(),
                 TeproAlgo.getLemmatizationOperName()):
        tepro.configure(oper, TeproAlgo.algoTTL)

    dto = tepro.pcFull(text2)
    token = dto.getSentenceTokens(0)[3]

    assert token.getWordForm() == 'o_să'
    assert token.getMSD() == 'Qf'
    assert token.getHead() == 7
    assert token.getDepRel() == 'mark'
def defaultConfiguration(self):
    """Map every available operation to its default algorithm (NLP app)."""
    # Dictionary of operations to implementing algorithms (NLP app).
    self._conf = {}

    # inspect.stack() is expensive and was previously rebuilt three
    # times per loop iteration just to format the log line; capture
    # the current frame info once, outside the loop.
    here = inspect.stack()[0]
    caller = Path(here.filename).stem

    for op in TeproAlgo.getAvailableOperations():
        self._conf[op] = TeproAlgo.getDefaultAlgoForOper(op)
        print("{0}.{1}[{2}]: configuring operation '{3}' with algorithm '{4}'".
              format(
                  caller,
                  here.function,
                  here.lineno,
                  op, self._conf[op]
              ), file=sys.stderr, flush=True)
def get(self, oper):
    """This method will return the available Teprolin algorithms
    (NLP apps) for the specified 'oper'."""
    # Guard clause: unknown operations get a BAD_REQUEST payload.
    if oper not in TeproAlgo.getAvailableOperations():
        return ({
            'teprolin-conf': self._teprolin.getConfiguration(),
            'teprolin-result':
                "Operation '" + oper +
                "' is not recognized. See class TeproAlgo."
        }, int(HTTPStatus.BAD_REQUEST))

    return ({oper: TeproAlgo.getAlgorithmsForOper(oper)},
            int(HTTPStatus.OK))
def test_NEROps():
    """Named-entity recognition on text3."""
    dto = tepro.pcExec(text3, [TeproAlgo.getNamedEntityRecognitionOperName()])

    # Check NER annotations: (token index, expected label).
    for tid, label in ((0, 'ORG'), (1, 'ORG'), (3, 'TIME')):
        assert dto.getSentenceTokens(0)[tid].getNER() == label
def _runApp(self, dto, opNotDone):
    """Attach biomedical NER labels to dto's tokens, if that operation is pending."""
    if TeproAlgo.getBiomedicalNamedEntityRecognitionOperName() not in opNotDone:
        return dto

    sequences = self._prepareSentences(dto)

    for sid, seq in enumerate(sequences):
        tagged = self._tagger.tag(seq)
        tokens = dto.getSentenceTokens(sid)

        # Lengths should always agree; guard just in case they don't.
        if len(tagged) != len(tokens):
            continue

        for tok, rtok in zip(tokens, tagged):
            # rtok[0] is the BioNER label, rtok[1] the MSD it was tagged with.
            if tok.getMSD() == rtok[1]:
                label = rtok[0]
                if label not in ('', '_', '-'):
                    tok.setBioNER(label)

    return dto
def pcLemma(self, text: str) -> TeproDTO:
    """This processing chain will do POS tagging and lemmatization on the
    input text, splitting the text in sentences and tokens beforehand."""
    # Only 'lemmatization' is requested here; pcExec will pull in the
    # sentence splitting, tokenization and POS tagging dependencies.
    wanted = [TeproAlgo.getLemmatizationOperName()]
    return self.pcExec(text, wanted)
def test_BioNEROps():
    """Biomedical NER on text4."""
    dto = tepro.pcExec(
        text4, [TeproAlgo.getBiomedicalNamedEntityRecognitionOperName()])

    # Check BioNER annotations: (token index, expected label).
    expected = ((0, 'B-DISO'), (1, 'I-DISO'), (4, 'B-DISO'),
                (11, 'B-CHEM'), (13, 'B-ANAT'))
    for tid, label in expected:
        assert dto.getSentenceTokens(0)[tid].getBioNER() == label
def _checkProgress(self, dto: TeproDTO) -> list:
    """Checks what operations have been performed already and what
    needs to be done further."""
    # An op is pending when it has not been performed yet, OR when this
    # very algorithm is the one configured for it.
    # NOTE(review): the second condition re-queues ops already performed
    # whenever this algorithm is the configured one — presumably to honor
    # explicit per-algorithm requests; confirm this is intentional.
    return [
        op
        for op in TeproAlgo.getOperationsForAlgo(self._algoName)
        if not dto.isOpPerformed(op)
        or self._algoName == dto.getConfiguredAlgoForOper(op)
    ]
def test_UDPipe():
    """Dependency parsing with UDPipe on text4."""
    for oper in (TeproAlgo.getSentenceSplittingOperName(),
                 TeproAlgo.getTokenizationOperName(),
                 TeproAlgo.getPOSTaggingOperName(),
                 TeproAlgo.getLemmatizationOperName()):
        tepro.configure(oper, TeproAlgo.algoUDPipe)

    dto = tepro.pcExec(text4, [TeproAlgo.getDependencyParsingOperName()])

    # A single sentence of 21 tokens was processed.
    assert dto.getNumberOfSentences() == 1
    sent = dto.getSentenceTokens(0)
    assert len(sent) == 21

    # Check some dependency structure...
    assert sent[5].getHead() == 5
    assert sent[5].getDepRel() == 'acl'

    tok10 = sent[10]
    assert tok10.getWordForm() == 'concentrației'
    assert tok10.getLemma() == 'concentrație'
    assert tok10.getCTAG() == 'NOUN'
    assert tok10.getMSD() == 'Ncfsoy'
    assert tok10.getHead() == 8
    assert tok10.getDepRel() == 'nmod'

    assert sent[17].getWordForm() == 'și'
def _runApp(self, dto, opNotDone):
    """Tokenize, tag and parse dto's text with the UDPipe model, then
    merge the results into dto, sentence by sentence."""
    text = dto.getText()
    tokenizer = self._model.newTokenizer(self._model.DEFAULT)
    tokenizer.setText(text)
    error = ProcessingError()
    sentence = Sentence()
    sid = 0
    while tokenizer.nextSentence(sentence, error):
        self._model.tag(sentence, self._model.DEFAULT)
        self._model.parse(sentence, self._model.DEFAULT)
        # Teprolin tokenized sentence
        ttsent = []
        # Teprolin string sentence
        tssent = sentence.getText()
        for w in sentence.words:
            # Word id 0 is UDPipe's artificial root node; skip it.
            if w.id == 0:
                continue
            tt = TeproTok()
            tt.setId(w.id)
            tt.setWordForm(w.form)
            tt.setCTAG(w.upostag)
            tt.setMSD(w.xpostag)
            tt.setLemma(w.lemma)
            tt.setHead(w.head)
            tt.setDepRel(w.deprel)
            ttsent.append(tt)
        # end for w
        if not dto.isOpPerformed(TeproAlgo.getSentenceSplittingOperName()):
            # First app to split sentences: record both the string and
            # the tokenized form of the sentence.
            dto.addSentenceString(tssent)
            dto.addSentenceTokens(ttsent)
        else:
            # Check and update annotations that only TTL
            # can produce or that are requested specifically from it.
            # NOTE(review): this comment says "TTL" but this app is
            # UDPipe — presumably copy-pasted from the TTL app; confirm.
            alignment = dto.alignSentences(ttsent, sid)
            for op in opNotDone:
                dto.copyTokenAnnotation(ttsent, sid, alignment, op)
        # A fresh Sentence object is needed for each nextSentence() call.
        sentence = Sentence()
        sid += 1
    # end all split sentences.
    return dto
def test_DiacRestore():
    """Diacritic restoration places Romanian diacritics at the right offsets."""
    dto = tepro.pcExec(text, [TeproAlgo.getDiacRestorationOperName()])
    restored = dto.getText()

    # (character offset, expected diacritic):
    # Brașovului, vânzare, adresează, își (twice), spațiu.
    checks = ((26, 'ș'), (57, 'â'), (75, 'ă'),
              (88, 'î'), (89, 'ș'), (105, 'ț'))
    for pos, ch in checks:
        assert restored[pos] == ch
def test_TextNorm():
    """Text normalization removes extra spaces/tabs but preserves newlines."""
    dto = tepro.pcExec(text, [TeproAlgo.getTextNormOperName()])
    normalized = dto.getText()

    # Spaces have been removed...
    assert normalized[5] == 'm'
    assert normalized[35] == 'i'
    # Tab has been removed...
    assert normalized[43] == ' '
    assert normalized[76] == ' '
    assert normalized[77] == 'c'
    # Newlines are preserved...
    assert normalized[127] == '\n'
    assert normalized[128] == '\n'
def _runApp(self, dto, opNotDone):
    """POST the prepared sentences to the external NER service and attach
    the returned entity labels to dto's tokens."""
    if not TeproAlgo.getNamedEntityRecognitionOperName() in opNotDone:
        return dto
    sentences = self._prepareSentences(dto)
    resp = requests.post(GENERALNERURL, data={"tokens": sentences})
    if resp.ok:
        # The service replies with one item per line; sentences are
        # bracketed by "<s>" / "</s>" marker lines.
        nsentences = resp.text.split("\n")
        i = -1
        # Accumulates (word form, NER label) pairs for the current sentence.
        csentence = []
        for ntok in nsentences:
            if not ntok:
                # Skip empty strings.
                continue
            if ntok.startswith("<s>"):
                i += 1
            elif ntok.startswith("</s>"):
                # Sentence finished: copy labels onto the matching tokens.
                tsentence = dto.getSentenceTokens(i)
                if len(csentence) == len(tsentence):
                    for j in range(len(tsentence)):
                        # Only set a label when the word forms line up and
                        # the label is not the 'outside' tag "O".
                        if tsentence[j].getWordForm() == csentence[j][0] and \
                           csentence[j][1] != "O":
                            tsentence[j].setNER(csentence[j][1])
                csentence = []
            else:
                # Token line: whitespace-separated columns; parts[0] is the
                # word form, parts[5] presumably the NER column — confirm
                # against the service's output format.
                parts = ntok.split()
                csentence.append((parts[0], parts[5]))
        # end for ntok
    else:
        print(
            "{0}.{1}[{2}]: connecting to {3} failed with code {4}".format(
                Path(inspect.stack()[0].filename).stem,
                inspect.stack()[0].function,
                inspect.stack()[0].lineno,
                GENERALNERURL,
                resp.status_code),
            file=sys.stderr, flush=True)
    return dto
def _runApp(self, dto, opNotDone):
    """Fill in syllables, phonetic transcription and expansions (TTS ops)
    for dto's tokens via the MLPLA server annotations."""
    if (TeproAlgo.getTokenizationOperName() in opNotDone):
        # Tokenization is required for MLPLAServer to work
        # (i.e. if tokenization has not been performed yet, this app
        # cannot run on dto).
        return dto
    for i in range(dto.getNumberOfSentences()):
        tsent = dto.getSentenceTokens(i)
        wforms = []
        for tok in tsent:
            wforms.append(tok.getWordForm())
        # msent holds per-token annotation tuples:
        # [0] word form, [1] syllables, [2] phonetical, [3] expansion;
        # '_' marks an absent annotation.
        msent = self._getSentenceAnnotation(" ".join(wforms))
        if len(msent) == len(tsent):
            for j in range(len(msent)):
                orig = tsent[j]
                mtok = msent[j]
                if orig.getWordForm() == mtok[0]:
                    # Only copy an annotation when the corresponding
                    # operation was actually requested (is pending).
                    if mtok[1] != '_' and \
                            (TeproAlgo.getHyphenationOperName() in opNotDone or \
                             TeproAlgo.getStressIdentificationOperName() in opNotDone):
                        orig.setSyllables(mtok[1])
                    if mtok[2] != '_' and TeproAlgo.getPhoneticTranscriptionOperName(
                    ) in opNotDone:
                        orig.setPhonetical(mtok[2])
                    if mtok[3] != '_' and \
                            (TeproAlgo.getNumeralRewritingOperName() in opNotDone or \
                             TeproAlgo.getAbbreviationRewritingOperName() in opNotDone):
                        orig.setExpansion(mtok[3])
                # end if word forms match
            # end all tokens
        # end if sentence lengths match
    # end all found sentences
    return dto
def pcFull(self, text: str) -> TeproDTO:
    """This is the complete processing chain (pc), executing all
    NLP ops enumerated in TeproAlgo."""
    # Just run everything we know about on text.
    all_ops = TeproAlgo.getAvailableOperations()
    return self.pcExec(text, all_ops)
def _runApp(self, dto, opNotDone):
    """Run NLP-Cube on dto's text and merge the resulting tokens
    (with lexicon-assisted lemmas) into dto."""
    text = dto.getText()
    sentences = self._cubeInst(text)
    sid = 0
    for sent in sentences:
        # Teprolin tokenized sentence
        ttsent = []
        # Teprolin string sentence
        tssent = ""
        for tok in sent:
            tt = TeproTok()
            tt.setId(tok.index)
            tt.setWordForm(tok.word)
            lowerWord = tok.word.lower()
            tt.setMSD(tok.xpos)
            # Assigning the mapped CTAG to the disambiguated MSD
            if tok.xpos in self._msd2ctag:
                tt.setCTAG(self._msd2ctag[tok.xpos])
            else:
                # No mapping known: fall back to the raw MSD as CTAG.
                tt.setCTAG(tok.xpos)
            lemmaIsSet = False
            # Doing lexicon lemmatization, if possible: prefer the
            # lexicon's lemma over NLP-Cube's when it is unambiguous.
            if tok.word in self._tblwordform:
                if tok.xpos in self._tblwordform[tok.word] and \
                        len(self._tblwordform[tok.word][tok.xpos]) == 1:
                    # TODO: if lemma is ambiguous, e.g. 'copii' can be 'copil' or 'copie'
                    tt.setLemma(self._tblwordform[tok.word][tok.xpos][0])
                    lemmaIsSet = True
            elif lowerWord in self._tblwordform and \
                    tok.xpos in self._tblwordform[lowerWord] and \
                    len(self._tblwordform[lowerWord][tok.xpos]) == 1:
                # Case-insensitive fallback lookup.
                # NOTE(review): reconstructed as an elif of the outer
                # 'tok.word in self._tblwordform' test — confirm against
                # the original indentation.
                tt.setLemma(self._tblwordform[lowerWord][tok.xpos][0])
                lemmaIsSet = True
            if not lemmaIsSet:
                # Fall back to NLP-Cube's own lemma.
                tt.setLemma(tok.lemma)
            tt.setHead(tok.head)
            tt.setDepRel(tok.label)
            # Rebuild the sentence string, honoring SpaceAfter=No.
            tssent += tok.word
            if tok.space_after != "SpaceAfter=No":
                tssent += " "
            ttsent.append(tt)
        # end ttsent/tssent formation
        if not dto.isOpPerformed(TeproAlgo.getSentenceSplittingOperName()):
            dto.addSentenceString(tssent)
            dto.addSentenceTokens(ttsent)
        else:
            # Check and update annotations that only NLPCube
            # can produce or that are requested specifically from it.
            alignment = dto.alignSentences(ttsent, sid)
            for op in opNotDone:
                dto.copyTokenAnnotation(ttsent, sid, alignment, op)
        sid += 1
    return dto
def pcParse(self, text: str) -> TeproDTO:
    """This processing chain will do chunking and dependency parsing on the
    input text, splitting the text in sentences and tokens and doing
    POS tagging and lemmatization beforehand."""
    requested = [TeproAlgo.getDependencyParsingOperName()]
    return self.pcExec(text, requested)
def isOpPerformed(self, op: str) -> bool:
    """Tell whether operation 'op' was already performed on this DTO;
    raise RuntimeError for unknown operations."""
    if op not in TeproAlgo.getAvailableOperations():
        raise RuntimeError("Operation '" + op +
                           "' is not a valid TeproAlgo operation!")
    return op in self._performedOps
def addPerformedOp(self, op: str):
    """Record that operation 'op' has been performed on this DTO;
    raise RuntimeError for unknown operations."""
    if op not in TeproAlgo.getAvailableOperations():
        raise RuntimeError("Operation '" + op +
                           "' is not a valid TeproAlgo operation!")
    self._performedOps.add(op)
def main():
    """Demo driver: exercises the Teprolin object across all supported
    NLP apps (TTL, UDPipe, NER, BioNER, TTS ops) and the canned
    processing chains."""
    # How to use the Teprolin Python 3 object:
    # 1. Create the object
    tepro = Teprolin()
    # 0.9 Test NER auto-configuration
    text = "Intel Celeron N4020"
    dto = tepro.pcFull(text)
    dto.dumpConllX()
    # 1.0 Check new TTSOps
    text = "Aceasta este propoziția 123 de test și nu-ți dă cu virgulă ca în 45.631."
    tepro.configure(TeproAlgo.getSentenceSplittingOperName(), TeproAlgo.algoTTL)
    tepro.configure(TeproAlgo.getTokenizationOperName(), TeproAlgo.algoTTL)
    tepro.configure(TeproAlgo.getPOSTaggingOperName(), TeproAlgo.algoTTL)
    tepro.configure(TeproAlgo.getLemmatizationOperName(), TeproAlgo.algoTTL)
    dto = tepro.pcExec(
        text,
        [TeproAlgo.getHyphenationOperName(),
         TeproAlgo.getPhoneticTranscriptionOperName(),
         TeproAlgo.getNumeralRewritingOperName()])
    dto.dumpConllX()
    tepro.getStats(Teprolin.statsTokens, Teprolin.statsDay, 2)
    # 1.1 Test the UDPipe flow
    tepro.configure(
        TeproAlgo.getSentenceSplittingOperName(), TeproAlgo.algoUDPipe)
    tepro.configure(TeproAlgo.getTokenizationOperName(), TeproAlgo.algoUDPipe)
    tepro.configure(TeproAlgo.getPOSTaggingOperName(), TeproAlgo.algoUDPipe)
    tepro.configure(TeproAlgo.getLemmatizationOperName(), TeproAlgo.algoUDPipe)
    text = "Diabetul zaharat este un sindrom caracterizat prin valori crescute ale concentrației glucozei \
in sange (hiperglicemie) si dezechilibrarea metabolismului. \
Daca l-ai luat, te-ai imbolnavit destul de grav."
    dto = tepro.pcExec(
        text,
        [TeproAlgo.getDependencyParsingOperName()])
    dto.dumpConllX()
    # 2. Optionally, configure the operation execution
    # Example configuration call
    tepro.configure(TeproAlgo.getSentenceSplittingOperName(), TeproAlgo.algoTTL)
    tepro.configure(TeproAlgo.getTokenizationOperName(), TeproAlgo.algoTTL)
    tepro.configure(TeproAlgo.getPOSTaggingOperName(), TeproAlgo.algoTTL)
    tepro.configure(TeproAlgo.getLemmatizationOperName(), TeproAlgo.algoTTL)
    # 2.1 Test biomedical NER
    text = "Diabetul zaharat este un sindrom caracterizat prin valori crescute ale concentrației glucozei în sânge (hiperglicemie) și dezechilibrarea metabolismului."
    dto = tepro.pcExec(
        text,
        [TeproAlgo.getBiomedicalNamedEntityRecognitionOperName()])
    dto.dumpConllX()
    # 2.2 Test NER
    text = "Instanta suprema reia astazi judecarea. In dosar, se judeca Liviu Dragnea cu Ministerul Justitiei, condus de Tudorel Toader."
    dto = tepro.pcExec(text, [TeproAlgo.getNamedEntityRecognitionOperName()])
    dto.dumpConllX()
    # 2.3 Test for some bugs
    text = "Am aflat aprope ca euro si dolarul sunt cele mai bune."
    dto = tepro.pcFull(text)
    dto.dumpConllX()
    text = "Stia ca demonstratia o sa fie un succes."
    dto = tepro.pcFull(text)
    dto.dumpConllX()
    # 2.4 Test for a crash
    text = "Președintele Klaus Iohannis a anunțat că nu promulgă legea bugetului pe 2019 și sesizează Curtea Constituțională. " + \
        "„Este bugetul rușinii naționale”, a spus șeful statului care a acuzat PSD că e incapabil să guverneze pentru România, singura preocupare fiind Liviu Dragnea.\n\n" + \
        "„Un lucru este clar, Guvernarea PSD a eșuat. În spitale, probleme peste probleme Educația este subfinanțată. " + \
        "România este bulversată mai ales după OUG 114, dată în mare taină la finalul anului trecut. " + \
        "Despre justiție, întreaga guvernare pesedistă a fost un asalt asupra statului de drept din România. PSD e incapabil să conducă România. " + \
        "PSD nu guvernează pentru români, PSD guvernează pentru Dragnea”, a spus Iohannis.\n\n" + \
        "Referindu-se la bugetul pe 2019, șeful statului a spus că acesta este „nerealist și supraevaluat”, calificându-l drept unul al „rușinii naționale”.\n\n" + \
        "Președintele a acuzat PSD că nu are bani de investiții, dar are bani pentru partid. " + \
        "„150 de milioane va primi PSD din finanțarea partidelor, din 270 de milioane propuse pentru finanțarea partidelor. " + \
        "PSD și-a tras bani de 20 de ori mai mult decât anul trecut (președinția a precizat ulterior că această comparație a fost făcută cu 2016-n.r.). " + \
        "Pentru asta au bani”, a spus Iohannis.\n"
    dto = tepro.pcFull(text)
    dto.dumpConllX()
    text = "HotNews.ro transmite LIVETEXT cele mai importante declarații din cadrul audierilor\n\n" + \
        "Ora 17,00: Andres Ritter, candidatul Germaniei a vorbit despre necesitatea înființării Parchetului European în contextul fraudelor și corupției, " + \
        "care slăbesc credibilitatea UE în ochii contribuabililor. În opinia sa, abordarea la nivel național nu a fost suficientă, este necesară o " + \
        "abordare unitară la nivelul UE\n\n" + \
        "Ora 16:40 S-a stabilit ordinea audierilor, prin tragere la sorți: " + \
        "Primul va fi audiat candidatul Germaniei, Andrés Ritter (54 de ani), urmat de candidatul Franței, " + \
        "Jean-François Bohnert (58 de ani) și de Laura Codruța Kovesi.\n"
    dto = tepro.pcFull(text)
    dto.dumpConllX()
    text = "La 7 minute de centrul Brasovului, imobilul\tpropus \
spre vanzare se adreseaza\t\tcelor care isi doresc un spatiu \
generos de locuit.\n\nAmplasarea constructiei si\t\tgarajul reusesc sa exploateze \
la maxim lotul de teren de 670 mp, ce are o deschidere de 15 ml.\n"
    # 3. Call one of the already created 'processing chains' ('pc' for short)
    # or call the generic pcExec method.
    # Example 1: using a canned processing chain ('pc'), e.g. diacritics insertion.
    dto = tepro.pcDiac(text)
    print(dto.getText())
    # Example 2: using another canned pc, e.g. lemmatization.
    dto = tepro.pcLemma(text)
    print(json.dumps(dto.jsonDict(), default=lambda x: x.__dict__))
    # Example 3: requesting specific operations, e.g. hyphenation and phonetic transcription.
    # TEPROLIN will figure out what else has to run such that these two operations are applied.
    dto = tepro.pcExec(text, [TeproAlgo.getHyphenationOperName(
    ), TeproAlgo.getPhoneticTranscriptionOperName()])
    dto.dumpConllX()
    tepro.getStats(Teprolin.statsTokens, Teprolin.statsMonth, 5)
def pcExec(self, text: str, ops: list) -> TeproDTO:
    """This processing chain will make sure that the list of requested
    operations (ops) are executed on the input text, along with their
    required dependencies.

    Raises RuntimeError when a requested operation is unknown.
    Returns the TeproDTO carrying all produced annotations."""
    availableOps = TeproAlgo.getAvailableOperations()
    # 1. Check if all requested ops are valid
    for op in ops:
        if op not in availableOps:
            raise RuntimeError("Operation '" + op + CONSTSTR1)
    # 2. Increase the number of requests by 1
    # with every call to this method.
    self._requests += 1
    configuredApps = []
    # 2.1 Resolve all operation dependencies
    expandedOps = TeproAlgo.resolveDependencies(ops)
    # 3.1 Dynamically alter the configuration
    # depending on exceptions. For instance
    # ner-icia requires ttl-icia, not nlp-cube-adobe
    TeproAlgo.reconfigureWithStrictRequirements(self._conf, expandedOps)
    # 3.2 Get instantiated apps for the requested operations.
    # Apps are added in the order provided by expandedOps,
    # so no more app sorting is needed.
    for op in expandedOps:
        opi = self._indexOfAlgo(self._conf[op])
        if opi >= 0:
            app = self._apps[opi]
            # One app may serve several ops; add it only once.
            if app not in configuredApps:
                configuredApps.append(app)
        else:
            print("{0}.{1}[{2}]: operation '{3}' is not supported yet.".
                  format(
                      Path(inspect.stack()[0].filename).stem,
                      inspect.stack()[0].function,
                      inspect.stack()[0].lineno,
                      op
                  ), file=sys.stderr, flush=True)
    # 5. Run all configured NLP apps in sequence on
    # the dto object.
    dto = TeproDTO(text, self._conf)
    for app in configuredApps:
        print("{0}.{1}[{2}]: running NLP app '{3}'".
              format(
                  Path(inspect.stack()[0].filename).stem,
                  inspect.stack()[0].function,
                  inspect.stack()[0].lineno,
                  app.getAlgoName()
              ), file=sys.stderr, flush=True)
        dto = app.doWork(dto)
    # 6. Collect statistics
    # Stats rows are [date-tuple, token count, request count, status];
    # the last row is reused while the (GMT) day stays the same.
    ts = gmtime()
    if self._stats and \
            self._stats[-1][0][0] == ts.tm_mday and \
            self._stats[-1][0][1] == ts.tm_mon and \
            self._stats[-1][0][2] == ts.tm_year:
        self._stats[-1][1] += dto.getProcessedTokens()
        self._stats[-1][2] += 1
        self._stats[-1][3] = SStatus.ACQUIRED
    else:
        date = (ts.tm_mday, ts.tm_mon, ts.tm_year)
        tkc = dto.getProcessedTokens()
        self._stats.append(
            [date, tkc, self._requests, SStatus.ACQUIRED])
    # 7. Write stats every statsUpdateCounts requests
    if self._requests % Teprolin.statsUpdateCounts == 0:
        self._writeStatsFile()
        self._stats = self._readStatsFile()
    # 8. Work done, return the dto object.
    return dto
def _runApp(self, dto, opNotDone): text = dto.getText() # 1. Send text to TTL, in UTF-8 bytes s = socket(family=AF_INET, type=SOCK_STREAM, proto=IPPROTO_TCP) s.connect(("localhost", self._ttlPort)) # Very important: add \n to flush the socket! text += "\n" s.send(text.encode(encoding='utf-8')) # Send the 'end of transmission' command s.send(TTLOps.eotCommand.encode(encoding='utf-8')) # 2. Get annotated text from TTL, in UTF-8 bytes, # 1024 bytes at a time. ttlBytes = [] b = s.recv(1024) while b != b'': ttlBytes.append(b) b = s.recv(1024) # 3. Extract annotated info from the returned text. ttlText = b''.join(ttlBytes).decode('utf-8') ttlSentences = ttlText.split(sep='\n\n') sid = 0 for ts in ttlSentences: ts = ts.strip() ttlTokens = ts.split('\n') idx = 0 # Teprolin tokenized sentence ttsent = [] # Teprolin string sentence tssent = "" for tt in ttlTokens: tp = tt.split() word = tp[0] ctag = tp[1] msd = tp[2] lem = tp[3] if TTLOps.lemmaProbRX.match(lem) != None: lem = TTLOps.lemmaProbRX.sub("", lem, 1) if ',' in msd: msd = msd.split(',')[0] chk = tp[4] tt = TeproTok() idx += 1 tt.setId(idx) tt.setWordForm(word) tt.setCTAG(ctag) tt.setMSD(msd) tt.setLemma(lem) if chk != '_': tt.setChunk(chk) ttsent.append(tt) if not tssent: tssent += word elif word in ",;.!?-)}]”'\"`": tssent += word elif tssent[-1] in "'\"-`„({[" and uc.category( word[0]).startswith("L"): tssent += word else: tssent += " " + word # end for tt if not dto.isOpPerformed(TeproAlgo.getSentenceSplittingOperName()): dto.addSentenceString(tssent) dto.addSentenceTokens(ttsent) else: # Check and update annotations that only TTL # can produce or that are requested specifically from it. alignment = dto.alignSentences(ttsent, sid) for op in opNotDone: dto.copyTokenAnnotation(ttsent, sid, alignment, op) sid += 1 # end for ts return dto