Пример #1
0
def test_NLPCube():
    # Route the base pipeline operations through NLPCube.
    for oper in (TeproAlgo.getSentenceSplittingOperName(),
                 TeproAlgo.getTokenizationOperName(),
                 TeproAlgo.getPOSTaggingOperName(),
                 TeproAlgo.getLemmatizationOperName()):
        tepro.configure(oper, TeproAlgo.algoCube)

    dto = tepro.pcExec(text, [TeproAlgo.getDependencyParsingOperName()])

    # The input text contains exactly two sentences.
    assert dto.getNumberOfSentences() == 2

    # Spot-check the dependency annotations in the first sentence...
    sent1 = dto.getSentenceTokens(0)
    assert sent1[7].getWordForm() == 'imobilul'
    assert sent1[7].getHead() == 13
    assert sent1[7].getDepRel() == 'nsubj'
    assert sent1[13].getWordForm() == 'celor'
    assert sent1[13].getHead() == 13
    assert sent1[13].getDepRel() == 'iobj'

    # ... and in the second sentence.
    sent2 = dto.getSentenceTokens(1)
    assert sent2[0].getWordForm() == 'Amplasarea'
    assert sent2[0].getHead() == 5
    assert sent2[0].getDepRel() == 'nsubj'
    assert sent2[1].getWordForm() == 'construcției'
    assert sent2[1].getHead() == 1
    assert sent2[1].getDepRel() == 'nmod'
Пример #2
0
    def configure(self, op: str, algo: str):
        """Map operation `op` to algorithm `algo` in this object's
        configuration, validating both names first.

        Raises:
            RuntimeError: if `op` or `algo` is unknown, or if `algo`
                cannot perform `op` (see class TeproAlgo).
        """
        availableOps = TeproAlgo.getAvailableOperations()
        availableAlgos = TeproAlgo.getAvailableAlgorithms()

        if op not in availableOps:
            raise RuntimeError("Operation '" + op + CONSTSTR1)

        if algo not in availableAlgos:
            raise RuntimeError(CONSTSTR2 + algo + CONSTSTR1)

        if not TeproAlgo.canPerform(algo, op):
            raise RuntimeError(CONSTSTR2 + algo +
                               "' cannot perform operation '" + op +
                               "'. See class TeproAlgo.")

        # Perf fix: inspect.stack() walks every frame and reads source
        # files; call it once instead of three times for the log line.
        caller = inspect.stack()[0]
        print(
            "{0}.{1}[{2}]: requesting operation '{3}' be performed with '{4}'".
            format(
                Path(caller.filename).stem,
                caller.function,
                caller.lineno, op, algo),
            file=sys.stderr,
            flush=True)

        self._conf[op] = algo
Пример #3
0
def test_TTL():
    # Route the base pipeline operations through TTL.
    for oper in (TeproAlgo.getSentenceSplittingOperName(),
                 TeproAlgo.getTokenizationOperName(),
                 TeproAlgo.getPOSTaggingOperName(),
                 TeproAlgo.getLemmatizationOperName()):
        tepro.configure(oper, TeproAlgo.algoTTL)

    dto = tepro.pcExec(text, [TeproAlgo.getChunkingOperName()])

    # The input text contains exactly two sentences.
    assert dto.getNumberOfSentences() == 2

    # First sentence: spot-check forms, MSDs, lemmas and chunks.
    sent1 = dto.getSentenceTokens(0)
    assert sent1[0].getWordForm() == 'La'
    assert sent1[0].getMSD() == 'Spsa'
    assert sent1[0].getLemma() == 'la'
    assert sent1[1].getWordForm() == '7'
    assert sent1[5].getWordForm() == 'Brașovului'
    assert sent1[5].getLemma() == 'Brașov'
    assert sent1[5].getMSD() == 'Npmsoy'
    assert sent1[5].getChunk() == 'Pp#2,Np#2'
    assert sent1[22].getWordForm() == '.'
    assert sent1[22].getCTAG() == 'PERIOD'

    # Second sentence:
    sent2 = dto.getSentenceTokens(1)
    assert sent2[0].getWordForm() == 'Amplasarea'
    assert sent2[22].getWordForm() == 'metri'
    assert sent2[22].getCTAG() == 'NPN'
Пример #4
0
    def pcDiac(self, text: str) -> str:
        """This processing chain will insert diacritics in a text
        which does not have them."""

        # Explicitly enumerate the required operations, in order:
        # normalization first, then diacritic restoration.
        ops = [TeproAlgo.getTextNormOperName(),
               TeproAlgo.getDiacRestorationOperName()]

        return self.pcExec(text, ops)
Пример #5
0
def test_AutoReconfiguration():
    dto = tepro.pcFull(text6)

    # pcFull must have auto-switched every base operation to TTL...
    for oper in (TeproAlgo.getSentenceSplittingOperName(),
                 TeproAlgo.getTokenizationOperName(),
                 TeproAlgo.getPOSTaggingOperName(),
                 TeproAlgo.getChunkingOperName()):
        assert tepro.getConfiguration(oper) == TeproAlgo.algoTTL

    # ... and the first token must carry the 'Np' MSD.
    assert dto.getSentenceTokens(0)[0].getMSD() == 'Np'
Пример #6
0
def test_TTS():
    # Route the base pipeline operations through TTL.
    for oper in (TeproAlgo.getSentenceSplittingOperName(),
                 TeproAlgo.getTokenizationOperName(),
                 TeproAlgo.getPOSTaggingOperName(),
                 TeproAlgo.getLemmatizationOperName()):
        tepro.configure(oper, TeproAlgo.algoTTL)

    # Request all TTS-related preprocessing operations at once.
    tts_ops = [
        TeproAlgo.getHyphenationOperName(),
        TeproAlgo.getStressIdentificationOperName(),
        TeproAlgo.getPhoneticTranscriptionOperName(),
        TeproAlgo.getAbbreviationRewritingOperName(),
        TeproAlgo.getNumeralRewritingOperName(),
    ]
    dto = tepro.pcExec(text5, tts_ops)

    # text5 is a single sentence.
    assert dto.getNumberOfSentences() == 1

    # Check expansions, syllabification and phonetic transcriptions.
    toks = dto.getSentenceTokens(0)
    assert toks[3].getExpansion() == 'o sută douăzeci și trei'
    assert toks[0].getSyllables() == "a-'ceas-ta"
    assert toks[0].getPhonetical() == "a ch e@ a s t a"
    assert toks[11].getSyllables() == "'vir-gu-lă"
    assert toks[11].getPhonetical() == "v i r g u l @"
    assert toks[14].getExpansion() == \
        'patruzeci și cinci virgulă șase sute treizeci și unu'
Пример #7
0
def test_MWEsAndDepTransfer():
    # Route the base pipeline operations through TTL.
    for oper in (TeproAlgo.getSentenceSplittingOperName(),
                 TeproAlgo.getTokenizationOperName(),
                 TeproAlgo.getPOSTaggingOperName(),
                 TeproAlgo.getLemmatizationOperName()):
        tepro.configure(oper, TeproAlgo.algoTTL)

    dto = tepro.pcFull(text2)

    # 'o_să' must come out as a single MWE token with the
    # dependency annotation transferred onto it.
    mwe = dto.getSentenceTokens(0)[3]
    assert mwe.getWordForm() == 'o_să'
    assert mwe.getMSD() == 'Qf'
    assert mwe.getHead() == 7
    assert mwe.getDepRel() == 'mark'
Пример #8
0
    def defaultConfiguration(self):
        """Reset the operation -> algorithm map to TeproAlgo's defaults,
        logging each assignment to stderr."""

        # Dictionary of operations to implementing algorithms (NLP app).
        self._conf = {}

        # Perf fix: inspect.stack() is expensive (walks all frames and
        # reads source files); call it once instead of on every loop
        # iteration.
        caller = inspect.stack()[0]

        for op in TeproAlgo.getAvailableOperations():
            self._conf[op] = TeproAlgo.getDefaultAlgoForOper(op)
            print("{0}.{1}[{2}]: configuring operation '{3}' with algorithm '{4}'".
                    format(
                        Path(caller.filename).stem,
                        caller.function,
                        caller.lineno,
                        op,
                        self._conf[op]
                    ), file=sys.stderr, flush=True)
Пример #9
0
    def get(self, oper):
        """This method will return the available Teprolin
        algorithms (NLP apps) for the specified 'oper'."""

        # Guard clause: unknown operation -> report the current
        # configuration together with an explanatory message.
        if oper not in TeproAlgo.getAvailableOperations():
            body = {
                'teprolin-conf':
                self._teprolin.getConfiguration(),
                'teprolin-result':
                "Operation '" + oper +
                "' is not recognized. See class TeproAlgo."
            }
            return (body, int(HTTPStatus.BAD_REQUEST))

        # Known operation: list its implementing algorithms.
        return ({oper: TeproAlgo.getAlgorithmsForOper(oper)},
                int(HTTPStatus.OK))
Пример #10
0
def test_NEROps():
    dto = tepro.pcExec(text3, [TeproAlgo.getNamedEntityRecognitionOperName()])

    # Check NER annotations: tokens 0-1 belong to an organization,
    # token 3 is a time expression.
    toks = dto.getSentenceTokens(0)
    for idx, label in [(0, 'ORG'), (1, 'ORG'), (3, 'TIME')]:
        assert toks[idx].getNER() == label
Пример #11
0
    def _runApp(self, dto, opNotDone):
        """Run the biomedical NER tagger over each sentence of `dto`,
        attaching BioNER labels to its tokens in place.

        Does nothing unless biomedical NER is among the operations
        still to be performed (`opNotDone`). Returns `dto`."""

        # Idiom fix: 'X not in xs' instead of 'not X in xs'.
        if TeproAlgo.getBiomedicalNamedEntityRecognitionOperName() \
                not in opNotDone:
            return dto

        sequences = self._prepareSentences(dto)

        for i, seq in enumerate(sequences):
            rez = self._tagger.tag(seq)
            tsent = dto.getSentenceTokens(i)

            # These are equal, but just in case...
            if len(rez) == len(tsent):
                for tok, rtok in zip(tsent, rez):
                    # rtok[0] is the BioNER label, rtok[1] the MSD;
                    # only accept the label when the tagger agrees with
                    # the pipeline on the token's MSD.
                    if tok.getMSD() == rtok[1]:
                        bnlabel = rtok[0]

                        if bnlabel not in ('', '_', '-'):
                            tok.setBioNER(bnlabel)

        return dto
Пример #12
0
    def pcLemma(self, text: str) -> TeproDTO:
        """This processing chain will do POS tagging and lemmatization
        on the input text, splitting the text in sentences and tokens beforehand."""

        # Only 'lemmatization' is requested here; pcExec infers and
        # runs its prerequisite operations automatically.
        ops = [TeproAlgo.getLemmatizationOperName()]
        return self.pcExec(text, ops)
Пример #13
0
def test_BioNEROps():
    dto = tepro.pcExec(text4, [TeproAlgo.getBiomedicalNamedEntityRecognitionOperName()])

    # Check BioNER annotations at the expected token positions.
    toks = dto.getSentenceTokens(0)
    for idx, label in [(0, 'B-DISO'), (1, 'I-DISO'), (4, 'B-DISO'),
                       (11, 'B-CHEM'), (13, 'B-ANAT')]:
        assert toks[idx].getBioNER() == label
Пример #14
0
    def _checkProgress(self, dto: TeproDTO) -> list:
        """Checks what operations have been performed already
        and what needs to be done further.

        Returns the list of operations supported by this algorithm
        that still have to run on `dto`."""

        # An op must (re)run if it was never performed, or if this very
        # algorithm is the one configured to perform it.
        # NOTE(review): the '==' branch re-queues ops already performed
        # when this algorithm is the configured one — looks intentional
        # (refresh its own annotations), but confirm it is not meant
        # to be '!='.
        return [op for op in TeproAlgo.getOperationsForAlgo(self._algoName)
                if not dto.isOpPerformed(op) or
                self._algoName == dto.getConfiguredAlgoForOper(op)]
Пример #15
0
def test_UDPipe():
    # Route the base pipeline operations through UDPipe.
    for oper in (TeproAlgo.getSentenceSplittingOperName(),
                 TeproAlgo.getTokenizationOperName(),
                 TeproAlgo.getPOSTaggingOperName(),
                 TeproAlgo.getLemmatizationOperName()):
        tepro.configure(oper, TeproAlgo.algoUDPipe)

    dto = tepro.pcExec(text4, [TeproAlgo.getDependencyParsingOperName()])

    # text4 is a single sentence of 21 tokens.
    assert dto.getNumberOfSentences() == 1
    toks = dto.getSentenceTokens(0)
    assert len(toks) == 21

    # Check some dependency structure...
    assert toks[5].getHead() == 5
    assert toks[5].getDepRel() == 'acl'
    assert toks[10].getWordForm() == 'concentrației'
    assert toks[10].getLemma() == 'concentrație'
    assert toks[10].getCTAG() == 'NOUN'
    assert toks[10].getMSD() == 'Ncfsoy'
    assert toks[10].getHead() == 8
    assert toks[10].getDepRel() == 'nmod'
    assert toks[17].getWordForm() == 'și'
Пример #16
0
    def _runApp(self, dto, opNotDone):
        """Tokenize, tag and dependency-parse the DTO's text with the
        UDPipe model, merging the resulting annotations into `dto`.
        Returns `dto`."""
        text = dto.getText()

        tokenizer = self._model.newTokenizer(self._model.DEFAULT)
        tokenizer.setText(text)
        error = ProcessingError()
        sentence = Sentence()
        sid = 0

        while tokenizer.nextSentence(sentence, error):
            self._model.tag(sentence, self._model.DEFAULT)
            self._model.parse(sentence, self._model.DEFAULT)
            # Teprolin tokenized sentence
            ttsent = []
            # Teprolin string sentence
            tssent = sentence.getText()

            for w in sentence.words:
                # Word id 0 is UDPipe's artificial root node; skip it.
                if w.id == 0:
                    continue

                # Copy the UDPipe word into a Teprolin token.
                tt = TeproTok()

                tt.setId(w.id)
                tt.setWordForm(w.form)
                tt.setCTAG(w.upostag)
                tt.setMSD(w.xpostag)
                tt.setLemma(w.lemma)
                tt.setHead(w.head)
                tt.setDepRel(w.deprel)

                ttsent.append(tt)
            # end for w

            if not dto.isOpPerformed(TeproAlgo.getSentenceSplittingOperName()):
                # No splitter ran before us: record both the sentence
                # string and its tokens.
                dto.addSentenceString(tssent)
                dto.addSentenceTokens(ttsent)
            else:
                # Check and update annotations that only this app
                # can produce or that are requested specifically from it.
                alignment = dto.alignSentences(ttsent, sid)

                for op in opNotDone:
                    dto.copyTokenAnnotation(ttsent, sid, alignment, op)

            # Fresh Sentence object for the next nextSentence() call.
            sentence = Sentence()
            sid += 1
        # end all split sentences.

        return dto
Пример #17
0
def test_DiacRestore():
    dto = tepro.pcExec(text, [TeproAlgo.getDiacRestorationOperName()])

    # Diacritics must have been restored at these character offsets:
    # 26 -> Brașovului, 57 -> vânzare, 75 -> adresează,
    # 88/89 -> își, 105 -> spațiu.
    restored = dto.getText()
    for pos, ch in [(26, 'ș'), (57, 'â'), (75, 'ă'),
                    (88, 'î'), (89, 'ș'), (105, 'ț')]:
        assert restored[pos] == ch
Пример #18
0
def test_TextNorm():
    dto = tepro.pcExec(text, [TeproAlgo.getTextNormOperName()])
    normed = dto.getText()

    # Extra spaces have been removed...
    assert normed[5] == 'm'
    assert normed[35] == 'i'

    # Tabs have been removed...
    assert normed[43] == ' '
    assert normed[76] == ' '
    assert normed[77] == 'c'

    # ... while newlines are preserved.
    assert normed[127] == '\n'
    assert normed[128] == '\n'
Пример #19
0
    def _runApp(self, dto, opNotDone):
        """POST the DTO's sentences to the external NER service and
        attach the returned entity labels to matching tokens in place.

        Does nothing unless NER is among the operations still to be
        performed (`opNotDone`). Returns `dto`."""

        # Idiom fix: 'X not in xs' instead of 'not X in xs'.
        if TeproAlgo.getNamedEntityRecognitionOperName() not in opNotDone:
            return dto

        sentences = self._prepareSentences(dto)
        resp = requests.post(GENERALNERURL, data={"tokens": sentences})

        if resp.ok:
            nsentences = resp.text.split("\n")
            i = -1
            csentence = []

            for ntok in nsentences:
                if not ntok:
                    # Skip empty strings.
                    continue

                if ntok.startswith("<s>"):
                    # Sentence opening tag: advance to the next sentence.
                    i += 1
                elif ntok.startswith("</s>"):
                    # Sentence closing tag: transfer the labels collected
                    # for this sentence onto the DTO's tokens.
                    tsentence = dto.getSentenceTokens(i)

                    if len(csentence) == len(tsentence):
                        for j in range(len(tsentence)):
                            if tsentence[j].getWordForm() == csentence[j][0] and \
                                    csentence[j][1] != "O":
                                tsentence[j].setNER(csentence[j][1])

                    csentence = []
                else:
                    # Token line: column 0 is the word form and column 5
                    # the NER label (assumed service format — TODO confirm).
                    parts = ntok.split()
                    csentence.append((parts[0], parts[5]))
            # end for ntok
        else:
            # Perf fix: inspect.stack() is expensive; call it once for
            # the log line instead of three times.
            caller = inspect.stack()[0]
            print(
                "{0}.{1}[{2}]: connecting to {3} failed with code {4}".format(
                    Path(caller.filename).stem,
                    caller.function,
                    caller.lineno, GENERALNERURL,
                    resp.status_code),
                file=sys.stderr,
                flush=True)

        return dto
Пример #20
0
    def _runApp(self, dto, opNotDone):
        """Query the MLPLAServer for syllabification/stress, phonetic
        transcription and abbreviation/numeral expansion, attaching
        the annotations to `dto`'s tokens in place. Returns `dto`."""
        if (TeproAlgo.getTokenizationOperName() in opNotDone):
            # Tokenization is required for MLPLAServer to work
            return dto

        for i in range(dto.getNumberOfSentences()):
            tsent = dto.getSentenceTokens(i)
            wforms = []

            for tok in tsent:
                wforms.append(tok.getWordForm())

            # Annotate the space-joined sentence in one server call.
            msent = self._getSentenceAnnotation(" ".join(wforms))

            if len(msent) == len(tsent):
                for j in range(len(msent)):
                    orig = tsent[j]
                    mtok = msent[j]

                    # mtok layout (inferred from the indexing below):
                    # [0]=word form, [1]=syllables, [2]=phonetical,
                    # [3]=expansion; '_' marks "no annotation".
                    if orig.getWordForm() == mtok[0]:
                        # Each annotation is copied only if present and
                        # if the corresponding operation was requested.
                        if mtok[1] != '_' and \
                            (TeproAlgo.getHyphenationOperName() in opNotDone or \
                                TeproAlgo.getStressIdentificationOperName() in opNotDone):
                            orig.setSyllables(mtok[1])

                        if mtok[2] != '_' and TeproAlgo.getPhoneticTranscriptionOperName(
                        ) in opNotDone:
                            orig.setPhonetical(mtok[2])

                        if mtok[3] != '_' and \
                            (TeproAlgo.getNumeralRewritingOperName() in opNotDone or \
                                TeproAlgo.getAbbreviationRewritingOperName() in opNotDone):
                            orig.setExpansion(mtok[3])
                    # end if word forms match
                # end all tokens
            # end if sentence lengths match
        # end all found sentences

        return dto
Пример #21
0
    def pcFull(self, text: str) -> TeproDTO:
        """This is the complete processing chain (pc), executing
        all NLP ops enumerated in TeproAlgo."""

        # Request every known operation on the text.
        everything = TeproAlgo.getAvailableOperations()
        return self.pcExec(text, everything)
Пример #22
0
    def _runApp(self, dto, opNotDone):
        """Run the NLPCube pipeline over the DTO's text and merge the
        produced annotations (tokens, CTAG/MSD, lemmas, dependencies)
        into `dto`. Returns `dto`."""
        text = dto.getText()
        sentences = self._cubeInst(text)
        sid = 0

        for sent in sentences:
            # Teprolin tokenized sentence
            ttsent = []
            # Teprolin string sentence
            tssent = ""

            for tok in sent:
                tt = TeproTok()
                tt.setId(tok.index)
                tt.setWordForm(tok.word)
                lowerWord = tok.word.lower()
                tt.setMSD(tok.xpos)

                # Assigning the mapped CTAG to the disambiguated MSD
                if tok.xpos in self._msd2ctag:
                    tt.setCTAG(self._msd2ctag[tok.xpos])
                else:
                    # No mapping known: fall back to the raw MSD.
                    tt.setCTAG(tok.xpos)

                lemmaIsSet = False

                # Doing lexicon lemmatization, if possible.
                if tok.word in self._tblwordform:
                    if tok.xpos in self._tblwordform[tok.word] and \
                            len(self._tblwordform[tok.word][tok.xpos]) == 1:
                        # TODO: if lemma is ambiguous, e.g. 'copii' can be 'copil' or 'copie'
                        tt.setLemma(self._tblwordform[tok.word][tok.xpos][0])
                        lemmaIsSet = True
                # NOTE(review): the lowercased fallback below only fires
                # when tok.word is entirely absent from the lexicon; a
                # word present with a different MSD gets no fallback —
                # confirm this is intended.
                elif lowerWord in self._tblwordform and \
                        tok.xpos in self._tblwordform[lowerWord] and \
                        len(self._tblwordform[lowerWord][tok.xpos]) == 1:
                    tt.setLemma(self._tblwordform[lowerWord][tok.xpos][0])
                    lemmaIsSet = True

                if not lemmaIsSet:
                    # Fall back to NLPCube's own lemma.
                    tt.setLemma(tok.lemma)

                tt.setHead(tok.head)
                tt.setDepRel(tok.label)

                # Rebuild the detokenized sentence string, honoring the
                # CoNLL-U style 'SpaceAfter=No' flag.
                tssent += tok.word

                if tok.space_after != "SpaceAfter=No":
                    tssent += " "

                ttsent.append(tt)
            # end ttsent/tssent formation

            if not dto.isOpPerformed(TeproAlgo.getSentenceSplittingOperName()):
                # No splitter ran before us: record both the sentence
                # string and its tokens.
                dto.addSentenceString(tssent)
                dto.addSentenceTokens(ttsent)
            else:
                # Check and update annotations that only NLPCube
                # can produce or that are requested specifically from it.
                alignment = dto.alignSentences(ttsent, sid)

                for op in opNotDone:
                    dto.copyTokenAnnotation(ttsent, sid, alignment, op)

            sid += 1

        return dto
Пример #23
0
    def pcParse(self, text: str) -> TeproDTO:
        """This processing chain will do chunking and dependency parsing
        on the input text, splitting the text in sentences and tokens and
        doing POS tagging and lemmatization beforehand."""

        # Dependency parsing pulls in all of its prerequisites.
        ops = [TeproAlgo.getDependencyParsingOperName()]
        return self.pcExec(text, ops)
Пример #24
0
 def isOpPerformed(self, op: str) -> bool:
     """True iff the valid operation `op` has already been performed."""

     # Guard clause: reject unknown operation names outright.
     if op not in TeproAlgo.getAvailableOperations():
         raise RuntimeError("Operation '" + op +
                            "' is not a valid TeproAlgo operation!")

     return op in self._performedOps
Пример #25
0
 def addPerformedOp(self, op: str):
     """Record the valid operation `op` as having been performed."""

     # Guard clause: reject unknown operation names outright.
     if op not in TeproAlgo.getAvailableOperations():
         raise RuntimeError("Operation '" + op +
                            "' is not a valid TeproAlgo operation!")

     self._performedOps.add(op)
Пример #26
0
def main():
    """Demo driver: exercises the Teprolin object end-to-end —
    configuration, the canned processing chains, pcExec with explicit
    operation lists, and the usage-statistics API."""
    # How to use the Teprolin Python 3 object:
    # 1. Create the object
    tepro = Teprolin()

    # 0.9 Test NER auto-configuration
    text = "Intel Celeron N4020"
    dto = tepro.pcFull(text)
    dto.dumpConllX()

    # 1.0 Check new TTSOps
    text = "Aceasta este propoziția 123 de test și nu-ți dă cu virgulă ca în 45.631."
    tepro.configure(TeproAlgo.getSentenceSplittingOperName(),
                    TeproAlgo.algoTTL)
    tepro.configure(TeproAlgo.getTokenizationOperName(), TeproAlgo.algoTTL)
    tepro.configure(TeproAlgo.getPOSTaggingOperName(), TeproAlgo.algoTTL)
    tepro.configure(TeproAlgo.getLemmatizationOperName(), TeproAlgo.algoTTL)
    dto = tepro.pcExec(
        text, [TeproAlgo.getHyphenationOperName(), TeproAlgo.getPhoneticTranscriptionOperName(),
        TeproAlgo.getNumeralRewritingOperName()])
    dto.dumpConllX()

    tepro.getStats(Teprolin.statsTokens, Teprolin.statsDay, 2)

    # 1.1 Test the UDPipe flow
    tepro.configure(
        TeproAlgo.getSentenceSplittingOperName(), TeproAlgo.algoUDPipe)
    tepro.configure(TeproAlgo.getTokenizationOperName(), TeproAlgo.algoUDPipe)
    tepro.configure(TeproAlgo.getPOSTaggingOperName(), TeproAlgo.algoUDPipe)
    tepro.configure(TeproAlgo.getLemmatizationOperName(), TeproAlgo.algoUDPipe)

    text = "Diabetul zaharat este un sindrom caracterizat prin valori crescute ale concentrației glucozei \
        in sange (hiperglicemie) si dezechilibrarea metabolismului. \
        Daca l-ai luat, te-ai imbolnavit destul de grav."
    dto = tepro.pcExec(
        text, [TeproAlgo.getDependencyParsingOperName()])
    dto.dumpConllX()

    # 2. Optionally, configure the operation execution
    # Example configuration call
    tepro.configure(TeproAlgo.getSentenceSplittingOperName(),
                    TeproAlgo.algoTTL)
    tepro.configure(TeproAlgo.getTokenizationOperName(), TeproAlgo.algoTTL)
    tepro.configure(TeproAlgo.getPOSTaggingOperName(), TeproAlgo.algoTTL)
    tepro.configure(TeproAlgo.getLemmatizationOperName(), TeproAlgo.algoTTL)

    # 2.1 Test biomedical NER
    text = "Diabetul zaharat este un sindrom caracterizat prin valori crescute ale concentrației glucozei în sânge (hiperglicemie) și dezechilibrarea metabolismului."
    dto = tepro.pcExec(
        text, [TeproAlgo.getBiomedicalNamedEntityRecognitionOperName()])
    dto.dumpConllX()

    # 2.2 Test NER
    text = "Instanta suprema reia astazi judecarea. In dosar, se judeca Liviu Dragnea cu Ministerul Justitiei, condus de Tudorel Toader."
    dto = tepro.pcExec(text, [TeproAlgo.getNamedEntityRecognitionOperName()])
    dto.dumpConllX()

    # 2.3 Test for some bugs
    text = "Am aflat aprope ca euro si dolarul sunt cele mai bune."
    dto = tepro.pcFull(text)
    dto.dumpConllX()

    text = "Stia ca demonstratia o sa fie un succes."
    dto = tepro.pcFull(text)
    dto.dumpConllX()

    # 2.4 Test for a crash
    text = "Președintele Klaus Iohannis a anunțat că nu promulgă legea bugetului pe 2019 și sesizează Curtea Constituțională. " + \
        "„Este bugetul rușinii naționale”, a spus șeful statului care a acuzat PSD că e incapabil să guverneze pentru România, singura preocupare fiind Liviu Dragnea.\n\n" + \
        "„Un lucru este clar, Guvernarea PSD a eșuat. În spitale, probleme peste probleme Educația este subfinanțată. " + \
        "România este bulversată mai ales după OUG 114, dată în mare taină la finalul anului trecut. " + \
        "Despre justiție, întreaga guvernare pesedistă a fost un asalt asupra statului de drept din România. PSD e incapabil să conducă România. " + \
        "PSD nu guvernează pentru români, PSD guvernează pentru Dragnea”, a spus Iohannis.\n\n" + \
        "Referindu-se la bugetul pe 2019, șeful statului a spus că acesta este „nerealist și supraevaluat”, calificându-l drept unul al „rușinii naționale”.\n\n" + \
        "Președintele a acuzat PSD că nu are bani de investiții, dar are bani pentru partid. " + \
        "„150 de milioane va primi PSD din finanțarea partidelor, din 270 de milioane propuse pentru finanțarea partidelor. " + \
        "PSD și-a tras bani de 20 de ori mai mult decât anul trecut (președinția a precizat ulterior că această comparație a fost făcută cu 2016-n.r.). " + \
        "Pentru asta au bani”, a spus Iohannis.\n"
    dto = tepro.pcFull(text)
    dto.dumpConllX()

    text = "HotNews.ro transmite LIVETEXT cele mai importante declarații din cadrul audierilor\n\n" + \
        "Ora 17,00: Andres Ritter, candidatul Germaniei a vorbit despre necesitatea înființării Parchetului European în contextul fraudelor și corupției, " + \
        "care slăbesc credibilitatea UE în ochii contribuabililor. În opinia sa, abordarea la nivel național nu a fost suficientă, este necesară o " + \
        "abordare unitară la nivelul UE\n\n" + \
        "Ora 16:40 S-a stabilit ordinea audierilor, prin tragere la sorți: " + \
        "Primul va fi audiat candidatul Germaniei, Andrés Ritter (54 de ani), urmat de candidatul Franței, " + \
        "Jean-François Bohnert (58 de ani) și de Laura Codruța Kovesi.\n"
    dto = tepro.pcFull(text)
    dto.dumpConllX()

    # Deliberately noisy text (extra spaces, tabs, missing diacritics)
    # used by the canned-chain examples below.
    text = "La 7      minute de centrul Brasovului,  imobilul\tpropus \
        spre vanzare se adreseaza\t\tcelor care isi doresc un spatiu \
        generos de locuit.\n\nAmplasarea constructiei  si\t\tgarajul reusesc sa exploateze \
        la maxim lotul de teren de 670 mp, ce are o deschidere de 15 ml.\n"

    # 3. Call one of the already created 'processing chains' ('pc' for short)
    # or call the generic pcExec method.
    # Example 1: using a canned processing chain ('pc'), e.g. diacritics insertion.
    dto = tepro.pcDiac(text)
    print(dto.getText())

    # Example 2: using another canned pc, e.g. lemmatization.
    dto = tepro.pcLemma(text)
    print(json.dumps(dto.jsonDict(), default=lambda x: x.__dict__))

    # Example 3: requesting specific operations, e.g. hyphenation and phonetic transcription.
    # TEPROLIN will figure out what else has to run such that these two operations are applied.
    dto = tepro.pcExec(text, [TeproAlgo.getHyphenationOperName(
    ), TeproAlgo.getPhoneticTranscriptionOperName()])
    dto.dumpConllX()

    tepro.getStats(Teprolin.statsTokens, Teprolin.statsMonth, 5)
Пример #27
0
    def pcExec(self, text: str, ops: list) -> TeproDTO:
        """This processing chain will make sure that the list of
        requested operations (ops) are executed on the input text,
        along with their required dependencies.

        Raises:
            RuntimeError: if any requested operation is unknown.
        """

        availableOps = TeproAlgo.getAvailableOperations()

        # 1. Check if all requested ops are valid
        for op in ops:
            if op not in availableOps:
                raise RuntimeError("Operation '" + op + CONSTSTR1)

        # 2. Increase the number of requests by 1
        # with every call to this method.
        self._requests += 1
        configuredApps = []

        # 2.1 Resolve all operation dependencies
        expandedOps = TeproAlgo.resolveDependencies(ops)

        # 3.1 Dynamically alter the configuration
        # depending on exceptions. For instance
        # ner-icia requires ttl-icia, not nlp-cube-adobe
        TeproAlgo.reconfigureWithStrictRequirements(self._conf, expandedOps)

        # Perf fix: inspect.stack() walks every frame and reads source
        # files; call it once here instead of three times per print
        # inside the loops below.
        caller = inspect.stack()[0]

        # 3.2 Get instantiated apps for the requested operations.
        # Apps are added in the order provided by expandedOps,
        # so no more app sorting is needed.
        for op in expandedOps:
            opi = self._indexOfAlgo(self._conf[op])

            if opi >= 0:
                app = self._apps[opi]

                if app not in configuredApps:
                    configuredApps.append(app)
            else:
                print("{0}.{1}[{2}]: operation '{3}' is not supported yet.".
                      format(
                          Path(caller.filename).stem,
                          caller.function,
                          caller.lineno,
                          op
                      ), file=sys.stderr, flush=True)

        # 5. Run all configured NLP apps in sequence on
        # the dto object.
        dto = TeproDTO(text, self._conf)

        for app in configuredApps:
            print("{0}.{1}[{2}]: running NLP app '{3}'".
                  format(
                      Path(caller.filename).stem,
                      caller.function,
                      caller.lineno,
                      app.getAlgoName()
                  ), file=sys.stderr, flush=True)
            dto = app.doWork(dto)

        # 6. Collect statistics: if the newest record is for today
        # (UTC), update it in place; otherwise open a new daily record.
        ts = gmtime()

        if self._stats and \
                self._stats[-1][0][0] == ts.tm_mday and \
                self._stats[-1][0][1] == ts.tm_mon and \
                self._stats[-1][0][2] == ts.tm_year:
            self._stats[-1][1] += dto.getProcessedTokens()
            self._stats[-1][2] += 1
            self._stats[-1][3] = SStatus.ACQUIRED
        else:
            # NOTE(review): a fresh record stores the cumulative
            # self._requests as its request count, while the branch
            # above increments by 1 — confirm this asymmetry is
            # intended.
            date = (ts.tm_mday, ts.tm_mon, ts.tm_year)
            tkc = dto.getProcessedTokens()
            self._stats.append(
                [date, tkc, self._requests, SStatus.ACQUIRED])

        # 7. Write stats every statsUpdateCounts requests
        if self._requests % Teprolin.statsUpdateCounts == 0:
            self._writeStatsFile()
            self._stats = self._readStatsFile()

        # 8. Work done, return the dto object.
        return dto
Пример #28
0
    def _runApp(self, dto, opNotDone):
        """Annotate the DTO's text with the external TTL server and
        merge the resulting annotations (CTAG, MSD, lemma, chunk) into
        `dto`. Returns `dto`."""
        text = dto.getText()

        # 1. Send text to TTL, in UTF-8 bytes
        s = socket(family=AF_INET, type=SOCK_STREAM, proto=IPPROTO_TCP)
        s.connect(("localhost", self._ttlPort))

        try:
            # Very important: add \n to flush the socket!
            text += "\n"
            s.send(text.encode(encoding='utf-8'))
            # Send the 'end of transmission' command
            s.send(TTLOps.eotCommand.encode(encoding='utf-8'))

            # 2. Get annotated text from TTL, in UTF-8 bytes,
            # 1024 bytes at a time.
            ttlBytes = []
            b = s.recv(1024)

            while b != b'':
                ttlBytes.append(b)
                b = s.recv(1024)
        finally:
            # Bug fix: the socket was previously never closed
            # (resource leak, one connection per call).
            s.close()

        # 3. Extract annotated info from the returned text.
        ttlText = b''.join(ttlBytes).decode('utf-8')
        ttlSentences = ttlText.split(sep='\n\n')
        sid = 0

        for ts in ttlSentences:
            ts = ts.strip()
            ttlTokens = ts.split('\n')
            idx = 0
            # Teprolin tokenized sentence
            ttsent = []
            # Teprolin string sentence
            tssent = ""

            for tt in ttlTokens:
                # Token line layout: word, CTAG, MSD, lemma, chunk.
                tp = tt.split()
                word = tp[0]
                ctag = tp[1]
                msd = tp[2]
                lem = tp[3]

                # Strip TTL's lemma probability prefix, if present
                # (idiom fix: 'is not None' instead of '!= None').
                if TTLOps.lemmaProbRX.match(lem) is not None:
                    lem = TTLOps.lemmaProbRX.sub("", lem, 1)

                # Keep only the first alternative of an ambiguous MSD.
                if ',' in msd:
                    msd = msd.split(',')[0]

                chk = tp[4]

                tt = TeproTok()

                idx += 1
                tt.setId(idx)
                tt.setWordForm(word)
                tt.setCTAG(ctag)
                tt.setMSD(msd)
                tt.setLemma(lem)

                if chk != '_':
                    tt.setChunk(chk)

                ttsent.append(tt)

                # Rebuild the detokenized sentence string with
                # punctuation-aware spacing.
                if not tssent:
                    tssent += word
                elif word in ",;.!?-)}]”'\"`":
                    tssent += word
                elif tssent[-1] in "'\"-`„({[" and uc.category(
                        word[0]).startswith("L"):
                    tssent += word
                else:
                    tssent += " " + word
            # end for tt

            if not dto.isOpPerformed(TeproAlgo.getSentenceSplittingOperName()):
                dto.addSentenceString(tssent)
                dto.addSentenceTokens(ttsent)
            else:
                # Check and update annotations that only TTL
                # can produce or that are requested specifically from it.
                alignment = dto.alignSentences(ttsent, sid)

                for op in opNotDone:
                    dto.copyTokenAnnotation(ttsent, sid, alignment, op)

            sid += 1
        # end for ts

        return dto