Пример #1
0
 def parse(self, text, tokenise=False):
     """Run the ``self.nlp`` pipeline over *text* and return the result.

     Args:
         text: The input sentence as a plain string.
         tokenise: When true, hand the raw string to ``self.nlp`` and let
             it do its own tokenisation. When false (the default), the
             string is split on whitespace and those pieces are used as
             the tokens.

     Returns:
         Whatever ``self.nlp(text)`` returns when ``tokenise`` is true;
         otherwise a ``Doc`` built from the whitespace-split tokens that
         has been passed through ``self.nlp.tagger`` and
         ``self.nlp.parser``.
     """
     if tokenise:
         return self.nlp(text)

     # Pre-tokenised input: build the document ourselves so the pipeline's
     # tokeniser cannot re-split the words, then apply the tagger and the
     # parser explicitly, in that order.
     words = text.split()
     doc = Doc(self.nlp.vocab, words)
     self.nlp.tagger(doc)
     self.nlp.parser(doc)
     return doc
Пример #2
0
    def parse(self, text, tokenise=False):
        """Parse *text* into a list of ``ParsedToken`` objects.

        The processing path is selected by ``self.lang``:

        * ``"en"``: ``self.nlp`` is used as a callable pipeline. With
          ``tokenise`` true the raw string is passed to it; otherwise the
          string is split on whitespace, wrapped in a ``Doc`` and passed
          through ``self.nlp.tagger`` and ``self.nlp.parser``. Each
          resulting token yields
          ``ParsedToken(text, lemma_, pos_, tag_, dep_)``.
        * ``"ar"``: the string is tokenised with
          ``camel_tools.tokenizers.word.simple_word_tokenize``;
          ``self.nlp[0]`` analyses each word and ``self.nlp[1]`` tags it.
          ``tokenise`` is not consulted on this path, and no dependency
          label is passed to ``ParsedToken``.

        Args:
            text: The input sentence as a plain string.
            tokenise: English only; whether the pipeline should tokenise
                the raw string itself.

        Returns:
            A list of ``ParsedToken``; empty when ``self.lang`` is neither
            ``"en"`` nor ``"ar"``.
        """
        parsed = []

        if self.lang == "en":
            if tokenise:
                doc = self.nlp(text)
            else:
                # Pre-tokenised input: keep the whitespace split as the
                # token boundaries and run tagger and parser by hand.
                doc = Doc(self.nlp.vocab, text.split())
                self.nlp.tagger(doc)
                self.nlp.parser(doc)

            # dep_ is only consumed by the English classifier.
            parsed.extend(
                ParsedToken(tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_)
                for tok in doc
            )

        elif self.lang == "ar":
            words = camel_tools.tokenizers.word.simple_word_tokenize(text)

            for word in words:
                analyses = self.nlp[0].analyze(word)
                if analyses:
                    # Use the first analysis returned for the word.
                    first = analyses[0]
                    lemma = first["stem"]
                    pos = first["pos"]
                else:
                    # The analyzer returned nothing for this token; fall
                    # back to empty values.
                    lemma = pos = ''
                # The tagger takes a list of words; each word is tagged
                # on its own and the single resulting tag is kept.
                tag = self.nlp[1].tag(word.split())[0]
                parsed.append(ParsedToken(word, lemma, pos, tag))

        return parsed