def parse(self, text, tokenise=False):
    """Turn ``text`` into an annotated document using ``self.nlp``.

    Args:
        text: The sentence to process, as a single string.
        tokenise: When true, ``self.nlp`` is called on the raw string and
            its result is returned as-is. When false (the default), the
            string is assumed to be pre-tokenised: it is split on
            whitespace and wrapped in a ``Doc`` built on ``self.nlp.vocab``.

    Returns:
        Whatever ``self.nlp(text)`` returns when ``tokenise`` is true,
        otherwise the ``Doc`` built from the whitespace-split tokens after
        ``self.nlp.tagger`` and ``self.nlp.parser`` have been applied to it.
    """
    if tokenise:
        # Let the pipeline do its own tokenisation and annotation.
        return self.nlp(text)

    # Keep the caller's tokenisation: one token per whitespace-separated
    # chunk, then run only the tagger and the parser over the document.
    doc = Doc(self.nlp.vocab, text.split())
    self.nlp.tagger(doc)
    self.nlp.parser(doc)
    return doc
def parse(self, text, tokenise=False):
    """Parse ``text`` into a list of ``ParsedToken`` objects.

    The work done depends on ``self.lang``:

    * ``"en"``: the sentence goes through the spaCy pipeline in
      ``self.nlp`` and each token contributes its text, lemma, POS, tag
      and dependency label.
    * ``"ar"``: the sentence is split with camel_tools'
      ``simple_word_tokenize``; ``self.nlp[0]`` is used as a morphological
      analyzer and ``self.nlp[1]`` as a tagger. No dependency label is
      passed, since it is only needed by the English classifier.

    Args:
        text: The sentence to process, as a single string.
        tokenise: English only. When true, ``self.nlp`` tokenises the raw
            string itself; when false the string is split on whitespace
            and only the tagger and parser are applied. Ignored for
            Arabic.

    Returns:
        A list of ``ParsedToken``, one per token. The list is empty when
        ``self.lang`` is neither ``"en"`` nor ``"ar"``.
    """
    if self.lang == "en":
        if tokenise:
            doc = self.nlp(text)
        else:
            # Pre-tokenised input: keep the caller's whitespace tokens and
            # run only the tagger and the parser over them.
            doc = Doc(self.nlp.vocab, text.split())
            self.nlp.tagger(doc)
            self.nlp.parser(doc)
        # spaCy values, including the dependency label.
        return [
            ParsedToken(tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_)
            for tok in doc
        ]

    if self.lang == "ar":
        parsed = []
        for word in camel_tools.tokenizers.word.simple_word_tokenize(text):
            analyses = self.nlp[0].analyze(word)
            if analyses:
                # Use the first analysis; its stem stands in for the lemma.
                first = analyses[0]
                lemma = first["stem"]
                pos = first["pos"]
            else:
                # The analyzer returned nothing for this token.
                lemma = pos = ""
            # Each word is tagged on its own, as a one-word sequence.
            tag = self.nlp[1].tag(word.split())[0]
            parsed.append(ParsedToken(word, lemma, pos, tag))
        return parsed

    return []