def prepare(self, text, tagger, lemmatizer):
    """
    Turn a raw text into tagged and lemmatized tokens.

    The text is normalized, tokenized, and converted by
    `instantiate_text` into a (samples, tokens) pair stored on
    `_samples` and `_tokens`. The tagger and then the lemmatizer are
    run over `self.tokens`.

    (Design note kept from the original: this method may belong in the
    TextManager class.)
    """
    cleaned = normalize_text(text)
    samples, tokens = self.instantiate_text(self.tokenize(cleaned))
    self._samples = samples
    self._tokens = tokens
    # `self.tokens` presumably exposes `_tokens` -- confirm on the class.
    # Tagging runs before lemmatization, as in the original order.
    tagger.tag_all(self.tokens)
    lemmatizer.do(self.tokens)
def __init__(self, text, pos_tagger=None, lemmatizer=None, lexicon=None):
    """
    Store the raw text, normalize it and trigger the processing.

    text -- raw input text, kept untouched in `_raw_text`.
    pos_tagger -- tagger to use; when None, a PosTagger bound to this
                  instance's lexicon is created.
    lemmatizer -- lemmatizer to use; when None, a Lemmatizer built from
                  this instance's lexicon is created.
    lexicon -- lexicon to use; when None, a new Lexicon is created.

    Raises ValueError when the normalized text is empty.
    """
    self._raw_text = text
    self.normalized_text = normalize_text(text)
    if not self.normalized_text:
        # For now, raise a ValueError, because an empty text creates
        # too many problems here and there (zero division, etc.).
        # TODO: make empty texts possible.
        raise ValueError("Can't process an empty text.")
    self.samples = []
    self.keyentities = []
    # Test against None rather than truthiness: a collaborator passed
    # explicitly must be kept even if it evaluates as falsy (e.g. a
    # container-like object that is currently empty).
    self.lexicon = Lexicon() if lexicon is None else lexicon
    if pos_tagger is None:
        pos_tagger = PosTagger(lexicon=self.lexicon)
    self.postagger = pos_tagger
    if lemmatizer is None:
        lemmatizer = Lemmatizer(self.lexicon)
    self.lemmatizer = lemmatizer
    self.make()
    # Set after make(), as in the original; presumably a lazily filled
    # cache -- confirm against the code that reads `_stemms`.
    self._stemms = None
def test_normalize_text(self):
    """
    Check the output of `normalize_text` on a set of sample inputs.

    Uses the assert* methods instead of `failIf`, a deprecated unittest
    alias that was removed in Python 3.12; they also report the
    offending values on failure.
    """
    # Characters that must not survive normalization, and the space
    # expected after an elision.
    self.assertNotIn(u"’", normalize_text(u"’"))
    self.assertIn(u"qu' ", normalize_text(u"lorsqu'il"))
    self.assertNotIn(u"<", normalize_text(u"<b>pouet</b>"))
    # NOTE(review): this input contains no "&"; it was presumably meant
    # to be an HTML entity -- confirm against the project history.
    self.assertNotIn(u"&", normalize_text(u"é"))

    expectations = [
        (u"mange-t-elle", u"mange - t - elle"),
        # Replacing quotes
        (u' "pouet', u' «pouet'),
        (u'pouet". ', u'pouet». '),
        (u'pouet"! ', u'pouet»! '),
        (u'pouet"? ', u'pouet»? '),
        (u'pouet?" ', u'pouet?» '),
        # Hyphenated pronouns and particles are split off; compound
        # words ("sans-toit", "camion-citerne") are left untouched.
        (u'dis-je,', u'dis - je,'),
        (u'dis-tu ', u'dis - tu '),
        (u'entends-toi!', u'entends - toi!'),
        (u'sans-toit', u'sans-toit'),
        (u'dit-il!', u'dit - il!'),
        (u'dit-elle;', u'dit - elle;'),
        (u'dis-le!', u'dis - le!'),
        (u'ce camion-ci.', u'ce camion - ci.'),
        (u'camion-citerne', u'camion-citerne'),
        (u"est-ce\n", u"est - ce\n"),
    ]
    for raw, expected in expectations:
        self.assertEqual(normalize_text(raw), expected)