Example #1
    def prepare(self, text, tagger, lemmatizer):
        """
        Given a raw text, clean it, and make tokens and samples.

        (Maybe this method should be in the TextManager class.)
        """
        text = normalize_text(text)
        tokenized_text = self.tokenize(text)
        self._samples, self._tokens = self.instantiate_text(tokenized_text)
        tagger.tag_all(self.tokens)
        lemmatizer.do(self.tokens)
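
A minimal driver sketch for the method above; the host class name (TextProcessor) is hypothetical, while Lexicon, PosTagger and Lemmatizer mirror the defaults used in Example #2 below.

# Hypothetical wiring; every name except prepare(), tag_all() and do()
# (and the helpers shown in Example #2) is assumed for illustration.
lexicon = Lexicon()
tagger = PosTagger(lexicon=lexicon)   # used by tag_all() on the token list
lemmatizer = Lemmatizer(lexicon)      # used by do() on the token list

processor = TextProcessor()           # hypothetical class exposing prepare()
processor.prepare(u"Le chat dort.", tagger, lemmatizer)
# After prepare(), the text is normalized, tokenized, split into
# samples/tokens, then POS-tagged and lemmatized.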
Example #2
 def __init__(self, text, pos_tagger=None, lemmatizer=None, lexicon=None):
     self._raw_text = text
     self.normalized_text = normalize_text(text)
     if len(self.normalized_text) == 0:
         # For now, raise a ValueError, because an empty text creates
         # too many problems here and there (zero division, etc.).
         # TODO: make empty texts possible.
         raise ValueError("Can't process an empty text.")
     self.samples = []
     self.keyentities = []
     self.lexicon = lexicon or Lexicon()
     self.postagger = pos_tagger or PosTagger(lexicon=self.lexicon)
     self.lemmatizer = lemmatizer or Lemmatizer(self.lexicon)
     self.make()
     self._stemms = None
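
A short usage sketch for the constructor above; the class name Text is an assumption (the snippet does not name it), the rest follows from the code itself.

# Hypothetical usage; only the class name "Text" is invented here.
doc = Text(u"Le camion-citerne est arrivé ce matin.")
print(doc.normalized_text)   # result of normalize_text()
print(doc.samples)           # starts as [], presumably filled by make()

# Empty input is rejected explicitly (assuming normalize_text(u"") stays empty):
try:
    Text(u"")
except ValueError as exc:
    print(exc)               # "Can't process an empty text."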
Example #3
 def test_normalize_text(self):
     self.assertNotIn(u"’", normalize_text(u"’"))
     self.assertIn(u"qu' ", normalize_text(u"lorsqu'il"))
     self.assertNotIn(u"<", normalize_text(u"<b>pouet</b>"))
     self.assertNotIn(u"&", normalize_text(u"&eacute;"))
     self.assertEqual(normalize_text(u"mange-t-elle"), u"mange - t - elle")
     # Replacing straight quotes with guillemets
     self.assertEqual(normalize_text(u' "pouet'), u' «pouet')
     self.assertEqual(normalize_text(u'pouet". '), u'pouet». ')
     self.assertEqual(normalize_text(u'pouet"! '), u'pouet»! ')
     self.assertEqual(normalize_text(u'pouet"? '), u'pouet»? ')
     self.assertEqual(normalize_text(u'pouet?" '), u'pouet?» ')
     # Pronouns after a hyphen are split off; compound nouns are kept
     self.assertEqual(normalize_text(u'dis-je,'), u'dis - je,')
     self.assertEqual(normalize_text(u'dis-tu '), u'dis - tu ')
     self.assertEqual(normalize_text(u'entends-toi!'), u'entends - toi!')
     self.assertEqual(normalize_text(u'sans-toit'), u'sans-toit')
     self.assertEqual(normalize_text(u'dit-il!'), u'dit - il!')
     self.assertEqual(normalize_text(u'dit-elle;'), u'dit - elle;')
     self.assertEqual(normalize_text(u'dis-le!'), u'dis - le!')
     self.assertEqual(normalize_text(u'ce camion-ci.'), u'ce camion - ci.')
     self.assertEqual(normalize_text(u'camion-citerne'), u'camion-citerne')
     self.assertEqual(normalize_text(u"est-ce\n"), u"est - ce\n")