Example #1
class Preprocess:
    def __init__(self, lang, use_stemmer, stop_words):
        self.lang = lang
        self.use_stemmer = use_stemmer
        self.word_tokenizer = WordTokenizer(locale=lang)
        # As we have only one language currently, no need to
        # check if supported
        self.stemmer = Stemmer.Stemmer(self.lang)
        self.stop_words = stop_words

    def stem(self, tokens):
        return (self.stemmer.stemWords(tokens)
                if self.use_stemmer
                else tokens)

    def remove_accents(self, text):
        return (unicodedata.normalize('NFD', text)
                .encode('ascii', 'ignore')
                .decode("utf-8"))

    def lower(self, text):
        return text.lower()

    def clear_html(self, text):
        # The HTML cleaner raises an exception when it detects malformed
        # markup, e.g. a prompt-like symbol such as '<- '; in that case,
        # fall back to returning the text unchanged.
        try:
            text = clean_html(html.fromstring(text)).text_content()
        except Exception:
            pass
        return text

    def is_token_stopword(self, token):
        return token in self.stop_words

    def is_alpha(self, token):
        return token.isalpha()

    def filtertokens(self, tokens):
        return filter(
            lambda t: self.is_alpha(t) and not self.is_token_stopword(t),
            tokens)

    def transform2words(self, text):
        return self.word_tokenizer.transform(Sequence(text)).tokens()

    def process(self, text):
        fnlist = [
            self.remove_accents,
            self.lower,
            self.clear_html,
            self.transform2words,
            self.filtertokens,
            self.stem
        ]
        return compose(*reversed(fnlist))(text)
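
A minimal usage sketch for the Preprocess class above. The import paths (PyStemmer as Stemmer, lxml for html/clean_html, toolz for compose, polyglot for Sequence and WordTokenizer) and the sample lang/stop-word values are assumptions for illustration, not taken from the original project; lang has to be a value that both the tokenizer locale and PyStemmer accept.

# Assumed dependencies for the example above; the original project's
# actual import paths may differ.
import unicodedata                            # stdlib
import Stemmer                                # PyStemmer (assumed)
from lxml import html
from lxml.html.clean import clean_html
from toolz import compose                     # assumed source of compose()
from polyglot.base import Sequence            # assumed source of Sequence
from polyglot.tokenize import WordTokenizer   # assumed source of WordTokenizer

# 'english' satisfies PyStemmer's algorithm naming; the tokenizer's
# locale handling may require a different value.
pre = Preprocess(lang='english',
                 use_stemmer=False,
                 stop_words={'the', 'were', 'and'})
tokens = list(pre.process('<p>The cats were running &amp; jumping!</p>'))
print(tokens)  # roughly: ['cats', 'running', 'jumping']
               # with use_stemmer=True, roughly: ['cat', 'run', 'jump']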
Example #2
def segment(args):
    lang = args.lang
    w_tokenizer = WordTokenizer(locale=lang)
    s_tokenizer = SentenceTokenizer(locale=lang)

    if args.only_sent:
        for l in args.input:
            seq = Sequence(l)
            if not seq.empty(): _print(u'\n'.join(s_tokenizer.transform(seq)))

    elif args.only_word:
        for l in args.input:
            seq = Sequence(l)
            if not seq.empty(): _print(u' '.join(w_tokenizer.transform(seq)))

    else:
        for l in args.input:
            seq = Sequence(l)
            sents = s_tokenizer.transform(seq)
            words = w_tokenizer.transform(seq)
            for tokenized_sent in words.split(sents):
                if not tokenized_sent.empty():
                    _print(u' '.join(tokenized_sent.tokens()))
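
The segment function above reads an argparse-style namespace: args.lang, args.input (an iterable of text lines), and the args.only_sent / args.only_word flags; it also relies on a _print helper defined elsewhere in the same module. Below is a minimal driver sketch; the flag names and defaults are illustrative assumptions, not the project's actual CLI.

# Hypothetical command-line driver for segment(); argument names and
# defaults are assumptions for illustration only.
import argparse
import sys

parser = argparse.ArgumentParser(description='Sentence/word segmentation.')
parser.add_argument('--lang', default='en',
                    help='locale passed to the tokenizers')
parser.add_argument('--only-sent', dest='only_sent', action='store_true',
                    help='emit one sentence per line')
parser.add_argument('--only-word', dest='only_word', action='store_true',
                    help='emit space-separated word tokens')
parser.add_argument('input', nargs='?', type=argparse.FileType('r'),
                    default=sys.stdin,
                    help='text source; any iterable of lines works')
args = parser.parse_args()
segment(args)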
Example #3
def segment(args):
  lang = args.lang
  w_tokenizer = WordTokenizer(locale=lang)
  s_tokenizer = SentenceTokenizer(locale=lang)

  if args.only_sent:
    for l in args.input:
      seq = Sequence(l)
      if not seq.empty(): _print(s_tokenizer.transform(seq))

  elif args.only_word:
    for l in args.input:
      seq = Sequence(l)
      if not seq.empty(): _print(w_tokenizer.transform(seq))

  else:
    for l in args.input:
      seq = Sequence(l)
      sents = s_tokenizer.transform(seq)
      words = w_tokenizer.transform(seq)
      for tokenized_sent in words.split(sents):
        if not tokenized_sent.empty():
          _print(u' '.join(tokenized_sent.tokens()))