def tokenizer(text, tokenizer_fn, to_lower=False):
    """Clean ``text`` with ftfy, tokenize it with ``tokenizer_fn``, normalize the
    tokens, and return them as one space-joined string (or None if nothing usable remains)."""
    text = ftfy.fix_text(text)
    if to_lower:
        text = text.lower()
    try:
        seq = Sequence(text.strip())
    except ValueError:
        return
    tokens = tokenizer_fn.transform(seq)
    new_tokens = []
    for token in tokens:
        if token.strip() == '':
            continue
        elif PUNCTSYM.search(token):
            # Collapse punctuation/symbol tokens into a single placeholder.
            token = '$'
        elif LIKENUM.search(token):
            # Collapse number-like tokens into a single placeholder.
            token = '0'
        elif LIKEUNIT.search(token):
            # Replace the numeric part with the placeholder and detach the unit.
            token = LIKEUNIT.sub(r'0 \1', token)
        elif token == "can't":
            token = 'can not'
        elif CONTRACTION1.search(token):
            # Split contractions like "it's" into "it 's".
            token = CONTRACTION1.sub(r"\1 '\2", token)
        elif CONTRACTION2.search(token):
            # Split negated contractions like "isn't" into "is n't".
            token = CONTRACTION2.sub(r"\1 n't", token)
        new_tokens.append(token)
    if new_tokens:
        return ' '.join(new_tokens).strip()
    return
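A minimal usage sketch for the function above. PUNCTSYM, LIKENUM, LIKEUNIT, CONTRACTION1, and CONTRACTION2 are defined elsewhere in the original module, so the patterns below are illustrative placeholders only, and the polyglot import paths are assumptions.

import re
import ftfy                                   # needed by tokenizer() above
from polyglot.base import Sequence            # assumed import path
from polyglot.tokenize import WordTokenizer   # assumed import path

# Placeholder patterns -- the original module defines its own versions.
PUNCTSYM = re.compile(r"^[^\w\s]+$")
LIKENUM = re.compile(r"^\d[\d.,]*$")
LIKEUNIT = re.compile(r"^\d+([A-Za-z]+)$")
CONTRACTION1 = re.compile(r"^(\w+)'(s|m|re|ve|ll|d)$")
CONTRACTION2 = re.compile(r"^(\w+)n't$")

w_tok = WordTokenizer(locale='en')
print(tokenizer("He hasn't paid the 5kg fee!", w_tok, to_lower=True))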
Example #2
def transform(self, sequence):
    seq = Sequence(sequence.text)
    seq.idx = [0]
    for segment in sequence:
        # Accumulate break offsets relative to the start of each segment.
        offset = seq.idx[-1]
        self.breaker.setText(segment)
        seq.idx.extend([offset + x for x in self.breaker])
    return seq
Example #3
def sentSegment(par, lang):
    # Prefer NLTK's sent_tokenize; fall back to polyglot's SentenceTokenizer.
    try:
        sents = sent_tokenize(par, lang)
    except Exception:
        try:
            par_seq = Sequence(par)
            st = SentenceTokenizer(locale=lang_map[lang])
            sents = [sent for sent in st.transform(par_seq)]
        except Exception:
            return None
    return sents
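sentSegment depends on two names defined elsewhere: NLTK's sent_tokenize and a lang_map from language names to locale codes. A minimal wiring sketch, with lang_map as a hypothetical placeholder and the polyglot import paths assumed:

from nltk.tokenize import sent_tokenize
from polyglot.base import Sequence               # assumed import path
from polyglot.tokenize import SentenceTokenizer  # assumed import path

# Hypothetical mapping; the original module defines its own lang_map.
lang_map = {'english': 'en', 'spanish': 'es', 'german': 'de'}

# NLTK handles 'english' directly; the polyglot fallback only runs if it fails.
print(sentSegment("Dr. Smith stayed home. He was tired.", 'english'))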
Example #4
def segment(args):
    lang = args.lang
    w_tokenizer = WordTokenizer(locale=lang)
    s_tokenizer = SentenceTokenizer(locale=lang)

    if args.only_sent:
        for l in args.input:
            seq = Sequence(l)
            if not seq.empty(): _print(u'\n'.join(s_tokenizer.transform(seq)))

    elif args.only_word:
        for l in args.input:
            seq = Sequence(l)
            if not seq.empty(): _print(u' '.join(w_tokenizer.transform(seq)))

    else:
        for l in args.input:
            seq = Sequence(l)
            sents = s_tokenizer.transform(seq)
            words = w_tokenizer.transform(seq)
            for tokenized_sent in words.split(sents):
                if not tokenized_sent.empty():
                    _print(u' '.join(tokenized_sent.tokens()))
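segment() only needs an args object with lang, input (any iterable of lines), and the two mode flags, assuming WordTokenizer, SentenceTokenizer, and Sequence are imported as in the other examples. A minimal driver sketch; the _print stand-in is an assumption.

from argparse import Namespace

_print = print   # stand-in for the module's _print helper (assumption)

args = Namespace(
    lang='en',
    only_sent=True,    # print one sentence per line
    only_word=False,
    input=["First sentence. Second sentence.\n"],   # any iterable of lines
)
segment(args)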
Example #5

def _create_sentence_objects(self):
    '''Returns a list of Sentence objects from the raw text.'''
    sentence_objects = []
    sent_tokenizer = SentenceTokenizer(locale=self.language.code)
    seq = Sequence(self.raw)
    seq = sent_tokenizer.transform(seq)
    for start_index, end_index in zip(seq.idx[:-1], seq.idx[1:]):
        # Sentences share the same models as their parent blob
        sent = seq.text[start_index:end_index].strip()
        if not sent: continue
        s = Sentence(sent, start_index=start_index, end_index=end_index)
        s.detected_languages = self.detected_languages
        sentence_objects.append(s)
    return sentence_objects
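The loop above slices sentences out of the raw text by pairing consecutive offsets in seq.idx. The same pattern works outside a blob; a minimal sketch assuming the polyglot import paths:

from polyglot.base import Sequence               # assumed import path
from polyglot.tokenize import SentenceTokenizer  # assumed import path

raw = "First sentence here. Second one!"
seq = SentenceTokenizer(locale='en').transform(Sequence(raw))
# seq.idx holds boundary offsets; each consecutive pair delimits one sentence.
for start, end in zip(seq.idx[:-1], seq.idx[1:]):
    print(repr(seq.text[start:end].strip()))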
Example #6
def tokenizer(text, tokenizer_fn):
    seq = Sequence(text.strip())
    return filter(lambda w: w != ' ', tokenizer_fn.transform(seq))
Example #7

def tokens(self):
    """Return a list of tokens, using this blob's tokenizer object
    (defaults to :class:`WordTokenizer <textblob.tokenizers.WordTokenizer>`).
    """
    seq = self.word_tokenizer.transform(Sequence(self.raw))
    return WordList(seq.tokens(), parent=self, language=self.language.code)
Example #8
def transform2words(self, text):
    return self.word_tokenizer.transform(Sequence(text)).tokens()
Example #9
    def tokens(self):
        """Return a list of tokens, using this blob's tokenizer object
        (defaults to :class:`WordTokenizer <textblob.tokenizers.WordTokenizer>`).
        """
        seq = self.word_tokenizer.transform(Sequence(self.raw))
        tokens = WordList(seq.tokens(),
                          parent=self,
                          language=self.language.code)

        fix_hyphen = []
        i = 0
        # SIDE DELETE
        # while i < len(tokens):
        #     hyphen_word = ''
        #     while i + 3 < len(tokens) and tokens[i+1] == '-' and tokens[i+2] not in string.punctuation:
        #         if tokens[i+3] == '-':
        #             hyphen_word += tokens[i] + tokens[i+1]
        #             i += 2
        #             if i + 2 < len(tokens):
        #                 if tokens[i+1] == '-' and tokens[i+2] not in string.punctuation:
        #                     hyphen_word += tokens[i] + tokens[i + 1] + tokens[i+2]
        #                     # i+=3  # SIDE delete error [list out bound]
        #                     i += 1  # SIDE ADD
        #                     if tokens[i] != '-':
        #                         break
        #         else:
        #             hyphen_word += tokens[i] + tokens[i + 1] + tokens[i + 2]
        #             i += 3
        #             if tokens[i] != '-':
        #                 break
        #     if hyphen_word:
        #         fix_hyphen.append(hyphen_word)
        #         continue
        #     else:
        #         if i + 2 < len(tokens):
        #             if tokens[i] not in string.punctuation and tokens[i+1] == '-' and tokens[i+2] not in string.punctuation:
        #                     fix_hyphen.append(tokens[i]+tokens[i+1]+tokens[i+2])
        #                     i += 3
        #                     continue
        #     fix_hyphen.append(tokens[i])
        #     i+=1

        # SIDE ADD
        # Re-attach a "-" token and the token after it to the previously kept
        # token, so hyphenated words split by the tokenizer are rejoined.
        while i < len(tokens):
            hyphen_word = ''
            if fix_hyphen and tokens[i] == '-' and i + 1 < len(
                    tokens) and tokens[i + 1] not in string.punctuation:
                hyphen_word = tokens[i] + tokens[i + 1]
            if hyphen_word:
                fix_hyphen[-1] = fix_hyphen[-1] + hyphen_word
                i += 1
            else:
                fix_hyphen.append(tokens[i])
            i += 1

        if self.split_apostrophe:
            # Split tokens on apostrophes, keeping each apostrophe as its own token.
            fix_apostrophe = []
            for token in fix_hyphen:
                if '\'' in token:
                    split = token.split('\'')
                    for i, t in enumerate(split):
                        fix_apostrophe.append(t)
                        if i != len(split) - 1:
                            fix_apostrophe.append('\'')
                else:
                    fix_apostrophe.append(token)
            return WordList(fix_apostrophe,
                            parent=self,
                            language=self.language.code)
        else:
            return WordList(fix_hyphen,
                            parent=self,
                            language=self.language.code)
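The hyphen-rejoining loop in Example #9 can be exercised on its own. A sketch that applies the same logic to a plain token list (the function name is mine):

import string

def rejoin_hyphens(tokens):
    # Same idea as the SIDE ADD loop above: glue "-" and the token after it
    # onto the previously emitted token.
    fixed = []
    i = 0
    while i < len(tokens):
        if fixed and tokens[i] == '-' and i + 1 < len(tokens) \
                and tokens[i + 1] not in string.punctuation:
            fixed[-1] += tokens[i] + tokens[i + 1]
            i += 2
        else:
            fixed.append(tokens[i])
            i += 1
    return fixed

print(rejoin_hyphens(['state', '-', 'of', '-', 'the', '-', 'art', 'model']))
# -> ['state-of-the-art', 'model']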