def customize_tokenizer(text, do_lower_case=False):
  """Split text into tokens, isolating CJK chars, punctuation, whitespace
  and control characters as standalone tokens.

  Args:
    text: input text; anything `tokenization.convert_to_unicode` accepts.
    do_lower_case: if True, lowercase the text before splitting.

  Returns:
    A list of token strings (the result of str.split on the spaced text).
  """
  # BasicTokenizer is instantiated only for its _is_chinese_char helper.
  tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
  text = tokenization.convert_to_unicode(text)
  # Collect pieces in a list and join once: per-char `temp_x += ...` is
  # quadratic on long inputs.
  pieces = []
  for c in text:
    if (tokenizer._is_chinese_char(ord(c))
        or tokenization._is_punctuation(c)
        or tokenization._is_whitespace(c)
        or tokenization._is_control(c)):
      pieces.append(" " + c + " ")
    else:
      pieces.append(c)
  temp_x = "".join(pieces)
  if do_lower_case:
    temp_x = temp_x.lower()
  return temp_x.split()  # so here we end up with a list of tokens
예제 #2
0
 def _joinTokens_orig(self, example):
     """Reassemble example.tokens0 into a single string, gluing "##"
     wordpiece continuations onto the preceding token.

     A "##" piece removes the space token(s) inserted before it, unless
     the token before that space ends in punctuation (the space is kept
     so punctuation stays detached).
     """
     pieces = []
     for piece in example.tokens0:
         if piece.startswith("##"):
             # Drop trailing space tokens, but stop if the token before
             # the space ends with a punctuation character.
             while pieces and pieces[-1] == " ":
                 if (len(pieces) > 1
                         and tokenizationOrig._is_punctuation(pieces[-2][-1])):
                     break
                 pieces.pop()
             piece = piece[2:]
         pieces.append(piece)
     return "".join(pieces)
예제 #3
0
 def test_is_punctuation(self):
     """_is_punctuation accepts punctuation chars, rejects letters/spaces."""
     for ch in [u"-", u"$", u"`", u"."]:
         self.assertTrue(tokenization._is_punctuation(ch))
     for ch in [u"A", u" "]:
         self.assertFalse(tokenization._is_punctuation(ch))
예제 #4
0
def customize_tokenizer(text, do_lower_case=True):
    """Split text into tokens, isolating CJK chars, punctuation,
    whitespace and control characters as standalone tokens.

    Args:
        text: input text; anything `tokenization.convert_to_unicode`
            accepts.
        do_lower_case: if True, lowercase the text before splitting.

    Returns:
        A list of token strings (str.split on the spaced-out text).
    """
    text = tokenization.convert_to_unicode(text)
    # Collect pieces in a list and join once: per-char `temp_x += ...`
    # is quadratic on long inputs.
    pieces = []
    for c in text:
        if (_is_chinese_char(ord(c))
                or tokenization._is_punctuation(c)
                or tokenization._is_whitespace(c)
                or tokenization._is_control(c)):
            pieces.append(" " + c + " ")
        else:
            pieces.append(c)
    temp_x = "".join(pieces)
    if do_lower_case:
        temp_x = temp_x.lower()
    return temp_x.split()
예제 #5
0
  def test_is_punctuation(self):
    """_is_punctuation accepts punctuation chars, rejects letters/spaces."""
    for ch in [u"-", u"$", u"`", u"."]:
      self.assertTrue(tokenization._is_punctuation(ch))

    for ch in [u"A", u" "]:
      self.assertFalse(tokenization._is_punctuation(ch))
예제 #6
0
def _is_chinese_or_punctuation(ch):
    """Return a truthy value when ch is a CJK character or punctuation.

    Short-circuits exactly like `a or b`: the CJK check's result is
    returned when truthy, otherwise the punctuation check's result.
    """
    result = _is_chinese_char(ch)
    if not result:
        result = _is_punctuation(ch)
    return result