Example No. 1
def span_tokenize(self, s):
    # Yield (start, end) character offsets for the lines of s; the split
    # pattern depends on the blanklines setting.
    if self._blanklines == 'keep':
        for span in string_span_tokenize(s, r'\n'):
            yield span
    else:
        for span in regexp_span_tokenize(s, r'\n(\s+\n)*'):
            yield span
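This looks like the span_tokenize method of NLTK's LineTokenizer. A minimal usage sketch (the sample string is invented here) that prints each (start, end) span together with the slice it covers:

from nltk.tokenize import LineTokenizer

sample = "first line\n\nsecond line\n"
for start, end in LineTokenizer().span_tokenize(sample):
    print((start, end), repr(sample[start:end]))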
Example No. 3
def span_tokenize(self, s):
    if self._blanklines == "keep":
        for span in string_span_tokenize(s, r"\n"):
            yield span
    else:
        for span in regexp_span_tokenize(s, r"\n(\s+\n)*"):
            yield span
Example No. 5
from nltk.tokenize.util import string_span_tokenize

def span_tokenizer(sent):
    # Space-tokenize sent and return (start, end, token_text) triples.
    spans = string_span_tokenize(sent, " ")
    list_span_tok = list()
    for span in spans:
        list_span_tok.append((span[0], span[1], sent[span[0]:span[1]]))
    return list_span_tok
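A quick, illustrative call to the helper above (offsets worked out by hand for this particular sample sentence):

print(span_tokenizer("She is a meritorious student"))
# [(0, 3, 'She'), (4, 6, 'is'), (7, 8, 'a'), (9, 20, 'meritorious'), (21, 28, 'student')]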
Example No. 6
def fun_1_1_5():
    import nltk
    from nltk.tokenize import RegexpTokenizer
    from nltk.tokenize import regexp_tokenize
    tokenizer = RegexpTokenizer(r"[\w]+")
    print("RegexpTokenizer:", tokenizer.tokenize(
        "Don't hesitate to ask questions"))
    print("regexp_tokenize:", regexp_tokenize(
        "Don't hesitate to ask questions", pattern=r"\w+|\$[\d\.]+|\S+"))
    # Split on whitespace (gaps=True treats the pattern as the separator)
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    print("RegexpTokenizer:", tokenizer.tokenize(
        "Don't hesitate to ask questions"))
    # Select only the words that start with a capital letter
    sent = " She secured 90.56 % in class X \n. She is a meritorious student"
    capt = RegexpTokenizer(r'[A-Z]\w+')
    print("RegexpTokenizer:", capt.tokenize(sent))
    # BlanklineTokenizer, a subclass of RegexpTokenizer, uses a predefined
    # regular expression
    from nltk.tokenize import BlanklineTokenizer
    print("BlanklineTokenizer:", BlanklineTokenizer().tokenize(sent))
    # A string can be split on spaces, gaps, newlines and so on
    from nltk.tokenize import WhitespaceTokenizer
    print("WhitespaceTokenizer:", WhitespaceTokenizer().tokenize(sent))
    # WordPunctTokenizer splits text with the regular expression \w+|[^\w\s]+,
    # separating alphabetic from non-alphabetic characters
    from nltk.tokenize import WordPunctTokenizer
    print("WordPunctTokenizer:", WordPunctTokenizer().tokenize(sent))
    # Splitting with the built-in split() method
    print("split():", sent.split())
    print("split(' '):", sent.split(' '))
    print("split('\\n'):", sent.split('\n'))
    # Like sent.split('\n'), LineTokenizer splits the text into lines
    from nltk.tokenize import LineTokenizer
    print("LineTokenizer:", LineTokenizer().tokenize(sent))
    print("LineTokenizer:", LineTokenizer(blanklines='keep').tokenize(sent))
    print("LineTokenizer:", LineTokenizer(blanklines='discard').tokenize(sent))
    # SpaceTokenizer works much like sent.split(' ')
    from nltk.tokenize import SpaceTokenizer
    print("SpaceTokenizer:", SpaceTokenizer().tokenize(sent))
    # The nltk.tokenize.util module tokenizes by returning a sequence of
    # tuples giving each token's position and offset within the sentence
    print("Token spans:", list(WhitespaceTokenizer().span_tokenize(sent)))
    # Given a sequence of spans, the corresponding relative spans can be returned
    from nltk.tokenize.util import spans_to_relative
    print("Position and offset:", list(
        spans_to_relative(WhitespaceTokenizer().span_tokenize(sent))))
    # nltk.tokenize.util.string_span_tokenize(sent, separator) returns the
    # offsets of the tokens in sent by splitting at each occurrence of the
    # separator
    from nltk.tokenize.util import string_span_tokenize
    print("Token spans:", list(string_span_tokenize(sent, " ")))
Example No. 7
def token_to_char(text: str, sep=" ") -> np.ndarray:
    """Takes a string, space tokenizes the string, and returns a mapping from tokens to chars.

    Examples:
        >>> token_to_char("testing 1, 2, 3")
        # produces a (m) token by (M) char matrix:

                   t e s t i n g   1 ,   2 ,   3
         testing [[1 1 1 1 1 1 1 0 0 0 0 0 0 0 0]
              1,  [0 0 0 0 0 0 0 0 1 1 0 0 0 0 0]
              2,  [0 0 0 0 0 0 0 0 0 0 0 1 1 0 0]
              3   [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]]

    Args:
        text (str): string to tokenize and build the token to char mapping.

    Returns:
        np.ndarray mapping from (m) tokens to (M) chars.

    """
    spans = string_span_tokenize(text, sep=sep)
    return _mat_from_spans_dense(tuple(spans), len(text))
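The helper _mat_from_spans_dense is not shown on this page; the sketch below is a guess at what it might look like, assuming it only fills a dense (num_tokens, num_chars) indicator matrix as described in the docstring above (the name and signature come from the call site, everything else is an assumption):

import numpy as np

def _mat_from_spans_dense(spans, n_chars):
    # Hypothetical reconstruction: row i marks the characters covered by span i.
    mat = np.zeros((len(spans), n_chars), dtype=int)
    for i, (start, end) in enumerate(spans):
        mat[i, start:end] = 1
    return mat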
Example No. 8
def span_tokenize(self, s):
    for span in string_span_tokenize(s, self._string):
        yield span
Example No. 9
import nltk
from nltk.tokenize.util import string_span_tokenize

sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
print(list(string_span_tokenize(sent, " ")))
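The printed spans are (start, end) character offsets into sent; slicing the string with them recovers the token text. A small illustrative loop:

from nltk.tokenize.util import string_span_tokenize

sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
for start, end in string_span_tokenize(sent, " "):
    print((start, end), repr(sent[start:end]))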
Example No. 10
def span_tokenize(self, s):
    if self._blanklines == "keep":
        yield from string_span_tokenize(s, r"\n")
    else:
        yield from regexp_span_tokenize(s, r"\n(\s+\n)*")
Example No. 11
def span_tokenize(self, s):
    yield from string_span_tokenize(s, self._string)
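In NLTK this method is inherited by the simple string tokenizers, where self._string is the literal separator (a single space for SpaceTokenizer, a tab for TabTokenizer). A quick check with an invented sample string:

from nltk.tokenize import SpaceTokenizer, TabTokenizer

sample = "col1\tcol2 col3"
print(list(SpaceTokenizer().span_tokenize(sample)))  # split at each space
print(list(TabTokenizer().span_tokenize(sample)))    # split at each tab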
Example No. 12
import nltk
from nltk.tokenize.util import string_span_tokenize

sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
print(list(string_span_tokenize(sent, " ")))
from nltk.tokenize import (RegexpTokenizer, regexp_tokenize,
                           BlanklineTokenizer, WhitespaceTokenizer,
                           LineTokenizer, SpaceTokenizer)
from nltk.tokenize.util import spans_to_relative, string_span_tokenize

# Sample strings (assumed here; the original snippet defines text and text2
# elsewhere)
text = "Don't hesitate to ask questions"
text2 = " She secured 90.56 % in class X \n. She is a meritorious student\n"

print(regexp_tokenize(text, pattern=r'\w+|\$[\d\.]+|\S+'))

# Tokenize on whitespace (gaps=True treats the pattern as the separator)
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
print(tokenizer.tokenize(text))

# Select only words starting with capital letters
capt = RegexpTokenizer(r'[A-Z]\w+')
print(capt.tokenize(text2))

print(BlanklineTokenizer().tokenize(text2))

print(WhitespaceTokenizer().tokenize(text2))

print(LineTokenizer(blanklines='keep').tokenize(text2))
print(LineTokenizer(blanklines='discard').tokenize(text2))

# SpaceTokenizer works much like .split(' ')
print(SpaceTokenizer().tokenize(text2))

# Returns the sequence of tuples that are offsets of the tokens
# in a sentence:
print(list(WhitespaceTokenizer().span_tokenize(text2)))

# Returns the sequence of relative spans
print(list(spans_to_relative(WhitespaceTokenizer().span_tokenize(text2))))

# Returns the offsets of tokens in text2 by splitting at each occurrence
# of the separator:
print(list(string_span_tokenize(text2, " ")))
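For reference, spans_to_relative (used above) turns absolute (start, end) offsets into relative pairs, in effect (gap since the previous token's end, token length). A small standalone check with an invented string:

from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize.util import spans_to_relative

demo = "a bb  ccc"
absolute = list(WhitespaceTokenizer().span_tokenize(demo))
print(absolute)                           # absolute (start, end) offsets
print(list(spans_to_relative(absolute)))  # relative (gap, length) pairs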