Example #1
import numpy as np

# _gen_parse_matrix, gen_split_mask, _combine_matrix_rows and LaToken are
# assumed to be defined elsewhere in this module.
def featurize(text: str):
    '''
    Tokenize text using the module-level split mask generation function
    (gen_split_mask), yielding each token tagged with its features.

    :param text: The input text to tokenize
    '''
    m = _gen_parse_matrix(text)
    splits = gen_split_mask(m)
    non_zero = np.nonzero(splits)[0]
    textlen = len(text)
    if len(non_zero) > 0:
        str_idx = non_zero[0]
        for end_idx in non_zero[1:]:
            token = text[str_idx:end_idx].strip()
            if token:
                yield LaToken(
                    token, str_idx, end_idx,
                    _combine_matrix_rows(
                        # int32, not int8: np.int8 overflows for indices > 127
                        m, np.arange(str_idx, end_idx, dtype=np.int32)))
            str_idx = end_idx
        last_token = text[str_idx:].strip()
        if last_token:
            yield LaToken(
                last_token, str_idx, textlen,
                _combine_matrix_rows(
                    m, np.arange(str_idx, textlen, dtype=np.int32)))
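For context, a minimal driver for featurize might look like the sketch below. It assumes LaToken exposes text, start_idx and end_idx attributes, as the demo in Example #3 suggests; the sample sentence is arbitrary.

for tok in featurize('This is a #test!'):
    # each LaToken carries its text span plus the combined feature row
    print(tok.text, tok.start_idx, tok.end_idx)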
Example #2
import numpy as np

# _gen_parse_matrix and gen_split_mask are assumed to be defined elsewhere
# in this module.
def tokenize(text: str):
    '''
    Tokenize text using the module-level split mask generation function
    (gen_split_mask), yielding each token.

    :param text: The input text to tokenize
    '''
    m = _gen_parse_matrix(text)
    splits = gen_split_mask(m)
    non_zero = np.nonzero(splits)[0]
    if len(non_zero) > 0:
        str_idx = non_zero[0]
        for end_idx in non_zero[1:]:
            token = text[str_idx:end_idx].strip()
            if token:
                yield token
            str_idx = end_idx
        last_token = text[str_idx:].strip()
        if last_token:
            yield last_token
    else:
        # no split points were found: yield the empty string as a sentinel
        yield ''
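A quick way to exercise tokenize is to materialize the generator, as in the sketch below; which tokens come back depends entirely on the split rules encoded by gen_split_mask, so the call is illustrative only.

print(list(tokenize('This is a #test!')))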
Example #3
import numpy as np

# Demo driver. _gen_parse_matrix, gen_split_mask, tokenize and featurize are
# assumed to be defined earlier in this module.


if __name__ == "__main__":
    text = "This is a #test! Testing, Testing, 1 2 3"
    #text = "can’t wait to get my glasses back 🤓"
    #text = """IKR!! IM LIKE \""WHERE'S MY DADDY AT? 👀) https://t.co/jM3qLZijMc"""
    #text = '$#@^:a./'
    print(f'text={text}')
    m = _gen_parse_matrix(text)
    print(m)
    splits = gen_split_mask(m)
    print(f'splits={splits}')
    non_zero = np.nonzero(splits)[0]
    print(f'non_zero={non_zero}')
    for token in tokenize(text):
        print(f'"{token}"')
    for token in featurize(text):
        print(f'\n"{token.text}" ({token.start_idx}, {token.end_idx}) '
              f'weight={token.weight()}\n{token.feature_weights()}')
Example #4
import numpy as np

# _gen_parse_matrix is assumed to be defined elsewhere in this module.
def gen_parse_matrix(text: str) -> np.ndarray:
    '''
    Generate a feature matrix for parsing (tokenizing) a string, where each
    letter has a row of features as identified in offsets.py.
    '''
    return _gen_parse_matrix(text)
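Because the matrix holds one feature row per character, a cheap sanity check is to compare its first dimension against the input length, as sketched below; the number of feature columns comes from offsets.py and is not assumed here.

sample = 'hello'
m = gen_parse_matrix(sample)
assert m.shape[0] == len(sample)  # one feature row per character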
Example #5
# _gen_parse_matrix is assumed to be defined elsewhere in this module.
class Stub:  # hypothetical wrapper: the original enclosing class is not shown
    def tokenize(self, text):
        # Builds the parse matrix and discards the result; returning None
        # suggests a stub for benchmarking the matrix step in isolation.
        _gen_parse_matrix(text)
        return None
Example #6
# _gen_parse_matrix and gen_split_mask are assumed to be defined elsewhere
# in this module.
class Stub:  # hypothetical wrapper: the original enclosing class is not shown
    def tokenize(self, text):
        # Builds the parse matrix and the split mask, discarding both; a stub
        # for benchmarking matrix and mask generation together.
        gen_split_mask(_gen_parse_matrix(text))
        return None
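If these return-None methods are indeed benchmarking stubs, either one can be timed with the standard timeit module, as in the sketch below; Stub is the hypothetical wrapper class introduced above, not part of the original code.

import timeit

stub = Stub()
# time 1000 matrix-plus-mask passes over a short sample sentence
elapsed = timeit.timeit(lambda: stub.tokenize('This is a #test!'), number=1000)
print(f'{elapsed:.3f}s for 1000 calls')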