Example #1
import numpy as np

# _gen_parse_matrix, gen_split_mask, _combine_matrix_rows and LaToken are
# assumed to be defined elsewhere in this module.
def featurize(text: str):
    '''
    Tokenize text using the module-level split mask generation function
    (gen_split_mask), yielding each token tagged with its features.

    :param text: The input text to tokenize
    '''
    m = _gen_parse_matrix(text)
    splits = gen_split_mask(m)
    non_zero = np.nonzero(splits)[0]
    textlen = len(text)
    if len(non_zero) > 0:
        str_idx = non_zero[0]
        for end_idx in non_zero[1:]:
            token = text[str_idx:end_idx].strip()
            if token:
                yield LaToken(
                    token, str_idx, end_idx,
                    _combine_matrix_rows(
                        # int32, not int8: np.int8 overflows for indices > 127
                        m, np.arange(str_idx, end_idx, dtype=np.int32)))
            str_idx = end_idx
        last_token = text[str_idx:].strip()
        if last_token:
            yield LaToken(
                last_token, str_idx, textlen,
                _combine_matrix_rows(
                    m, np.arange(str_idx, textlen, dtype=np.int32)))
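For context, a minimal driver for featurize might look like the sketch below. It assumes LaToken exposes text, start_idx and end_idx attributes, as the demo in Example #3 suggests; the sample sentence is arbitrary.

for tok in featurize('This is a #test!'):
    # each LaToken carries its text span plus the combined feature row
    print(tok.text, tok.start_idx, tok.end_idx)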
Example #2
import numpy as np

# _gen_parse_matrix and gen_split_mask are assumed to be defined elsewhere
# in this module.
def tokenize(text: str):
    '''
    Tokenize text using the module-level split mask generation function
    (gen_split_mask), yielding each token.

    :param text: The input text to tokenize
    '''
    m = _gen_parse_matrix(text)
    splits = gen_split_mask(m)
    non_zero = np.nonzero(splits)[0]
    if len(non_zero) > 0:
        str_idx = non_zero[0]
        for end_idx in non_zero[1:]:
            token = text[str_idx:end_idx].strip()
            if token:
                yield token
            str_idx = end_idx
        last_token = text[str_idx:].strip()
        if last_token:
            yield last_token
    else:
        # no split points were found: yield the empty string as a sentinel
        yield ''
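A quick way to exercise tokenize is to materialize the generator, as in the sketch below; which tokens come back depends entirely on the split rules encoded by gen_split_mask, so the call is illustrative only.

print(list(tokenize('This is a #test!')))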
Example #3
import numpy as np

# Demo driver. _gen_parse_matrix, gen_split_mask, tokenize and featurize are
# assumed to be defined earlier in this module.


if __name__ == "__main__":
    text = "This is a #test! Testing, Testing, 1 2 3"
    #text = "can’t wait to get my glasses back 🤓"
    #text = """IKR!! IM LIKE \""WHERE'S MY DADDY AT? 👀) https://t.co/jM3qLZijMc"""
    #text = '$#@^:a./'
    print(f'text={text}')
    m = _gen_parse_matrix(text)
    print(m)
    splits = gen_split_mask(m)
    print(f'splits={splits}')
    non_zero = np.nonzero(splits)[0]
    print(f'non_zero={non_zero}')
    for token in tokenize(text):
        print(f'"{token}"')
    for token in featurize(text):
        print(f'\n"{token.text}" ({token.start_idx}, {token.end_idx}) '
              f'weight={token.weight()}\n{token.feature_weights()}')
Example #4
import numpy as np

# _gen_parse_matrix is assumed to be defined elsewhere in this module.
def gen_parse_matrix(text: str) -> np.ndarray:
    '''
    Generate a feature matrix for parsing (tokenizing) a string, where each
    letter has a row of features as identified in offsets.py.
    '''
    return _gen_parse_matrix(text)
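Because the matrix holds one feature row per character, a cheap sanity check is to compare its first dimension against the input length, as sketched below; the number of feature columns comes from offsets.py and is not assumed here.

sample = 'hello'
m = gen_parse_matrix(sample)
assert m.shape[0] == len(sample)  # one feature row per character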
Example #5
# _gen_parse_matrix is assumed to be defined elsewhere in this module.
class Stub:  # hypothetical wrapper: the original enclosing class is not shown
    def tokenize(self, text):
        # Builds the parse matrix and discards the result; returning None
        # suggests a stub for benchmarking the matrix step in isolation.
        _gen_parse_matrix(text)
        return None
Example #6
# _gen_parse_matrix and gen_split_mask are assumed to be defined elsewhere
# in this module.
class Stub:  # hypothetical wrapper: the original enclosing class is not shown
    def tokenize(self, text):
        # Builds the parse matrix and the split mask, discarding both; a stub
        # for benchmarking matrix and mask generation together.
        gen_split_mask(_gen_parse_matrix(text))
        return None
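If these return-None methods are indeed benchmarking stubs, either one can be timed with the standard timeit module, as in the sketch below; Stub is the hypothetical wrapper class introduced above, not part of the original code.

import timeit

stub = Stub()
# time 1000 matrix-plus-mask passes over a short sample sentence
elapsed = timeit.timeit(lambda: stub.tokenize('This is a #test!'), number=1000)
print(f'{elapsed:.3f}s for 1000 calls')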