def featurize(text: str):
    '''Tokenize text, yielding each token tagged with its features.

    Builds a per-character parse feature matrix via the module-level
    ``_gen_parse_matrix`` and derives token boundaries from
    ``gen_split_mask``; each token is emitted as a ``LaToken`` carrying
    its text, start/end indices, and combined feature rows.

    :param text: The input text to tokenize
    '''
    m = _gen_parse_matrix(text)
    splits = gen_split_mask(m)
    non_zero = np.nonzero(splits)[0]
    textlen = len(text)
    if len(non_zero) > 0:
        str_idx, end_idx = non_zero[0], 0
        for end_idx in non_zero[1:]:
            token = text[str_idx:end_idx].strip()
            if token:
                # BUG FIX: the original used dtype=np.int8 here, which
                # overflows for any text longer than 127 characters and
                # yields wrapped/negative row indices. np.intp is the
                # correct platform index type.
                yield LaToken(
                    token, str_idx, end_idx,
                    _combine_matrix_rows(
                        m, np.arange(str_idx, end_idx, dtype=np.intp)))
            str_idx = end_idx
        # NOTE(review): when non_zero has exactly one element the loop
        # never runs and end_idx is still 0, so the tail token spans the
        # whole text regardless of str_idx — presumably the split mask
        # always marks index 0, making the two equivalent; verify.
        last_token = text[end_idx:].strip()
        if last_token:
            yield LaToken(
                last_token, end_idx, textlen,
                _combine_matrix_rows(
                    m, np.arange(end_idx, textlen, dtype=np.intp)))
def tokenize(text: str):
    '''Tokenize text, yielding each token string.

    Token boundaries come from ``gen_split_mask`` applied to the parse
    feature matrix of the text. Yields the empty string when no boundary
    at all is found.

    :param text: The input text to tokenize
    '''
    boundaries = np.nonzero(gen_split_mask(_gen_parse_matrix(text)))[0]
    if len(boundaries) == 0:
        yield ''
        return
    start = boundaries[0]
    end = 0
    for end in boundaries[1:]:
        piece = text[start:end].strip()
        if piece:
            yield piece
        start = end
    # Emit whatever follows the final boundary (when only a single
    # boundary was found, end is still 0 and the whole text is taken).
    tail = text[end:].strip()
    if tail:
        yield tail
token, str_idx, end_idx, _combine_matrix_rows( m, np.arange(str_idx, end_idx, dtype=np.int8))) str_idx = end_idx last_token = text[end_idx:].strip() if last_token: yield LaToken( last_token, end_idx, textlen, _combine_matrix_rows( m, np.arange(end_idx, textlen, dtype=np.int8))) if __name__ == "__main__": text = "This is a #test! Testing, Testing, 1 2 3" #text = "can’t wait to get my glasses back 🤓" #text = """IKR!! IM LIKE \""WHERE'S MY DADDY AT? 👀) https://t.co/jM3qLZijMc""" #text = '$#@^:a./' print(f'text={text}') m = _gen_parse_matrix(text) print(m) splits = gen_split_mask(m) print(f'splits={splits}') non_zero = np.nonzero(splits)[0] print(f'non_zero={non_zero}') for token in tokenize(text): print(f'"{token}"') for token in featurize(text): print( f'\n"{token.text}" ({token.start_idx}, {token.end_idx}) weight={token.weight()}\n{token.feature_weights()}' )
def gen_parse_matrix(text: str) -> np.ndarray:
    '''Public wrapper around the private ``_gen_parse_matrix``.

    Generate a feature matrix for parsing (tokenizing) a string, where
    for each letter there is a row of features as identified in
    offsets.py.
    '''
    matrix = _gen_parse_matrix(text)
    return matrix
def tokenize(self, text):
    '''Build the parse feature matrix for ``text`` and discard it.

    NOTE(review): appears to be a benchmarking/baseline stub — it only
    exercises the matrix-generation cost and deliberately returns None.
    '''
    _ = _gen_parse_matrix(text)
    return None
def tokenize(self, text):
    '''Compute the split mask over the parse matrix of ``text``, discarding it.

    NOTE(review): appears to be a benchmarking/baseline stub — it
    exercises matrix generation plus split-mask computation and
    deliberately returns None.
    '''
    matrix = _gen_parse_matrix(text)
    gen_split_mask(matrix)
    return None