def tokenize_and_abstract(self, source_code):
  """As per the superclass."""
  agnostic_tokens: List[unified_tokenizer.AbstractToken] = []

  try:
    java_tokens = tokenizer.tokenize(source_code)

    for token in java_tokens:
      # The token kind is the subclass type of the token.
      token_type = type(token)
      if token_type not in JavaTokenizer._TOKEN_TYPE_MAP:
        raise ValueError(
            'Received Java token type %s, but it was unexpected, '
            'while tokenizing \n%s\n' % (token_type, source_code))

      # The tokenizer seems to take some liberties with Unicode, returning
      # invalid characters. This cleans spellings up.
      spelling = token.value.encode('utf-8', errors='replace').decode('utf-8')

      agnostic_tokens.append(
          unified_tokenizer.AbstractToken(
              spelling, JavaTokenizer._TOKEN_TYPE_MAP[token_type],
              unified_tokenizer.TokenMetadata(
                  start=unified_tokenizer.Position(
                      # JavaTokenizer counts lines and columns from 1.
                      line=token.position.line - 1,
                      column=token.position.column - 1))))
  except (tokenizer.LexerError, TypeError) as e:
    # Sometimes, javalang returns a TypeError when reading a number.
    # See
    # https://github.com/c2nes/javalang/blob/0664afb7f4d40254312693f2e833c1ed4ac551c7/javalang/tokenizer.py#L370
    logging.warning(
        'The tokenizer raised exception `%r` while parsing %s', e, source_code)
    agnostic_tokens.append(
        unified_tokenizer.AbstractToken(
            cubert_tokenizer.quote_special(
                unified_tokenizer.TokenKind.ERROR.name),
            unified_tokenizer.TokenKind.ERROR,
            unified_tokenizer.TokenMetadata()))

  # javalang doesn't seem to ever return `EndOfInput` despite there being a
  # token type for it. We insert it here.
  agnostic_tokens.append(
      unified_tokenizer.AbstractToken(
          cubert_tokenizer.quote_special(unified_tokenizer.TokenKind.EOS.name),
          unified_tokenizer.TokenKind.EOS,
          unified_tokenizer.TokenMetadata()))

  return agnostic_tokens
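
# A tiny demonstration of the spelling cleanup above: encoding with
# errors='replace' turns characters that cannot be encoded as UTF-8 (e.g. a
# lone surrogate) into '?', so downstream code never sees them. The sample
# string is an illustrative assumption, not something javalang is guaranteed
# to emit.
print('\ud800abc'.encode('utf-8', errors='replace').decode('utf-8'))  # '?abc'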
def test_split_agnostic_returns_expected(self, labelled_tokens, max_length,
                                         expected_labelled_subtokens):
  tokens = [
      unified_tokenizer.AbstractToken(s, k, unified_tokenizer.TokenMetadata())
      for s, k in labelled_tokens
  ]
  labelled_subtokens = unified_tokenizer.split_agnostic_tokens(
      tokens, max_length)

  expected_multi_tokens = []
  for spelling_list, kind in expected_labelled_subtokens:
    expected_multi_tokens.append(
        unified_tokenizer.AbstractMultiToken(
            # We cast spellings to tuples, since we know that
            # `split_agnostic_tokens` creates multi tokens with tuples rather
            # than lists.
            spellings=tuple(spelling_list),
            kind=kind,
            metadata=unified_tokenizer.TokenMetadata()))

  self.assertSequenceEqual(expected_multi_tokens, labelled_subtokens)
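
# A minimal usage sketch for `split_agnostic_tokens`, mirroring the call in the
# test above. The input token and `max_length` value are illustrative
# assumptions; the exact subtoken spellings produced (e.g. any continuation
# sentinels) depend on the splitter, so this prints rather than asserts.
long_token = unified_tokenizer.AbstractToken(
    'some_long_identifier', unified_tokenizer.TokenKind.IDENTIFIER,
    unified_tokenizer.TokenMetadata())
for multi_token in unified_tokenizer.split_agnostic_tokens([long_token], 4):
  # Each result is an `AbstractMultiToken`: a tuple of subtoken spellings plus
  # the kind and metadata of the original token.
  print(multi_token.kind, multi_token.spellings)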
def tokenize_and_abstract(
    self,
    source_code):
  """Produces a language-agnostic tokenization of the input code."""
  agnostic_tokens: List[unified_tokenizer.AbstractToken] = []

  try:
    token_tuples = unified_tokenizer.code_to_tokens(source_code)
  except (tokenize.TokenError, IndentationError) as e:
    logging.warning('The tokenizer raised exception `%s` while parsing %s',
                    e, source_code)

    # We don't try to do recovery from errors quite yet. Emit just an
    # error and end-of-sequence and return.
    agnostic_tokens.append(
        unified_tokenizer.AbstractToken(
            unified_tokenizer.quote_special(
                unified_tokenizer.TokenKind.ERROR.name),
            unified_tokenizer.TokenKind.ERROR,
            unified_tokenizer.TokenMetadata(
                start=unified_tokenizer.Position(line=0, column=0),
                end=unified_tokenizer.Position(line=0, column=0))))
    agnostic_tokens.append(
        unified_tokenizer.AbstractToken(
            unified_tokenizer.quote_special(
                unified_tokenizer.TokenKind.EOS.name),
            unified_tokenizer.TokenKind.EOS,
            unified_tokenizer.TokenMetadata(
                start=unified_tokenizer.Position(line=0, column=0),
                end=unified_tokenizer.Position(line=0, column=0))))
    return agnostic_tokens

  for token_tuple in token_tuples:
    spelling = token_tuple.string
    kind = token_tuple.type

    # We'll adjust the spelling of some tokens, e.g., those that we
    # tokenize by their type rather than their original spelling. Indentation
    # and dedentation tokens are like that.
    adjusted_spelling = spelling
    token_kind = unified_tokenizer.TokenKind.NONE
    if kind == tokenize.NAME:
      # Disambiguate identifiers from keywords.
      if keyword.iskeyword(spelling):
        token_kind = unified_tokenizer.TokenKind.KEYWORD
      else:
        token_kind = unified_tokenizer.TokenKind.IDENTIFIER
    else:
      if kind in PythonTokenizer._TOKEN_TYPES_TO_TOKENIZE_BY_TYPE:
        # Replace spelling with type.
        adjusted_spelling = cubert_tokenizer.token_from_token_type(kind)
      elif kind is tokenize.INDENT:
        # For INDENT, in particular, we also record the actual spelling too.
        adjusted_spelling = '{indent}{spelling}'.format(
            indent=cubert_tokenizer.token_from_token_type(kind),
            spelling=spelling)
      elif kind == tokenize.ENDMARKER:
        adjusted_spelling = unified_tokenizer.quote_special(
            unified_tokenizer.TokenKind.EOS.name)

      # Map everything according to table.
      try:
        token_kind = PythonTokenizer._TOKEN_TYPE_MAP[kind]
      except KeyError as ke:
        # It's possible we're here because of async/await. Those kept being
        # turned into keywords and then removed from keywords, so we can't
        # rely on knowing which they are. We'll check by spelling.
        # See: https://bugs.python.org/issue30406
        # and https://bugs.python.org/issue33260
        # and https://bugs.python.org/issue35975
        if spelling in ('async', 'await'):
          token_kind = unified_tokenizer.TokenKind.KEYWORD
        else:
          raise ValueError('While trying to turn Python token %r into an '
                           'agnostic one, raised %r.' % ((spelling, kind), ke))

    start_line, start_column = token_tuple.start
    end_line, end_column = token_tuple.end
    # Unlike other languages, NEWLINE tokens are reported as ending on the
    # same line as where they started. We adjust that here, to stick to the
    # same convention as other tokenizers.
    if ((token_kind == unified_tokenizer.TokenKind.NEWLINE) or
        (kind == tokenize.NL)):
      end_line = start_line + 1
      end_column = 0

    agnostic_tokens.append(
        unified_tokenizer.AbstractToken(
            spelling=adjusted_spelling,
            kind=token_kind,
            metadata=unified_tokenizer.TokenMetadata(
                # Python's tokenizer counts lines starting from 1, so we
                # have to offset what we read from the `TokenInfo` tuple.
                start=unified_tokenizer.Position(
                    line=start_line - 1, column=start_column),
                end=unified_tokenizer.Position(
                    line=end_line - 1, column=end_column))))

  return agnostic_tokens
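
# A minimal sketch of driving the tokenizer above. Constructing
# `PythonTokenizer` with no arguments is an assumption; only
# `tokenize_and_abstract` and the `AbstractToken` fields come from the code
# above.
for tok in PythonTokenizer().tokenize_and_abstract('x = 1\n'):
  # Positions are 0-based after the adjustment above.
  print(tok.kind, repr(tok.spelling), tok.metadata.start, tok.metadata.end)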
def tokenize_and_abstract(self, source_code):
  """Produces a language-agnostic tokenization of the input code."""
  token_pairs: Iterable[Tuple[str, int]]
  try:
    token_tuples = unified_tokenizer.code_to_tokens(source_code)
    token_pairs = ((token_name, token_type)
                   for token_type, token_name, _, _, _ in token_tuples)
  except (tokenize.TokenError, IndentationError) as e:
    logging.warning('The tokenizer raised exception `%s` while parsing %s', e,
                    source_code)
    token_pairs = (
        (cubert_tokenizer.quote_special(
            unified_tokenizer.TokenKind.ERROR.name), tokenize.ERRORTOKEN),
        ('', tokenize.ENDMARKER),
    )

  agnostic_tokens: List[unified_tokenizer.AbstractToken] = []

  for spelling, kind in token_pairs:
    adjusted_spelling = spelling
    token_kind = unified_tokenizer.TokenKind.NONE
    if kind == tokenize.NAME:
      # Disambiguate identifiers from keywords.
      if keyword.iskeyword(spelling):
        token_kind = unified_tokenizer.TokenKind.KEYWORD
      else:
        token_kind = unified_tokenizer.TokenKind.IDENTIFIER
    else:
      if kind in PythonTokenizer._TOKEN_TYPES_TO_TOKENIZE_BY_TYPE:
        # Replace spelling with type.
        adjusted_spelling = cubert_tokenizer.token_from_token_type(kind)
      elif kind is tokenize.INDENT:
        # For INDENT, in particular, we also record the actual spelling too.
        adjusted_spelling = '{indent}{spelling}'.format(
            indent=cubert_tokenizer.token_from_token_type(kind),
            spelling=spelling)
      elif kind == tokenize.ENDMARKER:
        adjusted_spelling = cubert_tokenizer.quote_special(
            unified_tokenizer.TokenKind.EOS.name)

      # Map everything according to table.
      try:
        token_kind = PythonTokenizer._TOKEN_TYPE_MAP[kind]
      except KeyError as ke:
        # It's possible we're here because of async/await. Those kept being
        # turned into keywords and then removed from keywords, so we can't
        # rely on knowing which they are. We'll check by spelling.
        # See: https://bugs.python.org/issue30406
        # and https://bugs.python.org/issue33260
        # and https://bugs.python.org/issue35975
        if spelling in ('async', 'await'):
          token_kind = unified_tokenizer.TokenKind.KEYWORD
        else:
          raise ValueError('While trying to turn Python token %r into an '
                           'agnostic one, raised %r.' % ((spelling, kind), ke))

    agnostic_tokens.append(
        unified_tokenizer.AbstractToken(
            spelling=adjusted_spelling,
            kind=token_kind,
            # TODO(maniatis): Eventually, we'll store token positioning info
            # in metadata.
            metadata=unified_tokenizer.TokenMetadata()))

  return agnostic_tokens
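
# A quick check of the `async`/`await` spelling fallback handled above
# (assumption: running on Python 3.7 or later, where `keyword.iskeyword`
# recognizes both words).
import keyword
print(keyword.iskeyword('async'), keyword.iskeyword('await'))  # True True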
def tokenize_and_abstract(
    self,
    source_code):
  """As per the superclass."""
  agnostic_tokens: List[unified_tokenizer.AbstractToken] = []

  try:
    java_tokens = list(
        extended_javalang_tokenizer.tokenize_extended(source_code))
  except (javalang.LexerError, TypeError) as e:
    # Sometimes, javalang returns a TypeError when reading a number.
    # See
    # https://github.com/c2nes/javalang/blob/0664afb7f4d40254312693f2e833c1ed4ac551c7/javalang/tokenizer.py#L370
    logging.warning('The tokenizer raised exception `%r` while parsing %s',
                    e, source_code)

    # We don't try to do recovery from errors quite yet. Mark the error as
    # occurring at whatever position we are in and terminate.
    agnostic_tokens.append(
        unified_tokenizer.AbstractToken(
            unified_tokenizer.quote_special(
                unified_tokenizer.TokenKind.ERROR.name),
            unified_tokenizer.TokenKind.ERROR,
            unified_tokenizer.TokenMetadata(
                start=unified_tokenizer.Position(line=0, column=0),
                end=unified_tokenizer.Position(line=0, column=0))))
    agnostic_tokens.append(
        unified_tokenizer.AbstractToken(
            '',
            unified_tokenizer.TokenKind.EOS,
            unified_tokenizer.TokenMetadata(
                start=unified_tokenizer.Position(line=0, column=0),
                end=unified_tokenizer.Position(line=0, column=0))))
  else:
    start_line = 0
    start_column = 0
    for token in java_tokens:
      # The token kind is the subclass type of the token.
      token_type = type(token)
      if token_type not in JavaTokenizer._TOKEN_TYPE_MAP:
        raise ValueError(
            'Received Java token type %s, but it was unexpected, '
            'while tokenizing \n%s\n' % (token_type, source_code))

      # JavaTokenizer counts lines and columns from 1.
      start_line = token.position.line - 1
      start_column = token.position.column - 1

      # The tokenizer seems to take some liberties with Unicode, returning
      # invalid characters. This cleans spellings up.
      spelling = token.value.encode('utf-8', errors='replace').decode('utf-8')

      agnostic_tokens.append(
          unified_tokenizer.AbstractToken(
              spelling, JavaTokenizer._TOKEN_TYPE_MAP[token_type],
              unified_tokenizer.TokenMetadata(
                  start=unified_tokenizer.Position(
                      line=start_line, column=start_column))))

  # At this point, we have all the tokens, either as produced and abstracted,
  # or a placeholder error and eos in case of an exception. However, the
  # tokens only have start positions. Since the extended tokenizer guarantees
  # that tokens abut, we take a second pass, backwards, setting the end
  # position of a token from the start position of the token following it.
  # The final token, `EOS`, already has an end position, so we don't modify it.
  eos = agnostic_tokens[-1]
  if not eos.metadata.start:
    # This should be there. Raise an exception.
    raise AssertionError('The end of input token is missing positioning '
                         'information: %s' % eos)
  # EOS contains an empty spelling. We replace it here with EOS.name.
  eos = dataclasses.replace(
      eos,
      spelling=unified_tokenizer.quote_special(
          unified_tokenizer.TokenKind.EOS.name))

  later_token_start: unified_tokenizer.Position = eos.metadata.start

  # The EOS token has an empty extent, so the end and the start are set to be
  # the same.
  filled_agnostic_tokens = [
      dataclasses.replace(
          eos,
          metadata=dataclasses.replace(eos.metadata, end=eos.metadata.start))
  ]
  # Go backwards, from the element before `eos` to the beginning.
  for token in (
      agnostic_tokens[i] for i in range(len(agnostic_tokens) - 2, -1, -1)):
    filled_token = dataclasses.replace(
        token,
        metadata=dataclasses.replace(token.metadata, end=later_token_start))
    filled_agnostic_tokens.append(filled_token)
    later_token_start = token.metadata.start

  # Now we have the tokens, including end position, but they're reversed.
  # The final step is to break down whitespace tokens into primitive
  # WHITESPACE tokens and NEWLINE tokens.
  with_broken_whitespace = []
  for token in filled_agnostic_tokens[::-1]:
    if token.kind is not unified_tokenizer.TokenKind.WHITESPACE:
      with_broken_whitespace.append(token)
    else:
      # This is whitespace. Replace it with primitive tokens.
      with_broken_whitespace.extend(
          unified_tokenizer.fill_range_with_whitespace(
              token.metadata.start, token.metadata.end))

  return with_broken_whitespace
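
# A self-contained sketch of the backward end-filling pass above, using plain
# tuples instead of `AbstractToken`/`TokenMetadata` (those types, and
# `fill_range_with_whitespace`, are not reproduced here). Tokens are assumed
# to abut, as the extended tokenizer guarantees.
starts = [((0, 0), 'class'), ((0, 5), ' '), ((0, 6), 'A'),
          ((0, 7), '')]  # the last entry plays the role of EOS
filled = [(starts[-1][0], starts[-1][0], starts[-1][1])]  # EOS: empty extent
later_start = starts[-1][0]
for start, spelling in reversed(starts[:-1]):
  # Each token ends where the token after it starts.
  filled.append((start, later_start, spelling))
  later_start = start
for start, end, spelling in reversed(filled):
  print(start, end, repr(spelling))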