    def __init__(self,
                 max_output_token_length=MAX_OUTPUT_TOKEN_LENGTH,
                 reserved=()):
        self.types_to_skip = ()
        self.reserved = reserved
        self.mappings: Dict[str, str]
        self.update_mappings({
            # By default, replace \n and \r. This is meant primarily for
            # literals.
            '\n': unified_tokenizer.quote_special('NLCHAR'),
            '\r': unified_tokenizer.quote_special('CR'),
            unified_tokenizer.SENTINEL:
                unified_tokenizer.quote_special(
                    unified_tokenizer.SENTINEL_ESCAPE),
        })
        self.max_output_token_length = max_output_token_length
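
    # A rough sketch of what the default mappings above are for (the exact
    # quoted spellings come from `unified_tokenizer.quote_special` and are not
    # reproduced literally here): a literal containing '\n' or '\r' has those
    # characters replaced by the quoted NLCHAR/CR markers, so every emitted
    # token stays on a single line; `untokenize` later reverses the mapping.
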
    def untokenize(self, token_list):
        """Untokenizes via `untokenize_abstract`."""
        # Untokenize agnostic.
        if (not token_list
                or token_list[-1] != unified_tokenizer.quote_special(
                    unified_tokenizer.TokenKind.EOS.name)):
            raise ValueError(
                'Token list %r should end with the EOS token %r.' %
                (token_list,
                 unified_tokenizer.quote_special(
                     unified_tokenizer.TokenKind.EOS.name)))

        whole_tokens = unified_tokenizer.reconstitute_full_unsanitary_tokens(
            token_list,
            sanitization_mapping=self.mappings,
            sentinel=unified_tokenizer.SENTINEL)

        return self.untokenize_abstract(whole_tokens)
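
    # A hypothetical usage sketch (`tokenizer` stands for an instance of any
    # concrete subclass; the EOS spelling is whatever `quote_special` produces
    # for TokenKind.EOS.name):
    #
    #   eos = unified_tokenizer.quote_special(
    #       unified_tokenizer.TokenKind.EOS.name)
    #   tokenizer.untokenize(tokens + [eos])  # reconstructs source text
    #   tokenizer.untokenize(tokens)          # raises ValueError: no EOS
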
    def untokenize_abstract(self, whole_tokens):
        tokens: List[str] = []

        # Skip EOS. The caller checked it's there.
        for token in whole_tokens[:-1]:
            if token == unified_tokenizer.quote_special(
                    unified_tokenizer.TokenKind.NEWLINE.name):
                tokens.append('\n')
            else:
                tokens.append(token)
        return ''.join(tokens)
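
    # For example, given whole tokens ['class', ' ', 'A', <quoted NEWLINE>,
    # <quoted EOS>], this returns 'class A\n' (the quoted spellings themselves
    # depend on `unified_tokenizer.quote_special`).
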
    def tokenize_and_abstract(self, source_code):
        """As per the superclass."""
        agnostic_tokens: List[unified_tokenizer.AbstractToken] = []

        try:
            java_tokens = list(
                extended_javalang_tokenizer.tokenize_extended(source_code))
        except (javalang.LexerError, TypeError) as e:
            # Sometimes, javalang returns a TypeError when reading a number.
            # See
            # https://github.com/c2nes/javalang/blob/0664afb7f4d40254312693f2e833c1ed4ac551c7/javalang/tokenizer.py#L370
            logging.warning(
                'The tokenizer raised exception `%r` while parsing %s', e,
                source_code)

            # We don't try to recover from errors yet. Mark the error as
            # occurring at whatever position we have reached and terminate.
            agnostic_tokens.append(
                unified_tokenizer.AbstractToken(
                    unified_tokenizer.quote_special(
                        unified_tokenizer.TokenKind.ERROR.name),
                    unified_tokenizer.TokenKind.ERROR,
                    unified_tokenizer.TokenMetadata(
                        start=unified_tokenizer.Position(line=0, column=0),
                        end=unified_tokenizer.Position(line=0, column=0))))
            agnostic_tokens.append(
                unified_tokenizer.AbstractToken(
                    unified_tokenizer.quote_special(
                        unified_tokenizer.TokenKind.EOS.name),
                    unified_tokenizer.TokenKind.EOS,
                    unified_tokenizer.TokenMetadata(
                        start=unified_tokenizer.Position(line=0, column=0),
                        end=unified_tokenizer.Position(line=0, column=0))))
        else:
            start_line = 0
            start_column = 0
            for token in java_tokens:
                # The token kind is the subclass type of the token.
                token_type = type(token)
                if token_type not in JavaTokenizer._TOKEN_TYPE_MAP:
                    raise ValueError(
                        'Received Java token type %s, but it was unexpected, '
                        'while tokenizing \n%s\n' % (token_type, source_code))

                # javalang counts lines and columns from 1; convert to
                # 0-based positions.
                start_line = token.position.line - 1
                start_column = token.position.column - 1

                # The tokenizer seems to take some liberties with Unicode, returning
                # invalid characters. This cleans spellings up.
                spelling = token.value.encode('utf-8',
                                              errors='replace').decode('utf-8')
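                # (Characters that cannot be encoded as UTF-8, such as
                # unpaired surrogates, come back as '?' from the round trip
                # above.)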
                agnostic_tokens.append(
                    unified_tokenizer.AbstractToken(
                        spelling, JavaTokenizer._TOKEN_TYPE_MAP[token_type],
                        unified_tokenizer.TokenMetadata(
                            start=unified_tokenizer.Position(
                                line=start_line, column=start_column))))

        # At this point, we have all the tokens, either as produced and
        # abstracted, or a placeholder ERROR and EOS pair in case of an
        # exception. However, the tokens only carry start positions. Since the
        # extended tokenizer guarantees that tokens abut, we take a second
        # pass, backwards, setting the end position of each token to the start
        # position of the token that follows it. The final token, `EOS`, is
        # given an empty extent below (its end set equal to its start), so it
        # needs no later token.
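        # For instance, if one token starts at (line 0, column 0) and the next
        # token starts at (line 0, column 3), the backward pass below sets the
        # first token's end position to (line 0, column 3).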
        eos = agnostic_tokens[-1]
        if not eos.metadata.start:
            # This should be there. Raise an exception.
            raise AssertionError(
                'The end of input token is missing positioning '
                'information: %s' % eos)
        later_token_start: unified_tokenizer.Position = eos.metadata.start

        # The EOS token has an empty extent, so the end and the start are set to be
        # the same.
        filled_agnostic_tokens = [
            dataclasses.replace(eos,
                                metadata=dataclasses.replace(
                                    eos.metadata, end=eos.metadata.start))
        ]
        # Go backwards, from the element before `eos` to the beginning.
        for token in (agnostic_tokens[i]
                      for i in range(len(agnostic_tokens) - 2, -1, -1)):
            filled_token = dataclasses.replace(token,
                                               metadata=dataclasses.replace(
                                                   token.metadata,
                                                   end=later_token_start))
            filled_agnostic_tokens.append(filled_token)
            later_token_start = token.metadata.start

        # Now we have the tokens, including end position, but they're reversed.
        # The final step is to break down whitespace tokens into primitive
        # WHITESPACE tokens and NEWLINE tokens.
        with_broken_whitespace = []
        for token in filled_agnostic_tokens[::-1]:
            if token.kind is not unified_tokenizer.TokenKind.WHITESPACE:
                with_broken_whitespace.append(token)
            else:
                # This is whitespace. Replace it with primitive tokens.
                with_broken_whitespace.extend(
                    unified_tokenizer.fill_range_with_whitespace(
                        token.metadata.start, token.metadata.end))

        return with_broken_whitespace

"""Tests for java_tokenizer."""
from typing import Sequence, Tuple


from absl.testing import absltest
from absl.testing import parameterized
import java_tokenizer
import unified_tokenizer


_NEWLINE_NAME = unified_tokenizer.quote_special(
    unified_tokenizer.TokenKind.NEWLINE.name)


class JavaTokenizerTest(parameterized.TestCase):

  @parameterized.named_parameters(
      (
          'nothing',
          '',
          (),
      ),
      (
          'same_line',
          """TokenA TokenB""",
          #  0     67
          (
def token_from_token_type(token_type):
    """Turns a token type into a reserved token string."""
    # We use the tok_name dict from tokenize, not token. The former has
    # NL and COMMENT and such, whereas the latter doesn't.
    return unified_tokenizer.quote_special(tokenize.tok_name[token_type])
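
# Hypothetical usage sketch (the exact quoted spelling is whatever
# `unified_tokenizer.quote_special` produces; 'NEWLINE' and 'COMMENT' are the
# names from `tokenize.tok_name`):
#
#   token_from_token_type(tokenize.NEWLINE)  # quoted form of 'NEWLINE'
#   token_from_token_type(tokenize.COMMENT)  # quoted form of 'COMMENT'
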
class PythonTokenizer(cubert_tokenizer.CuBertTokenizer):
    """Tokenizer that extracts Python's lexical elements preserving strings."""
    _TOKEN_TYPE_MAP = {
        tokenize.COMMENT: unified_tokenizer.TokenKind.COMMENT,
        tokenize.DEDENT: unified_tokenizer.TokenKind.KEYWORD,
        tokenize.ENDMARKER: unified_tokenizer.TokenKind.EOS,
        tokenize.ERRORTOKEN: unified_tokenizer.TokenKind.ERROR,
        tokenize.INDENT: unified_tokenizer.TokenKind.KEYWORD,
        tokenize.NEWLINE: unified_tokenizer.TokenKind.NEWLINE,
        tokenize.NL: unified_tokenizer.TokenKind.PUNCTUATION,
        tokenize.NUMBER: unified_tokenizer.TokenKind.NUMBER,
        tokenize.OP: unified_tokenizer.TokenKind.PUNCTUATION,
        tokenize.STRING: unified_tokenizer.TokenKind.STRING,
    }
    _REVERSE_TOKEN_MAP = {
        cubert_tokenizer.token_from_token_type(tokenize.INDENT):
        tokenize.INDENT,
        cubert_tokenizer.token_from_token_type(tokenize.DEDENT):
        tokenize.DEDENT,
        unified_tokenizer.quote_special(unified_tokenizer.TokenKind.EOS.name):
        tokenize.ENDMARKER,
        unified_tokenizer.quote_special(unified_tokenizer.TokenKind.ERROR.name):
        tokenize.ERRORTOKEN,
        unified_tokenizer.quote_special(unified_tokenizer.TokenKind.NEWLINE.name):
        tokenize.NEWLINE,
        cubert_tokenizer.token_from_token_type(tokenize.NL):
        tokenize.NL,
    }
    # Adding the end-of-string anchor \Z below, since re.fullmatch wasn't
    # available in Python2.
    # pytype: disable=module-attr
    _NUMBERS = re.compile('(' + tokenize.Number + r')\Z')
    # pytype: disable=module-attr
    _SINGLE_STRINGS = re.compile('(' + tokenize.String + r')\Z')
    _TRIPLE_STRING_BEGINNINGS = re.compile(tokenize.Triple)  # pytype: disable=module-attr
    # pytype: disable=module-attr
    _COMMENTS = re.compile('(' + tokenize.Comment + r')\Z')
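    # E.g., _NUMBERS.match('123') succeeds while _NUMBERS.match('123abc')
    # returns None: the trailing \Z anchor requires the entire string to be a
    # number, mimicking re.fullmatch. The same holds for the other anchored
    # patterns above.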

    _EXACT_TOKEN_TYPES = tokenize.EXACT_TOKEN_TYPES.keys()  # pytype: disable=module-attr

    # Token types that CubertTokenizer will tokenize by their type and not
    # content.
    _TOKEN_TYPES_TO_TOKENIZE_BY_TYPE = [
        tokenize.NEWLINE, tokenize.DEDENT, tokenize.NL
    ]
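    # E.g., a NEWLINE token's spelling is replaced below by the quoted
    # 'NEWLINE' marker rather than by the literal end-of-line characters it
    # spans.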

    def tokenize_and_abstract(self, source_code):
        """Produces a language-agnostic tokenization of the input code."""
        agnostic_tokens: List[unified_tokenizer.AbstractToken] = []

        try:
            token_tuples = unified_tokenizer.code_to_tokens(source_code)
        except (tokenize.TokenError, IndentationError) as e:
            logging.warning(
                'The tokenizer raised exception `%s` while parsing %s', e,
                source_code)

            # We don't try to recover from errors yet. Emit just an ERROR
            # token and an end-of-sequence token, then return.
            agnostic_tokens.append(
                unified_tokenizer.AbstractToken(
                    unified_tokenizer.quote_special(
                        unified_tokenizer.TokenKind.ERROR.name),
                    unified_tokenizer.TokenKind.ERROR,
                    unified_tokenizer.TokenMetadata(
                        start=unified_tokenizer.Position(line=0, column=0),
                        end=unified_tokenizer.Position(line=0, column=0))))
            agnostic_tokens.append(
                unified_tokenizer.AbstractToken(
                    unified_tokenizer.quote_special(
                        unified_tokenizer.TokenKind.EOS.name),
                    unified_tokenizer.TokenKind.EOS,
                    unified_tokenizer.TokenMetadata(
                        start=unified_tokenizer.Position(line=0, column=0),
                        end=unified_tokenizer.Position(line=0, column=0))))
            return agnostic_tokens

        for token_tuple in token_tuples:
            spelling = token_tuple.string
            kind = token_tuple.type

            # We'll adjust the spelling of some tokens, e.g., those that we
            # tokenize by their type rather than their original spelling. Indentation
            # and dedentation tokens are like that.
            adjusted_spelling = spelling
            token_kind = unified_tokenizer.TokenKind.NONE
            if kind == tokenize.NAME:
                # Disambiguate identifiers from keywords.
                if keyword.iskeyword(spelling):
                    token_kind = unified_tokenizer.TokenKind.KEYWORD
                else:
                    token_kind = unified_tokenizer.TokenKind.IDENTIFIER
            else:
                if kind in PythonTokenizer._TOKEN_TYPES_TO_TOKENIZE_BY_TYPE:
                    # Replace spelling with type.
                    adjusted_spelling = cubert_tokenizer.token_from_token_type(
                        kind)
                elif kind == tokenize.INDENT:
                    # For INDENT, in particular, we also record the actual
                    # spelling.
                    adjusted_spelling = '{indent}{spelling}'.format(
                        indent=cubert_tokenizer.token_from_token_type(kind),
                        spelling=spelling)
                elif kind == tokenize.ENDMARKER:
                    adjusted_spelling = unified_tokenizer.quote_special(
                        unified_tokenizer.TokenKind.EOS.name)

                # Map everything according to table.
                try:
                    token_kind = PythonTokenizer._TOKEN_TYPE_MAP[kind]
                except KeyError as ke:
                    # It's possible we're here because of async/await. Those kept being
                    # turned into keywords and then removed from keywords, so we can't
                    # rely on knowing which they are. We'll check by spelling.
                    # See: https://bugs.python.org/issue30406
                    # and https://bugs.python.org/issue33260
                    # and https://bugs.python.org/issue35975
                    if spelling in ('async', 'await'):
                        token_kind = unified_tokenizer.TokenKind.KEYWORD
                    else:
                        raise ValueError(
                            'While trying to turn Python token %r into an '
                            'agnostic one, raised %r.' %
                            ((spelling, kind), ke))

            start_line, start_column = token_tuple.start
            end_line, end_column = token_tuple.end
            # Unlike in other languages, Python's NEWLINE tokens are reported
            # as ending on the same line on which they started. We adjust that
            # here, to stick to the same convention as the other tokenizers.
            if ((token_kind == unified_tokenizer.TokenKind.NEWLINE)
                    or (kind == tokenize.NL)):
                end_line = start_line + 1
                end_column = 0
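            # E.g., a NEWLINE that tokenize reports as starting and ending on
            # line 10 is recorded here as ending at column 0 of line 11 (still
            # in tokenize's 1-based coordinates; the 0-based conversion
            # happens below).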

            agnostic_tokens.append(
                unified_tokenizer.AbstractToken(
                    spelling=adjusted_spelling,
                    kind=token_kind,
                    metadata=unified_tokenizer.TokenMetadata(
                        # Python's tokenizer counts lines starting from 1, so we
                        # have to offset what we read from the `TokenInfo` tuple.
                        start=unified_tokenizer.Position(line=start_line - 1,
                                                         column=start_column),
                        end=unified_tokenizer.Position(line=end_line - 1,
                                                       column=end_column))))

        return agnostic_tokens

    def untokenize_abstract(self, whole_tokens):
        # Reconstruct Python tokenizer tuples, so that Python's untokenize can be
        # invoked.
        token_tuples: List[Tuple[int, str]] = []

        for whole_token in whole_tokens:
            if whole_token in PythonTokenizer._EXACT_TOKEN_TYPES:
                token_tuples.append((tokenize.OP, whole_token))
            elif cubert_tokenizer.token_from_token_type(
                    tokenize.INDENT) in whole_token:
                # We baked the type and spelling into one token. Break them up.
                spelling = whole_token.replace(
                    cubert_tokenizer.token_from_token_type(tokenize.INDENT),
                    '')
                token_tuples.append((tokenize.INDENT, spelling))
            elif whole_token in PythonTokenizer._REVERSE_TOKEN_MAP:
                python_kind = PythonTokenizer._REVERSE_TOKEN_MAP[whole_token]
                if python_kind in (tokenize.DEDENT, tokenize.ENDMARKER,
                                   tokenize.ERRORTOKEN):
                    spelling = ''
                else:  # python_kind in (tokenize.NEWLINE, tokenize.NL)
                    spelling = '\n'
                token_tuples.append((python_kind, spelling))
            elif keyword.iskeyword(whole_token):
                token_tuples.append((tokenize.NAME, whole_token))
            elif PythonTokenizer._NUMBERS.match(whole_token):
                token_tuples.append((tokenize.NUMBER, whole_token))
            elif PythonTokenizer._SINGLE_STRINGS.match(whole_token):
                token_tuples.append((tokenize.STRING, whole_token))
            elif PythonTokenizer._TRIPLE_STRING_BEGINNINGS.match(whole_token):
                token_tuples.append((tokenize.STRING, whole_token))
            elif PythonTokenizer._COMMENTS.match(whole_token):
                token_tuples.append((tokenize.COMMENT, whole_token))
            else:
                # Everything else we map back to NAME.
                token_tuples.append((tokenize.NAME, whole_token))

        reconstructed = tokenize.untokenize(typing.cast(Any, token_tuples))
        return reconstructed
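

# A minimal, standalone sketch (not part of the tokenizer classes above) of
# the mechanism `untokenize_abstract` relies on: `tokenize.untokenize` accepts
# an iterable of (token_type, token_string) 2-tuples and reassembles source
# text. In this 2-tuple compatibility mode the spacing of the output is only
# approximate, but it tokenizes back to the same token stream.
def _untokenize_demo():
    import tokenize

    pairs = [
        (tokenize.NAME, 'print'),
        (tokenize.OP, '('),
        (tokenize.NUMBER, '1'),
        (tokenize.OP, ')'),
        (tokenize.NEWLINE, '\n'),
        (tokenize.ENDMARKER, ''),
    ]
    # Returns something like "print (1 )\n".
    return tokenize.untokenize(pairs)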