Example #1
    def tokenize_and_abstract(self, source_code):
        """As per the superclass."""
        agnostic_tokens: List[unified_tokenizer.AbstractToken] = []

        try:
            java_tokens = tokenizer.tokenize(source_code)

            for token in java_tokens:
                # The token kind is the subclass type of the token.
                token_type = type(token)
                if token_type not in JavaTokenizer._TOKEN_TYPE_MAP:
                    raise ValueError(
                        'Received Java token type %s, but it was unexpected, '
                        'while tokenizing \n%s\n' % (token_type, source_code))

                # The tokenizer seems to take some liberties with Unicode, returning
                # invalid characters. This cleans spellings up.
                spelling = token.value.encode('utf-8',
                                              errors='replace').decode('utf-8')

                agnostic_tokens.append(
                    unified_tokenizer.AbstractToken(
                        spelling,
                        JavaTokenizer._TOKEN_TYPE_MAP[token_type],
                        unified_tokenizer.TokenMetadata(
                            start=unified_tokenizer.Position(
                                # JavaTokenizer counts lines and columns from 1.
                                line=token.position.line - 1,
                                column=token.position.column - 1))))
        except (tokenizer.LexerError, TypeError) as e:
            # Sometimes, javalang returns a TypeError when reading a number.
            # See
            # https://github.com/c2nes/javalang/blob/0664afb7f4d40254312693f2e833c1ed4ac551c7/javalang/tokenizer.py#L370
            logging.warning(
                'The tokenizer raised exception `%r` while parsing %s', e,
                source_code)
            agnostic_tokens.append(
                unified_tokenizer.AbstractToken(
                    cubert_tokenizer.quote_special(
                        unified_tokenizer.TokenKind.ERROR.name),
                    unified_tokenizer.TokenKind.ERROR,
                    unified_tokenizer.TokenMetadata()))

        # javalang doesn't seem to ever return `EndOfInput` despite there being a
        # token type for it. We insert it here.
        agnostic_tokens.append(
            unified_tokenizer.AbstractToken(
                cubert_tokenizer.quote_special(
                    unified_tokenizer.TokenKind.EOS.name),
                unified_tokenizer.TokenKind.EOS,
                unified_tokenizer.TokenMetadata()))

        return agnostic_tokens
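
The spelling clean-up in the loop above is just a round trip through UTF-8 with errors='replace'. A minimal standalone sketch of that technique, with no cubert imports and a made-up sample string:

def clean_spelling(raw: str) -> str:
    # Encoding with errors='replace' substitutes '?' for anything that cannot
    # be represented in UTF-8 (e.g. lone surrogates), so the decoded result is
    # guaranteed to be valid Unicode text.
    return raw.encode('utf-8', errors='replace').decode('utf-8')


# A lone surrogate such as '\ud800' is not encodable as UTF-8.
print(clean_spelling('x = "abc\ud800"'))  # -> x = "abc?"
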
Example #2
  def test_flatten_raises_when_expected(self, list_of_lists, mapping):
    multi_tokens = []
    for s in list_of_lists:
      multi_tokens.append(
          unified_tokenizer.AbstractMultiToken(
              spellings=s,
              kind=unified_tokenizer.TokenKind.STRING,
              metadata=unified_tokenizer.TokenMetadata()))
    with self.assertRaises(ValueError):
      unified_tokenizer.flatten_and_sanitize_subtoken_lists(
          multi_tokens, sanitization_mapping=mapping, sentinel='^')
Example #3
  def test_flatten_returns_expected(self, subtoken_lists, mappings,
                                    expected_subtoken_list):
    multi_tokens = []
    for s in subtoken_lists:
      multi_tokens.append(
          unified_tokenizer.AbstractMultiToken(
              spellings=s,
              kind=unified_tokenizer.TokenKind.STRING,
              metadata=unified_tokenizer.TokenMetadata()))
    subtokens = unified_tokenizer.flatten_and_sanitize_subtoken_lists(
        multi_tokens, mappings, sentinel='^')
    self.assertSequenceEqual(expected_subtoken_list, subtokens)
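
These two tests exercise the failure and success paths of unified_tokenizer.flatten_and_sanitize_subtoken_lists. The helper below is only a rough, self-contained sketch of the general flatten-and-sanitize idea (replace reserved characters via a mapping, suffix non-final subtokens with a sentinel, and reject spellings that clash with the mapping); the mapping and sentinel semantics here are assumptions, not the library's exact behaviour:

from typing import List, Mapping, Sequence


def flatten_and_sanitize(subtoken_lists: Sequence[Sequence[str]],
                         mapping: Mapping[str, str],
                         sentinel: str = '^') -> List[str]:
    """Illustrative only; not unified_tokenizer's implementation."""
    flat = []
    for subtokens in subtoken_lists:
        for position, spelling in enumerate(subtokens):
            for reserved, replacement in mapping.items():
                if replacement in spelling:
                    # A spelling must not already contain a replacement string,
                    # otherwise the sanitization could not be undone later.
                    raise ValueError('Spelling %r clashes with mapping %r.' %
                                     (spelling, mapping))
                spelling = spelling.replace(reserved, replacement)
            is_last = position == len(subtokens) - 1
            flat.append(spelling if is_last else spelling + sentinel)
    return flat


print(flatten_and_sanitize([['hel', 'lo'], ['world']], {'^': '\\u'}))
# -> ['hel^', 'lo', 'world']
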
Example #4
  def test_split_agnostic_returns_expected(self, labelled_tokens, max_length,
                                           expected_labelled_subtokens):
    tokens = [
        unified_tokenizer.AbstractToken(s, k, unified_tokenizer.TokenMetadata())
        for s, k in labelled_tokens
    ]
    labelled_subtokens = unified_tokenizer.split_agnostic_tokens(
        tokens, max_length)

    expected_multi_tokens = []
    for spelling_list, kind in expected_labelled_subtokens:
      expected_multi_tokens.append(
          unified_tokenizer.AbstractMultiToken(
              # We cast spellings to tuples, since we know that
              # `split_agnostic_tokens` creates multi tokens with tuples rather
              # than lists.
              spellings=tuple(spelling_list),
              kind=kind,
              metadata=unified_tokenizer.TokenMetadata()))

    self.assertSequenceEqual(expected_multi_tokens, labelled_subtokens)
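
split_agnostic_tokens caps every subtoken at max_length characters. A minimal sketch of just that chunking step on a bare spelling (the helper name is made up; the real function also preserves kinds and metadata, as the test above checks):

from typing import Tuple


def chunk_spelling(spelling: str, max_length: int) -> Tuple[str, ...]:
    # Slice the spelling into consecutive pieces of at most max_length
    # characters; an empty spelling stays a single empty piece.
    if not spelling:
        return (spelling,)
    return tuple(spelling[i:i + max_length]
                 for i in range(0, len(spelling), max_length))


print(chunk_spelling('supercalifragilistic', 8))
# -> ('supercal', 'ifragili', 'stic')
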
Example #5
  def tokenize_and_abstract(
      self,
      source_code):
    """Produces a language-agnostic tokenization of the input code."""
    agnostic_tokens: List[unified_tokenizer.AbstractToken] = []

    try:
      token_tuples = unified_tokenizer.code_to_tokens(source_code)
    except (tokenize.TokenError, IndentationError) as e:
      logging.warning('The tokenizer raised exception `%s` while parsing %s', e,
                      source_code)

      # We don't try to recover from errors quite yet. Emit just an
      # error token and an end-of-sequence token and return.
      agnostic_tokens.append(
          unified_tokenizer.AbstractToken(
              unified_tokenizer.quote_special(
                  unified_tokenizer.TokenKind.ERROR.name),
              unified_tokenizer.TokenKind.ERROR,
              unified_tokenizer.TokenMetadata(
                  start=unified_tokenizer.Position(
                      line=0, column=0),
                  end=unified_tokenizer.Position(
                      line=0, column=0))))
      agnostic_tokens.append(
          unified_tokenizer.AbstractToken(
              unified_tokenizer.quote_special(
                  unified_tokenizer.TokenKind.EOS.name),
              unified_tokenizer.TokenKind.EOS,
              unified_tokenizer.TokenMetadata(
                  start=unified_tokenizer.Position(
                      line=0, column=0),
                  end=unified_tokenizer.Position(
                      line=0, column=0))))
      return agnostic_tokens

    for token_tuple in token_tuples:
      spelling = token_tuple.string
      kind = token_tuple.type

      # We'll adjust the spelling of some tokens, e.g., those that we
      # tokenize by their type rather than their original spelling. Indentation
      # and dedentation tokens are like that.
      adjusted_spelling = spelling
      token_kind = unified_tokenizer.TokenKind.NONE
      if kind == tokenize.NAME:
        # Disambiguate identifiers from keywords.
        if keyword.iskeyword(spelling):
          token_kind = unified_tokenizer.TokenKind.KEYWORD
        else:
          token_kind = unified_tokenizer.TokenKind.IDENTIFIER
      else:
        if kind in PythonTokenizer._TOKEN_TYPES_TO_TOKENIZE_BY_TYPE:
          # Replace spelling with type.
          adjusted_spelling = cubert_tokenizer.token_from_token_type(kind)
        elif kind == tokenize.INDENT:
          # For INDENT, in particular, we also record the actual spelling.
          adjusted_spelling = '{indent}{spelling}'.format(
              indent=cubert_tokenizer.token_from_token_type(kind),
              spelling=spelling)
        elif kind == tokenize.ENDMARKER:
          adjusted_spelling = unified_tokenizer.quote_special(
              unified_tokenizer.TokenKind.EOS.name)

        # Map everything according to table.
        try:
          token_kind = PythonTokenizer._TOKEN_TYPE_MAP[kind]
        except KeyError as ke:
          # It's possible we're here because of async/await. Those kept being
          # turned into keywords and then removed from keywords, so we can't
          # rely on knowing which they are. We'll check by spelling.
          # See: https://bugs.python.org/issue30406
          # and https://bugs.python.org/issue33260
          # and https://bugs.python.org/issue35975
          if spelling in ('async', 'await'):
            token_kind = unified_tokenizer.TokenKind.KEYWORD
          else:
            raise ValueError('While trying to turn Python token %r into an '
                             'agnostic one, raised %r.' %
                             ((spelling, kind), ke))

      start_line, start_column = token_tuple.start
      end_line, end_column = token_tuple.end
      # Unlike in the other languages, NEWLINE tokens are reported as ending on
      # the same line where they start. We adjust that here to stick to the
      # convention used by the other tokenizers.
      if ((token_kind == unified_tokenizer.TokenKind.NEWLINE) or
          (kind == tokenize.NL)):
        end_line = start_line + 1
        end_column = 0

      agnostic_tokens.append(
          unified_tokenizer.AbstractToken(
              spelling=adjusted_spelling, kind=token_kind,
              metadata=unified_tokenizer.TokenMetadata(
                  # Python's tokenizer counts lines starting from 1, so we
                  # have to offset what we read from the `TokenInfo` tuple.
                  start=unified_tokenizer.Position(
                      line=start_line - 1, column=start_column),
                  end=unified_tokenizer.Position(
                      line=end_line - 1, column=end_column))))

    return agnostic_tokens
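
The NAME handling above leans entirely on the standard library: tokenize reports keywords and identifiers alike as NAME, and keyword.iskeyword() separates the two. A small self-contained illustration of that check, without any cubert imports:

import io
import keyword
import tokenize


def classify_names(source: str):
    # Yield (spelling, 'KEYWORD' or 'IDENTIFIER') for every NAME token,
    # mirroring the keyword.iskeyword() disambiguation used above.
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        if tok.type == tokenize.NAME:
            yield tok.string, ('KEYWORD' if keyword.iskeyword(tok.string)
                               else 'IDENTIFIER')


print(list(classify_names('def f(x):\n    return x\n')))
# -> [('def', 'KEYWORD'), ('f', 'IDENTIFIER'), ('x', 'IDENTIFIER'),
#     ('return', 'KEYWORD'), ('x', 'IDENTIFIER')]
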
Example #6
    def tokenize_and_abstract(self, source_code):
        """Produces a language-agnostic tokenization of the input code."""
        token_pairs: Iterable[Tuple[str, int]]
        try:
            token_tuples = unified_tokenizer.code_to_tokens(source_code)
            token_pairs = ((token_name, token_type)
                           for token_type, token_name, _, _, _ in token_tuples)
        except (tokenize.TokenError, IndentationError) as e:
            logging.warning(
                'The tokenizer raised exception `%s` while parsing %s', e,
                source_code)
            token_pairs = (
                (cubert_tokenizer.quote_special(
                    unified_tokenizer.TokenKind.ERROR.name),
                 tokenize.ERRORTOKEN),
                ('', tokenize.ENDMARKER),
            )

        agnostic_tokens: List[unified_tokenizer.AbstractToken] = []

        for spelling, kind in token_pairs:
            adjusted_spelling = spelling
            token_kind = unified_tokenizer.TokenKind.NONE
            if kind == tokenize.NAME:
                # Disambiguate identifiers from keywords.
                if keyword.iskeyword(spelling):
                    token_kind = unified_tokenizer.TokenKind.KEYWORD
                else:
                    token_kind = unified_tokenizer.TokenKind.IDENTIFIER
            else:
                if kind in PythonTokenizer._TOKEN_TYPES_TO_TOKENIZE_BY_TYPE:
                    # Replace spelling with type.
                    adjusted_spelling = cubert_tokenizer.token_from_token_type(
                        kind)
                elif kind == tokenize.INDENT:
                    # For INDENT, in particular, we also record the actual spelling.
                    adjusted_spelling = '{indent}{spelling}'.format(
                        indent=cubert_tokenizer.token_from_token_type(kind),
                        spelling=spelling)
                elif kind == tokenize.ENDMARKER:
                    adjusted_spelling = cubert_tokenizer.quote_special(
                        unified_tokenizer.TokenKind.EOS.name)

                # Map everything according to table.
                try:
                    token_kind = PythonTokenizer._TOKEN_TYPE_MAP[kind]
                except KeyError as ke:
                    # It's possible we're here because of async/await. Those kept being
                    # turned into keywords and then removed from keywords, so we can't
                    # rely on knowing which they are. We'll check by spelling.
                    # See: https://bugs.python.org/issue30406
                    # and https://bugs.python.org/issue33260
                    # and https://bugs.python.org/issue35975
                    if spelling in ('async', 'await'):
                        token_kind = unified_tokenizer.TokenKind.KEYWORD
                    else:
                        raise ValueError(
                            'While trying to turn Python token %r into an '
                            'agnostic one, raised %r.' %
                            ((spelling, kind), ke))

            agnostic_tokens.append(
                unified_tokenizer.AbstractToken(
                    spelling=adjusted_spelling,
                    kind=token_kind,
                    # TODO(maniatis): Eventually, we'll store token positioning info
                    # in metadata.
                    metadata=unified_tokenizer.TokenMetadata()))

        return agnostic_tokens
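
The except clause above relies on tokenize raising TokenError (or IndentationError) for malformed input, at which point the code substitutes a single ERROR token followed by an end marker. A quick standalone check of that failure mode:

import io
import tokenize

try:
    # An unclosed parenthesis makes the tokenizer hit EOF mid-statement.
    list(tokenize.generate_tokens(io.StringIO('x = (1,\n').readline))
except tokenize.TokenError as error:
    print('tokenizer failed:', error)
# -> tokenizer failed: ('EOF in multi-line statement', ...), give or take the
#    exact wording across Python versions.
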
Example #7
  def tokenize_and_abstract(
      self,
      source_code):
    """As per the superclass."""
    agnostic_tokens: List[unified_tokenizer.AbstractToken] = []

    try:
      java_tokens = list(
          extended_javalang_tokenizer.tokenize_extended(source_code))
    except (javalang.LexerError, TypeError) as e:
      # Sometimes, javalang returns a TypeError when reading a number.
      # See
      # https://github.com/c2nes/javalang/blob/0664afb7f4d40254312693f2e833c1ed4ac551c7/javalang/tokenizer.py#L370
      logging.warning('The tokenizer raised exception `%r` while parsing %s', e,
                      source_code)

      # We don't try to recover from errors quite yet. Mark the error as
      # occurring at whatever position we are in and terminate.
      agnostic_tokens.append(
          unified_tokenizer.AbstractToken(
              unified_tokenizer.quote_special(
                  unified_tokenizer.TokenKind.ERROR.name),
              unified_tokenizer.TokenKind.ERROR,
              unified_tokenizer.TokenMetadata(
                  start=unified_tokenizer.Position(
                      line=0, column=0),
                  end=unified_tokenizer.Position(
                      line=0, column=0))))
      agnostic_tokens.append(
          unified_tokenizer.AbstractToken(
              '',
              unified_tokenizer.TokenKind.EOS,
              unified_tokenizer.TokenMetadata(
                  start=unified_tokenizer.Position(
                      line=0, column=0),
                  end=unified_tokenizer.Position(
                      line=0, column=0))))
    else:
      start_line = 0
      start_column = 0
      for token in java_tokens:
        # The token kind is the subclass type of the token.
        token_type = type(token)
        if token_type not in JavaTokenizer._TOKEN_TYPE_MAP:
          raise ValueError(
              'Received Java token type %s, but it was unexpected, '
              'while tokenizing \n%s\n' % (token_type, source_code))

        # JavaTokenizer counts lines and columns from 1.
        start_line = token.position.line - 1
        start_column = token.position.column - 1

        # The tokenizer seems to take some liberties with Unicode, returning
        # invalid characters. This cleans spellings up.
        spelling = token.value.encode('utf-8', errors='replace').decode('utf-8')
        agnostic_tokens.append(
            unified_tokenizer.AbstractToken(
                spelling, JavaTokenizer._TOKEN_TYPE_MAP[token_type],
                unified_tokenizer.TokenMetadata(
                    start=unified_tokenizer.Position(
                        line=start_line, column=start_column))))

    # At this point, we have all the tokens, either as produced and abstracted,
    # or a placeholder error and eos in case of an exception. However, the
    # tokens only have start positions. Since the extended tokenizer guarantees
    # that tokens abut, we take a second pass, backwards, setting the end
    # position of a token from the start position of token following it. The
    # final token, `EOS` already has an end position, so we don't modify it.
    eos = agnostic_tokens[-1]
    if not eos.metadata.start:
      # This should be present. Raise an exception.
      raise AssertionError('The end of input token is missing positioning '
                           'information: %s' % eos)
    # EOS contains an empty spelling. We replace it here with EOS.name
    eos = dataclasses.replace(
        eos,
        spelling=unified_tokenizer.quote_special(
            unified_tokenizer.TokenKind.EOS.name))

    later_token_start: unified_tokenizer.Position = eos.metadata.start

    # The EOS token has an empty extent, so the end and the start are set to be
    # the same.
    filled_agnostic_tokens = [
        dataclasses.replace(
            eos,
            metadata=dataclasses.replace(eos.metadata, end=eos.metadata.start))
    ]
    # Go backwards, from the element before `eos` to the beginning.
    for token in (
        agnostic_tokens[i] for i in range(len(agnostic_tokens) - 2, -1, -1)):
      filled_token = dataclasses.replace(
          token,
          metadata=dataclasses.replace(token.metadata, end=later_token_start))
      filled_agnostic_tokens.append(filled_token)
      later_token_start = token.metadata.start

    # Now we have the tokens, including end position, but they're reversed.
    # The final step is to break down whitespace tokens into primitive
    # WHITESPACE tokens and NEWLINE tokens.
    with_broken_whitespace = []
    for token in filled_agnostic_tokens[::-1]:
      if token.kind is not unified_tokenizer.TokenKind.WHITESPACE:
        with_broken_whitespace.append(token)
      else:
        # This is whitespace. Replace it with primitive tokens.
        with_broken_whitespace.extend(
            unified_tokenizer.fill_range_with_whitespace(
                token.metadata.start, token.metadata.end))

    return with_broken_whitespace
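
The backward pass above derives each token's end position from the start position of the token that follows it, and gives the final token an empty extent. A stripped-down sketch of that idea over plain start-only positions, using only dataclasses (the Tok type and the '__EOS__' spelling are stand-ins, not the library's types):

import dataclasses
from typing import List, Optional, Tuple


@dataclasses.dataclass
class Tok:
    spelling: str
    start: Tuple[int, int]                # (line, column), 0-based.
    end: Optional[Tuple[int, int]] = None


def fill_end_positions(tokens: List[Tok]) -> List[Tok]:
    # Walk backwards: every token ends where its successor starts, and the
    # final token (here a stand-in EOS) gets an empty extent (end == start).
    filled = [dataclasses.replace(tokens[-1], end=tokens[-1].start)]
    later_start = tokens[-1].start
    for token in reversed(tokens[:-1]):
        filled.append(dataclasses.replace(token, end=later_start))
        later_start = token.start
    return filled[::-1]


toks = [Tok('class', (0, 0)), Tok(' ', (0, 5)),
        Tok('Foo', (0, 6)), Tok('__EOS__', (0, 9))]
for t in fill_end_positions(toks):
    print(repr(t.spelling), t.start, t.end)
# 'class' (0, 0) (0, 5) / ' ' (0, 5) (0, 6) / 'Foo' (0, 6) (0, 9)
# / '__EOS__' (0, 9) (0, 9)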