def test_empty_list_on_empty_data(self):
        """Test Tokenizer works on empty list"""
        tokenizer = AntlrTokenizer(JavaScriptLexer)
        data = ""

        tokens = tokenizer.tokenize(data)

        self.assertEqual(tokens, [])

    def test_tokenizes_malformed_without_error(self):
        """Test that the tokenizer does not error on garbage input."""
        tokenizer = AntlrTokenizer(JavaScriptLexer)
        txt = "aasdfj1  1jhsdf9 1 3@ 1 + => adj 193"

        tokens = tokenizer.tokenize(txt)
        self.assertEqual(tokens, [
            'aasdfj1', '  ', '1', 'jhsdf9', ' ', '1', ' ', '3', '@', ' ', '1',
            ' ', '+', ' ', '=>', ' ', 'adj', ' ', '193'
        ])

    def test_combine_same_as_orig(self):
        """Test that combining the tokens reproduces the original text."""
        tokenizer = AntlrTokenizer(JavaScriptLexer)
        txt = """async function process(array) {
          for await (let i of array) {
              doSomething(i);
            }
          }"""

        tokens = tokenizer.tokenize(txt)

        self.assertEqual(tokenizer.combine(tokens), txt)
Example #4
    def run(data,
            thread_count=minimizer.DEFAULT_THREAD_COUNT,
            file_extension=".js"):
        """Attempt to minimize a javascript test case."""
        line_minimizer = delta_minimizer.DeltaMinimizer(
            utils.test,
            max_threads=thread_count,
            file_extension=file_extension)

        js_tokenizer = AntlrTokenizer(JavaScriptLexer)

        js_minimizer = JSMinimizer(
            utils.test,
            max_threads=thread_count,
            tokenizer=js_tokenizer.tokenize,
            token_combiner=js_tokenizer.combine,
            file_extension=file_extension,
        )

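        # Minimize line-by-line first to filter out noise, run the token-level
        # JS minimizer twice, then do a final line-by-line cleanup pass.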
        result = line_minimizer.minimize(data)
        result = js_minimizer.minimize(result)
        result = js_minimizer.minimize(result)
        result = line_minimizer.minimize(result)

        return result

    def test_tokenize_simple_js_file(self):
        """Test the tokenizer on a simple JavaScript sample."""
        tokenizer = AntlrTokenizer(JavaScriptLexer)
        txt = """async function process(array) {
          for await (let i of array) {
              doSomething(i);
            }
          }"""

        tokens = tokenizer.tokenize(txt)
        self.assertEqual(tokens, [
            'async', ' ', 'function', ' ', 'process', '(', 'array', ')', ' ',
            '{', '\n', '          ', 'for', ' ', 'await', ' ', '(', 'let', ' ',
            'i', ' ', 'of', ' ', 'array', ')', ' ', '{', '\n',
            '              ', 'doSomething', '(', 'i', ')', ';', '\n',
            '            ', '}', '\n', '          ', '}'
        ])
Example #6
class HTMLMinimizer(minimizer.Minimizer):  # pylint:disable=abstract-method
    """Specialized HTML minimizer.

     Note that this will not work properly with normal tokenizers. It simply
     acts as a wrapper around other minimizers and passes pieces of the HTML
     file to those."""
    class Token(object):
        """Helper class to represent a single token."""
        TYPE_HTML = 0
        TYPE_SCRIPT = 1

        def __init__(self, data, token_type):
            self.data = data
            self.token_type = token_type

        def __str__(self):
            return self.data

    class TokenizerState(object):
        """Enum for tokenizer states."""
        SEARCHING_FOR_SCRIPT = 0
        SEARCHING_FOR_TAG_END = 1
        SEARCHING_FOR_CLOSE_SCRIPT = 2

    HTML_TOKENIZER = AntlrTokenizer(HTMLLexer).tokenize
    JS_TOKENIZER = AntlrTokenizer(JavaScriptLexer).tokenize

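    # Each tokenizer in these lists corresponds to one minimization pass for
    # its token type; for HTML tokens, CHUNK_SIZES[level] below provides the
    # chunk sizes for the pass at the same index.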
    TOKENIZER_MAP = {
        Token.TYPE_HTML: [HTML_TOKENIZER, HTML_TOKENIZER, HTML_TOKENIZER],
        Token.TYPE_SCRIPT: [JS_TOKENIZER, JS_TOKENIZER],
    }

    CHUNK_SIZES = [
        [400, 100, 20, 5],
        [400, 100, 20, 5, 2],
        [400, 100, 20, 5, 1],
    ]

    def __init__(self, test_function, *args, **kwargs):
        # The HTML minimizer will not be used directly. Instead, preserve its
        # arguments and pass them along when creating subminimizers.
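        # A placeholder test function is passed to the base class here; the
        # real test function is handed to the sub-minimizers created in
        # minimize() below.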
        super(HTMLMinimizer, self).__init__(lambda: False)

        assert not args, 'Positional arguments not supported.'
        assert 'tokenizer' not in kwargs, 'Custom tokenizers not supported.'
        assert 'token_combiner' not in kwargs, (
            'Custom token combiners not supported.')

        self.test_function = test_function
        self.kwargs = kwargs

    def minimize(self, data):
        """Wrapper to perform common tasks and call |_execute|."""
        # Do an initial line-by-line minimization to filter out noise.
        line_minimizer = delta_minimizer.DeltaMinimizer(
            self.test_function, **self.kwargs)
        # Do two line minimizations to make up for the fact that minimizations
        # on bots don't always minimize as much as they can.
        for _ in range(2):
            data = line_minimizer.minimize(data)

        tokens = self.get_tokens_and_metadata(data)
        for index, token in enumerate(tokens):
            current_tokenizers = self.TOKENIZER_MAP[token.token_type]
            prefix = self.combine_tokens(tokens[:index])
            suffix = self.combine_tokens(tokens[index + 1:])
            # We need to preserve the parts of the test case that are not
            # currently being minimized. Create a special token combiner that
            # adds these portions of the test to the combined tokens.
            token_combiner = functools.partial(self.combine_worker_tokens,
                                               prefix=prefix,
                                               suffix=suffix)

            for level, current_tokenizer in enumerate(current_tokenizers):
                if token.token_type == HTMLMinimizer.Token.TYPE_HTML:
                    current_minimizer = chunk_minimizer.ChunkMinimizer(
                        self.test_function,
                        chunk_sizes=HTMLMinimizer.CHUNK_SIZES[level],
                        token_combiner=token_combiner,
                        tokenizer=current_tokenizer,
                        **self.kwargs)
                else:
                    current_minimizer = js_minimizer.JSMinimizer(
                        self.test_function,
                        token_combiner=token_combiner,
                        tokenizer=current_tokenizer,
                        **self.kwargs)

                result_data = current_minimizer.minimize(token.data)
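                # The worker minimizer sees prefix + token + suffix as a single
                # test case, so strip the unchanged context to recover just
                # this token's minimized data.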
                start = len(prefix)
                end = len(result_data) - len(suffix)
                token.data = result_data[start:end]

        # TODO(mbarbella): Remove this once other minimizers are improved.
        # Do a final line-by-line minimization pass.
        data = self.combine_tokens(tokens)
        return line_minimizer.minimize(data)

    @staticmethod
    def get_tokens_and_metadata(data):
        """Get the token list with associated metadata."""
        tokens = []
        state = HTMLMinimizer.TokenizerState.SEARCHING_FOR_SCRIPT
        current_token_start = 0
        current_token_type = HTMLMinimizer.Token.TYPE_HTML
        index = 0

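        # Walk the data, alternating between HTML and script sections; the
        # loop ends once a find() call fails and index goes negative.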
        while 0 <= index < len(data):
            if state == HTMLMinimizer.TokenizerState.SEARCHING_FOR_SCRIPT:
                # In this case, we are simply searching for the next script tag.
                index = data.find(SCRIPT_START_STRING, index)
                state = HTMLMinimizer.TokenizerState.SEARCHING_FOR_TAG_END

            elif state == HTMLMinimizer.TokenizerState.SEARCHING_FOR_TAG_END:
                # Make sure that this really looks like a script tag.
                next_newline = data.find('\n', index)
                tag_end = data.find('>', index)
                if 0 <= tag_end < next_newline or next_newline < 0 <= tag_end:
                    # The end of the script tag is before the next newline, so it should
                    # be safe to attempt to split this.
                    index = tag_end + 1
                    token = HTMLMinimizer.Token(
                        data[current_token_start:index], current_token_type)
                    tokens.append(token)

                    # Update state.
                    current_token_type = HTMLMinimizer.Token.TYPE_SCRIPT
                    current_token_start = index
                    state = HTMLMinimizer.TokenizerState.SEARCHING_FOR_CLOSE_SCRIPT
                else:
                    # We found a newline before the end of tag or did not find the end
                    # of the tag, so something seems wrong. Skip this one.
                    index += len(SCRIPT_START_STRING)

            elif state == HTMLMinimizer.TokenizerState.SEARCHING_FOR_CLOSE_SCRIPT:
                # Simply look for the end of this script.
                index = data.find(SCRIPT_END_STRING, index)
                if index < 0:
                    break

                # TODO(mbarbella): Optimize for empty script case (e.g. for "src=").
                token = HTMLMinimizer.Token(data[current_token_start:index],
                                            current_token_type)
                tokens.append(token)

                current_token_start = index
                current_token_type = HTMLMinimizer.Token.TYPE_HTML
                state = HTMLMinimizer.TokenizerState.SEARCHING_FOR_SCRIPT

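        # Whatever remains after the last transition (or the whole input if no
        # script tag was ever found) becomes the final token.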
        token = HTMLMinimizer.Token(data[current_token_start:],
                                    current_token_type)
        tokens.append(token)
        return tokens

    @staticmethod
    def combine_worker_tokens(tokens, prefix='', suffix=''):
        """Combine tokens for a worker minimizer."""
        return '%s%s%s' % (prefix, ''.join(tokens), suffix)

    @staticmethod
    def combine_tokens(tokens):
        """Combine tokens into a usable format, stripping metadata."""
        return ''.join([str(t) for t in tokens])

    @staticmethod
    def run(data,
            thread_count=minimizer.DEFAULT_THREAD_COUNT,
            file_extension='.html'):
        """Attempt to minimize an html test case."""
        html_minimizer = HTMLMinimizer(utils.test,
                                       max_threads=thread_count,
                                       file_extension=file_extension)
        return html_minimizer.minimize(data)
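
For illustration, here is a minimal, hypothetical sketch of the splitting behaviour the class above relies on; the sample markup is made up, and it assumes the module-level SCRIPT_START_STRING and SCRIPT_END_STRING constants (not shown here) are the ordinary '<script' and '</script>' markers.

# Hypothetical usage sketch, not part of the original module: split a made-up
# page into alternating HTML/script tokens, then check that combining the
# pieces restores the original input.
sample_page = ('<html><body>\n'
               '<script>\n'
               'doSomething(1);\n'
               '</script>\n'
               '</body></html>\n')

pieces = HTMLMinimizer.get_tokens_and_metadata(sample_page)
for piece in pieces:
    kind = ('SCRIPT' if piece.token_type == HTMLMinimizer.Token.TYPE_SCRIPT
            else 'HTML')
    print(kind, repr(piece.data))

# Dropping the metadata and joining the pieces reproduces the original page.
assert HTMLMinimizer.combine_tokens(pieces) == sample_page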
Example #7
def tokenize(data):
  """HTML tokenizer."""
  return AntlrTokenizer(HTMLLexer).tokenize(data)
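
As a quick, hypothetical usage note (the sample markup below is an assumption), the tokenize/combine round trip demonstrated for JavaScript earlier on this page is expected to hold for the HTML lexer as well:

# Hypothetical usage, mirroring the JavaScript round-trip test above; the
# sample markup is made up for illustration.
html_tokenizer = AntlrTokenizer(HTMLLexer)
sample = '<html>\n  <body onload="go()">hi</body>\n</html>'

html_tokens = html_tokenizer.tokenize(sample)
assert html_tokenizer.combine(html_tokens) == sample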