Exemplo n.º 1
0
    def __init__(self, code, extension):
        """Tokenize ``code`` and score it with the confidence checks.

        The vocabulary below ('$' + digits fields, ``NF``/``NR``/``FNR``
        variables, ``BEGIN``/``END`` keywords) looks like AWK; the enclosing
        class is not visible here, so treat that as an assumption.

        Args:
            code: Text passed to the tokenizer and to the line-length check.
            extension: Dialect selector; ``'gnu'`` adds the GNU-only
                built-in variable names to the recognized variables.
        """
        super().__init__()

        # Referenced only by the disabled 4-operand check near the end.
        operand_types = [
            'number', 'variable', 'regex', 'identifier', 'string'
        ]

        blank_builder = WhitespaceTokenBuilder()
        eol_builder = NewlineTokenBuilder()

        int_builder = IntegerTokenBuilder(None)
        int_exp_builder = IntegerExponentTokenBuilder(None)
        float_builder = RealTokenBuilder(False, False, None)
        float_exp_builder = RealExponentTokenBuilder(False, False, 'E', None)

        # '$' followed by decimal digits.
        field_builder = PrefixedIntegerTokenBuilder('$', False, '0123456789')

        builtin_names = [
            'ARGC', 'ARGV', 'ENVIRON', 'FILENAME', 'FS', 'NF', 'NR', 'FNR',
            'OFMT', 'OFS', 'ORS', 'RLENGTH', 'RS', 'RSTART', 'SUBSEP'
        ]
        if extension == 'gnu':
            builtin_names.extend([
                'ARGIND', 'BINMODE', 'ERRNO', 'FIELDWIDTHS', 'IGNORECASE',
                'LINT', 'PROCINFO', 'TEXTDOMAIN'
            ])
        builtin_builder = CaseSensitiveListTokenBuilder(
            builtin_names, 'variable', True)

        pattern_builder = RegexTokenBuilder()

        name_builder = IdentifierTokenBuilder('_', '_')

        text_builder = EscapedStringTokenBuilder(['"', "'", "’"], 0)

        comment_builder = LeadToEndOfLineTokenBuilder('#', False, 'comment')

        continuation_builder = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        terminator_builder = SingleCharacterTokenBuilder(
            ';', 'statement terminator', False)

        operator_texts = [
            '=', '+', '-', '*', '/', '%', '^', '++', '--',
            '==', '+=', '-=', '*=', '/=', '%=', '^=', '!=',
            '>', '>=', '<', '<=', '&&', '||',
            '|', '!', '?', ':', '~', '!~'
        ]

        self.unary_operators = ['+', '-', '!', '~', '++', '--']
        self.postfix_operators = ['++', '--']

        opening_groups = ['(', '[', ',', '{']
        middle_groups = [',']
        closing_groups = [')', ']', '}']

        group_builder = CaseInsensitiveListTokenBuilder(
            ['(', ')', ',', '[', ']', '{', '}'], 'group', False)

        operator_builder = CaseSensitiveListTokenBuilder(
            operator_texts, 'operator', False)

        reserved_words = [
            'BEGIN', 'END',
            'if', 'else', 'while', 'do', 'for', 'break', 'continue',
            'delete', 'next', 'nextfile', 'function', 'func', 'exit'
        ]
        reserved_builder = CaseSensitiveListTokenBuilder(
            reserved_words, 'keyword', False)

        fallback_builder = InvalidTokenBuilder()

        # Order is preserved exactly as the tokenizer receives it.
        builders = [
            eol_builder,
            blank_builder,
            continuation_builder,
            terminator_builder,
            int_builder,
            int_exp_builder,
            builtin_builder,
            field_builder,
            float_builder,
            float_exp_builder,
            reserved_builder,
            operator_builder,
            group_builder,
            pattern_builder,
            name_builder,
            text_builder,
            comment_builder,
            self.unknown_operator_tb,
            fallback_builder,
        ]

        raw = Tokenizer(builders).tokenize(code)
        raw = Examiner.combine_adjacent_identical_tokens(
            raw, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            raw, 'invalid')

        self.calc_statistics()

        # Work on the source tokens with continued lines joined.
        joined = Examiner.join_parens_continued_lines(self.source_tokens())
        joined = Examiner.join_operator_continued_lines(
            joined, self.postfix_operators)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        operator_count = self.count_my_tokens(
            ['operator', 'invalid operator'])
        if operator_count > 0:
            permitted_pairs = []
            self.calc_operator_confidence(operator_count)
            self.calc_operator_2_confidence(
                joined, operator_count, permitted_pairs)
            self.calc_operator_3_confidence(
                joined, operator_count, closing_groups, permitted_pairs)
            self.calc_operator_4_confidence(
                joined, operator_count, opening_groups, permitted_pairs)

        self.calc_group_confidence(joined, middle_groups)

        self.calc_operand_n_confidence(
            joined, ['number', 'variable', 'regex'], 2)
        # self.calc_operand_n_confidence(joined, operand_types, 4)

        self.calc_keyword_confidence()
        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Exemplo n.º 2
0
    def __init__(self, code):
        """Tokenize ``code`` with the Ruby token set and score it.

        Args:
            code: Text passed to the tokenizer and to the line-length check.
        """
        super().__init__()

        self.newlines_important = 'parens'

        # Referenced only by the disabled 4-operand check near the end.
        operand_types = [
            'number', 'identifier', 'symbol', 'string', 'regex', 'value'
        ]

        blank_builder = WhitespaceTokenBuilder()
        eol_builder = NewlineTokenBuilder()
        separator_builder = SingleCharacterTokenBuilder(
            ';', 'statement separator', False)

        int_builder = IntegerTokenBuilder('_')
        int_exp_builder = IntegerExponentTokenBuilder('_')
        float_builder = RealTokenBuilder(True, True, '_')
        float_exp_builder = RealExponentTokenBuilder(True, True, 'E', '_')

        name_builder = RubyIdentifierTokenBuilder()

        sym_builder = PrefixedIdentifierTokenBuilder(':', 'symbol', True)

        text_builder = EscapedStringTokenBuilder(['"', "'", "’"], 10)

        pattern_builder = RegexTokenBuilder()

        here_document_builder = HereDocTokenBuilder('<<-')

        comment_builder = LeadToEndOfLineTokenBuilder('#', False, 'comment')

        operator_texts = [
            '!', '~', '**', '*', '/', '%', '+', '-', '<<', '>>',
            '&', '|', '^', '<', '<=', '>', '>=', '==', '===', '!=',
            '=~', '!~', '<=>', '&&', '||', '..', '...', '?', ':', '=',
            '**=', '*=', '/=', '%=', '+=', '-=', '<<=', '>>=', '&&=', '&=',
            '||=', '|=', '^=', 'not', 'and', 'or', 'in',
            '.', '.:', '=>', '::', '<<-'
        ]
        operator_builder = CaseSensitiveListTokenBuilder(
            operator_texts, 'operator', False)

        self.unary_operators = ['+', '-', '!', '~', '&', '*', '**', '<<-']
        self.postfix_operators = ['++', '--']

        opening_groups = ['(', '[', ',', '{']
        middle_groups = [',']
        closing_groups = [')', ']', '}']

        group_builder = CaseInsensitiveListTokenBuilder(
            ['(', ')', ',', '[', ']', '{', '}'], 'group', False)

        reserved_words = [
            'BEGIN', 'END', 'alias', 'begin', 'break', 'case', 'class',
            'def', 'defined?', 'do', 'else', 'elsif', 'end', 'ensure',
            'for', 'if', 'module', 'next', 'redo', 'rescue', 'retry',
            'return', 'then', 'undef', 'unless', 'until', 'when', 'while',
            'yield'
        ]
        reserved_builder = CaseSensitiveListTokenBuilder(
            reserved_words, 'keyword', False)

        literal_builder = CaseSensitiveListTokenBuilder(
            ['nil', 'self', 'true', 'false', 'super'], 'value', True)

        # Percent-literal prefixes are emitted as identifiers.
        percent_builder = CaseSensitiveListTokenBuilder(
            ['%w', '%q', '%Q', '%i', '%s', '%x'], 'identifier', True)

        fallback_builder = InvalidTokenBuilder()

        # Order is preserved exactly as the tokenizer receives it.
        builders = [
            eol_builder,
            blank_builder,
            separator_builder,
            int_builder,
            int_exp_builder,
            float_builder,
            float_exp_builder,
            reserved_builder,
            literal_builder,
            sym_builder,
            operator_builder,
            group_builder,
            pattern_builder,
            name_builder,
            percent_builder,
            text_builder,
            here_document_builder,
            comment_builder,
            self.unknown_operator_tb,
            fallback_builder,
        ]

        raw = Tokenizer(builders).tokenize(code)
        raw = Examiner.combine_adjacent_identical_tokens(
            raw, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            raw, 'invalid')

        # Token reclassification passes run before statistics are taken.
        self.convert_bars_to_groups()
        self.convert_keywords_to_identifiers(['.'])
        self.convert_operators_to_identifiers()

        self.calc_statistics()

        joined = Examiner.join_parens_continued_lines(self.source_tokens())
        joined = Examiner.join_operator_continued_lines(
            joined, self.postfix_operators)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        operator_count = self.count_my_tokens(
            ['operator', 'invalid operator'])
        if operator_count > 0:
            permitted_pairs = []
            self.calc_operator_confidence(operator_count)
            self.calc_operator_2_confidence(
                joined, operator_count, permitted_pairs)
            self.calc_operator_3_confidence(
                joined, operator_count, closing_groups, permitted_pairs)
            self.calc_operator_4_confidence(
                joined, operator_count, opening_groups, permitted_pairs)

        self.calc_group_confidence(joined, middle_groups)

        self.calc_operand_n_confidence(
            joined, ['number', 'string', 'symbol'], 2)
        # self.calc_operand_n_confidence(joined, operand_types, 4)

        self.calc_keyword_confidence()

        # Block-opening keywords are paired against 'end'.
        self.calc_paired_blockers_confidence(
            ['begin', 'def', 'do', 'class', 'module'], ['end'])

        self.calc_line_length_confidence(code, self.max_expected_line)
Exemplo n.º 3
0
    def __init__(self, code):
        """Tokenize ``code`` with the R token set and score it.

        ``self.postfix_operators`` is read but not assigned here; it is
        presumably provided by the base-class initializer (not visible).

        Args:
            code: Text passed to the tokenizer and to the line-length check.
        """
        super().__init__()
        self.newlines_important = 'parens'

        operand_types = ['number', 'identifier', 'string', 'value']

        blank_builder = WhitespaceTokenBuilder()
        eol_builder = NewlineTokenBuilder()

        int_builder = IntegerTokenBuilder('_')
        int_exp_builder = IntegerExponentTokenBuilder('_')
        float_builder = RealTokenBuilder(False, False, '_')
        float_exp_builder = RealExponentTokenBuilder(False, False, 'E', '_')

        name_builder = IdentifierTokenBuilder('_', '_')

        # One quote list is shared by the plain and raw string builders.
        quote_chars = ['"', "'", "’", '`']
        text_builder = EscapedStringTokenBuilder(quote_chars, 10)
        raw_text_builder = PrefixedRawStringTokenBuilder(
            'r', True, quote_chars)

        comment_builder = LeadToEndOfLineTokenBuilder('#', True, 'comment')

        operator_texts = [
            '+', '-', '*', '/', '**', '^', '%%', '%/%', '%*%', '%in%',
            '<', '<=', '>', '>=', '==', '!=', '!', '|', '&', '||', '&&',
            '.', ':', '::', '[[', ']]', '@', '$',
            '=', '<-', '<<-', '->', '->>'
        ]

        self.unary_operators = ['+', '-', '!', '@', '.']

        operator_builder = CaseSensitiveListTokenBuilder(
            operator_texts, 'operator', False)
        separator_builder = SingleCharacterTokenBuilder(
            ';', 'statement separator', False)
        custom_operator_builder = ROperatorTokenBuilder()

        opening_groups = ['(', '[', ',', '{']
        middle_groups = [',']
        closing_groups = [')', ']', '}']

        group_builder = CaseInsensitiveListTokenBuilder(
            ['(', ')', ',', '[', ']', '{', '}'], 'group', False)

        reserved_words = [
            'if', 'else', 'repeat', 'while', 'function', 'for', 'in',
            'next', 'break', 'library', 'print', 'lapply', 'rep', 'list',
            'matrix', 'colnames', 'rownames', 'cbind', 'dim'
        ]
        reserved_builder = CaseSensitiveListTokenBuilder(
            reserved_words, 'keyword', False)

        literal_texts = [
            'TRUE', 'FALSE', 'NULL', 'Inf', 'NaN', 'NA',
            'NA_integer_', 'NA_real_', 'NA_complex_', 'NA_character_',
            '...'
        ]
        literal_builder = CaseSensitiveListTokenBuilder(
            literal_texts, 'value', True)

        fallback_builder = InvalidTokenBuilder()

        # Order is preserved exactly as the tokenizer receives it.
        builders = [
            eol_builder,
            blank_builder,
            separator_builder,
            int_builder,
            int_exp_builder,
            float_builder,
            float_exp_builder,
            reserved_builder,
            literal_builder,
            custom_operator_builder,
            operator_builder,
            group_builder,
            name_builder,
            text_builder,
            raw_text_builder,
            comment_builder,
            self.unknown_operator_tb,
            fallback_builder,
        ]

        raw = Tokenizer(builders).tokenize(code)
        raw = Examiner.combine_adjacent_identical_tokens(
            raw, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            raw, 'invalid')
        self.convert_keywords_to_identifiers(['<-', '.', '='])

        self.calc_statistics()

        joined = Examiner.join_parens_continued_lines(self.source_tokens())
        joined = Examiner.join_operator_continued_lines(
            joined, self.postfix_operators)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        operator_count = self.count_my_tokens(
            ['operator', 'invalid operator'])
        if operator_count > 0:
            permitted_pairs = []
            self.calc_operator_confidence(operator_count)
            self.calc_operator_2_confidence(
                joined, operator_count, permitted_pairs)
            self.calc_operator_3_confidence(
                joined, operator_count, closing_groups, permitted_pairs)
            self.calc_operator_4_confidence(
                joined, operator_count, opening_groups, permitted_pairs)

        self.calc_group_confidence(joined, middle_groups)

        pair_operand_types = [
            'number', 'string', 'identifier', 'variable', 'symbol'
        ]
        self.calc_operand_n_confidence(joined, pair_operand_types, 2)
        self.calc_operand_n_confidence(joined, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_line_length_confidence(code, self.max_expected_line)
Exemplo n.º 4
0
    def __init__(self, code):
        """Tokenize ``code`` and run the confidence calculations.

        The vocabulary below (decorators, ``r``/``b``/``u``/``f`` string
        prefixes, triple-quoted strings, ``def``/``elif``/``lambda``) looks
        like Python; the enclosing class is not visible here, so treat that
        as an assumption.

        Args:
            code: Text passed to the tokenizer and to the line-length check.
        """
        super().__init__()
        self.newlines_important = 'always'

        # Token groups accepted by the 4-operand confidence check below.
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()
        stmt_separator_tb = SingleCharacterTokenBuilder(
            ';', 'statement separator', False)

        integer_tb = IntegerTokenBuilder('_')
        integer_exponent_tb = IntegerExponentTokenBuilder('_')
        real_tb = RealTokenBuilder(False, False, '_')
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', '_')
        operand_types.append('number')

        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)
        operand_types.append('identifier')

        decorator_tb = PrefixedIdentifierTokenBuilder('@', 'decorator', False)

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        raw_string_tb = PrefixedRawStringTokenBuilder('r', True, quotes)
        byte_string_tb = PrefixedStringTokenBuilder('b', True, quotes)
        unicode_string_tb = PrefixedStringTokenBuilder('u', True, quotes)
        fast_string_tb = PrefixedStringTokenBuilder('f', True, quotes)
        operand_types.append('string')

        triple_quote_comment_tb = TripleQuoteStringTokenBuilder(quotes)
        raw_triple_quote_comment_tb = RawTripleQuoteCommentTokenBuilder()
        hash_comment_tb = LeadToEndOfLineTokenBuilder('#', True, 'comment')

        known_operators = [
            '+', '-', '*', '/', '%', '@', '=', ':=', '==', '>', '>=', '<',
            '<=', '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=',
            '&', '|', '~', '<<', '>>', '**', '.', ':', '++', '--', 'and', 'or',
            'in', 'is', 'not'
        ]

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        self.unary_operators = ['+', '-', 'not', '~', '++', '--', '.']

        self.postfix_operators = ['++', '--', ':']

        self.adjective_operators = ['not']

        self.keyword_postfix = [':']

        groupers = ['(', ')', ',', '[', ']', '{', '}']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        continuation_chars = ['\\']
        line_continuation_tb = CaseInsensitiveListTokenBuilder(
            continuation_chars, 'line continuation', False)

        # Each keyword appears once; a duplicated 'while' entry was removed.
        keywords = [
            'as', 'assert', 'break', 'case', 'class', 'continue', 'def', 'del',
            'elif', 'else', 'except', 'finally', 'for', 'from', 'global', 'if',
            'import', 'lambda', 'match', 'nonlocal', 'pass', 'print', 'raise',
            'return', 'try', 'while', 'with', 'yield'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        values = ['False', 'None', 'True']

        values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # The builders are handed to the tokenizer in this exact order.
        tokenbuilders = [
            newline_tb, whitespace_tb, line_continuation_tb, stmt_separator_tb,
            integer_tb, integer_exponent_tb, real_tb, real_exponent_tb,
            keyword_tb, values_tb, known_operator_tb, groupers_tb,
            identifier_tb, decorator_tb, string_tb, raw_string_tb,
            byte_string_tb, unicode_string_tb, fast_string_tb, hash_comment_tb,
            triple_quote_comment_tb, raw_triple_quote_comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        # Merge runs of adjacent invalid tokens before storing them.
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')

        self.calc_statistics()

        # The remaining checks use the source tokens with continued
        # lines joined.
        tokens = self.source_tokens()
        tokens = Examiner.join_parens_continued_lines(tokens)
        tokens = Examiner.join_operator_continued_lines(
            tokens, self.postfix_operators)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            # 'not in' is an operator pair the checks should accept.
            allow_pairs = [['not', 'in']]
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = [
            'number', 'string', 'identifier', 'variable', 'symbol'
        ]
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_line_format_confidence()
        self.calc_line_length_confidence(code, self.max_expected_line)
Exemplo n.º 5
0
    def __init__(self, code, block_comment_limit):
        """Tokenize ``code`` with the Julia token set and score it.

        Args:
            code: Text passed to the tokenizer and to the line-length check.
            block_comment_limit: Forwarded unchanged to the nested
                ``#= ... =#`` comment builder; its exact meaning is defined
                by that builder (not visible here).
        """
        super().__init__()
        self.newlines_important = 'parens'

        operand_types = [
            'number', 'identifier', 'symbol', 'attribute', 'string', 'type',
            'value'
        ]

        blank_builder = WhitespaceTokenBuilder()
        eol_builder = NewlineTokenBuilder()

        int_builder = IntegerTokenBuilder(None)
        int_exp_builder = IntegerExponentTokenBuilder(None)
        hex_builder = PrefixedIntegerTokenBuilder(
            '0x', False, '0123456789abcdefABCDEF')
        float_builder = RealTokenBuilder(False, False, None)
        float_exp_builder = RealExponentTokenBuilder(False, False, 'E', None)
        complex_builder = SuffixedRealTokenBuilder(
            False, False, ['im', 'cx'], True, None)

        name_builder = SuffixedIdentifierTokenBuilder('_', '_', '!')

        sym_builder = PrefixedIdentifierTokenBuilder(':', 'symbol', True)

        attr_builder = PrefixedIdentifierTokenBuilder('@', 'attribute', False)

        # A lone '$' is emitted as an identifier token.
        dollar_builder = SingleCharacterTokenBuilder('$', 'identifier', True)

        # One quote list is shared by all string builders.
        quote_chars = ['"', "'", "’"]
        text_builder = EscapedStringTokenBuilder(quote_chars, 0)
        raw_text_builder = PrefixedRawStringTokenBuilder(
            'raw', True, quote_chars)
        byte_text_builder = PrefixedStringTokenBuilder('b', True, quote_chars)
        triple_text_builder = TripleQuoteStringTokenBuilder(quote_chars)

        line_comment_builder = LeadToEndOfLineTokenBuilder(
            '#', True, 'comment')
        block_comment_builder = NestedCommentTokenBuilder(
            '#=', '=#', block_comment_limit)

        continuation_builder = SingleCharacterTokenBuilder(
            '\\', 'line continuation', False)
        terminator_builder = SingleCharacterTokenBuilder(
            ';', 'statement terminator', False)

        operator_texts = [
            'where', 'in', 'isa', '′', "'",
            '+', '-', '*', '/', '\\', '^', '%', '//',
            '<<', '>>', '<<<', '>>>', ':', '=', '==', '!=', '===', '!==',
            '+=', '-=', '*=', '/=', '^=', '%=',
            '<', '>', '<=', '>=', '~', '&', '|', '!', '&&', '||', '?', '.',
            '<:', '>:', '::', '->', '...', '..',
            '∀', '≤', '≥', '⊻', '⊽', '⊼'
        ]

        # Lower-case Greek letters, each emitted as an identifier.
        greek_builder = CaseSensitiveListTokenBuilder(
            [
                'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ',
                'ν', 'ξ', 'ο', 'π', 'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'χ', 'ψ',
                'ω'
            ],
            'identifier', True)

        self.unary_operators = [
            'isa', '+', '-', '~', '!', '.', ':', '::', "'", '<:', '>:',
            'in', '..'
        ]
        self.postfix_operators = ['...', '′']

        # No group-start list: the operator_4 check is disabled below.
        middle_groups = [',']
        closing_groups = [')', ']', '}']

        group_builder = CaseInsensitiveListTokenBuilder(
            ['(', ')', ',', '[', ']', '{', '}'], 'group', False)

        operator_builder = CaseSensitiveListTokenBuilder(
            operator_texts, 'operator', False)

        reserved_words = [
            'baremodule', 'begin', 'break', 'catch', 'const', 'continue',
            'do', 'else', 'elseif', 'end', 'export', 'finally', 'for',
            'function', 'global', 'if', 'import', 'let', 'local', 'macro',
            'module', 'quote', 'return', 'struct', 'try', 'using', 'while',
            'abstract', 'mutable', 'primitive', 'type'
        ]
        reserved_builder = CaseSensitiveListTokenBuilder(
            reserved_words, 'keyword', False)

        type_names = [
            'Int8', 'UInt8', 'Int16', 'UInt16', 'Int32', 'UInt32',
            'Int64', 'UInt64', 'Int128', 'UInt128',
            'Float16', 'Float32', 'Float64', 'Bool', 'Char'
        ]
        type_builder = CaseSensitiveListTokenBuilder(type_names, 'type', True)

        literal_builder = CaseSensitiveListTokenBuilder(
            ['false', 'true'], 'value', True)

        fallback_builder = InvalidTokenBuilder()

        # Order is preserved exactly as the tokenizer receives it.
        builders = [
            eol_builder,
            blank_builder,
            continuation_builder,
            terminator_builder,
            int_builder,
            int_exp_builder,
            hex_builder,
            float_builder,
            float_exp_builder,
            complex_builder,
            reserved_builder,
            type_builder,
            literal_builder,
            group_builder,
            operator_builder,
            name_builder,
            sym_builder,
            attr_builder,
            dollar_builder,
            greek_builder,
            text_builder,
            raw_text_builder,
            byte_text_builder,
            triple_text_builder,
            line_comment_builder,
            block_comment_builder,
            self.unknown_operator_tb,
            fallback_builder,
        ]

        raw = Tokenizer(builders).tokenize(code)
        raw = Examiner.combine_adjacent_identical_tokens(
            raw, 'invalid operator')
        raw = Examiner.combine_adjacent_identical_tokens(raw, 'invalid')
        self.tokens = JuliaExaminer.split_symbols_to_operators_identifiers(
            raw, closing_groups)
        self.convert_keywords_to_identifiers()

        self.calc_statistics()

        joined = Examiner.join_parens_continued_lines(self.source_tokens())
        joined = Examiner.join_operator_continued_lines(
            joined, self.postfix_operators)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        operator_count = self.count_my_tokens(
            ['operator', 'invalid operator'])
        if operator_count > 0:
            permitted_pairs = []
            self.calc_operator_confidence(operator_count)
            self.calc_operator_2_confidence(
                joined, operator_count, permitted_pairs)
            self.calc_operator_3_confidence(
                joined, operator_count, closing_groups, permitted_pairs)
            # calc_operator_4_confidence is intentionally not run here.

        self.calc_group_confidence(joined, middle_groups)

        self.calc_operand_confidence(
            joined, ['number', 'identifier', 'symbol'])
        self.calc_operand_n_confidence(joined, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)
Exemplo n.º 6
0
    def __init__(self, code):
        """Tokenize ``code`` with the Prolog token set and score it.

        ``self.postfix_operators`` is read but not assigned here; it is
        presumably provided by the base-class initializer (not visible).

        Args:
            code: Text passed to the tokenizer and to the line-length check.
        """
        super().__init__()
        self.newlines_important = 'parens'

        # Token groups accepted by the 4-operand confidence check below.
        operand_types = []

        whitespace_tb = WhitespaceTokenBuilder()
        newline_tb = NewlineTokenBuilder()
        stmt_separator_tb = SingleCharacterTokenBuilder(
            ';', 'statement separator', False)
        stmt_terminator_tb = SingleCharacterTokenBuilder(
            '.', 'statement terminator', False)

        integer_tb = IntegerTokenBuilder('_')
        integer_exponent_tb = IntegerExponentTokenBuilder('_')
        real_tb = RealTokenBuilder(False, False, '_')
        real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', '_')
        operand_types.append('number')

        variable_tb = PrologVariableTokenBuilder()
        operand_types.append('variable')

        # Identifiers are tokenized but not added to operand_types.
        leads = '_'
        extras = '_'
        identifier_tb = IdentifierTokenBuilder(leads, extras)

        quotes = ['"', "'", "’"]
        string_tb = EscapedStringTokenBuilder(quotes, 0)
        operand_types.append('string')

        comment_tb = LeadToEndOfLineTokenBuilder('%', True, 'comment')

        # '!' is emitted as an identifier token rather than an operator.
        special_symbols = ['!']
        special_symbol_tb = CaseSensitiveListTokenBuilder(
            special_symbols, 'identifier', True)

        # Each operator appears once; a repeated '+', '-' pair was removed.
        known_operators = [
            '-->', ':-', '?-', '|', '->', '*->', ':=', '\\+', '<', '=', '=..',
            '=@=', '\\=@=', '=:=', '=<', '==', '=\\=', '>', '>=', '@<', '@=<',
            '@>', '@>=', '\\=', '\\==', 'as', 'is', '>:<', ':<', ':', '+', '-',
            '/\\', '\\/', 'xor', '?', '*', '/', '//', 'div', 'rdiv', '<<',
            '>>', 'mod', 'rem', '**', '^', '\\', '$'
        ]

        self.unary_operators = ['+', '-', ':-', '\\', '\\+']

        known_operator_tb = CaseSensitiveListTokenBuilder(
            known_operators, 'operator', False)

        # '|' is listed both as an operator and as a group separator.
        groupers = ['(', ')', ',', '[', ']', '{', '}', '|']
        group_starts = ['(', '[', ',', '{']
        group_mids = [',', '|']
        group_ends = [')', ']', '}']

        groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

        keywords = [
            'dynamic', 'discontiguous', 'initialization', 'meta_predicate',
            'module_transparent', 'multifile', 'public', 'thread_local',
            'thread_initialization', 'volatile'
        ]

        keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

        values = ['(-)']

        value_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
        operand_types.append('value')

        invalid_token_builder = InvalidTokenBuilder()

        # The builders are handed to the tokenizer in this exact order.
        tokenbuilders = [
            whitespace_tb, newline_tb, stmt_separator_tb, stmt_terminator_tb,
            integer_tb, integer_exponent_tb, real_tb, real_exponent_tb,
            keyword_tb, known_operator_tb, special_symbol_tb, variable_tb,
            groupers_tb, identifier_tb, string_tb, value_tb, comment_tb,
            self.unknown_operator_tb, invalid_token_builder
        ]

        tokenizer = Tokenizer(tokenbuilders)
        tokens = tokenizer.tokenize(code)
        # Merge runs of adjacent invalid tokens before storing them.
        tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            tokens, 'invalid')

        self.calc_statistics()

        # The remaining checks use the source tokens with continued
        # lines joined.
        tokens = self.source_tokens()
        tokens = Examiner.join_parens_continued_lines(tokens)
        tokens = Examiner.join_operator_continued_lines(
            tokens, self.postfix_operators)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(tokens, num_operators, group_ends,
                                            allow_pairs)
            self.calc_operator_4_confidence(tokens, num_operators,
                                            group_starts, allow_pairs)

        self.calc_group_confidence(tokens, group_mids)

        operand_types_2 = [
            'number', 'string', 'identifier', 'variable', 'symbol'
        ]
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, operand_types, 4)

        # The keyword confidence check is disabled for this examiner.
        # self.calc_keyword_confidence()
        self.calc_line_length_confidence(code, self.max_expected_line)
Exemplo n.º 7
0
    def __init__(self, code):
        """Tokenize ``code`` with the Lua token set and score it.

        Args:
            code: Text passed to the tokenizer and to the line-length check.
        """
        super().__init__()
        self.newlines_important = 'parens'

        operand_types = ['number', 'identifier', 'string', 'value']

        blank_builder = WhitespaceTokenBuilder()
        eol_builder = NewlineTokenBuilder()

        # The numeric builders take an apostrophe as their extra argument.
        int_builder = IntegerTokenBuilder("'")
        int_exp_builder = IntegerExponentTokenBuilder("'")
        hex_builder = PrefixedIntegerTokenBuilder(
            '0x', False, '0123456789abcdefABCDEF')
        bin_builder = PrefixedIntegerTokenBuilder('0b', False, '01')
        float_builder = RealTokenBuilder(False, False, "'")
        float_exp_builder = RealExponentTokenBuilder(False, False, 'E', "'")

        name_builder = IdentifierTokenBuilder('_', '_')

        text_builder = EscapedStringTokenBuilder(['"', "'", "’"], 0)
        long_text_builder = DoubleBracketStringTokenBuilder()

        terminator_builder = SingleCharacterTokenBuilder(
            ';', 'statement terminator', False)

        operator_texts = [
            '+', '-', '*', '/', '^',
            '<', '>', '<=', '>=', '==', '~=', '=',
            '..', '.', '#', ':',
            'and', 'not', 'or'
        ]

        self.unary_operators = ['+', '-', '#', 'not']
        self.postfix_operators = []

        # No group-start list: the operator_4 check is disabled below.
        middle_groups = [',']
        closing_groups = [')', ']', '}']

        group_builder = CaseInsensitiveListTokenBuilder(
            ['(', ')', ',', '[', ']', '{', '}'], 'group', False)

        operator_builder = CaseSensitiveListTokenBuilder(
            operator_texts, 'operator', False)

        reserved_words = [
            'break', 'do', 'else', 'elseif', 'end', 'for', 'function',
            'if', 'in', 'local', 'repeat', 'return', 'then', 'until',
            'while'
        ]
        reserved_builder = CaseSensitiveListTokenBuilder(
            reserved_words, 'keyword', False)

        literal_builder = CaseSensitiveListTokenBuilder(
            ['false', 'true', 'nil', '...'], 'value', True)

        dash_comment_builder = LeadToEndOfLineTokenBuilder(
            '--', True, 'comment')
        long_comment_builder = LuaBlockCommentTokenBuilder()

        fallback_builder = InvalidTokenBuilder()

        # Order is preserved exactly as the tokenizer receives it.
        builders = [
            eol_builder,
            blank_builder,
            terminator_builder,
            int_builder,
            int_exp_builder,
            hex_builder,
            bin_builder,
            float_builder,
            float_exp_builder,
            reserved_builder,
            literal_builder,
            group_builder,
            operator_builder,
            name_builder,
            text_builder,
            long_text_builder,
            dash_comment_builder,
            long_comment_builder,
            self.unknown_operator_tb,
            fallback_builder,
        ]

        raw = Tokenizer(builders).tokenize(code)
        raw = Examiner.combine_adjacent_identical_tokens(
            raw, 'invalid operator')
        self.tokens = Examiner.combine_adjacent_identical_tokens(
            raw, 'invalid')

        self.calc_statistics()

        joined = Examiner.join_parens_continued_lines(self.source_tokens())
        joined = Examiner.join_operator_continued_lines(
            joined, self.postfix_operators)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        operator_count = self.count_my_tokens(
            ['operator', 'invalid operator'])
        if operator_count > 0:
            permitted_pairs = []
            self.calc_operator_confidence(operator_count)
            self.calc_operator_2_confidence(
                joined, operator_count, permitted_pairs)
            self.calc_operator_3_confidence(
                joined, operator_count, closing_groups, permitted_pairs)
            # calc_operator_4_confidence is intentionally not run here.

        self.calc_group_confidence(joined, middle_groups)

        self.calc_operand_n_confidence(joined, ['number', 'identifier'], 2)
        self.calc_operand_n_confidence(joined, operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)