def __escape_z__():
    """Call ``__escape_z__`` on every token-builder class listed below.

    The classes are visited in the order given.

    Returns:
        The literal string ``'Escape ?Z'``.
    """
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        EscapedStringTokenBuilder,
        PrefixedStringTokenBuilder,
        SuffixedStringTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        PrefixedIntegerTokenBuilder,
        SuffixedIntegerTokenBuilder,
        RealTokenBuilder,
        RealExponentTokenBuilder,
        SuffixedRealTokenBuilder,
        IdentifierTokenBuilder,
        PrefixedIdentifierTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        SingleCharacterTokenBuilder,
        SlashSlashCommentTokenBuilder,
        SlashStarCommentTokenBuilder,
        ClassTypeTokenBuilder,
        HexRealExponentTokenBuilder,
        NestedCommentTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Call ``__escape_z__`` on every token-builder class listed below.

    The classes are visited in the order given.

    Returns:
        The literal string ``'Escape ?Z'``.
    """
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        EscapedStringTokenBuilder,
        PrefixedStringTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        PrefixedIntegerTokenBuilder,
        SuffixedIntegerTokenBuilder,
        RealTokenBuilder,
        AssemblyCommentTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Call ``__escape_z__`` on every token-builder class listed below.

    The classes are visited in the order given.

    Returns:
        The literal string ``'Escape ?Z'``.
    """
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        StringTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        RealTokenBuilder,
        RealExponentTokenBuilder,
        IdentifierTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        SingleCharacterTokenBuilder,
        SuffixedIntegerTokenBuilder,
        BlockTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Call ``__escape_z__`` on every token-builder class listed below.

    The classes are visited in the order given.

    Returns:
        The literal string ``'Escape ?Z'``.
    """
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        EscapedStringTokenBuilder,
        PrefixedStringTokenBuilder,
        IntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        PrefixedIntegerTokenBuilder,
        SuffixedIntegerTokenBuilder,
        RealTokenBuilder,
        IdentifierTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        LeadToEndOfLineTokenBuilder,
        SingleCharacterTokenBuilder,
        LabelTokenBuilder,
        AssemblyCommentTokenBuilder,
        MultilineCommentTokenBuilder,
        HashQuoteCharTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __escape_z__():
    """Call ``__escape_z__`` on every token-builder class listed below.

    The classes are visited in the order given.

    Returns:
        The literal string ``'Escape ?Z'``.
    """
    builder_classes = (
        InvalidTokenBuilder,
        WhitespaceTokenBuilder,
        NewlineTokenBuilder,
        StuffedQuoteStringTokenBuilder,
        IntegerTokenBuilder,
        SuffixedIntegerTokenBuilder,
        IntegerExponentTokenBuilder,
        RealTokenBuilder,
        SuffixedRealTokenBuilder,
        RealExponentTokenBuilder,
        CaseInsensitiveListTokenBuilder,
        CaseSensitiveListTokenBuilder,
        SingleCharacterTokenBuilder,
        PrefixedIntegerTokenBuilder,
        LeadToEndOfLineTokenBuilder,
        NullTokenBuilder,
        BasicVariableTokenBuilder,
        BasicLongVariableTokenBuilder,
        RemarkTokenBuilder,
        UserFunctionTokenBuilder,
        LongUserFunctionTokenBuilder,
        HardwareFunctionTokenBuilder,
    )
    for builder_class in builder_classes:
        builder_class.__escape_z__()
    return 'Escape ?Z'
def __init__(self, code):
    """Tokenize ``code`` and run the statistics and confidence checks.

    Args:
        code: Source text handed to the tokenizer and to the
            line-length check.
    """
    super().__init__()

    # Token groups treated as operands by the 4-operand check at the end.
    operand_groups = ['number', 'identifier', 'string', 'regex', 'value']

    # --- whitespace and line structure ---
    space_builder = WhitespaceTokenBuilder()
    eol_builder = NewlineTokenBuilder()

    # --- numeric literals ---
    int_builder = IntegerTokenBuilder(None)
    int_exp_builder = IntegerExponentTokenBuilder(None)
    bigint_builder = SuffixedIntegerTokenBuilder(['n', 'N'], False, '_')
    float_builder = RealTokenBuilder(False, False, None)
    float_exp_builder = RealExponentTokenBuilder(False, False, 'E', None)
    hex_builder = PrefixedIntegerTokenBuilder(
        '0X', False, '0123456789ABCDEFabcdef')
    octal_builder = PrefixedIntegerTokenBuilder('0O', False, '01234567')
    binary_builder = PrefixedIntegerTokenBuilder('0B', False, '01')

    # --- identifiers ---
    name_builder = IdentifierTokenBuilder('_', '_')

    # --- strings, template strings and regexes ---
    quote_chars = ['"', "'", "’"]
    str_builder = EscapedStringTokenBuilder(quote_chars, 0)
    template_builder = EscapedStringTokenBuilder(['`'], 10)
    regex_builder = RegexTokenBuilder()

    # --- comments ---
    line_comment_builder = SlashSlashCommentTokenBuilder()
    block_comment_builder = SlashStarCommentTokenBuilder()

    terminator_builder = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    # --- operators ---
    operator_list = [
        '+', '-', '*', '/', '%',
        '=', '==', '!=', '===', '!==', '>', '>=', '<', '<=',
        '+=', '-=', '*=', '/=', '%=', '**=', '&=', '|=', '^=',
        '<<=', '>>=',
        '!', '!!', '&', '|', '~', '<<', '>>', '>>>', '>>>=',
        '^', '**',
        '.', ':',
        '++', '--', '&&', '||',
        '?', '?.',
        'new', 'delete'
    ]
    operator_builder = CaseSensitiveListTokenBuilder(
        operator_list, 'operator', False)

    self.unary_operators = [
        '+', '-',
        '!', '!!', '~',
        '++', '--', ':',
        'new', 'delete'
    ]
    self.postfix_operators = ['++', '--', ':']

    # --- grouping symbols ---
    group_symbols = ['(', ')', ',', '[', ']', '{', '}']
    mid_symbols = [',']
    end_symbols = [')', ']', '}']
    group_builder = CaseInsensitiveListTokenBuilder(
        group_symbols, 'group', False)

    # --- keywords and literal values ---
    keyword_list = [
        'abstract', 'break', 'case', 'catch', 'class', 'const',
        'continue', 'debugger', 'default', 'do', 'else', 'export',
        'extends', 'final', 'finally', 'for', 'function', 'goto',
        'if', 'import', 'in', 'instanceof', 'let', 'native', 'new',
        'return', 'switch', 'synchronized', 'throw', 'throws',
        'transient', 'try', 'typeof', 'var', 'void', 'volatile',
        'while', 'with', 'yield'
    ]
    kw_builder = CaseSensitiveListTokenBuilder(
        keyword_list, 'keyword', False)

    value_list = ['this', 'super', 'null', 'true', 'false']
    value_builder = CaseSensitiveListTokenBuilder(value_list, 'value', True)

    fallback_builder = InvalidTokenBuilder()

    # The builder order below is the order in which they are offered to
    # the tokenizer; it is kept exactly as in the original list.
    all_builders = [
        space_builder,
        eol_builder,
        terminator_builder,
        int_builder,
        int_exp_builder,
        bigint_builder,
        float_builder,
        float_exp_builder,
        hex_builder,
        octal_builder,
        binary_builder,
        kw_builder,
        value_builder,
        operator_builder,
        group_builder,
        regex_builder,
        name_builder,
        str_builder,
        template_builder,
        line_comment_builder,
        block_comment_builder,
        self.unknown_operator_tb,
        fallback_builder
    ]

    raw_tokens = Tokenizer(all_builders).tokenize(code)
    for group_name in ('invalid operator', 'invalid'):
        raw_tokens = Examiner.combine_adjacent_identical_tokens(
            raw_tokens, group_name)
    self.tokens = raw_tokens
    self.convert_identifiers_to_labels()

    self.calc_statistics()

    # The confidence checks work on the joined source tokens rather than
    # on the raw token list stored in self.tokens.
    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence([';'])

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    # NOTE(review): block extent reconstructed from a whitespace-collapsed
    # source; the three operator checks are assumed to be the whole branch.
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        permitted_pairs = []
        self.calc_operator_2_confidence(
            joined, operator_count, permitted_pairs)
        self.calc_operator_3_confidence(
            joined, operator_count, end_symbols, permitted_pairs)

    self.calc_group_confidence(joined, mid_symbols)

    self.calc_operand_n_confidence(
        joined, ['number', 'string', 'symbol'], 2)
    self.calc_operand_n_confidence(joined, operand_groups, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, tab_size):
    """Tokenize assembly ``code`` two ways and keep the better result.

    The code is tokenized once as free-format text and once with
    ``Tokenizer.tokenize_asm_code`` (space-format).  Statistics,
    confidences and errors are computed for each; the space-format
    result is kept only when the product of its confidence factors is
    strictly greater than the free-format product.

    Args:
        code: Source text to examine.
        tab_size: Passed through to ``Tokenizer.tokenize_asm_code``.
    """
    super().__init__()

    def confidence_product(factors):
        # Multiply all confidence factors together, starting from 1.0.
        product = 1.0
        for factor in factors.values():
            product *= factor
        return product

    operand_groups = ['number', 'identifier', 'string', 'value']

    space_builder = WhitespaceTokenBuilder()
    eol_builder = NewlineTokenBuilder()

    # --- numeric literals ---
    hex_digits = '0123456789abcdefABCDEF'
    int_builder = IntegerTokenBuilder("'")
    int_exp_builder = IntegerExponentTokenBuilder("'")
    float_builder = RealTokenBuilder(True, True, None)
    hex_dollar_builder = PrefixedIntegerTokenBuilder('$', False, hex_digits)
    hex_hash_builder = PrefixedIntegerTokenBuilder('#$', False, hex_digits)
    hex_amp_builder = PrefixedIntegerTokenBuilder('&', False, hex_digits)
    hex_h_builder = SuffixedIntegerTokenBuilder(
        ['h'], False, 'abcdefABCDEF')
    bin_builder = PrefixedIntegerTokenBuilder('0b', False, '01')
    suffix_int_builder = SuffixedIntegerTokenBuilder(
        ['Q', 'A', 'O', 'D', 'B'], False, None)

    # --- identifiers ---
    name_chars = '_$#.'
    name_builder = IbmAsmIdentifierTokenBuilder(name_chars, name_chars)

    # --- strings (one quote list shared by all three builders) ---
    quote_chars = ['"', "'", "’"]
    str_builder = EscapedStringTokenBuilder(quote_chars, 0)
    hex_str_builder = PrefixedStringTokenBuilder('X', False, quote_chars)
    char_str_builder = PrefixedStringTokenBuilder('C', False, quote_chars)

    # --- operators and groupers ---
    operator_list = ['+', '-', '*', '/', '=', '&', '#', '?']
    self.unary_operators = ['+', '-', '=', '&', '#', '?']
    self.postfix_operators = []

    group_symbols = ['(', ')', ',', '[', ']', '{', '}', ':', '<', '>']
    start_symbols = ['(', '[', ',', '{', '<']
    end_symbols = [')', ']', '}', '>']
    mid_symbols = [',']

    group_builder = CaseInsensitiveListTokenBuilder(
        group_symbols, 'group', False)
    operator_builder = CaseSensitiveListTokenBuilder(
        operator_list, 'operator', False)

    value_builder = CaseSensitiveListTokenBuilder(['*'], 'value', True)

    # --- comments and to-end-of-line directives ---
    remark_builder = AssemblyCommentTokenBuilder(';*')
    title_builder = LeadToEndOfLineTokenBuilder(
        'TITLE', False, 'directive')
    subtitle_builder = LeadToEndOfLineTokenBuilder(
        'SUBTTL', False, 'directive')
    include_builder = LeadToEndOfLineTokenBuilder(
        'INCLUDE', False, 'directive')

    fallback_builder = InvalidTokenBuilder()

    free_builders = [
        eol_builder,
        space_builder,
        int_builder,
        int_exp_builder,
        hex_dollar_builder,
        hex_hash_builder,
        hex_amp_builder,
        hex_h_builder,
        bin_builder,
        suffix_int_builder,
        float_builder,
        value_builder,
        group_builder,
        operator_builder,
        title_builder,
        subtitle_builder,
        include_builder,
        name_builder,
        str_builder,
        hex_str_builder,
        char_str_builder,
        remark_builder,
        self.unknown_operator_tb,
        fallback_builder
    ]

    opcode_builders = [name_builder, fallback_builder]

    argument_builders = [
        int_builder,
        int_exp_builder,
        hex_dollar_builder,
        hex_hash_builder,
        hex_amp_builder,
        hex_h_builder,
        bin_builder,
        suffix_int_builder,
        float_builder,
        value_builder,
        group_builder,
        operator_builder,
        name_builder,
        str_builder,
        hex_str_builder,
        char_str_builder,
        remark_builder,
        self.unknown_operator_tb,
        fallback_builder
    ]

    free_tokenizer = Tokenizer(free_builders)
    opcode_tokenizer = Tokenizer(opcode_builders)
    argument_tokenizer = Tokenizer(argument_builders)

    # ---- pass 1: free-format ----
    free_tokens = free_tokenizer.tokenize(code)
    for group_name in ('invalid operator', 'invalid'):
        free_tokens = Examiner.combine_adjacent_identical_tokens(
            free_tokens, group_name)
    free_tokens = Examiner.convert_values_to_operators(
        free_tokens, operator_list)
    self.tokens = free_tokens
    self.convert_asm_identifiers_to_labels()

    self.calc_statistics()
    free_statistics = self.statistics
    self.statistics = {}

    self.calc_confidences(
        operand_groups, start_symbols, mid_symbols, end_symbols, None)
    self.calc_line_length_confidence(code, self.max_expected_line)
    free_confidences = self.confidences
    self.confidences = {}
    free_errors = self.errors
    self.errors = []

    # ---- pass 2: space-format ----
    space_tokens, indents = Tokenizer.tokenize_asm_code(
        code, tab_size,
        opcode_tokenizer, '.&=,()+-*/',
        argument_tokenizer,
        '.&$@',    # label lead characters
        '.&$#@',   # label middle characters
        ':,',      # label end characters
        '*;!',     # comment lead characters
        '',        # line-comment lead characters
        False)     # use_line_id
    for group_name in ('invalid operator', 'invalid'):
        space_tokens = Examiner.combine_adjacent_identical_tokens(
            space_tokens, group_name)
    space_tokens = Examiner.combine_identifier_colon(
        space_tokens, ['newline'], [], [])
    space_tokens = Tokenizer.combine_number_and_adjacent_identifier(
        space_tokens)
    space_tokens = Examiner.convert_values_to_operators(
        space_tokens, operator_list)
    self.tokens = space_tokens
    self.convert_asm_identifiers_to_labels()

    self.calc_statistics()
    space_statistics = self.statistics
    self.statistics = {}

    self.calc_confidences(
        operand_groups, start_symbols, mid_symbols, end_symbols, indents)
    self.calc_line_length_confidence(code, self.max_expected_line)
    space_confidences = self.confidences
    self.confidences = {}
    space_errors = self.errors
    self.errors = []

    # ---- keep whichever pass scored higher (ties go to free-format) ----
    free_score = confidence_product(free_confidences)
    space_score = confidence_product(space_confidences)

    if space_score > free_score:
        winner = (space_tokens, space_statistics,
                  space_confidences, space_errors)
    else:
        winner = (free_tokens, free_statistics,
                  free_confidences, free_errors)

    self.tokens, self.statistics, self.confidences, self.errors = winner
def __init__(self, code, tab_size, processor):
    """Tokenize IBM assembly ``code`` two ways and keep the better result.

    The code is tokenized once as free-format text and once with
    ``Tokenizer.tokenize_asm_code`` (space-format).  Statistics,
    confidences and errors are computed for each; the space-format
    result is kept only when the product of its confidence factors is
    strictly greater than the free-format product.

    Args:
        code: Source text to examine.
        tab_size: Passed through to ``Tokenizer.tokenize_asm_code``.
        processor: One of ``'360'``, ``'370'``, ``'390'`` or
            ``'system-z'``.  Each value enables its own opcode list plus
            those of the earlier entries; any other value leaves the
            opcode list empty.
    """
    super().__init__()
    self.newlines_important = 'always'

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    real_tb = RealTokenBuilder(True, True, None)
    hex_integer_1_tb = PrefixedIntegerTokenBuilder(
        '$', False, '0123456789abcdefABCDEF')
    hex_integer_2_tb = PrefixedIntegerTokenBuilder(
        '#$', False, '0123456789abcdefABCDEF')
    hex_integer_3_tb = PrefixedIntegerTokenBuilder(
        '&', False, '0123456789abcdefABCDEF')
    hex_integer_h_tb = SuffixedIntegerTokenBuilder(
        ['h'], False, 'abcdefABCDEF')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
    suffixed_integer_tb = SuffixedIntegerTokenBuilder(
        ['Q', 'A', 'O', 'D', 'B'], False, None)
    operand_types.append('number')

    leads = '$#.@&'
    extras = '$#.@&'
    identifier_tb = IbmAsmIdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    hex_string_tb = PrefixedStringTokenBuilder('X', False, quotes)
    char_string_tb = PrefixedStringTokenBuilder('C', False, quotes)
    operand_types.append('string')

    known_operators = [
        '+', '-', '*', '/', '=', '&', '#', '?', "'"
    ]

    self.unary_operators = [
        '+', '-', '=', '&', '#', '?', "'"
    ]

    self.postfix_operators = []

    groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '<', '>']
    group_starts = ['(', '[', ',', '{', '<']
    group_ends = [')', ']', '}', '>']
    group_mids = [',']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)

    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    preprocessors = [
        'MACRO', 'MEND'
    ]

    # Group name was previously misspelled 'preprocesssor', which put
    # MACRO/MEND in a group no other code refers to.
    preprocessor_tb = CaseInsensitiveListTokenBuilder(
        preprocessors, 'preprocessor', False)

    directives = [
        'CSECT',
        'DC', 'DROP', 'DS',
        'EJECT', 'END', 'ENTRY', 'EQU', 'EXTRN',
        'FREEMAIN',
        'GETMAIN', 'GLOBAL',
        'NAM', 'NAME',
        'ORG',
        'PAGE', 'PARAM', 'PROC', 'PUBLIC',
        'RETURN',
        'STIMER',
        'TITLE', 'SUBTTL',
        'USING'
    ]

    directive_tb = CaseInsensitiveListTokenBuilder(
        directives, 'directive', False)

    keywords = []

    keywords_360 = [
        'A', 'ABEND', 'AD', 'ADR', 'AE', 'AER', 'AH', 'AL', 'ALR', 'AP',
        'AR', 'AU', 'AUR', 'AW', 'AWR', 'AXR',
        'B', 'BAL', 'BALR', 'BAS', 'BASR', 'BC', 'BCR', 'BCT', 'BCTR',
        'BE', 'BH', 'BL', 'BM', 'BNE', 'BNH', 'BNL', 'BNM', 'BNP', 'BNO',
        'BNZ', 'BO', 'BP', 'BR', 'BXH', 'BXLE', 'BZ',
        'C', 'CD', 'CDR', 'CE', 'CER', 'CH', 'CL', 'CLC', 'CLI', 'CLR',
        'CP', 'CR', 'CVB', 'CVD',
        'D', 'DD', 'DDR', 'DE', 'DER', 'DIAGNOSE', 'DP', 'DR',
        'ED', 'EDMK', 'EX',
        'HDR', 'HER', 'HIO',
        'IC', 'ISK',
        'L', 'LA', 'LCR', 'LCDR', 'LCER', 'LD', 'LDR', 'LE', 'LER', 'LH',
        'LM', 'LNDR', 'LNER', 'LNR', 'LPDR', 'LPER', 'LPR', 'LPSW', 'LR',
        'LRDR', 'LRER', 'LTDR', 'LTER', 'LTR',
        'M', 'MD', 'MDR', 'ME', 'MER', 'MH', 'MP', 'MR', 'MVC', 'MVI',
        'MVN', 'MVO', 'MVZ', 'MXD', 'MXDR', 'MXR',
        'N', 'NC', 'NI', 'NOP', 'NOPR', 'NR',
        'O', 'OC', 'OI', 'OR',
        'PACK',
        'RDD',
        'S', 'SD', 'SDR', 'SE', 'SER', 'SH', 'SIO', 'SL', 'SLA', 'SLDA',
        'SLDL', 'SLL', 'SLR', 'SP', 'SPM', 'SR', 'SRA', 'SRDL', 'SRP',
        'SSK', 'SSM', 'SRDA', 'SRL', 'ST', 'STC', 'STD', 'STE', 'STH',
        'STM', 'SU', 'SUR', 'SVC', 'SW', 'SWR', 'SXR',
        'TCH', 'TIO', 'TM', 'TR', 'TRT', 'TS',
        'UNPK', 'UNPKU',
        'WRD',
        'X', 'XC', 'XI', 'XR',
        'ZAP'
    ]

    keywords_370 = [
        'BRXH', 'BRXLE',
        'CLCL',
        'HDV',
        'LAM', 'LEDR',
        'MS', 'MVCL',
        'RIO',
        'SIOF', 'STAM',
        'VA', 'VACD', 'VACDR', 'VACE', 'VACER',
        'VAD', 'VADQ', 'VADR', 'VADS',
        'VAE', 'VAEQ', 'VAER', 'VAES',
        'VAQ', 'VAR', 'VAS',
        'VC', 'VCD', 'VCDQ', 'VCDR', 'VCDS',
        'VCE', 'VCEQ', 'VCER', 'VCES',
        'VCQ', 'VCR', 'VCS',
        'VDD', 'VDDQ', 'VDDR', 'VDDS',
        'VDE', 'VDEQ', 'VDER', 'VDES',
        'VL', 'VLCDR', 'VLCER', 'VLCR',
        'VLD', 'VLDQ', 'VLDR', 'VLEQ', 'VLH', 'VLINT',
        'VLM', 'VLMD', 'VLMDQ', 'VLMDR', 'VLMEQ', 'VLMQ', 'VLMR',
        'VLNDR', 'VLNER', 'VLNR',
        'VLPDR', 'VLPER', 'VLPR',
        'VLQ', 'VLR', 'VLY', 'VLYD', 'VLZDR', 'VLZR',
        'VM', 'VMAD', 'VMADQ', 'VMADS', 'VMAE', 'VMAEQ', 'VMAES',
        'VMCD', 'VMCE', 'VMCER',
        'VMD', 'VMDQ', 'VMDR', 'VMDS',
        'VME', 'VMEQ', 'VMER', 'VMES',
        'VMQ', 'VMR', 'VMS',
        'VMSD', 'VMSDQ', 'VMSDS', 'VMSE', 'VMSEQ', 'VMSES',
        'VN', 'VNQ', 'VNR', 'VNS',
        'VO', 'VOQ', 'VOR', 'VOS',
        'VS', 'VSD', 'VSDQ', 'VSDR', 'VSDS',
        'VSE', 'VSEQ', 'VSER', 'VSES',
        'VSQD', 'VSQDR', 'VSQE', 'VSQER',
        'VSQ', 'VSR', 'VSS',
        'VST', 'VSTD', 'VSTE', 'VSTH', 'VSTKD', 'VSTMD',
        'VTAD', 'VTAE', 'VTSD', 'VTSE',
        'VX', 'VXQ', 'VXR', 'VXS',
        'VMXSE', 'VMNSE', 'VMXAE', 'VLELE', 'VSELE',
        'VMXDS', 'VMNSD', 'VMXAD', 'VLELD', 'VXELD',
        'VSPSD', 'VAPSD',
        'VTVM', 'VCVM', 'VCZVM', 'VCOVM', 'VXVC', 'VXVMM',
        'VRRS', 'VRSVC', 'VRSV',
        'VLVM', 'VLCVM', 'VSTVM', 'VNVM', 'VOVM', 'VXVM',
        # 'VSRSV' previously carried a leading space and could never match.
        'VSRSV', 'VMRSV', 'VSRRS',
        'VLVCA', 'VRCL', 'VSVMM', 'VLVXA', 'VSVTP', 'VACSV', 'VACRS',
        'STNSM', 'SOTSM', 'SIOP', 'MC', 'LRA',
        'CONCS', 'DISCS', 'STIDP', 'SCK', 'SPT', 'STPT', 'SPKA', 'IPK',
        'PTLB', 'SPX', 'STPX', 'STAP', 'RRB',
        'PC', 'SAC', 'IPTE', 'IVSK', 'IAC', 'SSAR', 'EPAR', 'ESAR', 'PT',
        'ISKE', 'RRBE', 'SSKE', 'TB',
        'STCTL', 'LCTL', 'CS', 'CDS', 'CLM', 'STCM', 'ICM',
        'MVCK', 'MVCP', 'MVCS',
        'VLI', 'VSTI', 'VLID', 'VSTID', 'VSRL', 'VSLL', 'VLBIX',
        'LASP', 'TPROT', 'STRAG', 'MVCSK', 'MVCDK', 'DPFET',
        'MVHHI', 'MVGHI', 'MVHI',
        'CHHSI', 'CLHHSI', 'CGHSI', 'CLGHSI', 'CHSI', 'CLFHSI',
        'TBEGIN', 'TBEGINC', 'MVCIN', 'UNPKA'
    ]

    # NOTE(review): this list contains a few repeated entries (e.g.
    # 'SRLK', 'LCXBR'); they are harmless but may hide intended opcodes.
    keywords_390 = [
        'BASSM', 'BSG', 'BSM',
        'CLRCH', 'CMPS', 'CLRIO', 'CMSG',
        'LAE', 'LXDR',
        'MDE',
        'PFPO', 'PR', 'PTFF',
        'SAM24', 'SAM31', 'SCKPF',
        'TAM', 'TMPS', 'TMSG', 'TRACE', 'TRAP2',
        # 'TMLH' previously carried a leading space and could never match.
        'TMH', 'TMLH', 'TML', 'TMLL', 'TMHH', 'TMHL',
        'BRC', 'BRAS', 'BRCT', 'BRCTG',
        'LHI', 'LGHI',
        'AHI', 'AGHI',
        'MHI', 'MGHI',
        'CHI', 'CGHI',
        'MVCLE', 'CLCLE',
        'UPT',
        'SIE', 'PCF', 'CFC', 'DEP', 'DCTP', 'MAD', 'MUN', 'STCAP',
        'SERVC', 'IPM', 'DXR', 'PGIN', 'PGOUT',
        'CSCH', 'HSCH', 'MSCH', 'SSCH', 'STSCH', 'TSCH', 'TPI', 'SAL',
        'RSCH', 'STCRW', 'STCPS', 'RCHP', 'SCHM', 'STZP', 'SZP', 'TPZI',
        'BAKR', 'CKSM', 'MADS', 'SQDR', 'STURA', 'MSTA', 'PALB', 'EREG',
        'ESTA', 'LURA', 'TAR', 'SQDR', 'SAR', 'EAR', 'CSP', 'MSR',
        'MVPG', 'MVST', 'CUSE', 'BSG', 'CLST', 'SRST', 'XSCH', 'RP',
        'STCKE', 'SACF', 'STSI', 'SRNM', 'STFPC', 'LFPC', 'TRE',
        'CUUTF', 'CUTFU', 'STFL', 'LPSWE', 'TRAP4',
        'LPEBR', 'LNEBR', 'LTEBR', 'LCEBR', 'LDEBR', 'LXDBR', 'LDEBR',
        'MXDBR', 'KEBR', 'CEBR', 'AEBR', 'SEBR', 'MDEBR', 'DEBR',
        'MAEBR', 'MSEBR', 'LPDBR', 'LCDBR', 'SQEBR', 'MEEBR',
        'KDBR', 'CDBR', 'ADBR', 'MDBR', 'DDBR', 'SDBR',
        'LDER', 'LXDR', 'MAER', 'MSER', 'SQXR', 'MEER', 'MADR', 'MSDR',
        'LPXBR', 'LNXBR', 'LTXBR', 'LCXBR', 'LCXBR', 'LEDBR', 'LDXBR',
        'LEXBR', 'FIXBR', 'KXBR', 'CXBR', 'AXBR', 'SXBR', 'MXBR', 'DXBR',
        'TBEDR', 'TBDR', 'DIEBR', 'FIEBR', 'THDER', 'DIDBR', 'FIDBR',
        'LPXR', 'LNXR', 'LTXR', 'LCXR', 'LXR', 'LEXR', 'FIXR', 'CXR',
        'LZER', 'LZDR', 'LZXR', 'FIER', 'FIDR', 'SFPC', 'EFPC',
        'CEFBR', 'CDFBR', 'CXFBR', 'CEGBR',
        'CEFR', 'CDFR', 'CXFR', 'CFDR', 'CFXR',
        'CEGR', 'CDGR', 'CXGR', 'CGER', 'CGDR', 'CGXR',
        'CDGBR', 'CXGBR', 'CGDBR', 'CGEBR', 'CGXBR',
        'LMC', 'LPGR', 'LNGR', 'LTGR', 'LCGR', 'LGC', 'LURAG',
        'AGR', 'SGR', 'ALGR', 'SLGR', 'MSGR', 'DSGR', 'EREGG', 'LRVGR',
        'LPGFR', 'LNGFR', 'LTGFR', 'LCGFR', 'LGFR', 'LLGFR', 'LLGTR',
        'AGFR', 'SGFR', 'ALGFR', 'SLGFR', 'MSGFR', 'DSGFR',
        'LRVR', 'CGR', 'CLGR', 'STURG', 'CGFR', 'CLGFR', 'BCTGR',
        'NGR', 'OGR', 'XGR', 'MLGR', 'DLGR', 'ALCGR', 'SLBGR',
        'EPSW', 'TRTT', 'TRTO', 'TROT', 'TROO',
        'MLR', 'DLR', 'ALCR', 'SLBR', 'ESEA',
        'LARL', 'LGFI', 'BRCL', 'BRASL',
        'XIHF', 'XILF', 'IIHF', 'IILF', 'NIHF', 'NILF', 'OIHF', 'OILF',
        'LLIHF', 'LLILF', 'LLHRL', 'LGHRL', 'LHRL',
        'AGFI', 'AFI', 'ALGFI', 'ALFI', 'CGFI', 'CFI',
        'LLGFRL', 'STRL', 'EXRL', 'PFDRL',
        'CGHRL', 'CHRL', 'CLGHRL', 'CLHRL', 'CGRL', 'CLGRL', 'CRL',
        'CLGFRL', 'CLRL',
        'MVCOS', 'ECTG', 'CSST', 'PKU',
        'LRAG', 'LG', 'AG', 'SG', 'ALG', 'SLG', 'MSG', 'DSG', 'CVBG',
        'LRVG', 'LGF', 'LGH', 'LLGF', 'LLGT',
        'AGF', 'SGF', 'ALGF', 'SLGF', 'MSGF', 'DSGF',
        'LRV', 'LRVH', 'CG', 'CLG', 'STG', 'CVDG', 'STRVG',
        'CGF', 'CLGF', 'STRV', 'STRVH', 'BCTG',
        'NG', 'OG', 'XG', 'MLG', 'DLG', 'ALCG', 'SLBG',
        'STPQ', 'LPQ', 'LLGC', 'LLGH',
        'ML', 'DL', 'ALC', 'SLB', 'PKA',
        'DIL', 'BDIL', 'ANUM', 'COMP', 'MCPU', 'MIO', 'BIFLAG', 'MULDIV',
        'LMG', 'SRAG', 'SLAG', 'SRLG', 'SLLG', 'TRACG', 'RLLG', 'RLL',
        'CLMH', 'CLMY', 'CLT', 'CLTH', 'CLTL', 'CLTNE', 'CLTE', 'CLTNL',
        'CLTNH',
        'STMG', 'STCTG', 'STMH', 'STCMH', 'LCTLG',
        'CSG', 'CDSG', 'BXHG', 'BXLEG', 'ICMH',
        'MVCLU', 'CLCLU', 'LMH', 'LMY', 'TP',
        'SRAK', 'SLAK', 'SRLK', 'SRLK', 'LOCG', 'BRXHG', 'BRXLG',
        'LDEB', 'LXDB', 'LXEB', 'MXDB', 'KEB', 'CEB', 'AEB', 'SEB',
        'MDEB', 'DEB', 'MAEB', 'MSEB', 'TCEB', 'TCDB', 'TCXB',
        'SQEB', 'SQDB', 'MEEB', 'KDB', 'CDB', 'ADB', 'SDB', 'MDB', 'DDB',
        'MADB', 'MSDB',
        'LDE', 'LXD', 'LXE', 'SQE', 'SQD', 'MEE',
        'PLO', 'LMD'
    ]

    keywords_z = [
        'IIHH', 'IIHL', 'IILH', 'IILL',
        'LLIHH', 'LLIHL', 'LLILH', 'LLILL',
        'NIHH', 'NIHL', 'NILH', 'NILL',
        'OIHH', 'OIHL', 'OILH', 'OILL',
        'SAM64'
    ]

    # Each processor level adds its own opcodes to those of the earlier
    # levels; an unrecognized processor leaves the list empty.
    if processor in ['360', '370', '390', 'system-z']:
        keywords += keywords_360

    if processor in ['370', '390', 'system-z']:
        keywords += keywords_370

    if processor in ['390', 'system-z']:
        keywords += keywords_390

    if processor in ['system-z']:
        keywords += keywords_z

    opcode_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword', False)

    registers = [
        'R0', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10',
        'R11', 'R12', 'R13', 'R14', 'R15',
        'FP0', 'FP2', 'FP4', 'FP6'
    ]

    register_tb = CaseInsensitiveListTokenBuilder(
        registers, 'register', True)

    values = ['*']

    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    comment_tb = LeadToEndOfLineTokenBuilder('!', False, 'comment')
    line_comment_tb = AssemblyCommentTokenBuilder('*')
    include_directive_tb = LeadToEndOfLineTokenBuilder(
        'INCLUDE', False, 'directive')

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        integer_tb,
        integer_exponent_tb,
        hex_integer_1_tb,
        hex_integer_2_tb,
        hex_integer_3_tb,
        hex_integer_h_tb,
        binary_integer_tb,
        suffixed_integer_tb,
        real_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        register_tb,
        opcode_tb,
        directive_tb,
        include_directive_tb,
        preprocessor_tb,
        identifier_tb,
        string_tb,
        hex_string_tb,
        char_string_tb,
        comment_tb,
        line_comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    opcode_tokenbuilders = [
        whitespace_tb,
        opcode_tb,
        directive_tb,
        include_directive_tb,
        preprocessor_tb,
        identifier_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    args_tokenbuilders = [
        whitespace_tb,
        integer_tb,
        integer_exponent_tb,
        hex_integer_1_tb,
        hex_integer_2_tb,
        hex_integer_3_tb,
        hex_integer_h_tb,
        binary_integer_tb,
        suffixed_integer_tb,
        real_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        register_tb,
        identifier_tb,
        string_tb,
        hex_string_tb,
        char_string_tb,
        comment_tb,
        line_comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    opcode_tokenizer = Tokenizer(opcode_tokenbuilders)
    args_tokenizer = Tokenizer(args_tokenbuilders)

    # tokenize as free-format
    tokens_free = tokenizer.tokenize(code)
    tokens_free = Examiner.combine_adjacent_identical_tokens(
        tokens_free, 'invalid operator')
    tokens_free = Examiner.combine_adjacent_identical_tokens(
        tokens_free, 'invalid')
    tokens_free = AssemblyIBMExaminer.convert_keywords_to_identifiers(
        tokens_free)
    tokens_free = Examiner.convert_values_to_operators(
        tokens_free, known_operators)
    self.tokens = tokens_free
    self.convert_asm_identifiers_to_labels()
    self.convert_asm_keywords_to_identifiers()

    self.calc_statistics()
    statistics_free = self.statistics
    self.statistics = {}

    self.calc_confidences(
        operand_types, group_starts, group_mids, group_ends, None)
    self.calc_line_length_confidence(code, self.max_expected_line)

    confidences_free = self.confidences
    self.confidences = {}
    errors_free = self.errors
    self.errors = []

    # tokenize as space-format
    opcode_extras = '.&=,()+-*/'
    label_leads = '.&$@'
    label_mids = '.&$#@'
    label_ends = ':,'
    comment_leads = '!'
    line_comment_leads = '*'
    use_line_id = True
    tokens_space, indents = Tokenizer.tokenize_asm_code(
        code, tab_size,
        opcode_tokenizer, opcode_extras,
        args_tokenizer,
        label_leads, label_mids, label_ends,
        comment_leads, line_comment_leads, use_line_id)
    tokens_space = Examiner.combine_adjacent_identical_tokens(
        tokens_space, 'invalid operator')
    tokens_space = Examiner.combine_adjacent_identical_tokens(
        tokens_space, 'invalid')
    tokens_space = Examiner.combine_identifier_colon(
        tokens_space, ['newline'], [], [])
    tokens_space = Tokenizer.combine_number_and_adjacent_identifier(
        tokens_space)
    tokens_space = AssemblyIBMExaminer.convert_opcodes_to_keywords(
        tokens_space, keywords)
    tokens_space = AssemblyIBMExaminer.convert_keywords_to_identifiers(
        tokens_space)
    tokens_space = Examiner.convert_values_to_operators(
        tokens_space, known_operators)
    self.tokens = tokens_space
    self.convert_asm_identifiers_to_labels()
    self.convert_asm_keywords_to_identifiers()

    self.calc_statistics()
    statistics_space = self.statistics
    self.statistics = {}

    self.calc_confidences(
        operand_types, group_starts, group_mids, group_ends, indents)
    self.calc_line_length_confidence(code, self.max_expected_line)

    confidences_space = self.confidences
    self.confidences = {}
    errors_space = self.errors
    self.errors = []

    # select the better of free-format and spaced-format; the overall
    # score of each is the product of its confidence factors
    confidence_free = 1.0
    for factor in confidences_free.values():
        confidence_free *= factor

    confidence_space = 1.0
    for factor in confidences_space.values():
        confidence_space *= factor

    # a tie keeps the free-format result
    if confidence_space > confidence_free:
        self.tokens = tokens_space
        self.statistics = statistics_space
        self.confidences = confidences_space
        self.errors = errors_space
    else:
        self.tokens = tokens_free
        self.statistics = statistics_free
        self.confidences = confidences_free
        self.errors = errors_free
def __init__(self, code, tab_size, wide):
    """Tokenize ``code`` as free-format and as fixed-format; keep the better one.

    The source is tokenized twice: once with a plain ``Tokenizer`` over the
    whole text (free-format) and once through ``self.tokenize_code`` with
    the comment start/middle/end builders (fixed-format).  Each candidate is
    scored with the same sequence of ``calc_*_confidence`` calls; the
    candidate whose confidence factors have the larger product supplies
    ``self.tokens``, ``self.statistics``, ``self.confidences`` and
    ``self.errors``.  Ties go to free-format.

    Args:
        code: source text to examine.
        tab_size: forwarded to ``self.tokenize_code`` for the fixed-format pass.
        wide: forwarded to ``self.tokenize_code`` for the fixed-format pass.
    """
    super().__init__()

    self.operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder(None)
    integer_exponent_tb = IntegerExponentTokenBuilder(None)
    binary_integer_tb = SuffixedIntegerTokenBuilder(['B'], False, None)
    hex_integer_tb = SuffixedIntegerTokenBuilder(['H'], False, 'ABCDEF')
    octal_integer_tb = SuffixedIntegerTokenBuilder(['O'], False, None)
    decimal_integer_tb = SuffixedIntegerTokenBuilder(['D'], False, None)
    real_tb = RealTokenBuilder(True, False, None)
    real_exponent_tb = RealExponentTokenBuilder(True, False, 'E', None)
    binary_real_tb = SuffixedRealTokenBuilder(True, True, ['B'], False, None)
    self.operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    self.operand_types.append('identifier')

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    self.operand_types.append('string')

    label_tb = PL1LabelTokenBuilder()
    self.operand_types.append('label')

    slash_star_comment_tb = SlashStarCommentTokenBuilder()

    directives = [
        '%ACTIVATE', '%DEACTIVATE', '%DECLARE', '%DCL', '%DICTIONARY',
        '%DO', '%ELSE', '%END', '%FATAL', '%GOTO', '%IF', '%INCLUDE',
        '%LIST', '%NOLIST', '%PAGE', '%PROCEDURE', '%PROC', '%REPLACE',
        '%RETURN', '%THEN'
    ]

    line_continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    preprocessor_tb = CaseInsensitiveListTokenBuilder(
        directives, 'preprocessor', False)
    title_tb = LeadToEndOfLineTokenBuilder('%TITLE', True, 'preprocessor')
    subtitle_tb = LeadToEndOfLineTokenBuilder('%SBTTL', True, 'preprocessor')
    error_tb = LeadToEndOfLineTokenBuilder('%ERROR', True, 'preprocessor')
    warn_tb = LeadToEndOfLineTokenBuilder('%WARN', True, 'preprocessor')
    inform_tb = LeadToEndOfLineTokenBuilder('%INFORM', True, 'preprocessor')
    terminators_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    known_operators = [
        '+', '-', '*', '/', '**',
        '>', '<', '=', '>=', '<=', '<>',
        '^>', '^<', '^=', '^',
        '~>', '~<', '~=', '~',
        '&', '&:', ':=',
        '|', '|:', '||',
        '!', '!:', '!!',
        ':', '@',
        'NOT', 'AND', 'OR', 'XOR', 'MINUS', 'PLUS', 'MOD'
    ]

    self.unary_operators = ['+', '-', '^', '~', '@', 'NOT']
    self.postfix_operators = []

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    self.group_starts = ['(', '[', ',', '{']
    self.group_mids = [',']
    self.group_ends = [')', ']', '}']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)
    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    keywords = [
        'ADDRESS', 'AT', 'BASED', 'BY', 'CALL', 'CASE', 'CLOSE', 'DATA',
        'DECLARE', 'DISABLE', 'DO', 'ELSE', 'ENABLE', 'END', 'EOF',
        'EXTERNAL', 'GO', 'GOTO', 'HALT', 'IF', 'INITIAL', 'INTERRUPT',
        'LABEL', 'LITERALLY', 'OFFSET', 'ON', 'OPEN', 'OTHERWISE', 'OTHER',
        'PROCEDURE', 'PROC', 'PUBLIC', 'READ', 'REENTRANT', 'RETURN',
        'SELECTOR', 'STRUCTURE', 'THEN', 'TO', 'WHILE', 'WRITE'
    ]
    keyword_tb = CaseInsensitiveListTokenBuilder(keywords, 'keyword', False)

    # 'INT' and 'KEYED' are separate entries: without the comma between
    # them Python concatenates adjacent literals into 'INTKEYED'.
    attributes = [
        'ALIGNED', 'ANY', 'AREA', 'BASED', 'BUILTIN', 'CONDITION', 'COND',
        'CONTROLLED', 'CTL', 'DEFINED', 'DEF', 'DIRECT', 'ENTRY',
        'ENVIRONMENT', 'ENV', 'EXTERNAL', 'EXT', 'FILE', 'GLOBALDEF',
        'GLOBALREF', 'INITIAL', 'INIT', 'INPUT', 'INTERNAL', 'INT',
        'KEYED', 'LABEL', 'LIKE', 'LIST', 'MEMBER', 'NONVARYING', 'NONVAR',
        'OPTIONAL', 'OPTIONS', 'OUTPUT', 'PARAMETER', 'PARM', 'PICTURE',
        'PIC', 'POSITION', 'POS', 'PRECISION', 'PREC', 'PRINT', 'READONLY',
        'RECORD', 'REFER', 'RETURNS', 'SEQUENTIAL', 'SEQL', 'STATIC',
        'STREAM', 'STRUCTURE', 'TRUNCATE', 'UNALIGNED', 'UNAL', 'UNION',
        'UPDATE', 'VARIABLE', 'VARYING', 'VAR'
    ]
    attributes_tb = CaseInsensitiveListTokenBuilder(
        attributes, 'attribute', False)

    # Commas restored after 'INVALIDATETLBENTRY', 'MOVRBIT' and 'SKIPRD';
    # each was previously fused with the following name.
    # NOTE(review): 'INITREALMATHUNITSKIPRB' looks like two names fused
    # inside one literal, and 'RESTOREINTERRUPTABLE' differs in spelling
    # from 'SAVEINTERRUPTTABLE' -- confirm against the language reference.
    functions = [
        'ABS', 'ADJUSTRPL', 'BLOCKINPUT', 'BLOCKINWORD', 'BLOCKINDWORD',
        'BLOCKOUTPUT', 'BLOCKOUTWORD', 'BLOCKOUTDWORD', 'BUILDPTR',
        'BYTESWAP', 'CMPD', 'CARRY', 'CAUSEINTERRUPT',
        'CLEARTASKSWITCHEDFLAG', 'CMPB', 'CMPW', 'CONTROLREGISTER', 'DEC',
        'DOUBLE', 'DEBUGREGISTER', 'FINDB', 'FINDD', 'FINDRD', 'FINDHW',
        'FINDRB', 'FINDRHW', 'FINDRW', 'FINDW', 'FIX', 'FLAGS', 'FLOAT',
        'GETACCESSRIGHTS', 'GETREALERROR', 'GETSEGMENTLIMIT', 'HIGH',
        'IABS', 'INHWORD', 'INITREALMATHUNITSKIPRB', 'INPUT', 'INT SIZE',
        'INWORD SIZE', 'INVALIDATEDATACACHE', 'INVALIDATETLBENTRY',
        'INDWORD', 'LAST', 'LENGTH', 'LOCALTABLE', 'LOCKSET', 'LOW',
        'MACHINESTATUS', 'MOVB', 'MOVBIT', 'MOVD', 'MOVE', 'MOVHW',
        'MOVRB', 'MOVRBIT', 'MOVRD', 'MOVRHW', 'MOVRW', 'MOVW', 'NIL',
        'OFFSETOF', 'OUTDWORD', 'OUTHWORD', 'OUTPUT', 'OUTWORD', 'PARITY',
        'RESTOREGLOBALTABLE', 'RESTOREINTERRUPTABLE', 'RESTOREREALSTATUS',
        'ROL', 'ROR', 'SAL', 'SAR', 'SAVEGLOBALTABLE',
        'SAVEINTERRUPTTABLE', 'SAVEREALSTATUS', 'SCANBIT', 'SCANRBIT',
        'SCL', 'SCR', 'SEGMENTREADABLE', 'SEGMENTWRITABLE', 'SELECTOROF',
        'SETB', 'SETHW', 'SETREALMODE', 'SETW', 'SHL', 'SHLD', 'SHR',
        'SHRD', 'SETD', 'SIGN', 'SIGNED', 'SKIPB', 'SKIPD', 'SKIPRD',
        'SKIPHW', 'SKIPRHW', 'SKIPRW', 'SKIPW', 'STACKBASE', 'STACKPTR',
        'TASKREGISTER', 'TESTREGISTER', 'TIME', 'UNSIGN',
        'WAITFORINTERRUPT', 'WBINVALIDATEDATACACHE', 'XLAT', 'ZERO'
    ]
    function_tb = CaseInsensitiveListTokenBuilder(functions, 'function', True)

    format_items = [
        'A', 'B', 'B1', 'B2', 'B3', 'B4', 'COLUMN', 'COL', 'E', 'F', 'P',
        'R', 'TAB', 'X'
    ]
    format_item_tb = CaseSensitiveListTokenBuilder(
        format_items, 'format', True)
    self.operand_types.append('format')

    options = [
        'APPEND', 'BACKUP_DATE', 'BATCH', 'BLOCK_BOUNDARY_FORMAT',
        'BLOCK_IO', 'BLOCK_SIZE', 'BUCKET_SIZE', 'BY', 'CANCEL_CONTROL_O',
        'CARRIAGE_RETURN_FORMAT', 'CONTIGUOUS', 'CONTIGUOUS_BEST_TRY',
        'CREATION_DATE', 'CURRENT_POSITION', 'DEFAULT_FILE_NAME',
        'DEFERRED_WRITE', 'DELETE', 'EDIT', 'EXPIRATION_DATE',
        'EXTENSION_SIZE', 'FAST_DELETE', 'FILE_ID', 'FILE_ID_TO',
        'FILE_SIZE', 'FIXED_CONTROL_FROM', 'FIXED_CONTROL_SIZE',
        'FIXED_CONTROL_SIZE_TO', 'FIXED_CONTROL_TO',
        'FIXED_LENGTH_RECORDS', 'FROM', 'GROUP_PROTECTION', 'IDENT',
        'IGNORE_LINE_MARKS', 'IN', 'INDEXED', 'INDEX_NUMBER',
        'INITIAL_FILL', 'INTO', 'KEY', 'KEYFROM', 'KEYTO', 'LINESIZE',
        'LOCK_ON_READ', 'LOCK_ON_WRITE', 'MAIN PROCEDURE',
        'MANUAL_UNLOCKING', 'MATCH_GREATER', 'MATCH_GREATER_EQUAL',
        'MATCH_NEXT', 'MATCH_NEXT_EQUAL', 'MAXIMUM_RECORD_NUMBER',
        'MAXIMUM_RECORD_SIZE', 'MULTIBLOCK_COUNT', 'MULTIBUFFER_COUNT',
        'NOLOCK', 'NONEXISTENT_RECORD', 'NONRECURSIVE', 'NORESCAN',
        'NO_ECHO', 'NO_FILTER', 'NO_SHARE', 'OWNER_GROUP', 'OWNER_ID',
        'OWNER_MEMBER', 'OWNER_PROTECTION', 'PAGE', 'PAGESIZE',
        'PRINTER_FORMAT', 'PROMPT', 'PURGE_TYPE_AHEAD', 'READ_AHEAD',
        'READ_CHECK', 'READ_REGARDLESS', 'RECORD_ID', 'RECORD_ID_ACCESS',
        'RECORD_ID_TO', 'RECURSIVE', 'REPEAT', 'RESCAN',
        'RETRIEVAL_POINTERS', 'REVISION_DATE', 'REWIND_ON_CLOSE',
        'REWIND_ON_OPEN', 'SCALARVARYING', 'SET READ', 'SHARED_READ',
        'SHARED_WRITE', 'SKIP', 'SNAP', 'SPOOL', 'STATEMENT', 'SUPERSEDE',
        'SYSTEM', 'SYSTEM_PROTECTION', 'TEMPORARY', 'TIMEOUT_PERIOD',
        'TITLE', 'TO', 'UNDERFLOW', 'UFL', 'UNTIL', 'USER_OPEN',
        'WAIT_FOR_RECORD', 'WHILE', 'WORLD_PROTECTION', 'WRITE_BEHIND',
        'WRITE_CHECK'
    ]
    options_tb = CaseInsensitiveListTokenBuilder(options, 'option', False)

    conditions = [
        'ANYCONDITION', 'CONVERSION', 'CONV', 'ENDFILE', 'ENDPAGE',
        'FINISH', 'FIXEDOVERFLOW', 'FOFL', 'OVERFLOW', 'OFL', 'STORAGE',
        'STRINGRANGE', 'STRG', 'SUBSCRIPTRANGE', 'SUBRG', 'UNDEFINEDFILE',
        'UNDF', 'VAXCONDITION', 'ZERODIVIDE', 'ZDIV'
    ]
    conditions_tb = CaseInsensitiveListTokenBuilder(
        conditions, 'condition', False)

    subroutines = [
        'DISPLAY', 'EXTEND', 'FLUSH', 'NEXT_VOLUME', 'RELEASE', 'RESIGNAL',
        'REWIND', 'SPACEBLOCK'
    ]
    subroutines_tb = CaseInsensitiveListTokenBuilder(
        subroutines, 'subroutine', False)

    types = [
        'ADDRESS', 'BYTE', 'CHARINT', 'DWORD', 'HWORD', 'INTEGER',
        'LONGINT', 'OFFSET', 'POINTER', 'REAL', 'SHORTINT', 'STRUCTURE',
        'QWORD', 'WORD'
    ]
    types_tb = CaseInsensitiveListTokenBuilder(types, 'type', True)
    self.operand_types.append('type')

    values = ['SYSIN', 'SYSPRINT', 'TRUE', 'FALSE']
    values_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
    self.operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    def evaluate(candidate_tokens):
        """Score one tokenization; return (statistics, confidences, errors).

        Installs ``candidate_tokens`` as ``self.tokens``, runs the
        statistics and confidence calculations, and leaves
        ``self.statistics``, ``self.confidences`` and ``self.errors``
        reset so the next candidate starts clean.
        """
        self.tokens = candidate_tokens
        self.calc_statistics()
        statistics = self.statistics
        # reset before the confidence calculations, as the original did
        self.statistics = {}

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(['operator', 'invalid operator'])
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(
                tokens, num_operators, self.group_ends, allow_pairs)
            self.calc_operator_4_confidence(
                tokens, num_operators, self.group_starts, allow_pairs)

        self.calc_group_confidence(tokens, self.group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, self.operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)

        confidences = self.confidences
        self.confidences = {}
        errors = self.errors
        self.errors = []

        return statistics, confidences, errors

    def overall(confidences):
        """Return the product of all factors, or 0.0 when there are none."""
        if not confidences:
            return 0.0

        product = 1.0
        for factor in confidences.values():
            product *= factor

        return product

    # tokenize as free-format
    tokenbuilders_free = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        terminators_tb,
        integer_tb,
        integer_exponent_tb,
        binary_integer_tb,
        hex_integer_tb,
        octal_integer_tb,
        decimal_integer_tb,
        real_tb,
        real_exponent_tb,
        binary_real_tb,
        keyword_tb,
        format_item_tb,
        function_tb,
        attributes_tb,
        options_tb,
        conditions_tb,
        subroutines_tb,
        types_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        identifier_tb,
        string_tb,
        label_tb,
        slash_star_comment_tb,
        preprocessor_tb,
        title_tb,
        subtitle_tb,
        error_tb,
        warn_tb,
        inform_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer_free = Tokenizer(tokenbuilders_free)
    tokens_free = tokenizer_free.tokenize(code)
    tokens_free = Examiner.combine_adjacent_identical_tokens(
        tokens_free, 'invalid operator')
    tokens_free = Examiner.combine_adjacent_identical_tokens(
        tokens_free, 'invalid')

    statistics_free, confidences_free, errors_free = evaluate(tokens_free)

    # tokenize as fixed-format (same builders, minus the format items)
    tokenbuilders_fixed = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        terminators_tb,
        integer_tb,
        integer_exponent_tb,
        binary_integer_tb,
        hex_integer_tb,
        octal_integer_tb,
        decimal_integer_tb,
        real_tb,
        real_exponent_tb,
        binary_real_tb,
        keyword_tb,
        function_tb,
        attributes_tb,
        options_tb,
        conditions_tb,
        subroutines_tb,
        types_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        identifier_tb,
        string_tb,
        label_tb,
        slash_star_comment_tb,
        preprocessor_tb,
        title_tb,
        subtitle_tb,
        error_tb,
        warn_tb,
        inform_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    comment_start_tb = PL1CommentStartTokenBuilder()
    comment_middle_tb = PL1CommentMiddleTokenBuilder()
    comment_end_tb = PL1CommentEndTokenBuilder()

    # NOTE(review): invalid_token_builder ends up in each combined list
    # twice (once from tokenbuilders_fixed, once appended); kept as-is
    # because the Tokenizer's sensitivity to list order is not visible here.
    type1_tokenbuilders = [comment_start_tb]
    tokenbuilders_fixed_1 = (
        tokenbuilders_fixed + type1_tokenbuilders + [invalid_token_builder])
    tokenizer_fixed_1 = Tokenizer(tokenbuilders_fixed_1)

    type2_tokenbuilders = [
        comment_start_tb, comment_middle_tb, comment_end_tb
    ]
    tokenbuilders_fixed_2 = (
        tokenbuilders_fixed + type2_tokenbuilders + [invalid_token_builder])
    tokenizer_fixed_2 = Tokenizer(tokenbuilders_fixed_2)

    tokens_fixed = self.tokenize_code(
        code, tab_size, tokenizer_fixed_1, tokenizer_fixed_2, wide)
    tokens_fixed = Examiner.combine_adjacent_identical_tokens(
        tokens_fixed, 'invalid operator')
    tokens_fixed = Examiner.combine_adjacent_identical_tokens(
        tokens_fixed, 'invalid')
    tokens_fixed = Examiner.combine_adjacent_identical_tokens(
        tokens_fixed, 'whitespace')
    tokens_fixed = self.convert_broken_comments_to_comments(tokens_fixed)

    statistics_fixed, confidences_fixed, errors_fixed = evaluate(tokens_fixed)

    # compute confidence for free-format and fixed-format
    confidence_free = overall(confidences_free)
    confidence_fixed = overall(confidences_fixed)

    # select the better of free-format and fixed-format
    if confidence_fixed > confidence_free:
        self.tokens = tokens_fixed
        self.statistics = statistics_fixed
        self.confidences = confidences_fixed
        self.errors = errors_fixed
    else:
        self.tokens = tokens_free
        self.statistics = statistics_free
        self.confidences = confidences_free
        self.errors = errors_free
def __init__(self, code):
    """Tokenize ``code`` and compute statistics and confidence factors.

    Builds the token builders for this language (the keyword and operator
    tables below look like Scala), tokenizes ``code``, merges runs of
    invalid tokens, stores the result in ``self.tokens`` and then runs the
    ``calc_*`` scoring methods.

    Args:
        code: source text to examine.
    """
    super().__init__()

    ws_builder = WhitespaceTokenBuilder()
    nl_builder = NewlineTokenBuilder()

    # numeric literals
    int_builder = IntegerTokenBuilder("'")
    int_exp_builder = IntegerExponentTokenBuilder("'")
    hex_builder = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF_')
    long_builder = SuffixedIntegerTokenBuilder('L', False, None)
    real_builder = RealTokenBuilder(False, False, "'")
    real_exp_builder = RealExponentTokenBuilder(False, False, 'E', "'")
    float_builder = SuffixedRealTokenBuilder(False, False, ['f'], False, None)

    # identifiers and quote-prefixed symbols
    ident_builder = IdentifierTokenBuilder('_', '_')
    symbol_builder = PrefixedIdentifierTokenBuilder("'", 'symbol', True)

    # strings; both builders receive the same quote list
    double_quote = ['"']
    string_builder = EscapedStringTokenBuilder(double_quote, 0)
    triple_builder = TripleQuoteStringTokenBuilder(double_quote)

    # comments
    line_comment_builder = SlashSlashCommentTokenBuilder()
    block_comment_builder = SlashStarCommentTokenBuilder()

    continuation_builder = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    terminator_builder = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    operator_table = [
        '+', '-', '*', '/', '%',
        '&', '|', '^',
        '<<', '>>', '&&', '||',
        '=', '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=',
        '<<=', '>>=',
        '>:', '⇒', '=>', '=', '<%', '<:', '←', '<-',
        '#', '@',
        '==', '!=', '>', '<', '>=', '<=',
        '!', '~',
        '<<<', '>>>',
        '.',
        '++', '--',
        'new'
    ]

    self.unary_operators = ['+', '-', '*', '!', '~', '++', '--', 'new']
    self.postfix_operators = ['++', '--']

    grouper_table = ['(', ')', ',', '[', ']', '{', '}', ':']
    openers = ['(', '[', ',', '{']
    separators = [',', ':']
    closers = [')', ']', '}']

    grouper_builder = CaseInsensitiveListTokenBuilder(
        grouper_table, 'group', False)
    operator_builder = CaseSensitiveListTokenBuilder(
        operator_table, 'operator', False)

    keyword_table = [
        'abstract', 'case', 'catch', 'class', 'def', 'do', 'else',
        'extends', 'final', 'finally', 'for', 'forSome', 'if', 'implicit',
        'import', 'lazy', 'match', 'object', 'override', 'package',
        'private', 'protected', 'return', 'sealed', 'then', 'throw',
        'trait', 'try', 'type', 'val', 'var', 'while', 'with', 'yield'
    ]
    keyword_builder = CaseSensitiveListTokenBuilder(
        keyword_table, 'keyword', False)

    value_table = ['false', 'true', 'null', 'this', 'super']
    value_builder = CaseSensitiveListTokenBuilder(value_table, 'value', True)

    # token groups that count as operands for the operand-run checks
    operand_groups = ['number', 'identifier', 'symbol', 'string', 'value']

    invalid_builder = InvalidTokenBuilder()

    builders = [
        nl_builder,
        ws_builder,
        continuation_builder,
        terminator_builder,
        int_builder,
        int_exp_builder,
        hex_builder,
        long_builder,
        real_builder,
        real_exp_builder,
        float_builder,
        keyword_builder,
        value_builder,
        grouper_builder,
        operator_builder,
        ident_builder,
        symbol_builder,
        string_builder,
        triple_builder,
        line_comment_builder,
        block_comment_builder,
        self.unknown_operator_tb,
        invalid_builder
    ]

    raw_tokens = Tokenizer(builders).tokenize(code)
    merged = Examiner.combine_adjacent_identical_tokens(
        raw_tokens, 'invalid operator')
    self.tokens = Examiner.combine_adjacent_identical_tokens(
        merged, 'invalid')

    self.calc_statistics()

    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        permitted_pairs = []
        self.calc_operator_2_confidence(joined, operator_count, permitted_pairs)
        self.calc_operator_3_confidence(
            joined, operator_count, closers, permitted_pairs)
        self.calc_operator_4_confidence(
            joined, operator_count, openers, permitted_pairs)

    self.calc_group_confidence(joined, separators)

    self.calc_operand_n_confidence(joined, ['number', 'symbol'], 2)
    self.calc_operand_n_confidence(joined, operand_groups, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` and compute statistics and confidence factors.

    Builds the token builders for this language (the keyword, type and
    operator tables below look like Groovy), tokenizes ``code``, merges
    runs of invalid tokens, combines identifier-plus-colon tokens, converts
    identifiers to labels, and then runs the ``calc_*`` scoring methods.

    Args:
        code: source text to examine.
    """
    super().__init__()

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    # Every numeric builder uses '_' as the in-number separator character.
    # The plain integer builder previously used "'" instead, which did not
    # agree with the other builders in this block.
    integer_tb = IntegerTokenBuilder("_")
    integer_exponent_tb = IntegerExponentTokenBuilder("_")
    hex_integer_tb = PrefixedIntegerTokenBuilder(
        '0x', False, '_0123456789abcdefABCDEF')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '_01')
    suffixed_integer_tb = SuffixedIntegerTokenBuilder(
        ['G', 'L', 'I'], False, '_')
    real_tb = RealTokenBuilder(False, False, "_")
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "_")
    suffixed_real_tb = SuffixedRealTokenBuilder(
        False, False, ['G', 'D', 'F'], False, '_')
    operand_types.append('number')

    leads = '@_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    triple_quote_string_tb = TripleQuoteStringTokenBuilder(quotes)
    regex_tb = RegexTokenBuilder()
    # dollar-slash slash-dollar strings (allow newline)
    operand_types.append('string')

    class_type_tb = ClassTypeTokenBuilder()
    operand_types.append('class')

    slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
    slash_star_comment_tb = SlashStarCommentTokenBuilder()
    # shebang line at start
    shebang_tb = SheBangTokenBuilder()

    terminators_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    known_operators = [
        '+', '-', '*', '/', '%', '**',
        '=', '==', '!=', '===', '!==', '>', '>=', '<', '<=',
        '+=', '-=', '*=', '/=', '%=', '**=', '&=', '|=', '^=',
        '<<=', '>>=',
        '!', '&', '|', '~', '<<', '>>', '>>>', '^',
        '?.', '?:', '<>', '>>>=',
        '.', '.&', '.@', '::',
        '=~', '==~',
        '*.', '*:',
        '..', '..<',
        '<=>',
        '++', '--',
        '->', '&&', '||',
        '?', '##',
        'as', 'in', '!in', 'instanceof', '!instanceof', 'new',
    ]

    self.unary_operators = ['+', '-', '*', '!', '&', '~', '++', '--', '?']
    self.postfix_operators = ['++', '--', '&', '*']

    groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
    # No group_starts list: the calc_operator_4_confidence check that
    # would consume it is disabled for this examiner (see below).
    group_ends = [')', ']', '}']
    group_mids = [',', ':']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)
    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    keywords = [
        'assert', 'break', 'case', 'catch', 'class', 'const', 'continue',
        'def', 'default', 'do', 'else', 'enum', 'extends', 'finally',
        'for', 'goto', 'if', 'implements', 'import', 'interface', 'new',
        'package', 'return', 'super', 'switch', 'throw', 'throws', 'trait',
        'try', 'var', 'while'
    ]
    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = [
        'byte', 'char', 'double', 'float', 'int', 'long', 'short',
        'Java.lang.BigInteger'
    ]
    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = ['null', 'true', 'false', 'this']
    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        terminators_tb,
        integer_tb,
        integer_exponent_tb,
        hex_integer_tb,
        binary_integer_tb,
        suffixed_integer_tb,
        real_tb,
        real_exponent_tb,
        suffixed_real_tb,
        keyword_tb,
        types_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        identifier_tb,
        class_type_tb,
        string_tb,
        triple_quote_string_tb,
        regex_tb,
        slash_slash_comment_tb,
        slash_star_comment_tb,
        shebang_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    tokens = Examiner.combine_identifier_colon(
        tokens,
        ['statement terminator', 'newline'],
        ['{'],
        ['whitespace', 'comment'])
    self.tokens = tokens
    self.convert_identifiers_to_labels()

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence(['*', ';'])

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        # calc_operator_4_confidence (operator after a group start) is
        # not applied for this language.

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code):
    """Tokenize ``code`` and compute statistics and confidence factors.

    Builds the token builders for this language (the keyword, type and
    operator tables below look like C++), tokenizes ``code``, merges runs
    of invalid tokens, combines identifier-plus-colon tokens, converts
    identifiers to labels, and then runs the ``calc_*`` scoring methods.

    Args:
        code: source text to examine.
    """
    super().__init__()

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    hex_integer_tb = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF')
    binary_integer_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
    suffixed_integer_tb = SuffixedIntegerTokenBuilder(
        ['U', 'L', 'LL', 'ULL', 'LLU'], False, None)
    real_tb = RealTokenBuilder(False, False, "'")
    real_exponent_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    suffixed_real_tb = SuffixedRealTokenBuilder(
        False, False, ['f', 'l'], False, None)
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    operand_types.append('string')

    class_type_tb = ClassTypeTokenBuilder()
    operand_types.append('class')

    slash_slash_comment_tb = SlashSlashCommentTokenBuilder()
    slash_star_comment_tb = SlashStarCommentTokenBuilder()

    directives = [
        '#define', '#undef', '#ifdef', '#ifndef', '#if', '#endif',
        '#else', '#elif', '#line', '#include', '#pragma'
    ]

    line_continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    c_preprocessor_tb = CaseSensitiveListTokenBuilder(
        directives, 'preprocessor', False)
    c_warning_tb = LeadToEndOfLineTokenBuilder(
        '#warning', True, 'preprocessor')
    c_error_tb = LeadToEndOfLineTokenBuilder('#error', True, 'preprocessor')
    terminators_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    known_operators = [
        '+', '-', '*', '/', '%',
        '=', '==', '!=', '>', '>=', '<', '<=',
        '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=',
        '!', '&', '|', '~', '<<', '>>',
        '^',
        '.',
        '++', '--', '->', '&&', '||',
        '?', '##',
        '::', '<=>', '.*', '->*',
        'new', 'delete', 'and', 'and_eq', 'bitand', 'bitor', 'compl',
        'not', 'not_eq', 'or', 'or_eq', 'xor', 'xor_eq'
    ]

    self.unary_operators = [
        '+', '-', '*',
        '!', '&', '~',
        '++', '--',
        'new', 'delete', 'compl', 'not'
    ]

    self.postfix_operators = ['++', '--', '&', '*']

    groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
    group_starts = ['(', '[', ',', '{']
    group_ends = [')', ']', '}']
    group_mids = [',', ':']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)
    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    # 'transaction_safe_dynamic' and 'try' are separate entries: without
    # the comma between them Python concatenates the adjacent literals and
    # 'try' is never recognized as a keyword.
    keywords = [
        'alignas', 'alignof', 'asm', 'atomic_cancel', 'atomic_commit',
        'atomic_noexcept', 'audit', 'auto', 'axiom',
        'break',
        'case', 'catch', 'class', 'concept', 'const', 'consteval',
        'constexpr', 'const_cast', 'continue', 'co_await', 'co_return',
        'co_yield',
        'decltype', 'default', 'do', 'dynamic_cast',
        'else', 'enum', 'explicit', 'export', 'extern',
        'final', 'for', 'friend',
        'goto',
        'if', 'import', 'inline',
        'module', 'mutable',
        'namespace', 'noexcept', 'nullptr',
        'operator', 'override',
        'private', 'protected', 'public',
        'private:', 'protected:', 'public:',
        'reflexpr', 'register', 'reinterpret_cast', 'requires', 'return',
        'signed', 'sizeof', 'static', 'static_assert', 'static_cast',
        'struct', 'switch', 'synchronized',
        'template', 'thread_local', 'throw', 'transaction_safe',
        'transaction_safe_dynamic',
        'try', 'typedef', 'typeid', 'typename',
        'union', 'unsigned', 'using',
        'virtual', 'volatile',
        'while',
    ]
    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = [
        'bool', 'char', 'char8_t', 'char16_t', 'char32_t', 'double',
        'float', 'int', 'long', 'short', 'void', 'wchar_t'
    ]
    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = ['false', 'this', 'true', 'cout', 'cin']
    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        line_continuation_tb,
        terminators_tb,
        integer_tb,
        integer_exponent_tb,
        hex_integer_tb,
        binary_integer_tb,
        suffixed_integer_tb,
        real_tb,
        real_exponent_tb,
        suffixed_real_tb,
        keyword_tb,
        types_tb,
        values_tb,
        known_operator_tb,
        groupers_tb,
        identifier_tb,
        class_type_tb,
        string_tb,
        slash_slash_comment_tb,
        slash_star_comment_tb,
        c_preprocessor_tb,
        c_error_tb,
        c_warning_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    tokens = Examiner.combine_identifier_colon(
        tokens,
        ['statement terminator', 'newline'],
        ['{'],
        ['whitespace', 'comment'])
    self.tokens = tokens
    self.convert_identifiers_to_labels()

    self.calc_statistics()

    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence(['*', ';'])

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'string']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, version):
    """Tokenize BASIC ``code`` for the given dialect and score the result.

    Builds the token builders, extending keyword, operator and function
    tables when ``version`` is one of 'basic-80', 'basica' or 'gw-basic',
    tokenizes ``code``, post-processes the tokens with the
    ``BasicExaminer`` converters, stores them in ``self.tokens`` and runs
    the ``calc_*`` scoring methods.

    Args:
        code: source text to examine.
        version: dialect name; '' selects the plain keyword additions.
    """
    super().__init__()
    self.newlines_important = 'always'

    microsoft_versions = ['basic-80', 'basica', 'gw-basic']
    is_microsoft = version in microsoft_versions

    ws_builder = WhitespaceTokenBuilder()
    nl_builder = NewlineTokenBuilder()

    # numeric literals: general builders first, dialect overrides after
    int_builder = IntegerTokenBuilder('_')
    int_exp_builder = IntegerExponentTokenBuilder(False)
    real_builder = RealTokenBuilder(False, False, False)
    real_exp_builder = RealExponentTokenBuilder(False, False, 'E', '_')
    double_exp_builder = NullTokenBuilder()
    int_suffix_builder = SuffixedIntegerTokenBuilder(
        ['%', '&', 'S', 'I', 'L', 'F', 'D', 'R', 'US', 'UI', 'UL'], True, '_')
    float_suffix_builder = SuffixedRealTokenBuilder(
        False, False, ['!', '#', 'F', 'D', 'R'], True, '_')

    if is_microsoft:
        double_exp_builder = RealExponentTokenBuilder(False, False, 'D', '_')
        int_suffix_builder = SuffixedIntegerTokenBuilder(['%'], False, '_')
        float_suffix_builder = SuffixedRealTokenBuilder(
            False, False, ['!', '#'], True, '_')

    hex_builder = PrefixedIntegerTokenBuilder(
        '&H', True, '0123456789ABCDEFabcdef_')
    octal_builder = PrefixedIntegerTokenBuilder('&O', True, '01234567_')
    binary_builder = PrefixedIntegerTokenBuilder('&B', True, '01_')

    # variables
    variable_builder = BasicVariableTokenBuilder('%#!$&')
    if is_microsoft:
        variable_builder = BasicLongVariableTokenBuilder('%#!$&')

    # strings, remarks and comments
    string_builder = StuffedQuoteStringTokenBuilder(['"'], False)
    remark_builder = RemarkTokenBuilder()
    apostrophe_comment_builder = LeadToEndOfLineTokenBuilder(
        "'", False, 'comment')
    curly_comment_builder = LeadToEndOfLineTokenBuilder(
        "’", False, 'comment')

    separator_builder = SingleCharacterTokenBuilder(
        ':', 'statement separator', False)

    # operators
    known_operators = [
        '+', '-', '*', '/', '^',
        '=', '>', '>=', '<', '<=', '<>',
        '#', '\\', 'AND', 'OR', 'NOT'
    ]
    if is_microsoft:
        known_operators.extend(['=>', '=<', 'IMP', 'EQV', 'XOR', 'MOD'])

    operator_builder = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    self.unary_operators = ['+', '-', '#', 'NOT']

    grouper_table = ['(', ')', ',', ';']
    openers = ['(']
    separators = [',', ';']
    closers = [')']

    grouper_builder = CaseInsensitiveListTokenBuilder(
        grouper_table, 'group', False)

    # keywords
    keywords = [
        'CLOSE', 'CHAIN', 'DATA', 'DEF', 'DIM', 'ELSE', 'END', 'ERROR',
        'FILE', 'FOR', 'GOSUB', 'GOTO', 'IF', 'INPUT', 'LET', 'LINE',
        'MAT', 'NEXT', 'ON', 'ONERR', 'OPEN', 'OUTPUT', 'POKE', 'PRINT',
        'RANDOMIZE', 'READ', 'REM', 'REMARK', 'RESTORE', 'RETURN', 'STEP',
        'STOP', 'THEN', 'TO', 'USING'
    ]

    plain_extras = ['AS', 'GO']

    microsoft_extras = [
        # 'AS', ## promoted from variable after FIELD
        # 'BASE', ## promoted from variable after OPTION
        'CALL', 'CLEAR', 'CLS', 'COMMON', 'DEFDBL', 'DEFINT', 'DEFSNG',
        'DEFSTR', 'ELSE', 'END', 'ERASE', 'ERRLN', 'ERRNO', 'ERROR',
        'FIELD', 'FILES', 'GET', 'KILL', 'LOAD', 'LPRINT', 'LSET', 'MERGE',
        'NULL', 'ONERR', 'OPTION', 'OUT', 'PUT', 'RESET', 'RESUME',
        'RETURN', 'RSET', 'RUN', 'SET', 'SWAP', 'SYSTEM', 'TRON', 'TROFF',
        'WAIT', 'WHILE', 'WEND', 'WIDTH', 'WRITE'
    ]

    basica_extras = [
        'COLOR', 'KEY', 'LOCATE', 'PAINT', 'PLAY', 'SCREEN', 'SOUND'
    ]

    if version in ['']:
        keywords.extend(plain_extras)

    if is_microsoft:
        keywords.extend(microsoft_extras)

    if version in ['basica', 'gw-basic']:
        keywords.extend(basica_extras)

    keyword_builder = CaseSensitiveListTokenBuilder(
        keywords, 'keyword', False)

    value_builder = CaseInsensitiveListTokenBuilder(
        ['OFF', 'ON'], 'value', True)

    # functions
    functions = [
        'ABS', 'ASC', 'ATN', 'CHR', 'CHR$', 'CON', 'COS', 'DET', 'ERL',
        'ERR', 'EXP', 'IDN', 'INSTR', 'INT', 'INV', 'LEFT', 'LEFT$', 'LEN',
        'LOG', 'MID', 'MID$', 'POS', 'RIGHT', 'RIGHT$', 'RND', 'SGN',
        'SIN', 'SQR', 'STR$', 'TAB', 'TAN', 'TRN', 'VAL', 'ZER'
    ]

    if is_microsoft:
        functions.extend([
            'CDBL', 'CINT', 'CSNG', 'CVI', 'CVD', 'CVS', 'DATE$', 'EOF',
            'FIX', 'FRE', 'HEX$', 'INKEY', 'INP', 'INPUT$', 'INSTR', 'LOC',
            'LOF', 'LPOS', 'MKI$', 'MKD$', 'MKS$', 'OCT$', 'PEEK',
            'SPACE$', 'SPC', 'STRING$', 'TIME$', 'USR', 'VARPTR'
        ])

    function_builder = CaseInsensitiveListTokenBuilder(
        functions, 'function', True)
    user_function_builder = UserFunctionTokenBuilder('%#!$&')
    hardware_function_builder = NullTokenBuilder()

    if is_microsoft:
        user_function_builder = LongUserFunctionTokenBuilder('%#!$&')
        hardware_function_builder = HardwareFunctionTokenBuilder()

    invalid_builder = InvalidTokenBuilder()

    builders = [
        nl_builder,
        ws_builder,
        separator_builder,
        int_builder,
        int_exp_builder,
        float_suffix_builder,
        int_suffix_builder,
        real_builder,
        real_exp_builder,
        double_exp_builder,
        hex_builder,
        octal_builder,
        binary_builder,
        keyword_builder,
        operator_builder,
        function_builder,
        user_function_builder,
        hardware_function_builder,
        value_builder,
        variable_builder,
        grouper_builder,
        string_builder,
        remark_builder,
        apostrophe_comment_builder,
        curly_comment_builder,
        self.unknown_operator_tb,
        invalid_builder
    ]

    # token groups that count as operands for the 4-in-a-row check
    operand_groups = [
        'number', 'string', 'symbol', 'identifier', 'variable', 'function'
    ]

    token_list = Tokenizer(builders).tokenize(code)
    token_list = Examiner.combine_adjacent_identical_tokens(
        token_list, 'invalid operator')
    token_list = Examiner.combine_adjacent_identical_tokens(
        token_list, 'invalid')
    token_list = BasicExaminer.convert_numbers_to_line_numbers(token_list)

    # NOTE(review): the four conversions below are treated as one
    # dialect-specific group; confirm the nesting against the original
    # layout.
    if is_microsoft:
        token_list = BasicExaminer.extract_keywords_from_identifiers(
            token_list, keywords, known_operators)
        token_list = BasicExaminer.convert_as_to_keyword(token_list)
        token_list = BasicExaminer.convert_base_to_keyword(token_list)
        token_list = BasicExaminer.convert_operators_to_values(token_list)

    self.tokens = token_list
    self.calc_statistics()

    source = self.source_tokens()

    # statistics are computed a second time here, as before
    self.calc_statistics()

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        permitted_pairs = []
        self.calc_operator_2_confidence(source, operator_count, permitted_pairs)
        self.calc_operator_3_confidence(
            source, operator_count, closers, permitted_pairs)
        self.calc_operator_4_confidence(
            source, operator_count, openers, permitted_pairs)

    self.calc_group_confidence(source, separators)

    # NOTE(review): only the 2-in-a-row check is taken to be conditional
    # on the dialect; confirm the nesting against the original layout.
    if not is_microsoft:
        self.calc_operand_n_confidence(
            source, ['number', 'string', 'variable', 'symbol'], 2)
    self.calc_operand_n_confidence(source, operand_groups, 4)

    self.calc_keyword_confidence()
    self.calc_line_format_confidence()
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, tab_size, processor):
    """Tokenize assembly source and score it for the given processor.

    The code is tokenized twice: once as free-format text and, except for
    the PDP-8 and PDP-11, once as column-oriented "space-format" text.
    Statistics, confidences and errors are computed for each attempt and
    the attempt with the larger product of confidence factors is kept in
    ``self.tokens``, ``self.statistics``, ``self.confidences`` and
    ``self.errors`` (free-format wins ties).

    Args:
        code: Source text handed to the tokenizers.
        tab_size: Tab width passed to ``Tokenizer.tokenize_asm_code``.
        processor: Name selecting the opcode, register, directive and
            preprocessor tables. Values tested here are '1802', '6502',
            '6800', '68000', '8080', 'z80', '8086', '80186', '80286',
            '80386', '80486', 'pdp-8' and 'pdp-11'.
    """
    super().__init__()
    self.newlines_important = 'always'

    operand_types = []
    dec_digits = '0123456789'
    hex_digits = '0123456789abcdefABCDEF'

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()

    # comments
    comment_tb = LeadToEndOfLineTokenBuilder(';', True, 'comment')
    if processor in ['pdp-8']:
        comment_tb = LeadToEndOfLineTokenBuilder('/', True, 'comment')

    comment_2_tb = NullTokenBuilder()
    if processor in ['1802']:
        comment_2_tb = LeadToEndOfLineTokenBuilder('..', True, 'comment')

    line_comment_star_tb = AssemblyCommentTokenBuilder('*')
    line_comment_hash_tb = NullTokenBuilder()
    if processor in ['68000']:
        line_comment_hash_tb = AssemblyCommentTokenBuilder('#')

    # on the PDP-8 ';' separates statements (its comment lead is '/')
    stmt_separator_tb = NullTokenBuilder()
    if processor in ['pdp-8']:
        stmt_separator_tb = SingleCharacterTokenBuilder(
            ';', 'statement separator', False)

    # numbers
    integer_tb = IntegerTokenBuilder("'")
    integer_exponent_tb = IntegerExponentTokenBuilder("'")
    integer_1_tb = NullTokenBuilder()
    integer_2_tb = NullTokenBuilder()
    prefixed_integer_tb = PrefixedIntegerTokenBuilder('#', True, dec_digits)
    if processor in ['pdp-11']:
        integer_1_tb = SuffixedIntegerTokenBuilder('$', True, dec_digits)
    if processor in ['z80']:
        integer_1_tb = SuffixedIntegerTokenBuilder('O', True, dec_digits)
        integer_2_tb = SuffixedIntegerTokenBuilder('D', True, dec_digits)

    hex_integer_1_tb = PrefixedIntegerTokenBuilder('&', True, hex_digits)
    hex_integer_2_tb = SuffixedIntegerTokenBuilder('h', False, hex_digits)
    hex_integer_3_tb = PrefixedIntegerTokenBuilder('$', True, hex_digits)
    hex_integer_4_tb = PrefixedIntegerTokenBuilder('#$', True, hex_digits)

    hash_quote_value_tb = NullTokenBuilder()
    if processor in ['pdp-11']:
        hash_quote_value_tb = HashQuoteCharTokenBuilder()

    operand_types.append('number')

    # identifiers and labels
    leads = '_.$@#'
    extras = '_.$@#'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    label_tb = LabelTokenBuilder(leads, extras, ':')

    # strings
    quotes = ['"', "'", "’"]
    string_tb = EscapedStringTokenBuilder(quotes, 0)
    operand_types.append('string')

    # operators and groupers
    known_operators = [
        '+', '-', '*', '/', '&', '|', '=', '??', '#', '@', "'", '!',
    ]
    self.unary_operators = ['+', '-', '??', '#', '@', "'"]
    self.postfix_operators = ['+']

    groupers = ['(', ')', ',', '[', ']', '<', '>', ':']
    group_starts = ['(', '[', ',', '<']
    group_ends = [')', ']', '>']
    group_mids = [',', ':']

    groupers_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)
    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    # preprocessor words
    preprocessors = [
        'if', 'ifne', 'ifeq', 'else', 'endif', 'endc', 'error',
    ]
    preprocessors_68000 = ['MACRO', 'ENDM']
    preprocessors_8080 = ['MACRO', 'ENDM']
    preprocessors_8086 = [
        'ELSE', 'ELSEIF', 'ELSEIF2', 'ENDM', 'EXITM',
        'FOR', 'FORC', 'GOTO',
        'IF', 'IF2', 'IFB', 'IFNB', 'IFDEF', 'IFNDEF',
        'IFDIF', 'IFDIF[[I]]', 'IFE', 'IFIDN', 'IFIDN[[I]]',
        'LOCAL', 'MACRO', 'PURGE',
        '.BREAK', '.CONTINUE', '.ELSE', '.ELSEIF', '.ENDIF',
        '.ERR', '.ERR2', '.ERRB', '.ERRDEF',
        '.ERRDIF', '.ERRDIF[[I]]', '.ERRE', '.ERRIDN', '.ERRIDN[[I]]',
        '.ERRNB', '.ERRNDEF', '.ERRNZ', '.EXIT',
        '.IF', '.REPEAT', '.UNTIL', '.UNTILCXZ', '.WHILE',
    ]

    if processor in ['68000']:
        preprocessors += preprocessors_68000
    if processor in ['8080']:
        preprocessors += preprocessors_8080
    if processor in ['8086']:
        preprocessors += preprocessors_8086

    preprocessor_tb = CaseInsensitiveListTokenBuilder(
        preprocessors, 'preprocessor', False)

    # directives
    directives = [
        'DB', 'DW', 'DS',
        'EJECT', 'END', 'EQU', 'EXTRN',
        'INCLUDE', 'NAME', 'ORG', 'PAGE',
        'SECTION', 'SEGMENT', 'START', 'SUBTITLE', 'TEXT',
    ]
    directives_6502 = ['DFB', 'DFW']
    directives_6800 = ['CPU', 'NAM']
    directives_68000 = ['=', 'EVEN', 'ODD']
    directives_8080 = [
        'ASEG', 'CPU', 'LOCAL', 'TITLE',
        '.8080', '.8086', '.6800', '.6502', '.386',
    ]
    directives_z80 = ['DEFB', 'DEFS', 'DEFW']
    directives_8086 = [
        '=',
        'ABSOLUTE', 'ALIAS', 'ALIGN', 'AS', 'ASSUME', 'AT',
        'BITS', 'BYTE',
        'COMM', 'COMMON', 'CPU', 'CSEG',
        'DEFAULT', 'DSEG', 'DWORD',
        'ECHO', 'ENDP', 'ENDS', 'EVEN', 'EXTERNDEF',
        'FWORD', 'FORMAT',
        'GLOBAL', 'GROUP',
        'INCLUDELIB', 'INS86', 'INVOKE',
        'LABEL',
        'MMWORD',
        'OPTION',
        'POPCONTEXT', 'PROC', 'PROTO', 'PUBLIC', 'PUSHCONTEXT',
        # 'SEGMENT' and 'QWORD' must stay separate entries
        'SEGMENT', 'QWORD',
        'REAL4', 'REAL8', 'REAL10', 'RECORD',
        'STRUCT',
        'TEXTEQU', 'TBYTE', 'TYPEDEF',
        'WORD',
        'SBYTE', 'SDWORD', 'SWORD',
        'SECT', 'SECTION', 'SEGMENT',
        # 'STATIC' and 'UNION' must stay separate entries
        'STATIC', 'UNION',
        'USE16', 'USE32', 'USE64',
        'VIRTUAL',
        'XMMWORD', 'YMMWORD',
        '.386', '.386P', '.387', '.486', '.486P',
        '.586', '.586P', '.686', '.686P', '.K3D',
        '.ALLOCSTACK', '.ALPHA',
        '.CODE', '.CONST', '.CREF',
        '.DATA', '.DATA?', '.DOSSEG',
        '.ENDW', '.ENDPROLOG',
        '.FARDATA', '.FARDATA?', '.FPO',
        '.LIST', '.LISTALL', '.LISTIF', '.LISTMACRO', '.LISTMACROALL',
        '.MODEL', '.MMX',
        '.NOCREF', '.NOLIST', '.NOLISTIF', '.NOLISTMACRO',
        '.PUSHFRAME', '.PUSHREG',
        '.RADIX',
        '.SAFESEH', '.SALL', '.SAVEREG', '.SAVEXMM128',
        '.STACK', '.STARTUP',
        '.SEQ', '.SETFRAME',
        '.TFCOND',
        '.XLIST', '.XMM',
    ]
    directives_80386 = [
        'ALIGN', 'BITS', 'GLOBAL', 'PROC', 'SECTION', 'RESB', 'RESD',
        '.386', '.CODE', '.DATA', '.MODEL', '.TEXT',
        '%INCLUDE',
    ]
    directives_pdp8 = ['=']
    directives_pdp11 = [
        '=', 'BYTE', 'WORD',
        '.odd', '.even', '.blkb', '.blkw', '.byte', '.word',
        '.ascii', '.asciz', '.end', '.hex', '.radix',
        '.ident', '.if', '.ift', '.endc', '.psect', '.mcall',
        '.macro', '.endm', '.restore', '.print', '.error',
        '.list', '.nlist',
    ]

    if processor in ['6502']:
        directives += directives_6502
    if processor in ['6800']:
        directives += directives_6800
    if processor in ['68000']:
        directives += directives_68000
    if processor in ['8080']:
        directives += directives_8080
    if processor in ['z80']:
        directives += directives_z80
    if processor in ['8086']:
        directives += directives_8086
    if processor in ['80386']:
        directives += directives_80386
    if processor in ['pdp-8']:
        directives += directives_pdp8
    if processor in ['pdp-11']:
        directives += directives_pdp11

    directive_tb = CaseInsensitiveListTokenBuilder(
        directives, 'directive', False)

    # directives whose argument runs to the end of the line
    title_directive_tb = LeadToEndOfLineTokenBuilder(
        'TITLE', False, 'directive')
    title_directive_2_tb = LeadToEndOfLineTokenBuilder(
        '.TITLE', False, 'directive')
    subtitle_directive_tb = LeadToEndOfLineTokenBuilder(
        'SUBTTL', False, 'directive')
    subtitle_directive_2_tb = LeadToEndOfLineTokenBuilder(
        '.SUBTTL', False, 'directive')
    subtitle_directive_3_tb = LeadToEndOfLineTokenBuilder(
        '.SBTTL', False, 'directive')
    include_directive_tb = LeadToEndOfLineTokenBuilder(
        'INCLUDE', False, 'directive')
    include_directive_2_tb = LeadToEndOfLineTokenBuilder(
        '.INCLUDE', False, 'directive')

    multiline_comment_tb = MultilineCommentTokenBuilder()

    # opcode and register tables, one pair per processor
    opcodes_1802 = [
        'IDL', 'LDN', 'INC', 'DEC', 'BR', 'BO', 'BZ', 'BDF', 'BPZ', 'BGE',
        'B1', 'B2', 'B3', 'B4', 'SKP', 'NBR', 'BNO', 'BNZ', 'BNF', 'BM',
        'BL', 'BN1', 'BN2', 'BN3', 'BN4', 'LDA', 'STR', 'IRX', 'OUT',
        'INP', 'RET', 'DIS', 'LDXA', 'STXD', 'ADC', 'SDB', 'SHRC', 'RSHR',
        'SMB', 'SAV', 'MARK', 'REQ', 'SEQ', 'ADCI', 'SDBI', 'SHLC',
        'RSHL', 'SMBI', 'GLO', 'GHI', 'PLO', 'PHI', 'LBO', 'LBZ', 'LBDF',
        'NOP', 'LSNO', 'LSNZ', 'LSNF', 'LSKP', 'NLBR', 'LBNQ', 'LBNZ',
        'LBNF', 'LSIE', 'LSQ', 'LSZ', 'LSDF', 'SEP', 'SEX', 'LDX', 'OR',
        'AND', 'XOR', 'ADD', 'SD', 'SHR', 'SM', 'LDI', 'ORI', 'ANI',
        'XRI', 'ADI', 'SDI', 'SHL', 'SMI',
    ]
    registers_1802 = []

    opcodes_6502 = [
        'ADC', 'AND', 'ASL', 'AST',
        'BCC', 'BCS', 'BEQ', 'BIT', 'BMI', 'BNE', 'BPL', 'BRK', 'BVC',
        'BVS',
        'CLC', 'CLD', 'CLI', 'CLV', 'CMP', 'CPR', 'CPX', 'CPY',
        'DEC', 'DEX', 'DEY',
        'EOR',
        'INC', 'INX', 'INY',
        'JMP', 'JSR',
        'LDA', 'LDX', 'LDY', 'LSR',
        'NOP',
        'ORA',
        'PHA', 'PHP', 'PLA', 'PLP',
        'ROL', 'ROR', 'RTI', 'RTS',
        'SBC', 'SEC', 'SED', 'SEI', 'STA', 'STX', 'STY',
        'TAX', 'TAY', 'TSX', 'TXA', 'TXS', 'TYA',
    ]
    registers_6502 = ['A', 'X', 'Y', 'P', 'S']

    opcodes_6800 = [
        'ABA', 'ADC', 'ADCA', 'ADCB', 'ADD', 'AND', 'ASL', 'ASR',
        'BCC', 'BCS', 'BEQ', 'BGE', 'BGT', 'BHI', 'BIT', 'BLE', 'BLS',
        'BLT', 'BMI', 'BNE', 'BPL', 'BRA', 'BSR', 'BVC', 'BVS',
        'CBA', 'CLC', 'CLI', 'CLR', 'CLRA', 'CLRB', 'CLV', 'CMP', 'COM',
        'CPX',
        'DAA', 'DEC', 'DES', 'DEX',
        'EOR', 'EORA', 'EORB',
        'INC', 'INS', 'INX',
        'JMP', 'JSR',
        'LDA', 'LDAA', 'LDAB', 'LDS', 'LDX', 'LSR',
        'NEG', 'NOP',
        'ORA',
        'PSH', 'PUL',
        'ROL', 'ROR', 'RTI', 'RTS',
        'SBA', 'SBC', 'SEC', 'SEI', 'SEV', 'STA', 'STAA', 'STAB', 'STS',
        'STX', 'SUB', 'SWI',
        'TAB', 'TAP', 'TBA', 'TPA', 'TST', 'TSX', 'TXS',
        'WAI',
    ]
    registers_6800 = ['A', 'B', 'IX', 'PC', 'SP']

    # 68000 mnemonics that are also listed with B/W/L size suffixes,
    # both fused ('MOVEB') and dotted ('MOVE.B'); the suffixed variants
    # are generated below so no entry can be lost to a missing comma
    sized_ops_68000 = [
        'AND', 'ANDI', 'EOR', 'EORI', 'NOT', 'OR', 'ORI', 'CLR',
        'BCHG', 'BCLR', 'BSET', 'BTST', 'EXT', 'EXTB',
        'MOVE', 'MOVEA', 'MOVEM', 'MOVEP', 'MOVEQ',
        'CMP', 'CMPA', 'CMPI', 'CMPM', 'CMP2',
        'LEA', 'PEA', 'TAS', 'CHK',
        'ADD', 'ADDA', 'ADDI', 'ADDQ', 'ADDX',
        'SUB', 'SUBA', 'SUBI', 'SUBQ', 'SUBX',
        'MULS', 'MULU', 'DIVS', 'DIVU', 'NEG', 'NEGX',
        'ASL', 'ASR', 'LSL', 'LSR', 'ROL', 'ROR', 'ROXL', 'ROXR',
        'DBCC', 'SWAP', 'TST',
    ]
    branch_ops_68000 = [
        'BSR', 'BRA', 'BT', 'BF', 'BEQ', 'BNE', 'BLS', 'BLT', 'BLE',
        'BGT', 'BGE', 'BCC', 'BCS', 'BPL', 'BMI', 'BHI', 'BVC', 'BVS',
    ]
    opcodes_68000 = (
        sized_ops_68000
        + [op + size for size in 'BWL' for op in sized_ops_68000]
        + ['ABCD', 'NBCD', 'PACK', 'SBCD', 'UNPK']
        + branch_ops_68000
        + [
            'BSRS', 'BRAS', 'BEQS', 'BNES', 'BLSS', 'BLTS', 'BLES',
            'BGTS', 'BGES', 'BCCS', 'BCSS', 'BPLS', 'BMIS', 'BHIS',
            'BVCS', 'BVSS',
        ]
        + ['D' + op for op in branch_ops_68000]
        + [
            'JSR', 'JMP', 'TRAP', 'HALT', 'STOP',
            'RTD', 'RTE', 'RTR', 'RTS',
            'TRAP', 'HALT', 'STOP', 'NOP', 'MOVE16', 'EXG',
            'BFCHG', 'BFCLR', 'BFEXTS', 'BFEXTU', 'BFFFO', 'BFINS',
            'BFSET', 'BFTST',
            'FNOP', 'FABS', 'FACOS', 'FASIN', 'FATAN', 'FCOS', 'FCOSH',
            'FETOX', 'FETOXM1', 'FGETMAN', 'FINT', 'FINTRZ', 'FLOGN',
            'FLOGNP1', 'FLOG10', 'FLOG2', 'FNEG', 'FSIN', 'FSINH',
            'FSQRT', 'FTAN', 'FTANH', 'FTENTOX', 'FTWOTOX', 'FTST',
            'DSB', 'DSW', 'DSL', 'DCB', 'DCW', 'DCL',
        ]
        + [op + '.' + size for size in 'BWL' for op in sized_ops_68000]
        + [op + '.S' for op in branch_ops_68000]
        + ['DS.B', 'DS.W', 'DS.L', 'DC.B', 'DC.W', 'DC.L']
    )
    registers_68000 = [
        'D0', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7',
        'A0', 'A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7',
        'FP0', 'FP1', 'FP2', 'FP3', 'FP4', 'FP5', 'FP6', 'FP7',
        'PC', 'SR',
    ]

    opcodes_8080 = [
        'ACI', 'ADC', 'ADD', 'ADI', 'ANA', 'ANI',
        'CALL', 'CC', 'CM', 'CMA', 'CMC', 'CMP', 'CNC', 'CNZ', 'CP',
        'CPE', 'CPI', 'CPO', 'CZ',
        'DAA', 'DAD', 'DCR', 'DCX', 'DI',
        'EI',
        'HLT',
        'IN', 'INR', 'INX',
        'JC', 'JM', 'JMP', 'JNC', 'JNZ', 'JP', 'JPE', 'JPO', 'JZ',
        'LDAX', 'LHLD', 'LXI',
        'MOV', 'MVI',
        'NOP',
        'ORA', 'ORI', 'OUT',
        'PCHL', 'POP', 'PUSH',
        'RAL', 'RAR', 'RC', 'RIM', 'RLC', 'RET', 'RM', 'RNC', 'RNZ',
        'RP', 'RPE', 'RPO', 'RRC', 'RST', 'RZ',
        'SBB', 'SBI', 'SHLD', 'SIM', 'SPHL', 'STA', 'STC', 'STAX',
        'SUB', 'SUI',
        'XCHG', 'XRA', 'XRI', 'XTHL',
    ]
    registers_8080 = ['A', 'B', 'C', 'D', 'E', 'H', 'L', 'M', 'PSW', 'F']

    opcodes_z80 = [
        'ADC', 'ADD', 'AND',
        'BIT',
        'CALL', 'CCF', 'CP', 'CPD', 'CPDR', 'CPI', 'CPIR', 'CPL',
        'DAA', 'DEC', 'DI', 'DJNZ',
        'EI', 'EX', 'EXX',
        'HALT',
        'IM', 'IN', 'INC', 'IND', 'INDR', 'INI', 'INIR',
        'JP', 'JR',
        'LD', 'LDD', 'LDDR', 'LDI', 'LDIR',
        'NEG', 'NOP',
        'OR', 'OTDR', 'OTIR', 'OUT', 'OUTD', 'OUTI',
        'POP', 'PUSH',
        'RES', 'RET', 'RETI', 'RETN', 'RL', 'RLA', 'RLC', 'RLCA', 'RLD',
        'RR', 'RRA', 'RRC', 'RRCA', 'RRD', 'RST',
        'SBC', 'SCF', 'SET', 'SLA', 'SRA', 'SRL', 'SUB',
        'XOR',
    ]
    registers_z80 = [
        'A', 'B', 'C', 'D', 'E', 'H', 'L', 'F', 'AF', 'BC', 'DE', 'HL',
        "A'", "B'", "C'", "D'", "E'", "H'", "L'", "AF'", "F'", "BC'",
        "DE'", "HL'",
        'IX', 'IY', 'PSW', 'M',
    ]

    opcodes_8086 = [
        'AAA', 'AAD', 'AAM', 'AAS', 'ADC', 'ADD', 'AND',
        'CALL', 'CBW', 'CLC', 'CLD', 'CLI', 'CMC', 'CMP', 'CMPS',
        'CMPSB', 'CMPW', 'CMPXCHG', 'CWD',
        'DAA', 'DAS', 'DEC', 'DIV',
        'ESC',
        'FWAIT',
        'F2XM1', 'FABS', 'FADD', 'FADDP', 'FBLD', 'FBSTP', 'FCHS',
        'FCLEX', 'FCOM', 'FCOMP', 'FCOMPP', 'FCOS', 'FDECSTP', 'FDISI',
        'FDIV', 'FDIVP', 'FDIVR', 'FDIVRP', 'FENI', 'FFREE', 'FIADD',
        'FICOM', 'FICOMP', 'FIDIV', 'FIDIVR', 'FILD', 'FIMUL', 'FINCSTP',
        'FINIT', 'FIST', 'FISTP', 'FISUB', 'FISUBR', 'FLD', 'FLD1',
        'FLDCW', 'FLDENV', 'FLDL2E', 'FLDL2T', 'FLDLG2', 'FLDLN2',
        'FLDPI', 'FLDZ', 'FMUL', 'FMULP', 'FNCLEX', 'FNDISI', 'FNENI',
        'FNINIT', 'FNOP', 'FNSAVE', 'FNSTCW', 'FNSTENV', 'FNSTSW',
        'FPATAN', 'FPREM', 'FPREM1', 'FPTAN', 'FRNDINT', 'FRSTOR',
        'FSAVE', 'FSCALE', 'FSETPM', 'FSIN', 'FSINCOS', 'FSQRT', 'FST',
        'FSTCW', 'FSTENV', 'FSTP', 'FSTSW', 'FSUB', 'FSUBP', 'FSUBRP',
        'FTST', 'FUCOM', 'FUCOMP', 'FUCOMPP', 'FXAM', 'FXCH', 'FXTRACT',
        'FYL2X', 'FYL2XP1',
        'HLT',
        'IDIV', 'IMUL', 'IN', 'INC', 'INT', 'INTO', 'INVD', 'IRET',
        'IRETD',
        'JA', 'JAE', 'JB', 'JBE', 'JC', 'JCXZ', 'JE', 'JECXZ', 'JG',
        'JGE', 'JL', 'JLE', 'JMP', 'JNA', 'JNAE', 'JNB', 'JNBE', 'JNC',
        'JNE', 'JNG', 'JNGE', 'JNL', 'JNLE', 'JNO', 'JNP', 'JNS', 'JO',
        'JP', 'JPE', 'JPO', 'JNZ', 'JS', 'JZ',
        'LAHF', 'LAR', 'LDS', 'LEA', 'LES', 'LOCK', 'LODS', 'LODSB',
        'LODSW', 'LOOP', 'LOOPE', 'LOOPNE', 'LOOPNZ', 'LOOPZ',
        'MOV', 'MOVS', 'MOVSB', 'MOVSW', 'MUL',
        'NEG', 'NOP', 'NOT',
        'OR', 'OUT',
        'POP', 'POPF', 'POPFD', 'PUSH', 'PUSHF', 'PUSHFD',
        'RCL', 'RCR', 'REP', 'REPE', 'REPNE', 'REPNZ', 'REPZ', 'RET',
        'RETF', 'ROL', 'ROR',
        'SAHF', 'SAL', 'SAR', 'SBB', 'SCAS', 'SCASB', 'SCASW', 'SHL',
        'SHR', 'STC', 'STD', 'STI', 'STOS', 'STOSB', 'STOSW', 'SUB',
        'TEST',
        'WAIT', 'WBINVD',
        'XCHG', 'XLAT', 'XLATB', 'XOR',
    ]
    registers_8086 = [
        'AL', 'AH', 'BL', 'BH', 'CL', 'CH', 'DL', 'DH',
        'AX', 'BX', 'CX', 'DX',
        'CS', 'DS', 'SS', 'ES',
        'IP', 'SI', 'DI', 'BP', 'SP', 'FLAGS',
    ]

    opcodes_80186 = [
        'BOUND', 'ENTER', 'INS', 'LEAVE', 'OUTS',
        'POPA', 'POPAD', 'PUSHA', 'PUSHAD',
    ]

    opcodes_80286 = [
        'ARPL', 'CLTS', 'LGDT', 'LIDT', 'LLDT', 'LMSW', 'LSL', 'LSS',
        'SGDT', 'SIDT', 'SLDT', 'SMSW', 'STR', 'VERR', 'VERW',
    ]
    registers_80286 = ['TR']

    opcodes_80386 = [
        'BSF', 'BSR', 'BT', 'BTC', 'BTR', 'BTS',
        'CDQ', 'CWDE',
        'LFS', 'LGS', 'LSS',
        'MOVSX', 'MOVZX',
        'SETAE', 'SETB', 'SETC', 'SETNAE', 'SETNB', 'SETNE', 'SETNZ',
        'SETG', 'SETGE', 'SETL', 'SETLE', 'SETNC', 'SETNG', 'SETNGE',
        'SETNL', 'SETNLE', 'SETNO', 'SETNP', 'SETNS', 'SETE', 'SETO',
        'SETP', 'SETPE', 'SETPO', 'SETS', 'SETZ',
        'SHLD', 'SHRD',
    ]
    registers_80386 = [
        'EAX', 'EBX', 'ECX', 'EDX', 'ESI', 'EDI', 'EBP', 'ESP',
        'FS', 'GS', 'EFLAGS',
    ]

    opcodes_80486 = ['BSWAP', 'INVLPG']

    opcodes_pdp8 = [
        'AND', 'TAD', 'ISZ', 'DCA', 'JMS', 'JMP',
        'CDF', 'CIF', 'RDF', 'RIF', 'RIB', 'RMF',
        'CLA', 'CLL', 'CMA', 'CML', 'IAC', 'RAR', 'RAL', 'RTR', 'RTL',
        'BSW',
        'SMA', 'SZA', 'SNL', 'SPA', 'SNA', 'SZL', 'OSR', 'HLT',
        'MQA', 'MQL',
        'SEL', 'LCD', 'XDR', 'STR', 'SER', 'SDN', 'INTR', 'INIT',
        'DILC', 'DICD', 'DISD', 'DILX', 'DILY', 'DIXY', 'DILE', 'DIRE',
        'RCSF', 'RCRA', 'RCRB', 'RCNO', 'RCRC', 'RCNI', 'RCSD', 'RCSE',
        'RCRD', 'RCSI', 'RCTF',
        'RPE', 'RSF', 'RRB', 'RFC',
        'PCE', 'PSF', 'PCF', 'PPC', 'PLS',
        'KCF', 'KSF', 'KCC', 'KRS', 'KIE', 'KRB',
        'TFL', 'TSF', 'TCF', 'TPC', 'TSK', 'TLS',
    ]

    opcodes_pdp11 = [
        'CLR', 'CLRB', 'COM', 'COMB', 'INC', 'INCB', 'DEC', 'DECB',
        'NEG', 'NEGB', 'NOP', 'TST', 'TSTB', 'TSTSET', 'WRTLCK',
        'ASR', 'ASRB', 'ASL', 'ASLB', 'ROR', 'RORB', 'ROL', 'ROLB',
        'SWAB', 'ADC', 'ADCB', 'SBC', 'SBCB', 'SXT',
        'MOV', 'MOVB', 'ADD', 'SUB', 'CMP', 'CMPB',
        'ASH', 'ASHC', 'MUL', 'DIV',
        'BIT', 'BITB', 'BIC', 'BICB', 'BIS', 'BISB',
        'XOR', 'CLR', 'CLRB',
        'BR', 'BNE', 'BPL', 'BEQ', 'BMI', 'BVC', 'BVS', 'BCC', 'BCS',
        'BGE', 'BLT', 'BGT', 'BLE', 'SOB',
        'BHI', 'BLOS', 'BHIS', 'BLO',
        'JMP', 'JSR', 'RTS', 'MARK',
        'EMT', 'TRAP', 'BPT', 'IOT', 'CSM', 'RTI', 'RTT',
        'HALT', 'WAIT', 'RESET',
        'MTPD', 'MTPI', 'MFPD', 'MTPS', 'MFPS', 'MFPT',
        'CLC', 'CLV', 'CLZ', 'CLN', 'CCC',
        'SEC', 'SEV', 'SEZ', 'SEN', 'SCC',
        'FADD', 'FSUB', 'FMUL', 'FDIV',
        'DIV', 'MUL',
    ]
    registers_pdp11 = ['r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7']

    opcodes = []
    registers = []

    if processor in ['1802']:
        opcodes += opcodes_1802
        registers += registers_1802

    if processor in ['6502']:
        opcodes += opcodes_6502
        registers += registers_6502

    if processor in ['6800']:
        opcodes += opcodes_6800
        registers += registers_6800

    if processor in ['68000']:
        opcodes += opcodes_68000
        registers += registers_68000

    if processor in ['8080']:
        opcodes += opcodes_8080
        registers += registers_8080

    if processor in ['z80']:
        opcodes += opcodes_z80
        registers += registers_z80

    # the x86 family is cumulative: each model adds to its predecessors
    if processor in ['8086', '80186', '80286', '80386', '80486']:
        opcodes += opcodes_8086
        registers += registers_8086

    if processor in ['80186', '80286', '80386', '80486']:
        opcodes += opcodes_80186

    if processor in ['80286', '80386', '80486']:
        opcodes += opcodes_80286
        registers += registers_80286

    if processor in ['80386', '80486']:
        opcodes += opcodes_80386
        registers += registers_80386

    if processor in ['80486']:
        opcodes += opcodes_80486

    if processor in ['pdp-8']:
        # no register table is defined for the PDP-8
        opcodes += opcodes_pdp8

    if processor in ['pdp-11']:
        opcodes += opcodes_pdp11
        registers += registers_pdp11

    opcode_tb = CaseInsensitiveListTokenBuilder(opcodes, 'keyword', False)
    register_tb = CaseInsensitiveListTokenBuilder(
        registers, 'register', True)

    values = ['*', '$', '.']
    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    # builders for free-format tokenizing of the whole text
    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        stmt_separator_tb,
        integer_tb,
        integer_exponent_tb,
        integer_1_tb,
        integer_2_tb,
        prefixed_integer_tb,
        hex_integer_1_tb,
        hex_integer_2_tb,
        hex_integer_3_tb,
        hex_integer_4_tb,
        hash_quote_value_tb,
        values_tb,
        groupers_tb,
        register_tb,
        opcode_tb,
        directive_tb,
        title_directive_tb,
        title_directive_2_tb,
        subtitle_directive_tb,
        subtitle_directive_2_tb,
        subtitle_directive_3_tb,
        include_directive_tb,
        include_directive_2_tb,
        multiline_comment_tb,
        preprocessor_tb,
        identifier_tb,
        label_tb,
        string_tb,
        comment_tb,
        comment_2_tb,
        line_comment_star_tb,
        line_comment_hash_tb,
        known_operator_tb,
        self.unknown_operator_tb,
        invalid_token_builder,
    ]

    # builders used by the space-format tokenizer for the opcode field
    opcode_tokenbuilders = [
        opcode_tb,
        directive_tb,
        title_directive_tb,
        subtitle_directive_tb,
        include_directive_tb,
        preprocessor_tb,
        invalid_token_builder,
    ]

    # builders used by the space-format tokenizer for the argument field
    args_tokenbuilders = [
        integer_tb,
        integer_exponent_tb,
        hex_integer_1_tb,
        hex_integer_2_tb,
        hex_integer_3_tb,
        hex_integer_4_tb,
        values_tb,
        groupers_tb,
        known_operator_tb,
        register_tb,
        identifier_tb,
        label_tb,
        string_tb,
        comment_tb,
        line_comment_star_tb,
        line_comment_hash_tb,
        self.unknown_operator_tb,
        invalid_token_builder,
    ]

    tokenizer = Tokenizer(tokenbuilders)
    opcode_tokenizer = Tokenizer(opcode_tokenbuilders)
    args_tokenizer = Tokenizer(args_tokenbuilders)

    # tokenize as free-format
    tokens_free = tokenizer.tokenize(code)
    tokens_free = Examiner.combine_adjacent_identical_tokens(
        tokens_free, 'invalid operator')
    tokens_free = Examiner.combine_adjacent_identical_tokens(
        tokens_free, 'invalid')
    tokens_free = Examiner.combine_identifier_colon(
        tokens_free, ['newline'], [], [])
    tokens_free = Tokenizer.combine_number_and_adjacent_identifier(
        tokens_free)
    tokens_free = Examiner.convert_values_to_operators(
        tokens_free, known_operators)
    self.tokens = tokens_free
    # NOTE(review): the convert_asm_* helpers work on self.tokens; if they
    # rebind it instead of editing in place, tokens_free (restored at the
    # end of this method) will not carry their changes - confirm.
    self.convert_asm_identifiers_to_labels()
    self.convert_asm_keywords_to_operators()
    self.convert_asm_keywords_to_identifiers()

    self.calc_statistics()
    statistics_free = self.statistics
    self.statistics = {}

    self.calc_confidences(
        operand_types, group_starts, group_mids, group_ends, None)
    self.calc_line_length_confidence(code, self.max_expected_line)
    confidences_free = self.confidences
    self.confidences = {}
    errors_free = self.errors
    self.errors = []

    if processor in ['pdp-8', 'pdp-11']:
        # do not try space-format, it never exists for these processors
        tokens_space = []
        statistics_space = {}
        confidences_space = {}
        errors_space = []
    else:
        # tokenize as space-format
        opcode_extras = '.&=,()+-*/'
        label_leads = '.&$@#'
        label_mids = '.&$#@_'
        label_ends = ':'
        comment_leads = '*;'
        line_comment_leads = ''
        use_line_id = False
        tokens_space, indents = Tokenizer.tokenize_asm_code(
            code, tab_size, opcode_tokenizer, opcode_extras,
            args_tokenizer, label_leads, label_mids, label_ends,
            comment_leads, line_comment_leads, use_line_id)
        tokens_space = Examiner.combine_adjacent_identical_tokens(
            tokens_space, 'invalid operator')
        tokens_space = Examiner.combine_adjacent_identical_tokens(
            tokens_space, 'invalid')
        tokens_space = Examiner.combine_identifier_colon(
            tokens_space, ['newline'], [], [])
        tokens_space = Tokenizer.combine_number_and_adjacent_identifier(
            tokens_space)
        tokens_space = Examiner.convert_values_to_operators(
            tokens_space, known_operators)
        self.tokens = tokens_space
        self.convert_asm_identifiers_to_labels()

        self.calc_statistics()
        statistics_space = self.statistics
        self.statistics = {}

        self.calc_confidences(
            operand_types, group_starts, group_mids, group_ends, indents)
        self.calc_line_length_confidence(code, self.max_expected_line)
        confidences_space = self.confidences
        self.confidences = {}
        errors_space = self.errors
        self.errors = []

    # compute confidence for free-format and spaced-format:
    # the product of all factors, or zero when there are no factors
    confidence_free = 0.0
    if confidences_free:
        confidence_free = 1.0
        for factor in confidences_free.values():
            confidence_free *= factor

    confidence_space = 0.0
    if confidences_space:
        confidence_space = 1.0
        for factor in confidences_space.values():
            confidence_space *= factor

    # select the better of free-format and spaced-format
    if confidence_space > confidence_free:
        self.tokens = tokens_space
        self.statistics = statistics_space
        self.confidences = confidences_space
        self.errors = errors_space
    else:
        self.tokens = tokens_free
        self.statistics = statistics_free
        self.confidences = confidences_free
        self.errors = errors_free
def __init__(self, code):
    """Tokenize Modula-2 source text and compute its confidence factors.

    The code is passed through ``TrimCtrlZText``, tokenized with the
    Modula-2 token builders defined below, stored in ``self.tokens``, and
    then scored by the statistics and confidence calculations at the end
    of this method.

    Args:
        code: Source text to examine.
    """
    super().__init__()

    # Ctrl-Z (0x1A) written as an escape so the character is visible.
    # NOTE(review): presumably an end-of-file marker that TrimCtrlZText
    # strips - confirm against that helper.
    ctrlz_char = '\x1a'
    code = self.TrimCtrlZText(code, ctrlz_char)

    operand_types = []

    whitespace_tb = WhitespaceTokenBuilder()
    newline_tb = NewlineTokenBuilder()
    stmt_separator_tb = SingleCharacterTokenBuilder(
        ';', 'statement separator', False)

    # numbers; H, C and B suffixes mark hex, octal and binary constants
    integer_tb = IntegerTokenBuilder(None)
    integer_exponent_tb = IntegerExponentTokenBuilder(None)
    real_tb = RealTokenBuilder(True, True, None)
    real_exponent_tb = RealExponentTokenBuilder(True, True, 'E', None)
    hex_constant_tb = SuffixedIntegerTokenBuilder(
        'H', True, '0123456789ABCDEFabcdef')
    octal_constant_tb = SuffixedIntegerTokenBuilder('C', True, '01234567')
    binary_constant_tb = SuffixedIntegerTokenBuilder('B', True, '01')
    operand_types.append('number')

    leads = '_'
    extras = '_'
    identifier_tb = IdentifierTokenBuilder(leads, extras)
    operand_types.append('identifier')

    quotes = ["'", '"']
    string_tb = StringTokenBuilder(quotes, 0)
    operand_types.append('string')

    paren_star_comment_tb = BlockTokenBuilder('(*', '*)', 'comment')

    known_operators = [
        ':=', '=', '>', '>=', '<', '<=', '#', '<>',
        '+', '-', '*', '/', 'DIV', 'MOD',
        'AND', 'OR', 'NOT', '^', '.', '..', 'IN', '&',
    ]
    known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    self.unary_operators = ['+', '-', 'NOT', '@', '^', '.']
    self.postfix_operators = ['^']

    groupers = ['(', ')', ',', '[', ']', '{', '}', ':', '|']
    group_starts = ['(', '[', ',', '{']
    group_mids = [',', ':', '|']
    group_ends = [')', ']', '}']
    groupers_tb = CaseSensitiveListTokenBuilder(groupers, 'group', False)

    # RETURN and UNTIL are reserved words too; UNTIL is also the closer
    # used by the REPEAT/UNTIL pairing check at the end of this method
    keywords = [
        'BEGIN', 'BY', 'CASE', 'CONST', 'DEFINITION', 'DO',
        'ELSE', 'ELSIF', 'END', 'EXCEPT', 'EXIT', 'EXPORT',
        'FINALLY', 'FOR', 'FROM',
        'IF', 'IMPLEMENTATION', 'IMPORT',
        'LOOP', 'MODULE', 'OF', 'PROCEDURE', 'QUALIFIED',
        'REPEAT', 'RETURN', 'THEN', 'TO', 'TYPE',
        'UNTIL', 'VAR', 'WITH', 'WHILE',
    ]
    keyword_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = [
        'ARRAY', 'BOOLEAN', 'CARDINAL', 'CHAR', 'INTEGER',
        'POINTER', 'REAL', 'RECORD', 'SET',
    ]
    types_tb = CaseSensitiveListTokenBuilder(types, 'type', True)
    operand_types.append('type')

    values = ['FALSE', 'NIL', 'TRUE']
    values_tb = CaseSensitiveListTokenBuilder(values, 'value', True)
    operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    tokenbuilders = [
        newline_tb,
        whitespace_tb,
        stmt_separator_tb,
        integer_tb,
        integer_exponent_tb,
        real_tb,
        real_exponent_tb,
        hex_constant_tb,
        octal_constant_tb,
        binary_constant_tb,
        keyword_tb,
        types_tb,
        values_tb,
        known_operator_tb,
        groupers_tb,
        identifier_tb,
        string_tb,
        paren_star_comment_tb,
        self.unknown_operator_tb,
        invalid_token_builder,
    ]

    tokenizer = Tokenizer(tokenbuilders)
    tokens = tokenizer.tokenize(code)
    tokens = Examiner.combine_adjacent_identical_tokens(
        tokens, 'invalid operator')
    tokens = Examiner.combine_adjacent_identical_tokens(tokens, 'invalid')
    self.tokens = tokens

    self.calc_statistics()

    # the remaining checks run on the source tokens with lines joined
    tokens = self.source_tokens()
    tokens = Examiner.join_all_lines(tokens)

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    num_operators = self.count_my_tokens(['operator', 'invalid operator'])
    if num_operators > 0:
        self.calc_operator_confidence(num_operators)
        allow_pairs = []
        self.calc_operator_2_confidence(tokens, num_operators, allow_pairs)
        self.calc_operator_3_confidence(
            tokens, num_operators, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            tokens, num_operators, group_starts, allow_pairs)

    self.calc_group_confidence(tokens, group_mids)

    operand_types_2 = ['number', 'string', 'identifier', 'variable']
    self.calc_operand_n_confidence(tokens, operand_types_2, 2)
    self.calc_operand_n_confidence(tokens, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(
        ['BEGIN', 'RECORD', 'CASE', 'DO', 'IF', 'WHILE'], ['END'])
    self.calc_paired_blockers_confidence(['REPEAT'], ['UNTIL'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, block_comment_limit):
    """Tokenize D source text and compute its confidence scores.

    Sets up the D token builders, tokenizes ``code`` into ``self.tokens``
    (merging numeric and string suffixes into the preceding literal), and
    runs the inherited ``calc_*`` scoring helpers.

    Args:
        code: The source text to examine.
        block_comment_limit: Passed through to the ``/+ ... +/`` nested
            comment token builder.
    """
    super().__init__()

    # Groups that count as operands, in the order the builders are created.
    operand_types = [
        'number', 'identifier', 'attribute', 'string', 'class',
        'type', 'value'
    ]

    ws_tb = WhitespaceTokenBuilder()
    nl_tb = NewlineTokenBuilder()

    # --- numeric literals ---
    int_tb = IntegerTokenBuilder("'")
    int_exp_tb = IntegerExponentTokenBuilder("'")
    hex_int_tb = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF_')
    bin_int_tb = PrefixedIntegerTokenBuilder('0b', False, '01_')
    suffixed_int_tb = SuffixedIntegerTokenBuilder(
        ['U', 'L', 'LU', 'UL'], False, None)
    float_tb = RealTokenBuilder(False, False, "'")
    suffixed_float_tb = SuffixedRealTokenBuilder(
        False, False, ['f', 'l', 'i'], False, None)
    float_exp_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    hex_float_tb = HexRealExponentTokenBuilder()

    # --- identifiers and attributes ---
    ident_tb = IdentifierTokenBuilder('_', '_')
    attr_tb = PrefixedIdentifierTokenBuilder('@', 'attribute', False)

    # --- strings (string suffix: c,w,d) ---
    quotes = ['"', "'", "’"]
    plain_string_tb = EscapedStringTokenBuilder(quotes, 0)
    raw_string_tb = PrefixedStringTokenBuilder('r', True, quotes)
    backquote_string_tb = EscapedStringTokenBuilder(['`'], 0)
    hex_string_tb = PrefixedStringTokenBuilder('x', True, quotes)
    # q{} string
    delimited_string_tb = PrefixedStringTokenBuilder('q', True, quotes)
    suffix_string_tb = SuffixedStringTokenBuilder(quotes, 'cwd', False)

    class_tb = ClassTypeTokenBuilder()

    # --- comments ---
    line_comment_tb = SlashSlashCommentTokenBuilder()
    block_comment_tb = SlashStarCommentTokenBuilder()
    nested_comment_tb = NestedCommentTokenBuilder(
        '/+', '+/', block_comment_limit)

    continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    terminator_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    # --- operators and groupers ---
    known_operators = [
        '/', '/=', '.', '..', '...', '&', '&=', '&&', '|', '|=', '||',
        '-', '-=', '--', '+', '+=', '++', '<', '<=', '<<', '<<=',
        '>', '>=', '>>=', '>>>=', '>>', '>>>', '!', '!=',
        '?', ',', ':', '$', '=', '==', '*', '*=', '%', '%=',
        '^', '^=', '^^', '^^=', '~', '~=', '@', '=>', '#',
        'new', 'delete', 'typeof', 'is'
    ]

    self.unary_operators = [
        '+', '-', '*',
        '!', '&', '~',
        '++', '--', ':',
        'new', 'delete', 'typeof', 'is'
    ]

    self.postfix_operators = [
        '++', '--', '&', ':'
    ]

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    group_starts = ['(', '[', ',', '{']
    group_mids = [',']
    group_ends = [')', ']', '}']

    grouper_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)
    operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    # --- reserved words ---
    keywords = [
        'abstract', 'alias', 'align', 'asm', 'assert', 'auto',
        'body', 'break', 'case', 'cast', 'catch', 'class', 'const',
        'continue',
        'debug', 'default', 'delegate', 'deprecated', 'do',
        'else', 'enum', 'export', 'extern',
        'final', 'finally', 'for', 'foreach', 'foreach_reverse',
        'function',
        'goto',
        'if', 'immutable', 'import', 'in', 'inout', 'interface',
        'invariant',
        'lazy',
        'macro', 'mixin', 'module',
        'nothrow',
        'out', 'override',
        'package', 'pragma', 'private', 'protected', 'public', 'pure',
        'ref', 'return',
        'scope', 'shared', 'static', 'struct', 'switch', 'synchronized',
        'template', 'throw', 'try', 'typeid',
        'union', 'unittest',
        'version',
        'while', 'with',
        '__gshared', '__traits', '__vector', '__parameters'
    ]
    kw_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = [
        'bool', 'byte',
        'cdouble', 'cent', 'cfloat', 'char', 'creal',
        'dchar', 'double',
        'float',
        'idouble', 'ifloat', 'int', 'ireal',
        'long',
        'real',
        'short',
        'ubyte', 'ucent', 'uint', 'ulong', 'ushort',
        'void',
        'wchar'
    ]
    type_tb = CaseSensitiveListTokenBuilder(types, 'type', True)

    values = [
        'false', 'null', 'super', 'this', 'true',
        '__FILE__', '__FILE_FULL_PATH__', '__MODULE__', '__LINE__',
        '__FUNCTION__', '__PRETTY_FUNCTION__',
        '__DATE__', '__EOF__', '__TIME__', '__TIMESTAMP__',
        '__VENDOR__', '__VERSION__'
    ]
    value_tb = CaseSensitiveListTokenBuilder(values, 'value', True)

    invalid_tb = InvalidTokenBuilder()

    tokenizer = Tokenizer([
        nl_tb,
        ws_tb,
        continuation_tb,
        terminator_tb,
        int_tb,
        int_exp_tb,
        hex_int_tb,
        bin_int_tb,
        suffixed_int_tb,
        float_tb,
        float_exp_tb,
        suffixed_float_tb,
        hex_float_tb,
        kw_tb,
        type_tb,
        value_tb,
        grouper_tb,
        operator_tb,
        ident_tb,
        attr_tb,
        class_tb,
        plain_string_tb,
        raw_string_tb,
        hex_string_tb,
        backquote_string_tb,
        delimited_string_tb,
        suffix_string_tb,
        line_comment_tb,
        block_comment_tb,
        nested_comment_tb,
        self.unknown_operator_tb,
        invalid_tb
    ])

    # --- tokenize and post-process ---
    raw_tokens = tokenizer.tokenize(code)
    for merged_group in ('invalid operator', 'invalid'):
        raw_tokens = Examiner.combine_adjacent_identical_tokens(
            raw_tokens, merged_group)
    raw_tokens = Examiner.combine_identifier_colon(
        raw_tokens,
        ['statement terminator', 'newline'],
        ['{'],
        ['whitespace', 'comment'])
    self.tokens = raw_tokens
    self.convert_identifiers_to_labels()

    number_suffixes = [
        'f', 'F', 'i', 'I', 'u', 'U', 'l', 'L',
        'ul', 'uL', 'Ul', 'UL', 'lu', 'lU', 'Lu', 'LU'
    ]
    with_number_suffixes = self.combine_tokens_and_adjacent_types(
        raw_tokens, 'number', 'identifier', number_suffixes)

    string_suffixes = ['c', 'w', 'd']
    self.tokens = self.combine_tokens_and_adjacent_types(
        with_number_suffixes, 'string', 'identifier', string_suffixes)

    # --- scoring ---
    self.calc_statistics()

    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence()

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        allow_pairs = []
        self.calc_operator_2_confidence(joined, operator_count, allow_pairs)
        self.calc_operator_3_confidence(
            joined, operator_count, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            joined, operator_count, group_starts, allow_pairs)

    self.calc_group_confidence(joined, group_mids)

    self.calc_operand_n_confidence(joined, ['number', 'symbol'], 2)
    self.calc_operand_n_confidence(joined, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)
def __init__(self, code, tab_size, wide):
    """Tokenize PL/1 source text as free- and fixed-format; keep the best.

    The text is tokenized twice: once with the plain tokenizer
    (free-format) and once through ``self.tokenize_code`` with the extra
    PL/1 comment builders (fixed-format).  Each result is scored with the
    inherited ``calc_*`` helpers, the per-check confidences are multiplied
    together, and the tokens, statistics, confidences and errors of the
    higher-scoring pass are kept on the instance.  Ties go to free-format.

    Args:
        code: The source text to examine.
        tab_size: Forwarded to ``self.tokenize_code`` for the fixed-format
            pass.
        wide: Forwarded to ``self.tokenize_code`` for the fixed-format pass.
    """
    super().__init__()

    self.operand_types = []

    self.whitespace_tb = WhitespaceTokenBuilder()
    self.newline_tb = NewlineTokenBuilder()

    self.integer_tb = IntegerTokenBuilder(None)
    self.integer_exponent_tb = IntegerExponentTokenBuilder(None)
    self.binary_integer_tb = SuffixedIntegerTokenBuilder(['B'], False, None)
    self.real_tb = RealTokenBuilder(False, False, None)
    self.real_exponent_tb = RealExponentTokenBuilder(
        False, False, 'E', None)
    self.binary_real_tb = SuffixedRealTokenBuilder(
        True, True, ['B'], False, None)
    self.operand_types.append('number')

    leads = '_'
    extras = '_'
    self.identifier_tb = IdentifierTokenBuilder(leads, extras)
    self.operand_types.append('identifier')

    quotes = ['"', "'", "’"]
    self.string_tb = EscapedStringTokenBuilder(quotes, 0)
    self.operand_types.append('string')

    self.label_tb = PL1LabelTokenBuilder()
    self.operand_types.append('label')

    self.slash_star_comment_tb = SlashStarCommentTokenBuilder()

    self.jcl_tb = JCLTokenBuilder()

    directives = [
        '%ACTIVATE',
        '%DEACTIVATE', '%DECLARE', '%DCL', '%DICTIONARY', '%DO',
        '%ELSE', '%END',
        '%FATAL',
        '%GOTO',
        '%IF', '%INCLUDE',
        '%LIST',
        '%NOLIST',
        '%PAGE', '%PROCEDURE', '%PROC',
        '%REPLACE', '%RETURN',
        '%THEN'
    ]

    self.line_continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    self.preprocessor_tb = CaseInsensitiveListTokenBuilder(
        directives, 'preprocessor', False)
    self.title_tb = LeadToEndOfLineTokenBuilder(
        '%TITLE', True, 'preprocessor')
    self.subtitle_tb = LeadToEndOfLineTokenBuilder(
        '%SBTTL', True, 'preprocessor')
    self.error_tb = LeadToEndOfLineTokenBuilder(
        '%ERROR', True, 'preprocessor')
    self.warn_tb = LeadToEndOfLineTokenBuilder(
        '%WARN', True, 'preprocessor')
    self.inform_tb = LeadToEndOfLineTokenBuilder(
        '%INFORM', True, 'preprocessor')
    self.terminators_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    known_operators = [
        '+', '-', '*', '/', '**',
        '>', '<', '=', '>=', '<=',
        '¬>', '¬<', '¬=',
        '^>', '^<', '^=', '^',
        '~>', '~<', '~=', '~',
        '¬', '&', '&:',
        '|', '|:', '||',
        '!', '!:', '!!',
        ':'
    ]

    self.unary_operators = ['+', '-', '^', '~', '¬']
    self.postfix_operators = []

    groupers = ['(', ')', ',', '[', ']', '{', '}']
    self.group_starts = ['(', '[', ',', '{']
    self.group_mids = [',']
    self.group_ends = [')', ']', '}']

    self.groupers_tb = CaseInsensitiveListTokenBuilder(
        groupers, 'group', False)
    self.known_operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    keywords = [
        'ALLOCATE', 'ALLOC',
        'BEGIN',
        'CALL', 'CLOSE',
        'DECLARE', 'DCL', 'DO',
        'ELSE', 'END',
        'FORMAT', 'FREE',
        'GET', 'GOTO', 'GO TO',
        'IF',
        'LEAVE',
        'ON', 'OPEN', 'OTHERWISE', 'OTHER',
        'PROCEDURE', 'PROC', 'PUT',
        'READ', 'RETURN', 'REVERT', 'REWRITE',
        'SELECT', 'SIGNAL', 'STOP',
        'THEN',
        'WHEN', 'WRITE'
    ]

    self.keyword_tb = CaseInsensitiveListTokenBuilder(
        keywords, 'keyword', False)

    # 'INT' and 'KEYED' are two separate entries.  Without the comma
    # between them Python concatenates the adjacent literals into the
    # single bogus entry 'INTKEYED' and neither attribute is recognised.
    attributes = [
        'ALIGNED', 'ANY', 'AREA',
        'BASED', 'BUILTIN',
        'CONDITION', 'COND', 'CONTROLLED', 'CTL',
        'DEFINED', 'DEF', 'DIRECT',
        'ENTRY', 'ENVIRONMENT', 'ENV', 'EXTERNAL', 'EXT',
        'FILE',
        'GLOBALDEF', 'GLOBALREF',
        'INITIAL', 'INIT', 'INPUT', 'INTERNAL', 'INT',
        'KEYED',
        'LABEL', 'LIKE', 'LIST',
        'MEMBER',
        'NONVARYING', 'NONVAR',
        'OPTIONAL', 'OPTIONS', 'OUTPUT',
        'PARAMETER', 'PARM', 'PICTURE', 'PIC', 'POSITION', 'POS',
        'PRECISION', 'PREC', 'PRINT',
        'READONLY', 'RECORD', 'REFER', 'RETURNS',
        'SEQUENTIAL', 'SEQL', 'STATIC', 'STREAM', 'STRUCTURE',
        'TRUNCATE',
        'UNALIGNED', 'UNAL', 'UNION', 'UPDATE',
        'VARIABLE', 'VARYING', 'VAR'
    ]

    self.attributes_tb = CaseInsensitiveListTokenBuilder(
        attributes, 'attribute', False)

    functions = [
        'ABS', 'ACOS', 'ACTUALCOUNT', 'ADD', 'ADDR', 'ADDREL',
        'ALLOCATION', 'ALLOCN', 'ASIN', 'ATAN', 'ATAND', 'ATANH',
        'AUTOMATIC', 'AUTO',
        'BINARY', 'BIN', 'BIT', 'BOOL', 'BYTE', 'BYTESIZE',
        'CEIL', 'CHARACTER', 'CHAR', 'COLLATE', 'COPY', 'COS', 'COSD',
        'COSH',
        'DATE', 'DATETIME', 'DECIMAL', 'DEC', 'DECODE', 'DESCRIPTOR',
        'DESC', 'DIMENSION', 'DIM', 'DIVIDE',
        'EMPTY', 'ENCODE', 'ERROR', 'EVERY', 'EXP',
        'FIXED', 'FLOAT', 'FLOOR',
        'HBOUND', 'HIGH',
        'INDEX', 'INFORM', 'INT',
        'LBOUND', 'LENGTH', 'LINE', 'LINENO', 'LOG', 'LOG10', 'LOG2',
        'LOW', 'LTRIM',
        'MAX', 'MAXLENGTH', 'MIN', 'MOD', 'MULTIPLY',
        'NULL',
        'OFFSET', 'ONARGSLIST', 'ONCHAR', 'ONCODE', 'ONFILE', 'ONKEY',
        'ONSOURCE',
        'PAGENO', 'POINTER', 'PTR', 'POSINT', 'PRESENT', 'PROD',
        'RANK', 'REFERENCE', 'REVERSE', 'ROUND', 'RTRIM',
        'SEARCH', 'SIGN', 'SIN', 'SIND', 'SINH', 'SIZE', 'SOME', 'SQRT',
        'STRING', 'SUBSTR', 'SUBTRACT', 'SUM',
        'TAN', 'TAND', 'TANH', 'TIME', 'TRANSLATE', 'TRIM', 'TRUNC',
        'UNSPEC',
        'VALID', 'VALUE', 'VAL', 'VARIANT', 'VERIFY',
        'WARN'
    ]

    self.function_tb = CaseInsensitiveListTokenBuilder(
        functions, 'function', True)

    format_items = [
        'A',
        'B', 'B1', 'B2', 'B3', 'B4',
        'COLUMN', 'COL',
        'E',
        'F',
        'P',
        'R',
        'TAB',
        'X'
    ]

    # Built and stored on the instance, but not added to either tokenizer
    # list in this constructor.
    self.format_item_tb = CaseSensitiveListTokenBuilder(
        format_items, 'format', True)
    self.operand_types.append('format')

    options = [
        'APPEND',
        'BACKUP_DATE', 'BATCH', 'BLOCK_BOUNDARY_FORMAT', 'BLOCK_IO',
        'BLOCK_SIZE', 'BUCKET_SIZE', 'BY',
        'CANCEL_CONTROL_O', 'CARRIAGE_RETURN_FORMAT', 'CONTIGUOUS',
        'CONTIGUOUS_BEST_TRY', 'CREATION_DATE', 'CURRENT_POSITION',
        'DEFAULT_FILE_NAME', 'DEFERRED_WRITE', 'DELETE',
        'EDIT', 'EXPIRATION_DATE', 'EXTENSION_SIZE',
        'FAST_DELETE', 'FILE_ID', 'FILE_ID_TO', 'FILE_SIZE',
        'FIXED_CONTROL_FROM', 'FIXED_CONTROL_SIZE',
        'FIXED_CONTROL_SIZE_TO', 'FIXED_CONTROL_TO',
        'FIXED_LENGTH_RECORDS', 'FROM',
        'GROUP_PROTECTION',
        'IDENT', 'IGNORE_LINE_MARKS', 'IN', 'INDEXED', 'INDEX_NUMBER',
        'INITIAL_FILL', 'INTO',
        'KEY', 'KEYFROM', 'KEYTO',
        'LINESIZE', 'LOCK_ON_READ', 'LOCK_ON_WRITE',
        'MAIN PROCEDURE', 'MANUAL_UNLOCKING', 'MATCH_GREATER',
        'MATCH_GREATER_EQUAL', 'MATCH_NEXT', 'MATCH_NEXT_EQUAL',
        'MAXIMUM_RECORD_NUMBER', 'MAXIMUM_RECORD_SIZE',
        'MULTIBLOCK_COUNT', 'MULTIBUFFER_COUNT',
        'NOLOCK', 'NONEXISTENT_RECORD', 'NONRECURSIVE', 'NORESCAN',
        'NO_ECHO', 'NO_FILTER', 'NO_SHARE',
        'OWNER_GROUP', 'OWNER_ID', 'OWNER_MEMBER', 'OWNER_PROTECTION',
        'PAGE', 'PAGESIZE', 'PRINTER_FORMAT', 'PROMPT',
        'PURGE_TYPE_AHEAD',
        'READ_AHEAD', 'READ_CHECK', 'READ_REGARDLESS', 'RECORD_ID',
        'RECORD_ID_ACCESS', 'RECORD_ID_TO', 'RECURSIVE', 'REPEAT',
        'RESCAN', 'RETRIEVAL_POINTERS', 'REVISION_DATE',
        'REWIND_ON_CLOSE', 'REWIND_ON_OPEN',
        'SCALARVARYING', 'SET READ', 'SHARED_READ', 'SHARED_WRITE',
        'SKIP', 'SNAP', 'SPOOL', 'STATEMENT', 'SUPERSEDE', 'SYSTEM',
        'SYSTEM_PROTECTION',
        'TEMPORARY', 'TIMEOUT_PERIOD', 'TITLE', 'TO',
        'UNDERFLOW', 'UFL', 'UNTIL', 'USER_OPEN',
        'WAIT_FOR_RECORD', 'WHILE', 'WORLD_PROTECTION', 'WRITE_BEHIND',
        'WRITE_CHECK'
    ]

    self.options_tb = CaseInsensitiveListTokenBuilder(
        options, 'option', False)

    conditions = [
        'ANYCONDITION',
        'CONVERSION', 'CONV',
        'ENDFILE', 'ENDPAGE',
        'FINISH', 'FIXEDOVERFLOW', 'FOFL',
        'OVERFLOW', 'OFL',
        'STORAGE', 'STRINGRANGE', 'STRG', 'SUBSCRIPTRANGE', 'SUBRG',
        'UNDEFINEDFILE', 'UNDF',
        'VAXCONDITION',
        'ZERODIVIDE', 'ZDIV'
    ]

    self.conditions_tb = CaseInsensitiveListTokenBuilder(
        conditions, 'condition', False)

    subroutines = [
        'DISPLAY',
        'EXTEND',
        'FLUSH',
        'NEXT_VOLUME',
        'RELEASE', 'RESIGNAL', 'REWIND',
        'SPACEBLOCK'
    ]

    self.subroutines_tb = CaseInsensitiveListTokenBuilder(
        subroutines, 'subroutine', False)

    types = [
        'FIXED', 'BINARY', 'FLOAT', 'DECIMAL',
        'BIT', 'CHARACTER', 'PICTURE'
    ]

    self.types_tb = CaseInsensitiveListTokenBuilder(types, 'type', True)
    self.operand_types.append('type')

    values = ['SYSIN', 'SYSPRINT']

    self.values_tb = CaseInsensitiveListTokenBuilder(values, 'value', True)
    self.operand_types.append('value')

    invalid_token_builder = InvalidTokenBuilder()

    def score_current_tokens():
        """Score self.tokens; return and reset statistics/confidences/errors."""
        self.calc_statistics()
        statistics = self.statistics
        self.statistics = {}

        tokens = self.source_tokens()
        tokens = Examiner.join_all_lines(tokens)

        self.calc_token_confidence()
        self.calc_token_2_confidence()

        num_operators = self.count_my_tokens(
            ['operator', 'invalid operator'])
        # NOTE(review): the per-operator checks all take num_operators, so
        # they are kept under this guard; confirm against the original
        # layout.
        if num_operators > 0:
            self.calc_operator_confidence(num_operators)
            allow_pairs = []
            self.calc_operator_2_confidence(
                tokens, num_operators, allow_pairs)
            self.calc_operator_3_confidence(
                tokens, num_operators, self.group_ends, allow_pairs)
            self.calc_operator_4_confidence(
                tokens, num_operators, self.group_starts, allow_pairs)

        self.calc_group_confidence(tokens, self.group_mids)

        operand_types_2 = ['number', 'symbol']
        self.calc_operand_n_confidence(tokens, operand_types_2, 2)
        self.calc_operand_n_confidence(tokens, self.operand_types, 4)

        self.calc_keyword_confidence()

        self.calc_paired_blockers_confidence(['{'], ['}'])
        self.calc_line_length_confidence(code, self.max_expected_line)

        confidences = self.confidences
        self.confidences = {}
        errors = self.errors
        self.errors = []

        return statistics, confidences, errors

    def overall_confidence(confidences):
        """Multiply all confidence factors; no factors at all scores 0.0."""
        if not confidences:
            return 0.0

        product = 1.0
        for factor in confidences.values():
            product *= factor

        return product

    # tokenize as free-format
    tokenbuilders_free = [
        self.newline_tb,
        self.whitespace_tb,
        self.line_continuation_tb,
        self.terminators_tb,
        self.integer_tb,
        self.integer_exponent_tb,
        self.binary_integer_tb,
        self.real_tb,
        self.real_exponent_tb,
        self.binary_real_tb,
        self.keyword_tb,
        self.function_tb,
        self.attributes_tb,
        self.options_tb,
        self.conditions_tb,
        self.subroutines_tb,
        self.types_tb,
        self.values_tb,
        self.groupers_tb,
        self.known_operator_tb,
        self.identifier_tb,
        self.string_tb,
        self.label_tb,
        self.slash_star_comment_tb,
        self.preprocessor_tb,
        self.title_tb,
        self.subtitle_tb,
        self.error_tb,
        self.warn_tb,
        self.inform_tb,
        self.jcl_tb,
        self.unknown_operator_tb,
        invalid_token_builder
    ]

    tokenizer_free = Tokenizer(tokenbuilders_free)
    tokens_free = tokenizer_free.tokenize(code)
    tokens_free = Examiner.combine_adjacent_identical_tokens(
        tokens_free, 'invalid operator')
    tokens_free = Examiner.combine_adjacent_identical_tokens(
        tokens_free, 'invalid')
    self.tokens = tokens_free

    statistics_free, confidences_free, errors_free = score_current_tokens()

    # tokenize as fixed-format
    # The base builder list is the same as for free-format; the PL/1
    # comment builders are appended to it below.
    tokenbuilders_fixed = list(tokenbuilders_free)

    comment_start_tb = PL1CommentStartTokenBuilder()
    comment_middle_tb = PL1CommentMiddleTokenBuilder()
    comment_end_tb = PL1CommentEndTokenBuilder()

    type1_tokenbuilders = [comment_start_tb]
    tokenbuilders_fixed_1 = (
        tokenbuilders_fixed + type1_tokenbuilders + [invalid_token_builder]
    )
    tokenizer_fixed_1 = Tokenizer(tokenbuilders_fixed_1)

    type2_tokenbuilders = [
        comment_start_tb, comment_middle_tb, comment_end_tb
    ]
    tokenbuilders_fixed_2 = (
        tokenbuilders_fixed + type2_tokenbuilders + [invalid_token_builder]
    )
    tokenizer_fixed_2 = Tokenizer(tokenbuilders_fixed_2)

    tokens_fixed = self.tokenize_code(
        code, tab_size, tokenizer_fixed_1, tokenizer_fixed_2, wide)
    tokens_fixed = Examiner.combine_adjacent_identical_tokens(
        tokens_fixed, 'invalid operator')
    tokens_fixed = Examiner.combine_adjacent_identical_tokens(
        tokens_fixed, 'invalid')
    tokens_fixed = Examiner.combine_adjacent_identical_tokens(
        tokens_fixed, 'whitespace')
    tokens_fixed = self.convert_broken_comments_to_comments(tokens_fixed)
    self.tokens = tokens_fixed

    statistics_fixed, confidences_fixed, errors_fixed = (
        score_current_tokens()
    )

    # compute confidence for free-format and fixed-format
    confidence_free = overall_confidence(confidences_free)
    confidence_fixed = overall_confidence(confidences_fixed)

    # select the better of free-format and fixed-format
    if confidence_fixed > confidence_free:
        self.tokens = tokens_fixed
        self.statistics = statistics_fixed
        self.confidences = confidences_fixed
        self.errors = errors_fixed
    else:
        self.tokens = tokens_free
        self.statistics = statistics_free
        self.confidences = confidences_free
        self.errors = errors_free
def __init__(self, code, year):
    """Tokenize C source text and compute its confidence scores.

    Sets up the C token builders, tokenizes ``code`` into ``self.tokens``
    and runs the inherited ``calc_*`` scoring helpers.

    Args:
        code: The source text to examine.
        year: Language revision as a string.  ``'89'`` and ``'99'`` add
            ``void``; ``'99'`` additionally adds ``bool``/``complex``, the
            ``...``/``true``/``false`` values and ``//`` comments.
    """
    super().__init__()

    at_least_c89 = year in ['89', '99']
    is_c99 = year in ['99']

    # Groups that count as operands, in the order the builders are created.
    operand_types = [
        'number', 'identifier', 'string', 'class', 'type', 'value'
    ]

    ws_tb = WhitespaceTokenBuilder()
    nl_tb = NewlineTokenBuilder()

    # --- numeric literals ---
    int_tb = IntegerTokenBuilder("'")
    int_exp_tb = IntegerExponentTokenBuilder("'")
    hex_int_tb = PrefixedIntegerTokenBuilder(
        '0x', False, '0123456789abcdefABCDEF')
    bin_int_tb = PrefixedIntegerTokenBuilder('0b', False, '01')
    suffixed_int_tb = SuffixedIntegerTokenBuilder(
        ['U', 'L', 'LL', 'ULL', 'LLU'], False, None)
    float_tb = RealTokenBuilder(False, False, "'")
    float_exp_tb = RealExponentTokenBuilder(False, False, 'E', "'")
    suffixed_float_tb = SuffixedRealTokenBuilder(
        False, False, ['f', 'l'], False, None)

    # --- identifiers, strings, class types ---
    ident_tb = IdentifierTokenBuilder('_', '_')
    str_tb = EscapedStringTokenBuilder(['"', "'", "’"], 0)
    class_tb = ClassTypeTokenBuilder()

    # --- comments ---
    line_comment_tb = SlashSlashCommentTokenBuilder()
    block_comment_tb = SlashStarCommentTokenBuilder()

    # --- preprocessor ---
    directives = [
        '#define', '#undef',
        '#ifdef', '#ifndef', '#if', '#endif', '#else', '#elif',
        '#line', '#include', '#pragma'
    ]

    continuation_tb = SingleCharacterTokenBuilder(
        '\\', 'line continuation', False)
    preprocessor_tb = CaseSensitiveListTokenBuilder(
        directives, 'preprocessor', False)
    warning_tb = LeadToEndOfLineTokenBuilder(
        '#warning', True, 'preprocessor')
    error_tb = LeadToEndOfLineTokenBuilder('#error', True, 'preprocessor')
    terminator_tb = SingleCharacterTokenBuilder(
        ';', 'statement terminator', False)

    # --- operators and groupers ---
    known_operators = [
        '+', '-', '*', '/', '%',
        '=', '==', '!=', '>', '>=', '<', '<=',
        '+=', '-=', '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=',
        '!', '&', '|', '~', '<<', '>>',
        '^',
        '.',
        '++', '--', '->', '&&', '||',
        '?', '##'
    ]

    self.unary_operators = ['+', '-', '*', '!', '&', '~', '++', '--']
    self.postfix_operators = ['++', '--', '&', '*']

    groupers = ['(', ')', ',', '[', ']', '{', '}', ':']
    group_starts = ['(', '[', ',', '{']
    group_ends = [')', ']', '}']
    group_mids = [',', ':']

    grouper_tb = CaseInsensitiveListTokenBuilder(groupers, 'group', False)
    operator_tb = CaseSensitiveListTokenBuilder(
        known_operators, 'operator', False)

    # --- reserved words ---
    keywords = [
        'auto', 'break', 'case', 'const', 'continue', 'default', 'do',
        'else', 'enum', 'extern',
        'for', 'goto', 'if', 'inline', 'register', 'return',
        'signed', 'sizeof', 'static', 'struct', 'switch', 'typedef',
        'union', 'unsigned', 'volatile', 'while'
    ]
    kw_tb = CaseSensitiveListTokenBuilder(keywords, 'keyword', False)

    types = ['char', 'double', 'float', 'int', 'long', 'short']
    if at_least_c89:
        types.extend(['void'])
    if is_c99:
        types.extend(['bool', 'complex'])
    type_tb = CaseSensitiveListTokenBuilder(types, 'type', True)

    # C89 contributes no additional values; only C99 extends the list.
    values = ['NULL']
    if is_c99:
        values.extend(['...', 'true', 'false'])
    value_tb = CaseSensitiveListTokenBuilder(values, 'value', True)

    invalid_tb = InvalidTokenBuilder()

    tokenbuilders = [
        nl_tb,
        ws_tb,
        continuation_tb,
        terminator_tb,
        int_tb,
        int_exp_tb,
        hex_int_tb,
        bin_int_tb,
        suffixed_int_tb,
        float_tb,
        float_exp_tb,
        suffixed_float_tb,
        kw_tb,
        type_tb,
        value_tb,
        grouper_tb,
        operator_tb,
        ident_tb,
        class_tb,
        str_tb,
    ]

    # '//' comments are only recognised for C99.
    if is_c99:
        tokenbuilders.append(line_comment_tb)

    tokenbuilders.extend([
        block_comment_tb,
        preprocessor_tb,
        error_tb,
        warning_tb,
        self.unknown_operator_tb,
        invalid_tb
    ])

    # --- tokenize and post-process ---
    raw_tokens = Tokenizer(tokenbuilders).tokenize(code)
    for merged_group in ('invalid operator', 'invalid'):
        raw_tokens = Examiner.combine_adjacent_identical_tokens(
            raw_tokens, merged_group)
    self.tokens = Examiner.combine_identifier_colon(
        raw_tokens,
        ['statement terminator', 'newline'],
        ['{'],
        ['whitespace', 'comment'])
    self.convert_identifiers_to_labels()

    # --- scoring ---
    self.calc_statistics()

    joined = Examiner.join_all_lines(self.source_tokens())

    self.calc_token_confidence()
    self.calc_token_2_confidence(['*', ';'])

    operator_count = self.count_my_tokens(['operator', 'invalid operator'])
    if operator_count > 0:
        self.calc_operator_confidence(operator_count)
        allow_pairs = []
        self.calc_operator_2_confidence(joined, operator_count, allow_pairs)
        self.calc_operator_3_confidence(
            joined, operator_count, group_ends, allow_pairs)
        self.calc_operator_4_confidence(
            joined, operator_count, group_starts, allow_pairs)

    self.calc_group_confidence(joined, group_mids)

    self.calc_operand_n_confidence(joined, ['number'], 2)
    self.calc_operand_n_confidence(joined, operand_types, 4)

    self.calc_keyword_confidence()

    self.calc_paired_blockers_confidence(['{'], ['}'])
    self.calc_line_length_confidence(code, self.max_expected_line)