def LispLexer(text, compiler):
    pos = tokens.Position("", 1, 1, 1)
    yield tokens.Token(Tag.LP, pos, pos)
    yield tokens.Token(Tag.atom, pos, pos)
    yield tokens.Token(Tag.atom, pos, pos)
    yield tokens.Token(Tag.RP, pos, pos)
    yield tokens.Token(Tag.END_OF_PROGRAM, pos, pos)

def main(output=False):
    phonecmps = []
    timecmps = []
    doclist = CreateDoclist()
    for pair in PAIRS_:
        token1 = tokens.Token(pair[0])
        TestPronunciations(token1)
        token2 = tokens.Token(pair[1])
        TestPronunciations(token2)
        comparator = token_comp.OldPhoneticDistanceComparator(token1, token2)
        comparator.ComputeDistance()
        phonecmps.append(comparator)
    if output:
        p = open(GOLDEN_FILE_, 'w')  ## clear golden file
        p.close()
        for pc in phonecmps:
            pc.ComparisonResult().Print(GOLDEN_FILE_, 'a')
        TestCorrelations(doclist, GOLDEN_FILE_)
        TestSnowActivations(doclist, GOLDEN_FILE_)
    else:
        p = open(TEST_FILE_, 'w')  ## clear test file
        p.close()
        for pc in phonecmps:
            pc.ComparisonResult().Print(TEST_FILE_, 'a')
        TestCorrelations(doclist, TEST_FILE_)
        TestSnowActivations(doclist, TEST_FILE_)
        unittest.TestUnitOutputs(sys.argv[0] + ' (main test & perceptron test)',
                                 GOLDEN_FILE_, TEST_FILE_)
        TestAuxiliaryComparators(sys.argv[0])

def test_operators(self):
    toks = list(lexer.Lexer("+-*/").make_token())
    self.assertEqual(toks, [
        tokens.Token(tokens.TokenType.PLUS),
        tokens.Token(tokens.TokenType.MINUS),
        tokens.Token(tokens.TokenType.MULTIPLY),
        tokens.Token(tokens.TokenType.DIVIDE)
    ])

def test_numbers(self):
    toks = list(lexer.Lexer("1245.4 .234 1234. .").make_token())
    self.assertEqual(toks, [
        tokens.Token(tokens.TokenType.NUMBER, 1245.4),
        tokens.Token(tokens.TokenType.NUMBER, .234),
        tokens.Token(tokens.TokenType.NUMBER, 1234.),
        tokens.Token(tokens.TokenType.NUMBER, 000.000),
    ])

def driver(f, line):
    this_state = 0
    next_state = 0
    tk = tokens.Token()
    literal = ""
    while this_state < 1000 and this_state > -1:
        fpos = f.tell()
        datum = f.read(1)
        if datum == '#':
            # skip comments through end of line
            while True:
                datum = f.read(1)
                if datum == '\n':
                    break
        fsa_state = get_column(datum)
        next_state = fsa_table[this_state][fsa_state]
        if next_state >= 1000 or next_state < 0:
            if next_state >= 1000:
                # final state: emit the token and rewind the lookahead char
                tk = get_tokens(next_state, literal, line)
                tk.location = line
                f.seek(fpos, os.SEEK_SET)
                return tk, line
            if next_state == -1:
                tk.identity = tokens.token_ids.token_names[35]
                tk.instance = 'EOF'
                tk.location = line
                return tk, line
            if next_state == -2:
                print "SCANNER ERROR: Illegal character '%s' on line %d" % (
                    datum, line)
                tk.identity = tokens.token_ids.token_names[36]
                tk.instance = 'bad token'
                tk.location = line
                return tk, line
        else:
            unit = datum
            if unit in specials and symbols.has_key(unit) == False:
                print "SCANNER ERROR: Illegal keyword character '%s' on line %d" % (
                    datum, line)
                tk.identity = tokens.token_ids.token_names[36]
                tk.instance = unit
                tk.location = line
                return tk, line
            if unit.isspace() == False:
                literal += unit
            if len(literal) > 7:
                print "SCANNER ERROR: Illegal keyword '%s' on line %d" % (
                    literal, line)
                return tokens.Token(tokens.token_ids.token_names[36],
                                    'illegal size', line)
        if datum == '\n':
            line = line + 1
        this_state = next_state
    return tokens.Token(tokens.token_ids.token_names[36], 'bad token', line)

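# The driver above is table-driven: get_column() maps a character to a
# column, fsa_table[state][column] gives the next state, entries >= 1000
# mark final states (emit a token and rewind one character), -1 means EOF,
# and -2 an illegal character. A toy illustration of that convention
# follows; the table, column map, and values here are made up for the
# sketch, not the scanner's real fsa_table.
toy_columns = {'digit': 0, 'space': 1}
toy_table = [
    # digit  space
    [1,      -2],    # state 0: start; space here is illegal in this toy
    [1,      1000],  # state 1: inside a number; space finalizes it
]

def toy_step(state, char_class):
    return toy_table[state][toy_columns[char_class]]

assert toy_step(0, 'digit') == 1      # start reading a number
assert toy_step(1, 'space') == 1000   # final state: token complete
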
def TestString(self):
    program = ast.Program()
    program.Statements = [
        ast.LetStatement(
            tkn.Token(tkn.LET, "let"),
            ast.Identifier(tkn.Token(tkn.IDENT, "myVar"), "myVar"),
            ast.Identifier(tkn.Token(tkn.IDENT, "anotherVar"), "anotherVar"))
    ]
    if program.String() != "let myVar = anotherVar":
        raise Exception(f'program.String() wrong got={program.String()}')

def test_string():
    program = ast.Program(statements=[
        ast.LetStatement(
            token=tokens.Token(typ=tokens.LET, literal="let"),
            name=ast.Identifier(
                token=tokens.Token(typ=tokens.IDENT, literal="my_var"),
                value="my_var"),
            value=ast.Identifier(
                token=tokens.Token(typ=tokens.IDENT, literal="another_var"),
                value="another_var"))
    ])
    expected = "let my_var = another_var;"
    assert str(program) == expected, \
        f"str(program) wrong. got '{str(program)}' but expected '{expected}'"

def assem_conv(op, words, line):
    logging.debug("Conv %s words:%s" % (op, hl_parser.format_word_list(words)))
    token = tokens.Token("conv", err, line)
    if (op == "cmptime"):
        if (len(words) != 1):
            err.report_error("Cmptime needs one argument")
            return
        else:
            token.add_bits(0, 6, 3, 2)
            token.add_bits(0, 3, 1, 0)
            token.add_bits(0, 0, 7, 7)
            if (words[0].type() == "arg"):
                token.add_byte(1, words[0].num())
            elif (words[0].type() == "var"):
                token.add_byte(1, 0)
                token.add_vname(1, 0, words[0].val())
            else:
                err.report_error("Cmptime takes a variable as its argument")
                return
    else:
        if (len(words) != 0):
            err.report_error("Conversions don't take arguments")
            return
        else:
            token.add_bits(0, 6, 3, 2)
            token.add_bits(0, 0, 7, 3)
            if (op == "convm"):
                token.add_bits(0, 3, 1, 1)
    token.finish(token_stream)

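# Every assem_* routine here packs opcode bits through the same
# token.add_bits(byte_index, shift, mask, value) calls. tokens.Token is
# not shown in this file, so here is a minimal sketch of what that
# packing convention appears to be, inferred from the call sites: OR
# (value & mask) << shift into the byte at byte_index. This is an
# assumption for illustration, not the project's implementation.
class PackedToken:
    def __init__(self):
        self.data = bytearray(1)

    def add_bits(self, byte_index, shift, mask, value):
        while len(self.data) <= byte_index:
            self.data.append(0)
        self.data[byte_index] |= (value & mask) << shift

# Under this reading, the three add_bits calls for cmptime above
# assemble opcode byte 0b10_000_111 = 0x87:
tok = PackedToken()
tok.add_bits(0, 6, 3, 2)  # bits 7-6 = 10
tok.add_bits(0, 3, 1, 0)  # bit 3 = 0
tok.add_bits(0, 0, 7, 7)  # bits 2-0 = 111
assert tok.data[0] == 0x87
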
def parse_tokens(text, size, source_id):
    """Parse sentence and return tokens"""
    text = re.sub(r'\s+', ' ', text).strip()  # collapse runs of whitespace
    string_start = STRING_START_TEXT * (size - 1)
    text = string_start + text  # prepend the special sentence-start tokens
    words = split_into_words(text)
    if not words:  # empty sentence
        return []
    length = len(words)
    lists = []
    for i in range(0, length):
        wordlist = []
        for j in range(0, size):
            if i + j < length:  # we can append more words
                wordlist.append(words[i + j])
        lists.append(wordlist)
    result = []
    length = len(lists)
    for i in range(0, length):
        is_begin = 1 if i <= size - 1 else 0
        is_end = 1 if i + size >= length - 1 else 0
        start = ' '.join(lists[i]) if i < length else ''
        end = lists[i + size][0] if i + size < length else ''
        token = tokens.Token(start, end, source_id, is_begin, is_end)
        result.append(token)
    return result

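# To make the windowing in parse_tokens concrete: assuming
# split_into_words is a plain whitespace split and STRING_START_TEXT is
# one start-marker word plus a space (both assumptions; neither
# definition appears here), this stand-alone sketch reproduces the
# (start, end) pairs the loop builds.
def demo_windows(text, size, start_marker='<s>'):
    words = [start_marker] * (size - 1) + text.split()
    lists = [words[i:i + size] for i in range(len(words))]
    return [(' '.join(lists[i]),
             lists[i + size][0] if i + size < len(lists) else '')
            for i in range(len(lists))]

# demo_windows("a b c", 2) ->
#   [('<s> a', 'b'), ('a b', 'c'), ('b c', ''), ('c', '')]
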
def assem_spec_insert(words, line):
    logging.debug("Spec_insert words:%s" % (hl_parser.format_word_list(words)))
    if (len(words) != 2):
        err.report_error("INSERT needs type and filename arguments")
        return
    type = words[0].astr()
    f_name = words[1].astr()
    if (type.lower() not in ['tokens', 'binary']):
        err.report_error("INSERT type must be one of: 'tokens', 'binary'")
        return
    if (type.lower() == 'tokens'):
        err.report_error("INSERT TOKENS should have been consumed higher up! Eek!")
        return
    # we have an insert binary special to deal with
    if (not os.path.isfile(f_name) or not os.access(f_name, os.R_OK)):
        err.report_error(
            "INSERT BINARY file:%s doesn't exist or isn't readable" % (f_name))
        return
    token = tokens.Token("binary", err, line)
    token.add_binary_file(f_name)
    token.finish(token_stream)

def LineSegment(self, line):
    try:
        utext = unicode(line.strip(), 'utf-8')
    except TypeError:
        utext = line.strip()
    for i in range(len(utext)):
        for k in [4, 3, 2]:
            sub = utext[i:i + k]
            if len(sub) != k:
                continue
            if k > 2 and sub[:2].encode('utf-8') in FAMILY_NAMES_:
                if not (Utils.script.HasDigit(sub) or
                        Utils.script.HasPunctuation(sub)):
                    self.tokens_.append(tokens.Token(sub))
            elif k < 4 and sub[:1].encode('utf-8') in FAMILY_NAMES_:
                if not Utils.script.HasDigit(sub):
                    self.tokens_.append(tokens.Token(sub))

def assem_uni_math(op, size, words, line):
    logging.debug("Mathu %s size:%d, words:%s" %
                  (op, size, hl_parser.format_word_list(words)))
    if (len(words) > 1):
        err.report_error("Unary Math has at most one argument")
        return
    token = tokens.Token("uni-math", err, line)
    token.add_bits(0, 6, 3, 2)
    token.add_bits(0, 3, 1, size)
    token.add_bits(0, 0, 3, uni_assem_map[op])
    if (not words or
            (words[0].type() == "modreg" and
             words[0].val() == hl_parser.modreg_names["acc"])):
        token.add_bits(0, 2, 1, 0)
        token.finish(token_stream)
    elif (words[0].type() in ["var", "arg"]):
        token.add_bits(0, 2, 1, 1)
        if (words[0].type() == "arg"):
            token.add_byte(1, words[0].num())
        else:
            token.add_byte(1, 0)
            token.add_vname(1, size, words[0].val())
        token.finish(token_stream)
    else:
        err.report_error("Unary Math - invalid argument type")
        return

def assem_basic_math(op, size, words, line):
    logging.debug("Mathb %s size:%s, words:%s" %
                  (op, size, hl_parser.format_word_list(words)))
    if (len(words) != 1):
        err.report_error("Basic Math needs one argument")
        return
    token = tokens.Token("basic-math", err, line)
    token.add_bits(0, 6, 3, 2)
    token.add_bits(0, 4, 3, 1)
    token.add_bits(0, 3, 1, size)
    token.add_bits(0, 0, 3, basic_assem_map[op])
    if (words[0].type() in ["var", "arg"]):
        # math with a variable
        token.add_bits(0, 2, 1, 0)
        if (words[0].type() == "arg"):
            token.add_byte(1, words[0].num())
        else:
            token.add_byte(1, 0)
            token.add_vname(1, size, words[0].val())
        token.finish(token_stream)
    elif (words[0].type() == "const"):
        token.add_bits(0, 2, 1, 1)
        if (size == 0):
            token.add_byte(1, words[0].val())
        else:
            token.add_word(1, words[0].val())
        token.finish(token_stream)
    else:
        err.report_error("Basic Math - invalid argument type")
        return

def push_macro(self, optimizer):
    macro = optimizer.rewind()
    end = tokens.Token(tokens.Token.END)
    optimizer.push_node(end)
    optimizer.push_node(macro.statements)
    optimizer.open_scope()
    return True

def assem_other_math(op, size, words, line):
    logging.debug("Matho %s size:%d, words:%s" %
                  (op, size, hl_parser.format_word_list(words)))
    if (len(words) != 1):
        err.report_error("Logic Math needs one argument")
        return
    token = tokens.Token("log-math", err, line)
    token.add_bits(0, 6, 3, 2)
    token.add_bits(0, 3, 1, size)
    if (op in other1_assem_map.keys()):
        token.add_bits(0, 4, 3, 2)
        token.add_bits(0, 0, 3, other1_assem_map[op])
    else:
        token.add_bits(0, 4, 3, 3)
        token.add_bits(0, 0, 3, other2_assem_map[op])
    if (words[0].type() in ["var", "arg"]):
        # math with a variable
        token.add_bits(0, 2, 1, 0)
        if (words[0].type() == "arg"):
            token.add_byte(1, words[0].num())  # arg slot number, as in the other math ops
        else:
            token.add_byte(1, 0)
            token.add_vname(1, size, words[0].val())
        token.finish(token_stream)
    elif (words[0].type() == "const"):
        token.add_bits(0, 2, 1, 1)
        token.add_byte(1, words[0].val())
        token.finish(token_stream)
    else:
        err.report_error("Logic Math - invalid argument type")
        return

def assem_jump(op, cond, words, line):
    logging.debug("Jump %s cond:%s, words:%s" %
                  (op, cond, hl_parser.format_word_list(words)))
    # op is one of: branch, sub, ret, dbnz, dsnz
    # cond is one of: a, e, ne, g, l, le, lg or empty for ret, d?nz
    token = tokens.Token("jump", err, line)
    token.add_bits(0, 6, 3, 3)
    if (op == "ret"):
        if (len(words) != 0):
            err.report_error("Ret doesn't take an argument")
            return
        else:
            token.add_bits(0, 0, 0x3f, 0x28)
    elif (len(words) != 1):
        err.report_error("Jumps need a target to jump to")
        return
    else:
        if (op[:2] in ['su', 'ds']):
            # a call to a subroutine - push a frame
            token.add_bits(0, 3, 1, 1)
        else:
            token.add_bits(0, 3, 1, 0)
        if (not cond):
            # one of dbnz or dsnz
            token.add_bits(0, 0, 7, 7)
        else:
            token.add_bits(0, 0, 7, jcond_assem_map[cond])
        # now the target
        if (words[0].type() == "const"):
            offset = words[0].val()
            if (offset >= tokens.MIN_SBYTE and offset <= tokens.MAX_SBYTE):
                if (offset < 0):
                    # convert to a signed offset
                    offset += 256
                token.add_byte(1, offset)
            elif (offset >= tokens.MIN_WORD and offset <= tokens.MAX_WORD):
                token.add_bits(0, 4, 1, 1)
                token.add_word(1, offset)
        elif (words[0].type() == "label"):
            if (words[0].val().startswith(':')):
                # globals are always long jumps
                token.add_bits(0, 4, 1, 1)
                token.set_jump_label(1, words[0].val(), True)
                token.add_word(1, 0)  # A placeholder
            else:
                token.set_jump_label(1, words[0].val())
                token.add_byte(1, 0)  # A placeholder
        else:
            err.report_error(
                "Jumps need either a constant or a label as argument, not a: " +
                words[0].type())
            return
    token.finish(token_stream)

def make_number(self):
    num_str = ''
    dot_count = 0
    pos_start = self.pos.copy()
    while self.current_char is not None and self.current_char in constants.DIGITS + '.':
        if self.current_char == '.':
            if dot_count == 1:
                break
            dot_count += 1
        num_str += self.current_char
        self.advance()
    if dot_count == 0:
        return t.Token(t.TT_INT, int(num_str), pos_start, self.pos)
    else:
        return t.Token(t.TT_FLOAT, float(num_str), pos_start, self.pos)

def tester(fn):
    t = tokens.Token()
    line = 1
    with open(fn) as f:
        while True:
            t, line = scanner.driver(f, line)
            print "%s '%s' on line %d" % (t.identity, t.instance, t.location)
            if t.identity == tokens.token_ids.token_names[36]:
                break
            if t.identity == tokens.token_ids.token_names[35]:
                break

def _tokenize_line(cls, _line: str) -> typing.List[typing.Any]:
    _result = []
    for i in re.findall(cls.grammer, _line):
        print(i)
        # take the first token rule whose pattern matches this lexeme
        _r = [[*c, i] for c in cls.token_list if re.findall(c[-1], i)][0]
        print(_r)
        _result.append(tokens.Token(*_r))
    return _result

def make_identifier(self):
    id_str = ''
    pos_start = self.pos.copy()
    while self.current_char is not None and self.current_char in constants.LETTERS_DIGITS + '_':
        id_str += self.current_char
        self.advance()
    tok_type = t.TT_KEYWORD if id_str in t.KEYWORDS else t.TT_IDENTIFIER
    return t.Token(tok_type, id_str, pos_start, self.pos)

def make_greater_than(self):
    tok_type = t.TT_GT
    pos_start = self.pos.copy()
    self.advance()
    if self.current_char == '=':
        self.advance()
        tok_type = t.TT_GTE
    return t.Token(tok_type, pos_start=pos_start, pos_end=self.pos)

def make_minus_or_arrow(self):
    tok_type = t.TT_MINUS
    pos_start = self.pos.copy()
    self.advance()
    if self.current_char == '>':
        self.advance()
        tok_type = t.TT_ARROW
    return t.Token(tok_type, pos_start=pos_start, pos_end=self.pos)

def make_not_equals(self):
    pos_start = self.pos.copy()
    self.advance()
    if self.current_char == '=':
        self.advance()
        return t.Token(t.TT_NE, pos_start=pos_start, pos_end=self.pos), None
    self.advance()
    return None, e.ExpectedCharError(pos_start, self.pos, "'=' (after '!')")

def make_equals(self):
    tok_type = t.TT_EQ
    pos_start = self.pos.copy()
    self.advance()
    if self.current_char == '=':
        self.advance()
        tok_type = t.TT_EE
    return t.Token(tok_type, pos_start=pos_start, pos_end=self.pos)

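# The make_* methods above assume a host Lexer that tracks pos and
# current_char and exposes advance(); none of that plumbing appears in
# this section. Below is a minimal harness sketch of how make_number
# would be driven -- the Lexer class and token tags are stand-ins
# (assumptions), not the project's t/constants modules.
DIGITS = '0123456789'

class MiniLexer:
    def __init__(self, text):
        self.text = text
        self.idx = -1
        self.current_char = None
        self.advance()

    def advance(self):
        # Step to the next character; None signals end of input.
        self.idx += 1
        self.current_char = self.text[self.idx] if self.idx < len(self.text) else None

    def make_number(self):
        # Same dot-counting logic as make_number above, minus positions.
        num_str, dot_count = '', 0
        while self.current_char is not None and self.current_char in DIGITS + '.':
            if self.current_char == '.':
                if dot_count == 1:
                    break
                dot_count += 1
            num_str += self.current_char
            self.advance()
        return ('INT', int(num_str)) if dot_count == 0 else ('FLOAT', float(num_str))

assert MiniLexer('3.14').make_number() == ('FLOAT', 3.14)
assert MiniLexer('42').make_number() == ('INT', 42)
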
def assem_stack(op, size, words, line):
    logging.debug("Stack %s size:%d, words:%s" %
                  (op, size, hl_parser.format_word_list(words)))
    # op in ['push', 'pop'], size in [0, 1], one word
    if (len(words) != 1):
        err.report_error("Stack ops need 1 argument")
        return
    token = tokens.Token("stack", err, line)
    token.add_bits(0, 6, 3, 3)
    token.add_bits(0, 5, 1, 1)
    token.add_bits(0, 4, 1, size)
    if (op == "push"):
        if (words[0].type() == 'const'):
            if (size == 0):
                token.add_byte(1, words[0].val())
            else:
                token.add_word(1, words[0].val())
        elif (words[0].type() == 'modreg'):
            if (words[0].val() == hl_parser.modreg_names["acc"]):
                token.add_bits(0, 0, 0xf, 0x3)
            else:
                token.add_bits(0, 0, 0xf, 2)
                token.add_byte(1, words[0].val())
        elif (words[0].type() in ["var", "arg"]):
            token.add_bits(0, 0, 0xf, 1)
            if (words[0].type() == "arg"):
                token.add_byte(1, words[0].num())
            else:
                token.add_byte(1, 0)
                token.add_vname(1, size, words[0].val())
        else:
            err.report_error("Push - invalid operand: %s" % (words[0].type()))
            return
    else:
        # pop
        if (words[0].type() == 'modreg'):
            if (words[0].val() == hl_parser.modreg_names["acc"]):
                token.add_bits(0, 0, 0xf, 0x4)
            else:
                token.add_bits(0, 0, 0xf, 6)
                token.add_byte(1, words[0].val())
        elif (words[0].type() in ["var", "arg"]):
            token.add_bits(0, 0, 0xf, 5)
            if (words[0].type() == "arg"):
                token.add_byte(1, words[0].num())
            else:
                token.add_byte(1, 0)
                token.add_vname(1, size, words[0].val())
        else:
            err.report_error("Pop - invalid operand: %s" % (words[0].type()))
            return
    token.finish(token_stream)

def LineSegment(self, line):
    try:
        utext = unicode(line.strip(), 'utf-8')
    except TypeError:
        utext = line.strip()
    word = []
    for u in utext:
        if Utils.script.CharacterToScript(u) == 'Katakana':
            word.append(u.encode('utf-8'))
        else:
            if word and word != ['・']:
                self.tokens_.append(tokens.Token(''.join(word)))
            word = []

def get_tokens(state, literal, line):
    state_token = tokens.Token()
    if literal in keywords:
        state_token.identity = keywords.get(literal)
        state_token.instance = literal
        state_token.location = line
    elif final_states.has_key(state):
        state_token.identity = final_states.get(state)
        state_token.instance = literal
        state_token.location = line
    return state_token

def TestAuxiliaryComparators(unitname):
    ## Added tests for Wade-Giles and Pinyin comparators
    t1 = tokens.Token('毛泽东')
    t2 = tokens.Token('周恩来')
    t1py = tokens.Token('Mao Zedong')
    t2py = tokens.Token('Zhou Enlai')
    t1wg = tokens.Token('Mao Tse-tung')
    t2wg = tokens.Token('Chou Enlai')
    comparator = auxiliary_comp.PinyinComparator(t1, t1py)
    comparator.ComputeDistance()
    assert comparator.ComparisonResult().Cost() == auxiliary_comp.MATCH_, \
        '%s should match %s' % (t1.String(), t1py.String())
    comparator = auxiliary_comp.PinyinComparator(t2, t2py)
    comparator.ComputeDistance()
    assert comparator.ComparisonResult().Cost() == auxiliary_comp.MATCH_, \
        '%s should match %s' % (t2.String(), t2py.String())
    comparator = auxiliary_comp.WadeGilesComparator(t1, t1wg)
    comparator.ComputeDistance()
    assert comparator.ComparisonResult().Cost() == auxiliary_comp.MATCH_, \
        '%s should match %s' % (t1.String(), t1wg.String())
    comparator = auxiliary_comp.WadeGilesComparator(t2, t2wg)
    comparator.ComputeDistance()
    assert comparator.ComparisonResult().Cost() == auxiliary_comp.MATCH_, \
        '%s should match %s' % (t2.String(), t2wg.String())
    comparator = auxiliary_comp.WadeGilesComparator(t2, t2py)
    comparator.ComputeDistance()
    assert comparator.ComparisonResult().Cost() == auxiliary_comp.NO_MATCH_, \
        '%s should not match %s' % (t2.String(), t2py.String())
    print '%s (auxiliary tests) successful' % unitname

def assem_misc(op, words, line):
    logging.debug("Misc op:%s words:%s" % (op, hl_parser.format_word_list(words)))
    if (op == "stop"):
        if (len(words) != 0):
            err.report_error("Stop doesn't take arguments")
            return
        token = tokens.Token("misc", err, line)
        token.add_bits(0, 0, 0xff, 0xff)
        token.finish(token_stream)
    elif (op in ["bitset", "bitclr"]):
        if (len(words) != 2):
            err.report_error("Bitset/bitclr needs 2 arguments: bit and mod/reg")
            return
        bit = words[0].anum()
        if (bit < 0 or bit > 7):
            err.report_error(
                "Bitset/bitclr bit must be between 0 and 7 (not %d)" % (bit))
            return
        modreg = words[1].amodreg()
        token = tokens.Token("misc", err, line)
        token.add_bits(0, 4, 0x0f, 0x00)
        if (op == "bitset"):
            token.add_bits(0, 3, 0x1, 0x1)
        else:
            token.add_bits(0, 3, 0x1, 0x0)
        token.add_bits(0, 0, 0x7, bit)
        token.add_byte(1, modreg)
        token.finish(token_stream)
    else:
        err.report_error("Unknown misc operator: " + op)
        return

def parseSimpleState(text):
    # Parses a simple statement into a token list.
    # Assumes that the inputted text is a valid statement,
    # but we check anyway.
    if text is None:
        return None
    tokenList = []
    pos = 0
    # This loop handles most of the things
    while pos < len(text):
        cc = text[pos]  # cc = current char
        if cc.isalnum():
            # Extend the previous name token when the preceding char was
            # alphanumeric too; otherwise start a new name token.
            if pos > 0 and text[pos - 1].isalnum():
                tokenList[-1].value += cc
            else:
                tokenList.append(tokens.Token(tokens.OD_NAME, cc))
        elif cc in '+-*/=':
            tokenList.append(tokens.Token(tokens.OP, cc))
        pos += 1
    return tokenList

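# Quick exercise of parseSimpleState with stand-in types -- the Token
# class and tag values below are assumptions, not the project's tokens
# module.
import types

class Token:
    def __init__(self, type_, value):
        self.type, self.value = type_, value

tokens = types.SimpleNamespace(OD_NAME='name', OP='op', Token=Token)

# With these stand-ins, parseSimpleState("x1 = a+b2") yields the pairs
#   ('name', 'x1'), ('op', '='), ('name', 'a'), ('op', '+'), ('name', 'b2')
# -- whitespace falls through the if/elif chain and is dropped.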