def tokenize_js(code, need_type_info=False):
    # with open(code, 'r') as f:
    #     code = f.read()
    lexer = Lexer()
    lexer.input(code)
    tokens = []
    types = []
    pos = []
    while True:
        token = lexer.token()
        if not token:
            break
        tokens.append(token.value)
        types.append(token.type)
        pos.append([token.lineno, token.lexpos])
    if need_type_info:
        return tokens, types, pos
    else:
        return tokens, pos

# if __name__ == "__main__":
#     print("No code/function passed in, function below is used to show you a case:")
#     print()
#     CodeExample = open('def.js', 'r', encoding='utf-8').read()
#     print()
#     tokens, types, pos = tokenize_js(CodeExample, True)
#     print(tokens)
#     print(types)
#     print(pos)
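# A minimal usage sketch for tokenize_js above, assuming slimit is installed
# and Lexer is slimit.lexer.Lexer; the sample JS string is illustrative only.
from slimit.lexer import Lexer

tokens, pos = tokenize_js('var a = 1;')
tokens, types, pos = tokenize_js('var a = 1;', need_type_info=True)
print(tokens)  # e.g. ['var', 'a', '=', '1', ';']
print(types)   # e.g. ['VAR', 'ID', 'EQ', 'NUMBER', 'SEMI']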
def train_from_js_tokens(corpus):
    lexer = Lexer()
    tokens_corpus = []
    for t in corpus:
        lexer.input(t)
        tokens_corpus.append(' '.join([token.type for token in lexer]))
    return train_tfidf(tokens_corpus)
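# train_tfidf is not defined in the snippet above; the following is a minimal
# sketch of what such a helper might look like, assuming scikit-learn is
# available. The name and signature are assumptions, not the original code.
from sklearn.feature_extraction.text import TfidfVectorizer

def train_tfidf(token_docs):
    # Fit TF-IDF over whitespace-separated token-type strings such as
    # "VAR ID EQ NUMBER SEMI" and return the fitted vectorizer and matrix.
    vectorizer = TfidfVectorizer(token_pattern=r'\S+')
    matrix = vectorizer.fit_transform(token_docs)
    return vectorizer, matrix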
def __init__(self, lex_optimize=True, lextab=lextab,
             yacc_optimize=True, yacctab=yacctab, yacc_debug=False):
    self.lex_optimize = lex_optimize
    self.lextab = lextab
    self.yacc_optimize = yacc_optimize
    self.yacctab = yacctab
    self.yacc_debug = yacc_debug
    self.lexer = Lexer()
    self.lexer.build(optimize=lex_optimize, lextab=lextab)
    self.tokens = self.lexer.tokens
    self.parser = ply.yacc.yacc(
        module=self, optimize=yacc_optimize, debug=yacc_debug,
        tabmodule=yacctab, start='program')

    # https://github.com/rspivak/slimit/issues/29
    # lexer.auto_semi can cause a loop in a parser
    # when a parser error happens on a token right after
    # a newline.
    # We keep record of the tokens that caused p_error
    # and if the token has already been seen - we raise
    # a SyntaxError exception to avoid looping over and
    # over again.
    self._error_tokens = {}
def custom_lex(text):
    # lexes string, stops when it starts making TypeErrors
    # input: f.read()
    out_list = []
    id_list = []
    tok_list = []
    str_list = []
    num_list = []
    lexer = Lexer()
    lexer.input(text)
    while True:
        try:
            token = lexer.token()
            if not token:
                break  # break if end of token
            tok_type = token.type
            if tok_type == 'ID':
                id_list.append(token.value)
            elif tok_type == 'STRING':
                str_list.append(token.value)
            elif tok_type == 'NUMBER':
                num_list.append(token.value)
            else:
                tok_list.append(token.value)
            out_list.append(token.value)
        except TypeError:
            break
        except AttributeError:
            break
    return out_list, id_list, tok_list, str_list, num_list
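# Hedged usage sketch for custom_lex; the JS snippet below is illustrative only.
out, ids, toks, strs, nums = custom_lex('var msg = "hi"; var n = 2;')
print(ids)   # identifiers, e.g. ['msg', 'n']
print(strs)  # string literals, e.g. ['"hi"']
print(nums)  # numeric literals, e.g. ['2']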
def lex(text, output_type='all'):
    # lexes string, stops when it starts making TypeErrors
    # input: f.read()
    out_list = []
    lexer = Lexer()
    lexer.input(text)
    while True:
        try:
            token = lexer.token()
            if not token:
                break  # break if end of token
            if output_type == 'value':
                try:
                    out_list.append(token.value)
                except AttributeError:
                    break
            elif output_type == 'type':
                try:
                    out_list.append(token.type)
                except AttributeError:
                    break
            else:
                out_list.append(token)
        except TypeError:
            break
        except AttributeError:
            break
    return out_list
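# A small usage sketch for lex, assuming the Lexer used above is
# slimit.lexer.Lexer; the input string is illustrative only.
values = lex('a = 1;', output_type='value')  # token values, e.g. ['a', '=', '1', ';']
types = lex('a = 1;', output_type='type')    # token type names, e.g. ['ID', 'EQ', 'NUMBER', 'SEMI']
raw = lex('a = 1;')                          # raw LexToken objects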
def tokenize(self, code):
    # Expects `code` as bytes; non-ASCII bytes are dropped before lexing.
    # Returns the primed lexer on success, or None on any failure.
    try:
        script = code.decode('ascii', 'ignore')
        lexer = Lexer()
        lexer.input(script)
        return lexer
    except:
        return None
def variables_functions_in_script(url):
    # Requires `import unicodedata` and an extract_javascript_content(url)
    # helper that returns the page's JavaScript source as a string.
    js = extract_javascript_content(url)
    identifiers = []
    lexer = Lexer()
    lexer.input(js)
    for token in lexer:
        if token.type == 'ID':
            identifiers.append(
                unicodedata.normalize('NFKD', token.value).encode('ascii', 'ignore'))
    return identifiers
def tokenize(code, need_type_info=False):
    lexer = Lexer()
    lexer.input(code)
    tokens = []
    types = []
    for token in lexer:
        # print(token)
        tokens.append(token.value)
        types.append(token.type)
    if need_type_info:
        return tokens, types
    else:
        return tokens
def tokensUsedSlimit(inputFile):
    '''
    Given an input JavaScript file, create a list containing the SlimIt
    lexical tokens present in the file.

    -------
    Parameter:
    - inputFile: File
      Should it be malformed or no JS file, then an exception will be raised
      (see JsDetection.isJsFile).

    -------
    Returns:
    - List
      Contains the SlimIt lexical tokens present in the input file.
    - or None if the file either is no JS or malformed.
    '''

    if JsDetection.isJsFile(inputFile) == 0:  # Only if the current file is a well-formed JS sample
        with open(inputFile, 'r') as inF:
            s = ''
            try:
                for line in inF:
                    s += str(line)  # Store the content of the JS file in a string, far quicker than using the SlimIt minifier.
            except UnicodeDecodeError:
                print('Exception handling')

        lexer = Lexer()
        lexer.input(s)

        l = []
        try:
            for token in lexer:
                # Structure of a token: "LexToken(VAR,'var',1,0)"
                tokenPart = str(token).split('(')
                tokenComplete = tokenPart[1].split(',')  # Keyword as used in JS
                l += [tokenComplete[0]]
            return l  # Lexical tokens
        except TypeError:
            print('Exception handling')
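# Hedged usage sketch: JsDetection is an external module assumed by the
# function above, and 'sample.js' is a hypothetical path.
# token_types = tokensUsedSlimit('sample.js')
# if token_types is not None:
#     print(token_types[:10])  # e.g. ['VAR', 'ID', 'EQ', 'NUMBER', ...]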
def __init__(self, lex_optimize=True, lextab=lextab,
             yacc_optimize=True, yacctab=yacctab, yacc_debug=False):
    self.lex_optimize = lex_optimize
    self.lextab = lextab
    self.yacc_optimize = yacc_optimize
    self.yacctab = yacctab
    self.yacc_debug = yacc_debug
    self.lexer = Lexer()
    self.lexer.build(optimize=lex_optimize, lextab=lextab)
    self.tokens = self.lexer.tokens
    self.parser = ply.yacc.yacc(module=self, optimize=yacc_optimize,
                                debug=yacc_debug, tabmodule=yacctab,
                                start='program')
"key2": "value2" }; """ class MyVisitor(ASTVisitor): def visit(self, node): for prop in node: left, right = prop.left, prop.right print('Property key=%s, value=%s' % (left.value, right.value)) self.visit(prop) try: parser = Parser() tree = parser.parser(text) visitor = MyVisitor() visitor.visit(tree) except Exception as e: logging.exception(e) # 4 Using lexer in your project print('4 Using lexer in your project') lexer = Lexer() lexer.input('a=1;x=1+3;') for token in lexer: print(token)
def _get_lexer(self):
    lexer = Lexer()
    return lexer
def get_type_idx(self):
    from slimit.lexer import Lexer
    lexer = Lexer()
    self.type2idx = dict()
    for i, tok in enumerate(lexer.tokens):
        self.type2idx[tok] = i
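# A standalone variant of the same idea (minimal sketch, assuming slimit is
# installed): build a token-type -> index mapping without a containing class.
from slimit.lexer import Lexer

def build_type_index():
    lexer = Lexer()
    # Lexer.tokens lists every token-type name known to the slimit lexer.
    return {tok: i for i, tok in enumerate(lexer.tokens)}

type2idx = build_type_index()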