def tokenize_js(code, need_type_info=False):
    # with open(code, 'r') as f:
    #     code = f.read()
    lexer = Lexer()
    lexer.input(code)
    tokens = []
    types = []
    pos = []
    while True:
        token = lexer.token()
        if not token:
            break
        tokens.append(token.value)
        types.append(token.type)
        pos.append([token.lineno, token.lexpos])
    if need_type_info:
        return tokens, types, pos
    else:
        return tokens, pos


# if __name__ == "__main__":
#     print("No code/function passed in, function below is used to show you a case:")
#     print()
#     CodeExample = open('def.js', 'r', encoding='utf-8').read()
#     print()
#     tokens, types, pos = tokenize_js(CodeExample, True)
#     print(tokens)
#     print(types)
#     print(pos)
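# Minimal usage sketch, assuming `from slimit.lexer import Lexer` is available and
# tokenize_js above is in scope; the JavaScript sample string is illustrative.
sample_js = "var x = 1; alert(x);"
values, types, positions = tokenize_js(sample_js, need_type_info=True)
for value, tok_type, (lineno, lexpos) in zip(values, types, positions):
    print(lineno, lexpos, tok_type, value)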
def LexingofFunction(str):
    lexer = Lexer()
    lexer.input(str)
    lhs = ""
    rhs = ""
    flag = 0
    for token in lexer:
        # print(token)
        tokenTemp = token.__str__()
        if '=' not in tokenTemp and flag == 0:
            tokenTemp = trim(tokenTemp)
            lhs = lhs + tokenTemp
        elif '=' in tokenTemp or ';' in tokenTemp:
            flag = 1
            continue
        else:
            rhs = rhs + tokenTemp
    print("Printing lhs", lhs)
    print("Printing rhs", rhs)
    addinHeap(lhs, rhs)
    # CompareLHSandRHS(lhs, rhs)
    return token
def train_from_js_tokens(corpus):
    lexer = Lexer()
    tokens_corpus = []
    for t in corpus:
        lexer.input(t)
        tokens_corpus.append(' '.join([token.type for token in lexer]))
    return train_tfidf(tokens_corpus)
def custom_lex(text):
    # lexes string, stops when it starts making TypeErrors
    # input: f.read()
    out_list = []
    id_list = []
    tok_list = []
    str_list = []
    num_list = []
    lexer = Lexer()
    lexer.input(text)
    while True:
        try:
            token = lexer.token()
            if not token:
                break  # break if end of token
            tok_type = token.type
            if tok_type == 'ID':
                id_list.append(token.value)
            elif tok_type == 'STRING':
                str_list.append(token.value)
            elif tok_type == 'NUMBER':
                num_list.append(token.value)
            else:
                tok_list.append(token.value)
            out_list.append(token.value)
        except TypeError:
            break
        except AttributeError:
            break
    return out_list, id_list, tok_list, str_list, num_list
def lex(text, output_type='all'):
    # lexes string, stops when it starts making TypeErrors
    # input: f.read()
    out_list = []
    lexer = Lexer()
    lexer.input(text)
    while True:
        try:
            token = lexer.token()
            if not token:
                break  # break if end of token
            if output_type == 'value':
                try:
                    out_list.append(token.value)
                except AttributeError:
                    break
            elif output_type == 'type':
                try:
                    out_list.append(token.type)
                except AttributeError:
                    break
            else:
                out_list.append(token)
        except TypeError:
            break
        except AttributeError:
            break
    return out_list
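# Minimal usage sketch, assuming `from slimit.lexer import Lexer` is available and
# lex() above is in scope; the JavaScript sample string is illustrative.
sample = "function add(a, b) { return a + b; }"
print(lex(sample, output_type='type'))   # token type names only
print(lex(sample, output_type='value'))  # raw token values only
print(len(lex(sample)))                  # full LexToken objects by default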
def extract(self, node):
    lexer = JsLexer()
    lexer.input(node.text)
    regions = []
    if not self._parse_succeeds(node.text):
        return regions
    while True:
        try:
            tok = lexer.token()
            if not tok:
                break
            if tok.type == "REGEX":
                start_char = tok.lexpos + 1
                regex_parts = tok.value.split('/')
                string = regex_parts[1]
                flags = regex_parts[2]
                if not self._are_flags_valid(flags):
                    continue
                end_char = start_char + len(string) - 1
                r = RegexRegion(string, node, start_char, end_char, string)
                regions.append(r)
        except (TypeError, AttributeError):
            logging.warning("Failed to parse text: %s...", node.text[:100])
            break
    return regions
def numberEval(script):
    counter = 0
    lexer = Lexer()
    lexer.input(script)
    while True:
        token = lexer.token()
        if not token:
            break
        if token.value == 'eval':
            counter += 1
    return counter
def numberDOModificationFunctions(script):
    lexer = Lexer()
    lexer.input(script)
    counter = 0
    while True:
        token = lexer.token()
        if not token:
            break
        if token.type == 'ID' and token.value in DOModificationFunctions:
            counter += 1
    return counter
def numberOfUnescapeAndEscape(script):
    lexer = Lexer()
    lexer.input(script)
    counter = 0
    while True:
        token = lexer.token()
        if not token:
            break
        if token.type == 'ID' and token.value in functions:
            counter += 1
    return counter
def numberOfStringsContainingSubstring(script, substring='iframe'):
    lexer = Lexer()
    lexer.input(script)
    counter = 0
    while True:
        token = lexer.token()
        if not token:
            break
        if token.type == 'STRING' and substring in token.value:
            counter += 1
    return counter
def strExtractParse(s):
    '''Extracts all the substrings from the script using a lexer.'''
    l = []
    lexer = Lexer()
    lexer.input(s)
    while True:
        token = lexer.token()
        if not token:
            break
        if token.type == 'STRING':
            l += [token.value]
    return l
def feature101(str):
    # f = open(path)
    # script = f.read()
    lexer = Lexer()
    lexer.input(str)
    counter = 0
    while True:
        token = lexer.token()
        if not token:
            break
        if token.type == 'STRING':
            # accumulate rather than overwrite, so every string literal is counted
            counter += token.value.count('\\x')
    return counter
def averageLengthOfStrings(script):
    lexer = Lexer()
    lexer.input(script)
    strings = set()
    totalStringsLength = 0
    while True:
        token = lexer.token()
        if not token:
            break
        if token.type == 'STRING':
            strings.add(token.value)
    for string in strings:
        totalStringsLength += len(string) - 2  # drop the surrounding quotes
    if len(strings) > 0:
        return round(totalStringsLength / len(strings), 2)
    else:
        return 0
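# Minimal usage sketch, assuming `from slimit.lexer import Lexer` is available and
# averageLengthOfStrings above is in scope; the JavaScript sample is illustrative,
# and the expected value assumes STRING token values keep their quotes and that
# duplicate literals are removed by the set.
print(averageLengthOfStrings('a = "hello"; b = "hi"; c = "hello";'))  # expected 3.5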
__author__ = 'khushboomandlecha'
# Trying out the lexer function
from slimit.lexer import Lexer

lexer = Lexer()
lexer.input("o = {};o.p1 = 3;y = prompt('Which property do you want?');if (o[y]) {l = 0;}")
file = open('outputfile.txt', 'w')
for token in lexer:
    file.write(str(token))
    file.write('\n')
    print(token)
file.close()
from __future__ import division
from slimit.lexer import Lexer
from slimit.parser import Parser
from slimit.visitors.nodevisitor import ASTVisitor

text = open('test.js').read()
linecount = len(text.splitlines())
tokenList = []
tokens = {}
lexer = Lexer()
lexer.input(text)
for token in lexer:
    tokens[token.lexpos] = dict(
        t=token.type[:],
        v=token.value[:],
        prior=[t.value for t in tokenList[-5:]] + [''] * (5 - len(tokenList[-5:]))
    )
    tokenList.append(token)

parser = Parser(yacc_tracking=True)
tree = parser.parse(text)


class TrainingSample(object):
    def __init__(self, structure, lineno, prior, match, completion):
        self._structure = structure
        self._slno = lineno