Example No. 1
from slimit.lexer import Lexer


def tokenize_js(code, need_type_info=False):
    # with open(code, 'r') as f:
    #     code = f.read()
    lexer = Lexer()
    lexer.input(code)

    tokens = []
    types = []
    pos = []
    while True:
        token = lexer.token()
        if not token:
            break
        tokens.append(token.value)
        types.append(token.type)
        pos.append([token.lineno, token.lexpos])

    if need_type_info:
        return tokens, types, pos
    else:
        return tokens, pos


# if __name__=="__main__":
#     print("No code/function passed in, function below is used to show you a case:")
#     print()
#     CodeExample = open('def.js','r',encoding='utf-8').read()
#     print()
#     tokens, types, pos = tokenize_js(CodeExample, True)
#     print(tokens)
#     print(types)
#     print(pos)
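
A minimal usage sketch for tokenize_js (the sample source string and the printed values are illustrative, not part of the original snippet):

source = 'var x = 1 + 2;'
tokens, types, pos = tokenize_js(source, need_type_info=True)
print(tokens)  # token values, e.g. ['var', 'x', '=', '1', '+', '2', ';']
print(types)   # SlimIt token type names, e.g. ['VAR', 'ID', 'EQ', 'NUMBER', 'PLUS', 'NUMBER', 'SEMI']
print(pos)     # [lineno, lexpos] pairs, one per token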
Example No. 2
def train_from_js_tokens(corpus):
    lexer = Lexer()
    tokens_corpus = []
    for t in corpus:
        lexer.input(t)
        tokens_corpus.append(' '.join([token.type for token in lexer]))
    return train_tfidf(tokens_corpus)
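
train_tfidf is not shown in this snippet; a hedged sketch of such a helper, assuming scikit-learn's TfidfVectorizer is the intended backend, could look like:

from sklearn.feature_extraction.text import TfidfVectorizer

def train_tfidf(token_docs):
    # Fit a TF-IDF model over the space-separated token-type "documents"
    # produced by train_from_js_tokens above.
    vectorizer = TfidfVectorizer(token_pattern=r'\S+')
    matrix = vectorizer.fit_transform(token_docs)
    return vectorizer, matrix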
Example No. 3
    def __init__(self, lex_optimize=True, lextab=lextab,
                 yacc_optimize=True, yacctab=yacctab, yacc_debug=False):
        self.lex_optimize = lex_optimize
        self.lextab = lextab
        self.yacc_optimize = yacc_optimize
        self.yacctab = yacctab
        self.yacc_debug = yacc_debug

        self.lexer = Lexer()
        self.lexer.build(optimize=lex_optimize, lextab=lextab)
        self.tokens = self.lexer.tokens

        self.parser = ply.yacc.yacc(
            module=self, optimize=yacc_optimize,
            debug=yacc_debug, tabmodule=yacctab, start='program')

        # https://github.com/rspivak/slimit/issues/29
        # lexer.auto_semi can cause a loop in the parser when a parse
        # error happens on a token right after a newline.
        # We keep a record of the tokens that caused p_error, and if a
        # token has already been seen, we raise a SyntaxError exception
        # to avoid looping over and over again.
        self._error_tokens = {}
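
The error-token bookkeeping described in the comment above is not shown in this excerpt; a hedged sketch of the pattern it describes (the key layout and the exception message are assumptions, not slimit's actual p_error code) could look like:

    def _raise_if_seen_before(self, token):
        # Identify the offending token by type, value and position. If p_error
        # is called twice with the same token, recovery is not making progress,
        # so raise instead of looping forever.
        key = (token.type, token.value, token.lineno, token.lexpos)
        if key in self._error_tokens:
            raise SyntaxError('Unrecoverable parse error near %r' % (token.value,))
        self._error_tokens[key] = True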
Example No. 4
def custom_lex(text):
    # Lex a source string; stop as soon as the lexer starts raising TypeErrors.
    # Input: the file contents as a string (e.g. f.read()).
    out_list = []
    id_list = []
    tok_list = []
    str_list = []
    num_list = []
    lexer = Lexer()
    lexer.input(text)
    while True:
        try:
            token = lexer.token()
            if not token:
                break  # end of the token stream
            tok_type = token.type
            if tok_type == 'ID':
                id_list.append(token.value)
            elif tok_type == 'STRING':
                str_list.append(token.value)
            elif tok_type == 'NUMBER':
                num_list.append(token.value)
            else:
                tok_list.append(token.value)
            out_list.append(token.value)
        except TypeError:
            break
        except AttributeError:
            break
    return out_list, id_list, tok_list, str_list, num_list
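
A short usage sketch for custom_lex (sample source and values are illustrative):

source = "var greeting = 'hi'; var n = 42;"
all_tokens, ids, others, strings, numbers = custom_lex(source)
# ids      -> identifiers, e.g. ['greeting', 'n']
# strings  -> string literals, e.g. ["'hi'"]
# numbers  -> numeric literals, e.g. ['42']
# others   -> everything else, e.g. 'var', '=', ';'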
Example No. 5
def lex(text, output_type='all'):
    # Lex a source string; stop as soon as the lexer starts raising TypeErrors.
    # Input: the file contents as a string (e.g. f.read()).
    out_list = []
    lexer = Lexer()
    lexer.input(text)
    while True:
        try:
            token = lexer.token()
            if not token:
                break  # end of the token stream
            if output_type == 'value':
                try:
                    out_list.append(token.value)
                except AttributeError:
                    break
            elif output_type == 'type':
                try:
                    out_list.append(token.type)
                except AttributeError:
                    break
            else:
                out_list.append(token)
        except TypeError:
            break
        except AttributeError:
            break
    return out_list
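
Usage sketch showing the three output_type modes (sample input and values are illustrative):

source = 'x = x + 1;'
print(lex(source, output_type='value'))  # raw token values, e.g. ['x', '=', 'x', '+', '1', ';']
print(lex(source, output_type='type'))   # token type names, e.g. ['ID', 'EQ', 'ID', 'PLUS', 'NUMBER', 'SEMI']
print(lex(source))                       # full LexToken objects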
Example No. 6
    def tokenize(self, code):
        try:
            # Decode the raw bytes to ASCII, dropping non-ASCII characters.
            script = code.decode('ascii', 'ignore')
            lexer = Lexer()
            lexer.input(script)
            return lexer
        except Exception:
            return None
Example No. 7
def variables_functions_in_script(url):
    js = extract_javascript_content(url)
    identifiers = []
    lexer = Lexer()
    lexer.input(js)
    for token in lexer:
        if token.type == 'ID':
            identifiers.append(
                unicodedata.normalize('NFKD',
                                      token.value).encode('ascii', 'ignore'))
    return identifiers
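
extract_javascript_content is not defined in this snippet; a naive, hedged sketch of such a helper (the use of requests and a regex is an assumption) might be:

import re
import requests

def extract_javascript_content(url):
    # Fetch the page and concatenate the bodies of inline <script> blocks.
    # A real implementation would use an HTML parser and also fetch
    # external script sources.
    html = requests.get(url, timeout=10).text
    scripts = re.findall(r'<script[^>]*>(.*?)</script>', html,
                         flags=re.DOTALL | re.IGNORECASE)
    return '\n'.join(scripts)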
Example No. 8
def tokenize(code, need_type_info=False):
    lexer = Lexer()
    lexer.input(code)

    tokens = []
    types = []
    for token in lexer:
        #print(token)
        tokens.append(token.value)
        types.append(token.type)

    if need_type_info:
        return tokens, types
    else:
        return tokens
Example No. 9
def tokensUsedSlimit(inputFile):
    '''
    Given an input JavaScript file, build a list of the SlimIt lexical tokens present in the file.

    -------
    Parameter:
    - inputFile: File
        If the file is malformed or is not a JS file, an exception will be raised (see JsDetection.isJsFile).

    -------
    Returns:
    - List
        Contains the SlimIt lexical tokens present in the input file.
    - or None if the file is not JS or is malformed.
    '''

    # Only proceed if the current file is a well-formed JS sample.
    if JsDetection.isJsFile(inputFile) == 0:
        with open(inputFile, 'r') as inF:
            s = ''
            try:
                for line in inF:
                    s += str(line)  # Store the file's content in a string; far quicker than using the SlimIt minifier.
            except UnicodeDecodeError:
                print('Exception handling')

        lexer = Lexer()
        lexer.input(s)
        l = []

        try:
            for token in lexer:
                # Structure of a token: "LexToken(VAR,'var',1,0)"
                tokenPart = str(token).split('(')
                tokenComplete = tokenPart[1].split(',')  # Keyword as used in JS
                l += [tokenComplete[0]]
            return l  # Lexical tokens

        except TypeError:
            print('Exception handling')
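
Usage sketch (assumes the JsDetection module referenced above is available and that sample.js is a local JavaScript file):

from collections import Counter

token_types = tokensUsedSlimit('sample.js')
if token_types is not None:
    print(Counter(token_types).most_common(5))  # most frequent token types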
Example No. 10
    def __init__(self,
                 lex_optimize=True,
                 lextab=lextab,
                 yacc_optimize=True,
                 yacctab=yacctab,
                 yacc_debug=False):
        self.lex_optimize = lex_optimize
        self.lextab = lextab
        self.yacc_optimize = yacc_optimize
        self.yacctab = yacctab
        self.yacc_debug = yacc_debug

        self.lexer = Lexer()
        self.lexer.build(optimize=lex_optimize, lextab=lextab)
        self.tokens = self.lexer.tokens

        self.parser = ply.yacc.yacc(module=self,
                                    optimize=yacc_optimize,
                                    debug=yacc_debug,
                                    tabmodule=yacctab,
                                    start='program')
Example No. 11
    "key2": "value2"
 };
 """


class MyVisitor(ASTVisitor):
    def visit_Object(self, node):
        # Visit an object literal. Overriding visit() itself would break the
        # visitor's dispatch, so handle Object nodes here instead.
        for prop in node:
            left, right = prop.left, prop.right
            print('Property key=%s, value=%s' % (left.value, right.value))
            self.visit(prop)  # visit the property's children in turn


try:
    parser = Parser()
    tree = parser.parse(text)

    visitor = MyVisitor()
    visitor.visit(tree)
except Exception as e:
    logging.exception(e)

# 4 Using lexer in your project
print('4 Using lexer in your project')
lexer = Lexer()
lexer.input('a=1;x=1+3;')
for token in lexer:
    print(token)


Example No. 12
    def _get_lexer(self):
        lexer = Lexer()
        return lexer
Example No. 13
    def get_type_idx(self):
        from slimit.lexer import Lexer
        lexer = Lexer()
        self.type2idx = dict()
        for i, tok in enumerate(lexer.tokens):
            self.type2idx[tok] = i
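
A standalone sketch of what the resulting mapping can be used for (the feature-encoding use below is an assumption, not part of the original snippet):

from slimit.lexer import Lexer

# Map every SlimIt token type name to a stable integer index, as
# get_type_idx does, then encode a snippet as a sequence of type ids.
type2idx = {tok: i for i, tok in enumerate(Lexer().tokens)}

lexer = Lexer()
lexer.input('var a = 1;')
type_ids = [type2idx[token.type] for token in lexer]
print(type_ids)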