Example #1
def tokenize_js(code, need_type_info=False):
    # with open(code, 'r') as f:
    #     code = f.read()
    lexer = Lexer()
    lexer.input(code)

    tokens = []
    types = []
    pos = []
    while True:
        token = lexer.token()
        if not token:
            break
        tokens.append(token.value)
        types.append(token.type)
        pos.append([token.lineno, token.lexpos])

    if need_type_info:
        return tokens, types, pos
    else:
        return tokens, pos


# if __name__=="__main__":
#     print("No code/function passed in, function below is used to show you a case:")
#     print()
#     CodeExample = open('def.js','r',encoding='utf-8').read()
#     print()
#     tokens, types, pos = tokenize_js(CodeExample, True)
#     print(tokens)
#     print(types)
#     print(pos)
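
A runnable variant of the commented-out demo above, as a minimal sketch: the inline JavaScript string is an assumption (the original reads def.js from disk), and tokenize_js is expected to live in a module that already has slimit's Lexer imported.

js_source = "var x = 1; alert(x);"  # hypothetical input instead of reading def.js
tokens, types, pos = tokenize_js(js_source, need_type_info=True)
print(tokens)  # raw token values
print(types)   # token type names such as 'ID' and 'NUMBER'
print(pos)     # [lineno, lexpos] pairs, one per token
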
Example #2
def LexingofFunction(str):
    lexer = Lexer()
    lexer.input(str)

    lhs = ""
    rhs = ""
    flag = 0
    token = None  # returned below; stays None if the lexer yields no tokens
    for token in lexer:

        # print token
        tokenTemp = str(token)

        if '=' not in tokenTemp and flag == 0:

            tokenTemp = trim(tokenTemp)
            lhs = lhs + tokenTemp

        elif '=' in tokenTemp or ';' in tokenTemp:
            flag = 1
            continue

        else:
            rhs = rhs + tokenTemp

    print "Printing lhs", lhs
    print "Printing rhs", rhs

    addinHeap(lhs, rhs)
    # CompareLHSandRHS(lhs,rhs)


    return token
Example #3
def train_from_js_tokens(corpus):
    lexer = Lexer()
    tokens_corpus = []
    for t in corpus:
        lexer.input(t)
        tokens_corpus.append(' '.join([token.type for token in lexer]))
    return train_tfidf(tokens_corpus)
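
train_tfidf is not defined in this snippet. Purely as an illustrative sketch, it could be backed by scikit-learn's TfidfVectorizer; this is an assumption, not necessarily what the original project uses.

from sklearn.feature_extraction.text import TfidfVectorizer

def train_tfidf(docs):
    # Fit a TF-IDF model over the space-joined token-type strings built above
    # and return the fitted vectorizer together with the document-term matrix.
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(docs)
    return vectorizer, matrix
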
Example #4
def custom_lex(text):
    # lexes string, stops when it starts making TypeErrors
    # input: f.read()
    out_list = []
    id_list = []
    tok_list = []
    str_list = []
    num_list = []
    lexer = Lexer()
    lexer.input(text)
    while True:
        try:
            token = lexer.token()
            if not token:
                break # break if end of token
            tok_type = token.type
            if tok_type=='ID':
                id_list.append(token.value)
            elif tok_type=='STRING':
                str_list.append(token.value)
            elif tok_type=='NUMBER':
                num_list.append(token.value)
            else:
                tok_list.append(token.value)
            out_list.append(token.value)
        except TypeError:
            break
        except AttributeError:
            break
    return out_list, id_list, tok_list, str_list, num_list
Example #5
def lex(text, output_type='all'):
    # lexes string, stops when it starts making TypeErrors
    # input: f.read()
    out_list = []
    lexer = Lexer()
    lexer.input(text)
    while True:
        try:
            token = lexer.token()
            if not token:
                break  # break if end of token
            if output_type == 'value':
                try:
                    out_list.append(token.value)
                except AttributeError:
                    break
            elif output_type == 'type':
                try:
                    out_list.append(token.type)
                except AttributeError:
                    break
            else:
                out_list.append(token)
        except TypeError:
            break
        except AttributeError:
            break
    return out_list
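
A short usage sketch for lex, showing the three output_type modes; the JavaScript input string is hypothetical.

js = "var n = 42; s = 'hi';"
print(lex(js, output_type='value'))  # raw token values: 'var', 'n', '=', '42', ...
print(lex(js, output_type='type'))   # token type names such as 'ID', 'NUMBER', 'STRING'
print(lex(js))                       # default 'all': the token objects themselves
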
Example #6
    def extract(self, node):

        lexer = JsLexer()
        lexer.input(node.text)
        regions = []

        if not self._parse_succeeds(node.text):
            return regions

        while True:
            try:
                tok = lexer.token()
                if not tok:
                    break
                if tok.type == "REGEX":
                    start_char = tok.lexpos + 1
                    regex_parts = tok.value.split('/')
                    string = regex_parts[1]
                    flags = regex_parts[2]
                    if not self._are_flags_valid(flags):
                        continue
                    end_char = start_char + len(string) - 1
                    r = RegexRegion(string, node, start_char, end_char, string)
                    regions.append(r)
            except (TypeError, AttributeError):
                logging.warning("Failed to parse text: %s...", node.text[:100])
                break

        return regions
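
RegexRegion, self._parse_succeeds and self._are_flags_valid belong to the surrounding class and are not shown in this snippet. Purely for illustration, a minimal RegexRegion compatible with the positional call above might look like this; the attribute names are guesses, not the original API.

class RegexRegion(object):
    # Stand-in matching the call RegexRegion(string, node, start_char, end_char, string).
    def __init__(self, pattern, node, start_char, end_char, text):
        self.pattern = pattern        # body of the /.../ literal, without slashes or flags
        self.node = node              # node whose text contained the regex
        self.start_char = start_char  # offset just past the opening '/'
        self.end_char = end_char      # offset of the pattern's last character
        self.text = text              # same value as pattern in the call above
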
Example #7
def numberEval(script):
    counter=0
    lexer = Lexer()
    lexer.input(script)
    while True:
        token = lexer.token()
        if not token:
            break
        if token.value == 'eval':
            counter+=1
    return counter
Example #8
def numberDOModificationFunctions(script):
    lexer = Lexer()
    lexer.input(script)
    counter = 0
    while True:
        token = lexer.token()
        if not token:
            break
        if token.type == 'ID' and token.value in DOModificationFunctions:
            counter += 1
    return counter        
Example #9
def numberOfUnescapeAndEscape(script):
    lexer = Lexer()
    lexer.input(script)
    counter = 0
    while True:
        token = lexer.token()
        if not token:
            break
        if token.type == 'ID' and token.value in functions:
            counter += 1
    return counter        
Example #10
def numberOfStringsContainingSubstring(script,substring='iframe'):
    lexer = Lexer()
    lexer.input(script)
    counter = 0
    while True:
        token = lexer.token()
        if not token:
            break
        if token.type == 'STRING' and substring in token.value:
            counter+=1
    return counter
Example #11
def strExtractParse(s):
    '''Extracts all the substrings from the script using a lexer. '''
    l=[]
    lexer = Lexer()
    lexer.input(s)
    while True:
        token = lexer.token()
        if not token:
            break
        if token.type == 'STRING':
            l+=[token.value]
    return l
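
Example usage of strExtractParse with a hypothetical input. slimit's STRING token values keep their surrounding quotes, which is why Example #13 below subtracts 2 from each string's length.

js = "var a = 'foo'; var b = \"bar\" + 'foo';"
print(strExtractParse(js))  # ["'foo'", '"bar"', "'foo'"]
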
Example #12
def feature101(str):
    #f = open(path)
    #script = f.read()
    lexer = Lexer()
    lexer.input(str)
    counter = 0
    while True:
        token = lexer.token()
        if not token:
            break
        if token.type == 'STRING':
            counter += token.value.count('\\x')  # accumulate "\x" escape sequences across all string literals
    return counter
Example #13
def averageLengthOfStrings(script):
    lexer = Lexer()
    lexer.input(script)
    strings = set()
    totalStringsLength = 0
    while True:
        token = lexer.token()
        if not token:
            break
        if token.type == 'STRING':
            strings.add(token.value)
    for string in strings:
        totalStringsLength += len(string)-2
    if len(strings) > 0:
        return round(totalStringsLength/len(strings),2)
    else:
        return 0
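
Usage sketch for averageLengthOfStrings with a hypothetical input. Duplicate literals are counted once because the values go into a set, and the surrounding quotes are excluded via the len(string) - 2 adjustment.

js = "var a = 'foo'; var b = 'foo'; var c = 'longer string';"
print(averageLengthOfStrings(js))  # (3 + 13) / 2 = 8.0
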
Example #14
__author__ = 'khushboomandlecha'

# Trying out the lexer function

from slimit.lexer import Lexer
lexer = Lexer()
lexer.input('o = {};o.p1 = 3;y = prompt(\'Which property do you want?\');if (o[y]) {l = 0;}')
file = open('outputfile.txt','w')



for token in lexer:

    file.write(str(token))
    file.write('\n')
    print(token)

file.close()
Example #15
from __future__ import division

from slimit.lexer import Lexer
from slimit.parser import Parser
from slimit.visitors.nodevisitor import ASTVisitor

text = open('test.js').read()

linecount = len(text.splitlines())

tokenList = []
tokens = {}

lexer = Lexer()
lexer.input(text)
for token in lexer:
    tokens[token.lexpos] = dict(
        t = token.type[:],
        v = token.value[:],
        prior = [t.value for t in tokenList[-5:]] + [''] * (5 - len(tokenList[-5:]))
    )
    tokenList.append(token)

parser = Parser(yacc_tracking=True)
tree = parser.parse(text)

class TrainingSample(object):
    def __init__(self, structure, lineno, prior, match, completion):
        self._structure = structure

        self._slno = lineno