def tokenize(path):
    """Return a newline-joined string of normalized identifiers from *path*.

    Each identifier token is lowercased and stripped of underscores so that
    naming-style differences (snake_case vs camelCase) do not matter to
    downstream comparison.

    Args:
        path: file path handed to the project's Tokenizer.

    Returns:
        One normalized identifier per line, as a single string.
    """
    tokenizer = Tokenizer(path)
    idents = []
    for token in tokenizer.raw_tokenize():
        if token.kind.name == "IDENTIFIER":
            # str.replace is the idiomatic (and cheaper) way to delete a
            # fixed substring; the original used re.sub for no reason.
            idents.append(token.spelling.lower().replace("_", ""))
    return "\n".join(idents)
def tokenize(path):
    """Return a newline-joined summary of literal and comment tokens in *path*.

    String literals and comments are reduced to a short digest (first 10 hex
    chars of a SHA-256 over the mangled text); all other literals keep their
    exact spelling.
    """
    def _digest(text):
        # Short, stable fingerprint of the (mangled) token text.
        return sha256(mangle_text(text)).hexdigest()[:10]

    items = []
    for token in Tokenizer(path).raw_tokenize():
        kind_name = token.kind.name
        if kind_name == "LITERAL":
            if token.cursor.kind == clang.cindex.CursorKind.STRING_LITERAL:
                # Strings get extra processing: store a hash, not raw text.
                items.append(_digest(token.spelling))
            else:
                items.append(token.spelling)
        elif kind_name == "COMMENT":
            # Skip the two-character comment leader before hashing.
            items.append(_digest(token.spelling[2:]))
    return "\n".join(items)
def genStats(path, helpers):
    """Collect simple per-file statistics from the tokens and raw text of *path*.

    Args:
        path: file to analyze (read both via Tokenizer and io.readFile).
        helpers: unused here; kept for interface compatibility with callers.

    Returns:
        Dict with: numLines, numWhitespace, numComments, avgIdentLength,
        numFunctions (identifier followed by "(" — declarations and calls),
        numDefines, numMathOps, numReturns, lenLongestLine.
    """
    tokens = Tokenizer(path).raw_tokenize()
    text = io.readFile(path)

    # Set membership is O(1) and hoisted out of the token loop.
    mathOps = {"+", "-", "*", "/", "|", "&", "+=", "-=", "*=", "/=",
               ">>=", "<<=", "++", "--", "~", ">>", "!"}

    numComments = 0
    numFunctions = 0
    numMathOps = 0
    numReturns = 0
    idents = []
    lastWasIdent = False

    for token in tokens:
        if token.kind.name == "COMMENT":
            numComments += 1
        if token.spelling in mathOps:
            numMathOps += 1
        # An identifier immediately followed by "(" counts as a function
        # declaration or call.
        if lastWasIdent and token.spelling == "(":
            numFunctions += 1
        if token.spelling == "return":
            numReturns += 1
        if token.kind.name == "IDENTIFIER":
            idents.append(token.spelling)
            lastWasIdent = True
        else:
            lastWasIdent = False

    # BUG FIX: the original assigned "avgIdentLenth" (typo), so the reported
    # average identifier length was always 0.
    avgIdentLength = 0.0
    if idents:
        avgIdentLength = sum(len(ident) for ident in idents) / float(len(idents))

    # Raw string avoids the invalid "\s" escape warning of the original.
    numDefines = len(re.findall(r"#\s*define ", text.lower()))

    lines = text.split("\n")
    if len(lines) == 1:
        # Fall back to classic Mac/old-Windows "\r" line endings.
        lines = text.split("\r")
    numLines = len(lines)

    # text.split always yields at least one element, but default=0 keeps
    # this safe for an empty sequence anyway.
    lenLongestLine = max((len(line) for line in lines), default=0)

    numWhitespace = sum(1 for char in text if char in (" ", "\n", "\t", "\r"))

    return {
        "numLines": numLines,
        "numWhitespace": numWhitespace,
        "numComments": numComments,
        "avgIdentLength": avgIdentLength,
        "numFunctions": numFunctions,
        "numDefines": numDefines,
        "numMathOps": numMathOps,
        "numReturns": numReturns,
        "lenLongestLine": lenLongestLine,
    }
def genStats(path, helpers):
    """Compute basic statistics over a source file's tokens and raw text.

    Args:
        path: file to analyze (read via Tokenizer and io.readFile).
        helpers: unused; retained so the caller-facing signature is unchanged.

    Returns:
        Dict of counters: numLines, numWhitespace, numComments,
        avgIdentLength, numFunctions (identifier directly followed by "("),
        numDefines, numMathOps, numReturns, lenLongestLine.
    """
    tokenStream = Tokenizer(path).raw_tokenize()
    text = io.readFile(path)

    # Hoisted out of the loop; set gives O(1) membership tests.
    MATH_OPS = {"+", "-", "*", "/", "|", "&", "+=", "-=", "*=", "/=",
                ">>=", "<<=", "++", "--", "~", ">>", "!"}

    numComments = 0
    numFunctions = 0
    numMathOps = 0
    numReturns = 0
    identifiers = []
    prevWasIdent = False

    for token in tokenStream:
        spelling = token.spelling
        kindName = token.kind.name
        if kindName == "COMMENT":
            numComments += 1
        if spelling in MATH_OPS:
            numMathOps += 1
        # Identifier followed by "(" => function declaration or call.
        if prevWasIdent and spelling == "(":
            numFunctions += 1
        if spelling == "return":
            numReturns += 1
        if kindName == "IDENTIFIER":
            identifiers.append(spelling)
            prevWasIdent = True
        else:
            prevWasIdent = False

    # BUG FIX: original wrote the result to "avgIdentLenth" (misspelled),
    # leaving the returned average permanently at 0.
    avgIdentLength = 0.0
    if identifiers:
        avgIdentLength = sum(len(name) for name in identifiers) / float(len(identifiers))

    # Raw string prevents the "\s" invalid-escape warning.
    numDefines = len(re.findall(r"#\s*define ", text.lower()))

    lines = text.split("\n")
    if len(lines) == 1:
        # Only "\r" endings present — re-split on those instead.
        lines = text.split("\r")
    numLines = len(lines)

    lenLongestLine = max((len(line) for line in lines), default=0)

    numWhitespace = sum(1 for char in text if char in (" ", "\n", "\t", "\r"))

    return {
        "numLines": numLines,
        "numWhitespace": numWhitespace,
        "numComments": numComments,
        "avgIdentLength": avgIdentLength,
        "numFunctions": numFunctions,
        "numDefines": numDefines,
        "numMathOps": numMathOps,
        "numReturns": numReturns,
        "lenLongestLine": lenLongestLine,
    }