return token.rstrip() + " " if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-n", dest="N", required=True, type=int) parser.add_argument("--threshould", dest="threshould", required=True, type=float) parser.add_argument("-t", "--target", required=True, type=argparse.FileType("r"), dest="target_file") parser.add_argument("-v", "--vector", required=True, type=str, dest="vector_path") args = parser.parse_args() model = kenlm.Model(args.vector_path) lexer = RubyLexer() token_stream = lexer.get_tokens(args.target_file.read()) token_str = "" for token_data in token_stream: token_str += replace_special_char(token_data[-1]) token_list = token_str.split(" ") bag_of_ngrams = ngrams(token_list, args.N) index = 0 x = [] y = [] for ngram in bag_of_ngrams: probabilty = 1/model.perplexity(" ".join(ngram))
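
# Usage sketch (not part of the original script; the file names below are
# hypothetical placeholders). With the arguments defined above, an invocation
# would look roughly like:
#
#   python score_ngrams.py -n 3 --threshold 0.0001 -t target.rb -v model.binary
#
# where "score_ngrams.py" is whatever this module is saved as, "target.rb" is
# the Ruby file to score, and "model.binary" is a trained KenLM model.
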
def tokenize(program_path, raw=False):
    lexer = RubyLexer()
    token_streams = []
    with open(program_path, "r") as f:
        program = f.readlines()
    num_of_lines = len(program)
    last_indent_count = 0
    for line in program:
        line_of_token = []
        for token_data in lexer.get_tokens(line):
            token_type = token_data[0]
            token = token_data[-1]
            if raw:
                # Keep the literal token text; only escape special characters,
                # treating comments and literals separately.
                if is_token_subtype(token_type, Comment) or is_token_subtype(
                        token_type, Literal):
                    arranged_token = replace_special_char(token, comment=True)
                else:
                    arranged_token = replace_special_char(token, comment=False)
            else:
                # Abstract literals, identifiers and comments into placeholder tokens.
                if is_token_subtype(token_type, Literal):
                    arranged_token = "<LITERAL>"
                elif is_token_subtype(token_type, String):
                    arranged_token = "<STRING>"
                elif is_token_subtype(token_type, Number):
                    arranged_token = "<NUMBER>"
                elif token_type == Token.Name.Operator:
                    arranged_token = "<OPERATOR>"
                elif token_type == Name and token not in reserved:
                    arranged_token = "<ID>"
                elif token_type == Name.Variable.Instance:
                    arranged_token = "<INSTANCE_VAL>"
                elif token_type == Name.Variable.Class:
                    arranged_token = "<CLASS_VAL>"
                elif token_type == Name.Constant:
                    arranged_token = "<CONSTANT_ID>"
                elif token_type == Name.Function:
                    arranged_token = "<FUNCTION>"
                elif token_type == Name.Class:
                    arranged_token = "<CLASS>"
                elif token_type == Name.Namespace:
                    arranged_token = "<NAMESPACE>"
                elif token_type == Token.Name.Variable.Global:
                    arranged_token = "<GLOBAL_VAL>"
                elif token_type == Token.Error:
                    # Token emitted when lexing fails inside Pygments (e.g. emoji).
                    arranged_token = "<ERROR>"
                elif is_token_subtype(token_type, Comment):
                    arranged_token = "<COMMENT>"
                else:
                    arranged_token = replace_special_char(token)
            # Append a trailing space so the tokens stay space-separated.
            line_of_token.append(arranged_token + " ")

        # Treat each pair of leading spaces as one level of indentation.
        line_of_token[0] = line_of_token[0].replace("<SPACE> <SPACE> ", "<INDENT> ")
        # Encode the indentation as a value relative to the previous line.
        indent_count = len(re.findall("<INDENT>", line_of_token[0]))
        if indent_count != 0:
            # An indented blank line can carry both the indent and the newline in
            # element 0, so strip the indent markers first and then prepend the
            # relative indent token.
            indent_char = "<INDENT{}> ".format(indent_count - last_indent_count)
            line_of_token[0] = line_of_token[0].replace("<INDENT> ", "")
            line_of_token[0] = indent_char + line_of_token[0]
        if len(line_of_token) != 1:
            last_indent_count = indent_count
        token_streams.append(line_of_token)
    return token_streams, num_of_lines
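

# Usage sketch for tokenize() (an assumption, not part of the original module;
# "sample.rb" is a hypothetical path). The function returns one list of
# normalized tokens per source line together with the total line count:
#
#   token_lines, n_lines = tokenize("sample.rb", raw=False)
#   for line_of_token in token_lines:
#       print("".join(line_of_token).rstrip())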