Example No. 1
    # Assumes the lexer classes below are imported from pygments.lexers,
    # e.g. from pygments.lexers import CLexer, CppLexer, RubyLexer, ...
    def createLexers(self):

        lex = {}
        lex['.c'] = CLexer()
        lex['.h'] = CLexer()
        lex['.cpp'] = CppLexer()
        lex['.hpp'] = CppLexer()
        lex['.css'] = CssLexer()
        lex['.sass'] = SassLexer()
        lex['.yaml'] = YamlLexer()
        lex['.yml'] = YamlLexer()
        lex['.json'] = JsonLexer()
        lex['.cs'] = CSharpLexer()
        lex['.fs'] = FSharpLexer()
        lex['.e'] = EiffelLexer()
        lex['.erl'] = ErlangLexer()
        lex['.hrl'] = ErlangLexer()
        lex['.es'] = ErlangLexer()
        lex['.f03'] = FortranLexer()
        lex['.f90'] = FortranLexer()
        lex['.F03'] = FortranLexer()
        lex['.F90'] = FortranLexer()
        lex['.go'] = GoLexer()
        lex['.hs'] = HaskellLexer()
        lex['.v'] = VerilogLexer()
        lex['.vhdl'] = VhdlLexer()
        lex['.vhd'] = VhdlLexer()
        lex['.html'] = HtmlLexer()
        lex['.htm'] = HtmlLexer()
        lex['.xhtml'] = HtmlLexer()
        lex['.xml'] = XmlLexer()
        lex['.js'] = JavascriptLexer()
        lex['.ts'] = TypeScriptLexer()
        lex['.coffee'] = CoffeeScriptLexer()
        lex['.java'] = JavaLexer()
        lex['.scala'] = ScalaLexer()
        lex['.kt'] = KotlinLexer()
        lex['.ktm'] = KotlinLexer()
        lex['.kts'] = KotlinLexer()
        lex['.lisp'] = CommonLispLexer()
        lex['make'] = MakefileLexer()
        lex['Make'] = MakefileLexer()
        lex['CMake'] = CMakeLexer()
        lex['cmake'] = CMakeLexer()
        lex['.m'] = MatlabLexer()
        lex['.mat'] = MatlabLexer()
        lex['.dpr'] = DelphiLexer()
        lex['.perl'] = PerlLexer()
        lex['.php'] = PhpLexer()
        lex['.pr'] = PrologLexer()
        lex['.py'] = Python3Lexer()
        lex['.rb'] = RubyLexer()
        lex['.sh'] = BashLexer()
        lex['.sql'] = MySqlLexer()
        lex['.mysql'] = MySqlLexer()
        lex['.tcl'] = TclLexer()
        lex['.awk'] = AwkLexer()

        return lex
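
The returned mapping can drive lexer selection by file extension. A minimal usage sketch (the lexer_for_file helper and the plain-text fallback are assumptions, not part of the original class):

import os
from pygments.lexers.special import TextLexer

def lexer_for_file(editor, filename):
    # editor is any object exposing createLexers(); unknown extensions
    # fall back to Pygments' plain-text lexer.
    ext = os.path.splitext(filename)[1]
    return editor.createLexers().get(ext, TextLexer())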
Example No. 2
    @classmethod
    def parse(cls, location):
        with io.open(location, encoding='utf-8') as loc:
            file_contents = loc.read()

        # We use a Pygments formatter to "parse" the lexed Ruby code into JSON.
        formatted_file_contents = highlight(file_contents, RubyLexer(),
                                            ChefMetadataFormatter())
        package_data = json.loads(formatted_file_contents)
        return build_package(package_data, datasource_id=cls.datasource_id)
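
ChefMetadataFormatter is not a Pygments built-in; it is a custom Formatter whose format() walks the token stream and writes JSON, which is why highlight() can act as a parser here. A minimal sketch of that pattern (the class name and the collected fields are illustrative assumptions):

import json
from pygments.formatter import Formatter
from pygments.token import Name, String

class MetadataFormatter(Formatter):
    # Hypothetical stand-in for ChefMetadataFormatter: collect names and
    # string literal values from the token stream and emit a JSON object.
    def format(self, tokensource, outfile):
        data = {'names': [], 'strings': []}
        for ttype, value in tokensource:
            if ttype in Name:
                data['names'].append(value)
            elif ttype in String:
                data['strings'].append(value.strip('\'"'))
        outfile.write(json.dumps(data))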
Example No. 3
 # Assumes a web.py handler: import web; from pygments import highlight;
 # from pygments.formatters import html; render is the web.py template renderer.
 def POST(self):
     data = web.input()
     code = data.code
     language = data.lang
     if language == 'python':
         from pygments.lexers.python import PythonLexer
         lexer = PythonLexer()
     elif language == 'php':
         from pygments.lexers.php import PhpLexer
         lexer = PhpLexer()
     elif language == 'java':
         from pygments.lexers.jvm import JavaLexer
         lexer = JavaLexer()
     elif language == 'javascript':
         from pygments.lexers.javascript import JavascriptLexer
         lexer = JavascriptLexer()
     elif language == 'html':
         from pygments.lexers.html import HtmlLexer
         lexer = HtmlLexer()
     elif language == 'cpp':
         from pygments.lexers.c_cpp import CppLexer
         lexer = CppLexer()
     elif language == 'shell':
         from pygments.lexers.shell import ShellSessionLexer
         lexer = ShellSessionLexer()
     elif language == 'matlab':
         from pygments.lexers.matlab import MatlabLexer
         lexer = MatlabLexer()
     elif language == 'ruby':
         from pygments.lexers.ruby import RubyLexer
         lexer = RubyLexer()
     elif language == 'r':
         from pygments.lexers.r import RConsoleLexer
         lexer = RConsoleLexer()
     elif language == 'lisp':
         from pygments.lexers.lisp import SchemeLexer
         lexer = SchemeLexer()
     elif language == 'go':
         from pygments.lexers.go import GoLexer
         lexer = GoLexer()
     else:
         from pygments.lexers.special import TextLexer
         lexer = TextLexer()  # fall back to plain text so 'lexer' is always bound
     formatter = html.HtmlFormatter(linenos=False,
                                    encoding='utf-8',
                                    nowrap=False)
     highlighted_snippet = highlight(code, lexer, formatter)
     return render.result(highlighted_snippet)
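
The whole import ladder can be collapsed with Pygments' name-based lookup, at the cost of using Pygments' own aliases (e.g. 'shell' and 'r' resolve to different lexers than the hand-picked ones above). A sketch, with a hypothetical pick_lexer helper:

from pygments.lexers import get_lexer_by_name
from pygments.util import ClassNotFound

def pick_lexer(language):
    # Resolve names like 'python', 'cpp', or 'go' via the Pygments registry;
    # unknown names fall back to the plain-text lexer.
    try:
        return get_lexer_by_name(language)
    except ClassNotFound:
        return get_lexer_by_name('text')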
Example No. 4
lexer_from_ext = {  # opening reconstructed from the usage below; earlier entries truncated
    'xslt': XmlLexer(),
    'xsd': XmlLexer(),
    'wsdl': XmlLexer(),
    'php': HtmlPhpLexer(),
    'php5': HtmlPhpLexer(),
    'pl': Perl6Lexer(),
    'pm': Perl6Lexer(),
    'nqp': Perl6Lexer(),
    'p6': Perl6Lexer(),
    '6pl': Perl6Lexer(),
    'p6l': Perl6Lexer(),
    'pl6': Perl6Lexer(),
    'p6m': Perl6Lexer(),
    'pm6': Perl6Lexer(),
    't': Perl6Lexer(),
    'rb': RubyLexer(),
    'rbw': RubyLexer(),
    'rake': RubyLexer(),
    'rbx': RubyLexer(),
    'duby': RubyLexer(),
    'gemspec': RubyLexer(),
    'ini': IniLexer(),
    'init': IniLexer(),
    'sh': BashLexer(),
    'diff': DiffLexer(),
    'patch': DiffLexer(),
    'cs': CSharpLexer(),
    'md': MarkdownLexer(),  # WAIT: works poorly
}

known_extensions = list(lexer_from_ext.keys())
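
A usage sketch for the table above (the extension value and the plain-text fallback are illustrative):

ext = 'rb'
lexer = lexer_from_ext.get(ext, TextLexer())  # assumes TextLexer is imported as a fallback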
Example No. 5
def lexer():
    yield RubyLexer()
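
This reads as a pytest fixture with its decorator cut off above the snippet; a test receives the RubyLexer instance by naming the fixture as a parameter. A sketch, assuming pytest:

import pytest
from pygments.lexers import RubyLexer

@pytest.fixture
def lexer():
    yield RubyLexer()

def test_tokenizes_def_keyword(lexer):
    # The first token of a method definition should be the 'def' keyword.
    tokens = list(lexer.get_tokens('def foo; end'))
    assert tokens[0][1] == 'def'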
Example No. 6
        pass

    if extens == "py" or extens == "pyw" or extens == "sc" or extens == "sage" or extens == "tac":
        ui_core = CoreUI(lexer=PythonLexer())
    elif extens == "txt" or extens == "README" or extens == "text":
        ui_core = CoreUI(lexer=TextLexer())
    elif extens == "htm" or extens == "html" or extens == "css" or extens == "js" or extens == "md":
        ui_core = CoreUI(lexer=HtmlLexer())
    elif extens == "xml" or extens == "xsl" or extens == "rss" or extens == "xslt" or extens == "xsd" or extens == "wsdl" or extens == "wsf":
        ui_core = CoreUI(lexer=XmlLexer())
    elif extens == "php" or extens == "php5":
        ui_core = CoreUI(lexer=HtmlPhpLexer())
    elif extens == "pl" or extens == "pm" or extens == "nqp" or extens == "p6" or extens == "6pl" or extens == "p6l" or extens == "pl6" or extens == "pm" or extens == "p6m" or extens == "pm6" or extens == "t":
        ui_core = CoreUI(lexer=Perl6Lexer())
    elif extens == "rb" or extens == "rbw" or extens == "rake" or extens == "rbx" or extens == "duby" or extens == "gemspec":
        ui_core = CoreUI(lexer=RubyLexer())
    elif extens == "ini" or extens == "init":
        ui_core = CoreUI(lexer=IniLexer())
    elif extens == "conf" or extens == "cnf" or extens == "config":
        ui_core = CoreUI(lexer=ApacheConfLexer())
    elif extens == "sh" or extens == "cmd" or extens == "bashrc" or extens == "bash_profile":
        ui_core = CoreUI(lexer=BashLexer())
    elif extens == "diff" or extens == "patch":
        ui_core = CoreUI(lexer=DiffLexer())
    elif extens == "cs":
        ui_core = CoreUI(lexer=CSharpLexer())
    elif extens == "sql":
        ui_core = CoreUI(lexer=MySqlLexer())
    else:
        ui_core = CoreUI(lexer=PythonLexer())  # default (no extension) lexer is python
    ui_core.mainloop()
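
Pygments can also resolve the lexer straight from the file name, which subsumes hand-maintained extension chains like the one above. A sketch with a hypothetical helper, keeping Python as the default:

from pygments.lexers import get_lexer_for_filename, PythonLexer
from pygments.util import ClassNotFound

def lexer_for(filename):
    # Let the Pygments registry map the file name to a lexer; unknown
    # names keep the Python default used by the chain above.
    try:
        return get_lexer_for_filename(filename)
    except ClassNotFound:
        return PythonLexer()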
Example No. 7
    # We want exactly one trailing space, so strip the whitespace produced by
    # SPACE/NEWLINE tokens, then append a single space.
    return token.rstrip() + " "


# Assumes: import argparse; import kenlm; from pygments.lexers import RubyLexer;
# plus the replace_special_char() above and an ngrams() helper (a stand-in
# sketch follows this example).
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", dest="N", required=True, type=int)
    parser.add_argument("--threshold", dest="threshold", required=True, type=float)
    parser.add_argument("-t", "--target", required=True, type=argparse.FileType("r"), dest="target_file")
    parser.add_argument("-v", "--vector", required=True, type=str, dest="vector_path")
    args = parser.parse_args()

    model = kenlm.Model(args.vector_path)

    lexer = RubyLexer()

    token_stream = lexer.get_tokens(args.target_file.read())

    token_str = ""

    for token_data in token_stream:
        token_str += replace_special_char(token_data[-1])

    token_list = token_str.split(" ")

    bag_of_ngrams = ngrams(token_list, args.N)

    index = 0
    x = []
    y = []
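
The snippet is cut off here and relies on an ngrams() helper that is not shown. A minimal stand-in consistent with how it is called above:

def ngrams(tokens, n):
    # Slide a window of size n over the token list and collect the windows.
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]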
Example No. 8
# Assumes: import re; from pygments.lexers import RubyLexer; from pygments.token
# import Token, Name, Comment, Literal, String, Number, is_token_subtype;
# plus the reserved word set and replace_special_char() from Example No. 7.
def tokenize(program_path, raw=False):
    lexer = RubyLexer()

    token_streams = []
    with open(program_path, "r") as f:
        program = f.readlines()

    num_of_lines = len(program)

    last_indent_count = 0

    for line in program:
        line_of_token = []
        for token_data in lexer.get_tokens(line):
            token_type = token_data[0]
            token = token_data[-1]

            if raw:
                if is_token_subtype(token_type, Comment) or is_token_subtype(
                        token_type, Literal):
                    arranged_token = replace_special_char(token, comment=True)
                else:
                    arranged_token = replace_special_char(token, comment=False)
            else:
                if is_token_subtype(token_type, Literal):
                    arranged_token = "<LITERAL>"
                elif is_token_subtype(token_type, String):
                    arranged_token = "<STRING>"
                elif is_token_subtype(token_type, Number):
                    arranged_token = "<NUMBER>"
                elif token_type == Token.Name.Operator:
                    arranged_token = "<OPERATOR>"
                elif token_type == Name and token not in reserved:
                    arranged_token = "<ID>"
                elif token_type == Name.Variable.Instance:
                    arranged_token = "<INSTANCE_VAL>"
                elif token_type == Name.Variable.Class:
                    arranged_token = "<CLASS_VAL>"
                elif token_type == Name.Constant:
                    arranged_token = "<CONSTANT_ID>"
                elif token_type == Name.Function:
                    arranged_token = "<FUNCTION>"
                elif token_type == Name.Class:
                    arranged_token = "<CLASS>"
                elif token_type == Name.Namespace:
                    arranged_token = "<NAMESPACE>"
                elif token_type == Token.Name.Variable.Global:
                    arranged_token = "<GLOBAL_VAL>"
                elif token_type == Token.Error:
                    arranged_token = "<ERROR>"  # pygments内で字句解析が失敗した際のトークン (絵文字など)
                elif is_token_subtype(token_type, Comment):
                    arranged_token = "<COMMENT>"
                else:
                    arranged_token = replace_special_char(token)
                    # if arranged_token not in reserved and "SPACE" not in arranged_token and "NEWLINE" not in arranged_token:
                    #     if token_type not in (Token.Punctuation, Token.Operator, Token.Name.Builtin, Token.Keyword.Pseudo):
                    #         print("==============")
                    #         print(program_path)
                    #         print(line.rstrip())
                    #         print("{} : {}".format(arranged_token.encode("utf-8"), token_type))
                    #         print("==============")

            line_of_token.append(arranged_token + " ")  # trailing space keeps tokens space-delimited

        # Treat two leading spaces at the start of a line as one indent level.
        line_of_token[0] = line_of_token[0].replace("<SPACE> <SPACE> ",
                                                    "<INDENT> ")

        # Encode indentation as a delta relative to the previous line.
        indent_count = len(re.findall("<INDENT>", line_of_token[0]))

        if indent_count != 0:
            # An indented blank line can leave both indent and newline markers in
            # element 0, so strip the indent markers first, then prepend the
            # relative indent marker.
            indent_char = "<INDENT{}> ".format(indent_count -
                                               last_indent_count)
            line_of_token[0] = line_of_token[0].replace("<INDENT> ", "")
            line_of_token[0] = indent_char + line_of_token[0]

        if len(line_of_token) != 1:
            last_indent_count = indent_count

        token_streams.append(line_of_token)

    return token_streams, num_of_lines
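
A usage sketch for tokenize() (the file path is hypothetical):

token_streams, num_of_lines = tokenize("sample.rb")
for line_of_token in token_streams:
    # Each element is one source line as a list of space-suffixed tokens.
    print("".join(line_of_token).rstrip())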