Exemplo n.º 1
0
def parse_ast_tree(filename):
    """Rebuild a tree of ``ast`` nodes from a text dump and return node ``'0'``.

    The file is scanned line by line:

    * ``>ID<TAB>TYPE`` declares a node. ``TYPE`` is matched case-insensitively
      against the ``ast`` node classes that can be instantiated without
      arguments; the new instance receives an empty ``children`` list.
    * ``<ID=CHILD,CHILD,...`` lists the ids of the children of node ``ID``.

    Any other line is ignored. Links are resolved only after the whole file
    has been read, so a link line may precede the declarations it refers to.

    Every node is also passed to ``AstNodes().index``; if that call raises,
    the file name is printed and processing continues.

    Args:
        filename: Path of the dump file to read.

    Returns:
        The node declared with id ``'0'``.

    Raises:
        KeyError: If a type name is unknown, a link refers to an undeclared
            id, or no node ``'0'`` was declared.
        IndexError: If a node or link line lacks its separator.
    """
    node_types = {}
    for attr_name in dir(ast):
        candidate = getattr(ast, attr_name)
        # Only AST subclasses are ever called, so unrelated callables exported
        # by the module (helper functions, entry points) are never invoked
        # just to probe what they return.
        if not (isinstance(candidate, type) and issubclass(candidate, ast.AST)):
            continue
        try:
            instance = candidate()
        except TypeError:
            # Node classes that need constructor arguments are skipped.
            continue
        if isinstance(instance, ast.AST):
            node_types[attr_name.lower()] = candidate

    nodes = {}
    links = {}
    # The context manager closes the file even when a malformed line raises.
    with open(filename) as dump_file:
        for line in dump_file:
            if line.startswith("<"):
                parts = line[1:].strip("\n").split("=")
                links[parts[0]] = parts[1].split(",")
            elif line.startswith(">"):
                parts = line[1:].strip("\n").split("\t")
                nodes[parts[0]] = node_types[parts[1].lower()]()
                nodes[parts[0]].children = []

    for node_id, child_ids in sorted(links.items()):
        parent = nodes[node_id]
        for child_id in child_ids:
            parent.children.append(nodes[child_id])

    dot = AstNodes()
    for node in nodes.values():
        try:
            dot.index(node)
        except Exception:
            # Best effort: report which file holds the offending node and
            # keep going.
            print(filename)

    return nodes['0']
Exemplo n.º 2
0
def parse_src_files(basefolder, seperate_trees=False, verbose=0):
    """Load the programs under ``basefolder`` as arrays of parsed trees.

    The loader is chosen from the suffix of ``basefolder``:

    * ``"python"``: files listed by ``get_ast_src_files`` are parsed with
      ``ast_parse_file``; when ``verbose == 1`` the result is also passed to
      ``dump``.
    * ``"python_trees"``: files listed by ``get_ast_src_files`` are parsed
      with ``parse_ast_tree``.
    * ``"cpp"``: files listed by ``get_dot_src_files`` are parsed with
      ``parse_tree(name, seperate_trees)``; every tree returned for a file is
      kept and the file's label is repeated once per tree.

    Returns:
        A tuple ``(X, y, tags, nodes)`` where ``X`` and ``y`` are NumPy
        arrays, ``tags`` is the third value returned by the file lister and
        ``nodes`` is a new ``AstNodes()`` (Python branches) or ``DotNodes()``
        (cpp branch). ``None`` is returned implicitly for any other suffix.
    """
    if basefolder.endswith("python"):
        file_names, labels, problems = get_ast_src_files(basefolder)
        trees = np.array([ast_parse_file(path) for path in tqdm(file_names)])
        labels = np.array(labels)
        if verbose == 1:
            dump(trees, labels, file_names)
        return trees, labels, problems, AstNodes()

    if basefolder.endswith("python_trees"):
        file_names, labels, problems = get_ast_src_files(basefolder)
        trees = np.array([parse_ast_tree(path) for path in tqdm(file_names)])
        labels = np.array(labels)
        return trees, labels, problems, AstNodes()

    if basefolder.endswith("cpp"):
        file_names, labels, problems = get_dot_src_files(basefolder)
        all_trees = []
        all_labels = []
        for position, path in enumerate(tqdm(file_names)):
            program_trees = parse_tree(path, seperate_trees)
            all_trees.extend(program_trees)
            # One label per tree, so the trees and labels stay aligned.
            all_labels.extend([labels[position]] * len(program_trees))
        return np.array(all_trees), np.array(all_labels), problems, DotNodes()
Exemplo n.º 3
0
def parse_src_files(basefolder, seperate_trees=False, verbose=0):
    """Load the programs under ``basefolder`` as an array of parsed trees.

    Files are listed with ``get_ast_src_files``. When ``basefolder`` ends
    with ``"python"`` each file is parsed with ``ast_parse_file``; otherwise
    each file is parsed with ``parse_ast_tree``. ``seperate_trees`` and
    ``verbose`` are accepted but not used.

    Returns:
        A tuple ``(X, y, tags, nodes)``: the parsed trees and the labels as
        NumPy arrays, the third value returned by ``get_ast_src_files``, and
        a new ``AstNodes()``.
    """
    reads_source_files = basefolder.endswith("python")
    file_names, labels, problems = get_ast_src_files(basefolder)
    if reads_source_files:
        trees = [ast_parse_file(path) for path in tqdm(file_names)]
    else:
        trees = [parse_ast_tree(path) for path in tqdm(file_names)]
    return np.array(trees), np.array(labels), problems, AstNodes()
Exemplo n.º 4
0
def show_embeding(model, basefolder):
    """Plot, cluster and tabulate the embedding weights held by ``model``.

    ``model.embed.W.data`` is passed through ``scale`` and then handed, with
    one integer id per row and a label array, to the plotting and table
    helpers. The label array is ``AstNodes().nodetypes`` followed by
    ``AstNodes().NONE``.

    Args:
        model: Object exposing the embedding weights as ``model.embed.W.data``.
        basefolder: Forwarded to ``cluster_plot`` as its ``basefolder``
            keyword argument.
    """
    # Word embedding analysis.
    features = scale(model.embed.W.data)
    row_ids = np.arange(features.shape[0])
    vocabulary = AstNodes()
    labels = np.array(vocabulary.nodetypes + [vocabulary.NONE])
    banner = "*" * 10

    data_plot(features, row_ids, labels)

    # The same KMeans instance is used for both the plot and the table
    # (DBSCAN(eps=0.3, min_samples=10) was a previously tried alternative).
    clusterer = KMeans(n_clusters=10, init='k-means++')
    cluster_plot(clusterer, features, row_ids, labels, basefolder=basefolder)
    print(banner, " Cluster AST Node types:", banner)
    cluster_table(clusterer, features, row_ids, labels)

    neighbour_finder = NearestNeighbors(n_neighbors=5)
    print(banner, " Neighbors of AST Node types:", banner)
    neighbors_table(neighbour_finder, features, row_ids, labels)
Exemplo n.º 5
0
def getParams(dataset, layers, cell, units, authors):
    """Return the parameter count of a newly built recursive model.

    Args:
        dataset: ``"python"`` selects ``AstNodes()`` as the feature
            dictionary; any other value selects ``DotNodes()``.
        layers: Forwarded to the model as ``layers``.
        cell: ``"lstm"`` builds a ``RecursiveLSTM``; ``"bilstm"`` builds a
            ``RecursiveBiLSTM``.
        units: First positional argument of the model constructor.
        authors: Second positional argument of the model constructor.

    Returns:
        Whatever ``model.params_count()`` returns for the built model.

    Raises:
        ValueError: If ``cell`` is neither ``"lstm"`` nor ``"bilstm"``.
    """
    # Reject unsupported cells up front; otherwise no model would be bound
    # and the final call would fail with an opaque UnboundLocalError.
    if cell not in ("lstm", "bilstm"):
        raise ValueError(
            "unsupported cell {!r}; expected 'lstm' or 'bilstm'".format(cell))

    nodes = AstNodes() if dataset == "python" else DotNodes()
    model_class = RecursiveLSTM if cell == "lstm" else RecursiveBiLSTM
    # Both model classes are configured identically, including cell="lstm"
    # for the bidirectional variant.
    model = model_class(units,
                        authors,
                        layers=layers,
                        dropout=0.2,
                        feature_dict=nodes,
                        classes=None,
                        cell="lstm",
                        residual=False)
    return model.params_count()
Exemplo n.º 6
0
    # path = R"C:\Users\bms\Files\current\research\stylemotry\stylemotery_code\saved_models\3_treelstm_3tree_500_70_labels1_epoch_206.my"
    # model = RecursiveTreeLSTM(n_children=1, n_units=500,n_label=70, dropout=0.2,feature_dict=TreeFeatures())
    # serializers.load_npz(path,model)
    # print_model(model, depth=1, output=sys.stdout)
    # show_embeding(model,basefolder=os.path.join(basefolder,model_name+"_embed"))
    # show_authors(model,basefolder=os.path.join(basefolder,model_name+"_authors"))

    # lstm
    print("LSTM")
    model_name = "lstm"
    path = R"C:\Users\bms\Files\current\research\stylemotry\stylemotery_code\saved_models\lstm\1_lstm_100_python_70_labels1_1_epoch_409.my"
    model = RecursiveLSTM(n_units=100,
                          layers=1,
                          n_label=70,
                          dropout=0.2,
                          feature_dict=AstNodes())
    serializers.load_npz(path, model)
    print_model(model, depth=1, output=sys.stdout)
    show_embeding(model,
                  basefolder=os.path.join(basefolder, model_name + "_embed"))

    # bilstm
    # print("BiLSTM")
    model_name = "bilstm"
    path = R"C:\Users\bms\Files\current\research\stylemotry\stylemotery_code\saved_models\bilstm\1_bilstm_100_python_70_labels1_epoch_333.my"
    model = RecursiveBiLSTM(n_units=100,
                            layers=1,
                            n_label=70,
                            dropout=0.2,
                            feature_dict=AstNodes(),
                            peephole=False)