Example #1
def load_aligned_source_code(dataset_directory, tokenizer="codebert"):
    dataset_path = Path(dataset_directory)

    files = unpersist(dataset_path.joinpath("common_filecontent.bz2")).rename(
        {"id": "file_id"}, axis=1)

    content = dict(
        zip(zip(files["package"], files["file_id"]), files["filecontent"]))
    pd_offsets = unpersist(dataset_path.joinpath("common_offsets.bz2"))

    seen = set()

    source_codes = []
    offsets = []

    for group, data in pd_offsets.groupby(by=["package", "file_id"]):
        source_codes.append(content[group])
        offsets.append(list(zip(data["start"], data["end"], data["node_id"])))
        seen.add(group)

    for key, val in content.items():
        if key not in seen:
            source_codes.append(val)
            offsets.append([])

    return source_code_graph_alignment(source_codes,
                                       offsets,
                                       tokenizer=tokenizer)
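A minimal usage sketch for the loader above, assuming the directory layout with common_filecontent.bz2 and common_offsets.bz2 shown in the code (the concrete path is hypothetical):

# Hypothetical call: pairs every file's source text with the graph-node
# spans recorded in common_offsets.bz2 and feeds them to the aligner.
aligned = load_aligned_source_code("datasets/example_dataset", tokenizer="codebert")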
def create_from_dataset():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("dataset_path", type=str, help="")
    parser.add_argument("output_path", type=str, help="")
    parser.add_argument("--format",
                        "-f",
                        dest="format",
                        default="jsonl",
                        help="jsonl|csv")
    parser.add_argument("--remove_default", action="store_true", default=False)

    args = parser.parse_args()

    global remove_default
    remove_default = args.remove_default

    node_maps = get_node_maps(
        unpersist(join(args.dataset_path, "common_nodes.bz2")))
    filecontent = get_filecontent_maps(
        unpersist(join(args.dataset_path, "common_filecontent.bz2")))
    offsets = group_offsets(
        unpersist(join(args.dataset_path, "common_offsets.bz2")))

    data = []
    nlp = create_tokenizer("spacy")

    for ind, (f_body, f_offsets) in enumerate(
            iterate_functions(offsets, node_maps, filecontent)):
        data.append(process_body(nlp, f_body, replacements=f_offsets))

    store(data, args)
Example #3
def load_data(node_path, edge_path, rename_columns=True):
    nodes = unpersist(node_path)
    edges = unpersist(edge_path)

    nodes = nodes.astype({'type': 'category'})
    edges = edges.astype({'type': 'category'})

    if rename_columns:
        nodes = nodes.rename(mapper={'serialized_name': 'name'}, axis=1)
        edges = edges.rename(mapper={
            'source_node_id': 'src',
            'target_node_id': 'dst'
        },
                             axis=1)

    return nodes, edges
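A short sketch of how the two frames might be consumed, assuming the nodes.bz2/edges.bz2 files used elsewhere in these examples:

# Hypothetical paths; with rename_columns=True the edges expose 'src'/'dst'.
nodes, edges = load_data("dataset/nodes.bz2", "dataset/edges.bz2")
mention_ids = set(nodes.query("type == 'mention'")["id"])
outgoing = edges[edges["src"].isin(mention_ids)]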
Example #4
def read_corpus(path, data_field):
    if path.endswith("bz2") or path.endswith("parquet") or path.endswith(
            "csv"):
        from SourceCodeTools.code.data.file_utils import unpersist
        data = unpersist(path)[data_field].tolist()
    elif path.endswith("jsonl"):
        import json
        data = []
        with open(path) as data_source:
            for ind, line in enumerate(data_source):
                if line.strip():
                    d = json.loads(line.strip())
                    if data_field in d:
                        data.append(d[data_field])
                    else:
                        logging.warning(
                            f"No data field '{data_field}' on line {ind}")
    else:
        data = []
        with open(path) as data_source:
            for ind, line in enumerate(data_source):
                if line.strip():
                    data.append(line.strip())

    return data
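read_corpus dispatches on the file extension: tabular formats go through unpersist, jsonl files are parsed line by line, and anything else is treated as plain text. A hedged sketch of the three call shapes (the file names and the 'docstring' field are hypothetical):

docstrings = read_corpus("corpus.bz2", "docstring")    # tabular: column extracted via unpersist
docstrings = read_corpus("corpus.jsonl", "docstring")  # jsonl: one JSON object per line
raw_lines = read_corpus("corpus.txt", None)            # fallback: one entry per non-empty line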
def load_names(nodes_path):
    if nodes_path is not None:
        nodes = unpersist(nodes_path)
        names = dict(
            zip(nodes['id'].tolist(), nodes['serialized_name'].tolist()))
    else:
        names = None
    return names
Example #6
def load_typed_nodes(path):
    from SourceCodeTools.code.data.file_utils import unpersist
    type_ann = unpersist(path)

    filter_rule = lambda name: "0x" not in name

    type_ann = type_ann[type_ann["dst"].apply(filter_rule)]

    typed_nodes = set(type_ann["src"].tolist())
    return typed_nodes
def process_package(working_directory, global_names=None):
    """
    Find functions with annotations, extract annotation information, strip documentation and type annotations.
    :param working_directory: location of package related files
    :param global_names: optional, mapping from global node ids to names
    :return: list of entries in spacy compatible format
    """
    bodies = unpersist_if_present(
        os.path.join(working_directory, "source_graph_bodies.bz2"))
    if bodies is None:
        return []

    offsets_path = os.path.join(working_directory, "offsets.bz2")

    # offsets store information about spans for nodes referenced in the source code
    if os.path.isfile(offsets_path):
        offsets = unpersist(offsets_path)
    else:
        logging.warning(f"No file with offsets: {offsets_path}")
        offsets = None

    def load_local2global(working_directory):
        local2global = unpersist(
            os.path.join(working_directory, "local2global_with_ast.bz2"))
        id_maps = dict(zip(local2global['id'], local2global['global_id']))
        return id_maps

    id_maps = load_local2global(working_directory)

    local_names = load_names(
        os.path.join(working_directory, "nodes_with_ast.bz2"))

    nlp = create_tokenizer("spacy")

    data = []

    for ind, (_, row) in tqdm(enumerate(bodies.iterrows()),
                              total=len(bodies),
                              leave=True,
                              desc=os.path.basename(working_directory)):
        body = row['body']

        if offsets is not None:
            graph_node_spans = offsets_for_func(offsets, body, row["id"])
        else:
            graph_node_spans = []

        entry = process_body(nlp, body, replacements=graph_node_spans)

        if entry is not None:
            entry = to_global_ids(entry, id_maps, global_names, local_names)
            data.append(entry)

    return data
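As the docstring above describes, the result is a list of spacy-compatible entries with graph ids remapped to the global namespace. A hypothetical invocation, assuming the per-package files named in the function are present:

# Paths are illustrative only.
global_names = load_names("dataset/common_nodes.bz2")
entries = process_package("dataset/envs/some_package", global_names=global_names)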
Example #8
def estimate_module_sizes(path):
    module_count = Counter()
    for dir in os.listdir(path):
        module_path = os.path.join(path, dir)
        if not os.path.isdir(module_path):
            continue

        nodes_path = os.path.join(module_path, "nodes_with_ast.bz2")

        if os.path.isfile(nodes_path):
            module_count[dir] = unpersist(nodes_path).shape[0]

    pprint(module_count.most_common())
def build_ast_graph_from_modules():

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "source_code",
        type=str,
        help=
        "Path to DataFrame pickle (written with pandas.to_pickle, use `bz2` format)."
    )
    parser.add_argument("output_path")
    parser.add_argument(
        "--bpe_tokenizer",
        type=str,
        help=
        "Path to sentencepiece model. When provided, names will be subtokenized."
    )
    parser.add_argument("--visualize",
                        action="store_true",
                        help="Visualize graph. Do not use on large graphs.")
    parser.add_argument("--create_test_data",
                        action="store_true",
                        help="Visualize graph. Do not use on large graphs.")
    args = parser.parse_args()

    if args.create_test_data:
        print(f"Creating test data in {args.output_path}")
        create_test_data(args.output_path)
        sys.exit()

    source_code = unpersist(args.source_code)

    output_dir = args.output_path

    nodes, edges, offsets = build_ast_only_graph(
        zip(source_code["package"], source_code["id"],
            source_code["filecontent"]),
        args.bpe_tokenizer,
        create_subword_instances=False,
        connect_subwords=False,
        lang="py",
        track_offsets=True)

    print(f"Writing output to {output_dir}")
    persist(source_code, os.path.join(output_dir, "common_filecontent.bz2"))
    persist(nodes, os.path.join(output_dir, "common_nodes.bz2"))
    persist(edges, os.path.join(output_dir, "common_edges.bz2"))
    persist(offsets, os.path.join(output_dir, "common_offsets.bz2"))

    if args.visualize:
        visualize(nodes, edges, os.path.join(output_dir, "visualization.pdf"))
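The source_code argument is expected to be a DataFrame persisted with pandas.to_pickle in bz2 format, carrying package, id and filecontent columns (see the zip call above). A minimal sketch of preparing such an input, with made-up contents:

import pandas as pd

# One row per source file; the column names match what is zipped into build_ast_only_graph.
source_code = pd.DataFrame({
    "package": ["pkg_a"],
    "id": [0],
    "filecontent": ["def f(x):\n    return x + 1\n"],
})
source_code.to_pickle("source_code.bz2")  # compression inferred from the extension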
def main():
    path = sys.argv[1]

    environments = sorted(
        filter(os.path.isdir,
               (os.path.join(path, dir) for dir in os.listdir(path))),
        key=lambda x: x.lower())
    for env_path in environments:

        edges_path = os.path.join(env_path, "edges_with_ast.bz2")
        if os.path.isfile(edges_path):
            edges = unpersist(edges_path)

            if any(edges["type"] == "prev_rev"):
                print()
Example #11
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("bodies")
    args = parser.parse_args()

    bodies = unpersist(args.bodies)

    depths = []

    for ind, row in bodies.iterrows():
        body = row.body
        body_ast = ast.parse(body.strip())
        de = DepthEstimator()
        de.go(body_ast)
        depths.append(de.depth)

    print(f"Average depth: {sum(depths)/len(depths)}")
    depths = np.array(depths, dtype=np.int32)
    np.savetxt(os.path.join(os.path.dirname(args.bodies), "bodies_depths.txt"),
               depths, "%d")
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("working_directory")
    parser.add_argument("output")
    args = parser.parse_args()

    nodes, edges = load_data(join(args.working_directory, "nodes.bz2"),
                             join(args.working_directory, "edges.bz2"))
    type_annotated = set(
        unpersist(join(args.working_directory,
                       "type_annotations.bz2"))["src"].tolist())
    arguments = set(nodes.query("type == 'arg'")["id"].tolist())
    mentions = set(nodes.query("type == 'mention'")["id"].tolist())

    edges["in_mentions"] = edges["src"].apply(lambda src: src in mentions)

    edges["in_args"] = edges["dst"].apply(lambda dst: dst in arguments)

    edges = edges.query("in_mentions == True and in_args == True")

    mapping = {}
    for src, dst in edges[["src", "dst"]].values:
        if dst in mapping:
            print(f"Duplicate mapping for node {dst}")
        mapping[dst] = src

    with open(args.output, "w") as sink:
        with open(join(args.working_directory,
                       "function_annotations.jsonl")) as fa:
            for line in fa:
                entry = json.loads(line)
                new_repl = [[s, e, int(mapping.get(r, r))]
                            for s, e, r in entry["replacements"]]
                entry["replacements"] = new_repl

                sink.write(f"{json.dumps(entry)}\n")

    print()
def load_local2global(working_directory):
    local2global = unpersist(
        os.path.join(working_directory, "local2global_with_ast.bz2"))
    id_maps = dict(zip(local2global['id'], local2global['global_id']))
    return id_maps
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("bodies")
    parser.add_argument("bpe_path")
    parser.add_argument("--num_layers", default=8, type=int)

    args = parser.parse_args()

    bodies = unpersist(args.bodies)
    bpe = create_tokenizer(type="bpe", bpe_path=args.bpe_path)
    mention_tokenizer = MentionTokenizer(args.bpe_path,
                                         create_subword_instances=True,
                                         connect_subwords=False)

    lengths_tr = {}
    lengths_gnn = {}
    ratio = []

    for body in tqdm(bodies["body"]):
        if not has_valid_syntax(body):
            continue

        n_tokens = compute_transformer_passings(body, bpe)
        n_edges = compute_gnn_passings(body, mention_tokenizer)

        if n_tokens not in lengths_tr:
            lengths_tr[n_tokens] = []
        if n_tokens not in lengths_gnn:
            lengths_gnn[n_tokens] = []

        lengths_tr[n_tokens].append(n_tokens**2 * args.num_layers)
        lengths_gnn[n_tokens].append(n_edges)  # * args.num_layers)
        ratio.append((n_tokens, n_edges))

    for key in lengths_tr:
        data_tr = np.array(lengths_tr[key])
        data_gnn = np.array(lengths_gnn[key])

        lengths_tr[key] = np.mean(data_tr)  #, np.std(data_tr))
        lengths_gnn[key] = np.mean(data_gnn)  #, np.std(data_gnn))

    data_ratios = np.array(ratio)

    plt.plot(data_ratios[:, 0], data_ratios[:, 1], "*")
    plt.xlabel("Number of Tokens")
    plt.ylabel("Number of Edges")
    plt.savefig("tokens_edges.png")
    plt.close()

    plt.hist(data_ratios[:, 1] / data_ratios[:, 0], bins=20)
    plt.xlabel("Number of edges / Number of tokens")
    plt.savefig("ratio.png")
    plt.close()

    ratio = data_ratios[:, 1] / data_ratios[:, 0]
    ratio = (np.mean(ratio), np.std(ratio))

    plt.plot(list(lengths_tr.keys()),
             np.log10(np.array(list(lengths_tr.values()))), "*")
    plt.plot(list(lengths_gnn.keys()),
             np.log10(np.array(list(lengths_gnn.values()))), "*")
    plt.plot(list(lengths_gnn.keys()),
             np.log10(np.array(list(lengths_gnn.values())) * args.num_layers),
             "*")
    plt.legend([
        f"Transformer {args.num_layers} layers", "GNN L layers",
        f"GNN L*{args.num_layers} layers"
    ])
    plt.xlabel("Number of Tokens")
    plt.ylabel("log10(Number of Message Exchanges)")
    plt.savefig("avg_passings.png")
    plt.close()
def main():
    parser = argparse.ArgumentParser(
        description='Prepare edge lists for dglke and node2vec training.')
    parser.add_argument('dataset_path',
                        default=None,
                        help='Path to the dataset')
    parser.add_argument('output_path',
                        default=None,
                        help='Path to the output directory')
    parser.add_argument("--extra_objectives",
                        action="store_true",
                        default=False)
    parser.add_argument("--eval_frac",
                        dest="eval_frac",
                        default=0.05,
                        type=float)

    args = parser.parse_args()

    nodes_path, edges_path, extra_paths = get_paths(
        args.dataset_path, use_extra_objectives=args.extra_objectives)

    nodes, edges = load_data(nodes_path, edges_path)
    nodes, edges, holdout = SourceGraphDataset.holdout(nodes, edges)
    edges = edges.astype({
        "src": 'str',
        "dst": "str",
        "type": 'str'
    })[['src', 'dst', 'type']]
    holdout = holdout.astype({
        "src": 'str',
        "dst": "str",
        "type": 'str'
    })[['src', 'dst', 'type']]

    node2graph_id = compact_property(nodes['id'])
    nodes['global_graph_id'] = nodes['id'].apply(lambda x: node2graph_id[x])

    node_ids = set(nodes['id'].unique())

    if args.extra_objectives:
        for objective_path in extra_paths:
            data = unpersist(objective_path)
            data = filter_relevant(data, node_ids)
            data["type"] = objective_path.split(".")[0]
            edges = pd.concat([edges, data])  # DataFrame.append was removed in pandas 2.x

    if not os.path.isdir(args.output_path):
        os.mkdir(args.output_path)

    edges = edges[['src', 'dst', 'type']]
    eval_sample = edges.sample(frac=args.eval_frac)

    persist(nodes, join(args.output_path, "nodes_dglke.csv"))
    persist(edges,
            join(args.output_path, "edges_train_dglke.tsv"),
            header=False,
            sep="\t")
    persist(edges,
            join(args.output_path, "edges_train_node2vec.tsv"),
            header=False,
            sep=" ")
    persist(eval_sample,
            join(args.output_path, "edges_eval_dglke.tsv"),
            header=False,
            sep="\t")
    persist(eval_sample,
            join(args.output_path, "edges_eval_node2vec.tsv"),
            header=False,
            sep=" ")
    persist(holdout,
            join(args.output_path, "edges_eval_dglke_10000.tsv"),
            header=False,
            sep="\t")
    persist(holdout,
            join(args.output_path, "edges_eval_node2vec_10000.tsv"),
            header=False,
            sep=" ")