def load_aligned_source_code(dataset_directory, tokenizer="codebert"):
    dataset_path = Path(dataset_directory)
    files = unpersist(dataset_path.joinpath("common_filecontent.bz2")).rename(
        {"id": "file_id"}, axis=1)

    # Map (package, file_id) pairs to the raw file content.
    content = dict(
        zip(zip(files["package"], files["file_id"]), files["filecontent"]))
    pd_offsets = unpersist(dataset_path.joinpath("common_offsets.bz2"))

    seen = set()
    source_codes = []
    offsets = []
    for group, data in pd_offsets.groupby(by=["package", "file_id"]):
        source_codes.append(content[group])
        offsets.append(list(zip(data["start"], data["end"], data["node_id"])))
        seen.add(group)

    # Files without any recorded offsets still participate in the alignment,
    # just with an empty span list.
    for key, val in content.items():
        if key not in seen:
            source_codes.append(val)
            offsets.append([])

    return source_code_graph_alignment(source_codes, offsets, tokenizer=tokenizer)
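# Usage sketch (not from the original sources): run the alignment over a
# dataset directory that contains common_filecontent.bz2 and
# common_offsets.bz2. The path below is a hypothetical example.
def example_load_aligned_source_code():
    alignment = load_aligned_source_code("data/py_dataset", tokenizer="codebert")
    return alignment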
def create_from_dataset():
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("dataset_path", type=str, help="Path to the dataset directory")
    parser.add_argument("output_path", type=str, help="Path for the output file")
    parser.add_argument("--format", "-f", dest="format", default="jsonl", help="jsonl|csv")
    parser.add_argument("--remove_default", action="store_true", default=False)
    args = parser.parse_args()

    # process_body consults this module-level flag.
    global remove_default
    remove_default = args.remove_default

    node_maps = get_node_maps(
        unpersist(join(args.dataset_path, "common_nodes.bz2")))
    filecontent = get_filecontent_maps(
        unpersist(join(args.dataset_path, "common_filecontent.bz2")))
    offsets = group_offsets(
        unpersist(join(args.dataset_path, "common_offsets.bz2")))

    data = []
    nlp = create_tokenizer("spacy")
    for f_body, f_offsets in iterate_functions(offsets, node_maps, filecontent):
        data.append(process_body(nlp, f_body, replacements=f_offsets))

    store(data, args)
def load_data(node_path, edge_path, rename_columns=True):
    nodes = unpersist(node_path)
    edges = unpersist(edge_path)

    # Categorical dtype saves memory for the highly repetitive type columns.
    nodes = nodes.astype({'type': 'category'})
    edges = edges.astype({'type': 'category'})

    if rename_columns:
        nodes = nodes.rename(mapper={'serialized_name': 'name'}, axis=1)
        edges = edges.rename(mapper={
            'source_node_id': 'src',
            'target_node_id': 'dst'
        }, axis=1)

    return nodes, edges
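# Usage sketch (hypothetical paths): load a node/edge pair and rely on the
# renamed 'name'/'src'/'dst' columns downstream.
def example_load_data():
    nodes, edges = load_data("dataset/common_nodes.bz2",
                             "dataset/common_edges.bz2")
    print(nodes.columns.tolist())
    print(edges.columns.tolist())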
def read_corpus(path, data_field):
    if path.endswith("bz2") or path.endswith("parquet") or path.endswith("csv"):
        from SourceCodeTools.code.data.file_utils import unpersist
        data = unpersist(path)[data_field].tolist()
    elif path.endswith("jsonl"):
        import json
        import logging
        data = []
        with open(path) as data_source:
            for ind, line in enumerate(data_source):
                if line.strip():
                    d = json.loads(line.strip())
                    # Keep only the requested field, warn when it is missing.
                    if data_field in d:
                        data.append(d[data_field])
                    else:
                        logging.warning(
                            f"No data field '{data_field}' on line {ind}")
    else:
        # Fall back to plain text, one record per non-empty line.
        data = []
        with open(path) as data_source:
            for line in data_source:
                if line.strip():
                    data.append(line.strip())
    return data
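# Usage sketch: read_corpus dispatches on the file extension, so the same
# field name works for tabular, jsonl and plain-text corpora. The paths and
# the "body" field are illustrative assumptions.
def example_read_corpus():
    from_tabular = read_corpus("corpus/bodies.bz2", "body")
    from_jsonl = read_corpus("corpus/bodies.jsonl", "body")
    return from_tabular, from_jsonl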
def load_names(nodes_path):
    if nodes_path is not None:
        nodes = unpersist(nodes_path)
        names = dict(
            zip(nodes['id'].tolist(), nodes['serialized_name'].tolist()))
    else:
        names = None
    return names
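# Usage sketch (hypothetical path): build an id -> name lookup once and
# reuse it when rendering predictions.
def example_load_names():
    names = load_names("dataset/common_nodes.bz2")
    return names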
def load_typed_nodes(path):
    from SourceCodeTools.code.data.file_utils import unpersist
    type_ann = unpersist(path)

    # Drop annotations whose target name looks like a memory address.
    filter_rule = lambda name: "0x" not in name
    type_ann = type_ann[type_ann["dst"].apply(filter_rule)]

    typed_nodes = set(type_ann["src"].tolist())
    return typed_nodes
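# Usage sketch (hypothetical path): collect the ids of nodes that carry a
# usable type annotation, e.g. to restrict an evaluation set.
def example_load_typed_nodes():
    typed_nodes = load_typed_nodes("dataset/type_annotations.bz2")
    return typed_nodes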
def process_package(working_directory, global_names=None):
    """
    Find functions with annotations, extract annotation information,
    strip documentation and type annotations.
    :param working_directory: location of package related files
    :param global_names: optional, mapping from global node ids to names
    :return: list of entries in spacy compatible format
    """
    bodies = unpersist_if_present(
        os.path.join(working_directory, "source_graph_bodies.bz2"))
    if bodies is None:
        return []

    # Offsets store information about spans for nodes referenced in the source code.
    offsets_path = os.path.join(working_directory, "offsets.bz2")
    if os.path.isfile(offsets_path):
        offsets = unpersist(offsets_path)
    else:
        logging.warning(f"No file with offsets: {offsets_path}")
        offsets = None

    def load_local2global(working_directory):
        local2global = unpersist(
            os.path.join(working_directory, "local2global_with_ast.bz2"))
        id_maps = dict(zip(local2global['id'], local2global['global_id']))
        return id_maps

    id_maps = load_local2global(working_directory)
    local_names = load_names(
        os.path.join(working_directory, "nodes_with_ast.bz2"))

    nlp = create_tokenizer("spacy")
    data = []
    for _, row in tqdm(bodies.iterrows(), total=len(bodies), leave=True,
                       desc=os.path.basename(working_directory)):
        body = row['body']
        if offsets is not None:
            graph_node_spans = offsets_for_func(offsets, body, row["id"])
        else:
            graph_node_spans = []
        entry = process_body(nlp, body, replacements=graph_node_spans)
        if entry is not None:
            entry = to_global_ids(entry, id_maps, global_names, local_names)
            data.append(entry)
    return data
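# Usage sketch: process a single package directory. "envs/some_package" is a
# placeholder for a directory holding source_graph_bodies.bz2, offsets.bz2,
# local2global_with_ast.bz2 and nodes_with_ast.bz2.
def example_process_package():
    entries = process_package("envs/some_package", global_names=None)
    return entries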
def estimate_module_sizes(path):
    module_count = Counter()
    for dir_name in os.listdir(path):
        module_path = os.path.join(path, dir_name)
        if not os.path.isdir(module_path):
            continue
        nodes_path = os.path.join(module_path, "nodes_with_ast.bz2")
        if os.path.isfile(nodes_path):
            # Use the number of graph nodes as a proxy for module size.
            module_count[dir_name] = unpersist(nodes_path).shape[0]
    pprint(module_count.most_common())
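# Usage sketch (hypothetical path): print per-package node counts for a
# directory of processed environments.
def example_estimate_module_sizes():
    estimate_module_sizes("envs")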
def build_ast_graph_from_modules():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "source_code", type=str,
        help="Path to DataFrame pickle (written with pandas.to_pickle, use `bz2` format).")
    parser.add_argument("output_path")
    parser.add_argument(
        "--bpe_tokenizer", type=str,
        help="Path to sentencepiece model. When provided, names will be subtokenized.")
    parser.add_argument("--visualize", action="store_true",
                        help="Visualize graph. Do not use on large graphs.")
    parser.add_argument("--create_test_data", action="store_true",
                        help="Create test data in output_path and exit.")
    args = parser.parse_args()

    if args.create_test_data:
        print(f"Creating test data in {args.output_path}")
        create_test_data(args.output_path)
        sys.exit()

    source_code = unpersist(args.source_code)
    output_dir = args.output_path
    nodes, edges, offsets = build_ast_only_graph(
        zip(source_code["package"], source_code["id"],
            source_code["filecontent"]),
        args.bpe_tokenizer, create_subword_instances=False,
        connect_subwords=False, lang="py", track_offsets=True)

    print(f"Writing output to {output_dir}")
    persist(source_code, os.path.join(output_dir, "common_filecontent.bz2"))
    persist(nodes, os.path.join(output_dir, "common_nodes.bz2"))
    persist(edges, os.path.join(output_dir, "common_edges.bz2"))
    persist(offsets, os.path.join(output_dir, "common_offsets.bz2"))

    if args.visualize:
        visualize(nodes, edges, os.path.join(output_dir, "visualization.pdf"))
def main():
    path = sys.argv[1]
    environments = sorted(
        filter(lambda p: os.path.isdir(p),
               (os.path.join(path, dir_name) for dir_name in os.listdir(path))),
        key=lambda x: x.lower())

    for env_path in environments:
        edges_path = os.path.join(env_path, "edges_with_ast.bz2")
        if os.path.isfile(edges_path):
            edges = unpersist(edges_path)
            if any(edges["type"] == "prev_rev"):
                # Report environments that contain `prev_rev` edges.
                print(env_path)
def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("bodies")
    args = parser.parse_args()

    bodies = unpersist(args.bodies)

    # Measure the AST depth of every function body.
    depths = []
    for _, row in bodies.iterrows():
        body_ast = ast.parse(row.body.strip())
        de = DepthEstimator()
        de.go(body_ast)
        depths.append(de.depth)

    print(f"Average depth: {sum(depths) / len(depths)}")

    depths = np.array(depths, dtype=np.int32)
    np.savetxt(os.path.join(os.path.dirname(args.bodies), "bodies_depths.txt"),
               depths, "%d")
def main(): parser = argparse.ArgumentParser() parser.add_argument("working_directory") parser.add_argument("output") args = parser.parse_args() nodes, edges = load_data(join(args.working_directory, "nodes.bz2"), join(args.working_directory, "edges.bz2")) type_annotated = set( unpersist(join(args.working_directory, "type_annotations.bz2"))["src"].tolist()) arguments = set(nodes.query("type == 'arg'")["id"].tolist()) mentions = set(nodes.query("type == 'mention'")["id"].tolist()) edges["in_mentions"] = edges["src"].apply(lambda src: src in mentions) edges["in_args"] = edges["dst"].apply(lambda dst: dst in arguments) edges = edges.query("in_mentions == True and in_args == True") mapping = {} for src, dst in edges[["src", "dst"]].values: if dst in mapping: print() mapping[dst] = src with open(args.output, "w") as sink: with open(join(args.working_directory, "function_annotations.jsonl")) as fa: for line in fa: entry = json.loads(line) new_repl = [[s, e, int(mapping.get(r, r))] for s, e, r in entry["replacements"]] entry["replacements"] = new_repl sink.write(f"{json.dumps(entry)}\n") print()
def load_local2global(working_directory):
    local2global = unpersist(
        os.path.join(working_directory, "local2global_with_ast.bz2"))
    id_maps = dict(zip(local2global['id'], local2global['global_id']))
    return id_maps
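# Usage sketch (hypothetical directory and id): translate local node ids
# into global ids with the returned mapping.
def example_load_local2global():
    id_maps = load_local2global("envs/some_package")
    global_id = id_maps.get(42)  # 42 is an arbitrary local id
    return global_id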
def main(): parser = argparse.ArgumentParser() parser.add_argument("bodies") parser.add_argument("bpe_path") parser.add_argument("--num_layers", default=8, type=int) args = parser.parse_args() bodies = unpersist(args.bodies) bpe = create_tokenizer(type="bpe", bpe_path=args.bpe_path) mention_tokenizer = MentionTokenizer(args.bpe_path, create_subword_instances=True, connect_subwords=False) lengths_tr = {} lengths_gnn = {} ratio = [] for body in tqdm(bodies["body"]): if not has_valid_syntax(body): continue n_tokens = compute_transformer_passings(body, bpe) n_edges = compute_gnn_passings(body, mention_tokenizer) if n_tokens not in lengths_tr: lengths_tr[n_tokens] = [] if n_tokens not in lengths_gnn: lengths_gnn[n_tokens] = [] lengths_tr[n_tokens].append(n_tokens**2 * args.num_layers) lengths_gnn[n_tokens].append(n_edges) # * args.num_layers) ratio.append((n_tokens, n_edges)) for key in lengths_tr: data_tr = np.array(lengths_tr[key]) data_gnn = np.array(lengths_gnn[key]) lengths_tr[key] = np.mean(data_tr) #, np.std(data_tr)) lengths_gnn[key] = np.mean(data_gnn) #, np.std(data_gnn)) data_ratios = np.array(ratio) plt.plot(data_ratios[:, 0], data_ratios[:, 1], "*") plt.xlabel("Number of Tokens") plt.ylabel("Number of Edges") plt.savefig("tokens_edges.png") plt.close() plt.hist(data_ratios[:, 1] / data_ratios[:, 0], bins=20) plt.xlabel("Number of edges / Number of tokens") plt.savefig("ratio.png") plt.close() ratio = data_ratios[:, 1] / data_ratios[:, 0] ratio = (np.mean(ratio), np.std(ratio)) plt.plot(list(lengths_tr.keys()), np.log10(np.array(list(lengths_tr.values()))), "*") plt.plot(list(lengths_gnn.keys()), np.log10(np.array(list(lengths_gnn.values()))), "*") plt.plot(list(lengths_gnn.keys()), np.log10(np.array(list(lengths_gnn.values())) * args.num_layers), "*") plt.legend([ f"Transformer {args.num_layers} layers", "GNN L layers", f"GNN L*{args.num_layers} layers" ]) plt.xlabel("Number of Tokens") plt.ylabel("log10(Number of Message Exchanges)") plt.savefig("avg_passings.png") plt.close()
def main():
    parser = argparse.ArgumentParser(
        description='Prepare edge lists for dgl-ke and node2vec training.')
    parser.add_argument('dataset_path', default=None,
                        help='Path to the dataset')
    parser.add_argument('output_path', default=None,
                        help='Path to the output directory')
    parser.add_argument("--extra_objectives", action="store_true",
                        default=False)
    parser.add_argument("--eval_frac", dest="eval_frac", default=0.05,
                        type=float)
    args = parser.parse_args()

    nodes_path, edges_path, extra_paths = get_paths(
        args.dataset_path, use_extra_objectives=args.extra_objectives)
    nodes, edges = load_data(nodes_path, edges_path)
    nodes, edges, holdout = SourceGraphDataset.holdout(nodes, edges)

    edges = edges.astype({"src": 'str', "dst": 'str',
                          "type": 'str'})[['src', 'dst', 'type']]
    holdout = holdout.astype({"src": 'str', "dst": 'str',
                              "type": 'str'})[['src', 'dst', 'type']]

    node2graph_id = compact_property(nodes['id'])
    nodes['global_graph_id'] = nodes['id'].apply(lambda x: node2graph_id[x])
    node_ids = set(nodes['id'].unique())

    if args.extra_objectives:
        for objective_path in extra_paths:
            data = unpersist(objective_path)
            data = filter_relevant(data, node_ids)
            data["type"] = objective_path.split(".")[0]
            edges = pd.concat([edges, data])

    if not os.path.isdir(args.output_path):
        os.mkdir(args.output_path)

    edges = edges[['src', 'dst', 'type']]
    eval_sample = edges.sample(frac=args.eval_frac)

    persist(nodes, join(args.output_path, "nodes_dglke.csv"))
    persist(edges, join(args.output_path, "edges_train_dglke.tsv"),
            header=False, sep="\t")
    persist(edges, join(args.output_path, "edges_train_node2vec.tsv"),
            header=False, sep=" ")
    persist(eval_sample, join(args.output_path, "edges_eval_dglke.tsv"),
            header=False, sep="\t")
    persist(eval_sample, join(args.output_path, "edges_eval_node2vec.tsv"),
            header=False, sep=" ")
    persist(holdout, join(args.output_path, "edges_eval_dglke_10000.tsv"),
            header=False, sep="\t")
    persist(holdout, join(args.output_path, "edges_eval_node2vec_10000.tsv"),
            header=False, sep=" ")