def create_vocabulary_from_corpus(corpus_path, output_token_path=None):
    all_sub_tokens = []
    node_types = get_used_nodes_type()

    # Extract all subtokens from all nodes of the appropriate type using all graphs in the corpus
    for dirpath, dirs, files in os.walk(corpus_path):
        for filename in files:
            if filename.endswith('proto'):
                fname = os.path.join(dirpath, filename)
                with open(fname, "rb") as f:
                    g = Graph()
                    g.ParseFromString(f.read())
                    for n in g.node:
                        if n.type in node_types:
                            all_sub_tokens += split_identifier_into_parts(n.contents)

    all_sub_tokens = list(set(all_sub_tokens))
    all_sub_tokens.append('<SLOT>')
    all_sub_tokens.append('sos_token')
    all_sub_tokens.sort()

    vocabulary = __create_voc_from_tokens(all_sub_tokens)

    # Save the vocabulary
    if output_token_path is not None:
        with open(output_token_path, "wb") as fp:
            pickle.dump(vocabulary, fp)

    return vocabulary
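# Hedged usage sketch (not part of the original sources): builds and saves a vocabulary
# for a directory of .proto graph files. The corpus directory and pickle file name below
# are illustrative placeholders.
if __name__ == "__main__":
    vocabulary = create_vocabulary_from_corpus("corpus/graphs/",
                                               output_token_path="vocabulary.pkl")
    print("Vocabulary built and saved to vocabulary.pkl")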
def main(path):
    with open(path, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())

    # Count token and identifier-token nodes in the graph
    token_count = len(list(filter(
        lambda n: n.type in (FeatureNode.TOKEN, FeatureNode.IDENTIFIER_TOKEN),
        g.node)))
    # token_count = len(set(n.startLineNumber for n in g.node))  # distinct start line numbers instead
    print("%s contains %d tokens" % (g.sourceFile, token_count))
def open_proto(filename):
    print(filename)
    with open(filename, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())
        get_info(g)
def compute_corpus_stats(corpus_path):
    max_node_len, max_var_len, max_var_usage = 0, 0, 0

    for dirpath, dirs, files in os.walk(corpus_path):
        for filename in files:
            if filename.endswith('proto'):
                fname = os.path.join(dirpath, filename)
                with open(fname, "rb") as f:
                    g = Graph()
                    g.ParseFromString(f.read())

                    var_node_usages = {}
                    identifier_node_ids = []

                    for node in g.node:
                        if node.type not in get_used_nodes_type() \
                                and node.type != FeatureNode.SYMBOL_VAR:
                            continue

                        node_len = len(split_identifier_into_parts(node.contents))
                        if node_len > max_node_len:
                            max_node_len = node_len

                        if node.type == FeatureNode.SYMBOL_VAR:
                            var_node_usages[node.id] = 0
                            if node_len > max_var_len:
                                max_var_len = node_len
                        elif node.type == FeatureNode.IDENTIFIER_TOKEN:
                            identifier_node_ids.append(node.id)

                    # Count, for every variable symbol node, how many identifier tokens it points to
                    for edge in g.edge:
                        if edge.sourceId in var_node_usages and edge.destinationId in identifier_node_ids:
                            var_node_usages[edge.sourceId] += 1

                    if len(var_node_usages.values()) > 0:
                        var_usage = max(var_node_usages.values())
                    else:
                        var_usage = 0

                    if var_usage > max_var_usage:
                        max_var_usage = var_usage

    print("Longest node length: ", max_node_len)
    print("Longest variable length: ", max_var_len)
    print("Largest variable usage: ", max_var_usage)
def runAnalysis(fileLocation):
    with open(fileLocation, "rb") as f:
        if verbose:
            print("Opening " + fileLocation, end='')
        g = Graph()
        g.ParseFromString(f.read())
        logs = detectLogs(g)
        if verbose:
            print(" ------- Number of logs found: " + str(len(logs)))
        return logs
def get_n_tokens(path, n):
    with open(path, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())

    to_print_len = min(len(g.node), n)
    token_list = []
    for node in filter(isToken, g.node):
        token_list.append(node.contents)
        to_print_len -= 1
        if to_print_len <= 0:
            break
    return token_list
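# Minimal sketch of the `isToken` predicate that `get_n_tokens` relies on. Its definition
# lives elsewhere in the repository; it is assumed to select token and identifier-token
# nodes, mirroring the nested helper in the `load_data_file` variant later in this collection.
def isToken(node):
    return node.type in (FeatureNode.TOKEN, FeatureNode.IDENTIFIER_TOKEN)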
def get_types_and_dependencies(file_name, max_time=30):
    with open(file_name, "rb") as f:
        graph = Graph()
        graph.ParseFromString(f.read())

    id_mapping = get_id_to_node_graph(graph)
    source_mapping = get_source_dict_graph(graph)

    # Record the analysis start time and time budget used by the extraction helpers
    global START_TIME, MAX_TIME
    START_TIME = time.time()
    MAX_TIME = max_time

    variable_types = get_type_mapping(graph, id_mapping, source_mapping)
    dependencies = get_all_dependencies(graph, id_mapping, source_mapping, variable_types)
    return variable_types, dependencies
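# Hedged usage sketch (illustrative): runs type and dependency extraction on a single
# proto graph with the default 30-second budget. The path is a placeholder.
if __name__ == "__main__":
    variable_types, dependencies = get_types_and_dependencies("Example.java.proto")
    print("Types:", variable_types)
    print("Dependencies:", dependencies)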
def compute_var_usages(self):
    if self.num_usages is not None:
        return self.num_usages

    with open(self.fname, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())

    n_usages = get_var_usages(g, self.node_id)
    self.num_usages = n_usages
    return n_usages
def compute_var_type(self):
    if self.type != self.empty_type:
        return self.type

    with open(self.fname, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())

    var_type = get_var_type(g, self.node_id, self.empty_type)
    self.type = var_type
    return var_type
def get_file_methods_data(file):
    """
    Extract the source code tokens, identifier names and graph for methods in a source file.
    Identifier tokens are split into subtokens. Constructors are not included in the methods.
    :param file: file
    :return: (methods_source, methods_names, methods_graph) where methods_source[i] is a list of
    the tokens for the source of the ith method in the file, methods_names[i] is a list of tokens
    for the name of the ith method in the file, and methods_graph[i] is the subtree of the file
    parse tree starting from the method node.
    """
    adj_list, nodes, edges = get_file_graph(file)

    with file.open('rb') as f:
        # Take the class name from the file name (e.g. "Foo.java.proto" -> "Foo")
        class_name = file.name.split('.')[0]
        g = Graph()
        g.ParseFromString(f.read())
        methods_source = []
        methods_names = []
        methods_graph = []
        # class_name_node = get_class_name_node(g)

        for node in g.node:
            if node.contents == "METHOD":
                method_name_node = get_method_name_node(g, node)

                # If the method name is the same as the class name, the method is a constructor,
                # so discard it
                if method_name_node.contents == class_name:
                    continue

                method_edges, method_nodes, non_tokens_nodes_features = get_method_edges(
                    node.id, adj_list, nodes)
                methods_graph.append((method_edges, non_tokens_nodes_features))
                methods_names.append(split_identifier_into_parts(method_name_node.contents))

                method_source = []
                for other_node in method_nodes.values():
                    if other_node.id == method_name_node.id:
                        # Replace the method name with '_' in the method source code
                        method_source.append('_')
                    elif other_node.type == FeatureNode.TOKEN or \
                            other_node.type == FeatureNode.IDENTIFIER_TOKEN:
                        method_source.append(other_node.contents)
                methods_source.append(method_source)

    return methods_source, methods_names, methods_graph
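# Hedged usage sketch (illustrative, not from the original sources): `file` is expected to
# expose `.open('rb')` and `.name`, so a `pathlib.Path` works. The path below is a placeholder.
from pathlib import Path

def _example_print_method_names(proto_path="Example.java.proto"):
    sources, names, graphs = get_file_methods_data(Path(proto_path))
    for name_subtokens, source_tokens in zip(names, sources):
        print(' '.join(name_subtokens), '->', len(source_tokens), 'tokens')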
def load_data_file(file_path: str) -> Iterable[List[str]]:
    """
    Load a single data file, returning token streams.

    Args:
        file_path: The path to a data file.

    Returns:
        Iterable of lists of strings, each a list of tokens observed in the data.
    """
    with open(file_path, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())
        return method_split_tokens(g)
def create_samples(self, filepath):
    with open(filepath, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())

    true_labels = []
    max_path_len = 8

    graph_samples = log_graph_processing.get_log_samples(
        g, max_path_len, self.max_node_seq_len, self.pad_token_id, self.vocabulary)

    samples, labels, slot_labels = [], [], []
    try:
        with open(os.path.splitext(filepath)[0].replace("java", "json"), "r") as f_:
            labels, slot_labels = json.load(f_)
        if len(slot_labels) != len(graph_samples) or len(labels) != sum(slot_labels):
            print("Error: labels and samples don't match, num of labels: %d, num of samples: %d, filename: %s"
                  % (len(labels), len(graph_samples), filepath))
            os.system("rm -f " + filepath[:-11] + "*")
    except FileNotFoundError:
        print("Warning: file not found. It's ok if you are not training the model")
        labels = [self.label_kind - 1] * len(graph_samples)
        slot_labels = [1] * len(graph_samples)

    count = 0
    for i in range(len(graph_samples)):
        if slot_labels[i] != 0:
            new_sample = self.create_sample(*(graph_samples[i]), labels[count])
            samples.append(new_sample)
            true_labels.append(labels[count])
            count += 1

    return samples, true_labels
def count_one(path):
    with open(path, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())

    token_count = len(list(filter(
        lambda n: n.type in (FeatureNode.TOKEN, FeatureNode.IDENTIFIER_TOKEN),
        g.node)))
    # max_line = max(list(map(lambda n: n.endLineNumber, g.node)))
    max_line = g.ast_root.endLineNumber
    javadoc_comments = len(list(filter(
        lambda n: n.type == FeatureNode.COMMENT_JAVADOC, g.node)))
    return token_count, max_line, javadoc_comments
def get_nx_graph(file):
    """
    Get networkx graph corresponding to a file.
    """
    nx_graph = nx.DiGraph()
    with file.open('rb') as f:
        g = Graph()
        g.ParseFromString(f.read())
        for edge in g.edge:
            # Map the numeric edge type back to its FeatureEdge field name
            edge_type = [
                name for name, value in list(vars(FeatureEdge).items())[8:]
                if value == edge.type
            ][0]
            nx_graph.add_edge(edge.sourceId, edge.destinationId, edge_type=edge_type)
    return nx_graph
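# Hedged usage sketch (illustrative): once the networkx graph is built, standard networkx
# queries apply, e.g. counting nodes/edges or grouping edges by the `edge_type` attribute
# set above. The path below is a placeholder.
from collections import Counter
from pathlib import Path

def _example_edge_type_histogram(proto_path="Example.java.proto"):
    nx_graph = get_nx_graph(Path(proto_path))
    histogram = Counter(data['edge_type'] for _, _, data in nx_graph.edges(data=True))
    print(nx_graph.number_of_nodes(), "nodes,", nx_graph.number_of_edges(), "edges")
    print(histogram)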
def load_data_file(_, file_path: str) -> Iterable[Tuple[str, bool]]:
    """
    Load a single data file, returning token streams.

    Args:
        file_path: The path to a data file.

    Returns:
        Iterable of (token, is_identifier_token) pairs observed in the data.
    """
    with open(file_path, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())
        v = [(n.contents.lower(), n.type == FeatureNode.IDENTIFIER_TOKEN)
             for n in g.node
             if n.type in [FeatureNode.TOKEN, FeatureNode.IDENTIFIER_TOKEN]]
        return v
def modifyGraphFile(graphFile, rootId):
    g = Graph()
    g.ParseFromString(graphFile.read())
    nodes = g.node
    edges = g.edge

    # get all the relevant log nodes
    allLogNodes, baseNodeIndex, \
        lastLogNodeIndex, lastNodeEndLineNumber, lastNodeEndPosition = retrieveAllLogsNodes(nodes, rootId)

    # get the relevant node ids
    allLogNodesIds = list(map(lambda node: node.id, allLogNodes))

    # STEP 1) Remove any edge that both originates and targets nodes within our log statement
    edges = removeLogEdges(edges, allLogNodesIds)

    # STEP 2) Modify any edge that links to one of our log nodes within the statement.
    # These edges will now point to the root LOG node.
    edges = adjustOutsideEdges(edges, allLogNodesIds)

    # STEP 3) Modify all the log nodes. Modify the root node to be the special LOG node, delete the rest.
    nodes = modifyNodes(nodes, baseNodeIndex, lastLogNodeIndex, lastNodeEndLineNumber, lastNodeEndPosition)

    # create a new Graph to return for writing, using all the modified nodes and edges
    returnGraph = Graph()

    for node in nodes:
        graphNode = graph_pb2.FeatureNode()
        graphNode.id = node.id
        graphNode.type = node.type
        graphNode.contents = removeImportLeaks(node.contents)
        graphNode.startPosition = node.startPosition
        graphNode.endPosition = node.endPosition
        graphNode.startLineNumber = node.startLineNumber
        graphNode.endLineNumber = node.endLineNumber
        # append our node to the new graph
        returnGraph.node.append(graphNode)

    for edge in edges:
        graphEdge = graph_pb2.FeatureEdge()
        graphEdge.sourceId = edge.sourceId
        graphEdge.destinationId = edge.destinationId
        graphEdge.type = edge.type
        # append our edge to the new graph
        returnGraph.edge.append(graphEdge)

    graphFile.close()
    return returnGraph
def create_samples(self, filepath):
    with open(filepath, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())

    max_path_len = 8

    # Select the sample parsing strategy depending on the specified model task
    if self.task_type == 0:
        graph_samples, slot_node_ids = graph_processing.get_usage_samples(
            g, max_path_len, self.max_slots, self.max_node_seq_len,
            self.pad_token_id, self.slot_id, self.vocabulary)
    elif self.task_type == 1:
        graph_samples, slot_node_ids = graph_processing.get_usage_samples(
            g, max_path_len, self.max_slots, self.max_node_seq_len,
            self.pad_token_id, self.slot_id, self.vocabulary, True)
    elif self.task_type == 2:
        graph_samples, slot_node_ids = graph_processing.get_method_body_samples(
            g, self.max_node_seq_len, self.pad_token_id, self.slot_id, self.vocabulary)
    else:
        raise ValueError("Invalid task id...")

    samples, labels = [], []
    for sample in graph_samples:
        new_sample, new_label = self.create_sample(*sample)
        samples.append(new_sample)
        labels.append(new_label)

    # Save sample meta-information
    samples_meta_inf = []
    for slot_node_id in slot_node_ids:
        new_inf = SampleMetaInformation(filepath, slot_node_id)
        samples_meta_inf.append(new_inf)

    return samples, labels, samples_meta_inf
def get_file_graph(file):
    """
    Compute graph for the given file.
    """
    with file.open('rb') as f:
        g = Graph()
        g.ParseFromString(f.read())
        node_ids = [node.id for node in g.node]
        edges = [(e.sourceId, e.destinationId, e.type) for e in g.edge]

        # Build an adjacency list keyed by source node id
        adj_list = {node: [] for node in node_ids}
        for edge in edges:
            adj_list[edge[0]].append({
                'destination': edge[1],
                'edge_type': edge[2]
            })
        nodes = {node.id: node for node in g.node}

    return adj_list, nodes, edges
def main():
    with open(str(graphLocation), "rb") as graphFile:
        g = Graph()
        g.ParseFromString(graphFile.read())
        logIds = []

        with open(graphLocation.name + ".dot", "w") as f:
            # write the first line of the dot file
            f.write("digraph G {\n")

            # for each node, write out the node with its contents as its label
            for node in g.node:
                # the first line writes the node type as the label, the second the node contents.
                # The second is better.
                # f.write(str(node.id) + ' [ label="' + toNodeText(node) + '" ];\n')
                f.write(str(node.id) + ' [ label="' + re.escape(node.contents) + '" ];\n')
                if toNodeText(node) == "LOG":
                    logIds.append(node.id)

            # for each edge, write out the edge as a link between the source and the destination
            for edge in g.edge:
                f.write(str(edge.sourceId) + " -> " + str(edge.destinationId) + "\n")

            # Prettify any special LOG nodes by making them distinct (square with a diamond inside)
            for logId in logIds:
                f.write(str(logId) + " [shape=Msquare];")

            f.write("}\n")
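# Hedged follow-up sketch (illustrative): the .dot file written by `main` can be rendered
# with the standard Graphviz CLI via subprocess. The file name argument is a placeholder
# matching the `graphLocation.name + ".dot"` convention above.
import subprocess

def render_dot(dot_path):
    # Produces a PNG next to the .dot file using Graphviz's `dot` tool.
    subprocess.run(["dot", "-Tpng", dot_path, "-o", dot_path + ".png"], check=True)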
def load_data_file(file_path: str) -> Iterable[List[str]]:
    """
    Load a single data file, returning token streams.

    Args:
        file_path: The path to a data file.

    Returns:
        Iterable of lists of strings, each a list of tokens observed in the data.
    """
    #TODO 2# Insert your data parsing code here

    # Predicate for checking whether a node is a method
    def isMethod(node):
        return node.type == FeatureNode.AST_ELEMENT and node.contents == "METHOD"

    # Predicate that decides whether a node is a token
    def isToken(node):
        return node.type in (FeatureNode.TOKEN, FeatureNode.IDENTIFIER_TOKEN)

    # Retrieve token leaf nodes by DFS
    def get_leaf_nodes(nodeId, sourceDict, nodeDict, visited):
        if nodeId in visited:
            return []
        visited.add(nodeId)
        if nodeId is None or nodeDict.get(nodeId) is None:
            return []
        if nodeDict.get(nodeId).type in [FeatureNode.TOKEN, FeatureNode.IDENTIFIER_TOKEN]:
            return [nodeDict.get(nodeId)]
        edgeTo = sourceDict.get(nodeId)
        if edgeTo is None:
            return []
        to_return = []
        for edge in edgeTo:
            to_return += get_leaf_nodes(edge.destinationId, sourceDict, nodeDict, visited)
        return to_return

    # Reorder leaf nodes from top to bottom
    def reorder_leaves(leaves_arr, sourceDict, nodeDict):
        leaves_map = dict()
        for (index, node) in enumerate(leaves_arr):
            leaves_map[node.id] = index

        length = len(leaves_arr)
        # The first leaf is the one no other leaf points to; find it by subtracting
        # the indices of all leaves that are pointed to from the total index sum.
        index_sum = int(((length - 1) * length) / 2)
        for node in leaves_arr:
            if (node.id in sourceDict) and ((sourceDict[node.id][0]).destinationId in leaves_map):
                index_sum -= leaves_map[(sourceDict[node.id][0]).destinationId]

        current = leaves_arr[index_sum]
        to_return = []
        for _ in range(length):
            to_return.append(current)
            if current.id in sourceDict:
                current = nodeDict[(sourceDict[current.id][0]).destinationId]
            else:
                break
        return to_return

    # Get tokens for the given file
    with open(file_path, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())

        idsInNode = dict()
        sourceIdsInEdge = dict()
        for node in g.node:
            idsInNode[node.id] = node
        for edge in g.edge:
            cur = sourceIdsInEdge.get(edge.sourceId, [])
            cur.append(edge)
            sourceIdsInEdge[edge.sourceId] = cur

        all_results = []
        for node in g.node:
            if isMethod(node):
                initial_leaves = reorder_leaves(
                    get_leaf_nodes(node.id, sourceIdsInEdge, idsInNode, set()),
                    sourceIdsInEdge, idsInNode)
                correct = [str(n.contents).lower() for n in filter(isToken, initial_leaves)]
                all_results.append(correct)

        return all_results
def get_source_dict(path):
    with open(path, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())
    return get_source_dict_graph(g)
def tokenize_methods_for_file(path, full=False):
    with open(path, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())
    return tokenize_methods_for_graph(g, full)
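# Hedged usage sketch (illustrative): tokenizes the methods in a single proto graph file,
# assuming `tokenize_methods_for_graph` yields one token sequence per method. The path
# is a placeholder.
if __name__ == "__main__":
    method_tokens = tokenize_methods_for_file("Example.java.proto", full=True)
    for tokens in method_tokens:
        print(tokens)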
def get_id_to_node(path):
    with open(path, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())
    return get_id_to_node_graph(g)
def load_data_file_methods(file_path: str):
    """
    Load a single data file, returning methods code and JavaDoc comments.
    """
    methods_code = []
    methods_comments = []
    graphs = []

    g = Graph()
    with open(file_path, "rb") as f:
        g.ParseFromString(f.read())

    # Build a dictionary of nodes indexed by id, plus token lookups
    # by start position and end position
    nodes_dict = {}
    tokens_by_start_pos = {}
    tokens_by_end_pos = {}

    # A list of method root nodes
    methods = []

    for n in g.node:
        nodes_dict[n.id] = n
        if n.contents == 'METHOD':
            methods.append(n)
        if n.type in (FeatureNode.TOKEN, FeatureNode.IDENTIFIER_TOKEN):
            tokens_by_start_pos[n.startPosition] = n
            tokens_by_end_pos[n.endPosition] = n

    # Build a dictionary of edges indexed by source id
    edges_dict = {}
    for e in g.edge:
        if e.sourceId in edges_dict:
            edges_dict[e.sourceId].append(e)
        else:
            edges_dict[e.sourceId] = [e]

    for m in methods:
        # Start with the token node that starts at the same position as the method's start position
        nid = tokens_by_start_pos[m.startPosition].id
        tokens = []
        comment = ""

        # Follow the 'next token' edges up to the token finishing at the end position
        while nid != tokens_by_end_pos[m.endPosition].id:
            tokens.append(nodes_dict[nid].contents.lower())
            if nid in edges_dict:
                for e in edges_dict[nid]:
                    if e.type == FeatureEdge.NEXT_TOKEN:
                        nid = e.destinationId

        # Find a Javadoc comment node attached to this method, if any
        for n in g.node:
            if n.type == FeatureNode.COMMENT_JAVADOC and m.id == edges_dict[n.id][0].destinationId:
                comment = format_comment_to_plain_text(n.contents)

        # Only add non-empty methods that have comments. Also ensure the method is not
        # virtual and has a body starting with '{'.
        if len(tokens) > 0 and len(comment) > 0 and 'lbrace' in tokens and len(tokens) < 200:
            methods_code.append(tokens)
            methods_comments.append(comment)

            methods_edges, nodes_features = get_method_graph(m, nodes_dict, edges_dict)
            graph = {
                'Target': word_tokenize(comment),
                'Source_len': len(tokens),
                'graph': {
                    'node_features': nodes_features,
                    'adjacency_lists': methods_edges
                }
            }
            if len(nodes_features) < 300:
                graphs.append(graph)
            # print(graph)

    return methods_code, methods_comments, graphs
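# Hedged usage sketch (illustrative): summarizes the method/comment pairs extracted from
# one file. The path is a placeholder, and `comment` is assumed to be a plain string as
# produced by `format_comment_to_plain_text`.
def _example_summarize_methods(proto_path="Example.java.proto"):
    methods_code, methods_comments, graphs = load_data_file_methods(proto_path)
    print("%d commented methods, %d graph samples" % (len(methods_code), len(graphs)))
    for tokens, comment in zip(methods_code, methods_comments):
        print(len(tokens), "tokens:", comment[:60])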
def obfuscate_graph(g, precomputed_name_files):
    id_mapping = get_id_to_node_graph(g)
    source_mapping = get_source_dict_graph(g)
    start_node = g.ast_root

    initialPath = []
    initialPath.append(start_node)

    to_obfuscate = get_obfuscation_names(start_node.id, id_mapping, source_mapping, set(), initialPath)
    new_names = get_new_names(len(to_obfuscate), precomputed_name_files)
    new_names_mapping = create_names_mapping(to_obfuscate, new_names)
    substitute_all(g.node, new_names_mapping)
    return g


if __name__ == "__main__":
    filePath = sys.argv[1]
    precomputed_name_files = "precomputed_names.txt"

    with open(filePath, "rb") as f:
        untouched = Graph()
        untouched.ParseFromString(f.read())

    obfuscated_graph = obfuscate_path(filePath, precomputed_name_files)

    before = tokenize_methods_for_graph(untouched)
    after = tokenize_methods_for_graph(obfuscated_graph)

    print("BEFORE:")
    print(before)
    print("AFTER:")
    print(after)
def obfuscate_path(path, precomputed_name_files):
    with open(path, "rb") as f:
        g = Graph()
        g.ParseFromString(f.read())
    return obfuscate_graph(g, precomputed_name_files)
def convertGraph(graphLoc, severity, msgToken):
    # The JSON structure to return
    returnJSON = {
        "backbone_sequence": [],
        "node_labels": [],
        "edges": {},
        "method_name": [],
        "log_node": -1
    }

    with open(graphLoc, "rb") as graphFile:
        g = Graph()
        g.ParseFromString(graphFile.read())
        nodes = g.node
        edges = g.edge

        # make a map of all the backbone nodes by looping over the edges and getting any node
        # that has a NEXT_TOKEN edge pointing to/from it
        backboneNodes = {}
        for edge in edges:
            if edge.type == 2:
                # overwrite, but that's ok!
                backboneNodes[edge.destinationId] = True
                backboneNodes[edge.sourceId] = True

        # Map of nodeId -> index. Used in step 3
        IdIndexDict = {}
        # Used to record the index of our special LOG node so it can be put into the JSON
        specialLogNodeIndex = -1

        for index, node in enumerate(nodes):
            # found the special node
            if node.type == 17:
                specialLogNodeIndex = index

            # add the node's index to the id map
            IdIndexDict[node.id] = index

            # STEP 1) Create backbone_sequence by first checking if the node is a backbone node
            # and then appending its index to the array
            if node.id in backboneNodes:
                returnJSON["backbone_sequence"].append(index)

            # STEP 2) Create node_labels by appending the node contents to the array
            returnJSON["node_labels"].append(node.contents)

        # STEP 3) Create edges by parsing the edge data into the correct format
        returnDict = {}
        for edge in edges:
            type = EdgeType(edge.type).name
            if type not in returnDict:
                returnDict[type] = []
            sourceIndex = IdIndexDict[edge.sourceId]
            destinationIndex = IdIndexDict[edge.destinationId]
            returnDict[type].append([sourceIndex, destinationIndex])
        returnJSON["edges"] = returnDict

        # STEP 4) If we are trying to predict the logging statement, add the tokenized msg to the
        # prediction variable (method_name). Also put the logging level inside as well.
        # If we are trying to predict the severity, add the severity to the prediction variable only.
        if args.statement_generation:
            returnJSON["method_name"] = msgToken
            # TODO check if severity should be here
            # returnJSON["severity"] = severity
        else:
            returnJSON["method_name"].append(severity)

        # STEP 5) Add the index of the log node to the JSON
        returnJSON["log_node"] = specialLogNodeIndex

    return json.dumps(returnJSON)
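# Hedged usage sketch (illustrative): converts every proto graph under a corpus directory
# and writes one JSON object per line. The directory, output name, severity, and message
# arguments are placeholders, and `args.statement_generation` is assumed to be parsed
# elsewhere, as in the function above.
def convert_corpus(corpus_path="corpus/graphs/", out_path="graphs.jsonl"):
    with open(out_path, "w") as out:
        for dirpath, dirs, files in os.walk(corpus_path):
            for filename in files:
                if filename.endswith("proto"):
                    graph_loc = os.path.join(dirpath, filename)
                    out.write(convertGraph(graph_loc, severity="INFO", msgToken=[]) + "\n")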
def parse_file(self, file_name, only_javadoc=True, lowercase_api=True, should_subtokenize=False):
    """
    Extracts features from a single protobuf file.
    """
    names = []
    apis = []
    javadocs = []
    tokens = []
    method_bodies = []

    with open(file_name, 'rb') as proto_file:
        g = Graph()

        # Parse protobuf file as a graph. Skips all files which error.
        try:
            g.ParseFromString(proto_file.read())
        except:
            print('Error parsing: {0}'.format(file_name))
            return tokens, apis, names, javadocs, method_bodies

        code_graph = CodeGraph(g)

        # We either extract features from all methods or only from those which have
        # associated Javadoc comments.
        method_dict = code_graph.methods if only_javadoc else code_graph.all_methods

        for method in method_dict.values():
            # Omit methods which are below the defined line threshold
            if method.num_lines <= self.line_threshold:
                continue

            # Parse method name tokens
            method_name_tokens = self.text_filter.apply_to_method_name(method.method_name)

            # Parse API invocations
            method_invocations = self._get_method_invocations(method.method_block, code_graph)
            api_call_tokens = []
            for invocation in method_invocations:
                parsed_invocation = self._parse_method_invocation(invocation, code_graph).strip()
                api_call_tokens.append(parsed_invocation)

            obj_init_tokens = self._get_object_inits(method.method_block, code_graph)
            api_call_tokens += obj_init_tokens
            api_call_tokens = self.text_filter.apply_to_api_calls(
                api_call_tokens, lowercase_api, should_subtokenize=should_subtokenize)

            # Parse Javadoc comments. We check that the method has an associated Javadoc
            # comment, as there may be no javadoc on methods which are used during testing.
            javadoc_tokens = []
            if method.javadoc:
                javadoc_tokens = self.text_filter.apply_to_javadoc(method.javadoc.contents)

            # Parse method tokens
            method_tokens = self._get_method_tokens(method.method_block, code_graph)
            method_tokens = self.text_filter.apply_to_token_lst(method_tokens)

            # Extract the entire method body. This field is used during searching.
            method_str = self._method_to_str(method.method_block, code_graph)

            # During testing, we only omit methods for which there is no proper method body
            if not only_javadoc and len(method_str.strip()) > 0 and len(method_name_tokens) > 0:
                # Tokens in the output files are separated by spaces
                names.append(' '.join(method_name_tokens))
                apis.append(' '.join(api_call_tokens))
                tokens.append(' '.join(method_tokens))
                javadocs.append(' '.join(javadoc_tokens))
                method_bodies.append(method_str)

            # During training, we only keep methods which have both a name and a Javadoc description
            if only_javadoc and len(javadoc_tokens) > 0 and len(method_name_tokens) > 0:
                # Tokens in the output files are separated by spaces
                names.append(' '.join(method_name_tokens))
                apis.append(' '.join(api_call_tokens))
                tokens.append(' '.join(method_tokens))
                javadocs.append(' '.join(javadoc_tokens))
                method_bodies.append(method_str)

    return tokens, apis, names, javadocs, method_bodies
def compute_names_and_types(nodes, id_mapping, source_mapping):
    mapping = dict()
    for node in nodes:
        name = get_variable_name(node, id_mapping, source_mapping)
        Type = get_variable_type(node, id_mapping, source_mapping)
        if '' in (name, Type):
            continue
        mapping[name] = Type
    return mapping


def get_type_mapping(g, id_mapping=None, source_mapping=None):
    if id_mapping is None:
        id_mapping = get_id_to_node_graph(g)
    if source_mapping is None:
        source_mapping = get_source_dict_graph(g)

    root = g.ast_root
    all_members = get_variables(root, id_mapping, source_mapping)
    all_members.extend(get_classes(root, id_mapping, source_mapping))
    all_members.extend(get_methods(root, id_mapping, source_mapping))
    return compute_names_and_types(all_members, id_mapping, source_mapping)


if __name__ == "__main__":
    filePath = sys.argv[1]
    with open(filePath, "rb") as f:
        graph = Graph()
        graph.ParseFromString(f.read())
    type_mapping = get_type_mapping(graph)