def cluster_diffs(concepts, data, graph_location, file_length_map, occurrence_matrix, file_index_map, times,
                  edges_kept=None, use_file_dist=True, use_call_distance=True, use_data=True,
                  use_namespace=True, use_change_coupling=True):
    """
    :param concepts: The number of concepts we wish to segment
    :param data: The initial diff-regions segmentation, each region in its own group
    :param graph_location: The location of the dot file representing the deltaPDG of the file
    :param file_length_map: A map between filename and file line count
    :param occurrence_matrix: The matrix mapping commits to files and vice versa
    :param file_index_map: The map between filenames and occurrence_matrix indices
    :param times: The number of repetitions used to average the reported clustering time
    :param edges_kept: Optional collection of edge types to keep in the deltaPDG; all other edges are removed
    :param use_file_dist: Toggle for the file-distance voter
    :param use_call_distance: Toggle for the call-graph-distance voter
    :param use_data: Toggle for the data-dependency voter
    :param use_namespace: Toggle for the namespace-distance voter
    :param use_change_coupling: Toggle for the change-coupling voter
    :return: The proposed clustering of diff-regions and the average time per run
    """
    deltaPDG = obj_dict_to_networkx(read_graph_from_dot(graph_location))
    if edges_kept is not None:
        deltaPDG = remove_all_except(deltaPDG, edges_kept)
    context = get_context_from_nxgraph(deltaPDG)

    voters = [
        file_distance(file_length_map) if use_file_dist else None,
        call_graph_distance(deltaPDG, context) if use_call_distance else None,
        data_dependency(deltaPDG) if use_data else None,
        namespace_distance(deltaPDG, context) if use_namespace else None,
        change_coupling(occurrence_matrix, file_index_map) if use_change_coupling else None,
    ]
    voters = [v for v in voters if v is not None]

    n = len(data)
    t0 = time.process_time()
    for i in range(times):
        affinity, args = generate_empty_affinity(n, voters)
        with ThreadPool(processes=min(os.cpu_count() - 1, 6)) as wp:
            for k, value in wp.imap_unordered(
                    lambda a: (a[1], a[0](data[a[-1][0]], data[a[-1][1]])), args):
                affinity[k] += value
        labels = cluster_from_voter_affinity(affinity, concepts)
    t1 = time.process_time()
    time_ = (t1 - t0) / times

    return labels, time_
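# A minimal, self-contained sketch of the voter-aggregation pattern used by
# cluster_diffs above. The voters (`length_voter`, `overlap_voter`) and the toy
# diff-regions are assumptions for illustration only, and the condensed affinity
# returned by generate_empty_affinity is approximated here by a dense n x n array.
import itertools
import numpy as np
from multiprocessing.pool import ThreadPool


def length_voter(a, b):
    # Hypothetical voter: similarly sized diff-regions score closer to 1.
    return 1.0 - abs(len(a) - len(b)) / max(len(a), len(b))


def overlap_voter(a, b):
    # Hypothetical voter: Jaccard overlap of the files touched by each region.
    return len(set(a) & set(b)) / len(set(a) | set(b))


data = [['x.cs', 'y.cs'], ['x.cs'], ['z.cs']]   # toy diff-regions
voters = [length_voter, overlap_voter]
n = len(data)
affinity = np.zeros((n, n))
# One work item per (voter, pair-of-regions), mirroring the args consumed above.
args = [(v, (i, j)) for v in voters
        for i, j in itertools.combinations(range(n), 2)]
with ThreadPool(processes=2) as wp:
    for (i, j), value in wp.imap_unordered(
            lambda w: (w[1], w[0](data[w[1][0]], data[w[1][1]])), args):
        affinity[i, j] += value
        affinity[j, i] += value
print(affinity)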
def worker(work):
    for data_point_name in tqdm(work, leave=False):
        concepts = int(os.path.basename(os.path.dirname(data_point_name)))
        data_point_name = os.path.basename(
            os.path.dirname(os.path.dirname(data_point_name)))
        try:
            file_lens = file_len_map[data_point_name]
            graph_location = os.path.join('.', 'data', 'corpora_clean', repository_name,
                                          data_point_name, str(concepts), 'merged.dot')
            deltaPDG = obj_dict_to_networkx(read_graph_from_dot(graph_location))
            context = get_context_from_nxgraph(deltaPDG)
            try:
                _, truth = list(zip(*[(n, d['community'])
                                      for n, d in deltaPDG.nodes(data=True)
                                      if 'color' in d.keys()
                                      and d['color'] != 'orange'
                                      and 'community' in d.keys()]))
            except ValueError:
                return

            labels, time_ = graph_cluster_diffs(deltaPDG, context, concepts, file_lens,
                                                occurrence_matrix, file_index_map, times_)

            truth = np.asarray(truth).astype(int)
            labels = np.asarray(labels).astype(int)
            acc, overlap = evaluate(labels, truth, q=max(concepts, np.max(labels) + 1))
            with open(os.path.join('out', repository_name, out_name + '.csv'), 'a') as f:
                f.write(data_point_name + ',' + str(concepts) + ',' + str(acc) + ','
                        + str(overlap) + ',' + str(time_) + '\n')
        except FileNotFoundError:
            pass
        except KeyError:
            with open(os.path.join('out', repository_name, out_name + '.csv'), 'a') as f:
                f.write(data_point_name + ',' + str(concepts) + ',' + str(float('nan')) + ','
                        + str(float('nan')) + ',' + str(0.0) + '\n')
def __call__(self, filename):
    from sys import platform
    if platform == "linux" or platform == "linux2":
        # linux
        generate_a_pdg = subprocess.Popen([self.location, '.', '.' + filename.replace('/', '\\')],
                                          bufsize=1, cwd=self.repository_location)
        generate_a_pdg.wait()
    elif platform == "win32":
        # Windows...
        generate_a_pdg = subprocess.Popen([self.location, '.', '.' + filename.replace('/', '\\')],
                                          bufsize=1, cwd=self.repository_location)
        generate_a_pdg.wait()

    try:
        shutil.move(os.path.join(self.repository_location, 'pdg.dot'),
                    os.path.join(self.target_location, self.target_filename))
    except FileNotFoundError:
        with open(os.path.join(self.target_location, self.target_filename), 'w') as f:
            f.write('digraph "extractedGraph"{\n}\n')

    try:
        # shutil.move(os.path.join(self.repository_location, 'nameflows.json'),
        #             os.path.join(self.target_location,
        #                          'nameflows_' + self.target_filename.split('.')[0] + '.json'))
        with open(os.path.join(self.repository_location, 'nameflows.json'),
                  encoding='utf-8-sig') as json_data:
            nameflow_data = json.loads(json_data.read())

        # Normalise the nameflow json
        if nameflow_data is not None:
            for node in nameflow_data['nodes']:
                file, line = node['Location'].split(' : ')
                node['Location'] = (file[len(self.repository_location):]
                                    if self.repository_location in file else file,
                                    line)
                node['Infile'] = (os.path.normcase(os.path.normpath(filename))
                                  == os.path.normcase(os.path.normpath(file[1:])))
            nameflow_data['relations'] = [[] if v is None else v
                                          for v in nameflow_data['relations']]

            # And add nameflow edges
            apdg = obj_dict_to_networkx(read_graph_from_dot(
                os.path.join(self.target_location, self.target_filename)))
            apdg = add_nameflow_edges(nameflow_data, apdg)
            nx.drawing.nx_pydot.write_dot(apdg,
                                          os.path.join(self.target_location, self.target_filename))
    except FileNotFoundError:
        # No file, nothing to add
        pass
def worker(work):
    for graph_location in tqdm(work, leave=False):
        chain = os.path.basename(os.path.dirname(os.path.dirname(graph_location)))
        q = int(os.path.basename(os.path.dirname(graph_location)))
        graph = obj_dict_to_networkx(read_graph_from_dot(graph_location))

        t0 = time.process_time()
        for i in range(times):
            DU_chains = extract_DU_chains_from_delta(graph)
            closure = closure_of_DU_on_diff(DU_chains)
        t1 = time.process_time()
        time_ = (t1 - t0) / times

        truth = list()
        label = list()
        for node, data in graph.nodes(data=True):
            if 'color' in data.keys():
                if 'community' in data.keys():
                    truth.append(int(data['community']))
                else:
                    truth.append(0)
                try:
                    label.append(int(closure.nodes[node]['prediction']))
                except KeyError:
                    label.append(-1)

        nx.drawing.nx_pydot.write_dot(closure, graph_location[:-4] + '_closure.dot')

        truth = np.asarray(truth)
        label = np.asarray(label)
        acc, overlap = evaluate(truth[label > -1], label[label > -1],
                                q=max(q, np.max(label) + 1) if len(label) > 0 else q)
        cover = len(label[label > -1]) / len(label) if len(label) > 0 else .0
        with open('./out/%s/du_results_raw.csv' % repository_name, 'a') as f:
            f.write(chain + ',' + str(q) + ',' + str(acc) + ',' + str(overlap) + ','
                    + str(cover) + ',' + str(time_) + '\n')
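# A tiny, self-contained illustration (toy values, not from the corpus) of the
# masking used above: nodes whose prediction is -1 (not reached by any def-use
# chain) are dropped from the accuracy computation, and coverage is the fraction
# of changed nodes that did receive a prediction.
import numpy as np

truth = np.asarray([0, 0, 1, 1, 1])
label = np.asarray([0, -1, 1, 1, -1])
mask = label > -1
print(truth[mask], label[mask])       # [0 1 1] [0 1 1] -> what evaluate() compares
print(len(label[mask]) / len(label))  # 0.6 -> the 'cover' column written to the CSV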
def worker(all_graph_locations, corpus_name):
    for graph_location in all_graph_locations:
        data_point_name = os.path.basename(
            os.path.dirname(os.path.dirname(graph_location)))
        if os.path.exists(os.path.join('.', 'data', 'corpora_clean',
                                       corpus_name, data_point_name)):
            print('[Scan and clean] Skipping %s as it exists' % data_point_name)
            continue  # skip this data-point only; returning would abort the remaining work
        print('[Scan and clean] Cleaning data-point %s' % data_point_name)
        try:
            graph = obj_dict_to_networkx(read_graph_from_dot(graph_location))
        except (TypeError, ValueError):
            continue

        # Get actual number of communities
        communities = set()
        for node, data in list(graph.nodes(data=True)):
            if 'community' in data.keys():
                communities.add(data['community'])
            if 'color' in data.keys() and 'community' not in data.keys():
                communities.add('0')
                graph.nodes[node]['community'] = '0'
        communities = sorted(list(communities))
        nr_concepts = str(len(communities))

        if len(communities) > 0:
            # Normalise labels
            for node, data in list(graph.nodes(data=True)):
                if 'community' in data.keys():
                    graph.nodes[node]['community'] = communities.index(data['community'])

            output_path = os.path.join('.', 'data', 'corpora_clean', corpus_name,
                                       data_point_name, nr_concepts, 'merged.dot')
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            nx.drawing.nx_pydot.write_dot(graph, output_path)
def merge_deltas_for_a_commit(graph_locations):
    # We will use the file attribute to track original files so that diff intersection can be made to work
    original_file = os.path.basename(graph_locations[0])
    # We will take the first graph as a base and add the rest onto it
    graph = obj_dict_to_networkx(read_graph_from_dot(graph_locations[0]))
    contexts = get_context_from_nxgraph(graph)
    output = graph.copy()
    graph_locations = graph_locations[1:]

    for i, graph_location in enumerate(graph_locations):
        next_graph = obj_dict_to_networkx(read_graph_from_dot(graph_location))
        next_contexts = get_context_from_nxgraph(next_graph)

        # First find the contexts that exist in both
        mappable_contexts = list()
        for next_context, current_context in itertools.product(
                set(next_contexts.values()), set(contexts.values())):
            if next_context == current_context and next_context != 'lambda expression':
                mappable_contexts.append(current_context)
                break

        copied_nodes = list()
        mapped_nodes = list()
        # And copy over all of the nodes into the merged representation
        for context in mappable_contexts:
            current_entry, current_exit = find_entry_and_exit(context, graph)
            other_entry, other_exit = find_entry_and_exit(context, next_graph)

            if current_entry is not None and other_entry is not None:
                mapped_nodes.append((str(current_entry), str(other_entry)))
            if current_exit is not None and other_exit is not None:
                mapped_nodes.append((str(current_exit), str(other_exit)))

            other_nodes = [n for n in next_graph.nodes(data=True)
                           if n[0] not in [other_entry, other_exit]
                           and 'cluster' in n[1].keys() and n[1]['cluster'] == context]
            if current_entry is None and other_entry is not None:
                other_nodes.append((other_entry, next_graph.nodes[other_entry]))
            if current_exit is None and other_exit is not None:
                other_nodes.append((other_exit, next_graph.nodes[other_exit]))

            if len(other_nodes) > 0:
                if current_entry is not None and 'file' not in graph.nodes[current_entry].keys():
                    graph.nodes[current_entry]['file'] = os.path.basename(
                        graph_location[:-len('.dot')])
                if current_exit is not None and 'file' not in graph.nodes[current_exit]:
                    graph.nodes[current_exit]['file'] = os.path.basename(
                        graph_location[:-len('.dot')])

            for copy_node, data in other_nodes:
                data['file'] = os.path.basename(graph_location[:-len('.dot')])
                output.add_node('m%d_' % i + copy_node[1:], **data)
                copied_nodes.append(('m%d_' % i + copy_node[1:], copy_node))

        # Now we copy over all of the contexts that did not map/exist in the merged representation
        for other_context in [c for c in set(next_contexts.values())
                              if c not in mappable_contexts]:
            other_entry, other_exit = find_entry_and_exit(other_context, next_graph)
            other_nodes = [n for n in next_graph.nodes(data=True)
                           if n[0] not in [other_entry, other_exit]
                           and 'cluster' in n[1].keys() and n[1]['cluster'] == other_context]
            # For aesthetic reasons make sure to copy entry first and exit last
            if other_entry is not None:
                other_nodes = [(other_entry, next_graph.nodes[other_entry])] + other_nodes
            if other_exit is not None:
                other_nodes.append((other_exit, next_graph.nodes[other_exit]))

            for copy_node, data in other_nodes:
                data['file'] = os.path.basename(graph_location[:-len('.dot')])
                output.add_node('m%d_' % i + copy_node[1:], **data)
                copied_nodes.append(('m%d_' % i + copy_node[1:], copy_node))

        # Finally we copy over all of the nodes w/o a context
        for copy_node, data in [n for n in next_graph.nodes(data=True)
                                if n[0] not in next_contexts.keys()]:
            data['file'] = os.path.basename(graph_location[:-len('.dot')])
            output.add_node('m%d_' % i + copy_node[1:], **data)
            copied_nodes.append(('m%d_' % i + copy_node[1:], copy_node))

        # We move over the edges making sure we properly map the ends
        reverse_map = {v: u for u, v in copied_nodes + mapped_nodes}
        for copied_node, original_node in copied_nodes:
            for s, t, k in next_graph.edges(nbunch=[original_node], keys=True):
                try:
                    if s in reverse_map.keys() and t in reverse_map.keys():
                        if output.has_node(reverse_map[s]) and output.has_node(reverse_map[t]):
                            output.add_edge(reverse_map[s], reverse_map[t], key=k,
                                            **next_graph[s][t][k])
                except KeyError:
                    pass

    # And finally we mark the original file nodes (on the merged graph, which is what we return)
    for node, _ in [n for n in output.nodes(data=True) if 'file' not in n[1].keys()]:
        output.nodes[node]['file'] = original_file

    return output
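# A minimal sketch (toy graphs, made-up node ids) of the copy-and-remap step in
# merge_deltas_for_a_commit: nodes of the next delta-PDG are copied under fresh
# 'm<i>_' ids, and their edges are re-added through a reverse map from original
# ids to the ids they received in the merged graph.
import networkx as nx

output = nx.MultiDiGraph()
output.add_node('n1', label='base graph node')

next_graph = nx.MultiDiGraph()
next_graph.add_node('n1', label='entry of next graph')
next_graph.add_node('n2', label='statement in next graph')
next_graph.add_edge('n1', 'n2', key=0, style='solid')

copied_nodes = []
for node, data in next_graph.nodes(data=True):
    new_id = 'm0_' + node[1:]               # same renaming scheme as above
    output.add_node(new_id, **data)
    copied_nodes.append((new_id, node))

reverse_map = {original: new for new, original in copied_nodes}
for new_id, original in copied_nodes:
    for s, t, k in next_graph.edges(nbunch=[original], keys=True):
        if s in reverse_map and t in reverse_map:
            output.add_edge(reverse_map[s], reverse_map[t], key=k,
                            **next_graph[s][t][k])

print(list(output.nodes(data=True)))
print(list(output.edges(keys=True)))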
        output_target = name_alias_for_edges[target] \
            if target in name_alias_for_edges.keys() else target
        try:
            if output.has_node(output_source) and output.has_node(output_target):
                output.add_edge(output_source, output_target, key=key,
                                **graph[source][target][key])
        except KeyError:
            # This should never happen!
            pass

    # Add change and community data back in
    for n in output.nodes:
        if 'color' in graph.nodes[n].keys():
            output.nodes[n]['color'] = graph.nodes[n]['color']
        if 'community' in graph.nodes[n].keys():
            output.nodes[n]['community'] = graph.nodes[n]['community']

    return output


if __name__ == '__main__':
    from deltaPDG.Util.pygraph_util import read_graph_from_dot, obj_dict_to_networkx

    graph = obj_dict_to_networkx(read_graph_from_dot('./out/gui.cs/Core.cs.dot'))
    compressed = compress_delta(graph)
    nx.drawing.nx_pydot.write_dot(compressed, './out/gui.cs/compressed_Core.cs.dot')
def worker(work):
    for graph_location in tqdm(work, leave=False):
        chain = os.path.basename(os.path.dirname(os.path.dirname(graph_location)))
        q = int(os.path.basename(os.path.dirname(graph_location)))
        graph = obj_dict_to_networkx(read_graph_from_dot(graph_location))
        graph = remove_all_except(graph, edges_kept)
        if len(graph.nodes) == 0:
            continue

        t0 = time.perf_counter()
        for i in range(times):
            seeds, list_of_graphs = deltaPDG_to_list_of_Graphs(graph, khop_k=k_hop)
            wl_subtree = GraphKernel(kernel=[{"name": "weisfeiler_lehman", "n_iter": 10},
                                             {"name": "subtree_wl"}],
                                     normalize=True)
            if len(list_of_graphs) > 0:
                similarities = defaultdict(lambda: (0, 0.0))
                for g1, g2 in itertools.combinations(list_of_graphs, 2):
                    # The graph has to be converted to {Graph, Node_Labels, Edge_Labels}
                    wl_subtree.fit([graph_to_grakel(g1, with_data, with_call, with_name)])
                    similarity = wl_subtree.transform(
                        [graph_to_grakel(g2, with_data, with_call, with_name)])[0][0]
                    similarities[(list_of_graphs.index(g1),
                                  list_of_graphs.index(g2))] = similarity

                n = len(list_of_graphs)
                affinity = np.zeros(shape=(scipy.special.comb(n, 2, exact=True),))
                args = list(enumerate(itertools.combinations(range(n), 2)))
                with ThreadPool(processes=min(os.cpu_count() - 1, 1)) as wp:
                    for index, value in wp.imap_unordered(
                            lambda a: (a[0], similarities[(a[-1][0], a[-1][1])]), args):
                        affinity[index] += (1 - value)  # affinity is distance! so (1 - sim)

                cluster = AgglomerativeClustering(n_clusters=None, distance_threshold=0.5,
                                                  affinity='precomputed', linkage='complete')
                if len(affinity) < 2:
                    if len(affinity) == 1:
                        labels = np.asarray([0, 0]) if affinity[0] <= 0.5 else np.asarray([0, 1])
                    else:
                        labels = np.asarray([0])
                else:
                    labels = cluster.fit_predict(scipy.spatial.distance.squareform(affinity))
            else:
                labels = None
        t1 = time.perf_counter()
        time_ = (t1 - t0) / times

        truth = list()
        label = list()
        for node, data in graph.nodes(data=True):
            if 'color' in data.keys():
                if 'community' in data.keys():
                    truth.append(int(data['community']))
                    i = seeds.index(node) if node in seeds else -1
                    if labels is not None and i != -1:
                        data['label'] = '%d: ' % labels[i] + data['label']
                        label.append(labels[i])
                        graph.add_node(node, **data)
                    else:
                        data['label'] = '-1: ' + data['label']
                        label.append(-1)
                        graph.add_node(node, **data)

        nx.drawing.nx_pydot.write_dot(graph, graph_location[:-4] + '_output_wl_%d.dot' % k_hop)

        truth = np.asarray(truth)
        label = np.asarray(label)
        acc, overlap = evaluate(truth[label > -1], label[label > -1],
                                q=1 if len(label) == 0 else np.max(label) + 1)
        with open('./out/%s/wl_%s_%d_results_%s.csv'
                  % (repository_name, edges_kept, k_hop, suffix), 'a') as f:
            f.write(chain + ',' + str(q) + ',' + str(acc) + ',' + str(overlap) + ','
                    + str(time_) + '\n')
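# A small, self-contained sketch (toy distances, not kernel values) of the
# condensed pairwise layout used above: a vector of length comb(n, 2), indexed
# by enumerate(itertools.combinations(range(n), 2)), expands to the symmetric
# n x n matrix that AgglomerativeClustering consumes via
# scipy.spatial.distance.squareform.
import itertools
import numpy as np
import scipy.spatial.distance
import scipy.special

n = 4
affinity = np.zeros(shape=(scipy.special.comb(n, 2, exact=True),))
for index, (i, j) in enumerate(itertools.combinations(range(n), 2)):
    affinity[index] = abs(i - j) / n        # placeholder distance for illustration
square = scipy.spatial.distance.squareform(affinity)
print(square.shape)                # (4, 4), symmetric with a zero diagonal
print(square[0, 3], affinity[2])   # the (0, 3) pair sits at condensed index 2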