def print_num_nodes_all(native_dir, interface_dir, complement_dir):
    """
    Call print_num_nodes on every JSON graph listed in native_dir.

    For each file name in native_dir containing '.json', the graph with the
    same file name is loaded from interface_dir, complement_dir and
    native_dir (in that order) and the three graphs are passed to
    print_num_nodes(native, interface, complement).

    A graph missing from any of the three directories produces a warning and
    that file is skipped; the remaining files are still processed.

    :param native_dir: directory whose listing drives the iteration
    :param interface_dir: directory holding the interface graphs
    :param complement_dir: directory holding the complement graphs
    :return: None
    """
    for graph_file in os.listdir(native_dir):
        if '.json' not in graph_file:
            continue

        # Read the three graphs; any missing one skips this file only.
        try:
            interface = load_json(os.path.join(interface_dir, graph_file))
        except FileNotFoundError:
            print('\nWARNING, interface graph not found for: ', graph_file, '\n')
            # Skip (not abort): one missing interface must not hide the
            # statistics of every file that comes after it.
            continue

        try:
            complement = load_json(os.path.join(complement_dir, graph_file))
        except FileNotFoundError:
            print('\nWARNING, complement graph not found for: ', graph_file, '\n\n')
            continue

        try:
            native = load_json(os.path.join(native_dir, graph_file))
        except FileNotFoundError:
            print('\nWARNING, native graph not found for: ', graph_file, '\n\n')
            continue

        print_num_nodes(native, interface, complement)
def remove_graphs_p(graph_dir, interaction, threshold=1):
    """
    Delete graphs without binding nodes until the dataset is balanced enough.

    A node counts as binding when its 'binding_<interaction>' attribute is
    not None. A first pass over graph_dir counts all nodes and all binding
    nodes. A second pass deletes graph files that contain no binding node,
    and stops as soon as binding_nodes * threshold exceeds the remaining
    number of non-binding nodes.

    :param graph_dir: directory containing the graph files
    :param interaction: suffix of the 'binding_' node attribute to inspect
    :param threshold: multiplier applied to the binding count in the stop test
    :return: (remaining non-binding node count, binding node count)
    """

    def count_binding(graph):
        # Number of nodes whose binding attribute is set.
        return sum(1 for _, attrs in graph.nodes.data()
                   if attrs['binding_' + interaction] is not None)

    # Pass 1: dataset-wide totals.
    total_nodes = 0
    binding_nodes = 0
    for path in tqdm(listdir_fullpath(graph_dir)):
        graph = load_json(path)
        binding_nodes += count_binding(graph)
        total_nodes += len(graph.nodes)

    non_binding = total_nodes - binding_nodes

    # Pass 2: drop binding-free graphs until the ratio is reached.
    for path in tqdm(listdir_fullpath(graph_dir)):
        if binding_nodes * threshold > non_binding:
            break
        graph = load_json(path)
        if count_binding(graph) != 0:
            continue
        os.remove(path)
        non_binding -= len(graph.nodes)

    return non_binding, binding_nodes
def remove_nodes(graph_dir, interaction, num_NB, num_binding, threshold, remove_size=0.3):
    """
    Undersample nodes from graphs to produce a balanced dataset

    Repeatedly sweeps over every graph in graph_dir and removes a random
    batch of non-binding nodes (nodes whose 'binding_<interaction>' attribute
    is None) from each one, rewriting the graph file in place, until
    num_binding * threshold >= num_NB or no further node can be removed.

    Graphs with fewer than 20 nodes, or without any non-binding node, are
    left untouched.

    :param graph_dir: directory containing the graph files (modified in place)
    :param interaction: suffix of the 'binding_' node attribute to inspect
    :param num_NB: current number of non-binding nodes in the dataset
    :param num_binding: current number of binding nodes in the dataset
    :param threshold: multiplier applied to num_binding in the stop test
    :param remove_size: ignored; kept for backward compatibility. The batch
        size is always recomputed from the counts below.
    :return: None
    """
    # Per-graph batch size: the surplus spread over the files, minus a margin
    # of 10. An empty directory previously raised ZeroDivisionError here.
    num_files = len(os.listdir(graph_dir))
    if num_files:
        remove_size = int((num_NB - num_binding) / num_files) - 10
    else:
        remove_size = 0
    # A batch size <= 0 is unusable: a negative value would turn the slice
    # below into "all but the last k nodes", and zero would never progress.
    remove_size = max(remove_size, 1)

    while num_binding * threshold < num_NB:
        print(f"Binding: {num_binding} \t Non-Binding {num_NB}")
        removed_this_pass = 0
        for graph_file in tqdm(listdir_fullpath(graph_dir)):
            g = load_json(graph_file)
            NB_nodes = [n for n, d in g.nodes.data()
                        if d['binding_' + interaction] is None]
            if len(g.nodes) < 20 or not NB_nodes:
                continue

            shuffle(NB_nodes)
            trash = NB_nodes[:remove_size]
            g.remove_nodes_from(trash)
            num_NB -= len(trash)
            removed_this_pass += len(trash)
            dump_json(graph_file, g)

        # The former stop test (small > number of files) could never be true,
        # so a dataset made only of ineligible graphs looped forever.
        if removed_this_pass == 0:
            print("WARNING, no more nodes can be removed; stopping before balance is reached")
            break

    print(f"DONE \nBinding: {num_binding} \t Non-Binding {num_NB}")
def balance_complement_all(interface_dir, complement_dir, output_dir):
    """
    Run balance_complement on every JSON graph listed in interface_dir.

    For each file name in interface_dir containing '.json', the interface
    graph and the complement graph with the same file name are loaded, passed
    to balance_complement(interface, complement), and the result is written
    to output_dir under that file name. Files that cannot be found are
    reported and skipped.

    :param interface_dir: directory whose listing drives the iteration
    :param complement_dir: directory holding the matching complement graphs
    :param output_dir: directory receiving the balanced complements
    :return: None
    """
    json_names = (name for name in os.listdir(interface_dir) if '.json' in name)

    for name in json_names:
        # Load both inputs; a missing file skips this graph.
        try:
            interface = load_json(os.path.join(interface_dir, name))
            complement = load_json(os.path.join(complement_dir, name))
            print('Balancing', name, '...')
        except FileNotFoundError:
            print('\nWARNING, complement graph not found for: ', name, '\n\n')
            continue

        # Compute and write the balanced complement.
        target = os.path.join(output_dir, name)
        dump_json(balance_complement(interface, complement), target)
def connect_all(input_dir, native_dir, output_dir):
    """
    Apply connect_components to each JSON graph of input_dir.

    Every file name in input_dir containing '.json' is loaded together with
    the graph of the same name in native_dir. Each graph returned by
    connect_components is written to output_dir as '<pbid>_<index>.json',
    where pbid is the first four characters of the input file name, and is
    then reported through print_component_info.

    :param input_dir: directory of graphs to connect
    :param native_dir: directory of the matching native graphs
    :param output_dir: directory receiving the connected graphs
    :return: None
    """
    for file_name in os.listdir(input_dir):
        if '.json' in file_name:
            graph = load_json(os.path.join(input_dir, file_name))
            native_graph = load_json(os.path.join(native_dir, file_name))
            components = connect_components(graph, native_graph)

            # One output file per connected graph.
            prefix = file_name[:4]
            index = 0
            for component in components:
                out_path = os.path.join(output_dir, f"{prefix}_{index}.json")
                dump_json(component, out_path)
                print_component_info(component, index)
                index += 1
def connect_and_balance_all(interface_dir, native_dir, complement_dir, output_dir,
                            quiet=False, max_files=30):
    """
    Connect interface and complement graphs, then balance each complement.

    For every file name in interface_dir containing '.json', the interface,
    native and complement graphs of that name are loaded. connect_components
    is applied to the interface graph and to the complement graph (the latter
    with trim_dangles=False). Each resulting interface graph i is paired with
    complement graph i modulo the number of complement graphs, passed to
    balance_complement, and both are written as '<pbid>_<i>.json': the
    interface graph into output_dir, the balanced complement into
    output_dir/complement. pbid is the first four characters of the file name.

    :param interface_dir: directory whose listing drives the iteration
    :param native_dir: directory of the matching native graphs
    :param complement_dir: directory of the matching complement graphs
    :param output_dir: existing directory receiving the interface graphs; a
        'complement' sub-directory is created inside it
    :param quiet: when True, suppress the per-graph progress message
    :param max_files: maximum number of directory entries (JSON or not) to
        visit before stopping; None removes the limit. Defaults to 30, the
        cap hard-coded in the original loop control.
    :return: None
    """
    # Make a directory inside output for the complements
    comp_dir = os.path.join(output_dir, 'complement')
    try:
        os.mkdir(comp_dir)
    except FileExistsError:
        print('complement directory already exists! make sure you are not overwriting')

    # Dedicated counter for the cap: the inner enumerate used to reuse the
    # same name and overwrite it on every graph.
    visited = 0
    for graph_file in os.listdir(interface_dir):
        # Loop control
        if max_files is not None and visited >= max_files:
            break
        visited += 1

        if '.json' not in graph_file:
            continue

        if not quiet:
            print("Connecting and Balancing graph", graph_file)

        # read interface, complement and native graphs
        g = load_json(os.path.join(interface_dir, graph_file))
        g_native = load_json(os.path.join(native_dir, graph_file))
        g_complement = load_json(os.path.join(complement_dir, graph_file))

        # Connect the components into a set of graphs
        interface_graphs = connect_components(g, g_native)
        complement_graphs = connect_components(g_complement, g_native,
                                               trim_dangles=False)
        num_comps = len(complement_graphs)
        if interface_graphs and num_comps == 0:
            # Nothing to pair with; the modulo below would divide by zero.
            print('\nWARNING, no complement components for: ', graph_file, '\n\n')
            continue

        # Balance and write output
        pbid = graph_file[:4]
        for comp_idx, h in enumerate(interface_graphs):
            balanced_comp = balance_complement(h, complement_graphs[comp_idx % num_comps])
            out_name = pbid + '_' + str(comp_idx) + '.json'
            dump_json(h, os.path.join(output_dir, out_name))
            dump_json(balanced_comp, os.path.join(comp_dir, out_name))
def remove_graphs(graph_dir, interaction):
    """
    Delete every graph of graph_dir that has no binding node.

    A node counts as binding when its 'binding_<interaction>' attribute is
    not None. Graph files without such a node are removed from disk; the
    node counts of the remaining graphs are accumulated.

    :param graph_dir: directory containing the graph files
    :param interaction: suffix of the 'binding_' node attribute to inspect
    :return: (non-binding node count, binding node count) over kept graphs
    """
    kept_nodes = 0
    binding_total = 0

    for path in tqdm(listdir_fullpath(graph_dir)):
        graph = load_json(path)
        n_binding = sum(1 for _, attrs in graph.nodes.data()
                        if attrs['binding_' + interaction] is not None)

        if n_binding == 0:
            # No binding site in this graph: drop it and do not count it.
            os.remove(path)
        else:
            binding_total += n_binding
            kept_nodes += len(graph.nodes)

    return kept_nodes - binding_total, binding_total
def main():
    """
    Compute dataset statistics for a directory of graphs and append them to a CSV.

    Command line arguments:
        input_dir: directory containing the graphs
        output: CSV file to append one row to (created if missing)

    The row holds the number of directory entries ('Graphs'), the total node
    and edge counts, the number of nodes whose 'binding_protein',
    'binding_small-molecule' and 'binding_ion' attributes are not None, and
    the integer average nodes/edges per graph. The dataset name in the first
    column is the last two components of input_dir joined by '_'. A header
    row is written first when the output file is new or empty.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input_dir', help='input_directory containing graphs and complement')
    parser.add_argument('output', help='csv output file')
    args = parser.parse_args()

    # Insertion order defines the CSV column order.
    stats = {
        'Graphs': len(os.listdir(args.input_dir)),
        'Nodes': 0,
        'Edges': 0,
        'Protein Binding': 0,
        'Small-Mol. Binding': 0,
        'Ion Binding': 0,
    }
    # CSV column -> node attribute marking that kind of binding site.
    binding_attrs = {
        'Protein Binding': 'binding_protein',
        'Small-Mol. Binding': 'binding_small-molecule',
        'Ion Binding': 'binding_ion',
    }

    for graph_file in tqdm(listdir_fullpath(args.input_dir)):
        g = load_json(graph_file)
        stats['Nodes'] += len(g.nodes)
        stats['Edges'] += len(g.edges)
        for column, attr in binding_attrs.items():
            stats[column] += sum(1 for _, d in g.nodes.data()
                                 if d[attr] is not None)

    # An empty directory would otherwise divide by zero.
    n_graphs = stats['Graphs']
    stats['Avg Nodes'] = int(stats['Nodes'] / n_graphs) if n_graphs else 0
    stats['Avg Edges'] = int(stats['Edges'] / n_graphs) if n_graphs else 0

    # A file that exists but is empty still needs its header.
    header = (not os.path.exists(args.output)
              or os.path.getsize(args.output) == 0)

    # Strip trailing separators so 'a/b/' yields 'a_b' rather than 'b_'.
    name = '_'.join(args.input_dir.rstrip('/').split('/')[-2:])

    # newline='' is required by the csv module to avoid doubled line endings
    # on platforms that translate '\n'.
    with open(args.output, 'a', newline='') as f:
        writer = csv.writer(f, delimiter=',')
        if header:
            writer.writerow(['Dataset'] + list(stats.keys()))
        writer.writerow([name] + list(stats.values()))
def __getitem__(self, idx):
    """
    Load the graph at position idx and return it as a DGL graph.

    The networkx graph is read from self.path / self.all_graphs[idx]. Its
    edge labels (attribute self.label) are mapped through self.edge_map into
    a 'one_hot' edge attribute, and the result of
    get_labels(graph, interaction='protein') is stored as the 'interface'
    node attribute before conversion with dgl.from_networkx.

    :param idx: index into self.all_graphs
    :return: (dgl graph, ring) where ring is the sorted node list with the
        self.level data when self.node_simfunc is set, and 0 otherwise
    :raises ValueError: if self.directed is set but the stored graph is
        undirected
    """
    g_path = os.path.join(self.path, self.all_graphs[idx])
    nx_graph = graph_io.load_json(g_path)

    # Directed -> undirected is possible, the reverse is refused.
    if self.directed:
        if not nx.is_directed(nx_graph):
            raise ValueError(
                f"The loader is asked to produce a directed graph from {g_path} that is undirected"
            )
    else:
        nx_graph = nx.to_undirected(nx_graph)

    # Always hand DGL a directed view: per the original note, DGL represents
    # an undirected graph as one carrying both directed edges. The check
    # above keeps self.directed and the stored graphs in agreement, so
    # callers should rely on the directed annotation instead of converting
    # graphs themselves.
    nx_graph = nx.to_directed(nx_graph)

    # Encode each edge label as a tensor.
    edge_labels = nx.get_edge_attributes(nx_graph, self.label)
    one_hot = {}
    for edge, label in edge_labels.items():
        one_hot[edge] = torch.tensor(self.edge_map[label])
    nx.set_edge_attributes(nx_graph, name='one_hot', values=one_hot)

    interface = get_labels(nx_graph, interaction='protein')
    nx.set_node_attributes(nx_graph, name='interface', values=interface)

    # Careful: the graph nodes get sorted by this conversion.
    g_dgl = dgl.from_networkx(nx_graph=nx_graph,
                              edge_attrs=['one_hot'],
                              node_attrs=['interface'])

    if self.node_simfunc is None:
        return g_dgl, 0

    ring = sorted(nx_graph.nodes(data=self.level))
    return g_dgl, ring