def build_weights(self, model_params):
    """Create and attach the Linear links used by the fingerprint network.

    Stores ``model_params`` on the instance, then attaches with ``setattr``:

    * ``layer_output_weights_<i>`` for every layer width (the atom feature
      width followed by each hidden width), mapping that width to
      ``fp_length``;
    * ``layer_<i>_self_filter`` for every consecutive pair of layer widths;
    * one link per (layer, degree) pair, named by ``weights_name``, whose
      input width is the previous layer width plus ``num_bond_features()``.

    Every link is built with the same ``HeNormal`` initializer instance.

    Args:
        model_params: mapping read for the keys ``'conv_width'``,
            ``'fp_depth'`` and ``'fp_length'``.
    """
    he_init = chainer.initializers.HeNormal()
    self.model_params = model_params

    hidden_widths = (
        [self.model_params['conv_width']] * self.model_params['fp_depth'])
    layer_widths = [num_atom_features()] + hidden_widths
    fp_length = self.model_params['fp_length']

    # Output weights: one projection to the fingerprint per layer.
    for idx, width in enumerate(layer_widths):
        output_link = L.Linear(width, fp_length, initialW=he_init)
        setattr(self, 'layer_output_weights_{}'.format(idx), output_link)

    # Hidden weights: a self filter plus one filter per degree for each
    # transition between consecutive layers.
    width_pairs = zip(layer_widths[:-1], layer_widths[1:])
    for idx, (n_in, n_out) in enumerate(width_pairs):
        self_filter = L.Linear(n_in, n_out, initialW=he_init)
        setattr(self, 'layer_{}_self_filter'.format(idx), self_filter)
        for degree in degrees:
            degree_link = L.Linear(
                n_in + num_bond_features(), n_out, initialW=he_init)
            setattr(self, weights_name(idx, degree), degree_link)
def graph_from_edgelist(edgelist):
    """Build a ``MolGraph`` from an edge-list text file.

    Each line that does not contain ``'#'`` is scanned for runs of digits;
    the first two integers found are taken as the endpoints of one edge.
    Lines with fewer than two integers (for example blank lines) are
    skipped.

    For every edge a ``'bond'`` node is created and linked to both endpoint
    ``'atom'`` nodes, and the first endpoint gets the second as a neighbor.
    An atom index that appears in several edges maps to a single atom node.
    Atom and bond features are drawn uniformly at random from [0, 1) with
    lengths ``num_atom_features()`` and ``num_bond_features()``.
    Finally one ``'molecule'`` node is added with all atom nodes as
    neighbors.

    Args:
        edgelist: path of the edge-list file to read.

    Returns:
        The populated ``MolGraph``.
    """
    graph = MolGraph()
    atoms_by_rd_idx = {}

    def atom_node(rd_idx):
        # Reuse the node for an index we have already seen so that edges
        # sharing an endpoint end up connected through the same atom.
        if rd_idx not in atoms_by_rd_idx:
            atoms_by_rd_idx[rd_idx] = graph.new_node(
                'atom',
                features=np.random.uniform(
                    low=0, high=1, size=(num_atom_features(), )),
                rdkit_ix=rd_idx)
        return atoms_by_rd_idx[rd_idx]

    edges = []
    with open(edgelist) as f:
        for line in f:
            if '#' in line:
                continue
            # Parse the line itself (the previous code matched against a
            # fixed placeholder string, ignoring the file contents).
            numbers = re.findall(r'\d+', line)
            if len(numbers) < 2:
                continue
            edges.append((int(numbers[0]), int(numbers[1])))

    for idx_1, idx_2 in edges:
        node_1 = atom_node(idx_1)
        node_2 = atom_node(idx_2)
        node_1.add_neighbors((node_2, ))
        bond = graph.new_node(
            'bond',
            features=np.random.uniform(
                low=0, high=1, size=(num_bond_features(), )))
        bond.add_neighbors((node_1, node_2))

    mol_node = graph.new_node('molecule')
    mol_node.add_neighbors(graph.nodes['atom'])
    return graph
def build_convnet_fingerprint_fun(num_hidden_features=[100, 100],
                                  fp_length=512, normalize=True,
                                  activation_function=relu,
                                  return_atom_activations=False):
    """Sets up functions to compute convnets over all molecules in a
    minibatch together.

    Registers the weight shapes with a fresh ``WeightsParser``:

    * per layer (atom features plus each hidden layer): output weights of
      shape ``(layer_size, fp_length)`` and an output bias ``(1, fp_length)``;
    * per transition between consecutive layers: biases ``(1, N_cur)``, a
      self filter ``(N_prev, N_cur)`` and, for every entry of ``degrees``,
      weights of shape ``(N_prev + num_bond_features(), N_cur)``.

    Args:
        num_hidden_features: sizes of the hidden layers. The sequence is
            copied, so the default list is never mutated or aliased.
        fp_length: width of the fingerprint produced by every layer.
        normalize: when true, ``batch_normalize`` is applied to each hidden
            layer's activations before ``activation_function``.
        activation_function: applied to each hidden layer's activations.
        return_atom_activations: when true, a third function returning the
            per-layer atom activations is returned as well.

    Returns:
        ``(output_layer_fun, parser)``, or
        ``(output_layer_fun, parser, compute_atom_activations)`` when
        ``return_atom_activations`` is true.
    """
    hidden_sizes = list(num_hidden_features)
    num_layers = len(hidden_sizes)

    # Specify weight shapes.
    parser = WeightsParser()
    all_layer_sizes = [num_atom_features()] + hidden_sizes
    for layer, layer_size in enumerate(all_layer_sizes):
        parser.add_weights(('layer output weights', layer),
                           (layer_size, fp_length))
        parser.add_weights(('layer output bias', layer), (1, fp_length))

    in_and_out_sizes = zip(all_layer_sizes[:-1], all_layer_sizes[1:])
    for layer, (N_prev, N_cur) in enumerate(in_and_out_sizes):
        parser.add_weights(("layer", layer, "biases"), (1, N_cur))
        parser.add_weights(("layer", layer, "self filter"), (N_prev, N_cur))
        for degree in degrees:
            parser.add_weights(weights_name(layer, degree),
                               (N_prev + num_bond_features(), N_cur))

    def update_layer(weights, layer, atom_features, bond_features, array_rep,
                     normalize=False):
        """Compute the next layer's atom features from the current ones."""
        def get_weights_func(degree):
            return parser.get(weights, weights_name(layer, degree))

        layer_bias = parser.get(weights, ("layer", layer, "biases"))
        layer_self_weights = parser.get(weights,
                                        ("layer", layer, "self filter"))
        self_activations = np.dot(atom_features, layer_self_weights)
        neighbor_activations = matmult_neighbors(
            array_rep, atom_features, bond_features, get_weights_func)

        total_activations = (
            neighbor_activations + self_activations + layer_bias)
        if normalize:
            total_activations = batch_normalize(total_activations)
        return activation_function(total_activations)

    def output_layer_fun_and_atom_activations(weights, smiles):
        """Computes layer-wise convolution, and returns a fixed-size output."""
        array_rep = array_rep_from_smiles(tuple(smiles))
        atom_features = array_rep['atom_features']
        bond_features = array_rep['bond_features']

        all_layer_fps = []
        atom_activations = []

        def write_to_fingerprint(atom_features, layer):
            cur_out_weights = parser.get(weights,
                                         ('layer output weights', layer))
            cur_out_bias = parser.get(weights, ('layer output bias', layer))
            atom_outputs = softmax(
                cur_out_bias + np.dot(atom_features, cur_out_weights), axis=1)
            atom_activations.append(atom_outputs)
            # Sum over all atoms within a molecule:
            layer_output = sum_and_stack(atom_outputs, array_rep['atom_list'])
            all_layer_fps.append(layer_output)

        # range (not xrange) so this runs on both Python 2 and Python 3.
        for layer in range(num_layers):
            write_to_fingerprint(atom_features, layer)
            atom_features = update_layer(weights, layer, atom_features,
                                         bond_features, array_rep,
                                         normalize=normalize)
        write_to_fingerprint(atom_features, num_layers)
        return sum(all_layer_fps), atom_activations, array_rep

    def output_layer_fun(weights, smiles):
        """Return only the summed fingerprint for ``smiles``."""
        output, _, _ = output_layer_fun_and_atom_activations(weights, smiles)
        return output

    def compute_atom_activations(weights, smiles):
        """Return the per-layer atom activations and the array rep."""
        _, atom_activations, array_rep = output_layer_fun_and_atom_activations(
            weights, smiles)
        return atom_activations, array_rep

    if return_atom_activations:
        return output_layer_fun, parser, compute_atom_activations
    return output_layer_fun, parser
def build_convnet_fingerprint_fun(num_hidden_features=[100, 100],
                                  fp_length=512, normalize=True,
                                  activation_function=relu,
                                  return_atom_activations=False):
    """Sets up functions to compute convnets over all molecules in a
    minibatch together.

    Registers the weight shapes with a fresh ``WeightsParser``:

    * per layer (atom features plus each hidden layer): output weights of
      shape ``(layer_size, fp_length)`` and an output bias ``(1, fp_length)``;
    * per transition between consecutive layers: biases ``(1, N_cur)``, a
      self filter ``(N_prev, N_cur)`` and, for every entry of ``degrees``,
      weights of shape ``(N_prev + num_bond_features(), N_cur)``.

    Args:
        num_hidden_features: sizes of the hidden layers. The sequence is
            copied, so the default list is never mutated or aliased.
        fp_length: width of the fingerprint produced by every layer.
        normalize: when true, ``batch_normalize`` is applied to each hidden
            layer's activations before ``activation_function``.
        activation_function: applied to each hidden layer's activations.
        return_atom_activations: when true, a third function returning the
            per-layer atom activations is returned as well.

    Returns:
        ``(output_layer_fun, parser)``, or
        ``(output_layer_fun, parser, compute_atom_activations)`` when
        ``return_atom_activations`` is true.
    """
    hidden_sizes = list(num_hidden_features)
    num_layers = len(hidden_sizes)

    # Specify weight shapes.
    parser = WeightsParser()
    # List concatenation: atom feature width followed by the hidden widths.
    all_layer_sizes = [num_atom_features()] + hidden_sizes
    for layer, layer_size in enumerate(all_layer_sizes):
        parser.add_weights(('layer output weights', layer),
                           (layer_size, fp_length))
        parser.add_weights(('layer output bias', layer), (1, fp_length))

    # Consecutive (input width, output width) pairs, one per hidden layer.
    in_and_out_sizes = zip(all_layer_sizes[:-1], all_layer_sizes[1:])
    for layer, (N_prev, N_cur) in enumerate(in_and_out_sizes):
        parser.add_weights(("layer", layer, "biases"), (1, N_cur))
        parser.add_weights(("layer", layer, "self filter"), (N_prev, N_cur))
        # NOTE(review): `degrees` is defined outside this function; an
        # earlier note here listed it as [0, 1, 2, 3, 4, 5] -- confirm
        # against its definition.
        for degree in degrees:
            parser.add_weights(weights_name(layer, degree),
                               (N_prev + num_bond_features(), N_cur))

    def update_layer(weights, layer, atom_features, bond_features, array_rep,
                     normalize=False):
        """Compute the next layer's atom features from the current ones."""
        def get_weights_func(degree):
            return parser.get(weights, weights_name(layer, degree))

        layer_bias = parser.get(weights, ("layer", layer, "biases"))
        layer_self_weights = parser.get(weights,
                                        ("layer", layer, "self filter"))
        self_activations = np.dot(atom_features, layer_self_weights)
        neighbour_activations = matmult_neighbors(
            array_rep, atom_features, bond_features, get_weights_func)

        total_activations = (
            neighbour_activations + self_activations + layer_bias)
        if normalize:
            total_activations = batch_normalize(total_activations)
        return activation_function(total_activations)

    def output_layer_fun_and_atom_activations(weights, smiles):
        """Computes layer-wise convolution, and returns a fixed-size output."""
        # The unconditional pdb breakpoint that used to sit here was removed:
        # it stopped every call in the debugger.
        array_rep = array_rep_from_smiles(tuple(smiles))
        # NOTE(review): earlier debugging notes recorded shapes (1370, 62)
        # and (1416, 6) for these two arrays on one batch; those numbers are
        # data-dependent, not guaranteed.
        atom_features = array_rep['atom_features']
        bond_features = array_rep['bond_features']

        all_layer_fps = []
        atom_activations = []

        def write_to_fingerprint(atom_features, layer):
            cur_out_weights = parser.get(weights,
                                         ('layer output weights', layer))
            cur_out_bias = parser.get(weights, ('layer output bias', layer))
            # Row-wise softmax of the affine projection: one
            # fp_length-wide contribution per atom.
            atom_outputs = softmax(
                cur_out_bias + np.dot(atom_features, cur_out_weights), axis=1)
            # Kept only so compute_atom_activations can return them; the
            # fingerprint itself does not use this list.
            atom_activations.append(atom_outputs)
            # Sum over all atoms within a molecule:
            layer_output = sum_and_stack(atom_outputs, array_rep['atom_list'])
            all_layer_fps.append(layer_output)

        # range (not xrange) so this runs on both Python 2 and Python 3.
        for layer in range(num_layers):
            write_to_fingerprint(atom_features, layer)
            atom_features = update_layer(weights, layer, atom_features,
                                         bond_features, array_rep,
                                         normalize=normalize)
        write_to_fingerprint(atom_features, num_layers)
        return sum(all_layer_fps), atom_activations, array_rep

    def output_layer_fun(weights, smiles):
        """Return only the summed fingerprint for ``smiles``."""
        output, _, _ = output_layer_fun_and_atom_activations(weights, smiles)
        return output

    def compute_atom_activations(weights, smiles):
        """Return the per-layer atom activations and the array rep."""
        _, atom_activations, array_rep = output_layer_fun_and_atom_activations(
            weights, smiles)
        return atom_activations, array_rep

    if return_atom_activations:
        return output_layer_fun, parser, compute_atom_activations
    return output_layer_fun, parser
def tensorize_smiles_job(smiles, max_degree=5, max_atoms=None):
    '''Takes a list of smiles and turns the graphs in tensor representation.

    # Arguments:
        smiles: a sized sequence of smiles strings (``len`` is taken of it)
        max_atoms: the maximum number of atoms per molecule (to which all
            molecules will be padded), use `None` for auto
        max_degree: the maximum number of neighbours per atom that each
            molecule can have (to which all molecules will be padded), use
            `None` for auto

        **NOTE**: It is not recommended to set max_degree to `None`/auto when
            using `NeuralGraph` layers. Max_degree determines the number of
            trainable parameters and is essentially a hyperparameter.
            While models can be rebuilt using different `max_atoms`, they
            cannot be rebuild for different values of `max_degree`, as the
            architecture will be different.

            For organic molecules `max_degree=5` is a good value
            (Duvenaud et. al, 2015)

    # Returns:
        atoms: np.array (float32), an atom feature array of size
            `(molecules, max_atoms, atom_features)`, zero padded
        bonds: np.array (float32), a bond feature array of size
            `(molecules, max_atoms, max_degree, bond_features)`, zero padded
        edges: np.array (int8), a connectivity array of size
            `(molecules, max_atoms, max_degree)` holding neighbour atom
            indices, padded with -1

    # Raises:
        AssertionError: if a smiles string cannot be parsed, or if a molecule
            exceeds an explicitly given `max_atoms` / `max_degree`.

    TODO:
        * Arguments for sparse vector encoding
    '''
    # import sizes
    n = len(smiles)
    n_atom_features = features.num_atom_features()
    n_bond_features = features.num_bond_features()

    # preallocate atom tensor with 0's and edge tensor with -1 (because 0 is
    # a valid atom index). If max_degree or max_atoms is set to None (auto),
    # initialise that dim as small as possible (1).
    atom_tensor = np.zeros((n, max_atoms or 1, n_atom_features),
                           dtype=np.float32)
    bond_tensor = np.zeros(
        (n, max_atoms or 1, max_degree or 1, n_bond_features),
        dtype=np.float32)
    # NOTE(review): int8 cannot hold atom indices above 127 -- confirm that
    # molecules this large never occur, or widen the dtype.
    edge_tensor = -np.ones((n, max_atoms or 1, max_degree or 1),
                           dtype=np.int8)

    for mol_ix, s in enumerate(smiles):
        # load mol, atoms and bonds. sys.stderr is swapped for a buffer only
        # while parsing and always restored, so later stderr output from the
        # rest of the program is not swallowed.
        saved_stderr = sys.stderr
        sio = sys.stderr = StringIO()
        try:
            mol = Chem.MolFromSmiles(s)
        finally:
            sys.stderr = saved_stderr
        # Raised explicitly (rather than via `assert`) so the check survives
        # `python -O`; the exception type is unchanged.
        if mol is None:
            raise AssertionError(
                'Could not parse smiles {}, error: {}'.format(
                    s, sio.getvalue()))
        atoms = mol.GetAtoms()
        bonds = mol.GetBonds()

        # If max_atoms is exceeded, resize if max_atoms=None (auto), else
        # raise
        if len(atoms) > atom_tensor.shape[1]:
            if max_atoms is not None:
                raise AssertionError(
                    'too many atoms ({0}) in molecule: {1}'.format(
                        len(atoms), s))
            atom_tensor = padaxis(atom_tensor, len(atoms), axis=1)
            bond_tensor = padaxis(bond_tensor, len(atoms), axis=1)
            edge_tensor = padaxis(edge_tensor, len(atoms), axis=1,
                                  pad_value=-1)

        rdkit_ix_lookup = {}
        for atom_ix, atom in enumerate(atoms):
            # write atom features
            atom_tensor[mol_ix, atom_ix, :n_atom_features] = (
                features.atom_features(atom))
            # map the rdkit atom index to the row used in the tensors
            rdkit_ix_lookup[atom.GetIdx()] = atom_ix

        # preallocate array with neighbour lists (indexed by atom)
        connectivity_mat = [[] for _ in atoms]

        for bond in bonds:
            # lookup atom ids
            a1_ix = rdkit_ix_lookup[bond.GetBeginAtom().GetIdx()]
            a2_ix = rdkit_ix_lookup[bond.GetEndAtom().GetIdx()]

            # lookup how many neighbours are encoded yet
            a1_neigh = len(connectivity_mat[a1_ix])
            a2_neigh = len(connectivity_mat[a2_ix])

            # If max_degree is exceeded, resize if max_degree=None (auto),
            # else raise
            new_degree = max(a1_neigh, a2_neigh) + 1
            if new_degree > bond_tensor.shape[2]:
                if max_degree is not None:
                    raise AssertionError(
                        'too many neighours ({0}) in molecule: {1}'.format(
                            new_degree, s))
                bond_tensor = padaxis(bond_tensor, new_degree, axis=2)
                edge_tensor = padaxis(edge_tensor, new_degree, axis=2,
                                      pad_value=-1)

            # store bond features in the next free neighbour slot of both
            # endpoint atoms
            bond_features = np.array(features.bond_features(bond), dtype=int)
            bond_tensor[mol_ix, a1_ix, a1_neigh, :] = bond_features
            bond_tensor[mol_ix, a2_ix, a2_neigh, :] = bond_features

            # add to connectivity matrix
            connectivity_mat[a1_ix].append(a2_ix)
            connectivity_mat[a2_ix].append(a1_ix)

        # store connectivity matrix
        for a1_ix, neighbours in enumerate(connectivity_mat):
            degree = len(neighbours)
            edge_tensor[mol_ix, a1_ix, :degree] = neighbours

    return atom_tensor, bond_tensor, edge_tensor