def optimize(self,
             s1,
             s2,
             stochastic=False,
             lr=0.01,
             max_iter=100,
             beta1=0.01):
    """Gradient-ascent optimization of the predicted property in latent
    space, starting from the encoded vector of `s1` with scaffold `s2`."""
    g, h, scaffold_g, scaffold_h = utils.make_graphs(
        s1, s2, extra_atom_feature=True, extra_bond_feature=True)
    self.embede_graph(g, h)
    self.embede_graph(scaffold_g, scaffold_h)
    self.encode(g, h)
    encoded_vector = self.cal_encoded_vector(h)
    latent_vector, mu, logvar = self.reparameterize(encoded_vector)
    start_point = utils.create_var(encoded_vector.data, True)
    self.init_scaffold_state(scaffold_g, scaffold_h)
    scaffold_state = utils.average_node_state(scaffold_h)

    visited = []
    for iteration in range(max_iter):
        latent_vector, mu, logvar = self.reparameterize(start_point)
        prop = self.predict_property(
            torch.cat([latent_vector, scaffold_state], 1)).view(-1)
        loss1 = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        #objective = prop
        #objective = prop[0]-prop[1]-loss1*beta1
        objective = prop[0] - loss1 * beta1
        objective.backward(retain_graph=True)
        #grad = torch.autograd.grad(prop, start_point)[0]
        # Gradient ascent: move the encoded vector uphill on the objective.
        start_point = start_point.data + lr * start_point.grad.data
        start_point = utils.create_var(start_point, True)
        visited.append(start_point)

    retval = []
    for v in visited:
        latent_vector, mu, logvar = self.reparameterize(v)
        new_prop = self.predict_property(
            torch.cat([latent_vector, scaffold_state],
                      1)).squeeze().data.cpu().numpy()
        loss1 = -0.5 * torch.sum(
            1 + logvar - mu.pow(2) - logvar.exp()).data.cpu().numpy()[0]
        #objective = new_prop[0]-new_prop[1]-loss1*beta1
        objective = new_prop[0] - loss1 * beta1
        # `sample` returns the SMILES of the selected isomer, or None.
        new_s = self.sample(None, s2, latent_vector)
        try:
            new_s = Chem.MolToSmiles(Chem.MolFromSmiles(new_s),
                                     isomericSmiles=False)
        except Exception:
            new_s = None
        if new_s is None or new_s.find('.') != -1:
            continue
        # Re-rank the scaffold-consistent stereoisomers of the new molecule;
        # `select_isomer` returns the isomer list its scores are indexed by.
        selected_isomer, target, isomers = self.select_isomer(
            new_s, s2, latent_vector)
        new_s = isomers[np.argmax(
            selected_isomer.squeeze().data.cpu().numpy())]
        retval.append((new_s, objective, new_prop[0], loss1,
                       latent_vector.data.cpu().numpy()[0]))
    return retval
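# A minimal usage sketch for `optimize` (illustrative only; assumes a trained
# instance `model` of this class, and the molecule/scaffold SMILES pair below
# is hypothetical):
#
#     seed = 'CC(=O)Oc1ccccc1C(=O)O'   # whole molecule
#     scaffold = 'c1ccccc1'            # its scaffold
#     candidates = model.optimize(seed, scaffold, lr=0.01, max_iter=100)
#     # Each entry is (smiles, objective, property, kl_term, latent_vector);
#     # sort by the objective to list the best-scoring molecules first.
#     candidates.sort(key=lambda t: t[1], reverse=True)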
def insert_var(self, id_param, type_param, line_param, is_param):
    lexeme = create_var(id_param, type_param, line_param, is_param)
    for var in self.get_actual_scope().get('variables'):
        if var['id'] == lexeme['id']:
            raise Exception(
                'Variable "{}" already defined previously at line {}'.format(
                    lexeme['id'], var['line']))
    self.get_actual_scope().get('variables').append(lexeme)
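# A minimal sketch of how `insert_var` behaves (illustrative; assumes a
# symbol-table instance `table` whose current scope already holds a
# 'variables' list, as `insert_var` expects):
#
#     table.insert_var('x', 'int', 3, False)   # OK: first definition
#     table.insert_var('x', 'int', 7, False)   # raises: already defined at line 3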
def select_isomer(self, mother, scaffold, latent_vector):
    """\
    Return an isomer-selection vector, the answer one-hot,
    and the isomer list.

    Returns
    -------
    retval: isomer-selection latent vector of shape (len(isomers),)
    target: answer one-hot of shape (len(isomers),)
    isomers: list of stereoisomer SMILESs of `mother`
        that contain `scaffold` as a substructure.
    """
    # Sample possible isomers.
    m_mother = Chem.MolFromSmiles(mother)
    isomer_candidates = utils.enumerate_molecule(
        mother)  # list of isomer SMILESs
    isomers = []
    for s in isomer_candidates:
        m = Chem.MolFromSmiles(s)
        if m.HasSubstructMatch(Chem.MolFromSmiles(scaffold),
                               useChirality=True):
            isomers.append(s)

    # Make a graph for each isomer.
    graph_vectors = []
    for s in isomers:
        g, h = utils.make_graph(s,
                                extra_atom_feature=True,
                                extra_bond_feature=True)
        self.embede_graph(g, h)
        for k in range(len(self.prop_select_isomer_U)):
            self.mpnn(g, h, self.prop_select_isomer_U[k],
                      self.prop_select_isomer_C[k], latent_vector)
        graph_vectors.append(utils.average_node_state(h))
    graph_vectors = torch.cat(graph_vectors, 0)
    # -> (len(isomers), dim_of_node_vector)
    latent_vectors = latent_vector.repeat(len(isomers), 1)
    # -> (len(isomers), dim_of_node_vector + N_conditions)
    retval = torch.cat([graph_vectors, latent_vectors], -1)
    # -> (len(isomers), 2*dim_of_node_vector + N_conditions)

    # FC layers.
    retval = F.relu(self.select_isomer1(retval))
    retval = F.relu(self.select_isomer2(retval))
    retval = self.select_isomer3(retval)
    retval = retval.view(-1)  # (len(isomers),)
    retval = torch.sigmoid(retval)

    # Check which isomers are the same as the mother.
    target = []
    for s in isomers:
        if m_mother.HasSubstructMatch(Chem.MolFromSmiles(s),
                                      useChirality=True):
            target.append(1)
        else:
            target.append(0)
    target = utils.create_var(torch.Tensor(target))  # (len(isomers),)
    return retval, target, isomers
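# `utils.enumerate_molecule` is assumed to enumerate the stereoisomer SMILES
# of a molecule. A self-contained sketch of such a helper using RDKit's
# stereoisomer enumerator (an assumption about its behavior, not the
# project's actual implementation):
from rdkit import Chem
from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers

def enumerate_molecule_sketch(smiles):
    """Hypothetical helper: all stereoisomer SMILES of `smiles`."""
    mol = Chem.MolFromSmiles(smiles)
    return [
        Chem.MolToSmiles(m, isomericSmiles=True)
        for m in EnumerateStereoisomers(mol)
    ]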
def cal_encoded_vector(self, h):
    """Return a graph-representation vector of shape (1, dim_of_node_vector).
    See Eq. (4) of Yujia Li et al. 2018."""
    if len(h) == 0:
        return utils.create_var(torch.zeros(1, self.dim_of_node_vector))
    inputs = torch.cat([h[i] for i in h.keys()], 0)
    h1 = self.cal_encoded_vector1(inputs)  # cf. cal_graph_vector
    h2 = torch.sigmoid(
        self.cal_encoded_vector2(inputs))  # gating; cf. cal_graph_vector
    retval = (h1 * h2).mean(0, keepdim=True)
    return retval
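# The readout above is a gated pooling over node states (Eq. (4) of Li et
# al., "Learning Deep Generative Models of Graphs", 2018): one linear layer
# proposes per-node vectors, a second produces sigmoid gates, and the gated
# products are averaged into a single graph vector. A self-contained sketch
# (layer names and sizes are illustrative):
import torch
import torch.nn as nn

class GatedReadoutSketch(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.value = nn.Linear(dim, dim)  # candidate per-node vectors
        self.gate = nn.Linear(dim, dim)   # per-node gating scores

    def forward(self, node_states):  # (num_nodes, dim)
        gated = self.value(node_states) * torch.sigmoid(
            self.gate(node_states))
        return gated.mean(0, keepdim=True)  # (1, dim) graph vector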
def reparameterize(self, latent_vector):
    """Sample z = mu + std * eps with eps ~ N(0, I)
    (the VAE reparameterization trick)."""
    mu = self.mean(latent_vector)
    logvar = self.logvar(latent_vector)
    std = torch.exp(0.5 * logvar)
    eps = utils.create_var(torch.randn(std.size()))
    return eps.mul(std).add_(mu), mu, logvar
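# The reparameterization trick rewrites z ~ N(mu, sigma^2) as
# z = mu + sigma * eps with eps ~ N(0, I), making z differentiable w.r.t.
# mu and logvar. A standalone sketch in modern PyTorch (shapes illustrative):
import torch

mu = torch.zeros(1, 4, requires_grad=True)
logvar = torch.zeros(1, 4, requires_grad=True)
eps = torch.randn(1, 4)                  # the noise carries the randomness
z = mu + torch.exp(0.5 * logvar) * eps   # differentiable in mu and logvar
z.sum().backward()                       # mu.grad and logvar.grad are populated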
def sample(self,
           s1=None,
           s2=None,
           latent_vector=None,
           condition1=None,
           condition2=None,
           stochastic=False):
    """\
    Parameters
    ----------
    s1: whole SMILES str
        If given, its graph becomes a latent vector to be decoded.
    s2: scaffold SMILES str
        Must be given other than None.
    latent_vector: None | torch.autograd.Variable
        A latent vector to be decoded.
        Not used if `s1` is given.
        If both `latent_vector` and `s1` are None,
        a latent vector is sampled from the standard normal.
    condition1: list[float] | None
        [ target_value1, target_value2, ... ]
        If None, target values are sampled from uniform [0, 1].
        Can be an empty list for unconditional sampling.
    condition2: list[float] | None
        [ scaffold_value1, scaffold_value2, ... ]
        If None, scaffold values are sampled from uniform [0, 1].
        Can be an empty list for unconditional sampling.
    stochastic: bool
        See `utils.probability_to_one_hot`.

    Returns
    -------
    A SMILES str of the generated molecule
    (the stereoisomer chosen by `select_isomer`),
    or None if the generation fails.
    """
    max_add_nodes = 100
    max_add_edges = 5

    if s2 is None:
        print('when you sample, you must give a scaffold')
        return None

    # Embed the scaffold edge/node vectors.
    # If `s1` is given, convert its graph to a latent vector.
    if s1 is not None:
        g_save, h_save, scaffold_g_save, scaffold_h_save = utils.make_graphs(
            s1, s2)
        if g_save is None and h_save is None:
            return None
        g, h, scaffold_g, scaffold_h = utils.make_graphs(
            s1, s2, extra_atom_feature=True, extra_bond_feature=True)
        self.embede_graph(g, h)
        self.embede_graph(scaffold_g, scaffold_h)
        self.encode(g, h)
        encoded_vector = self.cal_encoded_vector(h)
        latent_vector, mu, logvar = self.reparameterize(encoded_vector)
        # `mu` and `logvar` are not used further.

    # If `s1` is None, sample a latent vector from the standard normal.
    elif s1 is None:
        scaffold_g_save, scaffold_h_save = utils.make_graph(s2)
        if scaffold_g_save is None and scaffold_h_save is None:
            return None
        scaffold_g, scaffold_h = utils.make_graph(s2,
                                                  extra_atom_feature=True,
                                                  extra_bond_feature=True)
        self.embede_graph(scaffold_g, scaffold_h)
        if latent_vector is None:
            # Sampling
            latent_vector = utils.create_var(
                torch.randn(1, self.dim_of_node_vector))

    # Sample condition values if not given. Keep them as lists so that
    # `condition1 + condition2` concatenates instead of adding elementwise.
    if condition1 is None or condition2 is None:
        assert not self.N_conditions % 2
        condition1 = np.random.rand(self.N_conditions // 2).tolist()
        condition2 = np.random.rand(self.N_conditions // 2).tolist()
    # A condition torch.FloatTensor of shape (1, N_conditions):
    condition = utils.create_var(torch.Tensor(condition1 + condition2))
    if condition.shape:
        condition = condition.unsqueeze(0)
        latent_vector = torch.cat([latent_vector, condition], -1)
        # -> (1, dim_of_node_vector + N_conditions)

    self.init_scaffold_state(scaffold_g, scaffold_h, condition)

    for null_index1 in range(max_add_nodes):
        new_node = self.add_node(scaffold_g, scaffold_h, latent_vector)
        # (1, N_atom_features)
        new_node = utils.probability_to_one_hot(new_node, stochastic)
        # Recall our definition of the termination vector:
        if np.argmax(
                new_node.data.cpu().numpy().ravel()) == N_atom_features - 1:
            break
        idx = len(scaffold_h)
        scaffold_h_save[idx] = new_node
        scaffold_h[idx] = self.init_node_state(scaffold_h, new_node)

        for null_index2 in range(max_add_edges):
            new_edge = self.add_edge(scaffold_g, scaffold_h, latent_vector)
            # (1, N_bond_features)
            new_edge = utils.probability_to_one_hot(new_edge, stochastic)
            # Recall our definition of the termination vector:
            if np.argmax(new_edge.data.cpu().numpy().ravel()
                         ) == N_bond_features - 1:
                break
            selected_node = self.select_node(scaffold_g, scaffold_h,
                                             latent_vector).view(1, -1)
            # -> (1, len(scaffold_h)-1)
            # Index of the selected node (int)
            selected_node = list(scaffold_h.keys())[np.argmax(
                utils.probability_to_one_hot(
                    selected_node, stochastic).data.cpu().numpy().ravel())]
            if idx not in scaffold_g_save:
                scaffold_g_save[idx] = []
                scaffold_g[idx] = []
            scaffold_g_save[idx].append((new_edge, selected_node))
            scaffold_g[idx].append(
                (self.init_edge_state(scaffold_h, new_edge), selected_node))
            # Add the same edge in the opposite direction.
            if selected_node not in scaffold_g_save:
                scaffold_g_save[selected_node] = []
                scaffold_g[selected_node] = []
            scaffold_g_save[selected_node].append((new_edge, idx))
            scaffold_g[selected_node].append(
                (self.init_edge_state(scaffold_h, new_edge), idx))

    try:
        new_smiles = utils.graph_to_smiles(scaffold_g_save, scaffold_h_save)
        new_smiles = Chem.MolToSmiles(Chem.MolFromSmiles(new_smiles),
                                      isomericSmiles=False)
    except Exception:
        return None
    selected_isomer, target, isomers = self.select_isomer(
        new_smiles, s2, latent_vector)
    selected_isomer = np.argmax(
        utils.probability_to_one_hot(selected_isomer,
                                     stochastic).data.cpu().numpy())
    return isomers[selected_isomer]
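# A minimal usage sketch for `sample` (illustrative; assumes a trained
# instance `model` of this class, with hypothetical SMILES):
#
#     # Decode a latent vector drawn from the standard normal:
#     smiles = model.sample(s2='c1ccccc1')
#     # Or decode the latent vector of a known molecule:
#     smiles = model.sample(s1='CC(=O)Oc1ccccc1C(=O)O', s2='c1ccccc1')
#     # None is returned when graph construction or SMILES conversion fails.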
def forward(self, s1, s2, condition1, condition2, shuffle=False):
    """\
    Parameters
    ----------
    s1: str
        A whole-molecule SMILES.
    s2: str
        A scaffold SMILES.
    condition1: list[float]
        [ whole_value1, whole_value2, ... ]
        Can be an empty list for unconditional training.
    condition2: list[float]
        [ scaffold_value1, scaffold_value2, ... ]
        Can be an empty list for unconditional training.
    shuffle: bool
        If True, visit the non-scaffold nodes and their edges in random
        order.

    Returns
    -------
    scaffold_g: OrderedDict[int, list[tuple[torch.autograd.Variable, int]]]
        Reconstructed dict of the latent edge vectors and partner-node
        indices.
    scaffold_h: OrderedDict[int, torch.autograd.Variable]
        Reconstructed dict of the latent node vectors.
    total_loss1: torch.autograd.Variable of shape (1,)
        Reconstruction loss.
    total_loss2: torch.autograd.Variable of shape (1,)
        VAE (KL-divergence) loss, to be weighted by `beta1` by the caller.
    total_loss4: torch.autograd.Variable of shape (1,)
        Isomer-selection loss.
    """
    # Specification of graph variables defined here
    # ---------------------------------------------
    # g, g_save, scaffold_g, scaffold_g_save: edge dict objects
    #   -> OrderedDict[int, list[tuple[torch.autograd.Variable, int]]]
    #   -> { node_idx: [ (edge_vector, partner_node_idx), ... ], ... }
    #
    # h, h_save, scaffold_h, scaffold_h_save: node dict objects
    #   -> OrderedDict[int, torch.autograd.Variable]
    #   -> { node_idx: node_vector, ... }
    #
    # g_save, h_save:
    #   Backup of the whole-graph one-hots w/o extra features.
    #   These are not changed further.
    # g, h:
    #   become a latent vector for VAE.
    # scaffold_g_save, scaffold_h_save:
    #   The scaffold one-hots w/o extra features, to which new one-hots
    #   will be added to check later if the reconstruction is successful.
    # scaffold_g, scaffold_h:
    #   Scaffold dicts of latent edge/node vectors
    #   to which new initialized state vectors will be added.

    # Make graphs of the molecule and scaffold WITHOUT extra atom/bond
    # features.
    g_save, h_save, scaffold_g_save, scaffold_h_save = utils.make_graphs(
        s1, s2)
    if g_save is None and h_save is None:
        return None
    # Make graphs of the molecule and scaffold WITH extra atom/bond features.
    g, h, scaffold_g, scaffold_h = utils.make_graphs(
        s1, s2, extra_atom_feature=True, extra_bond_feature=True)

    # Collect losses.
    add_node_losses = []
    add_edge_losses = []
    select_node_losses = []

    # Embed the node states of the graphs.
    self.embede_graph(g, h)
    self.embede_graph(scaffold_g, scaffold_h)

    # A condition torch.FloatTensor of shape (N_conditions,):
    #   [ whole_value1, whole_value2, ..., scaffold_value1, scaffold_value2 ]
    condition = utils.create_var(torch.Tensor(condition1 + condition2))
    # (N_conditions,) -> (1, N_conditions)
    if condition.shape:
        condition = condition.unsqueeze(0)

    # Encode the node states of the whole graph.
    self.encode(g, h, condition)

    # Make one vector representing the graph using all node vectors.
    encoded_vector = self.cal_encoded_vector(h)  # (1, dim_of_node_vector)

    # Reparameterization trick; needed for VAE.
    latent_vector, mu, logvar = self.reparameterize(encoded_vector)
    # -> (1, dim_of_node_vector), same, same
    if condition.shape:
        latent_vector = torch.cat([latent_vector, condition], -1)
        # -> (1, dim_of_node_vector + N_conditions)

    # Encode the node states of the scaffold graph.
    self.init_scaffold_state(scaffold_g, scaffold_h, condition)

    # Check which nodes are included in the scaffold and which are not.
    leaves = [i for i in h_save.keys() if i not in scaffold_h.keys()]
    if shuffle:
        random.shuffle(leaves)

    for idx in leaves:
        # Determine which node type should be added and calculate the loss.
        new_node = self.add_node(scaffold_g, scaffold_h, latent_vector)
        # -> (1, N_atom_features)
        add_node_losses.append(
            (-h_save[idx] * torch.log(new_node + 1e-6)).sum())

        # Add the new node to the graph and initialize its state.
        scaffold_h_save[idx] = h_save[idx]
        scaffold_h[idx] = self.init_node_state(scaffold_h,
                                               scaffold_h_save[idx])

        # Find the edges connected to the new node.
        edge_list = [
            e for e in g_save[idx] if e[1] in list(scaffold_h.keys())
        ]
        if shuffle:
            random.shuffle(edge_list)
        for edge in edge_list:
            # Determine which edge type is added and calculate the
            # corresponding loss.
            new_edge = self.add_edge(scaffold_g, scaffold_h, latent_vector)
            # -> (1, N_bond_features)
            add_edge_losses.append(
                (-edge[0] * torch.log(new_edge + 1e-6)).sum())

            # Determine which node is connected through the selected edge
            # and calculate the corresponding loss.
            # The answer one-hot whose nonzero index is the partner-atom
            # index:
            target = utils.create_var(
                utils.one_hot(
                    torch.FloatTensor(
                        [list(scaffold_h.keys()).index(edge[1])]),
                    len(scaffold_h) - 1))
            # -> (1, len(scaffold_h)-1)
            selected_node = self.select_node(
                scaffold_g, scaffold_h, latent_vector).view(target.size())
            # -> (1, len(scaffold_h)-1)
            select_node_losses.append(
                (-target * torch.log(1e-6 + selected_node)).sum())

            # Add the edge to the graph and initialize the new edge state.
            if idx not in scaffold_g_save:
                scaffold_g_save[idx] = []
                scaffold_g[idx] = []
            scaffold_g_save[idx].append(edge)
            scaffold_g[idx].append(
                (self.init_edge_state(scaffold_h, edge[0]), edge[1]))
            if edge[1] not in scaffold_g_save:
                scaffold_g_save[edge[1]] = []
                scaffold_g[edge[1]] = []
            scaffold_g_save[edge[1]].append((edge[0], idx))
            scaffold_g[edge[1]].append(
                (self.init_edge_state(scaffold_h, edge[0]), idx))

        # No more edges should be added; calculate the corresponding loss.
        new_edge = self.add_edge(scaffold_g, scaffold_h, latent_vector)
        # Force the termination vector to be [0, 0, ..., 0, 1].
        end_add_edge = utils.create_var(
            utils.one_hot(torch.FloatTensor([N_bond_features - 1]),
                          N_bond_features))
        add_edge_losses.append(
            (-end_add_edge * torch.log(1e-6 + new_edge)).sum())

    # No more nodes should be added; calculate the corresponding loss.
    new_node = self.add_node(scaffold_g, scaffold_h, latent_vector)
    # Force the termination vector to be [0, 0, ..., 0, 1].
    end_add_node = utils.create_var(
        utils.one_hot(torch.FloatTensor([N_atom_features - 1]),
                      N_atom_features))
    add_node_losses.append(
        (-end_add_node * torch.log(1e-6 + new_node)).sum())

    # Convert the loss lists to torch tensors.
    total_add_node_loss = torch.stack(add_node_losses).mean()
    if len(add_edge_losses) > 0:
        total_add_edge_loss = torch.stack(add_edge_losses).mean()
        total_select_node_loss = torch.stack(select_node_losses).mean()
    else:
        total_add_edge_loss = 0.0
        total_select_node_loss = 0.0

    # Check whether the reconstructed graph is the same as the input graph.
    if not utils.is_equal_node_type(scaffold_h_save, h_save):
        print('node mismatch')
        print(s1)
        print(s2)
    if not utils.is_equal_edge_type(scaffold_g_save, g_save):
        print('edge mismatch')
        print(s1)
        print(s2)

    # Reconstruction loss.
    total_loss1 = (total_add_node_loss + total_add_edge_loss +
                   total_select_node_loss)
    # VAE KL-divergence loss (Kingma & Welling 2013).
    total_loss2 = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    #total_loss3 = (c-utils.create_var(torch.from_numpy(a)).type(torch.FloatTensor)).pow(2).sum()*beta2

    # Select isomer. (`select_isomer` enumerates the isomers internally.)
    selected_isomer, target, _ = self.select_isomer(s1, s2, latent_vector)

    # Isomer loss.
    criterion = nn.BCELoss()
    total_loss4 = criterion(selected_isomer, target)
    #total_loss4 = (selected_isomer-target).pow(2).sum()

    return scaffold_g, scaffold_h, total_loss1, total_loss2, total_loss4
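# A minimal training-step sketch showing how the returned losses are
# typically combined (illustrative; `model`, `optimizer`, `beta1`, and the
# SMILES/condition inputs are assumed to exist):
#
#     retval = model(s1, s2, condition1, condition2, shuffle=True)
#     if retval is not None:
#         _, _, recon_loss, vae_loss, isomer_loss = retval
#         loss = recon_loss + beta1 * vae_loss + isomer_loss
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()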