示例#1
0
def build_weights(self, model_params):
    """Create and attach the linear links used by the fingerprint network.

    ``model_params`` is stored on ``self`` and the following attributes are
    then set, every one of them an ``L.Linear`` that shares a single
    ``HeNormal`` initializer:

    * ``layer_output_weights_<i>`` for every entry ``i`` of the layer-size
      list, mapping that layer's width to ``fp_length``;
    * ``layer_<i>_self_filter`` for every consecutive pair of layer sizes;
    * ``weights_name(i, degree)`` for every such pair and every entry of the
      module-level ``degrees``, with the input width enlarged by
      ``num_bond_features()``.

    Args:
        model_params: Mapping read for the keys ``'conv_width'``,
            ``'fp_depth'`` and ``'fp_length'``.
    """
    he_init = chainer.initializers.HeNormal()
    self.model_params = model_params
    params = self.model_params

    # Layer 0 is the raw atom-feature width; it is followed by ``fp_depth``
    # hidden layers that are all ``conv_width`` wide.
    hidden_widths = [params['conv_width']] * params['fp_depth']
    layer_widths = [num_atom_features()] + hidden_widths

    # Output (read-out) links: one per layer, including the input layer.
    for depth, width in enumerate(layer_widths):
        readout = L.Linear(width, params['fp_length'], initialW=he_init)
        setattr(self, 'layer_output_weights_' + str(depth), readout)

    # Hidden links: one self filter per layer transition, plus one link per
    # entry of ``degrees`` whose input also carries the bond features.
    transitions = zip(layer_widths, layer_widths[1:])
    for depth, (fan_in, fan_out) in enumerate(transitions):
        self_filter = L.Linear(fan_in, fan_out, initialW=he_init)
        setattr(self, 'layer_' + str(depth) + '_self_filter', self_filter)
        for degree in degrees:
            degree_link = L.Linear(fan_in + num_bond_features(),
                                   fan_out,
                                   initialW=he_init)
            setattr(self, weights_name(depth, degree), degree_link)
示例#2
0
def graph_from_edgelist(edgelist):
    """Build a ``MolGraph`` with random features from a text edge list.

    Every line of the file that does not contain ``'#'`` is scanned for runs
    of decimal digits; the first two integers found are the endpoints of one
    edge. Lines with fewer than two integers (e.g. blank lines) are skipped.

    One ``'atom'`` node is created per distinct endpoint index (stored as its
    ``rdkit_ix``), so an index shared by several edges maps to a single node.
    One ``'bond'`` node is created per edge and linked to both endpoints.
    Atom and bond feature vectors are drawn uniformly from ``[0, 1)`` with
    lengths ``num_atom_features()`` and ``num_bond_features()``. Finally a
    single ``'molecule'`` node is added as a neighbour of all atom nodes.

    Args:
        edgelist: Path of the edge-list file to read.

    Returns:
        The populated ``MolGraph``.
    """
    graph = MolGraph()
    atoms_by_rd_idx = {}

    def atom_node(rd_idx):
        # Return the atom node for ``rd_idx``, creating it on first use so
        # edges that share an endpoint also share the node.
        node = atoms_by_rd_idx.get(rd_idx)
        if node is None:
            node = graph.new_node('atom',
                                  features=np.random.uniform(
                                      low=0,
                                      high=1,
                                      size=(num_atom_features(), )),
                                  rdkit_ix=rd_idx)
            atoms_by_rd_idx[rd_idx] = node
        return node

    edges = []
    with open(edgelist) as f:
        for line in f:
            if '#' in line:
                continue
            # Parse the current line itself; the endpoints are the first two
            # integers that appear on it.
            indices = re.findall(r'\d+', line)
            if len(indices) < 2:
                continue
            edges.append((int(indices[0]), int(indices[1])))

    for begin_idx, end_idx in edges:
        node_1 = atom_node(begin_idx)
        node_2 = atom_node(end_idx)
        node_1.add_neighbors((node_2, ))

        bond_node = graph.new_node('bond',
                                   features=np.random.uniform(
                                       low=0,
                                       high=1,
                                       size=(num_bond_features(), )))
        bond_node.add_neighbors((node_1, node_2))

    mol_node = graph.new_node('molecule')
    mol_node.add_neighbors(graph.nodes['atom'])

    return graph
示例#3
0
def build_convnet_fingerprint_fun(num_hidden_features=[100, 100],
                                  fp_length=512,
                                  normalize=True,
                                  activation_function=relu,
                                  return_atom_activations=False):
    """Sets up functions to compute convnets over all molecules in a minibatch together.

    Weight shapes are registered with a fresh ``WeightsParser``; the returned
    functions read their parameters from a flat ``weights`` argument through
    that parser.

    Args:
        num_hidden_features: Sequence of hidden-layer widths; its length is
            the number of update layers. The default list is never mutated.
        fp_length: Width of every per-layer output, i.e. the second dimension
            of each 'layer output weights' matrix.
        normalize: Forwarded to each layer update; when true,
            ``batch_normalize`` is applied before the activation.
        activation_function: Applied to the summed activations of each layer.
        return_atom_activations: When true, a third function returning the
            per-layer atom outputs is returned as well.

    Returns:
        ``(output_layer_fun, parser)``, or
        ``(output_layer_fun, parser, compute_atom_activations)`` when
        ``return_atom_activations`` is true.
    """

    # Work on a private list copy so any sequence type is accepted.
    hidden_sizes = list(num_hidden_features)

    # Specify weight shapes.
    parser = WeightsParser()
    all_layer_sizes = [num_atom_features()] + hidden_sizes
    for layer, layer_size in enumerate(all_layer_sizes):
        parser.add_weights(('layer output weights', layer),
                           (layer_size, fp_length))
        parser.add_weights(('layer output bias', layer), (1, fp_length))

    in_and_out_sizes = zip(all_layer_sizes[:-1], all_layer_sizes[1:])
    for layer, (N_prev, N_cur) in enumerate(in_and_out_sizes):
        parser.add_weights(("layer", layer, "biases"), (1, N_cur))
        parser.add_weights(("layer", layer, "self filter"), (N_prev, N_cur))
        # One extra matrix per entry of ``degrees``; its input also carries
        # the bond features, hence the wider first dimension.
        for degree in degrees:
            parser.add_weights(weights_name(layer, degree),
                               (N_prev + num_bond_features(), N_cur))

    def update_layer(weights,
                     layer,
                     atom_features,
                     bond_features,
                     array_rep,
                     normalize=False):
        """Return the activated atom features produced by layer ``layer``."""

        def get_weights_func(degree):
            return parser.get(weights, weights_name(layer, degree))

        layer_bias = parser.get(weights, ("layer", layer, "biases"))
        layer_self_weights = parser.get(weights,
                                        ("layer", layer, "self filter"))
        self_activations = np.dot(atom_features, layer_self_weights)
        neighbor_activations = matmult_neighbors(array_rep, atom_features,
                                                 bond_features,
                                                 get_weights_func)

        total_activations = neighbor_activations + self_activations + layer_bias
        if normalize:
            total_activations = batch_normalize(total_activations)
        return activation_function(total_activations)

    def output_layer_fun_and_atom_activations(weights, smiles):
        """Computes layer-wise convolution, and returns a fixed-size output.

        Returns the sum of all per-layer outputs, the list of per-layer atom
        outputs, and the array representation built from ``smiles``.
        """

        array_rep = array_rep_from_smiles(tuple(smiles))

        atom_features = array_rep['atom_features']
        bond_features = array_rep['bond_features']

        all_layer_fps = []
        atom_activations = []

        def write_to_fingerprint(atom_features, layer):
            cur_out_weights = parser.get(weights,
                                         ('layer output weights', layer))
            cur_out_bias = parser.get(weights, ('layer output bias', layer))
            atom_outputs = softmax(cur_out_bias +
                                   np.dot(atom_features, cur_out_weights),
                                   axis=1)
            atom_activations.append(atom_outputs)
            # Sum over all atoms within a molecule:
            layer_output = sum_and_stack(atom_outputs, array_rep['atom_list'])
            all_layer_fps.append(layer_output)

        num_layers = len(hidden_sizes)
        # ``range`` instead of ``xrange`` so this runs on Python 2 and 3.
        for layer in range(num_layers):
            write_to_fingerprint(atom_features, layer)
            atom_features = update_layer(weights,
                                         layer,
                                         atom_features,
                                         bond_features,
                                         array_rep,
                                         normalize=normalize)
        # The features leaving the last update layer contribute as well.
        write_to_fingerprint(atom_features, num_layers)
        return sum(all_layer_fps), atom_activations, array_rep

    def output_layer_fun(weights, smiles):
        """Return only the summed output for ``smiles``."""
        output, _, _ = output_layer_fun_and_atom_activations(weights, smiles)
        return output

    def compute_atom_activations(weights, smiles):
        """Return the per-layer atom outputs and the array representation."""
        _, atom_activations, array_rep = output_layer_fun_and_atom_activations(
            weights, smiles)
        return atom_activations, array_rep

    if return_atom_activations:
        return output_layer_fun, parser, compute_atom_activations
    else:
        return output_layer_fun, parser
def build_convnet_fingerprint_fun(num_hidden_features=[100, 100], fp_length=512,
                                  normalize=True, activation_function=relu,
                                  return_atom_activations=False):
    """Sets up functions to compute convnets over all molecules in a minibatch together.

    Weight shapes are registered with a fresh ``WeightsParser``; the returned
    functions read their parameters from a flat ``weights`` argument through
    that parser. No debugger breakpoints or console output are triggered.

    Args:
        num_hidden_features: Sequence of hidden-layer widths; its length is
            the number of update layers. The default list is never mutated.
        fp_length: Width of every per-layer output, i.e. the second dimension
            of each 'layer output weights' matrix.
        normalize: Forwarded to each layer update; when true,
            ``batch_normalize`` is applied before the activation.
        activation_function: Applied to the summed activations of each layer.
        return_atom_activations: When true, a third function returning the
            per-layer atom outputs is returned as well.

    Returns:
        ``(output_layer_fun, parser)``, or
        ``(output_layer_fun, parser, compute_atom_activations)`` when
        ``return_atom_activations`` is true.
    """
    # Work on a private list copy so any sequence type is accepted.
    hidden_sizes = list(num_hidden_features)

    # Specify weight shapes.
    parser = WeightsParser()
    # Input width followed by the hidden widths, e.g. [62, 20, 20, 20, 20]
    # (example values noted during a debugging session).
    all_layer_sizes = [num_atom_features()] + hidden_sizes
    for layer, layer_size in enumerate(all_layer_sizes):
        parser.add_weights(('layer output weights', layer), (layer_size, fp_length))
        parser.add_weights(('layer output bias', layer),    (1, fp_length))

    # Consecutive (input, output) width pairs, e.g. [(62, 20), (20, 20), ...].
    in_and_out_sizes = zip(all_layer_sizes[:-1], all_layer_sizes[1:])
    for layer, (N_prev, N_cur) in enumerate(in_and_out_sizes):
        parser.add_weights(("layer", layer, "biases"), (1, N_cur))
        parser.add_weights(("layer", layer, "self filter"), (N_prev, N_cur))
        # One extra matrix per entry of the module-level ``degrees``.
        # NOTE(review): presumably the possible neighbour counts of an atom
        # (an earlier note listed degrees = [0, 1, 2, 3, 4, 5]) -- confirm.
        for degree in degrees:
            parser.add_weights(weights_name(layer, degree), (N_prev + num_bond_features(), N_cur))

    def update_layer(weights, layer, atom_features, bond_features, array_rep, normalize=False):
        """Return the activated atom features produced by layer ``layer``."""
        def get_weights_func(degree):
            return parser.get(weights, weights_name(layer, degree))
        layer_bias         = parser.get(weights, ("layer", layer, "biases"))
        layer_self_weights = parser.get(weights, ("layer", layer, "self filter"))
        self_activations = np.dot(atom_features, layer_self_weights)
        neighbour_activations = matmult_neighbors(
            array_rep, atom_features, bond_features, get_weights_func)
        total_activations = neighbour_activations + self_activations + layer_bias
        if normalize:
            total_activations = batch_normalize(total_activations)
        return activation_function(total_activations)

    def output_layer_fun_and_atom_activations(weights, smiles):
        """Computes layer-wise convolution, and returns a fixed-size output.

        Returns the sum of all per-layer outputs, the list of per-layer atom
        outputs, and the array representation built from ``smiles``.
        """
        array_rep = array_rep_from_smiles(tuple(smiles))
        # NOTE(review): shapes (1370, 62) and (1416, 6) were observed for one
        # minibatch while debugging; they depend on the input batch.
        atom_features = array_rep['atom_features']
        bond_features = array_rep['bond_features']

        all_layer_fps = []
        atom_activations = []
        def write_to_fingerprint(atom_features, layer):
            cur_out_weights = parser.get(weights, ('layer output weights', layer))
            cur_out_bias    = parser.get(weights, ('layer output bias', layer))
            # Softmax over axis 1 of the affine map of the atom features.
            atom_outputs = softmax(cur_out_bias + np.dot(atom_features, cur_out_weights), axis=1)
            # Kept for compute_atom_activations; the summed output below does
            # not depend on this list.
            atom_activations.append(atom_outputs)
            # Sum over all atoms within a molecule:
            layer_output = sum_and_stack(atom_outputs, array_rep['atom_list'])
            all_layer_fps.append(layer_output)

        num_layers = len(hidden_sizes)
        # ``range`` instead of ``xrange`` so this runs on Python 2 and 3.
        for layer in range(num_layers):
            write_to_fingerprint(atom_features, layer)
            atom_features = update_layer(weights, layer, atom_features, bond_features, array_rep,
                                         normalize=normalize)
        # The features leaving the last update layer contribute as well.
        write_to_fingerprint(atom_features, num_layers)
        return sum(all_layer_fps), atom_activations, array_rep

    def output_layer_fun(weights, smiles):
        """Return only the summed output for ``smiles``."""
        output, _, _ = output_layer_fun_and_atom_activations(weights, smiles)
        return output

    def compute_atom_activations(weights, smiles):
        """Return the per-layer atom outputs and the array representation."""
        _, atom_activations, array_rep = output_layer_fun_and_atom_activations(weights, smiles)
        return atom_activations, array_rep

    if return_atom_activations:
        return output_layer_fun, parser, compute_atom_activations
    else:
        return output_layer_fun, parser
def tensorize_smiles_job(smiles, max_degree=5, max_atoms=None):
    '''Takes a list of smiles and turns the graphs into tensor representation.

    # Arguments:
        smiles: a list of smiles representations (its `len` is taken, so it
            must be a sized sequence)
        max_degree: the maximum number of neighbours per atom that each
            molecule can have (to which all molecules will be padded), use
            `None` for auto
        max_atoms: the maximum number of atoms per molecule (to which all
            molecules will be padded), use `None` for auto

        **NOTE**: It is not recommended to set max_degree to `None`/auto when
            using `NeuralGraph` layers. Max_degree determines the number of
            trainable parameters and is essentially a hyperparameter.
            While models can be rebuilt using different `max_atoms`, they cannot
            be rebuilt for different values of `max_degree`, as the architecture
            will be different.

            For organic molecules `max_degree=5` is a good value (Duvenaud et. al, 2015)

    # Returns:
        atoms: np.array (float32), atom features of size
            `(molecules, max_atoms, atom_features)`, zero padded
        bonds: np.array (float32), bond features of size
            `(molecules, max_atoms, max_degree, bond_features)`, zero padded
        edges: np.array (int8), neighbour atom indices of size
            `(molecules, max_atoms, max_degree)`, padded with -1

    # Raises:
        AssertionError: if a smiles string cannot be parsed, or if a molecule
            exceeds an explicitly given `max_atoms` / `max_degree`
        ValueError: if a molecule has more atoms than the integer type of the
            edge tensor can index

    TODO:
        * Arguments for sparse vector encoding

    '''

    # import sizes
    n = len(smiles)
    n_atom_features = features.num_atom_features()
    n_bond_features = features.num_bond_features()

    # preallocate atom and bond tensors with 0's and the edge tensor with -1
    # (because 0 is a valid atom index).
    # If max_degree or max_atoms is set to None (auto), initialise dim as small
    #   as possible (1)
    atom_tensor = np.zeros((n, max_atoms or 1, n_atom_features),
                           dtype=np.float32)
    bond_tensor = np.zeros(
        (n, max_atoms or 1, max_degree or 1, n_bond_features),
        dtype=np.float32)
    edge_tensor = -np.ones((n, max_atoms or 1, max_degree or 1), dtype=np.int8)
    # Largest atom index the edge tensor can hold without wrapping around.
    max_atom_ix = np.iinfo(edge_tensor.dtype).max

    for mol_ix, s in enumerate(smiles):

        # load mol, atoms and bonds; stderr is swapped for a buffer only for
        # the duration of the parse and is always put back afterwards.
        # NOTE(review): RDKit may log through the native stderr stream, in
        # which case this buffer stays empty -- confirm.
        previous_stderr = sys.stderr
        sio = sys.stderr = StringIO()
        try:
            mol = Chem.MolFromSmiles(s)
        finally:
            sys.stderr = previous_stderr
        assert mol is not None, 'Could not parse smiles {}, error: {}'.format(
            s, sio.getvalue())
        atoms = mol.GetAtoms()
        bonds = mol.GetBonds()
        n_atoms = len(atoms)

        # If max_atoms is exceeded, resize if max_atoms=None (auto), else raise
        if n_atoms > atom_tensor.shape[1]:
            assert max_atoms is None, 'too many atoms ({0}) in molecule: {1}'.format(
                n_atoms, s)
            atom_tensor = padaxis(atom_tensor, n_atoms, axis=1)
            bond_tensor = padaxis(bond_tensor, n_atoms, axis=1)
            edge_tensor = padaxis(edge_tensor,
                                  n_atoms,
                                  axis=1,
                                  pad_value=-1)

        # Atom indices are written into the edge tensor below; refuse
        # molecules whose indices would overflow its integer type instead of
        # storing corrupted neighbour indices.
        if n_atoms - 1 > max_atom_ix:
            raise ValueError(
                'molecule has {0} atoms, but edge indices are limited to {1}: {2}'
                .format(n_atoms, max_atom_ix, s))

        rdkit_ix_lookup = {}

        for atom_ix, atom in enumerate(atoms):
            # write atom features
            atom_tensor[mol_ix,
                        atom_ix, :n_atom_features] = features.atom_features(
                            atom)

            # map the RDKit atom index to its position in the tensors
            rdkit_ix_lookup[atom.GetIdx()] = atom_ix

        # preallocate array with neighbour lists (indexed by atom)
        connectivity_mat = [[] for _ in atoms]

        for bond in bonds:
            # lookup atom ids
            a1_ix = rdkit_ix_lookup[bond.GetBeginAtom().GetIdx()]
            a2_ix = rdkit_ix_lookup[bond.GetEndAtom().GetIdx()]

            # lookup how many neighbours are encoded yet
            a1_neigh = len(connectivity_mat[a1_ix])
            a2_neigh = len(connectivity_mat[a2_ix])

            # If max_degree is exceeded, resize if max_degree=None (auto), else raise
            new_degree = max(a1_neigh, a2_neigh) + 1
            if new_degree > bond_tensor.shape[2]:
                assert max_degree is None, 'too many neighours ({0}) in molecule: {1}'.format(
                    new_degree, s)
                bond_tensor = padaxis(bond_tensor, new_degree, axis=2)
                edge_tensor = padaxis(edge_tensor,
                                      new_degree,
                                      axis=2,
                                      pad_value=-1)

            # store bond features in the next free neighbour slot of both atoms
            bond_features = np.array(features.bond_features(bond), dtype=int)
            bond_tensor[mol_ix, a1_ix, a1_neigh, :] = bond_features
            bond_tensor[mol_ix, a2_ix, a2_neigh, :] = bond_features

            # add to connectivity matrix
            connectivity_mat[a1_ix].append(a2_ix)
            connectivity_mat[a2_ix].append(a1_ix)

        # store connectivity matrix; unused slots keep the -1 padding
        for a1_ix, neighbours in enumerate(connectivity_mat):
            degree = len(neighbours)
            edge_tensor[mol_ix, a1_ix, :degree] = neighbours

    return atom_tensor, bond_tensor, edge_tensor