Пример #1
0
def get_reward_MO(predictor, smile):
    """
    Compute the two reward components for a single generated molecule.

    Parameters
    ----------
    predictor: predictive model object exposing `predict`; it is called with
        a one-element list holding the SMILES string
    smile: SMILES string of the generated molecule

    Returns
    -------
    Tuple `(reward_kor, reward_sas)`:
        reward_kor = exp(prediction / 4 - 1)
        reward_sas = exp(-SA_score + 3)
    """
    # Synthetic accessibility: SAscore is fed a one-element list holding the
    # RDKit molecule parsed from the SMILES string.
    molecule = Chem.MolFromSmiles(smile)
    sa_value = SAscore([molecule])[0]
    reward_sas = np.exp(-sa_value + 3)

    # Predicted pIC50 for KOR, obtained from the SMILES string itself.
    prediction = predictor.predict([smile])
    reward_kor = np.exp(prediction / 4 - 1)

    return reward_kor, reward_sas
Пример #2
0
def get_reward(predictor, smile, property_identifier):
    """
    Compute the scalar reward of one generated molecule for one property.

    Parameters
    ----------
    predictor: predictive model object exposing `predict`; only used when
        `property_identifier` is not 'sas'
    smile: SMILES string of the generated molecule
    property_identifier: String identifying the property

    Returns
    -------
    For 'sas', the first value returned by SAscore (no transformation is
    applied). For any other identifier, exp(prediction / 4 - 1).
    """
    if property_identifier == 'sas':
        # SAscore is fed a one-element list holding the RDKit molecule.
        molecule = Chem.MolFromSmiles(smile)
        return SAscore([molecule])[0]

    # Every other property goes through the predictor on the raw SMILES.
    prediction = predictor.predict([smile])
    return np.exp(prediction / 4 - 1)
Пример #3
0
def generate2file(predictor, generator, configReinforce, n2generate,
                  original_model):
    """
    Function that generates a specified number of SMILES strings and predicts
    its SA score and, when a predictor is given, one extra property. The
    unique valid SMILES and the respective predictions are written to a file
    inside the "Generated" folder (the folder is not created here).

    Parameters
    ----------
    predictor: Predictor model object exposing `predict`, or None to write
               only the SA score
    generator: Generator model object, to generate new SMILES strings
    configReinforce: Configuration file
    n2generate: Integer specifying the number of SMILES to generate
    original_model: Truthy value selects the 'original' file suffix, falsy
                    value selects 'biased'
    Returns
    -------
    This function doesn't return anything, but saves a comma-separated file:
    "Generated/smiles_prop_<type>.smi" (SMILES, property, SA score) when a
    predictor is given, "Generated/smiles_sas_<type>.smi" (SMILES, SA score)
    otherwise.
    """
    model_type = 'original' if original_model else 'biased'

    generated = []
    pbar = tqdm(range(n2generate))
    for _ in pbar:
        pbar.set_description("Generating molecules...")
        predictSMILES = PredictSMILES(generator, None, False, 0,
                                      configReinforce)
        generated.append(predictSMILES.sample())

    sanitized, _valid = canonical_smiles(generated,
                                         sanitize=True,
                                         throw_warning=False)
    # The previous `[1:]` slice dropped the first sorted entry
    # unconditionally, which discards a valid molecule whenever no sample was
    # invalid. Filter the placeholder explicitly instead.
    # NOTE(review): assumes canonical_smiles marks invalid entries with an
    # empty string (the old slice relied on the same assumption) - confirm.
    unique_smiles = [smi for smi in np.unique(sanitized) if smi]

    mol_list = smiles2mol(unique_smiles)
    prediction_sas = SAscore(mol_list)

    # Identity check: `!= None` would dispatch to the predictor's own __ne__.
    if predictor is not None:
        prediction_prop = predictor.predict(unique_smiles)
        with open("Generated/smiles_prop_" + model_type + ".smi", 'w') as f:
            for i, smi in enumerate(unique_smiles):
                data = str(smi) + "," + str(prediction_prop[i]) + "," + str(
                    prediction_sas[i])
                f.write("%s\n" % data)
    else:
        with open("Generated/smiles_sas_" + model_type + ".smi", 'w') as f:
            for i, smi in enumerate(unique_smiles):
                data = str(smi) + "," + str(prediction_sas[i])
                f.write("%s\n" % data)
Пример #4
0
def generate2file(predictor, generator, configReinforce, n2generate,
                  original_model):
    """
    Function that generates new SMILES strings and predicts some properties.
    The unique valid SMILES and predictions are saved to a file inside the
    "Generated" folder (the folder is not created here).

    Parameters
    ----------
    predictor: Predictor model exposing `predict`, or None to write only the
               SA score
    generator: Generator model
    configReinforce: Configuration file
    n2generate: Number of SMILES strings to generate
    original_model: Boolean that indicates if we use the Original or the Biased
    Generator (only used to build the output file name)
    Returns
    -------
    Nothing. Saves a comma-separated file with the newly generated SMILES:
    SMILES, property, SA score when a predictor is given; SMILES, SA score
    otherwise.
    """
    model_type = 'original' if original_model else 'biased'

    generated = []
    pbar = tqdm(range(n2generate))
    for _ in pbar:
        pbar.set_description("Generating molecules...")
        predictSMILES = PredictSMILES(generator, None, False, 0,
                                      configReinforce)
        generated.append(predictSMILES.sample())

    sanitized, _valid = canonical_smiles(generated,
                                         sanitize=True,
                                         throw_warning=False)
    # The previous `[1:]` slice dropped the first sorted entry
    # unconditionally, which discards a valid molecule whenever no sample was
    # invalid. Filter the placeholder explicitly instead.
    # NOTE(review): assumes canonical_smiles marks invalid entries with an
    # empty string (the old slice relied on the same assumption) - confirm.
    unique_smiles = [smi for smi in np.unique(sanitized) if smi]

    mol_list = smiles2mol(unique_smiles)
    prediction_sas = SAscore(mol_list)

    # Identity check: `!= None` would dispatch to the predictor's own __ne__.
    if predictor is not None:
        prediction_prop = predictor.predict(unique_smiles)
        with open("Generated/generated_prop_" + model_type + ".smi", 'w') as f:
            for i, smi in enumerate(unique_smiles):
                data = str(smi) + "," + str(prediction_prop[i]) + "," + str(
                    prediction_sas[i])
                f.write("%s\n" % data)
    else:
        # NOTE(review): unlike "generated_prop_", this prefix has no trailing
        # underscore, so the file is e.g. "generated_sasoriginal.smi". Kept
        # as-is because other code may read that exact name - confirm.
        with open("Generated/generated_sas" + model_type + ".smi", 'w') as f:
            for i, smi in enumerate(unique_smiles):
                data = str(smi) + "," + str(prediction_sas[i])
                f.write("%s\n" % data)
Пример #5
0
def properties_violin(filepaths, labels, pred_type):
    """
    Collect per-molecule property values from several CSV files into one
    long-form table (one row per set / property / value).

    Parameters
    ----------
    filepaths: Sequence of paths to comma-separated files. Column 0 is read
               as the SMILES string, column 1 as the predicted property
               (only when `pred_type` is 'pIC50') and, for every file except
               the first one, column 2 as the SA score.
    labels: Sequence of set names, indexed in parallel with `filepaths`
    pred_type: String; 'pIC50' adds an 'IC50 for KOR' row per molecule

    Returns
    -------
    pandas DataFrame with the columns 'Sets', 'Property' and 'Value'.
    For the first file the SA score is computed with SAscore instead of being
    read from the file. Molecules whose SA score / QED computation fails are
    reported on stdout and skipped for that property.
    """
    properties = []

    for i, fname in enumerate(filepaths):
        with open(fname, 'r') as f:
            for row in csv.reader(f):
                label = labels[i]
                smiles = row[0]

                if pred_type == 'pIC50':
                    properties.append(
                        [label, 'IC50 for KOR',
                         float(row[1])])

                if i != 0:
                    # SA score is taken straight from the file; a malformed
                    # row raises here, exactly as before.
                    properties.append([label, 'SA score', float(row[2])])
                else:
                    # First file: the SA score is computed on the fly.
                    try:
                        mole = smiles2mol(smiles)
                        prediction_sas = SAscore(mole)
                        properties.append(
                            [label, 'SA score',
                             float(prediction_sas[0])])
                    except Exception:
                        print("Non-Canonical SMILES: " + smiles)
                        # QED is not attempted when the SA score failed.
                        continue

                # `except Exception` instead of a bare `except:` so that
                # KeyboardInterrupt / SystemExit are no longer swallowed.
                try:
                    mol = Chem.MolFromSmiles(smiles)
                    properties.append([label, 'QED', QED.qed(mol)])
                except Exception:
                    print("Non-Canonical SMILES: " + smiles)

    df = pd.DataFrame(properties, columns=['Sets', 'Property', 'Value'])
    return df
Пример #6
0
def get_reward_MO(predictor_kor, smile, uniq, memory_smiles):
    """
    This function takes the predictor model and the SMILES string to return
    the numerical rewards of every objective considered for the molecule.

    Parameters
    ----------
    predictor_kor: object of the predictive model that accepts a trajectory
        and returns a numerical prediction of KOR affinity for the given
        trajectory
    smile: SMILES string of the generated molecule
    uniq: currently unused (the uniqueness reward is disabled)
    memory_smiles: collection of previously generated SMILES; the diversity
        against it is only evaluated once it holds more than 30 entries

    Returns
    -------
    List with five rewards, in this order:
        [0] KOR:       exp(prediction / 4 - 1)
        [1] QED:       exp(qed / 4)
        [2] SA score:  exp(-sas / 5 + 1)
        [3] logP:      1 if -1 < logP < 3 else 0
        [4] diversity: 0.01 if the external diversity is below 0.75 else 1
    """
    rewards = []

    # pIC50 for kor prediction
    pred = predictor_kor.predict([smile])
    reward_kor = np.exp(pred / 4 - 1)
    rewards.append(reward_kor[0])

    # QED property
    mol_for_qed = smiles2mol(smile)
    reward_qed = qed_calculator(mol_for_qed)
    rewards.append(np.exp(reward_qed[0] / 4))

    # SAScore property (SAscore takes a list of RDKit molecules)
    rdkit_mol = Chem.MolFromSmiles(smile)
    sas_list = SAscore([rdkit_mol])
    rewards.append(np.exp(-sas_list[0] / 5 + 1))

    # logP property: binary reward for staying inside the (-1, 3) window.
    # The molecule parsed above is reused instead of parsing the SMILES again.
    log_p = Descriptors.MolLogP(rdkit_mol)
    rewards.append(1 if -1 < log_p < 3 else 0)

    # Diversity: penalise molecules too similar to the recent memory. With a
    # short memory (<= 30 entries) the molecule is treated as fully diverse.
    diversity = 1
    if len(memory_smiles) > 30:
        diversity = external_diversity(smile, memory_smiles)

    if diversity < 0.75:
        rew_div = 0.01
        # The backslash is escaped so the printed text stays exactly the same
        # while avoiding the invalid "\A" escape-sequence warning.
        print("\\Alert: Similar compounds")
    else:
        rew_div = 1

    rewards.append(rew_div)

    return rewards
Пример #7
0
    def property_checker(self, n_to_generate):
        """
        Function to generate molecules with the best biased generator and
        write them, with their properties, to a report file.

        Parameters:
        -----------

        n_to_generate: Integer that indicates the number of molecules to
                    generate

        Returns
        -------
        Nothing. The generator weights are loaded from
        `model_name_biased + "_" + scalarization_mode + "_" + best_model +
        ".h5"` and the report is written to `file_path_generated + ".smi"`.
        The report starts with three summary lines (number of unique
        molecules, percentage of valid molecules, internal diversity value)
        followed by one " ,"-separated line per unique molecule:
        SMILES, pIC50, Active, BBB, MW, logP, SAS, QED.
        """
        sample = True
        self.generator.model.load_weights(
            self.configReinforce.model_name_biased + "_" +
            self.scalarization_mode + "_" + self.best_model + ".h5")
        print("....................................")
        print("updated model load_weights is DONE!")

        generated = []
        pbar = tqdm(range(n_to_generate))
        for _ in pbar:
            pbar.set_description("Generating molecules...")
            predictSMILES = PredictSMILES(self.generator, None, False,
                                          self.threshold_greedy,
                                          self.configReinforce, sample)
            generated.append(predictSMILES.sample())

        sanitized, valid = canonical_smiles(generated,
                                            sanitize=True,
                                            throw_warning=False)

        # Keep only entries longer than one character, then de-duplicate.
        # The unused uniqueness percentage was removed: it was never reported
        # and raised ZeroDivisionError when no entry survived the filter.
        san_with_repeated = [smi for smi in sanitized if len(smi) > 1]
        unique_smiles = list(set(san_with_repeated))

        vld = (valid / n_to_generate) * 100

        prediction_a2d = self.predictor_a2d.predict(unique_smiles, "a2d")
        prediction_bbb = self.predictor_bbb.predict(unique_smiles, "bbb")

        # Compute the internal diversity
        div = diversity(unique_smiles)

        with open(self.configReinforce.file_path_generated + ".smi", 'w') as f:
            f.write("Number of molecules: %s\n" % str(len(unique_smiles)))
            f.write("Percentage of valid molecules: %s\n" % str(vld))
            f.write("Internal Tanimoto similarity: %s\n\n" % str(div))
            f.write("SMILES, pIC50, Active, BBB, MW, logP, SAS, QED\n")
            for i, smi in enumerate(unique_smiles):
                mol = Chem.MolFromSmiles(smi)
                list_mol = smiles2mol(smi)
                prediction_sas = SAscore(list_mol)

                # Binary flags derived from fixed decision thresholds.
                active = "1" if prediction_a2d[i] > 6.5 else "0"
                permeable = "1" if prediction_bbb[i] > 0.98 else "0"

                q = QED.qed(mol)
                mw, logP = Descriptors.MolWt(mol), Crippen.MolLogP(mol)

                fields = [
                    str(smi),
                    str(np.round(prediction_a2d[i], 2)),
                    active,
                    permeable,
                    str(np.round(mw, 2)),
                    str(np.round(logP, 2)),
                    str(np.round(prediction_sas[0], 2)),
                    str(np.round(q, 2)),
                ]
                f.write("%s\n" % " ,".join(fields))
Пример #8
0
    def compare_models(self, n_to_generate, individual_plot):
        """
        Function to generate molecules with the both models (original and
        fine-tuned) and compare the distributions of the optimized property.

        Parameters:
        -----------
        n_to_generate: Integer that indicates the number of molecules to
                    generate

        individual_plot: Boolean that indicates if we want to represent the
                         property distribution of the pre-trained model.

        Returns
        -------
        Tuple `(dif, div, valid, percentage_unq_b, perc_desirable)`:
            dif, valid: values returned by `plot_hist_both`
            div: value returned by `diversity` for the biased unique SMILES
            percentage_unq_b: unique / kept biased SMILES, in percent
            perc_desirable: fraction of biased predictions >= 6.5, relative
                            to the number of kept biased SMILES

        Raises
        ------
        ValueError if `self.property_identifier` is not one of 'kor', 'a2d',
        'qed' or 'sas' (previously this surfaced later as an unbound-variable
        error).
        """

        def _generate():
            # Sample n_to_generate SMILES with the currently loaded weights.
            samples = []
            pbar = tqdm(range(n_to_generate))
            for _ in pbar:
                pbar.set_description("Generating molecules...")
                predictSMILES = PredictSMILES(self.generator, None, False,
                                              self.threshold_greedy,
                                              self.configReinforce)
                samples.append(predictSMILES.sample())
            return samples

        def _predict(smiles_list):
            # Evaluate the configured property for a list of SMILES.
            identifier = self.property_identifier
            if identifier == 'kor' or identifier == 'a2d':
                return self.predictor.predict(smiles_list)
            if identifier == 'qed':
                return qed_calculator(smiles2mol(smiles_list))
            if identifier == 'sas':
                # Bug fix: the molecules were built but the raw SMILES list
                # was passed to SAscore; pass the molecule list, as the 'qed'
                # branch does.
                return SAscore(smiles2mol(smiles_list))
            raise ValueError(
                "Unknown property identifier: %s" % str(identifier))

        self.generator.model.load_weights(
            self.configReinforce.model_name_unbiased)
        print("\n --------- Original model LOADED! ---------")

        generated_unb = _generate()
        sanitized_unb, valid_unb = canonical_smiles(generated_unb,
                                                    sanitize=False,
                                                    throw_warning=False)
        # NOTE(review): `[1:]` presumably discards an empty-string placeholder
        # that sorts first; it also drops a real molecule when there is no
        # placeholder - confirm against canonical_smiles.
        unique_smiles_unb = list(np.unique(sanitized_unb))[1:]

        prediction_unb = _predict(unique_smiles_unb)

        if individual_plot:
            plot_hist(prediction_unb, n_to_generate, valid_unb,
                      self.property_identifier)

        # Load Biased Generator Model
        self.generator.model.load_weights(
            self.configReinforce.model_name_biased + ".h5")
        print("\n --------- Updated model LOADED! ---------")

        generated_b = _generate()
        sanitized_b, valid_b = canonical_smiles(generated_b,
                                                sanitize=False,
                                                throw_warning=False)
        # NOTE(review): the unique set is built from the unfiltered list while
        # the denominator below uses the length-filtered one - confirm that
        # this asymmetry is intended.
        unique_smiles_b = list(set(sanitized_b))

        san_with_repeated_b = [smi for smi in sanitized_b if len(smi) > 1]

        percentage_unq_b = (len(unique_smiles_b) /
                            len(san_with_repeated_b)) * 100

        prediction_b = _predict(unique_smiles_b)

        dif, valid = plot_hist_both(prediction_unb, prediction_b,
                                    n_to_generate, valid_unb, valid_b,
                                    self.property_identifier)

        div = diversity(unique_smiles_b)

        # Count predictions at or above the 6.5 desirability threshold.
        desirable = 0
        for pred in prediction_b:
            if pred >= 6.5:
                desirable += 1
        perc_desirable = desirable / len(san_with_repeated_b)

        return dif, div, valid, percentage_unq_b, perc_desirable
Пример #9
0
    def test_generator(self, n_to_generate, iteration, original_model):
        """
        Sample molecules from the selected generator, evaluate the configured
        property on them and store the results.

        Parameters:
        -----------

        n_to_generate: Integer, number of molecules to sample
        iteration: Integer with the current iteration; it is part of the name
                   of the output file
        original_model: Boolean; when true the unbiased weights are loaded,
                        otherwise the biased (".h5") weights are loaded

        Returns
        -------
        Tuple `(generated, prediction, percentage_valid, percentage_unq, div,
        perc_desirable)`. As side effects the property histogram is drawn
        through `plot_hist` and the kept SMILES with their predictions are
        written to
        `file_path_generated + '_' + <count> + '_iter' + <iteration> + ".smi"`.
        """
        cfg = self.configReinforce
        identifier = self.property_identifier

        if original_model:
            self.generator.model.load_weights(cfg.model_name_unbiased)
            print("....................................")
            print("original model load_weights is DONE!")
        else:
            self.generator.model.load_weights(cfg.model_name_biased + ".h5")
            print("....................................")
            print("updated model load_weights is DONE!")

        generated = []
        progress = tqdm(range(n_to_generate))
        for _ in progress:
            progress.set_description("Generating molecules...")
            sampler = PredictSMILES(self.generator, None, False,
                                    self.threshold_greedy, cfg)
            generated.append(sampler.sample())

        sanitized, valid = canonical_smiles(generated,
                                            sanitize=True,
                                            throw_warning=False)

        # Entries of at most one character are discarded; duplicates are kept
        # in this list and removed only for the uniqueness statistics.
        san_with_repeated = [smi for smi in sanitized if len(smi) > 1]
        n_kept = len(san_with_repeated)

        unique_smiles = list(set(san_with_repeated))
        percentage_unq = (len(unique_smiles) / n_kept) * 100
        percentage_valid = (valid / n_to_generate) * 100

        if identifier == 'kor' or identifier == 'a2d':
            prediction = self.predictor.predict(san_with_repeated)
        elif identifier == 'sas':
            prediction = SAscore(smiles2mol(san_with_repeated))
        elif identifier == 'qed':
            prediction = qed_calculator(smiles2mol(san_with_repeated))

        # Called for its plotting side effect; the return value is not used.
        plot_hist(prediction, n_to_generate, valid, identifier)

        out_path = (cfg.file_path_generated + '_' + str(n_kept) + '_iter' +
                    str(iteration) + ".smi")
        with open(out_path, 'w') as out_file:
            for idx, smi in enumerate(san_with_repeated):
                out_file.write(str(smi) + " ," + str(prediction[idx]) + "\n")

        # Compute the internal diversity
        div = diversity(unique_smiles)

        # Share of predictions reaching the 6.5 threshold.
        desirable = sum(1 for value in prediction if value >= 6.5)
        perc_desirable = desirable / n_kept

        return (generated, prediction, percentage_valid, percentage_unq, div,
                perc_desirable)
Пример #10
0
    def compare_models(self, n_to_generate, individual_plot):
        """
        Sample molecules with the original and with the fine-tuned generator
        and plot both property distributions together.

        Parameters:
        -----------
        n_to_generate: Integer, number of molecules sampled from each model

        individual_plot: Boolean; when true the distribution of the
                         pre-trained model is additionally drawn on its own
                         with `plot_hist`

        Returns
        -------
        Nothing. The comparison is drawn through `plot_hist_both`.
        """

        def _sample_and_predict():
            # Sample with the currently loaded weights, keep the unique
            # entries and evaluate the configured property on them.
            samples = []
            progress = tqdm(range(n_to_generate))
            for _ in progress:
                progress.set_description("Generating molecules...")
                sampler = PredictSMILES(self.generator, None, False,
                                        self.threshold_greedy,
                                        self.configReinforce)
                samples.append(sampler.sample())

            sanitized, n_valid = canonical_smiles(samples,
                                                  sanitize=False,
                                                  throw_warning=False)
            # The first entry of the sorted unique values is dropped.
            unique_smiles = list(np.unique(sanitized))[1:]

            if self.property_identifier != 'sas':
                predictions = self.predictor.predict(unique_smiles)
            else:
                predictions = SAscore(unique_smiles)
            return predictions, n_valid

        # Original (unbiased) generator
        self.generator.model.load_weights(
            self.configReinforce.model_name_unbiased)
        print("\n --------- Original model LOADED! ---------")
        prediction_unb, valid_unb = _sample_and_predict()

        if individual_plot:
            plot_hist(prediction_unb, n_to_generate, valid_unb,
                      self.property_identifier)

        # Biased (fine-tuned) generator
        self.generator.model.load_weights(
            self.configReinforce.model_name_biased + ".h5")
        print("\n --------- Updated model LOADED! ---------")
        prediction_b, valid_b = _sample_and_predict()

        plot_hist_both(prediction_unb, prediction_b, n_to_generate, valid_unb,
                       valid_b, self.property_identifier)